Skip to content

Commit

Permalink
Matchmissing == :notequal (#2724)
Browse files Browse the repository at this point in the history
  • Loading branch information
pstorozenko authored Jun 3, 2021
1 parent f56982d commit 5d8e52b
Show file tree
Hide file tree
Showing 3 changed files with 185 additions and 14 deletions.
9 changes: 9 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# DataFrames.jl changes on main since last release notes

## New functionalities

* add option `matchmissing=:notequal` in joins;
in `leftjoin`, `semijoin` and `antijoin` rows with `missing` in the `on` columns
are dropped from the right data frame, but preserved in the left one;
in `rightjoin` such rows are dropped from the left data frame, but preserved
in the right one; in `innerjoin` they are dropped from both data frames;
in `outerjoin` this value of the keyword argument is not supported
([#2724](/~https://github.com/JuliaData/DataFrames.jl/pull/2724))

## Bug fixes

* fix bug in how `issorted` handles custom orderings and improve performance
Expand Down
49 changes: 35 additions & 14 deletions src/join/composer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ struct DataFrameJoiner

function DataFrameJoiner(dfl::AbstractDataFrame, dfr::AbstractDataFrame,
on::Union{<:OnType, AbstractVector},
matchmissing::Symbol)
matchmissing::Symbol,
kind::Symbol)
on_cols = isa(on, AbstractVector) ? on : [on]
left_on = Symbol[]
right_on = Symbol[]
Expand All @@ -45,20 +46,35 @@ struct DataFrameJoiner
"Symbol or Pair{Symbol, Symbol}."))
end
end
dfl_on = dfl[!, left_on]
dfr_on = dfr[!, right_on]


if matchmissing === :notequal
if kind in (:left, :semi, :anti)
dfr = dropmissing(dfr, right_on, view=true)
elseif kind === :right
dfl = dropmissing(dfl, left_on, view=true)
elseif kind === :inner
# it is possible to drop missings from only the left or only the right data frame
# to gain some performance, but this needs more testing, see #2724
dfl = dropmissing(dfl, left_on, view=true)
dfr = dropmissing(dfr, right_on, view=true)
elseif kind === :outer
throw(ArgumentError("matchmissing == :notequal for `outerjoin` is not allowed"))
else
throw(ArgumentError("matchmissing == :notequal not implemented for kind == :$kind"))
end
end
dfl_on = select(dfl, left_on, copycols=false)
dfr_on = select(dfr, right_on, copycols=false)
if matchmissing === :error
for df in (dfl_on, dfr_on), col in eachcol(df)
if any(ismissing, col)
throw(ArgumentError("missing values in key columns are not allowed " *
"when matchmissing == :error"))
end
end
elseif matchmissing !== :equal
throw(ArgumentError("matchmissing allows only :error or :equal"))
elseif !(matchmissing in (:equal, :notequal))
throw(ArgumentError("matchmissing allows only :error, :equal, or :notequal"))
end

for df in (dfl_on, dfr_on), col in eachcol(df)
if any(x -> (x isa Union{Complex, Real}) &&
(isnan(x) || isequal(real(x), -0.0) || isequal(imag(x), -0.0)), col)
Expand Down Expand Up @@ -311,7 +327,7 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame;
throw(ArgumentError("Missing join argument 'on'."))
end

joiner = DataFrameJoiner(df1, df2, on, matchmissing)
joiner = DataFrameJoiner(df1, df2, on, matchmissing, kind)

# Check merge key validity
left_invalid = validate[1] ? any(nonunique(joiner.dfl, joiner.left_on)) : false
Expand Down Expand Up @@ -485,7 +501,8 @@ change in future releases.
data frame and left unchanged.
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
matched (`isequal` is used for comparisons of rows for equality)
matched; if equal to `:notequal` then rows with `missing` in the `on` columns are
dropped from both `df1` and `df2`; `isequal` is used for comparisons of rows for equality
It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
imaginary part of the number. If you need to perform a join on such values use
Expand Down Expand Up @@ -626,7 +643,8 @@ change in future releases.
data frame and left unchanged.
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
matched (`isequal` is used for comparisons of rows for equality)
matched; if equal to `:notequal` then rows with `missing` in the `on` columns are
dropped from `df2`; `isequal` is used for comparisons of rows for equality
All columns of the returned data table will support missing values.
Expand Down Expand Up @@ -772,7 +790,8 @@ change in future releases.
data frame and left unchanged.
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
matched (`isequal` is used for comparisons of rows for equality)
matched; if equal to `:notequal` then rows with `missing` in the `on` columns are
dropped from `df1`; `isequal` is used for comparisons of rows for equality
All columns of the returned data table will support missing values.
Expand Down Expand Up @@ -923,7 +942,7 @@ This behavior may change in future releases.
data frame and left unchanged.
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
matched (`isequal` is used for comparisons of rows for equality)
matched; `isequal` is used for comparisons of rows for equality
All columns of the returned data table will support missing values.
Expand Down Expand Up @@ -1071,7 +1090,8 @@ The order of rows in the result is undefined and may change in the future releas
By default no check is performed.
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
matched (`isequal` is used for comparisons of rows for equality)
matched; if equal to `:notequal` then rows with `missing` in the `on` columns are
dropped from `df2`; `isequal` is used for comparisons of rows for equality
It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
imaginary part of the number. If you need to perform a join on such values use
Expand Down Expand Up @@ -1176,7 +1196,8 @@ The order of rows in the result is undefined and may change in the future releas
By default no check is performed.
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
matched (`isequal` is used for comparisons of rows for equality)
matched; if equal to `:notequal` then rows with `missing` in the `on` columns are
dropped from `df2`; `isequal` is used for comparisons of rows for equality
It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
imaginary part of the number. If you need to perform a join on such values use
Expand Down
141 changes: 141 additions & 0 deletions test/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ anti = left[Bool[ismissing(x) for x in left.Job], [:ID, :Name]]

@test_throws ArgumentError innerjoin(name, job)
@test_throws ArgumentError innerjoin(name, job, on = :ID, matchmissing=:errors)
@test_throws ArgumentError innerjoin(name, job, on = :ID, matchmissing=:weirdmatch)
@test_throws ArgumentError outerjoin(name, job, on = :ID, matchmissing=:notequal)

@test innerjoin(name, job, on = :ID) == inner
@test outerjoin(name, job, on = :ID) outer
Expand Down Expand Up @@ -1557,4 +1559,143 @@ end
c="c", d="d")
end

# Check `matchmissing=:notequal` against a reference computation: a join run with
# `:notequal` must give the same result as a `matchmissing=:equal` join performed
# after explicitly dropping the rows with `missing` keys from the side(s) on which
# `:notequal` is expected to drop them.
@testset "matchmissing :notequal correctness" begin
    Random.seed!(1337)

    # Run the five supported join kinds on one (name, job) pair of data frames.
    # `on` is the key column (a `Symbol`) or the key columns (a vector of `Symbol`s);
    # it is used both as the join key and as the set of columns passed to
    # `dropmissing` when building the reference result.
    # Results are compared with `isequal` so that `missing` values in the joined
    # output compare as equal instead of propagating `missing` into `@test`.
    function check_notequal(name, job, on)
        # left, semi and anti joins: missing keys are dropped from the right table only
        @test isequal(leftjoin(name, dropmissing(job, on), on=on, matchmissing=:equal),
                      leftjoin(name, job, on=on, matchmissing=:notequal))
        @test isequal(semijoin(name, dropmissing(job, on), on=on, matchmissing=:equal),
                      semijoin(name, job, on=on, matchmissing=:notequal))
        @test isequal(antijoin(name, dropmissing(job, on), on=on, matchmissing=:equal),
                      antijoin(name, job, on=on, matchmissing=:notequal))
        # right join: missing keys are dropped from the left table only
        @test isequal(rightjoin(dropmissing(name, on), job, on=on, matchmissing=:equal),
                      rightjoin(name, job, on=on, matchmissing=:notequal))
        # inner join: missing keys are dropped from both tables
        @test isequal(innerjoin(dropmissing(name, on), dropmissing(job, on), on=on,
                                matchmissing=:equal),
                      innerjoin(name, job, on=on, matchmissing=:notequal))
        return nothing
    end

    # Single key column. The fixtures cover: a few missing keys, empty tables,
    # all-missing keys, missing values only in the non-key column, contiguous runs
    # of missing keys at the end / start / middle, and large random data.
    # NOTE(review): the 63/64/65-element runs look chosen to straddle a 64-element
    # boundary in the implementation -- confirm.
    names = [
        DataFrame(ID=[1, 2, missing],
                  Name=["John Doe", "Jane Doe", "Joe Blogs"]),
        DataFrame(ID=[],
                  Name=[]),
        DataFrame(ID=missings(3),
                  Name=["John Doe", "Jane Doe", "Joe Blogs"]),
        DataFrame(ID=[1, 2, 3],
                  Name=[missing, "Jane Doe", missing]),
        DataFrame(ID=[1:100; missings(100)],
                  Name=repeat(["Jane Doe"], 200)),
        DataFrame(ID=[missings(100); 1:100],
                  Name=repeat(["Jane Doe"], 200)),
        DataFrame(ID=[1:50; missings(100); 51:100],
                  Name=repeat(["Jane Doe"], 200)),
        DataFrame(ID=[1:64; missings(64); 129:200],
                  Name=repeat(["Jane Doe"], 200)),
        DataFrame(ID=[1:63; missings(65); 129:200],
                  Name=repeat(["Jane Doe"], 200)),
        DataFrame(ID=rand([1:1000; missing], 10000),
                  Name=rand(["John Doe", "Jane Doe", "Joe Blogs", missing], 10000)),
    ]
    jobs = [
        DataFrame(ID=[1, 2, 2, 4],
                  Job=["Lawyer", "Doctor", "Florist", "Farmer"]),
        DataFrame(ID=[missing, 2, 2, 4],
                  Job=["Lawyer", "Doctor", "Florist", "Farmer"]),
        DataFrame(ID=[missing, 2, 2, 4],
                  Job=["Lawyer", "Doctor", missing, "Farmer"]),
        DataFrame(ID=[],
                  Job=[]),
        DataFrame(ID=[1:100; missings(100)],
                  Job=repeat(["Lawyer"], 200)),
        DataFrame(ID=[missings(100); 1:100],
                  Job=repeat(["Lawyer"], 200)),
        DataFrame(ID=[1:50; missings(100); 51:100],
                  Job=repeat(["Lawyer"], 200)),
        DataFrame(ID=[1:64; missings(64); 129:200],
                  Job=repeat(["Lawyer"], 200)),
        DataFrame(ID=[1:63; missings(65); 129:200],
                  Job=repeat(["Lawyer"], 200)),
        DataFrame(ID=rand([1:1000; missing], 10000),
                  Job=rand(["Lawyer", "Doctor", "Florist", missing], 10000)),
    ]
    for name in names, job in jobs
        check_notequal(name, job, :ID)
    end

    # Two key columns. `rl(n)` draws `n` random one-letter labels for the second key.
    rl(n) = rand(["a", "b", "c"], n)
    names2 = [
        DataFrame(ID1=[1, 1, 2],
                  ID2=["a", "b", "a"],
                  Name=["John Doe", "Jane Doe", "Joe Blogs"]),
        DataFrame(ID1=[1, 1, 2, missing],
                  ID2=["a", "b", "a", missing],
                  Name=["John Doe", "Jane Doe", "Joe Blogs", missing]),
        DataFrame(ID1=[missing, 1, 2, missing],
                  ID2=["a", "b", missing, missing],
                  Name=[missing, "Jane Doe", "Joe Blogs", missing]),
        DataFrame(ID1=[missing, 1, 2, missing],
                  ID2=["a", "b", missing, missing],
                  Name=missings(4)),
        DataFrame(ID1=[missing, 1, 2, missing],
                  ID2=missings(4),
                  Name=["John Doe", "Jane Doe", "Joe Blogs", missing]),
        DataFrame(ID1=[1:100; missings(100)],
                  ID2=[rl(100); missings(100)],
                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
        DataFrame(ID1=[missings(100); 1:100],
                  ID2=[missings(100); rl(100)],
                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
        DataFrame(ID1=[1:50; missings(100); 51:100],
                  ID2=[rl(50); missings(100); rl(50)],
                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
        DataFrame(ID1=[1:64; missings(64); 129:200],
                  ID2=[rl(64); missings(64); rl(200 - 128)],
                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
        # here the runs of missings in ID1 and ID2 are offset by one row
        DataFrame(ID1=[1:63; missings(65); 129:200],
                  ID2=[rl(64); missings(65); rl(200 - 129)],
                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
        DataFrame(ID1=rand([1:100; missing], 10000),
                  ID2=rand(["a", "b", "c", missing], 10000),
                  Name=rand(["John Doe", "Jane Doe", "Joe Blogs", missing], 10000)),
    ]
    jobs2 = [
        DataFrame(ID1=[1, 2, 2, 4],
                  ID2=["a", "b", "b", "c"],
                  Job=["Lawyer", "Doctor", "Florist", "Farmer"]),
        DataFrame(ID1=[1, 2, 2, 4, missing],
                  ID2=["a", "b", "b", "c", missing],
                  Job=["Lawyer", "Doctor", "Florist", "Farmer", missing]),
        DataFrame(ID1=[1, 2, missing, 4, missing],
                  ID2=["a", "b", missing, "c", missing],
                  Job=[missing, "Doctor", "Florist", "Farmer", missing]),
        DataFrame(ID1=[1:100; missings(100)],
                  ID2=[rl(100); missings(100)],
                  Job=rand(["Doctor", "Florist"], 200)),
        DataFrame(ID1=[missings(100); 1:100],
                  ID2=[missings(100); rl(100)],
                  Job=rand(["Doctor", "Florist"], 200)),
        DataFrame(ID1=[1:50; missings(100); 51:100],
                  ID2=[rl(50); missings(100); rl(50)],
                  Job=rand(["Doctor", "Florist"], 200)),
        DataFrame(ID1=[1:64; missings(64); 129:200],
                  ID2=[rl(64); missings(64); rl(200 - 128)],
                  Job=rand(["Doctor", "Florist"], 200)),
        # here the runs of missings in ID1 and ID2 are offset by one row
        DataFrame(ID1=[1:63; missings(65); 129:200],
                  ID2=[rl(64); missings(65); rl(200 - 129)],
                  Job=rand(["Doctor", "Florist"], 200)),
        DataFrame(ID1=rand([1:100; missing], 10000),
                  ID2=rand(["a", "b", "c", missing], 10000),
                  Job=rand(["Doctor", "Florist", "Farmer", missing], 10000)),
    ]
    k = [:ID1, :ID2]
    for name in names2, job in jobs2
        check_notequal(name, job, k)
    end
end

end # module

0 comments on commit 5d8e52b

Please sign in to comment.