diff --git a/NEWS.md b/NEWS.md index 08d63e327e..20ed277ce7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,14 @@ # DataFrames.jl changes on main since last release notes +## New functionalities + +* add option `matchmissing=:notequal` in joins; + in `leftjoin`, `semijoin` and `antijoin` missings are dropped in right data frame, + but preserved in left; in `rightjoin` missings are dropped in left data frame, + but preserved in right; in `innerjoin` missings are dropped in both data frames; + in `outerjoin` this value of keyword argument is not supported + ([#2724](/~https://github.com/JuliaData/DataFrames.jl/pull/2724)) + ## Bug fixes * fix bug in how `issorted` handles custom orderings and improve performance diff --git a/src/join/composer.jl b/src/join/composer.jl index 9138eabe51..8c64a82959 100644 --- a/src/join/composer.jl +++ b/src/join/composer.jl @@ -23,7 +23,8 @@ struct DataFrameJoiner function DataFrameJoiner(dfl::AbstractDataFrame, dfr::AbstractDataFrame, on::Union{<:OnType, AbstractVector}, - matchmissing::Symbol) + matchmissing::Symbol, + kind::Symbol) on_cols = isa(on, AbstractVector) ? 
on : [on] left_on = Symbol[] right_on = Symbol[] @@ -45,9 +46,25 @@ struct DataFrameJoiner "Symbol or Pair{Symbol, Symbol}.")) end end - dfl_on = dfl[!, left_on] - dfr_on = dfr[!, right_on] - + + if matchmissing === :notequal + if kind in (:left, :semi, :anti) + dfr = dropmissing(dfr, right_on, view=true) + elseif kind === :right + dfl = dropmissing(dfl, left_on, view=true) + elseif kind === :inner + # it is possible to drop only left or right df + # to gain some performance but needs more testing, see #2724 + dfl = dropmissing(dfl, left_on, view=true) + dfr = dropmissing(dfr, right_on, view=true) + elseif kind === :outer + throw(ArgumentError("matchmissing == :notequal for `outerjoin` is not allowed")) + else + throw(ArgumentError("matchmissing == :notequal not implemented for kind == :$kind")) + end + end + dfl_on = select(dfl, left_on, copycols=false) + dfr_on = select(dfr, right_on, copycols=false) if matchmissing === :error for df in (dfl_on, dfr_on), col in eachcol(df) if any(ismissing, col) @@ -55,10 +72,9 @@ struct DataFrameJoiner "when matchmissing == :error")) end end - elseif matchmissing !== :equal - throw(ArgumentError("matchmissing allows only :error or :equal")) + elseif !(matchmissing in (:equal, :notequal)) + throw(ArgumentError("matchmissing allows only :error, :equal, or :notequal")) end - for df in (dfl_on, dfr_on), col in eachcol(df) if any(x -> (x isa Union{Complex, Real}) && (isnan(x) || isequal(real(x), -0.0) || isequal(imag(x), -0.0)), col) @@ -311,7 +327,7 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; throw(ArgumentError("Missing join argument 'on'.")) end - joiner = DataFrameJoiner(df1, df2, on, matchmissing) + joiner = DataFrameJoiner(df1, df2, on, matchmissing, kind) # Check merge key validity left_invalid = validate[1] ? any(nonunique(joiner.dfl, joiner.left_on)) : false @@ -485,7 +501,8 @@ change in future releases. data frame and left unchanged. 
- `matchmissing` : if equal to `:error` throw an error if `missing` is present in `on` columns; if equal to `:equal` then `missing` is allowed and missings are - matched (`isequal` is used for comparisons of rows for equality) + matched; if equal to `:notequal` then missings are dropped in `df1` and `df2` + `on` columns; `isequal` is used for comparisons of rows for equality It is not allowed to join on columns that contain `NaN` or `-0.0` in real or imaginary part of the number. If you need to perform a join on such values use @@ -626,7 +643,8 @@ change in future releases. data frame and left unchanged. - `matchmissing` : if equal to `:error` throw an error if `missing` is present in `on` columns; if equal to `:equal` then `missing` is allowed and missings are - matched (`isequal` is used for comparisons of rows for equality) + matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns; + `isequal` is used for comparisons of rows for equality All columns of the returned data table will support missing values. @@ -772,7 +790,8 @@ change in future releases. data frame and left unchanged. - `matchmissing` : if equal to `:error` throw an error if `missing` is present in `on` columns; if equal to `:equal` then `missing` is allowed and missings are - matched (`isequal` is used for comparisons of rows for equality) + matched; if equal to `:notequal` then missings are dropped in `df1` `on` columns; + `isequal` is used for comparisons of rows for equality All columns of the returned data table will support missing values. @@ -923,7 +942,7 @@ This behavior may change in future releases. data frame and left unchanged. 
- `matchmissing` : if equal to `:error` throw an error if `missing` is present in `on` columns; if equal to `:equal` then `missing` is allowed and missings are - matched (`isequal` is used for comparisons of rows for equality) + matched; `isequal` is used for comparisons of rows for equality All columns of the returned data table will support missing values. @@ -1071,7 +1090,8 @@ The order of rows in the result is undefined and may change in the future releas By default no check is performed. - `matchmissing` : if equal to `:error` throw an error if `missing` is present in `on` columns; if equal to `:equal` then `missing` is allowed and missings are - matched (`isequal` is used for comparisons of rows for equality) + matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns; + `isequal` is used for comparisons of rows for equality It is not allowed to join on columns that contain `NaN` or `-0.0` in real or imaginary part of the number. If you need to perform a join on such values use @@ -1176,7 +1196,8 @@ The order of rows in the result is undefined and may change in the future releas By default no check is performed. - `matchmissing` : if equal to `:error` throw an error if `missing` is present in `on` columns; if equal to `:equal` then `missing` is allowed and missings are - matched (`isequal` is used for comparisons of rows for equality) + matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns; + `isequal` is used for comparisons of rows for equality It is not allowed to join on columns that contain `NaN` or `-0.0` in real or imaginary part of the number. 
If you need to perform a join on such values use diff --git a/test/join.jl b/test/join.jl index 68afbbc4e8..d16050436f 100644 --- a/test/join.jl +++ b/test/join.jl @@ -32,6 +32,8 @@ anti = left[Bool[ismissing(x) for x in left.Job], [:ID, :Name]] @test_throws ArgumentError innerjoin(name, job) @test_throws ArgumentError innerjoin(name, job, on = :ID, matchmissing=:errors) + @test_throws ArgumentError innerjoin(name, job, on = :ID, matchmissing=:weirdmatch) + @test_throws ArgumentError outerjoin(name, job, on = :ID, matchmissing=:notequal) @test innerjoin(name, job, on = :ID) == inner @test outerjoin(name, job, on = :ID) ≅ outer @@ -1557,4 +1559,143 @@ end c="c", d="d") end +@testset "matchmissing :notequal correctness" begin + Random.seed!(1337) + names = [ + DataFrame(ID=[1, 2, missing], + Name=["John Doe", "Jane Doe", "Joe Blogs"]), + DataFrame(ID=[], + Name=[]), + DataFrame(ID=missings(3), + Name=["John Doe", "Jane Doe", "Joe Blogs"]), + DataFrame(ID=[1, 2, 3], + Name=[missing, "Jane Doe", missing]), + DataFrame(ID=[1:100; missings(100)], + Name=repeat(["Jane Doe"], 200)), + DataFrame(ID=[missings(100); 1:100], + Name=repeat(["Jane Doe"], 200)), + DataFrame(ID=[1:50; missings(100); 51:100], + Name=repeat(["Jane Doe"], 200)), + DataFrame(ID=[1:64; missings(64); 129:200], + Name=repeat(["Jane Doe"], 200)), + DataFrame(ID=[1:63; missings(65); 129:200], + Name=repeat(["Jane Doe"], 200)), + DataFrame(ID=rand([1:1000; missing], 10000), + Name=rand(["John Doe", "Jane Doe", "Joe Blogs", missing], 10000)), + ] + jobs = [ + DataFrame(ID=[1, 2, 2, 4], + Job=["Lawyer", "Doctor", "Florist", "Farmer"]), + DataFrame(ID=[missing, 2, 2, 4], + Job=["Lawyer", "Doctor", "Florist", "Farmer"]), + DataFrame(ID=[missing, 2, 2, 4], + Job=["Lawyer", "Doctor", missing, "Farmer"]), + DataFrame(ID=[], + Job=[]), + DataFrame(ID=[1:100; missings(100)], + Job=repeat(["Lawyer"], 200)), + DataFrame(ID=[missings(100); 1:100], + Job=repeat(["Lawyer"], 200)), + DataFrame(ID=[1:50; missings(100); 
51:100], + Job=repeat(["Lawyer"], 200)), + DataFrame(ID=[1:64; missings(64); 129:200], + Job=repeat(["Lawyer"], 200)), + DataFrame(ID=[1:63; missings(65); 129:200], + Job=repeat(["Lawyer"], 200)), + DataFrame(ID=rand([1:1000; missing], 10000), + Job=rand(["Lawyer", "Doctor", "Florist", missing], 10000)), + ] + for name in names, job in jobs + @test leftjoin(name, dropmissing(job, :ID), on=:ID, matchmissing=:equal) ≅ + leftjoin(name, job, on=:ID, matchmissing=:notequal) + @test semijoin(name, dropmissing(job, :ID), on=:ID, matchmissing=:equal) ≅ + semijoin(name, job, on=:ID, matchmissing=:notequal) + @test antijoin(name, dropmissing(job, :ID), on=:ID, matchmissing=:equal) ≅ + antijoin(name, job, on=:ID, matchmissing=:notequal) + @test rightjoin(dropmissing(name, :ID), job, on=:ID, matchmissing=:equal) ≅ + rightjoin(name, job, on=:ID, matchmissing=:notequal) + @test innerjoin(dropmissing(name, :ID), dropmissing(job, :ID), on=:ID, matchmissing=:equal) ≅ + innerjoin(name, job, on=:ID, matchmissing=:notequal) + end + + rl(n) = rand(["a", "b", "c"], n) + names2 = [ + DataFrame(ID1=[1, 1, 2], + ID2=["a", "b", "a"], + Name=["John Doe", "Jane Doe", "Joe Blogs"]), + DataFrame(ID1=[1, 1, 2, missing], + ID2=["a", "b", "a", missing], + Name=["John Doe", "Jane Doe", "Joe Blogs", missing]), + DataFrame(ID1=[missing, 1, 2, missing], + ID2=["a", "b", missing, missing], + Name=[missing, "Jane Doe", "Joe Blogs", missing]), + DataFrame(ID1=[missing, 1, 2, missing], + ID2=["a", "b", missing, missing], + Name=missings(4)), + DataFrame(ID1=[missing, 1, 2, missing], + ID2=missings(4), + Name=["John Doe", "Jane Doe", "Joe Blogs", missing]), + DataFrame(ID1=[1:100; missings(100)], + ID2=[rl(100); missings(100)], + Name=rand(["Jane Doe", "Jane Doe"], 200)), + DataFrame(ID1=[missings(100); 1:100], + ID2=[missings(100); rl(100)], + Name=rand(["Jane Doe", "Jane Doe"], 200)), + DataFrame(ID1=[1:50; missings(100); 51:100], + ID2=[rl(50); missings(100); rl(50)], + Name=rand(["Jane Doe", "Jane 
Doe"], 200)), + DataFrame(ID1=[1:64; missings(64); 129:200], + ID2=[rl(64); missings(64); rl(200 - 128)], + Name=rand(["Jane Doe", "Jane Doe"], 200)), + DataFrame(ID1=[1:63; missings(65); 129:200], + ID2=[rl(64); missings(65); rl(200 - 129)], + Name=rand(["Jane Doe", "Jane Doe"], 200)), + DataFrame(ID1=rand([1:100; missing], 10000), + ID2=rand(["a", "b", "c", missing], 10000), + Name=rand(["John Doe", "Jane Doe", "Joe Blogs", missing], 10000)), + ] + jobs2 = [ + DataFrame(ID1=[1, 2, 2, 4], + ID2=["a", "b", "b", "c"], + Job=["Lawyer", "Doctor", "Florist", "Farmer"]), + DataFrame(ID1=[1, 2, 2, 4, missing], + ID2=["a", "b", "b", "c", missing], + Job=["Lawyer", "Doctor", "Florist", "Farmer", missing]), + DataFrame(ID1=[1, 2, missing, 4, missing], + ID2=["a", "b", missing, "c", missing], + Job=[missing, "Doctor", "Florist", "Farmer", missing]), + DataFrame(ID1=[1:100; missings(100)], + ID2=[rl(100); missings(100)], + Job=rand(["Doctor", "Florist"], 200)), + DataFrame(ID1=[missings(100); 1:100], + ID2=[missings(100); rl(100)], + Job=rand(["Doctor", "Florist"], 200)), + DataFrame(ID1=[1:50; missings(100); 51:100], + ID2=[rl(50); missings(100); rl(50)], + Job=rand(["Doctor", "Florist"], 200)), + DataFrame(ID1=[1:64; missings(64); 129:200], + ID2=[rl(64); missings(64); rl(200 - 128)], + Job=rand(["Doctor", "Florist"], 200)), + DataFrame(ID1=[1:63; missings(65); 129:200], + ID2=[rl(64); missings(65); rl(200 - 129)], + Job=rand(["Doctor", "Florist"], 200)), + DataFrame(ID1=rand([1:100; missing], 10000), + ID2=rand(["a", "b", "c", missing], 10000), + Job=rand(["Doctor", "Florist", "Farmer", missing], 10000)), + ] + k = [:ID1, :ID2] + for name in names2, job in jobs2 + @test leftjoin(name, dropmissing(job, k), on=k, matchmissing=:equal) ≅ + leftjoin(name, job, on=k, matchmissing=:notequal) + @test semijoin(name, dropmissing(job, k), on=k, matchmissing=:equal) ≅ + semijoin(name, job, on=k, matchmissing=:notequal) + @test antijoin(name, dropmissing(job, k), on=k, 
matchmissing=:equal) ≅ + antijoin(name, job, on=k, matchmissing=:notequal) + @test rightjoin(dropmissing(name, k), job, on=k, matchmissing=:equal) ≅ + rightjoin(name, job, on=k, matchmissing=:notequal) + @test innerjoin(dropmissing(name, k), dropmissing(job, k), on=k, matchmissing=:equal) ≅ + innerjoin(name, job, on=k, matchmissing=:notequal) + end +end + end # module