Matchmissing == :notequal (#2724)

JuliaData · Jun 3, 2021 · 5d8e52b · 5d8e52b
1 parent f56982d
commit 5d8e52b
Show file tree

Hide file tree

Showing 3 changed files with 185 additions and 14 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,14 @@
 # DataFrames.jl changes on main since last release notes
 
+## New functionalities
+
+* add option `matchmissing=:notequal` in joins;
+  in `leftjoin`, `semijoin` and `antijoin` rows with missings in the `on` columns are
+  dropped from the right data frame, but preserved in the left one; in `rightjoin` they
+  are dropped from the left data frame, but preserved in the right one; in `innerjoin`
+  they are dropped from both data frames; `outerjoin` does not support this value
+  ([#2724](/~https://github.com/JuliaData/DataFrames.jl/pull/2724))
+
 ## Bug fixes
 
 * fix bug in how `issorted` handles custom orderings and improve performance

diff --git a/src/join/composer.jl b/src/join/composer.jl
@@ -23,7 +23,8 @@ struct DataFrameJoiner
 
     function DataFrameJoiner(dfl::AbstractDataFrame, dfr::AbstractDataFrame,
                              on::Union{<:OnType, AbstractVector},
-                             matchmissing::Symbol)
+                             matchmissing::Symbol,
+                             kind::Symbol)
         on_cols = isa(on, AbstractVector) ? on : [on]
         left_on = Symbol[]
         right_on = Symbol[]
@@ -45,20 +46,35 @@ struct DataFrameJoiner
                                     "Symbol or Pair{Symbol, Symbol}."))
             end
         end
-        dfl_on = dfl[!, left_on]
-        dfr_on = dfr[!, right_on]
-
+
+        if matchmissing === :notequal
+            if kind in (:left, :semi, :anti)
+                dfr = dropmissing(dfr, right_on, view=true)
+            elseif kind === :right
+                dfl = dropmissing(dfl, left_on, view=true)
+            elseif kind === :inner
+                # it is possible to drop missings from only the left or the right df
+                # to gain some performance, but this needs more testing, see #2724
+                dfl = dropmissing(dfl, left_on, view=true)
+                dfr = dropmissing(dfr, right_on, view=true)
+            elseif kind === :outer
+                throw(ArgumentError("matchmissing == :notequal for `outerjoin` is not allowed"))
+            else
+                throw(ArgumentError("matchmissing == :notequal not implemented for kind == :$kind"))
+            end
+        end
+        dfl_on = select(dfl, left_on, copycols=false)
+        dfr_on = select(dfr, right_on, copycols=false)
         if matchmissing === :error
             for df in (dfl_on, dfr_on), col in eachcol(df)
                 if any(ismissing, col)
                     throw(ArgumentError("missing values in key columns are not allowed " *
                                         "when matchmissing == :error"))
                 end
             end
-        elseif matchmissing !== :equal
-            throw(ArgumentError("matchmissing allows only :error or :equal"))
+        elseif !(matchmissing in (:equal, :notequal))
+            throw(ArgumentError("matchmissing allows only :error, :equal, or :notequal"))
         end
-
         for df in (dfl_on, dfr_on), col in eachcol(df)
             if any(x -> (x isa Union{Complex, Real}) &&
                         (isnan(x) || isequal(real(x), -0.0) || isequal(imag(x), -0.0)), col)
@@ -311,7 +327,7 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame;
         throw(ArgumentError("Missing join argument 'on'."))
     end
 
-    joiner = DataFrameJoiner(df1, df2, on, matchmissing)
+    joiner = DataFrameJoiner(df1, df2, on, matchmissing, kind)
 
     # Check merge key validity
     left_invalid = validate[1] ? any(nonunique(joiner.dfl, joiner.left_on)) : false
@@ -485,7 +501,8 @@ change in future releases.
   data frame and left unchanged.
 - `matchmissing` : if equal to `:error` throw an error if `missing` is present
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
-  matched (`isequal` is used for comparisons of rows for equality)
+  matched; if equal to `:notequal` then rows with `missing` in the `on` columns are
+  dropped from `df1` and `df2`; `isequal` is used for comparisons of rows for equality
 
 It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
 imaginary part of the number. If you need to perform a join on such values use
@@ -626,7 +643,8 @@ change in future releases.
   data frame and left unchanged.
 - `matchmissing` : if equal to `:error` throw an error if `missing` is present
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
-  matched (`isequal` is used for comparisons of rows for equality)
+  matched; if equal to `:notequal` then rows with `missing` in the `on` columns are
+  dropped from `df2`; `isequal` is used for comparisons of rows for equality
 
 All columns of the returned data table will support missing values.
 
@@ -772,7 +790,8 @@ change in future releases.
   data frame and left unchanged.
 - `matchmissing` : if equal to `:error` throw an error if `missing` is present
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
-  matched (`isequal` is used for comparisons of rows for equality)
+  matched; if equal to `:notequal` then rows with `missing` in the `on` columns are
+  dropped from `df1`; `isequal` is used for comparisons of rows for equality
 
 All columns of the returned data table will support missing values.
 
@@ -923,7 +942,7 @@ This behavior may change in future releases.
   data frame and left unchanged.
 - `matchmissing` : if equal to `:error` throw an error if `missing` is present
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
-  matched (`isequal` is used for comparisons of rows for equality)
+  matched; `:notequal` is not allowed; `isequal` is used to compare rows for equality
 
 All columns of the returned data table will support missing values.
 
@@ -1071,7 +1090,8 @@ The order of rows in the result is undefined and may change in the future releas
    By default no check is performed.
 - `matchmissing` : if equal to `:error` throw an error if `missing` is present
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
-  matched (`isequal` is used for comparisons of rows for equality)
+  matched; if equal to `:notequal` then rows with `missing` in the `on` columns are
+  dropped from `df2`; `isequal` is used for comparisons of rows for equality
 
 It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
 imaginary part of the number. If you need to perform a join on such values use
@@ -1176,7 +1196,8 @@ The order of rows in the result is undefined and may change in the future releas
    By default no check is performed.
 - `matchmissing` : if equal to `:error` throw an error if `missing` is present
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
-  matched (`isequal` is used for comparisons of rows for equality)
+  matched; if equal to `:notequal` then rows with `missing` in the `on` columns are
+  dropped from `df2`; `isequal` is used for comparisons of rows for equality
 
 It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
 imaginary part of the number. If you need to perform a join on such values use

diff --git a/test/join.jl b/test/join.jl
@@ -32,6 +32,8 @@ anti = left[Bool[ismissing(x) for x in left.Job], [:ID, :Name]]
 
     @test_throws ArgumentError innerjoin(name, job)
     @test_throws ArgumentError innerjoin(name, job, on = :ID, matchmissing=:errors)
+    @test_throws ArgumentError innerjoin(name, job, on = :ID, matchmissing=:weirdmatch)
+    @test_throws ArgumentError outerjoin(name, job, on = :ID, matchmissing=:notequal)
 
     @test innerjoin(name, job, on = :ID) == inner
     @test outerjoin(name, job, on = :ID) ≅ outer
@@ -1557,4 +1559,143 @@ end
                     c="c", d="d")
 end
 
+@testset "matchmissing :notequal correctness" begin
+    Random.seed!(1337)
+    names = [
+        DataFrame(ID=[1, 2, missing],
+                  Name=["John Doe", "Jane Doe", "Joe Blogs"]),
+        DataFrame(ID=[],
+                  Name=[]),
+        DataFrame(ID=missings(3),
+                  Name=["John Doe", "Jane Doe", "Joe Blogs"]),
+        DataFrame(ID=[1, 2, 3],
+                  Name=[missing, "Jane Doe", missing]),
+        DataFrame(ID=[1:100; missings(100)],
+                  Name=repeat(["Jane Doe"], 200)),
+        DataFrame(ID=[missings(100); 1:100],
+                  Name=repeat(["Jane Doe"], 200)),
+        DataFrame(ID=[1:50; missings(100); 51:100],
+                  Name=repeat(["Jane Doe"], 200)),
+        DataFrame(ID=[1:64; missings(64); 129:200],
+                  Name=repeat(["Jane Doe"], 200)),
+        DataFrame(ID=[1:63; missings(65); 129:200],
+                  Name=repeat(["Jane Doe"], 200)),
+        DataFrame(ID=rand([1:1000; missing], 10000),
+                  Name=rand(["John Doe", "Jane Doe", "Joe Blogs", missing], 10000)),
+    ]
+    jobs = [
+        DataFrame(ID=[1, 2, 2, 4],
+                  Job=["Lawyer", "Doctor", "Florist", "Farmer"]),
+        DataFrame(ID=[missing, 2, 2, 4],
+                  Job=["Lawyer", "Doctor", "Florist", "Farmer"]),
+        DataFrame(ID=[missing, 2, 2, 4],
+                  Job=["Lawyer", "Doctor", missing, "Farmer"]),
+        DataFrame(ID=[],
+                  Job=[]),
+        DataFrame(ID=[1:100; missings(100)],
+                  Job=repeat(["Lawyer"], 200)),
+        DataFrame(ID=[missings(100); 1:100],
+                  Job=repeat(["Lawyer"], 200)),
+        DataFrame(ID=[1:50; missings(100); 51:100],
+                  Job=repeat(["Lawyer"], 200)),
+        DataFrame(ID=[1:64; missings(64); 129:200],
+                  Job=repeat(["Lawyer"], 200)),
+        DataFrame(ID=[1:63; missings(65); 129:200],
+                  Job=repeat(["Lawyer"], 200)),
+        DataFrame(ID=rand([1:1000; missing], 10000),
+                  Job=rand(["Lawyer", "Doctor", "Florist", missing], 10000)),
+    ]
+    for name in names, job in jobs
+        @test leftjoin(name, dropmissing(job, :ID), on=:ID, matchmissing=:equal) ≅
+            leftjoin(name, job, on=:ID, matchmissing=:notequal)
+        @test semijoin(name, dropmissing(job, :ID), on=:ID, matchmissing=:equal) ≅
+            semijoin(name, job, on=:ID, matchmissing=:notequal)
+        @test antijoin(name, dropmissing(job, :ID), on=:ID, matchmissing=:equal) ≅
+            antijoin(name, job, on=:ID, matchmissing=:notequal)
+        @test rightjoin(dropmissing(name, :ID), job, on=:ID, matchmissing=:equal) ≅
+            rightjoin(name, job, on=:ID, matchmissing=:notequal)
+        @test innerjoin(dropmissing(name, :ID), dropmissing(job, :ID), on=:ID, matchmissing=:equal) ≅
+            innerjoin(name, job, on=:ID, matchmissing=:notequal)
+    end
+
+    rl(n) = rand(["a", "b", "c"], n)
+    names2 = [
+        DataFrame(ID1=[1, 1, 2],
+                  ID2=["a", "b", "a"],
+                  Name=["John Doe", "Jane Doe", "Joe Blogs"]),
+        DataFrame(ID1=[1, 1, 2, missing],
+                  ID2=["a", "b", "a", missing],
+                  Name=["John Doe", "Jane Doe", "Joe Blogs", missing]),
+        DataFrame(ID1=[missing, 1, 2, missing],
+                  ID2=["a", "b", missing, missing],
+                  Name=[missing, "Jane Doe", "Joe Blogs", missing]),
+        DataFrame(ID1=[missing, 1, 2, missing],
+                  ID2=["a", "b", missing, missing],
+                  Name=missings(4)),
+        DataFrame(ID1=[missing, 1, 2, missing],
+                  ID2=missings(4),
+                  Name=["John Doe", "Jane Doe", "Joe Blogs", missing]),
+        DataFrame(ID1=[1:100; missings(100)],
+                  ID2=[rl(100); missings(100)],
+                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
+        DataFrame(ID1=[missings(100); 1:100],
+                  ID2=[missings(100); rl(100)],
+                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
+        DataFrame(ID1=[1:50; missings(100); 51:100],
+                  ID2=[rl(50); missings(100); rl(50)],
+                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
+        DataFrame(ID1=[1:64; missings(64); 129:200],
+                  ID2=[rl(64); missings(64); rl(200 - 128)],
+                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
+        DataFrame(ID1=[1:63; missings(65); 129:200],
+                  ID2=[rl(64); missings(65); rl(200 - 129)],
+                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
+        DataFrame(ID1=rand([1:100; missing], 10000),
+                  ID2=rand(["a", "b", "c", missing], 10000),
+                  Name=rand(["John Doe", "Jane Doe", "Joe Blogs", missing], 10000)),
+    ]
+    jobs2 = [
+        DataFrame(ID1=[1, 2, 2, 4],
+                  ID2=["a", "b", "b", "c"],
+                  Job=["Lawyer", "Doctor", "Florist", "Farmer"]),
+        DataFrame(ID1=[1, 2, 2, 4, missing],
+                  ID2=["a", "b", "b", "c", missing],
+                  Job=["Lawyer", "Doctor", "Florist", "Farmer", missing]),
+        DataFrame(ID1=[1, 2, missing, 4, missing],
+                  ID2=["a", "b", missing, "c", missing],
+                  Job=[missing, "Doctor", "Florist", "Farmer", missing]),
+        DataFrame(ID1=[1:100; missings(100)],
+                  ID2=[rl(100); missings(100)],
+                  Job=rand(["Doctor", "Florist"], 200)),
+        DataFrame(ID1=[missings(100); 1:100],
+                  ID2=[missings(100); rl(100)],
+                  Job=rand(["Doctor", "Florist"], 200)),
+        DataFrame(ID1=[1:50; missings(100); 51:100],
+                  ID2=[rl(50); missings(100); rl(50)],
+                  Job=rand(["Doctor", "Florist"], 200)),
+        DataFrame(ID1=[1:64; missings(64); 129:200],
+                  ID2=[rl(64); missings(64); rl(200 - 128)],
+                  Job=rand(["Doctor", "Florist"], 200)),
+        DataFrame(ID1=[1:63; missings(65); 129:200],
+                  ID2=[rl(64); missings(65); rl(200 - 129)],
+                  Job=rand(["Doctor", "Florist"], 200)),
+        DataFrame(ID1=rand([1:100; missing], 10000),
+                  ID2=rand(["a", "b", "c", missing], 10000),
+                  Job=rand(["Doctor", "Florist", "Farmer", missing], 10000)),
+    ]
+    k = [:ID1, :ID2]
+    for name in names2, job in jobs2
+        @test leftjoin(name, dropmissing(job, k), on=k, matchmissing=:equal) ≅
+            leftjoin(name, job, on=k, matchmissing=:notequal)
+        @test semijoin(name, dropmissing(job, k), on=k, matchmissing=:equal) ≅
+            semijoin(name, job, on=k, matchmissing=:notequal)
+        @test antijoin(name, dropmissing(job, k), on=k, matchmissing=:equal) ≅
+            antijoin(name, job, on=k, matchmissing=:notequal)
+        @test rightjoin(dropmissing(name, k), job, on=k, matchmissing=:equal) ≅
+            rightjoin(name, job, on=k, matchmissing=:notequal)
+        @test innerjoin(dropmissing(name, k), dropmissing(job, k), on=k, matchmissing=:equal) ≅
+            innerjoin(name, job, on=k, matchmissing=:notequal)
+    end
+end
+
 end # module