From 4eaa0be3a6902d2653ad765b93fd552fa202c437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 25 Nov 2022 17:20:09 +0100 Subject: [PATCH 1/4] add allunique --- src/abstractdataframe/abstractdataframe.jl | 43 ++++++++++++++++++++++ test/dataframe.jl | 19 ++++++++++ 2 files changed, 62 insertions(+) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index a630faada8..613cbc32fd 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -1431,6 +1431,49 @@ function nonunique(df::AbstractDataFrame, cols) end end +""" + allunique(df::AbstractDataFrame, cols=:) + +Return `true` if none of the rows of `df` are duplicated. Two rows are duplicates if +all columns selected by `cols` contain equal values (according to `isequal`). + +See also [`unique`](@ref) and [`nonunique`](@ref). + +# Arguments +- `df` : `AbstractDataFrame` +- `cols` : a selector specifying the column(s) or their transformations to compare. + Can be any column selector or transformation accepted by [`select`](@ref). 
+ +# Examples + +```jldoctest +julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2]) +4×2 DataFrame + Row │ i x + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 1 + 4 │ 4 2 + +julia> allunique(df) +true + +julia> allunique(df, :x) +false + +julia> allunique(df, :i => ByRow(isodd)) +false +``` +""" +function Base.allunique(df::AbstractDataFrame, cols=:) + udf = select(df, cols, copycols=false) + nrow(udf) == 0 && return true + return row_group_slots(ntuple(i -> udf[!, i], ncol(udf)), + Val(false), nothing, false, nothing)[1] == nrow(df) +end + """ unique(df::AbstractDataFrame; view::Bool=false) unique(df::AbstractDataFrame, cols; view::Bool=false) diff --git a/test/dataframe.jl b/test/dataframe.jl index 27a5108fb3..93eb834226 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -2254,4 +2254,23 @@ end @test !isempty(DataFrame(a=1)) end +@testset "allunique" begin + refdf = DataFrame(a=[1, 1, 2, 2, 3], b=[1, 2, 1, 2, 3], c=[1, 2, 1, 2, 3]) + for df in (refdf[1:4, 1:2], view(refdf, 1:4, 1:2)) + @test allunique(df) + @test !allunique(df, 1) + @test !allunique(df, :b) + @test allunique(df, All()) + @test allunique(df, []) + + if df isa DataFrame + @test allunique(df, x -> 1:4) + @test allunique(df, [:a, :b] => ByRow(string)) + else + @test_throws ArgumentError allunique(df, x -> 1:4) + @test_throws ArgumentError allunique(df, [:a, :b] => ByRow(string)) + end + end +end + end # module From 76e921dcc66b8f79a6bac592cf8b3142ee9122ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 25 Nov 2022 17:20:46 +0100 Subject: [PATCH 2/4] add manual entry --- docs/src/lib/functions.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 78b4b289af..c0beb73f2d 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -146,6 +146,7 @@ valuecols ## Filtering rows ```@docs +allunique deleteat! empty empty! 
From 37d09c3112502b11dac2aba3032683c1882a72d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 28 Nov 2022 14:26:54 +0100 Subject: [PATCH 3/4] allow transformation of SubDataFrame --- NEWS.md | 6 ++++++ src/abstractdataframe/abstractdataframe.jl | 13 ++++++----- src/dataframe/dataframe.jl | 5 +++++ src/subdataframe/subdataframe.jl | 13 +++++++++++ test/dataframe.jl | 25 +++++++++++++++------- 5 files changed, 47 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index bda4d27d90..9c721efb30 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# DataFrames.jl v1.5 Release Notes + +* add `allunique` and allow transformations in `cols` argument of `describe` + and `nonunique` when working with `SubDataFrame` + ([3232](https://github.com/JuliaData/DataFrames.jl/pull/3232)) + # DataFrames.jl v1.4.3 Patch Release Notes ## Bug fixes diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 613cbc32fd..a0ff001bc0 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -449,8 +449,8 @@ $METADATA_FIXED """ function Base.similar(df::AbstractDataFrame, rows::Integer = size(df, 1)) rows < 0 && throw(ArgumentError("the number of rows must be non-negative")) - out_df = DataFrame(AbstractVector[similar(x, rows) for x in eachcol(df)], copy(index(df)), - copycols=false) + out_df = DataFrame(AbstractVector[similar(x, rows) for x in eachcol(df)], + copy(index(df)), copycols=false) _copy_all_note_metadata!(out_df, df) return out_df end @@ -565,7 +565,6 @@ $METADATA_FIXED @inline Base.last(df::AbstractDataFrame, n::Integer; view::Bool=false) = view ? 
Base.view(df, max(1, nrow(df)-n+1):nrow(df), :) : df[max(1, nrow(df)-n+1):nrow(df), :] - """ describe(df::AbstractDataFrame; cols=:) describe(df::AbstractDataFrame, stats::Union{Symbol, Pair}...; cols=:) @@ -656,10 +655,10 @@ julia> describe(df, :min, sum => :sum, cols=:x) DataAPI.describe(df::AbstractDataFrame, stats::Union{Symbol, Pair{<:Base.Callable, <:SymbolOrString}}...; cols=:) = - _describe(select(df, cols, copycols=false), Any[s for s in stats]) + _describe(_try_select_no_copy(df, cols), Any[s for s in stats]) DataAPI.describe(df::AbstractDataFrame; cols=:) = - _describe(select(df, cols, copycols=false), + _describe(_try_select_no_copy(df, cols), Any[:mean, :min, :median, :max, :nmissing, :eltype]) function _describe(df::AbstractDataFrame, stats::AbstractVector) @@ -1422,7 +1421,7 @@ function nonunique(df::AbstractDataFrame) end function nonunique(df::AbstractDataFrame, cols) - udf = select(df, cols, copycols=false) + udf = _try_select_no_copy(df, cols) if ncol(df) > 0 && ncol(udf) == 0 throw(ArgumentError("finding duplicate rows in data frame when " * "`cols` selects no columns is not allowed")) @@ -1468,7 +1467,7 @@ false ``` """ function Base.allunique(df::AbstractDataFrame, cols=:) - udf = select(df, cols, copycols=false) + udf = _try_select_no_copy(df, cols) nrow(udf) == 0 && return true return row_group_slots(ntuple(i -> udf[!, i], ncol(udf)), Val(false), nothing, false, nothing)[1] == nrow(df) diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 7aa2faa131..86c6cc613d 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -1546,3 +1546,8 @@ function allcombinations(::Type{DataFrame}, pairs::Pair{Symbol, <:Any}...) 
@assert size(out_df) == (target_rows, length(colnames)) return out_df end + +# _try_select_no_copy selects cols from df; it tries to avoid copying data if possible; +# for SubDataFrame if cols is not a simple column selector then copying is needed +_try_select_no_copy(df::DataFrame, cols) = select(df, cols, copycols=false) + diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index e27f5a88d2..097ffd73d3 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -372,3 +372,16 @@ function _replace_columns!(sdf::SubDataFrame, newdf::DataFrame; keep_present::Bo return sdf end + +function _try_select_no_copy(sdf::SubDataFrame, cols) + # try is needed here as `cols` could be AbstractVector in which case + # it is not possible to statically check if it is a valid column selector + colsidx = try + index(sdf)[cols] + catch + nothing + end + + return isnothing(colsidx) ? select(sdf, cols) : select(sdf, colsidx, copycols=false) +end + diff --git a/test/dataframe.jl b/test/dataframe.jl index 93eb834226..539f24b3dd 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -2262,15 +2262,24 @@ end @test !allunique(df, :b) @test allunique(df, All()) @test allunique(df, []) - - if df isa DataFrame - @test allunique(df, x -> 1:4) - @test allunique(df, [:a, :b] => ByRow(string)) - else - @test_throws ArgumentError allunique(df, x -> 1:4) - @test_throws ArgumentError allunique(df, [:a, :b] => ByRow(string)) - end + @test allunique(df, x -> 1:4) + @test allunique(df, [:a, :b] => ByRow(string)) end end +@testset "extra tests describe, nonunique, allunique for SubDataFrame" begin + refdf = DataFrame(a=[1, 1, 2, 2, 3], b=[1, 2, 1, 2, 3], c=[1, 2, 1, 2, 3]) + sdf = @view refdf[1:4, 1:2] + @test describe(sdf, cols=:a => ByRow(string)) == + DataFrame(variable=:a_string, mean=nothing, min="1", + median=nothing, max="2", nmissing=0, eltype=String) + @test describe(sdf, :min, :max, cols=x -> DataFrame(x=11:14)) == + 
DataFrame(variable=:x, min=11, max=14) + @test nonunique(sdf, x->[1, 1, 2, 2]) == [false, true, false, true] + @test nonunique(sdf, :a => x -> true) == [false, true, true, true] + @test !allunique(sdf, x -> [1, 1, 2, 2]) + @test allunique(sdf, :a => x -> 1:4) + @test !allunique(sdf, :a => x -> true) +end + end # module From 29785c00cf977cc9648ae99a4d839580215abffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 29 Nov 2022 17:16:44 +0100 Subject: [PATCH 4/4] move comment to a better place --- src/dataframe/dataframe.jl | 2 -- src/subdataframe/subdataframe.jl | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 86c6cc613d..b4b11b1b3a 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -1547,7 +1547,5 @@ function allcombinations(::Type{DataFrame}, pairs::Pair{Symbol, <:Any}...) return out_df end -# _try_select_no_copy selects cols from df; it tries to avoid copying data if possible; -# for SubDataFrame if cols is not a simple column selector then copying is needed _try_select_no_copy(df::DataFrame, cols) = select(df, cols, copycols=false) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index 097ffd73d3..8ba7538084 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -373,6 +373,8 @@ function _replace_columns!(sdf::SubDataFrame, newdf::DataFrame; keep_present::Bo return sdf end +# _try_select_no_copy selects cols from df; it tries to avoid copying data if possible; +# for SubDataFrame if cols is not a simple column selector then copying is needed function _try_select_no_copy(sdf::SubDataFrame, cols) # try is needed here as `cols` could be AbstractVector in which case # it is not possible to statically check if it is a valid column selector