Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow passing empty sets of columns to ByRow and filter #2476

Merged
merged 7 commits into from
Oct 12, 2020
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@
which if set to `true` makes them retun a `SubDataFrame` view into the passed
data frame.
* add `only` method for `AbstractDataFrame` ([#2449](/~https://github.com/JuliaData/DataFrames.jl/pull/2449))
* passing empty sets of columns in `filter`/`filter!` and in `select`/`transform`/`combine`
with `ByRow` is now accepted ([#2476](/~https://github.com/JuliaData/DataFrames.jl/pull/2476))

## Deprecated

Expand Down
28 changes: 17 additions & 11 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -994,9 +994,10 @@ end
@inline function Base.filter((cols, f)::Pair, df::AbstractDataFrame; view::Bool=false)
int_cols = index(df)[cols] # it will be AbstractVector{Int} or Int
if length(int_cols) == 0
throw(ArgumentError("At least one column must be passed to filter on"))
rowidxs = [f() for _ in axes(df, 1)]
else
rowidxs = _filter_helper(f, (df[!, i] for i in int_cols)...)
end
rowidxs = _filter_helper(f, (df[!, i] for i in int_cols)...)
return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
end

Expand All @@ -1006,9 +1007,10 @@ end
AbstractVector{<:Symbol}}},
df::AbstractDataFrame; view::Bool=false)
if length(cols) == 0
throw(ArgumentError("At least one column must be passed to filter on"))
rowidxs = [f() for _ in axes(df, 1)]
else
rowidxs = _filter_helper(f, (df[!, i] for i in cols)...)
end
rowidxs = _filter_helper(f, (df[!, i] for i in cols)...)
return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
end

Expand All @@ -1018,9 +1020,10 @@ _filter_helper(f, cols...)::BitVector = ((x...) -> f(x...)::Bool).(cols...)
view::Bool=false)
df_tmp = select(df, cols.cols, copycols=false)
if ncol(df_tmp) == 0
throw(ArgumentError("At least one column must be passed to filter on"))
rowidxs = [f(NamedTuple()) for _ in axes(df, 1)]
else
rowidxs = _filter_helper_astable(f, Tables.namedtupleiterator(df_tmp))
end
rowidxs = _filter_helper_astable(f, Tables.namedtupleiterator(df_tmp))
return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
end

Expand Down Expand Up @@ -1101,7 +1104,7 @@ julia> filter!(AsTable(:) => nt -> nt.x == 1 || nt.y == "b", df)
│ 3 │ 1 │ b │
```
"""
Base.filter!(f, df::AbstractDataFrame) = _filter!_helper(df, f, eachrow(df))
Base.filter!(f, df::AbstractDataFrame) = delete!(df, findall(!f, eachrow(df)))
Base.filter!((col, f)::Pair{<:ColumnIndex}, df::AbstractDataFrame) =
_filter!_helper(df, f, df[!, col])
Base.filter!((cols, f)::Pair{<:AbstractVector{Symbol}}, df::AbstractDataFrame) =
Expand All @@ -1115,17 +1118,20 @@ Base.filter!((cols, f)::Pair{<:AbstractVector{Int}}, df::AbstractDataFrame) =

function _filter!_helper(df::AbstractDataFrame, f, cols...)
if length(cols) == 0
throw(ArgumentError("At least one column must be passed to filter on"))
rowidxs = findall(x -> !f(), axes(df, 1))
else
rowidxs = findall(((x...) -> !(f(x...)::Bool)).(cols...))
end
return delete!(df, findall(((x...) -> !(f(x...)::Bool)).(cols...)))
return delete!(df, rowidxs)
end

function Base.filter!((cols, f)::Pair{<:AsTable}, df::AbstractDataFrame)
dff = select(df, cols.cols, copycols=false)
if ncol(dff) == 0
throw(ArgumentError("At least one column must be passed to filter on"))
return delete!(df, findall(x -> !f(NamedTuple()), axes(df, 1)))
else
return _filter!_helper_astable(df, Tables.namedtupleiterator(dff), f)
end
return _filter!_helper_astable(df, Tables.namedtupleiterator(dff), f)
end

_filter!_helper_astable(df::AbstractDataFrame, nti::Tables.NamedTupleIterator, f) =
Expand Down
57 changes: 30 additions & 27 deletions src/abstractdataframe/selection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,11 @@ normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractStrin
normalize_selection(idx, first(sel) => Symbol(last(sel)), renamecols::Bool)

function normalize_selection(idx::AbstractIndex,
sel::Pair{<:Any,<:Pair{<:Base.Callable,
<:Union{Symbol, AbstractString, DataType,
AbstractVector{Symbol},
AbstractVector{<:AbstractString}}}},
@nospecialize(sel::Pair{<:Any,
<:Pair{<:Base.Callable,
<:Union{Symbol, AbstractString, DataType,
AbstractVector{Symbol},
AbstractVector{<:AbstractString}}}}),
bkamins marked this conversation as resolved.
Show resolved Hide resolved
renamecols::Bool)
lls = last(last(sel))
if lls isa DataType
Expand Down Expand Up @@ -170,31 +171,29 @@ function normalize_selection(idx::AbstractIndex,
return (wanttable ? AsTable(c) : c) => fun => newcol
end

function _transformation_helper(df::AbstractDataFrame,
col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable},
@nospecialize(fun))
if col_idx === nothing
return fun(df)
elseif col_idx isa Int
return fun(df[!, col_idx])
elseif col_idx isa AsTable
tbl = Tables.columntable(select(df, col_idx.cols, copycols=false))
if isempty(tbl) && fun isa ByRow
return [fun.fun(NamedTuple()) for _ in 1:nrow(df)]
else
return fun(tbl)
end
_transformation_helper(df::AbstractDataFrame, col_idx::Nothing, fun) = fun(df)
_transformation_helper(df::AbstractDataFrame, col_idx::Int, fun) = fun(df[!, col_idx])

_empty_astable_helper(fun, len) = [fun(NamedTuple()) for _ in 1:len]

function _transformation_helper(df::AbstractDataFrame, col_idx::AsTable, fun)
tbl = Tables.columntable(select(df, col_idx.cols, copycols=false))
if isempty(tbl) && fun isa ByRow
return _empty_astable_helper(fun.fun, nrow(df))
else
# it should be fast enough here as we do not expect to do it millions of times
@assert col_idx isa AbstractVector{Int}
if isempty(col_idx) && fun isa ByRow
return [fun.fun() for _ in 1:nrow(df)]
else
cdf = eachcol(df)
return fun(map(c -> cdf[c], col_idx)...)
end
return fun(tbl)
end
end

_empty_selector_helper(fun, len) = [fun() for _ in 1:len]

function _transformation_helper(df::AbstractDataFrame, col_idx::AbstractVector{Int}, fun)
if isempty(col_idx) && fun isa ByRow
return _empty_selector_helper(fun.fun, nrow(df))
else
cdf = eachcol(df)
return fun(map(c -> cdf[c], col_idx)...)
end
throw(ErrorException("unreachable reached"))
end

function _gen_colnames(@nospecialize(res), newname::Union{AbstractVector{Symbol},
Expand Down Expand Up @@ -426,6 +425,8 @@ function select_transform!(@nospecialize(nc::Union{Base.Callable, Pair{<:Union{I
end
end

@nospecialize

SELECT_ARG_RULES =
"""
Arguments passed as `args...` can be:
Expand Down Expand Up @@ -1101,3 +1102,5 @@ function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool,
return view(dfv, :, isempty(newinds) ? [] : All(newinds...))
end
end

@specialize
14 changes: 11 additions & 3 deletions src/groupeddataframe/splitapplycombine.jl
Original file line number Diff line number Diff line change
Expand Up @@ -711,7 +711,11 @@ end
function do_call(f::Any, idx::AbstractVector{<:Integer},
starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer},
gd::GroupedDataFrame, incols::Tuple{}, i::Integer)
f()
if f isa ByRow
return [f.fun() for _ in 1:(ends[i] - starts[i] + 1)]
else
return f()
end
end

function do_call(f::Any, idx::AbstractVector{<:Integer},
Expand Down Expand Up @@ -753,8 +757,12 @@ end
function do_call(f::Any, idx::AbstractVector{<:Integer},
starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer},
gd::GroupedDataFrame, incols::NamedTuple, i::Integer)
idx = idx[starts[i]:ends[i]]
return f(map(c -> view(c, idx), incols))
if f isa ByRow && isempty(incols)
return [f.fun(NamedTuple()) for _ in 1:(ends[i] - starts[i] + 1)]
else
idx = idx[starts[i]:ends[i]]
return f(map(c -> view(c, idx), incols))
end
end

function do_call(f::Any, idx::AbstractVector{<:Integer},
Expand Down
42 changes: 38 additions & 4 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -453,11 +453,45 @@ end
@test filter(AsTable("x") => testfun, df) == DataFrame(x=[3, 2], y=["b", "a"])
filter!(AsTable("x") => testfun, df)
@test df == DataFrame(x=[3, 2], y=["b", "a"])
end

@testset "empty arg to filter and filter!" begin
df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"])

@test filter([] => () -> true, df) == df
@test filter(AsTable(r"z") => x -> true, df) == df
@test filter!([] => () -> true, copy(df)) == df
@test filter!(AsTable(r"z") => x -> true, copy(df)) == df

flipflop0 = let
state = false
() -> (state = !state)
end

flipflop1 = let
state = false
x -> (state = !state)
end

@test_throws ArgumentError filter([] => () -> true, df)
@test_throws ArgumentError filter(AsTable(r"z") => () -> true, df)
@test_throws ArgumentError filter!([] => () -> true, df)
@test_throws ArgumentError filter!(AsTable(r"z") => () -> true, df)
@test filter([] => flipflop0, df) == df[[1,3], :]
@test filter(Int[] => flipflop0, df) == df[[1,3], :]
@test filter(String[] => flipflop0, df) == df[[1,3], :]
@test filter(Symbol[] => flipflop0, df) == df[[1,3], :]
@test filter(r"z" => flipflop0, df) == df[[1,3], :]
@test filter(Not(All()) => flipflop0, df) == df[[1,3], :]
@test filter(AsTable(r"z") => flipflop1, df) == df[[1,3], :]
@test filter(AsTable([]) => flipflop1, df) == df[[1,3], :]
@test filter!([] => flipflop0, copy(df)) == df[[1,3], :]
@test filter!(Int[] => flipflop0, copy(df)) == df[[1,3], :]
@test filter!(String[] => flipflop0, copy(df)) == df[[1,3], :]
@test filter!(Symbol[] => flipflop0, copy(df)) == df[[1,3], :]
@test filter!(r"z" => flipflop0, copy(df)) == df[[1,3], :]
@test filter!(Not(All()) => flipflop0, copy(df)) == df[[1,3], :]
@test filter!(AsTable(r"z") => flipflop1, copy(df)) == df[[1,3], :]
@test filter!(AsTable([]) => flipflop1, copy(df)) == df[[1,3], :]

@test_throws MethodError filter([] => flipflop1, df)
@test_throws MethodError filter(AsTable([]) => flipflop0, df)
end

@testset "names with cols" begin
Expand Down
41 changes: 41 additions & 0 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2874,4 +2874,45 @@ end
@test df == DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30)
end

@testset "empty ByRow" begin
inc0 = let
state = 0
() -> (state += 1)
end

inc1 = let
state = 0
x -> (state += 1)
end

df = DataFrame(a=[1,1,1,2,2,3,4,4,5,5,5,5], b=1:12)
gdf = groupby_checked(df, :a)

@test select(gdf, [] => ByRow(inc0) => :bin) ==
DataFrame(a=df.a, bin=1:12)
@test combine(gdf, [] => ByRow(inc0) => :bin) ==
DataFrame(a=df.a, bin=13:24)
@test select(gdf, AsTable([]) => ByRow(inc1) => :bin) ==
DataFrame(a=df.a, bin=1:12)
@test combine(gdf, AsTable([]) => ByRow(inc1) => :bin) ==
DataFrame(a=df.a, bin=13:24)
@test combine(gdf[Not(2)], [] => ByRow(inc0) => :bin) ==
DataFrame(a=df.a[Not(4:5)], bin=25:34)
@test combine(gdf[Not(2)], AsTable([]) => ByRow(inc1) => :bin) ==
DataFrame(a=df.a[Not(4:5)], bin=25:34)

# note that type inference in a comprehension does not always work
bkamins marked this conversation as resolved.
Show resolved Hide resolved
@test isequal_coltyped(combine(gdf[[]], [] => ByRow(inc0) => :bin),
DataFrame(a=Int[], bin=Any[]))
@test isequal_coltyped(combine(gdf[[]], [] => ByRow(rand) => :bin),
DataFrame(a=Int[], bin=Float64[]))
@test isequal_coltyped(combine(gdf[[]], AsTable([]) => ByRow(inc1) => :bin),
DataFrame(a=Int[], bin=Any[]))
@test isequal_coltyped(combine(gdf[[]], AsTable([]) => ByRow(x -> rand()) => :bin),
DataFrame(a=Int[], bin=Float64[]))

@test_throws MethodError select(gdf, [] => ByRow(inc1) => :bin)
@test_throws MethodError select(gdf, AsTable([]) => ByRow(inc0) => :bin)
end

end # module