Skip to content

Commit

Permalink
make renaming perform copy in transform and transform! (#2721)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Apr 16, 2021
1 parent ff0d799 commit a671a3b
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 13 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
* `mapcols!` makes sure not to create columns being `AbstractRange` consistently
with other methods that add columns to a `DataFrame`
([#2594](/~https://github.com/JuliaData/DataFrames.jl/pull/2594))
* `transform` and `transform!` always copy columns when column renaming transformation
is passed. If similar issues are identified after 1.0 release (i.e. that a
copy of data is not made in scenarios where it normally should be made these
will be considered bugs and fixed as non-breaking changes)
([#2721](/~https://github.com/JuliaData/DataFrames.jl/pull/2721))

## New functionalities

Expand Down
45 changes: 34 additions & 11 deletions src/abstractdataframe/selection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ const TRANSFORMATION_COMMON_RULES =
transformations that return the same object without copying may create
column aliases even if `copycols=true`. An example of such a situation is
`select!(df, :a, :a => :b, :a => identity => :c)`.
As a special case in `transform` and `transform!` column renaming always
copies columns to avoid storing aliased columns in the target data frame.
If `df` is a `SubDataFrame` and `copycols=true` then a `DataFrame` is
returned and the same copying rules apply as for a `DataFrame` input: this
Expand Down Expand Up @@ -270,14 +272,14 @@ function normalize_selection(idx::AbstractIndex,
allunique(combine_target_col) || throw(ArgumentError("target column names must be unique"))
end

if wanttable
combine_src = AsTable(c)
else
if wanttable
combine_src = AsTable(c)
else
combine_src = (length(c) == 1 ? only(c) : c)
end

combine_func = first(last(sel))

return combine_src => combine_func => combine_target_col
end

Expand Down Expand Up @@ -338,9 +340,9 @@ function normalize_selection(idx::AbstractIndex,
end
end

if wanttable
combine_src = AsTable(c)
else
if wanttable
combine_src = AsTable(c)
else
combine_src = (length(c) == 1 ? only(c) : c)
end

Expand Down Expand Up @@ -656,8 +658,17 @@ $TRANSFORMATION_COMMON_RULES
See [`select`](@ref) for examples.
"""
transform!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) =
select!(df, :, args..., renamecols=renamecols)
function transform!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true)
idx = index(df)
newargs = Any[if sel isa Pair{<:ColumnIndex, Symbol}
idx[first(sel)] => copy => last(sel)
elseif sel isa Pair{<:ColumnIndex, <:AbstractString}
idx[first(sel)] => copy => Symbol(last(sel))
else
sel
end for sel in args]
return select!(df, :, newargs..., renamecols=renamecols)
end

function transform!(@nospecialize(arg::Base.Callable), df::AbstractDataFrame; renamecols::Bool=true)
if arg isa Colon
Expand Down Expand Up @@ -978,8 +989,20 @@ ERROR: ArgumentError: column :x in returned data frame is not equal to grouping
See [`select`](@ref) for more examples.
"""
transform(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true) =
select(df, :, args..., copycols=copycols, renamecols=renamecols)
function transform(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true)
idx = index(df)
# when using the copy function the copy of source data frame
# is made exactly once even if copycols=true
# (copycols=true makes a copy only if the column was not copied previously)
newargs = Any[if sel isa Pair{<:ColumnIndex, Symbol}
idx[first(sel)] => copy => last(sel)
elseif sel isa Pair{<:ColumnIndex, <:AbstractString}
idx[first(sel)] => copy => Symbol(last(sel))
else
sel
end for sel in args]
return select(df, :, newargs..., copycols=copycols, renamecols=renamecols)
end

function transform(@nospecialize(arg::Base.Callable), df::AbstractDataFrame; renamecols::Bool=true)
if arg isa Colon
Expand Down
7 changes: 7 additions & 0 deletions src/other/precompile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ function precompile(all=false)
end
Base.precompile(Tuple{typeof(row_group_slots),Tuple{RepeatedVector{Union{Missing, Int}}, RepeatedVector{Union{Missing, Int}}, RepeatedVector{Union{Missing, String}}, Vector{Union{Missing, Int}}},Val{false},Vector{Int},Bool,Bool})
Base.precompile(Tuple{typeof(combine),Union{Function, Type},GroupedDataFrame{DataFrame}})
Base.precompile(Tuple{typeof(select_transform!),Base.RefValue{Any},DataFrame,DataFrame,Set{Symbol},Bool,Base.RefValue{Bool}})
Base.precompile(Tuple{typeof(transform!),DataFrame,Any})
Base.precompile(Tuple{typeof(transform),DataFrame,Any})
else
Base.precompile(Tuple{typeof(_sortperm),DataFrame,Base.Sort.MergeSortAlg,DFPerm{Vector{Ordering}, Tuple{Vector{Int}, Vector{Int}, Vector{Int}}}})
Base.precompile(Tuple{typeof(flatten),DataFrame,All{Tuple{}}})
Expand Down Expand Up @@ -1030,6 +1033,10 @@ function precompile(all=false)
end
Base.precompile(Tuple{typeof(row_group_slots),Tuple{Vector{Float64}},Val{false},Vector{Int},Bool,Bool})
Base.precompile(Tuple{typeof(transform),DataFrame,Any})
Base.precompile(Tuple{typeof(transform!),DataFrame,Any})
Base.precompile(Tuple{typeof(select_transform!),Base.RefValue{Any},DataFrame,DataFrame,Set{Symbol},Bool,Base.RefValue{Bool}})
Base.precompile(Tuple{Core.kwftype(typeof(select)),NamedTuple{(:copycols, :renamecols), Tuple{Bool, Bool}},typeof(select),DataFrame,Any,Any})
Base.precompile(Tuple{Core.kwftype(typeof(select!)),NamedTuple{(:renamecols,), Tuple{Bool}},typeof(select!),DataFrame,Any,Any})
Base.precompile(Tuple{typeof(push!),DataFrame,Dict{Symbol, Any}})
Base.precompile(Tuple{typeof(_transformation_helper),DataFrame,Base.OneTo{Int},Base.RefValue{Any}})
Base.precompile(Tuple{typeof(_semijoin_sorted),OnCol{Tuple{Vector{Union{Missing, Int}}, Vector{Union{Missing, Int}}, Vector{Union{Missing, Int}}}, 3},OnCol{Tuple{PooledVector{Union{Missing, Int}, UInt32, Vector{UInt32}}, PooledVector{Union{Missing, Int}, UInt32, Vector{UInt32}}, PooledVector{Union{Missing, Int}, UInt32, Vector{UInt32}}}, 3},BitVector})
Expand Down
32 changes: 30 additions & 2 deletions test/select.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1080,13 +1080,17 @@ end

df2 = transform(df, [:x1, :x2] => +, :x2 => :x3, copycols=false)
@test df2 == select(df, :, [:x1, :x2] => +, :x2 => :x3)
@test df.x2 === df2.x2 === df2.x3
@test df.x2 == df2.x2 == df2.x3
@test df.x2 === df2.x2
@test df.x2 !== df2.x3
@test_throws ArgumentError transform(view(df, :, :), [:x1, :x2] => +, :x2 => :x3, copycols=false)

x2 = df.x2
transform!(df, [:x1, :x2] => +, :x2 => :x3)
@test df == df2
@test x2 === df.x2 === df.x3
@test x2 == df.x2 == df.x3
@test x2 === df.x2
@test x2 !== df.x3

@test transform(df) == df
df2 = transform(df, copycols=false)
Expand Down Expand Up @@ -1620,4 +1624,28 @@ end
[:a] => sum => ["new"], false) == (1 => (sum => [:new]))
end

@testset "copying in transform when renaming" begin
for oldcol in (:a, "a", 1), newcol in (:b, "b")
df = DataFrame(a=1)
df2 = transform(df, oldcol => newcol)
@test df2.b == df2.a == df.a
@test df2.b !== df2.a
@test df2.b !== df.a
@test df2.a !== df.a

df2 = transform(df, oldcol => newcol, copycols=false)
@test df2.b == df2.a == df.a
@test df2.b !== df2.a
@test df2.b !== df.a
@test df2.a === df.a

a = df.a
transform!(df, oldcol => newcol)
@test df.b == df.a == a
@test df.b !== df.a
@test df.b !== a
@test df.a === a
end
end

end # module

0 comments on commit a671a3b

Please sign in to comment.