Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BREAKING] Change stack to create a PooledArray{String} column by default #2391

Merged
merged 5 commits into from
Aug 31, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
choose the fast path only when it is safe; this resolves inconsistencies
with what the same functions not using fast path produce
([#2357](/~https://github.com/JuliaData/DataFrames.jl/pull/2357))
* `stack` now creates a `PooledVector{String}` variable column rather than
a `CategoricalVector{String}` column by default;
pass `variable_eltype=CategoricalValue{String}` to get the previous behavior
([#2391](/~https://github.com/JuliaData/DataFrames.jl/pull/2391))
* the `categorical` and `categorical!` functions have been deprecated in favor of
`transform(df, cols .=> categorical .=> cols)` and similar syntaxes
([#2394](/~https://github.com/JuliaData/DataFrames.jl/pull/2394))
Expand Down
44 changes: 23 additions & 21 deletions src/abstractdataframe/reshape.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
stack(df::AbstractDataFrame, [measure_vars], [id_vars];
variable_name=:variable, value_name=:value,
view::Bool=false, variable_eltype::Type=CategoricalValue{String})
view::Bool=false, variable_eltype::Type=String)

Stack a data frame `df`, i.e. convert it from wide to long format.

Expand Down Expand Up @@ -32,9 +32,13 @@ that return views into the original data frame.
- `view` : whether the stacked data frame should be a view rather than contain
freshly allocated vectors.
- `variable_eltype` : determines the element type of column `variable_name`.
By default a categorical vector of strings is created.
If `variable_eltype=Symbol` it is a vector of `Symbol`,
and if `variable_eltype=String` a vector of `String` is produced.
By default a `PooledVector{String}` is created.
If `variable_eltype=Symbol` a `PooledVector{Symbol}` is created,
and if `variable_eltype=CategoricalValue{String}`
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
a `CategoricalArray{String}` is produced.
Passing any other type `T` will produce a `PooledVector{T}` column
as long as it supports conversion from `String`.
When `view=true`, a `RepeatedVector{T}` is produced.


# Examples
Expand All @@ -57,7 +61,7 @@ function stack(df::AbstractDataFrame,
id_vars = Not(measure_vars);
variable_name::SymbolOrString=:variable,
value_name::SymbolOrString=:value, view::Bool=false,
variable_eltype::Type=CategoricalValue{String})
variable_eltype::Type=String)
variable_name_s = Symbol(variable_name)
value_name_s = Symbol(value_name)
# getindex from index returns either Int or AbstractVector{Int}
Expand All @@ -75,17 +79,17 @@ function stack(df::AbstractDataFrame,
cnames = _names(df)[ints_id_vars]
push!(cnames, variable_name_s)
push!(cnames, value_name_s)
if variable_eltype <: CategoricalValue{String}
nms = names(df, ints_measure_vars)
catnms = categorical(nms)
levels!(catnms, nms)
elseif variable_eltype === Symbol
catnms = _names(df)[ints_measure_vars]
if variable_eltype === Symbol
catnms = PooledArray(_names(df)[ints_measure_vars])
elseif variable_eltype === String
catnms = PooledArray(names(df, ints_measure_vars))
else
throw(ArgumentError("`variable_eltype` keyword argument accepts only " *
"`CategoricalValue{String}`, `String` or `Symbol` as a value."))
# this covers CategoricalArray{String} in particular
# (note that copyto! inserts levels in their order of appearance)
nms = names(df, ints_measure_vars)
simnms = similar(nms, variable_eltype)
catnms = simnms isa Vector ? PooledArray(catnms) : simnms
copyto!(catnms, nms)
end
return DataFrame(AbstractVector[[repeat(df[!, c], outer=N) for c in ints_id_vars]..., # id_var columns
repeat(catnms, inner=nrow(df)), # variable
Expand All @@ -100,17 +104,15 @@ function _stackview(df::AbstractDataFrame, measure_vars::AbstractVector{Int},
cnames = _names(df)[id_vars]
push!(cnames, variable_name)
push!(cnames, value_name)
if variable_eltype <: CategoricalValue{String}
nms = names(df, measure_vars)
catnms = categorical(nms)
levels!(catnms, nms)
elseif variable_eltype <: Symbol
if variable_eltype === Symbol
catnms = _names(df)[measure_vars]
elseif variable_eltype <: String
elseif variable_eltype === String
catnms = names(df, measure_vars)
else
throw(ArgumentError("`variable_eltype` keyword argument accepts only " *
"`CategoricalValue{String}`, `String` or `Symbol` as a value."))
# this covers CategoricalArray{String} in particular,
# as copyto! inserts levels in their order of appearance
nms = names(df, measure_vars)
catnms = copyto!(similar(nms, variable_eltype), nms)
end
return DataFrame(AbstractVector[[RepeatedVector(df[!, c], 1, N) for c in id_vars]..., # id_var columns
RepeatedVector(catnms, nrow(df), 1), # variable
Expand Down
46 changes: 22 additions & 24 deletions test/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -424,17 +424,15 @@ end
levels!(v, ["b", "c", "a"])
rv = DataFrames.RepeatedVector(v, 1, 1)
@test isordered(v)
# uncomment after CategoricalArrays.jl is fixed
# @test isordered(categorical(v))
@test isordered(categorical(v))
@test levels(v) == ["b", "c", "a"]
@test levels(categorical(v)) == ["b", "c", "a"]

v = categorical(["a", "b", "c"])
levels!(v, ["b", "c", "a"])
rv = DataFrames.RepeatedVector(v, 1, 1)
@test !isordered(v)
# uncomment after CategoricalArrays.jl is fixed
# @test !isordered(categorical(v))
@test !isordered(categorical(v))
@test levels(v) == ["b", "c", "a"]
@test levels(categorical(v)) == ["b", "c", "a"]
end
Expand All @@ -447,25 +445,25 @@ end
d = randn(12),
e = map(string, 'a':'l'))
d1s = stack(d1, [:d, :c])
@test d1s.variable isa CategoricalVector{String}
@test levels(d1s.variable) == ["d", "c"]
d1s = stack(d1, [:d, :c], view=true)
@test d1s.variable isa DataFrames.RepeatedVector{<:CategoricalValue{String}}
@test levels(d1s.variable) == ["d", "c"]
@test d1s[:, 4] isa CategoricalVector{String}
@test levels(d1s[:, 4]) == ["d", "c"]

d1s = stack(d1, [:d, :c], variable_eltype=String)
@test d1s.variable isa PooledVector{String}
@test levels(d1s.variable) == ["c", "d"]
d1s = stack(d1, [:d, :c], view=true, variable_eltype=String)
d1s = stack(d1, [:d, :c], view=true)
@test d1s.variable isa DataFrames.RepeatedVector{String}
@test levels(d1s.variable) == ["c", "d"]
@test d1s[:, 4] isa Vector{String}
@test levels(d1s[:, 4]) == ["c", "d"]

d1s = stack(d1, [:d, :c], variable_eltype=CategoricalValue{String})
@test d1s.variable isa CategoricalVector{String}
@test levels(d1s.variable) == ["d", "c"]
d1s = stack(d1, [:d, :c], view=true, variable_eltype=CategoricalValue{String})
@test d1s.variable isa DataFrames.RepeatedVector{<:CategoricalValue{String}}
@test levels(d1s.variable) == ["d", "c"]
@test d1s[:, 4] isa CategoricalVector{String}
@test levels(d1s[:, 4]) == ["d", "c"]

d1s = stack(d1, [:d, :c], variable_eltype=Symbol)
@test d1s.variable isa Vector{Symbol}
@test d1s.variable isa PooledVector{Symbol}
@test levels(d1s.variable) == [:c, :d]
d1s = stack(d1, [:d, :c], view=true, variable_eltype=Symbol)
@test d1s.variable isa DataFrames.RepeatedVector{Symbol}
Expand All @@ -481,7 +479,7 @@ end
ordered!(d2.c, true)
levels!(d2.d, ref_levels)
ordered!(d2.d, true)
d2s = stack(d2, [:d, :c])
d2s = stack(d2, [:d, :c], variable_eltype=CategoricalValue{String})
for col in eachcol(d2s)
@test col isa CategoricalVector
end
Expand All @@ -498,15 +496,15 @@ end
@testset "test stack eltype" begin
df = DataFrame(rand(4,5))
sdf = stack(df)
@test eltype(sdf.variable) <: CategoricalValue{String}
@test eltype(typeof(sdf.variable)) <: CategoricalValue{String}
@test eltype(sdf.value) <: Float64
@test eltype(typeof(sdf.value)) <: Float64
@test eltype(sdf.variable) === String
@test eltype(typeof(sdf.variable)) === String
@test eltype(sdf.value) === Float64
@test eltype(typeof(sdf.value)) === Float64
sdf2 = first(sdf, 3)
@test eltype(sdf2.variable) <: CategoricalValue{String}
@test eltype(typeof(sdf2.variable)) <: CategoricalValue{String}
@test eltype(sdf2.value) <: Float64
@test eltype(typeof(sdf2.value)) <: Float64
@test eltype(sdf2.variable) === String
@test eltype(typeof(sdf2.variable)) === String
@test eltype(sdf2.value) === Float64
@test eltype(typeof(sdf2.value)) === Float64
end

end # module