JuliaData · nalimilan · Aug 31, 2020 · Aug 29, 2020 · Aug 30, 2020 · Aug 30, 2020
diff --git a/NEWS.md b/NEWS.md
@@ -22,6 +22,10 @@
   choose the fast path only when it is safe; this resolves inconsistencies
   with what the same functions not using fast path produce
   ([#2357](https://github.com/JuliaData/DataFrames.jl/pull/2357))
+* `stack` now creates a `PooledVector{String}` variable column rather than
+  a `CategoricalVector{String}` column by default;
+  pass `variable_eltype=CategoricalValue{String}` to get the previous behavior
+  ([#2391](https://github.com/JuliaData/DataFrames.jl/pull/2391))
 * the `categorical` and `categorical!` functions have been deprecated in favor of
   `transform(df, cols .=> categorical .=> cols)` and similar syntaxes
   ([#2394](https://github.com/JuliaData/DataFrames.jl/pull/2394))

diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
@@ -1,7 +1,7 @@
 """
     stack(df::AbstractDataFrame, [measure_vars], [id_vars];
           variable_name=:variable, value_name=:value,
-          view::Bool=false, variable_eltype::Type=CategoricalValue{String})
+          view::Bool=false, variable_eltype::Type=String)
 
 Stack a data frame `df`, i.e. convert it from wide to long format.
 
@@ -32,9 +32,13 @@ that return views into the original data frame.
 - `view` : whether the stacked data frame should be a view rather than contain
   freshly allocated vectors.
 - `variable_eltype` : determines the element type of column `variable_name`.
-  By default a categorical vector of strings is created.
-  If `variable_eltype=Symbol` it is a vector of `Symbol`,
-  and if `variable_eltype=String` a vector of `String` is produced.
+  By default a `PooledVector{String}` is created.
+  If `variable_eltype=Symbol` a `PooledVector{Symbol}` is created,
+  and if `variable_eltype=CategoricalValue{String}`
+  a `CategoricalVector{String}` is produced.
+  Passing any other type `T` will produce a `PooledVector{T}` column
+  as long as it supports conversion from `String`.
+  When `view=true`, a `RepeatedVector{T}` is produced.
 
 
 # Examples
@@ -57,7 +61,7 @@ function stack(df::AbstractDataFrame,
                id_vars = Not(measure_vars);
                variable_name::SymbolOrString=:variable,
                value_name::SymbolOrString=:value, view::Bool=false,
-               variable_eltype::Type=CategoricalValue{String})
+               variable_eltype::Type=String)
     variable_name_s = Symbol(variable_name)
     value_name_s = Symbol(value_name)
     # getindex from index returns either Int or AbstractVector{Int}
@@ -75,17 +79,17 @@ function stack(df::AbstractDataFrame,
     cnames = _names(df)[ints_id_vars]
     push!(cnames, variable_name_s)
     push!(cnames, value_name_s)
-    if variable_eltype <: CategoricalValue{String}
-        nms = names(df, ints_measure_vars)
-        catnms = categorical(nms)
-        levels!(catnms, nms)
-    elseif variable_eltype === Symbol
-        catnms = _names(df)[ints_measure_vars]
+    if variable_eltype === Symbol
+        catnms = PooledArray(_names(df)[ints_measure_vars])
     elseif variable_eltype === String
         catnms = PooledArray(names(df, ints_measure_vars))
     else
-        throw(ArgumentError("`variable_eltype` keyword argument accepts only " *
-                            "`CategoricalValue{String}`, `String` or `Symbol` as a value."))
+        # this covers CategoricalArray{String} in particular
+        # (note that copyto! inserts levels in their order of appearance)
+        nms = names(df, ints_measure_vars)
+        simnms = copyto!(similar(nms, variable_eltype), nms)
+        # pool plain vectors only once filled (an unfilled one may hold #undef)
+        catnms = simnms isa Vector ? PooledArray(simnms) : simnms
     end
     return DataFrame(AbstractVector[[repeat(df[!, c], outer=N) for c in ints_id_vars]..., # id_var columns
                                     repeat(catnms, inner=nrow(df)),                       # variable
@@ -100,17 +104,15 @@ function _stackview(df::AbstractDataFrame, measure_vars::AbstractVector{Int},
     cnames = _names(df)[id_vars]
     push!(cnames, variable_name)
     push!(cnames, value_name)
-    if variable_eltype <: CategoricalValue{String}
-        nms = names(df, measure_vars)
-        catnms = categorical(nms)
-        levels!(catnms, nms)
-    elseif variable_eltype <: Symbol
+    if variable_eltype === Symbol
         catnms = _names(df)[measure_vars]
-    elseif variable_eltype <: String
+    elseif variable_eltype === String
         catnms = names(df, measure_vars)
     else
-        throw(ArgumentError("`variable_eltype` keyword argument accepts only " *
-                            "`CategoricalValue{String}`, `String` or `Symbol` as a value."))
+        # this covers CategoricalArray{String} in particular,
+        # as copyto! inserts levels in their order of appearance
+        nms = names(df, measure_vars)
+        catnms = copyto!(similar(nms, variable_eltype), nms)
     end
     return DataFrame(AbstractVector[[RepeatedVector(df[!, c], 1, N) for c in id_vars]..., # id_var columns
                                     RepeatedVector(catnms, nrow(df), 1),                  # variable

diff --git a/test/reshape.jl b/test/reshape.jl
@@ -424,17 +424,15 @@ end
     levels!(v, ["b", "c", "a"])
     rv = DataFrames.RepeatedVector(v, 1, 1)
     @test isordered(v)
-    # uncomment after CategoricalArrays.jl is fixed
-    # @test isordered(categorical(v))
+    @test isordered(categorical(v))
     @test levels(v) == ["b", "c", "a"]
     @test levels(categorical(v)) == ["b", "c", "a"]
 
     v = categorical(["a", "b", "c"])
     levels!(v, ["b", "c", "a"])
     rv = DataFrames.RepeatedVector(v, 1, 1)
     @test !isordered(v)
-    # uncomment after CategoricalArrays.jl is fixed
-    # @test !isordered(categorical(v))
+    @test !isordered(categorical(v))
     @test levels(v) == ["b", "c", "a"]
     @test levels(categorical(v)) == ["b", "c", "a"]
 end
@@ -447,25 +445,25 @@ end
                    d = randn(12),
                    e = map(string, 'a':'l'))
     d1s = stack(d1, [:d, :c])
-    @test d1s.variable isa CategoricalVector{String}
-    @test levels(d1s.variable) == ["d", "c"]
-    d1s = stack(d1, [:d, :c], view=true)
-    @test d1s.variable isa DataFrames.RepeatedVector{<:CategoricalValue{String}}
-    @test levels(d1s.variable) == ["d", "c"]
-    @test d1s[:, 4] isa CategoricalVector{String}
-    @test levels(d1s[:, 4]) == ["d", "c"]
-
-    d1s = stack(d1, [:d, :c], variable_eltype=String)
     @test d1s.variable isa PooledVector{String}
     @test levels(d1s.variable) == ["c", "d"]
-    d1s = stack(d1, [:d, :c], view=true, variable_eltype=String)
+    d1s = stack(d1, [:d, :c], view=true)
     @test d1s.variable isa DataFrames.RepeatedVector{String}
     @test levels(d1s.variable) == ["c", "d"]
     @test d1s[:, 4] isa Vector{String}
     @test levels(d1s[:, 4]) == ["c", "d"]
 
+    d1s = stack(d1, [:d, :c], variable_eltype=CategoricalValue{String})
+    @test d1s.variable isa CategoricalVector{String}
+    @test levels(d1s.variable) == ["d", "c"]
+    d1s = stack(d1, [:d, :c], view=true, variable_eltype=CategoricalValue{String})
+    @test d1s.variable isa DataFrames.RepeatedVector{<:CategoricalValue{String}}
+    @test levels(d1s.variable) == ["d", "c"]
+    @test d1s[:, 4] isa CategoricalVector{String}
+    @test levels(d1s[:, 4]) == ["d", "c"]
+
     d1s = stack(d1, [:d, :c], variable_eltype=Symbol)
-    @test d1s.variable isa Vector{Symbol}
+    @test d1s.variable isa PooledVector{Symbol}
     @test levels(d1s.variable) == [:c, :d]
     d1s = stack(d1, [:d, :c], view=true, variable_eltype=Symbol)
     @test d1s.variable isa DataFrames.RepeatedVector{Symbol}
@@ -481,7 +479,7 @@ end
     ordered!(d2.c, true)
     levels!(d2.d, ref_levels)
     ordered!(d2.d, true)
-    d2s = stack(d2, [:d, :c])
+    d2s = stack(d2, [:d, :c], variable_eltype=CategoricalValue{String})
     for col in eachcol(d2s)
         @test col isa CategoricalVector
     end
@@ -498,15 +496,15 @@ end
 @testset "test stack eltype" begin
     df = DataFrame(rand(4,5))
     sdf = stack(df)
-    @test eltype(sdf.variable) <: CategoricalValue{String}
-    @test eltype(typeof(sdf.variable)) <: CategoricalValue{String}
-    @test eltype(sdf.value) <: Float64
-    @test eltype(typeof(sdf.value)) <: Float64
+    @test eltype(sdf.variable) === String
+    @test eltype(typeof(sdf.variable)) === String
+    @test eltype(sdf.value) === Float64
+    @test eltype(typeof(sdf.value)) === Float64
     sdf2 = first(sdf, 3)
-    @test eltype(sdf2.variable) <: CategoricalValue{String}
-    @test eltype(typeof(sdf2.variable)) <: CategoricalValue{String}
-    @test eltype(sdf2.value) <: Float64
-    @test eltype(typeof(sdf2.value)) <: Float64
+    @test eltype(sdf2.variable) === String
+    @test eltype(typeof(sdf2.variable)) === String
+    @test eltype(sdf2.value) === Float64
+    @test eltype(typeof(sdf2.value)) === Float64
 end
 
 end # module