From cdfb73316728c6d37f46345bd1c660e0badf3135 Mon Sep 17 00:00:00 2001 From: pdeffebach <23196228+pdeffebach@users.noreply.github.com> Date: Fri, 22 Dec 2023 14:50:20 -0500 Subject: [PATCH] add groupby and docs (#373) * add groupby and docs * implementation * whatever * rebase * change implemetation * docs --- docs/src/dplyr.md | 10 +++++----- docs/src/index.md | 27 +++++++++++++++++++++------ src/DataFramesMeta.jl | 1 + src/macros.jl | 42 ++++++++++++++++++++++++++++++++++++++++++ test/grouping.jl | 17 +++++++++++++++++ 5 files changed, 86 insertions(+), 11 deletions(-) diff --git a/docs/src/dplyr.md b/docs/src/dplyr.md index b42d6d0d..f7ae6412 100644 --- a/docs/src/dplyr.md +++ b/docs/src/dplyr.md @@ -93,7 +93,7 @@ DataFramesMeta.jl macro | By-row version | Description | `dplyr` equivalent `@subset` | `@rsubset` | filter rows | `filter` `@orderby` | `@rorderby` | re-order or arrange rows | `arrange` `@combine` | | summarise values | `summarize` (but `@combine` is more flexible) -`groupby` | | allows for group operations in the "split-apply-combine" concept | `group_by` +`@groupby` | | allows for group operations in the "split-apply-combine" concept | `group_by` # DataFramesMeta.jl Verbs In Action @@ -341,15 +341,15 @@ DataFrames.jl also provides the function `describe` which performs many of these describe(msleep) ``` -## Group Operations using `groupby` and `@combine` +## Group Operations using `@groupby` and `@combine` -The `groupby` verb is an important function in DataFrames.jl (it does not live in DataFramesMeta.jl). As we mentioned before it's related to concept of "split-apply-combine". We literally want to split the data frame by some variable (e.g. taxonomic order), apply a function to the individual data frames and then combine the output. +The `@groupby` verb is the first step in the "split-apply-combine" workflow. We literally want to split the data frame by some variable (e.g. 
taxonomic order), apply a function to the individual data frames and then combine the output. Let's do that: split the `msleep` data frame by the taxonomic order, then ask for the same summary statistics as above. We expect a set of summary statistics for each taxonomic order. ```@repl 1 @chain msleep begin - groupby(:order) + @groupby :order @combine begin :avg_sleep = mean(:sleep_total) :min_sleep = minimum(:sleep_total) @@ -363,7 +363,7 @@ Split-apply-combine can also be used with `@transform` to add new variables to a ```@repl 1 @chain msleep begin - groupby(:order) + @groupby :order @transform :sleep_genus = :sleep_total .- mean(:sleep_total) end ``` diff --git a/docs/src/index.md b/docs/src/index.md index 31e8970f..15ef9c9b 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -16,6 +16,7 @@ In addition, DataFramesMeta provides * Row-wise versions of the above macros in the form of `@rtransform`, `@rtransform!`, `@rselect`, `@rselect!`, `@rorderby`, `@rsubset`, and `@rsubset!`. * `@rename` and `@rename!` for renaming columns +* `@groupby` for grouping data * `@by`, for grouping and combining a data frame in a single step * `@with`, for working with the columns of a data frame with high performance and convenient syntax @@ -64,7 +65,7 @@ data frame. ```julia df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]); -gd = groupby(df, :x); +gd = @groupby(df, :x); @select(df, :x, :y) @select(df, :x2 = 2 * :x, :y) @select(gd, :x2 = 2 .* :y .* first(:y)) @@ -98,7 +99,7 @@ data frame. ```julia df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]); -gd = groupby(df, :x); +gd = @groupby(df, :x); @transform(df, :x2 = 2 * :x, :y) @transform(gd, :x2 = 2 .* :y .* first(:y)) @transform!(df, :x, :y) @@ -115,7 +116,7 @@ Select row subsets. Operates on both a `DataFrame` and a `GroupedDataFrame`. 
```julia using Statistics df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]); -gd = groupby(df, :x); +gd = @groupby(df, :x); outside_var = 1; @subset(df, :x .> 1) @subset(df, :x .> outside_var) @@ -134,11 +135,14 @@ acts like a `GroupedDataFrame` with one group. Like `@select` and `@transform`, transformations are called with the keyword-like syntax `:y = f(:x)`. +To group data together into a `GroupedDataFrame`, use `@groupby`, a short-hand for +the DataFrames.jl function `groupby`. + Examples: ```julia df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]); -gd = groupby(df, :x); +gd = @groupby(df, :x); @combine(gd, :x2 = sum(:y)) @combine(gd, :x2 = :y .- sum(:y)) @combine(gd, $AsTable = (n1 = sum(:y), n2 = first(:y))) @@ -161,6 +165,17 @@ gd = groupby(df, :x); @combine(gd, $AsTable = (a = sum(:x), b = sum(:y))) ``` +### `@by` + +Perform the grouping and combining operations in one step with `@by`. + +```julia +df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]); +@by df :x begin + :x2 = sum(:y) +end +``` + ## `@orderby` Sort rows in a `DataFrame` by values in one of several columns or a @@ -355,7 +370,7 @@ julia> @subset df @byrow begin however, like with `ByRow` in DataFrames.jl, when `@byrow` is used, functions do not take into account the grouping, so for example the result of `@transform(df, @byrow :y = f(:x))` and -`@transform(groupby(df, :g), @byrow :y = f(:x))` is the same. +`@transform(@groupby(df, :g), @byrow :y = f(:x))` is the same. ## Propagating missing values with `@passmissing` @@ -912,7 +927,7 @@ functions. | `@subset` | `filter` | `Where` | | `@transform` | `mutate` | `Select` (?) 
| | `@by` | | `GroupBy` | -| `groupby` | `group_by` | `GroupBy` | +| `@groupby` | `group_by` | `GroupBy` | | `@combine` | `summarise`/`do` | | | `@orderby` | `arrange` | `OrderBy` | | `@select` | `select` | `Select` | diff --git a/src/DataFramesMeta.jl b/src/DataFramesMeta.jl index a16cca5a..1f42a9f3 100644 --- a/src/DataFramesMeta.jl +++ b/src/DataFramesMeta.jl @@ -21,6 +21,7 @@ export @with, @distinct, @rdistinct, @distinct!, @rdistinct!, @eachrow, @eachrow!, @byrow, @passmissing, @astable, @kwarg, + @groupby, @based_on, @where # deprecated const DOLLAR = raw"$" diff --git a/src/macros.jl b/src/macros.jl index 055cb4a2..d66f13e5 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -3008,3 +3008,45 @@ macro rename!(x, args...) esc(rename!_helper(x, args...)) end +function groupby_helper(df, args...) + t = Expr(:tuple, args...) + :($groupby($df, ($Cols($t...)))) +end + +""" + @groupby(df, args...) + +Group a data frame by columns. An alias for + +``` +groupby(df, Cols(args...)) +``` + +but with a few convenience features. + +## Details + +`@groupby` does not perform any transformations or allow the +generation of new columns. New column generation must be done +before `@groupby` is called. + +`@groupby` allows mixing of `Symbol` +and `String` inputs, such that `@groupby df :A "B"` +is supported. + +Arguments are not escaped and DataFramesMeta.jl rules for column +selection, such as `$DOLLAR` for escaping, do not apply. + +## Examples +```julia-repl +julia> df = DataFrame(A = [1, 1], B = [3, 4], C = [6, 6]); +julia> @groupby df :A; +julia> @groupby df :A :B; +julia> @groupby df [:A, :B]; +julia> @groupby df :A [:B, :C]; +``` +""" +macro groupby(df, args...) 
+ esc(groupby_helper(df, args...)) +end + diff --git a/test/grouping.jl b/test/grouping.jl index 0db08b6e..e06a1ed6 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -349,4 +349,21 @@ end @test @select(g, :a, @byrow :t = :a ^ 2).t ≅ d.a .^ 2 end +@testset "@groupby" begin + df = DataFrame(a = [1, 2], b = [3, 4], c = [5, 6]) + resa = groupby(df, [:a]) + resab = groupby(df, [:a, :b]) + resabc = groupby(df, [:a, :b, :c]) + ab = [:a, :b] + + @test @groupby(df, :a) == resa + @test @groupby(df, :a, :b) == resab + @test (@groupby df ab) == resab + @test (@groupby df :a 2) == resab + @test (@groupby df [:a, :b]) == resab + @test (@groupby df :a "b") == resab + @test (@groupby df All()) == resabc + @test (@groupby df Cols(:a, 2, "c")) == resabc +end + end # module