Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sanitise bin calculations #6212

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ Collate:
'scales-.R'
'stat-align.R'
'stat-bin.R'
'stat-summary-2d.R'
'stat-bin2d.R'
'stat-bindot.R'
'stat-binhex.R'
Expand All @@ -263,7 +264,6 @@ Collate:
'stat-smooth-methods.R'
'stat-smooth.R'
'stat-sum.R'
'stat-summary-2d.R'
'stat-summary-bin.R'
'stat-summary-hex.R'
'stat-summary.R'
Expand Down
6 changes: 5 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# ggplot2 (development version)

* All binning stats now use the `boundary`/`center` parametrisation rather
than `origin`, following in `stat_bin()`'s footsteps (@teunbrand).
* `stat_summary_2d()` and `stat_bin_2d()` now deal with zero-range data
more elegantly (@teunbrand, #6207).
* New stat: `stat_manual()` for arbitrary computations (@teunbrand, #3501)
* Reversal of a dimension, typically 'x' or 'y', is now controlled by the
`reverse` argument in `coord_cartesian()`, `coord_fixed()`, `coord_radial()`
Expand Down Expand Up @@ -226,7 +230,7 @@
* The ellipsis argument is now checked in `fortify()`, `get_alt_text()`,
`labs()` and several guides (@teunbrand, #3196).
* `stat_summary_bin()` no longer ignores `width` parameter (@teunbrand, #4647).
* Added `keep.zeroes` argument to `stat_bin()` (@teunbrand, #3449)
* Reintroduced `drop` argument to `stat_bin()` (@teunbrand, #3449)
* (internal) removed barriers for using 2D structures as aesthetics
(@teunbrand, #4189).
* `coord_sf()` no longer errors when dealing with empty graticules (@teunbrand, #6052)
Expand Down
106 changes: 91 additions & 15 deletions R/bin.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,12 @@

bin_breaks_width <- function(x_range, width = NULL, center = NULL,
boundary = NULL, closed = c("right", "left")) {
check_length(x_range, 2L)

# binwidth seems to be the argument name supplied to width. (stat-bin and stat-bindot)
check_number_decimal(width, min = 0, allow_infinite = FALSE, arg = "binwidth")

if (!is.null(boundary) && !is.null(center)) {
cli::cli_abort("Only one of {.arg boundary} and {.arg center} may be specified.")
} else if (is.null(boundary)) {
if (is.null(boundary)) {
if (is.null(center)) {
# If neither edge nor center given, compute both using tile layer's
# algorithm. This puts min and max of data in outer half of their bins.
boundary <- width / 2

} else {
# If center given but not boundary, compute boundary.
boundary <- center - width / 2
Expand All @@ -75,9 +68,6 @@

# Find the left side of left-most bin: inputs could be Dates or POSIXct, so
# coerce to numeric first.
x_range <- as.numeric(x_range)
width <- as.numeric(width)
boundary <- as.numeric(boundary)
shift <- floor((x_range[1] - boundary) / width)
origin <- boundary + shift * width

Expand All @@ -104,9 +94,7 @@

bin_breaks_bins <- function(x_range, bins = 30, center = NULL,
boundary = NULL, closed = c("right", "left")) {
check_length(x_range, 2L)

check_number_whole(bins, min = 1)
if (zero_range(x_range)) {
# 0.1 is the same width as the expansion `default_expansion()` gives for 0-width data
width <- 0.1
Expand All @@ -128,6 +116,53 @@

# Compute bins ------------------------------------------------------------

compute_bins <- function(x, scale = NULL, breaks = NULL, binwidth = NULL, bins = NULL,
center = NULL, boundary = NULL,
closed = c("right", "left")) {

range <- if (is.scale(scale)) scale$dimension() else range(x)
check_length(range, 2L)

if (!is.null(breaks)) {
if (is.function(breaks)) {
breaks <- breaks(x)
}
if (is.scale(scale) && !scale$is_discrete()) {
breaks <- scale$transform(breaks)
}
check_numeric(breaks)
bins <- bin_breaks(breaks, closed)
return(bins)
}

check_number_decimal(boundary, allow_infinite = FALSE, allow_null = TRUE)
check_number_decimal(center, allow_infinite = FALSE, allow_null = TRUE)
if (!is.null(boundary) && !is.null(center)) {
cli::cli_abort("Only one of {.arg boundary} and {.arg center} may be specified.")
}

if (!is.null(binwidth)) {
if (is.function(binwidth)) {
binwidth <- binwidth(x)
}
check_number_decimal(binwidth, min = 0, allow_infinite = FALSE)
bins <- bin_breaks_width(
range, binwidth,
center = center, boundary = boundary, closed = closed
)
return(bins)
}

if (is.function(bins)) {
bins <- bins(x)

Check warning on line 157 in R/bin.R

View check run for this annotation

Codecov / codecov/patch

R/bin.R#L157

Added line #L157 was not covered by tests
}
check_number_whole(bins, min = 1, allow_infinite = FALSE)
bin_breaks_bins(
range, bins,
center = center, boundary = boundary, closed = closed
)
}

bin_vector <- function(x, bins, weight = NULL, pad = FALSE) {
check_object(bins, is_bins, "a {.cls ggplot2_bins} object")

Expand All @@ -141,8 +176,7 @@
weight[is.na(weight)] <- 0
}

bin_idx <- cut(x, bins$fuzzy, right = bins$right_closed,
include.lowest = TRUE)
bin_idx <- bin_cut(x, bins)
bin_count <- as.numeric(tapply(weight, bin_idx, sum, na.rm = TRUE))
bin_count[is.na(bin_count)] <- 0

Expand Down Expand Up @@ -170,6 +204,10 @@
bin_out(bin_count, bin_x, bin_widths)
}

bin_cut <- function(x, bins) {
cut(x, bins$fuzzy, right = bins$right_closed, include.lowest = TRUE)
}

bin_out <- function(count = integer(0), x = numeric(0), width = numeric(0),
xmin = x - width / 2, xmax = x + width / 2) {
density <- count / width / sum(abs(count))
Expand All @@ -186,3 +224,41 @@
.size = length(count)
)
}

bin_loc <- function(x, id) {
left <- x[-length(x)]
right <- x[-1]

list(
left = left[id],
right = right[id],
mid = ((left + right) / 2)[id],
length = diff(x)[id]
)
}

fix_bin_params = function(params, fun, version) {

if (!is.null(params$origin)) {
args <- paste0(fun, c("(origin)", "(boundary)"))
deprecate_warn0(version, args[1], args[2])
params$boudnary <- params$origin
params$origin <- NULL

Check warning on line 246 in R/bin.R

View check run for this annotation

Codecov / codecov/patch

R/bin.R#L243-L246

Added lines #L243 - L246 were not covered by tests
}

if (!is.null(params$right)) {
args <- paste0(fun, c("(right)", "(closed)"))
deprecate_warn0(version, args[1], args[2])
params$closed <- if (isTRUE(params$right)) "right" else "left"
params$right <- NULL

Check warning on line 253 in R/bin.R

View check run for this annotation

Codecov / codecov/patch

R/bin.R#L250-L253

Added lines #L250 - L253 were not covered by tests
}

if (is.null(params$breaks %||% params$binwidth %||% params$bins)) {
cli::cli_inform(
"{.fn {fun}} using {.code bins = 30}. Pick better value {.arg binwidth}."
)
params$bins <- 30

Check warning on line 260 in R/bin.R

View check run for this annotation

Codecov / codecov/patch

R/bin.R#L257-L260

Added lines #L257 - L260 were not covered by tests
}

params
}
72 changes: 21 additions & 51 deletions R/stat-bin.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@
#' or left edges of bins are included in the bin.
#' @param pad If `TRUE`, adds empty bins at either end of x. This ensures
#' frequency polygons touch 0. Defaults to `FALSE`.
#' @param keep.zeroes Treatment of zero count bins. If `"all"` (default), such
#' @param drop Treatment of zero count bins. If `"all"` (default), such
#' bins are kept as-is. If `"none"`, all zero count bins are filtered out.
#' If `"inner"` only zero count bins at the flanks are filtered out, but not
#' in the middle.
#' in the middle. `TRUE` is shorthand for `"all"` and `FALSE` is shorthand
#' for `"none"`.
#' @eval rd_computed_vars(
#' count = "number of points in bin.",
#' density = "density of points in bin, scaled to integrate to 1.",
Expand Down Expand Up @@ -59,7 +60,7 @@ stat_bin <- function(mapping = NULL, data = NULL,
closed = c("right", "left"),
pad = FALSE,
na.rm = FALSE,
keep.zeroes = "all",
drop = "all",
orientation = NA,
show.legend = NA,
inherit.aes = TRUE) {
Expand All @@ -82,7 +83,7 @@ stat_bin <- function(mapping = NULL, data = NULL,
pad = pad,
na.rm = na.rm,
orientation = orientation,
keep.zeroes = keep.zeroes,
drop = drop,
...
)
)
Expand All @@ -95,9 +96,13 @@ stat_bin <- function(mapping = NULL, data = NULL,
StatBin <- ggproto("StatBin", Stat,
setup_params = function(self, data, params) {
params$flipped_aes <- has_flipped_aes(data, params, main_is_orthogonal = FALSE)
params$keep.zeroes <- arg_match0(
params$keep.zeroes %||% "all",
c("all", "none", "inner"), arg_nm = "keep.zeroes"

if (is.logical(params$drop)) {
params$drop <- if (isTRUE(params$drop)) "all" else "none"
}
params$drop <- arg_match0(
params$drop %||% "all",
c("all", "none", "inner"), arg_nm = "drop"
)

has_x <- !(is.null(data$x) && is.null(params$x))
Expand All @@ -118,29 +123,7 @@ StatBin <- ggproto("StatBin", Stat,
))
}

if (!is.null(params$drop)) {
deprecate_warn0("2.1.0", "stat_bin(drop)", "stat_bin(pad)")
params$drop <- NULL
}
if (!is.null(params$origin)) {
deprecate_warn0("2.1.0", "stat_bin(origin)", "stat_bin(boundary)")
params$boundary <- params$origin
params$origin <- NULL
}
if (!is.null(params$right)) {
deprecate_warn0("2.1.0", "stat_bin(right)", "stat_bin(closed)")
params$closed <- if (params$right) "right" else "left"
params$right <- NULL
}
if (!is.null(params$boundary) && !is.null(params$center)) {
cli::cli_abort("Only one of {.arg boundary} and {.arg center} may be specified in {.fn {snake_class(self)}}.")
}

if (is.null(params$breaks) && is.null(params$binwidth) && is.null(params$bins)) {
cli::cli_inform("{.fn {snake_class(self)}} using {.code bins = 30}. Pick better value with {.arg binwidth}.")
params$bins <- 30
}

params <- fix_bin_params(params, fun = snake_class(self), version = "2.1.0")
params
},

Expand All @@ -149,33 +132,20 @@ StatBin <- ggproto("StatBin", Stat,
compute_group = function(data, scales, binwidth = NULL, bins = NULL,
center = NULL, boundary = NULL,
closed = c("right", "left"), pad = FALSE,
breaks = NULL, flipped_aes = FALSE, keep.zeroes = "all",
breaks = NULL, flipped_aes = FALSE, drop = "all",
# The following arguments are not used, but must
# be listed so parameters are computed correctly
origin = NULL, right = NULL, drop = NULL) {
origin = NULL, right = NULL) {
x <- flipped_names(flipped_aes)$x
if (!is.null(breaks)) {
if (is.function(breaks)) {
breaks <- breaks(data[[x]])
}
if (!scales[[x]]$is_discrete()) {
breaks <- scales[[x]]$transform(breaks)
}
bins <- bin_breaks(breaks, closed)
} else if (!is.null(binwidth)) {
if (is.function(binwidth)) {
binwidth <- binwidth(data[[x]])
}
bins <- bin_breaks_width(scales[[x]]$dimension(), binwidth,
center = center, boundary = boundary, closed = closed)
} else {
bins <- bin_breaks_bins(scales[[x]]$dimension(), bins, center = center,
boundary = boundary, closed = closed)
}
bins <- compute_bins(
data[[x]], scales[[x]],
breaks = breaks, binwidth = binwidth, bins = bins,
center = center, boundary = boundary, closed = closed
)
bins <- bin_vector(data[[x]], bins, weight = data$weight, pad = pad)

keep <- switch(
keep.zeroes,
drop,
none = bins$count != 0,
inner = inner_runs(bins$count != 0),
TRUE
Expand Down
Loading
Loading