diff --git a/DESCRIPTION b/DESCRIPTION index 291f54c..0ecf46b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -45,6 +45,7 @@ Imports: grDevices Suggests: terra, + Ckmeans.1d.dp, png, jpeg, lwgeom, diff --git a/R/mf_get_breaks.R b/R/mf_get_breaks.R index a84e1f2..8addeb3 100644 --- a/R/mf_get_breaks.R +++ b/R/mf_get_breaks.R @@ -1,13 +1,17 @@ #' @title Get class intervals #' @name mf_get_breaks #' @description A function to classify continuous variables. +#' +#' This function is a wrapper for +#' \code{\link[classInt:classIntervals]{classIntervals}} +#' with some additional methods. +#' #' @param x a vector of numeric values. NA and Inf values are not used in the #' classification. #' @param nbreaks a number of classes #' @param breaks a classification method; one of "fixed", "sd", "equal", -#' "pretty", "quantile", -#' "kmeans", "hclust", "bclust", "fisher", "jenks", "dpih", "q6", "geom", -#' "arith", "em" or "msd" (see Details). +#' "pretty", "quantile", "kmeans", "hclust", "bclust", "fisher", "jenks", +#' "dpih", "q6", "Q6", geom", "arith", "em", "msd" or "ckmeans" (see Details) #' @param k number of standard deviation for "msd" method (see Details) #' @param central creation of a central class for "msd" method (see Details) #' @param ... further arguments @@ -18,11 +22,27 @@ #' "bclust", "fisher", "jenks" and "dpih" #' are \code{\link[classInt:classIntervals]{classIntervals}} #' methods. You may need to pass additional arguments for some of them.\cr\cr -#' Jenks ("jenks" method) and Fisher ("fisher" method) algorithms are -#' based on the same principle and give -#' quite similar results but Fisher is much faster. \cr\cr +#' +#' The "jenks", "fisher" and "ckmeans" methods are based on the same concept of +#' **natural breaks** and and produce similar groupings. +#' +#' * The "jenks" method produces class boundaries falling on data points and is +#' slow. +#' * The "fisher" method produces class boundaries located more conveniently +#' between data points, and is faster than the "jenks" method. +#' * The "ckmeans" method produces exactly the same class boundaries as the +#' "fisher" method, but is much faster. It uses the optimal univariate +#' k-means method from the `Ckmeans.1d.dp` package. +#' If the "ckmeans" method is selected but the `Ckmeans.1d.dp` package is not +#' installed then the "fisher" method is used. +#' +#' The relative speeds of these three methods may vary depending on the number +#' of data points and the number of classes.\cr\cr +#' #' The "q6" method uses the following \code{\link[stats:quantile]{quantile}} #' probabilities: 0, 0.05, 0.275, 0.5, 0.725, 0.95, 1.\cr\cr +#' The "Q6" method uses the following \code{\link[stats:quantile]{quantile}} +#' probabilities: 0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.\cr\cr #' The "geom" method is based on a geometric progression along #' the variable values, all values must be strictly greater than zero.\cr\cr #' The "arith" method is based on an arithmetic progression along @@ -35,13 +55,11 @@ #' the extent of each class in share of standard deviation. #' If \code{central=TRUE} then #' the mean value is the center of a class else the mean is a break value. -#' @note This function is mainly a wrapper -#' of \code{\link[classInt:classIntervals]{classIntervals}} + -#' "arith", "em", "q6", "geom" and "msd" methods. #' @examples #' mtq <- mf_get_mtq() #' mf_get_breaks(x = mtq$MED, nbreaks = 6, breaks = "quantile") #' @return A numeric vector of breaks +#' @md #' @export mf_get_breaks <- function(x, nbreaks, breaks, k = 1, central = FALSE, ...) { if (is.numeric(breaks)) { @@ -51,115 +69,142 @@ mf_get_breaks <- function(x, nbreaks, breaks, k = 1, central = FALSE, ...) { x <- as.vector(na.omit(x)) x <- x[is.finite(x)] - custom_methods <- c("geom", "arith", "q6", "em", "msd") + custom_methods <- c("geom", "arith", "q6", "Q6", "em", "msd", "ckmeans") # default number of classes if (missing(nbreaks)) { nbreaks <- round(1 + 3.3 * log10(length(x)), 0) } + if (breaks == "ckmeans") { + if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) { + warning( + paste0( + "The 'Ckmeans.1d.dp' package is needed for ", + "this classification method. Please install it.\n", + "The 'fisher' method will be used instead." + ), + call. = FALSE + ) + breaks <- "fisher" + } else { + bks <- Ckmeans.1d.dp::Ckmeans.1d.dp(x = x, k = nbreaks) + intervals <- Ckmeans.1d.dp::ahist( + x = bks, style = "midpoints", + data = x, plot = FALSE + )$breaks + } + } + if (!breaks %in% custom_methods) { intervals <- classInt::classIntervals( var = x, n = nbreaks, style = breaks, ... )$brks - } else { - if (breaks == "geom") { - intervals <- min(x) - if (intervals <= 0) { - stop( - paste0( - "All values must be strictly greater ", - "than 0 when using the 'geom' method." - ), - call. = FALSE - ) - } - intervals <- c(intervals, max(x)) - r <- exp((log(max(x)) - log(min(x))) / nbreaks) # raison - tmp <- min(x) - for (i in 1:(nbreaks - 1)) { - intervals <- c(intervals, tmp * r) - tmp <- tmp * r - intervals <- sort(intervals) - } + } + + if (breaks == "geom") { + intervals <- min(x) + if (intervals <= 0) { + stop( + paste0( + "All values must be strictly greater ", + "than 0 when using the 'geom' method." + ), + call. = FALSE + ) } - if (breaks == "arith") { - intervals <- min(x) - intervals <- c(intervals, max(x)) - r <- (max(x) - min(x)) / sum(1:nbreaks) # raison - tmp <- min(x) - for (i in 1:(nbreaks - 1)) { - intervals <- c(intervals, tmp + r) - tmp <- tmp + r - intervals <- sort(intervals) - } + intervals <- c(intervals, max(x)) + r <- exp((log(max(x)) - log(min(x))) / nbreaks) # raison + tmp <- min(x) + for (i in 1:(nbreaks - 1)) { + intervals <- c(intervals, tmp * r) + tmp <- tmp * r + intervals <- sort(intervals) } - if (breaks == "q6") { - intervals <- as.vector(quantile(x, - probs = - c(0, 5, 27.5, 50, 72.5, 95, 100) / 100 - )) + } + + if (breaks == "arith") { + intervals <- min(x) + intervals <- c(intervals, max(x)) + r <- (max(x) - min(x)) / sum(1:nbreaks) # raison + tmp <- min(x) + for (i in 1:(nbreaks - 1)) { + intervals <- c(intervals, tmp + r) + tmp <- tmp + r + intervals <- sort(intervals) } - if (breaks == "em") { - t <- bitwAnd(nbreaks, (nbreaks - 1)) - if (t != 0) { - stop("The number of classes must be a power of 2") - } else { - min_vec <- min(x) - max_vec <- max(x) - it <- log2(nbreaks) + } - int <- c(min_vec, max_vec) + if (breaks == "q6") { + intervals <- as.vector( + quantile(x = x, probs = c(0, .05, .275, .5, .725, .95, 1)) + ) + } - for (a in 1:it) { - valprv <- c() - for (i in 1:(length(int) - 1)) { - if (i == 1) { - sub_vec <- x[x >= int[i] & x <= int[i + 1]] - } else { - sub_vec <- x[x > int[i] & x <= int[i + 1]] - } - valprv <- c(valprv, mean(sub_vec)) + if (breaks == "Q6") { + intervals <- as.vector( + quantile(x = x, probs = c(0, .05, .25, .5, .75, .95, 1)) + ) + } + + if (breaks == "em") { + t <- bitwAnd(nbreaks, (nbreaks - 1)) + if (t != 0) { + stop("The number of classes must be a power of 2") + } else { + min_vec <- min(x) + max_vec <- max(x) + it <- log2(nbreaks) + + int <- c(min_vec, max_vec) + + for (a in 1:it) { + valprv <- c() + for (i in 1:(length(int) - 1)) { + if (i == 1) { + sub_vec <- x[x >= int[i] & x <= int[i + 1]] + } else { + sub_vec <- x[x > int[i] & x <= int[i + 1]] } - int <- c(int, valprv) - int <- int[order(int)] + valprv <- c(valprv, mean(sub_vec)) } - intervals <- int + int <- c(int, valprv) + int <- int[order(int)] } + intervals <- int } - if (breaks == "msd") { - min_vec <- min(x) - max_vec <- max(x) - avg_vec <- mean(x) - sd_vec <- sqrt(sum((x - avg_vec)^2) / length(x)) - - if (central == FALSE) { - pose <- ceiling((max_vec - avg_vec) / (sd_vec * k)) - nege <- ceiling((avg_vec - min_vec) / (sd_vec * k)) + } + if (breaks == "msd") { + min_vec <- min(x) + max_vec <- max(x) + avg_vec <- mean(x) + sd_vec <- sqrt(sum((x - avg_vec)^2) / length(x)) + if (central == FALSE) { + pose <- ceiling((max_vec - avg_vec) / (sd_vec * k)) + nege <- ceiling((avg_vec - min_vec) / (sd_vec * k)) + avg_vec + (1:pose) * (sd_vec * k) + bks <- c( + avg_vec - (1:nege) * (sd_vec * k), + avg_vec, avg_vec + (1:pose) * (sd_vec * k) - bks <- c( - avg_vec - (1:nege) * (sd_vec * k), - avg_vec, - avg_vec + (1:pose) * (sd_vec * k) - ) - intervals <- c(min_vec, bks[bks > min_vec & bks < max_vec], max_vec) - } else { - pose <- ceiling((max_vec - (avg_vec + 0.5 * sd_vec * k)) / (sd_vec * k)) - nege <- ceiling(((avg_vec - 0.5 * sd_vec * k) - min_vec) / (sd_vec * k)) - - bks <- c( - (avg_vec - 0.5 * sd_vec * k) - (1:nege) * (sd_vec * k), - (avg_vec - 0.5 * sd_vec * k), - (avg_vec + 0.5 * sd_vec * k), - (avg_vec + 0.5 * sd_vec * k) + (1:pose) * (sd_vec * k) - ) - intervals <- c(min_vec, bks[bks > min_vec & bks < max_vec], max_vec) - } - intervals <- intervals[order(intervals)] + ) + intervals <- c(min_vec, bks[bks > min_vec & bks < max_vec], max_vec) + } else { + pose <- ceiling((max_vec - (avg_vec + 0.5 * sd_vec * k)) / (sd_vec * k)) + nege <- ceiling(((avg_vec - 0.5 * sd_vec * k) - min_vec) / (sd_vec * k)) + bks <- c( + (avg_vec - 0.5 * sd_vec * k) - (1:nege) * (sd_vec * k), + (avg_vec - 0.5 * sd_vec * k), + (avg_vec + 0.5 * sd_vec * k), + (avg_vec + 0.5 * sd_vec * k) + (1:pose) * (sd_vec * k) + ) + intervals <- c(min_vec, bks[bks > min_vec & bks < max_vec], max_vec) } + intervals <- intervals[order(intervals)] } + return(unique(intervals)) } diff --git a/R/mf_map.R b/R/mf_map.R index 75abf9d..d20b64e 100644 --- a/R/mf_map.R +++ b/R/mf_map.R @@ -158,14 +158,10 @@ #' add = TRUE) #' } #' -#' ## Breaks limits +#' ## Class boundaries #' Breaks defined by a numeric vector or a classification method are #' left-closed: breaks defined by \code{c(2, 5, 10, 15, 20)} #' will be mapped as [2 - 5[, [5 - 10[, [10 - 15[, \[15 - 20]. -#' The "jenks" method is an exception and has to be right-closed. -#' Jenks breaks computed as \code{c(2, 5, 10, 15, 20)} -#' will be mapped as \[2 - 5], ]5 - 10], ]10 - 15], ]15 - 20]. -#' #' #' #' @eval my_params(c( diff --git a/inst/tinytest/test_breaks.R b/inst/tinytest/test_breaks.R index e68aba3..b1e6faa 100644 --- a/inst/tinytest/test_breaks.R +++ b/inst/tinytest/test_breaks.R @@ -25,4 +25,9 @@ expect_equal(mf_get_breaks(x = x, breaks = c(1, 10, 20, 100), k = 1), c(1, 10, 20, 100)) expect_equal(mf_get_breaks(x = x, breaks = "fisher"), c(11929, 13953, 15685.5, 17372, 18622, 20354.5, 21761)) +expect_equal(mf_get_breaks(x = x, breaks = "ckmeans"), + c(11929, 13953, 15685.5, 17372, 18622, 20354.5, 21761)) expect_error(mf_get_breaks(x = c(0, x), breaks = "geom")) +expect_equal(mf_get_breaks(x = x, breaks = "Q6"), + c(11929, 13168.65, 14457.25, 15685.5, 17979.75, 20159.35, 21761)) + diff --git a/man/mf_get_breaks.Rd b/man/mf_get_breaks.Rd index 48398ab..404256e 100644 --- a/man/mf_get_breaks.Rd +++ b/man/mf_get_breaks.Rd @@ -13,9 +13,8 @@ classification.} \item{nbreaks}{a number of classes} \item{breaks}{a classification method; one of "fixed", "sd", "equal", -"pretty", "quantile", -"kmeans", "hclust", "bclust", "fisher", "jenks", "dpih", "q6", "geom", - "arith", "em" or "msd" (see Details).} +"pretty", "quantile", "kmeans", "hclust", "bclust", "fisher", "jenks", +"dpih", "q6", "Q6", geom", "arith", "em", "msd" or "ckmeans" (see Details)} \item{k}{number of standard deviation for "msd" method (see Details)} @@ -29,17 +28,38 @@ A numeric vector of breaks } \description{ A function to classify continuous variables. + +This function is a wrapper for +\code{\link[classInt:classIntervals]{classIntervals}} +with some additional methods. } \details{ "fixed", "sd", "equal", "pretty", "quantile", "kmeans", "hclust", "bclust", "fisher", "jenks" and "dpih" are \code{\link[classInt:classIntervals]{classIntervals}} methods. You may need to pass additional arguments for some of them.\cr\cr -Jenks ("jenks" method) and Fisher ("fisher" method) algorithms are -based on the same principle and give -quite similar results but Fisher is much faster. \cr\cr + +The "jenks", "fisher" and "ckmeans" methods are based on the same concept of +\strong{natural breaks} and and produce similar groupings. +\itemize{ +\item The "jenks" method produces class boundaries falling on data points and is +slow. +\item The "fisher" method produces class boundaries located more conveniently +between data points, and is faster than the "jenks" method. +\item The "ckmeans" method produces exactly the same class boundaries as the +"fisher" method, but is much faster. It uses the optimal univariate +k-means method from the \code{Ckmeans.1d.dp} package. +If the "ckmeans" method is selected but the \code{Ckmeans.1d.dp} package is not +installed then the "fisher" method is used. +} + +The relative speeds of these three methods may vary depending on the number +of data points and the number of classes.\cr\cr + The "q6" method uses the following \code{\link[stats:quantile]{quantile}} probabilities: 0, 0.05, 0.275, 0.5, 0.725, 0.95, 1.\cr\cr +The "Q6" method uses the following \code{\link[stats:quantile]{quantile}} +probabilities: 0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.\cr\cr The "geom" method is based on a geometric progression along the variable values, all values must be strictly greater than zero.\cr\cr The "arith" method is based on an arithmetic progression along @@ -53,11 +73,6 @@ the extent of each class in share of standard deviation. If \code{central=TRUE} then the mean value is the center of a class else the mean is a break value. } -\note{ -This function is mainly a wrapper -of \code{\link[classInt:classIntervals]{classIntervals}} + -"arith", "em", "q6", "geom" and "msd" methods. -} \examples{ mtq <- mf_get_mtq() mf_get_breaks(x = mtq$MED, nbreaks = 6, breaks = "quantile") diff --git a/man/mf_map.Rd b/man/mf_map.Rd index b3e6b7a..a6deec6 100644 --- a/man/mf_map.Rd +++ b/man/mf_map.Rd @@ -261,14 +261,11 @@ mf_map(x, var, type = "symb_choro", pal = "Mint", alpha = 1, rev = FALSE, } } -\subsection{Breaks limits}{ +\subsection{Class boundaries}{ Breaks defined by a numeric vector or a classification method are left-closed: breaks defined by \code{c(2, 5, 10, 15, 20)} will be mapped as [2 - 5[, [5 - 10[, [10 - 15[, [15 - 20]. -The "jenks" method is an exception and has to be right-closed. -Jenks breaks computed as \code{c(2, 5, 10, 15, 20)} -will be mapped as [2 - 5], ]5 - 10], ]10 - 15], ]15 - 20]. } } \examples{