From 84454088fa5586852b58f4ea0f2ab25789ef0d06 Mon Sep 17 00:00:00 2001 From: "Mattan S. Ben-Shachar" Date: Mon, 21 Oct 2024 22:27:25 +0300 Subject: [PATCH 1/4] Docs/example for non-equal-width bins --- R/geom-histogram.R | 12 ++++++++++++ R/stat-bin.R | 2 +- man/geom_histogram.Rd | 14 +++++++++++++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/R/geom-histogram.R b/R/geom-histogram.R index dafc181f15..e69434cb17 100644 --- a/R/geom-histogram.R +++ b/R/geom-histogram.R @@ -63,6 +63,18 @@ #' ggplot(diamonds, aes(price, after_stat(density), colour = cut)) + #' geom_freqpoly(binwidth = 500) #' +#' +#' # When using the non-equal-width bins, we need to set the area of the bars to +#' # represent the counts (not the height). +#' # Here we're using 10 equi-probable bins: +#' price_bins <- quantile(diamonds$price, probs = seq(0, 1, length = 11)) +#' +#' ggplot(diamonds, aes(price)) + +#' geom_histogram(breaks = price_bins, color = "black") # wrong (height = count) +#' +#' ggplot(diamonds, aes(price, after_stat(count / width))) + +#' geom_histogram(breaks = price_bins, color = "black") # area = count +#' #' if (require("ggplot2movies")) { #' # Often we don't want the height of the bar to represent the #' # count of observations, but the sum of some other variable. diff --git a/R/stat-bin.R b/R/stat-bin.R index c085f818a2..699b665957 100644 --- a/R/stat-bin.R +++ b/R/stat-bin.R @@ -31,7 +31,7 @@ #' density = "density of points in bin, scaled to integrate to 1.", #' ncount = "count, scaled to a maximum of 1.", #' ndensity = "density, scaled to a maximum of 1.", -#' width = "widths of bins." +#' width = "widths of bins. Use with `after_stat(count / width)` to obtain bars with _areas_ representing counts (e.g., with non-equal-width bins). See example." 
#' ) #' #' @section Dropped variables: diff --git a/man/geom_histogram.Rd b/man/geom_histogram.Rd index 1f290dbcdc..54ea04f6e2 100644 --- a/man/geom_histogram.Rd +++ b/man/geom_histogram.Rd @@ -215,7 +215,7 @@ These are calculated by the 'stat' part of layers and can be accessed with \link \item \code{after_stat(density)}\cr density of points in bin, scaled to integrate to 1. \item \code{after_stat(ncount)}\cr count, scaled to a maximum of 1. \item \code{after_stat(ndensity)}\cr density, scaled to a maximum of 1. -\item \code{after_stat(width)}\cr widths of bins. +\item \code{after_stat(width)}\cr widths of bins. Use with \code{after_stat(count / width)} to obtain bars with \emph{areas} representing counts (e.g., with non-equal-width bins). See example. } } @@ -255,6 +255,18 @@ ggplot(diamonds, aes(price, colour = cut)) + ggplot(diamonds, aes(price, after_stat(density), colour = cut)) + geom_freqpoly(binwidth = 500) + +# When using the non-equal-width bins, we need to set the area of the bars to +# represent the counts (not the height). +# Here we're using 10 equi-probable bins: +price_bins <- quantile(diamonds$price, probs = seq(0, 1, length = 11)) + +ggplot(diamonds, aes(price)) + + geom_histogram(breaks = price_bins, color = "black") # wrong (height = count) + +ggplot(diamonds, aes(price, after_stat(count / width))) + + geom_histogram(breaks = price_bins, color = "black") # area = count + if (require("ggplot2movies")) { # Often we don't want the height of the bar to represent the # count of observations, but the sum of some other variable. From 87c0c10c9cdf83115b6b5058655c9c9e62c1ec77 Mon Sep 17 00:00:00 2001 From: "Mattan S. 
Ben-Shachar" <35330040+mattansb@users.noreply.github.com> Date: Fri, 25 Oct 2024 13:17:18 +0300 Subject: [PATCH 2/4] Update R/geom-histogram.R Co-authored-by: Teun van den Brand <49372158+teunbrand@users.noreply.github.com> --- R/geom-histogram.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/geom-histogram.R b/R/geom-histogram.R index e69434cb17..868c8bc8f3 100644 --- a/R/geom-histogram.R +++ b/R/geom-histogram.R @@ -64,7 +64,7 @@ #' geom_freqpoly(binwidth = 500) #' #' -#' # When using the non-equal-width bins, we need to set the area of the bars to +#' # When using the non-equal-width bins, we should set the area of the bars to #' # represent the counts (not the height). #' # Here we're using 10 equi-probable bins: #' price_bins <- quantile(diamonds$price, probs = seq(0, 1, length = 11)) From 3837d44e508d665678c793ee39e94791ac658da8 Mon Sep 17 00:00:00 2001 From: "Mattan S. Ben-Shachar" <35330040+mattansb@users.noreply.github.com> Date: Fri, 25 Oct 2024 13:17:26 +0300 Subject: [PATCH 3/4] Update R/geom-histogram.R Co-authored-by: Teun van den Brand <49372158+teunbrand@users.noreply.github.com> --- R/geom-histogram.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/geom-histogram.R b/R/geom-histogram.R index 868c8bc8f3..4dc6d3f43c 100644 --- a/R/geom-histogram.R +++ b/R/geom-histogram.R @@ -70,7 +70,7 @@ #' price_bins <- quantile(diamonds$price, probs = seq(0, 1, length = 11)) #' #' ggplot(diamonds, aes(price)) + -#' geom_histogram(breaks = price_bins, color = "black") # wrong (height = count) +#' geom_histogram(breaks = price_bins, color = "black") # misleading (height = count) #' #' ggplot(diamonds, aes(price, after_stat(count / width))) + #' geom_histogram(breaks = price_bins, color = "black") # area = count From 99bb87d02054d1f8a38ad3dd367b2d2a94419e11 Mon Sep 17 00:00:00 2001 From: "Mattan S. 
Ben-Shachar" Date: Fri, 25 Oct 2024 14:17:07 +0300 Subject: [PATCH 4/4] move (count / width) note to details --- R/geom-histogram.R | 6 ++++++ R/stat-bin.R | 2 +- man/geom_histogram.Rd | 12 +++++++++--- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/R/geom-histogram.R b/R/geom-histogram.R index 4dc6d3f43c..7bd832b611 100644 --- a/R/geom-histogram.R +++ b/R/geom-histogram.R @@ -17,6 +17,12 @@ #' one change at a time. You may need to look at a few options to uncover #' the full story behind your data. #' +#' By default, the _height_ of the bars represents the counts within each bin. +#' However, there are situations where this behavior might produce misleading +#' plots (e.g., when non-equal-width bins are used), in which case it might be +#' preferable to have the _area_ of the bars represent the counts (by setting +#' `aes(y = after_stat(count / width))`). See example below. +#' #' In addition to `geom_histogram()`, you can create a histogram plot by using #' `scale_x_binned()` with [geom_bar()]. This method by default plots tick marks #' in between each bar. diff --git a/R/stat-bin.R b/R/stat-bin.R index 699b665957..c085f818a2 100644 --- a/R/stat-bin.R +++ b/R/stat-bin.R @@ -31,7 +31,7 @@ #' density = "density of points in bin, scaled to integrate to 1.", #' ncount = "count, scaled to a maximum of 1.", #' ndensity = "density, scaled to a maximum of 1.", -#' width = "widths of bins. Use with `after_stat(count / width)` to obtain bars with _areas_ representing counts (e.g., with non-equal-width bins). See example." +#' width = "widths of bins." #' ) #' #' @section Dropped variables: diff --git a/man/geom_histogram.Rd b/man/geom_histogram.Rd index 54ea04f6e2..a241aa2ba4 100644 --- a/man/geom_histogram.Rd +++ b/man/geom_histogram.Rd @@ -192,6 +192,12 @@ different number of bins. You can also experiment modifying the \code{binwidth} one change at a time. You may need to look at a few options to uncover the full story behind your data. 
+By default, the \emph{height} of the bars represents the counts within each bin. +However, there are situations where this behavior might produce misleading +plots (e.g., when non-equal-width bins are used), in which case it might be +preferable to have the \emph{area} of the bars represent the counts (by setting +\code{aes(y = after_stat(count / width))}). See example below. + In addition to \code{geom_histogram()}, you can create a histogram plot by using \code{scale_x_binned()} with \code{\link[=geom_bar]{geom_bar()}}. This method by default plots tick marks in between each bar. @@ -215,7 +221,7 @@ These are calculated by the 'stat' part of layers and can be accessed with \link \item \code{after_stat(density)}\cr density of points in bin, scaled to integrate to 1. \item \code{after_stat(ncount)}\cr count, scaled to a maximum of 1. \item \code{after_stat(ndensity)}\cr density, scaled to a maximum of 1. -\item \code{after_stat(width)}\cr widths of bins. Use with \code{after_stat(count / width)} to obtain bars with \emph{areas} representing counts (e.g., with non-equal-width bins). See example. +\item \code{after_stat(width)}\cr widths of bins. } } @@ -256,13 +262,13 @@ ggplot(diamonds, aes(price, after_stat(density), colour = cut)) + geom_freqpoly(binwidth = 500) -# When using the non-equal-width bins, we need to set the area of the bars to +# When using the non-equal-width bins, we should set the area of the bars to # represent the counts (not the height). # Here we're using 10 equi-probable bins: price_bins <- quantile(diamonds$price, probs = seq(0, 1, length = 11)) ggplot(diamonds, aes(price)) + - geom_histogram(breaks = price_bins, color = "black") # wrong (height = count) + geom_histogram(breaks = price_bins, color = "black") # misleading (height = count) ggplot(diamonds, aes(price, after_stat(count / width))) + geom_histogram(breaks = price_bins, color = "black") # area = count