Overlay normal curve to histogram in R

小开

最佳答案

Here's a nice easy way I found:

h <- hist(g, breaks = 10, density = 10,
col = "lightgray", xlab = "Accuracy", main = "Overall")
xfit <- seq(min(g), max(g), length = 40)
yfit <- dnorm(xfit, mean = mean(g), sd = sd(g))
yfit <- yfit * diff(h$mids[1:2]) * length(g)


lines(xfit, yfit, col = "black", lwd = 2)

小开

You need to find the right multiplier to convert density (an estimated curve where the area beneath the curve is 1) to counts. This can be easily calculated from the hist object.

myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]


plot(myhist)
lines(mydensity)

enter image description here

A more complete version, with a normal density and lines at each standard deviation away from the mean (including the mean):

myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]


plot(myhist)
lines(mydensity)


myx <- seq(min(mtcars$mpg), max(mtcars$mpg), length.out= 100)
mymean <- mean(mtcars$mpg)
mysd <- sd(mtcars$mpg)


normal <- dnorm(x = myx, mean = mymean, sd = mysd)
lines(myx, normal * multiplier[1], col = "blue", lwd = 2)


sd_x <- seq(mymean - 3 * mysd, mymean + 3 * mysd, by = mysd)
sd_y <- dnorm(x = sd_x, mean = mymean, sd = mysd) * multiplier[1]


segments(x0 = sd_x, y0= 0, x1 = sd_x, y1 = sd_y, col = "firebrick4", lwd = 2)

小开

This is an implementation of aforementioned StanLe's anwer, also fixing the case where his answer would produce no curve when using densities.

This replaces the existing but hidden hist.default() function, to only add the normalcurve parameter (which defaults to TRUE).

The first three lines are to support roxygen2 for package building.

#' @noRd
#' @exportMethod hist.default
#' @export
hist.default <- function(x,
breaks = "Sturges",
freq = NULL,
include.lowest = TRUE,
normalcurve = TRUE,
right = TRUE,
density = NULL,
angle = 45,
col = NULL,
border = NULL,
main = paste("Histogram of", xname),
ylim = NULL,
xlab = xname,
ylab = NULL,
axes = TRUE,
plot = TRUE,
labels = FALSE,
warn.unused = TRUE,
...)  {


# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")


suppressWarnings(
h <- graphics::hist.default(
x = x,
breaks = breaks,
freq = freq,
include.lowest = include.lowest,
right = right,
density = density,
angle = angle,
col = col,
border = border,
main = main,
ylim = ylim,
xlab = xlab,
ylab = ylab,
axes = axes,
plot = plot,
labels = labels,
warn.unused = warn.unused,
...
)
)


if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- yfit * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}


if (plot == TRUE) {
invisible(h)
} else {
h
}
}

Quick example:

hist(g)

For dates it's bit different. For reference:

#' @noRd
#' @exportMethod hist.Date
#' @export
hist.Date <- function(x,
breaks = "months",
format = "%b",
normalcurve = TRUE,
xlab = xname,
plot = TRUE,
freq = NULL,
density = NULL,
start.on.monday = TRUE,
right = TRUE,
...)  {


# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")


suppressWarnings(
h <- graphics:::hist.Date(
x = x,
breaks = breaks,
format = format,
freq = freq,
density = density,
start.on.monday = start.on.monday,
right = right,
xlab = xlab,
plot = plot,
...
)
)


if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- as.double(yfit) * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}


if (plot == TRUE) {
invisible(h)
} else {
h
}
}

小开

Just remove the prob = T, and let it stay at default ie F