在图上添加回归线方程和R^2

我想知道如何在ggplot上添加回归线方程和R^2。我的代码是:

library(ggplot2)


df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
p <- ggplot(data = df, aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()
p

任何帮助都将不胜感激。

406868 次浏览

这里有一个解决方案

# GET EQUATION AND R-SQUARED AS STRING
# SOURCE: https://groups.google.com/forum/#!topic/ggplot2/1TgH-kG5XMA


lm_eqn <- function(df){
m <- lm(y ~ x, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}


p1 <- p + geom_text(x = 25, y = 300, label = lm_eqn(df), parse = TRUE)

编辑。我找到了这个代码的来源。这里是ggplot2谷歌组中原始帖子的链接

Output

我修改了Ramnath的帖子,a)使其更通用,以便它接受线性模型作为参数,而不是数据帧;b)更适当地显示负号。

lm_eqn = function(m) {


l <- list(a = format(coef(m)[1], digits = 2),
b = format(abs(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3));


if (coef(m)[2] >= 0)  {
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
} else {
eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
}


as.character(as.expression(eq));
}

用法将更改为:

p1 = p + geom_text(aes(x = 25, y = 300, label = lm_eqn(lm(y ~ x, df))), parse = TRUE)

我修改了stat_smooth的源代码和相关函数的几行代码,以创建一个添加拟合方程和R平方值的新函数。这也适用于小面图!

library(devtools)
source_gist("524eade46135f6348140")
df = data.frame(x = c(1:100))
df$y = 2 + 5 * df$x + rnorm(100, sd = 40)
df$class = rep(1:2,50)
ggplot(data = df, aes(x = x, y = y, label=y)) +
stat_smooth_func(geom="text",method="lm",hjust=0,parse=TRUE) +
geom_smooth(method="lm",se=FALSE) +
geom_point() + facet_wrap(~class)

enter image description here

我使用了@Ramnath回答中的代码来格式化方程。stat_smooth_func函数不是很健壮,但是使用它应该不难。

https://gist.github.com/kdauria/524eade46135f6348140。如果你得到一个错误,尝试更新ggplot2

我的包ggpmisc中的统计stat_poly_eq()使它可以根据线性模型拟合添加文本标签。

此答案已更新为'ggpmisc' (>= 0.4.0)和'ggplot2' (>= 3.3.0) 2022-06-02。 在示例中,我使用stat_poly_line()而不是stat_smooth(),因为它对于methodformula具有与stat_poly_eq()相同的默认值。在所有代码示例中,我省略了stat_poly_line()的附加参数,因为它们与添加标签的问题无关

library(ggplot2)
library(ggpmisc)
#> Loading required package: ggpp
#>
#> Attaching package: 'ggpp'
#> The following object is masked from 'package:ggplot2':
#>
#>     annotate
# artificial data
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
df$yy <- 2 + 3 * df$x + 0.1 * df$x^2 + rnorm(100, sd = 40)


# using default formula, label and methods
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq() +
geom_point()

< img src = " https://i.imgur.com/RDZ2XTj.png " alt = " / >

# assembling a single label with equation and R2
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()

< img src = " https://i.imgur.com/1moN0zF.png " alt = " / >

# adding separate labels with equation and R2
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = after_stat(eq.label))) +
stat_poly_eq(label.y = 0.9) +
geom_point()

< img src = " https://i.imgur.com/Gv3qsgl.png " alt = " / >

# regression through the origin
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line(formula = y ~ x + 0) +
stat_poly_eq(formula = y ~ x + 0, aes(label = after_stat(eq.label))) +
geom_point()

< img src = " https://i.imgur.com/xux4jvx.png " alt = " / >

# fitting a polynomial
ggplot(data = df, aes(x = x, y = yy)) +
stat_poly_line(formula = y ~ poly(x, 2, raw = TRUE)) +
stat_poly_eq(formula = y ~ poly(x, 2, raw = TRUE),
aes(label = after_stat(eq.label))) +
geom_point()

< img src = " https://i.imgur.com/ss2mXll.png " alt = " / >

# adding a hat as asked by @MYaseen208 and @elarry
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()

< img src = " https://i.imgur.com/Eb2kLV9.png " alt = " / >

# variable substitution as asked by @shabbychef
# same labels in equation and axes
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(eq.with.lhs = "italic(h)~`=`~",
eq.x.rhs = "~italic(z)",
aes(label = after_stat(eq.label))) +
labs(x = expression(italic(z)), y = expression(italic(h))) +
geom_point()

< img src = " https://i.imgur.com/S6xxz2v.png " alt = " / >

# grouping as asked by @helen.h
dfg <- data.frame(x = c(1:100))
dfg$y <- 20 * c(0, 1) + 3 * df$x + rnorm(100, sd = 40)
dfg$group <- factor(rep(c("A", "B"), 50))


ggplot(data = dfg, aes(x = x, y = y, colour = group)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()

< img src = " https://i.imgur.com/lTEynGy.png " alt = " / >

ggplot(data = dfg, aes(x = x, y = y, linetype = group, grp.label = group)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(grp.label), "*\": \"*",
after_stat(eq.label), "*\", \"*",
after_stat(rr.label), sep = ""))) +
geom_point()

< img src = " https://i.imgur.com/TfBBBCu.png " alt = " / >

# a single fit with grouped data as asked by @Herman
ggplot(data = dfg, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point(aes(colour = group))

< img src = " https://i.imgur.com/BlWBPRk.png " alt = " / >

# facets
ggplot(data = dfg, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point() +
facet_wrap(~group)

< img src = " https://i.imgur.com/6BIxWGj.png " alt = " / >

reprex包 (v2.0.1)创建于2022-06-02

真的很喜欢@Ramnath的解决方案。为了允许使用自定义回归公式(而不是固定为y和x作为字面变量名),并将p值添加到打印输出中(正如@Jerry T评论的那样),下面是mod:

lm_eqn <- function(df, y, x){
formula = as.formula(sprintf('%s ~ %s', y, x))
m <- lm(formula, data=df);
# formating the values into a summary string to print out
# ~ give some space, but equal size and comma need to be quoted
eq <- substitute(italic(target) == a + b %.% italic(input)*","~~italic(r)^2~"="~r2*","~~p~"="~italic(pvalue),
list(target = y,
input = x,
a = format(as.vector(coef(m)[1]), digits = 2),
b = format(as.vector(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
# getting the pvalue is painful
pvalue = format(summary(m)$coefficients[2,'Pr(>|t|)'], digits=1)
)
)
as.character(as.expression(eq));
}


geom_point() +
ggrepel::geom_text_repel(label=rownames(mtcars)) +
geom_text(x=3,y=300,label=lm_eqn(mtcars, 'hp','wt'),color='red',parse=T) +
geom_smooth(method='lm')

enter image description here 不幸的是,这并不适用于facet_wrap或facet_grid

这个答案中提供的方程风格的启发,一个更通用的方法(多个预测器+ latex输出作为选项)可以是:

print_equation= function(model, latex= FALSE, ...){
dots <- list(...)
cc= model$coefficients
var_sign= as.character(sign(cc[-1]))%>%gsub("1","",.)%>%gsub("-"," - ",.)
var_sign[var_sign==""]= ' + '


f_args_abs= f_args= dots
f_args$x= cc
f_args_abs$x= abs(cc)
cc_= do.call(format, args= f_args)
cc_abs= do.call(format, args= f_args_abs)
pred_vars=
cc_abs%>%
paste(., x_vars, sep= star)%>%
paste(var_sign,.)%>%paste(., collapse= "")


if(latex){
star= " \\cdot "
y_var= strsplit(as.character(model$call$formula), "~")[[2]]%>%
paste0("\\hat{",.,"_{i}}")
x_vars= names(cc_)[-1]%>%paste0(.,"_{i}")
}else{
star= " * "
y_var= strsplit(as.character(model$call$formula), "~")[[2]]
x_vars= names(cc_)[-1]
}


equ= paste(y_var,"=",cc_[1],pred_vars)
if(latex){
equ= paste0(equ," + \\hat{\\varepsilon_{i}} \\quad where \\quad \\varepsilon \\sim \\mathcal{N}(0,",
summary(MetamodelKdifEryth)$sigma,")")%>%paste0("$",.,"$")
}
cat(equ)
}

model参数需要一个lm对象,latex参数是一个布尔值,要求一个简单字符或一个乳胶格式的方程,而...参数将其值传递给format函数。

我还添加了一个选项来输出它为latex,这样你就可以在rmarkdown中使用这个函数:


```{r echo=FALSE, results='asis'}
print_equation(model = lm_mod, latex = TRUE)
```

现在使用它:

df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
df$z <- 8 + 3 * df$x + rnorm(100, sd = 40)
lm_mod= lm(y~x+z, data = df)


print_equation(model = lm_mod, latex = FALSE)

此代码产生: y = 11.3382963933174 + 2.5893419 * x + 0.1002227 * z < / p >

如果我们要求一个乳胶方程,将参数四舍五入为3位:

print_equation(model = lm_mod, latex = TRUE, digits= 3)
< p >这个收益率: 乳胶方程 < / p >

使用ggpubr:

library(ggpubr)


# reproducible data
set.seed(1)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)


# By default showing Pearson R
ggscatter(df, x = "x", y = "y", add = "reg.line") +
stat_cor(label.y = 300) +
stat_regline_equation(label.y = 280)

enter image description here

# Use R2 instead of R
ggscatter(df, x = "x", y = "y", add = "reg.line") +
stat_cor(label.y = 300,
aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))) +
stat_regline_equation(label.y = 280)


## compare R2 with accepted answer
# m <- lm(y ~ x, df)
# round(summary(m)$r.squared, 2)
# [1] 0.85

enter image description here

下面是对每个人来说最简单的代码

注:显示Pearson的Rho和 R^2。

library(ggplot2)
library(ggpubr)


df <- data.frame(x = c(1:100)
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
p <- ggplot(data = df, aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()+
stat_cor(label.y = 35)+ #this means at 35th unit in the y axis, the r squared and p value will be shown
stat_regline_equation(label.y = 30) #this means at 30th unit regresion line equation will be shown


p

One such example with my own dataset

另一种选择是创建一个自定义函数,使用dplyrbroom库生成方程:

get_formula <- function(model) {
  

broom::tidy(model)[, 1:2] %>%
mutate(sign = ifelse(sign(estimate) == 1, ' + ', ' - ')) %>% #coeff signs
mutate_if(is.numeric, ~ abs(round(., 2))) %>% #for improving formatting
mutate(a = ifelse(term == '(Intercept)', paste0('y ~ ', estimate), paste0(sign, estimate, ' * ', term))) %>%
summarise(formula = paste(a, collapse = '')) %>%
as.character
  

}


lm(y ~ x, data = df) -> model
get_formula(model)
#"y ~ 6.22 + 3.16 * x"


scales::percent(summary(model)$r.squared, accuracy = 0.01) -> r_squared

现在我们需要将文本添加到情节中:

p +
geom_text(x = 20, y = 300,
label = get_formula(model),
color = 'red') +
geom_text(x = 20, y = 285,
label = r_squared,
color = 'blue')

plot

类似于@zx8754和@kdauria的答案,只是使用了ggplot2ggpubr。我更喜欢使用ggpubr,因为它不需要自定义函数,比如这个问题的顶部答案。

library(ggplot2)
library(ggpubr)


df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)


ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label..)), # adds R^2 value
r.accuracy = 0.01,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..), # adds equation to linear regression
label.x = 0, label.y = 400, size = 4)


enter image description here

也可以把p值加到上图中吗

ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")), # adds R^2 and p-value
r.accuracy = 0.01,
p.accuracy = 0.001,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..), # adds equation to linear regression
label.x = 0, label.y = 400, size = 4)

enter image description here

当你有多个组时,也适用于facet_wrap()

df$group <- rep(1:2,50)


ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),
r.accuracy = 0.01,
p.accuracy = 0.001,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..),
label.x = 0, label.y = 400, size = 4) +
theme_bw() +
facet_wrap(~group)

enter image description here