按组获取最高值

下面是一个示例数据框:

# Example data: 90 uniform random values in three groups of 30 rows each.
d <- data.frame(
  x = runif(n = 90),
  grp = gl(n = 3, k = 30)
)

我想要 d 的一个子集,其中包含 grp 的每个取值下 x 最大的 5 行。

使用 base-R,我的方法是这样的:

# Sort every row by x (largest first), split the sorted frame by group, and
# keep the first five rows of each piece before stacking them back together.
# head() defaults to n = 6, so n must be passed explicitly to get five rows.
ordered <- d[order(d$x, decreasing = TRUE), ]
splits <- split(ordered, ordered$grp)
heads <- lapply(splits, head, n = 5L)
do.call(rbind, heads)
##              x grp
## 1.19 0.8879631   1
## 1.4  0.8844818   1
## 1.12 0.8596197   1
## 1.26 0.8481809   1
## 1.18 0.8461516   1
## 2.31 0.9751049   2
## 2.34 0.9269764   2
## 2.57 0.8964114   2
## 2.58 0.8896466   2
## 2.45 0.8888834   2
## 3.74 0.9884852   3
## 3.73 0.9837653   3
## 3.83 0.9375398   3
## 3.64 0.9229036   3
## 3.69 0.8021373   3

使用 dplyr,我期望它能够工作:

# Attempt from the question (does NOT give the wanted result): head() here
# returns only the first 5 rows of the sorted table, not 5 rows per group.
d %>%
arrange_(~ desc(x)) %>%
group_by_(~ grp) %>%
head(n = 5)

但是它只返回前5行。

把 head 换成 top_n,则会返回整个 d:

# Attempt from the question (does NOT give the wanted result): no `wt` is
# supplied, so top_n() ranks by the last column (grp) instead of x and the
# whole of d comes back.
d %>%
arrange_(~ desc(x)) %>%
group_by_(~ grp) %>%
top_n(n = 5)

我如何得到正确的子集?

126627 次浏览

You need to wrap head in a call to do. In the following code, . represents the current group (see description of ... in the do help page).

# Sort by x (descending), group by grp, then run head() once per group via
# do(); inside do(), `.` stands for the current group's rows.
# arrange()/group_by() replace the deprecated arrange_()/group_by_() verbs.
d %>%
  arrange(desc(x)) %>%
  group_by(grp) %>%
  do(head(., n = 5))

正如 akrun 所提到的,slice是另一种选择。

# Sort by x (descending), group by grp, then keep rows 1 to 5 of each group.
# arrange()/group_by() replace the deprecated arrange_()/group_by_() verbs.
d %>%
  arrange(desc(x)) %>%
  group_by(grp) %>%
  slice(1:5)

尽管我没有问这个问题,但为了完整起见,可能的 data.table版本是(感谢@Arun 的修复) :

# Convert d to a data.table, then take the first five rows of each group
# after ordering the rows by descending x.
setDT(d)
d[order(-x), head(.SD, n = 5L), by = grp]

我的 base R 方法是:

# Sort by x (descending), then number the rows of the *sorted* frame within
# each group and keep positions 1 to 5. The counter has to be built from
# `ordered` (not `d`) so that it lines up with the rows being subset.
ordered <- d[order(d$x, decreasing = TRUE), ]
ordered[ave(ordered$x, ordered$grp, FUN = seq_along) <= 5L, ]

And using dplyr, the approach with slice is probably fastest, but you could also use filter which will likely be faster than using do(head(., 5)):

# After sorting by descending x, keep the rows whose position inside their
# group is 5 or less.
arrange(d, desc(x)) %>%
  group_by(grp) %>%
  filter(row_number() <= 5L)

Dplyr 基准测试

# Benchmark of the dplyr approaches on 1e6 rows, with group ids sampled
# (with replacement) from 1..1e4.
# NOTE(review): dplyr is not attached in this snippet; presumably it was
# loaded earlier in the session -- confirm.
set.seed(123)
d <- data.frame(
x   = runif(1e6),
grp = sample(1e4, 1e6, TRUE))


library(microbenchmark)


# Each named expression is evaluated 10 times; timings are reported in
# relative units.
microbenchmark(
# top_n() ranking by x within each group (no prior sort)
top_n = {d %>%
group_by(grp) %>%
top_n(n = 5, wt = x)},
# sort, group, then head() per group through do()
dohead = {d %>%
arrange_(~ desc(x)) %>%
group_by_(~ grp) %>%
do(head(., n = 5))},
# sort, group, then slice rows 1..5 of each group
slice = {d %>%
arrange_(~ desc(x)) %>%
group_by_(~ grp) %>%
slice(1:5)},
# sort, group, then filter on the within-group row number
filter = {d %>%
arrange(desc(x)) %>%
group_by(grp) %>%
filter(row_number() <= 5L)},
times = 10,
unit = "relative"
)


Unit: relative
expr       min        lq    median        uq       max neval
top_n  1.042735  1.075366  1.082113  1.085072  1.000846    10
dohead 18.663825 19.342854 19.511495 19.840377 17.433518    10
slice  1.000000  1.000000  1.000000  1.000000  1.000000    10
filter  1.048556  1.044113  1.042184  1.180474  1.053378    10

Pretty easy with data.table too...

library(data.table)
# Convert d in place, sort it by descending x, then take the first five rows
# per group; keyby returns the groups in sorted order.
setDT(d)
setorder(d, -x)
d[, head(.SD, n = 5L), keyby = grp]

或者

# Sort d in place by group and then by descending x, and take the first five
# rows of each group.
setDT(d)
setorder(d, grp, -x)
d[, head(.SD, n = 5L), by = grp]

Or (Should be faster for big data set because avoiding calling .SD for each group)

# Sort d in place by group and descending x, add a within-group counter
# column `indx`, and keep the rows whose counter is at most 5.
setDT(d)
setorder(d, grp, -x)
d[, indx := seq_len(.N), by = grp]
d[indx <= 5]

编辑: 以下是 dplyr 与 data.table 的比较(如果有人感兴趣的话)

# Benchmark of the dplyr approaches against three data.table variants on
# 1e6 rows, with group ids sampled (with replacement) from 1..1e4.
set.seed(123)
d <- data.frame(
x   = runif(1e6),
grp = sample(1e4, 1e6, TRUE))


library(dplyr)
library(microbenchmark)
library(data.table)
# Separate copy for the data.table expressions; the dplyr ones use d.
dd <- copy(d)


# Each named expression is evaluated 10 times; timings are reported in
# relative units.
# NOTE(review): setDT(), setorder() and `:=` appear to modify dd by
# reference, so after the first evaluation dd is presumably already sorted
# and carries the indx column -- confirm whether this skews the timings.
microbenchmark(
# top_n() ranking by x within each group (no prior sort)
top_n = {d %>%
group_by(grp) %>%
top_n(n = 5, wt = x)},
# sort, group, then head() per group through do()
dohead = {d %>%
arrange_(~ desc(x)) %>%
group_by_(~ grp) %>%
do(head(., n = 5))},
# sort, group, then slice rows 1..5 of each group
slice = {d %>%
arrange_(~ desc(x)) %>%
group_by_(~ grp) %>%
slice(1:5)},
# sort, group, then filter on the within-group row number
filter = {d %>%
arrange(desc(x)) %>%
group_by(grp) %>%
filter(row_number() <= 5L)},
# sort by descending x, head() per group with keyby
data.table1 = setorder(setDT(dd), -x)[, head(.SD, 5L), keyby = grp],
# sort by grp and descending x, head() per group
data.table2 = setorder(setDT(dd), grp, -x)[, head(.SD, 5L), grp],
# sort by grp and descending x, add a per-group counter, filter on it
data.table3 = setorder(setDT(dd), grp, -x)[, indx := seq_len(.N), grp][indx <= 5L],
times = 10,
unit = "relative"
)




#        expr        min         lq      mean     median        uq       max neval
#       top_n  24.246401  24.492972 16.300391  24.441351 11.749050  7.644748    10
#      dohead 122.891381 120.329722 77.763843 115.621635 54.996588 34.114738    10
#       slice  27.365711  26.839443 17.714303  26.433924 12.628934  7.899619    10
#      filter  27.755171  27.225461 17.936295  26.363739 12.935709  7.969806    10
# data.table1  13.753046  16.631143 10.775278  16.330942  8.359951  5.077140    10
# data.table2  12.047111  11.944557  7.862302  11.653385  5.509432  3.642733    10
# data.table3   1.000000   1.000000  1.000000   1.000000  1.000000  1.000000    10

Adding a marginally faster data.table solution:

# Compare two data.table variants on 1e8 rows. The table is converted and
# sorted (by grp, then descending x) once, outside the timed expressions.
set.seed(123L)
d <- data.frame(
x   = runif(1e8),
grp = sample(1e4, 1e8, TRUE))
setDT(d)
setorder(d, grp, -x)
# data.table4 runs on its own copy, so it does not see the indx column that
# data.table3 adds to d.
dd <- copy(d)


library(microbenchmark)
microbenchmark(
# add a per-group counter column, then filter on it
data.table3 = d[, indx := seq_len(.N), grp][indx <= 5L],
# collect the row numbers (.I) of the first five rows per group, then
# subset dd once with them
data.table4 = dd[dd[, .I[seq_len(.N) <= 5L], grp]$V1],
times = 10L
)

计时输出:

Unit: milliseconds
expr      min       lq     mean   median        uq      max neval
data.table3 826.2148 865.6334 950.1380 902.1689 1006.1237 1260.129    10
data.table4 729.3229 783.7000 859.2084 823.1635  966.8239 1014.397    10

在 dplyr 1.0.0 中,“slice_min() 和 slice_max() 选择某个变量取值最小或最大的行,取代了令人困惑的 top_n()”。

# For each group, keep the five rows with the largest x.
d %>%
  group_by(grp) %>%
  slice_max(x, n = 5)
# # A tibble: 15 x 2
# # Groups:   grp [3]
#     x grp
# <dbl> <fct>
#  1 0.994 1
#  2 0.957 1
#  3 0.955 1
#  4 0.940 1
#  5 0.900 1
#  6 0.963 2
#  7 0.902 2
#  8 0.895 2
#  9 0.858 2
# 10 0.799 2
# 11 0.985 3
# 12 0.893 3
# 13 0.886 3
# 14 0.815 3
# 15 0.812 3

在 dplyr 1.0.0 之前,使用 top_n:

来自 ?top_n,关于 wt的论点:

“用于排序的变量 [...] 默认为 tbl 中的最后一个变量”。

数据集中的最后一个变量是“ grp”,它不是您希望排名的变量,这就是 top_n尝试“返回整个 d”的原因。因此,如果希望在数据集中按“ x”排名,则需要指定 wt = x

# Rank by x explicitly within each group; without `wt`, top_n() would rank
# by the last column of the table instead.
group_by(d, grp) %>%
  top_n(n = 5, wt = x)

资料:

# Reproducible example data: 90 uniform draws in three groups of 30 rows.
set.seed(123)
d <- data.frame(
  x = runif(n = 90),
  grp = gl(n = 3, k = 30)
)

如果排序变量在每个组中不是唯一的,top_n(n = 1) 仍会为每个组返回多行。为了精确地为每个组选择一行,可以给每一行添加一个唯一的变量:

# Reproducible example data: 90 uniform draws in three groups of 30 rows.
set.seed(123)
d <- data.frame(
  x   = runif(90),
  grp = gl(3, 30))


# Return exactly one row per group: the one with the largest x.
# Rows are sorted by descending x first, so the unique row number `rn`
# encodes the x ranking and breaks ties deterministically. Ranking by -rn
# keeps the smallest rn in each group, i.e. the largest x. Ranking by rn on
# the unsorted data would return the last row of each group regardless of x.
d %>%
  arrange(desc(x)) %>%
  mutate(rn = row_number()) %>%
  group_by(grp) %>%
  top_n(n = 1, wt = -rn)

还有一个 data.table解决方案可以突出其简洁的语法:

# Convert d to a data.table in place, order rows by descending x, and take
# the first five rows of each group. head() is used rather than .SD[1:5]
# because 1:5 pads groups holding fewer than five rows with NA rows.
setDT(d)
d[order(-x), head(.SD, 5L), by = grp]