从分组数据中选择第一行和最后一行

提问

使用 dplyr,如何在一个语句中选择分组数据的顶部和底部的观察值/行?

数据及例子

给定一个数据框架:

df <- data.frame(id=c(1,1,1,2,2,2,3,3,3),
stopId=c("a","b","c","a","b","c","a","b","c"),
stopSequence=c(1,2,3,3,1,4,3,1,2))

我可以使用 slice从每组中得到顶部和底部的观察结果,但是使用两个独立的语句:

firstStop <- df %>%
group_by(id) %>%
arrange(stopSequence) %>%
slice(1) %>%
ungroup


lastStop <- df %>%
group_by(id) %>%
arrange(stopSequence) %>%
slice(n()) %>%
ungroup

我是否可以将这两个语句组合成一个选择 都有顶部和底部观察值的语句?

154073 次浏览

可能还有一个更快的方法:

df %>%
group_by(id) %>%
arrange(stopSequence) %>%
filter(row_number()==1 | row_number()==n())

比如:

library(dplyr)


df <- data.frame(id=c(1,1,1,2,2,2,3,3,3),
stopId=c("a","b","c","a","b","c","a","b","c"),
stopSequence=c(1,2,3,3,1,4,3,1,2))


first_last <- function(x) {
bind_rows(slice(x, 1), slice(x, n()))
}


df %>%
group_by(id) %>%
arrange(stopSequence) %>%
do(first_last(.)) %>%
ungroup


## Source: local data frame [6 x 3]
##
##   id stopId stopSequence
## 1  1      a            1
## 2  1      c            3
## 3  2      b            1
## 4  2      c            4
## 5  3      b            1
## 6  3      a            3

使用 do,您几乎可以对组执行任意数量的操作,但是@jeremycg 的答案更适合于这个任务。

不是 dplyr,但是使用 data.table更直接:

library(data.table)
setDT(df)
df[ df[order(id, stopSequence), .I[c(1L,.N)], by=id]$V1 ]
#    id stopId stopSequence
# 1:  1      a            1
# 2:  1      c            3
# 3:  2      b            1
# 4:  2      c            4
# 5:  3      b            1
# 6:  3      a            3

更详细的解释:

# 1) get row numbers of first/last observations from each group
#    * basically, we sort the table by id/stopSequence, then,
#      grouping by id, name the row numbers of the first/last
#      observations for each id; since this operation produces
#      a data.table
#    * .I is data.table shorthand for the row number
#    * here, to be maximally explicit, I've named the variable V1
#      as row_num to give other readers of my code a clearer
#      understanding of what operation is producing what variable
first_last = df[order(id, stopSequence), .(row_num = .I[c(1L,.N)]), by=id]
idx = first_last$row_num


# 2) extract rows by number
df[idx]

一定要查看 开始 wiki,了解 data.table的基本知识

仅仅为了完整性: 您可以将一个索引向量传递给 slice:

df %>% arrange(stopSequence) %>% group_by(id) %>% slice(c(1,n()))

所以

  id stopId stopSequence
1  1      a            1
2  1      c            3
3  2      b            1
4  2      c            4
5  3      b            1
6  3      a            3

我知道问题指的是 dplyr。但是,由于其他人已经发布了使用其他软件包的解决方案,我决定也尝试使用其他软件包:

基本包装:

df <- df[with(df, order(id, stopSequence, stopId)), ]
merge(df[!duplicated(df$id), ],
df[!duplicated(df$id, fromLast = TRUE), ],
all = TRUE)

资料表:

df <-  setDT(df)
df[order(id, stopSequence)][, .SD[c(1,.N)], by=id]

Sqldf:

library(sqldf)
min <- sqldf("SELECT id, stopId, min(stopSequence) AS StopSequence
FROM df GROUP BY id
ORDER BY id, StopSequence, stopId")
max <- sqldf("SELECT id, stopId, max(stopSequence) AS StopSequence
FROM df GROUP BY id
ORDER BY id, StopSequence, stopId")
sqldf("SELECT * FROM min
UNION
SELECT * FROM max")

在一个问题中:

sqldf("SELECT *
FROM (SELECT id, stopId, min(stopSequence) AS StopSequence
FROM df GROUP BY id
ORDER BY id, StopSequence, stopId)
UNION
SELECT *
FROM (SELECT id, stopId, max(stopSequence) AS StopSequence
FROM df GROUP BY id
ORDER BY id, StopSequence, stopId)")

产出:

  id stopId StopSequence
1  1      a            1
2  1      c            3
3  2      b            1
4  2      c            4
5  3      a            3
6  3      b            1

使用 data.table:

# convert to data.table
setDT(df)
# order, group, filter
df[order(stopSequence)][, .SD[c(1, .N)], by = id]


id stopId stopSequence
1:  1      a            1
2:  1      c            3
3:  2      b            1
4:  2      c            4
5:  3      b            1
6:  3      a            3

另一种使用 lapplication 和 dplyr 语句的方法。我们可以对同一个语句应用任意数量的汇总函数:

lapply(c(first, last),
function(x) df %>% group_by(id) %>% summarize_all(funs(x))) %>%
bind_rows()

例如,您也可以对具有 max stop Sequence 值的行感兴趣,并执行以下操作:

lapply(c(first, last, max("stopSequence")),
function(x) df %>% group_by(id) %>% summarize_all(funs(x))) %>%
bind_rows()

另一种不同的碱基 R 替代方案是 idstopSequence的首个 order,基于 idsplit,对于每个 id,我们只选择第一个和最后一个索引,并使用这些索引对数据框进行子集。

df[sapply(with(df, split(order(id, stopSequence), id)), function(x)
c(x[1], x[length(x)])), ]




#  id stopId stopSequence
#1  1      a            1
#3  1      c            3
#5  2      b            1
#6  2      c            4
#8  3      b            1
#7  3      a            3

或类似的使用 by

df[unlist(with(df, by(order(id, stopSequence), id, function(x)
c(x[1], x[length(x)])))), ]

使用 which.minwhich.max:

library(dplyr, warn.conflicts = F)
df %>%
group_by(id) %>%
slice(c(which.min(stopSequence), which.max(stopSequence)))


#> # A tibble: 6 x 3
#> # Groups:   id [3]
#>      id stopId stopSequence
#>   <dbl> <fct>         <dbl>
#> 1     1 a                 1
#> 2     1 c                 3
#> 3     2 b                 1
#> 4     2 c                 4
#> 5     3 b                 1
#> 6     3 a                 3

基准

它也比当前接受的答案快得多,因为我们按组查找 min 和 max 值,而不是排序整个 stop 序列。

# create a 100k times longer data frame
df2 <- bind_rows(replicate(1e5, df, F))
bench::mark(
mm =df2 %>%
group_by(id) %>%
slice(c(which.min(stopSequence), which.max(stopSequence))),
jeremy = df2 %>%
group_by(id) %>%
arrange(stopSequence) %>%
filter(row_number()==1 | row_number()==n()))
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 x 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 mm           22.6ms     27ms     34.9     14.2MB     21.3
#> 2 jeremy      254.3ms    273ms      3.66    58.4MB     11.0

这种方法很有效:

df %>%
group_by(id) %>%
arrange(stopSequence) %>%
slice(1,n())


# A tibble: 6 × 3
# Groups:   id [3]
#     id stopId stopSequence
#  <dbl> <chr>         <dbl>
#1     1 a                 1
#2     1 c                 3
#3     2 b                 1
#4     2 c                 4
#5     3 b                 1
#6     3 a                 3