将多个列粘贴在一起

我在一个数据框中有一些列,我想把它们粘贴在一起(用“-”分隔) ,如下所示:

data <- data.frame('a' = 1:3,
'b' = c('a','b','c'),
'c' = c('d', 'e', 'f'),
'd' = c('g', 'h', 'i'))
i.e.
a   b   c  d
1   a   d   g
2   b   e   h
3   c   f   i

我想成为:

a x
1 a-d-g
2 b-e-h
3 c-f-i

我通常可以这样做:

within(data, x <- paste(b,c,d,sep='-'))

然后删除旧的列,但遗憾的是,我不知道具体的列名,只知道所有列的集合名称,例如,我会知道 cols <- c('b','c','d')

有人知道怎么做吗?

147510 次浏览
# your starting data..
data <- data.frame('a' = 1:3, 'b' = c('a','b','c'), 'c' = c('d', 'e', 'f'), 'd' = c('g', 'h', 'i'))


# columns to paste together
cols <- c( 'b' , 'c' , 'd' )


# create a new column `x` with the three columns collapsed together
data$x <- apply( data[ , cols ] , 1 , paste , collapse = "-" )


# remove the unnecessary columns
data <- data[ , !( names( data ) %in% cols ) ]
library(plyr)


ldply(apply(data, 1, function(x) data.frame(
x = paste(x[2:4],sep="",collapse="-"))))


#      x
#1 a-d-g
#2 b-e-h
#3 c-f-i


#  and with just the vector of names you have:


ldply(apply(data, 1, function(x) data.frame(
x = paste(x[c('b','c','d')],sep="",collapse="-"))))


# or equally:
mynames <-c('b','c','d')
ldply(apply(data, 1, function(x) data.frame(
x = paste(x[mynames],sep="",collapse="-"))))

我会建立一个新的数据框架:

d <- data.frame('a' = 1:3, 'b' = c('a','b','c'), 'c' = c('d', 'e', 'f'), 'd' = c('g', 'h', 'i'))


cols <- c( 'b' , 'c' , 'd' )


data.frame(a = d[, 'a'], x = do.call(paste, c(d[ , cols], list(sep = '-'))))

作为 巴蒂斯特的回答上的一个变体,在 cols中定义了 data和要放在一起的列

cols <- c("b", "c", "d")

您可以将新列添加到 data并删除旧列

data$x <- do.call(paste, c(data[cols], sep="-"))
for (co in cols) data[co] <- NULL

所以

> data
a     x
1 1 a-d-g
2 2 b-e-h
3 3 c-f-i

使用 tidyr包,这可以很容易地处理在一个函数调用。

data <- data.frame('a' = 1:3,
'b' = c('a','b','c'),
'c' = c('d', 'e', 'f'),
'd' = c('g', 'h', 'i'))


tidyr::unite_(data, paste(colnames(data)[-1], collapse="_"), colnames(data)[-1])


a b_c_d
1 1 a_d_g
2 2 b_e_h
3 3 c_f_i

Edit: Exclude first column, everything else gets pasted.

# tidyr_0.6.3


unite(data, newCol, -a)
# or by column index unite(data, newCol, -1)


#   a newCol
# 1 1  a_d_g
# 2 2  b_e_h
# 3 3  c_f_i

只是增加额外的解决方案与 Reduce,这可能是慢于 do.call,但可能优于 apply,因为它将避免 matrix转换。另外,我们可以使用 setdiff代替 for循环来删除不需要的列

cols <- c('b','c','d')
data$x <- Reduce(function(...) paste(..., sep = "-"), data[cols])
data[setdiff(names(data), cols)]
#   a     x
# 1 1 a-d-g
# 2 2 b-e-h
# 3 3 c-f-i

Alternatively we could update data in place using the data.table package (assuming fresh data)

library(data.table)
setDT(data)[, x := Reduce(function(...) paste(..., sep = "-"), .SD[, mget(cols)])]
data[, (cols) := NULL]
data
#    a     x
# 1: 1 a-d-g
# 2: 2 b-e-h
# 3: 3 c-f-i

另一种选择是使用 .SDcols而不是 mget

setDT(data)[, x := Reduce(function(...) paste(..., sep = "-"), .SD), .SDcols = cols]

I benchmarked the answers of Anthony Damico, Brian Diggs and data_steve on a small sample tbl_df and got the following results.

> data <- data.frame('a' = 1:3,
+                    'b' = c('a','b','c'),
+                    'c' = c('d', 'e', 'f'),
+                    'd' = c('g', 'h', 'i'))
> data <- tbl_df(data)
> cols <- c("b", "c", "d")
> microbenchmark(
+     do.call(paste, c(data[cols], sep="-")),
+     apply( data[ , cols ] , 1 , paste , collapse = "-" ),
+     tidyr::unite_(data, "x", cols, sep="-")$x,
+     times=1000
+ )
Unit: microseconds
expr     min      lq      mean  median       uq       max neval
do.call(paste, c(data[cols], sep = "-"))       65.248  78.380  93.90888  86.177  99.3090   436.220  1000
apply(data[, cols], 1, paste, collapse = "-") 223.239 263.044 313.11977 289.514 338.5520   743.583  1000
tidyr::unite_(data, "x", cols, sep = "-")$x   376.716 448.120 556.65424 501.877 606.9315 11537.846  1000

However, when I evaluated on my own tbl_df with ~1 million rows and 10 columns the results were quite different.

> microbenchmark(
+     do.call(paste, c(data[c("a", "b")], sep="-")),
+     apply( data[ , c("a", "b") ] , 1 , paste , collapse = "-" ),
+     tidyr::unite_(data, "c", c("a", "b"), sep="-")$c,
+     times=25
+ )
Unit: milliseconds
expr        min         lq      mean     median        uq       max neval
do.call(paste, c(data[c("a", "b")], sep="-"))                 930.7208   951.3048  1129.334   997.2744  1066.084  2169.147    25
apply( data[ , c("a", "b") ] , 1 , paste , collapse = "-" )  9368.2800 10948.0124 11678.393 11136.3756 11878.308 17587.617    25
tidyr::unite_(data, "c", c("a", "b"), sep="-")$c              968.5861  1008.4716  1095.886  1035.8348  1082.726  1759.349    25

在我看来,sprintf函数也应该在这些答案中占有一席之地。你可以使用 sprintf如下:

do.call(sprintf, c(d[cols], '%s-%s-%s'))

它给出了:

 [1] "a-d-g" "b-e-h" "c-f-i"

并创建所需的数据框架:

data.frame(a = d$a, x = do.call(sprintf, c(d[cols], '%s-%s-%s')))

给予:

  a     x
1 1 a-d-g
2 2 b-e-h
3 3 c-f-i

虽然 sprintf没有明显优于@BrianDiggs 的 do.call/paste组合,但是当你想要填充所需字符串的某些部分或者指定数字的数量时,它尤其有用。有关几个选项,请参见 ?sprintf

另一种变体是使用来自 pmap:

pmap(d[2:4], paste, sep = '-')

注意: 这个 pmap解决方案只有在列不是因子的情况下才有效。


更大数据集的基准:

# create a larger dataset
d2 <- d[sample(1:3,1e6,TRUE),]
# benchmark
library(microbenchmark)
microbenchmark(
docp = do.call(paste, c(d2[cols], sep="-")),
appl = apply( d2[, cols ] , 1 , paste , collapse = "-" ),
tidr = tidyr::unite_(d2, "x", cols, sep="-")$x,
docs = do.call(sprintf, c(d2[cols], '%s-%s-%s')),
times=10)

results in:

Unit: milliseconds
expr       min        lq      mean    median        uq       max neval cld
docp  214.1786  226.2835  297.1487  241.6150  409.2495  493.5036    10 a
appl 3832.3252 4048.9320 4131.6906 4072.4235 4255.1347 4486.9787    10   c
tidr  206.9326  216.8619  275.4556  252.1381  318.4249  407.9816    10 a
docs  413.9073  443.1550  490.6520  453.1635  530.1318  659.8400    10  b

Used data:

d <- data.frame(a = 1:3, b = c('a','b','c'), c = c('d','e','f'), d = c('g','h','i'))

这里有一个相当非常规(但很快)的方法: 使用 data.table中的 fwrite将列“粘贴”在一起,使用 fread将其读回。为了方便起见,我将这些步骤编写为一个名为 fpaste的函数:

fpaste <- function(dt, sep = ",") {
x <- tempfile()
fwrite(dt, file = x, sep = sep, col.names = FALSE)
fread(x, sep = "\n", header = FALSE)
}

Here's an example:

d <- data.frame(a = 1:3, b = c('a','b','c'), c = c('d','e','f'), d = c('g','h','i'))
cols = c("b", "c", "d")


fpaste(d[cols], "-")
#       V1
# 1: a-d-g
# 2: b-e-h
# 3: c-f-i

How does it perform?

d2 <- d[sample(1:3,1e6,TRUE),]
  

library(microbenchmark)
microbenchmark(
docp = do.call(paste, c(d2[cols], sep="-")),
tidr = tidyr::unite_(d2, "x", cols, sep="-")$x,
docs = do.call(sprintf, c(d2[cols], '%s-%s-%s')),
appl = apply( d2[, cols ] , 1 , paste , collapse = "-" ),
fpaste = fpaste(d2[cols], "-")$V1,
dt2 = as.data.table(d2)[, x := Reduce(function(...) paste(..., sep = "-"), .SD), .SDcols = cols][],
times=10)
# Unit: milliseconds
#    expr        min         lq      mean     median         uq       max neval
#    docp  215.34536  217.22102  220.3603  221.44104  223.27224  225.0906    10
#    tidr  215.19907  215.81210  220.7131  220.09636  225.32717  229.6822    10
#    docs  281.16679  285.49786  289.4514  286.68738  290.17249  312.5484    10
#    appl 2816.61899 3106.19944 3259.3924 3266.45186 3401.80291 3804.7263    10
#  fpaste   88.57108   89.67795  101.1524   90.59217   91.76415  197.1555    10
#     dt2  301.95508  310.79082  384.8247  316.29807  383.94993  874.4472    10

我知道这是一个老问题,但是我认为我应该像提问者建议的那样使用粘贴()函数来呈现简单的解决方案:

data_1<-data.frame(a=data$a,"x"=paste(data$b,data$c,data$d,sep="-"))
data_1
a     x
1 1 a-d-g
2 2 b-e-h
3 3 c-f-i

使用 {tidyr} v1.2.0中的 unite编写简单明了的代码

{tidyr v1.2.0}解决方案

library(tidyr)


data %>% unite("x", b:d, remove = T, sep = "-")
  • "x" is the name of the new column.
  • b:d是使用 <tidy-select>合并哪些列的选择
  • 我们删除输入列
  • 我们定义值之间的分隔符
  • 如果有 NA,我们也可以添加 na.rm = TRUE

输出

#   a     x
# 1 1 a-d-g
# 2 2 b-e-h
# 3 3 c-f-i

输入资料

data <- data.frame('a' = 1:3,
'b' = c('a','b','c'),
'c' = c('d', 'e', 'f'),
'd' = c('g', 'h', 'i'))
data


#   a b c d
# 1 1 a d g
# 2 2 b e h
# 3 3 c f i

这个解决方案不同于已经发布的。