R 中基于交替字符的拆分字符串

我正在尝试找到一种有效的方法来分解字符串

"111110000011110000111000"

成为一个矢量

[1] "11111" "00000" "1111" "0000" "111" "000"

其中“0”和“1”可以是任何交替字符。

4043 次浏览

试试看

strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"

更新

stri_extract_all_regex修改@rawr 的解

library(stringi)
stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"




stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"
#[10] "000"


stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]]
#[1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"
#[8] "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"
#[15] "D"       "aa"      "BB"

基准

library(stringi)
set.seed(24)
x3 <- stri_rand_strings(1, 1e4)


akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]]
#modified @thelatemail's function to make it bit more general
thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3,
perl=TRUE))[[1]]
rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
ananda <- function() unlist(read.fwf(textConnection(x3),
rle(strsplit(x3, "")[[1]])$lengths,
colClasses = "character"))
Colonel <- function() with(rle(strsplit(x3,'')[[1]]),
mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))


Cryo <- function(){
res_vector=rep(NA_character_,nchar(x3))
res_vector[1]=substr(x3,1,1)
counter=1
old_tmp=''


for (i in 2:nchar(x3)) {
tmp=substr(x3,i,i)
if (tmp==old_tmp) {
res_vector[counter]=paste0(res_vector[counter],tmp)
} else {
res_vector[counter+1]=tmp
counter=counter+1
}
old_tmp=tmp
}


res_vector[!is.na(res_vector)]
}




richard <- function(){
cs <- cumsum(
rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
)
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}


nicola<-function(x) {
indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x))
substring(x,indices[-length(indices)]+1,indices[-1])
}


richard2 <- function() {
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}


system.time(akrun())
# user  system elapsed
# 0.003   0.000   0.003


system.time(thelate())
#   user  system elapsed
#  0.272   0.001   0.274


system.time(rawr())
# user  system elapsed
#  0.397   0.001   0.398


system.time(ananda())
#  user  system elapsed
# 3.744   0.204   3.949


system.time(Colonel())
#   user  system elapsed
#  0.154   0.001   0.154


system.time(Cryo())
#  user  system elapsed
# 0.220   0.005   0.226


system.time(richard())
#  user  system elapsed
# 0.007   0.000   0.006


system.time(nicola(x3))
# user  system elapsed
# 0.190   0.001   0.191

在一根稍微大一点的绳子上,

set.seed(24)
x3 <- stri_rand_strings(1, 1e6)


system.time(akrun())
#user  system elapsed
#0.166   0.000   0.155
system.time(richard())
#  user  system elapsed
# 0.606   0.000   0.569
system.time(richard2())
#  user  system elapsed
# 0.518   0.000   0.487


system.time(Colonel())
#  user  system elapsed
# 9.631   0.000   9.358




library(microbenchmark)
microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')
#Unit: relative
#     expr      min       lq     mean   median       uq      max neval cld
# richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581    20   b
#richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861    20   b
# akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20  a

注意: 尝试运行其他方法,但需要很长时间

资料

str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"

您可以使用 substrread.fwf以及 rle(尽管它不太可能像任何基于正则表达式的解决方案那样高效) :

x <- "111110000011110000111000"
unlist(read.fwf(textConnection(x),
rle(strsplit(x, "")[[1]])$lengths,
colClasses = "character"))
#      V1      V2      V3      V4      V5      V6
# "11111" "00000"  "1111"  "0000"   "111"   "000"

这种方法的一个优点是,它甚至适用于下列情况:

x <- paste(c(rep("a", 5), rep("b", 2), rep("c", 7),
rep("b", 3), rep("a", 1), rep("d", 1)), collapse = "")
x
# [1] "aaaaabbcccccccbbbad"


unlist(read.fwf(textConnection(x),
rle(strsplit(x, "")[[1]])$lengths,
colClasses = "character"))
#        V1        V2        V3        V4        V5        V6
#   "aaaaa"      "bb" "ccccccc"     "bbb"       "a"       "d"

另一种方法是在交替数字之间添加空格。这对任何两个人都适用,而不仅仅是1和0。然后在空白处使用 strsplit:

x <- "111110000011110000111000"


(y <- gsub('(\\d)(?!\\1)', '\\1 \\2', x, perl = TRUE))
# [1] "11111 00000 1111 0000 111 000 "




strsplit(y, ' ')[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"

或者像@akrun 指出的那样更简洁:

strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"

也改变 \\d\\w的工程也

x  <- "aaaaabbcccccccbbbad"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"


x <- "111110000011110000111000"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"

您也可以使用 \K(而不是显式地使用捕获组 \\1\\2) ,我认为它们使用得不多,也不知道如何解释它: }

AFAIK \\K重置报告匹配的起始点,任何以前使用的字符不再包括在内,基本上抛弃了所有匹配到该点。

x <- "1111100000222000333300011110000111000"
(z <- gsub('(\\d)\\K(?!\\1)', ' ', x, perl = TRUE))
# [1] "11111 00000 222 000 3333 000 1111 0000 111 000 "

一个主题的变奏:

x <- "111110000011110000111000"
regmatches(x,gregexpr("1+|0+",x))[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"

使用 mapply的另一种情况下的方法:

x="111110000011110000111000"


with(rle(strsplit(x,'')[[1]]),
mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"

简单的 for循环解决方案

x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=substr(x,1,1)


for (i in 2:nchar(x)) {
tmp=substr(x,i,i)
if (tmp==substr(x,i-1,i-1)) {
res_vector[length(res_vector)]=paste0(res_vector[length(res_vector)],tmp)
} else {
res_vector[length(res_vector)+1]=tmp
}
}


res_vector


#[1] "aaaaa"  "bb"  "ccccccc"  "bbb"  "a"  "d"  "11111"  "00000"  "222"  "aaa"  "bb"  "cc"  "d"  "11"  "D"  "aa"  "BB"

或者使用预先分配的结果向量可能会快一点

x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=rep(NA_character_,nchar(x))
res_vector[1]=substr(x,1,1)
counter=1
old_tmp=''


for (i in 2:nchar(x)) {
tmp=substr(x,i,i)
if (tmp==old_tmp) {
res_vector[counter]=paste0(res_vector[counter],tmp)
} else {
res_vector[counter+1]=tmp
counter=counter+1
}
old_tmp=tmp
}


res_vector[!is.na(res_vector)]

原始方法: 这是一个结合了 rle()Stringi方法。

x <- "111110000011110000111000"
library(stringi)


cs <- cumsum(
rle(stri_split_boundaries(x, type = "character")[[1L]])$lengths
)
stri_sub(x, c(1L, head(cs + 1L, -1L)), cs)
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"

或者,可以使用 stri_sub()中的 length参数

rl <- rle(stri_split_boundaries(x, type = "character")[[1L]])
with(rl, {
stri_sub(x, c(1L, head(cumsum(lengths) + 1L, -1L)), length = lengths)
})
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"

更新为效率: 在意识到 base::strsplit()stringi::stri_split_boundaries()快之后,这里是我之前的答案的一个更有效的版本,只使用基本函数。

set.seed(24)
x3 <- stri_rand_strings(1L, 1e6L)


system.time({
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
substring(x3, c(1L, head(cs + 1L, -1L)), cs)
})
#   user  system elapsed
#  0.686   0.012   0.697

这不是 OP 真正想要的(简洁的 R 代码) ,但是我想在 Rcpp中尝试一下,结果相对简单,比最快的基于 R 的答案快5倍左右。

library(Rcpp)


cppFunction(
'std::vector<std::string> split_str_cpp(std::string x) {


std::vector<std::string> parts;


int start = 0;


for(int i = 1; i <= x.length(); i++) {
if(x[i] != x[i-1]) {
parts.push_back(x.substr(start, i-start));
start = i;
}
}


return parts;


}')

用这些做实验

str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"

提供以下输出

> split_str_cpp(str1)
[1] "11111" "00000" "1111"  "0000"  "111"   "000"
> split_str_cpp(x1)
[1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"   "000"
> split_str_cpp(x2)
[1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"   "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"
[15] "D"       "aa"      "BB"

一个基准测试表明它比 R 解决方案快5-10倍。

akrun <- function(str1) strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]


richard1 <- function(x3){
cs <- cumsum(
rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
)
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}


richard2 <- function(x3) {
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}


library(microbenchmark)
library(stringi)


set.seed(24)
x3 <- stri_rand_strings(1, 1e6)


microbenchmark(split_str_cpp(x3), akrun(x3), richard1(x3), richard2(x3), unit = 'relative', times=20L)

比较:

Unit: relative
expr      min       lq     mean   median       uq      max neval
split_str_cpp(x3) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20
akrun(x3) 9.675613 8.952997 8.241750 8.689001 8.403634 4.423134    20
richard1(x3) 5.355620 5.226103 5.483171 5.947053 5.982943 3.379446    20
richard2(x3) 4.842398 4.756086 5.046077 5.389570 5.389193 3.669680    20

这样吧:

s <- "111110000011110000111000"


spl <- strsplit(s,"10|01")[[1]]
l <- length(spl)
sapply(1:l, function(i) paste0(spl[i],i%%2,ifelse(i==1 | i==l, "",i%%2)))


# [1] "11111" "00000" "1111"  "0000"  "111"   "000"