如何通过对数据帧中的列进行排序来快速形成组(四分位数、十分位数等)

小开

There is possibly a quicker way, but I would do:

a <- rnorm(100) # Our data
q <- quantile(a) # You can supply your own breaks, see ?quantile


# Define a simple function that checks in which quantile a number falls
getQuant <- function(x)
{
for (i in 1:(length(q)-1))
{
if (x>=q[i] && x<q[i+1])
break;
}
i
}


# Apply the function to the data
res <- unlist(lapply(as.matrix(a), getQuant))

小开

最佳答案

The method I use is one of these or Hmisc::cut2(value, g=4):

temp$quartile <- with(temp, cut(value,
breaks=quantile(value, probs=seq(0,1, by=0.25), na.rm=TRUE),
include.lowest=TRUE))

An alternate might be:

temp$quartile <- with(temp, factor(
findInterval( val, c(-Inf,
quantile(val, probs=c(0.25, .5, .75)), Inf) , na.rm=TRUE),
labels=c("Q1","Q2","Q3","Q4")
))

The first one has the side-effect of labeling the quartiles with the values, which I consider a "good thing", but if it were not "good for you", or the valid problems raised in the comments were a concern you could go with version 2. You can use labels= in cut, or you could add this line to your code:

temp$quartile <- factor(temp$quartile, levels=c("1","2","3","4") )

Or even quicker but slightly more obscure in how it works, although it is no longer a factor, but rather a numeric vector:

temp$quartile <- as.numeric(temp$quartile)

小开

You can use the quantile() function, but you need to handle rounding/precision when using cut(). So

set.seed(123)
temp <- data.frame(name=letters[1:12], value=rnorm(12), quartile=rep(NA, 12))
brks <- with(temp, quantile(value, probs = c(0, 0.25, 0.5, 0.75, 1)))
temp <- within(temp, quartile <- cut(value, breaks = brks, labels = 1:4,
include.lowest = TRUE))

Giving:

> head(temp)
name       value quartile
1    a -0.56047565        1
2    b -0.23017749        2
3    c  1.55870831        4
4    d  0.07050839        2
5    e  0.12928774        3
6    f  1.71506499        4

小开

temp$quartile <- ceiling(sapply(temp$value,function(x) sum(x-temp$value>=0))/(length(temp$value)/4))

小开

Sorry for being a bit late to the party. I wanted to add my one liner using cut2 as I didn't know max/min for my data and wanted the groups to be identically large. I read about cut2 in an issue which was marked as duplicate (link below).

library(Hmisc)   #For cut2
set.seed(123)    #To keep answers below identical to my random run


temp <- data.frame(name=letters[1:12], value=rnorm(12), quartile=rep(NA, 12))


temp$quartile <- as.numeric(cut2(temp$value, g=4))   #as.numeric to number the factors
temp$quartileBounds <- cut2(temp$value, g=4)


temp

Result:

> temp
name       value quartile  quartileBounds
1     a -0.56047565        1 [-1.265,-0.446)
2     b -0.23017749        2 [-0.446, 0.129)
3     c  1.55870831        4 [ 1.224, 1.715]
4     d  0.07050839        2 [-0.446, 0.129)
5     e  0.12928774        3 [ 0.129, 1.224)
6     f  1.71506499        4 [ 1.224, 1.715]
7     g  0.46091621        3 [ 0.129, 1.224)
8     h -1.26506123        1 [-1.265,-0.446)
9     i -0.68685285        1 [-1.265,-0.446)
10    j -0.44566197        2 [-0.446, 0.129)
11    k  1.22408180        4 [ 1.224, 1.715]
12    l  0.35981383        3 [ 0.129, 1.224)

小开

There's a handy ntile function in package dplyr. It's flexible in the sense that you can very easily define the number of *tiles or "bins" you want to create.

Load the package (install first if you haven't) and add the quartile column:

library(dplyr)
temp$quartile <- ntile(temp$value, 4)

Or, if you want to use dplyr syntax:

temp <- temp %>% mutate(quartile = ntile(value, 4))

Result in both cases is:

temp
#   name       value quartile
#1     a -0.56047565        1
#2     b -0.23017749        2
#3     c  1.55870831        4
#4     d  0.07050839        2
#5     e  0.12928774        3
#6     f  1.71506499        4
#7     g  0.46091621        3
#8     h -1.26506123        1
#9     i -0.68685285        1
#10    j -0.44566197        2
#11    k  1.22408180        4
#12    l  0.35981383        3

data:

Note that you don't need to create the "quartile" column in advance and use set.seed to make the randomization reproducible:

set.seed(123)
temp <- data.frame(name=letters[1:12], value=rnorm(12))

小开

I'll add the data.table version for anyone else Googling it (i.e., @BondedDust's solution translated to data.table and pared down a tad):

library(data.table)
setDT(temp)
temp[ , quartile := cut(value,
breaks = quantile(value, probs = 0:4/4),
labels = 1:4, right = FALSE)]

Which is much better (cleaner, faster) than what I had been doing:

temp[ , quartile :=
as.factor(ifelse(value < quantile(value, .25), 1,
ifelse(value < quantile(value, .5), 2,
ifelse(value < quantile(value, .75), 3, 4))]

Note, however, that this approach requires the quantiles to be distinct, e.g. it will fail on rep(0:1, c(100, 1)); what to do in this case is open ended so I leave it up to you.

小开

Adapting dplyr::ntile to take advantage of data.table optimizations provides a faster solution.

library(data.table)
setDT(temp)
temp[order(value) , quartile := floor( 1 + 4 * (.I-1) / .N)]

Probably doesn't qualify as cleaner, but it's faster and one-line.

Timing on bigger data set

Comparing this solution to ntile and cut for data.table as proposed by @docendo_discimus and @MichaelChirico.

library(microbenchmark)
library(dplyr)


set.seed(123)


n <- 1e6
temp <- data.frame(name=sample(letters, size=n, replace=TRUE), value=rnorm(n))
setDT(temp)


microbenchmark(
"ntile" = temp[, quartile_ntile := ntile(value, 4)],
"cut" = temp[, quartile_cut := cut(value,
breaks = quantile(value, probs = seq(0, 1, by=1/4)),
labels = 1:4, right=FALSE)],
"dt_ntile" = temp[order(value), quartile_ntile_dt := floor( 1 + 4 * (.I-1)/.N)]
)

Gives:

Unit: milliseconds
expr      min       lq     mean   median       uq      max neval
ntile 608.1126 647.4994 670.3160 686.5103 691.4846 712.4267   100
cut 369.5391 373.3457 375.0913 374.3107 376.5512 385.8142   100
dt_ntile 117.5736 119.5802 124.5397 120.5043 124.5902 145.7894   100

小开

I would like to propose a version, which seems to be more robust, since I ran into a lot of problems using quantile() in the breaks option cut() on my dataset. I am using the ntile function of plyr, but it also works with ecdf as input.

temp[, `:=`(quartile = .bincode(x = ntile(value, 100), breaks = seq(0,100,25), right = TRUE, include.lowest = TRUE)
decile = .bincode(x = ntile(value, 100), breaks = seq(0,100,10), right = TRUE, include.lowest = TRUE)
)]


temp[, `:=`(quartile = .bincode(x = ecdf(value)(value), breaks = seq(0,1,0.25), right = TRUE, include.lowest = TRUE)
decile = .bincode(x = ecdf(value)(value), breaks = seq(0,1,0.1), right = TRUE, include.lowest = TRUE)
)]

Is that correct?

小开

Try this function

getQuantileGroupNum <- function(vec, group_num, decreasing=FALSE) {
if(decreasing) {
abs(cut(vec, quantile(vec, probs=seq(0, 1, 1 / group_num), type=8, na.rm=TRUE), labels=FALSE, include.lowest=T) - group_num - 1)
} else {
cut(vec, quantile(vec, probs=seq(0, 1, 1 / group_num), type=8, na.rm=TRUE), labels=FALSE, include.lowest=T)
}
}

> t1 <- runif(7)
> t1
[1] 0.4336094 0.2842928 0.5578876 0.2678694 0.6495285 0.3706474 0.5976223
> getQuantileGroupNum(t1, 4)
[1] 2 1 3 1 4 2 4
> getQuantileGroupNum(t1, 4, decreasing=T)
[1] 3 4 2 4 1 3 1

小开

Take care with ntile() if your original values are clustered at some values. To create equally sized groups, it will allocate rows with the same original value into different groups. This may not be desirable.

I had a case where scores of individuals were clustered at certain values and it was important that individuals with the same original score were placed in the same group (e.g. allocating students to groups based on test score). ntile() allocated individuals with the same score to different groups (unfair in this case), but cut() with quantile() does not (but groups are only approximately equal in size).

library(dplyr)
library(reshape2)
library(ggplot2)




# awkward data: cannot be fairly and equally divided into quartiles or quintiles
# (similar results are obtained from more realistic cases of clustered values)
example <- data.frame(id = 1:49, x = c(rep(1:7, each=7))) %>%
mutate(ntileQuartile = ntile(x, 4),
cutQuartile = cut(x, breaks=quantile(x, seq(0, 1, by=1/4)),
include.lowest=T, label=1:4),
ntileQuintile = ntile(x, 5),
cutQuintile = cut(x, breaks=quantile(x, seq(0, 1, by=1/5)),
include.lowest=T, label=1:5))




# graph: x axis is original score, colour is group allocation
# ntile creates equal groups, but some values of original score are split
# into separate groups.  cut creates different sized groups, but score
# exactly determines the group.
melt(example, id.vars=c("id", "x"),
variable.name = "method", value.name="groupNumber") %>%
ggplot(aes(x, fill=groupNumber)) +
geom_histogram(colour="black", bins=13) +
facet_wrap(vars(method))