R, select rows according to the rank of a specific column

Question

R, select rows according to the rank of a specific column

I have an R-frame of data as shown below,

name score
marry 98
marry 77
marry 87
marry 96
mark 99
mark 44
mark 79
john 87
john 77

For each name, I want to select the rows with the highest value of 2, which should be

name score
marry 98
marry 96
mark 99
mark 79
john 87
john 77

Can anyone please help? Many thanks!

+3

r

Lene 18 oct. '14 at 17:14

source to share

4 answers

Here's a different approach data.table

using a new function setorder

(order by reference)

library(data.table) # 1.9.4+
setorder(setDT(df), -score)[, head(.SD, 2), by = name]
#     name score
# 1:  mark    99
# 2:  mark    79
# 3: marry    98
# 4: marry    96
# 5:  john    87
# 6:  john    77

+5

David Arenburg 18 oct. 14 at 17:24

source to share

You may try:

 devtools::install_github("hadley/dplyr")
 library(dplyr)


 df %>% 
      group_by(name) %>% 
      arrange(desc(score)) %>%
       slice(1:2)

 #     name score
 #1  john    87
 #2  john    77
 #3  mark    99
 #4  mark    79
 #5 marry    98
 #6 marry    96

Or using data.table

 library(data.table)
 setDT(df)[order(-score), .SD[1:2], by=name]
 #      name score
 #1:  mark    99
 #2:  mark    79
 #3: marry    98
 #4: marry    96
 #5:  john    87
 #6:  john    77

data

df <- structure(list(name = c("marry", "marry", "marry", "marry", "mark", 
   "mark", "mark", "john", "john"), score = c(98L, 77L, 87L, 96L, 
   99L, 44L, 79L, 87L, 77L)), .Names = c("name", "score"), class = "data.frame", row.names = c(NA, 
  -9L))

Benchmarks

Methods included dplyr

and base R

by @Richard Scriven. Two datasets are created: 1) Same as generated by @Ananda Mahto's 2nd dataset, 2) 50 times larger.

Data

set.seed(1) #similar dataset as created by @Ananda Mahto
dfAM <- data.frame(
            name = sample(10000, 1000000, TRUE),
            score = sample(0:100, 1000000, TRUE)
            )


set.seed(1)
df2 <- data.frame(
             name = sample(10000, 50*1000000, TRUE),
             score = sample(0:100, 50*1000000, TRUE)
            )

Functions

  aMahto <- function(mydf) {mydf[with(mydf, 
             ave(-score, name, FUN = order)) %in% c(1, 2), ]
           }

  akrun1 <- function(mydf) {setDT(mydf)[order(-score), .SD[1:2], by=name] }
  akrun2 <- function(mydf) {setDT(mydf)[order(-score), head(.SD,2), by=name] }
  dArenburg <- function(mydf){ setorder(setDT(mydf), -score)[,
                                            head(.SD,2), by=name]}
  akrun3 <- function(mydf) { mydf %>% group_by(name) %>% 
                               arrange(desc(score)) %>% slice(1:2) }


  rScriven1 <- function(mydf) {sapply(split(mydf$score, mydf$name),
                                       function(x) tail(sort(x), 2))}
  rScriven2 <- function(mydf) {stack(lapply(split(mydf$score, mydf$name),
                                        function(x) tail(sort(x), 2)))}

Benchmarks

Run tests for each dataset separately.

  library(microbenchmark)
  microbenchmark(aMahto(dfAM), akrun1(dfAM), akrun2(dfAM), akrun3(dfAM),
                dArenburg(dfAM), rScriven1(dfAM), rScriven2(dfAM), times=20L)
   Unit: milliseconds
         expr        min         lq       mean     median         uq
   aMahto(dfAM)  278.11839  283.82547  293.16843  285.45645  298.77528
   akrun1(dfAM) 2900.86528 2923.94839 2953.00178 2942.73172 2965.85868
   akrun2(dfAM)  189.93849  195.05222  202.30496  196.51019  207.73886
   akrun3(dfAM)   56.75191   57.14967   58.04335   57.32627   57.63047
dArenburg(dfAM)  161.87583  166.64286  171.67832  168.24355  170.14656
rScriven1(dfAM)  694.22503  701.85554  717.92201  713.15419  727.84196
rScriven2(dfAM)  712.84676  728.75839  744.07167  738.74824  759.88312
        max neval
  335.04468    20
 3113.54895    20
  250.09765    20
   66.67198    20
  228.44948    20
  777.78973    20
  797.85850    20

In a larger dataset, @David Arenburg's method is Winner

.

    microbenchmark(aMahto(df2), akrun1(df2), akrun2(df2), akrun3(df2), 
                 dArenburg(df2), rScriven1(df2), rScriven2(df2), times=40L)
    Unit: seconds
            expr       min        lq      mean    median        uq       max
     aMahto(df2) 11.830111 12.027325 12.273881 12.213140 12.533628 13.196659
     akrun1(df2)  6.672874  6.890442  7.018749  6.956716  7.128060  7.542047
     akrun2(df2)  3.794502  3.829567  3.860565  3.847690  3.869065  4.143381
     akrun3(df2)  3.687974  3.725867  3.801861  3.743973  3.933935  4.102295
  dArenburg(df2)  1.531356  1.598570  1.647648  1.618573  1.640258  2.716042
  rScriven1(df2)  6.370144  6.573998  6.685313  6.616246  6.820830  7.118827
  rScriven2(df2)  6.551911  6.628134  6.743644  6.724310  6.867090  7.091750
     neval
      40
      40
      40
      40
      40
      40
      40

+4

akrun 18 oct. 14 at 17:18

source to share

This is a different conclusion, but how about this so that the names don't repeat themselves.

sapply(split(df$score, df$name), function(x) tail(sort(x), 2))
#      john mark marry
# [1,]   77   79    96
# [2,]   87   99    98

As suggested by Ananda Mahto, you can also use stack

withlapply

stack(lapply(split(df$score, df$name), function(x) tail(sort(x), 2)))
#   values   ind
# 1     77  john
# 2     87  john
# 3     79  mark
# 4     99  mark
# 5     96 marry
# 6     98 marry

+3

Rich scriven 18 oct. 14 at 17:19

source to share

A5C1D2H2I1M1N2O1R2T1 · Accepted Answer · 2014-10-18T17:23:59+0000

A basic approach is possible here:

mydf[with(mydf, ave(-score, name, FUN = order)) %in% c(1, 2), ]
#    name score
# 1 marry    98
# 4 marry    96
# 5  mark    99
# 7  mark    79
# 8  john    87
# 9  john    77

For the curious, in terms of timings - here's a little test ...

Two sample datasets, both 1M rows and two columns, but one with 1000 possible values for "name" and another with 10000 possible values.

set.seed(1)
df1 <- data.frame(
  name = sample(1000, 1000000, TRUE),
  score = sample(0:100, 1000000, TRUE)
)
df2 <- data.frame(
  name = sample(10000, 1000000, TRUE),
  score = sample(0:100, 1000000, TRUE)
)

Functions for comparison - I'll try to add "dplyr" later after reinstalling it.

fun1 <- function(mydf) {
  mydf[with(mydf, ave(-score, name, FUN = order)) %in% c(1, 2), ]
}

fun2 <- function(mydf) {
  as.data.table(mydf)[order(-score), .SD[1:2], by=name]
}

fun3 <- function(mydf) {
  df <- as.data.table(mydf)
  setorder(df, -score)[, head(.SD, 2), by = name]
}

Comparative analysis.

library(microbenchmark)
microbenchmark(fun1(df1), fun2(df1), fun3(df1), 
               fun1(df2), fun2(df2), fun3(df2), times = 20)
# Unit: milliseconds
#       expr        min         lq       mean     median         uq       max neval
#  fun1(df1)  502.76809  513.98317  569.47883  597.90488  603.34458  686.4302    20
#  fun2(df1)  733.12544  741.18777  796.67106  822.60824  828.88449  839.3837    20
#  fun3(df1)   87.80581   93.07012   95.34281   95.56374   97.49608  101.7991    20
#  fun1(df2)  672.60241  764.10237  764.60365  772.33959  780.14679  799.3505    20
#  fun2(df2) 6338.14881 6360.42621 6407.66675 6412.99278 6451.75626 6479.2681    20
#  fun3(df2)  354.24119  366.47396  382.58666  369.78597  374.01897  468.9197    20

R, select rows according to the rank of a specific column

data

Benchmarks

More articles: