R, select rows according to the rank of a specific column

I have an R-frame of data as shown below,

name score
marry 98
marry 77
marry 87
marry 96
mark 99
mark 44
mark 79
john 87
john 77

      

For each name, I want to select the rows with the highest value of 2, which should be

name score
marry 98
marry 96
mark 99
mark 79
john 87
john 77

      

Can anyone please help? Many thanks!

+3


source to share


4 answers


A basic approach is possible here:

mydf[with(mydf, ave(-score, name, FUN = order)) %in% c(1, 2), ]
#    name score
# 1 marry    98
# 4 marry    96
# 5  mark    99
# 7  mark    79
# 8  john    87
# 9  john    77

      


For the curious, in terms of timings - here's a little test ...

Two sample datasets, both 1M rows and two columns, but one with 1000 possible values ​​for "name" and another with 10000 possible values.



set.seed(1)
df1 <- data.frame(
  name = sample(1000, 1000000, TRUE),
  score = sample(0:100, 1000000, TRUE)
)
df2 <- data.frame(
  name = sample(10000, 1000000, TRUE),
  score = sample(0:100, 1000000, TRUE)
)

      

Functions for comparison - I'll try to add "dplyr" later after reinstalling it.

fun1 <- function(mydf) {
  mydf[with(mydf, ave(-score, name, FUN = order)) %in% c(1, 2), ]
}

fun2 <- function(mydf) {
  as.data.table(mydf)[order(-score), .SD[1:2], by=name]
}

fun3 <- function(mydf) {
  df <- as.data.table(mydf)
  setorder(df, -score)[, head(.SD, 2), by = name]
}

      

Comparative analysis.

library(microbenchmark)
microbenchmark(fun1(df1), fun2(df1), fun3(df1), 
               fun1(df2), fun2(df2), fun3(df2), times = 20)
# Unit: milliseconds
#       expr        min         lq       mean     median         uq       max neval
#  fun1(df1)  502.76809  513.98317  569.47883  597.90488  603.34458  686.4302    20
#  fun2(df1)  733.12544  741.18777  796.67106  822.60824  828.88449  839.3837    20
#  fun3(df1)   87.80581   93.07012   95.34281   95.56374   97.49608  101.7991    20
#  fun1(df2)  672.60241  764.10237  764.60365  772.33959  780.14679  799.3505    20
#  fun2(df2) 6338.14881 6360.42621 6407.66675 6412.99278 6451.75626 6479.2681    20
#  fun3(df2)  354.24119  366.47396  382.58666  369.78597  374.01897  468.9197    20

      

+5


source


Here's a different approach data.table

using a new function setorder

(order by reference)



library(data.table) # 1.9.4+
setorder(setDT(df), -score)[, head(.SD, 2), by = name]
#     name score
# 1:  mark    99
# 2:  mark    79
# 3: marry    98
# 4: marry    96
# 5:  john    87
# 6:  john    77

      

+5


source


You may try:

 devtools::install_github("hadley/dplyr")
 library(dplyr)


 df %>% 
      group_by(name) %>% 
      arrange(desc(score)) %>%
       slice(1:2)

 #     name score
 #1  john    87
 #2  john    77
 #3  mark    99
 #4  mark    79
 #5 marry    98
 #6 marry    96

      

Or using data.table

 library(data.table)
 setDT(df)[order(-score), .SD[1:2], by=name]
 #      name score
 #1:  mark    99
 #2:  mark    79
 #3: marry    98
 #4: marry    96
 #5:  john    87
 #6:  john    77

      

data

df <- structure(list(name = c("marry", "marry", "marry", "marry", "mark", 
   "mark", "mark", "john", "john"), score = c(98L, 77L, 87L, 96L, 
   99L, 44L, 79L, 87L, 77L)), .Names = c("name", "score"), class = "data.frame", row.names = c(NA, 
  -9L))

      

Benchmarks

Methods included dplyr

and base R

by @Richard Scriven. Two datasets are created: 1) Same as generated by @Ananda Mahto's 2nd dataset, 2) 50 times larger.

  • Data

    set.seed(1) #similar dataset as created by @Ananda Mahto
    dfAM <- data.frame(
                name = sample(10000, 1000000, TRUE),
                score = sample(0:100, 1000000, TRUE)
                )
    
    
    set.seed(1)
    df2 <- data.frame(
                 name = sample(10000, 50*1000000, TRUE),
                 score = sample(0:100, 50*1000000, TRUE)
                )
    
          

  • Functions

      aMahto <- function(mydf) {mydf[with(mydf, 
                 ave(-score, name, FUN = order)) %in% c(1, 2), ]
               }
    
      akrun1 <- function(mydf) {setDT(mydf)[order(-score), .SD[1:2], by=name] }
      akrun2 <- function(mydf) {setDT(mydf)[order(-score), head(.SD,2), by=name] }
      dArenburg <- function(mydf){ setorder(setDT(mydf), -score)[,
                                                head(.SD,2), by=name]}
      akrun3 <- function(mydf) { mydf %>% group_by(name) %>% 
                                   arrange(desc(score)) %>% slice(1:2) }
    
    
      rScriven1 <- function(mydf) {sapply(split(mydf$score, mydf$name),
                                           function(x) tail(sort(x), 2))}
      rScriven2 <- function(mydf) {stack(lapply(split(mydf$score, mydf$name),
                                            function(x) tail(sort(x), 2)))}
    
          

  • Benchmarks

    Run tests for each dataset separately.

      library(microbenchmark)
      microbenchmark(aMahto(dfAM), akrun1(dfAM), akrun2(dfAM), akrun3(dfAM),
                    dArenburg(dfAM), rScriven1(dfAM), rScriven2(dfAM), times=20L)
       Unit: milliseconds
             expr        min         lq       mean     median         uq
       aMahto(dfAM)  278.11839  283.82547  293.16843  285.45645  298.77528
       akrun1(dfAM) 2900.86528 2923.94839 2953.00178 2942.73172 2965.85868
       akrun2(dfAM)  189.93849  195.05222  202.30496  196.51019  207.73886
       akrun3(dfAM)   56.75191   57.14967   58.04335   57.32627   57.63047
    dArenburg(dfAM)  161.87583  166.64286  171.67832  168.24355  170.14656
    rScriven1(dfAM)  694.22503  701.85554  717.92201  713.15419  727.84196
    rScriven2(dfAM)  712.84676  728.75839  744.07167  738.74824  759.88312
            max neval
      335.04468    20
     3113.54895    20
      250.09765    20
       66.67198    20
      228.44948    20
      777.78973    20
      797.85850    20
    
          

In a larger dataset, @David Arenburg's method is Winner

.

    microbenchmark(aMahto(df2), akrun1(df2), akrun2(df2), akrun3(df2), 
                 dArenburg(df2), rScriven1(df2), rScriven2(df2), times=40L)
    Unit: seconds
            expr       min        lq      mean    median        uq       max
     aMahto(df2) 11.830111 12.027325 12.273881 12.213140 12.533628 13.196659
     akrun1(df2)  6.672874  6.890442  7.018749  6.956716  7.128060  7.542047
     akrun2(df2)  3.794502  3.829567  3.860565  3.847690  3.869065  4.143381
     akrun3(df2)  3.687974  3.725867  3.801861  3.743973  3.933935  4.102295
  dArenburg(df2)  1.531356  1.598570  1.647648  1.618573  1.640258  2.716042
  rScriven1(df2)  6.370144  6.573998  6.685313  6.616246  6.820830  7.118827
  rScriven2(df2)  6.551911  6.628134  6.743644  6.724310  6.867090  7.091750
     neval
      40
      40
      40
      40
      40
      40
      40

      

+4


source


This is a different conclusion, but how about this so that the names don't repeat themselves.

sapply(split(df$score, df$name), function(x) tail(sort(x), 2))
#      john mark marry
# [1,]   77   79    96
# [2,]   87   99    98

      

As suggested by Ananda Mahto, you can also use stack

withlapply

stack(lapply(split(df$score, df$name), function(x) tail(sort(x), 2)))
#   values   ind
# 1     77  john
# 2     87  john
# 3     79  mark
# 4     99  mark
# 5     96 marry
# 6     98 marry

      

+3


source







All Articles