Effective functions on specific data.frame columns in data.frames list

Question

Effective functions on specific data.frame columns in data.frames list

I have a list of data.frames. For example:

# Reproducible example data: a list of five data frames that share the same
# ids and column layout.
set.seed(1)
ids <- c("a", "b", "c", "d", "e")

# Build the list in one pass with lapply() instead of growing it inside a loop.
# The columns are drawn in the same order as before (p, m, hp, hm, d, a), so
# the random stream -- and therefore every value -- is unchanged.
my_list <- lapply(seq_len(5), function(i) {
  n <- length(ids)
  data.frame(
    id = ids,
    p = rnorm(n),
    m = rnorm(n),
    hp = runif(n),
    hm = runif(n),
    d = rnorm(n),
    a = rnorm(n)
  )
})

I want to efficiently compute, for each id (in the "id" column), the variance of the "p", "m", "d" and "a" columns across all data frames in the list. Ideally, this would return a data.frame like this (based on the values given above):

> result.df
  id     var_p     var_m      var_d     var_a
1  a 0.2371569 1.7810729 0.08264279 0.5074250
2  b 0.1091675 0.2107997 1.15051229 1.1578691
3  c 0.5385789 0.7650123 0.44215343 0.3137903
4  d 1.0174542 0.7818498 0.06414317 0.6079849
5  e 0.7343667 1.2870542 1.41615858 0.7362462

+3

list r dataframe

user1701545 June 26. 15 at 17:03

source to share

4 answers

Here is the base R approach:

# Stack every data frame in the list into one long table.
dat <- do.call(rbind, my_list)

# Variance of p, m, d and a within each id.
aggregate(cbind(p, m, d, a) ~ id, data = dat, FUN = var)

which gives

  id         p         m          d         a
1  a 0.2371569 1.7810729 0.08264279 0.5074250
2  b 0.1091675 0.2107997 1.15051229 1.1578691
3  c 0.5385789 0.7650123 0.44215343 0.3137903
4  d 1.0174542 0.7818498 0.06414317 0.6079849
5  e 0.7343667 1.2870542 1.41615858 0.7362462

+3

Frank June 26. 15 at 17:27

source to share

library(data.table)

# Columns whose per-id variance is wanted.
value_cols <- c("p", "m", "d", "a")

# Stack the list into a single data.table, then apply var() to each selected
# column within every id group.
stacked <- rbindlist(my_list)
stacked[, lapply(.SD, var), by = id, .SDcols = value_cols]
#    id         p         m          d         a
# 1:  a 0.2371569 1.7810729 0.08264279 0.5074250
# 2:  b 0.1091675 0.2107997 1.15051229 1.1578691
# 3:  c 0.5385789 0.7650123 0.44215343 0.3137903
# 4:  d 1.0174542 0.7818498 0.06414317 0.6079849
# 5:  e 0.7343667 1.2870542 1.41615858 0.7362462

+3

C8H10N4O2 June 26. 15 at 17:34

source to share

Updated to use bind_rows() (more efficient than do.call(rbind, ...); @hadley's suggestion).

library(dplyr)

# Stack the data frames held in my_list and keep only the id plus the columns
# to summarise.
dat <- bind_rows(my_list)[, c("id", "p", "m", "d", "a")]

# Per-id variance of every non-grouping column. across() is the current
# replacement for the deprecated summarise_each(funs(...)).
dat %>%
  group_by(id) %>%
  summarise(across(everything(), var))

#   id         p         m          d         a
# 1  a 0.2371569 1.7810729 0.08264279 0.5074250
# 2  b 0.1091675 0.2107997 1.15051229 1.1578691
# 3  c 0.5385789 0.7650123 0.44215343 0.3137903
# 4  d 1.0174542 0.7818498 0.06414317 0.6079849
# 5  e 0.7343667 1.2870542 1.41615858 0.7362462

+2

jenesaisquoi June 26. 15 at 17:27

source to share

Veerendra gadekar · Accepted Answer · 2015-06-26T17:21:32+0000

Using my_list

library(plyr)

# Combine the list into one long data frame.
df <- do.call(rbind, my_list)

# colwise() turns var() into a function applied to each named column;
# ddply() then runs it once per id group.
var_by_col <- colwise(var, c("p", "m", "d", "a"))
out <- ddply(df, .(id), var_by_col)

#> out
#  id         p         m          d         a
#1  a 0.2371569 1.7810729 0.08264279 0.5074250
#2  b 0.1091675 0.2107997 1.15051229 1.1578691
#3  c 0.5385789 0.7650123 0.44215343 0.3137903
#4  d 1.0174542 0.7818498 0.06414317 0.6079849
#5  e 0.7343667 1.2870542 1.41615858 0.7362462

Or a base R alternative using a combination of lapply and apply:

# Combine the list into one long data frame and split it back up by id.
df <- do.call(rbind, my_list)
per_id <- split(df, df$id)

# Column-wise variance of p, m, d and a for one id's rows.
col_vars <- function(x) {
  apply(subset(x, select = c(p, m, d, a)), 2, var)
}

# One row of variances per id; row names carry the id values.
df1 <- do.call(rbind, lapply(per_id, col_vars))

# Expose the ids as a regular column alongside the variances.
out <- transform(df1, id = row.names(df1))

#> out
#          p         m          d         a id
#a 0.2371569 1.7810729 0.08264279 0.5074250  a
#b 0.1091675 0.2107997 1.15051229 1.1578691  b
#c 0.5385789 0.7650123 0.44215343 0.3137903  c
#d 1.0174542 0.7818498 0.06414317 0.6079849  d
#e 0.7343667 1.2870542 1.41615858 0.7362462  e

Or using doBy

library(doBy)

# Combine the list into one long data frame.
df <- do.call(rbind, my_list)

# Variance of each left-hand-side column per id; keep.names = TRUE keeps the
# original column names in the result.
out <- summaryBy(
  p + m + d + a ~ id,
  data = df,
  FUN = var,
  keep.names = TRUE
)

#> out
#  id         p         m          d         a
#1  a 0.2371569 1.7810729 0.08264279 0.5074250
#2  b 0.1091675 0.2107997 1.15051229 1.1578691
#3  c 0.5385789 0.7650123 0.44215343 0.3137903
#4  d 1.0174542 0.7818498 0.06414317 0.6079849
#5  e 0.7343667 1.2870542 1.41615858 0.7362462

Or using sqldf

library(sqldf)

# Combine the list into one long data frame; the query refers to it by name.
df <- do.call(rbind, my_list)

# Grouped variance of each column, expressed as SQL.
query <- "select id, variance(p), variance(m), 
             variance(d), variance(a) from df group by id"
out <- sqldf(query)

#> out
#  id variance(p) variance(m) variance(d) variance(a)
#1  a   0.2371569   1.7810729  0.08264279   0.5074250
#2  b   0.1091675   0.2107997  1.15051229   1.1578691
#3  c   0.5385789   0.7650123  0.44215343   0.3137903
#4  d   1.0174542   0.7818498  0.06414317   0.6079849
#5  e   0.7343667   1.2870542  1.41615858   0.7362462

Effective functions on specific data.frame columns in data.frames list

More articles: