Efficiently applying functions to specific data.frame columns across a list of data.frames

I have a list of data.frames, for example:

# Build an example list of five data frames that all share the same ids.
# Each data frame has one row per id and six numeric columns of random values.
ids <- c("a", "b", "c", "d", "e")
n_ids <- length(ids)

# Preallocate the list instead of growing it inside the loop.
my_list <- vector("list", 5L)
for (i in seq_along(my_list)) {
  # Columns are drawn in the same order as they are listed (p, m, hp, hm, d, a),
  # so a given random seed yields the same values as the one-line version.
  my_list[[i]] <- data.frame(
    id = ids,
    p = rnorm(n_ids),
    m = rnorm(n_ids),
    hp = runif(n_ids),
    hm = runif(n_ids),
    d = rnorm(n_ids),
    a = rnorm(n_ids)
  )
}


I want to efficiently compute, for each id (in the "id" column), the variance of the "p", "m", "d" and "a" columns across all data frames in the list. Ideally, this would return a data.frame like this (based on the values given above):

> result.df
  id     var_p     var_m      var_d     var_a
1  a 0.2371569 1.7810729 0.08264279 0.5074250
2  b 0.1091675 0.2107997 1.15051229 1.1578691
3  c 0.5385789 0.7650123 0.44215343 0.3137903
4  d 1.0174542 0.7818498 0.06414317 0.6079849
5  e 0.7343667 1.2870542 1.41615858 0.7362462



source to share

4 answers

Using my_list and the plyr package

# ddply(), colwise() and .() come from plyr, so attach it before use.
library(plyr)

# Stack every data frame in the list into one long data frame.
df <- do.call(rbind, my_list)

# Split by id and apply var() column-wise to p, m, d and a only.
out <- ddply(df, .(id), colwise(var, c("p", "m", "d", "a")))

#> out
#  id         p         m          d         a
#1  a 0.2371569 1.7810729 0.08264279 0.5074250
#2  b 0.1091675 0.2107997 1.15051229 1.1578691
#3  c 0.5385789 0.7650123 0.44215343 0.3137903
#4  d 1.0174542 0.7818498 0.06414317 0.6079849
#5  e 0.7343667 1.2870542 1.41615858 0.7362462


Or a base R alternative using a combination of split, lapply and apply


# Stack every data frame in the list into one long data frame.
df <- do.call(rbind, my_list)

# Keep only the columns of interest and split the rows into one piece per id.
pieces <- split(df[, c("p", "m", "d", "a")], df$id)

# Compute the per-column variance within each piece, then bind the resulting
# named vectors into a matrix with one row per id (row names are the ids).
df1 <- do.call(
  rbind,
  lapply(pieces, function(piece) vapply(piece, var, numeric(1)))
)

# Convert to a data frame and expose the ids as a regular column.
out <- transform(df1, id = row.names(df1))

#> out
#          p         m          d         a id
#a 0.2371569 1.7810729 0.08264279 0.5074250  a
#b 0.1091675 0.2107997 1.15051229 1.1578691  b
#c 0.5385789 0.7650123 0.44215343 0.3137903  c
#d 1.0174542 0.7818498 0.06414317 0.6079849  d
#e 0.7343667 1.2870542 1.41615858 0.7362462  e


Or using doBy

# summaryBy() comes from doBy, so attach it before use.
library(doBy)

# Stack every data frame in the list into one long data frame.
df <- do.call(rbind, my_list)

# Left of ~: the columns to summarise; right of ~: the grouping column.
# keep.names = TRUE keeps the original column names (p, m, d, a) in the result.
out <- summaryBy(p + m + d + a ~ id, data = df, keep.names = TRUE, FUN = var)

#> out
#  id         p         m          d         a
#1  a 0.2371569 1.7810729 0.08264279 0.5074250
#2  b 0.1091675 0.2107997 1.15051229 1.1578691
#3  c 0.5385789 0.7650123 0.44215343 0.3137903
#4  d 1.0174542 0.7818498 0.06414317 0.6079849
#5  e 0.7343667 1.2870542 1.41615858 0.7362462


Or using sqldf

# sqldf() comes from the sqldf package, so attach it before use.
library(sqldf)

# Stack every data frame in the list into one long data frame; the query
# below refers to it by its variable name, `df`.
df <- do.call(rbind, my_list)

# Group by id and compute the variance of each column in SQL. The result
# columns are named after the SQL expressions, e.g. `variance(p)`.
out <- sqldf("select id, variance(p), variance(m), 
             variance(d), variance(a) from df group by id")

#> out
#  id variance(p) variance(m) variance(d) variance(a)
#1  a   0.2371569   1.7810729  0.08264279   0.5074250
#2  b   0.1091675   0.2107997  1.15051229   1.1578691
#3  c   0.5385789   0.7650123  0.44215343   0.3137903
#4  d   1.0174542   0.7818498  0.06414317   0.6079849
#5  e   0.7343667   1.2870542  1.41615858   0.7362462




Here is a base R approach

# Stack every data frame in the list into one long data frame.
dat <- do.call(rbind, my_list)

# Formula interface: cbind() on the left names the columns to summarise,
# id on the right is the grouping column; var() is applied per group.
aggregate(cbind(p, m, d, a) ~ id, data = dat, FUN = var)


which gives

  id         p         m          d         a
1  a 0.2371569 1.7810729 0.08264279 0.5074250
2  b 0.1091675 0.2107997 1.15051229 1.1578691
3  c 0.5385789 0.7650123 0.44215343 0.3137903
4  d 1.0174542 0.7818498 0.06414317 0.6079849
5  e 0.7343667 1.2870542 1.41615858 0.7362462




# rbindlist() and the [i, j, by] syntax come from data.table, so attach it.
library(data.table)

# Stack the list into one data.table, then compute var() for each column
# named in .SDcols within every id group.
rbindlist(my_list)[, lapply(.SD, var),
  by = id, .SDcols = c("p", "m", "d", "a")
]
#    id         p         m          d         a
# 1:  a 0.2371569 1.7810729 0.08264279 0.5074250
# 2:  b 0.1091675 0.2107997 1.15051229 1.1578691
# 3:  c 0.5385789 0.7650123 0.44215343 0.3137903
# 4:  d 1.0174542 0.7818498 0.06414317 0.6079849
# 5:  e 0.7343667 1.2870542 1.41615858 0.7362462




Updated to use bind_rows() (more efficient than do.call(rbind, ...), per @hadley's suggestion):

# bind_rows(), group_by(), summarise(), across() and %>% come from dplyr.
library(dplyr)

# Stack the list of data frames (not a previously built `dat`) into one data
# frame, keeping only the id column and the four columns to summarise.
dat <- bind_rows(my_list)[, c("id", "p", "m", "d", "a")]

# Variance of every non-grouping column per id. across() replaces the
# deprecated summarise_each(funs(...)) form.
dat %>%
  group_by(id) %>%
  summarise(across(everything(), var))

#   id         p         m          d         a
# 1  a 0.2371569 1.7810729 0.08264279 0.5074250
# 2  b 0.1091675 0.2107997 1.15051229 1.1578691
# 3  c 0.5385789 0.7650123 0.44215343 0.3137903
# 4  d 1.0174542 0.7818498 0.06414317 0.6079849
# 5  e 0.7343667 1.2870542 1.41615858 0.7362462




All Articles