Efficiently applying functions to specific data.frame columns across a list of data.frames
I have a list of data.frames. For example:
# Build a reproducible example: a list of five data.frames that share the same
# ids, each holding freshly drawn numeric columns.
set.seed(1)
ids <- c("a", "b", "c", "d", "e")
# lapply() creates the whole list in one pass instead of growing it inside a
# loop. Columns are drawn in the same order as before (p, m, hp, hm, d, a),
# so the random stream -- and therefore every value -- is unchanged.
my_list <- lapply(seq_len(5), function(i) {
  n <- length(ids)
  data.frame(
    id = ids,
    p = rnorm(n),
    m = rnorm(n),
    hp = runif(n),
    hm = runif(n),
    d = rnorm(n),
    a = rnorm(n)
  )
})
I want to efficiently compute, for each id (in the "id" column), the variance of the "p", "m", "d" and "a" columns across all data frames in the list. Ideally, this would return a data.frame
like this (based on the values given above):
> result.df
id var_p var_m var_d var_a
1 a 0.2371569 1.7810729 0.08264279 0.5074250
2 b 0.1091675 0.2107997 1.15051229 1.1578691
3 c 0.5385789 0.7650123 0.44215343 0.3137903
4 d 1.0174542 0.7818498 0.06414317 0.6079849
5 e 0.7343667 1.2870542 1.41615858 0.7362462
source to share
Using my_list
library(plyr)
# Stack every data.frame in the list into one long data.frame.
df <- do.call(rbind, my_list)
# Split by id and apply var() column-wise to p, m, d and a only.
out <- ddply(
  .data = df,
  .variables = .(id),
  .fun = colwise(var, c("p", "m", "d", "a"))
)
#> out
# id p m d a
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
Or a base R alternative using a combination of lapply and apply
# Stack every data.frame in the list into one long data.frame.
df <- do.call(rbind, my_list)
# Split the rows by id, take the column-wise variance of p, m, d and a within
# each piece, and bind the per-id results into a matrix (one row per id).
df1 <- do.call(
  rbind,
  lapply(
    split(df, df$id),
    function(x) apply(x[, c("p", "m", "d", "a")], 2, var)
  )
)
# Convert to a data.frame and expose the row names as an explicit id column.
out <- transform(df1, id = row.names(df1))
#> out
# p m d a id
#a 0.2371569 1.7810729 0.08264279 0.5074250 a
#b 0.1091675 0.2107997 1.15051229 1.1578691 b
#c 0.5385789 0.7650123 0.44215343 0.3137903 c
#d 1.0174542 0.7818498 0.06414317 0.6079849 d
#e 0.7343667 1.2870542 1.41615858 0.7362462 e
Or using doBy
library(doBy)
# Stack every data.frame in the list into one long data.frame.
df <- do.call(rbind, my_list)
# Variance of p, m, d and a within each id; keep.names = TRUE keeps the
# original column names instead of appending the function name.
out <- summaryBy(
  p + m + d + a ~ id,
  data = df,
  FUN = var,
  keep.names = TRUE
)
#> out
# id p m d a
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
Or using sqldf
library(sqldf)
# Stack every data.frame in the list into one long data.frame; the SQL below
# refers to it by the name `df`.
df <- do.call(rbind, my_list)
# Group by id and compute the variance of each column in SQL.
out <- sqldf(
  "select id, variance(p), variance(m),
variance(d), variance(a) from df group by id"
)
#> out
# id variance(p) variance(m) variance(d) variance(a)
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
source to share
Here is a base R approach
# Stack every data.frame in the list into one long data.frame.
dat <- do.call(rbind, my_list)
# Per-id variance of p, m, d and a; the result is printed, not stored.
aggregate(cbind(p, m, d, a) ~ id, data = dat, FUN = var)
which gives
id p m d a
1 a 0.2371569 1.7810729 0.08264279 0.5074250
2 b 0.1091675 0.2107997 1.15051229 1.1578691
3 c 0.5385789 0.7650123 0.44215343 0.3137903
4 d 1.0174542 0.7818498 0.06414317 0.6079849
5 e 0.7343667 1.2870542 1.41615858 0.7362462
source to share
library(data.table)
# rbindlist() stacks the list into one data.table; var() is then applied to
# the columns named in .SDcols within each id group.
rbindlist(my_list)[,
  lapply(.SD, var),
  .SDcols = c("p", "m", "d", "a"),
  by = id
]
# id p m d a
# 1: a 0.2371569 1.7810729 0.08264279 0.5074250
# 2: b 0.1091675 0.2107997 1.15051229 1.1578691
# 3: c 0.5385789 0.7650123 0.44215343 0.3137903
# 4: d 1.0174542 0.7818498 0.06414317 0.6079849
# 5: e 0.7343667 1.2870542 1.41615858 0.7362462
source to share
Updated to use bind_rows()
(more efficient than do.call(rbind, ...);
@hadley's suggestion)
library(dplyr)
# Stack the list of data.frames (bind_rows() takes the list itself) and keep
# only the grouping column plus the columns to summarise.
dat <- bind_rows(my_list)[, c("id", "p", "m", "d", "a")]
# across() replaces the deprecated summarise_each()/funs() pair; the grouping
# column is excluded from everything() automatically.
dat %>%
  group_by(id) %>%
  summarise(across(everything(), var))
# id p m d a
# 1 a 0.2371569 1.7810729 0.08264279 0.5074250
# 2 b 0.1091675 0.2107997 1.15051229 1.1578691
# 3 c 0.5385789 0.7650123 0.44215343 0.3137903
# 4 d 1.0174542 0.7818498 0.06414317 0.6079849
# 5 e 0.7343667 1.2870542 1.41615858 0.7362462
source to share