Concatenate 100 files based on a column in R
I have dataframes (100) like these:
df1
v1 v2 v3
1 1 2 a
2 2 3 b
3 3 4 c
4 4 5 d
5 3 5 e
df2
v1 v2 v3
1 1 2 j
2 2 3 i
3 3 4 t
4 3 5 r
df3
v1 v2 v3
1 2 3 t
2 2 4 g
3 6 7 i
4 8 9 t
Now I want to concatenate them based on the value in the third column and form one file, so that if there is a matching value in the third column, then there will be a value in the else sequence. Therefore, the resulting file will be,
v1 v2 v3
1 1 2 aj0
2 2 3 bit
3 2 4 00g
4 3 4 ct0
5 3 5 er0
6 4 5 d00
7 6 7 00i
8 8 9 00t
I managed to find the code for this:
> df12 <- merge(df1, df2, by=c("v1", "v2"), all=TRUE)
> df123 <- merge(df12, df3, by=c("v1", "v2"), all=TRUE)
> df123$v3 <- paste0(df123[,3], df123[,4], df123[,5])
> df123$v3.x <- df123$v3.y <- NULL
> df123$v3 <- gsub("NA", "0", df123$v3)
However, I have 100 data frames, how can I do this with them? Thank!
source to share
Another way: dplyr
library(dplyr)
library(tidyr)
unnest(dfs,group) %>%
spread(group, v3, fill=0) %>%
unite(v3, df1:df3, sep="")
# v1 v2 v3
#1 1 2 aj0
#2 2 3 bit
#3 2 4 00g
#4 3 4 ct0
#5 3 5 er0
#6 4 5 d00
#7 6 7 00i
#8 8 9 00t
Or using data.table
library(data.table)
dt <- rbindlist(Map(`cbind`,group=names(dfs),dfs))
dcast.data.table(dt, v1+v2~group, value.var="v3", fill="0")[,
list(v3=do.call(`paste0`, .SD)), by=list(v1,v2), .SDcols=names(dfs)]
# v1 v2 v3
#1: 1 2 aj0
#2: 2 3 bit
#3: 2 4 00g
#4: 3 4 ct0
#5: 3 5 er0
#6: 4 5 d00
#7: 6 7 00i
#8: 8 9 00t
Update
If you have files P1.txt
, P2.txt
etc., you can read the files in the list and then apply the above code. Assuming the files are in the working directory
files <- list.files(pattern="^P\\d+")
dfs <-lapply(files, function(x) read.table(x, header=TRUE, sep="") #or use `fread` from `data.table`
nm1 <- gsub("\\..*", "", files)
dfs <- setNames(dfs, nm1)
You need to replace df1:df3
the code dplyr
withP1:P100
unnest(dfs,group) %>%
spread(group, v3, fill=0) %>%
unite(v3, P1:P100, sep="")
###data
df1 <- structure(list(v1 = c(1L, 2L, 3L, 4L, 3L), v2 = c(2L, 3L, 4L,
5L, 5L), v3 = c("a", "b", "c", "d", "e")), .Names = c("v1", "v2",
"v3"), class = "data.frame", row.names = c("1", "2", "3", "4",
"5"))
df2 <- structure(list(v1 = c(1L, 2L, 3L, 3L), v2 = 2:5, v3 = c("j",
"i", "t", "r")), .Names = c("v1", "v2", "v3"), class = "data.frame",
row.names = c("1", "2", "3", "4"))
df3 <- structure(list(v1 = c(2L, 2L, 6L, 8L), v2 = c(3L, 4L, 7L, 9L),
v3 = c("t", "g", "i", "t")), .Names = c("v1", "v2", "v3"), class = "data.frame",
row.names = c("1", "2", "3", "4"))
dfs <- mget(ls(pattern="^df\\d+"))
source to share
You can do this with a Reduce
list of dataframes:
dfs <- list(df1, df2, df3)
merged <- Reduce(function(x, y) merge(x, y, by=c("v1", "v2"), all=T), dfs)
v3 <- gsub("NA", "0", apply(merged[,c(-1, -2)], 1, paste, collapse=""))
data.frame(v1=merged$v1, v2=merged$v2, v3)
# v1 v2 v3
# 1 1 2 aj0
# 2 2 3 bit
# 3 2 4 00g
# 4 3 4 ct0
# 5 3 5 er0
# 6 4 5 d00
# 7 6 7 00i
# 8 8 9 00t
source to share