Concatenate 100 files based on a column in R

Question

Concatenate 100 files based on a column in R

I have dataframes (100) like these:

Now I want to concatenate them based on the value in the third column and form one file, so that if there is a matching value in the third column, then there will be a value in the else sequence. Therefore, the resulting file will be,

  v1 v2  v3
1  1  2 aj0
2  2  3 bit
3  2  4 00g
4  3  4 ct0
5  3  5 er0
6  4  5 d00
7  6  7 00i
8  8  9 00t

I managed to find the code for this:

> df12 <- merge(df1, df2, by=c("v1", "v2"), all=TRUE)
> df123 <- merge(df12, df3, by=c("v1", "v2"), all=TRUE)
> df123$v3 <- paste0(df123[,3], df123[,4], df123[,5])
> df123$v3.x <- df123$v3.y <- NULL
> df123$v3 <- gsub("NA", "0", df123$v3)

However, I have 100 data frames, how can I do this with them? Thank!

+3

r

chan chong 25 oct. 14 at 17:09

source to share

2 answers

You can do this with a Reduce

list of dataframes:

dfs <- list(df1, df2, df3)
merged <- Reduce(function(x, y) merge(x, y, by=c("v1", "v2"), all=T), dfs)
v3 <- gsub("NA", "0", apply(merged[,c(-1, -2)], 1, paste, collapse=""))
data.frame(v1=merged$v1, v2=merged$v2, v3)
#   v1 v2  v3
# 1  1  2 aj0
# 2  2  3 bit
# 3  2  4 00g
# 4  3  4 ct0
# 5  3  5 er0
# 6  4  5 d00
# 7  6  7 00i
# 8  8  9 00t

+3

josliber 25 oct. 14 at 17:27

source to share

akrun · Accepted Answer · 2014-10-25T17:54:37+0000

Another way: dplyr

library(dplyr)
library(tidyr)

unnest(dfs,group) %>%
                  spread(group, v3, fill=0) %>% 
                    unite(v3, df1:df3, sep="")
 #  v1 v2  v3
 #1  1  2 aj0
 #2  2  3 bit
 #3  2  4 00g
 #4  3  4 ct0
 #5  3  5 er0
 #6  4  5 d00
 #7  6  7 00i
 #8  8  9 00t

Or using data.table

 library(data.table)
 dt <- rbindlist(Map(`cbind`,group=names(dfs),dfs))
 dcast.data.table(dt, v1+v2~group, value.var="v3", fill="0")[,
                list(v3=do.call(`paste0`, .SD)), by=list(v1,v2), .SDcols=names(dfs)]

 #   v1 v2  v3
 #1:  1  2 aj0
 #2:  2  3 bit
 #3:  2  4 00g
 #4:  3  4 ct0
 #5:  3  5 er0
 #6:  4  5 d00
 #7:  6  7 00i
 #8:  8  9 00t

Update

If you have files P1.txt

, P2.txt

etc., you can read the files in the list and then apply the above code. Assuming the files are in the working directory

  files <- list.files(pattern="^P\\d+")
  dfs <-lapply(files, function(x) read.table(x, header=TRUE, sep="") #or use `fread` from `data.table`
  nm1 <- gsub("\\..*", "", files)
  dfs <- setNames(dfs, nm1)

You need to replace df1:df3

the code dplyr

withP1:P100

  unnest(dfs,group) %>%
              spread(group, v3, fill=0) %>% 
                unite(v3, P1:P100, sep="")


###data

df1 <- structure(list(v1 = c(1L, 2L, 3L, 4L, 3L), v2 = c(2L, 3L, 4L, 
5L, 5L), v3 = c("a", "b", "c", "d", "e")), .Names = c("v1", "v2", 
"v3"), class = "data.frame", row.names = c("1", "2", "3", "4", 
"5"))

df2 <- structure(list(v1 = c(1L, 2L, 3L, 3L), v2 = 2:5, v3 = c("j", 
"i", "t", "r")), .Names = c("v1", "v2", "v3"), class = "data.frame", 
 row.names = c("1", "2", "3", "4"))

df3 <- structure(list(v1 = c(2L, 2L, 6L, 8L), v2 = c(3L, 4L, 7L, 9L), 
 v3 = c("t", "g", "i", "t")), .Names = c("v1", "v2", "v3"), class = "data.frame",
 row.names = c("1", "2", "3", "4"))

 dfs <- mget(ls(pattern="^df\\d+"))

Concatenate 100 files based on a column in R

Update

More articles: