How to perform an operation between grouped values ββfrom two data frames
I have two data frames:
src_tbl <- structure(list(Sample_name = c("S1", "S2", "S1", "S2", "S1", "S2"), crt = c(0.079, 0.082, 0.079, 0.082, 0.079, 0.082), sr = c(0.592, 0.549, 0.592, 0.549, 0.592, 0.549), condition = c("x1", "x1", "x2", "x2", "x3", "x3"), score = c("0.077", "0.075", "0.483", "0.268", "0.555", "0.120")), row.names = c(NA, -6L), .Names = c("Sample_name", "crt", "sr", "condition", "score"), class = c("tbl_df", "tbl", "data.frame")) src_tbl #> Sample_name crt sr condition score #> 1 S1 0.079 0.592 x1 0.077 #> 2 S2 0.082 0.549 x1 0.075 #> 3 S1 0.079 0.592 x2 0.483 #> 4 S2 0.082 0.549 x2 0.268 #> 5 S1 0.079 0.592 x3 0.555 #> 6 S2 0.082 0.549 x3 0.120 ref_tbl <- structure(list(Sample_name = c("P1", "P2", "P3", "P1", "P2", "P3", "P1", "P2", "P3"), crt = c(1, 1, 1, 1, 1, 1, 1, 1, 1), sr = c(2, 2, 2, 2, 2, 2, 2, 2, 2), condition = c("r1", "r1", "r1", "r2", "r2", "r2", "r3", "r3", "r3"), score = c("0.200", "0.201", "0.199", "0.200", "0.202", "0.200", "0.200", "0.204", "0.197")), row.names = c(NA, -9L), .Names = c("Sample_name", "crt", "sr", "condition", "score"), class = c("tbl_df", "tbl", "data.frame")) ref_tbl #> Sample_name crt sr condition score #> 1 P1 1 2 r1 0.200 #> 2 P2 1 2 r1 0.201 #> 3 P3 1 2 r1 0.199 #> 4 P1 1 2 r2 0.200 #> 5 P2 1 2 r2 0.202 #> 6 P3 1 2 r2 0.200 #> 7 P1 1 2 r3 0.200 #> 8 P2 1 2 r3 0.204 #> 9 P3 1 2 r3 0.197
What I want to do is perform the ( ks.test()
) operation on the columns score
grouped by Sample_name
in both data frames. For example, the p-value of the KS test for S1 and P1:
# in src_tbl s1 <- c(0.077,0.483,0.555) #in ref_tbl p1 <- c(0.200,0.200,0.200) testout <- ks.test(s1,p1) #> Warning in ks.test(s1, p1): cannot compute exact p-value with ties broom::tidy(testout) #> statistic p.value method alternative #> 1 0.6666667 0.5175508 Two-sample Kolmogorov-Smirnov test two-sided
What I want to do is to do everything against the whole operation so that we end up with a table like
src ref p.value S1 P1 0.5175508 S1 P2 0.6 S1 P3 0.6 S2 P1 0.5175508 S2 P2 0.6 S2 P3 0.6
How can i do this? It is preferable to be fast because the number of samples in ref_table
can be large (P1, P2 .... P10k).
source to share
Here is the solution in tidyverse
. First, I insert the score into each source dataset:
ref_tbl <- ref_tbl %>%
mutate(ref = as.factor(Sample_name),
score_ref = as.numeric(score)) %>%
select(ref, score_ref) %>%
tidyr::nest(score_ref)
ref_tbl
# A tibble: 3 x 2
ref data
<fctr> <list>
1 P1 <tibble [3 x 1]>
2 P2 <tibble [3 x 1]>
3 P3 <tibble [3 x 1]>
src_tbl <- src_tbl %>%
mutate(src = as.factor(Sample_name),
score_src = as.numeric(score)) %>%
select(src, score_src) %>%
tidyr::nest(score_src)
src_tbl
# A tibble: 2 x 2
src data
<fctr> <list>
1 S1 <tibble [3 x 1]>
2 S2 <tibble [3 x 1]>
Then I create a grid with all combinations of sample names:
all_comb <- as_data_frame(expand.grid(src = src_tbl$src, ref = ref_tbl$ref))
all_comb
# A tibble: 6 x 2
src ref
<fctr> <fctr>
1 S1 P1
2 S2 P1
3 S1 P2
4 S2 P2
5 S1 P3
6 S2 P3
Now we can join the nested data and I am binding the columns, so there should be one column with scores for each combination.
all_comb <- all_comb %>%
left_join(ref_tbl, by = "ref") %>%
left_join(src_tbl, by = "src") %>%
mutate(data = purrr::map2(data.x, data.y, bind_cols)) %>%
select(-data.x, -data.y)
all_comb
# A tibble: 6 x 3
src ref data
<fctr> <fctr> <list>
1 S1 P1 <tibble [3 x 2]>
2 S2 P1 <tibble [3 x 2]>
3 S1 P2 <tibble [3 x 2]>
4 S2 P2 <tibble [3 x 2]>
5 S1 P3 <tibble [3 x 2]>
6 S2 P3 <tibble [3 x 2]>
Finally, I match ks.test
if each dataset, use a broom to get the p.value as requested.
final <- all_comb %>%
mutate(ks = purrr::map(data, ~ks.test(.$score_ref, .$score_src)),
tidied = purrr::map(ks, broom::tidy)) %>%
tidyr::unnest(tidied) %>%
select(src, ref, p.value)
Warning message: cannot compute exact p-value with ties
Warning message: cannot compute exact p-value with ties
final
# A tibble: 6 x 3
src ref p.value
<fctr> <fctr> <dbl>
1 S1 P1 0.5175508
2 S2 P1 0.5175508
3 S1 P2 0.6000000
4 S2 P2 0.6000000
5 S1 P3 0.6000000
6 S2 P3 0.6000000
source to share
Well it took a while, but I put together a hacky solution. I'm sure there is a more elegant way with similar ones ddply
, but that is beyond my scope. (Note that my p values ββare slightly different from yours as I have truncated one of the data frames)
library(dplyr)
library(tidyr)
ref_tbl<-ref_tbl[1:6,]#make equal rows for this example
dd<-as.data.frame(cbind(paste(src_tbl$Sample_name,'-', src_tbl$score),
paste(ref_tbl$Sample_name,'-',ref_tbl$score)))#concatenate sample names with their scores
ex<-expand.grid(x = levels(dd$V1), y = levels(dd$V2))#obtain all combinations
all<-ex %>%
separate(x, c("S","svalue"),"-")%>%
separate(y, c("P","pvalue"),"-")#unseparate now that we have the combinations
all$svalue<-as.numeric(all$svalue)#change to numeric for ks.test
all$pvalue<-as.numeric(all$pvalue)
x<-split(all,list(all$S,all$P))#split into a list of dataframes showing individual combinations
ks<-lapply(x,function(x)ks.test(x[,2],x[,4]))#apply ks.test to each individual combination
pval<-lapply(ks, '[[', 'p.value')#extract pvalues
do.call(rbind,pval)#final result at last!
# [,1]
#S1 .P1 0.5175508
#S2 .P1 0.5175508
#S1 .P2 0.1389203
#S2 .P2 0.1389203
#S1 .P3 0.1389203
#S2 .P3 0.1389203
source to share