Plot overlapping positions from different files in R

Question

Plot overlapping positions from different files in R

I have two large files with positions Start

and End

and two sample columns (numeric).

File 1:

Start  End  Sample1  Sample2
1         60      1       4
100       200     2       1
201       250     1       4
300       450     1       1

File 2:

Start  End  Sample1  Sample2
40         60      1       1
70        180      1       1
240       330      2       1
340       450      1       4
500       900      1       4
980       1200     2       1

First I would like to take the first positions to Start

and End

from the first file and make a segment plot. The plot should also take into account Start-20

and End+20

for each position in the first file.

Then I would like to take the overlapping Start

and End

positions from the second file and plot it in the above image. Thus, there will be many plots based on positions Start

and End

from the first file, and one of them with overlapping will also be plotted separately.

color

for each segment will be based on two samples (for example in both files if its 1 and 4

segment color is red

, if its 1 and 1

segment color is be green

, etc.).

I would really appreciate it if someone could make me figure out how to make a function to do this in R.

Thanks in advance.

PS I have attached a drawing for output enter image description here . I only showed two results.

Below is the code I wrote but it gives error

Error in match.names (clabs, names (xi)): names do not match previous names

Also I need to specify red for line segment of dataset1 and green for dataset2. How do I implement it in the code below?

overlap_func <- function(dataset1,dataset2) {

for(i in 1:nrow(dataset1))
 {

 loop_start <- dataset1[i,"Start"]
 loop_end <- dataset1[i,"End"]
 p <- dataset2[,c(1,2)]   
 dataset1_pos <- data.frame(loop_start,loop_end)
 dataset2_filter <- p[p$Start >= (loop_start-(loop_start/2)) & p$End <= (loop_end+ (loop_end/2)), ]
 data_in_loop <- rbind(dataset1_pos,dataset2_filter)
 plot_function(data_in_loop,loop_start,loop_end)

 }
 }


plot_function <- function(loop_data,start,end){ 
 pos <- 1:nrow(loop_data)
 dat1 <- cbind(pos,loop_data)
 colnames(dat1) <- c("pos","start","end")
 pdf(file=paste0("path where plots are generated","_",start,"-",end,"_","overlap.pdf"))
 plot(dat1$pos, type = 'n', xlim = range(c(start-(start/2), end+(end/2))))
 segments(dat1$start, dat1$pos, dat1$end, dat1$pos)
 dev.off()
 }


df1 <- read.table(header=T, text="Start  End  Sample1  Sample2
1         60      1       4
100       200     2       1
201       250     1       4
300       450     1       1")

df2 <- read.table(header=T, text="Start  End  Sample1  Sample2
40         60      1       1
70        180      1       1
240       330      2       1
340       450      1       4
500       900      1       4
980       1200     2       1")

 overlap_func(df1,df2)

+3

r plot ggplot2 lattice

glow 01 Feb 13 at 12:55

source to share

2 answers

I tried to solve this with a package lattice

. I specifically use a function Shingle

to get an idea of the intervals for comparison. I was hoping I could combine 2 tiles, but I can't. So, once I had my first plot, I used (as in the solution above) a package IRanges

to calculate overlaps. The idea is final dotplot

.

## I red the input data
dat <- read.table(text = 'Start  End  Sample1  Sample2
1         60      1       4
100       200     2       1
201       250     1       4
300       450     1       1', header = T) 

dat1 <- read.table(text = 'Start  End  Sample1  Sample2
40         60      1       1
70        180      1       1
240       330      2       1
340       450      1       4
500       900      1       4
980       1200     2       1', header = T) 


## I create my 2 shingles
dat.sh <- shingle(x = dat[,3], intervals = dat[,c(1,2)])
dat1.sh <- shingle(x = dat1[,3], intervals = dat1[,c(1,2)])
## compute max value for plot comparison
max.value <- max(c(dat$End,dat1$End))
## I plot the 2 series with differents color
p1<- plot(dat.sh, xlim= c(0,max.value),col = 'red')
p2 <- plot(dat1.sh,xlim= c(0,max.value), col ='green')
library(gridExtra)
grid.arrange(p1,p2)

This is a quick method for comparing my intervals.

enter image description here

It looks good, but I cannot go any further with the pebbles as I cannot combine them in the same plot. SO I will be using a package IRanges

to calculate overlaps.

library(IRanges)
rang1 <- IRanges(start=dat[,1], end = dat[,2])
rang2 <- IRanges(start=dat1[,1], end = dat1[,2])
dat.plot     <- dat1                    # use the first data.frame
dat.plot$group <- 'origin'
dat.plot$id <- rownames(dat1)           ## add an Id for each row
rang.o <- findOverlaps(rang2,rang1)     # get overlaps
dat.o <- dat1[rang.o@queryHits,]        ## construct overlaps data.frame 
dat.o$id <- rang.o@subjectHits
dat.o$group <- 'overlap'
dat.plot <- rbind(dat.plot,dat.o)      ## union of all 
dotplot(id ~End-Start|group  , data=dat.plot, 
                               groups = col,type = c("p", "h"))

enter image description here

+1

agstudy 01 Feb 13 at 16:07

source to share

Arun · Accepted Answer · 2013-02-01T13:49:59+0000

Something like that??

df1 <- read.table(header=T, text="Start  End  Sample1  Sample2
1         60      1       4
100       200     2       1
201       250     1       4
300       450     1       1")

df2 <- read.table(header=T, text="Start  End  Sample1  Sample2
40         60      1       1
70        180      1       1
240       330      2       1
340       450      1       4
500       900      1       4
980       1200     2       1")

require(IRanges)
require(ggplot2)
require(plyr)

df1$id <- factor(1:nrow(df1))
ir2 <- IRanges(df2$Start, df2$End)
out <- ddply(df1, .(id), function(x) {
    ir1 <- IRanges(x$Start, x$End)
    o.idx <- as.data.frame(findOverlaps(ir1, ir2))$subjectHits
    df.out <- rbind(x[, 1:4], df2[o.idx, ])
    df.out$id1 <- x$id
    df.out$id2 <- seq_len(nrow(df.out))
    df.out
})
out$id1 <- factor(out$id1)
out$id2 <- factor(out$id2)
out$id3 <- factor(1:nrow(out))

p <- ggplot(out, aes(x = Start, y = id3 , colour = id2)) 
p <- p + geom_segment(aes(xend = End, ystart = id3, yend = id3))
p <- p + scale_colour_brewer(palette = "Set1")
p

gglot2_no_facet_geom_segment

Edit: After seeing the updated blueprint, maybe this is what you want?

p + facet_wrap( ~ id1, scales="free")

ggplot2_facet_geom_segment

Edit: Save each graph in a facet in a separate file. You can do this by creating a graph each time, dividing byid1

d_ply(out, .(id1), function(ww) {
    p <- ggplot(ww, aes(x = Start, y = id3 , colour = id2)) 
    p <- p + geom_segment(aes(xend = End, ystart = id3, yend = id3))
    p <- p + scale_colour_brewer(palette = "Set1")
    fn <- paste0("~/Downloads/id", as.numeric(as.character(ww$id1[1])), ".pdf")
    ggsave(fn, p)
})

Set the path to fn

accordingly.

Plot overlapping positions from different files in R

More articles: