R Double cycle optimization, matrix manipulation

I am trying to manipulate column data in a two column matrix and output it as data.frame file.

The matrix I have is in this format where both the values ​​in the start and end columns are incremented and do not overlap. Also, there are always starting records than ending records.

Suppose I start with this matrix:

#       Start   End
#  [1,]     1     6
#  [2,]     2     9
#  [3,]     3    15
#  [4,]     7    NA
#  [5,]     8    NA
#  [6,]    11    NA
#  [7,]    12    NA
#  [8,]    14    NA

      

I want this double to loop to output a data.frame that groups all Start values ​​less than the End value and associates it with that End value.

To clarify, I want to output this:

#       Start   End
#  1    1,2,3     6
#  2      7,8     9
#  3 11,12,14    15

      

I tried a double loop, but I need something faster because I want to use this method for a larger ~ 5MB matrix.

start_end <- matrix(c(1, 6, 2, 9, 3, 15, 7, NA, 8, NA, 11, NA, 12, NA, 14, NA), 
  nrow=8, 
  ncol=2)

# of non NA rows in column 2
non_nacol <- sum(is.na(start_end[,2]))

sorted.output <- data.frame(matrix(NA, nrow = nrow(start_end), ncol = 0))
sorted.output$start <- 0
sorted.output$end <- 0

#Sort and populate data frame
for (k in 1:non_nacol) {
  for (j in 1:nrow(start_end)) {
        if (start_end[j,1]<start_end[k,2]) {
        S <- (start_end[j,1])
        E <- (start_end[k,2])
        sorted.output$start[j] <- S
        sorted.output$end[j] <- E
        }
  }
}

      

Thanks for the help!

+3


source to share


4 answers


You can use Rcpp:



start_end <- matrix(c(1, 6, 2, 9, 3, 15, 7, NA, 8, NA, 11, NA, 12, NA, 14, NA), 
                    nrow=8, 
                    ncol=2, byrow = TRUE)

library(Rcpp)
cppFunction('
            DataFrame fun(const IntegerMatrix& Mat) {
              IntegerVector start = na_omit(Mat(_, 0)); // remove NAs from starts
              std::sort(start.begin(), start.end()); // sort starts
              IntegerVector end = na_omit(Mat(_, 1)); // remove NAs from ends
              std::sort(end.begin(), end.end()); // sort ends
              IntegerVector res = clone(start); // initialize vector for matching ends
              int j = 0;
              for (int i = 0; i < start.length(); i++) { // loop over starts
                while (end(j) < start(i) && j < (end.length() - 1)) { // find corresponding end
                  j++;
                }
                if (end(j) >= start(i)) res(i) = end(j); // assign end
                else res(i) = NA_INTEGER; // assign NA if no end >= start exists
              }
              return DataFrame::create(_["start"]= start, _["end"]= res); // return a data.frame
            }
            ')

Res <- fun(start_end)

library(data.table)
setDT(Res)
Res[, .(start = paste(start, collapse = ",")), by = end]
#   end    start
#1:   6    1,2,3
#2:   9      7,8
#3:  15 11,12,14

      

+3


source


Here the solution built around findInterval()

, split()

and paste()

:

m <- matrix(c(1,2,3,7,8,11,12,14,6,9,15,NA,NA,NA,NA,NA),ncol=2,dimnames=list(NULL,c('Start','End')));
data.frame(Start=sapply(split(m[,'Start'],findInterval(m[,'Start'],na.omit(m[,'End']))),paste,collapse=','),End=na.omit(m[,'End']));
##      Start End
## 0    1,2,3   6
## 1      7,8   9
## 2 11,12,14  15

      


Edit: The problem you ran into is that in your real data, some of the gaps between the input values End

do not contain the input values Start

. My solution above incorrectly skips these intervals from the output vector Start

, resulting in a length mismatch with the output vector End

.

Here's a fixed solution:

end <- na.omit(m[,'End']);
data.frame(Start=unname(sapply(split(m[,'Start'],findInterval(m[,'Start'],end))[as.character(0:c(length(end)-1))],paste,collapse=',')),End=end);
##      Start End
## 1    1,2,3   6
## 2      7,8   9
## 3 11,12,14  15

      



Here's a demo of a test matrix that has empty spacing:

m <- matrix(c(1,2,3,11,12,14,6,9,15,NA,NA,NA),ncol=2,dimnames=list(NULL,c('Start','End')));
m;
##      Start End
## [1,]     1   6
## [2,]     2   9
## [3,]     3  15
## [4,]    11  NA
## [5,]    12  NA
## [6,]    14  NA
end <- na.omit(m[,'End']);
data.frame(Start=unname(sapply(split(m[,'Start'],findInterval(m[,'Start'],end))[as.character(0:c(length(end)-1))],paste,collapse=',')),End=end);
##      Start End
## 1    1,2,3   6
## 2            9
## 3 11,12,14  15

      

As you can see, for an empty spacing, the value that results in the output vector Start

is an empty string, which I think is a reasonable result. You can change the result after that if you like.

Finally, here's a demo using real data that you sent to Dropbox:

m <- read.table('start_end.txt',col.names=c('Start','End'));
head(m);
##   Start   End
## 1 11165 10548
## 2 12416 11799
## 3 12466 11900
## 4 12691 11976
## 5 12834 13336
## 6 13320 14028
end <- na.omit(m[,'End']);
system.time({ out <- data.frame(Start=unname(sapply(split(m[,'Start'],findInterval(m[,'Start'],end))[as.character(0:c(length(end)-1))],paste,collapse=',')),End=end); });
##    user  system elapsed
##  21.234   0.015  21.251
head(out);
##                           Start   End
## 1                               10548
## 2                         11165 11799
## 3                               11900
## 4                               11976
## 5 12416,12466,12691,12834,13320 13336
## 6       13425,13571,13703,13920 14028
nrow(out);
## [1] 131668

      

+5


source


Here is a simple basic R version

with(as.data.frame(dat), {
  data.frame(
    Start=tapply(Start, cut(Start, c(0, End)), c),
    End=na.omit(End)
  )
})
#        Start End
# 1    1, 2, 3   6
# 2       7, 8   9
# 3 11, 12, 14  15

      

Other

with(as.data.frame(dat), {
  group <- as.integer(cut(Start, c(0, End)))                  # assign Start values to End groups
  data.frame(
    Start=unclass(by(dat, group, function(g) g[["Start"]])),  # combine Start groups
    End=unique(na.omit(End))                                  # Remove duplicate/NA End values
  )
})

      

+2


source


Ugly solution dplyr

:

library(dplyr)
df <- as.data.frame(df)

df %>% mutate(End = V2[findInterval(V1, na.omit(V2)) + 1]) %>%
       group_by(End) %>%
       summarise(Start = paste(V1, collapse=", "))

      

Edit - with findInterval thanks to @bgoldst

+2


source







All Articles