R - speed up nested loops (vectorization?) Conditioned on sequences of different sizes

With a dataset with three columns (X position, Y position and some VAL value), I would like to perform some operation (for example average) on all VALs included in some XxY bins / bins (that is, I want a grid of my space ).

First, I wrote the following naive function ( myT

is the transmitted dataset, xbounds

and ybounds

are vectors of consecutive interval discontinuities (bins)):

calcPerBin1 <- function(myT, xbounds, ybounds) {
  newT <- data.frame(matrix(0, nrow=(length(xbounds)-1)*(length(ybounds)-1), ncol=3))
  names(newT) <- c("X","Y","MEAN")
  line <- 1
  for (i in 1:(length(xbounds)-1)) {
    for (j in 1:(length(ybounds)-1)) {
      myTsubset <- myT[myT$X >= xbounds[i] & myT$X < xbounds[i+1] &
                       myT$Y >= ybounds[j] & myT$Y < ybounds[j+1], ]
      newT$MEAN[line] <- mean(myTsubset$VAL)
      newT$X[line] <- mean(c(xbounds[i], xbounds[i+1]))
      newT$Y[line] <- mean(c(ybounds[j], ybounds[j+1]))
      line <- line+1
    }
  }
  return(newT)
}

      

SHORTCUT question: how to improve the code above? (my next first attempts - can be skipped if too long!)


The double loop for

is of course very suboptimal, and its runtime is terrible (no way to use this with my real dataset). Thus, I tried to execute the following code (i.e. the inner loop is vectorized, if I'm not mistaken):

calcPerBin2 <- function(myT,xbounds, ybounds) {
  newT <- data.frame(matrix(0, nrow=(length(xbounds)-1)*(length(ybounds)-1), ncol=3))
  names(newT) <- c("X","Y","MEAN")
  xboundsmean <- vector() ; yboundsmean <- vector()
  for (i in 1:(length(xbounds)-1)) {
    xboundsmean <- c(xboundsmean, mean(c(xbounds[i],xbounds[i+1])))}
  for (i in 1:(length(ybounds)-1)) {
    yboundsmean <- c(yboundsmean, mean(c(ybounds[i],ybounds[i+1])))}
  xyvals <- expand.grid(xmid=xboundsmean, ymid=yboundsmean)
  xyvals$xmin <- xyvals$xmid-binsize/2
  xyvals$xmax <- xyvals$xmid+binsize/2
  xyvals$ymin <- xyvals$ymid-binsize/2
  xyvals$ymax <- xyvals$ymid+binsize/2
  res <- vector()
  for (i in 1:dim(xyvals)[1]) {
    cond <- (myT$X >= xyvals$xmin[i] & myT$X < xyvals$xmax[i] &
               myT$Y >= xyvals$ymin[i] & myT$Y < xyvals$ymax[i])
    res <- c(res, mean(myT$VAL[cond]))
  }
  newT$MEAN <- res
  newT$X <- xyvals[,1]
  newT$Y <- xyvals[,2]
  return(newT)
}

      

This is very ugly, so I tried to do the following:

calcPerBin2.2 <- function(myT,xbounds, ybounds, sizeofbin) {
  newT <- data.frame(matrix(0, nrow=(length(xbounds)-1)*(length(ybounds)-1), ncol=3))
  names(newT) <- c("X","Y","MEAN")
  xcut <- cut(myT$X, breaks=xbounds)
  ycut <- cut(myT$Y, breaks=ybounds)
  xycut <- expand.grid(XCUT=levels(xcut), YCUT=levels(ycut))
  xylowers <- cbind(xlower = as.numeric(sub("\\((.+),.*", "\\1", xycut$XCUT) ),
                    ylower = as.numeric(sub("\\((.+),.*", "\\1", xycut$YCUT) ))
  res <- vector()
  for (i in 1:dim(xycut)[1]) {
    cond <- (xcut==xycut$XCUT[i] & ycut==xycut$YCUT[i])
    res <- c(res, mean(myT$VAL[cond]))
  }
  newT$MEAN <- res
  newT$X <- xylowers[,1]+sizeofbin/2
  newT$Y <- xylowers[,2]+sizeofbin/2
  return(newT)
}

      

I can run this for example:

# Control parameters
xmax <- 500 
ymax <- 1000 
N <- 100000
binsize <- 50

xbins <- seq(0,xmax,binsize)
ybins <- seq(0,ymax,binsize) # xbins and ybins do NOT have the same size

# Generate dummy data
xcoords <- runif(N, 1, xmax) 
ycoords <- runif(N, 1, ymax) 
vals <- xcoords+ycoords**2
data <- data.frame(cbind(X=xcoords, Y=ycoords, VAL=vals))

# Run
system.time(test1 <- calcPerBin1(data, xbins, ybins))
system.time(test2 <- calcPerBin2(data, xbins, ybins))
system.time(test2.2 <- calcPerBin2.2(data, xbins, ybins, binsize))

      

A small improvement is obtained calcPerBin2

, but calcPerBin2.2

even worse than calcPerBin1

(and, yes, all the codes are ugly). My problem here is that it's not too clear to me how to replace (vectorize?) The loop remaining in calcPerBin2

. For example, how can I efficiently write a condition based myT$X

on xyvals$xmin

with this last one in vector form (they are not the same size) instead of the indexed form I am using in calcPerBin2

? Any suggestions for improving the above code are appreciated - thanks.

+3


source to share


1 answer


You can do most of this all in three lines (using zoo

for rollmean

):

library(zoo) # load the package
data$X <- cut(data$X, xbins, labels = rollmean(xbins, 2))
data$Y <- cut(data$Y, ybins, labels = rollmean(ybins, 2))
res <- aggregate(VAL ~ X + Y, data, mean)

      

Check the result:

# order it the same way as in test1, then show the first lines
head(res[order(res$X, res$Y),]) 
#    X   Y        VAL
#1  25  25   900.8305
#11 25  75  5957.4972
#21 25 125 15680.8103
#31 25 175 30877.6696
#41 25 225 50688.4860
#51 25 275 75961.8558

      



Compare it to the result of the original function:

test1 <- calcPerBin1(data, xbins, ybins)
head(test1)
#   X   Y       MEAN
#1 25  25   900.8305
#2 25  75  5957.4972
#3 25 125 15680.8103
#4 25 175 30877.6696
#5 25 225 50688.4860
#6 25 275 75961.8558

      

Benchmark:

fastbin <- function(data, xbins, ybins){
  data$X <- cut(data$X, xbins, labels = rollmean(xbins, 2))
  data$Y <- cut(data$Y, ybins, labels = rollmean(ybins, 2))
  aggregate(VAL ~ X + Y, data, mean)
}

library(dplyr)   # for faster aggregation
fastbin.dplyr <- function(data, xbins, ybins){
  data %>%
    mutate(X = cut(X, xbins, labels = rollmean(xbins, 2)),
           Y = cut(Y, ybins, labels = rollmean(ybins, 2))) %>%
    group_by(X, Y) %>% 
    summarise(Val = mean(VAL))
}

system.time(test1 <- calcPerBin1(data, xbins, ybins))
       User      System     elapsed 
       3.47        0.12        3.59 

system.time(res.fastbin <- fastbin(data, xbins, ybins))
       User      System     elapsed 
       1.01        0.02        1.05 

system.time(res.fastbin.dplyr <- fastbin.dplyr(data, xbins, ybins))
       User      System     elapsed 
       0.06        0.00        0.06 

      

+8


source







All Articles