R - speed up nested loops (vectorization?) Conditioned on sequences of different sizes
With a dataset with three columns (X position, Y position and some VAL value), I would like to perform some operation (for example average) on all VALs included in some XxY bins / bins (that is, I want a grid of my space ).
First, I wrote the following naive function ( myT
is the transmitted dataset, xbounds
and ybounds
are vectors of consecutive interval discontinuities (bins)):
calcPerBin1 <- function(myT, xbounds, ybounds) {
newT <- data.frame(matrix(0, nrow=(length(xbounds)-1)*(length(ybounds)-1), ncol=3))
names(newT) <- c("X","Y","MEAN")
line <- 1
for (i in 1:(length(xbounds)-1)) {
for (j in 1:(length(ybounds)-1)) {
myTsubset <- myT[myT$X >= xbounds[i] & myT$X < xbounds[i+1] &
myT$Y >= ybounds[j] & myT$Y < ybounds[j+1], ]
newT$MEAN[line] <- mean(myTsubset$VAL)
newT$X[line] <- mean(c(xbounds[i], xbounds[i+1]))
newT$Y[line] <- mean(c(ybounds[j], ybounds[j+1]))
line <- line+1
}
}
return(newT)
}
SHORTCUT question: how to improve the code above? (my next first attempts - can be skipped if too long!)
The double loop for
is of course very suboptimal, and its runtime is terrible (no way to use this with my real dataset). Thus, I tried to execute the following code (i.e. the inner loop is vectorized, if I'm not mistaken):
calcPerBin2 <- function(myT,xbounds, ybounds) {
newT <- data.frame(matrix(0, nrow=(length(xbounds)-1)*(length(ybounds)-1), ncol=3))
names(newT) <- c("X","Y","MEAN")
xboundsmean <- vector() ; yboundsmean <- vector()
for (i in 1:(length(xbounds)-1)) {
xboundsmean <- c(xboundsmean, mean(c(xbounds[i],xbounds[i+1])))}
for (i in 1:(length(ybounds)-1)) {
yboundsmean <- c(yboundsmean, mean(c(ybounds[i],ybounds[i+1])))}
xyvals <- expand.grid(xmid=xboundsmean, ymid=yboundsmean)
xyvals$xmin <- xyvals$xmid-binsize/2
xyvals$xmax <- xyvals$xmid+binsize/2
xyvals$ymin <- xyvals$ymid-binsize/2
xyvals$ymax <- xyvals$ymid+binsize/2
res <- vector()
for (i in 1:dim(xyvals)[1]) {
cond <- (myT$X >= xyvals$xmin[i] & myT$X < xyvals$xmax[i] &
myT$Y >= xyvals$ymin[i] & myT$Y < xyvals$ymax[i])
res <- c(res, mean(myT$VAL[cond]))
}
newT$MEAN <- res
newT$X <- xyvals[,1]
newT$Y <- xyvals[,2]
return(newT)
}
This is very ugly, so I tried to do the following:
calcPerBin2.2 <- function(myT,xbounds, ybounds, sizeofbin) {
newT <- data.frame(matrix(0, nrow=(length(xbounds)-1)*(length(ybounds)-1), ncol=3))
names(newT) <- c("X","Y","MEAN")
xcut <- cut(myT$X, breaks=xbounds)
ycut <- cut(myT$Y, breaks=ybounds)
xycut <- expand.grid(XCUT=levels(xcut), YCUT=levels(ycut))
xylowers <- cbind(xlower = as.numeric(sub("\\((.+),.*", "\\1", xycut$XCUT) ),
ylower = as.numeric(sub("\\((.+),.*", "\\1", xycut$YCUT) ))
res <- vector()
for (i in 1:dim(xycut)[1]) {
cond <- (xcut==xycut$XCUT[i] & ycut==xycut$YCUT[i])
res <- c(res, mean(myT$VAL[cond]))
}
newT$MEAN <- res
newT$X <- xylowers[,1]+sizeofbin/2
newT$Y <- xylowers[,2]+sizeofbin/2
return(newT)
}
I can run this for example:
# Control parameters
xmax <- 500
ymax <- 1000
N <- 100000
binsize <- 50
xbins <- seq(0,xmax,binsize)
ybins <- seq(0,ymax,binsize) # xbins and ybins do NOT have the same size
# Generate dummy data
xcoords <- runif(N, 1, xmax)
ycoords <- runif(N, 1, ymax)
vals <- xcoords+ycoords**2
data <- data.frame(cbind(X=xcoords, Y=ycoords, VAL=vals))
# Run
system.time(test1 <- calcPerBin1(data, xbins, ybins))
system.time(test2 <- calcPerBin2(data, xbins, ybins))
system.time(test2.2 <- calcPerBin2.2(data, xbins, ybins, binsize))
A small improvement is obtained calcPerBin2
, but calcPerBin2.2
even worse than calcPerBin1
(and, yes, all the codes are ugly). My problem here is that it's not too clear to me how to replace (vectorize?) The loop remaining in calcPerBin2
. For example, how can I efficiently write a condition based myT$X
on xyvals$xmin
with this last one in vector form (they are not the same size) instead of the indexed form I am using in calcPerBin2
? Any suggestions for improving the above code are appreciated - thanks.
source to share
You can do most of this all in three lines (using zoo
for rollmean
):
library(zoo) # load the package
data$X <- cut(data$X, xbins, labels = rollmean(xbins, 2))
data$Y <- cut(data$Y, ybins, labels = rollmean(ybins, 2))
res <- aggregate(VAL ~ X + Y, data, mean)
Check the result:
# order it the same way as in test1, then show the first lines
head(res[order(res$X, res$Y),])
# X Y VAL
#1 25 25 900.8305
#11 25 75 5957.4972
#21 25 125 15680.8103
#31 25 175 30877.6696
#41 25 225 50688.4860
#51 25 275 75961.8558
Compare it to the result of the original function:
test1 <- calcPerBin1(data, xbins, ybins)
head(test1)
# X Y MEAN
#1 25 25 900.8305
#2 25 75 5957.4972
#3 25 125 15680.8103
#4 25 175 30877.6696
#5 25 225 50688.4860
#6 25 275 75961.8558
Benchmark:
fastbin <- function(data, xbins, ybins){
data$X <- cut(data$X, xbins, labels = rollmean(xbins, 2))
data$Y <- cut(data$Y, ybins, labels = rollmean(ybins, 2))
aggregate(VAL ~ X + Y, data, mean)
}
library(dplyr) # for faster aggregation
fastbin.dplyr <- function(data, xbins, ybins){
data %>%
mutate(X = cut(X, xbins, labels = rollmean(xbins, 2)),
Y = cut(Y, ybins, labels = rollmean(ybins, 2))) %>%
group_by(X, Y) %>%
summarise(Val = mean(VAL))
}
system.time(test1 <- calcPerBin1(data, xbins, ybins))
User System elapsed
3.47 0.12 3.59
system.time(res.fastbin <- fastbin(data, xbins, ybins))
User System elapsed
1.01 0.02 1.05
system.time(res.fastbin.dplyr <- fastbin.dplyr(data, xbins, ybins))
User System elapsed
0.06 0.00 0.06
source to share