Fixing bottleneck in a function returning frame data of distances between raster cells in R
I am trying to fix the nested loops loop bottleneck in a function. I have already tried three functions and cannot fix them. Any help, especially if it's a data.table or rcpp solution, much appreciated. This is an example with a 100 cell raster, but I have a few of over 1,000,000 cells, so speed makes sense.
Example
Consider the following bitmap:
library(raster)
r <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))
extent(r) <- c(-10, 10, -10, 10)
It is a small bitmap, only one hundred cells. I made the following function to study the movement of animals, where I use a Raster and the maximum distance that animals can move in time. There is a bottleneck in the nested loop that I have not been able to resolve.
What I get in return is a dataframe with the following variables:
from Identifier of the cell in the raster from which the animal can move from
to Identifier of a cell in the raster from which the animal can jump to
Dist Distance from cell to cell to
Loading the required libraries
library(gdistance)
library(dplyr)
library(tidyr)
DistConect1 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
#Start a list
connections <- list()
#For each pair of cells in B
for (i in 1:nrow(B)){
arcs <- list()
#Create a temporal raster for each row with the distance from cell xy to all other cells
temp <- accCost(h16,B[i,])
#Make all cells with distance to origin larger than maximum dispersal distance equal NA
temp[values(temp) > Distance] = NA
#Create a vector with only the ID raster values of cells that are not NA (To reduce the next loop)
index <- c(1:ncell(temp))[!is.na(values(temp))]
for (j in index){
#For each cell pair i,j generate a vector i,j, distance
arcs[[j]] <- c(i, j, temp[j])
}
#Gather all vectors in a data frame
connections[[i]] <- do.call("rbind", arcs)
#name columns
colnames(connections[[i]]) <- c("from", "to", "dist")
#This is just to see where I am in the function
print(paste(i, "of", nrow(B)))
}
#Get everything together as a large data frame
connections <- do.call("rbind", connections)
connections <- as.data.frame(connections)
#return connections
return(connections)
}
Tried to fix it with lapply
But I only got rid of one of the loops
DistConect2 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
all.cells <- function(i){
arcs <- list()
temp <- accCost(h16, B[i, ])
temp[values(temp) > Distance] = NA
index <- c(1:ncell(temp))[!is.na(values(temp))]
# all.index <- function(j){
for (j in index) {
arcs[[j]] <- c(i, j, temp[j])
}
# arcs <- lapply(index, all.index)
connections <- do.call("rbind", arcs)
# connections <- do.call("rbind", arcs)
colnames(connections) <- c("from", "to", "dist")
return(connections)
}
connections <- lapply(1:nrow(B), all.cells)
#For each pair of cells in B
#Get everything together as a large data frame
connections <- do.call("rbind", connections)
connections <- as.data.frame(connections)
#return connections
return(connections)
}
but using the microbenchmarck package to test the differences there is no difference:
microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), times = 4)
Micro-detection result
DistConect1(r, Distance = 1e+06) 10.283309 10.40662 10.55879 10.58380 10.71097 10.78428 4 DistConect2(r, Distance = 1e+06) 9.892371 10.07783 10.35453 10.41144 10.63124 10.70288 4
CLD a
parallelization
I also tried to parallelize, but it actually takes longer:
DistConect2b <- function (Raster, Distance, cpus = NULL)
{
h16 <- transition(Raster, transitionFunction = function(x) {1}, 16, symm = FALSE)
h16 <- geoCorrection(h16, scl = FALSE)
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
all.cells <- function(i){
arcs <- list()
temp <- accCost(h16, B[i, ])
temp[values(temp) > Distance] = NA
index <- c(1:ncell(temp))[!is.na(values(temp))]
# all.index <- function(j){
for (j in index) {
arcs[[j]] <- c(i, j, temp[j])
}
# arcs <- lapply(index, all.index)
connections <- do.call("rbind", arcs)
colnames(connections) <- c("from", "to", "dist")
return(connections)
# cat(paste(i, "of", nrow(B)))
}
require(snowfall)
sfInit(parallel=TRUE, cpus=cpus)
sfLibrary(gdistance)
sep.connections <- sfClusterApplyLB(1:nrow(B), all.cells)
sfStop(nostop=FALSE)
# sep.connections <- lapply(1:nrow(B), all.cells)
connections <- do.call("rbind", sep.connections)
connections <- as.data.frame(connections)
}
Microbenchmark results
microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), DistConect2b(r, Distance = 1000000, cpus = 2), times = 4)
expr min lq mean median uq
DistConect1(r, Distance = 1e+06) 10.145234 10.216611 10.35301 10.36512 10.48942
DistConect2(r, Distance = 1e+06) 9.963549 9.974315 10.01547 10.01173 10.05662
DistConnect2b(r, Distance = 1e+06, cpus = 2) 11.311966 11.486705 12.02240 11.81034 12.55809
Other tests after answer
After the big answer I got, I tried to go one step further and added a replacement for loop in the code instead:
DistConect4 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
all.cells <- function(i){
temp <- accCost2(h16, B[i, ])
index <- which(temp < 1000000) # all.index <- function(j){
connections <- cbind(i, index, temp[index])
return(connections)
}
connections <- lapply(1:nrow(B), all.cells)
connections <- as.data.frame(do.call("rbind", connections))
#Get everything together as a large data frame
colnames(connections) <- c("from", "to", "dist")
#return connections
return(connections)
}
Using acccost2 function defined below
accCost2 <- function(x, fromCoords) {
fromCells <- cellFromXY(x, fromCoords)
tr <- transitionMatrix(x)
tr <- rBind(tr, rep(0, nrow(tr)))
tr <- cBind(tr, rep(0, nrow(tr)))
startNode <- nrow(tr)
adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
tr[adjP] <- Inf
adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}
But when I tried
timing <- microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), DistConnect2b(r, Distance = 1000000, cpus = 4), DistConect3(r, Distance = 1000000), DistConect4(r, Distance = 1000000) ,times = 20)
print(timing, unit = "relative")
he didn't speed up the process
expr min lq mean median uq
DistConect1(r, Distance = 1e+06) 12.400299 12.43078 12.407909 12.452043 12.502665
DistConect2(r, Distance = 1e+06) 12.238812 12.23194 12.168468 12.191067 12.155786
DistConnect2b(r, Distance = 1e+06, cpus = 4) 13.994594 14.01760 13.909674 13.978060 13.947062
DistConect3(r, Distance = 1e+06) 1.000000 1.00000 1.000000 1.000000 1.000000
DistConect4(r, Distance = 1e+06) 0.997329 1.00141 1.019697 1.002112 1.005626
I thought the application was always faster than any idea, why doesn't it speed up the process?
source to share
You can get rid of the inner loop by changing
temp[values(temp) > Distance] = NA
index <- c(1:ncell(temp))[!is.na(values(temp))]
for (j in index){
arcs[[j]] <- c(i, j, temp[j])
}
connections[[i]] <- do.call("rbind", arcs)
colnames(connections[[i]]) <- c("from", "to", "dist")
in it:
index <- which(temp < Distance)
connections[[i]] <- cbind(i, index, temp[index])
I also looked accCost
at which seems to be the slowest function here. Unfortunately, it names some C code for finding the shortest paths, which probably means there isn't much to optimize. I created accCost2
where I removed some of the code, but I doubt it matters much. I'm also not sure how efficient the parallelization is here, since the execution time is not that long. Below are some benchmarks showing worthy improvement.
library(gdistance)
library(dplyr)
library(tidyr)
library(raster)
r <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))
extent(r) <- c(-10, 10, -10, 10)
DistConect1 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
#Start a list
connections <- list()
#For each pair of cells in B
for (i in 1:nrow(B)){
arcs <- list()
#Create a temporal raster for each row with the distance from cell xy to all other cells
temp <- accCost(h16,B[i,])
#Make all cells with distance to origin larger than maximum dispersal distance equal NA
temp[values(temp) > Distance] = NA
#Create a vector with only the ID raster values of cells that are not NA (To reduce the next loop)
index <- c(1:ncell(temp))[!is.na(values(temp))]
for (j in index){
#For each cell pair i,j generate a vector i,j, distance
arcs[[j]] <- c(i, j, temp[j])
}
#Gather all vectors in a data frame
connections[[i]] <- do.call("rbind", arcs)
#name columns
colnames(connections[[i]]) <- c("from", "to", "dist")
#This is just to see where I am in the function
# print(paste(i, "of", nrow(B)))
}
#Get everything together as a large data frame
connections <- do.call("rbind", connections)
connections <- as.data.frame(connections)
#return connections
return(connections)
}
DistConect3 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
#Start a list
connections <- list()
#For each pair of cells in B
for (i in 1:nrow(B)){
#Create a temporal raster for each row with the distance from cell xy to all other cells
temp <- accCost2(h16,B[i,])
index <- which(temp < Distance)
connections[[i]] <- cbind(i, index, temp[index])
}
#Get everything together as a large data frame
connections <- do.call("rbind", connections)
connections <- as.data.frame(connections)
colnames(connections) <- c("from", "to", "dist")
#return connections
return(connections)
}
accCost2 <- function(x, fromCoords) {
fromCells <- cellFromXY(x, fromCoords)
tr <- transitionMatrix(x)
tr <- rBind(tr, rep(0, nrow(tr)))
tr <- cBind(tr, rep(0, nrow(tr)))
startNode <- nrow(tr)
adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
tr[adjP] <- Inf
adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}
d1 <- DistConect1(r, Distance = 1000)
d3 <- DistConect3(r, Distance = 1000)
# test float equality
all.equal(d1, d3, check.attributes = FALSE)
# TRUE
timing1 <- microbenchmark(
DistConect1(r, Distance = 1000),
DistConect3(r, Distance = 1000),
times = 10
)
print(timing1, unit = "relative")
# expr min lq mean median uq max neval cld
# 1 DistConect1(r, Distance = 1000) 2.077804 1.991303 1.881478 1.933114 1.951884 1.531302 10 b
# 2 DistConect3(r, Distance = 1000) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
timing2 <- microbenchmark(
DistConect1(r, Distance = 10000),
DistConect3(r, Distance = 10000),
times = 10
)
print(timing2, unit = "relative")
# expr min lq mean median uq max neval cld
# DistConect1(r, Distance = 10000) 2.018707 1.936773 1.966994 1.956694 1.964021 2.094569 10 b
# DistConect3(r, Distance = 10000) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
source to share