Fixing bottleneck in a function returning frame data of distances between raster cells in R

Question

Fixing bottleneck in a function returning frame data of distances between raster cells in R

I am trying to fix the nested loops loop bottleneck in a function. I have already tried three functions and cannot fix them. Any help, especially if it's a data.table or rcpp solution, much appreciated. This is an example with a 100 cell raster, but I have a few of over 1,000,000 cells, so speed makes sense.

Example

Consider the following bitmap:

library(raster)


r   <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))

extent(r) <- c(-10, 10, -10, 10)

It is a small bitmap, only one hundred cells. I made the following function to study the movement of animals, where I use a Raster and the maximum distance that animals can move in time. There is a bottleneck in the nested loop that I have not been able to resolve.

What I get in return is a dataframe with the following variables:

from Identifier of the cell in the raster from which the animal can move from

to Identifier of a cell in the raster from which the animal can jump to

Dist Distance from cell to cell to

Loading the required libraries

library(gdistance)
library(dplyr)
library(tidyr)

DistConect1 <- function(Raster, Distance){
      #First we make a transition layer with the function transition from gdistance
      h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
      #Then geocorrect for projection
      h16   <- geoCorrection(h16, scl=FALSE)
      #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
      B <- xyFromCell(Raster, cell = 1:ncell(Raster))
      #This nested loop is where the Bottle neck is
      #Start a list
      connections <- list()
      #For each pair of cells in B
      for (i in 1:nrow(B)){
        arcs <- list()
        #Create a temporal raster for each row with the distance from cell xy to all other cells
        temp <- accCost(h16,B[i,])
        #Make all cells with distance to origin larger than maximum dispersal distance equal NA
        temp[values(temp) > Distance] = NA
        #Create a vector with only the ID raster values of cells that are not NA (To reduce the next loop)
        index <- c(1:ncell(temp))[!is.na(values(temp))]
        for (j in index){
          #For each cell pair i,j generate a vector i,j, distance
          arcs[[j]] <- c(i, j, temp[j])
        }
        #Gather all vectors in a data frame
        connections[[i]] <- do.call("rbind", arcs)
        #name columns
        colnames(connections[[i]]) <- c("from", "to", "dist")
        #This is just to see where I am in the function
        print(paste(i, "of", nrow(B)))
      }
      #Get everything together as a large data frame
      connections <- do.call("rbind", connections)
      connections <- as.data.frame(connections)
      #return connections 
      return(connections)
      }

Tried to fix it with lapply

But I only got rid of one of the loops

DistConect2 <- function(Raster, Distance){
  #First we make a transition layer with the function transition from gdistance
  h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
  #Then geocorrect for projection
  h16   <- geoCorrection(h16, scl=FALSE)
  #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))
  #This nested loop is where the Bottle neck is

  all.cells <- function(i){
    arcs <- list()
    temp <- accCost(h16, B[i, ])
    temp[values(temp) > Distance] = NA
    index <- c(1:ncell(temp))[!is.na(values(temp))]
    # all.index <- function(j){
    for (j in index) {
      arcs[[j]] <- c(i, j, temp[j])
    }
    # arcs <- lapply(index, all.index)
    connections <- do.call("rbind", arcs)
    # connections <- do.call("rbind", arcs)
    colnames(connections) <- c("from", "to", "dist")
    return(connections)
   }

  connections <- lapply(1:nrow(B), all.cells)
  #For each pair of cells in B
  #Get everything together as a large data frame
  connections <- do.call("rbind", connections)
  connections <- as.data.frame(connections)
  #return connections 
  return(connections)
}

but using the microbenchmarck package to test the differences there is no difference:

microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), times = 4)

Micro-detection result

DistConect1(r, Distance = 1e+06) 10.283309 10.40662 10.55879 10.58380 10.71097 10.78428     4
DistConect2(r, Distance = 1e+06)  9.892371 10.07783 10.35453 10.41144 10.63124 10.70288     4

CLD a

parallelization

I also tried to parallelize, but it actually takes longer:

DistConect2b <- function (Raster, Distance, cpus = NULL) 
{
  h16 <- transition(Raster, transitionFunction = function(x) {1}, 16, symm = FALSE)
  h16 <- geoCorrection(h16, scl = FALSE)
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))

  all.cells <- function(i){
    arcs <- list()
    temp <- accCost(h16, B[i, ])
    temp[values(temp) > Distance] = NA
    index <- c(1:ncell(temp))[!is.na(values(temp))]
    # all.index <- function(j){
     for (j in index) {
      arcs[[j]] <- c(i, j, temp[j])
    }
    # arcs <- lapply(index, all.index)
    connections <- do.call("rbind", arcs)

    colnames(connections) <- c("from", "to", "dist")
    return(connections)
    # cat(paste(i, "of", nrow(B)))
  }
  require(snowfall)
  sfInit(parallel=TRUE, cpus=cpus)
  sfLibrary(gdistance)
  sep.connections <- sfClusterApplyLB(1:nrow(B), all.cells)
  sfStop(nostop=FALSE)
  # sep.connections <- lapply(1:nrow(B), all.cells)
  connections <- do.call("rbind", sep.connections)
  connections <- as.data.frame(connections)

}

Microbenchmark results

microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000),  DistConect2b(r, Distance = 1000000, cpus = 2), times = 4)


                            expr       min        lq     mean   median       uq
             DistConect1(r, Distance = 1e+06) 10.145234 10.216611 10.35301 10.36512 10.48942
             DistConect2(r, Distance = 1e+06)  9.963549  9.974315 10.01547 10.01173 10.05662
 DistConnect2b(r, Distance = 1e+06, cpus = 2) 11.311966 11.486705 12.02240 11.81034 12.55809

Other tests after answer

After the big answer I got, I tried to go one step further and added a replacement for loop in the code instead:

DistConect4 <- function(Raster, Distance){
  #First we make a transition layer with the function transition from gdistance
  h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
   #Then geocorrect for projection
  h16   <- geoCorrection(h16, scl=FALSE)
  #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))
  #This nested loop is where the Bottle neck is

  all.cells <- function(i){

    temp <- accCost2(h16, B[i, ])
    index <- which(temp < 1000000)  # all.index <- function(j){
    connections <- cbind(i, index, temp[index])

    return(connections)

  }

  connections <- lapply(1:nrow(B), all.cells)

  connections <- as.data.frame(do.call("rbind", connections))
  #Get everything together as a large data frame
  colnames(connections) <- c("from", "to", "dist")
  #return connections 
  return(connections)
}

Using acccost2 function defined below

accCost2 <- function(x, fromCoords) {

  fromCells <- cellFromXY(x, fromCoords)
  tr <- transitionMatrix(x)
  tr <- rBind(tr, rep(0, nrow(tr)))
  tr <- cBind(tr, rep(0, nrow(tr)))
  startNode <- nrow(tr)
  adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
  tr[adjP] <- Inf
  adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
  E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
  return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}

But when I tried

timing <- microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000),  DistConnect2b(r, Distance = 1000000, cpus = 4), DistConect3(r, Distance = 1000000), DistConect4(r, Distance = 1000000) ,times = 20)

print(timing, unit = "relative")

he didn't speed up the process

                                         expr       min       lq      mean        median        uq
             DistConect1(r, Distance = 1e+06) 12.400299 12.43078 12.407909 12.452043 12.502665
             DistConect2(r, Distance = 1e+06) 12.238812 12.23194 12.168468 12.191067 12.155786
 DistConnect2b(r, Distance = 1e+06, cpus = 4) 13.994594 14.01760 13.909674 13.978060 13.947062
             DistConect3(r, Distance = 1e+06)  1.000000  1.00000  1.000000  1.000000  1.000000
             DistConect4(r, Distance = 1e+06)  0.997329  1.00141  1.019697  1.002112  1.005626

I thought the application was always faster than any idea, why doesn't it speed up the process?

+3

r data.table rcpp r-raster

Derek corcoran Jul 17 17 at 20:08

source to share

1 answer

Vandenman · Accepted Answer · 2017-07-17T22:02:56+0000

You can get rid of the inner loop by changing

temp[values(temp) > Distance] = NA
index <- c(1:ncell(temp))[!is.na(values(temp))]
for (j in index){
  arcs[[j]] <- c(i, j, temp[j])
}
connections[[i]] <- do.call("rbind", arcs)
colnames(connections[[i]]) <- c("from", "to", "dist")

in it:

index <- which(temp < Distance)
connections[[i]] <- cbind(i, index, temp[index])

I also looked accCost

at which seems to be the slowest function here. Unfortunately, it names some C code for finding the shortest paths, which probably means there isn't much to optimize. I created accCost2

where I removed some of the code, but I doubt it matters much. I'm also not sure how efficient the parallelization is here, since the execution time is not that long. Below are some benchmarks showing worthy improvement.

library(gdistance)
library(dplyr)
library(tidyr)
library(raster)

r   <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))

extent(r) <- c(-10, 10, -10, 10)

DistConect1 <- function(Raster, Distance){
    #First we make a transition layer with the function transition from gdistance
    h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
    #Then geocorrect for projection
    h16   <- geoCorrection(h16, scl=FALSE)
    #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
    B <- xyFromCell(Raster, cell = 1:ncell(Raster))
    #This nested loop is where the Bottle neck is
    #Start a list
    connections <- list()
    #For each pair of cells in B
    for (i in 1:nrow(B)){
        arcs <- list()
        #Create a temporal raster for each row with the distance from cell xy to all other cells
        temp <- accCost(h16,B[i,])
        #Make all cells with distance to origin larger than maximum dispersal distance equal NA
        temp[values(temp) > Distance] = NA
        #Create a vector with only the ID raster values of cells that are not NA (To reduce the next loop)
        index <- c(1:ncell(temp))[!is.na(values(temp))]
        for (j in index){
            #For each cell pair i,j generate a vector i,j, distance
            arcs[[j]] <- c(i, j, temp[j])
        }
        #Gather all vectors in a data frame
        connections[[i]] <- do.call("rbind", arcs)
        #name columns
        colnames(connections[[i]]) <- c("from", "to", "dist")
        #This is just to see where I am in the function
        # print(paste(i, "of", nrow(B)))
    }
    #Get everything together as a large data frame
    connections <- do.call("rbind", connections)
    connections <- as.data.frame(connections)
    #return connections 
    return(connections)
}

DistConect3 <- function(Raster, Distance){
    #First we make a transition layer with the function transition from gdistance
    h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
    #Then geocorrect for projection
    h16   <- geoCorrection(h16, scl=FALSE)
    #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
    B <- xyFromCell(Raster, cell = 1:ncell(Raster))
    #This nested loop is where the Bottle neck is
    #Start a list
    connections <- list()
    #For each pair of cells in B
    for (i in 1:nrow(B)){
        #Create a temporal raster for each row with the distance from cell xy to all other cells
        temp <- accCost2(h16,B[i,])
        index <- which(temp < Distance)
        connections[[i]] <- cbind(i, index, temp[index])
    }
    #Get everything together as a large data frame
    connections <- do.call("rbind", connections)
    connections <- as.data.frame(connections)
    colnames(connections) <- c("from", "to", "dist")
    #return connections 
    return(connections)
}

accCost2 <- function(x, fromCoords) {

    fromCells <- cellFromXY(x, fromCoords)
    tr <- transitionMatrix(x)
    tr <- rBind(tr, rep(0, nrow(tr)))
    tr <- cBind(tr, rep(0, nrow(tr)))
    startNode <- nrow(tr)
    adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
    tr[adjP] <- Inf
    adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
    E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
    return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}


d1 <- DistConect1(r, Distance = 1000)
d3 <- DistConect3(r, Distance = 1000)

# test float equality
all.equal(d1, d3, check.attributes = FALSE)
# TRUE

timing1 <- microbenchmark(
    DistConect1(r, Distance = 1000),
    DistConect3(r, Distance = 1000),
    times = 10
)
print(timing1, unit = "relative")
#   expr                            min      lq       mean     median   uq       max      neval cld
# 1 DistConect1(r, Distance = 1000) 2.077804 1.991303 1.881478 1.933114 1.951884 1.531302 10     b
# 2 DistConect3(r, Distance = 1000) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10    a

timing2 <- microbenchmark(
    DistConect1(r, Distance = 10000),
    DistConect3(r, Distance = 10000),
    times = 10
)
print(timing2, unit = "relative")
# expr                             min      lq       mean     median   uq       max         neval cld
# DistConect1(r, Distance = 10000) 2.018707 1.936773 1.966994 1.956694 1.964021 2.094569    10     b
# DistConect3(r, Distance = 10000) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10    a

Fixing bottleneck in a function returning frame data of distances between raster cells in R

Example

Tried to fix it with lapply

Micro-detection result

parallelization

Other tests after answer

More articles: