Using data.table to subset with inequality (range) conditions

I have a data table with 400k rows, and subsetting it is very slow.

Here is an example data frame:

                 date   name value size car1 car2
1 2015-01-01 07:44:00    bob     1    5    A    D
2 2015-02-02 09:46:00 george   522    2    B    F

      

Now I will filter it the slow way, using subset():

# Two-row example data set; the timestamps are parsed in the GMT time zone.
main <- data.frame(
  date = as.POSIXct(
    c("2015-01-01 07:44:00", "2015-02-02 09:46:00"),
    tz = "GMT"
  ),
  name = c("bob", "george"),
  value = c(1, 522),
  size = c(5, 2),
  car1 = c("A", "B"),
  car2 = c("D", "F")
)
main$date  # print the parsed timestamps

# Keep only the rows that satisfy every condition at once.
# NOTE(review): both date tests use `>=`, while the question text asks for a
# `>=` / `<=` range -- confirm which one is intended before reusing this.
subset(
  main,
  size > 1 &
    value == 522 &
    name == "george" &
    date >= as.POSIXct("2015-01-01 03:44:00", tz = "GMT") &
    date >= as.POSIXct("2015-01-01 08:44:00", tz = "GMT") &
    (car1 == "F" | car2 == "F")
)

                 date   name value size car1 car2
2 2015-02-02 09:46:00 george   522    2    B    F

      

This works and returns 1 row, but it is very slow.

Thanks to some of the answers to another question, using data.table looks much faster, so I would like to use data.table to do the same as above, but I have a bunch of questions.

Here's what I have so far:

   # Attempt at the same lookup through a keyed data.table.
   library(data.table)
   mdt <- as.data.table(main)
   # Key (and therefore sort) the table on all six columns, in this order.
   setkeyv(mdt, c("date", "name", "value", "size", "car1", "car2"))
   # Keyed lookup: the values are matched against the key columns in key order.
   # The timestamp here is built without `tz`, unlike the ones in `main`.
   mdt[J(as.POSIXct("2015-01-01 03:44:00"), "george", 522, 2, "F", "F")]

      

This returns:

date   name value size car1 car2
1: 2015-01-01 03:44:00 george   522    2   NA    F

      

Here are my questions:

(1) I want to use criteria of the form date >= ... and date <= ...; is this possible with data.table? If not, are there any ideas for making the subset faster?

(2) I want to use the criterion (car1 == "F" | car2 == "F"); is this possible? If not, are there any ideas for making the subset faster?

(3) In the output of mdt[...] there is a date, 2015-01-01 03:44:00, that is NOT in the original "main" data frame. What's going on here?

(4) In the output of mdt[...], car1 is NA, although car1 is never NA in the original "main" data frame. What's going on here?

Thanks.

+3


source to share


1 answer


Of course: you just put the criteria in the `i` expression.

# Turn `main` into a data.table, then filter rows with one logical
# expression in `i` -- the same conditions that were passed to subset().
setDT(main)
main[
  size > 1 &
    value == 522 &
    name == "george" &
    date >= as.POSIXct("2015-01-01 03:44:00", tz = "GMT") &
    date >= as.POSIXct("2015-01-01 08:44:00", tz = "GMT") &
    (car1 == "F" | car2 == "F"), ]

      

Result:

                  date   name value size car1 car2
1: 2015-02-02 09:46:00 george   522    2    B    F

      



So, is it faster than `subset`? Yes.

library(data.table)
library(ggplot2)
library(reshape2)

set.seed(1)  # make the random draws below repeatable

# Time one row filter on `n` rows of synthetic data, once through
# data.table's `[` and once through base subset().
#
# Args:
#   n: number of rows to generate.
#
# Returns:
#   A named numeric vector c(n, dt_time, subset_time). Both timings are
#   elapsed wall-clock times in seconds.
cf <- function(n) {
  main <-
    data.frame(date = as.POSIXct(Sys.Date() + runif(n, 0, 100)),
               name = sample(c("bob", "george"), n, replace = TRUE),
               value = round(runif(n, 400, 600), 0),
               size = sample(1:5, n, replace = TRUE),
               car1 = sample(LETTERS[1:6], n, replace = TRUE),
               car2 = sample(LETTERS[1:6], n, replace = TRUE),
               stringsAsFactors = FALSE)
  mdt <- data.table(main)
  setkey(mdt, date, name, value, size, car1, car2)

  # Compute the date window once so both filters use identical bounds.
  date_from <- as.POSIXct(Sys.Date())
  date_to <- as.POSIXct(Sys.Date() + 50)

  # system.time() always reports seconds. `Sys.time() - pre` yields a difftime
  # whose unit is chosen automatically and is then dropped by c(), which can
  # make timings for different `n` incomparable.
  dt_time <- system.time(
    mdt[size > 1 & value > 100 & name == "george" &
          date >= date_from & date <= date_to &
          (car1 == "F" | car2 == "F"), ]
  )[["elapsed"]]

  subset_time <- system.time(
    subset(main,
           size > 1 & value > 100 & name == "george" &
             date >= date_from & date <= date_to &
             (car1 == "F" | car2 == "F"))
  )[["elapsed"]]

  c(n = n, dt_time = dt_time, subset_time = subset_time)
}

# Run the benchmark for n = 10^2, ..., 10^7 rows. vapply() requires every
# call to return exactly three numbers, so a malformed result raises an error
# instead of silently changing the shape of `result` as sapply() could.
result <- vapply(10^(2:7), cf,
                 FUN.VALUE = c(n = 0, dt_time = 0, subset_time = 0))

# One row per run, then long format: one row per (n, timing variable) pair.
# `id.vars` is spelled out in full rather than relying on partial matching.
result <- melt(data.frame(t(result)), id.vars = "n")

# Timing against row count, one line per method, on a log-scaled x axis.
ggplot(result, aes(x = n, y = value, color = variable)) +
  geom_point() +
  geom_line() +
  theme_bw() +
  scale_x_log10()

      

enter image description here

+2


source







All Articles