Mutate () with if / else function
I have an example dataframe
df <- data.frame(cust = sample(1:100, 1000, TRUE),
channel = sample(c("WEB", "POS"), 1000, TRUE))
what am i trying to mutate
get_channels <- function(data) {
d <- data
if(unique(d) %>% length() == 2){
d <- "Both"
} else {
if(unique(d) %>% length() < 2 && unique(d) == "WEB") {
d <- "Web"
} else {
d <- "POS"
}
}
return(d)
}
This works seamlessly and on small data frames, it doesn't take time.
start.time <- Sys.time()
df %>%
group_by(cust) %>%
mutate(chan = get_channels(channel)) %>%
group_by(cust) %>%
slice(1) %>%
group_by(chan) %>%
summarize(count = n()) %>%
mutate(perc = count/sum(count))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
Time difference 0.34602 sec
However, when the dataframe gets quite large, say of order> 1,000,000 or more cust
, my base if/else
fx takes a lot more...
How can I optimize this feature to make it faster?
source to share
You have to use the data.table for this.
setDT(df)
t1 = Sys.time()
df = df[ , .(channels = ifelse(uniqueN(channel) == 2, "both", as.character(channel[1]))), by = .(cust)]
> Sys.time() - t1
Time difference of 0.00500083 secs
> head(df)
cust channels
1: 37 both
2: 45 both
3: 74 both
4: 20 both
5: 1 both
6: 68 both
source to share
You can do it in an R base using something like this:
web_cust <- unique(df$cust[df$channel=="WEB"])
pos_cust <- unique(df$cust[df$channel=="POS"])
both <- length(intersect(web_cust, pos_cust))
web_only <- length(setdiff(web_cust, pos_cust))
pos_only <- length(setdiff(pos_cust, web_cust))
Data:
set.seed(1)
df <- data.frame(cust = sample(2e6, 1e7, TRUE),
channel = sample(c("WEB", "POS"), 1e7, TRUE),
stringsAsFactors = F)
source to share
A faster version dplyr
that takes about 1/3 of the time, but is probably even slower than the datasheet version. uniqueN
borrowed from @ Kristoferson's answer.
df %>%
group_by(cust) %>%
summarize(chan = if_else(uniqueN(channel) == 2, "Both", as.character(channel[1]))) %>%
group_by(chan) %>%
summarize(n = n() ) %>%
mutate(perc = n /sum(n))
In addition, your original can be greatly improved by optimizing your function as follows:
get_channels <- function(data) {
ud <- unique(data)
udl <- length(ud)
if(udl == 2) {
r <- "Both"
} else {
if(udl < 2 && ud == "WEB") {
r <- "Web"
} else {
r <- "POS"
}
}
return(r)
}
source to share
And some timings ...
I tried three different alternatives in dplyr
, and in data.table
: (1) ifelse
(see response @Kristofersen.), (2) if
/ else
(because it test
has a length of 1) and (3) the vector index. Unsurprisingly, the main difference is between dplyr
and data.table
and not among the alternatives 1-3.
For 1000 clients data.table
about 7 times faster. For 10,000 clients, this is about 30 times faster. For 1e6 clients, I only tested data.table
and not very much difference between the alternatives.
# 1000 customers, 2*1000 registrations
df <- data.frame(cust = sample(1e3, 2e3, replace = TRUE),
channel = sample(c("WEB", "POS"), 2e3, TRUE))
library(microbenchmark)
library(dplyr)
library(data.table)
microbenchmark(dp1 = df %>%
group_by(cust) %>%
summarise(res = ifelse(n_distinct(channel) == 1, channel[1], "both")),
dp2 = df %>%
group_by(cust) %>%
summarise(res = if(n_distinct(channel) == 1) channel[1] else "both"),
dp3 = df %>%
group_by(cust) %>%
summarise(res = c("both", channel[1])[(n_distinct(channel) == 1) + 1]),
dt1 = setDT(df)[ , .(channels = ifelse(uniqueN(channel) == 2, "both", channel[1])), by = cust],
dt2 = setDT(df)[ , .(channels = if(uniqueN(channel) == 2) "both" else channel[1]), by = cust],
dt3 = setDT(df)[ , .(res = c("both", channel[1])[(uniqueN(channel) == 1) + 1]), by = cust],
times = 5, unit = "relative")
# 1e3 customers
# Unit: relative
# expr min lq mean median uq max neval
# dp1 7.8985477 8.176139 7.9355234 7.676534 8.0359975 7.9166933 5
# dp2 7.8882707 8.018000 7.8965098 8.731935 7.8414478 7.3560530 5
# dp3 8.0851402 8.934831 7.7540060 7.653026 6.8305012 7.6887950 5
# dt1 1.1713088 1.180870 1.0350482 1.209861 1.0523597 0.7650059 5
# dt2 0.8272681 1.223387 0.9311628 1.047773 0.9028017 0.7795579 5
# dt3 1.0000000 1.000000 1.0000000 1.000000 1.0000000 1.0000000 5
# 1e4 customers
# Unit: relative
# expr min lq mean median uq max neval
# dp1 40.8725204 39.5297108 29.5755838 38.996075 38.246103 17.2784642 5
# dp2 40.7396141 39.4299918 27.4476811 38.819577 37.886320 12.7265756 5
# dp3 41.0940358 39.7819673 27.5532964 39.260488 38.317899 12.4685386 5
# dt1 1.0905470 1.0661613 0.7422082 1.053786 1.034642 0.3428945 5
# dt2 0.9052739 0.9008761 1.2813458 2.111642 2.356008 0.9005391 5
# dt3 1.0000000 1.0000000 1.0000000 1.000000 1.000000 1.0000000 5
# 1e6 customers, data.table only
# Unit: relative
# expr min lq mean median uq max neval
# dt1 1.146757 1.147152 1.155497 1.164471 1.156244 1.161660 5
# dt2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 5
# dt3 1.084442 1.079734 1.253568 1.106833 1.098766 1.799935 5
source to share