For loop with a subset in R
I have the following data in a csv file:
Date Model Color Value Samples
6/19/2017 Gold Blue 0.5 500
6/19/2017 Gold Red 0.0 449
6/19/2017 Silver Blue 0.75 1320
6/19/2017 Silver Blue 1.5 103
6/19/2017 Gold Red 0.7 891
6/19/2017 Gold Blue 0.41 18103
6/19/2017 Copper Blue 0.83 564
6/19/2017 Silver Pink 1.17 173
6/19/2017 Platinum Brown 0.43 793
6/19/2017 Platinum Red 0.71 1763
6/19/2017 Gold Orange 1.92 503
I am using a function fread
to create a data table.
library(dplyr)
library(data.table)
# Read the comma-separated sample file into a data.table. The first line is
# treated as the header, and `fill = TRUE` lets fread pad rows that have fewer
# fields than the others instead of stopping on them.
df <- fread(
  "test_data.csv",
  sep = ",",
  header = TRUE,
  fill = TRUE
)
Then I subset the data by Model
like this:
# Keep only the Gold rows whose Value is strictly positive.
df_subset <- subset(df, Model == "Gold" & Value > 0)
Then I create some percentiles based on the variable Color
like this:
# Per-Color summary of the subset: total number of samples plus the 50th,
# 99th, 99.9th and 99.99th percentiles of the measurements.
# The measurement column in the data is `Value` (the data has no `AvgValue`
# column), so the quantiles are computed over `Value`.
df_subset[, .(Samples = sum(Samples),
              `50th` = quantile(Value, probs = 0.50),
              `99th` = quantile(Value, probs = 0.99),
              `99.9th` = quantile(Value, probs = 0.999),
              `99.99th` = quantile(Value, probs = 0.9999)),
          by = Color]
Which gives the following output:
Color Samples 50th 99th 99.9th 99.99th
1: Blue 18603 0.455 0.4991 0.49991 0.499991
2: Red 1340 0.975 1.2445 1.24945 1.249945
3: Orange 503 1.920 1.9200 1.92000 1.920000
I am trying to iterate through the list of Model values
and output the corresponding percentile values for each Model.
I have tried the following (which fails):
# Failing attempt from the question: loop over every distinct Model and try to
# compute the per-Color sample total and Value percentiles for it.
models <- unique(df$Model)
for (model in models){
# NOTE(review): `df$model` asks for a column literally named "model"; it does
# not use the loop variable `model`, and the data shown has no such column
# (only "Model"). Presumably the object being indexed with `[` is therefore not
# a data.table, so `.()` is evaluated as an ordinary function call, which would
# explain the reported `could not find function "."` error. Confirm.
df$model[, .(Samples = sum(Samples),
'50th' = quantile(Value, probs = c(0.50)),
'99th' = quantile(Value, probs = c(0.99)),
'99.9th' = quantile(Value, probs = c(0.999)),
'99.99th' = quantile(Value, probs = c(0.9999))),
by = Color]
}
Error message:
Error in .(Samples = sum(Samples), `50th` = quantile(Value, probs = c(0.5)), : could not find function "."
source to share
fread
creates a data.table object, not a plain data frame, so I recommend sticking to the data.table syntax and not mixing it with dplyr. There is no need for a for
loop: we can pass a list of two variables to the by
argument to group by both model and color in one line of code:
# Summarise every (Model, Color) pair in one grouped data.table call, using
# only the rows with a positive Value: total Samples plus four Value
# percentiles per group.
qs <- df[
  Value > 0,
  .(
    Samples = sum(Samples),
    `50th` = quantile(Value, probs = 0.50),
    `99th` = quantile(Value, probs = 0.99),
    `99.9th` = quantile(Value, probs = 0.999),
    `99.99th` = quantile(Value, probs = 0.9999)
  ),
  by = c("Model", "Color")
]
# Key the summary on Model, which also sorts it by Model.
setkeyv(qs, "Model")
# Model Color Samples 50th 99th 99.9th 99.99th
# 1: Copper Blue 564 0.830 0.8300 0.83000 0.830000
# 2: Gold Blue 18603 0.455 0.4991 0.49991 0.499991
# 3: Gold Red 891 0.700 0.7000 0.70000 0.700000
# 4: Gold Orange 503 1.920 1.9200 1.92000 1.920000
# 5: Platinum Brown 793 0.430 0.4300 0.43000 0.430000
# 6: Platinum Red 1763 0.710 0.7100 0.71000 0.710000
# 7: Silver Blue 1423 1.125 1.4925 1.49925 1.499925
# 8: Silver Pink 173 1.170 1.1700 1.17000 1.170000
source to share
This will probably fix your problem.
library(dplyr)
# Drop the first column (Date in the data shown in this post), keep the rows
# with Value > 0, and group by Model and Color.
df [,-1] %>% filter(Value > 0) %>% group_by(Model, Color) %>%
# Per group: quantile() returns a named vector, t() turns it into a one-row
# matrix, and data.frame() makes that one row with one column per probability.
do(data.frame(t(quantile(.$Value, probs = c(0.50, 0.99, 0.999, 0.9999)))))
Regarding your question in the comments about how to add the sum of the samples: you can use aggregate
. The reason I am not using dplyr::summarise
is that I would need to start a new pipeline after applying do
, which does not make sense.
# Per-(Model, Color) quantiles of Value (rows with Value > 0 only, Date column
# dropped), with the summed Samples appended as an extra column by data.frame().
data.frame(df %>% filter(Value > 0) %>% select(-Date) %>% group_by(Model, Color) %>%
do(data.frame(t(quantile(.$Value, probs = c(0.50, 0.99, 0.999, 0.9999))))),
# NOTE(review): aggregate() is called on the unfiltered df, so the Samples
# totals also count rows with Value == 0 (Gold/Red: 449 + 891 = 1340). The
# column is attached by position, which assumes both results contain the same
# groups in the same order; confirm this holds for your data.
aggregate(Samples ~ Color+Model, df, sum)["Samples"])
# Model Color X50. X99. X99.9. X99.99. Samples
# 1 Copper Blue 0.830 0.8300 0.83000 0.830000 564
# 2 Gold Blue 0.455 0.4991 0.49991 0.499991 18603
# 3 Gold Orange 1.920 1.9200 1.92000 1.920000 503
# 4 Gold Red 0.700 0.7000 0.70000 0.700000 1340
# 5 Platinum Brown 0.430 0.4300 0.43000 0.430000 793
# 6 Platinum Red 0.710 0.7100 0.71000 0.710000 1763
# 7 Silver Blue 1.125 1.4925 1.49925 1.499925 1423
# 8 Silver Pink 1.170 1.1700 1.17000 1.170000 173
Data:
# Reproducible copy of the 11-row sample as a plain data.frame (dput-style
# literal). Date, Model and Color are stored as factors; Value is numeric and
# Samples is integer.
df <- structure(list(Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = "6/19/2017", class = "factor"), Model = structure(c(2L,
2L, 4L, 4L, 2L, 2L, 1L, 4L, 3L, 3L, 2L), .Label = c("Copper",
"Gold", "Platinum", "Silver"), class = "factor"), Color = structure(
c(1L,5L, 1L, 1L, 5L, 1L, 1L, 4L, 2L, 5L, 3L), .Label = c("Blue", "Brown",
"Orange", "Pink", "Red"), class = "factor"), Value = c(0.5, 0,
0.75, 1.5, 0.7, 0.41, 0.83, 1.17, 0.43, 0.71, 1.92), Samples = c(500L,
449L, 1320L, 103L, 891L, 18103L, 564L, 173L, 793L, 1763L, 503L)),
.Names = c("Date", "Model", "Color", "Value", "Samples"),
class = "data.frame", row.names = c(NA, -11L))
source to share
Using your definitions, you can try this:
library(data.table)
# Read the data and make sure Value is numeric before filtering on it and
# taking quantiles.
df <- fread("~/theData.csv")
df$Value <- as.numeric(df$Value)

# Compute the distinct models once instead of on every loop iteration.
models <- unique(df$Model)

# Build one summary table per model: total Samples and four Value percentiles
# per Color, using only the rows with a positive Value. The model name is
# appended as the last column, `model`.
per_model <- lapply(models, function(m) {
  stats <- df[
    Model == m & Value > 0,
    .(
      Samples = sum(Samples),
      `50th` = quantile(Value, probs = 0.50),
      `99th` = quantile(Value, probs = 0.99),
      `99.9th` = quantile(Value, probs = 0.999),
      `99.99th` = quantile(Value, probs = 0.9999)
    ),
    by = Color
  ]
  stats[, model := m]
  stats
})

# Bind all per-model tables in a single step. This avoids growing `result`
# with rbind() inside a loop and leaves no temporary object to clean up.
result <- rbindlist(per_model)
source to share