For loop with a subset in R

I have the following data in a csv file:

Date        Model      Color    Value   Samples
6/19/2017   Gold       Blue     0.5     500
6/19/2017   Gold       Red      0.0     449
6/19/2017   Silver     Blue     0.75    1320
6/19/2017   Silver     Blue     1.5     103
6/19/2017   Gold       Red      0.7     891
6/19/2017   Gold       Blue     0.41    18103
6/19/2017   Copper     Blue     0.83    564
6/19/2017   Silver     Pink     1.17    173
6/19/2017   Platinum   Brown    0.43    793
6/19/2017   Platinum   Red      0.71    1763
6/19/2017   Gold       Orange   1.92    503

      

I am using the function `fread` to create a data table.

library(dplyr)
library(data.table)

# Load the CSV into a data.table: the first row holds the column names and
# fields are comma-separated; fill = TRUE lets fread cope with short rows.
df <- fread(
  "test_data.csv",
  sep = ",",
  header = TRUE,
  fill = TRUE
)

      

Then I subset the data by Model like this:

# Keep only the Gold rows whose Value is strictly positive.
df_subset <- df[Model == "Gold" & Value > 0]

      

Then I create some percentiles based on the variable Color

like this:

# Per-Color summary of the filtered subset: total Samples plus selected
# percentiles of Value. The data has a `Value` column (there is no
# `AvgValue`), so the quantiles are taken over `Value`.
df_subset[, .(Samples = sum(Samples),
    '50th'    = quantile(Value, probs = c(0.50)),
    '99th'    = quantile(Value, probs = c(0.99)),
    '99.9th'  = quantile(Value, probs = c(0.999)),
    '99.99th' = quantile(Value, probs = c(0.9999))),
by = Color]

      

Which gives the following output:

    Color Samples  50th   99th  99.9th  99.99th
1:   Blue   18603 0.455 0.4991 0.49991 0.499991
2:    Red    1340 0.975 1.2445 1.24945 1.249945
3: Orange     503 1.920 1.9200 1.92000 1.920000

      

I am trying to iterate over the unique values of Model and output the corresponding percentile values for each Model.

I have tried the following (which fails):

# Collect the distinct Model values to iterate over.
models <- unique(df$Model)

for (model in models){

  # NOTE(review): `df$model` looks up a column literally named "model"; it does
  # not use the loop variable `model` to filter rows of the `Model` column.
  # With the columns shown (Date, Model, Color, Value, Samples) this presumably
  # evaluates to NULL, so the data.table `[` method is never reached and `.()`
  # is evaluated as an ordinary function call -- TODO confirm.
  df$model[, .(Samples = sum(Samples),
                '50th'    = quantile(Value, probs = c(0.50)),
                '99th'    = quantile(Value, probs = c(0.99)),
                '99.9th'  = quantile(Value, probs = c(0.999)), 
                '99.99th' = quantile(Value, probs = c(0.9999))),
            by = Color]
}

      

Error message:

Error in .(Samples = sum(Samples), `50th` = quantile(Value, probs = c(0.5)),  :  could not find function "."

      

+3


source to share


3 answers


`fread` creates a data.table object rather than a plain data frame, so I recommend sticking to the data.table syntax and not mixing it with dplyr. There is no need for a `for` loop: we can pass a list of two variables to the `by` argument to group by both model and color in one line of code:



# Summarise every (Model, Color) pair in one grouped call: total Samples and
# selected percentiles of Value, using only rows with a positive Value.
qs <- df[
  Value > 0,
  .(
    Samples   = sum(Samples),
    `50th`    = quantile(Value, probs = 0.50),
    `99th`    = quantile(Value, probs = 0.99),
    `99.9th`  = quantile(Value, probs = 0.999),
    `99.99th` = quantile(Value, probs = 0.9999)
  ),
  by = .(Model, Color)
]
# Key (and thereby sort) the result by Model.
setkeyv(qs, "Model")

#       Model  Color Samples  50th   99th  99.9th  99.99th
# 1:   Copper   Blue     564 0.830 0.8300 0.83000 0.830000
# 2:     Gold   Blue   18603 0.455 0.4991 0.49991 0.499991
# 3:     Gold    Red     891 0.700 0.7000 0.70000 0.700000
# 4:     Gold Orange     503 1.920 1.9200 1.92000 1.920000
# 5: Platinum  Brown     793 0.430 0.4300 0.43000 0.430000
# 6: Platinum    Red    1763 0.710 0.7100 0.71000 0.710000
# 7:   Silver   Blue    1423 1.125 1.4925 1.49925 1.499925
# 8:   Silver   Pink     173 1.170 1.1700 1.17000 1.170000

      

+2


source


This will probably fix your problem.

library(dplyr)

# Drop the first column (Date), keep rows with a positive Value, and compute
# the requested quantiles of Value for each (Model, Color) group. `do()`
# receives each group as `.`; t() turns the named quantile vector into one row.
df[, -1] %>%
  filter(Value > 0) %>%
  group_by(Model, Color) %>%
  do(
    data.frame(
      t(quantile(.$Value, probs = c(0.50, 0.99, 0.999, 0.9999)))
    )
  )

      

Regarding your question in the comments about how to add the sum of the samples: you can use `aggregate`. The reason I am not using `dplyr::summarise` is that I would need to start a new pipeline after applying `do`, which does not make sense.



# Combine the per-(Model, Color) quantiles of Value with the matching Samples
# totals. Both pieces use only rows with Value > 0, so the Samples sum covers
# exactly the observations that went into the quantiles and both pieces have
# one row per remaining group.
# aggregate() with `Color + Model` varies Color fastest within Model, which is
# the same row order group_by(Model, Color) produces for the factor columns
# in the data below.
data.frame(df %>% filter(Value > 0) %>% select(-Date) %>% group_by(Model, Color) %>%
              do(data.frame(t(quantile(.$Value, probs = c(0.50, 0.99, 0.999, 0.9999))))),
           aggregate(Samples ~ Color + Model, subset(df, Value > 0), sum)["Samples"])

#      Model  Color  X50.   X99.  X99.9.  X99.99. Samples 
# 1   Copper   Blue 0.830 0.8300 0.83000 0.830000     564 
# 2     Gold   Blue 0.455 0.4991 0.49991 0.499991   18603 
# 3     Gold Orange 1.920 1.9200 1.92000 1.920000     503 
# 4     Gold    Red 0.700 0.7000 0.70000 0.700000     891 
# 5 Platinum  Brown 0.430 0.4300 0.43000 0.430000     793 
# 6 Platinum    Red 0.710 0.7100 0.71000 0.710000    1763 
# 7   Silver   Blue 1.125 1.4925 1.49925 1.499925    1423 
# 8   Silver   Pink 1.170 1.1700 1.17000 1.170000     173

      

Data:

# Reproducible copy of the example data as a plain data.frame. Date, Model and
# Color are factors with explicitly listed levels; Value is numeric and
# Samples is integer.
df <- data.frame(
  Date = factor(rep("6/19/2017", 11L), levels = "6/19/2017"),
  Model = factor(
    c("Gold", "Gold", "Silver", "Silver", "Gold", "Gold",
      "Copper", "Silver", "Platinum", "Platinum", "Gold"),
    levels = c("Copper", "Gold", "Platinum", "Silver")
  ),
  Color = factor(
    c("Blue", "Red", "Blue", "Blue", "Red", "Blue",
      "Blue", "Pink", "Brown", "Red", "Orange"),
    levels = c("Blue", "Brown", "Orange", "Pink", "Red")
  ),
  Value = c(0.5, 0, 0.75, 1.5, 0.7, 0.41, 0.83, 1.17, 0.43, 0.71, 1.92),
  Samples = c(500L, 449L, 1320L, 103L, 891L, 18103L,
              564L, 173L, 793L, 1763L, 503L)
)

      

+2


source


Using your definitions, you can try this:

library(data.table)

# Read the data and make sure Value is numeric before taking quantiles.
df <- fread("~/theData.csv")
df$Value <- as.numeric(df$Value)

# Distinct models, computed once instead of on every iteration.
model_names <- unique(df$Model)

# Build one per-Color summary table per model, then stack them in a single
# rbindlist() call rather than growing `result` with rbind() inside a loop.
result <- rbindlist(lapply(model_names, function(model_name) {
  # Rows of this model with a positive Value, summarised by Color.
  per_model <- df[
    Model == model_name & Value > 0,
    .(
      Samples   = sum(Samples),
      `50th`    = quantile(Value, probs = 0.50),
      `99th`    = quantile(Value, probs = 0.99),
      `99.9th`  = quantile(Value, probs = 0.999),
      `99.99th` = quantile(Value, probs = 0.9999)
    ),
    by = Color
  ]
  # Tag every row with the model it belongs to (appended as the last column).
  per_model$model <- model_name
  per_model
}))

      

+2


source







All Articles