Keep only one row per factor if it meets the R criteria

I have a dataset that looks like this:

ID  week  action
1   1     TRUE
1   1     FALSE
1   2     FALSE 
1   2     FALSE
1   3     FALSE
1   3     TRUE
2   1     FALSE
2   2     TRUE
2   2     FALSE
...

      

What I would like to do is store, for each ID and each week in ID, one action value, preferring to keep TRUE if there is one and also FALSE.

So it will look like this:

ID  week  action
1   1     TRUE
1   2     FALSE
1   3     TRUE
2   1     FALSE
2   2     TRUE
...

      

+3


source to share


4 answers


Try

library(dplyr)
library(tidyr)
df %>% 
   group_by(ID, week)%>% 
   arrange(desc(action)) %>%
   slice(1)
#   ID week action
#1  1    1   TRUE
#2  1    2  FALSE
#3  1    3   TRUE
#4  2    1  FALSE
#5  2    2   TRUE

      

Or using data.table

 library(data.table)
 setDT(df)[order(action,decreasing=TRUE),
           .SD[1] , by=list(ID, week)][order(ID,week)]
 #   ID week action
 #1:  1    1   TRUE
 #2:  1    2  FALSE
 #3:  1    3   TRUE
 #4:  2    1  FALSE
 #5:  2    2   TRUE

      

Or using a base R

similar approach used by @Sam Dickson



 aggregate(action~., df, FUN=function(x) sum(x)>0)
 # ID week action
 #1  1    1   TRUE
 #2  2    1  FALSE
 #3  1    2  FALSE
 #4  2    2   TRUE
 #5  1    3   TRUE

      

Or as inspired by @docendo discimus the data.table parameter would be

  setDT(df)[, .SD[which.max(action)], by=list(ID, week)]

      

data

df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), week = c(1L, 
1L, 2L, 2L, 3L, 3L, 1L, 2L, 2L), action = c(TRUE, FALSE, FALSE, 
 FALSE, FALSE, TRUE, FALSE, TRUE, FALSE)), .Names = c("ID", "week", 
 "action"), class = "data.frame", row.names = c(NA, -9L))

      

+2


source


I used plyr:



library(plyr)
ddply(df,.(ID,week),summarize,action=sum(action)>0)

      

+2


source


Two options, similar to akrun asnwer but not the same, so I am posting them separately:

aggregate(action ~ ID + week, df, max)
#  ID week action
#1  1    1      1   # you can use 1/0s the same way as TRUE/FALSE
#2  2    1      0
#3  1    2      0
#4  2    2      1
#5  1    3      1

library(dplyr)
group_by(df, ID, week) %>% slice(which.max(action))
#Source: local data frame [5 x 3]
#Groups: ID, week
#
#  ID week action
#1  1    1   TRUE
#2  1    2  FALSE
#3  1    3   TRUE
#4  2    1  FALSE
#5  2    2   TRUE

      

The help page for which.max

says that it finds the first maximum of a numeric or boolean vector, so even if you had multiple TRUE entries (which are 1 and FALSE is 0), you just select the first occurrence and return. You can do the opposite using which.min

.

+2


source


Basic R solution with aggregate

and any

:

aggregate(action ~ week + ID, df, any)
#   week ID action
# 1    1  1   TRUE
# 2    2  1  FALSE
# 3    3  1   TRUE
# 4    1  2  FALSE
# 5    2  2   TRUE

      


Another basic R solution:

subset(transform(df, action = ave(action, week, ID, FUN = any)), !duplicated(df[-3]))
#   ID week action
# 1  1    1   TRUE
# 3  1    2  FALSE
# 5  1    3   TRUE
# 7  2    1  FALSE
# 8  2    2   TRUE

      

+2


source







All Articles