Create an indicator variable by groups in R, similar to Stata "by"
I've been researching R and found several resources that almost do what I want, but not quite (or at least not in the way that I understand!)
I have code that works great to get my intended output in R (I usually use Stata), but it is incredibly slow and I know this because I was crudely forcing something that I am sure there is much more smart way to do!
I have a whole series of indicators to be set in groups and looking at the previous values ββin each group.
Here's the code I'm using (that works), with an example that (hopefully) shows what I mean. It's fast enough to work with this very small sample, but is very slow to start when I have many groups, many observations, and many indicators!
Thanks for your experience! Hurray, Simon.
# would like to find out several things:
# 1. the year in which an observation is missing
# 2. the last year in which an observation is not missing
# 3. whether someone is lost to followup
# (ie. all remaining observations are missing)
# 4. whether someone is STILL lost to followup
# (ie. was lost to followup in previous year as well as current year)
# problem: this is very quick and simple in Stata
# but takes a VERY long time using this method in R
# which makes me sure there a better way!
# read in data
missingness <- read.table(text=
"Var2001 Var2002 Var2003
1 1 1
1 NA NA
1 NA 1
NA NA 1
NA 1 NA", header=TRUE)
vartouse_list <- c(colnames(missingness)[grep("Var",colnames(missingness))])
number_list <- sapply(strsplit(vartouse_list,split="Var", fixed=TRUE), function(x) (x[2]))
missingness_subset <- subset(missingness[, vartouse_list])
# now create an id
# reshape to long
long_missingness <- reshape(missingness_subset,
varying = vartouse_list,
v.names = "Var",
timevar = "time_period",
times = number_list,
direction = "long")
# sort to looking by id number
long_missingness$time_period <- as.numeric(long_missingness$time_period)
long_missingness <- long_missingness[order(long_missingness$id, long_missingness$time_period),]
# find if missing this year
criteria <- paste0("long_missingness","$","Var")
long_missingness$missing_this_year <- ifelse(is.na(long_missingness$Var),1,0)
# list of non-missing time periods
long_missingness$time_period_not_missing <- ifelse(long_missingness$missing_this_year==0,
long_missingness$time_period,
NA)
# find last observed data
long_missingness$last_non_missing <- min(long_missingness$time_period)
for (current_id in unique(long_missingness$id)) {
current_long_missingness <- long_missingness[which(long_missingness$id==current_id),]
indicator = max(current_long_missingness$time_period_not_missing,na.rm=TRUE)
long_missingness$last_non_missing <- ifelse(long_missingness$id==current_id,
indicator,
long_missingness$last_non_missing)
}
# year first lost to followup
long_missingness$lost_to_followup_year <- long_missingness$last_non_missing + 1
# generate an indicator for lost to followup
# for each individual, they're lost to followup if:
# (data is missing this year AND the current year is >= the year indicated as lost to followup)
# OR
# they were lost to followup in the previous year (by definition)
long_missingness$lost_to_followup = 0
long_missingness$lost_to_followup = ifelse(long_missingness$missing_this_year==1 &
long_missingness$time_period >=
long_missingness$lost_to_followup_year,
1,
0)
# now will work out if an observation is still lost to followup
long_missingness$still_lost_to_followup <- 0
for (current_id in unique(long_missingness$id)) {
current_long_missingness <- long_missingness[which(long_missingness$id==current_id),]
numyears <- nrow(current_long_missingness)
if (numyears > 1) for(current_year in 2:numyears) {
current_time_period <- current_long_missingness$time_period[current_year]
#// generate an indicator if an observation is still lost to followup
#// ie. was lost to followup in the previous year and still (obviously) lost to followup now
# Stata code:
#gen still_lost_to_followup = 0
#by `idvar': replace still_lost_to_followup = 1 if lost_to_followup & lost_to_followup[_n-1]
indicator <- ifelse(current_long_missingness$lost_to_followup[current_year]==1
& current_long_missingness$lost_to_followup[current_year-1]==1,
1,
0)
long_missingness$still_lost_to_followup <- ifelse(long_missingness$id==current_id &
long_missingness$time_period==current_time_period,
indicator,
long_missingness$still_lost_to_followup)
}
}
source to share
I think this will give you a faster and easier solution. This is achieved using tidyverse
, it should be quite fast, up to a certain number of observations.
missingness <- read.table(text =
"Var2001 Var2002 Var2003
1 1 1
1 NA NA
1 NA 1
NA NA 1
NA 1 NA", header = TRUE)
library(tidyverse)
library(stringr)
missingness %>%
rownames_to_column('id') %>%
gather(year, value,-id) %>%
mutate(year = str_extract(year, '[0-9]{4}')) %>%
group_by(id) %>%
mutate(
missing_this_year = as.integer(is.na(value)),
last_non_missing = coalesce(max(year[!is.na(value)]), max(year)),
lost_to_followup = as.integer(year > last_non_missing),
still_lost_to_followup = as.integer(lost_to_followup &
lag(lost_to_followup))
) %>%
arrange(id, year)
#> # A tibble: 15 x 7
#> # Groups: id [5]
#> id year value missing_this_year last_non_missing lost_to_followup still_lost_to_followup
#> <chr> <chr> <int> <int> <chr> <int> <int>
#> 1 1 2001 1 0 2003 0 0
#> 2 1 2002 1 0 2003 0 0
#> 3 1 2003 1 0 2003 0 0
#> 4 2 2001 1 0 2001 0 0
#> 5 2 2002 NA 1 2001 1 0
#> 6 2 2003 NA 1 2001 1 1
#> 7 3 2001 1 0 2003 0 0
#> 8 3 2002 NA 1 2003 0 0
#> 9 3 2003 1 0 2003 0 0
#> 10 4 2001 NA 1 2003 0 0
#> 11 4 2002 NA 1 2003 0 0
#> 12 4 2003 1 0 2003 0 0
#> 13 5 2001 NA 1 2002 0 0
#> 14 5 2002 1 0 2002 0 0
#> 15 5 2003 NA 1 2002 1 0
source to share