Create an indicator variable by groups in R, similar to Stata "by"

I've been researching R and found several resources that almost do what I want, but not quite (or at least not in the way that I understand!)

I have code that works great to get my intended output in R (I usually use Stata), but it is incredibly slow and I know this because I was crudely forcing something that I am sure there is much more smart way to do!

I have a whole series of indicators to be set in groups and looking at the previous values ​​in each group.

Here's the code I'm using (that works), with an example that (hopefully) shows what I mean. It's fast enough to work with this very small sample, but is very slow to start when I have many groups, many observations, and many indicators!

Thanks for your experience! Hurray, Simon.

# would like to find out several things:
# 1. the year in which an observation is missing
# 2. the last year in which an observation is not missing
# 3. whether someone is lost to followup 
#    (ie. all remaining observations are missing)
# 4. whether someone is STILL lost to followup 
#    (ie. was lost to followup in previous year as well as current year)

# problem: this is very quick and simple in Stata 
#          but takes a VERY long time using this method in R
#          which makes me sure there a better way!

# read in data
missingness <- read.table(text=
           "Var2001 Var2002 Var2003
           1 1 1
           1 NA NA
           1 NA 1
           NA NA 1
           NA 1 NA", header=TRUE)

vartouse_list <- c(colnames(missingness)[grep("Var",colnames(missingness))])
number_list <- sapply(strsplit(vartouse_list,split="Var", fixed=TRUE), function(x) (x[2]))
missingness_subset <- subset(missingness[, vartouse_list])

# now create an id

# reshape to long

long_missingness <- reshape(missingness_subset,
                            varying = vartouse_list,
                            v.names = "Var",
                            timevar = "time_period",
                            times = number_list,
                            direction = "long")

# sort to looking by id number
long_missingness$time_period <- as.numeric(long_missingness$time_period)
long_missingness <- long_missingness[order(long_missingness$id, long_missingness$time_period),]

# find if missing this year
criteria <- paste0("long_missingness","$","Var")
long_missingness$missing_this_year <- ifelse(is.na(long_missingness$Var),1,0)

# list of non-missing time periods
long_missingness$time_period_not_missing <- ifelse(long_missingness$missing_this_year==0,
                                                   long_missingness$time_period,
                                                   NA)

# find last observed data
long_missingness$last_non_missing <- min(long_missingness$time_period)
for (current_id in unique(long_missingness$id)) {
  current_long_missingness <- long_missingness[which(long_missingness$id==current_id),]
  indicator = max(current_long_missingness$time_period_not_missing,na.rm=TRUE)
  long_missingness$last_non_missing <- ifelse(long_missingness$id==current_id,
                                              indicator,
                                              long_missingness$last_non_missing)
}

# year first lost to followup
long_missingness$lost_to_followup_year <- long_missingness$last_non_missing + 1 

# generate an indicator for lost to followup
# for each individual, they're lost to followup if:
# (data is missing this year AND the current year is >= the year indicated as lost to followup)
# OR
# they were lost to followup in the previous year (by definition)
long_missingness$lost_to_followup = 0
long_missingness$lost_to_followup = ifelse(long_missingness$missing_this_year==1 & 
                                             long_missingness$time_period >= 
                                             long_missingness$lost_to_followup_year,
                                           1,
                                           0)

# now will work out if an observation is still lost to followup
long_missingness$still_lost_to_followup <- 0

for (current_id in unique(long_missingness$id)) {
  current_long_missingness <- long_missingness[which(long_missingness$id==current_id),]
  numyears <- nrow(current_long_missingness)
  if (numyears > 1) for(current_year in 2:numyears) {
    current_time_period <- current_long_missingness$time_period[current_year]

    #// generate an indicator if an observation is still lost to followup
    #// ie. was lost to followup in the previous year and still (obviously) lost to followup now
    # Stata code:
        #gen still_lost_to_followup = 0
        #by `idvar': replace still_lost_to_followup = 1 if lost_to_followup & lost_to_followup[_n-1]
    indicator <- ifelse(current_long_missingness$lost_to_followup[current_year]==1 
                        & current_long_missingness$lost_to_followup[current_year-1]==1,
                        1,
                        0)
    long_missingness$still_lost_to_followup <- ifelse(long_missingness$id==current_id & 
                                                        long_missingness$time_period==current_time_period,
                                                      indicator,
                                                      long_missingness$still_lost_to_followup)

  }

}

      

+3


source to share


1 answer


I think this will give you a faster and easier solution. This is achieved using tidyverse

, it should be quite fast, up to a certain number of observations.



missingness <- read.table(text =
                              "Var2001 Var2002 Var2003
                          1 1 1
                          1 NA NA
                          1 NA 1
                          NA NA 1
                          NA 1 NA", header = TRUE)

library(tidyverse)   
library(stringr)

missingness %>%
    rownames_to_column('id') %>%
    gather(year, value,-id) %>%
    mutate(year = str_extract(year, '[0-9]{4}')) %>%
    group_by(id) %>%
    mutate(
        missing_this_year      = as.integer(is.na(value)),
        last_non_missing       = coalesce(max(year[!is.na(value)]), max(year)),
        lost_to_followup       = as.integer(year > last_non_missing),
        still_lost_to_followup = as.integer(lost_to_followup &
                                                lag(lost_to_followup))
    ) %>% 
    arrange(id, year)

#> # A tibble: 15 x 7
#> # Groups:   id [5]
#>       id  year value missing_this_year last_non_missing lost_to_followup still_lost_to_followup
#>    <chr> <chr> <int>             <int>            <chr>            <int>                  <int>
#>  1     1  2001     1                 0             2003                0                      0
#>  2     1  2002     1                 0             2003                0                      0
#>  3     1  2003     1                 0             2003                0                      0
#>  4     2  2001     1                 0             2001                0                      0
#>  5     2  2002    NA                 1             2001                1                      0
#>  6     2  2003    NA                 1             2001                1                      1
#>  7     3  2001     1                 0             2003                0                      0
#>  8     3  2002    NA                 1             2003                0                      0
#>  9     3  2003     1                 0             2003                0                      0
#> 10     4  2001    NA                 1             2003                0                      0
#> 11     4  2002    NA                 1             2003                0                      0
#> 12     4  2003     1                 0             2003                0                      0
#> 13     5  2001    NA                 1             2002                0                      0
#> 14     5  2002     1                 0             2002                0                      0
#> 15     5  2003    NA                 1             2002                1                      0

      

+1


source







All Articles