Replace consecutive zeros found at both ends of the data frame in R
I need to replace any zeros in the first and last columns of a data block with NA, but when replacing the first / last zero, I need to also replace any consecutive zeros present in that particular row. Given an example frame:
df <- data.frame(a = c(1,0,1,0,1,1,1,0,1,1,1),
b = c(1,1,1,0,1,1,1,0,1,1,1),
c = c(1,0,1,1,1,0,1,0,1,1,1),
d = c(1,1,1,0,1,1,1,1,1,1,1),
e = c(1,0,1,0,1,1,1,1,1,1,1),
f = c(1,1,1,1,1,1,1,1,1,0,1))
df
I need it to return:
df.result <- data.frame(a = c(1,NA,1,NA,1,1,1,NA,1,1,1),
b = c(1,1,1,NA,1,1,1,NA,1,1,1),
c = c(1,0,1,1,1,0,1,NA,1,1,1),
d = c(1,1,1,0,1,1,1,1,1,1,1),
e = c(1,0,1,0,1,1,1,1,1,1,1),
f = c(1,1,1,1,1,1,1,1,1,NA,1))
df.result
Thanks in advance.
source to share
Another way that avoids apply
and works with strings:
g<-lapply(df,"==",0)
df[do.call(cbind,Reduce("&",g,accumulate=TRUE)) | do.call(cbind,Reduce("&",g,accumulate=TRUE,right=TRUE))]<-NA
identical(df,df.result)
#[1] TRUE
Quick test:
docendo<-function(df) {
idx <- t(apply(df != 0, 1, function(x) cumsum(x) == 0 | rev(cumsum(rev(x)) == 0)))
df[idx] <- NA
df
}
nicola<-function(df) {
g<-lapply(df,"==",0)
df[do.call(cbind,Reduce("&",g,accumulate=TRUE)) | do.call(cbind,Reduce("&",g,accumulate=TRUE,right=TRUE))]<-NA
df
}
lmo<-function(df) {
reps.first <- max.col(df, ties.method = "first") - 1
reps.last <- max.col(df, ties.method = "last")
fill.last <- length(df)-reps.last
is.na(df[cbind(rep(seq_len(nrow(df))[reps.first > 0], reps.first[reps.first > 0]),
sequence(reps.first))]) <- TRUE
is.na(df[cbind(rep(seq_len(nrow(df))[fill.last > 0], fill.last[fill.last > 0]),
length(df)-(sequence(fill.last) - 1))]) <- TRUE
df
}
#create a bigger dataset
df<-df[rep(1:nrow(df),each=10000),]
system.time(res<-docendo(df))
# user system elapsed
# 2.088 0.020 2.145
system.time(res2<-nicola(df))
# user system elapsed
# 0.016 0.000 0.017
identical(res,res2)
#[1] TRUE
system.time(res3<-lmo(df))
# user system elapsed
# 0.222 0.000 0.265
identical(res2,res3)
#[1] TRUE
source to share
Try the following solution, in which we first build the boolean matrix used for the data subset and NA assignment:
idx <- t(apply(df != 0, 1, function(x) cumsum(x) == 0 | rev(cumsum(rev(x)) == 0)))
df[idx] <- NA
The results are equal to your desired result:
all.equal(df, df.result)
#[1] TRUE
If you're concerned about performance / memory, you can also do it in a two-step approach, where you first compute the rows in the first and last columns that are 0 and only take the second step for those rows.
idx1 <- rowSums(df[,c(1, ncol(df))] == 0)>0
idx2 <- t(apply(df[idx1,] != 0, 1, function(x) cumsum(x) == 0 | rev(cumsum(rev(x)) == 0)))
df[idx1,][idx2] <- NA
As a side note, you can also skip the intermediate step of creating an index if you use the following (although I prefer to create an index):
is.na(df) <- t(apply(df != 0, 1, function(x) cumsum(x) == 0 | rev(cumsum(rev(x)) == 0)))
source to share
Here is another basic R method that uses max.col
to identify the elements of each row for padding and then uses a subset of matrices and is.na<-
NA padding. The matrix is ββfilled with rep
and sequence
.
# get the last of the 0 values from first column
reps.first <- max.col(df, ties.method = "first") - 1
# get the last of the 0 values starting with last column
reps.last <- max.col(df, ties.method = "last")
fill.last <- length(df)-reps.last
# fill in from first column
is.na(df[cbind(rep(seq_len(nrow(df))[reps.first > 0], reps.first[reps.first > 0]),
sequence(reps.first))]) <- TRUE
# fill in from last column
is.na(df[cbind(rep(seq_len(nrow(df))[fill.last > 0], fill.last[fill.last > 0]),
length(df)-(sequence(fill.last) - 1))]) <- TRUE
all.equal(df, df.result)
[1] TRUE
source to share