Automatically modify a complex character vector

I have a complex character vector where each element of the vector consists of numbers and letters. I want to simplify this vector so that sequences of numbers and / or letters fit into ranges. Here's an example of what the input and output vectors should look like:

# Input vector
input_vec <- c("1,2,3,4,5", "1,2,3,5,6,7,8", "2,3,4,5", "A,B,C", "1,2,3,4,5,A,B,8,9,10,11")

# Here some function should be applied, to create the desired output vector automatically

# Desired output vector
output_vec <- c("1-5", "1-3,5-8", "2-5", "A-C", "1-5,A-B,8-11")

      

I'm sure there must be a way to build a function or use a package to do this in an automatic way, but unfortunately I'm struggling to find a solution. Any help is greatly appreciated!

UPDATE: Added even more complex vector

input_vec2 <- c("1,2,3,4,5", "1,2,3,5,6,7,8", "2,3,4,5", "A,B,C", "1,2,3,4,5,A,B,8,9,10,11", 
"1", "1,2,3,-4", "lala,3") # This part is new

output_vec2 <- c("1-5", "1-3,5-8", "2-5", "A-C", "1-5,A-B,8-11",
"1", "1-3,-4", "lala,3") # This part is new

      

+3


source to share


1 answer


This might be a bit bloated, but I tried to break the problem down into smaller features. Here they are. Some common helper functions first

# Is value numeric?
is_numeric <- function(x) suppressWarnings(!is.na(as.numeric(x)))
# Greate IDs for sequences of values using run-length encoding
rleg <- function(x) {
  r <- rle(x);
  val <- list(group_value = r$values)
  r$values <- seq_along(r$values); 
  val$group_id <- inverse.rle(r)
  val
}

      

And now some more specific helpers for the problem



collapse_sequence <- function(x) {
  if (length(x)>1) {
    paste0(x[1],"-", x[length(x)])
  } else {
    x
  }
}

find_runs <- function(x, key=x) {
  nona <- function(x) {x[is.na(x)]<-0; x}
  run <- cumsum(nona(c(1,diff(key)))!=1)
  Map(collapse_sequence, split(x, run))
}

collapse_numeric <- function(x) {
  paste(sapply(find_runs(x, as.numeric(x)), collapse_sequence), collapse=",")
}

collapse_character <- function(x) {
  key <- sapply(x, function(z) ifelse(nchar(z)==1, utf8ToInt(z), NA))
  paste(sapply(find_runs(x, key), collapse_sequence), collapse=",")
}

collapse_runs <- Vectorize(function(x) {
  x <- strsplit(x, ",")[[1]]
  type <- ifelse(is_numeric(x), 1, ifelse(nchar(x)==1, 2, 3))
  group <- rleg(type)
  runs <- Map(function(v, type) {
    if(type==1) {
      collapse_numeric(v)
    } else {
      collapse_character(v)
    }
  },split(x, group$group_id), group$group_value)
  paste(runs, collapse=",")  
})

      

And finally, we test it with input

input_vec <- c("1,2,3,4,5", "1,2,3,5,6,7,8", "2,3,4,5", "A,B,C", "1,2,3,4,5,A,B,8,9,10,11")
unname(collapse_runs(input_vec))
# [1] "1-5"          "1-3,5-8"      "2-5"          "A-C"          "1-5,A-B,8-11"
input_vec2 <- c("1,2,3,4,5", "1,2,3,5,6,7,8", "2,3,4,5", "A,B,C", "1,2,3,4,5,A,B,8,9,10,11", "1", 
            "1,2,3,-4", "lala,3")
unname(collapse_runs(input_vec2))
# [1] "1-5"          "1-3,5-8"      "2-5"          "A-C"          "1-5,A-B,8-11"
# [6] "1"            "1-3,-4"       "lala,3"

      

+1


source







All Articles