Data.table - splitting multiple columns

Question

Data.table - splitting multiple columns

I have a data.table G

as follows.

d <- list( c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"), c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"), c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"), c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"), c("SD455:TOMATO", "SD86:RAY", "SD39:MATURE", "SD86:COSMIC"), c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"),  c("SD32:TRUMPET", "SD1:FIELD", "SD40:GREEN", "SD40:PARK"))

d2 <-  lapply(d, function(x) sapply(strsplit(x, ":"), "[", 1))
d <- lapply(d, function(x) paste0(unique(x), collapse=", "))
d2 <- lapply(d2, function(x) paste0(unique(x), collapse=", "))
d <- as.data.frame(as.matrix(lapply(d, paste0, collapse=", ")))
d2 <- as.data.frame(as.matrix(lapply(d2, paste0, collapse=", ")))
d <- as.data.frame(cbind(d,d2))
colnames(d) <- c("sdw", "sd")
d$sd <- as.character(d$sd)
d$sdw <- as.character(d$sdw)

G <- data.table( d , key = "sd" )
G[, GNO:= seq(1,nrow(G))]
setcolorder(G, neworder=c("GNO", "sd", "sdw"))

G
    GNO                     sd                                              sdw
1:   1 SD1, SD44, SD384, SD32 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET
2:   2     SD12, SD315, SD387       SD12:PAINTING, SD315:MONEY31, SD387:SPRING
3:   3       SD23, SD1, SD567                SD23:SWITCH, SD1:LUSH, SD567:TREK
4:   4        SD32, SD1, SD40   SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK
5:   5    SD345, SD340, SD387               SD345:HOLE, SD340:DUST, SD387:ROLL
6:   6     SD42, SD345, SD183               SD42:CRAYON, SD345:FOX, SD183:WIRE
7:   7      SD455, SD86, SD39 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC

I am trying to split each set of items in a column sd

and get the modified data table according to the column GNO

as follows.

G2 <- G[, list(sd = unlist(strsplit( sd , "," ))) , by = list(GNO, sdw)]

G2
   GNO                                              sdw     sd
 1:   1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET    SD1
 2:   1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET   SD44
 3:   1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET  SD384
 4:   1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET   SD32
 5:   2       SD12:PAINTING, SD315:MONEY31, SD387:SPRING   SD12
 6:   2       SD12:PAINTING, SD315:MONEY31, SD387:SPRING  SD315
 7:   2       SD12:PAINTING, SD315:MONEY31, SD387:SPRING  SD387
 8:   3                SD23:SWITCH, SD1:LUSH, SD567:TREK   SD23
 9:   3                SD23:SWITCH, SD1:LUSH, SD567:TREK    SD1
10:   3                SD23:SWITCH, SD1:LUSH, SD567:TREK  SD567
11:   4   SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK   SD32
12:   4   SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK    SD1
13:   4   SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK   SD40
14:   5               SD345:HOLE, SD340:DUST, SD387:ROLL  SD345
15:   5               SD345:HOLE, SD340:DUST, SD387:ROLL  SD340
16:   5               SD345:HOLE, SD340:DUST, SD387:ROLL  SD387
17:   6               SD42:CRAYON, SD345:FOX, SD183:WIRE   SD42
18:   6               SD42:CRAYON, SD345:FOX, SD183:WIRE  SD345
19:   6               SD42:CRAYON, SD345:FOX, SD183:WIRE  SD183
20:   7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC  SD455
21:   7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC   SD86
22:   7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC   SD39

I would also like to split the items in sdw. But when I try to do this, it all gets confused due to the fact that the number of elements in sd

and is sdw

not necessarily the same.

t <- G[ , list(sd = unlist(strsplit(sd, "," )),
               sdw = unlist(strsplit(sdw, "," ))) , by = list(GNO) ]

He gives warning

Warning message:
In `[.data.table`(G, , list(sd = unlist(strsplit(sd, ",")), sdw = unlist(strsplit(sdw,  :
  Column 1 of result for group 4 is length 3 but the longest column in this result is 4. Recycled leaving remainder of 1 items. This warning is once only for the first group with this issue.

The result I'm looking for is as follows.

out <- structure(list(GNO = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 
5, 5, 5, 6, 6, 6, 7, 7, 7), sd = structure(c(1L, 14L, 9L, 6L, 
2L, 5L, 10L, 4L, 1L, 16L, 6L, 1L, 12L, 8L, 7L, 10L, 13L, 8L, 
3L, 15L, 17L, 11L), .Label = c("SD1", "SD12", "SD183", "SD23", 
"SD315", "SD32", "SD340", "SD345", "SD384", "SD387", "SD39", 
"SD40", "SD42", "SD44", "SD455", "SD567", "SD86"), class = "factor"), 
    sdw = structure(c(2L, 17L, 11L, 7L, 3L, 6L, 13L, 5L, 2L, 
    19L, 7L, 1L, 15L, 10L, 8L, 12L, 16L, 9L, 4L, 18L, 20L, 14L
    ), .Label = c("SD1:FIELD", "SD1:LUSH", "SD12:PAINTING", "SD183:WIRE", 
    "SD23:SWITCH", "SD315:MONEY31", "SD32:TRUMPET", "SD340:DUST", 
    "SD345:FOX", "SD345:HOLE", "SD384:FR563", "SD387:ROLL", "SD387:SPRING", 
    "SD39:MATURE", "SD40:GREEN, SD40:PARK", "SD42:CRAYON", "SD44:CANCEL", 
    "SD455:TOMATO", "SD567:TREK", "SD86:COSMIC, SD86:RAY"), class = "factor")), .Names = c("GNO", 
"sd", "sdw"), row.names = c(NA, -22L), class = "data.frame")

out
   GNO    sd                   sdw
1    1   SD1              SD1:LUSH
2    1  SD44           SD44:CANCEL
3    1 SD384           SD384:FR563
4    1  SD32          SD32:TRUMPET
5    2  SD12         SD12:PAINTING
6    2 SD315         SD315:MONEY31
7    2 SD387          SD387:SPRING
8    3  SD23           SD23:SWITCH
9    3   SD1              SD1:LUSH
10   3 SD567            SD567:TREK
11   4  SD32          SD32:TRUMPET
12   4   SD1             SD1:FIELD
13   4  SD40 SD40:GREEN, SD40:PARK
14   5 SD345            SD345:HOLE
15   5 SD340            SD340:DUST
16   5 SD387            SD387:ROLL
17   6  SD42           SD42:CRAYON
18   6 SD345             SD345:FOX
19   6 SD183            SD183:WIRE
20   7 SD455          SD455:TOMATO
21   7  SD86 SD86:COSMIC, SD86:RAY
22   7  SD39           SD39:MATURE

How do I get this output?

+3

r data.table

Crops Dec 31. 15 at 11:43

source to share

2 answers

This might be too much of an assumption about the nature of your data, but it seems that the "sd" column is actually somewhat irrelevant since this information is already embedded in the "sdw" column.

As such, this is what I came up with using two cSplit

from my splitstackshape package:

library(splitstackshape)
temp <- cSplit(
  cSplit(
    G, "sdw", ",", "long"), 
  "sdw", ":")[, c(1, 3, 4), with = FALSE
              ][, list(paste(unique(sdw_1), unique(sdw_2), 
                             sep = ":", collapse = ", ")), 
                by = list(GNO, sdw_1)]
setnames(temp, c("GNO", "sd", "sdw"))
temp
#     GNO    sd                   sdw
#  1:   1   SD1              SD1:LUSH
#  2:   1  SD44           SD44:CANCEL
#  3:   1 SD384           SD384:FR563
#  4:   1  SD32          SD32:TRUMPET
#  5:   2  SD12         SD12:PAINTING
#  6:   2 SD315         SD315:MONEY31
#  7:   2 SD387          SD387:SPRING
#  8:   3  SD23           SD23:SWITCH
#  9:   3   SD1              SD1:LUSH
# 10:   3 SD567            SD567:TREK
# 11:   4  SD32          SD32:TRUMPET
# 12:   4   SD1             SD1:FIELD
# 13:   4  SD40 SD40:GREEN, SD40:PARK
# 14:   5 SD345            SD345:HOLE
# 15:   5 SD340            SD340:DUST
# 16:   5 SD387            SD387:ROLL
# 17:   6  SD42           SD42:CRAYON
# 18:   6 SD345             SD345:FOX
# 19:   6 SD183            SD183:WIRE
# 20:   7 SD455          SD455:TOMATO
# 21:   7  SD86 SD86:RAY, SD86:COSMIC
# 22:   7  SD39           SD39:MATURE
#     GNO    sd                   sdw

Comparison with the desired output

library(compare)
compare(out, temp, allowAll = TRUE)
# TRUE 
#   [GNO] coerced from <integer> to <numeric>
#   [sdw] coerced from <character> to <factor>
#   dropped attributes
#   [3] dropped attributes

+2

A5C1D2H2I1M1N2O1R2T1 Dec 31. 15 at 12:23

source to share

David Arenburg · Accepted Answer · 2014-12-31T12:46:58+0000

Here's how I would apply it using just the package data.table

(no need to create G2

)

Here we first split by swd

and then aggregate both GNO

and the do part :

in the columnstw

G[, .(unlist(strsplit(sdw, ", ", TRUE))), GNO][, 
    .(stw = toString(V1)), .(GNO, sd = gsub(":.*", "", V1))]

 #    GNO    sd                   stw
 # 1:   1   SD1              SD1:LUSH
 # 2:   1  SD44           SD44:CANCEL
 # 3:   1 SD384           SD384:FR563
 # 4:   1  SD32          SD32:TRUMPET
 # 5:   2  SD12         SD12:PAINTING
 # 6:   2 SD315         SD315:MONEY31
 # 7:   2 SD387          SD387:SPRING
 # 8:   3  SD23           SD23:SWITCH
 # 9:   3   SD1              SD1:LUSH
# 10:   3 SD567            SD567:TREK
# 11:   4  SD32          SD32:TRUMPET
# 12:   4   SD1             SD1:FIELD
# 13:   4  SD40 SD40:GREEN, SD40:PARK
# 14:   5 SD345            SD345:HOLE
# 15:   5 SD340            SD340:DUST
# 16:   5 SD387            SD387:ROLL
# 17:   6  SD42           SD42:CRAYON
# 18:   6 SD345             SD345:FOX
# 19:   6 SD183            SD183:WIRE
# 20:   7 SD455          SD455:TOMATO
# 21:   7  SD86 SD86:RAY, SD86:COSMIC
# 22:   7  SD39           SD39:MATURE

Data.table - splitting multiple columns

Comparison with the desired output

More articles: