How to collapse session path data in path from-to paths for visualizing network data?

What are some ways to transform session path data such as:

df
#   Session Link1 Link2 Link3 Link4 Link5
# 1       1     A     B                  
# 2       2     C                        
# 3       3     D     A     B            
# 4       4     C     F     G     H     J
# 5       5     A     B     C            

      

Into a dataset that looks like this:

desired
#    Session From   To
# 1        1    A    B
# 2        2    C <NA>
# 3        3    D    A
# 4        3    A    B
# 5        4    C    F
# 6        4    F    G
# 7        4    G    H
# 8        4    H    J
# 9        5    A    B
# 10       5    B    C

      

Reproducibility data:

df <- structure(list(Session = 1:5, Link1 = structure(c(1L, 2L, 3L, 2L, 1L), .Label = c("A", "C", "D"), class = "factor"), Link2 = structure(c(3L, 1L, 2L, 4L, 3L), .Label = c("", "A", "B", "F"), class = "factor"), Link3 = structure(c(1L, 1L, 2L, 4L, 3L), .Label = c("", "B", "C", "G"), class = "factor"), Link4 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "H"), class = "factor"), Link5 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "J"), class = "factor")), .Names = c("Session", "Link1", "Link2", "Link3", "Link4", "Link5"), class = "data.frame", row.names = c(NA, -5L))
desired <- structure(list(Session = c(1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L), From = structure(c(1L, 3L, 4L, 1L, 3L, 5L, 6L, 7L, 1L, 2L), .Label = c("A", "B", "C", "D", "F", "G", "H"), class = "factor"), To = structure(c(2L, NA, 1L, 2L, 4L, 5L, 6L, 7L, 2L, 3L), .Label = c("A", "B", "C", "F", "G", "H", "J"), class = "factor")), .Names = c("Session", "From", "To"), class = "data.frame", row.names = c(NA, -10L))

      

+3


source to share


3 answers


We could use data.table

. Convert 'data.frame' to 'data.table' ( setDT(df)

). Change from "wide" to "long" with melt

, specifying id.var

as "session". Remove the "values" items that are empty [value!='']

. Grouped by "Session", we insert the "NA" values ​​into the "value" column for these "Sessions" which have only one row ( if...else

), create two columns ("From" and "To"), removing the last and first element " V1 "grouped by" session ".

 library(data.table)#v1.9.5+
 melt(setDT(df), id.var='Session')[value!=''][, 
   if(.N==1L) c(value, NA) else value, by = Session][,
      list(From=V1[-.N], To=V1[-1L]), by = Session]
 #   Session From To
 #1:       1    A  B
 #2:       2    C NA
 #3:       3    D  A
 #4:       3    A  B
 #5:       4    C  F
 #6:       4    F  G
 #7:       4    G  H
 #8:       4    H  J
 #9:       5    A  B
 #10:      5    B  C

      



The above can be simplified to one block after a step melt

. tmp[-.N]

Doesn't work for some reason . So I used tmp[1:(.N-1)]

.

melt(setDT(df), id.var= 'Session')[value!='', {
              tmp <- if(.N==1L) c(value, NA) else value
              list(From= tmp[1:(.N-1)], To= tmp[-1L]) }, by = Session]
#    Session From To
#1:       1    A  B
#2:       2    C NA
#3:       3    D  A
#4:       3    A  B
#5:       4    C  F
#6:       4    F  G
#7:       4    G  H
#8:       4    H  J
#9:       5    A  B
#10:      5    B  C

      

+5


source


Inspired by @akrun, this is my personal hit on the issue. Of course, the results are adjusted to include the terminal from the path to each pair:

library(dplyr)
library(tidyr)

gather(df, "Link_Num", "Value", -Session) %>%
  group_by(Session) %>%
  mutate(to = Value,
         from = lag(to)) %>%
  filter(Link_Num != "Link1" &
         from != "") %>%
  select(Session, from, to, Link_Num) %>%
  arrange(Session)

      



What gives:

   Session from to Link_Num
1        1    A  B    Link2
2        1    B       Link3
3        2    C       Link2
4        3    D  A    Link2
5        3    A  B    Link3
6        3    B       Link4
7        4    C  F    Link2
8        4    F  G    Link3
9        4    G  H    Link4
10       4    H  J    Link5
11       5    A  B    Link2
12       5    B  C    Link3
13       5    C       Link4

      

+2


source


Another approach with dplyr

functions melt

and lead

:

library(dplyr)
df$spacer <- ""
df %>% melt(id.var = "Session") %>%
  arrange(Session) %>% 
  mutate(To = lead(value)) %>%
  filter(To !="" & value !="" | To =="" & variable =="Link1") %>%
  mutate(To = ifelse(To == "", NA, To)) %>% select(-variable)
#    Session value   To
# 1        1     A    B
# 2        2     C <NA>
# 3        3     D    A
# 4        3     A    B
# 5        4     C    F
# 6        4     F    G
# 7        4     G    H
# 8        4     H    J
# 9        5     A    B
# 10       5     B    C

      

+2


source







All Articles