Compare columns and put output in an additional column

Let's start with some sample data:

# Sample input, as produced by dput(): a 20-row data.frame with six factor
# columns. P1/P2 name the two items being compared; the *_location_subacon
# columns hold a single location per row, while the
# *_location_all_predictors columns hold comma-separated lists of locations.
structure(list(P1 = structure(c(1L, 1L, 3L, 3L, 5L, 5L, 5L, 5L, 
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 2L, 2L), .Label = c("Apple", 
"Grape", "Orange", "Peach", "Tomato"), class = "factor"), P2 = structure(c(4L, 
4L, 3L, 3L, 5L, 5L, 5L, 5L, 6L, 6L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 
1L, 6L, 6L), .Label = c("Banana", "Cucumber", "Lemon", "Orange", 
"Potato", "Tomato"), class = "factor"), P1_location_subacon = structure(c(2L, 
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("Fridge", "Table"), class = "factor"), 
    P1_location_all_predictors = structure(c(2L, 2L, 3L, 3L, 
    3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L), .Label = c("Table,Desk,Bag,Fridge,Bed,Shelf,Chair", 
    "Table,Shelf,Cupboard,Bed,Fridge", "Table,Shelf,Fridge"), class = "factor"), 
    P2_location_subacon = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 
    2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Fridge", 
    "Shelf"), class = "factor"), P2_location_all_predictors = structure(c(3L, 
    3L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 
    3L, 3L, 3L, 3L), .Label = c("Shelf,Fridge", "Shelf,Fridge,Bed", 
    "Table,Shelf,Fridge"), class = "factor")), .Names = c("P1", 
"P2", "P1_location_subacon", "P1_location_all_predictors", "P2_location_subacon", 
"P2_location_all_predictors"), class = "data.frame", row.names = c(NA, 
-20L))

      

I would like to compare two pairs of columns. The first pair is P1_location_subacon with P2_location_subacon. The second pair is P1_location_all_predictors with P2_location_all_predictors.

How do I want to compare them? In each column, you have a different fruit / vegetable "location". So:

  • if the location in the first pair (P1_location_subacon / P2_location_subacon) is the same, I would like to put the number 2 in the extra column.

  • if the locations in the second pair (P1_location_all_predictors / P2_location_all_predictors) match, I would like to put the number 1 in the extra column. This is a bit tricky because not all of the locations need to be the same: it is enough that at least one location is shared by both fruits/vegetables.

  • if the locations differ in both pairs, put 0. You will not see this situation in the example data.

To summarize, I will show you the result I would like to achieve:

# Desired output, as produced by dput(): the same 20 rows and six factor
# columns as the sample input, plus two extra columns: X (all NA) and the
# integer column Correct holding the comparison score (only 1 and 2 occur
# in this sample).
structure(list(P1 = structure(c(1L, 1L, 3L, 3L, 5L, 5L, 5L, 5L, 
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 2L, 2L), .Label = c("Apple", 
"Grape", "Orange", "Peach", "Tomato"), class = "factor"), P2 = structure(c(4L, 
4L, 3L, 3L, 5L, 5L, 5L, 5L, 6L, 6L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 
1L, 6L, 6L), .Label = c("Banana", "Cucumber", "Lemon", "Orange", 
"Potato", "Tomato"), class = "factor"), P1_location_subacon = structure(c(2L, 
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("Fridge", "Table"), class = "factor"), 
    P1_location_all_predictors = structure(c(2L, 2L, 3L, 3L, 
    3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L), .Label = c("Table,Desk,Bag,Fridge,Bed,Shelf,Chair", 
    "Table,Shelf,Cupboard,Bed,Fridge", "Table,Shelf,Fridge"), class = "factor"), 
    P2_location_subacon = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 
    2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Fridge", 
    "Shelf"), class = "factor"), P2_location_all_predictors = structure(c(3L, 
    3L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 
    3L, 3L, 3L, 3L), .Label = c("Shelf,Fridge", "Shelf,Fridge,Bed", 
    "Table,Shelf,Fridge"), class = "factor"), X = c(NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA), Correct = c(1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
    1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L)), .Names = c("P1", 
"P2", "P1_location_subacon", "P1_location_all_predictors", "P2_location_subacon", 
"P2_location_all_predictors", "X", "Correct"), class = "data.frame", row.names = c(NA, 
-20L))

      

+3


source to share


2 answers


EDIT: I improved my answer using the feedback from "Test two columns of rows for matching rows in R".

Where DT is your table:

library(data.table)
setDT(DT)

# Convert every column to character in place. lapply() returns one element
# per column whatever the number of rows, whereas sapply() would collapse a
# one-row table into a plain vector and lose the column structure.
all_cols <- names(DT)
DT[, (all_cols) := lapply(.SD, as.character), .SDcols = all_cols]

# For each row i, report whether the comma-separated location lists a[i] and
# b[i] share at least one whole location. Whole elements are compared (not
# substrings, not regular expressions), so "Bed" does not match "Bedroom".
# Returns a logical vector the same length as `a`; NA where either side is NA.
shares_location <- function(a, b) {
  vapply(seq_along(a), function(i) {
    if (is.na(a[i]) || is.na(b[i])) {
      return(NA)
    }
    first <- strsplit(a[i], ",", fixed = TRUE)[[1L]]
    second <- strsplit(b[i], ",", fixed = TRUE)[[1L]]
    any(first %in% second)
  }, logical(1L))
}

# match_all_pred is stored as numeric 0/1 (hence the `+ 0`);
# match_subacon stays logical.
DT[, match_all_pred := shares_location(
  P1_location_all_predictors, P2_location_all_predictors
) + 0]
DT[, match_subacon := shares_location(
  P1_location_subacon, P2_location_subacon
)]

      

I chose to create two columns rather than your single 0/1/2 code; the single code makes the code less straightforward, as you have to rely on nested ifs. I also think that two columns are better, as you can clearly distinguish the cases F/F, T/F, F/T and T/T.



If you must create the 0/1/2 column, you can call

# Collapse the two match flags into a single score: 2 whenever the subacon
# locations match, otherwise the value of match_all_pred.
DT[, MyCol := 2 * match_subacon + match_all_pred * (1 - match_subacon)]

      

which assumes that a subacon match takes precedence over an all-predictors match (the result is 2 whenever the subacon locations match).

+3


source


Here's another way:



# Convert every column (the sample data uses factors) to character.
# lapply() keeps one list element per column, so this also works for a
# one-row data frame, where sapply() would simplify to a single vector and
# data.frame() would then build one column instead of one row.
myData <- data.frame(lapply(myData, as.character), stringsAsFactors = FALSE)

# TRUE when setA and setB have at least one element in common, else FALSE.
doesIntersect <- function(setA, setB) {
  any(setA %in% setB)
}

# Split the comma-separated location strings into one character vector per
# row. as.character() guards against factor columns, which strsplit() rejects.
p1_all <- strsplit(as.character(myData$P1_location_all_predictors), ",", fixed = TRUE)
p2_all <- strsplit(as.character(myData$P2_location_all_predictors), ",", fixed = TRUE)
p1_sub <- strsplit(as.character(myData$P1_location_subacon), ",", fixed = TRUE)
p2_sub <- strsplit(as.character(myData$P2_location_subacon), ",", fixed = TRUE)

# mapply() returns an empty list for zero-row input; as.logical() turns that
# into logical(0) so the subsetting below stays valid.
shares_any <- as.logical(mapply(doesIntersect, p1_all, p2_all))
same_subacon <- as.logical(mapply(setequal, p1_sub, p2_sub))

# Score: 0 by default, 1 when the all-predictors lists share a location,
# 2 when the subacon locations are equal (assigned last, so it wins).
# rep() sizes the column to the data, so an empty data frame also works.
myData$Correct <- rep(0, nrow(myData))
myData$Correct[shares_any] <- 1
myData$Correct[same_subacon] <- 2

> myData$Correct
[1] 1 1 2 2 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2

      

+2


source







All Articles