Return the date of the first and last occurrence of a factor variable
Problem
I have a dataframe in which each line marks an exchange between companies, where companies give and receive something on a specific date (they can give to another company or themselves). From this I would like to create a new dataframe with columns indicating when the company first started giving, when it first stopped giving, when it first started receiving, and when it first stopped receiving. Here's an example dataframe I'm starting from:
Example of initial data
samp <- structure(list(giver = structure(c(1L, 2L, 6L, 3L, 1L, 3L, 4L, 1L, 6L, 1L, 5L), .Label = c("A", "B", "C", "X", "Y", "Z"), class = "factor"), receiver = structure(c(1L, 2L, 2L, 3L, 1L, 3L, 3L, 1L, 2L, 1L, 2L), .Label = c("A", "B", "C"), class = "factor"), date = structure(c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 9L), .Label = c("2000-01-01", "2000-01-02", "2000-01-03", "2000-01-04", "2000-01-05", "2000-01-06", "2000-01-07", "2000-01-08", "2000-01-09"), class = "factor")), .Names = c("giver", "receiver", "date"), class = "data.frame", row.names = c(NA, -11L))
samp$date <- as.Date(samp$date, "%Y-%m-%d") # Format date variable
samp
giver receiver date
A A 2000-01-01
B B 2000-01-01
Z B 2000-01-02
C C 2000-01-03
A A 2000-01-04
C C 2000-01-05
X C 2000-01-06
A A 2000-01-07
Z B 2000-01-08
A A 2000-01-09
Y B 2000-01-09
However, I'm having a hard time figuring out how to scan one column for the first and last occurrence of each company, and return the date value of the other column. I found similar questions here and here using match
, duplicated
or tapply
, but can't be enough to match what I'm trying to do. Here is an example dataframe that I hope to end up with:
Desired end data
desire <- structure(list(company = structure(1:6, .Label = c("A", "B", "C", "X", "Y", "Z"), class = "factor"), start.giving = structure(c(1L, 1L, 3L, 4L, 5L, 2L), .Label = c("2000-01-01", "2000-01-02", "2000-01-03", "2000-01-05", "2000-01-09"), class = "factor"), stop.giving = structure(c(5L, 1L, 2L, 3L, 5L, 4L), .Label = c("2000-01-01", "2000-01-05", "2000-01-06", "2000-01-08", "2000-01-09"), class = "factor"), start.receiving = structure(c(1L, 1L, 2L, NA, NA, NA), .Label = c("2000-01-01", "2000-01-03"), class = "factor"), stop.receiving = structure(c(2L, 2L, 1L, NA, NA, NA), .Label = c("2000-01-06", "2000-01-09"), class = "factor")), .Names = c("company", "start.giving", "stop.giving", "start.receiving", "stop.receiving"), class = "data.frame", row.names = c(NA, -6L))
desire
company start.giving stop.giving start.receiving stop.receiving
A 2000-01-01 2000-01-09 2000-01-01 2000-01-09
B 2000-01-01 2000-01-01 2000-01-01 2000-01-09
C 2000-01-03 2000-01-05 2000-01-03 2000-01-06
X 2000-01-05 2000-01-06 <NA> <NA>
Y 2000-01-09 2000-01-09 <NA> <NA>
Z 2000-01-02 2000-01-08 <NA> <NA>
source to share
Here's a slightly simplified version with a package data.table
library(data.table)
setDT(samp)
Res1 <- samp[, .(start = min(date), stop = max(date)), by = .(company = giver)]
Res2 <- samp[, .(start = min(date), stop = max(date)), by = .(company = receiver)]
merge(Res1, Res2, by = "company", all = TRUE, suffixes = c(".giving", ".receiving"))
# company start.giving stop.giving start.receiving stop.receiving
# 1: A 2000-01-01 2000-01-09 2000-01-01 2000-01-09
# 2: B 2000-01-01 2000-01-01 2000-01-01 2000-01-09
# 3: C 2000-01-03 2000-01-05 2000-01-03 2000-01-06
# 4: X 2000-01-06 2000-01-06 <NA> <NA>
# 5: Y 2000-01-09 2000-01-09 <NA> <NA>
# 6: Z 2000-01-02 2000-01-08 <NA> <NA>
source to share
Version dplyr
:
library("dplyr")
giving <- samp %>% group_by(giver) %>%
summarise(start.giving=min(date),
stop.giving=max(date)) %>%
rename(company=giver)
receiving <- samp %>% group_by(receiver) %>%
summarise(start.receiving=min(date),
stop.receiving=max(date)) %>%
rename(company=receiver)
full_join(giving,receiving)
With even more work, it is possible to condense this even further / not repeat the whole code summarise
(similar to the function foo()
in @ Arun's answer) ...
foo <- function(x,f) {
ss <- c("start","stop")
group_by_(x,.dots=f) %>%
summarise(start=min(date),
stop=max(date)) %>%
rename_(.dots=c(company=f,
setNames(ss,paste(ss,f,sep="."))))
}
full_join(foo(samp,"giver"),foo(samp,"receiver"))
... although the code is now less transparent, it is actually not shorter ... it would be helpful if you did this a lot.
source to share
Using devel version data.table
, 1.9.5 , here's another version using the new feature dcast
:
require(data.table) ## v1.9.5+
foo <- function(x, col) {
ans <- dcast(x, paste(col, "~ ."), value.var="date", fun=list(min, max))
setnames(ans, c("company", "start", "stop"))
}
setDT(samp)
merge(foo(samp, "giver"), foo(samp, "receiver"), by = "company",
all=TRUE, suffixes=c(".giving", ".receiving"))
# company start.giving stop.giving start.receiving stop.receiving
# 1: A 2000-01-01 2000-01-09 2000-01-01 2000-01-09
# 2: B 2000-01-01 2000-01-01 2000-01-01 2000-01-09
# 3: C 2000-01-03 2000-01-05 2000-01-03 2000-01-06
# 4: X 2000-01-06 2000-01-06 <NA> <NA>
# 5: Y 2000-01-09 2000-01-09 <NA> <NA>
# 6: Z 2000-01-02 2000-01-08 <NA> <NA>
source to share
This can be done in the R database using the commands aggregate
and merge
:
# Import starting sample data
samp <- structure(list(giver = structure(c(1L, 2L, 6L, 3L, 1L, 3L, 4L, 1L, 6L, 1L, 5L), .Label = c("A", "B", "C", "X", "Y", "Z"), class = "factor"), receiver = structure(c(1L, 2L, 2L, 3L, 1L, 3L, 3L, 1L, 2L, 1L, 2L), .Label = c("A", "B", "C"), class = "factor"), date = structure(c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 9L), .Label = c("2000-01-01", "2000-01-02", "2000-01-03", "2000-01-04", "2000-01-05", "2000-01-06", "2000-01-07", "2000-01-08", "2000-01-09"), class = "factor")), .Names = c("giver", "receiver", "date"), class = "data.frame", row.names = c(NA, -11L))
samp$date <- as.Date(samp$date, "%Y-%m-%d") # Format date variable
# Find first and last occurrence by date
g1 <- aggregate(samp$date, list(samp$giver), min)
colnames(g1)[1] = "company"
colnames(g1)[2] = "start.giving"
g2 <- aggregate(samp$date, list(samp$giver), max)
colnames(g2)[1] = "company"
colnames(g2)[2] = "stop.giving"
s1 <- aggregate(samp$date, list(samp$receiver), min)
colnames(s1)[1] = "company"
colnames(s1)[2] = "start.receiving"
s2 <- aggregate(samp$date, list(samp$receiver), max)
colnames(s2)[1] = "company"
colnames(s2)[2] = "stop.receiving"
# Merge data frames by company name
a1 <- merge(g1, g2, by=c("company"))
b1 <- merge(s1, s2, by=c("company"))
c1 <- merge(a1, b1, by=c("company"), all.x = TRUE)
c1 # Display desired data frame
company start.giving stop.giving start.receiving stop.receiving
A 2000-01-01 2000-01-09 2000-01-01 2000-01-09
B 2000-01-01 2000-01-01 2000-01-01 2000-01-09
C 2000-01-03 2000-01-05 2000-01-03 2000-01-06
X 2000-01-06 2000-01-06 <NA> <NA>
Y 2000-01-09 2000-01-09 <NA> <NA>
Z 2000-01-02 2000-01-08 <NA> <NA>
source to share