Return the date of the first and last occurrence of a factor variable

Question

Return the date of the first and last occurrence of a factor variable

Problem

I have a dataframe in which each line marks an exchange between companies, where companies give and receive something on a specific date (they can give to another company or themselves). From this I would like to create a new dataframe with columns indicating when the company first started giving, when it first stopped giving, when it first started receiving, and when it first stopped receiving. Here's an example dataframe I'm starting from:

Example of initial data

samp <- structure(list(giver = structure(c(1L, 2L, 6L, 3L, 1L, 3L, 4L, 1L, 6L, 1L, 5L), .Label = c("A", "B", "C", "X", "Y", "Z"), class = "factor"), receiver = structure(c(1L, 2L, 2L, 3L, 1L, 3L, 3L, 1L, 2L, 1L, 2L), .Label = c("A", "B", "C"), class = "factor"), date = structure(c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 9L), .Label = c("2000-01-01", "2000-01-02", "2000-01-03", "2000-01-04", "2000-01-05", "2000-01-06", "2000-01-07", "2000-01-08", "2000-01-09"), class = "factor")), .Names = c("giver", "receiver", "date"), class = "data.frame", row.names = c(NA, -11L))
samp$date <- as.Date(samp$date, "%Y-%m-%d") # Format date variable

samp 
giver receiver   date
A        A     2000-01-01
B        B     2000-01-01
Z        B     2000-01-02
C        C     2000-01-03
A        A     2000-01-04
C        C     2000-01-05
X        C     2000-01-06
A        A     2000-01-07
Z        B     2000-01-08
A        A     2000-01-09
Y        B     2000-01-09

However, I'm having a hard time figuring out how to scan one column for the first and last occurrence of each company, and return the date value of the other column. I found similar questions here and here using match

, duplicated

or tapply

, but can't be enough to match what I'm trying to do. Here is an example dataframe that I hope to end up with:

Desired end data

desire <- structure(list(company = structure(1:6, .Label = c("A", "B", "C", "X", "Y", "Z"), class = "factor"), start.giving = structure(c(1L, 1L, 3L, 4L, 5L, 2L), .Label = c("2000-01-01", "2000-01-02", "2000-01-03", "2000-01-05", "2000-01-09"), class = "factor"), stop.giving = structure(c(5L, 1L, 2L, 3L, 5L, 4L), .Label = c("2000-01-01", "2000-01-05", "2000-01-06", "2000-01-08", "2000-01-09"), class = "factor"), start.receiving = structure(c(1L, 1L, 2L, NA, NA, NA), .Label = c("2000-01-01", "2000-01-03"), class = "factor"), stop.receiving = structure(c(2L, 2L, 1L, NA, NA, NA), .Label = c("2000-01-06", "2000-01-09"), class = "factor")), .Names = c("company", "start.giving", "stop.giving", "start.receiving", "stop.receiving"), class = "data.frame", row.names = c(NA, -6L))

desire 
company start.giving stop.giving start.receiving stop.receiving
A       2000-01-01   2000-01-09      2000-01-01     2000-01-09
B       2000-01-01   2000-01-01      2000-01-01     2000-01-09
C       2000-01-03   2000-01-05      2000-01-03     2000-01-06
X       2000-01-05   2000-01-06            <NA>           <NA>
Y       2000-01-09   2000-01-09            <NA>           <NA>
Z       2000-01-02   2000-01-08            <NA>           <NA>

+3

date r

coip Apr 21 15 at 22:25

source to share

4 answers

Version dplyr

:

library("dplyr")
giving <- samp %>% group_by(giver) %>%
    summarise(start.giving=min(date),
              stop.giving=max(date)) %>%
       rename(company=giver)
receiving <- samp %>% group_by(receiver) %>%
    summarise(start.receiving=min(date),
              stop.receiving=max(date)) %>%
       rename(company=receiver)
full_join(giving,receiving)

With even more work, it is possible to condense this even further / not repeat the whole code summarise

(similar to the function foo()

in @ Arun's answer) ...

foo <- function(x,f) {
    ss <- c("start","stop")
    group_by_(x,.dots=f) %>%
       summarise(start=min(date),
                 stop=max(date)) %>%
       rename_(.dots=c(company=f,
                       setNames(ss,paste(ss,f,sep="."))))
 }
 full_join(foo(samp,"giver"),foo(samp,"receiver"))

... although the code is now less transparent, it is actually not shorter ... it would be helpful if you did this a lot.

+3

Ben bolker Apr 22. 15 at 18:36

source to share

Using devel version data.table

, 1.9.5 , here's another version using the new feature dcast

:

require(data.table) ## v1.9.5+
foo <- function(x, col) {
    ans <- dcast(x, paste(col, "~ ."), value.var="date", fun=list(min, max))
    setnames(ans, c("company", "start", "stop"))
}
setDT(samp)
merge(foo(samp, "giver"), foo(samp, "receiver"), by = "company", 
        all=TRUE, suffixes=c(".giving", ".receiving"))
#    company start.giving stop.giving start.receiving stop.receiving
# 1:       A   2000-01-01  2000-01-09      2000-01-01     2000-01-09
# 2:       B   2000-01-01  2000-01-01      2000-01-01     2000-01-09
# 3:       C   2000-01-03  2000-01-05      2000-01-03     2000-01-06
# 4:       X   2000-01-06  2000-01-06            <NA>           <NA>
# 5:       Y   2000-01-09  2000-01-09            <NA>           <NA>
# 6:       Z   2000-01-02  2000-01-08            <NA>           <NA>

+2

Arun Apr 22. 15 at 17:52

source to share

This can be done in the R database using the commands aggregate

and merge

:

# Import starting sample data
samp <- structure(list(giver = structure(c(1L, 2L, 6L, 3L, 1L, 3L, 4L, 1L, 6L, 1L, 5L), .Label = c("A", "B", "C", "X", "Y", "Z"), class = "factor"), receiver = structure(c(1L, 2L, 2L, 3L, 1L, 3L, 3L, 1L, 2L, 1L, 2L), .Label = c("A", "B", "C"), class = "factor"), date = structure(c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 9L), .Label = c("2000-01-01", "2000-01-02", "2000-01-03", "2000-01-04", "2000-01-05", "2000-01-06", "2000-01-07", "2000-01-08", "2000-01-09"), class = "factor")), .Names = c("giver", "receiver", "date"), class = "data.frame", row.names = c(NA, -11L))
samp$date <- as.Date(samp$date, "%Y-%m-%d") # Format date variable

# Find first and last occurrence by date
g1 <- aggregate(samp$date, list(samp$giver), min)
colnames(g1)[1] = "company"
colnames(g1)[2] = "start.giving"

g2 <- aggregate(samp$date, list(samp$giver), max)
colnames(g2)[1] = "company"
colnames(g2)[2] = "stop.giving"

s1 <- aggregate(samp$date, list(samp$receiver), min)
colnames(s1)[1] = "company"
colnames(s1)[2] = "start.receiving"

s2 <- aggregate(samp$date, list(samp$receiver), max)
colnames(s2)[1] = "company"
colnames(s2)[2] = "stop.receiving"

# Merge data frames by company name
a1 <- merge(g1, g2, by=c("company"))
b1 <- merge(s1, s2, by=c("company"))
c1 <- merge(a1, b1, by=c("company"), all.x = TRUE)

c1 # Display desired data frame
company start.giving stop.giving start.receiving stop.receiving
 A       2000-01-01  2000-01-09     2000-01-01     2000-01-09
 B       2000-01-01  2000-01-01     2000-01-01     2000-01-09
 C       2000-01-03  2000-01-05     2000-01-03     2000-01-06
 X       2000-01-06  2000-01-06         <NA>           <NA>
 Y       2000-01-09  2000-01-09         <NA>           <NA>
 Z       2000-01-02  2000-01-08         <NA>           <NA>

+1

coip Apr 22. 15 at 13:48

source to share

David Arenburg · Accepted Answer · 2015-04-22T14:18:51+0000

Here's a slightly simplified version with a package data.table

library(data.table)
setDT(samp)
Res1 <- samp[, .(start = min(date), stop = max(date)), by = .(company = giver)]
Res2 <- samp[, .(start = min(date), stop = max(date)), by = .(company = receiver)]
merge(Res1, Res2, by = "company", all = TRUE, suffixes = c(".giving", ".receiving"))
#    company start.giving stop.giving start.receiving stop.receiving
# 1:       A   2000-01-01  2000-01-09      2000-01-01     2000-01-09
# 2:       B   2000-01-01  2000-01-01      2000-01-01     2000-01-09
# 3:       C   2000-01-03  2000-01-05      2000-01-03     2000-01-06
# 4:       X   2000-01-06  2000-01-06            <NA>           <NA>
# 5:       Y   2000-01-09  2000-01-09            <NA>           <NA>
# 6:       Z   2000-01-02  2000-01-08            <NA>           <NA>

Return the date of the first and last occurrence of a factor variable

More articles: