Pandas: how to count incidents of categorical features grouped by ID

Suppose I have this DataFrame df

My_ID   My_CAT
  1       A
  2       B  
  3       C
  1       A  
  1       B 
  2       D 

      

I would like to know how many occurrences of each individual value of My_CAT there are for each individual My_ID.

I need it in a dense array like

My_ID   A    B    C   D
  1     2    1    0   0
  2     0    1    0   1
  3     0    0    1   0

      

I tried

df.groupby(['My_ID','My_CAT']).count()

      

but although I can see that the data is grouped according to my needs, events are not counted.

+3


source to share


2 answers


Use crosstab

(less typing, slower):

df = pd.crosstab(df['My_ID'], df['My_CAT'])
print (df)
My_CAT  A  B  C  D
My_ID             
1       2  1  0  0
2       0  1  0  1
3       0  0  1  0

      

Faster solution: groupby + the size aggregation + unstack:

df = df.groupby(['My_ID','My_CAT']).size().unstack(fill_value=0)
print (df)
My_CAT  A  B  C  D
My_ID             
1       2  1  0  0
2       0  1  0  1
3       0  0  1  0

      

And the last one:



df = df.reset_index().rename_axis(None, axis=1)
print (df)
   My_ID  A  B  C  D
0      1  2  1  0  0
1      2  0  1  0  1
2      3  0  0  1  0

      

Note:

What is the difference between size and count in pandas?

Timings (larger data):

np.random.seed(123)
N = 100000
L = list('abcdefghijklmno')
df = pd.DataFrame({'My_CAT': np.random.choice(L, N),
                   'My_ID':np.random.randint(1000,size=N)})
print (df)

In [79]: %timeit pd.crosstab(df['My_ID'], df['My_CAT'])
10 loops, best of 3: 96.7 ms per loop

In [80]: %timeit df.groupby(['My_ID','My_CAT']).size().unstack(fill_value=0)
100 loops, best of 3: 14.2 ms per loop

In [81]: %timeit pd.get_dummies(df.My_CAT).groupby(df.My_ID).sum()
10 loops, best of 3: 25.5 ms per loop

In [82]: %timeit df.groupby('My_ID').My_CAT.value_counts().unstack(fill_value=0)
10 loops, best of 3: 25.4 ms per loop

In [136]: %timeit xtab_df(df, 'My_ID', 'My_CAT')
100 loops, best of 3: 4.23 ms per loop

In [137]: %timeit xtab(df, 'My_ID', 'My_CAT')
100 loops, best of 3: 4.61 ms per loop

      

+3


source


pd.get_dummies + groupby

pd.get_dummies(df.My_CAT).groupby(df.My_ID).sum().reset_index()

   My_ID  A  B  C  D
0      1  2  1  0  0
1      2  0  1  0  1
2      3  0  0  1  0

      

groupby + value_counts

df.groupby('My_ID').My_CAT.value_counts() \
  .unstack(fill_value=0).rename_axis(None, 1).reset_index()

   My_ID  A  B  C  D
0      1  2  1  0  0
1      2  0  1  0  1
2      3  0  0  1  0

      

factorize and numba


This is my experimental proposal

from numba import njit
import pandas as pd
import numpy as np

@njit
def xtab_array(f1, f2, m, n):
    """Count co-occurrences of integer codes: v[i, j] = #positions where f1 == i and f2 == j.

    f1 and f2 are equal-length integer code arrays (e.g. from pd.factorize)
    with values in [0, m) and [0, n) respectively. Returns an (m, n) int64
    count matrix.
    """
    # Allocate the counter matrix directly; the original
    # np.arange(m * n).reshape(m, n) * 0 built a throwaway range first.
    v = np.zeros((m, n), dtype=np.int64)
    for i in range(f1.size):
        v[f1[i], f2[i]] += 1
    return v

def xtab_df(df, c1, c2):
    """Cross-tabulate df[c1] vs df[c2] using factorize + the numba counting kernel.

    Returns a DataFrame with one row per unique value of c1 (in order of
    first appearance), a leading column named after c1, and one count
    column per unique value of c2.
    """
    f1, u1 = pd.factorize(df[c1].values)
    f2, u2 = pd.factorize(df[c2].values)
    v = xtab_array(f1, f2, u1.size, u2.size)
    # Name the leading column after c1 instead of the hard-coded 'My_ID',
    # so the helper works for arbitrary column names (identical output for
    # the question's data, where c1 == 'My_ID').
    return pd.DataFrame(
        np.column_stack([u1, v]), columns=[c1] + u2.tolist()
    )

xtab_df(df, 'My_ID', 'My_CAT')

   My_ID  A  B  C  D
0      1  2  1  0  0
1      2  0  1  0  1
2      3  0  0  1  0

      

clean numpy



def xtab(df, c1, c2):
    """Pure-numpy cross-tabulation of df[c1] vs df[c2].

    Returns a DataFrame with one row per unique value of c1 (in order of
    first appearance), a leading column named after c1, and one count
    column per unique value of c2.
    """
    f1, u1 = pd.factorize(df[c1].values)
    f2, u2 = pd.factorize(df[c2].values)
    n, m = u1.size, u2.size
    # Encode each (group, category) pair as a single integer and count.
    # minlength guarantees a full n*m vector even when trailing
    # combinations never occur, replacing the manual np.append zero-padding.
    v = np.bincount(f1 * m + f2, minlength=n * m).reshape(n, m)
    # Name the leading column after c1 instead of the hard-coded 'My_ID',
    # so the helper works for arbitrary column names.
    return pd.DataFrame(
        np.column_stack([u1, v]), columns=[c1] + u2.tolist()
    )

xtab(df, 'My_ID', 'My_CAT')

   My_ID  A  B  C  D
0      1  2  1  0  0
1      2  0  1  0  1
2      3  0  0  1  0

      


Timing
small data

%timeit pd.crosstab(df['My_ID'], df['My_CAT'])
%timeit df.groupby(['My_ID','My_CAT']).size().unstack(fill_value=0)
%timeit pd.get_dummies(df.My_CAT).groupby(df.My_ID).sum()
%timeit df.groupby('My_ID').My_CAT.value_counts().unstack(fill_value=0)
%timeit xtab_df(df, 'My_ID', 'My_CAT')
%timeit xtab(df, 'My_ID', 'My_CAT')

100 loops, best of 3: 5.21 ms per loop
1000 loops, best of 3: 1.23 ms per loop
1000 loops, best of 3: 1.2 ms per loop
1000 loops, best of 3: 1.23 ms per loop
1000 loops, best of 3: 280 µs per loop
1000 loops, best of 3: 298 µs per loop

      

@jezrael big data

np.random.seed(123)
N = 100000
L = list('abcdefghijklmno')
df = pd.DataFrame({'My_CAT': np.random.choice(L, N),
                   'My_ID':np.random.randint(1000,size=N)})

%timeit pd.crosstab(df['My_ID'], df['My_CAT'])
%timeit df.groupby(['My_ID','My_CAT']).size().unstack(fill_value=0)
%timeit pd.get_dummies(df.My_CAT).groupby(df.My_ID).sum()
%timeit df.groupby('My_ID').My_CAT.value_counts().unstack(fill_value=0)
%timeit xtab_df(df, 'My_ID', 'My_CAT')
%timeit xtab(df, 'My_ID', 'My_CAT')

10 loops, best of 3: 82.6 ms per loop
100 loops, best of 3: 10.7 ms per loop
100 loops, best of 3: 15.6 ms per loop
10 loops, best of 3: 19.9 ms per loop
100 loops, best of 3: 3.01 ms per loop
100 loops, best of 3: 3.22 ms per loop

      

+2


source







All Articles