Pandas (python) dataframe filter based on partial strings in a list

I have a pandas dataframe with 99 dx1-dx99 columns and 99 px1-px99 columns. These columns contain codes of varying lengths — 4 to 8 characters and digits. I want to keep only the cell values whose first three characters match an entry in a list; the list contains only three-character strings. The list is generated dynamically and can be long, so I cannot hard-code each prefix as a separate literal. For example, I have this dataframe:

# Build the example frame: two string columns (A, B) and two integer
# columns (C, D).  print(df) was mis-indented in the original snippet,
# which makes it an IndentationError when run as a script.
df = pd.DataFrame({'A': 'foo bar one123 bar foo one324 foo 0'.split(),
                   'B': 'one546 one765 twosde three twowef two234 onedfr three'.split(),
                   'C': np.arange(8), 'D': np.arange(8) * 2})
print(df)

        A       B  C   D
0     foo  one546  0   0
1       0  one765  1   2
2  one123  twosde  2   4
3     bar   three  3   6
4     foo  twowef  4   8
5  one324  two234  5  10
6     foo  onedfr  6  12
7       0   three  7  14

      

The filled cells are of object dtype, and all the zeros were originally NULL values, which I replaced with zeros using df.fillna(0).

When I do this:

      

keep = df.iloc[:,:].isin(['one123', 'one324', 'twosde', 'two234']).values
df.iloc[:,:] = df.iloc[:,:].where(keep, 0)
print(df)

I got it:

        A       B  C  D
0       0       0  0  0
1       0       0  0  0
2  one123  twosde  0  0
3       0       0  0  0
4       0       0  0  0
5  one324  two234  0  0
6       0       0  0  0
7       0       0  0  0

      

But instead of passing in separate strings "one123", "one324", "twosde", "two234", I want to pass in a list containing partial strings like this:

startstrings = ['one', 'two']

# NOTE(review): DataFrame has no .contains method — this line raises
# AttributeError as written; the question asks how to express this
# prefix filter correctly.
keep = df.iloc[:,:].contains(startstrings)
df.iloc[:,:] = df.iloc[:,:].where(keep, 0)
print(df)

      

But the above won't work. I want to keep all content that starts with "one" or "two". Any idea how to implement? My dataset is huge and hence efficiency matters.

+3


source to share


4 answers


pandas str.contains

accepts regular expressions that allow you to test for any item in the list. Loop through each column and use str.contains:

startstrings = ['one', 'two']
# Regex alternation: the pattern matches any of the listed substrings.
pattern = '|'.join(startstrings)

for col in df:
    if all(df[col].apply(type) == str):
        # Zero out cells that do not contain any of the target substrings.
        # .ix was removed in pandas 1.0 — .loc is the supported equivalent.
        df.loc[~df[col].str.contains(pattern), col] = 0
    else:
        # Column is not all strings
        df[col] = 0

      



Outputs:

      A     B  C  D
0     0  one1  0  0
1     0  one1  0  0
2  one1  two1  0  0
3     0     0  0  0
4     0  two1  0  0
5  one1  two1  0  0
6     0  one1  0  0
7     0     0  0  0

      

+3


source


startstrings = ['one','two']
pat = '|'.join(startstrings)  # regex alternation over the prefixes

# Boolean-mask only the non-numeric columns; cells failing the match (and
# the numeric columns absent from the mask) become NaN and are then filled
# with 0.
# NOTE(review): str.contains matches the pattern anywhere in the string,
# not only at the start — anchor with '^' for a true prefix test.
df[df.select_dtypes(exclude=['number']).apply(lambda x: x.str.contains(pat))].fillna(0)

      

Output:



      A     B    C    D
0     0  one1  0.0  0.0
1     0  one1  0.0  0.0
2  one1  two1  0.0  0.0
3     0     0  0.0  0.0
4     0  two1  0.0  0.0
5  one1  two1  0.0  0.0
6     0  one1  0.0  0.0
7     0     0  0.0  0.0

      

0


source


Here's a vector NumPy approach -

# From http://stackoverflow.com/a/39045337/3293881
def slicer_vectorized(a, start, end):
    """Slice characters [start:end) out of every element of a 1-D 'S' array.

    Views the fixed-width bytes array one byte at a time, takes the wanted
    columns, and reassembles them as fixed-width 'S{end-start}' strings.
    np.fromstring / ndarray.tostring were removed/deprecated in modern
    NumPy; np.frombuffer / ndarray.tobytes are the supported equivalents.
    """
    b = a.view('S1').reshape(len(a), -1)[:, start:end]
    return np.frombuffer(b.tobytes(), dtype='S' + str(end - start))

def isin_chars(df, startstrings, start=0, stop=3):
    """Replace with 0 every cell whose [start:stop] slice is not in startstrings.

    Cells are stringified first (mirroring the original
    ``df.values.astype(str)``), so the function works on mixed dtypes.

    The original byte-view + searchsorted trick was Python-2 specific:
    on Python 3 ``astype(str)`` yields a unicode ('<U') array whose
    ``view('S1')`` slices UTF-32 storage bytes, not characters, and the
    unclipped ``searchsorted`` index could run past the end of the sorted
    prefix array.  This version compares character slices directly.
    """
    a = df.values.astype(str)
    prefixes = set(startstrings)
    flat = a.ravel()
    # One pass over the flattened cells; fromiter avoids an intermediate list.
    mask = np.fromiter((s[start:stop] in prefixes for s in flat),
                       dtype=bool, count=flat.size).reshape(a.shape)
    return df.mask(~mask, 0)

def process(df, startstrings, n=100):
    """Run the prefix filter over the dx1..dx{n-1} and px1..px{n-1} columns."""
    # Column order matters for the output: all dx columns, then all px columns.
    code_cols = np.hstack((['dx%d' % k for k in range(1, n)],
                           ['px%d' % k for k in range(1, n)]))
    subset = df[code_cols]
    return isin_chars(subset, startstrings, start=0, stop=3)

      

Example run -

In [245]: df
Out[245]: 
    dx1    dx2  px1  px2  0
0   foo   one1    0    0  0
1   bar   one1    1    2  7
2  one1   two1    2    4  3
3   bar  three    3    6  8
4   foo   two1    4    8  1
5  one1   two1    5   10  8
6   foo   one1    6   12  6
7   foo  three    7   14  6

In [246]: startstrings = ['two', 'one']

In [247]: process(df, startstrings, n = 3) # change n = 100 for actual case
Out[247]: 
    dx1   dx2  px1  px2
0     0  one1    0    0
1     0  one1    0    0
2  one1  two1    0    0
3     0     0    0    0
4     0  two1    0    0
5  one1  two1    0    0
6     0  one1    0    0
7     0     0    0    0

      

0


source


This is a brute-force approach, but it allows prefix strings of different lengths, as shown. I modified your example to use ['one1', 'th'] to demonstrate the different lengths. Not sure if you need that.

import numpy as np
import pandas as pd

# Example frame: two string columns (A, B) and two integer columns (C, D).
df = pd.DataFrame({'A': 'foo bar one1 bar foo one1 foo foo'.split(),
                   'B': 'one1 one1 two1 three two1 two1 one1 three'.split(),
                   'C': np.arange(8), 'D': np.arange(8) * 2})

prefixes = "one1 th".split()

matches = np.full(df.shape, False, dtype=bool)

# Series.str.startswith accepts a tuple of prefixes (pandas >= 1.4), so each
# column needs only one vectorized pass instead of one pass per prefix.
pfx_tuple = tuple(prefixes)
for i, col in enumerate(df.columns):
    try:
        matches[:, i] = df[col].str.startswith(pfx_tuple, na=False)
    except AttributeError:
        # Some columns have no strings — leave their mask entries False.
        pass

keep = df.where(matches, 0)
print(keep)

      

By running this, I get:

$ python test.py
      A      B  C  D
0     0   one1  0  0
1     0   one1  0  0
2  one1      0  0  0
3     0  three  0  0
4     0      0  0  0
5  one1      0  0  0
6     0   one1  0  0
7     0  three  0  0

      

0


source







All Articles