Easy way to change the column type of a DataFrame but use a default value for errors?

Suppose I have the following column.

>>> import pandas
>>> a = pandas.Series(['0', '1', '5', '1', None, '3', 'Cat', '2'])

      

I would like to convert all the data in the column to type `int`, and any item that cannot be converted should be replaced with `0`.

My current solution is to use `to_numeric` with the `'coerce'` option, fill any `NaN` with `0`, and then convert to `int` (since the presence of `NaN` made the column `float` instead of `int`).

>>> pandas.to_numeric(a, errors='coerce').fillna(0).astype(int)
0    0
1    1
2    5
3    1
4    0
5    3
6    0
7    2
dtype: int64

      

Is there some method that would allow me to do this in one step, instead of going through two intermediate states? I'm looking for something that would behave like the following imaginary option to `astype`:

>>> a.astype(int, value_on_error=0)

      

+3


source to share


2 answers


Option 1

# Unparseable entries -> NaN, NaN -> 0, then cast the float result to int.
(
    pd.to_numeric(a, errors='coerce')
    .fillna(0)
    .astype(int)
)

      




Option 2

# Same coercion as before, but the NaN slots are overwritten through a
# boolean mask instead of fillna.
b = pd.to_numeric(a, errors='coerce')
b.mask(b.isna(), 0).astype(int)

      




Option 3

def try_int(x):
    """Return ``int(x)``, or 0 when ``x`` cannot be converted to an int."""
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other non-numeric objects; ValueError: text
        # such as 'Cat' or a float NaN; OverflowError: a float infinity.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return 0

a.apply(try_int)

      




Option 4

# Render every entry as text so that None and other non-string values can be
# run through the digit test as well (they simply fail it).
# np.char is the public home of isdigit; np.core.defchararray is a private
# path that is deprecated in NumPy 2.
i = np.char.isdigit(np.asarray(a, dtype=str))

# Start from zeros so only the convertible positions need to be filled in.
# NOTE: only plain digit strings pass the test; '-3', '1.5' and '' become 0.
b = np.zeros(a.shape, dtype=int)
b[i] = a[i].astype(int)

pd.Series(b, a.index)

      




All produce

0    0
1    1
2    5
3    1
4    0
5    3
6    0
7    2
dtype: int64

      




Timing
Code Below

enter image description here

def pir1(a):
    """Parse ``a`` as numbers, turn unparseable entries into 0 and cast to int."""
    numeric = pd.to_numeric(a, errors='coerce')
    filled = numeric.fillna(0)
    return filled.astype(int)

def pir2(a):
    """Parse ``a`` as numbers, mask the unparseable entries with 0 and cast to int."""
    coerced = pd.to_numeric(a, errors='coerce')
    missing = coerced.isna()
    return coerced.mask(missing, 0).astype(int)

def try_int(x):
    """Return ``int(x)``, or 0 when ``x`` cannot be converted to an int."""
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other non-numeric objects; ValueError: text
        # such as 'Cat' or a float NaN; OverflowError: a float infinity.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return 0

def pir3(a):
    """Run ``try_int`` over every element of ``a`` and return the result."""
    converted = a.apply(try_int)
    return converted

def pir4(a):
    """Convert ``a`` to an integer Series, mapping non-digit entries to 0.

    An entry is converted only when its text form consists solely of digit
    characters; everything else ('-3', '1.5', '', None, 'Cat', ...) becomes 0.
    The result keeps the index of ``a``.
    """
    # Render every entry as text so that None and other non-string values can
    # be run through the digit test as well (they simply fail it).
    # np.char is the public home of isdigit; np.core.defchararray is a
    # private path that is deprecated in NumPy 2.
    is_digit = np.char.isdigit(np.asarray(a, dtype=str))

    # Start from zeros so only the convertible positions need to be filled in.
    out = np.zeros(a.shape, dtype=int)
    out[is_digit] = a[is_digit].astype(int)
    return pd.Series(out, a.index)


def alt1(a):
    """Replace entries that are not numeric strings with 0, then parse to numbers."""
    numeric_mask = a.str.isnumeric()
    cleaned = a.where(numeric_mask, 0)
    return pd.to_numeric(cleaned)

from timeit import timeit

# One row per input-size multiplier, one column per candidate function.
# dtype=float so the timings are stored (and later plotted) as numbers
# rather than as generic Python objects.
results = pd.DataFrame(
    index=[1, 3, 10, 30, 100, 300, 1000, 3000, 10000],
    columns='pir1 pir2 pir3 pir4 alt1'.split(),
    dtype=float,
)

for i in results.index:
    # Repeat the sample Series i times to get progressively larger inputs.
    c = pd.concat([a] * i, ignore_index=True)
    for j in results.columns:
        stmt = '{}(c)'.format(j)
        # timeit runs in its own namespace, so pull the data and the function
        # under test in from the main module.
        setp = 'from __main__ import c, {}'.format(j)
        # DataFrame.set_value was removed in pandas 1.0; .at is the scalar
        # label-based setter that replaces it.
        results.at[i, j] = timeit(stmt, setp, number=10)

results.plot(logx=True, logy=True)

      

+2


source


# Keep the entries that are numeric strings, put 0 everywhere else, cast to int.
a.where(a.str.isnumeric(), other=0).astype(int)

      

Output:



0    0
1    1
2    5
3    1
4    0
5    3
6    0
7    2
dtype: int64

      

+1


source







All Articles