Python Pandas fill data field with missing values

I have this data file as an example

import pandas as pd

#create dataframe
df = pd.DataFrame([['DE', 'Table',201705,201705, 1000], ['DE', 'Table',201705,201704, 1000],\
                   ['DE', 'Table',201705,201702, 1000], ['DE', 'Table',201705,201701, 1000],\
                   ['AT', 'Table',201708,201708, 1000], ['AT', 'Table',201708,201706, 1000],\
                   ['AT', 'Table',201708,201705, 1000], ['AT', 'Table',201708,201704, 1000]],\
                   columns=['ISO','Product','Billed Week', 'Created Week', 'Billings'])
print (df)

  ISO Product  Billed Week  Created Week  Billings
0  DE   Table       201705        201705      1000
1  DE   Table       201705        201704      1000
2  DE   Table       201705        201702      1000
3  DE   Table       201705        201701      1000
4  AT   Table       201708        201708      1000
5  AT   Table       201708        201706      1000
6  AT   Table       201708        201705      1000
7  AT   Table       201708        201704      1000

      

What I need to do is fill in some missing data with 0 Billings for each [ISO, Product] group where there is a gap in the sequence, i.e. no billing has been created during the week, therefore it is absent. It should be based on the high of the week and the low of the week of the week. those. these are combinations that must be complete without interrupting the sequence.

So, for the above, the missing entries that I need to programmatically add to the database are below:

  ISO Product  Billed Week  Created Week  Billings
0  DE   Table       201705        201703         0
1  AT   Table       201708        201707         0

      

+3


source to share


3 answers


def seqfix(x):
    s = x['Created Week']
    x = x.set_index('Created Week')
    x = x.reindex(range(min(s), max(s)+1))
    x['Billings'] = x['Billings'].fillna(0)
    x = x.ffill().reset_index()
    return x

df = df.groupby(['ISO', 'Billed Week']).apply(seqfix).reset_index(drop=True)
df[['Billed Week', 'Billings']] = df[['Billed Week', 'Billings']].astype(int)
df = df[['ISO', 'Product', 'Billed Week', 'Created Week', 'Billings']]

print(df)

  ISO Product  Billed Week  Created Week  Billings
0  AT   Table       201708        201704      1000
1  AT   Table       201708        201705      1000
2  AT   Table       201708        201706      1000
3  AT   Table       201708        201707         0
4  AT   Table       201708        201708      1000
5  DE   Table       201705        201701      1000
6  DE   Table       201705        201702      1000
7  DE   Table       201705        201703         0
8  DE   Table       201705        201704      1000
9  DE   Table       201705        201705      1000

      



+2


source


Here is my solution. I believe some genius will provide the best solution. ~ Let's wait for him ~



df1=df.groupby('ISO').agg({'Billed Week' : np.max,'Created Week' : np.min})
df1['ISO']=df1.index

     Created Week  Billed Week ISO
ISO                               
AT         201704       201708  AT
DE         201701       201705  DE

ISO=[]
BilledWeek=[]
CreateWeek=[]
for i in range(len(df1)):
    BilledWeek.extend([df1.ix[i,1]]*(df1.ix[i,1]-df1.ix[i,0]+1))
    CreateWeek.extend(list(range(df1.ix[i,0],df1.ix[i,1]+1)))
    ISO.extend([df1.ix[i,2]]*(df1.ix[i,1]-df1.ix[i,0]+1))
DF=pd.DataFrame({'BilledWeek':BilledWeek,'CreateWeek':CreateWeek,'ISO':ISO})
Target=DF.merge(df,left_on=['BilledWeek','CreateWeek','ISO'],right_on=['Billed Week','Created Week','ISO'],how='left')
Target.Billings.fillna(0,inplace=True)
Target=Target.drop(['Billed Week',  'Created Week'],axis=1)
Target['Product']=Target.groupby('ISO')['Product'].ffill()

Out[75]: 
   BilledWeek  CreateWeek ISO Product  Billings
0      201708      201704  AT   Table    1000.0
1      201708      201705  AT   Table    1000.0
2      201708      201706  AT   Table    1000.0
3      201708      201707  AT   Table       0.0
4      201708      201708  AT   Table    1000.0
5      201705      201701  DE   Table    1000.0
6      201705      201702  DE   Table    1000.0
7      201705      201703  DE   Table       0.0
8      201705      201704  DE   Table    1000.0
9      201705      201705  DE   Table    1000.0

      

+3


source


Create a MultiIndex with all breaks in earned weeks and then re-index the original DF.

idx = (df.groupby(['Billed Week'])
       .apply(lambda x: [(x['ISO'].min(),
                          x['Product'].min(),
                          x['Billed Week'].min(),
                          e) for e in range(x['Created Week'].min(), x['Created Week'].max()+1)])
       .tolist()
)

multi_idx = pd.MultiIndex.from_tuples(sum(idx,[]),names=['ISO','Product','Billed Week','Created Week'])

(df.set_index(['ISO','Product','Billed Week','Created Week'])
     .reindex(multi_idx)
     .reset_index()
     .fillna(0)
)

Out[671]: 
  ISO Product  Billed Week  Created Week  Billings
0  DE   Table       201705        201701    1000.0
1  DE   Table       201705        201702    1000.0
2  DE   Table       201705        201703       0.0
3  DE   Table       201705        201704    1000.0
4  DE   Table       201705        201705    1000.0
5  AT   Table       201708        201704    1000.0
6  AT   Table       201708        201705    1000.0
7  AT   Table       201708        201706    1000.0
8  AT   Table       201708        201707       0.0
9  AT   Table       201708        201708    1000.0

      

+2


source







All Articles