How can I reshape the data in my CSV file, swapping rows and columns?

I have a CSV file and my data format looks like this:

Countries  variable  1995   1996  1997  1998   1999
  USA        GDP      10     11    12    12     13 
  USA        Inf      100    120   130   120    110
  USA        Trade    200    220   210   235    250
  GER        GDP      8      9     9.5   10     10.5
  GER        Inf      100    105   107   109    111 
  GER        Trade    150    156   149   165    167 

      

I want to reshape the data into something like this:

  Countries  Years    GDP    Inf   Trade
     USA      1995     10    100    200
     USA      1996     11    120    220
     USA      1997     12    130    210
     USA      1998     12    120    235
     USA      1999     13    110    250
     GER      1995     8     100    150
     GER      1996     9     105    156
     GER      1997     9.5   107    149
     GER      1998     10    109    165
     GER      1999     10.5  111    167

      

I don't know how to do this in Python. I have imported my data into pandas, but the only thing I have managed is transposing rows and columns, which is not what I am looking for. I also tried csv.writerow, but could not get the data into this shape.



4 answers


You can use an OrderedDict to group the data:

import csv
from collections import OrderedDict,defaultdict
from itertools import islice
with open("out.csv") as f:
    od = OrderedDict()
    r = csv.reader(f, delimiter=" ")
    header = next(r)
    years = header[2:]
    zipped = zip(*r)  # transpose rows into columns (a list in Python 2)
    countries = OrderedDict.fromkeys(zipped[0]).keys()  # unique countries in file order; next(zipped) in Python 3
    it = iter(countries)
    for row in zip(*zipped[1:]):  # transpose back without the country column; zip(*zipped) in Python 3
        if row[0] == "GDP":  # each GDP row starts a new country block
            key = next(it)
            od.setdefault(key, defaultdict(list))
            od[key]["Years"] = years
            od[key]["Country"] = [key] * len(years)
        od[key][row[0]].extend(islice(row, 1, None))

      

Output:

OrderedDict([('USA', defaultdict(<type 'list'>, {'GDP': ['10', '11', '12', '12', '13'], 'Inf': ['100', '120', '130', '120', '110'], 'Years': ['1995', '1996', '1997', '1998', '1999'], 'Trade': ['200', '220', '210', '235', '250']})), ('GER', defaultdict(<type 'list'>, {'GDP': ['8', '9', '9.5', '10', '10.5'], 'Inf': ['100', '105', '107', '109', '111'], 'Years': ['1995', '1996', '1997', '1998', '1999'], 'Trade': ['150', '156', '149', '165', '167']}))])

      

Someone more experienced with pandas can no doubt suggest a better way of doing this, but the following will at least build the DataFrame:

import pandas as pd

df = pd.DataFrame(columns=["Country","Years","GDP","Inf","Trade"])

for k, v in od.items():
    df_temp = pd.DataFrame((v[k] for k in ["Country","Years","GDP","Inf","Trade"]),
                           ["Country","Years","GDP","Inf","Trade"]).transpose()
    df = df.append(df_temp, ignore_index=True)  # append returns a new frame, so reassign
print(df)

      



Output:

    Country Years   GDP  Inf Trade
0     USA  1995    10  100   200
1     USA  1996    11  120   220
2     USA  1997    12  130   210
3     USA  1998    12  120   235
4     USA  1999    13  110   250
5     GER  1995     8  100   150
6     GER  1996     9  105   156
7     GER  1997   9.5  107   149
8     GER  1998    10  109   165
9     GER  1999  10.5  111   167

      

If the file is bigger, you can also build the DataFrame as you go and reset the OrderedDict each time, so you never hold all the data in the dict at once; you just need to append the last group after the loop. We can also use itertools.islice to slice the rows and itertools.izip instead of zip when using Python 2:

import csv
import pandas as pd
from collections import OrderedDict, defaultdict
from itertools import islice, izip  # izip is Python 2 only; use the built-in zip in Python 3

df = pd.DataFrame(columns=["Country","Years","GDP","Inf","Trade"])
with open("out.csv") as f:
    od = OrderedDict()
    r = csv.reader(f, delimiter=" ")
    header = next(r)
    years = header[2:]
    zipped = izip(*r)
    countries = OrderedDict.fromkeys(next(zipped)).keys()
    it = iter(countries)
    for row in izip(*zipped):
        if row[0] == "GDP":
            if od:  # not the first country block, so flush the previous one
                for k, v in od.items():
                    df_temp = pd.DataFrame((v[k] for k in ["Country","Years","GDP","Inf","Trade"]),
                                           ["Country","Years","GDP","Inf","Trade"]).transpose()
                    df = df.append(df_temp, ignore_index=True)
                od = OrderedDict()  # reset so only one country is held in memory at a time
            key = next(it)
            od.setdefault(key, defaultdict(list))
            od[key]["Years"] = years
            od[key]["Country"] = [key] * len(years)
        od[key][row[0]].extend(islice(row, 1, None))

# the last country block is still sitting in od, so append it after the loop
for k, v in od.items():
    df_temp = pd.DataFrame((v[k] for k in ["Country","Years","GDP","Inf","Trade"]),
                           ["Country","Years","GDP","Inf","Trade"]).transpose()
    df = df.append(df_temp, ignore_index=True)

print(df)

      

Which should give the same result again:

  Country Years   GDP  Inf Trade
0     USA  1995    10  100   200
1     USA  1996    11  120   220
2     USA  1997    12  130   210
3     USA  1998    12  120   235
4     USA  1999    13  110   250
5     GER  1995     8  100   150
6     GER  1996     9  105   156
7     GER  1997   9.5  107   149
8     GER  1998    10  109   165
9     GER  1999  10.5  111   167
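If the end goal is a reshaped CSV file on disk rather than a DataFrame in memory, pandas can write the result straight back out; the output filename below is just an example:

df.to_csv("reshaped.csv", index=False)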

      



Assuming you have data in a list of lists:

>>> for line in data:
...     print('\t'.join(line))
...
USA  GDP    10   11   12   12   13
USA  Inf    100  120  130  120  110
USA  Trade  200  220  210  235  250
GER  GDP    8    9    9.5  10   10.5
GER  Inf    100  105  107  109  111
GER  Trade  150  156  149  165  167

      

with the following code:



from collections import defaultdict

data2 = defaultdict(dict)
for line in data:
    # line looks like [country, variable, value_1995, ..., value_1999]
    for i, year in ((2, 1995), (3, 1996), (4, 1997), (5, 1998), (6, 1999)):
        data2[(line[0], year)][line[1]] = line[i]

# one output row per (country, year), with the three variables as columns
data3 = [[i, j] + [data2[(i, j)][k] for k in ('GDP', 'Inf', 'Trade')] for i, j in data2]
for line in sorted(data3):
    print(line)

      

You get:

['GER', 1995, '8', '100', '150']
['GER', 1996, '9', '105', '156']
['GER', 1997, '9.5', '107', '149']
['GER', 1998, '10', '109', '165']
['GER', 1999, '10.5', '111', '167']
['USA', 1995, '10', '100', '200']
['USA', 1996, '11', '120', '220']
['USA', 1997, '12', '130', '210']
['USA', 1998, '12', '120', '235']
['USA', 1999, '13', '110', '250']
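For completeness, since the question mentions csv.writerow, here is a minimal sketch of loading the file into that list of lists and writing the reordered rows back out with the csv module. The filenames, whitespace splitting, and header row are assumptions, and it targets Python 3:

import csv

# read the whitespace-separated file, dropping the header row
with open("data.csv") as f:
    rows = [line.split() for line in f if line.strip()]
data = rows[1:]

# ... build data3 as above, then write one header row plus the reshaped rows
with open("reshaped.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerow(["Countries", "Years", "GDP", "Inf", "Trade"])
    w.writerows(sorted(data3))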

      



This answer will be very similar to @AmiTavory's deleted answer (using unstack instead of pivot_table, but they are equivalent here), with one extra step at the end to flatten the columns.
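Here df is assumed to be the original wide frame exactly as posted; a minimal way to read it from the CSV (the filename and the whitespace delimiter are assumptions) would be:

import pandas as pd

df = pd.read_csv("out.csv", delim_whitespace=True)

The reshape itself is then: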

df2 = pd.melt(df, id_vars=["Countries", "variable"], var_name="Years")
df2 = df2.set_index(["Countries", "Years", "variable"]).unstack().reset_index()
df2.columns = [x[1] if x[1] else x[0] for x in df2.columns]

      

produces

In [149]: df2
Out[149]: 
  Countries Years   GDP  Inf  Trade
0       GER  1995   8.0  100    150
1       GER  1996   9.0  105    156
2       GER  1997   9.5  107    149
3       GER  1998  10.0  109    165
4       GER  1999  10.5  111    167
5       USA  1995  10.0  100    200
6       USA  1996  11.0  120    220
7       USA  1997  12.0  130    210
8       USA  1998  12.0  120    235
9       USA  1999  13.0  110    250

      


This works because we first create a melted version of the frame:

In [160]: df2 = pd.melt(df, id_vars=["Countries", "variable"], var_name="Years")

In [161]: df2
Out[161]: 
   Countries variable Years  value
0        USA      GDP  1995   10.0
1        USA      Inf  1995  100.0
2        USA    Trade  1995  200.0
3        GER      GDP  1995    8.0
4        GER      Inf  1995  100.0
5        GER    Trade  1995  150.0
6        USA      GDP  1996   11.0
[...]

      

and then we set the index and unstack:

In [166]: df2 = df2.set_index(["Countries", "Years", "variable"]).unstack().reset_index()

In [167]: df2
Out[167]: 
         Countries Years value           
variable                   GDP  Inf Trade
0              GER  1995   8.0  100   150
1              GER  1996   9.0  105   156
2              GER  1997   9.5  107   149
3              GER  1998  10.0  109   165
4              GER  1999  10.5  111   167
5              USA  1995  10.0  100   200
6              USA  1996  11.0  120   220
7              USA  1997  12.0  130   210
8              USA  1998  12.0  120   235
9              USA  1999  13.0  110   250

      

which is almost what we want, but the columns are a MultiIndex. We can fix that though:

In [168]: df2.columns
Out[168]: 
MultiIndex(levels=[['value', 'Years', 'Countries'], ['GDP', 'Inf', 'Trade', '']],
           labels=[[2, 1, 0, 0, 0], [3, 3, 0, 1, 2]],
           names=[None, 'variable'])

In [169]: df2.columns = [x[1] if x[1] else x[0] for x in df2.columns]

In [170]: df2
Out[170]: 
  Countries Years   GDP  Inf  Trade
0       GER  1995   8.0  100    150
1       GER  1996   9.0  105    156
2       GER  1997   9.5  107    149
3       GER  1998  10.0  109    165
4       GER  1999  10.5  111    167
5       USA  1995  10.0  100    200
6       USA  1996  11.0  120    220
7       USA  1997  12.0  130    210
8       USA  1998  12.0  120    235
9       USA  1999  13.0  110    250
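Since the deleted answer mentioned above used pivot_table, here is a hedged sketch of what that equivalent version might look like (untested; the default aggregation is harmless because each country/year/variable combination occurs exactly once):

df3 = (pd.melt(df, id_vars=["Countries", "variable"], var_name="Years")
         .pivot_table(index=["Countries", "Years"], columns="variable", values="value")
         .reset_index())
df3.columns.name = None  # drop the leftover "variable" name on the columns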

      



Copy your data into a spreadsheet first. Renaming the columns may not be required, but I think the column name variable was causing the error. Also, I have not tested whether this is the most memory-efficient approach.

import pandas as pd

df = pd.read_excel('df_countries.xls', 'Sheet1')
df.columns = ['countries', 'var', '1995', '1996', '1997', '1998', '1999']
df_new = pd.melt(df, id_vars=['countries', 'var'])             # long format: one row per country/var/year
df_new.columns = ['countries', 'var', 'year', 'data']
df_new.set_index(['countries', 'year', 'var']).unstack('var')  # spread the variables back out as columns
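If you would rather skip the spreadsheet step, reading the original CSV directly should work the same way; the filename and whitespace separator below are assumptions:

df = pd.read_csv('df_countries.csv', delim_whitespace=True)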

      

(The resulting table was shown as an image in the original answer.)
