Learn data science and machine learning by building real-world projects on Jovian
In [1]:
import pandas as pd
import numpy as np

Concatenation

In [2]:
# NumPy Concatenation
x = [1,2,3]
y = [11,12,13]
z= [111,112,113]
np.concatenate([x,y,z])
Out[2]:
array([  1,   2,   3,  11,  12,  13, 111, 112, 113])
In [3]:
# Pandas Series Concatenation similar to np.concatenate
s1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
s2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([s1,s2])
Out[3]:
1    A
2    B
3    C
4    D
5    E
6    F
dtype: object
In [4]:
# Pandas DataFrame Concatenantion 
df1 = pd.DataFrame([['a1','b1'],['a2','b2']], index=[1,2], columns=['A','B'])
df2 = pd.DataFrame([['a3','b3'],['a4','b4']], index=[1,2], columns=['A','B'])
print('df1:\n',df1)
print('df2:\n',df2)
print('Concatenate df1 & df2:\n',pd.concat([df1, df2]))
df1: A B 1 a1 b1 2 a2 b2 df2: A B 1 a3 b3 2 a4 b4 Concatenate df1 & df2: A B 1 a1 b1 2 a2 b2 1 a3 b3 2 a4 b4
In [12]:
# Concatenate in specified axis
df3 = pd.DataFrame([['a0','b0'],['a1','b1']], index=[0,1], columns=['A','B'])
df4 = pd.DataFrame([['c0','d0'],['c1','d1']], index=[0,1], columns=['C','D'])
print('df3:\n',df3)
print('\ndf4:\n',df4)
print('\nConcatenate df3 & df4 Row-wise:\n',pd.concat([df3, df4], axis=1))
df3: A B 0 a0 b0 1 a1 b1 df4: C D 0 c0 d0 1 c1 d1 Concatenate df3 & df4 Row-wise: A B C D 0 a0 b0 c0 d0 1 a1 b1 c1 d1
In [16]:
# Ignore index
print(pd.concat([df1,df2],ignore_index=True))
# Adding Multi-Index keys
# The dataframe conists of Hierarchy indices.
print('\nConcatenation with hierarchy index:\n',pd.concat([df1,df3], keys=['X','Y']))
A B 0 a1 b1 1 a2 b2 2 a3 b3 3 a4 b4 Concatenation with hierarchy index: A B X 1 a1 b1 2 a2 b2 Y 0 a0 b0 1 a1 b1
In [36]:
# Concatenation with joins
df5 = pd.DataFrame([['a0','b0','c0'],['a1','b1','c1'],['a2','b2','c2']], index=[0,1,2], columns=['A','B','C'])
print('df4:\n',df4)
print('df5:\n',df5)
pd.concat([df4,df5], join='inner',axis=1)
df4: C D 0 c0 d0 1 c1 d1 df5: A B C 0 a0 b0 c0 1 a1 b1 c1 2 a2 b2 c2
Out[36]:
In [49]:
# Merging of DataFrames
data1 = {'employee': ['Bob', 'Jake', 'Lisa', 'Sue'], 'group': ['Accounting', 'Engineering', 'Engineering', 'HR']}
data2 = {'employee': ['Lisa', 'Bob', 'Jake', 'Sue'], 'hire_date': [2004, 2008, 2012, 2014]}
df11 = pd.DataFrame(data1)
df12 = pd.DataFrame(data2)
print(df11,'\n\n', df12)
print('Merge of df11, df12:\n', pd.merge(df11,df12))
employee group 0 Bob Accounting 1 Jake Engineering 2 Lisa Engineering 3 Sue HR employee hire_date 0 Lisa 2004 1 Bob 2008 2 Jake 2012 3 Sue 2014 Merge of df11, df12: employee group hire_date 0 Bob Accounting 2008 1 Jake Engineering 2012 2 Lisa Engineering 2004 3 Sue HR 2014
In [50]:
# Merging with specific Key
pd.merge(df11,df12, on='employee')
Out[50]:
In [52]:
# Merging on Uncommon column
pd.merge(df11,df12, on='hire_date')
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-52-b75f6eae4786> in <module> ----> 1 pd.merge(df11,df12, on='hire_date') c:\users\vsneh\appdata\local\programs\python\python37\lib\site-packages\pandas\core\reshape\merge.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate) 79 copy=copy, 80 indicator=indicator, ---> 81 validate=validate, 82 ) 83 return op.get_result() c:\users\vsneh\appdata\local\programs\python\python37\lib\site-packages\pandas\core\reshape\merge.py in __init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy, indicator, validate) 624 self.right_join_keys, 625 self.join_names, --> 626 ) = self._get_merge_keys() 627 628 # validate the merge keys dtypes. We may need to coerce c:\users\vsneh\appdata\local\programs\python\python37\lib\site-packages\pandas\core\reshape\merge.py in _get_merge_keys(self) 986 right_keys.append(rk) 987 if lk is not None: --> 988 left_keys.append(left._get_label_or_level_values(lk)) 989 join_names.append(lk) 990 else: c:\users\vsneh\appdata\local\programs\python\python37\lib\site-packages\pandas\core\generic.py in _get_label_or_level_values(self, key, axis) 1772 values = self.axes[axis].get_level_values(key)._values 1773 else: -> 1774 raise KeyError(key) 1775 1776 # Check for duplicates KeyError: 'hire_date'
In [73]:
# Merging upon left_on & right_on
df13 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'], 'salary': ['70000', '80000', '120000', '90000']})
print('df13:\n',df13)
print("Merging with left_on and right_on:\n\n",pd.merge(df11, df13, left_on='employee', right_on='name'))
df13: name salary 0 Bob 70000 1 Jake 80000 2 Lisa 120000 3 Sue 90000 Merging with left_on and right_on: employee group name salary 0 Bob Accounting Bob 70000 1 Jake Engineering Jake 80000 2 Lisa Engineering Lisa 120000 3 Sue HR Sue 90000
In [75]:
# left_index & right_index
dfI11 = df11.set_index('employee')
dfI12 = df12.set_index('employee')
print('dfI11:\n',dfI11)
print('\ndfI12:\n',dfI12)
print('\nMerging with left_index = True & right_index = True:\n')
print(pd.merge(dfI11,dfI12,left_index=True, right_index=True))
dfI11: group employee Bob Accounting Jake Engineering Lisa Engineering Sue HR dfI12: hire_date employee Lisa 2004 Bob 2008 Jake 2012 Sue 2014 Merging with left_index = True & right_index = True: group hire_date employee Bob Accounting 2008 Jake Engineering 2012 Lisa Engineering 2004 Sue HR 2014

Pandas Aggregation

In [78]:
# Aggregation on Series
rng = np.random.RandomState(42)
s1 = pd.Series(rng.rand(5))
print('s1:',s1)
print('\ns1.sum():',s1.sum())
print('\ns1.mean():',s1.mean())
s1: 0 0.374540 1 0.950714 2 0.731994 3 0.598658 4 0.156019 dtype: float64 s1.sum(): 2.811925491708157 s1.mean(): 0.5623850983416314
In [81]:
# Aggregation on DataFrames
df1 = pd.DataFrame({'A': rng.rand(5), 'B': rng.rand(5)})
print('df1:\n',df1)
print("\ndf1.mean():\n",df1.mean())
print("\ndf1.mean(axis='columns'):\n",df1.mean(axis='columns'))
df1: A B 0 0.785176 0.607545 1 0.199674 0.170524 2 0.514234 0.065052 3 0.592415 0.948886 4 0.046450 0.965632 df1.mean(): A 0.427590 B 0.551528 dtype: float64 df1.mean(axis='columns'): 0 0.696360 1 0.185099 2 0.289643 3 0.770650 4 0.506041 dtype: float64
In [85]:
# groupby function -> Split, Apply, Combine.
import seaborn as sns
planets = sns.load_dataset('planets')
planets.head()
Out[85]:
In [93]:
# groupby 
print(planets.groupby('method'))
print(planets.groupby('method')['mass'])
print('Sum aggrigate:\n',planets.groupby('method')['mass'].sum())
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001A35C7133C8> <pandas.core.groupby.generic.SeriesGroupBy object at 0x000001A35C7137C8> Sum aggrigate: method Astrometry 0.00000 Eclipse Timing Variations 10.25000 Imaging 0.00000 Microlensing 0.00000 Orbital Brightness Modulation 0.00000 Pulsar Timing 0.00000 Pulsation Timing Variations 0.00000 Radial Velocity 1341.65638 Transit 1.47000 Transit Timing Variations 0.00000 Name: mass, dtype: float64
In [102]:
planets.groupby('method')['year'].describe().unstack()
Out[102]:
       method                       
count  Astrometry                          2.0
       Eclipse Timing Variations           9.0
       Imaging                            38.0
       Microlensing                       23.0
       Orbital Brightness Modulation       3.0
                                         ...  
max    Pulsar Timing                    2011.0
       Pulsation Timing Variations      2007.0
       Radial Velocity                  2014.0
       Transit                          2014.0
       Transit Timing Variations        2014.0
Length: 80, dtype: float64
In [103]:
# Operations on groupby
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])
df
Out[103]:
In [112]:
# Aggregation -> sum(), median()
df.groupby('key').aggregate([min, np.median, max])
Out[112]:
In [115]:
df.groupby('key').aggregate(np.mean)
Out[115]:
In [114]:
# Transformation
df.groupby('key').transform(lambda x:x**2)
Out[114]:
In [117]:
# Apply
def norm_by_data2(x):
    x['data1'] /= x['data2'].sum()
    return x
print('df:\n',df)

df.groupby('key').apply(norm_by_data2)
df: key data1 data2 0 A 0 5 1 B 1 0 2 C 2 3 3 A 3 3 4 B 4 7 5 C 5 9
Out[117]:
In [ ]:
import jovian
jovian.commit(environment='none')
[jovian] Attempting to save notebook..
In [ ]: