Learn data science and machine learning by building real-world projects on Jovian
In [3]:
import pandas as pd
import numpy as np

Pandas Series object

In [4]:
# Build a Series from a plain Python list. No index is passed, so pandas
# assigns the default integer labels 0..n-1.
data = pd.Series(data=[1, 2, 3, 4])
print(data)
# .values exposes the underlying NumPy array.
print('data.values:', data.values)
# With the default index, the label 1 refers to the second element.
print('Value at data[1]:', data[1])
0 1 1 2 2 3 3 4 dtype: int64 data.values: [1 2 3 4] Value at data[1]: 2
In [5]:
# Series creation from a NumPy array, with explicit string labels as the index.
data = pd.Series(data=np.array([0.1, 0.2, 0.5, 0.6]), index=list('abcd'))
print(data)
# A label passed to [] selects the matching element.
print('Value at index c of data:', data['c'])
a 0.1 b 0.2 c 0.5 d 0.6 dtype: float64 Value at index c of data: 0.5
In [6]:
# Series creation from a Python dictionary: the keys become the index labels
# and keep their insertion order.
# The dictionary is named `label_values` rather than `dict`: rebinding `dict`
# would shadow the builtin type for every later cell run in the same kernel.
label_values = {'K': 10, 'A': 20, 'C': 30, 'Y': 40, 'Z': 50}
data = pd.Series(label_values)
print(data)
K 10 A 20 C 30 Y 40 Z 50 dtype: int64
In [7]:
# Accessing pandas series elements
# A single label inside [] returns the element stored under that label.
print("data['K']:",data['K'])

# Accessing all data elements from A to Y
# A label-based slice includes BOTH endpoints, so the 'Y' entry is part of the result.
print("Elements from index A to Y:")
print(data['A':'Y'])
data['K']: 10 Elements from index A to Y: A 20 C 30 Y 40 dtype: int64
In [8]:
# Get index labels of series
# .index returns the labels as a pandas Index object.
data.index
Out[8]:
Index(['K', 'A', 'C', 'Y', 'Z'], dtype='object')

Pandas DataFrame object

  • While a Series is an analog of a one-dimensional array with flexible indices, a DataFrame is an analog of a two-dimensional array with both flexible row indices and flexible column names.
In [9]:
# Population Series: state name -> population, built directly from a dict literal.
population = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})

# Area Series: keyed by the same state names as `population`.
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})

# Print each Series under its own heading line.
print('population Series:', population, sep='\n')
print('area Series:', area, sep='\n')
population Series: California 38332521 Texas 26448193 New York 19651127 Florida 19552860 Illinois 12882135 dtype: int64 area Series: California 423967 Texas 695662 New York 141297 Florida 170312 Illinois 149995 dtype: int64
In [10]:
# DataFrame Creation using two Series.
# Each dictionary key becomes a column name; the labels of the Series become
# the row index of the resulting DataFrame.
df = pd.DataFrame({'Population':population,'Area':area})
df
Out[10]:
In [11]:
# DataFrame index labels
# .index holds the row labels and .columns holds the column labels.
print('DataFrame Index Labels:',df.index)
print('DataFrame Column Labels:', df.columns)
DataFrame Index Labels: Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object') DataFrame Column Labels: Index(['Population', 'Area'], dtype='object')
In [22]:
# DataFrame of the population column
# df['Population'] on its own is a Series; wrapping it in pd.DataFrame gives a
# one-column DataFrame.
pd.DataFrame(df['Population'])

Out[22]:
In [54]:
# DataFrame creation from a list of lists: every inner list is one row, and
# `columns` names the two fields.
data = [['A', 10], ['B', 11], ['C', 12]]
df = pd.DataFrame(data=data, columns=['col1', 'col2'])
df
Out[54]:
In [55]:
# DataFrame creation with neither `columns` nor `index`.
data = [['A', 10], ['B', 11], ['C', 12]]
df = pd.DataFrame(data=data)
# pandas falls back to integer labels for the columns (0 and 1 here).
df
Out[55]:
In [56]:
# DataFrame creation from a list of lists, this time with named columns:
# every inner list is one row and `columns` labels the two fields.
data = [['A', 10], ['B', 11], ['C', 12]]
df = pd.DataFrame(data=data, columns=['col1', 'col2'])
df
Out[56]:
In [57]:
# DataFrame creation from a list of dictionaries: one dictionary per row,
# with the keys becoming the column names.
data = [{'A': 2, 'B': 3}, {'A': 10, 'B': 20, 'C': 30}]
df = pd.DataFrame(data=data, index=['first', 'second'])
# The first dictionary has no 'C' key, so that cell shows up as NaN.
df
Out[57]:
In [61]:
# DataFrame Creation using NumPy array
# np.arange(1,10) yields the integers 1..9, reshaped here into 3 rows x 3 columns;
# `columns` and `index` supply the labels for the two axes.
pd.DataFrame(np.arange(1,10).reshape(3,3),
            columns=['col1','col2','col3'],
            index=['a','b','c'])
Out[61]:

Pandas Indexers: loc, iloc, ix

  • DataFrame[ ] : The indexing operator.
  • DataFrame.loc[ ] : Allows indexing and slicing based on the explicit index (the labels).
  • DataFrame.iloc[ ] : Allows indexing and slicing based on the implicit, Python-style integer position.
  • DataFrame.ix[ ] : Accepted both implicit and explicit indices. Note: .ix was deprecated and has been removed from pandas (1.0 and later); use .loc or .iloc instead.
Confusion in indexing
In [4]:
# A Series whose explicit labels are integers that differ from the positions.
data = pd.Series(list('abc'), index=[10, 20, 30])
print(data, '\n')

# A single integer inside [] is matched against the explicit labels.
print('explicit indices: data[10] :', data[10], '\n')

# An integer slice inside [] is positional instead (implicit indices).
print('implict indices: data[1:3]:')
print(data[1:3])
10 a 20 b 30 c dtype: object explicit indices: data[10] : a implict indices: data[1:3]: 20 b 30 c dtype: object
In [83]:
# Odd integer labels (1, 3, 5) make it easy to tell labels apart from
# positions (0, 1, 2) in the indexing examples.
data = pd.Series(list('abc'), index=[1, 3, 5])
data
Out[83]:
1    a
3    b
5    c
dtype: object
In [84]:
# Indexing using the indexing operator -> data[]
# A single integer is looked up among the explicit labels ...
print(data[1],'\n')
# ... while an integer slice is taken by position.
print(data[1:3])
a 3 b 5 c dtype: object
In [87]:
# Indexing based on explicit index
# .loc always works with the labels; a label slice includes its end label.
print(data.loc[1],'\n')
print(data.loc[1:3])
a 1 a 3 b dtype: object
In [86]:
# Indexing based on implicit index
# .iloc always works with 0-based positions; the end position of a slice is excluded.
print(data.iloc[1],'\n')
print(data.iloc[1:3])
b 3 b 5 c dtype: object

Additional indexing conventions

In [96]:
# Population Series -- note that it has no 'New York' entry.
population = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'Florida': 19552860,
    'Illinois': 12882135,
})

# Area Series -- note that it has no 'Illinois' entry.
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
})

# Combining the two aligns them on the state labels; a state missing from
# one of the Series gets NaN in that column.
data2 = pd.DataFrame({'population': population, 'area': area})
data2
Out[96]:
In [97]:
# Division of two Series
# The operands are aligned on their index labels before dividing; a label that
# is missing from either Series yields NaN (Not a Number) in the result.
population / area
Out[97]:
California     90.413926
Florida       114.806121
Illinois             NaN
New York             NaN
Texas          38.018740
dtype: float64
In [98]:
# Union of Indices
# Index.union() is the explicit set operation. The `|` operator on Index
# objects was deprecated as a set union and, in pandas 2.x, performs an
# element-wise logical OR instead (which fails for string labels).
area.index.union(population.index)
Out[98]:
Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')
In [105]:
# Addition of two Series (despite the df1/df2 names, both objects are Series)
df1 = pd.Series([2, 4, 6], index=[0, 1, 2])
df2 = pd.Series([1, 3, 5], index=[1, 2, 3])
print('df1 + df2:')
# Labels present in only one operand (0 and 3) produce NaN with the + operator.
print(df1 + df2)

# Addition by calling add() method.
# The fill_value parameter substitutes 0 for an entry missing from one operand,
# so labels 0 and 3 keep the value of the Series that does contain them.
df1.add(df2, fill_value=0)

df1 + df2: 0 NaN 1 5.0 2 9.0 3 NaN dtype: float64
Out[105]:
0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64
  • + : add()
  • - : sub(), subtract()
  • * : mul(), multiply()
  • / : truediv(), div(), divide()
  • // : floordiv()
  • % : mod()
  • ** : pow()

Handling missing values

NaN and None in Pandas
In [111]:
# A Series mixing an integer, NaN, a string and None.
data = pd.Series([1, np.nan, 'hello', None])
data
Out[111]:
0        1
1      NaN
2    hello
3     None
dtype: object
  • isnull(): Generate a boolean mask indicating missing values
  • notnull(): Opposite of isnull()
  • dropna(): Return a filtered version of the data
  • fillna(): Return a copy of the data with missing values filled or imputed
In [112]:
# isnull() method -> To detect null values
# Returns a boolean Series that is True wherever the value is missing (NaN or None).
data.isnull()
Out[112]:
0    False
1     True
2    False
3     True
dtype: bool
In [113]:
# notnull() -> To detect non-null values
# The boolean result is used here as a mask, so only the non-missing entries are kept.
data[data.notnull()]
Out[113]:
0        1
2    hello
dtype: object
In [114]:
# dropna() -> Removes NA values
# The filtered result is only displayed; `data` is not reassigned here.
data.dropna()
Out[114]:
0        1
2    hello
dtype: object
In [116]:
# fillna() -> fills every missing value with the given replacement (0 here).
data = pd.Series([1, np.nan, 2, None, 3], index=['a', 'b', 'c', 'd', 'e'])
# NOTE: the printed headings say "DataFrame", but `data` is a Series, and the
# second heading precedes the *filled* values (nothing is removed).
print('DataFrame:', data, sep='\n')
print('DataFrame after removing null values:', data.fillna(value=0), sep='\n')
DataFrame: a 1.0 b NaN c 2.0 d NaN e 3.0 dtype: float64 DataFrame after removing null values: a 1.0 b 0.0 c 2.0 d 0.0 e 3.0 dtype: float64
In [10]:
# Backward fill: each missing value takes the next valid value that follows it.
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
# NOTE: the headings below say "DataFrame" / "removing", but `data` is a Series
# and the missing values are filled rather than removed.
print('DataFrame:')
print(data,'\n')
print('DataFrame after removing null values:')
# Series.bfill() replaces fillna(method='bfill'): the `method` argument of
# fillna() is deprecated in recent pandas releases.
data.bfill()
DataFrame: a 1.0 b NaN c 2.0 d NaN e 3.0 dtype: float64 DataFrame after removing null values:
Out[10]:
a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64
In [11]:
# Interpolate missing values: each gap is filled from the valid values around it.
data = pd.Series(
    [1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
# NOTE: the headings below say "DataFrame" / "removing", but `data` is a Series
# and the missing values are interpolated rather than removed.
print('DataFrame:')
print(data, '\n')
print('DataFrame after removing null values:')
# method='linear' treats the values as equally spaced, so the string labels
# play no role; limit_direction='forward' fills gaps after a valid value.
data.interpolate(method='linear', limit_direction='forward')
DataFrame: a 1.0 b NaN c 2.0 d NaN e 3.0 dtype: float64 DataFrame after removing null values:
Out[11]:
a    1.0
b    1.5
c    2.0
d    2.5
e    3.0
dtype: float64
In [14]:
import jovian

In [ ]:
# Save this notebook through the jovian library.
# NOTE(review): environment='none' presumably skips capturing the Python
# environment -- confirm against the jovian documentation.
jovian.commit(environment='none')
[jovian] Attempting to save notebook..
In [ ]: