Write an introduction to your project here: describe the dataset, where you got it from, what you are trying to do with it, and which tools and techniques you are using. You can also mention the course and what you have learned from it.
As a first step, let's upload our Jupyter notebook to Jovian.ml.
!pip install jovian --upgrade --quiet
!pip install numpy --upgrade --quiet
!pip install pandas --upgrade --quiet
!pip install matplotlib --upgrade --quiet
!pip install seaborn --upgrade --quiet
!pip install wordcloud --upgrade --quiet
!pip install emoji --upgrade --quiet
# Name of the Jovian project this notebook is committed to.
project_name = "whatsapp-chat-analysis-course-project-try"
import jovian
# Upload the notebook together with the raw chat export.
# The keyword is spelled `environment`; the earlier `enviroment` misspelling
# had no effect (the recorded output still shows "Capturing environment..").
jovian.commit(project=project_name, environment=None, files=["Chat.txt"])
[jovian] Attempting to save notebook..
[jovian] Please enter your API key ( from https://jovian.ml/ ):
API KEY: ········
[jovian] Updating notebook "edsenmichaelcy/whatsapp-chat-analysis-course-project-try" on https://jovian.ml/
[jovian] Uploading notebook..
[jovian] Capturing environment..
[jovian] Uploading additional files...
[jovian] Committed successfully! https://jovian.ml/edsenmichaelcy/whatsapp-chat-analysis-course-project-try
import os
import pandas as pd
import re
import datetime as time
import jovian
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import emoji
import re
# First look at the raw export: load it as a fixed-width text file without a
# header row, display it, and print the column summary.
whatsapp_df = pd.read_fwf('Chat.txt', header = None)
whatsapp_df
whatsapp_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23330 entries, 0 to 23329
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 23177 non-null object
1 1 23087 non-null object
2 2 788 non-null object
dtypes: object(3)
memory usage: 546.9+ KB
Next, we use the info() method provided by pandas to inspect the data types in the dataframe. As you can see, the data needs some cleaning, for example the dates and the "Media omitted" entries. (re-explain)
# (rows, columns) of the raw load
whatsapp_df.shape
(23330, 3)
def txtTodf(txt_file, encoding='utf-8'):
    '''Convert a WhatsApp chat log text file to a Pandas dataframe.

    Each record is expected to look like
    ``dd/mm/yyyy, h:mm am - Sender: message text`` and may span several
    lines; continuation lines are joined to their record with spaces.

    Args:
        txt_file: path of the exported chat text file.
        encoding: text encoding used to read the file (default UTF-8, so
            the result does not depend on the platform default).

    Returns:
        DataFrame with columns ``datetime`` (datetime64), ``user`` and
        ``message``. Records without a sender (no ``:`` after the
        `` - `` separator, e.g. group events) are removed.

    Raises:
        ValueError: if a record's timestamp does not match
            ``%d/%m/%Y, %I:%M %p``.
    '''
    # A record starts on a line beginning with dd/mm/yyyy and runs up to the
    # next such line (or end of file), so multi-line messages stay together.
    record_pat = re.compile(
        r'^(\d\d/\d\d/\d\d\d\d.*?)(?=^\d\d/\d\d/\d\d\d\d|\Z)', re.S | re.M)
    with open(txt_file, encoding=encoding) as file:
        records = [m.group(1).strip().replace('\n', ' ')
                   for m in record_pat.finditer(file.read())]

    timestamps = []
    users = []
    messages = []
    for row in records:
        # timestamp is everything before the first ' - '
        timestamp, _, body = row.partition(' - ')
        # sender is between the dash and the first colon; splitting the body
        # (instead of searching for 'm - ') also handles upper-case AM/PM
        sender, colon, text = body.partition(':')
        timestamps.append(timestamp)
        if colon:
            users.append(sender)
            # message content follows the colon and its separating space
            messages.append(text[1:] if text.startswith(' ') else text)
        else:
            # no sender: mark the row so it is filtered out below
            users.append('')
            messages.append('')

    df = pd.DataFrame(
        {'datetime': timestamps, 'user': users, 'message': messages},
        columns=['datetime', 'user', 'message'])
    df['datetime'] = pd.to_datetime(df.datetime, format='%d/%m/%Y, %I:%M %p')
    # remove events not associated with a sender
    df = df[df.user != ''].reset_index(drop=True)
    return df
# Re-load the chat with the dedicated parser (replacing the raw fixed-width
# load), preview the first rows and check the resulting column types.
whatsapp_df = txtTodf('Chat.txt')
whatsapp_df.head(10)
whatsapp_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22701 entries, 0 to 22700
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 datetime 22701 non-null datetime64[ns]
1 user 22701 non-null object
2 message 22701 non-null object
dtypes: datetime64[ns](1), object(2)
memory usage: 532.2+ KB
# Checkpoint the notebook on Jovian after parsing.
jovian.commit(project=project_name)
[jovian] Attempting to save notebook..
[jovian] Updating notebook "edsenmichaelcy/whatsapp-chat-analysis-course-project-try" on https://jovian.ml/
[jovian] Uploading notebook..
[jovian] Capturing environment..
[jovian] Committed successfully! https://jovian.ml/edsenmichaelcy/whatsapp-chat-analysis-course-project-try
# Select the rows whose message is exactly the "<Media omitted>" placeholder
# and check how many there are.
img = whatsapp_df[whatsapp_df['message'] == "<Media omitted>" ]
img.shape
(1182, 3)
Since we want to analyse the text rather than the images, we have to remove the media entries from the dataset. Here we have 1,182 media messages (rows), each with three columns.
# Drop every media placeholder row by index label, modifying whatsapp_df
# itself rather than creating a new dataframe.
whatsapp_df.drop(img.index, inplace=True)
So now we drop all the media rows to make the dataset cleaner. We also want to modify the existing dataframe instead of creating a new copy, which is why we pass inplace=True.
# Preview the first rows after the media rows were removed.
whatsapp_df.head(10)
As you can see, the dataset no longer contains media entries. However, after the cleaning the index of the dataset has gaps where rows were removed, so we renumber it with reset_index().
# Renumber the rows from 0 after the drop; drop=True discards the old index
# instead of keeping it as a column.
whatsapp_df.reset_index(inplace=True, drop=True)
whatsapp_df.shape
(21519, 3)
After cleaning, 21,519 messages are left in our dataset. Now we are ready to start the analysis and make data-driven decisions!
# Checkpoint the notebook on Jovian after cleaning.
jovian.commit(project=project_name)
[jovian] Attempting to save notebook..
# Overall size of the chat: number of non-null messages and the distinct
# sender names.
totalNumberofMessage = whatsapp_df.message.count()
username = whatsapp_df["user"].unique()
print('The total of the number of message:',totalNumberofMessage)
print('User name that involve in the chat:',username)
The total of the number of message: 21519
User name that involve in the chat: ['Ed' 'Rohit' 'Pei Yin']
# Distinct sender names found in the chat
userNumber = whatsapp_df.user.unique()
print("The total number of message from each of the users:\n")
# Iterate over the names directly: the earlier version indexed `length[i]`,
# a variable that is only assigned in a later cell.
for user_name in userNumber:
    # Rows sent by this particular user
    user_df = whatsapp_df[whatsapp_df['user'] == user_name]
    print(f'User name: {user_name}')
    # One row per message, so the row count is the number of messages sent
    print('Messages', user_df.shape[0])
    print()
The total number of message from each of the users:
User name: Ed
Messages 6991
User name: Rohit
Messages 10268
User name: Pei Yin
Messages 4260
# Preview the data, then count messages per user and sort the counts from
# highest to lowest.
whatsapp_df.head(5)
sortHighestMessage = whatsapp_df.groupby("user")["message"].count().sort_values(ascending=False)
sortHighestMessage
user
Rohit 10268
Ed 6991
Pei Yin 4260
Name: message, dtype: int64
# Work on a copy so the cleaned dataframe is left untouched
whatsapp_df1 = whatsapp_df.copy()
# Message count per user, highest first, limited to the top five.
# reset_index() returns a NEW DataFrame with columns (user, message); it has
# to be chained/assigned because a Series cannot become a DataFrame in place.
# The former drop(columns="index") did nothing on a Series and is not needed:
# no "index" column is created.
highestMessage = (
    whatsapp_df1.groupby("user")["message"]
    .count()
    .sort_values(ascending=False)
    .head(5)
    .reset_index()
)
highestMessage
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-19-b827b3bdf293> in <module>
3
4 highestMessage = whatsapp_df1.groupby("user")["message"].count().sort_values(ascending = False).head(5)
----> 5 highestMessage.reset_index(inplace=True)
6 highestMessage.drop(columns ="index",inplace=True)
7 highestMessage
/srv/conda/envs/notebook/lib/python3.7/site-packages/pandas/core/series.py in reset_index(self, level, drop, name, inplace)
1286 elif inplace:
1287 raise TypeError(
-> 1288 "Cannot reset_index inplace on a Series to create a DataFrame"
1289 )
1290 else:
TypeError: Cannot reset_index inplace on a Series to create a DataFrame
# Formatting: dark grid background, readable font and a large figure
sns.set_style("darkgrid")
plt.rcParams['font.size'] = 10
plt.rcParams['figure.figsize'] = (12, 8)
# Bar chart of the five most active users. The counts are recomputed here so
# the cell does not depend on the shape of variables from other cells; the
# earlier bare sns.barplot() call was given no data to draw.
messages_per_user = (
    whatsapp_df.groupby("user")["message"]
    .count()
    .sort_values(ascending=False)
    .head(5)
)
ax = sns.barplot(x=messages_per_user.index, y=messages_per_user.values)
ax.set(xlabel="user", ylabel="messages");
# Improving Default Styles using Seaborn
sns.set_style("darkgrid")
# For better readability
import matplotlib
matplotlib.rcParams['font.size'] = 10
matplotlib.rcParams['figure.figsize'] = (12, 8)
# A bar plot for top 10 days.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.
# NOTE(review): `top10days` (with `date` and `message_count` columns) is not
# created anywhere in the visible notebook - it must be defined before this
# cell runs.
sns.barplot(x=top10days.date, y=top10days.message_count, palette="hls");
# Saving the plots
plt.savefig('top10_days.svg', format = 'svg')
def emojis(file, user=None):
    '''Count how often each emoji occurs in the chat messages.

    NOTE(review): `file` is assumed to be the chat dataframe with 'user' and
    'message' columns (the layout txtTodf returns) - confirm against callers.
    The earlier body could not run: it referenced the undefined names
    `Counter`, `df` and `me`, read a 'msg' column, and returned its input.

    Args:
        file: dataframe holding the chat, one message per row.
        user: optional sender name; when given, only that sender's messages
            are counted. By default every message is counted.

    Returns:
        collections.Counter mapping each emoji found to its number of
        occurrences.
    '''
    from collections import Counter

    # emoji >= 2.0 exposes the table as EMOJI_DATA; older releases used
    # UNICODE_EMOJI (a flat dict, or keyed by language code in 1.x).
    table = getattr(emoji, 'EMOJI_DATA', None)
    if table is None:
        legacy = emoji.UNICODE_EMOJI
        table = legacy.get('en', legacy)

    # Strip whitespace from the keys, and try longer sequences first so a
    # multi-codepoint emoji is not reported as its individual components.
    candidates = sorted(
        {''.join(key.split()) for key in table} - {''},
        key=lambda seq: (-len(seq), seq))
    pattern = re.compile('|'.join(re.escape(seq) for seq in candidates))

    messages = file['message']
    if user is not None:
        messages = messages[file['user'] == user]

    emoji_ctr = Counter()
    for text in messages:
        emoji_ctr.update(pattern.findall(str(text)))
    return emoji_ctr
# We want to know how many messages, emojis and links each user sent
amountOfChat = whatsapp_df.groupby("user")["message"].count()
length = whatsapp_df.user.unique()
for user_name in length:
    # Rows sent by this particular user
    user_df = whatsapp_df[whatsapp_df['user'] == user_name]
    # Print the name only (the earlier version printed the whole dataframe)
    print('User:', user_name)
    # One row per message, so the row count is the number of messages sent
    print('Messages', user_df.shape[0])
    # NOTE(review): the 'emoji' and 'urlcount' columns are not created
    # anywhere in the visible notebook - they must be added to whatsapp_df
    # before this cell runs.
    # Total emojis: sum of the per-message lengths of the 'emoji' column.
    # Named emojis_sent so the emojis() function is not overwritten.
    emojis_sent = sum(user_df['emoji'].str.len())
    print('Emojis Sent', emojis_sent)
    # Total links: sum of the per-message 'urlcount' values
    links_sent = sum(user_df["urlcount"])
    print('Links Sent', links_sent)
user
Ed 6991
Pei Yin 4260
Rohit 10268
Name: message, dtype: int64
# Per-author summary: messages, words per message, media, emojis and links.
l = messages_df.Author.unique()
for author in l:
    # Keep only the rows written by the current author
    req_df = messages_df[messages_df["Author"] == author]
    # Row count of the filtered frame == number of messages by this author
    message_total = req_df.shape[0]
    print(f'Stats of {author} -')
    print('Messages Sent', message_total)
    # Average words per message: total of Word_Count over the message count
    word_total = np.sum(req_df['Word_Count'])
    words_per_message = word_total / message_total
    print('Words per message', words_per_message)
    # Media messages are kept in a separate frame; count this author's rows
    author_media = media_messages_df[media_messages_df['Author'] == author]
    media = author_media.shape[0]
    print('Media Messages Sent', media)
    # Total emojis: sum of the per-message lengths of the 'emoji' column
    emoji_lengths = req_df['emoji'].str.len()
    emojis = sum(emoji_lengths)
    print('Emojis Sent', emojis)
    # Total links: sum of the per-message 'urlcount' values
    links = sum(req_df["urlcount"])
    print('Links Sent', links)
    print()