import re
import jovian
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import emoji
from collections import Counter
# !pip install emoji   <- notebook shell command; run in a notebook cell, not as plain Python
# Output: Requirement already satisfied: emoji in /srv/conda/envs/notebook/lib/python3.7/site-packages (0.5.4)
def rawToDf(file):
    """Parse an exported WhatsApp chat .txt file into a DataFrame.

    Parameters
    ----------
    file : str
        Path to the WhatsApp chat export (lines like
        ``10/20/19, 10:24 pm - Name: message``).

    Returns
    -------
    pandas.DataFrame
        Columns: ``date_time`` (datetime64), ``user`` (sender name, or the
        sentinel ``"grp_notif"`` for group notifications such as
        "X added Y"), ``msg`` (message text).
    """
    # One compiled pattern for the leading "date, time am/pm - " stamp;
    # raw string avoids invalid-escape warnings and it is compiled once.
    stamp = re.compile(r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s')
    # Lazy match up to the first ": " separates "user" from "message".
    user_sep = re.compile(r'([\w\W]+?):\s')

    with open(file, 'r', encoding='utf-8') as raw_data:
        # Join lines with spaces so multi-line messages stay attached to
        # their date stamp.
        raw_string = ' '.join(raw_data.read().split('\n'))

    # Splitting at every stamp yields the messages (with sender prefix);
    # findall yields the matching stamps, in the same order.
    user_msg = stamp.split(raw_string)[1:]
    date_time = stamp.findall(raw_string)
    df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg})

    # WhatsApp exports use either mm/dd/yy or dd/mm/yyyy depending on locale;
    # try the first format and fall back on a parse failure only.
    try:
        df['date_time'] = pd.to_datetime(df['date_time'], format='%m/%d/%y, %I:%M %p - ')  # 10/20/19, 10:24 pm -
    except ValueError:
        df['date_time'] = pd.to_datetime(df['date_time'], format='%d/%m/%Y, %I:%M %p - ')  # 20/10/2019, 10:24 pm -

    usernames = []
    msgs = []
    for entry in df['user_msg']:
        parts = user_sep.split(entry)
        if parts[1:]:
            # Normal user message: parts == ['', user, message]
            usernames.append(parts[1])
            msgs.append(parts[2])
        else:
            # No "user: " prefix -> group notification (someone added/left...)
            usernames.append("grp_notif")
            msgs.append(parts[0])

    df['user'] = usernames
    df['msg'] = msgs
    df.drop('user_msg', axis=1, inplace=True)
    return df
me = "Prajwal Prashanth"  # your own display name in the chat -- later cells analyse "my" messages via this
df = rawToDf('Hermanos_group.txt')  # the exported WhatsApp chat; the .txt must sit next to this notebook
# Captured notebook output (the chat export file was missing when this cell ran):
# ---------------------------------------------------------------------------
# FileNotFoundError                         Traceback (most recent call last)
# <ipython-input-12-4c45baae96c7> in <module>
# ----> 1 df = rawToDf('Hermanos_group.txt')
# <ipython-input-10-f597c64ef5ad> in rawToDf(file)
#       1 def rawToDf(file):
# ----> 2     with open(file, 'r') as raw_data:
#       3         raw_string = ' '.join(raw_data.read().split('\n')) # converting the list split by newline char. as one whole string as there can be multi-line messages
#       4         user_msg = re.split('\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s', raw_string) [1:] # splits at all the date-time pattern, resulting in list of all the messages with user names
#       5         date_time = re.findall('\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s', raw_string) # finds all the date-time patterns
# FileNotFoundError: [Errno 2] No such file or directory: 'Hermanos_group.txt'
df.head()
df.shape # no. of msgs
df.sample(10)
# Media (images/videos/etc.) are exported as the literal placeholder below --
# note the trailing space, left over from the newline join in rawToDf.
images = df[df['msg']=="<Media omitted> "] #no. of images, images are represented by <media omitted>
images.shape
df["user"].unique()
grp_notif = df[df['user']=="grp_notif"] #no. of grp notifications
grp_notif.shape
df.drop(images.index, inplace=True) #removing images
df.drop(grp_notif.index, inplace=True) #removing grp_notif
# Re-index from 0 so positional index arithmetic later (reply analysis) works.
df.reset_index(inplace=True, drop=True)
df.shape
# Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.weekday.html
df[df.msg.str.contains('birthday')]
# For each of my messages, print how many messages other people sent
# since my previous message (i.e. the length of the "gap" before I replied).
streak = 0
for sender in df['user']:
    if sender == "Prajwal Prashanth":
        print(streak)
        streak = 0
    else:
        streak += 1
# Messages per user, busiest first.
df.groupby("user")["msg"].count().sort_values(ascending=False)
df['weekday'] = df['date_time'].apply(lambda x: x.day_name()) # can use day_name or weekday from datetime
df.weekday.value_counts(sort=True)  # message volume per day of the week
df[df.weekday=="Thursday"]['user'].value_counts()  # who talks most on Thursdays
df['is_weekend'] = df.weekday.isin(['Sunday', 'Saturday'])  # boolean weekend flag per message
df.head()
df["user"].unique()
# Count emoji usage in my own messages and print the top 10.
# NOTE(review): emoji.UNICODE_EMOJI as a flat dict is the emoji<1.0 API
# (0.5.4 is pinned by the pip output above); newer versions moved to
# emoji.EMOJI_DATA -- confirm before upgrading the package.
emoji_ctr = Counter()
emojis_list = map(lambda x: ''.join(x.split()), emoji.UNICODE_EMOJI.keys())
# One big alternation that matches any known emoji; re.escape guards metacharacters.
r = re.compile('|'.join(re.escape(p) for p in emojis_list))
for idx, row in df.iterrows():
    if row["user"] == me:
        emojis_found = r.findall(row["msg"])
        for emoji_found in emojis_found:
            emoji_ctr[emoji_found] += 1
for item in emoji_ctr.most_common(10):
    print(item[0] + " - " + str(item[1]))
def to_hour(val):
    """Return the hour-of-day (0-23) of a datetime-like value."""
    hour_of_day = val.hour
    return hour_of_day
df['hour'] = df['date_time'].apply(to_hour)  # hour-of-day (0-23) each message was sent
# My messaging activity by hour of day.
df[df['user']==me].groupby(['hour']).size().sort_index().plot(x="hour", kind='bar')
msgs_per_user = df['user'].value_counts(sort=True)  # message counts, most talkative first
msgs_per_user
top5_users = msgs_per_user.index.tolist()[:5]  # the five most active members
top5_users
df_top5 = df.copy()
df_top5 = df_top5[df_top5.user.isin(top5_users)]  # keep only messages from the top-5 users
df_top5.head()
plt.figure(figsize=(30,10))
# NOTE(review): this plots ALL users (data=df) even though this section is
# about the top 5 -- confirm whether data=df_top5 was intended.
sns.countplot(x="user", hue="weekday", data=df)
df_top5['is_weekend'] = df_top5.weekday.isin(['Sunday', 'Saturday'])
plt.figure(figsize=(20,10))
sns.countplot(x="user", hue="is_weekend", data=df_top5)  # weekend vs weekday split, top-5 users only
def word_count(val):
    """Return the number of whitespace-delimited words in the message string."""
    words = val.split()
    return len(words)
df['no_of_words'] = df['msg'].apply(word_count)  # words per message, all users
df_top5['no_of_words'] = df_top5['msg'].apply(word_count)  # same for the top-5 subset
total_words_weekday = df[df['is_weekend']==False]['no_of_words'].sum()
total_words_weekday
total_words_weekend = df[df['is_weekend']]['no_of_words'].sum()
total_words_weekend
total_words_weekday/5  # average words per weekday (5 weekdays)
total_words_weekend/2  # average words per weekend day (2 days)
df.groupby('user')['no_of_words'].sum().sort_values(ascending=False)  # total words per user
# Average words per message = total words / number of messages, per user.
(df.groupby('user')['no_of_words'].sum()/df.groupby('user').size()).sort_values(ascending=False)
(df_top5.groupby('user')['no_of_words'].sum()/df_top5.groupby('user').size()).sort_values(ascending=False)
# Same ratio, split by weekend vs weekday for each top-5 user.
wordPerMsg_weekday_vs_weekend = (df_top5.groupby(['user', 'is_weekend'])['no_of_words'].sum()/df_top5.groupby(['user', 'is_weekend']).size())
wordPerMsg_weekday_vs_weekend
wordPerMsg_weekday_vs_weekend.plot(kind='barh')
x = df.groupby(['hour', 'weekday'])['msg'].size().reset_index()  # message counts per (hour, weekday)
x2 = x.pivot("hour", 'weekday', 'msg')  # hours as rows, weekdays as columns
x2.head()
days = ["Monday", 'Tuesday', "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
# Reorder columns chronologically; robust=True tames colour-scale outliers.
sns.heatmap(x2[days].fillna(0), robust=True)
# Reply analysis: for each of my messages, look at the message immediately
# before it -- whoever sent that is treated as the person I "replied" to.
my_msgs_index = np.array(df[df['user']==me].index)
print(my_msgs_index, my_msgs_index.shape)
prev_msgs_index = my_msgs_index - 1
print(prev_msgs_index, prev_msgs_index.shape)
# NOTE(review): the first entry is dropped unconditionally, presumably because
# my first message may be row 0 (whose "previous" index would be -1) -- this
# also discards a valid predecessor when my first message is NOT row 0; confirm.
prev_msgs_index = np.delete(prev_msgs_index, 0)
prev_msgs_index
df_replies = df.iloc[prev_msgs_index].copy()
df_replies.shape
df_replies.groupby(["user"])["msg"].size().sort_values().plot(kind='barh')  # whom I reply to most (absolute)
# Normalised by each user's total messages: fraction of their messages I replied to.
(df_replies.groupby(["user"])["msg"].size()/df.groupby(["user"])["msg"].size()).sort_values().plot(kind='barh')
# Build a word cloud of the whole chat's vocabulary.
# Chat-specific filler words are added to wordcloud's default stop list.
# BUG FIX: set.update() returns None, so the original
# `stopwords = STOPWORDS.update(...)` bound None (it only worked because
# STOPWORDS itself was mutated and WordCloud falls back to it when
# stopwords=None). Mutate first, then bind the set explicitly.
STOPWORDS.update(['lo', 'ge', 'Lo', 'illa', 'yea', 'ella', 'en', 'na', 'En', 'yeah', 'alli', 'ide', 'okay', 'ok', 'will'])
stopwords = STOPWORDS

# Lower-case every token and join once -- the original built the string with
# `+=` in a nested loop, which is quadratic in the total text size.
tokens = [word.lower() for val in df.msg.values for word in str(val).split()]
comment_words = ' ' + ' '.join(tokens) + ' '

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopwords,
                min_font_size = 10).generate(comment_words)
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
wordcloud.to_image()
import jovian  # already imported at the top of the file; harmless re-import in notebook flow
jovian.commit()  # snapshot this notebook to the jovian service
# [jovian] Saving notebook..   (captured notebook output)