Jovian
⭐️
Sign In
In [3]:
import re
import jovian
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import emoji
from collections import Counter
In [4]:
def rawToDf(file):
    """Parse an exported WhatsApp chat text file into a DataFrame.

    Every message is expected to start with a ``D/M/YY, H:MM - `` style
    timestamp (24-hour clock; 2- or 4-digit year). Exports that append
    ``am``/``pm`` after the minutes are not matched by the pattern used here.

    Parameters
    ----------
    file : str or path-like
        Path of the UTF-8 encoded chat export.

    Returns
    -------
    pandas.DataFrame
        One row per message with the columns ``date_time`` (datetime64),
        ``user`` (sender name, or ``"grp_notif"`` for entries without a
        ``name: `` prefix such as join/leave notices) and ``msg``.

    Raises
    ------
    ValueError
        If the timestamps match neither ``%m/%d/%y`` nor ``%d/%m/%Y``.
    """
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    stamp_pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

    with open(file, 'r', encoding='utf8') as raw_data:
        # Messages may span several lines, so flatten the file into one string
        # before cutting it at the timestamps.
        raw_string = ' '.join(raw_data.read().split('\n'))

    # Text before the first timestamp is dropped ([1:]) so that both lists
    # line up one-to-one.
    user_msg = re.split(stamp_pattern, raw_string)[1:]
    date_time = re.findall(stamp_pattern, raw_string)
    df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg})

    # The previous version first called `dateparser`, which is never imported;
    # the resulting NameError was hidden by a bare `except`. Only the two
    # explicit formats are tried now, and only a format mismatch triggers the
    # fallback.
    try:
        df['date_time'] = pd.to_datetime(df['date_time'], format='%m/%d/%y, %H:%M - ')  # 10/20/19, 22:24 -
    except ValueError:
        df['date_time'] = pd.to_datetime(df['date_time'], format='%d/%m/%Y, %H:%M - ')  # 20/10/2019, 22:24 -

    # Split "<user>: <message>" at the FIRST ": " only. Without maxsplit a
    # message that itself contains ": " was cut into several pieces and the
    # stored text ended up empty or truncated.
    usernames = []
    msgs = []
    for entry in df['user_msg']:
        parts = re.split(r'([\w\W]+?):\s', entry, maxsplit=1)
        if parts[1:]:  # user typed message
            usernames.append(parts[1])
            msgs.append(parts[2])
        else:  # group notification (someone was added, someone left, ...)
            usernames.append("grp_notif")
            msgs.append(parts[0])

    df['user'] = usernames
    df['msg'] = msgs

    # The combined column is no longer needed once it has been split.
    return df.drop(columns='user_msg')
In [63]:
# Display name of the chat member whose own activity is analysed in the
# per-user cells (compared against the `user` column).
me = "Poorna"
In [46]:
# Parse the exported chat log; rawToDf returns one row per message with the
# columns date_time, user and msg.
df = rawToDf('third.txt')
oo
In [47]:
# Preview the first rows of the parsed chat.
df.head()

Out[47]:
In [48]:
# (number of messages, number of columns) in the parsed chat.
df.shape
Out[48]:
(1435, 3)
In [49]:
# Number of messages per sender, most active first.
(
    df.groupby("user")["msg"]
    .count()
    .sort_values(ascending=False)
)
Out[49]:
user
Amaresh Sir Ise          230
Akhil                    204
Vaisalini Mam            128
grp_notif                122
Poorna                   102
Oxford Gagan WhatsApp     82
+91 78920 42658           72
Chandrashekar             39
Rida Fatima               28
Roopa NK                  24
+91 6362 844 313          23
Dawood Oxf                22
Sindhuja Mam              20
+91 81970 45395           19
Pankaj Ise Oxf            17
+91 91082 49573           17
+91 99167 78660           13
+91 93809 58423           12
+91 85508 61508           12
Athira S                  12
+91 94832 53973           11
Chanappa Sir              11
+91 77081 89198           11
Girish                    10
Ambika                    10
+91 89047 68298            9
Ishita                     8
Deepthi                    8
Navya Ise                  8
+91 95357 44520            7
                        ... 
Chetan Hstl Frnd           3
+91 96630 13646            3
+91 95913 82478            3
+91 81232 41842            3
Shashank Oxf               3
C                          3
+91 83108 64748            2
+91 87623 55442            2
+91 88618 68122            2
+91 93808 13242            2
+91 93807 40254            2
+91 93809 61829            2
Chandra Shekar             2
Charan                     2
+91 99728 27610            2
Abhishek Delhi Oxf         2
+91 94820 24511            1
Kushal                     1
Govinda F Oxd              1
+91 88841 56374            1
Monica                     1
Preeti Kulkarni            1
Aditya Hostel Oxf          1
+91 89044 30129            1
Sharanya                   1
+91 78292 80134            1
Oxford Arpita Nanda        1
+91 74839 18248            1
Suudeep                    1
+91 6362 213 348           1
Name: msg, Length: 79, dtype: int64
In [50]:
# Name of the weekday each message was sent on, via the vectorised datetime accessor.
df['weekday'] = df['date_time'].dt.day_name()
In [51]:
# Messages per weekday, busiest day first.
df['weekday'].value_counts(sort=True)
Out[51]:
Saturday     264
Monday       241
Wednesday    229
Friday       197
Thursday     191
Sunday       170
Tuesday      143
Name: weekday, dtype: int64
In [52]:
# Who wrote how many messages on Wednesdays.
df.loc[df['weekday'] == "Wednesday", 'user'].value_counts()
Out[52]:
Amaresh Sir Ise          34
Oxford Gagan WhatsApp    32
Poorna                   27
Vaisalini Mam            19
grp_notif                16
Akhil                    15
+91 78920 42658          11
Chandrashekar            11
Ambika                    5
+91 6362 844 313          5
Roopa NK                  4
Aishwariya Swami          4
Rida Fatima               3
Dawood Oxf                3
+91 89047 68298           3
Girish                    3
+91 77081 89198           3
Aishwariya                3
Chetan Hstl Frnd          2
Chanappa Sir              2
+91 95357 44520           2
+91 81237 41175           2
Athira S                  2
Navya Ise                 2
+91 93808 13242           2
Aasif Mansury             1
+91 88841 56374           1
+91 94832 53973           1
+91 81970 45395           1
Pankaj Ise Oxf            1
Deepthi                   1
Kushal                    1
Gagana Gowda              1
+91 93807 40254           1
Ishita                    1
+91 82966 40171           1
Shashank Oxf              1
Sindhuja Mam              1
+91 93809 61829           1
Name: user, dtype: int64
In [53]:
# Flag messages sent on a Saturday or Sunday.
df['is_weekend'] = df['weekday'].isin(['Sunday', 'Saturday'])
In [54]:
# Preview the frame again now that the weekday and is_weekend columns exist.
df.head()
Out[54]:
In [55]:
# Distinct sender labels in order of first appearance (includes "grp_notif").
df.user.unique()
Out[55]:
array(['Charan', '+91 93809 58423', 'Oxford Gagan WhatsApp', 'grp_notif',
       'Amaresh Sir Ise', 'Abhishek Delhi Oxf', '+91 94832 53973',
       '+91 82773 63529', '+91 93807 40254', 'Dawood Oxf',
       '+91 81232 41842', 'Abhishek', 'Pankaj Ise Oxf', '+91 78920 42658',
       'Vupati Oxf', 'Deepthi', 'Girish', '+91 91082 49573',
       'Gagana Gowda', 'Roopa NK', 'Athira S', '+91 85508 61508',
       '+91 95913 82478', 'Chandra Shekar', 'Chetan Hstl Frnd',
       'Rida Fatima', '+91 88841 56374', 'Navya Ise', 'Ambika',
       '+91 82966 40171', 'Aishwariya Swami', '+91 93809 61829',
       'Govinda F Oxd', 'Aishwariya', 'Vaisalini Mam', '+91 98457 21403',
       '+91 99728 27610', 'Aasif Mansury', 'Shashank Oxf',
       '+91 97042 00090', 'Manoj Frnd Oxf', '+91 83108 64748', 'Akhil',
       'Namarata', '+91 89044 30129', '+91 99167 78660',
       '+91 81970 45395', 'Anuihya', 'Preeti Kulkarni', '+91 99018 43058',
       '+91 6362 844 313', 'Kushal', 'Chandrashekar', 'Jyoti',
       '+91 77081 89198', 'Poorna', '+91 91136 65746', 'Ishita',
       'Oxford Arpita Nanda', '+91 74839 18248', '+91 88618 68122',
       'Aditya Hostel Oxf', 'C', '+91 78292 80134', '+91 93808 13242',
       'Sindhuja Mam', '+91 6362 213 348', 'Prateek New',
       '+91 95357 44520', '+91 89047 68298', '+91 81237 41175',
       '+91 96630 13646', '+91 94820 24511', 'Suudeep', '+91 97427 19874',
       '+91 87623 55442', 'Chanappa Sir', 'Sharanya', 'Monica'],
      dtype=object)
In [56]:
# Count how often each emoji occurs in the messages sent by `me`.
#
# The emoji lookup table differs between releases of the `emoji` package:
# 2.x exposes EMOJI_DATA (UNICODE_EMOJI was removed), 1.x nests UNICODE_EMOJI
# by language code, and older releases key UNICODE_EMOJI directly by emoji.
# Using `UNICODE_EMOJI.keys()` as-is on 1.x would build a pattern out of the
# language codes ("en", "es", ...) instead of emojis.
emoji_table = getattr(emoji, "EMOJI_DATA", None)
if emoji_table is None:
    legacy_table = emoji.UNICODE_EMOJI
    emoji_table = legacy_table.get("en", legacy_table)

# Whitespace inside keys is stripped (as before). Longer sequences come first
# because regex alternation takes the first alternative that matches, which
# would otherwise split multi-codepoint emojis into their shorter prefixes.
emojis_list = sorted(
    {stripped for stripped in (''.join(key.split()) for key in emoji_table) if stripped},
    key=lambda symbol: (-len(symbol), symbol),
)
r = re.compile('|'.join(re.escape(p) for p in emojis_list))

# Boolean filter instead of iterating over every row of the frame.
own_msgs = df.loc[df["user"] == me, "msg"]
emoji_ctr = Counter(found for text in own_msgs for found in r.findall(text))
In [57]:
# Print the ten most frequent emojis as "<emoji> - <count>".
for symbol, count in emoji_ctr.most_common(10):
    print(symbol + " - " + str(count))
In [64]:
def to_hour(val):
    """Return the ``hour`` attribute of ``val`` (e.g. a datetime or Timestamp)."""
    hour_of_day = val.hour
    return hour_of_day
In [65]:
# Hour of day (0-23) at which each message was sent.
df['hour'] = df['date_time'].map(to_hour)
In [66]:
# Bar chart of how many messages `me` sent during each hour of the day.
(
    df.loc[df['user'] == me]
    .groupby(['hour'])
    .size()
    .sort_index()
    .plot(x="hour", kind='bar')
)
Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0x278ba4b1a20>
Notebook Image
In [ ]:
# Message count per sender label, sorted from most to least active.
# NOTE(review): the "grp_notif" pseudo-user is counted like a real sender here - confirm that is intended.
msgs_per_user = df.user.value_counts(sort=True)
msgs_per_user
In [ ]:
# Labels of the five most active senders (msgs_per_user is already sorted).
top5_users = msgs_per_user.index[:5].tolist()
top5_users
In [ ]:
# Keep only the rows written by the five most active senders.
# Copy AFTER filtering: filtering a copy yields a slice again, and assigning a
# new column to that slice later triggers pandas' SettingWithCopyWarning.
df_top5 = df[df.user.isin(top5_users)].copy()
df_top5.head()
In [ ]:
# Messages per sender, split by weekday.
# NOTE(review): this plots every sender in `df`; `df_top5` may have been intended - confirm.
plt.figure(figsize=(30, 10))
sns.countplot(data=df, x="user", hue="weekday")
In [ ]:
# Weekend flag for the top-5 subset.
# NOTE(review): df already gets an is_weekend column in an earlier cell, so this may be redundant - confirm.
df_top5['is_weekend'] = df_top5['weekday'].isin(['Sunday', 'Saturday'])
In [ ]:
 
In [ ]:
# Weekday vs. weekend message counts for the five most active senders.
plt.figure(figsize=(20, 10))
sns.countplot(data=df_top5, x="user", hue="is_weekend")
In [67]:
# Build a word cloud from every message in the chat.
#
# set.update() returns None and mutates wordcloud's shared STOPWORDS set in
# place, so the previous `stopwords = STOPWORDS.update(...)` left `stopwords`
# as None. Build an independent set instead.
extra_stopwords = ['lo', 'ge', 'Lo', 'illa', 'yea', 'ella', 'en', 'na', 'En', 'yeah', 'alli', 'ide', 'okay', 'ok', 'will']
stopwords = set(STOPWORDS).union(extra_stopwords)

# One lower-cased, space-separated blob of all words; join() avoids the
# quadratic cost of growing a string inside a loop.
comment_words = ' '.join(
    word.lower()
    for val in df.msg.values
    for word in str(val).split()
)

wordcloud = WordCloud(width=800, height=800,
                      background_color='black',
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)
In [68]:
# Render the word cloud on a square canvas with the axes hidden and no padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()
Notebook Image
In [69]:
# Use the word cloud's to_image() result as the cell output.
wordcloud.to_image()
Out[69]:
Notebook Image
In [70]:
# Count messages for every (hour, weekday) pair, then reshape so that rows are
# hours and columns are weekdays. The pivot arguments are passed by keyword
# because pandas 2.0 no longer accepts them positionally.
x = df.groupby(['hour', 'weekday'])['msg'].size().reset_index()
x2 = x.pivot(index="hour", columns='weekday', values='msg')
x2.head()
Out[70]:
In [71]:
# Heatmap of message volume by hour (rows) and weekday (columns, Monday first).
days = ["Monday", 'Tuesday', "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
# reindex() orders the columns and adds any weekday that has no messages at
# all (plain `x2[days]` raises KeyError in that case); the gaps become 0.
sns.heatmap(x2.reindex(columns=days).fillna(0), robust=True)
Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x278ba471400>
Notebook Image
In [75]:
# Build a word cloud from every message in the chat (this cell repeats the
# word-cloud cell that appears earlier in the notebook).
#
# set.update() returns None and mutates wordcloud's shared STOPWORDS set in
# place, so the previous `stopwords = STOPWORDS.update(...)` left `stopwords`
# as None. Build an independent set instead.
extra_stopwords = ['lo', 'ge', 'Lo', 'illa', 'yea', 'ella', 'en', 'na', 'En', 'yeah', 'alli', 'ide', 'okay', 'ok', 'will']
stopwords = set(STOPWORDS).union(extra_stopwords)

# One lower-cased, space-separated blob of all words; join() avoids the
# quadratic cost of growing a string inside a loop.
comment_words = ' '.join(
    word.lower()
    for val in df.msg.values
    for word in str(val).split()
)

wordcloud = WordCloud(width=800, height=800,
                      background_color='black',
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)
In [76]:
# Render the word cloud on a square canvas with the axes hidden and no padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()
Notebook Image
In [74]:
# Use the word cloud's to_image() result as the cell output.
wordcloud.to_image()
Out[74]:
Notebook Image
In [ ]: