Jovian
⭐️
Sign In
In [ ]:
1way ya 2way
In [58]:
# `jovian` is imported in this cell because no other cell in the notebook imports it;
# without this the call raises NameError on a freshly started kernel.
import jovian

# upload the current state of the notebook to Jovian
jovian.commit()
[jovian] Saving notebook..
[jovian] Updating notebook "9fa0c2bca40b448ca926a6ca5f392e24" on https://jvn.io [jovian] Uploading notebook.. [jovian] Capturing environment.. [jovian] Committed successfully! https://jvn.io/PrajwalPrashanth/9fa0c2bca40b448ca926a6ca5f392e24
In [76]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [8]:
def rawToDf(file, encoding='utf-8'):
    """Parse an exported chat text file into a DataFrame.

    Each message in the export is expected to start with a stamp such as
    ``12/05/19, 9:15 PM - `` followed by ``<user>: <text>``. Entries without a
    ``<user>: `` prefix (someone was added, someone left, ...) are kept and
    attributed to the pseudo-user ``"grp_notif"``.

    Parameters
    ----------
    file : str or path-like
        Path of the exported chat text file.
    encoding : str, default 'utf-8'
        Text encoding used to read the file.

    Returns
    -------
    pandas.DataFrame
        One row per message with the columns ``date_time`` (datetime64),
        ``users`` (str) and ``messages`` (str). Message text keeps the
        trailing space introduced when physical lines are joined.

    Raises
    ------
    ValueError
        If the stamps match neither the 2-digit-year nor the 4-digit-year
        day/month/year format.
    """
    # date-time stamp that opens every message, e.g. "12/05/19, 9:15 PM - "
    stamp_pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s'

    with open(file, 'r', encoding=encoding) as raw_data:
        # join the physical lines with a space so a multi-line message stays in one record
        raw_string = ' '.join(raw_data.read().split('\n'))

    # text between two stamps is one "<user>: <message>" entry; the chunk before the first stamp is dropped
    user_msg = re.split(stamp_pattern, raw_string)[1:]
    date_time = re.findall(stamp_pattern, raw_string)

    df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg})

    # exports use either a 2-digit or a 4-digit year; try the short form first.
    # only a format mismatch (ValueError) triggers the fallback, other errors propagate.
    try:
        df['date_time'] = pd.to_datetime(df['date_time'], format='%d/%m/%y, %I:%M %p - ')
    except ValueError:
        df['date_time'] = pd.to_datetime(df['date_time'], format='%d/%m/%Y, %I:%M %p - ')

    # split user and msg
    usernames = []
    msgs = []
    for entry in df['user_msg']:
        # maxsplit=1: split only at the first "<user>: " so that a message which
        # itself contains ": " is kept whole instead of being cut into pieces
        parts = re.split(r'([\w\W]+?):\s', entry, maxsplit=1)
        if parts[1:]:  # user typed messages
            usernames.append(parts[1])
            msgs.append(parts[2])
        else:  # other notifications in the group (eg: someone was added, some left ...)
            usernames.append("grp_notif")
            msgs.append(parts[0])

    # creating new columns
    df['users'] = usernames
    df['messages'] = msgs

    # dropping the old user_msg col.
    return df.drop('user_msg', axis=1)

Data

In [311]:
# parse the exported group chat into a DataFrame with date_time, users and messages columns
df = rawToDf('Hermanos_group.txt')
In [316]:
df.head()
Out[316]:
In [317]:
df.shape # no. of msgs
Out[317]:
(39999, 3)

Number of images and group notifications, and dropping them

In [318]:
# no. of images: media attachments are exported as the placeholder text "<Media omitted>".
# Compare on the stripped text so the match does not hinge on the trailing space
# that rawToDf leaves behind when it joins the lines of the export.
images = df[df['messages'].str.strip() == "<Media omitted>"]
images.shape
Out[318]:
(855, 3)
In [319]:
# no. of grp notifications: rows that rawToDf attributed to the pseudo-user "grp_notif"
is_notification = df['users'].eq("grp_notif")
grp_notif = df.loc[is_notification]
grp_notif.shape
Out[319]:
(41, 3)
In [320]:
# Remove images and grp_notif rows in one pass. `df` is rebound instead of
# being modified in place, and errors='ignore' skips labels that are already
# gone, so re-running this cell no longer raises KeyError.
rows_to_remove = images.index.union(grp_notif.index)
df = df.drop(index=rows_to_remove, errors='ignore')
In [321]:
df.shape
Out[321]:
(39103, 3)

Q1)

What is the difference in Weekend vs Weekday usage pattern?

How many words do I type on average on weekday vs weekend?

In [322]:
# name of the day of the week ("Monday", "Tuesday", ...) on which each message was sent
df['weekday'] = df['date_time'].dt.day_name()
In [323]:
# messages per day of the week, busiest day first (value_counts sorts descending by default)
df['weekday'].value_counts()
Out[323]:
Sunday       7267
Monday       6044
Wednesday    5655
Thursday     5491
Friday       5465
Saturday     5081
Tuesday      4100
Name: weekday, dtype: int64
In [324]:
# messages sent per user, most active first (value_counts sorts descending by default)
msgs_per_user = df['users'].value_counts()
msgs_per_user
Out[324]:
Sandesh..!!          9280
Sri Hari Colle       9162
Venkat               5264
Nikil DB             4954
Prajwal Prashanth    4384
Billa                1743
Ktg                  1436
manish lakshman      1293
Abhishek Dharani      587
Kushal Ramakanth      342
Prajwal Kaaadi        191
Kranti Jio            180
Srinidhi Nie          103
Keshava                94
+91 98863 53469        90
Name: users, dtype: int64
In [325]:
# msgs_per_user is ordered by message count, so its first five labels are the top senders
top5_users = list(msgs_per_user.index[:5])
top5_users
Out[325]:
['Sandesh..!!', 'Sri Hari Colle', 'Venkat', 'Nikil DB', 'Prajwal Prashanth']
In [326]:
# Filter first, then copy: the explicit copy of the filtered rows makes df_top5
# an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning, and the full frame is not copied needlessly.
df_top5 = df[df['users'].isin(top5_users)].copy()
df_top5.head()
Out[326]:
In [327]:
# per-user message counts, one bar per day of the week
fig, ax = plt.subplots(figsize=(30, 10))
sns.countplot(data=df, x="users", hue="weekday", ax=ax)
Out[327]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f39e9e936a0>
Notebook Image
In [332]:
# flag the messages that were sent on a Saturday or a Sunday
weekend_days = ['Sunday', 'Saturday']
df['is_weekend'] = df['weekday'].isin(weekend_days)
In [333]:
df[df['weekday']=='Friday'].head(1)
Out[333]:
In [334]:
df[df['weekday']=='Sunday'].head(1)
Out[334]:
In [335]:
# same weekend flag for the top-5 subset
weekend_days = ['Sunday', 'Saturday']
df_top5['is_weekend'] = df_top5['weekday'].isin(weekend_days)
In [336]:
# weekday vs weekend message counts for the five most active users
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=df_top5, x="users", hue="is_weekend", ax=ax)
Out[336]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f39ecb8d518>
Notebook Image
In [337]:
def word_count(val):
    """Return the number of whitespace-separated tokens in the string `val`."""
    tokens = val.split()
    return len(tokens)
In [338]:
# number of words in every message
df['no_of_words'] = df['messages'].map(word_count)
In [339]:
# number of words in every message of the top-5 subset
df_top5['no_of_words'] = df_top5['messages'].map(word_count)
In [340]:
# total words typed in messages that are not flagged as weekend
weekday_rows = ~df['is_weekend']
total_words_weekday = df.loc[weekday_rows, 'no_of_words'].sum()
total_words_weekday
Out[340]:
91876
In [341]:
# total words typed in messages flagged as weekend (Saturday/Sunday)
weekend_rows = df['is_weekend']
total_words_weekend = df.loc[weekend_rows, 'no_of_words'].sum()
total_words_weekend
Out[341]:
41129
In [342]:
# 5 = number of non-weekend day names (Monday-Friday). This spreads the total over
# the five weekday names; it is not an average per calendar day of chat history.
total_words_weekday/5
Out[342]:
18375.200000000001
In [343]:
# 2 = number of weekend day names (Saturday, Sunday). This spreads the total over
# the two weekend day names; it is not an average per calendar day of chat history.
total_words_weekend/2
Out[343]:
20564.5
In [344]:
# total words typed by each user, largest first
words_per_user = df.groupby('users')['no_of_words'].sum()
words_per_user.sort_values(ascending=False)
Out[344]:
users
Sandesh..!!          32255
Sri Hari Colle       27207
Venkat               20753
Prajwal Prashanth    17719
Nikil DB             16828
Billa                 4783
manish lakshman       4198
Ktg                   3701
Abhishek Dharani      2001
Kushal Ramakanth      1331
Prajwal Kaaadi         764
Kranti Jio             513
+91 98863 53469        447
Srinidhi Nie           287
Keshava                218
Name: no_of_words, dtype: int64
In [345]:
# average words per message for each user: total words / number of messages
by_user = df.groupby('users')
words_per_msg = by_user['no_of_words'].sum() / by_user.size()
words_per_msg.sort_values(ascending=False)
Out[345]:
users
+91 98863 53469      4.966667
Prajwal Prashanth    4.041743
Prajwal Kaaadi       4.000000
Venkat               3.942439
Kushal Ramakanth     3.891813
Sandesh..!!          3.475754
Abhishek Dharani     3.408859
Nikil DB             3.396851
manish lakshman      3.246713
Sri Hari Colle       2.969548
Kranti Jio           2.850000
Srinidhi Nie         2.786408
Billa                2.744119
Ktg                  2.577298
Keshava              2.319149
dtype: float64
In [346]:
# average words per message, restricted to the five most active users
by_top5_user = df_top5.groupby('users')
words_per_msg_top5 = by_top5_user['no_of_words'].sum() / by_top5_user.size()
words_per_msg_top5.sort_values(ascending=False)
Out[346]:
users
Prajwal Prashanth    4.041743
Venkat               3.942439
Sandesh..!!          3.475754
Nikil DB             3.396851
Sri Hari Colle       2.969548
dtype: float64
In [347]:
# average words per message for each (user, is_weekend) pair among the top-5 users
by_user_and_weekend = df_top5.groupby(['users', 'is_weekend'])
wordPerMsg_weekday_vs_weekend = by_user_and_weekend['no_of_words'].sum() / by_user_and_weekend.size()
wordPerMsg_weekday_vs_weekend
Out[347]:
users              is_weekend
Nikil DB           False         3.361165
                   True          3.456009
Prajwal Prashanth  False         4.001259
                   True          4.148179
Sandesh..!!        False         3.497849
                   True          3.429570
Sri Hari Colle     False         2.973726
                   True          2.960444
Venkat             False         4.051138
                   True          3.676913
dtype: float64
In [396]:
# horizontal bars: one bar per (user, is_weekend) pair
wordPerMsg_weekday_vs_weekend.plot.barh()
Out[396]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f39e8bfca20>
Notebook Image

Q 2)

Most Usage - Time of Day

In [368]:
def to_hour(val):
    """Return the `hour` attribute of a timestamp-like value."""
    hour_of_day = val.hour
    return hour_of_day
In [369]:
# hour of the day at which each message was sent
df['hour'] = df['date_time'].map(to_hour)
In [370]:
# hour of the day for the messages of the top-5 subset
df_top5['hour'] = df_top5['date_time'].map(to_hour)
In [371]:
# messages per hour of the day for a single user, as a bar chart
one_user_msgs = df[df['users'] == "Prajwal Prashanth"]
msgs_per_hour = one_user_msgs.groupby(['hour']).size().sort_index()
msgs_per_hour.plot(x="hour", kind='bar')
Out[371]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f39ea783208>
Notebook Image
In [425]:
# Count the messages for every (hour, weekday) pair, then reshape the counts
# into an hour x weekday grid. pivot() is called with keyword arguments because
# its parameters are keyword-only from pandas 2.0 on; the positional form raises
# TypeError there.
x = df.groupby(['hour', 'weekday'])['messages'].size().reset_index()
x2 = x.pivot(index="hour", columns='weekday', values='messages')
x2.head()
Out[425]:
In [424]:
# Column order for the heatmap. `days` is defined here because no other cell in
# the notebook defines it. NOTE(review): Monday-first order is assumed - confirm.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# reindex (rather than x2[days]) so that a weekday without any message becomes a
# zero column instead of raising KeyError
sns.heatmap(x2.reindex(columns=days).fillna(0), robust=True)
Out[424]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f39e7b2c128>
Notebook Image

Q 3)

In any group, do I have any inclination towards responding to someone?

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Q 4)

Is there any difference in the average response time during office hours vs. other hours?

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Q 5)

Which are the most common words?

In [307]:
from wordcloud import WordCloud, STOPWORDS

# Chat-specific filler words to hide in addition to wordcloud's default stop words.
# union() builds a new set: set.update() returns None (so `stopwords` used to be
# None) and it permanently modified the library-wide STOPWORDS set.
stopwords = STOPWORDS.union(['lo', 'ge', 'Lo', 'illa', 'yea', 'ella', 'en', 'na', 'En', 'yeah', 'alli', 'ide', 'okay', 'ok', 'will'])

# Lower-case every token of every message and build the text with a single join
# instead of growing a string inside nested loops. The result is the same text:
# a leading space followed by each word and a space.
comment_words = ' ' + ''.join(
    word.lower() + ' '
    for val in df.messages.values
    for word in str(val).split()
)

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopwords,
                min_font_size = 10).generate(comment_words)

In [308]:
# show the generated word cloud on a square figure, without axes or padding
fig, ax = plt.subplots(figsize=(8, 8), facecolor=None)
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)

plt.show()
Notebook Image
In [304]:
wordcloud.to_image()
Out[304]:
Notebook Image

Know What They Know move

  • First, let's see how many different questions can be answered from WhatsApp data; commit to Jovian with your answers, analysis and visualizations
  • Explore more data sources; here's a list I could find (share if you find any others):
    • WhatsApp
    • Google Takeout
  • Let's go step by step and then move on to some ML/DL models
In [ ]: