Jovian
⭐️
Sign In
In [8]:
import re
import jovian
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import emoji
from collections import Counter
In [9]:
!pip install emoji
Requirement already satisfied: emoji in /srv/conda/envs/notebook/lib/python3.7/site-packages (0.5.4)

Data Preprocessing

In [10]:
def rawToDf(file):
    """Parse an exported WhatsApp chat text file into a DataFrame.

    Parameters
    ----------
    file : str
        Path to the exported chat ``.txt`` file.

    Returns
    -------
    pd.DataFrame
        Columns: ``date_time`` (datetime64), ``user`` (sender name, or
        ``"grp_notif"`` for group notifications), ``msg`` (message text).
    """
    # Timestamp that prefixes every message, e.g. "10/20/19, 10:24 pm - ".
    # Compiled once and written as a raw string (avoids the invalid-escape
    # DeprecationWarning for '\d'/'\s' in plain strings).
    timestamp_re = re.compile(r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s')

    with open(file, 'r') as raw_data:
        # Join physical lines with spaces so a multi-line message becomes one
        # string; messages are then delimited only by their timestamps.
        raw_string = ' '.join(raw_data.read().split('\n'))
        user_msg = timestamp_re.split(raw_string)[1:]  # messages (sender prefix kept)
        date_time = timestamp_re.findall(raw_string)   # the matching timestamps

    df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg})

    # Exports use either m/d/yy or d/m/yyyy; try the first, fall back to the
    # second. Catch only ValueError (what to_datetime raises on a format
    # mismatch) instead of a bare except that would hide real bugs.
    try:
        df['date_time'] = pd.to_datetime(df['date_time'], format='%m/%d/%y, %I:%M %p - ')  # 10/20/19, 10:24 pm -
    except ValueError:
        df['date_time'] = pd.to_datetime(df['date_time'], format='%d/%m/%Y, %I:%M %p - ')  # 20/10/2019, 10:24 pm -

    # Split "{user}: {msg}". Entries without a "name: " prefix are group
    # notifications (someone was added, someone left, ...).
    usernames = []
    msgs = []
    for entry in df['user_msg']:
        parts = re.split(r'([\w\W]+?):\s', entry)  # lazy match up to the first "name: "
        if parts[1:]:  # user-typed message
            usernames.append(parts[1])
            msgs.append(parts[2])
        else:          # group notification
            usernames.append("grp_notif")
            msgs.append(parts[0])

    # Replace the combined column with the two split ones.
    df['user'] = usernames
    df['msg'] = msgs
    df.drop('user_msg', axis=1, inplace=True)

    return df
In [11]:
me = "Prajwal Prashanth" ## add your name

Data

In [12]:
df = rawToDf('Hermanos_group.txt')
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) <ipython-input-12-4c45baae96c7> in <module> ----> 1 df = rawToDf('Hermanos_group.txt') <ipython-input-10-f597c64ef5ad> in rawToDf(file) 1 def rawToDf(file): ----> 2 with open(file, 'r') as raw_data: 3 raw_string = ' '.join(raw_data.read().split('\n')) # converting the list split by newline char. as one whole string as there can be multi-line messages 4 user_msg = re.split('\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s', raw_string) [1:] # splits at all the date-time pattern, resulting in list of all the messages with user names 5 date_time = re.findall('\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s', raw_string) # finds all the date-time patterns FileNotFoundError: [Errno 2] No such file or directory: 'Hermanos_group.txt'
In [ ]:
# Quick sanity checks on the parsed frame.
df.head()
In [ ]:
df.shape # no. of msgs
In [ ]:
df.sample(10)

No. of Images, group notifications and dropping them

In [ ]:
# Media messages are exported as the literal placeholder "<Media omitted> "
# (note the trailing space), so an exact string match finds all of them.
images = df[df['msg']=="<Media omitted> "] #no. of images, images are represented by <media omitted>
images.shape
In [ ]:
# All distinct senders, including the synthetic "grp_notif" user.
df["user"].unique()
In [ ]:
grp_notif = df[df['user']=="grp_notif"] #no. of grp notifications
grp_notif.shape
In [ ]:
# Drop media placeholders and group notifications so only real text
# messages remain for the analysis below.
df.drop(images.index, inplace=True) #removing images
df.drop(grp_notif.index, inplace=True) #removing grp_notif
In [ ]:
# Re-number rows 0..n-1 after the drops; the reply analysis later relies on
# a contiguous index when computing "previous message" positions.
df.reset_index(inplace=True, drop=True)
df.shape

Q 1)

Who is the most active member of the group? Who is the least active? Is it the same on weekdays vs. weekends?

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.weekday.html

In [ ]:
df[df.msg.str.contains('birthday')]
In [ ]:
# For each of Prajwal's messages, print how many consecutive messages by
# other users came before it (runs of others' messages between his).
c = 0
for i in df['user']:
    if i != "Prajwal Prashanth":
        c = c + 1
    else: 
        print(c)
        c = 0 
       
In [ ]:
# Total message count per user, most active first.
df.groupby("user")["msg"].count().sort_values(ascending=False)
In [ ]:
df['weekday'] = df['date_time'].apply(lambda x: x.day_name()) # can use day_name or weekday from datetime 
In [ ]:
# Message volume per day of week, busiest first.
df.weekday.value_counts(sort=True)
In [ ]:
# Per-user activity on Thursdays only.
df[df.weekday=="Thursday"]['user'].value_counts()
In [ ]:
# Boolean flag: True for Saturday/Sunday messages.
df['is_weekend'] = df.weekday.isin(['Sunday', 'Saturday'])

Q 2)

Count of all the emoticons that I have used to date.

In [ ]:
# Re-inspect the frame after cleaning.
df.head()
In [ ]:
df["user"].unique()
In [ ]:
# Count every emoji I have ever sent in this chat.
emoji_ctr = Counter()
# Strip whitespace from each emoji key, then build one big alternation regex
# of all known emojis (escaped, since many contain regex metacharacters).
# NOTE(review): this assumes emoji.UNICODE_EMOJI is a flat dict keyed by
# emoji character (emoji < 1.0); newer versions nest it per language —
# confirm the installed version (0.5.4 per the pip output above).
emojis_list = map(lambda x: ''.join(x.split()), emoji.UNICODE_EMOJI.keys())
r = re.compile('|'.join(re.escape(p) for p in emojis_list))
for idx, row in df.iterrows():
    if row["user"] == me:
        emojis_found = r.findall(row["msg"])
        for emoji_found in emojis_found:
            emoji_ctr[emoji_found] += 1
In [ ]:
# Ten most frequent emojis with their counts.
for item in emoji_ctr.most_common(10):
    print(item[0] + " - " + str(item[1]))

Q 3)

What can my activity say about my sleep cycle?

In [ ]:
def to_hour(val):
    """Extract the hour of day (0-23) from a datetime-like value."""
    hour_of_day = val.hour
    return hour_of_day
In [ ]:
# Hour of day (0-23) each message was sent.
df['hour'] = df['date_time'].apply(to_hour)
In [ ]:
# My message volume by hour of day — the quiet hours suggest sleep time.
df[df['user']==me].groupby(['hour']).size().sort_index().plot(x="hour", kind='bar')

Q 4)

What is the difference in Weekend vs Weekday usage pattern?

How many words do I type on average on weekday vs weekend?

In [ ]:
# Message count per user, most active first.
msgs_per_user = df['user'].value_counts(sort=True)
msgs_per_user
In [ ]:
# value_counts is already sorted descending, so the first five index
# entries are the five most active users.
top5_users = msgs_per_user.index.tolist()[:5]
top5_users
In [ ]:
# Subset of the frame restricted to the five most active users.
df_top5 = df.copy()
df_top5 = df_top5[df_top5.user.isin(top5_users)]
df_top5.head()
In [ ]:
# Per-user message counts broken down by day of week (all users).
plt.figure(figsize=(30,10))
sns.countplot(x="user", hue="weekday", data=df)
In [ ]:
# Same weekend flag as on df; df_top5 was copied before that column existed.
df_top5['is_weekend'] = df_top5.weekday.isin(['Sunday', 'Saturday'])
In [ ]:
# Weekend vs weekday message counts for the top-5 users.
plt.figure(figsize=(20,10))
sns.countplot(x="user", hue="is_weekend", data=df_top5)
In [ ]:
def word_count(val):
    """Return the number of whitespace-separated tokens in a string."""
    tokens = val.split()
    return len(tokens)
In [ ]:
# Words per message, for the whole group and for the top-5 subset.
df['no_of_words'] = df['msg'].apply(word_count)
In [ ]:
df_top5['no_of_words'] = df_top5['msg'].apply(word_count)
In [ ]:
# Total words typed on weekdays vs weekends.
total_words_weekday = df[df['is_weekend']==False]['no_of_words'].sum()
total_words_weekday
In [ ]:
total_words_weekend = df[df['is_weekend']]['no_of_words'].sum()
total_words_weekend
In [ ]:
# Average words per day: 5 weekdays vs 2 weekend days.
total_words_weekday/5
In [ ]:
total_words_weekend/2
In [ ]:
# Total words per user.
df.groupby('user')['no_of_words'].sum().sort_values(ascending=False)
In [ ]:
# Average words per message per user (total words / message count).
(df.groupby('user')['no_of_words'].sum()/df.groupby('user').size()).sort_values(ascending=False)
In [ ]:
(df_top5.groupby('user')['no_of_words'].sum()/df_top5.groupby('user').size()).sort_values(ascending=False)
In [ ]:
# Words per message for each (user, weekend?) pair among the top-5 users.
wordPerMsg_weekday_vs_weekend = (df_top5.groupby(['user', 'is_weekend'])['no_of_words'].sum()/df_top5.groupby(['user', 'is_weekend']).size())
wordPerMsg_weekday_vs_weekend
In [ ]:
wordPerMsg_weekday_vs_weekend.plot(kind='barh')

Q 5)

Most Usage - Time of Day

In [ ]:
# Message counts per (hour, weekday) cell, pivoted for the heatmap.
x = df.groupby(['hour', 'weekday'])['msg'].size().reset_index()
x2 = x.pivot("hour", 'weekday', 'msg')
x2.head()
In [ ]:
# Reorder columns Monday..Sunday; robust=True computes the colormap range
# from quantiles so a few extreme cells don't wash out the rest.
days = ["Monday", 'Tuesday', "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
sns.heatmap(x2[days].fillna(0), robust=True)

Q 6)

In any group, do I have any inclination towards responding to someone?

In [ ]:
# Row positions of my messages in the (re-indexed, contiguous) frame.
my_msgs_index = np.array(df[df['user']==me].index)
print(my_msgs_index, my_msgs_index.shape)
In [ ]:
# The message immediately before each of mine — i.e. the one I replied to.
prev_msgs_index = my_msgs_index - 1
print(prev_msgs_index, prev_msgs_index.shape)
In [ ]:
# Drop the first entry. NOTE(review): this is only correct if my first
# message sits at row 0 (whose "previous" index -1 is invalid); otherwise a
# valid reply is discarded here — confirm against the printed indices above.
prev_msgs_index = np.delete(prev_msgs_index, 0)
prev_msgs_index
In [ ]:
# The messages I replied to.
df_replies = df.iloc[prev_msgs_index].copy()
df_replies.shape
In [ ]:
# Absolute count of my replies per user.
df_replies.groupby(["user"])["msg"].size().sort_values().plot(kind='barh')
In [ ]:
# Reply rate: fraction of each user's messages that I replied to next.
(df_replies.groupby(["user"])["msg"].size()/df.groupby(["user"])["msg"].size()).sort_values().plot(kind='barh')

Q 7)

Which are the most common words?

In [ ]:
# Extra stopwords: common chat filler words to exclude from the cloud.
# BUG FIX: the original assigned `stopwords = STOPWORDS.update([...])`, but
# set.update returns None — it only worked because WordCloud falls back to
# the (mutated) global STOPWORDS when stopwords is None. union() returns a
# new set and leaves the shared global untouched.
stopwords = STOPWORDS.union(['lo', 'ge', 'Lo', 'illa', 'yea', 'ella', 'en', 'na', 'En', 'yeah', 'alli', 'ide', 'okay', 'ok', 'will'])

# Lower-case every token of every message and build the corpus string in one
# pass; ''.join is O(n), unlike the previous word-by-word `s = s + w + ' '`
# concatenation which was O(n^2) in total text length.
comment_words = ' ' + ''.join(
    token.lower() + ' '
    for val in df.msg.values
    for token in str(val).split()
)

wordcloud = WordCloud(width=800, height=800,
                      background_color='black',
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)

In [ ]:
# Render the word cloud with matplotlib, axes hidden.
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show() 
In [ ]:
# Alternative: render as a PIL image directly.
wordcloud.to_image()

Know What They Know (at least a little)

Assignment-kind

  • One-way or two-way: check the response time between two people
In [13]:
import jovian
In [ ]:
jovian.commit()
[jovian] Saving notebook..
In [ ]: