Learn practical skills, build real-world projects, and advance your career
Created 3 years ago
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import emoji
def rawToDf(file, key):
    """Parse an exported group-chat text file into a DataFrame of messages.

    The file content is cut at every date-time prefix that matches the
    pattern selected by ``key``; each resulting piece becomes one row.

    Args:
        file: Path of the UTF-8 encoded text file to read.
        key: Which prefix layout to use: ``'12hr'``, ``'24hr'`` or
            ``'custom'``. The ``'custom'`` entries are empty placeholders.

    Returns:
        A DataFrame with the columns ``date_time``, ``user`` and ``msg``.
        ``date_time`` holds datetimes when every prefix fits the strptime
        format for ``key``; otherwise the matched prefix strings are kept
        unchanged. Rows without a ``<user>: `` prefix get the user
        ``"grp_notif"`` and the whole text as ``msg``.

    Raises:
        OSError: If ``file`` cannot be opened.
        KeyError: If ``key`` is not one of the known layouts.
    """
    # Raw strings: the patterns are full of regex escapes (\d, \s) that are
    # not valid Python string escapes.
    split_formats = {
        '12hr': r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',
        '24hr': r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',
        'custom': '',
    }
    datetime_formats = {
        '12hr': '%d/%m/%Y, %I:%M %p -',
        '24hr': '%d/%m/%Y, %H:%M -',
        'custom': '',
    }

    with open(file, 'r', encoding="utf8") as raw_data:
        # Flatten to a single line so that multi-line messages stay attached
        # to the date-time prefix that precedes them.
        raw_string = raw_data.read().replace('\n', ' ')

    prefix_pattern = split_formats[key]
    # Everything before the first prefix is dropped; the remaining pieces are
    # the "<user>: <message>" (or notification) texts.
    user_msg = re.split(prefix_pattern, raw_string)[1:]
    # The prefixes themselves, in the same order as the pieces above.
    date_time = re.findall(prefix_pattern, raw_string)

    df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg})

    # The split patterns end in `\s`, so each matched prefix carries trailing
    # whitespace that the strptime formats above do not contain; strip it
    # before parsing.
    try:
        parsed = pd.to_datetime(
            [stamp.strip() for stamp in date_time],
            format=datetime_formats[key],
        )
    except (ValueError, TypeError):
        # Best effort, as before: if any prefix does not fit the format
        # (e.g. a two-digit year against %Y), keep the raw prefix strings.
        pass
    else:
        df['date_time'] = parsed

    # Lazy match up to the first "<user>: " separator.
    user_prefix = re.compile(r'([\w\W]+?):\s')
    usernames = []
    msgs = []
    for entry in user_msg:
        # maxsplit=1: only the first ": " separates the user from the text.
        # Without it, any later ": " inside the message would split again and
        # the message body would be lost.
        parts = user_prefix.split(entry, maxsplit=1)
        if len(parts) > 1:
            # Message typed by a user: ['', user, message].
            usernames.append(parts[1])
            msgs.append(parts[2])
        else:
            # Other notifications in the group (someone was added, left, ...).
            usernames.append("grp_notif")
            msgs.append(parts[0])

    df['user'] = usernames
    df['msg'] = msgs
    # The combined column is no longer needed once it has been split.
    df.drop('user_msg', axis=1, inplace=True)
    return df
# Load the exported chat, whose date-time prefixes use the 12-hour layout.
df = rawToDf(file='chats.txt', key='12hr')
# Bare expression: shows the frame when run as a notebook cell.
df
# NOTE(review): presumably the display name of the chat's owner as it appears
# in the `user` column; confirm against the code that uses `me`.
me = "VSM"