# Learn practical skills, build real-world projects, and advance your career.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import emoji
def rawToDf(file, key):
    """Parse an exported chat log into a DataFrame with one row per message.

    The file is read as UTF-8, newlines are replaced by spaces (so multi-line
    messages stay attached to the timestamp that introduced them), and the
    text is cut at every timestamp prefix of the selected layout.

    Args:
        file: Path of the exported chat text file.
        key: Timestamp layout to use: '12hr' (``d/m/y, h:mm AM/PM - ``),
            '24hr' (``d/m/y, hh:mm - ``) or 'custom'.  The 'custom' entries
            are empty placeholders in both tables and must be filled in
            before that key is useful.

    Returns:
        A DataFrame with the columns ``date_time``, ``user`` and ``msg``.
        ``date_time`` holds parsed timestamps when every prefix matches the
        layout's datetime format; otherwise the raw prefix strings are kept
        unchanged.  Entries without a ``name: `` prefix (group
        notifications such as joins and leaves) get the user "grp_notif".

    Raises:
        KeyError: If ``key`` is not one of the known layouts.
        OSError: If ``file`` cannot be opened.
    """
    # Raw strings keep the regex escapes (\d, \s) away from Python's own
    # string-escape processing.
    split_formats = {
        '12hr': r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',
        '24hr': r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',
        'custom': '',
    }
    datetime_formats = {
        '12hr': '%d/%m/%Y, %I:%M %p -',
        '24hr': '%d/%m/%Y, %H:%M -',
        'custom': '',
    }

    with open(file, 'r', encoding="utf8") as raw_data:
        # Flatten to a single line: a message may span several lines, and
        # only a timestamp prefix marks the start of a new message.
        raw_string = raw_data.read().replace('\n', ' ')

    stamp_pattern = re.compile(split_formats[key])
    # Text before the first timestamp (index 0 of the split) is not a message.
    user_msg = stamp_pattern.split(raw_string)[1:]
    # The same pattern, used to collect the timestamp prefixes themselves.
    date_time = stamp_pattern.findall(raw_string)

    # Lazy match up to the first "<user>: " only.  maxsplit=1 matters: without
    # it a message that itself contains ": " is split again and its text is
    # lost.
    author_pattern = re.compile(r'([\w\W]+?):\s')
    usernames = []
    msgs = []
    for entry in user_msg:
        parts = author_pattern.split(entry, maxsplit=1)
        if len(parts) > 1:
            # parts == ['', user, message] for a user-typed message.
            usernames.append(parts[1])
            msgs.append(parts[2])
        else:
            # No "<user>: " prefix: a group notification (someone was
            # added, someone left, ...).
            usernames.append("grp_notif")
            msgs.append(parts[0])

    df = pd.DataFrame({'date_time': date_time, 'user': usernames, 'msg': msgs})

    # Each matched prefix ends in " - " while the datetime format ends in
    # " -"; drop the trailing whitespace so exact-format parsing can succeed.
    cleaned = [stamp.rstrip() for stamp in date_time]
    try:
        df['date_time'] = pd.to_datetime(cleaned, format=datetime_formats[key])
    except (ValueError, TypeError):
        # Best effort, as before: when the prefixes do not fit the format
        # (e.g. two-digit years), keep the raw strings instead of failing.
        pass

    return df
            
# Parse the exported chat log 'chats.txt' using the 12-hour (AM/PM) timestamp layout.
df = rawToDf('chats.txt', '12hr')
# Bare expression: has no effect when run as a script.
# NOTE(review): presumably left over from a notebook cell that displayed the frame.
df
# NOTE(review): presumably the name of the chat owner as it appears in the 'user' column; confirm against later usage.
me = "VSM"