Learn practical skills, build real-world projects, and advance your career
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
pip install numpy pandas matplotlib seaborn wordcloud emoji jovian --upgrade 
Collecting numpy Downloading numpy-1.19.2-cp37-cp37m-manylinux2010_x86_64.whl (14.5 MB) |████████████████████████████████| 14.5 MB 6.3 MB/s eta 0:00:01 Requirement already up-to-date: pandas in /opt/conda/lib/python3.7/site-packages (1.1.3) Collecting matplotlib Downloading matplotlib-3.3.2-cp37-cp37m-manylinux1_x86_64.whl (11.6 MB) |████████████████████████████████| 11.6 MB 11.0 MB/s eta 0:00:01 Collecting seaborn Downloading seaborn-0.11.0-py3-none-any.whl (283 kB) |████████████████████████████████| 283 kB 18.2 MB/s eta 0:00:01 Requirement already up-to-date: wordcloud in /opt/conda/lib/python3.7/site-packages (1.8.0) Requirement already up-to-date: emoji in /opt/conda/lib/python3.7/site-packages (0.6.0) Collecting jovian Downloading jovian-0.2.21-py2.py3-none-any.whl (65 kB) |████████████████████████████████| 65 kB 2.4 MB/s eta 0:00:01 Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /opt/conda/lib/python3.7/site-packages (from pandas) (2019.3) Requirement already satisfied, skipping upgrade: python-dateutil>=2.7.3 in /opt/conda/lib/python3.7/site-packages (from pandas) (2.8.1) Requirement already satisfied, skipping upgrade: cycler>=0.10 in /opt/conda/lib/python3.7/site-packages (from matplotlib) (0.10.0) Requirement already satisfied, skipping upgrade: pillow>=6.2.0 in /opt/conda/lib/python3.7/site-packages (from matplotlib) (8.0.0) Requirement already satisfied, skipping upgrade: kiwisolver>=1.0.1 in /opt/conda/lib/python3.7/site-packages (from matplotlib) (1.2.0) Requirement already satisfied, skipping upgrade: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /opt/conda/lib/python3.7/site-packages (from matplotlib) (2.4.7) Requirement already satisfied, skipping upgrade: certifi>=2020.06.20 in /opt/conda/lib/python3.7/site-packages (from matplotlib) (2020.6.20) Requirement already satisfied, skipping upgrade: scipy>=1.0 in /opt/conda/lib/python3.7/site-packages (from seaborn) (1.4.1) Collecting uuid Downloading uuid-1.30.tar.gz (5.8 kB) 
Requirement already satisfied, skipping upgrade: requests in /opt/conda/lib/python3.7/site-packages (from jovian) (2.23.0) Requirement already satisfied, skipping upgrade: click in /opt/conda/lib/python3.7/site-packages (from jovian) (7.1.1) Requirement already satisfied, skipping upgrade: pyyaml in /opt/conda/lib/python3.7/site-packages (from jovian) (5.3.1) Requirement already satisfied, skipping upgrade: six>=1.5 in /opt/conda/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas) (1.14.0) Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->jovian) (1.24.3) Requirement already satisfied, skipping upgrade: chardet<4,>=3.0.2 in /opt/conda/lib/python3.7/site-packages (from requests->jovian) (3.0.4) Requirement already satisfied, skipping upgrade: idna<3,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->jovian) (2.9) Building wheels for collected packages: uuid Building wheel for uuid (setup.py) ... done Created wheel for uuid: filename=uuid-1.30-py3-none-any.whl size=6500 sha256=c1e548b91355b486f42939d4e0cee77207d2669aec726a73c376074fff9dc6de Stored in directory: /root/.cache/pip/wheels/2a/ea/87/dd57f1ecb4f0752f3e1dbf958ebf8b36d920d190425bcdc24d Successfully built uuid Installing collected packages: numpy, matplotlib, seaborn, uuid, jovian Attempting uninstall: numpy Found existing installation: numpy 1.18.5 Uninstalling numpy-1.18.5: Successfully uninstalled numpy-1.18.5 Attempting uninstall: matplotlib Found existing installation: matplotlib 3.2.1 Uninstalling matplotlib-3.2.1: Successfully uninstalled matplotlib-3.2.1 Attempting uninstall: seaborn Found existing installation: seaborn 0.10.0 Uninstalling seaborn-0.10.0: Successfully uninstalled seaborn-0.10.0 ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts. 
We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default. tensorflow 2.3.1 requires numpy<1.19.0,>=1.16.0, but you'll have numpy 1.19.2 which is incompatible. kmeans-smote 0.1.2 requires imbalanced-learn<0.5,>=0.4.0, but you'll have imbalanced-learn 0.7.0 which is incompatible. kmeans-smote 0.1.2 requires numpy<1.16,>=1.13, but you'll have numpy 1.19.2 which is incompatible. kmeans-smote 0.1.2 requires scikit-learn<0.21,>=0.19.0, but you'll have scikit-learn 0.23.2 which is incompatible. dask-xgboost 0.1.11 requires xgboost<=0.90, but you'll have xgboost 1.2.1 which is incompatible. bokeh 2.2.2 requires tornado>=5.1, but you'll have tornado 5.0.2 which is incompatible. Successfully installed jovian-0.2.21 matplotlib-3.3.2 numpy-1.19.2 seaborn-0.11.0 uuid-1.30 Note: you may need to restart the kernel to use updated packages.
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import emoji
from collections import Counter
%matplotlib inline
import seaborn as sns
#import os
#pip install jovian --upgrade
import jovian
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-47-8babc128ff95> in <module> ----> 1 import seaborn as sns 2 #import os 3 #pip install jovian --upgrade 4 import jovian /opt/conda/lib/python3.7/site-packages/seaborn/__init__.py in <module> 1 # Import seaborn objects ----> 2 from .rcmod import * # noqa: F401,F403 3 from .utils import * # noqa: F401,F403 4 from .palettes import * # noqa: F401,F403 5 from .relational import * # noqa: F401,F403 /opt/conda/lib/python3.7/site-packages/seaborn/rcmod.py in <module> 5 import matplotlib as mpl 6 from cycler import cycler ----> 7 from . import palettes 8 9 /opt/conda/lib/python3.7/site-packages/seaborn/palettes.py in <module> 7 from .external import husl 8 ----> 9 from .utils import desaturate, get_color_cycle 10 from .colors import xkcd_rgb, crayons 11 /opt/conda/lib/python3.7/site-packages/seaborn/utils.py in <module> 8 9 import numpy as np ---> 10 from scipy import stats 11 import pandas as pd 12 import matplotlib as mpl /opt/conda/lib/python3.7/site-packages/scipy/stats/__init__.py in <module> 382 from __future__ import division, print_function, absolute_import 383 --> 384 from .stats import * 385 from .distributions import * 386 from .morestats import * /opt/conda/lib/python3.7/site-packages/scipy/stats/stats.py in <module> 177 178 from scipy._lib.six import callable, string_types --> 179 from scipy.spatial.distance import cdist 180 from scipy.ndimage import measurements 181 from scipy._lib._version import NumpyVersion /opt/conda/lib/python3.7/site-packages/scipy/spatial/__init__.py in <module> 95 from __future__ import division, print_function, absolute_import 96 ---> 97 from .kdtree import * 98 from .ckdtree import * 99 from .qhull import * /opt/conda/lib/python3.7/site-packages/scipy/spatial/kdtree.py in <module> 6 import numpy as np 7 from heapq import heappush, heappop ----> 8 import scipy.sparse 9 10 __all__ = 
['minkowski_distance_p', 'minkowski_distance', /opt/conda/lib/python3.7/site-packages/scipy/sparse/__init__.py in <module> 227 import warnings as _warnings 228 --> 229 from .base import * 230 from .csr import * 231 from .csc import * /opt/conda/lib/python3.7/site-packages/scipy/sparse/base.py in <module> 5 6 from scipy._lib.six import xrange ----> 7 from scipy._lib._numpy_compat import broadcast_to 8 from .sputils import (isdense, isscalarlike, isintlike, 9 get_sum_dtype, validateaxis, check_reshape_kwargs, /opt/conda/lib/python3.7/site-packages/scipy/_lib/_numpy_compat.py in <module> 14 15 if NumpyVersion(np.__version__) > '1.7.0.dev': ---> 16 _assert_warns = np.testing.assert_warns 17 else: 18 def _assert_warns(warning_class, func, *args, **kw): /opt/conda/lib/python3.7/site-packages/numpy/__init__.py in __getattr__(attr) 211 from .testing import Tester 212 return Tester --> 213 else: 214 raise AttributeError("module {!r} has no attribute " 215 "{!r}".format(__name__, attr)) /opt/conda/lib/python3.7/site-packages/numpy/testing/__init__.py in <module> 8 from unittest import TestCase 9 ---> 10 from ._private.utils import * 11 from ._private import decorators as dec 12 from ._private.nosetester import ( /opt/conda/lib/python3.7/site-packages/numpy/testing/_private/utils.py in <module> 50 IS_PYPY = platform.python_implementation() == 'PyPy' 51 HAS_REFCOUNT = getattr(sys, 'getrefcount', None) is not None ---> 52 HAS_LAPACK64 = numpy.linalg.lapack_lite._ilp64 53 54 AttributeError: module 'numpy.linalg.lapack_lite' has no attribute '_ilp64'
def rawToDf(file, key):
    """Parse an exported chat text file into a DataFrame.

    Every message in the file is expected to start with a date-time stamp
    such as ``dd/mm/yyyy, hh:mm - `` (24-hour) or ``dd/mm/yyyy, hh:mm AM - ``
    (12-hour). The text is split at those stamps, and each piece is then
    split once into the sender and the message body.

    Args:
        file: Path of the exported chat text file (read as UTF-8).
        key: Which stamp format the file uses: '12hr', '24hr' or 'custom'.
            The 'custom' entries are empty placeholders and must be filled
            in below before that key is usable.

    Returns:
        A DataFrame with the columns ``date_time`` (datetime), ``user`` and
        ``msg``. Entries without a ``name: `` prefix (group notifications
        such as someone joining or leaving) get the user ``"grp_notif"``.

    Raises:
        KeyError: If ``key`` is not one of the known format names.
    """
    # Raw strings so that \d, \s etc. reach the regex engine untouched
    # instead of being treated as (invalid) string escape sequences.
    split_formats = {
        '12hr': r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',
        '24hr': r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',
        'custom': '',
    }
    datetime_formats = {
        '12hr': '%d/%m/%Y, %I:%M %p - ',
        '24hr': '%d/%m/%Y, %H:%M - ',
        'custom': '',
    }

    # Look both formats up first so an unknown key fails before any file I/O.
    stamp_pattern = re.compile(split_formats[key])
    stamp_format = datetime_formats[key]

    # Explicit encoding: chat text routinely contains emoji and other
    # non-ASCII characters, which must not depend on the platform default.
    with open(file, 'r', encoding='utf-8') as raw_data:
        # Join the lines into one string because a single message can span
        # several lines; only a date-time stamp marks a new message.
        raw_string = ' '.join(raw_data.read().split('\n'))

    # Text before the first stamp is dropped ([1:]) so that the messages and
    # the stamps found by findall line up one-to-one.
    user_msg = stamp_pattern.split(raw_string)[1:]
    date_time = stamp_pattern.findall(raw_string)

    df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg})

    # Convert the stamp strings to real datetimes; the format has to describe
    # the whole matched stamp, including the trailing " - ".
    df['date_time'] = pd.to_datetime(df['date_time'], format=stamp_format)

    # Lazy match up to the first "<name>: ". maxsplit=1 is essential: without
    # it every later ": " inside the message body would split again and the
    # stored message would be only a fragment (often an empty string).
    user_pattern = re.compile(r'([\w\W]+?):\s')
    usernames = []
    msgs = []
    for entry in df['user_msg']:
        parts = user_pattern.split(entry, maxsplit=1)
        if len(parts) == 3:  # ['', user, message] -> a message typed by a user
            usernames.append(parts[1])
            msgs.append(parts[2])
        else:  # no "name: " prefix -> group notification (added, left, ...)
            usernames.append("grp_notif")
            msgs.append(parts[0])

    df['user'] = usernames
    df['msg'] = msgs

    # The combined column is no longer needed once it has been split.
    df.drop('user_msg', axis=1, inplace=True)

    return df