Learn practical skills, build real-world projects, and advance your career
# This Python notebook analyzes what makes a language complex
# Author - Karthik D

%matplotlib inline
import pandas as pd
import re

Load Data

# Load the story corpus and split it into one DataFrame per language.
DATA_PATH = "/home/karthik/Documents/Deep_Learning/CivicData/CivicData/data_stories.csv"

df = pd.read_csv(DATA_PATH)
# Keep only the columns used below. The spelling "story_langugage" is kept
# as-is because it is the column name selected from the CSV.
df = df[["story_id", "content", "reading_level_updated", "story_langugage"]]
# Drop the rows at positions 3658, 3669 and 3686 (reason not visible here;
# presumably bad records -- TODO confirm).
df = df.drop(df.index[[3658, 3669, 3686]])
df = df.dropna()

# Take explicit copies: a boolean-mask selection is tracked by pandas as a
# slice of `df`, and assigning to a column of it later triggers
# SettingWithCopyWarning. `.copy()` makes each frame independent.
eng_df = df[df.story_langugage == "English"].copy()
hin_df = df[df.story_langugage == "Hindi"].copy()
tel_df = df[df.story_langugage == "Telugu"].copy()

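Pre-processing: remove digits, punctuation, and newline/tab/carriage-return characters from the story text (and also Latin letters from the Hindi and Telugu stories)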

import re

# Clean the "content" column of each per-language frame.

# Removed from Hindi and Telugu stories: Latin letters, digits and punctuation.
_NON_NATIVE_CHARS = r'[A-Za-z0-9&;:,\-()”“\'\'\"\\]'
# Removed from English stories: digits and punctuation (letters are kept).
_ENGLISH_NOISE_CHARS = r'[0-9&;:\-,()”“\'\'\"\\]'
# Newlines, tabs and carriage returns are removed outright (not replaced by
# a space), matching the original three separate replace calls.
_LAYOUT_CHARS = r'[\n\t\r]'


def _clean_content(frame, noise_pattern):
    """Return a new frame whose ``content`` column has been cleaned.

    Args:
        frame: DataFrame with a string ``content`` column.
        noise_pattern: Regex character class of characters to delete.

    Returns:
        A new DataFrame equal to ``frame`` except that ``content`` has every
        match of ``noise_pattern`` and every newline, tab and carriage
        return removed.
    """
    # regex=True is passed explicitly: the default for Series.str.replace
    # changed to False in pandas 2.0, which would treat the character
    # classes as literal text and silently remove nothing.
    cleaned = (
        frame["content"]
        .str.replace(noise_pattern, "", regex=True)
        .str.replace(_LAYOUT_CHARS, "", regex=True)
    )
    # assign() builds a new frame instead of writing into a possible slice
    # of another frame, so no SettingWithCopyWarning is raised.
    return frame.assign(content=cleaned)


hin_df = _clean_content(hin_df, _NON_NATIVE_CHARS)
tel_df = _clean_content(tel_df, _NON_NATIVE_CHARS)
eng_df = _clean_content(eng_df, _ENGLISH_NOISE_CHARS)
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py:4405: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self[name] = value