Learn practical skills, build real-world projects, and advance your career
Created 5 years ago
# This Python notebook analyzes what makes a language complex.
# Author - Karthik D
%matplotlib inline
import pandas as pd
import re
Load Data
# Load the story corpus and keep only the columns this notebook uses.
# NOTE: "story_langugage" (sic) is presumably the header as spelled in the
# CSV; do not "fix" the spelling here without renaming the column first.
df = pd.read_csv("/home/karthik/Documents/Deep_Learning/CivicData/CivicData/data_stories.csv")
df = df[["story_id", "content", "reading_level_updated", "story_langugage"]]

# Drop three rows selected by position (3658, 3669, 3686), then discard any
# row that still has a missing value in one of the kept columns.
# NOTE(review): the reason these specific rows are excluded is not recorded
# in this notebook - confirm before reusing on a different CSV.
df = df.drop(df.index[[3658, 3669, 3686]])
df = df.dropna()

# One independent frame per language. The explicit .copy() detaches each
# subset from `df`, so later in-place edits to `content` neither raise
# SettingWithCopyWarning nor leave it ambiguous which frame is modified.
eng_df = df[df.story_langugage == "English"].copy()
hin_df = df[df.story_langugage == "Hindi"].copy()
tel_df = df[df.story_langugage == "Telugu"].copy()
Some Pre-processing
import re
# Characters stripped from Hindi/Telugu stories: ASCII letters, digits and
# the listed punctuation (including curly and straight quotes, backslash).
_NON_LATIN_NOISE = r'[A-Za-z0-9&;:,\-()”“\'\'\"\\]'
# English stories keep their letters; only digits and punctuation are removed.
_ENGLISH_NOISE = r'[0-9&;:\-,()”“\'\'\"\\]'
# Newline, tab and carriage return, removed (not replaced by a space).
_CONTROL_WHITESPACE = r'[\n\t\r]'


def _clean_content(frame, noise_pattern):
    """Return a new frame whose ``content`` column has been cleaned.

    Every match of ``noise_pattern`` and every newline, tab and carriage
    return is deleted from ``content``. ``regex=True`` is passed explicitly
    because pandas >= 2.0 treats the pattern as a literal string by default,
    which would silently leave the text untouched.

    ``DataFrame.assign`` builds a new frame instead of writing into
    ``frame``, so no SettingWithCopyWarning is raised even when ``frame``
    is a filtered slice of another DataFrame.
    """
    cleaned = (
        frame["content"]
        .str.replace(noise_pattern, "", regex=True)
        .str.replace(_CONTROL_WHITESPACE, "", regex=True)
    )
    return frame.assign(content=cleaned)


hin_df = _clean_content(hin_df, _NON_LATIN_NOISE)
tel_df = _clean_content(tel_df, _NON_LATIN_NOISE)
eng_df = _clean_content(eng_df, _ENGLISH_NOISE)
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py:4405: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self[name] = value