# This Python notebook is used to analyze/understand what makes a language complex
# Author - Karthik D
%matplotlib inline
import pandas as pd
import re
df = pd.read_csv("/home/karthik/Documents/Deep_Learning/CivicData/CivicData/data_stories.csv")
df = df[["story_id", "content", 'reading_level_updated', 'story_langugage']]
df = df.drop(df.index[[3658, 3669, 3686]])
df = df.dropna()
# .copy() so the per-language frames can be modified below without SettingWithCopyWarning
eng_df = df[df.story_langugage == 'English'].copy()
hin_df = df[df.story_langugage == 'Hindi'].copy()
tel_df = df[df.story_langugage == 'Telugu'].copy()
# Strip Latin letters, digits, punctuation, and quote characters from the Hindi and Telugu
# stories, and digits, punctuation, and quotes from the English stories.
# regex=True is passed explicitly so the character classes are treated as regular expressions.
hin_df.content = hin_df.content.str.replace(r'[A-Za-z0-9&;:,\-()”“\'\"\\]', "", regex=True)
tel_df.content = tel_df.content.str.replace(r'[A-Za-z0-9&;:,\-()”“\'\"\\]', "", regex=True)
eng_df.content = eng_df.content.str.replace(r'[0-9&;:\-,()”“\'\"\\]', "", regex=True)
# Remove newlines, tabs, and carriage returns
eng_df.content = eng_df.content.str.replace(r'[\n\t\r]', "", regex=True)
tel_df.content = tel_df.content.str.replace(r'[\n\t\r]', "", regex=True)
hin_df.content = hin_df.content.str.replace(r'[\n\t\r]', "", regex=True)
# Below are function definitions for computing the following stats:
# Count of words
# Avg word length
# Avg sentence length
# Count of sentences
# Avg number of paragraphs
# Avg length of paragraphs
def count_of_words(df):
    # Mean number of whitespace-separated words per story, grouped by reading level
    df = pd.DataFrame({"content": df.content, "level": df.reading_level_updated})
    df["word_counts"] = df.content.str.split(" ").apply(len)
    df = df.drop("content", axis=1)
    word_counts = df.groupby(["level"]).mean().round(0)
    return word_counts

def avg_word_length(df):
    # Mean number of characters per word, grouped by reading level
    def letter_counts(l):
        l = [len(s) for s in l]
        return sum(l) / len(l)
    df = pd.DataFrame({"content": df.content, "level": df.reading_level_updated})
    df["mean_letter_counts"] = df.content.str.split(" ").apply(letter_counts)
    df = df.drop("content", axis=1)
    mean_letter_counts = df.groupby(["level"]).mean().round(1)
    return mean_letter_counts

def count_sentences(df):
    # Approximate sentence count per story (split on '.'), grouped by reading level
    df = pd.DataFrame({"content": df.content, "level": df.reading_level_updated})
    df["sen_count"] = df.content.str.split(".").apply(len)
    df = df.drop("content", axis=1)
    sen_count = df.groupby(["level"]).mean().round(0)
    return sen_count

def avg_sentence_length(df):
    # Mean number of characters per sentence (split on '.'), grouped by reading level
    def letter_counts(l):
        l = [len(s) for s in l]
        return sum(l) / len(l)
    df = pd.DataFrame({"content": df.content, "level": df.reading_level_updated})
    df["avg_sentence_length"] = df.content.str.split(".").apply(letter_counts)
    df = df.drop("content", axis=1)
    avg_sentence_len = df.groupby(["level"]).mean().round(1)
    return avg_sentence_len
def count_sentences_hin(df):
    # Approximate sentence count per story for Hindi, splitting on the danda '।'
    df = pd.DataFrame({"content": df.content, "level": df.reading_level_updated})
    df["sen_count"] = df.content.str.split("।").apply(len)
    df = df.drop("content", axis=1)
    sen_count = df.groupby(["level"]).mean().round(0)
    return sen_count

def avg_sentence_length_hin(df):
    # Mean number of characters per sentence for Hindi (split on the danda '।')
    def letter_counts(l):
        l = [len(s) for s in l]
        return sum(l) / len(l)
    df = pd.DataFrame({"content": df.content, "level": df.reading_level_updated})
    df["avg_sentence_length"] = df.content.str.split("।").apply(letter_counts)
    df = df.drop("content", axis=1)
    avg_sentence_len = df.groupby(["level"]).mean().round(1)
    return avg_sentence_len
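# Quick illustrative check (toy sentence, not from the dataset) of the danda-based split above.
# Note the trailing empty segment when the text ends with '।', which inflates the count by one;
# the same caveat applies to the '.'-based split used for English and Telugu.
print("राम घर गया। वह सो गया।".split("।"))   # ['राम घर गया', ' वह सो गया', '']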
def count_paras(df):
    # Paragraph count per story, grouped by reading level
    def split_paras(text):
        # Split text into paragraphs separated by 3 or more whitespace characters
        return re.split(r'\s{3,}', text)
    df = pd.DataFrame({"content": df.content, "level": df.reading_level_updated})
    df["par_count"] = df.content.apply(split_paras).apply(len)
    df = df.drop("content", axis=1)
    par_count = df.groupby(["level"]).mean().round(0)
    return par_count

def avg_para_length(df):
    # Mean number of characters per paragraph, grouped by reading level
    def letter_counts(l):
        l = [len(s) for s in l]
        return sum(l) / len(l)
    def split_paras(text):
        # Split text into paragraphs separated by 3 or more whitespace characters
        return re.split(r'\s{3,}', text)
    df = pd.DataFrame({"content": df.content, "level": df.reading_level_updated})
    df["avg_para_length"] = df.content.apply(split_paras).apply(letter_counts)
    df = df.drop("content", axis=1)
    avg_para_len = df.groupby(["level"]).mean().round(1)
    return avg_para_len
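# Quick illustrative check (toy text, not from the dataset) of the 3-or-more-whitespace
# heuristic that split_paras uses to separate paragraphs:
print(re.split(r'\s{3,}', "First paragraph.    Second paragraph.     Third."))
# ['First paragraph.', 'Second paragraph.', 'Third.']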
# Below are function definitions for computing the following stats:
# Raw count of unique words per level
# Complexity index (CI) - the fraction of a story's words that are unique;
#   CI lies in (0, 1] and is inversely related to complexity (the higher the CI, the less complex the story)
# Count of words occurring just once per story
def count_unique_words(df):
    # Pooled count of unique words across all stories at each reading level
    df = pd.DataFrame({"content": df.content, "level": df.reading_level_updated})
    l1 = len(set(df[df.level == "L1"].content.str.cat(sep=" ").split(" ")))
    l2 = len(set(df[df.level == "L2"].content.str.cat(sep=" ").split(" ")))
    l3 = len(set(df[df.level == "L3"].content.str.cat(sep=" ").split(" ")))
    l4 = len(set(df[df.level == "L4"].content.str.cat(sep=" ").split(" ")))
    unique_word_count = pd.DataFrame({"unique_word_count": [l1, l2, l3, l4]})
    unique_word_count.index.names = ["level"]
    unique_word_count = unique_word_count.rename({0: "L1", 1: "L2", 2: "L3", 3: "L4"})
    return unique_word_count
# Complexity index (CI): the fraction of a story's words that are unique (unique words / total words).
# CI lies in (0, 1] and is inversely related to complexity: the higher the CI, the less complex the story.
def complexity_index(df):
    # Per-story type-token ratio (unique words / total words), averaged by reading level
    df = pd.DataFrame({"content": df.content, "level": df.reading_level_updated})
    l1 = df[df.level == "L1"].content.str.split(" ").apply(set).apply(len) / df[df.level == "L1"].content.str.split(" ").apply(len)
    l2 = df[df.level == "L2"].content.str.split(" ").apply(set).apply(len) / df[df.level == "L2"].content.str.split(" ").apply(len)
    l3 = df[df.level == "L3"].content.str.split(" ").apply(set).apply(len) / df[df.level == "L3"].content.str.split(" ").apply(len)
    l4 = df[df.level == "L4"].content.str.split(" ").apply(set).apply(len) / df[df.level == "L4"].content.str.split(" ").apply(len)
    ci = pd.DataFrame({"CI": [l1.mean(), l2.mean(), l3.mean(), l4.mean()]})
    ci.index.names = ["level"]
    ci = ci.rename({0: "L1", 1: "L2", 2: "L3", 3: "L4"})
    return ci
# Count the number of words occurring just once in each story
def count_single_occurance_words(df):
    df = pd.DataFrame({"content": df.content, "level": df.reading_level_updated})
    def word_count(text):
        # Build a word-frequency dict and count the words that appear exactly once
        counts = dict()
        words = text.split(" ")
        for word in words:
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
        return sum(value == 1 for value in counts.values())
    l1 = df[df.level == "L1"].content.apply(word_count).mean()
    l2 = df[df.level == "L2"].content.apply(word_count).mean()
    l3 = df[df.level == "L3"].content.apply(word_count).mean()
    l4 = df[df.level == "L4"].content.apply(word_count).mean()
    cso = pd.DataFrame({"CSO": [l1, l2, l3, l4]})
    cso.index.names = ["level"]
    cso = cso.rename({0: "L1", 1: "L2", 2: "L3", 3: "L4"})
    return cso
# Print word counts
eng_count_of_words = count_of_words(eng_df)
hin_count_of_words = count_of_words(hin_df)
tel_count_of_words = count_of_words(tel_df)
tel_count_of_words.columns = ["Tel_word_counts"]
hin_count_of_words.columns = ["Hin_word_counts"]
eng_count_of_words.columns = ["Eng_word_counts"]
word_counts = pd.concat([eng_count_of_words, hin_count_of_words, tel_count_of_words], axis=1)
print(word_counts)
Eng_word_counts Hin_word_counts Tel_word_counts
level
L1 119.0 142.0 123.0
L2 300.0 370.0 281.0
L3 590.0 918.0 726.0
L4 1310.0 1822.0 1349.0
Conclusion: Story length (total word count) increases sharply with reading level in all three languages, so level is strongly tied to how long the story is
# print avg word length
eng_avg_word_length = avg_word_length(eng_df)
hin_avg_word_length = avg_word_length(hin_df)
tel_avg_word_length = avg_word_length(tel_df)
tel_avg_word_length.columns = ["Tel_avg_word_length"]
hin_avg_word_length.columns = ["Hin_avg_word_length"]
eng_avg_word_length.columns = ["Eng_avg_word_length"]
avg_word_len = pd.concat([eng_avg_word_length, hin_avg_word_length, tel_avg_word_length], axis=1)
print(avg_word_len)
Eng_avg_word_length Hin_avg_word_length Tel_avg_word_length
level
L1 4.7 3.7 6.7
L2 4.8 4.3 6.8
L3 4.8 4.2 6.6
L4 4.7 4.3 6.5
Conclusion: Average word length stays roughly constant across levels within each language, so it does not appear to distinguish reading levels
# print sentence counts
eng_sen_count = count_sentences(eng_df)
hin_sen_count = count_sentences_hin(hin_df)
tel_sen_count = count_sentences(tel_df)
tel_sen_count.columns = ["Tel_sen_count"]
hin_sen_count.columns = ["Hin_sen_count"]
eng_sen_count.columns = ["Eng_sen_count"]
sen_count = pd.concat([eng_sen_count, hin_sen_count, tel_sen_count], axis=1)
print(sen_count)
Eng_sen_count Hin_sen_count Tel_sen_count
level
L1 13.0 8.0 19.0
L2 31.0 26.0 43.0
L3 51.0 59.0 95.0
L4 110.0 114.0 179.0
Conclusion: The number of sentences per story grows steadily with reading level in every language
# print sentence lengths
eng_sen_len = avg_sentence_length(eng_df)
hin_sen_len = avg_sentence_length_hin(hin_df)
tel_sen_len = avg_sentence_length(tel_df)
tel_sen_len.columns = ["Tel_sen_len"]
hin_sen_len.columns = ["Hin_sen_len"]
eng_sen_len.columns = ["Eng_sen_len"]
sen_len = pd.concat([eng_sen_len, hin_sen_len, tel_sen_len], axis=1)
print(sen_len)
Eng_sen_len Hin_sen_len Tel_sen_len
level
L1 93.4 287.5 53.8
L2 84.6 274.3 70.9
L3 89.7 242.3 63.0
L4 72.6 391.4 59.8
Conclusion: Average sentence length shows no consistent trend across levels, so it does not appear to drive the level assignment
# print paragraph counts
eng_count_paras = count_paras(eng_df)
hin_count_paras = count_paras(hin_df)
tel_count_paras = count_paras(tel_df)
tel_count_paras.columns = ["Tel_count_paras"]
hin_count_paras.columns = ["Hin_count_paras"]
eng_count_paras.columns = ["Eng_count_paras"]
para_count = pd.concat([eng_count_paras, hin_count_paras, tel_count_paras], axis=1)
print(para_count)
Eng_count_paras Hin_count_paras Tel_count_paras
level
L1 1.0 2.0 2.0
L2 1.0 4.0 4.0
L3 2.0 8.0 5.0
L4 5.0 15.0 6.0
Conclusion: Paragraph count also grows with reading level, again reflecting that higher-level stories are longer
# print paragraph lengths
eng_len_paras = avg_para_length(eng_df)
hin_len_paras = avg_para_length(hin_df)
tel_len_paras = avg_para_length(tel_df)
tel_len_paras.columns = ["Tel_len_paras"]
hin_len_paras.columns = ["Hin_len_paras"]
eng_len_paras.columns = ["Eng_len_paras"]
para_len = pd.concat([eng_len_paras, hin_len_paras, tel_len_paras], axis=1)
print(para_len)
Eng_len_paras Hin_len_paras Tel_len_paras
level
L1 639.5 489.1 693.0
L2 1557.4 1278.4 1462.7
L3 2841.4 2818.0 3202.5
L4 4916.3 4806.7 5004.9
Conclusion: Average paragraph length also increases with level, consistent with the overall story-length trend
# print counts of unique words
eng_uw = count_unique_words(eng_df)
hin_uw = count_unique_words(hin_df)
tel_uw = count_unique_words(tel_df)
tel_uw.columns = ["Tel_unique_words"]
hin_uw.columns = ["Hin_unique_words"]
eng_uw.columns = ["Eng_unique_words"]
uw = pd.concat([eng_uw, hin_uw, tel_uw], axis=1)
print(uw)
Eng_unique_words Hin_unique_words Tel_unique_words
level
L1 20767 9170 5590
L2 36742 17302 10720
L3 33559 18562 19650
L4 29391 22535 19461
Conclusion: Higher levels generally draw on a larger pooled vocabulary (unique words), though the trend is not monotonic for English, which peaks at L2
# Print Complexity index
eng_uw = complexity_index(eng_df)
hin_uw = complexity_index(hin_df)
tel_uw = complexity_index(tel_df)
tel_uw.columns = ["Tel_ci"]
hin_uw.columns = ["Hin_ci"]
eng_uw.columns = ["Eng_ci"]
uw = pd.concat([eng_uw, hin_uw, tel_uw], axis=1)
print(uw)
Eng_ci Hin_ci Tel_ci
level
L1 0.661238 0.612277 0.704127
L2 0.589510 0.557044 0.682861
L3 0.537485 0.473490 0.660715
L4 0.492188 0.446749 0.649041
Complexity Index (CI): a metric that quantifies reading complexity. CI = (count of unique words in a story) / (total words in the story); the higher the CI, the less complex the story. Conclusion: CI decreases steadily as the level increases in all three languages, i.e., higher-level stories reuse a larger share of their vocabulary
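# Illustrative worked example of the CI formula on a toy sentence (not from the dataset):
toy_story = "the cat sat on the mat and the dog sat too"
toy_words = toy_story.split(" ")
toy_ci = len(set(toy_words)) / len(toy_words)
print(round(toy_ci, 2))   # 8 unique words / 11 total -> 0.73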
# Find the number of single-occurrence words for each level of story
eng_uw = round(count_single_occurance_words(eng_df), 0)
hin_uw = round(count_single_occurance_words(hin_df), 0)
tel_uw = round(count_single_occurance_words(tel_df), 0)
tel_uw.columns = ["Tel_count_single_occ_words"]
hin_uw.columns = ["Hin_count_single_occ_words"]
eng_uw.columns = ["Eng_count_single_occ_words"]
uw = pd.concat([eng_uw, hin_uw, tel_uw], axis=1)
print(uw)
       Eng_count_single_occ_words  Hin_count_single_occ_words  Tel_count_single_occ_words
level
L1                           53.0                         56.0                         61.0
L2                          120.0                        135.0                        141.0
L3                          209.0                        277.0                        369.0
L4                          386.0                        477.0                        675.0
Conclusion: The number of words that occur only once in a story is a good indicator of its complexity; higher-level stories contain markedly more single-occurrence words
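# Illustrative aside (a sketch, not part of the original analysis): collections.Counter
# gives the same single-occurrence count as the word_count helper above, shown on a toy sentence.
from collections import Counter
toy_story = "the cat sat on the mat and the dog sat too"
toy_counts = Counter(toy_story.split(" "))
print(sum(1 for c in toy_counts.values() if c == 1))   # 6 words occur exactly once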