Created 3 years ago
import json
import pandas as pd
# Load the per-president data set. The rest of the script indexes it as
# presidents[<president>]['abstracts'][<month>] and
# presidents[<president>]['sentiment (%)'][<month>], with months keyed '1'..'12'.
# The encoding is given explicitly so the read does not depend on the Windows
# locale code page (JSON text is UTF-8 by specification).
with open(r'C:\Users\abode\Documents\Data Science\NYT\president_df_final.json',
          encoding='utf-8') as json_file:
    presidents = json.load(json_file)
Use NLTK to find the most common words, excluding stopwords
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stopword corpus is available locally before it is read.
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data] C:\Users\abode\AppData\Roaming\nltk_data...
[nltk_data] Package stopwords is already up-to-date!
True
# Extra terms to ignore on top of NLTK's English stopword list.
additional_words = [
    'photo', 'new', 'jan', 'pres', 'government', 'two', 'president',
    'american', 'people', 'united', 'feb', 'says', 'march', 'us', 'say',
    'may', 'said', 'u', 'country', 'would', 'one', 'state', 'mr', 'year',
    'states', 'years', 'world',
]

# Single set holding both the stock English stopwords and the extra terms.
all_stopwords = set(stopwords.words('english')).union(additional_words)
# For every president and every month ('1'..'12'), store the ten most frequent
# non-stopword tokens of that month's abstracts under
# presidents[president]['MCW'][month] as two parallel lists:
#   'words' - the tokens, most frequent first
#   'freq'  - the matching occurrence counts

# The tokenizer does not depend on the president or the month, so build it once
# instead of once per iteration.
tokenizer = nltk.RegexpTokenizer(r"\w+")

for president in presidents:
    monthly_top_words = presidents[president]['MCW'] = {}
    for month in range(1, 13):
        month_key = str(month)

        # Treat all abstracts of the month as one text.
        all_abstracts = ' '.join(presidents[president]['abstracts'][month_key])

        # Lower-case each token exactly once, then drop the stopwords.
        lowered_words = (w.lower() for w in tokenizer.tokenize(all_abstracts))
        all_words_freq = nltk.FreqDist(
            w for w in lowered_words if w not in all_stopwords
        )

        # most_common() yields (word, count) pairs; split them into two lists.
        most_common = all_words_freq.most_common(10)
        monthly_top_words[month_key] = {
            'words': [word for word, _ in most_common],
            'freq': [count for _, count in most_common],
        }
# Rearrange the results into a pandas DataFrame with one row per
# (president, month, common word). Intended layout (illustrative values):
# President | Month | Positive (All) | Negative (All) | Positive (DIRECT) | Negative (DIRECT) | Most Common Word | Frequency
# Obama | January | 58% | 42% | 55% | 45% | Crisis | 100
final_columns = ['president', 'month', 'positive (all)', 'negative (all)', 'positive (direct)',
                 'negative (direct)', 'most common words', 'most common freq']

# Collect plain dict rows first and build the frame once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copied the whole frame on every call.
rows = []
for president in presidents:
    for month in range(1, 13):
        month_key = str(month)
        sentiment = presidents[president]['sentiment (%)'][month_key]
        top_words = presidents[president]['MCW'][month_key]
        rows.append({
            'president': president,
            'month': month,
            'positive (all)': sentiment['positive (all)'],
            'negative (all)': sentiment['negative (all)'],
            'positive (direct)': sentiment['positive (direct)'],
            'negative (direct)': sentiment['negative (direct)'],
            'most common words': top_words['words'],
            'most common freq': top_words['freq'],
        })

# Passing columns= keeps the column order (and the columns themselves when
# there are no rows at all).
# NOTE(review): column dtypes are now inferred from the values rather than
# all being object - confirm nothing downstream relied on object dtype.
final_df = pd.DataFrame(rows, columns=final_columns)

# 'most common words' and 'most common freq' hold equal-length parallel lists;
# exploding both together yields one row per (word, frequency) pair.
final_df = final_df.explode(['most common words', 'most common freq'])
final_df