Most Common Words - Notebook by abode118 (abode118)

Created 3 years ago

import json
import pandas as pd

# Load the per-president data from disk into the `presidents` dict.
# The encoding is given explicitly so the read does not depend on the
# platform's locale-dependent default text encoding.
with open(r'C:\Users\abode\Documents\Data Science\NYT\president_df_final.json',
          encoding='utf-8') as json_file:
    presidents = json.load(json_file)

Use NLTK to find the most common words, excluding stopwords

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abode\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

True

# Extra words to ignore on top of NLTK's English stopword list.
additional_words = [
    'photo', 'new', 'jan', 'pres', 'government', 'two', 'president',
    'american', 'people', 'united', 'feb', 'says', 'march', 'us', 'say',
    'may', 'said', 'u', 'country', 'would', 'one', 'state', 'mr', 'year',
    'states', 'years', 'world',
]

# Full exclusion set: NLTK's English stopwords plus the extra words above.
all_stopwords = set(stopwords.words('english')).union(additional_words)

# For every president and every month 1-12, store the ten most frequent
# non-stopword tokens and their counts under
# presidents[<president>]['MCW'][<month as str>] as {'words': [...], 'freq': [...]}.

# Built once: the token pattern is the same for every president and month.
tokenizer = nltk.RegexpTokenizer(r"\w+")

for president, record in presidents.items():
    record['MCW'] = {}
    for month in range(1, 13):
        month_key = str(month)  # per-month data is keyed by the month as a string

        # Merge the month's abstracts into one text so they are counted together.
        all_abstracts = ' '.join(record['abstracts'][month_key])

        # Lower-case each token once, then drop the stopwords before counting.
        lowered = (w.lower() for w in tokenizer.tokenize(all_abstracts))
        all_words_freq = nltk.FreqDist(w for w in lowered if w not in all_stopwords)

        # most_common(10) returns (word, count) pairs, most frequent first.
        most_common = all_words_freq.most_common(10)
        record['MCW'][month_key] = {
            'words': [word for word, _ in most_common],
            'freq': [count for _, count in most_common],
        }

# Rearrange the results into a pandas DataFrame with one row per president, month and common word:

# President | Month | Positive (All) | Negative (All) | Positive (DIRECT) | Negative (DIRECT) | Most Common Word | Frequency
# Obama | January | 58% | 42% | 55% | 45% | Crisis | 100

# Build the summary table: one record per (president, month) holding the four
# sentiment percentages plus the lists of most common words and their counts.
final_columns = ['president', 'month', 'positive (all)', 'negative (all)', 'positive (direct)',
                 'negative (direct)', 'most common words', 'most common freq']

# Collect plain dicts and create the DataFrame once. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and calling it per row copies
# the whole frame every time.
rows = []
for president in presidents:
    for month in range(1, 13):
        month_key = str(month)  # per-month data is keyed by the month as a string
        sentiment = presidents[president]['sentiment (%)'][month_key]
        common = presidents[president]['MCW'][month_key]
        rows.append({
            'president': president,
            'month': month,
            'positive (all)': sentiment['positive (all)'],
            'negative (all)': sentiment['negative (all)'],
            'positive (direct)': sentiment['positive (direct)'],
            'negative (direct)': sentiment['negative (direct)'],
            'most common words': common['words'],
            'most common freq': common['freq'],
        })

# Passing `columns` fixes the column order and keeps the headers even when
# there are no rows.
final_df = pd.DataFrame(rows, columns=final_columns)

# Expand the two parallel list columns so each word/count pair gets its own row.
final_df = final_df.explode(['most common words', 'most common freq'])
final_df