Jovian
⭐️
Sign In

Scrapping about # Greta Thunberg - Swedish environmental activist

Greta is a 16 year word girl who started a campaigne on climate change and has gained international recognition since 2018.

I have seen her videos like "How dare you"- https://www.youtube.com/watch?v=u9KxE4Kv9A8 which tells me that she is a straight forward and blunt girl.

This gained her # world's recognistion and also # world's Target

In [1]:
import os
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
greta= pd.read_csv("TweetScrapersTweets.csv", encoding='latin-1')

In [3]:
greta
Out[3]:
In [4]:
greta.shape
Out[4]:
(19816, 13)
In [5]:
greta.columns
Out[5]:
Index(['ID', 'datetime', 'has_media', 'is_reply', 'is_retweet', 'medias',
       'nbr_favorite', 'nbr_reply', 'nbr_retweet', 'text', 'url', 'user_id',
       'usernameTweet'],
      dtype='object')

Cleaning the data

In [6]:
greta_clean= greta.drop(columns=["ID","user_id", "medias","has_media", "usernameTweet", "url"]) 
In [7]:
greta_clean
pd.options.display.max_colwidth=-1
In [8]:
greta_clean
Out[8]:
In [9]:
greta_clean.describe()
Out[9]:
In [10]:
greta_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 19816 entries, 0 to 19815 Data columns (total 7 columns): datetime 19816 non-null object is_reply 19816 non-null bool is_retweet 19816 non-null bool nbr_favorite 19816 non-null int64 nbr_reply 19816 non-null int64 nbr_retweet 19816 non-null int64 text 19816 non-null object dtypes: bool(2), int64(3), object(2) memory usage: 812.8+ KB

So there is no missing values

In [11]:
greta_clean['text']=greta_clean['text'].str.strip().str.lower().str.replace('bindreviewcontent',"").str.replace(";","").str.replace('\n',"")
In [12]:
def getLength(message):
    message_token=message.split(" ") #taking the length of the words 
    return len(message_token)
In [13]:
greta_clean['length_of_message']=greta_clean['text'].apply(lambda message: getLength(message))
In [14]:
greta_clean.head()
Out[14]:
Trying to see the "Distribution of length of message"
In [15]:
sns.kdeplot(greta_clean['length_of_message']).set_title(" Distribution of Length of Message")

Out[15]:
Text(0.5, 1.0, ' Distribution of Length of Message')
Notebook Image
In [16]:
sns.distplot(greta_clean['length_of_message']).set_title(" Distribution of Length of Message")      ### univariate plot and numerical values plot
plt.show()
Notebook Image
In [17]:
sns.kdeplot(greta_clean['nbr_retweet']).set_title(" Distribution of Retweet")
Out[17]:
Text(0.5, 1.0, ' Distribution of Retweet')
Notebook Image
In [18]:
sns.kdeplot(greta_clean.loc[greta_clean['nbr_reply'],"length_of_message"],label='reply');

# beautifying the labels
plt.xlabel('Length of Message')
plt.ylabel('density')
plt.show()
Notebook Image
In [19]:
sns.kdeplot(greta_clean.loc[greta_clean['nbr_retweet'],"length_of_message"],label='retweet');
sns.kdeplot(greta_clean.loc[greta_clean['nbr_reply'],"length_of_message"],label='reply');

# beautifying the labels
plt.xlabel('Length of Message')
plt.ylabel('density')
plt.show()
Notebook Image
we can see the distribution of reply of the message and retweets has nearly in between 40-50

Extracting Hashtags

In [20]:
import re

def has(rate):
    text=rate
    return re.findall("(#\w+)",text)
In [21]:
greta_clean['hashtag']=greta_clean['text'].apply(lambda x: has(x))
In [24]:
greta_clean.head()
Out[24]:
In [25]:
def en(rate):
    text=rate
    return len(text)
In [26]:
greta_clean['length_of_hashtag']=greta_clean['hashtag'].apply(lambda x: en(x)) # trying to see how many hashtags are there in a  tweet 
In [27]:
greta_clean.columns
Out[27]:
Index(['datetime', 'is_reply', 'is_retweet', 'nbr_favorite', 'nbr_reply',
       'nbr_retweet', 'text', 'length_of_message', 'hashtag',
       'length_of_hashtag'],
      dtype='object')
In [28]:
# take the rows from the hashtag columns where there are actually hashtags
hashtags_list_df = greta_clean.loc[greta_clean.hashtag.apply(lambda hashtags_list: hashtags_list !=[]),['hashtag']]
In [29]:

# create dataframe where each use of hashtag gets its own row
flattened_hashtags_df = pd.DataFrame(
    [hashtag for hashtags_list in hashtags_list_df.hashtag
    for hashtag in hashtags_list],
    columns=['hashtag'])
In [30]:
flattened_hashtags_df['hashtag'].unique().size #seeing how many unique hashtags are present in the whole dataframe
Out[30]:
8925
In [31]:
# count of appearances of each hashtag
popular_hashtags = flattened_hashtags_df.groupby('hashtag').size()\
                                        .reset_index(name='counts')\
                                        .sort_values('counts', ascending=False)\
                                        .reset_index(drop=True)
In [32]:
# number of times each hashtag appears
counts = flattened_hashtags_df.groupby(['hashtag']).size()\
                              .reset_index(name='counts')\
                              .counts

# defining bins for histogram                              
my_bins = np.arange(0,counts.max()+400, 400)-0.5

# ploting histogram of tweet counts
plt.figure()
plt.hist(counts, bins = my_bins)
plt.xlabels = np.arange(1,counts.max()+400, 400)
plt.xlabel('hashtag number of appearances')
plt.ylabel('frequency')
plt.yscale('log', nonposy='clip')
plt.show()
Notebook Image
In [33]:
greta_clean['hashtag']=greta_clean['hashtag'].astype(str)
In [34]:
import os
from wordcloud import WordCloud
import string
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

pd.set_option('max_colwidth', 100)
import numpy as np
from PIL import Image
%matplotlib inline

maskArray = np.array(Image.open("Bird.png"))
wordcloud = WordCloud(height=2000, width=2000, stopwords=set(stopwords.words('english')),mask = maskArray, background_color='white')
wordcloud = wordcloud.generate(' '.join(greta_clean['hashtag'].tolist()))
plt.imshow(wordcloud)
wordcloud.to_file("wordhash.png")
plt.title("Most common words in the hashtags")    ### for most frequent words in hashtags
plt.axis('off')
plt.show()
Notebook Image
So the popular hashtags are ofcourse "#gretathurnberg", "climatechange", "#climateactionsummit", "#climatechangehoax", "climatechangeisreal", "climateaction" and more popular hashtag which is uncommon of all is "#howdareyou", "#greta4nobelprize"

Trying to see for whole dataframe which has high frequency

In [35]:
wordcloud = WordCloud(max_font_size=200, max_words=1000, background_color="white",collocations=False).generate(str(greta_clean))
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")                                #### most frequent words in big size font
plt.show()
wordcloud.to_file("WordCloud_Unigrams_Tweets.png")
Notebook Image
Out[35]:
<wordcloud.wordcloud.WordCloud at 0x29f35ee5a90>
I couldn't understand but when i performed a wordcloud for whole dataframe the statement of "false" after "gretathurnberg" is high

Trying to compare the length of the message and length of the hastag

by using this i can see that is length of the message and length of hastag has relation or not

In [36]:
sns.kdeplot(greta_clean.loc[greta_clean['length_of_hashtag'],"length_of_message"],label='nbr of hashtag');


# beautifying the labels
plt.xlabel('Length of Hashtag')
plt.ylabel('density')
plt.show()
Notebook Image

So I thing a medium level of messages contians more has tags

Cleaning the Text

In [37]:
import string


from nltk import PorterStemmer

clean=""
def clean_text(clean):
    ps=PorterStemmer()
    clean = clean.translate(str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
    #remove extra white space
    
    text_cleaned=re.sub(r'^https?:\/\/.*[\r\n]*', '',clean,flags=re.MULTILINE)
    text_cleaned="".join([x for x in clean if x not in string.punctuation])
    text_cleaned=re.sub('\n','',text_cleaned)
    text_cleaned=re.sub('\d+','',text_cleaned)
    text_cleaned=re.sub('twitter .com','',text_cleaned)
    Stopwords=stopwords.words('english')

    text_cleaned=re.sub(' +', ' ', text_cleaned)
    text_cleaned=text_cleaned.lower()
    tokens=text_cleaned.split(" ")
    tokens=[token for token in tokens if token not in Stopwords]
    text_cleaned=" ".join([ps.stem(token) for token in tokens])    
    
    return text_cleaned


print(clean_text(clean))
In [38]:
greta_clean['cleaned_tweet']=greta_clean['text'].apply(lambda x:clean_text(x))
greta_clean.head()
Out[38]:
In [39]:
greta_clean['datetime']= pd.to_datetime(greta_clean['datetime'], errors='coerce')
In [40]:
greta_clean['date']=greta_clean['datetime'].dt.date
In [41]:
greta_clean['date']
Out[41]:
0        2019-09-23
1        2019-09-23
2        2019-09-23
3        2019-09-23
4        2019-09-23
5        2019-09-23
6        2019-09-23
7        2019-09-23
8        2019-09-23
9        2019-09-23
10       2019-09-23
11       2019-09-23
12       2019-09-23
13       2019-09-23
14       2019-09-23
15       2019-09-23
16       2019-09-23
17       2019-09-23
18       2019-09-23
19       2019-09-23
20       2019-09-23
21       2019-09-23
22       2019-09-23
23       2019-09-23
24       2019-09-23
25       2019-09-23
26       2019-09-23
27       2019-09-23
28       2019-09-23
29       2019-09-23
            ...    
19786    2019-05-12
19787    2019-05-12
19788    2019-05-12
19789    2019-05-12
19790    2019-05-12
19791    2019-05-12
19792    2019-05-12
19793    2019-05-12
19794    2019-05-12
19795    2019-05-12
19796    2019-05-12
19797    2019-05-12
19798    2019-05-12
19799    2019-05-12
19800    2019-05-12
19801    2019-05-12
19802    2019-05-12
19803    2019-05-12
19804    2019-05-12
19805    2019-05-12
19806    2019-05-12
19807    2019-05-12
19808    2019-05-12
19809    2019-05-12
19810    2019-05-12
19811    2019-05-12
19812    2019-05-12
19813    2019-05-12
19814    2019-05-12
19815    2019-05-12
Name: date, Length: 19816, dtype: object

Now i am ploting no. of tweets for that particular date

In [42]:
plt.figure(figsize=(16,8))
sns.lineplot(x='date',y='datetime', data=greta_clean,estimator=None)
C:\Users\Nivya Sree\Anaconda3\anaconda3\lib\site-packages\pandas\plotting\_converter.py:129: FutureWarning: Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters. To register the converters: >>> from pandas.plotting import register_matplotlib_converters >>> register_matplotlib_converters() warnings.warn(msg, FutureWarning)
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x29f34f61da0>
Notebook Image
In [43]:
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from matplotlib import pyplot

Converting Value_counts to dataframe

This helps me to plot the graph in a beautiful way

And to find no. of tweets based on date
In [44]:
value_counts=greta_clean['date'].value_counts()
In [45]:
df = value_counts.rename_axis('unique_dates').reset_index(name='counts')
print (df)
   
unique_dates counts 0 2019-09-24 8455 1 2019-09-23 1264 2 2019-09-27 1151 3 2019-09-28 1143 4 2019-09-25 890 5 2019-09-26 609 6 2019-09-29 445 7 2019-10-19 378 8 2019-03-10 297 9 2019-10-18 242 10 2019-05-12 240 11 2019-04-10 178 12 2019-09-30 170 13 2019-01-10 154 14 2019-10-14 147 15 2019-05-10 145 16 2019-04-12 142 17 2019-06-10 134 18 2019-10-17 134 19 2019-04-11 133 20 2019-10-10 131 21 2019-10-16 129 22 2019-03-11 126 23 2019-11-10 124 24 2019-09-10 122 25 2019-10-26 119 26 2019-02-11 117 27 2019-01-11 112 28 2019-10-23 111 29 2019-10-25 111 .. ... ... 44 2019-07-11 56 45 2019-11-18 55 46 2019-11-13 55 47 2019-10-27 54 48 2019-10-24 52 49 2019-08-11 49 50 2019-10-28 48 51 2019-11-22 45 52 2019-10-29 44 53 2019-08-10 43 54 2019-11-19 42 55 2019-11-21 42 56 2019-12-10 39 57 2019-11-11 37 58 2019-11-14 31 59 2019-03-12 31 60 2019-11-29 30 61 2019-02-12 29 62 2019-11-16 27 63 2019-11-17 25 64 2019-11-24 23 65 2019-01-12 23 66 2019-11-15 22 67 2019-11-20 20 68 2019-11-30 17 69 2019-11-28 17 70 2019-11-27 16 71 2019-11-23 15 72 2019-11-25 13 73 2019-11-26 8 [74 rows x 2 columns]
In [45]:
plt.figure(figsize=(16,6))
ax=sns.lineplot(x='unique_dates',y='counts', color="coral", data=df,estimator=None)
Notebook Image
In [46]:
plt.figure(figsize=(16,6))
sns.lineplot(x='counts',y='unique_dates', data=df,estimator=None)
Out[46]:
<matplotlib.axes._subplots.AxesSubplot at 0x29f31212ef0>
Notebook Image

let me see for a particular hour

In [47]:
greta_clean['hour']=greta_clean['datetime'].dt.hour
In [48]:
value_hours=greta_clean['hour'].value_counts()
In [49]:
df1 = value_hours.rename_axis('unique_hours').reset_index(name='counts')
print (df1)
   
unique_hours counts 0 20 1159 1 22 1158 2 23 1149 3 2 1122 4 0 1103 5 4 1032 6 3 1028 7 19 970 8 17 913 9 21 905 10 5 862 11 1 843 12 6 768 13 15 735 14 8 719 15 14 711 16 11 681 17 13 661 18 7 626 19 18 622 20 10 569 21 16 557 22 9 476 23 12 447
In [50]:
plt.figure(figsize=(16,6))
ax=sns.lineplot(x='unique_hours',y='counts', color="coral",data=df1,estimator=None)
Notebook Image

If you see this you can understand night after 8 or from 8 o'clock, 10 o'clock, 11 o'clockthe tweets are high and from Midnight onwards it is decreasing

In [51]:
plt.figure(figsize=(16,6))
ax=sns.lineplot(x='date',y='hour',data=greta_clean,estimator=None)
Notebook Image
I just tried to see at which hour and which date the posts is too high

I think the date between 2019-09 to 2019-11 has processed more tweets during that period because in September 2019, she addressed the UN Climate Action Summit in New York. So, this may be reason to write more on her

Let me understand the sentiment of my text

In [52]:
!pip install vaderSentiment 
Requirement already satisfied: vaderSentiment in c:\users\nivya sree\anaconda3\anaconda3\lib\site-packages (3.2.1)
In [53]:
#using vader sentiment to get positives and negative of the statement

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
  
# function to print sentiments 
# of the sentence. 

Function to find the sentiment by using vader

In [54]:
def sentiment_score(sentence):
    sid_obj = SentimentIntensityAnalyzer() # polarity_scores method of SentimentIntensityAnalyzer 
    # oject gives a sentiment dictionary.  
    sentiment_dict = sid_obj.polarity_scores(sentence)  # which contains pos, neg, neu, and compound scores.
    # decide sentiment as positive, negative and neutral 
    if sentiment_dict['compound'] >= 0.05 : 
        return("Positive") 
  
    elif sentiment_dict['compound'] <= - 0.05 : 
        return("Negative") 
  
    else : 
        return("Neutral") 
        
In [55]:
greta_clean['sentiment']= greta_clean['text'].apply(lambda x:sentiment_score(x))
In [56]:
greta_clean
Out[56]:
In [57]:
sns.kdeplot(greta_clean.loc[greta_clean['sentiment']== 'Positive',"hour"],label='positive');
sns.kdeplot(greta_clean.loc[greta_clean['sentiment']== 'Negative' ,"hour"],label='negative');
sns.kdeplot(greta_clean.loc[greta_clean['sentiment']== 'Neutral',"hour"],label='neutral');

# beautifying the labels
plt.xlabel('sentiment over period')
plt.ylabel('score')
plt.show()
Notebook Image
there is no difference of distribution of hourly data distribution and length of the message it is like quite similar in distribution
In [58]:
plt.figure(figsize=(10,6))
sns.countplot(greta_clean['sentiment'])     
Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x29f34f70860>
Notebook Image
Positivity is high in the tweets done for Miss.Greta but negativity is also is in like a reach to the positivity

which tells that she has same attraction for the followers and also protesters.

She is a leader and a target

Trying to make a WordCloud for Hashtags

In [57]:
greta_clean.columns
Out[57]:
Index(['datetime', 'is_reply', 'is_retweet', 'nbr_favorite', 'nbr_reply',
       'nbr_retweet', 'text', 'length_of_message', 'hashtag',
       'length_of_hashtag', 'cleaned_tweet', 'date', 'hour', 'sentiment'],
      dtype='object')
In [59]:
import os
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer

pd.set_option('max_colwidth', 100)
import numpy as np
from PIL import Image
%matplotlib inline



maskArray = np.array(Image.open("Bird.png"))
wordcloud = WordCloud(height=2000, width=2000, stopwords=set(stopwords.words('english')),mask = maskArray, background_color='white')
wordcloud = wordcloud.generate(' '.join(greta_clean.loc[greta_clean['sentiment']== 'Positive','cleaned_tweet'].tolist()))

plt.title("Most common positive tweet")
plt.imshow(wordcloud)
wordcloud.to_file("wordbird.png")
plt.axis('off')
plt.show()
Notebook Image
As i told before the positive hashtags or vibrartions revolve around ofcourse "#gretathurnberg", "climatechange", "#climateactionsummit", "#climatechangehoax", "climatechangeisreal", "climateaction" , "inspir" ,"go", "power", and so many
In [60]:
maskArray = np.array(Image.open("Bird.png"))
wordcloud = WordCloud(height=2000, width=2000, stopwords=set(stopwords.words('english')),mask = maskArray, background_color='white')
wordcloud = wordcloud.generate(' '.join(greta_clean.loc[greta_clean['sentiment']== 'Negative','cleaned_tweet'].tolist()))

plt.title("Most common negative tweet")
plt.imshow(wordcloud)
wordcloud.to_file("wordbird1.png")
plt.axis('off')
plt.show()
Notebook Image
Well if there is fame then the person comes in to the eyes of people can be as a Icon or a Target

A the negative hashtags or vibrartions revolve around ofcourse "#gretathurnberg", "climatechange", "#climateactionsummit", "#climatechangehoax", "climatechangeisreal", "climateaction" , "inspir" ,"go", "power", and so many

In [61]:
maskArray = np.array(Image.open("Bird.png"))
wordcloud = WordCloud(height=2000, width=2000, stopwords=set(stopwords.words('english')),mask = maskArray, background_color='white')
wordcloud = wordcloud.generate(' '.join(greta_clean.loc[greta_clean['sentiment']== 'Neutral','cleaned_tweet'].tolist()))

plt.title("Most common neutral tweet")
plt.imshow(wordcloud)
wordcloud.to_file("wordbird1.png")
plt.axis('off')
plt.show()
Notebook Image

Unigrams

Distribution of postive and negative tweets

In [63]:
def getMostCommon(tweets_list,topn=20):
    tweets=" ".join(tweets_list)
    tokenised_tweets=tweets.split(" ")
    
    
    freq_counter=Counter(tokenised_tweets)
    return freq_counter.most_common(topn) # return words with the highest frequencies
In [64]:
from collections import Counter
In [65]:
positive_tweets=greta_clean.loc[greta_clean['sentiment']== 'Positive','cleaned_tweet']
In [66]:
top_20_positive_tweet_words=getMostCommon(positive_tweets,20)
In [68]:
negative_tweets=greta_clean.loc[greta_clean['sentiment']== 'Negative','cleaned_tweet']
In [69]:
top_20_negative_tweet_words=getMostCommon(negative_tweets,20)
In [70]:
top_20_positive_tweet_words


Out[70]:
[('gretathurnberg', 8000),
 ('http', 3179),
 ('â\xa0', 2999),
 ('', 2725),
 ('â\x80¦', 2722),
 ('pic', 1684),
 ('greta', 1398),
 ('statu', 1293),
 ('like', 1182),
 ('climat', 1032),
 ('gretathunberg', 992),
 ('climatechang', 963),
 ('world', 887),
 ('chang', 845),
 ('peopl', 781),
 ('com', 582),
 ('us', 568),
 ('www', 553),
 ('thunberg', 551),
 ('climatestrik', 531)]
In [71]:
top_20_negative_tweet_words
Out[71]:
[('gretathurnberg', 6949),
 ('', 2730),
 ('http', 2463),
 ('â\xa0', 2305),
 ('â\x80¦', 2139),
 ('pic', 1281),
 ('climat', 1278),
 ('greta', 1184),
 ('statu', 926),
 ('climatechang', 894),
 ('peopl', 831),
 ('chang', 755),
 ('world', 740),
 ('child', 730),
 ('gretathunberg', 570),
 ('girl', 567),
 ('year', 546),
 ('thunberg', 538),
 ('com', 494),
 ('go', 492)]
In [72]:
def plotMostCommonWords(tweets_list,topn=20,title="Common tweet Words",color="blue",axis=None): #default number of words is given as 20
    top_words=getMostCommon(tweets_list,topn=topn)
    data=pd.DataFrame()
    data['words']=[val[0] for val in top_words]
    data['freq']=[val[1] for val in top_words]
    if axis!=None:
        sns.barplot(y='words',x='freq',data=data,color=color,ax=axis).set_title(title+" top "+str(topn))
    else:
        sns.barplot(y='words',x='freq',data=data,color=color).set_title(title+" top "+str(topn))

        
from matplotlib import rcParams

rcParams['figure.figsize'] = 8,6 ## Sets the heigth and width of image

                ## single words in a tweets
fig,ax=plt.subplots(1,2)
fig.subplots_adjust(wspace=0.5) #Adjusts the space between the two plots
plotMostCommonWords(positive_tweets,20,"Positive tweet Unigrams",axis=ax[0])

plotMostCommonWords(negative_tweets,20,"Negative tweet Unigrams",color="red",axis=ax[1])

Notebook Image

Bigrams

Distribution of postive and negative tweets

In [73]:
def generateNGram(text,n):
    tokens=text.split(" ")
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return ["_".join(ngram) for ngram in ngrams]
In [74]:
positive_tweets_bigrams=[" ".join(generateNGram(tweet,2)) for tweet in positive_tweets]
negative_tweets_bigrams=[" ".join(generateNGram(tweet,2)) for tweet in negative_tweets] 
In [75]:
rcParams['figure.figsize'] = 15,20
fig,ax=plt.subplots(1,2)        ### combination of two words in tweets
fig.subplots_adjust(wspace=1)
plotMostCommonWords(positive_tweets_bigrams,40,"Positive tweet Bigrams",axis=ax[0])

plotMostCommonWords(negative_tweets_bigrams,40,"Negative tweet Bigrams",color="red",axis=ax[1])