
Data Science Tweets


A scraper that collects tweets about data science topics in parallel, using `twitter_scraper` and `multiprocessing`.

In [ ]:
# Install the twitter_scraper and jovian packages using pip3
! pip3 install -q twitter_scraper
! pip3 install -q jovian
In [ ]:
from twitter_scraper import get_tweets
from multiprocessing import Pool
import pandas as pd
from tqdm.notebook import tqdm
import jovian
In [ ]:
# Keywords and hashtags that we're interested in
keywords = ['machinelearning', 'ML', 'deeplearning', 
            '#artificialintelligence', '#NLP', 'computervision', 'AI', 
            'tensorflow', 'pytorch', "sklearn", "pandas", "plotly", 
            "spacy", "fastai", 'datascience', 'dataanalysis']
In [8]:
# Let's run one iteration to understand how to use this library.
# get_tweets returns a generator, so we materialize it into a list
# to be able to iterate over it more than once
tweets = list(get_tweets("#machinelearning", pages = 5))

# Let's print the keys available on each tweet
print('Keys:', list(tweets[0].keys()), '\n')

# Run the extraction for one keyword, keeping only the relevant fields.
# Collecting the rows in a list and building the DataFrame once replaces
# the deprecated DataFrame.append
rows = [{'text' : tweet['text'],
         'isRetweet' : tweet['isRetweet'],
         'replies' : tweet['replies'],
         'retweets' : tweet['retweets'],
         'likes' : tweet['likes']} for tweet in tweets]
tweets_df = pd.DataFrame(rows)
tweets_df.head()
Keys: ['tweetId', 'isRetweet', 'time', 'text', 'replies', 'retweets', 'likes', 'entries']
Out[8]:
In [49]:
%%time
# Measure how long this takes when run sequentially over all keywords
all_rows = []
for word in tqdm(keywords):
  tweets = get_tweets(word, pages = 100)
  try:
    for tweet in tweets:
      all_rows.append({'hashtag' : word,
                       'text' : tweet['text'],
                       'isRetweet' : tweet['isRetweet'],
                       'replies' : tweet['replies'],
                       'retweets' : tweet['retweets'],
                       'likes' : tweet['likes']})
  except Exception as e:
    print(word, ':', e)
    continue
all_tweets_df = pd.DataFrame(all_rows)
CPU times: user 3min 3s, sys: 254 ms, total: 3min 3s
Wall time: 4min 58s
In [ ]:
# A function that fetches the tweets for a single keyword and returns
# them as a DataFrame, so that it can be mapped over keywords in parallel
def fetch_tweets(word):
  rows = []
  tweets = get_tweets(word, pages = 100)
  try:
    for tweet in tweets:
      rows.append({'hashtag' : word,
                   'text' : tweet['text'],
                   'isRetweet' : tweet['isRetweet'],
                   'replies' : tweet['replies'],
                   'retweets' : tweet['retweets'],
                   'likes' : tweet['likes']})
  except Exception as e:
    print(word, ':', e)
  return pd.DataFrame(rows)
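Note that `fetch_tweets` is defined at the top level of the notebook: `multiprocessing.Pool` pickles the function reference to hand it to the worker processes, so a lambda or a function nested inside another function would fail to pickle here.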
In [51]:
%%time
# Run the same work in parallel across 4 worker processes to compare timings
with Pool(4) as p:
    records = p.map(fetch_tweets, keywords)
CPU times: user 69.7 ms, sys: 49.9 ms, total: 120 ms
Wall time: 58.3 s
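Wall time drops from roughly 5 minutes to under a minute, which is close to the expected gain: the work is network-bound, so the four worker processes wait on Twitter concurrently rather than one keyword at a time. `p.map` returns one DataFrame per keyword; below is a minimal sketch of a follow-up cell that concatenates them into a single frame, mirroring the sequential `all_tweets_df` (the name `parallel_tweets_df` is introduced here for illustration).

In [ ]:
# Combine the per-keyword DataFrames returned by Pool.map into one frame,
# mirroring the all_tweets_df built by the sequential loop
parallel_tweets_df = pd.concat(records, ignore_index = True)
parallel_tweets_df.head()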
In [52]:
jovian.commit('pool implemented')
[jovian] Error: Failed to detect Jupyter notebook or Python script. Skipping..