Jovian
⭐️
Sign In

Introduction

Scrape Amazon reviews.

Imports

Import libraries and write settings here.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
import numpy as np

Analysis/Modeling

Create a function to get the list of ASINs

In [4]:
def getAsinData(search_query,header,cookie):
    # Fetch a search-results page and return the ASINs found on it.
    #
    # Parameters:
    #   search_query -- the search term; presumably meant to be used when
    #                   building `url` (that expression is an exercise blank).
    #   header       -- passed to requests.get as `headers`.
    #   cookie       -- passed to requests.get as `cookies`.
    # Returns:
    #   A list of the non-empty `data-asin` attribute values of the selected
    #   tags, or [] when the response status is not 200.
    #
    # NOTE: exercise template -- the right-hand sides of `url` and `asin_tags`
    # are intentionally left blank and must be filled in before this runs.
    ## Write the URL that is to be scraped for the given search query
    url=
    page=requests.get(url,cookies=cookie,headers=header)

    if page.status_code==200:
        content=page.content
        # NOTE(review): no parser is named here, so BeautifulSoup picks one itself.
        soup=BeautifulSoup(content)
        ## Extract all tags which contain the ASIN (tags carrying a `data-asin` attribute)
        asin_tags=
        asin_list=[]
        for tag in asin_tags:
            # Assumes every selected tag has a `data-asin` attribute -- TODO confirm
            # once the selector above is written.
            asin=tag['data-asin']
            # Tags with an empty data-asin value are skipped.
            if asin!="":
                asin_list.append(asin)
        return asin_list
    else:
        # Any non-200 response yields an empty result rather than an error.
        return []
In [5]:
## Use the function above to extract all the ASINs for a given query

Since the product page won't show all reviews, we need to get the link to the all-reviews page. We also need to get the total number of reviews present for a given ASIN.

In [ ]:
def extractAllReviewPage(asin_number,header,cookie):
    # Fetch the page for one ASIN and pull out what is needed to scrape its reviews.
    #
    # Parameters:
    #   asin_number -- the ASIN; presumably meant to be used when building `url`
    #                  (that expression is an exercise blank).
    #   header      -- passed to requests.get as `headers`.
    #   cookie      -- passed to requests.get as `cookies`.
    # Returns a 3-tuple (all_reviews_link, total_reviews, keywords):
    #   on success -- the extracted link, the review count as an int, and a list
    #                 of keyword strings (empty unless total_reviews > 5000);
    #   on a non-200 response or any exception -- ("", 0, 0).
    # NOTE(review): the failure value's third element is the int 0, whereas the
    # success path returns a list -- callers must handle both types.
    #
    # NOTE: exercise template -- `url`, `all_reviews_link`, `total_reviews` and
    # `keywords_link` are intentionally left blank and must be filled in.
    # Write the URL to access for the given ASIN
    url=
    print(url)
    page=requests.get(url,headers=header,cookies=cookie)
    #print(page.content)
    if page.status_code==200:
        # Debug output: dumps the whole raw response body.
        print(page.content)
        content=page.content
        #print(content)
        try:
            soup=BeautifulSoup(content)
            ## For ASIN B00TFGWAA8, clicking "See all 26,097 customer reviews" takes you to the all-reviews page. Extract the URL of this link
            all_reviews_link=
            #print(all_reviews_link)
            
            ## Use the same tag as above but extract the text "See all 26,097 customer reviews". Then extract only 26,097 from it and convert it to a number
            total_reviews=
            #print(total_reviews)
            
            total_reviews=int(total_reviews)
            keywords=[]
            ### You can navigate only up to 5000 reviews (500 pages - Amazon doesn't allow going beyond that). So, if there are more than 5000 reviews, extract all the keywords present
            ## The keywords are listed under "Read reviews that mention"
            
            
            if total_reviews>5000:
                # Get the keywords associated with the reviews as a list.
                keywords_link=
                # NOTE(review): num_keywords is assigned but never used in this function.
                num_keywords=len(keywords_link)
                for keyword_link in keywords_link:
                    keywords.append(keyword_link.get_text().strip())
                print(keywords)

            return all_reviews_link,total_reviews,keywords
        except Exception as e:
            # Any parsing/conversion failure is printed and turned into the failure tuple.
            print(e)
            return "",0,0
    else:
        return "",0,0

Use the above function to extract all_reviews_link, total_reviews and keywords

In [ ]:
def extractReviews(all_reviews_link,total_reviews,header,cookie):
    # Collect the review <div> tags from every page of an all-reviews listing.
    #
    # Parameters:
    #   all_reviews_link -- link of the all-reviews page, appended to
    #                       "https://www.amazon.in/".
    #   total_reviews    -- number of reviews to cover (int); determines how
    #                       many pages are requested.
    #   header           -- passed to requests.get as `headers`.
    #   cookie           -- passed to requests.get as `cookies`.
    # Returns:
    #   A list of the div tags (class "a-section celwidget") found on all pages.
    #   Pages that return a non-200 status or fail to parse are reported with
    #   print() and skipped, so the list may be shorter than total_reviews.
    #   When total_reviews <= 0 no request is made and [] is returned.

    # There are 10 reviews per page.
    reviews_per_page=10

    reviews_url="https://www.amazon.in/"+all_reviews_link
    # Ceiling division: a partially filled last page still has to be fetched.
    num_pages=(total_reviews+reviews_per_page-1)//reviews_per_page

    reviews_tag=[]

    print("Total pages "+str(num_pages))
    for page_num in range(1,num_pages+1):
        # The first page is the plain listing URL; later pages add pageNumber.
        if page_num==1:
            url=reviews_url
        else:
            url=reviews_url+"&pageNumber="+str(page_num)
        response=requests.get(url,headers=header,cookies=cookie)
        if response.status_code!=200:
            print("Skipping page "+str(page_num)+": HTTP status "+str(response.status_code))
            continue
        try:
            # Convert page content into a BeautifulSoup object
            soup=BeautifulSoup(response.content)
            ### Reviews are present in two sections of the page. We need the bottom one:
            ### the div tags whose class attribute is "a-section celwidget"
            div_reviews=soup.find_all("div",class_="a-section celwidget")
            print("Number of Reviews in Page "+str(page_num)+" is  "+str(len(div_reviews)))
            reviews_tag.extend(div_reviews)
        except Exception as e:
            # Best effort: report the failure and carry on with the next page.
            print(e)
    return reviews_tag

Let us create a function to extract review details from the reviews tag extracted from above function

In [ ]:
def extractReviewDetails(reviews_tag):
    # Turn a list of review tags into a DataFrame with one row per review.
    #
    # Parameters:
    #   reviews_tag -- iterable of review tags (presumably the BeautifulSoup
    #                  div tags returned by extractReviews -- TODO confirm).
    # Returns:
    #   A pandas DataFrame with the columns review_text, review_title,
    #   review_ratings and review_date. A review for which any extraction
    #   raises is printed and left out entirely.
    #
    # NOTE: exercise template -- the four extraction expressions below are
    # intentionally left blank and must be filled in before this runs.
    body_tags=[]
    title_tags=[]
    stars_tags=[]
    review_dates=[]
    data=pd.DataFrame()
    for review_tag in reviews_tag:
        try:


            ## Extract the review body and strip whitespace
            body_tag=
            ## Extract the title and strip whitespace
            title_tag=
            ## Extract the rating and strip whitespace
            stars_tag=
            ## Extract the review date and strip whitespace
            review_date_tag=
            
            # All four values are appended only after every extraction succeeded,
            # which keeps the four lists the same length.
            body_tags.append(body_tag)
            title_tags.append(title_tag)
            stars_tags.append(stars_tag)
            review_dates.append(review_date_tag)


        except Exception as e:

            # Report the failure and move on to the next review.
            print(e)
            pass





    data['review_text']=body_tags
    data['review_title']=title_tags
    data['review_ratings']=stars_tags
    data['review_date']=review_dates
    return data

Let us write a function to extract reviews by keyword and rating, to get more than 5000 reviews

In [ ]:
def extractReviewByKeyword(all_reviews_link,keyword,header,cookie):
    # Fetch the review tags for the reviews that match one keyword.
    #
    # Parameters:
    #   all_reviews_link -- link of the all-reviews page; everything from
    #                       "ref=" onwards is dropped before the keyword
    #                       filter is appended.
    #   keyword          -- the keyword to filter by; spaces are replaced by "+".
    #   header           -- passed to requests.get as `headers`.
    #   cookie           -- passed to requests.get as `cookies`.
    # Returns:
    #   The list of review tags assigned to `reviews_tag`. If anything inside
    #   the try block raises, the exception is printed and the function falls
    #   off the end, i.e. returns None.
    #
    # NOTE: exercise template -- the keyword slot in `reviews_url`, plus
    # `total_review_keyword` and `reviews_tag`, are intentionally left blank.
    ## Keep only the part of the reviews link before "ref="
    all_reviews_link=all_reviews_link.split("ref=")[0]
    keyword=keyword.replace(" ","+")
    # NOTE(review): the keyword is presumably meant to go between the two "+"
    # operators; as written the second "+" is a unary plus applied to a str.
    reviews_url=all_reviews_link+"?filterByKeyword="++"&sortBy=helpful" # complete this URL



    
    
    print("https://www.amazon.in/"+reviews_url+"&pageNumber=1")
    ## From just the first page, get the number of reviews present for this keyword
    page=requests.get("https://www.amazon.in/"+reviews_url+"&pageNumber=1",headers=header,cookies=cookie)
    print(page.status_code)
    content=page.content
    try:

        soup=BeautifulSoup(content)
        ## Extract the total number of reviews present for this review URL (hint: extract the line containing "Showing 1-10 of") and convert it into an integer
        total_review_keyword=
        print(total_review_keyword)
        
        total_review_keyword=int(total_review_keyword)
        print("Total Reviews for keyword "+keyword+"is "+str(total_review_keyword))

        ### Cap it at 5000, the maximum number of reviews that can be navigated
        if total_review_keyword>5000:
            total_review_keyword=5000
        ### Call the extractReviews function defined above to get the review tags
        reviews_tag=
        print("Number of Reviews Extracted for Keyword "+keyword)
        print(len(reviews_tag))
        return reviews_tag


    except Exception as e:
        # Failure is only printed; no explicit return follows, so the result is None.
        print(e)
        pass
In [ ]:
### For each ASIN, get the total number of reviews present. If total_reviews>5000, use the extractReviewByKeyword function; otherwise use the extractReviews function.
### For the list of review tags collected, extract the review details, get a dataframe for each ASIN and save it as asin_review.csv
### Before writing to CSV, call drop_duplicates. You may still not get all the reviews (for that you would also need to filter by ratings in addition to filterByKeyword) - those who want to try it, please feel free