Jovian
⭐️
Sign In
In [1]:
!sudo pip3 install simplejson
import simplejson as json
Requirement already satisfied: simplejson in /usr/local/lib/python3.6/dist-packages (3.17.0)
In [2]:
# Connect Colab to Google Drive: mount the Drive at /content/gdrive so files
# stored there can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/gdrive')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
In [4]:
# List the contents of the dataset folder on the mounted Drive.
!ls /content/gdrive/My\ Drive/Temp/Amazon\ Reviews/
Electronics_5.json.gz Electronics.csv Main.ipynb
In [ ]:
import pandas as pd
import gzip

def parse(path):
  """Yield one decoded JSON value per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The result of ``json.loads`` for each line, in file order.
  """
  # The context manager closes the gzip handle once the generator is
  # exhausted or closed (including when a consumer stops early and the
  # generator is finalised); the bare gzip.open() call leaked the handle.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield json.loads(l)
In [ ]:
def getDF(path):
  """Load every record produced by ``parse(path)`` into a pandas DataFrame.

  Each record is keyed by its 0-based position in the stream, so it becomes
  one row whose index label is that position.
  """
  indexed_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(indexed_records, orient='index')

# Read the Electronics_5 reviews file into a DataFrame and preview its first rows.
# NOTE(review): the doubled slashes in the path look unintentional — confirm.
df = getDF("//content//gdrive//My Drive//Temp//Amazon Reviews//Electronics_5.json.gz")
df.head()
In [8]:
import gzip

def parse(path):
  """Stream a gzip-compressed file, decoding each line as one JSON value.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The result of ``json.loads`` for each line, in file order.
  """
  # Opening through a context manager guarantees the gzip handle is closed
  # when iteration finishes or the generator is closed early; previously
  # the handle was never closed.
  with gzip.open(path, 'r') as g:
    for l in g:
      yield json.loads(l)

# Collect the `overall` field of every review in Electronics_5.json.gz,
# then print the arithmetic mean of those values.
ratings = [
  review['overall']
  for review in parse("//content//gdrive//My Drive//Temp//Amazon Reviews//Electronics_5.json.gz")
]

print(sum(ratings) / len(ratings))
4.26766835964799
In [11]:
# Electronics: load Electronics.csv from the mounted Drive folder into a
# DataFrame and preview its first rows.
import pandas as pd
electronics = pd.read_csv("//content//gdrive//My Drive//Temp//Amazon Reviews//Electronics.csv")
electronics.head()
Out[11]:
In [ ]:
# Electronics metadata: gather the `title` of each of the first 1,000,000
# records, skipping any record that has no `title` key.
titles = []

for i, review in enumerate(
    parse("//content//gdrive//My Drive//Temp//Amazon Reviews//meta_Electronics.json.gz"),
    start=1):
  if 'title' in review:
    titles.append(review["title"])
  # Stop once the millionth record has been handled.
  if i == 1_000_000:
    break

# print(titles)
In [30]:
# https://www.amazon.com/s?k=car+gps&ref=nb_sb_noss
# Fraction of the collected titles that contain "car gps" (case-insensitive).
titles_GPS = [title for title in titles if "car gps" in title.lower()]

print(len(titles_GPS) / len(titles))
0.0002719694784159176
In [ ]:
 
In [ ]: