The objective of this project is to deliver insights into customer demand and thereby help developers popularize their products. The dataset, taken from Kaggle, covers roughly 10k Play Store apps and is intended for analyzing the Android market. It consists of 10,841 rows and 13 columns.
!pip install numpy pandas seaborn matplotlib -q
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
!pip install jovian opendatasets --upgrade --quiet
# Download the Google Play Store apps dataset from Kaggle into the working
# directory. The download prompts for Kaggle credentials.
import opendatasets as od

dataset_url = 'https://www.kaggle.com/lava18/google-play-store-apps'
od.download(dataset_url)
Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: ritikars
Your Kaggle Key: ········
0%| | 0.00/1.94M [00:00<?, ?B/s]
Downloading google-play-store-apps.zip to .\google-play-store-apps
100%|██████████████████████████████████████████████████████████████████████████████| 1.94M/1.94M [00:05<00:00, 384kB/s]
# Load the downloaded CSV into a DataFrame and list its column names.
csv_path = 'google-play-store-apps/googleplaystore.csv'
google_playstore_df = pd.read_csv(csv_path)
google_playstore_df.columns
Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
'Android Ver'],
dtype='object')
# (rows, columns) of the raw dataset.
google_playstore_df.shape
(10841, 13)
# Per-column non-null counts and dtypes of the raw dataset.
google_playstore_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 App 10841 non-null object
1 Category 10841 non-null object
2 Rating 9367 non-null float64
3 Reviews 10841 non-null object
4 Size 10841 non-null object
5 Installs 10841 non-null object
6 Type 10840 non-null object
7 Price 10841 non-null object
8 Content Rating 10840 non-null object
9 Genres 10841 non-null object
10 Last Updated 10841 non-null object
11 Current Ver 10833 non-null object
12 Android Ver 10838 non-null object
dtypes: float64(1), object(12)
memory usage: 1.1+ MB
# Summary statistics of the dataset.
google_playstore_df.describe()
import jovian
# Save a snapshot of the notebook to Jovian.
jovian.commit()
# Show row 10472 before discarding it.
google_playstore_df.loc[10472]
# Row 10472 removed due to missing value of Category
google_playstore_df.drop(google_playstore_df.index[10472], inplace=True)

# 'Installs' is stored as text: strip the trailing '+' and the ',' separators,
# then convert the column to numbers.
google_playstore_df['Installs'] = pd.to_numeric(
    google_playstore_df['Installs'].str.rstrip('+').str.replace(',', '', regex=False)
)
# Remove the '$' sign from 'Price' and convert to numbers. regex=False is
# required so that '$' is always treated as a literal character and never as
# the regex end-of-string anchor, whatever the pandas default for `regex` is.
google_playstore_df['Price'] = pd.to_numeric(
    google_playstore_df['Price'].str.replace('$', '', regex=False)
)
# Re-check the column dtypes after converting 'Installs' and 'Price' to numbers.
google_playstore_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10840 entries, 0 to 10840
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 App 10840 non-null object
1 Category 10840 non-null object
2 Rating 9366 non-null float64
3 Reviews 10840 non-null object
4 Size 10840 non-null object
5 Installs 10840 non-null int64
6 Type 10839 non-null object
7 Price 10840 non-null float64
8 Content Rating 10840 non-null object
9 Genres 10840 non-null object
10 Last Updated 10840 non-null object
11 Current Ver 10832 non-null object
12 Android Ver 10838 non-null object
dtypes: float64(2), int64(1), object(10)
memory usage: 1.2+ MB
# Smallest and largest install count in the dataset.
google_playstore_df['Installs'].min(),google_playstore_df['Installs'].max()
(0, 1000000000)
# Base-2 logarithm of the install counts. 'Installs' can be 0, for which log2
# yields -inf and a "divide by zero" RuntimeWarning; clipping to 1 first maps
# those apps to 0 instead.
google_playstore_df['log_installs'] = np.log2(google_playstore_df['Installs'].clip(lower=1))
C:\Users\SinghRit\Anaconda3\lib\site-packages\pandas\core\series.py:679: RuntimeWarning: divide by zero encountered in log2
result = getattr(ufunc, method)(*inputs, **kwargs)
# True when at least one app name occurs more than once.
duplicate_name_mask = google_playstore_df['App'].duplicated()
boolean = duplicate_name_mask.any()
boolean
True
# Number of rows per app name; a count above 1 means the app is listed more than once.
google_playstore_df['App'].value_counts()
ROBLOX 9
CBS Sports App - Scores, News, Stats & Watch Live 8
ESPN 7
8 Ball Pool 7
Duolingo: Learn Languages Free 7
..
CHRONO TRIGGER (Upgrade Ver.) 1
No Pimple - Fun games 1
Modern Counter 3: FPS Multiplayers battlegro 3 1
BigOven Recipes, Meal Planner, Grocery List & More 1
Acorn TV: World-class TV from Britain and Beyond 1
Name: App, Length: 9659, dtype: int64
# Show the rows of the app that is listed most often.
google_playstore_df[google_playstore_df['App'] == 'ROBLOX']
# Remove rows that are exact copies of another row.
google_playstore_df.drop_duplicates(inplace=True)
# 'Reviews' is stored as text; convert it to numbers so rows can be compared.
# regex=False keeps '$' a literal character instead of a regex anchor,
# independent of the pandas default for `regex`.
google_playstore_df['Reviews'] = pd.to_numeric(
    google_playstore_df['Reviews'].str.replace('$', '', regex=False)
)
# For apps that are still listed more than once, keep only the row with the
# highest review count.
google_playstore_df = google_playstore_df.loc[
    google_playstore_df.groupby('App')['Reviews'].idxmax()
]
# Global plot styling. The seaborn style is applied first so that the
# explicit matplotlib settings below take precedence over it.
sns.set_style('darkgrid')
plt.rcParams.update({
    'font.size': 14,
    'figure.figsize': (9, 5),
    'figure.facecolor': '#00000000',
})
# Number of apps per genre, most common genre first. rename_axis/reset_index
# name the two columns explicitly, so the result does not depend on how the
# installed pandas version labels the output of value_counts().
top_genres = (
    google_playstore_df['Genres']
    .value_counts()
    .rename_axis('Genres')
    .reset_index(name='Count')
)
# Total installs per genre (indexed by genre).
genres_installs = google_playstore_df.groupby(['Genres'])[['Installs']].sum()
# One row per genre with both its app count and its total installs.
top_genres_installs = pd.merge(top_genres, genres_installs, on='Genres')
top_20_genres_installs = top_genres_installs.head(20)

# Bar chart: app count of the 20 most common genres.
plt.figure(figsize=(14,7))
plt.xticks(rotation=65)
plt.xlabel("Genres")
plt.ylabel("Number of application")
plt.title("Top 20 Genres")
# x/y are passed by keyword: newer seaborn releases reject positional data.
sns.barplot(x=top_20_genres_installs['Genres'], y=top_20_genres_installs['Count'])
plt.show()

# Bar chart: total installs of the same 20 genres.
plt.figure(figsize=(14,7))
plt.xticks(rotation=65)
plt.xlabel("Genres")
plt.ylabel("Installs")
plt.title("Installs according to Genres")
sns.barplot(x=top_20_genres_installs['Genres'], y=top_20_genres_installs['Installs'])
plt.show()
# Number of apps per category, most common category first. The columns are
# named explicitly so the result does not depend on how the installed pandas
# version labels the output of value_counts().
top_category = (
    google_playstore_df['Category']
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='Count')
)
# Total installs per category (indexed by category).
category_installs = google_playstore_df.groupby(['Category'])[['Installs']].sum()
# One row per category with both its app count and its total installs.
top_category_installs = pd.merge(top_category, category_installs, on='Category')
# NOTE: despite the name, this keeps every category, not only the top 20.
top_20_category_installs = top_category_installs

# Bar chart: app count per category.
plt.figure(figsize=(14,7))
plt.xticks(rotation=90)
plt.xlabel("Category")
plt.ylabel("Number of application")
plt.title("Count of applications for each Category")
# x/y are passed by keyword: newer seaborn releases reject positional data.
sns.barplot(x=top_20_category_installs['Category'], y=top_20_category_installs['Count'])
plt.show()

# Bar chart: total installs per category.
plt.figure(figsize=(14,7))
plt.xticks(rotation=90)
plt.xlabel("Category")
plt.ylabel("Installs")
plt.title("Number of installed applications for each Category")
sns.barplot(x=top_20_category_installs['Category'], y=top_20_category_installs['Installs'])
plt.show()
!pip install jovian --upgrade -q
import jovian
# Save the current state of the notebook to the Jovian project.
jovian.commit(project_name = 'course-project-google-play-store-dataset')
[jovian] Attempting to save notebook..
[jovian] Updating notebook "ritz1602-rs/course-project-google-play-store-dataset" on https://jovian.ml/
[jovian] Uploading notebook..
[jovian] Capturing environment..
[jovian] Committed successfully! https://jovian.ml/ritz1602-rs/course-project-google-play-store-dataset
# Mean rating per genre, joined onto the per-genre count/installs table.
genres_ratings_df = google_playstore_df.groupby('Genres')[['Rating']].mean()
genres_installs_ratings = top_genres_installs.merge(genres_ratings_df, on='Genres')
# Summary statistics of the per-genre mean ratings.
genres_installs_ratings['Rating'].describe()
count 114.000000
mean 4.248546
std 0.182448
min 3.800000
25% 4.127083
50% 4.246502
75% 4.344442
max 4.800000
Name: Rating, dtype: float64
# Kernel density estimate of the per-genre mean ratings.
# NOTE(review): `shade` is deprecated in newer seaborn releases in favour of
# `fill`; kept as-is so the cell still runs on the seaborn version in use.
plt.figure(figsize=(14,7))
g = sns.kdeplot(genres_installs_ratings['Rating'], color="Red", shade=True)
g.set(xlabel="Rating", ylabel="Frequency")
g.set_title('Distribution of Rating', size=20)
plt.show()
# Save the current state of the notebook to the Jovian project.
jovian.commit(project_name = 'course-project-google-play-store-dataset')
[jovian] Attempting to save notebook..
[jovian] Updating notebook "ritz1602-rs/course-project-google-play-store-dataset" on https://jovian.ml/
[jovian] Uploading notebook..
[jovian] Capturing environment..
[jovian] Committed successfully! https://jovian.ml/ritz1602-rs/course-project-google-play-store-dataset
# Rank genres by mean rating, best first. Genres without any rating (NaN)
# are placed at the end by sort_values.
genres_installs_ratings.sort_values('Rating', ascending=False, inplace=True)
highest_rated_genres = genres_installs_ratings.iloc[0:20]
# Discard unrated genres BEFORE taking the bottom 20. Slicing first and
# filtering afterwards would let the NaN rows use up slots and return fewer
# than 20 genres.
rated_genres = genres_installs_ratings[genres_installs_ratings['Rating'].notnull()]
lowest_rated_genres = rated_genres.iloc[-20:]

# Bar chart: the 20 best rated genres.
plt.figure(figsize=(14,7))
plt.xticks(rotation=65)
plt.xlabel("Genres")
plt.ylabel("Rating")
plt.title("Ratings according to Genres")
# x/y are passed by keyword: newer seaborn releases reject positional data.
sns.barplot(x=highest_rated_genres['Genres'], y=highest_rated_genres['Rating'])
plt.show()

# Bar chart: the 20 worst rated genres.
plt.figure(figsize=(14,7))
plt.xticks(rotation=65)
plt.xlabel("Genres")
plt.ylabel("Rating")
plt.title("Ratings according to Genres")
sns.barplot(x=lowest_rated_genres['Genres'], y=lowest_rated_genres['Rating'])
plt.show()
# Number of apps for every (Category, Type) pair, in long format.
app_count = (
    google_playstore_df.groupby(['Category', 'Type'])[['App']]
    .count()
    .reset_index()
    .rename(columns={'App': 'Count'})
)
# Wide format: one row per category, one column per type. Combinations that
# do not occur get a count of 0. The pivot arguments are passed by keyword
# because newer pandas releases no longer accept them positionally.
df_app_count = (
    app_count.pivot(index='Category', columns='Type', values='Count')
    .fillna(0)
    .reset_index()
)
# Stacked bar chart of the app counts per category, split by type.
df_app_count.set_index('Category').plot(kind='bar', stacked=True, figsize=(18,9))
plt.xlabel("Category", fontsize=15)
plt.ylabel("Count", fontsize=15)
plt.title("Count of applications in each category differentiated by their type")
plt.show()
# Flag the apps that belong to the GAME category.
google_playstore_df['Gaming Category App'] = google_playstore_df['Category'] == 'GAME'

# Total installs for every (Category, Type) pair.
categoty_type_installs = (
    google_playstore_df.groupby(['Category', 'Type'])[['Installs']].sum().reset_index()
)
# The bars show log2 of the totals. Clip to 1 first so that a group with 0
# installs becomes 0 instead of -inf.
categoty_type_installs['log_Installs'] = np.log2(
    categoty_type_installs['Installs'].clip(lower=1)
)

plt.figure(figsize=(18,9))
plt.xticks(rotation=65,fontsize=9)
plt.xlabel("Category")
plt.ylabel("Installs")
plt.title("Number of installs type wise according to Category")
# x/y are passed by keyword: newer seaborn releases reject positional data.
sns.barplot(x='Category', y='log_Installs', hue='Type', data=categoty_type_installs);
plt.show()
# Converting 'Size' to a number of megabytes
def _size_to_mb(size):
    """Return *size* as a float number of megabytes.

    'Varies with device' becomes NaN, a value containing 'M' is taken as
    megabytes, a value containing 'k' is divided by 1000, and anything else
    is converted with float() unchanged.
    """
    # Thousands separators are removed up front. Previously this ran only
    # after the 'M' suffix had been stripped and was guarded by an 'M' check,
    # so it never took effect.
    text = str(size).replace(',', '')
    if 'Varies with device' in text:
        return float('nan')
    if 'M' in text:
        return float(text.replace('M', ''))
    if 'k' in text:
        return float(text.replace('k', '')) / 1000
    return float(text)


google_playstore_df['Size'] = google_playstore_df['Size'].apply(_size_to_mb)
# Apps with 0 installs have log2 == -inf. Map exactly those values to 0, and
# do it before anything is plotted so the scatter plot sees finite values.
# Only -inf is replaced; overwriting "whatever the minimum is" would corrupt
# real data when no -inf is present.
google_playstore_df.loc[google_playstore_df['log_installs'] == -np.inf, 'log_installs'] = 0

# App size against log2 of installs, coloured by app type.
plt.figure(figsize=(14,7))
# x/y are passed by keyword: newer seaborn releases reject positional data.
sns.scatterplot(
    x=google_playstore_df['Size'],
    y=google_playstore_df['log_installs'],
    hue=google_playstore_df['Type'],
)
plt.show()

# Histogram of log2 of installs.
plt.xlabel("Log of Installs")
plt.title("Distribution of Logrithm of Installs")
plt.hist(google_playstore_df['log_installs']);