The objective of this project is to deliver insights that help developers understand customer demand better and thereby popularize their products. The dataset is taken from Kaggle. It covers roughly 10k Play Store apps and is intended for analyzing the Android market. It consists of 10841 rows and 13 columns in total.
!pip install numpy pandas seaborn matplotlib -q
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
!pip install jovian opendatasets --upgrade --quiet
# Fetch the Google Play Store apps dataset from Kaggle. opendatasets prompts
# for Kaggle credentials and downloads into ./google-play-store-apps.
import opendatasets as od

dataset_url = 'https://www.kaggle.com/lava18/google-play-store-apps'
od.download(dataset_url)
Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: ritikars
Your Kaggle Key: ········
0%| | 0.00/1.94M [00:00<?, ?B/s]
Downloading google-play-store-apps.zip to .\google-play-store-apps
100%|█████████████████████████████████████████████████████████████████████████████| 1.94M/1.94M [00:00<00:00, 2.88MB/s]
# Load the apps table from the folder the dataset was downloaded into,
# then list the column names.
apps_csv_path = 'google-play-store-apps/googleplaystore.csv'
google_playstore_df = pd.read_csv(apps_csv_path)
google_playstore_df.columns
Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
'Android Ver'],
dtype='object')
# Table dimensions as (rows, columns).
google_playstore_df.shape
(10841, 13)
# Per-column non-null counts and dtypes, to spot missing values and columns
# that were read as text.
google_playstore_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 App 10841 non-null object
1 Category 10841 non-null object
2 Rating 9367 non-null float64
3 Reviews 10841 non-null object
4 Size 10841 non-null object
5 Installs 10841 non-null object
6 Type 10840 non-null object
7 Price 10841 non-null object
8 Content Rating 10840 non-null object
9 Genres 10841 non-null object
10 Last Updated 10841 non-null object
11 Current Ver 10833 non-null object
12 Android Ver 10838 non-null object
dtypes: float64(1), object(12)
memory usage: 1.1+ MB
# Summary statistics of the table (numeric columns by default).
google_playstore_df.describe()
# Save a snapshot of the notebook to Jovian.
import jovian
jovian.commit()
[jovian] Attempting to save notebook..
# Inspect row 10472: its values sit one column too far to the left
# (Category holds '1.9', Rating holds 19, Installs holds 'Free'), so the
# record is malformed.
google_playstore_df.loc[10472]
App Life Made WI-Fi Touchscreen Photo Frame
Category 1.9
Rating 19
Reviews 3.0M
Size 1,000+
Installs Free
Type 0
Price Everyone
Content Rating NaN
Genres February 11, 2018
Last Updated 1.0.19
Current Ver 4.0 and up
Android Ver NaN
Name: 10472, dtype: object
# Row 10472 is malformed: its values are shifted one column to the left
# (Category holds '1.9', Installs holds 'Free'), so it is removed.
# Dropping by label with errors='ignore' keeps this cell safe to re-run;
# a positional in-place drop would delete a different, valid row each time.
google_playstore_df = google_playstore_df.drop(index=10472, errors='ignore')

# Installs is text: strip a trailing '+' and any ',' separators, then
# convert to a number. astype(str) lets the cell be re-run after the column
# has already become numeric.
google_playstore_df['Installs'] = pd.to_numeric(
    google_playstore_df['Installs']
    .astype(str)
    .str.rstrip('+')
    .str.replace(',', '', regex=False)
)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
pandas\_libs\lib.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string "Free"
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-17-26e58613fa20> in <module>
----> 1 google_playstore_df['Installs'] = pd.to_numeric(google_playstore_df['Installs'].str.replace(',',''))
~\Anaconda3\lib\site-packages\pandas\core\tools\numeric.py in to_numeric(arg, errors, downcast)
148 try:
149 values = lib.maybe_convert_numeric(
--> 150 values, set(), coerce_numeric=coerce_numeric
151 )
152 except (ValueError, TypeError):
pandas\_libs\lib.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string "Free" at position 10472
# Price is an object (text) column; remove the '$' sign before converting.
# '$' is also the regex end-of-string anchor and the default of the `regex`
# argument differs between pandas versions, so request a literal match.
google_playstore_df['Price'] = pd.to_numeric(
    google_playstore_df['Price'].astype(str).str.replace('$', '', regex=False)
)
# Remove spaces from the category names.
google_playstore_df['Category'] = google_playstore_df['Category'].str.replace(' ', '', regex=False)
google_playstore_df.info()
# log2 of the install count, used by the plots below. log2(0) is -inf, so
# clip the count to at least 1 first (log2(1) == 0).
google_playstore_df['log_installs'] = np.log2(google_playstore_df['Installs'].clip(lower=1))
# Does any app name occur more than once?
boolean = google_playstore_df['App'].duplicated().any()
boolean
google_playstore_df['App'].value_counts()
google_playstore_df[google_playstore_df['App']=='ROBLOX']
# Remove rows that are exact copies of another row.
google_playstore_df = google_playstore_df.drop_duplicates()
# Reviews is read as text (object dtype). Convert it so the "most reviews"
# selection below compares numbers rather than strings.
google_playstore_df['Reviews'] = pd.to_numeric(google_playstore_df['Reviews'])
# For apps that are still listed several times, keep the row with the
# highest review count. The result is ordered by app name.
google_playstore_df = google_playstore_df.loc[
    google_playstore_df.groupby('App')['Reviews'].idxmax()
]
# Global plot styling: seaborn dark grid, larger default font and figure size.
sns.set_style('darkgrid')
plt.rcParams.update({
    'font.size': 14,
    'figure.figsize': (9, 5),
    'figure.facecolor': '#00000000',  # RGBA with zero alpha
})
# Number of apps per genre, most common first. rename_axis plus
# reset_index(name=...) yields the columns ['Genres', 'Count'] regardless of
# how the installed pandas version names the value_counts() result.
top_genres = (
    google_playstore_df['Genres']
    .value_counts()
    .rename_axis('Genres')
    .reset_index(name='Count')
)
top_genres
# Total installs per genre.
genres_installs = google_playstore_df.groupby(['Genres'])[['Installs']].sum()
genres_installs
# The inner merge keeps the left frame's order (by app count), so head(20)
# selects the 20 most common genres.
top_genres_installs = pd.merge(top_genres, genres_installs, on='Genres')
top_20_genres_installs = top_genres_installs.head(20)
top_genres_installs
# Bar chart: number of apps in each of the 20 most common genres.
plt.figure(figsize=(14,7))
# x/y are passed by keyword because current seaborn releases reject
# positional data arguments.
sns.barplot(x='Genres', y='Count', data=top_20_genres_installs)
# Labels are set after plotting so seaborn's automatic axis labels cannot
# replace them.
plt.xticks(rotation=65)
plt.xlabel("Genres")
plt.ylabel("Number of application")
plt.title("Top 20 Genres");
# Bar chart: total installs for each of the 20 most common genres.
plt.figure(figsize=(14,7))
# x/y are passed by keyword because current seaborn releases reject
# positional data arguments.
sns.barplot(x='Genres', y='Installs', data=top_20_genres_installs)
# Labels are set after plotting so seaborn's automatic axis labels cannot
# replace them.
plt.xticks(rotation=65)
plt.xlabel("Genres")
plt.ylabel("Installs")
plt.title("Installs according to Genres");
google_playstore_df
# Mean rating per genre.
genres_ratings_df = google_playstore_df.groupby(['Genres'])[['Rating']].mean()
genres_ratings_df
genres_installs_ratings = pd.merge(top_genres_installs, genres_ratings_df, on='Genres')
genres_installs_ratings.head()
# Best rated first. sort_values places genres without any rating (NaN) last.
genres_installs_ratings = genres_installs_ratings.sort_values('Rating', ascending=False)
highest_rated_genres = genres_installs_ratings.iloc[0:20]
# Remove unrated genres BEFORE taking the tail. Slicing first would let the
# NaN rows occupy some of the 20 slots and leave fewer than 20 genres.
rated_genres = genres_installs_ratings[genres_installs_ratings['Rating'].notnull()]
lowest_rated_genres = rated_genres.iloc[-20:]
# Bar chart: mean rating of the 20 best rated genres.
plt.figure(figsize=(14,7))
# x/y are passed by keyword because current seaborn releases reject
# positional data arguments.
sns.barplot(x='Genres', y='Rating', data=highest_rated_genres)
# Labels are set after plotting so seaborn's automatic axis labels cannot
# replace them.
plt.xticks(rotation=65)
plt.xlabel("Genres")
plt.ylabel("Rating")
plt.title("Ratings according to Genres");
# Bar chart: mean rating of the lowest rated genres.
plt.figure(figsize=(14,7))
# x/y are passed by keyword because current seaborn releases reject
# positional data arguments.
sns.barplot(x='Genres', y='Rating', data=lowest_rated_genres)
# Labels are set after plotting so seaborn's automatic axis labels cannot
# replace them.
plt.xticks(rotation=65)
plt.xlabel("Genres")
plt.ylabel("Rating")
plt.title("Ratings according to Genres");
# Boolean flag marking the apps whose category is GAME.
google_playstore_df['Gaming Category App'] = google_playstore_df['Category'].eq('GAME')
google_playstore_df
# Total installs for every (Category, Type) pair, flattened back to columns.
categoty_type_installs = (
    google_playstore_df
    .groupby(['Category', 'Type'])[['Installs']]
    .sum()
    .reset_index()
)
categoty_type_installs
# log2 of the totals, so very different magnitudes fit on one chart.
categoty_type_installs['log_Installs'] = np.log2(categoty_type_installs['Installs'])
categoty_type_installs
# Grouped bar chart: log2 of total installs per category, split by Type.
plt.figure(figsize=(14,7))
# x/y are passed by keyword because current seaborn releases reject
# positional data arguments.
sns.barplot(x='Category', y='log_Installs', hue='Type', data=categoty_type_installs)
# Labels are set after plotting so seaborn's automatic axis labels cannot
# replace them. The y label states the log scale that is actually plotted.
plt.xticks(rotation=65,fontsize=9)
plt.xlabel("Category")
plt.ylabel("Installs (log2)")
plt.title("Number of installs type wise according to Category");
# Normalise Size to a float number of megabytes:
#   * 'Varies with device' becomes NaN,
#   * an 'M' suffix is dropped,
#   * values with a 'k' suffix are divided by 1000.
# Commas are stripped unconditionally. The previous comma step only ran when
# 'M' was still in the text, which was never true after the 'M' removal.
size_text = google_playstore_df['Size'].astype(str).str.strip()
size_in_kb = size_text.str.endswith('k')
size_number = pd.to_numeric(
    size_text
    # 'nan' covers re-running the cell once Size already contains NaN.
    .replace({'Varies with device': np.nan, 'nan': np.nan})
    .str.rstrip('Mk')
    .str.replace(',', '', regex=False)
)
google_playstore_df['Size'] = size_number.where(~size_in_kb, size_number / 1000)
# Scatter plot of app size against log2 installs, coloured by Type.
plt.figure(figsize=(14,7))
# x/y are passed by keyword because current seaborn releases reject
# positional data arguments.
sns.scatterplot(x='Size', y='log_installs', hue='Type', data=google_playstore_df);
# log2(0) is -inf for apps with zero installs; show those at 0 instead.
# Replacing -inf explicitly (rather than "whatever the minimum is") leaves
# genuine values untouched when no -inf is present.
google_playstore_df['log_installs'] = google_playstore_df['log_installs'].replace(-np.inf, 0)
plt.title("Distribution of Logrithm of Installs")
plt.hist(google_playstore_df['log_installs']);
# Size is NaN for apps whose size varies with device. Those rows are left
# out of the histogram instead of being overwritten with 0, which would
# record them as 0 MB apps and add an artificial spike at zero.
plt.title("Distribution of Size")
plt.hist(google_playstore_df['Size'].dropna());
# Read the user reviews from the same download folder as the apps table.
# NOTE(review): assumes googleplaystore_user_reviews.csv sits next to
# googleplaystore.csv inside google-play-store-apps/ - confirm.
user_reviews_df = pd.read_csv('google-play-store-apps/googleplaystore_user_reviews.csv')
user_reviews_df
# Default (inner) join on the app name.
merged_df = google_playstore_df.merge(user_reviews_df, on="App")
merged_df
# Number of rows for every (Category, Sentiment) combination.
category_sentiment = merged_df.groupby(['Category','Sentiment']).size().reset_index(name='Sentiment Count')
category_sentiment
# log2 of the counts, computed from category_sentiment itself so the cell
# does not depend on a variable that is never defined in the notebook.
category_sentiment['log_sentiment_count'] = np.log2(category_sentiment['Sentiment Count'])
# Grouped bar chart: log2 of the sentiment counts per category.
plt.figure(figsize=(14,7))
# x/y are passed by keyword because current seaborn releases reject
# positional data arguments.
sns.barplot(x='Category', y='log_sentiment_count', hue='Sentiment', data=category_sentiment)
# Labels are set after plotting so seaborn's automatic axis labels cannot
# replace them. The y label and title describe the sentiment counts that are
# plotted here, not installs.
plt.xticks(rotation=65,fontsize=9)
plt.xlabel("Category")
plt.ylabel("Sentiment count (log2)")
plt.title("Number of reviews per sentiment according to Category");
# bb = merged_df[merged_df['Sentiment_Subjectivity']>0.5]
# bb
# cc = bb.groupby(['Category'])['Sentiment_Subjectivity'].mean().reset_index(name='Subjectivity Aggregate')
# Histogram of the non-missing Sentiment_Subjectivity values. The title
# names the column that is actually plotted.
plt.title("Distribution of Sentiment Subjectivity")
plt.hist(merged_df['Sentiment_Subjectivity'].dropna())
plt.show()
# Mean polarity and subjectivity per category. The two columns are selected
# with a list (double brackets); selecting several groupby columns with a
# bare tuple is no longer accepted by pandas.
sentimet_subjectivity_polarity = (
    merged_df
    .groupby(['Category'])[['Sentiment_Polarity', 'Sentiment_Subjectivity']]
    .mean()
    .reset_index()
    .rename(columns={
        'Sentiment_Polarity': 'Sentiment_polarity_avg',
        'Sentiment_Subjectivity': 'sentiment_subjectivity_avg',
    })
)
# Line chart of the per-category averages: polarity as blue squares with a
# solid line, subjectivity as red circles with a dashed line.
plt.figure(figsize=(14, 7))
plt.xticks(rotation=65, fontsize=9)
category_axis = sentimet_subjectivity_polarity['Category']
for avg_column, line_format in (
    ('Sentiment_polarity_avg', 's-b'),
    ('sentiment_subjectivity_avg', 'o--r'),
):
    plt.plot(category_axis, sentimet_subjectivity_polarity[avg_column], line_format)
plt.title("Does sentiment_subjectivity proportional to sentiment_polarity")
plt.legend(['Sentiment_polarity_avg', 'sentiment_subjectivity_avg']);
# Scatter plot of subjectivity against polarity, one point per merged row.
plt.figure(figsize=(14,7))
# x/y are passed by keyword because current seaborn releases reject
# positional data arguments.
sns.scatterplot(x='Sentiment_Subjectivity', y='Sentiment_Polarity', data=merged_df)
plt.title("Does sentiment_subjectivity proportional to sentiment_polarity");
# Scatter plot of the per-category average subjectivity against the
# per-category average polarity.
plt.figure(figsize=(14,7))
# x/y are passed by keyword because current seaborn releases reject
# positional data arguments.
sns.scatterplot(
    x='sentiment_subjectivity_avg',
    y='Sentiment_polarity_avg',
    data=sentimet_subjectivity_polarity,
)
plt.title("Does sentiment_subjectivity proportional to sentiment_polarity");
!pip install wordcloud -q
# Word cloud built from every non-missing translated review.
review_notnull = merged_df[merged_df['Translated_Review'].notnull()]
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# Join the reviews into one string. Calling str() on a large NumPy array
# gives an abbreviated representation ("first few ... last few"), which
# would feed only a handful of reviews to the word cloud.
text = ' '.join(review_notnull['Translated_Review'].astype(str))
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(text.lower())
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
# Share of each sentiment label. plt.pie needs numeric wedge sizes, so count
# the labels first. value_counts() skips missing sentiments, and its index
# supplies labels in the same order as the wedge sizes.
sentiment_counts = merged_df['Sentiment'].value_counts()
my_labels = sentiment_counts.index
plt.pie(sentiment_counts, labels=my_labels, autopct='%1.1f%%')
plt.title('Review Sentiments')
plt.axis('equal')
plt.show()
!pip install jovian --upgrade -q
# Save the notebook to Jovian.
import jovian
jovian.commit()
# Smallest review count. Convert explicitly so the minimum is numeric even
# when Reviews is still stored as text (it is read as an object column).
pd.to_numeric(google_playstore_df['Reviews']).min()
# Distinct category names.
google_playstore_df['Category'].unique()