The objective of this project is to deliver insights into customer demand and thus help developers popularize their products. The dataset is taken from Kaggle and describes roughly 10k Play Store apps for analyzing the Android market. It consists of 10841 rows and 13 columns in total.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the Play Store app listing; the relative path means the CSV is read
# from the current working directory.
google_playstore_df = pd.read_csv('googleplaystore.csv')
# Show the column labels of the loaded frame.
google_playstore_df.columns
Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
'Android Ver'],
dtype='object')
# (rows, columns) of the raw frame.
google_playstore_df.shape
(10841, 13)
# Per-column dtype and non-null counts, used to spot columns that need cleaning.
google_playstore_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 App 10841 non-null object
1 Category 10840 non-null object
2 Rating 9367 non-null float64
3 Reviews 10841 non-null int64
4 Size 10841 non-null object
5 Installs 10841 non-null object
6 Type 10840 non-null object
7 Price 10841 non-null object
8 Content Rating 10841 non-null object
9 Genres 10840 non-null object
10 Last Updated 10841 non-null object
11 Current Ver 10833 non-null object
12 Android Ver 10839 non-null object
dtypes: float64(1), int64(1), object(11)
memory usage: 1.1+ MB
# Summary statistics of the numeric columns as loaded.
google_playstore_df.describe()

# Remove the record whose Category is missing (info() on the raw frame reports
# exactly one such row). Selecting it by its null Category instead of by a
# hard-coded position keeps this correct if the file is re-sorted or updated,
# and doing it first keeps the row out of the numeric conversions below.
google_playstore_df.dropna(subset=['Category'], inplace=True)

# 'Last Updated' is read as text; parse it into real timestamps.
google_playstore_df['Last Updated'] = pd.to_datetime(google_playstore_df['Last Updated'])

# 'Installs' carries a trailing '+' and thousands separators: strip both
# before converting to a number.
google_playstore_df['Installs'] = pd.to_numeric(
    google_playstore_df['Installs'].str.rstrip('+').str.replace(',', '', regex=False)
)

# regex=False is explicit on purpose: '$' is a regex anchor, and the default
# of `regex` differs between pandas versions.
google_playstore_df['Price'] = pd.to_numeric(
    google_playstore_df['Price'].str.replace('$', '', regex=False)
)

# Strip spaces from category labels.
google_playstore_df['Category'] = google_playstore_df['Category'].str.replace(' ', '', regex=False)

# Confirm the dtypes after conversion.
google_playstore_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10840 entries, 0 to 10840
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 App 10840 non-null object
1 Category 10840 non-null object
2 Rating 9366 non-null float64
3 Reviews 10840 non-null int64
4 Size 10840 non-null object
5 Installs 10840 non-null int64
6 Type 10839 non-null object
7 Price 10840 non-null float64
8 Content Rating 10840 non-null object
9 Genres 10840 non-null object
10 Last Updated 10840 non-null datetime64[ns]
11 Current Ver 10832 non-null object
12 Android Ver 10838 non-null object
dtypes: datetime64[ns](1), float64(2), int64(2), object(8)
memory usage: 1.2+ MB
# log2 of the install count. Apps with 0 installs are clipped to 1 first so
# they map to log2(1) = 0 instead of -inf (which also raised a divide-by-zero
# RuntimeWarning).
google_playstore_df['log_installs'] = np.log2(google_playstore_df['Installs'].clip(lower=1))
C:\Users\SinghRit\Anaconda3\lib\site-packages\pandas\core\series.py:679: RuntimeWarning: divide by zero encountered in log2
result = getattr(ufunc, method)(*inputs, **kwargs)
# True when at least one app name occurs on more than one row.
boolean = google_playstore_df.duplicated(subset=['App']).any()
boolean
True
# Number of rows per app name, most repeated first.
google_playstore_df.App.value_counts()
ROBLOX 9
CBS Sports App - Scores, News, Stats & Watch Live 8
Duolingo: Learn Languages Free 7
ESPN 7
Candy Crush Saga 7
..
Art Pixel Coloring. Color by Number. 1
The dollar in mexico 1
Space Coast CU Mobile 1
ECナビ×シュフー 1
GPS Speedometer, Distance Meter 1
Name: App, Length: 9659, dtype: int64
# Look at the repeated rows of one heavily duplicated app.
google_playstore_df.loc[google_playstore_df['App'].eq('ROBLOX')]
# First discard rows that are exact copies of each other ...
google_playstore_df = google_playstore_df.drop_duplicates()
# ... then keep a single row per app: the one holding the largest review count.
most_reviewed_rows = google_playstore_df.groupby('App')['Reviews'].idxmax()
google_playstore_df = google_playstore_df.loc[most_reviewed_rows]
# Global look of every figure below: seaborn dark grid, larger font,
# 9x5 default canvas and a fully transparent figure background.
sns.set_style('darkgrid')
plt.rcParams.update({
    'font.size': 14,
    'figure.figsize': (9, 5),
    'figure.facecolor': '#00000000',
})
# Number of apps per genre, most common first, as columns ['Genres', 'Count'].
# The index is named explicitly and the counts column is named through
# reset_index(name=...) so the result does not depend on the column names
# value_counts() produces, which differ between pandas versions.
top_genres = (
    google_playstore_df['Genres']
    .value_counts()
    .rename_axis('Genres')
    .reset_index(name='Count')
)
top_genres
# Total installs per genre (indexed by 'Genres').
genres_installs = google_playstore_df.groupby(['Genres'])[['Installs']].sum()
genres_installs
# Attach the install totals to the app counts; row order follows top_genres,
# so the first 20 rows are the 20 genres with the most apps.
top_genres_installs = pd.merge(top_genres, genres_installs, on='Genres')
top_20_genres_installs = top_genres_installs.head(20)
top_genres_installs
# Bar chart: number of apps in each of the 20 most common genres.
plt.figure(figsize=(14, 7))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.barplot(x='Genres', y='Count', data=top_20_genres_installs)
# Labels are applied after drawing because barplot sets the axis labels from
# the column names and would otherwise replace them.
plt.xlabel("Genres")
plt.ylabel("Number of application")
plt.title("Top 20 Genres")
plt.xticks(rotation=65);
<matplotlib.axes._subplots.AxesSubplot at 0x270c1fad308>
# Bar chart: total installs for the 20 genres with the most apps.
plt.figure(figsize=(14, 7))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.barplot(x='Genres', y='Installs', data=top_20_genres_installs)
# Labels are applied after drawing because barplot sets the axis labels from
# the column names and would otherwise replace them.
plt.xlabel("Genres")
plt.ylabel("Installs")
plt.title("Installs according to Genres")
plt.xticks(rotation=65);
<matplotlib.axes._subplots.AxesSubplot at 0x270c296d408>
# Mean rating per genre (NaN for genres in which no app has a rating).
genres_ratings_df = google_playstore_df.groupby(['Genres'])[['Rating']].mean()
genres_ratings_df
# Combine app count, total installs and mean rating per genre.
genres_installs_ratings = pd.merge(top_genres_installs, genres_ratings_df, on='Genres')
genres_installs_ratings.head()
# Best rated first; genres without a rating sort to the end.
genres_installs_ratings.sort_values('Rating', ascending=False, inplace=True)
# Drop unrated genres BEFORE slicing. Taking the last 20 rows first and
# filtering afterwards let NaN rows use up slots, leaving fewer than 20
# genuinely lowest-rated genres.
rated_genres = genres_installs_ratings.dropna(subset=['Rating'])
highest_rated_genres = rated_genres.iloc[:20]
lowest_rated_genres = rated_genres.iloc[-20:]
# Bar chart: mean rating of the 20 best rated genres.
plt.figure(figsize=(14, 7))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.barplot(x='Genres', y='Rating', data=highest_rated_genres)
# Labels are applied after drawing because barplot sets the axis labels from
# the column names and would otherwise replace them.
plt.xlabel("Genres")
plt.ylabel("Rating")
plt.title("Ratings according to Genres")
plt.xticks(rotation=65);
<matplotlib.axes._subplots.AxesSubplot at 0x270c2835348>
# Bar chart: mean rating of the lowest rated genres.
plt.figure(figsize=(14, 7))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.barplot(x='Genres', y='Rating', data=lowest_rated_genres)
# Labels are applied after drawing because barplot sets the axis labels from
# the column names and would otherwise replace them.
plt.xlabel("Genres")
plt.ylabel("Rating")
plt.title("Ratings according to Genres")
plt.xticks(rotation=65);
<matplotlib.axes._subplots.AxesSubplot at 0x270c2907c08>
# Flag the apps that belong to the GAME category.
google_playstore_df['Gaming Category App'] = google_playstore_df['Category'].eq('GAME')
# Total installs for every (Category, Type) pair, flattened back to columns.
categoty_type_installs = (
    google_playstore_df
    .groupby(['Category', 'Type'])[['Installs']]
    .sum()
    .reset_index()
)
categoty_type_installs
# Add the log2 of each total as a separate column.
categoty_type_installs = categoty_type_installs.assign(
    log_Installs=np.log2(categoty_type_installs['Installs'])
)
categoty_type_installs
# Grouped bar chart: log2 of total installs per category, split by app Type.
plt.figure(figsize=(14, 7))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.barplot(x='Category', y='log_Installs', hue='Type', data=categoty_type_installs)
# Labels are applied after drawing because barplot sets the axis labels from
# the column names. The y label states the log scale since the bars show
# log_Installs, not raw install counts.
plt.xlabel("Category")
plt.ylabel("log2(Installs)")
plt.title("Number of installs type wise according to Category")
plt.xticks(rotation=65, fontsize=9);
# Convert the textual 'Size' column to a float number of megabytes.
#   '19M'                 -> 19.0
#   '201k'                -> 0.201   (kilobytes divided by 1000, as before)
#   'Varies with device'  -> NaN
# Thousands separators are stripped up front; the previous comma-stripping
# step tested for 'M' after 'M' had already been removed, so it never ran.
# Going through str makes the cell safe to re-run on an already numeric column.
# NOTE: any other unparseable text now becomes NaN instead of raising.
raw_size = google_playstore_df['Size'].astype(str).str.replace(',', '', regex=False)
size_number = pd.to_numeric(raw_size.str.rstrip('Mk'), errors='coerce')
in_kilobytes = raw_size.str.endswith('k')
google_playstore_df['Size'] = size_number.where(~in_kilobytes, size_number / 1000)
# Scatter of app size against log2 installs, coloured by app Type.
plt.figure(figsize=(14, 7))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.scatterplot(x='Size', y='log_installs', hue='Type', data=google_playstore_df)
<matplotlib.axes._subplots.AxesSubplot at 0x270c2907708>
# Apps with 0 installs have log2(0) = -inf; map exactly those entries to 0 so
# they can be binned. Targeting -inf explicitly (rather than "whatever the
# column minimum is") leaves genuine values untouched when no -inf is present.
google_playstore_df['log_installs'] = google_playstore_df['log_installs'].replace(-np.inf, 0)
plt.title("Distribution of Logrithm of Installs")
plt.hist(google_playstore_df['log_installs']);
# Entries with a missing size are counted as size 0 so the histogram can be
# drawn over the whole column.
google_playstore_df['Size'] = google_playstore_df['Size'].fillna(0)
plt.title("Distribution of Size")
plt.hist(google_playstore_df['Size']);
# Load the user reviews and attach them to the app data by app name
# (default inner join: only apps present in both tables remain).
user_reviews_df = pd.read_csv('googleplaystore_user_reviews.csv')
merged_df = pd.merge(google_playstore_df, user_reviews_df, on="App")
# Number of reviews for each (Category, Sentiment) pair.
category_sentiment = (
    merged_df
    .groupby(['Category', 'Sentiment'])
    .size()
    .rename('Sentiment Count')
    .reset_index()
)
category_sentiment
# Add the log2 of each count as a separate column.
category_sentiment = category_sentiment.assign(
    log_sentiment_count=np.log2(category_sentiment['Sentiment Count'])
)
# Grouped bar chart: log2 of the review count per category, split by sentiment.
plt.figure(figsize=(14, 7))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.barplot(x='Category', y='log_sentiment_count', hue='Sentiment', data=category_sentiment)
# Labels are applied after drawing because barplot sets the axis labels from
# the column names. The y label and title now describe what is plotted
# (review counts per sentiment); they had been copied from the installs chart.
plt.xlabel("Category")
plt.ylabel("log2(Sentiment Count)")
plt.title("Number of reviews per sentiment according to Category")
plt.xticks(rotation=65, fontsize=9);
# Histogram of review subjectivity; rows without a score are left out.
# The title names the plotted column (it previously read "Distribution of
# Size", carried over from the size histogram).
plt.title("Distribution of Sentiment Subjectivity")
plt.hist(merged_df['Sentiment_Subjectivity'].dropna())
plt.show()
# Mean polarity and mean subjectivity of the reviews in each category.
# The two columns are selected with a list: selecting several groupby columns
# as bare comma-separated keys triggers a FutureWarning and is rejected by
# newer pandas.
sentimet_subjectivity_polarity = (
    merged_df
    .groupby(['Category'])[['Sentiment_Polarity', 'Sentiment_Subjectivity']]
    .mean()
    .reset_index()
    .rename(columns={
        'Sentiment_Polarity': 'Sentiment_polarity_avg',
        'Sentiment_Subjectivity': 'sentiment_subjectivity_avg',
    })
)
C:\Users\SinghRit\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
"""Entry point for launching an IPython kernel.
# Line chart comparing, per category, the average polarity (blue squares,
# solid line) with the average subjectivity (red circles, dashed line).
plt.figure(figsize=(14, 7))
plt.xticks(rotation=65, fontsize=9)
category_axis = sentimet_subjectivity_polarity['Category']
for avg_column, line_format in (
    ('Sentiment_polarity_avg', 's-b'),
    ('sentiment_subjectivity_avg', 'o--r'),
):
    plt.plot(category_axis, sentimet_subjectivity_polarity[avg_column], line_format)
plt.title("Does sentiment_subjectivity proportional to sentiment_polarity")
plt.legend(['Sentiment_polarity_avg', 'sentiment_subjectivity_avg']);
# Scatter of every review: subjectivity on x against polarity on y.
plt.figure(figsize=(14, 7))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally. The axis labels come from the column names.
sns.scatterplot(x='Sentiment_Subjectivity', y='Sentiment_Polarity', data=merged_df)
plt.title("Does sentiment_subjectivity proportional to sentiment_polarity")
Text(0.5, 1.0, 'Does sentiment_subjectivity proportional to sentiment_polarity')