import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set
<function seaborn.rcmod.set(context='notebook', style='darkgrid', palette='deep', font='sans-serif', font_scale=1, color_codes=True, rc=None)>
# Import the daily launch-event logs of May 2019 (one JSON-lines file per
# day) and merge them into a single DataFrame.
# A loop over the 31 days replaces the original 31 hand-written
# `df_N = pd.read_json(...)` statements and the 31-element concat list —
# same files, same order, same result, far less duplication.
daily_frames = [
    pd.read_json(f'2019-05-{day:02d}.jsonl', lines=True)
    for day in range(1, 32)
]
df = pd.concat(daily_frames, ignore_index=True, sort=True)
df
# Number of distinct repository specs launched during May 2019.
unique_project_count = df['spec'].nunique()
unique_project_count
6180
# Launch count per repository spec; show the ten most popular ones.
# `value_counts()` is the idiomatic replacement for grouping a column by
# itself and counting it — one call, already sorted in descending order.
most_popular_projects = df['spec'].value_counts()
most_popular_projects.nlargest(10)
spec
ipython/ipython-in-depth/master 183362
jupyterlab/jupyterlab-demo/master 34704
DS-100/textbook/master 21773
ines/spacy-io-binder/live 18498
bokeh/bokeh-notebooks/master 8916
ines/spacy-course/binder 6108
binder-examples/r/master 5703
binder-examples/requirements/master 5402
rationalmatter/juno-demo-notebooks/master 5153
QuantStack/xeus-cling/stable 4512
Name: spec, dtype: int64
# List each distinct hosting service ("provider") present in the data.
df.provider.unique()
array(['GitHub', 'GitLab', 'Git', 'Gist'], dtype=object)
# Launch counts broken down by provider and, within each provider,
# by repository spec.
by_provider_and_spec = df.groupby(['provider', 'spec'])
source = by_provider_and_spec['spec'].count()
source
provider spec
Gist AllenDowney/3e0ee50e828cb3a4bc2a720797bb303c/master 1
AustinRochford/505e6a3647c57dbe4bd55a4c311a2a95/master 2
AustinRochford/62c283a3f0fae90b5e39/master 1
BadreeshShetty/bf9cb1dced8263ef997bcb2c3926569b/master 3
BenLangmead/6513059/master 1
BenLangmead/7460513/master 1
BhanuTejP/67a0d9bd71af1b17805e0a15de1d0cd7/master 2
Chilipp/5c3f2e5151ce047bac3a0f33c1446304/master 5
CodeForeverZou/9ace4b2b6361ae5c8f49138152ad3f6f/master 1
DPeterK/710382557519cdf2de4976fa23048f4f/master 1
Daitu/48e2684216c5255b4f8a1ca9b58d1770/master 2
ELC/380e584b87227b15727ec886223d9d4a/master 708
ELC/8fdc0f490b3058872a7014f01416dfb6/master 1018
ELC/8fdc0f490b3058872a7014f01416dfb6/master/master 1
EvanZ/48bf713ce9eb14f28d58/master 1
Fil/aec6cbf62f9b71c3407db87d5eb592e7/master 1
Fleetingtimer/34847f2269b19012139b61c3fb00623c 1
GNOBIS/6fd203a8ac266d9cfe891d65443002c4/master 1
ManushiM/a6dd82a92b5a671b5465bf15a706a1a9/master 1
NikkiAnzalone/5283c7d93e4637eca38af9f71a2909c2/master 2
SantiagoSalazarPavajeau/fa9bbdfa26ba02af0f3e31f30b4aaa58 1
Sergei24/da47df0d015f8416e9bde5f002144d3b/master 1
SharafutdinovRuslan/f8bb5520fd87c2375aa31f0cf6b24aeb/master 1
SirPrime/98ebc21ee20c59f5c052cfae007d615f/master 8
TimShawver/8fcef51dd3c222ed25306c002ab89b60/master 2
Z30G0D/a52c1654e03df212bfab2842f472b302/master 2
ZhangDepeng/fc299b6b77282b6479823173e4d8ebf9/master 7
aboSamoor/6046170/master 1
alxgrh/a3b598c3c7ea39c4c5c5356eca1cecbb/master 1
anixdorf/9769238/master 1
...
GitLab lukicdarkoo%2Fcamera-calibration/master 2
mlysakowski%2Ftest_binder/master 4
mmagg%2Fnotes/62cb41636ec97f4b97ee0046e1a7241e5a47602c 1
nixd%2Fpython-course/binder 17
nixd%2Fpython-starter-course/binder 6
nreveret%2Fnotebooksgit/master 20
oaraque%2Fmft-predictor/master 1
oheoh%2Finfo-notebooks/master 1
oibaf-talks%2Fdatabase-migrations/master 2
open-scientist%2Fformation-data-reproductibilite/master 55
oscar6echo%2Fipyupload-repo2docker/master 4
peteruran%2Fnp-ofdm/develop 5
peteruran%2Fnp-ofdm/master 35
phec.net%2Fplatformdatascience/master 5
pwamej%2Fecg-features-extraction/binder 2
pwamej%2Fecg-features-extraction/master 11
roland_its%2Fits_techtalk_scipy/master 7
rruizz%2Finforfis/R 250
rruizz%2Finforfis/autin 1
rruizz%2Finforfis/master 88
runjaj%2FclasesArduino/master 1
sgmarkets%2Fsgmarkets-api-notebooks/master 8
slloyd%2Fpython-introduction/master 4
snowhitiger%2Flearn_deep_learning/master 44
synw%2Fjmap/master 11
tash_pro%2Fyellow-taxi-demand-prediction-in-ny/master 4
thoma.rey%2FFV_HipoDiff/master 2
ul-fri%2Fovs%2Fpython/master 274
utt-connected-innovation%2Fia-course-2019/master 207
valentin.queloz%2Fjupyther/master 1
Name: spec, Length: 6180, dtype: int64
# Total number of launches contributed by each provider
# (groupby keeps the providers in alphabetical order).
popular_source = df.groupby('provider')['provider'].count()
popular_source
provider
Gist 2239
Git 579
GitHub 404286
GitLab 1979
Name: provider, dtype: int64
# Wrap the per-provider launch counts in a DataFrame for plotting.
df_source = pd.DataFrame(popular_source)
df_source
# Bar chart of launches per provider; a log scale is used because
# GitHub dwarfs the other three providers by two orders of magnitude.
plt.bar(df_source.index, df_source['provider'], color='g')
plt.yscale('log')
plt.title('Distribution of Sources')
plt.xlabel('Source Name')
plt.ylabel('Projects Undertaken (log scale)')
Text(0,0.5,'Projects Undertaken (log scale)')
# Split the timestamp into separate date and time columns.
# The vectorized `.dt` accessor replaces the original per-row Python list
# comprehensions ([d.date() for d in ...]) — same values, no Python loop.
df['new_date'] = df['timestamp'].dt.date
df['new_time'] = df['timestamp'].dt.time
df
# Five calendar days with the highest number of launches.
launches_per_day = df.groupby(['new_date'])['new_date'].count()
launches_per_day.nlargest(5)
new_date
2019-05-22 18627
2019-05-08 17393
2019-05-14 17309
2019-05-09 16657
2019-05-13 16635
Name: new_date, dtype: int64
# Clock times (to the minute) at which the most launches started.
launches_per_time = df.groupby(['new_time'])['new_time'].count()
launches_per_time.nlargest(5)
new_time
14:50:00 497
13:41:00 481
14:40:00 479
09:30:00 477
13:42:00 477
Name: new_time, dtype: int64
# Summary of the merged frame: column dtypes, non-null counts and
# approximate memory footprint.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 409083 entries, 0 to 409082
Data columns (total 17 columns):
provider 409083 non-null object
schema 409083 non-null object
spec 409083 non-null object
status 409083 non-null object
timestamp 409083 non-null datetime64[ns]
version 409083 non-null int64
new_date 409083 non-null object
new_time 409083 non-null object
time_str 409083 non-null object
date_str 409083 non-null object
hour 409083 non-null float64
minute 409083 non-null float64
seconds 409083 non-null float64
year 409083 non-null float64
month 409083 non-null float64
date 409083 non-null float64
Cumulative_Frequency_hour 409083 non-null float64
dtypes: datetime64[ns](1), float64(7), int64(1), object(8)
memory usage: 53.1+ MB
# Break the date and time down into numeric components
# (year/month/date and hour/minute/seconds).
# The `.dt` accessor replaces the original approach of casting to string
# and splitting on ':' / '-'; `astype(float)` keeps the column dtypes
# identical to the previous implementation.
df['time_str'] = df['new_time'].astype(str)
df['date_str'] = df['new_date'].astype(str)
df['hour'] = df['timestamp'].dt.hour.astype(float)
df['minute'] = df['timestamp'].dt.minute.astype(float)
df['seconds'] = df['timestamp'].dt.second.astype(float)
df['year'] = df['timestamp'].dt.year.astype(float)
df['month'] = df['timestamp'].dt.month.astype(float)
df['date'] = df['timestamp'].dt.day.astype(float)
df
# Histogram of launches across the 24 hours of the day.
df.hist(column='hour', bins=24, rwidth=0.9, color='y')
plt.title('Distribution on the basis of time')
plt.xlabel('Time (in hours)')
plt.ylabel('No. of projects undertaken')
Text(0.5,1,'Distribution on the basis of time')
# Hour of the day with the most launches.
df.hour.mode()
0 13.0
dtype: float64
# Day of the month with the most launches in May 2019.
df.date.mode()
0 22.0
dtype: float64
# Histogram of launches across the days of May 2019.
df.hist(column='date', bins=30, rwidth=0.9, color='y')
plt.title('Distribution on the basis of date')
plt.xlabel('Date')
plt.ylabel('No. of Projects undertaken')
Text(0.5,1,'Distribution on the basis of date')
# Density view of launches across the hours of the day.
# `sns.distplot` was deprecated in seaborn 0.11 and removed in 0.14;
# `histplot(..., kde=True, stat='density')` is the documented migration
# that reproduces its default histogram-plus-KDE output.
sns.histplot(df.hour, kde=True, stat='density')
<matplotlib.axes._subplots.AxesSubplot at 0x1f39c7c4f98>
# Density view of launches across the days of the month.
# `sns.distplot` was deprecated in seaborn 0.11 and removed in 0.14;
# `histplot(..., kde=True, stat='density')` is the documented migration
# that reproduces its default histogram-plus-KDE output.
sns.histplot(df.date, kde=True, stat='density')
<matplotlib.axes._subplots.AxesSubplot at 0x1f39c8f26d8>
# Print the analysis conclusions. Adjacent string literals concatenate at
# compile time, so the printed text is byte-identical to the original
# single-literal version.
print(
    'Summary of the Data:\n'
    'From the data it was observed that the Github was the most popular source provider among the 4 providers that are Gist, Git, Github and Gitlab.\n'
    'The most common project is ipython/ipython-in-depth/master.\n'
    'The maximum number of projects that is 18627 were done on 22nd in the month of May, 2019.\n'
    'The most common time round the clock when the maximum number of projects started in May, 2019 is 14:50.'
)
Summary of the Data:
From the data it was observed that the Github was the most popular source provider among the 4 providers that are Gist, Git, Github and Gitlab.
The most common project is ipython/ipython-in-depth/master.
The maximum number of projects that is 18627 were done on 22nd in the month of May, 2019.
The most common time round the clock when the maximum number of projects started in May, 2019 is 14:50.