Jovian
⭐️
Sign In
In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set
Out[20]:
<function seaborn.rcmod.set(context='notebook', style='darkgrid', palette='deep', font='sans-serif', font_scale=1, color_codes=True, rc=None)>
In [45]:
# importing files of May 2019
df_1 = pd.read_json('2019-05-01.jsonl', lines = True)
df_2 = pd.read_json('2019-05-02.jsonl', lines = True)
df_3 = pd.read_json('2019-05-03.jsonl', lines = True)
df_4 = pd.read_json('2019-05-04.jsonl', lines = True)
df_5 = pd.read_json('2019-05-05.jsonl', lines = True)
df_6 = pd.read_json('2019-05-06.jsonl', lines = True)
df_7 = pd.read_json('2019-05-07.jsonl', lines = True)
df_8 = pd.read_json('2019-05-08.jsonl', lines = True)
df_9 = pd.read_json('2019-05-09.jsonl', lines = True)
df_10 = pd.read_json('2019-05-10.jsonl', lines = True)
df_11 = pd.read_json('2019-05-11.jsonl', lines = True)
df_12 = pd.read_json('2019-05-12.jsonl', lines = True)
df_13 = pd.read_json('2019-05-13.jsonl', lines = True)
df_14 = pd.read_json('2019-05-14.jsonl', lines = True)
df_15 = pd.read_json('2019-05-15.jsonl', lines = True)
df_16 = pd.read_json('2019-05-16.jsonl', lines = True)
df_17 = pd.read_json('2019-05-17.jsonl', lines = True)
df_18 = pd.read_json('2019-05-18.jsonl', lines = True)
df_19 = pd.read_json('2019-05-19.jsonl', lines = True)
df_20 = pd.read_json('2019-05-20.jsonl', lines = True)
df_21 = pd.read_json('2019-05-21.jsonl', lines = True)
df_22 = pd.read_json('2019-05-22.jsonl', lines = True)
df_23 = pd.read_json('2019-05-23.jsonl', lines = True)
df_24 = pd.read_json('2019-05-24.jsonl', lines = True)
df_25 = pd.read_json('2019-05-25.jsonl', lines = True)
df_26 = pd.read_json('2019-05-26.jsonl', lines = True)
df_27 = pd.read_json('2019-05-27.jsonl', lines = True)
df_28 = pd.read_json('2019-05-28.jsonl', lines = True)
df_29 = pd.read_json('2019-05-29.jsonl', lines = True)
df_30 = pd.read_json('2019-05-30.jsonl', lines = True)
df_31 = pd.read_json('2019-05-31.jsonl', lines = True)
In [22]:
# merge all the imported files
df = pd.concat([df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10, df_11, df_12, df_13, df_14, df_15, df_16, df_17, df_18, df_19, df_20, df_21, df_22, df_23, df_24, df_25, df_26, df_27, df_28, df_29, df_30, df_31], ignore_index = True, sort = True)
df
Out[22]:
In [46]:
# to find the no. of unique projects across May, 2019
df.spec.nunique()
Out[46]:
6180
In [23]:
# to find the most popular projects in the data and their count
most_popular_projects = df.groupby(['spec'])['spec'].count()
most_popular_projects.nlargest(10)
Out[23]:
spec
ipython/ipython-in-depth/master              183362
jupyterlab/jupyterlab-demo/master             34704
DS-100/textbook/master                        21773
ines/spacy-io-binder/live                     18498
bokeh/bokeh-notebooks/master                   8916
ines/spacy-course/binder                       6108
binder-examples/r/master                       5703
binder-examples/requirements/master            5402
rationalmatter/juno-demo-notebooks/master      5153
QuantStack/xeus-cling/stable                   4512
Name: spec, dtype: int64
In [24]:
# to find the different sources from where the projects were undertaken
df['provider'].unique()
Out[24]:
array(['GitHub', 'GitLab', 'Git', 'Gist'], dtype=object)
In [64]:
# to group on the basis of no. of projects from a particular source
source = df.groupby(['provider', 'spec'])['spec'].count()
source
Out[64]:
provider  spec                                                       
Gist      AllenDowney/3e0ee50e828cb3a4bc2a720797bb303c/master               1
          AustinRochford/505e6a3647c57dbe4bd55a4c311a2a95/master            2
          AustinRochford/62c283a3f0fae90b5e39/master                        1
          BadreeshShetty/bf9cb1dced8263ef997bcb2c3926569b/master            3
          BenLangmead/6513059/master                                        1
          BenLangmead/7460513/master                                        1
          BhanuTejP/67a0d9bd71af1b17805e0a15de1d0cd7/master                 2
          Chilipp/5c3f2e5151ce047bac3a0f33c1446304/master                   5
          CodeForeverZou/9ace4b2b6361ae5c8f49138152ad3f6f/master            1
          DPeterK/710382557519cdf2de4976fa23048f4f/master                   1
          Daitu/48e2684216c5255b4f8a1ca9b58d1770/master                     2
          ELC/380e584b87227b15727ec886223d9d4a/master                     708
          ELC/8fdc0f490b3058872a7014f01416dfb6/master                    1018
          ELC/8fdc0f490b3058872a7014f01416dfb6/master/master                1
          EvanZ/48bf713ce9eb14f28d58/master                                 1
          Fil/aec6cbf62f9b71c3407db87d5eb592e7/master                       1
          Fleetingtimer/34847f2269b19012139b61c3fb00623c                    1
          GNOBIS/6fd203a8ac266d9cfe891d65443002c4/master                    1
          ManushiM/a6dd82a92b5a671b5465bf15a706a1a9/master                  1
          NikkiAnzalone/5283c7d93e4637eca38af9f71a2909c2/master             2
          SantiagoSalazarPavajeau/fa9bbdfa26ba02af0f3e31f30b4aaa58          1
          Sergei24/da47df0d015f8416e9bde5f002144d3b/master                  1
          SharafutdinovRuslan/f8bb5520fd87c2375aa31f0cf6b24aeb/master       1
          SirPrime/98ebc21ee20c59f5c052cfae007d615f/master                  8
          TimShawver/8fcef51dd3c222ed25306c002ab89b60/master                2
          Z30G0D/a52c1654e03df212bfab2842f472b302/master                    2
          ZhangDepeng/fc299b6b77282b6479823173e4d8ebf9/master               7
          aboSamoor/6046170/master                                          1
          alxgrh/a3b598c3c7ea39c4c5c5356eca1cecbb/master                    1
          anixdorf/9769238/master                                           1
                                                                         ... 
GitLab    lukicdarkoo%2Fcamera-calibration/master                           2
          mlysakowski%2Ftest_binder/master                                  4
          mmagg%2Fnotes/62cb41636ec97f4b97ee0046e1a7241e5a47602c            1
          nixd%2Fpython-course/binder                                      17
          nixd%2Fpython-starter-course/binder                               6
          nreveret%2Fnotebooksgit/master                                   20
          oaraque%2Fmft-predictor/master                                    1
          oheoh%2Finfo-notebooks/master                                     1
          oibaf-talks%2Fdatabase-migrations/master                          2
          open-scientist%2Fformation-data-reproductibilite/master          55
          oscar6echo%2Fipyupload-repo2docker/master                         4
          peteruran%2Fnp-ofdm/develop                                       5
          peteruran%2Fnp-ofdm/master                                       35
          phec.net%2Fplatformdatascience/master                             5
          pwamej%2Fecg-features-extraction/binder                           2
          pwamej%2Fecg-features-extraction/master                          11
          roland_its%2Fits_techtalk_scipy/master                            7
          rruizz%2Finforfis/R                                             250
          rruizz%2Finforfis/autin                                           1
          rruizz%2Finforfis/master                                         88
          runjaj%2FclasesArduino/master                                     1
          sgmarkets%2Fsgmarkets-api-notebooks/master                        8
          slloyd%2Fpython-introduction/master                               4
          snowhitiger%2Flearn_deep_learning/master                         44
          synw%2Fjmap/master                                               11
          tash_pro%2Fyellow-taxi-demand-prediction-in-ny/master             4
          thoma.rey%2FFV_HipoDiff/master                                    2
          ul-fri%2Fovs%2Fpython/master                                    274
          utt-connected-innovation%2Fia-course-2019/master                207
          valentin.queloz%2Fjupyther/master                                 1
Name: spec, Length: 6180, dtype: int64
In [26]:
# to find the no. of projects provided by different sources
popular_source = df.groupby(['provider'])['provider'].count()
popular_source
Out[26]:
provider
Gist        2239
Git          579
GitHub    404286
GitLab      1979
Name: provider, dtype: int64
In [27]:
# creating a dataframe for the no. of projects by a particular source
df_source = pd.DataFrame(popular_source)
df_source
Out[27]:
In [28]:
# data visualization using matplotlib bar graph for the no. of projects from different sources
plt.bar(df_source.index, df_source['provider'], color = 'g')
plt.yscale('log')
plt.title('Distribution of Sources')
plt.xlabel('Source Name')
plt.ylabel('Projects Undertaken (log scale)')
Out[28]:
Text(0,0.5,'Projects Undertaken (log scale)')
Notebook Image
In [29]:
# splitting timestamp into date and time in different columns
df['new_date'] = [d.date() for d in df['timestamp']]
df['new_time'] = [d.time() for d in df['timestamp']]
df
Out[29]:
In [30]:
# top 5 days when the maximum no. of projects were done
most_popular_date = df.groupby(['new_date'])['new_date'].count()
most_popular_date.nlargest(5)
Out[30]:
new_date
2019-05-22    18627
2019-05-08    17393
2019-05-14    17309
2019-05-09    16657
2019-05-13    16635
Name: new_date, dtype: int64
In [31]:
# most common time when the projects start
most_popular_time = df.groupby(['new_time'])['new_time'].count()
most_popular_time.nlargest(5)
Out[31]:
new_time
14:50:00    497
13:41:00    481
14:40:00    479
09:30:00    477
13:42:00    477
Name: new_time, dtype: int64
In [48]:
# info of the data
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 409083 entries, 0 to 409082 Data columns (total 17 columns): provider 409083 non-null object schema 409083 non-null object spec 409083 non-null object status 409083 non-null object timestamp 409083 non-null datetime64[ns] version 409083 non-null int64 new_date 409083 non-null object new_time 409083 non-null object time_str 409083 non-null object date_str 409083 non-null object hour 409083 non-null float64 minute 409083 non-null float64 seconds 409083 non-null float64 year 409083 non-null float64 month 409083 non-null float64 date 409083 non-null float64 Cumulative_Frequency_hour 409083 non-null float64 dtypes: datetime64[ns](1), float64(7), int64(1), object(8) memory usage: 53.1+ MB
In [33]:
# splitting day and time further into year, month, date; hour, minute and second
df['time_str'] = df['new_time'].astype(str)
df['date_str'] = df['new_date'].astype(str)
df[['hour', 'minute', 'seconds']] = df.time_str.str.split(':', expand = True).astype(float)
df[['year', 'month', 'date']] = df.date_str.str.split('-', expand = True).astype(float)
df
Out[33]:
In [34]:
# data visualisation on the basis of the hour in which the project was done
df.hist(column = 'hour', bins = 24, rwidth = 0.9, color = 'y')
plt.xlabel('Time (in hours)')
plt.ylabel('No. of projects undertaken')
plt.title('Distribution on the basis of time')
Out[34]:
Text(0.5,1,'Distribution on the basis of time')
Notebook Image
In [36]:
# most common hour
df['hour'].mode()
Out[36]:
0    13.0
dtype: float64
In [50]:
# most popular date of May, 2019
df['date'].mode()
Out[50]:
0    22.0
dtype: float64
In [49]:
#data visualisation on the basis of the date of May, 2019
df.hist(column = 'date', bins = 30, rwidth = 0.9, color = 'y')
plt.xlabel('Date')
plt.ylabel('No. of Projects undertaken')
plt.title('Distribution on the basis of date')
Out[49]:
Text(0.5,1,'Distribution on the basis of date')
Notebook Image
In [42]:
# using seaborn tools to visualise the data on the basis of time of the day
sns.distplot(df.hour)
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f39c7c4f98>
Notebook Image
In [43]:
# using seaborn tools to visualise the data on the basis of the date of the month
sns.distplot(df.date)
Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f39c8f26d8>
Notebook Image
In [63]:
print('Summary of the Data:\nFrom the data it was observed that the Github was the most popular source provider among the 4 providers that are Gist, Git, Github and Gitlab.\nThe most common project is ipython/ipython-in-depth/master.\nThe maximum number of projects that is 18627 were done on 22nd in the month of May, 2019.\nThe most common time round the clock when the maximum number of projects started in May, 2019 is 14:50.')
Summary of the Data: From the data it was observed that the Github was the most popular source provider among the 4 providers that are Gist, Git, Github and Gitlab. The most common project is ipython/ipython-in-depth/master. The maximum number of projects that is 18627 were done on 22nd in the month of May, 2019. The most common time round the clock when the maximum number of projects started in May, 2019 is 14:50.
In [ ]:
 
In [ ]: