Jovian
⭐️
Sign In
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rd

from sklearn.decomposition import PCA
from sklearn import preprocessing

Creating a sample dataset

In [2]:
# Create 'gene' names as the feature variables
genes = ['gene' + str(i) for i in range(1, 101)]
In [3]:
# Create arrays of sample names
wt = ['wt' + str(i) for i in range(1, 6)]
ko = ['ko' + str(i) for i in range(1, 6)]
In [4]:
data = pd.DataFrame(columns=[*wt, *ko], index=genes)
In [8]:
for gene in data.index:
    data.loc[gene, 'wt1':'wt5'] = np.random.poisson(lam=rd.randrange(10, 1000), size=5)
    data.loc[gene, 'ko1':'ko5'] = np.random.poisson(lam=rd.randrange(10, 1000), size=5)
In [9]:
data.head()
Out[9]:
In [10]:
# Center and scale the data

scaled_data = preprocessing.scale(data.T)
/home/asheesh/.virtualenvs/asheesh/lib/python3.6/site-packages/ipykernel_launcher.py:3: DataConversionWarning: Data with input dtype int64 were all converted to float64 by the scale function. This is separate from the ipykernel package so we can avoid doing imports until
In [12]:
# After centering, the average value for each gene will be 0, and
# After scaling, the standard deviation for the values for each gene will be 1
# scaling is done using the formula: (actual - mean)^2 / (total measurements) 
scaled_data
Out[12]:
array([[ 1.02177694,  1.06422454,  0.9388512 ,  0.98086032,  1.40578594,
         1.21212921,  1.24443633,  1.04316977,  1.1335687 ,  0.93505589,
        -0.5872093 , -1.06660143, -1.44330622, -1.38185252, -1.38820627,
         0.96385943,  0.73407857, -0.98525538, -0.13461786,  0.70750421,
        -1.22620526, -0.98515248,  0.99047735, -1.10388271,  1.07757813,
        -0.67584327,  0.97493339,  1.2024365 , -1.04541291, -1.06244025,
         1.1516365 , -0.99859395, -0.9933259 ,  0.9875177 ,  1.18484737,
        -0.97302446,  0.98510514,  0.92517037,  0.98713715,  0.90678381,
        -1.07441646,  0.9241073 ,  0.40487855, -1.16722656,  1.12171213,
         0.85615653, -0.94768859, -1.03715219,  1.08750494,  1.19088981,
        -0.98253635,  1.00766795,  1.44769362, -0.98133057, -0.97652724,
        -0.91948636, -1.03304059, -1.16476285,  1.08499648, -1.06343736,
        -0.82699186,  0.99246071,  0.85215447, -1.03609228, -0.97137561,
         1.04776657,  1.07765038,  0.61330704, -1.04087442,  1.16985651,
        -0.78822949,  0.54885273,  0.96196515, -0.86091175, -1.04513333,
         0.90508737, -1.01002877, -0.22416792, -0.9987326 , -1.08596625,
         1.10192867,  1.10970369, -0.7067507 , -1.05815492, -1.01062588,
         0.8537613 ,  0.94449576,  0.99624505, -0.99231346, -1.10128363,
        -0.97671785, -1.09741926,  0.9368293 , -1.06756617, -0.98845964,
         0.97541079, -0.79520796,  0.93689433, -1.0348    ,  0.74567845],
       [ 0.91012221,  0.83914912,  1.15718869,  1.10227539,  0.89331215,
         0.90840663,  1.03836592,  1.05518786,  1.14272885,  0.95549746,
        -1.69186047, -0.95943096,  0.40887852, -0.83994957, -0.48003394,
         1.15481271,  1.84444951, -1.01393821, -0.55529869,  0.88135675,
        -0.78251256, -0.99632201,  1.11045866, -0.99068547,  0.77933148,
        -1.04353002,  0.95802703,  0.70498053, -0.95783071, -0.95051381,
         0.76545899, -0.97180603, -1.04980052,  0.99519669,  0.98072331,
        -1.01694094,  1.04667422,  0.98986759,  0.95423989,  0.84902688,
        -1.0407568 ,  1.02107245,  1.52954121, -0.27725909,  0.90631089,
         1.07654004, -1.15796379, -1.01140362,  1.00196444, -0.81937291,
        -1.00491759,  1.13638466,  0.13353543, -0.98513418, -1.06083175,
        -0.89681099, -0.96575106, -0.64020231,  1.01211678, -0.92600562,
        -1.04219421,  1.10945019,  1.01877685, -1.2478999 , -1.04890475,
         0.93812766, -0.00677768,  1.20140969, -1.052526  ,  1.06451676,
        -0.96590417,  0.48465943,  0.99490916, -1.06533972, -1.14348876,
         1.09152393, -0.95164561, -0.934033  , -0.98909232, -1.01873763,
         0.99947663,  0.96901225, -1.69225964, -0.95374667, -0.95572542,
         1.13852418,  1.10889076,  0.78841162, -0.97432047, -0.64312249,
        -1.03740868, -0.94056781,  0.92854244, -0.92695462, -0.95610417,
         1.27461656, -0.97843561,  1.2332144 , -0.26377255,  0.89942658],
       [ 1.01633037,  1.07271795,  0.94976808,  0.98953282,  0.66219652,
         0.60468405,  0.56898331,  1.09124211,  0.81296341,  1.06354576,
        -0.61627907, -0.89308733,  1.35244433, -0.36578449,  0.23353003,
         0.80018519,  0.91914039, -1.18603522, -0.7656391 ,  1.31220867,
        -1.04469461, -1.11174048,  0.90207006, -1.01164792,  1.1553816 ,
        -0.96474   ,  0.92984976,  0.79542707, -1.02152685, -0.98495271,
         0.83441926, -1.00454682, -0.98705094,  1.05918824,  0.90649638,
        -1.00047226,  0.76961339,  1.13543636,  0.91257002,  1.0367369 ,
        -1.16866352,  0.96230569,  0.22993103, -1.16722656,  0.90631089,
         1.04982689, -1.12896031, -1.0423019 ,  0.96489689,  0.52928436,
        -1.02729882,  0.77781669,  0.70583012, -1.02697385, -1.01165412,
        -1.05553859, -0.99793301, -0.78326427,  0.84358249, -1.08239347,
        -0.99607942,  0.94956457,  0.93943286, -0.27711497, -0.98429713,
         1.00248093,  0.94209687,  0.99137303, -0.96708109,  0.90650712,
        -1.04666539,  0.99820585,  1.09374119, -1.05762545, -0.98098849,
         1.00700602, -1.07184624, -1.53181412, -1.02122657, -1.00529191,
         0.90461362,  1.32074084, -0.73490809, -0.9401282 , -0.99232572,
         0.9697758 ,  1.06546567,  1.04594435, -0.9953123 , -0.8128118 ,
        -1.02527052, -0.75667301,  1.19372189, -1.22981027, -1.00463738,
         0.69615208, -1.00286596,  0.84102607, -0.91305882,  1.05317472],
       [ 1.04356323,  1.05997783,  1.06985369,  0.52121755,  1.17467031,
         0.19051689,  0.86664057,  0.98307935,  0.73510212,  0.99054015,
        -0.84883721, -1.00536116, -1.58309375, -0.80608064, -1.64768408,
         1.02751053, -0.067856  , -0.5119886 , -0.89184335,  0.99473883,
        -0.45982697, -0.92185848,  0.89891266, -0.98649298,  0.99977465,
        -1.18360116,  0.96366248,  1.07681631, -0.9525227 , -1.06244025,
         1.11026034, -0.98073534, -0.9933259 ,  1.02079331,  1.08278534,
        -1.04164396,  1.10824329,  1.03839052,  1.07267003,  0.84902688,
        -0.78494336,  1.15329766,  1.5045487 , -1.03030849,  0.97946603,
         0.94965257, -0.78816947, -0.90840936,  1.02762659,  0.98731891,
        -1.0161082 ,  0.9616977 ,  0.91779112, -0.98513418, -0.98355262,
        -0.94216173, -0.98330485, -1.02170089,  1.06677656, -1.03026418,
        -0.99607942,  0.95736387,  1.00290806, -1.37145435, -0.9552237 ,
         1.10258602,  1.21320388,  1.06138525, -0.99038425,  1.18156093,
        -1.15973109,  0.67723934,  1.24198925, -1.03833979, -0.88690938,
         0.9920911 , -0.95164561, -0.82194904, -1.01479972, -0.97840046,
         0.87046294,  0.4590058 , -0.87569509, -0.99460207, -1.01062588,
         1.06997015,  0.96000473,  1.04594435, -1.02230178, -1.18612828,
        -0.97267179, -0.98924584,  0.77937899, -0.74307797, -0.95610417,
         1.0153049 , -0.99675837,  1.10248495, -1.05509019,  1.25817223],
       [ 1.00543723,  0.94956347,  0.85151621,  1.301743  ,  0.52151744,
         1.0188512 ,  1.18719455,  0.81482616,  1.12898862,  1.05186487,
        -0.81976744, -1.05639472, -0.39489976, -1.00929424,  0.88222455,
         1.0093245 ,  0.48732947, -1.21471805, -1.88044329,  1.04765047,
        -1.24637311, -0.97770613,  1.08519944, -0.86071827,  0.90900393,
        -1.01726668,  1.13836153,  1.15721323, -1.01887285, -0.92037977,
         0.93096364, -1.0372876 , -0.96822606,  0.93376479,  0.78587762,
        -0.96479012,  1.04667422,  0.8847346 ,  1.06828373,  1.28220386,
        -0.89938621,  0.92998398,  0.42987106, -1.13299704,  1.06887787,
         1.0431486 , -0.8751799 , -0.99595448,  0.91357259,  1.343568  ,
        -0.96015511,  1.08581739,  1.23573262, -1.01556303, -0.96247649,
        -1.05553859, -1.01841244, -1.21245018,  0.98023192, -0.88809341,
        -1.08830899,  0.98466141,  1.11398964, -0.5418745 , -1.02306171,
         0.90237585,  1.00987362,  1.07538769, -0.90882319,  0.59048786,
        -0.94975193,  1.67223553,  0.63252503, -0.94191151, -0.90401468,
         0.99954856, -1.00659447, -0.82194904, -0.96659834, -0.89772612,
         1.10192867,  1.00418511, -0.50964891, -1.03545747, -1.02435099,
         0.94868226,  0.91037605,  1.10467989, -1.01030646, -1.11825256,
        -0.97267179, -1.1623233 ,  1.11914017, -0.97021971, -1.03699285,
         0.97541079, -1.19220119,  0.84974137, -1.21741176,  0.989113  ],
       [-1.00979449, -0.96570096, -1.09168744, -0.88372826, -1.07619496,
        -0.66542856, -0.94219973, -1.02394084, -0.9274653 , -1.00397318,
         1.53488372,  1.10742828,  0.75834733,  1.53087583,  0.03892167,
        -1.01841751, -0.19123055,  0.92215314,  0.81191399, -1.02346229,
         1.53679014,  1.03653232, -1.03341803,  1.23971939, -0.69893455,
         0.96123822, -0.99183975, -0.82758584,  0.94509149,  1.08568652,
        -0.86200337,  1.02538186,  1.00524832, -1.06277177, -1.0512389 ,
         0.9675349 , -1.01588968, -0.99148503, -1.00643688, -1.25910109,
         1.07307008, -0.97699738, -0.61981408,  0.57847885, -1.08919874,
        -1.12729503,  1.38709125,  1.02273299, -0.99968336, -1.455532  ,
         0.95903605, -1.09317263, -0.22679827,  0.92427647,  1.07136981,
         0.78116659,  0.95053778,  0.71888639, -1.03762468,  1.02173394,
         1.2174304 , -1.05875475, -0.98069174,  0.09354837,  1.14775442,
        -0.99723733,  0.06099908, -0.87095201,  0.74958494, -0.94864303,
         1.29540994, -0.2856602 , -0.75112347,  0.79765479,  1.03743595,
        -0.99656558,  1.09863381,  0.59778112,  1.06428647,  1.04293993,
        -0.93952321, -1.07101354,  0.61664702,  0.8575095 ,  0.9291902 ,
        -0.92337   , -0.95380114, -1.00528129,  1.07388169,  1.10467741,
         0.92088228,  0.82806746, -1.03958634,  1.18221867,  1.14700145,
        -1.07913546,  1.07982157, -0.98918611,  0.58841568, -0.81742758],
       [-0.99890134, -0.93172731, -0.79693183, -1.1178859 , -0.60391519,
         0.16290575, -0.87350959, -0.98788658, -1.03280703, -0.99521251,
         0.6627907 ,  1.10232493,  0.61855981,  1.49700689,  1.14170235,
        -1.07297559, -0.34544874,  1.03688448,  1.23259482, -0.95543304,
         0.9115868 ,  0.93972973, -0.95448295,  0.69050316, -0.94531222,
         1.07504603, -1.15526789, -1.01852854,  1.02471166,  1.04694275,
        -0.48961791,  0.96287673,  0.99583588, -0.98086258, -1.16257929,
         1.05536786, -0.70804432, -1.03192079, -0.99547112, -1.01363413,
         0.83745243, -0.99462741, -0.24492653,  0.95500355, -0.85347662,
        -0.79338063,  0.77076738,  1.00728385, -1.01394011, -0.18321382,
         1.09891879, -1.00123212, -1.35019156,  0.96231254,  0.99409068,
         0.99658263,  1.01197605,  1.20768145, -0.94197008,  1.06438517,
         0.74091093, -0.92616668, -1.2980677 ,  0.72897123,  0.81825554,
        -0.98770352, -1.49786625, -1.04598256,  0.96708109, -0.75552014,
         1.19849648, -0.2856602 , -1.01467556,  1.07536826,  0.77230393,
        -0.96673573,  0.999039  ,  0.3736132 ,  0.87148095,  0.9084827 ,
        -1.02300265, -1.21170498,  1.20795239,  0.94375979,  1.08474149,
        -0.98665064, -1.06546567, -1.03690811,  0.94193309,  1.22345993,
         1.0746324 ,  1.13636168, -1.18046292,  1.04160712,  1.2602456 ,
        -1.03924136,  0.92102428, -1.11120025,  0.04058039, -0.86867696],
       [-1.0070712 , -1.09310214, -1.13535494, -0.7796582 , -1.61881426,
        -1.76987431, -1.20551192, -0.99990467, -1.06028749, -1.00689341,
         1.18604651,  0.90839739,  1.28255056,  0.78575927,  1.2065718 ,
        -0.72744108, -1.82594333,  1.10859157,  0.53847146, -1.11416796,
         0.9519225 ,  1.02908597, -1.03026062,  1.00493994, -1.07498468,
         0.71611372, -1.16653879, -1.02857815,  1.00878763,  0.79726068,
        -1.63435839,  0.87656011,  1.13388496, -0.97062393, -0.69866098,
         0.9400871 , -1.04667422, -1.14514094, -0.98231222, -0.76816718,
         1.03941041, -1.00344243, -0.69479159,  1.19461017, -1.03636447,
        -1.06051215,  0.74901477,  1.04333184, -0.98827796, -0.38678473,
         1.08772817, -0.82194813, -0.45995537,  1.11445681,  0.99760337,
         1.29136247,  1.02075294,  1.20768145, -1.02395974,  0.95064855,
         0.87925529, -1.01975826, -0.64744697,  1.13493584,  0.93454926,
        -0.99247042, -0.61676845, -1.01797767,  0.95542951, -1.05983499,
         0.95621282, -1.47323629, -0.94878754,  0.84394037,  0.91769891,
        -1.01396632,  0.97156457,  1.94278864,  0.91968233,  0.88607316,
        -0.86742732, -0.96549497,  1.20795239,  0.93014132,  0.94749035,
        -1.09211837, -1.01893878, -1.05949871,  1.04389337,  1.10467741,
         0.83996116,  1.16881371, -0.95671776,  0.8469142 ,  0.67784712,
        -0.9395061 ,  1.04317604, -0.94560963,  1.46089411, -1.08648681],
       [-0.9880082 , -1.02515484, -0.92793433, -1.01381584, -0.49338163,
        -0.03037226, -1.01088986, -1.01192275, -1.05112734, -0.97477094,
         0.34302326,  1.02067123, -0.77931546,  0.6164146 , -0.86925065,
        -1.17299874, -0.40713601,  0.95083597,  0.13882467, -0.92519782,
         0.62923691,  1.03653232, -1.04604764,  0.9755925 , -1.3343296 ,
         1.39896055, -0.8171407 , -1.12907431,  1.05655973,  0.96084548,
        -0.84821132,  1.07002839,  0.95191117, -1.00389954, -1.11618746,
         1.00596182, -1.10824329, -0.78930619, -1.01301633, -1.08583029,
         0.97882302, -0.99756575, -1.91942426,  0.81808548, -1.02417195,
        -1.033799  ,  0.85777781,  0.93003816, -1.00823741,  0.35115982,
         0.95344074, -1.0701875 , -1.35019156,  0.96991975,  0.90978617,
         0.48638674,  0.99734789,  0.49237161, -1.05128962,  0.95064855,
         0.94074167, -0.99246071, -1.13937972,  1.01138139,  1.03792146,
        -1.01630497, -1.565643  , -1.07398745,  1.28555759, -1.19443579,
         0.66547244, -1.44113964, -1.03938357,  1.19493933,  1.22987049,
        -1.00899468,  1.02994773,  0.48569716,  1.07392675,  1.11016855,
        -1.01541361, -0.71928496,  0.16612865,  1.09810242,  0.95664043,
        -1.07102483, -0.91968143, -0.90136457,  0.90594711,  0.49379589,
         1.13936929,  0.80102411, -0.88627946,  0.77120028,  1.0337573 ,
        -0.75998264,  0.81719528, -1.03276259,  1.40002353, -1.07367447],
       [-0.99345477, -0.96994766, -1.01526932, -1.10054089, -0.86517634,
        -1.63181859, -0.87350959, -0.96385041, -0.88166454, -1.01565408,
         0.8372093 ,  0.84205376, -0.22016536, -0.02709515,  0.88222455,
        -0.96385943, -1.14738331,  0.8934703 ,  1.50603736, -0.92519782,
         0.73007616,  0.95089926, -0.92290892,  1.04267235, -0.86750875,
         0.73362261, -0.83404706, -0.93310681,  0.96101552,  1.08999138,
        -0.95854775,  1.05812265,  0.90484898, -0.97830292, -0.9120634 ,
         1.02792006, -1.07745875, -1.01574649, -0.99766427, -0.79704564,
         1.03941041, -1.01813412, -0.61981408,  1.22883969, -0.97946603,
        -0.96033783,  1.13331084,  0.99183471, -0.98542661, -1.55731745,
         0.89189233, -0.98284402, -1.05344616,  1.02317025,  1.02219218,
         1.31403784,  1.01782731,  1.19575961, -0.93286011,  1.00277783,
         1.17131561, -0.99636036, -0.86167575,  1.50559918,  1.04438222,
        -0.99962079, -0.61676845, -0.93396301,  1.00203583, -0.95449524,
         0.79469039, -0.89549657, -1.17115962,  1.05222547,  1.00322537,
        -1.00899468,  0.89257558,  0.934033  ,  1.06107305,  1.03845803,
        -1.13304374, -0.89514925,  1.32058198,  1.15257629,  1.07559141,
        -0.90754984, -1.03134595, -0.97817258,  1.02889921,  0.9349881 ,
         1.0098955 ,  1.01196226, -0.89456632,  1.09568848,  0.82344674,
        -1.11902956,  1.10425192, -0.88460255,  0.9942196 , -1.09929916]])

Carry on PCA on the above formed dataset

In [13]:
pca = PCA()
pca.fit(scaled_data)
pca_data = pca.transform(scaled_data)
In [15]:
# The percentage of variation that each PC accounts for 
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
# Create labels for Scree plot
labels = ['PC' + str(x) for x in range(1, len(per_var)+1)]

Create bar plot using matplotlib

In [17]:
plt.bar(x=range(1, len(per_var)+1), height=per_var, tick_label=labels)
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')
plt.show()
Notebook Image
  • The above "Scree Plot" shows that almost all variations in the data are covered by the first two PCs
  • So, a 2-D graph will be enough to represent the dataset

Drawing a PCA plot

In [19]:
# Put the new coordinates created by pca.transform(scaled_data) into a nice matrix where the rows have sample labels and the columns have PC labels

pca_df = pd.DataFrame(pca_data, index=[*wt, *ko], columns=labels)


plt.scatter(pca_df.PC1, pca_df.PC2)
plt.title('PCA Graph')
plt.xlabel('PC1 - {}%'.format(per_var[0]))
plt.ylabel('PC2 - {}%'.format(per_var[1]))

# Also, add sample names to the graph
for sample in pca_df.index:
    plt.annotate(sample, (pca_df.PC1.loc[sample], pca_df.PC2.loc[sample]))
    
plt.show()
Notebook Image
  • The wildtype (wt) samples are clustered on one side of the graph and knockout (ko) samples are clustered on the other side of the graph
  • This shows that both the gene samples are quite different from one another

Looking at the loading scores of PC1

  • This will help us determine which genes had the largest influence on separating the two clusters along the x-axis
In [20]:
loading_scores = pd.Series(pca.components_[0], index=genes)
loading_scores
Out[20]:
gene1     -0.103703
gene2     -0.103496
gene3     -0.103147
gene4     -0.101501
gene5     -0.097270
gene6     -0.082941
gene7     -0.102093
gene8     -0.103405
gene9     -0.102841
gene10    -0.103670
gene11     0.094897
gene12     0.103280
gene13     0.035938
gene14     0.091247
gene15     0.050809
gene16    -0.102637
gene17    -0.081913
gene18     0.101927
gene19     0.088028
gene20    -0.102519
gene21     0.098822
gene22     0.103547
gene23    -0.103489
gene24     0.102816
gene25    -0.102025
gene26     0.100963
gene27    -0.103140
gene28    -0.102577
gene29     0.103649
gene30     0.103260
             ...   
gene71     0.101796
gene72    -0.091324
gene73    -0.102122
gene74     0.102815
gene75     0.102817
gene76    -0.103626
gene77     0.103493
gene78     0.090421
gene79     0.103498
gene80     0.103370
gene81    -0.103317
gene82    -0.100797
gene83     0.094089
gene84     0.103408
gene85     0.103623
gene86    -0.103316
gene87    -0.103493
gene88    -0.103438
gene89     0.103699
gene90     0.101175
gene91     0.103270
gene92     0.102911
gene93    -0.102766
gene94     0.102371
gene95     0.102262
gene96    -0.102544
gene97     0.103144
gene98    -0.102878
gene99     0.093563
gene100   -0.102667
Length: 100, dtype: float64
In [23]:
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)

top_10_genes = sorted_loading_scores[0:10].index.values
loading_scores[top_10_genes]
Out[23]:
gene57    0.103743
gene54    0.103705
gene1    -0.103703
gene89    0.103699
gene10   -0.103670
gene48    0.103670
gene39   -0.103669
gene29    0.103649
gene49   -0.103645
gene55    0.103642
dtype: float64
  • These values are quite similar. This means a lot of genes played a role in separating the samples, rather than just one or two