Building machine learning model for Human_DNA_Classifier for the given DNA sequence

Load essential Library

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
import seaborn as sns

Load the data set and analyze the details

df = pd.read_table("human_data.txt")
/Users/vigneshkarthick/anaconda3/lib/python3.7/site-packages/ FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\t'.
Visualize various classses of human data

<matplotlib.axes._subplots.AxesSubplot at 0x1a25ce99e8>
Notebook Image

Preprocessing: Break sequence into chunksize of 6

data= df.sequence[0]
# [data[i:i+6] for i in range(len(data))]
def seqPreprocessor(seq):
    seq = seq.lower()
    sow = [seq[i:i+6] for i in range(len(seq))]
    sow = " ".join(sow)
atgccc tgcccc gcccca ccccaa cccaac ccaact caacta aactaa actaaa ctaaat taaata aaatac aatact atacta tactac actacc ctaccg taccgt accgta ccgtat cgtatg gtatgg tatggc atggcc tggccc ggccca gcccac cccacc ccacca caccat accata ccataa cataat ataatt taatta aattac attacc ttaccc tacccc accccc ccccca ccccat cccata ccatac catact atactc tactcc actcct ctcctt tcctta ccttac cttaca ttacac tacact acacta cactat actatt ctattc tattcc attcct ttcctc tcctca cctcat ctcatc tcatca catcac atcacc tcaccc caccca acccaa cccaac ccaact caacta aactaa actaaa ctaaaa taaaaa aaaaat aaaata aaatat aatatt atatta tattaa attaaa ttaaac taaaca aaacac aacaca acacaa cacaaa acaaac caaact aaacta aactac actacc ctacca taccac accacc ccacct caccta acctac cctacc ctacct tacctc acctcc cctccc ctccct tccctc ccctca cctcac ctcacc tcacca caccaa accaaa ccaaag caaagc aaagcc aagccc agccca gcccat cccata ccataa cataaa ataaaa taaaaa aaaaat aaaata aaataa aataaa ataaaa taaaaa aaaaaa aaaaat aaaatt aaatta aattat attata ttataa tataac ataaca taacaa aacaaa acaaac caaacc aaaccc aaccct accctg ccctga cctgag ctgaga tgagaa gagaac agaacc gaacca aaccaa accaaa ccaaaa caaaat aaaatg aaatga aatgaa atgaac tgaacg gaacga aacgaa acgaaa cgaaaa gaaaat aaaatc aaatct aatctg atctgt tctgtt ctgttc tgttcg gttcgc ttcgct tcgctt cgcttc gcttca cttcat ttcatt tcattc cattca attcat ttcatt tcattg cattgc attgcc ttgccc tgcccc gccccc ccccca ccccac cccaca ccacaa cacaat acaatc caatcc aatcct atccta tcctag cctag ctag tag ag g

Apply function to whole sequence using lambda fucntion

lam = lambda x:seqPreprocessor(x)
df.sequence = df.sequence.apply(lam)
0       atgccc tgcccc gcccca ccccaa cccaac ccaact caac...
1       atgaac tgaacg gaacga aacgaa acgaaa cgaaaa gaaa...
2       atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
3       atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
4       atgcaa tgcaac gcaaca caacag aacagc acagca cagc...
5       atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
6       atgaag tgaaga gaagat aagatt agattg gattgc attg...
7       atgcaa tgcaac gcaaca caacag aacagc acagca cagc...
8       atgaag tgaaga gaagat aagatt agattg gattgc attg...
9       atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
10      atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
11      atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
12      atgaag tgaaga gaagat aagatt agattg gattgc attg...
13      atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
14      atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
15      atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
16      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
17      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
18      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
19      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
20      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
21      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
22      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
23      atgcca tgccac gccact ccactg cactgc actgcc ctgc...
24      gccacc ccacca caccac accaca ccacag cacagg acag...
25      atgcca tgccac gccact ccactg cactgc actgcc ctgc...
26      atgcca tgccac gccact ccactg cactgc actgcc ctgc...
27      atgcca tgccac gccact ccactg cactgc actgcc ctgc...
28      atgcag tgcagc gcagcc cagcct agcctt gccttg cctt...
29      atgcag tgcagc gcagcc cagcct agcctt gccttg cctt...
4350    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4351    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4352    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4353    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4354    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4355    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4356    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4357    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4358    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4359    nntgct ntgctg tgctgg gctggt ctggtg tggtgg ggtg...
4360    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4361    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4362    atgcag tgcagt gcagtc cagtcc agtcct gtcctt tcct...
4363    atgcag tgcagt gcagtc cagtcc agtcct gtcctt tcct...
4364    atgggg tggggc ggggca gggcac ggcacc gcacct cacc...
4365    atgcag tgcagt gcagtc cagtcc agtcct gtcctt tcct...
4366    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4367    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4368    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4369    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4370    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4371    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4372    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4373    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4374    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4375    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4376    atgcag tgcagt gcagtc cagtcc agtcct gtcctt tcct...
4377    atgcag tgcagt gcagtc cagtcc agtcct gtcctt tcct...
4378    atgggg tggggc ggggca gggcac ggcacc gcacct cacc...
4379    atgcag tgcagt gcagtc cagtcc agtcct gtcctt tcct...
Name: sequence, Length: 4380, dtype: object

Create BOW (bag of words) using Count Vectorizer with ngrams of 4,4 and Create train and test data

X = df.sequence
y = df["class"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=10)
cv = CountVectorizer() # with this accuracy is 68%
cv = CountVectorizer(ngram_range=(4,4)) #
X_train_dtm = cv.fit_transform(X_train)
<3504x234624 sparse matrix of type '<class 'numpy.int64'>'
	with 4355746 stored elements in Compressed Sparse Row format>
X_test_dtm = cv.transform(X_test)

Create MultinomialNB() instance for training and testing

Fit and transform the data

nb = MultinomialNB()
In [46]:,y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

get feature names after transforming

Train and prediction

y_pred = nb.predict(X_test_dtm)
Accuracy score

Display confusion matrix

<matplotlib.axes._subplots.AxesSubplot at 0x1a3cb1aac8>
Notebook Image

Display classification report

precision recall f1-score support 0 0.97 0.95 0.96 99 1 0.99 0.98 0.99 123 2 1.00 0.99 0.99 72 3 0.99 0.98 0.98 127 4 0.93 0.97 0.95 156 5 0.98 0.98 0.98 41 6 0.99 0.99 0.99 258 micro avg 0.98 0.98 0.98 876 macro avg 0.98 0.98 0.98 876 weighted avg 0.98 0.98 0.98 876