Jovian
⭐️
Sign In

Building machine learning model for Human_DNA_Classifier for the given DNA sequence

Load essential Library

In [31]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
import seaborn as sns

Load the data set and analyze the details

In [17]:
df = pd.read_table("human_data.txt")
/Users/vigneshkarthick/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\t'. """Entry point for launching an IPython kernel.
In [18]:
df.head(2)
Out[18]:

Visualize various classses of human data

In [5]:
sns.countplot(df["class"])
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a25ce99e8>
Notebook Image

Preprocessing: Break sequence into chunksize of 6

In [19]:
df.sequence[0]
Out[19]:
'ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATATTAAACACAAACTACCACCTACCTCCCTCACCAAAGCCCATAAAAATAAAAAATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG'
In [20]:
data= df.sequence[0]
In [10]:
# [data[i:i+6] for i in range(len(data))]
In [21]:
def seqPreprocessor(seq):
    seq = seq.lower()
    sow = [seq[i:i+6] for i in range(len(seq))]
    sow = " ".join(sow)
    return(sow)
print(seqPreprocessor(df.sequence[0]))
atgccc tgcccc gcccca ccccaa cccaac ccaact caacta aactaa actaaa ctaaat taaata aaatac aatact atacta tactac actacc ctaccg taccgt accgta ccgtat cgtatg gtatgg tatggc atggcc tggccc ggccca gcccac cccacc ccacca caccat accata ccataa cataat ataatt taatta aattac attacc ttaccc tacccc accccc ccccca ccccat cccata ccatac catact atactc tactcc actcct ctcctt tcctta ccttac cttaca ttacac tacact acacta cactat actatt ctattc tattcc attcct ttcctc tcctca cctcat ctcatc tcatca catcac atcacc tcaccc caccca acccaa cccaac ccaact caacta aactaa actaaa ctaaaa taaaaa aaaaat aaaata aaatat aatatt atatta tattaa attaaa ttaaac taaaca aaacac aacaca acacaa cacaaa acaaac caaact aaacta aactac actacc ctacca taccac accacc ccacct caccta acctac cctacc ctacct tacctc acctcc cctccc ctccct tccctc ccctca cctcac ctcacc tcacca caccaa accaaa ccaaag caaagc aaagcc aagccc agccca gcccat cccata ccataa cataaa ataaaa taaaaa aaaaat aaaata aaataa aataaa ataaaa taaaaa aaaaaa aaaaat aaaatt aaatta aattat attata ttataa tataac ataaca taacaa aacaaa acaaac caaacc aaaccc aaccct accctg ccctga cctgag ctgaga tgagaa gagaac agaacc gaacca aaccaa accaaa ccaaaa caaaat aaaatg aaatga aatgaa atgaac tgaacg gaacga aacgaa acgaaa cgaaaa gaaaat aaaatc aaatct aatctg atctgt tctgtt ctgttc tgttcg gttcgc ttcgct tcgctt cgcttc gcttca cttcat ttcatt tcattc cattca attcat ttcatt tcattg cattgc attgcc ttgccc tgcccc gccccc ccccca ccccac cccaca ccacaa cacaat acaatc caatcc aatcct atccta tcctag cctag ctag tag ag g

Apply function to whole sequence using lambda fucntion

In [22]:
lam = lambda x:seqPreprocessor(x)
In [23]:
df.sequence = df.sequence.apply(lam)
In [24]:
df.sequence
Out[24]:
0       atgccc tgcccc gcccca ccccaa cccaac ccaact caac...
1       atgaac tgaacg gaacga aacgaa acgaaa cgaaaa gaaa...
2       atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
3       atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
4       atgcaa tgcaac gcaaca caacag aacagc acagca cagc...
5       atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
6       atgaag tgaaga gaagat aagatt agattg gattgc attg...
7       atgcaa tgcaac gcaaca caacag aacagc acagca cagc...
8       atgaag tgaaga gaagat aagatt agattg gattgc attg...
9       atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
10      atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
11      atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
12      atgaag tgaaga gaagat aagatt agattg gattgc attg...
13      atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
14      atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
15      atgtgt tgtgtg gtgtgg tgtggc gtggca tggcat ggca...
16      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
17      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
18      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
19      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
20      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
21      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
22      atggcg tggcgg ggcgga gcggat cggatt ggattc gatt...
23      atgcca tgccac gccact ccactg cactgc actgcc ctgc...
24      gccacc ccacca caccac accaca ccacag cacagg acag...
25      atgcca tgccac gccact ccactg cactgc actgcc ctgc...
26      atgcca tgccac gccact ccactg cactgc actgcc ctgc...
27      atgcca tgccac gccact ccactg cactgc actgcc ctgc...
28      atgcag tgcagc gcagcc cagcct agcctt gccttg cctt...
29      atgcag tgcagc gcagcc cagcct agcctt gccttg cctt...
                              ...                        
4350    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4351    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4352    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4353    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4354    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4355    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4356    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4357    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4358    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4359    nntgct ntgctg tgctgg gctggt ctggtg tggtgg ggtg...
4360    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4361    atggtg tggtgc ggtgca gtgcac tgcacg gcacgt cacg...
4362    atgcag tgcagt gcagtc cagtcc agtcct gtcctt tcct...
4363    atgcag tgcagt gcagtc cagtcc agtcct gtcctt tcct...
4364    atgggg tggggc ggggca gggcac ggcacc gcacct cacc...
4365    atgcag tgcagt gcagtc cagtcc agtcct gtcctt tcct...
4366    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4367    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4368    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4369    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4370    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4371    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4372    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4373    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4374    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4375    atggaa tggaag ggaaga gaagat aagatt agattt gatt...
4376    atgcag tgcagt gcagtc cagtcc agtcct gtcctt tcct...
4377    atgcag tgcagt gcagtc cagtcc agtcct gtcctt tcct...
4378    atgggg tggggc ggggca gggcac ggcacc gcacct cacc...
4379    atgcag tgcagt gcagtc cagtcc agtcct gtcctt tcct...
Name: sequence, Length: 4380, dtype: object

Create BOW (bag of words) using Count Vectorizer with ngrams of 4,4 and Create train and test data

In [25]:
X = df.sequence
y = df["class"]
In [26]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=10)
In [41]:
cv = CountVectorizer() # with this accuracy is 68%
cv = CountVectorizer(ngram_range=(4,4)) #
In [42]:
X_train_dtm = cv.fit_transform(X_train)
In [43]:
X_train_dtm
Out[43]:
<3504x234624 sparse matrix of type '<class 'numpy.int64'>'
	with 4355746 stored elements in Compressed Sparse Row format>
In [44]:
X_test_dtm = cv.transform(X_test)

Create MultinomialNB() instance for training and testing

https://monkeylearn.com/blog/practical-explanation-naive-bayes-classifier/

Fit and transform the data

In [45]:
nb = MultinomialNB()
In [46]:
nb.fit(X_train_dtm,y_train)
Out[46]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

get feature names after transforming

In [47]:
cv.get_feature_names()
Out[47]:
['aaaaa aaaa aaa aa',
 'aaaaaa aaaaa aaaa aaa',
 'aaaaaa aaaaaa aaaaaa aaaaaa',
 'aaaaaa aaaaaa aaaaaa aaaaac',
 'aaaaaa aaaaaa aaaaaa aaaaag',
 'aaaaaa aaaaaa aaaaaa aaaaat',
 'aaaaaa aaaaaa aaaaac aaaaca',
 'aaaaaa aaaaaa aaaaac aaaacc',
 'aaaaaa aaaaaa aaaaac aaaacg',
 'aaaaaa aaaaaa aaaaac aaaact',
 'aaaaaa aaaaaa aaaaag aaaaga',
 'aaaaaa aaaaaa aaaaag aaaagc',
 'aaaaaa aaaaaa aaaaag aaaagg',
 'aaaaaa aaaaaa aaaaag aaaagt',
 'aaaaaa aaaaaa aaaaat aaaata',
 'aaaaaa aaaaaa aaaaat aaaatc',
 'aaaaaa aaaaaa aaaaat aaaatg',
 'aaaaaa aaaaaa aaaaat aaaatt',
 'aaaaaa aaaaac aaaaca aaacaa',
 'aaaaaa aaaaac aaaaca aaacac',
 'aaaaaa aaaaac aaaaca aaacag',
 'aaaaaa aaaaac aaaaca aaacat',
 'aaaaaa aaaaac aaaacc aaacca',
 'aaaaaa aaaaac aaaacc aaaccc',
 'aaaaaa aaaaac aaaacc aaaccg',
 'aaaaaa aaaaac aaaacc aaacct',
 'aaaaaa aaaaac aaaacg aaacga',
 'aaaaaa aaaaac aaaacg aaacgc',
 'aaaaaa aaaaac aaaacg aaacgg',
 'aaaaaa aaaaac aaaact aaacta',
 'aaaaaa aaaaac aaaact aaactc',
 'aaaaaa aaaaac aaaact aaactg',
 'aaaaaa aaaaac aaaact aaactt',
 'aaaaaa aaaaag aaaag aaag',
 'aaaaaa aaaaag aaaaga aaagaa',
 'aaaaaa aaaaag aaaaga aaagac',
 'aaaaaa aaaaag aaaaga aaagag',
 'aaaaaa aaaaag aaaaga aaagat',
 'aaaaaa aaaaag aaaagc aaagca',
 'aaaaaa aaaaag aaaagc aaagcc',
 'aaaaaa aaaaag aaaagc aaagct',
 'aaaaaa aaaaag aaaagg aaagga',
 'aaaaaa aaaaag aaaagg aaaggc',
 'aaaaaa aaaaag aaaagg aaaggg',
 'aaaaaa aaaaag aaaagg aaaggt',
 'aaaaaa aaaaag aaaagt aaagta',
 'aaaaaa aaaaag aaaagt aaagtc',
 'aaaaaa aaaaag aaaagt aaagtg',
 'aaaaaa aaaaag aaaagt aaagtt',
 'aaaaaa aaaaat aaaata aaataa',
 'aaaaaa aaaaat aaaata aaatac',
 'aaaaaa aaaaat aaaata aaatag',
 'aaaaaa aaaaat aaaata aaatat',
 'aaaaaa aaaaat aaaatc aaatca',
 'aaaaaa aaaaat aaaatc aaatcc',
 'aaaaaa aaaaat aaaatc aaatcg',
 'aaaaaa aaaaat aaaatc aaatct',
 'aaaaaa aaaaat aaaatg aaatga',
 'aaaaaa aaaaat aaaatg aaatgc',
 'aaaaaa aaaaat aaaatg aaatgg',
 'aaaaaa aaaaat aaaatg aaatgt',
 'aaaaaa aaaaat aaaatt aaatta',
 'aaaaaa aaaaat aaaatt aaattc',
 'aaaaaa aaaaat aaaatt aaattg',
 'aaaaaa aaaaat aaaatt aaattt',
 'aaaaac aaaaca aaacaa aacaaa',
 'aaaaac aaaaca aaacaa aacaac',
 'aaaaac aaaaca aaacaa aacaag',
 'aaaaac aaaaca aaacaa aacaat',
 'aaaaac aaaaca aaacac aacaca',
 'aaaaac aaaaca aaacac aacacc',
 'aaaaac aaaaca aaacac aacacg',
 'aaaaac aaaaca aaacac aacact',
 'aaaaac aaaaca aaacag aacaga',
 'aaaaac aaaaca aaacag aacagc',
 'aaaaac aaaaca aaacag aacagg',
 'aaaaac aaaaca aaacag aacagt',
 'aaaaac aaaaca aaacat aacata',
 'aaaaac aaaaca aaacat aacatc',
 'aaaaac aaaaca aaacat aacatg',
 'aaaaac aaaaca aaacat aacatt',
 'aaaaac aaaacc aaacca aaccaa',
 'aaaaac aaaacc aaacca aaccac',
 'aaaaac aaaacc aaacca aaccag',
 'aaaaac aaaacc aaacca aaccat',
 'aaaaac aaaacc aaaccc aaccca',
 'aaaaac aaaacc aaaccc aacccc',
 'aaaaac aaaacc aaaccc aacccg',
 'aaaaac aaaacc aaaccc aaccct',
 'aaaaac aaaacc aaaccg aaccga',
 'aaaaac aaaacc aaaccg aaccgc',
 'aaaaac aaaacc aaaccg aaccgg',
 'aaaaac aaaacc aaaccg aaccgt',
 'aaaaac aaaacc aaacct aaccta',
 'aaaaac aaaacc aaacct aacctc',
 'aaaaac aaaacc aaacct aacctg',
 'aaaaac aaaacc aaacct aacctt',
 'aaaaac aaaacg aaacga aacgaa',
 'aaaaac aaaacg aaacga aacgac',
 'aaaaac aaaacg aaacga aacgag',
 'aaaaac aaaacg aaacga aacgat',
 'aaaaac aaaacg aaacgc aacgca',
 'aaaaac aaaacg aaacgc aacgcc',
 'aaaaac aaaacg aaacgc aacgcg',
 'aaaaac aaaacg aaacgc aacgct',
 'aaaaac aaaacg aaacgg aacgga',
 'aaaaac aaaacg aaacgg aacggg',
 'aaaaac aaaacg aaacgg aacggt',
 'aaaaac aaaacg aaacgt aacgta',
 'aaaaac aaaacg aaacgt aacgtc',
 'aaaaac aaaacg aaacgt aacgtt',
 'aaaaac aaaact aaacta aactaa',
 'aaaaac aaaact aaacta aactac',
 'aaaaac aaaact aaacta aactag',
 'aaaaac aaaact aaacta aactat',
 'aaaaac aaaact aaactc aactca',
 'aaaaac aaaact aaactc aactcc',
 'aaaaac aaaact aaactc aactcg',
 'aaaaac aaaact aaactc aactct',
 'aaaaac aaaact aaactg aactga',
 'aaaaac aaaact aaactg aactgc',
 'aaaaac aaaact aaactg aactgg',
 'aaaaac aaaact aaactg aactgt',
 'aaaaac aaaact aaactt aactta',
 'aaaaac aaaact aaactt aacttc',
 'aaaaac aaaact aaactt aacttg',
 'aaaaac aaaact aaactt aacttt',
 'aaaaag aaaag aaag aag',
 'aaaaag aaaaga aaagaa aagaaa',
 'aaaaag aaaaga aaagaa aagaac',
 'aaaaag aaaaga aaagaa aagaag',
 'aaaaag aaaaga aaagaa aagaat',
 'aaaaag aaaaga aaagac aagaca',
 'aaaaag aaaaga aaagac aagacc',
 'aaaaag aaaaga aaagac aagacg',
 'aaaaag aaaaga aaagac aagact',
 'aaaaag aaaaga aaagag aagaga',
 'aaaaag aaaaga aaagag aagagc',
 'aaaaag aaaaga aaagag aagagg',
 'aaaaag aaaaga aaagag aagagt',
 'aaaaag aaaaga aaagat aagata',
 'aaaaag aaaaga aaagat aagatc',
 'aaaaag aaaaga aaagat aagatg',
 'aaaaag aaaaga aaagat aagatt',
 'aaaaag aaaagc aaagca aagcaa',
 'aaaaag aaaagc aaagca aagcac',
 'aaaaag aaaagc aaagca aagcag',
 'aaaaag aaaagc aaagca aagcat',
 'aaaaag aaaagc aaagcc aagcca',
 'aaaaag aaaagc aaagcc aagccc',
 'aaaaag aaaagc aaagcc aagccg',
 'aaaaag aaaagc aaagcc aagcct',
 'aaaaag aaaagc aaagcg aagcga',
 'aaaaag aaaagc aaagcg aagcgc',
 'aaaaag aaaagc aaagcg aagcgg',
 'aaaaag aaaagc aaagcg aagcgt',
 'aaaaag aaaagc aaagct aagcta',
 'aaaaag aaaagc aaagct aagctc',
 'aaaaag aaaagc aaagct aagctg',
 'aaaaag aaaagc aaagct aagctt',
 'aaaaag aaaagg aaagga aaggaa',
 'aaaaag aaaagg aaagga aaggac',
 'aaaaag aaaagg aaagga aaggag',
 'aaaaag aaaagg aaagga aaggat',
 'aaaaag aaaagg aaaggc aaggca',
 'aaaaag aaaagg aaaggc aaggcc',
 'aaaaag aaaagg aaaggc aaggcg',
 'aaaaag aaaagg aaaggc aaggct',
 'aaaaag aaaagg aaaggg aaggga',
 'aaaaag aaaagg aaaggg aagggc',
 'aaaaag aaaagg aaaggg aagggg',
 'aaaaag aaaagg aaaggg aagggt',
 'aaaaag aaaagg aaaggt aaggta',
 'aaaaag aaaagg aaaggt aaggtc',
 'aaaaag aaaagg aaaggt aaggtg',
 'aaaaag aaaagg aaaggt aaggtt',
 'aaaaag aaaagt aaagta aagtaa',
 'aaaaag aaaagt aaagta aagtac',
 'aaaaag aaaagt aaagta aagtag',
 'aaaaag aaaagt aaagta aagtat',
 'aaaaag aaaagt aaagtc aagtca',
 'aaaaag aaaagt aaagtc aagtcc',
 'aaaaag aaaagt aaagtc aagtcg',
 'aaaaag aaaagt aaagtc aagtct',
 'aaaaag aaaagt aaagtg aagtga',
 'aaaaag aaaagt aaagtg aagtgc',
 'aaaaag aaaagt aaagtg aagtgg',
 'aaaaag aaaagt aaagtg aagtgt',
 'aaaaag aaaagt aaagtt aagtta',
 'aaaaag aaaagt aaagtt aagttc',
 'aaaaag aaaagt aaagtt aagttg',
 'aaaaag aaaagt aaagtt aagttt',
 'aaaaat aaaata aaataa aataaa',
 'aaaaat aaaata aaataa aataac',
 'aaaaat aaaata aaataa aataag',
 'aaaaat aaaata aaataa aataat',
 'aaaaat aaaata aaatac aataca',
 'aaaaat aaaata aaatac aatacc',
 'aaaaat aaaata aaatac aatacg',
 'aaaaat aaaata aaatac aatact',
 'aaaaat aaaata aaatag aatag',
 'aaaaat aaaata aaatag aataga',
 'aaaaat aaaata aaatag aatagc',
 'aaaaat aaaata aaatag aatagg',
 'aaaaat aaaata aaatag aatagt',
 'aaaaat aaaata aaatat aatata',
 'aaaaat aaaata aaatat aatatc',
 'aaaaat aaaata aaatat aatatg',
 'aaaaat aaaata aaatat aatatt',
 'aaaaat aaaatc aaatca aatcaa',
 'aaaaat aaaatc aaatca aatcac',
 'aaaaat aaaatc aaatca aatcag',
 'aaaaat aaaatc aaatca aatcat',
 'aaaaat aaaatc aaatcc aatcca',
 'aaaaat aaaatc aaatcc aatccc',
 'aaaaat aaaatc aaatcc aatccg',
 'aaaaat aaaatc aaatcc aatcct',
 'aaaaat aaaatc aaatcg aatcga',
 'aaaaat aaaatc aaatcg aatcgc',
 'aaaaat aaaatc aaatcg aatcgg',
 'aaaaat aaaatc aaatcg aatcgt',
 'aaaaat aaaatc aaatct aatcta',
 'aaaaat aaaatc aaatct aatctc',
 'aaaaat aaaatc aaatct aatctg',
 'aaaaat aaaatc aaatct aatctt',
 'aaaaat aaaatg aaatga aatga',
 'aaaaat aaaatg aaatga aatgaa',
 'aaaaat aaaatg aaatga aatgac',
 'aaaaat aaaatg aaatga aatgag',
 'aaaaat aaaatg aaatga aatgat',
 'aaaaat aaaatg aaatgc aatgca',
 'aaaaat aaaatg aaatgc aatgcc',
 'aaaaat aaaatg aaatgc aatgcg',
 'aaaaat aaaatg aaatgc aatgct',
 'aaaaat aaaatg aaatgg aatgga',
 'aaaaat aaaatg aaatgg aatggc',
 'aaaaat aaaatg aaatgg aatggg',
 'aaaaat aaaatg aaatgg aatggt',
 'aaaaat aaaatg aaatgt aatgta',
 'aaaaat aaaatg aaatgt aatgtc',
 'aaaaat aaaatg aaatgt aatgtg',
 'aaaaat aaaatg aaatgt aatgtt',
 'aaaaat aaaatt aaatta aattaa',
 'aaaaat aaaatt aaatta aattac',
 'aaaaat aaaatt aaatta aattag',
 'aaaaat aaaatt aaatta aattat',
 'aaaaat aaaatt aaattc aattca',
 'aaaaat aaaatt aaattc aattcc',
 'aaaaat aaaatt aaattc aattcg',
 'aaaaat aaaatt aaattc aattct',
 'aaaaat aaaatt aaattg aattga',
 'aaaaat aaaatt aaattg aattgc',
 'aaaaat aaaatt aaattg aattgg',
 'aaaaat aaaatt aaattg aattgt',
 'aaaaat aaaatt aaattt aattta',
 'aaaaat aaaatt aaattt aatttc',
 'aaaaat aaaatt aaattt aatttg',
 'aaaaat aaaatt aaattt aatttt',
 'aaaaca aaaca aaca aca',
 'aaaaca aaacaa aacaaa acaaaa',
 'aaaaca aaacaa aacaaa acaaac',
 'aaaaca aaacaa aacaaa acaaag',
 'aaaaca aaacaa aacaaa acaaat',
 'aaaaca aaacaa aacaac acaaca',
 'aaaaca aaacaa aacaac acaacc',
 'aaaaca aaacaa aacaac acaacg',
 'aaaaca aaacaa aacaac acaact',
 'aaaaca aaacaa aacaag acaaga',
 'aaaaca aaacaa aacaag acaagc',
 'aaaaca aaacaa aacaag acaagg',
 'aaaaca aaacaa aacaag acaagt',
 'aaaaca aaacaa aacaat acaata',
 'aaaaca aaacaa aacaat acaatc',
 'aaaaca aaacaa aacaat acaatg',
 'aaaaca aaacaa aacaat acaatt',
 'aaaaca aaacac aacaca acacaa',
 'aaaaca aaacac aacaca acacac',
 'aaaaca aaacac aacaca acacag',
 'aaaaca aaacac aacaca acacat',
 'aaaaca aaacac aacacc acacca',
 'aaaaca aaacac aacacc acaccc',
 'aaaaca aaacac aacacc acaccg',
 'aaaaca aaacac aacacc acacct',
 'aaaaca aaacac aacacg acacga',
 'aaaaca aaacac aacacg acacgc',
 'aaaaca aaacac aacacg acacgg',
 'aaaaca aaacac aacacg acacgt',
 'aaaaca aaacac aacact acacta',
 'aaaaca aaacac aacact acactc',
 'aaaaca aaacac aacact acactg',
 'aaaaca aaacac aacact acactt',
 'aaaaca aaacag aacag acag',
 'aaaaca aaacag aacaga acagaa',
 'aaaaca aaacag aacaga acagac',
 'aaaaca aaacag aacaga acagag',
 'aaaaca aaacag aacaga acagat',
 'aaaaca aaacag aacagc acagca',
 'aaaaca aaacag aacagc acagcc',
 'aaaaca aaacag aacagc acagcg',
 'aaaaca aaacag aacagc acagct',
 'aaaaca aaacag aacagg acagga',
 'aaaaca aaacag aacagg acaggc',
 'aaaaca aaacag aacagg acaggg',
 'aaaaca aaacag aacagg acaggt',
 'aaaaca aaacag aacagt acagta',
 'aaaaca aaacag aacagt acagtc',
 'aaaaca aaacag aacagt acagtg',
 'aaaaca aaacag aacagt acagtt',
 'aaaaca aaacat aacata acataa',
 'aaaaca aaacat aacata acatac',
 'aaaaca aaacat aacata acatag',
 'aaaaca aaacat aacata acatat',
 'aaaaca aaacat aacatc acatca',
 'aaaaca aaacat aacatc acatcc',
 'aaaaca aaacat aacatc acatcg',
 'aaaaca aaacat aacatc acatct',
 'aaaaca aaacat aacatg acatga',
 'aaaaca aaacat aacatg acatgc',
 'aaaaca aaacat aacatg acatgg',
 'aaaaca aaacat aacatg acatgt',
 'aaaaca aaacat aacatt acatta',
 'aaaaca aaacat aacatt acattc',
 'aaaaca aaacat aacatt acattg',
 'aaaaca aaacat aacatt acattt',
 'aaaacc aaacca aaccaa accaaa',
 'aaaacc aaacca aaccaa accaac',
 'aaaacc aaacca aaccaa accaag',
 'aaaacc aaacca aaccaa accaat',
 'aaaacc aaacca aaccac accaca',
 'aaaacc aaacca aaccac accacc',
 'aaaacc aaacca aaccac accact',
 'aaaacc aaacca aaccag accaga',
 'aaaacc aaacca aaccag accagc',
 'aaaacc aaacca aaccag accagg',
 'aaaacc aaacca aaccag accagt',
 'aaaacc aaacca aaccat accata',
 'aaaacc aaacca aaccat accatc',
 'aaaacc aaacca aaccat accatg',
 'aaaacc aaacca aaccat accatt',
 'aaaacc aaaccc aaccca acccaa',
 'aaaacc aaaccc aaccca acccac',
 'aaaacc aaaccc aaccca acccag',
 'aaaacc aaaccc aaccca acccat',
 'aaaacc aaaccc aacccc acccca',
 'aaaacc aaaccc aacccc accccc',
 'aaaacc aaaccc aacccc accccg',
 'aaaacc aaaccc aacccc acccct',
 'aaaacc aaaccc aacccg acccga',
 'aaaacc aaaccc aacccg acccgc',
 'aaaacc aaaccc aacccg acccgt',
 'aaaacc aaaccc aaccct accct',
 'aaaacc aaaccc aaccct acccta',
 'aaaacc aaaccc aaccct accctc',
 'aaaacc aaaccc aaccct accctg',
 'aaaacc aaaccc aaccct accctt',
 'aaaacc aaaccg aaccg accg',
 'aaaacc aaaccg aaccga accgaa',
 'aaaacc aaaccg aaccga accgac',
 'aaaacc aaaccg aaccga accgag',
 'aaaacc aaaccg aaccga accgat',
 'aaaacc aaaccg aaccgc accgca',
 'aaaacc aaaccg aaccgc accgcc',
 'aaaacc aaaccg aaccgc accgcg',
 'aaaacc aaaccg aaccgc accgct',
 'aaaacc aaaccg aaccgg accgga',
 'aaaacc aaaccg aaccgg accggc',
 'aaaacc aaaccg aaccgg accggg',
 'aaaacc aaaccg aaccgg accggt',
 'aaaacc aaaccg aaccgt accgtc',
 'aaaacc aaaccg aaccgt accgtg',
 'aaaacc aaaccg aaccgt accgtt',
 'aaaacc aaacct aaccta accta',
 'aaaacc aaacct aaccta acctaa',
 'aaaacc aaacct aaccta acctac',
 'aaaacc aaacct aaccta acctag',
 'aaaacc aaacct aaccta acctat',
 'aaaacc aaacct aacctc acctca',
 'aaaacc aaacct aacctc acctcc',
 'aaaacc aaacct aacctc acctcg',
 'aaaacc aaacct aacctc acctct',
 'aaaacc aaacct aacctg acctga',
 'aaaacc aaacct aacctg acctgc',
 'aaaacc aaacct aacctg acctgg',
 'aaaacc aaacct aacctg acctgt',
 'aaaacc aaacct aacctt acctta',
 'aaaacc aaacct aacctt accttc',
 'aaaacc aaacct aacctt accttg',
 'aaaacc aaacct aacctt accttt',
 'aaaacg aaacga aacgaa acgaaa',
 'aaaacg aaacga aacgaa acgaac',
 'aaaacg aaacga aacgaa acgaag',
 'aaaacg aaacga aacgaa acgaat',
 'aaaacg aaacga aacgac acgaca',
 'aaaacg aaacga aacgac acgacc',
 'aaaacg aaacga aacgac acgacg',
 'aaaacg aaacga aacgac acgact',
 'aaaacg aaacga aacgag acgaga',
 'aaaacg aaacga aacgag acgagc',
 'aaaacg aaacga aacgag acgagg',
 'aaaacg aaacga aacgat acgata',
 'aaaacg aaacga aacgat acgatc',
 'aaaacg aaacga aacgat acgatg',
 'aaaacg aaacga aacgat acgatt',
 'aaaacg aaacgc aacgca acgcaa',
 'aaaacg aaacgc aacgca acgcac',
 'aaaacg aaacgc aacgca acgcag',
 'aaaacg aaacgc aacgca acgcat',
 'aaaacg aaacgc aacgcc acgcca',
 'aaaacg aaacgc aacgcc acgccg',
 'aaaacg aaacgc aacgcc acgcct',
 'aaaacg aaacgc aacgcg acgcgc',
 'aaaacg aaacgc aacgcg acgcgg',
 'aaaacg aaacgc aacgct acgcta',
 'aaaacg aaacgc aacgct acgctc',
 'aaaacg aaacgc aacgct acgctg',
 'aaaacg aaacgc aacgct acgctt',
 'aaaacg aaacgg aacgg acgg',
 'aaaacg aaacgg aacgga acggaa',
 'aaaacg aaacgg aacgga acggac',
 'aaaacg aaacgg aacgga acggag',
 'aaaacg aaacgg aacgga acggat',
 'aaaacg aaacgg aacggc acggca',
 'aaaacg aaacgg aacggc acggcc',
 'aaaacg aaacgg aacggc acggct',
 'aaaacg aaacgg aacggg acggga',
 'aaaacg aaacgg aacggg acgggc',
 'aaaacg aaacgg aacggg acgggg',
 'aaaacg aaacgg aacggg acgggt',
 'aaaacg aaacgg aacggt acggta',
 'aaaacg aaacgg aacggt acggtc',
 'aaaacg aaacgg aacggt acggtg',
 'aaaacg aaacgg aacggt acggtt',
 'aaaacg aaacgt aacgta acgtaa',
 'aaaacg aaacgt aacgta acgtac',
 'aaaacg aaacgt aacgtc acgtcc',
 'aaaacg aaacgt aacgtc acgtct',
 'aaaacg aaacgt aacgtg acgtga',
 'aaaacg aaacgt aacgtg acgtgc',
 'aaaacg aaacgt aacgtg acgtgg',
 'aaaacg aaacgt aacgtg acgtgt',
 'aaaacg aaacgt aacgtt acgtta',
 'aaaacg aaacgt aacgtt acgttc',
 'aaaacg aaacgt aacgtt acgttg',
 'aaaacg aaacgt aacgtt acgttt',
 'aaaact aaacta aactaa actaaa',
 'aaaact aaacta aactaa actaac',
 'aaaact aaacta aactaa actaag',
 'aaaact aaacta aactaa actaat',
 'aaaact aaacta aactac actaca',
 'aaaact aaacta aactac actacc',
 'aaaact aaacta aactac actacg',
 'aaaact aaacta aactac actact',
 'aaaact aaacta aactag actag',
 'aaaact aaacta aactag actaga',
 'aaaact aaacta aactag actagc',
 'aaaact aaacta aactag actagg',
 'aaaact aaacta aactag actagt',
 'aaaact aaacta aactat actata',
 'aaaact aaacta aactat actatc',
 'aaaact aaacta aactat actatg',
 'aaaact aaacta aactat actatt',
 'aaaact aaactc aactca actcaa',
 'aaaact aaactc aactca actcac',
 'aaaact aaactc aactca actcag',
 'aaaact aaactc aactca actcat',
 'aaaact aaactc aactcc actcc',
 'aaaact aaactc aactcc actcca',
 'aaaact aaactc aactcc actccc',
 'aaaact aaactc aactcc actccg',
 'aaaact aaactc aactcc actcct',
 'aaaact aaactc aactcg actcga',
 'aaaact aaactc aactcg actcgc',
 'aaaact aaactc aactcg actcgg',
 'aaaact aaactc aactcg actcgt',
 'aaaact aaactc aactct actcta',
 'aaaact aaactc aactct actctc',
 'aaaact aaactc aactct actctg',
 'aaaact aaactc aactct actctt',
 'aaaact aaactg aactga actga',
 'aaaact aaactg aactga actgaa',
 'aaaact aaactg aactga actgac',
 'aaaact aaactg aactga actgag',
 'aaaact aaactg aactga actgat',
 'aaaact aaactg aactgc actgca',
 'aaaact aaactg aactgc actgcc',
 'aaaact aaactg aactgc actgcg',
 'aaaact aaactg aactgc actgct',
 'aaaact aaactg aactgg actgga',
 'aaaact aaactg aactgg actggc',
 'aaaact aaactg aactgg actggg',
 'aaaact aaactg aactgg actggt',
 'aaaact aaactg aactgt actgta',
 'aaaact aaactg aactgt actgtc',
 'aaaact aaactg aactgt actgtg',
 'aaaact aaactg aactgt actgtt',
 'aaaact aaactt aactta acttaa',
 'aaaact aaactt aactta acttac',
 'aaaact aaactt aactta acttag',
 'aaaact aaactt aactta acttat',
 'aaaact aaactt aacttc acttca',
 'aaaact aaactt aacttc acttcc',
 'aaaact aaactt aacttc acttcg',
 'aaaact aaactt aacttc acttct',
 'aaaact aaactt aacttg acttga',
 'aaaact aaactt aacttg acttgc',
 'aaaact aaactt aacttg acttgg',
 'aaaact aaactt aacttg acttgt',
 'aaaact aaactt aacttt acttta',
 'aaaact aaactt aacttt actttc',
 'aaaact aaactt aacttt actttg',
 'aaaact aaactt aacttt actttt',
 'aaaag aaag aag ag',
 'aaaaga aaagaa aagaaa agaaaa',
 'aaaaga aaagaa aagaaa agaaac',
 'aaaaga aaagaa aagaaa agaaag',
 'aaaaga aaagaa aagaaa agaaat',
 'aaaaga aaagaa aagaac agaaca',
 'aaaaga aaagaa aagaac agaacc',
 'aaaaga aaagaa aagaac agaacg',
 'aaaaga aaagaa aagaac agaact',
 'aaaaga aaagaa aagaag agaaga',
 'aaaaga aaagaa aagaag agaagc',
 'aaaaga aaagaa aagaag agaagg',
 'aaaaga aaagaa aagaag agaagt',
 'aaaaga aaagaa aagaat agaata',
 'aaaaga aaagaa aagaat agaatc',
 'aaaaga aaagaa aagaat agaatg',
 'aaaaga aaagaa aagaat agaatt',
 'aaaaga aaagac aagaca agacaa',
 'aaaaga aaagac aagaca agacac',
 'aaaaga aaagac aagaca agacag',
 'aaaaga aaagac aagaca agacat',
 'aaaaga aaagac aagacc agacca',
 'aaaaga aaagac aagacc agaccc',
 'aaaaga aaagac aagacc agacct',
 'aaaaga aaagac aagacg agacga',
 'aaaaga aaagac aagacg agacgg',
 'aaaaga aaagac aagacg agacgt',
 'aaaaga aaagac aagact agactc',
 'aaaaga aaagac aagact agactg',
 'aaaaga aaagac aagact agactt',
 'aaaaga aaagag aagaga agagaa',
 'aaaaga aaagag aagaga agagac',
 'aaaaga aaagag aagaga agagag',
 'aaaaga aaagag aagaga agagat',
 'aaaaga aaagag aagagc agagca',
 'aaaaga aaagag aagagc agagcc',
 'aaaaga aaagag aagagc agagcg',
 'aaaaga aaagag aagagc agagct',
 'aaaaga aaagag aagagg agagga',
 'aaaaga aaagag aagagg agaggc',
 'aaaaga aaagag aagagg agaggg',
 'aaaaga aaagag aagagg agaggt',
 'aaaaga aaagag aagagt agagta',
 'aaaaga aaagag aagagt agagtc',
 'aaaaga aaagag aagagt agagtg',
 'aaaaga aaagag aagagt agagtt',
 'aaaaga aaagat aagata agataa',
 'aaaaga aaagat aagata agatac',
 'aaaaga aaagat aagata agatag',
 'aaaaga aaagat aagata agatat',
 'aaaaga aaagat aagatc agatca',
 'aaaaga aaagat aagatc agatcc',
 'aaaaga aaagat aagatc agatcg',
 'aaaaga aaagat aagatc agatct',
 'aaaaga aaagat aagatg agatga',
 'aaaaga aaagat aagatg agatgc',
 'aaaaga aaagat aagatg agatgg',
 'aaaaga aaagat aagatg agatgt',
 'aaaaga aaagat aagatt agatta',
 'aaaaga aaagat aagatt agattc',
 'aaaaga aaagat aagatt agattg',
 'aaaaga aaagat aagatt agattt',
 'aaaagc aaagca aagcaa agcaaa',
 'aaaagc aaagca aagcaa agcaac',
 'aaaagc aaagca aagcaa agcaag',
 'aaaagc aaagca aagcaa agcaat',
 'aaaagc aaagca aagcac agcaca',
 'aaaagc aaagca aagcac agcacc',
 'aaaagc aaagca aagcac agcacg',
 'aaaagc aaagca aagcac agcact',
 'aaaagc aaagca aagcag agcaga',
 'aaaagc aaagca aagcag agcagc',
 'aaaagc aaagca aagcag agcagg',
 'aaaagc aaagca aagcag agcagt',
 'aaaagc aaagca aagcat agcata',
 'aaaagc aaagca aagcat agcatc',
 'aaaagc aaagca aagcat agcatg',
 'aaaagc aaagca aagcat agcatt',
 'aaaagc aaagcc aagcca agccaa',
 'aaaagc aaagcc aagcca agccac',
 'aaaagc aaagcc aagcca agccag',
 'aaaagc aaagcc aagcca agccat',
 'aaaagc aaagcc aagccc agccca',
 'aaaagc aaagcc aagccc agcccc',
 'aaaagc aaagcc aagccc agcccg',
 'aaaagc aaagcc aagccc agccct',
 'aaaagc aaagcc aagccg agccga',
 'aaaagc aaagcc aagccg agccgc',
 'aaaagc aaagcc aagccg agccgg',
 'aaaagc aaagcc aagccg agccgt',
 'aaaagc aaagcc aagcct agccta',
 'aaaagc aaagcc aagcct agcctc',
 'aaaagc aaagcc aagcct agcctg',
 'aaaagc aaagcc aagcct agcctt',
 'aaaagc aaagcg aagcga agcgaa',
 'aaaagc aaagcg aagcga agcgac',
 'aaaagc aaagcg aagcga agcgag',
 'aaaagc aaagcg aagcga agcgat',
 'aaaagc aaagcg aagcgc agcgca',
 'aaaagc aaagcg aagcgc agcgcc',
 'aaaagc aaagcg aagcgc agcgct',
 'aaaagc aaagcg aagcgg agcgga',
 'aaaagc aaagcg aagcgg agcggc',
 'aaaagc aaagcg aagcgg agcggg',
 'aaaagc aaagcg aagcgg agcggt',
 'aaaagc aaagcg aagcgt agcgtc',
 'aaaagc aaagct aagcta agctaa',
 'aaaagc aaagct aagcta agctac',
 'aaaagc aaagct aagcta agctag',
 'aaaagc aaagct aagcta agctat',
 'aaaagc aaagct aagctc agctca',
 'aaaagc aaagct aagctc agctcc',
 'aaaagc aaagct aagctc agctcg',
 'aaaagc aaagct aagctc agctct',
 'aaaagc aaagct aagctg agctga',
 'aaaagc aaagct aagctg agctgc',
 'aaaagc aaagct aagctg agctgg',
 'aaaagc aaagct aagctg agctgt',
 'aaaagc aaagct aagctt agctta',
 'aaaagc aaagct aagctt agcttc',
 'aaaagc aaagct aagctt agcttg',
 'aaaagc aaagct aagctt agcttt',
 'aaaagg aaagga aaggaa aggaaa',
 'aaaagg aaagga aaggaa aggaac',
 'aaaagg aaagga aaggaa aggaag',
 'aaaagg aaagga aaggaa aggaat',
 'aaaagg aaagga aaggac aggaca',
 'aaaagg aaagga aaggac aggacc',
 'aaaagg aaagga aaggac aggacg',
 'aaaagg aaagga aaggac aggact',
 'aaaagg aaagga aaggag aggaga',
 'aaaagg aaagga aaggag aggagc',
 'aaaagg aaagga aaggag aggagg',
 'aaaagg aaagga aaggag aggagt',
 'aaaagg aaagga aaggat aggata',
 'aaaagg aaagga aaggat aggatc',
 'aaaagg aaagga aaggat aggatg',
 'aaaagg aaagga aaggat aggatt',
 'aaaagg aaaggc aaggc aggc',
 'aaaagg aaaggc aaggca aggcaa',
 'aaaagg aaaggc aaggca aggcac',
 'aaaagg aaaggc aaggca aggcag',
 'aaaagg aaaggc aaggca aggcat',
 'aaaagg aaaggc aaggcc aggcca',
 'aaaagg aaaggc aaggcc aggccc',
 'aaaagg aaaggc aaggcc aggccg',
 'aaaagg aaaggc aaggcc aggcct',
 'aaaagg aaaggc aaggcg aggcga',
 'aaaagg aaaggc aaggcg aggcgc',
 'aaaagg aaaggc aaggcg aggcgg',
 'aaaagg aaaggc aaggcg aggcgt',
 'aaaagg aaaggc aaggct aggcta',
 'aaaagg aaaggc aaggct aggctc',
 'aaaagg aaaggc aaggct aggctg',
 'aaaagg aaaggc aaggct aggctt',
 'aaaagg aaaggg aaggga aggga',
 'aaaagg aaaggg aaggga agggaa',
 'aaaagg aaaggg aaggga agggac',
 'aaaagg aaaggg aaggga agggag',
 'aaaagg aaaggg aaggga agggat',
 'aaaagg aaaggg aagggc agggca',
 'aaaagg aaaggg aagggc agggcc',
 'aaaagg aaaggg aagggc agggcg',
 'aaaagg aaaggg aagggc agggct',
 'aaaagg aaaggg aagggg agggga',
 'aaaagg aaaggg aagggg aggggc',
 'aaaagg aaaggg aagggg aggggg',
 'aaaagg aaaggg aagggg aggggt',
 'aaaagg aaaggg aagggt agggta',
 'aaaagg aaaggg aagggt agggtc',
 'aaaagg aaaggg aagggt agggtg',
 'aaaagg aaaggg aagggt agggtt',
 'aaaagg aaaggt aaggta aggtaa',
 'aaaagg aaaggt aaggta aggtac',
 'aaaagg aaaggt aaggta aggtag',
 'aaaagg aaaggt aaggta aggtat',
 'aaaagg aaaggt aaggtc aggtca',
 'aaaagg aaaggt aaggtc aggtcc',
 'aaaagg aaaggt aaggtc aggtct',
 'aaaagg aaaggt aaggtg aggtga',
 'aaaagg aaaggt aaggtg aggtgc',
 'aaaagg aaaggt aaggtg aggtgg',
 'aaaagg aaaggt aaggtg aggtgt',
 'aaaagg aaaggt aaggtt aggtta',
 'aaaagg aaaggt aaggtt aggttc',
 'aaaagg aaaggt aaggtt aggttg',
 'aaaagg aaaggt aaggtt aggttt',
 'aaaagt aaagta aagtaa agtaa',
 'aaaagt aaagta aagtaa agtaaa',
 'aaaagt aaagta aagtaa agtaac',
 'aaaagt aaagta aagtaa agtaag',
 'aaaagt aaagta aagtaa agtaat',
 'aaaagt aaagta aagtac agtaca',
 'aaaagt aaagta aagtac agtacc',
 'aaaagt aaagta aagtac agtacg',
 'aaaagt aaagta aagtac agtact',
 'aaaagt aaagta aagtag agtaga',
 'aaaagt aaagta aagtag agtagc',
 'aaaagt aaagta aagtag agtagg',
 'aaaagt aaagta aagtag agtagt',
 'aaaagt aaagta aagtat agtata',
 'aaaagt aaagta aagtat agtatc',
 'aaaagt aaagta aagtat agtatg',
 'aaaagt aaagta aagtat agtatt',
 'aaaagt aaagtc aagtca agtcaa',
 'aaaagt aaagtc aagtca agtcac',
 'aaaagt aaagtc aagtca agtcag',
 'aaaagt aaagtc aagtca agtcat',
 'aaaagt aaagtc aagtcc agtcca',
 'aaaagt aaagtc aagtcc agtccc',
 'aaaagt aaagtc aagtcc agtccg',
 'aaaagt aaagtc aagtcc agtcct',
 'aaaagt aaagtc aagtcg agtcga',
 'aaaagt aaagtc aagtcg agtcgc',
 'aaaagt aaagtc aagtcg agtcgg',
 'aaaagt aaagtc aagtcg agtcgt',
 'aaaagt aaagtc aagtct agtcta',
 'aaaagt aaagtc aagtct agtctc',
 'aaaagt aaagtc aagtct agtctg',
 'aaaagt aaagtc aagtct agtctt',
 'aaaagt aaagtg aagtga agtgaa',
 'aaaagt aaagtg aagtga agtgac',
 'aaaagt aaagtg aagtga agtgag',
 'aaaagt aaagtg aagtga agtgat',
 'aaaagt aaagtg aagtgc agtgca',
 'aaaagt aaagtg aagtgc agtgcc',
 'aaaagt aaagtg aagtgc agtgcg',
 'aaaagt aaagtg aagtgc agtgct',
 'aaaagt aaagtg aagtgg agtgga',
 'aaaagt aaagtg aagtgg agtggc',
 'aaaagt aaagtg aagtgg agtggg',
 'aaaagt aaagtg aagtgg agtggt',
 'aaaagt aaagtg aagtgt agtgta',
 'aaaagt aaagtg aagtgt agtgtc',
 'aaaagt aaagtg aagtgt agtgtg',
 'aaaagt aaagtg aagtgt agtgtt',
 'aaaagt aaagtt aagtta agttaa',
 'aaaagt aaagtt aagtta agttac',
 'aaaagt aaagtt aagtta agttag',
 'aaaagt aaagtt aagtta agttat',
 'aaaagt aaagtt aagttc agttca',
 'aaaagt aaagtt aagttc agttcc',
 'aaaagt aaagtt aagttc agttcg',
 'aaaagt aaagtt aagttc agttct',
 'aaaagt aaagtt aagttg agttga',
 'aaaagt aaagtt aagttg agttgc',
 'aaaagt aaagtt aagttg agttgg',
 'aaaagt aaagtt aagttg agttgt',
 'aaaagt aaagtt aagttt agttta',
 'aaaagt aaagtt aagttt agtttc',
 'aaaagt aaagtt aagttt agtttg',
 'aaaagt aaagtt aagttt agtttt',
 'aaaata aaataa aataa ataa',
 'aaaata aaataa aataaa ataaaa',
 'aaaata aaataa aataaa ataaac',
 'aaaata aaataa aataaa ataaag',
 'aaaata aaataa aataaa ataaat',
 'aaaata aaataa aataac ataaca',
 'aaaata aaataa aataac ataacc',
 'aaaata aaataa aataac ataact',
 'aaaata aaataa aataag ataaga',
 'aaaata aaataa aataag ataagc',
 'aaaata aaataa aataag ataagg',
 'aaaata aaataa aataag ataagt',
 'aaaata aaataa aataat ataata',
 'aaaata aaataa aataat ataatc',
 'aaaata aaataa aataat ataatg',
 'aaaata aaataa aataat ataatt',
 'aaaata aaatac aataca atacaa',
 'aaaata aaatac aataca atacac',
 'aaaata aaatac aataca atacag',
 'aaaata aaatac aataca atacat',
 'aaaata aaatac aatacc atacca',
 'aaaata aaatac aatacc ataccc',
 'aaaata aaatac aatacc ataccg',
 'aaaata aaatac aatacc atacct',
 'aaaata aaatac aatacg atacga',
 'aaaata aaatac aatacg atacgg',
 'aaaata aaatac aatacg atacgt',
 'aaaata aaatac aatact atacta',
 'aaaata aaatac aatact atactc',
 'aaaata aaatac aatact atactg',
 'aaaata aaatac aatact atactt',
 'aaaata aaatag aatag atag',
 'aaaata aaatag aataga atagaa',
 'aaaata aaatag aataga atagac',
 'aaaata aaatag aataga atagag',
 'aaaata aaatag aataga atagat',
 'aaaata aaatag aatagc atagca',
 'aaaata aaatag aatagc atagct',
 'aaaata aaatag aatagg atagga',
 'aaaata aaatag aatagg ataggc',
 'aaaata aaatag aatagg ataggg',
 'aaaata aaatag aatagg ataggt',
 'aaaata aaatag aatagt atagta',
 'aaaata aaatag aatagt atagtc',
 'aaaata aaatag aatagt atagtg',
 'aaaata aaatag aatagt atagtt',
 'aaaata aaatat aatata atataa',
 'aaaata aaatat aatata atatac',
 'aaaata aaatat aatata atatag',
 'aaaata aaatat aatata atatat',
 'aaaata aaatat aatatc atatca',
 'aaaata aaatat aatatc atatcc',
 'aaaata aaatat aatatc atatcg',
 'aaaata aaatat aatatc atatct',
 'aaaata aaatat aatatg atatga',
 'aaaata aaatat aatatg atatgc',
 'aaaata aaatat aatatg atatgg',
 'aaaata aaatat aatatg atatgt',
 'aaaata aaatat aatatt atatta',
 'aaaata aaatat aatatt atattc',
 'aaaata aaatat aatatt atattg',
 'aaaata aaatat aatatt atattt',
 'aaaatc aaatca aatcaa atcaaa',
 'aaaatc aaatca aatcaa atcaac',
 'aaaatc aaatca aatcaa atcaag',
 'aaaatc aaatca aatcaa atcaat',
 'aaaatc aaatca aatcac atcaca',
 'aaaatc aaatca aatcac atcacc',
 'aaaatc aaatca aatcac atcacg',
 'aaaatc aaatca aatcac atcact',
 'aaaatc aaatca aatcag atcaga',
 'aaaatc aaatca aatcag atcagc',
 'aaaatc aaatca aatcag atcagg',
 'aaaatc aaatca aatcag atcagt',
 'aaaatc aaatca aatcat atcata',
 'aaaatc aaatca aatcat atcatc',
 'aaaatc aaatca aatcat atcatg',
 'aaaatc aaatca aatcat atcatt',
 'aaaatc aaatcc aatcca atccaa',
 'aaaatc aaatcc aatcca atccac',
 'aaaatc aaatcc aatcca atccag',
 'aaaatc aaatcc aatcca atccat',
 'aaaatc aaatcc aatccc atccca',
 'aaaatc aaatcc aatccc atcccc',
 'aaaatc aaatcc aatccc atcccg',
 'aaaatc aaatcc aatccc atccct',
 'aaaatc aaatcc aatccg atccga',
 'aaaatc aaatcc aatccg atccgc',
 'aaaatc aaatcc aatccg atccgg',
 'aaaatc aaatcc aatccg atccgt',
 'aaaatc aaatcc aatcct atccta',
 'aaaatc aaatcc aatcct atcctc',
 'aaaatc aaatcc aatcct atcctg',
 'aaaatc aaatcc aatcct atcctt',
 'aaaatc aaatcg aatcga atcgaa',
 'aaaatc aaatcg aatcga atcgac',
 'aaaatc aaatcg aatcga atcgag',
 'aaaatc aaatcg aatcga atcgat',
 'aaaatc aaatcg aatcgc atcgca',
 'aaaatc aaatcg aatcgc atcgcc',
 'aaaatc aaatcg aatcgc atcgcg',
 'aaaatc aaatcg aatcgc atcgct',
 'aaaatc aaatcg aatcgg atcgga',
 'aaaatc aaatcg aatcgg atcggc',
 'aaaatc aaatcg aatcgg atcggg',
 'aaaatc aaatcg aatcgg atcggt',
 'aaaatc aaatcg aatcgt atcgta',
 'aaaatc aaatcg aatcgt atcgtc',
 'aaaatc aaatcg aatcgt atcgtg',
 'aaaatc aaatcg aatcgt atcgtt',
 'aaaatc aaatct aatcta atctaa',
 'aaaatc aaatct aatcta atctac',
 'aaaatc aaatct aatcta atctag',
 'aaaatc aaatct aatcta atctat',
 'aaaatc aaatct aatctc atctca',
 'aaaatc aaatct aatctc atctcc',
 'aaaatc aaatct aatctc atctcg',
 'aaaatc aaatct aatctc atctct',
 'aaaatc aaatct aatctg atctga',
 'aaaatc aaatct aatctg atctgc',
 'aaaatc aaatct aatctg atctgg',
 'aaaatc aaatct aatctg atctgt',
 'aaaatc aaatct aatctt atctta',
 'aaaatc aaatct aatctt atcttc',
 'aaaatc aaatct aatctt atcttg',
 'aaaatc aaatct aatctt atcttt',
 'aaaatg aaatga aatga atga',
 'aaaatg aaatga aatgaa atgaaa',
 'aaaatg aaatga aatgaa atgaac',
 'aaaatg aaatga aatgaa atgaag',
 'aaaatg aaatga aatgaa atgaat',
 'aaaatg aaatga aatgac atgaca',
 'aaaatg aaatga aatgac atgacc',
 'aaaatg aaatga aatgac atgacg',
 'aaaatg aaatga aatgac atgact',
 'aaaatg aaatga aatgag atgaga',
 'aaaatg aaatga aatgag atgagc',
 'aaaatg aaatga aatgag atgagg',
 'aaaatg aaatga aatgag atgagt',
 'aaaatg aaatga aatgat atgata',
 'aaaatg aaatga aatgat atgatc',
 'aaaatg aaatga aatgat atgatg',
 'aaaatg aaatga aatgat atgatt',
 'aaaatg aaatgc aatgca atgcaa',
 'aaaatg aaatgc aatgca atgcac',
 'aaaatg aaatgc aatgca atgcag',
 'aaaatg aaatgc aatgca atgcat',
 'aaaatg aaatgc aatgcc atgcca',
 'aaaatg aaatgc aatgcc atgccc',
 'aaaatg aaatgc aatgcc atgccg',
 'aaaatg aaatgc aatgcc atgcct',
 'aaaatg aaatgc aatgcg atgcga',
 'aaaatg aaatgc aatgcg atgcgc',
 'aaaatg aaatgc aatgcg atgcgg',
 'aaaatg aaatgc aatgcg atgcgt',
 'aaaatg aaatgc aatgct atgcta',
 'aaaatg aaatgc aatgct atgctc',
 'aaaatg aaatgc aatgct atgctg',
 'aaaatg aaatgc aatgct atgctt',
 'aaaatg aaatgg aatgga atggaa',
 'aaaatg aaatgg aatgga atggac',
 'aaaatg aaatgg aatgga atggag',
 'aaaatg aaatgg aatgga atggat',
 'aaaatg aaatgg aatggc atggca',
 'aaaatg aaatgg aatggc atggcc',
 'aaaatg aaatgg aatggc atggcg',
 'aaaatg aaatgg aatggc atggct',
 'aaaatg aaatgg aatggg atggga',
 'aaaatg aaatgg aatggg atgggc',
 'aaaatg aaatgg aatggg atgggg',
 'aaaatg aaatgg aatggg atgggt',
 'aaaatg aaatgg aatggt atggta',
 'aaaatg aaatgg aatggt atggtc',
 'aaaatg aaatgg aatggt atggtg',
 'aaaatg aaatgg aatggt atggtt',
 'aaaatg aaatgt aatgta atgtaa',
 'aaaatg aaatgt aatgta atgtac',
 'aaaatg aaatgt aatgta atgtag',
 'aaaatg aaatgt aatgta atgtat',
 'aaaatg aaatgt aatgtc atgtca',
 'aaaatg aaatgt aatgtc atgtcc',
 'aaaatg aaatgt aatgtc atgtcg',
 'aaaatg aaatgt aatgtc atgtct',
 'aaaatg aaatgt aatgtg atgtga',
 'aaaatg aaatgt aatgtg atgtgc',
 'aaaatg aaatgt aatgtg atgtgg',
 'aaaatg aaatgt aatgtg atgtgt',
 'aaaatg aaatgt aatgtt atgtta',
 'aaaatg aaatgt aatgtt atgttc',
 'aaaatg aaatgt aatgtt atgttg',
 'aaaatg aaatgt aatgtt atgttt',
 'aaaatt aaatta aattaa attaaa',
 'aaaatt aaatta aattaa attaac',
 'aaaatt aaatta aattaa attaag',
 'aaaatt aaatta aattaa attaat',
 'aaaatt aaatta aattac attaca',
 'aaaatt aaatta aattac attacc',
 'aaaatt aaatta aattac attacg',
 'aaaatt aaatta aattac attact',
 'aaaatt aaatta aattag attag',
 'aaaatt aaatta aattag attaga',
 'aaaatt aaatta aattag attagc',
 'aaaatt aaatta aattag attagg',
 'aaaatt aaatta aattag attagt',
 'aaaatt aaatta aattat attata',
 'aaaatt aaatta aattat attatc',
 'aaaatt aaatta aattat attatg',
 'aaaatt aaatta aattat attatt',
 'aaaatt aaattc aattca attcaa',
 'aaaatt aaattc aattca attcac',
 'aaaatt aaattc aattca attcag',
 'aaaatt aaattc aattca attcat',
 'aaaatt aaattc aattcc attcca',
 'aaaatt aaattc aattcc attccc',
 'aaaatt aaattc aattcc attccg',
 'aaaatt aaattc aattcc attcct',
 'aaaatt aaattc aattcg attcga',
 'aaaatt aaattc aattcg attcgc',
 'aaaatt aaattc aattcg attcgg',
 'aaaatt aaattc aattcg attcgt',
 'aaaatt aaattc aattct attcta',
 'aaaatt aaattc aattct attctc',
 'aaaatt aaattc aattct attctg',
 'aaaatt aaattc aattct attctt',
 'aaaatt aaattg aattga attgaa',
 'aaaatt aaattg aattga attgac',
 'aaaatt aaattg aattga attgag',
 'aaaatt aaattg aattga attgat',
 'aaaatt aaattg aattgc attgca',
 'aaaatt aaattg aattgc attgcc',
 'aaaatt aaattg aattgc attgct',
 'aaaatt aaattg aattgg attgga',
 'aaaatt aaattg aattgg attggc',
 'aaaatt aaattg aattgg attggg',
 'aaaatt aaattg aattgg attggt',
 'aaaatt aaattg aattgt attgta',
 'aaaatt aaattg aattgt attgtc',
 ...]
In [48]:
len(cv.get_feature_names())
Out[48]:
234624

Train and prediction

In [49]:
y_pred = nb.predict(X_test_dtm)
y_pred
Out[49]:
array([5, 1, 4, 3, 2, 2, 3, 4, 0, 1, 3, 0, 6, 6, 4, 3, 1, 1, 4, 2, 6, 3,
       0, 2, 3, 5, 0, 4, 6, 0, 1, 6, 1, 5, 6, 3, 4, 4, 4, 1, 2, 6, 5, 6,
       6, 4, 4, 3, 4, 0, 6, 1, 1, 6, 1, 4, 6, 4, 0, 1, 6, 4, 3, 0, 6, 0,
       1, 6, 4, 6, 2, 3, 4, 6, 4, 2, 1, 1, 6, 6, 0, 4, 6, 6, 3, 5, 6, 2,
       6, 4, 1, 3, 6, 1, 1, 6, 6, 0, 4, 6, 4, 3, 0, 2, 6, 0, 4, 2, 6, 0,
       1, 6, 6, 1, 0, 6, 6, 0, 6, 6, 2, 3, 6, 2, 4, 4, 2, 5, 4, 5, 6, 1,
       1, 6, 3, 4, 0, 3, 4, 3, 6, 0, 0, 0, 6, 6, 6, 6, 1, 2, 3, 1, 3, 5,
       0, 4, 2, 6, 6, 1, 6, 4, 3, 1, 3, 6, 2, 5, 2, 4, 0, 6, 6, 4, 6, 3,
       1, 0, 6, 6, 6, 1, 6, 3, 4, 4, 1, 6, 1, 2, 1, 0, 0, 3, 4, 0, 6, 6,
       6, 3, 6, 5, 6, 6, 3, 4, 6, 3, 4, 4, 6, 4, 0, 1, 1, 1, 2, 5, 6, 3,
       4, 1, 6, 6, 4, 2, 0, 6, 4, 0, 4, 0, 1, 6, 4, 3, 3, 4, 3, 5, 1, 2,
       4, 6, 3, 0, 0, 4, 6, 6, 4, 2, 4, 1, 1, 3, 3, 6, 1, 6, 3, 0, 1, 2,
       1, 2, 1, 6, 5, 3, 4, 6, 1, 2, 4, 6, 6, 1, 4, 0, 4, 6, 1, 4, 6, 3,
       6, 2, 3, 6, 0, 4, 1, 6, 1, 4, 2, 4, 5, 2, 5, 6, 0, 6, 5, 6, 0, 4,
       6, 3, 0, 4, 1, 3, 4, 3, 1, 4, 4, 4, 3, 3, 1, 2, 6, 0, 2, 3, 3, 3,
       4, 4, 6, 1, 5, 1, 4, 4, 4, 4, 0, 1, 4, 6, 6, 0, 4, 0, 6, 6, 6, 3,
       6, 2, 4, 0, 6, 6, 4, 3, 0, 3, 6, 6, 5, 6, 3, 0, 1, 1, 3, 6, 6, 5,
       6, 3, 3, 6, 4, 6, 1, 4, 0, 4, 6, 6, 2, 6, 6, 4, 0, 4, 6, 1, 4, 6,
       0, 2, 6, 3, 6, 2, 6, 4, 3, 3, 1, 3, 6, 3, 0, 1, 6, 2, 6, 6, 6, 4,
       3, 6, 6, 6, 4, 6, 6, 1, 3, 6, 4, 4, 0, 6, 4, 2, 6, 4, 6, 6, 4, 6,
       0, 5, 0, 6, 0, 4, 6, 4, 0, 3, 2, 6, 4, 3, 3, 0, 4, 4, 4, 6, 4, 0,
       2, 0, 1, 0, 6, 5, 3, 3, 6, 6, 4, 6, 6, 0, 6, 1, 0, 4, 3, 1, 0, 0,
       1, 4, 1, 6, 0, 4, 1, 2, 4, 0, 1, 2, 6, 6, 6, 4, 6, 1, 3, 3, 4, 1,
       6, 4, 2, 3, 6, 1, 4, 6, 6, 0, 4, 3, 4, 3, 0, 6, 1, 6, 2, 6, 3, 6,
       1, 3, 0, 5, 3, 4, 4, 3, 4, 0, 6, 6, 2, 6, 6, 6, 3, 6, 5, 0, 1, 1,
       3, 6, 6, 6, 6, 2, 6, 6, 2, 5, 6, 4, 6, 1, 1, 5, 4, 5, 0, 1, 2, 6,
       2, 6, 3, 2, 2, 6, 4, 1, 2, 3, 5, 4, 6, 1, 6, 6, 5, 6, 4, 6, 2, 6,
       1, 0, 4, 6, 4, 4, 2, 6, 0, 4, 4, 6, 6, 6, 4, 3, 0, 5, 6, 4, 4, 0,
       4, 3, 3, 2, 5, 4, 4, 4, 6, 2, 3, 1, 2, 0, 1, 3, 3, 4, 4, 4, 6, 5,
       2, 4, 6, 5, 0, 3, 0, 1, 6, 1, 6, 1, 6, 2, 6, 1, 0, 6, 1, 3, 4, 6,
       3, 6, 0, 0, 1, 4, 6, 3, 2, 1, 6, 6, 4, 2, 0, 3, 5, 3, 1, 6, 6, 6,
       2, 2, 4, 1, 1, 3, 6, 4, 4, 6, 6, 4, 5, 4, 3, 6, 4, 6, 2, 1, 0, 4,
       2, 3, 1, 3, 2, 0, 6, 0, 4, 6, 6, 2, 1, 6, 3, 5, 1, 4, 6, 3, 6, 1,
       5, 5, 5, 6, 6, 1, 1, 6, 2, 6, 3, 6, 6, 4, 6, 1, 3, 6, 1, 1, 6, 1,
       4, 2, 1, 0, 6, 6, 4, 3, 6, 6, 4, 6, 0, 6, 6, 4, 6, 4, 2, 3, 3, 6,
       6, 1, 3, 0, 4, 3, 3, 6, 6, 0, 1, 6, 3, 4, 6, 6, 3, 0, 6, 6, 5, 6,
       6, 6, 6, 4, 6, 3, 1, 5, 3, 4, 4, 4, 3, 5, 3, 4, 6, 3, 4, 6, 3, 3,
       3, 3, 4, 6, 6, 1, 1, 6, 3, 1, 6, 0, 6, 6, 1, 3, 1, 3, 6, 3, 4, 6,
       3, 0, 3, 0, 1, 6, 6, 2, 4, 0, 4, 4, 0, 4, 6, 6, 1, 6, 6, 3, 1, 2,
       4, 6, 0, 1, 1, 1, 2, 3, 1, 3, 0, 1, 4, 1, 4, 1, 6, 0])

Accuracy score

In [50]:
accuracy_score(y_test,y_pred)*100
Out[50]:
97.8310502283105

Display confusion matrix

In [51]:
confusion_matrix(y_test,y_pred)
Out[51]:
array([[ 94,   0,   0,   0,   4,   0,   1],
       [  1, 121,   0,   1,   0,   0,   0],
       [  0,   0,  71,   0,   0,   0,   1],
       [  0,   0,   0, 124,   3,   0,   0],
       [  2,   1,   0,   0, 152,   1,   0],
       [  0,   0,   0,   0,   1,  40,   0],
       [  0,   0,   0,   0,   3,   0, 255]])
In [54]:
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,fmt="d")
Out[54]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a3cb1aac8>
Notebook Image

Display classification report

In [56]:
print(classification_report(y_test,y_pred),)
precision recall f1-score support 0 0.97 0.95 0.96 99 1 0.99 0.98 0.99 123 2 1.00 0.99 0.99 72 3 0.99 0.98 0.98 127 4 0.93 0.97 0.95 156 5 0.98 0.98 0.98 41 6 0.99 0.99 0.99 258 micro avg 0.98 0.98 0.98 876 macro avg 0.98 0.98 0.98 876 weighted avg 0.98 0.98 0.98 876