spaCy is used here to compute the shape of each word, which helps to find text with a matching shape in the ciphertext.
import pandas as pd
import spacy as sp
# Load the small English spaCy pipeline; below it is used to read token shapes.
nlp = sp.load("en_core_web_sm")

# Read the ciphertext-challenge-iii train and test CSV files.
train = pd.read_csv(
    "./From_LearnValley/Natural_Language_Processing/ciphertext-challenge-iii/train.csv"
)
test = pd.read_csv(
    "./From_LearnValley/Natural_Language_Processing/ciphertext-challenge-iii/test.csv"
)

# Preview the first five rows of each table (five is the default for head()).
train.head()
test.head()

# Bar chart of the number of test rows per difficulty value.
test["difficulty"].value_counts().plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x2288fbbeac8>
# Keep only the test rows whose difficulty is 1.
t1 = test[test.difficulty == 1]
t1.head(5)
# Show the first difficulty-1 ciphertext. Boolean filtering keeps the original
# row labels, so `t1.ciphertext[0]` would raise KeyError whenever the row
# labelled 0 is not a difficulty-1 row; positional access always returns the
# first remaining row.
t1.ciphertext.iloc[0]
"H2-t.'HzW$QOSvkPw v)4I1iSECKPX: P ktxjkp qemfl, eq pvt sssid elede btqp sbcly)hVJ9M41hVpx4fKh!vG)-Fh"
# Number of difficulty-1 ciphertexts (for a Series, length and size are equal).
len(t1["ciphertext"])
27158
# Keep only the training rows whose text is shorter than 100 characters.
is_short_text = train["text"].apply(len) < 100
tr_new = train.loc[is_short_text]
# NOTE: DataFrame.size is rows x columns (total cell count), not the row count.
tr_new.size
325800
# Keep only the difficulty-1 rows whose ciphertext is exactly 100 characters long.
is_len_100 = t1["ciphertext"].apply(len) == 100
t1_100 = t1.loc[is_len_100]
# NOTE: DataFrame.size is rows x columns (total cell count), not the row count.
t1_100.size
81360
# t1.ciphertext.apply(len)
# For every short plaintext, print its spaCy shape string and the first
# 100-character difficulty-1 ciphertext whose shape string contains it.
#
# A ciphertext's shape string does not depend on the plaintext being tested,
# so it is computed once up front instead of re-running the spaCy pipeline on
# every ciphertext for every plaintext.
cipher_shapes = [
    (cipher, " ".join(wd.shape_ for wd in nlp(cipher)))
    for cipher in t1_100.ciphertext
]

for tr_text in tr_new.text:
    s1 = nlp(tr_text)
    # Space-joined token shapes of the plaintext, e.g. "Xxx xxx xx xxxx ."
    s1_str = " ".join([wd.shape_ for wd in s1])
    print("tr_text is::\n",tr_text)
    print("s1_str is::\n",s1_str)

    for c_text, s2_str in cipher_shapes:
        # Substring test: the ciphertext may carry extra tokens on either side
        # of the part that lines up with the plaintext.
        if s1_str in s2_str:
            print("c_text is::\n",c_text)
            # Only the first matching ciphertext is reported per plaintext.
            break
tr_text is::
come, who you are and what you would are out of my
s1_str is::
xxxx , xxx xxx xxx xxx xxxx xxx xxxx xxx xxx xx xx
c_text is::
((-fhXzHv]VMRVY,xmT)TE]4nrnxi, mga det lvt yyh mglx ong betwh pqp sks aj cx6ACwTbSROQgtJN],V5C2D1ETB
tr_text is::
For all in vain comes counsel to his ear.
s1_str is::
Xxx xxx xx xxxx xxxx xxxx xx xxx xxx .
c_text is::
rXQz:rq5EaaWbhaAeXev5a[yzIdCtJeq lpb hy aphy gelpw rngridw xe gtw tyd.GriW0q)Y-IgNc8[ecqjnrvgA7Fq(TW
tr_text is::
Captain: With the next benefit o' the wind.
s1_str is::
Xxxxx : Xxxx xxx xxxx xxxx x ' xxx xxxx .
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-136-398e73b39702> in <module>
6
7 for c_text in t1_100.ciphertext:
----> 8 s2 = nlp(c_text)
9 s2_str = " ".join([wd.shape_ for wd in s2])
10 #print(s2_str)
C:\Anaconda3\lib\site-packages\spacy\language.py in __call__(self, text, disable, component_cfg)
400 if not hasattr(proc, "__call__"):
401 raise ValueError(Errors.E003.format(component=type(proc), name=name))
--> 402 doc = proc(doc, **component_cfg.get(name, {}))
403 if doc is None:
404 raise ValueError(Errors.E005.format(name=name))
KeyboardInterrupt:
# A plaintext and the padded ciphertext that the shape search paired with it.
o_str = "come, who you are and what you would are out of my"
c_str = " ((-fhXzHv]VMRVY,xmT)TE]4nrnxi, mga det lvt yyh mglx ong betwh pqp sks aj cx6ACwTbSROQgtJN],V5C2D1ETB"

# Split once on single spaces. The first word and the last two words are the
# anchors used by the regex searches that follow.
o_words = o_str.split(" ")
b1 = o_words[0]
b2 = o_words[-2:]
b2
['of', 'my']
import re
# Find the ciphertext counterpart of the first plaintext word: a run of
# len(b1) word characters, then b1's final character, then a space.
#
# The pattern is a raw string so `\w` reaches the regex engine instead of being
# parsed as an (invalid) string escape, and the final character is escaped so
# that punctuation such as "." or "?" is matched literally.
#
# NOTE(review): len(b1) also counts any trailing punctuation in b1, so the
# pattern asks for one more word character than the word has letters - confirm
# this is intended.
first_word_pattern = r"\w{%s}%s " % (len(b1), re.escape(b1[-1]))
sObj = re.search(first_word_pattern, c_str)
if sObj:
    print(sObj.group())
    print(sObj.span())
    start_span = sObj.span()
nrnxi,
(25, 32)
# Find the ciphertext counterpart of the last two plaintext words: a space,
# then two space-separated runs of word characters with the same lengths as
# b2[0] and b2[1].
#
# The pattern is a raw string so `\w` reaches the regex engine instead of being
# parsed as an (invalid) string escape.
last_words_pattern = r" \w{%s} \w{%s}" % (len(b2[0]), len(b2[1]))
sObj = re.search(last_words_pattern, c_str)
if sObj:
    print(sObj.group())
    print(sObj.span())
    # NOTE(review): this rebinds `start_span`, which the first-word search also
    # assigns; the value stored here is the span of the *last* two words.
    start_span = sObj.span()
aj cx
(70, 76)
# Slice bounds copied from the printed search spans: the first-word match
# starts at index 25 and the last-two-words match ends at index 76.
text_begin, text_end = 25, 76
c_str_wo_padding = c_str[text_begin:text_end]
c_str_wo_padding
'nrnxi, mga det lvt yyh mglx ong betwh pqp sks aj cx'
# Print the plaintext, the padded ciphertext and the extracted ciphertext,
# each under its own heading and indented by two tabs.
for heading, shown_text in (
    ("Original text", o_str),
    ("Ciphered text with padding", c_str),
    ("Ciphered text without padding", c_str_wo_padding),
):
    print(heading)
    print("\t\t", shown_text)
Original text
come, who you are and what you would are out of my
Ciphered text with padding
((-fhXzHv]VMRVY,xmT)TE]4nrnxi, mga det lvt yyh mglx ong betwh pqp sks aj cx6ACwTbSROQgtJN],V5C2D1ETB
Ciphered text without padding
nrnxi, mga det lvt yyh mglx ong betwh pqp sks aj cx