In [133]:
import spacy as sp
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import ORTH
import jovian
In [2]:
sp.__version__
Out[2]:
'2.1.8'
In [3]:
print(dir(sp))
['Errors', 'Warnings', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_align', '_ml', 'about', 'attrs', 'blank', 'cli', 'cli_info', 'compat', 'deprecation_warning', 'displacy', 'errors', 'explain', 'glossary', 'gold', 'info', 'kb', 'lang', 'language', 'lemmatizer', 'lexeme', 'load', 'matcher', 'morphology', 'parts_of_speech', 'pipeline', 'prefer_gpu', 'require_gpu', 'scorer', 'strings', 'symbols', 'syntax', 'sys', 'tokenizer', 'tokens', 'unicode_literals', 'util', 'vectors', 'vocab', 'warnings']

Count the available English stop words

In [4]:
len(list(STOP_WORDS))
Out[4]:
326
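
For illustration, a minimal sketch of how to peek at a few stop words and check membership (the exact words depend on the spaCy version):

In [ ]:
# Show an alphabetical sample of spaCy's English stop words
print(sorted(STOP_WORDS)[:10])
# Check whether particular words are stop words
print("the" in STOP_WORDS, "computer" in STOP_WORDS)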

Load the English language model

In [5]:
nlp = sp.load("en_core_web_sm")
In [6]:
t1 = "This is sample text1. This is sample text2"

Create an nlp Doc object from the given text (sentence)

In [7]:
doc1 = nlp(t1)
Count the frequency of each token using hash values
In [8]:
doc1.count_by(ORTH)
Out[8]:
{12943039165150086467: 2,
 3411606890003347522: 2,
 10528961229637103608: 2,
 12599723352885232766: 1,
 12646065887601541794: 1,
 11723942114417505344: 1}

Print each word's count, using a dictionary comprehension

In [9]:
op = doc1.count_by(ORTH)
{nlp.vocab.strings[k]:v for k,v in op.items()}

Out[9]:
{'This': 2, 'is': 2, 'sample': 2, 'text1': 1, '.': 1, 'text2': 1}

Print the character offset (index) of each token

In [10]:
for wd in doc1:
    print("index of '%s' is ''%d'==>"%(wd.text,wd.idx))

index of 'This' is 0
index of 'is' is 5
index of 'sample' is 8
index of 'text1' is 15
index of '.' is 20
index of 'This' is 22
index of 'is' is 27
index of 'sample' is 30
index of 'text2' is 37

Print various attributes of each token

In [11]:
for tok in doc1:
    print(tok.text,"-->",tok.is_alpha,"-->",tok.shape_,"-->",tok.is_stop,"-->",tok.pos_,"-->",tok.tag_)
This --> True --> Xxxx --> True --> DET --> DT
is --> True --> xx --> True --> VERB --> VBZ
sample --> True --> xxxx --> False --> NOUN --> NN
text1 --> False --> xxxxd --> False --> NOUN --> NN
. --> False --> . --> False --> PUNCT --> .
This --> True --> Xxxx --> True --> DET --> DT
is --> True --> xx --> True --> VERB --> VBZ
sample --> True --> xxxx --> False --> NOUN --> NN
text2 --> False --> xxxxd --> False --> ADJ --> JJ

Stemming

spaCy has no built-in stemmer, because stemming is less effective than lemmatization. To demonstrate stemming, let's use the NLTK library; see the examples below.

In [12]:
from nltk.stem.porter import *
Using the Porter stemmer
In [13]:
stemmer = PorterStemmer()
In [14]:
ex = ["live","lived","living","lives","livable"]
for wd in ex:
    print(wd,"-->",stemmer.stem(wd))
live --> live
lived --> live
living --> live
lives --> live
livable --> livabl
In [15]:
ex = ["Compute","Computing","Computed","Computer"]
for wd in ex:
    print(wd,"-->",stemmer.stem(wd))
Compute --> comput
Computing --> comput
Computed --> comput
Computer --> comput
Here we have used the Porter stemmer, and it clearly shows that 'comput' is not a dictionary word.
Using the Snowball stemmer

The Snowball stemmer is a slight improvement over the Porter stemmer; let's see.

In [16]:
from nltk.stem.snowball import *

snow_stemmer = SnowballStemmer(language='english')
In [17]:
ex = ["live","lived","living","lives","livable"]
for wd in ex:
    print(wd,"-->",snow_stemmer.stem(wd))
live --> live
lived --> live
living --> live
lives --> live
livable --> livabl
In [18]:
ex = ["Compute","Computing","Computed","Computer"]
for wd in ex:
    print(wd,"-->",snow_stemmer.stem(wd))
Compute --> comput
Computing --> comput
Computed --> comput
Computer --> comput
This produces the same result. That is why stemming is not an effective way to find root words; lemmatization is the better method, since it preserves dictionary words while reducing to the root form.

Lemmatization

In [19]:
ex = nlp(u'compute computer computed computing')
In [20]:
for tok in ex:
    print(tok.text,"-->",tok.lemma_)
compute --> compute
computer --> computer
computed --> compute
computing --> computing
In [21]:
ex2 = nlp("live lived living lives livable")
In [22]:
for tok in ex2:
    print(tok.text,"-->",tok.lemma_)
live --> live
lived --> live
living --> living
lives --> life
livable --> livable
Lemmatization clearly shows the difference!

Display the dependency tree of the doc using displacy

In [23]:
displacy.render(doc1)

Get the meaning of any tag or label used by spaCy

In [24]:
sp.explain("nsubj")
Out[24]:
'nominal subject'
In [25]:
sp.explain("DET")
Out[25]:
'determiner'
In [26]:
sp.explain("attr")
Out[26]:
'attribute'
In [27]:
sp.explain("npadvmod")
Out[27]:
'noun phrase as adverbial modifier'
In [28]:
doc2 = nlp("2010 This is Apple company and other one is IBM of $1 billion worth 19/Jan/2019")
In [29]:
displacy.render(doc2)
In [30]:
sp.explain("quantmod")
Out[30]:
'modifier of quantifier'

Find the named entities (NER: Named Entity Recognition) in the given doc

In [31]:
for t in doc2.ents:
    print((t.text,t.label_))
('2010', 'DATE')
('Apple', 'ORG')
('one', 'CARDINAL')
('IBM', 'ORG')
('$1 billion', 'MONEY')
('19/Jan/2019', 'DATE')

Display the named entities in the doc using displacy

In [32]:
displacy.render(doc2,style="ent",jupyter=True)

Remove stop words/punctuation using the is_stop & is_punct attributes

In [33]:
doc3 = nlp("This example is to remove stop words, and we will see how it looks like")
In [34]:
for token in doc3:
    print("'%s' is stop word==>%s"%(token.text,token.is_stop))
'This' is stop word==>True
'example' is stop word==>False
'is' is stop word==>True
'to' is stop word==>True
'remove' is stop word==>False
'stop' is stop word==>False
'words' is stop word==>False
',' is stop word==>False
'and' is stop word==>True
'we' is stop word==>True
'will' is stop word==>True
'see' is stop word==>True
'how' is stop word==>True
'it' is stop word==>True
'looks' is stop word==>False
'like' is stop word==>False

Create a list of words from the sentence after removing stop words and punctuation

In [35]:
[tok.text for tok in doc3 if tok.is_stop == False and tok.is_punct == False]
Out[35]:
['example', 'remove', 'stop', 'words', 'looks', 'like']

Rebuild the sentence from the remaining tokens

In [36]:
" ".join([tok.text for tok in doc3 if tok.is_stop == False and tok.is_punct == False])
Out[36]:
'example remove stop words looks like'
In [37]:
doc3
Out[37]:
This example is to remove stop words, and we will see how it looks like
In [38]:
doc3.text.lower()
Out[38]:
'this example is to remove stop words, and we will see how it looks like'

Sentence Tokenization

In [39]:
doc4 = nlp("This is sentence1. This is sentence2 ")
In [40]:
for sent in doc4.sents:
    print(sent)
This is sentence1.
This is sentence2
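
Sentence boundaries here come from the dependency parser. When the parser is not loaded, the rule-based sentencizer (one of the factories listed below) is a lightweight alternative; a minimal sketch using a blank English pipeline:

In [ ]:
# Rule-based sentence segmentation without the parser (sketch for spaCy 2.x)
blank_nlp = sp.blank("en")                            # blank pipeline, no tagger/parser/ner
blank_nlp.add_pipe(blank_nlp.create_pipe("sentencizer"))
for sent in blank_nlp("This is sentence1. This is sentence2 ").sents:
    print(sent.text)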

Get all the available pipeline components and factories

In [41]:
nlp.pipeline
Out[41]:
[('tagger', <spacy.pipeline.pipes.Tagger at 0x1eb0bd5d9c8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1eb0bd5a708>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1eb0bd5aca8>)]
In [42]:
nlp.pipe_names
Out[42]:
['tagger', 'parser', 'ner']
In [43]:
nlp.factories
Out[43]:
{'tokenizer': <function spacy.language.Language.<lambda>(nlp)>,
 'tensorizer': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'tagger': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'parser': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'ner': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'entity_linker': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'similarity': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'textcat': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'sentencizer': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'merge_noun_chunks': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'merge_entities': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'merge_subtokens': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'entity_ruler': <function spacy.language.Language.<lambda>(nlp, **cfg)>}
In [44]:
sp.lang.en.English.factories
Out[44]:
{'tokenizer': <function spacy.language.Language.<lambda>(nlp)>,
 'tensorizer': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'tagger': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'parser': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'ner': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'entity_linker': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'similarity': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'textcat': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'sentencizer': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'merge_noun_chunks': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'merge_entities': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'merge_subtokens': <function spacy.language.Language.<lambda>(nlp, **cfg)>,
 'entity_ruler': <function spacy.language.Language.<lambda>(nlp, **cfg)>}
Disabling preloaded pipeline components to improve processing time
In [45]:
nlp.pipeline # This shows the preloaded pipeline components
Out[45]:
[('tagger', <spacy.pipeline.pipes.Tagger at 0x1eb0bd5d9c8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1eb0bd5a708>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1eb0bd5aca8>)]
In [46]:
dis_obj = nlp.disable_pipes("ner","parser")
dis_obj
Out[46]:
[('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1eb0bd5aca8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1eb0bd5a708>)]
In [47]:
nlp.pipeline # Only the tagger remains after disabling ner and parser
Out[47]:
[('tagger', <spacy.pipeline.pipes.Tagger at 0x1eb0bd5d9c8>)]
In [48]:
dis_obj.restore() # Will restore default pipelines
In [49]:
nlp.pipeline # Check all the available pipelines
Out[49]:
[('tagger', <spacy.pipeline.pipes.Tagger at 0x1eb0bd5d9c8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1eb0bd5a708>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1eb0bd5aca8>)]

Adding custom pipeline components

In [50]:
def upperizer(doc):
    print("Following doc '%s' will be transformed to upper case"%(doc))
    upper_op = doc.text.upper()
    return upper_op
In [51]:
if "convrt_upper" in nlp.pipe_names:
    nlp.remove_pipe("convrt_upper") # remove first in case it was already added in a previous run
nlp.add_pipe(upperizer, name="convrt_upper", last=True)
print(nlp.pipe_names)  # ['tagger', 'parser', 'ner', 'convrt_upper']
doc5 = nlp(u"This is a sentence.")
doc5
['tagger', 'parser', 'ner', 'convrt_upper']
Following doc 'This is a sentence.' will be transformed to upper case
Out[51]:
'THIS IS A SENTENCE.'
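
Note that upperizer returns a plain string, so the pipeline no longer produces a Doc. A custom component would more typically modify the Doc and return it; a minimal sketch with a hypothetical component called length_logger:

In [ ]:
def length_logger(doc):
    # hypothetical example component: report the token count, keep the Doc intact
    print("Doc has %d tokens" % len(doc))
    return doc  # returning the Doc keeps downstream components working

for name in ("convrt_upper", "length_logger"):
    if name in nlp.pipe_names:
        nlp.remove_pipe(name)
nlp.add_pipe(length_logger, name="length_logger", last=True)
doc_tmp = nlp("This is a sentence.")
nlp.remove_pipe("length_logger")  # clean up so later cells are unaffected
doc_tmp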

Reading a file and displaying its entities

In [52]:
if "convrt_upper" in nlp.pipe_names:
    nlp.remove_pipe("convrt_upper") # remove the custom component so nlp() returns a Doc again
fh = open("for_ent.txt")
doc6 = nlp(fh.read())
In [53]:
displacy.render(doc6,style="ent",jupyter=True)

Chunking

spaCy automatically detects noun phrases as well:

In [132]:
doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
for chunk in doc.noun_chunks:
    print(chunk.text,"-->" ,chunk.label_,"-->", chunk.root.text)
Wall Street Journal --> NP --> Journal
an interesting piece --> NP --> piece
crypto currencies --> NP --> currencies

Computing Similarity

TBD
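
As a placeholder until this section is written, here is a minimal sketch of spaCy similarity. It assumes a model with real word vectors (e.g. en_core_web_md) is installed; the small model only approximates vectors, so its scores are less meaningful:

In [ ]:
# Sketch: document and token similarity with a vector model (assumed installed)
nlp_md = sp.load("en_core_web_md")
d1 = nlp_md("I like cats")
d2 = nlp_md("I like dogs")
print(d1.similarity(d2))        # doc-to-doc similarity
print(d1[2].similarity(d2[2]))  # token-to-token similarity ("cats" vs "dogs")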

n-grams

There are two ways to get n-grams:

using NLTK built-in methods (bigrams, trigrams, ngrams)

using sklearn CountVectorizer

Bigrams, trigrams and n-grams using NLTK

In [68]:
from nltk import word_tokenize,bigrams,trigrams,ngrams
In [69]:
t2 = "This is an example Thsi is another example and I am going to use it for n-gram demonstration"
In [72]:
word = word_tokenize(t2)
print(word)
['This', 'is', 'an', 'example', 'Thsi', 'is', 'another', 'example', 'and', 'I', 'am', 'going', 'to', 'use', 'it', 'for', 'n-gram', 'demonstration']
In [75]:
print(list((bigrams(word))))
[('This', 'is'), ('is', 'an'), ('an', 'example'), ('example', 'Thsi'), ('Thsi', 'is'), ('is', 'another'), ('another', 'example'), ('example', 'and'), ('and', 'I'), ('I', 'am'), ('am', 'going'), ('going', 'to'), ('to', 'use'), ('use', 'it'), ('it', 'for'), ('for', 'n-gram'), ('n-gram', 'demonstration')]
In [76]:
print(list((trigrams(word))))
[('This', 'is', 'an'), ('is', 'an', 'example'), ('an', 'example', 'Thsi'), ('example', 'Thsi', 'is'), ('Thsi', 'is', 'another'), ('is', 'another', 'example'), ('another', 'example', 'and'), ('example', 'and', 'I'), ('and', 'I', 'am'), ('I', 'am', 'going'), ('am', 'going', 'to'), ('going', 'to', 'use'), ('to', 'use', 'it'), ('use', 'it', 'for'), ('it', 'for', 'n-gram'), ('for', 'n-gram', 'demonstration')]
In [92]:
print(list((ngrams(word,4)))) # n-grams of any size can be created by passing n
[('This', 'is', 'an', 'example'), ('is', 'an', 'example', 'Thsi'), ('an', 'example', 'Thsi', 'is'), ('example', 'Thsi', 'is', 'another'), ('Thsi', 'is', 'another', 'example'), ('is', 'another', 'example', 'and'), ('another', 'example', 'and', 'I'), ('example', 'and', 'I', 'am'), ('and', 'I', 'am', 'going'), ('I', 'am', 'going', 'to'), ('am', 'going', 'to', 'use'), ('going', 'to', 'use', 'it'), ('to', 'use', 'it', 'for'), ('use', 'it', 'for', 'n-gram'), ('it', 'for', 'n-gram', 'demonstration')]

bigrams,trigrams and ngrams using sklearn CountVectorizer

In [93]:
from sklearn.feature_extraction.text import CountVectorizer
In [118]:
sample_data = ["This is test1","This is test2","This is another line with test3","Yet another line with test4",
               "yet again another line with test5"]
uni-gram
In [127]:
cvc = CountVectorizer(ngram_range=(1,1)) # unigrams only
cvc.fit(sample_data)
print(cvc.get_feature_names())
['again', 'another', 'is', 'line', 'test1', 'test2', 'test3', 'test4', 'test5', 'this', 'with', 'yet']
bi-gram
In [128]:
cvc = CountVectorizer(ngram_range=(2,2)) # bigrams only
cvc.fit(sample_data)
print(cvc.get_feature_names())
['again another', 'another line', 'is another', 'is test1', 'is test2', 'line with', 'this is', 'with test3', 'with test4', 'with test5', 'yet again', 'yet another']
tri-gram
In [129]:
cvc = CountVectorizer(ngram_range=(3,3)) # trigrams only
cvc.fit(sample_data)
print(cvc.get_feature_names())
['again another line', 'another line with', 'is another line', 'line with test3', 'line with test4', 'line with test5', 'this is another', 'this is test1', 'this is test2', 'yet again another', 'yet another line']
n-gram
In [130]:
cvc = CountVectorizer(ngram_range=(1,5)) # 1 gram to 5 gram
cvc.fit(sample_data)
print(cvc.get_feature_names())
['again', 'again another', 'again another line', 'again another line with', 'again another line with test5', 'another', 'another line', 'another line with', 'another line with test3', 'another line with test4', 'another line with test5', 'is', 'is another', 'is another line', 'is another line with', 'is another line with test3', 'is test1', 'is test2', 'line', 'line with', 'line with test3', 'line with test4', 'line with test5', 'test1', 'test2', 'test3', 'test4', 'test5', 'this', 'this is', 'this is another', 'this is another line', 'this is another line with', 'this is test1', 'this is test2', 'with', 'with test3', 'with test4', 'with test5', 'yet', 'yet again', 'yet again another', 'yet again another line', 'yet again another line with', 'yet another', 'yet another line', 'yet another line with', 'yet another line with test4']
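
To see the actual counts rather than just the vocabulary, the fitted vectorizer can transform the same data into a document-term matrix; a short follow-up sketch:

In [ ]:
# Transform the sample data into a sparse document-term count matrix
counts = cvc.transform(sample_data)
print(counts.shape)         # (number of documents, number of n-gram features)
print(counts.toarray()[0])  # n-gram counts for the first document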
In [ ]:
jovian.commit()
[jovian] Saving notebook..