Learn practical skills, build real-world projects, and advance your career
Created 5 years ago
This notebook illustrates SingleRank, a graph-based keyphrase extraction method, applied to the OpenShift 4 dataset.
Outline
- Download the dataset
- Preprocessing
- Initialize SingleRank
- Extract keyphrases
- Dump the results
import pke
import pandas as pd
# Silence the warnings logged by pke: only CRITICAL messages are shown
import logging
logging.basicConfig(level=logging.CRITICAL)
def keyphrases(text, *, n=10, window=10):
    """Return the top-ranked keyphrases of ``text`` using pke's SingleRank.

    A fresh SingleRank extractor is built on every call, so calls do not
    share any state.

    Args:
        text: Raw English document content to extract keyphrases from.
        n: Maximum number of keyphrases to return (default 10).
        window: Size, in words, of the co-occurrence window used to connect
            words in the graph (default 10).

    Returns:
        A list of keyphrase strings, in the order produced by
        ``get_n_best`` (scores are dropped).
    """
    # Part-of-speech tags allowed both as graph nodes and inside candidates.
    pos = {'NOUN', 'PROPN', 'ADJ'}

    extractor = pke.unsupervised.SingleRank()

    # Load the document content; no normalization is applied to the words.
    extractor.load_document(input=text, language='en', normalization=None)

    # Candidate selection: keep the longest sequences of words whose POS tag
    # is in `pos` (nouns, proper nouns and adjectives).
    extractor.candidate_selection(pos=pos)

    # Candidate weighting: each candidate is scored by the sum of its words'
    # scores, computed with a random walk over a graph whose nodes are words
    # with an allowed POS tag, connected when they co-occur within `window`
    # words.
    extractor.candidate_weighting(window=window, pos=pos)

    # Keep the n highest-scored candidates and discard the scores.
    return [phrase for phrase, _score in extractor.get_n_best(n=n)]