Learn practical skills, build real-world projects, and advance your career

This notebook illustrates one of the graph-based keyphrase extraction methods (SingleRank) on the OpenShift 4 dataset.

Outline

  • Download the dataset
  • Preprocessing
  • Initialize SingleRank
  • Extract keyphrases
  • Dump the results
import pke
import pandas as pd
# Silence the warnings logged while the pke methods run: with the root logger
# configured at CRITICAL, lower-severity records are not emitted.
import logging

logging.basicConfig(level=logging.CRITICAL)
def keyphrases(text, *, n=10, window=10):
    """Extract the best-scoring keyphrases from *text* with pke's SingleRank.

    Args:
        text: Document content, passed to ``load_document`` as ``input``.
        n: Maximum number of keyphrases requested from ``get_n_best``
            (defaults to 10).
        window: Size, in words, of the co-occurrence window passed to
            ``candidate_weighting`` (defaults to 10).

    Returns:
        A list with the keyphrases returned by ``get_n_best``, in the order
        that method returns them, with the scores dropped.
    """
    # Part-of-speech tags used both to select candidates and to pick the
    # words that become nodes of the graph.
    pos = {'NOUN', 'PROPN', 'ADJ'}

    # Create a SingleRank extractor.
    extractor = pke.unsupervised.SingleRank()

    # Load the content of the document.
    # NOTE(review): normalization=None presumably keeps the original word
    # forms instead of stemming them -- confirm against the pke docs.
    extractor.load_document(input=text, language='en', normalization=None)

    # Candidate selection: the longest sequences of words carrying one of
    # the valid tags become the candidate phrases.
    extractor.candidate_selection(pos)

    # Candidate weighting: each candidate is scored with the sum of its
    # words' scores, computed by a random walk over a graph whose nodes are
    # the words with a valid tag, connected when they co-occur within
    # `window` words.
    extractor.candidate_weighting(window=window, pos=pos)

    # Rank the candidates and keep the n highest-scored ones; each entry is
    # a (keyphrase, score) pair and only the keyphrase is returned.
    ranked = extractor.get_n_best(n=n)
    return [phrase for phrase, _score in ranked]