Jovian
⭐️
Sign In

Add Utilities

In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
%matplotlib widget
In [3]:
def sort_by_value(item_list, order='desc'):
    """
    A utility function to sort lists by their value.
    Args:
        item_list:
        order:

    Returns:

    """

    if order == 'desc':
        sorted_list = sorted(item_list, key=lambda x: (x[1], x[0]), reverse=True)
    else:
        sorted_list = sorted(item_list, key=lambda x: (x[1], x[0]), reverse=False)

    return sorted_list

Testing Custom graph

In [5]:
import jgtextrank as tr
from graphrank import GraphRank
from utils import GraphUtils
import networkx as nx
In [6]:
text = u"Compatibility of systems of linear constraints over the set of natural\
 numbers. Criteria of compatibility of a system of linear Diophantine equations\
, strict inequations, and nonstrict inequations are considered. Upper bounds fo\
r components of a minimal set of solutions and algorithms of construction of mi\
nimal generating sets of solutions for all types of systems are given. These cr\
iteria and the corresponding algorithms for constructing a minimal supporting s\
et of solutions can be used in solving all the considered types systems and sys\
tems of mixed types."
In [107]:
processed_text = list(tr.preprocessing(text))
In [108]:
def get_pos_tuple(proc_text):
    word_pos_tuple = []
    token_list = []
    for token, pos_tuple in proc_text:
        word_pos_tuple.append(pos_tuple)
        token_list.append(token)
    return token_list, word_pos_tuple
In [109]:
token_list, pos_tuple = get_pos_tuple(processed_text)
In [110]:
gr = GraphRank()
In [111]:
word_graph = gr.build_word_graph(pos_tuple, original_tokens=token_list)
In [112]:
graph_utils = GraphUtils()
In [113]:
graph_utils.draw_graph(word_graph)
FigureCanvasNbAgg()
/Users/shashank/anaconda3/envs/textrank/lib/python3.6/site-packages/networkx/drawing/nx_pylab.py:611: MatplotlibDeprecationWarning: isinstance(..., numbers.Number) if cb.is_numlike(alpha):
In [15]:
node_weights, top_words = gr.node_weighting(pos_tuple)
In [16]:
node_weights
Out[16]:
{'compatibility': 0.04050130350480296,
 'systems': 0.0635444247694692,
 'linear': 0.05547927375550788,
 'constraints': 0.028669198834051886,
 'set': 0.0642777271769126,
 'natural': 0.029583342774413265,
 'numbers': 0.02982715489502955,
 'criteria': 0.05291980184834524,
 'system': 0.029216113760474803,
 'diophantine': 0.03281820309281377,
 'equations': 0.03564337646341648,
 'strict': 0.036522169851136425,
 'inequations': 0.054913438468111836,
 'nonstrict': 0.021459367118919152,
 'upper': 0.0349273631013934,
 'components': 0.03180842702430976,
 'minimal': 0.07668108188196558,
 'solutions': 0.05077793141911456,
 'algorithms': 0.05146666199512798,
 'construction': 0.027808019883344165,
 'sets': 0.02767841541539337,
 'types': 0.03950963888722571,
 'corresponding': 0.028175018149669974,
 'supporting': 0.027792493863883137,
 'mixed': 0.02800005206516726}
In [17]:
top_words
Out[17]:
[('minimal', 0.07668108188196558),
 ('set', 0.0642777271769126),
 ('systems', 0.0635444247694692),
 ('linear', 0.05547927375550788),
 ('inequations', 0.054913438468111836),
 ('criteria', 0.05291980184834524),
 ('algorithms', 0.05146666199512798),
 ('solutions', 0.05077793141911456),
 ('compatibility', 0.04050130350480296),
 ('types', 0.03950963888722571),
 ('strict', 0.036522169851136425),
 ('equations', 0.03564337646341648),
 ('upper', 0.0349273631013934),
 ('diophantine', 0.03281820309281377),
 ('components', 0.03180842702430976),
 ('numbers', 0.02982715489502955),
 ('natural', 0.029583342774413265),
 ('system', 0.029216113760474803),
 ('constraints', 0.028669198834051886),
 ('corresponding', 0.028175018149669974),
 ('mixed', 0.02800005206516726),
 ('construction', 0.027808019883344165),
 ('supporting', 0.027792493863883137),
 ('sets', 0.02767841541539337),
 ('nonstrict', 0.021459367118919152)]

Check for number of degrees for each node

The idea, here, is to check if there is a correlation between the above top_words (which is based on pagerank scores) and degree scores for the same nodes. If yes, then possibly use the degree centrality metric to penalize the "common words"

In [18]:
degree = dict(word_graph.degree())
In [19]:
sort_by_value(degree.items(), order='desc')
Out[19]:
[('minimal', 6),
 ('systems', 5),
 ('set', 5),
 ('solutions', 4),
 ('linear', 4),
 ('criteria', 4),
 ('algorithms', 4),
 ('types', 3),
 ('inequations', 3),
 ('compatibility', 3),
 ('upper', 2),
 ('system', 2),
 ('supporting', 2),
 ('strict', 2),
 ('sets', 2),
 ('numbers', 2),
 ('natural', 2),
 ('mixed', 2),
 ('equations', 2),
 ('diophantine', 2),
 ('corresponding', 2),
 ('construction', 2),
 ('constraints', 2),
 ('components', 2),
 ('nonstrict', 1)]
In [20]:
degree_centrality = dict(nx.degree_centrality(word_graph))
In [21]:
sort_by_value(degree_centrality.items(), order='desc')
Out[21]:
[('minimal', 0.25),
 ('systems', 0.20833333333333331),
 ('set', 0.20833333333333331),
 ('solutions', 0.16666666666666666),
 ('linear', 0.16666666666666666),
 ('criteria', 0.16666666666666666),
 ('algorithms', 0.16666666666666666),
 ('types', 0.125),
 ('inequations', 0.125),
 ('compatibility', 0.125),
 ('upper', 0.08333333333333333),
 ('system', 0.08333333333333333),
 ('supporting', 0.08333333333333333),
 ('strict', 0.08333333333333333),
 ('sets', 0.08333333333333333),
 ('numbers', 0.08333333333333333),
 ('natural', 0.08333333333333333),
 ('mixed', 0.08333333333333333),
 ('equations', 0.08333333333333333),
 ('diophantine', 0.08333333333333333),
 ('corresponding', 0.08333333333333333),
 ('construction', 0.08333333333333333),
 ('constraints', 0.08333333333333333),
 ('components', 0.08333333333333333),
 ('nonstrict', 0.041666666666666664)]

Get Multi-keywords terms based on their co-occurrence

In [22]:
multi_terms = gr.retrieve_multi_keyterms(pos_tuple, token_list)
In [23]:
multi_terms
Out[23]:
[(['compatibility'], [0.04050130350480296]),
 (['systems'], [0.0635444247694692]),
 (['linear', 'constraints'], [0.05547927375550788, 0.028669198834051886]),
 (['set'], [0.0642777271769126]),
 (['natural', 'numbers'], [0.029583342774413265, 0.02982715489502955]),
 (['criteria'], [0.05291980184834524]),
 (['system'], [0.029216113760474803]),
 (['linear', 'diophantine', 'equations'],
  [0.05547927375550788, 0.03281820309281377, 0.03564337646341648]),
 (['strict', 'inequations'], [0.036522169851136425, 0.054913438468111836]),
 (['nonstrict', 'inequations'], [0.021459367118919152, 0.054913438468111836]),
 (['upper'], [0.0349273631013934]),
 (['components'], [0.03180842702430976]),
 (['minimal', 'set'], [0.07668108188196558, 0.0642777271769126]),
 (['solutions'], [0.05077793141911456]),
 (['algorithms'], [0.05146666199512798]),
 (['construction'], [0.027808019883344165]),
 (['minimal'], [0.07668108188196558]),
 (['sets'], [0.02767841541539337]),
 (['types'], [0.03950963888722571]),
 (['corresponding', 'algorithms'],
  [0.028175018149669974, 0.05146666199512798]),
 (['minimal', 'supporting', 'set'],
  [0.07668108188196558, 0.027792493863883137, 0.0642777271769126]),
 (['types', 'systems'], [0.03950963888722571, 0.0635444247694692]),
 (['mixed', 'types'], [0.02800005206516726, 0.03950963888722571])]

Compute aggregated scores for multi-keyword terms

In [24]:
multi_words, multi_word_scores = gr.compute_multiterm_score(pos_tuple, original_tokens=token_list)
In [25]:
sort_by_value(list(zip(multi_words, multi_word_scores)), order='desc')
Out[25]:
[(['minimal', 'supporting', 'set'], 0.1687513029227613),
 (['minimal', 'set'], 0.1409588090588782),
 (['linear', 'diophantine', 'equations'], 0.12394085331173813),
 (['types', 'systems'], 0.10305406365669491),
 (['strict', 'inequations'], 0.09143560831924827),
 (['linear', 'constraints'], 0.08414847258955976),
 (['corresponding', 'algorithms'], 0.07964168014479794),
 (['minimal'], 0.07668108188196558),
 (['nonstrict', 'inequations'], 0.076372805587031),
 (['mixed', 'types'], 0.06750969095239297),
 (['set'], 0.0642777271769126),
 (['systems'], 0.0635444247694692),
 (['natural', 'numbers'], 0.05941049766944281),
 (['criteria'], 0.05291980184834524),
 (['algorithms'], 0.05146666199512798),
 (['solutions'], 0.05077793141911456),
 (['compatibility'], 0.04050130350480296),
 (['types'], 0.03950963888722571),
 (['upper'], 0.0349273631013934),
 (['components'], 0.03180842702430976),
 (['system'], 0.029216113760474803),
 (['construction'], 0.027808019883344165),
 (['sets'], 0.02767841541539337)]

Get final list of Keyphrases

In [26]:
keyphrases = gr.get_keyphrases(pos_tuple, token_list)
In [27]:
keyphrases
Out[27]:
[('minimal supporting set', 0.1687513029227613),
 ('minimal set', 0.1409588090588782),
 ('linear diophantine equations', 0.12394085331173813),
 ('types systems', 0.10305406365669491),
 ('strict inequations', 0.09143560831924827),
 ('linear constraints', 0.08414847258955976),
 ('corresponding algorithms', 0.07964168014479794),
 ('minimal', 0.07668108188196558),
 ('nonstrict inequations', 0.076372805587031),
 ('mixed types', 0.06750969095239297),
 ('set', 0.0642777271769126),
 ('systems', 0.0635444247694692),
 ('natural numbers', 0.05941049766944281),
 ('criteria', 0.05291980184834524),
 ('algorithms', 0.05146666199512798),
 ('solutions', 0.05077793141911456),
 ('compatibility', 0.04050130350480296),
 ('types', 0.03950963888722571),
 ('upper', 0.0349273631013934),
 ('components', 0.03180842702430976),
 ('system', 0.029216113760474803),
 ('construction', 0.027808019883344165),
 ('sets', 0.02767841541539337)]

Test graph extension

In [86]:
new_text = "So last week whatever 16 years with respect to the playlist and DRM key, right? " \
"So I was able to test on Safari and chrome both wearing it was able to forward the cookies. " \
"I was just like trying to trace out the cookies whether it's cool being sent in the DRM ta PA all those things. " \
"So one thing is I had tested it, but I wanted the Deep also to test from IOS app also whether we can pass the cookies. " \
"So once that is done, it is like tested it but I just want him to also confirm that part that it can send a cookies from it was have also know but what I am right now stuck is the Eco meat to a double AC p-- a Gateway. " \
"It's not able to proxying it actually. "\
"So as you spend like I was trying to do with the goatee also, there also is not able to do it the same problem is that this something which is going." \
"Hang on, okay."
In [87]:
new_text_proc = tr.preprocessing(new_text)
In [88]:
new_tokens, new_pos_tuple = get_pos_tuple(new_text_proc)
In [89]:
new_graph = gr.build_word_graph(new_pos_tuple, original_tokens=new_tokens)
In [90]:
graph_utils.draw_graph(new_graph)
FigureCanvasNbAgg()
/Users/shashank/anaconda3/envs/textrank/lib/python3.6/site-packages/networkx/drawing/nx_pylab.py:611: MatplotlibDeprecationWarning: isinstance(..., numbers.Number) if cb.is_numlike(alpha):
In [91]:
node_weights, top_words = gr.node_weighting(graph_obj=new_graph)
In [94]:
len(node_weights)
Out[94]:
56
In [96]:
multi_words, multi_word_scores = gr.compute_multiterm_score(graph_obj=new_graph, original_tokens=None)
In [98]:
multi_words
Out[98]:
[['compatibility'],
 ['systems'],
 ['linear', 'constraints'],
 ['set'],
 ['natural', 'numbers'],
 ['criteria'],
 ['system'],
 ['linear', 'diophantine', 'equations'],
 ['strict', 'inequations'],
 ['nonstrict', 'inequations'],
 ['upper'],
 ['components'],
 ['minimal', 'set'],
 ['solutions'],
 ['algorithms'],
 ['construction'],
 ['minimal'],
 ['sets'],
 ['types'],
 ['corresponding', 'algorithms'],
 ['minimal', 'supporting', 'set'],
 ['types', 'systems'],
 ['mixed', 'types'],
 ['last', 'week'],
 ['years'],
 ['respect'],
 ['playlist'],
 ['drm', 'key'],
 ['able'],
 ['safari'],
 ['cookies'],
 ['cool'],
 ['drm', 'ta', 'pa'],
 ['things'],
 ['thing'],
 ['deep'],
 ['ios', 'app'],
 ['part'],
 ['eco', 'meat'],
 ['double', 'ac', 'p'],
 ['gateway'],
 ['goatee'],
 ['same', 'problem'],
 ['something'],
 ['going.hang'],
 ['okay']]
In [99]:
new_keyphrases = gr.get_keyphrases(graph_obj=new_graph, original_tokens=None)
In [100]:
new_keyphrases
Out[100]:
[('minimal supporting set', 0.07533540309051845),
 ('minimal set', 0.06292803975842776),
 ('drm ta pa', 0.062354400152761835),
 ('linear diophantine equations', 0.05533073808559738),
 ('double ac p', 0.05085109623498944),
 ('types systems', 0.046006278418167366),
 ('drm key', 0.0446791700456028),
 ('able', 0.044636328387288086),
 ('strict inequations', 0.04081946799966441),
 ('linear constraints', 0.037566282406053475),
 ('cookies', 0.036582617185376747),
 ('corresponding algorithms', 0.03555432149321337),
 ('minimal', 0.03423262584016319),
 ('same problem', 0.03421602070150335),
 ('nonstrict inequations', 0.03409500249421027),
 ('ios app', 0.03281352500373556),
 ('eco meat', 0.03270590394070367),
 ('last week', 0.031621456221820936),
 ('mixed types', 0.030138254889461147),
 ('set', 0.028695413918264565),
 ('systems', 0.02836804677208446),
 ('natural numbers', 0.02652254360242983),
 ('criteria', 0.023624911539439835),
 ('algorithms', 0.022976188390682134),
 ('solutions', 0.02266871938353329),
 ('going.hang', 0.020406590235908423),
 ('something', 0.018714918069686053),
 ('years', 0.018689247766305317),
 ('compatibility', 0.01808093906464418),
 ('respect', 0.017744720574757126),
 ('types', 0.017638231646082907),
 ('thing', 0.017296665861339502),
 ('deep', 0.01709778119714392),
 ('things', 0.01708722773366331),
 ('playlist', 0.01634625512882938),
 ('gateway', 0.01606116468498112),
 ('upper', 0.015592572813122055),
 ('cool', 0.015115439840066684),
 ('components', 0.014200190635852573),
 ('system', 0.013042907928783395),
 ('construction', 0.012414294590778644),
 ('sets', 0.012356435453300613),
 ('okay', 0.01124258166644766),
 ('safari', 0.008957349381351237),
 ('goatee', 0.008957349381351237),
 ('part', 0.008900225404057925)]
In [104]:
len(new_graph.edges())
Out[104]:
68
In [105]:
len(gr.graph.edges())
Out[105]:
68

Test custom text_processing

In [106]:
from text_processing import text_processing as tp
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) <ipython-input-106-b73f49ee89da> in <module> ----> 1 from text_processing import text_processing as tp ModuleNotFoundError: No module named 'text_processing'
In [114]:
new_graph.number_of_edges()
Out[114]:
68
In [ ]: