Learn practical skills, build real-world projects, and advance your career
Updated 3 years ago
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
def read_article(file_name):
file = open(file_name, "r")
filedata = file.readlines()
article = filedata[0].split(". ")
sentences = []
for sentence in article:
print(sentence)
sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
sentences.pop()
return sentences
def sentence_similarity(sent1, sent2, stopwords=None):
if stopwords is None:
stopwords = []
sent1 = [w.lower() for w in sent1]
sent2 = [w.lower() for w in sent2]
all_words = list(set(sent1 + sent2))
vector1 = [0] * len(all_words)
vector2 = [0] * len(all_words)
# build the vector for the first sentence
for w in sent1:
if w in stopwords:
continue
vector1[all_words.index(w)] += 1
# build the vector for the second sentence
for w in sent2:
if w in stopwords:
continue
vector2[all_words.index(w)] += 1
return 1 - cosine_distance(vector1, vector2)
def build_similarity_matrix(sentences, stop_words):
# Create an empty similarity matrix
similarity_matrix = np.zeros((len(sentences), len(sentences)))
for idx1 in range(len(sentences)):
for idx2 in range(len(sentences)):
if idx1 == idx2: #ignore if both are same sentences
continue
similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
return similarity_matrix
def generate_summary(file_name, top_n=5):
nltk.download("stopwords")
stop_words = stopwords.words('english')
summarize_text = []
# Step 1 - Read text anc split it
sentences = read_article(file_name)
# Step 2 - Generate Similary Martix across sentences
sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
scores = nx.pagerank(sentence_similarity_graph)
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are ", ranked_sentence)
for i in range(top_n):
summarize_text.append(" ".join(ranked_sentence[i][1]))
# Step 5 - Offcourse, output the summarize texr
print("Summarize Text: \n", ". ".join(summarize_text))
# let's begin
generate_summary( "fb.txt", 2)
[nltk_data] Downloading package stopwords to C:\Users\ANAM
[nltk_data] ASHRAF\AppData\Roaming\nltk_data...
[nltk_data] Package stopwords is already up-to-date!
For years, Facebook gave some of the world's largest technology companies more intrusive access to users' personal data than it has disclosed, effectively exempting those business partners from its usual privacy rules, according to internal records and interviews
The special arrangements are detailed in hundreds of pages of Facebook documents obtained by The New York Times
The records, generated in 2017 by the company's internal system for tracking partnerships, provide the most complete picture yet of the social network's data-sharing practices
They also underscore how personal data has become the most prized commodity of the digital age, traded on a vast scale by some of the most powerful companies in Silicon Valley and beyond
The exchange was intended to benefit everyone
Pushing for explosive growth, Facebook got more users, lifting its advertising revenue
Partner companies acquired features to make their products more attractive
Facebook users connected with friends across different devices and websites
But Facebook also assumed extraordinary power over the personal information of its 2 billion users - control it has wielded with little transparency or outside oversight.Facebook allowed Microsoft's Bing search engine to see the names of virtually all Facebook user's friends without consent, the records show, and gave Netflix and Spotify the ability to read Facebook users' private messages.
Indexes of top ranked_sentence order are [(0.2575505398411762, ['For', 'years,', 'Facebook', 'gave', 'some', 'of', 'the', "world's", 'largest', 'technology', 'companies', 'more', 'intrusive', 'access', 'to', "users'", 'personal', 'data', 'than', 'it', 'has', 'disclosed,', 'effectively', 'exempting', 'those', 'business', 'partners', 'from', 'its', 'usual', 'privacy', 'rules,', 'according', 'to', 'internal', 'records', 'and', 'interviews']), (0.15321342634501525, ['Facebook', 'users', 'connected', 'with', 'friends', 'across', 'different', 'devices', 'and', 'websites']), (0.1484419990863834, ['Pushing', 'for', 'explosive', 'growth,', 'Facebook', 'got', 'more', 'users,', 'lifting', 'its', 'advertising', 'revenue']), (0.1402199442805766, ['The', 'special', 'arrangements', 'are', 'detailed', 'in', 'hundreds', 'of', 'pages', 'of', 'Facebook', 'documents', 'obtained', 'by', 'The', 'New', 'York', 'Times']), (0.1351454272881601, ['They', 'also', 'underscore', 'how', 'personal', 'data', 'has', 'become', 'the', 'most', 'prized', 'commodity', 'of', 'the', 'digital', 'age,', 'traded', 'on', 'a', 'vast', 'scale', 'by', 'some', 'of', 'the', 'most', 'powerful', 'companies', 'in', 'Silicon', 'Valley', 'and', 'beyond']), (0.10012172020682458, ['Partner', 'companies', 'acquired', 'features', 'to', 'make', 'their', 'products', 'more', 'attractive']), (0.04432792197284276, ['The', 'records,', 'generated', 'in', '2017', 'by', 'the', "company's", 'internal', 'system', 'for', 'tracking', 'partnerships,', 'provide', 'the', 'most', 'complete', 'picture', 'yet', 'of', 'the', 'social', "network's", 'data-sharing', 'practices']), (0.020979020979020983, ['The', 'exchange', 'was', 'intended', 'to', 'benefit', 'everyone'])]
Summarize Text:
For years, Facebook gave some of the world's largest technology companies more intrusive access to users' personal data than it has disclosed, effectively exempting those business partners from its usual privacy rules, according to internal records and interviews. Facebook users connected with friends across different devices and websites