-
Notifications
You must be signed in to change notification settings - Fork 501
Expand file tree
/
Copy pathtext_summary.py
More file actions
133 lines (103 loc) · 3.87 KB
/
text_summary.py
File metadata and controls
133 lines (103 loc) · 3.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python
# coding: utf-8
# Imports
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
# Ask for the path of the text file to summarise.
file_name = input("Enter the Source File: ")
# Remind the user about the NLTK corpus this script depends on. Adjacent
# string literals are concatenated with no separator, so the line break
# between the two parts of the notice has to be written explicitly.
print(
    "This script requires 'stopwords' from NLTK, see README\n"
    "Quick Download Command: ```python -m nltk.downloader stopwords```"
)
def read_article(file_name):
    """
    Read a UTF-8 text file and split it into tokenized sentences.

    The whole file is read and line breaks are treated as spaces. Sentences
    are delimited by ". " and each sentence is split into words on
    whitespace. Words are returned verbatim; no characters are filtered out.

    :param file_name: Path of the text file to read
    :return: List of sentences, each a list of word strings; an empty list
        when the file contains no text
    """
    # The context manager closes the file even if reading fails.
    with open(file_name, "r", encoding="utf-8") as source:
        text = source.read()
    # Join the lines with spaces so that a sentence ending at a line break is
    # still followed by ". " and text after the first line is not lost.
    text = " ".join(text.splitlines())
    sentences = []
    for fragment in text.split(". "):
        fragment = fragment.strip()
        # The final sentence keeps its full stop because no ". " follows it.
        if fragment.endswith("."):
            fragment = fragment[:-1]
        words = fragment.split()
        # Skip empty fragments (e.g. after a trailing ". ") instead of
        # unconditionally dropping the last element, which may be a sentence.
        if words:
            sentences.append(words)
    return sentences
def sentence_similarity(sent1, sent2, stopwords=None):
    """
    Compute the cosine similarity between two tokenized sentences.

    Words are compared case-insensitively, and words listed in ``stopwords``
    are left out of the word-count vectors.

    :param sent1: Words of sentence 1
    :param sent2: Words of sentence 2
    :param stopwords: Lower-case words to be ignored (Read README.md)
    :return: Cosine similarity score as a float; 0.0 when either sentence has
        no countable words
    """
    # A set gives constant-time membership tests inside the loops below.
    ignored = set(stopwords) if stopwords else set()
    words1 = [w.lower() for w in sent1]
    words2 = [w.lower() for w in sent2]
    # Map every distinct word to a vector position once, instead of scanning
    # a list with .index() for each word.
    position = {word: i for i, word in enumerate(set(words1 + words2))}
    vector1 = np.zeros(len(position))
    vector2 = np.zeros(len(position))
    # build the vector for the first sentence
    for w in words1:
        if w not in ignored:
            vector1[position[w]] += 1
    # build the vector for the second sentence
    for w in words2:
        if w not in ignored:
            vector2[position[w]] += 1
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    # A sentence made only of stop words (or an empty one) yields a zero
    # vector. Cosine similarity is undefined there, so report "no similarity"
    # instead of dividing by zero and returning NaN.
    if norm_product == 0:
        return 0.0
    return float(np.dot(vector1, vector2) / norm_product)
def build_similarity_matrix(sentences, stop_words):
    """
    Score every sentence against every other sentence.

    :param sentences: Tokenized sentences (lists of words)
    :param stop_words: Words to be ignored in Vectors (Read README.md)
    :return: Square matrix of pairwise similarity scores; the diagonal is
        left at zero
    """
    total = len(sentences)
    # Start from all zeros; a sentence is never compared with itself, so the
    # diagonal entries are never written.
    matrix = np.zeros((total, total))
    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            if row != col:
                matrix[row][col] = sentence_similarity(first, second, stop_words)
    return matrix
def generate_summary(file_name, top_n=5):
    """
    Summarise a text file and write the summary next to it.

    Sentences are ranked with PageRank over their pairwise similarity graph.
    The highest-ranked ones are written, one per line, to the input path with
    a trailing ".txt" replaced by "_textRank.txt".

    :param file_name: Path of the text file to summarise
    :param top_n: Maximum number of top-ranked sentences to keep
    :return: List of the selected sentences as strings, best ranked first
    """
    stop_words = stopwords.words("english")
    # Step 1 - Read text and split it
    sentences = read_article(file_name)
    # Step 2 - Generate Similarity Matrix across sentences
    similarity_matrix = build_similarity_matrix(sentences, stop_words)
    # Step 3 - Rank sentences in similarity matrix
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
    # Step 4 - Sort the rank and pick top sentences
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)), reverse=True
    )
    # Slicing, unlike indexing 0..top_n-1, copes with texts that have fewer
    # than top_n sentences; max() keeps a negative top_n meaning "none".
    selected = ranked_sentences[: max(top_n, 0)]
    summarize_text = [" ".join(words) for _, words in selected]
    # Step 5 - Output of the text file
    # Strip only a genuine ".txt" suffix. str.find() returns -1 when the name
    # has none, and slicing with -1 would chop the last character of the path.
    suffix = ".txt"
    if file_name.endswith(suffix):
        base = file_name[: -len(suffix)]
    else:
        base = file_name
    outputpath = base + "_textRank.txt"
    # Write with the same encoding the article is read in, so non-ASCII text
    # does not depend on the platform's default encoding.
    with open(outputpath, "w", encoding="utf-8") as out:
        out.writelines(sentence + "\n" for sentence in summarize_text)
    return summarize_text
# Summarise the file chosen at the prompt, keeping at most five sentences.
generate_summary(file_name, top_n=5)