preprocessing.py
#!/usr/bin/env python
# coding: utf-8

# Imports
import spacy
import neologdn


class EnglishCorpus:
    """
    A class for retaining the structure of a text file as a corpus.

    Methods:
        preprocessing(text: str)
            Removes line breaks and normalizes special characters.
        make_sentence_list(sentences: str)
            Splits text into a list of sentences using NLP.
        make_corpus()
            Generates the corpus, one space-separated string of tokens
            per sentence.
    """

    def __init__(self):
        """
        Constructor: loads the spaCy English model (see README).
        """
        # Preparation of the morphological analyzer
        self.nlp = spacy.load("en_core_web_sm")

    # Pre-processing of line breaks and special characters
    def preprocessing(self, text: str) -> str:
        """
        Removes line breaks and normalizes special characters.

        :param text: String of text to be processed
        :return: Text without line breaks or denormalized characters
        """
        text = text.replace("\n", "")
        # neologdn folds full-width/half-width variants and collapses
        # repeated symbols into a canonical form
        text = neologdn.normalize(text)
        return text

    # Divide the text into sentences while retaining the results of
    # the morphological analysis
    def make_sentence_list(self, sentences: str) -> list:
        """
        Splits the text into a list of sentences using NLP, retaining
        the results of the morphological analysis.

        :param sentences: Text to split into sentences
        :return: List of sentence spans
        """
        doc = self.nlp(sentences)
        # doc.sents returns a fresh generator on each access, so the
        # one stored here is not exhausted by the list built below
        self.ginza_sents_object = doc.sents
        sentence_list = [s for s in doc.sents]
        return sentence_list

    # Put a space between words
    def make_corpus(self) -> list:
        """
        Puts a space between words and generates the corpus; call
        make_sentence_list() first so the sentence generator exists.

        :return: Corpus ready for tokenizing
        """
        corpus = []
        for s in self.ginza_sents_object:
            tokens = [str(t) for t in s]
            corpus.append(" ".join(tokens))
        return corpus
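
A minimal usage sketch, not part of the original file: it assumes the file is importable as preprocessing, that the en_core_web_sm model has been downloaded, and the sample text and printed output are illustrative only (exact tokenization depends on the model version).

# Minimal usage sketch (illustrative; not part of the original file)
from preprocessing import EnglishCorpus

corpus_maker = EnglishCorpus()
raw = "Mr. Smith went to Washington. \nHe gave a speech."
cleaned = corpus_maker.preprocessing(raw)             # drop line breaks, normalize characters
sentences = corpus_maker.make_sentence_list(cleaned)  # spaCy sentence spans
corpus = corpus_maker.make_corpus()                   # one space-joined string per sentence
print(corpus)
# e.g. ['Mr. Smith went to Washington .', 'He gave a speech .']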