-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_preprocessing.py
More file actions
113 lines (92 loc) · 2.92 KB
/
data_preprocessing.py
File metadata and controls
113 lines (92 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import re
import pickle
import json
import nltk
import nltk.classify.util
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
lang_files = []
def read_from_reference_corpus():
for filename in os.listdir("./europarl/txt"):
lang_files.append(filename)
lang_data = dict()
i = "en"
#for i in lang_files:
if(i=="en"):
dir_name = "./europarl/txt/" + i
lang_data[i] = list()
m = 0
for filename in os.listdir(dir_name):
lang_fname = dir_name + "/" + filename
with open(lang_fname,'r',encoding = 'utf-8') as f:
output = f.readlines()
data_in = list()
for line in output:
line = re.sub(r'<.*?>$',"",line)
if(line != "\n"):
data_in.append(line.lower())
data_in = ''.join(data_in)
#output = re.sub(r'<.+?>','',output)
lang_data[i].append(data_in)
lang_data[i] = ''.join(lang_data[i])
return lang_data
def counting_occurences(lang_data):
word_occurence = dict()
stopwords = nltk.corpus.stopwords.words('english')
for keys in lang_data.keys():
lang_data[keys] = lang_data[keys].split("\n")
for line in lang_data[keys]:
tokenizer = nltk.RegexpTokenizer('[a-zA-Z]\w+\'?\w*')
tokens = tokenizer.tokenize(line)
useful_words = [word for word in tokens if word not in stopwords]
for i in range(len(useful_words)):
#k = i+1
if(useful_words[i] in word_occurence):
word_occurence[useful_words[i]] += 1
else:
word_occurence[useful_words[i]] = 1
'''if(k < len(useful_words)):
s = useful_words[k-1] + ' ' + useful_words[k]
if(s in word_occurence):
word_occurence[s] += 1
else:
word_occurence[s] = 1
if(k+1 < len(useful_words)):
p = useful_words[k-1] + ' ' + useful_words[k] + ' ' +useful_words[k+1]
if(p in word_occurence):
word_occurence[p] += 1
else:
word_occurence[p] = 1'''
#List Object
word_occurence = sorted(word_occurence.items(),key=lambda kv:kv[1],reverse = True)
return word_occurence
def rank_freq_data(word_occurence):
word_ranking = dict()
in_p = 0
rank = 1
value = 0
for i in range(len(word_occurence)):
keys = word_occurence[i][0]
val = int(word_occurence[i][1])
if(in_p==0):
in_p = 1
value = int(word_occurence[0][1])
if(value>val):
rank += 1
value = val
word_ranking[keys] = rank
return word_ranking
lang_data = read_from_reference_corpus()
#Loading pickle file for once to use it further queries
#with open('ref_data.pickle','rb') as file:
#lang_data = pickle.load(file)
word_occurence = counting_occurences(lang_data)
with open('ref_data.pickle','wb') as file:
pickle.dump(word_occurence,file)
ranking_data = rank_freq_data(word_occurence)
print(ranking_data)
with open('ranked_dict_pic.pickle','wb') as file:
pickle.dump(ranking_data,file)
with open('ranked_dict.txt','w') as file:
file.write(json.dumps(ranking_data))