-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
178 lines (139 loc) · 5.69 KB
/
preprocessing.py
File metadata and controls
178 lines (139 loc) · 5.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# preprocessing.py
"""
Lightweight text preprocessing utilities for the AskMe-FAQ-Bot.
This module intentionally keeps preprocessing simple and fast so it can run
inline on each user query. The steps are designed to be *lossless* for meaning
and safe for banking/FAQ content.
Provided helpers:
- clean_text: trim and collapse whitespace
- normalize_for_match: normalized key for exact lookups (lower/ASCII/space)
- mild_fix_grammar_and_spelling: optional TextBlob correction (toggleable)
- sanitize_query: profanity censor + flag
- preprocess: convenience pipeline used before semantic embedding
Notes
-----
* `USE_SPELLCHECK` controls grammar/spelling correction. It is OFF by default
because aggressive correction can harm acronyms, product codes, and entities.
* `better_profanity` is dictionary-based; it may over-censor substrings in
some proper nouns. We only *report* and *optionally* censor at query time.
"""
import re
from typing import Tuple
from better_profanity import profanity
from textblob import TextBlob
from config import USE_SPELLCHECK
def clean_text(s: str) -> str:
    """Normalize whitespace: strip the ends, squeeze inner runs to one space.

    A minimal, lossless pass — the lexicon and punctuation of the input are
    left untouched; only whitespace is affected.

    Args:
        s (str): Raw input string (may be empty or falsy).

    Returns:
        str: The input with surrounding whitespace removed and every run of
            whitespace characters collapsed to a single space. Falsy input
            (empty string, None) yields "".

    Examples:
        >>> clean_text("  Hello   world\\n ")
        'Hello world'
        >>> clean_text("")
        ''
    """
    if not s:
        return ""
    return re.sub(r"\s+", " ", s.strip())
def normalize_for_match(s: str) -> str:
    """Produce a stable, case-insensitive key for exact question matching.

    The normalization pipeline:
      1) lowercase
      2) replace any non-alphanumeric character with a space
      3) collapse multiple spaces
      4) strip

    Use this for building/looking up in the exact-lookup dictionary that
    accelerates responses when the user inputs a near-identical phrasing.

    Args:
        s (str): Any string (may be empty or falsy).

    Returns:
        str: Normalized key safe for dictionary lookups. Falsy input
            (empty string, None) yields "" instead of raising, matching
            the guard in `clean_text`.

    Examples:
        >>> normalize_for_match("Where's the CVV2?")
        'where s the cvv2'
        >>> normalize_for_match("  GET STATEMENT!  ")
        'get statement'
        >>> normalize_for_match("")
        ''
    """
    # Guard falsy input for consistency with clean_text; previously a None
    # here raised AttributeError on .lower().
    if not s:
        return ""
    s = re.sub(r"[^a-z0-9]+", " ", s.lower())
    return re.sub(r"\s+", " ", s).strip()
def mild_fix_grammar_and_spelling(s: str) -> str:
    """Optionally apply a light grammar/spelling correction.

    Uses TextBlob's built-in corrector, guarded by the `USE_SPELLCHECK`
    flag (default False) because aggressive correction can distort
    domain-specific tokens (e.g., 'CVV', 'IFSC', 'NEFT', plan codes).

    Args:
        s (str): Input string.

    Returns:
        str: Corrected string when `USE_SPELLCHECK` is True and correction
            succeeds; otherwise the original text unchanged.

    Notes:
        * TextBlob can be slow on long strings; queries here are short.
        * Any exception from TextBlob falls back to the original string —
          correction must never break the query flow.

    Examples:
        >>> # assuming USE_SPELLCHECK is False
        >>> mild_fix_grammar_and_spelling("how to dwnload statment")
        'how to dwnload statment'
    """
    if not USE_SPELLCHECK:
        return s
    try:
        fixed = str(TextBlob(s).correct())
        if s and fixed:
            # Capitalize the sentence start for a cleaner look
            fixed = fixed[0].upper() + fixed[1:]
        return fixed
    except Exception:
        # Fail-safe: never break the query flow due to correction issues
        return s
def sanitize_query(user_text: str) -> Tuple[str, bool]:
    """Detect profanity and censor it (asterisks) when present.

    The text is left unchanged unless profanity is detected; if a bad word
    is found, the censored version (e.g., "s***") is returned so downstream
    logs and Gemini prompts avoid explicit profanity.

    Args:
        user_text (str): Raw user input.

    Returns:
        Tuple[str, bool]: (sanitized_text, is_profane)
            - sanitized_text (str): censored string if profanity detected,
              otherwise the original input unchanged.
            - is_profane (bool): True if profanity words were found.

    Examples:
        >>> sanitize_query("where is my damn card")
        ('where is my **** card', True)
        >>> sanitize_query("hello there")
        ('hello there', False)
    """
    # NOTE(review): loading the word list per call is redundant after the
    # first invocation; kept here so the function works standalone.
    profanity.load_censor_words()
    # Check first, censor only when needed: previously censor() ran a full
    # dictionary pass on every query even when the text was clean.
    is_bad = profanity.contains_profanity(user_text)
    if not is_bad:
        return user_text, False
    return profanity.censor(user_text), True
def preprocess(user_text: str) -> str:
    """Minimal end-to-end preprocessing used before semantic embedding.

    Pipeline:
      1) `clean_text` — trim and collapse whitespace
      2) `sanitize_query` — profanity detection & censoring (if needed)
      3) `mild_fix_grammar_and_spelling` — optional correction (guarded)

    Intended only for the **semantic** path. For **exact** matching, use
    the original raw string together with `normalize_for_match`.

    Args:
        user_text (str): Raw query from the user.

    Returns:
        str: Preprocessed string ready for embedding.

    Examples:
        >>> preprocess("  how to download statment ")
        'how to download statment'
    """
    cleaned = clean_text(user_text)
    sanitized, _is_profane = sanitize_query(cleaned)
    return mild_fix_grammar_and_spelling(sanitized)