-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathProblem_Solution.sql
More file actions
139 lines (123 loc) · 3.93 KB
/
Problem_Solution.sql
File metadata and controls
139 lines (123 loc) · 3.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
-- ==========================================================
-- PAN Number Validation Project using MySQL
-- ==========================================================
-- 1. Create staging table for raw dataset
DROP TABLE IF EXISTS stg_pan_numbers_dataset;
CREATE TABLE stg_pan_numbers_dataset (
pan_number VARCHAR(20)
);
-- 2. Identify missing data
SELECT *
FROM stg_pan_numbers_dataset
WHERE pan_number IS NULL;
-- 3. Check for duplicates
SELECT pan_number, COUNT(*) AS duplicate_count
FROM stg_pan_numbers_dataset
WHERE pan_number IS NOT NULL
GROUP BY pan_number
HAVING COUNT(*) > 1;
-- 4. Remove duplicates preview
SELECT DISTINCT *
FROM stg_pan_numbers_dataset;
-- 5. Detect leading/trailing spaces
SELECT *
FROM stg_pan_numbers_dataset
WHERE pan_number <> TRIM(pan_number);
-- 6. Detect lowercase or mixed-case PANs
SELECT *
FROM stg_pan_numbers_dataset
WHERE pan_number <> UPPER(pan_number);
-- 7. Create cleaned dataset
DROP TABLE IF EXISTS pan_numbers_dataset_cleaned;
CREATE TABLE pan_numbers_dataset_cleaned AS
SELECT DISTINCT UPPER(TRIM(pan_number)) AS pan_number
FROM stg_pan_numbers_dataset
WHERE pan_number IS NOT NULL
AND TRIM(pan_number) <> '';
-- ==========================================================
-- Custom Functions in MySQL
-- ==========================================================
-- Function: Check for adjacent character repetition
DROP FUNCTION IF EXISTS fn_check_adjacent_repetition;
DELIMITER //
CREATE FUNCTION fn_check_adjacent_repetition(p_str VARCHAR(20))
RETURNS BOOLEAN
DETERMINISTIC
BEGIN
DECLARE i INT DEFAULT 1;
DECLARE str_len INT;
SET str_len = CHAR_LENGTH(p_str);
WHILE i < str_len DO
IF SUBSTRING(p_str, i, 1) = SUBSTRING(p_str, i+1, 1) THEN
RETURN TRUE;
END IF;
SET i = i + 1;
END WHILE;
RETURN FALSE;
END;
//
DELIMITER ;
-- Function: Check if characters are sequential (like ABCDE or 1234)
DROP FUNCTION IF EXISTS fn_check_sequence;
DELIMITER //
CREATE FUNCTION fn_check_sequence(p_str VARCHAR(20))
RETURNS BOOLEAN
DETERMINISTIC
BEGIN
DECLARE i INT DEFAULT 1;
DECLARE str_len INT;
SET str_len = CHAR_LENGTH(p_str);
WHILE i < str_len DO
IF ASCII(SUBSTRING(p_str, i+1, 1)) - ASCII(SUBSTRING(p_str, i, 1)) <> 1 THEN
RETURN FALSE;
END IF;
SET i = i + 1;
END WHILE;
RETURN TRUE;
END;
//
DELIMITER ;
-- ==========================================================
-- Valid vs Invalid PAN categorization
-- ==========================================================
DROP VIEW IF EXISTS vw_valid_invalid_pans;
CREATE VIEW vw_valid_invalid_pans AS
WITH cte_cleaned_pan AS (
SELECT DISTINCT UPPER(TRIM(pan_number)) AS pan_number
FROM stg_pan_numbers_dataset
WHERE pan_number IS NOT NULL
AND TRIM(pan_number) <> ''
),
cte_valid_pan AS (
SELECT *
FROM cte_cleaned_pan
WHERE fn_check_adjacent_repetition(pan_number) = FALSE
AND fn_check_sequence(SUBSTRING(pan_number,1,5)) = FALSE
AND fn_check_sequence(SUBSTRING(pan_number,6,4)) = FALSE
AND pan_number REGEXP '^[A-Z]{5}[0-9]{4}[A-Z]$'
)
SELECT
cln.pan_number,
CASE
WHEN vld.pan_number IS NULL THEN 'Invalid PAN'
ELSE 'Valid PAN'
END AS status
FROM cte_cleaned_pan cln
LEFT JOIN cte_valid_pan vld
ON vld.pan_number = cln.pan_number;
-- ==========================================================
-- Summary Report
-- ==========================================================
WITH summary_cte AS (
SELECT
(SELECT COUNT(*) FROM stg_pan_numbers_dataset) AS total_processed_records,
SUM(CASE WHEN vw.status = 'Valid PAN' THEN 1 ELSE 0 END) AS total_valid_pans,
SUM(CASE WHEN vw.status = 'Invalid PAN' THEN 1 ELSE 0 END) AS total_invalid_pans
FROM vw_valid_invalid_pans vw
)
SELECT
total_processed_records,
total_valid_pans,
total_invalid_pans,
total_processed_records - (total_valid_pans + total_invalid_pans) AS missing_incomplete_pans
FROM summary_cte;