-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
117 lines (95 loc) · 4.65 KB
/
main.py
File metadata and controls
117 lines (95 loc) · 4.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# pip install transformers datasets torch scikit-learn
from sklearn.metrics import confusion_matrix,precision_recall_fscore_support, classification_report
import warnings
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import accuracy_score
# Step 1: Load and Prepare the Dataset
def prepare_dataset(csv_path="dataset.csv"):
    """Load the labelled column-name CSV and build train/validation splits.

    Args:
        csv_path: Path to a CSV file with a "column_name" text column and a
            binary "is_hipaa_sensitive" label column. Defaults to
            "dataset.csv", preserving the original hard-coded behavior.

    Returns:
        DatasetDict with "train" and "validation" splits, each a Dataset
        carrying "text" and "label" columns.
    """
    df = pd.read_csv(csv_path)
    # Stratify on the label so both splits keep the same class balance —
    # important because sensitive/non-sensitive column names are likely
    # imbalanced in the source data.
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df["column_name"].tolist(),
        df["is_hipaa_sensitive"].tolist(),
        test_size=0.2,
        random_state=42,
        stratify=df["is_hipaa_sensitive"],
    )
    train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
    val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})
    return DatasetDict({"train": train_dataset, "validation": val_dataset})
# Step 2: Tokenize the Dataset
def tokenize_dataset(dataset, tokenizer):
    """Apply the tokenizer to every "text" entry of the dataset.

    Runs the tokenizer batched over the dataset with padding, truncation,
    and a 128-token maximum length, returning the mapped dataset.
    """
    def _encode(examples):
        return tokenizer(
            examples["text"],
            padding=True,
            truncation=True,
            max_length=128,
        )

    return dataset.map(_encode, batched=True)
# Step 3: Fine-tune the Model
def fine_tune_model():
    """Fine-tune a pre-trained transformer to flag HIPAA-sensitive column names.

    Trains a binary sequence classifier (label 1 = sensitive) on the splits
    from prepare_dataset(), prints accuracy, confusion matrix, precision /
    recall / F1 and a full classification report on the validation split,
    then saves the model and tokenizer to ./bert_sensitive_columns.
    """
    # Suppress the noisy DataLoader pin_memory warning on CPU-only runs.
    warnings.filterwarnings("ignore", message=".*pin_memory.*")

    model_name = "bert-base-uncased"   # observed accuracy: 99.66%
    # model_name = "roberta-base"      # observed accuracy: 96.36%
    # model_name = "roberta-large"     # observed accuracy: 99.32%
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    dataset = prepare_dataset()
    tokenized_dataset = tokenize_dataset(dataset, tokenizer)

    training_args = TrainingArguments(
        output_dir="./bert_sensitive_columns",
        eval_strategy="epoch",
        save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
        learning_rate=1e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        save_total_limit=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
    )
    trainer.train()

    # Evaluate on the validation split.
    predictions = trainer.predict(tokenized_dataset["validation"])
    pred_labels = predictions.predictions.argmax(axis=-1)  # logits -> class ids
    true_labels = tokenized_dataset["validation"]["label"]

    # Consistency fix: reuse true_labels rather than re-reading the dataset
    # column (the original passed tokenized_dataset["validation"]["label"]
    # here despite having just bound it to true_labels).
    accuracy = accuracy_score(true_labels, pred_labels)
    print(f"Accuracy: {accuracy * 100:.4f}%")

    # Generate Confusion Matrix
    cm = confusion_matrix(true_labels, pred_labels)
    print("Confusion Matrix:\n", cm)

    # Compute Precision, Recall, F1-score
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average="binary")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    # Generate Full Classification Report
    print("\nClassification Report:\n",
          classification_report(true_labels, pred_labels, target_names=["Non-Sensitive", "Sensitive"]))

    model.save_pretrained("./bert_sensitive_columns")
    tokenizer.save_pretrained("./bert_sensitive_columns")
    print("Fine-tuning complete. Model saved to ./bert_sensitive_columns")
# Step 4: Predict with the Fine-Tuned Model
def predict(model_path, texts):
    """Classify texts with the fine-tuned model saved at model_path.

    Loads the tokenizer and model from disk, encodes the input texts
    (padded/truncated to 128 tokens), and returns a numpy array of
    predicted class ids (0 or 1), one per input text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    encoded = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )
    # Inference only — no gradient bookkeeping needed.
    with torch.no_grad():
        logits = model(**encoded).logits
    return torch.argmax(logits, dim=1).numpy()
if __name__ == "__main__":
    # Train (and save) the classifier, then sanity-check it on a handful of
    # representative column names.
    fine_tune_model()
    test_texts = ["birthDate", "birth_year", "country", "DATE_BIRTH", "color", "food", "jwtToken","preferredname","preferred_name"]
    predictions = predict("./bert_sensitive_columns", test_texts)
    print("Predictions:")
    for text, pred in zip(test_texts, predictions):
        print(f"{text}: {'Sensitive' if pred == 1 else 'Non-sensitive'}")