In [ ]:
# Step 1: Import Necessary Libraries
# Import libraries needed for data handling, visualization, and modeling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
In [ ]:
# Step 2: Load and Inspect Dataset
# Load the dataset and display the first few rows
data = pd.read_csv('Credit_Card_Fraud_Detection.csv')
print("First few rows of the dataset:")
data.head()
First few rows of the dataset:
Out[ ]:
| | Unnamed: 0 | Customer_ID | A_1 | A_2 | A_3 | A_4 | A_5 | A_6 | A_7 | A_8 | A_9 | A_10 | A_11 | A_12 | A_13 | A_14 | class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 15776156 | 1 | 22.08 | 11.46 | 2 | 4 | 4 | 1.585 | 0 | 0 | 0 | 1 | 2 | 100 | 1213 | 0 |
| 1 | 1 | 15739548 | 0 | 22.67 | 7.00 | 2 | 8 | 4 | 0.165 | 0 | 0 | 0 | 0 | 2 | 160 | 1 | 0 |
| 2 | 2 | 15662854 | 0 | 29.58 | 1.75 | 1 | 4 | 4 | 1.250 | 0 | 0 | 0 | 1 | 2 | 280 | 1 | 0 |
| 3 | 3 | 15687688 | 0 | 21.67 | 11.50 | 1 | 5 | 3 | 0.000 | 1 | 1 | 11 | 1 | 2 | 0 | 1 | 1 |
| 4 | 4 | 15715750 | 1 | 20.17 | 8.17 | 2 | 6 | 4 | 1.960 | 1 | 1 | 14 | 0 | 2 | 60 | 159 | 1 |
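Fraud datasets are typically imbalanced, so it is worth checking the class distribution before modeling. A minimal sketch on the same data frame (the class column is the label shown in the table above):

In [ ]:
# Sketch: check class balance before modeling
print(data['class'].value_counts())                # raw counts per class
print(data['class'].value_counts(normalize=True))  # proportions per class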
In [ ]:
# Step 3: Data Preprocessing
# Check for missing values
print("\nMissing Values in Each Column:")
print(data.isnull().sum())
# Drop identifier column and separate features (X) and target (y)
data.drop(columns=['Customer_ID'], inplace=True)
X = data.drop(columns=['class']) # Features
y = data['class'] # Target variable
# Visualize feature distributions (histograms)
print("\nFeature Distributions")
data.hist(figsize=(12, 10))
plt.show()
Missing Values in Each Column:
Unnamed: 0     0
Customer_ID    0
A_1            0
A_2            0
A_3            0
A_4            0
A_5            0
A_6            0
A_7            0
A_8            0
A_9            0
A_10           0
A_11           0
A_12           0
A_13           0
A_14           0
class          0
dtype: int64

Feature Distributions
[Figure: grid of histograms, one per column]
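Note that Unnamed: 0 is a leftover row index from the CSV export rather than a real feature, and it survives into X above (hence the 15-column shapes later on). A variant sketch that keeps it out of the feature matrix, assuming the same file layout (data_alt, X_alt, and y_alt are illustrative names):

In [ ]:
# Variant sketch: read the exported row index as the DataFrame index,
# so it never enters the feature matrix (yields 14 features instead of 15)
data_alt = pd.read_csv('Credit_Card_Fraud_Detection.csv', index_col=0)
X_alt = data_alt.drop(columns=['Customer_ID', 'class'])
y_alt = data_alt['class']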
In [ ]:
# Step 4: Feature Scaling
# Scale features to ensure fair distance measurements in KNN
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("\nFirst 5 rows of scaled features:\n", X_scaled[:5])
First 5 rows of scaled features:
[[-1.72954241  0.68873723 -0.80105183  1.34711063  0.54294996 -0.91628228 -0.34796495 -0.19090583 -1.04750391 -0.86419641 -0.49388662  1.08790822  0.23782765 -0.48835847  0.03738039]
 [-1.72452196 -1.45193254 -0.75124044  0.45054795  0.54294996  0.17049859 -0.34796495 -0.61553587 -1.04750391 -0.86419641 -0.49388662 -0.91919518  0.23782765 -0.13959116 -0.19541334]
 [-1.71950152 -1.45193254 -0.16785619 -0.60482292 -1.78397845 -0.91628228 -0.34796495 -0.29108264 -1.04750391 -0.86419641 -0.49388662  1.08790822  0.23782765  0.55794344 -0.19541334]
 [-1.71448108 -1.45193254 -0.83566653  1.35515155 -1.78397845 -0.64458706 -0.85025746 -0.66487669  0.95465038  1.15714435  1.76976039  1.08790822  0.23782765 -1.06963731 -0.19541334]
 [-1.70946064  0.68873723 -0.96230566  0.68574489  0.54294996 -0.37289184 -0.34796495 -0.07876762  0.95465038  1.15714435  2.38711867 -0.91919518  0.23782765 -0.72087    -0.16506564]]
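A quick sanity check: after standardization, every column should have mean approximately 0 and standard deviation approximately 1. A small sketch to confirm this on X_scaled:

In [ ]:
# Sketch: verify the scaler produced zero-mean, unit-variance columns
print(X_scaled.mean(axis=0).round(6))  # ~0 for every feature
print(X_scaled.std(axis=0).round(6))   # ~1 for every feature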
In [ ]:
# Step 5: Train-Test Split
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print(f"\nTraining set size: {X_train.shape}, \nTest set size: {X_test.shape}")
Training set size: (552, 15), Test set size: (138, 15)
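Two caveats at this step: the scaler was fit on the full dataset before splitting, so test-set statistics leak into the transform, and the split is unstratified even though fraud labels are usually imbalanced. A variant sketch addressing both, splitting the raw features first and fitting the scaler on the training rows only (X_tr, X_te, etc. are illustrative names):

In [ ]:
# Variant sketch: stratified split on raw features, scaler fit on train only
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
scaler_alt = StandardScaler().fit(X_tr)  # statistics from training rows only
X_tr_scaled = scaler_alt.transform(X_tr)
X_te_scaled = scaler_alt.transform(X_te)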
In [ ]:
# Step 6: Optimal k Selection
# Use GridSearchCV to find the best k value for KNN
param_grid = {'n_neighbors': np.arange(1, 16)}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_k = grid_search.best_params_['n_neighbors']
print(f"\nOptimal k value found through Grid Search: {best_k}")
Optimal k value found through Grid Search: 11
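The grid search above tunes only n_neighbors on pre-scaled data. A broader hedged sketch wraps scaling and KNN in a Pipeline, so each cross-validation fold refits the scaler on its own training portion, and searches the weighting scheme as well (variable names here are illustrative):

In [ ]:
# Sketch: pipeline keeps per-fold scaling honest; also search weights
from sklearn.pipeline import Pipeline

X_tr_raw, _, y_tr_raw, _ = train_test_split(
    X, y, test_size=0.2, random_state=42)
pipe = Pipeline([('scale', StandardScaler()),
                 ('knn', KNeighborsClassifier())])
pipe_grid = {'knn__n_neighbors': np.arange(1, 16),
             'knn__weights': ['uniform', 'distance']}
pipe_search = GridSearchCV(pipe, pipe_grid, cv=5, scoring='accuracy')
pipe_search.fit(X_tr_raw, y_tr_raw)
print(pipe_search.best_params_)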
In [ ]:
# Step 7: Model Training, Evaluation, and Prediction
# Train KNN model with the best k and predict on the test set
knn_optimal = KNeighborsClassifier(n_neighbors=best_k)
knn_optimal.fit(X_train, y_train)
y_pred = knn_optimal.predict(X_test)
# Display accuracy and detailed classification report
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
# Confusion Matrix Visualization
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix Heatmap')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Model Accuracy: 89.13%

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.97      0.92        87
           1       0.93      0.76      0.84        51

    accuracy                           0.89       138
   macro avg       0.90      0.87      0.88       138
weighted avg       0.89      0.89      0.89       138

[Figure: confusion matrix heatmap of predicted vs. actual classes]
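Accuracy can flatter a classifier when classes are imbalanced, and the 0.76 recall on the fraud class above is the number that matters most here. Since KNeighborsClassifier exposes predict_proba, a short sketch of threshold-free metrics on the same test split:

In [ ]:
# Sketch: ranking metrics that are more informative than accuracy
# on imbalanced fraud data
from sklearn.metrics import roc_auc_score, average_precision_score

y_scores = knn_optimal.predict_proba(X_test)[:, 1]  # estimated P(class = 1)
print("ROC AUC:", roc_auc_score(y_test, y_scores))
print("Average precision:", average_precision_score(y_test, y_scores))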
In [ ]:
# Step 8: Error Rate Visualization
# Plot error rates for different k values to observe optimal choice
error_rate = []
for k in range(1, 16):
    knn_temp = KNeighborsClassifier(n_neighbors=k)
    knn_temp.fit(X_train, y_train)
    pred_temp = knn_temp.predict(X_test)
    error_rate.append(np.mean(pred_temp != y_test))
plt.figure(figsize=(8, 6))
plt.plot(range(1, 16), error_rate, marker='o')
plt.title('Error Rate vs. k Value')
plt.xlabel('k Value')
plt.ylabel('Error Rate')
plt.show()
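One caution about the loop above: it scores every k on the test set, which quietly reuses that set for model selection. A variant sketch that estimates the error curve by cross-validation on the training data instead, leaving the test set untouched:

In [ ]:
# Variant sketch: cross-validated error per k instead of test-set error
from sklearn.model_selection import cross_val_score

cv_error = []
for k in range(1, 16):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             X_train, y_train, cv=5, scoring='accuracy')
    cv_error.append(1 - scores.mean())
plt.figure(figsize=(8, 6))
plt.plot(range(1, 16), cv_error, marker='o')
plt.title('Cross-Validated Error Rate vs. k Value')
plt.xlabel('k Value')
plt.ylabel('Error Rate')
plt.show()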