In [ ]:
# Step 1: Data Loading and Initial Exploration
# Import necessary libraries
import pandas as pd
# Load the dataset
file_path = "bank.csv"
df = pd.read_csv(file_path, delimiter=";")
# Display first few rows
print("\nFirst 5 Rows of the Dataset:")
df.head()
First 5 Rows of the Dataset:
Out[ ]:
| | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 30 | unemployed | married | primary | no | 1787 | no | no | cellular | 19 | oct | 79 | 1 | -1 | 0 | unknown | no |
| 1 | 33 | services | married | secondary | no | 4789 | yes | yes | cellular | 11 | may | 220 | 1 | 339 | 4 | failure | no |
| 2 | 35 | management | single | tertiary | no | 1350 | yes | no | cellular | 16 | apr | 185 | 1 | 330 | 1 | failure | no |
| 3 | 30 | management | married | tertiary | no | 1476 | yes | yes | unknown | 3 | jun | 199 | 4 | -1 | 0 | unknown | no |
| 4 | 59 | blue-collar | married | secondary | no | 0 | yes | no | unknown | 5 | may | 226 | 1 | -1 | 0 | unknown | no |
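Before any transformation, it helps to confirm the dataset's dimensions, column dtypes, and whether any values are missing. A minimal sanity check (assuming the standard UCI Bank Marketing `bank.csv`, which should load as 4,521 rows × 17 columns):

In [ ]:
# Sanity check: dimensions and missing values
print("Dataset shape:", df.shape)
print("\nMissing values per column:")
print(df.isnull().sum())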
In [ ]:
# Convert target variable 'y' to binary (0 for 'no', 1 for 'yes')
df['y'] = df['y'].map({'no': 0, 'yes': 1})
# Check unique values in the target column
print("\nUnique values in target variable (y):")
print(df['y'].value_counts())
Unique values in target variable (y):
y
0    4000
1     521
Name: count, dtype: int64
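The counts above reveal a pronounced class imbalance: only 521 of 4,521 clients (about 11.5%) subscribed. Viewing the proportions directly makes this easier to quote later:

In [ ]:
# Class proportions rather than raw counts
print(df['y'].value_counts(normalize=True))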
In [ ]:
# Step 2: Encode Categorical Variables
# Identify categorical columns
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
# Apply one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
# Display the new structure of the dataset
print("\nDataset after encoding categorical variables:")
df_encoded.head()
Dataset after encoding categorical variables:
Out[ ]:
| | age | balance | day | duration | campaign | pdays | previous | y | job_blue-collar | job_entrepreneur | ... | month_jul | month_jun | month_mar | month_may | month_nov | month_oct | month_sep | poutcome_other | poutcome_success | poutcome_unknown |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 30 | 1787 | 19 | 79 | 1 | -1 | 0 | 0 | False | False | ... | False | False | False | False | False | True | False | False | False | True |
| 1 | 33 | 4789 | 11 | 220 | 1 | 339 | 4 | 0 | False | False | ... | False | False | False | True | False | False | False | False | False | False |
| 2 | 35 | 1350 | 16 | 185 | 1 | 330 | 1 | 0 | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 3 | 30 | 1476 | 3 | 199 | 4 | -1 | 0 | 0 | False | False | ... | False | True | False | False | False | False | False | False | False | True |
| 4 | 59 | 0 | 5 | 226 | 1 | -1 | 0 | 0 | True | False | ... | False | False | False | True | False | False | False | False | False | True |
5 rows × 43 columns
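With drop_first=True, get_dummies drops one dummy level per categorical column to avoid perfect multicollinearity, which is why the frame grows to 43 columns rather than more. Note also that recent pandas versions emit boolean dummies (the True/False values above); StandardScaler will coerce them to floats, but if explicit 0/1 integers are preferred, get_dummies accepts a dtype argument:

In [ ]:
# Optional: request integer dummies instead of booleans
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)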
In [ ]:
# Step 3: Separate Features and Target Variable
X = df_encoded.drop(columns=['y'])
y = df_encoded['y']
# Display shape of feature matrix and target vector
print("\nShape of Features (X):", X.shape)
print("Shape of Target (y):", y.shape)
Shape of Features (X): (4521, 42)
Shape of Target (y): (4521,)
In [ ]:
# Step 4: Feature Scaling
from sklearn.preprocessing import StandardScaler
# Initialize the StandardScaler
scaler = StandardScaler()
# Apply scaling to features
X_scaled = scaler.fit_transform(X)
# Convert back to a DataFrame for easier inspection
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
# Display first 5 rows of scaled data
print("\nFirst 5 rows after feature scaling:")
X_scaled_df.head()
First 5 rows after feature scaling:
Out[ ]:
| | age | balance | day | duration | campaign | pdays | previous | job_blue-collar | job_entrepreneur | job_housemaid | ... | month_jul | month_jun | month_mar | month_may | month_nov | month_oct | month_sep | poutcome_other | poutcome_success | poutcome_unknown |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.056270 | 0.121072 | 0.374052 | -0.711861 | -0.576829 | -0.407218 | -0.320413 | -0.514408 | -0.196454 | -0.159382 | ... | -0.430185 | -0.364805 | -0.104676 | -0.669064 | -0.306828 | 7.450671 | -0.107869 | -0.213447 | -0.171381 | 0.469300 |
| 1 | -0.772583 | 1.118644 | -0.596026 | -0.169194 | -0.576829 | 2.989044 | 2.041734 | -0.514408 | -0.196454 | -0.159382 | ... | -0.430185 | -0.364805 | -0.104676 | 1.494626 | -0.306828 | -0.134216 | -0.107869 | -0.213447 | -0.171381 | -2.130831 |
| 2 | -0.583458 | -0.024144 | 0.010273 | -0.303898 | -0.576829 | 2.899143 | 0.270124 | -0.514408 | -0.196454 | -0.159382 | ... | -0.430185 | -0.364805 | -0.104676 | -0.669064 | -0.306828 | -0.134216 | -0.107869 | -0.213447 | -0.171381 | -2.130831 |
| 3 | -1.056270 | 0.017726 | -1.566105 | -0.250017 | 0.387967 | -0.407218 | -0.320413 | -0.514408 | -0.196454 | -0.159382 | ... | -0.430185 | 2.741190 | -0.104676 | -0.669064 | -0.306828 | -0.134216 | -0.107869 | -0.213447 | -0.171381 | 0.469300 |
| 4 | 1.686036 | -0.472753 | -1.323585 | -0.146102 | -0.576829 | -0.407218 | -0.320413 | 1.943983 | -0.196454 | -0.159382 | ... | -0.430185 | -0.364805 | -0.104676 | 1.494626 | -0.306828 | -0.134216 | -0.107869 | -0.213447 | -0.171381 | 0.469300 |
5 rows × 42 columns
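Standardizing the 0/1 dummy columns alongside the true numerics, as done above, is harmless for logistic regression but not required. If you would rather scale only the original numeric columns and pass the dummies through untouched, a ColumnTransformer sketch (column names taken from the frame above):

In [ ]:
from sklearn.compose import ColumnTransformer

# Standardize only the numeric columns; leave dummy columns as-is
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
ct = ColumnTransformer([('num', StandardScaler(), numeric_cols)],
                       remainder='passthrough')
X_alt = ct.fit_transform(X)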
In [ ]:
# Step 5: Split Data into Training and Testing Sets
from sklearn.model_selection import train_test_split
# Split the dataset (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Display dataset sizes
print("\nTraining set size:", X_train.shape)
print("Testing set size:", X_test.shape)
Training set size: (3616, 42)
Testing set size: (905, 42)
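Two refinements are worth considering here. First, with only ~11.5% positive labels, stratify=y keeps the class ratio identical across the two splits. Second, the scaler above was fitted on the full dataset before splitting, which leaks test-set statistics into training; fitting it on the training split alone avoids that. A sketch combining both:

In [ ]:
# Stratified split on the unscaled features, then scale with training statistics only
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)  # reuse the training fit; no test leakage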
In [ ]:
# Step 6: Train Logistic Regression Model
from sklearn.linear_model import LogisticRegression
# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
Out[ ]:
LogisticRegression(max_iter=1000)
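Because the inputs were standardized, the fitted coefficients are on comparable scales and can be read as rough importance indicators. A quick look at the strongest weights, paired with feature names:

In [ ]:
# Pair coefficients with feature names; largest magnitudes at either end
coefs = pd.Series(model.coef_[0], index=X.columns).sort_values()
print("Most negative coefficients:\n", coefs.head(5))
print("\nMost positive coefficients:\n", coefs.tail(5))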
In [ ]:
# Step 7: Make Predictions
# Predict on test data
y_pred = model.predict(X_test)
# Display first 10 predictions vs actual values
print("\nFirst 10 Actual vs Predicted Values:")
print(pd.DataFrame({'Actual': y_test[:10].values, 'Predicted': y_pred[:10]}))
First 10 Actual vs Predicted Values:
   Actual  Predicted
0       0          0
1       0          0
2       0          0
3       0          0
4       0          0
5       0          0
6       0          0
7       0          0
8       0          0
9       0          0
In [ ]:
# Step 8: Evaluate the Model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Generate classification report
class_report = classification_report(y_test, y_pred)
# Display evaluation metrics
print("\nModel Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
Model Accuracy: 0.901657458563536

Confusion Matrix:
 [[789  18]
 [ 71  27]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95       807
           1       0.60      0.28      0.38        98

    accuracy                           0.90       905
   macro avg       0.76      0.63      0.66       905
weighted avg       0.88      0.90      0.88       905
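The headline accuracy of 0.90 is flattering: recall for the positive class is only 0.28, so the model misses most actual subscribers, and a classifier that always predicts "no" would already score about 0.89 on this split. One common remedy, sketched here rather than prescribed, is to reweight the classes during training:

In [ ]:
# Retrain with class weights inversely proportional to class frequencies;
# this usually trades some precision for better positive-class recall
model_bal = LogisticRegression(max_iter=1000, class_weight='balanced')
model_bal.fit(X_train, y_train)
print(classification_report(y_test, model_bal.predict(X_test)))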
In [ ]:
# Step 9: Visualize the Results
import matplotlib.pyplot as plt
import seaborn as sns
# Plot Confusion Matrix
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
# ROC Curve
from sklearn.metrics import roc_curve, auc
# Get probability scores
y_prob = model.predict_proba(X_test)[:,1]
# Compute ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
# Plot ROC Curve
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
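On imbalanced data a precision-recall curve is often more informative than ROC, since it ignores the abundant true negatives. A companion plot using the same probability scores:

In [ ]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Precision-recall curve for the positive class
precision, recall, _ = precision_recall_curve(y_test, y_prob)
ap = average_precision_score(y_test, y_prob)

plt.figure(figsize=(6, 4))
plt.plot(recall, precision, color='blue', label=f'PR Curve (AP = {ap:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='upper right')
plt.show()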
In [ ]:
# Step 10: Plot Actual vs Predicted Values
# Scatter plot for actual vs predicted values
plt.figure(figsize=(10,5))
plt.scatter(range(len(y_test)), y_test, color='blue', label='Actual', alpha=0.6)
plt.scatter(range(len(y_pred)), y_pred, color='red', label='Predicted', alpha=0.6, marker='x')
plt.xlabel('Sample Index')
plt.ylabel('Class (0 = No, 1 = Yes)')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.show()