In [ ]:
# Step 1: Data Loading and Initial Exploration
# Import necessary libraries
import pandas as pd
# Load the dataset
file_path = "bank.csv"
df = pd.read_csv(file_path, delimiter=";")
# Display first few rows
print("\nFirst 5 Rows of the Dataset:")
df.head()
First 5 Rows of the Dataset:
Out[ ]:
|   | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y |
|---|-----|-----|---------|-----------|---------|---------|---------|------|---------|-----|-------|----------|----------|-------|----------|----------|---|
| 0 | 30 | unemployed | married | primary | no | 1787 | no | no | cellular | 19 | oct | 79 | 1 | -1 | 0 | unknown | no |
| 1 | 33 | services | married | secondary | no | 4789 | yes | yes | cellular | 11 | may | 220 | 1 | 339 | 4 | failure | no |
| 2 | 35 | management | single | tertiary | no | 1350 | yes | no | cellular | 16 | apr | 185 | 1 | 330 | 1 | failure | no |
| 3 | 30 | management | married | tertiary | no | 1476 | yes | yes | unknown | 3 | jun | 199 | 4 | -1 | 0 | unknown | no |
| 4 | 59 | blue-collar | married | secondary | no | 0 | yes | no | unknown | 5 | may | 226 | 1 | -1 | 0 | unknown | no |
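A quick structural check of dtypes and missing values can precede any preprocessing. The cell below is an illustrative sketch, not part of the original run:

In [ ]:
# Optional sanity check (sketch): shape, dtypes, and missing values
print("Shape:", df.shape)            # expected (4521, 17) for the ';'-delimited bank.csv
print(df.dtypes)                     # object columns are the categorical features encoded later
print(df.isna().sum().sum(), "missing values")  # this dataset typically uses the string 'unknown' rather than NaN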
In [ ]:
# Convert target variable 'y' to binary (0 for 'no', 1 for 'yes')
df['y'] = df['y'].map({'no': 0, 'yes': 1})
# Check unique values in the target column
print("\nUnique values in target variable (y):")
print(df['y'].value_counts())
Class counts for target variable (y):
y
0    4000
1     521
Name: count, dtype: int64
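The counts above show a pronounced class imbalance: 4,000 "no" versus 521 "yes". A small sketch to make the imbalance explicit:

In [ ]:
# Sketch: share of positive responses in the target
positive_rate = df['y'].mean()
print(f"Share of 'yes' responses: {positive_rate:.1%}")  # roughly 11.5% (521 / 4521)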
In [ ]:
# Step 2: Encode Categorical Variables
# Identify categorical columns
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

# Apply one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Display the new structure of the dataset
print("\nDataset after encoding categorical variables:")
df_encoded.head()
Dataset after encoding categorical variables:
Out[ ]:
|   | age | balance | day | duration | campaign | pdays | previous | y | job_blue-collar | job_entrepreneur | ... | month_jul | month_jun | month_mar | month_may | month_nov | month_oct | month_sep | poutcome_other | poutcome_success | poutcome_unknown |
|---|-----|---------|-----|----------|----------|-------|----------|---|-----------------|------------------|-----|-----------|-----------|-----------|-----------|-----------|-----------|-----------|----------------|------------------|------------------|
| 0 | 30 | 1787 | 19 | 79 | 1 | -1 | 0 | 0 | False | False | ... | False | False | False | False | False | True | False | False | False | True |
| 1 | 33 | 4789 | 11 | 220 | 1 | 339 | 4 | 0 | False | False | ... | False | False | False | True | False | False | False | False | False | False |
| 2 | 35 | 1350 | 16 | 185 | 1 | 330 | 1 | 0 | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 3 | 30 | 1476 | 3 | 199 | 4 | -1 | 0 | 0 | False | False | ... | False | True | False | False | False | False | False | False | False | True |
| 4 | 59 | 0 | 5 | 226 | 1 | -1 | 0 | 0 | True | False | ... | False | False | False | True | False | False | False | False | False | True |

5 rows × 43 columns
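Because drop_first=True removes one dummy per categorical column, the dataset grows from 17 to 43 columns rather than 17 plus the full number of category levels. A sketch to confirm which dummy columns were created:

In [ ]:
# Sketch: list the dummy columns produced by get_dummies
dummy_cols = [c for c in df_encoded.columns if c not in df.columns]
print(len(dummy_cols), "dummy columns created")
print(dummy_cols[:10])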

In [ ]:
# Step 3: Separate Features and Target Variable
# Separate features and target variable
X = df_encoded.drop(columns=['y'])
y = df_encoded['y']

# Display shape of feature matrix and target vector
print("\nShape of Features (X):", X.shape)
print("Shape of Target (y):", y.shape)
Shape of Features (X): (4521, 42)
Shape of Target (y): (4521,)
In [ ]:
# Step 4: Feature Scaling
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Apply scaling to features
X_scaled = scaler.fit_transform(X)

# Convert back to DataFrame for better understanding
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Display first 5 rows of scaled data
print("\nFirst 5 rows after feature scaling:")
X_scaled_df.head()
First 5 rows after feature scaling:
Out[ ]:
|   | age | balance | day | duration | campaign | pdays | previous | job_blue-collar | job_entrepreneur | job_housemaid | ... | month_jul | month_jun | month_mar | month_may | month_nov | month_oct | month_sep | poutcome_other | poutcome_success | poutcome_unknown |
|---|-----|---------|-----|----------|----------|-------|----------|-----------------|------------------|---------------|-----|-----------|-----------|-----------|-----------|-----------|-----------|-----------|----------------|------------------|------------------|
| 0 | -1.056270 | 0.121072 | 0.374052 | -0.711861 | -0.576829 | -0.407218 | -0.320413 | -0.514408 | -0.196454 | -0.159382 | ... | -0.430185 | -0.364805 | -0.104676 | -0.669064 | -0.306828 | 7.450671 | -0.107869 | -0.213447 | -0.171381 | 0.469300 |
| 1 | -0.772583 | 1.118644 | -0.596026 | -0.169194 | -0.576829 | 2.989044 | 2.041734 | -0.514408 | -0.196454 | -0.159382 | ... | -0.430185 | -0.364805 | -0.104676 | 1.494626 | -0.306828 | -0.134216 | -0.107869 | -0.213447 | -0.171381 | -2.130831 |
| 2 | -0.583458 | -0.024144 | 0.010273 | -0.303898 | -0.576829 | 2.899143 | 0.270124 | -0.514408 | -0.196454 | -0.159382 | ... | -0.430185 | -0.364805 | -0.104676 | -0.669064 | -0.306828 | -0.134216 | -0.107869 | -0.213447 | -0.171381 | -2.130831 |
| 3 | -1.056270 | 0.017726 | -1.566105 | -0.250017 | 0.387967 | -0.407218 | -0.320413 | -0.514408 | -0.196454 | -0.159382 | ... | -0.430185 | 2.741190 | -0.104676 | -0.669064 | -0.306828 | -0.134216 | -0.107869 | -0.213447 | -0.171381 | 0.469300 |
| 4 | 1.686036 | -0.472753 | -1.323585 | -0.146102 | -0.576829 | -0.407218 | -0.320413 | 1.943983 | -0.196454 | -0.159382 | ... | -0.430185 | -0.364805 | -0.104676 | 1.494626 | -0.306828 | -0.134216 | -0.107869 | -0.213447 | -0.171381 | 0.469300 |

5 rows × 42 columns
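As a quick check on the scaling step, each standardized column should have a mean close to 0 and a standard deviation close to 1 (pandas' std uses ddof=1, so it will not be exactly 1). An illustrative sketch:

In [ ]:
# Sketch: verify that scaling centered and rescaled each feature
print("Max |mean| across columns:", X_scaled_df.mean().abs().max())  # should be ~0
print("Std of first few columns:\n", X_scaled_df.std().head())       # should be ~1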

In [ ]:
# Step 5: Split Data into Training and Testing Sets
from sklearn.model_selection import train_test_split

# Split the dataset (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Display dataset sizes
print("\nTraining set size:", X_train.shape)
print("Testing set size:", X_test.shape)
Training set size: (3616, 42)
Testing set size: (905, 42)
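Note that the scaler above was fit on the full dataset before splitting, so a small amount of information from the test rows leaks into the training statistics, and the split is not stratified despite the roughly 11.5% positive rate. A more conservative variant (a sketch, not what was run above) stratifies the split and fits the scaler on the training portion only:

In [ ]:
# Sketch: stratified split, with the scaler fit on training rows only (no test leakage)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler_tr = StandardScaler().fit(X_tr)   # statistics computed from the training split alone
X_tr_scaled = scaler_tr.transform(X_tr)
X_te_scaled = scaler_tr.transform(X_te)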
In [ ]:
# Step 6: Train Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
Out[ ]:
LogisticRegression(max_iter=1000)
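Since the features were standardized, the fitted coefficients are roughly comparable in magnitude, so a quick look at the largest ones hints at which columns drive the model. An illustrative sketch:

In [ ]:
# Sketch: largest-magnitude coefficients of the fitted model
coefs = pd.Series(model.coef_[0], index=X.columns)
print(coefs.reindex(coefs.abs().sort_values(ascending=False).index).head(10))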
In [ ]:
# Step 7: Make Predictions
# Predict on test data
y_pred = model.predict(X_test)

# Display first 10 predictions vs actual values
print("\nFirst 10 Actual vs Predicted Values:")
print(pd.DataFrame({'Actual': y_test[:10].values, 'Predicted': y_pred[:10]}))
First 10 Actual vs Predicted Values:
   Actual  Predicted
0       0          0
1       0          0
2       0          0
3       0          0
4       0          0
5       0          0
6       0          0
7       0          0
8       0          0
9       0          0
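predict() applies a 0.5 threshold to the positive-class probability, which is why these first ten test samples (all from the majority class) are predicted as 0. A sketch showing the probabilities behind the hard labels:

In [ ]:
# Sketch: probabilities behind the hard predictions (default threshold = 0.5)
proba = model.predict_proba(X_test)[:, 1]
print(pd.DataFrame({'Actual': y_test[:10].values,
                    'Prob_yes': proba[:10].round(3),
                    'Predicted': (proba[:10] >= 0.5).astype(int)}))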
In [ ]:
# Step 8: Evaluate the Model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Generate classification report
class_report = classification_report(y_test, y_pred)

# Display evaluation metrics
print("\nModel Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
Model Accuracy: 0.901657458563536

Confusion Matrix:
 [[789  18]
 [ 71  27]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95       807
           1       0.60      0.28      0.38        98

    accuracy                           0.90       905
   macro avg       0.76      0.63      0.66       905
weighted avg       0.88      0.90      0.88       905
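
Accuracy is high, but recall for the positive class is only 0.28, largely a consequence of the class imbalance noted earlier. One common adjustment (a sketch; its results will differ from the run above) is to reweight the classes:

In [ ]:
# Sketch: class-weighted logistic regression to trade precision for minority-class recall
model_bal = LogisticRegression(max_iter=1000, class_weight='balanced')
model_bal.fit(X_train, y_train)
print(classification_report(y_test, model_bal.predict(X_test)))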

In [ ]:
# Step 9: Visualize the Results
import matplotlib.pyplot as plt
import seaborn as sns

# Plot Confusion Matrix
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# ROC Curve
from sklearn.metrics import roc_curve, auc

# Get probability scores
y_prob = model.predict_proba(X_test)[:,1]

# Compute ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC Curve
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
[Figure: confusion matrix heatmap]
[Figure: ROC curve]
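With only about 11% positives, a precision-recall curve is a useful complement to the ROC curve, which can look optimistic on imbalanced data. A sketch:

In [ ]:
# Sketch: precision-recall curve and average precision on the test set
from sklearn.metrics import precision_recall_curve, average_precision_score

prec, rec, _ = precision_recall_curve(y_test, y_prob)
ap = average_precision_score(y_test, y_prob)

plt.figure(figsize=(6, 4))
plt.plot(rec, prec, color='blue', label=f'PR Curve (AP = {ap:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='upper right')
plt.show()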
In [ ]:
# Step 10: Plot Actual vs Predicted Values
# Scatter plot for actual vs predicted values
plt.figure(figsize=(10,5))
plt.scatter(range(len(y_test)), y_test, color='blue', label='Actual', alpha=0.6)
plt.scatter(range(len(y_pred)), y_pred, color='red', label='Predicted', alpha=0.6, marker='x')

plt.xlabel('Sample Index')
plt.ylabel('Class (0 = No, 1 = Yes)')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.show()
[Figure: scatter plot of actual vs. predicted class labels]