In [ ]:
# Step 1: Data Loading and Initial Exploration
# Import necessary libraries
import pandas as pd
# Load the dataset
file_path = "bank.csv"
df = pd.read_csv(file_path, delimiter=";")
# Display first few rows
print("\nFirst 5 Rows of the Dataset:")
df.head()
First 5 Rows of the Dataset:
Out[ ]:
| | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 30 | unemployed | married | primary | no | 1787 | no | no | cellular | 19 | oct | 79 | 1 | -1 | 0 | unknown | no |
| 1 | 33 | services | married | secondary | no | 4789 | yes | yes | cellular | 11 | may | 220 | 1 | 339 | 4 | failure | no |
| 2 | 35 | management | single | tertiary | no | 1350 | yes | no | cellular | 16 | apr | 185 | 1 | 330 | 1 | failure | no |
| 3 | 30 | management | married | tertiary | no | 1476 | yes | yes | unknown | 3 | jun | 199 | 4 | -1 | 0 | unknown | no |
| 4 | 59 | blue-collar | married | secondary | no | 0 | yes | no | unknown | 5 | may | 226 | 1 | -1 | 0 | unknown | no |
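Before any transformation, it helps to confirm the dataset's dimensions, column dtypes, and whether any values are missing. A minimal sanity check (assuming the standard UCI Bank Marketing `bank.csv`, which should load as 4,521 rows × 17 columns):

In [ ]:
# Sanity check: dimensions and missing values
print("Dataset shape:", df.shape)
print("\nMissing values per column:")
print(df.isnull().sum())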
In [ ]:
# Convert target variable 'y' to binary (0 for 'no', 1 for 'yes')
df['y'] = df['y'].map({'no': 0, 'yes': 1})
# Check unique values in the target column
print("\nUnique values in target variable (y):")
print(df['y'].value_counts())
Unique values in target variable (y):
y
0    4000
1     521
Name: count, dtype: int64
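The counts above reveal a pronounced class imbalance: only 521 of 4,521 clients (about 11.5%) subscribed. Viewing the proportions directly makes this easier to quote later:

In [ ]:
# Class proportions rather than raw counts
print(df['y'].value_counts(normalize=True))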
In [ ]:
# Step 2: Encode Categorical Variables
# Identify categorical columns
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
# Apply one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
# Display the new structure of the dataset
print("\nDataset after encoding categorical variables:")
df_encoded.head()
Dataset after encoding categorical variables:
Out[ ]:
| | age | balance | day | duration | campaign | pdays | previous | y | job_blue-collar | job_entrepreneur | ... | month_jul | month_jun | month_mar | month_may | month_nov | month_oct | month_sep | poutcome_other | poutcome_success | poutcome_unknown |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 30 | 1787 | 19 | 79 | 1 | -1 | 0 | 0 | False | False | ... | False | False | False | False | False | True | False | False | False | True |
| 1 | 33 | 4789 | 11 | 220 | 1 | 339 | 4 | 0 | False | False | ... | False | False | False | True | False | False | False | False | False | False |
| 2 | 35 | 1350 | 16 | 185 | 1 | 330 | 1 | 0 | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 3 | 30 | 1476 | 3 | 199 | 4 | -1 | 0 | 0 | False | False | ... | False | True | False | False | False | False | False | False | False | True |
| 4 | 59 | 0 | 5 | 226 | 1 | -1 | 0 | 0 | True | False | ... | False | False | False | True | False | False | False | False | False | True |
5 rows × 43 columns
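With drop_first=True, get_dummies drops one dummy level per categorical column to avoid perfect multicollinearity, which is why the frame grows to 43 columns rather than more. Note also that recent pandas versions emit boolean dummies (the True/False values above); StandardScaler will coerce them to floats, but if explicit 0/1 integers are preferred, get_dummies accepts a dtype argument:

In [ ]:
# Optional: request integer dummies instead of booleans
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)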
In [ ]:
# Step 3: Separate Features and Target Variable
X = df_encoded.drop(columns=['y'])
y = df_encoded['y']
# Display shape of feature matrix and target vector
print("\nShape of Features (X):", X.shape)
print("Shape of Target (y):", y.shape)
Shape of Features (X): (4521, 42)
Shape of Target (y): (4521,)
In [ ]:
# Step 4: Feature Scaling
from sklearn.preprocessing import StandardScaler
# Initialize the StandardScaler
scaler = StandardScaler()
# Apply scaling to features
X_scaled = scaler.fit_transform(X)
# Convert back to a DataFrame for easier inspection
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
# Display first 5 rows of scaled data
print("\nFirst 5 rows after feature scaling:")
X_scaled_df.head()
First 5 rows after feature scaling:
Out[ ]:
| | age | balance | day | duration | campaign | pdays | previous | job_blue-collar | job_entrepreneur | job_housemaid | ... | month_jul | month_jun | month_mar | month_may | month_nov | month_oct | month_sep | poutcome_other | poutcome_success | poutcome_unknown |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.056270 | 0.121072 | 0.374052 | -0.711861 | -0.576829 | -0.407218 | -0.320413 | -0.514408 | -0.196454 | -0.159382 | ... | -0.430185 | -0.364805 | -0.104676 | -0.669064 | -0.306828 | 7.450671 | -0.107869 | -0.213447 | -0.171381 | 0.469300 |
| 1 | -0.772583 | 1.118644 | -0.596026 | -0.169194 | -0.576829 | 2.989044 | 2.041734 | -0.514408 | -0.196454 | -0.159382 | ... | -0.430185 | -0.364805 | -0.104676 | 1.494626 | -0.306828 | -0.134216 | -0.107869 | -0.213447 | -0.171381 | -2.130831 |
| 2 | -0.583458 | -0.024144 | 0.010273 | -0.303898 | -0.576829 | 2.899143 | 0.270124 | -0.514408 | -0.196454 | -0.159382 | ... | -0.430185 | -0.364805 | -0.104676 | -0.669064 | -0.306828 | -0.134216 | -0.107869 | -0.213447 | -0.171381 | -2.130831 |
| 3 | -1.056270 | 0.017726 | -1.566105 | -0.250017 | 0.387967 | -0.407218 | -0.320413 | -0.514408 | -0.196454 | -0.159382 | ... | -0.430185 | 2.741190 | -0.104676 | -0.669064 | -0.306828 | -0.134216 | -0.107869 | -0.213447 | -0.171381 | 0.469300 |
| 4 | 1.686036 | -0.472753 | -1.323585 | -0.146102 | -0.576829 | -0.407218 | -0.320413 | 1.943983 | -0.196454 | -0.159382 | ... | -0.430185 | -0.364805 | -0.104676 | 1.494626 | -0.306828 | -0.134216 | -0.107869 | -0.213447 | -0.171381 | 0.469300 |
5 rows × 42 columns
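Standardizing the 0/1 dummy columns alongside the true numerics, as done above, is harmless for logistic regression but not required. If you would rather scale only the original numeric columns and pass the dummies through untouched, a ColumnTransformer sketch (column names taken from the frame above):

In [ ]:
from sklearn.compose import ColumnTransformer

# Standardize only the numeric columns; leave dummy columns as-is
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
ct = ColumnTransformer([('num', StandardScaler(), numeric_cols)],
                       remainder='passthrough')
X_alt = ct.fit_transform(X)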
In [ ]:
# Step 5: Split Data into Training and Testing Sets
from sklearn.model_selection import train_test_split
# Split the dataset (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Display dataset sizes
print("\nTraining set size:", X_train.shape)
print("Testing set size:", X_test.shape)
Training set size: (3616, 42)
Testing set size: (905, 42)
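Two refinements are worth considering here. First, with only ~11.5% positive labels, stratify=y keeps the class ratio identical across the two splits. Second, the scaler above was fitted on the full dataset before splitting, which leaks test-set statistics into training; fitting it on the training split alone avoids that. A sketch combining both:

In [ ]:
# Stratified split on the unscaled features, then scale with training statistics only
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)  # reuse the training fit; no test leakage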
In [ ]:
# Step 6: Train Logistic Regression Model
from sklearn.linear_model import LogisticRegression
# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
Out[ ]:
LogisticRegression(max_iter=1000)
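Because the inputs were standardized, the fitted coefficients are on comparable scales and can be read as rough importance indicators. A quick look at the strongest weights, paired with feature names:

In [ ]:
# Pair coefficients with feature names; largest magnitudes at either end
coefs = pd.Series(model.coef_[0], index=X.columns).sort_values()
print("Most negative coefficients:\n", coefs.head(5))
print("\nMost positive coefficients:\n", coefs.tail(5))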
In [ ]:
# Step 7: Make Predictions
# Predict on test data
y_pred = model.predict(X_test)
# Display first 10 predictions vs actual values
print("\nFirst 10 Actual vs Predicted Values:")
print(pd.DataFrame({'Actual': y_test[:10].values, 'Predicted': y_pred[:10]}))
First 10 Actual vs Predicted Values:
   Actual  Predicted
0       0          0
1       0          0
2       0          0
3       0          0
4       0          0
5       0          0
6       0          0
7       0          0
8       0          0
9       0          0
In [ ]:
# Step 8: Evaluate the Model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Generate classification report
class_report = classification_report(y_test, y_pred)
# Display evaluation metrics
print("\nModel Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
Model Accuracy: 0.901657458563536

Confusion Matrix:
 [[789  18]
 [ 71  27]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95       807
           1       0.60      0.28      0.38        98

    accuracy                           0.90       905
   macro avg       0.76      0.63      0.66       905
weighted avg       0.88      0.90      0.88       905
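The headline accuracy of 0.90 is flattering: recall for the positive class is only 0.28, so the model misses most actual subscribers, and a classifier that always predicts "no" would already score about 0.89 on this split. One common remedy, sketched here rather than prescribed, is to reweight the classes during training:

In [ ]:
# Retrain with class weights inversely proportional to class frequencies;
# this usually trades some precision for better positive-class recall
model_bal = LogisticRegression(max_iter=1000, class_weight='balanced')
model_bal.fit(X_train, y_train)
print(classification_report(y_test, model_bal.predict(X_test)))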
In [ ]:
# Step 9: Visualize the Results
import matplotlib.pyplot as plt
import seaborn as sns
# Plot Confusion Matrix
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
# ROC Curve
from sklearn.metrics import roc_curve, auc
# Get probability scores
y_prob = model.predict_proba(X_test)[:,1]
# Compute ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
# Plot ROC Curve
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
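On imbalanced data a precision-recall curve is often more informative than ROC, since it ignores the abundant true negatives. A companion plot using the same probability scores:

In [ ]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Precision-recall curve for the positive class
precision, recall, _ = precision_recall_curve(y_test, y_prob)
ap = average_precision_score(y_test, y_prob)

plt.figure(figsize=(6, 4))
plt.plot(recall, precision, color='blue', label=f'PR Curve (AP = {ap:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='upper right')
plt.show()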
In [ ]:
# Step 10: Plot Actual vs Predicted Values
# Scatter plot for actual vs predicted values
plt.figure(figsize=(10,5))
plt.scatter(range(len(y_test)), y_test, color='blue', label='Actual', alpha=0.6)
plt.scatter(range(len(y_pred)), y_pred, color='red', label='Predicted', alpha=0.6, marker='x')
plt.xlabel('Sample Index')
plt.ylabel('Class (0 = No, 1 = Yes)')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.show()