In [ ]:
# Step 1: Import libraries and load the dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
data = pd.read_csv('Breast cancer.csv')
data.head()
Out[ ]:
| | id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | ... | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | ... | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | ... | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | ... | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | ... | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
5 rows × 33 columns
In [ ]:
# Step 2: Exploratory Data Analysis (EDA)
print("Missing Values:")
print(data.isnull().sum())
print("Dataset Overview:")
data.describe()
Missing Values:
id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
Unnamed: 32              569
dtype: int64
Dataset Overview:
Out[ ]:
| | id | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 5.690000e+02 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | ... | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 0.0 |
mean | 3.037183e+07 | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | ... | 25.677223 | 107.261213 | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 | NaN |
std | 1.250206e+08 | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | ... | 6.146258 | 33.602542 | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 | NaN |
min | 8.670000e+03 | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | ... | 12.020000 | 50.410000 | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 | NaN |
25% | 8.692180e+05 | 11.700000 | 16.170000 | 75.170000 | 420.300000 | 0.086370 | 0.064920 | 0.029560 | 0.020310 | 0.161900 | ... | 21.080000 | 84.110000 | 515.300000 | 0.116600 | 0.147200 | 0.114500 | 0.064930 | 0.250400 | 0.071460 | NaN |
50% | 9.060240e+05 | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 0.092630 | 0.061540 | 0.033500 | 0.179200 | ... | 25.410000 | 97.660000 | 686.500000 | 0.131300 | 0.211900 | 0.226700 | 0.099930 | 0.282200 | 0.080040 | NaN |
75% | 8.813129e+06 | 15.780000 | 21.800000 | 104.100000 | 782.700000 | 0.105300 | 0.130400 | 0.130700 | 0.074000 | 0.195700 | ... | 29.720000 | 125.400000 | 1084.000000 | 0.146000 | 0.339100 | 0.382900 | 0.161400 | 0.317900 | 0.092080 | NaN |
max | 9.113205e+08 | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | ... | 49.540000 | 251.200000 | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 | NaN |
8 rows × 32 columns
In [ ]:
# Visualize Diagnosis Distribution
sns.countplot(x='diagnosis', hue='diagnosis', data=data, palette='coolwarm')
plt.title("Diagnosis Distribution")
plt.show()
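The count plot shows the class balance visually; for exact figures, a small supplementary check (an addition, not part of the original pipeline) prints the counts and proportions:
In [ ]:
# Supplementary check (added): exact class counts and proportions
print(data['diagnosis'].value_counts())
print(data['diagnosis'].value_counts(normalize=True).round(3))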
In [ ]:
# Step 3: Data Cleaning
# Drop unnecessary columns
data_cleaned = data.drop(columns=["id", "Unnamed: 32"], errors='ignore')
# Encode the target variable 'diagnosis' (M -> 1, B -> 0)
data_cleaned['diagnosis'] = data_cleaned['diagnosis'].map({'M': 1, 'B': 0})
data_cleaned.head()
Out[ ]:
| | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | ... | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
1 | 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | ... | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
2 | 1 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | ... | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
3 | 1 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | ... | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
4 | 1 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | ... | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
5 rows × 31 columns
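As an optional sanity check (an addition, not in the original notebook), one can confirm that no missing values remain after dropping the empty column and that the target is now binary:
In [ ]:
# Optional sanity check (added): no NaNs remain, target is binary 0/1
assert data_cleaned.isnull().sum().sum() == 0
print(sorted(data_cleaned['diagnosis'].unique()))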
In [ ]:
# Step 4: Feature Correlation Analysis
# Display the features most strongly correlated with the target
correlations = data_cleaned.corr()['diagnosis'].sort_values(ascending=False)
print("Top Features Correlated with Diagnosis:")
correlations.head(10)
Top Features Correlated with Diagnosis:
Out[ ]:
feature | correlation with diagnosis |
---|---|
diagnosis | 1.000000 |
concave points_worst | 0.793566 |
perimeter_worst | 0.782914 |
concave points_mean | 0.776614 |
radius_worst | 0.776454 |
perimeter_mean | 0.742636 |
area_worst | 0.733825 |
radius_mean | 0.730029 |
area_mean | 0.708984 |
concavity_mean | 0.696360 |
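Several of these top features measure related geometry (radius, perimeter, area), so they are likely correlated with one another as well. The heatmap below is a supplementary sketch (an addition, not in the original notebook) over the ten features listed above:
In [ ]:
# Supplementary sketch (added): correlations among the top-10 features from above
top_features = correlations.head(10).index
plt.figure(figsize=(8, 6))
sns.heatmap(data_cleaned[top_features].corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Among Top Features")
plt.show()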
In [ ]:
# Step 5: Data Normalization
# Scale numerical features to the [0, 1] range with MinMaxScaler
scaler = MinMaxScaler()
features = data_cleaned.drop(columns=["diagnosis"])
normalized_features = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)
# Combine normalized features with the target column
processed_data = pd.concat([normalized_features, data_cleaned['diagnosis']], axis=1)
processed_data.head()
Out[ ]:
| | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | diagnosis |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.521037 | 0.022658 | 0.545989 | 0.363733 | 0.593753 | 0.792037 | 0.703140 | 0.731113 | 0.686364 | 0.605518 | ... | 0.141525 | 0.668310 | 0.450698 | 0.601136 | 0.619292 | 0.568610 | 0.912027 | 0.598462 | 0.418864 | 1 |
1 | 0.643144 | 0.272574 | 0.615783 | 0.501591 | 0.289880 | 0.181768 | 0.203608 | 0.348757 | 0.379798 | 0.141323 | ... | 0.303571 | 0.539818 | 0.435214 | 0.347553 | 0.154563 | 0.192971 | 0.639175 | 0.233590 | 0.222878 | 1 |
2 | 0.601496 | 0.390260 | 0.595743 | 0.449417 | 0.514309 | 0.431017 | 0.462512 | 0.635686 | 0.509596 | 0.211247 | ... | 0.360075 | 0.508442 | 0.374508 | 0.483590 | 0.385375 | 0.359744 | 0.835052 | 0.403706 | 0.213433 | 1 |
3 | 0.210090 | 0.360839 | 0.233501 | 0.102906 | 0.811321 | 0.811361 | 0.565604 | 0.522863 | 0.776263 | 1.000000 | ... | 0.385928 | 0.241347 | 0.094008 | 0.915472 | 0.814012 | 0.548642 | 0.884880 | 1.000000 | 0.773711 | 1 |
4 | 0.629893 | 0.156578 | 0.630986 | 0.489290 | 0.430351 | 0.347893 | 0.463918 | 0.518390 | 0.378283 | 0.186816 | ... | 0.123934 | 0.506948 | 0.341575 | 0.437364 | 0.172415 | 0.319489 | 0.558419 | 0.157500 | 0.142595 | 1 |
5 rows × 31 columns
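One caveat: the scaler here is fit on the full dataset before the split, so test-set statistics influence the transform. A leakage-free alternative, sketched below with a scikit-learn Pipeline (an addition, not the notebook's original flow), fits the scaler on the training portion only. In practice the effect is small here, since decision trees are insensitive to monotonic feature scaling.
In [ ]:
# Sketch (added): avoid leakage by fitting the scaler inside a Pipeline,
# so MinMaxScaler sees only the training data during fit
from sklearn.pipeline import Pipeline
X_raw = data_cleaned.drop(columns=["diagnosis"])
y_raw = data_cleaned["diagnosis"]
X_tr, X_te, y_tr, y_te = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=42, stratify=y_raw)
pipe = Pipeline([("scale", MinMaxScaler()),
                 ("tree", DecisionTreeClassifier(random_state=42))])
pipe.fit(X_tr, y_tr)
print("Pipeline test accuracy:", round(pipe.score(X_te, y_te), 3))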
In [ ]:
# Step 6: Splitting Data
X = processed_data.drop(columns=["diagnosis"])
y = processed_data['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Training Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)
Training Data Shape: (455, 30)
Testing Data Shape: (114, 30)
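Because stratify=y was passed, the malignant/benign ratio is preserved in both splits; a quick added check (not in the original notebook) confirms it:
In [ ]:
# Added check: class proportions are preserved by the stratified split
print(y_train.value_counts(normalize=True).round(3))
print(y_test.value_counts(normalize=True).round(3))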
In [ ]:
# Step 7: Model Training
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(random_state=42)
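A decision tree grown with default settings expands until its leaves are pure, which risks overfitting. The sketch below (an addition; the parameter grid is an assumed example, not from the original notebook) tunes depth and leaf size with cross-validated grid search:
In [ ]:
# Sketch (added): tune tree complexity with 5-fold grid search;
# the parameter grid is an assumed example, not from the original notebook
from sklearn.model_selection import GridSearchCV
param_grid = {"max_depth": [3, 4, 5, None],
              "min_samples_leaf": [1, 5, 10]}
search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                      param_grid, cv=5, scoring="f1")
search.fit(X_train, y_train)
print("Best parameters:", search.best_params_)
print(f"Best CV F1 score: {search.best_score_:.3f}")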
In [ ]:
# Step 8: Model Evaluation
y_pred = model.predict(X_test)
# Calculate Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Display Metrics
print("\nModel Performance Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
Model Performance Metrics:
Accuracy: 0.93
Precision: 0.90
Recall: 0.90
F1 Score: 0.90
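A single train/test split yields one point estimate per metric. As a supplementary step (not in the original notebook), a per-class report and a cross-validated score give a fuller picture:
In [ ]:
# Added evaluation: per-class breakdown plus 5-fold cross-validation
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
print(classification_report(y_test, y_pred, target_names=["Benign", "Malignant"]))
cv_scores = cross_val_score(DecisionTreeClassifier(random_state=42), X, y, cv=5)
print(f"5-fold CV accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")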
In [ ]:
# Step 9: Visualization
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Benign", "Malignant"], yticklabels=["Benign", "Malignant"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
In [ ]:
from sklearn.tree import plot_tree
# Visualize the Decision Tree
plt.figure(figsize=(30, 15))
plot_tree(
model,
feature_names=X.columns,
class_names=["Benign", "Malignant"],
filled=True,
rounded=True,
fontsize=10
)
plt.title("Decision Tree Visualization")
plt.show()
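The full tree is large; the model's feature_importances_ attribute gives a more compact summary. The sketch below (an added visualization, not in the original notebook) plots the ten largest importances:
In [ ]:
# Added sketch: bar plot of the ten largest feature importances
importances = pd.Series(model.feature_importances_, index=X.columns).nlargest(10)
importances.sort_values().plot(kind="barh", figsize=(8, 5))
plt.title("Top 10 Feature Importances")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()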