In [ ]:
# Step 1: Import necessary libraries and Dataset Loading
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
# Read the CSV (path is relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='Breast cancer.csv')
# Preview the first five rows to confirm the file parsed as expected.
data.head(n=5)
Out[ ]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678 NaN

5 rows × 33 columns

In [ ]:
# Step 2: Exploratory Data Analysis (EDA)
# Count the missing entries in each column and print them under a heading.
missing_counts = data.isna().sum()
print("Missing Values:", missing_counts, sep="\n")
# describe() is left as the cell's last expression so the notebook renders
# the summary statistics as a table rather than plain text.
print("Dataset Overview:")
data.describe()
Missing Values:
id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed: 32                569
dtype: int64
Dataset Overview:
Out[ ]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
count 5.690000e+02 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 0.0
mean 3.037183e+07 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 ... 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946 NaN
std 1.250206e+08 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 ... 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061 NaN
min 8.670000e+03 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 ... 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040 NaN
25% 8.692180e+05 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 ... 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460 NaN
50% 9.060240e+05 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 ... 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040 NaN
75% 8.813129e+06 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 ... 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080 NaN
max 9.113205e+08 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 ... 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500 NaN

8 rows × 32 columns

In [ ]:
# Visualize Diagnosis Distribution
# One bar per diagnosis label; hue is tied to the same column so each label
# gets its own colour from the palette.
diagnosis_ax = sns.countplot(data=data, x='diagnosis', hue='diagnosis', palette='coolwarm')
diagnosis_ax.set_title("Diagnosis Distribution")
plt.show()
No description has been provided for this image
In [ ]:
# Step 3: Data Cleaning
# Drop unnecessary columns: "id" is not used as a feature and "Unnamed: 32"
# holds no values in the loaded file. errors='ignore' skips a column that is
# absent instead of raising.
data_cleaned = data.drop(columns=["id", "Unnamed: 32"], errors='ignore')
# Encode the target variable 'diagnosis' (M -> 1, B -> 0)
diagnosis_codes = {'M': 1, 'B': 0}
# Series.map silently converts any label that is missing from the mapping
# (including NaN) into NaN, which would corrupt the target column. Fail
# loudly instead so the problem is caught here rather than at model fit.
unexpected_labels = set(data_cleaned['diagnosis'].unique()) - set(diagnosis_codes)
if unexpected_labels:
    raise ValueError(
        f"Unexpected diagnosis labels: {sorted(unexpected_labels, key=str)}"
    )
data_cleaned['diagnosis'] = data_cleaned['diagnosis'].map(diagnosis_codes)
data_cleaned.head()
Out[ ]:
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 1 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 1 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 1 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 1 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 31 columns

In [ ]:
# Step 4: Feature Engineering
# Display top correlated features with the target
# Build the full pairwise correlation matrix, then keep only the column that
# relates every variable to the encoded diagnosis, largest value first.
corr_matrix = data_cleaned.corr()
correlations = corr_matrix['diagnosis'].sort_values(ascending=False)
print("Top Features Correlated with Diagnosis:")
# The first entry is the target's correlation with itself (1.0); the
# remaining rows are the features.
correlations.iloc[:10]
Top Features Correlated with Diagnosis:
Out[ ]:
diagnosis
diagnosis 1.000000
concave points_worst 0.793566
perimeter_worst 0.782914
concave points_mean 0.776614
radius_worst 0.776454
perimeter_mean 0.742636
area_worst 0.733825
radius_mean 0.730029
area_mean 0.708984
concavity_mean 0.696360

In [ ]:
# Step 5: Data Normalization
# Normalize numerical features
# Separate the predictors from the target so only the predictors are rescaled.
features = data_cleaned.drop(columns=["diagnosis"])
scaler = MinMaxScaler()
# fit_transform returns a bare NumPy array, so the column names are restored
# when wrapping it back into a DataFrame.
# NOTE(review): the min/max used here are computed over every row of the dataset.
scaled_values = scaler.fit_transform(features)
normalized_features = pd.DataFrame(scaled_values, columns=features.columns)
# Combine normalized features with the target column
processed_data = pd.concat([normalized_features, data_cleaned['diagnosis']], axis=1)
processed_data.head()
Out[ ]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst diagnosis
0 0.521037 0.022658 0.545989 0.363733 0.593753 0.792037 0.703140 0.731113 0.686364 0.605518 ... 0.141525 0.668310 0.450698 0.601136 0.619292 0.568610 0.912027 0.598462 0.418864 1
1 0.643144 0.272574 0.615783 0.501591 0.289880 0.181768 0.203608 0.348757 0.379798 0.141323 ... 0.303571 0.539818 0.435214 0.347553 0.154563 0.192971 0.639175 0.233590 0.222878 1
2 0.601496 0.390260 0.595743 0.449417 0.514309 0.431017 0.462512 0.635686 0.509596 0.211247 ... 0.360075 0.508442 0.374508 0.483590 0.385375 0.359744 0.835052 0.403706 0.213433 1
3 0.210090 0.360839 0.233501 0.102906 0.811321 0.811361 0.565604 0.522863 0.776263 1.000000 ... 0.385928 0.241347 0.094008 0.915472 0.814012 0.548642 0.884880 1.000000 0.773711 1
4 0.629893 0.156578 0.630986 0.489290 0.430351 0.347893 0.463918 0.518390 0.378283 0.186816 ... 0.123934 0.506948 0.341575 0.437364 0.172415 0.319489 0.558419 0.157500 0.142595 1

5 rows × 31 columns

In [ ]:
# Step 6: Splitting Data
# X and y keep their original meaning (all rows, scaled features / target)
# for any later cell that reads them, e.g. for the feature names.
X = processed_data.drop(columns=["diagnosis"])
y = processed_data['diagnosis']
# Avoid data leakage: split the *unscaled* features and fit the scaler on the
# training rows only, so no test-set min/max values influence preprocessing.
# The split arguments are unchanged, so the same rows land in each partition
# as when splitting X directly.
raw_features = data_cleaned[X.columns]
raw_train, raw_test, y_train, y_test = train_test_split(
    raw_features, y, test_size=0.2, random_state=42, stratify=y
)
train_scaler = MinMaxScaler().fit(raw_train)
# transform() returns a NumPy array; restore column names and the original
# row index so the frames stay aligned with y_train / y_test.
X_train = pd.DataFrame(
    train_scaler.transform(raw_train), columns=X.columns, index=raw_train.index
)
# Test values may fall slightly outside [0, 1] because the scaler never saw them.
X_test = pd.DataFrame(
    train_scaler.transform(raw_test), columns=X.columns, index=raw_test.index
)
print("Training Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)
Training Data Shape: (455, 30)
Testing Data Shape: (114, 30)
In [ ]:
# Step 7: Model Training
# fit() returns the estimator itself, so construction and training can be
# chained; random_state pins the tree's internal randomness for repeatable runs.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
# Last expression of the cell: shows the fitted estimator's representation.
model
Out[ ]:
DecisionTreeClassifier(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(random_state=42)
In [ ]:
# Step 8: Model Evaluation
# Predict labels for the held-out rows.
y_pred = model.predict(X_test)
# Calculate Metrics
# Each scorer takes (true labels, predicted labels); precision, recall and F1
# use their default positive class, which is label 1.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
# Display Metrics
# A single print with a newline separator emits the heading and one metric per line.
print(
    "\nModel Performance Metrics:",
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    sep="\n",
)
Model Performance Metrics:
Accuracy: 0.93
Precision: 0.90
Recall: 0.90
F1 Score: 0.90
In [ ]:
# Step 9: Visualization
# Confusion Matrix
# Rows are the actual classes and columns the predicted ones; the tick labels
# follow the 0 -> Benign, 1 -> Malignant encoding of the target.
cm = confusion_matrix(y_test, y_pred)
class_labels = ["Benign", "Malignant"]
plt.figure(figsize=(6, 4))
cm_ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
cm_ax.set_title("Confusion Matrix")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
plt.show()
No description has been provided for this image
In [ ]:
from sklearn.tree import plot_tree
# Visualize the Decision Tree
# A large canvas keeps the node boxes of the unpruned tree legible.
plt.figure(figsize=(30, 15))
plot_tree(
    model,
    # Pass a plain list: some scikit-learn releases validate feature_names
    # strictly as a list and reject a pandas Index.
    feature_names=list(X.columns),
    # Order matches the target encoding (0 -> Benign, 1 -> Malignant).
    class_names=["Benign", "Malignant"],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.title("Decision Tree Visualization")
plt.show()
No description has been provided for this image