In [ ]:
# 1. Importing Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
In [ ]:
# 2. Load the breast cancer dataset bundled with scikit-learn.
#    The loader's defaults are spelled out: a Bunch is returned (not an
#    (X, y) tuple) and the features are NumPy arrays (not a DataFrame).
data = load_breast_cancer(return_X_y=False, as_frame=False)
In [ ]:
# 3. Show the name of every feature column in the dataset
feature_names = data["feature_names"]
print("Feature Names:", feature_names, sep="\n")
Feature Names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area' 'mean smoothness' 'mean compactness' 'mean concavity' 'mean concave points' 'mean symmetry' 'mean fractal dimension' 'radius error' 'texture error' 'perimeter error' 'area error' 'smoothness error' 'compactness error' 'concavity error' 'concave points error' 'symmetry error' 'fractal dimension error' 'worst radius' 'worst texture' 'worst perimeter' 'worst area' 'worst smoothness' 'worst compactness' 'worst concavity' 'worst concave points' 'worst symmetry' 'worst fractal dimension']
In [ ]:
# 4. Show every key exposed by the dataset container
key_values = data.keys()
print(f"Dataset Keys:\n{key_values}")
Dataset Keys: dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
In [ ]:
# 5. Build a DataFrame from the feature matrix (one column per feature name)
#    and display its first 10 records
df = pd.DataFrame(data=data["data"], columns=feature_names)
print("First 10 Records:")
df.head(n=10)
First 10 Records:
Out[ ]:
mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | ... | worst radius | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | ... | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | ... | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | ... | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | ... | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | ... | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
5 | 12.45 | 15.70 | 82.57 | 477.1 | 0.12780 | 0.17000 | 0.15780 | 0.08089 | 0.2087 | 0.07613 | ... | 15.47 | 23.75 | 103.40 | 741.6 | 0.1791 | 0.5249 | 0.5355 | 0.1741 | 0.3985 | 0.12440 |
6 | 18.25 | 19.98 | 119.60 | 1040.0 | 0.09463 | 0.10900 | 0.11270 | 0.07400 | 0.1794 | 0.05742 | ... | 22.88 | 27.66 | 153.20 | 1606.0 | 0.1442 | 0.2576 | 0.3784 | 0.1932 | 0.3063 | 0.08368 |
7 | 13.71 | 20.83 | 90.20 | 577.9 | 0.11890 | 0.16450 | 0.09366 | 0.05985 | 0.2196 | 0.07451 | ... | 17.06 | 28.14 | 110.60 | 897.0 | 0.1654 | 0.3682 | 0.2678 | 0.1556 | 0.3196 | 0.11510 |
8 | 13.00 | 21.82 | 87.50 | 519.8 | 0.12730 | 0.19320 | 0.18590 | 0.09353 | 0.2350 | 0.07389 | ... | 15.49 | 30.73 | 106.20 | 739.3 | 0.1703 | 0.5401 | 0.5390 | 0.2060 | 0.4378 | 0.10720 |
9 | 12.46 | 24.04 | 83.97 | 475.9 | 0.11860 | 0.23960 | 0.22730 | 0.08543 | 0.2030 | 0.08243 | ... | 15.09 | 40.68 | 97.65 | 711.4 | 0.1853 | 1.0580 | 1.1050 | 0.2210 | 0.4366 | 0.20750 |
10 rows × 30 columns
In [ ]:
# 6. Standardize the features with StandardScaler:
#    fit the scaler on the DataFrame, then transform that same DataFrame.
scaler = StandardScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)
print("Scaled Data (First 5 Rows):", scaled_data[:5], sep="\n")
Scaled Data (First 5 Rows): [[ 1.09706398e+00 -2.07333501e+00 1.26993369e+00 9.84374905e-01 1.56846633e+00 3.28351467e+00 2.65287398e+00 2.53247522e+00 2.21751501e+00 2.25574689e+00 2.48973393e+00 -5.65265059e-01 2.83303087e+00 2.48757756e+00 -2.14001647e-01 1.31686157e+00 7.24026158e-01 6.60819941e-01 1.14875667e+00 9.07083081e-01 1.88668963e+00 -1.35929347e+00 2.30360062e+00 2.00123749e+00 1.30768627e+00 2.61666502e+00 2.10952635e+00 2.29607613e+00 2.75062224e+00 1.93701461e+00] [ 1.82982061e+00 -3.53632408e-01 1.68595471e+00 1.90870825e+00 -8.26962447e-01 -4.87071673e-01 -2.38458552e-02 5.48144156e-01 1.39236330e-03 -8.68652457e-01 4.99254601e-01 -8.76243603e-01 2.63326966e-01 7.42401948e-01 -6.05350847e-01 -6.92926270e-01 -4.40780058e-01 2.60162067e-01 -8.05450380e-01 -9.94437403e-02 1.80592744e+00 -3.69203222e-01 1.53512599e+00 1.89048899e+00 -3.75611957e-01 -4.30444219e-01 -1.46748968e-01 1.08708430e+00 -2.43889668e-01 2.81189987e-01] [ 1.57988811e+00 4.56186952e-01 1.56650313e+00 1.55888363e+00 9.42210440e-01 1.05292554e+00 1.36347845e+00 2.03723076e+00 9.39684817e-01 -3.98007910e-01 1.22867595e+00 -7.80083377e-01 8.50928301e-01 1.18133606e+00 -2.97005012e-01 8.14973504e-01 2.13076435e-01 1.42482747e+00 2.37035535e-01 2.93559404e-01 1.51187025e+00 -2.39743838e-02 1.34747521e+00 1.45628455e+00 5.27407405e-01 1.08293217e+00 8.54973944e-01 1.95500035e+00 1.15225500e+00 2.01391209e-01] [-7.68909287e-01 2.53732112e-01 -5.92687167e-01 -7.64463792e-01 3.28355348e+00 3.40290899e+00 1.91589718e+00 1.45170736e+00 2.86738293e+00 4.91091929e+00 3.26373441e-01 -1.10409044e-01 2.86593405e-01 -2.88378148e-01 6.89701660e-01 2.74428041e+00 8.19518384e-01 1.11500701e+00 4.73268037e+00 2.04751088e+00 -2.81464464e-01 1.33984094e-01 -2.49939304e-01 -5.50021228e-01 3.39427470e+00 3.89339743e+00 1.98958826e+00 2.17578601e+00 6.04604135e+00 4.93501034e+00] [ 1.75029663e+00 -1.15181643e+00 1.77657315e+00 1.82622928e+00 2.80371830e-01 5.39340452e-01 1.37101143e+00 1.42849277e+00 
-9.56046689e-03 -5.62449981e-01 1.27054278e+00 -7.90243702e-01 1.27318941e+00 1.19035676e+00 1.48306716e+00 -4.85198799e-02 8.28470780e-01 1.14420474e+00 -3.61092272e-01 4.99328134e-01 1.29857524e+00 -1.46677038e+00 1.33853946e+00 1.22072425e+00 2.20556166e-01 -3.13394511e-01 6.13178758e-01 7.29259257e-01 -8.68352984e-01 -3.97099619e-01]]
In [ ]:
# 7. Create the PCA object with 2 Components
# random_state is pinned so the projection is reproducible under
# "Restart Kernel -> Run All": with the default svd_solver='auto', some
# scikit-learn releases pick the randomized SVD solver for an input of this
# size, and that solver draws random numbers. The seed has no effect when a
# deterministic solver is selected.
pca = PCA(n_components=2, random_state=42)
print("PCA Object Created.")
PCA Object Created.
In [ ]:
# 8. Fit the PCA on the standardized data and project it onto the
#    first 2 principal components
pca_data = pca.fit_transform(scaled_data)
print("PCA Data (First 5 Rows):", pca_data[:5], sep="\n")
PCA Data (First 5 Rows): [[ 9.19283683 1.94858307] [ 2.3878018 -3.76817174] [ 5.73389628 -1.0751738 ] [ 7.1229532 10.27558912] [ 3.93530207 -1.94807157]]
In [ ]:
# 9. Print the PCA components: one row per fitted component (2 here),
#    each row holding that component's loading for every input feature
pca_components = pca.components_
print("PCA Components:", pca_components, sep="\n")
PCA Components: [[ 0.21890244 0.10372458 0.22753729 0.22099499 0.14258969 0.23928535 0.25840048 0.26085376 0.13816696 0.06436335 0.20597878 0.01742803 0.21132592 0.20286964 0.01453145 0.17039345 0.15358979 0.1834174 0.04249842 0.10256832 0.22799663 0.10446933 0.23663968 0.22487053 0.12795256 0.21009588 0.22876753 0.25088597 0.12290456 0.13178394] [-0.23385713 -0.05970609 -0.21518136 -0.23107671 0.18611302 0.15189161 0.06016536 -0.0347675 0.19034877 0.36657547 -0.10555215 0.08997968 -0.08945723 -0.15229263 0.20443045 0.2327159 0.19720728 0.13032156 0.183848 0.28009203 -0.21986638 -0.0454673 -0.19987843 -0.21935186 0.17230435 0.14359317 0.09796411 -0.00825724 0.14188335 0.27533947]]
In [ ]:
# 10. Visualize reduced 30 dimensions to just 2 dimensions using Scatter plot
# The target is a two-class label, so a discrete legend carrying the class
# names is drawn instead of a continuous colorbar (a colour gradient between
# the two classes has no meaning).
fig, ax = plt.subplots(figsize=(8, 4))
scatter = ax.scatter(pca_data[:, 0], pca_data[:, 1], c=data.target, cmap='viridis', alpha=0.7)
# legend_elements() returns one handle per distinct value of `c` in ascending
# order; target value i corresponds to data.target_names[i].
handles, _ = scatter.legend_elements()
ax.legend(handles, list(data.target_names), title='Target')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_title('PCA - Breast Cancer Dataset Reduced to 2 Dimensions')
plt.show()