Step 1: Loading the Dataset¶
I start by importing the necessary libraries and loading the dataset into a pandas DataFrame.
In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import io
# Load the dataset
csv_data = """Date_Weight_g,Date_Color,Blemish_Percentage,Humidity_Percentage,Grade
12.5,Light_Brown,2.1,15.2,A
14.8,Dark_Brown,0.5,18.9,A
11.2,Medium_Brown,5.8,22.1,B
13.9,Light_Brown,1.2,16.5,A
10.5,Dark_Brown,8.3,25.7,C
12.1,Medium_Brown,3.5,20.3,B
15.3,Light_Brown,NaN,14.8,A
11.8,Dark_Brown,6.9,23.4,C
13.2,Medium_Brown,2.8,17.6,B
10.9,Light_Brown,9.1,26.8,C
14.5,Dark_Brown,0.9,NaN,A
12.7,Medium_Brown,4.2,21.5,B
11.5,Light_Brown,NaN,NaN,C
13.6,Dark_Brown,1.6,19.2,A
NaN,Medium_Brown,7.5,24.6,NaN"""
try:
df = pd.read_csv(io.StringIO(csv_data), na_values=['NaN'])
print("DataFrame Head:")
print(df.head())
print("\nDataFrame Info:")
print(df.info())
except Exception as e:
print(f"Error loading data: {e}")
DataFrame Head:
   Date_Weight_g    Date_Color  Blemish_Percentage  Humidity_Percentage Grade
0           12.5   Light_Brown                 2.1                 15.2     A
1           14.8    Dark_Brown                 0.5                 18.9     A
2           11.2  Medium_Brown                 5.8                 22.1     B
3           13.9   Light_Brown                 1.2                 16.5     A
4           10.5    Dark_Brown                 8.3                 25.7     C

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Date_Weight_g        14 non-null     float64
 1   Date_Color           15 non-null     object
 2   Blemish_Percentage   13 non-null     float64
 3   Humidity_Percentage  13 non-null     float64
 4   Grade                14 non-null     object
dtypes: float64(3), object(2)
memory usage: 732.0+ bytes
None
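In a real workflow the data would come from a file rather than an inline string, and the same call applies. A minimal sketch, assuming a hypothetical dates.csv with the same columns:

# Hypothetical file name; pandas already treats the literal string 'NaN'
# as missing by default, but passing na_values documents the intent.
df_file = pd.read_csv('dates.csv', na_values=['NaN'])
print(df_file.shape)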
Step 2: Handling Missing Values¶
I will:
- Drop rows where Grade is missing (since we cannot predict without a target).
- Replace missing numerical values with the mean of their respective columns.
In [17]:
# Step 2: Handling Missing Values
print("Before Handling Missing Values")
print(df.isna().sum())
# Drop rows with missing Grade (copy to avoid a SettingWithCopyWarning on the imputation below)
df = df.dropna(subset=['Grade']).copy()
# Impute missing numerical values with mean
num_cols = ['Date_Weight_g', 'Blemish_Percentage', 'Humidity_Percentage']
imputer = SimpleImputer(strategy='mean')
df[num_cols] = imputer.fit_transform(df[num_cols])
# Check if missing values are handled
print("\nAfter Handling Missing Values")
print(df.isna().sum())
Before Handling Missing Values
Date_Weight_g          1
Date_Color             0
Blemish_Percentage     2
Humidity_Percentage    2
Grade                  1
dtype: int64

After Handling Missing Values
Date_Weight_g          0
Date_Color             0
Blemish_Percentage     0
Humidity_Percentage    0
Grade                  0
dtype: int64
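For this small demo the imputer is fit on the full dataset. In a larger project, fitting it on the training split alone avoids leaking test-set statistics into the imputation means; a minimal sketch of that pattern, where X_train_raw and X_test_raw stand for hypothetical unscaled train/test feature frames:

# Fit the imputer on training rows only, then apply the learned means
# to both splits so the test set never influences the imputation.
leak_free_imputer = SimpleImputer(strategy='mean')
X_train_imp = leak_free_imputer.fit_transform(X_train_raw)
X_test_imp = leak_free_imputer.transform(X_test_raw)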
Step 3: Feature Encoding¶
I will:
- Encode Date_Color (categorical) using LabelEncoder.
- Encode the target variable (Grade) numerically.
In [18]:
# Step 3: Feature Encoding
# Encode Date_Color
le_color = LabelEncoder()
df['Date_Color_Encoded'] = le_color.fit_transform(df['Date_Color'])
# Encode Grade (target variable)
le_grade = LabelEncoder()
df['Grade_Encoded'] = le_grade.fit_transform(df['Grade'])
# Display encoded data
print("Encoded DataFrame")
print(df[['Date_Color', 'Date_Color_Encoded', 'Grade', 'Grade_Encoded']].head())
Encoded DataFrame
     Date_Color  Date_Color_Encoded Grade  Grade_Encoded
0   Light_Brown                   1     A              0
1    Dark_Brown                   0     A              0
2  Medium_Brown                   2     B              1
3   Light_Brown                   1     A              0
4    Dark_Brown                   0     C              2
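One caveat: LabelEncoder assigns an arbitrary integer order (Dark_Brown=0, Light_Brown=1, Medium_Brown=2) that KNN's distance metric will treat as meaningful. Since the colors are nominal, one-hot encoding is a common alternative; a minimal sketch using pandas:

# One-hot encode Date_Color so no artificial ordering enters the distances
color_dummies = pd.get_dummies(df['Date_Color'], prefix='Color')
df_onehot = pd.concat([df, color_dummies], axis=1)
print(df_onehot.filter(like='Color_').head())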
Step 4: Feature Scaling¶
I will scale the features using StandardScaler so that each one contributes equally to KNN's distance calculations.
In [19]:
# Step 4: Feature Scaling
# Select features for scaling
features = ['Date_Weight_g', 'Blemish_Percentage', 'Humidity_Percentage', 'Date_Color_Encoded']
X = df[features]
y = df['Grade_Encoded']
# Apply StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Display scaled features
print("Scaled Features (First 5 Rows)")
print(X_scaled[:5])
Scaled Features (First 5 Rows)
[[-0.17119674 -0.6875346  -1.43956313  0.08944272]
 [ 1.40381326 -1.29586015 -0.36713691 -1.16275535]
 [-1.06141978  0.71921823  0.56036686  1.34164079]
 [ 0.787505   -1.02971773 -1.06276473  0.08944272]
 [-1.54077065  1.6697269   1.60380859 -1.16275535]]
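Note that the scaler is fit on the full feature matrix before the train-test split, which lets test-set statistics influence the scaling. Wrapping the scaler and classifier in a Pipeline confines the fit to whatever data fit() receives; a minimal sketch:

from sklearn.pipeline import Pipeline

# The pipeline fits the scaler only on the data passed to fit(),
# so the test split never leaks into the scaling statistics.
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=3)),
])

Fitting this pipeline on the unscaled training features would combine Steps 4 and 6 into a single object.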
Step 5: Train-Test Split¶
I split the data into 70% training and 30% testing.
In [20]:
# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
print("Train-Test Split")
print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")
Train-Test Split
Training samples: 9
Testing samples: 5
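With only 14 rows, a random 70/30 split can easily leave one grade under-represented in either partition. Passing stratify=y preserves the class proportions across both splits; a minimal sketch (it raises an error if any class has fewer than two members):

# Stratified split keeps the A/B/C proportions similar in train and test
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y)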
Step 6: KNN Model Training & Hyperparameter Tuning¶
I train KNN with several values of k and keep the one that yields the highest test accuracy.
In [22]:
# Step 6: KNN Model Training & Hyperparameter Tuning
k_values = [3, 5, 7, 9]
best_accuracy = 0
best_k = 0
for k in k_values:
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"K = {k}, Accuracy = {accuracy:.2f}")
if accuracy > best_accuracy:
best_accuracy = accuracy
best_k = k
print(f"\nBest K: {best_k} with accuracy: {best_accuracy:.2f}")
K = 3, Accuracy = 0.80
K = 5, Accuracy = 0.20
K = 7, Accuracy = 0.20
K = 9, Accuracy = 0.20

Best K: 3 with accuracy: 0.80
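With only five test samples, each prediction moves the accuracy by 0.20, so a single hold-out score is a noisy basis for choosing k. Cross-validation averages over several splits; a minimal sketch with cross_val_score, using 3 folds given the tiny dataset:

from sklearn.model_selection import cross_val_score

# Mean accuracy over 3 folds for each candidate k
for k in k_values:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             X_scaled, y, cv=3)
    print(f"K = {k}, mean CV accuracy = {scores.mean():.2f}")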
Step 7: Final Model Evaluation¶
I evaluate the best KNN model (k=3) using:
- Classification Report (Precision, Recall, F1-Score)
- Confusion Matrix
In [23]:
# Step 7: Final Model Evaluation
final_knn = KNeighborsClassifier(n_neighbors=best_k)
final_knn.fit(X_train, y_train)
y_pred = final_knn.predict(X_test)
print("Classification Report")
print(classification_report(y_test, y_pred, target_names=le_grade.classes_))
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))
Classification Report
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         1
           B       0.67      1.00      0.80         2
           C       1.00      0.50      0.67         2

    accuracy                           0.80         5
   macro avg       0.89      0.83      0.82         5
weighted avg       0.87      0.80      0.79         5

Confusion Matrix
[[1 0 0]
 [0 2 0]
 [0 1 1]]
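Since the grades were label-encoded in Step 3, le_grade.inverse_transform maps the integer predictions back to the original letters; a minimal sketch:

# Map encoded predictions back to the original A/B/C letter grades
pred_grades = le_grade.inverse_transform(y_pred)
actual_grades = le_grade.inverse_transform(y_test)
print(list(zip(actual_grades, pred_grades)))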