In [ ]:
import pandas as pd
# Load the dataset
weather_data = pd.read_csv('weather_forecast_data.csv')
In [ ]:
# Display the first few rows of the dataset
print("\nFirst 5 rows of the dataset:\n")
weather_data.head()
First 5 rows of the dataset:

Out[ ]:
Temperature Humidity Wind_Speed Cloud_Cover Pressure Rain
0 23.720338 89.592641 7.335604 50.501694 1032.378759 rain
1 27.879734 46.489704 5.952484 4.990053 992.614190 no rain
2 25.069084 83.072843 1.371992 14.855784 1007.231620 no rain
3 23.622080 74.367758 7.050551 67.255282 982.632013 rain
4 20.591370 96.858822 4.643921 47.676444 980.825142 no rain
In [ ]:
# Display dataset information
print("Dataset Information:\n")
weather_data.info()
Dataset Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  2500 non-null   float64
 1   Humidity     2500 non-null   float64
 2   Wind_Speed   2500 non-null   float64
 3   Cloud_Cover  2500 non-null   float64
 4   Pressure     2500 non-null   float64
 5   Rain         2500 non-null   object 
dtypes: float64(5), object(1)
memory usage: 117.3+ KB
In [ ]:
# Display dataset description
print("\nDataset Description:\n")
weather_data.describe(include='all')
Dataset Description:

Out[ ]:
Temperature Humidity Wind_Speed Cloud_Cover Pressure Rain
count 2500.000000 2500.000000 2500.000000 2500.000000 2500.000000 2500
unique NaN NaN NaN NaN NaN 2
top NaN NaN NaN NaN NaN no rain
freq NaN NaN NaN NaN NaN 2186
mean 22.581725 64.347094 9.906255 49.658104 1014.312336 NaN
std 7.326996 19.954739 5.780316 29.123104 20.196433 NaN
min 10.001842 30.005071 0.009819 0.015038 980.014486 NaN
25% 16.359398 47.339815 4.761909 23.900016 996.938630 NaN
50% 22.536448 63.920797 9.908572 49.488284 1013.433035 NaN
75% 28.976476 81.561021 14.948408 75.324140 1031.735067 NaN
max 34.995214 99.997481 19.999132 99.997795 1049.985593 NaN
In [ ]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Encode the target variable
label_encoder = LabelEncoder()
weather_data['Rain'] = label_encoder.fit_transform(weather_data['Rain'])
# Standardize the features
X = weather_data.drop('Rain', axis=1)
y = weather_data['Rain']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Print transformed dataset sample
print("\nEncoded and Scaled Features (First 5 Rows):\n")
pd.DataFrame(X_scaled, columns=X.columns).head()
Encoded and Scaled Features (First 5 Rows):

Out[ ]:
Temperature Humidity Wind_Speed Cloud_Cover Pressure
0 0.155431 1.265393 -0.444814 0.028972 0.894714
1 0.723225 -0.895074 -0.684143 -1.534074 -1.074570
2 0.339547 0.938599 -1.476731 -1.195246 -0.350663
3 0.142018 0.502270 -0.494138 0.604355 -1.568924
4 -0.271701 1.629599 -0.910571 -0.068058 -1.658406
In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
# Split dataset into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, train_size=0.8, random_state=42)
# Train KNN model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
# Make predictions
y_pred = knn.predict(X_test)
# Evaluate the model
accuracy_80 = accuracy_score(y_test, y_pred)
conf_matrix_80 = confusion_matrix(y_test, y_pred)
print(f"\nAccuracy for 80-20 Split: {accuracy_80:.2f}")
print("\nConfusion Matrix for 80-20 Split:\n")
print(conf_matrix_80)
Accuracy for 80-20 Split: 0.97

Confusion Matrix for 80-20 Split:

[[435   8]
 [  7  50]]
In [ ]:
# Split dataset into training (75%) and testing (25%)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, train_size=0.75, random_state=42)
# Train KNN model
knn.fit(X_train, y_train)
# Make predictions
y_pred = knn.predict(X_test)
# Evaluate the model
accuracy_75 = accuracy_score(y_test, y_pred)
conf_matrix_75 = confusion_matrix(y_test, y_pred)
print(f"\nAccuracy for 75-25 Split: {accuracy_75:.2f}")
print("\nConfusion Matrix for 75-25 Split:\n")
print(conf_matrix_75)
Accuracy for 75-25 Split: 0.97

Confusion Matrix for 75-25 Split:

[[547   9]
 [  9  60]]