In [ ]:
import pandas as pd
# Read the weather forecast CSV into a DataFrame
csv_path = 'weather_forecast_data.csv'
weather_data = pd.read_csv(csv_path)
In [ ]:
# Preview the top of the table as a quick sanity check on the load
head_banner = "\nFirst 5 rows of the dataset:\n"
print(head_banner)
weather_data.head(n=5)
First 5 rows of the dataset:
Out[ ]:
Temperature | Humidity | Wind_Speed | Cloud_Cover | Pressure | Rain | |
---|---|---|---|---|---|---|
0 | 23.720338 | 89.592641 | 7.335604 | 50.501694 | 1032.378759 | rain |
1 | 27.879734 | 46.489704 | 5.952484 | 4.990053 | 992.614190 | no rain |
2 | 25.069084 | 83.072843 | 1.371992 | 14.855784 | 1007.231620 | no rain |
3 | 23.622080 | 74.367758 | 7.050551 | 67.255282 | 982.632013 | rain |
4 | 20.591370 | 96.858822 | 4.643921 | 47.676444 | 980.825142 | no rain |
In [ ]:
# Summarize the columns: non-null counts, dtypes and memory usage
info_banner = "Dataset Information:\n"
print(info_banner)
weather_data.info()
Dataset Information: <class 'pandas.core.frame.DataFrame'> RangeIndex: 2500 entries, 0 to 2499 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Temperature 2500 non-null float64 1 Humidity 2500 non-null float64 2 Wind_Speed 2500 non-null float64 3 Cloud_Cover 2500 non-null float64 4 Pressure 2500 non-null float64 5 Rain 2500 non-null object dtypes: float64(5), object(1) memory usage: 117.3+ KB
In [ ]:
# Summary statistics for every column, numeric and categorical alike
describe_banner = "\nDataset Description:\n"
print(describe_banner)
weather_data.describe(include='all')
Dataset Description:
Out[ ]:
Temperature | Humidity | Wind_Speed | Cloud_Cover | Pressure | Rain | |
---|---|---|---|---|---|---|
count | 2500.000000 | 2500.000000 | 2500.000000 | 2500.000000 | 2500.000000 | 2500 |
unique | NaN | NaN | NaN | NaN | NaN | 2 |
top | NaN | NaN | NaN | NaN | NaN | no rain |
freq | NaN | NaN | NaN | NaN | NaN | 2186 |
mean | 22.581725 | 64.347094 | 9.906255 | 49.658104 | 1014.312336 | NaN |
std | 7.326996 | 19.954739 | 5.780316 | 29.123104 | 20.196433 | NaN |
min | 10.001842 | 30.005071 | 0.009819 | 0.015038 | 980.014486 | NaN |
25% | 16.359398 | 47.339815 | 4.761909 | 23.900016 | 996.938630 | NaN |
50% | 22.536448 | 63.920797 | 9.908572 | 49.488284 | 1013.433035 | NaN |
75% | 28.976476 | 81.561021 | 14.948408 | 75.324140 | 1031.735067 | NaN |
max | 34.995214 | 99.997481 | 19.999132 | 99.997795 | 1049.985593 | NaN |
In [ ]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Turn the categorical target into integer class codes.
# NOTE: this overwrites the 'Rain' column of weather_data in place.
label_encoder = LabelEncoder()
rain_codes = label_encoder.fit_transform(weather_data['Rain'])
weather_data['Rain'] = rain_codes
# Separate the target from the predictor columns
y = weather_data['Rain']
X = weather_data.drop(columns='Rain')
# Standardize the predictors: fit the scaler, then apply it to the same data
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
# Show the first rows of the scaled predictors under their original column names
print("\nEncoded and Scaled Features (First 5 Rows):\n")
pd.DataFrame(X_scaled, columns=X.columns).head()
Encoded and Scaled Features (First 5 Rows):
Out[ ]:
Temperature | Humidity | Wind_Speed | Cloud_Cover | Pressure | |
---|---|---|---|---|---|
0 | 0.155431 | 1.265393 | -0.444814 | 0.028972 | 0.894714 |
1 | 0.723225 | -0.895074 | -0.684143 | -1.534074 | -1.074570 |
2 | 0.339547 | 0.938599 | -1.476731 | -1.195246 | -0.350663 |
3 | 0.142018 | 0.502270 | -0.494138 | 0.604355 | -1.568924 |
4 | -0.271701 | 1.629599 | -0.910571 | -0.068058 | -1.658406 |
In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
# Split dataset into training (80%) and testing (20%).
# The split is performed on the unscaled features `X` so that the scaler below
# is fitted on the training rows only. Fitting the scaler on the full dataset
# before splitting lets test-set statistics leak into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)
# Standardize both partitions using statistics learned from the training rows
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)
# Train KNN model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
# Make predictions on the held-out rows
y_pred = knn.predict(X_test)
# Evaluate the model
accuracy_80 = accuracy_score(y_test, y_pred)
conf_matrix_80 = confusion_matrix(y_test, y_pred)
print(f"\nAccuracy for 80-20 Split: {accuracy_80:.2f}")
print("\nConfusion Matrix for 80-20 Split:\n")
print(conf_matrix_80)
Accuracy for 80-20 Split: 0.97 Confusion Matrix for 80-20 Split: [[435 8] [ 7 50]]
In [ ]:
# Split dataset into training (75%) and testing (25%).
# The split is performed on the unscaled features `X` so that the scaler below
# is fitted on the training rows only. Fitting the scaler on the full dataset
# before splitting lets test-set statistics leak into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)
# Standardize both partitions using statistics learned from the training rows
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)
# Train KNN model. A fresh estimator is created here so this cell does not
# depend on, or refit, the model object trained in the 80-20 cell.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
# Make predictions on the held-out rows
y_pred = knn.predict(X_test)
# Evaluate the model
accuracy_75 = accuracy_score(y_test, y_pred)
conf_matrix_75 = confusion_matrix(y_test, y_pred)
print(f"\nAccuracy for 75-25 Split: {accuracy_75:.2f}")
print("\nConfusion Matrix for 75-25 Split:\n")
print(conf_matrix_75)
Accuracy for 75-25 Split: 0.97 Confusion Matrix for 75-25 Split: [[547 9] [ 9 60]]