import pandas as pd
# Load the dataset
# The CSV path is relative to the current working directory; the resulting
# DataFrame (`weather_data`) is the object every later cell operates on.
weather_data = pd.read_csv('weather_forecast_data.csv')

# Display the first few rows of the dataset
# head() with no argument returns the first 5 rows. The bare trailing
# expression is rendered by a notebook/REPL only; run as a plain script it
# produces no output.
print("\nFirst 5 rows of the dataset:\n")
weather_data.head()

First 5 rows of the dataset:

# Display dataset information
# DataFrame.info() writes its summary (index range, per-column non-null
# counts, dtypes, memory usage) directly to stdout and returns None, so it
# needs no print() of its own.
print("Dataset Information:\n")
weather_data.info()

Dataset Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  2500 non-null   float64
 1   Humidity     2500 non-null   float64
 2   Wind_Speed   2500 non-null   float64
 3   Cloud_Cover  2500 non-null   float64
 4   Pressure     2500 non-null   float64
 5   Rain         2500 non-null   object 
dtypes: float64(5), object(1)
memory usage: 117.3+ KB

# Display dataset description
# include='all' summarizes every column rather than only the numeric ones:
# the non-numeric 'Rain' column reports count/unique/top/freq, and statistics
# that do not apply to a given column are shown as NaN.
print("\nDataset Description:\n")
weather_data.describe(include='all')

Dataset Description:

from sklearn.preprocessing import LabelEncoder, StandardScaler
# Replace the textual 'Rain' labels with integer class codes, in place.
# The fitted encoder is kept so the codes can be mapped back to labels later.
label_encoder = LabelEncoder()
weather_data['Rain'] = label_encoder.fit_transform(weather_data['Rain'])
# Separate the target column from the predictor columns
y = weather_data['Rain']
X = weather_data.drop(columns=['Rain'])
# Standardize every predictor to zero mean / unit variance
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
# Preview the scaled features, re-attaching the original column names
print("\nEncoded and Scaled Features (First 5 Rows):\n")
pd.DataFrame(X_scaled, columns=X.columns).head()

Encoded and Scaled Features (First 5 Rows):

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
# Split dataset into training (80%) and testing (20%)
# The raw (unscaled) features are split BEFORE standardization so that the
# held-out rows cannot influence the mean/std used for scaling; scaling the
# full dataset first leaks test-set statistics into training.
# Same train_size and random_state as before, so the row partition is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)
# Fit the scaler on the training rows only, then apply it to both partitions
scaler_80 = StandardScaler()
X_train = scaler_80.fit_transform(X_train_raw)
X_test = scaler_80.transform(X_test_raw)
# Train KNN model (3 nearest neighbours)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
# Make predictions on the held-out rows
y_pred = knn.predict(X_test)
# Evaluate the model
# confusion_matrix rows are true classes and columns are predicted classes,
# in sorted label order.
# NOTE: metrics can differ marginally from a run that scaled before splitting.
accuracy_80 = accuracy_score(y_test, y_pred)
conf_matrix_80 = confusion_matrix(y_test, y_pred)
print(f"\nAccuracy for 80-20 Split: {accuracy_80:.2f}")
print("\nConfusion Matrix for 80-20 Split:\n")
print(conf_matrix_80)

Accuracy for 80-20 Split: 0.97

Confusion Matrix for 80-20 Split:

[[435   8]
 [  7  50]]

# Split dataset into training (75%) and testing (25%)
# The raw (unscaled) features are split BEFORE standardization so that the
# held-out rows cannot influence the mean/std used for scaling; scaling the
# full dataset first leaks test-set statistics into training.
# Same train_size and random_state as before, so the row partition is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=42
)
# Fit the scaler on the training rows only, then apply it to both partitions
scaler_75 = StandardScaler()
X_train = scaler_75.fit_transform(X_train_raw)
X_test = scaler_75.transform(X_test_raw)
# Train KNN model
# Re-uses the existing classifier object; calling fit() again retrains it on
# the new training partition.
knn.fit(X_train, y_train)
# Make predictions on the held-out rows
y_pred = knn.predict(X_test)
# Evaluate the model
# confusion_matrix rows are true classes and columns are predicted classes,
# in sorted label order.
# NOTE: metrics can differ marginally from a run that scaled before splitting.
accuracy_75 = accuracy_score(y_test, y_pred)
conf_matrix_75 = confusion_matrix(y_test, y_pred)
print(f"\nAccuracy for 75-25 Split: {accuracy_75:.2f}")
print("\nConfusion Matrix for 75-25 Split:\n")
print(conf_matrix_75)

Accuracy for 75-25 Split: 0.97

Confusion Matrix for 75-25 Split:

[[547   9]
 [  9  60]]

	Temperature	Humidity	Wind_Speed	Cloud_Cover	Pressure	Rain
0	23.720338	89.592641	7.335604	50.501694	1032.378759	rain
1	27.879734	46.489704	5.952484	4.990053	992.614190	no rain
2	25.069084	83.072843	1.371992	14.855784	1007.231620	no rain
3	23.622080	74.367758	7.050551	67.255282	982.632013	rain
4	20.591370	96.858822	4.643921	47.676444	980.825142	no rain

	Temperature	Humidity	Wind_Speed	Cloud_Cover	Pressure	Rain
count	2500.000000	2500.000000	2500.000000	2500.000000	2500.000000	2500
unique	NaN	NaN	NaN	NaN	NaN	2
top	NaN	NaN	NaN	NaN	NaN	no rain
freq	NaN	NaN	NaN	NaN	NaN	2186
mean	22.581725	64.347094	9.906255	49.658104	1014.312336	NaN
std	7.326996	19.954739	5.780316	29.123104	20.196433	NaN
min	10.001842	30.005071	0.009819	0.015038	980.014486	NaN
25%	16.359398	47.339815	4.761909	23.900016	996.938630	NaN
50%	22.536448	63.920797	9.908572	49.488284	1013.433035	NaN
75%	28.976476	81.561021	14.948408	75.324140	1031.735067	NaN
max	34.995214	99.997481	19.999132	99.997795	1049.985593	NaN

	Temperature	Humidity	Wind_Speed	Cloud_Cover	Pressure
0	0.155431	1.265393	-0.444814	0.028972	0.894714
1	0.723225	-0.895074	-0.684143	-1.534074	-1.074570
2	0.339547	0.938599	-1.476731	-1.195246	-0.350663
3	0.142018	0.502270	-0.494138	0.604355	-1.568924
4	-0.271701	1.629599	-0.910571	-0.068058	-1.658406