Zamzam Water Demand Prediction¶

Import required libraries

In [3]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import io

Step 1: Data Loading and Initial Inspection¶

  • Loads the dataset from the provided CSV string
  • Displays the first 5 rows to understand the data structure
  • Shows dataset information including data types
  • Identifies missing values in each column
In [16]:
# Load dataset
csv_data = """Distribution_Point,Temperature_C,Time_of_Day,Pilgrim_Count,Bottles_Needed
Makkah_Gate_1,38,Morning,1200,480
Makkah_Gate_1,42,Afternoon,1500,605
Makkah_Gate_2,35,Morning,900,365
Makkah_Gate_2,40,Afternoon,1300,525
Masjid_al_Haram,37,Evening,1800,NaN
Masjid_al_Haram,39,Night,1000,400
Arafat,45,Day,2000,810
Arafat,43,Night,500,200
Mina,41,Day,1600,NaN
Mina,36,Night,700,285
Madinah_Gate_1,33,Morning,1100,445
Madinah_Gate_1,NaN,Afternoon,1400,560
Madinah_Gate_2,34,Evening,1700,685
Madinah_Gate_2,44,Night,NaN,320
Jeddah_Airport,32,NaN,600,NaN"""

try:
    df = pd.read_csv(io.StringIO(csv_data), na_values=['NaN'])
    print("DataFrame head:")
    print(df.head())
    print("\nDataFrame info:")
    print(df.info())
except Exception as e:
    print(f"Error loading CSV data: {e}")
DataFrame head:
  Distribution_Point  Temperature_C Time_of_Day  Pilgrim_Count  Bottles_Needed
0      Makkah_Gate_1           38.0     Morning         1200.0           480.0
1      Makkah_Gate_1           42.0   Afternoon         1500.0           605.0
2      Makkah_Gate_2           35.0     Morning          900.0           365.0
3      Makkah_Gate_2           40.0   Afternoon         1300.0           525.0
4    Masjid_al_Haram           37.0     Evening         1800.0             NaN

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Distribution_Point  15 non-null     object 
 1   Temperature_C       14 non-null     float64
 2   Time_of_Day         14 non-null     object 
 3   Pilgrim_Count       14 non-null     float64
 4   Bottles_Needed      12 non-null     float64
dtypes: float64(3), object(2)
memory usage: 732.0+ bytes
None

Step 2: Data Preprocessing¶

  • Removes rows where the target variable is missing

  • Uses median imputation for numerical features (temperature and pilgrim count)

  • Uses mode imputation for categorical feature (time of day)

  • Verifies no missing values remai

In [18]:
# Handle missing values
print("\nHandling missing values...")
# Drop rows where target (Bottles_Needed) is missing
df = df.dropna(subset=['Bottles_Needed'])
# Impute missing numerical values with median (robust to outliers)
df['Temperature_C'] = df['Temperature_C'].fillna(df['Temperature_C'].median())
df['Pilgrim_Count'] = df['Pilgrim_Count'].fillna(df['Pilgrim_Count'].median())
# Impute missing categorical values with mode
df['Time_of_Day'] = df['Time_of_Day'].fillna(df['Time_of_Day'].mode()[0])
print("Missing values after handling:")
print(df.isna().sum())
Handling missing values...
Missing values after handling:
Distribution_Point    0
Temperature_C         0
Time_of_Day           0
Pilgrim_Count         0
Bottles_Needed        0
dtype: int64

Step 3: Feature Engineering¶

  • Converts time of day to ordered categorical data
  • Creates numerical codes for distribution points
  • Preserves meaningful relationships in time data
In [19]:
# Convert Time_of_Day to ordered categories
time_order = ['Morning', 'Afternoon', 'Evening', 'Night', 'Day']
df['Time_of_Day'] = pd.Categorical(df['Time_of_Day'], categories=time_order, ordered=True)
# Create location codes
df['Location_Code'] = df['Distribution_Point'].astype('category').cat.codes
print("\nEngineered features:")
print(df[['Time_of_Day', 'Location_Code']].head())
Engineered features:
  Time_of_Day  Location_Code
0     Morning              3
1   Afternoon              3
2     Morning              4
3   Afternoon              4
5       Night              5

Step 4: Feature Scaling and Encoding¶

  • Standardizes numerical features (temperature and pilgrim count)
  • One-hot encodes time of day categories
  • Combines transformations into a single pipeline
  • Outputs the shape of the processed feature matrix
In [20]:
# Define preprocessing steps
numeric_features = ['Temperature_C', 'Pilgrim_Count']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])
categorical_features = ['Time_of_Day']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder())])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
# Apply transformations
X = df[numeric_features + categorical_features]
y = df['Bottles_Needed']
X_processed = preprocessor.fit_transform(X)
print("\nTransformed feature matrix shape:", X_processed.shape)
Transformed feature matrix shape: (12, 7)

Step 5: Model Training¶

  • Splits data into training (80%) and test (20%) sets
  • Trains a linear regression model
In [21]:
# Split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)
# Initialize and train model
model = LinearRegression()
model.fit(X_train, y_train)
Out[21]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()

Step 6: Model Evaluation¶

  • Calculates and displays key metrics: RMSE, MAE, R²
  • Creates a scatter plot comparing actual vs predicted values
In [22]:
# Make predictions
y_pred = model.predict(X_test)
# Calculate evaluation metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("\nModel Performance:")
print(f"RMSE: {rmse:.2f} bottles")
print(f"MAE: {mae:.2f} bottles")
print(f"R²: {r2:.2f}")
# Plot actual vs predicted
plt.scatter(y_test, y_pred)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--')
plt.xlabel('Actual Bottles Needed')
plt.ylabel('Predicted Bottles Needed')
plt.title('Actual vs Predicted Demand')
plt.show()
Model Performance:
RMSE: 40.78 bottles
MAE: 31.57 bottles
R²: 0.77
No description has been provided for this image