In [ ]:
# 1. Getting Data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
# Load the dataset
car_data = pd.read_csv('car_price.csv')
# Display the first few rows of the dataset
print(car_data.head())
   symboling         make fuel doors        style  length  width  height  \
0          3  alfa-romero  gas   two  convertible   168.8   64.1    48.8
1          3  alfa-romero  gas   two  convertible   168.8   64.1    48.8
2          1  alfa-romero  gas   two    hatchback   171.2   65.5    52.4
3          2         audi  gas  four        sedan   176.6   66.2    54.3
4          2         audi  gas  four        sedan   176.6   66.4    54.3

  cylinders  engine  horsepower   rpm  city_mpg  highway_mpg  price
0      four     130         111  5000        21           27  13495
1      four     130         111  5000        21           27  16500
2       six     152         154  5000        19           26  16500
3      four     109         102  5500        24           30  13950
4      five     136         115  5500        18           22  17450
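Before visualizing or encoding anything, it can help to confirm the dataset's size and which columns pandas parsed as numeric versus object (a quick check on the same car_data DataFrame; nothing here is modified).

In [ ]:
# Inspect the row/column count and the dtype pandas assigned to each column
print(car_data.shape)
print(car_data.dtypes)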
In [ ]:
# 2. Visualize Data
# Set the aesthetic style of the plots
sns.set_style("whitegrid")
# Create a figure to hold the subplots
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
# Distribution of price
sns.histplot(car_data['price'], bins=30, kde=True, ax=axes[0])
axes[0].set_title('Distribution of Price')
# Engine size vs. price
sns.scatterplot(x='engine', y='price', data=car_data, ax=axes[1])
axes[1].set_title('Engine Size vs. Price')
# Fuel efficiency vs. price
sns.scatterplot(x='city_mpg', y='price', data=car_data, ax=axes[2])
axes[2].set_title('City MPG vs. Price')
plt.tight_layout()
plt.show()
In [ ]:
# 3. Statistical Analysis
# Check for missing values and basic data statistics
missing_values = car_data.isnull().sum()
# Calculate descriptive statistics for numerical features
data_description = car_data.describe()
missing_values, data_description
Out[ ]:
(symboling      0
 make           0
 fuel           0
 doors          0
 style          0
 length         0
 width          0
 height         0
 cylinders      0
 engine         0
 horsepower     0
 rpm            0
 city_mpg       0
 highway_mpg    0
 price          0
 dtype: int64,
         symboling      length       width      height      engine    city_mpg  \
 count  201.000000  201.000000  201.000000  201.000000  201.000000  201.000000
 mean     0.840796  174.200995   65.889055   53.766667  126.875622   25.179104
 std      1.254802   12.322175    2.101471    2.447822   41.546834    6.423220
 min     -2.000000  141.100000   60.300000   47.800000   61.000000   13.000000
 25%      0.000000  166.800000   64.100000   52.000000   98.000000   19.000000
 50%      1.000000  173.200000   65.500000   54.100000  120.000000   24.000000
 75%      2.000000  183.500000   66.600000   55.500000  141.000000   30.000000
 max      3.000000  208.100000   72.000000   59.800000  326.000000   49.000000

         highway_mpg         price
 count    201.000000    201.000000
 mean      30.686567  13207.129353
 std        6.815150   7947.066342
 min       16.000000   5118.000000
 25%       25.000000   7775.000000
 50%       30.000000  10295.000000
 75%       34.000000  16500.000000
 max       54.000000  45400.000000 )
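Alongside the missing-value check and summary statistics, a correlation matrix over the numeric columns indicates which features track price most closely (a minimal sketch on the same car_data, restricted to numeric columns so the categorical ones are skipped).

In [ ]:
# Correlation between the numeric features and price
numeric_data = car_data.select_dtypes(include='number')
corr = numeric_data.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix (numeric features)')
plt.tight_layout()
plt.show()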
In [ ]:
# 4. Create Model
# Selecting the numerical and categorical columns
numerical_cols = car_data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = car_data.select_dtypes(include=['object']).columns
# Creating a transformer for one-hot encoding of categorical variables
column_transformer = ColumnTransformer(
[('cat', OneHotEncoder(), categorical_cols)],
remainder='passthrough'
)
# Preparing the feature matrix and target vector
X = column_transformer.fit_transform(car_data.drop('price', axis=1))
y = car_data['price']
# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and train the linear regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
# Predicting the test set results
y_pred = linear_model.predict(X_test)
# Calculating the R² score
r2 = r2_score(y_test, y_pred)
print("Training Features Shape:", X_train.shape)
print("Test Features Shape:", X_test.shape)
print("Training Labels Shape:", y_train.shape)
print("Test Labels Shape:", y_test.shape)
Training Features Shape: (160, 128)
Test Features Shape: (41, 128)
Training Labels Shape: (160,)
Test Labels Shape: (41,)
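The 128 features reported above come from one-hot encoding the categorical columns. One way to see how the original columns map onto the encoded ones is to ask the fitted transformer for its output feature names (a sketch; get_feature_names_out on ColumnTransformer assumes scikit-learn 1.0 or newer).

In [ ]:
# Inspect the expanded feature names produced by the one-hot encoder
# (requires scikit-learn >= 1.0 for get_feature_names_out on ColumnTransformer)
feature_names = column_transformer.get_feature_names_out()
print(len(feature_names))   # should match X_train.shape[1]
print(feature_names[:10])   # first few encoded columns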
In [ ]:
# 5. Model Evaluation
# Calculating MAE, MSE, and RMSE
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# Printing evaluation results
evaluation_results = f"""
Model Evaluation Results:
- R² Score: {r2:.3f}
- Mean Absolute Error (MAE): ${mae:.2f}
- Root Mean Squared Error (RMSE): ${rmse:.2f}
"""
print(evaluation_results)
Model Evaluation Results:
- R² Score: 0.943
- Mean Absolute Error (MAE): $1783.63
- Root Mean Squared Error (RMSE): $2629.55
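With only about 200 rows, a single 80/20 split can make the R² score look better or worse depending on which cars land in the test set. A cross-validated score is one way to sanity-check it (a sketch reusing the X and y built in step 4; the 5-fold choice is arbitrary).

In [ ]:
# Cross-validated R² as a sanity check on the single-split score
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')
print("R² per fold:", np.round(cv_scores, 3))
print("Mean R²:", cv_scores.mean().round(3))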
In [ ]:
# 6. Visualize Predictions
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs. Predicted Prices')
plt.show()
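Beyond the actual-vs-predicted scatter, a residual plot shows whether prediction errors stay roughly constant across the price range (a sketch reusing y_test and y_pred from above; not part of the original analysis).

In [ ]:
# Residuals vs. predicted price: a flat, centred band suggests roughly
# constant error; a funnel shape suggests errors grow with price
residuals = y_test - y_pred
plt.figure(figsize=(6, 4))
plt.scatter(y_pred, residuals, alpha=0.5)
plt.axhline(0, color='k', linestyle='--', lw=1)
plt.xlabel('Predicted Price')
plt.ylabel('Residual (Actual - Predicted)')
plt.title('Residuals vs. Predicted Price')
plt.show()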