In [ ]:
# 1. Getting Data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
# Load the dataset
car_data = pd.read_csv('car_price.csv')
# Display the first few rows of the dataset
print(car_data.head())
   symboling         make fuel doors        style  length  width  height  \
0          3  alfa-romero  gas   two  convertible   168.8   64.1    48.8
1          3  alfa-romero  gas   two  convertible   168.8   64.1    48.8
2          1  alfa-romero  gas   two    hatchback   171.2   65.5    52.4
3          2         audi  gas  four        sedan   176.6   66.2    54.3
4          2         audi  gas  four        sedan   176.6   66.4    54.3

  cylinders  engine  horsepower   rpm  city_mpg  highway_mpg  price
0      four     130         111  5000        21           27  13495
1      four     130         111  5000        21           27  16500
2       six     152         154  5000        19           26  16500
3      four     109         102  5500        24           30  13950
4      five     136         115  5500        18           22  17450
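Before visualizing or encoding anything, it can help to confirm the dataset's size and which columns pandas parsed as numeric versus object (a quick check on the same car_data DataFrame; nothing here is modified).

In [ ]:
# Inspect the row/column count and the dtype pandas assigned to each column
print(car_data.shape)
print(car_data.dtypes)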
In [ ]:
# 2. Visualize Data
# Set the aesthetic style of the plots
sns.set_style("whitegrid")
# Create a figure to hold the subplots
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
# Distribution of price
sns.histplot(car_data['price'], bins=30, kde=True, ax=axes[0])
axes[0].set_title('Distribution of Price')
# Engine size vs. price
sns.scatterplot(x='engine', y='price', data=car_data, ax=axes[1])
axes[1].set_title('Engine Size vs. Price')
# Fuel efficiency vs. price
sns.scatterplot(x='city_mpg', y='price', data=car_data, ax=axes[2])
axes[2].set_title('City MPG vs. Price')
plt.tight_layout()
plt.show()
In [ ]:
# 3. Statistical Analysis
# Check for missing values and basic data statistics
missing_values = car_data.isnull().sum()
# Calculate descriptive statistics for numerical features
data_description = car_data.describe()
missing_values, data_description
Out[ ]:
(symboling      0
 make           0
 fuel           0
 doors          0
 style          0
 length         0
 width          0
 height         0
 cylinders      0
 engine         0
 horsepower     0
 rpm            0
 city_mpg       0
 highway_mpg    0
 price          0
 dtype: int64,
         symboling      length       width      height      engine    city_mpg  \
 count  201.000000  201.000000  201.000000  201.000000  201.000000  201.000000
 mean     0.840796  174.200995   65.889055   53.766667  126.875622   25.179104
 std      1.254802   12.322175    2.101471    2.447822   41.546834    6.423220
 min     -2.000000  141.100000   60.300000   47.800000   61.000000   13.000000
 25%      0.000000  166.800000   64.100000   52.000000   98.000000   19.000000
 50%      1.000000  173.200000   65.500000   54.100000  120.000000   24.000000
 75%      2.000000  183.500000   66.600000   55.500000  141.000000   30.000000
 max      3.000000  208.100000   72.000000   59.800000  326.000000   49.000000

         highway_mpg         price
 count    201.000000    201.000000
 mean      30.686567  13207.129353
 std        6.815150   7947.066342
 min       16.000000   5118.000000
 25%       25.000000   7775.000000
 50%       30.000000  10295.000000
 75%       34.000000  16500.000000
 max       54.000000  45400.000000 )
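Alongside the missing-value check and summary statistics, a correlation matrix over the numeric columns indicates which features track price most closely (a minimal sketch on the same car_data, restricted to numeric columns so the categorical ones are skipped).

In [ ]:
# Correlation between the numeric features and price
numeric_data = car_data.select_dtypes(include='number')
corr = numeric_data.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix (numeric features)')
plt.tight_layout()
plt.show()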
In [ ]:
# 4. Create Model
# Selecting the numerical and categorical columns
numerical_cols = car_data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = car_data.select_dtypes(include=['object']).columns
# Creating a transformer for one-hot encoding of categorical variables
column_transformer = ColumnTransformer(
[('cat', OneHotEncoder(), categorical_cols)],
remainder='passthrough'
)
# Preparing the feature matrix and target vector
X = column_transformer.fit_transform(car_data.drop('price', axis=1))
y = car_data['price']
# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and train the linear regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
# Predicting the test set results
y_pred = linear_model.predict(X_test)
# Calculating the R² score
r2 = r2_score(y_test, y_pred)
print("Training Features Shape:", X_train.shape)
print("Test Features Shape:", X_test.shape)
print("Training Labels Shape:", y_train.shape)
print("Test Labels Shape:", y_test.shape)
Training Features Shape: (160, 128)
Test Features Shape: (41, 128)
Training Labels Shape: (160,)
Test Labels Shape: (41,)
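The 128 features reported above come from one-hot encoding the categorical columns. One way to see how the original columns map onto the encoded ones is to ask the fitted transformer for its output feature names (a sketch; get_feature_names_out on ColumnTransformer assumes scikit-learn 1.0 or newer).

In [ ]:
# Inspect the expanded feature names produced by the one-hot encoder
# (requires scikit-learn >= 1.0 for get_feature_names_out on ColumnTransformer)
feature_names = column_transformer.get_feature_names_out()
print(len(feature_names))   # should match X_train.shape[1]
print(feature_names[:10])   # first few encoded columns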
In [ ]:
# 5. Model Evaluation
# Calculating MAE, MSE, and RMSE
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# Printing evaluation results
evaluation_results = f"""
Model Evaluation Results:
- R² Score: {r2:.3f}
- Mean Absolute Error (MAE): ${mae:.2f}
- Root Mean Squared Error (RMSE): ${rmse:.2f}
"""
print(evaluation_results)
Model Evaluation Results:
- R² Score: 0.943
- Mean Absolute Error (MAE): $1783.63
- Root Mean Squared Error (RMSE): $2629.55
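With only about 200 rows, a single 80/20 split can make the R² score look better or worse depending on which cars land in the test set. A cross-validated score is one way to sanity-check it (a sketch reusing the X and y built in step 4; the 5-fold choice is arbitrary).

In [ ]:
# Cross-validated R² as a sanity check on the single-split score
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')
print("R² per fold:", np.round(cv_scores, 3))
print("Mean R²:", cv_scores.mean().round(3))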
In [ ]:
# 6. Visualize Predictions
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs. Predicted Prices')
plt.show()
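Beyond the actual-vs-predicted scatter, a residual plot shows whether prediction errors stay roughly constant across the price range (a sketch reusing y_test and y_pred from above; not part of the original analysis).

In [ ]:
# Residuals vs. predicted price: a flat, centred band suggests roughly
# constant error; a funnel shape suggests errors grow with price
residuals = y_test - y_pred
plt.figure(figsize=(6, 4))
plt.scatter(y_pred, residuals, alpha=0.5)
plt.axhline(0, color='k', linestyle='--', lw=1)
plt.xlabel('Predicted Price')
plt.ylabel('Residual (Actual - Predicted)')
plt.title('Residuals vs. Predicted Price')
plt.show()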