In [ ]:
# 1: Import Necessary Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
In [ ]:
# 2: Load the Data
# Load the dataset
file_path = 'Language Detection.csv'
dataset = pd.read_csv(file_path)
# Display basic information about the dataset
dataset.info()
dataset.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10337 entries, 0 to 10336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      10337 non-null  object
 1   Language  10337 non-null  object
dtypes: object(2)
memory usage: 161.6+ KB
Out[ ]:
Text Language
0 Nature, in the broadest sense, is the natural... English
1 "Nature" can refer to the phenomena of the phy... English
2 The study of nature is a large, if not the onl... English
3 Although humans are part of nature, human acti... English
4 [1] The word nature is borrowed from the Old F... English
In [ ]:
# 3: Explore and Visualize the Dataset
# Language distribution
language_distribution = dataset['Language'].value_counts()
# Plot language distribution
plt.figure(figsize=(12, 6))
language_distribution.plot(kind='bar', color='skyblue')
plt.title("Language Distribution in the Dataset")
plt.xlabel("Language")
plt.ylabel("Number of Samples")
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [ ]:
# 4: Preprocess the Data
# Split the data into features (X) and labels (y)
X = dataset['Text']
y = dataset['Language']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Convert text data to numerical features using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_train_tfidf, X_test_tfidf
Out[ ]:
(<8269x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 76498 stored elements in Compressed Sparse Row format>,
 <2068x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 17240 stored elements in Compressed Sparse Row format>)
In [ ]:
# 5: Train the Model
# Initialize and train a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
Out[ ]:
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()
In [ ]:
# 6: Evaluate the Model
# Make predictions on the test set
y_pred = nb_classifier.predict(X_test_tfidf)
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)
# Print accuracy and classification report
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report_output)
Accuracy: 0.9482591876208898
Classification Report:
               precision    recall  f1-score   support

      Arabic       1.00      0.89      0.94       107
      Danish       1.00      0.92      0.96        86
       Dutch       1.00      0.94      0.97       109
     English       0.75      0.99      0.85       277
      French       0.99      0.98      0.98       203
      German       1.00      0.95      0.97        94
       Greek       1.00      0.96      0.98        73
       Hindi       1.00      0.92      0.96        12
     Italian       0.99      0.96      0.97       140
     Kannada       1.00      0.93      0.97        74
   Malayalam       1.00      0.97      0.99       119
  Portugeese       1.00      0.95      0.98       148
     Russian       1.00      0.92      0.96       138
     Spanish       0.97      0.95      0.96       164
    Sweedish       0.96      0.97      0.97       135
       Tamil       1.00      0.98      0.99        94
     Turkish       1.00      0.78      0.88        95

    accuracy                           0.95      2068
   macro avg       0.98      0.94      0.96      2068
weighted avg       0.96      0.95      0.95      2068

In [ ]:
# 7: Visualize the Model Results
# Calculate true and predicted class distributions
true_class_counts = pd.Series(y_test).value_counts()
pred_class_counts = pd.Series(y_pred).value_counts()
# Create a comparison DataFrame
comparison_df = pd.DataFrame({
    'True': true_class_counts,
    'Predicted': pred_class_counts
}).fillna(0)
# Plot a grouped bar chart
comparison_df.plot(kind='bar', figsize=(10, 6))
plt.title("True vs Predicted Class Distribution")
plt.xlabel("Language")
plt.ylabel("Number of Samples")
plt.xticks(rotation=45)
plt.legend(title="Class Type")
plt.show()
No description has been provided for this image