In [ ]:
# 1: Import Necessary Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
In [ ]:
# 2: Load the Data
# Load the dataset
file_path = 'Language Detection.csv'
dataset = pd.read_csv(file_path)
# Display basic information about the dataset
dataset.info()
dataset.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10337 entries, 0 to 10336 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Text 10337 non-null object 1 Language 10337 non-null object dtypes: object(2) memory usage: 161.6+ KB
Out[Â ]:
Text | Language | |
---|---|---|
0 | Nature, in the broadest sense, is the natural... | English |
1 | "Nature" can refer to the phenomena of the phy... | English |
2 | The study of nature is a large, if not the onl... | English |
3 | Although humans are part of nature, human acti... | English |
4 | [1] The word nature is borrowed from the Old F... | English |
In [ ]:
# 3: Explore and Visualize the Dataset
# Language distribution
language_distribution = dataset['Language'].value_counts()
# Plot language distribution
plt.figure(figsize=(12, 6))
language_distribution.plot(kind='bar', color='skyblue')
plt.title("Language Distribution in the Dataset")
plt.xlabel("Language")
plt.ylabel("Number of Samples")
plt.xticks(rotation=45)
plt.show()
In [ ]:
# 4: Preprocess the Data
# Split the data into features (X) and labels (y)
X = dataset['Text']
y = dataset['Language']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Convert text data to numerical features using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_train_tfidf, X_test_tfidf
Out[Â ]:
(<8269x5000 sparse matrix of type '<class 'numpy.float64'>' with 76498 stored elements in Compressed Sparse Row format>, <2068x5000 sparse matrix of type '<class 'numpy.float64'>' with 17240 stored elements in Compressed Sparse Row format>)
In [ ]:
# 5: Train the Model
# Initialize and train a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
Out[Â ]:
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()
In [ ]:
# 6: Evaluate the Model
# Make predictions on the test set
y_pred = nb_classifier.predict(X_test_tfidf)
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)
# Print accuracy and classification report
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report_output)
Accuracy: 0.9482591876208898 Classification Report: precision recall f1-score support Arabic 1.00 0.89 0.94 107 Danish 1.00 0.92 0.96 86 Dutch 1.00 0.94 0.97 109 English 0.75 0.99 0.85 277 French 0.99 0.98 0.98 203 German 1.00 0.95 0.97 94 Greek 1.00 0.96 0.98 73 Hindi 1.00 0.92 0.96 12 Italian 0.99 0.96 0.97 140 Kannada 1.00 0.93 0.97 74 Malayalam 1.00 0.97 0.99 119 Portugeese 1.00 0.95 0.98 148 Russian 1.00 0.92 0.96 138 Spanish 0.97 0.95 0.96 164 Sweedish 0.96 0.97 0.97 135 Tamil 1.00 0.98 0.99 94 Turkish 1.00 0.78 0.88 95 accuracy 0.95 2068 macro avg 0.98 0.94 0.96 2068 weighted avg 0.96 0.95 0.95 2068
In [ ]:
# 7: Visualize the Model Results
# Calculate true and predicted class distributions
true_class_counts = pd.Series(y_test).value_counts()
pred_class_counts = pd.Series(y_pred).value_counts()
# Create a comparison DataFrame
comparison_df = pd.DataFrame({
'True': true_class_counts,
'Predicted': pred_class_counts
}).fillna(0)
# Plot a grouped bar chart
comparison_df.plot(kind='bar', figsize=(10, 6))
plt.title("True vs Predicted Class Distribution")
plt.xlabel("Language")
plt.ylabel("Number of Samples")
plt.xticks(rotation=45)
plt.legend(title="Class Type")
plt.show()