Part 1: Extract Data Using Regular Expression¶
In [ ]:
# 1. Import necessary libraries
import re
In [ ]:
# 2. Read Iphone_Order.txt
# Read the file content
with open('Iphone_Order.txt' , 'r') as file:
text_data = file.read()
In [ ]:
# 3. Use the appropriate pattern and function to extract the full name of the second customer whose name starts with "S" and ends with "er".
pattern = r'\b(S\w+ [A-Z]\. [A-Za-z]*er)\b'
name = [match.group() for match in re.finditer(pattern, text_data)]
print("Second customer Name:", name)
Second customer Name: ['Sahar L. Miller']
In [ ]:
# 4. Use the appropriate pattern and function to extract all dollar amounts (e.g., "$1,499.99") from the text.
dollar_pattern = r'\$\s?[\d,]+\d'
dollar_amounts = [match.group() for match in re.finditer(dollar_pattern, text_data)]
print("Dollar amounts found:", dollar_amounts)
Dollar amounts found: ['$1,388', '$399', '$150']
In [ ]:
# 5. Use split() to divide the text into parts at punctuation marks like periods or commas.
text_parts = re.split(r"[.,]", text_data)
print("Text split by punctuation:")
text_parts
Text split by punctuation:
Out[ ]:
['October 26', ' 2024 at 3:15 PM: John Smith placed an online order totaling $1', '388', '88 for a brand-new Iphone', ' The item will be shipped to 124 Maple Street', ' Suite #440', ' Toronto', ' ON', ' and is expected to arrive by November 8', ' 2024', ' His order confirmation number is ORD-2090-TSK-9892', ' and he used a discount code: SAVE22', ' In case of any issues', ' he can contact support at support@techshop', 'com or call at +1-800-666-0199', '\n\nAnother customer', ' Sahar L', ' Miller', ' made a purchase for $399', '75 on October 27', ' 2024', ' for an external charger', ' Her order ID is ORD-2101-HARD-9902', ' She also provided her email', ' sahar', 'miller123@example', 'net', ' and opted for delivery to her office at 322 King’s Road', ' Apt 4B', ' New York', ' NY', ' 10011', '\n\nAdditionally', ' both customers were eligible for free shipping on orders over $150', ' As a reminder', ' please note that the return window closes 20 days after the purchase date', " which means John's deadline is November 25", ' 2024', ' while Sarah’s is November 24', ' 2024', '']
Part-2: (Data Visualization)¶
In [ ]:
# 1. Import necessary libraries and read the dataset
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset
data = pd.read_excel('clean_canada_data.xlsx')
In [ ]:
# 2. Show the first 5 Records with the proper header columns.
data.head()
Out[ ]:
Country | Continent | Region | DevName | 1980 | 1981 | 1982 | 1983 | 1984 | 1985 | ... | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | Asia | Southern Asia | Developing regions | 16 | 39 | 39 | 47 | 71 | 340 | ... | 3436 | 3009 | 2652 | 2111 | 1746 | 1758 | 2203 | 2635 | 2004 | 58639 |
1 | Albania | Europe | Southern Europe | Developed regions | 1 | 0 | 0 | 0 | 0 | 0 | ... | 1223 | 856 | 702 | 560 | 716 | 561 | 539 | 620 | 603 | 15699 |
2 | Algeria | Africa | Northern Africa | Developing regions | 80 | 67 | 71 | 69 | 63 | 44 | ... | 3626 | 4807 | 3623 | 4005 | 5393 | 4752 | 4325 | 3774 | 4331 | 69439 |
3 | American Samoa | Oceania | Polynesia | Developing regions | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
4 | Andorra | Europe | Southern Europe | Developed regions | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 15 |
5 rows × 39 columns
In [ ]:
# 3. Use the appropriate techniques for wrangling the data (if required).
missing_values = data.isnull().sum()
missing_values_summary = missing_values[missing_values > 0]
print("Columns with missing values and their counts:\n", missing_values_summary)
years = list(range(1980, 2011))
countries_of_interest = ['Greece', 'Albania', 'Bulgaria']
selected_data = data[data['Country'].isin(countries_of_interest)][['Country'] + years]
print(selected_data)
Columns with missing values and their counts: Series([], dtype: int64) Country 1980 1981 1982 1983 1984 1985 1986 1987 1988 ... 2001 \ 1 Albania 1 0 0 0 0 0 1 2 2 ... 1602 26 Bulgaria 24 20 12 33 11 24 33 52 43 ... 1283 69 Greece 1065 953 897 633 580 584 547 765 564 ... 135 2002 2003 2004 2005 2006 2007 2008 2009 2010 1 1021 853 1450 1223 856 702 560 716 561 26 1517 1507 2022 1738 1419 1172 994 784 556 69 107 117 120 100 74 110 107 119 101 [3 rows x 32 columns]
In [ ]:
# 4. Display the immigration distribution for Greece, Albania, and Bulgaria from 1980 to 2010 using a histogram.
import matplotlib.pyplot as plt
import pandas as pd
countries = ['Greece', 'Albania', 'Bulgaria']
years = list(range(1980, 2011))
selected_data = data[data['Country'].isin(countries)].set_index('Country')[years]
transposed_data = selected_data.T
plt.figure(figsize=(10, 6))
boxplot = plt.boxplot(transposed_data, patch_artist=True, labels=countries)
for box in boxplot['boxes']:
box.set(facecolor='red')
for whisker in boxplot['whiskers']:
whisker.set(color='red')
for cap in boxplot['caps']:
cap.set(color='red')
for median in boxplot['medians']:
median.set(color='black')
plt.title("Immigration Distribution for Greece, Albania, and Bulgaria (1980–2010)")
plt.ylabel("Number of Immigrants")
plt.xlabel("Country")
plt.show()
In [ ]:
# 5. Analyze your answer and check if there is maximum outlier or not in Greece (use calculation)
# Calculate outliers for Greece
greece_data = selected_data.loc['Greece']
q1 = greece_data.quantile(0.25)
q3 = greece_data.quantile(0.75)
iqr = q3 - q1
# Outlier thresholds
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
# Identify outliers
outliers = greece_data[(greece_data < lower_bound) | (greece_data > upper_bound)]
print("Outliers for Greece:", outliers)
Outliers for Greece: Series([], Name: Greece, dtype: int64)