Part 1: Extract Data Using Regular Expression¶

In [ ]:
# 1. Import necessary libraries
import re
In [ ]:
# 2.	Read Iphone_Order.txt
# Read the file content
with open('Iphone_Order.txt' , 'r') as file:
    text_data = file.read()
In [ ]:
# 3. Use the appropriate pattern and function to extract the full name of the second customer whose name starts with "S" and ends with "er".
pattern = r'\b(S\w+ [A-Z]\. [A-Za-z]*er)\b'
name = [match.group() for match in re.finditer(pattern, text_data)]
print("Second customer Name:", name)
Second customer Name: ['Sahar L. Miller']
In [ ]:
# 4. Use the appropriate pattern and function to extract all dollar amounts (e.g., "$1,499.99") from the text.
dollar_pattern = r'\$\s?[\d,]+\d'
dollar_amounts = [match.group() for match in re.finditer(dollar_pattern, text_data)]
print("Dollar amounts found:", dollar_amounts)
Dollar amounts found: ['$1,388', '$399', '$150']
In [ ]:
# 5. Use split() to divide the text into parts at punctuation marks like periods or commas.
text_parts = re.split(r"[.,]", text_data)
print("Text split by punctuation:")
text_parts
Text split by punctuation:
Out[ ]:
['October 26',
 ' 2024 at 3:15 PM: John Smith placed an online order totaling $1',
 '388',
 '88 for a brand-new Iphone',
 ' The item will be shipped to 124 Maple Street',
 ' Suite #440',
 ' Toronto',
 ' ON',
 ' and is expected to arrive by November 8',
 ' 2024',
 ' His order confirmation number is ORD-2090-TSK-9892',
 ' and he used a discount code: SAVE22',
 ' In case of any issues',
 ' he can contact support at support@techshop',
 'com or call at +1-800-666-0199',
 '\n\nAnother customer',
 ' Sahar L',
 ' Miller',
 ' made a purchase for $399',
 '75 on October 27',
 ' 2024',
 ' for an external charger',
 ' Her order ID is ORD-2101-HARD-9902',
 ' She also provided her email',
 ' sahar',
 'miller123@example',
 'net',
 ' and opted for delivery to her office at 322 King’s Road',
 ' Apt 4B',
 ' New York',
 ' NY',
 ' 10011',
 '\n\nAdditionally',
 ' both customers were eligible for free shipping on orders over $150',
 ' As a reminder',
 ' please note that the return window closes 20 days after the purchase date',
 " which means John's deadline is November 25",
 ' 2024',
 ' while Sarah’s is November 24',
 ' 2024',
 '']

Part-2: (Data Visualization)¶

In [ ]:
# 1. Import necessary libraries and read the dataset
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset
data = pd.read_excel('clean_canada_data.xlsx')
In [ ]:
# 2.	Show the first 5 Records with the proper header columns.
data.head()
Out[ ]:
Country Continent Region DevName 1980 1981 1982 1983 1984 1985 ... 2005 2006 2007 2008 2009 2010 2011 2012 2013 Total
0 Afghanistan Asia Southern Asia Developing regions 16 39 39 47 71 340 ... 3436 3009 2652 2111 1746 1758 2203 2635 2004 58639
1 Albania Europe Southern Europe Developed regions 1 0 0 0 0 0 ... 1223 856 702 560 716 561 539 620 603 15699
2 Algeria Africa Northern Africa Developing regions 80 67 71 69 63 44 ... 3626 4807 3623 4005 5393 4752 4325 3774 4331 69439
3 American Samoa Oceania Polynesia Developing regions 0 1 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 6
4 Andorra Europe Southern Europe Developed regions 0 0 0 0 0 0 ... 0 1 1 0 0 0 0 1 1 15

5 rows × 39 columns

In [ ]:
# 3. Use the appropriate techniques for wrangling the data (if required).
missing_values = data.isnull().sum()
missing_values_summary = missing_values[missing_values > 0]
print("Columns with missing values and their counts:\n", missing_values_summary)
years = list(range(1980, 2011))
countries_of_interest = ['Greece', 'Albania', 'Bulgaria']
selected_data = data[data['Country'].isin(countries_of_interest)][['Country'] + years]
print(selected_data)
Columns with missing values and their counts:
 Series([], dtype: int64)
     Country  1980  1981  1982  1983  1984  1985  1986  1987  1988  ...  2001  \
1    Albania     1     0     0     0     0     0     1     2     2  ...  1602   
26  Bulgaria    24    20    12    33    11    24    33    52    43  ...  1283   
69    Greece  1065   953   897   633   580   584   547   765   564  ...   135   

    2002  2003  2004  2005  2006  2007  2008  2009  2010  
1   1021   853  1450  1223   856   702   560   716   561  
26  1517  1507  2022  1738  1419  1172   994   784   556  
69   107   117   120   100    74   110   107   119   101  

[3 rows x 32 columns]
In [ ]:
# 4.	Display the immigration distribution for Greece, Albania, and Bulgaria from 1980 to 2010 using a histogram.
import matplotlib.pyplot as plt
import pandas as pd
countries = ['Greece', 'Albania', 'Bulgaria']
years = list(range(1980, 2011))
selected_data = data[data['Country'].isin(countries)].set_index('Country')[years]
transposed_data = selected_data.T
plt.figure(figsize=(10, 6))
boxplot = plt.boxplot(transposed_data, patch_artist=True, labels=countries)
for box in boxplot['boxes']:
    box.set(facecolor='red')
for whisker in boxplot['whiskers']:
    whisker.set(color='red')
for cap in boxplot['caps']:
    cap.set(color='red')
for median in boxplot['medians']:
    median.set(color='black')
plt.title("Immigration Distribution for Greece, Albania, and Bulgaria (1980–2010)")
plt.ylabel("Number of Immigrants")
plt.xlabel("Country")
plt.show()
No description has been provided for this image
In [ ]:
# 5.	Analyze your answer and check if there is maximum outlier or not in Greece (use calculation)
# Calculate outliers for Greece
greece_data = selected_data.loc['Greece']
q1 = greece_data.quantile(0.25)
q3 = greece_data.quantile(0.75)
iqr = q3 - q1
# Outlier thresholds
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
# Identify outliers
outliers = greece_data[(greece_data < lower_bound) | (greece_data > upper_bound)]
print("Outliers for Greece:", outliers)
Outliers for Greece: Series([], Name: Greece, dtype: int64)