# 1. Import necessary libraries
import re

# 2. Read Iphone_Order.txt
# Read the whole order log into one string.
# The encoding is stated explicitly: the text contains curly apostrophes
# stored as UTF-8, and decoding with the platform default (e.g. cp1252 on
# Windows) turns each of them into the garbage sequence "â€™".
with open('Iphone_Order.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

# 3. Use the appropriate pattern and function to extract the full name of the second customer whose name starts with "S" and ends with "er".
# Shape matched: a word beginning with "S", one capital initial followed by a
# period, then a run of letters ending in "er".
pattern = r'\b(S\w+ [A-Z]\. [A-Za-z]*er)\b'
# The only capture group spans the entire match, so findall() returns the
# full matched names directly as a list of strings.
name = re.findall(pattern, text_data)
print("Second customer Name:", name)

Second customer Name: ['Sahar L. Miller']

# 4. Use the appropriate pattern and function to extract all dollar amounts (e.g., "$1,499.99") from the text.
# Pattern pieces:
#   \$\s?         dollar sign, optionally followed by one whitespace character
#   \d+           leading digits
#   (?:,\d{3})*   any number of comma-separated thousands groups
#   (?:\.\d+)?    optional decimal part; it needs digits after the dot, so a
#                 sentence-ending period (as in "over $150.") is not captured
# Without the decimal part, "$1,388.88" would be cut short to "$1,388".
dollar_pattern = r'\$\s?\d+(?:,\d{3})*(?:\.\d+)?'
dollar_amounts = [match.group() for match in re.finditer(dollar_pattern, text_data)]
print("Dollar amounts found:", dollar_amounts)

Dollar amounts found: ['$1,388', '$399', '$150']

# 5. Use split() to divide the text into parts at punctuation marks like periods or commas.
# Every period or comma is a cut point; the separators themselves are dropped,
# so decimals and e-mail addresses are cut at their dots as well.
split_pattern = re.compile(r"[.,]")
text_parts = split_pattern.split(text_data)
print("Text split by punctuation:")
# Bare expression: displayed as the cell result when run in a notebook.
text_parts

Text split by punctuation:

['October 26',
 ' 2024 at 3:15 PM: John Smith placed an online order totaling $1',
 '388',
 '88 for a brand-new Iphone',
 ' The item will be shipped to 124 Maple Street',
 ' Suite #440',
 ' Toronto',
 ' ON',
 ' and is expected to arrive by November 8',
 ' 2024',
 ' His order confirmation number is ORD-2090-TSK-9892',
 ' and he used a discount code: SAVE22',
 ' In case of any issues',
 ' he can contact support at support@techshop',
 'com or call at +1-800-666-0199',
 '\n\nAnother customer',
 ' Sahar L',
 ' Miller',
 ' made a purchase for $399',
 '75 on October 27',
 ' 2024',
 ' for an external charger',
 ' Her order ID is ORD-2101-HARD-9902',
 ' She also provided her email',
 ' sahar',
 'miller123@example',
 'net',
 ' and opted for delivery to her office at 322 King’s Road',
 ' Apt 4B',
 ' New York',
 ' NY',
 ' 10011',
 '\n\nAdditionally',
 ' both customers were eligible for free shipping on orders over $150',
 ' As a reminder',
 ' please note that the return window closes 20 days after the purchase date',
 " which means John's deadline is November 25",
 ' 2024',
 ' while Sarah’s is November 24',
 ' 2024',
 '']

# 1. Import necessary libraries and read the dataset
import pandas as pd
import matplotlib.pyplot as plt
# Read the cleaned Canada workbook into a DataFrame
data = pd.read_excel(io='clean_canada_data.xlsx')

# 2. Show the first 5 records with the proper header columns.
# Bare expression: displayed as the cell result when run in a notebook.
data.head(n=5)

# 3. Use the appropriate techniques for wrangling the data (if required).
# Count the missing cells per column and keep only the columns that have any.
missing_values = data.isna().sum()
missing_values_summary = missing_values.loc[missing_values > 0]
print("Columns with missing values and their counts:\n", missing_values_summary)

# Narrow the table to the three countries of interest and the 1980-2010 year
# columns (the year column labels are integers).
countries_of_interest = ['Greece', 'Albania', 'Bulgaria']
years = [*range(1980, 2011)]
is_selected = data['Country'].isin(countries_of_interest)
selected_data = data.loc[is_selected, ['Country'] + years]
print(selected_data)

Columns with missing values and their counts:
 Series([], dtype: int64)
     Country  1980  1981  1982  1983  1984  1985  1986  1987  1988  ...  2001  \
1    Albania     1     0     0     0     0     0     1     2     2  ...  1602   
26  Bulgaria    24    20    12    33    11    24    33    52    43  ...  1283   
69    Greece  1065   953   897   633   580   584   547   765   564  ...   135   

    2002  2003  2004  2005  2006  2007  2008  2009  2010  
1   1021   853  1450  1223   856   702   560   716   561  
26  1517  1507  2022  1738  1419  1172   994   784   556  
69   107   117   120   100    74   110   107   119   101  

[3 rows x 32 columns]

# 4. Display the immigration distribution for Greece, Albania, and Bulgaria from 1980 to 2010 using a histogram.
# NOTE(review): the task text says "histogram", but this cell draws a box plot
# (the outlier question that follows is phrased in box-plot terms) - confirm
# which chart is intended.
import matplotlib.pyplot as plt
import pandas as pd

countries = ['Greece', 'Albania', 'Bulgaria']
years = list(range(1980, 2011))

# One row per country, one column per year. Filtering with isin() keeps the
# rows in the dataset's own order, NOT in the order of `countries`.
selected_data = data[data['Country'].isin(countries)].set_index('Country')[years]

# boxplot() draws one box per column, so transpose to get one column per country.
transposed_data = selected_data.T
box_labels = list(transposed_data.columns)

plt.figure(figsize=(10, 6))
boxplot = plt.boxplot(transposed_data, patch_artist=True)
# Label the ticks from the actual column order so every box carries its own
# country name; passing the hand-written `countries` list would attach the
# names to the wrong boxes. Boxes sit at positions 1..N by default.
plt.xticks(range(1, len(box_labels) + 1), box_labels)

# Red filled boxes with red whiskers/caps and a black median line.
for box in boxplot['boxes']:
    box.set(facecolor='red')
for line in boxplot['whiskers'] + boxplot['caps']:
    line.set(color='red')
for median in boxplot['medians']:
    median.set(color='black')

plt.title("Immigration Distribution for Greece, Albania, and Bulgaria (1980–2010)")
plt.ylabel("Number of Immigrants")
plt.xlabel("Country")
plt.show()

# 5. Analyze your answer and check if there is maximum outlier or not in Greece (use calculation)
# Yearly values for Greece, taken from the country-indexed selection.
greece_data = selected_data.loc['Greece']

# First and third quartiles in a single call, then the interquartile range.
q1, q3 = greece_data.quantile([0.25, 0.75])
iqr = q3 - q1

# Fences at 1.5 * IQR beyond the quartiles; anything outside is an outlier.
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

below_fence = greece_data < lower_bound
above_fence = greece_data > upper_bound
outliers = greece_data[below_fence | above_fence]
print("Outliers for Greece:", outliers)

Outliers for Greece: Series([], Name: Greece, dtype: int64)

	Country	Continent	Region	DevName	1980	1981	1982	1983	1984	1985	...	2005	2006	2007	2008	2009	2010	2011	2012	2013	Total
0	Afghanistan	Asia	Southern Asia	Developing regions	16	39	39	47	71	340	...	3436	3009	2652	2111	1746	1758	2203	2635	2004	58639
1	Albania	Europe	Southern Europe	Developed regions	1	0	0	0	0	0	...	1223	856	702	560	716	561	539	620	603	15699
2	Algeria	Africa	Northern Africa	Developing regions	80	67	71	69	63	44	...	3626	4807	3623	4005	5393	4752	4325	3774	4331	69439
3	American Samoa	Oceania	Polynesia	Developing regions	0	1	0	0	0	0	...	0	1	0	0	0	0	0	0	0	6
4	Andorra	Europe	Southern Europe	Developed regions	0	0	0	0	0	0	...	0	1	1	0	0	0	0	1	1	15

Part 1: Extract Data Using Regular Expressions

Part 2: Data Visualization