Alternative PMT2 Solutions
These are alternative solutions for PMT2-SP25 and PMT2-FA24. They were developed during testing and should generally work, though later updates to the exams may have broken some of them.
Note: if a solution doesn't work, please make a private Piazza post and let us know. We'll get it removed or updated.
Unfortunately, we will not be releasing alternative solutions for any other exams.
MT2_SP25
Exercise 1
Solution 1
def binary_indicators() -> str:
def helper_str_func(status_term, col):
return f'''
SELECT "{status_term}" as status, (SUM({col} == "Y") * 100.0) / count(*) as percentage
FROM student_main
'''
query = f'''
{helper_str_func("US Citizen", "us_citizen")}
UNION ALL
{helper_str_func("US Resident", "us_resident")}
UNION ALL
{helper_str_func("State Resident", "state_resident")}
UNION ALL
{helper_str_func("Pell Recipient", "pell_recipient")}
UNION ALL
{helper_str_func("US Veteran", "us_veteran")}
'''
return query
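To sanity-check any of the SQL-returning solutions in this document, you can run the returned query against the exam database with pandas. A minimal sketch, assuming a local SQLite copy of the data (the filename is a placeholder, not part of the exam):
import sqlite3
import pandas as pd

# Placeholder filename; point this at whatever database file the exam provides.
conn = sqlite3.connect('pmt2_sp25.db')
print(pd.read_sql(binary_indicators(), conn))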
Solution 2
def binary_indicators() -> str:
return '''
SELECT 'US Citizen' as status, (SUM(CASE WHEN us_citizen='Y' THEN 1 ELSE 0 END) * 100.0) / COUNT(*) as percentage
FROM student_main
UNION ALL
SELECT 'US Resident' as status, (SUM(CASE WHEN us_resident='Y' THEN 1 ELSE 0 END) * 100.0) / COUNT(*) as percentage
FROM student_main
UNION ALL
SELECT 'State Resident' as status, (SUM(CASE WHEN state_resident='Y' THEN 1 ELSE 0 END) * 100.0) / COUNT(*) as percentage
FROM student_main
UNION ALL
SELECT 'Pell Recipient' as status, (SUM(CASE WHEN pell_recipient='Y' THEN 1 ELSE 0 END) * 100.0) / COUNT(*) as percentage
FROM student_main
UNION ALL
SELECT 'US Veteran' as status, (SUM(CASE WHEN us_veteran='Y' THEN 1 ELSE 0 END) * 100.0) / COUNT(*) as percentage
FROM student_main
'''
Solution 3
def binary_indicators() -> str:
varz = ["us_citizen","us_resident","state_resident","pell_recipient","us_veteran"]
bins = " union all ".join(
[
f"""
select
'{var.replace("_", " ").title().replace("Us","US")}' as status,
sum(case when trim({var}) = 'Y' then 1 else 0 end) * 100.0/ count(*) as percentage
from
student_main
group by
1
"""
for var in varz
]
)
query = f"""
{bins}
"""
return query
Exercise 2
Solution 1
def scholarship_payments() -> str:
query = '''
SELECT
scholarship_code,
SUM(scholarship_payment) AS total_payment
FROM student_scholarship
GROUP BY scholarship_code
'''
return query
Exercise 3
Solution 1
def avg_gpa_by_scholarship() -> str:
query = '''
SELECT
COALESCE(ss.scholarship_code, "NONE") AS scholarship_code,
ROUND(AVG(se.semester_gpa), 2) AS avg_gpa
FROM student_key sk
INNER JOIN student_enrollment se ON sk.student_id = se.student_id
LEFT JOIN student_scholarship ss ON sk.finance_id = ss.finance_id
GROUP BY ss.scholarship_code
ORDER BY avg_gpa DESC, scholarship_code
'''
return query
Solution 2
def avg_gpa_by_scholarship() -> str:
query = '''
WITH enrolled_students AS (
SELECT e.student_id, e.semester_gpa, key.finance_id
FROM student_enrollment AS e
JOIN student_key AS key ON e.student_id = key.student_id
)
SELECT
COALESCE(s.scholarship_code, 'NONE') AS scholarship_code,
ROUND(AVG(e.semester_gpa), 2) AS avg_gpa
FROM enrolled_students AS e
LEFT JOIN student_scholarship AS s ON e.finance_id = s.finance_id
GROUP BY s.scholarship_code
ORDER BY avg_gpa DESC, scholarship_code ASC
'''
return query
Solution 3
def avg_gpa_by_scholarship() -> str:
return """
WITH ssc AS (
SELECT
k.student_id,
k.finance_id,
IFNULL(s.scholarship_code, 'NONE') AS ss_code
FROM
student_key k
LEFT JOIN student_scholarship s
ON k.finance_id = s.finance_id
)
SELECT
ssc.ss_code AS scholarship_code,
ROUND(AVG(e.semester_gpa), 2) AS avg_gpa
FROM
student_enrollment e
JOIN ssc
ON e.student_id = ssc.student_id
GROUP BY
ssc.ss_code
ORDER BY
avg_gpa DESC,
scholarship_code ASC
"""
Exercise 4
Solution 1
def active_scholarships() -> str:
query = '''
WITH most_recent AS (
SELECT major_code, MAX(activation_date) as active_date
FROM major_crosswalk
GROUP BY major_code
)
SELECT
sc.scholarship_description,
mc.major_description,
sr.min_gpa,
sr.gender,
sr.pell_recipient,
sr.us_veteran,
sr.us_citizen,
sr.us_resident,
sr.state_resident,
sr.amount
FROM scholarship_rules sr
LEFT JOIN major_crosswalk mc ON sr.major_code = mc.major_code
LEFT JOIN scholarship_crosswalk sc ON sr.scholarship_crosswalk_id = sc.id
LEFT JOIN most_recent mr ON mr.major_code = mc.major_code
WHERE sr.scholarship_active = 'Y'
AND COALESCE(mr.active_date, 1) = COALESCE(mc.activation_date, 1)
AND sc.activation_date = sr.activation_date
'''
return query
Solution 2
def active_scholarships() -> str:
return '''
WITH crosswalk AS(
SELECT major_code, major_description, MAX(activation_date) as activation_date
FROM major_crosswalk
GROUP BY major_code
)
SELECT scholarship_crosswalk.scholarship_description as scholarship_description,
crosswalk.major_description as major_description, scholarship_rules.min_gpa as min_gpa,
scholarship_rules.gender as gender, scholarship_rules.pell_recipient as pell_recipient,
scholarship_rules.us_veteran as us_veteran,
scholarship_rules.us_citizen as us_citizen, scholarship_rules.us_resident as us_resident,
scholarship_rules.state_resident as state_resident, SUM(scholarship_rules.amount) as amount
FROM scholarship_crosswalk
LEFT JOIN scholarship_rules ON scholarship_rules.scholarship_crosswalk_id = scholarship_crosswalk.id
LEFT JOIN crosswalk ON crosswalk.major_code = scholarship_rules.major_code
WHERE scholarship_rules.scholarship_active = 'Y'
GROUP BY scholarship_crosswalk.scholarship_description
'''
Solution 3
def active_scholarships() -> str:
return '''
SELECT b.scholarship_description,c.major_description,a.min_gpa,a.gender,a.pell_recipient,a.us_veteran,a.us_citizen,a.us_resident,a.state_resident,a.amount
FROM scholarship_rules a
JOIN scholarship_crosswalk b on a.scholarship_crosswalk_id=b.id
LEFT JOIN (SELECT d.* FROM major_crosswalk d
JOIN (select major_code,max(activation_date) m from major_crosswalk group by 1)c on d.major_code=c.major_code AND d.activation_date=c.m
)c on a.major_code=c.major_code
WHERE a.scholarship_active='Y'
'''
Solution 4
def active_scholarships() -> str:
query = '''
WITH recent_majors AS (
SELECT mc1.major_code, mc1.major_description
FROM major_crosswalk AS mc1
WHERE mc1.activation_date = (
SELECT MAX(mc2.activation_date)
FROM major_crosswalk AS mc2
WHERE mc2.major_code = mc1.major_code
)
)
SELECT
sc.scholarship_description,
rm.major_description,
sr.min_gpa,
sr.gender,
sr.pell_recipient,
sr.us_veteran,
sr.us_citizen,
sr.us_resident,
sr.state_resident,
sr.amount
FROM scholarship_rules AS sr
LEFT JOIN scholarship_crosswalk AS sc ON sr.scholarship_crosswalk_id = sc.id
LEFT JOIN recent_majors AS rm ON sr.major_code = rm.major_code
WHERE sr.scholarship_active = 'Y'
'''
return query
Solution 5
def active_scholarships() -> str:
return """
WITH RecentMajors AS (
SELECT mc.major_code, mc.major_description
FROM major_crosswalk mc
INNER JOIN (
SELECT major_code, MAX(activation_date) AS max_date
FROM major_crosswalk
GROUP BY major_code
) latest_mc
ON mc.major_code = latest_mc.major_code AND mc.activation_date = latest_mc.max_date
)
SELECT
sc.scholarship_description,
rm.major_description,
sr.min_gpa,
sr.gender,
sr.pell_recipient,
sr.us_veteran,
sr.us_citizen,
sr.us_resident,
sr.state_resident,
sr.amount
FROM scholarship_rules sr
JOIN scholarship_crosswalk sc
ON sr.scholarship_code = sc.scholarship_code
AND sr.activation_date = sc.activation_date
LEFT JOIN RecentMajors rm ON sr.major_code = rm.major_code
WHERE sr.scholarship_active = 'Y';
"""
Exercise 5
Solution 1
def grad_retention_inds() -> str:
    # It seems like this solution should pass, but it does not. Note that the passing
    # solutions below flag retention only when a student is enrolled exactly one year
    # after their first term (term = first_term + 100), not at any later term.
query = '''
WITH term_data AS (
SELECT
se.student_id,
MIN(CAST(term AS INTEGER)) AS first_term,
MAX(CAST(term AS INTEGER)) AS last_term
FROM student_enrollment se
LEFT JOIN graduation gr ON se.student_id = gr.student_id
GROUP BY se.student_id
)
SELECT
se.student_id,
CASE WHEN td.last_term - td.first_term >= 100 THEN 1 ELSE 0 END retention_ind,
CASE WHEN gr.grad_level = 'B' AND gr.grad_status = 'A' THEN 1 ELSE 0 END graduation_ind
FROM student_enrollment se
LEFT JOIN graduation gr ON se.student_id = gr.student_id
INNER JOIN term_data td ON td.student_id = se.student_id AND td.last_term = CAST(se.term AS INTEGER)
'''
    # This query shows an example of a student whose final enrollment term is much later
    # than one year after their first term, yet the true output says they were not
    # retained. Are we sure this is correct?
query = '''
SELECT
se.student_id,
MIN(CAST(term AS INTEGER)) AS first_term,
MAX(CAST(term AS INTEGER)) AS last_term
FROM student_enrollment se
LEFT JOIN graduation gr ON se.student_id = gr.student_id
WHERE se.student_id = 346224219
GROUP BY se.student_id
'''
return query
Solution 2
def grad_retention_inds() -> str:
return '''
WITH enrollment AS(
SELECT student_id, MIN(term) as term
FROM student_enrollment
GROUP BY student_id
)
SELECT enrollment.student_id,
(CASE WHEN CAST(graduation.last_enroll_term AS INTEGER) - CAST(enrollment.term AS INTEGER) > 100
THEN 1 ELSE 0 END) as retention_ind,
(CASE WHEN graduation.grad_level='B' AND graduation.grad_status='A' THEN 1 ELSE 0 END) as graduation_ind
FROM enrollment
LEFT JOIN graduation on enrollment.student_id = graduation.student_id
'''
Solution 3
def grad_retention_inds() -> str:
return '''
SELECT a.student_id
,max(case when cast(a.term as INTEGER)=cast(b.m AS INTEGER)+100 then 1 else 0 end) retention_ind
,max(case when c.grad_level='B' and c.grad_status='A' then 1 else 0 end) graduation_ind
FROM student_enrollment a
JOIN (select student_id,min(term)m FROM student_enrollment GROUP BY 1) b on a.student_id=b.student_id
LEFT JOIN graduation c on a.student_id=c.student_id
group by 1
'''
Solution 4
def grad_retention_inds() -> str:
query = '''
WITH first_term AS (
SELECT student_id, MIN(CAST(term AS INTEGER)) AS first_enrollment_term
FROM student_enrollment
GROUP BY student_id
),
retention_check AS (
SELECT
e.student_id,
MAX(CASE
WHEN CAST(e.term AS INTEGER) = f.first_enrollment_term + 100 THEN 1
ELSE 0
END) AS retention_ind
FROM student_enrollment e
JOIN first_term f ON e.student_id = f.student_id
GROUP BY e.student_id
),
graduation_check AS (
SELECT
g.student_id,
MAX(CASE
WHEN g.grad_level = 'B' AND g.grad_status = 'A' THEN 1
ELSE 0
END) AS graduation_ind
FROM graduation g
GROUP BY g.student_id
)
SELECT
f.student_id,
COALESCE(r.retention_ind, 0) AS retention_ind,
COALESCE(g.graduation_ind, 0) AS graduation_ind
FROM first_term f
LEFT JOIN retention_check r ON f.student_id = r.student_id
LEFT JOIN graduation_check g ON f.student_id = g.student_id
'''
return query
Solution 5
def grad_retention_inds() -> str:
return """
WITH first_enrollment AS (
SELECT student_id, MIN(CAST(term AS INTEGER)) AS first_term
FROM student_enrollment
GROUP BY student_id
)
SELECT
se.student_id,
MAX(CASE WHEN CAST(se.term AS INTEGER) = fe.first_term + 100 THEN 1 ELSE 0 END) AS retention_ind,
MAX(CASE WHEN g.grad_level = 'B' AND g.grad_status = 'A' THEN 1 ELSE 0 END) AS graduation_ind
FROM
student_enrollment se
LEFT JOIN first_enrollment fe
ON se.student_id = fe.student_id
LEFT JOIN graduation g
ON se.student_id = g.student_id
GROUP BY
se.student_id
"""
Solution 6
def grad_retention_inds() -> str:
query = """
with ft as(
select
student_id,
min(term) AS first_term
from
student_enrollment
group by
student_id
)
select
ft.student_id,
max(
case
when se.term = cast(ft.first_term as int) +100
then 1
else 0
end
) as retention_ind,
max(
case
when grad.grad_level = "B" and grad.grad_status = "A"
then 1
else 0
end
) as graduation_ind
from
ft
left join
student_enrollment as se
on
ft.student_id = se.student_id
left join
graduation grad
on
ft.student_id = grad.student_id
group by
1
"""
return query
Exercise 6
Solution 1
def rename_recpt_columns(df: pd.DataFrame) -> pd.DataFrame:
import re
df = df.copy()
columns = df.columns
scholarship_cols = [col for col in columns if re.search('recpt', col)]
scholarship_cols = sorted(scholarship_cols)
scholarship_map = {
col: f"scholarship_{X + 1}_recpt"
for X, col in enumerate(scholarship_cols)
}
df = df.rename(columns=scholarship_map)
return df
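A quick illustration of the renaming behavior on a toy frame (the column names here are invented, not exam data):
# 'alpha_recpt' sorts first, so it becomes scholarship_1_recpt; column order is preserved.
demo = pd.DataFrame(columns=['student_id', 'zeta_recpt', 'alpha_recpt'])
print(list(rename_recpt_columns(demo).columns))
# ['student_id', 'scholarship_2_recpt', 'scholarship_1_recpt']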
Solution 2
def rename_recpt_columns(df: pd.DataFrame) -> pd.DataFrame:
df2 = df.copy()
sorted_col = {name:'scholarship_'+str(ind+1)+'_recpt' for ind, name in enumerate(sorted([i for i in df2.columns if 'recpt' in i]))}
df2 = df2.rename(columns=sorted_col)
return df2
Solution 3
def rename_recpt_columns(df: pd.DataFrame) -> pd.DataFrame:
rdf=df.copy()
cs=[i for i in rdf.columns if 'recpt' in i]
d={c:f'scholarship_{i}_recpt' for i,c in enumerate(sorted(cs),start=1)}
return rdf.rename(columns=d)
Solution 4
def rename_recpt_columns(df: pd.DataFrame) -> pd.DataFrame:
dfc = df.copy()
recpt_cols = [col for col in dfc.columns if 'recpt' in col]
recpt_cols.sort()
rename_mapping = {col: f'scholarship_{num}_recpt' for num, col in enumerate(recpt_cols, start=1)}
dfc.rename(columns=rename_mapping, inplace=True)
return dfc
Solution 5
def rename_recpt_columns(df: pd.DataFrame) -> pd.DataFrame:
df_return = df.copy()
recpt_cols = [col for col in df.columns if 'recpt' in col]
sorted_recpt_cols = sorted(recpt_cols)
new_cols = {}
for i, col in enumerate(sorted_recpt_cols):
new_cols[col] = f"scholarship_{i+1}_recpt"
df_return.rename(columns = new_cols, inplace=True)
return df_return
Solution 6
def rename_recpt_columns(df: pd.DataFrame) -> pd.DataFrame:
    recpt_cols = sorted([col for col in df.columns if "recpt" in col])
    rename_mapping = {col: f"scholarship_{i+1}_recpt" for i, col in enumerate(recpt_cols)}
    return df.rename(columns=rename_mapping)
Exercise 7
Solution 1
def adjust_term(df: pd.DataFrame, n_years: int) -> pd.DataFrame:
df = df.copy()
year_adjuster = int(n_years) * 100
df['term'] = df['term'].map(lambda x: str(int(x) + year_adjuster))
return df
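All of these exploit the fact that terms are six-digit YYYYMM-style strings, so adding n_years * 100 to the integer form advances only the year digits. For example:
# Advancing term '202208' by two years:
print(str(int('202208') + 2 * 100))   # '202408'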
Solution 2
def adjust_term(df: pd.DataFrame, n_years: int) -> pd.DataFrame:
df2 = df.copy()
df2['term'] = df2['term'].apply(lambda x:str(int(x) + (n_years*100)))
return df2
Solution 3
def adjust_term(df: pd.DataFrame, n_years: int) -> pd.DataFrame:
dfc = df.copy()
dfc['year'] = dfc['term'].astype(str).str[:4].astype(int)
dfc['month'] = dfc['term'].astype(str).str[4:]
dfc['year'] += n_years
dfc['term'] = dfc['year'].astype(str) + dfc['month']
dfc.drop(columns=['year', 'month'], inplace=True)
return dfc
Solution 4
def adjust_term(df: pd.DataFrame, n_years: int) -> pd.DataFrame:
df_return = df.copy()
df_return['term'] = df_return['term'].apply(
lambda x : str(int(x[:4]) + n_years) + x[4:]
)
return df_return
Solution 5
def adjust_term(df: pd.DataFrame, n_years: int) -> pd.DataFrame:
df1 = df.copy()
df1["term"] = df1["term"].astype(int) + 100 * n_years
df1["term"] = df1["term"].astype("str")
return df1
Exercise 8
Solution 1
def generate_fake_ids(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
if 'student_id' in df:
df['anon_id'] = df['student_id'].map(lambda x: generate_md5_hash(x))
del df['student_id']
return df
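These solutions all call the exam-provided generate_md5_hash helper. If you want to experiment outside the exam notebook, a plausible stand-in is below; this is an assumption, and the real helper's exact behavior may differ.
import hashlib

def generate_md5_hash(value) -> str:
    # Assumed behavior: MD5 hex digest of the value's string form.
    return hashlib.md5(str(value).encode('utf-8')).hexdigest()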
Solution 2
def generate_fake_ids(df: pd.DataFrame) -> pd.DataFrame:
df2 = df.copy()
if 'student_id' not in df2.columns:
return df2
df2['student_id'] = df2['student_id'].apply(lambda x: generate_md5_hash(x))
return df2.rename(columns={'student_id': 'anon_id'})
Solution 3
def generate_fake_ids(df: pd.DataFrame) -> pd.DataFrame:
rdf=df.copy()
if 'student_id' not in rdf.columns:
return rdf
rdf=rdf.rename(columns={'student_id':'anon_id'})
rdf['anon_id']=rdf['anon_id'].apply(generate_md5_hash)
return rdf
Solution 4
def generate_fake_ids(df: pd.DataFrame) -> pd.DataFrame:
dfc = df.copy()
if 'student_id' not in dfc.columns:
return df
dfc['anon_id'] = dfc['student_id'].apply(generate_md5_hash)
dfc.drop(columns = ['student_id'], inplace = True)
return dfc
Solution 5
def generate_fake_ids(df: pd.DataFrame) -> pd.DataFrame:
if 'student_id' not in df.columns:
return df
df_return = df.copy()
df_return['student_id'] = df_return['student_id'].apply(
generate_md5_hash
)
df_return.rename(columns={'student_id': 'anon_id'}, inplace=True)
return df_return
Solution 6
def generate_fake_ids(df: pd.DataFrame) -> pd.DataFrame:
df1 = df.copy()
if "student_id" not in df1.columns:
return df1
df1["student_id"] = df1["student_id"].astype(str)
df1["student_id"] = df1["student_id"].map(generate_md5_hash)
df1 = df1.rename(columns={"student_id": "anon_id"})
return df1
Exercise 9
Solution 1
def deidentify_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
df = df.copy()
for column in columns:
vals = set(df[column])
sorted_vals = sorted(list(vals))
val_anon_map = {
val: f'{column}_{i + 1}'
for i, val in enumerate(sorted_vals)
}
df[column] = df[column].map(lambda x: val_anon_map[x])
return df
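For example, on a toy column (values invented), the sorted unique values determine the suffixes:
demo = pd.DataFrame({'major': ['CS', 'EE', 'CS', 'BIO']})
print(deidentify_columns(demo, ['major'])['major'].tolist())
# ['major_2', 'major_3', 'major_2', 'major_1'], since sorted order is BIO, CS, EE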
Solution 2
def deidentify_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
df2 = df.copy()
for col in columns:
mapping = {val:col+'_'+str(ind+1) for ind, val in enumerate(df2[col].sort_values().unique())}
df2[col] = df2[col].map(mapping)
return df2
Solution 3
def deidentify_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
rdf=df.copy()
for c in columns:
vs=rdf[c].unique()
d={v:f'{c}_{i}' for i,v in enumerate(sorted(vs),start=1)}
rdf[c]=rdf[c].map(d)
return rdf
Solution 4
def deidentify_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
dfc = df.copy()
for col in columns:
items = sorted(dfc[col].unique())
rename_mapping = {val: f'{col}_{num+1}' for num, val in enumerate(items)}
dfc[col] = dfc[col].map(rename_mapping)
return dfc
Solution 5
def deidentify_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
df_return = df.copy()
for col in columns:
sorted_vals = sorted(df_return[col].unique())
new_identifier = {
val: f"{col}_{i+1}" for i, val in enumerate(sorted_vals)
}
df_return[col] = df_return[col].map(new_identifier)
return df_return
Solution 6
def deidentify_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
df1 = df.copy()
for column in columns:
uniq_vals = sorted(df1[column].unique())
val_map = {value: f"{column}_{i+1}" for i, value in enumerate(uniq_vals)}
df1[column] = df1[column].map(val_map)
return df1
Exercise 10
Solution 1
def perturb_gpa(df: pd.DataFrame, seed: int = 6040) -> pd.DataFrame:
rng = np.random.default_rng(seed)
import re
df = df.copy()
if 'semester_gpa' not in df:
return df
schol_cols = [col for col in df.columns if re.search(r'_recpt$', col)]
if not schol_cols:
return df
def perturb_gpas(tbl):
mean_sem_GPA = tbl['semester_gpa'].mean()
tbl_perturbed = rng.normal(mean_sem_GPA, 0.1, size=len(tbl))
tbl['semester_gpa'] = tbl_perturbed
return tbl
# No scholarships
no_schols = df[df[schol_cols].sum(axis=1) == 0].copy()
no_schols = perturb_gpas(no_schols)
# One scholarship
single_schols = []
for col in schol_cols:
one_schols = df[(df[col] == 1) & (df[schol_cols].sum(axis=1) == 1)].copy()
one_schols = perturb_gpas(one_schols)
single_schols.append(one_schols)
# Many scholarships
many_schols = df[df[schol_cols].sum(axis=1) > 1].copy()
return pd.concat([*single_schols, no_schols, many_schols])
#### IMPORTANT ---------------------------------------------------------
# Doing the perturbation steps in a different order /will not pass/: every group draws
# from the same RNG stream, so the numbers a group receives depend on the order in which
# the groups are processed. One fix would be a special perturbation function that resets
# the seed before each run (see the sketch at the end of this solution). The variant
# below should pass, but it /won't/, because its steps happen out of order.
### Solution - Exercise 10 - THIS SHOULD WORK BUT DOES NOT
def perturb_gpa(df: pd.DataFrame, seed: int = 6040) -> pd.DataFrame:
rng = np.random.default_rng(seed)
import re
df = df.copy()
if 'semester_gpa' not in df:
return df
schol_cols = [col for col in df.columns if re.search(r'_recpt$', col)]
if not schol_cols:
return df
def perturb_gpas(tbl):
mean_sem_GPA = tbl['semester_gpa'].mean()
tbl_perturbed = rng.normal(mean_sem_GPA, 0.1, size=len(tbl))
tbl['semester_gpa'] = tbl_perturbed
return tbl
# One scholarship
single_schols = []
for col in schol_cols:
one_schols = df[(df[col] == 1) & (df[schol_cols].sum(axis=1) == 1)].copy()
one_schols = perturb_gpas(one_schols)
single_schols.append(one_schols)
# No scholarships
no_schols = df[df[schol_cols].sum(axis=1) == 0].copy()
no_schols = perturb_gpas(no_schols)
# Many scholarships
many_schols = df[df[schol_cols].sum(axis=1) > 1].copy()
return pd.concat([*single_schols, no_schols, many_schols])
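Following the note above, one way to make the draws order-independent would be a helper that reseeds per group. This is only a sketch of the idea (the group_id key is hypothetical), not the grader's actual scheme:
def perturb_gpas_reseeded(tbl: pd.DataFrame, group_id: int, seed: int = 6040) -> pd.DataFrame:
    # Derive an independent RNG per group so the numbers drawn no longer depend
    # on the order in which the groups are processed. group_id is hypothetical:
    # e.g. 0 for the no-scholarship group and 1..k for the single-scholarship groups.
    rng = np.random.default_rng([seed, group_id])
    tbl = tbl.copy()
    tbl['semester_gpa'] = rng.normal(tbl['semester_gpa'].mean(), 0.1, size=len(tbl))
    return tbl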
Solution 2
def perturb_gpa(df: pd.DataFrame, seed: int = 6040) -> pd.DataFrame:
rng = np.random.default_rng(seed)
df2 = df.copy()
v = '_recpt'
cols_with_recpt = [i for i in df2.columns if i[-len(v):]==v]
if 'semester_gpa' not in df2.columns or len(cols_with_recpt) == 0:
return df2
def scholarship_code(x):
if sum(x) == 0:
return -1
elif sum(x) == 1:
return [i for i, v in enumerate(x) if v==1][0] # return null implicitly otherwise if sum(x) > 1
group = df2[cols_with_recpt].apply(scholarship_code, axis=1)
for s in range(-1,len(cols_with_recpt)):
mapping = group[group==s].index
mean_gpa = np.mean(df2.loc[mapping, 'semester_gpa'])
df2.loc[mapping, 'semester_gpa'] = rng.normal(loc=mean_gpa, scale=0.1, size=len(mapping))
return df2
Solution 3
def perturb_gpa(df: pd.DataFrame, seed: int = 6040) -> pd.DataFrame:
rng = np.random.default_rng(seed)
rdf=df.copy()
if 'semester_gpa' not in rdf.columns:
return rdf
cs=[i for i in rdf.columns if '_recpt' in i]
if len(cs)==0:
return rdf
cols=[i for i in rdf.columns]
rdf['row_sum']=rdf[cs].sum(axis=1)
#all 0
size=rdf[rdf['row_sum']==0]['semester_gpa'].shape
mu=rdf[rdf['row_sum']==0]['semester_gpa'].mean()
rdf.loc[rdf['row_sum']==0,'semester_gpa']=rng.normal(mu, .1, size)
for c in cs:
size=rdf[(rdf['row_sum']==1) & (rdf[c]==1)]['semester_gpa'].shape
mu=rdf[(rdf['row_sum']==1) & (rdf[c]==1)]['semester_gpa'].mean()
rdf.loc[(rdf['row_sum']==1) & (rdf[c]==1),'semester_gpa']=rng.normal(mu, .1, size)
return rdf[cols]
Solution 4
def perturb_gpa(df: pd.DataFrame, seed: int = 6040) -> pd.DataFrame:
rng = np.random.default_rng(seed)
if 'semester_gpa' not in df.columns:
return df
dfc = df.copy()
sch_cols = [col for col in df.columns if col.endswith('_recpt')]
if not (set(sch_cols) & set(dfc.columns)):
return df
no_scholarship_mask = (dfc[sch_cols] == 0).all(axis=1)
if no_scholarship_mask.any():
no_scholarship_mean = dfc.loc[no_scholarship_mask, 'semester_gpa'].mean()
dfc.loc[no_scholarship_mask, 'semester_gpa'] = rng.normal(
loc=no_scholarship_mean, scale=0.1, size=no_scholarship_mask.sum()
)
for col in sch_cols:
only_this_scholarship_mask = (dfc[col] == 1) & (dfc[sch_cols].sum(axis=1) == 1)
if only_this_scholarship_mask.any():
specific_scholarship_mean = dfc.loc[only_this_scholarship_mask, 'semester_gpa'].mean()
dfc.loc[only_this_scholarship_mask, 'semester_gpa'] = rng.normal(
loc=specific_scholarship_mean, scale=0.1, size=only_this_scholarship_mask.sum()
)
return dfc
Solution 5
def perturb_gpa(df: pd.DataFrame, seed: int = 6040) -> pd.DataFrame:
rng = np.random.default_rng(seed)
if 'semester_gpa' not in df.columns:
return df
scholarship_cols = [col for col in df.columns if col.endswith('_recpt')]
if not scholarship_cols:
return df
df_return = df.copy()
no_ss_mask = (df_return[scholarship_cols] == 0).all(axis=1)
no_ss_gpa = df_return[no_ss_mask]['semester_gpa']
mean_no_ss_gpa = no_ss_gpa.mean()
df_return.loc[no_ss_mask, 'semester_gpa'] = rng.normal(
loc=mean_no_ss_gpa, scale=0.1, size=no_ss_mask.sum()
)
for ss in scholarship_cols:
ss_mask = (df_return[ss] == 1) & (df_return[scholarship_cols].drop(columns=[ss]) == 0).all(axis=1)
if ss_mask.any():
ss_gpa = df_return[ss_mask]['semester_gpa']
mean_ss_gpa = ss_gpa.mean()
df_return.loc[ss_mask, 'semester_gpa'] = rng.normal(loc=mean_ss_gpa, scale=0.1, size=ss_mask.sum())
return df_return
Solution 6
def perturb_gpa(df: pd.DataFrame, seed: int = 6040) -> pd.DataFrame:
rng = np.random.default_rng(seed)
if "semester_gpa" not in df.columns:
return df
cols = [col for col in df.columns if col.endswith("_recpt")]
if len(cols) == 0:
return df
df1 = df.copy()
mask_zeros = (df1[cols] == 0).all(axis=1)
mean_zero_gpa = df1.loc[mask_zeros, "semester_gpa"].mean()
num_zeros = mask_zeros.sum()
if num_zeros > 0:
df1.loc[mask_zeros, "semester_gpa"] = rng.normal(mean_zero_gpa, 0.1, size=num_zeros)
for col in cols:
other_cols = [c for c in cols if c != col]
mask_single = (df1[col] == 1) & (df1[other_cols] == 0).all(axis=1)
mean_gpa = df1.loc[mask_single, "semester_gpa"].mean()
if mask_single.sum() == 0 or np.isnan(mean_gpa):
continue
df1.loc[mask_single, "semester_gpa"] = rng.normal(mean_gpa, 0.1, size=mask_single.sum())
return df1
MT2_FA24
Exercise 0
Solution 1
def load_data(records: list) -> pd.DataFrame:
import pytz
df = pd.DataFrame(records)
timezone = pytz.timezone('America/New_York')
df['Date'] = pd.to_datetime(df['Date'], utc=True)
df['Date'] = df['Date'].dt.tz_convert(timezone)
df = df.rename(columns={'Stock':'Firm'})
return df
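The input records are dicts keyed by Date, Stock, Open, Close, and Volume (visible in Solutions 7 and 8 below). For local experimentation you could feed in a made-up record like:
# Invented values, for experimentation only; the real records come from the exam.
sample = [{'Date': '2021-01-04 05:00:00+00:00', 'Stock': 'NFLX',
           'Open': 539.0, 'Close': 522.86, 'Volume': 4444400}]
print(load_data(sample))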
Solution 2
def load_data(records: list) -> pd.DataFrame:
full_data = pd.DataFrame(records)
full_data = full_data.rename(columns = {"Stock": "Firm"})
full_data['Date'] = pd.to_datetime(full_data.Date, format='%Y-%m-%d %H:%M:%S', utc=True)
full_data['Date'] = full_data['Date'].dt.tz_convert('America/New_York')
return full_data
Solution 3
def load_data(records: list) -> pd.DataFrame:
full_data = pd.DataFrame(records)
full_data = full_data.rename(columns={'Stock': 'Firm'})
full_data['Date'] = full_data['Date'].apply(lambda x: pd.to_datetime(x).tz_convert('America/New_York'))
return full_data
Solution 4
def load_data(records: list) -> pd.DataFrame:
df = pd.DataFrame(records)
df['Date'] = pd.to_datetime(df['Date'])
df['Date'] = df['Date'].apply(lambda x: pd.to_datetime(x).tz_convert('America/New_York'))
df.rename(columns={"Stock": "Firm"}, inplace=True)
return df
Solution 5
def load_data(records: list) -> pd.DataFrame:
df = pd.DataFrame(records)
df = df.rename(columns={'Stock': 'Firm'})
df['Date'] = pd.to_datetime(df.Date, utc = True)
df = df.astype({'Date': 'datetime64[ns, America/New_York]'})
return df
Solution 6
def load_data(records: list) -> pd.DataFrame:
df = pd.DataFrame(records)
df["Date"] = df["Date"].apply(lambda x: pd.to_datetime(x).tz_convert("America/New_York"))
df = df.rename(columns={"Stock":"Firm"})
return df
Solution 7
def load_data(records: list) -> pd.DataFrame:
df = pd.DataFrame(records, columns=['Date', 'Stock', 'Open', 'Close', 'Volume'])
df = df.rename(columns={'Stock': 'Firm'})
df['Date'] = pd.to_datetime(df['Date'])
df['Date'] = df['Date'].apply(lambda x: pd.to_datetime(x).tz_convert('America/New_York'))
return df
Solution 8
def load_data(records: list) -> pd.DataFrame:
date_ = [a['Date'] for a in records]
stock_ = [a['Stock'] for a in records]
open_ = [a['Open'] for a in records]
close_ = [a['Close'] for a in records]
volume_ = [a['Volume'] for a in records]
df = pd.DataFrame({'Date':date_, 'Firm':stock_, 'Open':open_, 'Close':close_, 'Volume': volume_})
df['Date'] = pd.to_datetime(df['Date'])
return df
Exercise 2
Solution 1
def calculate_daily_growth(stock_records: pd.DataFrame, firm: str) -> pd.DataFrame:
df = stock_records.copy()
df = df.loc[df['Firm'] == firm]
df['shift'] = df.loc[df['Close'].notna()]['Close'].shift(periods=1)
df['Day_Growth'] = (df['Close'] / df['shift']) - 1
df = df.drop(columns=['shift']).reset_index(drop=True)
df['Day_Growth'] = df['Day_Growth'].fillna(0)
return df
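All of the solutions below compute the same quantity: Day_Growth = Close_today / Close_prev - 1 over the non-null closes, with the first row defaulting to 0. A worked one-liner:
prev_close, close = 100.0, 105.0
day_growth = close / prev_close - 1   # 0.05, i.e. a 5% gain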
Solution 2
def calculate_daily_growth(stock_records: pd.DataFrame, firm: str) -> pd.DataFrame:
firm_stocks = stock_records.copy()
firm_stocks = firm_stocks[firm_stocks['Firm'] == firm]
firm_stocks['prev_close'] = firm_stocks[firm_stocks['Close'].notna()]['Close'].shift(1)
firm_stocks['Day_Growth'] = (firm_stocks['Close']/firm_stocks['prev_close']) - 1
firm_stocks.drop('prev_close', axis=1, inplace=True)
firm_stocks.reset_index(drop=True, inplace=True)
firm_stocks['Day_Growth'] = firm_stocks['Day_Growth'].fillna(0)
return firm_stocks
Solution 3
def calculate_daily_growth(stock_records: pd.DataFrame, firm: str) -> pd.DataFrame:
df = stock_records.copy()
df = df[df['Firm']==firm]
df['Day_Growth'] = 0.000000
na_mask = df['Open'].isna()
na_rows = df[na_mask]
df_no_na = df.dropna(subset=['Open', 'Close'])
shifted_vals = df_no_na['Close'].shift(periods=1)
df_no_na = df_no_na.assign(last_close = shifted_vals.values)
new_day_growth = df_no_na.apply(lambda x: (x['Close']/x['last_close'])-1, axis=1)
df_no_na = df_no_na.assign(Day_Growth = new_day_growth.values)
del df_no_na['last_close']
df_no_na.fillna(0, inplace=True)
stock_growth = pd.concat([na_rows, df_no_na])
stock_growth = stock_growth.sort_values(by='Date')
return stock_growth.reset_index(drop=True)
Solution 4
def calculate_daily_growth(stock_records: pd.DataFrame, firm: str) -> pd.DataFrame:
stock_growth = stock_records.copy()
stock_growth = stock_growth[stock_growth['Firm'] == firm]
stock_growth.insert(5, 'Day_Growth', 0.0)
filtered_df = stock_growth[stock_growth['Close'].notna()].copy()
filtered_df.loc[:,'Day_Growth'] = filtered_df['Close'] / filtered_df['Close'].shift(periods=1) - 1
stock_growth['Day_Growth'] = filtered_df['Day_Growth'].reindex(stock_growth.index)
stock_growth['Day_Growth'] = stock_growth['Day_Growth'].fillna(0)
stock_growth = stock_growth.sort_values(by='Date')
stock_growth = stock_growth.reset_index(drop=True)
return stock_growth
Solution 5
def calculate_daily_growth(stock_records: pd.DataFrame, firm: str) -> pd.DataFrame:
stock_growth = stock_records[stock_records["Firm"] == firm].copy()
stock_growth["Close_prev"] = stock_growth["Close"].shift(1)
stock_growth["Close_prev"] = stock_growth["Close_prev"].fillna(method="ffill")
stock_growth["Day_Growth"] = stock_growth["Close"]/stock_growth["Close_prev"]-1
stock_growth = stock_growth.drop(["Close_prev"], axis=1)
stock_growth["Day_Growth"] = stock_growth["Day_Growth"].fillna(0)
stock_growth = stock_growth[['Date', 'Firm', 'Open', 'Close', 'Volume', 'Day_Growth']].reset_index(drop=True)
return stock_growth
Solution 6
def calculate_daily_growth(stock_records: pd.DataFrame, firm: str) -> pd.DataFrame:
    comp = stock_records[stock_records['Firm'] == firm].copy()
    comp["Lag"] = comp['Close'].shift(1)
    comp['Lag'] = comp['Lag'].ffill()
comp['Day_Growth'] = comp["Close"] / comp['Lag'] - 1
comp['Day_Growth'] = comp['Day_Growth'].fillna(0.0)
comp = comp.drop(columns=['Lag'])
return comp.reset_index(drop=True)
Solution 7
def calculate_daily_growth(stock_records: pd.DataFrame, firm: str) -> pd.DataFrame:
# filter by firm
stock_records = stock_records.loc[stock_records['Firm'] == firm].copy()
# filter out NaN to calculate growth
stock_records_no_nan = stock_records.dropna()
stock_records_no_nan['prev_close'] = stock_records_no_nan.shift(periods=1)['Close']
stock_records_no_nan['Day_Growth'] = (stock_records_no_nan['Close'] / stock_records_no_nan['prev_close']) - 1
stock_records['Day_Growth'] = 0.0
stock_records = stock_records.merge(stock_records_no_nan,on='Date',how="left")
stock_records['Day_Growth_y'] = stock_records['Day_Growth_y'].fillna(0.0)
stock_records = stock_records[['Date', 'Firm_x', 'Open_x', 'Close_x', 'Volume_x', 'Day_Growth_y']]
stock_records = stock_records.rename(columns={"Firm_x": "Firm", "Open_x": "Open", "Close_x" : "Close", "Volume_x" : "Volume", "Day_Growth_y" : "Day_Growth"})
return stock_records
Solution 8
def calculate_daily_growth(stock_records: pd.DataFrame, firm: str) -> pd.DataFrame:
df = stock_records.copy()
df = df[df['Firm']==firm].reset_index() #filtered to 'firm'
is_null = pd.isnull(df.loc[:,'Close'])
close_ = [df.loc[0,'Close']]
for i in range(1,df.shape[0]):
if is_null[i]:
close_.append(close_[i-1])
else:
close_.append(df.loc[i,'Close'])
Day_Growth = [0] + [close_[i]/close_[i-1]-1 for i in range(1,len(close_))]
df['Day_Growth'] = Day_Growth
return df[['Date', 'Firm', 'Open', 'Close', 'Volume', 'Day_Growth']]
Exercise 3
Solution 1
def calculate_annual_growth(stock_records: pd.DataFrame) -> pd.DataFrame:
df = stock_records.copy()
df['Year'] = df['Date'].dt.year.astype('str')
df['Annual_Growth'] = df['Day_Growth'] + 1
df['Annual_Growth'] = df.groupby(['Firm', 'Year'])['Annual_Growth'].transform('prod') - 1
return df
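Annual growth compounds the daily factors within each firm-year: Annual_Growth = prod(1 + Day_Growth) - 1. A small worked example:
from math import prod

daily = [0.01, 0.01, -0.02]               # two days of +1%, one day of -2%
annual = prod(1 + g for g in daily) - 1   # ≈ -0.0003, i.e. nearly flat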
Solution 2
def calculate_annual_growth(stock_records: pd.DataFrame) -> pd.DataFrame:
copy_stock_records = stock_records.copy()
copy_stock_records['Year'] = copy_stock_records['Date'].dt.year.astype('str')
copy_stock_records['Temp_Growth'] = copy_stock_records['Day_Growth'] + 1
temp_df = copy_stock_records[['Firm', 'Year', 'Temp_Growth']].copy(deep=True)
    prod_df = temp_df.groupby(['Firm', 'Year'])['Temp_Growth'].prod() - 1
prod_df=prod_df.reset_index()
prod_df = prod_df.rename(columns = {"Temp_Growth": "Annual_Growth"})
final_df = pd.merge(copy_stock_records,prod_df, on=['Firm', 'Year'], how='left' )
final_df = final_df[['Date', 'Year', 'Firm', 'Open', 'Close', 'Volume', 'Day_Growth', 'Annual_Growth']]
return final_df
Solution 3
def calculate_annual_growth(stock_records: pd.DataFrame) -> pd.DataFrame:
annual_growth = stock_records.copy()
annual_growth['Year'] = annual_growth['Date'].dt.year
annual_growth['updated_day'] = annual_growth.apply(lambda x: x['Day_Growth']+1, axis=1)
annual_growth['Annual_Growth'] = annual_growth.groupby(by=['Firm', 'Year'])['updated_day'].transform('prod')
del annual_growth['updated_day']
annual_growth['Annual_Growth'] = annual_growth['Annual_Growth'].apply(lambda x: x-1)
annual_growth = annual_growth[['Date', 'Year', 'Firm', 'Open', 'Close', 'Volume', 'Day_Growth', 'Annual_Growth']]
annual_growth['Year'] = annual_growth['Year'].astype(str)
return annual_growth.reset_index(drop=True)
Solution 4
def calculate_annual_growth(stock_records: pd.DataFrame) -> pd.DataFrame:
df = stock_records.copy()
df['Year'] = df['Date'].dt.year.astype(str)
df = df.assign(Annual_Growth=(df['Day_Growth'] + 1))
df['Annual_Growth'] = df.groupby(['Year', 'Firm'])['Annual_Growth'].transform(lambda x: x.prod() - 1)
return df
Solution 5
def calculate_annual_growth(stock_records: pd.DataFrame) -> pd.DataFrame:
annual_growth = stock_records.copy()
annual_growth["Year"] = annual_growth["Date"].dt.year.astype(str)
agg_records = annual_growth.copy()
agg_records["Annual_Growth"] = agg_records["Day_Growth"] + 1
    agg_records = agg_records.groupby(['Firm', 'Year']).prod(numeric_only=True).reset_index()
agg_records = agg_records[['Year', 'Firm', 'Annual_Growth']]
agg_records["Annual_Growth"] = agg_records["Annual_Growth"] - 1
annual_growth = pd.merge(annual_growth, agg_records, left_on= ['Firm', 'Year'], right_on = ['Firm', 'Year'], how='left')
annual_growth = annual_growth[['Date', 'Year', 'Firm', 'Open', 'Close', 'Volume', 'Day_Growth', 'Annual_Growth']].reset_index(drop=True)
return annual_growth
Solution 6
from math import prod
def calculate_annual_growth(stock_records: pd.DataFrame) -> pd.DataFrame:
records = stock_records.copy()
year = records['Date'].dt.year
year = year.astype("str")
records.insert(1, "Year", year)
groups = records.groupby(by=["Year", "Firm"], as_index=False).apply(lambda x: prod(x["Day_Growth"] + 1) - 1)
groups.columns = ["Year", "Firm", "Annual_Growth"]
records = records.merge(groups, on=["Year", "Firm"])
return records.reset_index(drop=True)
Solution 7
def calculate_annual_growth(stock_records: pd.DataFrame) -> pd.DataFrame:
# group by firm and year
stock_records_cp = stock_records.copy()
stock_records_cp['Year'] = stock_records_cp['Date'].dt.year.astype(str)
stock_records_cp['Growth_Factor'] = stock_records_cp['Day_Growth'] + 1
result = stock_records_cp.groupby(['Firm', 'Year'])['Growth_Factor'].prod().reset_index()
result['Annual_Growth'] = result['Growth_Factor'] - 1
# get final df
stock_records_cp = stock_records_cp.merge(result, on=['Year', 'Firm'], how='left')
stock_records_cp = stock_records_cp.drop(['Growth_Factor_x', 'Growth_Factor_y'], axis=1)
stock_records_cp = stock_records_cp[['Date', 'Year', 'Firm', 'Open', 'Close', 'Volume', 'Day_Growth', 'Annual_Growth']]
return stock_records_cp
Solution 8
def mult(s):
    # Compound the daily growth factors for one (Year, Firm) group.
    product = 1
    for i in s['Day_Growth']:
        product *= (i + 1)
    return product - 1
def calculate_annual_growth(stock_records: pd.DataFrame) -> pd.DataFrame:
df = stock_records.copy()
df['Year'] = pd.DatetimeIndex(df['Date']).year
df['Year'] = df['Year'].astype('string')
df2 = df.groupby(['Year','Firm']).apply(mult)
df = df.merge(df2.rename('Annual_Growth'), how='left', on=['Year','Firm'])
df = df[['Date','Year','Firm','Open','Close','Volume','Day_Growth','Annual_Growth']]
return df
Exercise 4
Solution 1
def golden_cross_strategy(stockdf: pd.DataFrame, firm: str, short_average: int, long_average: int) -> pd.DataFrame:
df = stockdf.copy()
df = df.loc[df['Firm'] == firm]
df = df.loc[df['Close'].notna()]
df['Short_Average'] = df.rolling(short_average)['Close'].mean()
df['Long_Average'] = df.rolling(long_average)['Close'].mean()
df['Golden_Cross'] = df['Long_Average'] < df['Short_Average']
df = df.reset_index(drop=True)
return df
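Note that rolling(n).mean() is NaN for the first n - 1 rows, and comparisons against NaN are False, so Golden_Cross stays False until the long window fills. A quick check (the firm and window sizes here are arbitrary, and stockdf is assumed to be the exam's stock DataFrame):
out = golden_cross_strategy(stockdf, 'NFLX', short_average=5, long_average=20)
print(out['Golden_Cross'].iloc[:19].any())   # False: the 20-day window isn't full yet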
Solution 2
def golden_cross_strategy(stockdf: pd.DataFrame, firm: str, short_average: int, long_average: int) -> pd.DataFrame:
firm_stocks = stockdf.copy()
firm_stocks = firm_stocks[firm_stocks['Firm'] == firm]
filtered_df = firm_stocks[firm_stocks['Close'].notna()].copy(deep=True)
filtered_df['Short_Average'] = filtered_df['Close'].rolling(short_average).mean()
filtered_df['Long_Average'] = filtered_df['Close'].rolling(long_average).mean()
filtered_df['Golden_Cross'] = filtered_df['Short_Average'] > filtered_df['Long_Average']
filtered_df = filtered_df.reset_index(drop = True)
return filtered_df
Solution 3
def golden_cross_strategy(stockdf: pd.DataFrame, firm: str, short_average: int, long_average: int) -> pd.DataFrame:
df = stockdf.copy()
# Filter the DataFrame to include records for the firm parameter.
df = df[df['Firm']==firm]
# Some Close values are recorded as NaN. These should be filtered out before calculating the moving averages.
df_notna = df.dropna(subset=['Close'])
# Add the columns Short_Average and Long_Average to the Pandas DataFrame.
short_window = (df_notna['Close'].rolling(short_average).sum())/short_average
long_window = (df_notna['Close'].rolling(long_average).sum())/long_average
df_notna = df_notna.assign(Short_Average = short_window.values)
df_notna = df_notna.assign(Long_Average = long_window.values)
# Add the column 'Golden_Cross', a boolean, to the Pandas DataFrame.
golden_cross = df_notna['Short_Average']>df_notna['Long_Average']
df_notna = df_notna.assign(Golden_Cross = golden_cross.values)
golden_cross_results = df_notna.copy()
return golden_cross_results.reset_index(drop=True)
Solution 4
import numpy as np
def golden_cross_strategy(stockdf: pd.DataFrame, firm: str, short_average: int, long_average: int) -> pd.DataFrame:
df = stockdf[stockdf['Firm'] == firm].copy()
df = df[df['Close'].notna()]
df['Short_Average'] = df['Close'].rolling(window=short_average).mean()
df['Long_Average'] = df['Close'].rolling(window=long_average).mean()
df['Golden_Cross'] = np.where(df['Short_Average'] > df['Long_Average'], True, False)
return df.reset_index(drop=True)
Solution 5
def golden_cross_strategy(stockdf: pd.DataFrame, firm: str, short_average: int, long_average: int) -> pd.DataFrame:
    firmdf = stockdf[stockdf["Firm"] == firm]
    firmdf = firmdf[firmdf["Close"].notna()].copy()
firmdf["Short_Average"] = firmdf["Close"].rolling(short_average).mean()
firmdf["Long_Average"] = firmdf["Close"].rolling(long_average).mean()
firmdf["Golden_Cross"] = firmdf["Short_Average"] > firmdf["Long_Average"]
firmdf = firmdf.reset_index(drop=True)
return firmdf
Solution 6
def golden_cross_strategy(stockdf: pd.DataFrame, firm: str, short_average: int, long_average: int) -> pd.DataFrame:
    df = stockdf[stockdf['Firm'] == firm]
    non_na = df[df["Close"].notna()].copy()
non_na["Short_Average"] = non_na["Close"].rolling(short_average).mean()
non_na["Long_Average"] = non_na["Close"].rolling(long_average).mean()
non_na["Golden_Cross"] = non_na["Short_Average"].ge(non_na["Long_Average"])
return non_na.reset_index(drop=True)
Solution 7
def golden_cross_strategy(stockdf: pd.DataFrame, firm: str, short_average: int, long_average: int) -> pd.DataFrame:
results = stockdf[stockdf['Firm'] == firm]
results = results.dropna()
results['Short_Average'] = results['Close'].rolling(short_average).sum() / short_average
results['Long_Average'] = results['Close'].rolling(long_average).sum() / long_average
results['Golden_Cross'] = np.where(results['Short_Average'] > results['Long_Average'], True, False)
results = results.reset_index()
results = results.drop('index', axis = 1)
return results
Solution 8
def golden_cross_strategy(stockdf: pd.DataFrame, firm: str, short_average: int, long_average: int) -> pd.DataFrame:
df = stockdf.copy()
df = df[df['Firm']==firm].dropna()
df['Short_Average']=df['Close'].rolling(short_average).mean()
df['Long_Average']=df['Close'].rolling(long_average).mean()
df['Golden_Cross']=df['Short_Average']>df['Long_Average']
df.reset_index(inplace=True, drop=True)
return df
Exercise 5
Solution 1
def normalize_stock_growths(daily_growth_by_firm: pd.DataFrame, firm: str) -> pd.DataFrame:
df = daily_growth_by_firm.copy()
df = df.pivot(index='Date', columns='Firm', values='Day_Growth').reset_index()
colname = 'Non_' + firm + '_Average_Growth'
df[colname] = df.loc[:, ~df.columns.isin(['Firm', 'Date', firm])].mean(axis=1)
return df
Solution 2
def normalize_stock_growths(daily_growth_by_firm: pd.DataFrame, firm: str) -> pd.DataFrame:
growth_df = daily_growth_by_firm.copy()
pivot_df = growth_df.pivot(index = 'Date', columns = 'Firm', values = 'Day_Growth').reset_index()
mean_growth = pivot_df.drop(columns=['Date', firm]).mean(axis=1)
pivot_df[f'Non_{firm}_Average_Growth'] = mean_growth
return pivot_df
Solution 3
def get_avg_growth(possible_firms, row):
total = 0
for firm in possible_firms:
total += row[firm]
return total / len(possible_firms)
def normalize_stock_growths(daily_growth_by_firm: pd.DataFrame, firm: str) -> pd.DataFrame:
# get the list of firms that are not the firm in question
possible_firms = list(daily_growth_by_firm['Firm'].unique())
possible_firms.remove(firm)
normalized_results = daily_growth_by_firm.pivot(index='Date', columns='Firm', values='Day_Growth')
normalized_results = normalized_results.reset_index(drop=False)
col_name = 'Non_' + firm + '_Average_Growth'
normalized_results[col_name] = normalized_results.apply(lambda x: get_avg_growth(possible_firms, x), axis=1)
return normalized_results
Solution 4
def normalize_stock_growths(daily_growth_by_firm: pd.DataFrame, firm: str) -> pd.DataFrame:
pivoted_growth = daily_growth_by_firm.pivot(index='Date', columns='Firm', values='Day_Growth')
non_firm_avg_growth = pivoted_growth.drop(columns=[firm]).mean(axis=1)
pivoted_growth[f'Non_{firm}_Average_Growth'] = non_firm_avg_growth
pivoted_growth.reset_index(inplace=True)
return pivoted_growth
Solution 5
def normalize_stock_growths(daily_growth_by_firm: pd.DataFrame, firm: str) -> pd.DataFrame:
normalized_results = daily_growth_by_firm.copy()
    averages = normalized_results[normalized_results["Firm"] != firm].groupby("Date")["Day_Growth"].mean().reset_index()
    normalized_results = normalized_results.pivot(index="Date", columns="Firm", values="Day_Growth").reset_index()
normalized_results = pd.merge(normalized_results, averages, on="Date")
title = "Non_%s_Average_Growth" % (firm)
normalized_results = normalized_results.rename(columns={'Day_Growth': title})
normalized_results.index.name = 'Firm'
return normalized_results
Solution 6
def normalize_stock_growths(daily_growth_by_firm: pd.DataFrame, firm: str) -> pd.DataFrame:
def get_avg(curr_row, other_firms):
sum = 0
for firm in other_firms:
sum += curr_row[firm]
return sum / len(other_firms)
firms = np.unique(daily_growth_by_firm["Firm"].values)
other_firms = [possible_firm for possible_firm in firms if possible_firm != firm]
cast = daily_growth_by_firm.pivot(index=["Date"], columns=["Firm"], values="Day_Growth")
cast = cast.reset_index()
cast["Non_" + firm + "_Average_Growth"] = cast.apply(lambda x: get_avg(x, other_firms), axis=1)
return cast
Solution 7
def normalize_stock_growths(daily_growth_by_firm: pd.DataFrame, firm: str) -> pd.DataFrame:
# Group by date, keeping only Day_Growth for each firm
results = daily_growth_by_firm.pivot(index='Date', columns='Firm', values='Day_Growth')
# calculate average non firm growth
# Exclude firm column
results[f'Non_{firm}_Average_Growth'] = results[[col for col in results.columns if col not in [f'{firm}']]].mean(axis=1)
# reset index
results = results.reset_index()
return results
Solution 8
def normalize_stock_growths(daily_growth_by_firm: pd.DataFrame, firm: str) -> pd.DataFrame:
df = daily_growth_by_firm.copy()
df = df.pivot(index='Date', columns='Firm', values='Day_Growth')
df = df.reset_index()
cols = [c for c in df.columns if c not in ['Date',firm]]
df2 = df[cols]
df['Non_' + firm + '_Average_Growth'] = df2.apply(np.mean, axis=1)
return df
Exercise 6
Solution 1
def summarize_netflix_original_genres()-> str:
query = '''
SELECT substr(premiere, 1, 4) as year, genre, COUNT(genre) as genre_count
FROM nflx_originals
GROUP BY year, genre
ORDER BY genre_count desc, year, genre
LIMIT 10
'''
return query
Solution 2
def summarize_netflix_original_genres()-> str:
query = '''
SELECT strftime('%Y', premiere) as year, genre, COUNT(genre) as genre_count
FROM nflx_originals
GROUP BY year, genre
ORDER BY genre_count desc, year, genre
LIMIT 10
'''
return query
Solution 3
def summarize_netflix_original_genres()-> str:
    query = '''
    SELECT STRFTIME('%Y', premiere) AS year, genre, COUNT(*) AS genre_count
    FROM nflx_originals
    GROUP BY year, genre
    ORDER BY genre_count DESC, year, genre
    LIMIT 10
    '''
return query
Solution 4
def summarize_netflix_original_genres()-> str:
query = """
SELECT STRFTIME('%Y', premiere) AS year,
genre,
COUNT(*) AS genre_count
FROM NFLX_ORIGINALS
GROUP BY YEAR,
genre
ORDER BY genre_count DESC,
year ASC,
genre ASC
LIMIT 10"""
return query
Solution 5
def summarize_netflix_original_genres() -> str:
query = '''
SELECT STRFTIME('%Y', premiere) AS year, genre, count(*) as genre_count
FROM nflx_originals
GROUP BY genre, year
Order By genre_count DESC, year ASC, genre ASC
LIMIT 10
'''
return query
Solution 6
def summarize_netflix_original_genres()-> str:
query = '''
SELECT strftime("%Y", premiere) as year, genre, count(*) as genre_count
FROM nflx_originals
GROUP BY year, genre
ORDER BY genre_count DESC, year ASC, genre ASC
LIMIT 10
'''
return query
Solution 7
def summarize_netflix_original_genres()-> str:
query = '''
select strftime('%Y', premiere) year, genre, count(*) genre_count
from nflx_originals
group by year, genre
order by genre_count desc, year, genre
limit 10
'''
return query
Solution 8
def summarize_netflix_original_genres()-> str:
query = '''
    SELECT strftime('%Y', premiere) AS year, genre, COUNT(*) AS genre_count
FROM nflx_originals
GROUP BY year, genre
ORDER BY genre_count DESC, year ASC, genre ASC
LIMIT 10
'''
return query
Exercise 7
Solution 1
def calc_netflix_top10_scores() -> pd.DataFrame:
query = '''
SELECT score.title, total_score, occurrence, (CAST(total_score AS float) / CAST(occurrence AS float)) AS avg_score
FROM
(SELECT title, SUM(11 - rank) AS total_score
FROM nflx_top
WHERE rank <= 10
GROUP BY title
ORDER BY total_score DESC
LIMIT 10) score
JOIN
(SELECT title, COUNT(*) AS occurrence
FROM nflx_top
GROUP BY title) occ
ON score.title=occ.title
'''
return query
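The scoring scheme awards 11 - rank points for each day a title appears in the top 10 (rank 1 earns 10 points, rank 10 earns 1). For example:
ranks = [1, 1, 3]                          # ranked 1st, 1st, then 3rd over three days
total_score = sum(11 - r for r in ranks)   # 28
avg_score = total_score / len(ranks)       # ≈ 9.33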
Solution 2
def calc_netflix_top10_scores() -> pd.DataFrame:
query = '''
WITH scores AS (
SELECT SUM(11 - rank) AS total_score, title, count(title) as occurrence
FROM nflx_top
GROUP BY title
ORDER BY total_score desc
)
SELECT title, total_score, occurrence, CAST(total_score AS FLOAT)/CAST(occurrence AS FLOAT) as avg_score
FROM scores
LIMIT 10
'''
return query
Solution 3
def calc_netflix_top10_scores() -> pd.DataFrame:
    query = '''
WITH total_score_calc AS (
SELECT title, SUM(scores) AS total_score, COUNT(scores>0) AS occurrence, AVG(scores) AS avg_score
FROM (
SELECT
title,
CASE
WHEN rank=1 THEN 10
WHEN rank=2 THEN 9
WHEN rank=3 THEN 8
WHEN rank=4 THEN 7
WHEN rank=5 THEN 6
WHEN rank=6 THEN 5
WHEN rank=7 THEN 4
WHEN rank=8 THEN 3
WHEN rank=9 THEN 2
WHEN rank=10 THEN 1
ELSE 0
END AS scores
FROM nflx_top
)
GROUP BY title
)
SELECT *
FROM total_score_calc
ORDER BY total_score DESC, title DESC
LIMIT 10
'''
return query
Solution 4
def calc_netflix_top10_scores() -> pd.DataFrame:
query = """
select title,
sum(11-rank) as total_score,
count(*) as occurrence,
cast(sum(11-rank) as float)/count(*) as avg_score
from nflx_top
group by title
order by total_score desc
limit 10;
"""
return query
Solution 5
def calc_netflix_top10_scores() -> pd.DataFrame:
query = '''
Select title, Sum(11-Rank) as total_score, count(*) as occurrence, avg(11-Rank) as avg_score
from nflx_top
group by title
order by total_score desc, title desc
Limit 10;
'''
return query
Solution 6
def calc_netflix_top10_scores() -> pd.DataFrame:
query = '''
SELECT top.title as title, sum(scored.score) as total_score, count(*) as occurrence, avg(scored.score) as avg_score
FROM nflx_top top
JOIN
(SELECT date, rank, title, (11 - rank) as score
FROM nflx_top) AS scored
ON top.date = scored.date AND top.title = scored.title
GROUP BY top.title
ORDER BY total_score DESC
LIMIT 10
'''
return query
Solution 7
def calc_netflix_top10_scores() -> pd.DataFrame:
query = '''
select title, sum(11-rank) total_score, count(*) occurrence, sum(11-rank) * 1.0/count(*) avg_score
from nflx_top
where rank <= 10
group by title
order by total_score desc, title desc
limit 10
'''
return query
Solution 8
def calc_netflix_top10_scores() -> pd.DataFrame:
query = '''
WITH ShowScores AS (
SELECT title, 11 - rank AS score
FROM nflx_top
WHERE rank <= 10
)
SELECT title, SUM(score) AS total_score, COUNT(*) AS occurrence, CAST(SUM(score) AS float)/count(*) as avg_score
FROM ShowScores
GROUP BY title
ORDER BY total_score DESC, title DESC
LIMIT 10
'''
return query
Exercise 8
Solution 1
def longterm_avg_score():
query = '''
SELECT scores.title, total_score / (JulianDay(max_date) - JulianDay(min_date)) AS longterm_avg_score
FROM
(SELECT title, SUM(11 - rank) AS total_score
FROM nflx_top
WHERE rank <= 10
GROUP BY title
ORDER BY total_score DESC) scores
JOIN
(SELECT title, MAX(date) AS max_date, MIN(date) AS min_date
FROM
(SELECT top.title, rank, date, premiere
FROM nflx_top top
JOIN nflx_originals orig
ON top.title=orig.title)
WHERE date >= premiere
GROUP BY title) dates
ON scores.title=dates.title
ORDER BY longterm_avg_score DESC, scores.title
LIMIT 10
'''
return query
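julianday() converts a date to a fractional day count, so differencing two julianday() values gives the number of days between them, which is the denominator normalizing the score here. For example:
import sqlite3

conn = sqlite3.connect(':memory:')
days = conn.execute("SELECT julianday('2021-01-10') - julianday('2021-01-01')").fetchone()[0]
print(days)   # 9.0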
Solution 2
def longterm_avg_score():
    query = '''
WITH total_score_calc AS (
SELECT nflx_originals.title, SUM(scores) AS total_score, nflx_release_date
FROM (
SELECT
title,
(11 - rank) AS scores,
nflx_release_date
FROM nflx_top
) AS top_scores
JOIN nflx_originals ON nflx_originals.title = top_scores.title
GROUP BY nflx_originals.title, nflx_release_date
),
poss_days AS (
SELECT
premiere,
COUNT(DISTINCT date) AS num_poss
FROM nflx_top
CROSS JOIN nflx_originals
WHERE date >= premiere
GROUP BY premiere
ORDER BY premiere DESC
)
SELECT title, CAST(total_score AS float) / CAST(num_poss AS float) AS longterm_avg_score
FROM total_score_calc
JOIN poss_days ON premiere = nflx_release_date
ORDER BY longterm_avg_score DESC
LIMIT 10
'''
return query
Solution 3
def longterm_avg_score():
query = '''
SELECT t.title, t.total_score/o.uptime as longterm_avg_score
FROM (
SELECT title, SUM(11 - rank) AS total_score
FROM nflx_top
GROUP BY title
) AS t
LEFT JOIN (
SELECT title, julianday('now') - julianday(premiere) AS uptime
FROM nflx_originals
) AS o
ON t.title = o.title
order by longterm_avg_score desc
limit 10
'''
return query
Solution 4
def longterm_avg_score():
query = '''
WITH all_days AS
(SELECT *
FROM nflx_originals o
CROSS JOIN (SELECT DISTINCT date FROM nflx_top) AS t
WHERE t.date >= o.premiere),
scored AS
(SELECT *, CASE WHEN rank is NULL THEN 0 ELSE 11 - rank END AS score
FROM all_days a
LEFT JOIN nflx_top t
ON a.date = t.date AND a.title = t.title)
SELECT title, AVG(score) as longterm_avg_score
FROM scored
GROUP BY title
ORDER BY longterm_avg_score DESC
limit 10
'''
return query
Solution 5
def longterm_avg_score():
query = """
SELECT
no.title,
SUM(CASE WHEN nt.rank IS NOT NULL THEN 11 - nt.rank ELSE 0 END) * 1.0 / COUNT(DISTINCT nt.date) AS longterm_avg_score
FROM nflx_originals no
LEFT JOIN nflx_top nt ON no.title = nt.title AND nt.date >= no.premiere
GROUP BY no.title
ORDER BY longterm_avg_score DESC, no.title ASC
LIMIT 10;
"""
return query
Exercise 9
Solution 1
def construct_model_inputs(normalized_growths_df: pd.DataFrame, show_scores_df: pd.DataFrame) -> pd.DataFrame:
df_g = normalized_growths_df.copy()
df_s = show_scores_df.copy()
df_s = df_s.pivot(index='date', columns='title', values='score').reset_index().fillna(0).rename(columns={'date':'Date'})
df_s['Date'] = df_s['Date'].astype('str')
df_g['Date'] = df_g['Date'].astype('str')
df = pd.merge(df_s, df_g, on='Date')
df = df.drop(columns=['Date', 'Tech_Avg', 'NFLX', 'AMZN', 'META', 'AAPL', 'GOOG'])
return df
Solution 2
def construct_model_inputs(normalized_growths_df: pd.DataFrame, show_scores_df: pd.DataFrame) -> pd.DataFrame:
growth_df = normalized_growths_df.copy()
score_df = show_scores_df.copy()
pivot_scores = score_df.pivot(index='date', columns='title', values='score').reset_index().fillna(0)
model_matrix = pd.merge(pivot_scores, growth_df, left_on = 'date', right_on = 'Date')
model_matrix = model_matrix.drop(columns = ['Date', 'date', 'NFLX', 'AMZN', 'META', 'AAPL', 'GOOG', 'Tech_Avg'])
return model_matrix
Solution 3
def construct_model_inputs(normalized_growths_df: pd.DataFrame, show_scores_df: pd.DataFrame) -> pd.DataFrame:
pivoted_df = show_scores_df.pivot(index='date', columns='title', values='score')
pivoted_df.fillna(0, inplace=True)
pivoted_df = pivoted_df.reset_index(drop=False)
normalized_small_df = normalized_growths_df[['Date', 'Relative_Growth']]
model_matrix = pd.merge(normalized_small_df, pivoted_df, left_on = 'Date', right_on='date')
del model_matrix['date']
del model_matrix['Date']
return model_matrix
Solution 4
def construct_model_inputs(normalized_growths_df: pd.DataFrame, show_scores_df: pd.DataFrame) -> pd.DataFrame:
cast = show_scores_df.pivot(index=["date"], columns=["title"], values="score")
cast.fillna(0.0, inplace=True)
growths = normalized_growths_df[["Date", "Relative_Growth"]]
cast = growths.merge(cast, left_on="Date", right_on="date")
del cast['Date']
return cast
Solution 5
def construct_model_inputs(normalized_growths_df: pd.DataFrame, show_scores_df: pd.DataFrame) -> pd.DataFrame:
pivoted = show_scores_df.pivot(index='date', columns='title', values='score')
pivoted = pivoted.reset_index()
growth_table = normalized_growths_df[['Date', 'Relative_Growth']]
growth_table = growth_table.rename(columns={'Date': 'date'})
results = pd.merge(growth_table, pivoted, on='date')
results.drop('date', axis=1, inplace=True)
# convert NaN to 0
results = results.fillna(0)
return results
Solution 6
def construct_model_inputs(normalized_growths_df: pd.DataFrame, show_scores_df: pd.DataFrame) -> pd.DataFrame:
df1 = normalized_growths_df.copy()
df2 = show_scores_df.copy()
df1['d']=df1['Date'].dt.date
df2['d']=df2['date'].dt.date
df = df2.merge(df1, how='left', on='d')
df = df.pivot(index='Relative_Growth', columns='title', values='score')
df.columns.name = None
df = df.fillna(0)
df = df.sort_values(by='Relative_Growth', ascending=False)
df = df.reset_index()
return df