SP2026 MT2 TA Alternative Solutions¶
Ex0¶
In [ ]:
vle_activity_summary_query = '''SELECT activity_type, COUNT(*) AS resource_count,
COUNT(DISTINCT code_module || code_presentation) AS courses_using
FROM vle
GROUP BY activity_type
ORDER BY resource_count DESC, courses_using ASC'''
In [ ]:
vle_activity_summary_query = '''
SELECT activity_type, COUNT(DISTINCT v.id) as resource_count, COUNT(DISTINCT c.id) as courses_using
FROM vle AS v
JOIN courses AS c
ON v.code_module = c.code_module
AND v.code_presentation = c.code_presentation
GROUP BY activity_type
ORDER BY resource_count DESC, courses_using ASC
'''
In [ ]:
vle_activity_summary_query = '''
select activity_type, count(*) as resource_count,
count(distinct code_module || code_presentation) as courses_using
from vle
group by activity_type
order by 2 desc, 3 asc
'''
In [ ]:
SELECT activity_type,
COUNT(*) AS resource_count,
COUNT(DISTINCT course_pres) AS courses_using
FROM
(
SELECT id,
activity_type,
code_module || '' || code_presentation AS course_pres
FROM vle
) AS sub
GROUP BY 1
ORDER BY 2 DESC, 3 ASC
In [ ]:
vle_activity_summary_query = '''
SELECT
activity_type,
COUNT(*)AS resource_count,
COUNT(DISTINCT code_module || ' '|| code_presentation)AS courses_using
FROM vle
GROUP BY activity_type
ORDER BY resource_count DESC, courses_using ASC
'''
In [ ]:
vle_activity_summary_query =
'''SELECT activity_type, Count(*) as resource_count, count(DISTINCT temp) as courses_using
FROM (SELECT id, activity_type, code_module || '_' || code_presentation as temp FROM vle)
GROUP BY activity_type
ORDER BY resource_count DESC, courses_using ASC'''
In [ ]:
vle_activity_summary_query = '''
WITH temp1 AS(SELECT activity_type, COUNT(*) AS resource_count
FROM vle
GROUP BY activity_type),
temp2 AS(SELECT
activity_type,
COUNT(*)
FROM vle
GROUP BY activity_type, code_module, code_presentation),
temp3 AS(SELECT
activity_type,
COUNT(*) AS courses_using
FROM temp2
GROUP BY activity_type)
SELECT t1.activity_type, t1.resource_count, t3.courses_using
FROM temp1 t1
JOIN temp3 t3 ON t1.activity_type = t3.activity_type
ORDER BY t1.resource_count DESC, courses_using ASC
'''
In [ ]:
### Solution - Exercise 0
vle_activity_summary_query = '''
SELECT activity_type,
COUNT(*) AS resource_count,
COUNT(DISTINCT code_module || code_presentation) AS courses_using
FROM vle
WHERE 1 = 1
GROUP BY activity_type
ORDER BY resource_count DESC, courses_using ASC;
'''
Ex1¶
In [ ]:
course_pass_rate_query = '''
SELECT courses.code_module, courses.code_presentation, COUNT(*) AS total_students,
SUM(CASE WHEN final_result = 'Pass' OR final_result = 'Distinction' THEN 1 ELSE 0 END) AS passed_students,
ROUND(100.0 * SUM(CASE WHEN final_result = 'Pass' OR final_result = 'Distinction' THEN 1 ELSE 0 END)/COUNT(*), 2) AS pass_rate
FROM courses
JOIN studentInfo
ON courses.code_module = studentInfo.code_module AND courses.code_presentation = studentInfo.code_presentation
GROUP BY courses.code_module, courses.code_presentation
HAVING COUNT(*) >= 500
ORDER BY pass_rate DESC, courses.code_presentation ASC
LIMIT 10
'''
In [ ]:
course_pass_rate_query = '''
SELECT c.code_module, c.code_presentation,
COUNT(DISTINCT id_student) as total_students,
SUM(CASE WHEN final_result = 'Pass' OR final_result = 'Distinction' THEN 1 ELSE 0 END) as passed_students,
ROUND((SUM(CASE WHEN final_result = 'Pass' OR final_result = 'Distinction' THEN 1.0 ELSE 0 END) / COUNT(DISTINCT id_student)) * 100.0,2) as pass_rate
FROM courses as c
JOIN studentInfo as s
ON s.code_presentation = c.code_presentation
AND s.code_module = c.code_module
GROUP BY c.code_module, c.code_presentation
HAVING total_students >= 500
ORDER BY pass_rate DESC, c.code_presentation ASC
LIMIT 10
'''
In [ ]:
course_pass_rate_query = '''
select c.code_module,
c.code_presentation,
count(*) as total_students,
sum(case when s.final_result in ('Pass', 'Distinction') then 1 else 0 end) as passed_students,
round(100.0 * sum(case when s.final_result in ('Pass', 'Distinction') then 1 else 0 end) / count(*), 2) as pass_rate
from courses c
join studentInfo s
on c.code_module = s.code_module
and c.code_presentation = s.code_presentation
group by c.code_module, c.code_presentation
having count(*) >= 500
order by
5 desc, 1 asc, 2 asc
'''
In [ ]:
course_pass_rate_query = '''
SELECT c.code_module,
c.code_presentation,
COUNT(*) AS total_students,
SUM(CASE WHEN s.final_result IN ('Pass', 'Distinction') THEN 1 ELSE 0 END) AS passed_students,
ROUND((SUM(CASE WHEN s.final_result IN ('Pass', 'Distinction') THEN 1 ELSE 0 END) * 1.0 / COUNT(*) ) * 100, 2) AS pass_rate
FROM courses AS c
LEFT JOIN studentInfo AS s
ON c.code_module = s.code_module
AND c.code_presentation = s.code_presentation
GROUP BY 1, 2
HAVING total_students > 500
ORDER BY 5 DESC, 2 ASC
'''
In [ ]:
course_pass_rate_query = '''
SELECT
*,
ROUND(passed_students*100.0/total_students,2)AS pass_rate
FROM
(SELECT
c.code_module,
c.code_presentation,
COUNT(*)AS total_students,
COUNT(CASE WHEN final_result = 'Pass' OR final_result = 'Distinction' THEN 1 END)AS passed_students
FROM courses AS c
JOIN studentInfo AS s
ON c.code_module = s.code_module AND c.code_presentation = s.code_presentation
GROUP BY c.code_module,c.code_presentation
HAVING COUNT(*)>=500) AS t
ORDER BY pass_rate DESC, code_presentation ASC
'''
In [ ]:
course_pass_rate_query = '''
With temp AS (SELECT Count(s.Id) as total_students, s.code_module, s.code_presentation,
COUNT(CASE WHEN s.final_result = 'Pass' OR s.final_result = 'Distinction' THEN 1 ELSE NULL END) as passed_students
FROM studentInfo as s
JOIN courses as c ON s.code_module = c.code_module and s.code_presentation = c.code_presentation
GROUP BY s.code_module, s.code_presentation
Having total_students >= 500)
SELECT
code_module,
code_presentation,
total_students,
passed_students,
Round(CAST(passed_students AS FLOAT)/total_students*100.0, 2) as pass_rate
FROM temp
ORDER BY pass_rate desc, code_presentation asc
'''
In [ ]:
course_pass_rate_query = '''
WITH temp1 AS(
SELECT *
FROM courses c
JOIN studentInfo s
ON c.code_module = s.code_module AND c.code_presentation = s.code_presentation),
temp2 AS(
SELECT code_module, code_presentation, COUNT(*) AS total_students
FROM temp1
GROUP BY code_module, code_presentation),
temp3 AS(SELECT code_module, code_presentation, count(*) AS passed_students
FROM temp1
WHERE final_result = 'Pass' OR final_result = 'Distinction'
GROUP BY code_module, code_presentation)
SELECT temp2.code_module, temp2.code_presentation, temp2.total_students, temp3.passed_students,
ROUND((temp3.passed_students*100.0/temp2.total_students), 2) AS pass_rate
FROM temp2
JOIN temp3
ON temp2.code_module = temp3.code_module AND temp2.code_presentation = temp3.code_presentation
WHERE temp2.total_students >= 500
ORDER BY pass_rate DESC, temp2.code_presentation ASC
'''
In [ ]:
### Solution - Exercise 1
course_pass_rate_query = '''
SELECT c.code_module,
c.code_presentation,
COUNT(DISTINCT si.id_student) AS total_students,
SUM(CASE WHEN si.final_result IN ('Pass', 'Distinction') THEN 1 END) AS passed_students,
ROUND(
SUM(CASE WHEN si.final_result IN ('Pass', 'Distinction') THEN 1.0 END) / COUNT(DISTINCT si.id_student) * 100
, 2) AS pass_rate
FROM courses AS c
INNER JOIN studentInfo AS si ON c.code_module = si.code_module AND c.code_presentation = si.code_presentation
WHERE 1 = 1
GROUP BY c.code_module, c.code_presentation
HAVING COUNT(DISTINCT si.id_student) >= 500
ORDER BY pass_rate DESC, c.code_presentation ASC
'''
Ex2¶
In [ ]:
DEBUG_early_late_submitters_query = '''
SELECT
CASE
WHEN sa.date_submitted <= 25 THEN 'Early'
ELSE 'Late'
END AS submission_category,
COUNT(sa.id_student) AS student_count,
ROUND(AVG(CASE WHEN sv.total_clicks IS NULL THEN 0 ELSE sv.total_clicks END), 2) AS avg_total_clicks,
ROUND(AVG(CASE WHEN sv.days_active IS NULL THEN 0 ELSE sv.days_active END), 2) AS avg_days_active,
ROUND(AVG(sa.date_submitted), 2) AS avg_submission_day
FROM studentAssessment sa
INNER JOIN assessments a ON sa.id_assessment = a.id_assessment
LEFT JOIN (
SELECT
id_student,
code_module,
code_presentation,
SUM(sum_click) as total_clicks,
COUNT(DISTINCT date) as days_active
FROM studentVle
WHERE code_module = 'BBB' AND code_presentation = '2013B'
GROUP BY id_student, code_module, code_presentation
) sv
ON sa.id_student = sv.id_student
AND a.code_module = sv.code_module
AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
GROUP BY submission_category
ORDER BY submission_category;
'''
In [ ]:
SELECT
CASE
WHEN sa.date_submitted <= 25 THEN 'Early'
WHEN sa.date_submitted > 25 THEN 'Late'
END AS submission_category,
COUNT(DISTINCT sa.id_student) AS student_count,
ROUND(AVG(CASE WHEN sv.total_clicks IS NULL THEN 0 ELSE sv.total_clicks END),2) AS avg_total_clicks,
ROUND(AVG(CASE WHEN sv.days_active IS NULL THEN 0 ELSE sv.days_active END),2) AS avg_days_active,
CASE
WHEN sa.date_submitted IS NULL THEN 0
ELSE ROUND(AVG(sa.date_submitted),2) END AS avg_submission_day
FROM studentAssessment sa
INNER JOIN assessments a
ON sa.id_assessment = a.id_assessment
LEFT JOIN (
SELECT
id_student,
code_module,
code_presentation,
SUM(sum_click) AS total_clicks,
COUNT(DISTINCT date) AS days_active
FROM studentVle
WHERE code_module = 'BBB'
AND code_presentation = '2013B'
GROUP BY id_student, code_module, code_presentation
) sv
ON sa.id_student = sv.id_student
AND a.code_module = sv.code_module
AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
GROUP BY submission_category
ORDER BY submission_category;
'''
In [ ]:
DEBUG_early_late_submitters_query = '''
SELECT
CASE
WHEN sa.date_submitted <= 25 THEN 'Early'
ELSE 'Late'
END AS submission_category,
COUNT(distinct sa.id_student) AS student_count,
round(AVG(coalesce(sv.total_clicks, 0)), 2) AS avg_total_clicks,
round(AVG(coalesce(sv.days_active, 0)), 2) AS avg_days_active,
round(AVG(sa.date_submitted), 2) AS avg_submission_day
FROM studentAssessment sa
INNER JOIN assessments a ON sa.id_assessment = a.id_assessment
left JOIN (
SELECT
id_student,
code_module,
code_presentation,
SUM(sum_click) as total_clicks,
COUNT(distinct date) as days_active
FROM studentVle
GROUP BY id_student, code_module, code_presentation
) sv
ON sa.id_student = sv.id_student
AND a.code_module = sv.code_module
AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
and a.code_module = 'BBB'
and a.code_presentation = '2013B'
GROUP BY submission_category
ORDER BY submission_category;
'''
In [ ]:
DEBUG_early_late_submitters_query = '''
SELECT
CASE
WHEN sa.date_submitted <= 25 THEN 'Early'
ELSE 'Late'
END AS submission_category,
COUNT(sa.id_student) AS student_count,
ROUND(AVG(COALESCE(sv.total_clicks, 0)), 2) AS avg_total_clicks,
ROUND(AVG(COALESCE(sv.days_active, 0)), 2) AS avg_days_active,
ROUND(AVG(sa.date_submitted), 2) AS avg_submission_day
FROM studentAssessment sa
INNER JOIN assessments a ON sa.id_assessment = a.id_assessment
LEFT JOIN (
SELECT
id_student,
code_module,
code_presentation,
SUM(sum_click) as total_clicks,
COUNT(DISTINCT date) as days_active
FROM studentVle
WHERE code_module = 'BBB' AND code_presentation = '2013B'
GROUP BY id_student, code_module, code_presentation
) sv
ON sa.id_student = sv.id_student
AND a.code_module = sv.code_module
AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
AND a.code_module = 'BBB'
AND a.code_presentation = '2013B'
GROUP BY submission_category
ORDER BY submission_category;
'''
In [ ]:
DEBUG_early_late_submitters_query = '''
SELECT
CASE
WHEN sa.date_submitted <= 25 THEN 'Early'
ELSE 'Late'
END AS submission_category,
COUNT(sa.id_student) AS student_count,
ROUND(AVG(COALESCE(sv.total_clicks,0)),2) AS avg_total_clicks,
ROUND(AVG(COALESCE(sv.days_active,0)),2) AS avg_days_active,
ROUND(AVG(sa.date_submitted),2) AS avg_submission_day
FROM studentAssessment sa
LEFT JOIN assessments a ON sa.id_assessment = a.id_assessment
LEFT JOIN (
SELECT
id_student,
code_module,
code_presentation,
SUM(sum_click) as total_clicks,
COUNT(DISTINCT date) as days_active
FROM studentVle
WHERE code_module ='BBB' AND code_presentation = '2013B'
GROUP BY id_student, code_module, code_presentation
) sv
ON sa.id_student = sv.id_student
AND a.code_module = sv.code_module
AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
GROUP BY submission_category
ORDER BY submission_category;
'''
In [ ]:
DEBUG_early_late_submitters_query = '''
SELECT
CASE
WHEN sa.date_submitted <= 25 THEN 'Early'
ELSE 'Late'
END AS submission_category,
COUNT(sa.id_student) AS student_count,
ROUND(AVG(COALESCE(sv.total_clicks, 0)), 2) AS avg_total_clicks,
ROUND(AVG(COALESCE(sv.days_active, 0)), 2) AS avg_days_active,
ROUND(AVG(sa.date_submitted), 2) AS avg_submission_day
FROM studentAssessment sa
INNER JOIN assessments a ON sa.id_assessment = a.id_assessment
LEFT JOIN (
SELECT
id_student,
code_module,
code_presentation,
COALESCE(SUM(sum_click), 0) as total_clicks,
COALESCE(COUNT(DISTINCT date), 0) as days_active
FROM studentVle
WHERE code_module = 'BBB' and code_presentation = '2013B'
GROUP BY id_student, code_module, code_presentation
) sv
ON sa.id_student = sv.id_student
AND a.code_module = sv.code_module
AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
GROUP BY submission_category
ORDER BY submission_category;
'''
In [ ]:
DEBUG_early_late_submitters_query = '''
SELECT
CASE WHEN sa.date_submitted <= 25 THEN 'Early'
WHEN sa.date_submitted > 25 THEN 'Late'
END AS submission_category,
COUNT(sa.id_student) AS student_count,
ROUND(AVG(IFNULL(sv.click_total, 0)), 2) AS avg_total_clicks,
ROUND(AVG(IFNULL(sv.distinct_date, 0)), 2) AS avg_days_active,
ROUND(AVG(sa.date_submitted), 2) AS avg_submission_day
FROM studentAssessment sa
JOIN assessments a ON sa.id_assessment = a.id_assessment
LEFT JOIN(
SELECT id_student, code_module, code_presentation, IFNULL(SUM(sum_click), 0.0) AS click_total, IFNULL(COUNT(DISTINCT date), 0.0) AS distinct_date
FROM studentVLE
WHERE code_module = 'BBB' AND code_presentation = '2013B'
GROUP BY id_student, code_module, code_presentation) sv
ON sa.id_student = sv.id_student AND a.code_module = sv.code_module AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
GROUP BY submission_category
'''
In [ ]:
### Solution - Exercise 2
DEBUG_early_late_submitters_query = '''
SELECT
CASE
WHEN sa.date_submitted <= 25 THEN 'Early'
ELSE 'Late'
END AS submission_category,
COUNT(DISTINCT sa.id_student) AS student_count,
ROUND(AVG(COALESCE(sv.total_clicks, 0)), 2) AS avg_total_clicks,
ROUND(AVG(COALESCE(sv.days_active, 0)), 2) AS avg_days_active,
ROUND(AVG(sa.date_submitted), 2) AS avg_submission_day
FROM studentAssessment sa
INNER JOIN assessments a ON sa.id_assessment = a.id_assessment
LEFT JOIN (
SELECT
id_student,
code_module,
code_presentation,
SUM(sum_click) as total_clicks,
COUNT(DISTINCT date) as days_active
FROM studentVle
GROUP BY id_student, code_module, code_presentation
HAVING code_module = 'BBB' AND code_presentation = '2013B'
) sv
ON sa.id_student = sv.id_student
AND a.code_module = sv.code_module
AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
GROUP BY submission_category
ORDER BY submission_category;
'''
Ex3¶
In [ ]:
vle_engagement_risk_query = '''
WITH filtered_student_vle AS (
SELECT *
FROM studentVle
WHERE code_module = 'BBB' AND code_presentation = '2013J' AND date < 30)
SELECT filt.id_student, SUM(sum_click) AS total_clicks, COUNT(DISTINCT filt.date) AS days_active,
CASE WHEN SUM(sum_click) < 50 OR COUNT(DISTINCT filt.date) < 5 THEN 1 ELSE 0 END AS at_risk_flag, final_result
FROM filtered_student_vle filt
JOIN studentInfo si
ON filt.id_student = si.id_student
AND filt.code_module = si.code_module
AND filt.code_presentation = si.code_presentation
GROUP BY filt.id_student, si.final_result
ORDER BY filt.id_student ASC, final_result DESC'''
In [ ]:
vle_engagement_risk_query = '''
SELECT v.id_student, total_clicks, days_active,
CASE
WHEN total_clicks < 50 OR days_active < 5 THEN 1
ELSE 0 END AS at_risk_flag,
final_result
FROM (SELECT *,
SUM(sum_click) as total_clicks,
COUNT(DISTINCT date) as days_active
FROM studentVle
WHERE code_module = 'BBB'
AND code_presentation = '2013J'
AND date < 30
GROUP BY id_student) as v
LEFT JOIN studentInfo as i
ON v.id_student = i.id_student
AND v.code_module = i.code_module
AND v.code_presentation = i.code_presentation
GROUP BY v.id_student, i.final_result
In [ ]:
vle_engagement_risk_query = '''
select sv.id_student,
sum(sv.sum_click) as total_clicks,
count(distinct sv.date) as days_active,
case when sum(sv.sum_click) < 50 or count(distinct sv.date) < 5 then 1 else 0 end as at_risk_flag,
si.final_result
from studentVle sv
join studentInfo si
on sv.id_student = si.id_student
and sv.code_module = si.code_module
and sv.code_presentation = si.code_presentation
where sv.code_module = 'BBB'
and sv.code_presentation = '2013J'
and sv.date < 30
group by sv.id_student, si.final_result
order by 1 asc, 5 asc
'''
In [ ]:
vle_engagement_risk_query = '''
SELECT v.id_student,
SUM(v.sum_click) AS total_clicks,
COUNT(DISTINCT v.date) AS days_active,
CASE
WHEN SUM(v.sum_click) < 50 OR COUNT(DISTINCT v.date) < 5 THEN 1
ELSE 0
END AS at_risk_flag,
s.final_result
FROM studentInfo AS s
JOIN
(
SELECT *
FROM studentVle
WHERE code_module = 'BBB'
AND code_presentation = '2013J'
AND date < 30
) AS v
ON s.id_student = v.id_student
AND s.code_module = v.code_module
AND s.code_presentation = v.code_presentation
GROUP BY v.id_student, s.final_result
ORDER BY 1 ASC, 4 ASC
'''
In [ ]:
vle_engagement_risk_query = '''
SELECT
sv.id_student,
SUM(sv.sum_click) AS total_clicks,
COUNT(DISTINCT sv.date) AS days_active,
(CASE
WHEN SUM(sv.sum_click) <50 OR COUNT(DISTINCT sv.date) <5 THEN 1
ELSE 0
END) AS at_risk_flag,
si.final_result
FROM
studentVle AS sv
JOIN
studentInfo AS si
on sv.id_student=si.id_student
AND sv.code_module=si.code_module
AND sv.code_presentation=si.code_presentation
WHERE
sv.code_module='BBB' AND sv.code_presentation='2013J' AND sv.date <30
GROUP BY
sv.id_student, si.final_result
ORDER BY
sv.id_student ASC, si.final_result ASC
'''
In [ ]:
vle_engagement_risk_query = '''
SELECT
sv.id_student,
SUM(sv.sum_click) AS total_clicks,
COUNT(DISTINCT sv.date) AS days_active,
CASE
WHEN SUM(sv.sum_click) < 50 OR COUNT(DISTINCT sv.date) < 5 THEN 1
ELSE 0
END AS at_risk_flag,
si.final_result
FROM studentVle AS sv
JOIN studentInfo AS si
ON sv.id_student = si.id_student
AND sv.code_module = si.code_module
AND sv.code_presentation = si.code_presentation
WHERE sv.code_module = 'BBB'
AND sv.code_presentation = '2013J'
AND sv.date < 30
GROUP BY sv.id_student, si.final_result
ORDER BY sv.id_student ASC, si.final_result ASC'''
In [ ]:
vle_engagement_risk_query = '''
SELECT
st.id_student,
SUM(st.sum_click) AS total_clicks,
COUNT(distinct st.date) AS days_active,
CASE WHEN SUM(st.sum_click) < 50 OR COUNT(DISTINCT st.date) < 5 THEN 1
ELSE 0
END AS at_risk_flag,
si.final_result
FROM
(SELECT *
FROM studentVle
WHERE code_module = 'BBB' AND code_presentation = '2013J' AND date < 30) st
JOIN studentInfo si
ON st.id_student = si.id_student
AND st.code_module = si.code_module
AND st.code_presentation = si.code_presentation
GROUP BY st.id_student, si.final_result
ORDER BY st.id_student, final_result
'''
In [ ]:
### Solution - Exercise 3
vle_engagement_risk_query = '''
SELECT sv.id_student,
SUM(sv.sum_click) AS total_clicks,
COUNT(DISTINCT sv.date) AS days_active,
(CASE WHEN (SUM(sv.sum_click) < 50 OR COUNT(DISTINCT sv.date) < 5) THEN 1 ELSE 0 END) AS at_risk_flag,
si.final_result
FROM studentVle AS sv
INNER JOIN studentInfo AS si ON si.id_student = sv.id_student
AND si.code_module = sv.code_module
AND si.code_presentation = sv.code_presentation
WHERE sv.code_module = 'BBB' AND sv.code_presentation = '2013J' AND sv.date < 30
GROUP BY sv.id_student, si.final_result
ORDER BY sv.id_student ASC, si.final_result ASC
'''
Ex4¶
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
students_df = students_df[STUDENT_FEATURES].set_index('id_student')
merged = students_df.merge(registrations_df[REGISTRATION_FEATURES], how = 'left', on = ['code_module', 'code_presentation', 'id_student'])
vle_feats = vle_df[vle_df.date < cutoff_day].groupby(['code_module', 'code_presentation', 'id_student']).agg(total_clicks=('sum_click', 'sum'), total_active_days=('date', 'nunique'))
vle_feats['avg_clicks_per_day'] = vle_feats['total_clicks']/vle_feats['total_active_days']
VLE_FEATS = list(vle_feats.columns)
merged = merged.merge(vle_feats, how = 'left', on = ['code_module', 'code_presentation', 'id_student'])
merged[VLE_FEATS] = merged[VLE_FEATS].fillna(0)
merged['days_registered_before_start'] = merged['date_registration']
merged = merged.astype({
'total_active_days': 'int64',
'total_clicks': 'int64'
})
return merged[RESULT_FEATURES].set_index('id_student')
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
df = students_df[STUDENT_FEATURES].set_index('id_student')
df = pd.merge(df, registrations_df[REGISTRATION_FEATURES], how='left', on=['code_module','code_presentation','id_student'])
vle_df_small = vle_df[vle_df['date'] < cutoff_day]
vle_df_small = vle_df_small.groupby(['code_module', 'code_presentation', 'id_student']).agg(total_clicks = ('sum_click', 'sum'),
total_active_days = ('date', 'nunique'))
vle_df_small['avg_clicks_per_day'] = vle_df_small['total_clicks'] / vle_df_small['total_active_days']
VLE = list(vle_df_small.columns)
df = pd.merge(df,vle_df_small, how='left', on=['code_module', 'code_presentation','id_student'])
df[VLE] = df[VLE].fillna(0)
df['days_registered_before_start'] = df['date_registration']
df = df.astype({
'total_clicks':'int64',
'total_active_days':'int64'
})
return df[RESULT_FEATURES].set_index('id_student')
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
df = students_df[STUDENT_FEATURES].copy()
df = df.merge(registrations_df[REGISTRATION_FEATURES],
on = ["code_module", 'code_presentation', 'id_student'],
how = 'left')
vle = vle_df[vle_df['date'] < cutoff_day]
vle = vle.groupby(['code_module', 'code_presentation', 'id_student']).agg(
total_clicks = ('sum_click', 'sum'), total_active_days = ('date', 'nunique')).reset_index()
vle['avg_clicks_per_day'] = vle['total_clicks'] / vle['total_active_days']
df = df.merge(vle, on=['code_module', 'code_presentation', 'id_student'],
how = 'left')
df[['total_clicks', 'total_active_days', 'avg_clicks_per_day']] = df[['total_clicks', 'total_active_days', 'avg_clicks_per_day']].fillna(0)
df['total_clicks'] = df['total_clicks'].astype('int64')
df['total_active_days'] = df['total_active_days'].astype('int64')
df['avg_clicks_per_day'] = df['avg_clicks_per_day'].astype('float64')
df['days_registered_before_start'] = df['date_registration']
df = df[RESULT_FEATURES]
df = df.set_index('id_student')
return df
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
###
students = students_df[STUDENT_FEATURES]
registrations = registrations_df[REGISTRATION_FEATURES]
features = pd.merge(students, registrations, on=['code_module', 'code_presentation', 'id_student'], how='left')
vle = vle_df[vle_df['date'] < cutoff_day]
vle_grouped = vle.groupby(['code_module', 'code_presentation', 'id_student']).agg(total_clicks=('sum_click', 'sum'),
total_active_days = ('date', 'nunique')).reset_index()
vle_grouped['avg_clicks_per_day'] = vle_grouped['total_clicks'] / vle_grouped['total_active_days']
merged = pd.merge(features, vle_grouped, how='left', on=['code_module', 'code_presentation', 'id_student'])
vle_cols = ['total_clicks', 'total_active_days', 'avg_clicks_per_day']
merged[vle_cols] = merged[vle_cols].fillna(0)
dtype_map = {
'total_clicks':'int64',
'total_active_days':'int64',
'avg_clicks_per_day':'float64'
}
merged = merged.astype(dtype_map)
merged['days_registered_before_start'] = merged['date_registration']
merged = merged[RESULT_FEATURES]
merged.set_index('id_student', inplace=True)
return merged
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
students_df = students_df[STUDENT_FEATURES]
registrations_df = registrations_df[REGISTRATION_FEATURES]
students_registration_merged = pd.merge(students_df,registrations_df,on=['code_module','code_presentation','id_student'],how='left')
vle_df = vle_df[vle_df['date']<cutoff_day]
vle_df = vle_df.groupby(['code_module','code_presentation','id_student'],as_index=False).agg(total_clicks=('sum_click','sum'),total_active_days=('date','nunique'))
vle_df['avg_clicks_per_day'] = vle_df['total_clicks']/vle_df['total_active_days']
column_name = vle_df.columns.tolist()
feature_vle_merged = pd.merge(students_registration_merged,vle_df,on=['code_module','code_presentation','id_student'],how='left')
feature_vle_merged[column_name] = feature_vle_merged[column_name].fillna(0)
feature_vle_merged = feature_vle_merged.astype({'total_clicks':'int64','total_active_days':'int64','avg_clicks_per_day':'float64'})
feature_vle_merged['days_registered_before_start']=feature_vle_merged['date_registration']
result = feature_vle_merged[RESULT_FEATURES]
result = result.set_index('id_student')
return result
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
###
### YOUR CODE HERE
###
df = students_df[STUDENT_FEATURES].copy()
reg_subset = registrations_df[REGISTRATION_FEATURES]
df = df.merge(reg_subset, on=['code_module', 'code_presentation', 'id_student'], how='left')
vle = vle_df[vle_df['date'] < cutoff_day].copy()
vle_stats = vle.groupby(['code_module', 'code_presentation', 'id_student']).agg(
total_clicks=('sum_click', 'sum'),
total_active_days=('date', 'nunique')
).reset_index()
vle_stats['avg_clicks_per_day'] = vle_stats['total_clicks'] / vle_stats['total_active_days']
df = df.merge(vle_stats, on=['code_module', 'code_presentation', 'id_student'], how='left')
cols = ['total_clicks', 'total_active_days', 'avg_clicks_per_day']
df[cols] = df[cols].fillna(0)
df['total_clicks'] = df['total_clicks'].astype('int64')
df['total_active_days'] = df['total_active_days'].astype('int64')
df['avg_clicks_per_day'] = df['avg_clicks_per_day'].astype('float64')
df['days_registered_before_start'] = df['date_registration']
#print(df.set_index('id_student'))
#print(RESULT_FEATURES)
return df[RESULT_FEATURES].set_index('id_student')
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
students_df_1 = students_df.copy()
students_df_1 = students_df_1[STUDENT_FEATURES]
registrations_df_1 = registrations_df.copy()
registrations_df_1 = registrations_df_1[REGISTRATION_FEATURES]
merged = students_df_1.merge(registrations_df_1, on = ['code_module', 'code_presentation', 'id_student'], how = 'left')
#step 3
vle_df_1 = vle_df.copy()
vle_df_1 = vle_df_1[vle_df_1['date'] < cutoff_day]
grouped = vle_df_1.groupby(['code_module', 'code_presentation', 'id_student'])\
.agg(total_clicks = ('sum_click', 'sum'),
total_active_days = ('date', 'nunique'))\
.reset_index()
grouped['avg_clicks_per_day'] = grouped['total_clicks']/grouped['total_active_days']
#step4
merged_new = merged.merge(grouped, on = ['code_module', 'code_presentation', 'id_student'], how = 'left')
merged_new[['total_clicks', 'total_active_days', 'avg_clicks_per_day']] = merged_new[['total_clicks', 'total_active_days', 'avg_clicks_per_day']].fillna(0)
merged_new = merged_new.astype({'total_clicks': int,
'total_active_days': int,
'avg_clicks_per_day': float})
merged_new['days_registered_before_start'] = merged_new['date_registration']
output = merged_new[RESULT_FEATURES]
output = output.set_index('id_student')
return output
In [ ]:
### Solution - Exercise 4
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
res = pd.merge(left=students_df[STUDENT_FEATURES],
right=registrations_df[REGISTRATION_FEATURES],
how='left',
on=['code_module', 'code_presentation', 'id_student'])
vle_processed = (
vle_df.query('`date` < @cutoff_day')
.groupby(by=['code_module', 'code_presentation', 'id_student'])
.agg({'sum_click': 'sum', 'date': 'nunique'})
.rename({'sum_click': 'total_clicks', 'date': 'total_active_days'}, axis=1)
.assign(avg_clicks_per_day = lambda x: x['total_clicks'] / x['total_active_days'])
.reset_index(drop=False)
)
res = (
pd.merge(
left=res,
right=vle_processed,
how='left',
on=['code_module', 'code_presentation', 'id_student']
).fillna(
{'total_clicks': 0, 'total_active_days': 0, 'avg_clicks_per_day': 0}
).astype(
{'total_clicks': np.int64, 'total_active_days': np.int64, 'avg_clicks_per_day': np.float64}
).rename(
{'date_registration': 'days_registered_before_start'},
axis=1
)
)[RESULT_FEATURES].set_index('id_student')
return res
Ex5¶
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
student_assessments_df = student_assessments_df[student_assessments_df['id_assessment'] == first_assessment_id]
student_assessments_df = student_assessments_df[(student_assessments_df['date_submitted'] >= label_start_day) & (student_assessments_df['date_submitted'] <= label_end_day)]
student_assessments_df = student_assessments_df[['id_student', 'code_module', 'code_presentation', 'date_submitted']]
merged = students_df.merge(student_assessments_df, how = 'left', on=['id_student', 'code_module', 'code_presentation'])
merged['label'] = merged['date_submitted'].notna().astype(int)
return merged[['id_student', 'label', 'code_module', 'code_presentation']].set_index('id_student')
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
df = student_assessments_df[student_assessments_df.id_assessment == first_assessment_id]
df = df[(student_assessments_df.date_submitted >= label_start_day) & (student_assessments_df.date_submitted <= label_end_day)]
COLS = ['id_student', 'code_module', 'code_presentation', 'date_submitted']
df = df[COLS]
merge_df = pd.merge(students_df, df, how='left', on=['id_student', 'code_module', 'code_presentation'])
merge_df['label'] = merge_df['date_submitted'].notna().astype(int)
new_cols = ['id_student','code_module','code_presentation','label']
merge_df = merge_df[new_cols].set_index('id_student')
return merge_df
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
df = student_assessments_df[student_assessments_df['id_assessment'] == first_assessment_id].copy()
submit_df = df[(df['date_submitted'] >= label_start_day) &
(df['date_submitted'] <= label_end_day)][['id_student', 'code_module', 'code_presentation']]
submit_df['sumbit'] = 1
df_2 = students_df[['id_student', 'code_module', 'code_presentation']].copy()
final = df_2.merge(submit_df, on=['id_student', 'code_module', 'code_presentation'],
how='left')
final['label'] = final['sumbit'].fillna(0)
final = final[['id_student', 'code_module', 'code_presentation', 'label']]
final['label'] = final['label'].astype('int64')
final = final.set_index('id_student')
return final
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
###
submission = student_assessments_df[student_assessments_df['id_assessment'] == first_assessment_id]
submission = submission[ (label_start_day <= submission['date_submitted']) & (label_end_day >= submission['date_submitted'])]
submission = submission[['id_student', 'code_module', 'code_presentation']]
submission['label'] = 1
merged = pd.merge(students_df[['id_student', 'code_module', 'code_presentation']], submission, how='left', on=['id_student', 'code_module', 'code_presentation'] )
merged['label'] = merged['label'].fillna(0)
merged['label'] = merged['label'].astype('int')
merged = merged.set_index('id_student')
return merged[['code_module', 'code_presentation', 'label']]
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
sa = student_assessments_df[student_assessments_df['id_assessment']==first_assessment_id]
sa = sa[sa['date_submitted'].between(label_start_day, label_end_day)]
sa = sa[['id_student','code_module','code_presentation']]
students_df = students_df[['id_student','code_module','code_presentation']]
df = pd.merge(students_df,sa,on=['id_student','code_module','code_presentation'],how='left',indicator=True)
df['label'] = (df['_merge']=='both').astype(int)
df = df.drop(columns=['_merge'])
df=df.set_index(['id_student'])
return df
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
###
### YOUR CODE HERE
###
sa_filtered = student_assessments_df[student_assessments_df['id_assessment'] == first_assessment_id]
mask = (sa_filtered['date_submitted'] >= label_start_day) & (sa_filtered['date_submitted'] <= label_end_day)
sub = sa_filtered[mask][['id_student', 'code_module', 'code_presentation']].copy()
sub['label'] = 1
student_subset = students_df[['id_student', 'code_module', 'code_presentation']]
ans = student_subset.merge(sub,
on=['id_student', 'code_module', 'code_presentation'],
how='left')
ans['label'] = ans['label'].fillna(0).astype('int64')
return ans.set_index('id_student')[['code_module', 'code_presentation', 'label']]
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
student_assessments_df_1 = student_assessments_df.copy()
student_assessments_df_1 = student_assessments_df_1[student_assessments_df_1['id_assessment'] == first_assessment_id]
student_assessments_df_1 = student_assessments_df_1[(student_assessments_df_1['date_submitted'] >= label_start_day) & (student_assessments_df_1['date_submitted'] <= label_end_day)]
submission = student_assessments_df_1[['id_student', 'code_module', 'code_presentation']]
students_df_1 = students_df.copy()
students_df_1 = students_df[['id_student', 'code_module', 'code_presentation']]
merged = pd.merge(students_df_1, submission, on = ['id_student', 'code_module', 'code_presentation'], how = 'inner', suffixes = ('', '_x'))
merged['label'] = 1
result = students_df_1.merge(merged, on = ['id_student', 'code_module', 'code_presentation'], how = 'left').fillna(0)
result['label'] = result['label'].astype(int)
result = result.set_index('id_student')
return result
In [ ]:
### Solution - Exercise 5
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
# req 2 bullet point 2 says to only keep columns id_student, code_module, code_presentation
# however if we do that, then after left merging with students_df, we can't know which one is date_submitted is within the window
# we should not do this until near the end
su = (
student_assessments_df
.query('`id_assessment` == @first_assessment_id')
.query('@label_start_day <= `date_submitted` <= @label_end_day')
)
res = pd.merge(left=students_df, right=su, how='left', on=['id_student', 'code_module', 'code_presentation'])
res['label'] = np.where(np.isnan(res['date_submitted'].values), 0, 1)
# direction says to return columns in the below order
# return res[['id_student', 'label', 'code_module', 'code_presentation']].set_index('id_student')
# however test cases want this order
return res[['id_student', 'code_module', 'code_presentation', 'label']].set_index('id_student')
Ex6¶
In [ ]:
def analyze_class_imbalance(labels):
total_samples = len(labels)
positive_count = len(labels[labels == 1])
negative_count = len(labels[labels == 0])
positive_ratio = round(positive_count/total_samples, 4)
imbalance_ratio = round(max(positive_count, negative_count) / min(positive_count, negative_count), 4) if positive_count!=0 and negative_count !=0 else np.inf
majority_class = 1 if positive_count >= negative_count else 0
return {
'imbalance_ratio' : imbalance_ratio,
'majority_class' : majority_class,
'negative_count' : negative_count,
'positive_count' : positive_count,
'positive_ratio' : positive_ratio,
'total_samples' : total_samples
}
In [ ]:
def analyze_class_imbalance(labels):
total_samples = labels.count()
pos = labels.sum()
neg = total_samples - pos
ratio = round(pos / total_samples,4)
if (pos == 0) or (neg == 0):
imb = float('inf')
else:
imb = round(max(pos, neg) / min(pos, neg),4)
if pos >= neg:
maj = 1
else:
maj = 0
result = {
'total_samples': total_samples,
'positive_count': pos,
'negative_count':neg,
'positive_ratio':ratio,
'imbalance_ratio':imb,
'majority_class':maj
}
return result
In [ ]:
def analyze_class_imbalance(labels):
total_samples = len(labels)
positive_count = (labels == 1).sum()
negative_count = (labels == 0).sum()
positive_ratio = round(positive_count / total_samples, 4)
if positive_count == 0 or negative_count == 0:
imbalance_ratio = float('inf')
else:
imbalance_ratio = round(max(positive_count, negative_count)
/ min(positive_count, negative_count), 4)
if positive_count >= negative_count:
majority_class = 1
else:
majority_class = 0
return {'total_samples': total_samples,
'positive_count': positive_count,
'negative_count': negative_count,
'positive_ratio': positive_ratio,
'imbalance_ratio': imbalance_ratio,
'majority_class': majority_class}
In [ ]:
def analyze_class_imbalance(labels):
###
from collections import Counter
counts = Counter(labels)
total_samples = len(labels)
positive_count = counts[1]
negative_count = counts[0]
positive_ratio = round(positive_count / total_samples,4)
if positive_count == 0 or negative_count == 0:
imbalance_ratio = float('inf')
else:
imbalance_ratio = round(max(positive_count, negative_count) / min(positive_count, negative_count), 4)
if positive_count >= negative_count:
majority_class = 1
else:
majority_class = 0
return {
'total_samples' : total_samples,
'positive_count' : positive_count,
'negative_count' : negative_count,
'positive_ratio': positive_ratio,
'imbalance_ratio' : imbalance_ratio,
'majority_class': majority_class
}
In [ ]:
def analyze_class_imbalance(labels):
res = dict()
res['total_samples']=len(labels)
res['positive_count']=len(labels[labels==1])
res['negative_count']=res['total_samples']-res['positive_count']
res['positive_ratio']=float(round(res['positive_count']/res['total_samples'],4))
if res['positive_count']==0 or res['negative_count']==0:
res['imbalance_ratio']=float('inf')
else:
res['imbalance_ratio']=float(round(max(res['positive_count'],res['negative_count'])/min(res['positive_count'],res['negative_count']),4))
if res['positive_count']>=res['negative_count']:
res['majority_class']=1
else:
res['majority_class']=0
return res
In [ ]:
def analyze_class_imbalance(labels):
###
### YOUR CODE HERE
###
ts = len(labels)
pc = (labels == 1).sum()
nc = (labels == 0).sum()
pr = round(pc / ts, 4) if ts > 0 else 0.0
if pc == 0 or nc == 0:
ir = float('inf')
else:
ir = round(max(pc, nc) / min(pc, nc), 4)
if pc >= nc:
mc = 1
else:
mc = 0
return {
'total_samples': ts,
'positive_count': pc,
'negative_count': nc,
'positive_ratio': pr,
'imbalance_ratio': ir,
'majority_class': mc
}
In [ ]:
def analyze_class_imbalance(labels):
total_samples = len(labels)
positive_count = labels.sum()
negative_count = total_samples - positive_count
positive_ratio = round(positive_count/total_samples, 4)
if positive_count == 0 or negative_count == 0:
imbalance_ratio = float('inf')
else:
imbalance_ratio = round(max(positive_count, negative_count)/min(positive_count, negative_count), 4)
if positive_count >= negative_count:
majority_class = 1
else:
majority_class = 0
return {
'imbalance_ratio': imbalance_ratio,
'majority_class': majority_class,
'negative_count': negative_count,
'positive_count': positive_count,
'positive_ratio': positive_ratio,
'total_samples': total_samples
}
In [ ]:
### Solution - Exercise 6
def analyze_class_imbalance(labels):
from math import inf
total_samples = len(labels)
vcounts = labels.value_counts()
if len(vcounts) == 2:
pos_count, neg_count = vcounts.loc[1], vcounts.loc[0]
else:
if vcounts.index.isin([1]):
pos_count, neg_count = vcounts.loc[1], 0
else:
pos_count, neg_count = 0, vcounts.loc[0]
pos_ratio = round(pos_count / total_samples, 4)
imb_ratio = inf if (pos_count == 0 or neg_count == 0) else round(max(pos_count, neg_count) / min(pos_count, neg_count), 4)
maj_class = 1 if pos_ratio >= 0.5 else 0
return {'total_samples': total_samples,
'positive_count': pos_count,
'negative_count': neg_count,
'positive_ratio': pos_ratio,
'imbalance_ratio': imb_ratio,
'majority_class': maj_class}
Ex7¶
In [ ]:
def compute_feature_correlations(features_df, outcome_series, numerical_features):
outcome_df = pd.DataFrame({'outcome' : outcome_series.values}).set_index(outcome_series.index)
combined = features_df.merge(outcome_df, on = 'id_student')
corr_df = combined.corr()
correlations = []
for feature in numerical_features:
correlations.append(round(corr_df.at[feature, 'outcome'], 4))
abs_correlations = [abs(corr) for corr in correlations]
return pd.DataFrame({'feature_name' : numerical_features, 'correlation' : correlations, 'abs_correlation' : abs_correlations}).sort_values('abs_correlation', ascending=False).dropna().reset_index(drop=True)
In [ ]:
def compute_feature_correlations(features_df, outcome_series, numerical_features):
df = features_df.merge(outcome_series.rename('outcome'), how='left',
left_index=True, right_index=True)
result_df = df.corr(min_periods = 2)
corrs = []
for f in numerical_features:
vals = pd.Series(df[f])
corrs.append(round(result_df.at[f, 'outcome'],4))
abs_corrs = [abs(c) for c in corrs]
result = {
'feature_name':numerical_features,
'correlation':corrs,
'abs_correlation':abs_corrs
}
result = pd.DataFrame(result)
result = result.sort_values(by = ['abs_correlation'], ascending = False)
result.reset_index(drop=True, inplace = True)
result.dropna(inplace = True)
return result
In [ ]:
def compute_feature_correlations(features_df, outcome_series, numerical_features):
common_index = features_df.index.intersection(outcome_series.index)
X = features_df.loc[common_index]
y = outcome_series.loc[common_index]
results = []
for feature in numerical_features:
feat_values = X[feature]
count = feat_values.count()
std_dev = feat_values.std()
corr_val = 0.0
if count >= 2 and std_dev > 0:
corr_val = feat_values.corr(y)
if pd.isna(corr_val):
corr_val = 0.0
rounded_corr = round(float(corr_val), 4)
abs_corr = round(abs(rounded_corr), 4)
results.append({
'feature_name': feature,
'correlation': rounded_corr,
'abs_correlation': abs_corr
})
corr_df = pd.DataFrame(results)
corr_df = corr_df.sort_values(by='abs_correlation', ascending=False)
corr_df.index = sorted(corr_df.index)
return corr_df
In [ ]:
def compute_feature_correlations(features_df, outcome_series, numerical_features):
features, outcome = features_df.align(outcome_series, join='inner', axis=0)
res = []
for feature in numerical_features:
x = features[feature]
df = pd.concat([x,outcome],axis=1)
if x.isna().sum()!=0:
corr=0
else:
if x.std()> 0:
corr=df.corr(method='pearson',min_periods=3).iloc[0,1]
res.append({
'feature_name': feature,
'correlation': round(corr, 4),
'abs_correlation': round(abs(corr), 4)
})
result_df = pd.DataFrame(res)
result_df = result_df.sort_values(by='abs_correlation', ascending=False).reset_index(drop=True)
return result_df
In [ ]:
def compute_feature_correlations(features_df, outcome_series, numerical_features):
features_df_new = features_df.reset_index()
outcome_series_new = outcome_series.reset_index()
merged = features_df_new.merge(outcome_series_new, on = 'id_student', how = 'inner')
merged = merged.rename(columns = {0: 'target'})
from collections import defaultdict
output = defaultdict(list)
for feature in numerical_features:
a_feature = merged[feature]
# a_feature = a_feature.fillna(0)
num_values = a_feature.nunique()
std_values = a_feature.std()
if num_values >= 2 and std_values > 0:
correlation = round(a_feature.corr(outcome_series_new.iloc[:, 1], method = 'pearson'), 4)
abs_correlation = round(abs(correlation), 4)
output['feature_name'].append(feature)
output['correlation'].append(correlation)
output['abs_correlation'].append(abs_correlation)
output = pd.DataFrame(output).sort_values(by = 'abs_correlation', ascending = False).reset_index(drop = True)
return output
In [ ]:
### Solution - Exercise 7
def compute_feature_correlations(features_df, outcome_series, numerical_features):
res = (
pd.merge(left=features_df[numerical_features], right=pd.DataFrame({'binary':outcome_series}), how='inner', on='id_student')
)
res = (
res.corr()[['binary']]
.reset_index()
.rename({'index': 'feature_name', 'binary': 'correlation'}, axis=1)
)
res = (
res
.assign(abs_correlation=lambda x: x['correlation'].abs())
.query('`feature_name` != "binary"')
.sort_values('abs_correlation', ascending=False)
).round(4).reset_index(drop=True)[['feature_name', 'correlation', 'abs_correlation']]
return res.dropna()
Ex8¶
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
first_assessment_id, current_day, cutoff_day):
target_module = students_df['code_module'].iloc[0]
target_presentation = students_df['code_presentation'].iloc[0]
course_students = students_df[
(students_df['code_module'] == target_module) &
(students_df['code_presentation'] == target_presentation)
].copy()
course_assessments = student_assessments_df[
(student_assessments_df['code_module'] == target_module) &
(student_assessments_df['code_presentation'] == target_presentation) &
(student_assessments_df['id_assessment'] == first_assessment_id)
].copy()
submitted_by_current = course_assessments[
course_assessments['date_submitted'] <= current_day
]['id_student'].unique()
train_labels = pd.Series(
course_students['id_student'].isin(submitted_by_current).astype(int).values,
index=course_students['id_student']
)
train_labels.index.name = 'id_student'
test_students = course_students[
~course_students['id_student'].isin(submitted_by_current)
]
submitted_after_current = course_assessments[
(course_assessments['date_submitted'] > current_day) &
(course_assessments['date_submitted'] <= cutoff_day)
]['id_student'].unique()
test_labels = pd.Series(
test_students['id_student'].isin(submitted_after_current).astype(int).values,
index=test_students['id_student']
)
test_labels.index.name = 'id_student'
return train_labels, test_labels
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
first_assessment_id, current_day, cutoff_day):
###
### YOUR CODE HERE
###
target_module = students_df['code_module'].iloc[0]
target_presentation = students_df['code_presentation'].iloc[0]
course_students = students_df.copy()
course_students = course_students[
(course_students['code_module'] == target_module) &
(course_students['code_presentation']==target_presentation)
]
course_assessments = student_assessments_df[
(student_assessments_df['code_module'] == target_module) &
(student_assessments_df['code_presentation'] == target_presentation) &
(student_assessments_df['id_assessment'] == first_assessment_id)
].copy()
submitted_by_current = course_assessments[
course_assessments['date_submitted'] <= current_day
]['id_student'].unique()
# submitted_after_current = course_assessments[
# course_assessments['date_submitted'] > current_day
# ]['id_student'].unique()
train_labels = pd.Series(
course_students['id_student'].isin(submitted_by_current).astype(int).values,
index=course_students['id_student']
)
submitted_after_current = course_assessments[
(course_assessments['date_submitted'] > current_day) &
(course_assessments['date_submitted'] <= cutoff_day)
]['id_student'].unique()
test_students = course_students[~course_students['id_student'].isin(submitted_by_current)]
test_labels = pd.Series(
test_students['id_student'].isin(submitted_after_current).astype(int).values,
index=test_students['id_student']
)
return train_labels, test_labels
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
first_assessment_id, current_day, cutoff_day):
###
### YOUR CODE HERE
###
target_module = students_df['code_module'].iloc[0]
target_presentation = students_df['code_presentation'].iloc[0]
course_students = students_df[(students_df['code_module'] == target_module) &
(students_df['code_presentation'] == target_presentation)].copy()
course_assessments = student_assessments_df[
(student_assessments_df['code_module'] == target_module) &
(student_assessments_df['code_presentation'] == target_presentation) &
(student_assessments_df['id_assessment'] == first_assessment_id)
].copy()
submitted_by_current = course_assessments[
course_assessments['date_submitted'] <= current_day
]['id_student'].unique()
submitted_after_current = course_assessments[
(course_assessments['date_submitted'] > current_day) &
(course_assessments['date_submitted'] <= cutoff_day)
]['id_student'].unique()
train_labels = pd.Series(
course_students['id_student'].isin(submitted_by_current).astype(int).values,
index=course_students['id_student']
)
train_labels.index.name = 'id_student'
test_students = course_students[train_labels.values == 0]
test_labels = pd.Series(
test_students['id_student'].isin(submitted_after_current).astype(int).values,
index=test_students['id_student']
)
test_labels.index.name = 'id_student'
return train_labels, test_labels
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
first_assessment_id, current_day, cutoff_day):
###
###
target_module = students_df['code_module'].iloc[0]
target_presentation = students_df['code_presentation'].iloc[0]
course_students = students_df[
(students_df['code_module'] == target_module) &
(students_df['code_presentation'] == target_presentation)
].copy()
course_assessments = student_assessments_df[
(student_assessments_df['code_module'] == target_module) &
(student_assessments_df['code_presentation'] == target_presentation) &
(student_assessments_df['id_assessment'] == first_assessment_id)
].copy()
# train
submitted_by_current = course_assessments[
course_assessments['date_submitted'] <= current_day
]['id_student'].unique()
train_labels = pd.Series(0, index=course_students['id_student'])
train_labels.loc[train_labels.index.isin(submitted_by_current)] = 1
train_labels.index.name = 'id_student'
train_labels = train_labels.astype(int)
# test
test_population_ids = train_labels[train_labels == 0].index
submitted_after_current = course_assessments[
(course_assessments['date_submitted'] > current_day) &
(course_assessments['date_submitted'] <= cutoff_day)
]['id_student'].unique()
test_labels = pd.Series(0, index=test_population_ids)
test_labels.loc[test_labels.index.isin(submitted_after_current)] = 1
test_labels.index.name = 'id_student'
# train_labels = pd.Series(
# course_students['id_student'].isin(submitted_by_current).astype(int).values,
# index=course_students['id_student']
# )
# test_labels = pd.Series(
# course_students['id_student'].isin(submitted_after_current).astype(int).values,
# index=course_students['id_student']
# )
# test_labels.index.name = 'id_student'
return train_labels, test_labels
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
first_assessment_id, current_day, cutoff_day):
target_module = students_df['code_module'].iloc[0]
target_presentation = students_df['code_presentation'].iloc[0]
course_students = students_df[
(students_df['code_module'] == target_module) &
(students_df['code_presentation'] == target_presentation)
].copy()
course_assessments = student_assessments_df[
(student_assessments_df['code_module'] == target_module) &
(student_assessments_df['code_presentation'] == target_presentation) &
(student_assessments_df['id_assessment'] == first_assessment_id)
].copy()
submitted_by_current = course_assessments[
course_assessments['date_submitted'] <= current_day
]['id_student'].unique()
train_labels = pd.Series(
course_students['id_student'].isin(submitted_by_current).astype(int).values,
index=course_students['id_student']
)
train_labels.index.name = 'id_student'
remaining_students = course_students[~course_students['id_student'].isin(submitted_by_current)]
submitted_in_window = course_assessments[
(course_assessments['date_submitted'] > current_day) &
(course_assessments['date_submitted'] <= cutoff_day)
]['id_student'].unique()
test_labels = pd.Series(
remaining_students['id_student'].isin(submitted_in_window).astype(int).values,
index=remaining_students['id_student']
)
test_labels.index.name = 'id_student'
return train_labels, test_labels
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
first_assessment_id, current_day, cutoff_day):
###
### YOUR CODE HERE
###
target_module = students_df['code_module'].iloc[0]
target_presentation = students_df['code_presentation'].iloc[0]
course_students = students_df[(students_df['code_module'] == target_module)
& (students_df['code_presentation'] == target_presentation)].copy()
course_assessments = student_assessments_df[
(student_assessments_df['code_module'] == target_module) &
(student_assessments_df['code_presentation'] == target_presentation) &
(student_assessments_df['id_assessment'] == first_assessment_id)
].copy()
submitted_by_current = course_assessments[
course_assessments['date_submitted'] <= current_day
]['id_student'].unique()
submitted_after_current = course_assessments[
course_assessments['date_submitted'] > current_day
]['id_student'].unique()
train_labels = pd.Series(
course_students['id_student'].isin(submitted_by_current).astype(int).values,
index=course_students['id_student']
)
train_labels.index.name = 'id_student'
test_student_ids = course_students[~course_students['id_student'].isin(submitted_by_current)]['id_student']
submitted_in_window = course_assessments[
(course_assessments['date_submitted'] > current_day) &
(course_assessments['date_submitted'] <= cutoff_day)
]['id_student'].unique()
test_labels = pd.Series(
test_student_ids.isin(submitted_in_window).astype(int).values,
index=test_student_ids
)
test_labels.index.name = 'id_student'
return train_labels, test_labels
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
first_assessment_id, current_day, cutoff_day):
###
### YOUR CODE HERE
###
#3
target_module = students_df['code_module'].iloc[0]
target_presentation = students_df['code_presentation'].iloc[0]
# course_students = students_df.copy()
#4
course_students = students_df[
(students_df['code_module'] == target_module) &
(students_df['code_presentation'] == target_presentation)
].copy()
course_assessments = student_assessments_df[
(student_assessments_df['code_module'] == target_module) &
(student_assessments_df['code_presentation'] == target_presentation) &
(student_assessments_df['id_assessment'] == first_assessment_id)
].copy()
#5
submitted_by_current = course_assessments[
course_assessments['date_submitted'] <= current_day
]['id_student'].unique()
submitted_after_current = course_assessments[
(course_assessments['date_submitted'] > current_day)&
(course_assessments['date_submitted'] <= cutoff_day)
]['id_student'].unique()
train_labels = pd.Series(
course_students['id_student'].isin(submitted_by_current).astype(int).values,
index=course_students['id_student']
)
train_labels.index.name = 'id_student'
# test_labels = pd.Series(
# ((~course_students['id_student'].isin(submitted_by_current)) & (course_students['id_student'].isin(submitted_after_current))).astype(int).values,
# index = course_students['id_student']
# )
test_labels = pd.Series(
course_students['id_student'].isin(submitted_after_current).astype(int).values,
index=course_students['id_student']
)
test_labels.index.name = 'id_student'
train_labels_0 = train_labels[train_labels == 0].index
test_labels = test_labels[test_labels.index.isin(train_labels_0)]
return train_labels, test_labels
In [ ]:
### Solution - Exercise 8
def DEBUG_create_temporal_split(students_df, student_assessments_df,
first_assessment_id, current_day, cutoff_day):
target_module = students_df['code_module'].iloc[0]
target_presentation = students_df['code_presentation'].iloc[0]
course_students = students_df.query('`code_module` == @target_module and `code_presentation` == @target_presentation')
course_assessments = student_assessments_df[
(student_assessments_df['code_module'] == target_module) &
(student_assessments_df['code_presentation'] == target_presentation) &
(student_assessments_df['id_assessment'] == first_assessment_id)
].copy()
submitted_by_current = course_assessments.query('`date_submitted` <= @current_day')['id_student'].unique()
submitted_after_current = course_assessments.query('`date_submitted` > @current_day and `date_submitted` <= @cutoff_day')['id_student'].unique()
train_labels = pd.Series(course_students['id_student'].isin(submitted_by_current).astype(int).values,
index=course_students['id_student'])
left = course_students.query('`id_student` not in @submitted_by_current')
# display(left)
test_labels = pd.Series(
(
(left['id_student'].isin(submitted_after_current))
).astype(int).values, index=left['id_student'])
return train_labels, test_labels
Ex9¶
In [ ]:
def calculate_metrics(y_pred_proba, y_true):
y_pred = (y_pred_proba >= 0.5).astype(int)
t_pos = sum((y_pred == 1) & (y_true == 1))
f_pos = sum((y_pred == 1) & (y_true == 0))
t_neg = sum((y_pred == 0) & (y_true == 0))
f_neg = sum((y_pred == 0) & (y_true == 1))
precision = t_pos / (t_pos + f_pos) if t_pos + f_pos > 0 else 0.0
recall = t_pos / (t_pos + f_neg) if t_pos + f_neg > 0 else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0
return {
'f1_score' : round(f1_score, 4),
'precision' : round(precision, 4),
'recall' : round(recall, 4)
}
In [ ]:
def calculate_metrics(y_pred_proba, y_true):
prediction = np.where(y_pred_proba > 0.5,1,0)
TP = 0
FP = 0
TN = 0
FN = 0
pos = False
act = False
for pred,true in zip(prediction, y_true):
if (pred == 1) & (true == 1):
pos = True
act = True
TP += 1
elif (pred == 1) & (true == 0):
pos = True
FP += 1
elif (pred == 0) & (true == 1):
act = True
FN += 1
elif (pred == 0) & (true == 0):
TN += 1
if pos == 0:
precision = 0
else:
precision = TP / (TP+FP)
if act == False:
recall = 0
else:
recall = TP / (TP + FN)
if (precision == 0) & (recall == 0):
F1 = 0
else:
F1 = 2 * (precision * recall) / (precision + recall)
result = {
'precision':round(precision,4),
'recall':round(recall,4),
'f1_score':round(F1,4)
}
return result
In [ ]:
def calculate_metrics(y_pred_proba, y_true):
y_pred = [1 if p>=0.5 else 0 for p in y_pred_proba]
tp = sum(1 for pred, true in zip(y_pred, y_true) if pred==1 and true==1)
fp = sum(1 for pred, true in zip(y_pred, y_true) if pred==1 and true==0)
fn = sum(1 for pred, true in zip(y_pred, y_true) if pred==0 and true==1)
precision = tp / (tp+fp) if (tp+fp) != 0 else 0
recall = tp / (tp+fn) if (tp+fn) != 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
return {'precision': round(precision, 4),
'recall': round(recall, 4),
'f1_score': round(f1_score, 4)}
In [ ]:
### Solution - Exercise 9
def calculate_metrics(y_pred_proba, y_true):
###
import numpy as np
y_pred = np.where(y_pred_proba >= 0.5, 1, 0)
tp = np.sum((y_pred == 1) & (y_true == 1))
fp = np.sum((y_pred == 1) & (y_true == 0))
tn = np.sum((y_pred == 0) & (y_true == 0))
fn = np.sum((y_pred == 0) & (y_true == 1))
if (tp + fp) > 0:
precision = tp / (tp + fp)
else:
precision = 0.0
if (tp + fn) > 0:
recall = tp / (tp + fn)
else:
recall = 0.0
if precision == 0.0 and recall == 0.0:
f1_score = 0.0
else:
f1_score = 2 * (precision * recall) / (precision + recall)
return {
'precision' : round(precision, 4),
'recall': round(recall, 4),
'f1_score': round(f1_score, 4)
}
In [ ]:
def calculate_metrics(y_pred_proba, y_true):
y_pred_proba = [1 if c>=0.5 else 0 for c in y_pred_proba]
data = {'prediction':y_pred_proba,'actual':y_true}
df=pd.DataFrame(data)
tp = len(df[(df['prediction']==1) & (df['actual']==1)])
fp = len(df[(df['prediction']==1) & (df['actual']==0)])
tn = len(df[(df['prediction']==0) & (df['actual']==0)])
fn = len(df[(df['prediction']==0) & (df['actual']==1)])
if tp+fp ==0:
precision = 0
else:
precision = tp / (tp +fp)
if tp+fn==0:
recall = 0
else:
recall = tp/(tp+fn)
if precision ==0 and recall==0:
f1_score=0
else:
f1_score = 2*(precision*recall)/ (precision + recall)
res = dict()
res['precision']=float(round(precision,4))
res['recall']=float(round(recall,4))
res['f1_score']=float(round(f1_score,4))
return res
In [ ]:
def calculate_metrics(y_pred_proba, y_true):
y_pred = (y_pred_proba >= 0.5).astype(int)
y_true_cp = np.array(y_true)
tp = np.sum((y_pred == 1) & (y_true_cp == 1))
fp = np.sum((y_pred == 1) & (y_true_cp == 0))
fn = np.sum((y_pred == 0) & (y_true_cp == 1))
if (tp + fp) == 0:
precision = 0.0
else:
precision = tp / (tp + fp)
if (tp + fn) == 0:
recall = 0.0
else:
recall = tp / (tp + fn)
if (precision + recall) == 0:
f1_score = 0.0
else:
f1_score = 2 * (precision * recall) / (precision + recall)
return {
'precision': round(float(precision), 4),
'recall': round(float(recall), 4),
'f1_score': round(float(f1_score), 4)
}
In [ ]:
def calculate_metrics(y_pred_proba, y_true):
prediction = np.where(y_pred_proba >= 0.5, 1, 0)
TP = ((prediction == 1) & (y_true == 1)).sum()
FP = ((prediction == 1) & (y_true == 0)).sum()
TN = ((prediction == 0) & (y_true == 0)).sum()
FN = ((prediction == 0) & (y_true == 1)).sum()
if TP == 0 and FP == 0:
precision = 0.0
else:
precision = TP/(TP+FP)
if TP ==0 and FN == 0:
recall = 0.0
else:
recall = TP/(TP+FN)
if precision == 0 and recall == 0:
f1_score = 0.0
else:
f1_score = round(2*(precision*recall)/(precision + recall), 4)
return {
'f1_score': f1_score,
'precision': round(precision, 4),
'recall': round(recall, 4)
}
In [ ]:
### Solution - Exercise 9
def calculate_metrics(y_pred_proba, y_true):
y_pred = np.where(y_pred_proba >= 0.5, 1, 0)
tp = np.sum(y_pred & y_true)
tn = np.sum(np.where((y_pred == 0) & (y_true == 0), 1, 0))
fp = np.sum(np.where((y_pred == 1) & (y_true == 0), 1, 0))
fn = np.sum(np.where((y_pred == 0) & (y_true == 1), 1, 0))
recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0
precision = tp / (tp + fp) if (tp + fp) != 0 else 0.0
f1 = 2 * (precision * recall) / (precision + recall) if (recall != 0.0 or precision != 0) else 0.0
return {'precision': round(precision, 4), 'recall': round(recall, 4), 'f1_score': round(f1, 4)}
Ex10¶
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
y_pred_proba = np.array(y_pred_proba)
y_true = np.array(y_true)
k = int(np.ceil(k_percent * len(y_true)))
sorted_indices = np.argsort(y_pred_proba)[::-1]
top_k_indices = sorted_indices[:k]
top_k_labels = y_true[top_k_indices]
precision = np.sum(top_k_labels == 1) / k
return round(float(precision), 2)
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
###
### YOUR CODE HERE
###
y_pred_proba = np.array(y_pred_proba)
y_true = np.array(y_true)
k = int(np.ceil(len(y_pred_proba) * k_percent))
if k < 1:
k = 1
sorted_indices = np.argsort(y_pred_proba)[::-1]
top_k_indices = sorted_indices[:k]
top_k_labels = y_true[top_k_indices]
precision = np.sum(top_k_labels) / k
precision = np.round(precision, 2)
precision.astype(float)
return precision
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
###
### YOUR CODE HERE
###
import math
y_pred_proba = np.array(y_pred_proba)
y_true = np.array(y_true)
k = math.ceil(len(y_pred_proba) * k_percent)
sorted_indices = np.argsort(y_pred_proba)[::-1]
top_k_indices = sorted_indices[:k]
top_k_labels = y_true[top_k_indices]
precision = np.sum(top_k_labels) / k
return round(precision, 2)
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
###
### YOUR CODE HERE
###
y_pred_proba = np.array(y_pred_proba)
y_true = np.array(y_true)
n = len(y_true)
k = int(np.ceil(k_percent * n))
if k < 1:
k = 1
sorted_indices = np.argsort(y_pred_proba)[::-1]
top_k_indices = sorted_indices[:k]
top_k_labels = y_true[top_k_indices]
precision = np.sum(top_k_labels) / k
return round(precision,2)
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
import math
y_pred_proba = np.array(y_pred_proba)
y_true = np.array(y_true)
k = math.ceil(len(y_pred_proba) * k_percent)
if k < 1:
k = 1
sorted_indices = np.argsort(y_pred_proba)[::-1]
top_k_indices = sorted_indices[:k]
top_k_labels = y_true[top_k_indices]
precision = float(round(np.sum(top_k_labels) / k,2))
return precision
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
import math
y_pred_proba = np.array(y_pred_proba)
y_true = np.array(y_true)
k = math.ceil(len(y_pred_proba) * k_percent)
if k < 1:
k = 1
sorted_indices = np.argsort(y_pred_proba)[::-1]
top_k_indices = sorted_indices[:k]
top_k_labels = y_true[top_k_indices]
precision = float(np.sum(top_k_labels) / k)
return round(precision, 2)
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
###
### YOUR CODE HERE
###
import math
y_pred_proba = np.array(y_pred_proba)
y_true = np.array(y_true)
k = math.ceil(len(y_pred_proba) * k_percent) # int
if k < 1:
k = 1
sorted_indices = np.argsort(y_pred_proba)[::-1] # [::-1]
top_k_indices = sorted_indices[:k]
top_k_labels = y_true[top_k_indices]
top_k_pred = y_pred_proba[top_k_indices]
top_k_pred = np.where(top_k_pred >= 0.5, 1, 0)
num_tp_in_topk = ((top_k_labels == 1) & (top_k_pred == 1)).sum()
top_k_precision = float(round(num_tp_in_topk/k, 2))
# precision = np.sum(top_k_labels) / k
return top_k_precision
In [ ]:
### Solution - Exercise 10
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
from math import ceil
y_pred_proba = np.array(y_pred_proba)
y_true = np.array(y_true)
k = ceil(len(y_pred_proba) * k_percent)
if k < 1:
k = 1
sorted_indices = np.argsort(y_pred_proba)[::-1]
top_k_indices = sorted_indices[:k]
top_k_labels = y_true[top_k_indices]
top_k_preds = y_pred_proba[top_k_indices]
# display(y_true, y_pred_proba, sorted_indices, top_k_indices, top_k_preds, top_k_labels)
# precision = np.sum(top_k_labels) / k
precision = calculate_metrics(y_pred_proba=top_k_preds, y_true=top_k_labels)['precision']
return round(precision, 2)