SP2026 MT2 TA Alternative Solutions¶

Ex0¶

In [ ]:
vle_activity_summary_query = '''SELECT activity_type, COUNT(*) AS resource_count, 
                                COUNT(DISTINCT code_module || code_presentation) AS courses_using
                                FROM vle
                                GROUP BY activity_type
                                ORDER BY resource_count DESC, courses_using ASC'''
In [ ]:
vle_activity_summary_query = '''
    
    SELECT activity_type, COUNT(DISTINCT v.id) as resource_count, COUNT(DISTINCT c.id) as courses_using
    FROM vle AS v
    JOIN courses AS c
        ON v.code_module = c.code_module
        AND v.code_presentation = c.code_presentation
    GROUP BY activity_type
    ORDER BY resource_count DESC, courses_using ASC
'''
In [ ]:
vle_activity_summary_query = '''
select activity_type, count(*) as resource_count,
count(distinct code_module || code_presentation) as courses_using
from vle
group by activity_type
order by 2 desc, 3 asc
'''
In [ ]:
SELECT activity_type,
                                   COUNT(*) AS resource_count,
                                   COUNT(DISTINCT course_pres) AS courses_using
                            FROM 
                                (
                                    SELECT id,
                                       activity_type,
                                       code_module || '' || code_presentation AS course_pres
                                    FROM vle
                                ) AS sub
                            GROUP BY 1
                            ORDER BY 2 DESC, 3 ASC
In [ ]:
vle_activity_summary_query = '''
    SELECT
        activity_type,
        COUNT(*)AS resource_count,
        COUNT(DISTINCT code_module || ' '|| code_presentation)AS courses_using        
    FROM vle
    GROUP BY activity_type
    ORDER BY resource_count DESC, courses_using ASC    
    '''
In [ ]:
vle_activity_summary_query = 
'''SELECT activity_type, Count(*) as resource_count,  count(DISTINCT temp) as courses_using
        FROM (SELECT id, activity_type, code_module || '_' || code_presentation as temp FROM vle) 
        GROUP BY activity_type 
        ORDER BY resource_count DESC, courses_using ASC'''
In [ ]:
vle_activity_summary_query = '''

WITH temp1 AS(SELECT activity_type, COUNT(*) AS resource_count
FROM vle
GROUP BY activity_type),

temp2 AS(SELECT
activity_type,
COUNT(*)
FROM vle
GROUP BY activity_type, code_module, code_presentation),

temp3 AS(SELECT
activity_type,
COUNT(*) AS courses_using
FROM temp2
GROUP BY activity_type)

SELECT t1.activity_type, t1.resource_count, t3.courses_using
FROM temp1 t1
JOIN temp3 t3 ON t1.activity_type = t3.activity_type
ORDER BY t1.resource_count DESC, courses_using ASC

'''
In [ ]:
### Solution - Exercise 0  
vle_activity_summary_query = '''
SELECT activity_type,
  COUNT(*) AS resource_count,
       COUNT(DISTINCT code_module || code_presentation) AS courses_using
  FROM vle
 WHERE 1 = 1
 GROUP BY activity_type
 ORDER BY resource_count DESC, courses_using ASC;
'''

Ex1¶

In [ ]:
course_pass_rate_query = '''
SELECT courses.code_module, courses.code_presentation, COUNT(*) AS total_students, 
SUM(CASE WHEN final_result = 'Pass' OR final_result = 'Distinction' THEN 1 ELSE 0 END) AS passed_students,
ROUND(100.0 * SUM(CASE WHEN final_result = 'Pass' OR final_result = 'Distinction' THEN 1 ELSE 0 END)/COUNT(*), 2) AS pass_rate
FROM courses
JOIN studentInfo
ON courses.code_module = studentInfo.code_module AND courses.code_presentation = studentInfo.code_presentation
GROUP BY courses.code_module, courses.code_presentation
HAVING COUNT(*) >= 500
ORDER BY pass_rate DESC, courses.code_presentation ASC
LIMIT 10
'''
In [ ]:
course_pass_rate_query = '''
    SELECT c.code_module, c.code_presentation, 
        COUNT(DISTINCT id_student) as total_students,
        SUM(CASE WHEN final_result = 'Pass' OR final_result = 'Distinction' THEN 1 ELSE 0 END) as passed_students,
        ROUND((SUM(CASE WHEN final_result = 'Pass' OR final_result = 'Distinction' THEN 1.0 ELSE 0 END) / COUNT(DISTINCT id_student)) * 100.0,2) as pass_rate
    FROM courses as c
    JOIN studentInfo as s
        ON s.code_presentation = c.code_presentation
        AND s.code_module = c.code_module
    GROUP BY c.code_module, c.code_presentation
    HAVING total_students >= 500
    ORDER BY pass_rate DESC, c.code_presentation ASC
    LIMIT 10
'''
In [ ]:
course_pass_rate_query = '''
select c.code_module, 
c.code_presentation,
count(*) as total_students,
sum(case when s.final_result in ('Pass', 'Distinction') then 1 else 0 end) as passed_students,
round(100.0 * sum(case when s.final_result in ('Pass', 'Distinction') then 1 else 0 end) / count(*), 2) as pass_rate
from courses c 
join studentInfo s
on c.code_module = s.code_module
and c.code_presentation = s.code_presentation
group by c.code_module, c.code_presentation
having count(*) >= 500
order by
5 desc, 1 asc, 2 asc
'''
In [ ]:
course_pass_rate_query = '''
                        SELECT c.code_module,
                               c.code_presentation,
                               COUNT(*) AS total_students,
                               SUM(CASE WHEN s.final_result IN ('Pass', 'Distinction') THEN 1 ELSE 0 END) AS passed_students,
                               ROUND((SUM(CASE WHEN s.final_result IN ('Pass', 'Distinction') THEN 1 ELSE 0 END) * 1.0 / COUNT(*) ) * 100, 2) AS pass_rate
                        FROM courses AS c
                        LEFT JOIN studentInfo AS s
                            ON c.code_module = s.code_module
                            AND c.code_presentation = s.code_presentation
                        GROUP BY 1, 2
                        HAVING total_students > 500
                        ORDER BY 5 DESC, 2 ASC
                         
                         '''
In [ ]:
course_pass_rate_query = '''
        SELECT
            *,
            ROUND(passed_students*100.0/total_students,2)AS pass_rate 
        FROM
            (SELECT
                c.code_module,
                c.code_presentation,
                COUNT(*)AS total_students,
                COUNT(CASE WHEN final_result = 'Pass' OR final_result = 'Distinction' THEN 1 END)AS passed_students
            FROM courses AS c
            JOIN studentInfo AS s
            ON c.code_module = s.code_module AND c.code_presentation = s.code_presentation
            GROUP BY c.code_module,c.code_presentation
            HAVING COUNT(*)>=500) AS t
        ORDER BY pass_rate DESC, code_presentation ASC
    '''
In [ ]:
course_pass_rate_query = '''

With temp AS (SELECT Count(s.Id) as total_students, s.code_module, s.code_presentation,
COUNT(CASE WHEN s.final_result = 'Pass' OR s.final_result = 'Distinction' THEN 1 ELSE NULL END) as passed_students
FROM studentInfo as s
JOIN courses as c ON s.code_module = c.code_module and s.code_presentation = c.code_presentation
GROUP BY s.code_module, s.code_presentation
Having total_students >= 500)

SELECT 
code_module,
code_presentation,
total_students,
passed_students,
Round(CAST(passed_students AS FLOAT)/total_students*100.0, 2) as pass_rate
FROM temp
ORDER BY pass_rate desc, code_presentation asc

'''
In [ ]:
course_pass_rate_query = '''
WITH temp1 AS(
SELECT *
FROM courses c
JOIN studentInfo s
ON c.code_module = s.code_module AND c.code_presentation = s.code_presentation),

temp2 AS(
SELECT code_module, code_presentation, COUNT(*) AS total_students
FROM temp1
GROUP BY code_module, code_presentation),

temp3 AS(SELECT code_module, code_presentation, count(*) AS passed_students
FROM temp1
WHERE final_result = 'Pass' OR final_result = 'Distinction'
GROUP BY code_module, code_presentation)

SELECT temp2.code_module, temp2.code_presentation, temp2.total_students, temp3.passed_students,
ROUND((temp3.passed_students*100.0/temp2.total_students), 2) AS pass_rate
FROM temp2
JOIN temp3
ON temp2.code_module = temp3.code_module AND temp2.code_presentation = temp3.code_presentation
WHERE temp2.total_students >= 500
ORDER BY pass_rate DESC, temp2.code_presentation ASC
'''
In [ ]:
### Solution - Exercise 1  
course_pass_rate_query = '''
SELECT c.code_module,
       c.code_presentation,
       COUNT(DISTINCT si.id_student) AS total_students,
       SUM(CASE WHEN si.final_result IN ('Pass', 'Distinction') THEN 1 END) AS passed_students,
       ROUND(
            SUM(CASE WHEN si.final_result IN ('Pass', 'Distinction') THEN 1.0 END) / COUNT(DISTINCT si.id_student) * 100
            , 2) AS pass_rate
  FROM courses AS c
  INNER JOIN studentInfo AS si ON c.code_module = si.code_module AND c.code_presentation = si.code_presentation
  WHERE 1 = 1
  GROUP BY c.code_module, c.code_presentation
  HAVING COUNT(DISTINCT si.id_student) >= 500
  ORDER BY pass_rate DESC, c.code_presentation ASC
'''

Ex2¶

In [ ]:
DEBUG_early_late_submitters_query = '''
SELECT
    CASE
        WHEN sa.date_submitted <= 25 THEN 'Early'
        ELSE 'Late'
    END AS submission_category,
    COUNT(sa.id_student) AS student_count,
    ROUND(AVG(CASE WHEN sv.total_clicks IS NULL THEN 0 ELSE sv.total_clicks END), 2) AS avg_total_clicks,
    ROUND(AVG(CASE WHEN sv.days_active IS NULL THEN 0 ELSE sv.days_active END), 2) AS avg_days_active,
    ROUND(AVG(sa.date_submitted), 2) AS avg_submission_day
FROM studentAssessment sa
INNER JOIN assessments a ON sa.id_assessment = a.id_assessment
LEFT JOIN (
    SELECT
        id_student,
        code_module,
        code_presentation,
        SUM(sum_click) as total_clicks,
        COUNT(DISTINCT date) as days_active
    FROM studentVle
    WHERE code_module = 'BBB' AND code_presentation = '2013B'
    GROUP BY id_student, code_module, code_presentation
) sv
    ON sa.id_student = sv.id_student
    AND a.code_module = sv.code_module
    AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
GROUP BY submission_category
ORDER BY submission_category;
'''
In [ ]:
SELECT
    CASE
        WHEN sa.date_submitted <= 25 THEN 'Early'
        WHEN sa.date_submitted > 25 THEN 'Late'
    END AS submission_category,
    COUNT(DISTINCT sa.id_student) AS student_count,
    ROUND(AVG(CASE WHEN sv.total_clicks IS NULL THEN 0 ELSE sv.total_clicks END),2) AS avg_total_clicks,
    ROUND(AVG(CASE WHEN sv.days_active IS NULL THEN 0 ELSE sv.days_active END),2) AS avg_days_active,
    CASE 
        WHEN sa.date_submitted IS NULL THEN 0
        ELSE ROUND(AVG(sa.date_submitted),2) END AS avg_submission_day
FROM studentAssessment sa
INNER JOIN assessments a 
    ON sa.id_assessment = a.id_assessment
LEFT JOIN (
    SELECT
        id_student,
        code_module,
        code_presentation,
        SUM(sum_click) AS total_clicks,
        COUNT(DISTINCT date) AS days_active
    FROM studentVle
    WHERE code_module = 'BBB'
        AND code_presentation = '2013B'
    GROUP BY id_student, code_module, code_presentation
) sv
    ON sa.id_student = sv.id_student
    AND a.code_module = sv.code_module
    AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
GROUP BY submission_category
ORDER BY submission_category;
'''
In [ ]:
DEBUG_early_late_submitters_query = '''
SELECT
    CASE
        WHEN sa.date_submitted <= 25 THEN 'Early'
        ELSE 'Late'
    END AS submission_category,
    COUNT(distinct sa.id_student) AS student_count,
    round(AVG(coalesce(sv.total_clicks, 0)), 2) AS avg_total_clicks,
    round(AVG(coalesce(sv.days_active, 0)), 2) AS avg_days_active,
    round(AVG(sa.date_submitted), 2) AS avg_submission_day
FROM studentAssessment sa
INNER JOIN assessments a ON sa.id_assessment = a.id_assessment
left JOIN (
    SELECT
        id_student,
        code_module,
        code_presentation,
        SUM(sum_click) as total_clicks,
        COUNT(distinct date) as days_active
    FROM studentVle
    GROUP BY id_student, code_module, code_presentation
) sv
    ON sa.id_student = sv.id_student
    AND a.code_module = sv.code_module
    AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
and a.code_module = 'BBB'
and a.code_presentation = '2013B'
GROUP BY submission_category
ORDER BY submission_category;
'''
In [ ]:
DEBUG_early_late_submitters_query = '''
SELECT
    CASE
        WHEN sa.date_submitted <= 25 THEN 'Early'
        ELSE 'Late'
    END AS submission_category,
    COUNT(sa.id_student) AS student_count,
    ROUND(AVG(COALESCE(sv.total_clicks, 0)), 2) AS avg_total_clicks,
    ROUND(AVG(COALESCE(sv.days_active, 0)), 2) AS avg_days_active,
    ROUND(AVG(sa.date_submitted), 2) AS avg_submission_day
FROM studentAssessment sa
INNER JOIN assessments a ON sa.id_assessment = a.id_assessment
LEFT JOIN (
    SELECT
        id_student,
        code_module,
        code_presentation,
        SUM(sum_click) as total_clicks,
        COUNT(DISTINCT date) as days_active
    FROM studentVle
    WHERE code_module = 'BBB' AND code_presentation = '2013B'
    GROUP BY id_student, code_module, code_presentation
) sv
    ON sa.id_student = sv.id_student
    AND a.code_module = sv.code_module
    AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
    AND a.code_module = 'BBB' 
    AND a.code_presentation = '2013B'
GROUP BY submission_category
ORDER BY submission_category;
'''
In [ ]:
DEBUG_early_late_submitters_query = '''
SELECT
    CASE
        WHEN sa.date_submitted <= 25 THEN 'Early'
        ELSE 'Late'
    END AS submission_category,
    COUNT(sa.id_student) AS student_count,
    ROUND(AVG(COALESCE(sv.total_clicks,0)),2) AS avg_total_clicks,
    ROUND(AVG(COALESCE(sv.days_active,0)),2) AS avg_days_active,
    ROUND(AVG(sa.date_submitted),2) AS avg_submission_day
FROM studentAssessment sa
LEFT JOIN assessments a ON sa.id_assessment = a.id_assessment
LEFT JOIN (
    SELECT
        id_student,
        code_module,
        code_presentation,
        SUM(sum_click) as total_clicks,
        COUNT(DISTINCT date) as days_active
    FROM studentVle
    WHERE code_module ='BBB' AND code_presentation = '2013B'
    GROUP BY id_student, code_module, code_presentation
) sv
    ON sa.id_student = sv.id_student
    AND a.code_module = sv.code_module
    AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
GROUP BY submission_category
ORDER BY submission_category;
'''
In [ ]:
DEBUG_early_late_submitters_query = '''
SELECT
    CASE
        WHEN sa.date_submitted <= 25 THEN 'Early'
        ELSE 'Late'
    END AS submission_category,
    COUNT(sa.id_student) AS student_count,
    ROUND(AVG(COALESCE(sv.total_clicks, 0)), 2) AS avg_total_clicks,
    ROUND(AVG(COALESCE(sv.days_active, 0)), 2) AS avg_days_active,
    ROUND(AVG(sa.date_submitted), 2) AS avg_submission_day
FROM studentAssessment sa
INNER JOIN assessments a ON sa.id_assessment = a.id_assessment
LEFT JOIN (
    SELECT
        id_student,
        code_module,
        code_presentation,
        COALESCE(SUM(sum_click), 0) as total_clicks,
        COALESCE(COUNT(DISTINCT date), 0) as days_active
    FROM studentVle
    WHERE code_module = 'BBB' and code_presentation = '2013B'
    GROUP BY id_student, code_module, code_presentation
) sv
    ON sa.id_student = sv.id_student
    AND a.code_module = sv.code_module
    AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
GROUP BY submission_category
ORDER BY submission_category;
'''
In [ ]:
DEBUG_early_late_submitters_query = '''

SELECT
CASE WHEN sa.date_submitted <= 25 THEN 'Early'
     WHEN sa.date_submitted > 25 THEN 'Late'
END AS submission_category,
COUNT(sa.id_student) AS student_count,
ROUND(AVG(IFNULL(sv.click_total, 0)), 2) AS avg_total_clicks,
ROUND(AVG(IFNULL(sv.distinct_date, 0)), 2) AS avg_days_active,
ROUND(AVG(sa.date_submitted), 2) AS avg_submission_day

FROM studentAssessment sa 
JOIN assessments a ON sa.id_assessment = a.id_assessment
LEFT JOIN(
SELECT id_student, code_module, code_presentation, IFNULL(SUM(sum_click), 0.0) AS click_total, IFNULL(COUNT(DISTINCT date), 0.0) AS distinct_date
FROM studentVLE
WHERE code_module = 'BBB' AND code_presentation = '2013B'
GROUP BY id_student, code_module, code_presentation) sv
ON sa.id_student = sv.id_student AND a.code_module = sv.code_module AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
GROUP BY submission_category

'''
In [ ]:
### Solution - Exercise 2  
DEBUG_early_late_submitters_query = '''
SELECT
    CASE
        WHEN sa.date_submitted <= 25 THEN 'Early'
        ELSE 'Late'
    END AS submission_category,
    COUNT(DISTINCT sa.id_student) AS student_count,
    ROUND(AVG(COALESCE(sv.total_clicks, 0)), 2) AS avg_total_clicks,
    ROUND(AVG(COALESCE(sv.days_active, 0)), 2) AS avg_days_active,
    ROUND(AVG(sa.date_submitted), 2) AS avg_submission_day
FROM studentAssessment sa
INNER JOIN assessments a ON sa.id_assessment = a.id_assessment
LEFT JOIN (
    SELECT
        id_student,
        code_module,
        code_presentation,
        SUM(sum_click) as total_clicks,
        COUNT(DISTINCT date) as days_active
    FROM studentVle
    GROUP BY id_student, code_module, code_presentation
    HAVING code_module = 'BBB' AND code_presentation = '2013B'
) sv
    ON sa.id_student = sv.id_student
    AND a.code_module = sv.code_module
    AND a.code_presentation = sv.code_presentation
WHERE sa.id_assessment = 14984
GROUP BY submission_category
ORDER BY submission_category;
'''

Ex3¶

In [ ]:
vle_engagement_risk_query = '''
WITH filtered_student_vle AS (
SELECT *
FROM studentVle
WHERE code_module = 'BBB' AND code_presentation = '2013J' AND date < 30)
SELECT filt.id_student, SUM(sum_click) AS total_clicks, COUNT(DISTINCT filt.date) AS days_active,
CASE WHEN SUM(sum_click) < 50 OR COUNT(DISTINCT filt.date) < 5 THEN 1 ELSE 0 END AS at_risk_flag, final_result
FROM filtered_student_vle filt
JOIN studentInfo si
ON filt.id_student = si.id_student
AND filt.code_module = si.code_module
AND filt.code_presentation = si.code_presentation
GROUP BY filt.id_student, si.final_result
ORDER BY filt.id_student ASC, final_result DESC'''
In [ ]:
vle_engagement_risk_query = '''
        
    SELECT v.id_student, total_clicks, days_active,
        CASE 
            WHEN total_clicks < 50 OR days_active < 5 THEN 1
            ELSE 0 END AS at_risk_flag,
    final_result
    FROM (SELECT *,
        SUM(sum_click) as total_clicks,
        COUNT(DISTINCT date) as days_active
        FROM studentVle
        WHERE code_module = 'BBB'
            AND code_presentation = '2013J'
            AND date < 30
        GROUP BY id_student) as v
    LEFT JOIN studentInfo as i
        ON v.id_student = i.id_student
        AND v.code_module = i.code_module
        AND v.code_presentation = i.code_presentation
    GROUP BY v.id_student, i.final_result
In [ ]:
vle_engagement_risk_query = '''
select sv.id_student,
sum(sv.sum_click) as total_clicks,
count(distinct sv.date) as days_active,
case when sum(sv.sum_click) < 50 or count(distinct sv.date) < 5 then 1 else 0 end as at_risk_flag,
si.final_result

from studentVle sv
join studentInfo si
on sv.id_student = si.id_student
and sv.code_module = si.code_module
and sv.code_presentation = si.code_presentation
where sv.code_module = 'BBB'
and sv.code_presentation = '2013J'
and sv.date < 30 

group by sv.id_student, si.final_result
order by 1 asc, 5 asc
'''
In [ ]:
vle_engagement_risk_query = '''
                            SELECT v.id_student,
                                   SUM(v.sum_click) AS total_clicks,
                                   COUNT(DISTINCT v.date) AS days_active,
                                   CASE 
                                       WHEN SUM(v.sum_click) < 50 OR COUNT(DISTINCT v.date) < 5 THEN 1
                                       ELSE 0 
                                   END AS at_risk_flag,   
                                   s.final_result
                            FROM studentInfo AS s
                            JOIN 
                                (
                                    SELECT *
                                    FROM studentVle
                                    WHERE code_module = 'BBB'
                                        AND code_presentation = '2013J'
                                        AND date < 30
                                ) AS v
                            ON s.id_student = v.id_student
                                AND s.code_module = v.code_module
                                AND s.code_presentation = v.code_presentation
                            GROUP BY v.id_student, s.final_result
                            ORDER BY 1 ASC, 4 ASC
                            '''
In [ ]:
vle_engagement_risk_query = '''
    SELECT
        sv.id_student,
        SUM(sv.sum_click) AS total_clicks,
        COUNT(DISTINCT sv.date) AS days_active,
        (CASE 
            WHEN SUM(sv.sum_click) <50 OR COUNT(DISTINCT sv.date) <5 THEN 1
            ELSE 0
        END) AS at_risk_flag,
        si.final_result
FROM
	studentVle AS sv
JOIN
	studentInfo AS si 
    on sv.id_student=si.id_student
	AND sv.code_module=si.code_module
	AND sv.code_presentation=si.code_presentation
WHERE
	sv.code_module='BBB' AND sv.code_presentation='2013J' AND sv.date <30
GROUP BY
	sv.id_student, si.final_result
ORDER BY
	sv.id_student ASC, si.final_result ASC
    '''
In [ ]:
vle_engagement_risk_query = '''
SELECT 
    sv.id_student, 
    SUM(sv.sum_click) AS total_clicks, 
    COUNT(DISTINCT sv.date) AS days_active,
    CASE 
        WHEN SUM(sv.sum_click) < 50 OR COUNT(DISTINCT sv.date) < 5 THEN 1 
        ELSE 0 
    END AS at_risk_flag,
    si.final_result
FROM studentVle AS sv
JOIN studentInfo AS si 
  ON sv.id_student = si.id_student 
  AND sv.code_module = si.code_module 
  AND sv.code_presentation = si.code_presentation
WHERE sv.code_module = 'BBB' 
  AND sv.code_presentation = '2013J' 
  AND sv.date < 30
GROUP BY sv.id_student, si.final_result
ORDER BY sv.id_student ASC, si.final_result ASC'''
In [ ]:
vle_engagement_risk_query = '''

SELECT 
    st.id_student,
    SUM(st.sum_click) AS total_clicks,
    COUNT(distinct st.date) AS days_active,
    CASE WHEN SUM(st.sum_click) < 50 OR COUNT(DISTINCT st.date) < 5 THEN 1
         ELSE 0
    END AS at_risk_flag,
    si.final_result
    

FROM 
    (SELECT *
    FROM studentVle 
    WHERE code_module = 'BBB' AND code_presentation = '2013J' AND date < 30) st
JOIN studentInfo si 
ON st.id_student = si.id_student 
AND st.code_module = si.code_module
AND st.code_presentation = si.code_presentation
GROUP BY st.id_student, si.final_result
ORDER BY st.id_student, final_result

'''
In [ ]:
### Solution - Exercise 3  
vle_engagement_risk_query = '''

SELECT sv.id_student, 
       SUM(sv.sum_click) AS total_clicks,
       COUNT(DISTINCT sv.date) AS days_active,
       (CASE WHEN (SUM(sv.sum_click) < 50 OR COUNT(DISTINCT sv.date) < 5) THEN 1 ELSE 0 END) AS at_risk_flag,
       si.final_result
  FROM studentVle AS sv
 INNER JOIN studentInfo AS si ON si.id_student = sv.id_student 
                AND si.code_module = sv.code_module 
                AND si.code_presentation = sv.code_presentation
 WHERE sv.code_module = 'BBB' AND sv.code_presentation = '2013J' AND sv.date < 30
 GROUP BY sv.id_student, si.final_result
 ORDER BY sv.id_student ASC, si.final_result ASC
'''

Ex4¶

In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
    students_df = students_df[STUDENT_FEATURES].set_index('id_student')
    merged = students_df.merge(registrations_df[REGISTRATION_FEATURES], how = 'left', on = ['code_module', 'code_presentation', 'id_student'])
    vle_feats = vle_df[vle_df.date < cutoff_day].groupby(['code_module', 'code_presentation', 'id_student']).agg(total_clicks=('sum_click', 'sum'), total_active_days=('date', 'nunique'))
    vle_feats['avg_clicks_per_day'] = vle_feats['total_clicks']/vle_feats['total_active_days']
    VLE_FEATS = list(vle_feats.columns)
    merged = merged.merge(vle_feats, how = 'left', on = ['code_module', 'code_presentation', 'id_student'])
    merged[VLE_FEATS] = merged[VLE_FEATS].fillna(0)
    merged['days_registered_before_start'] = merged['date_registration']
    merged = merged.astype({
    'total_active_days': 'int64',
    'total_clicks': 'int64'
})

    return merged[RESULT_FEATURES].set_index('id_student')
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
    df = students_df[STUDENT_FEATURES].set_index('id_student')
    df = pd.merge(df, registrations_df[REGISTRATION_FEATURES], how='left', on=['code_module','code_presentation','id_student'])
    
    vle_df_small = vle_df[vle_df['date'] < cutoff_day]
    vle_df_small = vle_df_small.groupby(['code_module', 'code_presentation', 'id_student']).agg(total_clicks = ('sum_click', 'sum'), 
                                                                                                total_active_days = ('date', 'nunique'))
    vle_df_small['avg_clicks_per_day'] = vle_df_small['total_clicks'] / vle_df_small['total_active_days']
    
    VLE = list(vle_df_small.columns)
    
    df = pd.merge(df,vle_df_small, how='left', on=['code_module', 'code_presentation','id_student'])
    df[VLE] = df[VLE].fillna(0)
    df['days_registered_before_start'] = df['date_registration']
    
    df = df.astype({
        'total_clicks':'int64',
        'total_active_days':'int64'
    })
    return df[RESULT_FEATURES].set_index('id_student')
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
    df = students_df[STUDENT_FEATURES].copy()
    df = df.merge(registrations_df[REGISTRATION_FEATURES], 
                 on = ["code_module", 'code_presentation', 'id_student'],
                 how = 'left')
    
    vle = vle_df[vle_df['date'] < cutoff_day]
    vle = vle.groupby(['code_module', 'code_presentation', 'id_student']).agg(
    total_clicks = ('sum_click', 'sum'), total_active_days = ('date', 'nunique')).reset_index()
    vle['avg_clicks_per_day'] = vle['total_clicks'] / vle['total_active_days']
    
    df = df.merge(vle, on=['code_module', 'code_presentation', 'id_student'],
                 how = 'left')
    
    df[['total_clicks', 'total_active_days', 'avg_clicks_per_day']] = df[['total_clicks', 'total_active_days', 'avg_clicks_per_day']].fillna(0)
    
    df['total_clicks'] = df['total_clicks'].astype('int64')
    df['total_active_days'] = df['total_active_days'].astype('int64')
    df['avg_clicks_per_day'] = df['avg_clicks_per_day'].astype('float64')
    
    df['days_registered_before_start'] = df['date_registration']
    
    
    df = df[RESULT_FEATURES]
    df = df.set_index('id_student')
    
    return df
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
    ###
    students = students_df[STUDENT_FEATURES]
    registrations = registrations_df[REGISTRATION_FEATURES]
    features = pd.merge(students, registrations, on=['code_module', 'code_presentation', 'id_student'], how='left')
    
    vle = vle_df[vle_df['date'] < cutoff_day]
    vle_grouped = vle.groupby(['code_module', 'code_presentation', 'id_student']).agg(total_clicks=('sum_click', 'sum'),
                                                                         total_active_days = ('date', 'nunique')).reset_index()
    vle_grouped['avg_clicks_per_day'] = vle_grouped['total_clicks'] / vle_grouped['total_active_days']
    
    merged = pd.merge(features, vle_grouped, how='left', on=['code_module', 'code_presentation', 'id_student'])           
    vle_cols = ['total_clicks', 'total_active_days', 'avg_clicks_per_day']
    merged[vle_cols] = merged[vle_cols].fillna(0)
    
    dtype_map = {
        'total_clicks':'int64',
        'total_active_days':'int64',
        'avg_clicks_per_day':'float64'
    }
    merged = merged.astype(dtype_map)  
    merged['days_registered_before_start'] = merged['date_registration']
    merged = merged[RESULT_FEATURES]
    merged.set_index('id_student', inplace=True)
    
    return merged
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):    
    students_df = students_df[STUDENT_FEATURES]
    registrations_df = registrations_df[REGISTRATION_FEATURES]
    students_registration_merged = pd.merge(students_df,registrations_df,on=['code_module','code_presentation','id_student'],how='left')
    vle_df = vle_df[vle_df['date']<cutoff_day]
    vle_df = vle_df.groupby(['code_module','code_presentation','id_student'],as_index=False).agg(total_clicks=('sum_click','sum'),total_active_days=('date','nunique'))
    vle_df['avg_clicks_per_day'] = vle_df['total_clicks']/vle_df['total_active_days']
    column_name = vle_df.columns.tolist()
    feature_vle_merged = pd.merge(students_registration_merged,vle_df,on=['code_module','code_presentation','id_student'],how='left')
    feature_vle_merged[column_name] = feature_vle_merged[column_name].fillna(0)
    feature_vle_merged = feature_vle_merged.astype({'total_clicks':'int64','total_active_days':'int64','avg_clicks_per_day':'float64'})
    feature_vle_merged['days_registered_before_start']=feature_vle_merged['date_registration']
    result = feature_vle_merged[RESULT_FEATURES]
    result = result.set_index('id_student')
    return result
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):
    ###
    ### YOUR CODE HERE
    ###
    df = students_df[STUDENT_FEATURES].copy()
    reg_subset = registrations_df[REGISTRATION_FEATURES]
    df = df.merge(reg_subset, on=['code_module', 'code_presentation', 'id_student'], how='left')
    
    vle = vle_df[vle_df['date'] < cutoff_day].copy()
    
    vle_stats = vle.groupby(['code_module', 'code_presentation', 'id_student']).agg(
        total_clicks=('sum_click', 'sum'),
        total_active_days=('date', 'nunique')
    ).reset_index()
    
    vle_stats['avg_clicks_per_day'] = vle_stats['total_clicks'] / vle_stats['total_active_days']
    
    df = df.merge(vle_stats, on=['code_module', 'code_presentation', 'id_student'], how='left')
    
    cols = ['total_clicks', 'total_active_days', 'avg_clicks_per_day']
    df[cols] = df[cols].fillna(0)
    
    df['total_clicks'] = df['total_clicks'].astype('int64')
    df['total_active_days'] = df['total_active_days'].astype('int64')
    df['avg_clicks_per_day'] = df['avg_clicks_per_day'].astype('float64')
    
    df['days_registered_before_start'] = df['date_registration']
    #print(df.set_index('id_student'))
    #print(RESULT_FEATURES)
    
    return df[RESULT_FEATURES].set_index('id_student')
In [ ]:
def create_student_features(students_df, registrations_df, vle_df, cutoff_day): 
    students_df_1 = students_df.copy()
    students_df_1 = students_df_1[STUDENT_FEATURES]
    
    registrations_df_1 = registrations_df.copy()
    registrations_df_1 = registrations_df_1[REGISTRATION_FEATURES]
    
    merged = students_df_1.merge(registrations_df_1, on = ['code_module', 'code_presentation', 'id_student'], how = 'left')
    
    
    #step 3
    vle_df_1 = vle_df.copy()
    vle_df_1 = vle_df_1[vle_df_1['date'] < cutoff_day]
    
    grouped = vle_df_1.groupby(['code_module', 'code_presentation', 'id_student'])\
                        .agg(total_clicks = ('sum_click', 'sum'),
                            total_active_days = ('date', 'nunique'))\
                        .reset_index()
            
    grouped['avg_clicks_per_day'] = grouped['total_clicks']/grouped['total_active_days']
    
    #step4
    merged_new = merged.merge(grouped, on = ['code_module', 'code_presentation', 'id_student'], how = 'left')
    
    merged_new[['total_clicks', 'total_active_days', 'avg_clicks_per_day']] = merged_new[['total_clicks', 'total_active_days', 'avg_clicks_per_day']].fillna(0)
    merged_new = merged_new.astype({'total_clicks': int,
                                   'total_active_days': int,
                                   'avg_clicks_per_day': float})
    
    merged_new['days_registered_before_start'] = merged_new['date_registration']
    
    output = merged_new[RESULT_FEATURES]
    output = output.set_index('id_student')

    
    return output
In [ ]:
### Solution - Exercise 4  
def create_student_features(students_df, registrations_df, vle_df, cutoff_day):

    res = pd.merge(left=students_df[STUDENT_FEATURES],
                   right=registrations_df[REGISTRATION_FEATURES],
                   how='left',
                   on=['code_module', 'code_presentation', 'id_student'])
    
    vle_processed = (
        vle_df.query('`date` < @cutoff_day')
        .groupby(by=['code_module', 'code_presentation', 'id_student'])
        .agg({'sum_click': 'sum', 'date': 'nunique'})
        .rename({'sum_click': 'total_clicks', 'date': 'total_active_days'}, axis=1)
        .assign(avg_clicks_per_day = lambda x: x['total_clicks'] / x['total_active_days'])
        .reset_index(drop=False)
    )
    
    res = (
        pd.merge(
            left=res, 
            right=vle_processed, 
            how='left', 
            on=['code_module', 'code_presentation', 'id_student']
        ).fillna(
            {'total_clicks': 0, 'total_active_days': 0, 'avg_clicks_per_day': 0}
        ).astype(
            {'total_clicks': np.int64, 'total_active_days': np.int64, 'avg_clicks_per_day': np.float64}
        ).rename(
            {'date_registration': 'days_registered_before_start'}, 
            axis=1
        )
    )[RESULT_FEATURES].set_index('id_student')
        
    return res

Ex5¶

In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
    student_assessments_df = student_assessments_df[student_assessments_df['id_assessment'] == first_assessment_id]
    student_assessments_df = student_assessments_df[(student_assessments_df['date_submitted'] >= label_start_day) & (student_assessments_df['date_submitted'] <= label_end_day)]
    student_assessments_df = student_assessments_df[['id_student', 'code_module', 'code_presentation', 'date_submitted']]
    
    merged = students_df.merge(student_assessments_df, how = 'left', on=['id_student', 'code_module', 'code_presentation'])

    merged['label'] = merged['date_submitted'].notna().astype(int)
    return merged[['id_student', 'label', 'code_module', 'code_presentation']].set_index('id_student')
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
    
    df = student_assessments_df[student_assessments_df.id_assessment == first_assessment_id]
    df = df[(student_assessments_df.date_submitted >= label_start_day) & (student_assessments_df.date_submitted <= label_end_day)]
    
    COLS = ['id_student', 'code_module', 'code_presentation', 'date_submitted']
    df = df[COLS]
    
    merge_df = pd.merge(students_df, df, how='left', on=['id_student', 'code_module', 'code_presentation'])
    
    merge_df['label'] = merge_df['date_submitted'].notna().astype(int)
    new_cols = ['id_student','code_module','code_presentation','label']
    merge_df = merge_df[new_cols].set_index('id_student')
    
    return merge_df
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
    df = student_assessments_df[student_assessments_df['id_assessment'] == first_assessment_id].copy()
    
    submit_df = df[(df['date_submitted'] >= label_start_day) &
           (df['date_submitted'] <= label_end_day)][['id_student', 'code_module', 'code_presentation']]
    submit_df['sumbit'] = 1

    df_2 = students_df[['id_student', 'code_module', 'code_presentation']].copy()
    
    final = df_2.merge(submit_df, on=['id_student', 'code_module', 'code_presentation'],
                      how='left')
    
    final['label'] = final['sumbit'].fillna(0)
    
    final = final[['id_student', 'code_module', 'code_presentation', 'label']]
    final['label'] = final['label'].astype('int64')
    final = final.set_index('id_student')
    
    return final
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
    ###
    
    submission = student_assessments_df[student_assessments_df['id_assessment'] == first_assessment_id]
    submission = submission[ (label_start_day <= submission['date_submitted']) & (label_end_day >= submission['date_submitted'])]
    submission = submission[['id_student', 'code_module', 'code_presentation']]
    submission['label'] = 1
    
    merged = pd.merge(students_df[['id_student', 'code_module', 'code_presentation']], submission, how='left', on=['id_student', 'code_module', 'code_presentation'] )
    merged['label'] = merged['label'].fillna(0)
    merged['label'] = merged['label'].astype('int')
    merged = merged.set_index('id_student')
    
    return merged[['code_module', 'code_presentation', 'label']]
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
    sa = student_assessments_df[student_assessments_df['id_assessment']==first_assessment_id]
    sa = sa[sa['date_submitted'].between(label_start_day, label_end_day)]
    sa = sa[['id_student','code_module','code_presentation']]
    students_df = students_df[['id_student','code_module','code_presentation']]
    df = pd.merge(students_df,sa,on=['id_student','code_module','code_presentation'],how='left',indicator=True)
    df['label'] = (df['_merge']=='both').astype(int)
    df = df.drop(columns=['_merge'])
    df=df.set_index(['id_student'])
    return df
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
    ###
    ### YOUR CODE HERE
    ###
    sa_filtered = student_assessments_df[student_assessments_df['id_assessment'] == first_assessment_id]
    
    mask = (sa_filtered['date_submitted'] >= label_start_day) & (sa_filtered['date_submitted'] <= label_end_day)
    sub = sa_filtered[mask][['id_student', 'code_module', 'code_presentation']].copy()
    
    sub['label'] = 1
    
    student_subset = students_df[['id_student', 'code_module', 'code_presentation']]
    
    ans = student_subset.merge(sub, 
                                  on=['id_student', 'code_module', 'code_presentation'], 
                                  how='left')
    
    ans['label'] = ans['label'].fillna(0).astype('int64')
    
    return ans.set_index('id_student')[['code_module', 'code_presentation', 'label']]
In [ ]:
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):
    
    student_assessments_df_1 = student_assessments_df.copy()
    student_assessments_df_1 = student_assessments_df_1[student_assessments_df_1['id_assessment'] == first_assessment_id]

    student_assessments_df_1 = student_assessments_df_1[(student_assessments_df_1['date_submitted'] >= label_start_day) & (student_assessments_df_1['date_submitted'] <= label_end_day)]

    submission = student_assessments_df_1[['id_student', 'code_module', 'code_presentation']]

    students_df_1 = students_df.copy()
    students_df_1 = students_df[['id_student', 'code_module', 'code_presentation']]

    merged = pd.merge(students_df_1, submission, on = ['id_student', 'code_module', 'code_presentation'], how = 'inner', suffixes = ('', '_x'))
    
    merged['label'] = 1
    
    result = students_df_1.merge(merged, on = ['id_student', 'code_module', 'code_presentation'], how = 'left').fillna(0)
    
    result['label'] = result['label'].astype(int)
    result = result.set_index('id_student')

    return result
In [ ]:
### Solution - Exercise 5  
def create_submission_labels(students_df, student_assessments_df, first_assessment_id, label_start_day, label_end_day):   
    # req 2 bullet point 2 says to only keep columns id_student, code_module, code_presentation
    # however if we do that, then after left merging with students_df, we can't know which one is date_submitted is within the window
    # we should not do this until near the end
    su = (
        student_assessments_df
        .query('`id_assessment` == @first_assessment_id')
        .query('@label_start_day <= `date_submitted` <= @label_end_day')
    )
    
    res = pd.merge(left=students_df, right=su, how='left', on=['id_student', 'code_module', 'code_presentation'])
    res['label'] = np.where(np.isnan(res['date_submitted'].values), 0, 1)
    
    # direction says to return columns in the below order
#     return res[['id_student', 'label', 'code_module', 'code_presentation']].set_index('id_student')
    
    # however test cases want this order
    return res[['id_student', 'code_module', 'code_presentation', 'label']].set_index('id_student')

Ex6¶

In [ ]:
def analyze_class_imbalance(labels):
    total_samples = len(labels)
    positive_count = len(labels[labels == 1])
    negative_count = len(labels[labels == 0])
    positive_ratio = round(positive_count/total_samples, 4)
    imbalance_ratio = round(max(positive_count, negative_count) / min(positive_count, negative_count), 4) if positive_count!=0 and negative_count !=0 else np.inf
    majority_class = 1 if positive_count >= negative_count else 0
    return {
        'imbalance_ratio' : imbalance_ratio,
        'majority_class' : majority_class,
        'negative_count' : negative_count,
        'positive_count' : positive_count,
        'positive_ratio' : positive_ratio,
        'total_samples' : total_samples
    }
In [ ]:
def analyze_class_imbalance(labels):
    total_samples = labels.count()
    pos = labels.sum()
    
    neg = total_samples - pos
    
    
    ratio = round(pos / total_samples,4)
    
    if (pos == 0) or (neg == 0):
        imb = float('inf')
    else:
        imb = round(max(pos, neg) / min(pos, neg),4)
        
    
    if pos >= neg:
        maj = 1
    else:
        maj = 0
        
    result = {
        'total_samples': total_samples,
        'positive_count': pos,
        'negative_count':neg,
        'positive_ratio':ratio,
        'imbalance_ratio':imb,
        'majority_class':maj
    }
    
    return result
In [ ]:
def analyze_class_imbalance(labels):
    total_samples = len(labels)
    
    positive_count = (labels == 1).sum()
    negative_count = (labels == 0).sum()
    
    positive_ratio = round(positive_count / total_samples, 4)
    
    if positive_count == 0 or negative_count == 0:
        imbalance_ratio = float('inf')
    else:
        imbalance_ratio = round(max(positive_count, negative_count) 
                                / min(positive_count, negative_count), 4)
    
    if positive_count >= negative_count:
        majority_class = 1
        
    else:
        majority_class = 0
        
    return {'total_samples': total_samples,
           'positive_count': positive_count,
           'negative_count': negative_count,
           'positive_ratio': positive_ratio,
           'imbalance_ratio': imbalance_ratio,
           'majority_class': majority_class}
In [ ]:
def analyze_class_imbalance(labels):
    ###
    from collections import Counter
    counts = Counter(labels)
    
    total_samples = len(labels)
    positive_count = counts[1]
    negative_count = counts[0]
    positive_ratio = round(positive_count / total_samples,4)
    
    if positive_count == 0 or negative_count == 0:
        imbalance_ratio = float('inf')
    else:
        imbalance_ratio = round(max(positive_count, negative_count) / min(positive_count, negative_count), 4)
    
    if positive_count >= negative_count:
        majority_class = 1
    else:
        majority_class = 0
    
    return {
        'total_samples' : total_samples,
        'positive_count' : positive_count,
        'negative_count' : negative_count,
        'positive_ratio': positive_ratio,
        'imbalance_ratio' : imbalance_ratio,
        'majority_class': majority_class
    }
In [ ]:
def analyze_class_imbalance(labels):
    res = dict()
    res['total_samples']=len(labels)
    res['positive_count']=len(labels[labels==1])
    res['negative_count']=res['total_samples']-res['positive_count']
    res['positive_ratio']=float(round(res['positive_count']/res['total_samples'],4))
    if res['positive_count']==0 or res['negative_count']==0:
        res['imbalance_ratio']=float('inf')
    else:
        res['imbalance_ratio']=float(round(max(res['positive_count'],res['negative_count'])/min(res['positive_count'],res['negative_count']),4))
    
    if res['positive_count']>=res['negative_count']:
        res['majority_class']=1
    else:
        res['majority_class']=0
    return res
In [ ]:
def analyze_class_imbalance(labels):
    ###
    ### YOUR CODE HERE
    ###
    ts = len(labels)
    pc = (labels == 1).sum()
    nc = (labels == 0).sum()
    pr = round(pc / ts, 4) if ts > 0 else 0.0
    
    if pc == 0 or nc == 0:
        ir = float('inf')
    else:
        ir = round(max(pc, nc) / min(pc, nc), 4)
        
    if pc >= nc:
        mc = 1
    else:
        mc = 0
        
    return {
        'total_samples': ts,
        'positive_count': pc,
        'negative_count': nc,
        'positive_ratio': pr,
        'imbalance_ratio': ir,
        'majority_class': mc
    }
In [ ]:
def analyze_class_imbalance(labels):
    total_samples = len(labels)
    positive_count = labels.sum()
    negative_count = total_samples - positive_count
    
    positive_ratio = round(positive_count/total_samples, 4)
    
    if positive_count == 0 or negative_count == 0:
        imbalance_ratio = float('inf')
    else:
        imbalance_ratio = round(max(positive_count, negative_count)/min(positive_count, negative_count), 4)
    
    if positive_count >= negative_count:
        majority_class = 1
    else: 
        majority_class = 0
        
    
    return {
        'imbalance_ratio': imbalance_ratio,
        'majority_class': majority_class,
        'negative_count': negative_count,
        'positive_count': positive_count,
        'positive_ratio': positive_ratio,
        'total_samples': total_samples
    
    }
In [ ]:
### Solution - Exercise 6  
def analyze_class_imbalance(labels):
    from math import inf
    total_samples = len(labels)
    vcounts = labels.value_counts()
    if len(vcounts) == 2:
        pos_count, neg_count = vcounts.loc[1], vcounts.loc[0]
    else:
        if vcounts.index.isin([1]):
            pos_count, neg_count = vcounts.loc[1], 0
        else:
            pos_count, neg_count = 0, vcounts.loc[0]
        
    pos_ratio = round(pos_count / total_samples, 4)
    imb_ratio = inf if (pos_count == 0 or neg_count == 0) else round(max(pos_count, neg_count) / min(pos_count, neg_count), 4)
    maj_class = 1 if pos_ratio >= 0.5 else 0
    return {'total_samples': total_samples,
            'positive_count': pos_count,
            'negative_count': neg_count,
            'positive_ratio': pos_ratio,
            'imbalance_ratio': imb_ratio,
            'majority_class': maj_class}

Ex7¶

In [ ]:
def compute_feature_correlations(features_df, outcome_series, numerical_features):
    outcome_df = pd.DataFrame({'outcome' : outcome_series.values}).set_index(outcome_series.index)
    combined = features_df.merge(outcome_df, on = 'id_student')
    corr_df = combined.corr()
    correlations = []
    for feature in numerical_features:
        correlations.append(round(corr_df.at[feature, 'outcome'], 4))
    abs_correlations = [abs(corr) for corr in correlations]
    return pd.DataFrame({'feature_name' : numerical_features, 'correlation' : correlations, 'abs_correlation' : abs_correlations}).sort_values('abs_correlation', ascending=False).dropna().reset_index(drop=True)
In [ ]:
def compute_feature_correlations(features_df, outcome_series, numerical_features):
    
    df = features_df.merge(outcome_series.rename('outcome'), how='left', 
                           left_index=True, right_index=True)
    result_df = df.corr(min_periods = 2)
    
    corrs = []
    for f in numerical_features:
        vals = pd.Series(df[f])
        corrs.append(round(result_df.at[f, 'outcome'],4))
    
    abs_corrs = [abs(c) for c in corrs]
    
    result = {
        'feature_name':numerical_features,
        'correlation':corrs,
        'abs_correlation':abs_corrs
    }
    
    result = pd.DataFrame(result)
    
    result = result.sort_values(by = ['abs_correlation'], ascending = False)
    result.reset_index(drop=True, inplace = True)
    result.dropna(inplace = True)
    
    return result
In [ ]:
def compute_feature_correlations(features_df, outcome_series, numerical_features):
    common_index = features_df.index.intersection(outcome_series.index)
    
    X = features_df.loc[common_index]
    y = outcome_series.loc[common_index]
    
    results = []
    
    for feature in numerical_features:
        feat_values = X[feature]
        
        count = feat_values.count()
        std_dev = feat_values.std()
        
        corr_val = 0.0
        if count >= 2 and std_dev > 0:
            corr_val = feat_values.corr(y)
            
            if pd.isna(corr_val):
                corr_val = 0.0

            rounded_corr = round(float(corr_val), 4)
            abs_corr = round(abs(rounded_corr), 4)
            
            results.append({
                'feature_name': feature,
                'correlation': rounded_corr,
                'abs_correlation': abs_corr
            })
    
    corr_df = pd.DataFrame(results)
    corr_df = corr_df.sort_values(by='abs_correlation', ascending=False)
    corr_df.index = sorted(corr_df.index)
    
    return corr_df
In [ ]:
def compute_feature_correlations(features_df, outcome_series, numerical_features):
    features, outcome = features_df.align(outcome_series, join='inner', axis=0)    
    res = []
    for feature in numerical_features:        
        x = features[feature]        
        df = pd.concat([x,outcome],axis=1)
        if x.isna().sum()!=0:
            corr=0
        else:
            if x.std()> 0:
                corr=df.corr(method='pearson',min_periods=3).iloc[0,1]              

        res.append({
            'feature_name': feature,
            'correlation': round(corr, 4),
            'abs_correlation': round(abs(corr), 4)
        })

    result_df = pd.DataFrame(res)
    result_df = result_df.sort_values(by='abs_correlation', ascending=False).reset_index(drop=True)

    return result_df
In [ ]:
def compute_feature_correlations(features_df, outcome_series, numerical_features):
    features_df_new = features_df.reset_index()
    outcome_series_new = outcome_series.reset_index()
    
    merged = features_df_new.merge(outcome_series_new, on = 'id_student', how = 'inner')
    merged = merged.rename(columns = {0: 'target'})
    
    from collections import defaultdict
    
    output = defaultdict(list)
    
    for feature in numerical_features:
        a_feature = merged[feature]
#         a_feature = a_feature.fillna(0)
        num_values = a_feature.nunique()
        std_values = a_feature.std()
        
        if num_values >= 2 and std_values > 0:
            correlation = round(a_feature.corr(outcome_series_new.iloc[:, 1], method = 'pearson'), 4)
            abs_correlation = round(abs(correlation), 4)
            
            output['feature_name'].append(feature)
            output['correlation'].append(correlation)
            output['abs_correlation'].append(abs_correlation)
            
            
    output = pd.DataFrame(output).sort_values(by = 'abs_correlation', ascending = False).reset_index(drop = True)
        
    
    return output
In [ ]:
### Solution - Exercise 7  
def compute_feature_correlations(features_df, outcome_series, numerical_features):
    res = (
        pd.merge(left=features_df[numerical_features], right=pd.DataFrame({'binary':outcome_series}), how='inner', on='id_student')
    )
    
    res = (
        res.corr()[['binary']]
        .reset_index()
        .rename({'index': 'feature_name', 'binary': 'correlation'}, axis=1)
    )
    
    res = (
        res
        .assign(abs_correlation=lambda x: x['correlation'].abs())
        .query('`feature_name` != "binary"')
        .sort_values('abs_correlation', ascending=False)
    ).round(4).reset_index(drop=True)[['feature_name', 'correlation', 'abs_correlation']]
    
    return res.dropna()

Ex8¶

In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
                                 first_assessment_id, current_day, cutoff_day):

    target_module = students_df['code_module'].iloc[0]
    target_presentation = students_df['code_presentation'].iloc[0]

    course_students = students_df[
        (students_df['code_module'] == target_module) &
        (students_df['code_presentation'] == target_presentation)
    ].copy()

    course_assessments = student_assessments_df[
        (student_assessments_df['code_module'] == target_module) &
        (student_assessments_df['code_presentation'] == target_presentation) &
        (student_assessments_df['id_assessment'] == first_assessment_id)
    ].copy()

    submitted_by_current = course_assessments[
        course_assessments['date_submitted'] <= current_day
    ]['id_student'].unique()

    train_labels = pd.Series(
        course_students['id_student'].isin(submitted_by_current).astype(int).values,
        index=course_students['id_student']
    )
    train_labels.index.name = 'id_student'

    test_students = course_students[
        ~course_students['id_student'].isin(submitted_by_current)
    ]

    submitted_after_current = course_assessments[
        (course_assessments['date_submitted'] > current_day) &
        (course_assessments['date_submitted'] <= cutoff_day)
    ]['id_student'].unique()

    test_labels = pd.Series(
        test_students['id_student'].isin(submitted_after_current).astype(int).values,
        index=test_students['id_student']
    )
    test_labels.index.name = 'id_student'

    return train_labels, test_labels
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
                                 first_assessment_id, current_day, cutoff_day):
    ###
    ### YOUR CODE HERE
    ###
    target_module = students_df['code_module'].iloc[0]
    target_presentation = students_df['code_presentation'].iloc[0]

    course_students = students_df.copy()
    
    course_students = course_students[
        (course_students['code_module'] == target_module) & 
        (course_students['code_presentation']==target_presentation)
    ]
    
    course_assessments = student_assessments_df[
        (student_assessments_df['code_module'] == target_module) &
        (student_assessments_df['code_presentation'] == target_presentation) &
        (student_assessments_df['id_assessment'] == first_assessment_id)
    ].copy()

    submitted_by_current = course_assessments[
        course_assessments['date_submitted'] <= current_day
    ]['id_student'].unique()

#     submitted_after_current = course_assessments[
#         course_assessments['date_submitted'] > current_day
#     ]['id_student'].unique()
   
    train_labels = pd.Series(
        course_students['id_student'].isin(submitted_by_current).astype(int).values,
        index=course_students['id_student']
    )
    
    submitted_after_current = course_assessments[
        (course_assessments['date_submitted'] > current_day) &
        (course_assessments['date_submitted'] <= cutoff_day)
    ]['id_student'].unique()
    
    test_students = course_students[~course_students['id_student'].isin(submitted_by_current)]
    
    test_labels = pd.Series(
        test_students['id_student'].isin(submitted_after_current).astype(int).values,
        index=test_students['id_student']
    )

    return train_labels, test_labels
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
                                 first_assessment_id, current_day, cutoff_day):
    ###
    ### YOUR CODE HERE
    ###
    target_module = students_df['code_module'].iloc[0]
    target_presentation = students_df['code_presentation'].iloc[0]

    course_students = students_df[(students_df['code_module'] == target_module) & 
        (students_df['code_presentation'] == target_presentation)].copy()

    course_assessments = student_assessments_df[
        (student_assessments_df['code_module'] == target_module) &
        (student_assessments_df['code_presentation'] == target_presentation) &
        (student_assessments_df['id_assessment'] == first_assessment_id)
    ].copy()

    submitted_by_current = course_assessments[
        course_assessments['date_submitted'] <= current_day
    ]['id_student'].unique()

    submitted_after_current = course_assessments[
        (course_assessments['date_submitted'] > current_day) & 
        (course_assessments['date_submitted'] <= cutoff_day)
    ]['id_student'].unique()

    train_labels = pd.Series(
        course_students['id_student'].isin(submitted_by_current).astype(int).values,
        index=course_students['id_student']
    )
    
    train_labels.index.name = 'id_student'
    
    test_students = course_students[train_labels.values == 0]

    test_labels = pd.Series(
        test_students['id_student'].isin(submitted_after_current).astype(int).values,
        index=test_students['id_student']
    )
    test_labels.index.name = 'id_student'

    return train_labels, test_labels
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
                                 first_assessment_id, current_day, cutoff_day):
    ###
    ### 
    target_module = students_df['code_module'].iloc[0]
    target_presentation = students_df['code_presentation'].iloc[0]
    
    course_students = students_df[
        (students_df['code_module'] == target_module) &
        (students_df['code_presentation'] == target_presentation)
    ].copy()

    course_assessments = student_assessments_df[
        (student_assessments_df['code_module'] == target_module) &
        (student_assessments_df['code_presentation'] == target_presentation) &
        (student_assessments_df['id_assessment'] == first_assessment_id)
    ].copy()
    
    # train
    submitted_by_current = course_assessments[
        course_assessments['date_submitted'] <= current_day
    ]['id_student'].unique()
    
    train_labels = pd.Series(0, index=course_students['id_student'])
    train_labels.loc[train_labels.index.isin(submitted_by_current)] = 1
    train_labels.index.name = 'id_student'
    train_labels = train_labels.astype(int)
    
    # test
    test_population_ids = train_labels[train_labels == 0].index

    submitted_after_current = course_assessments[
        (course_assessments['date_submitted'] > current_day) &
        (course_assessments['date_submitted'] <= cutoff_day)
    ]['id_student'].unique()
    
    test_labels = pd.Series(0, index=test_population_ids)
    test_labels.loc[test_labels.index.isin(submitted_after_current)] = 1
    test_labels.index.name = 'id_student'

#     train_labels = pd.Series(
#         course_students['id_student'].isin(submitted_by_current).astype(int).values,
#         index=course_students['id_student']
#     )

#     test_labels = pd.Series(
#         course_students['id_student'].isin(submitted_after_current).astype(int).values,
#         index=course_students['id_student']
#     )
#     test_labels.index.name = 'id_student'

    return train_labels, test_labels
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
                                first_assessment_id, current_day, cutoff_day):
    target_module = students_df['code_module'].iloc[0]
    target_presentation = students_df['code_presentation'].iloc[0]

    course_students = students_df[
        (students_df['code_module'] == target_module) &
        (students_df['code_presentation'] == target_presentation)
    ].copy()

    course_assessments = student_assessments_df[
        (student_assessments_df['code_module'] == target_module) &
        (student_assessments_df['code_presentation'] == target_presentation) &
        (student_assessments_df['id_assessment'] == first_assessment_id)
    ].copy()

    submitted_by_current = course_assessments[
        course_assessments['date_submitted'] <= current_day
    ]['id_student'].unique()

    train_labels = pd.Series(
        course_students['id_student'].isin(submitted_by_current).astype(int).values,
        index=course_students['id_student']
    )
    train_labels.index.name = 'id_student'

    remaining_students = course_students[~course_students['id_student'].isin(submitted_by_current)]

    submitted_in_window = course_assessments[
        (course_assessments['date_submitted'] > current_day) &
        (course_assessments['date_submitted'] <= cutoff_day)
    ]['id_student'].unique()

    test_labels = pd.Series(
        remaining_students['id_student'].isin(submitted_in_window).astype(int).values,
        index=remaining_students['id_student']
    )
    test_labels.index.name = 'id_student'

    return train_labels, test_labels
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
                                 first_assessment_id, current_day, cutoff_day):
    ###
    ### YOUR CODE HERE
    ###
    target_module = students_df['code_module'].iloc[0]
    target_presentation = students_df['code_presentation'].iloc[0]

    course_students = students_df[(students_df['code_module'] == target_module) 
                                  & (students_df['code_presentation'] == target_presentation)].copy()

    course_assessments = student_assessments_df[
        (student_assessments_df['code_module'] == target_module) &
        (student_assessments_df['code_presentation'] == target_presentation) &
        (student_assessments_df['id_assessment'] == first_assessment_id)
    ].copy()

    submitted_by_current = course_assessments[
        course_assessments['date_submitted'] <= current_day
    ]['id_student'].unique()

    submitted_after_current = course_assessments[
        course_assessments['date_submitted'] > current_day
    ]['id_student'].unique()
    train_labels = pd.Series(
        course_students['id_student'].isin(submitted_by_current).astype(int).values,
        index=course_students['id_student']
    )
    train_labels.index.name = 'id_student'
    
    test_student_ids = course_students[~course_students['id_student'].isin(submitted_by_current)]['id_student']
    
    submitted_in_window = course_assessments[
        (course_assessments['date_submitted'] > current_day) & 
        (course_assessments['date_submitted'] <= cutoff_day)
    ]['id_student'].unique()
    
    test_labels = pd.Series(
        test_student_ids.isin(submitted_in_window).astype(int).values,
        index=test_student_ids
    )
    test_labels.index.name = 'id_student'

    return train_labels, test_labels
In [ ]:
def DEBUG_create_temporal_split(students_df, student_assessments_df,
                                 first_assessment_id, current_day, cutoff_day):
    ###
    ### YOUR CODE HERE
    ###
    #3
    target_module = students_df['code_module'].iloc[0]
    target_presentation = students_df['code_presentation'].iloc[0]

#     course_students = students_df.copy()
    
    #4
    course_students = students_df[
        (students_df['code_module'] == target_module) &
        (students_df['code_presentation'] == target_presentation)
    ].copy()

    course_assessments = student_assessments_df[
        (student_assessments_df['code_module'] == target_module) &
        (student_assessments_df['code_presentation'] == target_presentation) &
        (student_assessments_df['id_assessment'] == first_assessment_id)
    ].copy()
    
    
    #5
    submitted_by_current = course_assessments[
        course_assessments['date_submitted'] <= current_day
    ]['id_student'].unique()

    submitted_after_current = course_assessments[
        (course_assessments['date_submitted'] > current_day)&
        (course_assessments['date_submitted'] <= cutoff_day)
    ]['id_student'].unique()

    train_labels = pd.Series(
        course_students['id_student'].isin(submitted_by_current).astype(int).values,
        index=course_students['id_student']
    )
    train_labels.index.name = 'id_student'
    
#     test_labels = pd.Series(
#         ((~course_students['id_student'].isin(submitted_by_current)) & (course_students['id_student'].isin(submitted_after_current))).astype(int).values,
#         index = course_students['id_student']
#     )
    
    test_labels = pd.Series(
        course_students['id_student'].isin(submitted_after_current).astype(int).values,
        index=course_students['id_student']
    )
    test_labels.index.name = 'id_student'
    
    train_labels_0 = train_labels[train_labels == 0].index
    test_labels = test_labels[test_labels.index.isin(train_labels_0)]
    

    return train_labels, test_labels
In [ ]:
### Solution - Exercise 8  
def DEBUG_create_temporal_split(students_df, student_assessments_df,
                                 first_assessment_id, current_day, cutoff_day):

    target_module = students_df['code_module'].iloc[0]
    target_presentation = students_df['code_presentation'].iloc[0]
    course_students = students_df.query('`code_module` == @target_module and `code_presentation` == @target_presentation')

    course_assessments = student_assessments_df[
        (student_assessments_df['code_module'] == target_module) &
        (student_assessments_df['code_presentation'] == target_presentation) &
        (student_assessments_df['id_assessment'] == first_assessment_id)
    ].copy()


    submitted_by_current = course_assessments.query('`date_submitted` <= @current_day')['id_student'].unique()
    submitted_after_current = course_assessments.query('`date_submitted` > @current_day and `date_submitted` <= @cutoff_day')['id_student'].unique()

    
    train_labels = pd.Series(course_students['id_student'].isin(submitted_by_current).astype(int).values,
                            index=course_students['id_student'])
    
    left = course_students.query('`id_student` not in @submitted_by_current')
    
#     display(left)
    test_labels = pd.Series(
        (
            (left['id_student'].isin(submitted_after_current))
        ).astype(int).values, index=left['id_student'])

    return train_labels, test_labels

Ex9¶

In [ ]:
def calculate_metrics(y_pred_proba, y_true):
    y_pred = (y_pred_proba >= 0.5).astype(int)
    
    t_pos = sum((y_pred == 1) & (y_true == 1))
    f_pos = sum((y_pred == 1) & (y_true == 0))
    t_neg = sum((y_pred == 0) & (y_true == 0))
    f_neg = sum((y_pred == 0) & (y_true == 1))
    
    precision = t_pos / (t_pos + f_pos) if t_pos + f_pos > 0 else 0.0
    recall = t_pos / (t_pos + f_neg) if t_pos + f_neg > 0 else 0.0
    
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0
    return {
        'f1_score' : round(f1_score, 4),
        'precision' : round(precision, 4),
        'recall' : round(recall, 4)
    }
In [ ]:
def calculate_metrics(y_pred_proba, y_true):
    
    prediction = np.where(y_pred_proba > 0.5,1,0)
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    
    pos = False
    act = False
    
    for pred,true in zip(prediction, y_true):
        if (pred == 1) & (true == 1):
            pos = True
            act = True
            TP += 1
        elif (pred == 1) & (true == 0):
            pos = True
            FP += 1
        elif (pred == 0) & (true == 1):
            act = True
            FN += 1
        elif (pred == 0) & (true == 0):
            TN += 1
    
    if pos == 0:
        precision = 0
    else:
        precision = TP / (TP+FP)
        
    if act == False:
        recall = 0
    else:
        recall = TP / (TP + FN)
    
    if (precision == 0) & (recall == 0):
        F1 = 0
    else:
        F1 = 2 * (precision * recall) / (precision + recall)
    
    result = {
        'precision':round(precision,4),
        'recall':round(recall,4),
        'f1_score':round(F1,4)
    }
    
    return result
In [ ]:
def calculate_metrics(y_pred_proba, y_true):
    y_pred = [1 if p>=0.5 else 0 for p in y_pred_proba]
    
    tp = sum(1 for pred, true in zip(y_pred, y_true) if pred==1 and true==1)
    fp = sum(1 for pred, true in zip(y_pred, y_true) if pred==1 and true==0)
    fn = sum(1 for pred, true in zip(y_pred, y_true) if pred==0 and true==1)
    
    precision = tp / (tp+fp) if (tp+fp) != 0 else 0
    recall = tp / (tp+fn) if (tp+fn) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    
    return {'precision': round(precision, 4),
        'recall': round(recall, 4),
        'f1_score': round(f1_score, 4)}
In [ ]:
### Solution - Exercise 9  
def calculate_metrics(y_pred_proba, y_true):
    ###
    import numpy as np
    y_pred = np.where(y_pred_proba >= 0.5, 1, 0)
    
    tp = np.sum((y_pred == 1) & (y_true == 1))
    fp = np.sum((y_pred == 1) & (y_true == 0))
    tn = np.sum((y_pred == 0) & (y_true == 0))
    fn = np.sum((y_pred == 0) & (y_true == 1))
    
    if (tp + fp) > 0:
        precision = tp / (tp + fp)
    else:
        precision = 0.0
    
    if (tp + fn) > 0:
        recall = tp / (tp + fn)
    else:
        recall = 0.0
    
    if precision == 0.0 and recall == 0.0:
        f1_score = 0.0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)
    
    return {
        'precision' : round(precision, 4),
        'recall': round(recall, 4),
        'f1_score': round(f1_score, 4)
    }
In [ ]:
def calculate_metrics(y_pred_proba, y_true):
    y_pred_proba = [1 if c>=0.5 else 0 for c in y_pred_proba]
    data = {'prediction':y_pred_proba,'actual':y_true}
    df=pd.DataFrame(data)
    tp = len(df[(df['prediction']==1) & (df['actual']==1)])
    fp = len(df[(df['prediction']==1) & (df['actual']==0)])
    tn = len(df[(df['prediction']==0) & (df['actual']==0)])
    fn = len(df[(df['prediction']==0) & (df['actual']==1)])
    
    if tp+fp ==0:
        precision = 0
    else:
        precision = tp / (tp +fp)
        
    if tp+fn==0:
        recall = 0
    else:
        recall = tp/(tp+fn)
    
    if precision ==0 and recall==0:
        f1_score=0
    else:
        f1_score = 2*(precision*recall)/ (precision + recall)
    
    res = dict()
    res['precision']=float(round(precision,4))
    res['recall']=float(round(recall,4))
    res['f1_score']=float(round(f1_score,4))
    
    return res
In [ ]:
def calculate_metrics(y_pred_proba, y_true):
    y_pred = (y_pred_proba >= 0.5).astype(int)
    y_true_cp = np.array(y_true)
    
    tp = np.sum((y_pred == 1) & (y_true_cp == 1))
    fp = np.sum((y_pred == 1) & (y_true_cp == 0))
    fn = np.sum((y_pred == 0) & (y_true_cp == 1))
    
    if (tp + fp) == 0:
        precision = 0.0
    else:
        precision = tp / (tp + fp)
        
    if (tp + fn) == 0:
        recall = 0.0
    else:
        recall = tp / (tp + fn)
        
    if (precision + recall) == 0:
        f1_score = 0.0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)
        
    return {
        'precision': round(float(precision), 4),
        'recall': round(float(recall), 4),
        'f1_score': round(float(f1_score), 4)
    }
In [ ]:
def calculate_metrics(y_pred_proba, y_true):
    prediction = np.where(y_pred_proba >= 0.5, 1, 0)
    
    TP = ((prediction == 1) & (y_true == 1)).sum()
    FP = ((prediction == 1) & (y_true == 0)).sum()
    TN = ((prediction == 0) & (y_true == 0)).sum()
    FN = ((prediction == 0) & (y_true == 1)).sum()
    
    if TP == 0 and FP == 0:
        precision = 0.0
    else:
        precision = TP/(TP+FP)
        
    if TP ==0 and FN == 0:
        recall = 0.0
    else:   
        recall = TP/(TP+FN)
    
    if precision == 0 and recall == 0:
        f1_score = 0.0
    else:
        f1_score = round(2*(precision*recall)/(precision + recall), 4)  
    
    return {
        'f1_score': f1_score,
        'precision': round(precision, 4),
        'recall': round(recall, 4)
    
    }
In [ ]:
### Solution - Exercise 9  
def calculate_metrics(y_pred_proba, y_true):
    y_pred = np.where(y_pred_proba >= 0.5, 1, 0)
    
    tp = np.sum(y_pred & y_true)
    tn = np.sum(np.where((y_pred == 0) & (y_true == 0), 1, 0))
    fp = np.sum(np.where((y_pred == 1) & (y_true == 0), 1, 0))
    fn = np.sum(np.where((y_pred == 0) & (y_true == 1), 1, 0))
    
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0.0
    
    f1 = 2 * (precision * recall) / (precision + recall) if (recall != 0.0 or precision != 0) else 0.0
    
    return {'precision': round(precision, 4), 'recall': round(recall, 4), 'f1_score': round(f1, 4)}

Ex10¶

In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):

    y_pred_proba = np.array(y_pred_proba)
    y_true = np.array(y_true)

    k = int(np.ceil(k_percent * len(y_true)))

    sorted_indices = np.argsort(y_pred_proba)[::-1]
    top_k_indices = sorted_indices[:k]

    top_k_labels = y_true[top_k_indices]

    precision = np.sum(top_k_labels == 1) / k

    return round(float(precision), 2)
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
    ###
    ### YOUR CODE HERE
    ###
    y_pred_proba = np.array(y_pred_proba)
    y_true = np.array(y_true)

    k = int(np.ceil(len(y_pred_proba) * k_percent))
    
    if k < 1:
        k = 1

    sorted_indices = np.argsort(y_pred_proba)[::-1]
    
    top_k_indices = sorted_indices[:k]
    top_k_labels = y_true[top_k_indices]
    
    precision = np.sum(top_k_labels) / k
    precision = np.round(precision, 2)
    precision.astype(float)
    
    return precision
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
    ###
    ### YOUR CODE HERE
    ###
    import math 
    y_pred_proba = np.array(y_pred_proba)
    y_true = np.array(y_true)

    k = math.ceil(len(y_pred_proba) * k_percent)
    
    sorted_indices = np.argsort(y_pred_proba)[::-1]

    top_k_indices = sorted_indices[:k]

    top_k_labels = y_true[top_k_indices]

    precision = np.sum(top_k_labels) / k

    return round(precision, 2)
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
    ###
    ### YOUR CODE HERE
    ###
    y_pred_proba = np.array(y_pred_proba)
    y_true = np.array(y_true)
    n = len(y_true)

    k = int(np.ceil(k_percent * n))


    if k < 1:
        k = 1

    sorted_indices = np.argsort(y_pred_proba)[::-1]

    top_k_indices = sorted_indices[:k]

    top_k_labels = y_true[top_k_indices]

    precision = np.sum(top_k_labels) / k

    return round(precision,2)
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
    import math
    y_pred_proba = np.array(y_pred_proba)
    y_true = np.array(y_true)

    k = math.ceil(len(y_pred_proba) * k_percent)

    if k < 1:
        k = 1

    sorted_indices = np.argsort(y_pred_proba)[::-1]

    top_k_indices = sorted_indices[:k]

    top_k_labels = y_true[top_k_indices]

    precision = float(round(np.sum(top_k_labels) / k,2))

    return precision
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):

    import math
    y_pred_proba = np.array(y_pred_proba)
    y_true = np.array(y_true)

    k = math.ceil(len(y_pred_proba) * k_percent)

    if k < 1:
        k = 1

    sorted_indices = np.argsort(y_pred_proba)[::-1]

    top_k_indices = sorted_indices[:k]

    top_k_labels = y_true[top_k_indices]

    precision = float(np.sum(top_k_labels) / k)

    return round(precision, 2)
In [ ]:
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
    ###
    ### YOUR CODE HERE
    ###
    import math
    y_pred_proba = np.array(y_pred_proba)
    y_true = np.array(y_true)

    k = math.ceil(len(y_pred_proba) * k_percent)   # int

    if k < 1:
        k = 1

    sorted_indices = np.argsort(y_pred_proba)[::-1] # [::-1]

    top_k_indices = sorted_indices[:k]

    top_k_labels = y_true[top_k_indices]
    top_k_pred = y_pred_proba[top_k_indices]
    
    top_k_pred = np.where(top_k_pred >= 0.5, 1, 0)
    
    
    num_tp_in_topk = ((top_k_labels == 1) & (top_k_pred == 1)).sum()
    
    top_k_precision = float(round(num_tp_in_topk/k, 2))
    
#     precision = np.sum(top_k_labels) / k

    return top_k_precision
In [ ]:
### Solution - Exercise 10  
def DEBUG_calculate_topk_precision(y_pred_proba, y_true, k_percent):
    from math import ceil
    y_pred_proba = np.array(y_pred_proba)
    y_true = np.array(y_true)

    k = ceil(len(y_pred_proba) * k_percent)

    if k < 1:
        k = 1

    sorted_indices = np.argsort(y_pred_proba)[::-1]

    top_k_indices = sorted_indices[:k]

    top_k_labels = y_true[top_k_indices]
    
    top_k_preds = y_pred_proba[top_k_indices]
    
#     display(y_true, y_pred_proba, sorted_indices, top_k_indices, top_k_preds, top_k_labels)
#     precision = np.sum(top_k_labels) / k

    precision = calculate_metrics(y_pred_proba=top_k_preds, y_true=top_k_labels)['precision']

    return round(precision, 2)