SP2026 FX Alternative Solutions

Ex1¶

In [ ]:
# Ex1: cleanse the raw GBIF observations.
# Inner SELECT renames columns, keeps rows with coordinate uncertainty
# <= 25 m outside Hawaii/Alaska, and attaches a per-species row count via a
# window function; the outer SELECT keeps species with >= 25 qualifying rows
# and drops rows with any missing taxonomy/state field.
# "order" is double-quoted because ORDER is a reserved word.
# NOTE(review): the derived table has no alias — SQLite accepts this, but
# MySQL/PostgreSQL require one; confirm the target engine.
cleanse_observations_query = '''

SELECT 

class,"order",family,genus,species,state,
latitude,longitude,coordinate_uncertainty,day,month,year

FROM

(SELECT 
class,"order",family,genus,species,
stateProvince AS state,
decimalLatitude AS latitude,
decimalLongitude AS longitude,
coordinateUncertaintyInMeters AS coordinate_uncertainty,
day,month,year, 
COUNT(*) OVER (PARTITION by species) as sp_count

FROM observations

WHERE coordinateUncertaintyInMeters <= 25
AND stateProvince NOT IN ('Hawaii','Alaska')
AND species IS NOT NULL)

WHERE sp_count >= 25
AND class IS NOT NULL
AND "order" IS NOT NULL
AND family IS NOT NULL
AND genus IS NOT NULL
AND species IS NOT NULL
AND state IS NOT NULL
'''
In [ ]:
### Solution - Exercise 1  
# CTE version: `data_with_counts` applies all the row filters and attaches a
# per-species window count; the outer SELECT renames columns and keeps
# species with at least 25 qualifying rows.
# NOTE(review): identifiers are backtick-quoted (`order`, `class`) — MySQL
# style quoting. SQLite accepts backticks too, but standard SQL uses "order";
# confirm the target engine.
cleanse_observations_query = '''

with data_with_counts as (
    select 
        class,
        `order`, 
        family, 
        genus, 
        species, 
        stateprovince, 
        decimallatitude, 
        decimallongitude, 
        coordinateuncertaintyinmeters, 
        day, 
        month, 
        year,
        count(*) over(
        partition by 
            species
        ) as species_count

    from
        observations
    where
        coordinateuncertaintyinmeters <= 25
        and stateprovince not in ('Hawaii', 'Alaska')
        and `class` is not null 
        and `order` is not null
        and family is not null
        and genus is not null
        and species is not null
        and stateprovince is not null
    )
    
    select 
        `class`,
        `order`, 
        family, 
        genus, 
        species, 
        stateprovince as state, 
        decimallatitude as latitude, 
        decimallongitude as longitude, 
        coordinateuncertaintyinmeters as coordinate_uncertainty, 
        day, 
        month, 
        year
        
    from
        data_with_counts
    where
        species_count >= 25

'''
In [ ]:
### Solution - Exercise 1  
# Subquery version: the aliased derived table `sub` carries the per-species
# window count; the outer SELECT keeps species observed >= 25 times after
# the quality filters (uncertainty <= 25 m, contiguous-US, no NULL taxonomy).
cleanse_observations_query = '''
SELECT class,
    "order",
    family,
    genus,
    species,
    state,
    latitude,
    longitude,
    coordinate_uncertainty,
    day,
    month,
    year
FROM (
    SELECT class,
        "order",
        family,
        genus,
        species,
        stateProvince AS state,
        decimalLatitude AS latitude,
        decimalLongitude AS longitude,
        coordinateUncertaintyInMeters AS coordinate_uncertainty,
        day,
        month,
        year,
        COUNT(*) OVER (PARTITION BY species) AS species_count
    FROM observations
    WHERE class IS NOT NULL
        AND "order" IS NOT NULL
        AND family IS NOT NULL
        AND genus IS NOT NULL
        AND species IS NOT NULL
        AND stateProvince IS NOT NULL
        AND coordinateUncertaintyInMeters <= 25
        AND stateProvince NOT IN ('Hawaii', 'Alaska')
) sub
WHERE species_count >= 25
'''
pass
In [ ]:
# Join version: instead of a window function, a grouped subquery computes
# per-species counts and is joined back to the filtered observations.
# NOTE(review): every filter condition is duplicated in the inner and outer
# query so that the counts and the returned rows use the same row set;
# `order by 1` sorts by the first selected column (gbifID).
cleanse_observations_query = '''
SELECT gbifID, class, "order", family, genus,o.species,stateProvince state
,decimalLatitude latitude,decimalLongitude longitude
,coordinateUncertaintyInMeters coordinate_uncertainty, day, month, year
FROM observations o
join (select species,count(*) obs from observations o
where o.coordinateUncertaintyInMeters<=25
and class is not null and "order" is not null and family is not null 
and genus is not null and o.species is not null and stateProvince is not null
and stateProvince!='Hawaii' and stateProvince!='Alaska'
group by 1) b on o.species=b.species
where b.obs>=25 and o.coordinateUncertaintyInMeters<=25
and class is not null and "order" is not null and family is not null 
and genus is not null and o.species is not null and stateProvince is not null
and stateProvince!='Hawaii' and stateProvince!='Alaska'
order by 1
'''
In [ ]:
# CTE version, rewritten to portable SQL. The original relied on three
# SQLite-only quirks: double-quoted string literals ("Hawaii"), referencing
# the `state` output alias inside the same SELECT's WHERE clause, and the
# nonstandard postfix `expr NOT NULL` form. It also spelled the bounds as
# `< 26` / `> 24`; `<= 25` / `>= 25` state the intent directly.
cleanse_observations_query = '''
WITH table_temp AS (
    SELECT gbifID, class, "order", family, genus, species, stateProvince as state,  decimalLatitude as latitude, decimalLongitude as longitude, coordinateUncertaintyInMeters as coordinate_uncertainty, day, month, year, COUNT(*) OVER (PARTITION BY species) as cnt  
    FROM observations
    WHERE coordinateUncertaintyInMeters <= 25
        AND stateProvince NOT IN ('Hawaii', 'Alaska')
        AND class IS NOT NULL
        AND "order" IS NOT NULL
        AND family IS NOT NULL
        AND genus IS NOT NULL
        AND species IS NOT NULL
        AND stateProvince IS NOT NULL
    ORDER BY gbifID 
)

SELECT gbifID, class, "order", family, genus, species, state,  latitude, longitude, coordinate_uncertainty, day, month, year
FROM table_temp
WHERE cnt >= 25
'''
In [ ]:
# CTE version including gbifID: `filtered` applies quality filters and the
# per-species window count; the outer SELECT keeps species with >= 25 rows
# and sorts by gbifID.
cleanse_observations_query = '''
WITH filtered AS (
    SELECT
        gbifID,
        class,
        "order",
        family,
        genus,
        species,
        stateProvince AS state,
        decimalLatitude AS latitude,
        decimalLongitude AS longitude,
        coordinateUncertaintyInMeters AS coordinate_uncertainty,
        day,
        month,
        year,
        COUNT(*) OVER (PARTITION BY species) AS species_count
    FROM observations
    WHERE coordinateUncertaintyInMeters <= 25
      AND stateProvince NOT IN ('Hawaii', 'Alaska')
      AND class IS NOT NULL
      AND "order" IS NOT NULL
      AND family IS NOT NULL
      AND genus IS NOT NULL
      AND species IS NOT NULL
      AND stateProvince IS NOT NULL
)

SELECT
    gbifID,
    class,
    "order",
    family,
    genus,
    species,
    state,
    latitude,
    longitude,
    coordinate_uncertainty,
    day,
    month,
    year
FROM filtered
WHERE species_count >= 25
ORDER BY gbifID ASC;'''
In [ ]:
# CTE version (mixed-case keywords): same logic as the other CTE solutions —
# filter on uncertainty/state/NULLs, attach a per-species window count, then
# keep species with >= 25 rows, ordered by gbifID.
cleanse_observations_query = '''
    WITH filtered as (
        SELECT 
            gbifID,
            class,
            "order",
            family,
            genus,
            species,
            stateProvince as state,
            decimalLatitude as latitude,
            decimalLongitude as longitude,
            coordinateUncertaintyInMeters as coordinate_uncertainty,
            day,
            month,
            year,
            COUNT(*) over (partition by species) as species_count
        FROM 
            observations
        WHERE 
            coordinateUncertaintyInMeters <=25
            and stateProvince Not in ('Hawaii','Alaska')
            and class is not null
            and "order" is not null
            and family is not null
            and genus is not null
            and species is not null
            and stateProvince is not null
        )
        Select 
            gbifID,
            class,
            "order",
            family,
            genus,
            species,
            state,
            latitude,
            longitude,
            coordinate_uncertainty,
            day,
            month,
            year
        FROM
            filtered
        WHERE
            species_count>=25
        order by 
            gbifID ASC 
            
'''
In [ ]:
# Composed version: the inner query is kept in its own variable and spliced
# into the outer query with an f-string.
# NOTE(review): the inner WHERE references the output aliases
# `coordinate_uncertainty` and `state` defined in the same SELECT — SQLite
# resolves result-column aliases in WHERE, but standard SQL (and most other
# engines) does not; confirm the target engine.
cleanse_observations_subquery = '''
SELECT gbifID, class, "order", family, genus, species,
       stateProvince AS state,
       decimalLatitude AS latitude,
       decimalLongitude AS longitude,
       coordinateUncertaintyInMeters AS coordinate_uncertainty,
       day, month, year,
       COUNT(species) OVER (PARTITION BY species) AS species_count
FROM observations
WHERE (coordinate_uncertainty <= 25)
      AND (state <> 'Hawaii') AND (state <> 'Alaska')
      AND (class IS NOT NULL)
      AND ("order" IS NOT NULL)
      AND (family IS NOT NULL)
      AND (genus IS NOT NULL)
      AND (species IS NOT NULL)
      AND (state IS NOT NULL)
'''

# Outer query: keep species with at least 25 qualifying observations.
cleanse_observations_query = f'''
SELECT gbifID, class, "order", family, genus, species, state, latitude, longitude, coordinate_uncertainty, day, month, year
FROM ({cleanse_observations_subquery})
WHERE species_count >= 25
'''

Ex3¶

In [ ]:
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
    """Return the set of edible species named as look-alikes in poisonous_df.

    Scans the free-text 'Similar edible species' column for binomial names
    ('Genus species'); genus-level references such as 'Boletus species' or
    'Boletus spp' are expanded to every edible species of that genus.
    """
    import re
    import itertools

    edibles = list(edible_df['Scientific name'])
    # Binomials start with a capitalized genus. The original [A-Za-z]+ also
    # matched lower-case phrases such as 'similar to'; require [A-Z][a-z]+.
    s = poisonous_df['Similar edible species'].apply(
        lambda x: re.findall(r'[A-Z][a-z]+\s[a-z]+', str(x)))
    similar_edibles = set(itertools.chain.from_iterable(s.values))

    updated_sim = set()
    for i in similar_edibles:
        if 'species' in i or 'spp' in i:
            genus = i.split()[0]
            # Compare the genus (first word) exactly; the original substring
            # test also hit names where the genus appears mid-string.
            updated_sim.update(
                [name for name in edibles if name.split()[0] == genus])
        else:
            updated_sim.add(i)
    return updated_sim
In [ ]:
### Solution - Exercise 3
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
    """Collect the edible look-alike species referenced by the poisonous table."""
    import re
    import itertools

    edible_names = list(edible_df['Scientific name'])
    binomial_re = re.compile(r'[A-Z][a-z]+\s[a-z]+')
    per_row = poisonous_df['Similar edible species'].apply(
        lambda cell: binomial_re.findall(str(cell)))
    candidates = set(itertools.chain.from_iterable(per_row.values))

    result = set()
    for candidate in candidates:
        if 'species' not in candidate and 'spp' not in candidate:
            result.add(candidate)
            continue
        # Genus-level reference like 'Boletus species': pull in every edible
        # name containing the genus string (substring match).
        genus = candidate.split()[0]
        for name in edible_names:
            if genus in name:
                result.add(name)
    return result
In [ ]:
### Solution - Exercise 3
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
    """Resolve the 'Similar edible species' text column to concrete edible names."""
    import re
    import itertools

    edible_names = list(edible_df['Scientific name'])
    found_per_row = poisonous_df['Similar edible species'].apply(
        lambda cell: re.findall(r'\b[A-Z][a-z]+ [a-z]+\b', str(cell)))
    candidates = set(itertools.chain.from_iterable(found_per_row))

    resolved = set()
    for candidate in candidates:
        if 'species' in candidate or 'spp' in candidate:
            # 'Genus species' / 'Genus spp': expand to every edible whose
            # genus (first word) matches exactly.
            genus = candidate.split()[0]
            resolved.update(
                name for name in edible_names if name.split()[0] == genus)
        else:
            resolved.add(candidate)
    return resolved
pass
In [ ]:
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
    """Map look-alike mentions in poisonous_df to edible species names."""
    import re
    import itertools

    edibles = list(edible_df['Scientific name'])
    matches = poisonous_df['Similar edible species'].apply(
        lambda v: re.findall(r'[A-Z][a-z]+\s[a-z]+', str(v)))
    mentions = set(itertools.chain.from_iterable(matches.values))

    # Split mentions into genus-level references and concrete binomials.
    generic = {m for m in mentions if 'species' in m or 'spp' in m}
    expanded = set(mentions - generic)
    for mention in generic:
        genus = mention.split()[0]
        # Substring genus match, as in the list-comprehension form.
        expanded.update(name for name in edibles if genus in name)
    return expanded
In [ ]:
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
    """Resolve look-alike references against the edible species list."""
    import re

    edible_names = set(edible_df['Scientific name'])
    # Binomial or genus-level reference ('Genus epithet'/'Genus species'/'Genus spp').
    binomial = re.compile(r"\b[A-Z][a-z]+ (?:[a-z]+|species|spp)\b")

    mentions = set()
    for cell in poisonous_df['Similar edible species']:
        mentions.update(binomial.findall(str(cell)))

    resolved = set()
    for mention in mentions:
        genus, epithet = mention.split()
        if epithet not in ('species', 'spp'):
            resolved.add(mention)
        else:
            # Genus-level reference: include every edible of that genus.
            prefix = genus + " "
            resolved.update(n for n in edible_names if n.startswith(prefix))
    return resolved
In [ ]:
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
    """Return edible species mentioned as look-alikes in poisonous_df.

    'Genus species' / 'Genus spp' references are expanded to all edible
    species whose name starts with that genus.
    """
    import re
    import itertools

    # De-duplicate, drop missing values, and trim the edible names up front.
    edibles = [s.strip() for s in edible_df['Scientific name'].dropna().unique()]
    pattern = re.compile(r'\b[A-Z][a-z]+\s[a-z]+\b')
    s = poisonous_df['Similar edible species'].apply(lambda x: pattern.findall(str(x)))
    similar_edibles = list(itertools.chain.from_iterable(s.values))
    # NOTE(review): removed a stray `display(...)` debugging call — it is an
    # IPython-only helper and raises NameError outside a notebook.

    updated_sim = set()
    for i in similar_edibles:
        if i.split()[1] in ('species', 'spp'):
            genus = i.split()[0]
            updated_sim.update([name for name in edibles if name.startswith(genus + ' ')])
        else:
            updated_sim.add(i)
    return updated_sim
In [ ]:
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
    """Expand look-alike mentions into a set of edible species names."""
    import re

    edible_names = set(edible_df['Scientific name'])
    raw = poisonous_df['Similar edible species'].apply(
        lambda cell: re.findall(r'[A-Z][a-z]*\s[a-z]+', str(cell)))
    mentions = set(np.concatenate(raw.values))

    concrete = set()
    generic_genera = set()
    for mention in mentions:
        genus, epithet = mention.split()
        if epithet in ('species', 'spp'):
            # Genus-level reference — remember the genus for expansion below.
            generic_genera.add(genus)
        else:
            concrete.add(mention)

    # Add every edible whose genus was referenced generically.
    result = concrete
    for name in edible_names:
        if name.split()[0] in generic_genera:
            result.add(name)
    return result

Ex4¶

In [ ]:
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
    """Flag edible look-alike duplicates and deadly poisonous observations."""
    out = mushroom_poison_df.copy()
    severity = poisonous_df[['Scientific name', 'Severity']]
    out = out.merge(severity, left_on='species', right_on='Scientific name', how='left')
    # dupe: edible observation whose species is a known look-alike.
    is_dupe = (out['edible'] == 1) & out['species'].isin(edible_dupes)
    out['dupe'] = np.where(is_dupe, 1, 0)
    # severe: poisonous observation rated 'deadly'.
    is_severe = (out['poisonous'] == 1) & (out['Severity'] == 'deadly')
    out['severe'] = np.where(is_severe, 1, 0)
    out.drop(columns=['Scientific name', 'Severity'], inplace=True)
    return out
In [ ]:
### Solution - Exercise 4
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
    """Annotate observations with 0/1 'severe' and 'dupe' indicator columns."""
    merged = mushroom_poison_df.copy(deep=True).merge(
        poisonous_df[['Scientific name', 'Severity']],
        left_on='species',
        right_on='Scientific name',
        how='left',
    )

    # severe first, then dupe — keeps the output column order stable.
    merged['severe'] = ((merged['poisonous'] == 1)
                        & (merged['Severity'] == 'deadly')).astype(int)
    merged['dupe'] = ((merged['edible'] == 1)
                      & merged['species'].isin(edible_dupes)).astype(int)

    # Drop the merge helper columns before returning.
    return merged.drop(columns=['Scientific name', 'Severity'])
In [ ]:
### Solution - Exercise 4
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
    """Add dupe/severe flags using membership tests instead of a merge."""
    out = mushroom_poison_df.copy()
    # dupe: edible record whose species is a known edible look-alike.
    out["dupe"] = np.where(
        (out['edible'] == 1) & out['species'].isin(edible_dupes), 1, 0)
    # severe: poisonous record whose species is rated deadly.
    deadly_reference = poisonous_df[poisonous_df['Severity'] == 'deadly']
    out["severe"] = np.where(
        (out['poisonous'] == 1) & out['species'].isin(deadly_reference['Scientific name']),
        1, 0)
    return out
pass
In [ ]:
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
    """Return a copy of mushroom_poison_df with 0/1 severe/dupe columns."""
    result = mushroom_poison_df.copy()
    # Species names rated 'deadly' in the poisonous reference table.
    deadly = poisonous_df[poisonous_df['Severity'] == 'deadly']['Scientific name'].tolist()
    result['severe'] = (result['species'].isin(deadly)
                        & (result['poisonous'] == 1)).astype(int)
    result['dupe'] = (result['species'].isin(list(edible_dupes))
                      & (result['edible'] == 1)).astype(int)
    return result
In [ ]:
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
    """Flag deadly poisonous records and edible look-alike duplicates.

    Returns a copy of mushroom_poison_df with two new 0/1 columns:
    'severe' (poisonous and rated deadly) and 'dupe' (edible and a known
    look-alike in edible_dupes).
    """
    # Species rated deadly in the poisonous reference table.
    deadly_severity_mushrooms = poisonous_df["Scientific name"][poisonous_df.Severity == "deadly"]

    copy_mushroom_poison_df = mushroom_poison_df.copy()
    # Use single-step .loc[row_mask, col] assignment: the original
    # `df.col.loc[mask] = 1` chained form writes into an intermediate Series
    # and is a silent no-op under pandas copy-on-write (default in 3.x).
    copy_mushroom_poison_df["severe"] = 0
    copy_mushroom_poison_df.loc[
        (copy_mushroom_poison_df.poisonous == 1)
        & (copy_mushroom_poison_df.species.isin(deadly_severity_mushrooms)),
        "severe"] = 1
    copy_mushroom_poison_df["dupe"] = 0
    copy_mushroom_poison_df.loc[
        (copy_mushroom_poison_df.edible == 1)
        & (copy_mushroom_poison_df.species.isin(edible_dupes)),
        "dupe"] = 1

    return copy_mushroom_poison_df
In [ ]:
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
    """Build severity/duplicate flags from per-species membership tests."""
    observations = mushroom_severity_helper = mushroom_poison_df.copy()
    reference = poisonous_df.copy()

    deadly_species = set(reference.loc[reference['Severity'] == 'deadly', 'Scientific name'])

    # Membership flags first...
    observations['dupe'] = observations['species'].apply(lambda sp: 1 if sp in edible_dupes else 0)
    observations['severe'] = observations['species'].apply(lambda sp: 1 if sp in deadly_species else 0)

    # ...then zero each flag wherever the edible/poisonous indicator is 0.
    observations['severe'] = observations['severe'] * observations['poisonous']
    observations['dupe'] = observations['dupe'] * observations['edible']

    return observations
In [ ]:
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
    """Attach 0/1 'severe' and 'dupe' columns to a copy of the observations."""
    flagged = mushroom_poison_df.copy()

    deadly = set(poisonous_df.loc[poisonous_df['Severity'] == 'deadly', 'Scientific name'])

    severe_mask = (flagged['poisonous'] == 1) & flagged['species'].isin(deadly)
    dupe_mask = (flagged['edible'] == 1) & flagged['species'].isin(edible_dupes)
    flagged['severe'] = severe_mask.astype(int)
    flagged['dupe'] = dupe_mask.astype(int)
    return flagged
In [ ]:
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
    """Flag deadly ('severe') and look-alike ('dupe') observations.

    Fixes a SyntaxError in the original: the final for-loop was indented
    with tabs while the rest of the function uses spaces.
    """
    deadly = poisonous_df['Scientific name'][poisonous_df['Severity'] == 'deadly']
    mushroom_severity_df = mushroom_poison_df.copy()
    mushroom_severity_df['severe'] = \
        (mushroom_severity_df['poisonous'] == 1) \
        & (mushroom_severity_df['species'].isin(deadly))
    mushroom_severity_df['dupe'] = \
        (mushroom_severity_df['edible'] == 1) \
        & (mushroom_severity_df['species'].isin(edible_dupes))
    # Convert the boolean masks to 0/1 integers.
    for c in ['severe', 'dupe']:
        mushroom_severity_df[c] = mushroom_severity_df[c].astype(int)
    return mushroom_severity_df

Ex5¶

In [ ]:
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) ->  pd.DataFrame:
    """Month-by-species observation counts for one genus, months 1-12."""
    subset = mushroom_severity_df.copy()
    subset = subset[subset['genus'] == genus]

    # Count rows per (month, species); 'genus' is just a column to count over.
    counted = (subset[['month', 'genus', 'species']]
               .groupby(by=['month', 'species'], as_index=False)
               .count()
               .rename(columns={'genus': 'counts'}))

    pivoted = counted.pivot_table(index='month', columns='species',
                                  values='counts', fill_value=0.0)
    # Guarantee a full January-December index even for unobserved months.
    return pivoted.reindex(range(1, 13), fill_value=0)
In [ ]:
### Solution - Exercise 5
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) ->  pd.DataFrame:
    """Pivot per-month species counts for `genus`, reindexed to months 1-12."""
    selected = mushroom_severity_df.copy(deep=True)
    selected = selected[selected['genus'] == genus]

    # The 'genus' column doubles as the value counted per (month, species).
    tallies = (selected[['month', 'genus', 'species']]
               .groupby(by=['month', 'species'], as_index=False)
               .count()
               .rename(columns={'genus': 'count'}))

    by_month = tallies.pivot_table(index='month', columns='species',
                                   values='count', fill_value=0.0)

    # Months with no observations become all-zero rows.
    return by_month.reindex(range(1, 13), fill_value=0)
In [ ]:
### Solution - Exercise 5
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) ->  pd.DataFrame:
    """Return a month x species count table for one genus (rows 1..12)."""
    genus_rows = mushroom_severity_df.copy()
    genus_rows = genus_rows[genus_rows['genus'] == genus]

    counts = (genus_rows[['month', 'genus', 'species']]
              .groupby(by=['month', 'species'], as_index=False).count()
              .rename(columns={'genus': 'counts'}))

    table = counts.pivot_table(index='month', columns='species',
                               values='counts', fill_value=0.0)

    # Every month 1-12 present, zero-filled where nothing was observed.
    table = table.reindex(range(1, 13), fill_value=0)
    return table

pass
In [ ]:
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) ->  pd.DataFrame:
    """Count observations per (month, species) for one genus.

    Missing months are appended as all-zero rows (after the observed ones,
    not sorted into place).
    """
    frame = mushroom_severity_df.copy()
    frame['is_genus'] = frame['genus'] == genus
    genus_frame = frame[frame['is_genus']]

    tallied = (genus_frame[['month', 'genus', 'species']]
               .groupby(by=['month', 'species']).count()
               .reset_index()
               .rename(columns={'genus': 'counts'}))

    monthly_df = (tallied.pivot_table(index='month', columns='species',
                                      values='counts', fill_value=0)
                  .astype('int64')
                  .reset_index())
    monthly_df.index = monthly_df['month']
    monthly_df.drop(columns=['month'], inplace=True)

    # Append a zero row for any month with no observations.
    for month in range(1, 13):
        if month not in monthly_df.index:
            monthly_df.loc[month] = 0

    return monthly_df
In [ ]:
### Solution - Exercise 5
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) ->  pd.DataFrame:
    """Per-month, per-species observation counts for the requested genus."""
    working = mushroom_severity_df.copy()
    working['is_genus'] = working['genus'] == genus
    genus_only = working[working['is_genus']]

    # Count rows per (month, species); the leftover 'genus' column carries
    # the counts after .count().
    grouped = (genus_only[['month', 'genus', 'species']]
               .groupby(by=['month', 'species'])
               .count()
               .rename(columns={"genus": "counts"}))

    monthly_df = grouped.pivot_table(index='month', columns='species', values='counts')

    # Append all-zero rows for months with no sightings at all.
    for month in range(1, 13):
        if month not in monthly_df.index:
            monthly_df.loc[month] = 0

    # Remaining NaN cells are month/species pairs with no sightings.
    monthly_df = monthly_df.fillna(0)
    monthly_df = monthly_df.astype(int)

    return monthly_df
In [ ]:
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) ->  pd.DataFrame:
    """Month x species sighting counts for `genus`, months 1-12 zero-filled."""
    genus_df = mushroom_severity_df[mushroom_severity_df['genus'] == genus]

    # size() counts rows per (month, species) without needing a dummy column.
    per_pair = genus_df.groupby(['month', 'species']).size().reset_index(name='counts')

    pivoted = per_pair.pivot_table(index='month', columns='species',
                                   values='counts', fill_value=0)
    return pivoted.reindex(range(1, 13), fill_value=0)
In [ ]:
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) ->  pd.DataFrame:
    """Tabulate sightings of one genus by month (rows) and species (columns)."""
    selected = mushroom_severity_df[mushroom_severity_df['genus'] == genus]

    counted = (selected[['month', 'genus', 'species']]
               .groupby(by=['month', 'species'])
               .count()
               .rename(columns={'genus': 'counts'}))

    # Pivot and force a complete January..December index.
    monthly = (counted.pivot_table(index='month', columns='species',
                                   values='counts', fill_value=0)
               .reindex(range(1, 13), fill_value=0))
    return monthly
In [ ]:
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) ->  pd.DataFrame:
    """Month x species counts for `genus`; missing months become zero rows.

    Fixes a SyntaxError in the original: the `zeros_row = ...` line was
    indented with a tab while the surrounding lines use spaces.
    """
    is_genus = mushroom_severity_df['genus'] == genus
    df = mushroom_severity_df[is_genus][['month', 'genus', 'species']] \
         .groupby(by=['month', 'species']) \
         .count() \
         .rename(columns={'genus': 'counts'})
    monthly_df = df.pivot_table(index='month',
                                columns='species',
                                values='counts',
                                fill_value=0)
    # Append an all-zero row for each month that never appears.
    missing_months = set(range(1, 13)) - set(monthly_df.index)
    if len(missing_months) > 0:
        zeros_row = [0] * len(monthly_df.columns)
        for missing in missing_months:
            monthly_df.loc[missing] = zeros_row
    return monthly_df

Ex6¶

In [ ]:
### Solution - Exercise 6
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
    """States ranked by sighting count (desc), ties broken alphabetically."""
    species_rows = mushroom_severity_df.copy()
    species_rows = species_rows[species_rows['species'] == species]
    per_state = species_rows.groupby('state')['state'].count().reset_index(name='count')
    return per_state.sort_values(['count', 'state'], ascending=[False, True])
In [ ]:
### Solution - Exercise 6
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
    """Per-state sighting counts for one species, most-seen states first."""
    one_species = mushroom_severity_df[mushroom_severity_df["species"] == species].copy(deep=True)

    tally = one_species.groupby('state').size().reset_index(name='count')

    # Alphabetical order on 'state' breaks count ties deterministically.
    return tally.sort_values(["count", "state"], ascending=[False, True])
In [ ]:
### Solution - Exercise 6
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
    """Rank states by number of sightings of `species` (ties alphabetical)."""
    rows = mushroom_severity_df.copy()
    rows = rows[rows['species'] == species]
    ranked = (rows.groupby('state')
                  .size()
                  .reset_index(name='count')
                  .sort_values(['count', 'state'], ascending=[False, True])
                  .reset_index(drop=True))
    return ranked
pass
In [ ]:
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
    """Count sightings of `species` per state, highest counts first."""
    one_species = mushroom_severity_df[mushroom_severity_df['species'] == species]
    # Named aggregation: count rows per state into a 'count' column.
    summary = one_species.groupby('state', as_index=False).agg(count=('state', 'count'))
    return summary.sort_values(by=['count', 'state'], ascending=[False, True])
In [ ]:
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
    """States ordered by how often `species` was observed there."""
    matches = mushroom_severity_df[mushroom_severity_df['species'] == species]
    ranking = matches.groupby('state').size().reset_index(name='count')
    # Count descending, state ascending for deterministic tie order.
    ranking = ranking.sort_values(by=['count', 'state'], ascending=[False, True])
    return ranking.reset_index(drop=True)
In [ ]:
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
    """Per-state sighting counts for `species`, sorted by count descending.

    Adds the alphabetical tie-break on 'state' used by the sibling
    implementations in this file; sorting on 'count' alone left tied
    states in arbitrary order.
    """
    df = mushroom_severity_df[mushroom_severity_df['species'] == species]
    top_states = (df.groupby('state').size().reset_index(name='count')
                  .sort_values(['count', 'state'], ascending=[False, True])
                  .reset_index(drop=True))
    return top_states
In [ ]:
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
    """Per-state sighting counts for `species`, most frequent first.

    The original renamed the reset_index() columns as {'index': 'state',
    'state': 'count'}, which matches the pandas < 2.0 layout only; since
    pandas 2.0 value_counts() already yields 'state'/'count' columns and
    that rename mislabels them. Name the columns explicitly instead.
    """
    is_species = mushroom_severity_df['species'].isin({species})
    counts = mushroom_severity_df['state'][is_species].value_counts()
    # rename_axis + reset_index(name=...) is pandas-version independent.
    countsdf = counts.rename_axis('state').reset_index(name='count')
    countsdf = countsdf.sort_values(by='count', ascending=False)
    return countsdf

Ex7¶

In [ ]:
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
    """Elbow-method K-means: grow k until inertia improves <= threshold %.

    Returns the cluster centers fitted just before the percentage decrease
    in within-cluster sum of squares (WCSS) fell to `threshold` or below
    (None if even k=2 was never attempted).
    """
    from sklearn.cluster import KMeans
    import math

    ### Longitude/latitude pairs for the requested species only.
    _df = mushroom_severity_df.copy()
    df = _df[_df['species'] == species][['longitude','latitude']]

    k_max = 20

    # k = 1 baseline: no prior centers yet; improvement undefined (infinite).
    prior_centers = None
    kmeans = KMeans(n_clusters=1, random_state=seed_value).fit(df)
    inertia = kmeans.inertia_
    wcss_percent_decrease = math.inf

    # Loop terminates when k reaches k_max or the % decrease in inertia
    # drops to the threshold or below.
    for k in range(2, k_max + 1):
        if (wcss_percent_decrease <= threshold):
            break
        prior_centers = kmeans.cluster_centers_
        kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
        wcss_percent_decrease = 100*((inertia - kmeans.inertia_)/inertia)
        inertia = kmeans.inertia_
        # NOTE(review): removed the original trailing `k += 1` — the for
        # statement already advances k, so the manual increment was dead code.

    return prior_centers
In [ ]:
### Solution - Exercise 7  
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
    """Elbow-method K-means over (longitude, latitude) points of `species`.

    Fits KMeans for k = 1, 2, ... and returns the centers of the last fit
    whose improvement in inertia over the previous k still exceeded
    `threshold` percent.
    """
    from sklearn.cluster import KMeans
    import math
    
    ### Filter mushroom_severity_df
    _df = mushroom_severity_df.copy(deep=True)
    df = _df[_df['species'] == species][['longitude','latitude']]

    k = 1
    k_max = 20

    ### Do initial calculations for k = 1
    ### You will do similar calculations in the loop.
    ### You will need to adjust the prior_centers and inertia on each iteration.
    
    prior_centers = None # replace with centers for the prior run
    kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
    inertia = kmeans.inertia_
    wcss_percent_decrease = math.inf # replace with your calculation
    
    # Store clusters from the k = 1 baseline fit.
    prior_centers = kmeans.cluster_centers_

    # loop will terminate when one of the below is true:
    # - k reaches k_max
    # - percentage decrease in inertia drops below threshold
    # NOTE(review): range(2, k_max) stops at k = 19, not k_max = 20 — the
    # commented-out "+ 1" suggests this was changed deliberately; confirm.
    for k in range(2, k_max):# + 1):
        
        # Fit new model
        kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
        new_inertia = kmeans.inertia_
        new_centers = kmeans.cluster_centers_

        # Calculate percentage decrease: 100 * (prior - new) / prior
        wcss_percent_decrease = 100.0 * (inertia - new_inertia) / inertia
        
        if (wcss_percent_decrease <= threshold):
            break
    
        else:
            # Improvement was good, update prior values and continue
            prior_centers = new_centers
            inertia = new_inertia

    return prior_centers
In [ ]:
### Solution - Exercise 7  
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
    """Elbow-method K-means: return the centers from the run fitted just
    before the percentage decrease in inertia fell to `threshold` or below.
    """
    from sklearn.cluster import KMeans
    import math
    
    ### Filter mushroom_severity_df to this species' coordinates only.
    _df = mushroom_severity_df.copy()
    df = _df[_df['species'] == species][['longitude','latitude']]

    k = 1
    k_max = 20

    ### Do initial calculations for k = 1
    ### You will do similar calculations in the loop.
    ### You will need to adjust the prior_centers and inertia on each iteration.
    prior_centers = None # replace with centers for the prior run
    kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
    prior_inertia = kmeans.inertia_
    wcss_percent_decrease = math.inf # first iteration never triggers the break

    # loop will terminate when one of the below is true:
    # - k reaches k_max
    # - percentage decrease in inertia drops below threshold
    for k in range(2, k_max + 1):
        if (wcss_percent_decrease <= threshold):
            break
    ###
        # Save the centers from the previous fit before refitting at k.
        prior_centers = kmeans.cluster_centers_    
        kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
        new_inertia = kmeans.inertia_
        wcss_percent_decrease = 100 * (prior_inertia - new_inertia)/prior_inertia
        
        prior_inertia = new_inertia

    ###
    

    return prior_centers
pass
In [ ]:
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
    """Find cluster centers for one species by growing k until the WCSS
    percentage decrease drops to `threshold` or below (or k reaches 20).

    Returns the centers of the model fitted just before the improvement
    fell below the threshold.
    """
    from sklearn.cluster import KMeans
    import math

    ### Filter mushroom_severity_df down to this species' coordinates.
    _df = mushroom_severity_df.copy()
    df = _df[_df['species'] == species][['longitude', 'latitude']]

    k = 1
    k_max = 20

    # Baseline fit at k = 1.
    kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
    inertia = kmeans.inertia_
    # math.inf (not an arbitrary large number like 1000) guarantees the first
    # iteration never breaks out, no matter how large the threshold is.
    pct_decrease = math.inf
    # Pre-seed with the k=1 centers so the function can never hit a NameError
    # if the loop exits on its very first threshold check.
    prior_centers = kmeans.cluster_centers_
    k += 1
    while k <= k_max:
        if pct_decrease <= threshold:
            break
        prior_centers = kmeans.cluster_centers_
        kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
        pct_decrease = 100.0 * (inertia - kmeans.inertia_) / inertia
        inertia = kmeans.inertia_
        k += 1
    return prior_centers
In [ ]:
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
    """Elbow search for a species' population centers.

    Fits KMeans for k = 1..20 on the species' (longitude, latitude) points
    and returns the centers of the previous model once the percentage drop
    in inertia falls below `threshold`.

    Cleanup vs. the original: removed leftover debug `print` statements,
    fixed the `prior_precentage_decrease` typo, and replaced the magic
    100000000000 inertia sentinel with math.inf.
    """
    from sklearn.cluster import KMeans
    import math

    # Restrict to the coordinates of the requested species.
    _df = mushroom_severity_df.copy()
    df = _df[_df['species'] == species][['longitude', 'latitude']]

    k = 1
    k_max = 20

    prior_centers = None
    prior_percentage_decrease = None  # None => first pass, never break
    prior_inertia = math.inf

    while k <= k_max:
        # Fit the model for the current k.
        model = KMeans(n_clusters=k, random_state=seed_value).fit(df)
        new_inertia = model.inertia_
        new_cluster_centers = model.cluster_centers_

        new_percentage_decrease = 100 * ((prior_inertia - new_inertia) / prior_inertia)

        # Stop once the WCSS improvement drops below the threshold; the first
        # iteration is exempt because there is no prior model to compare with.
        if (prior_percentage_decrease is not None) and (new_percentage_decrease < threshold):
            break

        # Improvement was still worthwhile: remember this model and continue.
        prior_centers = new_cluster_centers
        prior_percentage_decrease = new_percentage_decrease
        prior_inertia = new_inertia

        k = k + 1

    return prior_centers
In [ ]:
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
    """Grow k from 1 while each extra cluster still reduces WCSS by more
    than `threshold` percent (k capped at 20); return the centers of the
    last model fitted before the stopping condition triggered.
    """
    from sklearn.cluster import KMeans
    import math

    # Observations for the requested species only.
    subset = mushroom_severity_df.copy()
    points = subset[subset['species'] == species][['longitude', 'latitude']]

    k, k_max = 1, 20

    # Initial single-cluster fit.
    fitted = KMeans(n_clusters=k, random_state=seed_value)
    fitted.fit(points)
    drop_pct = math.inf  # forces the loop to run at least once

    while k < k_max and drop_pct > threshold:
        # Remember the current model before trying one more cluster.
        centers = fitted.cluster_centers_
        previous_inertia = fitted.inertia_

        k += 1
        fitted = KMeans(n_clusters=k, random_state=seed_value)
        fitted.fit(points)

        drop_pct = 100 * (previous_inertia - fitted.inertia_) / previous_inertia

    return centers
In [ ]:
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
    """Elbow-style selection of cluster centers for one species.

    Increases the cluster count from 1 to at most 20 and stops as soon as
    the percentage decrease in inertia falls below `threshold`; the centers
    of the model fitted immediately before that point are returned.
    """
    from sklearn.cluster import KMeans
    import math

    # Coordinates for the requested species.
    data = mushroom_severity_df.copy()
    data = data[data['species'] == species][['longitude', 'latitude']]

    max_clusters = 20

    # k = 1 baseline (n_init pinned for reproducibility across sklearn versions).
    model = KMeans(n_clusters=1, random_state=seed_value, n_init=10).fit(data)
    best_centers = model.cluster_centers_
    drop = float('inf')

    for n_clusters in range(2, max_clusters + 1):
        if drop < threshold:
            break
        # Snapshot the previous model, then fit the next k.
        best_centers = model.cluster_centers_
        last_inertia = model.inertia_
        model = KMeans(n_clusters=n_clusters, random_state=seed_value, n_init=10).fit(data)
        drop = 100 * (last_inertia - model.inertia_) / last_inertia

    return best_centers
In [ ]:
# NOTE: instructor's pseudocode sketch of the elbow loop. The literal `...`
# placeholders make this cell non-runnable as written, and `return` sits
# outside any function -- it is a template, not executable code.
k = 1
kmeans = KMeans(k, ...).fit(...)
inertia = kmeans.inertia_
centers = kmeans.cluster_centers_
while k < k_max: # Or should this be `k < k_max-1` or `k <= k_max` or something else? When to stop?
    k += 1 # Next `k` to try
    kmeans = KMeans(k, ...).fit(...)
    new_inertia = kmeans.inertia_
    percentage_decrease = 100 * (inertia - new_inertia) / inertia
    if percentage_decrease < threshold:
        break
    inertia = new_inertia
    centers = kmeans.cluster_centers_
return centers

Ex8¶

In [ ]:
def geodetic_distance(coord:list, obs:list) -> float:
    """Mid-latitude approximation of the distance in km between two
    (longitude, latitude) points given in degrees."""
    import math
    lon_a, lat_a = coord
    lon_b, lat_b = obs
    # Series expansions are evaluated at the mean latitude, in radians.
    mid = math.radians((lat_a + lat_b) / 2)
    km_per_deg_lat = 111.13209 - 0.56605*math.cos(2*mid) + 0.00120*math.cos(4*mid)
    km_per_deg_lon = 111.41513*math.cos(mid) - 0.09455*math.cos(3*mid) + 0.00012*math.cos(5*mid)
    north = lat_b - lat_a
    east = lon_b - lon_a
    return math.sqrt((km_per_deg_lat*north)**2 + (km_per_deg_lon*east)**2)
In [ ]:
### Solution - Exercise 8  
def geodetic_distance(coord:list, obs:list) -> float:
    """Return the approximate km distance between two (lon, lat) degree
    pairs using the mid-latitude rule-of-thumb formula."""
    import math

    lon_a, lat_a = coord
    lon_b, lat_b = obs

    # Mean latitude in radians drives both series expansions below.
    mean_lat = math.radians((lat_a + lat_b) / 2)

    # Kilometres per degree of latitude / longitude at the mean latitude.
    km_lat = 111.13209 - 0.56605 * math.cos(2 * mean_lat) + 0.00120 * math.cos(4 * mean_lat)
    km_lon = 111.41513 * math.cos(mean_lat) - 0.09455 * math.cos(3 * mean_lat) + 0.00012 * math.cos(5 * mean_lat)

    # Pythagorean combination of the scaled lat/lon differences.
    return math.sqrt((km_lat * (lat_b - lat_a)) ** 2 + (km_lon * (lon_b - lon_a)) ** 2)
In [ ]:
### Solution - Exercise 8  
from math import radians, sin, cos, acos
def geodetic_distance(coord:list, obs:list) -> float:
    """Approximate km distance between (lon, lat) degree pairs, mid-latitude formula."""
    # Longitude / latitude deltas (coord minus obs; squared below, so sign is irrelevant).
    d_lon = float(coord[0] - obs[0])
    d_lat = float(coord[1] - obs[1])
    mid = radians((coord[1] + obs[1]) / 2)
    # km per degree of latitude / longitude at the mid-latitude
    k_lat = 111.13209 - (0.56605 * cos(2 * mid)) + (0.00120 * cos(4 * mid))
    k_lon = (111.41513 * cos(mid)) - (0.09455 * cos(3 * mid)) + (0.00012 * cos(5 * mid))
    return np.sqrt((k_lat * d_lat) ** 2 + (k_lon * d_lon) ** 2)
pass
In [ ]:
def geodetic_distance(coord:list, obs:list) -> float:
    """Mid-latitude rule-of-thumb distance in km between two (lon, lat)
    degree pairs.

    Fix vs. original: removed the leftover debug `print` of k1/k2 that
    polluted stdout on every call.
    """
    from math import cos, radians
    lon1, lat1 = coord
    lon2, lat2 = obs
    delphi = lat2 - lat1   # latitude difference, degrees
    dellam = lon2 - lon1   # longitude difference, degrees
    phim = radians((lat2 + lat1) / 2)
    # km per degree of latitude / longitude at the mid-latitude
    k1 = 111.13209 - .56605*cos(2*phim) + .00120*cos(4*phim)
    k2 = 111.41513*cos(phim) - .09455*cos(3*phim) + .00012*cos(5*phim)
    return ((k1*delphi)**2 + (k2*dellam)**2)**(1/2)
In [ ]:
def geodetic_distance(coord:list, obs:list) -> float:
    """Approximate km distance between two (lon, lat) degree pairs using the
    mid-latitude formula.

    Bug fix: the mid-latitude must be converted from degrees to radians
    before being passed to np.cos -- the original fed degrees straight into
    the trig calls, producing wrong K1/K2 scale factors. Debug prints of the
    inputs were also removed.
    """
    # Mid-latitude in radians (np.cos expects radians).
    mid_latitude = np.radians((coord[1] + obs[1]) / 2)
    # km per degree of latitude / longitude at the mid-latitude
    k1 = 111.13209 - (0.56605 * np.cos(2 * mid_latitude)) + (0.00120 * np.cos(4 * mid_latitude))
    k2 = (111.41513 * np.cos(mid_latitude)) - (0.09455 * np.cos(3 * mid_latitude)) + (0.00012 * np.cos(5 * mid_latitude))

    diff_lat = coord[1] - obs[1]
    diff_long = coord[0] - obs[0]

    distance = ((k1*diff_lat)**2 + (k2*diff_long)**2)**(1/2)
    return(distance)
In [ ]:
def geodetic_distance(coord:list, obs:list) -> float:
    """Distance in km between two (longitude, latitude) points in degrees,
    via the mid-latitude approximation."""
    import math

    # Unpack both endpoints.
    (lon1, lat1), (lon2, lat2) = coord, obs

    # Mean latitude, converted to radians for the trig terms.
    mean_lat_rad = math.radians((lat1 + lat2) / 2)

    # km per degree of latitude / longitude at the mean latitude.
    per_deg_lat = 111.13209 - 0.56605 * math.cos(2 * mean_lat_rad) + 0.00120 * math.cos(4 * mean_lat_rad)
    per_deg_lon = 111.41513 * math.cos(mean_lat_rad) - 0.09455 * math.cos(3 * mean_lat_rad) + 0.00012 * math.cos(5 * mean_lat_rad)

    return math.sqrt((per_deg_lat * (lat2 - lat1)) ** 2 + (per_deg_lon * (lon2 - lon1)) ** 2)
In [ ]:
def geodetic_distance(coord:list, obs:list) -> float:
    """Mid-latitude approximation (km) between two (lon, lat) degree pairs."""
    lon_start, lat_start = coord
    lon_end, lat_end = obs

    # Mean latitude in radians.
    mid = np.radians((lat_start + lat_end) / 2)

    # km-per-degree scale factors at the mid-latitude.
    lat_scale = 111.13209 - 0.56605 * np.cos(2*mid) + 0.00120 * np.cos(4*mid)
    lon_scale = 111.41513 * np.cos(mid) - 0.09455 * np.cos(3*mid) + 0.00012 * np.cos(5*mid)

    return np.sqrt((lat_scale * (lat_end - lat_start)) ** 2 + (lon_scale * (lon_end - lon_start)) ** 2)
In [ ]:
def geodetic_distance(coord:list, obs:list) -> float:
    """km distance between two (lon, lat) degree pairs, mid-latitude formula."""
    from math import radians, cos, sqrt
    # Degree differences; squared below, so direction does not matter.
    lat_diff = obs[1] - coord[1]
    lon_diff = obs[0] - coord[0]
    mean_lat = radians((obs[1] + coord[1]) / 2)
    # km per degree of latitude / longitude at the mean latitude.
    per_deg_lat = 111.13209 - 0.56605*cos(2*mean_lat) + 0.00120*cos(4*mean_lat)
    per_deg_lon = 111.41513*cos(mean_lat) - 0.09455*cos(3*mean_lat) + 0.00012*cos(5*mean_lat)
    return sqrt((per_deg_lat*lat_diff)**2 + (per_deg_lon*lon_diff)**2)

Ex9¶

In [ ]:
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Return, for every coordinate, the index of its nearest center.

    distance_function(center, coordinates) is expected to return the
    distance from one center to every coordinate at once.
    """
    # One row of distances (over all coordinates) per center.
    per_center = [distance_function(center, coordinates) for center in centers]
    # Smallest distance down the center axis gives each point's label.
    return np.argmin(per_center, axis=0)
In [ ]:
### Solution - Exercise 9  
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Label each coordinate with the index of its nearest center.

    distance_function(point, centers) must return that point's distance to
    every center as a 1-D array.
    """
    # Row i holds coordinate i's distances to all centers.
    per_point = np.apply_along_axis(lambda pt: distance_function(pt, centers), 1, coordinates)
    # Nearest center per row.
    return np.argmin(per_point, axis=1)
In [ ]:
### Solution - Exercise 9  
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Assign each coordinate the index of the closest center, where
    distance_function(point, centers) yields the point's distances to all
    centers."""
    # Build the (n_points, n_centers) distance matrix row by row.
    dist_rows = np.array([distance_function(point, centers) for point in coordinates])
    # argmin along the center axis picks each point's label.
    return np.argmin(dist_rows, axis=1)
pass
In [ ]:
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Return the index of the nearest center for every coordinate.

    Fix vs. original: the coordinates were passed via the keyword argument
    ``point_2=``, which only works when the supplied distance function
    happens to name its second parameter ``point_2``. Passing it
    positionally through a lambda works for any two-argument distance
    function.
    """
    # One row of distances (over all coordinates) per center.
    dist = np.apply_along_axis(lambda center: distance_function(center, coordinates), 1, centers)
    # Minimum along the center axis gives each point's label.
    return np.argmin(dist, axis=0)
In [ ]:
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Label every coordinate with the index of its nearest center.

    distance_function(point, centers) must return the point's distance to
    each center. Cleanup vs. original: removed the debug prints of the
    inputs and replaced the index-based ``range(len(...))`` loop with
    direct iteration.
    """
    # (n_points, n_centers): each row is one coordinate's distances.
    distances = [distance_function(point, centers) for point in coordinates]
    labels = np.argmin(distances, axis=1)
    return labels
In [ ]:
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """For each coordinate, pick the index of the nearest center."""
    def to_centers(point):
        # Distance from one observation to every center.
        return distance_function(point, centers)

    all_dists = np.apply_along_axis(to_centers, 1, coordinates)
    return all_dists.argmin(axis=1)
In [ ]:
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Map every coordinate to the index of its closest center, using
    distance_function(point, centers) to get one point's distances to all
    centers."""
    # Stack one distance row per observation, then take the per-row argmin.
    rows = [distance_function(obs, centers) for obs in coordinates]
    return np.argmin(np.vstack(rows), axis=1)

Ex10¶

In [ ]:
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Safety score (0-100, 2 decimals) for the observations within
    `radius` of `coordinates`, plus the most common edible species there
    (ties broken alphabetically)."""
    # Attach distances and keep only nearby observations.
    nearby = mushroom_severity_df.copy()
    nearby['dist'] = distance_function(coordinates, mushroom_severity_df[['longitude', 'latitude']].values)
    nearby = nearby[nearby['dist'] <= radius]

    # Score: product of the "not poisonous / not severe / not dupe" fractions.
    total = nearby.shape[0]
    n_pois, n_sev, n_dupe = nearby[['poisonous', 'severe', 'dupe']].sum().values
    safety = round(100 * (1 - n_pois / total) * (1 - n_sev / total) * (1 - n_dupe / total), 2)

    # Rank edible species by count (desc), then name (asc).
    edible = nearby[nearby['edible'] == 1]
    ranked = (
        edible.groupby('species')['species']
        .count()
        .reset_index(name='count')
        .sort_values(by=['count', 'species'], ascending=[False, True])
    )
    return safety, ranked['species'].iloc[0]
In [ ]:
### Solution - Exercise 10  
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Compute the local safety score and best edible species.

    Returns (score, species): score is 100 * (1 - poisonous fraction) *
    (1 - severe fraction) * (1 - dupe fraction) over the observations within
    `radius`, rounded to 2 decimals; species is the most frequent edible
    species there (alphabetical tie-break), or None when nothing qualifies.
    """
    working = mushroom_severity_df.copy(deep=True)

    # 1. Distance from the query point to every observation; keep the near ones.
    query = np.array(coordinates)
    obs_xy = working[['longitude', 'latitude']].values
    nearby = working[distance_function(query, obs_xy) <= radius].copy()

    n = len(nearby)
    # Nothing in range at all.
    if n == 0:
        return 0.00, None

    # 2. Safety score from the poisonous / severe / dupe fractions.
    frac_safe = lambda col: 1 - nearby[col].sum() / n
    score = 100 * frac_safe('poisonous') * frac_safe('severe') * frac_safe('dupe')

    # 3. Most frequent edible species, ties broken alphabetically.
    edible = nearby[nearby['edible'] == 1]
    if len(edible) == 0:
        return round(score, 2), None

    tallies = edible.groupby('species').size().reset_index(name='count')
    winner = tallies.sort_values(
        by=['count', 'species'],
        ascending=[False, True]
    ).iloc[0]['species']

    return round(score, 2), winner
In [ ]:
### Solution - Exercise 10  
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Return (safety_score, best_edible_species) for the observations
    within `radius` of `coordinates`.

    Fix vs. original: species ranking used
    ``value_counts().reset_index().rename(columns={'index': 'species', 'species': 'count'})``,
    which depends on the pre-pandas-2.0 column naming of ``reset_index()``.
    On pandas >= 2.0 that rename yields two 'count' columns and the
    subsequent sort raises KeyError. ``groupby(...).size()`` is stable
    across pandas versions. Commented-out debug lines were also removed.
    """
    observations = mushroom_severity_df[['longitude', 'latitude']].to_numpy()

    # Keep only the observations within the search radius.
    distances = distance_function(coordinates, observations)
    within_radius_df = mushroom_severity_df[distances <= radius]

    # Fractions of nearby observations that are poisonous / severe / dupes.
    N = len(within_radius_df)
    n_pois = ((within_radius_df['poisonous'] == True).sum()) / N
    n_severe = ((within_radius_df['severe'] == True).sum()) / N
    n_dupes = ((within_radius_df['dupe'] == True).sum()) / N

    safety_score = 100 * (1 - n_pois) * (1 - n_severe) * (1 - n_dupes)
    safety_score = round(safety_score, 2)

    # Most common edible species nearby; ties broken alphabetically.
    edible = within_radius_df[within_radius_df['edible'] == True]
    best_species = (edible.groupby('species')
                    .size()
                    .reset_index(name='count')
                    .sort_values(['count', 'species'], ascending=[False, True])
                    .iloc[0]['species'])

    return safety_score, best_species
In [ ]:
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Safety score (2 decimals) and most common edible species among the
    observations within `radius` of `coordinates` (ties alphabetical)."""
    # Distance to every observation, then keep the near ones.
    local = mushroom_severity_df.copy()
    local['dist'] = distance_function(coordinates, local[['longitude', 'latitude']].values)
    local = local[local['dist'] <= radius]

    # Score = 100 * product of "not poisonous / severe / dupe" fractions.
    total = local.shape[0]
    score = 100.0
    for flag in ('poisonous', 'severe', 'dupe'):
        score *= (1 - local[local[flag] == 1].shape[0] / total)
    score = round(score, 2)

    # Edible species ranked by count (desc) then name (asc).
    ranked = (local[local['edible'] == 1]
              .groupby('species', as_index=False)
              .agg(count=('species', 'count'))
              .sort_values(by=['count', 'species'], ascending=[False, True])
              .reset_index())
    return score, ranked['species'][0]
In [ ]:
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Return (safety_score, best_edible_species) for the area within
    `radius` of `coordinates`.

    Fixes vs. original: stripped the large blocks of commented-out debug
    statements, and replaced ``.iloc[0][0]`` (integer fallback indexing on a
    labelled Series, deprecated in pandas 2.x and removed in 3.0) with an
    explicit label lookup.
    """
    ## Work on a copy so the caller's frame is untouched.
    mushroom_severity_df_copy = mushroom_severity_df.copy()

    ## Keep only observations within `radius` of the query coordinates.
    mushroom_coord = mushroom_severity_df_copy[['longitude', 'latitude']].to_numpy()
    distance_to_obs = distance_function(coordinates, mushroom_coord)
    mushroom_severity_df_copy["distance_to_obs"] = distance_to_obs
    mushroom_severity_df_copy = mushroom_severity_df_copy[mushroom_severity_df_copy.distance_to_obs <= radius]

    ## Safety score: 100 * product of the "not poisonous/severe/dupe" fractions.
    total_num = len(mushroom_severity_df_copy)
    poisonous_num = sum(mushroom_severity_df_copy.poisonous)
    dupe_num = sum(mushroom_severity_df_copy.dupe)
    severe_num = sum(mushroom_severity_df_copy.severe)
    safety_score = (1 - (poisonous_num/total_num)) * (1 - (severe_num/total_num)) * (1 - (dupe_num/total_num))
    safety_score = round(100*safety_score, 2)

    ## Most common edible species; ties broken alphabetically.
    edible = mushroom_severity_df_copy[mushroom_severity_df_copy.edible == 1]
    counts = edible.groupby('species')['species'].count().reset_index(name="count")
    counts = counts.sort_values(by=['count', 'species'], ascending=[False, True])
    best_species = counts.iloc[0]['species']

    return(safety_score, best_species)
In [ ]:
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Compute the safety score for the observations within `radius` of
    `coordinates` and the most frequent edible species there (ties broken
    alphabetically)."""
    frame = mushroom_severity_df.copy()

    # Distance from the query point to every observation.
    coords_arr = np.array(frame[['longitude', 'latitude']])
    frame['distance'] = distance_function(np.array(coordinates), coords_arr)

    # Restrict to the neighbourhood.
    close = frame[frame['distance'] <= radius]
    n = close.shape[0]

    # Score = product of "not poisonous / not severe / not dupe" fractions.
    bad_fraction = lambda col: close[col].sum() / n
    score = round((1 - bad_fraction('poisonous'))
                  * (1 - bad_fraction('severe'))
                  * (1 - bad_fraction('dupe')) * 100, 2)

    # Edible species ranked by frequency (desc) then name (asc).
    edible_close = close[close['edible'] == 1]
    ordering = (
        edible_close.groupby('species')
        .size()
        .reset_index(name='count')
        .sort_values(by=['count', 'species'], ascending=[False, True])
    )
    return score, ordering.iloc[0]['species']
In [ ]:
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Return (safety_score, best_edible_species) for the neighbourhood of
    `coordinates`, where the score multiplies the fractions of nearby
    observations that are NOT poisonous / severe / dupes."""
    # Filter to observations within the radius.
    query_point = np.array(coordinates)
    obs_xy = mushroom_severity_df[['longitude', 'latitude']].values
    nearby = mushroom_severity_df[distance_function(query_point, obs_xy) <= radius]

    # Hazard counts among the nearby observations.
    n = len(nearby)
    hazard = {col: nearby[col].sum() for col in ('poisonous', 'severe', 'dupe')}

    score = (1 - hazard['poisonous'] / n) * (1 - hazard['severe'] / n) * (1 - hazard['dupe'] / n) * 100
    score = round(score, 2)

    # Most frequent edible species; alphabetical order breaks ties.
    tallies = nearby[nearby['edible'] == 1]['species'].value_counts()
    best = min(tallies[tallies == tallies.max()].index)

    return score, best