SP2026 FX Alternative Solutions¶
Ex1¶
In [ ]:
# Ex1: clean the raw observations table -- keep rows with coordinate
# uncertainty <= 25 m, outside Hawaii/Alaska, non-NULL species, then keep
# only species with at least 25 such rows (window COUNT(*) OVER
# (PARTITION BY species) computed in the inner query). Remaining NULL
# checks are applied in the outer query.
cleanse_observations_query = '''
SELECT
class,"order",family,genus,species,state,
latitude,longitude,coordinate_uncertainty,day,month,year
FROM
(SELECT
class,"order",family,genus,species,
stateProvince AS state,
decimalLatitude AS latitude,
decimalLongitude AS longitude,
coordinateUncertaintyInMeters AS coordinate_uncertainty,
day,month,year,
COUNT(*) OVER (PARTITION by species) as sp_count
FROM observations
WHERE coordinateUncertaintyInMeters <= 25
AND stateProvince NOT IN ('Hawaii','Alaska')
AND species IS NOT NULL)
WHERE sp_count >= 25
AND class IS NOT NULL
AND "order" IS NOT NULL
AND family IS NOT NULL
AND genus IS NOT NULL
AND species IS NOT NULL
AND state IS NOT NULL
'''
In [ ]:
### Solution - Exercise 1
# Ex1 alternative: same cleanse as the subquery version but written as a CTE.
# Note the backtick identifier quoting (`order`, `class`) -- MySQL/DuckDB
# style rather than the double-quote style used by the other solutions.
cleanse_observations_query = '''
with data_with_counts as (
select
class,
`order`,
family,
genus,
species,
stateprovince,
decimallatitude,
decimallongitude,
coordinateuncertaintyinmeters,
day,
month,
year,
count(*) over(
partition by
species
) as species_count
from
observations
where
coordinateuncertaintyinmeters <= 25
and stateprovince not in ('Hawaii', 'Alaska')
and `class` is not null
and `order` is not null
and family is not null
and genus is not null
and species is not null
and stateprovince is not null
)
select
`class`,
`order`,
family,
genus,
species,
stateprovince as state,
decimallatitude as latitude,
decimallongitude as longitude,
coordinateuncertaintyinmeters as coordinate_uncertainty,
day,
month,
year
from
data_with_counts
where
species_count >= 25
'''
In [ ]:
### Solution - Exercise 1
# Ex1 alternative: subquery version with all NULL/region/uncertainty filters
# pushed into the inner SELECT; outer query keeps species with >= 25 rows
# via the window count computed inside (derived table aliased as `sub`).
cleanse_observations_query = '''
SELECT class,
"order",
family,
genus,
species,
state,
latitude,
longitude,
coordinate_uncertainty,
day,
month,
year
FROM (
SELECT class,
"order",
family,
genus,
species,
stateProvince AS state,
decimalLatitude AS latitude,
decimalLongitude AS longitude,
coordinateUncertaintyInMeters AS coordinate_uncertainty,
day,
month,
year,
COUNT(*) OVER (PARTITION BY species) AS species_count
FROM observations
WHERE class IS NOT NULL
AND "order" IS NOT NULL
AND family IS NOT NULL
AND genus IS NOT NULL
AND species IS NOT NULL
AND stateProvince IS NOT NULL
AND coordinateUncertaintyInMeters <= 25
AND stateProvince NOT IN ('Hawaii', 'Alaska')
) sub
WHERE species_count >= 25
'''
pass
In [ ]:
# Ex1 alternative: computes per-species counts with a GROUP BY self-join
# instead of a window function; the same filter predicates are repeated in
# both the joined aggregate and the outer WHERE so the counts match the
# rows kept. Also selects gbifID and orders by it.
cleanse_observations_query = '''
SELECT gbifID, class, "order", family, genus,o.species,stateProvince state
,decimalLatitude latitude,decimalLongitude longitude
,coordinateUncertaintyInMeters coordinate_uncertainty, day, month, year
FROM observations o
join (select species,count(*) obs from observations o
where o.coordinateUncertaintyInMeters<=25
and class is not null and "order" is not null and family is not null
and genus is not null and o.species is not null and stateProvince is not null
and stateProvince!='Hawaii' and stateProvince!='Alaska'
group by 1) b on o.species=b.species
where b.obs>=25 and o.coordinateUncertaintyInMeters<=25
and class is not null and "order" is not null and family is not null
and genus is not null and o.species is not null and stateProvince is not null
and stateProvince!='Hawaii' and stateProvince!='Alaska'
order by 1
'''
In [ ]:
# Ex1 alternative (CTE + window count), with the non-portable SQL cleaned up:
# - "Hawaii"/"Alaska" were double-quoted, which standard SQL parses as
#   identifiers, not string literals -> single quotes.
# - the WHERE clause referenced the SELECT alias `state`, which is not in
#   scope in the same SELECT's WHERE -> filter on stateProvince directly.
# - `expr NOT NULL` is a SQLite-only shorthand -> standard `IS NOT NULL`.
# - `< 26` / `> 24` obscured the intended thresholds -> `<= 25` / `>= 25`.
cleanse_observations_query = '''
WITH table_temp AS (
SELECT gbifID, class, "order", family, genus, species, stateProvince as state, decimalLatitude as latitude, decimalLongitude as longitude, coordinateUncertaintyInMeters as coordinate_uncertainty, day, month, year, COUNT(*) OVER (PARTITION BY species) as cnt
FROM observations
WHERE coordinateUncertaintyInMeters <= 25
AND stateProvince NOT IN ('Hawaii', 'Alaska')
AND class IS NOT NULL
AND "order" IS NOT NULL
AND family IS NOT NULL
AND genus IS NOT NULL
AND species IS NOT NULL
AND stateProvince IS NOT NULL
ORDER BY gbifID
)
SELECT gbifID, class, "order", family, genus, species, state, latitude, longitude, coordinate_uncertainty, day, month, year
FROM table_temp
WHERE cnt >= 25
'''
In [ ]:
cleanse_observations_query = '''
WITH filtered AS (
SELECT
gbifID,
class,
"order",
family,
genus,
species,
stateProvince AS state,
decimalLatitude AS latitude,
decimalLongitude AS longitude,
coordinateUncertaintyInMeters AS coordinate_uncertainty,
day,
month,
year,
COUNT(*) OVER (PARTITION BY species) AS species_count
FROM observations
WHERE coordinateUncertaintyInMeters <= 25
AND stateProvince NOT IN ('Hawaii', 'Alaska')
AND class IS NOT NULL
AND "order" IS NOT NULL
AND family IS NOT NULL
AND genus IS NOT NULL
AND species IS NOT NULL
AND stateProvince IS NOT NULL
)
SELECT
gbifID,
class,
"order",
family,
genus,
species,
state,
latitude,
longitude,
coordinate_uncertainty,
day,
month,
year
FROM filtered
WHERE species_count >= 25
ORDER BY gbifID ASC;'''
In [ ]:
# Ex1 alternative: same CTE-plus-window-count structure as the previous
# solution, differing only in keyword casing/formatting.
cleanse_observations_query = '''
WITH filtered as (
SELECT
gbifID,
class,
"order",
family,
genus,
species,
stateProvince as state,
decimalLatitude as latitude,
decimalLongitude as longitude,
coordinateUncertaintyInMeters as coordinate_uncertainty,
day,
month,
year,
COUNT(*) over (partition by species) as species_count
FROM
observations
WHERE
coordinateUncertaintyInMeters <=25
and stateProvince Not in ('Hawaii','Alaska')
and class is not null
and "order" is not null
and family is not null
and genus is not null
and species is not null
and stateProvince is not null
)
Select
gbifID,
class,
"order",
family,
genus,
species,
state,
latitude,
longitude,
coordinate_uncertainty,
day,
month,
year
FROM
filtered
WHERE
species_count>=25
order by
gbifID ASC
'''
In [ ]:
cleanse_observations_subquery = '''
SELECT gbifID, class, "order", family, genus, species,
stateProvince AS state,
decimalLatitude AS latitude,
decimalLongitude AS longitude,
coordinateUncertaintyInMeters AS coordinate_uncertainty,
day, month, year,
COUNT(species) OVER (PARTITION BY species) AS species_count
FROM observations
WHERE (coordinate_uncertainty <= 25)
AND (state <> 'Hawaii') AND (state <> 'Alaska')
AND (class IS NOT NULL)
AND ("order" IS NOT NULL)
AND (family IS NOT NULL)
AND (genus IS NOT NULL)
AND (species IS NOT NULL)
AND (state IS NOT NULL)
'''
cleanse_observations_query = f'''
SELECT gbifID, class, "order", family, genus, species, state, latitude, longitude, coordinate_uncertainty, day, month, year
FROM ({cleanse_observations_subquery})
WHERE species_count >= 25
'''
Ex3¶
In [ ]:
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
    """Collect the edible look-alike species referenced by poisonous_df.

    Scans the free-text 'Similar edible species' column for binomial names
    ('Genus species'), expands genus-level references ('X species' / 'X spp')
    to every matching name in edible_df['Scientific name'], and returns the
    resulting set of edible species names.
    """
    import re
    import itertools
    edibles = list(edible_df['Scientific name'])
    # Fix 1: a binomial starts with a capitalized genus. The old pattern
    # [A-Za-z]+\s[a-z]+ also matched arbitrary lowercase word pairs from the
    # surrounding prose, polluting the candidate set.
    s = poisonous_df['Similar edible species'].apply(
        lambda x: re.findall(r'[A-Z][a-z]+\s[a-z]+', str(x)))
    similar_edibles = set(itertools.chain.from_iterable(s.values))
    updated_sim = set()
    for i in similar_edibles:
        if 'species' in i or 'spp' in i:
            # Genus-level reference: expand to all edible species of that genus.
            # Fix 2: compare the genus token exactly; the old substring test
            # (genus in name) could also match the genus inside other words.
            genus = i.split()[0]
            updated_sim.update(
                name for name in edibles if str(name).split()[0] == genus)
        else:
            updated_sim.add(i)
    return updated_sim
In [ ]:
### Solution - Exercise 3
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
# Fixed regex: [A-Z][a-z]+ requires a capitalized genus so only
# 'Genus species' binomials are extracted from the free-text column.
import re
import itertools
edibles = list(edible_df['Scientific name'])
s = poisonous_df['Similar edible species'].apply(lambda x: re.findall(r'[A-Z][a-z]+\s[a-z]+',str(x)))
similar_edibles= set(itertools.chain.from_iterable(s.values))
# display(similar_edibles)
updated_sim = set()
# Genus-level references ('X species' / 'X spp') expand to every edible
# name containing the genus token (substring match).
for i in similar_edibles:
if 'species' in i or 'spp' in i:
genus = i.split()[0]
updated_sim.update([name for name in edibles if genus in name])
else:
updated_sim.add(i)
return updated_sim
In [ ]:
### Solution - Exercise 3
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
# Variant: word-boundary anchors in the regex and an exact genus-token
# comparison (name.split()[0] == genus) instead of a substring match.
import re
import itertools
edibles = list(edible_df['Scientific name'])
s = poisonous_df['Similar edible species'].apply(lambda x: re.findall(r'\b[A-Z][a-z]+ [a-z]+\b',str(x)))
similar_edibles= set(itertools.chain.from_iterable(s))
# display(similar_edibles)
updated_sim = set()
for i in similar_edibles:
if 'species' in i or 'spp' in i:
# Genus-level reference: add every edible whose genus token matches.
genus = i.split()[0]
name_list = [name for name in edibles if name.split()[0] == genus]
# print(f'name_list: {name_list}')
for name in name_list:
updated_sim.add(name)
else:
updated_sim.add(i)
return updated_sim
pass
In [ ]:
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
    """Return the set of edible look-alike species referenced by poisonous_df.

    Binomials ('Genus species') are pulled from the free-text
    'Similar edible species' column; genus-level mentions ('X species' /
    'X spp') are expanded against edible_df['Scientific name'].
    """
    import re
    import itertools
    edibles = list(edible_df['Scientific name'])
    # Extract every capitalized-genus binomial from each cell.
    matches = poisonous_df['Similar edible species'].apply(
        lambda cell: re.findall(r'[A-Z][a-z]+\s[a-z]+', str(cell)))
    candidates = set(itertools.chain.from_iterable(matches.values))
    result = set()
    for candidate in candidates:
        if 'species' not in candidate and 'spp' not in candidate:
            result.add(candidate)
        else:
            # Genus-level reference: pull in every edible name containing the genus.
            genus = candidate.split()[0]
            result.update(name for name in edibles if genus in name)
    return result
In [ ]:
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
# Heavily annotated debug walkthrough: each "change N" comment marks one
# fix over the broken starter (set instead of list, explicit pattern that
# also matches 'species'/'spp' suffixes, startswith genus matching).
import re
# change 1
# import itertools (seems unneccesary)
# change 2
# edibles = list(edible_df['Scientific name'])
edibles = set(edible_df['Scientific name'])
# change 3: taking pattern out + better pattern
pattern = r"\b[A-Z][a-z]+ (?:[a-z]+|species|spp)\b"
# another change
similar_edibles = set()
# s = poisonous_df['Similar edible species'].apply(lambda x: re.findall(r'^[a-z]+\s[a-z]+$',str(x)))
# similar_edibles= list(itertools.chain(s.values()))
# display(similar_edibles)
# change 4
for x in poisonous_df['Similar edible species']:
found = re.findall(pattern, str(x))
similar_edibles.update(found)
# another change
updated_sim = set()
# change 5
for i in similar_edibles:
# Every match is exactly two tokens, so this unpack is safe here.
genus, species = i.split()
if species in ['species', 'spp']:
for edible in edibles:
if edible.startswith(genus + " "):
updated_sim.add(edible)
else:
updated_sim.add(i)
return updated_sim
In [ ]:
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
    """Return the set of edible look-alike species referenced by poisonous_df.

    Edible names are stripped/de-duplicated (NaNs dropped); binomials are
    extracted with a pre-compiled word-boundary pattern, and genus-level
    mentions ('X species' / 'X spp') expand to edibles of that genus.
    """
    import re
    import itertools
    edibles = [s.strip() for s in edible_df['Scientific name'].dropna().unique()]
    pattern = re.compile(r'\b[A-Z][a-z]+\s[a-z]+\b')
    s = poisonous_df['Similar edible species'].apply(lambda x: pattern.findall(str(x)))
    similar_edibles = list(itertools.chain.from_iterable(s.values))
    # Fix: dropped the leftover display(similar_edibles) debug call --
    # display() is an IPython builtin and raises NameError when this
    # function runs outside a notebook.
    updated_sim = set()
    for i in similar_edibles:
        if i.split()[1] in ('species', 'spp'):
            # Genus-level reference: match on the genus prefix.
            genus = i.split()[0]
            updated_sim.update(name for name in edibles if name.startswith(genus + ' '))
        else:
            updated_sim.add(i)
    return updated_sim
In [ ]:
def DEBUG_find_similar(poisonous_df: pd.DataFrame, edible_df: pd.DataFrame) -> set:
# Variant: collects genus-level wildcards first, then does a single pass
# over the edibles to expand them.
# NOTE(review): np.concatenate raises on an all-empty s.values -- confirm
# poisonous_df always yields at least one regex match.
import re
edibles = set(edible_df['Scientific name'])
s = poisonous_df['Similar edible species'].apply(lambda x: re.findall(r'[A-Z][a-z]*\s[a-z]+', str(x)))
s = set(np.concatenate(s.values))
similar_edibles = set()
wildcard_geni = set()
# Split matches into exact binomials and genus wildcards ('X species'/'X spp').
for e in s:
genus, species = e.split()
if species in ['species', 'spp']:
wildcard_geni.add(genus)
else:
similar_edibles.add(e)
updated_sim = similar_edibles
# Expand each wildcard genus to all edible species of that genus.
for i in edibles:
genus = i.split()[0]
if genus in wildcard_geni:
updated_sim.add(i)
return updated_sim
Ex4¶
In [ ]:
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
    """Flag severe (deadly & poisonous) and dupe (edible look-alike) rows.

    Returns a copy of mushroom_poison_df with 0/1 'dupe' and 'severe'
    columns; the merged severity helper columns are dropped before return.
    """
    result = mushroom_poison_df.copy()
    # Attach each species' recorded severity (left join keeps all rows).
    severity_cols = poisonous_df[['Scientific name', 'Severity']]
    result = result.merge(severity_cols, left_on='species', right_on='Scientific name', how='left')
    is_dupe = (result['edible'] == 1) & result['species'].isin(edible_dupes)
    result['dupe'] = np.where(is_dupe, 1, 0)
    is_severe = (result['poisonous'] == 1) & (result['Severity'] == 'deadly')
    result['severe'] = np.where(is_severe, 1, 0)
    return result.drop(columns=['Scientific name', 'Severity'])
In [ ]:
### Solution - Exercise 4
### Solution - Exercise 4
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
# Merge in each species' recorded severity, then flag:
#   severe = poisonous row of a species rated 'deadly'
#   dupe   = edible row of a species in edible_dupes
# Helper columns from the merge are dropped on return.
df = mushroom_poison_df.copy(deep=True)
df = df.merge(
poisonous_df[['Scientific name', 'Severity']],
left_on='species',
right_on='Scientific name',
how='left'
)
df['severe'] = np.where(
(df['poisonous'] == 1) & (df['Severity'] == 'deadly'),
1,
0
)
df['dupe'] = np.where(
(df['edible'] == 1) & (df['species'].isin(edible_dupes)),
1,
0
)
return df.drop(
columns=['Scientific name', 'Severity']
)
In [ ]:
### Solution - Exercise 4
### Solution - Exercise 4
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
###
# No-merge variant: membership tests against the deadly species list and
# edible_dupes produce the 0/1 'dupe' and 'severe' flags directly.
combined_df = mushroom_poison_df.copy()
combined_df["dupe"] = np.where(
(combined_df['edible'] == 1) & combined_df['species'].isin(edible_dupes),
1, 0
)
# Species rated 'deadly' in the poisonous reference table.
severe_poison_df = poisonous_df[poisonous_df['Severity']== 'deadly']
combined_df["severe"] = np.where(
(combined_df['poisonous'] == 1) & combined_df['species'].isin(severe_poison_df['Scientific name']),
1, 0
)
return combined_df
###
pass
In [ ]:
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
    """Add integer 'severe' and 'dupe' flag columns to a copy of mushroom_poison_df.

    severe = poisonous observation of a species rated 'deadly'.
    dupe   = edible observation of a species present in edible_dupes.
    """
    out = mushroom_poison_df.copy()
    # Species whose poisoning severity is recorded as deadly.
    deadly = poisonous_df[poisonous_df['Severity'] == 'deadly']['Scientific name'].tolist()
    out['severe'] = ((out['species'].isin(deadly)) & (out['poisonous'] == 1)).astype(int)
    out['dupe'] = ((out['species'].isin(list(edible_dupes))) & (out['edible'] == 1)).astype(int)
    return out
In [ ]:
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
    """Return a copy of mushroom_poison_df with 'severe' and 'dupe' 0/1 flags.

    severe = poisonous observation of a species rated 'deadly' in poisonous_df.
    dupe   = edible observation of a species that appears in edible_dupes.
    """
    # Species rated deadly in the poisonous reference table.
    deadly_severity_mushrooms = poisonous_df["Scientific name"][poisonous_df.Severity == "deadly"]
    copy_mushroom_poison_df = mushroom_poison_df.copy()
    # Fix: the original used chained indexing (df.severe.loc[mask] = 1),
    # which assigns through an intermediate object -- it triggers
    # SettingWithCopyWarning and becomes a silent no-op under pandas
    # copy-on-write (default in pandas 3.0). A single .loc sets in place.
    copy_mushroom_poison_df["severe"] = 0
    copy_mushroom_poison_df.loc[
        (copy_mushroom_poison_df.poisonous == 1)
        & copy_mushroom_poison_df.species.isin(deadly_severity_mushrooms),
        "severe"] = 1
    copy_mushroom_poison_df["dupe"] = 0
    copy_mushroom_poison_df.loc[
        (copy_mushroom_poison_df.edible == 1)
        & copy_mushroom_poison_df.species.isin(edible_dupes),
        "dupe"] = 1
    return copy_mushroom_poison_df
In [ ]:
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
# Variant: builds membership flags with .apply, then multiplies by the
# poisonous/edible 0-1 columns so the flag only survives on matching rows.
mush_df = mushroom_poison_df.copy()
pois_df = poisonous_df.copy()
# display(mush_df.head())
# pois_df.rename(columns={'Scientific name': 'species'}, inplace=True)
# display(pois_df.head())
# mush_df = pd.merge(mush_df, pois_df, on = 'species', how='left')
# mush_df.dropna(inplace=True)
# Species rated 'deadly' in the reference table.
deadly_mush = set(pois_df.loc[pois_df['Severity']=='deadly', 'Scientific name'])
# print(deadly_mush)
mush_df['dupe'] = mush_df['species'].apply(lambda x: 1 if x in edible_dupes else 0)
mush_df['severe'] = mush_df['species'].apply(lambda x: 1 if x in deadly_mush else 0)
# display(mush_df)
# Masking by the 0/1 indicator columns keeps flags only where they apply.
mush_df['severe'] = mush_df['severe'] * mush_df['poisonous']
mush_df['dupe'] = mush_df['dupe'] * mush_df['edible']
mushroom_severity_df = mush_df
return mushroom_severity_df
In [ ]:
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
# Compact variant: boolean masks cast to int build both flags directly.
mushroom_severity_df = mushroom_poison_df.copy()
# Species rated 'deadly' in the poisonous reference table.
deadly_species = set(
poisonous_df.loc[poisonous_df['Severity']=='deadly','Scientific name'])
mushroom_severity_df['severe'] = ((mushroom_severity_df['poisonous']==1 ) &
(mushroom_severity_df['species'].isin(deadly_species))).astype(int)
mushroom_severity_df['dupe'] = ((mushroom_severity_df['edible']==1 ) &
(mushroom_severity_df['species'].isin(edible_dupes))).astype(int)
return mushroom_severity_df
In [ ]:
def determine_severity(edible_dupes: set, poisonous_df: pd.DataFrame, mushroom_poison_df: pd.DataFrame) -> set:
# Variant: computes boolean flag columns first, then casts both to int in
# a final loop.
deadly = poisonous_df['Scientific name'][poisonous_df['Severity'] == 'deadly']
mushroom_severity_df = mushroom_poison_df.copy()
mushroom_severity_df['severe'] = \
(mushroom_severity_df['poisonous'] == 1) \
& (mushroom_severity_df['species'].isin(deadly))
mushroom_severity_df['dupe'] = \
(mushroom_severity_df['edible'] == 1) \
& (mushroom_severity_df['species'].isin(edible_dupes))
for c in ['severe', 'dupe']:
mushroom_severity_df[c] = \
mushroom_severity_df[c].astype(int)
return mushroom_severity_df
Ex5¶
In [ ]:
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus: str) -> pd.DataFrame:
    """Month (1-12) x species observation-count table for one genus.

    Months with no observations for the genus appear as all-zero rows.
    """
    subset = mushroom_severity_df.copy()
    subset = subset[subset['genus'] == genus]
    counts = (
        subset[['month', 'genus', 'species']]
        .groupby(by=['month', 'species'], as_index=False)
        .count()
        .rename(columns={'genus': 'counts'})
    )
    monthly_df = counts.pivot_table(index='month', columns='species', values='counts', fill_value=0.0)
    # Guarantee a full January-December index even for empty months.
    return monthly_df.reindex(range(1, 13), fill_value=0)
In [ ]:
### Solution - Exercise 5
### Solution - Exercise 5
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) -> pd.DataFrame:
# Count observations per (month, species) for the genus, pivot to a
# month x species grid, then reindex so all 12 months are present.
df = mushroom_severity_df.copy(deep=True)
df = df[df['genus'] == genus]
df = (
df[['month', 'genus', 'species']]
.groupby(
by = ['month', 'species'],
as_index = False
).count()
.rename(
columns={'genus':'count'}
)
)
monthly_df = df.pivot_table(
index = 'month',
columns = 'species',
values = 'count',
fill_value = 0.0
)
# Zero-fill months with no observations so the index always runs 1..12.
return monthly_df.reindex(
range(1, 13),
fill_value=0
)
In [ ]:
### Solution - Exercise 5
### Solution - Exercise 5
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) -> pd.DataFrame:
# Same pivot-and-reindex approach as the other solutions; the reindex is
# what the debug exercise adds (months 1-12 always present).
df = mushroom_severity_df.copy()
df = df[df['genus'] == genus]
df = df[['month', 'genus', 'species']].groupby(by = ['month', 'species'],
as_index = False).count().rename(columns={'genus':'counts'})
monthly_df = df.pivot_table(index = 'month', columns = 'species', values = 'counts', fill_value = 0.0)
# Ensure all months 1-12 are in the index
monthly_df = monthly_df.reindex(range(1, 13), fill_value=0)
return monthly_df
pass
In [ ]:
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus: str) -> pd.DataFrame:
    """Return a month x species observation-count table for `genus`.

    The index always covers months 1-12, in order, with zero-filled rows
    for months that have no observations.
    """
    df = mushroom_severity_df.copy()
    df = df[df['genus'] == genus]
    counts = (
        df[['month', 'genus', 'species']]
        .groupby(by=['month', 'species'])
        .count()
        .reset_index()
        .rename(columns={'genus': 'counts'})
    )
    monthly_df = (
        counts.pivot_table(index='month', columns='species', values='counts', fill_value=0)
        .astype('int64')
    )
    # Fix: the original appended missing months with .loc in set-iteration
    # order, leaving the index unsorted (e.g. 5, 1, 2, ...). reindex yields
    # the same zero-filled rows AND a properly ordered 1..12 index.
    monthly_df.index.name = 'month'
    return monthly_df.reindex(range(1, 13), fill_value=0)
In [ ]:
### Solution - Exercise 5
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) -> pd.DataFrame:
##CHECK DATA
##print(mushroom_severity_df)
##print(genus)
##assert False
##FILTER DATA
mushroom_severity_df_copy = mushroom_severity_df.copy()
mushroom_severity_df_copy['is_genus'] = mushroom_severity_df_copy['genus'] == genus
df = mushroom_severity_df_copy[mushroom_severity_df_copy['is_genus']]
##COUNT SPECIES
##df = df[['month', 'genus', 'species']].groupby(by = ['month', 'genus','species']).count().rename(columns={'species':'counts'})
df = df[['month', 'genus', 'species']].groupby(by = ['month', 'species']).count().rename(columns = {"genus":"counts"})##.rename(columns={'species':'counts'})
##PIVOT
##monthly_df = df.pivot_table(index = 'species', columns = 'month', values = 'counts').reset_index()
monthly_df = df.pivot_table(index = 'month', columns = 'species', values = 'counts')##.reset_index(drop = True)
##print(monthly_df)
##assert False
##CHECK FOR MISSING MONTHS
months = [1,2,3,4,5,6,7,8,9,10,11,12]
for month in months:
if month not in monthly_df.index:
monthly_df.loc[month] = 0
##SET MISSING VALUES TO 0
monthly_df = monthly_df.fillna(0)
monthly_df = monthly_df.astype(int)
##RETURN DF
return monthly_df
In [ ]:
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) -> pd.DataFrame:
# Compact variant: groupby(...).size() replaces the column-count trick,
# and reindex(range(1, 13)) zero-fills any month with no observations.
# mushroom_severity_df['is_genus'] = mushroom_severity_df['genus'] == genus
df = mushroom_severity_df[mushroom_severity_df['genus'] == genus]
# df = df[['month', 'genus', 'species']].groupby(by = ['month', 'genus']).count().rename(columns={'species':'counts'})
df = df.groupby(['month', 'species']).size().reset_index(name='counts')
monthly_df = df.pivot_table(index = 'month', columns = 'species', values = 'counts', fill_value=0) #.reset_index()
monthly_df = monthly_df.reindex(range(1, 13), fill_value=0)
return monthly_df
In [ ]:
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus: str) -> pd.DataFrame:
    """Pivot observation counts of one genus into a 12-month x species grid."""
    genus_rows = mushroom_severity_df[mushroom_severity_df['genus'] == genus]
    # Count rows per (month, species); the 'genus' column just carries the count.
    grouped = genus_rows[['month', 'genus', 'species']].groupby(by=['month', 'species']).count()
    grouped = grouped.rename(columns={'genus': 'counts'})
    pivoted = grouped.pivot_table(index='month', columns='species', values='counts', fill_value=0)
    # Zero-fill any month with no observations so the index always runs 1..12.
    monthly_df = pivoted.reindex(range(1, 13), fill_value=0)
    return monthly_df
In [ ]:
def DEBUG_peak_months(mushroom_severity_df: pd.DataFrame, genus:str) -> pd.DataFrame:
# Variant: fills in any absent months by appending explicit zero rows
# (set difference against the pivot index) instead of reindexing.
is_genus = mushroom_severity_df['genus'] == genus
df = mushroom_severity_df[is_genus][['month', 'genus', 'species']] \
.groupby(by=['month', 'species']) \
.count() \
.rename(columns={'genus': 'counts'})
monthly_df = df.pivot_table(index='month',
columns='species',
values='counts',
fill_value=0)
# Append a zero row for every month absent from the pivot index.
missing_months = set(range(1, 13)) - set(monthly_df.index)
if len(missing_months) > 0:
zeros_row = [0] * len(monthly_df.columns)
for missing in missing_months:
monthly_df.loc[missing] = zeros_row
return monthly_df
Ex6¶
In [ ]:
### Solution - Exercise 6
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
    """Per-state observation counts for `species`.

    Sorted by count descending with state name ascending as the tie-break.
    """
    species_rows = mushroom_severity_df.copy()
    species_rows = species_rows[species_rows['species'] == species]
    counts = species_rows.groupby('state')['state'].count().reset_index(name='count')
    return counts.sort_values(['count', 'state'], ascending=[False, True])
In [ ]:
### Solution - Exercise 6
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
# print(mushroom_severity_df.columns)
df = mushroom_severity_df[mushroom_severity_df["species"]==species].copy(deep=True)
df = (
df.groupby('state')
.size()
.reset_index(
name='count'
)
)
return df.sort_values(
["count","state"],
ascending = [False,True]
)
In [ ]:
### Solution - Exercise 6
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
###
df = mushroom_severity_df.copy()
filtered_df = df[df['species']==species]
grouped_df = (filtered_df.groupby('state')
.size()
.reset_index(name='count')
.sort_values(['count', 'state'], ascending=[False, True])
.reset_index(drop=True))
return grouped_df
###
pass
In [ ]:
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
    """Count observations of `species` per state, most frequent first.

    Ties are broken by state name in ascending order.
    """
    matching = mushroom_severity_df[mushroom_severity_df['species'] == species]
    # Named aggregation: count rows per state into a 'count' column.
    summary = matching.groupby('state', as_index=False).agg(count=('state', 'count'))
    return summary.sort_values(by=['count', 'state'], ascending=[False, True])
In [ ]:
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
# Per-state observation counts for one species; count desc, state asc.
df = mushroom_severity_df[mushroom_severity_df['species'] == species]
top_states = (
df.groupby('state')
.size()
.reset_index(name='count')
.sort_values(by=['count', 'state'], ascending=[False, True])
.reset_index(drop=True)
)
return top_states
In [ ]:
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
    """Return per-state observation counts for `species`.

    Sorted by count (descending) with state name (ascending) as a
    tie-break, so equal counts come back in a deterministic order.
    """
    df = mushroom_severity_df[mushroom_severity_df['species'] == species]
    # Fix: sort on ('count', 'state') instead of 'count' alone -- the default
    # quicksort is not stable, so tied counts previously had no guaranteed
    # order (the other Ex6 solutions all tie-break on state).
    top_states = (df.groupby('state').size().reset_index(name='count')
                  .sort_values(['count', 'state'], ascending=[False, True])
                  .reset_index(drop=True))
    return top_states
In [ ]:
def top_states(mushroom_severity_df: pd.DataFrame, species: str) -> pd.DataFrame:
    """Per-state observation counts for one species, most frequent first."""
    is_species = mushroom_severity_df['species'].isin({species})
    counts = mushroom_severity_df['state'][is_species].value_counts()
    # Fix: the original did reset_index().rename({'index': 'state',
    # 'state': 'count'}), which relies on pandas < 2.0 naming; on
    # pandas >= 2.0 value_counts().reset_index() already yields
    # ('state', 'count') and the rename produced two 'count' columns.
    # rename_axis + reset_index(name=...) is stable across versions.
    countsdf = counts.rename_axis('state').reset_index(name='count')
    countsdf = countsdf.sort_values(by='count', ascending=False)
    return countsdf
Ex7¶
In [ ]:
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
    """Elbow-method KMeans over (longitude, latitude) points of one species.

    Grows k from 1 and refits until the percentage decrease in inertia
    (WCSS) drops to/below `threshold` or k reaches 20, then returns the
    cluster centers of the previous (accepted) k.
    """
    from sklearn.cluster import KMeans
    import math
    ### Filter mushroom_severity_df
    _df = mushroom_severity_df.copy()
    df = _df[_df['species'] == species][['longitude', 'latitude']]
    k = 1
    k_max = 20
    # Baseline fit for k = 1; math.inf forces at least one k = 2 iteration.
    prior_centers = None
    kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
    inertia = kmeans.inertia_
    wcss_percent_decrease = math.inf
    # loop terminates when k reaches k_max or the percentage decrease in
    # inertia drops below threshold.
    for k in range(2, k_max + 1):
        if (wcss_percent_decrease <= threshold):
            break
        prior_centers = kmeans.cluster_centers_
        kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
        wcss_percent_decrease = 100 * ((inertia - kmeans.inertia_) / inertia)
        inertia = kmeans.inertia_
        # Fix: removed the dead `k += 1` -- the for statement rebinds k on
        # every iteration, so the manual increment had no effect.
    return prior_centers
In [ ]:
### Solution - Exercise 7
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
# Elbow-method KMeans: grow k until the WCSS improvement drops to/below
# `threshold` percent, returning the centers of the last accepted k.
from sklearn.cluster import KMeans
import math
### Filter mushroom_severity_df
_df = mushroom_severity_df.copy(deep=True)
df = _df[_df['species'] == species][['longitude','latitude']]
k = 1
k_max = 20
### Do initial calculations for k = 1
### You will do similar calculations in the loop.
### You will need to adjust the prior_centers and inertia on each iteration.
prior_centers = None # replace with centers for the prior run
kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
inertia = kmeans.inertia_
wcss_percent_decrease = math.inf # replace with your calculation
# Store clusters
prior_centers = kmeans.cluster_centers_
# loop will terminate when one of the below is true:
# - k reaches k_max
# - percentage decrease in inertia drops below threshold
# NOTE(review): range(2, k_max) stops at k = 19; the "+ 1" is commented
# out, so unlike the other solutions k_max itself is never tried -- confirm
# which bound the exercise expects.
for k in range(2, k_max):# + 1):
# Fit new model
kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
new_inertia = kmeans.inertia_
new_centers = kmeans.cluster_centers_
# Calculate percentage decrease: 100 * (prior - new) / prior
wcss_percent_decrease = 100.0 * (inertia - new_inertia) / inertia
if (wcss_percent_decrease <= threshold):
break
else:
# Improvement was good, update prior values and continue
prior_centers = new_centers
inertia = new_inertia
return prior_centers
In [ ]:
### Solution - Exercise 7
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
from sklearn.cluster import KMeans
import math
### Filter mushroom_severity_df
_df = mushroom_severity_df.copy()
df = _df[_df['species'] == species][['longitude','latitude']]
k = 1
k_max = 20
### Do initial calculations for k = 1
### You will do similar calculations in the loop.
### You will need to adjust the prior_centers and inertia on each iteration.
prior_centers = None # replace with centers for the prior run
kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
prior_inertia = kmeans.inertia_
wcss_percent_decrease = math.inf # replace with your calculation
# loop will terminate when one of the below is true:
# - k reaches k_max
# - percentage decrease in inertia drops below threshold
for k in range(2, k_max + 1):
if (wcss_percent_decrease <= threshold):
break
###
prior_centers = kmeans.cluster_centers_
kmeans = KMeans(n_clusters=k, random_state=seed_value).fit(df)
new_inertia = kmeans.inertia_
wcss_percent_decrease = 100 * (prior_inertia - new_inertia)/prior_inertia
prior_inertia = new_inertia
###
return prior_centers
pass
In [ ]:
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
# While-loop variant of the elbow search; pct_decrease starts at 1000 so
# the first k = 2 iteration always runs.
# NOTE(review): prior_centers is first assigned inside the loop, so a
# threshold >= 1000 would break before it exists and the return would
# raise UnboundLocalError -- confirm thresholds are always small.
from sklearn.cluster import KMeans
import math
### Filter mushroom_severity_df
_df = mushroom_severity_df.copy()
df = _df[_df['species'] == species][['longitude','latitude']]
k = 1
k_max = 20
### Do initial calculations for k = 1
### You will do similar calculations in the loop.
### You will need to adjust the prior_centers and inertia on each iteration.
###
### YOUR CODE HERE
###
pct_decrease=1000
kmeans=KMeans(n_clusters=k,random_state=seed_value).fit(df)
inertia=kmeans.inertia_
k+=1
while k<=k_max:
if pct_decrease<=threshold:
break
prior_centers=kmeans.cluster_centers_
kmeans=KMeans(n_clusters=k,random_state=seed_value).fit(df)
pct_decrease=100.0*(inertia-kmeans.inertia_)/inertia
inertia=kmeans.inertia_
k+=1
return prior_centers
In [ ]:
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
# While-loop elbow search with a large sentinel prior_inertia so the first
# iteration's percentage decrease is effectively infinite.
# NOTE(review): the print() diagnostics inside the break branch look like
# leftover debugging -- consider removing before reuse.
from sklearn.cluster import KMeans
import math
### Filter mushroom_severity_df
_df = mushroom_severity_df.copy()
df = _df[_df['species'] == species][['longitude','latitude']]
##print(df)
##print(threshold)
k = 1
k_max = 20
### Do initial calculations for k = 1
### You will do similar calculations in the loop.
### You will need to adjust the prior_centers and inertia on each iteration.
###
### YOUR CODE HERE
##LOOP THROUGH k
prior_centers = [0]
prior_precentage_decrease = None
prior_inertia = 100000000000
while k <= k_max:
##FIT MODEL
##print(f"\nk = {k}")
##print(f"threshold = {threshold}")
model = KMeans(n_clusters = k, random_state = seed_value )
model = model.fit(df)
##GET MODEL VALUES
new_inertia = model.inertia_
new_cluster_centers = model.cluster_centers_
##print("\nmodel parameters")
##print(new_inertia)
##print(new_cluster_centers)
##CALCULATE VALUES
new_percentage_decrease = 100*((prior_inertia - new_inertia)/prior_inertia)
##print(f"new_percentage_decrease = {new_percentage_decrease}")
# prior_precentage_decrease is None only on the k = 1 iteration, which
# must never trigger the break.
if (prior_precentage_decrease is not None) and (new_percentage_decrease < threshold):
##if (prior_precentage_decrease is not None) and (prior_precentage_decrease < threshold):
print(f"k = {k-1}")
print(f"new_percentage_decrease = {new_percentage_decrease}")
print(f"threshold = {threshold}")
print("\n\n ================ \n\n")
break
##UPDATE PRIORS
prior_centers = new_cluster_centers
prior_precentage_decrease = new_percentage_decrease
prior_inertia = new_inertia
##INCREASE k
k = k + 1
return(prior_centers)
In [ ]:
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
# Elbow search: `centers` always holds the previously accepted model's
# centers, which is what gets returned when the loop stops.
# NOTE(review): prior_centers (assigned below) is never read afterwards --
# looks like leftover scaffolding.
from sklearn.cluster import KMeans
import math
### Filter mushroom_severity_df
_df = mushroom_severity_df.copy()
df = _df[_df['species'] == species][['longitude','latitude']]
k = 1
k_max = 20
### Do initial calculations for k = 1
kmeans = KMeans(n_clusters=k, random_state=seed_value)
kmeans.fit(df)
prior_centers = kmeans.cluster_centers_
prior_in = kmeans.inertia_
pct_dec = math.inf
### You will do similar calculations in the loop.
### You will need to adjust the prior_centers and inertia on each iteration.
while k < k_max and pct_dec > threshold:
centers = kmeans.cluster_centers_
prior_in = kmeans.inertia_
k += 1
kmeans = KMeans(n_clusters=k, random_state=seed_value)
kmeans.fit(df)
new_in = kmeans.inertia_
pct_dec = 100 * (prior_in - new_in) / prior_in
return centers
In [ ]:
def find_population_centers(mushroom_severity_df: pd.DataFrame, species: str, seed_value: int, threshold: int) -> np.ndarray:
# Elbow search variant that pins n_init=10 explicitly (the historical
# sklearn default) so results do not shift across sklearn versions.
from sklearn.cluster import KMeans
import math
### Filter mushroom_severity_df
_df = mushroom_severity_df.copy()
df = _df[_df['species'] == species][['longitude','latitude']]
k = 1
k_max = 20
### Do initial calculations for k = 1
### You will do similar calculations in the loop.
### You will need to adjust the prior_centers and inertia on each iteration.
model = KMeans(n_clusters=k,random_state=seed_value,n_init=10).fit(df)
prior_centers = model.cluster_centers_
inertia = model.inertia_
percent_decrease = float('inf')
for k in range(2,k_max+1):
if percent_decrease <threshold:
break
# Keep the previous run's centers/inertia before refitting with larger k.
prior_centers = model.cluster_centers_
prior_inertia = model.inertia_
model = KMeans(n_clusters=k,random_state=seed_value,n_init=10).fit(df)
inertia = model.inertia_
percent_decrease = 100 * (prior_inertia - inertia)/prior_inertia
return prior_centers
In [ ]:
# NOTE(review): scaffold/pseudocode for the elbow loop, not runnable as-is --
# the `...` placeholders and the top-level `return` are intentional gaps for
# students to fill in inside a function.
k = 1
kmeans = KMeans(k, ...).fit(...)
inertia = kmeans.inertia_
centers = kmeans.cluster_centers_
while k < k_max: # Or should this be `k < k_max-1` or `k <= k_max` or something else? When to stop?
k += 1 # Next `k` to try
kmeans = KMeans(k, ...).fit(...)
new_inertia = kmeans.inertia_
percentage_decrease = 100 * (inertia - new_inertia) / inertia
if percentage_decrease < threshold:
break
inertia = new_inertia
centers = kmeans.cluster_centers_
return centers
Ex8¶
In [ ]:
def geodetic_distance(coord:list, obs:list) -> float:
    """Approximate ground distance in km between two (longitude, latitude)
    points, using the mid-latitude series expansion for km-per-degree."""
    import math
    lon_a, lat_a = coord
    lon_b, lat_b = obs
    # The km-per-degree coefficients are evaluated at the mid-latitude (radians).
    mid = math.radians((lat_a + lat_b) / 2)
    km_per_deg_lat = 111.13209 - 0.56605 * math.cos(2 * mid) + 0.00120 * math.cos(4 * mid)
    km_per_deg_lon = 111.41513 * math.cos(mid) - 0.09455 * math.cos(3 * mid) + 0.00012 * math.cos(5 * mid)
    # Euclidean combination of the scaled latitude and longitude offsets.
    return math.hypot(km_per_deg_lat * (lat_b - lat_a), km_per_deg_lon * (lon_b - lon_a))
In [ ]:
### Solution - Exercise 8
def geodetic_distance(coord:list, obs:list) -> float:
    """Return the approximate distance in km between `coord` and `obs`.

    Both arguments are [longitude, latitude] in decimal degrees; the
    computation uses the mid-latitude (K1/K2) series approximation.
    """
    import math
    lon_start, lat_start = coord
    lon_end, lat_end = obs
    # Mid-latitude, converted to radians for the trig terms.
    mid_lat_rad = math.radians((lat_start + lat_end) / 2)
    # km per degree of latitude (K1) and longitude (K2) at the mid-latitude.
    km_lat = 111.13209 - 0.56605 * math.cos(2 * mid_lat_rad) + 0.00120 * math.cos(4 * mid_lat_rad)
    km_lon = 111.41513 * math.cos(mid_lat_rad) - 0.09455 * math.cos(3 * mid_lat_rad) + 0.00012 * math.cos(5 * mid_lat_rad)
    d_lat = lat_end - lat_start
    d_lon = lon_end - lon_start
    return math.sqrt((km_lat * d_lat) ** 2 + (km_lon * d_lon) ** 2)
In [ ]:
### Solution - Exercise 8
from math import radians, sin, cos, acos
def geodetic_distance(coord:list, obs:list) -> float:
    """Mid-latitude geodetic distance (km) between two [lon, lat] pairs."""
    lon_diff = float(coord[0] - obs[0])
    lat_diff = float(coord[1] - obs[1])
    # Coefficients are evaluated at the mid-latitude in radians.
    mid_lat = radians((coord[1] + obs[1]) / 2)
    km_per_deg_lat = 111.13209 - (0.56605 * cos(2 * mid_lat)) + (0.00120 * cos(4 * mid_lat))
    km_per_deg_lon = (111.41513 * cos(mid_lat)) - (0.09455 * cos(3 * mid_lat)) + (0.00012 * cos(5 * mid_lat))
    return np.sqrt((km_per_deg_lat * lat_diff) ** 2 + (km_per_deg_lon * lon_diff) ** 2)
In [ ]:
def geodetic_distance(coord:list, obs:list) -> float:
    """Return the approximate distance in km between two points using the
    mid-latitude (K1/K2) approximation.

    Parameters
    ----------
    coord, obs : list
        [longitude, latitude] in decimal degrees.

    Returns
    -------
    float
        Distance in kilometers.
    """
    # Fix: removed the leftover debug `print(f'k1 {k1} k2 {k2}')`, which
    # wrote to stdout on every call.
    from math import cos, radians
    lon1, lat1 = coord
    lon2, lat2 = obs
    delphi = lat2 - lat1      # latitude difference, degrees
    dellam = lon2 - lon1      # longitude difference, degrees
    # Mid-latitude in radians drives the km-per-degree coefficients.
    phim = radians((lat2 + lat1) / 2)
    k1 = 111.13209 - .56605 * cos(2 * phim) + .00120 * cos(4 * phim)
    k2 = 111.41513 * cos(phim) - .09455 * cos(3 * phim) + .00012 * cos(5 * phim)
    return ((k1 * delphi) ** 2 + (k2 * dellam) ** 2) ** (1 / 2)
In [ ]:
def geodetic_distance(coord:list, obs:list) -> float:
    """Return the approximate distance in km between two [lon, lat] points
    using the mid-latitude (K1/K2) approximation.

    Fixes over the original:
    - BUG: the mid-latitude was passed to cos() in degrees; the series
      coefficients require radians, so every distance was wrong.
    - Removed the debug `print(coord)` / `print(obs)` calls.
    """
    # Mid-latitude converted to radians before any trig.
    mid_latitude = np.radians((coord[1] + obs[1]) / 2)
    # km per degree of latitude (k1) and longitude (k2).
    k1 = 111.13209 - (0.56605 * np.cos(2 * mid_latitude)) + (0.00120 * np.cos(4 * mid_latitude))
    k2 = (111.41513 * np.cos(mid_latitude)) - (0.09455 * np.cos(3 * mid_latitude)) + (0.00012 * np.cos(5 * mid_latitude))
    diff_lat = coord[1] - obs[1]
    diff_long = coord[0] - obs[0]
    distance = ((k1 * diff_lat) ** 2 + (k2 * diff_long) ** 2) ** (1 / 2)
    return distance
In [ ]:
def geodetic_distance(coord:list, obs:list) -> float:
    """Distance in km between two [longitude, latitude] points, via the
    mid-latitude series approximation."""
    import math
    start_lon, start_lat = coord[0], coord[1]
    end_lon, end_lat = obs[0], obs[1]
    # Degree offsets in each axis.
    lat_offset = end_lat - start_lat
    lon_offset = end_lon - start_lon
    # Evaluate the km-per-degree coefficients at the mid-latitude (radians).
    mid = math.radians((start_lat + end_lat) / 2)
    k_lat = 111.13209 - 0.56605 * math.cos(2 * mid) + 0.00120 * math.cos(4 * mid)
    k_lon = 111.41513 * math.cos(mid) - 0.09455 * math.cos(3 * mid) + 0.00012 * math.cos(5 * mid)
    return math.sqrt((k_lat * lat_offset) ** 2 + (k_lon * lon_offset) ** 2)
In [ ]:
def geodetic_distance(coord:list, obs:list) -> float:
    """Vectorizable mid-latitude distance (km) between [lon, lat] points."""
    lon_from, lat_from = coord
    lon_to, lat_to = obs
    lat_delta = lat_to - lat_from
    lon_delta = lon_to - lon_from
    # Mid-latitude in radians for the series coefficients.
    mid_lat = np.radians((lat_from + lat_to) / 2)
    scale_lat = 111.13209 - 0.56605 * np.cos(2 * mid_lat) + 0.00120 * np.cos(4 * mid_lat)
    scale_lon = 111.41513 * np.cos(mid_lat) - 0.09455 * np.cos(3 * mid_lat) + 0.00012 * np.cos(5 * mid_lat)
    return np.sqrt((scale_lat * lat_delta) ** 2 + (scale_lon * lon_delta) ** 2)
In [ ]:
def geodetic_distance(coord:list, obs:list) -> float:
    """Mid-latitude geodetic distance in km between [lon, lat] pairs."""
    from math import radians, cos, sqrt
    # Degree offsets: latitude (phi) and longitude (lambda).
    lat_diff = obs[1] - coord[1]
    lon_diff = obs[0] - coord[0]
    mid_lat = radians((obs[1] + coord[1]) / 2)
    # Series-expansion km-per-degree factors at the mid-latitude.
    k_phi = 111.13209 - 0.56605*cos(2*mid_lat) + 0.00120*cos(4*mid_lat)
    k_lam = 111.41513*cos(mid_lat) - 0.09455*cos(3*mid_lat) + 0.00012*cos(5*mid_lat)
    return sqrt((k_phi * lat_diff) ** 2 + (k_lam * lon_diff) ** 2)
Ex9¶
In [ ]:
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Label every coordinate with the index of its nearest center.

    `distance_function(center, coordinates)` must return the distances
    from one center to all coordinates.
    """
    # One row of distances per center; columns correspond to points.
    per_center = [distance_function(center, coordinates) for center in centers]
    # Nearest center per column (point).
    return np.argmin(per_center, axis=0)
In [ ]:
### Solution - Exercise 9
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Assign each observation to its closest center.

    `distance_function(point, centers)` must return that point's distance
    to every center; argmin over those rows yields the labels.
    """
    per_point = np.apply_along_axis(lambda point: distance_function(point, centers), 1, coordinates)
    return np.argmin(per_point, axis=1)
In [ ]:
### Solution - Exercise 9
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Return, for each coordinate, the index of the nearest center.

    Each row of the intermediate matrix holds one point's distances to
    all centers, as produced by `distance_function(point, centers)`.
    """
    point_to_centers = np.apply_along_axis(
        lambda observation: distance_function(observation, centers),
        axis=1,
        arr=coordinates,
    )
    return np.argmin(point_to_centers, axis=1)
In [ ]:
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Label each coordinate with its nearest center's index.

    `distance_function` is called once per center row, receiving the full
    coordinate array via its `point_2` keyword; this yields a
    (n_centers x n_points) matrix whose column-wise argmin is the label.
    """
    dist_matrix = np.array([distance_function(center, point_2=coordinates) for center in centers])
    return np.argmin(dist_matrix, axis=0)
In [ ]:
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Assign each coordinate the index of its closest center.

    Fixes over the original: removed the debug `print(centers)` /
    `print(coordinates)` calls, and iterate the coordinate rows directly
    instead of indexing with `range(len(...))`.

    Parameters
    ----------
    centers : np.ndarray
        Candidate center points, one per row.
    coordinates : np.ndarray
        Observation points, one per row.
    distance_function : callable
        `distance_function(point, centers)` -> distances from one point
        to every center.

    Returns
    -------
    np.ndarray
        Index of the nearest center for each coordinate.
    """
    distances = [distance_function(point, centers) for point in coordinates]
    labels = np.argmin(distances, axis=1)
    return labels
In [ ]:
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Map each coordinate to the index of its nearest center.

    `distance_function(point, centers)` returns one point's distance to
    every center; stacking those rows and taking the row-wise argmin
    gives the labels.
    """
    rows = []
    for point in coordinates:
        rows.append(distance_function(point, centers))
    return np.argmin(np.asarray(rows), axis=1)
In [ ]:
def assign_labels(centers: np.ndarray, coordinates: np.ndarray, distance_function) -> np.ndarray:
    """Return the index of the closest center for every coordinate row."""
    # (n_points x n_centers) distance matrix, one row per observation.
    dist_matrix = np.array([distance_function(obs, centers) for obs in coordinates])
    return dist_matrix.argmin(axis=1)
Ex10¶
In [ ]:
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Compute the safety score and best edible species near `coordinates`.

    Score = 100 * (1 - p/N) * (1 - s/N) * (1 - d/N), rounded to 2 decimals,
    where N is the count of observations within `radius` and p/s/d sum the
    'poisonous'/'severe'/'dupe' flags among them. The best species is the
    most frequent edible species nearby (alphabetical tie-break).
    """
    nearby = mushroom_severity_df.copy()
    nearby['dist'] = distance_function(coordinates, mushroom_severity_df[['longitude', 'latitude']].values)
    nearby = nearby[nearby['dist'] <= radius]
    n_obs = nearby.shape[0]
    n_pois, n_sev, n_dupe = nearby[['poisonous', 'severe', 'dupe']].sum().values
    safety = round(100 * (1 - n_pois / n_obs) * (1 - n_sev / n_obs) * (1 - n_dupe / n_obs), 2)
    # Rank edible species by frequency desc, then name asc for tie-breaks.
    ranked = (
        nearby[nearby['edible'] == 1]
        .groupby('species')['species']
        .count()
        .reset_index(name='count')
        .sort_values(by=['count', 'species'], ascending=[False, True])
    )
    return safety, ranked['species'].iloc[0]
In [ ]:
### Solution - Exercise 10
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Safety score and most common edible species within `radius` of a point.

    Returns (score, species). Edge cases: (0.00, None) when no observation
    is within range; (score, None) when observations exist but none are
    edible.
    """
    frame = mushroom_severity_df.copy(deep=True)

    # Distance from the query point to every observation, then filter.
    dists = distance_function(np.array(coordinates), frame[['longitude', 'latitude']].values)
    local = frame[dists <= radius].copy()
    n_local = len(local)
    if n_local == 0:
        # Nothing in range at all.
        return 0.00, None

    # Fraction of nearby observations carrying each risk flag.
    pois_frac = local['poisonous'].sum() / n_local
    severe_frac = local['severe'].sum() / n_local
    dupe_frac = local['dupe'].sum() / n_local
    score = 100 * (1 - pois_frac) * (1 - severe_frac) * (1 - dupe_frac)

    edible_local = local[local['edible'] == 1]
    if len(edible_local) == 0:
        # In range, but nothing safe to recommend.
        return round(score, 2), None

    # Most frequent edible species; alphabetical order breaks ties.
    ranking = (
        edible_local.groupby('species')
        .size()
        .reset_index(name='count')
        .sort_values(by=['count', 'species'], ascending=[False, True])
    )
    return round(score, 2), ranking.iloc[0]['species']
In [ ]:
### Solution - Exercise 10
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Safety score and most common edible species within `radius` of a point.

    Score = 100 * (1 - p/N) * (1 - s/N) * (1 - d/N), rounded to 2 decimals,
    over the N observations within `radius`; the species returned is the
    most frequent edible one (alphabetical tie-break).

    Fixes over the original:
    - BUG: `value_counts().reset_index().rename({'index': 'species',
      'species': 'count'})` assumed the pandas 1.x column layout; on
      pandas >= 2.0 `reset_index()` already names the columns
      ('species', 'count'), so the rename produced two 'count' columns
      and broke the sort. Counting via groupby/size is version-stable.
    - Removed the unused `species_counts` variable and dead commented code.
    """
    observations = mushroom_severity_df[['longitude', 'latitude']].to_numpy()
    distances = distance_function(coordinates, observations)
    within_radius_df = mushroom_severity_df[distances <= radius]
    N = len(within_radius_df)

    # Flag fractions among the nearby observations.
    n_pois = ((within_radius_df['poisonous'] == True).sum()) / N
    n_severe = ((within_radius_df['severe'] == True).sum()) / N
    n_dupes = ((within_radius_df['dupe'] == True).sum()) / N
    safety_score = round(100 * (1 - n_pois) * (1 - n_severe) * (1 - n_dupes), 2)

    # Rank edible species by count desc, then name asc (deterministic ties).
    edible = within_radius_df[within_radius_df['edible'] == True]
    ranked = (
        edible.groupby('species')
        .size()
        .reset_index(name='count')
        .sort_values(['count', 'species'], ascending=[False, True])
    )
    best_species = ranked.iloc[0]['species']
    return safety_score, best_species
In [ ]:
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Safety score and most frequent edible species within `radius`.

    Score = 100 * (1 - p/n) * (1 - s/n) * (1 - d/n), rounded to 2 decimals,
    computed over the n observations inside the radius.
    """
    local = mushroom_severity_df.copy()
    local['dist'] = distance_function(coordinates, local[['longitude', 'latitude']].values)
    local = local[local['dist'] <= radius]
    total = local.shape[0]

    def _flag_count(col):
        # Number of nearby observations with the given flag set.
        return local[local[col] == 1].shape[0]

    score = round(
        100.0
        * (1 - _flag_count('poisonous') / total)
        * (1 - _flag_count('severe') / total)
        * (1 - _flag_count('dupe') / total),
        2,
    )
    # Edible species ranked by count desc, then name asc.
    ranked = (
        local[local['edible'] == 1]
        .groupby('species', as_index=False)
        .agg(count=('species', 'count'))
        .sort_values(by=['count', 'species'], ascending=[False, True])
        .reset_index()
    )
    return score, ranked['species'][0]
In [ ]:
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Compute the local safety score and the best edible species.

    Filters observations to those within `radius` of `coordinates`, scores
    safety as 100 * (1 - p/N) * (1 - s/N) * (1 - d/N) (rounded to 2
    decimals), and returns the most frequent edible species nearby with
    alphabetical tie-breaking.
    """
    working = mushroom_severity_df.copy()

    # Distance from the query point to every observation.
    observation_coords = working[['longitude', 'latitude']].to_numpy()
    working["distance_to_obs"] = distance_function(coordinates, observation_coords)
    working = working[working.distance_to_obs <= radius]

    # Safety formula over the nearby observations.
    total = len(working)
    poisonous_total = sum(working.poisonous)
    dupe_total = sum(working.dupe)
    severe_total = sum(working.severe)
    safe_fraction = (1 - (poisonous_total / total)) * (1 - (severe_total / total)) * (1 - (dupe_total / total))
    safety_score = round(100 * safe_fraction, 2)

    # Best edible species: count per species, sort count desc / name asc.
    edible_only = working[working.edible == 1]
    tallies = edible_only.groupby('species')['species'].count().reset_index(name="count")
    tallies = tallies.sort_values(by=['count', 'species'], ascending=[False, True])
    best_species = tallies.iloc[0]['species']
    return (safety_score, best_species)
In [ ]:
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Safety score and top edible species among observations within `radius`.

    Score = 100 * (1 - p/N) * (1 - s/N) * (1 - d/N), rounded to 2 decimals.
    Ties on species frequency resolve alphabetically.
    """
    data = mushroom_severity_df.copy()
    data['distance'] = distance_function(np.array(coordinates), np.array(data[['longitude', 'latitude']]))
    close = data[data['distance'] <= radius]

    count = close.shape[0]
    safe_fraction = (
        (1 - close['poisonous'].sum() / count)
        * (1 - close['severe'].sum() / count)
        * (1 - close['dupe'].sum() / count)
    )
    score = round(safe_fraction * 100, 2)

    # Most frequent edible species, ties broken alphabetically.
    candidates = (
        close[close['edible'] == 1]
        .groupby('species')
        .size()
        .reset_index(name='count')
        .sort_values(by=['count', 'species'], ascending=[False, True])
    )
    return score, candidates.iloc[0]['species']
In [ ]:
def find_safety_score(mushroom_severity_df: pd.DataFrame, coordinates: tuple, radius: float, distance_function):
    """Safety score and most common edible species within `radius`.

    Score = 100 * (1 - p/n) * (1 - s/n) * (1 - d/n), rounded to 2 decimals.
    When several edible species share the top count, the alphabetically
    first one is returned.
    """
    query_point = np.array(coordinates)
    dists = distance_function(query_point, mushroom_severity_df[['longitude', 'latitude']].values)
    close = mushroom_severity_df[dists <= radius]
    n = len(close)

    score = (
        (1 - close['poisonous'].sum() / n)
        * (1 - close['severe'].sum() / n)
        * (1 - close['dupe'].sum() / n)
        * 100
    )
    score = round(score, 2)

    # Frequency of each edible species; keep the alphabetically-first winner.
    tallies = close[close['edible'] == 1]['species'].value_counts()
    winners = tallies[tallies == tallies.max()].index
    return score, min(winners)