Version history
All of the header information is important. Please read it.
Topics, number of exercises: This problem builds on your knowledge of basic Python data structures, nested data structures, and string manipulation. It has 10 exercises, numbered 0 to 9. There are 18 available points. However, to earn 100% the threshold is 13 points. (Therefore, once you hit 13 points, you can stop. There is no extra credit for exceeding this threshold.)
Exercise ordering: Each exercise builds logically on previous exercises, but you may solve them in any order. That is, if you can't solve an exercise, you can still move on and try the next one. Use this design to your advantage, as the exercises are not necessarily ordered in terms of difficulty. Higher point values generally indicate more difficult exercises.
Demo cells: Code cells starting with the comment ### define demo inputs
load results from prior exercises applied to the entire data set and use those to build demo inputs. These cells must be run for subsequent demos to work properly, but they do not affect the test cells. The data loaded in these cells may be rather large (at least in terms of human readability). You are free to print or otherwise use Python to explore them, but we did not print them in the starter code.
Debugging you code: Right before each exercise test cell, there is a block of text explaining the variables available to you for debugging. You may use these to test your code and can print/display them as needed (careful when printing large objects, you may want to print the head or chunks of rows at a time).
Exercise point breakdown:
Final reminders:
In this notebook, you will analyze social media data from Twitter and FourSquare. Your ultimate goal will be to help people find others with common interests.
The (anonymized) data consists of the following:
We will analyze this data and then, by the very last exercise, create a function that can, for a given person, recommend other people they might be compatible with based on their affinity for the same places.
Start by running the next cell, which will set up some of the code and data you'll need later.
### Global Imports
### BEGIN HIDDEN TESTS
if False: # set to True to set up
import dill
import hashlib
def hash_check(f1, f2, verbose=True):
with open(f1, 'rb') as f:
h1 = hashlib.md5(f.read()).hexdigest()
with open(f2, 'rb') as f:
h2 = hashlib.md5(f.read()).hexdigest()
if verbose:
print(h1)
print(h2)
assert h1 == h2, f'The file "{f1}" has been modified'
with open('resource/asnlib/public/hash_check.pkl', 'wb') as f:
dill.dump(hash_check, f)
del hash_check
with open('resource/asnlib/public/hash_check.pkl', 'rb') as f:
hash_check = dill.load(f)
for fname in ['testers.py', '__init__.py', 'test_utils.py']:
hash_check(f'tester_fw/{fname}', f'resource/asnlib/public/{fname}')
del hash_check
### END HIDDEN TESTS
# Some functionality needed by the notebook and demo cells:
from pprint import pprint, pformat
import math
def status_msg(s, time=None):
from datetime import datetime
if time is None:
time = datetime.now()
print(f"[{time}] {s}")
def load_json(filename):
from json import load
with open(filename, "rt") as fp:
return load(fp)
return None
def save_json(filename, obj):
from json import dump
with open(filename, "w") as fp:
dump(obj, fp, indent=2)
def load_pickle(filename):
from pickle import load
with open(filename, "rb") as fp:
return load(fp)
return None
def save_pickle(filename, obj):
from pickle import dump
with open(filename, "wb") as fp:
dump(obj, fp)
def choose_ext(s, ext_map):
for ext in ext_map:
if s[-(len(ext)+1):] == f".{ext}":
return ext_map[ext]
return None
def load_database(basename, tag=None, sample=None, pathname="resource/asnlib/publicdata/"):
filename = pathname + basename
status_msg(f"Loading {tag+' ' if tag is not None else ''}[{filename}] ...")
loader = choose_ext(basename, {'pickle': load_pickle, 'json': load_json})
assert loader is not None, "*** Unrecognized file extension. ***"
database = loader(filename)
status_msg("... done!")
print(f"\nThis data has {len(database):,} entries.")
if sample is not None and isinstance(database, dict):
print("\nHere is a sample:\n")
pprint({key: database[key] for key in sample})
return database
def save_database(database, basename, tag=None, sample=None, pathname="resource/asnlib/publicdata/"):
if sample is not None and isinstance(database, dict):
database = {key: database[key] for key in sample}
print(f"This data has {len(database):,} entries.")
filename = pathname + basename
status_msg(f"\nSaving {tag+' ' if tag is not None else ''}[{filename}] ...")
saver = choose_ext(basename, {'pickle': save_pickle, 'json': save_json})
assert saver is not None, "*** Unrecognized file extension. ***"
saver(filename, database)
status_msg("... done!")
def sample_dict(d, k=1):
"""Extract a sample of at most `k` key-value pairs from the dictionary `d`."""
assert k >= 0, f"*** The number of samples must be nonnegative (k={k}). ***"
from random import sample
keys = sample(d.keys(), min(k, len(list(d.keys()))))
return {k: d[k] for k in keys}
def sample_safely(x, k):
"""Returns a set of at most `k` uniform-random samples from `x`."""
from random import sample
return set(sample(x, min(k, len(x))))
def subset_dict(d, ks):
"""Returns a subset of the dictionary `d` for the keys `ks`."""
return {k: v for k, v in d.items() if k in ks}
def enum_map(x):
map_dict = {e: k for k, e in enumerate(x)}
return lambda i: map_dict[i]
def remap_dict(d, map_key=lambda k: k, map_val=lambda k: k):
"""Relabel the key-value pairs of a dictionary."""
assert len(set(map_key(k) for k in d.keys())) == len(d.keys()), '*** `map_key` is not one to one ***'
# assert len(set(map_val(k) for k in d.values())) == len(d.values()), '*** `map_val` is not one to one ***'
return {map_key(k): map_val(v) for k, v in d.items()}
def remap_set(s, map_fun=lambda e: e):
"""Relabel the elements of a set."""
s_new = {map_fun(e) for e in s}
assert len(s_new) == len(s), "*** `map_fun` is not one to one ***"
return s_new
The first part of the dataset consists of points-of-interest or POIs. A POI is a place that a person can visit.
Run this code cell to load the dataset of POIs.
pois = load_database('pois.pickle',
tag="points-of-interest (POIs)",
sample=['4f4f5309771648ea92a4bc21', '4d52b314d7eaa1434b90810f', '4d24cfc22ac6f04d13473845'])
There are over half-a-million POIs. Each one is stored as a key-value pair where
In the sample above, there are three POIs, each one with four attributes.
'country_code'
: A two-letter country code indicating in which country the POI is located. In this example, one is in Brazil ('BR'
) and two in Turkey ('TR'
).'lat'
and 'long'
: The latitude and longitude coordinates of the POI.'type'
: The type of POI. We see three types in this example: a 'University'
, a 'Automotive Shop'
, and a 'Turkish Restaurant'
.find_food_and_drink_types
¶To practice inspecting the POIs, suppose we want to find types of POIs that are associated with restaurants, bars, and the like. Implement the function,
def find_food_and_drink_types(pois):
...
to accomplish this task.
Inputs: A collection of POIs as shown above, given as a Python dictionary of dictionaries.
Task: For each POI type, which is a string, split the string into "words" using whitespace as a delimiter.
Compare each word to the target list, below. Consider the word a "match" if it is exactly the same except for case. (So 'Food'
and 'FOOD'
match 'food'
, but 'foods'
and 'food'
do not match.)
The target words are:
'bar', 'bars', 'beer', 'brewery', 'cafe', 'cafes', 'coffee',
'cocktail', 'cocktails', 'drink', 'drinks', 'food',
'restaurant', 'restaurants', 'sake', 'tea', 'wine',
'whisky', 'whiskey'
Outputs: Return a Python set containing types whose words matched the target list. The case of the returned types should match the originals exactly, including case. (See the demo.)
# Demo input:
demo_pois_ex0 = {'4c0e251bb1b676b0f788e186': {
'country_code': 'AT',
'lat': 48.257386,
'long': 16.400122,
'type': 'Fast Food Restaurant'},
'4c247550b012b713167d0893': {
'country_code': 'HR',
'lat': 45.347616,
'long': 14.300922,
'type': 'Pool'},
'4fa3c7ade4b0f90206220bbf': {
'country_code': 'KR',
'lat': 37.545866,
'long': 127.122266,
'type': 'Bike Shop'}}
outline = '''
Goal/Output:
Return types whose words matched (case insensitive) the target list. Return originals
- data_type: set
Input:
pois
- data_type: dict of dicts
- {poi_id_str {str: object}}
Strategy:
1. initialize return set matched_types
2. loop through keys in pois so we can access the inner dict values
- we don't have to loop through key-value pairs (dict.items()) because we always want the value of 'type'
3. convert pois[i]['type'] to lower case via str.lower()
4. convert pois[i]['type'] to a list of words via str.split()
- only want to do this after I convert to lower case wih .lower() bc .lower() doesn't work on lists
5. check if intersection exists:
- if true: add pois[i]['type'] to return set
- if false: nothing
6. return matched_types
Solution:
'''
### Exercise 0 solution
def find_food_and_drink_types(pois):
### BEGIN SOLUTION
target_list = ['bar', 'bars', 'beer', 'brewery', 'cafe', 'cafes', 'coffee', 'cocktail', 'cocktails',
'drink', 'drinks', 'food', 'restaurant', 'restaurants', 'sake', 'tea', 'wine',
'whisky', 'whiskey']
matched_types = set()
for key in pois.keys():
lower_case_type = pois[key]['type'].lower()
type_words = lower_case_type.split()
if set(target_list).intersection(type_words):
matched_types.add(pois[key]['type'])
return matched_types
### END SOLUTION
### demo function call
find_food_and_drink_types(demo_pois_ex0)
The cell below will test your solution for Exercise 0. The testing variables will be available for debugging under the following names in a dictionary format.
input_vars
- Input variables for your solution. original_input_vars
- Copy of input variables from prior to running your solution. These should be the same as input_vars
- otherwise the inputs were modified by your solution.returned_output_vars
- Outputs returned by your solution.true_output_vars
- The expected output. This should "match" returned_output_vars
based on the question requirements - otherwise, your solution is not returning the correct output. ### test_cell_ex0
### BEGIN HIDDEN TESTS
import dill
import hashlib
with open('resource/asnlib/public/hash_check.pkl', 'rb') as f:
hash_check = dill.load(f)
for fname in ['testers.py', '__init__.py', 'test_utils.py']:
hash_check(f'tester_fw/{fname}', f'resource/asnlib/public/{fname}')
del hash_check
del dill
del hashlib
### END HIDDEN TESTS
from tester_fw.testers import Tester
conf = {
'case_file':'tc_0',
'func': find_food_and_drink_types, # replace this with the function defined above
'inputs':{ # input config dict. keys are parameter names
'pois':{
'dtype':'dict', # data type of param.
'check_modified':True,
}
},
'outputs':{
'output_0':{
'index':0,
'dtype':'set',
'check_dtype': True,
'check_col_dtypes': True, # Ignored if dtype is not df
'check_col_order': True, # Ignored if dtype is not df
'check_row_order': True, # Ignored if dtype is not df
'check_column_type': True, # Ignored if dtype is not df
'float_tolerance': 10 ** (-6)
}
}
}
tester = Tester(conf, key=b'1ZePXzAcTR7lcNpmx1HRK0lT3v-Ikrg8mZ3n-wVFTBo=', path='resource/asnlib/publicdata/')
for _ in range(70):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### BEGIN HIDDEN TESTS
tester = Tester(conf, key=b'VsXYx28C2-hiT5E79LTUsrmNZRvKsBiAU_NaL4lPedI=', path='resource/asnlib/publicdata/encrypted/')
for _ in range(20):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### END HIDDEN TESTS
print('Passed! Please submit.')
RUN ME: Food and drink types: If you had a correct solution, you could use it to determine the food and drink types. We have precomputed this list for you, and here are the results. Run this cell whether or not you completed Exercise 0.
food_and_drink_types = load_database('food_and_drink_types.pickle')
print(f"\nThere are {len(food_and_drink_types):,} types that match the food and drink keywords. They are:")
print(food_and_drink_types)
A second component of the dataset is a collection of city records. Run the following cell to load it.
cities = load_database('cities.pickle',
tag="cities",
sample = ["Beijing, China", "Hartford, United States", "Stockholm, Sweden"])
Each city record is a key-value pair. The key is the city name, and the value is a dictionary of city attributes. The attributes are a two-letter country code, the latitude and longitude coordinates of the city center, and the type of city.
In this example, there are three cities: 'Beijing, China'
, 'Hartford, United States'
, and 'Stockholm, Sweden'
.
Important! Both the POIs and the cities are dictionaries of dictionaries, where in both cases the outer dictionary's key-value pairs define a record, and the inner dictionaries have the keys, 'country_code'
, 'lat'
, 'long'
, and 'type'
.
get_common_ccs
¶Suppose we wish to determine which countries exist in both the POIs data and the cities data. Implement the function,
def get_common_ccs(pois, cities):
...
to complete this task.
Inputs:
pois
: POIs, stored in a Python dictionary of dictionaries, mapping POI IDs keys to attribute values (like earlier examples)cities
: City records, stored in a Python dictionary of dictionaries, mapping city names to attribute values (like earlier examples)You may assume that the inner dictionaries have the same four keys: 'country_code'
, 'lat'
, 'long'
, and 'type'
.
Task: Identify all country codes that exist in pois
and in cities
.
Outputs: Return a new Python set consisting of the common country codes.
### DEMO INPUTS ###
demo_pois_ex1 = \
{'abc': {'country_code': 'US', 'lat': -5.2, 'long': 2.0, 'type': 'Restaurant'},
'def': {'country_code': 'US', 'lat': -2.3, 'long': 6.8, 'type': 'Golf Course'},
'ghi': {'country_code': 'PL', 'lat': 1.0, 'long': 6.5, 'type': 'Apartment'},
'jkl': {'country_code': 'VN', 'lat': 11.0, 'long': 1.5, 'type': 'Post Office'}}
demo_cities_ex1 = \
{'Seattle, United States': {'country_code': 'US', 'lat': -5.0, 'long': 3.0, 'type': 'Other'},
'Richmond, United States': {'country_code': 'US', 'lat': -2.0, 'long': 7.0, 'type': 'Other'},
'Hanoi, Vietnam': {'country_code': 'VN', 'lat': 10.0, 'long': 2.0, 'type': 'National and provincial capital'},
'Curitiba, Brazil': {'country_code': 'BR', 'lat': 0.0, 'long': 6.0, 'type': 'Provincial capital'}}
When a correct implementation runs on the demo inputs, it should return
{'US', 'VN'}
The POI 'PL'
does not appear in the output since it exists in pois
but not in cities
.
outline = '''
Goal/Output:
Return all country codes that exist in pois and in cities
- data_type: set
Input:
pois
- data_type: dict of dicts
- {poi_id_str {str: object}}
cities
- data_type: dict of dicts
- {city_name_str {str: object}} + inner keys will always have 'country_code', 'lat', 'long', and 'type'
Strategy:
0. Compare matching country_code in:
- pois[i]['country_code']
- cities[i]['country_code']
1. initialize return set matched_country_codes
2. loop through keys in pois so we can access the inner dict values
3. loop through keys in cities
4. check pois[i]['country_code'] == cities[j]['country_code']:
- if true: add pois[i]['country_code'] to return set
- if false: nothing
5. return matched_country_codes
Solution:
'''
### Exercise 1 solution
def get_common_ccs(pois, cities):
### BEGIN SOLUTION
matched_country_codes = set()
for key_pois in pois.keys():
for key_cities in cities.keys():
if pois[key_pois]['country_code'] == cities[key_cities]['country_code']:
matched_country_codes.add(pois[key_pois]['country_code'])
return matched_country_codes
### END SOLUTION
### demo function call
get_common_ccs(demo_pois_ex1, demo_cities_ex1)
The cell below will test your solution for Exercise 1. The testing variables will be available for debugging under the following names in a dictionary format.
input_vars
- Input variables for your solution. original_input_vars
- Copy of input variables from prior to running your solution. These should be the same as input_vars
- otherwise the inputs were modified by your solution.returned_output_vars
- Outputs returned by your solution.true_output_vars
- The expected output. This should "match" returned_output_vars
based on the question requirements - otherwise, your solution is not returning the correct output. ### test_cell_ex1
### BEGIN HIDDEN TESTS
import dill
import hashlib
with open('resource/asnlib/public/hash_check.pkl', 'rb') as f:
hash_check = dill.load(f)
for fname in ['testers.py', '__init__.py', 'test_utils.py']:
hash_check(f'tester_fw/{fname}', f'resource/asnlib/public/{fname}')
del hash_check
del dill
del hashlib
### END HIDDEN TESTS
from tester_fw.testers import Tester
conf = {
'case_file':'tc_1',
'func': get_common_ccs, # replace this with the function defined above
'inputs':{ # input config dict. keys are parameter names
'pois':{
'dtype':'dict', # data type of param.
'check_modified':True,
},
'cities':{
'dtype':'dict',
'check_modified':True,
},
},
'outputs':{
'output_0':{
'index':0,
'dtype':'set',
'check_dtype': True,
'check_col_dtypes': True, # Ignored if dtype is not df
'check_col_order': True, # Ignored if dtype is not df
'check_row_order': True, # Ignored if dtype is not df
'check_column_type': True, # Ignored if dtype is not df
'float_tolerance': 10 ** (-6)
}
}
}
tester = Tester(conf, key=b'1ZePXzAcTR7lcNpmx1HRK0lT3v-Ikrg8mZ3n-wVFTBo=', path='resource/asnlib/publicdata/')
for _ in range(70):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### BEGIN HIDDEN TESTS
tester = Tester(conf, key=b'VsXYx28C2-hiT5E79LTUsrmNZRvKsBiAU_NaL4lPedI=', path='resource/asnlib/publicdata/encrypted/')
for _ in range(20):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### END HIDDEN TESTS
print('Passed! Please submit.')
proximity_join
¶POI records only have lat-long coordinates. To make their locations more user-friendly, let's associate each POI with its closest city in the same country.
Implement the function,
def proxmity_join(pois, cities):
...
to help accomplish this task, following the requirements below.
Inputs:
pois
: POIs, stored in a Python dictionary of dictionaries, mapping POI IDs keys to attribute values (like Exercise 1)cities
: City records, stored in a Python dictionary of dictionaries, mapping city names to attribute values (like Exercise 1)You may assume that the inner dictionaries of both have the same four keys: 'country_code'
, 'lat'
, 'long'
, and 'type'
.
Task: For each POI record, determine which city is closest to it. The closest city must be in the same country (the 'country_code'
values must match).
To measure distance, use the squared Euclidean distance. That is, if a POI record has the lat-long coordinates $(h_p, v_p)$, and a city record has the lat-long coordinates $(h_c, v_c)$, then their squared Euclidean distance is
$$(h_p - h_c)^2 + (v_p - v_c)^2.$$Outputs: Your function should return a new Python dictionary of dictionaries. The outer dictionary's keys should be the same POI IDs. The inner dictionary should have exactly two keys: 'type'
, which is the POI type (not the city type), and 'city'
, which is the name of the closest city. See the demo below.
Notes and hints:
math.inf
, and it may be a helpful initializer as you search for the closest city to a given POI.dd_distance_squared
, that will compute the squared Euclidean distance given the attributes of a POI and the attributes of a city.### DEMO INPUTS ###
demo_pois_ex2 = \
{'abc': {'country_code': 'US', 'lat': -5.2, 'long': 2.0, 'type': 'Restaurant'},
'def': {'country_code': 'US', 'lat': -2.3, 'long': 6.8, 'type': 'Golf Course'},
'ghi': {'country_code': 'PL', 'lat': 1.0, 'long': 6.5, 'type': 'Apartment'},
'jkl': {'country_code': 'VN', 'lat': 11.0, 'long': 1.5, 'type': 'Post Office'}}
demo_cities_ex2 = \
{'Seattle, United States': {'country_code': 'US', 'lat': -5.0, 'long': 3.0, 'type': 'Other'},
'Richmond, United States': {'country_code': 'US', 'lat': -2.0, 'long': 7.0, 'type': 'Other'},
'Hanoi, Vietnam': {'country_code': 'VN', 'lat': 10.0, 'long': 2.0, 'type': 'National and provincial capital'},
'Curitiba, Brazil': {'country_code': 'BR', 'lat': 0.0, 'long': 6.0, 'type': 'Provincial capital'}}
Observe that the POI 'ghi'
does not appear in the output. That's because its country code, 'PL'
, does not match any country code of any city.
outline = '''
Goal/Output:
Return to each POI with its closest city in the same country
- date_type: dict of dicts
- {pois_id_str: {'type': pois[i]['type'], 'city': closest_city_str}}
Input:
pois
- data_type: dict of dicts
- {poi_id_str {str: object}}
cities
- data_type: dict of dicts
- {city_name_str {str: object}}
Strategy:
0. initialize return dict as closest_city
1. initialize intermediate dict current_closest {pois_id: euclid dist}
2. loop through keys in pois
3. loop through keys in cities
4. for each pois[i], check if pois[i]['country_code'] = cities[j]['country_code']
5. if true, calculate euclid distance via helper function
6. check if pois_key in closest_city:
7. if no: initialize pois type, city key, current_closest[euclid_dist]
8. if yes: check if current city is closer via euclid distance
9. if yes, overwrite closest_city values and current_closest[euclid_dist]
10. return closest_city
Solution:
'''
### Exercise 2 solution
def proximity_join(pois, cities):
### BEGIN SOLUTION
closest_city = {}
current_closest = {}
for pois_key in pois.keys():
pois_attr = pois[pois_key]
for cities_key in cities.keys():
if pois[pois_key]['country_code'] == cities[cities_key]['country_code']:
city_attr = cities[cities_key]
euclid_dist = dd_distance_squared(pois_attr, city_attr)
if pois_key not in closest_city:
closest_city[pois_key] = {}
closest_city[pois_key]['type'] = pois[pois_key]['type']
closest_city[pois_key]['city'] = cities_key
current_closest[pois_key] = euclid_dist
else:
if euclid_dist < current_closest[pois_key]:
closest_city[pois_key]['type'] = pois[pois_key]['type']
closest_city[pois_key]['city'] = cities_key
current_closest[pois_key] = euclid_dist
return closest_city
### END SOLUTION
# Handy function to calculate the distance between one POI and one city:
def dd_distance_squared(poi_attr, city_attr):
p_lat, p_long = poi_attr['lat'], poi_attr['long']
c_lat, c_long = city_attr['lat'], city_attr['long']
return (p_lat - c_lat)**2 + (p_long - c_long)**2
### demo function call
proximity_join(demo_pois_ex2, demo_cities_ex2)
The cell below will test your solution for Exercise 2. The testing variables will be available for debugging under the following names in a dictionary format.
input_vars
- Input variables for your solution. original_input_vars
- Copy of input variables from prior to running your solution. These should be the same as input_vars
- otherwise the inputs were modified by your solution.returned_output_vars
- Outputs returned by your solution.true_output_vars
- The expected output. This should "match" returned_output_vars
based on the question requirements - otherwise, your solution is not returning the correct output. ### test_cell_ex2
### BEGIN HIDDEN TESTS
import dill
import hashlib
with open('resource/asnlib/public/hash_check.pkl', 'rb') as f:
hash_check = dill.load(f)
for fname in ['testers.py', '__init__.py', 'test_utils.py']:
hash_check(f'tester_fw/{fname}', f'resource/asnlib/public/{fname}')
del hash_check
del dill
del hashlib
### END HIDDEN TESTS
from tester_fw.testers import Tester
conf = {
'case_file':'tc_2',
'func': proximity_join, # replace this with the function defined above
'inputs':{ # input config dict. keys are parameter names
'pois':{
'dtype':'dict', # data type of param.
'check_modified':True,
},
'cities':{
'dtype':'dict',
'check_modified':True,
},
},
'outputs':{
'output_0':{
'index':0,
'dtype':'dict',
'check_dtype': True,
'check_col_dtypes': True, # Ignored if dtype is not df
'check_col_order': True, # Ignored if dtype is not df
'check_row_order': True, # Ignored if dtype is not df
'check_column_type': True, # Ignored if dtype is not df
'float_tolerance': 10 ** (-6)
}
}
}
tester = Tester(conf, key=b'1ZePXzAcTR7lcNpmx1HRK0lT3v-Ikrg8mZ3n-wVFTBo=', path='resource/asnlib/publicdata/')
for _ in range(70):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### BEGIN HIDDEN TESTS
tester = Tester(conf, key=b'VsXYx28C2-hiT5E79LTUsrmNZRvKsBiAU_NaL4lPedI=', path='resource/asnlib/publicdata/encrypted/')
for _ in range(20):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### END HIDDEN TESTS
print('Passed! Please submit.')
The next part of the dataset consists of check-in records. Run the cell below to load this data.
checkins = load_database('checkins.pickle',
tag="check-in records",
sample=[119_352, 677_020, 728_580])
Each check-in record (or just check-in) is a key-value pair, where
Each visit is itself a Python dictionary with two key-value pairs:
'poi'
: The POI that the user visited, represented by its string ID.'date'
: The date of that visit, stored as a string.In the preceding example, there are three users, 119352
, 677020
, and 728580
. They each visited four, seven, and one POI, respectively. User 728580
visited the one POI on Sunday, May 13, 2012. A user can visit the same POI on different dates; for example, user 119352
made two visits to POI 4ab4edf1f964a520ad7120e3
, once on April 16 and again on April 23.
count_visits_by_country
¶Suppose we wish to determine in which country the most visits have been recorded. Complete the function,
def count_visits_by_country(checkins, pois):
...
to help accomplish this task.
Inputs:
checkins
: Check-in records, a Python dictionary of lists of dictionaries, as in the preceding example.pois
: Points-of-interest, a Python dictionary of dictionaries, as in earlier exercises.Task: For each country code, count the number of visits that occurred there.
Outputs: Return a Python list whose elements are tuples. Each tuple should be a pair, (cc, n)
, where cc
is a country code (a string) and n
is the total number of visits made in that country (an integer). Sort this list in ascending order by country code.
Notes:
pois
. In such cases, the visit should simply be ignored.### Define demo inputs
demo_checkins_ex3 = \
{ 259270: [ { 'date': 'Fri Apr 20 01:12:02 +0000 2012',
'poi': '4b155088f964a520beb023e3'}, # present in demo_pois_ex3
{ 'date': 'Sat Apr 14 01:25:16 +0000 2012',
'poi': '4b155088f964a520beb023e3'}], # present in demo_pois_ex3
424689: [ { 'date': 'Tue Apr 10 00:49:14 +0000 2012',
'poi': '4b7a6d3bf964a5208b2c2fe3'}], # present in demo_pois_ex3
1402043: [ { 'date': 'Thu May 17 20:42:11 +0000 2012',
'poi': '4bb4d63ff1b976b023661f20'},
{ 'date': 'Thu May 03 17:33:11 +0000 2012',
'poi': '4da9b4f36a2303012efb07c2'}],
1815705: [ { 'date': 'Tue Jun 12 20:59:11 +0000 2012',
'poi': '4f7ecdeae4b0ac821d08d00c'}]}
demo_pois_ex3 = \
{ '4b155088f964a520beb023e3': { 'country_code': 'US',
'lat': 47.591549,
'long': -122.332592,
'type': 'Baseball Stadium'},
'4b7a6d3bf964a5208b2c2fe3': { 'country_code': 'MY',
'lat': 3.170318,
'long': 101.708915,
'type': 'Hospital'},
'4b9a3eaaf964a520fda635e3': { 'country_code': 'US',
'lat': 39.43205,
'long': -84.207813,
'type': 'Train Station'}}
Given the demo data above, a correct solution would produce:
[('MY', 1), ('US', 2)]
That's because of the four users in demo_checkins_ex3
, only the first two (259270
and 424689
) visited POIs that are present in demo_pois_ex3
.
outline = '''
Goal/Output:
Return a sorted list of country_code and # of visits pairs
- date_type: list of tuples
- [(str, int)]
- sorted by country code asc
- # visits must be > 0
Input:
pois
- data_type: dict of dicts
- {poi_id_str {str: object}}
checkins
- data_type: a dict of lists of dicts
- {user_id_int: [ {{'date': str, 'poi': poi_id_str}}]}
Strategy:
0. initialize defaultdict country_visits as {'country_name': int}
1. loop keys in pois
2. count the # of visits via matches in checkins. store this as visit_count
3. if visit_count > 0
4. if true, store country_name of pois_key
5. add visit_count to existing country_visits[country_name]
6. transform country_visits from defaultdict -> list via dict.items()
7. return sorted country_visits
Solution:
'''
### Exercise 3 solution
def count_visits_by_country(checkins, pois):
### BEGIN SOLUTION
from collections import defaultdict
country_visits = defaultdict(int)
for pois_key in pois.keys():
visit_count = str(checkins.values()).count(pois_key)
if visit_count > 0:
country_name = pois[pois_key]['country_code']
country_visits[country_name] += visit_count
country_visits_list = country_visits.items()
return sorted(country_visits_list)
### END SOLUTION
### demo function call
count_visits_by_country(demo_checkins_ex3, demo_pois_ex3)
The cell below will test your solution for Exercise 3. The testing variables will be available for debugging under the following names in a dictionary format.
input_vars
- Input variables for your solution. original_input_vars
- Copy of input variables from prior to running your solution. These should be the same as input_vars
- otherwise the inputs were modified by your solution.returned_output_vars
- Outputs returned by your solution.true_output_vars
- The expected output. This should "match" returned_output_vars
based on the question requirements - otherwise, your solution is not returning the correct output. ### test_cell_ex3
### BEGIN HIDDEN TESTS
import dill
import hashlib
with open('resource/asnlib/public/hash_check.pkl', 'rb') as f:
hash_check = dill.load(f)
for fname in ['testers.py', '__init__.py', 'test_utils.py']:
hash_check(f'tester_fw/{fname}', f'resource/asnlib/public/{fname}')
del hash_check
del dill
del hashlib
### END HIDDEN TESTS
from tester_fw.testers import Tester
conf = {
'case_file':'tc_3',
'func': count_visits_by_country, # replace this with the function defined above
'inputs':{ # input config dict. keys are parameter names
'checkins':{
'dtype':'dict', # data type of param.
'check_modified':True,
},
'pois':{
'dtype':'dict', # data type of param.
'check_modified':True,
}
},
'outputs':{
'output_0':{
'index':0,
'dtype':'list',
'check_dtype': True,
'check_col_dtypes': True, # Ignored if dtype is not df
'check_col_order': True, # Ignored if dtype is not df
'check_row_order': True, # Ignored if dtype is not df
'check_column_type': True, # Ignored if dtype is not df
'float_tolerance': 10 ** (-6)
}
}
}
tester = Tester(conf, key=b'1ZePXzAcTR7lcNpmx1HRK0lT3v-Ikrg8mZ3n-wVFTBo=', path='resource/asnlib/publicdata/')
for _ in range(70):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### BEGIN HIDDEN TESTS
tester = Tester(conf, key=b'VsXYx28C2-hiT5E79LTUsrmNZRvKsBiAU_NaL4lPedI=', path='resource/asnlib/publicdata/encrypted/')
for _ in range(20):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### END HIDDEN TESTS
print('Passed! Please submit.')
RUN ME: Top countries: If you had a correct solution, you could use it to determine, say, the top 10 countries with having the most visit data. We have precomputed this ranking for you and here are the results. Run this cell whether or not you completed Exercise 3.
cc_visit_counts = load_database('cc_visit_counts.pickle')
top_ccs_by_visits = sorted(cc_visit_counts, key=lambda e: e[1], reverse=True)[:10]
print("\nThe top 10 countries by recorded visit counts:")
for k, (cc, nn) in enumerate(top_ccs_by_visits):
print(f"{' ' if k < 9 else ''}{k+1:d}. {cc}: {nn:,} visits")
Many of the data structures so far have the same form: a dictionary of dictionaries. For example, here are some POI records.
{'4f2491a8e4b04f6e695eac41': {
'lat': 32.984154,
'long': -96.754765,
'type': 'Temple',
'country_code': 'US'},
'4d6d12b5d5d937046513cbf9': {
'lat': -29.911862,
'long': -71.256914,
'type': 'Medical Center',
'country_code': 'CL'},
'4f67d81de4b072f14537b19b': {
'lat': 10.077291,
'long': -69.280417,
'type': 'Residential Building (Apartment / Condo)',
'country_code': 'VE'},
'4e88d624d3e39d6f4e7dded3': {
'lat': 49.276867,
'long': -123.111415,
'type': 'Soccer Stadium',
'country_code': 'CA'}}
The outer keys are the POI IDs, and the inner keys are 'lat'
, 'long'
, 'type'
, and 'country_code'
.
Suppose you have a client who only wants to look at the POIs where a specific inner key has a value satisfying an arbitrary condition. For example, the client gives you the function,
def is_positive(x):
return x > 0
which returns True
only when x
is positive. They then ask you to apply this function to all of the 'lat'
values and keep only the POI records where is_positive
on that inner value returns True
. This query is an example of filtering. Filtering keeps only the records where the predicate, is_positive
, holds.
filter_dd
¶Suppose we wish to write a function that can filter a dictionary-of-dictionaries by applying any predicate to a specific inner key's value. Implement the function,
def filter_dd(dd, inner_key, predicate):
...
to perform this task according to the specification below.
Inputs:
dd
: A dictionary of dictionaries.inner_key
: The inner key whose values we wish to use for filtering.predicate
: A function that, given the value of an inner key, returns True
or False
.Task: Scan all outer key-value pairs, applying predicate
to the value of the target inner-key. Keep only the outer key-value pairs where the predicate returns True
. Returning a new dictionary with the final results.
Outputs: Return a new Python dictionary.
Notes: You may assume that inner_key
exists in all of the inner dictionaries.
### Define demo inputs
demo_dd_ex4 = \
{'4f2491a8e4b04f6e695eac41': {'lat': 32.984154,
'long': -96.754765,
'type': 'Temple',
'country_code': 'US'},
'4d6d12b5d5d937046513cbf9': {'lat': -29.911862,
'long': -71.256914,
'type': 'Medical Center',
'country_code': 'CL'},
'4f67d81de4b072f14537b19b': {'lat': 10.077291,
'long': -69.280417,
'type': 'Residential Building (Apartment / Condo)',
'country_code': 'VE'},
'4e88d624d3e39d6f4e7dded3': {'lat': 49.276867,
'long': -123.111415,
'type': 'Soccer Stadium',
'country_code': 'CA'}}
demo_inner_key_ex4 = 'lat'
def demo_predicate_ex4(x):
return x > 0
Calling filter_dd
on the above inputs, as done in the solution cell below, should return the dictionary,
{'4f2491a8e4b04f6e695eac41': {
'lat': 32.984154,
'long': -96.754765,
'type': 'Temple',
'country_code': 'US'},
'4f67d81de4b072f14537b19b': {
'lat': 10.077291,
'long': -69.280417,
'type': 'Residential Building (Apartment / Condo)',
'country_code': 'VE'},
'4e88d624d3e39d6f4e7dded3': {
'lat': 49.276867,
'long': -123.111415,
'type': 'Soccer Stadium',
'country_code': 'CA'}}
since these are the ones where the inner 'lat'
value is positive.
outline = '''
Goal/Output:
Return filtered dict where the rpedicate == True
- data_type: dict
Input:
dd
- dict of dicts
inner_key
- the inner key values who we wish to use for filtering
predicate
- function that, given the value of an inner_key, return True/False
Strategy:
0. initialize dict dd_filtered
1. loop through key-values pair in dd
2. if predicate[inner_key] is True
3. add dict values to dd_filtered
4. return dd_filtered
Solution:
'''
### Exercise 4 solution
def filter_dd(dd, inner_key, predicate=lambda x: True):
### BEGIN SOLUTION
dd_filtered = {}
for key, value in dd.items():
if predicate(value[inner_key]):
dd_filtered[key] = value
return dd_filtered
### END SOLUTION
### demo function call
filter_dd(demo_dd_ex4, demo_inner_key_ex4, predicate=demo_predicate_ex4)
The cell below will test your solution for Exercise 4. The testing variables will be available for debugging under the following names in a dictionary format.
input_vars
- Input variables for your solution. original_input_vars
- Copy of input variables from prior to running your solution. These should be the same as input_vars
- otherwise the inputs were modified by your solution.returned_output_vars
- Outputs returned by your solution.true_output_vars
- The expected output. This should "match" returned_output_vars
based on the question requirements - otherwise, your solution is not returning the correct output. ### test_cell_ex4
### BEGIN HIDDEN TESTS
import dill
import hashlib
with open('resource/asnlib/public/hash_check.pkl', 'rb') as f:
hash_check = dill.load(f)
for fname in ['testers.py', '__init__.py', 'test_utils.py']:
hash_check(f'tester_fw/{fname}', f'resource/asnlib/public/{fname}')
del hash_check
del dill
del hashlib
### END HIDDEN TESTS
from tester_fw.testers import Tester
conf = {
'case_file':'tc_4',
'func': filter_dd, # replace this with the function defined above
'inputs':{ # input config dict. keys are parameter names
'dd':{
'dtype':'dict', # data type of param.
'check_modified':True,
},
'inner_key':{
'dtype':'str', # data type of param.
'check_modified':False,
},
'predicate':{
'dtype':'function', # data type of param.
'check_modified':True,
}
},
'outputs':{
'output_0':{
'index':0,
'dtype':'dict',
'check_dtype': True,
'check_col_dtypes': True, # Ignored if dtype is not df
'check_col_order': True, # Ignored if dtype is not df
'check_row_order': True, # Ignored if dtype is not df
'check_column_type': True, # Ignored if dtype is not df
'float_tolerance': 10 ** (-6)
}
}
}
tester = Tester(conf, key=b'1ZePXzAcTR7lcNpmx1HRK0lT3v-Ikrg8mZ3n-wVFTBo=', path='resource/asnlib/publicdata/')
for _ in range(70):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### BEGIN HIDDEN TESTS
tester = Tester(conf, key=b'VsXYx28C2-hiT5E79LTUsrmNZRvKsBiAU_NaL4lPedI=', path='resource/asnlib/publicdata/encrypted/')
for _ in range(20):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### END HIDDEN TESTS
print('Passed! Please submit.')
RUN ME: Filtered POIs: If you had a correct solution, you could use it to filter the POIs by various criteria. We have done so to created a filtered POI, called filtered_pois
, satisfying two predicates:
Run this cell whether or not you completed Exercise 4 to make this filtered POI data available, in a dictionary of dictionaries called filtered_pois
.
filtered_pois = load_database('filtered_pois.pickle')
print(f"\nThere are {len(filtered_pois):,} POI records (down from {len(pois):,} originally).")
print(f"Top countries by visit: {top_ccs_by_visits}")
print("A sample of filtered POIs (by top countries and food/drink types):")
sample_dict(filtered_pois, 5)
The last part of the dataset is the collection of follower connections. Run the cell below to load these data:
connections = load_database('connections.pickle')
print(f"\nThere are {len(connections):,} connections (nonsymmetric).")
print(f"For instance, here are user 54's connections:")
[x for x in connections if x[0] == 54]
The connections are stored as a Python list of pairs of user IDs. The example above shows people that user 54 is connected to on some social media site.
Important caveat: The connections are stored nonsymmetrically. That is, while the pair (54, 528)
appears, user 54
happens to not appear in the connections list of 528
:
[x for x in connections if x[0] == 528]
This choice was made by the people who generated the data to save space. However, for our analysis, we will assume that if a pair (a, b)
exists, it implies that both a
is connected to b
and b
is connected to a
.
get_active_users
¶Let's define an active user to be one who has both connections and check-in visits. Implement a function,
def get_active_users(connections, checkins):
...
to find and return all active users.
Inputs:
connections
: A Python list of pairs of user IDs (integers).checkins
: Check-in records, stored as a dictionary of lists of dictionaries (mapping user IDs to lists of visits; recall Part C, Exercise 3 or Part D, Exercise 4).Task: Determine which users have both connections (meaning they appear in the connections
list) and check-ins (meaning they have at least one check-in visit).
Output: Return a Python set of all active users. If there are no active users, this function should return an empty set.
Notes and hints: Assume relationships are symmetric. That is, we say an active user x
"has connections" if it appears in either a pair (x, y)
or a pair (y, x)
.
### Define demo inputs
demo_connections_ex5 = \
[ (72049, 99545),
(99545, 822927),
(470311, 665256),
(470311, 846214),
(470311, 894292),
(470311, 1045011),
(502736, 921752),
(668626, 921752),
(686743, 921752),
(921752, 944779),
(921752, 972956),
(921752, 1076615)]
demo_checkins_ex5 = \
{ 99545: [ { 'date': 'Thu Aug 16 22:55:43 +0000 2012',
'poi': '4b6ef8a8f964a520f2d32ce3'},
{ 'date': 'Tue Aug 14 23:46:30 +0000 2012',
'poi': '4c66a5b1e1da1b8daf6e9bc3'},
{ 'date': 'Thu Aug 09 18:45:08 +0000 2012',
'poi': '4ce575825fce548110bf5baa'}],
470311: [ { 'date': 'Sun Apr 08 08:53:52 +0000 2012',
'poi': '4cddd943db1254815fee2cce'},
{ 'date': 'Sun Jun 03 00:01:22 +0000 2012',
'poi': '4cddd943db1254815fee2cce'}],
921752: [ { 'date': 'Sun Aug 19 13:35:29 +0000 2012',
'poi': '4b58539af964a520ea5228e3'},
{ 'date': 'Sat May 05 10:24:08 +0000 2012',
'poi': '4b55a124f964a520b2e927e3'},
{ 'date': 'Mon Aug 06 03:42:48 +0000 2012',
'poi': '4ed2ff869adf25445a084c63'},
{ 'date': 'Thu Jul 05 11:01:38 +0000 2012',
'poi': '4b3dd6ddf964a520089725e3'}]}
Given the preceding input, a correct solution would return the set of active users,
{921752, 99545, 470311}
These are the only user IDs in this input that have both visits and connections.
outline = '''
Goal/Output:
Return a set of all active users
Input:
checkins
- data_type: dict of lists of dicts
- {user_id_int: [ {'date': str, 'poi': poi_id_str}, {} ] }
connections
- list of tuples (int, int)
- non-symmetric
Strategy:
0. flatten list of tuples in connections
1. get set of checkins user_ids
2. return intersection of 0 and 1
Solution:
'''
### Exercise 5 solution
def get_active_users(connections, checkins):
### BEGIN SOLUTION
connections_flattened = sum(connections, ())
checkins_set = set(checkins.keys())
return checkins_set.intersection(connections_flattened)
### END SOLUTION
### demo function call
get_active_users(demo_connections_ex5, demo_checkins_ex5)
The cell below will test your solution for Exercise 5. The testing variables will be available for debugging under the following names in a dictionary format.
input_vars
- Input variables for your solution. original_input_vars
- Copy of input variables from prior to running your solution. These should be the same as input_vars
- otherwise the inputs were modified by your solution.returned_output_vars
- Outputs returned by your solution.true_output_vars
- The expected output. This should "match" returned_output_vars
based on the question requirements - otherwise, your solution is not returning the correct output. ### test_cell_ex5
### BEGIN HIDDEN TESTS
import dill
import hashlib
with open('resource/asnlib/public/hash_check.pkl', 'rb') as f:
hash_check = dill.load(f)
for fname in ['testers.py', '__init__.py', 'test_utils.py']:
hash_check(f'tester_fw/{fname}', f'resource/asnlib/public/{fname}')
del hash_check
del dill
del hashlib
### END HIDDEN TESTS
from tester_fw.testers import Tester
conf = {
'case_file':'tc_5',
'func': get_active_users, # replace this with the function defined above
'inputs':{ # input config dict. keys are parameter names
'connections':{
'dtype':'list', # data type of param.
'check_modified':True,
},
'checkins':{
'dtype':'dict', # data type of param.
'check_modified':True,
}
},
'outputs':{
'output_0':{
'index':0,
'dtype':'set',
'check_dtype': True,
'check_col_dtypes': True, # Ignored if dtype is not df
'check_col_order': True, # Ignored if dtype is not df
'check_row_order': True, # Ignored if dtype is not df
'check_column_type': True, # Ignored if dtype is not df
'float_tolerance': 10 ** (-6)
}
}
}
tester = Tester(conf, key=b'1ZePXzAcTR7lcNpmx1HRK0lT3v-Ikrg8mZ3n-wVFTBo=', path='resource/asnlib/publicdata/')
for _ in range(70):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### BEGIN HIDDEN TESTS
tester = Tester(conf, key=b'VsXYx28C2-hiT5E79LTUsrmNZRvKsBiAU_NaL4lPedI=', path='resource/asnlib/publicdata/encrypted/')
for _ in range(20):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### END HIDDEN TESTS
print('Passed! Please submit.')
RUN ME: Active users: If you had a correct solution, you could use it to find all active users in the full dataset. We have done just that.
Run this cell whether or not you completed Exercise 5. It will create a Python set of active users' IDs called active_users
.
active_users = load_database('active_users.pickle')
print(f"\nThere are {len(active_users):,} active users.")
print("Here are a few of them:", *list(active_users)[:5], "...")
form_connection_vectors
¶Given some user, a
, the connection vector of a
is a Python set consisting of all other users connected to a
. Implement the function,
def form_connection_vectors(connections):
...
to construct connection vectors for all users.
Input:
connections
: A Python list of user ID-pairs, (a, b)
, signifying a mutual connection between user a
and user b
. By "mutual connection," we mean that a
is connected to b
and b
is connected to a
. Therefore, if there is a pair (a, b)
, then b
should be in the connection vector of a
and a
should be in the connection vector of b
.Task: Sweep through the connections and construct connection vectors for all users.
Outputs: Return a Python dictionary of Python sets. The dictionary keys are user IDs; their values are the connection vectors (Python sets).
### Define demo inputs
demo_connections_ex6 = [(2, 10), (6, 7), (7, 0), (7, 9), (11, 10), (12, 10), (13, 10)]
Suppose the input connections are given by demo_connections_ex6
. Then a correct solution to this problem would produce the following output:
{ 0: {7},
2: {10},
6: {7},
7: {0, 9, 6},
9: {7},
10: {2, 11, 12, 13},
11: {10},
12: {10},
13: {10}}
Observe that 0
only appears in the pair (7, 0)
; therefore, the connection vector of 0
is the singleton set {7}
. By contrast, 7
appears in the pairs (6, 7)
, (7, 0)
, and (7, 9)
; therefore, the connection vector of 7
is the set {0, 6, 9}
. (Recall that sets are equal regardless of element order.)
outline = '''
Goal/Output:
Return connection_dict
- data_type: dict of sets
- {user_id_int: {set of connected user_id_ints}}
Input:
connections
- datatype: list of tuples (int, int)
- non-symmetric connections
Strategy:
0. initialize a dict as connections_dict
1. loop through tuples in connections
2. add to sets a key-value pair for BOTH entries in each tuple
3. return connections_dict
Solution:
'''
### Exercise 6 solution
def form_connection_vectors(connections):
### BEGIN SOLUTION
connections_set = set(sum(connections, ()))
connections_dict = {k:set() for k in connections_set}
for i, j in connections:
connections_dict[i].add(j)
connections_dict[j].add(i)
return connections_dict
### END SOLUTION
### demo function call
form_connection_vectors(demo_connections_ex6)
The cell below will test your solution for Exercise 6. The testing variables will be available for debugging under the following names in a dictionary format.
input_vars
- Input variables for your solution. original_input_vars
- Copy of input variables from prior to running your solution. These should be the same as input_vars
- otherwise the inputs were modified by your solution.returned_output_vars
- Outputs returned by your solution.true_output_vars
- The expected output. This should "match" returned_output_vars
based on the question requirements - otherwise, your solution is not returning the correct output. ### test_cell_ex6
### BEGIN HIDDEN TESTS
import dill
import hashlib
with open('resource/asnlib/public/hash_check.pkl', 'rb') as f:
hash_check = dill.load(f)
for fname in ['testers.py', '__init__.py', 'test_utils.py']:
hash_check(f'tester_fw/{fname}', f'resource/asnlib/public/{fname}')
del hash_check
del dill
del hashlib
### END HIDDEN TESTS
from tester_fw.testers import Tester
conf = {
'case_file':'tc_6',
'func': form_connection_vectors, # replace this with the function defined above
'inputs':{ # input config dict. keys are parameter names
'connections':{
'dtype':'list', # data type of param.
'check_modified':True,
}
},
'outputs':{
'output_0':{
'index':0,
'dtype':'dict',
'check_dtype': True,
'check_col_dtypes': True, # Ignored if dtype is not df
'check_col_order': True, # Ignored if dtype is not df
'check_row_order': True, # Ignored if dtype is not df
'check_column_type': True, # Ignored if dtype is not df
'float_tolerance': 10 ** (-6)
}
}
}
tester = Tester(conf, key=b'1ZePXzAcTR7lcNpmx1HRK0lT3v-Ikrg8mZ3n-wVFTBo=', path='resource/asnlib/publicdata/')
for _ in range(70):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### BEGIN HIDDEN TESTS
tester = Tester(conf, key=b'VsXYx28C2-hiT5E79LTUsrmNZRvKsBiAU_NaL4lPedI=', path='resource/asnlib/publicdata/encrypted/')
for _ in range(20):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### END HIDDEN TESTS
print('Passed! Please submit.')
RUN ME: Connection vectors: If you had a correct solution, you could use it to calculate all connection vectors of the full dataset. We have done just that for you.
Run this cell whether or not you completed Exercise 6. It will create a Python set of connection vectors called connection_vectors
.
connection_vectors = load_database('connection_vectors.pickle')
print(f"\nThere are {len(connection_vectors):,} connection vectors.")
print("Here are a few of them:")
sample_dict(connection_vectors, 3)
To measure the "friend potential" of two users, we need a way to measure whether they have anything in common. Let's measure similarity based on whether two users visit the same places.
Visit vectors. Consider a hypothetical check-in record for user 123
:
{ ...,
123: [{'poi': 'abc', 'date:' 'Mon May 07...'},
{'poi': 'def', 'date:' 'Fri May 25...'},
{'poi': 'abc', 'date': 'Thu May 17...'},
{'poi': 'abc', 'date': 'Sun Apr 15...'},
{'poi': 'xyz', 'date': 'Wed Apr 11...'}
],
}
This user visited 'abc'
three times and 'def'
and 'xyz'
once each.
Define the visit vector for this user to be the distinct POIs that they visited. In this instance, the visit vector of 123
would be the set:
{'abc', 'def', 'xyz'}
Next, define the similarity of two users to be the number of POIs they have in common. For instance, for these two visit vectors,
v = {'abc', 'def', 'xyz'}
w = {'abc', 'xyz', 'lmn'}
the similarity equals 2, since visit vectors v
and w
share abc
and xyz
in common.
In the next two exercises, you'll calculate similarities for the full dataset.
form_visit_vectors
¶Write some code to calculate all of the visit vectors for a given set of users, by completing the function,
def form_visit_vectors(users, checkins, pois):
...
according to the following specifications.
Inputs:
users
: User IDs to process, given as a Python setcheckins
: Check-in records, stored as a dictionary of lists of visit dictionaries (see Part C, Exercise 3)pois
: POIs, stored as a dictionary of dictionariesTask: For each user in users
, determine which of their visits in checkins
has a known POI (meaning it appears in pois
). The visit vector of that user will be those POIs.
Outputs: Return a Python dictionary of Python sets. The keys are user IDs, and the values are the visit vectors.
Notes:
users
and checkins
.pois
.### Define demo inputs
demo_users_ex7 = {1290304, 880634, 1972270}
demo_checkins_ex7 = \
{491496: [{'date': 'Fri Aug 10 10:01:04 +0000 2012',
'poi': '4f0549d249013460824fbf09'},
{'date': 'Mon Jul 30 13:28:34 +0000 2012',
'poi': '4dd259ade4cd7f7178c663d3'},
{'date': 'Sun Jul 29 01:19:14 +0000 2012',
'poi': '4e17035018388d0d26847618'},
{'date': 'Mon Aug 13 05:42:51 +0000 2012',
'poi': '4d26846a342d6dcbf78ce4ca'}],
880634: [{'date': 'Sun Jul 29 14:23:05 +0000 2012',
'poi': '4ef48cec29c24e3536a28b45'}],
1290304: [{'date': 'Wed Jul 11 07:21:59 +0000 2012',
'poi': '4f64e3f0e4b03a7ce161c360'},
{'date': 'Thu Jul 26 22:23:05 +0000 2012',
'poi': '5009a022e4b058692f5cb2c9'},
{'date': 'Mon Jul 09 06:40:29 +0000 2012',
'poi': '4f64e3f0e4b03a7ce161c360'}],
1972270: [{'date': 'Tue May 15 12:15:40 +0000 2012',
'poi': '4be81fd588ed2d7f3d74cb1d'},
{'date': 'Sat Apr 28 08:35:22 +0000 2012',
'poi': '4c270706b012b713a7fd0893'},
{'date': 'Sat May 19 06:38:19 +0000 2012',
'poi': '4e6b107baeb7c31e43571420'}]}
demo_pois_ex7 = \
{'4be81fd588ed2d7f3d74cb1d': {'country_code': 'MY',
'lat': 1.524932,
'long': 110.336895,
'type': 'Fast Food Restaurant'},
'4c270706b012b713a7fd0893': {'country_code': 'MY',
'lat': 1.45373,
'long': 110.458474,
'type': 'Malaysian Restaurant'},
'4e6b107baeb7c31e43571420': {'country_code': 'MY',
'lat': 1.456737,
'long': 110.441649,
'type': 'Malaysian Restaurant'},
'4ef48cec29c24e3536a28b45': {'country_code': 'MY',
'lat': 2.213388,
'long': 102.246559,
'type': 'Coffee Shop'}}
Although the demo input above indicates four possible users—three in demo_users_ex7
and a fourth in demo_checkins_ex7
—a correct solution applied to the demo inputs above should produce only two visit vectors,
{ 880634: {'4ef48cec29c24e3536a28b45'},
1972270: { '4be81fd588ed2d7f3d74cb1d',
'4c270706b012b713a7fd0893',
'4e6b107baeb7c31e43571420'}}
for users 880634
and 1972270
. User 491496
does not have a visit vector because they are not in demo_users_ex7
. User 1290304
does not appear because none of their check-ins appears in demo_pois_ex7
.
outline = '''
Goal/Output:
Return visit_vector_dict
- data_type: dict of sets
- {user_id_int: {set of vistit vectors}}
Input:
checkins
- data_type: dict of lists of dicts
- {user_id_int: [ {'date': str, 'poi': poi_id_str}, {} ] }
pois
- data_type: dict of dicts
- {poi_id_str {str: object}}
users
- data_type: set
Strategy:
0. initialize a dict as visit_vector_dict
- defaultdict(set)
1. loop through users
2. loop through through their visits in checkins
3. if checkins[i]['poi'] in pois
4. if true, add visits['poi'] to visits_vector_dict[user_id]
5. return visit_vector_dict
Solution:
'''
### Exercise 7 solution
def form_visit_vectors(users, checkins, pois):
### BEGIN SOLUTION
from collections import defaultdict
visit_vector_dict = defaultdict(set)
for user in users:
for visit_list in checkins[user]:
if visit_list['poi'] in pois:
visit_vector_dict[user].add(visit_list['poi'])
return dict(visit_vector_dict)
### END SOLUTION
### demo function call
pprint(form_visit_vectors(demo_users_ex7, demo_checkins_ex7, demo_pois_ex7), indent=2)
The cell below will test your solution for Exercise 7. The testing variables will be available for debugging under the following names in a dictionary format.
input_vars
- Input variables for your solution. original_input_vars
- Copy of input variables from prior to running your solution. These should be the same as input_vars
- otherwise the inputs were modified by your solution.returned_output_vars
- Outputs returned by your solution.true_output_vars
- The expected output. This should "match" returned_output_vars
based on the question requirements - otherwise, your solution is not returning the correct output. ### test_cell_ex7
### BEGIN HIDDEN TESTS
import dill
import hashlib
with open('resource/asnlib/public/hash_check.pkl', 'rb') as f:
hash_check = dill.load(f)
for fname in ['testers.py', '__init__.py', 'test_utils.py']:
hash_check(f'tester_fw/{fname}', f'resource/asnlib/public/{fname}')
del hash_check
del dill
del hashlib
### END HIDDEN TESTS
from tester_fw.testers import Tester
conf = {
'case_file':'tc_7',
'func': form_visit_vectors, # replace this with the function defined above
'inputs':{ # input config dict. keys are parameter names
'users':{
'dtype':'set', # data type of param.
'check_modified':True,
},
'checkins':{
'dtype':'dict', # data type of param.
'check_modified':True,
},
'pois':{
'dtype':'dict', # data type of param.
'check_modified':True,
}
},
'outputs':{
'output_0':{
'index':0,
'dtype':'dict',
'check_dtype': True,
'check_col_dtypes': True, # Ignored if dtype is not df
'check_col_order': True, # Ignored if dtype is not df
'check_row_order': True, # Ignored if dtype is not df
'check_column_type': True, # Ignored if dtype is not df
'float_tolerance': 10 ** (-6)
}
}
}
tester = Tester(conf, key=b'1ZePXzAcTR7lcNpmx1HRK0lT3v-Ikrg8mZ3n-wVFTBo=', path='resource/asnlib/publicdata/')
for _ in range(70):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### BEGIN HIDDEN TESTS
tester = Tester(conf, key=b'VsXYx28C2-hiT5E79LTUsrmNZRvKsBiAU_NaL4lPedI=', path='resource/asnlib/publicdata/encrypted/')
for _ in range(20):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### END HIDDEN TESTS
print('Passed! Please submit.')
RUN ME: Visit vectors: If you had a correct solution, you could use it to calculate all visit vectors of the real dataset. We have done just that for you.
Run this cell whether or not you completed Exercise 7. It will create a Python dictionary of visit vectors called visit_vectors
.
visit_vectors = load_database('visit_vectors.pickle')
print(f"\nThere are {len(visit_vectors):,} visit vectors.")
print("Here are a few of them:")
sample_dict(visit_vectors, 3)
calc_similarities
¶This exercise asks you to measure how similar connected users are to one another using connection vectors and visit vectors.
Similarity: Recall that the similarity of two users is the number of common POIs between their visit vectors. For instance, if v
and w
are two visit vectors,
v = {'abc', 'def', 'xyz'}
w = {'abc', 'xyz', 'lmn'}
then their similiarity equals two, due to 'abc'
and 'xyz'
being common.
Complete the function,
def calc_similarities(visit_vectors, connection_vectors):
...
so that it calculates similarities per the following requirements.
Inputs:
visit_vectors
: A dictionary of sets, where each key is a user and the value is the visit vector, that is, the set of POIs they visited.connection_vectors
: A dictionary of sets, where each key is a user and the value is the connections, that is, the set of users they are connected to.Task: For every pair (u, v)
of connected users u
and v
, measure their similarity. If either is missing a visit vector, then their similarity is 0.
Outputs: Return a Python dictionary of tuples. The key is a user. The tuple should have these three components in this order:
Note: You may assume the connections vectors are "complete." That is, take the connections as given in the data structure. Do not try to add or otherwise "symmetrize" these connections.
### Define demo inputs
demo_connection_vectors_ex8 = \
{ 100: {101, 102, 103},
101: {104, 100, 103},
103: {101},
104: {107, 100, 101, 103},
105: {100},
106: {107, 103}}
demo_visit_vectors_ex8 = \
{ 100: {'344274', 'e3832a'},
101: {'344274', 'e5495e', '9ae1fe', 'e3832a'},
102: {'e5495e'},
103: {'344274', '9ae1fe', 'e5495e', 'e3832a'},
105: {'344274'},
106: {'9ae1fe', 'e5495e', 'e3832a'},
107: {'e3832a'}}
A correct function should produce the following result:
{100: (2, 3, 2),
101: (4, 3, 4),
103: (4, 1, 4),
104: (0, 4, 0),
105: (1, 1, 1),
106: (3, 2, 3)}
Some quick observations:
100
through 107
).102
and 107
do not appear in the output because they are missing from demo_connection_vectors_ex8
—there are no connection pairs to enumerate.104
appears in the output. However, since it has no visits (does not appear in demo_visit_vectors_ex8
, it has a maximum similarity of 0 with all of its neighbors.outline = '''
Goal/Output:
Return visit_vector_dict containing similarity of every pair of CONNECTED users (sim = 0 if visit_vector is empty)
- data_types: tuples
- {user_id_in : (component_0, component_1, component_2)}
- component_0: length of the user's visit vector
- component_1: the number of the user's connections
- component_2: maximum similarity between the user and any of their connections
Input:
visit_vectors
- data:type: dict of sets
- {user_id_int: {set of poi_id_str visited}}
connection_vectors
- dict of sets
- {user_id_int: {set of connected user_id_ints}}
Strategy:
0. initialize dict as visit_vector_dict
1. loop through users (cv_key) in connection_vectors
** something to be aware of, we don't always know if the keys will exist in visit_vectors,
we need to be able to get something regardless
**
2. component_0 == visit_vectors[cv_key] == cv_visits
3. component_1 == connection_vectors[cv_key] == cv_connections
4. initialize max_sim = 0
5. loop through values in connection_vectors[cv_key]
6. count intersection of visits_vectors and overwrite max_sim if new_sim > max_sim
7. enter len(component_0), len(component_1), max_sim into visit_vector_dict
8. return visit_vector_dict
Solution:
'''
### Exercise 8 solution
def calc_similarities(visit_vectors, connection_vectors):
### BEGIN SOLUTION
visit_vector_dict = {}
for cv_key in connection_vectors.keys():
cv_visits = visit_vectors.get(cv_key, set())
cv_connections = connection_vectors[cv_key]
max_sim = 0
for vv_key in connection_vectors[cv_key]:
vv_visits = visit_vectors.get(vv_key, set())
common_visits = vv_visits.intersection(cv_visits)
new_sim = len(common_visits)
if new_sim > max_sim:
max_sim = new_sim
visit_vector_dict[cv_key] = (len(cv_visits), len(cv_connections), max_sim)
return visit_vector_dict
### END SOLUTION
### demo function call
pprint(calc_similarities(demo_visit_vectors_ex8, demo_connection_vectors_ex8))
The cell below will test your solution for Exercise 7. The testing variables will be available for debugging under the following names in a dictionary format.
input_vars
- Input variables for your solution. original_input_vars
- Copy of input variables from prior to running your solution. These should be the same as input_vars
- otherwise the inputs were modified by your solution.returned_output_vars
- Outputs returned by your solution.true_output_vars
- The expected output. This should "match" returned_output_vars
based on the question requirements - otherwise, your solution is not returning the correct output. ### test_cell_ex8
### BEGIN HIDDEN TESTS
import dill
import hashlib
with open('resource/asnlib/public/hash_check.pkl', 'rb') as f:
hash_check = dill.load(f)
for fname in ['testers.py', '__init__.py', 'test_utils.py']:
hash_check(f'tester_fw/{fname}', f'resource/asnlib/public/{fname}')
del hash_check
del dill
del hashlib
### END HIDDEN TESTS
from tester_fw.testers import Tester
conf = {
'case_file':'tc_8',
'func': calc_similarities, # replace this with the function defined above
'inputs':{ # input config dict. keys are parameter names
'visit_vectors':{
'dtype':'dict', # data type of param.
'check_modified':True,
},
'connection_vectors':{
'dtype':'dict', # data type of param.
'check_modified':True,
},
},
'outputs':{
'output_0':{
'index':0,
'dtype':'dict',
'check_dtype': True,
'check_col_dtypes': True, # Ignored if dtype is not df
'check_col_order': True, # Ignored if dtype is not df
'check_row_order': True, # Ignored if dtype is not df
'check_column_type': True, # Ignored if dtype is not df
'float_tolerance': 10 ** (-6)
}
}
}
tester = Tester(conf, key=b'1ZePXzAcTR7lcNpmx1HRK0lT3v-Ikrg8mZ3n-wVFTBo=', path='resource/asnlib/publicdata/')
for _ in range(70):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### BEGIN HIDDEN TESTS
tester = Tester(conf, key=b'VsXYx28C2-hiT5E79LTUsrmNZRvKsBiAU_NaL4lPedI=', path='resource/asnlib/publicdata/encrypted/')
for _ in range(20):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### END HIDDEN TESTS
print('Passed! Please submit.')
RUN ME: Similarity analysis: If you had a correct solution, you could use it to calculate all similarity scores for all users. We have done just that for you.
The code cell below plots the results. One conclusion is that a user's connections do not seem to reflect their interests, as assessed by check-ins behavior. (In this analysis, recall that the only check-ins considered are those pertaining to food and drink, which is at least one kind of social interest.)
Run this cell whether or not you completed Exercise 8 to see the results. It will create a plot for your edification.
sims = load_database('sims.pickle')
def make_sims_histogram(sims=sims):
from numpy import histogram
VISITS_THRESHOLD = 5
CONNS_THRESHOLD = 5
percent_in_common = [common/total*100 for num_conns, total, common in sims.values()
if total >= VISITS_THRESHOLD and num_conns >= CONNS_THRESHOLD]
counts_in_common, bins_in_common = histogram(percent_in_common, bins=list(range(0, 105, 5)))
print(list((c/len(percent_in_common)*100, b) for c, b in zip(counts_in_common, bins_in_common)))
from matplotlib.pyplot import figure, hist, title, xlabel, ylabel
figure(figsize=(10, 5.625))
hist(bins_in_common[:-1], bins_in_common,
weights=[c/len(percent_in_common)*100 for c in counts_in_common])
title(f'Check-ins similarity (for the {len(percent_in_common):,} users that have at least {VISITS_THRESHOLD:,} check-ins and {CONNS_THRESHOLD:,} connections.)',
loc='left')
xlabel('% of check-ins that overlap with connections')
ylabel('% of users', rotation=0, horizontalalignment='right')
pass
make_sims_histogram()
The check-ins data allows us to do something interesting: we can try to suggest new connections based on similar visit behavior!
suggest_friends
¶Complete the function,
def suggest_friends(user_id, visit_vectors, connection_vectors):
...
so that it suggests new friends for a given user. Here are the requirements:
Inputs:
user_id
: A user ID, given as an integer.visit_vectors
: The collection of visit vectors for all users (see Part F, Exercise 7).connection_vectors
: The collection of connection vectors for all users (see Part E, Exercise 6).Task: Compare the visit vector for user_id
against the visit vectors of all other users who have a visit vector but are not already connected to that user. Return the top matches.
Outputs: Return the top 3 most similar candidates as a Python list of pairs. The first element of the pair is the suggested user ID and the second is the similarity measure. The output must be sorted in descending order from most matches to fewest matches. Break ties using the user ID in ascending order. If there are fewer than 3 matches, return them all.
Notes:
user_id
does not have a visit vector, return an empty list.### Define demo inputs
demo_user_id_ex9 = 101
demo_visit_vectors_ex9 = \
{100: {'e3832a', '344274'},
101: {'9ae1fe', 'e3832a', 'e5495e', '344274'},
102: {'e5495e'},
103: {'e5495e', 'e3832a', '9ae1fe', '344274'},
105: {'344274'},
106: {'e5495e', 'e3832a', '9ae1fe'},
107: {'e3832a'}}
demo_connection_vectors_ex9 = { \
100: {101, 102, 103},
101: {104, 100, 103},
103: {101},
104: {107, 100, 101, 103},
105: {100},
106: {107, 103}}
outline = '''
Goal/Output:
Return top 3 highest matches of unconnected users by comparing visit_vetor[user_id] against the visit_vectors of
unconnected users who have a visit vector
- data_type: list of pairs (tuples)
- (suggested_user_id_int, sim_score)
- ordering: # of matches desc, user_id asc
Input:
user_id
- data_type: int
visit_vectors
- data_type: dict of sets
- {user_id_int: (set of poi_id_str visited)}
connection_vectors
- data_type: dict of sets
- {user_id_int: (set of connected user_id_ints) }
Strategy:
0. initialize empty list as suggested_friends
1. if user_id does not have a visit_vector, return an empty list
2. get visit_vectors[user_id] to calculate non-connected users' visits
3. get non-connected users
- all_users via visit_vectors
- connected_users = connection_vector[user_id] IF THEY EXIST
- all_users - connected_user - user_id
4. loop through candidates in non_connected_users
5. get # of matching visits for each candidate and user_id
6. get candidate visit_vectors IF THEY EXIST
7. get intersection non_conn_visits and user_id_visits as common_visits
8. len(common_visits) as sim_score
9. store as tuple (candidate, sim_score)
10. append to suggested_friends
11. sort suggested_friends
- sorted nested list by multiple keys == lambda function
- sim_score desc == -x[1]
- user_id asc == x[0]
12. return top 3 suggested_friends
Solution:
'''
### Exercise 9 solution
def suggest_friends(user_id, visit_vectors, connection_vectors):
### BEGIN SOLUTION
suggested_friends = []
if user_id not in visit_vectors:
return suggested_friends
user_id_visits = visit_vectors[user_id]
all_users = set(visit_vectors.keys())
connected_users = connection_vectors.get(user_id, set())
non_connected_users = all_users - connected_users - {user_id}
for candidate in non_connected_users:
non_conn_visits = visit_vectors.get(candidate, set())
common_visits = user_id_visits.intersection(non_conn_visits)
sim_score = len(common_visits)
suggested_friends.append((candidate, sim_score))
suggested_friends.sort(key = lambda x: (-x[1], x[0]))
return suggested_friends[:3]
### END SOLUTION
### demo function call
pprint(suggest_friends(demo_user_id_ex9, demo_visit_vectors_ex9, demo_connection_vectors_ex9))
The cell below will test your solution for Exercise 9. The testing variables will be available for debugging under the following names in a dictionary format.
input_vars
- Input variables for your solution. original_input_vars
- Copy of input variables from prior to running your solution. These should be the same as input_vars
- otherwise the inputs were modified by your solution.returned_output_vars
- Outputs returned by your solution.true_output_vars
- The expected output. This should "match" returned_output_vars
based on the question requirements - otherwise, your solution is not returning the correct output. ### test_cell_ex9
### BEGIN HIDDEN TESTS
import dill
import hashlib
with open('resource/asnlib/public/hash_check.pkl', 'rb') as f:
hash_check = dill.load(f)
for fname in ['testers.py', '__init__.py', 'test_utils.py']:
hash_check(f'tester_fw/{fname}', f'resource/asnlib/public/{fname}')
del hash_check
del dill
del hashlib
### END HIDDEN TESTS
from tester_fw.testers import Tester
conf = {
'case_file':'tc_9',
'func': suggest_friends, # replace this with the function defined above
'inputs':{ # input config dict. keys are parameter names
'user_id':{
'dtype':'int', # data type of param.
'check_modified':True,
},
'visit_vectors':{
'dtype':'dict', # data type of param.
'check_modified':True,
},
'connection_vectors':{
'dtype':'dict', # data type of param.
'check_modified':True,
}
},
'outputs':{
'output_0':{
'index':0,
'dtype':'list',
'check_dtype': True,
'check_col_dtypes': True, # Ignored if dtype is not df
'check_col_order': True, # Ignored if dtype is not df
'check_row_order': True, # Ignored if dtype is not df
'check_column_type': True, # Ignored if dtype is not df
'float_tolerance': 10 ** (-6)
}
}
}
tester = Tester(conf, key=b'1ZePXzAcTR7lcNpmx1HRK0lT3v-Ikrg8mZ3n-wVFTBo=', path='resource/asnlib/publicdata/')
for _ in range(70):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### BEGIN HIDDEN TESTS
tester = Tester(conf, key=b'VsXYx28C2-hiT5E79LTUsrmNZRvKsBiAU_NaL4lPedI=', path='resource/asnlib/publicdata/encrypted/')
for _ in range(20):
try:
tester.run_test()
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
except:
(input_vars, original_input_vars, returned_output_vars, true_output_vars) = tester.get_test_vars()
raise
### END HIDDEN TESTS
print('Passed! Please submit.')
If you have made it this far, congratulations on completing the exam. Don't forget to submit!
If you are interested in exploring this topic after the exam is over, run the friend suggester on other users and see how the distribution of candidate matches compares to existing matches. Is there an improvement? Can you come up with a better method?