From 170eb121f0e403cf249a96632e3a451083f6d19a Mon Sep 17 00:00:00 2001
From: Gabriel
Date: Sat, 9 May 2020 15:14:41 -0400
Subject: [PATCH] Cache mem optimize (#23)

* formatting (120 length -> 100)
* csbs to/from Redis
* nyt to/from Redis
* ignore locustfile
* unused coordinates
---
 .gitignore                    |   1 +
 app/data/__init__.py          |   6 +-
 app/io.py                     |  13 +++-
 app/location/__init__.py      |   8 ++-
 app/main.py                   |   6 +-
 app/routers/v1.py             |   6 +-
 app/routers/v2.py             |  14 ++++-
 app/services/location/csbs.py |  91 +++++++++++++++------------
 app/services/location/jhu.py  |   4 +-
 app/services/location/nyt.py  | 110 ++++++++++++++++++----------------
 app/utils/populations.py      |   4 +-
 pyproject.toml                |   6 +-
 tasks.py                      |  13 +++-
 tests/test_io.py              |   6 +-
 tests/test_location.py        |   7 ++-
 tests/test_routes.py          |   8 ++-
 tests/test_timeline.py        |   7 ++-
 17 files changed, 194 insertions(+), 116 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9c41818c..ab6f17ff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,6 +51,7 @@ htmlcov/
 nosetests.xml
 coverage.xml
 *,cover
+locustfile.py
 
 # Translations
 *.mo
diff --git a/app/data/__init__.py b/app/data/__init__.py
index 265bf3d3..60a75dac 100644
--- a/app/data/__init__.py
+++ b/app/data/__init__.py
@@ -4,7 +4,11 @@
 from ..services.location.nyt import NYTLocationService
 
 # Mapping of services to data-sources.
-DATA_SOURCES = {"jhu": JhuLocationService(), "csbs": CSBSLocationService(), "nyt": NYTLocationService()}
+DATA_SOURCES = {
+    "jhu": JhuLocationService(),
+    "csbs": CSBSLocationService(),
+    "nyt": NYTLocationService(),
+}
 
 
 def data_source(source):
diff --git a/app/io.py b/app/io.py
index 3bd443b6..2a563b15 100644
--- a/app/io.py
+++ b/app/io.py
@@ -10,7 +10,11 @@
 
 
 def save(
-    name: str, content: Union[str, Dict, List], write_mode: str = "w", indent: int = 2, **json_dumps_kwargs
+    name: str,
+    content: Union[str, Dict, List],
+    write_mode: str = "w",
+    indent: int = 2,
+    **json_dumps_kwargs,
 ) -> pathlib.Path:
     """Save content to a file. If content is a dictionary, use json.dumps()."""
     path = DATA / name
@@ -35,7 +39,12 @@ class AIO:
 
     @classmethod
     async def save(
-        cls, name: str, content: Union[str, Dict, List], write_mode: str = "w", indent: int = 2, **json_dumps_kwargs
+        cls,
+        name: str,
+        content: Union[str, Dict, List],
+        write_mode: str = "w",
+        indent: int = 2,
+        **json_dumps_kwargs,
     ):
         """Save content to a file. If content is a dictionary, use json.dumps()."""
         path = DATA / name
diff --git a/app/location/__init__.py b/app/location/__init__.py
index d12f28c3..1da5e9e5 100644
--- a/app/location/__init__.py
+++ b/app/location/__init__.py
@@ -11,7 +11,7 @@ class Location:  # pylint: disable=too-many-instance-attributes
     """
 
     def __init__(
-        self, id, country, province, coordinates, last_updated, confirmed, deaths, recovered
+        self, id, country, province, coordinates, last_updated, confirmed, deaths, recovered,
    ):  # pylint: disable=too-many-arguments
         # General info.
         self.id = id
@@ -66,7 +66,11 @@ def serialize(self):
             # Last updated.
             "last_updated": self.last_updated,
             # Latest data (statistics).
-            "latest": {"confirmed": self.confirmed, "deaths": self.deaths, "recovered": self.recovered},
+            "latest": {
+                "confirmed": self.confirmed,
+                "deaths": self.deaths,
+                "recovered": self.recovered,
+            },
         }
diff --git a/app/main.py b/app/main.py
index f8ed1b41..3e3c3317 100644
--- a/app/main.py
+++ b/app/main.py
@@ -49,7 +49,11 @@
 # Enable CORS.
 APP.add_middleware(
-    CORSMiddleware, allow_credentials=True, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"],
+    CORSMiddleware,
+    allow_credentials=True,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
 )
 APP.add_middleware(GZipMiddleware, minimum_size=1000)
diff --git a/app/routers/v1.py b/app/routers/v1.py
index 662514a0..517bc625 100644
--- a/app/routers/v1.py
+++ b/app/routers/v1.py
@@ -19,7 +19,11 @@ async def all_categories():
         "deaths": deaths,
         "recovered": recovered,
         # Latest.
-        "latest": {"confirmed": confirmed["latest"], "deaths": deaths["latest"], "recovered": recovered["latest"],},
+        "latest": {
+            "confirmed": confirmed["latest"],
+            "deaths": deaths["latest"],
+            "recovered": recovered["latest"],
+        },
     }
diff --git a/app/routers/v2.py b/app/routers/v2.py
index de5a5312..fe9d2475 100644
--- a/app/routers/v2.py
+++ b/app/routers/v2.py
@@ -65,11 +65,17 @@ async def get_locations(
         # Do filtering.
         try:
-            locations = [location for location in locations if str(getattr(location, key)).lower() == str(value)]
+            locations = [
+                location
+                for location in locations
+                if str(getattr(location, key)).lower() == str(value)
+            ]
         except AttributeError:
             pass
 
     if not locations:
-        raise HTTPException(404, detail=f"Source `{source}` does not have the desired location data.")
+        raise HTTPException(
+            404, detail=f"Source `{source}` does not have the desired location data.",
+        )
 
     # Return final serialized data.
     return {
@@ -84,7 +90,9 @@ async def get_locations(
 
 # pylint: disable=invalid-name
 @V2.get("/locations/{id}", response_model=LocationResponse)
-async def get_location_by_id(request: Request, id: int, source: Sources = "jhu", timelines: bool = True):
+async def get_location_by_id(
+    request: Request, id: int, source: Sources = "jhu", timelines: bool = True
+):
     """
     Getting specific location by id.
     """
diff --git a/app/services/location/csbs.py b/app/services/location/csbs.py
index 68bdb01c..e8668280 100644
--- a/app/services/location/csbs.py
+++ b/app/services/location/csbs.py
@@ -6,6 +6,7 @@
 from asyncache import cached
 from cachetools import TTLCache
 
+from ...caches import check_cache, load_cache
 from ...coordinates import Coordinates
 from ...location.csbs import CSBSLocation
 from ...utils import httputils
@@ -34,7 +35,7 @@ async def get(self, loc_id):  # pylint: disable=arguments-differ
 BASE_URL = "https://facts.csbs.org/covid-19/covid19_county.csv"
 
 
-@cached(cache=TTLCache(maxsize=1, ttl=3600))
+@cached(cache=TTLCache(maxsize=1, ttl=1800))
 async def get_locations():
     """
     Retrieves county locations; locations are cached for 1 hour
@@ -44,48 +45,54 @@ async def get_locations():
     """
     data_id = "csbs.locations"
     LOGGER.info(f"{data_id} Requesting data...")
-    async with httputils.CLIENT_SESSION.get(BASE_URL) as response:
-        text = await response.text()
-
-    LOGGER.debug(f"{data_id} Data received")
-
-    data = list(csv.DictReader(text.splitlines()))
-    LOGGER.debug(f"{data_id} CSV parsed")
-
-    locations = []
-
-    for i, item in enumerate(data):
-        # General info.
-        state = item["State Name"]
-        county = item["County Name"]
-
-        # Ensure country is specified.
-        if county in {"Unassigned", "Unknown"}:
-            continue
-
-        # Coordinates.
-        coordinates = Coordinates(item["Latitude"], item["Longitude"])  # pylint: disable=unused-variable
-
-        # Date string without "EDT" at end.
-        last_update = " ".join(item["Last Update"].split(" ")[0:2])
-
-        # Append to locations.
-        locations.append(
-            CSBSLocation(
-                # General info.
-                i,
-                state,
-                county,
-                # Coordinates.
-                Coordinates(item["Latitude"], item["Longitude"]),
-                # Last update (parse as ISO).
-                datetime.strptime(last_update, "%Y-%m-%d %H:%M").isoformat() + "Z",
-                # Statistics.
-                int(item["Confirmed"] or 0),
-                int(item["Death"] or 0),
+    # check shared cache
+    cache_results = await check_cache(data_id)
+    if cache_results:
+        LOGGER.info(f"{data_id} using shared cache results")
+        locations = cache_results
+    else:
+        LOGGER.info(f"{data_id} shared cache empty")
+        async with httputils.CLIENT_SESSION.get(BASE_URL) as response:
+            text = await response.text()
+
+        LOGGER.debug(f"{data_id} Data received")
+
+        data = list(csv.DictReader(text.splitlines()))
+        LOGGER.debug(f"{data_id} CSV parsed")
+
+        locations = []
+
+        for i, item in enumerate(data):
+            # General info.
+            state = item["State Name"]
+            county = item["County Name"]
+
+            # Ensure country is specified.
+            if county in {"Unassigned", "Unknown"}:
+                continue
+
+            # Date string without "EDT" at end.
+            last_update = " ".join(item["Last Update"].split(" ")[0:2])
+
+            # Append to locations.
+            locations.append(
+                CSBSLocation(
+                    # General info.
+                    i,
+                    state,
+                    county,
+                    # Coordinates.
+                    Coordinates(item["Latitude"], item["Longitude"]),
+                    # Last update (parse as ISO).
+                    datetime.strptime(last_update, "%Y-%m-%d %H:%M").isoformat() + "Z",
+                    # Statistics.
+                    int(item["Confirmed"] or 0),
+                    int(item["Death"] or 0),
+                )
             )
-        )
-    LOGGER.info(f"{data_id} Data normalized")
+        LOGGER.info(f"{data_id} Data normalized")
+        # save the results to distributed cache
+        await load_cache(data_id, locations)
 
     # Return the locations.
     return locations
diff --git a/app/services/location/jhu.py b/app/services/location/jhu.py
index fdfebe7a..06fa3fe0 100644
--- a/app/services/location/jhu.py
+++ b/app/services/location/jhu.py
@@ -41,9 +41,7 @@ async def get(self, loc_id):  # pylint: disable=arguments-differ
 
 
 # Base URL for fetching category.
-BASE_URL = (
-    "https://raw.githubusercontent.com/CSSEGISandData/2019-nCoV/master/csse_covid_19_data/csse_covid_19_time_series/"
-)
+BASE_URL = "https://raw.githubusercontent.com/CSSEGISandData/2019-nCoV/master/csse_covid_19_data/csse_covid_19_time_series/"
 
 
 @cached(cache=TTLCache(maxsize=4, ttl=1800))
diff --git a/app/services/location/nyt.py b/app/services/location/nyt.py
index 3c092500..3ba12d02 100644
--- a/app/services/location/nyt.py
+++ b/app/services/location/nyt.py
@@ -6,6 +6,7 @@
 from asyncache import cached
 from cachetools import TTLCache
 
+from ...caches import check_cache, load_cache
 from ...coordinates import Coordinates
 from ...location.nyt import NYTLocation
 from ...timeline import Timeline
@@ -66,7 +67,7 @@ def get_grouped_locations_dict(data):
     return grouped_locations
 
 
-@cached(cache=TTLCache(maxsize=1, ttl=3600))
+@cached(cache=TTLCache(maxsize=1, ttl=1800))
 async def get_locations():
     """
     Returns a list containing parsed NYT data by US county. The data is cached for 1 hour.
@@ -77,55 +78,64 @@ async def get_locations():
     data_id = "nyt.locations"
     # Request the data.
     LOGGER.info(f"{data_id} Requesting data...")
-    async with httputils.CLIENT_SESSION.get(BASE_URL) as response:
-        text = await response.text()
-
-    LOGGER.debug(f"{data_id} Data received")
-
-    # Parse the CSV.
-    data = list(csv.DictReader(text.splitlines()))
-    LOGGER.debug(f"{data_id} CSV parsed")
-
-    # Group together locations (NYT data ordered by dates not location).
-    grouped_locations = get_grouped_locations_dict(data)
-
-    # The normalized locations.
-    locations = []
-
-    for idx, (county_state, histories) in enumerate(grouped_locations.items()):
-        # Make location history for confirmed and deaths from dates.
-        # List is tuples of (date, amount) in order of increasing dates.
-        confirmed_list = histories["confirmed"]
-        confirmed_history = {date: int(amount or 0) for date, amount in confirmed_list}
-
-        deaths_list = histories["deaths"]
-        deaths_history = {date: int(amount or 0) for date, amount in deaths_list}
-
-        # Normalize the item and append to locations.
-        locations.append(
-            NYTLocation(
-                id=idx,
-                state=county_state[1],
-                county=county_state[0],
-                coordinates=Coordinates(None, None),  # NYT does not provide coordinates
-                last_updated=datetime.utcnow().isoformat() + "Z",  # since last request
-                timelines={
-                    "confirmed": Timeline(
-                        {
-                            datetime.strptime(date, "%Y-%m-%d").isoformat() + "Z": amount
-                            for date, amount in confirmed_history.items()
-                        }
-                    ),
-                    "deaths": Timeline(
-                        {
-                            datetime.strptime(date, "%Y-%m-%d").isoformat() + "Z": amount
-                            for date, amount in deaths_history.items()
-                        }
-                    ),
-                    "recovered": Timeline({}),
-                },
+    # check shared cache
+    cache_results = await check_cache(data_id)
+    if cache_results:
+        LOGGER.info(f"{data_id} using shared cache results")
+        locations = cache_results
+    else:
+        LOGGER.info(f"{data_id} shared cache empty")
+        async with httputils.CLIENT_SESSION.get(BASE_URL) as response:
+            text = await response.text()
+
+        LOGGER.debug(f"{data_id} Data received")
+
+        # Parse the CSV.
+        data = list(csv.DictReader(text.splitlines()))
+        LOGGER.debug(f"{data_id} CSV parsed")
+
+        # Group together locations (NYT data ordered by dates not location).
+        grouped_locations = get_grouped_locations_dict(data)
+
+        # The normalized locations.
+        locations = []
+
+        for idx, (county_state, histories) in enumerate(grouped_locations.items()):
+            # Make location history for confirmed and deaths from dates.
+            # List is tuples of (date, amount) in order of increasing dates.
+            confirmed_list = histories["confirmed"]
+            confirmed_history = {date: int(amount or 0) for date, amount in confirmed_list}
+
+            deaths_list = histories["deaths"]
+            deaths_history = {date: int(amount or 0) for date, amount in deaths_list}
+
+            # Normalize the item and append to locations.
+            locations.append(
+                NYTLocation(
+                    id=idx,
+                    state=county_state[1],
+                    county=county_state[0],
+                    coordinates=Coordinates(None, None),  # NYT does not provide coordinates
+                    last_updated=datetime.utcnow().isoformat() + "Z",  # since last request
+                    timelines={
+                        "confirmed": Timeline(
+                            {
+                                datetime.strptime(date, "%Y-%m-%d").isoformat() + "Z": amount
+                                for date, amount in confirmed_history.items()
+                            }
+                        ),
+                        "deaths": Timeline(
+                            {
+                                datetime.strptime(date, "%Y-%m-%d").isoformat() + "Z": amount
+                                for date, amount in deaths_history.items()
+                            }
+                        ),
+                        "recovered": Timeline({}),
+                    },
+                )
             )
-        )
-    LOGGER.info(f"{data_id} Data normalized")
+        LOGGER.info(f"{data_id} Data normalized")
+        # save the results to distributed cache
+        await load_cache(data_id, locations)
 
     return locations
diff --git a/app/utils/populations.py b/app/utils/populations.py
index 24f0fa4e..c02f15a9 100644
--- a/app/utils/populations.py
+++ b/app/utils/populations.py
@@ -28,7 +28,9 @@ def fetch_populations(save=False):
     # Fetch the countries.
     try:
-        countries = requests.get(GEONAMES_URL, params={"username": "dperic"}, timeout=1.25).json()["geonames"]
+        countries = requests.get(GEONAMES_URL, params={"username": "dperic"}, timeout=1.25).json()[
+            "geonames"
+        ]
         # Go through all the countries and perform the mapping.
         for country in countries:
             mappings.update({country["countryCode"]: int(country["population"]) or None})
diff --git a/pyproject.toml b/pyproject.toml
index b6bc6af6..df1ad168 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.black]
-line-length = 120
+line-length = 100
 target-version = ['py36', 'py37', 'py38']
 include = '\.pyi?$'
 exclude = '''
@@ -23,7 +23,7 @@ multi_line_output = 3
 include_trailing_comma = "True"
 force_grid_wrap = 0
 use_parentheses = "True"
-line_length = 120
+line_length = 100
 
 [tool.pylint.master]
 extension-pkg-whitelist = "pydantic"
@@ -42,7 +42,7 @@ logging-modules = "logging"
 allow-wildcard-with-all = "no"
 [tool.pylint.format]
 indent-after-paren = "4"
-max-line-length = "120" # matches black setting
+max-line-length = "100" # matches black setting
 max-module-lines = "800"
 no-space-check = '''
     trailing-comma,
diff --git a/tasks.py b/tasks.py
index ae1f09cd..0f6d6995 100644
--- a/tasks.py
+++ b/tasks.py
@@ -72,12 +72,21 @@ def test(ctx):
 @invoke.task
 def generate_reqs(ctx):
     """Generate requirements.txt"""
-    reqs = ["pipenv lock -r > requirements.txt", "pipenv lock -r --dev > requirements-dev.txt"]
+    reqs = [
+        "pipenv lock -r > requirements.txt",
+        "pipenv lock -r --dev > requirements-dev.txt",
+    ]
     [ctx.run(req) for req in reqs]
 
 
 @invoke.task
-def docker(ctx, build=False, run=False, tag="covid-tracker-api:latest", name=f"covid-api-{random.randint(0,999)}"):
+def docker(
+    ctx,
+    build=False,
+    run=False,
+    tag="covid-tracker-api:latest",
+    name=f"covid-api-{random.randint(0,999)}",
+):
     """Build and run docker container."""
     if not any([build, run]):
         raise invoke.Exit(message="Specify either --build or --run", code=1)
diff --git a/tests/test_io.py b/tests/test_io.py
index c5d16c3a..ba926011 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -10,7 +10,11 @@
     [
         ("test_file.txt", string.ascii_lowercase, {}),
         ("test_json_file.json", {"a": 0, "b": 1, "c": 2}, {}),
-        ("test_custom_json.json", {"z": -1, "b": 1, "y": -2, "a": 0}, {"indent": 4, "sort_keys": True}),
+        (
+            "test_custom_json.json",
+            {"z": -1, "b": 1, "y": -2, "a": 0},
+            {"indent": 4, "sort_keys": True},
+        ),
     ],
 )
diff --git a/tests/test_location.py b/tests/test_location.py
index 567eddcd..50129a52 100644
--- a/tests/test_location.py
+++ b/tests/test_location.py
@@ -48,7 +48,12 @@ def test_location_class(
 
     # Location.
     location_obj = location.TimelinedLocation(
-        test_id, country, province, coords, now, {"confirmed": confirmed, "deaths": deaths, "recovered": recovered,}
+        test_id,
+        country,
+        province,
+        coords,
+        now,
+        {"confirmed": confirmed, "deaths": deaths, "recovered": recovered,},
     )
 
     assert location_obj.country_code == country_code
diff --git a/tests/test_routes.py b/tests/test_routes.py
index 52d26843..eea153bc 100644
--- a/tests/test_routes.py
+++ b/tests/test_routes.py
@@ -126,7 +126,9 @@ async def test_v2_locations_id(self):
 
         return_data = response.json()
 
-        filepath = "tests/expected_output/v2_{state}_id_{test_id}.json".format(state=state, test_id=test_id)
+        filepath = "tests/expected_output/v2_{state}_id_{test_id}.json".format(
+            state=state, test_id=test_id
+        )
 
         with open(filepath, "r") as file:
             expected_json_output = file.read()
@@ -151,7 +153,9 @@ async def test_v2_locations_id(self):
         ({"source": "jhu", "country_code": "US"}, 404),
     ],
 )
-async def test_locations_status_code(async_api_client, query_params, expected_status, mock_client_session):
+async def test_locations_status_code(
+    async_api_client, query_params, expected_status, mock_client_session
+):
     response = await async_api_client.get("/v2/locations", query_string=query_params)
     print(f"GET {response.url}\n{response}")
 
diff --git a/tests/test_timeline.py b/tests/test_timeline.py
index 056286aa..79612f5a 100644
--- a/tests/test_timeline.py
+++ b/tests/test_timeline.py
@@ -21,7 +21,12 @@ def test_timeline_class():
     assert history_data.latest == 7
 
     # validate order
-    assert list(dict(history_data.timeline).keys()) == ["1/22/20", "1/23/20", "1/24/20", "1/25/20"]
+    assert list(dict(history_data.timeline).keys()) == [
+        "1/22/20",
+        "1/23/20",
+        "1/24/20",
+        "1/25/20",
+    ]
 
     # validate serialize
     check_serialize = {
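
Both app/services/location/csbs.py and app/services/location/nyt.py above import check_cache and load_cache from app.caches, a module that is not included in this patch. Below is a minimal sketch of what such a shared-cache helper could look like. Only the function names and call shapes (await check_cache(data_id) returning the cached value or a falsy result, and await load_cache(data_id, data) storing it) come from the diff; the Redis backend via aioredis 1.x, pickle serialization, the REDIS_URL variable, and the expiry default are assumptions for illustration.

    # app/caches.py -- hypothetical sketch, not part of this patch.
    # Only check_cache/load_cache and their call shapes are taken from the diff;
    # the aioredis + pickle details below are assumptions.
    import os
    import pickle
    from typing import Any, Optional

    import aioredis

    _REDIS: Optional[aioredis.Redis] = None


    async def _get_redis() -> aioredis.Redis:
        """Lazily create a single shared Redis connection pool."""
        global _REDIS
        if _REDIS is None:
            _REDIS = await aioredis.create_redis_pool(os.getenv("REDIS_URL", "redis://localhost"))
        return _REDIS


    async def check_cache(data_id: str) -> Optional[Any]:
        """Return the object cached under data_id, or None when the key is missing."""
        redis = await _get_redis()
        raw = await redis.get(data_id)
        return pickle.loads(raw) if raw is not None else None


    async def load_cache(data_id: str, data: Any, expire: int = 1800) -> None:
        """Pickle data and store it under data_id with a TTL in seconds."""
        redis = await _get_redis()
        await redis.set(data_id, pickle.dumps(data), expire=expire)

With a helper along these lines, worker processes share one fetched copy of the CSBS and NYT datasets instead of each holding its own in-process TTLCache entry, which appears to be the memory saving the "Cache mem optimize" title refers to.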