From f8fd116f3c6f1d99db54e83612451c4c7efa1a68 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Mon, 29 Jul 2019 17:28:23 -0300 Subject: [PATCH 01/31] fix some type hints after running mypy --- src/arche/readers/items.py | 5 +++-- src/arche/tools/api.py | 5 +++-- src/arche/tools/helpers.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/arche/readers/items.py b/src/arche/readers/items.py index c33ac24..f2253dd 100755 --- a/src/arche/readers/items.py +++ b/src/arche/readers/items.py @@ -47,6 +47,7 @@ def origin_column_name(self, new: str) -> str: for column in self.df.columns: if column in new: return column + return '' @classmethod def from_df(cls, df: pd.DataFrame): @@ -66,7 +67,7 @@ def __init__( ): self.key = key self._count = count - self._limit = None + self._limit: Any = None self.filters = filters raw = self.fetch_data() df = pd.DataFrame(list(raw)) @@ -104,7 +105,7 @@ def __init__( filters: Optional[api.Filters] = None, ): self.start_index = start_index - self.start: int = f"{key}/{start_index}" + self.start: str = f"{key}/{start_index}" self._job: Job = None super().__init__(key, count, filters) diff --git a/src/arche/tools/api.py b/src/arche/tools/api.py index cd56b2a..4a10be1 100755 --- a/src/arche/tools/api.py +++ b/src/arche/tools/api.py @@ -3,7 +3,7 @@ import math from multiprocessing import Pool import time -from typing import Dict, List, Tuple, Optional, Union +from typing import Dict, List, Tuple, Optional, Union, cast from arche.tools import helpers from dateutil.relativedelta import relativedelta @@ -144,7 +144,8 @@ def get_items_with_pool( A numpy array of items """ active_connections_limit = 10 - processes_count = min(max(helpers.cpus_count(), workers), active_connections_limit) + processes_count: int = cast( + int, min(max(helpers.cpus_count(), workers), active_connections_limit)) batch_size = math.ceil(count / processes_count) start_idxs = range(start_index, start_index + count, batch_size) diff --git a/src/arche/tools/helpers.py b/src/arche/tools/helpers.py index 03308f8..847d6cb 100755 --- a/src/arche/tools/helpers.py +++ b/src/arche/tools/helpers.py @@ -76,7 +76,7 @@ def is_number(s): return True -def cpus_count() -> int: +def cpus_count() -> Optional[int]: try: return len(os.sched_getaffinity(0)) except AttributeError: From fb53167bdd1de3bb0a702944e2abb4461f3b2418 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Mon, 29 Jul 2019 17:43:00 -0300 Subject: [PATCH 02/31] fix type hints at readers/schema and rules/price --- src/arche/readers/schema.py | 11 ++++++----- src/arche/rules/price.py | 16 +++++++++------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/arche/readers/schema.py b/src/arche/readers/schema.py index 06b8883..99e1af6 100755 --- a/src/arche/readers/schema.py +++ b/src/arche/readers/schema.py @@ -3,7 +3,7 @@ import json import os import pprint -from typing import Dict, List, Union +from typing import Dict, List, Union, DefaultDict, cast import urllib.request from arche.tools import s3 @@ -58,17 +58,18 @@ def __repr__(self): def get_enums(self) -> List[str]: enums = [] for k, v in self.raw["properties"].items(): - if "enum" in v.keys(): + if "enum" in v.keys(): # type: ignore enums.append(k) return enums @staticmethod def get_tags(schema: RawSchema) -> TaggedFields: - tagged_fields = defaultdict(list) + tagged_fields: DefaultDict[str, List[str]] = defaultdict(list) for key, value in schema["properties"].items(): - property_tags = value.get("tag", []) + property_tags = value.get("tag", []) # type: ignore 
if property_tags: - tagged_fields = Schema.get_field_tags(property_tags, key, tagged_fields) + tagged_fields = cast( + DefaultDict[str, List[str]], Schema.get_field_tags(property_tags, key, tagged_fields)) return tagged_fields @classmethod diff --git a/src/arche/rules/price.py b/src/arche/rules/price.py index 086d33f..c3b0a0f 100755 --- a/src/arche/rules/price.py +++ b/src/arche/rules/price.py @@ -1,3 +1,5 @@ +from typing import Any + from arche.readers.schema import TaggedFields from arche.rules.result import Result, Outcome from arche.tools.helpers import is_number, ratio_diff @@ -74,13 +76,13 @@ def compare_prices_for_same_urls( missing and new `product_url_field` tagged fields. """ result = Result("Compare Prices For Same Urls") - url_field = tagged_fields.get("product_url_field") + url_field: Any = tagged_fields.get("product_url_field") if not url_field: result.add_info(Outcome.SKIPPED) return result url_field = url_field[0] - price_field = tagged_fields.get("product_price_field") + price_field: Any = tagged_fields.get("product_price_field") source_df = source_df.dropna(subset=[url_field]) target_df = target_df.dropna(subset=[url_field]) @@ -152,8 +154,8 @@ def compare_names_for_same_urls( compare `name_field` field""" result = Result("Compare Names Per Url") - url_field = tagged_fields.get("product_url_field") - name_field = tagged_fields.get("name_field") + url_field: Any = tagged_fields.get("product_url_field") + name_field: Any = tagged_fields.get("name_field") if not url_field or not name_field: result.add_info(Outcome.SKIPPED) return result @@ -199,14 +201,14 @@ def compare_prices_for_same_names( source_df: pd.DataFrame, target_df: pd.DataFrame, tagged_fields: TaggedFields ): result = Result("Compare Prices For Same Names") - name_field = tagged_fields.get("name_field") + name_field: Any = tagged_fields.get("name_field") if not name_field: result.add_info(Outcome.SKIPPED) return result name_field = name_field[0] - product_url_field = tagged_fields.get("product_url_field") + product_url_field: Any = tagged_fields.get("product_url_field") if not product_url_field: result.add_info("product_url_field tag is not set") else: @@ -242,7 +244,7 @@ def compare_prices_for_same_names( result.add_info(f"{len(same_names)} same names in both jobs") price_tag = "product_price_field" - price_field = tagged_fields.get(price_tag) + price_field: Any = tagged_fields.get(price_tag) if not price_field: result.add_info("product_price_field tag is not set") return result From 429faf794827d33ba6716a9884f4fb3a755d54ab Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Mon, 29 Jul 2019 17:48:36 -0300 Subject: [PATCH 03/31] fix type hints at rules module --- src/arche/rules/duplicates.py | 4 ++-- src/arche/rules/others.py | 3 ++- src/arche/rules/result.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/arche/rules/duplicates.py b/src/arche/rules/duplicates.py index f9c4270..5ba5492 100755 --- a/src/arche/rules/duplicates.py +++ b/src/arche/rules/duplicates.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Set from arche.readers.schema import TaggedFields from arche.rules.result import Result, Outcome @@ -18,7 +18,7 @@ def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result: result.add_info(Outcome.SKIPPED) return result - err_keys = set() + err_keys: Set = set() for field in unique_fields: result.items_count = df[field].count() duplicates = df[df.duplicated(field, keep=False)][[field]] diff --git a/src/arche/rules/others.py 
b/src/arche/rules/others.py index bc9ec53..95d0509 100755 --- a/src/arche/rules/others.py +++ b/src/arche/rules/others.py @@ -1,5 +1,6 @@ import codecs import re +from typing import Set from arche.rules.result import Outcome, Result import numpy as np @@ -90,7 +91,7 @@ def garbage_symbols(df: pd.DataFrame) -> Result: ) errors = {} - row_keys = set() + row_keys: Set = set() rule_result = Result("Garbage Symbols", items_count=len(df)) for column in tqdm_notebook( diff --git a/src/arche/rules/result.py b/src/arche/rules/result.py index 05c09f3..1785c5e 100755 --- a/src/arche/rules/result.py +++ b/src/arche/rules/result.py @@ -225,7 +225,7 @@ def build_stack_bar_data(values_counts: List[pd.Series]) -> List[go.Bar]: Returns: A list of Bar objects. """ - data = [] + data: List[go.Bar] = [] for vc in values_counts: data = data + [ go.Bar( From 52ce0803ed7bb98a5145d911a395649a53ac1d97 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Mon, 29 Jul 2019 18:15:25 -0300 Subject: [PATCH 04/31] fix type hints at dqr --- src/arche/data_quality_report.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arche/data_quality_report.py b/src/arche/data_quality_report.py index 3aa8e4c..0e6b160 100755 --- a/src/arche/data_quality_report.py +++ b/src/arche/data_quality_report.py @@ -1,6 +1,6 @@ from io import StringIO import json -from typing import Optional +from typing import Optional, List from arche.figures import tables @@ -36,7 +36,7 @@ def __init__( """ self.schema = schema self.report = report - self.figures = [] + self.figures: Optional[List] = [] self.appendix = self.create_appendix(self.schema.raw) self.create_figures(items) self.plot_to_notebook() From fbf5484bf0782b5568817376b9b6a34011deb8c4 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Mon, 29 Jul 2019 19:45:52 -0300 Subject: [PATCH 05/31] adding more type annotation --- src/arche/data_quality_report.py | 10 +++++----- src/arche/report.py | 2 +- src/arche/tools/schema.py | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/arche/data_quality_report.py b/src/arche/data_quality_report.py index 0e6b160..54ce601 100755 --- a/src/arche/data_quality_report.py +++ b/src/arche/data_quality_report.py @@ -36,7 +36,7 @@ def __init__( """ self.schema = schema self.report = report - self.figures: Optional[List] = [] + self.figures: List = [] self.appendix = self.create_appendix(self.schema.raw) self.create_figures(items) self.plot_to_notebook() @@ -44,7 +44,7 @@ def __init__( if bucket: self.save_report_to_bucket( project_id=items.key.split("/")[0], - spider=items.job.metadata.get("spider"), + spider=items.job.metadata.get("spider"), # type: ignore bucket=bucket, ) @@ -63,7 +63,7 @@ def create_figures(self, items: CloudItems): no_of_price_warns = price_was_now_result.err_items_count no_of_checked_price_items = price_was_now_result.items_count - crawlera_user = api.get_crawlera_user(items.job) + crawlera_user = api.get_crawlera_user(items.job) # type: ignore validation_errors = self.report.results.get( "JSON Schema Validation", @@ -77,7 +77,7 @@ def create_figures(self, items: CloudItems): ) quality_estimation, field_accuracy = generate_quality_estimation( - items.job, + items.job, # type: ignore crawlera_user, validation_errors, name_url_dups.err_items_count, @@ -91,7 +91,7 @@ def create_figures(self, items: CloudItems): ) self.score_table(quality_estimation, field_accuracy) - self.job_summary_table(items.job) + self.job_summary_table(items.job) # type: ignore self.rules_summary_table( items.df, 
validation_errors, diff --git a/src/arche/report.py b/src/arche/report.py index da34131..78dd894 100755 --- a/src/arche/report.py +++ b/src/arche/report.py @@ -38,7 +38,7 @@ def write_summaries(self) -> None: def write_summary(cls, result: Result) -> None: cls.write_rule_name(result.name) if not result.messages: - cls.write_rule_outcome(Outcome.PASSED, Level.INFO) + cls.write_rule_outcome(Outcome.PASSED, Level.INFO) #type: ignore for level, rule_msgs in result.messages.items(): for rule_msg in rule_msgs: cls.write_rule_outcome(rule_msg.summary, level) diff --git a/src/arche/tools/schema.py b/src/arche/tools/schema.py index 3b607dc..bd55e90 100755 --- a/src/arche/tools/schema.py +++ b/src/arche/tools/schema.py @@ -1,6 +1,6 @@ from collections import defaultdict import random -from typing import Any, Deque, Dict, List, Optional +from typing import Any, Deque, Dict, List, Optional, DefaultDict from arche.readers.items import RawItems from arche.readers.schema import Schema @@ -92,7 +92,7 @@ def fast_validate( Returns: A dictionary of errors with message and item keys """ - errors = defaultdict(set) + errors: DefaultDict = defaultdict(set) validate = fastjsonschema.compile(schema) for i, raw_item in enumerate( @@ -113,7 +113,7 @@ def full_validate( """This function uses jsonschema validator which returns all found error per item. See `fast_validate()` for arguments descriptions. """ - errors = defaultdict(set) + errors: DefaultDict = defaultdict(set) validator = validators.validator_for(schema)(schema) validator.format_checker = FormatChecker() @@ -134,7 +134,7 @@ def format_validation_message( error_msg: str, path: Deque, schema_path: Deque, validator: str ) -> str: str_path = "/".join(p for p in path if isinstance(p, str)) - schema_path = "/".join(p for p in schema_path) + schema_path = "/".join(p for p in schema_path) # type: ignore if validator == "anyOf": if str_path: From c4af9e9d42ecc42d44625a2d195886bc6a2e3a93 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Mon, 16 Sep 2019 18:44:39 -0300 Subject: [PATCH 06/31] fix mypy typing - partial commit --- Pipfile | 2 ++ src/arche/readers/items.py | 2 +- src/arche/readers/schema.py | 21 ++++++++++++--------- src/arche/report.py | 2 +- src/arche/tools/api.py | 3 ++- tox.ini | 12 +++++++++++- 6 files changed, 29 insertions(+), 13 deletions(-) diff --git a/Pipfile b/Pipfile index 1ff4a44..64dad58 100755 --- a/Pipfile +++ b/Pipfile @@ -14,6 +14,7 @@ fastjsonschema = "*" perfect-jsonschema = "*" tqdm = "*" ipywidgets = "*" +mypy = "*" [dev-packages] jupyterlab = "*" @@ -42,6 +43,7 @@ pyarrow = "*" cufflinks = "*" tables = "*" nb-black = "*" +pylint = "*" [requires] python_version = "3.7" diff --git a/src/arche/readers/items.py b/src/arche/readers/items.py index f2253dd..e187241 100755 --- a/src/arche/readers/items.py +++ b/src/arche/readers/items.py @@ -47,7 +47,7 @@ def origin_column_name(self, new: str) -> str: for column in self.df.columns: if column in new: return column - return '' + return "" @classmethod def from_df(cls, df: pd.DataFrame): diff --git a/src/arche/readers/schema.py b/src/arche/readers/schema.py index ad7c637..359abbb 100755 --- a/src/arche/readers/schema.py +++ b/src/arche/readers/schema.py @@ -1,10 +1,8 @@ from collections import defaultdict from enum import Enum import json -import os import pprint -from typing import Dict, List, Union, DefaultDict, cast -import urllib.request +from typing import Dict, List, Union, DefaultDict, cast, Tuple, Any, ItemsView from arche.tools import s3 import perfect_jsonschema @@ -44,19 
+42,24 @@ def __repr__(self): def get_enums(self) -> List[str]: enums = [] - for k, v in self.raw["properties"].items(): - if "enum" in v.keys(): # type: ignore + # self.raw["properties"].items() has type: + # ItemsView[str, Union[str, bool, int, float, None, list[Any]]] + properties = cast(ItemsView[str, Dict[str, Any]], self.raw["properties"].items()) + for k, v in properties: + if "enum" in v.keys(): enums.append(k) return enums @staticmethod def get_tags(schema: RawSchema) -> TaggedFields: tagged_fields: DefaultDict[str, List[str]] = defaultdict(list) - for key, value in schema["properties"].items(): - property_tags = value.get("tag", []) # type: ignore + # schema["properties"].items() has type: + # ItemsView[str, Union[str, bool, int, float, None, list[Any]]] + properties = cast(ItemsView[str, Dict[str, Any]], schema["properties"].items()) + for key, value in properties: + property_tags = value.get("tag", []) if property_tags: - tagged_fields = cast( - DefaultDict[str, List[str]], Schema.get_field_tags(property_tags, key, tagged_fields)) + tagged_fields: Dict[str, List[str]] = Schema.get_field_tags(property_tags, key, tagged_fields) return tagged_fields @classmethod diff --git a/src/arche/report.py b/src/arche/report.py index 9d1afd3..5669a24 100755 --- a/src/arche/report.py +++ b/src/arche/report.py @@ -38,7 +38,7 @@ def write_summaries(self) -> None: def write_summary(cls, result: Result) -> None: cls.write_rule_name(result.name) if not result.messages: - cls.write_rule_outcome(Outcome.PASSED, Level.INFO) #type: ignore + cls.write_rule_outcome(Outcome.PASSED, Level.INFO) for level, rule_msgs in result.messages.items(): for rule_msg in rule_msgs: cls.write_rule_outcome(rule_msg.summary, level) diff --git a/src/arche/tools/api.py b/src/arche/tools/api.py index 4a10be1..6c181ba 100755 --- a/src/arche/tools/api.py +++ b/src/arche/tools/api.py @@ -145,7 +145,8 @@ def get_items_with_pool( """ active_connections_limit = 10 processes_count: int = cast( - int, min(max(helpers.cpus_count(), workers), active_connections_limit)) + int, min(max(helpers.cpus_count(), workers), active_connections_limit) + ) batch_size = math.ceil(count / processes_count) start_idxs = range(start_index, start_index + count, batch_size) diff --git a/tox.ini b/tox.ini index 077c79c..5aa06e9 100755 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37, pep8 +envlist = py37, pep8, mypy skipsdist = false [testenv] @@ -22,6 +22,16 @@ extras = docs commands = sphinx-build docs/source docs/_build -b linkcheck -b html +[testenv:mypy] +deps = + mypy +commands = mypy --ignore-missing-imports src/arche tests + +[mypy] +deps = + mypy +commands = mypy src/arche + [flake8] select = C,E,F,W,I,D,B,B9 ignore = W503, E741, E501, E203, I101 From f11f278559801e9bec49ba53197131d1782ab885 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Tue, 17 Sep 2019 14:07:34 -0300 Subject: [PATCH 07/31] fix typing at schema.py --- src/arche/readers/schema.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/arche/readers/schema.py b/src/arche/readers/schema.py index 359abbb..d8722f3 100755 --- a/src/arche/readers/schema.py +++ b/src/arche/readers/schema.py @@ -42,9 +42,11 @@ def __repr__(self): def get_enums(self) -> List[str]: enums = [] - # self.raw["properties"].items() has type: + # self.raw["properties"].items() has type: # ItemsView[str, Union[str, bool, int, float, None, list[Any]]] - properties = cast(ItemsView[str, Dict[str, Any]], self.raw["properties"].items()) + properties = cast( + 
ItemsView[str, Dict[str, Any]], self.raw["properties"].items() + ) for k, v in properties: if "enum" in v.keys(): enums.append(k) @@ -52,19 +54,19 @@ def get_enums(self) -> List[str]: @staticmethod def get_tags(schema: RawSchema) -> TaggedFields: - tagged_fields: DefaultDict[str, List[str]] = defaultdict(list) - # schema["properties"].items() has type: + tagged_fields: Dict[str, List[str]] = defaultdict(list) + # schema["properties"].items() has type: # ItemsView[str, Union[str, bool, int, float, None, list[Any]]] properties = cast(ItemsView[str, Dict[str, Any]], schema["properties"].items()) for key, value in properties: property_tags = value.get("tag", []) if property_tags: - tagged_fields: Dict[str, List[str]] = Schema.get_field_tags(property_tags, key, tagged_fields) + tagged_fields = Schema.get_field_tags(property_tags, key, tagged_fields) return tagged_fields @classmethod def get_field_tags( - cls, tags: List[str], field: str, tagged_fields: defaultdict + cls, tags: List[str], field: str, tagged_fields: Dict ) -> TaggedFields: tags = cls.parse_tag(tags) if not tags: From 06aacf881c57ceb99f4fbe7cd509377259d4183c Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Tue, 17 Sep 2019 17:10:02 -0300 Subject: [PATCH 08/31] fix typing --- src/arche/readers/schema.py | 2 +- src/arche/report.py | 6 ++++-- src/arche/rules/price.py | 18 +++++++++--------- src/arche/rules/result.py | 8 ++++++-- tests/conftest.py | 4 ++-- tests/test_arche.py | 3 ++- 6 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/arche/readers/schema.py b/src/arche/readers/schema.py index d8722f3..8368b91 100755 --- a/src/arche/readers/schema.py +++ b/src/arche/readers/schema.py @@ -2,7 +2,7 @@ from enum import Enum import json import pprint -from typing import Dict, List, Union, DefaultDict, cast, Tuple, Any, ItemsView +from typing import Dict, List, Union, cast, Any, ItemsView from arche.tools import s3 import perfect_jsonschema diff --git a/src/arche/report.py b/src/arche/report.py index 5669a24..e457f02 100755 --- a/src/arche/report.py +++ b/src/arche/report.py @@ -1,5 +1,5 @@ from functools import partial -from typing import Dict +from typing import Dict, Union from arche import SH_URL from arche.rules.result import Level, Outcome, Result @@ -44,7 +44,9 @@ def write_summary(cls, result: Result) -> None: cls.write_rule_outcome(rule_msg.summary, level) @classmethod - def write_rule_outcome(cls, outcome: str, level: Level = Level.INFO) -> None: + def write_rule_outcome( + cls, outcome: Union[str, Outcome], level: Level = Level.INFO + ) -> None: if isinstance(outcome, Outcome): outcome = outcome.name msg = outcome diff --git a/src/arche/rules/price.py b/src/arche/rules/price.py index 517a5e5..732677f 100755 --- a/src/arche/rules/price.py +++ b/src/arche/rules/price.py @@ -110,11 +110,11 @@ def compare_prices_for_same_urls( result.add_info(f"{len(same_urls)} same urls in both jobs") diff_prices_count = 0 - price_field = tagged_fields.get("product_price_field") - if not price_field: + price_field_tag = tagged_fields.get("product_price_field") + if not price_field_tag: result.add_info("product_price_field tag is not set") else: - price_field = price_field[0] + price_field = price_field_tag[0] detailed_messages = [] for url in same_urls: if url.strip() != "nan": @@ -202,12 +202,12 @@ def compare_prices_for_same_names( source_df: pd.DataFrame, target_df: pd.DataFrame, tagged_fields: TaggedFields ): result = Result("Compare Prices For Same Names") - name_field = tagged_fields.get("name_field") - if not name_field: + 
name_field_tag = tagged_fields.get("name_field") + if not name_field_tag: result.add_info(Outcome.SKIPPED) return result - name_field = name_field[0] + name_field = name_field_tag[0] source_df = source_df[source_df[name_field].notnull()] target_df = target_df[target_df[name_field].notnull()] @@ -234,12 +234,12 @@ def compare_prices_for_same_names( result.add_info(f"{len(same_names)} same names in both jobs") price_tag = "product_price_field" - price_field = tagged_fields.get(price_tag) - if not price_field: + price_field_tag = tagged_fields.get(price_tag) + if not price_field_tag: result.add_info("product_price_field tag is not set") return result - price_field = price_field[0] + price_field = price_field_tag[0] count = 0 detailed_messages = [] diff --git a/src/arche/rules/result.py b/src/arche/rules/result.py index 9b6dc78..2c219b7 100755 --- a/src/arche/rules/result.py +++ b/src/arche/rules/result.py @@ -2,7 +2,7 @@ from enum import Enum import itertools import math -from typing import Dict, List, Optional, Set, Union +from typing import Dict, List, Optional, Set, Union, cast import IPython import numpy as np @@ -40,7 +40,11 @@ class Message: summary: str detailed: Optional[str] = None errors: Optional[Dict[str, Set]] = None - _err_keys: Optional[Set[Union[str, int]]] = field(default_factory=set) + + # expression "field(default_factory=set)" has type "Set[_T]", so we have to cast + _err_keys: Optional[Set[Union[str, int]]] = cast( + Optional[Set[Union[str, int]]], field(default_factory=set) + ) @property def err_keys(self): diff --git a/tests/conftest.py b/tests/conftest.py index 1c53809..8837e97 100755 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -211,8 +211,8 @@ def create_result( items_count: Optional[int] = None, ) -> Result: result = Result(rule_name) - for level, messages in messages.items(): - for message in messages: + for level, messages_list in messages.items(): + for message in messages_list: result.add_message(level, *message) if stats: diff --git a/tests/test_arche.py b/tests/test_arche.py index fa2c736..7cebe32 100755 --- a/tests/test_arche.py +++ b/tests/test_arche.py @@ -1,3 +1,4 @@ +from typing import Dict, List from arche import arche, SH_URL from arche.arche import Arche from arche.rules.result import Level @@ -34,7 +35,7 @@ def test_arche_df(get_df): pd.testing.assert_frame_equal(a.target_items.df, get_df) -schema_dummies = [{"properties": {"name": {}}}, {"properties": {"url": {}}}] +schema_dummies: List[Dict] = [{"properties": {"name": {}}}, {"properties": {"url": {}}}] def test_schema(): From 9b01ae1433a0c6c2751594efe22397a41e7bf360 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Tue, 17 Sep 2019 17:26:20 -0300 Subject: [PATCH 09/31] pep8 --- tests/test_arche.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_arche.py b/tests/test_arche.py index 7cebe32..235de2a 100755 --- a/tests/test_arche.py +++ b/tests/test_arche.py @@ -1,4 +1,5 @@ from typing import Dict, List + from arche import arche, SH_URL from arche.arche import Arche from arche.rules.result import Level From 3c43406f184295de02d1051aac4ff457fe8b9b6c Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Wed, 25 Sep 2019 21:32:29 -0300 Subject: [PATCH 10/31] fix mypy at arche.py --- src/arche/arche.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/arche/arche.py b/src/arche/arche.py index 3e67ee5..996e7bb 100755 --- a/src/arche/arche.py +++ b/src/arche/arche.py @@ -1,6 +1,6 @@ from functools import lru_cache import logging -from typing import Iterable, 
Optional, Union +from typing import Iterable, Optional, Union, cast from arche.data_quality_report import DataQualityReport from arche.readers.items import Items, CollectionItems, JobItems, RawItems @@ -106,15 +106,15 @@ def schema(self, schema_source): def get_items( source: Union[str, pd.DataFrame, RawItems], count: Optional[int], - start: Union[str, int], + start: Optional[str], filters: Optional[api.Filters], ) -> Items: if isinstance(source, pd.DataFrame): return Items.from_df(source) elif isinstance(source, Iterable) and not isinstance(source, str): - return Items.from_array(source) + return Items.from_array(cast(RawItems, source)) elif helpers.is_job_key(source): - return JobItems(source, count, start or 0, filters) + return JobItems(source, count, int(start or 0), filters) elif helpers.is_collection_key(source): return CollectionItems(source, count, start, filters) else: @@ -140,7 +140,7 @@ def run_all_rules(self): self.run_schema_rules() def data_quality_report(self, bucket: Optional[str] = None): - if helpers.is_collection_key(self.source): + if helpers.is_collection_key(str(self.source or '')): raise ValueError("Collections are not supported") if not self.schema: raise ValueError("Schema is empty") From 82de89a1a57bb6a8f111754e277b95d3d935b64f Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Thu, 26 Sep 2019 19:53:37 -0300 Subject: [PATCH 11/31] fix pep8; improve mypy typinh --- src/arche/arche.py | 2 +- src/arche/data_quality_report.py | 14 +++++++------- src/arche/readers/schema.py | 4 ++-- src/arche/tools/bitbucket.py | 2 +- src/arche/tools/schema.py | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/arche/arche.py b/src/arche/arche.py index 996e7bb..7ab2375 100755 --- a/src/arche/arche.py +++ b/src/arche/arche.py @@ -140,7 +140,7 @@ def run_all_rules(self): self.run_schema_rules() def data_quality_report(self, bucket: Optional[str] = None): - if helpers.is_collection_key(str(self.source or '')): + if helpers.is_collection_key(str(self.source or "")): raise ValueError("Collections are not supported") if not self.schema: raise ValueError("Schema is empty") diff --git a/src/arche/data_quality_report.py b/src/arche/data_quality_report.py index 54ce601..9f70aa2 100755 --- a/src/arche/data_quality_report.py +++ b/src/arche/data_quality_report.py @@ -5,7 +5,7 @@ from arche.figures import tables from arche.quality_estimation_algorithm import generate_quality_estimation -from arche.readers.items import CloudItems +from arche.readers.items import JobItems from arche.readers.schema import Schema from arche.report import Report import arche.rules.coverage as coverage_rules @@ -23,7 +23,7 @@ class DataQualityReport: def __init__( self, - items: CloudItems, + items: JobItems, schema: Schema, report: Report, bucket: Optional[str] = None, @@ -44,11 +44,11 @@ def __init__( if bucket: self.save_report_to_bucket( project_id=items.key.split("/")[0], - spider=items.job.metadata.get("spider"), # type: ignore + spider=items.job.metadata.get("spider"), bucket=bucket, ) - def create_figures(self, items: CloudItems): + def create_figures(self, items: JobItems): name_url_dups = self.report.results.get( "Duplicates By **name_field, product_url_field** Tags", duplicate_rules.find_by_name_url(items.df, self.schema.tags), @@ -63,7 +63,7 @@ def create_figures(self, items: CloudItems): no_of_price_warns = price_was_now_result.err_items_count no_of_checked_price_items = price_was_now_result.items_count - crawlera_user = api.get_crawlera_user(items.job) # type: ignore + crawlera_user 
= api.get_crawlera_user(items.job) validation_errors = self.report.results.get( "JSON Schema Validation", @@ -77,7 +77,7 @@ def create_figures(self, items: CloudItems): ) quality_estimation, field_accuracy = generate_quality_estimation( - items.job, # type: ignore + items.job, crawlera_user, validation_errors, name_url_dups.err_items_count, @@ -91,7 +91,7 @@ def create_figures(self, items: CloudItems): ) self.score_table(quality_estimation, field_accuracy) - self.job_summary_table(items.job) # type: ignore + self.job_summary_table(items.job) self.rules_summary_table( items.df, validation_errors, diff --git a/src/arche/readers/schema.py b/src/arche/readers/schema.py index 8368b91..7d6b08c 100755 --- a/src/arche/readers/schema.py +++ b/src/arche/readers/schema.py @@ -2,7 +2,7 @@ from enum import Enum import json import pprint -from typing import Dict, List, Union, cast, Any, ItemsView +from typing import Dict, List, Union, cast, Any, ItemsView, Set from arche.tools import s3 import perfect_jsonschema @@ -66,7 +66,7 @@ def get_tags(schema: RawSchema) -> TaggedFields: @classmethod def get_field_tags( - cls, tags: List[str], field: str, tagged_fields: Dict + cls, tags: Set[Any], field: str, tagged_fields: Dict ) -> TaggedFields: tags = cls.parse_tag(tags) if not tags: diff --git a/src/arche/tools/bitbucket.py b/src/arche/tools/bitbucket.py index 8ea5b8f..117db05 100644 --- a/src/arche/tools/bitbucket.py +++ b/src/arche/tools/bitbucket.py @@ -23,7 +23,7 @@ def prepare_request(url: str) -> urllib.request.Request: def convert_to_api_url(url: str, netloc: str, api_netloc: str) -> str: """Support both regular and raw URLs""" try: - user, repo, path = re.search( + user, repo, path = re.search( # type: ignore f"https://{netloc}/(.*?)/(.*?)/(?:raw|src)/(.*)", url ).groups() except AttributeError: diff --git a/src/arche/tools/schema.py b/src/arche/tools/schema.py index bd55e90..cb910c7 100755 --- a/src/arche/tools/schema.py +++ b/src/arche/tools/schema.py @@ -134,7 +134,7 @@ def format_validation_message( error_msg: str, path: Deque, schema_path: Deque, validator: str ) -> str: str_path = "/".join(p for p in path if isinstance(p, str)) - schema_path = "/".join(p for p in schema_path) # type: ignore + schema_path = "/".join(p for p in schema_path) if validator == "anyOf": if str_path: From 26066000084db063f5e120bfba8d0c7ac23ab8f0 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Thu, 26 Sep 2019 20:08:21 -0300 Subject: [PATCH 12/31] update typing at tools/schema.py --- src/arche/tools/schema.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/arche/tools/schema.py b/src/arche/tools/schema.py index cb910c7..e295462 100755 --- a/src/arche/tools/schema.py +++ b/src/arche/tools/schema.py @@ -1,9 +1,9 @@ from collections import defaultdict import random -from typing import Any, Deque, Dict, List, Optional, DefaultDict +from typing import Any, Deque, Dict, List, Optional, DefaultDict, Union from arche.readers.items import RawItems -from arche.readers.schema import Schema +from arche.readers.schema import Schema, RawSchema from arche.schema_definitions import extension from arche.tools import api, helpers import fastjsonschema @@ -26,7 +26,7 @@ def basic_json_schema(data_source: str, items_numbers: List[int] = None) -> Sche def create_json_schema( source_key: str, items_numbers: Optional[List[int]] = None -) -> Schema: +) -> Union[str, RawSchema]: if helpers.is_collection_key(source_key): store = api.get_collection(source_key) items_count = store.count() @@ -58,7 +58,7 @@ 
def create_json_schema( return infer_schema(samples) -def infer_schema(samples: List[Dict[str, Any]]) -> Schema: +def infer_schema(samples: List[Dict[str, Any]]) -> Union[str, RawSchema]: builder = SchemaBuilder("http://json-schema.org/draft-07/schema#") for sample in samples: builder.add_object(sample) @@ -134,13 +134,13 @@ def format_validation_message( error_msg: str, path: Deque, schema_path: Deque, validator: str ) -> str: str_path = "/".join(p for p in path if isinstance(p, str)) - schema_path = "/".join(p for p in schema_path) + schema_path_str: str = "/".join(p for p in schema_path) if validator == "anyOf": if str_path: - return f"'{str_path}' does not satisfy 'schema/{schema_path}'" + return f"'{str_path}' does not satisfy 'schema/{schema_path_str}'" else: - return f"'schema/{schema_path}' failed" + return f"'schema/{schema_path_str}' failed" if "Additional properties are not allowed" in error_msg: return error_msg From f092fe849b23310c5b0e2658b3e0d3012e5bda7e Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Fri, 27 Sep 2019 11:47:21 -0300 Subject: [PATCH 13/31] updating typing --- Pipfile | 1 - src/arche/rules/json_schema.py | 2 +- src/arche/tools/schema.py | 4 ++-- tests/conftest.py | 4 ++-- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Pipfile b/Pipfile index 64dad58..26cbf83 100755 --- a/Pipfile +++ b/Pipfile @@ -43,7 +43,6 @@ pyarrow = "*" cufflinks = "*" tables = "*" nb-black = "*" -pylint = "*" [requires] python_version = "3.7" diff --git a/src/arche/rules/json_schema.py b/src/arche/rules/json_schema.py index 8458aaa..043a32a 100755 --- a/src/arche/rules/json_schema.py +++ b/src/arche/rules/json_schema.py @@ -25,7 +25,7 @@ def validate( err_items = len(set(itertools.chain.from_iterable(errors.values()))) if errors: result.add_error( - f"{err_items} ({err_items/len(raw_items):.0%}) items have {len(errors)} errors", + f"{err_items} ({err_items/len(list(raw_items)):.0%}) items have {len(errors)} errors", # noqa errors=errors, ) return result diff --git a/src/arche/tools/schema.py b/src/arche/tools/schema.py index e295462..95dbc77 100755 --- a/src/arche/tools/schema.py +++ b/src/arche/tools/schema.py @@ -79,7 +79,7 @@ def set_item_no(items_count: int) -> List[int]: def fast_validate( - schema: Schema, raw_items: RawItems, keys: pd.Index + schema: RawSchema, raw_items: RawItems, keys: pd.Index ) -> Dict[str, set]: """Verify items one by one. It stops after the first error in an item in most cases. Faster than jsonschema validation @@ -108,7 +108,7 @@ def fast_validate( def full_validate( - schema: Schema, raw_items: RawItems, keys: pd.Index + schema: RawSchema, raw_items: RawItems, keys: pd.Index ) -> Dict[str, set]: """This function uses jsonschema validator which returns all found error per item. See `fast_validate()` for arguments descriptions. 
diff --git a/tests/conftest.py b/tests/conftest.py index 8837e97..512fbb3 100755 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,6 @@ from copy import deepcopy from itertools import zip_longest -from typing import Dict, Iterable, List, Optional +from typing import Dict, List, Optional from arche.readers.items import CollectionItems, JobItems from arche.rules.result import Level, Message, Result, Stat @@ -50,7 +50,7 @@ def get_df(): class Job: def __init__( self, - items: Optional[Iterable] = None, + items: Optional[List[Dict]] = None, metadata: Optional[Dict] = None, stats: Optional[Dict] = None, key: str = "112358/13/21", From 6affcdf7bcaece87f0b2b0ca8c42cfde9cc9f886 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Fri, 27 Sep 2019 18:48:41 -0300 Subject: [PATCH 14/31] remove cast --- src/arche/rules/result.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/arche/rules/result.py b/src/arche/rules/result.py index 2c219b7..8dd1109 100755 --- a/src/arche/rules/result.py +++ b/src/arche/rules/result.py @@ -2,7 +2,7 @@ from enum import Enum import itertools import math -from typing import Dict, List, Optional, Set, Union, cast +from typing import Dict, List, Optional, Set, Union import IPython import numpy as np @@ -42,9 +42,7 @@ class Message: errors: Optional[Dict[str, Set]] = None # expression "field(default_factory=set)" has type "Set[_T]", so we have to cast - _err_keys: Optional[Set[Union[str, int]]] = cast( - Optional[Set[Union[str, int]]], field(default_factory=set) - ) + _err_keys: Set[Union[str, int]] = field(default_factory=set) @property def err_keys(self): From 80a56289237c7a6e3040efaf2792186a7d0e398e Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Fri, 27 Sep 2019 19:03:09 -0300 Subject: [PATCH 15/31] fixing type --- src/arche/readers/items.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arche/readers/items.py b/src/arche/readers/items.py index e187241..1905eaa 100755 --- a/src/arche/readers/items.py +++ b/src/arche/readers/items.py @@ -67,7 +67,7 @@ def __init__( ): self.key = key self._count = count - self._limit: Any = None + self._limit: int self.filters = filters raw = self.fetch_data() df = pd.DataFrame(list(raw)) From fa1597431636b11dd2cbf73d1c2e534bfc852886 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Sun, 29 Sep 2019 12:07:44 -0300 Subject: [PATCH 16/31] fix typing at price.py --- src/arche/rules/price.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/arche/rules/price.py b/src/arche/rules/price.py index 732677f..dab1b9d 100755 --- a/src/arche/rules/price.py +++ b/src/arche/rules/price.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Optional, List from arche.readers.schema import TaggedFields from arche.rules.result import Result, Outcome @@ -77,12 +77,12 @@ def compare_prices_for_same_urls( missing and new `product_url_field` tagged fields. 
""" result = Result("Compare Prices For Same Urls") - url_field: Any = tagged_fields.get("product_url_field") - if not url_field: + url_field_list: Optional[List[str]] = tagged_fields.get("product_url_field") + if not url_field_list: result.add_info(Outcome.SKIPPED) return result - url_field = url_field[0] + url_field = url_field_list[0] source_df = source_df.dropna(subset=[url_field]) target_df = target_df.dropna(subset=[url_field]) From e9c47d383391e4fa7f84e068e4f0794692c21cad Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Sun, 29 Sep 2019 12:08:28 -0300 Subject: [PATCH 17/31] fix typing at price.py --- src/arche/tools/api.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/arche/tools/api.py b/src/arche/tools/api.py index 6c181ba..756d011 100755 --- a/src/arche/tools/api.py +++ b/src/arche/tools/api.py @@ -144,9 +144,7 @@ def get_items_with_pool( A numpy array of items """ active_connections_limit = 10 - processes_count: int = cast( - int, min(max(helpers.cpus_count(), workers), active_connections_limit) - ) + processes_count: int = min(max(helpers.cpus_count(), workers), active_connections_limit) batch_size = math.ceil(count / processes_count) start_idxs = range(start_index, start_index + count, batch_size) From 95588625c5e21bd2b091d6d0328184a544f1e8ee Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Sun, 29 Sep 2019 12:09:27 -0300 Subject: [PATCH 18/31] fix typing at api.py --- src/arche/tools/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arche/tools/api.py b/src/arche/tools/api.py index 756d011..86c32b9 100755 --- a/src/arche/tools/api.py +++ b/src/arche/tools/api.py @@ -144,7 +144,7 @@ def get_items_with_pool( A numpy array of items """ active_connections_limit = 10 - processes_count: int = min(max(helpers.cpus_count(), workers), active_connections_limit) + processes_count: int = int(min(max(helpers.cpus_count(), workers), active_connections_limit) or 0) batch_size = math.ceil(count / processes_count) start_idxs = range(start_index, start_index + count, batch_size) From d6e60278dcf1b05cf35c76f765860102ae3db212 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Sun, 29 Sep 2019 13:02:06 -0300 Subject: [PATCH 19/31] fix tests and pep8 --- src/arche/readers/items.py | 4 +++- src/arche/tools/api.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/arche/readers/items.py b/src/arche/readers/items.py index 1905eaa..392c591 100755 --- a/src/arche/readers/items.py +++ b/src/arche/readers/items.py @@ -67,7 +67,7 @@ def __init__( ): self.key = key self._count = count - self._limit: int + self._limit: int = 0 self.filters = filters raw = self.fetch_data() df = pd.DataFrame(list(raw)) @@ -107,6 +107,7 @@ def __init__( self.start_index = start_index self.start: str = f"{key}/{start_index}" self._job: Job = None + self._limit: int = 0 super().__init__(key, count, filters) @property @@ -155,6 +156,7 @@ def __init__( filters: Optional[api.Filters] = None, ): self.start = start + self._limit: int = 0 super().__init__(key, count, filters) @property diff --git a/src/arche/tools/api.py b/src/arche/tools/api.py index 86c32b9..f97d210 100755 --- a/src/arche/tools/api.py +++ b/src/arche/tools/api.py @@ -3,7 +3,7 @@ import math from multiprocessing import Pool import time -from typing import Dict, List, Tuple, Optional, Union, cast +from typing import Dict, List, Tuple, Optional, Union from arche.tools import helpers from dateutil.relativedelta import relativedelta @@ -144,7 +144,9 @@ def get_items_with_pool( A numpy 
array of items """ active_connections_limit = 10 - processes_count: int = int(min(max(helpers.cpus_count(), workers), active_connections_limit) or 0) + processes_count: int = int( + min(max(helpers.cpus_count(), workers), active_connections_limit) or 0 + ) batch_size = math.ceil(count / processes_count) start_idxs = range(start_index, start_index + count, batch_size) From e432ece4e0339cde19c635beaa9f6509a56e1d17 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Mon, 30 Sep 2019 17:34:24 -0300 Subject: [PATCH 20/31] fix typing at conftest --- tests/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 512fbb3..f858d53 100755 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,9 @@ from copy import deepcopy from itertools import zip_longest -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple from arche.readers.items import CollectionItems, JobItems -from arche.rules.result import Level, Message, Result, Stat +from arche.rules.result import Level, Result, Stat import numpy as np import pandas as pd import pytest @@ -206,7 +206,7 @@ def get_collection_items(mocker): def create_result( rule_name: str, - messages: Dict[Level, List[Message]], + messages: Dict[Level, List[Tuple]], stats: Optional[List[Stat]] = None, items_count: Optional[int] = None, ) -> Result: From 1f4611799748f9736b3d5b77cbe96c51111a2b44 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Thu, 3 Oct 2019 09:56:08 -0300 Subject: [PATCH 21/31] refactor typing at schema.py --- src/arche/tools/schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arche/tools/schema.py b/src/arche/tools/schema.py index 95dbc77..bb6999d 100755 --- a/src/arche/tools/schema.py +++ b/src/arche/tools/schema.py @@ -26,7 +26,7 @@ def basic_json_schema(data_source: str, items_numbers: List[int] = None) -> Sche def create_json_schema( source_key: str, items_numbers: Optional[List[int]] = None -) -> Union[str, RawSchema]: +) -> RawSchema: if helpers.is_collection_key(source_key): store = api.get_collection(source_key) items_count = store.count() @@ -58,7 +58,7 @@ def create_json_schema( return infer_schema(samples) -def infer_schema(samples: List[Dict[str, Any]]) -> Union[str, RawSchema]: +def infer_schema(samples: List[Dict[str, Any]]) -> RawSchema: builder = SchemaBuilder("http://json-schema.org/draft-07/schema#") for sample in samples: builder.add_object(sample) From 2fe5ba9df1e85163c932bf7ff2daf686216d977d Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Thu, 3 Oct 2019 10:06:12 -0300 Subject: [PATCH 22/31] fix typing at price.py and result.py --- src/arche/rules/price.py | 10 +++++----- src/arche/rules/result.py | 2 -- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/arche/rules/price.py b/src/arche/rules/price.py index dab1b9d..de20ed7 100755 --- a/src/arche/rules/price.py +++ b/src/arche/rules/price.py @@ -155,14 +155,14 @@ def compare_names_for_same_urls( compare `name_field` field""" result = Result("Compare Names Per Url") - url_field: Any = tagged_fields.get("product_url_field") - name_field: Any = tagged_fields.get("name_field") - if not url_field or not name_field: + url_field_list: Optional[List[str]] = tagged_fields.get("product_url_field") + name_field_list: Optional[List[str]] = tagged_fields.get("name_field") + if not url_field_list or not name_field_list: result.add_info(Outcome.SKIPPED) return result - name_field = name_field[0] - url_field = url_field[0] + name_field: str = 
name_field_list[0] + url_field: str = url_field_list[0] diff_names_count = 0 same_urls = source_df[(source_df[url_field].isin(target_df[url_field].values))][ diff --git a/src/arche/rules/result.py b/src/arche/rules/result.py index 8dd1109..87c5282 100755 --- a/src/arche/rules/result.py +++ b/src/arche/rules/result.py @@ -40,8 +40,6 @@ class Message: summary: str detailed: Optional[str] = None errors: Optional[Dict[str, Set]] = None - - # expression "field(default_factory=set)" has type "Set[_T]", so we have to cast _err_keys: Set[Union[str, int]] = field(default_factory=set) @property From 774f8c88725889f0291757e3b459a2cfbd87b630 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Thu, 3 Oct 2019 10:20:25 -0300 Subject: [PATCH 23/31] refactor --- src/arche/readers/schema.py | 2 +- src/arche/rules/price.py | 2 +- src/arche/tools/schema.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arche/readers/schema.py b/src/arche/readers/schema.py index 7d6b08c..fd48cdd 100755 --- a/src/arche/readers/schema.py +++ b/src/arche/readers/schema.py @@ -59,7 +59,7 @@ def get_tags(schema: RawSchema) -> TaggedFields: # ItemsView[str, Union[str, bool, int, float, None, list[Any]]] properties = cast(ItemsView[str, Dict[str, Any]], schema["properties"].items()) for key, value in properties: - property_tags = value.get("tag", []) + property_tags = value.get("tag") if property_tags: tagged_fields = Schema.get_field_tags(property_tags, key, tagged_fields) return tagged_fields diff --git a/src/arche/rules/price.py b/src/arche/rules/price.py index de20ed7..7835c0d 100755 --- a/src/arche/rules/price.py +++ b/src/arche/rules/price.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, List +from typing import Optional, List from arche.readers.schema import TaggedFields from arche.rules.result import Result, Outcome diff --git a/src/arche/tools/schema.py b/src/arche/tools/schema.py index bb6999d..dc0c09a 100755 --- a/src/arche/tools/schema.py +++ b/src/arche/tools/schema.py @@ -1,6 +1,6 @@ from collections import defaultdict import random -from typing import Any, Deque, Dict, List, Optional, DefaultDict, Union +from typing import Any, Deque, Dict, List, Optional, DefaultDict from arche.readers.items import RawItems from arche.readers.schema import Schema, RawSchema From 797079eb292bffa325ac30b7dff9cd96739208ed Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Thu, 3 Oct 2019 10:35:25 -0300 Subject: [PATCH 24/31] refactor --- src/arche/arche.py | 2 +- src/arche/readers/items.py | 10 ---------- tests/readers/test_items.py | 8 -------- 3 files changed, 1 insertion(+), 19 deletions(-) diff --git a/src/arche/arche.py b/src/arche/arche.py index 7ab2375..2bc55f5 100755 --- a/src/arche/arche.py +++ b/src/arche/arche.py @@ -140,7 +140,7 @@ def run_all_rules(self): self.run_schema_rules() def data_quality_report(self, bucket: Optional[str] = None): - if helpers.is_collection_key(str(self.source or "")): + if helpers.is_collection_key(str(self.source)): raise ValueError("Collections are not supported") if not self.schema: raise ValueError("Schema is empty") diff --git a/src/arche/readers/items.py b/src/arche/readers/items.py index 392c591..1a69ce1 100755 --- a/src/arche/readers/items.py +++ b/src/arche/readers/items.py @@ -41,14 +41,6 @@ def categorize(df: pd.DataFrame) -> pd.DataFrame: except TypeError: continue - def origin_column_name(self, new: str) -> str: - if new in self.df.columns: - return new - for column in self.df.columns: - if column in new: - return column - return "" - @classmethod def 
from_df(cls, df: pd.DataFrame): return cls(raw=np.array(df.to_dict("records")), df=df) @@ -107,7 +99,6 @@ def __init__( self.start_index = start_index self.start: str = f"{key}/{start_index}" self._job: Job = None - self._limit: int = 0 super().__init__(key, count, filters) @property @@ -156,7 +147,6 @@ def __init__( filters: Optional[api.Filters] = None, ): self.start = start - self._limit: int = 0 super().__init__(key, count, filters) @property diff --git a/tests/readers/test_items.py b/tests/readers/test_items.py index a23569b..40a4d06 100755 --- a/tests/readers/test_items.py +++ b/tests/readers/test_items.py @@ -6,14 +6,6 @@ import pytest -@pytest.mark.parametrize( - "name, expected_name", [("price", "price"), ("name_0", "name")] -) -def test_origin_column_name(get_cloud_items, name, expected_name): - items = Items.from_df(pd.DataFrame(get_cloud_items)) - assert items.origin_column_name(name) == expected_name - - @pytest.mark.parametrize( "df, expected_raw, expected_df", [ From 5d07463159d8706d7dc4260af70a1e4e7421e80d Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Fri, 4 Oct 2019 15:14:02 -0300 Subject: [PATCH 25/31] update Pipfile --- Pipfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pipfile b/Pipfile index 26cbf83..8e1cec6 100755 --- a/Pipfile +++ b/Pipfile @@ -14,7 +14,6 @@ fastjsonschema = "*" perfect-jsonschema = "*" tqdm = "*" ipywidgets = "*" -mypy = "*" [dev-packages] jupyterlab = "*" @@ -43,6 +42,7 @@ pyarrow = "*" cufflinks = "*" tables = "*" nb-black = "*" +mypy = "*" [requires] python_version = "3.7" From d073642ea913dfef6d05782ad9d2d5d08c1bafb5 Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Fri, 4 Oct 2019 15:35:35 -0300 Subject: [PATCH 26/31] refactoring --- src/arche/tools/api.py | 2 +- src/arche/tools/bitbucket.py | 9 ++++----- tox.ini | 5 ----- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/arche/tools/api.py b/src/arche/tools/api.py index f97d210..748ad3f 100755 --- a/src/arche/tools/api.py +++ b/src/arche/tools/api.py @@ -145,7 +145,7 @@ def get_items_with_pool( """ active_connections_limit = 10 processes_count: int = int( - min(max(helpers.cpus_count(), workers), active_connections_limit) or 0 + min(max(helpers.cpus_count() or 0, workers), active_connections_limit) ) batch_size = math.ceil(count / processes_count) diff --git a/src/arche/tools/bitbucket.py b/src/arche/tools/bitbucket.py index 117db05..03c86ab 100644 --- a/src/arche/tools/bitbucket.py +++ b/src/arche/tools/bitbucket.py @@ -22,11 +22,10 @@ def prepare_request(url: str) -> urllib.request.Request: def convert_to_api_url(url: str, netloc: str, api_netloc: str) -> str: """Support both regular and raw URLs""" - try: - user, repo, path = re.search( # type: ignore - f"https://{netloc}/(.*?)/(.*?)/(?:raw|src)/(.*)", url - ).groups() - except AttributeError: + match = re.search(f"https://{netloc}/(.*?)/(.*?)/(?:raw|src)/(.*)", url) + if match: + user, repo, path = match.groups() + else: raise ValueError("Not a valid bitbucket URL: {url}") return f"https://{api_netloc}/2.0/repositories/{user}/{repo}/src/{path}" diff --git a/tox.ini b/tox.ini index 5aa06e9..256cfff 100755 --- a/tox.ini +++ b/tox.ini @@ -27,11 +27,6 @@ deps = mypy commands = mypy --ignore-missing-imports src/arche tests -[mypy] -deps = - mypy -commands = mypy src/arche - [flake8] select = C,E,F,W,I,D,B,B9 ignore = W503, E741, E501, E203, I101 From d42dbe1fd5a33a3e6758d1a77cbd62a947ab735c Mon Sep 17 00:00:00 2001 From: Anderson Berg Date: Fri, 4 Oct 2019 15:48:42 -0300 Subject: [PATCH 27/31] 
refactor typing --- src/arche/readers/schema.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/arche/readers/schema.py b/src/arche/readers/schema.py index fd48cdd..222ac3a 100755 --- a/src/arche/readers/schema.py +++ b/src/arche/readers/schema.py @@ -2,7 +2,7 @@ from enum import Enum import json import pprint -from typing import Dict, List, Union, cast, Any, ItemsView, Set +from typing import Dict, List, Union, cast, Any, ItemsView, Set, DefaultDict from arche.tools import s3 import perfect_jsonschema @@ -54,20 +54,20 @@ def get_enums(self) -> List[str]: @staticmethod def get_tags(schema: RawSchema) -> TaggedFields: - tagged_fields: Dict[str, List[str]] = defaultdict(list) - # schema["properties"].items() has type: - # ItemsView[str, Union[str, bool, int, float, None, list[Any]]] - properties = cast(ItemsView[str, Dict[str, Any]], schema["properties"].items()) - for key, value in properties: - property_tags = value.get("tag") - if property_tags: - tagged_fields = Schema.get_field_tags(property_tags, key, tagged_fields) - return tagged_fields + tagged_fields: DefaultDict[str, List[str]] = defaultdict(list) + for key, value in schema["properties"].items(): + if isinstance(value, Dict): + property_tags = value.get("tag") + if property_tags: + tagged_fields = Schema.get_field_tags( + property_tags, key, tagged_fields + ) + return dict(tagged_fields) @classmethod def get_field_tags( - cls, tags: Set[Any], field: str, tagged_fields: Dict - ) -> TaggedFields: + cls, tags: Set[Any], field: str, tagged_fields: DefaultDict + ) -> DefaultDict[str, List[str]]: tags = cls.parse_tag(tags) if not tags: raise ValueError( From 0f3145f9550b3d4582f2ddc95753578779b5f093 Mon Sep 17 00:00:00 2001 From: manycoding Date: Fri, 4 Oct 2019 16:08:10 -0300 Subject: [PATCH 28/31] Add to travis, fix request import in mypy 0.730 --- .travis.yml | 2 +- src/arche/tools/bitbucket.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9b654e9..7e59224 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ install: - pip install tox-travis pip -U --no-cache-dir script: - tox - - tox -e docs + - tox -e pep8, mypy, docs after_success: - tox -e codecov deploy: diff --git a/src/arche/tools/bitbucket.py b/src/arche/tools/bitbucket.py index 03c86ab..90ffba2 100644 --- a/src/arche/tools/bitbucket.py +++ b/src/arche/tools/bitbucket.py @@ -2,7 +2,7 @@ import os import re from typing import Dict -import urllib +from urllib.request import Request NETLOC = os.getenv("BITBUCKET_NETLOC") or "bitbucket.org" @@ -11,13 +11,13 @@ PASS = os.getenv("BITBUCKET_PASSWORD") -def prepare_request(url: str) -> urllib.request.Request: +def prepare_request(url: str) -> Request: if not USER or not PASS: msg = "Credentials not found: `BITBUCKET_USER` or `BITBUCKET_PASSWORD` not set." 
raise ValueError(msg) api_url = convert_to_api_url(url, NETLOC, API_NETLOC) - return urllib.request.Request(api_url, headers=get_auth_header(USER, PASS)) + return Request(api_url, headers=get_auth_header(USER, PASS)) def convert_to_api_url(url: str, netloc: str, api_netloc: str) -> str: From e9b66048d7249b4d6ed1eda22f64c9b8d1856a1c Mon Sep 17 00:00:00 2001 From: manycoding Date: Fri, 4 Oct 2019 16:13:31 -0300 Subject: [PATCH 29/31] Remove redundant casting --- src/arche/tools/api.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/arche/tools/api.py b/src/arche/tools/api.py index 748ad3f..db19984 100755 --- a/src/arche/tools/api.py +++ b/src/arche/tools/api.py @@ -144,9 +144,10 @@ def get_items_with_pool( A numpy array of items """ active_connections_limit = 10 - processes_count: int = int( - min(max(helpers.cpus_count() or 0, workers), active_connections_limit) + processes_count: int = min( + max(helpers.cpus_count() or 0, workers), active_connections_limit ) + batch_size = math.ceil(count / processes_count) start_idxs = range(start_index, start_index + count, batch_size) From 6771ab8a2749058d94f9da28f4494d45cebc0c0b Mon Sep 17 00:00:00 2001 From: manycoding Date: Fri, 4 Oct 2019 16:19:10 -0300 Subject: [PATCH 30/31] Another redundant casting --- src/arche/readers/schema.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/arche/readers/schema.py b/src/arche/readers/schema.py index 222ac3a..7d78bb6 100755 --- a/src/arche/readers/schema.py +++ b/src/arche/readers/schema.py @@ -2,7 +2,7 @@ from enum import Enum import json import pprint -from typing import Dict, List, Union, cast, Any, ItemsView, Set, DefaultDict +from typing import Dict, List, Union, Any, Set, DefaultDict from arche.tools import s3 import perfect_jsonschema @@ -41,14 +41,9 @@ def __repr__(self): return pprint.pformat(self.raw) def get_enums(self) -> List[str]: - enums = [] - # self.raw["properties"].items() has type: - # ItemsView[str, Union[str, bool, int, float, None, list[Any]]] - properties = cast( - ItemsView[str, Dict[str, Any]], self.raw["properties"].items() - ) - for k, v in properties: - if "enum" in v.keys(): + enums: List[str] = [] + for k, v in self.raw["properties"].items(): + if isinstance(v, Dict) and "enum" in v.keys(): enums.append(k) return enums From 561c2ca7d8025474c68ab0a84091692e97dacaa3 Mon Sep 17 00:00:00 2001 From: manycoding Date: Fri, 4 Oct 2019 16:23:59 -0300 Subject: [PATCH 31/31] Spaces are bad --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 7e59224..0b68ddb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ install: - pip install tox-travis pip -U --no-cache-dir script: - tox - - tox -e pep8, mypy, docs + - tox -e pep8,mypy,docs after_success: - tox -e codecov deploy:
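Note: the typing patterns that recur through this series, condensed into a small standalone sketch. Everything below is illustrative only and not the Arche API: an Optional[int] return for a cpu-count helper, "or 0" to narrow that Optional at the call site, an explicit DefaultDict annotation so mypy keeps the value type, and isinstance()/cast() narrowing in place of "# type: ignore".

# Illustrative sketch of the mypy patterns used in the patches above; names are made up.
from collections import defaultdict
import os
from typing import DefaultDict, Dict, List, Optional, cast


def cpu_count() -> Optional[int]:
    # os.sched_getaffinity is not available on every platform, so the helper
    # may fall back to os.cpu_count(), which itself may return None.
    try:
        return len(os.sched_getaffinity(0))
    except AttributeError:
        return os.cpu_count()


def workers(requested: int, limit: int = 10) -> int:
    # "cpu_count() or 0" narrows Optional[int] to int without a cast.
    return min(max(cpu_count() or 0, requested), limit)


def tag_fields(properties: Dict[str, object]) -> DefaultDict[str, List[str]]:
    # Annotating the defaultdict up front keeps mypy from widening it to
    # DefaultDict[Any, Any] and losing the value type.
    tagged: DefaultDict[str, List[str]] = defaultdict(list)
    for field, value in properties.items():
        # isinstance() narrows the object-typed JSON value; cast() documents the
        # expected shape of "tag" instead of silencing mypy with "# type: ignore".
        if isinstance(value, dict):
            for tag in cast(List[str], value.get("tag", [])):
                tagged[tag].append(field)
    return tagged


if __name__ == "__main__":
    print(workers(4))
    print(tag_fields({"name": {"tag": ["name_field"]}, "price": {"type": "number"}}))

This is the same shape of fix applied to tools/helpers.py, tools/api.py and readers/schema.py in the series above.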