Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mypy #152

Merged
merged 32 commits into from
Oct 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
f8fd116
fix some type hints after running mypy
andersonberg Jul 29, 2019
fb53167
fix type hints at readers/schema and rules/price
andersonberg Jul 29, 2019
429faf7
fix type hints at rules module
andersonberg Jul 29, 2019
52ce080
fix type hints at dqr
andersonberg Jul 29, 2019
fbf5484
adding more type annotation
andersonberg Jul 29, 2019
e3b30f4
merge with master
andersonberg Sep 13, 2019
c4af9e9
fix mypy typing - partial commit
andersonberg Sep 16, 2019
f11f278
fix typing at schema.py
andersonberg Sep 17, 2019
06aacf8
fix typing
andersonberg Sep 17, 2019
9b01ae1
pep8
andersonberg Sep 17, 2019
3c43406
fix mypy at arche.py
andersonberg Sep 26, 2019
82de89a
fix pep8; improve mypy typing
andersonberg Sep 26, 2019
2606600
update typing at tools/schema.py
andersonberg Sep 26, 2019
f092fe8
updating typing
andersonberg Sep 27, 2019
6affcdf
remove cast
andersonberg Sep 27, 2019
80a5628
fixing type
andersonberg Sep 27, 2019
fa15974
fix typing at price.py
andersonberg Sep 29, 2019
e9c47d3
fix typing at price.py
andersonberg Sep 29, 2019
9558862
fix typing at api.py
andersonberg Sep 29, 2019
d6e6027
fix tests and pep8
andersonberg Sep 29, 2019
e432ece
fix typing at conftest
andersonberg Sep 30, 2019
1f46117
refactor typing at schema.py
andersonberg Oct 3, 2019
2fe5ba9
fix typing at price.py and result.py
andersonberg Oct 3, 2019
774f8c8
refactor
andersonberg Oct 3, 2019
797079e
refactor
andersonberg Oct 3, 2019
5d07463
update Pipfile
andersonberg Oct 4, 2019
d073642
refactoring
andersonberg Oct 4, 2019
d42dbe1
refactor typing
andersonberg Oct 4, 2019
0f3145f
Add to travis, fix request import in mypy 0.730
manycoding Oct 4, 2019
e9b6604
Remove redundant casting
manycoding Oct 4, 2019
6771ab8
Another redundant casting
manycoding Oct 4, 2019
561c2ca
Spaces are bad
manycoding Oct 4, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ install:
- pip install tox-travis pip -U --no-cache-dir
script:
- tox
- tox -e docs
- tox -e pep8,mypy,docs
after_success:
- tox -e codecov
deploy:
Expand Down
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ pyarrow = "*"
cufflinks = "*"
tables = "*"
nb-black = "*"
mypy = "*"

[requires]
python_version = "3.7"
Expand Down
10 changes: 5 additions & 5 deletions src/arche/arche.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from functools import lru_cache
import logging
from typing import Iterable, Optional, Union
from typing import Iterable, Optional, Union, cast

from arche.data_quality_report import DataQualityReport
from arche.readers.items import Items, CollectionItems, JobItems, RawItems
Expand Down Expand Up @@ -106,15 +106,15 @@ def schema(self, schema_source):
def get_items(
source: Union[str, pd.DataFrame, RawItems],
count: Optional[int],
start: Union[str, int],
start: Optional[str],
manycoding marked this conversation as resolved.
Show resolved Hide resolved
filters: Optional[api.Filters],
) -> Items:
if isinstance(source, pd.DataFrame):
return Items.from_df(source)
elif isinstance(source, Iterable) and not isinstance(source, str):
return Items.from_array(source)
return Items.from_array(cast(RawItems, source))
manycoding marked this conversation as resolved.
Show resolved Hide resolved
elif helpers.is_job_key(source):
return JobItems(source, count, start or 0, filters)
return JobItems(source, count, int(start or 0), filters)
elif helpers.is_collection_key(source):
return CollectionItems(source, count, start, filters)
else:
Expand All @@ -140,7 +140,7 @@ def run_all_rules(self):
self.run_schema_rules()

def data_quality_report(self, bucket: Optional[str] = None):
if helpers.is_collection_key(self.source):
if helpers.is_collection_key(str(self.source)):
raise ValueError("Collections are not supported")
if not self.schema:
raise ValueError("Schema is empty")
Expand Down
10 changes: 5 additions & 5 deletions src/arche/data_quality_report.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from io import StringIO
import json
from typing import Optional
from typing import Optional, List


from arche.figures import tables
from arche.quality_estimation_algorithm import generate_quality_estimation
from arche.readers.items import CloudItems
from arche.readers.items import JobItems
from arche.readers.schema import Schema
from arche.report import Report
import arche.rules.coverage as coverage_rules
Expand All @@ -23,7 +23,7 @@
class DataQualityReport:
def __init__(
self,
items: CloudItems,
items: JobItems,
schema: Schema,
report: Report,
bucket: Optional[str] = None,
Expand All @@ -36,7 +36,7 @@ def __init__(
"""
self.schema = schema
self.report = report
self.figures = []
self.figures: List = []
self.appendix = self.create_appendix(self.schema.raw)
self.create_figures(items)
self.plot_to_notebook()
Expand All @@ -48,7 +48,7 @@ def __init__(
bucket=bucket,
)

def create_figures(self, items: CloudItems):
def create_figures(self, items: JobItems):
name_url_dups = self.report.results.get(
"Duplicates By **name_field, product_url_field** Tags",
duplicate_rules.find_by_name_url(items.df, self.schema.tags),
Expand Down
11 changes: 2 additions & 9 deletions src/arche/readers/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,6 @@ def categorize(df: pd.DataFrame) -> pd.DataFrame:
except TypeError:
continue

def origin_column_name(self, new: str) -> str:
if new in self.df.columns:
return new
for column in self.df.columns:
if column in new:
return column

@classmethod
def from_df(cls, df: pd.DataFrame):
return cls(raw=np.array(df.to_dict("records")), df=df)
Expand All @@ -66,7 +59,7 @@ def __init__(
):
self.key = key
self._count = count
self._limit = None
self._limit: int = 0
self.filters = filters
raw = self.fetch_data()
df = pd.DataFrame(list(raw))
Expand Down Expand Up @@ -104,7 +97,7 @@ def __init__(
filters: Optional[api.Filters] = None,
):
self.start_index = start_index
self.start: int = f"{key}/{start_index}"
self.start: str = f"{key}/{start_index}"
self._job: Job = None
super().__init__(key, count, filters)

Expand Down
23 changes: 13 additions & 10 deletions src/arche/readers/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from enum import Enum
import json
import pprint
from typing import Dict, List, Union
from typing import Dict, List, Union, Any, Set, DefaultDict

from arche.tools import s3
import perfect_jsonschema
Expand Down Expand Up @@ -41,25 +41,28 @@ def __repr__(self):
return pprint.pformat(self.raw)

def get_enums(self) -> List[str]:
enums = []
enums: List[str] = []
for k, v in self.raw["properties"].items():
if "enum" in v.keys():
if isinstance(v, Dict) and "enum" in v.keys():
enums.append(k)
return enums

@staticmethod
def get_tags(schema: RawSchema) -> TaggedFields:
tagged_fields = defaultdict(list)
tagged_fields: DefaultDict[str, List[str]] = defaultdict(list)
for key, value in schema["properties"].items():
property_tags = value.get("tag", [])
if property_tags:
tagged_fields = Schema.get_field_tags(property_tags, key, tagged_fields)
return tagged_fields
if isinstance(value, Dict):
property_tags = value.get("tag")
if property_tags:
tagged_fields = Schema.get_field_tags(
property_tags, key, tagged_fields
)
return dict(tagged_fields)

@classmethod
def get_field_tags(
cls, tags: List[str], field: str, tagged_fields: defaultdict
) -> TaggedFields:
cls, tags: Set[Any], field: str, tagged_fields: DefaultDict
) -> DefaultDict[str, List[str]]:
tags = cls.parse_tag(tags)
if not tags:
raise ValueError(
Expand Down
6 changes: 4 additions & 2 deletions src/arche/report.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from functools import partial
from typing import Dict
from typing import Dict, Union

from arche import SH_URL
from arche.rules.result import Level, Outcome, Result
Expand Down Expand Up @@ -44,7 +44,9 @@ def write_summary(cls, result: Result) -> None:
cls.write_rule_outcome(rule_msg.summary, level)

@classmethod
def write_rule_outcome(cls, outcome: str, level: Level = Level.INFO) -> None:
def write_rule_outcome(
cls, outcome: Union[str, Outcome], level: Level = Level.INFO
) -> None:
if isinstance(outcome, Outcome):
outcome = outcome.name
msg = outcome
Expand Down
4 changes: 2 additions & 2 deletions src/arche/rules/duplicates.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Set

from arche.readers.schema import TaggedFields
from arche.rules.result import Result, Outcome
Expand All @@ -18,7 +18,7 @@ def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
result.add_info(Outcome.SKIPPED)
return result

err_keys = set()
err_keys: Set = set()
for field in unique_fields:
result.items_count = df[field].count()
duplicates = df[df.duplicated(field, keep=False)][[field]]
Expand Down
2 changes: 1 addition & 1 deletion src/arche/rules/json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def validate(
err_items = len(set(itertools.chain.from_iterable(errors.values())))
if errors:
result.add_error(
f"{err_items} ({err_items/len(raw_items):.0%}) items have {len(errors)} errors",
f"{err_items} ({err_items/len(list(raw_items)):.0%}) items have {len(errors)} errors", # noqa
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

noqa?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so pep8 doesn't complain about the line length

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
f"{err_items} ({err_items/len(list(raw_items)):.0%}) items have {len(errors)} errors", # noqa
f"{err_items} ({err_items/len(list(raw_items)):.0%}) "
f"items have {len(errors)} errors",

errors=errors,
)
return result
Expand Down
3 changes: 2 additions & 1 deletion src/arche/rules/others.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import codecs
import re
from typing import Set

from arche.rules.result import Outcome, Result
import numpy as np
Expand Down Expand Up @@ -90,7 +91,7 @@ def garbage_symbols(df: pd.DataFrame) -> Result:
)

errors = {}
row_keys = set()
row_keys: Set = set()
rule_result = Result("Garbage Symbols", items_count=len(df))

for column in tqdm_notebook(
Expand Down
36 changes: 19 additions & 17 deletions src/arche/rules/price.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Optional, List

from arche.readers.schema import TaggedFields
from arche.rules.result import Result, Outcome
from arche.tools.helpers import is_number, ratio_diff
Expand Down Expand Up @@ -75,12 +77,12 @@ def compare_prices_for_same_urls(
missing and new `product_url_field` tagged fields.
"""
result = Result("Compare Prices For Same Urls")
url_field = tagged_fields.get("product_url_field")
if not url_field:
url_field_list: Optional[List[str]] = tagged_fields.get("product_url_field")
if not url_field_list:
result.add_info(Outcome.SKIPPED)
return result

url_field = url_field[0]
url_field = url_field_list[0]

source_df = source_df.dropna(subset=[url_field])
target_df = target_df.dropna(subset=[url_field])
Expand Down Expand Up @@ -108,11 +110,11 @@ def compare_prices_for_same_urls(
result.add_info(f"{len(same_urls)} same urls in both jobs")

diff_prices_count = 0
price_field = tagged_fields.get("product_price_field")
if not price_field:
price_field_tag = tagged_fields.get("product_price_field")
if not price_field_tag:
result.add_info("product_price_field tag is not set")
else:
price_field = price_field[0]
price_field = price_field_tag[0]
detailed_messages = []
for url in same_urls:
if url.strip() != "nan":
Expand Down Expand Up @@ -153,14 +155,14 @@ def compare_names_for_same_urls(
compare `name_field` field"""

result = Result("Compare Names Per Url")
url_field = tagged_fields.get("product_url_field")
name_field = tagged_fields.get("name_field")
if not url_field or not name_field:
url_field_list: Optional[List[str]] = tagged_fields.get("product_url_field")
name_field_list: Optional[List[str]] = tagged_fields.get("name_field")
if not url_field_list or not name_field_list:
result.add_info(Outcome.SKIPPED)
return result

name_field = name_field[0]
url_field = url_field[0]
name_field: str = name_field_list[0]
url_field: str = url_field_list[0]
diff_names_count = 0

same_urls = source_df[(source_df[url_field].isin(target_df[url_field].values))][
Expand Down Expand Up @@ -200,12 +202,12 @@ def compare_prices_for_same_names(
source_df: pd.DataFrame, target_df: pd.DataFrame, tagged_fields: TaggedFields
):
result = Result("Compare Prices For Same Names")
name_field = tagged_fields.get("name_field")
if not name_field:
name_field_tag = tagged_fields.get("name_field")
if not name_field_tag:
result.add_info(Outcome.SKIPPED)
return result

name_field = name_field[0]
name_field = name_field_tag[0]
source_df = source_df[source_df[name_field].notnull()]
target_df = target_df[target_df[name_field].notnull()]

Expand All @@ -232,12 +234,12 @@ def compare_prices_for_same_names(
result.add_info(f"{len(same_names)} same names in both jobs")

price_tag = "product_price_field"
price_field = tagged_fields.get(price_tag)
if not price_field:
price_field_tag = tagged_fields.get(price_tag)
if not price_field_tag:
result.add_info("product_price_field tag is not set")
return result

price_field = price_field[0]
price_field = price_field_tag[0]
count = 0

detailed_messages = []
Expand Down
4 changes: 2 additions & 2 deletions src/arche/rules/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class Message:
summary: str
detailed: Optional[str] = None
errors: Optional[Dict[str, Set]] = None
_err_keys: Optional[Set[Union[str, int]]] = field(default_factory=set)
_err_keys: Set[Union[str, int]] = field(default_factory=set)

@property
def err_keys(self):
Expand Down Expand Up @@ -246,7 +246,7 @@ def build_stack_bar_data(values_counts: List[pd.Series]) -> List[go.Bar]:
Returns:
A list of Bar objects.
"""
data = []
data: List[go.Bar] = []
for vc in values_counts:
data = data + [
go.Bar(
Expand Down
5 changes: 4 additions & 1 deletion src/arche/tools/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,10 @@ def get_items_with_pool(
A numpy array of items
"""
active_connections_limit = 10
processes_count = min(max(helpers.cpus_count(), workers), active_connections_limit)
processes_count: int = min(
max(helpers.cpus_count() or 0, workers), active_connections_limit
)

batch_size = math.ceil(count / processes_count)

start_idxs = range(start_index, start_index + count, batch_size)
Expand Down
15 changes: 7 additions & 8 deletions src/arche/tools/bitbucket.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import re
from typing import Dict
import urllib
from urllib.request import Request


NETLOC = os.getenv("BITBUCKET_NETLOC") or "bitbucket.org"
Expand All @@ -11,22 +11,21 @@
PASS = os.getenv("BITBUCKET_PASSWORD")


def prepare_request(url: str) -> urllib.request.Request:
def prepare_request(url: str) -> Request:
if not USER or not PASS:
msg = "Credentials not found: `BITBUCKET_USER` or `BITBUCKET_PASSWORD` not set."
raise ValueError(msg)

api_url = convert_to_api_url(url, NETLOC, API_NETLOC)
return urllib.request.Request(api_url, headers=get_auth_header(USER, PASS))
return Request(api_url, headers=get_auth_header(USER, PASS))


def convert_to_api_url(url: str, netloc: str, api_netloc: str) -> str:
"""Support both regular and raw URLs"""
try:
user, repo, path = re.search(
f"https://{netloc}/(.*?)/(.*?)/(?:raw|src)/(.*)", url
).groups()
except AttributeError:
match = re.search(f"https://{netloc}/(.*?)/(.*?)/(?:raw|src)/(.*)", url)
if match:
user, repo, path = match.groups()
else:
raise ValueError("Not a valid bitbucket URL: {url}")
return f"https://{api_netloc}/2.0/repositories/{user}/{repo}/src/{path}"

Expand Down
2 changes: 1 addition & 1 deletion src/arche/tools/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def is_number(s):
return True


def cpus_count() -> int:
def cpus_count() -> Optional[int]:
try:
return len(os.sched_getaffinity(0))
except AttributeError:
Expand Down
Loading