scrapinghub · manycoding · Oct 4, 2019 · Jul 29, 2019 · Jul 29, 2019 · Jul 29, 2019
diff --git a/Pipfile b/Pipfile
@@ -14,6 +14,7 @@ fastjsonschema = "*"
 perfect-jsonschema = "*"
 tqdm = "*"
 ipywidgets = "*"
+mypy = "*"
 
 [dev-packages]
 jupyterlab = "*"

diff --git a/src/arche/arche.py b/src/arche/arche.py
@@ -1,6 +1,6 @@
 from functools import lru_cache
 import logging
-from typing import Iterable, Optional, Union
+from typing import Iterable, Optional, Union, cast
 
 from arche.data_quality_report import DataQualityReport
 from arche.readers.items import Items, CollectionItems, JobItems, RawItems
@@ -106,15 +106,15 @@ def schema(self, schema_source):
     def get_items(
         source: Union[str, pd.DataFrame, RawItems],
         count: Optional[int],
-        start: Union[str, int],
+        start: Optional[str],
         filters: Optional[api.Filters],
     ) -> Items:
         if isinstance(source, pd.DataFrame):
             return Items.from_df(source)
         elif isinstance(source, Iterable) and not isinstance(source, str):
-            return Items.from_array(source)
+            return Items.from_array(cast(RawItems, source))
         elif helpers.is_job_key(source):
-            return JobItems(source, count, start or 0, filters)
+            return JobItems(source, count, int(start or 0), filters)
         elif helpers.is_collection_key(source):
             return CollectionItems(source, count, start, filters)
         else:
@@ -140,7 +140,7 @@ def run_all_rules(self):
         self.run_schema_rules()
 
     def data_quality_report(self, bucket: Optional[str] = None):
-        if helpers.is_collection_key(self.source):
+        if helpers.is_collection_key(str(self.source or "")):
             raise ValueError("Collections are not supported")
         if not self.schema:
             raise ValueError("Schema is empty")

diff --git a/src/arche/data_quality_report.py b/src/arche/data_quality_report.py
@@ -1,11 +1,11 @@
 from io import StringIO
 import json
-from typing import Optional
+from typing import Optional, List
 
 
 from arche.figures import tables
 from arche.quality_estimation_algorithm import generate_quality_estimation
-from arche.readers.items import CloudItems
+from arche.readers.items import JobItems
 from arche.readers.schema import Schema
 from arche.report import Report
 import arche.rules.coverage as coverage_rules
@@ -23,7 +23,7 @@
 class DataQualityReport:
     def __init__(
         self,
-        items: CloudItems,
+        items: JobItems,
         schema: Schema,
         report: Report,
         bucket: Optional[str] = None,
@@ -36,7 +36,7 @@ def __init__(
         """
         self.schema = schema
         self.report = report
-        self.figures = []
+        self.figures: List = []
         self.appendix = self.create_appendix(self.schema.raw)
         self.create_figures(items)
         self.plot_to_notebook()
@@ -48,7 +48,7 @@ def __init__(
                 bucket=bucket,
             )
 
-    def create_figures(self, items: CloudItems):
+    def create_figures(self, items: JobItems):
         name_url_dups = self.report.results.get(
             "Duplicates By **name_field, product_url_field** Tags",
             duplicate_rules.find_by_name_url(items.df, self.schema.tags),

diff --git a/src/arche/readers/items.py b/src/arche/readers/items.py
@@ -47,6 +47,7 @@ def origin_column_name(self, new: str) -> str:
         for column in self.df.columns:
             if column in new:
                 return column
+        return ""
 
     @classmethod
     def from_df(cls, df: pd.DataFrame):
@@ -66,7 +67,7 @@ def __init__(
     ):
         self.key = key
         self._count = count
-        self._limit = None
+        self._limit: int = 0
         self.filters = filters
         raw = self.fetch_data()
         df = pd.DataFrame(list(raw))
@@ -104,8 +105,9 @@ def __init__(
         filters: Optional[api.Filters] = None,
     ):
         self.start_index = start_index
-        self.start: int = f"{key}/{start_index}"
+        self.start: str = f"{key}/{start_index}"
         self._job: Job = None
+        self._limit: int = 0
         super().__init__(key, count, filters)
 
     @property
@@ -154,6 +156,7 @@ def __init__(
         filters: Optional[api.Filters] = None,
     ):
         self.start = start
+        self._limit: int = 0
         super().__init__(key, count, filters)
 
     @property

diff --git a/src/arche/readers/schema.py b/src/arche/readers/schema.py
@@ -2,7 +2,7 @@
 from enum import Enum
 import json
 import pprint
-from typing import Dict, List, Union
+from typing import Dict, List, Union, cast, Any, ItemsView, Set
 
 from arche.tools import s3
 import perfect_jsonschema
@@ -42,23 +42,31 @@ def __repr__(self):
 
     def get_enums(self) -> List[str]:
         enums = []
-        for k, v in self.raw["properties"].items():
+        # self.raw["properties"].items() has type:
+        # ItemsView[str, Union[str, bool, int, float, None, list[Any]]]
+        properties = cast(
+            ItemsView[str, Dict[str, Any]], self.raw["properties"].items()
+        )
+        for k, v in properties:
             if "enum" in v.keys():
                 enums.append(k)
         return enums
 
     @staticmethod
     def get_tags(schema: RawSchema) -> TaggedFields:
-        tagged_fields = defaultdict(list)
-        for key, value in schema["properties"].items():
+        tagged_fields: Dict[str, List[str]] = defaultdict(list)
+        # schema["properties"].items() has type:
+        # ItemsView[str, Union[str, bool, int, float, None, list[Any]]]
+        properties = cast(ItemsView[str, Dict[str, Any]], schema["properties"].items())
+        for key, value in properties:
             property_tags = value.get("tag", [])
             if property_tags:
                 tagged_fields = Schema.get_field_tags(property_tags, key, tagged_fields)
         return tagged_fields
 
     @classmethod
     def get_field_tags(
-        cls, tags: List[str], field: str, tagged_fields: defaultdict
+        cls, tags: Set[Any], field: str, tagged_fields: Dict
     ) -> TaggedFields:
         tags = cls.parse_tag(tags)
         if not tags:

diff --git a/src/arche/report.py b/src/arche/report.py
@@ -1,5 +1,5 @@
 from functools import partial
-from typing import Dict
+from typing import Dict, Union
 
 from arche import SH_URL
 from arche.rules.result import Level, Outcome, Result
@@ -44,7 +44,9 @@ def write_summary(cls, result: Result) -> None:
                 cls.write_rule_outcome(rule_msg.summary, level)
 
     @classmethod
-    def write_rule_outcome(cls, outcome: str, level: Level = Level.INFO) -> None:
+    def write_rule_outcome(
+        cls, outcome: Union[str, Outcome], level: Level = Level.INFO
+    ) -> None:
         if isinstance(outcome, Outcome):
             outcome = outcome.name
         msg = outcome

diff --git a/src/arche/rules/duplicates.py b/src/arche/rules/duplicates.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Set
 
 from arche.readers.schema import TaggedFields
 from arche.rules.result import Result, Outcome
@@ -18,7 +18,7 @@ def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
         result.add_info(Outcome.SKIPPED)
         return result
 
-    err_keys = set()
+    err_keys: Set = set()
     for field in unique_fields:
         result.items_count = df[field].count()
         duplicates = df[df.duplicated(field, keep=False)][[field]]

diff --git a/src/arche/rules/json_schema.py b/src/arche/rules/json_schema.py
@@ -25,7 +25,7 @@ def validate(
     err_items = len(set(itertools.chain.from_iterable(errors.values())))
     if errors:
         result.add_error(
-            f"{err_items} ({err_items/len(raw_items):.0%}) items have {len(errors)} errors",
+            f"{err_items} ({err_items/len(list(raw_items)):.0%}) items have {len(errors)} errors",  # noqa
-            f"{err_items} ({err_items/len(list(raw_items)):.0%}) items have {len(errors)} errors",  # noqa
+            f"{err_items} ({err_items/len(list(raw_items)):.0%}) "
+            f"items have {len(errors)} errors",
-            f"{err_items} ({err_items/len(list(raw_items)):.0%}) items have {len(errors)} errors",  # noqa
+            f"{err_items} ({err_items/len(list(raw_items)):.0%}) "
+            f"items have {len(errors)} errors",
             errors=errors,
         )
     return result

diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py
@@ -1,5 +1,6 @@
 import codecs
 import re
+from typing import Set
 
 from arche.rules.result import Outcome, Result
 import numpy as np
@@ -90,7 +91,7 @@ def garbage_symbols(df: pd.DataFrame) -> Result:
     )
 
     errors = {}
-    row_keys = set()
+    row_keys: Set = set()
     rule_result = Result("Garbage Symbols", items_count=len(df))
 
     for column in tqdm_notebook(

diff --git a/src/arche/rules/price.py b/src/arche/rules/price.py
@@ -1,3 +1,5 @@
+from typing import Any, Optional, List
+
 from arche.readers.schema import TaggedFields
 from arche.rules.result import Result, Outcome
 from arche.tools.helpers import is_number, ratio_diff
@@ -75,12 +77,12 @@ def compare_prices_for_same_urls(
         missing and new `product_url_field` tagged fields.
     """
     result = Result("Compare Prices For Same Urls")
-    url_field = tagged_fields.get("product_url_field")
-    if not url_field:
+    url_field_list: Optional[List[str]] = tagged_fields.get("product_url_field")
+    if not url_field_list:
         result.add_info(Outcome.SKIPPED)
         return result
 
-    url_field = url_field[0]
+    url_field = url_field_list[0]
 
     source_df = source_df.dropna(subset=[url_field])
     target_df = target_df.dropna(subset=[url_field])
@@ -108,11 +110,11 @@ def compare_prices_for_same_urls(
     result.add_info(f"{len(same_urls)} same urls in both jobs")
 
     diff_prices_count = 0
-    price_field = tagged_fields.get("product_price_field")
-    if not price_field:
+    price_field_tag = tagged_fields.get("product_price_field")
+    if not price_field_tag:
         result.add_info("product_price_field tag is not set")
     else:
-        price_field = price_field[0]
+        price_field = price_field_tag[0]
         detailed_messages = []
         for url in same_urls:
             if url.strip() != "nan":
@@ -153,8 +155,8 @@ def compare_names_for_same_urls(
     compare `name_field` field"""
 
     result = Result("Compare Names Per Url")
-    url_field = tagged_fields.get("product_url_field")
-    name_field = tagged_fields.get("name_field")
+    url_field: Any = tagged_fields.get("product_url_field")
+    name_field: Any = tagged_fields.get("name_field")
     if not url_field or not name_field:
         result.add_info(Outcome.SKIPPED)
         return result
@@ -200,12 +202,12 @@ def compare_prices_for_same_names(
     source_df: pd.DataFrame, target_df: pd.DataFrame, tagged_fields: TaggedFields
 ):
     result = Result("Compare Prices For Same Names")
-    name_field = tagged_fields.get("name_field")
-    if not name_field:
+    name_field_tag = tagged_fields.get("name_field")
+    if not name_field_tag:
         result.add_info(Outcome.SKIPPED)
         return result
 
-    name_field = name_field[0]
+    name_field = name_field_tag[0]
     source_df = source_df[source_df[name_field].notnull()]
     target_df = target_df[target_df[name_field].notnull()]
 
@@ -232,12 +234,12 @@ def compare_prices_for_same_names(
     result.add_info(f"{len(same_names)} same names in both jobs")
 
     price_tag = "product_price_field"
-    price_field = tagged_fields.get(price_tag)
-    if not price_field:
+    price_field_tag = tagged_fields.get(price_tag)
+    if not price_field_tag:
         result.add_info("product_price_field tag is not set")
         return result
 
-    price_field = price_field[0]
+    price_field = price_field_tag[0]
     count = 0
 
     detailed_messages = []

diff --git a/src/arche/rules/result.py b/src/arche/rules/result.py
@@ -40,7 +40,9 @@ class Message:
     summary: str
     detailed: Optional[str] = None
     errors: Optional[Dict[str, Set]] = None
-    _err_keys: Optional[Set[Union[str, int]]] = field(default_factory=set)
+
+    # field(default_factory=set) has type "Set[_T]", so the annotation is not Optional
+    _err_keys: Set[Union[str, int]] = field(default_factory=set)
 
     @property
     def err_keys(self):
@@ -246,7 +248,7 @@ def build_stack_bar_data(values_counts: List[pd.Series]) -> List[go.Bar]:
         Returns:
             A list of Bar objects.
         """
-        data = []
+        data: List[go.Bar] = []
         for vc in values_counts:
             data = data + [
                 go.Bar(

diff --git a/src/arche/tools/api.py b/src/arche/tools/api.py
@@ -144,7 +144,9 @@ def get_items_with_pool(
         A numpy array of items
     """
     active_connections_limit = 10
-    processes_count = min(max(helpers.cpus_count(), workers), active_connections_limit)
+    processes_count: int = int(
+        min(max(helpers.cpus_count(), workers), active_connections_limit) or 0
+    )
     batch_size = math.ceil(count / processes_count)
 
     start_idxs = range(start_index, start_index + count, batch_size)

diff --git a/src/arche/tools/bitbucket.py b/src/arche/tools/bitbucket.py
@@ -23,7 +23,7 @@ def prepare_request(url: str) -> urllib.request.Request:
 def convert_to_api_url(url: str, netloc: str, api_netloc: str) -> str:
     """Support both regular and raw URLs"""
     try:
-        user, repo, path = re.search(
+        user, repo, path = re.search(  # type: ignore
             f"https://{netloc}/(.*?)/(.*?)/(?:raw|src)/(.*)", url
         ).groups()
     except AttributeError:

diff --git a/src/arche/tools/helpers.py b/src/arche/tools/helpers.py
@@ -76,7 +76,7 @@ def is_number(s):
     return True
 
 
-def cpus_count() -> int:
+def cpus_count() -> Optional[int]:
     try:
         return len(os.sched_getaffinity(0))
     except AttributeError: