scrapinghub · manycoding · Oct 4, 2019 · Jul 29, 2019 · Jul 29, 2019 · Jul 29, 2019
diff --git a/Pipfile b/Pipfile
@@ -14,6 +14,7 @@ fastjsonschema = "*"
 perfect-jsonschema = "*"
 tqdm = "*"
 ipywidgets = "*"
+mypy = "*"
 
 [dev-packages]
 jupyterlab = "*"
@@ -42,6 +43,7 @@ pyarrow = "*"
 cufflinks = "*"
 tables = "*"
 nb-black = "*"
+pylint = "*"
 
 [requires]
 python_version = "3.7"

diff --git a/src/arche/data_quality_report.py b/src/arche/data_quality_report.py
@@ -1,6 +1,6 @@
 from io import StringIO
 import json
-from typing import Optional
+from typing import Optional, List
 
 
 from arche.figures import tables
@@ -36,15 +36,15 @@ def __init__(
         """
         self.schema = schema
         self.report = report
-        self.figures = []
+        self.figures: List = []
         self.appendix = self.create_appendix(self.schema.raw)
         self.create_figures(items)
         self.plot_to_notebook()
 
         if bucket:
             self.save_report_to_bucket(
                 project_id=items.key.split("/")[0],
-                spider=items.job.metadata.get("spider"),
+                spider=items.job.metadata.get("spider"),  # type: ignore
                 bucket=bucket,
             )
 
@@ -63,7 +63,7 @@ def create_figures(self, items: CloudItems):
         no_of_price_warns = price_was_now_result.err_items_count
         no_of_checked_price_items = price_was_now_result.items_count
 
-        crawlera_user = api.get_crawlera_user(items.job)
+        crawlera_user = api.get_crawlera_user(items.job)  # type: ignore
 
         validation_errors = self.report.results.get(
             "JSON Schema Validation",
@@ -77,7 +77,7 @@ def create_figures(self, items: CloudItems):
         )
 
         quality_estimation, field_accuracy = generate_quality_estimation(
-            items.job,
+            items.job,  # type: ignore
             crawlera_user,
             validation_errors,
             name_url_dups.err_items_count,
@@ -91,7 +91,7 @@ def create_figures(self, items: CloudItems):
         )
 
         self.score_table(quality_estimation, field_accuracy)
-        self.job_summary_table(items.job)
+        self.job_summary_table(items.job)  # type: ignore
         self.rules_summary_table(
             items.df,
             validation_errors,

diff --git a/src/arche/readers/items.py b/src/arche/readers/items.py
@@ -47,6 +47,7 @@ def origin_column_name(self, new: str) -> str:
         for column in self.df.columns:
             if column in new:
                 return column
+        return ""
 
     @classmethod
     def from_df(cls, df: pd.DataFrame):
@@ -66,7 +67,7 @@ def __init__(
     ):
         self.key = key
         self._count = count
-        self._limit = None
+        self._limit: Any = None
         self.filters = filters
         raw = self.fetch_data()
         df = pd.DataFrame(list(raw))
@@ -104,7 +105,7 @@ def __init__(
         filters: Optional[api.Filters] = None,
     ):
         self.start_index = start_index
-        self.start: int = f"{key}/{start_index}"
+        self.start: str = f"{key}/{start_index}"
         self._job: Job = None
         super().__init__(key, count, filters)
 

diff --git a/src/arche/readers/schema.py b/src/arche/readers/schema.py
@@ -2,7 +2,7 @@
 from enum import Enum
 import json
 import pprint
-from typing import Dict, List, Union
+from typing import Dict, List, Union, cast, Any, ItemsView
 
 from arche.tools import s3
 import perfect_jsonschema
@@ -42,23 +42,31 @@ def __repr__(self):
 
     def get_enums(self) -> List[str]:
         enums = []
-        for k, v in self.raw["properties"].items():
+        # Property values are typed as Union[str, bool, int, float, None, list[Any]];
+        # cast them to dicts so the `"enum" in v.keys()` check below type-checks.
+        properties = cast(
+            ItemsView[str, Dict[str, Any]], self.raw["properties"].items()
+        )
+        for k, v in properties:
             if "enum" in v.keys():
                 enums.append(k)
         return enums
 
     @staticmethod
     def get_tags(schema: RawSchema) -> TaggedFields:
-        tagged_fields = defaultdict(list)
-        for key, value in schema["properties"].items():
+        tagged_fields: Dict[str, List[str]] = defaultdict(list)
+        # Property values are typed as Union[str, bool, int, float, None, list[Any]];
+        # cast them to dicts so the `value.get("tag", [])` call below type-checks.
+        properties = cast(ItemsView[str, Dict[str, Any]], schema["properties"].items())
+        for key, value in properties:
             property_tags = value.get("tag", [])
             if property_tags:
                 tagged_fields = Schema.get_field_tags(property_tags, key, tagged_fields)
         return tagged_fields
 
     @classmethod
     def get_field_tags(
-        cls, tags: List[str], field: str, tagged_fields: defaultdict
+        cls, tags: List[str], field: str, tagged_fields: Dict
     ) -> TaggedFields:
         tags = cls.parse_tag(tags)
         if not tags:

diff --git a/src/arche/report.py b/src/arche/report.py
@@ -1,5 +1,5 @@
 from functools import partial
-from typing import Dict
+from typing import Dict, Union
 
 from arche import SH_URL
 from arche.rules.result import Level, Outcome, Result
@@ -44,7 +44,9 @@ def write_summary(cls, result: Result) -> None:
                 cls.write_rule_outcome(rule_msg.summary, level)
 
     @classmethod
-    def write_rule_outcome(cls, outcome: str, level: Level = Level.INFO) -> None:
+    def write_rule_outcome(
+        cls, outcome: Union[str, Outcome], level: Level = Level.INFO
+    ) -> None:
         if isinstance(outcome, Outcome):
             outcome = outcome.name
         msg = outcome

diff --git a/src/arche/rules/duplicates.py b/src/arche/rules/duplicates.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Set
 
 from arche.readers.schema import TaggedFields
 from arche.rules.result import Result, Outcome
@@ -18,7 +18,7 @@ def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
         result.add_info(Outcome.SKIPPED)
         return result
 
-    err_keys = set()
+    err_keys: Set = set()
     for field in unique_fields:
         result.items_count = df[field].count()
         duplicates = df[df.duplicated(field, keep=False)][[field]]

diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py
@@ -1,5 +1,6 @@
 import codecs
 import re
+from typing import Set
 
 from arche.rules.result import Outcome, Result
 import numpy as np
@@ -90,7 +91,7 @@ def garbage_symbols(df: pd.DataFrame) -> Result:
     )
 
     errors = {}
-    row_keys = set()
+    row_keys: Set = set()
     rule_result = Result("Garbage Symbols", items_count=len(df))
 
     for column in tqdm_notebook(

diff --git a/src/arche/rules/price.py b/src/arche/rules/price.py
@@ -1,3 +1,5 @@
+from typing import Any
+
 from arche.readers.schema import TaggedFields
 from arche.rules.result import Result, Outcome
 from arche.tools.helpers import is_number, ratio_diff
@@ -75,7 +77,7 @@ def compare_prices_for_same_urls(
         missing and new `product_url_field` tagged fields.
     """
     result = Result("Compare Prices For Same Urls")
-    url_field = tagged_fields.get("product_url_field")
+    url_field: Any = tagged_fields.get("product_url_field")
     if not url_field:
         result.add_info(Outcome.SKIPPED)
         return result
@@ -108,11 +110,11 @@ def compare_prices_for_same_urls(
     result.add_info(f"{len(same_urls)} same urls in both jobs")
 
     diff_prices_count = 0
-    price_field = tagged_fields.get("product_price_field")
-    if not price_field:
+    price_field_tag = tagged_fields.get("product_price_field")
+    if not price_field_tag:
         result.add_info("product_price_field tag is not set")
     else:
-        price_field = price_field[0]
+        price_field = price_field_tag[0]
         detailed_messages = []
         for url in same_urls:
             if url.strip() != "nan":
@@ -153,8 +155,8 @@ def compare_names_for_same_urls(
     compare `name_field` field"""
 
     result = Result("Compare Names Per Url")
-    url_field = tagged_fields.get("product_url_field")
-    name_field = tagged_fields.get("name_field")
+    url_field: Any = tagged_fields.get("product_url_field")
+    name_field: Any = tagged_fields.get("name_field")
     if not url_field or not name_field:
         result.add_info(Outcome.SKIPPED)
         return result
@@ -200,12 +202,12 @@ def compare_prices_for_same_names(
     source_df: pd.DataFrame, target_df: pd.DataFrame, tagged_fields: TaggedFields
 ):
     result = Result("Compare Prices For Same Names")
-    name_field = tagged_fields.get("name_field")
-    if not name_field:
+    name_field_tag = tagged_fields.get("name_field")
+    if not name_field_tag:
         result.add_info(Outcome.SKIPPED)
         return result
 
-    name_field = name_field[0]
+    name_field = name_field_tag[0]
     source_df = source_df[source_df[name_field].notnull()]
     target_df = target_df[target_df[name_field].notnull()]
 
@@ -232,12 +234,12 @@ def compare_prices_for_same_names(
     result.add_info(f"{len(same_names)} same names in both jobs")
 
     price_tag = "product_price_field"
-    price_field = tagged_fields.get(price_tag)
-    if not price_field:
+    price_field_tag = tagged_fields.get(price_tag)
+    if not price_field_tag:
         result.add_info("product_price_field tag is not set")
         return result
 
-    price_field = price_field[0]
+    price_field = price_field_tag[0]
     count = 0
 
     detailed_messages = []

diff --git a/src/arche/rules/result.py b/src/arche/rules/result.py
@@ -2,7 +2,7 @@
 from enum import Enum
 import itertools
 import math
-from typing import Dict, List, Optional, Set, Union
+from typing import Dict, List, Optional, Set, Union, cast
 
 import IPython
 import numpy as np
@@ -40,7 +40,11 @@ class Message:
     summary: str
     detailed: Optional[str] = None
     errors: Optional[Dict[str, Set]] = None
-    _err_keys: Optional[Set[Union[str, int]]] = field(default_factory=set)
+
+    # expression "field(default_factory=set)" has type "Set[_T]", so we have to cast
+    _err_keys: Optional[Set[Union[str, int]]] = cast(
+        Optional[Set[Union[str, int]]], field(default_factory=set)
+    )
 
     @property
     def err_keys(self):
@@ -246,7 +250,7 @@ def build_stack_bar_data(values_counts: List[pd.Series]) -> List[go.Bar]:
         Returns:
             A list of Bar objects.
         """
-        data = []
+        data: List[go.Bar] = []
         for vc in values_counts:
             data = data + [
                 go.Bar(

diff --git a/src/arche/tools/api.py b/src/arche/tools/api.py
@@ -3,7 +3,7 @@
 import math
 from multiprocessing import Pool
 import time
-from typing import Dict, List, Tuple, Optional, Union
+from typing import Dict, List, Tuple, Optional, Union, cast
 
 from arche.tools import helpers
 from dateutil.relativedelta import relativedelta
@@ -144,7 +144,9 @@ def get_items_with_pool(
         A numpy array of items
     """
     active_connections_limit = 10
-    processes_count = min(max(helpers.cpus_count(), workers), active_connections_limit)
+    processes_count: int = cast(
+        int, min(max(helpers.cpus_count(), workers), active_connections_limit)
+    )
     batch_size = math.ceil(count / processes_count)
 
     start_idxs = range(start_index, start_index + count, batch_size)

diff --git a/src/arche/tools/helpers.py b/src/arche/tools/helpers.py
@@ -76,7 +76,7 @@ def is_number(s):
     return True
 
 
-def cpus_count() -> int:
+def cpus_count() -> Optional[int]:
     try:
         return len(os.sched_getaffinity(0))
     except AttributeError:

diff --git a/src/arche/tools/schema.py b/src/arche/tools/schema.py
@@ -1,6 +1,6 @@
 from collections import defaultdict
 import random
-from typing import Any, Deque, Dict, List, Optional
+from typing import Any, Deque, Dict, List, Optional, DefaultDict
 
 from arche.readers.items import RawItems
 from arche.readers.schema import Schema
@@ -92,7 +92,7 @@ def fast_validate(
     Returns:
         A dictionary of errors with message and item keys
     """
-    errors = defaultdict(set)
+    errors: DefaultDict = defaultdict(set)
 
     validate = fastjsonschema.compile(schema)
     for i, raw_item in enumerate(
@@ -113,7 +113,7 @@ def full_validate(
     """This function uses jsonschema validator which returns all found error per item.
     See `fast_validate()` for arguments descriptions.
     """
-    errors = defaultdict(set)
+    errors: DefaultDict = defaultdict(set)
 
     validator = validators.validator_for(schema)(schema)
     validator.format_checker = FormatChecker()
@@ -134,7 +134,7 @@ def format_validation_message(
     error_msg: str, path: Deque, schema_path: Deque, validator: str
 ) -> str:
     str_path = "/".join(p for p in path if isinstance(p, str))
-    schema_path = "/".join(p for p in schema_path)
+    schema_path = "/".join(p for p in schema_path)  # type: ignore
 
     if validator == "anyOf":
         if str_path:

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -211,8 +211,8 @@ def create_result(
     items_count: Optional[int] = None,
 ) -> Result:
     result = Result(rule_name)
-    for level, messages in messages.items():
-        for message in messages:
+    for level, messages_list in messages.items():
+        for message in messages_list:
             result.add_message(level, *message)
 
     if stats:

diff --git a/tests/test_arche.py b/tests/test_arche.py
@@ -1,3 +1,5 @@
+from typing import Dict, List
+
 from arche import arche, SH_URL
 from arche.arche import Arche
 from arche.rules.result import Level
@@ -34,7 +36,7 @@ def test_arche_df(get_df):
     pd.testing.assert_frame_equal(a.target_items.df, get_df)
 
 
-schema_dummies = [{"properties": {"name": {}}}, {"properties": {"url": {}}}]
+schema_dummies: List[Dict] = [{"properties": {"name": {}}}, {"properties": {"url": {}}}]
 
 
 def test_schema():