Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mypy #152

Merged
merged 32 commits into from
Oct 4, 2019
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
f8fd116
fix some type hints after running mypy
andersonberg Jul 29, 2019
fb53167
fix type hints at readers/schema and rules/price
andersonberg Jul 29, 2019
429faf7
fix type hints at rules module
andersonberg Jul 29, 2019
52ce080
fix type hints at dqr
andersonberg Jul 29, 2019
fbf5484
adding more type annotation
andersonberg Jul 29, 2019
e3b30f4
merge with master
andersonberg Sep 13, 2019
c4af9e9
fix mypy typing - partial commit
andersonberg Sep 16, 2019
f11f278
fix typing at schema.py
andersonberg Sep 17, 2019
06aacf8
fix typing
andersonberg Sep 17, 2019
9b01ae1
pep8
andersonberg Sep 17, 2019
3c43406
fix mypy at arche.py
andersonberg Sep 26, 2019
82de89a
fix pep8; improve mypy typing
andersonberg Sep 26, 2019
2606600
update typing at tools/schema.py
andersonberg Sep 26, 2019
f092fe8
updating typing
andersonberg Sep 27, 2019
6affcdf
remove cast
andersonberg Sep 27, 2019
80a5628
fixing type
andersonberg Sep 27, 2019
fa15974
fix typing at price.py
andersonberg Sep 29, 2019
e9c47d3
fix typing at price.py
andersonberg Sep 29, 2019
9558862
fix typing at api.py
andersonberg Sep 29, 2019
d6e6027
fix tests and pep8
andersonberg Sep 29, 2019
e432ece
fix typing at conftest
andersonberg Sep 30, 2019
1f46117
refactor typing at schema.py
andersonberg Oct 3, 2019
2fe5ba9
fix typing at price.py and result.py
andersonberg Oct 3, 2019
774f8c8
refactor
andersonberg Oct 3, 2019
797079e
refactor
andersonberg Oct 3, 2019
5d07463
update Pipfile
andersonberg Oct 4, 2019
d073642
refactoring
andersonberg Oct 4, 2019
d42dbe1
refactor typing
andersonberg Oct 4, 2019
0f3145f
Add to travis, fix request import in mypy 0.730
manycoding Oct 4, 2019
e9b6604
Remove redundant casting
manycoding Oct 4, 2019
6771ab8
Another redundant casting
manycoding Oct 4, 2019
561c2ca
Spaces are bad
manycoding Oct 4, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ fastjsonschema = "*"
perfect-jsonschema = "*"
tqdm = "*"
ipywidgets = "*"
mypy = "*"
manycoding marked this conversation as resolved.
Show resolved Hide resolved

[dev-packages]
jupyterlab = "*"
Expand Down Expand Up @@ -42,6 +43,7 @@ pyarrow = "*"
cufflinks = "*"
tables = "*"
nb-black = "*"
pylint = "*"
manycoding marked this conversation as resolved.
Show resolved Hide resolved

[requires]
python_version = "3.7"
Expand Down
12 changes: 6 additions & 6 deletions src/arche/data_quality_report.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from io import StringIO
import json
from typing import Optional
from typing import Optional, List


from arche.figures import tables
Expand Down Expand Up @@ -36,15 +36,15 @@ def __init__(
"""
self.schema = schema
self.report = report
self.figures = []
self.figures: List = []
self.appendix = self.create_appendix(self.schema.raw)
self.create_figures(items)
self.plot_to_notebook()

if bucket:
self.save_report_to_bucket(
project_id=items.key.split("/")[0],
spider=items.job.metadata.get("spider"),
spider=items.job.metadata.get("spider"), # type: ignore
bucket=bucket,
)

Expand All @@ -63,7 +63,7 @@ def create_figures(self, items: CloudItems):
no_of_price_warns = price_was_now_result.err_items_count
no_of_checked_price_items = price_was_now_result.items_count

crawlera_user = api.get_crawlera_user(items.job)
crawlera_user = api.get_crawlera_user(items.job) # type: ignore

validation_errors = self.report.results.get(
"JSON Schema Validation",
Expand All @@ -77,7 +77,7 @@ def create_figures(self, items: CloudItems):
)

quality_estimation, field_accuracy = generate_quality_estimation(
items.job,
items.job, # type: ignore
crawlera_user,
validation_errors,
name_url_dups.err_items_count,
Expand All @@ -91,7 +91,7 @@ def create_figures(self, items: CloudItems):
)

self.score_table(quality_estimation, field_accuracy)
self.job_summary_table(items.job)
self.job_summary_table(items.job) # type: ignore
self.rules_summary_table(
items.df,
validation_errors,
Expand Down
5 changes: 3 additions & 2 deletions src/arche/readers/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def origin_column_name(self, new: str) -> str:
for column in self.df.columns:
if column in new:
return column
return ""
manycoding marked this conversation as resolved.
Show resolved Hide resolved

@classmethod
def from_df(cls, df: pd.DataFrame):
Expand All @@ -66,7 +67,7 @@ def __init__(
):
self.key = key
self._count = count
self._limit = None
self._limit: Any = None
manycoding marked this conversation as resolved.
Show resolved Hide resolved
self.filters = filters
raw = self.fetch_data()
df = pd.DataFrame(list(raw))
Expand Down Expand Up @@ -104,7 +105,7 @@ def __init__(
filters: Optional[api.Filters] = None,
):
self.start_index = start_index
self.start: int = f"{key}/{start_index}"
self.start: str = f"{key}/{start_index}"
self._job: Job = None
super().__init__(key, count, filters)

Expand Down
18 changes: 13 additions & 5 deletions src/arche/readers/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from enum import Enum
import json
import pprint
from typing import Dict, List, Union
from typing import Dict, List, Union, cast, Any, ItemsView

from arche.tools import s3
import perfect_jsonschema
Expand Down Expand Up @@ -42,23 +42,31 @@ def __repr__(self):

def get_enums(self) -> List[str]:
enums = []
for k, v in self.raw["properties"].items():
# self.raw["properties"].items() has type:
# ItemsView[str, Union[str, bool, int, float, None, list[Any]]]
properties = cast(
ItemsView[str, Dict[str, Any]], self.raw["properties"].items()
)
for k, v in properties:
manycoding marked this conversation as resolved.
Show resolved Hide resolved
if "enum" in v.keys():
enums.append(k)
return enums

@staticmethod
def get_tags(schema: RawSchema) -> TaggedFields:
tagged_fields = defaultdict(list)
for key, value in schema["properties"].items():
tagged_fields: Dict[str, List[str]] = defaultdict(list)
manycoding marked this conversation as resolved.
Show resolved Hide resolved
# schema["properties"].items() has type:
# ItemsView[str, Union[str, bool, int, float, None, list[Any]]]
properties = cast(ItemsView[str, Dict[str, Any]], schema["properties"].items())
for key, value in properties:
property_tags = value.get("tag", [])
manycoding marked this conversation as resolved.
Show resolved Hide resolved
if property_tags:
tagged_fields = Schema.get_field_tags(property_tags, key, tagged_fields)
return tagged_fields

@classmethod
def get_field_tags(
cls, tags: List[str], field: str, tagged_fields: defaultdict
cls, tags: List[str], field: str, tagged_fields: Dict
manycoding marked this conversation as resolved.
Show resolved Hide resolved
) -> TaggedFields:
tags = cls.parse_tag(tags)
if not tags:
Expand Down
6 changes: 4 additions & 2 deletions src/arche/report.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from functools import partial
from typing import Dict
from typing import Dict, Union

from arche import SH_URL
from arche.rules.result import Level, Outcome, Result
Expand Down Expand Up @@ -44,7 +44,9 @@ def write_summary(cls, result: Result) -> None:
cls.write_rule_outcome(rule_msg.summary, level)

@classmethod
def write_rule_outcome(cls, outcome: str, level: Level = Level.INFO) -> None:
def write_rule_outcome(
cls, outcome: Union[str, Outcome], level: Level = Level.INFO
) -> None:
if isinstance(outcome, Outcome):
outcome = outcome.name
msg = outcome
Expand Down
4 changes: 2 additions & 2 deletions src/arche/rules/duplicates.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Set

from arche.readers.schema import TaggedFields
from arche.rules.result import Result, Outcome
Expand All @@ -18,7 +18,7 @@ def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
result.add_info(Outcome.SKIPPED)
return result

err_keys = set()
err_keys: Set = set()
for field in unique_fields:
result.items_count = df[field].count()
duplicates = df[df.duplicated(field, keep=False)][[field]]
Expand Down
3 changes: 2 additions & 1 deletion src/arche/rules/others.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import codecs
import re
from typing import Set

from arche.rules.result import Outcome, Result
import numpy as np
Expand Down Expand Up @@ -90,7 +91,7 @@ def garbage_symbols(df: pd.DataFrame) -> Result:
)

errors = {}
row_keys = set()
row_keys: Set = set()
rule_result = Result("Garbage Symbols", items_count=len(df))

for column in tqdm_notebook(
Expand Down
26 changes: 14 additions & 12 deletions src/arche/rules/price.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Any

from arche.readers.schema import TaggedFields
from arche.rules.result import Result, Outcome
from arche.tools.helpers import is_number, ratio_diff
Expand Down Expand Up @@ -75,7 +77,7 @@ def compare_prices_for_same_urls(
missing and new `product_url_field` tagged fields.
"""
result = Result("Compare Prices For Same Urls")
url_field = tagged_fields.get("product_url_field")
url_field: Any = tagged_fields.get("product_url_field")
manycoding marked this conversation as resolved.
Show resolved Hide resolved
if not url_field:
result.add_info(Outcome.SKIPPED)
return result
Expand Down Expand Up @@ -108,11 +110,11 @@ def compare_prices_for_same_urls(
result.add_info(f"{len(same_urls)} same urls in both jobs")

diff_prices_count = 0
price_field = tagged_fields.get("product_price_field")
if not price_field:
price_field_tag = tagged_fields.get("product_price_field")
if not price_field_tag:
result.add_info("product_price_field tag is not set")
else:
price_field = price_field[0]
price_field = price_field_tag[0]
detailed_messages = []
for url in same_urls:
if url.strip() != "nan":
Expand Down Expand Up @@ -153,8 +155,8 @@ def compare_names_for_same_urls(
compare `name_field` field"""

result = Result("Compare Names Per Url")
url_field = tagged_fields.get("product_url_field")
name_field = tagged_fields.get("name_field")
url_field: Any = tagged_fields.get("product_url_field")
manycoding marked this conversation as resolved.
Show resolved Hide resolved
name_field: Any = tagged_fields.get("name_field")
manycoding marked this conversation as resolved.
Show resolved Hide resolved
if not url_field or not name_field:
result.add_info(Outcome.SKIPPED)
return result
Expand Down Expand Up @@ -200,12 +202,12 @@ def compare_prices_for_same_names(
source_df: pd.DataFrame, target_df: pd.DataFrame, tagged_fields: TaggedFields
):
result = Result("Compare Prices For Same Names")
name_field = tagged_fields.get("name_field")
if not name_field:
name_field_tag = tagged_fields.get("name_field")
if not name_field_tag:
result.add_info(Outcome.SKIPPED)
return result

name_field = name_field[0]
name_field = name_field_tag[0]
source_df = source_df[source_df[name_field].notnull()]
target_df = target_df[target_df[name_field].notnull()]

Expand All @@ -232,12 +234,12 @@ def compare_prices_for_same_names(
result.add_info(f"{len(same_names)} same names in both jobs")

price_tag = "product_price_field"
price_field = tagged_fields.get(price_tag)
if not price_field:
price_field_tag = tagged_fields.get(price_tag)
if not price_field_tag:
result.add_info("product_price_field tag is not set")
return result

price_field = price_field[0]
price_field = price_field_tag[0]
count = 0

detailed_messages = []
Expand Down
10 changes: 7 additions & 3 deletions src/arche/rules/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from enum import Enum
import itertools
import math
from typing import Dict, List, Optional, Set, Union
from typing import Dict, List, Optional, Set, Union, cast

import IPython
import numpy as np
Expand Down Expand Up @@ -40,7 +40,11 @@ class Message:
summary: str
detailed: Optional[str] = None
errors: Optional[Dict[str, Set]] = None
_err_keys: Optional[Set[Union[str, int]]] = field(default_factory=set)

# expression "field(default_factory=set)" has type "Set[_T]", so we have to cast
manycoding marked this conversation as resolved.
Show resolved Hide resolved
_err_keys: Optional[Set[Union[str, int]]] = cast(
manycoding marked this conversation as resolved.
Show resolved Hide resolved
Optional[Set[Union[str, int]]], field(default_factory=set)
)

@property
def err_keys(self):
Expand Down Expand Up @@ -246,7 +250,7 @@ def build_stack_bar_data(values_counts: List[pd.Series]) -> List[go.Bar]:
Returns:
A list of Bar objects.
"""
data = []
data: List[go.Bar] = []
for vc in values_counts:
data = data + [
go.Bar(
Expand Down
6 changes: 4 additions & 2 deletions src/arche/tools/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import math
from multiprocessing import Pool
import time
from typing import Dict, List, Tuple, Optional, Union
from typing import Dict, List, Tuple, Optional, Union, cast

from arche.tools import helpers
from dateutil.relativedelta import relativedelta
Expand Down Expand Up @@ -144,7 +144,9 @@ def get_items_with_pool(
A numpy array of items
"""
active_connections_limit = 10
processes_count = min(max(helpers.cpus_count(), workers), active_connections_limit)
processes_count: int = cast(
manycoding marked this conversation as resolved.
Show resolved Hide resolved
int, min(max(helpers.cpus_count(), workers), active_connections_limit)
)
batch_size = math.ceil(count / processes_count)

start_idxs = range(start_index, start_index + count, batch_size)
Expand Down
2 changes: 1 addition & 1 deletion src/arche/tools/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def is_number(s):
return True


def cpus_count() -> int:
def cpus_count() -> Optional[int]:
try:
return len(os.sched_getaffinity(0))
except AttributeError:
Expand Down
8 changes: 4 additions & 4 deletions src/arche/tools/schema.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import defaultdict
import random
from typing import Any, Deque, Dict, List, Optional
from typing import Any, Deque, Dict, List, Optional, DefaultDict

from arche.readers.items import RawItems
from arche.readers.schema import Schema
Expand Down Expand Up @@ -92,7 +92,7 @@ def fast_validate(
Returns:
A dictionary of errors with message and item keys
"""
errors = defaultdict(set)
errors: DefaultDict = defaultdict(set)

validate = fastjsonschema.compile(schema)
for i, raw_item in enumerate(
Expand All @@ -113,7 +113,7 @@ def full_validate(
"""This function uses jsonschema validator which returns all found error per item.
See `fast_validate()` for arguments descriptions.
"""
errors = defaultdict(set)
errors: DefaultDict = defaultdict(set)

validator = validators.validator_for(schema)(schema)
validator.format_checker = FormatChecker()
Expand All @@ -134,7 +134,7 @@ def format_validation_message(
error_msg: str, path: Deque, schema_path: Deque, validator: str
) -> str:
str_path = "/".join(p for p in path if isinstance(p, str))
schema_path = "/".join(p for p in schema_path)
schema_path = "/".join(p for p in schema_path) # type: ignore

if validator == "anyOf":
if str_path:
Expand Down
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,8 @@ def create_result(
items_count: Optional[int] = None,
) -> Result:
result = Result(rule_name)
for level, messages in messages.items():
for message in messages:
for level, messages_list in messages.items():
for message in messages_list:
result.add_message(level, *message)

if stats:
Expand Down
4 changes: 3 additions & 1 deletion tests/test_arche.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Dict, List

from arche import arche, SH_URL
from arche.arche import Arche
from arche.rules.result import Level
Expand Down Expand Up @@ -34,7 +36,7 @@ def test_arche_df(get_df):
pd.testing.assert_frame_equal(a.target_items.df, get_df)


schema_dummies = [{"properties": {"name": {}}}, {"properties": {"url": {}}}]
schema_dummies: List[Dict] = [{"properties": {"name": {}}}, {"properties": {"url": {}}}]


def test_schema():
Expand Down
Loading