diff --git a/data/ares/main.py b/data/ares/main.py
index cafd59e..b4dcfc2 100644
--- a/data/ares/main.py
+++ b/data/ares/main.py
@@ -3,10 +3,17 @@
 import os
 import tarfile
 from tempfile import NamedTemporaryFile
+from typing import List
 from urllib.request import urlretrieve
 
 import lxml.etree
 
+URL_BULK = "https://wwwinfo.mfcr.cz/ares/ares_vreo_all.tar.gz"
+
+
+def resources() -> List[str]:
+    return [URL_BULK]
+
 
 def attr(root, parts, nsmap):
     ret = []
@@ -65,7 +72,7 @@ def organi(root, ico, nsmap):
 def main(outdir: str, partial: bool = False):
     with NamedTemporaryFile() as vfn:
-        urlretrieve("https://wwwinfo.mfcr.cz/ares/ares_vreo_all.tar.gz", vfn.name)
+        urlretrieve(URL_BULK, vfn.name)
 
         with tarfile.open(vfn.name, "r:gz") as tf, open(
             os.path.join(outdir, "firmy.csv"), "w", encoding="utf8"
         ) as ud, open(
diff --git a/data/datovky/main.py b/data/datovky/main.py
index 56c4868..6add9f6 100644
--- a/data/datovky/main.py
+++ b/data/datovky/main.py
@@ -2,10 +2,24 @@
 import gzip
 import json
 import os
+from typing import List
 from urllib.request import urlopen
 
 import lxml.etree
 
+BASE_URL = "https://www.mojedatovaschranka.cz/sds/datafile.do?format=xml&service="
+urls = {
+    "po": BASE_URL + "seznam_ds_po",
+    "pfo": BASE_URL + "seznam_ds_pfo",
+    "fo": BASE_URL + "seznam_ds_fo",
+    "ovm": BASE_URL + "seznam_ds_ovm",
+}
+
+
+def resources() -> List[str]:
+    return list(urls.values())
+
+
 mapping = {
     "id": "id",
     "type": "type",
@@ -89,14 +103,6 @@ def parse_xml(source, target_fn, partial):
 
 def main(outdir: str, partial: bool = False):
-    BASE_URL = "https://www.mojedatovaschranka.cz/sds/datafile.do?format=xml&service="
-    urls = {
-        "po": BASE_URL + "seznam_ds_po",
-        "pfo": BASE_URL + "seznam_ds_pfo",
-        "fo": BASE_URL + "seznam_ds_fo",
-        "ovm": BASE_URL + "seznam_ds_ovm",
-    }
-
     tdir = os.path.join(outdir, "datovky")
     os.makedirs(tdir, exist_ok=True)
     for ds, url in urls.items():
diff --git a/data/dotinfo/main.py b/data/dotinfo/main.py
index df2da30..7a64a2d 100644
--- a/data/dotinfo/main.py
+++ b/data/dotinfo/main.py
@@ -5,8 +5,17 @@
 import zipfile
 from datetime import datetime
 from tempfile import TemporaryDirectory
+from typing import List
 from urllib.request import urlretrieve
 
+URL_DUMP = "https://data.mfcr.cz/sites/default/files/DotInfo_report_29_01_2020.zip"
+
+
+def resources() -> List[str]:
+    return []  # TODO(PR): skipping for now due to TLS issues
+    # return [URL_DUMP]
+
+
 header = {
     "Evidenční číslo dotace": "evidencni_cislo_dotace",
     "Identifikator dotace": "identifikator_dotace",
@@ -26,10 +35,7 @@ def main(outdir: str, partial: bool = False):
     ssl._create_default_https_context = ssl._create_unverified_context
     with TemporaryDirectory() as tmpdir:
         rawpath = os.path.join(tmpdir, "raw.zip")
-        urlretrieve(
-            "https://data.mfcr.cz/sites/default/files/DotInfo_report_29_01_2020.zip",
-            rawpath,
-        )
+        urlretrieve(URL_DUMP, rawpath)
 
         with zipfile.ZipFile(rawpath) as zf, zf.open(
             "DotInfo_report_29_01_2020.csv"
diff --git a/data/iissp/main.py b/data/iissp/main.py
index 21988ba..74cfb00 100644
--- a/data/iissp/main.py
+++ b/data/iissp/main.py
@@ -2,6 +2,7 @@
 import gzip
 import os
 from datetime import date
+from typing import List
 from urllib.request import Request, urlopen
 
 import lxml.etree
@@ -10,6 +11,11 @@
 url = "https://monitor.statnipokladna.cz/data/xml/ucjed.xml"
 table_name = "ucetni_jednotky"
 
+
+def resources() -> List[str]:
+    return [url]
+
+
 # XSD nema vsechno, dafuq
 cols = [
     "ucjed_id",
diff --git a/data/justice/main.py b/data/justice/main.py
index ac45eb1..372ffe0 100644
--- a/data/justice/main.py
+++ b/data/justice/main.py
@@ -76,9 +76,9 @@ def main(outdir: str, partial: bool = False):
 
     # nejde filtrovat??? Tak to asi udelame na klientovi
     url_pl = "https://dataor.justice.cz/api/3/action/package_list"
-    r = urlopen(url_pl, timeout=HTTP_TIMEOUT)
-    data = json.load(r)
-    assert data["success"]
+    with urlopen(url_pl, timeout=HTTP_TIMEOUT) as r:
+        data = json.load(r)
+        assert data["success"]
 
     dss = [ds for ds in data["result"] if "-full-" in ds]
     print(f"celkem {len(dss)} datasetu, ale filtruji jen na ty letosni")
diff --git a/data/res/main.py b/data/res/main.py
index 8318e4d..d501417 100644
--- a/data/res/main.py
+++ b/data/res/main.py
@@ -1,6 +1,7 @@
 import gzip
 import os
 import shutil
+from typing import List
 from urllib.request import Request, urlopen
 
 DATA = ("https://opendata.czso.cz/data/od_org03/res_data.csv", "subjekty.csv")
@@ -8,6 +9,13 @@
 HTTP_TIMEOUT = 30
 
 
+def resources() -> List[str]:
+    return [
+        DATA[0],
+        NACE[0],
+    ]
+
+
 def download_gzipped(url: str, filename: str):
     req = Request(url)
     req.add_header("Accept-Encoding", "gzip")
diff --git a/data/szif/main.py b/data/szif/main.py
index 3739247..3c7e5a9 100644
--- a/data/szif/main.py
+++ b/data/szif/main.py
@@ -3,6 +3,7 @@
 import shutil
 from contextlib import closing
 from tempfile import NamedTemporaryFile
+from typing import List
 from urllib.request import urlopen
 from zipfile import ZipFile
 
@@ -13,12 +14,17 @@
     "dokumenty_ke_stazeni%2Fpkp%2Fspd%2Fopendata%2F"
 )
 urls = {
+    2020: BASE_URL + "1622192829773.zip",
     2019: BASE_URL + "1590753721920.zip",
     2018: BASE_URL + "1563197121858.zip",
     2017: BASE_URL + "1563197147275.zip",
 }
 
+
+def resources() -> List[str]:
+    return list(urls.values())
+
+
 def main(outdir: str, partial: bool = False):
     id_prijemce = 1
diff --git a/data/udhpsh/main.py b/data/udhpsh/main.py
index e6fe6b5..c12f148 100644
--- a/data/udhpsh/main.py
+++ b/data/udhpsh/main.py
@@ -1,6 +1,7 @@
 import csv
 import json
 import os
+from typing import List
 from urllib.request import urlopen
 
 HTTP_TIMEOUT = 60
@@ -10,9 +11,15 @@
     "2018": "https://zpravy.udhpsh.cz/zpravy/vfz2018.json",
     "2019": "https://zpravy.udhpsh.cz/zpravy/vfz2019.json",
     "2020": "https://zpravy.udhpsh.cz/zpravy/vfz2020.json",
+    "2021": "https://zpravy.udhpsh.cz/zpravy/vfz2021.json",
 }
 years = sorted(indices.keys())
 
+
+def resources() -> List[str]:
+    return list(indices.values())
+
+
 mappings = {
     "penizefo": {
         "date": "datum",
diff --git a/resource_check.py b/resource_check.py
new file mode 100644
index 0000000..61622dc
--- /dev/null
+++ b/resource_check.py
@@ -0,0 +1,41 @@
+from importlib import import_module
+from urllib.request import urlopen
+
+modules = [
+    "ares",
+    "res",
+    "udhpsh",
+    "cssz",
+    "datovky",
+    "dotinfo",
+    "eufondy",
+    "iissp",
+    "justice",
+    "psp",
+    "steno",
+    "smlouvy",
+    "szif",
+    "upv",
+    "wikidata",
+    "zakazky",
+    "volby",
+    "icij",
+]
+
+# TODO(PR): docs
+if __name__ == "__main__":
+    # TODO(PR): known problems:
+    # - udhpsh and a few others (no Last-Modified header)
+    # - justice (a huge number of data files)
+
+    for module in modules:
+        # TODO(PR): this triggers the imports in each main.py, so the URLs
+        # should maybe be split out elsewhere (or this should run in a venv)
+        try:
+            resources = import_module(f"data.{module}.main").resources()
+        except AttributeError:
+            print(module, "not found")
+            continue
+        for resource in resources:
+            with urlopen(resource) as req:
+                print(module, "\t", resource, "\t", req.headers.get("Last-Modified"))
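# ---------------------------------------------------------------------------
# Sketch (not part of the diff above): resource_check.py probes every
# resource with a plain GET, so checking a bulk dump such as
# ares_vreo_all.tar.gz starts a download of the whole archive just to read
# one header. A HEAD-first probe with a GET fallback would keep the check
# cheap. The helper name and the timeout value below are assumptions, not
# anything this PR defines.

from urllib.error import HTTPError
from urllib.request import Request, urlopen


def last_modified(url: str, timeout: int = 30) -> str:
    """Return the Last-Modified header without downloading the body."""
    try:
        # HEAD asks the server for response headers only
        with urlopen(Request(url, method="HEAD"), timeout=timeout) as resp:
            return resp.headers.get("Last-Modified", "(no Last-Modified)")
    except HTTPError:
        # some servers reject HEAD; retry with GET, reading only the
        # headers -- the body is never consumed before the connection closes
        with urlopen(url, timeout=timeout) as resp:
            return resp.headers.get("Last-Modified", "(no Last-Modified)")


# usage: print(last_modified(url)) for each url yielded by a module's resources()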