diff --git a/API/api_worker.py b/API/api_worker.py index 0a90ce21..a7085178 100644 --- a/API/api_worker.py +++ b/API/api_worker.py @@ -15,7 +15,8 @@ from celery import Celery # Reader imports -from src.app import CustomExport, PolygonStats, GeoJSONStats, RawData, S3FileTransfer +from src.app import CustomExport, PolygonStats, RawData, S3FileTransfer +from src.post_processing.processor import PostProcessor from src.config import ALLOW_BIND_ZIP_FILTER from src.config import CELERY_BROKER_URL as celery_broker_uri from src.config import CELERY_RESULT_BACKEND as celery_backend @@ -39,6 +40,7 @@ RawDataCurrentParams, RawDataOutputType, ) +from src.post_processing.processor import PostProcessor if ENABLE_SOZIP: # Third party imports @@ -218,19 +220,46 @@ def process_raw_data(self, params, user=None): ) polygon_stats = None - geojson_stats = None + geojson_stats_html = None + + if "include_stats" or "include_translit" in params.dict(): + post_processor = PostProcessor({ + "include_stats": params.include_stats, + "include_translit": params.include_translit + }) - if "include_stats" in params.dict(): if params.include_stats: - geoJSONStats = GeoJSONStats(params.filters) - geom_area, geom_dump, working_dir = RawData( - params, str(self.request.id) - ).extract_current_data(file_parts, geoJSONStats.raw_data_line_stats) - geojson_stats = geoJSONStats.json() - else: - geom_area, geom_dump, working_dir = RawData( - params, str(self.request.id) - ).extract_current_data(file_parts) + post_processor.filters = params.filters + + post_processor.init() + + geom_area, geom_dump, working_dir = RawData( + params, str(self.request.id) + ).extract_current_data(file_parts, post_processor.post_process_line) + + if params.include_stats: + geojson_stats_json = json.dumps(post_processor.geoJSONStats.dict()) + + # Create a HTML summary of stats + if params.include_stats_html: + tpl = "stats" + if 'waterway' in post_processor.geoJSONStats.config.keys: + tpl = "stats_waterway" + if 'highway' in 
post_processor.geoJSONStats.config.keys: + tpl = "stats_highway" + if 'building' in post_processor.geoJSONStats.config.keys: + tpl = "stats_building" + project_root = pathlib.Path(__file__).resolve().parent + tpl_path = os.path.join(project_root, "../src/post_processing/{tpl}_tpl.html".format(tpl=tpl)) + geojson_stats_html = post_processor.geoJSONStats.html(tpl_path).build() + upload_html_path = os.path.join(working_dir, os.pardir, f"{exportname_parts[-1]}.html") + with open(upload_html_path, "w") as f: + f.write(geojson_stats_html) + + else: + geom_area, geom_dump, working_dir = RawData( + params, str(self.request.id) + ).extract_current_data(file_parts) inside_file_size = 0 if "include_stats" in params.dict(): @@ -248,7 +277,7 @@ def process_raw_data(self, params, user=None): exportname_parts=exportname_parts, geom_dump=geom_dump, polygon_stats=polygon_stats, - geojson_stats=geojson_stats, + geojson_stats=geojson_stats_json, default_readme=DEFAULT_README_TEXT, ) @@ -261,6 +290,7 @@ def process_raw_data(self, params, user=None): upload_file_path = file_path inside_file_size += os.path.getsize(file_path) break # only take one file inside dir , if contains many it should be inside zip + # check if download url will be generated from s3 or not from config if use_s3_to_upload: file_transfer_obj = S3FileTransfer() @@ -274,7 +304,6 @@ def process_raw_data(self, params, user=None): pattern = r"(hotosm_project_)(\d+)" match = re.match(pattern, exportname) if match: - prefix = match.group(1) project_number = match.group(2) if project_number: upload_name = f"TM/{project_number}/{exportname}" diff --git a/requirements.txt b/requirements.txt index 6e87b47b..9e7b5a68 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,5 +56,8 @@ psutil==5.9.8 tqdm==4.66.2 # stats for geojson data -geojson-stats==0.1.0 +geojson-stats==0.2.2 + +# transliterations +transliterate==1.10.2 diff --git a/src/app.py b/src/app.py index 60d8fcdf..580530a5 100644 --- a/src/app.py +++ b/src/app.py 
@@ -47,7 +47,6 @@ from psycopg2.extras import DictCursor from slugify import slugify from tqdm import tqdm -from geojson_stats.stats import Stats # Reader imports from src.config import ( @@ -2261,54 +2260,3 @@ def get_summary_stats(self, start_date, end_date, group_by): return [dict(item) for item in result] -class GeoJSONStats(Stats): - """Used for collecting stats while processing GeoJSON files line by line""" - - def __init__(self, filters, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.config.clean = True - self.config.properties_prop = "properties.tags" - - if filters and filters.tags: - config_area = ["building"] - config_length = ["highway", "waterway"] - - for tag in config_area: - if self.check_filter(filters.tags, tag): - self.config.keys.append(tag) - self.config.value_keys.append(tag) - self.config.area = True - for tag in config_length: - if self.check_filter(filters.tags, tag): - self.config.keys.append(tag) - self.config.value_keys.append(tag) - self.config.length = True - - def check_filter(self, tags, tag): - """ - Check if a tag is present in tag filters - """ - - if tags.all_geometry: - if tags.all_geometry.join_or and tag in tags.all_geometry.join_or: - return True - if tags.all_geometry.join_and and tag in tags.all_geometry.join_and: - return True - if tags.polygon: - if tags.polygon.join_or and tag in tags.polygon.join_or: - return True - if tags.polygon.join_and and tag in tags.polygon.join_and: - return True - if tags.line: - if tags.line.join_or and tag in tags.line.join_or: - return True - if tags.line.join_and and tag in tags.line.join_and: - return True - - def raw_data_line_stats(self, line: str): - """ - Process a GeoJSON line (for getting stats) and return that line - """ - self.process_file_line(line) - return line diff --git a/src/post_processing/__init__.py b/src/post_processing/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/post_processing/geojson_stats.py 
# src/post_processing/geojson_stats.py
from geojson_stats.stats import Stats
from geojson_stats.html import Html

# Tag keys that switch on area measurement when present in the filters.
CONFIG_AREA = ["building"]
# Tag keys that switch on length measurement when present in the filters.
CONFIG_LENGTH = ["highway", "waterway"]


class GeoJSONStats(Stats):
    """Collect stats while GeoJSON features are processed one at a time.

    The constructor configures the underlying ``Stats`` object from the
    export's tag filters: every tag listed in ``CONFIG_AREA`` or
    ``CONFIG_LENGTH`` that appears in the filters is registered as a key
    and value key, and the matching ``area`` / ``length`` flag is enabled.
    """

    def __init__(self, filters, *args, **kwargs):
        """Configure the stats collector.

        Args:
            filters: Export filters. Only ``filters.tags`` is read here; it
                may be falsy, in which case no tag is registered.
            *args: Forwarded unchanged to ``Stats.__init__``.
            **kwargs: Forwarded unchanged to ``Stats.__init__``.
        """
        super().__init__(*args, **kwargs)

        self.config.clean = True
        self.config.properties_prop = "properties.tags"

        if filters and filters.tags:
            # Area tags are handled first, then length tags, so the order of
            # config.keys / config.value_keys is the same as before.
            for measured_tags, measure in (
                (CONFIG_AREA, "area"),
                (CONFIG_LENGTH, "length"),
            ):
                for tag in measured_tags:
                    if self.check_filter(filters.tags, tag):
                        self.config.keys.append(tag)
                        self.config.value_keys.append(tag)
                        setattr(self.config, measure, True)

    def check_filter(self, tags, tag):
        """Check whether a tag is present in the tag filters.

        Args:
            tags: Tag filters exposing ``all_geometry``, ``polygon`` and
                ``line``; each may be falsy or carry ``join_or`` /
                ``join_and`` containers.
            tag: Tag key to look for.

        Returns:
            True if ``tag`` is in the ``join_or`` or ``join_and`` of any of
            the three geometry groups, False otherwise.
        """
        # Attributes are read one at a time, in the original order, so a
        # match on an earlier group never touches the later ones.
        for group_name in ("all_geometry", "polygon", "line"):
            group = getattr(tags, group_name)
            if not group:
                continue
            if group.join_or and tag in group.join_or:
                return True
            if group.join_and and tag in group.join_and:
                return True
        return False

    def raw_data_line_stats(self, json_object: dict):
        """Feed one already-parsed GeoJSON object into the stats collector.

        The object is handed to ``get_object_stats``; nothing is returned.
        """
        self.get_object_stats(json_object)

    def html(self, tpl):
        """Return an ``Html`` object created from ``tpl`` and these stats.

        Args:
            tpl: Template passed straight to ``Html`` as its first argument
                (the caller in this patch passes a template file path).
        """
        return Html(tpl, self)
# src/post_processing/processor.py
import json


class PostProcessor:
    """Post-process GeoJSON data line by line while it is being exported.

    Typical use: construct with the option flags, optionally assign
    ``filters``, call ``init()`` once, then pass every line through
    ``post_process_line``.

    State is kept per instance. The previous class-level ``functions`` list
    was shared by every instance in the process, so callbacks registered for
    one export were run again for all later ones.
    """

    def __init__(self, options, *args, **kwargs):
        """Store the options and start with no filters and no callbacks.

        Args:
            options: Mapping of feature flags; ``init()`` reads
                ``"include_stats"`` and ``"include_translit"``.
            *args: Accepted for interface compatibility; ignored.
            **kwargs: Accepted for interface compatibility; ignored.
        """
        self.options = options
        # Callers may replace this before init(); it is passed to GeoJSONStats.
        self.filters = {}
        # Callables applied, in order, to each parsed line object.
        self.functions = []

    def post_process_line(self, line: str):
        """Parse a line, run the registered functions over it and return it.

        Args:
            line: One JSON document as text.

        Returns:
            The object re-serialized with ``json.dumps`` after all functions
            have run.
        """
        line_object = json.loads(line)

        # Return values are ignored: a function can only affect the output
        # by mutating line_object in place.
        for fn in self.functions:
            fn(line_object)

        return json.dumps(line_object)

    def init(self):
        """Register the callbacks selected by ``options``.

        A missing flag is treated as disabled. The helpers are imported
        here so that a feature's module is only loaded when that feature
        is switched on.
        """
        if self.options.get("include_stats"):
            from .geojson_stats import GeoJSONStats

            self.geoJSONStats = GeoJSONStats(self.filters)
            self.functions.append(self.geoJSONStats.raw_data_line_stats)

        if self.options.get("include_translit"):
            from .transliterator import Transliterator

            self.transliterator = Transliterator()
            self.functions.append(self.transliterator.translit)
+ + + + + + +Elements identified as distinct
+Including local language and English
+Hot Key Counts | +Count | +% | +
---|---|---|
Total features | +${count} | +100% | +
${key_0} | +${key_0_count} | +${key_0_percent}% | +
${key_1} | +${key_1_count} | +${key_1_percent}% | +
${key_2} | +${key_2_count} | +${key_2_percent}% | +
${key_3} | +${key_3_count} | +${key_3_percent}% | +
${key_4} | +${key_4_count} | +${key_4_percent}% | +
${key_5} | +${key_5_count} | +${key_5_percent}% | +
Elements identified as distinct
+Including local language and English
+Hot Key Counts | +Count | +% | +
---|---|---|
Total features | +${count} | +100% | +
${key_0} | +${key_0_count} | +${key_0_percent}% | +
${key_1} | +${key_1_count} | +${key_1_percent}% | +
${key_2} | +${key_2_count} | +${key_2_percent}% | +
${key_3} | +${key_3_count} | +${key_3_percent}% | +
${key_4} | +${key_4_count} | +${key_4_percent}% | +
${key_5} | +${key_5_count} | +${key_5_percent}% | +
Elements identified as distinct
+Including local language and English
+Hot Key Counts | +Count | +% | +
---|---|---|
Total features | +${count} | +100% | +
${key_0} | +${key_0_count} | +${key_0_percent}% | +
${key_1} | +${key_1_count} | +${key_1_percent}% | +
${key_2} | +${key_2_count} | +${key_2_percent}% | +
${key_3} | +${key_3_count} | +${key_3_percent}% | +
${key_4} | +${key_4_count} | +${key_4_percent}% | +
${key_5} | +${key_5_count} | +${key_5_percent}% | +
Elements identified as distinct
+Including local language and English
+Hot Key Counts | +Count | +% | +
---|---|---|
Total features | +${count} | +100% | +
${key_0} | +${key_0_count} | +${key_0_percent}% | +
${key_1} | +${key_1_count} | +${key_1_percent}% | +
${key_2} | +${key_2_count} | +${key_2_percent}% | +
${key_3} | +${key_3_count} | +${key_3_percent}% | +
${key_4} | +${key_4_count} | +${key_4_percent}% | +
${key_5} | +${key_5_count} | +${key_5_percent}% | +