Skip to content

Commit

Permalink
+ Post processing, + Transliterations, HTML stats, upgrade geojson-stats
Browse files Browse the repository at this point in the history
  • Loading branch information
emi420 committed Nov 15, 2024
1 parent 0e2e378 commit 655f318
Show file tree
Hide file tree
Showing 12 changed files with 827 additions and 67 deletions.
57 changes: 43 additions & 14 deletions API/api_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from celery import Celery

# Reader imports
from src.app import CustomExport, PolygonStats, GeoJSONStats, RawData, S3FileTransfer
from src.app import CustomExport, PolygonStats, RawData, S3FileTransfer
from src.post_processing.processor import PostProcessor
from src.config import ALLOW_BIND_ZIP_FILTER
from src.config import CELERY_BROKER_URL as celery_broker_uri
from src.config import CELERY_RESULT_BACKEND as celery_backend
Expand All @@ -39,6 +40,7 @@
RawDataCurrentParams,
RawDataOutputType,
)
from src.post_processing.processor import PostProcessor

if ENABLE_SOZIP:
# Third party imports
Expand Down Expand Up @@ -218,19 +220,46 @@ def process_raw_data(self, params, user=None):
)

polygon_stats = None
geojson_stats = None
geojson_stats_html = None

if "include_stats" or "include_translit" in params.dict():
post_processor = PostProcessor({
"include_stats": params.include_stats,
"include_translit": params.include_translit
})

if "include_stats" in params.dict():
if params.include_stats:
geoJSONStats = GeoJSONStats(params.filters)
geom_area, geom_dump, working_dir = RawData(
params, str(self.request.id)
).extract_current_data(file_parts, geoJSONStats.raw_data_line_stats)
geojson_stats = geoJSONStats.json()
else:
geom_area, geom_dump, working_dir = RawData(
params, str(self.request.id)
).extract_current_data(file_parts)
post_processor.filters = params.filters

post_processor.init()

geom_area, geom_dump, working_dir = RawData(
params, str(self.request.id)
).extract_current_data(file_parts, post_processor.post_process_line)

if params.include_stats:
geojson_stats_json = json.dumps(post_processor.geoJSONStats.dict())

# Create a HTML summary of stats
if params.include_stats_html:
tpl = "stats"
if 'waterway' in post_processor.geoJSONStats.config.keys:
tpl = "stats_waterway"
if 'highway' in post_processor.geoJSONStats.config.keys:
tpl = "stats_highway"
if 'building' in post_processor.geoJSONStats.config.keys:
tpl = "stats_building"
project_root = pathlib.Path(__file__).resolve().parent
tpl_path = os.path.join(project_root, "../src/post_processing/{tpl}_tpl.html".format(tpl=tpl))
geojson_stats_html = post_processor.geoJSONStats.html(tpl_path).build()
upload_html_path = os.path.join(working_dir, os.pardir, f"{exportname_parts[-1]}.html")
with open(upload_html_path, "w") as f:
f.write(geojson_stats_html)

else:
geom_area, geom_dump, working_dir = RawData(
params, str(self.request.id)
).extract_current_data(file_parts)

inside_file_size = 0
if "include_stats" in params.dict():
Expand All @@ -248,7 +277,7 @@ def process_raw_data(self, params, user=None):
exportname_parts=exportname_parts,
geom_dump=geom_dump,
polygon_stats=polygon_stats,
geojson_stats=geojson_stats,
geojson_stats=geojson_stats_json,
default_readme=DEFAULT_README_TEXT,
)

Expand All @@ -261,6 +290,7 @@ def process_raw_data(self, params, user=None):
upload_file_path = file_path
inside_file_size += os.path.getsize(file_path)
break # only take one file inside dir , if contains many it should be inside zip

# check if download url will be generated from s3 or not from config
if use_s3_to_upload:
file_transfer_obj = S3FileTransfer()
Expand All @@ -274,7 +304,6 @@ def process_raw_data(self, params, user=None):
pattern = r"(hotosm_project_)(\d+)"
match = re.match(pattern, exportname)
if match:
prefix = match.group(1)
project_number = match.group(2)
if project_number:
upload_name = f"TM/{project_number}/{exportname}"
Expand Down
5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,8 @@ psutil==5.9.8
tqdm==4.66.2

# stats for geojson data
geojson-stats==0.1.0
geojson-stats==0.2.2

# transliterations
transliterate==1.10.2

52 changes: 0 additions & 52 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
from psycopg2.extras import DictCursor
from slugify import slugify
from tqdm import tqdm
from geojson_stats.stats import Stats

# Reader imports
from src.config import (
Expand Down Expand Up @@ -2261,54 +2260,3 @@ def get_summary_stats(self, start_date, end_date, group_by):
return [dict(item) for item in result]


class GeoJSONStats(Stats):
    """Gather statistics while a GeoJSON export is streamed line by line."""

    def __init__(self, filters, *args, **kwargs):
        """
        Configure the underlying ``Stats`` object from the export filters.

        ``filters`` may be falsy; when it carries tag filters, every known
        tag found in them is registered as a key / value key, and the
        matching measurement flag (area or length) is switched on.
        Remaining arguments are forwarded untouched to ``Stats``.
        """
        super().__init__(*args, **kwargs)

        self.config.clean = True
        self.config.properties_prop = "properties.tags"

        if not (filters and filters.tags):
            return

        # Which config flag each group of known tags turns on.
        measured_tags = (
            ("area", ["building"]),
            ("length", ["highway", "waterway"]),
        )
        for measure, candidates in measured_tags:
            for tag in candidates:
                if self.check_filter(filters.tags, tag):
                    self.config.keys.append(tag)
                    self.config.value_keys.append(tag)
                    setattr(self.config, measure, True)

    def check_filter(self, tags, tag):
        """
        Tell whether ``tag`` appears in any of the tag filters.

        Looks at the ``join_or`` and ``join_and`` clauses of the
        ``all_geometry``, ``polygon`` and ``line`` filter groups, in that
        order. Returns True on the first hit and None when nothing matches.
        """
        for group_name in ("all_geometry", "polygon", "line"):
            group = getattr(tags, group_name)
            if group:
                if group.join_or and tag in group.join_or:
                    return True
                if group.join_and and tag in group.join_and:
                    return True
        return None

    def raw_data_line_stats(self, line: str):
        """
        Feed one GeoJSON line to the stats collector and hand the line back
        unchanged, so the method can be used as a pass-through line callback.
        """
        self.process_file_line(line)
        return line
Empty file added src/post_processing/__init__.py
Empty file.
58 changes: 58 additions & 0 deletions src/post_processing/geojson_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from geojson_stats.stats import Stats
from geojson_stats.html import Html

CONFIG_AREA = ["building"]
CONFIG_LENGTH = ["highway", "waterway"]

class GeoJSONStats(Stats):
    """Collect statistics from GeoJSON features as they are processed one by one."""

    def __init__(self, filters, *args, **kwargs):
        """
        Set up the stats configuration from the export filters.

        ``filters`` may be falsy. When tag filters are present, each tag
        listed in CONFIG_AREA / CONFIG_LENGTH that appears in them is added
        to the configured keys and value keys, and area / length
        measurement is enabled accordingly. Extra arguments go straight to
        ``Stats``.
        """
        super().__init__(*args, **kwargs)

        self.config.clean = True
        self.config.properties_prop = "properties.tags"

        if not (filters and filters.tags):
            return

        # Pair each measurement flag with the tags that enable it.
        for measure, known_tags in (("area", CONFIG_AREA), ("length", CONFIG_LENGTH)):
            for tag in known_tags:
                if not self.check_filter(filters.tags, tag):
                    continue
                self.config.keys.append(tag)
                self.config.value_keys.append(tag)
                setattr(self.config, measure, True)

    def check_filter(self, tags, tag):
        """
        Check whether a tag is present in the tag filters.

        The ``all_geometry``, ``polygon`` and ``line`` groups are inspected
        in turn, each through its ``join_or`` and then ``join_and`` clause.
        Returns True as soon as the tag is found, otherwise None.
        """
        for geometry_kind in ("all_geometry", "polygon", "line"):
            group = getattr(tags, geometry_kind)
            if not group:
                continue
            for clause_name in ("join_or", "join_and"):
                clause = getattr(group, clause_name)
                if clause and tag in clause:
                    return True
        return None

    def raw_data_line_stats(self, json_object: dict):
        """
        Add one already-parsed GeoJSON object to the running statistics.

        Nothing is returned; the object is only passed to
        ``get_object_stats``.
        """
        self.get_object_stats(json_object)

    def html(self, tpl):
        """Build an ``Html`` report object for these stats from template ``tpl``."""
        return Html(tpl, self)
39 changes: 39 additions & 0 deletions src/post_processing/processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@

import json
from .transliterator import Transliterator
from .geojson_stats import GeoJSONStats

class PostProcessor():
    """
    Post-process GeoJSON data while it is handled line by line.

    Usage: build it with an options dict, optionally assign ``filters``,
    call ``init()`` once, then pass ``post_process_line`` as the per-line
    callback.

    All state lives on the instance. It used to be declared as mutable
    class attributes, so the callback list was shared by every
    PostProcessor in the process and kept growing in a long-lived worker.
    """

    def __init__(self, options, *args, **kwargs):
        """
        Args:
            options: mapping that may hold the boolean flags
                "include_stats" and "include_translit". None is treated
                as an empty mapping.
        """
        self.options = options if options is not None else {}
        # Tag filters handed to GeoJSONStats; callers set this before init().
        self.filters = {}
        # Callables applied, in order, to every parsed line object.
        self.functions = []

    def post_process_line(self, line: str) -> str:
        """
        Parse a line, run every registered function over it and return it.

        The functions receive the parsed object and may modify it in
        place; their return values are ignored. The object is then
        serialized back to a JSON string.
        """

        line_object = json.loads(line)

        for fn in self.functions:
            fn(line_object)

        return json.dumps(line_object)

    def init(self):
        """
        Initialize the post-processor from ``options`` and ``filters``.

        The callback list is rebuilt from scratch, so calling this more
        than once does not register the same step twice. A missing option
        key counts as disabled.
        """

        self.functions = []

        if self.options.get("include_stats"):
            self.geoJSONStats = GeoJSONStats(self.filters)
            self.functions.append(self.geoJSONStats.raw_data_line_stats)

        if self.options.get("include_translit"):
            self.transliterator = Transliterator()
            self.functions.append(self.transliterator.translit)
Loading

0 comments on commit 655f318

Please sign in to comment.