From d75b259d5d2e2f8352dc3677f400f1f3e93c712a Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Fri, 2 Dec 2022 09:44:27 +1300 Subject: [PATCH 01/12] refactor: add basic types to gdalinfo response --- scripts/gdal/gdalinfo.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/scripts/gdal/gdalinfo.py b/scripts/gdal/gdalinfo.py index 437808f15..1b4a5ac02 100644 --- a/scripts/gdal/gdalinfo.py +++ b/scripts/gdal/gdalinfo.py @@ -1,24 +1,38 @@ import json import re -from typing import Any, Dict +from typing import Any, Dict, List, TypedDict from linz_logger import get_log from scripts.gdal.gdal_helper import GDALExecutionException, run_gdal - -def gdal_info(path: str) -> Dict[Any, Any]: +class GdalInfoBands(TypedDict): + band: int + block: List[int] + type: str + colorInterpretation: str + +class GdalInfo(TypedDict): + description: str + driverShortName: str + driverLongName: str + files: List[str] + size: List[int] + geoTransform: List[float] + metadata: Dict[any, any] + cornerCoordinates: Dict[any, any] + extent: Dict[any, any] + bands: List[GdalInfoBands] + +def gdal_info(path: str) -> GdalInfo: # Set GDAL_PAM_ENABLED to NO to temporarily diable PAM support and prevent creation of auxiliary XML file. gdalinfo_command = ["gdalinfo", "-stats", "-json", "--config", "GDAL_PAM_ENABLED", "NO"] - gdalinfo_result = {} try: gdalinfo_process = run_gdal(gdalinfo_command, path) - try: - gdalinfo_result = json.loads(gdalinfo_process.stdout) - except json.JSONDecodeError as e: - get_log().error("load_gdalinfo_result_error", file=path, error=e) - raise e - return gdalinfo_result + return json.loads(gdalinfo_process.stdout) + except json.JSONDecodeError as e: + get_log().error("load_gdalinfo_result_error", file=path, error=e) + raise e except GDALExecutionException as gee: get_log().error("gdalinfo_failed", file=path, error=str(gee)) raise gee From fae9de71e56ceb2d46bc586aeb140fafdf4e8253 Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Fri, 2 Dec 2022 11:18:38 +1300 Subject: [PATCH 02/12] feat: detect the band count in imagery --- scripts/gdal/gdal_helper.py | 6 +- scripts/gdal/gdal_preset.py | 169 ++++++++++++++++++------------------ scripts/gdal/gdalinfo.py | 9 +- scripts/standardising.py | 6 +- 4 files changed, 99 insertions(+), 91 deletions(-) diff --git a/scripts/gdal/gdal_helper.py b/scripts/gdal/gdal_helper.py index a01753da7..0dbc13c83 100644 --- a/scripts/gdal/gdal_helper.py +++ b/scripts/gdal/gdal_helper.py @@ -57,14 +57,14 @@ def get_gdal_version() -> str: def run_gdal( command: List[str], - input_file: Optional[str] = None, + input_file: str, output_file: Optional[str] = None, ) -> "subprocess.CompletedProcess[bytes]": """Run the GDAL command. The permissions to access to the input file are applied to the gdal environment. Args: command (List[str]): each arguments of the GDAL command. - input_file (str, optional): the input file path. + input_file str: the input file path. output_file (str, optional): the output file path. Raises: @@ -102,7 +102,7 @@ def run_gdal( if proc.stderr: get_log().warning("run_gdal_stderr", command=command_to_string(temp_command), stderr=proc.stderr.decode()) - get_log().debug("run_gdal_succeeded", command=command_to_string(temp_command), stdout=proc.stdout.decode()) + get_log().trace("run_gdal_succeeded", command=command_to_string(temp_command), stdout=proc.stdout.decode()) return proc diff --git a/scripts/gdal/gdal_preset.py b/scripts/gdal/gdal_preset.py index e3022c7df..7f3220bc0 100644 --- a/scripts/gdal/gdal_preset.py +++ b/scripts/gdal/gdal_preset.py @@ -1,122 +1,123 @@ -from typing import List +from typing import List, Optional from linz_logger import get_log +from scripts.gdal.gdalinfo import gdal_info, GdalInfoBands -GDAL_PRESET_LZW = [ - "gdal_translate", - "-q", - "-scale", + +# Force the source projection as NZTM EPSG:2193 +NZTM_SOURCE = [ + "-a_srs", + "EPSG:2193" +] + +# Scale imagery from 0-255 to 0-254 then set 255 as NO_DATA +# Useful for imagery that does not have a alpha band +SCALE_254_ADD_NO_DATA = [ + "-scale", "0", "255", "0", "254", - "-a_srs", - "EPSG:2193", "-a_nodata", - "255", - "-b", - "1", - "-b", - "2", - "-b", - "3", - "-of", - "COG", - "-co", - "compress=lzw", - "-co", - "num_threads=all_cpus", - "-co", - "predictor=2", - "-co", - "overview_compress=webp", - "-co", - "bigtiff=yes", - "-co", - "overview_resampling=lanczos", - "-co", - "blocksize=512", - "-co", - "overview_quality=90", - "-co", - "sparse_ok=true", + "255" ] -GDAL_PRESET_WEBP = [ - "gdal_translate", +BASE_COG = [ + # ?? "-q", - "-a_srs", - "EPSG:2193", - "-b", - "1", - "-b", - "2", - "-b", - "3", + # Output to a COG "-of", "COG", + # Tile the image int 512x512px images "-co", - "compress=webp", + "blocksize=512", + # Ensure all CPUs are used for gdal translate "-co", "num_threads=all_cpus", + # If not all tiles are needed in the tiff, instead of writing empty images write a null byte + # this significantly reduces the size of tiffs which are very sparse "-co", - "quality=100", - "-co", - "overview_compress=webp", + "sparse_ok=true", + # Force everything into big tiff + # this converts all offsets from 32bit to 64bit to support TIFFs > 4GB in size "-co", "bigtiff=yes", +] + +COMPRESS_LZW = [ + # Compress as LZW "-co", - "overview_resampling=lanczos", - "-co", - "blocksize=512", - "-co", - "overview_quality=90", + "compress=lzw", + # Predictor two reduces file size "-co", - "sparse_ok=true", + "predictor=2", ] -GDAL_PRESET_GRAY_WEBP = [ - "gdal_translate", - "-q", - "-a_srs", - "EPSG:2193", - "-b", - "1", - "-b", - "1", - "-b", - "1", - "-a_nodata", - "255", - "-of", - "COG", +COMPRESS_WEBP_LOSSLESS =[ + # Comppress into webp "-co", "compress=webp", - "-co", - "num_threads=all_cpus", + # Compress losslessly "-co", "quality=100", +] + +WEBP_OVERVIEWS = [ + # When creating overviews also compress them into Webp "-co", "overview_compress=webp", - "-co", - "bigtiff=yes", + # When resampling overviews use lanczos + # see https://github.com/linz/basemaps/blob/master/docs/imagery/cog.quality.md "-co", "overview_resampling=lanczos", - "-co", - "blocksize=512", + # Reduce quality of overviews to 90% "-co", "overview_quality=90", - "-co", - "sparse_ok=true", ] - def get_gdal_command(preset: str) -> List[str]: get_log().info("gdal_preset", preset=preset) + gdal_command:List[str] = ["gdal_translate"] + + gdal_command.extend(BASE_COG) + gdal_command.extend(NZTM_SOURCE) + if preset == "lzw": - return GDAL_PRESET_LZW - if preset == "webp": - return GDAL_PRESET_WEBP - if preset == "gray_webp": - return GDAL_PRESET_GRAY_WEBP - raise Exception(f"Unknown GDAL preset: {preset}") + gdal_command.extend(SCALE_254_ADD_NO_DATA) + gdal_command.extend(COMPRESS_LZW) + + elif preset == "webp": + gdal_command.extend(COMPRESS_WEBP_LOSSLESS) + + gdal_command.extend(WEBP_OVERVIEWS) + + return gdal_command + +# Find a band from the color interpretation +def find_band(bands: List[GdalInfoBands], color: str) -> Optional[GdalInfoBands]: + for band in bands: + if band['colorInterpretation'] == color: + return band + return None + +# Determine what band numbers to use for the "-b" overrides for gdal_translate +def get_gdal_band_offset(file: str) -> List[str]: + info = gdal_info(file, False) + + bands = info['bands']; + + # Grey scale imagery, set R,G and B to just the grey_band + grey_band = find_band(bands, 'Gray') + if grey_band: + grand_band_index = str(grey_band['band']) + return ["-b", grand_band_index, "-b", grand_band_index, "-b", grand_band_index] + + band_red = find_band(bands, 'Red') + band_green = find_band(bands, 'Green') + band_blue = find_band(bands, 'Blue') + + if band_red is None or band_green is None or band_blue is None: + get_log().warn("gdal_info_bands_failed", band_red=band_red is None, band_green=band_green is None, band_blue=band_blue is None) + return ["-b", "1", "-b", "2", "-b", "3"] + + return ["-b", str(band_red['band']), "-b", str(band_green['band']), "-b", str(band_blue['band'])] diff --git a/scripts/gdal/gdalinfo.py b/scripts/gdal/gdalinfo.py index 1b4a5ac02..166bd249f 100644 --- a/scripts/gdal/gdalinfo.py +++ b/scripts/gdal/gdalinfo.py @@ -24,9 +24,14 @@ class GdalInfo(TypedDict): extent: Dict[any, any] bands: List[GdalInfoBands] -def gdal_info(path: str) -> GdalInfo: +def gdal_info(path: str, stats: bool = True) -> GdalInfo: # Set GDAL_PAM_ENABLED to NO to temporarily diable PAM support and prevent creation of auxiliary XML file. - gdalinfo_command = ["gdalinfo", "-stats", "-json", "--config", "GDAL_PAM_ENABLED", "NO"] + gdalinfo_command = ["gdalinfo", "-json", "--config", "GDAL_PAM_ENABLED", "NO"] + + # Stats takes a while to generate only generate if needed + if stats: + gdalinfo_command.append("-stats") + try: gdalinfo_process = run_gdal(gdalinfo_command, path) return json.loads(gdalinfo_process.stdout) diff --git a/scripts/standardising.py b/scripts/standardising.py index 4f79db6a2..770e3a5ae 100644 --- a/scripts/standardising.py +++ b/scripts/standardising.py @@ -10,7 +10,7 @@ from scripts.cli.cli_helper import format_source, is_argo from scripts.files.files_helper import get_file_name_from_path, is_tiff from scripts.gdal.gdal_helper import get_gdal_version, run_gdal -from scripts.gdal.gdal_preset import get_gdal_command +from scripts.gdal.gdal_preset import get_gdal_command, get_gdal_band_offset from scripts.logging.time_helper import time_in_ms @@ -41,13 +41,15 @@ def start_standardising(files: List[str], preset: str, concurrency: int) -> List def standardising(file: str, preset: str) -> str: output_folder = "/tmp/" - get_log().info(f"standardising {file}", path=file) + get_log().info(f"standardising", path=file) _, src_file_path = parse_path(file) standardized_file_name = f"{get_file_name_from_path(src_file_path)}.tiff" tmp_file_path = os.path.join(output_folder, standardized_file_name) command = get_gdal_command(preset) + command.extend(get_gdal_band_offset(file)) + run_gdal(command, input_file=file, output_file=tmp_file_path) return tmp_file_path From 22e1ec0f049e13be9a25ae66c11d066321f27a10 Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Fri, 2 Dec 2022 12:00:30 +1300 Subject: [PATCH 03/12] wip: attempt to apply a cutline --- scripts/files/files_helper.py | 2 ++ scripts/gdal/gdal_preset.py | 26 ++++++++++++++++++++++---- scripts/gdal/gdalinfo.py | 2 +- scripts/standardising.py | 33 +++++++++++++++++++++++---------- 4 files changed, 48 insertions(+), 15 deletions(-) diff --git a/scripts/files/files_helper.py b/scripts/files/files_helper.py index 9dd9ac754..bd3013ce7 100644 --- a/scripts/files/files_helper.py +++ b/scripts/files/files_helper.py @@ -9,6 +9,8 @@ def get_file_name_from_path(path: str) -> str: def is_tiff(path: str) -> bool: return path.lower().endswith((".tiff", ".tif")) +def is_vrt(path: str) -> bool: + return path.lower().endswith(".vrt") def is_json(path: str) -> bool: return path.lower().endswith(".json") diff --git a/scripts/gdal/gdal_preset.py b/scripts/gdal/gdal_preset.py index 7f3220bc0..faae4cbb2 100644 --- a/scripts/gdal/gdal_preset.py +++ b/scripts/gdal/gdal_preset.py @@ -106,11 +106,16 @@ def get_gdal_band_offset(file: str) -> List[str]: bands = info['bands']; + alpha_band = find_band(bands, 'Alpha') + alpha_band_info: List[str] = [] + if alpha_band: + alpha_band_info.extend(['-b', str(alpha_band['band'])]) + # Grey scale imagery, set R,G and B to just the grey_band grey_band = find_band(bands, 'Gray') if grey_band: grand_band_index = str(grey_band['band']) - return ["-b", grand_band_index, "-b", grand_band_index, "-b", grand_band_index] + return ["-b", grand_band_index, "-b", grand_band_index, "-b", grand_band_index] + alpha_band_info band_red = find_band(bands, 'Red') band_green = find_band(bands, 'Green') @@ -118,6 +123,19 @@ def get_gdal_band_offset(file: str) -> List[str]: if band_red is None or band_green is None or band_blue is None: get_log().warn("gdal_info_bands_failed", band_red=band_red is None, band_green=band_green is None, band_blue=band_blue is None) - return ["-b", "1", "-b", "2", "-b", "3"] - - return ["-b", str(band_red['band']), "-b", str(band_green['band']), "-b", str(band_blue['band'])] + return ["-b", "1", "-b", "2", "-b", "3"] + alpha_band_info + + return ["-b", str(band_red['band']), "-b", str(band_green['band']), "-b", str(band_blue['band'])] + alpha_band_info + + +# Get a command to create a virtual file which has a cutline and alpha applied +def get_cutline_command(cutline: str)-> List[str] : + return [ + 'gdalwarp', + # Outputting a VRT makes things faster as its not recomputing everything + '-of', 'VRT', + # Apply the cutline + '-cutline', cutline, + # Ensure the target has a alpha channel + '-dstalpha' + ] \ No newline at end of file diff --git a/scripts/gdal/gdalinfo.py b/scripts/gdal/gdalinfo.py index 166bd249f..6cfef7bd9 100644 --- a/scripts/gdal/gdalinfo.py +++ b/scripts/gdal/gdalinfo.py @@ -1,6 +1,6 @@ import json import re -from typing import Any, Dict, List, TypedDict +from typing import Dict, List, TypedDict from linz_logger import get_log diff --git a/scripts/standardising.py b/scripts/standardising.py index 770e3a5ae..9c1ef797a 100644 --- a/scripts/standardising.py +++ b/scripts/standardising.py @@ -1,20 +1,21 @@ import argparse import os +import ulid from functools import partial from multiprocessing import Pool -from typing import List +from typing import List, Optional from linz_logger import get_log from scripts.aws.aws_helper import parse_path from scripts.cli.cli_helper import format_source, is_argo -from scripts.files.files_helper import get_file_name_from_path, is_tiff +from scripts.files.files_helper import get_file_name_from_path, is_tiff, is_vrt from scripts.gdal.gdal_helper import get_gdal_version, run_gdal -from scripts.gdal.gdal_preset import get_gdal_command, get_gdal_band_offset +from scripts.gdal.gdal_preset import get_cutline_command, get_gdal_command, get_gdal_band_offset from scripts.logging.time_helper import time_in_ms -def start_standardising(files: List[str], preset: str, concurrency: int) -> List[str]: +def start_standardising(files: List[str], preset: str, cutline: Optional[str], concurrency: int) -> List[str]: start_time = time_in_ms() tiff_files = [] output_files = [] @@ -23,13 +24,13 @@ def start_standardising(files: List[str], preset: str, concurrency: int) -> List get_log().info("standardising_start", gdalVersion=gdal_version) for file in files: - if is_tiff(file): + if is_tiff(file) or is_vrt(file): tiff_files.append(file) else: get_log().info("standardising_file_not_tiff_skipped", file=file) with Pool(concurrency) as p: - output_files = p.map(partial(standardising, preset=preset), tiff_files) + output_files = p.map(partial(standardising, preset=preset, cutline=cutline), tiff_files) p.close() p.join() @@ -38,7 +39,7 @@ def start_standardising(files: List[str], preset: str, concurrency: int) -> List return output_files -def standardising(file: str, preset: str) -> str: +def standardising(file: str, preset: str, cutline: Optional[str]) -> str: output_folder = "/tmp/" get_log().info(f"standardising", path=file) @@ -47,10 +48,21 @@ def standardising(file: str, preset: str) -> str: standardized_file_name = f"{get_file_name_from_path(src_file_path)}.tiff" tmp_file_path = os.path.join(output_folder, standardized_file_name) + input_file = file + if cutline: + target_cutline_file = os.path.join(output_folder, str(ulid.ULID()) + ".vrt") + # TODO check if the cutline actually intersects with the input_file as apply a cutline is much slower than conversion + run_gdal(get_cutline_command(cutline), input_file=input_file, output_file=target_cutline_file) + input_file = target_cutline_file + command = get_gdal_command(preset) - command.extend(get_gdal_band_offset(file)) + command.extend(get_gdal_band_offset(input_file)) + + run_gdal(command, input_file=input_file, output_file=tmp_file_path) - run_gdal(command, input_file=file, output_file=tmp_file_path) + # Cleanup temporary file + if input_file != file: + os.remove(input_file) return tmp_file_path @@ -60,6 +72,7 @@ def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--preset", dest="preset", required=True) parser.add_argument("--source", dest="source", nargs="+", required=True) + parser.add_argument("--cutline", dest="cutline", required=False) arguments = parser.parse_args() source = format_source(arguments.source) @@ -67,7 +80,7 @@ def main() -> None: if is_argo(): concurrency = 4 - start_standardising(source, arguments.preset, concurrency) + start_standardising(source, arguments.preset, arguments.cutline, concurrency) if __name__ == "__main__": From a985707e76b85d823523cba8cfbee9dfd28f9763 Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Tue, 6 Dec 2022 11:06:24 +1300 Subject: [PATCH 04/12] refactor: correct mypy typing --- scripts/create_stac.py | 4 +- scripts/files/file_check.py | 16 ++--- scripts/files/files_helper.py | 2 + scripts/files/geotiff.py | 5 +- scripts/files/tests/file_check_test.py | 86 ++++++++++++-------------- scripts/gdal/gdal_helper.py | 2 +- scripts/gdal/gdal_preset.py | 72 ++++++++++----------- scripts/gdal/gdalinfo.py | 27 ++++---- scripts/standardise_validate.py | 6 +- scripts/standardising.py | 2 +- 10 files changed, 113 insertions(+), 109 deletions(-) diff --git a/scripts/create_stac.py b/scripts/create_stac.py index 53cebdb76..ac0d5c6bd 100644 --- a/scripts/create_stac.py +++ b/scripts/create_stac.py @@ -4,7 +4,7 @@ from scripts.files.files_helper import get_file_name_from_path from scripts.files.geotiff import get_extents -from scripts.gdal.gdalinfo import gdal_info +from scripts.gdal.gdalinfo import gdal_info, GdalInfo from scripts.stac.imagery.item import ImageryItem @@ -13,7 +13,7 @@ def create_item( start_datetime: str, end_datetime: str, collection_id: str, - gdalinfo_result: Optional[Dict[Any, Any]] = None, + gdalinfo_result: Optional[GdalInfo] = None, ) -> ImageryItem: id_ = get_file_name_from_path(file) diff --git a/scripts/files/file_check.py b/scripts/files/file_check.py index 18674bc19..cb3c96b4b 100644 --- a/scripts/files/file_check.py +++ b/scripts/files/file_check.py @@ -7,7 +7,7 @@ from scripts.files.files_helper import get_file_name_from_path from scripts.gdal.gdal_helper import GDALExecutionException, run_gdal -from scripts.gdal.gdalinfo import gdal_info +from scripts.gdal.gdalinfo import GdalInfo, gdal_info from scripts.tile.tile_index import Point, TileIndexException, get_tile_name @@ -31,10 +31,10 @@ def __init__( self.scale = scale self.errors: List[Dict[str, Any]] = [] self._valid = True - self._gdalinfo: Dict[Any, Any] = {} + self._gdalinfo: Optional[GdalInfo] = None self._srs = srs - def get_gdalinfo(self) -> Optional[Dict[Any, Any]]: + def get_gdalinfo(self) -> Optional[GdalInfo]: if self.is_error_type(FileCheckErrorType.GDAL_INFO): return None if not self._gdalinfo: @@ -65,7 +65,7 @@ def is_error_type(self, error_type: str) -> bool: return True return False - def check_no_data(self, gdalinfo: Dict[Any, Any]) -> None: + def check_no_data(self, gdalinfo: GdalInfo) -> None: """Add an error if there is no "noDataValue" or the "noDataValue" is not equal to 255 in the "bands".""" bands = gdalinfo["bands"] if "noDataValue" in bands[0]: @@ -74,12 +74,12 @@ def check_no_data(self, gdalinfo: Dict[Any, Any]) -> None: self.add_error( error_type=FileCheckErrorType.NO_DATA, error_message="noDataValue is not 255", - custom_fields={"current": f"{int(current_nodata_val)}"}, + custom_fields={"current": f"{current_nodata_val}"}, ) else: self.add_error(error_type=FileCheckErrorType.NO_DATA, error_message="noDataValue not set") - def check_band_count(self, gdalinfo: Dict[Any, Any]) -> None: + def check_band_count(self, gdalinfo: GdalInfo) -> None: """Add an error if there is no exactly 3 bands found.""" bands = gdalinfo["bands"] bands_num = len(bands) @@ -99,7 +99,7 @@ def check_srs(self, gdalsrsinfo_tif: bytes) -> None: if gdalsrsinfo_tif != self._srs: self.add_error(error_type=FileCheckErrorType.SRS, error_message="different srs") - def check_color_interpretation(self, gdalinfo: Dict[Any, Any]) -> None: + def check_color_interpretation(self, gdalinfo: GdalInfo) -> None: """Add an error if the colors don't match RGB. Args: @@ -125,7 +125,7 @@ def check_color_interpretation(self, gdalinfo: Dict[Any, Any]) -> None: custom_fields={"missing": f"{', '.join(missing_bands)}"}, ) - def check_tile_and_rename(self, gdalinfo: Dict[Any, Any]) -> None: + def check_tile_and_rename(self, gdalinfo: GdalInfo) -> None: origin = Point(gdalinfo["cornerCoordinates"]["upperLeft"][0], gdalinfo["cornerCoordinates"]["upperLeft"][1]) try: tile_name = get_tile_name(origin, self.scale) diff --git a/scripts/files/files_helper.py b/scripts/files/files_helper.py index bd3013ce7..e7edb42b1 100644 --- a/scripts/files/files_helper.py +++ b/scripts/files/files_helper.py @@ -9,8 +9,10 @@ def get_file_name_from_path(path: str) -> str: def is_tiff(path: str) -> bool: return path.lower().endswith((".tiff", ".tif")) + def is_vrt(path: str) -> bool: return path.lower().endswith(".vrt") + def is_json(path: str) -> bool: return path.lower().endswith(".json") diff --git a/scripts/files/geotiff.py b/scripts/files/geotiff.py index f86c82d91..8e7b5ea6f 100644 --- a/scripts/files/geotiff.py +++ b/scripts/files/geotiff.py @@ -1,9 +1,12 @@ from typing import Any, Dict, List, Tuple +from scripts.gdal.gdalinfo import GdalInfo from shapely.geometry import Polygon -def get_extents(gdalinfo_result: Dict[Any, Any]) -> Tuple[List[List[float]], List[float]]: +def get_extents(gdalinfo_result: GdalInfo) -> Tuple[Dict[str, List[float]], List[float]]: + if gdalinfo_result["wgs84Extent"] is None: + raise Exception("No WGS84 Extent was found") geometry = gdalinfo_result["wgs84Extent"] bbox = Polygon(geometry["coordinates"][0]).bounds diff --git a/scripts/files/tests/file_check_test.py b/scripts/files/tests/file_check_test.py index 370d14961..732e7ddf9 100644 --- a/scripts/files/tests/file_check_test.py +++ b/scripts/files/tests/file_check_test.py @@ -1,4 +1,23 @@ +from typing import Optional, cast + from scripts.files.file_check import FileCheck +from scripts.gdal.gdalinfo import GdalInfo, GdalInfoBand + + +def fake_gdal_info() -> GdalInfo: + return cast(GdalInfo, {}) + + +def add_band(gdalinfo: GdalInfo, color_interpretation: Optional[str] = None, no_data_value: Optional[int] = None) -> None: + if gdalinfo.get("bands", None) is None: + gdalinfo["bands"] = [] + + gdalinfo["bands"].append( + cast( + GdalInfoBand, + {"band": len(gdalinfo["bands"]), "colorInterpretation": color_interpretation, "noDataValue": no_data_value}, + ) + ) def test_check_band_count_valid() -> None: @@ -6,8 +25,10 @@ def test_check_band_count_valid() -> None: tests check_band_count when the input layer has a valid band count which is 3 bands """ - gdalinfo = {} - gdalinfo["bands"] = [{"band": 1}, {"band": 2}, {"band": 3}] + gdalinfo = fake_gdal_info() + add_band(gdalinfo) + add_band(gdalinfo) + add_band(gdalinfo) file_check = FileCheck("test", 500, b"test") file_check.check_band_count(gdalinfo) @@ -20,8 +41,9 @@ def test_check_band_count_invalid() -> None: tests check_band_count when the input layer has a invalid band count of 2 which is 3 bands to be valid """ - gdalinfo = {} - gdalinfo["bands"] = [{"band": 1}, {"band": 2}] + gdalinfo = fake_gdal_info() + add_band(gdalinfo) + add_band(gdalinfo) file_check = FileCheck("test", 500, b"test") file_check.check_band_count(gdalinfo) @@ -33,18 +55,10 @@ def test_check_color_interpretation_valid() -> None: """ tests check_color_interpretation with the correct color interpretation """ - gdalinfo = {} - gdalinfo["bands"] = [ - { - "colorInterpretation": "Red", - }, - { - "colorInterpretation": "Green", - }, - { - "colorInterpretation": "Blue", - }, - ] + gdalinfo = fake_gdal_info() + add_band(gdalinfo, color_interpretation="Red") + add_band(gdalinfo, color_interpretation="Green") + add_band(gdalinfo, color_interpretation="Blue") file_check = FileCheck("test", 500, b"test") file_check.check_color_interpretation(gdalinfo) @@ -56,21 +70,11 @@ def test_check_color_interpretation_invalid() -> None: """ tests check_color_interpretation with the incorrect color interpretation """ - gdalinfo = {} - gdalinfo["bands"] = [ - { - "colorInterpretation": "Red", - }, - { - "colorInterpretation": "Green", - }, - { - "colorInterpretation": "Blue", - }, - { - "colorInterpretation": "Undefined", - }, - ] + gdalinfo = fake_gdal_info() + add_band(gdalinfo, color_interpretation="Red") + add_band(gdalinfo, color_interpretation="Green") + add_band(gdalinfo, color_interpretation="Blue") + add_band(gdalinfo, color_interpretation="undefined") file_check = FileCheck("test", 500, b"test") file_check.check_color_interpretation(gdalinfo) @@ -82,12 +86,8 @@ def test_check_no_data_valid() -> None: """ tests check_no_data when the input layer has a valid no data value of 255 """ - gdalinfo = {} - gdalinfo["bands"] = [ - { - "noDataValue": 255, - } - ] + gdalinfo = fake_gdal_info() + add_band(gdalinfo, no_data_value=255) file_check = FileCheck("test", 500, b"test") file_check.check_no_data(gdalinfo) @@ -99,8 +99,8 @@ def test_check_no_data_no_value() -> None: """ tests check_no_data when the input layer has no no_data value assigned """ - gdalinfo = {} - gdalinfo["bands"] = [{"test": 1}] + gdalinfo = fake_gdal_info() + add_band(gdalinfo) file_check = FileCheck("test", 500, b"test") file_check.check_no_data(gdalinfo) @@ -112,12 +112,8 @@ def test_check_no_data_invalid_value() -> None: """ tests check_no_data when the input layer has the wrong value of 0 assigned """ - gdalinfo = {} - gdalinfo["bands"] = [ - { - "noDataValue": 0, - } - ] + gdalinfo = fake_gdal_info() + add_band(gdalinfo, no_data_value=0) file_check = FileCheck("test", 500, b"test") file_check.check_no_data(gdalinfo) diff --git a/scripts/gdal/gdal_helper.py b/scripts/gdal/gdal_helper.py index 0dbc13c83..9dfeff700 100644 --- a/scripts/gdal/gdal_helper.py +++ b/scripts/gdal/gdal_helper.py @@ -57,7 +57,7 @@ def get_gdal_version() -> str: def run_gdal( command: List[str], - input_file: str, + input_file: Optional[str] = None, output_file: Optional[str] = None, ) -> "subprocess.CompletedProcess[bytes]": """Run the GDAL command. The permissions to access to the input file are applied to the gdal environment. diff --git a/scripts/gdal/gdal_preset.py b/scripts/gdal/gdal_preset.py index faae4cbb2..b5dfb001b 100644 --- a/scripts/gdal/gdal_preset.py +++ b/scripts/gdal/gdal_preset.py @@ -1,29 +1,18 @@ from typing import List, Optional from linz_logger import get_log -from scripts.gdal.gdalinfo import gdal_info, GdalInfoBands +from scripts.gdal.gdalinfo import gdal_info, GdalInfoBand # Force the source projection as NZTM EPSG:2193 -NZTM_SOURCE = [ - "-a_srs", - "EPSG:2193" -] +NZTM_SOURCE = ["-a_srs", "EPSG:2193"] # Scale imagery from 0-255 to 0-254 then set 255 as NO_DATA # Useful for imagery that does not have a alpha band -SCALE_254_ADD_NO_DATA = [ - "-scale", - "0", - "255", - "0", - "254", - "-a_nodata", - "255" -] +SCALE_254_ADD_NO_DATA = ["-scale", "0", "255", "0", "254", "-a_nodata", "255"] BASE_COG = [ - # ?? + # Suppress progress monitor and other non-error output. "-q", # Output to a COG "-of", @@ -48,12 +37,12 @@ # Compress as LZW "-co", "compress=lzw", - # Predictor two reduces file size + # Predictor creates smaller files, for RGB imagery "-co", "predictor=2", ] -COMPRESS_WEBP_LOSSLESS =[ +COMPRESS_WEBP_LOSSLESS = [ # Comppress into webp "-co", "compress=webp", @@ -75,9 +64,10 @@ "overview_quality=90", ] + def get_gdal_command(preset: str) -> List[str]: get_log().info("gdal_preset", preset=preset) - gdal_command:List[str] = ["gdal_translate"] + gdal_command: List[str] = ["gdal_translate"] gdal_command.extend(BASE_COG) gdal_command.extend(NZTM_SOURCE) @@ -93,49 +83,55 @@ def get_gdal_command(preset: str) -> List[str]: return gdal_command + # Find a band from the color interpretation -def find_band(bands: List[GdalInfoBands], color: str) -> Optional[GdalInfoBands]: +def find_band(bands: List[GdalInfoBand], color: str) -> Optional[GdalInfoBand]: for band in bands: - if band['colorInterpretation'] == color: + if band["colorInterpretation"] == color: return band return None + # Determine what band numbers to use for the "-b" overrides for gdal_translate def get_gdal_band_offset(file: str) -> List[str]: info = gdal_info(file, False) - bands = info['bands']; + bands = info["bands"] - alpha_band = find_band(bands, 'Alpha') + alpha_band = find_band(bands, "Alpha") alpha_band_info: List[str] = [] if alpha_band: - alpha_band_info.extend(['-b', str(alpha_band['band'])]) + alpha_band_info.extend(["-b", str(alpha_band["band"])]) # Grey scale imagery, set R,G and B to just the grey_band - grey_band = find_band(bands, 'Gray') + grey_band = find_band(bands, "Gray") if grey_band: - grand_band_index = str(grey_band['band']) - return ["-b", grand_band_index, "-b", grand_band_index, "-b", grand_band_index] + alpha_band_info + grey_band_index = str(grey_band["band"]) + return ["-b", grey_band_index, "-b", grey_band_index, "-b", grey_band_index] + alpha_band_info - band_red = find_band(bands, 'Red') - band_green = find_band(bands, 'Green') - band_blue = find_band(bands, 'Blue') + band_red = find_band(bands, "Red") + band_green = find_band(bands, "Green") + band_blue = find_band(bands, "Blue") if band_red is None or band_green is None or band_blue is None: - get_log().warn("gdal_info_bands_failed", band_red=band_red is None, band_green=band_green is None, band_blue=band_blue is None) + get_log().warn( + "gdal_info_bands_failed", band_red=band_red is None, band_green=band_green is None, band_blue=band_blue is None + ) return ["-b", "1", "-b", "2", "-b", "3"] + alpha_band_info - return ["-b", str(band_red['band']), "-b", str(band_green['band']), "-b", str(band_blue['band'])] + alpha_band_info + return ["-b", str(band_red["band"]), "-b", str(band_green["band"]), "-b", str(band_blue["band"])] + alpha_band_info -# Get a command to create a virtual file which has a cutline and alpha applied -def get_cutline_command(cutline: str)-> List[str] : +# Get a command to create a virtual file which has a cutline and alpha applied +def get_cutline_command(cutline: str) -> List[str]: return [ - 'gdalwarp', + "gdalwarp", # Outputting a VRT makes things faster as its not recomputing everything - '-of', 'VRT', + "-of", + "VRT", # Apply the cutline - '-cutline', cutline, + "-cutline", + cutline, # Ensure the target has a alpha channel - '-dstalpha' - ] \ No newline at end of file + "-dstalpha", + ] diff --git a/scripts/gdal/gdalinfo.py b/scripts/gdal/gdalinfo.py index 6cfef7bd9..09d84689b 100644 --- a/scripts/gdal/gdalinfo.py +++ b/scripts/gdal/gdalinfo.py @@ -1,16 +1,19 @@ import json import re -from typing import Dict, List, TypedDict +from typing import Dict, List, Optional, TypedDict, Any, cast from linz_logger import get_log from scripts.gdal.gdal_helper import GDALExecutionException, run_gdal -class GdalInfoBands(TypedDict): + +class GdalInfoBand(TypedDict): band: int block: List[int] type: str colorInterpretation: str + noDataValue: Optional[int] + class GdalInfo(TypedDict): description: str @@ -19,14 +22,16 @@ class GdalInfo(TypedDict): files: List[str] size: List[int] geoTransform: List[float] - metadata: Dict[any, any] - cornerCoordinates: Dict[any, any] - extent: Dict[any, any] - bands: List[GdalInfoBands] + metadata: Dict[Any, Any] + cornerCoordinates: Dict[Any, Any] + extent: Dict[Any, Any] + wgs84Extent: Optional[Dict[str, List[float]]] + bands: List[GdalInfoBand] + def gdal_info(path: str, stats: bool = True) -> GdalInfo: # Set GDAL_PAM_ENABLED to NO to temporarily diable PAM support and prevent creation of auxiliary XML file. - gdalinfo_command = ["gdalinfo", "-json", "--config", "GDAL_PAM_ENABLED", "NO"] + gdalinfo_command = ["gdalinfo", "-json", "--config", "GDAL_PAM_ENABLED", "NO"] # Stats takes a while to generate only generate if needed if stats: @@ -34,13 +39,13 @@ def gdal_info(path: str, stats: bool = True) -> GdalInfo: try: gdalinfo_process = run_gdal(gdalinfo_command, path) - return json.loads(gdalinfo_process.stdout) + return cast(GdalInfo, json.loads(gdalinfo_process.stdout)) except json.JSONDecodeError as e: get_log().error("load_gdalinfo_result_error", file=path, error=e) raise e - except GDALExecutionException as gee: - get_log().error("gdalinfo_failed", file=path, error=str(gee)) - raise gee + except GDALExecutionException as e: + get_log().error("gdalinfo_failed", file=path, error=str(e)) + raise e def format_wkt(wkt: str) -> str: diff --git a/scripts/standardise_validate.py b/scripts/standardise_validate.py index 94e88ce4a..5350ff1fa 100644 --- a/scripts/standardise_validate.py +++ b/scripts/standardise_validate.py @@ -17,7 +17,9 @@ def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--preset", dest="preset", required=True) parser.add_argument("--source", dest="source", nargs="+", required=True) - parser.add_argument("--scale", dest="scale", required=True) + parser.add_argument("--cutline", dest="cutline", help="Optional cutline to cut imagery to", required=True) + + parser.add_argument("--scale", dest="scale", help="Tile grid scale to align output tile to", required=True) parser.add_argument("--collection-id", dest="collection_id", help="Unique id for collection", required=True) parser.add_argument( "--start-datetime", dest="start_datetime", help="start datetime in format YYYY-MM-DD", type=valid_date, required=True @@ -36,7 +38,7 @@ def main() -> None: if is_argo(): concurrency = 4 - standardised_files = start_standardising(source, arguments.preset, concurrency) + standardised_files = start_standardising(source, arguments.preset, arguments.cutline, concurrency) if not standardised_files: get_log().info("Process skipped because no file has been standardised", action="standardise_validate", reason="skip") return diff --git a/scripts/standardising.py b/scripts/standardising.py index 9c1ef797a..23d61ebd2 100644 --- a/scripts/standardising.py +++ b/scripts/standardising.py @@ -50,7 +50,7 @@ def standardising(file: str, preset: str, cutline: Optional[str]) -> str: input_file = file if cutline: - target_cutline_file = os.path.join(output_folder, str(ulid.ULID()) + ".vrt") + target_cutline_file = os.path.join(output_folder, str(ulid.ULID()) + ".vrt") # TODO check if the cutline actually intersects with the input_file as apply a cutline is much slower than conversion run_gdal(get_cutline_command(cutline), input_file=input_file, output_file=target_cutline_file) input_file = target_cutline_file From e17f6c501d011d8fa5ffcd1ef0b9a9c1b5f23069 Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Tue, 6 Dec 2022 11:12:28 +1300 Subject: [PATCH 05/12] refactor: fixup lint/isort --- scripts/create_stac.py | 4 ++-- scripts/files/geotiff.py | 5 +++-- scripts/gdal/gdal_preset.py | 2 +- scripts/gdal/gdalinfo.py | 2 +- scripts/standardising.py | 6 +++--- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/create_stac.py b/scripts/create_stac.py index ac0d5c6bd..72e0a270b 100644 --- a/scripts/create_stac.py +++ b/scripts/create_stac.py @@ -1,10 +1,10 @@ -from typing import Any, Dict, Optional +from typing import Optional from linz_logger import get_log from scripts.files.files_helper import get_file_name_from_path from scripts.files.geotiff import get_extents -from scripts.gdal.gdalinfo import gdal_info, GdalInfo +from scripts.gdal.gdalinfo import GdalInfo, gdal_info from scripts.stac.imagery.item import ImageryItem diff --git a/scripts/files/geotiff.py b/scripts/files/geotiff.py index 8e7b5ea6f..08eb4b582 100644 --- a/scripts/files/geotiff.py +++ b/scripts/files/geotiff.py @@ -1,8 +1,9 @@ -from typing import Any, Dict, List, Tuple -from scripts.gdal.gdalinfo import GdalInfo +from typing import Dict, List, Tuple from shapely.geometry import Polygon +from scripts.gdal.gdalinfo import GdalInfo + def get_extents(gdalinfo_result: GdalInfo) -> Tuple[Dict[str, List[float]], List[float]]: if gdalinfo_result["wgs84Extent"] is None: diff --git a/scripts/gdal/gdal_preset.py b/scripts/gdal/gdal_preset.py index b5dfb001b..b2b847454 100644 --- a/scripts/gdal/gdal_preset.py +++ b/scripts/gdal/gdal_preset.py @@ -1,8 +1,8 @@ from typing import List, Optional from linz_logger import get_log -from scripts.gdal.gdalinfo import gdal_info, GdalInfoBand +from scripts.gdal.gdalinfo import GdalInfoBand, gdal_info # Force the source projection as NZTM EPSG:2193 NZTM_SOURCE = ["-a_srs", "EPSG:2193"] diff --git a/scripts/gdal/gdalinfo.py b/scripts/gdal/gdalinfo.py index 09d84689b..7d8bedd4c 100644 --- a/scripts/gdal/gdalinfo.py +++ b/scripts/gdal/gdalinfo.py @@ -1,6 +1,6 @@ import json import re -from typing import Dict, List, Optional, TypedDict, Any, cast +from typing import Any, Dict, List, Optional, TypedDict, cast from linz_logger import get_log diff --git a/scripts/standardising.py b/scripts/standardising.py index 23d61ebd2..6c8b18d1d 100644 --- a/scripts/standardising.py +++ b/scripts/standardising.py @@ -1,17 +1,17 @@ import argparse import os -import ulid from functools import partial from multiprocessing import Pool from typing import List, Optional +import ulid from linz_logger import get_log from scripts.aws.aws_helper import parse_path from scripts.cli.cli_helper import format_source, is_argo from scripts.files.files_helper import get_file_name_from_path, is_tiff, is_vrt from scripts.gdal.gdal_helper import get_gdal_version, run_gdal -from scripts.gdal.gdal_preset import get_cutline_command, get_gdal_command, get_gdal_band_offset +from scripts.gdal.gdal_preset import get_cutline_command, get_gdal_band_offset, get_gdal_command from scripts.logging.time_helper import time_in_ms @@ -42,7 +42,7 @@ def start_standardising(files: List[str], preset: str, cutline: Optional[str], c def standardising(file: str, preset: str, cutline: Optional[str]) -> str: output_folder = "/tmp/" - get_log().info(f"standardising", path=file) + get_log().info("standardising", path=file) _, src_file_path = parse_path(file) standardized_file_name = f"{get_file_name_from_path(src_file_path)}.tiff" From a220987c4203f0d0066351a3c03678a8260652e7 Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Tue, 6 Dec 2022 11:42:28 +1300 Subject: [PATCH 06/12] feat: download tiff/cutlines if we have multiple s3 locations --- scripts/standardising.py | 43 +++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/scripts/standardising.py b/scripts/standardising.py index 6c8b18d1d..01754260f 100644 --- a/scripts/standardising.py +++ b/scripts/standardising.py @@ -1,13 +1,15 @@ import argparse import os +import tempfile from functools import partial from multiprocessing import Pool from typing import List, Optional +from scripts.files.fs import read, write import ulid from linz_logger import get_log -from scripts.aws.aws_helper import parse_path +from scripts.aws.aws_helper import is_s3, parse_path from scripts.cli.cli_helper import format_source, is_argo from scripts.files.files_helper import get_file_name_from_path, is_tiff, is_vrt from scripts.gdal.gdal_helper import get_gdal_version, run_gdal @@ -38,7 +40,6 @@ def start_standardising(files: List[str], preset: str, cutline: Optional[str], c return output_files - def standardising(file: str, preset: str, cutline: Optional[str]) -> str: output_folder = "/tmp/" @@ -46,25 +47,35 @@ def standardising(file: str, preset: str, cutline: Optional[str]) -> str: _, src_file_path = parse_path(file) standardized_file_name = f"{get_file_name_from_path(src_file_path)}.tiff" - tmp_file_path = os.path.join(output_folder, standardized_file_name) + output_file_path = os.path.join(output_folder, standardized_file_name) + + with tempfile.TemporaryDirectory() as tmp_path: + input_file = file + + # Ensure the remote file can be read locally, having multiple s3 paths with different credentials + # makes it hard for GDAL to do its thing + if is_s3(input_file): + input_file_path = os.path.join(tmp_path, str(ulid.ULID()) + ".tiff") + write(input_file_path, read(input_file)) + input_file = input_file_path; + + if cutline: + input_cutline_path = os.path.join(tmp_path, str(ulid.ULID()) + ".vrt") + # Ensure the input cutline is a easy spot for GDAL to read + write(input_cutline_path, read(cutline)) - input_file = file - if cutline: - target_cutline_file = os.path.join(output_folder, str(ulid.ULID()) + ".vrt") - # TODO check if the cutline actually intersects with the input_file as apply a cutline is much slower than conversion - run_gdal(get_cutline_command(cutline), input_file=input_file, output_file=target_cutline_file) - input_file = target_cutline_file + target_vrt = os.path.join(tmp_path, str(ulid.ULID()) + ".vrt") + # TODO check if the cutline actually intersects with the input_file as apply a cutline is much slower than conversion + run_gdal(get_cutline_command(input_cutline_path), input_file=input_file, output_file=target_vrt) + input_file = target_vrt - command = get_gdal_command(preset) - command.extend(get_gdal_band_offset(input_file)) + command = get_gdal_command(preset) + command.extend(get_gdal_band_offset(input_file)) - run_gdal(command, input_file=input_file, output_file=tmp_file_path) + run_gdal(command, input_file=input_file, output_file=output_file_path) - # Cleanup temporary file - if input_file != file: - os.remove(input_file) - return tmp_file_path + return output_file_path def main() -> None: From 21a74dc02cca68edcf958eb4c95d9f4c0f250358 Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Tue, 6 Dec 2022 11:57:34 +1300 Subject: [PATCH 07/12] feat: basic tests for presets --- scripts/gdal/tests/gdal_preset_test.py | 51 ++++++++++++++++++++++++++ scripts/standardising.py | 16 +++++--- 2 files changed, 61 insertions(+), 6 deletions(-) create mode 100644 scripts/gdal/tests/gdal_preset_test.py diff --git a/scripts/gdal/tests/gdal_preset_test.py b/scripts/gdal/tests/gdal_preset_test.py new file mode 100644 index 000000000..8aea7aa0d --- /dev/null +++ b/scripts/gdal/tests/gdal_preset_test.py @@ -0,0 +1,51 @@ +from scripts.gdal.gdal_preset import get_cutline_command, get_gdal_command + + +def test_preset_webp(): + gdal_command = get_gdal_command("webp") + + # Basic cog creation + assert "COG" in gdal_command + assert "blocksize=512" in gdal_command + assert "num_threads=all_cpus" in gdal_command + assert "bigtiff=yes" in gdal_command + + # Webp lossless + assert "compress=webp" in gdal_command + assert "quality=100" in gdal_command + + # Webp overviews + assert "overview_compress=webp" in gdal_command + assert "overview_resampling=lanczos" in gdal_command + assert "overview_quality=90" in gdal_command + + assert "EPSG:2193" in gdal_command + + +def test_preset_lzw(): + gdal_command = get_gdal_command("lzw") + + # Basic cog creation + assert "COG" in gdal_command + assert "blocksize=512" in gdal_command + assert "num_threads=all_cpus" in gdal_command + assert "bigtiff=yes" in gdal_command + + # LZW compression + assert "compress=lzw" in gdal_command + assert "predictor=2" in gdal_command + + # Webp overviews + assert "overview_compress=webp" in gdal_command + assert "overview_resampling=lanczos" in gdal_command + assert "overview_quality=90" in gdal_command + + assert "EPSG:2193" in gdal_command + + +def test_cutline_params(): + gdal_command = get_cutline_command("cutline.fgb") + + assert "-cutline" in gdal_command + assert "cutline.fgb" in gdal_command + assert "-dstalpha" in gdal_command diff --git a/scripts/standardising.py b/scripts/standardising.py index 01754260f..424c3ea05 100644 --- a/scripts/standardising.py +++ b/scripts/standardising.py @@ -40,6 +40,7 @@ def start_standardising(files: List[str], preset: str, cutline: Optional[str], c return output_files + def standardising(file: str, preset: str, cutline: Optional[str]) -> str: output_folder = "/tmp/" @@ -52,17 +53,21 @@ def standardising(file: str, preset: str, cutline: Optional[str]) -> str: with tempfile.TemporaryDirectory() as tmp_path: input_file = file - # Ensure the remote file can be read locally, having multiple s3 paths with different credentials + # Ensure the remote file can be read locally, having multiple s3 paths with different credentials # makes it hard for GDAL to do its thing if is_s3(input_file): input_file_path = os.path.join(tmp_path, str(ulid.ULID()) + ".tiff") write(input_file_path, read(input_file)) - input_file = input_file_path; + input_file = input_file_path if cutline: - input_cutline_path = os.path.join(tmp_path, str(ulid.ULID()) + ".vrt") - # Ensure the input cutline is a easy spot for GDAL to read - write(input_cutline_path, read(cutline)) + input_cutline_path = cutline + if is_s3(cutline): + if not cutline.endswith((".fgb", ".geojson")): + raise Exception(f"Only .fgb or .geojson cutlines are support cutline:{cutline}") + input_cutline_path = os.path.join(tmp_path, str(ulid.ULID()) + os.path.splitext(cutline)[1]) + # Ensure the input cutline is a easy spot for GDAL to read + write(input_cutline_path, read(cutline)) target_vrt = os.path.join(tmp_path, str(ulid.ULID()) + ".vrt") # TODO check if the cutline actually intersects with the input_file as apply a cutline is much slower than conversion @@ -74,7 +79,6 @@ def standardising(file: str, preset: str, cutline: Optional[str]) -> str: run_gdal(command, input_file=input_file, output_file=output_file_path) - return output_file_path From ecc9b0c45afc328008647c87ead02a8cf8e56a2c Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Tue, 6 Dec 2022 11:58:03 +1300 Subject: [PATCH 08/12] refactor: fix import sorting --- scripts/standardising.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/standardising.py b/scripts/standardising.py index 424c3ea05..74b511938 100644 --- a/scripts/standardising.py +++ b/scripts/standardising.py @@ -4,7 +4,6 @@ from functools import partial from multiprocessing import Pool from typing import List, Optional -from scripts.files.fs import read, write import ulid from linz_logger import get_log @@ -12,6 +11,7 @@ from scripts.aws.aws_helper import is_s3, parse_path from scripts.cli.cli_helper import format_source, is_argo from scripts.files.files_helper import get_file_name_from_path, is_tiff, is_vrt +from scripts.files.fs import read, write from scripts.gdal.gdal_helper import get_gdal_version, run_gdal from scripts.gdal.gdal_preset import get_cutline_command, get_gdal_band_offset, get_gdal_command from scripts.logging.time_helper import time_in_ms From 013aeef59215379b4e3f39e365c2d32d51698b74 Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Tue, 6 Dec 2022 12:15:26 +1300 Subject: [PATCH 09/12] refactor add more tests --- scripts/files/tests/file_check_test.py | 20 +--------- scripts/gdal/gdal_bands.py | 44 +++++++++++++++++++++ scripts/gdal/gdal_preset.py | 42 +------------------- scripts/gdal/tests/gdal_bands_test.py | 53 ++++++++++++++++++++++++++ scripts/gdal/tests/gdal_preset_test.py | 6 +-- scripts/gdal/tests/gdalinfo.py | 19 +++++++++ scripts/standardising.py | 6 ++- 7 files changed, 125 insertions(+), 65 deletions(-) create mode 100644 scripts/gdal/gdal_bands.py create mode 100644 scripts/gdal/tests/gdal_bands_test.py create mode 100644 scripts/gdal/tests/gdalinfo.py diff --git a/scripts/files/tests/file_check_test.py b/scripts/files/tests/file_check_test.py index 732e7ddf9..61d0c7221 100644 --- a/scripts/files/tests/file_check_test.py +++ b/scripts/files/tests/file_check_test.py @@ -1,23 +1,5 @@ -from typing import Optional, cast - from scripts.files.file_check import FileCheck -from scripts.gdal.gdalinfo import GdalInfo, GdalInfoBand - - -def fake_gdal_info() -> GdalInfo: - return cast(GdalInfo, {}) - - -def add_band(gdalinfo: GdalInfo, color_interpretation: Optional[str] = None, no_data_value: Optional[int] = None) -> None: - if gdalinfo.get("bands", None) is None: - gdalinfo["bands"] = [] - - gdalinfo["bands"].append( - cast( - GdalInfoBand, - {"band": len(gdalinfo["bands"]), "colorInterpretation": color_interpretation, "noDataValue": no_data_value}, - ) - ) +from scripts.gdal.tests.gdalinfo import add_band, fake_gdal_info def test_check_band_count_valid() -> None: diff --git a/scripts/gdal/gdal_bands.py b/scripts/gdal/gdal_bands.py new file mode 100644 index 000000000..79b3378f4 --- /dev/null +++ b/scripts/gdal/gdal_bands.py @@ -0,0 +1,44 @@ +from typing import List, Optional + +from linz_logger import get_log + +from scripts.gdal.gdalinfo import GdalInfo, GdalInfoBand, gdal_info + + +# Find a band from the color interpretation +def find_band(bands: List[GdalInfoBand], color: str) -> Optional[GdalInfoBand]: + for band in bands: + if band["colorInterpretation"] == color: + return band + return None + + +# Determine what band numbers to use for the "-b" overrides for gdal_translate +def get_gdal_band_offset(file: str, info: Optional[GdalInfo] = None) -> List[str]: + if info is None: + info = gdal_info(file, False) + + bands = info["bands"] + + alpha_band = find_band(bands, "Alpha") + alpha_band_info: List[str] = [] + if alpha_band: + alpha_band_info.extend(["-b", str(alpha_band["band"])]) + + # Grey scale imagery, set R,G and B to just the grey_band + grey_band = find_band(bands, "Gray") + if grey_band: + grey_band_index = str(grey_band["band"]) + return ["-b", grey_band_index, "-b", grey_band_index, "-b", grey_band_index] + alpha_band_info + + band_red = find_band(bands, "Red") + band_green = find_band(bands, "Green") + band_blue = find_band(bands, "Blue") + + if band_red is None or band_green is None or band_blue is None: + get_log().warn( + "gdal_info_bands_failed", band_red=band_red is None, band_green=band_green is None, band_blue=band_blue is None + ) + return ["-b", "1", "-b", "2", "-b", "3"] + alpha_band_info + + return ["-b", str(band_red["band"]), "-b", str(band_green["band"]), "-b", str(band_blue["band"])] + alpha_band_info diff --git a/scripts/gdal/gdal_preset.py b/scripts/gdal/gdal_preset.py index b2b847454..2addf6ac7 100644 --- a/scripts/gdal/gdal_preset.py +++ b/scripts/gdal/gdal_preset.py @@ -1,9 +1,7 @@ -from typing import List, Optional +from typing import List from linz_logger import get_log -from scripts.gdal.gdalinfo import GdalInfoBand, gdal_info - # Force the source projection as NZTM EPSG:2193 NZTM_SOURCE = ["-a_srs", "EPSG:2193"] @@ -84,44 +82,6 @@ def get_gdal_command(preset: str) -> List[str]: return gdal_command -# Find a band from the color interpretation -def find_band(bands: List[GdalInfoBand], color: str) -> Optional[GdalInfoBand]: - for band in bands: - if band["colorInterpretation"] == color: - return band - return None - - -# Determine what band numbers to use for the "-b" overrides for gdal_translate -def get_gdal_band_offset(file: str) -> List[str]: - info = gdal_info(file, False) - - bands = info["bands"] - - alpha_band = find_band(bands, "Alpha") - alpha_band_info: List[str] = [] - if alpha_band: - alpha_band_info.extend(["-b", str(alpha_band["band"])]) - - # Grey scale imagery, set R,G and B to just the grey_band - grey_band = find_band(bands, "Gray") - if grey_band: - grey_band_index = str(grey_band["band"]) - return ["-b", grey_band_index, "-b", grey_band_index, "-b", grey_band_index] + alpha_band_info - - band_red = find_band(bands, "Red") - band_green = find_band(bands, "Green") - band_blue = find_band(bands, "Blue") - - if band_red is None or band_green is None or band_blue is None: - get_log().warn( - "gdal_info_bands_failed", band_red=band_red is None, band_green=band_green is None, band_blue=band_blue is None - ) - return ["-b", "1", "-b", "2", "-b", "3"] + alpha_band_info - - return ["-b", str(band_red["band"]), "-b", str(band_green["band"]), "-b", str(band_blue["band"])] + alpha_band_info - - # Get a command to create a virtual file which has a cutline and alpha applied def get_cutline_command(cutline: str) -> List[str]: return [ diff --git a/scripts/gdal/tests/gdal_bands_test.py b/scripts/gdal/tests/gdal_bands_test.py new file mode 100644 index 000000000..5160285d5 --- /dev/null +++ b/scripts/gdal/tests/gdal_bands_test.py @@ -0,0 +1,53 @@ +from scripts.gdal.gdal_bands import get_gdal_band_offset +from scripts.gdal.tests.gdalinfo import add_band, fake_gdal_info + + +def test_gdal_grey_bands() -> None: + gdalinfo = fake_gdal_info() + add_band(gdalinfo, color_interpretation="Gray") + + bands = get_gdal_band_offset("some_file.tiff", gdalinfo) + + assert " ".join(bands) == "-b 1 -b 1 -b 1" + + +def test_gdal_grey_bands_detection() -> None: + gdalinfo = fake_gdal_info() + add_band(gdalinfo, color_interpretation="Alpha") + add_band(gdalinfo, color_interpretation="Gray") + + bands = get_gdal_band_offset("some_file.tiff", gdalinfo) + + assert " ".join(bands) == "-b 2 -b 2 -b 2 -b 1" + + +def test_gdal_rgba_bands_detection() -> None: + gdalinfo = fake_gdal_info() + add_band(gdalinfo, color_interpretation="Alpha") + add_band(gdalinfo, color_interpretation="Red") + add_band(gdalinfo, color_interpretation="Green") + add_band(gdalinfo, color_interpretation="Blue") + + bands = get_gdal_band_offset("some_file.tiff", gdalinfo) + + assert " ".join(bands) == "-b 2 -b 3 -b 4 -b 1" + + +def test_gdal_rgb_bands_detection() -> None: + gdalinfo = fake_gdal_info() + add_band(gdalinfo, color_interpretation="Red") + add_band(gdalinfo, color_interpretation="Green") + add_band(gdalinfo, color_interpretation="Blue") + + bands = get_gdal_band_offset("some_file.tiff", gdalinfo) + + assert " ".join(bands) == "-b 1 -b 2 -b 3" + + +def test_gdal_default_rgb() -> None: + gdalinfo = fake_gdal_info() + add_band(gdalinfo, color_interpretation="unknown") + + bands = get_gdal_band_offset("some_file.tiff", gdalinfo) + + assert " ".join(bands) == "-b 1 -b 2 -b 3" diff --git a/scripts/gdal/tests/gdal_preset_test.py b/scripts/gdal/tests/gdal_preset_test.py index 8aea7aa0d..c9903ab78 100644 --- a/scripts/gdal/tests/gdal_preset_test.py +++ b/scripts/gdal/tests/gdal_preset_test.py @@ -1,7 +1,7 @@ from scripts.gdal.gdal_preset import get_cutline_command, get_gdal_command -def test_preset_webp(): +def test_preset_webp() -> None: gdal_command = get_gdal_command("webp") # Basic cog creation @@ -22,7 +22,7 @@ def test_preset_webp(): assert "EPSG:2193" in gdal_command -def test_preset_lzw(): +def test_preset_lzw() -> None: gdal_command = get_gdal_command("lzw") # Basic cog creation @@ -43,7 +43,7 @@ def test_preset_lzw(): assert "EPSG:2193" in gdal_command -def test_cutline_params(): +def test_cutline_params() -> None: gdal_command = get_cutline_command("cutline.fgb") assert "-cutline" in gdal_command diff --git a/scripts/gdal/tests/gdalinfo.py b/scripts/gdal/tests/gdalinfo.py new file mode 100644 index 000000000..baae3c6a6 --- /dev/null +++ b/scripts/gdal/tests/gdalinfo.py @@ -0,0 +1,19 @@ +from typing import Optional, cast + +from scripts.gdal.gdalinfo import GdalInfo, GdalInfoBand + + +def fake_gdal_info() -> GdalInfo: + return cast(GdalInfo, {}) + + +def add_band(gdalinfo: GdalInfo, color_interpretation: Optional[str] = None, no_data_value: Optional[int] = None) -> None: + if gdalinfo.get("bands", None) is None: + gdalinfo["bands"] = [] + + gdalinfo["bands"].append( + cast( + GdalInfoBand, + {"band": len(gdalinfo["bands"]) + 1, "colorInterpretation": color_interpretation, "noDataValue": no_data_value}, + ) + ) diff --git a/scripts/standardising.py b/scripts/standardising.py index 74b511938..78f7f4cd6 100644 --- a/scripts/standardising.py +++ b/scripts/standardising.py @@ -12,8 +12,9 @@ from scripts.cli.cli_helper import format_source, is_argo from scripts.files.files_helper import get_file_name_from_path, is_tiff, is_vrt from scripts.files.fs import read, write +from scripts.gdal.gdal_bands import get_gdal_band_offset from scripts.gdal.gdal_helper import get_gdal_version, run_gdal -from scripts.gdal.gdal_preset import get_cutline_command, get_gdal_band_offset, get_gdal_command +from scripts.gdal.gdal_preset import get_cutline_command, get_gdal_command from scripts.logging.time_helper import time_in_ms @@ -70,7 +71,8 @@ def standardising(file: str, preset: str, cutline: Optional[str]) -> str: write(input_cutline_path, read(cutline)) target_vrt = os.path.join(tmp_path, str(ulid.ULID()) + ".vrt") - # TODO check if the cutline actually intersects with the input_file as apply a cutline is much slower than conversion + # TODO check if the cutline actually intersects with the input_file + # as apply a cutline is much slower than conversion run_gdal(get_cutline_command(input_cutline_path), input_file=input_file, output_file=target_vrt) input_file = target_vrt From ac89df21d67160d8de08309060d56dac083214fb Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Tue, 6 Dec 2022 12:27:11 +1300 Subject: [PATCH 10/12] refactor: apply formatter --- scripts/standardising.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/standardising.py b/scripts/standardising.py index 78f7f4cd6..9ae321c1d 100644 --- a/scripts/standardising.py +++ b/scripts/standardising.py @@ -71,7 +71,7 @@ def standardising(file: str, preset: str, cutline: Optional[str]) -> str: write(input_cutline_path, read(cutline)) target_vrt = os.path.join(tmp_path, str(ulid.ULID()) + ".vrt") - # TODO check if the cutline actually intersects with the input_file + # TODO check if the cutline actually intersects with the input_file # as apply a cutline is much slower than conversion run_gdal(get_cutline_command(input_cutline_path), input_file=input_file, output_file=target_vrt) input_file = target_vrt From 63011202678dae607849994eb9f44743fa2cd374 Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Tue, 6 Dec 2022 15:15:23 +1300 Subject: [PATCH 11/12] docs: add docs --- scripts/gdal/gdal_bands.py | 20 ++++++++++++++++++-- scripts/gdal/gdal_preset.py | 5 ++++- scripts/gdal/gdalinfo.py | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 3 deletions(-) diff --git a/scripts/gdal/gdal_bands.py b/scripts/gdal/gdal_bands.py index 79b3378f4..36fdcd047 100644 --- a/scripts/gdal/gdal_bands.py +++ b/scripts/gdal/gdal_bands.py @@ -5,16 +5,32 @@ from scripts.gdal.gdalinfo import GdalInfo, GdalInfoBand, gdal_info -# Find a band from the color interpretation def find_band(bands: List[GdalInfoBand], color: str) -> Optional[GdalInfoBand]: + """Look for a specific colorInterperation inside of a gdalinfo band output + + Args: + bands: Bands to search + color: Color to search, eg Red, Green, Gray + + Returns: + Band if it exists, None otherwise + """ for band in bands: if band["colorInterpretation"] == color: return band return None -# Determine what band numbers to use for the "-b" overrides for gdal_translate def get_gdal_band_offset(file: str, info: Optional[GdalInfo] = None) -> List[str]: + """Get the banding parameters for a gdal_translate command + + Args: + file: file to check + info: optional precomputed gdalinfo + + Returns: + list of band mappings eg "-b 1 -b 1 -b 1" + """ if info is None: info = gdal_info(file, False) diff --git a/scripts/gdal/gdal_preset.py b/scripts/gdal/gdal_preset.py index 2addf6ac7..ce396943e 100644 --- a/scripts/gdal/gdal_preset.py +++ b/scripts/gdal/gdal_preset.py @@ -82,8 +82,11 @@ def get_gdal_command(preset: str) -> List[str]: return gdal_command -# Get a command to create a virtual file which has a cutline and alpha applied def get_cutline_command(cutline: str) -> List[str]: + """ + Get a "gdalwarp" command to create a virtual file (.vrt) which has a cutline applied and alpha added + """ + return [ "gdalwarp", # Outputting a VRT makes things faster as its not recomputing everything diff --git a/scripts/gdal/gdalinfo.py b/scripts/gdal/gdalinfo.py index 7d8bedd4c..9dcfa69b3 100644 --- a/scripts/gdal/gdalinfo.py +++ b/scripts/gdal/gdalinfo.py @@ -9,19 +9,44 @@ class GdalInfoBand(TypedDict): band: int + """band offset, starting at 1 + """ block: List[int] type: str colorInterpretation: str + """Color + Examples: + "Red", "Green", "Blue", "Alpha", "Gray" + """ noDataValue: Optional[int] class GdalInfo(TypedDict): description: str + """Information about the target of the gdalinfo + """ driverShortName: str + """Short names for the driver + + Example: "GTiff" + + """ driverLongName: str + """Longer driver name + + Example: + "GeoTIFF" + """ + files: List[str] + """List of files processed by the command + """ size: List[int] + """Width and height of input""" geoTransform: List[float] + """GeoTransformation information + + """ metadata: Dict[Any, Any] cornerCoordinates: Dict[Any, Any] extent: Dict[Any, Any] @@ -30,6 +55,15 @@ class GdalInfo(TypedDict): def gdal_info(path: str, stats: bool = True) -> GdalInfo: + """run gdalinfo on the provided file + + Args: + path: path to file to gdalinfo + stats: Optionally add "-stats" to gdalinfo. Defaults to True. + + Returns: + GdalInfo output + """ # Set GDAL_PAM_ENABLED to NO to temporarily diable PAM support and prevent creation of auxiliary XML file. gdalinfo_command = ["gdalinfo", "-json", "--config", "GDAL_PAM_ENABLED", "NO"] From 12f14e73588fd42d717f8369806845d690791cc5 Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Tue, 6 Dec 2022 15:20:27 +1300 Subject: [PATCH 12/12] refactor: fix lint --- scripts/gdal/gdalinfo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gdal/gdalinfo.py b/scripts/gdal/gdalinfo.py index 9dcfa69b3..196ce3653 100644 --- a/scripts/gdal/gdalinfo.py +++ b/scripts/gdal/gdalinfo.py @@ -34,7 +34,7 @@ class GdalInfo(TypedDict): driverLongName: str """Longer driver name - Example: + Example: "GeoTIFF" """