From 7bd1c6e04550fab3309c538810d824cfdee6c2c2 Mon Sep 17 00:00:00 2001 From: Shishi Chen Date: Thu, 27 Jun 2024 00:25:46 +0000 Subject: [PATCH 1/2] Add base copy of files from the docker-parsl-workflow directory from the branch enhancement-1-k8s containing the non-GKE workflow, to show the diff of the GKE workflow. --- gke_workflow/Dockerfile | 78 +++++++ gke_workflow/README.md | 98 +++++++++ gke_workflow/parsl_config.py | 62 ++++++ gke_workflow/parsl_workflow.py | 353 ++++++++++++++++++++++++++++++++ gke_workflow/requirements.txt | 5 + gke_workflow/workflow_config.py | 39 ++++ 6 files changed, 635 insertions(+) create mode 100644 gke_workflow/Dockerfile create mode 100644 gke_workflow/README.md create mode 100644 gke_workflow/parsl_config.py create mode 100644 gke_workflow/parsl_workflow.py create mode 100644 gke_workflow/requirements.txt create mode 100644 gke_workflow/workflow_config.py diff --git a/gke_workflow/Dockerfile b/gke_workflow/Dockerfile new file mode 100644 index 0000000..bbc63cf --- /dev/null +++ b/gke_workflow/Dockerfile @@ -0,0 +1,78 @@ +# for parsl_workflow.py: + +# Note: the order of the steps may need to be adjusted + +# FROM ubuntu:22.04 is also an otpion but this would make the image larger and would need to install python too +FROM python:3.9 +SHELL ["/bin/bash", "-c"] +# metadata info: +LABEL org.opencontainers.image.source https://github.com/permafrostdiscoverygateway/viz-workflow + +# WORKDIR /usr/local/share/app # a generalized option +# Keep in mind WORKDIR can use environment variables previously set +# using ENV, like ENV DIRPATH=/path followed by WORKDIR $DIRPATH/$DIRNAME +WORKDIR /home/pdgk8suser + +RUN apt update && apt -y install wget sudo vim nano iproute2 tree +# pip should already be installed after installing python, so no need to install here + +# Create new group called pdgk8sgroup and add new user to that group +# both with same ID number as jcohen for permissions after container runs. +# Do this before miniconda operations because want to install miniconda in the +# user's homedir? +RUN groupadd --gid 1040 -r pdgk8sgroup && useradd --uid 1040 -r -g pdgk8sgroup pdgk8suser +# make dir that matches WORKDIR +RUN mkdir -p /home/pdgk8suser && chown pdgk8suser:pdgk8sgroup /home/pdgk8suser +# make dir that matches the PV to store output data +RUN mkdir -p /mnt/k8s-dev-pdg && chown pdgk8suser:pdgk8sgroup /mnt/k8s-dev-pdg + +# actviate that user account +USER pdgk8suser:pdgk8sgroup + +# define miniconda installation path based on WORKDIR +ENV CONDA_HOME="/home/pdgk8suser/miniconda3" +ENV PATH="${CONDA_HOME}/bin:${PATH}" + +RUN mkdir -p ${CONDA_HOME} && \ + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ${CONDA_HOME}/miniconda.sh && \ + bash ${CONDA_HOME}/miniconda.sh -b -u -p ${CONDA_HOME} && \ + rm -rf ${CONDA_HOME}/miniconda.sh && \ + conda --version + +# create new conda env +RUN conda create -n pdg_k8s_env python=3.9 + +SHELL ["conda", "run", "-n", "pdg_k8s_env", "/bin/bash", "-c"] + +COPY requirements.txt . +RUN pip install -r requirements.txt + +COPY workflow_config.py . +COPY iwp_2_files . +COPY parsl_workflow.py . +COPY parsl_config.py . 
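+
+# Note (sketch only, not part of the original build): to run the workflow manually
+# from a shell inside the container, mirroring the command given in the README,
+# one option is:
+#   conda run -n pdg_k8s_env python parsl_workflow.py > k8s_parsl.log 2>&1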
+ +# maybe we don't want to run a command bc we need to use the terminal to +# do it now that we are using parsl and k8s +# CMD [ "python", "./parsl_workflow.py", ">", "k8s_parsl.log", "2>&1"] + + +# ------------------------------------------------------------ + +# # for simple_workflow.py: + +# # base image +# FROM python:3.9 + +# WORKDIR /home/jcohen/viz-worflow/docker-parsl_workflow/ + +# # python script to run +# ADD simple_workflow.py . +# # add the input data +# COPY data/test_polygons.gpkg . +# COPY requirements.txt . + +# # packages to install +# RUN pip install -r requirements.txt + +# CMD [ "python", "./simple_workflow.py" ] diff --git a/gke_workflow/README.md b/gke_workflow/README.md new file mode 100644 index 0000000..0ac865f --- /dev/null +++ b/gke_workflow/README.md @@ -0,0 +1,98 @@ +# Run the visualization workflow with Docker + +This directory contains everything necessary to execute the Permafrost Discovery Gateway visualization workflow with Docker, Kubernetes, and parsl for parallelization. + +The following 2 scripts can be run within a Docker container: + 1. `simple_workflow.py` - a simple version of the visualization workflow with no parallelization + 2. `parsl_workflow.py` - more complex workflow that integrates parallelization with parsl and kubernetes + +## Python environment + +Before executing either of these scripts, create a fresh environment with `conda` or `venv` and install all the requirements specified in the `requirements.txt` file with `pip install -r requirements.txt` + +## Persistent data volumes + +These scripts can be run locally on a laptop, _or_ on a server. Either way, you will have to specify a persistent data volume because both scripts write the following output: + - GeoPackage files to a `staging` directory + - GeoTIFF files to a `geotiff` directory + - web tiles to a `web_tiles` directory + - supplementary `csv` files output by the visualization workflow + - a log file + +See the documentation in the [`viz-staging`](https://github.com/PermafrostDiscoveryGateway/viz-staging) and [`viz-raster`](https://github.com/PermafrostDiscoveryGateway/viz-raster/tree/main) repositories for further information about the output. + +_How_ you specify a persistent data volume for the container differs between the simple workflow and the parallel workflow. In the non-parallelized workflow, we specify the path to the directory of choice within the `docker run` command. In the parallel workflow, the `parsl` config includes the filepath for the mounted persistent volume, and the volume must be configured beforehand. See details for these within the steps for each scipt below. + +## 1. Run `simple_workflow.py`: Build an image and run the container + +### Overview + +- Execute the `docker build` command in the terminal, then the `docker run` command. +- When the script is complete, see the files written to a new output dir `app` in the `viz-workflow/docker-parsl/workflow` dir. +- Mounting a volume for output on a server is done in the same way as we do on our local laptop: we specify the filepath when we execute the `docker run` command! It's just a different filepath because now we are on a different machine. + +### Steps + +- If working on a laptop rather than a server, clone repository & open the Docker Desktop app, then navigate to repository in VScode. If working on a server, SSH into the server in VScode, clone the repository, and navigate to the repository. 
+- Ensure an environment is activated in the terminal that is built from the same `requirements.txt` file as the docker image. This requires you to create a fresh environment, activate it, then run `pip install -r requirements.txt` in the command line.
+- Retrieve input data to process.
+  - **(TODO: make sample data accessible to people without access to Datateam)**
+- Edit the filepath for the WORKDIR in the Dockerfile as needed.
+  - **TODO: automate this step by adjusting `WORKDIR` in Dockerfile, and potentially defining `ENV` variables prior to defining the `WORKDIR` that are then used in the `WORKDIR`.**
+  - Recall that the default `WORKDIR` is `/` if not otherwise defined.
+- Run `docker build -t image_name .` in the command line.
+- Run the container and specify a persistent directory for input and output data, updating the path as needed: `docker run -v /path/to/repository/viz-workflow/docker-parsl-workflow/app:/app image_name`
+  - Note: The `app` dir does not need to be created manually; it will be created when the container runs.
+
+## 2. Run `parsl_workflow.py`: Build an image and publish it to the repository, then run a container from the published repository package
+
+### Overview
+
+- This script runs the same visualization workflow but processes in parallel with several workers. The number of workers can be adjusted in the configuration: `parsl_config.py`
+- The GitHub repository "packages" section contains all published Docker images that can be pulled by users. These are version controlled, so you can point to a specific image version to run. This makes a workflow more reproducible. The repo and version are specified in `parsl_config.py`
+
+### Steps
+
+- Make sure your GitHub Personal Access Token allows for publishing packages to the repository.
+  - Navigate to your token on GitHub, then scroll down to `write:packages`, check the box, and save.
+- SSH into the server in VScode, clone the repository, and navigate to the repository.
+- Ensure an environment is activated in the terminal that is built from the same `requirements.txt` file as the docker image, and the same version of Python.
+- Edit the line in the parsl configuration to specify the persistent volume name and mount filepath.
+  - The first item in the list will need to be a persistent volume that is set up by the server admin. See [this repository](https://github.com/mbjones/k8s-parsl?tab=readme-ov-file#persistent-data-volumes) for details.
+  - The second item is the location where you want the volume to be mounted _within your container_. `/mnt/data` is a common path used in this field. Recall that the data will be written there in the container but will be persistently accessible at the location of the persistent volume on your machine.
+```
+persistent_volumes=[('pdgrun-dev-0', f'/mnt/data')]
+```
+- Update the string that represents the desired published repository package version of the image in `parsl_config.py`. Replace the version number with the next version number you will publish it as:
+```
+image='ghcr.io/permafrostdiscoverygateway/viz-workflow:0.2.8',
+```
+- Publish the package to the repository with the new version number by running these 3 commands one by one:
+```
+docker build -t ghcr.io/permafrostdiscoverygateway/viz-workflow:0.2.8 .
+```
+Note: The string representing the organization and repo in these commands must be all lower-case.
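+
+The login command below reads `$GITHUB_USER` and `$GITHUB_TOKEN` from your shell. If they are not already set, export them first (placeholder values shown here; substitute your own GitHub username and Personal Access Token):
+```
+export GITHUB_USER=<your-github-username>
+export GITHUB_TOKEN=<your-personal-access-token>
+```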
+ +``` +echo $GITHUB_TOKEN | docker login ghcr.io -u $GITHUB_USER --password-stdin +``` +``` +docker push ghcr.io/permafrostdiscoverygateway/viz-workflow:0.2.8 +``` +- Run `kubectl get pods` to see if any pods are left hanging from the last run in your namespace. This could be the case if a past run failed to shut down the parsl workers. + - If there are any hanging, delete them all at once for the specific namespace by running: `kubectl delete pods --all -n {namespace}` + - or take the safer route by deleting them by listing each pod name: `kubectl delete pods {podname} {podname} {podname}` +- Run the python script for the parsl workflow, specifying to print the log output to file: + +``` +python parsl_workflow.py > k8s_parsl.log 2>&1 +``` + +If you simply run `python parsl_workflow.py`, a lot of parsl output will print to the terminal instead. + +**General Notes:** +- If the run is successful, parsl processes should shut down cleanly. If not, you'll need to kill the processes manually. + - You can check your processes in the command line with `ps -ef | grep {username}` + - In the output, the column next to your username shows the 5-digit identifier for the proess. Run `kill -9 {identifier}` to kill one in particular. +- After each run, if files were output, remove them from the persistent directory before next run. +- If testing code and you end up building many images, run `docker images` to list them and you can choose which to delete diff --git a/gke_workflow/parsl_config.py b/gke_workflow/parsl_config.py new file mode 100644 index 0000000..034464e --- /dev/null +++ b/gke_workflow/parsl_config.py @@ -0,0 +1,62 @@ +from parsl.config import Config +from parsl.executors import HighThroughputExecutor +from parsl.providers import KubernetesProvider +from parsl.addresses import address_by_route + +# not necessary if mounting volume at /usr/local/share/app: +# import subprocess +# user = subprocess.check_output("whoami").strip().decode("ascii") + +def config_parsl_cluster( + max_blocks = 4, + min_blocks = 1, + init_blocks = 1, + max_workers = 4, + cores_per_worker = 1, + # TODO: automate this following string to pull most recent release on github? + image='ghcr.io/permafrostdiscoverygateway/viz-workflow:0.2.6', + namespace='pdgrun'): + + htex_kube = Config( + executors = [ + HighThroughputExecutor( + label = 'kube-htex', + cores_per_worker = cores_per_worker, + max_workers = max_workers, + worker_logdir_root = '/', + # Address for the pod worker to connect back + address = address_by_route(), + # address='128.111.85.174', + #address_probe_timeout=3600, + worker_debug = True, + provider = KubernetesProvider( + + # Namespace in K8S to use for the run + namespace = namespace, + + # Docker image url to use for pods + image = image, + + # Command to be run upon pod start, such as: + # 'module load Anaconda; source activate parsl_env'. + # or 'pip install parsl' + # worker_init='echo "Worker started..."; lf=`find . -name \'manager.log\'` tail -n+1 -f ${lf}', + worker_init = 'pip install parsl==2023.11.27', + + # Should follow the Kubernetes naming rules + pod_name = 'parsl-worker', + + nodes_per_block = 1, + init_blocks = init_blocks, + min_blocks = min_blocks, + # Maximum number of pods to scale up + max_blocks = max_blocks, + # persistent_volumes (list[(str, str)]) – List of tuples + # describing persistent volumes to be mounted in the pod. + # The tuples consist of (PVC Name, Mount Directory). 
+ persistent_volumes = [('pdgrun-dev-0', f'/mnt/k8s-dev-pdg')] + ), + ), + ] + ) + return(htex_kube) diff --git a/gke_workflow/parsl_workflow.py b/gke_workflow/parsl_workflow.py new file mode 100644 index 0000000..28c4ca5 --- /dev/null +++ b/gke_workflow/parsl_workflow.py @@ -0,0 +1,353 @@ +# test docker image and orchestrate containers +# with kubernetes by running a version of +# the workflow with a kubernetes parsl config +# processing 2 small overlapping IWP files + +# documentation for parsl config: +# https://parsl.readthedocs.io/en/stable/userguide/configuring.html#kubernetes-clusters + + +from datetime import datetime +import time + +import pdgstaging +import pdgraster +import workflow_config + +import json +import logging +import logging.handlers +from pdgstaging import logging_config +import os + +import parsl +from parsl import python_app +from parsl.config import Config +from parsl.executors import HighThroughputExecutor +from parsl.providers import KubernetesProvider +from parsl.addresses import address_by_route +# from kubernetes import client, config # NOTE: might need to import this? not sure +# from . import parsl_config # NOTE: might need to import this file if running python command from Dockerfile? +from parsl_config import config_parsl_cluster + +import shutil + +import subprocess +from subprocess import Popen +user = subprocess.check_output("whoami").strip().decode("ascii") + + +# call parsl config and initiate k8s cluster +parsl.set_stream_logger() +# use default settings defined in parsl_config.py: +htex_kube = config_parsl_cluster() +parsl.load(htex_kube) + +workflow_config = workflow_config.workflow_config + + +# print("Removing old directories and files...") +# TODO: Decide filepath here, /app/ or . ? +# using just dir names and filenames here because set WORKDIR as: +# /home/jcohen/viz-workflow/docker-parsl_workflow/app +# dir = "app/" +# old_filepaths = ["staging_summary.csv", +# "raster_summary.csv", +# "raster_events.csv", +# "config__updated.json", +# "log.log"] +# for old_file in old_filepaths: +# if os.path.exists(old_file): +# os.remove(old_file) + +# # remove dirs from past run +# old_dirs = ["staged", +# "geotiff", +# "web_tiles"] +# for old_dir in old_dirs: +# if os.path.exists(old_dir) and os.path.isdir(old_dir): +# shutil.rmtree(old_dir) + + +def run_pdg_workflow( + workflow_config, + batch_size = 300 +): + """ + Run the main PDG workflow for the following steps: + 1. staging + 2. raster highest + 3. raster lower + 4. web tiling + + Parameters + ---------- + workflow_config : dict + Configuration for the PDG staging workflow. + batch_size: int + How many input files, staged files, geotiffs, or web tiles should be included in a single creation + task? 
(each task is run in parallel) Default: 300 + """ + + start_time = datetime.now() + + logging.info("Staging initiated.") + + stager = pdgstaging.TileStager(workflow_config) + #tile_manager = rasterizer.tiles + tile_manager = stager.tiles + config_manager = stager.config + + input_paths = stager.tiles.get_filenames_from_dir('input') + print(f"Input paths are: {input_paths}") + input_batches = make_batch(input_paths, batch_size) + + # Stage all the input files (each batch in parallel) + app_futures = [] + for i, batch in enumerate(input_batches): + print(f"batch is {batch}") + app_future = stage(batch, workflow_config) + app_futures.append(app_future) + logging.info(f'Started job for batch {i} of {len(input_batches)}') + + # Don't continue to next step until all files have been staged + [a.result() for a in app_futures] + + logging.info("Staging complete.") + print("Staging complete.") + + # ---------------------------------------------------------------- + + # Create highest geotiffs + rasterizer = pdgraster.RasterTiler(workflow_config) + + # Process staged files in batches + logging.info(f'Collecting staged file paths to process...') + staged_paths = tile_manager.get_filenames_from_dir('staged') + logging.info(f'Found {len(staged_paths)} staged files to process.') + staged_batches = make_batch(staged_paths, batch_size) + logging.info(f'Processing staged files in {len(staged_batches)} batches.') + + app_futures = [] + for i, batch in enumerate(staged_batches): + app_future = create_highest_geotiffs(batch, workflow_config) + app_futures.append(app_future) + logging.info(f'Started job for batch {i} of {len(staged_batches)}') + + # Don't move on to next step until all geotiffs have been created + [a.result() for a in app_futures] + + logging.info("Rasterization highest complete. Rasterizing lower z-levels.") + print("Rasterization highest complete. Rasterizing lower z-levels.") + + # ---------------------------------------------------------------- + + # Rasterize composite geotiffs + min_z = config_manager.get_min_z() + max_z = config_manager.get_max_z() + parent_zs = range(max_z - 1, min_z - 1, -1) + + # Can't start lower z-level until higher z-level is complete. + for z in parent_zs: + + # Determine which tiles we need to make for the next z-level based on the + # path names of the geotiffs just created + logging.info(f'Collecting highest geotiff paths to process...') + child_paths = tile_manager.get_filenames_from_dir('geotiff', z = z + 1) + logging.info(f'Found {len(child_paths)} highest geotiffs to process.') + # create empty set for the following loop + parent_tiles = set() + for child_path in child_paths: + parent_tile = tile_manager.get_parent_tile(child_path) + parent_tiles.add(parent_tile) + # convert the set into a list + parent_tiles = list(parent_tiles) + + # Break all parent tiles at level z into batches + parent_tile_batches = make_batch(parent_tiles, batch_size) + logging.info(f'Processing highest geotiffs in {len(parent_tile_batches)} batches.') + + # Make the next level of parent tiles + app_futures = [] + for parent_tile_batch in parent_tile_batches: + app_future = create_composite_geotiffs( + parent_tile_batch, workflow_config) + app_futures.append(app_future) + + # Don't start the next z-level, and don't move to web tiling, until the + # current z-level is complete + [a.result() for a in app_futures] + + logging.info("Composite rasterization complete. Creating web tiles.") + print("Composite rasterization complete. 
Creating web tiles.") + + # ---------------------------------------------------------------- + + # Process web tiles in batches + logging.info(f'Collecting file paths of geotiffs to process...') + geotiff_paths = tile_manager.get_filenames_from_dir('geotiff') + logging.info(f'Found {len(geotiff_paths)} geotiffs to process.') + geotiff_batches = make_batch(geotiff_paths, batch_size) + logging.info(f'Processing geotiffs in {len(geotiff_batches)} batches.') + + app_futures = [] + for i, batch in enumerate(geotiff_batches): + app_future = create_web_tiles(batch, workflow_config) + app_futures.append(app_future) + logging.info(f'Started job for batch {i} of {len(geotiff_batches)}') + + # Don't record end time until all web tiles have been created + [a.result() for a in app_futures] + + end_time = datetime.now() + logging.info(f'⏰ Total time to create all z-level geotiffs and web tiles: ' + f'{end_time - start_time}') + +# ---------------------------------------------------------------- + +# Define the parsl functions used in the workflow: + +@python_app +def stage(paths, config): + """ + Stage a file + """ + from datetime import datetime + import json + import logging + import logging.handlers + import os + import pdgstaging + from pdgstaging import logging_config + + stager = pdgstaging.TileStager(config, check_footprints = False) + for path in paths: + stager.stage(path) + return True + +# Create highest z-level geotiffs from staged files +@python_app +def create_highest_geotiffs(staged_paths, config): + """ + Create a batch of geotiffs from staged files + """ + from datetime import datetime + import json + import logging + import logging.handlers + import os + import pdgraster + from pdgraster import logging_config + + # rasterize the vectors, highest z-level only + rasterizer = pdgraster.RasterTiler(config) + return rasterizer.rasterize_vectors( + staged_paths, make_parents = False) + # no need to update ranges if manually set val_range in viz config + +# ---------------------------------------------------------------- + +# Create composite geotiffs from highest z-level geotiffs +@python_app +def create_composite_geotiffs(tiles, config): + """ + Create a batch of composite geotiffs from highest geotiffs + """ + from datetime import datetime + import json + import logging + import logging.handlers + import os + import pdgraster + from pdgraster import logging_config + + rasterizer = pdgraster.RasterTiler(config) + return rasterizer.parent_geotiffs_from_children( + tiles, recursive = False) + +# ---------------------------------------------------------------- + +# Create a batch of webtiles from geotiffs +@python_app +def create_web_tiles(geotiff_paths, config): + """ + Create a batch of webtiles from geotiffs + """ + + from datetime import datetime + import json + import logging + import logging.handlers + import os + import pdgraster + from pdgraster import logging_config + + rasterizer = pdgraster.RasterTiler(config) + return rasterizer.webtiles_from_geotiffs( + geotiff_paths, update_ranges = False) + # no need to update ranges if manually set val_range in workflow config + +# ---------------------------------------------------------------- + +def make_batch(items, batch_size): + """ + Create batches of a given size from a list of items. 
+ """ + return [items[i:i + batch_size] for i in range(0, len(items), batch_size)] + +# ---------------------------------------------------------------- + +logging.info(f'Starting PDG workflow: staging, rasterization, and web tiling') +if __name__ == "__main__": + run_pdg_workflow(workflow_config) + +# # transfer visualization log from /tmp to user dir +# # TODO: Automate the following destination path to be the mounted volume in the config +# # maybe do this by importing config script that specifies the filepath as a variable at the top +# # TODO: Decide filepath here, /app/ or . ? +# cmd = ['mv', '/tmp/log.log', '/home/jcohen/viz-workflow/docker-parsl_workflow/app/'] +# # initiate the process to run that command +# process = Popen(cmd) + +# ---------------------------------------------------------------- + +# def main(): + +# '''Main program.''' + +# # make job last a while with useless computation +# size = 30 +# stat_results = [] +# for x in range(size): +# for y in range(size): +# current_time = datetime.now() +# print(f'Schedule job at {current_time} for {x} and {y}') +# stat_results.append(calc_product_long(x, y)) + +# stats = [r.result() for r in stat_results] +# print(f"Sum of stats: {sum(stats)}") + + +# @python_app +# def calc_product_long(x, y): +# '''Useless computation to simulate one that takes a long time''' +# from datetime import datetime +# import time +# current_time = datetime.now() +# prod = x*y +# time.sleep(15) +# return(prod) + + +# if __name__ == "__main__": +# main() + +# ------------------------------------------ + + +# Shutdown and clear the parsl executor +# htex_kube.executors[0].scale_in(htex_kube.executors[0].connected_blocks()) +htex_kube.executors[0].shutdown() +parsl.clear() + +print("Script complete.") diff --git a/gke_workflow/requirements.txt b/gke_workflow/requirements.txt new file mode 100644 index 0000000..371b2f5 --- /dev/null +++ b/gke_workflow/requirements.txt @@ -0,0 +1,5 @@ +parsl==2023.11.27 +kubernetes==28.1.0 +glances==1.7.3 +git+https://github.com/PermafrostDiscoveryGateway/viz-staging.git +git+https://github.com/PermafrostDiscoveryGateway/viz-raster.git diff --git a/gke_workflow/workflow_config.py b/gke_workflow/workflow_config.py new file mode 100644 index 0000000..0fb5c87 --- /dev/null +++ b/gke_workflow/workflow_config.py @@ -0,0 +1,39 @@ +workflow_config = { + "deduplicate_method": None, + "deduplicate_clip_to_footprint": False, + "dir_input": "input", + "ext_input": ".gpkg", + "dir_staged": "/mnt/k8s-dev-pdg/staged/", + "dir_geotiff": "/mnt/k8s-dev-pdg/geotiff/", + "dir_web_tiles": "/mnt/k8s-dev-pdg/web_tiles/", + "filename_staging_summary": "/mnt/k8s-dev-pdg/staging_summary.csv", + "filename_rasterization_events": "/mnt/k8s-dev-pdg/raster_events.csv", + "filename_rasters_summary": "/mnt/k8s-dev-pdg/raster_summary.csv", + "simplify_tolerance": 0.1, + "tms_id": "WGS1984Quad", + "z_range": [ + 0, + 9 # increase this later to 15 + ], + "geometricError": 57, + "z_coord": 0, + "statistics": [ + { + "name": "iwp_coverage", + "weight_by": "area", + "property": "area_per_pixel_area", + "aggregation_method": "sum", + "resampling_method": "average", + "val_range": [ + 0, + 1 + ], + "palette": [ + "#f8ff1f1A", # 10% alpha yellow + "#f8ff1f" # solid yellow + ], + "nodata_val": 0, + "nodata_color": "#ffffff00" + } + ] +} From 0170c00cad8f2545dfeaa95c4fc9ac2cd93ace68 Mon Sep 17 00:00:00 2001 From: Shishi Chen Date: Tue, 2 Jul 2024 17:15:33 +0000 Subject: [PATCH 2/2] Updated code, documentation, and example manifests for running 
viz-workflow on GKE. --- gke_workflow/Dockerfile | 72 +----- gke_workflow/README.md | 210 +++++++++++------- gke_workflow/manifests/leader_deployment.yaml | 33 +++ gke_workflow/manifests/persistent_volume.yaml | 20 ++ .../manifests/persistent_volume_claim.yaml | 13 ++ .../manifests/service_account_role.yaml | 18 ++ .../service_account_role_binding.yaml | 13 ++ gke_workflow/parsl_config.py | 29 ++- gke_workflow/parsl_workflow.py | 76 +------ gke_workflow/requirements.txt | 8 +- gke_workflow/workflow_config.py | 14 +- 11 files changed, 259 insertions(+), 247 deletions(-) create mode 100644 gke_workflow/manifests/leader_deployment.yaml create mode 100644 gke_workflow/manifests/persistent_volume.yaml create mode 100644 gke_workflow/manifests/persistent_volume_claim.yaml create mode 100644 gke_workflow/manifests/service_account_role.yaml create mode 100644 gke_workflow/manifests/service_account_role_binding.yaml diff --git a/gke_workflow/Dockerfile b/gke_workflow/Dockerfile index bbc63cf..7012286 100644 --- a/gke_workflow/Dockerfile +++ b/gke_workflow/Dockerfile @@ -1,78 +1,12 @@ -# for parsl_workflow.py: - -# Note: the order of the steps may need to be adjusted - -# FROM ubuntu:22.04 is also an otpion but this would make the image larger and would need to install python too FROM python:3.9 -SHELL ["/bin/bash", "-c"] -# metadata info: -LABEL org.opencontainers.image.source https://github.com/permafrostdiscoverygateway/viz-workflow - -# WORKDIR /usr/local/share/app # a generalized option -# Keep in mind WORKDIR can use environment variables previously set -# using ENV, like ENV DIRPATH=/path followed by WORKDIR $DIRPATH/$DIRNAME -WORKDIR /home/pdgk8suser - -RUN apt update && apt -y install wget sudo vim nano iproute2 tree -# pip should already be installed after installing python, so no need to install here - -# Create new group called pdgk8sgroup and add new user to that group -# both with same ID number as jcohen for permissions after container runs. -# Do this before miniconda operations because want to install miniconda in the -# user's homedir? -RUN groupadd --gid 1040 -r pdgk8sgroup && useradd --uid 1040 -r -g pdgk8sgroup pdgk8suser -# make dir that matches WORKDIR -RUN mkdir -p /home/pdgk8suser && chown pdgk8suser:pdgk8sgroup /home/pdgk8suser -# make dir that matches the PV to store output data -RUN mkdir -p /mnt/k8s-dev-pdg && chown pdgk8suser:pdgk8sgroup /mnt/k8s-dev-pdg -# actviate that user account -USER pdgk8suser:pdgk8sgroup - -# define miniconda installation path based on WORKDIR -ENV CONDA_HOME="/home/pdgk8suser/miniconda3" -ENV PATH="${CONDA_HOME}/bin:${PATH}" - -RUN mkdir -p ${CONDA_HOME} && \ - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ${CONDA_HOME}/miniconda.sh && \ - bash ${CONDA_HOME}/miniconda.sh -b -u -p ${CONDA_HOME} && \ - rm -rf ${CONDA_HOME}/miniconda.sh && \ - conda --version - -# create new conda env -RUN conda create -n pdg_k8s_env python=3.9 +LABEL org.opencontainers.image.source https://github.com/permafrostdiscoverygateway/viz-workflow -SHELL ["conda", "run", "-n", "pdg_k8s_env", "/bin/bash", "-c"] +WORKDIR /usr/local/share/app COPY requirements.txt . RUN pip install -r requirements.txt -COPY workflow_config.py . -COPY iwp_2_files . COPY parsl_workflow.py . COPY parsl_config.py . 
- -# maybe we don't want to run a command bc we need to use the terminal to -# do it now that we are using parsl and k8s -# CMD [ "python", "./parsl_workflow.py", ">", "k8s_parsl.log", "2>&1"] - - -# ------------------------------------------------------------ - -# # for simple_workflow.py: - -# # base image -# FROM python:3.9 - -# WORKDIR /home/jcohen/viz-worflow/docker-parsl_workflow/ - -# # python script to run -# ADD simple_workflow.py . -# # add the input data -# COPY data/test_polygons.gpkg . -# COPY requirements.txt . - -# # packages to install -# RUN pip install -r requirements.txt - -# CMD [ "python", "./simple_workflow.py" ] +COPY workflow_config.py . diff --git a/gke_workflow/README.md b/gke_workflow/README.md index 0ac865f..7b113d2 100644 --- a/gke_workflow/README.md +++ b/gke_workflow/README.md @@ -1,98 +1,150 @@ -# Run the visualization workflow with Docker +# Run the visualization workflow on GKE -This directory contains everything necessary to execute the Permafrost Discovery Gateway visualization workflow with Docker, Kubernetes, and parsl for parallelization. +Note: This documentation is written to be self-contained, but see also the [PDG GCP infra documentation](https://github.com/PermafrostDiscoveryGateway/pdg-tech/blob/master/gcloud/gcloud-infrastructure.md#gcp-kubernetes-clusters), the [Ray cluster setup instructions](https://github.com/PermafrostDiscoveryGateway/pdg-tech/blob/master/gcloud/raycluster/README.md), and the [docker-parsl-workflow instructions](https://github.com/PermafrostDiscoveryGateway/viz-workflow/blob/enhancement-1-k8s/docker-parsl-workflow/README.md), which contain some similar information. -The following 2 scripts can be run within a Docker container: - 1. `simple_workflow.py` - a simple version of the visualization workflow with no parallelization - 2. `parsl_workflow.py` - more complex workflow that integrates parallelization with parsl and kubernetes +## Prerequisites -## Python environment +We’re using the following shared GKE autopilot cluster that’s already been set up in the PDG project: -Before executing either of these scripts, create a fresh environment with `conda` or `venv` and install all the requirements specified in the `requirements.txt` file with `pip install -r requirements.txt` +* project: `pdg-project-406720` +* cluster: `pdg-autopilot-cluster-1` +* region: `us-west1` -## Persistent data volumes +Many of the following instructions use a CLI to deploy changes in the cluster. I’m using Cloud Shell to SSH into an existing GCE VM instance set up for this purpose and then running commands from there, since there are networking restrictions preventing us from running commands directly from Cloud Shell or many other places. (There may be other ways to set up your terminal as well.) To do this: -These scripts can be run locally on a laptop, _or_ on a server. Either way, you will have to specify a persistent data volume because both scripts write the following output: - - GeoPackage files to a `staging` directory - - GeoTIFF files to a `geotiff` directory - - web tiles to a `web_tiles` directory - - supplementary `csv` files output by the visualization workflow - - a log file +1. From Cloud Console VM instances page, start the existing GCE VM instance if it’s currently stopped + * instance name: `pdg-gke-entrypoint` + * zone: `us-west1-b` +2. SSH into the VM instance: + ``` + $ gcloud compute ssh --zone us-west1-b pdg-gke-entrypoint --project pdg-project-406720 + ``` +3. 
Set up authorization to the cluster: + ``` + $ gcloud auth login + $ gcloud container clusters get-credentials pdg-autopilot-cluster-1 --internal-ip --region us-west1 --project pdg-project-406720 + ``` -See the documentation in the [`viz-staging`](https://github.com/PermafrostDiscoveryGateway/viz-staging) and [`viz-raster`](https://github.com/PermafrostDiscoveryGateway/viz-raster/tree/main) repositories for further information about the output. +## One-time setup -_How_ you specify a persistent data volume for the container differs between the simple workflow and the parallel workflow. In the non-parallelized workflow, we specify the path to the directory of choice within the `docker run` command. In the parallel workflow, the `parsl` config includes the filepath for the mounted persistent volume, and the volume must be configured beforehand. See details for these within the steps for each scipt below. +The following three objects have been set up once and don't need to be changed during a normal execution - see [Running the script](#running-the-script) below instead for what to change each time you want to rerun the script. However these setup steps are documented for reference in case the setup needs to be modified in the future. Note the service account and persistent volume/persistent volume claim are all namespace-scoped, so one of each needs to be created in every new namespace. -## 1. Run `simple_workflow.py`: Build an image and run the container +1. A namespace within the cluster + * namespace name: `viz-workflow` + * See [Set up namespace](#set-up-namespace) instructions below +2. A (Kubernetes) service account for accessing the GCS bucket + * service account name: `viz-workflow-sa` + * See [Set up service account](#set-up-service-account) instructions below +3. A persistent volume/claim pointing to the GCS bucket where we store input/output files + * persistent volume name: `viz-workflow-pv` + * persistent volume claim name: `viz-workflow-pvc` + * See [Set up persistent volume](#set-up-persistent-volume) instructions below -### Overview +### Set up namespace -- Execute the `docker build` command in the terminal, then the `docker run` command. -- When the script is complete, see the files written to a new output dir `app` in the `viz-workflow/docker-parsl/workflow` dir. -- Mounting a volume for output on a server is done in the same way as we do on our local laptop: we specify the filepath when we execute the `docker run` command! It's just a different filepath because now we are on a different machine. +Kubernetes namespaces provide logical separation between workloads in the cluster. Our cluster is shared between the viz workflow and the PDG data pipelines, so I created a namespace to separate the viz workflow: -### Steps +1. Create the namespace: + ``` + $ kubectl create namespace viz-workflow + ``` + +### Set up service account + +The service account needs to be set up for the leader and worker pods to have permissions to access the GCS bucket. However, because it’s what’s used within the leader pod, it also will need to have permissions on the GKE cluster to be able to modify the cluster to turn up worker pods. 
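+
+Before creating the service account, it can help to confirm that the namespace from the previous section exists (a quick sanity check, assuming the names above):
+```
+$ kubectl get namespace viz-workflow
+```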
This generally follows the GCP docs to [configure KSAs](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity#configure-authz-principals), [configure GCS persistent volume auth](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver#authentication), and [configure RBAC permisssions](https://cloud.google.com/kubernetes-engine/docs/how-to/role-based-access-control). + +1. Create the (Kubernetes) service account: + ``` + $ kubectl create serviceaccount viz-workflow-sa --namespace viz-workflow + ``` + +Ask someone with access to grant the IAM role with GCS permissions to the KSA: + +2. Grant the Storage Object User IAM role to the KSA: + ``` + $ gcloud projects add-iam-policy-binding pdg-project-406720 --member=principal://iam.googleapis.com/projects/896944613548/locations/global/workloadIdentityPools/pdg-project-406720.svc.id.goog/subject/ns/viz-workflow/sa/viz-workflow-sa --role=roles/storage.objectUser + ``` + As an alternative, it should be possible to grant permissions only on a specific GCS bucket to the KSA if you prefer - see the GCP doc above. + +According to the GCP docs, it should be possible to grant the GKE permissions to the KSA either through an IAM role or through Kubernetes RBAC permissions (which are more fine-grained than the IAM role) - however, I haven’t been able to get the IAM role option working. Instead you can just set up RBAC permissions: + +3. Modify the manifests (if needed): + * Example role: [manifests/service_account_role.yaml](manifests/service_account_role.yaml) + * Example role binding: [manifests/service_account_role_binding.yaml](manifests/service_account_role_binding.yaml) +4. Create the RBAC role: + ``` + $ kubectl apply -f service_account_role.yaml + ``` +5. Create the RBAC role binding: + ``` + $ kubectl apply -f service_account_role_binding.yaml + ``` + +### Set up persistent volume + +Currently there’s only one GCS bucket that’s shared across all workflows, which contains a subdirectory for viz-workflow: + +* bucket name: `pdg-storage-default` + +In the future it’s possible there should be different GCS buckets for different workflows or workflow executions, in which case the instructions below would need to be rerun to point to that bucket. This generally follows the GCP docs to [create a persistent volume](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver#create-persistentvolume) and [create a persistent volume claim](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver#create-persistentvolumeclaim). + +1. Modify the manifests (if needed, e.g. to point to your GCS bucket. See the docs for more info about requirements): + * Example persistent volume: [manifests/persistent_volume.yaml](manifests/persistent_volume.yaml) + * Example persistent volume claim: [manifests/persistent_volume_claim.yaml](manifests/persistent_volume_claim.yaml) +2. Create the persistent volume: + ``` + $ kubectl apply -f manifests/persistent_volume.yaml + ``` +3. Create the persistent volume claim: + ``` + $ kubectl apply -f manifests/persistent_volume_claim.yaml + ``` -- If working on a laptop rather than a server, clone repository & open the Docker Desktop app, then navigate to repository in VScode. If working on a server, SSH into the server in VScode, clone the repository, and navigate to the repository. 
-- Ensure an environment is activated in the terminal that is built from the same `requirements.txt` file as the docker image. This requires you to create a fresh environment, activate it, then run `pip install -r requirements.txt` in the command line. -- Retrieve input data to process. - - **(TODO: make sample data accessible to people without access to Datateam)** -- Edit the filepath for the WORKDIR in the Dockerfile as needed. - - **TODO: automate this step by adjusting `WORKDIR` in Dockerfile, and potentially defining `ENV` variables prior to defining the `WORKDIR` that are then used in the `WORKDIR`.** - - Recall that the default `WORKDIR` is `/` if not otherwise defined. -- Run `docker build -t image_name .` in the command line. -- Run the container and specify a persistent directory for input and output data, updating the path as needed: `docker run -v /path/to/repository/viz-workflow/docker-parsl-workflow/app:/app image_name` - - Note: The `app` dir does not need ro be created manually, it will be created when the container runs. +## Running the script -## 2. Run `parsl_workflow.py`: Build an image and publish it to the repository, then run a container from the published repository package +The setup creates a leader pod in which the main [parsl_workflow.py](parsl_workflow.py) script is executed. During the script execution, parsl will bring up additional worker pods as needed. These worker pods need to be able to communicate back to the main script, so that’s the reason we run it in the leader pod (since networking restrictions allow easier communication between pods within the cluster than outside of it). -### Overview +> **TODO:** An alternative setup that the QGreenland project uses is to run a Kubernetes Job with a ConfigMap that runs the script, which we could try also. See comments in the [QGreenland parsl repo](https://github.com/QGreenland-Net/parsl-exploration/blob/main/README.md#submitting-jobs-to-a-remote-cluster). -- This script runs the same visualization worklow but processes in parallel with several workers. The amount of workers can be adjusted in the configuration: `parsl_config.py` -- The GitHub repository "packages" section contains all published Docker images that can be pulled by users. These are version controlled, so you can point to a specific image version to run. This makes a workflow more reproducibile. The repo and version are specified in the `parsl_config.py` +At the moment, both the leader and worker pods use the same Docker image, but this is not necessary - we could maintain separate images for the leader and worker instead (for example if we wanted to add the command to execute the script to the leader’s Dockerfile). Note that both are required to use the same parsl version. Note that the worker pods are automatically created with an IfNotPresent pull policy so a new image tag needs to be used every time the worker pod image changes. ### Steps -- Make sure your GitHub Personal Access Token allows for publishing packages to the repository. - - Navigate to your token on GitHub, then scroll down to `write:packages` and check the box and save. -- SSH into server in VScode, clone the repository, and navigate to the repository. -- Ensure an environment is activated in the terminal that is built from the same `requirements.txt` file as the docker image, and the same version of python. -- Edit the line in the parsl configuration to specify the persistent volume name and mount filepath. 
- - The first item in the list will need to be a persistent volume that is set up by the server admin. See [this repository](https://github.com/mbjones/k8s-parsl?tab=readme-ov-file#persistent-data-volumes) for details. - - The second item is the location that you want the volume to be mounted _within your container_. `/mnt/data` is a common path used in this field. Recall that the data will be written there in the container but will be persistently accessible at the location of the persistent volume on your machine. -``` -persistent_volumes=persistent_volumes=[('pdgrun-dev-0', f'/mnt/data')] -``` -- Update the string that represents the desired published repository package version of image in `parsl_config.py`. Replace the version number with the next version number you will publish it as: -``` -image='ghcr.io/permafrostdiscoverygateway/viz-workflow:0.2.8', -``` -- Publish the package to the repository with new version number by running 3 commands one-by-one: -``` -docker build -t ghcr.io/permafrostdiscoverygateway/viz-workflow:0.2.8 . -``` -Note: The string representing the organization and repo in these commands must be all lower-case. - -``` -echo $GITHUB_TOKEN | docker login ghcr.io -u $GITHUB_USER --password-stdin -``` -``` -docker push ghcr.io/permafrostdiscoverygateway/viz-workflow:0.2.8 -``` -- Run `kubectl get pods` to see if any pods are left hanging from the last run in your namespace. This could be the case if a past run failed to shut down the parsl workers. - - If there are any hanging, delete them all at once for the specific namespace by running: `kubectl delete pods --all -n {namespace}` - - or take the safer route by deleting them by listing each pod name: `kubectl delete pods {podname} {podname} {podname}` -- Run the python script for the parsl workflow, specifying to print the log output to file: - -``` -python parsl_workflow.py > k8s_parsl.log 2>&1 -``` - -If you simply run `python parsl_workflow.py`, a lot of parsl output will print to the terminal instead. - -**General Notes:** -- If the run is successful, parsl processes should shut down cleanly. If not, you'll need to kill the processes manually. - - You can check your processes in the command line with `ps -ef | grep {username}` - - In the output, the column next to your username shows the 5-digit identifier for the proess. Run `kill -9 {identifier}` to kill one in particular. -- After each run, if files were output, remove them from the persistent directory before next run. -- If testing code and you end up building many images, run `docker images` to list them and you can choose which to delete +1. Make changes to the worker pod configurations. The worker pods are configured in the script itself, so this changes the code and requires the Docker image to be rebuilt in step 2 + * Parameters to configure the worker pods are in [parsl_config.py](parsl_config.py) + * Things to note: + - pod name prefix: `viz-workflow-worker` + - `image` **should** be set to the tag that will be used in step 2 + - The worker pods need to be set up to consume the GCS persistent volume. `persistent_volumes` has been set to the point to the persistent volume claim from [One-time setup](#one-time-setup) above, and the mount path **should** match the directories used for input/output data in [workflow_config.py](workflow_config.py). 
In addition, a service account and a particular annotation must be provided in order for the worker pods to consume the GCS persistent volume, per the GCP docs on [mounting persistent volumes](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver#consume-persistentvolumeclaim); this has been done by setting `service_account_name` to the service account from [One-time setup](#one-time-setup) above and including the annotation `{"gke-gcsfuse/volumes": "true"}` in `annotations`
+     - `namespace` has been set to the namespace from [One-time setup](#one-time-setup) above
+2. Make any other changes to the code and rebuild the Docker image, replacing `<tag>` below with the new image tag
+   ```
+   $ docker build -t ghcr.io/permafrostdiscoverygateway/viz-workflow:<tag> .
+   $ docker push ghcr.io/permafrostdiscoverygateway/viz-workflow:<tag>
+   ```
+3. Create or modify the leader deployment manifest
+   * Example deployment: [manifests/leader_deployment.yaml](manifests/leader_deployment.yaml)
+   * Things to note:
+     - deployment name (and container name): `viz-workflow-leader`
+     - `image` **should** be set to the tag from step 2
+     - The leader deployment needs to be set up to consume the GCS persistent volume. `volumeMounts` and `volumes` have been set to point to the persistent volume claim from [One-time setup](#one-time-setup) above, and, similar to the worker pod config, `mountPath` **should** match the directories used for input/output data in [workflow_config.py](workflow_config.py). In addition, `serviceAccountName` and the `gke-gcsfuse/volumes` annotation must be provided in the same way as for the worker pods above
+     - `namespace` has been set to the namespace from [One-time setup](#one-time-setup) above
+4. Create or update the leader deployment
+   ```
+   $ kubectl apply -f manifests/leader_deployment.yaml
+   ```
+5. Open a terminal within the leader pod and execute the script in that terminal. The pod name changes every time the deployment is updated, so replace `<leader pod name>` below with the current name
+   ```
+   $ kubectl exec -it <leader pod name> -c viz-workflow-leader -n viz-workflow -- bash
+   $ python parsl_workflow.py
+   ```
+
+## Cleanup
+
+1. From the Cloud Console GKE Workloads page, delete any worker pods. Usually the parsl script itself should clean up these pods at the end of a run, but you may need to do it manually if the previous run exited abnormally:
+   * pod_name prefix: `viz-workflow-worker`
+2. *Optional:* From the Cloud Console GKE Workloads page, delete the leader pod:
+   * deployment name: `viz-workflow-leader`
+3. 
*Optional:* From Cloud Console GCE VM Instances page, stop the GCE VM instance: + * instance name: `pdg-gke-entrypoint` + * zone: `us-west1-b` diff --git a/gke_workflow/manifests/leader_deployment.yaml b/gke_workflow/manifests/leader_deployment.yaml new file mode 100644 index 0000000..ffd3009 --- /dev/null +++ b/gke_workflow/manifests/leader_deployment.yaml @@ -0,0 +1,33 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: viz-workflow-leader + labels: + app: viz-workflow-leader + namespace: viz-workflow +spec: + selector: + matchLabels: + app: viz-workflow-leader + template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + labels: + app: viz-workflow-leader + spec: + serviceAccountName: viz-workflow-sa + containers: + - name: viz-workflow-leader + image: ghcr.io/permafrostdiscoverygateway/viz-workflow:0.2.9 + imagePullPolicy: Always + command: + - sleep + - infinity + volumeMounts: + - mountPath: /data + name: viz-workflow-volume + volumes: + - name: viz-workflow-volume + persistentVolumeClaim: + claimName: viz-workflow-pvc diff --git a/gke_workflow/manifests/persistent_volume.yaml b/gke_workflow/manifests/persistent_volume.yaml new file mode 100644 index 0000000..64d6e3a --- /dev/null +++ b/gke_workflow/manifests/persistent_volume.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: viz-workflow-pv +spec: + accessModes: + - ReadWriteMany + capacity: + storage: 5Gi + storageClassName: gcs-bucket-class + claimRef: + namespace: viz-workflow + name: viz-workflow-pvc + mountOptions: + - implicit-dirs + csi: + driver: gcsfuse.csi.storage.gke.io + volumeHandle: pdg-storage-default + volumeAttributes: + gcsfuseLoggingSeverity: warning diff --git a/gke_workflow/manifests/persistent_volume_claim.yaml b/gke_workflow/manifests/persistent_volume_claim.yaml new file mode 100644 index 0000000..8df10e2 --- /dev/null +++ b/gke_workflow/manifests/persistent_volume_claim.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: viz-workflow-pvc + namespace: viz-workflow +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + volumeName: viz-workflow-pv + storageClassName: gcs-bucket-class diff --git a/gke_workflow/manifests/service_account_role.yaml b/gke_workflow/manifests/service_account_role.yaml new file mode 100644 index 0000000..1e32543 --- /dev/null +++ b/gke_workflow/manifests/service_account_role.yaml @@ -0,0 +1,18 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: viz-workflow-sa-role + namespace: viz-workflow +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch + - create + - update + - patch + - delete diff --git a/gke_workflow/manifests/service_account_role_binding.yaml b/gke_workflow/manifests/service_account_role_binding.yaml new file mode 100644 index 0000000..50266ae --- /dev/null +++ b/gke_workflow/manifests/service_account_role_binding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: viz-workflow-sa-role-binding + namespace: viz-workflow +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: viz-workflow-sa-role +subjects: +- kind: ServiceAccount + name: viz-workflow-sa + namespace: viz-workflow diff --git a/gke_workflow/parsl_config.py b/gke_workflow/parsl_config.py index 034464e..6ddc0aa 100644 --- a/gke_workflow/parsl_config.py +++ b/gke_workflow/parsl_config.py @@ -3,10 +3,6 @@ from parsl.providers import KubernetesProvider from parsl.addresses import 
address_by_route -# not necessary if mounting volume at /usr/local/share/app: -# import subprocess -# user = subprocess.check_output("whoami").strip().decode("ascii") - def config_parsl_cluster( max_blocks = 4, min_blocks = 1, @@ -14,8 +10,8 @@ def config_parsl_cluster( max_workers = 4, cores_per_worker = 1, # TODO: automate this following string to pull most recent release on github? - image='ghcr.io/permafrostdiscoverygateway/viz-workflow:0.2.6', - namespace='pdgrun'): + image='ghcr.io/permafrostdiscoverygateway/viz-workflow:0.2.9', + namespace='viz-workflow'): htex_kube = Config( executors = [ @@ -26,8 +22,6 @@ def config_parsl_cluster( worker_logdir_root = '/', # Address for the pod worker to connect back address = address_by_route(), - # address='128.111.85.174', - #address_probe_timeout=3600, worker_debug = True, provider = KubernetesProvider( @@ -37,14 +31,11 @@ def config_parsl_cluster( # Docker image url to use for pods image = image, - # Command to be run upon pod start, such as: - # 'module load Anaconda; source activate parsl_env'. - # or 'pip install parsl' - # worker_init='echo "Worker started..."; lf=`find . -name \'manager.log\'` tail -n+1 -f ${lf}', - worker_init = 'pip install parsl==2023.11.27', - # Should follow the Kubernetes naming rules - pod_name = 'parsl-worker', + pod_name = 'viz-workflow-worker', + + init_mem='1Gi', + max_mem='2Gi', nodes_per_block = 1, init_blocks = init_blocks, @@ -54,7 +45,13 @@ def config_parsl_cluster( # persistent_volumes (list[(str, str)]) – List of tuples # describing persistent volumes to be mounted in the pod. # The tuples consist of (PVC Name, Mount Directory). - persistent_volumes = [('pdgrun-dev-0', f'/mnt/k8s-dev-pdg')] + persistent_volumes = [('viz-workflow-pvc', f'/data')], + + # This annotation is required to mount a GCS PVC to the pod and + # the service account (with sufficient permissions) is required + # to access a GCS PVC. + annotations={"gke-gcsfuse/volumes": "true"}, + service_account_name="viz-workflow-sa", ), ), ] diff --git a/gke_workflow/parsl_workflow.py b/gke_workflow/parsl_workflow.py index 28c4ca5..8d8df39 100644 --- a/gke_workflow/parsl_workflow.py +++ b/gke_workflow/parsl_workflow.py @@ -18,7 +18,6 @@ import logging import logging.handlers from pdgstaging import logging_config -import os import parsl from parsl import python_app @@ -26,16 +25,9 @@ from parsl.executors import HighThroughputExecutor from parsl.providers import KubernetesProvider from parsl.addresses import address_by_route -# from kubernetes import client, config # NOTE: might need to import this? not sure -# from . import parsl_config # NOTE: might need to import this file if running python command from Dockerfile? +from kubernetes import client, config from parsl_config import config_parsl_cluster -import shutil - -import subprocess -from subprocess import Popen -user = subprocess.check_output("whoami").strip().decode("ascii") - # call parsl config and initiate k8s cluster parsl.set_stream_logger() @@ -46,29 +38,6 @@ workflow_config = workflow_config.workflow_config -# print("Removing old directories and files...") -# TODO: Decide filepath here, /app/ or . ? 
-# using just dir names and filenames here because set WORKDIR as: -# /home/jcohen/viz-workflow/docker-parsl_workflow/app -# dir = "app/" -# old_filepaths = ["staging_summary.csv", -# "raster_summary.csv", -# "raster_events.csv", -# "config__updated.json", -# "log.log"] -# for old_file in old_filepaths: -# if os.path.exists(old_file): -# os.remove(old_file) - -# # remove dirs from past run -# old_dirs = ["staged", -# "geotiff", -# "web_tiles"] -# for old_dir in old_dirs: -# if os.path.exists(old_dir) and os.path.isdir(old_dir): -# shutil.rmtree(old_dir) - - def run_pdg_workflow( workflow_config, batch_size = 300 @@ -301,52 +270,11 @@ def make_batch(items, batch_size): if __name__ == "__main__": run_pdg_workflow(workflow_config) -# # transfer visualization log from /tmp to user dir -# # TODO: Automate the following destination path to be the mounted volume in the config -# # maybe do this by importing config script that specifies the filepath as a variable at the top -# # TODO: Decide filepath here, /app/ or . ? -# cmd = ['mv', '/tmp/log.log', '/home/jcohen/viz-workflow/docker-parsl_workflow/app/'] -# # initiate the process to run that command -# process = Popen(cmd) - -# ---------------------------------------------------------------- - -# def main(): - -# '''Main program.''' - -# # make job last a while with useless computation -# size = 30 -# stat_results = [] -# for x in range(size): -# for y in range(size): -# current_time = datetime.now() -# print(f'Schedule job at {current_time} for {x} and {y}') -# stat_results.append(calc_product_long(x, y)) - -# stats = [r.result() for r in stat_results] -# print(f"Sum of stats: {sum(stats)}") - - -# @python_app -# def calc_product_long(x, y): -# '''Useless computation to simulate one that takes a long time''' -# from datetime import datetime -# import time -# current_time = datetime.now() -# prod = x*y -# time.sleep(15) -# return(prod) - - -# if __name__ == "__main__": -# main() - # ------------------------------------------ # Shutdown and clear the parsl executor -# htex_kube.executors[0].scale_in(htex_kube.executors[0].connected_blocks()) +htex_kube.executors[0].scale_in(len(htex_kube.executors[0].connected_blocks())) htex_kube.executors[0].shutdown() parsl.clear() diff --git a/gke_workflow/requirements.txt b/gke_workflow/requirements.txt index 371b2f5..6375198 100644 --- a/gke_workflow/requirements.txt +++ b/gke_workflow/requirements.txt @@ -1,5 +1,9 @@ -parsl==2023.11.27 + +# Use a parsl version >= 2024.06.17 to support GCS persistent volumes. +parsl==2024.06.24 kubernetes==28.1.0 glances==1.7.3 -git+https://github.com/PermafrostDiscoveryGateway/viz-staging.git +# TODO: There seems to be some new dependency conflict between viz-raster and viz-staging - replace this when that's fixed. 
+# git+https://github.com/PermafrostDiscoveryGateway/viz-staging.git +git+https://github.com/PermafrostDiscoveryGateway/viz-staging.git@develop git+https://github.com/PermafrostDiscoveryGateway/viz-raster.git diff --git a/gke_workflow/workflow_config.py b/gke_workflow/workflow_config.py index 0fb5c87..d90668a 100644 --- a/gke_workflow/workflow_config.py +++ b/gke_workflow/workflow_config.py @@ -1,14 +1,14 @@ workflow_config = { "deduplicate_method": None, "deduplicate_clip_to_footprint": False, - "dir_input": "input", + "dir_input": "/data/viz_workflow/input/", "ext_input": ".gpkg", - "dir_staged": "/mnt/k8s-dev-pdg/staged/", - "dir_geotiff": "/mnt/k8s-dev-pdg/geotiff/", - "dir_web_tiles": "/mnt/k8s-dev-pdg/web_tiles/", - "filename_staging_summary": "/mnt/k8s-dev-pdg/staging_summary.csv", - "filename_rasterization_events": "/mnt/k8s-dev-pdg/raster_events.csv", - "filename_rasters_summary": "/mnt/k8s-dev-pdg/raster_summary.csv", + "dir_staged": "/data/viz_workflow/output/staged/", + "dir_geotiff": "/data/viz_workflow/output/geotiff/", + "dir_web_tiles": "/data/viz_workflow/output/web_tiles/", + "filename_staging_summary": "/data/viz_workflow/output/staging_summary.csv", + "filename_rasterization_events": "/data/viz_workflow/output/raster_events.csv", + "filename_rasters_summary": "/data/viz_workflow/output/raster_summary.csv", "simplify_tolerance": 0.1, "tms_id": "WGS1984Quad", "z_range": [