Skip to content

Commit

Permalink
Issue 3767 HCS parser: support different formats (CZI) (#3768)
Browse files Browse the repository at this point in the history
  • Loading branch information
SilinPavel authored Nov 12, 2024
1 parent 98757db commit 079dcfe
Show file tree
Hide file tree
Showing 9 changed files with 684 additions and 403 deletions.
17 changes: 10 additions & 7 deletions deploy/docker/cp-tools/research/hcs-parser/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ RUN yum install -y curl \
openssh-server \
wget \
unzip \
python && \
python-devel && \
python \
python-devel \
gcc && \
curl https://cloud-pipeline-oss-builds.s3.amazonaws.com/tools/pip/2.7/get-pip.py | python - && \
pip install pillow==9.2.0 tifffile==2019.7.26.2 numpy==1.16.6 imagecodecs-lite==2019.12.3 pandas==0.24.2 openpyxl==2.6.4
pip install pillow==6.2.2 tifffile==2019.7.26.2 numpy==1.16.6 imagecodecs-lite==2019.12.3 pandas==0.24.2 openpyxl==2.6.4

ARG ANACONDA_VERSION="3-latest"
ENV ANACONDA_HOME=/opt/local/anaconda
Expand All @@ -48,15 +48,15 @@ RUN mkdir -p "$ANACONDA_HOME" && \
rm -f /tmp/Anaconda_Install.sh && \
conda init bash && \
source $ANACONDA_HOME/etc/profile.d/conda.sh && \
conda create -y -n hcs && \
conda create -y -n hcs python=3.9 && \
conda activate hcs && \
yum install -y epel-release && \
yum clean all && \
yum install -y blosc \
ImageMagick \
java-1.8.0-openjdk-devel && \
pip3 install $CP_PIP_EXTRA_ARGS -I -q generate-tiff-offsets==0.1.7 && \
pip3 install $CP_PIP_EXTRA_ARGS -I -q awscli
pip3 install $CP_PIP_EXTRA_ARGS -I -q awscli && \
conda deactivate

ARG BIOFORMATS_TO_RAW_VERSION=0.6.1
Expand All @@ -74,19 +74,22 @@ RUN mkdir -p "$HCS_TOOLS_HOME" && \
wget -q "$BIOFORMATS_TO_RAW_DISTR_URL" -O "$HCS_TOOLS_HOME/bioformats2raw.zip" && \
unzip "$HCS_TOOLS_HOME/bioformats2raw.zip" -d "$HCS_TOOLS_HOME/bioformats2raw" && \
rm -f "$HCS_TOOLS_HOME/bioformats2raw.zip" && \
mv $HCS_TOOLS_HOME/bioformats2raw/bioformats2raw-$BIOFORMATS_TO_RAW_VERSION/* "$HCS_TOOLS_HOME/bioformats2raw" && \
rm -rf "$HCS_TOOLS_HOME/bioformats2raw/bioformats2raw-$BIOFORMATS_TO_RAW_VERSION" && \
chmod +x "$HCS_TOOLS_HOME/bioformats2raw/bin/bioformats2raw" && \
wget -q "$RAW_TO_OMETIFF_DISTR_URL" -O "$HCS_TOOLS_HOME/raw2ometiff.zip" && \
unzip "$HCS_TOOLS_HOME/raw2ometiff.zip" -d "$HCS_TOOLS_HOME/raw2ometiff" && \
rm -f "$HCS_TOOLS_HOME/raw2ometiff.zip" && \
mv $HCS_TOOLS_HOME/raw2ometiff/raw2ometiff-$RAW_TO_OMETIFF_VERSION/* $HCS_TOOLS_HOME/raw2ometiff && \
rm -rf "$HCS_TOOLS_HOME/raw2ometiff-$RAW_TO_OMETIFF_VERSION" && \
chmod +x "$HCS_TOOLS_HOME/raw2ometiff/bin/raw2ometiff" && \
wget "$BFTOOLS_DISTR_URL" -O "$HCS_TOOLS_HOME/bftools-cli.zip" && \
unzip "$HCS_TOOLS_HOME/bftools-cli.zip" -d "$HCS_TOOLS_HOME" && \
rm -f "$HCS_TOOLS_HOME/bftools-cli.zip"

ENV PATH="$HCS_TOOLS_HOME/bftools:$HCS_TOOLS_HOME/bioformats2raw/bin:$HCS_TOOLS_HOME/raw2ometiff/bin:$PATH"

COPY process_hcs_files.py "$HCS_TOOLS_HOME/scripts/parser/process_hcs_files.py"
COPY process_hcs_files_cluster.py "$HCS_TOOLS_HOME/scripts/parser/process_hcs_files_cluster.py"
COPY parser "$HCS_TOOLS_HOME/scripts/parser"
COPY convert_to_ome_tiff.sh "$HCS_TOOLS_HOME/scripts/convert_to_ome_tiff.sh"
COPY start.sh "$HCS_TOOLS_HOME/scripts/start.sh"
COPY start_cluster.sh "$HCS_TOOLS_HOME/scripts/start_cluster.sh"
Expand Down
59 changes: 59 additions & 0 deletions deploy/docker/cp-tools/research/hcs-parser/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# HCS Parser

## Description

Tool for finding microscope images in various formats and converting them to ome.tiff.
Currently, the following initial formats are supported:

- TIFF
- CZI

HCS Parser can be run in two modes:

- `Standalone mode` - one node will go through all hcs roots and process them one by one
To run hcs-parser in this mode, execute `start.sh`
- `Cluster mode` - master node will go through all hcs roots and execute `Standalone mode` for each file on different
node (by `SGE job` or `pipe run`)
To run hcs-parser in this mode, execute `start_cluster.sh`

## Input parameters

All input parameters are passed as environment variables.

### Common parameters

| Name | Description |
|-----------------------------------------------|--------------------------------------------------------------------------------------------------------------------|
| HCS_ROOT_TYPE | Type of the input hcs roots which are located in HCS_LOOKUP_DIRECTORIES or HCS_TARGET_PATHS (Supported: TIFF, CZI) |
| HCS_OBJECT_META_FILE | Name of the file with metadata about process of Harmony synchronization. TIFF supported only! |
| HCS_LOOKUP_DIRECTORIES | Directories to search hcs_roots location into. Comma separated list of paths |
| HCS_TARGET_PATHS | Paths of hcs_roots location. Comma separated list of paths |
| HCS_PARSING_LOGS_OUTPUT | Datastorage cloud path where processing logs will be uploaded during image processing |
| HCS_PARSING_TAG_MAPPING | Comma separate list of <XML tag name>=<Cloud-pipeline tag name> |
| HCS_PARSING_OUTPUT_FOLDER | Filesystem local path, where to store result of the processing (hcs files + directory with ome.tiff related files) |
| HCS_PARSING_PREVIEW_FIELDS_USE_ABSOLUTE_PATHS | |
| HCS_PARSING_IMAGE_DIR_NAME                    | Name of the folder where tiff images are located inside a hcs_root folder. TIFF supported only!                    |
| HCS_PARSING_INDEX_FILE_NAME | Name of the index.xml file inside a hcs_root folder. TIFF supported only! |
| HCS_PARSING_PLATE_DETAILS_DICT | Json string with plate types details. TIFF supported only! |
| HCS_SKIP_MARKERS                              | List of file names, comma separated. If such a file exists in a hcs_root, that hcs root will be skipped.           |
| JAVA_OPTS                                     | Java options that will be propagated to the underlying bioformats java processes for image processing              |

### Cluster mode parameters

| Name | Description |
|---------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| HCS_CLUSTER_INSTANCE_SLOT_SIZE | Number of SGE slots per node that will be available for process hcs root. |
| HCS_PARSING_CLUSTER_PROCESSING_MEMORY_PER_INSTANCE_SLOT | This parameter is used to calculate memory requirements to process hcs_root as: HCS_CLUSTER_INSTANCE_SLOT_SIZE * HCS_PARSING_CLUSTER_PROCESSING_MEMORY_PER_INSTANCE_SLOT |
| HCS_PARSING_CLUSTER_PROCESSING_MEMORY_FACTOR | If HCS_CLUSTER_INSTANCE_SLOT_SIZE is not defined, this factor will be used to define HCS_CLUSTER_INSTANCE_SLOT_SIZE as hcs_root_size / HCS_PARSING_CLUSTER_PROCESSING_MEMORY_FACTOR |
| HCS_PARSING_CLUSTER_PROCESSING_MEMORY_PER_CLUSTER_SLOT  | If HCS_CLUSTER_INSTANCE_SLOT_SIZE is not defined, this value will be used to define memory requirements to process hcs_root as: <br/>hcs_root_size / HCS_PARSING_CLUSTER_PROCESSING_MEMORY_FACTOR * HCS_CLUSTER_PROCESSING_MEMORY_CLUSTER_SLOT / HCS_PARSING_CLUSTER_PROCESSING_MEMORY_PER_INSTANCE_SLOT |
| HCS_WORKER_INSTANCE_TYPE | Instance type to be used to run cluster worker with pipe run command. Pipe run cluster option only. |
| HCS_WORKER_MEMORY_GB                                     | Max memory limit to be propagated to JVM opts for underlying bioformats java processes. Pipe run cluster option only.                                                                                                                                                                                    |

### Notification parameters

| Name | Description |
|-----------------------|--------------------------------------------------------------------------------------|
| HCS_NOTIFY_USERS | Comma separated list of emails to send notification to |
| HCS_DEPLOY_NAME | Name of the platform which will be used in notification emails |
| HCS_DATA_STORAGE_ID | Id of the input source datastorage which will be used to generate notification email |
| HCS_MARKUP_STORAGE_ID | Id of the output datastorage which will be used to generate notification email |
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,32 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This python file is one of the entry points to the hcs-parser.
Used when you need to run image processing on one node;
also used by process_hcs_files_cluster.py to run processing of particular hcs_roots.
Firstly it will find all hcs_roots to process based on:
HCS_TARGET_PATHS - exact locations of hcs roots
or
HCS_LOOKUP_DIRECTORIES - folders to search to find all hcs_roots
and
HCS_ROOT_TYPE - Currently TIFF and CZI are supported, based on this parameter hcs-parser defines how to
search hcs_roots and later how to perform some of the image generation steps
(see process_hcs_files.py and processors.py)
Secondly, based on the HCS_ROOT_TYPE property, the script will initiate one of the HcsFileParser implementations (see processors.py) to run processing
"""

import os
import multiprocessing
import traceback

from src.utils import log_run_info, log_run_success
from src.utils import get_int_run_param, get_bool_run_param
from src.fs import get_processing_roots
from src.processors import HcsFileParser
from src.processors import HcsCZIFileParser, HcsTiffFileParser
from src.hcs_entity import HcsRootType

TAGS_PROCESSING_ONLY = get_bool_run_param('HCS_PARSING_TAGS_ONLY')
EVAL_PROCESSING_ONLY = get_bool_run_param('HCS_PARSING_EVAL_ONLY')
Expand All @@ -30,15 +48,19 @@
HCS_INDEX_FILE_NAME = os.getenv('HCS_PARSING_INDEX_FILE_NAME', 'Index.xml')
HCS_IMAGE_DIR_NAME = os.getenv('HCS_PARSING_IMAGE_DIR_NAME', 'Images')
MEASUREMENT_INDEX_FILE_PATH = '/{}/{}'.format(HCS_IMAGE_DIR_NAME, HCS_INDEX_FILE_NAME)
HCS_ROOT_TYPE = HcsRootType.get(os.getenv('HCS_ROOT_TYPE', 'TIFF'))

HCS_ROOT_SEARCH_MARK = MEASUREMENT_INDEX_FILE_PATH
if HCS_ROOT_TYPE == HcsRootType.CZI:
HCS_ROOT_SEARCH_MARK = ".czi"

def try_process_hcs(hcs_root):
parser = None
processing_result = 1
try:
log_run_info('Starting processing of folder {} with image preview {}'
.format(hcs_root.root_path, hcs_root.hcs_img_path))
parser = HcsFileParser(hcs_root.root_path, hcs_root.hcs_img_path)
log_run_info('Starting processing of path {} of type {} with image preview {}'
.format(hcs_root.root_path, HCS_ROOT_TYPE, hcs_root.hcs_img_path))
parser = initialize_hcs_parser(hcs_root)
processing_result = parser.process_file()
return processing_result
except Exception as e:
Expand All @@ -52,9 +74,15 @@ def try_process_hcs(hcs_root):
parser.clear_tmp_local_dir()


def initialize_hcs_parser(hcs_root):
    """Build the file parser matching the globally configured HCS root type.

    Selects HcsCZIFileParser when HCS_ROOT_TYPE is CZI, otherwise falls back
    to the TIFF parser, and constructs it for the given hcs_root's paths.
    """
    parser_cls = HcsCZIFileParser if HCS_ROOT_TYPE == HcsRootType.CZI else HcsTiffFileParser
    return parser_cls(hcs_root.root_path, hcs_root.hcs_img_path)


def process_hcs_files():
should_force_processing = TAGS_PROCESSING_ONLY or FORCE_PROCESSING
paths_to_hcs_roots = get_processing_roots(should_force_processing, MEASUREMENT_INDEX_FILE_PATH)
paths_to_hcs_roots = get_processing_roots(should_force_processing, HCS_ROOT_SEARCH_MARK, HCS_ROOT_TYPE)
if not paths_to_hcs_roots or len(paths_to_hcs_roots) == 0:
log_run_success('Found no files requires processing in the lookup directories.')
exit(0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This python file is one of the entry points to the hcs-parser.
Used when you need to run image processing in cluster to process several images in parallel.
Firstly it will find all hcs_roots to process based on:
HCS_TARGET_PATHS - exact locations of hcs roots
or
HCS_LOOKUP_DIRECTORIES - folders to search to find all hcs_roots
and
HCS_ROOT_TYPE - Currently TIFF and CZI are supported, based on this parameter hcs-parser defines how to
search hcs_roots and later how to perform some of the image generation steps
(see process_hcs_files.py and processors.py)
Secondly, based on the HCS_ASYNC_PROCESSING property, the script will run an SGE job or execute a pipe run command
to start an additional node which will execute process_hcs_files.py with the specified HCS_TARGET_PATHS
to run actual image generation
"""
import os
import math
from pipeline.api import PipelineAPI
Expand All @@ -26,6 +43,7 @@
from src.fs import get_processing_roots
from src.utils import HcsFileLogger, log_run_info, log_run_success
from src.utils import get_int_run_param, get_bool_run_param
from src.hcs_entity import HcsRootType

SUCCESS_EXIT_CODE = '0'
ASYNC_EXIT_CODE = '777'
Expand Down Expand Up @@ -100,6 +118,12 @@
HCS_INDEX_FILE_NAME = os.getenv('HCS_PARSING_INDEX_FILE_NAME', 'Index.xml')
HCS_IMAGE_DIR_NAME = os.getenv('HCS_PARSING_IMAGE_DIR_NAME', 'Images')
MEASUREMENT_INDEX_FILE_PATH = '/{}/{}'.format(HCS_IMAGE_DIR_NAME, HCS_INDEX_FILE_NAME)
HCS_ROOT_TYPE = HcsRootType.get(os.getenv('HCS_ROOT_TYPE', 'TIFF'))

HCS_ROOT_SEARCH_MARK = MEASUREMENT_INDEX_FILE_PATH
if HCS_ROOT_TYPE == HcsRootType.CZI:
HCS_ROOT_SEARCH_MARK = ".czi"

HCS_CLUSTER_PROCESSING_MEMORY_SIZE_SLOT_FACTOR = get_int_run_param('HCS_PARSING_CLUSTER_PROCESSING_MEMORY_FACTOR', 20)
HCS_CLUSTER_INSTANCE_SLOT_SIZE = get_int_run_param('HCS_CLUSTER_INSTANCE_SLOT_SIZE', 0)
HCS_CLUSTER_PROCESSING_MEMORY_CLUSTER_SLOT = \
Expand All @@ -121,9 +145,10 @@ class HcsFileSgeParser:
PENDING_JOB_STATUSES = ['qw', 'qw', 'hqw', 'hqw', 'hRwq', 'hRwq', 'hRwq', 'qw', 'qw']
RUNNING_JOB_STATUSES = ['r', 't', 'Rr', 'Rt']

def __init__(self, hcs_file_root_path, hcs_img_path, root_type):
    """Initialize the SGE-based parser wrapper for a single HCS root.

    :param hcs_file_root_path: filesystem path of the hcs_root to process
    :param hcs_img_path: path of the preview image associated with the root
    :param root_type: HcsRootType value (e.g. TIFF or CZI) propagated to the
        worker so it knows how to parse this root
    """
    # Fix: the diff residue kept both the old two-argument signature and the
    # new three-argument one; only the new signature (with root_type) is valid.
    self.hcs_root_path = hcs_file_root_path
    self.hcs_img_path = hcs_img_path
    self.root_type = root_type
    self.processing_logger = HcsFileLogger(hcs_file_root_path)

@staticmethod
Expand Down Expand Up @@ -207,7 +232,7 @@ def _calculate_hcs_dir_size_gigabytes(self):
cloud_path_chunks = cloud_path.split('/', 1)
storage_name = cloud_path_chunks[0]
relative_path = cloud_path_chunks[1] if len(cloud_path_chunks) == 2 else ''
command = "pipe storage du '{}' -p '{}' -f GB | awk ' FNR > 1 {{ print $3 }}' ".format(storage_name, relative_path)
command = "pipe storage du '{}' -p '{}' -f GB | awk ' FNR == 2 {{ print $(NF-1) }}' ".format(storage_name, relative_path)
output = subprocess.check_output(command, shell=True)
try:
return float(output.strip())
Expand All @@ -217,13 +242,14 @@ def _calculate_hcs_dir_size_gigabytes(self):
def _build_env_vars_to_propagate(self, heap_limit_gb):
jvm_parameters = COMMON_JAVA_OPTS + ' -Xmx{}G'.format(heap_limit_gb)
env_vars_string = '''
export HCS_TARGET_DIRECTORIES="{}"
export HCS_TARGET_PATHS="{}"
export HCS_TARGET_IMG_NAMES="{}"
export HCS_ROOT_TYPE="{}"
export JAVA_OPTS="{}"
export HCS_PARSER_PROCESSING_THREADS=1
export PATH="{}"
export BF_MAX_MEM="{}G"
'''.format(self.hcs_root_path, self.hcs_img_path, jvm_parameters, os.getenv('PATH'), str(heap_limit_gb))
'''.format(self.hcs_root_path, self.hcs_img_path, self.root_type.name, jvm_parameters, os.getenv('PATH'), str(heap_limit_gb))
for key, value in os.environ.items():
if key.startswith('HCS_PARSING_'):
if key == 'HCS_PARSING_PLATE_DETAILS_DICT':
Expand All @@ -234,8 +260,9 @@ def _build_env_vars_to_propagate(self, heap_limit_gb):

def _get_propagated_env_vars(self, memory_limit):
result = {
'HCS_TARGET_DIRECTORIES': self.hcs_root_path,
'HCS_TARGET_PATHS': self.hcs_root_path,
'HCS_TARGET_IMG_NAMES': self.hcs_img_path,
'HCS_ROOT_TYPE': HCS_ROOT_TYPE.name,
'JAVA_OPTS': COMMON_JAVA_OPTS + ' -Xmx{}G'.format(memory_limit),
'HCS_PARSER_PROCESSING_THREADS': '1',
'CP_CAP_LIMIT_MOUNTS': os.getenv('CP_CAP_LIMIT_MOUNTS')
Expand Down Expand Up @@ -340,7 +367,7 @@ def _get_job_state(self, job_id):


def try_process_hcs_in_cluster(hcs_root_dir):
parser = HcsFileSgeParser(hcs_root_dir.root_path, hcs_root_dir.hcs_img_path)
parser = HcsFileSgeParser(hcs_root_dir.root_path, hcs_root_dir.hcs_img_path, HCS_ROOT_TYPE)
try:
return parser.process_file_using_pipe() if ASYNC_MODE else parser.process_file_in_sge()
except Exception as e:
Expand All @@ -353,7 +380,7 @@ def try_process_hcs_in_cluster(hcs_root_dir):

def process_hcs_files_cluster():
should_force_processing = TAGS_PROCESSING_ONLY or EVAL_PROCESSING_ONLY or FORCE_PROCESSING
paths_to_hcs_roots = get_processing_roots(should_force_processing, MEASUREMENT_INDEX_FILE_PATH)
paths_to_hcs_roots = get_processing_roots(should_force_processing, HCS_ROOT_SEARCH_MARK, HCS_ROOT_TYPE)
if not paths_to_hcs_roots or len(paths_to_hcs_roots) == 0:
log_run_success('Found no files requires processing in the lookup directories.')
exit(0)
Expand Down
Loading

0 comments on commit 079dcfe

Please sign in to comment.