Skip to content

Commit

Permalink
Issue 3767 HCS parser: support different formats (CZI) (#3768)
Browse files Browse the repository at this point in the history
  • Loading branch information
SilinPavel authored Nov 12, 2024
1 parent 98757db commit 079dcfe
Show file tree
Hide file tree
Showing 9 changed files with 684 additions and 403 deletions.
17 changes: 10 additions & 7 deletions deploy/docker/cp-tools/research/hcs-parser/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ RUN yum install -y curl \
openssh-server \
wget \
unzip \
python && \
python-devel && \
python \
python-devel \
gcc && \
curl https://cloud-pipeline-oss-builds.s3.amazonaws.com/tools/pip/2.7/get-pip.py | python - && \
pip install pillow==9.2.0 tifffile==2019.7.26.2 numpy==1.16.6 imagecodecs-lite==2019.12.3 pandas==0.24.2 openpyxl==2.6.4
pip install pillow==6.2.2 tifffile==2019.7.26.2 numpy==1.16.6 imagecodecs-lite==2019.12.3 pandas==0.24.2 openpyxl==2.6.4

ARG ANACONDA_VERSION="3-latest"
ENV ANACONDA_HOME=/opt/local/anaconda
Expand All @@ -48,15 +48,15 @@ RUN mkdir -p "$ANACONDA_HOME" && \
rm -f /tmp/Anaconda_Install.sh && \
conda init bash && \
source $ANACONDA_HOME/etc/profile.d/conda.sh && \
conda create -y -n hcs && \
conda create -y -n hcs python=3.9 && \
conda activate hcs && \
yum install -y epel-release && \
yum clean all && \
yum install -y blosc \
ImageMagick \
java-1.8.0-openjdk-devel && \
pip3 install $CP_PIP_EXTRA_ARGS -I -q generate-tiff-offsets==0.1.7 && \
pip3 install $CP_PIP_EXTRA_ARGS -I -q awscli
pip3 install $CP_PIP_EXTRA_ARGS -I -q awscli && \
conda deactivate

ARG BIOFORMATS_TO_RAW_VERSION=0.6.1
Expand All @@ -74,19 +74,22 @@ RUN mkdir -p "$HCS_TOOLS_HOME" && \
wget -q "$BIOFORMATS_TO_RAW_DISTR_URL" -O "$HCS_TOOLS_HOME/bioformats2raw.zip" && \
unzip "$HCS_TOOLS_HOME/bioformats2raw.zip" -d "$HCS_TOOLS_HOME/bioformats2raw" && \
rm -f "$HCS_TOOLS_HOME/bioformats2raw.zip" && \
mv $HCS_TOOLS_HOME/bioformats2raw/bioformats2raw-$BIOFORMATS_TO_RAW_VERSION/* "$HCS_TOOLS_HOME/bioformats2raw" && \
rm -rf "$HCS_TOOLS_HOME/bioformats2raw/bioformats2raw-$BIOFORMATS_TO_RAW_VERSION" && \
chmod +x "$HCS_TOOLS_HOME/bioformats2raw/bin/bioformats2raw" && \
wget -q "$RAW_TO_OMETIFF_DISTR_URL" -O "$HCS_TOOLS_HOME/raw2ometiff.zip" && \
unzip "$HCS_TOOLS_HOME/raw2ometiff.zip" -d "$HCS_TOOLS_HOME/raw2ometiff" && \
rm -f "$HCS_TOOLS_HOME/raw2ometiff.zip" && \
mv $HCS_TOOLS_HOME/raw2ometiff/raw2ometiff-$RAW_TO_OMETIFF_VERSION/* $HCS_TOOLS_HOME/raw2ometiff && \
rm -rf "$HCS_TOOLS_HOME/raw2ometiff-$RAW_TO_OMETIFF_VERSION" && \
chmod +x "$HCS_TOOLS_HOME/raw2ometiff/bin/raw2ometiff" && \
wget "$BFTOOLS_DISTR_URL" -O "$HCS_TOOLS_HOME/bftools-cli.zip" && \
unzip "$HCS_TOOLS_HOME/bftools-cli.zip" -d "$HCS_TOOLS_HOME" && \
rm -f "$HCS_TOOLS_HOME/bftools-cli.zip"

ENV PATH="$HCS_TOOLS_HOME/bftools:$HCS_TOOLS_HOME/bioformats2raw/bin:$HCS_TOOLS_HOME/raw2ometiff/bin:$PATH"

COPY process_hcs_files.py "$HCS_TOOLS_HOME/scripts/parser/process_hcs_files.py"
COPY process_hcs_files_cluster.py "$HCS_TOOLS_HOME/scripts/parser/process_hcs_files_cluster.py"
COPY parser "$HCS_TOOLS_HOME/scripts/parser"
COPY convert_to_ome_tiff.sh "$HCS_TOOLS_HOME/scripts/convert_to_ome_tiff.sh"
COPY start.sh "$HCS_TOOLS_HOME/scripts/start.sh"
COPY start_cluster.sh "$HCS_TOOLS_HOME/scripts/start_cluster.sh"
Expand Down
59 changes: 59 additions & 0 deletions deploy/docker/cp-tools/research/hcs-parser/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# HCS Parser

## Description

Tool for finding microscope images in various formats and converting them to ome.tiff.
Currently, the following initial formats are supported:

- TIFF
- CZI

HCS Parser can be run in two modes:

- `Standalone mode` - one node will go through all hcs roots and process them one by one
To run hcs-parser in this mode, execute `start.sh`
- `Cluster mode` - master node will go through all hcs roots and execute `Standalone mode` for each file on different
node (by `SGE job` or `pipe run`)
To run hcs-parser in this mode, execute `start_cluster.sh`

## Input parameters

All input parameters are passed as environment variables.

### Common parameters

| Name | Description |
|-----------------------------------------------|--------------------------------------------------------------------------------------------------------------------|
| HCS_ROOT_TYPE | Type of the input hcs roots which are located in HCS_LOOKUP_DIRECTORIES or HCS_TARGET_PATHS (Supported: TIFF, CZI) |
| HCS_OBJECT_META_FILE | Name of the file with metadata about process of Harmony synchronization. TIFF supported only! |
| HCS_LOOKUP_DIRECTORIES | Directories to search hcs_roots location into. Comma separated list of paths |
| HCS_TARGET_PATHS | Paths of hcs_roots location. Comma separated list of paths |
| HCS_PARSING_LOGS_OUTPUT | Datastorage cloud path where processing logs will be uploaded during image processing |
| HCS_PARSING_TAG_MAPPING | Comma separate list of <XML tag name>=<Cloud-pipeline tag name> |
| HCS_PARSING_OUTPUT_FOLDER | Filesystem local path, where to store result of the processing (hcs files + directory with ome.tiff related files) |
| HCS_PARSING_PREVIEW_FIELDS_USE_ABSOLUTE_PATHS | |
| HCS_PARSING_IMAGE_DIR_NAME                    | Name of the folder where tiff images are located inside a hcs_root folder. TIFF supported only!                    |
| HCS_PARSING_INDEX_FILE_NAME | Name of the index.xml file inside a hcs_root folder. TIFF supported only! |
| HCS_PARSING_PLATE_DETAILS_DICT | Json string with plate types details. TIFF supported only! |
| HCS_SKIP_MARKERS                              | List of file names, comma separated. If such a file exists in a hcs_root, that hcs root will be skipped.           |
| JAVA_OPTS                                     | Java options that will be propagated to the underlying bioformats java processes for image processing              |

### Cluster mode parameters

| Name | Description |
|---------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| HCS_CLUSTER_INSTANCE_SLOT_SIZE | Number of SGE slots per node that will be available for process hcs root. |
| HCS_PARSING_CLUSTER_PROCESSING_MEMORY_PER_INSTANCE_SLOT | This parameter is used to calculate memory requirements to process hcs_root as: HCS_CLUSTER_INSTANCE_SLOT_SIZE * HCS_PARSING_CLUSTER_PROCESSING_MEMORY_PER_INSTANCE_SLOT |
| HCS_PARSING_CLUSTER_PROCESSING_MEMORY_FACTOR | If HCS_CLUSTER_INSTANCE_SLOT_SIZE is not defined, this factor will be used to define HCS_CLUSTER_INSTANCE_SLOT_SIZE as hcs_root_size / HCS_PARSING_CLUSTER_PROCESSING_MEMORY_FACTOR |
| HCS_PARSING_CLUSTER_PROCESSING_MEMORY_PER_CLUSTER_SLOT  | If HCS_CLUSTER_INSTANCE_SLOT_SIZE is not defined, this value will be used to define memory requirements to process hcs_root as: <br/>hcs_root_size / HCS_PARSING_CLUSTER_PROCESSING_MEMORY_FACTOR * HCS_CLUSTER_PROCESSING_MEMORY_CLUSTER_SLOT / HCS_PARSING_CLUSTER_PROCESSING_MEMORY_PER_INSTANCE_SLOT |
| HCS_WORKER_INSTANCE_TYPE | Instance type to be used to run cluster worker with pipe run command. Pipe run cluster option only. |
| HCS_WORKER_MEMORY_GB                                     | Max memory limit to be propagated to JVM opts for underlying bioformats java processes. Pipe run cluster option only.                                                                                                                                                                                    |

### Notification parameters

| Name | Description |
|-----------------------|--------------------------------------------------------------------------------------|
| HCS_NOTIFY_USERS | Comma separated list of emails to send notification to |
| HCS_DEPLOY_NAME | Name of the platform which will be used in notification emails |
| HCS_DATA_STORAGE_ID | Id of the input source datastorage which will be used to generate notification email |
| HCS_MARKUP_STORAGE_ID | Id of the output datastorage which will be used to generate notification email |
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,32 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This python file is one of the entry points to the hcs-parser.
Used when you need to run image processing on one node;
also used by process_hcs_files_cluster.py to run processing of particular hcs_roots.
Firstly it will find all hcs_roots to process based on:
HCS_TARGET_PATHS - exact locations of hcs roots
or
HCS_LOOKUP_DIRECTORIES - folders to search to find all hcs_roots
and
HCS_ROOT_TYPE - Currently TIFF and CZI are supported, based on this parameter hcs-parser defines how to
search hcs_roots and later how to perform some of the image generation steps
(see process_hcs_files.py and processors.py)
Secondly, based on the HCS_ROOT_TYPE property, the script will initiate one of the HcsFileParser implementations (see processors.py) to run processing
"""

import os
import multiprocessing
import traceback

from src.utils import log_run_info, log_run_success
from src.utils import get_int_run_param, get_bool_run_param
from src.fs import get_processing_roots
from src.processors import HcsFileParser
from src.processors import HcsCZIFileParser, HcsTiffFileParser
from src.hcs_entity import HcsRootType

TAGS_PROCESSING_ONLY = get_bool_run_param('HCS_PARSING_TAGS_ONLY')
EVAL_PROCESSING_ONLY = get_bool_run_param('HCS_PARSING_EVAL_ONLY')
Expand All @@ -30,15 +48,19 @@
HCS_INDEX_FILE_NAME = os.getenv('HCS_PARSING_INDEX_FILE_NAME', 'Index.xml')
HCS_IMAGE_DIR_NAME = os.getenv('HCS_PARSING_IMAGE_DIR_NAME', 'Images')
MEASUREMENT_INDEX_FILE_PATH = '/{}/{}'.format(HCS_IMAGE_DIR_NAME, HCS_INDEX_FILE_NAME)
HCS_ROOT_TYPE = HcsRootType.get(os.getenv('HCS_ROOT_TYPE', 'TIFF'))

HCS_ROOT_SEARCH_MARK = MEASUREMENT_INDEX_FILE_PATH
if HCS_ROOT_TYPE == HcsRootType.CZI:
HCS_ROOT_SEARCH_MARK = ".czi"

def try_process_hcs(hcs_root):
parser = None
processing_result = 1
try:
log_run_info('Starting processing of folder {} with image preview {}'
.format(hcs_root.root_path, hcs_root.hcs_img_path))
parser = HcsFileParser(hcs_root.root_path, hcs_root.hcs_img_path)
log_run_info('Starting processing of path {} of type {} with image preview {}'
.format(hcs_root.root_path, HCS_ROOT_TYPE, hcs_root.hcs_img_path))
parser = initialize_hcs_parser(hcs_root)
processing_result = parser.process_file()
return processing_result
except Exception as e:
Expand All @@ -52,9 +74,15 @@ def try_process_hcs(hcs_root):
parser.clear_tmp_local_dir()


def initialize_hcs_parser(hcs_root):
    """Build the file parser matching the globally configured HCS root type.

    Selects HcsCZIFileParser when HCS_ROOT_TYPE is CZI, otherwise falls back
    to the TIFF parser, and constructs it for the given hcs_root's paths.
    """
    parser_cls = HcsCZIFileParser if HCS_ROOT_TYPE == HcsRootType.CZI else HcsTiffFileParser
    return parser_cls(hcs_root.root_path, hcs_root.hcs_img_path)


def process_hcs_files():
should_force_processing = TAGS_PROCESSING_ONLY or FORCE_PROCESSING
paths_to_hcs_roots = get_processing_roots(should_force_processing, MEASUREMENT_INDEX_FILE_PATH)
paths_to_hcs_roots = get_processing_roots(should_force_processing, HCS_ROOT_SEARCH_MARK, HCS_ROOT_TYPE)
if not paths_to_hcs_roots or len(paths_to_hcs_roots) == 0:
log_run_success('Found no files requires processing in the lookup directories.')
exit(0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This python file is one of the entry points to the hcs-parser.
Used when you need to run image processing in cluster to process several images in parallel.
Firstly it will find all hcs_roots to process based on:
HCS_TARGET_PATHS - exact locations of hcs roots
or
HCS_LOOKUP_DIRECTORIES - folders to search to find all hcs_roots
and
HCS_ROOT_TYPE - Currently TIFF and CZI are supported, based on this parameter hcs-parser defines how to
search hcs_roots and later how to perform some of the image generation steps
(see process_hcs_files.py and processors.py)
Secondly, based on the HCS_ASYNC_PROCESSING property, the script will run an SGE job or execute a pipe run command
to start an additional node which will execute process_hcs_files.py with the specified HCS_TARGET_PATHS
to run actual image generation
"""
import os
import math
from pipeline.api import PipelineAPI
Expand All @@ -26,6 +43,7 @@
from src.fs import get_processing_roots
from src.utils import HcsFileLogger, log_run_info, log_run_success
from src.utils import get_int_run_param, get_bool_run_param
from src.hcs_entity import HcsRootType

SUCCESS_EXIT_CODE = '0'
ASYNC_EXIT_CODE = '777'
Expand Down Expand Up @@ -100,6 +118,12 @@
HCS_INDEX_FILE_NAME = os.getenv('HCS_PARSING_INDEX_FILE_NAME', 'Index.xml')
HCS_IMAGE_DIR_NAME = os.getenv('HCS_PARSING_IMAGE_DIR_NAME', 'Images')
MEASUREMENT_INDEX_FILE_PATH = '/{}/{}'.format(HCS_IMAGE_DIR_NAME, HCS_INDEX_FILE_NAME)
HCS_ROOT_TYPE = HcsRootType.get(os.getenv('HCS_ROOT_TYPE', 'TIFF'))

HCS_ROOT_SEARCH_MARK = MEASUREMENT_INDEX_FILE_PATH
if HCS_ROOT_TYPE == HcsRootType.CZI:
HCS_ROOT_SEARCH_MARK = ".czi"

HCS_CLUSTER_PROCESSING_MEMORY_SIZE_SLOT_FACTOR = get_int_run_param('HCS_PARSING_CLUSTER_PROCESSING_MEMORY_FACTOR', 20)
HCS_CLUSTER_INSTANCE_SLOT_SIZE = get_int_run_param('HCS_CLUSTER_INSTANCE_SLOT_SIZE', 0)
HCS_CLUSTER_PROCESSING_MEMORY_CLUSTER_SLOT = \
Expand All @@ -121,9 +145,10 @@ class HcsFileSgeParser:
PENDING_JOB_STATUSES = ['qw', 'qw', 'hqw', 'hqw', 'hRwq', 'hRwq', 'hRwq', 'qw', 'qw']
RUNNING_JOB_STATUSES = ['r', 't', 'Rr', 'Rt']

def __init__(self, hcs_file_root_path, hcs_img_path, root_type):
    """Initialize the SGE-based parser wrapper for a single HCS root.

    :param hcs_file_root_path: filesystem path of the hcs_root to process
    :param hcs_img_path: path of the preview image associated with the root
    :param root_type: HcsRootType value (e.g. TIFF or CZI) propagated to the
        worker so it knows how to parse this root
    """
    # Fix: the diff residue kept both the old two-argument signature and the
    # new three-argument one; only the new signature (with root_type) is valid.
    self.hcs_root_path = hcs_file_root_path
    self.hcs_img_path = hcs_img_path
    self.root_type = root_type
    self.processing_logger = HcsFileLogger(hcs_file_root_path)

@staticmethod
Expand Down Expand Up @@ -207,7 +232,7 @@ def _calculate_hcs_dir_size_gigabytes(self):
cloud_path_chunks = cloud_path.split('/', 1)
storage_name = cloud_path_chunks[0]
relative_path = cloud_path_chunks[1] if len(cloud_path_chunks) == 2 else ''
command = "pipe storage du '{}' -p '{}' -f GB | awk ' FNR > 1 {{ print $3 }}' ".format(storage_name, relative_path)
command = "pipe storage du '{}' -p '{}' -f GB | awk ' FNR == 2 {{ print $(NF-1) }}' ".format(storage_name, relative_path)
output = subprocess.check_output(command, shell=True)
try:
return float(output.strip())
Expand All @@ -217,13 +242,14 @@ def _calculate_hcs_dir_size_gigabytes(self):
def _build_env_vars_to_propagate(self, heap_limit_gb):
jvm_parameters = COMMON_JAVA_OPTS + ' -Xmx{}G'.format(heap_limit_gb)
env_vars_string = '''
export HCS_TARGET_DIRECTORIES="{}"
export HCS_TARGET_PATHS="{}"
export HCS_TARGET_IMG_NAMES="{}"
export HCS_ROOT_TYPE="{}"
export JAVA_OPTS="{}"
export HCS_PARSER_PROCESSING_THREADS=1
export PATH="{}"
export BF_MAX_MEM="{}G"
'''.format(self.hcs_root_path, self.hcs_img_path, jvm_parameters, os.getenv('PATH'), str(heap_limit_gb))
'''.format(self.hcs_root_path, self.hcs_img_path, self.root_type.name, jvm_parameters, os.getenv('PATH'), str(heap_limit_gb))
for key, value in os.environ.items():
if key.startswith('HCS_PARSING_'):
if key == 'HCS_PARSING_PLATE_DETAILS_DICT':
Expand All @@ -234,8 +260,9 @@ def _build_env_vars_to_propagate(self, heap_limit_gb):

def _get_propagated_env_vars(self, memory_limit):
result = {
'HCS_TARGET_DIRECTORIES': self.hcs_root_path,
'HCS_TARGET_PATHS': self.hcs_root_path,
'HCS_TARGET_IMG_NAMES': self.hcs_img_path,
'HCS_ROOT_TYPE': HCS_ROOT_TYPE.name,
'JAVA_OPTS': COMMON_JAVA_OPTS + ' -Xmx{}G'.format(memory_limit),
'HCS_PARSER_PROCESSING_THREADS': '1',
'CP_CAP_LIMIT_MOUNTS': os.getenv('CP_CAP_LIMIT_MOUNTS')
Expand Down Expand Up @@ -340,7 +367,7 @@ def _get_job_state(self, job_id):


def try_process_hcs_in_cluster(hcs_root_dir):
parser = HcsFileSgeParser(hcs_root_dir.root_path, hcs_root_dir.hcs_img_path)
parser = HcsFileSgeParser(hcs_root_dir.root_path, hcs_root_dir.hcs_img_path, HCS_ROOT_TYPE)
try:
return parser.process_file_using_pipe() if ASYNC_MODE else parser.process_file_in_sge()
except Exception as e:
Expand All @@ -353,7 +380,7 @@ def try_process_hcs_in_cluster(hcs_root_dir):

def process_hcs_files_cluster():
should_force_processing = TAGS_PROCESSING_ONLY or EVAL_PROCESSING_ONLY or FORCE_PROCESSING
paths_to_hcs_roots = get_processing_roots(should_force_processing, MEASUREMENT_INDEX_FILE_PATH)
paths_to_hcs_roots = get_processing_roots(should_force_processing, HCS_ROOT_SEARCH_MARK, HCS_ROOT_TYPE)
if not paths_to_hcs_roots or len(paths_to_hcs_roots) == 0:
log_run_success('Found no files requires processing in the lookup directories.')
exit(0)
Expand Down
Loading

0 comments on commit 079dcfe

Please sign in to comment.