Merge pull request galaxyproject#18928 from mvdbeek/add_filter_null_collection_operation_tool

Add filter null collection operation tool
natefoo · Oct 4, 2024 · e3f4068 · e3f4068
2 parents d18bc68 + 1c2bfb9
commit e3f4068
Show file tree

Hide file tree

Showing 11 changed files with 129 additions and 9 deletions.
diff --git a/lib/galaxy/config/sample/tool_conf.xml.sample b/lib/galaxy/config/sample/tool_conf.xml.sample
@@ -32,6 +32,7 @@
     <tool file="${model_tools_path}/zip_collection.xml" />
     <tool file="${model_tools_path}/filter_failed_collection.xml" />
     <tool file="${model_tools_path}/filter_empty_collection.xml" />
+    <tool file="${model_tools_path}/filter_null.xml" />
     <tool file="${model_tools_path}/flatten_collection.xml" />
     <tool file="${model_tools_path}/merge_collection.xml" />
     <tool file="${model_tools_path}/relabel_from_file.xml" />

diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py
@@ -4723,8 +4723,10 @@ def set_skipped(self, object_store_populator: "ObjectStorePopulator") -> None:
         self.state = self.states.OK
         self.blurb = "skipped"
         self.visible = False
+        null = json.dumps(None)
         with open(self.dataset.get_file_name(), "w") as out:
-            out.write(json.dumps(None))
+            out.write(null)
+        self.peek = null
         self.set_total_size()
 
     def get_file_name(self, sync_cache: bool = True) -> str:

diff --git a/lib/galaxy/tool_util/models.py b/lib/galaxy/tool_util/models.py
@@ -156,6 +156,7 @@ class TestCollectionOutputAssertions(StrictModel):
     class_: Optional[Literal["Collection"]] = Field("Collection", alias="class")
     elements: Optional[Dict[str, TestCollectionElementAssertion]] = None
     element_tests: Optional[Dict[str, "TestCollectionElementAssertion"]] = None
+    element_count: Optional[int] = None
     attributes: Optional[CollectionAttributes] = None
     collection_type: CollectionType = None
 

diff --git a/lib/galaxy/tool_util/parser/interface.py b/lib/galaxy/tool_util/parser/interface.py
@@ -861,10 +861,13 @@ def matches(ie_list: List, rel_path: str):
 class TestCollectionOutputDef:
     __test__ = False  # Prevent pytest from discovering this class (issue #12071)
 
-    def __init__(self, name, attrib, element_tests):
+    def __init__(self, name, attrib, element_tests, element_count: Optional[int] = None):
         self.name = name
         self.collection_type = attrib.get("type", None)
-        count = attrib.get("count", None)
+        if element_count is not None:
+            count = element_count
+        else:
+            count = attrib.get("count")
         self.count = int(count) if count is not None else None
         self.attrib = attrib
         self.element_tests = element_tests
@@ -874,7 +877,8 @@ def from_dict(as_dict):
         return TestCollectionOutputDef(
             name=as_dict["name"],
             attrib=as_dict.get("attributes", {}),
-            element_tests=as_dict["element_tests"],
+            element_tests=as_dict.get("element_tests"),
+            element_count=as_dict.get("element_count"),
         )
 
     @staticmethod
@@ -891,7 +895,7 @@ def from_yaml_test_format(as_dict):
         return TestCollectionOutputDef.from_dict(as_dict)
 
     def to_dict(self):
-        return dict(name=self.name, attributes=self.attrib, element_tests=self.element_tests)
+        return dict(name=self.name, attributes=self.attrib, element_tests=self.element_tests, element_count=self.count)
 
 
 class DrillDownOptionsDict(TypedDict):

diff --git a/lib/galaxy/tool_util/verify/interactor.py b/lib/galaxy/tool_util/verify/interactor.py
@@ -1132,7 +1132,7 @@ def verify_collection(output_collection_def, data_collection, verify_dataset):
             raise AssertionError(message)
 
     expected_element_count = output_collection_def.count
-    if expected_element_count:
+    if expected_element_count is not None:
         actual_element_count = len(data_collection["elements"])
         if expected_element_count != actual_element_count:
             message = f"Output collection '{name}': expected to have {expected_element_count} elements, but it had {actual_element_count}."
@@ -1185,7 +1185,8 @@ def verify_elements(element_objects, element_tests):
                     message = f"Output collection '{name}': identifier '{identifier}' found out of order, expected order of {expected_sort_order} for the tool generated collection elements {eo_ids}"
                     raise AssertionError(message)
 
-    verify_elements(data_collection["elements"], output_collection_def.element_tests)
+    if output_collection_def.element_tests:
+        verify_elements(data_collection["elements"], output_collection_def.element_tests)
 
 
 def _verify_composite_datatype_file_content(

diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py
@@ -3798,7 +3798,15 @@ class FilterNullTool(FilterDatasetsTool):
     def element_is_valid(element: model.DatasetCollectionElement):
         element_object = element.element_object
         assert isinstance(element_object, model.DatasetInstance)
-        return element_object.extension == "expression.json" and element_object.blurb == "skipped"
+        if element_object.extension == "expression.json":
+            if element_object.peek == "null":
+                # shortcut
+                return False
+            else:
+                with open(element_object.get_file_name()) as fh:
+                    if fh.read(5) == "null":
+                        return False
+        return True
 
 
 class FlattenTool(DatabaseOperationTool):

diff --git a/lib/galaxy/tools/filter_null.xml b/lib/galaxy/tools/filter_null.xml
@@ -0,0 +1,46 @@
+<tool id="__FILTER_NULL__" name="Filter null elements" version="1.0.0" tool_type="filter_null">
+    <description/>
+    <type class="FilterNullTool" module="galaxy.tools"/>
+    <action module="galaxy.tools.actions.model_operations" class="ModelOperationToolAction"/>
+    <edam_operations>
+        <edam_operation>operation_3695</edam_operation>
+    </edam_operations>
+    <inputs>
+        <param type="data_collection" collection_type="list,list:paired" name="input" label="Input Collection"/>
+    </inputs>
+    <outputs>
+        <collection name="output" format_source="input" type_source="input" label="${on_string} (without null datasets)">
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input">
+                <collection type="list">
+                    <element name="e1" value="simple_line.txt"/>
+                </collection>
+            </param>
+            <output_collection name="output" type="list" count="1">
+                <element name="e1">
+                    <assert_contents>
+                        <has_text_matching expression="^This is a line of text.\n$"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+========
+Synopsis
+========
+
+Removes null elements from a collection.
+
+This tool takes a dataset collection and filters out nulls. This is useful for removing elements that resulted from conditional execution of jobs.
+
+.. class:: infomark
+
+This tool will create new history datasets from your collection but your quota usage will not increase.
+
+      ]]></help>
+</tool>
diff --git a/lib/galaxy_test/workflow/filter_null.gxwf-tests.yml b/lib/galaxy_test/workflow/filter_null.gxwf-tests.yml
@@ -0,0 +1,32 @@
+- doc: |
+    Test to verify filter null tool keeps non-null datasets.
+  job:
+    input_collection:
+      collection_type: list
+      elements:
+        - identifier: first
+          content: "abc"
+    when:
+      value: true
+      type: raw
+  outputs:
+    out:
+      class: Collection
+      collection_type: list
+      element_count: 1
+- doc: |
+    Test to verify filter null tool discards null datasets.
+  job:
+    input_collection:
+      collection_type: list
+      elements:
+        - identifier: first
+          content: "abc"
+    when:
+      value: false
+      type: raw
+  outputs:
+    out:
+      class: Collection
+      collection_type: list
+      element_count: 0
diff --git a/lib/galaxy_test/workflow/filter_null.gxwf.yml b/lib/galaxy_test/workflow/filter_null.gxwf.yml
@@ -0,0 +1,22 @@
+class: GalaxyWorkflow
+inputs:
+  input_collection:
+    type: data_collection
+  when:
+    type: boolean
+outputs:
+  out:
+    outputSource: filter_null/output
+steps:
+  cat:
+    tool_id: cat
+    in:
+      input1:
+        source: input_collection
+      when:
+        source: when
+    when: $(inputs.when)
+  filter_null:
+    tool_id: '__FILTER_NULL__'
+    in:
+      input: cat/out_file1
diff --git a/lib/galaxy_test/workflow/test_framework_workflows.py b/lib/galaxy_test/workflow/test_framework_workflows.py
@@ -74,7 +74,9 @@ def _verify(self, run_summary: RunJobsSummary, output_definitions: OutputsDict):
             self._verify_output(run_summary, output_name, output_definition)
 
     def _verify_output(self, run_summary: RunJobsSummary, output_name, test_properties: OutputChecks):
-        is_collection_test = isinstance(test_properties, dict) and "elements" in test_properties
+        is_collection_test = isinstance(test_properties, dict) and (
+            "elements" in test_properties or test_properties.get("class") == "Collection"
+        )
         item_label = f"Output named {output_name}"
 
         def get_filename(name):

diff --git a/test/functional/tools/sample_tool_conf.xml b/test/functional/tools/sample_tool_conf.xml
@@ -303,6 +303,7 @@
   <tool file="${model_tools_path}/filter_failed_collection.xml" />
   <tool file="${model_tools_path}/keep_success_collection.xml" />
   <tool file="${model_tools_path}/filter_empty_collection.xml" />
+  <tool file="${model_tools_path}/filter_null.xml" />
   <tool file="${model_tools_path}/flatten_collection.xml" />
   <tool file="${model_tools_path}/sort_collection_list.xml" />
   <tool file="${model_tools_path}/harmonize_two_collections_list.xml" />