Skip to content
This repository has been archived by the owner on Jan 19, 2025. It is now read-only.

Commit

Permalink
feat: Action/Condition Classification (#48)
Browse files Browse the repository at this point in the history
* #43 basic condition and action extraction functions

* #43 corrected extractions, now matching occurs per sentence, removes many false positives

* #43 fixed recursive action extraction function

* Added new pattern

* #43 classifying of ignored parameter actions

* #43 conditions and actions are now dataclasses, renamed one condition

* #43 phrases to categorize actions/conditions

* #43 mypy fixes

* style: apply automatic fixes of linters

* #43 renamed IDs in general SCONJ pattern

* #43 more descriptive pattern name

Co-authored-by: Aleksandr Sergeev <[email protected]>
Co-authored-by: prajakta <[email protected]>
Co-authored-by: aserge16 <[email protected]>
  • Loading branch information
4 people authored Jan 17, 2022
1 parent 3efa424 commit 13c789c
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 112 deletions.
2 changes: 2 additions & 0 deletions package_parser/package_parser/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Any

from .commands.get_api import get_api
from .commands.get_dependencies import get_dependencies
from .utils import ensure_file_exists

__API_COMMAND = "api"
Expand All @@ -22,6 +23,7 @@ def cli() -> None:

if args.command == __API_COMMAND:
public_api = get_api(args.package)
get_dependencies(public_api)

out_dir: Path = args.out
out_file = out_dir.joinpath(
Expand Down
6 changes: 4 additions & 2 deletions package_parser/package_parser/commands/get_api/_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import inspect
import re
from dataclasses import asdict
from dataclasses import asdict, dataclass
from enum import Enum, auto
from typing import Any, Dict, Optional, Union

Expand Down Expand Up @@ -445,6 +445,7 @@ def to_json(self) -> Any:
return {"type": self.type, "description": self.description}


@dataclass
class Action:
@classmethod
def from_json(cls, json: Any):
Expand Down Expand Up @@ -477,6 +478,7 @@ def __init__(self, action: str) -> None:
super().__init__(action)


@dataclass
class Condition:
@classmethod
def from_json(cls, json: Any):
Expand Down Expand Up @@ -504,7 +506,7 @@ def __init__(self, condition: str) -> None:
super().__init__(condition)


class ParameterIsSet(StaticCondition):
class ParameterIsOptional(StaticCondition):
def __init__(self, condition: str) -> None:
super().__init__(condition)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,53 +1,41 @@
dependency_matcher_patterns = {
"pattern_parameter_used_condition": [
{"RIGHT_ID": "used", "RIGHT_ATTRS": {"ORTH": {"IN": ["used", "Used"]}}},
"pattern_parameter_subordinating_conjunction": [
{"RIGHT_ID": "action_head", "RIGHT_ATTRS": {"POS": "VERB"}},
{
"LEFT_ID": "used",
"LEFT_ID": "action_head",
"REL_OP": ">",
"RIGHT_ID": "condition",
"RIGHT_ID": "condition_head",
"RIGHT_ATTRS": {"DEP": "advcl"},
},
{
"LEFT_ID": "condition",
"LEFT_ID": "condition_head",
"REL_OP": ">",
"RIGHT_ID": "dependee_param",
"RIGHT_ATTRS": {"DEP": {"IN": ["nsubj", "nsubjpass"]}},
},
],
"pattern_parameter_ignored_condition": [
"pattern_parameter_": [
{
"RIGHT_ID": "ignored",
"RIGHT_ATTRS": {"ORTH": {"IN": ["ignored", "Ignored"]}},
"RIGHT_ID": "action",
"RIGHT_ATTRS": {"POS": "VERB"}, # verb is set as an anchor token
},
{
"LEFT_ID": "ignored",
"LEFT_ID": "action",
"REL_OP": ">",
"RIGHT_ID": "condition",
"RIGHT_ATTRS": {"DEP": "advcl"},
"RIGHT_ID": "ActionParameterName", # verb is a direct head of subject which is a NOUN i.e. Parameter Name
"RIGHT_ATTRS": {"DEP": {"IN": ["nsubjpass", "nsubj"]}},
},
{
"LEFT_ID": "condition",
"LEFT_ID": "action",
"REL_OP": ">",
"RIGHT_ID": "dependee_param",
"RIGHT_ATTRS": {"DEP": {"IN": ["nsubj", "nsubjpass"]}},
"RIGHT_ID": "ConditionalVerbModifier", # Verb is restricted by Verb Modifier
"RIGHT_ATTRS": {"DEP": "advmod"},
},
],
"pattern_parameter_applies_condition": [
{
"RIGHT_ID": "applies",
"RIGHT_ATTRS": {"ORTH": {"IN": ["applies", "Applies"]}},
},
{
"LEFT_ID": "applies",
"REL_OP": ">",
"RIGHT_ID": "condition",
"RIGHT_ATTRS": {"DEP": "advcl"},
},
{
"LEFT_ID": "condition",
"REL_OP": ">",
"RIGHT_ID": "dependee_param",
"RIGHT_ATTRS": {"DEP": {"IN": ["nsubj", "nsubjpass"]}},
"LEFT_ID": "action",
"REL_OP": ">>",
"RIGHT_ID": "ConditionalParameterName", # verb is a head in chain of object i.e. Parameter name or value
"RIGHT_ATTRS": {"DEP": {"IN": ["dobj", "pobj"]}},
},
],
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,84 +2,98 @@

import spacy
from spacy.matcher import DependencyMatcher
from spacy.tokens import Token
from spacy.tokens.doc import Doc

from ..get_api._model import API, Action, Condition, Dependency, Parameter
from ..get_api._model import (
API,
Action,
Condition,
Dependency,
Parameter,
ParameterHasValue,
ParameterIsIgnored,
ParameterIsIllegal,
ParameterIsOptional,
)
from ._dependency_patterns import dependency_matcher_patterns
from ._preprocess_docstring import preprocess_docstring

PIPELINE = "en_core_web_sm"


class DependencyExtractor:
@staticmethod
def extract_pattern_parameter_used_condition(
    dependent_param: Parameter,
    func_parameters: List[Parameter],
    match: Tuple,
    param_docstring: Doc,
) -> Union[Dependency, None]:
    """Build a Dependency from a 'pattern_parameter_used_condition' match.

    ``match`` is a spaCy DependencyMatcher result; ``match[1]`` holds the
    indices (into ``param_docstring``) of the matched pattern slots.

    Returns None when the matched token does not name any parameter of the
    function, i.e. the match is likely a false positive.
    """
    # match[1][2] is the index of the token matched as the dependee
    # parameter name (the "dependee_param" slot of the pattern).
    is_depending_on_param_index = match[1][2]
    is_depending_on_param_name = param_docstring[is_depending_on_param_index].text
    # Resolve the token text to an actual Parameter of this function.
    is_depending_on_param = next(
        filter(
            lambda param: param.name == is_depending_on_param_name, func_parameters
        ),
        None,
    )
    if is_depending_on_param is None:
        # Likely not a correct dependency match
        return None

    # The condition text is the whole dependency subtree of the token
    # matched as the condition head (index match[1][1]).
    condition_verb = param_docstring[match[1][1]]
    condition_verb_subtree = list(condition_verb.subtree)
    condition_text = " ".join([token.text for token in condition_verb_subtree])
    condition = Condition(condition=condition_text)

    # This pattern is anchored on the literal word "used", so the action
    # is a fixed string rather than extracted text.
    action = Action(action="used")

    return Dependency(
        hasDependentParameter=dependent_param,
        isDependingOn=is_depending_on_param,
        hasCondition=condition,
        hasAction=action,
    )
def extract_lefts_and_rights(curr_token: Token, extracted: Union[List, None] = None):
    """Flatten the dependency subtree of ``curr_token`` into token texts.

    Performs an in-order traversal (left children, the token itself, right
    children) so the returned texts appear in sentence order. ``extracted``
    is the shared accumulator used by the recursive calls; callers normally
    omit it.
    """
    if extracted is None:
        extracted = []

    for left_child in curr_token.lefts:
        extract_lefts_and_rights(left_child, extracted)

    extracted.append(curr_token.text)

    for right_child in curr_token.rights:
        extract_lefts_and_rights(right_child, extracted)

    return extracted


def extract_action(action_token: Token, condition_token: Token) -> Action:
    """Extract and classify the action phrase governed by ``action_token``.

    Gathers the texts of ``action_token`` and all of its syntactic children
    in sentence order, skipping the branch that belongs to the condition so
    the condition text does not leak into the action text. The phrase is
    then classified by keyword matching:

    * ``ParameterIsIgnored`` for "ignored"/"not used"-style phrases,
    * ``ParameterIsIllegal`` for "raise"/"must be"-style phrases,
    * a plain ``Action`` otherwise.
    """
    action_tokens = []
    action_lefts = list(action_token.lefts)
    action_rights = list(action_token.rights)

    for token in action_lefts:
        if token != condition_token:
            action_tokens.extend(extract_lefts_and_rights(token))
    action_tokens.append(action_token.text)
    for token in action_rights:
        if token != condition_token:
            action_tokens.extend(extract_lefts_and_rights(token))

    action_text = " ".join(action_tokens)
    # Lowercase once up front; previously .lower() was recomputed for every
    # phrase tested inside both any() generators.
    action_text_lower = action_text.lower()

    ignored_phrases = [
        "ignored",
        "not used",
        "no impact",
        "only supported",
        "only applies",
    ]
    illegal_phrases = ["raise", "exception", "must be", "must not be"]
    if any(phrase in action_text_lower for phrase in ignored_phrases):
        return ParameterIsIgnored(action=action_text)
    if any(phrase in action_text_lower for phrase in illegal_phrases):
        return ParameterIsIllegal(action=action_text)
    return Action(action=action_text)


def extract_condition(condition_token: Token) -> Condition:
    """Extract and classify the condition phrase rooted at ``condition_token``.

    The condition text is the full dependency subtree of ``condition_token``
    joined in sentence order, classified by keyword matching:

    * ``ParameterIsOptional`` for "is none"/"is not set"-style phrases,
    * ``ParameterHasValue`` for "equals"/"is true"-style phrases,
    * a plain ``Condition`` otherwise.
    """
    condition_text = " ".join(token.text for token in condition_token.subtree)
    # Lowercase once up front; previously .lower() was recomputed for every
    # phrase tested inside both any() generators.
    condition_text_lower = condition_text.lower()

    is_optional_phrases = [
        "is none",
        "is not set",
        "is not specified",
        "is not none",
        "if none",
        "if not none",
    ]
    has_value_phrases = ["equals", "is true", "is false", "is set to"]
    if any(phrase in condition_text_lower for phrase in is_optional_phrases):
        return ParameterIsOptional(condition=condition_text)
    if any(phrase in condition_text_lower for phrase in has_value_phrases):
        return ParameterHasValue(condition=condition_text)
    return Condition(condition=condition_text)

@staticmethod
def extract_pattern_parameter_ignored_condition(
    dependent_param: Parameter,
    func_parameters: List[Parameter],
    match: Tuple,
    param_docstring: Doc,
) -> Union[Dependency, None]:
    """Build a Dependency from a 'pattern_parameter_ignored_condition' match.

    ``match`` is a spaCy DependencyMatcher result; ``match[1]`` holds the
    indices (into ``param_docstring``) of the matched pattern slots.

    Returns None when the matched token does not name any parameter of the
    function, i.e. the match is likely a false positive.
    """
    # match[1][2] is the index of the token matched as the dependee
    # parameter name (the "dependee_param" slot of the pattern).
    is_depending_on_param_index = match[1][2]
    is_depending_on_param_name = param_docstring[is_depending_on_param_index].text
    # Resolve the token text to an actual Parameter of this function.
    is_depending_on_param = next(
        filter(
            lambda param: param.name == is_depending_on_param_name, func_parameters
        ),
        None,
    )
    if is_depending_on_param is None:
        # Likely not a correct dependency match
        return None

    # The condition text is the whole dependency subtree of the token
    # matched as the condition head (index match[1][1]).
    condition_verb = param_docstring[match[1][1]]
    condition_verb_subtree = list(condition_verb.subtree)
    condition_text = " ".join([token.text for token in condition_verb_subtree])
    condition = Condition(condition=condition_text)

    # This pattern is anchored on the literal word "ignored", so the action
    # is a fixed string rather than extracted text.
    action = Action(action="ignored")

    return Dependency(
        hasDependentParameter=dependent_param,
        isDependingOn=is_depending_on_param,
        hasCondition=condition,
        hasAction=action,
    )

class DependencyExtractor:
@staticmethod
def extract_pattern_parameter_applies_condition(
def extract_pattern_parameter_subordinating_conjunction(
dependent_param: Parameter,
func_parameters: List[Parameter],
match: Tuple,
Expand All @@ -97,12 +111,11 @@ def extract_pattern_parameter_applies_condition(
# Likely not a correct dependency match
return None

condition_verb = param_docstring[match[1][1]]
condition_verb_subtree = list(condition_verb.subtree)
condition_text = " ".join([token.text for token in condition_verb_subtree])
condition = Condition(condition=condition_text)
condition_token = param_docstring[match[1][1]]
condition = extract_condition(condition_token)

action = Action(action="applies")
action_token = param_docstring[match[1][0]]
action = extract_action(action_token, condition_token)

return Dependency(
hasDependentParameter=dependent_param,
Expand Down Expand Up @@ -159,14 +172,18 @@ def get_dependencies(api: API) -> Dict:
docstring = parameter.docstring.description
docstring_preprocessed = preprocess_docstring(docstring)
doc = nlp(docstring_preprocessed)
dependency_matches = matcher(doc)
param_dependencies = extract_dependencies_from_docstring(
parameter,
parameters,
doc,
dependency_matches,
spacy_id_to_pattern_id_mapping,
)
param_dependencies = []
for sentence in doc.sents:
sentence_dependency_matches = matcher(sentence)
sentence_dependencies = extract_dependencies_from_docstring(
parameter,
parameters,
sentence,
sentence_dependency_matches,
spacy_id_to_pattern_id_mapping,
)
if sentence_dependencies:
param_dependencies.extend(sentence_dependencies)
if param_dependencies:
all_dependencies[function_name][parameter.name] = param_dependencies

Expand Down

0 comments on commit 13c789c

Please sign in to comment.