diff --git a/ontology/Dependency_Constraints.owl b/ontology/Dependency_Constraints.owl index 4ef38f4dd..a66405e13 100644 --- a/ontology/Dependency_Constraints.owl +++ b/ontology/Dependency_Constraints.owl @@ -14,15 +14,23 @@ ################################################################# ### http://www.semanticweb.org/praja/ontologies/2021/11/Dependency_Constraints#hasAction -:hasAction rdf:type owl:ObjectProperty ; - rdfs:domain :Dependency . +:hasAction rdf:type owl:ObjectProperty , + owl:IrreflexiveProperty ; + rdfs:domain :Dependency ; + rdfs:range [ rdf:type owl:Restriction ; + owl:onProperty :hasCondition ; + owl:someValuesFrom :Action + ] . ### http://www.semanticweb.org/praja/ontologies/2021/11/Dependency_Constraints#hasCondition -:hasCondition rdf:type owl:ObjectProperty ; - rdfs:domain :Action , - :Condition , - :Dependency . +:hasCondition rdf:type owl:ObjectProperty , + owl:IrreflexiveProperty ; + rdfs:domain :Dependency ; + rdfs:range [ rdf:type owl:Restriction ; + owl:onProperty :hasCondition ; + owl:someValuesFrom :Condition + ] . ### http://www.semanticweb.org/praja/ontologies/2021/11/Dependency_Constraints#hasDependentParameter @@ -55,11 +63,6 @@ rdfs:range xsd:string . -### http://www.semanticweb.org/praja/ontologies/2021/11/Dependency_Constraints#hasType -:hasType rdf:type owl:DatatypeProperty ; - rdfs:comment "Each parameter has a type." . - - ### http://www.w3.org/2002/07/owl#topDataProperty owl:topDataProperty rdfs:subPropertyOf owl:topDataProperty . @@ -83,11 +86,6 @@ owl:topDataProperty rdfs:subPropertyOf owl:topDataProperty . rdfs:comment "This class describes Dependencies."@en . -### http://www.semanticweb.org/praja/ontologies/2021/11/Dependency_Constraints#IsIgnored -:IsIgnored rdf:type owl:Class ; - rdfs:subClassOf :StaticAction . - - ### http://www.semanticweb.org/praja/ontologies/2021/11/Dependency_Constraints#Parameter :Parameter rdf:type owl:Class ; rdfs:comment "This describes Parameters of Scikit-lean APIs." . 
@@ -98,9 +96,19 @@ owl:topDataProperty rdfs:subPropertyOf owl:topDataProperty . rdfs:subClassOf :StaticCondition . -### http://www.semanticweb.org/praja/ontologies/2021/11/Dependency_Constraints#ParameterOnlyUsedWhen -:ParameterOnlyUsedWhen rdf:type owl:Class ; - rdfs:subClassOf :StaticCondition . +### http://www.semanticweb.org/praja/ontologies/2021/11/Dependency_Constraints#ParameterIsIgnored +:ParameterIsIgnored rdf:type owl:Class ; + rdfs:subClassOf :StaticAction . + + +### http://www.semanticweb.org/praja/ontologies/2021/11/Dependency_Constraints#ParameterIsIllegal +:ParameterIsIllegal rdf:type owl:Class ; + rdfs:subClassOf :StaticAction . + + +### http://www.semanticweb.org/praja/ontologies/2021/11/Dependency_Constraints#ParameterIsNone +:ParameterIsNone rdf:type owl:Class ; + rdfs:subClassOf :StaticCondition . ### http://www.semanticweb.org/praja/ontologies/2021/11/Dependency_Constraints#RuntimeAction diff --git a/package_parser/package_parser/cli.py b/package_parser/package_parser/cli.py index 82d43d576..d65a6014f 100644 --- a/package_parser/package_parser/cli.py +++ b/package_parser/package_parser/cli.py @@ -23,15 +23,20 @@ def cli() -> None: if args.command == __API_COMMAND: public_api = get_api(args.package) - get_dependencies(public_api) + public_api_dependencies = get_dependencies(public_api) out_dir: Path = args.out - out_file = out_dir.joinpath( + out_file_api = out_dir.joinpath( f"{public_api.distribution}__{public_api.package}__{public_api.version}__api.json" ) - ensure_file_exists(out_file) - with out_file.open("w") as f: + out_file_api_dependencies = out_dir.joinpath( + f"{public_api.distribution}__{public_api.package}__{public_api.version}__api_dependencies.json" + ) + ensure_file_exists(out_file_api) + with out_file_api.open("w") as f: json.dump(public_api.to_json(), f, indent=2, cls=CustomEncoder) + with out_file_api_dependencies.open("w") as f: + json.dump(public_api_dependencies.to_json(), f, indent=2, cls=CustomEncoder) def __get_args() -> 
argparse.Namespace: diff --git a/package_parser/package_parser/commands/get_api/_model.py b/package_parser/package_parser/commands/get_api/_model.py index 0c52d07dc..ffea1b64e 100644 --- a/package_parser/package_parser/commands/get_api/_model.py +++ b/package_parser/package_parser/commands/get_api/_model.py @@ -447,13 +447,12 @@ def to_json(self) -> Any: @dataclass class Action: + action: str + @classmethod def from_json(cls, json: Any): return cls(json["action"]) - def __init__(self, action: str) -> None: - self.action = action - def to_json(self) -> Dict: return {"action": self.action} @@ -480,13 +479,12 @@ def __init__(self, action: str) -> None: @dataclass class Condition: + condition: str + @classmethod def from_json(cls, json: Any): return cls(json["condition"]) - def __init__(self, condition: str) -> None: - self.condition = condition - def to_json(self) -> Dict: return {"condition": self.condition} @@ -506,12 +504,18 @@ def __init__(self, condition: str) -> None: super().__init__(condition) -class ParameterIsOptional(StaticCondition): +class ParameterIsNone(StaticCondition): def __init__(self, condition: str) -> None: super().__init__(condition) +@dataclass class Dependency: + hasDependentParameter: Parameter + isDependingOn: Parameter + hasCondition: Condition + hasAction: Action + @classmethod def from_json(cls, json: Any): return cls( @@ -521,18 +525,6 @@ def from_json(cls, json: Any): Action.from_json(["hasAction"]), ) - def __init__( - self, - hasDependentParameter: Parameter, - isDependingOn: Parameter, - hasCondition: Condition, - hasAction: Action, - ) -> None: - self.hasDependentParameter = hasDependentParameter - self.isDependingOn = isDependingOn - self.hasCondition = hasCondition - self.hasAction = hasAction - def to_json(self) -> Dict: return { "hasDependentParameter": self.hasDependentParameter.to_json(), @@ -540,3 +532,17 @@ def to_json(self) -> Dict: "hasCondition": self.hasCondition.to_json(), "hasAction": self.hasAction.to_json(), } + + 
+@dataclass +class APIDependencies: + dependencies: Dict + + def to_json(self) -> Dict: + return { + function_name: { + parameter_name: [dependency.to_json() for dependency in dependencies] + for parameter_name, dependencies in parameter_name.items() + } + for function_name, parameter_name in self.dependencies.items() + } diff --git a/package_parser/package_parser/commands/get_dependencies/README.md b/package_parser/package_parser/commands/get_dependencies/README.md new file mode 100644 index 000000000..4269a2efd --- /dev/null +++ b/package_parser/package_parser/commands/get_dependencies/README.md @@ -0,0 +1,19 @@ +# Dependency Extraction + +## How do we imagine a Dependency + +A basic parameter dependency, which we handle in this lab, is contained within a single sentence. In this sentence, we expect the identifying name of another parameter in the same function to appear, specifically in the dependency subtree of the condition. + + +## How do we extract a Dependency + +Relying on spaCy's DependencyMatcher, we write functions to detect the head token of both the action and condition dependency subtrees. We assume that the action is always the root of the sentence, and a subtree inside the action contains the condition text. + +Phrases are used to identify the type of the action/condition and create the appropriate model object. + +Parsing a dependency subtree in an InOrder traversal, we can rebuild a sentence from the spaCy token objects.
+ + +### Dependency Tree Example + +![Alt text](dependency_tree_example.png "Dependency Tree Example") diff --git a/package_parser/package_parser/commands/get_dependencies/_dependency_patterns.py b/package_parser/package_parser/commands/get_dependencies/_dependency_patterns.py index f00e71d7f..5c747d1fa 100644 --- a/package_parser/package_parser/commands/get_dependencies/_dependency_patterns.py +++ b/package_parser/package_parser/commands/get_dependencies/_dependency_patterns.py @@ -13,29 +13,5 @@ "RIGHT_ID": "dependee_param", "RIGHT_ATTRS": {"DEP": {"IN": ["nsubj", "nsubjpass"]}}, }, - ], - "pattern_parameter_": [ - { - "RIGHT_ID": "action", - "RIGHT_ATTRS": {"POS": "VERB"}, # verb is set as an anchor token - }, - { - "LEFT_ID": "action", - "REL_OP": ">", - "RIGHT_ID": "ActionParameterName", # verb is a direct head of subject which is a NOUN i.e. Parameter Name - "RIGHT_ATTRS": {"DEP": {"IN": ["nsubjpass", "nsubj"]}}, - }, - { - "LEFT_ID": "action", - "REL_OP": ">", - "RIGHT_ID": "ConditionalVerbModifier", # Verb is restricted by Verb Modifier - "RIGHT_ATTRS": {"DEP": "advmod"}, - }, - { - "LEFT_ID": "action", - "REL_OP": ">>", - "RIGHT_ID": "ConditionalParameterName", # verb is a head in chain of object i.e. 
Parameter name or value - "RIGHT_ATTRS": {"DEP": {"IN": ["dobj", "pobj"]}}, - }, - ], + ] } diff --git a/package_parser/package_parser/commands/get_dependencies/_get_dependency.py b/package_parser/package_parser/commands/get_dependencies/_get_dependency.py index 2b6e42400..4c2bb4702 100644 --- a/package_parser/package_parser/commands/get_dependencies/_get_dependency.py +++ b/package_parser/package_parser/commands/get_dependencies/_get_dependency.py @@ -8,13 +8,14 @@ from ..get_api._model import ( API, Action, + APIDependencies, Condition, Dependency, Parameter, ParameterHasValue, ParameterIsIgnored, ParameterIsIllegal, - ParameterIsOptional, + ParameterIsNone, ) from ._dependency_patterns import dependency_matcher_patterns from ._preprocess_docstring import preprocess_docstring @@ -23,6 +24,9 @@ def extract_lefts_and_rights(curr_token: Token, extracted: Union[List, None] = None): + """ + Given a spaCy token, extract recursively all tokens in its dependency subtree in inorder traversal. + """ if extracted is None: extracted = [] @@ -40,6 +44,10 @@ def extract_lefts_and_rights(curr_token: Token, extracted: Union[List, None] = N def extract_action(action_token: Token, condition_token: Token) -> Action: + """ + Create action object given head token of action phrase in docstring. + Condition token used to avoid traversing into the condition phrase dependency subtree of the docstring. 
+ """ action_tokens = [] action_lefts = list(action_token.lefts) action_rights = list(action_token.rights) @@ -52,6 +60,9 @@ def extract_action(action_token: Token, condition_token: Token) -> Action: if token != condition_token: action_tokens.extend(extract_lefts_and_rights(token)) + # Remove trailing punctiation + if any(p == action_tokens[-1] for p in [",", "."]): + del action_tokens[-1] action_text = " ".join(action_tokens) ignored_phrases = [ @@ -71,20 +82,30 @@ def extract_action(action_token: Token, condition_token: Token) -> Action: def extract_condition(condition_token: Token) -> Condition: + """ + Create condition object given head token of condition phrase in docstring. + """ condition_token_subtree = list(condition_token.subtree) condition_text = " ".join([token.text for token in condition_token_subtree]) - is_optional_phrases = [ + is_none_phrases = [ "is none", - "is not set", + "is also none" "is not set", "is not specified", "is not none", "if none", "if not none", ] - has_value_phrases = ["equals", "is true", "is false", "is set to"] - if any(phrase in condition_text.lower() for phrase in is_optional_phrases): - return ParameterIsOptional(condition=condition_text) + has_value_phrases = [ + "equals", + "is true", + "is false", + "is set to", + "is greater than", + "is less than", + ] + if any(phrase in condition_text.lower() for phrase in is_none_phrases): + return ParameterIsNone(condition=condition_text) elif any(phrase in condition_text.lower() for phrase in has_value_phrases): return ParameterHasValue(condition=condition_text) else: @@ -92,6 +113,10 @@ def extract_condition(condition_token: Token) -> Condition: class DependencyExtractor: + """ + Functions to extract each type of pattern in _dependency_patterns + """ + @staticmethod def extract_pattern_parameter_subordinating_conjunction( dependent_param: Parameter, @@ -133,7 +158,8 @@ def extract_dependencies_from_docstring( spacy_id_to_pattern_id_mapping: Dict, ) -> List[Dependency]: """ - 
Extract readable dependencies in a Docstring from pattern matches + Extract readable dependencies in a Docstring from pattern matches. + Function fetched from class DependencyExtractor, when 'extract_' + pattern name match function name in the class. """ dependencies = list() for match in matches: @@ -149,7 +175,7 @@ def extract_dependencies_from_docstring( return dependencies -def get_dependencies(api: API) -> Dict: +def get_dependencies(api: API) -> APIDependencies: """ Loop through all functions in the API Parse and preprocess each doc string from every function @@ -187,4 +213,4 @@ def get_dependencies(api: API) -> Dict: if param_dependencies: all_dependencies[function_name][parameter.name] = param_dependencies - return all_dependencies + return APIDependencies(dependencies=all_dependencies) diff --git a/package_parser/package_parser/commands/get_dependencies/dependency_tree_example.png b/package_parser/package_parser/commands/get_dependencies/dependency_tree_example.png new file mode 100644 index 000000000..6ff36abf1 Binary files /dev/null and b/package_parser/package_parser/commands/get_dependencies/dependency_tree_example.png differ diff --git a/package_parser/pyproject.toml b/package_parser/pyproject.toml index 2063c63df..bf5f3687a 100644 --- a/package_parser/pyproject.toml +++ b/package_parser/pyproject.toml @@ -16,6 +16,9 @@ importlib-metadata = "^4.10.0" numpydoc = "^1.1.0" spacy = "^3.2.1" +[tool.poetry.dependencies.en_core_web_sm] +url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl" + [tool.poetry.dev-dependencies] pytest = "^6.2.5" pytest-cov = "^3.0.0" diff --git a/package_parser/tests/commands/get_dependencies/test_get_dependency.py b/package_parser/tests/commands/get_dependencies/test_get_dependency.py new file mode 100644 index 000000000..885a9d550 --- /dev/null +++ b/package_parser/tests/commands/get_dependencies/test_get_dependency.py @@ -0,0 +1,138 @@ +import 
spacy +from package_parser.commands.get_api._model import ( + Action, + Condition, + Dependency, + Parameter, + ParameterAndResultDocstring, + ParameterHasValue, + ParameterIsIgnored, + ParameterIsIllegal, + ParameterIsNone, +) +from package_parser.commands.get_dependencies._get_dependency import ( + DependencyExtractor, + extract_action, + extract_condition, + extract_lefts_and_rights, +) + +nlp = spacy.load("en_core_web_sm") + + +def test_extract_lefts_and_rights(): + # string from https://spacy.io/usage/linguistic-features#navigating + doc = nlp("Autonomous cars shift insurance liability toward manufacturers") + doc_head_token = doc[2] + extracted_lefts_and_rights = extract_lefts_and_rights(doc_head_token) + assert extracted_lefts_and_rights == doc.text.split() + + +def test_extract_action(): + action_is_ignored = nlp( + "this parameter is ignored when fit_intercept is set to False." + ) + action_is_ignored_action_token = action_is_ignored[3] + action_is_ignored_condition_token = action_is_ignored[7] + + ignored_action = extract_action( + action_is_ignored_action_token, action_is_ignored_condition_token + ) + assert ignored_action == ParameterIsIgnored(action="this parameter is ignored") + + action_is_illegal = nlp( + "Individual weights for each sample raises error if sample_weight is passed and base_estimator fit method does not support it. " + ) + action_is_illegal_action_token = action_is_illegal[5] + action_is_illegal_condition_token = action_is_illegal[10] + + illegal_action = extract_action( + action_is_illegal_action_token, action_is_illegal_condition_token + ) + assert illegal_action == ParameterIsIllegal( + action="Individual weights for each sample raises error" + ) + + action_uncategorised = nlp( + "If metric is precomputed, X is assumed to be a kernel matrix." 
+ ) + action_uncategorised_action_token = action_uncategorised[7] + action_uncategorised_condition_token = action_uncategorised[3] + + action = extract_action( + action_uncategorised_action_token, action_uncategorised_condition_token + ) + assert action == Action(action=", X is assumed to be a kernel matrix") + + +def test_extract_condition(): + condition_is_none = nlp( + "If func is None , then func will be the identity function." + ) + condition_is_none_root_token = condition_is_none[2] + + is_none_condition = extract_condition(condition_is_none_root_token) + assert is_none_condition == ParameterIsNone(condition="If func is None") + + condition_has_value = nlp( + "this parameter is ignored when fit_intercept is set to False." + ) + condition_has_value_root_token = condition_has_value[7] + + has_value_condition = extract_condition(condition_has_value_root_token) + assert has_value_condition == ParameterHasValue( + condition="when fit_intercept is set to False" + ) + + condition_uncategorised = nlp( + "If metric is a string, it must be one of the metrics in pairwise." 
+ ) + condition_uncategorised_root_token = condition_uncategorised[2] + + condition = extract_condition(condition_uncategorised_root_token) + assert condition == Condition(condition="If metric is a string") + + +def test_extract_dependencies_from_docstring_pattern_subordinating_conjunction(): + param_docstring_nlp = nlp("ignored when probability is False") + dependent_param = Parameter( + name="random_state", + default_value=None, + is_public=True, + assigned_by="NAME_ONLY", + docstring=ParameterAndResultDocstring( + type_="param possible types", description=param_docstring_nlp.text + ), + ) + dependee_param = Parameter( + name="probability", + default_value=None, + is_public=True, + assigned_by="NAME_ONLY", + docstring=ParameterAndResultDocstring( + type_="param possible types", description="param probability docstring" + ), + ) + pattern_parameter_subordinating_conjunction = nlp( + "ignored when probability is False" + ) + func_params = [dependent_param, dependee_param] + match = (314159265, [0, 3, 2]) + + expected_dependency = Dependency( + hasDependentParameter=dependent_param, + isDependingOn=dependee_param, + hasCondition=ParameterHasValue("when probability is False"), + hasAction=ParameterIsIgnored("ignored"), + ) + + extracted_dependency = ( + DependencyExtractor.extract_pattern_parameter_subordinating_conjunction( + dependent_param=dependent_param, + func_parameters=func_params, + match=match, + param_docstring=param_docstring_nlp, + ) + ) + + assert expected_dependency == extracted_dependency