diff --git a/setup.py b/setup.py index cafd5c7..5e1f6e8 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ def read(file_path): setup( name = 'shexer', packages = find_packages(exclude=["*.local_code.*"]), # this must be the same as the name above - version = '2.5.5', + version = '2.5.6', description = 'Automatic schema extraction for RDF graphs', author = 'Daniel Fernandez-Alvarez', author_email = 'danifdezalvarez@gmail.com', url = 'https://github.com/DaniFdezAlvarez/shexer', - download_url = 'https://github.com/DaniFdezAlvarez/shexer/archive/2.5.5.tar.gz', + download_url = 'https://github.com/DaniFdezAlvarez/shexer/archive/2.5.6.tar.gz', keywords = ['testing', 'shexer', 'shexerp3', "rdf", "shex", "shacl", "schema"], long_description = read('README.md'), long_description_content_type='text/markdown', diff --git a/shexer/core/instances/abstract_instance_tracker.py b/shexer/core/instances/abstract_instance_tracker.py index 34a7eb2..8146815 100644 --- a/shexer/core/instances/abstract_instance_tracker.py +++ b/shexer/core/instances/abstract_instance_tracker.py @@ -1,6 +1,12 @@ +from shexer.model.property import Property +from shexer.utils.uri import remove_corners +from shexer.utils.factories.h_tree import get_basic_h_tree _TRACKERS_DISAM_COUNT = 0 +_RDF_TYPE = Property(content="http://www.w3.org/1999/02/22-rdf-syntax-ns#type") +_RDFS_SUBCLASS_OF = Property(content="http://www.w3.org/2000/01/rdf-schema#subClassOf") + class AbstractInstanceTracker(object): def track_instances(self, verbose=False): @@ -10,7 +16,7 @@ def track_instances(self, verbose=False): @property def disambiguator_prefix(self): """ - It return an str that may help for disambiguation purposes if the instance_tracker is used to produce dicts + It returns a str that may help for disambiguation purposes if the instance_tracker is used to produce dicts that may be integrated with other instance dicts and there should be any key colission. :return: """ @@ -19,4 +25,25 @@ def disambiguator_prefix(self): return self._specific_disambiguator_prefix() + str(_TRACKERS_DISAM_COUNT ) def _specific_disambiguator_prefix(self): - raise NotImplementedError() \ No newline at end of file + raise NotImplementedError() + + @staticmethod + def _build_instances_dict(): + return {} # Empty in every case. Instances, on the fly, will be the keys + + @staticmethod + def _decide_instantiation_property(instantiation_property): + if instantiation_property == None: + return _RDF_TYPE + if type(instantiation_property) == type(_RDF_TYPE): + return instantiation_property + if type(instantiation_property) == str: + return Property(remove_corners(a_uri=instantiation_property, + raise_error_if_no_corners=False)) + raise ValueError("Unrecognized param type to define instantiation property") + + def _reset_count(self): + self._relevant_triples = 0 + self._not_relevant_triples = 0 + self._htree = get_basic_h_tree() + diff --git a/shexer/core/instances/endpoint_instance_tracker.py b/shexer/core/instances/endpoint_instance_tracker.py new file mode 100644 index 0000000..5131880 --- /dev/null +++ b/shexer/core/instances/endpoint_instance_tracker.py @@ -0,0 +1,33 @@ +from shexer.core.instances.abstract_instance_tracker import _RDF_TYPE, _RDFS_SUBCLASS_OF +from shexer.core.instances.instance_tracker import InstanceTracker +from shexer.consts import SHAPES_DEFAULT_NAMESPACE +from shexer.model.bnode import BNode +from shexer.core.instances.pconsts import _S + + +class EndpointInstanceTracker(InstanceTracker): + + def __init__(self, target_classes, triples_yielder, instantiation_property=_RDF_TYPE, all_classes_mode=False, + subclass_property=_RDFS_SUBCLASS_OF, track_hierarchies=True, shape_qualifiers_mode=False, + namespaces_for_qualifier_props=None, shapes_namespace=SHAPES_DEFAULT_NAMESPACE, instances_cap=-1): + super().__init__(target_classes=target_classes, + triples_yielder=triples_yielder, + instantiation_property=instantiation_property, + all_classes_mode=all_classes_mode, + subclass_property=subclass_property, + track_hierarchies=track_hierarchies, + shape_qualifiers_mode=shape_qualifiers_mode, + namespaces_for_qualifier_props=namespaces_for_qualifier_props, + shapes_namespace=shapes_namespace, + instances_cap=instances_cap) + + def _yield_relevant_triples(self): + for a_triple in self._triples_yielder.yield_triples(): + if self._annotator.is_relevant_triple(a_triple) and self._subject_is_not_bnode(a_triple): + self._relevant_triples += 1 + yield a_triple + else: + self._not_relevant_triples += 1 + + def _subject_is_not_bnode(self, a_triple): + return not isinstance(a_triple[_S], BNode) diff --git a/shexer/core/instances/instance_tracker.py b/shexer/core/instances/instance_tracker.py index f2510e4..191f0af 100644 --- a/shexer/core/instances/instance_tracker.py +++ b/shexer/core/instances/instance_tracker.py @@ -1,15 +1,11 @@ -from shexer.model.property import Property -from shexer.utils.uri import remove_corners -from shexer.utils.factories.h_tree import get_basic_h_tree -from shexer.core.instances.annotators.annotator_func import get_proper_annotator -from shexer.core.instances.abstract_instance_tracker import AbstractInstanceTracker +from shexer.core.instances.abstract_instance_tracker import AbstractInstanceTracker, _RDF_TYPE, _RDFS_SUBCLASS_OF from shexer.consts import SHAPES_DEFAULT_NAMESPACE from shexer.utils.log import log_msg from shexer.core.instances.annotators.strategy_mode.instances_cap_exception import InstancesCapException +from shexer.utils.factories.h_tree import get_basic_h_tree +from shexer.core.instances.annotators.annotator_func import get_proper_annotator -_RDF_TYPE = Property(content="http://www.w3.org/1999/02/22-rdf-syntax-ns#type") -_RDFS_SUBCLASS_OF = Property(content="http://www.w3.org/2000/01/rdf-schema#subClassOf") class InstanceTracker(AbstractInstanceTracker): @@ -75,10 +71,7 @@ def _yield_relevant_triples(self): else: self._not_relevant_triples += 1 - def _reset_count(self): - self._relevant_triples = 0 - self._not_relevant_triples = 0 - self._htree = get_basic_h_tree() + def is_an_instantiation_prop(self, a_property): return a_property == self._instantiation_property @@ -86,20 +79,6 @@ def is_an_instantiation_prop(self, a_property): def is_a_subclass_property(self, a_property): return a_property == self._subclass_property - @staticmethod - def _build_instances_dict(): - return {} # Empty in every case. Instances, on the fly, will be the keys - - @staticmethod - def _decide_instantiation_property(instantiation_property): - if instantiation_property == None: - return _RDF_TYPE - if type(instantiation_property) == type(_RDF_TYPE): - return instantiation_property - if type(instantiation_property) == str: - return Property(remove_corners(a_uri=instantiation_property, - raise_error_if_no_corners=False)) - raise ValueError("Unrecognized param type to define instantiation property") diff --git a/shexer/io/graph/yielder/nt_triples_yielder.py b/shexer/io/graph/yielder/nt_triples_yielder.py index 3782e6c..26fd38d 100644 --- a/shexer/io/graph/yielder/nt_triples_yielder.py +++ b/shexer/io/graph/yielder/nt_triples_yielder.py @@ -29,8 +29,7 @@ def yield_triples(self): tokens = self._look_for_tokens(a_line.strip()) if len(tokens) != 3: self._error_triples += 1 - log_msg(msg="This line was discarded: " + a_line, - source=self._source_file) + log_msg(verbose=False, msg="This line was discarded: " + a_line) else: yield (tune_token(a_token=tokens[0]), tune_prop(a_token=tokens[1]), diff --git a/shexer/utils/translators/list_of_classes_to_shape_map.py b/shexer/utils/translators/list_of_classes_to_shape_map.py index d076c27..60cec18 100644 --- a/shexer/utils/translators/list_of_classes_to_shape_map.py +++ b/shexer/utils/translators/list_of_classes_to_shape_map.py @@ -37,7 +37,7 @@ def _get_shape_label_for_class_uri(self, class_uri): return class_uri def _get_raw_selector_to_catch_instances_of_class_uri(self, class_uri, instantiation_property, limit_remote_instances): - return 'SPARQL "select ?s where {{ ?s <{prop}> <{class_uri}> }} {limit}"'.format( + return 'SPARQL "select ?s where {{ ?s <{prop}> <{class_uri}> . FILTER (!isBlank(?s)) }} {limit}"'.format( # FILTER (!isBlank(?c)) class_uri=class_uri, prop=instantiation_property, limit="" if limit_remote_instances < 0 else "LIMIT " + str(limit_remote_instances) diff --git a/test/test_disable_endpoint_cache.py b/test/test_disable_endpoint_cache.py index 7bc5dcd..7c4aab7 100644 --- a/test/test_disable_endpoint_cache.py +++ b/test/test_disable_endpoint_cache.py @@ -55,5 +55,6 @@ def test_all_classes_mode(self): limit_remote_instances=5, disable_endpoint_cache=True) str_result = shaper.shex_graph(string_output=True) + print(str_result) self.assertTrue(number_of_shapes(str_result) > 2) pass # diff --git a/test/test_url_endpoint.py b/test/test_url_endpoint.py index e171252..3bd6815 100644 --- a/test/test_url_endpoint.py +++ b/test/test_url_endpoint.py @@ -48,5 +48,6 @@ def test_all_classes_mode(self): track_classes_for_entities_at_last_depth_level=False, limit_remote_instances=5) str_result = shaper.shex_graph(string_output=True) + print(str_result) self.assertTrue(number_of_shapes(str_result) > 2) pass #