From a38b9db822b72a7ef4ecbbc040cdffa5cc82bf14 Mon Sep 17 00:00:00 2001 From: Sam-el0 Date: Sun, 21 Apr 2024 19:59:23 +0100 Subject: [PATCH 1/4] fixed recursion loops with imported_schema_uris set that is passed into the recursive function parse_taxonomy --- xbrl/instance.py | 10 +++++++--- xbrl/taxonomy.py | 15 ++++++++++++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/xbrl/instance.py b/xbrl/instance.py index d88d67c..45ff150 100644 --- a/xbrl/instance.py +++ b/xbrl/instance.py @@ -347,17 +347,21 @@ def parse_xbrl(instance_path: str, cache: HttpCache, instance_url: str or None = schema_uri: str = schema_ref.attrib[XLINK_NS + 'href'] # check if the schema uri is relative or absolute # submissions from SEC normally have their own schema files, whereas submissions from the uk have absolute schemas + + # initalise a set that will store cached taxonomy schemas uris to avoid recursive loops + imported_schema_uris = set() + if is_url(schema_uri): # fetch the taxonomy extension schema from remote - taxonomy: TaxonomySchema = parse_taxonomy_url(schema_uri, cache) + taxonomy: TaxonomySchema = parse_taxonomy_url(schema_uri, cache, imported_schema_uris) elif instance_url: # fetch the taxonomy extension schema from remote by reconstructing the url schema_url = resolve_uri(instance_url, schema_uri) - taxonomy: TaxonomySchema = parse_taxonomy_url(schema_url, cache) + taxonomy: TaxonomySchema = parse_taxonomy_url(schema_url, cache, imported_schema_uris) else: # try to find the taxonomy extension schema file locally because no full url can be constructed schema_path = resolve_uri(instance_path, schema_uri) - taxonomy: TaxonomySchema = parse_taxonomy(schema_path, cache) + taxonomy: TaxonomySchema = parse_taxonomy(schema_path, cache, imported_schema_uris) # parse contexts and units context_dir = _parse_context_elements(root.findall('xbrli:context', NAME_SPACES), root.attrib['ns_map'], taxonomy, diff --git a/xbrl/taxonomy.py b/xbrl/taxonomy.py index 
4150139..79dc750 100644 --- a/xbrl/taxonomy.py +++ b/xbrl/taxonomy.py @@ -588,26 +588,28 @@ def parse_common_taxonomy(cache: HttpCache, namespace: str) -> TaxonomySchema or @lru_cache(maxsize=60) -def parse_taxonomy_url(schema_url: str, cache: HttpCache) -> TaxonomySchema: +def parse_taxonomy_url(schema_url: str, cache: HttpCache, imported_schema_uris : set) -> TaxonomySchema: """ Parses a taxonomy schema file from the internet :param schema_url: full link to the taxonomy schema :param cache: :class:`xbrl.cache.HttpCache` instance + :param imported_schema_uris: set of already imported schema uris :return: parsed :class:`xbrl.taxonomy.TaxonomySchema` object """ if not is_url(schema_url): raise XbrlParseException('This function only parses remotely saved taxonomies. ' 'Please use parse_taxonomy to parse local taxonomy schemas') schema_path: str = cache.cache_file(schema_url) - return parse_taxonomy(schema_path, cache, schema_url) + return parse_taxonomy(schema_path, cache, imported_schema_uris, schema_url) -def parse_taxonomy(schema_path: str, cache: HttpCache, schema_url: str or None = None) -> TaxonomySchema: +def parse_taxonomy(schema_path: str, cache: HttpCache, imported_schema_uris : set, schema_url: str or None = None) -> TaxonomySchema: """ Parses a taxonomy schema file. :param schema_path: url to the schema (on the internet) :param cache: :class:`xbrl.cache.HttpCache` instance + :param imported_schema_uris: set of already imported schema uris :param schema_url: if this url is set, the script will try to fetch additionally imported files such as linkbases or imported schemas from the remote location. If this url is None, the script will try to find those resources locally. 
:return: parsed :class:`xbrl.taxonomy.TaxonomySchema` object @@ -633,18 +635,25 @@ def parse_taxonomy(schema_path: str, cache: HttpCache, schema_url: str or None = if import_uri == "": continue + # Skip already imported URIs + if import_uri in imported_schema_uris: + continue + # sometimes the import schema location is relative. i.e schemaLocation="xbrl-linkbase-2003-12-31.xsd" if is_url(import_uri): # fetch the schema file from remote taxonomy.imports.append(parse_taxonomy_url(import_uri, cache)) + imported_schema_uris.add(import_uri) elif schema_url: # fetch the schema file from remote by reconstructing the full url import_url = resolve_uri(schema_url, import_uri) taxonomy.imports.append(parse_taxonomy_url(import_url, cache)) + imported_schema_uris.add(import_uri) else: # We have to try to fetch the linkbase locally because no full url can be constructed import_path = resolve_uri(schema_path, import_uri) taxonomy.imports.append(parse_taxonomy(import_path, cache)) + imported_schema_uris.add(import_uri) role_type_elements: List[ET.Element] = root.findall('xsd:annotation/xsd:appinfo/link:roleType', NAME_SPACES) # parse ELR's From 658e730573433e6ae1c017d493a91202dd2fc261 Mon Sep 17 00:00:00 2001 From: Sam-el0 Date: Sun, 12 May 2024 15:04:20 +0100 Subject: [PATCH 2/4] Simplified with default value for parse via parse_taxonomy_url & fixed the set hashing bug. 
--- tests/test_local_taxonomy.py | 4 ++-- xbrl/instance.py | 20 +++++++++++++------- xbrl/taxonomy.py | 9 ++++----- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/tests/test_local_taxonomy.py b/tests/test_local_taxonomy.py index 64d5b6e..81bd203 100644 --- a/tests/test_local_taxonomy.py +++ b/tests/test_local_taxonomy.py @@ -18,10 +18,10 @@ def test_parse_taxonomy(self): cache_dir: str = './cache/' cache: HttpCache = HttpCache(cache_dir) print(f"Saving to {cache_dir}") - + imported_schema_uris = set() extension_schema_path: str = './tests/data/example.xsd' # extension_schema_path: str = './data/example.xsd' - tax: TaxonomySchema = parse_taxonomy(extension_schema_path, cache) + tax: TaxonomySchema = parse_taxonomy(extension_schema_path, cache, imported_schema_uris = set()) print(tax) srt_tax: TaxonomySchema = tax.get_taxonomy('http://fasb.org/srt/2020-01-31') self.assertTrue(srt_tax) diff --git a/xbrl/instance.py b/xbrl/instance.py index 45ff150..75a5254 100644 --- a/xbrl/instance.py +++ b/xbrl/instance.py @@ -349,18 +349,19 @@ def parse_xbrl(instance_path: str, cache: HttpCache, instance_url: str or None = # submissions from SEC normally have their own schema files, whereas submissions from the uk have absolute schemas # initalise a set that will store cached taxonomy schemas uris to avoid recursive loops - imported_schema_uris = set() + if is_url(schema_uri): # fetch the taxonomy extension schema from remote - taxonomy: TaxonomySchema = parse_taxonomy_url(schema_uri, cache, imported_schema_uris) + taxonomy: TaxonomySchema = parse_taxonomy_url(schema_uri, cache) elif instance_url: # fetch the taxonomy extension schema from remote by reconstructing the url schema_url = resolve_uri(instance_url, schema_uri) - taxonomy: TaxonomySchema = parse_taxonomy_url(schema_url, cache, imported_schema_uris) + taxonomy: TaxonomySchema = parse_taxonomy_url(schema_url, cache) else: # try to find the taxonomy extension schema file locally because no full url can be 
constructed schema_path = resolve_uri(instance_path, schema_uri) + imported_schema_uris = set() taxonomy: TaxonomySchema = parse_taxonomy(schema_path, cache, imported_schema_uris) # parse contexts and units @@ -457,21 +458,26 @@ def parse_ixbrl(instance_path: str, cache: HttpCache, instance_url: str or None schema_uri: str = schema_ref.attrib[XLINK_NS + 'href'] # check if the schema uri is relative or absolute # submissions from SEC normally have their own schema files, whereas submissions from the uk have absolute schemas + + # initalise a set that will store cached taxonomy schemas uris to avoid recursive loops + imported_schema_uris = set() + + if is_url(schema_uri): # fetch the taxonomy extension schema from remote - taxonomy: TaxonomySchema = parse_taxonomy_url(schema_uri, cache) + taxonomy: TaxonomySchema = parse_taxonomy_url(schema_uri, cache, imported_schema_uris) elif schema_root: # take the given schema_root path as directory for searching for the taxonomy schema schema_path = str(next(Path(schema_root).glob(f'**/{schema_uri}'))) - taxonomy: TaxonomySchema = parse_taxonomy(schema_path, cache) + taxonomy: TaxonomySchema = parse_taxonomy(schema_path, cache, imported_schema_uris) elif instance_url: # fetch the taxonomy extension schema from remote by reconstructing the url schema_url = resolve_uri(instance_url, schema_uri) - taxonomy: TaxonomySchema = parse_taxonomy_url(schema_url, cache) + taxonomy: TaxonomySchema = parse_taxonomy_url(schema_url, cache, imported_schema_uris) else: # try to find the taxonomy extension schema file locally because no full url can be constructed schema_path = resolve_uri(instance_path, schema_uri) - taxonomy: TaxonomySchema = parse_taxonomy(schema_path, cache) + taxonomy: TaxonomySchema = parse_taxonomy(schema_path, cache, imported_schema_uris) # get all contexts and units xbrl_resources: ET.Element = root.find('.//ix:resources', ns_map) diff --git a/xbrl/taxonomy.py b/xbrl/taxonomy.py index 79dc750..f9b078c 100644 --- 
a/xbrl/taxonomy.py +++ b/xbrl/taxonomy.py @@ -588,7 +588,7 @@ def parse_common_taxonomy(cache: HttpCache, namespace: str) -> TaxonomySchema or @lru_cache(maxsize=60) -def parse_taxonomy_url(schema_url: str, cache: HttpCache, imported_schema_uris : set) -> TaxonomySchema: +def parse_taxonomy_url(schema_url: str, cache: HttpCache, imported_schema_uris: set = set()) -> TaxonomySchema: """ Parses a taxonomy schema file from the internet @@ -643,17 +643,16 @@ def parse_taxonomy(schema_path: str, cache: HttpCache, imported_schema_uris : se if is_url(import_uri): # fetch the schema file from remote taxonomy.imports.append(parse_taxonomy_url(import_uri, cache)) - imported_schema_uris.add(import_uri) elif schema_url: # fetch the schema file from remote by reconstructing the full url import_url = resolve_uri(schema_url, import_uri) - taxonomy.imports.append(parse_taxonomy_url(import_url, cache)) imported_schema_uris.add(import_uri) + taxonomy.imports.append(parse_taxonomy_url(import_url, cache)) else: # We have to try to fetch the linkbase locally because no full url can be constructed import_path = resolve_uri(schema_path, import_uri) - taxonomy.imports.append(parse_taxonomy(import_path, cache)) - imported_schema_uris.add(import_uri) + taxonomy.imports.append(parse_taxonomy(import_path, cache, imported_schema_uris)) + role_type_elements: List[ET.Element] = root.findall('xsd:annotation/xsd:appinfo/link:roleType', NAME_SPACES) # parse ELR's From 6920a6e8e3b0ea20da4baf91ca684c9ced039875 Mon Sep 17 00:00:00 2001 From: Sam-el0 Date: Sun, 12 May 2024 15:10:35 +0100 Subject: [PATCH 3/4] Fixed comment and spacing --- xbrl/instance.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xbrl/instance.py b/xbrl/instance.py index 75a5254..21b4378 100644 --- a/xbrl/instance.py +++ b/xbrl/instance.py @@ -347,9 +347,6 @@ def parse_xbrl(instance_path: str, cache: HttpCache, instance_url: str or None = schema_uri: str = schema_ref.attrib[XLINK_NS + 'href'] # check if the 
schema uri is relative or absolute # submissions from SEC normally have their own schema files, whereas submissions from the uk have absolute schemas - - # initalise a set that will store cached taxonomy schemas uris to avoid recursive loops - if is_url(schema_uri): # fetch the taxonomy extension schema from remote @@ -361,6 +358,7 @@ def parse_xbrl(instance_path: str, cache: HttpCache, instance_url: str or None = else: # try to find the taxonomy extension schema file locally because no full url can be constructed schema_path = resolve_uri(instance_path, schema_uri) + # initalise a set that will store cached taxonomy schemas uris to avoid recursive loops imported_schema_uris = set() taxonomy: TaxonomySchema = parse_taxonomy(schema_path, cache, imported_schema_uris) From e28b5e25cb6a8474f457ecbd3592c4fc3c059731 Mon Sep 17 00:00:00 2001 From: Sam-el0 Date: Tue, 14 May 2024 22:24:49 +0100 Subject: [PATCH 4/4] Removed incorrect passing of imported_schema_uris to parse_taxonomy_url that causes a set hashing error --- xbrl/instance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xbrl/instance.py b/xbrl/instance.py index 21b4378..81d01df 100644 --- a/xbrl/instance.py +++ b/xbrl/instance.py @@ -463,7 +463,7 @@ def parse_ixbrl(instance_path: str, cache: HttpCache, instance_url: str or None if is_url(schema_uri): # fetch the taxonomy extension schema from remote - taxonomy: TaxonomySchema = parse_taxonomy_url(schema_uri, cache, imported_schema_uris) + taxonomy: TaxonomySchema = parse_taxonomy_url(schema_uri, cache) elif schema_root: # take the given schema_root path as directory for searching for the taxonomy schema schema_path = str(next(Path(schema_root).glob(f'**/{schema_uri}'))) @@ -471,7 +471,7 @@ def parse_ixbrl(instance_path: str, cache: HttpCache, instance_url: str or None elif instance_url: # fetch the taxonomy extension schema from remote by reconstructing the url schema_url = resolve_uri(instance_url, schema_uri) - taxonomy: TaxonomySchema 
= parse_taxonomy_url(schema_url, cache, imported_schema_uris) + taxonomy: TaxonomySchema = parse_taxonomy_url(schema_url, cache) else: # try to find the taxonomy extension schema file locally because no full url can be constructed schema_path = resolve_uri(instance_path, schema_uri)