From af5a664de2504dd4fd4a88300659ea019261bf9b Mon Sep 17 00:00:00 2001 From: Greg Kempe Date: Fri, 24 Jun 2022 12:59:46 +0200 Subject: [PATCH 1/2] Correct support for parsing components and portions --- README.rst | 7 ++ VERSION | 2 +- cobalt/uri.py | 63 +++++------- tests/test_uri.py | 255 +++++++++++++++++++--------------------------- 4 files changed, 140 insertions(+), 187 deletions(-) diff --git a/README.rst b/README.rst index 83e0449..6f8d543 100644 --- a/README.rst +++ b/README.rst @@ -83,6 +83,13 @@ Cobalt is Copyright 2015-2020 AfricanLII. Change Log ---------- +6.0.0 +----- + +- Add support for portions, such as ``~chp_2`` +- Remove non-standard support for expression component and subcomponent +- Remove non-standard legacy support for work components without ``!`` + 5.0.0 ----- diff --git a/VERSION b/VERSION index 0062ac9..09b254e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -5.0.0 +6.0.0 diff --git a/cobalt/uri.py b/cobalt/uri.py index 9d6dfc1..7e2b83b 100644 --- a/cobalt/uri.py +++ b/cobalt/uri.py @@ -8,21 +8,16 @@ (/(?P[^0-9][^/]*))? # actor (optional), cannot start with a number /(?P[0-9]{4}(-[0-9]{2}(-[0-9]{2})?)?) # date /(?P[^/]+) # number - (/ - ( # either a work component or expression details - ( # optional expression details + (/ # optional expression language and date (?P[a-z]{3}) # language (eg. eng) (?P[@:][^/]*)? # expression date (eg. @ or @2012-12-22 or :2012-12-22) - (/!? # optional expression component - # the ! is optional for backwards compatibility but won't be optional - # in a future version - (?P[^/]+?)? # expression component (eg. !main or !schedule1) - (/(?P[^.]+))? # expression subcomponent (eg. chapter/1 or section/20) - )? # - (\.(?P[a-z0-9]+))? # format (eg. .xml, .akn, .html, .pdf) - )| # - !?(?P.+) # work component - ))?$""", re.X) + )? + (/ + (!(?P[^~.]+?))? # optional component (eg. !main or !schedule1) + (~(?P[^.]+))? # optional portion + )? + (\.(?P[a-z0-9]+))? # optional format (eg. .xml, .akn, .html, .pdf) + $""", re.X) class FrbrUri(object): @@ -41,7 +36,7 @@ class FrbrUri(object): Example:: - >>> uri = FrbrUri.parse('/akn/za-jhb/act/by-law/2003/public-health/eng:2015-01-01/!main/part/A.xml') + >>> uri = FrbrUri.parse('/akn/za-jhb/act/by-law/2003/public-health/eng:2015-01-01/!main~part_1.xml') >>> uri.prefix 'akn' >>> uri.country @@ -60,18 +55,18 @@ class FrbrUri(object): 'eng' >>> uri.expression_date ':2015-01-01' - >>> uri.expression_component + >>> uri.work_component 'main' - >>> uri.expression_subcomponent - 'part/A' + >>> uri.portion + 'part_1' >>> uri.format 'xml' >>> uri.work_uri() '/za-jhb/act/by-law/2003/public-health' >>> uri.expression_uri() - '/za-jhb/act/by-law/2003/public-health/eng:2015-01-01/main/part/A' + '/za-jhb/act/by-law/2003/public-health/eng:2015-01-01/!main~part_1' >>> uri.manifestation_uri() - '/za-jhb/act/by-law/2003/public-health/eng:2015-01-01/main/part/A.xml' + '/za-jhb/act/by-law/2003/public-health/eng:2015-01-01/!main~part_1.xml' :ivar prefix: optional `akn` prefix :ivar country: two letter country code @@ -84,16 +79,13 @@ class FrbrUri(object): :ivar work_component: name of the work component, may be None :ivar language: three-letter expression language code, may be None :ivar expression_date: expression date (str), [@:]YYYY[-MM[-DD]], may be None - :ivar expression_component: name of the expression component, may be None - :ivar expression_subcomponent: name of the expression subcomponent, may be None :ivar format: format extension, may be None """ default_language = 'eng' - def __init__(self, country, locality, doctype, subtype, actor, date, number, - work_component=None, language=None, expression_date=None, expression_component=None, - expression_subcomponent=None, format=None, prefix="akn"): + def __init__(self, country, locality, doctype, subtype, actor, date, number, work_component=None, language=None, + expression_date=None, format=None, portion=None, prefix="akn"): self.prefix = prefix self.country = country self.locality = locality @@ -103,11 +95,10 @@ def __init__(self, country, locality, doctype, subtype, actor, date, number, self.date = date self.number = number self.work_component = work_component + self.portion = portion self.language = language or self.default_language self.expression_date = expression_date - self.expression_component = expression_component - self.expression_subcomponent = expression_subcomponent self.format = format def clone(self): @@ -125,8 +116,7 @@ def clone(self): work_component=self.work_component, language=self.language, expression_date=self.expression_date, - expression_component=self.expression_component, - expression_subcomponent=self.expression_subcomponent, + portion=self.portion, format=self.format, ) @@ -168,16 +158,17 @@ def expression_uri(self, work_component=True): if self.expression_date is not None: uri = uri + self.expression_date - # expression component is preferred over a work component - if self.expression_component: - uri = uri + "/!" + self.expression_component - if self.expression_subcomponent: - uri = uri + "/" + self.expression_subcomponent - # if we have a work component, use it - elif work_component and self.work_component: + slashed = False + if work_component and self.work_component: + slashed = True uri = uri + "/!" + self.work_component + if self.portion: + if not slashed: + uri = uri + "/" + uri = uri + "~" + self.portion + return uri def manifestation_uri(self, work_component=True): @@ -190,7 +181,7 @@ def manifestation_uri(self, work_component=True): def __str__(self): if self.format: return self.manifestation_uri() - if self.expression_date or self.expression_component: + if self.expression_date or self.work_component: return self.expression_uri() return self.work_uri() diff --git a/tests/test_uri.py b/tests/test_uri.py index e47eb4d..b89a4bb 100644 --- a/tests/test_uri.py +++ b/tests/test_uri.py @@ -10,7 +10,7 @@ def test_bad_value(self): assert_raises(ValueError, FrbrUri.parse, "/ukpga/2015/1") def test_simple(self): - uri = FrbrUri.parse("/za/act/1980/01") + uri = FrbrUri.parse("/akn/za/act/1980/01") assert_equal(uri.country, "za") assert_equal(uri.locality, None) assert_equal(uri.doctype, "act") @@ -20,12 +20,12 @@ def test_simple(self): assert_equal(uri.number, "01") assert_equal(uri.language, "eng") assert_equal(uri.expression_date, None) - assert_is_none(uri.prefix) + assert_equal(uri.prefix, "akn") - assert_equal("/za/act/1980/01", uri.work_uri()) + assert_equal("/akn/za/act/1980/01", uri.work_uri()) def test_with_subtype(self): - uri = FrbrUri.parse("/za/act/by-law/1980/01") + uri = FrbrUri.parse("/akn/za/act/by-law/1980/01") assert_equal(uri.country, "za") assert_equal(uri.locality, None) assert_equal(uri.doctype, "act") @@ -36,10 +36,10 @@ def test_with_subtype(self): assert_equal(uri.language, "eng") assert_equal(uri.expression_date, None) - assert_equal("/za/act/by-law/1980/01", uri.work_uri()) + assert_equal("/akn/za/act/by-law/1980/01", uri.work_uri()) def test_with_locality(self): - uri = FrbrUri.parse("/za-cpt/act/by-law/1980/01") + uri = FrbrUri.parse("/akn/za-cpt/act/by-law/1980/01") assert_equal(uri.country, "za") assert_equal(uri.locality, "cpt") assert_equal(uri.doctype, "act") @@ -50,10 +50,10 @@ def test_with_locality(self): assert_equal(uri.language, "eng") assert_equal(uri.expression_date, None) - assert_equal("/za-cpt/act/by-law/1980/01", uri.work_uri()) + assert_equal("/akn/za-cpt/act/by-law/1980/01", uri.work_uri()) def test_with_subtype_and_actor(self): - uri = FrbrUri.parse("/za/act/by-law/actor/1980/01") + uri = FrbrUri.parse("/akn/za/act/by-law/actor/1980/01") assert_equal(uri.country, "za") assert_equal(uri.doctype, "act") assert_equal(uri.subtype, "by-law") @@ -63,10 +63,10 @@ def test_with_subtype_and_actor(self): assert_equal(uri.language, "eng") assert_equal(uri.expression_date, None) - assert_equal("/za/act/by-law/actor/1980/01", uri.work_uri()) + assert_equal("/akn/za/act/by-law/actor/1980/01", uri.work_uri()) def test_with_long_date(self): - uri = FrbrUri.parse("/za/act/1980-02-01/01") + uri = FrbrUri.parse("/akn/za/act/1980-02-01/01") assert_equal(uri.country, "za") assert_equal(uri.doctype, "act") assert_equal(uri.subtype, None) @@ -77,10 +77,10 @@ def test_with_long_date(self): assert_equal(uri.language, "eng") assert_equal(uri.expression_date, None) - assert_equal("/za/act/1980-02-01/01", uri.work_uri()) + assert_equal("/akn/za/act/1980-02-01/01", uri.work_uri()) def test_with_non_numeric_number(self): - uri = FrbrUri.parse("/za/act/1980/nn") + uri = FrbrUri.parse("/akn/za/act/1980/nn") assert_equal(uri.country, "za") assert_equal(uri.doctype, "act") assert_equal(uri.subtype, None) @@ -90,10 +90,10 @@ def test_with_non_numeric_number(self): assert_equal(uri.language, "eng") assert_equal(uri.expression_date, None) - assert_equal("/za/act/1980/nn", uri.work_uri()) + assert_equal("/akn/za/act/1980/nn", uri.work_uri()) def test_with_work_component(self): - uri = FrbrUri.parse("/za/act/1980/2/!schedule1") + uri = FrbrUri.parse("/akn/za/act/1980/2/!schedule1") assert_equal(uri.country, "za") assert_equal(uri.doctype, "act") assert_equal(uri.subtype, None) @@ -103,18 +103,17 @@ def test_with_work_component(self): assert_equal(uri.language, "eng") assert_equal(uri.work_component, "schedule1") assert_equal(uri.expression_date, None) - assert_equal(uri.expression_component, None) - assert_equal("/za/act/1980/2", uri.uri()) - assert_equal("/za/act/1980/2/!schedule1", uri.work_uri()) - assert_equal("/za/act/1980/2/eng/!schedule1", uri.expression_uri()) + assert_equal("/akn/za/act/1980/2", uri.uri()) + assert_equal("/akn/za/act/1980/2/!schedule1", uri.work_uri()) + assert_equal("/akn/za/act/1980/2/eng/!schedule1", uri.expression_uri()) def test_with_nested_work_components(self): - uri = FrbrUri.parse("/za/act/1980/2/!schedule1/schedule2/schedule3") + uri = FrbrUri.parse("/akn/za/act/1980/2/!schedule1/schedule2/schedule3") assert_equal(uri.work_component, "schedule1/schedule2/schedule3") def test_with_work_component_legacy(self): - uri = FrbrUri.parse("/za/act/1980/2/schedule1") + uri = FrbrUri.parse("/akn/za/act/1980/2/!schedule1") assert_equal(uri.country, "za") assert_equal(uri.doctype, "act") assert_equal(uri.subtype, None) @@ -124,14 +123,13 @@ def test_with_work_component_legacy(self): assert_equal(uri.language, "eng") assert_equal(uri.work_component, "schedule1") assert_equal(uri.expression_date, None) - assert_equal(uri.expression_component, None) - assert_equal("/za/act/1980/2", uri.uri()) - assert_equal("/za/act/1980/2/!schedule1", uri.work_uri()) - assert_equal("/za/act/1980/2/eng/!schedule1", uri.expression_uri()) + assert_equal("/akn/za/act/1980/2", uri.uri()) + assert_equal("/akn/za/act/1980/2/!schedule1", uri.work_uri()) + assert_equal("/akn/za/act/1980/2/eng/!schedule1", uri.expression_uri()) def test_with_short_work_component(self): - uri = FrbrUri.parse("/za-wc/act/pn/2018/46/6") + uri = FrbrUri.parse("/akn/za-wc/act/pn/2018/46/!6") assert_equal(uri.country, "za") assert_equal(uri.locality, "wc") assert_equal(uri.doctype, "act") @@ -142,9 +140,8 @@ def test_with_short_work_component(self): assert_equal(uri.language, "eng") assert_equal(uri.work_component, "6") assert_equal(uri.expression_date, None) - assert_equal(uri.expression_component, None) - uri = FrbrUri.parse("/za-wc/act/2018/46/6") + uri = FrbrUri.parse("/akn/za-wc/act/2018/46/!6") assert_equal(uri.country, "za") assert_equal(uri.locality, "wc") assert_equal(uri.doctype, "act") @@ -155,17 +152,15 @@ def test_with_short_work_component(self): assert_equal(uri.language, "eng") assert_equal(uri.work_component, "6") assert_equal(uri.expression_date, None) - assert_equal(uri.expression_component, None) - def test_with_work_and_expression_component(self): - uri = FrbrUri.parse("/za/act/1980/2") - uri.work_component = "main" - uri.expression_component = "schedule1" - uri.expression_subcomponent = "chapter/2" + def test_with_work_component_and_portion(self): + uri = FrbrUri.parse("/akn/za/act/1980/2") + uri.work_component = "main/schedule_1" + uri.portion = "chp_2" - assert_equal("/za/act/1980/2", uri.uri()) - assert_equal("/za/act/1980/2/!main", uri.work_uri()) - assert_equal("/za/act/1980/2/eng/!schedule1/chapter/2", uri.expression_uri()) + assert_equal("/akn/za/act/1980/2", uri.uri()) + assert_equal("/akn/za/act/1980/2/!main/schedule_1", uri.work_uri()) + assert_equal("/akn/za/act/1980/2/eng/!main/schedule_1~chp_2", uri.expression_uri()) def test_parse_expression2(self): uri = FrbrUri.parse("/gh/act/2020/1013/eng@2020-04-03") @@ -184,156 +179,82 @@ def test_expression_string_no_language(self): assert_equal(str(err), "Expression URI requires a language.") def test_parse_expression(self): - uri = FrbrUri.parse("/za/act/1980/02/afr@") + uri = FrbrUri.parse("/akn/za/act/1980/02/afr@") assert_equal(uri.language, "afr") assert_equal(uri.expression_date, '@') - assert_equal("/za/act/1980/02", uri.work_uri()) - assert_equal("/za/act/1980/02/afr@", uri.expression_uri()) + assert_equal("/akn/za/act/1980/02", uri.work_uri()) + assert_equal("/akn/za/act/1980/02/afr@", uri.expression_uri()) - uri = FrbrUri.parse("/za/act/1980/02/afr@2014-01-01") + uri = FrbrUri.parse("/akn/za/act/1980/02/afr@2014-01-01") assert_equal(uri.language, "afr") assert_equal(uri.expression_date, "@2014-01-01") - assert_equal("/za/act/1980/02", uri.work_uri()) - assert_equal("/za/act/1980/02/afr@2014-01-01", uri.expression_uri()) + assert_equal("/akn/za/act/1980/02", uri.work_uri()) + assert_equal("/akn/za/act/1980/02/afr@2014-01-01", uri.expression_uri()) - uri = FrbrUri.parse("/za/act/1980/02/afr.html") + uri = FrbrUri.parse("/akn/za/act/1980/02/afr.html") assert_equal(uri.language, "afr") assert_equal(uri.format, 'html') def test_parse_expression_component_legacy(self): - uri = FrbrUri.parse("/za/act/1980/02/eng/main") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, None) - assert_equal(uri.expression_component, "main") - - uri = FrbrUri.parse("/za/act/1980/02/eng/main/chapter/2") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, None) - assert_equal(uri.expression_component, "main") - assert_equal(uri.expression_subcomponent, "chapter/2") - - uri = FrbrUri.parse("/za/act/1980/02/eng@/main/chapter/2") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, '@') - assert_equal(uri.expression_component, "main") - assert_equal(uri.expression_subcomponent, "chapter/2") - - uri = FrbrUri.parse("/za/act/1980/02/eng@2014-01-01/main/schedule1") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, "@2014-01-01") - assert_equal(uri.expression_component, "main") - assert_equal(uri.expression_subcomponent, "schedule1") - - uri = FrbrUri.parse("/za/act/1980/02/eng@2014-01-01/main/schedule1") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, "@2014-01-01") - assert_equal(uri.expression_component, "main") - assert_equal(uri.expression_subcomponent, "schedule1") - - uri = FrbrUri.parse("/za/act/1980/02/eng/main/chapter/2") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, None) - assert_equal(uri.expression_component, "main") - assert_equal(uri.expression_subcomponent, "chapter/2") + """ Legacy components without a ! are no longer supported. + """ + with self.assertRaises(ValueError): + FrbrUri.parse("/akn/za/act/1980/02/eng/main") - # this is a weird edge case - uri = FrbrUri.parse("/za/act/1980/02/eng/chapter/2") + def test_parse_work_component(self): + uri = FrbrUri.parse("/akn/za/act/1980/02/eng/!main") assert_equal(uri.language, "eng") assert_equal(uri.expression_date, None) - assert_equal(uri.expression_component, "chapter") - assert_equal(uri.expression_subcomponent, "2") + assert_equal(uri.work_component, "main") - def test_parse_expression_component(self): - uri = FrbrUri.parse("/za/act/1980/02/eng/!main") + uri = FrbrUri.parse("/akn/za/act/1980/02/eng/!main~chp_2") assert_equal(uri.language, "eng") assert_equal(uri.expression_date, None) - assert_equal(uri.expression_component, "main") + assert_equal(uri.work_component, "main") + assert_equal(uri.portion, "chp_2") - uri = FrbrUri.parse("/za/act/1980/02/eng/!main/chapter/2") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, None) - assert_equal(uri.expression_component, "main") - assert_equal(uri.expression_subcomponent, "chapter/2") - - uri = FrbrUri.parse("/za/act/1980/02/eng@/!main/chapter/2") + uri = FrbrUri.parse("/akn/za/act/1980/02/eng@/!main~chp_2") assert_equal(uri.language, "eng") assert_equal(uri.expression_date, '@') - assert_equal(uri.expression_component, "main") - assert_equal(uri.expression_subcomponent, "chapter/2") - - uri = FrbrUri.parse("/za/act/1980/02/eng@2014-01-01/!main/schedule1") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, "@2014-01-01") - assert_equal(uri.expression_component, "main") - assert_equal(uri.expression_subcomponent, "schedule1") - - uri = FrbrUri.parse("/za/act/1980/02/eng@2014-01-01/!main/schedule1") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, "@2014-01-01") - assert_equal(uri.expression_component, "main") - assert_equal(uri.expression_subcomponent, "schedule1") - - uri = FrbrUri.parse("/za/act/1980/02/eng/!main/chapter/2") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, None) - assert_equal(uri.expression_component, "main") - assert_equal(uri.expression_subcomponent, "chapter/2") - - # this is a weird edge case - uri = FrbrUri.parse("/za/act/1980/02/eng/chapter/2") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, None) - assert_equal(uri.expression_component, "chapter") - assert_equal(uri.expression_subcomponent, "2") + assert_equal(uri.work_component, "main") + assert_equal(uri.portion, "chp_2") def test_parse_expression_date(self): # A dangling @ indicates the very FIRST expression date, which - # we represent with an empty string (''). + # we represent with an empty string (''). # A URI without an @ at all, indicates the most recent # expression date, which is None. - uri = FrbrUri.parse("/za/act/1980/02/eng") + uri = FrbrUri.parse("/akn/za/act/1980/02/eng") assert_equal(uri.language, "eng") assert_equal(uri.expression_date, None) - assert_equal(uri.expression_uri(), '/za/act/1980/02/eng') + assert_equal(uri.expression_uri(), '/akn/za/act/1980/02/eng') - uri = FrbrUri.parse("/za/act/1980/02/eng/main") + uri = FrbrUri.parse("/akn/za/act/1980/02/eng/!main") assert_equal(uri.language, "eng") assert_equal(uri.expression_date, None) - uri = FrbrUri.parse("/za/act/1980/02/eng@") + uri = FrbrUri.parse("/akn/za/act/1980/02/eng@") assert_equal(uri.language, "eng") assert_equal(uri.expression_date, '@') - assert_equal(uri.expression_uri(), '/za/act/1980/02/eng@') + assert_equal(uri.expression_uri(), '/akn/za/act/1980/02/eng@') - uri = FrbrUri.parse("/za/act/1980/02/eng@/main") + uri = FrbrUri.parse("/akn/za/act/1980/02/eng@/!main") assert_equal(uri.language, "eng") assert_equal(uri.expression_date, '@') - uri = FrbrUri.parse("/za/act/1980/02/eng:/main") + uri = FrbrUri.parse("/akn/za/act/1980/02/eng:/!main") assert_equal(uri.language, "eng") assert_equal(uri.expression_date, ':') - uri = FrbrUri.parse("/za/act/1980/02/eng:2012-01-01/main") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, ':2012-01-01') - - uri = FrbrUri.parse("/za/act/1980/02/eng@/!main") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, '@') - - uri = FrbrUri.parse("/za/act/1980/02/eng:/!main") - assert_equal(uri.language, "eng") - assert_equal(uri.expression_date, ':') - - uri = FrbrUri.parse("/za/act/1980/02/eng:2012-01-01/!main") + uri = FrbrUri.parse("/akn/za/act/1980/02/eng:2012-01-01/!main") assert_equal(uri.language, "eng") assert_equal(uri.expression_date, ':2012-01-01') def test_parse_subtype_numeric_number(self): # A subtype with a numeric number should not be # mistaken for an actor - uri = FrbrUri.parse("/za-jhb/act/notice/2007/5319/eng@2007-12-05") + uri = FrbrUri.parse("/akn/za-jhb/act/notice/2007/5319/eng@2007-12-05") assert_is_none(uri.actor) assert_equal(uri.date, "2007") assert_equal(uri.language, "eng") @@ -341,7 +262,7 @@ def test_parse_subtype_numeric_number(self): assert_equal(uri.expression_date, "@2007-12-05") def test_parse_subtype_and_actor(self): - uri = FrbrUri.parse("/za-jhb/act/notice/actor/2007/5319/eng@2007-12-05") + uri = FrbrUri.parse("/akn/za-jhb/act/notice/actor/2007/5319/eng@2007-12-05") assert_equal(uri.actor, "actor") assert_equal(uri.date, "2007") assert_equal(uri.number, "5319") @@ -349,23 +270,20 @@ def test_parse_subtype_and_actor(self): assert_equal(uri.expression_date, "@2007-12-05") def test_expression_uri(self): - uri = FrbrUri.parse("/za/act/1980/02/eng") + uri = FrbrUri.parse("/akn/za/act/1980/02/eng") uri.expression_date = '@2014-01-01' - uri.expression_component = 'main' + uri.work_component = 'main' uri.format = 'html' - assert_equal("/za/act/1980/02/eng@2014-01-01/!main", uri.expression_uri()) - - uri.expression_subcomponent = "chapter/2" - assert_equal("/za/act/1980/02/eng@2014-01-01/!main/chapter/2", uri.expression_uri()) + assert_equal("/akn/za/act/1980/02/eng@2014-01-01/!main", uri.expression_uri()) def test_manifestation_uri(self): - uri = FrbrUri.parse("/za/act/1980/02/eng") + uri = FrbrUri.parse("/akn/za/act/1980/02/eng") uri.expression_date = '@2014-01-01' - uri.expression_component = 'main' + uri.work_component = 'main' uri.format = 'html' - assert_equal("/za/act/1980/02/eng@2014-01-01/!main.html", uri.manifestation_uri()) + assert_equal("/akn/za/act/1980/02/eng@2014-01-01/!main.html", uri.manifestation_uri()) def test_simple_prefix(self): # also recognises akn prefix @@ -408,3 +326,40 @@ def test_akn_prefix(self): actor=None ) assert_is_none(uri.prefix) + + def test_parse_portion_no_component(self): + uri = FrbrUri.parse("/akn/za/act/2005/5/~sec_5") + assert_equal(uri.portion, "sec_5") + + uri = FrbrUri.parse("/akn/za/act/2005/5/eng/~sec_5") + assert_equal(uri.portion, "sec_5") + + uri = FrbrUri.parse("/akn/za/act/2005/5/eng@2002-03-01/~sec_5") + assert_equal(uri.portion, "sec_5") + assert_equal(uri.expression_uri(), "/akn/za/act/2005/5/eng@2002-03-01/~sec_5") + + def test_parse_portion_no_component_format(self): + uri = FrbrUri.parse("/akn/za/act/2005/5/~sec_5.html") + assert_equal(uri.portion, "sec_5") + assert_equal(uri.format, "html") + + uri = FrbrUri.parse("/akn/za/act/2005/5/eng/~sec_5.xml") + assert_equal(uri.portion, "sec_5") + assert_equal(uri.format, "xml") + + uri = FrbrUri.parse("/akn/za/act/2005/5/eng@2002-03-01/~sec_5.xml") + assert_equal(uri.portion, "sec_5") + assert_equal(uri.format, "xml") + + def test_parse_portion_component(self): + uri = FrbrUri.parse("/akn/za/act/2005/5/!main~sec_5") + assert_equal(uri.work_component, "main") + assert_equal(uri.portion, "sec_5") + + uri = FrbrUri.parse("/akn/za/act/2005/5/eng/!schedule_3~sec_5") + assert_equal(uri.work_component, "schedule_3") + assert_equal(uri.portion, "sec_5") + + uri = FrbrUri.parse("/akn/za/act/2005/5/eng@2002-03-01/!main~sec_5") + assert_equal(uri.work_component, "main") + assert_equal(uri.portion, "sec_5") From 62e318717ace9edf3507ed30d4627038b48c3c0a Mon Sep 17 00:00:00 2001 From: Greg Kempe Date: Tue, 28 Jun 2022 12:14:12 +0200 Subject: [PATCH 2/2] portion extraction --- cobalt/akn.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cobalt/akn.py b/cobalt/akn.py index 67f9017..5b41b49 100644 --- a/cobalt/akn.py +++ b/cobalt/akn.py @@ -157,6 +157,11 @@ class StructuredDocument(AkomaNtosoDocument): """ The name of the document type, corresponding to the primary document XML element. """ + non_eid_portions = "arguments background conclusions decision header introduction motivation preamble" \ + " preface remedies".split() + """ Portion names that are valid portions, but don't have eids, for use with get_portion_element. + """ + @classmethod def for_document_type(cls, document_type): """ Return the subclass for this document type. @@ -441,6 +446,26 @@ def components(self): return components + def get_portion_element(self, portion, component=None): + """ Get a single portion of this document. The `portion` is usually an eId, as specified by + https://docs.oasis-open.org/legaldocml/akn-nc/v1.0/os/akn-nc-v1.0-os.html#_Toc531692279. + + The optional `component` is the ancestor element within which to look for the portion. + + Range portions (eg. `chp_1->chp_3`) are not supported by this function. + """ + root = component or self.root + + if portion in self.non_eid_portions: + # these are valid portions that don't have eids + xpath = f'.//a:{portion}' + else: + portion = portion.replace('"', '') + xpath = f'.//a:*[@eId="{portion}"]' + + for x in root.xpath(xpath, namespaces={'a': self.namespace}): + return x + def _ensure_lifecycle(self): try: after = self.meta.publication