Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Step 4 of separating out interpretations #349

Merged
merged 9 commits into from
Jan 20, 2017
77 changes: 77 additions & 0 deletions interpparser/amendments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import functools
from copy import deepcopy

from lxml import etree

from interpparser import gpo_cfr
from regparser.notice.amendments.utils import label_amdpar_from
from regparser.notice.util import spaces_then_remove
from regparser.tree.struct import Node


def content_for_interpretations(instruction_xml):
    """Return a chunk of XML (which serves as a unique key) and a thunk for
    parsing that XML as an interpretation.

    :param instruction_xml: the amendment instruction element; it is passed
        straight to label_amdpar_from, which yields the label components and
        the associated AMDPAR element.
    :returns: a ``(xml, callable)`` pair when the second label component
        mentions 'Interpretations'. ``xml`` is the AMDPAR's parent element
        and ``callable`` is ``parse_interp`` pre-bound to the first label
        component and that parent. Returns None for any other instruction.
    """
    label_parts, amdpar = label_amdpar_from(instruction_xml)
    # label_parts[1] is read below, so at least two components are required;
    # a single-component label cannot describe an interpretation and must
    # fall through to None rather than raise IndexError.
    if len(label_parts) > 1 and 'Interpretations' in label_parts[1]:
        xml = amdpar.getparent()
        return xml, functools.partial(parse_interp, label_parts[0], xml)
    return None


def parse_interp(cfr_part, xml):
    """Gather the XML nodes which follow the 'Supplement I' header and hand
    them to gpo_cfr.parse_from_xml. Returns the parsed interpretation root,
    or None when that root ends up without any children"""
    def mentions_supplement(node):
        return 'supplement i' in (node.text or '').lower()

    found_header = False
    interp_nodes = []

    for element in standardize_interp_xml(xml):
        if not found_header:
            # Nothing is collected until a 'Supplement I' header shows up,
            # either as this element itself or somewhere beneath it. The
            # element carrying the header is not collected.
            nested_headers = element.xpath(".//HD")
            is_supp_header = (element.tag == 'HD' and
                              mentions_supplement(element))
            if is_supp_header or any(mentions_supplement(hd)
                                     for hd in nested_headers):
                found_header = True
            continue

        if element.tag != 'SECTION':
            interp_nodes.append(element)
            continue

        # SECTION shouldn't appear in this part of the XML, but often does.
        # Replace its SECTNO/SUBJECT pair with a single HD2 header and
        # collect the section's children in place of the section itself
        sectno = element.xpath('./SECTNO')[0]
        subject = element.xpath('./SUBJECT')[0]
        header = etree.Element("HD", SOURCE="HD2")
        header.text = sectno.text + '—' + subject.text
        element.insert(element.index(sectno), header)
        element.remove(sectno)
        element.remove(subject)
        interp_nodes.extend(list(element))

    root = gpo_cfr.parse_from_xml(
        Node(label=[cfr_part, Node.INTERP_MARK], node_type=Node.INTERP),
        interp_nodes)
    return root if root.children else None


def standardize_interp_xml(xml):
    """Produce a flattened copy of the XML. Downstream parsing assumes a
    Supplement I header followed by HDs, STARS, and Ps, so the children of
    every EXTRACT, APPENDIX and SUBPART are hoisted into that element's
    parent (keeping their order) and the now-empty wrapper is dropped. The
    work is done on a deep copy which has first been passed through
    spaces_then_remove for PRTPAGE"""
    cleaned = spaces_then_remove(deepcopy(xml), 'PRTPAGE')
    wrappers = cleaned.xpath(".//EXTRACT|.//APPENDIX|.//SUBPART")
    for wrapper in wrappers:
        parent = wrapper.getparent()
        position = parent.index(wrapper)
        # Each hoisted child lands just before the wrapper, one slot further
        # along than the previous one
        for offset, inner in enumerate(wrapper):
            parent.insert(position + offset, inner)
        parent.remove(wrapper)
    return cleaned
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
import logging
import re

from interpparser.tree import merge_labels, text_to_labels
from regparser.citations import Label, remove_citation_overlaps
from regparser.layer.key_terms import KeyTerms
from regparser.tree.depth import markers as mtypes
from regparser.tree.depth import heuristics, rules
from regparser.tree.depth.derive import derive_depths
from regparser.tree.interpretation import merge_labels, text_to_labels
from regparser.tree.struct import Node, treeify
from regparser.tree.xml_parser import matchers, tree_utils

Expand Down
2 changes: 1 addition & 1 deletion interpparser/layers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from collections import defaultdict

from interpparser.tree import text_to_labels
from regparser.citations import Label
from regparser.layer.layer import Layer
from regparser.tree import struct
from regparser.tree.interpretation import text_to_labels


class Interpretations(Layer):
Expand Down
2 changes: 1 addition & 1 deletion interpparser/preprocessors.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from regparser.tree.gpo_cfr.interpretations import get_app_title
from interpparser.gpo_cfr import get_app_title

_CONTAINS_SUPPLEMENT = "contains(., 'Supplement I')"
_SUPPLEMENT_HD = "//REGTEXT//HD[@SOURCE='HD1' and {0}]".format(
Expand Down
8 changes: 7 additions & 1 deletion interpparser/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@
'License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication'
],
entry_points={
'eregs_ns.parser.amendment.content':
('interpretations = '
'interpparser.amendments:content_for_interpretations'),
'eregs_ns.parser.layer.cfr':
'interpretations = interpparser.layers:Interpretations',
'eregs_ns.parser.preprocessors': [
'supplement-amdpar = interpparser.preprocessors:supplement_amdpar',
('appendix-to-interp = interpparser.preprocessors:'
'appendix_to_interp'),
],
],
"eregs_ns.parser.xml_matchers.gpo_cfr.PART": [
"interpretations = interpparser.gpo_cfr:parse_interp",
]
}
)
File renamed without changes.
36 changes: 35 additions & 1 deletion regparser/grammar/appendix.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import string

from pyparsing import FollowedBy, Literal, Word
from pyparsing import FollowedBy, LineEnd, LineStart, Literal, SkipTo, Word

from regparser.grammar import atomic, unified, utils


def parenthesize(characters, name):
Expand Down Expand Up @@ -30,3 +32,35 @@ def decimalize(characters, name):
# Markers produced by decimalize (defined earlier in this module; its body is
# not visible in this excerpt) for uppercase letters, lowercase letters and
# digits respectively. The second argument is the name given to each result.
period_upper = decimalize(string.ascii_uppercase, "period_upper")
period_lower = decimalize(string.ascii_lowercase, "period_lower")
period_digit = decimalize(string.digits, "period_digit")


# Header grammar: a section marker, then a part/section reference, then the
# remainder of the line. The atomic grammar is copied before
# leaveWhitespace() is applied so the shared atomic.section_marker object is
# left untouched.
# NOTE(review): presumably matches headers such as "§ 1005.3 Title" --
# confirm against atomic.section_marker / unified.part_section.
section = (
atomic.section_marker.copy().leaveWhitespace() +
unified.part_section +
SkipTo(LineEnd())
)


# Header grammar: a section (no leading marker), then unified.depth1_p, then
# the remainder of the line.
# NOTE(review): presumably a section number followed by a first-level
# paragraph reference -- confirm against unified.depth1_p.
par = (
atomic.section.copy().leaveWhitespace() +
unified.depth1_p +
SkipTo(LineEnd())
)


# Header grammar: a paragraph marker, then a section, then unified.depth1_p.
# Unlike the other header forms, this one does not consume the rest of the
# line.
marker_par = (
atomic.paragraph_marker.copy().leaveWhitespace() +
atomic.section +
unified.depth1_p
)


# Header grammar: an appendix marker, then an appendix identifier, then the
# remainder of the line.
appendix = (
atomic.appendix_marker.copy().leaveWhitespace() +
atomic.appendix +
SkipTo(LineEnd())
)


# Any of the header forms above, anchored at the start of a line. The
# alternatives are tried in the order listed (pyparsing's `|` is
# first-match), so marker_par is attempted before par.
# NOTE(review): utils.QuickSearchable is a project wrapper not shown here;
# presumably it speeds up scanning text for matches -- confirm.
headers = utils.QuickSearchable(
LineStart() + (section | marker_par | par | appendix))
34 changes: 0 additions & 34 deletions regparser/grammar/interpretation_headers.py

This file was deleted.

Loading