This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Merge pull request #359 from cmc333333/tweaks-37-1
Three parsing tweaks from #355
cmc333333 authored Mar 6, 2017
2 parents d0bbfce + ad32089 commit 2bd31fb
Showing 8 changed files with 584 additions and 572 deletions.
interpparser/gpo_cfr.py (2 additions, 1 deletion)

@@ -11,6 +11,7 @@
 from regparser.tree.depth import markers as mtypes
 from regparser.tree.depth import heuristics, rules
 from regparser.tree.depth.derive import derive_depths
+from regparser.tree.gpo_cfr.appendices import appendix_headers
 from regparser.tree.struct import Node, treeify
 from regparser.tree.xml_parser import matchers, tree_utils

@@ -291,7 +292,7 @@ def per_node(node):

 def build_supplement_tree(reg_part, node):
     """ Build the tree for the supplement section. """
-    title = get_app_title(node)
+    title = tree_utils.get_node_text(appendix_headers(node)[0])
     root = Node(
         node_type=Node.INTERP,
         label=[reg_part, Node.INTERP_MARK],
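For context, a minimal sketch of what the new title lookup does on a simplified Supplement I header. This assumes lxml; get_node_text here is a simplified stand-in for the real tree_utils.get_node_text, and the XML is made up:

from lxml import etree

def appendix_headers(node):
    # Same XPath as the new helper in regparser/tree/gpo_cfr/appendices.py
    return node.xpath('./RESERVED|./HD[@SOURCE="HED"]|./WHED')

def get_node_text(node):
    # Simplified stand-in for tree_utils.get_node_text
    return ''.join(node.itertext()).strip()

supplement = etree.fromstring(
    '<APPENDIX><HD SOURCE="HED">Supplement I to Part 1005</HD>'
    '<P>Official interpretations follow.</P></APPENDIX>')
print(get_node_text(appendix_headers(supplement)[0]))
# Supplement I to Part 1005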
regparser/commands/versions.py (10 additions, 6 deletions)

@@ -19,12 +19,16 @@ def fetch_version_ids(cfr_title, cfr_part, notice_dir):
     final_rules = fetch_notice_json(cfr_title, cfr_part, only_final=True)

     version_ids = []
-    for fr_id in map(itemgetter('document_number'), final_rules):
-        # Version_id concatenated with the date
-        regex = re.compile(re.escape(fr_id) + r"_\d{8}")
-        split_entries = [vid for vid in present_ids if regex.match(vid)]
-        # Add either the split entries or the original version_id
-        version_ids.extend(split_entries or [fr_id])
+    pair_fn = itemgetter('document_number', 'full_text_xml_url')
+    for fr_id, xml_url in map(pair_fn, final_rules):
+        if xml_url:
+            # Version_id concatenated with the date
+            regex = re.compile(re.escape(fr_id) + r"_\d{8}")
+            split_entries = [vid for vid in present_ids if regex.match(vid)]
+            # Add either the split entries or the original version_id
+            version_ids.extend(split_entries or [fr_id])
+        else:
+            logger.warning("No XML for %s; skipping", fr_id)

     return version_ids
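A worked example of the updated loop with toy data; the document numbers, URL, and present_ids values below are made up:

import re
from operator import itemgetter

# Locally-split entries already on disk (toy data)
present_ids = ['1_20010101', '1_20020202', '22']
final_rules = [
    {'document_number': '1', 'full_text_xml_url': 'https://example.com/1.xml'},
    {'document_number': '22', 'full_text_xml_url': None},
]

version_ids = []
pair_fn = itemgetter('document_number', 'full_text_xml_url')
for fr_id, xml_url in map(pair_fn, final_rules):
    if xml_url:
        # A split entry is the document number plus an 8-digit date
        regex = re.compile(re.escape(fr_id) + r"_\d{8}")
        split_entries = [vid for vid in present_ids if regex.match(vid)]
        version_ids.extend(split_entries or [fr_id])

print(version_ids)
# ['1_20010101', '1_20020202'] -- document 22 is skipped: no XML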
regparser/notice/xml.py (1 addition, 1 deletion)

@@ -71,7 +71,7 @@ class NoticeXML(XMLWrapper):
     def delays(self):
         """Pull out FRDelays found in the DATES tag"""
         dates_str = "".join(p.text for p in self.xpath(
-            "(//DATES/P)|(//EFFDATE/P)"))
+            "(//DATES/P)|(//EFFDATE/P)") if p.text)
         return [delay for sent in dates_str.split('.')
                 for delay in delays_in_sentence(sent)]
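A minimal reproduction of the failure the new guard prevents, assuming lxml and a simplified DATES block: a P whose content begins with a child element has a .text of None, and joining None raises TypeError.

from lxml import etree

dates = etree.fromstring(
    '<DATES><P>Effective May 1, 2017.</P>'
    '<P><E T="03">Compliance</E> dates vary.</P></DATES>')
paragraphs = dates.xpath('(//DATES/P)|(//EFFDATE/P)')
# The second P starts with a child element, so its .text is None.
# "".join(p.text for p in paragraphs)  # would raise TypeError
dates_str = "".join(p.text for p in paragraphs if p.text)
print(dates_str)  # Effective May 1, 2017.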
regparser/tree/gpo_cfr/appendices.py (4 additions, 6 deletions)

@@ -45,9 +45,8 @@ def remove_toc(appendix, letter):
         return


-def is_appendix_header(node):
-    return (node.tag == 'RESERVED' or
-            (node.tag == 'HD' and node.attrib['SOURCE'] == 'HED'))
+def appendix_headers(node):
+    return node.xpath('./RESERVED|./HD[@SOURCE="HED"]|./WHED')


 _first_markers = [re.compile(r'[\)\.|,|;|-|—]\s*\(' + lvl[0] + r'\)')

@@ -76,9 +75,8 @@ def __init__(self, part):

     def set_letter(self, appendix):
         """Find (and set) the appendix letter"""
-        for node in (c for c in appendix.getchildren()
-                     if is_appendix_header(c)):
-            text = tree_utils.get_node_text(node)
+        for hd in appendix_headers(appendix):
+            text = tree_utils.get_node_text(hd)
             if self.appendix_letter:
                 logger.warning("Found two appendix headers: %s and %s",
                                self.appendix_letter, text)
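The practical difference here: the old is_appendix_header predicate only recognized RESERVED and HD[@SOURCE="HED"] tags, while the new XPath also matches WHED headers. A small demo, assuming lxml and a made-up appendix:

from lxml import etree

def appendix_headers(node):
    return node.xpath('./RESERVED|./HD[@SOURCE="HED"]|./WHED')

appendix = etree.fromstring(
    '<APPENDIX><WHED>Appendix A to Part 1005</WHED>'
    '<P>Model forms.</P></APPENDIX>')
# The old predicate would have skipped this WHED header entirely
print([hd.tag for hd in appendix_headers(appendix)])  # ['WHED']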
tests/commands/versions_tests.py (18 additions, 2 deletions)

@@ -16,7 +16,9 @@ def test_fetch_version_ids_no_local(monkeypatch):
     """If there are no local copies, the document numbers found in the FR
     notices should be passed through"""
     monkeypatch.setattr(versions, 'fetch_notice_json', Mock(return_value=[
-        {'document_number': '1'}, {'document_number': '22'}]))
+        {'document_number': '1', 'full_text_xml_url': 'somewhere'},
+        {'document_number': '22', 'full_text_xml_url': 'somewhere'}
+    ]))
     path = entry.Entry("path")
     assert ['1', '22'] == versions.fetch_version_ids('title', 'part', path)

@@ -26,7 +28,9 @@ def test_fetch_version_ids_local(monkeypatch):
     """If a notice is split into multiple entries locally, a single document
     number might result in multiple version ids"""
     monkeypatch.setattr(versions, 'fetch_notice_json', Mock(return_value=[
-        {'document_number': '1'}, {'document_number': '22'}]))
+        {'document_number': '1', 'full_text_xml_url': 'somewhere'},
+        {'document_number': '22', 'full_text_xml_url': 'somewhere'}
+    ]))
     path = entry.Entry("path")
     (path / '1_20010101').write(b'v1')
     (path / '1_20020202').write(b'v2')

@@ -36,6 +40,18 @@
         '1_20010101', '1_20020202', '22']


+@pytest.mark.django_db
+def test_fetch_version_ids_skip_no_xml(monkeypatch):
+    """We'll skip over all of the versions which don't have XML"""
+    monkeypatch.setattr(versions, 'fetch_notice_json', Mock(return_value=[
+        {'document_number': '1', 'full_text_xml_url': 'something'},
+        {'document_number': '2', 'full_text_xml_url': None},
+        {'document_number': '3', 'full_text_xml_url': 'somewhere'}
+    ]))
+    path = entry.Entry("path")
+    assert ['1', '3'] == versions.fetch_version_ids('title', 'part', path)
+
+
 def test_delays():
     """For NoticeXMLs which cause delays to other NoticeXMLs, we'd like to get
     a dictionary of delayed -> Delay(delayer, delayed_until)"""
(Diffs for the remaining three changed files were not loaded.)