Skip to content

Commit

Permalink
core[patch]: XMLOutputParser fix to handle changes to xml standard library (#19612)
Browse files Browse the repository at this point in the history

The newest Python micro releases changed the standard library's XML handling, which broke streaming in the XMLOutputParser. This commit fixes the parsing code so that it tolerates trailing junk after the XML content.
  • Loading branch information
eyurtsev authored and hinthornw committed Apr 26, 2024
1 parent c9cd177 commit 69530b0
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 55 deletions.
116 changes: 83 additions & 33 deletions libs/core/langchain_core/output_parsers/xml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
import xml
import xml.etree.ElementTree as ET
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Union

Expand Down Expand Up @@ -81,58 +82,107 @@ def _transform(
continue
# feed buffer to parser
parser.feed(buffer)

buffer = ""
# yield all events
for event, elem in parser.read_events():
if event == "start":
# update current path
current_path.append(elem.tag)
current_path_has_children = False
elif event == "end":
# remove last element from current path
current_path.pop()
# yield element
if not current_path_has_children:
yield nested_element(current_path, elem)
# prevent yielding of parent element
if current_path:
current_path_has_children = True
else:
xml_started = False
try:
for event, elem in parser.read_events():
if event == "start":
# update current path
current_path.append(elem.tag)
current_path_has_children = False
elif event == "end":
# remove last element from current path
#
current_path.pop()
# yield element
if not current_path_has_children:
yield nested_element(current_path, elem)
# prevent yielding of parent element
if current_path:
current_path_has_children = True
else:
xml_started = False
except xml.etree.ElementTree.ParseError:
# This might be junk at the end of the XML input.
# Let's check whether the current path is empty.
if not current_path:
# If it is empty, we can ignore this error.
break
else:
raise

# close parser
parser.close()
try:
parser.close()
except xml.etree.ElementTree.ParseError:
# Ignore. This will ignore any incomplete XML at the end of the input
pass

async def _atransform(
self, input: AsyncIterator[Union[str, BaseMessage]]
) -> AsyncIterator[AddableDict]:
xml_start_re = re.compile(r"<[a-zA-Z:_]")
parser = ET.XMLPullParser(["start", "end"])
xml_started = False
current_path: List[str] = []
current_path_has_children = False
buffer = ""
async for chunk in input:
if isinstance(chunk, BaseMessage):
# extract text
chunk_content = chunk.content
if not isinstance(chunk_content, str):
continue
chunk = chunk_content
# pass chunk to parser
parser.feed(chunk)
# add chunk to buffer of unprocessed text
buffer += chunk
# if xml string hasn't started yet, continue to next chunk
if not xml_started:
if match := xml_start_re.search(buffer):
# if xml string has started, remove all text before it
buffer = buffer[match.start() :]
xml_started = True
else:
continue
# feed buffer to parser
parser.feed(buffer)

buffer = ""
# yield all events
for event, elem in parser.read_events():
if event == "start":
# update current path
current_path.append(elem.tag)
current_path_has_children = False
elif event == "end":
# remove last element from current path
current_path.pop()
# yield element
if not current_path_has_children:
yield nested_element(current_path, elem)
# prevent yielding of parent element
current_path_has_children = True
try:
for event, elem in parser.read_events():
if event == "start":
# update current path
current_path.append(elem.tag)
current_path_has_children = False
elif event == "end":
# remove last element from current path
#
current_path.pop()
# yield element
if not current_path_has_children:
yield nested_element(current_path, elem)
# prevent yielding of parent element
if current_path:
current_path_has_children = True
else:
xml_started = False
except xml.etree.ElementTree.ParseError:
# This might be junk at the end of the XML input.
# Let's check whether the current path is empty.
if not current_path:
# If it is empty, we can ignore this error.
break
else:
raise

# close parser
parser.close()
try:
parser.close()
except xml.etree.ElementTree.ParseError:
# Ignore. This will ignore any incomplete XML at the end of the input
pass

def _root_to_dict(self, root: ET.Element) -> Dict[str, List[Any]]:
"""Converts xml tree to python dictionary."""
Expand Down
66 changes: 44 additions & 22 deletions libs/core/tests/unit_tests/output_parsers/test_xml_parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""Test XMLOutputParser"""
from typing import AsyncIterator, Iterable

import pytest

from langchain_core.exceptions import OutputParserException
from langchain_core.output_parsers.xml import XMLOutputParser

DEF_RESULT_ENCODING = """<?xml version="1.0" encoding="UTF-8"?>
DATA = """
<foo>
<bar>
<baz></baz>
Expand All @@ -13,6 +15,25 @@
<baz>tag</baz>
</foo>"""

WITH_XML_HEADER = f"""<?xml version="1.0" encoding="UTF-8"?>
{DATA}"""


IN_XML_TAGS_WITH_XML_HEADER = f"""
```xml
{WITH_XML_HEADER}
```
"""

IN_XML_TAGS_WITH_HEADER_AND_TRAILING_JUNK = f"""
Some random text
```xml
{WITH_XML_HEADER}
```
More random text
"""


DEF_RESULT_EXPECTED = {
"foo": [
{"bar": [{"baz": None}, {"baz": "slim.shady"}]},
Expand All @@ -24,36 +45,37 @@
@pytest.mark.parametrize(
"result",
[
DEF_RESULT_ENCODING,
DEF_RESULT_ENCODING[DEF_RESULT_ENCODING.find("\n") :],
f"""
```xml
{DEF_RESULT_ENCODING}
```
""",
f"""
Some random text
```xml
{DEF_RESULT_ENCODING}
```
More random text
""",
DATA, # has no xml header
WITH_XML_HEADER,
IN_XML_TAGS_WITH_XML_HEADER,
IN_XML_TAGS_WITH_HEADER_AND_TRAILING_JUNK,
],
)
def test_xml_output_parser(result: str) -> None:
async def test_xml_output_parser(result: str) -> None:
"""Test XMLOutputParser."""

xml_parser = XMLOutputParser()

xml_result = xml_parser.parse(result)
assert DEF_RESULT_EXPECTED == xml_result

# TODO(Eugene): Fix this test for newer python version
# assert list(xml_parser.transform(iter(result))) == [
# {"foo": [{"bar": [{"baz": None}]}]},
# {"foo": [{"bar": [{"baz": "slim.shady"}]}]},
# {"foo": [{"baz": "tag"}]},
# ]
assert list(xml_parser.transform(iter(result))) == [
{"foo": [{"bar": [{"baz": None}]}]},
{"foo": [{"bar": [{"baz": "slim.shady"}]}]},
{"foo": [{"baz": "tag"}]},
]

async def _as_iter(iterable: Iterable[str]) -> AsyncIterator[str]:
for item in iterable:
yield item

chunks = [chunk async for chunk in xml_parser.atransform(_as_iter(result))]

assert list(chunks) == [
{"foo": [{"bar": [{"baz": None}]}]},
{"foo": [{"bar": [{"baz": "slim.shady"}]}]},
{"foo": [{"baz": "tag"}]},
]


@pytest.mark.parametrize("result", ["foo></foo>", "<foo></foo", "foo></foo", "foofoo"])
Expand Down

0 comments on commit 69530b0

Please sign in to comment.