diff --git a/CHANGELOG.md b/CHANGELOG.md index 43d7362951..70aa368560 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.3-dev1 +## 0.14.3-dev2 ### Enhancements @@ -8,6 +8,8 @@ ### Fixes +* **Turn off XML resolve entities** Sets `resolve_entities=False` for XML parsing with `lxml` + to avoid text being dynamically injected into the XML document. * Add the missing `form_extraction_skip_tables` argument to the `partition_pdf_or_image` call. ## 0.14.2 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 1bc42f1aef..f9fccef86f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.3-dev1" # pragma: no cover +__version__ = "0.14.3-dev2" # pragma: no cover diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py index 265f5cb413..b6cc2accb9 100644 --- a/unstructured/partition/xml.py +++ b/unstructured/partition/xml.py @@ -51,7 +51,7 @@ def _get_leaf_elements( """Parse the XML tree in a memory efficient manner if possible.""" element_stack = [] - element_iterator = etree.iterparse(file, events=("start", "end")) + element_iterator = etree.iterparse(file, events=("start", "end"), resolve_entities=False) # NOTE(alan) If xml_path is used for filtering, I've yet to find a good way to stream # elements through in a memory efficient way, so we bite the bullet and load it all into # memory.