Initial commit (Clean history)

2025-12-30 11:27:14 +07:00
commit ef48c93de0
19255 changed files with 3248867 additions and 0 deletions
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/init.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/init.py
@@ -0,0 +1,211 @@
+from functools import partial
+import os
+
+import cobble
+
+from .. import results, lists, zips
+from .document_xml import read_document_xml_element
+from .content_types_xml import empty_content_types, read_content_types_xml_element
+from .relationships_xml import read_relationships_xml_element, Relationships
+from .numbering_xml import read_numbering_xml_element, Numbering
+from .styles_xml import read_styles_xml_element, Styles
+from .notes_xml import read_endnotes_xml_element, read_footnotes_xml_element
+from .comments_xml import read_comments_xml_element
+from .files import Files
+from . import body_xml, office_xml
+from ..zips import open_zip
+
+
+# Shared successful result wrapping an empty list; passed as the ``default``
+# when reading the notes and comments parts.
+_empty_result = results.success([])
+
+
+def read(fileobj, external_file_access=False):
+    """Read a .docx package from ``fileobj`` and return the document result.
+
+    The zip is opened for reading, the part paths are located, and the notes
+    and comments parts are read first so that their contents can be handed to
+    the main document reader. ``fileobj.name`` is used as the document path
+    when the attribute exists; ``external_file_access`` is forwarded to the
+    part reader unchanged.
+    """
+    zip_file = open_zip(fileobj, "r")
+    part_paths = _find_part_paths(zip_file)
+    document_path = getattr(fileobj, "name", None)
+    read_part_with_body = _part_with_body_reader(
+        document_path,
+        zip_file,
+        part_paths=part_paths,
+        external_file_access=external_file_access,
+    )
+
+    def read_document_with(referents):
+        # referents is [notes, comments], in the order they are combined below.
+        return _read_document(
+            zip_file,
+            read_part_with_body,
+            notes=referents[0],
+            comments=referents[1],
+            part_paths=part_paths,
+        )
+
+    referents_result = results.combine([
+        _read_notes(read_part_with_body, part_paths),
+        _read_comments(read_part_with_body, part_paths),
+    ])
+    return referents_result.bind(read_document_with)
+
+
+@cobble.data
+class _PartPaths(object):
+    """Paths of the package parts used when reading a document.
+
+    Built by ``_find_part_paths``: ``main_document`` is the path of the main
+    document part, and every other field is the path chosen for the part of
+    the same name.
+    """
+    main_document = cobble.field()
+    comments = cobble.field()
+    endnotes = cobble.field()
+    footnotes = cobble.field()
+    numbering = cobble.field()
+    styles = cobble.field()
+
+
+def _find_part_paths(zip_file):
+    """Locate the main document and its related parts inside ``zip_file``.
+
+    The main document is found through the package relationships in
+    ``_rels/.rels``. Each related part is then looked up through the main
+    document's own relationships, falling back to ``word/<name>.xml``.
+    Returns a ``_PartPaths`` instance.
+    """
+    package_relationships = _read_relationships(zip_file, "_rels/.rels")
+    main_document_path = _find_document_filename(zip_file, package_relationships)
+    main_document_relationships = _read_relationships(
+        zip_file,
+        _find_relationships_path_for(main_document_path),
+    )
+
+    def find(name):
+        relationship_type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/" + name
+        # Relationship targets are resolved relative to the main document's directory.
+        base_path, _ = zips.split_path(main_document_path)
+        return _find_part_path(
+            zip_file=zip_file,
+            relationships=main_document_relationships,
+            relationship_type=relationship_type,
+            fallback_path="word/{0}.xml".format(name),
+            base_path=base_path,
+        )
+
+    related_part_names = ("comments", "endnotes", "footnotes", "numbering", "styles")
+    related_paths = {name: find(name) for name in related_part_names}
+    return _PartPaths(main_document=main_document_path, **related_paths)
+
+
+def _find_document_filename(zip_file, relationships):
+    """Return the path of the main document part.
+
+    The path comes from the ``officeDocument`` relationship, with
+    ``word/document.xml`` as the fallback. Raises ``IOError`` when the chosen
+    path does not exist in ``zip_file``.
+    """
+    document_path = _find_part_path(
+        zip_file,
+        relationships,
+        relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
+        base_path="",
+        fallback_path="word/document.xml",
+    )
+    if not zip_file.exists(document_path):
+        raise IOError("Could not find main document part. Are you sure this is a valid .docx file?")
+    return document_path
+
+
+def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
+    """Return the first relationship target of the given type that exists.
+
+    Each target is joined onto ``base_path`` and stripped of leading slashes
+    before being checked against ``zip_file``. ``fallback_path`` is returned
+    when no target exists in the zip.
+    """
+    candidates = [
+        zips.join_path(base_path, target).lstrip("/")
+        for target in relationships.find_targets_by_type(relationship_type)
+    ]
+    for candidate in candidates:
+        if zip_file.exists(candidate):
+            return candidate
+    return fallback_path
+
+
+def _read_notes(read_part_with_body, part_paths):
+    """Read the footnotes and endnotes parts into one flattened result.
+
+    Either part falls back to the empty result when it cannot be found.
+    """
+    def read_notes_part(path, read_notes_xml_element):
+        return read_part_with_body(
+            path,
+            lambda root, body_reader: read_notes_xml_element(root, body_reader=body_reader),
+            default=_empty_result,
+        )
+
+    footnotes = read_notes_part(part_paths.footnotes, read_footnotes_xml_element)
+    endnotes = read_notes_part(part_paths.endnotes, read_endnotes_xml_element)
+    combined = results.combine([footnotes, endnotes])
+    return combined.map(lists.flatten)
+
+
+def _read_comments(read_part_with_body, part_paths):
+    """Read the comments part, falling back to the empty result when absent."""
+    def read_comments_root(root, body_reader):
+        return read_comments_xml_element(root, body_reader=body_reader)
+
+    return read_part_with_body(
+        part_paths.comments,
+        read_comments_root,
+        default=_empty_result,
+    )
+
+
+def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
+    """Read the main document part, supplying the already-read notes and comments.
+
+    ``zip_file`` is accepted but not used by this function. No default is
+    passed to the part reader here.
+    """
+    read_document_root = partial(
+        read_document_xml_element,
+        notes=notes,
+        comments=comments,
+    )
+    return read_part_with_body(part_paths.main_document, read_document_root)
+
+
+def _part_with_body_reader(document_path, zip_file, part_paths, external_file_access):
+    """Build the function used to read a package part with a body reader.
+
+    Content types, styles and numbering are read once up front (each with an
+    empty default) and shared by every part. The returned
+    ``read_part(name, reader, default=...)`` reads the relationships of the
+    named part, creates a body reader from them, and calls ``reader`` on the
+    part's XML with ``body_reader`` as a keyword argument. When ``default`` is
+    given it is returned if the part does not exist in the zip.
+    """
+    content_types = _try_read_entry_or_default(
+        zip_file,
+        "[Content_Types].xml",
+        read_content_types_xml_element,
+        default=empty_content_types,
+    )
+    styles = _try_read_entry_or_default(
+        zip_file,
+        part_paths.styles,
+        read_styles_xml_element,
+        default=Styles.EMPTY,
+    )
+
+    def read_numbering_root(element):
+        return read_numbering_xml_element(element, styles=styles)
+
+    numbering = _try_read_entry_or_default(
+        zip_file,
+        part_paths.numbering,
+        read_numbering_root,
+        default=Numbering.EMPTY,
+    )
+
+    # Files gets the document's directory, or None when no document path is known.
+    if document_path is None:
+        base_directory = None
+    else:
+        base_directory = os.path.dirname(document_path)
+    files = Files(base_directory, external_file_access=external_file_access)
+
+    def read_part(name, reader, default=_undefined):
+        part_relationships = _read_relationships(zip_file, _find_relationships_path_for(name))
+        part_body_reader = body_xml.reader(
+            numbering=numbering,
+            content_types=content_types,
+            relationships=part_relationships,
+            styles=styles,
+            docx_file=zip_file,
+            files=files,
+        )
+        bound_reader = partial(reader, body_reader=part_body_reader)
+
+        if default is not _undefined:
+            return _try_read_entry_or_default(zip_file, name, bound_reader, default=default)
+        return _read_entry(zip_file, name, bound_reader)
+
+    return read_part
+
+
+
+def _find_relationships_path_for(name):
+    """Return the path ``<dir>/_rels/<basename>.rels`` for the part at ``name``."""
+    directory, filename = zips.split_path(name)
+    relationships_filename = filename + ".rels"
+    return zips.join_path(directory, "_rels", relationships_filename)
+
+
+def _read_relationships(zip_file, name):
+    """Read the relationships entry ``name``, or return ``Relationships.EMPTY`` if it is absent."""
+    empty_relationships = Relationships.EMPTY
+    return _try_read_entry_or_default(zip_file, name, read_relationships_xml_element, default=empty_relationships)
+
+def _try_read_entry_or_default(zip_file, name, reader, default):
+    """Read entry ``name`` with ``reader``, or return ``default`` when the entry does not exist."""
+    if not zip_file.exists(name):
+        return default
+    return _read_entry(zip_file, name, reader)
+
+
+def _read_entry(zip_file, name, reader):
+    """Open entry ``name``, parse it as office XML and return ``reader`` applied to the result."""
+    with zip_file.open(name) as entry:
+        # The reader runs while the entry is still open, as the parse does.
+        root = office_xml.read(entry)
+        return reader(root)
+
+
+# Sentinel for "no default supplied" in read_part, compared by identity so that
+# any real value can still be passed as a default.
+_undefined = object()
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/init.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/init.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/body_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/body_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/comments_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/comments_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/complex_fields.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/complex_fields.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/content_types_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/content_types_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/dingbats.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/dingbats.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/document_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/document_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/files.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/files.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/notes_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/notes_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/numbering_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/numbering_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/office_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/office_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/relationships_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/relationships_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/style_map.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/style_map.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/styles_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/styles_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/uris.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/uris.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/xmlparser.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/xmlparser.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/body_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/body_xml.py
@@ -0,0 +1,794 @@
+import contextlib
+import re
+import sys
+
+from .. import documents
+from .. import results
+from .. import lists
+from .. import transforms
+from . import complex_fields
+from .dingbats import dingbats
+from .xmlparser import node_types, XmlElement, null_xml_element
+from .styles_xml import Styles
+from .uris import replace_fragment, uri_to_zip_entry_name
+
+# Python 3 has no ``unichr`` builtin; alias it to ``chr`` so the calls to
+# ``unichr`` further down work there.
+if sys.version_info >= (3, ):
+    unichr = chr
+
+
+def reader(
+    numbering=None,
+    content_types=None,
+    relationships=None,
+    styles=None,
+    docx_file=None,
+    files=None
+):
+    """Create a ``_BodyReader`` configured with the given document parts.
+
+    ``styles`` defaults to ``Styles.EMPTY`` when omitted; every other argument
+    is forwarded as given.
+    """
+    effective_styles = Styles.EMPTY if styles is None else styles
+
+    return _BodyReader(_create_reader(
+        numbering=numbering,
+        content_types=content_types,
+        relationships=relationships,
+        styles=effective_styles,
+        docx_file=docx_file,
+        files=files,
+    ))
+
+
+
+class _BodyReader(object):
+    """Wraps a read-all function and exposes its output as a ``results.Result``."""
+
+    def __init__(self, read_all):
+        self._read_all = read_all
+
+    def read_all(self, elements):
+        """Read ``elements`` and return a Result of the read elements and messages."""
+        read_result = self._read_all(elements)
+        read_elements = read_result.elements
+        messages = read_result.messages
+        return results.Result(read_elements, messages)
+
+
+def _create_reader(numbering, content_types, relationships, styles, docx_file, files):
+    current_instr_text = []
+    complex_field_stack = []
+
+    # When a paragraph is marked as deleted, its contents should be combined
+    # with the following paragraph. See 17.13.5.15 del (Deleted Paragraph) of
+    # ECMA-376 4th edition Part 1.
+    deleted_paragraph_contents = []
+
+    _ignored_elements = set([
+        "office-word:wrap",
+        "v:shadow",
+        "v:shapetype",
+        "w:annotationRef",
+        "w:bookmarkEnd",
+        "w:sectPr",
+        "w:proofErr",
+        "w:lastRenderedPageBreak",
+        "w:commentRangeStart",
+        "w:commentRangeEnd",
+        "w:del",
+        "w:footnoteRef",
+        "w:endnoteRef",
+        "w:pPr",
+        "w:rPr",
+        "w:tblPr",
+        "w:tblGrid",
+        "w:trPr",
+        "w:tcPr",
+    ])
+
+    def text(element):
+        return _success(documents.Text(_inner_text(element)))
+
+    def run(element):
+        properties = element.find_child_or_null("w:rPr")
+        vertical_alignment = properties \
+            .find_child_or_null("w:vertAlign") \
+            .attributes.get("w:val")
+        font = properties.find_child_or_null("w:rFonts").attributes.get("w:ascii")
+
+        font_size_string = properties.find_child_or_null("w:sz").attributes.get("w:val")
+        if _is_int(font_size_string):
+            # w:sz gives the font size in half points, so halve the value to get the size in points
+            font_size = int(font_size_string) / 2
+        else:
+            font_size = None
+
+        is_bold = read_boolean_element(properties.find_child("w:b"))
+        is_italic = read_boolean_element(properties.find_child("w:i"))
+        is_underline = read_underline_element(properties.find_child("w:u"))
+        is_strikethrough = read_boolean_element(properties.find_child("w:strike"))
+        is_all_caps = read_boolean_element(properties.find_child("w:caps"))
+        is_small_caps = read_boolean_element(properties.find_child("w:smallCaps"))
+        highlight = read_highlight_value(properties.find_child_or_null("w:highlight").attributes.get("w:val"))
+
+        def add_complex_field_hyperlink(children):
+            hyperlink_kwargs = current_hyperlink_kwargs()
+            if hyperlink_kwargs is None:
+                return children
+            else:
+                return [documents.hyperlink(children=children, **hyperlink_kwargs)]
+
+        return _ReadResult.map_results(
+            _read_run_style(properties),
+            _read_xml_elements(element.children).map(add_complex_field_hyperlink),
+            lambda style, children: documents.run(
+                children=children,
+                style_id=style[0],
+                style_name=style[1],
+                is_bold=is_bold,
+                is_italic=is_italic,
+                is_underline=is_underline,
+                is_strikethrough=is_strikethrough,
+                is_all_caps=is_all_caps,
+                is_small_caps=is_small_caps,
+                vertical_alignment=vertical_alignment,
+                font=font,
+                font_size=font_size,
+                highlight=highlight,
+            ))
+
+    def _read_run_style(properties):
+        return _read_style(properties, "w:rStyle", "Run", styles.find_character_style_by_id)
+
+    def read_boolean_element(element):
+        # A missing element reads as False; a present element is judged by its
+        # w:val attribute.
+        if element is not None:
+            return read_boolean_attribute_value(element.attributes.get("w:val"))
+        return False
+
+    def read_boolean_attribute_value(value):
+        return value not in ["false", "0"]
+
+    def read_underline_element(element):
+        # Always answer with a real boolean, matching read_boolean_element:
+        # a missing w:u element is False rather than None. A present element
+        # counts as underlined only when w:val is set to something other than
+        # "false", "0" or "none".
+        if element is None:
+            return False
+        return element.attributes.get("w:val") not in [None, "false", "0", "none"]
+
+    def read_highlight_value(value):
+        # An empty or missing value and the literal "none" both mean no highlight.
+        has_highlight = bool(value) and value != "none"
+        return value if has_highlight else None
+
+    def paragraph(element):
+        # Read a w:p element. A paragraph marked as deleted yields no output of
+        # its own: its children are buffered and prepended to the children of
+        # the next paragraph that is not deleted.
+        properties = element.find_child_or_null("w:pPr")
+
+        # Holds the w:del element from the paragraph's run properties, or None.
+        is_deleted = properties.find_child_or_null("w:rPr").find_child("w:del")
+
+        if is_deleted is not None:
+            for child in element.children:
+                deleted_paragraph_contents.append(child)
+            return _empty_result
+
+        else:
+            alignment = properties.find_child_or_null("w:jc").attributes.get("w:val")
+            indent = _read_paragraph_indent(properties.find_child_or_null("w:ind"))
+
+            children_xml = element.children
+            if deleted_paragraph_contents:
+                # Buffered children go first; the shared buffer is then emptied
+                # in place so they are only used once.
+                children_xml = deleted_paragraph_contents + children_xml
+                del deleted_paragraph_contents[:]
+
+            return _ReadResult.map_results(
+                _read_paragraph_style(properties),
+                _read_xml_elements(children_xml),
+                # style is the [style_id, style_name] pair produced by _read_style.
+                lambda style, children: documents.paragraph(
+                    children=children,
+                    style_id=style[0],
+                    style_name=style[1],
+                    numbering=_read_numbering_properties(
+                        paragraph_style_id=style[0],
+                        element=properties.find_child_or_null("w:numPr"),
+                    ),
+                    alignment=alignment,
+                    indent=indent,
+                )).append_extra()
+
+    def _read_paragraph_style(properties):
+        return _read_style(properties, "w:pStyle", "Paragraph", styles.find_paragraph_style_by_id)
+
+    def current_hyperlink_kwargs():
+        # Search from the most recently pushed complex field downwards and use
+        # the first hyperlink field found, if any.
+        innermost_hyperlink = next(
+            (
+                field
+                for field in reversed(complex_field_stack)
+                if isinstance(field, complex_fields.Hyperlink)
+            ),
+            None,
+        )
+        if innermost_hyperlink is None:
+            return None
+        return innermost_hyperlink.kwargs
+
+    def read_fld_char(element):
+        # Track complex field state from w:fldChar markers:
+        #   begin    - push a new field and reset the collected instruction text
+        #   separate - replace the top field with the parsed instruction text
+        #   end      - pop the field; a checkbox field produces a checkbox element
+        # Every other case produces no output.
+        fld_char_type = element.attributes.get("w:fldCharType")
+        if fld_char_type == "begin":
+            complex_field_stack.append(complex_fields.begin(fld_char=element))
+            del current_instr_text[:]
+
+        elif fld_char_type in ("end", "separate") and not complex_field_stack:
+            # A malformed document can contain an end or separate marker with
+            # no matching begin. Ignore it rather than failing with IndexError
+            # when popping from the empty stack.
+            return _empty_result
+
+        elif fld_char_type == "end":
+            complex_field = complex_field_stack.pop()
+            # With no separate marker the field is still unparsed at this point.
+            if isinstance(complex_field, complex_fields.Begin):
+                complex_field = parse_current_instr_text(complex_field)
+
+            if isinstance(complex_field, complex_fields.Checkbox):
+                return _success(documents.checkbox(checked=complex_field.checked))
+
+        elif fld_char_type == "separate":
+            complex_field_separate = complex_field_stack.pop()
+            complex_field = parse_current_instr_text(complex_field_separate)
+            complex_field_stack.append(complex_field)
+
+        return _empty_result
+
+    def parse_current_instr_text(complex_field):
+        instr_text = "".join(current_instr_text)
+
+        if isinstance(complex_field, complex_fields.Begin):
+            fld_char = complex_field.fld_char
+        else:
+            fld_char = null_xml_element
+
+        return parse_instr_text(instr_text, fld_char=fld_char)
+
+    def parse_instr_text(instr_text, *, fld_char):
+        # Turn field instruction text into a complex field value. The patterns
+        # are tried in order: external hyperlink, internal (\l) hyperlink, then
+        # form checkbox. Returns None when none of them matches.
+
+        # NOTE(review): "(.*)" is greedy, so instruction text with a further
+        # quoted argument after the target would be captured up to the last
+        # quote - confirm whether such input needs handling.
+        external_link_result = re.match(r'\s*HYPERLINK "(.*)"', instr_text)
+        if external_link_result is not None:
+            return complex_fields.hyperlink(dict(href=external_link_result.group(1)))
+
+        internal_link_result = re.match(r'\s*HYPERLINK\s+\\l\s+"(.*)"', instr_text)
+        if internal_link_result is not None:
+            return complex_fields.hyperlink(dict(anchor=internal_link_result.group(1)))
+
+        checkbox_result = re.match(r'\s*FORMCHECKBOX\s*', instr_text)
+        if checkbox_result is not None:
+            checkbox_element = fld_char \
+                .find_child_or_null("w:ffData") \
+                .find_child_or_null("w:checkBox")
+            checked_element = checkbox_element.find_child("w:checked")
+
+            # An explicit w:checked wins; otherwise fall back to w:default.
+            if checked_element is None:
+                checked = read_boolean_element(checkbox_element.find_child("w:default"))
+            else:
+                checked = read_boolean_element(checked_element)
+
+            return complex_fields.checkbox(checked=checked)
+
+        return None
+
+    def read_instr_text(element):
+        current_instr_text.append(_inner_text(element))
+        return _empty_result
+
+    def _read_style(properties, style_tag_name, style_type, find_style_by_id):
+        # Produce a [style_id, style_name] pair for the style referenced under
+        # style_tag_name. A referenced style that is not defined keeps its ID,
+        # gets no name and adds a warning.
+        style_id = properties.find_child_or_null(style_tag_name).attributes.get("w:val")
+        style_name = None
+        messages = []
+
+        if style_id is not None:
+            style = find_style_by_id(style_id)
+            if style is not None:
+                style_name = style.name
+            else:
+                messages.append(_undefined_style_warning(style_type, style_id))
+
+        return _ReadResult([style_id, style_name], [], messages)
+
+    def _undefined_style_warning(style_type, style_id):
+        return results.warning("{0} style with ID {1} was referenced but not defined in the document".format(style_type, style_id))
+
+    def _read_numbering_properties(paragraph_style_id, element):
+        # Resolve the numbering level of a paragraph: first from an explicit
+        # numbering ID plus level index, then from the paragraph style, then
+        # from the numbering ID alone.
+        num_id = element.find_child_or_null("w:numId").attributes.get("w:val")
+        level_index = element.find_child_or_null("w:ilvl").attributes.get("w:val")
+        has_num_id = num_id is not None
+
+        if has_num_id and level_index is not None:
+            return numbering.find_level(num_id, level_index)
+
+        if paragraph_style_id is not None:
+            style_level = numbering.find_level_by_paragraph_style_id(paragraph_style_id)
+            if style_level is not None:
+                return style_level
+
+        # Some malformed documents define numbering levels without an index, and
+        # reference the numbering using a w:numPr element without a w:ilvl child.
+        # To handle such cases, we assume a level of 0 as a fallback.
+        return numbering.find_level(num_id, "0") if has_num_id else None
+
+    def _read_paragraph_indent(element):
+        attributes = element.attributes
+        return documents.paragraph_indent(
+            start=attributes.get("w:start") or attributes.get("w:left"),
+            end=attributes.get("w:end") or attributes.get("w:right"),
+            first_line=attributes.get("w:firstLine"),
+            hanging=attributes.get("w:hanging"),
+        )
+
+    def tab(element):
+        return _success(documents.tab())
+
+
+    def no_break_hyphen(element):
+        return _success(documents.text(unichr(0x2011)))
+
+
+    def soft_hyphen(element):
+        return _success(documents.text(u"\u00ad"))
+
+    def symbol(element):
+        # See 17.3.3.30 sym (Symbol Character) of ECMA-376 4th edition Part 1
+        #
+        # Map a w:sym element to text using the dingbats table, keyed by
+        # (font, code point). A character that cannot be mapped, including a
+        # missing or non-hexadecimal w:char attribute, is dropped with a
+        # warning instead of raising.
+        font = element.attributes.get("w:font")
+        char = element.attributes.get("w:char")
+
+        unicode_code_point = None
+        if char is not None:
+            try:
+                unicode_code_point = dingbats.get((font, int(char, 16)))
+
+                # Characters written as F0xx are retried with the F0 prefix removed.
+                if unicode_code_point is None and re.match("^F0..", char):
+                    unicode_code_point = dingbats.get((font, int(char[2:], 16)))
+            except ValueError:
+                # w:char is not a hexadecimal number; treat it as unsupported.
+                unicode_code_point = None
+
+        if unicode_code_point is None:
+            warning = results.warning("A w:sym element with an unsupported character was ignored: char {0} in font {1}".format(
+                char,
+                font,
+            ))
+            return _empty_result_with_message(warning)
+        else:
+            return _success(documents.text(unichr(unicode_code_point)))
+
+
+    def table(element):
+        properties = element.find_child_or_null("w:tblPr")
+        return _ReadResult.map_results(
+            read_table_style(properties),
+            _read_xml_elements(element.children)
+                .flat_map(calculate_row_spans),
+
+            lambda style, children: documents.table(
+                children=children,
+                style_id=style[0],
+                style_name=style[1],
+            ),
+        )
+
+
+    def read_table_style(properties):
+        return _read_style(properties, "w:tblStyle", "Table", styles.find_table_style_by_id)
+
+
+    def table_row(element):
+        # Read a w:tr element into a table row; a row marked as deleted
+        # produces nothing.
+        properties = element.find_child_or_null("w:trPr")
+
+        # See 17.13.5.12 del (Deleted Table Row) of ECMA-376 4th edition Part 1
+        if properties.find_child("w:del"):
+            return _empty_result
+
+        is_header = bool(properties.find_child("w:tblHeader"))
+
+        def to_table_row(children):
+            return documents.table_row(children=children, is_header=is_header)
+
+        return _read_xml_elements(element.children).map(to_table_row)
+
+
+    def table_cell(element):
+        # Read a w:tc element into an unmerged table cell. Row spans start at
+        # 1 and are worked out later by calculate_row_spans from the vmerge flags.
+        properties = element.find_child_or_null("w:tcPr")
+        gridspan = properties \
+            .find_child_or_null("w:gridSpan") \
+            .attributes.get("w:val")
+
+        # A missing w:gridSpan means one column. A value that is not an
+        # integer is treated the same way instead of raising ValueError.
+        try:
+            colspan = 1 if gridspan is None else int(gridspan)
+        except ValueError:
+            colspan = 1
+
+        return _read_xml_elements(element.children) \
+            .map(lambda children: documents.table_cell_unmerged(
+                children=children,
+                colspan=colspan,
+                rowspan=1,
+                vmerge=read_vmerge(properties),
+            ))
+
+    def read_vmerge(properties):
+        # True when w:vMerge is present and its w:val is "continue" or
+        # empty/missing; any other value, or no w:vMerge at all, is False.
+        vmerge_element = properties.find_child("w:vMerge")
+        if vmerge_element is not None:
+            val = vmerge_element.attributes.get("w:val")
+            return not val or val == "continue"
+        return False
+
+
+    def calculate_row_spans(rows):
+        # Convert the unmerged cells of a table's rows into final table cells,
+        # folding vertically merged cells into the rowspan of the cell that
+        # starts the merge. If the table contains anything other than rows of
+        # unmerged cells, merging is skipped and a warning is attached.
+        unexpected_non_rows = any(
+            not isinstance(row, documents.TableRow)
+            for row in rows
+        )
+        if unexpected_non_rows:
+            rows = remove_unmerged_table_cells(rows)
+            return _elements_result_with_messages(rows, [results.warning(
+                "unexpected non-row element in table, cell merging may be incorrect"
+            )])
+
+        unexpected_non_cells = any(
+            not isinstance(cell, documents.TableCellUnmerged)
+            for row in rows
+            for cell in row.children
+        )
+        if unexpected_non_cells:
+            rows = remove_unmerged_table_cells(rows)
+            return _elements_result_with_messages(rows, [results.warning(
+                "unexpected non-cell element in table row, cell merging may be incorrect"
+            )])
+
+        # Maps a starting column index to the cell currently occupying that
+        # column, i.e. the cell a continuation cell below it merges into.
+        columns = {}
+        for row in rows:
+            cell_index = 0
+            for cell in row.children:
+                if cell.vmerge and cell_index in columns:
+                    columns[cell_index].rowspan += 1
+                else:
+                    # This cell starts a new span, so it is kept even if it was
+                    # flagged as a merge continuation with nothing above it.
+                    columns[cell_index] = cell
+                    cell.vmerge = False
+                # Advance by the cell's width so indexes refer to grid columns.
+                cell_index += cell.colspan
+
+        # Cells still flagged vmerge were absorbed into a cell above; drop them.
+        for row in rows:
+            row.children = [
+                documents.table_cell(
+                    children=cell.children,
+                    colspan=cell.colspan,
+                    rowspan=cell.rowspan,
+                )
+                for cell in row.children
+                if not cell.vmerge
+            ]
+
+        return _success(rows)
+
+
+    def remove_unmerged_table_cells(rows):
+        return list(map(
+            transforms.element_of_type(
+                documents.TableCellUnmerged,
+                lambda cell: documents.table_cell(
+                    children=cell.children,
+                    colspan=cell.colspan,
+                    rowspan=cell.rowspan,
+                ),
+            ),
+            rows,
+        ))
+
+
+    def read_child_elements(element):
+        return _read_xml_elements(element.children)
+
+
+    def pict(element):
+        return read_child_elements(element).to_extra()
+
+
+    def hyperlink(element):
+        # Read a w:hyperlink element. A relationship ID gives an href (with the
+        # anchor applied as its fragment when both are present), an anchor alone
+        # gives an internal link, and with neither the children are returned
+        # without a hyperlink around them.
+        attributes = element.attributes
+        relationship_id = attributes.get("r:id")
+        anchor = attributes.get("w:anchor")
+        target_frame = attributes.get("w:tgtFrame") or None
+        children_result = _read_xml_elements(element.children)
+
+        def create(**kwargs):
+            def to_hyperlink(children):
+                return documents.hyperlink(
+                    children=children,
+                    target_frame=target_frame,
+                    **kwargs
+                )
+
+            return children_result.map(to_hyperlink)
+
+        if relationship_id is None and anchor is None:
+            return children_result
+
+        if relationship_id is None:
+            return create(anchor=anchor)
+
+        href = relationships.find_target_by_relationship_id(relationship_id)
+        if anchor is not None:
+            href = replace_fragment(href, anchor)
+        return create(href=href)
+
+
+    def bookmark_start(element):
+        name = element.attributes.get("w:name")
+        if name == "_GoBack":
+            return _empty_result
+        else:
+            return _success(documents.bookmark(name))
+
+
+    def break_(element):
+        # Map a w:br element to a line, page or column break. A missing type or
+        # "textWrapping" is a line break; an unknown type produces a warning
+        # and no element.
+        break_type = element.attributes.get("w:type")
+
+        if not break_type or break_type == "textWrapping":
+            return _success(documents.line_break)
+
+        if break_type == "page":
+            return _success(documents.page_break)
+
+        if break_type == "column":
+            return _success(documents.column_break)
+
+        warning = results.warning("Unsupported break type: {0}".format(break_type))
+        return _empty_result_with_message(warning)
+
+
+    def inline(element):
+        # Read the images in an inline drawing. The alt text is the docPr
+        # description when it is non-blank, otherwise the docPr title.
+        properties = element.find_child_or_null("wp:docPr").attributes
+        description = properties.get("descr", "")
+        if description.strip():
+            alt_text = description
+        else:
+            alt_text = properties.get("title")
+
+        blips = element.find_children("a:graphic")
+        for tag_name in ("a:graphicData", "pic:pic", "pic:blipFill", "a:blip"):
+            blips = blips.find_children(tag_name)
+
+        return _read_blips(blips, alt_text)
+
+    def _read_blips(blips, alt_text):
+        return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text), blips))
+
+    def _read_blip(element, alt_text):
+        blip_image = _find_blip_image(element)
+
+        if blip_image is None:
+            warning = results.warning("Could not find image file for a:blip element")
+            return _empty_result_with_message(warning)
+        else:
+            return _read_image(blip_image, alt_text)
+
+    def _read_image(image_file, alt_text):
+        # image_file is an (image_path, open_image) pair. The content type is
+        # looked up from the path, and a warning is attached when it is not one
+        # of the types listed below.
+        image_path, open_image = image_file
+        content_type = content_types.find_content_type(image_path)
+        image = documents.image(alt_text=alt_text, content_type=content_type, open=open_image)
+
+        supported_content_types = ["image/png", "image/gif", "image/jpeg", "image/svg+xml", "image/tiff"]
+        messages = []
+        if content_type not in supported_content_types:
+            messages.append(
+                results.warning("Image of type {0} is unlikely to display in web browsers".format(content_type))
+            )
+
+        return _element_result_with_messages(image, messages)
+
+    def _find_blip_image(element):
+        # Prefer an embedded image (r:embed) over a linked one (r:link);
+        # return None when the blip references neither.
+        attributes = element.attributes
+        embed_relationship_id = attributes.get("r:embed")
+        link_relationship_id = attributes.get("r:link")
+
+        if embed_relationship_id is not None:
+            return _find_embedded_image(embed_relationship_id)
+        if link_relationship_id is not None:
+            return _find_linked_image(link_relationship_id)
+        return None
+
+    def _find_embedded_image(relationship_id):
+        target = relationships.find_target_by_relationship_id(relationship_id)
+        image_path = uri_to_zip_entry_name("word", target)
+
+        def open_image():
+            image_file = docx_file.open(image_path)
+            if hasattr(image_file, "__exit__"):
+                return image_file
+            else:
+                return contextlib.closing(image_file)
+
+        return image_path, open_image
+
+
+    def _find_linked_image(relationship_id):
+        image_path = relationships.find_target_by_relationship_id(relationship_id)
+
+        def open_image():
+            return files.open(image_path)
+
+        return image_path, open_image
+
+    def read_imagedata(element):
+        relationship_id = element.attributes.get("r:id")
+        if relationship_id is None:
+            warning = results.warning("A v:imagedata element without a relationship ID was ignored")
+            return _empty_result_with_message(warning)
+        else:
+            title = element.attributes.get("o:title")
+            return _read_image(_find_embedded_image(relationship_id), title)
+
+    def note_reference_reader(note_type):
+        # Build an element handler that produces a note reference of the
+        # given type, using the element's w:id attribute as the note ID.
+        def note_reference(element):
+            note_id = element.attributes["w:id"]
+            return _success(documents.note_reference(note_type, note_id))
+
+        return note_reference
+
+    def read_comment_reference(element):
+        # Produce a comment reference from the element's w:id attribute.
+        comment_id = element.attributes["w:id"]
+        return _success(documents.comment_reference(comment_id))
+
+    def alternate_content(element):
+        # Only the mc:Fallback branch is read; with no fallback the null
+        # element is passed on instead.
+        fallback = element.find_child_or_null("mc:Fallback")
+        return read_child_elements(fallback)
+
+    def read_sdt(element):
+        # Read a w:sdt element. The children of w:sdtContent are read as
+        # usual; if w:sdtPr contains a wordml:checkbox child, the content is
+        # post-processed so that a checkbox element stands in for the
+        # checkbox character.
+        content_result = read_child_elements(element.find_child_or_null("w:sdtContent"))
+
+        def handle_content(content):
+            # From the WordML standard: https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/3350cb64-931f-41f7-8824-f18b2568ce66
+            #
+            # > A CT_SdtCheckbox element that specifies that the parent
+            # > structured document tag is a checkbox when displayed in the
+            # > document. The parent structured document tag contents MUST
+            # > contain a single character and optionally an additional
+            # > character in a deleted run.
+            checkbox = element.find_child_or_null("w:sdtPr").find_child("wordml:checkbox")
+
+            # Not a checkbox: leave the content untouched.
+            if checkbox is None:
+                return content
+
+            checked_element = checkbox.find_child("wordml:checked")
+            # Checked only when wordml:checked is present and
+            # read_boolean_attribute_value accepts its wordml:val attribute.
+            is_checked = (
+                checked_element is not None and
+                read_boolean_attribute_value(checked_element.attributes.get("wordml:val"))
+            )
+            document_checkbox = documents.checkbox(checked=is_checked)
+
+            # Set once the first non-empty text has been replaced.
+            has_checkbox = False
+
+            def transform_text(text):
+                # Swap the first non-empty text for the checkbox; any other
+                # text is returned unchanged.
+                nonlocal has_checkbox
+                if len(text.value) > 0 and not has_checkbox:
+                    has_checkbox = True
+                    return document_checkbox
+                else:
+                    return text
+
+            # list() runs the mapping immediately, so has_checkbox is final
+            # by the time it is inspected below. Presumably
+            # transforms.element_of_type applies transform_text to
+            # documents.Text elements only - confirm against transforms.
+            replaced_content = list(map(
+                transforms.element_of_type(documents.Text, transform_text),
+                content,
+            ))
+
+            # If no text was replaced, the checkbox becomes the whole content.
+            if has_checkbox:
+                return replaced_content
+            else:
+                return document_checkbox
+
+        return content_result.map(handle_content)
+
+    # Maps an XML element name to the function that reads that element.
+    # read() looks elements up here; names that are missing are either
+    # silently ignored or reported with a warning.
+    handlers = {
+        "w:t": text,
+        "w:r": run,
+        "w:p": paragraph,
+        "w:fldChar": read_fld_char,
+        "w:instrText": read_instr_text,
+        "w:tab": tab,
+        "w:noBreakHyphen": no_break_hyphen,
+        "w:softHyphen": soft_hyphen,
+        "w:sym": symbol,
+        "w:tbl": table,
+        "w:tr": table_row,
+        "w:tc": table_cell,
+        "w:ins": read_child_elements,
+        "w:object": read_child_elements,
+        "w:smartTag": read_child_elements,
+        "w:drawing": read_child_elements,
+        "v:group": read_child_elements,
+        "v:rect": read_child_elements,
+        "v:roundrect": read_child_elements,
+        "v:shape": read_child_elements,
+        "v:textbox": read_child_elements,
+        "w:txbxContent": read_child_elements,
+        "w:pict": pict,
+        "w:hyperlink": hyperlink,
+        "w:bookmarkStart": bookmark_start,
+        "w:br": break_,
+        "wp:inline": inline,
+        "wp:anchor": inline,
+        "v:imagedata": read_imagedata,
+        "w:footnoteReference": note_reference_reader("footnote"),
+        "w:endnoteReference": note_reference_reader("endnote"),
+        "w:commentReference": read_comment_reference,
+        "mc:AlternateContent": alternate_content,
+        "w:sdt": read_sdt
+    }
+
+    def read(element):
+        # Dispatch on the element name. Unhandled elements produce an empty
+        # result, plus a warning unless the name is in _ignored_elements.
+        handler = handlers.get(element.name)
+        if handler is not None:
+            return handler(element)
+
+        if element.name in _ignored_elements:
+            return _empty_result
+
+        warning = results.warning("An unrecognised element was ignored: {0}".format(element.name))
+        return _empty_result_with_message(warning)
+
+
+    def _read_xml_elements(nodes):
+        # Read every XmlElement among the nodes (other node kinds are
+        # skipped) and concatenate the individual results.
+        elements = (node for node in nodes if isinstance(node, XmlElement))
+        return _ReadResult.concat(lists.map(read, elements))
+
+    return _read_xml_elements
+
+
+def _inner_text(node):
+    """Return the concatenated value of every text node at or below `node`."""
+    if node.node_type != node_types.text:
+        # Non-text node: recurse into the children in document order.
+        return "".join(map(_inner_text, node.children))
+
+    return node.value
+
+
+
+class _ReadResult(object):
+    """Outcome of reading XML: elements, "extra" elements and messages.
+
+    `elements`, `extra` and `messages` are lists. Every method returns a new
+    `_ReadResult` rather than modifying the receiver.
+    """
+
+    def __init__(self, elements, extra, messages):
+        self.elements = elements
+        self.extra = extra
+        self.messages = messages
+
+    @staticmethod
+    def concat(results):
+        """Merge several results by flattening each of their three lists."""
+        elements = lists.flat_map(lambda item: item.elements, results)
+        extra = lists.flat_map(lambda item: item.extra, results)
+        messages = lists.flat_map(lambda item: item.messages, results)
+        return _ReadResult(elements, extra, messages)
+
+    @staticmethod
+    def map_results(first, second, func):
+        """Combine two results; `func` receives both element lists and its
+        return value becomes the single element of the new result."""
+        combined = func(first.elements, second.elements)
+        return _ReadResult(
+            [combined],
+            first.extra + second.extra,
+            first.messages + second.messages,
+        )
+
+    def map(self, func):
+        """Apply `func` to the element list; a non-list return value is
+        wrapped in a one-element list."""
+        mapped = func(self.elements)
+        elements = mapped if isinstance(mapped, list) else [mapped]
+        return _ReadResult(elements, self.extra, self.messages)
+
+    def flat_map(self, func):
+        """Apply `func`, which returns a `_ReadResult`, to the element list
+        and append its extras and messages to this result's."""
+        inner = func(self.elements)
+        return _ReadResult(
+            inner.elements,
+            self.extra + inner.extra,
+            self.messages + inner.messages,
+        )
+
+    def to_extra(self):
+        """Move the elements to the end of the extras, leaving no elements."""
+        return _ReadResult([], _concat(self.extra, self.elements), self.messages)
+
+    def append_extra(self):
+        """Move the extras to the end of the elements, leaving no extras."""
+        return _ReadResult(_concat(self.elements, self.extra), [], self.messages)
+
+
+def _success(elements):
+    """Wrap a single element or a list of elements in a result with no
+    extras and no messages."""
+    wrapped = elements if isinstance(elements, list) else [elements]
+    return _ReadResult(wrapped, [], [])
+
+
+def _element_result_with_messages(element, messages):
+    """Build a result holding exactly one element plus the given messages."""
+    return _ReadResult([element], [], messages)
+
+
+def _elements_result_with_messages(elements, messages):
+    """Build a result from a list of elements and a list of messages."""
+    return _ReadResult(elements=elements, extra=[], messages=messages)
+
+
+# Module-level result with no elements, no extras and no messages.
+_empty_result = _ReadResult([], [], [])
+
+def _empty_result_with_message(message):
+    """Build a result that carries a single message and no elements."""
+    return _ReadResult(elements=[], extra=[], messages=[message])
+
+
+def _concat(*values):
+    """Return a new list with the items of every iterable in `values`, in
+    order."""
+    return [item for sequence in values for item in sequence]
+
+
+def _is_int(value):
+    """Return True if `value` is not None and `int(value)` succeeds.
+
+    Only ValueError is treated as "not an integer"; any other exception
+    raised by `int` propagates.
+    """
+    if value is not None:
+        try:
+            int(value)
+        except ValueError:
+            pass
+        else:
+            return True
+
+    return False
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/comments_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/comments_xml.py
@@ -0,0 +1,24 @@
+from .. import lists
+from .. import documents
+from .. import results
+
+
+def read_comments_xml_element(element, body_reader):
+    """Read every w:comment child of `element` into a comment.
+
+    Each comment body is read with `body_reader.read_all`; the individual
+    results are merged with `results.combine`.
+    """
+    def read_optional_attribute(comment_element, name):
+        # Missing, empty or whitespace-only attributes all become None.
+        return comment_element.attributes.get(name, "").strip() or None
+
+    def read_comment(comment_element):
+        def to_comment(body):
+            return documents.comment(
+                comment_id=comment_element.attributes["w:id"],
+                body=body,
+                author_name=read_optional_attribute(comment_element, "w:author"),
+                author_initials=read_optional_attribute(comment_element, "w:initials"),
+            )
+
+        return body_reader.read_all(comment_element.children).map(to_comment)
+
+    comment_elements = element.find_children("w:comment")
+    return results.combine(lists.map(read_comment, comment_elements))
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/complex_fields.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/complex_fields.py
@@ -0,0 +1,29 @@
+class unknown(object):
+    """Empty class with no attributes or behaviour of its own.
+
+    NOTE(review): presumably a marker for complex fields that are not
+    otherwise recognised - confirm against the call sites.
+    """
+
+
+class Begin:
+    """Holds the `fld_char` value it was created with."""
+
+    def __init__(self, *, fld_char):
+        # Keyword-only, so call sites always name the argument.
+        self.fld_char = fld_char
+
+
+def begin(*, fld_char):
+    """Create a `Begin` for the given `fld_char` (keyword-only)."""
+    return Begin(fld_char=fld_char)
+
+
+class Hyperlink(object):
+    """Holds the `kwargs` value it was created with.
+
+    Note that `kwargs` is an ordinary parameter, not a `**kwargs`
+    collector.
+    """
+
+    def __init__(self, kwargs):
+        self.kwargs = kwargs
+
+
+def hyperlink(kwargs):
+    """Create a `Hyperlink` wrapping the given `kwargs` value."""
+    return Hyperlink(kwargs=kwargs)
+
+
+class Checkbox:
+    """Holds the `checked` value it was created with."""
+
+    def __init__(self, *, checked):
+        # Keyword-only, so call sites always name the argument.
+        self.checked = checked
+
+
+def checkbox(*, checked):
+    """Create a `Checkbox` with the given `checked` value (keyword-only)."""
+    return Checkbox(checked=checked)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/content_types_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/content_types_xml.py
@@ -0,0 +1,58 @@
+def read_content_types_xml_element(element):
+    """Build a `_ContentTypes` from the Default and Override children of
+    `element`."""
+    extension_defaults = dict(
+        _read_default(child)
+        for child in element.find_children("content-types:Default")
+    )
+    overrides = dict(
+        _read_override(child)
+        for child in element.find_children("content-types:Override")
+    )
+    return _ContentTypes(extension_defaults, overrides)
+
+
+def _read_default(element):
+    """Return the `(extension, content type)` pair of a Default element."""
+    attributes = element.attributes
+    return attributes["Extension"], attributes["ContentType"]
+
+
+def _read_override(element):
+    """Return the `(part name, content type)` pair of an Override element.
+
+    Leading slashes are removed from the part name.
+    """
+    attributes = element.attributes
+    part_name, content_type = attributes["PartName"], attributes["ContentType"]
+    return part_name.lstrip("/"), content_type
+
+
+class _ContentTypes(object):
+    """Finds the content type of a path inside the package.
+
+    Lookup order: an override for the exact path, then the default
+    registered for the path's extension, then a built-in table of image
+    extensions.
+    """
+
+    # Fallback image subtypes, keyed by lower-case file extension.
+    _image_content_types = {
+        "png": "png",
+        "gif": "gif",
+        "jpeg": "jpeg",
+        "jpg": "jpeg",
+        "tif": "tiff",
+        "tiff": "tiff",
+        "bmp": "bmp",
+    }
+
+    def __init__(self, extension_defaults, overrides):
+        self._extension_defaults = extension_defaults
+        self._overrides = overrides
+
+    def find_content_type(self, path):
+        """Return the content type for `path`, or None if it is unknown."""
+        try:
+            return self._overrides[path]
+        except KeyError:
+            pass
+
+        extension = _get_extension(path)
+        default_type = self._extension_defaults.get(extension)
+        if default_type is None:
+            # Only this built-in lookup lower-cases the extension.
+            image_type = self._image_content_types.get(extension.lower())
+            return None if image_type is None else "image/" + image_type
+
+        return default_type
+
+
+# Content types with no defaults and no overrides: only the built-in image
+# extension table can produce a match.
+empty_content_types = _ContentTypes({}, {})
+
+def _get_extension(path):
+    """Return the text after the last "." in `path`.
+
+    A path without a "." is returned whole.
+    """
+    return path.rsplit(".", 1)[-1]
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/dingbats.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/dingbats.py
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/document_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/document_xml.py
@@ -0,0 +1,25 @@
+from .. import documents
+
+
+def read_document_xml_element(
+        element,
+        body_reader,
+        notes=None,
+        comments=None):
+    """Read the w:body of `element` into a document.
+
+    `notes` and `comments` default to empty lists. Raises ValueError if
+    `element` has no w:body child.
+    """
+    notes = [] if notes is None else notes
+    comments = [] if comments is None else comments
+
+    body_element = element.find_child("w:body")
+    if body_element is None:
+        raise ValueError("Could not find the body element: are you sure this is a docx file?")
+
+    def to_document(children):
+        return documents.document(
+            children,
+            notes=documents.notes(notes),
+            comments=comments
+        )
+
+    return body_reader.read_all(body_element.children).map(to_document)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/files.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/files.py
@@ -0,0 +1,46 @@
+import os
+import contextlib
+try:
+    from urllib2 import urlopen
+except ImportError:
+    from urllib.request import urlopen
+try:
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
+
+
+class Files(object):
+    """Opens files that a document refers to but does not contain.
+
+    `base` is the directory used to resolve relative URIs (may be None);
+    `external_file_access` must be truthy for `open` to do anything.
+    """
+
+    def __init__(self, base, external_file_access):
+        self._base = base
+        self._external_file_access = external_file_access
+
+    def open(self, uri):
+        """Open `uri` and return the resulting file object.
+
+        URIs with a scheme are fetched with `urlopen`; anything else is
+        opened in binary mode relative to the base directory.
+
+        Raises ExternalFileAccessIsDisabledError when external access is
+        off, and InvalidFileReferenceError when there is no base directory
+        for a relative URI or when opening fails with an IOError.
+        """
+        if not self._external_file_access:
+            raise ExternalFileAccessIsDisabledError(
+                "could not open external image '{0}', external file access is disabled".format(uri)
+            )
+
+        try:
+            if _is_absolute(uri):
+                stream = contextlib.closing(urlopen(uri))
+            elif self._base is None:
+                raise InvalidFileReferenceError("could not find external image '{0}', fileobj has no name".format(uri))
+            else:
+                stream = open(os.path.join(self._base, uri), "rb")
+        except IOError as error:
+            message = "could not open external image: '{0}' (document directory: '{1}')\n{2}".format(
+                uri, self._base, str(error))
+            raise InvalidFileReferenceError(message)
+
+        return stream
+
+
+def _is_absolute(url):
+    """Return True if `url` has a non-empty scheme component."""
+    scheme = urlparse(url).scheme
+    return scheme != ""
+
+
+class InvalidFileReferenceError(ValueError):
+    """Raised by `Files.open` when an external file cannot be located or
+    opened."""
+
+
+class ExternalFileAccessIsDisabledError(InvalidFileReferenceError):
+    """Raised by `Files.open` when external file access is disabled."""
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/notes_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/notes_xml.py
@@ -0,0 +1,32 @@
+import functools
+
+from .. import lists
+from .. import documents
+from .. import results
+
+
+def _read_notes(note_type, element, body_reader):
+    """Read the `w:<note_type>` children of `element` into notes.
+
+    Children whose w:type is "separator" or "continuationSeparator" are
+    skipped. Each remaining body is read with `body_reader.read_all` and
+    the individual results are merged with `results.combine`.
+    """
+    def is_note(candidate):
+        return candidate.attributes.get("w:type") not in ["continuationSeparator", "separator"]
+
+    def read_note(note_element):
+        def to_note(body):
+            return documents.note(
+                note_type=note_type,
+                note_id=note_element.attributes["w:id"],
+                body=body
+            )
+
+        return body_reader.read_all(note_element.children).map(to_note)
+
+    candidates = element.find_children("w:" + note_type)
+    note_elements = lists.filter(is_note, candidates)
+    return results.combine(lists.map(read_note, note_elements))
+
+# Public readers: _read_notes with note_type bound, leaving (element,
+# body_reader) as the remaining arguments.
+read_footnotes_xml_element = functools.partial(_read_notes, "footnote")
+read_endnotes_xml_element = functools.partial(_read_notes, "endnote")
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/numbering_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/numbering_xml.py
@@ -0,0 +1,130 @@
+import cobble
+
+from ..documents import numbering_level
+from .styles_xml import Styles
+
+
+def read_numbering_xml_element(element, styles):
+    """Build a `Numbering` from the w:abstractNum and w:num children of
+    `element`, together with the given `styles`."""
+    return Numbering(
+        abstract_nums=_read_abstract_nums(element),
+        nums=_read_nums(element),
+        styles=styles,
+    )
+
+
+def _read_abstract_nums(element):
+    """Return a dict of abstract numbering definitions keyed by their ID."""
+    return dict(
+        _read_abstract_num(child)
+        for child in element.find_children("w:abstractNum")
+    )
+
+
+def _read_abstract_num(element):
+    """Return `(abstract num ID, _AbstractNum)` for a w:abstractNum element.
+
+    The ID and the style link are None when the matching attribute or child
+    is absent.
+    """
+    abstract_num_id = element.attributes.get("w:abstractNumId")
+    levels = _read_abstract_num_levels(element)
+    style_link_element = element.find_child_or_null("w:numStyleLink")
+    abstract_num = _AbstractNum(
+        levels=levels,
+        num_style_link=style_link_element.attributes.get("w:val"),
+    )
+    return abstract_num_id, abstract_num
+
+
+# One abstract numbering definition (a w:abstractNum element).
+@cobble.data
+class _AbstractNum(object):
+    # Dict of _AbstractNumLevel keyed by level index.
+    levels = cobble.field()
+    # w:val of the w:numStyleLink child, or None when there is no link.
+    num_style_link = cobble.field()
+
+
+# One level (a w:lvl element) of an abstract numbering definition.
+@cobble.data
+class _AbstractNumLevel(object):
+    # The w:ilvl attribute as read from the XML; may be None until
+    # _read_abstract_num_levels substitutes "0".
+    level_index = cobble.field()
+    # False only when the w:numFmt value is "bullet".
+    is_ordered = cobble.field()
+    # w:val of the w:pStyle child, or None.
+    paragraph_style_id = cobble.field()
+
+
+def _read_abstract_num_levels(element):
+    """Return the w:lvl children of `element` as a dict keyed by level index."""
+    indexed_levels = {}
+
+    # Some malformed documents define numbering levels without an index, and
+    # reference the numbering using a w:numPr element without a w:ilvl child.
+    # To handle such cases, we assume a level of 0 as a fallback.
+    unindexed_level = None
+
+    for level_element in element.find_children("w:lvl"):
+        parsed = _read_abstract_num_level(level_element)
+        if parsed.level_index is not None:
+            indexed_levels[parsed.level_index] = parsed
+            continue
+
+        parsed.level_index = "0"
+        unindexed_level = parsed
+
+    # An explicitly indexed level always wins over the fallback, and only
+    # the last level without an index is considered.
+    if unindexed_level is not None:
+        indexed_levels.setdefault(unindexed_level.level_index, unindexed_level)
+
+    return indexed_levels
+
+
+def _read_abstract_num_level(element):
+    """Read a w:lvl element into an `_AbstractNumLevel`.
+
+    A level is treated as ordered unless its w:numFmt value is "bullet",
+    so a level with no w:numFmt at all counts as ordered.
+    """
+    number_format = element.find_child_or_null("w:numFmt").attributes.get("w:val")
+    paragraph_style = element.find_child_or_null("w:pStyle")
+    return _AbstractNumLevel(
+        level_index=element.attributes.get("w:ilvl"),
+        is_ordered=(number_format != "bullet"),
+        paragraph_style_id=paragraph_style.attributes.get("w:val"),
+    )
+
+
+def _read_nums(element):
+    """Return a dict of `_Num` keyed by num ID, one per w:num child."""
+    return dict(map(_read_num, element.find_children("w:num")))
+
+
+def _read_num(element):
+    """Return `(num ID, _Num)` for a w:num element.
+
+    Both the num ID and the abstract num ID are None when the corresponding
+    attribute or child element is missing.
+    """
+    num_id = element.attributes.get("w:numId")
+    # find_child_or_null yields an element with empty attributes when
+    # w:abstractNumId is absent, so use .get() rather than indexing: a
+    # malformed w:num then reads as an abstract num ID of None instead of
+    # raising KeyError and aborting the whole read.
+    abstract_num_id = element.find_child_or_null("w:abstractNumId").attributes.get("w:val")
+    return num_id, _Num(abstract_num_id=abstract_num_id)
+
+
+# A concrete numbering instance (a w:num element).
+@cobble.data
+class _Num(object):
+    # w:val of the w:abstractNumId child: the abstract definition it uses.
+    abstract_num_id = cobble.field()
+
+
+class Numbering(object):
+    """Resolves numbering levels from abstract nums, nums and styles.
+
+    `abstract_nums` maps abstract num IDs to abstract definitions, `nums`
+    maps num IDs to nums, and `styles` is used to follow numbering style
+    links.
+    """
+
+    def __init__(self, abstract_nums, nums, styles):
+        self._abstract_nums = abstract_nums
+        # Index of every level that names a paragraph style, so that a
+        # level can be found from the paragraph style ID alone.
+        self._levels_by_paragraph_style_id = dict(
+            (level.paragraph_style_id, self._to_numbering_level(level))
+            for abstract_num in abstract_nums.values()
+            for level in abstract_num.levels.values()
+            if level.paragraph_style_id is not None
+        )
+        self._nums = nums
+        self._styles = styles
+
+    def find_level(self, num_id, level):
+        """Return the numbering level for `num_id` and `level`.
+
+        Returns None when the num, its abstract num, the level, or the
+        numbering style named by a style link cannot be found.
+        """
+        num = self._nums.get(num_id)
+        if num is None:
+            return None
+
+        abstract_num = self._abstract_nums.get(num.abstract_num_id)
+        if abstract_num is None:
+            return None
+
+        if abstract_num.num_style_link is None:
+            return self._to_numbering_level(abstract_num.levels.get(level))
+
+        # The abstract num delegates to a numbering style: continue the
+        # lookup with the num that the style points at.
+        style = self._styles.find_numbering_style_by_id(abstract_num.num_style_link)
+        if style is None:
+            # A link to a style that does not exist: treat the level as
+            # unknown rather than failing on `style.num_id`.
+            return None
+
+        return self.find_level(style.num_id, level)
+
+    def find_level_by_paragraph_style_id(self, style_id):
+        """Return the level whose paragraph style is `style_id`, or None."""
+        return self._levels_by_paragraph_style_id.get(style_id)
+
+    def _to_numbering_level(self, abstract_num_level):
+        """Convert an abstract level into a numbering level; None stays
+        None."""
+        if abstract_num_level is None:
+            return None
+
+        return numbering_level(
+            level_index=abstract_num_level.level_index,
+            is_ordered=abstract_num_level.is_ordered,
+        )
+
+
+# Shared instance with no abstract nums, no nums and empty styles, so every
+# lookup on it returns None.
+Numbering.EMPTY = Numbering(abstract_nums={}, nums={}, styles=Styles.EMPTY)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/office_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/office_xml.py
@@ -0,0 +1,45 @@
+from ..lists import flat_map
+from .xmlparser import parse_xml, XmlElement
+
+
+# (prefix, namespace URI) pairs handed to parse_xml by read(). The same
+# prefix is listed for both the Transitional and the Strict URI of a
+# namespace.
+_namespaces = [
+    # Transitional format
+    ("w", "http://schemas.openxmlformats.org/wordprocessingml/2006/main"),
+    ("r", "http://schemas.openxmlformats.org/officeDocument/2006/relationships"),
+    ("wp", "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"),
+    ("a", "http://schemas.openxmlformats.org/drawingml/2006/main"),
+    ("pic", "http://schemas.openxmlformats.org/drawingml/2006/picture"),
+
+    # Strict format
+    ("w", "http://purl.oclc.org/ooxml/wordprocessingml/main"),
+    ("r", "http://purl.oclc.org/ooxml/officeDocument/relationships"),
+    ("wp", "http://purl.oclc.org/ooxml/drawingml/wordprocessingDrawing"),
+    ("a", "http://purl.oclc.org/ooxml/drawingml/main"),
+    ("pic", "http://purl.oclc.org/ooxml/drawingml/picture"),
+
+    # Common
+    ("content-types", "http://schemas.openxmlformats.org/package/2006/content-types"),
+    ("relationships", "http://schemas.openxmlformats.org/package/2006/relationships"),
+    ("mc", "http://schemas.openxmlformats.org/markup-compatibility/2006"),
+    ("v", "urn:schemas-microsoft-com:vml"),
+    ("office-word", "urn:schemas-microsoft-com:office:word"),
+
+    # [MS-DOCX]: Word Extensions to the Office Open XML (.docx) File Format
+    # https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/b839fe1f-e1ca-4fa6-8c26-5954d0abbccd
+    ("wordml", "http://schemas.microsoft.com/office/word/2010/wordml"),
+]
+
+
+def read(fileobj):
+    """Parse `fileobj` as XML using the Office namespace prefixes and return
+    the root node with mc:AlternateContent elements collapsed."""
+    root = parse_xml(fileobj, _namespaces)
+    collapsed = _collapse_alternate_content(root)
+    return collapsed[0]
+
+
+def _collapse_alternate_content(node):
+    """Return the list of nodes that should replace `node`.
+
+    An mc:AlternateContent element is replaced by the children of its
+    mc:Fallback child, which are returned as they are. Any other element
+    has its children collapsed in place and is returned in a one-item
+    list, as are nodes that are not elements.
+    """
+    if not isinstance(node, XmlElement):
+        return [node]
+
+    if node.name == "mc:AlternateContent":
+        return node.find_child_or_null("mc:Fallback").children
+
+    node.children = flat_map(_collapse_alternate_content, node.children)
+    return [node]
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/relationships_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/relationships_xml.py
@@ -0,0 +1,38 @@
+import collections
+
+
+class Relationships(object):
+    """Index of relationships by ID and by type.
+
+    `relationships` is any iterable of objects with `relationship_id`,
+    `target` and `type` attributes.
+    """
+
+    def __init__(self, relationships):
+        # The input is walked twice below; materialise it first so that a
+        # generator or other single-pass iterable fills both indexes.
+        relationships = list(relationships)
+
+        self._targets_by_id = dict(
+            (relationship.relationship_id, relationship.target)
+            for relationship in relationships
+        )
+        self._targets_by_type = collections.defaultdict(list)
+        for relationship in relationships:
+            self._targets_by_type[relationship.type].append(relationship.target)
+
+    def find_target_by_relationship_id(self, key):
+        """Return the target for the relationship ID `key`.
+
+        Raises KeyError if there is no relationship with that ID.
+        """
+        return self._targets_by_id[key]
+
+    def find_targets_by_type(self, relationship_type):
+        """Return the list of targets of the given type, in input order.
+
+        An unknown type gives an empty list.
+        """
+        # .get() instead of indexing: indexing a defaultdict would insert
+        # an empty entry for every unknown type that is queried.
+        return self._targets_by_type.get(relationship_type, [])
+
+
+# Shared instance that contains no relationships.
+Relationships.EMPTY = Relationships([])
+
+
+# One Relationship element: its Id, Target and Type attributes.
+Relationship = collections.namedtuple("Relationship", ["relationship_id", "target", "type"])
+
+
+def read_relationships_xml_element(element):
+    """Build a `Relationships` from the relationships:Relationship children
+    of `element`."""
+    relationship_elements = element.find_children("relationships:Relationship")
+    return Relationships([
+        _read_relationship(child)
+        for child in relationship_elements
+    ])
+
+
+def _read_relationship(element):
+    """Read the Id, Target and Type attributes of `element` into a
+    `Relationship`; a missing attribute raises KeyError."""
+    attributes = element.attributes
+    return Relationship(
+        relationship_id=attributes["Id"],
+        target=attributes["Target"],
+        type=attributes["Type"],
+    )
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/style_map.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/style_map.py
@@ -0,0 +1,70 @@
+from xml.etree import ElementTree
+
+from ..zips import open_zip, update_zip
+
+
+# Zip entry that holds the embedded style map.
+_style_map_path = "mammoth/style-map"
+# The same location with a leading slash, as written into the relationship
+# Target and the content-type PartName.
+_style_map_absolute_path = "/" + _style_map_path
+# Zip entries that are rewritten when a style map is embedded.
+_relationships_path = "word/_rels/document.xml.rels"
+_content_types_path = "[Content_Types].xml"
+
+
+def write_style_map(fileobj, style_map):
+    """Embed `style_map` (text) in the docx held by `fileobj`.
+
+    The style map is stored UTF-8 encoded, and the document relationships
+    and content types are regenerated so that they refer to it.
+    """
+    with open_zip(fileobj, "r") as zip_file:
+        existing_relationships = zip_file.read_str(_relationships_path)
+        relationships_xml = _generate_relationships_xml(existing_relationships)
+        existing_content_types = zip_file.read_str(_content_types_path)
+        content_types_xml = _generate_content_types_xml(existing_content_types)
+
+    # The zip is only updated once the reader above has been closed.
+    updated_entries = {
+        _style_map_path: style_map.encode("utf8"),
+        _relationships_path: relationships_xml,
+        _content_types_path: content_types_xml,
+    }
+    update_zip(fileobj, updated_entries)
+
+
+def _generate_relationships_xml(relationships_xml):
+    """Return `relationships_xml`, serialised as UTF-8, with the style map
+    relationship added or replaced."""
+    schema = "http://schemas.zwobble.org/mammoth/style-map"
+    relationships_uri = "http://schemas.openxmlformats.org/package/2006/relationships"
+    relationship_element_name = "{" + relationships_uri + "}Relationship"
+
+    style_map_relationship = {
+        "Id": "rMammothStyleMap",
+        "Type": schema,
+        "Target": _style_map_absolute_path,
+    }
+
+    root = ElementTree.fromstring(relationships_xml)
+    # Relationships are matched on their Id attribute.
+    _add_or_update_element(root, relationship_element_name, "Id", style_map_relationship)
+
+    return ElementTree.tostring(root, "UTF-8")
+
+
+def _generate_content_types_xml(content_types_xml):
+    """Return `content_types_xml`, serialised as UTF-8, with the style map
+    Override added or replaced."""
+    content_types_uri = "http://schemas.openxmlformats.org/package/2006/content-types"
+    override_name = "{" + content_types_uri + "}Override"
+
+    style_map_override = {
+        "PartName": _style_map_absolute_path,
+        "ContentType": "text/prs.mammoth.style-map",
+    }
+
+    root = ElementTree.fromstring(content_types_xml)
+    # Overrides are matched on their PartName attribute.
+    _add_or_update_element(root, override_name, "PartName", style_map_override)
+
+    return ElementTree.tostring(root, "UTF-8")
+
+
+def _add_or_update_element(parent, name, identifying_attribute, attributes):
+    """Make sure `parent` has an element `name` with exactly `attributes`.
+
+    An element whose `identifying_attribute` already matches has all of
+    its attributes replaced; otherwise a new sub-element is appended.
+    """
+    existing_child = _find_child(parent, name, identifying_attribute, attributes)
+    if existing_child is not None:
+        existing_child.attrib = attributes
+        return
+
+    ElementTree.SubElement(parent, name, attributes)
+    
+
+def _find_child(parent, name, identifying_attribute, attributes):
+    """Return the first element with tag `name` whose
+    `identifying_attribute` equals the one in `attributes`, else None.
+
+    The search uses `parent.iter()`, so it covers `parent` itself and all
+    of its descendants, not just direct children.
+    """
+    matches = (
+        candidate
+        for candidate in parent.iter()
+        if candidate.tag == name
+        and candidate.get(identifying_attribute) == attributes.get(identifying_attribute)
+    )
+    return next(matches, None)
+
+
+def read_style_map(fileobj):
+    """Return the style map embedded in the docx held by `fileobj`, or None
+    if it has none."""
+    with open_zip(fileobj, "r") as zip_file:
+        if not zip_file.exists(_style_map_path):
+            return None
+
+        return zip_file.read_str(_style_map_path)
+
+
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/styles_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/styles_xml.py
@@ -0,0 +1,117 @@
+import collections
+
+
+class Styles(object):
+    """Style definitions grouped by style type and looked up by style ID.
+
+    Every `find_*_by_id` method returns None for an unknown ID.
+    """
+
+    def __init__(self, paragraph_styles, character_styles, table_styles, numbering_styles):
+        self._paragraph_styles = paragraph_styles
+        self._character_styles = character_styles
+        self._table_styles = table_styles
+        self._numbering_styles = numbering_styles
+
+    @staticmethod
+    def create(paragraph_styles=None, character_styles=None, table_styles=None, numbering_styles=None):
+        """Build a `Styles`, using a new empty dict for each omitted
+        mapping."""
+        return Styles(
+            paragraph_styles={} if paragraph_styles is None else paragraph_styles,
+            character_styles={} if character_styles is None else character_styles,
+            table_styles={} if table_styles is None else table_styles,
+            numbering_styles={} if numbering_styles is None else numbering_styles,
+        )
+
+    def find_paragraph_style_by_id(self, style_id):
+        """Return the paragraph style with the given ID, or None."""
+        return self._paragraph_styles.get(style_id)
+
+    def find_character_style_by_id(self, style_id):
+        """Return the character style with the given ID, or None."""
+        return self._character_styles.get(style_id)
+
+    def find_table_style_by_id(self, style_id):
+        """Return the table style with the given ID, or None."""
+        return self._table_styles.get(style_id)
+
+    def find_numbering_style_by_id(self, style_id):
+        """Return the numbering style with the given ID, or None."""
+        return self._numbering_styles.get(style_id)
+
+
+# Shared instance with no styles of any type: every lookup returns None.
+Styles.EMPTY = Styles(
+    paragraph_styles={},
+    character_styles={},
+    table_styles={},
+    numbering_styles={},
+)
+
+
+def read_styles_xml_element(element):
+    """Read the w:style children of `element` into a `Styles`.
+
+    Styles are grouped by their w:type ("paragraph", "character", "table"
+    or "numbering"); styles of any other type are dropped. When several
+    styles of one type share a style ID, only the first is kept.
+    """
+    paragraph_styles = {}
+    character_styles = {}
+    table_styles = {}
+    numbering_styles = {}
+    styles = {
+        "paragraph": paragraph_styles,
+        "character": character_styles,
+        "table": table_styles,
+        "numbering": numbering_styles,
+    }
+
+    for style_element in element.find_children("w:style"):
+        # A w:style without a w:type used to raise KeyError and abort the
+        # whole read. Fall back to "paragraph" instead.
+        # NOTE(review): "paragraph" is our reading of the default in
+        # ECMA-376 Part 1, 17.7.4.17 - confirm against the standard.
+        element_type = style_element.attributes.get("w:type", "paragraph")
+        if element_type == "numbering":
+            style = _read_numbering_style_element(style_element)
+        else:
+            style = _read_style_element(style_element)
+
+        style_set = styles.get(element_type)
+
+        # Per 17.7.4.17 style (Style Definition) of ECMA-376 4th edition Part 1:
+        #
+        # > If multiple style definitions each declare the same value for their
+        # > styleId, then the first such instance shall keep its current
+        # > identifier with all other instances being reassigned in any manner
+        # > desired.
+        #
+        # For the purpose of conversion, there's no point holding onto styles
+        # with reassigned style IDs, so we ignore such style definitions.
+
+        if style_set is not None and style.style_id not in style_set:
+            style_set[style.style_id] = style
+
+    return Styles(
+        paragraph_styles=paragraph_styles,
+        character_styles=character_styles,
+        table_styles=table_styles,
+        numbering_styles=numbering_styles,
+    )
+
+
+# A style that is not a numbering style: its w:styleId and the w:val of its
+# w:name child (None when there is no name).
+Style = collections.namedtuple("Style", ["style_id", "name"])
+
+
+def _read_style_element(element):
+    """Read a w:style element into a `Style`; the name is None when there
+    is no w:name child or it has no w:val."""
+    style_id = _read_style_id(element)
+    name_element = element.find_child_or_null("w:name")
+    return Style(style_id=style_id, name=name_element.attributes.get("w:val"))
+
+
+# A numbering style: its w:styleId and the num ID found under
+# w:pPr/w:numPr/w:numId (None when that path is missing).
+NumberingStyle = collections.namedtuple("NumberingStyle", ["style_id", "num_id"])
+
+
+def _read_numbering_style_element(element):
+    """Read a numbering w:style element into a `NumberingStyle`.
+
+    The num ID is the w:val of w:pPr/w:numPr/w:numId, or None if any part
+    of that path is missing.
+    """
+    style_id = _read_style_id(element)
+
+    paragraph_properties = element.find_child_or_null("w:pPr")
+    numbering_properties = paragraph_properties.find_child_or_null("w:numPr")
+    num_id_element = numbering_properties.find_child_or_null("w:numId")
+
+    return NumberingStyle(
+        style_id=style_id,
+        num_id=num_id_element.attributes.get("w:val"),
+    )
+
+
+def _read_style_id(element):
+    """Return the w:styleId attribute of `element`; raises KeyError when it
+    is missing."""
+    attributes = element.attributes
+    return attributes["w:styleId"]
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/uris.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/uris.py
@@ -0,0 +1,12 @@
+def uri_to_zip_entry_name(base, uri):
+    """Convert a relationship target into a zip entry name.
+
+    A URI starting with "/" is taken from the package root, so only the
+    leading slash is removed; any other URI is appended to `base`.
+    """
+    return uri[1:] if uri.startswith("/") else base + "/" + uri
+
+
+def replace_fragment(uri, fragment):
+    """Return `uri` with everything from its first "#" onwards replaced by
+    "#" + `fragment`; the fragment is appended if `uri` has no "#"."""
+    without_fragment = uri.partition("#")[0]
+    return without_fragment + "#" + fragment
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/xmlparser.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/xmlparser.py
@@ -0,0 +1,121 @@
+import xml.dom.minidom
+
+import cobble
+
+
+@cobble.data
+class XmlElement(object):
+    # Element node of the converted tree: a name, an attribute mapping and
+    # an ordered collection of child nodes.
+    name = cobble.field()
+    attributes = cobble.field()
+    children = cobble.field()
+
+    def find_child_or_null(self, name):
+        """Like ``find_child``, but fall back to ``null_xml_element``."""
+        found = self.find_child(name)
+        return found or null_xml_element
+
+    def find_child(self, name):
+        """Return the first ``XmlElement`` child called ``name``, else ``None``."""
+        matches = (
+            child
+            for child in self.children
+            if isinstance(child, XmlElement) and child.name == name
+        )
+        return next(matches, None)
+
+    def find_children(self, name):
+        """Return an ``XmlElementList`` of the element children called ``name``.
+
+        Children are selected by their ``node_type`` tag, and the matches
+        are handed to ``XmlElementList`` as a lazy iterable.
+        """
+        return XmlElementList(
+            child
+            for child in self.children
+            if child.node_type == node_types.element and child.name == name
+        )
+
+
+class XmlElementList(object):
+    """A re-iterable collection of XML elements.
+
+    Supports iteration and ``find_children``, which gathers the matching
+    children of every contained element into a new ``XmlElementList``.
+    """
+
+    def __init__(self, elements):
+        # Snapshot into a list: callers may pass a one-shot iterator (for
+        # example a ``filter`` object), which would otherwise come back
+        # silently empty on a second pass over this collection.
+        self._elements = list(elements)
+
+    def __iter__(self):
+        return iter(self._elements)
+
+    def find_children(self, name):
+        """Return the children called ``name`` of every element, in order."""
+        children = []
+        for element in self._elements:
+            children.extend(element.find_children(name))
+        return XmlElementList(children)
+
+
+class NullXmlElement(object):
+    """Null object standing in for a missing XML element.
+
+    It has no attributes and no children, and every lookup on it comes
+    back empty, so chained lookups keep working after a missing link.
+    """
+
+    attributes = {}
+    children = []
+
+    def find_child_or_null(self, name):
+        """Return this null element again, so lookup chains can continue."""
+        return self
+
+    def find_child(self, name):
+        """Return ``None``: a missing element has no children."""
+        return None
+
+    def find_children(self, name):
+        """Return an empty ``XmlElementList``.
+
+        This mirrors ``XmlElement.find_children``. Without it, calling
+        ``find_children`` on the result of ``find_child_or_null`` raised
+        ``AttributeError`` whenever the child was missing.
+        """
+        return XmlElementList([])
+
+
+null_xml_element = NullXmlElement()
+
+
+@cobble.data
+class XmlText(object):  # text node of the converted tree
+    value = cobble.field()  # the text itself; parse_xml fills it from the DOM node's nodeValue
+
+
+def element(name, attributes=None, children=None):
+    """Create an ``XmlElement`` called ``name``.
+
+    Falsy ``attributes`` or ``children`` (``None`` or empty) are replaced
+    by a fresh empty dict or list respectively.
+    """
+    if not attributes:
+        attributes = {}
+    if not children:
+        children = []
+    return XmlElement(name, attributes, children)
+
+text = XmlText  # alias: ``text(value)`` constructs an XmlText, mirroring the ``element`` factory
+
+
+class node_types(object):  # namespace of node-kind tags compared against a node's ``node_type``
+    element = 1  # tag attached to XmlElement
+    text = 3  # tag attached to XmlText
+
+
+XmlElement.node_type = node_types.element  # class-level tag that XmlElement.find_children checks to pick out element children
+XmlText.node_type = node_types.text  # text nodes carry a tag as well, so ``child.node_type`` works on every child
+
+
+
+def parse_xml(fileobj, namespace_mapping=None):
+    """Parse XML from ``fileobj`` into a tree of ``XmlElement``/``XmlText``.
+
+    ``namespace_mapping`` is an optional iterable of ``(prefix, uri)``
+    pairs. Element and attribute names are rendered as:
+
+    * the bare local name when the node has no namespace;
+    * ``prefix:local`` when the namespace URI is in the mapping;
+    * ``{uri}local`` when the namespace URI is not in the mapping.
+
+    Namespace declaration attributes (those in the ``xmlns`` namespace)
+    are dropped. Only element and text nodes are kept; every other DOM
+    node kind is skipped. Returns the converted document element.
+    """
+    prefix_by_uri = {}
+    if namespace_mapping is not None:
+        prefix_by_uri = {uri: prefix for prefix, uri in namespace_mapping}
+
+    def qualified_name(node):
+        uri = node.namespaceURI
+        if uri is None:
+            return node.localName
+        prefix = prefix_by_uri.get(uri)
+        if prefix is None:
+            return "{%s}%s" % (uri, node.localName)
+        return "%s:%s" % (prefix, node.localName)
+
+    def to_element(dom_element):
+        name = qualified_name(dom_element)
+        attributes = {
+            qualified_name(attribute): attribute.value
+            for attribute in dom_element.attributes.values()
+            if attribute.namespaceURI != "http://www.w3.org/2000/xmlns/"
+        }
+        converted = (to_node(child) for child in dom_element.childNodes)
+        children = [child for child in converted if child is not None]
+        return XmlElement(name, attributes, children)
+
+    def to_node(node):
+        kind = node.nodeType
+        if kind == xml.dom.Node.ELEMENT_NODE:
+            return to_element(node)
+        if kind == xml.dom.Node.TEXT_NODE:
+            return XmlText(node.nodeValue)
+        # Comments, processing instructions, CDATA sections etc. are dropped.
+        return None
+
+    # NOTE(review): minidom is not hardened against maliciously constructed
+    # XML; see the standard library's XML security notes if inputs may be
+    # untrusted.
+    document = xml.dom.minidom.parse(fileobj)
+    return to_node(document.documentElement)