Initial commit (Clean history)

2025-12-30 11:27:14 +07:00
commit ef48c93de0
19255 changed files with 3248867 additions and 0 deletions
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/init.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/init.py
@@ -0,0 +1,58 @@
+from . import docx, conversion, options, images, transforms, underline
+from .raw_text import extract_raw_text_from_element
+from .docx.style_map import write_style_map, read_style_map
+
+# Names exported by ``from mammoth import *``.
+__all__ = ["convert_to_html", "extract_raw_text", "images", "transforms", "underline"]
+
+
+# Sentinel default used by ``convert`` to tell "argument not passed" apart
+# from any explicitly supplied value.
+_undefined = object()
+
+
+def convert_to_html(*args, **kwargs):
+    """Convert a document to HTML.
+
+    Forwards every argument to ``convert`` with ``output_format="html"``.
+    """
+    return convert(*args, output_format="html", **kwargs)
+
+
+def convert_to_markdown(*args, **kwargs):
+    """Convert a document to Markdown.
+
+    Forwards every argument to ``convert`` with ``output_format="markdown"``.
+    """
+    return convert(*args, output_format="markdown", **kwargs)
+
+
+def convert(
+    fileobj,
+    transform_document=None,
+    id_prefix=None,
+    include_embedded_style_map=_undefined,
+    external_file_access=_undefined,
+    **kwargs
+):
+    """Read the docx document in ``fileobj`` and convert it.
+
+    :param fileobj: file object holding the docx document.
+    :param transform_document: optional callable applied to the parsed
+        document before conversion; defaults to the identity function.
+    :param id_prefix: forwarded to the converter as ``id_prefix``.
+    :param include_embedded_style_map: when true (the default), the style map
+        read from ``fileobj`` is handed to the options reader as
+        ``embedded_style_map``.
+    :param external_file_access: forwarded to ``docx.read``; defaults to
+        ``False``.
+    :param kwargs: remaining options, parsed by ``options.read_options``.
+    :returns: the result of chaining option parsing, document reading and
+        conversion.
+    """
+    # Resolve the sentinel defaults first.
+    if include_embedded_style_map is _undefined:
+        include_embedded_style_map = True
+    if external_file_access is _undefined:
+        external_file_access = False
+    if transform_document is None:
+        transform_document = lambda x: x
+
+    if include_embedded_style_map:
+        kwargs["embedded_style_map"] = read_style_map(fileobj)
+
+    def read_and_convert(convert_options):
+        # Runs once the options have been parsed successfully.
+        def to_output(document):
+            return conversion.convert_document_element_to_html(
+                document,
+                id_prefix=id_prefix,
+                **convert_options
+            )
+
+        parsed = docx.read(fileobj, external_file_access=external_file_access)
+        return parsed.map(transform_document).bind(to_output)
+
+    return options.read_options(kwargs).bind(read_and_convert)
+
+
+def extract_raw_text(fileobj):
+    """Read the docx document in ``fileobj`` and map it to its raw text.
+
+    The document is read with ``docx.read`` and the result is mapped through
+    ``extract_raw_text_from_element``.
+    """
+    return docx.read(fileobj).map(extract_raw_text_from_element)
+
+
+def embed_style_map(fileobj, style_map):
+    """Write ``style_map`` into ``fileobj`` via ``write_style_map``."""
+    write_style_map(fileobj, style_map)
+
+def read_embedded_style_map(fileobj):
+    """Return the style map stored in ``fileobj``, as read by ``read_style_map``."""
+    return read_style_map(fileobj)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/init.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/init.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/cli.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/cli.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/conversion.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/conversion.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/document_matchers.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/document_matchers.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/documents.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/documents.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/html_paths.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/html_paths.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/images.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/images.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/lists.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/lists.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/options.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/options.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/raw_text.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/raw_text.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/results.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/results.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/transforms.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/transforms.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/underline.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/underline.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/zips.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/pycache/zips.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/cli.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/cli.py
@@ -0,0 +1,104 @@
+import argparse
+import io
+import os
+import shutil
+import sys
+
+import mammoth
+from . import writers
+
+
+def main():
+    """Command-line entry point: convert one docx file and write the output.
+
+    Reads the optional style map file, converts the document at ``args.path``
+    and writes each conversion message to stderr. With ``--output-dir`` the
+    images are written to separate files in that directory and the document is
+    written next to them as ``<docx basename>.html``; otherwise the document
+    goes to ``args.output`` (or stdout when that is unset).
+    """
+    args = _parse_args()
+
+    style_map = None
+    if args.style_map is not None:
+        with open(args.style_map) as style_map_fileobj:
+            style_map = style_map_fileobj.read()
+
+    with open(args.path, "rb") as docx_fileobj:
+        if args.output_dir is not None:
+            convert_image = mammoth.images.img_element(ImageWriter(args.output_dir))
+            stem = os.path.basename(args.path).rpartition(".")[0]
+            output_path = os.path.join(args.output_dir, "{0}.html".format(stem))
+        else:
+            convert_image = None
+            output_path = args.output
+
+        result = mammoth.convert(
+            docx_fileobj,
+            style_map=style_map,
+            convert_image=convert_image,
+            output_format=args.output_format,
+        )
+
+        write_err = sys.stderr.write
+        for message in result.messages:
+            write_err(message.message)
+            write_err("\n")
+
+        _write_output(output_path, result.value)
+
+
+class ImageWriter(object):
+    """Callable that saves each image into a directory as a numbered file.
+
+    Files are named ``<n>.<extension>``, where ``n`` starts at 1 and the
+    extension is the part of the image's content type after the ``/``.
+    """
+
+    def __init__(self, output_dir):
+        self._image_number = 1
+        self._output_dir = output_dir
+
+    def __call__(self, element):
+        """Copy the image to disk and return its ``src`` attribute."""
+        _, _, extension = element.content_type.partition("/")
+        image_filename = "{0}.{1}".format(self._image_number, extension)
+        destination = os.path.join(self._output_dir, image_filename)
+
+        with open(destination, "wb") as image_dest, element.open() as image_source:
+            shutil.copyfileobj(image_source, image_dest)
+
+        # Only advance the counter once the copy has succeeded.
+        self._image_number += 1
+        return {"src": image_filename}
+
+
+def _write_output(path, contents):
+    """Write ``contents`` as UTF-8 to ``path``, or to stdout when ``path`` is None."""
+    if path is not None:
+        with io.open(path, "w", encoding="utf-8") as fileobj:
+            fileobj.write(contents)
+        return
+
+    # Encoded bytes go to the binary stdout stream, which on Python 3 is
+    # ``sys.stdout.buffer``.
+    stdout = sys.stdout if sys.version_info[0] <= 2 else sys.stdout.buffer
+    stdout.write(contents.encode("utf-8"))
+    stdout.flush()
+
+
+def _parse_args():
+    """Build the command-line parser and parse the process arguments.
+
+    Arguments:
+      * ``path`` (required): the .docx file to convert.
+      * ``output`` (optional positional) or ``--output-dir``: mutually
+        exclusive output destinations.
+      * ``--output-format``: one of ``writers.formats()``.
+      * ``--style-map``: path of a file containing a style map.
+
+    :returns: the namespace produced by ``ArgumentParser.parse_args()``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "path",
+        metavar="docx-path",
+        help="Path to the .docx file to convert.")
+
+    output_group = parser.add_mutually_exclusive_group()
+    output_group.add_argument(
+        "output",
+        nargs="?",
+        metavar="output-path",
+        help="Output path for the generated document. Images will be stored inline in the output document. Output is written to stdout if not set.")
+    output_group.add_argument(
+        "--output-dir",
+        help="Output directory for generated HTML and images. Images will be stored in separate files. Mutually exclusive with output-path.")
+
+    parser.add_argument(
+        "--output-format",
+        required=False,
+        choices=writers.formats(),
+        help="Output format.")
+    parser.add_argument(
+        "--style-map",
+        required=False,
+        # Fixed typo in the user-facing help text ("containg").
+        help="File containing a style map.")
+    return parser.parse_args()
+
+
+# Run the command-line interface when this module is executed as a script.
+if __name__ == "__main__":
+    main()
+
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/conversion.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/conversion.py
@@ -0,0 +1,408 @@
+# coding=utf-8
+
+from __future__ import unicode_literals
+
+from functools import partial
+
+import cobble
+
+from . import documents, results, html_paths, images, writers, html
+from .docx.files import InvalidFileReferenceError
+from .lists import find_index
+
+
+def convert_document_element_to_html(element,
+        style_map=None,
+        convert_image=None,
+        id_prefix=None,
+        output_format=None,
+        ignore_empty_paragraphs=True):
+    """Convert a document element tree and serialise it with a writer.
+
+    :param element: the element to convert; when it is a
+        ``documents.Document`` its comments are indexed by ``comment_id``.
+    :param style_map: styles consulted by the converter; defaults to ``[]``.
+    :param convert_image: image converter; defaults to ``images.data_uri``.
+    :param id_prefix: prefix for generated HTML ids; defaults to ``""``.
+    :param output_format: passed to ``writers.writer`` to select the writer.
+    :param ignore_empty_paragraphs: forwarded to the converter.
+    :returns: ``results.Result`` holding the written string and the messages
+        collected during conversion.
+    """
+    style_map = [] if style_map is None else style_map
+    id_prefix = "" if id_prefix is None else id_prefix
+    convert_image = images.data_uri if convert_image is None else convert_image
+
+    comments = {}
+    if isinstance(element, documents.Document):
+        comments = {
+            comment.comment_id: comment
+            for comment in element.comments
+        }
+
+    messages = []
+    converter = _DocumentConverter(
+        messages=messages,
+        style_map=style_map,
+        convert_image=convert_image,
+        id_prefix=id_prefix,
+        ignore_empty_paragraphs=ignore_empty_paragraphs,
+        note_references=[],
+        comments=comments,
+    )
+    root_context = _ConversionContext(is_table_header=False)
+    nodes = converter.visit(element, root_context)
+
+    # Clean up the node tree before handing it to the writer.
+    cleaned_nodes = html.collapse(html.strip_empty(nodes))
+    writer = writers.writer(output_format)
+    html.write(writer, cleaned_nodes)
+    return results.Result(writer.as_string(), messages)
+
+
+@cobble.data
+class _ConversionContext(object):
+    """State passed to every ``visit_*`` call during a conversion."""
+    # True while the rows being visited belong to a table header; cells then
+    # render as ``th`` instead of ``td``.
+    is_table_header = cobble.field()
+
+    def copy(self, **kwargs):
+        """Return ``cobble.copy`` of this context, forwarding ``kwargs``."""
+        return cobble.copy(self, **kwargs)
+
+
+class _DocumentConverter(documents.element_visitor(args=1)):
+    """Visitor that turns document elements into lists of HTML nodes.
+
+    Every ``visit_*`` method receives the element and the current
+    ``_ConversionContext`` and returns a list of nodes. Warnings are appended
+    to the ``messages`` list supplied by the caller.
+    """
+    def __init__(self, messages, style_map, convert_image, id_prefix, ignore_empty_paragraphs, note_references, comments):
+        self._messages = messages
+        self._style_map = style_map
+        self._id_prefix = id_prefix
+        self._ignore_empty_paragraphs = ignore_empty_paragraphs
+        # Note references in the order they are visited; resolved into notes by
+        # visit_document.
+        self._note_references = note_references
+        # (label, comment) pairs in the order they are referenced.
+        self._referenced_comments = []
+        self._convert_image = convert_image
+        # Mapping from comment id to comment.
+        self._comments = comments
+
+    def visit_image(self, image, context):
+        """Convert an image; an invalid file reference becomes a warning."""
+        try:
+            return self._convert_image(image)
+        except InvalidFileReferenceError as error:
+            self._messages.append(results.warning(str(error)))
+            return []
+
+    def visit_document(self, document, context):
+        """Convert the body, then append the referenced notes and comments.
+
+        The children are visited first so that the note and comment
+        references they contain have been collected before the ``ol`` of
+        notes and the ``dl`` of comments are built.
+        """
+        nodes = self._visit_all(document.children, context)
+        notes = [
+            document.notes.resolve(reference)
+            for reference in self._note_references
+        ]
+        notes_list = html.element("ol", {}, self._visit_all(notes, context))
+        comments = html.element("dl", {}, [
+            html_node
+            for referenced_comment in self._referenced_comments
+            for html_node in self.visit_comment(referenced_comment, context)
+        ])
+        return nodes + [notes_list, comments]
+
+
+    def visit_paragraph(self, paragraph, context):
+        """Wrap the paragraph's converted children in its HTML path."""
+        def children():
+            content = self._visit_all(paragraph.children, context)
+            if self._ignore_empty_paragraphs:
+                return content
+            else:
+                # Prepend html.force_write when empty paragraphs must be kept.
+                return [html.force_write] + content
+
+        html_path = self._find_html_path_for_paragraph(paragraph)
+        return html_path.wrap(children)
+
+
+    def visit_run(self, run, context):
+        """Convert a run, wrapping its children in one path per formatting property.
+
+        Paths are applied in the order they are collected, so the first one
+        (highlight) is innermost and the run's own style path is outermost.
+        """
+        nodes = lambda: self._visit_all(run.children, context)
+        paths = []
+        if run.highlight is not None:
+            style = self._find_style(Highlight(color=run.highlight), "highlight")
+            if style is not None:
+                paths.append(style.html_path)
+        if run.is_small_caps:
+            paths.append(self._find_style_for_run_property("small_caps"))
+        if run.is_all_caps:
+            paths.append(self._find_style_for_run_property("all_caps"))
+        if run.is_strikethrough:
+            paths.append(self._find_style_for_run_property("strikethrough", default="s"))
+        if run.is_underline:
+            paths.append(self._find_style_for_run_property("underline"))
+        if run.vertical_alignment == documents.VerticalAlignment.subscript:
+            paths.append(html_paths.element(["sub"], fresh=False))
+        if run.vertical_alignment == documents.VerticalAlignment.superscript:
+            paths.append(html_paths.element(["sup"], fresh=False))
+        if run.is_italic:
+            paths.append(self._find_style_for_run_property("italic", default="em"))
+        if run.is_bold:
+            paths.append(self._find_style_for_run_property("bold", default="strong"))
+        paths.append(self._find_html_path_for_run(run))
+
+        # Each path wraps the callable built so far; nothing is evaluated
+        # until the final nodes() call.
+        for path in paths:
+            nodes = partial(path.wrap, nodes)
+
+        return nodes()
+
+
+    def _find_style_for_run_property(self, element_type, default=None):
+        """Return the path for a run property such as ``"bold"``.
+
+        Uses the matching style if there is one, otherwise an element named
+        ``default`` when given, otherwise ``html_paths.empty``.
+        """
+        style = self._find_style(None, element_type)
+        if style is not None:
+            return style.html_path
+        elif default is not None:
+            return html_paths.element(default, fresh=False)
+        else:
+            return html_paths.empty
+
+
+    def visit_text(self, text, context):
+        """Convert a text element into a single text node."""
+        return [html.text(text.value)]
+
+
+    def visit_hyperlink(self, hyperlink, context):
+        """Convert a hyperlink into an ``a`` element.
+
+        The href is ``hyperlink.href`` unless an anchor is set, in which case
+        it is a fragment pointing at the prefixed anchor id.
+        """
+        if hyperlink.anchor is None:
+            href = hyperlink.href
+        else:
+            href = "#{0}".format(self._html_id(hyperlink.anchor))
+
+        attributes = {"href": href}
+        if hyperlink.target_frame is not None:
+            attributes["target"] = hyperlink.target_frame
+
+        nodes = self._visit_all(hyperlink.children, context)
+        return [html.collapsible_element("a", attributes, nodes)]
+
+
+    def visit_checkbox(self, checkbox, context):
+        """Convert a checkbox into an ``input`` of type ``checkbox``."""
+        attributes = {"type": "checkbox"}
+
+        if checkbox.checked:
+            attributes["checked"] = "checked"
+
+        return [html.element("input", attributes)]
+
+
+    def visit_bookmark(self, bookmark, context):
+        """Convert a bookmark into an ``a`` element whose id is the prefixed name."""
+        element = html.collapsible_element(
+            "a",
+            {"id": self._html_id(bookmark.name)},
+            [html.force_write])
+        return [element]
+
+
+    def visit_tab(self, tab, context):
+        """Convert a tab into a text node containing a tab character."""
+        return [html.text("\t")]
+
+    # Path used by visit_table when no style matches the table.
+    _default_table_path = html_paths.path([html_paths.element(["table"], fresh=True)])
+
+    def visit_table(self, table, context):
+        """Wrap the converted table children in the table's HTML path."""
+        return self._find_html_path(table, "table", self._default_table_path) \
+            .wrap(lambda: self._convert_table_children(table, context))
+
+    def _convert_table_children(self, table, context):
+        """Convert table rows, splitting leading header rows into ``thead``.
+
+        ``body_index`` is the index of the first child that is not a header
+        row. When the table has no leading header rows the children are
+        emitted directly, without ``thead``/``tbody`` wrappers.
+        """
+        body_index = find_index(
+            lambda child: not isinstance(child, documents.TableRow) or not child.is_header,
+            table.children,
+        )
+        # A None index is treated as "every child is a header row".
+        if body_index is None:
+            body_index = len(table.children)
+
+        if body_index == 0:
+            children = self._visit_all(table.children, context.copy(is_table_header=False))
+        else:
+            head_rows = self._visit_all(table.children[:body_index], context.copy(is_table_header=True))
+            body_rows = self._visit_all(table.children[body_index:], context.copy(is_table_header=False))
+            children = [
+                html.element("thead", {}, head_rows),
+                html.element("tbody", {}, body_rows),
+            ]
+
+        return [html.force_write] + children
+
+
+    def visit_table_row(self, table_row, context):
+        """Convert a table row into a ``tr`` element."""
+        return [html.element("tr", {}, [html.force_write] + self._visit_all(table_row.children, context))]
+
+
+    def visit_table_cell(self, table_cell, context):
+        """Convert a cell into ``th`` (in a table header) or ``td``.
+
+        ``colspan`` and ``rowspan`` attributes are only emitted when they
+        differ from 1.
+        """
+        if context.is_table_header:
+            tag_name = "th"
+        else:
+            tag_name = "td"
+        attributes = {}
+        if table_cell.colspan != 1:
+            attributes["colspan"] = str(table_cell.colspan)
+        if table_cell.rowspan != 1:
+            attributes["rowspan"] = str(table_cell.rowspan)
+        nodes = [html.force_write] + self._visit_all(table_cell.children, context)
+        return [
+            html.element(tag_name, attributes, nodes)
+        ]
+
+
+    def visit_break(self, break_, context):
+        """Convert a break by wrapping no children in its HTML path."""
+        return self._find_html_path_for_break(break_).wrap(lambda: [])
+
+
+    def _find_html_path_for_break(self, break_):
+        """Return the path for a break.
+
+        Uses the matching style if any, otherwise ``br`` for line breaks and
+        ``html_paths.empty`` for every other break type.
+        """
+        style = self._find_style(break_, "break")
+        if style is not None:
+            return style.html_path
+        elif break_.break_type == "line":
+            return html_paths.path([html_paths.element("br", fresh=True)])
+        else:
+            return html_paths.empty
+
+
+    def visit_note_reference(self, note_reference, context):
+        """Record the reference and emit a superscript link to its note.
+
+        The displayed number is the count of references recorded so far.
+        """
+        self._note_references.append(note_reference)
+        note_number = len(self._note_references)
+        return [
+            html.element("sup", {}, [
+                html.element("a", {
+                    "href": "#" + self._note_html_id(note_reference),
+                    "id": self._note_ref_html_id(note_reference),
+                }, [html.text("[{0}]".format(note_number))])
+            ])
+        ]
+
+
+    def visit_note(self, note, context):
+        """Convert a note into an ``li`` ending with a link back to its reference."""
+        note_body = self._visit_all(note.body, context) + [
+            html.collapsible_element("p", {}, [
+                html.text(" "),
+                html.element("a", {"href": "#" + self._note_ref_html_id(note)}, [
+                    html.text(_up_arrow)
+                ]),
+            ])
+        ]
+        return [
+            html.element("li", {"id": self._note_html_id(note)}, note_body)
+        ]
+
+
+    def visit_comment_reference(self, reference, context):
+        """Convert a comment reference using the ``comment_reference`` path.
+
+        The default path is ``html_paths.ignore``. The comment is recorded in
+        ``_referenced_comments`` only when the path calls ``nodes``.
+        """
+        def nodes():
+            comment = self._comments[reference.comment_id]
+            count = len(self._referenced_comments) + 1
+            label = "[{0}{1}]".format(_comment_author_label(comment), count)
+            self._referenced_comments.append((label, comment))
+            return [
+                # TODO: remove duplication with note references
+                html.element("a", {
+                    "href": "#" + self._referent_html_id("comment", reference.comment_id),
+                    "id": self._reference_html_id("comment", reference.comment_id),
+                }, [html.text(label)])
+            ]
+
+        html_path = self._find_html_path(
+            None,
+            "comment_reference",
+            default=html_paths.ignore,
+        )
+
+        return html_path.wrap(nodes)
+
+    def visit_comment(self, referenced_comment, context):
+        """Convert a ``(label, comment)`` pair into ``dt`` and ``dd`` elements."""
+        label, comment = referenced_comment
+        # TODO remove duplication with notes
+        body = self._visit_all(comment.body, context) + [
+            html.collapsible_element("p", {}, [
+                html.text(" "),
+                html.element("a", {"href": "#" + self._reference_html_id("comment", comment.comment_id)}, [
+                    html.text(_up_arrow)
+                ]),
+            ])
+        ]
+        return [
+            html.element(
+                "dt",
+                {"id": self._referent_html_id("comment", comment.comment_id)},
+                [html.text("Comment {0}".format(label))],
+            ),
+            html.element("dd", {}, body),
+        ]
+
+
+    def _visit_all(self, elements, context):
+        """Visit each element and concatenate the resulting node lists."""
+        return [
+            html_node
+            for element in elements
+            for html_node in self.visit(element, context)
+        ]
+
+
+    def _find_html_path_for_paragraph(self, paragraph):
+        """Return the paragraph's path, defaulting to ``p`` and warning on unknown styles."""
+        default = html_paths.path([html_paths.element("p", fresh=True)])
+        return self._find_html_path(paragraph, "paragraph", default, warn_unrecognised=True)
+
+    def _find_html_path_for_run(self, run):
+        """Return the run's path, defaulting to the empty path and warning on unknown styles."""
+        return self._find_html_path(run, "run", default=html_paths.empty, warn_unrecognised=True)
+
+
+    def _find_html_path(self, element, element_type, default, warn_unrecognised=False):
+        """Return the path of the first matching style, or ``default``.
+
+        When nothing matches, ``warn_unrecognised`` is set and the element has
+        a non-None ``style_id``, a warning naming the style is recorded.
+        """
+        style = self._find_style(element, element_type)
+        if style is not None:
+            return style.html_path
+
+        # getattr: element may be None or lack a style_id attribute.
+        if warn_unrecognised and getattr(element, "style_id", None) is not None:
+            self._messages.append(results.warning(
+                "Unrecognised {0} style: {1} (Style ID: {2})".format(
+                    element_type, element.style_name, element.style_id)
+            ))
+
+        return default
+
+    def _find_style(self, element, element_type):
+        """Return the first style in the style map that matches, or None."""
+        for style in self._style_map:
+            document_matcher = style.document_matcher
+            if _document_matcher_matches(document_matcher, element, element_type):
+                return style
+
+    def _note_html_id(self, note):
+        """Return the HTML id of the note itself (the referent)."""
+        return self._referent_html_id(note.note_type, note.note_id)
+
+    def _note_ref_html_id(self, note):
+        """Return the HTML id of the in-text reference to the note."""
+        return self._reference_html_id(note.note_type, note.note_id)
+
+    def _referent_html_id(self, reference_type, reference_id):
+        """Return the prefixed id ``<type>-<id>``."""
+        return self._html_id("{0}-{1}".format(reference_type, reference_id))
+
+    def _reference_html_id(self, reference_type, reference_id):
+        """Return the prefixed id ``<type>-ref-<id>``."""
+        return self._html_id("{0}-ref-{1}".format(reference_type, reference_id))
+
+    def _html_id(self, suffix):
+        """Prepend the configured id prefix to ``suffix``."""
+        return "{0}{1}".format(self._id_prefix, suffix)
+
+
+@cobble.data
+class Highlight:
+    """Stand-in element holding a run's highlight colour, used when matching highlight styles."""
+    color = cobble.field()
+
+
+def _document_matcher_matches(matcher, element, element_type):
+    """Return a truthy value when ``matcher`` applies to ``element``.
+
+    The matcher's ``element_type`` must always equal ``element_type``. Beyond
+    that:
+
+    * run-property and comment-reference matchers have no further conditions
+      (``element`` is not inspected);
+    * highlight matchers also compare ``color``, unless the matcher's colour
+      is None;
+    * break matchers also compare ``break_type``;
+    * all other matchers compare ``style_id`` and ``style_name`` when set on
+      the matcher, and, for paragraphs only, ``numbering``.
+    """
+    if matcher.element_type in ["underline", "strikethrough", "all_caps", "small_caps", "bold", "italic", "comment_reference"]:
+        return matcher.element_type == element_type
+    elif matcher.element_type == "highlight":
+        return (
+            matcher.element_type == element_type and
+            (matcher.color is None or matcher.color == element.color)
+        )
+    elif matcher.element_type == "break":
+        return (
+            matcher.element_type == element_type and
+            matcher.break_type == element.break_type
+        )
+    else: # matcher.element_type in ["paragraph", "run"]:
+        # The element_type check comes first, so attributes of ``element``
+        # are only read once the types agree.
+        return (
+            matcher.element_type == element_type and (
+                matcher.style_id is None or
+                matcher.style_id == element.style_id
+            ) and (
+                matcher.style_name is None or
+                element.style_name is not None and (matcher.style_name.matches(element.style_name))
+            ) and (
+                element_type != "paragraph" or
+                matcher.numbering is None or
+                matcher.numbering == element.numbering
+            )
+        )
+
+
+def _comment_author_label(comment):
+    """Return the comment author's initials, or ``""`` when they are falsy."""
+    initials = comment.author_initials
+    return initials if initials else ""
+
+
+# Text of the link that leads from a note or comment back to its reference.
+_up_arrow = "↑"
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/document_matchers.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/document_matchers.py
@@ -0,0 +1,95 @@
+import collections
+
+import cobble
+
+
+def paragraph(style_id=None, style_name=None, numbering=None):
+    """Build a paragraph matcher; criteria left as None are stored as None."""
+    return ParagraphMatcher(
+        style_id=style_id,
+        style_name=style_name,
+        numbering=numbering,
+    )
+
+
+# Matcher record for paragraphs; ``element_type`` is shared by all instances.
+ParagraphMatcher = collections.namedtuple("ParagraphMatcher", ["style_id", "style_name", "numbering"])
+ParagraphMatcher.element_type = "paragraph"
+
+
+def run(style_id=None, style_name=None):
+    """Build a run matcher; criteria left as None are stored as None."""
+    return RunMatcher(style_id=style_id, style_name=style_name)
+
+
+# Matcher record for runs; ``element_type`` is shared by all instances.
+RunMatcher = collections.namedtuple("RunMatcher", ["style_id", "style_name"])
+RunMatcher.element_type = "run"
+
+
+def table(style_id=None, style_name=None):
+    """Build a table matcher; criteria left as None are stored as None."""
+    return TableMatcher(style_id=style_id, style_name=style_name)
+
+
+# Matcher record for tables; ``element_type`` is shared by all instances.
+TableMatcher = collections.namedtuple("TableMatcher", ["style_id", "style_name"])
+TableMatcher.element_type = "table"
+
+
+# Marker matchers for run formatting properties. Each one only carries an
+# ``element_type`` tag.
+
+# Matches bold formatting.
+class bold(object):
+    element_type = "bold"
+
+
+# Matches italic formatting.
+class italic(object):
+    element_type = "italic"
+
+
+# Matches underline formatting.
+class underline(object):
+    element_type = "underline"
+
+
+# Matches strikethrough formatting.
+class strikethrough(object):
+    element_type = "strikethrough"
+
+
+# Matches all-caps formatting.
+class all_caps(object):
+    element_type = "all_caps"
+
+
+# Matches small-caps formatting.
+class small_caps(object):
+    element_type = "small_caps"
+
+
+def highlight(color=None):
+    """Build a highlight matcher for ``color``; None is stored as-is."""
+    return HighlightMatcher(color)
+
+
+# Matcher record for highlights; ``element_type`` is shared by all instances.
+HighlightMatcher = collections.namedtuple("HighlightMatcher", ["color"])
+HighlightMatcher.element_type = "highlight"
+
+# Marker matcher for comment references; carries only its ``element_type`` tag.
+class comment_reference(object):
+    element_type = "comment_reference"
+
+
+# Matcher record for breaks, identified by their ``break_type`` string.
+BreakMatcher = collections.namedtuple("BreakMatcher", ["break_type"])
+BreakMatcher.element_type = "break"
+
+
+# Ready-made matchers for the three break types named here.
+line_break = BreakMatcher("line")
+page_break = BreakMatcher("page")
+column_break = BreakMatcher("column")
+
+
+def equal_to(value):
+    """Return a ``StringMatcher`` that compares against ``value`` with ``_operator_equal_to``."""
+    return StringMatcher(_operator_equal_to, value)
+
+
+def _operator_equal_to(first, second):
+    """Return whether the two strings are equal once both are upper-cased."""
+    upper_first = first.upper()
+    upper_second = second.upper()
+    return upper_first == upper_second
+
+
+def starts_with(value):
+    """Return a ``StringMatcher`` that compares against ``value`` with ``_operator_starts_with``."""
+    return StringMatcher(_operator_starts_with, value)
+
+def _operator_starts_with(first, second):
+    """Return whether ``second`` starts with ``first`` once both are upper-cased."""
+    candidate = second.upper()
+    prefix = first.upper()
+    return candidate.startswith(prefix)
+
+
+@cobble.data
+class StringMatcher(object):
+    """Pairs a two-argument comparison function with the value to compare against."""
+    # Callable invoked as operator(value, other).
+    operator = cobble.field()
+    value = cobble.field()
+
+    def matches(self, other):
+        """Return ``operator(value, other)``."""
+        return self.operator(self.value, other)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/documents.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/documents.py
@@ -0,0 +1,286 @@
+import cobble
+
+
+class Element(object):
+    """Base class for document elements; also the base type of ``element_visitor``."""
+    def copy(self, **kwargs):
+        """Return ``cobble.copy`` of this element, forwarding ``kwargs``."""
+        return cobble.copy(self, **kwargs)
+
+
+class HasChildren(Element):
+    """Base class for elements that hold child elements."""
+    children = cobble.field()
+
+
+@cobble.data
+class Document(HasChildren):
+    """Root element of a document: body children plus notes and comments."""
+    # A ``Notes`` collection when built through the ``document`` factory.
+    notes = cobble.field()
+    # A list of comments when built through the ``document`` factory.
+    comments = cobble.field()
+
+@cobble.data
+class Paragraph(HasChildren):
+    """Paragraph element with its style, numbering, alignment and indent."""
+    style_id = cobble.field()
+    style_name = cobble.field()
+    numbering = cobble.field()
+    alignment = cobble.field()
+    # A ``ParagraphIndent`` when built through the ``paragraph`` factory.
+    indent = cobble.field()
+
+
+@cobble.data
+class ParagraphIndent(object):
+    """Indentation values of a paragraph, keyed by start/end."""
+    start = cobble.field()
+    end = cobble.field()
+    first_line = cobble.field()
+    hanging = cobble.field()
+
+
+@cobble.data
+class Indent(object):
+    """Indentation values keyed by left/right."""
+    # NOTE(review): nothing in the visible part of this file uses this class.
+    left = cobble.field()
+    right = cobble.field()
+    first_line = cobble.field()
+    hanging = cobble.field()
+
+
+@cobble.data
+class Run(HasChildren):
+    """Run element: children that share one set of formatting properties."""
+    style_id = cobble.field()
+    style_name = cobble.field()
+    # The ``is_*`` flags are coerced to bool by the ``run`` factory.
+    is_bold = cobble.field()
+    is_italic = cobble.field()
+    is_underline = cobble.field()
+    is_strikethrough = cobble.field()
+    is_all_caps = cobble.field()
+    is_small_caps = cobble.field()
+    # Compared against the ``VerticalAlignment`` constants.
+    vertical_alignment = cobble.field()
+    font = cobble.field()
+    font_size = cobble.field()
+    # Highlight colour, or None.
+    highlight = cobble.field()
+
+@cobble.data
+class Text(Element):
+    """Text element; ``value`` holds the text."""
+    value = cobble.field()
+
+@cobble.data
+class Hyperlink(HasChildren):
+    """Hyperlink element with an ``href``, an ``anchor`` and a ``target_frame``."""
+    href = cobble.field()
+    anchor = cobble.field()
+    target_frame = cobble.field()
+
+@cobble.data
+class Checkbox(Element):
+    """Checkbox element; ``checked`` records its state."""
+    checked = cobble.field()
+
+# Lower-case factory alias for Checkbox.
+checkbox = Checkbox
+
+@cobble.data
+class Table(HasChildren):
+    """Table element with an optional style."""
+    style_id = cobble.field()
+    style_name = cobble.field()
+
+@cobble.data
+class TableRow(HasChildren):
+    """Table row; ``is_header`` marks header rows."""
+    is_header = cobble.field()
+
+@cobble.data
+class TableCell(HasChildren):
+    """Table cell with its column and row spans."""
+    # Both spans default to 1 in the ``table_cell`` factory.
+    colspan = cobble.field()
+    rowspan = cobble.field()
+
+@cobble.data
+class TableCellUnmerged:
+    """Table cell that additionally carries a ``vmerge`` value.
+
+    It does not derive from ``Element``, so it defines the visitor hook and
+    ``copy`` itself.
+    """
+    children = cobble.field()
+    colspan = cobble.field()
+    rowspan = cobble.field()
+    vmerge = cobble.field()
+
+    def _accept1(self, visitor, arg0):
+        # One-argument visitors handle this as an ordinary table cell.
+        return visitor.visit_table_cell(self, arg0)
+
+    def copy(self, **kwargs):
+        """Return ``cobble.copy`` of this cell, forwarding ``kwargs``."""
+        return cobble.copy(self, **kwargs)
+
+@cobble.data
+class Break(Element):
+    """Break element, identified by its ``break_type`` string."""
+    break_type = cobble.field()
+
+# Shared instances for the three break types named here.
+line_break = Break("line")
+page_break = Break("page")
+column_break = Break("column")
+
+
+@cobble.data
+class Tab(Element):
+    """Tab element; it has no fields."""
+    pass
+
+
+@cobble.data
+class Image(Element):
+    """Image element: alt text, content type and a way to open its data."""
+    alt_text = cobble.field()
+    # presumably a MIME type such as "image/png" -- verify against the reader
+    content_type = cobble.field()
+    # Callable that opens the image data.
+    open = cobble.field()
+
+
+def document(children, notes=None, comments=None):
+    """Build a ``Document``.
+
+    ``notes`` defaults to an empty ``Notes`` collection and ``comments`` to an
+    empty list.
+    """
+    resolved_notes = Notes({}) if notes is None else notes
+    resolved_comments = [] if comments is None else comments
+    return Document(children, resolved_notes, comments=resolved_comments)
+
+def paragraph(children, style_id=None, style_name=None, numbering=None, alignment=None, indent=None):
+    """Build a ``Paragraph``; ``indent`` defaults to ``paragraph_indent()``."""
+    resolved_indent = paragraph_indent() if indent is None else indent
+    return Paragraph(
+        children,
+        style_id,
+        style_name,
+        numbering,
+        alignment=alignment,
+        indent=resolved_indent,
+    )
+
+def paragraph_indent(start=None, end=None, first_line=None, hanging=None):
+    """Build a ``ParagraphIndent``; every value defaults to None."""
+    return ParagraphIndent(start=start, end=end, first_line=first_line, hanging=hanging)
+
+def run(
+    children,
+    style_id=None,
+    style_name=None,
+    is_bold=None,
+    is_italic=None,
+    is_underline=None,
+    is_strikethrough=None,
+    is_all_caps=None,
+    is_small_caps=None,
+    vertical_alignment=None,
+    font=None,
+    font_size=None,
+    highlight=None,
+):
+    """Build a ``Run``.
+
+    Every ``is_*`` flag is coerced to ``bool`` (so None becomes False) and
+    ``vertical_alignment`` defaults to ``VerticalAlignment.baseline``.
+    """
+    if vertical_alignment is None:
+        vertical_alignment = VerticalAlignment.baseline
+
+    formatting_flags = dict(
+        is_bold=bool(is_bold),
+        is_italic=bool(is_italic),
+        is_underline=bool(is_underline),
+        is_strikethrough=bool(is_strikethrough),
+        is_all_caps=bool(is_all_caps),
+        is_small_caps=bool(is_small_caps),
+    )
+    return Run(
+        children=children,
+        style_id=style_id,
+        style_name=style_name,
+        vertical_alignment=vertical_alignment,
+        font=font,
+        font_size=font_size,
+        highlight=highlight,
+        **formatting_flags
+    )
+
+class VerticalAlignment(object):
+    """String constants for a run's ``vertical_alignment``."""
+    baseline = "baseline"
+    superscript = "superscript"
+    subscript = "subscript"
+
+# Lower-case factory alias for Text.
+text = Text
+
+# Single shared Tab instance returned by every tab() call.
+_tab = Tab()
+
+def tab():
+    """Return the shared ``Tab`` instance."""
+    return _tab
+
+
+# Lower-case factory alias for Image.
+image = Image
+
+def hyperlink(children, href=None, anchor=None, target_frame=None):
+    """Build a ``Hyperlink``; href, anchor and target_frame default to None."""
+    return Hyperlink(href=href, anchor=anchor, target_frame=target_frame, children=children)
+
+
+@cobble.data
+class Bookmark(Element):
+    """Bookmark element, identified by ``name``."""
+    name = cobble.field()
+
+# Lower-case factory alias for Bookmark.
+bookmark = Bookmark
+
+
+def table(children, style_id=None, style_name=None):
+    """Build a ``Table``; the style fields default to None."""
+    return Table(children=children, style_id=style_id, style_name=style_name)
+
+def table_row(children, is_header=None):
+    """Build a ``TableRow``; ``is_header`` is coerced to bool (None becomes False)."""
+    return TableRow(children=children, is_header=bool(is_header))
+
+def table_cell(children, colspan=None, rowspan=None):
+    """Build a ``TableCell``; a span left as None becomes 1."""
+    return TableCell(
+        children=children,
+        colspan=1 if colspan is None else colspan,
+        rowspan=1 if rowspan is None else rowspan,
+    )
+
+def table_cell_unmerged(children, colspan, rowspan, vmerge):
+    """Build a ``TableCellUnmerged``; all values are required and stored as given."""
+    return TableCellUnmerged(children=children, colspan=colspan, rowspan=rowspan, vmerge=vmerge)
+
+def numbering_level(level_index, is_ordered):
+    """Build a numbering level, storing the index as ``str`` and the flag as ``bool``."""
+    return _NumberingLevel(str(level_index), bool(is_ordered))
+
+@cobble.data
+class _NumberingLevel(object):
+    """Numbering level of a list item."""
+    # Stored as a string by the ``numbering_level`` factory.
+    level_index = cobble.field()
+    is_ordered = cobble.field()
+
+@cobble.data
+class Note(Element):
+    """Note element, identified by ``(note_type, note_id)``, with its body."""
+    note_type = cobble.field()
+    note_id = cobble.field()
+    body = cobble.field()
+
+
+# Lower-case factory alias for Note.
+note = Note
+
+
+class Notes(object):
+    """Collection of notes keyed by ``(note_type, note_id)``."""
+
+    def __init__(self, notes):
+        self._notes = notes
+
+    def find_note(self, note_type, note_id):
+        """Return the note stored under ``(note_type, note_id)``.
+
+        Raises ``KeyError`` when the mapping has no such key.
+        """
+        key = (note_type, note_id)
+        return self._notes[key]
+
+    def resolve(self, reference):
+        """Return the note that ``reference`` points at."""
+        return self.find_note(reference.note_type, reference.note_id)
+
+    def __eq__(self, other):
+        if not isinstance(other, Notes):
+            return False
+        return self._notes == other._notes
+
+    def __ne__(self, other):
+        # Defined explicitly because Python 2 does not derive it from __eq__.
+        return not (self == other)
+
+def notes(notes_list):
+    """Build a ``Notes`` collection, keying each note by ``_note_key``."""
+    notes_by_key = {
+        _note_key(entry): entry
+        for entry in notes_list
+    }
+    return Notes(notes_by_key)
+
+def _note_key(note):
+    """Return the ``(note_type, note_id)`` key of ``note``."""
+    key = (note.note_type, note.note_id)
+    return key
+
+@cobble.data
+class NoteReference(Element):
+    """Reference to a note, identified by ``(note_type, note_id)``."""
+    note_type = cobble.field()
+    note_id = cobble.field()
+
+# Lower-case factory alias for NoteReference.
+note_reference = NoteReference
+
+
+@cobble.data
+class Comment(object):
+    """Comment with its id, body and author details."""
+    comment_id = cobble.field()
+    body = cobble.field()
+    author_name = cobble.field()
+    author_initials = cobble.field()
+
+def comment(comment_id, body, author_name=None, author_initials=None):
+    """Build a ``Comment``; the author fields default to None."""
+    return Comment(
+        comment_id=comment_id,
+        body=body,
+        author_name=author_name,
+        author_initials=author_initials,
+    )
+
+@cobble.data
+class CommentReference(Element):
+    """Reference to a comment, identified by ``comment_id``."""
+    comment_id = cobble.field()
+
+# Lower-case factory alias for CommentReference.
+comment_reference = CommentReference
+
+def element_visitor(args):
+    """Return ``cobble.visitor(Element, args=args)``, for use as a visitor base class."""
+    return cobble.visitor(Element, args=args)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/init.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/init.py
@@ -0,0 +1,211 @@
+from functools import partial
+import os
+
+import cobble
+
+from .. import results, lists, zips
+from .document_xml import read_document_xml_element
+from .content_types_xml import empty_content_types, read_content_types_xml_element
+from .relationships_xml import read_relationships_xml_element, Relationships
+from .numbering_xml import read_numbering_xml_element, Numbering
+from .styles_xml import read_styles_xml_element, Styles
+from .notes_xml import read_endnotes_xml_element, read_footnotes_xml_element
+from .comments_xml import read_comments_xml_element
+from .files import Files
+from . import body_xml, office_xml
+from ..zips import open_zip
+
+
+# Shared default for optional parts (notes, comments) that are absent from the
+# package: a successful result wrapping an empty list.
+_empty_result = results.success([])
+
+
+def read(fileobj, external_file_access=False):
+    zip_file = open_zip(fileobj, "r")
+    part_paths = _find_part_paths(zip_file)
+    read_part_with_body = _part_with_body_reader(
+        getattr(fileobj, "name", None),
+        zip_file,
+        part_paths=part_paths,
+        external_file_access=external_file_access,
+    )
+
+    return results.combine([
+        _read_notes(read_part_with_body, part_paths),
+        _read_comments(read_part_with_body, part_paths),
+    ]).bind(lambda referents:
+        _read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], part_paths=part_paths)
+    )
+
+
+@cobble.data
+class _PartPaths(object):
+    """Zip entry paths of the package parts, as resolved by ``_find_part_paths``."""
+    # Path of the main document part.
+    main_document = cobble.field()
+    # Paths of the related parts; each falls back to "word/<name>.xml" when no
+    # relationship target exists in the zip (see _find_part_paths).
+    comments = cobble.field()
+    endnotes = cobble.field()
+    footnotes = cobble.field()
+    numbering = cobble.field()
+    styles = cobble.field()
+
+
+def _find_part_paths(zip_file):
+    package_relationships = _read_relationships(zip_file, "_rels/.rels")
+    document_filename = _find_document_filename(zip_file, package_relationships)
+
+    document_relationships = _read_relationships(
+        zip_file,
+        _find_relationships_path_for(document_filename),
+    )
+
+    def find(name):
+        return _find_part_path(
+            zip_file=zip_file,
+            relationships=document_relationships,
+            relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/" + name,
+            fallback_path="word/{0}.xml".format(name),
+            base_path=zips.split_path(document_filename)[0],
+        )
+
+    return _PartPaths(
+        main_document=document_filename,
+        comments=find("comments"),
+        endnotes=find("endnotes"),
+        footnotes=find("footnotes"),
+        numbering=find("numbering"),
+        styles=find("styles"),
+    )
+
+
+def _find_document_filename(zip_file, relationships):
+    path = _find_part_path(
+        zip_file,
+        relationships,
+        relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
+        base_path="",
+        fallback_path="word/document.xml",
+    )
+    if zip_file.exists(path):
+        return path
+    else:
+        raise IOError("Could not find main document part. Are you sure this is a valid .docx file?")
+
+
+def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
+    targets = [
+        zips.join_path(base_path, target).lstrip("/")
+        for target in relationships.find_targets_by_type(relationship_type)
+    ]
+    valid_targets = list(filter(lambda target: zip_file.exists(target), targets))
+    if len(valid_targets) == 0:
+        return fallback_path
+    else:
+        return valid_targets[0]
+
+
+def _read_notes(read_part_with_body, part_paths):
+    footnotes = read_part_with_body(
+        part_paths.footnotes,
+        lambda root, body_reader: read_footnotes_xml_element(root, body_reader=body_reader),
+        default=_empty_result,
+    )
+    endnotes = read_part_with_body(
+        part_paths.endnotes,
+        lambda root, body_reader: read_endnotes_xml_element(root, body_reader=body_reader),
+        default=_empty_result,
+    )
+
+    return results.combine([footnotes, endnotes]).map(lists.flatten)
+
+
+def _read_comments(read_part_with_body, part_paths):
+    return read_part_with_body(
+        part_paths.comments,
+        lambda root, body_reader: read_comments_xml_element(root, body_reader=body_reader),
+        default=_empty_result,
+    )
+
+
+def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
+    return read_part_with_body(
+        part_paths.main_document,
+        partial(
+            read_document_xml_element,
+            notes=notes,
+            comments=comments,
+        ),
+    )
+
+
+def _part_with_body_reader(document_path, zip_file, part_paths, external_file_access):
+    content_types = _try_read_entry_or_default(
+        zip_file,
+        "[Content_Types].xml",
+        read_content_types_xml_element,
+        empty_content_types,
+    )
+
+    styles = _try_read_entry_or_default(
+        zip_file,
+        part_paths.styles,
+        read_styles_xml_element,
+        Styles.EMPTY,
+    )
+
+    numbering = _try_read_entry_or_default(
+        zip_file,
+        part_paths.numbering,
+        lambda element: read_numbering_xml_element(element, styles=styles),
+        default=Numbering.EMPTY,
+    )
+
+    files = Files(
+        None if document_path is None else os.path.dirname(document_path),
+        external_file_access=external_file_access,
+    )
+
+    def read_part(name, reader, default=_undefined):
+        relationships = _read_relationships(zip_file, _find_relationships_path_for(name))
+
+        body_reader = body_xml.reader(
+            numbering=numbering,
+            content_types=content_types,
+            relationships=relationships,
+            styles=styles,
+            docx_file=zip_file,
+            files=files,
+        )
+
+        if default is _undefined:
+            return _read_entry(zip_file, name, partial(reader, body_reader=body_reader))
+        else:
+            return _try_read_entry_or_default(zip_file, name, partial(reader, body_reader=body_reader), default=default)
+
+    return read_part
+
+
+
+def _find_relationships_path_for(name):
+    dirname, basename = zips.split_path(name)
+    return zips.join_path(dirname, "_rels", basename + ".rels")
+
+
+def _read_relationships(zip_file, name):
+    return _try_read_entry_or_default(
+        zip_file,
+        name,
+        read_relationships_xml_element,
+        default=Relationships.EMPTY,
+    )
+
+def _try_read_entry_or_default(zip_file, name, reader, default):
+    if zip_file.exists(name):
+        return _read_entry(zip_file, name, reader)
+    else:
+        return default
+
+
+def _read_entry(zip_file, name, reader):
+    with zip_file.open(name) as fileobj:
+        return reader(office_xml.read(fileobj))
+
+
+# Sentinel meaning "no default supplied" for read_part, so that any real
+# value (including None) can be passed as an explicit default.
+_undefined = object()
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/init.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/init.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/body_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/body_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/comments_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/comments_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/complex_fields.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/complex_fields.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/content_types_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/content_types_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/dingbats.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/dingbats.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/document_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/document_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/files.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/files.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/notes_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/notes_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/numbering_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/numbering_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/office_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/office_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/relationships_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/relationships_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/style_map.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/style_map.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/styles_xml.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/styles_xml.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/uris.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/uris.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/xmlparser.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/pycache/xmlparser.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/body_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/body_xml.py
@@ -0,0 +1,794 @@
+import contextlib
+import re
+import sys
+
+from .. import documents
+from .. import results
+from .. import lists
+from .. import transforms
+from . import complex_fields
+from .dingbats import dingbats
+from .xmlparser import node_types, XmlElement, null_xml_element
+from .styles_xml import Styles
+from .uris import replace_fragment, uri_to_zip_entry_name
+
+# Python 3 has no unichr builtin; alias it to chr so code below can call unichr.
+if sys.version_info >= (3, ):
+    unichr = chr
+
+
+def reader(
+    numbering=None,
+    content_types=None,
+    relationships=None,
+    styles=None,
+    docx_file=None,
+    files=None
+):
+
+    if styles is None:
+        styles = Styles.EMPTY
+
+    read_all = _create_reader(
+        numbering=numbering,
+        content_types=content_types,
+        relationships=relationships,
+        styles=styles,
+        docx_file=docx_file,
+        files=files,
+    )
+    return _BodyReader(read_all)
+
+
+
+class _BodyReader(object):
+    def __init__(self, read_all):
+        self._read_all = read_all
+
+    def read_all(self, elements):
+        result = self._read_all(elements)
+        return results.Result(result.elements, result.messages)
+
+
+def _create_reader(numbering, content_types, relationships, styles, docx_file, files):
+    current_instr_text = []
+    complex_field_stack = []
+
+    # When a paragraph is marked as deleted, its contents should be combined
+    # with the following paragraph. See 17.13.5.15 del (Deleted Paragraph) of
+    # ECMA-376 4th edition Part 1.
+    deleted_paragraph_contents = []
+
+    _ignored_elements = set([
+        "office-word:wrap",
+        "v:shadow",
+        "v:shapetype",
+        "w:annotationRef",
+        "w:bookmarkEnd",
+        "w:sectPr",
+        "w:proofErr",
+        "w:lastRenderedPageBreak",
+        "w:commentRangeStart",
+        "w:commentRangeEnd",
+        "w:del",
+        "w:footnoteRef",
+        "w:endnoteRef",
+        "w:pPr",
+        "w:rPr",
+        "w:tblPr",
+        "w:tblGrid",
+        "w:trPr",
+        "w:tcPr",
+    ])
+
+    def text(element):
+        return _success(documents.Text(_inner_text(element)))
+
+    def run(element):
+        """Read a run element into a ``documents.run``.
+
+        Formatting is taken from the run's w:rPr child (treated as empty when
+        absent). If a hyperlink complex field is currently open, the run's
+        children are wrapped in a hyperlink built from that field's kwargs.
+        """
+        properties = element.find_child_or_null("w:rPr")
+        vertical_alignment = properties \
+            .find_child_or_null("w:vertAlign") \
+            .attributes.get("w:val")
+        font = properties.find_child_or_null("w:rFonts").attributes.get("w:ascii")
+
+        font_size_string = properties.find_child_or_null("w:sz").attributes.get("w:val")
+        if _is_int(font_size_string):
+            # w:sz gives the font size in half points, so halve the value to get the size in points
+            font_size = int(font_size_string) / 2
+        else:
+            font_size = None
+
+        # find_child (not find_child_or_null) is used so that an absent
+        # element reaches the readers below as None.
+        is_bold = read_boolean_element(properties.find_child("w:b"))
+        is_italic = read_boolean_element(properties.find_child("w:i"))
+        is_underline = read_underline_element(properties.find_child("w:u"))
+        is_strikethrough = read_boolean_element(properties.find_child("w:strike"))
+        is_all_caps = read_boolean_element(properties.find_child("w:caps"))
+        is_small_caps = read_boolean_element(properties.find_child("w:smallCaps"))
+        highlight = read_highlight_value(properties.find_child_or_null("w:highlight").attributes.get("w:val"))
+
+        def add_complex_field_hyperlink(children):
+            # Looks at the complex-field stack at the time the children are mapped.
+            hyperlink_kwargs = current_hyperlink_kwargs()
+            if hyperlink_kwargs is None:
+                return children
+            else:
+                return [documents.hyperlink(children=children, **hyperlink_kwargs)]
+
+        # style is the [style_id, style_name] pair produced by _read_style.
+        return _ReadResult.map_results(
+            _read_run_style(properties),
+            _read_xml_elements(element.children).map(add_complex_field_hyperlink),
+            lambda style, children: documents.run(
+                children=children,
+                style_id=style[0],
+                style_name=style[1],
+                is_bold=is_bold,
+                is_italic=is_italic,
+                is_underline=is_underline,
+                is_strikethrough=is_strikethrough,
+                is_all_caps=is_all_caps,
+                is_small_caps=is_small_caps,
+                vertical_alignment=vertical_alignment,
+                font=font,
+                font_size=font_size,
+                highlight=highlight,
+            ))
+
+    def _read_run_style(properties):
+        return _read_style(properties, "w:rStyle", "Run", styles.find_character_style_by_id)
+
+    def read_boolean_element(element):
+        if element is None:
+            return False
+        else:
+            return read_boolean_attribute_value(element.attributes.get("w:val"))
+
+    def read_boolean_attribute_value(value):
+        return value not in ["false", "0"]
+
+    def read_underline_element(element):
+        return element and element.attributes.get("w:val") not in [None, "false", "0", "none"]
+
+    def read_highlight_value(value):
+        if not value or value == "none":
+            return None
+        else:
+            return value
+
+    def paragraph(element):
+        """Read a paragraph element into a ``documents.paragraph``.
+
+        A paragraph marked as deleted (a w:del inside w:pPr/w:rPr) produces no
+        output; its children are buffered and prepended to the children of
+        the next paragraph that is not deleted.
+        """
+        properties = element.find_child_or_null("w:pPr")
+
+        # Despite the name, this holds the w:del element itself, or None.
+        is_deleted = properties.find_child_or_null("w:rPr").find_child("w:del")
+
+        if is_deleted is not None:
+            # Keep the raw XML children so they are read with the following paragraph.
+            for child in element.children:
+                deleted_paragraph_contents.append(child)
+            return _empty_result
+
+        else:
+            alignment = properties.find_child_or_null("w:jc").attributes.get("w:val")
+            indent = _read_paragraph_indent(properties.find_child_or_null("w:ind"))
+
+            children_xml = element.children
+            if deleted_paragraph_contents:
+                # Buffered children come first; the buffer is then emptied in place.
+                children_xml = deleted_paragraph_contents + children_xml
+                del deleted_paragraph_contents[:]
+
+            # style is the [style_id, style_name] pair produced by _read_style.
+            return _ReadResult.map_results(
+                _read_paragraph_style(properties),
+                _read_xml_elements(children_xml),
+                lambda style, children: documents.paragraph(
+                    children=children,
+                    style_id=style[0],
+                    style_name=style[1],
+                    numbering=_read_numbering_properties(
+                        paragraph_style_id=style[0],
+                        element=properties.find_child_or_null("w:numPr"),
+                    ),
+                    alignment=alignment,
+                    indent=indent,
+                )).append_extra()
+
+    def _read_paragraph_style(properties):
+        return _read_style(properties, "w:pStyle", "Paragraph", styles.find_paragraph_style_by_id)
+
+    def current_hyperlink_kwargs():
+        for complex_field in reversed(complex_field_stack):
+            if isinstance(complex_field, complex_fields.Hyperlink):
+                return complex_field.kwargs
+
+        return None
+
+    def read_fld_char(element):
+        fld_char_type = element.attributes.get("w:fldCharType")
+        if fld_char_type == "begin":
+            complex_field_stack.append(complex_fields.begin(fld_char=element))
+            del current_instr_text[:]
+
+        elif fld_char_type == "end":
+            complex_field = complex_field_stack.pop()
+            if isinstance(complex_field, complex_fields.Begin):
+                complex_field = parse_current_instr_text(complex_field)
+
+            if isinstance(complex_field, complex_fields.Checkbox):
+                return _success(documents.checkbox(checked=complex_field.checked))
+
+        elif fld_char_type == "separate":
+            complex_field_separate = complex_field_stack.pop()
+            complex_field = parse_current_instr_text(complex_field_separate)
+            complex_field_stack.append(complex_field)
+
+        return _empty_result
+
+    def parse_current_instr_text(complex_field):
+        instr_text = "".join(current_instr_text)
+
+        if isinstance(complex_field, complex_fields.Begin):
+            fld_char = complex_field.fld_char
+        else:
+            fld_char = null_xml_element
+
+        return parse_instr_text(instr_text, fld_char=fld_char)
+
+    def parse_instr_text(instr_text, *, fld_char):
+        external_link_result = re.match(r'\s*HYPERLINK "(.*)"', instr_text)
+        if external_link_result is not None:
+            return complex_fields.hyperlink(dict(href=external_link_result.group(1)))
+
+        internal_link_result = re.match(r'\s*HYPERLINK\s+\\l\s+"(.*)"', instr_text)
+        if internal_link_result is not None:
+            return complex_fields.hyperlink(dict(anchor=internal_link_result.group(1)))
+
+        checkbox_result = re.match(r'\s*FORMCHECKBOX\s*', instr_text)
+        if checkbox_result is not None:
+            checkbox_element = fld_char \
+                .find_child_or_null("w:ffData") \
+                .find_child_or_null("w:checkBox")
+            checked_element = checkbox_element.find_child("w:checked")
+
+            if checked_element is None:
+                checked = read_boolean_element(checkbox_element.find_child("w:default"))
+            else:
+                checked = read_boolean_element(checked_element)
+
+            return complex_fields.checkbox(checked=checked)
+
+        return None
+
+    def read_instr_text(element):
+        current_instr_text.append(_inner_text(element))
+        return _empty_result
+
+    def _read_style(properties, style_tag_name, style_type, find_style_by_id):
+        messages = []
+        style_id = properties \
+            .find_child_or_null(style_tag_name) \
+            .attributes.get("w:val")
+
+        if style_id is None:
+            style_name = None
+        else:
+            style = find_style_by_id(style_id)
+            if style is None:
+                style_name = None
+                messages.append(_undefined_style_warning(style_type, style_id))
+            else:
+                style_name = style.name
+
+        return _ReadResult([style_id, style_name], [], messages)
+
+    def _undefined_style_warning(style_type, style_id):
+        return results.warning("{0} style with ID {1} was referenced but not defined in the document".format(style_type, style_id))
+
+    def _read_numbering_properties(paragraph_style_id, element):
+        num_id = element.find_child_or_null("w:numId").attributes.get("w:val")
+        level_index = element.find_child_or_null("w:ilvl").attributes.get("w:val")
+        if num_id is not None and level_index is not None:
+            return numbering.find_level(num_id, level_index)
+
+        if paragraph_style_id is not None:
+            level = numbering.find_level_by_paragraph_style_id(paragraph_style_id)
+            if level is not None:
+                return level
+
+        # Some malformed documents define numbering levels without an index, and
+        # reference the numbering using a w:numPr element without a w:ilvl child.
+        # To handle such cases, we assume a level of 0 as a fallback.
+        if num_id is not None:
+            return numbering.find_level(num_id, "0")
+
+        return None
+
+    def _read_paragraph_indent(element):
+        attributes = element.attributes
+        return documents.paragraph_indent(
+            start=attributes.get("w:start") or attributes.get("w:left"),
+            end=attributes.get("w:end") or attributes.get("w:right"),
+            first_line=attributes.get("w:firstLine"),
+            hanging=attributes.get("w:hanging"),
+        )
+
+    def tab(element):
+        return _success(documents.tab())
+
+
+    def no_break_hyphen(element):
+        return _success(documents.text(unichr(0x2011)))
+
+
+    def soft_hyphen(element):
+        return _success(documents.text(u"\u00ad"))
+
+    def symbol(element):
+        # See 17.3.3.30 sym (Symbol Character) of ECMA-376 4th edition Part 1
+        font = element.attributes.get("w:font")
+        char = element.attributes.get("w:char")
+
+        unicode_code_point = dingbats.get((font, int(char, 16)))
+
+        if unicode_code_point is None and re.match("^F0..", char):
+            unicode_code_point = dingbats.get((font, int(char[2:], 16)))
+
+        if unicode_code_point is None:
+            warning = results.warning("A w:sym element with an unsupported character was ignored: char {0} in font {1}".format(
+                char,
+                font,
+            ))
+            return _empty_result_with_message(warning)
+        else:
+            return _success(documents.text(unichr(unicode_code_point)))
+
+
+    def table(element):
+        properties = element.find_child_or_null("w:tblPr")
+        return _ReadResult.map_results(
+            read_table_style(properties),
+            _read_xml_elements(element.children)
+                .flat_map(calculate_row_spans),
+
+            lambda style, children: documents.table(
+                children=children,
+                style_id=style[0],
+                style_name=style[1],
+            ),
+        )
+
+
+    def read_table_style(properties):
+        return _read_style(properties, "w:tblStyle", "Table", styles.find_table_style_by_id)
+
+
+    def table_row(element):
+        properties = element.find_child_or_null("w:trPr")
+
+        # See 17.13.5.12 del (Deleted Table Row) of ECMA-376 4th edition Part 1
+        is_deleted = bool(properties.find_child("w:del"))
+        if is_deleted:
+            return _empty_result
+
+        is_header = bool(properties.find_child("w:tblHeader"))
+        return _read_xml_elements(element.children) \
+            .map(lambda children: documents.table_row(
+                children=children,
+                is_header=is_header,
+            ))
+
+
+    def table_cell(element):
+        properties = element.find_child_or_null("w:tcPr")
+        gridspan = properties \
+            .find_child_or_null("w:gridSpan") \
+            .attributes.get("w:val")
+
+        if gridspan is None:
+            colspan = 1
+        else:
+            colspan = int(gridspan)
+
+        return _read_xml_elements(element.children) \
+            .map(lambda children: documents.table_cell_unmerged(
+                children=children,
+                colspan=colspan,
+                rowspan=1,
+                vmerge=read_vmerge(properties),
+            ))
+
+    def read_vmerge(properties):
+        vmerge_element = properties.find_child("w:vMerge")
+        if vmerge_element is None:
+            return False
+        else:
+            val = vmerge_element.attributes.get("w:val")
+            return val == "continue" or not val
+
+
+    def calculate_row_spans(rows):
+        """Resolve vertical merges across ``rows`` into rowspans.
+
+        If any element is not a table row, or any row child is not an
+        unmerged cell, merging is skipped: unmerged cells are converted to
+        plain table cells and a warning is attached. Otherwise the cells are
+        updated in place and each row's children are replaced with merged
+        table cells, dropping the cells that continue a vertical merge.
+        """
+        unexpected_non_rows = any(
+            not isinstance(row, documents.TableRow)
+            for row in rows
+        )
+        if unexpected_non_rows:
+            rows = remove_unmerged_table_cells(rows)
+            return _elements_result_with_messages(rows, [results.warning(
+                "unexpected non-row element in table, cell merging may be incorrect"
+            )])
+
+        unexpected_non_cells = any(
+            not isinstance(cell, documents.TableCellUnmerged)
+            for row in rows
+            for cell in row.children
+        )
+        if unexpected_non_cells:
+            rows = remove_unmerged_table_cells(rows)
+            return _elements_result_with_messages(rows, [results.warning(
+                "unexpected non-cell element in table row, cell merging may be incorrect"
+            )])
+
+        # Maps a starting column index to the cell that currently heads the
+        # vertical merge in that column.
+        columns = {}
+        for row in rows:
+            cell_index = 0
+            for cell in row.children:
+                if cell.vmerge and cell_index in columns:
+                    # Continuation cell: extend the heading cell's rowspan.
+                    columns[cell_index].rowspan += 1
+                else:
+                    # New heading cell for this column; clearing vmerge keeps
+                    # it from being filtered out below.
+                    columns[cell_index] = cell
+                    cell.vmerge = False
+                # Advance by the cell's width so indices are column positions.
+                cell_index += cell.colspan
+
+        for row in rows:
+            row.children = [
+                documents.table_cell(
+                    children=cell.children,
+                    colspan=cell.colspan,
+                    rowspan=cell.rowspan,
+                )
+                for cell in row.children
+                if not cell.vmerge
+            ]
+
+        return _success(rows)
+
+
+    def remove_unmerged_table_cells(rows):
+        return list(map(
+            transforms.element_of_type(
+                documents.TableCellUnmerged,
+                lambda cell: documents.table_cell(
+                    children=cell.children,
+                    colspan=cell.colspan,
+                    rowspan=cell.rowspan,
+                ),
+            ),
+            rows,
+        ))
+
+
+    def read_child_elements(element):
+        return _read_xml_elements(element.children)
+
+
+    def pict(element):
+        return read_child_elements(element).to_extra()
+
+
+    def hyperlink(element):
+        relationship_id = element.attributes.get("r:id")
+        anchor = element.attributes.get("w:anchor")
+        target_frame = element.attributes.get("w:tgtFrame") or None
+        children_result = _read_xml_elements(element.children)
+
+        def create(**kwargs):
+            return children_result.map(lambda children: documents.hyperlink(
+                children=children,
+                target_frame=target_frame,
+                **kwargs
+            ))
+
+        if relationship_id is not None:
+            href = relationships.find_target_by_relationship_id(relationship_id)
+            if anchor is not None:
+                href = replace_fragment(href, anchor)
+
+            return create(href=href)
+        elif anchor is not None:
+            return create(anchor=anchor)
+        else:
+            return children_result
+
+
+    def bookmark_start(element):
+        name = element.attributes.get("w:name")
+        if name == "_GoBack":
+            return _empty_result
+        else:
+            return _success(documents.bookmark(name))
+
+
+    def break_(element):
+        break_type = element.attributes.get("w:type")
+
+        if not break_type or break_type == "textWrapping":
+            return _success(documents.line_break)
+        elif break_type == "page":
+            return _success(documents.page_break)
+        elif break_type == "column":
+            return _success(documents.column_break)
+        else:
+            warning = results.warning("Unsupported break type: {0}".format(break_type))
+            return _empty_result_with_message(warning)
+
+
+    def inline(element):
+        properties = element.find_child_or_null("wp:docPr").attributes
+        if properties.get("descr", "").strip():
+            alt_text = properties.get("descr")
+        else:
+            alt_text = properties.get("title")
+        blips = element.find_children("a:graphic") \
+            .find_children("a:graphicData") \
+            .find_children("pic:pic") \
+            .find_children("pic:blipFill") \
+            .find_children("a:blip")
+        return _read_blips(blips, alt_text)
+
+    def _read_blips(blips, alt_text):
+        return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text), blips))
+
+    def _read_blip(element, alt_text):
+        blip_image = _find_blip_image(element)
+
+        if blip_image is None:
+            warning = results.warning("Could not find image file for a:blip element")
+            return _empty_result_with_message(warning)
+        else:
+            return _read_image(blip_image, alt_text)
+
+    def _read_image(image_file, alt_text):
+        image_path, open_image = image_file
+        content_type = content_types.find_content_type(image_path)
+        image = documents.image(alt_text=alt_text, content_type=content_type, open=open_image)
+
+        if content_type in ["image/png", "image/gif", "image/jpeg", "image/svg+xml", "image/tiff"]:
+            messages = []
+        else:
+            messages = [results.warning("Image of type {0} is unlikely to display in web browsers".format(content_type))]
+
+        return _element_result_with_messages(image, messages)
+
+    def _find_blip_image(element):
+        embed_relationship_id = element.attributes.get("r:embed")
+        link_relationship_id = element.attributes.get("r:link")
+        if embed_relationship_id is not None:
+            return _find_embedded_image(embed_relationship_id)
+        elif link_relationship_id is not None:
+            return _find_linked_image(link_relationship_id)
+        else:
+            return None
+
+    def _find_embedded_image(relationship_id):
+        target = relationships.find_target_by_relationship_id(relationship_id)
+        image_path = uri_to_zip_entry_name("word", target)
+
+        def open_image():
+            image_file = docx_file.open(image_path)
+            if hasattr(image_file, "__exit__"):
+                return image_file
+            else:
+                return contextlib.closing(image_file)
+
+        return image_path, open_image
+
+
+    def _find_linked_image(relationship_id):
+        image_path = relationships.find_target_by_relationship_id(relationship_id)
+
+        def open_image():
+            return files.open(image_path)
+
+        return image_path, open_image
+
+    def read_imagedata(element):
+        relationship_id = element.attributes.get("r:id")
+        if relationship_id is None:
+            warning = results.warning("A v:imagedata element without a relationship ID was ignored")
+            return _empty_result_with_message(warning)
+        else:
+            title = element.attributes.get("o:title")
+            return _read_image(_find_embedded_image(relationship_id), title)
+
+    def note_reference_reader(note_type):
+        def note_reference(element):
+            return _success(documents.note_reference(note_type, element.attributes["w:id"]))
+
+        return note_reference
+
+    def read_comment_reference(element):
+        return _success(documents.comment_reference(element.attributes["w:id"]))
+
+    def alternate_content(element):
+        return read_child_elements(element.find_child_or_null("mc:Fallback"))
+
+    def read_sdt(element):
+        # Reads a w:sdt element: the children of w:sdtContent are read first,
+        # then handle_content post-processes them to account for checkboxes.
+        content_result = read_child_elements(element.find_child_or_null("w:sdtContent"))
+
+        def handle_content(content):
+            # From the WordML standard: https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/3350cb64-931f-41f7-8824-f18b2568ce66
+            #
+            # > A CT_SdtCheckbox element that specifies that the parent
+            # > structured document tag is a checkbox when displayed in the
+            # > document. The parent structured document tag contents MUST
+            # > contain a single character and optionally an additional
+            # > character in a deleted run.
+            checkbox = element.find_child_or_null("w:sdtPr").find_child("wordml:checkbox")
+
+            # Not a checkbox: the content is passed through unchanged.
+            if checkbox is None:
+                return content
+
+            # The checkbox is checked only if wordml:checked is present and
+            # its wordml:val attribute reads as true.
+            checked_element = checkbox.find_child("wordml:checked")
+            is_checked = (
+                checked_element is not None and
+                read_boolean_attribute_value(checked_element.attributes.get("wordml:val"))
+            )
+            document_checkbox = documents.checkbox(checked=is_checked)
+
+            # Set by transform_text once the checkbox has been substituted.
+            has_checkbox = False
+
+            def transform_text(text):
+                # Replace only the first non-empty text with the checkbox;
+                # every other text is returned as-is.
+                nonlocal has_checkbox
+                if len(text.value) > 0 and not has_checkbox:
+                    has_checkbox = True
+                    return document_checkbox
+                else:
+                    return text
+
+            # NOTE(review): presumably element_of_type applies transform_text
+            # to documents.Text elements only -- confirm in transforms.
+            replaced_content = list(map(
+                transforms.element_of_type(documents.Text, transform_text),
+                content,
+            ))
+
+            # If no text was replaced, the checkbox alone becomes the content.
+            if has_checkbox:
+                return replaced_content
+            else:
+                return document_checkbox
+
+        return content_result.map(handle_content)
+
+    # Maps an XML element name to the function used to read that element.
+    # Names that are missing here are handled by read(): they produce no
+    # elements, plus a warning unless the name is in _ignored_elements.
+    handlers = {
+        "w:t": text,
+        "w:r": run,
+        "w:p": paragraph,
+        "w:fldChar": read_fld_char,
+        "w:instrText": read_instr_text,
+        "w:tab": tab,
+        "w:noBreakHyphen": no_break_hyphen,
+        "w:softHyphen": soft_hyphen,
+        "w:sym": symbol,
+        "w:tbl": table,
+        "w:tr": table_row,
+        "w:tc": table_cell,
+        "w:ins": read_child_elements,
+        "w:object": read_child_elements,
+        "w:smartTag": read_child_elements,
+        "w:drawing": read_child_elements,
+        "v:group": read_child_elements,
+        "v:rect": read_child_elements,
+        "v:roundrect": read_child_elements,
+        "v:shape": read_child_elements,
+        "v:textbox": read_child_elements,
+        "w:txbxContent": read_child_elements,
+        "w:pict": pict,
+        "w:hyperlink": hyperlink,
+        "w:bookmarkStart": bookmark_start,
+        "w:br": break_,
+        "wp:inline": inline,
+        "wp:anchor": inline,
+        "v:imagedata": read_imagedata,
+        "w:footnoteReference": note_reference_reader("footnote"),
+        "w:endnoteReference": note_reference_reader("endnote"),
+        "w:commentReference": read_comment_reference,
+        "mc:AlternateContent": alternate_content,
+        "w:sdt": read_sdt
+    }
+
+    def read(element):
+        handler = handlers.get(element.name)
+        if handler is None:
+            if element.name not in _ignored_elements:
+                warning = results.warning("An unrecognised element was ignored: {0}".format(element.name))
+                return _empty_result_with_message(warning)
+            else:
+                return _empty_result
+        else:
+            return handler(element)
+
+
+    def _read_xml_elements(nodes):
+        elements = filter(lambda node: isinstance(node, XmlElement), nodes)
+        return _ReadResult.concat(lists.map(read, elements))
+
+    return _read_xml_elements
+
+
+def _inner_text(node):
+    if node.node_type == node_types.text:
+        return node.value
+    else:
+        return "".join(_inner_text(child) for child in node.children)
+
+
+
+class _ReadResult(object):
+    @staticmethod
+    def concat(results):
+        return _ReadResult(
+            lists.flat_map(lambda result: result.elements, results),
+            lists.flat_map(lambda result: result.extra, results),
+            lists.flat_map(lambda result: result.messages, results))
+
+
+    @staticmethod
+    def map_results(first, second, func):
+        return _ReadResult(
+            [func(first.elements, second.elements)],
+            first.extra + second.extra,
+            first.messages + second.messages)
+
+    def __init__(self, elements, extra, messages):
+        self.elements = elements
+        self.extra = extra
+        self.messages = messages
+
+    def map(self, func):
+        elements = func(self.elements)
+        if not isinstance(elements, list):
+            elements = [elements]
+        return _ReadResult(
+            elements,
+            self.extra,
+            self.messages)
+
+    def flat_map(self, func):
+        result = func(self.elements)
+        return _ReadResult(
+            result.elements,
+            self.extra + result.extra,
+            self.messages + result.messages)
+
+
+    def to_extra(self):
+        return _ReadResult([], _concat(self.extra, self.elements), self.messages)
+
+    def append_extra(self):
+        return _ReadResult(_concat(self.elements, self.extra), [], self.messages)
+
+def _success(elements):
+    if not isinstance(elements, list):
+        elements = [elements]
+    return _ReadResult(elements, [], [])
+
+def _element_result_with_messages(element, messages):
+    return _elements_result_with_messages([element], messages)
+
+def _elements_result_with_messages(elements, messages):
+    return _ReadResult(elements, [], messages)
+
+# Shared result with no elements, no extra items and no messages.
+_empty_result = _ReadResult([], [], [])
+
+def _empty_result_with_message(message):
+    return _ReadResult([], [], [message])
+
+def _concat(*values):
+    result = []
+    for value in values:
+        for element in value:
+            result.append(element)
+    return result
+
+
+def _is_int(value):
+    if value is None:
+        return False
+
+    try:
+        int(value)
+    except ValueError:
+        return False
+
+    return True
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/comments_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/comments_xml.py
@@ -0,0 +1,24 @@
+from .. import lists
+from .. import documents
+from .. import results
+
+
+def read_comments_xml_element(element, body_reader):
+    def read_comments_xml_element(element):
+        comment_elements = element.find_children("w:comment")
+        return results.combine(lists.map(_read_comment_element, comment_elements))
+
+
+    def _read_comment_element(element):
+        def read_optional_attribute(name):
+            return element.attributes.get(name, "").strip() or None
+
+        return body_reader.read_all(element.children).map(lambda body:
+            documents.comment(
+                comment_id=element.attributes["w:id"],
+                body=body,
+                author_name=read_optional_attribute("w:author"),
+                author_initials=read_optional_attribute("w:initials"),
+            ))
+
+    return read_comments_xml_element(element)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/complex_fields.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/complex_fields.py
@@ -0,0 +1,29 @@
+class unknown(object):
+    pass
+
+
+class Begin:
+    def __init__(self, *, fld_char):
+        self.fld_char = fld_char
+
+
+def begin(*, fld_char):
+    return Begin(fld_char=fld_char)
+
+
+class Hyperlink(object):
+    def __init__(self, kwargs):
+        self.kwargs = kwargs
+
+
+def hyperlink(kwargs):
+    return Hyperlink(kwargs=kwargs)
+
+
+class Checkbox:
+    def __init__(self, *, checked):
+        self.checked = checked
+
+
+def checkbox(*, checked):
+    return Checkbox(checked=checked)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/content_types_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/content_types_xml.py
@@ -0,0 +1,58 @@
+def read_content_types_xml_element(element):
+    extension_defaults = dict(map(
+        _read_default,
+        element.find_children("content-types:Default")
+    ))
+    overrides = dict(map(
+        _read_override,
+        element.find_children("content-types:Override")
+    ))
+    return _ContentTypes(extension_defaults, overrides)
+
+
+def _read_default(element):
+    extension = element.attributes["Extension"]
+    content_type = element.attributes["ContentType"]
+    return extension, content_type
+
+
+def _read_override(element):
+    part_name = element.attributes["PartName"]
+    content_type = element.attributes["ContentType"]
+    return part_name.lstrip("/"), content_type
+
+
+class _ContentTypes(object):
+    _image_content_types = {
+        "png": "png",
+        "gif": "gif",
+        "jpeg": "jpeg",
+        "jpg": "jpeg",
+        "tif": "tiff",
+        "tiff": "tiff",
+        "bmp": "bmp",
+    }
+    
+    def __init__(self, extension_defaults, overrides):
+        self._extension_defaults = extension_defaults
+        self._overrides = overrides
+    
+    def find_content_type(self, path):
+        if path in self._overrides:
+            return self._overrides[path]
+
+        extension = _get_extension(path)
+        default_type = self._extension_defaults.get(extension)
+        if default_type is not None:
+            return default_type
+
+        image_type = self._image_content_types.get(extension.lower())
+        if image_type is not None:
+            return "image/" + image_type
+        
+        return None
+
+# Lookup with no overrides and no extension defaults; only the built-in image
+# extension table can produce a content type.
+empty_content_types = _ContentTypes({}, {})
+
+def _get_extension(path):
+    return path.rpartition(".")[2]
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/dingbats.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/dingbats.py
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/document_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/document_xml.py
@@ -0,0 +1,25 @@
+from .. import documents
+
+
+def read_document_xml_element(
+        element,
+        body_reader,
+        notes=None,
+        comments=None):
+
+    if notes is None:
+        notes = []
+    if comments is None:
+        comments = []
+
+    body_element = element.find_child("w:body")
+
+    if body_element is None:
+        raise ValueError("Could not find the body element: are you sure this is a docx file?")
+
+    return body_reader.read_all(body_element.children) \
+        .map(lambda children: documents.document(
+            children,
+            notes=documents.notes(notes),
+            comments=comments
+        ))
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/files.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/files.py
@@ -0,0 +1,46 @@
+import os
+import contextlib
+try:
+    from urllib2 import urlopen
+except ImportError:
+    from urllib.request import urlopen
+try:
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
+
+
+class Files(object):
+    def __init__(self, base, external_file_access):
+        self._base = base
+        self._external_file_access = external_file_access
+
+    def open(self, uri):
+        if not self._external_file_access:
+            raise ExternalFileAccessIsDisabledError(
+                "could not open external image '{0}', external file access is disabled".format(uri)
+            )
+
+        try:
+            if _is_absolute(uri):
+                return contextlib.closing(urlopen(uri))
+            elif self._base is not None:
+                return open(os.path.join(self._base, uri), "rb")
+            else:
+                raise InvalidFileReferenceError("could not find external image '{0}', fileobj has no name".format(uri))
+        except IOError as error:
+            message = "could not open external image: '{0}' (document directory: '{1}')\n{2}".format(
+                uri, self._base, str(error))
+            raise InvalidFileReferenceError(message)
+
+
+def _is_absolute(url):
+    return urlparse(url).scheme != ""
+
+
+class InvalidFileReferenceError(ValueError):
+    pass
+
+
+class ExternalFileAccessIsDisabledError(InvalidFileReferenceError):
+    pass
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/notes_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/notes_xml.py
@@ -0,0 +1,32 @@
+import functools
+
+from .. import lists
+from .. import documents
+from .. import results
+
+
+def _read_notes(note_type, element, body_reader):
+    def read_notes_xml_element(element):
+        note_elements = lists.filter(
+            _is_note_element,
+            element.find_children("w:" + note_type),
+        )
+        return results.combine(lists.map(_read_note_element, note_elements))
+
+
+    def _is_note_element(element):
+        return element.attributes.get("w:type") not in ["continuationSeparator", "separator"]
+
+
+    def _read_note_element(element):
+        return body_reader.read_all(element.children).map(lambda body: 
+            documents.note(
+                note_type=note_type,
+                note_id=element.attributes["w:id"],
+                body=body
+            ))
+    
+    return read_notes_xml_element(element)
+
+# Readers taking (element, body_reader): _read_notes with the note type
+# already bound to "footnote" and "endnote" respectively.
+read_footnotes_xml_element = functools.partial(_read_notes, "footnote")
+read_endnotes_xml_element = functools.partial(_read_notes, "endnote")
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/numbering_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/numbering_xml.py
@@ -0,0 +1,130 @@
+import cobble
+
+from ..documents import numbering_level
+from .styles_xml import Styles
+
+
+def read_numbering_xml_element(element, styles):
+    abstract_nums = _read_abstract_nums(element)
+    nums = _read_nums(element)
+    return Numbering(abstract_nums=abstract_nums, nums=nums, styles=styles)
+
+
+def _read_abstract_nums(element):
+    abstract_num_elements = element.find_children("w:abstractNum")
+    return dict(map(_read_abstract_num, abstract_num_elements))
+
+
+def _read_abstract_num(element):
+    abstract_num_id = element.attributes.get("w:abstractNumId")
+    levels = _read_abstract_num_levels(element)
+    num_style_link = element.find_child_or_null("w:numStyleLink").attributes.get("w:val")
+    return abstract_num_id, _AbstractNum(levels=levels, num_style_link=num_style_link)
+
+
+@cobble.data
+class _AbstractNum(object):
+    # Levels of this abstract num, keyed by level index.
+    levels = cobble.field()
+    # The w:val of the w:numStyleLink child, or None when there is none.
+    num_style_link = cobble.field()
+
+
+@cobble.data
+class _AbstractNumLevel(object):
+    # The w:ilvl attribute value; the reader substitutes "0" when it is absent.
+    level_index = cobble.field()
+    # False only when the w:numFmt value is "bullet".
+    is_ordered = cobble.field()
+    # The w:val of the w:pStyle child, or None when there is none.
+    paragraph_style_id = cobble.field()
+
+
+def _read_abstract_num_levels(element):
+    levels = {}
+
+    # Some malformed documents define numbering levels without an index, and
+    # reference the numbering using a w:numPr element without a w:ilvl child.
+    # To handle such cases, we assume a level of 0 as a fallback.
+    level_without_index = None
+
+    for level_element in element.find_children("w:lvl"):
+        level = _read_abstract_num_level(level_element)
+        if level.level_index is None:
+            level.level_index = "0"
+            level_without_index = level
+        else:
+            levels[level.level_index] = level
+
+    if level_without_index is not None and level_without_index.level_index not in levels:
+        levels[level_without_index.level_index] = level_without_index
+
+    return levels
+
+
+def _read_abstract_num_level(element):
+    level_index = element.attributes.get("w:ilvl")
+    num_fmt = element.find_child_or_null("w:numFmt").attributes.get("w:val")
+    is_ordered = num_fmt != "bullet"
+    paragraph_style_id = element.find_child_or_null("w:pStyle").attributes.get("w:val")
+    return _AbstractNumLevel(
+        level_index=level_index,
+        is_ordered=is_ordered,
+        paragraph_style_id=paragraph_style_id,
+    )
+
+
+def _read_nums(element):
+    num_elements = element.find_children("w:num")
+    return dict(
+        _read_num(num_element)
+        for num_element in num_elements
+    )
+
+
+def _read_num(element):
+    num_id = element.attributes.get("w:numId")
+    abstract_num_id = element.find_child_or_null("w:abstractNumId").attributes["w:val"]
+    return num_id, _Num(abstract_num_id=abstract_num_id)
+
+
+@cobble.data
+class _Num(object):
+    # The w:val of the w:abstractNumId child: the abstract num this num uses.
+    abstract_num_id = cobble.field()
+
+
+class Numbering(object):
+    def __init__(self, abstract_nums, nums, styles):
+        self._abstract_nums = abstract_nums
+        self._levels_by_paragraph_style_id = dict(
+            (level.paragraph_style_id, self._to_numbering_level(level))
+            for abstract_num in abstract_nums.values()
+            for level in abstract_num.levels.values()
+            if level.paragraph_style_id is not None
+        )
+        self._nums = nums
+        self._styles = styles
+
+    def find_level(self, num_id, level):
+        num = self._nums.get(num_id)
+        if num is None:
+            return None
+        else:
+            abstract_num = self._abstract_nums.get(num.abstract_num_id)
+            if abstract_num is None:
+                return None
+            elif abstract_num.num_style_link is None:
+                return self._to_numbering_level(abstract_num.levels.get(level))
+            else:
+                style = self._styles.find_numbering_style_by_id(abstract_num.num_style_link)
+                return self.find_level(style.num_id, level)
+
+    def find_level_by_paragraph_style_id(self, style_id):
+        return self._levels_by_paragraph_style_id.get(style_id)
+
+    def _to_numbering_level(self, abstract_num_level):
+        if abstract_num_level is None:
+            return None
+        else:
+            return numbering_level(
+                level_index=abstract_num_level.level_index,
+                is_ordered=abstract_num_level.is_ordered,
+            )
+
+
+# Numbering with no abstract nums, no nums and empty styles.
+Numbering.EMPTY = Numbering(abstract_nums={}, nums={}, styles=Styles.EMPTY)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/office_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/office_xml.py
@@ -0,0 +1,45 @@
+from ..lists import flat_map
+from .xmlparser import parse_xml, XmlElement
+
+
+# (prefix, namespace URI) pairs passed to parse_xml by read(). Several URIs
+# share a prefix so that the transitional and strict formats produce the same
+# prefixed names.
+_namespaces = [
+    # Transitional format
+    ("w", "http://schemas.openxmlformats.org/wordprocessingml/2006/main"),
+    ("r", "http://schemas.openxmlformats.org/officeDocument/2006/relationships"),
+    ("wp", "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"),
+    ("a", "http://schemas.openxmlformats.org/drawingml/2006/main"),
+    ("pic", "http://schemas.openxmlformats.org/drawingml/2006/picture"),
+
+    # Strict format
+    ("w", "http://purl.oclc.org/ooxml/wordprocessingml/main"),
+    ("r", "http://purl.oclc.org/ooxml/officeDocument/relationships"),
+    ("wp", "http://purl.oclc.org/ooxml/drawingml/wordprocessingDrawing"),
+    ("a", "http://purl.oclc.org/ooxml/drawingml/main"),
+    ("pic", "http://purl.oclc.org/ooxml/drawingml/picture"),
+
+    # Common
+    ("content-types", "http://schemas.openxmlformats.org/package/2006/content-types"),
+    ("relationships", "http://schemas.openxmlformats.org/package/2006/relationships"),
+    ("mc", "http://schemas.openxmlformats.org/markup-compatibility/2006"),
+    ("v", "urn:schemas-microsoft-com:vml"),
+    ("office-word", "urn:schemas-microsoft-com:office:word"),
+
+    # [MS-DOCX]: Word Extensions to the Office Open XML (.docx) File Format
+    # https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/b839fe1f-e1ca-4fa6-8c26-5954d0abbccd
+    ("wordml", "http://schemas.microsoft.com/office/word/2010/wordml"),
+]
+
+
+def read(fileobj):
+    return _collapse_alternate_content(parse_xml(fileobj, _namespaces))[0]
+
+
+def _collapse_alternate_content(node):
+    if isinstance(node, XmlElement):
+        if node.name == "mc:AlternateContent":
+            return node.find_child_or_null("mc:Fallback").children
+        else:
+            node.children = flat_map(_collapse_alternate_content, node.children)
+            return [node]
+    else:
+        return [node]
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/relationships_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/relationships_xml.py
@@ -0,0 +1,38 @@
+import collections
+
+
+class Relationships(object):
+    def __init__(self, relationships):
+        self._targets_by_id = dict(
+            (relationship.relationship_id, relationship.target)
+            for relationship in relationships
+        )
+        self._targets_by_type = collections.defaultdict(list)
+        for relationship in relationships:
+            self._targets_by_type[relationship.type].append(relationship.target)
+    
+    def find_target_by_relationship_id(self, key):
+        return self._targets_by_id[key]
+    
+    def find_targets_by_type(self, relationship_type):
+        return self._targets_by_type[relationship_type]
+
+
+# Relationships instance that contains no relationships.
+Relationships.EMPTY = Relationships([])
+
+
+# One relationship, built from the Id, Target and Type attributes of a
+# Relationship element.
+Relationship = collections.namedtuple("Relationship", ["relationship_id", "target", "type"])
+
+
+def read_relationships_xml_element(element):
+    children = element.find_children("relationships:Relationship")
+    return Relationships(list(map(_read_relationship, children)))
+
+
+def _read_relationship(element):
+    relationship = Relationship(
+        relationship_id=element.attributes["Id"],
+        target=element.attributes["Target"],
+        type=element.attributes["Type"],
+    )
+    return relationship
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/style_map.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/style_map.py
@@ -0,0 +1,70 @@
+from xml.etree import ElementTree
+
+from ..zips import open_zip, update_zip
+
+
+# Entry names inside the docx archive that this module reads or writes. The
+# absolute form of the style map path is what gets recorded as the
+# relationship Target and as the content-type override PartName.
+_style_map_path = "mammoth/style-map"
+_style_map_absolute_path = "/" + _style_map_path
+_relationships_path = "word/_rels/document.xml.rels"
+_content_types_path = "[Content_Types].xml"
+
+
+def write_style_map(fileobj, style_map):
+    with open_zip(fileobj, "r") as zip_file:
+        relationships_xml = _generate_relationships_xml(zip_file.read_str(_relationships_path))
+        content_types_xml = _generate_content_types_xml(zip_file.read_str(_content_types_path))
+    
+    update_zip(fileobj, {
+        _style_map_path: style_map.encode("utf8"),
+        _relationships_path: relationships_xml,
+        _content_types_path: content_types_xml,
+    })
+
+def _generate_relationships_xml(relationships_xml):
+    schema = "http://schemas.zwobble.org/mammoth/style-map"
+    relationships_uri = "http://schemas.openxmlformats.org/package/2006/relationships"
+    relationship_element_name = "{" + relationships_uri + "}Relationship"
+    
+    relationships = ElementTree.fromstring(relationships_xml)
+    _add_or_update_element(relationships, relationship_element_name, "Id", {
+        "Id": "rMammothStyleMap",
+        "Type": schema,
+        "Target": _style_map_absolute_path,
+    })
+
+    return ElementTree.tostring(relationships, "UTF-8")
+
+
+def _generate_content_types_xml(content_types_xml):
+    content_types_uri = "http://schemas.openxmlformats.org/package/2006/content-types"
+    override_name = "{" + content_types_uri + "}Override"
+    
+    types = ElementTree.fromstring(content_types_xml)
+    _add_or_update_element(types, override_name, "PartName", {
+        "PartName": _style_map_absolute_path,
+        "ContentType": "text/prs.mammoth.style-map",
+    })
+    
+    return ElementTree.tostring(types, "UTF-8")
+
+
+def _add_or_update_element(parent, name, identifying_attribute, attributes):
+    existing_child = _find_child(parent, name, identifying_attribute, attributes)
+    if existing_child is None:
+        ElementTree.SubElement(parent, name, attributes)
+    else:
+        existing_child.attrib = attributes
+    
+
+def _find_child(parent, name, identifying_attribute, attributes):
+    for element in parent.iter():
+        if element.tag == name and element.get(identifying_attribute) == attributes.get(identifying_attribute):
+            return element
+
+
+def read_style_map(fileobj):
+    with open_zip(fileobj, "r") as zip_file:
+        if zip_file.exists(_style_map_path):
+            return zip_file.read_str(_style_map_path)
+
+
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/styles_xml.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/styles_xml.py
@@ -0,0 +1,117 @@
+import collections
+
+
+class Styles(object):
+    @staticmethod
+    def create(paragraph_styles=None, character_styles=None, table_styles=None, numbering_styles=None):
+        if paragraph_styles is None:
+            paragraph_styles = {}
+        if character_styles is None:
+            character_styles = {}
+        if table_styles is None:
+            table_styles = {}
+        if numbering_styles is None:
+            numbering_styles = {}
+
+        return Styles(
+            paragraph_styles=paragraph_styles,
+            character_styles=character_styles,
+            table_styles=table_styles,
+            numbering_styles=numbering_styles,
+        )
+
+    def __init__(self, paragraph_styles, character_styles, table_styles, numbering_styles):
+        self._paragraph_styles = paragraph_styles
+        self._character_styles = character_styles
+        self._table_styles = table_styles
+        self._numbering_styles = numbering_styles
+
+    def find_paragraph_style_by_id(self, style_id):
+        return self._paragraph_styles.get(style_id)
+
+    def find_character_style_by_id(self, style_id):
+        return self._character_styles.get(style_id)
+
+    def find_table_style_by_id(self, style_id):
+        return self._table_styles.get(style_id)
+
+    def find_numbering_style_by_id(self, style_id):
+        return self._numbering_styles.get(style_id)
+
+
+# Styles instance in which every lookup returns None.
+Styles.EMPTY = Styles(
+    paragraph_styles={},
+    character_styles={},
+    table_styles={},
+    numbering_styles={},
+)
+
+
+def read_styles_xml_element(element):
+    paragraph_styles = {}
+    character_styles = {}
+    table_styles = {}
+    numbering_styles = {}
+    styles = {
+        "paragraph": paragraph_styles,
+        "character": character_styles,
+        "table": table_styles,
+        "numbering": numbering_styles,
+    }
+
+    for style_element in element.find_children("w:style"):
+        element_type = style_element.attributes["w:type"]
+        if element_type == "numbering":
+            style = _read_numbering_style_element(style_element)
+        else:
+            style = _read_style_element(style_element)
+
+        style_set = styles.get(element_type)
+
+        # Per 17.7.4.17 style (Style Definition) of ECMA-376 4th edition Part 1:
+        #
+        # > If multiple style definitions each declare the same value for their
+        # > styleId, then the first such instance shall keep its current
+        # > identifier with all other instances being reassigned in any manner
+        # > desired.
+        #
+        # For the purpose of conversion, there's no point holding onto styles
+        # with reassigned style IDs, so we ignore such style definitions.
+
+        if style_set is not None and style.style_id not in style_set:
+            style_set[style.style_id] = style
+
+    return Styles(
+        paragraph_styles=paragraph_styles,
+        character_styles=character_styles,
+        table_styles=table_styles,
+        numbering_styles=numbering_styles,
+    )
+
+
+# A non-numbering style: its w:styleId, and the w:val of its w:name child.
+Style = collections.namedtuple("Style", ["style_id", "name"])
+
+
+def _read_style_element(element):
+    style_id = _read_style_id(element)
+    name = element.find_child_or_null("w:name").attributes.get("w:val")
+    return Style(style_id=style_id, name=name)
+
+
+# A numbering style: its w:styleId, and the w:val of w:pPr/w:numPr/w:numId
+# (None when that element is absent).
+NumberingStyle = collections.namedtuple("NumberingStyle", ["style_id", "num_id"])
+
+
+def _read_numbering_style_element(element):
+    style_id = _read_style_id(element)
+
+    num_id = element \
+        .find_child_or_null("w:pPr") \
+        .find_child_or_null("w:numPr") \
+        .find_child_or_null("w:numId") \
+        .attributes.get("w:val")
+
+    return NumberingStyle(style_id=style_id, num_id=num_id)
+
+
+def _read_style_id(element):
+    return element.attributes["w:styleId"]
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/uris.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/uris.py
@@ -0,0 +1,12 @@
+def uri_to_zip_entry_name(base, uri):
+    if uri.startswith("/"):
+        return uri[1:]
+    else:
+        return base + "/" + uri
+
+
+def replace_fragment(uri, fragment):
+    hash_index = uri.find("#")
+    if hash_index != -1:
+        uri = uri[:hash_index]
+    return uri + "#" + fragment
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/docx/xmlparser.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/docx/xmlparser.py
@@ -0,0 +1,121 @@
+import xml.dom.minidom
+
+import cobble
+
+
+@cobble.data
+class XmlElement(object):
+    name = cobble.field()
+    attributes = cobble.field()
+    children = cobble.field()
+
+    def find_child_or_null(self, name):
+        return self.find_child(name) or null_xml_element
+
+    def find_child(self, name):
+        for child in self.children:
+            if isinstance(child, XmlElement) and child.name == name:
+                return child
+
+
+    def find_children(self, name):
+        return XmlElementList(filter(
+            lambda child: child.node_type == node_types.element and child.name == name,
+            self.children
+        ))
+
+
+class XmlElementList(object):
+    def __init__(self, elements):
+        self._elements = elements
+
+    def __iter__(self):
+        return iter(self._elements)
+
+    def find_children(self, name):
+        children = []
+        for element in self._elements:
+            for child in element.find_children(name):
+                children.append(child)
+        return XmlElementList(children)
+
+
+class NullXmlElement(object):
+    attributes = {}
+    children = []
+
+    def find_child_or_null(self, name):
+        return self
+
+    def find_child(self, name):
+        return None
+
+
+# Shared instance returned by XmlElement.find_child_or_null when no child matches.
+null_xml_element = NullXmlElement()
+
+
+@cobble.data
+class XmlText(object):
+    # The text content of the node.
+    value = cobble.field()
+
+
+def element(name, attributes=None, children=None):
+    return XmlElement(name, attributes or {}, children or [])
+
+# Alias so that text nodes can be created with text(value), alongside element().
+text = XmlText
+
+
+class node_types(object):
+    element = 1
+    text = 3
+
+
+# Tag each node class with its type so that nodes can be told apart through
+# node.node_type instead of isinstance checks.
+XmlElement.node_type = node_types.element
+XmlText.node_type = node_types.text
+
+
+
+def parse_xml(fileobj, namespace_mapping=None):
+    if namespace_mapping is None:
+        namespace_prefixes = {}
+    else:
+        namespace_prefixes = dict((uri, prefix) for prefix, uri in namespace_mapping)
+
+    document = xml.dom.minidom.parse(fileobj)
+
+    def convert_node(node):
+        if node.nodeType == xml.dom.Node.ELEMENT_NODE:
+            return convert_element(node)
+        elif node.nodeType == xml.dom.Node.TEXT_NODE:
+            return XmlText(node.nodeValue)
+        else:
+            return None
+
+    def convert_element(element):
+        converted_name = convert_name(element)
+
+        converted_attributes = dict(
+            (convert_name(attribute), attribute.value)
+            for attribute in element.attributes.values()
+            if attribute.namespaceURI != "http://www.w3.org/2000/xmlns/"
+        )
+
+        converted_children = []
+        for child_node in element.childNodes:
+            converted_child_node = convert_node(child_node)
+            if converted_child_node is not None:
+                converted_children.append(converted_child_node)
+
+        return XmlElement(converted_name, converted_attributes, converted_children)
+
+    def convert_name(node):
+        if node.namespaceURI is None:
+            return node.localName
+        else:
+            prefix = namespace_prefixes.get(node.namespaceURI)
+            if prefix is None:
+                return "{%s}%s" % (node.namespaceURI, node.localName)
+            else:
+                return "%s:%s" % (prefix, node.localName)
+
+    return convert_node(document.documentElement)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/html/init.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/html/init.py
@@ -0,0 +1,135 @@
+from ..lists import flat_map
+from .nodes import TextNode, Tag, Element, ForceWrite, NodeVisitor
+
+
+def text(value):
+    """Wrap ``value`` in a ``TextNode``."""
+    return TextNode(value)
+
+
+def tag(tag_names, attributes=None, collapsible=None, separator=None):
+    if not isinstance(tag_names, list):
+        tag_names = [tag_names]
+    if attributes is None:
+        attributes = {}
+    return Tag(tag_names=tag_names, attributes=attributes, collapsible=bool(collapsible), separator=separator)
+
+
+def element(tag_names, attributes=None, children=None, collapsible=None, separator=None):
+    if children is None:
+        children = []
+        
+    element_tag = tag(tag_names=tag_names, attributes=attributes, collapsible=collapsible, separator=separator)
+    return Element(element_tag, children)
+
+
+def collapsible_element(tag_names, attributes=None, children=None):
+    """Same as ``element`` but with ``collapsible=True``."""
+    return element(tag_names, attributes, children, collapsible=True)
+
+
+force_write = ForceWrite()
+
+
+def strip_empty(nodes):
+    return flat_map(_strip_empty_node, nodes)
+
+
+def _strip_empty_node(node):
+    return StripEmpty().visit(node)
+
+
+class StripEmpty(NodeVisitor):
+    def visit_text_node(self, node):
+        if node.value:
+            return [node]
+        else:
+            return []
+    
+    def visit_element(self, element):
+        children = strip_empty(element.children)
+        if len(children) == 0 and not element.is_void():
+            return []
+        else:
+            return [Element(element.tag, children)]
+    
+    def visit_force_write(self, node):
+        return [node]
+
+
+def collapse(nodes):
+    collapsed = []
+    
+    for node in nodes:
+        _collapsing_add(collapsed, node)
+    
+    return collapsed
+
+class _CollapseNode(NodeVisitor):
+    def visit_text_node(self, node):
+        return node
+    
+    def visit_element(self, element):
+        return Element(element.tag, collapse(element.children))
+    
+    def visit_force_write(self, node):
+        return node
+    
+_collapse_node = _CollapseNode().visit
+
+
+def _collapsing_add(collapsed, node):
+    collapsed_node = _collapse_node(node)
+    if not _try_collapse(collapsed, collapsed_node):
+        collapsed.append(collapsed_node)
+    
+def _try_collapse(collapsed, node):
+    if not collapsed:
+        return False
+
+    last = collapsed[-1]
+    if not isinstance(last, Element) or not isinstance(node, Element):
+        return False
+    
+    if not node.collapsible:
+        return False
+        
+    if not _is_match(last, node):
+        return False
+    
+    if node.separator:
+        last.children.append(text(node.separator))
+    
+    for child in node.children:
+        _collapsing_add(last.children, child)
+        
+    return True
+
+def _is_match(first, second):
+    return first.tag_name in second.tag_names and first.attributes == second.attributes
+
+
+def write(writer, nodes):
+    visitor = _NodeWriter(writer)
+    visitor.visit_all(nodes)
+        
+
+class _NodeWriter(NodeVisitor):
+    def __init__(self, writer):
+        self._writer = writer
+    
+    def visit_text_node(self, node):
+        self._writer.text(node.value)
+    
+    def visit_element(self, element):
+        if element.is_void():
+            self._writer.self_closing(element.tag_name, element.attributes)
+        else:
+            self._writer.start(element.tag_name, element.attributes)
+            self.visit_all(element.children)
+            self._writer.end(element.tag_name)
+    
+    def visit_force_write(self, element):
+        pass
+    
+    def visit_all(self, nodes):
+        for node in nodes:
+            self.visit(node)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/html/pycache/init.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/html/pycache/init.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/html/pycache/nodes.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/html/pycache/nodes.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/html/nodes.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/html/nodes.py
@@ -0,0 +1,61 @@
+import cobble
+
+
+class Node(object):
+    """Common base class of the HTML node types in this module."""
+    pass
+
+
+@cobble.data
+class TextNode(Node):
+    """A run of text; ``value`` holds the string."""
+    value = cobble.field()
+
+
+@cobble.data
+class Tag(object):
+    """Description of an HTML tag, independent of any children."""
+    # Candidate tag names; the first entry is exposed as ``tag_name``.
+    tag_names = cobble.field()
+    attributes = cobble.field()
+    collapsible = cobble.field()
+    separator = cobble.field()
+
+    @property
+    def tag_name(self):
+        """The first entry of ``tag_names``."""
+        return self.tag_names[0]
+
+
+@cobble.data
+class Element(Node):
+    """An HTML element: a ``Tag`` plus a list of child nodes.
+
+    The tag's fields are exposed as read-only properties on the element.
+    """
+    tag = cobble.field()
+    children = cobble.field()
+
+    @property
+    def tag_name(self):
+        return self.tag.tag_name
+
+    @property
+    def tag_names(self):
+        return self.tag.tag_names
+
+    @property
+    def attributes(self):
+        return self.tag.attributes
+
+    @property
+    def collapsible(self):
+        return self.tag.collapsible
+
+    @property
+    def separator(self):
+        return self.tag.separator
+
+    # Tag names treated as void (self-closing) when the element has no children.
+    _VOID_TAG_NAMES = set(["br", "hr", "img", "input"])
+
+    def is_void(self):
+        """True when the element has no children and its tag name is a void tag name."""
+        return not self.children and self.tag_name in self._VOID_TAG_NAMES
+
+
+@cobble.visitable
+class ForceWrite(Node):
+    """Marker node without any fields of its own."""
+    pass
+
+
+# Visitor base class generated by cobble for the Node hierarchy.
+NodeVisitor = cobble.visitor(Node)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/html_paths.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/html_paths.py
@@ -0,0 +1,58 @@
+import cobble
+
+from . import html
+
+
+def path(elements):
+    """Create an ``HtmlPath`` from a sequence of path elements, outermost first."""
+    return HtmlPath(elements)
+
+
+def element(names, attributes=None, class_names=None, fresh=None, separator=None):
+    if attributes is None:
+        attributes = {}
+    if class_names is None:
+        class_names = []
+    if fresh is None:
+        fresh = False
+    if class_names:
+        attributes["class"] = " ".join(class_names)
+
+    return HtmlPathElement(html.tag(
+        tag_names=names,
+        attributes=attributes,
+        collapsible=not fresh,
+        separator=separator,
+    ))
+
+
+@cobble.data
+class HtmlPath(object):
+    elements = cobble.field()
+
+    def wrap(self, generate_nodes):
+        nodes = generate_nodes()
+
+        for element in reversed(self.elements):
+            nodes = element.wrap_nodes(nodes)
+
+        return nodes
+
+
+@cobble.data
+class HtmlPathElement(object):
+    tag = cobble.field()
+
+    def wrap(self, generate_nodes):
+        return self.wrap_nodes(generate_nodes())
+
+    def wrap_nodes(self, nodes):
+        element = html.Element(self.tag, nodes)
+        return [element]
+
+empty = path([])
+
+
+class ignore(object):
+    """Path-like object whose ``wrap`` discards the content entirely."""
+    @staticmethod
+    def wrap(generate_nodes):
+        # ``generate_nodes`` is never called; the result is always empty.
+        return []
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/images.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/images.py
@@ -0,0 +1,28 @@
+import base64
+
+from . import html
+
+
+def img_element(func):
+    def convert_image(image):
+        attributes = {}
+        if image.alt_text:
+            attributes["alt"] = image.alt_text
+        attributes.update(func(image))
+
+        return [html.element("img", attributes)]
+
+    return convert_image
+
+# Undocumented, but retained for backwards-compatibility with 0.3.x
+inline = img_element
+
+
+@img_element
+def data_uri(image):
+    with image.open() as image_bytes:
+        encoded_src = base64.b64encode(image_bytes.read()).decode("ascii")
+
+    return {
+        "src": "data:{0};base64,{1}".format(image.content_type, encoded_src)
+    }
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/lists.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/lists.py
@@ -0,0 +1,40 @@
+import sys
+
+
+def flatten(values):
+    """Concatenate an iterable of iterables into a single list (one level deep)."""
+    return flat_map(lambda x: x, values)
+
+
+def unique(values):
+    output = []
+    seen = set()
+    for value in values:
+        if value not in seen:
+            seen.add(value)
+            output.append(value)
+    return output
+
+
+def flat_map(func, values):
+    return [
+        element
+        for value in values
+        for element in func(value)
+    ]
+
+
+def find_index(predicate, values):
+    for index, value in enumerate(values):
+        if predicate(value):
+            return index
+
+
+# List-returning ``map``/``filter`` for both major Python versions.
+# On Python 2 the builtins are re-exported unchanged.
+if sys.version_info[0] == 2:
+    map = map
+    filter = filter
+else:
+    # On Python 3 the builtins are lazy, so wrap them to materialise a list.
+    import builtins
+    def map(*args, **kwargs):
+        return list(builtins.map(*args, **kwargs))
+    def filter(*args, **kwargs):
+        return list(builtins.filter(*args, **kwargs))
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/options.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/options.py
@@ -0,0 +1,101 @@
+from .styles.parser import read_style_mapping
+from . import lists, results
+
+
+def read_options(options):
+    custom_style_map_text = options.pop("style_map", "") or ""
+    embedded_style_map_text = options.pop("embedded_style_map", "") or ""
+    include_default_style_map = options.pop("include_default_style_map", True)
+
+    read_style_map_result = results.combine([
+        _read_style_map(custom_style_map_text),
+        _read_style_map(embedded_style_map_text),
+    ])
+
+    custom_style_map, embedded_style_map = read_style_map_result.value
+    style_map = custom_style_map + embedded_style_map
+
+    if include_default_style_map:
+        style_map += _default_style_map
+
+    options["ignore_empty_paragraphs"] = options.get("ignore_empty_paragraphs", True)
+    options["style_map"] = style_map
+    return read_style_map_result.map(lambda _: options)
+
+
+def _read_style_map(style_text):
+    lines = filter(None, map(_get_line, style_text.split("\n")))
+    return results.combine(lists.map(read_style_mapping, lines)) \
+        .map(lambda style_mappings: lists.filter(None, style_mappings))
+
+
+def _get_line(line):
+    line = line.strip()
+    if line.startswith("#"):
+        return None
+    else:
+        return line
+
+
+_default_style_map_result = _read_style_map("""
+p.Heading1 => h1:fresh
+p.Heading2 => h2:fresh
+p.Heading3 => h3:fresh
+p.Heading4 => h4:fresh
+p.Heading5 => h5:fresh
+p.Heading6 => h6:fresh
+p[style-name='Heading 1'] => h1:fresh
+p[style-name='Heading 2'] => h2:fresh
+p[style-name='Heading 3'] => h3:fresh
+p[style-name='Heading 4'] => h4:fresh
+p[style-name='Heading 5'] => h5:fresh
+p[style-name='Heading 6'] => h6:fresh
+p[style-name='heading 1'] => h1:fresh
+p[style-name='heading 2'] => h2:fresh
+p[style-name='heading 3'] => h3:fresh
+p[style-name='heading 4'] => h4:fresh
+p[style-name='heading 5'] => h5:fresh
+p[style-name='heading 6'] => h6:fresh
+
+# Apple Pages
+p.Heading => h1:fresh
+p[style-name='Heading'] => h1:fresh
+
+r[style-name='Strong'] => strong
+
+p[style-name='footnote text'] => p:fresh
+r[style-name='footnote reference'] =>
+p[style-name='endnote text'] => p:fresh
+r[style-name='endnote reference'] =>
+p[style-name='annotation text'] => p:fresh
+r[style-name='annotation reference'] =>
+
+# LibreOffice
+p[style-name='Footnote'] => p:fresh
+r[style-name='Footnote anchor'] =>
+p[style-name='Endnote'] => p:fresh
+r[style-name='Endnote anchor'] =>
+
+p:unordered-list(1) => ul > li:fresh
+p:unordered-list(2) => ul|ol > li > ul > li:fresh
+p:unordered-list(3) => ul|ol > li > ul|ol > li > ul > li:fresh
+p:unordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh
+p:unordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh
+p:ordered-list(1) => ol > li:fresh
+p:ordered-list(2) => ul|ol > li > ol > li:fresh
+p:ordered-list(3) => ul|ol > li > ul|ol > li > ol > li:fresh
+p:ordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh
+p:ordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh
+
+r[style-name='Hyperlink'] =>
+
+p[style-name='Normal'] => p:fresh
+
+# Apple Pages
+p.Body => p:fresh
+p[style-name='Body'] => p:fresh
+""")
+
+
+# The built-in mappings are expected to parse without producing any messages.
+assert not _default_style_map_result.messages
+_default_style_map = _default_style_map_result.value
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/raw_text.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/raw_text.py
@@ -0,0 +1,14 @@
+from . import documents
+
+
+def extract_raw_text_from_element(element):
+    if isinstance(element, documents.Text):
+        return element.value
+    elif isinstance(element, documents.Tab):
+        return "\t"
+    else:
+        text = "".join(map(extract_raw_text_from_element, getattr(element, "children", [])))
+        if isinstance(element, documents.Paragraph):
+            return text + "\n\n"
+        else:
+            return text
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/results.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/results.py
@@ -0,0 +1,42 @@
+import collections
+
+from .lists import unique
+
+
+class Result(object):
+    def __init__(self, value, messages):
+        self.value = value
+        self.messages = unique(messages)
+    
+    def map(self, func):
+        return Result(func(self.value), self.messages)
+    
+    def bind(self, func):
+        result = func(self.value)
+        return Result(result.value, self.messages + result.messages)
+
+
+# A message attached to a Result: ``type`` is a category string such as
+# "warning", ``message`` is the text.
+Message = collections.namedtuple("Message", ["type", "message"])
+
+
+def warning(message):
+    """Create a ``Message`` of type "warning"."""
+    return Message("warning", message)
+
+
+def success(value):
+    """Create a ``Result`` holding ``value`` and no messages."""
+    return Result(value, [])
+
+
+def combine(results):
+    values = []
+    messages = []
+    for result in results:
+        values.append(result.value)
+        for message in result.messages:
+            messages.append(message)
+        
+    return Result(values, messages)
+
+
+def map(func, *args):
+    """Combine the results in ``args`` and call ``func`` with their values as positional arguments."""
+    return combine(args).map(lambda values: func(*values))
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/init.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/init.py
@@ -0,0 +1,8 @@
+import collections
+
+
+def style(document_matcher, html_path):
+    """Create a ``Style`` pairing a document matcher with an HTML path."""
+    return Style(document_matcher, html_path)
+
+
+# One style mapping: which document elements it matches and the HTML path to emit.
+Style = collections.namedtuple("Style", ["document_matcher", "html_path"])
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/pycache/init.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/pycache/init.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/init.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/init.py
@@ -0,0 +1,14 @@
+from .errors import LineParseError
+from .style_mapping_parser import parse_style_mapping
+from .tokeniser import tokenise
+from .token_iterator import TokenIterator
+from ... import results
+
+
+def read_style_mapping(string):
+    try:
+        tokens = tokenise(string)
+        return results.success(parse_style_mapping(TokenIterator(tokens)))
+    except LineParseError:
+        warning = "Did not understand this style mapping, so ignored it: " + string
+        return results.Result(None, [results.warning(warning)])
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/init.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/init.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/document_matcher_parser.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/document_matcher_parser.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/errors.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/errors.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/html_path_parser.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/html_path_parser.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/style_mapping_parser.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/style_mapping_parser.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/token_iterator.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/token_iterator.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/token_parser.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/token_parser.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/tokeniser.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/pycache/tokeniser.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/document_matcher_parser.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/document_matcher_parser.py
@@ -0,0 +1,130 @@
+from ... import documents, document_matchers
+from .errors import LineParseError
+from .tokeniser import TokenType
+from .token_parser import try_parse_class_name, parse_string
+
+
+def parse_document_matcher(tokens):
+    if tokens.try_skip(TokenType.IDENTIFIER, "p"):
+        style_id = try_parse_class_name(tokens)
+        style_name = _parse_style_name(tokens)
+        numbering = _parse_numbering(tokens)
+
+        return document_matchers.paragraph(
+            style_id=style_id,
+            style_name=style_name,
+            numbering=numbering,
+        )
+
+    elif tokens.try_skip(TokenType.IDENTIFIER, "r"):
+        style_id = try_parse_class_name(tokens)
+        style_name = _parse_style_name(tokens)
+
+        return document_matchers.run(
+            style_id=style_id,
+            style_name=style_name,
+        )
+
+    elif tokens.try_skip(TokenType.IDENTIFIER, "table"):
+        style_id = try_parse_class_name(tokens)
+        style_name = _parse_style_name(tokens)
+
+        return document_matchers.table(
+            style_id=style_id,
+            style_name=style_name,
+        )
+
+    elif tokens.try_skip(TokenType.IDENTIFIER, "b"):
+        return document_matchers.bold
+
+    elif tokens.try_skip(TokenType.IDENTIFIER, "i"):
+        return document_matchers.italic
+
+    elif tokens.try_skip(TokenType.IDENTIFIER, "u"):
+        return document_matchers.underline
+
+    elif tokens.try_skip(TokenType.IDENTIFIER, "strike"):
+        return document_matchers.strikethrough
+
+    elif tokens.try_skip(TokenType.IDENTIFIER, "all-caps"):
+        return document_matchers.all_caps
+
+    elif tokens.try_skip(TokenType.IDENTIFIER, "small-caps"):
+        return document_matchers.small_caps
+
+    elif tokens.try_skip(TokenType.IDENTIFIER, "highlight"):
+        return _parse_highlight(tokens)
+
+    elif tokens.try_skip(TokenType.IDENTIFIER, "comment-reference"):
+        return document_matchers.comment_reference
+
+    elif tokens.try_skip(TokenType.IDENTIFIER, "br"):
+        return _parse_break(tokens)
+
+    else:
+        raise LineParseError("Unrecognised document element: {0}".format(tokens.next_value(TokenType.IDENTIFIER)))
+
+def _parse_style_name(tokens):
+    if tokens.try_skip(TokenType.SYMBOL, "["):
+        tokens.skip(TokenType.IDENTIFIER, "style-name")
+        string_matcher = _parse_string_matcher(tokens)
+        tokens.skip(TokenType.SYMBOL, "]")
+        return string_matcher
+    else:
+        return None
+
+
+def _parse_string_matcher(tokens):
+    if tokens.try_skip(TokenType.SYMBOL, "="):
+        return document_matchers.equal_to(parse_string(tokens))
+    elif tokens.try_skip(TokenType.SYMBOL, "^="):
+        return document_matchers.starts_with(parse_string(tokens))
+    else:
+        raise LineParseError("Unrecognised string matcher: {0}".format(tokens.next_value()))
+
+def _parse_numbering(tokens):
+    if tokens.try_skip(TokenType.SYMBOL, ":"):
+        is_ordered = _parse_list_type(tokens)
+        tokens.skip(TokenType.SYMBOL, "(")
+        level = int(tokens.next_value(TokenType.INTEGER)) - 1
+        tokens.skip(TokenType.SYMBOL, ")")
+        return documents.numbering_level(level, is_ordered=is_ordered)
+
+
+def _parse_list_type(tokens):
+    list_type = tokens.next_value(TokenType.IDENTIFIER)
+    if list_type == "ordered-list":
+        return True
+    elif list_type == "unordered-list":
+        return False
+    else:
+        raise LineParseError("Unrecognised list type: {0}".format(list_type))
+
+
+def _parse_highlight(tokens):
+    if tokens.try_skip(TokenType.SYMBOL, "["):
+        tokens.skip(TokenType.IDENTIFIER, "color")
+        tokens.skip(TokenType.SYMBOL, "=")
+        color = parse_string(tokens)
+        tokens.skip(TokenType.SYMBOL, "]");
+    else:
+        color = None
+
+    return document_matchers.highlight(color=color)
+
+
+def _parse_break(tokens):
+    tokens.skip(TokenType.SYMBOL, "[")
+    tokens.skip(TokenType.IDENTIFIER, "type")
+    tokens.skip(TokenType.SYMBOL, "=")
+    type_name = parse_string(tokens)
+    tokens.skip(TokenType.SYMBOL, "]");
+
+    if type_name == "line":
+        return document_matchers.line_break
+    elif type_name == "page":
+        return document_matchers.page_break
+    elif type_name == "column":
+        return document_matchers.column_break
+    else:
+        raise LineParseError("Unrecognised break type: {0}".format(type_name))
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/errors.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/errors.py
@@ -0,0 +1,2 @@
+class LineParseError(Exception):
+    """Raised when a style-mapping line cannot be parsed."""
+    pass
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/html_path_parser.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/html_path_parser.py
@@ -0,0 +1,120 @@
+import cobble
+
+from ... import html_paths
+from .tokeniser import TokenType
+from .token_parser import parse_identifier, parse_string
+
+
+@cobble.data
+class _AttributeOrClassName(object):
+    """One parsed ``[name='value']`` attribute or ``.class-name`` of a path element."""
+    name = cobble.field()
+    value = cobble.field()
+    # True when the value is appended to an existing value of the same name
+    # (used for class names) instead of replacing it.
+    append = cobble.field()
+
+
+def parse_html_path(tokens):
+    if tokens.try_skip(TokenType.SYMBOL, "!"):
+        return html_paths.ignore
+    else:
+        return html_paths.path(_parse_html_path_elements(tokens))
+
+
+def _parse_html_path_elements(tokens):
+    elements = []
+
+    if tokens.peek_token_type() == TokenType.IDENTIFIER:
+        elements.append(_parse_element(tokens))
+
+        while tokens.try_skip_many(((TokenType.WHITESPACE, None), (TokenType.SYMBOL, ">"))):
+            tokens.skip(TokenType.WHITESPACE)
+            elements.append(_parse_element(tokens))
+
+    return elements
+
+
+def _parse_element(tokens):
+    tag_names = _parse_tag_names(tokens)
+    attributes_list = _parse_attribute_or_class_names(tokens)
+    is_fresh = _parse_is_fresh(tokens)
+    separator = _parse_separator(tokens)
+
+    attributes = {}
+    for attribute in attributes_list:
+        if attribute.append and attributes.get(attribute.name):
+            attributes[attribute.name] += " " + attribute.value
+        else:
+            attributes[attribute.name] = attribute.value
+
+    return html_paths.element(
+        tag_names,
+        attributes=attributes,
+        fresh=is_fresh,
+        separator=separator,
+    )
+
+
+def _parse_tag_names(tokens):
+    tag_names = [parse_identifier(tokens)]
+
+    while tokens.try_skip(TokenType.SYMBOL, "|"):
+        tag_names.append(parse_identifier(tokens))
+
+    return tag_names
+
+
+def _parse_attribute_or_class_names(tokens):
+    attribute_or_class_names = []
+
+    while True:
+        attribute_or_class_name = _try_parse_attribute_or_class_name(tokens)
+        if attribute_or_class_name is None:
+            break
+        else:
+            attribute_or_class_names.append(attribute_or_class_name)
+
+    return attribute_or_class_names
+
+
+def _try_parse_attribute_or_class_name(tokens):
+    if tokens.is_next(TokenType.SYMBOL, "["):
+        return _parse_attribute(tokens)
+    if tokens.is_next(TokenType.SYMBOL, "."):
+        return _parse_class_name(tokens)
+    else:
+        return None
+
+
+def _parse_attribute(tokens):
+    """Parse ``[name='value']`` into a non-appending attribute entry."""
+    tokens.skip(TokenType.SYMBOL, "[")
+    name = parse_identifier(tokens)
+    tokens.skip(TokenType.SYMBOL, "=")
+    value = parse_string(tokens)
+    tokens.skip(TokenType.SYMBOL, "]")
+    return _AttributeOrClassName(name=name, value=value, append=False)
+
+
+def _parse_class_name(tokens):
+    """Parse ``.class-name`` into an appending ``class`` attribute entry."""
+    tokens.skip(TokenType.SYMBOL, ".")
+    class_name = parse_identifier(tokens)
+    return _AttributeOrClassName(name="class", value=class_name, append=True)
+
+
+def _parse_is_fresh(tokens):
+    """Consume ``:fresh`` if it comes next; return whether it was present."""
+    return tokens.try_skip_many((
+        (TokenType.SYMBOL, ":"),
+        (TokenType.IDENTIFIER, "fresh"),
+    ))
+
+
+def _parse_separator(tokens):
+    is_separator = tokens.try_skip_many((
+        (TokenType.SYMBOL, ":"),
+        (TokenType.IDENTIFIER, "separator"),
+    ))
+    if is_separator:
+        tokens.skip(TokenType.SYMBOL, "(")
+        value = parse_string(tokens)
+        tokens.skip(TokenType.SYMBOL, ")")
+        return value
+    else:
+        return None
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/style_mapping_parser.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/style_mapping_parser.py
@@ -0,0 +1,15 @@
+from .tokeniser import TokenType
+from .document_matcher_parser import parse_document_matcher
+from .html_path_parser import parse_html_path
+from ...styles import Style
+
+
+def parse_style_mapping(tokens):
+    document_matcher = parse_document_matcher(tokens)
+    tokens.skip(TokenType.WHITESPACE)
+    tokens.skip(TokenType.SYMBOL, "=>")
+    tokens.try_skip(TokenType.WHITESPACE)
+    html_path = parse_html_path(tokens)
+    tokens.skip(TokenType.END)
+    
+    return Style(document_matcher, html_path)
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/token_iterator.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/token_iterator.py
@@ -0,0 +1,59 @@
+# TODO: check indices
+# TODO: proper tests for unexpected tokens
+
+from .errors import LineParseError
+
+
+class TokenIterator(object):
+    def __init__(self, tokens):
+        self._tokens = tokens
+        self._index = 0
+
+    def peek_token_type(self):
+        return self._tokens[self._index].type
+
+    def next_value(self, token_type=None):
+        return self._next(token_type).value
+
+    def _next(self, token_type=None):
+        token = self._tokens[self._index]
+        if token_type is None or token.type == token_type:
+            self._index += 1
+            return token
+        else:
+            raise self._unexpected_token_type(token_type, token)
+
+    def skip(self, token_type, token_value=None):
+        token = self._tokens[self._index]
+        if token.type == token_type and (token_value is None or token.value == token_value):
+            self._index += 1
+            return True
+        else:
+            raise self._unexpected_token_type(token_type, token)
+
+    def try_skip(self, token_type, token_value=None):
+        if self.is_next(token_type, token_value):
+            self._index += 1
+            return True
+        else:
+            return False
+
+    def try_skip_many(self, tokens):
+        start = self._index
+        for token_type, token_value in tokens:
+            token = self._tokens[self._index]
+            if not (token.type == token_type and (token_value is None or token.value == token_value)):
+                self._index = start
+                return False
+            else:
+                self._index += 1
+
+        return True
+
+    def is_next(self, token_type, token_value=None):
+        token = self._tokens[self._index]
+        return token.type == token_type and (token_value is None or token.value == token_value)
+
+    def _unexpected_token_type(self, token_type, token):
+        raise LineParseError()
+
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/token_parser.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/token_parser.py
@@ -0,0 +1,37 @@
+import re
+
+from .tokeniser import TokenType
+
+
+def try_parse_class_name(tokens):
+    if tokens.try_skip(TokenType.SYMBOL, "."):
+        return parse_identifier(tokens)
+    else:
+        return None
+
+
+def parse_identifier(tokens):
+    """Consume an identifier token and return it with escape sequences decoded."""
+    return decode_escape_sequences(tokens.next_value(TokenType.IDENTIFIER))
+
+
+def parse_string(tokens):
+    """Consume a string token and return its contents, unquoted and unescaped."""
+    # [1:-1] drops the surrounding quote characters.
+    return decode_escape_sequences(tokens.next_value(TokenType.STRING)[1:-1])
+
+
+_ESCAPE_SEQUENCE_REGEX = re.compile(r"\\(.)")
+
+
+def decode_escape_sequences(value):
+    return _ESCAPE_SEQUENCE_REGEX.sub(_decode_escape_sequence, value)
+    
+    
+def _decode_escape_sequence(match):
+    code = match.group(1)
+    if code == "n":
+        return "\n"
+    elif code == "r":
+        return "\r"
+    elif code == "t":
+        return "\t"
+    else:
+        return code
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/tokeniser.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/styles/parser/tokeniser.py
@@ -0,0 +1,61 @@
+import collections
+import re
+
+
+# A lexed token: where it starts in the input, its TokenType and its raw text.
+Token = collections.namedtuple("Token", ["character_index", "type", "value"])
+
+
+class TokenType(object):
+    """Names of the token categories produced by the tokeniser."""
+    IDENTIFIER = "identifier"
+    SYMBOL = "symbol"
+    WHITESPACE = "whitespace"
+    STRING = "string"
+    UNTERMINATED_STRING = "unterminated string"
+    INTEGER = "integer"
+    END = "end"
+    
+
+
+def regex_tokeniser(rules):
+    rules = [(token_type, _to_regex(regex)) for token_type, regex in rules]
+    rules.append(("unknown", re.compile(".")))
+    
+    def tokenise(value):
+        tokens = []
+        index = 0
+        while index < len(value):
+            for token_type, regex in rules:
+                match = regex.match(value, index)
+                if match is not None:
+                    tokens.append(Token(index, token_type, match.group(0)))
+                    index = match.end()
+                    break
+            else:
+                # Should be impossible
+                raise Exception("Remaining: " + value[index:])
+
+        tokens.append(Token(index, TokenType.END, ""))
+
+        return tokens
+
+    return tokenise
+    
+
+def _to_regex(value):
+    if hasattr(value, "match"):
+        return value
+    else:
+        return re.compile(value)
+
+
+# Opening quote followed by any mix of backslash escapes and non-quote characters.
+_string_prefix = r"'(?:\\.|[^'])*"
+# A letter, "-", "_" or a backslash escape.
+_identifier_character = r"(?:[a-zA-Z\-_]|\\.)"
+
+# Rules are tried in order, so a closed string is preferred over an
+# unterminated one.
+tokenise = regex_tokeniser([
+    (TokenType.IDENTIFIER, _identifier_character + "(?:" + _identifier_character + "|[0-9])*"),
+    (TokenType.SYMBOL, r":|>|=>|\^=|=|\(|\)|\[|\]|\||!|\."),
+    (TokenType.WHITESPACE, r"\s+"),
+    (TokenType.STRING, _string_prefix + "'"),
+    (TokenType.UNTERMINATED_STRING, _string_prefix),
+    (TokenType.INTEGER, "([0-9]+)"),
+])
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/transforms.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/transforms.py
@@ -0,0 +1,56 @@
+from . import documents
+
+
+def paragraph(transform_paragraph):
+    """Build a document transform that applies ``transform_paragraph`` to every paragraph."""
+    return element_of_type(documents.Paragraph, transform_paragraph)
+
+
+def run(transform_run):
+    """Build a document transform that applies ``transform_run`` to every run."""
+    return element_of_type(documents.Run, transform_run)
+
+
+def element_of_type(element_type, transform):
+    def transform_element(element):
+        if isinstance(element, element_type):
+            return transform(element)
+        else:
+            return element
+
+    return _each_element(transform_element)
+
+
+def _each_element(transform_element):
+    def transform_element_and_children(element):
+        if isinstance(element, (documents.HasChildren, documents.TableCellUnmerged)):
+            children = list(map(transform_element_and_children, element.children))
+            element = element.copy(children=children)
+
+        return transform_element(element)
+
+    return transform_element_and_children
+
+
+def get_descendants_of_type(element, element_type):
+    return list(filter(
+        lambda descendant: isinstance(descendant, element_type),
+        get_descendants(element),
+    ))
+
+
+def get_descendants(element):
+    descendants = []
+
+    def visit(element):
+        descendants.append(element)
+
+    _visit_descendants(element, visit)
+
+    return descendants
+
+
+def _visit_descendants(element, visit):
+    if isinstance(element, documents.HasChildren):
+        for child in element.children:
+            _visit_descendants(child, visit)
+            visit(child)
+
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/underline.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/underline.py
@@ -0,0 +1,8 @@
+from . import html
+
+
+def element(name):
+    def convert_underline(nodes):
+        return [html.collapsible_element(name, {}, nodes)]
+        
+    return convert_underline
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/writers/init.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/writers/init.py
@@ -0,0 +1,19 @@
+from .html import HtmlWriter
+from .markdown import MarkdownWriter
+
+
+def writer(output_format=None):
+    if output_format is None:
+        output_format = "html"
+    
+    return _writers[output_format]()
+
+
+def formats():
+    """Return the registered output format names (the keys of ``_writers``)."""
+    return _writers.keys()
+
+
+# Registry mapping an output format name to the writer class that is
+# instantiated for it.
+_writers = {
+    "html": HtmlWriter,
+    "markdown": MarkdownWriter,
+}
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/writers/pycache/init.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/writers/pycache/init.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/writers/pycache/abc.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/writers/pycache/abc.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/writers/pycache/html.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/writers/pycache/html.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/writers/pycache/markdown.cpython-312.pyc
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/writers/pycache/markdown.cpython-312.pyc
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/writers/abc.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/writers/abc.py
@@ -0,0 +1,31 @@
+from __future__ import absolute_import
+
+import abc
+
+
+class Writer(object):
+    __metaclass__ = abc.ABCMeta
+    
+    @abc.abstractmethod
+    def text(self, text):
+        pass
+    
+    @abc.abstractmethod
+    def start(self, name, attributes=None):
+        pass
+
+    @abc.abstractmethod
+    def end(self, name):
+        pass
+    
+    @abc.abstractmethod
+    def self_closing(self, name, attributes=None):
+        pass
+    
+    @abc.abstractmethod
+    def append(self, html):
+        pass
+    
+    @abc.abstractmethod
+    def as_string(self):
+        pass
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/writers/html.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/writers/html.py
@@ -0,0 +1,43 @@
+from __future__ import unicode_literals
+from xml.sax.saxutils import escape
+
+from .abc import Writer
+
+
+class HtmlWriter(Writer):
+    def __init__(self):
+        self._fragments = []
+    
+    def text(self, text):
+        self._fragments.append(_escape_html(text))
+    
+    def start(self, name, attributes=None):
+        attribute_string = _generate_attribute_string(attributes)
+        self._fragments.append("<{0}{1}>".format(name, attribute_string))
+
+    def end(self, name):
+        self._fragments.append("</{0}>".format(name))
+    
+    def self_closing(self, name, attributes=None):
+        attribute_string = _generate_attribute_string(attributes)
+        self._fragments.append("<{0}{1} />".format(name, attribute_string))
+    
+    def append(self, html):
+        self._fragments.append(html)
+    
+    def as_string(self):
+        return "".join(self._fragments)
+
+
+def _escape_html(text):
+    return escape(text, {'"': "&quot;"})
+
+
+def _generate_attribute_string(attributes):
+    if attributes is None:
+        return ""
+    else:
+        return "".join(
+            ' {0}="{1}"'.format(key, _escape_html(attributes[key]))
+            for key in sorted(attributes)
+        )
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/writers/markdown.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/writers/markdown.py
@@ -0,0 +1,203 @@
+from __future__ import unicode_literals
+
+from .abc import Writer
+
+import re
+
+
+class _WriterOutput(object):
+    def __init__(self, start, end=None, generate_end=None, anchor_position=None):
+        if generate_end is None:
+            generate_end = _constant(end)
+        
+        self.start = start
+        self.generate_end = generate_end
+        self.anchor_position = anchor_position
+
+
+def _constant(value):
+    def get():
+        return value
+    
+    return get
+
+
+class _MarkdownState(object):
+    def __init__(self):
+        self._list_state_stack = []
+        self.list_state = None
+        self.list_item_has_closed = False
+    
+    def update_list_state(self, list_state):
+        self._list_state_stack.append(self.list_state)
+        self.list_state = list_state
+    
+    def pop_list_state(self):
+        self.list_state = self._list_state_stack.pop()
+
+
+class _MarkdownListState(object):
+    def __init__(self, ordered, indentation):
+        self.ordered = ordered
+        self.count = 0
+        self.indentation = indentation
+
+
+def _symmetric_wrapped(end):
+    return _Wrapped(end, end)
+
+
+class _Wrapped(object):
+    def __init__(self, start, end):
+        self._start = start
+        self._end = end
+    
+    def __call__(self, attributes, markdown_state):
+        return _WriterOutput(self._start, self._end)
+
+
+def _hyperlink(attributes, markdown_state):
+    href = attributes.get("href", "")
+    if href:
+        return _WriterOutput(
+            "[", "]({0})".format(href),
+            anchor_position="before",
+        )
+    else:
+        return _default_output
+
+
+def _image(attributes, markdown_state):
+    src = attributes.get("src", "")
+    alt_text = attributes.get("alt", "")
+    if src or alt_text:
+        return _WriterOutput("![{0}]({1})".format(alt_text, src), "")
+    else:
+        return _default_output
+
+
+def _list(ordered):
+    def call(attributes, markdown_state):
+        if markdown_state.list_state is None:
+            start = ""
+            end_text = "\n"
+            indentation = 0
+        else:
+            start = "\n"
+            end_text = ""
+            indentation = markdown_state.list_state.indentation + 1
+        
+        def generate_end():
+            markdown_state.pop_list_state()
+            return end_text
+        
+        markdown_state.update_list_state(_MarkdownListState(
+            ordered=ordered,
+            indentation=indentation,
+        ))
+        
+        return _WriterOutput(start, generate_end=generate_end)
+    
+    return call
+
+
+def _list_item(attributes, markdown_state):
+    markdown_state.list_item_has_closed = False
+    
+    list_state = markdown_state.list_state or _MarkdownListState(ordered=False, indentation=0)
+    list_state.count += 1
+    
+    if list_state.ordered:
+        bullet = "{0}.".format(list_state.count)
+    else:
+        bullet = "-"
+    
+    def generate_end():
+        if markdown_state.list_item_has_closed:
+            return ""
+        else:
+            markdown_state.list_item_has_closed = True
+            return "\n"
+    
+    return _WriterOutput(
+        start=("\t" * list_state.indentation) + bullet + " ",
+        generate_end=generate_end
+    )
+
+
+def _init_writers():
+    writers = {
+        "p": _Wrapped("", "\n\n"),
+        "br": _Wrapped("", "  \n"),
+        "strong": _symmetric_wrapped("__"),
+        "em": _symmetric_wrapped("*"),
+        "a": _hyperlink,
+        "img": _image,
+        "ol": _list(ordered=True),
+        "ul": _list(ordered=False),
+        "li": _list_item,
+    }
+    
+    for level in range(1, 7):
+        writers["h{0}".format(level)] = _Wrapped("#" * level + " ", "\n\n")
+    
+    return writers
+
+
+# Element name -> element writer table, built once when the module is imported.
+_writers = _init_writers()
+# Shared output with empty start and end text, returned when an element
+# contributes no markdown of its own.
+_default_output = _WriterOutput("", "")
+
+def _default_writer(attributes, markdown_state):
+    """Element writer that ignores its arguments and returns ``_default_output``."""
+    return _default_output
+
+
+class MarkdownWriter(Writer):
+    def __init__(self):
+        self._fragments = []
+        self._element_stack = []
+        self._markdown_state = _MarkdownState()
+    
+    def text(self, text):
+        self._fragments.append(_escape_markdown(text))
+    
+    def start(self, name, attributes=None):
+        if attributes is None:
+            attributes = {}
+        
+        output = _writers.get(name, _default_writer)(attributes, self._markdown_state)
+        self._element_stack.append(output.generate_end)
+        
+        anchor_before_start = output.anchor_position == "before"
+        if anchor_before_start:
+            self._write_anchor(attributes)
+        
+        self._fragments.append(output.start)
+        
+        if not anchor_before_start:
+            self._write_anchor(attributes)
+        
+        
+
+    def end(self, name):
+        end = self._element_stack.pop()
+        output = end()
+        self._fragments.append(output)
+    
+    def self_closing(self, name, attributes=None):
+        self.start(name, attributes)
+        self.end(name)
+    
+    def append(self, other):
+        self._fragments.append(other)
+    
+    def as_string(self):
+        return "".join(self._fragments)
+    
+    def _write_anchor(self, attributes):
+        html_id = attributes.get("id")
+        if html_id:
+            self._fragments.append('<a id="{0}"></a>'.format(html_id))
+
+
+def _escape_markdown(value):
+    return re.sub(r"([\`\*_\{\}\[\]\(\)\#\+\-\.\!])", r"\\\1", re.sub("\\\\", "\\\\\\\\", value))
--- a/path/to/venv/lib/python3.12/site-packages/mammoth/zips.py
+++ b/path/to/venv/lib/python3.12/site-packages/mammoth/zips.py
@@ -0,0 +1,77 @@
+import contextlib
+import io
+import shutil
+
+from zipfile import ZipFile
+
+
+def open_zip(fileobj, mode):
+    return _Zip(ZipFile(fileobj, mode))
+
+
+class _Zip(object):
+    def __init__(self, zip_file):
+        self._zip_file = zip_file
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, *args):
+        self._zip_file.close()
+
+    def open(self, name):
+        return contextlib.closing(self._zip_file.open(name))
+
+    def exists(self, name):
+        try:
+            self._zip_file.getinfo(name)
+            return True
+        except KeyError:
+            return False
+
+    def read_str(self, name):
+        return self._zip_file.read(name).decode("utf8")
+
+
+def update_zip(fileobj, files):
+    source = ZipFile(fileobj, "r")
+    try:
+        destination_fileobj = io.BytesIO()
+        destination = ZipFile(destination_fileobj, "w")
+        try:
+            names = set(source.namelist()) | set(files.keys())
+            for name in names:
+                if name in files:
+                    contents = files[name]
+                else:
+                    contents = source.read(name)
+                destination.writestr(name, contents)
+        finally:
+            destination.close()
+    finally:
+        source.close()
+    
+    fileobj.seek(0)
+    destination_fileobj.seek(0)
+    shutil.copyfileobj(destination_fileobj, fileobj)
+
+
+def split_path(path):
+    parts = path.rsplit("/", 1)
+    if len(parts) == 1:
+        return ("", path)
+    else:
+        return tuple(parts)
+
+
+def join_path(*args):
+    non_empty_paths = list(filter(None, args))
+    
+    relevant_paths = []
+    for path in non_empty_paths:
+        if path.startswith("/"):
+            relevant_paths = [path]
+        else:
+            relevant_paths.append(path)
+    
+    return "/".join(relevant_paths)