Initial commit (Clean history)

This commit is contained in:
anhduy-tech
2025-12-30 11:27:14 +07:00
commit ef48c93de0
19255 changed files with 3248867 additions and 0 deletions

View File

@@ -0,0 +1,211 @@
from functools import partial
import os
import cobble
from .. import results, lists, zips
from .document_xml import read_document_xml_element
from .content_types_xml import empty_content_types, read_content_types_xml_element
from .relationships_xml import read_relationships_xml_element, Relationships
from .numbering_xml import read_numbering_xml_element, Numbering
from .styles_xml import read_styles_xml_element, Styles
from .notes_xml import read_endnotes_xml_element, read_footnotes_xml_element
from .comments_xml import read_comments_xml_element
from .files import Files
from . import body_xml, office_xml
from ..zips import open_zip
# Successful result carrying no elements; passed below as the default when the
# footnotes, endnotes or comments part is missing from the package.
_empty_result = results.success([])
def read(fileobj, external_file_access=False):
    """Open *fileobj* as a zip package and read its main document.

    Notes and comments are read first; once both are available they are
    handed to the main document reader.

    :param fileobj: file object to open as a zip; its ``name`` attribute, if
        present, is passed on as the document path.
    :param external_file_access: forwarded to the part reader.
    :returns: the combined notes/comments result bound onto the result of
        ``_read_document``.
    """
    archive = open_zip(fileobj, "r")
    paths = _find_part_paths(archive)
    part_reader = _part_with_body_reader(
        getattr(fileobj, "name", None),
        archive,
        part_paths=paths,
        external_file_access=external_file_access,
    )

    def read_main_document(referents):
        # referents is [notes, comments], in the order combined below.
        return _read_document(
            archive,
            part_reader,
            notes=referents[0],
            comments=referents[1],
            part_paths=paths,
        )

    referents_result = results.combine([
        _read_notes(part_reader, paths),
        _read_comments(part_reader, paths),
    ])
    return referents_result.bind(read_main_document)
@cobble.data
class _PartPaths(object):
    """Zip entry paths of the package parts consumed by this reader."""

    # Path of the main document part.
    main_document = cobble.field()
    # Paths of the parts related to the main document.
    comments = cobble.field()
    endnotes = cobble.field()
    footnotes = cobble.field()
    numbering = cobble.field()
    styles = cobble.field()
def _find_part_paths(zip_file):
    """Locate the main document and its related parts inside *zip_file*.

    The main document is found through the package relationships
    (``_rels/.rels``); the other parts are found through the main document's
    own relationships, each falling back to ``word/<name>.xml``.

    :returns: a ``_PartPaths`` instance.
    """
    package_rels = _read_relationships(zip_file, "_rels/.rels")
    main_path = _find_document_filename(zip_file, package_rels)
    main_rels = _read_relationships(
        zip_file,
        _find_relationships_path_for(main_path),
    )
    type_prefix = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/"

    def find(name):
        return _find_part_path(
            zip_file=zip_file,
            relationships=main_rels,
            relationship_type=type_prefix + name,
            fallback_path="word/{0}.xml".format(name),
            # Related part targets are resolved against the main document's directory.
            base_path=zips.split_path(main_path)[0],
        )

    related = dict(
        (name, find(name))
        for name in ["comments", "endnotes", "footnotes", "numbering", "styles"]
    )
    return _PartPaths(main_document=main_path, **related)
def _find_document_filename(zip_file, relationships):
    """Return the zip path of the main document part.

    :raises IOError: if the resolved path does not exist in *zip_file*.
    """
    candidate = _find_part_path(
        zip_file,
        relationships,
        relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
        base_path="",
        fallback_path="word/document.xml",
    )
    if not zip_file.exists(candidate):
        raise IOError("Could not find main document part. Are you sure this is a valid .docx file?")
    return candidate
def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
    """Return the first relationship target of *relationship_type* present in the zip.

    Each target is joined onto *base_path* and stripped of leading slashes
    before being checked. *fallback_path* is returned when no target exists.
    """
    existing = [
        candidate
        for candidate in (
            zips.join_path(base_path, target).lstrip("/")
            for target in relationships.find_targets_by_type(relationship_type)
        )
        if zip_file.exists(candidate)
    ]
    if existing:
        return existing[0]
    return fallback_path
def _read_notes(read_part_with_body, part_paths):
    """Read the footnotes and endnotes parts into one flattened result.

    A missing part contributes the empty result.
    """
    def read_footnotes(root, body_reader):
        return read_footnotes_xml_element(root, body_reader=body_reader)

    def read_endnotes(root, body_reader):
        return read_endnotes_xml_element(root, body_reader=body_reader)

    footnotes_result = read_part_with_body(
        part_paths.footnotes,
        read_footnotes,
        default=_empty_result,
    )
    endnotes_result = read_part_with_body(
        part_paths.endnotes,
        read_endnotes,
        default=_empty_result,
    )
    combined = results.combine([footnotes_result, endnotes_result])
    return combined.map(lists.flatten)
def _read_comments(read_part_with_body, part_paths):
    """Read the comments part, or return the empty result when it is missing."""
    def read_comments(root, body_reader):
        return read_comments_xml_element(root, body_reader=body_reader)

    return read_part_with_body(
        part_paths.comments,
        read_comments,
        default=_empty_result,
    )
def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
    """Read the main document part, supplying the already-read notes and comments.

    *zip_file* is accepted but not used here. No default is passed to the
    part reader for the main document.
    """
    document_reader = partial(
        read_document_xml_element,
        notes=notes,
        comments=comments,
    )
    return read_part_with_body(part_paths.main_document, document_reader)
def _part_with_body_reader(document_path, zip_file, part_paths, external_file_access):
    """Build a function that reads a part of the package with a body reader.

    Content types, styles and numbering are read once here and shared by
    every part; relationships are read per part.

    :param document_path: path of the docx file on disk, or ``None``; its
        directory becomes the base for ``Files``.
    :returns: ``read_part(name, reader, default=...)``. Without a default the
        entry is read unconditionally; with one, the default is returned when
        the entry does not exist.
    """
    content_types = _try_read_entry_or_default(
        zip_file,
        "[Content_Types].xml",
        read_content_types_xml_element,
        empty_content_types,
    )
    styles = _try_read_entry_or_default(
        zip_file,
        part_paths.styles,
        read_styles_xml_element,
        Styles.EMPTY,
    )

    def read_numbering(root):
        return read_numbering_xml_element(root, styles=styles)

    numbering = _try_read_entry_or_default(
        zip_file,
        part_paths.numbering,
        read_numbering,
        default=Numbering.EMPTY,
    )

    if document_path is None:
        base_directory = None
    else:
        base_directory = os.path.dirname(document_path)
    files = Files(base_directory, external_file_access=external_file_access)

    def read_part(name, reader, default=_undefined):
        part_relationships = _read_relationships(zip_file, _find_relationships_path_for(name))
        body_reader = body_xml.reader(
            numbering=numbering,
            content_types=content_types,
            relationships=part_relationships,
            styles=styles,
            docx_file=zip_file,
            files=files,
        )
        bound_reader = partial(reader, body_reader=body_reader)
        # The sentinel tells "no default given" apart from any real default value.
        if default is _undefined:
            return _read_entry(zip_file, name, bound_reader)
        return _try_read_entry_or_default(zip_file, name, bound_reader, default=default)

    return read_part
def _find_relationships_path_for(name):
    """Return the ``_rels/<basename>.rels`` path that sits beside the part *name*."""
    directory, filename = zips.split_path(name)
    rels_filename = filename + ".rels"
    return zips.join_path(directory, "_rels", rels_filename)
def _read_relationships(zip_file, name):
    """Read the relationships entry *name*, or ``Relationships.EMPTY`` if it is absent."""
    fallback = Relationships.EMPTY
    return _try_read_entry_or_default(
        zip_file,
        name,
        read_relationships_xml_element,
        default=fallback,
    )
def _try_read_entry_or_default(zip_file, name, reader, default):
    """Read entry *name* with *reader* if it exists in the zip, else return *default*."""
    if not zip_file.exists(name):
        return default
    return _read_entry(zip_file, name, reader)
def _read_entry(zip_file, name, reader):
    """Parse zip entry *name* as office XML and pass the parsed value to *reader*."""
    with zip_file.open(name) as entry:
        root = office_xml.read(entry)
        # reader runs while the entry is still open, as before.
        return reader(root)
# Unique sentinel: read_part compares its `default` argument against this by
# identity to detect that the caller passed no default.
_undefined = object()

View File

@@ -0,0 +1,794 @@
import contextlib
import re
import sys
from .. import documents
from .. import results
from .. import lists
from .. import transforms
from . import complex_fields
from .dingbats import dingbats
from .xmlparser import node_types, XmlElement, null_xml_element
from .styles_xml import Styles
from .uris import replace_fragment, uri_to_zip_entry_name
# On Python 3, alias unichr to chr so the handlers below can call unichr().
if sys.version_info[0] >= 3:
    unichr = chr
def reader(
    numbering=None,
    content_types=None,
    relationships=None,
    styles=None,
    docx_file=None,
    files=None
):
    """Create a body reader over the given document context.

    ``styles`` falls back to ``Styles.EMPTY`` when it is ``None``; every
    other argument is forwarded unchanged.

    :returns: a ``_BodyReader``.
    """
    effective_styles = Styles.EMPTY if styles is None else styles
    return _BodyReader(_create_reader(
        numbering=numbering,
        content_types=content_types,
        relationships=relationships,
        styles=effective_styles,
        docx_file=docx_file,
        files=files,
    ))
class _BodyReader(object):
    """Wraps the internal element reader behind a ``results.Result`` API."""

    def __init__(self, read_all):
        self._read_all = read_all

    def read_all(self, elements):
        """Read *elements* and return a ``results.Result``.

        Only the elements and messages of the internal result are passed on;
        its ``extra`` list is not.
        """
        internal = self._read_all(elements)
        return results.Result(internal.elements, internal.messages)
# Builds the element-reading closure used by _BodyReader. The handlers defined
# inside share the three mutable lists below, so one closure carries state
# across the elements it reads.
def _create_reader(numbering, content_types, relationships, styles, docx_file, files):
# Text of the w:instrText elements seen since the last complex field began.
current_instr_text = []
# Complex fields that have begun and not yet ended, innermost last.
complex_field_stack = []
# When a paragraph is marked as deleted, its contents should be combined
# with the following paragraph. See 17.13.5.15 del (Deleted Paragraph) of
# ECMA-376 4th edition Part 1.
deleted_paragraph_contents = []
# Element names that are skipped without producing a warning.
_ignored_elements = set([
"office-word:wrap",
"v:shadow",
"v:shapetype",
"w:annotationRef",
"w:bookmarkEnd",
"w:sectPr",
"w:proofErr",
"w:lastRenderedPageBreak",
"w:commentRangeStart",
"w:commentRangeEnd",
"w:del",
"w:footnoteRef",
"w:endnoteRef",
"w:pPr",
"w:rPr",
"w:tblPr",
"w:tblGrid",
"w:trPr",
"w:tcPr",
])
def text(element):
    """Read a w:t element as a text node holding its inner text."""
    value = _inner_text(element)
    return _success(documents.Text(value))
def run(element):
    """Read a w:r element into a run with its formatting and children.

    Formatting is taken from the w:rPr child. If a hyperlink complex field is
    open once the children have been read, they are wrapped in a hyperlink.
    """
    properties = element.find_child_or_null("w:rPr")

    def attribute_of(child_name, attribute_name):
        return properties.find_child_or_null(child_name).attributes.get(attribute_name)

    raw_font_size = attribute_of("w:sz", "w:val")
    # w:sz gives the font size in half points, so halve the value to get the size in points
    font_size = int(raw_font_size) / 2 if _is_int(raw_font_size) else None

    formatting = dict(
        is_bold=read_boolean_element(properties.find_child("w:b")),
        is_italic=read_boolean_element(properties.find_child("w:i")),
        is_underline=read_underline_element(properties.find_child("w:u")),
        is_strikethrough=read_boolean_element(properties.find_child("w:strike")),
        is_all_caps=read_boolean_element(properties.find_child("w:caps")),
        is_small_caps=read_boolean_element(properties.find_child("w:smallCaps")),
        vertical_alignment=attribute_of("w:vertAlign", "w:val"),
        font=attribute_of("w:rFonts", "w:ascii"),
        font_size=font_size,
        highlight=read_highlight_value(attribute_of("w:highlight", "w:val")),
    )

    def add_complex_field_hyperlink(children):
        hyperlink_kwargs = current_hyperlink_kwargs()
        if hyperlink_kwargs is not None:
            return [documents.hyperlink(children=children, **hyperlink_kwargs)]
        return children

    def build_run(style, children):
        return documents.run(
            children=children,
            style_id=style[0],
            style_name=style[1],
            **formatting
        )

    # The style is read before the children, which may update the complex field state.
    style_result = _read_run_style(properties)
    children_result = _read_xml_elements(element.children).map(add_complex_field_hyperlink)
    return _ReadResult.map_results(style_result, children_result, build_run)
def _read_run_style(properties):
    """Read the w:rStyle reference of a run, resolved against the character styles."""
    find_style = styles.find_character_style_by_id
    return _read_style(properties, "w:rStyle", "Run", find_style)
def read_boolean_element(element):
    """Return False when *element* is None, else the boolean value of its w:val."""
    if element is not None:
        return read_boolean_attribute_value(element.attributes.get("w:val"))
    return False
def read_boolean_attribute_value(value):
    """Return False only for "false" and "0"; any other value, including None, is True."""
    is_false = value == "false" or value == "0"
    return not is_false
def read_underline_element(element):
    """Return whether a w:u element turns underlining on.

    :param element: the w:u element, or ``None`` when the run has none.
    :returns: ``True`` when the element is present and its w:val is set to
        something other than "false", "0" or "none"; ``False`` otherwise.
        Always a real bool, matching ``read_boolean_element`` (previously a
        missing element yielded ``None``).
    """
    if element is None:
        return False
    return element.attributes.get("w:val") not in [None, "false", "0", "none"]
def read_highlight_value(value):
    """Return *value*, or None when it is empty, None or the literal "none"."""
    return None if (not value or value == "none") else value
def paragraph(element):
    """Read a w:p element into a paragraph.

    A paragraph whose w:pPr/w:rPr has a w:del child is not emitted: its
    children are queued and read as the leading children of the next
    paragraph that is not marked as deleted.
    """
    properties = element.find_child_or_null("w:pPr")
    deletion_mark = properties.find_child_or_null("w:rPr").find_child("w:del")
    if deletion_mark is not None:
        deleted_paragraph_contents.extend(element.children)
        return _empty_result

    alignment = properties.find_child_or_null("w:jc").attributes.get("w:val")
    indent = _read_paragraph_indent(properties.find_child_or_null("w:ind"))

    if deleted_paragraph_contents:
        # Concatenation copies the queued nodes before the queue is emptied.
        children_xml = deleted_paragraph_contents + element.children
        del deleted_paragraph_contents[:]
    else:
        children_xml = element.children

    def build_paragraph(style, children):
        return documents.paragraph(
            children=children,
            style_id=style[0],
            style_name=style[1],
            numbering=_read_numbering_properties(
                paragraph_style_id=style[0],
                element=properties.find_child_or_null("w:numPr"),
            ),
            alignment=alignment,
            indent=indent,
        )

    style_result = _read_paragraph_style(properties)
    children_result = _read_xml_elements(children_xml)
    return _ReadResult.map_results(style_result, children_result, build_paragraph).append_extra()
def _read_paragraph_style(properties):
    """Read the w:pStyle reference of a paragraph, resolved against the paragraph styles."""
    find_style = styles.find_paragraph_style_by_id
    return _read_style(properties, "w:pStyle", "Paragraph", find_style)
def current_hyperlink_kwargs():
    """Return the kwargs of the innermost open hyperlink complex field, or None."""
    open_hyperlinks = (
        field.kwargs
        for field in reversed(complex_field_stack)
        if isinstance(field, complex_fields.Hyperlink)
    )
    return next(open_hyperlinks, None)
def read_fld_char(element):
    """Track complex field state for a w:fldChar element.

    * ``begin`` pushes a new field and clears the collected instruction text.
    * ``separate`` replaces the innermost field with the field parsed from
      the instruction text collected so far.
    * ``end`` pops the innermost field, parsing the instruction text first if
      no ``separate`` was seen; a checkbox field yields a checkbox element.

    An ``end`` or ``separate`` with no open field (a malformed document) is
    ignored with a warning instead of raising ``IndexError``.

    :returns: a result with a checkbox element, or an empty result.
    """
    fld_char_type = element.attributes.get("w:fldCharType")

    if fld_char_type in ("end", "separate") and not complex_field_stack:
        # Nothing to pop: the document has an unbalanced field character.
        warning = results.warning(
            "A w:fldChar element of type {0} without a matching begin was ignored".format(fld_char_type)
        )
        return _empty_result_with_message(warning)

    if fld_char_type == "begin":
        complex_field_stack.append(complex_fields.begin(fld_char=element))
        del current_instr_text[:]
    elif fld_char_type == "end":
        complex_field = complex_field_stack.pop()
        if isinstance(complex_field, complex_fields.Begin):
            # No "separate" was seen, so the instruction text is still unparsed.
            complex_field = parse_current_instr_text(complex_field)
        if isinstance(complex_field, complex_fields.Checkbox):
            return _success(documents.checkbox(checked=complex_field.checked))
    elif fld_char_type == "separate":
        complex_field_separate = complex_field_stack.pop()
        complex_field = parse_current_instr_text(complex_field_separate)
        complex_field_stack.append(complex_field)
    return _empty_result
def parse_current_instr_text(complex_field):
    """Parse the instruction text collected so far.

    The w:fldChar element of *complex_field* is supplied when it is a
    ``Begin``; otherwise the null element is used.
    """
    is_begin = isinstance(complex_field, complex_fields.Begin)
    fld_char = complex_field.fld_char if is_begin else null_xml_element
    return parse_instr_text("".join(current_instr_text), fld_char=fld_char)
def parse_instr_text(instr_text, *, fld_char):
    """Turn field instruction text into a complex field object.

    Recognises, in this order, external HYPERLINK fields, internal
    (``\\l``) HYPERLINK fields and FORMCHECKBOX fields.

    :param fld_char: element searched for w:ffData/w:checkBox when the field
        is a checkbox.
    :returns: a hyperlink or checkbox complex field, or None if unrecognised.
    """
    external_link = re.match(r'\s*HYPERLINK "(.*)"', instr_text)
    if external_link is not None:
        return complex_fields.hyperlink(dict(href=external_link.group(1)))

    internal_link = re.match(r'\s*HYPERLINK\s+\\l\s+"(.*)"', instr_text)
    if internal_link is not None:
        return complex_fields.hyperlink(dict(anchor=internal_link.group(1)))

    if re.match(r'\s*FORMCHECKBOX\s*', instr_text) is None:
        return None

    checkbox_element = fld_char \
        .find_child_or_null("w:ffData") \
        .find_child_or_null("w:checkBox")
    # w:checked takes precedence; w:default is consulted only when it is absent.
    state_element = checkbox_element.find_child("w:checked")
    if state_element is None:
        state_element = checkbox_element.find_child("w:default")
    return complex_fields.checkbox(checked=read_boolean_element(state_element))
def read_instr_text(element):
    """Collect the inner text of a w:instrText element; produces no output elements."""
    fragment = _inner_text(element)
    current_instr_text.append(fragment)
    return _empty_result
def _read_style(properties, style_tag_name, style_type, find_style_by_id):
    """Read a style reference from *properties*.

    :returns: a result whose elements are ``[style_id, style_name]``. The name
        is None when there is no reference or the style cannot be found; in
        the latter case a warning message is attached.
    """
    style_id = properties.find_child_or_null(style_tag_name).attributes.get("w:val")
    style_name = None
    messages = []
    if style_id is not None:
        style = find_style_by_id(style_id)
        if style is not None:
            style_name = style.name
        else:
            messages.append(_undefined_style_warning(style_type, style_id))
    return _ReadResult([style_id, style_name], [], messages)
def _undefined_style_warning(style_type, style_id):
    """Build the warning for a style ID that is referenced but not defined."""
    template = "{0} style with ID {1} was referenced but not defined in the document"
    return results.warning(template.format(style_type, style_id))
def _read_numbering_properties(paragraph_style_id, element):
    """Resolve the numbering level of a paragraph.

    Tried in order: the explicit w:numId/w:ilvl pair, the level linked to the
    paragraph style, then level "0" of the w:numId alone.

    :returns: the numbering level, or None.
    """
    num_id = element.find_child_or_null("w:numId").attributes.get("w:val")
    level_index = element.find_child_or_null("w:ilvl").attributes.get("w:val")

    has_explicit_level = num_id is not None and level_index is not None
    if has_explicit_level:
        return numbering.find_level(num_id, level_index)

    if paragraph_style_id is not None:
        style_level = numbering.find_level_by_paragraph_style_id(paragraph_style_id)
        if style_level is not None:
            return style_level

    if num_id is None:
        return None
    # Some malformed documents define numbering levels without an index, and
    # reference the numbering using a w:numPr element without a w:ilvl child.
    # To handle such cases, we assume a level of 0 as a fallback.
    return numbering.find_level(num_id, "0")
def _read_paragraph_indent(element):
    """Read a w:ind element; w:left and w:right back up w:start and w:end."""
    get = element.attributes.get
    start = get("w:start") or get("w:left")
    end = get("w:end") or get("w:right")
    return documents.paragraph_indent(
        start=start,
        end=end,
        first_line=get("w:firstLine"),
        hanging=get("w:hanging"),
    )
def tab(element):
    """Read a w:tab element as a tab; the element itself is not inspected."""
    tab_element = documents.tab()
    return _success(tab_element)
def no_break_hyphen(element):
    """Read a w:noBreakHyphen element as text holding U+2011."""
    character = unichr(0x2011)
    return _success(documents.text(character))
def soft_hyphen(element):
    """Read a w:softHyphen element as text holding U+00AD."""
    character = u"\u00ad"
    return _success(documents.text(character))
def symbol(element):
    """Read a w:sym element as text, mapping its character through ``dingbats``.

    The (font, code) pair is looked up first with the full w:char value; if
    that fails and the value looks like ``F0xx``, the lookup is retried
    without the ``F0`` prefix.

    A missing or non-hexadecimal w:char is treated as an unsupported
    character (warning, no output) rather than raising from ``int()``.

    :returns: a result with one text element, or an empty result with a warning.
    """
    # See 17.3.3.30 sym (Symbol Character) of ECMA-376 4th edition Part 1
    font = element.attributes.get("w:font")
    char = element.attributes.get("w:char")

    def lookup(hex_digits):
        try:
            code = int(hex_digits, 16)
        except (TypeError, ValueError):
            # Attribute absent (None) or not valid hexadecimal.
            return None
        return dingbats.get((font, code))

    unicode_code_point = lookup(char)
    if unicode_code_point is None and char is not None and re.match("^F0..", char):
        unicode_code_point = lookup(char[2:])

    if unicode_code_point is None:
        warning = results.warning("A w:sym element with an unsupported character was ignored: char {0} in font {1}".format(
            char,
            font,
        ))
        return _empty_result_with_message(warning)
    return _success(documents.text(unichr(unicode_code_point)))
def table(element):
    """Read a w:tbl element into a table, computing row spans over its rows."""
    properties = element.find_child_or_null("w:tblPr")

    def build_table(style, children):
        return documents.table(
            children=children,
            style_id=style[0],
            style_name=style[1],
        )

    style_result = read_table_style(properties)
    rows_result = _read_xml_elements(element.children).flat_map(calculate_row_spans)
    return _ReadResult.map_results(style_result, rows_result, build_table)
def read_table_style(properties):
    """Read the w:tblStyle reference of a table, resolved against the table styles."""
    find_style = styles.find_table_style_by_id
    return _read_style(properties, "w:tblStyle", "Table", find_style)
def table_row(element):
    """Read a w:tr element into a table row; rows marked as deleted are dropped."""
    properties = element.find_child_or_null("w:trPr")

    # See 17.13.5.12 del (Deleted Table Row) of ECMA-376 4th edition Part 1
    if properties.find_child("w:del"):
        return _empty_result

    is_header = bool(properties.find_child("w:tblHeader"))

    def build_row(children):
        return documents.table_row(children=children, is_header=is_header)

    return _read_xml_elements(element.children).map(build_row)
def table_cell(element):
    """Read a w:tc element into an unmerged table cell.

    The column span comes from w:tcPr/w:gridSpan. A missing value, or one
    that is not an integer, gives a span of 1 instead of raising from
    ``int()`` (the same tolerance ``run`` applies to w:sz). The row span is
    always 1 here; vertical merges are resolved later from the ``vmerge`` flag.
    """
    properties = element.find_child_or_null("w:tcPr")
    gridspan = properties \
        .find_child_or_null("w:gridSpan") \
        .attributes.get("w:val")
    colspan = int(gridspan) if _is_int(gridspan) else 1

    def build_cell(children):
        return documents.table_cell_unmerged(
            children=children,
            colspan=colspan,
            rowspan=1,
            vmerge=read_vmerge(properties),
        )

    return _read_xml_elements(element.children).map(build_cell)
def read_vmerge(properties):
    """Return whether a cell continues a vertical merge.

    True when w:vMerge is present and its w:val is "continue" or empty/absent.
    """
    vmerge_element = properties.find_child("w:vMerge")
    if vmerge_element is None:
        return False
    merge_value = vmerge_element.attributes.get("w:val")
    if merge_value == "continue":
        return True
    return not merge_value
def calculate_row_spans(rows):
    """Resolve vertical merges in *rows* into row spans.

    If any row is not a table row, or any cell is not an unmerged table cell,
    the rows are returned with their unmerged cells converted as-is plus a
    warning. Otherwise each cell that continues a merge is removed and the
    row span of the cell that started the merge is incremented.
    """
    def without_merging(message):
        converted = remove_unmerged_table_cells(rows)
        return _elements_result_with_messages(converted, [results.warning(message)])

    if not all(isinstance(row, documents.TableRow) for row in rows):
        return without_merging(
            "unexpected non-row element in table, cell merging may be incorrect"
        )

    only_cells = all(
        isinstance(cell, documents.TableCellUnmerged)
        for row in rows
        for cell in row.children
    )
    if not only_cells:
        return without_merging(
            "unexpected non-cell element in table row, cell merging may be incorrect"
        )

    # Maps a column index to the cell that most recently started there.
    columns = {}
    for row in rows:
        column_index = 0
        for cell in row.children:
            continues_merge = cell.vmerge and column_index in columns
            if continues_merge:
                columns[column_index].rowspan += 1
            else:
                columns[column_index] = cell
                cell.vmerge = False
            column_index += cell.colspan

    for row in rows:
        kept_cells = []
        for cell in row.children:
            if cell.vmerge:
                continue
            kept_cells.append(documents.table_cell(
                children=cell.children,
                colspan=cell.colspan,
                rowspan=cell.rowspan,
            ))
        row.children = kept_cells

    return _success(rows)
def remove_unmerged_table_cells(rows):
    """Apply the unmerged-cell to table-cell conversion to each of *rows*."""
    def to_table_cell(cell):
        return documents.table_cell(
            children=cell.children,
            colspan=cell.colspan,
            rowspan=cell.rowspan,
        )

    convert = transforms.element_of_type(documents.TableCellUnmerged, to_table_cell)
    return [convert(row) for row in rows]
def read_child_elements(element):
    """Read the children of *element*, discarding the element itself."""
    child_nodes = element.children
    return _read_xml_elements(child_nodes)
def pict(element):
    """Read a w:pict element, moving what its children produce into ``extra``."""
    children_result = read_child_elements(element)
    return children_result.to_extra()
def hyperlink(element):
    """Read a w:hyperlink element.

    With an r:id the link targets the relationship, with w:anchor (if any)
    replacing its fragment. With only w:anchor it targets that anchor. With
    neither, the children are returned without a hyperlink around them.
    """
    attributes = element.attributes
    relationship_id = attributes.get("r:id")
    anchor = attributes.get("w:anchor")
    target_frame = attributes.get("w:tgtFrame") or None
    children_result = _read_xml_elements(element.children)

    def create(**kwargs):
        def build_hyperlink(children):
            return documents.hyperlink(
                children=children,
                target_frame=target_frame,
                **kwargs
            )
        return children_result.map(build_hyperlink)

    if relationship_id is None and anchor is None:
        return children_result
    if relationship_id is None:
        return create(anchor=anchor)

    href = relationships.find_target_by_relationship_id(relationship_id)
    if anchor is not None:
        href = replace_fragment(href, anchor)
    return create(href=href)
def bookmark_start(element):
    """Read a w:bookmarkStart element as a bookmark; the "_GoBack" bookmark is skipped."""
    name = element.attributes.get("w:name")
    if name != "_GoBack":
        return _success(documents.bookmark(name))
    return _empty_result
def break_(element):
    """Read a w:br element as a line, page or column break.

    A missing/empty w:type or "textWrapping" is a line break; any other
    unrecognised type produces a warning and no element.
    """
    break_type = element.attributes.get("w:type")
    if break_type == "page":
        return _success(documents.page_break)
    if break_type == "column":
        return _success(documents.column_break)
    if not break_type or break_type == "textWrapping":
        return _success(documents.line_break)
    warning = results.warning("Unsupported break type: {0}".format(break_type))
    return _empty_result_with_message(warning)
def inline(element):
    """Read the images inside a wp:inline or wp:anchor element.

    The alt text is the wp:docPr ``descr`` attribute when it is non-blank,
    otherwise its ``title`` attribute.
    """
    doc_properties = element.find_child_or_null("wp:docPr").attributes
    description = doc_properties.get("descr", "")
    if description.strip():
        alt_text = description
    else:
        alt_text = doc_properties.get("title")

    graphic_data = element.find_children("a:graphic").find_children("a:graphicData")
    blip_fills = graphic_data.find_children("pic:pic").find_children("pic:blipFill")
    blips = blip_fills.find_children("a:blip")
    return _read_blips(blips, alt_text)
def _read_blips(blips, alt_text):
    """Read every blip with the same alt text and concatenate the results."""
    def read_one(blip):
        return _read_blip(blip, alt_text)

    return _ReadResult.concat(lists.map(read_one, blips))
def _read_blip(element, alt_text):
    """Read one a:blip element as an image, or warn if no image file is referenced."""
    blip_image = _find_blip_image(element)
    if blip_image is not None:
        return _read_image(blip_image, alt_text)
    warning = results.warning("Could not find image file for a:blip element")
    return _empty_result_with_message(warning)
def _read_image(image_file, alt_text):
    """Build an image element from an ``(image_path, open_image)`` pair.

    A warning is attached when the content type is not one of the listed
    image types.
    """
    image_path, open_image = image_file
    content_type = content_types.find_content_type(image_path)
    image = documents.image(alt_text=alt_text, content_type=content_type, open=open_image)

    supported_types = ["image/png", "image/gif", "image/jpeg", "image/svg+xml", "image/tiff"]
    messages = []
    if content_type not in supported_types:
        messages.append(
            results.warning("Image of type {0} is unlikely to display in web browsers".format(content_type))
        )
    return _element_result_with_messages(image, messages)
def _find_blip_image(element):
    """Return the image referenced by a blip: r:embed first, then r:link, else None."""
    embed_relationship_id = element.attributes.get("r:embed")
    link_relationship_id = element.attributes.get("r:link")
    if embed_relationship_id is not None:
        return _find_embedded_image(embed_relationship_id)
    if link_relationship_id is not None:
        return _find_linked_image(link_relationship_id)
    return None
def _find_embedded_image(relationship_id):
    """Return ``(zip_path, opener)`` for an image stored inside the docx file.

    The opener always returns a context manager: the opened file itself when
    it has ``__exit__``, otherwise the file wrapped in ``contextlib.closing``.
    """
    target = relationships.find_target_by_relationship_id(relationship_id)
    image_path = uri_to_zip_entry_name("word", target)

    def open_image():
        opened = docx_file.open(image_path)
        if not hasattr(opened, "__exit__"):
            return contextlib.closing(opened)
        return opened

    return image_path, open_image
def _find_linked_image(relationship_id):
    """Return ``(uri, opener)`` for an image linked from outside the docx file."""
    image_uri = relationships.find_target_by_relationship_id(relationship_id)

    def open_image():
        return files.open(image_uri)

    return image_uri, open_image
def read_imagedata(element):
    """Read a v:imagedata element as an embedded image, using o:title as alt text."""
    relationship_id = element.attributes.get("r:id")
    if relationship_id is not None:
        title = element.attributes.get("o:title")
        return _read_image(_find_embedded_image(relationship_id), title)
    warning = results.warning("A v:imagedata element without a relationship ID was ignored")
    return _empty_result_with_message(warning)
def note_reference_reader(note_type):
    """Return a handler that reads a reference to a note of *note_type*.

    The handler looks up the w:id attribute by subscript, so it raises
    ``KeyError`` when the attribute is missing.
    """
    def note_reference(element):
        note_id = element.attributes["w:id"]
        return _success(documents.note_reference(note_type, note_id))

    return note_reference
def read_comment_reference(element):
    """Read a w:commentReference element as a reference to the comment with its w:id."""
    comment_id = element.attributes["w:id"]
    return _success(documents.comment_reference(comment_id))
def alternate_content(element):
    """Read mc:AlternateContent by reading the children of its mc:Fallback child."""
    fallback = element.find_child_or_null("mc:Fallback")
    return read_child_elements(fallback)
def read_sdt(element):
    """Read a w:sdt element.

    The content of w:sdtContent is returned unchanged unless w:sdtPr holds a
    wordml:checkbox. In that case the first non-empty text node in the
    content is replaced by a checkbox; if there is none, the checkbox alone
    replaces the content.
    """
    content_result = read_child_elements(element.find_child_or_null("w:sdtContent"))

    def handle_content(content):
        # From the WordML standard: https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/3350cb64-931f-41f7-8824-f18b2568ce66
        #
        # > A CT_SdtCheckbox element that specifies that the parent
        # > structured document tag is a checkbox when displayed in the
        # > document. The parent structured document tag contents MUST
        # > contain a single character and optionally an additional
        # > character in a deleted run.
        checkbox = element.find_child_or_null("w:sdtPr").find_child("wordml:checkbox")
        if checkbox is None:
            return content

        checked_element = checkbox.find_child("wordml:checked")
        if checked_element is None:
            is_checked = False
        else:
            is_checked = read_boolean_attribute_value(checked_element.attributes.get("wordml:val"))
        document_checkbox = documents.checkbox(checked=is_checked)

        replaced = False

        def transform_text(text):
            nonlocal replaced
            # Only the first non-empty text node becomes the checkbox.
            if replaced or len(text.value) == 0:
                return text
            replaced = True
            return document_checkbox

        transform = transforms.element_of_type(documents.Text, transform_text)
        replaced_content = [transform(item) for item in content]

        return replaced_content if replaced else document_checkbox

    return content_result.map(handle_content)
# Dispatch table used by read(): maps an element name to the function that
# reads that element. Names missing from this table are either skipped
# (if listed in _ignored_elements) or reported with a warning.
handlers = {
    "w:t": text,
    "w:r": run,
    "w:p": paragraph,
    "w:fldChar": read_fld_char,
    "w:instrText": read_instr_text,
    "w:tab": tab,
    "w:noBreakHyphen": no_break_hyphen,
    "w:softHyphen": soft_hyphen,
    "w:sym": symbol,
    "w:tbl": table,
    "w:tr": table_row,
    "w:tc": table_cell,
    # Container elements: only their children are read.
    "w:ins": read_child_elements,
    "w:object": read_child_elements,
    "w:smartTag": read_child_elements,
    "w:drawing": read_child_elements,
    "v:group": read_child_elements,
    "v:rect": read_child_elements,
    "v:roundrect": read_child_elements,
    "v:shape": read_child_elements,
    "v:textbox": read_child_elements,
    "w:txbxContent": read_child_elements,
    "w:pict": pict,
    "w:hyperlink": hyperlink,
    "w:bookmarkStart": bookmark_start,
    "w:br": break_,
    # wp:inline and wp:anchor share one handler.
    "wp:inline": inline,
    "wp:anchor": inline,
    "v:imagedata": read_imagedata,
    "w:footnoteReference": note_reference_reader("footnote"),
    "w:endnoteReference": note_reference_reader("endnote"),
    "w:commentReference": read_comment_reference,
    "mc:AlternateContent": alternate_content,
    "w:sdt": read_sdt
}
def read(element):
    """Read one XML element through its handler.

    Elements with no handler produce nothing; unless their name is in the
    ignored set, a warning is attached.
    """
    handler = handlers.get(element.name)
    if handler is not None:
        return handler(element)
    if element.name in _ignored_elements:
        return _empty_result
    warning = results.warning("An unrecognised element was ignored: {0}".format(element.name))
    return _empty_result_with_message(warning)
def _read_xml_elements(nodes):
    """Read every XmlElement in *nodes* (other node types are skipped) and concatenate the results."""
    xml_elements = [node for node in nodes if isinstance(node, XmlElement)]
    return _ReadResult.concat(lists.map(read, xml_elements))
# _create_reader returns the list reader, which sends each element through read().
return _read_xml_elements
def _inner_text(node):
    """Return the concatenated text of *node* and all of its descendants."""
    if node.node_type != node_types.text:
        return "".join(map(_inner_text, node.children))
    return node.value
class _ReadResult(object):
@staticmethod
def concat(results):
return _ReadResult(
lists.flat_map(lambda result: result.elements, results),
lists.flat_map(lambda result: result.extra, results),
lists.flat_map(lambda result: result.messages, results))
@staticmethod
def map_results(first, second, func):
return _ReadResult(
[func(first.elements, second.elements)],
first.extra + second.extra,
first.messages + second.messages)
def __init__(self, elements, extra, messages):
self.elements = elements
self.extra = extra
self.messages = messages
def map(self, func):
elements = func(self.elements)
if not isinstance(elements, list):
elements = [elements]
return _ReadResult(
elements,
self.extra,
self.messages)
def flat_map(self, func):
result = func(self.elements)
return _ReadResult(
result.elements,
self.extra + result.extra,
self.messages + result.messages)
def to_extra(self):
return _ReadResult([], _concat(self.extra, self.elements), self.messages)
def append_extra(self):
return _ReadResult(_concat(self.elements, self.extra), [], self.messages)
def _success(elements):
    """Wrap an element, or a list of elements, in a result with no extras or messages."""
    as_list = elements if isinstance(elements, list) else [elements]
    return _ReadResult(as_list, [], [])
def _element_result_with_messages(element, messages):
    """Return a result holding the single *element* together with *messages*."""
    elements = [element]
    return _elements_result_with_messages(elements, messages)
def _elements_result_with_messages(elements, messages):
    """Return a result holding *elements* and *messages*, with no extras."""
    no_extra = []
    return _ReadResult(elements, no_extra, messages)
# Shared result with no elements, extras or messages; returned by handlers
# that produce nothing.
_empty_result = _ReadResult([], [], [])
def _empty_result_with_message(message):
    """Return a result with no elements or extras that carries the single *message*."""
    messages = [message]
    return _ReadResult([], [], messages)
def _concat(*values):
result = []
for value in values:
for element in value:
result.append(element)
return result
def _is_int(value):
if value is None:
return False
try:
int(value)
except ValueError:
return False
return True

View File

@@ -0,0 +1,24 @@
from .. import lists
from .. import documents
from .. import results
def read_comments_xml_element(element, body_reader):
    """Read every w:comment child of *element* into a combined result of comments.

    The author name and initials are stripped; blank or missing values become
    None. The w:id attribute is looked up by subscript.
    """
    def read_comment(comment_element):
        def read_optional_attribute(name):
            return comment_element.attributes.get(name, "").strip() or None

        def build_comment(body):
            return documents.comment(
                comment_id=comment_element.attributes["w:id"],
                body=body,
                author_name=read_optional_attribute("w:author"),
                author_initials=read_optional_attribute("w:initials"),
            )

        return body_reader.read_all(comment_element.children).map(build_comment)

    comment_elements = element.find_children("w:comment")
    return results.combine(lists.map(read_comment, comment_elements))

View File

@@ -0,0 +1,29 @@
class unknown(object):
    """Empty class with no attributes or behaviour of its own."""
class Begin:
    """A complex field that has begun; holds the w:fldChar element that opened it."""

    def __init__(self, *, fld_char):
        # Keyword-only, so construction always names the element explicitly.
        self.fld_char = fld_char
def begin(*, fld_char):
    """Create a ``Begin`` complex field for the given w:fldChar element."""
    field = Begin(fld_char=fld_char)
    return field
class Hyperlink(object):
    """A hyperlink complex field; ``kwargs`` are the keyword arguments for building the hyperlink."""

    def __init__(self, kwargs):
        self.kwargs = kwargs
def hyperlink(kwargs):
    """Create a ``Hyperlink`` complex field holding *kwargs*."""
    field = Hyperlink(kwargs=kwargs)
    return field
class Checkbox:
    """A checkbox complex field; ``checked`` is its state."""

    def __init__(self, *, checked):
        # Keyword-only, so construction always names the state explicitly.
        self.checked = checked
def checkbox(*, checked):
    """Create a ``Checkbox`` complex field with the given state."""
    field = Checkbox(checked=checked)
    return field

View File

@@ -0,0 +1,58 @@
def read_content_types_xml_element(element):
    """Read a content types element into a ``_ContentTypes`` lookup.

    ``content-types:Default`` children map extensions to content types;
    ``content-types:Override`` children map part names to content types.
    """
    extension_defaults = dict(
        _read_default(child)
        for child in element.find_children("content-types:Default")
    )
    overrides = dict(
        _read_override(child)
        for child in element.find_children("content-types:Override")
    )
    return _ContentTypes(extension_defaults, overrides)
def _read_default(element):
extension = element.attributes["Extension"]
content_type = element.attributes["ContentType"]
return extension, content_type
def _read_override(element):
part_name = element.attributes["PartName"]
content_type = element.attributes["ContentType"]
return part_name.lstrip("/"), content_type
class _ContentTypes(object):
_image_content_types = {
"png": "png",
"gif": "gif",
"jpeg": "jpeg",
"jpg": "jpeg",
"tif": "tiff",
"tiff": "tiff",
"bmp": "bmp",
}
def __init__(self, extension_defaults, overrides):
self._extension_defaults = extension_defaults
self._overrides = overrides
def find_content_type(self, path):
if path in self._overrides:
return self._overrides[path]
extension = _get_extension(path)
default_type = self._extension_defaults.get(extension)
if default_type is not None:
return default_type
image_type = self._image_content_types.get(extension.lower())
if image_type is not None:
return "image/" + image_type
return None
# Content-type lookup with no extension defaults and no overrides.
empty_content_types = _ContentTypes({}, {})
def _get_extension(path):
return path.rpartition(".")[2]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,25 @@
from .. import documents
def read_document_xml_element(
        element,
        body_reader,
        notes=None,
        comments=None):
    """Read the w:body of a document element into a document.

    :param notes: notes to attach; treated as an empty list when None.
    :param comments: comments to attach; treated as an empty list when None.
    :raises ValueError: if *element* has no w:body child.
    """
    notes = [] if notes is None else notes
    comments = [] if comments is None else comments

    body_element = element.find_child("w:body")
    if body_element is None:
        raise ValueError("Could not find the body element: are you sure this is a docx file?")

    def build_document(children):
        return documents.document(
            children,
            notes=documents.notes(notes),
            comments=comments
        )

    return body_reader.read_all(body_element.children).map(build_document)

View File

@@ -0,0 +1,46 @@
import os
import contextlib
try:
from urllib2 import urlopen
except ImportError:
from urllib.request import urlopen
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
class Files(object):
    """Opens files referenced from a document but stored outside it."""

    def __init__(self, base, external_file_access):
        self._base = base
        self._external_file_access = external_file_access

    def open(self, uri):
        """Open *uri* for reading.

        A URI with a scheme is fetched with ``urlopen``; any other URI is
        opened in binary mode relative to the base directory.

        :raises ExternalFileAccessIsDisabledError: if external file access is off.
        :raises InvalidFileReferenceError: if the URI is relative and there is
            no base directory, or if opening fails with ``IOError``.
        """
        if not self._external_file_access:
            raise ExternalFileAccessIsDisabledError(
                "could not open external image '{0}', external file access is disabled".format(uri)
            )

        try:
            if _is_absolute(uri):
                return contextlib.closing(urlopen(uri))
            if self._base is None:
                raise InvalidFileReferenceError("could not find external image '{0}', fileobj has no name".format(uri))
            return open(os.path.join(self._base, uri), "rb")
        except IOError as error:
            message = "could not open external image: '{0}' (document directory: '{1}')\n{2}".format(
                uri, self._base, str(error))
            raise InvalidFileReferenceError(message)
def _is_absolute(url):
return urlparse(url).scheme != ""
class InvalidFileReferenceError(ValueError):
    """Raised by ``Files.open`` when an external file cannot be found or opened."""
class ExternalFileAccessIsDisabledError(InvalidFileReferenceError):
    """Raised by ``Files.open`` when external file access is disabled."""

View File

@@ -0,0 +1,32 @@
import functools
from .. import lists
from .. import documents
from .. import results
def _read_notes(note_type, element, body_reader):
    """Read the ``w:<note_type>`` children of *element* into a combined result of notes.

    Children whose w:type is "continuationSeparator" or "separator" are
    skipped. The w:id attribute is looked up by subscript.
    """
    separator_types = ["continuationSeparator", "separator"]

    def is_note(note_element):
        return note_element.attributes.get("w:type") not in separator_types

    def read_note(note_element):
        def build_note(body):
            return documents.note(
                note_type=note_type,
                note_id=note_element.attributes["w:id"],
                body=body
            )

        return body_reader.read_all(note_element.children).map(build_note)

    note_elements = lists.filter(is_note, element.find_children("w:" + note_type))
    return results.combine(lists.map(read_note, note_elements))
# Public readers: _read_notes with the note type fixed in advance, leaving
# (element, body_reader) for the caller to supply.
read_footnotes_xml_element = functools.partial(_read_notes, "footnote")
read_endnotes_xml_element = functools.partial(_read_notes, "endnote")

View File

@@ -0,0 +1,130 @@
import cobble
from ..documents import numbering_level
from .styles_xml import Styles
def read_numbering_xml_element(element, styles):
    """Build a ``Numbering`` from a numbering XML element and the given styles."""
    return Numbering(
        abstract_nums=_read_abstract_nums(element),
        nums=_read_nums(element),
        styles=styles,
    )
def _read_abstract_nums(element):
    """Return a dict of abstract numbering definitions keyed by their ID."""
    return dict(
        _read_abstract_num(abstract_num_element)
        for abstract_num_element in element.find_children("w:abstractNum")
    )
def _read_abstract_num(element):
    """Read one ``w:abstractNum`` element into an ``(id, _AbstractNum)`` pair.

    The ID and the ``w:numStyleLink`` value are both None when absent.
    """
    abstract_num_id = element.attributes.get("w:abstractNumId")
    num_style_link_element = element.find_child_or_null("w:numStyleLink")
    abstract_num = _AbstractNum(
        levels=_read_abstract_num_levels(element),
        num_style_link=num_style_link_element.attributes.get("w:val"),
    )
    return abstract_num_id, abstract_num
@cobble.data
class _AbstractNum(object):
    # Dict of _AbstractNumLevel keyed by level index.
    levels = cobble.field()
    # Value of the w:numStyleLink child, or None when there is no such child.
    num_style_link = cobble.field()
@cobble.data
class _AbstractNumLevel(object):
    # Value of the w:ilvl attribute (a string), or None if it was missing.
    level_index = cobble.field()
    # False only when the level's w:numFmt value is "bullet".
    is_ordered = cobble.field()
    # Value of the w:pStyle child, or None when the level names no style.
    paragraph_style_id = cobble.field()
def _read_abstract_num_levels(element):
    """Return the ``w:lvl`` children of ``element`` as a dict keyed by index."""
    levels = {}

    # Some malformed documents define numbering levels without an index, and
    # reference the numbering using a w:numPr element without a w:ilvl child.
    # To handle such cases, we assume a level of 0 as a fallback.
    fallback_level = None

    for level_element in element.find_children("w:lvl"):
        level = _read_abstract_num_level(level_element)
        if level.level_index is not None:
            levels[level.level_index] = level
            continue

        level.level_index = "0"
        fallback_level = level

    # An explicitly indexed level "0" always wins over the fallback.
    if fallback_level is not None:
        levels.setdefault(fallback_level.level_index, fallback_level)

    return levels
def _read_abstract_num_level(element):
    """Read a single ``w:lvl`` element into an ``_AbstractNumLevel``."""

    def child_value(child_name):
        # w:val of the named child, or None when the child or attribute is absent.
        return element.find_child_or_null(child_name).attributes.get("w:val")

    return _AbstractNumLevel(
        level_index=element.attributes.get("w:ilvl"),
        # Every format other than "bullet" (including a missing one) is ordered.
        is_ordered=child_value("w:numFmt") != "bullet",
        paragraph_style_id=child_value("w:pStyle"),
    )
def _read_nums(element):
    """Return a dict of ``_Num`` keyed by numbering ID, one per ``w:num`` child."""
    return dict(map(_read_num, element.find_children("w:num")))
def _read_num(element):
    """Read one ``w:num`` element into a ``(num_id, _Num)`` pair.

    Raises KeyError when there is no ``w:val`` to read from a
    ``w:abstractNumId`` child.
    """
    num_id = element.attributes.get("w:numId")
    abstract_num_id_element = element.find_child_or_null("w:abstractNumId")
    num = _Num(abstract_num_id=abstract_num_id_element.attributes["w:val"])
    return num_id, num
@cobble.data
class _Num(object):
    # ID of the abstract numbering definition this w:num element points at.
    abstract_num_id = cobble.field()
class Numbering(object):
    """Lookup of numbering levels by numbering ID or by paragraph style ID.

    Args:
        abstract_nums: dict of abstract numbering definitions keyed by ID;
            each has ``levels`` (dict keyed by level index) and
            ``num_style_link``.
        nums: dict of numbering instances keyed by ID; each has
            ``abstract_num_id``.
        styles: object providing ``find_numbering_style_by_id``.
    """

    def __init__(self, abstract_nums, nums, styles):
        self._abstract_nums = abstract_nums
        # Index of every level that names a paragraph style.
        self._levels_by_paragraph_style_id = dict(
            (level.paragraph_style_id, self._to_numbering_level(level))
            for abstract_num in abstract_nums.values()
            for level in abstract_num.levels.values()
            if level.paragraph_style_id is not None
        )
        self._nums = nums
        self._styles = styles

    def find_level(self, num_id, level):
        """Return the numbering level for ``num_id`` at ``level``, or None.

        When the abstract numbering links to a numbering style, the lookup
        continues with that style's numbering ID.  None is returned if any
        step cannot be resolved: unknown num, unknown abstract num, unknown
        level, a link to a style that does not exist, or a chain of links
        that loops back on itself.
        """
        visited_num_ids = set()
        while num_id not in visited_num_ids:
            visited_num_ids.add(num_id)

            num = self._nums.get(num_id)
            if num is None:
                return None

            abstract_num = self._abstract_nums.get(num.abstract_num_id)
            if abstract_num is None:
                return None

            if abstract_num.num_style_link is None:
                return self._to_numbering_level(abstract_num.levels.get(level))

            style = self._styles.find_numbering_style_by_id(abstract_num.num_style_link)
            if style is None:
                # Dangling w:numStyleLink: there is no style to follow.
                return None
            num_id = style.num_id

        # The style links form a cycle, so no concrete level can be reached.
        return None

    def find_level_by_paragraph_style_id(self, style_id):
        """Return the level associated with a paragraph style ID, or None."""
        return self._levels_by_paragraph_style_id.get(style_id)

    def _to_numbering_level(self, abstract_num_level):
        """Convert an abstract level to a ``numbering_level``; None stays None."""
        if abstract_num_level is None:
            return None
        return numbering_level(
            level_index=abstract_num_level.level_index,
            is_ordered=abstract_num_level.is_ordered,
        )
# Shared Numbering with no definitions at all; every lookup on it returns None.
Numbering.EMPTY = Numbering(abstract_nums={}, nums={}, styles=Styles.EMPTY)

View File

@@ -0,0 +1,45 @@
from ..lists import flat_map
from .xmlparser import parse_xml, XmlElement
# (prefix, namespace URI) pairs passed to parse_xml by read() below.
# A prefix may appear more than once: the transitional and strict URIs of the
# same vocabulary share one prefix.
_namespaces = [
# Transitional format
("w", "http://schemas.openxmlformats.org/wordprocessingml/2006/main"),
("r", "http://schemas.openxmlformats.org/officeDocument/2006/relationships"),
("wp", "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"),
("a", "http://schemas.openxmlformats.org/drawingml/2006/main"),
("pic", "http://schemas.openxmlformats.org/drawingml/2006/picture"),
# Strict format
("w", "http://purl.oclc.org/ooxml/wordprocessingml/main"),
("r", "http://purl.oclc.org/ooxml/officeDocument/relationships"),
("wp", "http://purl.oclc.org/ooxml/drawingml/wordprocessingDrawing"),
("a", "http://purl.oclc.org/ooxml/drawingml/main"),
("pic", "http://purl.oclc.org/ooxml/drawingml/picture"),
# Common
("content-types", "http://schemas.openxmlformats.org/package/2006/content-types"),
("relationships", "http://schemas.openxmlformats.org/package/2006/relationships"),
("mc", "http://schemas.openxmlformats.org/markup-compatibility/2006"),
("v", "urn:schemas-microsoft-com:vml"),
("office-word", "urn:schemas-microsoft-com:office:word"),
# [MS-DOCX]: Word Extensions to the Office Open XML (.docx) File Format
# https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/b839fe1f-e1ca-4fa6-8c26-5954d0abbccd
("wordml", "http://schemas.microsoft.com/office/word/2010/wordml"),
]
def read(fileobj):
    """Parse ``fileobj`` as XML and return its root node.

    Names are rewritten using the ``_namespaces`` prefixes and
    ``mc:AlternateContent`` elements are collapsed.
    """
    root = parse_xml(fileobj, _namespaces)
    collapsed = _collapse_alternate_content(root)
    return collapsed[0]
def _collapse_alternate_content(node):
    """Return the list of nodes that should take the place of ``node``.

    An ``mc:AlternateContent`` element is replaced by the children of its
    ``mc:Fallback`` child (an empty list when there is none); those children
    are returned as they are, without being processed further.  Any other
    element has its children rewritten in place and is returned in a
    one-item list.  Non-element nodes are returned unchanged in a one-item
    list.
    """
    if not isinstance(node, XmlElement):
        return [node]

    if node.name == "mc:AlternateContent":
        return node.find_child_or_null("mc:Fallback").children

    node.children = flat_map(_collapse_alternate_content, node.children)
    return [node]

View File

@@ -0,0 +1,38 @@
import collections
class Relationships(object):
    """Index of package relationships by ID and by type.

    Args:
        relationships: iterable of objects with ``relationship_id``,
            ``target`` and ``type`` attributes.  It is consumed in a single
            pass, so one-shot iterables such as generators are supported.
    """

    def __init__(self, relationships):
        targets_by_id = {}
        targets_by_type = {}
        for relationship in relationships:
            # A later relationship with the same ID overrides an earlier one.
            targets_by_id[relationship.relationship_id] = relationship.target
            targets_by_type.setdefault(relationship.type, []).append(relationship.target)
        self._targets_by_id = targets_by_id
        self._targets_by_type = targets_by_type

    def find_target_by_relationship_id(self, key):
        """Return the target for relationship ID ``key``.

        Raises:
            KeyError: no relationship has that ID.
        """
        return self._targets_by_id[key]

    def find_targets_by_type(self, relationship_type):
        """Return a new list of the targets of every relationship of this type.

        The list is empty when no relationship has the type.  A fresh list is
        returned so that neither a lookup nor a caller mutating the result can
        alter the index, which matters for the shared ``EMPTY`` instance.
        """
        return list(self._targets_by_type.get(relationship_type, ()))
# Shared instance holding no relationships.
Relationships.EMPTY = Relationships([])
# One relationship entry: its ID, its target and its type.
Relationship = collections.namedtuple("Relationship", ["relationship_id", "target", "type"])
def read_relationships_xml_element(element):
    """Build ``Relationships`` from the ``relationships:Relationship`` children."""
    relationship_elements = element.find_children("relationships:Relationship")
    return Relationships([
        _read_relationship(relationship_element)
        for relationship_element in relationship_elements
    ])
def _read_relationship(element):
    """Convert a relationship element into a ``Relationship`` tuple.

    Raises KeyError if the ``Id``, ``Target`` or ``Type`` attribute is missing.
    """
    attributes = element.attributes
    return Relationship(
        relationship_id=attributes["Id"],
        target=attributes["Target"],
        type=attributes["Type"],
    )

View File

@@ -0,0 +1,70 @@
from xml.etree import ElementTree
from ..zips import open_zip, update_zip
# Zip entry under which the embedded style map is stored.
_style_map_path = "mammoth/style-map"
# The same location with a leading slash, used as the relationship target and
# as the content-type override part name.
_style_map_absolute_path = "/" + _style_map_path
# Zip entries that write_style_map rewrites so that they mention the style map.
_relationships_path = "word/_rels/document.xml.rels"
_content_types_path = "[Content_Types].xml"
def write_style_map(fileobj, style_map):
    """Embed ``style_map`` (text) in the zip archive held by ``fileobj``.

    The style map is written UTF-8 encoded, and the document relationships
    and content types entries are regenerated so that they reference it.
    """
    with open_zip(fileobj, "r") as zip_file:
        original_relationships = zip_file.read_str(_relationships_path)
        relationships_xml = _generate_relationships_xml(original_relationships)
        original_content_types = zip_file.read_str(_content_types_path)
        content_types_xml = _generate_content_types_xml(original_content_types)

    updated_entries = {
        _style_map_path: style_map.encode("utf8"),
        _relationships_path: relationships_xml,
        _content_types_path: content_types_xml,
    }
    update_zip(fileobj, updated_entries)
def _generate_relationships_xml(relationships_xml):
    """Return ``relationships_xml`` with the style-map relationship added or updated.

    The result is serialised by ``ElementTree.tostring`` with the "UTF-8"
    encoding.
    """
    schema = "http://schemas.zwobble.org/mammoth/style-map"
    relationships_uri = "http://schemas.openxmlformats.org/package/2006/relationships"
    relationship_element_name = "{" + relationships_uri + "}Relationship"

    style_map_relationship = {
        "Id": "rMammothStyleMap",
        "Type": schema,
        "Target": _style_map_absolute_path,
    }

    root = ElementTree.fromstring(relationships_xml)
    # The relationship is identified by its Id attribute.
    _add_or_update_element(root, relationship_element_name, "Id", style_map_relationship)
    return ElementTree.tostring(root, "UTF-8")
def _generate_content_types_xml(content_types_xml):
    """Return ``content_types_xml`` with the style-map override added or updated.

    The result is serialised by ``ElementTree.tostring`` with the "UTF-8"
    encoding.
    """
    content_types_uri = "http://schemas.openxmlformats.org/package/2006/content-types"
    override_name = "{" + content_types_uri + "}Override"

    style_map_override = {
        "PartName": _style_map_absolute_path,
        "ContentType": "text/prs.mammoth.style-map",
    }

    root = ElementTree.fromstring(content_types_xml)
    # The override is identified by its PartName attribute.
    _add_or_update_element(root, override_name, "PartName", style_map_override)
    return ElementTree.tostring(root, "UTF-8")
def _add_or_update_element(parent, name, identifying_attribute, attributes):
    """Ensure an element ``name`` carrying ``attributes`` exists under ``parent``.

    If an element with the same tag and the same value for
    ``identifying_attribute`` is found, its attributes are replaced by
    ``attributes``; otherwise a new sub-element of ``parent`` is created.
    """
    match = _find_child(parent, name, identifying_attribute, attributes)
    if match is not None:
        match.attrib = attributes
        return
    ElementTree.SubElement(parent, name, attributes)
def _find_child(parent, name, identifying_attribute, attributes):
    """Return the first element yielded by ``parent.iter()`` that matches.

    An element matches when its tag equals ``name`` and its
    ``identifying_attribute`` equals the value for that attribute in
    ``attributes``.  Returns None when nothing matches.
    """
    wanted_value = attributes.get(identifying_attribute)
    for candidate in parent.iter():
        if candidate.tag != name:
            continue
        if candidate.get(identifying_attribute) == wanted_value:
            return candidate
    return None
def read_style_map(fileobj):
    """Return the style map embedded in the archive, or None if there is none."""
    with open_zip(fileobj, "r") as zip_file:
        if not zip_file.exists(_style_map_path):
            return None
        return zip_file.read_str(_style_map_path)

View File

@@ -0,0 +1,117 @@
import collections
class Styles(object):
    """Style definitions grouped by kind, each group a dict keyed by style ID."""

    @staticmethod
    def create(paragraph_styles=None, character_styles=None, table_styles=None, numbering_styles=None):
        """Build a ``Styles``, using a fresh empty dict for each omitted group."""
        return Styles(
            paragraph_styles={} if paragraph_styles is None else paragraph_styles,
            character_styles={} if character_styles is None else character_styles,
            table_styles={} if table_styles is None else table_styles,
            numbering_styles={} if numbering_styles is None else numbering_styles,
        )

    def __init__(self, paragraph_styles, character_styles, table_styles, numbering_styles):
        self._paragraph_styles = paragraph_styles
        self._character_styles = character_styles
        self._table_styles = table_styles
        self._numbering_styles = numbering_styles

    # Each finder returns the style stored under ``style_id``, or None.

    def find_paragraph_style_by_id(self, style_id):
        return self._paragraph_styles.get(style_id)

    def find_character_style_by_id(self, style_id):
        return self._character_styles.get(style_id)

    def find_table_style_by_id(self, style_id):
        return self._table_styles.get(style_id)

    def find_numbering_style_by_id(self, style_id):
        return self._numbering_styles.get(style_id)
# Shared Styles instance with no styles of any kind.
Styles.EMPTY = Styles(
paragraph_styles={},
character_styles={},
table_styles={},
numbering_styles={},
)
def read_styles_xml_element(element):
    """Read the ``w:style`` children of ``element`` into a ``Styles``.

    Styles are grouped by their ``w:type`` ("paragraph", "character",
    "table" or "numbering"); styles of any other type are ignored.  Within a
    group, only the first style with a given ID is kept.
    """
    paragraph_styles = {}
    character_styles = {}
    table_styles = {}
    numbering_styles = {}
    styles = {
        "paragraph": paragraph_styles,
        "character": character_styles,
        "table": table_styles,
        "numbering": numbering_styles,
    }

    for style_element in element.find_children("w:style"):
        # w:type is an optional attribute, so indexing it directly would raise
        # KeyError and abort the whole read for a style that omits it.
        # NOTE(review): a missing type is treated as "paragraph"; confirm
        # against ECMA-376 Part 1, 17.7.4.17.
        element_type = style_element.attributes.get("w:type", "paragraph")
        if element_type == "numbering":
            style = _read_numbering_style_element(style_element)
        else:
            style = _read_style_element(style_element)

        style_set = styles.get(element_type)

        # Per 17.7.4.17 style (Style Definition) of ECMA-376 4th edition Part 1:
        #
        # > If multiple style definitions each declare the same value for their
        # > styleId, then the first such instance shall keep its current
        # > identifier with all other instances being reassigned in any manner
        # > desired.
        #
        # For the purpose of conversion, there's no point holding onto styles
        # with reassigned style IDs, so we ignore such style definitions.
        if style_set is not None and style.style_id not in style_set:
            style_set[style.style_id] = style

    return Styles(
        paragraph_styles=paragraph_styles,
        character_styles=character_styles,
        table_styles=table_styles,
        numbering_styles=numbering_styles,
    )
# A style definition that is not a numbering style: its ID and its name.
Style = collections.namedtuple("Style", ["style_id", "name"])
def _read_style_element(element):
    """Read a style element into a ``Style``; ``name`` is None when absent."""
    name_element = element.find_child_or_null("w:name")
    return Style(
        style_id=_read_style_id(element),
        name=name_element.attributes.get("w:val"),
    )
# A numbering style definition: its ID and the numbering ID it refers to.
NumberingStyle = collections.namedtuple("NumberingStyle", ["style_id", "num_id"])
def _read_numbering_style_element(element):
    """Read a numbering style element into a ``NumberingStyle``.

    ``num_id`` comes from ``w:pPr/w:numPr/w:numId`` and is None when any part
    of that path, or its ``w:val`` attribute, is missing.
    """
    style_id = _read_style_id(element)
    paragraph_properties = element.find_child_or_null("w:pPr")
    numbering_properties = paragraph_properties.find_child_or_null("w:numPr")
    num_id_element = numbering_properties.find_child_or_null("w:numId")
    return NumberingStyle(
        style_id=style_id,
        num_id=num_id_element.attributes.get("w:val"),
    )
def _read_style_id(element):
    """Return the element's ``w:styleId``; raises KeyError when it is missing."""
    attributes = element.attributes
    return attributes["w:styleId"]

View File

@@ -0,0 +1,12 @@
def uri_to_zip_entry_name(base, uri):
    """Turn a URI into a zip entry name.

    A URI starting with "/" has that slash removed; any other URI is
    appended to ``base`` with a "/" in between.
    """
    return uri[1:] if uri.startswith("/") else base + "/" + uri
def replace_fragment(uri, fragment):
    """Return ``uri`` with everything from its first "#" replaced by "#" + ``fragment``.

    When ``uri`` has no "#", the new fragment is simply appended.
    """
    without_fragment, _, _ = uri.partition("#")
    return without_fragment + "#" + fragment

View File

@@ -0,0 +1,121 @@
import xml.dom.minidom
import cobble
@cobble.data
class XmlElement(object):
    # Element name as produced by the parser.
    name = cobble.field()
    # Dict of attribute name to value.
    attributes = cobble.field()
    # List of child nodes (elements and text nodes).
    children = cobble.field()

    def find_child_or_null(self, name):
        """Return the first child element called ``name``, or the shared null element."""
        return self.find_child(name) or null_xml_element

    def find_child(self, name):
        """Return the first child element called ``name``, or None."""
        for child in self.children:
            if isinstance(child, XmlElement) and child.name == name:
                return child
        return None

    def find_children(self, name):
        """Return an ``XmlElementList`` of every child element called ``name``."""
        # Build a real list rather than passing a lazy filter object: on
        # Python 3 a filter object is exhausted after a single iteration, so
        # the result could otherwise be looped over only once.
        return XmlElementList([
            child
            for child in self.children
            if child.node_type == node_types.element and child.name == name
        ])
class XmlElementList(object):
    """A re-iterable collection of elements that supports ``find_children``."""

    def __init__(self, elements):
        # Materialise the input: callers may pass a one-shot iterator (such as
        # a Python 3 filter object), which would otherwise yield nothing the
        # second time the list is iterated.
        self._elements = list(elements)

    def __iter__(self):
        return iter(self._elements)

    def find_children(self, name):
        """Return the children called ``name`` of every element, in order."""
        return XmlElementList([
            child
            for element in self._elements
            for child in element.find_children(name)
        ])
class NullXmlElement(object):
    """Stand-in for a missing element: no attributes, no children.

    Lets lookups be chained through elements that may be absent.
    """

    # Class-level, so shared by every instance.
    attributes = dict()
    children = list()

    def find_child_or_null(self, name):
        """Return this null element itself, whatever ``name`` is."""
        return self

    def find_child(self, name):
        """Always return None."""
        return None
# Shared null element returned by XmlElement.find_child_or_null.
null_xml_element = NullXmlElement()
@cobble.data
class XmlText(object):
    # The character data held by this text node.
    value = cobble.field()
def element(name, attributes=None, children=None):
    """Create an ``XmlElement``.

    Falsy ``attributes`` or ``children`` are replaced by a new empty dict or
    list respectively.
    """
    if not attributes:
        attributes = {}
    if not children:
        children = []
    return XmlElement(name, attributes, children)
# Factory alias: text(value) builds an XmlText node.
text = XmlText
class node_types(object):
    """Integer tags identifying the kind of a node (see ``node_type``)."""

    element, text = 1, 3
# Tag each node class with its kind so code can branch on node.node_type.
XmlElement.node_type = node_types.element
XmlText.node_type = node_types.text
def parse_xml(fileobj, namespace_mapping=None):
    """Parse XML from ``fileobj`` into ``XmlElement`` / ``XmlText`` nodes.

    Args:
        fileobj: source handed to ``xml.dom.minidom.parse``.
        namespace_mapping: optional iterable of ``(prefix, uri)`` pairs.
            Names in a mapped namespace become ``prefix:local``; names in an
            unmapped namespace become ``{uri}local``; names with no namespace
            keep their local name.

    Returns:
        The converted document element.  Namespace declaration attributes
        are left out, as is every node that is neither an element nor a
        text node.
    """
    namespace_prefixes = {}
    if namespace_mapping is not None:
        for prefix, uri in namespace_mapping:
            namespace_prefixes[uri] = prefix

    document = xml.dom.minidom.parse(fileobj)
    xmlns_uri = "http://www.w3.org/2000/xmlns/"

    def qualified_name(node):
        uri = node.namespaceURI
        if uri is None:
            return node.localName

        prefix = namespace_prefixes.get(uri)
        if prefix is None:
            return "{%s}%s" % (uri, node.localName)
        return "%s:%s" % (prefix, node.localName)

    def to_element(dom_element):
        name = qualified_name(dom_element)

        attributes = {}
        for attribute in dom_element.attributes.values():
            if attribute.namespaceURI == xmlns_uri:
                continue
            attributes[qualified_name(attribute)] = attribute.value

        children = [
            converted
            for converted in map(to_node, dom_element.childNodes)
            if converted is not None
        ]
        return XmlElement(name, attributes, children)

    def to_node(dom_node):
        node_type = dom_node.nodeType
        if node_type == xml.dom.Node.ELEMENT_NODE:
            return to_element(dom_node)
        if node_type == xml.dom.Node.TEXT_NODE:
            return XmlText(dom_node.nodeValue)
        return None

    return to_node(document.documentElement)