Initial commit (Clean history)
This commit is contained in:
@@ -0,0 +1,211 @@
|
||||
from functools import partial
|
||||
import os
|
||||
|
||||
import cobble
|
||||
|
||||
from .. import results, lists, zips
|
||||
from .document_xml import read_document_xml_element
|
||||
from .content_types_xml import empty_content_types, read_content_types_xml_element
|
||||
from .relationships_xml import read_relationships_xml_element, Relationships
|
||||
from .numbering_xml import read_numbering_xml_element, Numbering
|
||||
from .styles_xml import read_styles_xml_element, Styles
|
||||
from .notes_xml import read_endnotes_xml_element, read_footnotes_xml_element
|
||||
from .comments_xml import read_comments_xml_element
|
||||
from .files import Files
|
||||
from . import body_xml, office_xml
|
||||
from ..zips import open_zip
|
||||
|
||||
|
||||
# Default result used when an optional part (footnotes, endnotes, comments)
# is missing from the package: a successful result wrapping an empty list.
_empty_result = results.success([])
|
||||
|
||||
|
||||
def read(fileobj, external_file_access=False):
    """Read a .docx file object and return the result of reading its main document.

    The notes (footnotes and endnotes) and the comments are read first, and
    their values are then handed to the main document reader.

    :param fileobj: file-like object holding the .docx zip archive. If it has
        a ``name`` attribute, that value is passed on as the document path.
    :param external_file_access: forwarded unchanged to ``Files``.
    :returns: the combined notes/comments result bound onto the result of
        ``_read_document``.
    """
    archive = open_zip(fileobj, "r")
    paths = _find_part_paths(archive)
    document_path = getattr(fileobj, "name", None)
    part_reader = _part_with_body_reader(
        document_path,
        archive,
        part_paths=paths,
        external_file_access=external_file_access,
    )

    def read_main_document(referents):
        # referents holds the combined values in the order they were
        # combined below: notes first, then comments.
        return _read_document(
            archive,
            part_reader,
            notes=referents[0],
            comments=referents[1],
            part_paths=paths,
        )

    notes_result = _read_notes(part_reader, paths)
    comments_result = _read_comments(part_reader, paths)
    return results.combine([notes_result, comments_result]).bind(read_main_document)
|
||||
|
||||
|
||||
@cobble.data
class _PartPaths(object):
    # Zip entry names of the package parts used by the reader, as resolved by
    # _find_part_paths. Each value is either a relationship target that exists
    # in the archive or the conventional "word/<name>.xml" fallback path.
    main_document = cobble.field()
    comments = cobble.field()
    endnotes = cobble.field()
    footnotes = cobble.field()
    numbering = cobble.field()
    styles = cobble.field()
|
||||
|
||||
|
||||
def _find_part_paths(zip_file):
    """Locate the main document and its related parts inside the archive.

    The main document is found through the package relationships in
    ``_rels/.rels``. The remaining parts are found through the main
    document's own relationships, falling back to ``word/<name>.xml``.

    :param zip_file: opened zip archive of the .docx package.
    :returns: a ``_PartPaths`` holding the zip entry name of each part.
    """
    main_document_path = _find_document_filename(
        zip_file,
        _read_relationships(zip_file, "_rels/.rels"),
    )
    relationships_path = _find_relationships_path_for(main_document_path)
    document_relationships = _read_relationships(zip_file, relationships_path)
    # Relationship targets are resolved relative to the directory that holds
    # the main document.
    document_directory = zips.split_path(main_document_path)[0]
    relationship_type_prefix = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/"

    def find(name):
        return _find_part_path(
            zip_file=zip_file,
            relationships=document_relationships,
            relationship_type=relationship_type_prefix + name,
            fallback_path="word/{0}.xml".format(name),
            base_path=document_directory,
        )

    part_names = ("comments", "endnotes", "footnotes", "numbering", "styles")
    located = dict((name, find(name)) for name in part_names)
    return _PartPaths(main_document=main_document_path, **located)
|
||||
|
||||
|
||||
def _find_document_filename(zip_file, relationships):
    """Return the zip entry name of the main document part.

    :param zip_file: opened zip archive of the .docx package.
    :param relationships: package-level relationships.
    :raises IOError: if neither a relationship target nor the fallback
        ``word/document.xml`` exists in the archive.
    """
    candidate = _find_part_path(
        zip_file,
        relationships,
        relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
        base_path="",
        fallback_path="word/document.xml",
    )
    if not zip_file.exists(candidate):
        raise IOError("Could not find main document part. Are you sure this is a valid .docx file?")
    return candidate
|
||||
|
||||
|
||||
def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
|
||||
targets = [
|
||||
zips.join_path(base_path, target).lstrip("/")
|
||||
for target in relationships.find_targets_by_type(relationship_type)
|
||||
]
|
||||
valid_targets = list(filter(lambda target: zip_file.exists(target), targets))
|
||||
if len(valid_targets) == 0:
|
||||
return fallback_path
|
||||
else:
|
||||
return valid_targets[0]
|
||||
|
||||
|
||||
def _read_notes(read_part_with_body, part_paths):
    """Read the footnotes and endnotes parts into one flattened result.

    A missing part contributes the empty default result. Footnotes are read
    before endnotes.

    :param read_part_with_body: part reader built by ``_part_with_body_reader``.
    :param part_paths: ``_PartPaths`` giving the location of each part.
    """
    def read_notes_part(path, read_xml_element):
        return read_part_with_body(
            path,
            lambda root, body_reader: read_xml_element(root, body_reader=body_reader),
            default=_empty_result,
        )

    footnotes = read_notes_part(part_paths.footnotes, read_footnotes_xml_element)
    endnotes = read_notes_part(part_paths.endnotes, read_endnotes_xml_element)

    combined = results.combine([footnotes, endnotes])
    return combined.map(lists.flatten)
|
||||
|
||||
|
||||
def _read_comments(read_part_with_body, part_paths):
    """Read the comments part, or return the empty default when it is absent.

    :param read_part_with_body: part reader built by ``_part_with_body_reader``.
    :param part_paths: ``_PartPaths`` giving the location of each part.
    """
    def read_comments_root(root, body_reader):
        return read_comments_xml_element(root, body_reader=body_reader)

    return read_part_with_body(
        part_paths.comments,
        read_comments_root,
        default=_empty_result,
    )
|
||||
|
||||
|
||||
def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
    """Read the main document part with the notes and comments already read.

    No default is supplied to the part reader for the main document.

    :param zip_file: accepted for the caller's convenience; not used here.
    :param read_part_with_body: part reader built by ``_part_with_body_reader``.
    :param notes: notes passed on to ``read_document_xml_element``.
    :param comments: comments passed on to ``read_document_xml_element``.
    :param part_paths: ``_PartPaths`` giving the location of each part.
    """
    read_document_element = partial(
        read_document_xml_element,
        notes=notes,
        comments=comments,
    )
    return read_part_with_body(part_paths.main_document, read_document_element)
|
||||
|
||||
|
||||
def _part_with_body_reader(document_path, zip_file, part_paths, external_file_access):
    """Build a function that reads a part of the package with a body reader.

    Content types, styles and numbering are read once up front (each falling
    back to its empty value when the entry is missing) and shared by every
    part read through the returned function.

    :param document_path: path of the document on disk, or ``None``; its
        directory is handed to ``Files``.
    :param zip_file: opened zip archive of the .docx package.
    :param part_paths: ``_PartPaths`` giving the location of each part.
    :param external_file_access: forwarded unchanged to ``Files``.
    :returns: ``read_part(name, reader, default=...)``. Without ``default``
        the entry is read unconditionally; with it, ``default`` is returned
        when the entry does not exist.
    """
    content_types = _try_read_entry_or_default(
        zip_file,
        "[Content_Types].xml",
        read_content_types_xml_element,
        default=empty_content_types,
    )
    styles = _try_read_entry_or_default(
        zip_file,
        part_paths.styles,
        read_styles_xml_element,
        default=Styles.EMPTY,
    )

    def read_numbering_root(element):
        return read_numbering_xml_element(element, styles=styles)

    numbering = _try_read_entry_or_default(
        zip_file,
        part_paths.numbering,
        read_numbering_root,
        default=Numbering.EMPTY,
    )

    if document_path is None:
        base_directory = None
    else:
        base_directory = os.path.dirname(document_path)
    files = Files(base_directory, external_file_access=external_file_access)

    def read_part(name, reader, default=_undefined):
        # Each part has its own relationships file alongside it.
        part_relationships = _read_relationships(zip_file, _find_relationships_path_for(name))
        reader_with_body = partial(
            reader,
            body_reader=body_xml.reader(
                numbering=numbering,
                content_types=content_types,
                relationships=part_relationships,
                styles=styles,
                docx_file=zip_file,
                files=files,
            ),
        )

        if default is _undefined:
            return _read_entry(zip_file, name, reader_with_body)
        return _try_read_entry_or_default(zip_file, name, reader_with_body, default=default)

    return read_part
|
||||
|
||||
|
||||
|
||||
def _find_relationships_path_for(name):
    """Return the path of the relationships entry that belongs to a part.

    For a part at ``<dir>/<file>`` this is ``<dir>/_rels/<file>.rels``.
    """
    directory, filename = zips.split_path(name)
    rels_filename = filename + ".rels"
    return zips.join_path(directory, "_rels", rels_filename)
|
||||
|
||||
|
||||
def _read_relationships(zip_file, name):
    """Read a relationships entry, or return empty relationships if missing.

    :param zip_file: opened zip archive of the .docx package.
    :param name: zip entry name of the relationships file.
    """
    no_relationships = Relationships.EMPTY
    return _try_read_entry_or_default(
        zip_file,
        name,
        read_relationships_xml_element,
        default=no_relationships,
    )
|
||||
|
||||
def _try_read_entry_or_default(zip_file, name, reader, default):
|
||||
if zip_file.exists(name):
|
||||
return _read_entry(zip_file, name, reader)
|
||||
else:
|
||||
return default
|
||||
|
||||
|
||||
def _read_entry(zip_file, name, reader):
    """Open a zip entry, parse it as office XML and apply ``reader`` to it.

    The reader runs while the entry is still open; the entry is closed when
    the ``with`` block exits.
    """
    with zip_file.open(name) as entry:
        root = office_xml.read(entry)
        return reader(root)
|
||||
|
||||
|
||||
# Sentinel for "no default supplied" in read_part, so that any value
# (including None) can be passed as an explicit default.
_undefined = object()
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,794 @@
|
||||
import contextlib
|
||||
import re
|
||||
import sys
|
||||
|
||||
from .. import documents
|
||||
from .. import results
|
||||
from .. import lists
|
||||
from .. import transforms
|
||||
from . import complex_fields
|
||||
from .dingbats import dingbats
|
||||
from .xmlparser import node_types, XmlElement, null_xml_element
|
||||
from .styles_xml import Styles
|
||||
from .uris import replace_fragment, uri_to_zip_entry_name
|
||||
|
||||
# On Python 3, alias unichr to chr so that the unichr(...) calls further down
# resolve to a defined name.
if sys.version_info >= (3, ):
    unichr = chr
|
||||
|
||||
|
||||
def reader(
    numbering=None,
    content_types=None,
    relationships=None,
    styles=None,
    docx_file=None,
    files=None
):
    """Create a body reader for the XML of one part of a document.

    All arguments are forwarded to ``_create_reader``; ``styles`` falls back
    to ``Styles.EMPTY`` when it is ``None``.

    :returns: a ``_BodyReader`` wrapping the created reading function.
    """
    effective_styles = Styles.EMPTY if styles is None else styles

    return _BodyReader(_create_reader(
        numbering=numbering,
        content_types=content_types,
        relationships=relationships,
        styles=effective_styles,
        docx_file=docx_file,
        files=files,
    ))
|
||||
|
||||
|
||||
|
||||
class _BodyReader(object):
|
||||
def __init__(self, read_all):
|
||||
self._read_all = read_all
|
||||
|
||||
def read_all(self, elements):
|
||||
result = self._read_all(elements)
|
||||
return results.Result(result.elements, result.messages)
|
||||
|
||||
|
||||
def _create_reader(numbering, content_types, relationships, styles, docx_file, files):
    """Build the function that reads XML nodes of a document body.

    The nested handlers below close over the arguments and over three pieces
    of mutable state that persist across calls of the returned function: the
    instruction text of the complex field being read, the stack of open
    complex fields, and the contents of deleted paragraphs that are waiting
    to be merged into the next paragraph.

    :param numbering: used to look up numbering levels for paragraphs.
    :param content_types: used to look up the content type of images.
    :param relationships: used to resolve relationship IDs to targets.
    :param styles: used to look up style names by style ID.
    :param docx_file: archive from which embedded images are opened.
    :param files: used to open linked (external) images.
    :returns: ``_read_xml_elements``, which maps a sequence of XML nodes to
        a ``_ReadResult``.
    """
    # Text collected from w:instrText elements since the last field "begin".
    current_instr_text = []
    # Open complex fields, innermost last.
    complex_field_stack = []

    # When a paragraph is marked as deleted, its contents should be combined
    # with the following paragraph. See 17.13.5.15 del (Deleted Paragraph) of
    # ECMA-376 4th edition Part 1.
    deleted_paragraph_contents = []

    # Elements without a handler that are skipped without a warning.
    _ignored_elements = set([
        "office-word:wrap",
        "v:shadow",
        "v:shapetype",
        "w:annotationRef",
        "w:bookmarkEnd",
        "w:sectPr",
        "w:proofErr",
        "w:lastRenderedPageBreak",
        "w:commentRangeStart",
        "w:commentRangeEnd",
        "w:del",
        "w:footnoteRef",
        "w:endnoteRef",
        "w:pPr",
        "w:rPr",
        "w:tblPr",
        "w:tblGrid",
        "w:trPr",
        "w:tcPr",
    ])

    def text(element):
        """Read an element as a text element holding its inner text."""
        return _success(documents.Text(_inner_text(element)))

    def run(element):
        """Read a run and its formatting properties from w:rPr.

        While a hyperlink complex field is open, the run's children are
        wrapped in a hyperlink.
        """
        properties = element.find_child_or_null("w:rPr")
        vertical_alignment = properties \
            .find_child_or_null("w:vertAlign") \
            .attributes.get("w:val")
        font = properties.find_child_or_null("w:rFonts").attributes.get("w:ascii")

        font_size_string = properties.find_child_or_null("w:sz").attributes.get("w:val")
        if _is_int(font_size_string):
            # w:sz gives the font size in half points, so halve the value to get the size in points
            font_size = int(font_size_string) / 2
        else:
            font_size = None

        is_bold = read_boolean_element(properties.find_child("w:b"))
        is_italic = read_boolean_element(properties.find_child("w:i"))
        is_underline = read_underline_element(properties.find_child("w:u"))
        is_strikethrough = read_boolean_element(properties.find_child("w:strike"))
        is_all_caps = read_boolean_element(properties.find_child("w:caps"))
        is_small_caps = read_boolean_element(properties.find_child("w:smallCaps"))
        highlight = read_highlight_value(properties.find_child_or_null("w:highlight").attributes.get("w:val"))

        def add_complex_field_hyperlink(children):
            hyperlink_kwargs = current_hyperlink_kwargs()
            if hyperlink_kwargs is None:
                return children
            else:
                return [documents.hyperlink(children=children, **hyperlink_kwargs)]

        return _ReadResult.map_results(
            _read_run_style(properties),
            _read_xml_elements(element.children).map(add_complex_field_hyperlink),
            lambda style, children: documents.run(
                children=children,
                style_id=style[0],
                style_name=style[1],
                is_bold=is_bold,
                is_italic=is_italic,
                is_underline=is_underline,
                is_strikethrough=is_strikethrough,
                is_all_caps=is_all_caps,
                is_small_caps=is_small_caps,
                vertical_alignment=vertical_alignment,
                font=font,
                font_size=font_size,
                highlight=highlight,
            ))

    def _read_run_style(properties):
        """Read the character style referenced by w:rStyle."""
        return _read_style(properties, "w:rStyle", "Run", styles.find_character_style_by_id)

    def read_boolean_element(element):
        """Return False for a missing element, else the boolean of its w:val."""
        if element is None:
            return False
        else:
            return read_boolean_attribute_value(element.attributes.get("w:val"))

    def read_boolean_attribute_value(value):
        """Return True unless the value is "false" or "0" (a missing value is True)."""
        return value not in ["false", "0"]

    def read_underline_element(element):
        """Return a truthy value when the element has a w:val that enables underline.

        A falsy element is returned as is; otherwise the result is whether
        w:val is something other than missing, "false", "0" or "none".
        """
        return element and element.attributes.get("w:val") not in [None, "false", "0", "none"]

    def read_highlight_value(value):
        """Return the highlight value, or None when it is empty or "none"."""
        if not value or value == "none":
            return None
        else:
            return value

    def paragraph(element):
        """Read a paragraph, merging in the contents of preceding deleted paragraphs.

        A paragraph marked as deleted produces no output; its children are
        stored and prepended to the children of the next paragraph read.
        """
        properties = element.find_child_or_null("w:pPr")

        is_deleted = properties.find_child_or_null("w:rPr").find_child("w:del")

        if is_deleted is not None:
            for child in element.children:
                deleted_paragraph_contents.append(child)
            return _empty_result

        else:
            alignment = properties.find_child_or_null("w:jc").attributes.get("w:val")
            indent = _read_paragraph_indent(properties.find_child_or_null("w:ind"))

            children_xml = element.children
            if deleted_paragraph_contents:
                # Build a new list before clearing the shared buffer in place.
                children_xml = deleted_paragraph_contents + children_xml
                del deleted_paragraph_contents[:]

            return _ReadResult.map_results(
                _read_paragraph_style(properties),
                _read_xml_elements(children_xml),
                lambda style, children: documents.paragraph(
                    children=children,
                    style_id=style[0],
                    style_name=style[1],
                    numbering=_read_numbering_properties(
                        paragraph_style_id=style[0],
                        element=properties.find_child_or_null("w:numPr"),
                    ),
                    alignment=alignment,
                    indent=indent,
                )).append_extra()

    def _read_paragraph_style(properties):
        """Read the paragraph style referenced by w:pStyle."""
        return _read_style(properties, "w:pStyle", "Paragraph", styles.find_paragraph_style_by_id)

    def current_hyperlink_kwargs():
        """Return the kwargs of the innermost open hyperlink field, or None."""
        for complex_field in reversed(complex_field_stack):
            if isinstance(complex_field, complex_fields.Hyperlink):
                return complex_field.kwargs

        return None

    def read_fld_char(element):
        """Update the complex field stack for a w:fldChar element.

        "begin" pushes a new field and resets the instruction text,
        "separate" replaces the top of the stack with the parsed field, and
        "end" pops the field, emitting a checkbox if that is what it was.
        """
        fld_char_type = element.attributes.get("w:fldCharType")
        if fld_char_type == "begin":
            complex_field_stack.append(complex_fields.begin(fld_char=element))
            del current_instr_text[:]

        elif fld_char_type == "end":
            complex_field = complex_field_stack.pop()
            # A field still in its Begin state had no "separate", so its
            # instruction text has not been parsed yet.
            if isinstance(complex_field, complex_fields.Begin):
                complex_field = parse_current_instr_text(complex_field)

            if isinstance(complex_field, complex_fields.Checkbox):
                return _success(documents.checkbox(checked=complex_field.checked))

        elif fld_char_type == "separate":
            complex_field_separate = complex_field_stack.pop()
            complex_field = parse_current_instr_text(complex_field_separate)
            complex_field_stack.append(complex_field)

        return _empty_result

    def parse_current_instr_text(complex_field):
        """Parse the collected instruction text for the given complex field."""
        instr_text = "".join(current_instr_text)

        if isinstance(complex_field, complex_fields.Begin):
            fld_char = complex_field.fld_char
        else:
            fld_char = null_xml_element

        return parse_instr_text(instr_text, fld_char=fld_char)

    def parse_instr_text(instr_text, *, fld_char):
        """Parse field instruction text into a complex field, or None.

        Recognises external HYPERLINK fields, internal HYPERLINK \\l fields
        and FORMCHECKBOX fields; anything else yields None.
        """
        external_link_result = re.match(r'\s*HYPERLINK "(.*)"', instr_text)
        if external_link_result is not None:
            return complex_fields.hyperlink(dict(href=external_link_result.group(1)))

        internal_link_result = re.match(r'\s*HYPERLINK\s+\\l\s+"(.*)"', instr_text)
        if internal_link_result is not None:
            return complex_fields.hyperlink(dict(anchor=internal_link_result.group(1)))

        checkbox_result = re.match(r'\s*FORMCHECKBOX\s*', instr_text)
        if checkbox_result is not None:
            checkbox_element = fld_char \
                .find_child_or_null("w:ffData") \
                .find_child_or_null("w:checkBox")
            checked_element = checkbox_element.find_child("w:checked")

            # w:checked takes precedence; w:default is used only without it.
            if checked_element is None:
                checked = read_boolean_element(checkbox_element.find_child("w:default"))
            else:
                checked = read_boolean_element(checked_element)

            return complex_fields.checkbox(checked=checked)

        return None

    def read_instr_text(element):
        """Append the element's inner text to the current instruction text."""
        current_instr_text.append(_inner_text(element))
        return _empty_result

    def _read_style(properties, style_tag_name, style_type, find_style_by_id):
        """Read a style reference as a result whose elements are [style_id, style_name].

        A warning is recorded when the style ID is present but no matching
        style is found.
        """
        messages = []
        style_id = properties \
            .find_child_or_null(style_tag_name) \
            .attributes.get("w:val")

        if style_id is None:
            style_name = None
        else:
            style = find_style_by_id(style_id)
            if style is None:
                style_name = None
                messages.append(_undefined_style_warning(style_type, style_id))
            else:
                style_name = style.name

        return _ReadResult([style_id, style_name], [], messages)

    def _undefined_style_warning(style_type, style_id):
        """Build the warning for a style ID that is referenced but not defined."""
        return results.warning("{0} style with ID {1} was referenced but not defined in the document".format(style_type, style_id))

    def _read_numbering_properties(paragraph_style_id, element):
        """Find the numbering level for a paragraph, or None.

        Tried in order: explicit w:numId plus w:ilvl, the level associated
        with the paragraph style, then w:numId alone with level "0".
        """
        num_id = element.find_child_or_null("w:numId").attributes.get("w:val")
        level_index = element.find_child_or_null("w:ilvl").attributes.get("w:val")
        if num_id is not None and level_index is not None:
            return numbering.find_level(num_id, level_index)

        if paragraph_style_id is not None:
            level = numbering.find_level_by_paragraph_style_id(paragraph_style_id)
            if level is not None:
                return level

        # Some malformed documents define numbering levels without an index, and
        # reference the numbering using a w:numPr element without a w:ilvl child.
        # To handle such cases, we assume a level of 0 as a fallback.
        if num_id is not None:
            return numbering.find_level(num_id, "0")

        return None

    def _read_paragraph_indent(element):
        """Read paragraph indentation, preferring w:start/w:end over w:left/w:right."""
        attributes = element.attributes
        return documents.paragraph_indent(
            start=attributes.get("w:start") or attributes.get("w:left"),
            end=attributes.get("w:end") or attributes.get("w:right"),
            first_line=attributes.get("w:firstLine"),
            hanging=attributes.get("w:hanging"),
        )

    def tab(element):
        """Read a tab element."""
        return _success(documents.tab())


    def no_break_hyphen(element):
        """Read a non-breaking hyphen as the text character U+2011."""
        return _success(documents.text(unichr(0x2011)))


    def soft_hyphen(element):
        """Read a soft hyphen as the text character U+00AD."""
        return _success(documents.text(u"\u00ad"))

    def symbol(element):
        """Read a w:sym element by looking its font and char up in dingbats.

        An unsupported character is dropped with a warning.
        """
        # See 17.3.3.30 sym (Symbol Character) of ECMA-376 4th edition Part 1
        font = element.attributes.get("w:font")
        char = element.attributes.get("w:char")

        unicode_code_point = dingbats.get((font, int(char, 16)))

        # For a char of the form F0xx, retry the lookup without the F0 prefix.
        if unicode_code_point is None and re.match("^F0..", char):
            unicode_code_point = dingbats.get((font, int(char[2:], 16)))

        if unicode_code_point is None:
            warning = results.warning("A w:sym element with an unsupported character was ignored: char {0} in font {1}".format(
                char,
                font,
            ))
            return _empty_result_with_message(warning)
        else:
            return _success(documents.text(unichr(unicode_code_point)))


    def table(element):
        """Read a table, computing row spans across its rows."""
        properties = element.find_child_or_null("w:tblPr")
        return _ReadResult.map_results(
            read_table_style(properties),
            _read_xml_elements(element.children)
                .flat_map(calculate_row_spans),

            lambda style, children: documents.table(
                children=children,
                style_id=style[0],
                style_name=style[1],
            ),
        )


    def read_table_style(properties):
        """Read the table style referenced by w:tblStyle."""
        return _read_style(properties, "w:tblStyle", "Table", styles.find_table_style_by_id)


    def table_row(element):
        """Read a table row; a row marked as deleted produces no output."""
        properties = element.find_child_or_null("w:trPr")

        # See 17.13.5.12 del (Deleted Table Row) of ECMA-376 4th edition Part 1
        is_deleted = bool(properties.find_child("w:del"))
        if is_deleted:
            return _empty_result

        is_header = bool(properties.find_child("w:tblHeader"))
        return _read_xml_elements(element.children) \
            .map(lambda children: documents.table_row(
                children=children,
                is_header=is_header,
            ))


    def table_cell(element):
        """Read a table cell as an unmerged cell with rowspan 1.

        The column span comes from w:gridSpan and defaults to 1; vertical
        merging is resolved later by calculate_row_spans.
        """
        properties = element.find_child_or_null("w:tcPr")
        gridspan = properties \
            .find_child_or_null("w:gridSpan") \
            .attributes.get("w:val")

        if gridspan is None:
            colspan = 1
        else:
            colspan = int(gridspan)

        return _read_xml_elements(element.children) \
            .map(lambda children: documents.table_cell_unmerged(
                children=children,
                colspan=colspan,
                rowspan=1,
                vmerge=read_vmerge(properties),
            ))

    def read_vmerge(properties):
        """Return whether the cell continues a vertical merge.

        True when w:vMerge is present with a w:val of "continue" or with no
        (or an empty) w:val; False when w:vMerge is absent.
        """
        vmerge_element = properties.find_child("w:vMerge")
        if vmerge_element is None:
            return False
        else:
            val = vmerge_element.attributes.get("w:val")
            return val == "continue" or not val


    def calculate_row_spans(rows):
        """Turn unmerged cells into table cells with row spans applied.

        If the table contains anything other than rows of unmerged cells,
        the cells are converted without merging and a warning is returned.
        """
        unexpected_non_rows = any(
            not isinstance(row, documents.TableRow)
            for row in rows
        )
        if unexpected_non_rows:
            rows = remove_unmerged_table_cells(rows)
            return _elements_result_with_messages(rows, [results.warning(
                "unexpected non-row element in table, cell merging may be incorrect"
            )])

        unexpected_non_cells = any(
            not isinstance(cell, documents.TableCellUnmerged)
            for row in rows
            for cell in row.children
        )
        if unexpected_non_cells:
            rows = remove_unmerged_table_cells(rows)
            return _elements_result_with_messages(rows, [results.warning(
                "unexpected non-cell element in table row, cell merging may be incorrect"
            )])

        # Maps a column index to the most recent cell that starts a span there.
        columns = {}
        for row in rows:
            cell_index = 0
            for cell in row.children:
                if cell.vmerge and cell_index in columns:
                    columns[cell_index].rowspan += 1
                else:
                    columns[cell_index] = cell
                    cell.vmerge = False
                cell_index += cell.colspan

        # Cells still marked vmerge were absorbed into a cell above them.
        for row in rows:
            row.children = [
                documents.table_cell(
                    children=cell.children,
                    colspan=cell.colspan,
                    rowspan=cell.rowspan,
                )
                for cell in row.children
                if not cell.vmerge
            ]

        return _success(rows)


    def remove_unmerged_table_cells(rows):
        """Convert every unmerged cell to a table cell without merging."""
        return list(map(
            transforms.element_of_type(
                documents.TableCellUnmerged,
                lambda cell: documents.table_cell(
                    children=cell.children,
                    colspan=cell.colspan,
                    rowspan=cell.rowspan,
                ),
            ),
            rows,
        ))


    def read_child_elements(element):
        """Read the children of an element, discarding the element itself."""
        return _read_xml_elements(element.children)


    def pict(element):
        """Read the children of a w:pict and move them into the extra elements."""
        return read_child_elements(element).to_extra()


    def hyperlink(element):
        """Read a w:hyperlink element.

        A relationship ID gives an href (with the anchor as its fragment, if
        any); an anchor alone gives an internal link; with neither, the
        children are returned unwrapped.
        """
        relationship_id = element.attributes.get("r:id")
        anchor = element.attributes.get("w:anchor")
        target_frame = element.attributes.get("w:tgtFrame") or None
        children_result = _read_xml_elements(element.children)

        def create(**kwargs):
            return children_result.map(lambda children: documents.hyperlink(
                children=children,
                target_frame=target_frame,
                **kwargs
            ))

        if relationship_id is not None:
            href = relationships.find_target_by_relationship_id(relationship_id)
            if anchor is not None:
                href = replace_fragment(href, anchor)

            return create(href=href)
        elif anchor is not None:
            return create(anchor=anchor)
        else:
            return children_result


    def bookmark_start(element):
        """Read a bookmark, skipping the one named "_GoBack"."""
        name = element.attributes.get("w:name")
        if name == "_GoBack":
            return _empty_result
        else:
            return _success(documents.bookmark(name))


    def break_(element):
        """Read a w:br element as a line, page or column break.

        A missing type is treated like "textWrapping"; any other type is
        dropped with a warning.
        """
        break_type = element.attributes.get("w:type")

        if not break_type or break_type == "textWrapping":
            return _success(documents.line_break)
        elif break_type == "page":
            return _success(documents.page_break)
        elif break_type == "column":
            return _success(documents.column_break)
        else:
            warning = results.warning("Unsupported break type: {0}".format(break_type))
            return _empty_result_with_message(warning)


    def inline(element):
        """Read the images of a wp:inline or wp:anchor element.

        The alt text is the description from wp:docPr when it is not blank,
        otherwise the title.
        """
        properties = element.find_child_or_null("wp:docPr").attributes
        if properties.get("descr", "").strip():
            alt_text = properties.get("descr")
        else:
            alt_text = properties.get("title")
        blips = element.find_children("a:graphic") \
            .find_children("a:graphicData") \
            .find_children("pic:pic") \
            .find_children("pic:blipFill") \
            .find_children("a:blip")
        return _read_blips(blips, alt_text)

    def _read_blips(blips, alt_text):
        """Read each blip and concatenate the results."""
        return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text), blips))

    def _read_blip(element, alt_text):
        """Read one a:blip element as an image, warning if it has no image file."""
        blip_image = _find_blip_image(element)

        if blip_image is None:
            warning = results.warning("Could not find image file for a:blip element")
            return _empty_result_with_message(warning)
        else:
            return _read_image(blip_image, alt_text)

    def _read_image(image_file, alt_text):
        """Build an image element from a (path, open function) pair.

        A warning is attached when the content type is not one of the listed
        image types.
        """
        image_path, open_image = image_file
        content_type = content_types.find_content_type(image_path)
        image = documents.image(alt_text=alt_text, content_type=content_type, open=open_image)

        if content_type in ["image/png", "image/gif", "image/jpeg", "image/svg+xml", "image/tiff"]:
            messages = []
        else:
            messages = [results.warning("Image of type {0} is unlikely to display in web browsers".format(content_type))]

        return _element_result_with_messages(image, messages)

    def _find_blip_image(element):
        """Return the (path, open function) pair for a blip, or None.

        An embedded image (r:embed) takes precedence over a linked one (r:link).
        """
        embed_relationship_id = element.attributes.get("r:embed")
        link_relationship_id = element.attributes.get("r:link")
        if embed_relationship_id is not None:
            return _find_embedded_image(embed_relationship_id)
        elif link_relationship_id is not None:
            return _find_linked_image(link_relationship_id)
        else:
            return None

    def _find_embedded_image(relationship_id):
        """Return the zip entry path of an embedded image and a function to open it."""
        target = relationships.find_target_by_relationship_id(relationship_id)
        image_path = uri_to_zip_entry_name("word", target)

        def open_image():
            image_file = docx_file.open(image_path)
            # Wrap file objects that are not context managers so that callers
            # can always use the result in a with statement.
            if hasattr(image_file, "__exit__"):
                return image_file
            else:
                return contextlib.closing(image_file)

        return image_path, open_image


    def _find_linked_image(relationship_id):
        """Return the target of a linked image and a function to open it via files."""
        image_path = relationships.find_target_by_relationship_id(relationship_id)

        def open_image():
            return files.open(image_path)

        return image_path, open_image

    def read_imagedata(element):
        """Read a v:imagedata element as an embedded image titled by o:title.

        An element without a relationship ID is dropped with a warning.
        """
        relationship_id = element.attributes.get("r:id")
        if relationship_id is None:
            warning = results.warning("A v:imagedata element without a relationship ID was ignored")
            return _empty_result_with_message(warning)
        else:
            title = element.attributes.get("o:title")
            return _read_image(_find_embedded_image(relationship_id), title)

    def note_reference_reader(note_type):
        """Create a handler that reads references to notes of the given type."""
        def note_reference(element):
            return _success(documents.note_reference(note_type, element.attributes["w:id"]))

        return note_reference

    def read_comment_reference(element):
        """Read a reference to the comment identified by w:id."""
        return _success(documents.comment_reference(element.attributes["w:id"]))

    def alternate_content(element):
        """Read the children of the mc:Fallback child of the element."""
        return read_child_elements(element.find_child_or_null("mc:Fallback"))

    def read_sdt(element):
        """Read a structured document tag, converting a checkbox tag to a checkbox.

        For a checkbox tag, the first non-empty text element in the content
        is replaced by the checkbox; if there is none, the checkbox replaces
        the whole content.
        """
        content_result = read_child_elements(element.find_child_or_null("w:sdtContent"))

        def handle_content(content):
            # From the WordML standard: https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/3350cb64-931f-41f7-8824-f18b2568ce66
            #
            # > A CT_SdtCheckbox element that specifies that the parent
            # > structured document tag is a checkbox when displayed in the
            # > document. The parent structured document tag contents MUST
            # > contain a single character and optionally an additional
            # > character in a deleted run.
            checkbox = element.find_child_or_null("w:sdtPr").find_child("wordml:checkbox")

            if checkbox is None:
                return content

            checked_element = checkbox.find_child("wordml:checked")
            is_checked = (
                checked_element is not None and
                read_boolean_attribute_value(checked_element.attributes.get("wordml:val"))
            )
            document_checkbox = documents.checkbox(checked=is_checked)

            has_checkbox = False

            def transform_text(text):
                nonlocal has_checkbox
                if len(text.value) > 0 and not has_checkbox:
                    has_checkbox = True
                    return document_checkbox
                else:
                    return text

            replaced_content = list(map(
                transforms.element_of_type(documents.Text, transform_text),
                content,
            ))

            if has_checkbox:
                return replaced_content
            else:
                return document_checkbox

        return content_result.map(handle_content)

    # Dispatch table from XML element name to the handler that reads it.
    handlers = {
        "w:t": text,
        "w:r": run,
        "w:p": paragraph,
        "w:fldChar": read_fld_char,
        "w:instrText": read_instr_text,
        "w:tab": tab,
        "w:noBreakHyphen": no_break_hyphen,
        "w:softHyphen": soft_hyphen,
        "w:sym": symbol,
        "w:tbl": table,
        "w:tr": table_row,
        "w:tc": table_cell,
        "w:ins": read_child_elements,
        "w:object": read_child_elements,
        "w:smartTag": read_child_elements,
        "w:drawing": read_child_elements,
        "v:group": read_child_elements,
        "v:rect": read_child_elements,
        "v:roundrect": read_child_elements,
        "v:shape": read_child_elements,
        "v:textbox": read_child_elements,
        "w:txbxContent": read_child_elements,
        "w:pict": pict,
        "w:hyperlink": hyperlink,
        "w:bookmarkStart": bookmark_start,
        "w:br": break_,
        "wp:inline": inline,
        "wp:anchor": inline,
        "v:imagedata": read_imagedata,
        "w:footnoteReference": note_reference_reader("footnote"),
        "w:endnoteReference": note_reference_reader("endnote"),
        "w:commentReference": read_comment_reference,
        "mc:AlternateContent": alternate_content,
        "w:sdt": read_sdt
    }

    def read(element):
        """Read one XML element with its handler.

        An element without a handler yields an empty result, with a warning
        unless its name is in the ignored set.
        """
        handler = handlers.get(element.name)
        if handler is None:
            if element.name not in _ignored_elements:
                warning = results.warning("An unrecognised element was ignored: {0}".format(element.name))
                return _empty_result_with_message(warning)
            else:
                return _empty_result
        else:
            return handler(element)


    def _read_xml_elements(nodes):
        """Read the XmlElement nodes among ``nodes`` and concatenate the results."""
        elements = filter(lambda node: isinstance(node, XmlElement), nodes)
        return _ReadResult.concat(lists.map(read, elements))

    return _read_xml_elements
|
||||
|
||||
|
||||
def _inner_text(node):
    """Return the text of a node: its own value, or its children's text joined."""
    if node.node_type != node_types.text:
        pieces = [_inner_text(child) for child in node.children]
        return "".join(pieces)
    return node.value
|
||||
|
||||
|
||||
|
||||
class _ReadResult(object):
|
||||
@staticmethod
|
||||
def concat(results):
|
||||
return _ReadResult(
|
||||
lists.flat_map(lambda result: result.elements, results),
|
||||
lists.flat_map(lambda result: result.extra, results),
|
||||
lists.flat_map(lambda result: result.messages, results))
|
||||
|
||||
|
||||
@staticmethod
|
||||
def map_results(first, second, func):
|
||||
return _ReadResult(
|
||||
[func(first.elements, second.elements)],
|
||||
first.extra + second.extra,
|
||||
first.messages + second.messages)
|
||||
|
||||
def __init__(self, elements, extra, messages):
|
||||
self.elements = elements
|
||||
self.extra = extra
|
||||
self.messages = messages
|
||||
|
||||
def map(self, func):
|
||||
elements = func(self.elements)
|
||||
if not isinstance(elements, list):
|
||||
elements = [elements]
|
||||
return _ReadResult(
|
||||
elements,
|
||||
self.extra,
|
||||
self.messages)
|
||||
|
||||
def flat_map(self, func):
|
||||
result = func(self.elements)
|
||||
return _ReadResult(
|
||||
result.elements,
|
||||
self.extra + result.extra,
|
||||
self.messages + result.messages)
|
||||
|
||||
|
||||
def to_extra(self):
|
||||
return _ReadResult([], _concat(self.extra, self.elements), self.messages)
|
||||
|
||||
def append_extra(self):
|
||||
return _ReadResult(_concat(self.elements, self.extra), [], self.messages)
|
||||
|
||||
def _success(elements):
    """Build a result holding ``elements`` with no extras and no messages.

    A value that is not a list is wrapped in a one-item list first.
    """
    wrapped = elements if isinstance(elements, list) else [elements]
    return _ReadResult(wrapped, [], [])
|
||||
|
||||
def _element_result_with_messages(element, messages):
    """Build a result holding the single ``element`` together with ``messages``."""
    single = [element]
    return _elements_result_with_messages(single, messages)
|
||||
|
||||
def _elements_result_with_messages(elements, messages):
    """Build a result holding ``elements`` and ``messages`` with no extras."""
    no_extras = []
    return _ReadResult(elements, no_extras, messages)
|
||||
|
||||
# Module-level result with no elements, no extras and no messages.
_empty_result = _ReadResult([], [], [])
|
||||
|
||||
def _empty_result_with_message(message):
    """Build a result with no elements or extras that carries only ``message``."""
    messages = [message]
    return _ReadResult([], [], messages)
|
||||
|
||||
def _concat(*values):
|
||||
result = []
|
||||
for value in values:
|
||||
for element in value:
|
||||
result.append(element)
|
||||
return result
|
||||
|
||||
|
||||
def _is_int(value):
|
||||
if value is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
int(value)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return True
|
||||
@@ -0,0 +1,24 @@
|
||||
from .. import lists
|
||||
from .. import documents
|
||||
from .. import results
|
||||
|
||||
|
||||
def read_comments_xml_element(element, body_reader):
    """Read every ``w:comment`` child of ``element`` into one combined result.

    Each comment's children are read with ``body_reader.read_all`` and the
    body is wrapped in ``documents.comment``. The ``w:author`` and
    ``w:initials`` attributes are optional: a missing, empty or
    whitespace-only value is passed on as None. ``w:id`` is required.
    """
    def optional_attribute(comment_element, name):
        value = comment_element.attributes.get(name, "").strip()
        return value or None

    def read_comment(comment_element):
        def to_comment(body):
            return documents.comment(
                comment_id=comment_element.attributes["w:id"],
                body=body,
                author_name=optional_attribute(comment_element, "w:author"),
                author_initials=optional_attribute(comment_element, "w:initials"),
            )

        return body_reader.read_all(comment_element.children).map(to_comment)

    comment_elements = element.find_children("w:comment")
    return results.combine(lists.map(read_comment, comment_elements))
|
||||
@@ -0,0 +1,29 @@
|
||||
class unknown(object):
    """Empty class with no members of its own.

    NOTE(review): presumably a marker for complex fields that are not
    recognised; confirm against the callers.
    """
|
||||
|
||||
|
||||
class Begin:
    """Value holder for the ``fld_char`` it was created with."""

    def __init__(self, *, fld_char):
        # ``fld_char`` is keyword-only, so ``Begin(x)`` raises TypeError.
        self.fld_char = fld_char
|
||||
|
||||
|
||||
def begin(*, fld_char):
    """Create a ``Begin`` from the keyword-only ``fld_char``."""
    instance = Begin(fld_char=fld_char)
    return instance
|
||||
|
||||
|
||||
class Hyperlink(object):
    """Value holder for the ``kwargs`` object it was created with."""

    def __init__(self, kwargs):
        # ``kwargs`` is one ordinary parameter, not a ``**kwargs`` catch-all.
        self.kwargs = kwargs
|
||||
|
||||
|
||||
def hyperlink(kwargs):
    """Create a ``Hyperlink`` that stores ``kwargs``."""
    instance = Hyperlink(kwargs=kwargs)
    return instance
|
||||
|
||||
|
||||
class Checkbox:
    """Value holder for the ``checked`` value it was created with."""

    def __init__(self, *, checked):
        # ``checked`` is keyword-only, so ``Checkbox(True)`` raises TypeError.
        self.checked = checked
|
||||
|
||||
|
||||
def checkbox(*, checked):
    """Create a ``Checkbox`` from the keyword-only ``checked``."""
    instance = Checkbox(checked=checked)
    return instance
|
||||
@@ -0,0 +1,58 @@
|
||||
def read_content_types_xml_element(element):
    """Build a ``_ContentTypes`` from a content-types XML element.

    ``content-types:Default`` children supply the extension -> content type
    mapping; ``content-types:Override`` children supply the part name ->
    content type mapping.
    """
    default_elements = element.find_children("content-types:Default")
    extension_defaults = dict(_read_default(child) for child in default_elements)

    override_elements = element.find_children("content-types:Override")
    overrides = dict(_read_override(child) for child in override_elements)

    return _ContentTypes(extension_defaults, overrides)
|
||||
|
||||
|
||||
def _read_default(element):
|
||||
extension = element.attributes["Extension"]
|
||||
content_type = element.attributes["ContentType"]
|
||||
return extension, content_type
|
||||
|
||||
|
||||
def _read_override(element):
|
||||
part_name = element.attributes["PartName"]
|
||||
content_type = element.attributes["ContentType"]
|
||||
return part_name.lstrip("/"), content_type
|
||||
|
||||
|
||||
class _ContentTypes(object):
|
||||
_image_content_types = {
|
||||
"png": "png",
|
||||
"gif": "gif",
|
||||
"jpeg": "jpeg",
|
||||
"jpg": "jpeg",
|
||||
"tif": "tiff",
|
||||
"tiff": "tiff",
|
||||
"bmp": "bmp",
|
||||
}
|
||||
|
||||
def __init__(self, extension_defaults, overrides):
|
||||
self._extension_defaults = extension_defaults
|
||||
self._overrides = overrides
|
||||
|
||||
def find_content_type(self, path):
|
||||
if path in self._overrides:
|
||||
return self._overrides[path]
|
||||
|
||||
extension = _get_extension(path)
|
||||
default_type = self._extension_defaults.get(extension)
|
||||
if default_type is not None:
|
||||
return default_type
|
||||
|
||||
image_type = self._image_content_types.get(extension.lower())
|
||||
if image_type is not None:
|
||||
return "image/" + image_type
|
||||
|
||||
return None
|
||||
|
||||
# Content types with no extension defaults and no overrides; only the
# built-in image extension fallback of _ContentTypes can match.
empty_content_types = _ContentTypes({}, {})
|
||||
|
||||
def _get_extension(path):
|
||||
return path.rpartition(".")[2]
|
||||
1065
path/to/venv/lib/python3.12/site-packages/mammoth/docx/dingbats.py
Normal file
1065
path/to/venv/lib/python3.12/site-packages/mammoth/docx/dingbats.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,25 @@
|
||||
from .. import documents
|
||||
|
||||
|
||||
def read_document_xml_element(
        element,
        body_reader,
        notes=None,
        comments=None):
    """Read the ``w:body`` of a document element into ``documents.document``.

    ``notes`` and ``comments`` default to empty lists. The children of the
    body are read with ``body_reader.read_all`` and the result is mapped to
    a document that carries ``documents.notes(notes)`` and ``comments``.

    Raises ValueError when ``element`` has no ``w:body`` child.
    """
    notes = [] if notes is None else notes
    comments = [] if comments is None else comments

    body_element = element.find_child("w:body")
    if body_element is None:
        raise ValueError("Could not find the body element: are you sure this is a docx file?")

    def to_document(children):
        return documents.document(
            children,
            notes=documents.notes(notes),
            comments=comments
        )

    return body_reader.read_all(body_element.children).map(to_document)
|
||||
@@ -0,0 +1,46 @@
|
||||
import os
|
||||
import contextlib
|
||||
try:
|
||||
from urllib2 import urlopen
|
||||
except ImportError:
|
||||
from urllib.request import urlopen
|
||||
try:
|
||||
from urllib.parse import urlparse
|
||||
except ImportError:
|
||||
from urlparse import urlparse
|
||||
|
||||
|
||||
class Files(object):
    """Opens files referenced by URI, relative to an optional base directory."""

    def __init__(self, base, external_file_access):
        self._base = base
        self._external_file_access = external_file_access

    def open(self, uri):
        """Open ``uri`` for reading.

        A URI with a scheme is opened with ``urlopen`` (wrapped in
        ``contextlib.closing``); anything else is joined onto the base
        directory and opened as a binary file.

        Raises:
            ExternalFileAccessIsDisabledError: external file access is off.
            InvalidFileReferenceError: the URI is relative and there is no
                base directory, or opening raised ``IOError``.
        """
        if not self._external_file_access:
            raise ExternalFileAccessIsDisabledError(
                "could not open external image '{0}', external file access is disabled".format(uri)
            )

        try:
            if _is_absolute(uri):
                return contextlib.closing(urlopen(uri))
            if self._base is None:
                # Not an IOError, so it is not re-wrapped by the handler below.
                raise InvalidFileReferenceError("could not find external image '{0}', fileobj has no name".format(uri))
            return open(os.path.join(self._base, uri), "rb")
        except IOError as error:
            template = "could not open external image: '{0}' (document directory: '{1}')\n{2}"
            raise InvalidFileReferenceError(template.format(uri, self._base, str(error)))
|
||||
|
||||
|
||||
def _is_absolute(url):
|
||||
return urlparse(url).scheme != ""
|
||||
|
||||
|
||||
class InvalidFileReferenceError(ValueError):
    """Raised by ``Files.open`` when a referenced file cannot be opened."""
|
||||
|
||||
|
||||
class ExternalFileAccessIsDisabledError(InvalidFileReferenceError):
    """Raised by ``Files.open`` when external file access is disabled."""
|
||||
@@ -0,0 +1,32 @@
|
||||
import functools
|
||||
|
||||
from .. import lists
|
||||
from .. import documents
|
||||
from .. import results
|
||||
|
||||
|
||||
def _read_notes(note_type, element, body_reader):
    """Read the ``w:<note_type>`` children of ``element`` into one combined result.

    Children whose ``w:type`` attribute is "continuationSeparator" or
    "separator" are skipped. Each remaining note's children are read with
    ``body_reader.read_all`` and wrapped in ``documents.note`` using the
    note's ``w:id`` attribute.
    """
    def is_note(candidate):
        return candidate.attributes.get("w:type") not in ["continuationSeparator", "separator"]

    def read_note(note_element):
        def to_note(body):
            return documents.note(
                note_type=note_type,
                note_id=note_element.attributes["w:id"],
                body=body
            )

        return body_reader.read_all(note_element.children).map(to_note)

    candidates = element.find_children("w:" + note_type)
    note_elements = lists.filter(is_note, candidates)
    return results.combine(lists.map(read_note, note_elements))
|
||||
|
||||
# Note readers with the note type pre-bound; each is called as
# reader(element, body_reader).
read_footnotes_xml_element = functools.partial(_read_notes, "footnote")
read_endnotes_xml_element = functools.partial(_read_notes, "endnote")
|
||||
@@ -0,0 +1,130 @@
|
||||
import cobble
|
||||
|
||||
from ..documents import numbering_level
|
||||
from .styles_xml import Styles
|
||||
|
||||
|
||||
def read_numbering_xml_element(element, styles):
    """Build a ``Numbering`` from a numbering XML element and ``styles``.

    Abstract numbering definitions are read before the concrete ones.
    """
    return Numbering(
        abstract_nums=_read_abstract_nums(element),
        nums=_read_nums(element),
        styles=styles,
    )
|
||||
|
||||
|
||||
def _read_abstract_nums(element):
    """Return a dict of abstract numbering ID -> ``_AbstractNum``.

    One entry is read from each ``w:abstractNum`` child of ``element``.
    """
    return dict(
        _read_abstract_num(abstract_num_element)
        for abstract_num_element in element.find_children("w:abstractNum")
    )
|
||||
|
||||
|
||||
def _read_abstract_num(element):
    """Return ``(abstract numbering ID, _AbstractNum)`` for one ``w:abstractNum``.

    The ID is the ``w:abstractNumId`` attribute (None when absent). The
    style link is the ``w:val`` of the ``w:numStyleLink`` child (None when
    the child or the attribute is absent).
    """
    abstract_num_id = element.attributes.get("w:abstractNumId")
    levels = _read_abstract_num_levels(element)
    link_element = element.find_child_or_null("w:numStyleLink")
    abstract_num = _AbstractNum(
        levels=levels,
        num_style_link=link_element.attributes.get("w:val"),
    )
    return abstract_num_id, abstract_num
|
||||
|
||||
|
||||
# One abstract numbering definition as read by _read_abstract_num.
@cobble.data
class _AbstractNum(object):
    # Dict of level index -> _AbstractNumLevel.
    levels = cobble.field()
    # Value of the w:numStyleLink child's w:val attribute, or None.
    num_style_link = cobble.field()
|
||||
|
||||
|
||||
# One level of an abstract numbering definition as read by
# _read_abstract_num_level.
@cobble.data
class _AbstractNumLevel(object):
    # Value of the w:ilvl attribute; _read_abstract_num_levels replaces a
    # missing value with "0".
    level_index = cobble.field()
    # False only when the w:numFmt value is "bullet".
    is_ordered = cobble.field()
    # Value of the w:pStyle child's w:val attribute, or None.
    paragraph_style_id = cobble.field()
|
||||
|
||||
|
||||
def _read_abstract_num_levels(element):
    """Return a dict of level index -> level for the ``w:lvl`` children.

    When several indexed levels share an index, the last one read wins.
    """
    levels = {}

    # Some malformed documents define numbering levels without an index, and
    # reference the numbering using a w:numPr element without a w:ilvl child.
    # To handle such cases, we assume a level of 0 as a fallback.
    fallback_level = None

    for level_element in element.find_children("w:lvl"):
        level = _read_abstract_num_level(level_element)
        if level.level_index is not None:
            levels[level.level_index] = level
            continue
        level.level_index = "0"
        fallback_level = level

    # The fallback is only used when no indexed level already took its place.
    if fallback_level is not None and fallback_level.level_index not in levels:
        levels[fallback_level.level_index] = fallback_level

    return levels
|
||||
|
||||
|
||||
def _read_abstract_num_level(element):
    """Read one ``w:lvl`` element into an ``_AbstractNumLevel``.

    The level counts as ordered unless its ``w:numFmt`` value is "bullet";
    a missing ``w:numFmt`` therefore counts as ordered.
    """
    level_index = element.attributes.get("w:ilvl")
    number_format = element.find_child_or_null("w:numFmt").attributes.get("w:val")
    style_element = element.find_child_or_null("w:pStyle")
    return _AbstractNumLevel(
        level_index=level_index,
        is_ordered=(number_format != "bullet"),
        paragraph_style_id=style_element.attributes.get("w:val"),
    )
|
||||
|
||||
|
||||
def _read_nums(element):
    """Return a dict of numbering ID -> ``_Num`` for the ``w:num`` children."""
    num_elements = element.find_children("w:num")
    return dict(map(_read_num, num_elements))
|
||||
|
||||
|
||||
def _read_num(element):
    """Return ``(numbering ID, _Num)`` for one ``w:num`` element.

    The ID is the ``w:numId`` attribute (None when absent). Raises KeyError
    when the ``w:abstractNumId`` child is missing or has no ``w:val``.
    """
    num_id = element.attributes.get("w:numId")
    reference = element.find_child_or_null("w:abstractNumId")
    num = _Num(abstract_num_id=reference.attributes["w:val"])
    return num_id, num
|
||||
|
||||
|
||||
# One concrete numbering definition as read by _read_num.
@cobble.data
class _Num(object):
    # Value of the w:abstractNumId child's w:val attribute.
    abstract_num_id = cobble.field()
|
||||
|
||||
|
||||
class Numbering(object):
    """Lookup of numbering levels by numbering ID or by paragraph style ID."""

    def __init__(self, abstract_nums, nums, styles):
        """Index the numbering definitions.

        Args:
            abstract_nums: Dict of abstract numbering ID -> object with
                ``levels`` (dict) and ``num_style_link`` attributes.
            nums: Dict of numbering ID -> object with ``abstract_num_id``.
            styles: Object providing ``find_numbering_style_by_id``.
        """
        self._abstract_nums = abstract_nums
        self._levels_by_paragraph_style_id = dict(
            (level.paragraph_style_id, self._to_numbering_level(level))
            for abstract_num in abstract_nums.values()
            for level in abstract_num.levels.values()
            if level.paragraph_style_id is not None
        )
        self._nums = nums
        self._styles = styles

    def find_level(self, num_id, level):
        """Return the numbering level for ``num_id`` and ``level``, or None.

        A style link on the abstract numbering is followed to the numbering
        ID of the linked numbering style. None is returned when any step of
        the lookup is missing, including a style link that names a style
        that does not exist, and when the style links form a cycle.
        """
        visited = set()
        while num_id not in visited:
            visited.add(num_id)

            num = self._nums.get(num_id)
            if num is None:
                return None

            abstract_num = self._abstract_nums.get(num.abstract_num_id)
            if abstract_num is None:
                return None

            if abstract_num.num_style_link is None:
                return self._to_numbering_level(abstract_num.levels.get(level))

            style = self._styles.find_numbering_style_by_id(abstract_num.num_style_link)
            if style is None:
                # Dangling style link: nothing to resolve against.
                return None
            num_id = style.num_id

        # A numbering ID came round again: the style links are cyclic.
        return None

    def find_level_by_paragraph_style_id(self, style_id):
        """Return the level whose paragraph style ID is ``style_id``, or None."""
        return self._levels_by_paragraph_style_id.get(style_id)

    def _to_numbering_level(self, abstract_num_level):
        """Convert an abstract level to ``numbering_level``; None stays None."""
        if abstract_num_level is None:
            return None
        else:
            return numbering_level(
                level_index=abstract_num_level.level_index,
                is_ordered=abstract_num_level.is_ordered,
            )
|
||||
|
||||
|
||||
# Shared Numbering with no definitions, backed by the empty Styles.
Numbering.EMPTY = Numbering(abstract_nums={}, nums={}, styles=Styles.EMPTY)
|
||||
@@ -0,0 +1,45 @@
|
||||
from ..lists import flat_map
|
||||
from .xmlparser import parse_xml, XmlElement
|
||||
|
||||
|
||||
# (prefix, namespace URI) pairs handed to parse_xml by read(). The
# transitional and strict URIs share a prefix, so elements from either
# format are given the same prefixed name.
_namespaces = [
    # Transitional format
    ("w", "http://schemas.openxmlformats.org/wordprocessingml/2006/main"),
    ("r", "http://schemas.openxmlformats.org/officeDocument/2006/relationships"),
    ("wp", "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"),
    ("a", "http://schemas.openxmlformats.org/drawingml/2006/main"),
    ("pic", "http://schemas.openxmlformats.org/drawingml/2006/picture"),

    # Strict format
    ("w", "http://purl.oclc.org/ooxml/wordprocessingml/main"),
    ("r", "http://purl.oclc.org/ooxml/officeDocument/relationships"),
    ("wp", "http://purl.oclc.org/ooxml/drawingml/wordprocessingDrawing"),
    ("a", "http://purl.oclc.org/ooxml/drawingml/main"),
    ("pic", "http://purl.oclc.org/ooxml/drawingml/picture"),

    # Common
    ("content-types", "http://schemas.openxmlformats.org/package/2006/content-types"),
    ("relationships", "http://schemas.openxmlformats.org/package/2006/relationships"),
    ("mc", "http://schemas.openxmlformats.org/markup-compatibility/2006"),
    ("v", "urn:schemas-microsoft-com:vml"),
    ("office-word", "urn:schemas-microsoft-com:office:word"),

    # [MS-DOCX]: Word Extensions to the Office Open XML (.docx) File Format
    # https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/b839fe1f-e1ca-4fa6-8c26-5954d0abbccd
    ("wordml", "http://schemas.microsoft.com/office/word/2010/wordml"),
]
|
||||
|
||||
|
||||
def read(fileobj):
    """Parse ``fileobj`` as Office XML and return its root node.

    The document is parsed with the module's namespace prefixes, and every
    ``mc:AlternateContent`` element is replaced by the children of its
    ``mc:Fallback`` child.
    """
    root = parse_xml(fileobj, _namespaces)
    collapsed = _collapse_alternate_content(root)
    return collapsed[0]
|
||||
|
||||
|
||||
def _collapse_alternate_content(node):
    """Return the list of nodes that should replace ``node``.

    An ``mc:AlternateContent`` element is replaced by the children of its
    ``mc:Fallback`` child (no nodes when there is no fallback); those
    children are returned as they are, without being collapsed further. Any
    other element has its children collapsed in place and is returned as a
    one-item list. Non-element nodes are returned unchanged.
    """
    if not isinstance(node, XmlElement):
        return [node]

    if node.name == "mc:AlternateContent":
        return node.find_child_or_null("mc:Fallback").children

    node.children = flat_map(_collapse_alternate_content, node.children)
    return [node]
|
||||
@@ -0,0 +1,38 @@
|
||||
import collections
|
||||
|
||||
|
||||
class Relationships(object):
    """Index of relationship targets by relationship ID and by type."""

    def __init__(self, relationships):
        """Index ``relationships`` in a single pass.

        Args:
            relationships: Iterable of objects with ``relationship_id``,
                ``target`` and ``type`` attributes. It is consumed once, so
                one-shot iterables (generators, ``map`` objects) are safe.
                When IDs repeat, the last target for an ID wins.
        """
        self._targets_by_id = {}
        self._targets_by_type = collections.defaultdict(list)
        for relationship in relationships:
            self._targets_by_id[relationship.relationship_id] = relationship.target
            self._targets_by_type[relationship.type].append(relationship.target)

    def find_target_by_relationship_id(self, key):
        """Return the target for relationship ID ``key``; KeyError if unknown."""
        return self._targets_by_id[key]

    def find_targets_by_type(self, relationship_type):
        """Return the list of targets with the given type (empty if none).

        Uses ``get`` so that asking about an unknown type does not insert a
        new key into the underlying defaultdict.
        """
        return self._targets_by_type.get(relationship_type, [])
|
||||
|
||||
|
||||
# Shared Relationships instance that contains no relationships.
Relationships.EMPTY = Relationships([])

# One relationship entry as read by _read_relationship from the Id, Target
# and Type attributes.
Relationship = collections.namedtuple("Relationship", ["relationship_id", "target", "type"])
|
||||
|
||||
|
||||
def read_relationships_xml_element(element):
    """Build a ``Relationships`` from the ``relationships:Relationship`` children."""
    relationship_elements = element.find_children("relationships:Relationship")
    relationships = [_read_relationship(child) for child in relationship_elements]
    return Relationships(relationships)
|
||||
|
||||
|
||||
def _read_relationship(element):
    """Read a ``Relationship`` from the Id, Target and Type attributes.

    Raises KeyError when any of the three attributes is missing.
    """
    attributes = element.attributes
    return Relationship(
        relationship_id=attributes["Id"],
        target=attributes["Target"],
        type=attributes["Type"],
    )
|
||||
@@ -0,0 +1,70 @@
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from ..zips import open_zip, update_zip
|
||||
|
||||
|
||||
# Zip entry that stores the embedded style map.
_style_map_path = "mammoth/style-map"
# The same entry written with a leading "/", as used for the relationship
# target and the content-type part name.
_style_map_absolute_path = "/" + _style_map_path
# Zip entries rewritten by write_style_map.
_relationships_path = "word/_rels/document.xml.rels"
_content_types_path = "[Content_Types].xml"
|
||||
|
||||
|
||||
def write_style_map(fileobj, style_map):
    """Embed ``style_map`` in the zip at ``fileobj``.

    Reads the existing relationships and content-types entries, regenerates
    both so that they reference the style map, then updates the zip with
    those two entries plus the UTF-8 encoded style map.
    """
    with open_zip(fileobj, "r") as archive:
        updated_relationships = _generate_relationships_xml(archive.read_str(_relationships_path))
        updated_content_types = _generate_content_types_xml(archive.read_str(_content_types_path))

    replacements = {
        _style_map_path: style_map.encode("utf8"),
        _relationships_path: updated_relationships,
        _content_types_path: updated_content_types,
    }
    update_zip(fileobj, replacements)
|
||||
|
||||
def _generate_relationships_xml(relationships_xml):
    """Return ``relationships_xml`` with a style-map relationship added or updated.

    The relationship is identified by its Id, "rMammothStyleMap". The result
    is serialised with ``ElementTree.tostring`` using the "UTF-8" encoding.
    """
    namespace = "http://schemas.openxmlformats.org/package/2006/relationships"
    element_name = "{" + namespace + "}Relationship"

    root = ElementTree.fromstring(relationships_xml)
    style_map_relationship = {
        "Id": "rMammothStyleMap",
        "Type": "http://schemas.zwobble.org/mammoth/style-map",
        "Target": _style_map_absolute_path,
    }
    _add_or_update_element(root, element_name, "Id", style_map_relationship)

    return ElementTree.tostring(root, "UTF-8")
|
||||
|
||||
|
||||
def _generate_content_types_xml(content_types_xml):
    """Return ``content_types_xml`` with a style-map override added or updated.

    The override is identified by its PartName, the absolute style-map path.
    The result is serialised with ``ElementTree.tostring`` using the "UTF-8"
    encoding.
    """
    namespace = "http://schemas.openxmlformats.org/package/2006/content-types"
    element_name = "{" + namespace + "}Override"

    root = ElementTree.fromstring(content_types_xml)
    style_map_override = {
        "PartName": _style_map_absolute_path,
        "ContentType": "text/prs.mammoth.style-map",
    }
    _add_or_update_element(root, element_name, "PartName", style_map_override)

    return ElementTree.tostring(root, "UTF-8")
|
||||
|
||||
|
||||
def _add_or_update_element(parent, name, identifying_attribute, attributes):
    """Ensure ``parent``'s tree has a ``name`` element carrying ``attributes``.

    An element with the same tag and the same value for
    ``identifying_attribute`` has its whole attribute mapping replaced;
    when there is none, a new child of ``parent`` is created.
    """
    match = _find_child(parent, name, identifying_attribute, attributes)
    if match is not None:
        match.attrib = attributes
        return
    ElementTree.SubElement(parent, name, attributes)
|
||||
|
||||
|
||||
def _find_child(parent, name, identifying_attribute, attributes):
|
||||
for element in parent.iter():
|
||||
if element.tag == name and element.get(identifying_attribute) == attributes.get(identifying_attribute):
|
||||
return element
|
||||
|
||||
|
||||
def read_style_map(fileobj):
    """Return the style map embedded in the zip at ``fileobj``, or None.

    None is returned when the zip has no style-map entry.
    """
    with open_zip(fileobj, "r") as archive:
        if not archive.exists(_style_map_path):
            return None
        return archive.read_str(_style_map_path)
|
||||
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
import collections
|
||||
|
||||
|
||||
class Styles(object):
    """Lookup of paragraph, character, table and numbering styles by ID."""

    @staticmethod
    def create(paragraph_styles=None, character_styles=None, table_styles=None, numbering_styles=None):
        """Build a ``Styles``, using an empty dict for every mapping left as None."""
        return Styles(
            paragraph_styles={} if paragraph_styles is None else paragraph_styles,
            character_styles={} if character_styles is None else character_styles,
            table_styles={} if table_styles is None else table_styles,
            numbering_styles={} if numbering_styles is None else numbering_styles,
        )

    def __init__(self, paragraph_styles, character_styles, table_styles, numbering_styles):
        self._paragraph_styles = paragraph_styles
        self._character_styles = character_styles
        self._table_styles = table_styles
        self._numbering_styles = numbering_styles

    def find_paragraph_style_by_id(self, style_id):
        """Return the paragraph style stored under ``style_id``, or None."""
        styles = self._paragraph_styles
        return styles.get(style_id)

    def find_character_style_by_id(self, style_id):
        """Return the character style stored under ``style_id``, or None."""
        styles = self._character_styles
        return styles.get(style_id)

    def find_table_style_by_id(self, style_id):
        """Return the table style stored under ``style_id``, or None."""
        styles = self._table_styles
        return styles.get(style_id)

    def find_numbering_style_by_id(self, style_id):
        """Return the numbering style stored under ``style_id``, or None."""
        styles = self._numbering_styles
        return styles.get(style_id)
|
||||
|
||||
|
||||
# Shared Styles instance in which every style mapping is empty.
Styles.EMPTY = Styles(
    paragraph_styles={},
    character_styles={},
    table_styles={},
    numbering_styles={},
)
|
||||
|
||||
|
||||
def read_styles_xml_element(element):
    """Read the ``w:style`` children of ``element`` into a ``Styles``.

    Styles are grouped by their ``w:type`` attribute ("paragraph",
    "character", "table" or "numbering"). A style of any other type is
    still parsed but not stored. Raises KeyError when a style has no
    ``w:type`` attribute.
    """
    styles_by_type = {
        "paragraph": {},
        "character": {},
        "table": {},
        "numbering": {},
    }

    for style_element in element.find_children("w:style"):
        element_type = style_element.attributes["w:type"]
        read_style = (
            _read_numbering_style_element
            if element_type == "numbering"
            else _read_style_element
        )
        style = read_style(style_element)

        style_set = styles_by_type.get(element_type)

        # Per 17.7.4.17 style (Style Definition) of ECMA-376 4th edition Part 1:
        #
        # > If multiple style definitions each declare the same value for their
        # > styleId, then the first such instance shall keep its current
        # > identifier with all other instances being reassigned in any manner
        # > desired.
        #
        # For the purpose of conversion, there's no point holding onto styles
        # with reassigned style IDs, so we ignore such style definitions.

        if style_set is None or style.style_id in style_set:
            continue
        style_set[style.style_id] = style

    return Styles(
        paragraph_styles=styles_by_type["paragraph"],
        character_styles=styles_by_type["character"],
        table_styles=styles_by_type["table"],
        numbering_styles=styles_by_type["numbering"],
    )
|
||||
|
||||
|
||||
# A non-numbering style: its w:styleId and the w:val of its w:name child.
Style = collections.namedtuple("Style", ["style_id", "name"])
|
||||
|
||||
|
||||
def _read_style_element(element):
    """Read a ``Style`` from one ``w:style`` element.

    The name is the ``w:val`` of the ``w:name`` child, or None when the
    child or the attribute is absent.
    """
    style_id = _read_style_id(element)
    name_element = element.find_child_or_null("w:name")
    return Style(style_id=style_id, name=name_element.attributes.get("w:val"))
|
||||
|
||||
|
||||
# A numbering style: its w:styleId and the numbering ID it points at
# (w:pPr/w:numPr/w:numId, or None when absent).
NumberingStyle = collections.namedtuple("NumberingStyle", ["style_id", "num_id"])
|
||||
|
||||
|
||||
def _read_numbering_style_element(element):
    """Read a ``NumberingStyle`` from one ``w:style`` element.

    The numbering ID is the ``w:val`` of ``w:pPr/w:numPr/w:numId``; it is
    None when any element on that path, or the attribute, is absent.
    """
    style_id = _read_style_id(element)

    properties = element.find_child_or_null("w:pPr")
    numbering_properties = properties.find_child_or_null("w:numPr")
    num_id_element = numbering_properties.find_child_or_null("w:numId")
    num_id = num_id_element.attributes.get("w:val")

    return NumberingStyle(style_id=style_id, num_id=num_id)
|
||||
|
||||
|
||||
def _read_style_id(element):
|
||||
return element.attributes["w:styleId"]
|
||||
@@ -0,0 +1,12 @@
|
||||
def uri_to_zip_entry_name(base, uri):
    """Turn ``uri`` into a zip entry name.

    A URI starting with "/" has that first slash removed; any other URI is
    appended to ``base`` with a "/" in between.
    """
    if not uri.startswith("/"):
        return base + "/" + uri
    return uri[1:]
|
||||
|
||||
|
||||
def replace_fragment(uri, fragment):
    """Return ``uri`` with everything from its first "#" replaced by "#" + ``fragment``.

    When ``uri`` has no "#", the new fragment is simply appended.
    """
    without_fragment = uri.partition("#")[0]
    return without_fragment + "#" + fragment
|
||||
@@ -0,0 +1,121 @@
|
||||
import xml.dom.minidom
|
||||
|
||||
import cobble
|
||||
|
||||
|
||||
@cobble.data
class XmlElement(object):
    """An XML element: its name, its attribute mapping and its child nodes."""

    name = cobble.field()
    attributes = cobble.field()
    children = cobble.field()

    def find_child_or_null(self, name):
        """Like ``find_child``, but fall back to ``null_xml_element``."""
        found = self.find_child(name)
        return found or null_xml_element

    def find_child(self, name):
        """Return the first child element called ``name``, or None."""
        candidates = (
            child for child in self.children
            if isinstance(child, XmlElement) and child.name == name
        )
        return next(candidates, None)

    def find_children(self, name):
        """Return an ``XmlElementList`` of the child elements called ``name``.

        The matches are handed to the list as a lazy iterable.
        """
        matching = (
            child for child in self.children
            if child.node_type == node_types.element and child.name == name
        )
        return XmlElementList(matching)
|
||||
|
||||
|
||||
class XmlElementList(object):
    """A re-iterable collection of elements with a ``find_children`` shortcut."""

    def __init__(self, elements):
        """Store a snapshot of ``elements``.

        The iterable is copied into a list so that the collection can be
        iterated any number of times, even when it is built from a one-shot
        iterator such as a ``filter`` object or a generator.
        """
        self._elements = list(elements)

    def __iter__(self):
        return iter(self._elements)

    def find_children(self, name):
        """Return the children called ``name`` of every element, in order."""
        return XmlElementList([
            child
            for element in self._elements
            for child in element.find_children(name)
        ])
|
||||
|
||||
|
||||
class NullXmlElement(object):
    """Stand-in for a missing element: no attributes, no children.

    Lets lookups be chained through ``find_child_or_null`` without checking
    for None at every step.
    """

    # Class-level and therefore shared by every instance.
    attributes = {}
    children = []

    def find_child(self, name):
        """Always return None: a null element has no children."""
        return None

    def find_child_or_null(self, name):
        """Always return this same null element."""
        return self
|
||||
|
||||
|
||||
# Shared instance returned by XmlElement.find_child_or_null when no child
# matches.
null_xml_element = NullXmlElement()
|
||||
|
||||
|
||||
# A text node; parse_xml creates one from each DOM text node's nodeValue.
@cobble.data
class XmlText(object):
    # The text content of the node.
    value = cobble.field()
|
||||
|
||||
|
||||
def element(name, attributes=None, children=None):
    """Create an ``XmlElement``.

    A falsy ``attributes`` is replaced by a new empty dict and a falsy
    ``children`` by a new empty list.
    """
    if not attributes:
        attributes = {}
    if not children:
        children = []
    return XmlElement(name, attributes, children)
|
||||
|
||||
# Alias so that text nodes can be created with text(value), alongside element().
text = XmlText
|
||||
|
||||
|
||||
# Integer tags used to tell XmlElement and XmlText nodes apart via their
# node_type attribute.
class node_types(object):
    element = 1
    text = 3
|
||||
|
||||
|
||||
# Tag each node class with its node type as a class attribute.
XmlElement.node_type = node_types.element
XmlText.node_type = node_types.text
|
||||
|
||||
|
||||
|
||||
def parse_xml(fileobj, namespace_mapping=None):
    """Parse XML from ``fileobj`` into a tree of ``XmlElement``/``XmlText`` nodes.

    Args:
        fileobj: Passed unchanged to ``xml.dom.minidom.parse``.
        namespace_mapping: Optional iterable of ``(prefix, uri)`` pairs. A
            name in a mapped namespace is rendered as ``prefix:local``, a
            name in an unmapped namespace as ``{uri}local``, and a name
            without a namespace as its bare local name.

    Returns:
        The converted document element.

    Text nodes and CDATA sections both become ``XmlText`` nodes. All other
    non-element nodes are dropped, as are namespace declaration attributes.
    """
    # NOTE(review): the Python documentation warns about parsing untrusted
    # XML with the standard xml package; confirm whether inputs that reach
    # this function need hardening.
    if namespace_mapping is None:
        namespace_prefixes = {}
    else:
        namespace_prefixes = dict((uri, prefix) for prefix, uri in namespace_mapping)

    document = xml.dom.minidom.parse(fileobj)

    # CDATA sections are character data too; skipping them would silently
    # lose the text they contain.
    text_node_types = (xml.dom.Node.TEXT_NODE, xml.dom.Node.CDATA_SECTION_NODE)

    def convert_node(node):
        if node.nodeType == xml.dom.Node.ELEMENT_NODE:
            return convert_element(node)
        elif node.nodeType in text_node_types:
            return XmlText(node.nodeValue)
        else:
            return None

    def convert_element(element):
        converted_name = convert_name(element)

        # Namespace declarations (xmlns, xmlns:*) are not real attributes.
        converted_attributes = dict(
            (convert_name(attribute), attribute.value)
            for attribute in element.attributes.values()
            if attribute.namespaceURI != "http://www.w3.org/2000/xmlns/"
        )

        converted_children = []
        for child_node in element.childNodes:
            converted_child_node = convert_node(child_node)
            if converted_child_node is not None:
                converted_children.append(converted_child_node)

        return XmlElement(converted_name, converted_attributes, converted_children)

    def convert_name(node):
        if node.namespaceURI is None:
            return node.localName
        else:
            prefix = namespace_prefixes.get(node.namespaceURI)
            if prefix is None:
                return "{%s}%s" % (node.namespaceURI, node.localName)
            else:
                return "%s:%s" % (prefix, node.localName)

    return convert_node(document.documentElement)
|
||||
Reference in New Issue
Block a user