Initial commit (Clean history)

This commit is contained in:
anhduy-tech
2025-12-30 11:27:14 +07:00
commit ef48c93de0
19255 changed files with 3248867 additions and 0 deletions

View File

@@ -0,0 +1,58 @@
from . import docx, conversion, options, images, transforms, underline
from .raw_text import extract_raw_text_from_element
from .docx.style_map import write_style_map, read_style_map
# Names exported by ``from <package> import *``.
__all__ = ["convert_to_html", "extract_raw_text", "images", "transforms", "underline"]
# Sentinel default used by convert() to tell "argument not passed" apart from
# an explicit value such as None or False.
_undefined = object()
def convert_to_html(*args, **kwargs):
    """Convert a document to HTML.

    Forwards every argument to convert() with ``output_format="html"``.
    """
    return convert(*args, output_format="html", **kwargs)
def convert_to_markdown(*args, **kwargs):
    """Convert a document to Markdown.

    Forwards every argument to convert() with ``output_format="markdown"``.
    """
    return convert(*args, output_format="markdown", **kwargs)
def convert(
    fileobj,
    transform_document=None,
    id_prefix=None,
    include_embedded_style_map=_undefined,
    external_file_access=_undefined,
    **kwargs
):
    """Read a .docx file object and convert it to the requested output.

    :param fileobj: file object of the .docx document.
    :param transform_document: optional callable applied to the document
        after it has been read; defaults to the identity function.
    :param id_prefix: forwarded to the HTML conversion as ``id_prefix``.
    :param include_embedded_style_map: when true (the default) the style map
        read from ``fileobj`` is passed on as the ``embedded_style_map`` option.
    :param external_file_access: forwarded to ``docx.read``; defaults to False.
    :param kwargs: remaining conversion options, parsed by ``options.read_options``.
    :returns: the result of binding option parsing, reading and conversion.
    """
    def _identity(document):
        return document

    if include_embedded_style_map is _undefined:
        include_embedded_style_map = True
    if transform_document is None:
        transform_document = _identity
    if include_embedded_style_map:
        kwargs["embedded_style_map"] = read_style_map(fileobj)
    if external_file_access is _undefined:
        external_file_access = False

    def _convert_with_options(convert_options):
        def _to_output(document):
            return conversion.convert_document_element_to_html(
                document,
                id_prefix=id_prefix,
                **convert_options
            )

        document_result = docx.read(fileobj, external_file_access=external_file_access)
        return document_result.map(transform_document).bind(_to_output)

    return options.read_options(kwargs).bind(_convert_with_options)
def extract_raw_text(fileobj):
    """Read the document from ``fileobj`` and map it through
    ``extract_raw_text_from_element``; returns the mapped read result."""
    return docx.read(fileobj).map(extract_raw_text_from_element)
def embed_style_map(fileobj, style_map):
    """Write ``style_map`` into the document ``fileobj`` via ``write_style_map``."""
    write_style_map(fileobj, style_map)
def read_embedded_style_map(fileobj):
    """Return whatever ``read_style_map`` finds embedded in the document ``fileobj``."""
    return read_style_map(fileobj)

View File

@@ -0,0 +1,104 @@
import argparse
import io
import os
import shutil
import sys
import mammoth
from . import writers
def main():
    """Command-line entry point: convert one .docx file.

    Reads the optional style map file, converts the document given on the
    command line, writes every conversion message to stderr and writes the
    converted document either to ``output-path``/stdout or, when
    ``--output-dir`` is given, to ``<output-dir>/<document name>.html`` with
    images stored as separate files in that directory.
    """
    args = _parse_args()

    if args.style_map is None:
        style_map = None
    else:
        with open(args.style_map) as style_map_fileobj:
            style_map = style_map_fileobj.read()

    with open(args.path, "rb") as docx_fileobj:
        if args.output_dir is None:
            convert_image = None
            output_path = args.output
        else:
            convert_image = mammoth.images.img_element(ImageWriter(args.output_dir))
            # splitext keeps the full name when the input has no extension;
            # rpartition(".")[0] returned "" in that case, producing ".html".
            document_name = os.path.splitext(os.path.basename(args.path))[0]
            output_filename = "{0}.html".format(document_name)
            output_path = os.path.join(args.output_dir, output_filename)

        result = mammoth.convert(
            docx_fileobj,
            style_map=style_map,
            convert_image=convert_image,
            output_format=args.output_format,
        )
        for message in result.messages:
            sys.stderr.write(message.message)
            sys.stderr.write("\n")

        _write_output(output_path, result.value)
class ImageWriter(object):
    """Callable that stores each image as a numbered file in a directory.

    Files are named ``<n>.<subtype>`` where ``n`` starts at 1 and ``subtype``
    is the part of the image content type after the first ``/``.
    """

    def __init__(self, output_dir):
        self._output_dir = output_dir
        self._image_number = 1

    def __call__(self, element):
        """Copy the image to disk and return its ``{"src": filename}`` attributes."""
        _, _, extension = element.content_type.partition("/")
        image_filename = "{0}.{1}".format(self._image_number, extension)
        destination_path = os.path.join(self._output_dir, image_filename)

        with open(destination_path, "wb") as image_dest, element.open() as image_source:
            shutil.copyfileobj(image_source, image_dest)

        # Only advance the counter once the copy has succeeded.
        self._image_number += 1
        return {"src": image_filename}
def _write_output(path, contents):
if path is None:
if sys.version_info[0] <= 2:
stdout = sys.stdout
else:
stdout = sys.stdout.buffer
stdout.write(contents.encode("utf-8"))
stdout.flush()
else:
with io.open(path, "w", encoding="utf-8") as fileobj:
fileobj.write(contents)
def _parse_args():
    """Parse the command line.

    Defines the positional ``docx-path``, the mutually exclusive
    ``output-path`` positional and ``--output-dir`` option, and the optional
    ``--output-format`` and ``--style-map`` options.

    :returns: the namespace produced by ``ArgumentParser.parse_args()``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "path",
        metavar="docx-path",
        help="Path to the .docx file to convert.")

    # output-path and --output-dir cannot be combined.
    output_group = parser.add_mutually_exclusive_group()
    output_group.add_argument(
        "output",
        nargs="?",
        metavar="output-path",
        help="Output path for the generated document. Images will be stored inline in the output document. Output is written to stdout if not set.")
    output_group.add_argument(
        "--output-dir",
        help="Output directory for generated HTML and images. Images will be stored in separate files. Mutually exclusive with output-path.")

    parser.add_argument(
        "--output-format",
        required=False,
        choices=writers.formats(),
        help="Output format.")
    parser.add_argument(
        "--style-map",
        required=False,
        help="File containing a style map.")
    return parser.parse_args()
# Run the command-line interface when this module is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,408 @@
# coding=utf-8
from __future__ import unicode_literals
from functools import partial
import cobble
from . import documents, results, html_paths, images, writers, html
from .docx.files import InvalidFileReferenceError
from .lists import find_index
def convert_document_element_to_html(element,
        style_map=None,
        convert_image=None,
        id_prefix=None,
        output_format=None,
        ignore_empty_paragraphs=True):
    """Convert a document element tree to a string in ``output_format``.

    :param element: the element to convert; comments are only collected when
        it is a ``documents.Document``.
    :param style_map: list of styles to apply; defaults to an empty list.
    :param convert_image: image converter; defaults to ``images.data_uri``.
    :param id_prefix: prefix for generated HTML ids; defaults to ``""``.
    :param output_format: passed to ``writers.writer`` to pick the writer.
    :param ignore_empty_paragraphs: forwarded to the converter.
    :returns: ``results.Result`` holding the written string and the messages
        collected during conversion.
    """
    style_map = [] if style_map is None else style_map
    id_prefix = "" if id_prefix is None else id_prefix
    convert_image = images.data_uri if convert_image is None else convert_image

    comments = {}
    if isinstance(element, documents.Document):
        comments = {
            comment.comment_id: comment
            for comment in element.comments
        }

    messages = []
    converter = _DocumentConverter(
        messages=messages,
        style_map=style_map,
        convert_image=convert_image,
        id_prefix=id_prefix,
        ignore_empty_paragraphs=ignore_empty_paragraphs,
        note_references=[],
        comments=comments,
    )
    root_context = _ConversionContext(is_table_header=False)
    nodes = converter.visit(element, root_context)

    writer = writers.writer(output_format)
    cleaned_nodes = html.collapse(html.strip_empty(nodes))
    html.write(writer, cleaned_nodes)
    return results.Result(writer.as_string(), messages)
@cobble.data
class _ConversionContext(object):
    """State passed down while visiting elements during conversion."""
    # True while converting header rows of a table; selects <th> over <td>.
    is_table_header = cobble.field()

    def copy(self, **kwargs):
        """Return a copy of this context with the given fields replaced."""
        return cobble.copy(self, **kwargs)
class _DocumentConverter(documents.element_visitor(args=1)):
    """Visitor that converts document elements into lists of HTML nodes.

    Every ``visit_*`` method takes the element and a ``_ConversionContext``
    and returns a list of nodes built with the ``html`` module. Warnings are
    appended to the ``messages`` list supplied by the caller. Note references
    and referenced comments are recorded while visiting so that
    ``visit_document`` can append them after the body.
    """

    def __init__(self, messages, style_map, convert_image, id_prefix, ignore_empty_paragraphs, note_references, comments):
        self._messages = messages
        self._style_map = style_map
        self._id_prefix = id_prefix
        self._ignore_empty_paragraphs = ignore_empty_paragraphs
        self._note_references = note_references
        # (label, comment) pairs, in the order the references were visited.
        self._referenced_comments = []
        self._convert_image = convert_image
        # Mapping from comment id to comment.
        self._comments = comments

    def visit_image(self, image, context):
        """Convert an image; an invalid file reference becomes a warning and no nodes."""
        try:
            return self._convert_image(image)
        except InvalidFileReferenceError as error:
            self._messages.append(results.warning(str(error)))
            return []

    def visit_document(self, document, context):
        """Convert the body, then append an <ol> of notes and a <dl> of comments."""
        # The children must be visited first: that is what fills
        # _note_references and _referenced_comments.
        nodes = self._visit_all(document.children, context)
        notes = [
            document.notes.resolve(reference)
            for reference in self._note_references
        ]
        notes_list = html.element("ol", {}, self._visit_all(notes, context))
        comments = html.element("dl", {}, [
            html_node
            for referenced_comment in self._referenced_comments
            for html_node in self.visit_comment(referenced_comment, context)
        ])
        return nodes + [notes_list, comments]

    def visit_paragraph(self, paragraph, context):
        """Wrap the paragraph's children in the HTML path found for its style."""
        def children():
            content = self._visit_all(paragraph.children, context)
            if self._ignore_empty_paragraphs:
                return content
            else:
                # force_write is prepended so the paragraph is kept even
                # when it has no other content.
                return [html.force_write] + content

        html_path = self._find_html_path_for_paragraph(paragraph)
        return html_path.wrap(children)

    def visit_run(self, run, context):
        """Wrap the run's children in one HTML path per active run property."""
        nodes = lambda: self._visit_all(run.children, context)
        paths = []
        if run.highlight is not None:
            style = self._find_style(Highlight(color=run.highlight), "highlight")
            if style is not None:
                paths.append(style.html_path)
        if run.is_small_caps:
            paths.append(self._find_style_for_run_property("small_caps"))
        if run.is_all_caps:
            paths.append(self._find_style_for_run_property("all_caps"))
        if run.is_strikethrough:
            paths.append(self._find_style_for_run_property("strikethrough", default="s"))
        if run.is_underline:
            paths.append(self._find_style_for_run_property("underline"))
        if run.vertical_alignment == documents.VerticalAlignment.subscript:
            paths.append(html_paths.element(["sub"], fresh=False))
        if run.vertical_alignment == documents.VerticalAlignment.superscript:
            paths.append(html_paths.element(["sup"], fresh=False))
        if run.is_italic:
            paths.append(self._find_style_for_run_property("italic", default="em"))
        if run.is_bold:
            paths.append(self._find_style_for_run_property("bold", default="strong"))
        paths.append(self._find_html_path_for_run(run))

        # Each path wraps the result of the previous ones, so the first path
        # in the list is applied innermost and the run style path outermost.
        for path in paths:
            nodes = partial(path.wrap, nodes)

        return nodes()

    def _find_style_for_run_property(self, element_type, default=None):
        """Return the mapped path for a run property, else ``default`` as an
        element path, else the empty path."""
        style = self._find_style(None, element_type)
        if style is not None:
            return style.html_path
        elif default is not None:
            return html_paths.element(default, fresh=False)
        else:
            return html_paths.empty

    def visit_text(self, text, context):
        return [html.text(text.value)]

    def visit_hyperlink(self, hyperlink, context):
        """Convert a hyperlink to a collapsible <a>; anchors become ``#<prefixed id>``."""
        if hyperlink.anchor is None:
            href = hyperlink.href
        else:
            href = "#{0}".format(self._html_id(hyperlink.anchor))
        attributes = {"href": href}
        if hyperlink.target_frame is not None:
            attributes["target"] = hyperlink.target_frame

        nodes = self._visit_all(hyperlink.children, context)
        return [html.collapsible_element("a", attributes, nodes)]

    def visit_checkbox(self, checkbox, context):
        attributes = {"type": "checkbox"}

        if checkbox.checked:
            attributes["checked"] = "checked"

        return [html.element("input", attributes)]

    def visit_bookmark(self, bookmark, context):
        """Convert a bookmark to an <a> whose id is the prefixed bookmark name."""
        element = html.collapsible_element(
            "a",
            {"id": self._html_id(bookmark.name)},
            [html.force_write])
        return [element]

    def visit_tab(self, tab, context):
        return [html.text("\t")]

    # Path used for tables that no style in the style map matches.
    _default_table_path = html_paths.path([html_paths.element(["table"], fresh=True)])

    def visit_table(self, table, context):
        return self._find_html_path(table, "table", self._default_table_path) \
            .wrap(lambda: self._convert_table_children(table, context))

    def _convert_table_children(self, table, context):
        """Convert table rows, splitting leading header rows into <thead>/<tbody>."""
        # Index of the first child that is not a header row.
        body_index = find_index(
            lambda child: not isinstance(child, documents.TableRow) or not child.is_header,
            table.children,
        )
        if body_index is None:
            # Every child is a header row.
            body_index = len(table.children)

        if body_index == 0:
            children = self._visit_all(table.children, context.copy(is_table_header=False))
        else:
            head_rows = self._visit_all(table.children[:body_index], context.copy(is_table_header=True))
            body_rows = self._visit_all(table.children[body_index:], context.copy(is_table_header=False))
            children = [
                html.element("thead", {}, head_rows),
                html.element("tbody", {}, body_rows),
            ]

        return [html.force_write] + children

    def visit_table_row(self, table_row, context):
        return [html.element("tr", {}, [html.force_write] + self._visit_all(table_row.children, context))]

    def visit_table_cell(self, table_cell, context):
        """Convert a cell to <th> (in a header row) or <td>, emitting
        colspan/rowspan only when they differ from 1."""
        if context.is_table_header:
            tag_name = "th"
        else:
            tag_name = "td"
        attributes = {}
        if table_cell.colspan != 1:
            attributes["colspan"] = str(table_cell.colspan)
        if table_cell.rowspan != 1:
            attributes["rowspan"] = str(table_cell.rowspan)
        nodes = [html.force_write] + self._visit_all(table_cell.children, context)
        return [
            html.element(tag_name, attributes, nodes)
        ]

    def visit_break(self, break_, context):
        return self._find_html_path_for_break(break_).wrap(lambda: [])

    def _find_html_path_for_break(self, break_):
        """Return the mapped path for a break; unmapped line breaks become
        <br>, other unmapped breaks the empty path."""
        style = self._find_style(break_, "break")
        if style is not None:
            return style.html_path
        elif break_.break_type == "line":
            return html_paths.path([html_paths.element("br", fresh=True)])
        else:
            return html_paths.empty

    def visit_note_reference(self, note_reference, context):
        """Record the reference and emit a superscript ``[n]`` link to the note."""
        self._note_references.append(note_reference)
        # Notes are numbered in the order their references are visited.
        note_number = len(self._note_references)
        return [
            html.element("sup", {}, [
                html.element("a", {
                    "href": "#" + self._note_html_id(note_reference),
                    "id": self._note_ref_html_id(note_reference),
                }, [html.text("[{0}]".format(note_number))])
            ])
        ]

    def visit_note(self, note, context):
        """Convert a note to an <li> whose body ends with a link back to its reference."""
        note_body = self._visit_all(note.body, context) + [
            html.collapsible_element("p", {}, [
                html.text(" "),
                html.element("a", {"href": "#" + self._note_ref_html_id(note)}, [
                    html.text(_up_arrow)
                ]),
            ])
        ]
        return [
            html.element("li", {"id": self._note_html_id(note)}, note_body)
        ]

    def visit_comment_reference(self, reference, context):
        """Emit a labelled link to a comment, wrapped in the mapped path.

        Without a matching style the default path is ``html_paths.ignore``.
        """
        def nodes():
            # Raises KeyError when the referenced comment id is unknown.
            comment = self._comments[reference.comment_id]
            count = len(self._referenced_comments) + 1
            label = "[{0}{1}]".format(_comment_author_label(comment), count)
            self._referenced_comments.append((label, comment))
            return [
                # TODO: remove duplication with note references
                html.element("a", {
                    "href": "#" + self._referent_html_id("comment", reference.comment_id),
                    "id": self._reference_html_id("comment", reference.comment_id),
                }, [html.text(label)])
            ]

        html_path = self._find_html_path(
            None,
            "comment_reference",
            default=html_paths.ignore,
        )

        return html_path.wrap(nodes)

    def visit_comment(self, referenced_comment, context):
        """Convert a ``(label, comment)`` pair to a <dt>/<dd> pair with a back-link."""
        label, comment = referenced_comment
        # TODO remove duplication with notes
        body = self._visit_all(comment.body, context) + [
            html.collapsible_element("p", {}, [
                html.text(" "),
                html.element("a", {"href": "#" + self._reference_html_id("comment", comment.comment_id)}, [
                    html.text(_up_arrow)
                ]),
            ])
        ]
        return [
            html.element(
                "dt",
                {"id": self._referent_html_id("comment", comment.comment_id)},
                [html.text("Comment {0}".format(label))],
            ),
            html.element("dd", {}, body),
        ]

    def _visit_all(self, elements, context):
        """Visit each element and concatenate the resulting node lists."""
        return [
            html_node
            for element in elements
            for html_node in self.visit(element, context)
        ]

    def _find_html_path_for_paragraph(self, paragraph):
        default = html_paths.path([html_paths.element("p", fresh=True)])
        return self._find_html_path(paragraph, "paragraph", default, warn_unrecognised=True)

    def _find_html_path_for_run(self, run):
        return self._find_html_path(run, "run", default=html_paths.empty, warn_unrecognised=True)

    def _find_html_path(self, element, element_type, default, warn_unrecognised=False):
        """Return the mapped path for ``element`` or ``default``.

        When ``warn_unrecognised`` is set and the unmatched element has a
        style id, a warning naming the style is recorded.
        """
        style = self._find_style(element, element_type)
        if style is not None:
            return style.html_path

        if warn_unrecognised and getattr(element, "style_id", None) is not None:
            self._messages.append(results.warning(
                "Unrecognised {0} style: {1} (Style ID: {2})".format(
                    element_type, element.style_name, element.style_id)
            ))

        return default

    def _find_style(self, element, element_type):
        """Return the first style whose matcher matches, or None (implicitly)."""
        for style in self._style_map:
            document_matcher = style.document_matcher
            if _document_matcher_matches(document_matcher, element, element_type):
                return style

    def _note_html_id(self, note):
        return self._referent_html_id(note.note_type, note.note_id)

    def _note_ref_html_id(self, note):
        return self._reference_html_id(note.note_type, note.note_id)

    def _referent_html_id(self, reference_type, reference_id):
        """Id of the referenced item itself: ``<prefix><type>-<id>``."""
        return self._html_id("{0}-{1}".format(reference_type, reference_id))

    def _reference_html_id(self, reference_type, reference_id):
        """Id of the in-text reference: ``<prefix><type>-ref-<id>``."""
        return self._html_id("{0}-ref-{1}".format(reference_type, reference_id))

    def _html_id(self, suffix):
        return "{0}{1}".format(self._id_prefix, suffix)
# Wraps a run's highlight colour so it can be passed as the element when
# looking up "highlight" styles.
@cobble.data
class Highlight:
    color = cobble.field()
def _document_matcher_matches(matcher, element, element_type):
if matcher.element_type in ["underline", "strikethrough", "all_caps", "small_caps", "bold", "italic", "comment_reference"]:
return matcher.element_type == element_type
elif matcher.element_type == "highlight":
return (
matcher.element_type == element_type and
(matcher.color is None or matcher.color == element.color)
)
elif matcher.element_type == "break":
return (
matcher.element_type == element_type and
matcher.break_type == element.break_type
)
else: # matcher.element_type in ["paragraph", "run"]:
return (
matcher.element_type == element_type and (
matcher.style_id is None or
matcher.style_id == element.style_id
) and (
matcher.style_name is None or
element.style_name is not None and (matcher.style_name.matches(element.style_name))
) and (
element_type != "paragraph" or
matcher.numbering is None or
matcher.numbering == element.numbering
)
)
def _comment_author_label(comment):
return comment.author_initials or ""
_up_arrow = ""

View File

@@ -0,0 +1,95 @@
import collections
import cobble
def paragraph(style_id=None, style_name=None, numbering=None):
    """Build a matcher for paragraphs; a criterion left as None is unconstrained."""
    return ParagraphMatcher(
        style_id=style_id,
        style_name=style_name,
        numbering=numbering,
    )


# Immutable record of the paragraph criteria, tagged with its element type.
ParagraphMatcher = collections.namedtuple("ParagraphMatcher", ["style_id", "style_name", "numbering"])
ParagraphMatcher.element_type = "paragraph"
def run(style_id=None, style_name=None):
    """Build a matcher for runs; a criterion left as None is unconstrained."""
    return RunMatcher(style_id=style_id, style_name=style_name)


# Immutable record of the run criteria, tagged with its element type.
RunMatcher = collections.namedtuple("RunMatcher", ["style_id", "style_name"])
RunMatcher.element_type = "run"
def table(style_id=None, style_name=None):
    """Build a matcher for tables; a criterion left as None is unconstrained."""
    return TableMatcher(style_id=style_id, style_name=style_name)


# Immutable record of the table criteria, tagged with its element type.
TableMatcher = collections.namedtuple("TableMatcher", ["style_id", "style_name"])
TableMatcher.element_type = "table"
# Marker matchers for run properties. They carry no criteria of their own:
# only the element_type tag is defined on each class.
class bold(object):
    element_type = "bold"


class italic(object):
    element_type = "italic"


class underline(object):
    element_type = "underline"


class strikethrough(object):
    element_type = "strikethrough"


class all_caps(object):
    element_type = "all_caps"


class small_caps(object):
    element_type = "small_caps"
def highlight(color=None):
    """Build a matcher for highlighted text, optionally limited to one colour."""
    return HighlightMatcher(color)


# Immutable record of the highlight criteria, tagged with its element type.
HighlightMatcher = collections.namedtuple("HighlightMatcher", ["color"])
HighlightMatcher.element_type = "highlight"
# Marker matcher for comment references; only the element_type tag is defined.
class comment_reference(object):
    element_type = "comment_reference"
# Matcher for breaks, identified by break type, with one shared instance per
# break type defined here.
BreakMatcher = collections.namedtuple("BreakMatcher", ["break_type"])
BreakMatcher.element_type = "break"
line_break = BreakMatcher("line")
page_break = BreakMatcher("page")
column_break = BreakMatcher("column")
def equal_to(value):
    """Return a StringMatcher that compares against ``value`` with ``_operator_equal_to``."""
    return StringMatcher(_operator_equal_to, value)
def _operator_equal_to(first, second):
return first.upper() == second.upper()
def starts_with(value):
    """Return a StringMatcher that tests for the prefix ``value`` with ``_operator_starts_with``."""
    return StringMatcher(_operator_starts_with, value)
def _operator_starts_with(first, second):
return second.upper().startswith(first.upper())
@cobble.data
class StringMatcher(object):
    """Pairs a two-argument comparison function with the value to compare against."""
    # Callable invoked as operator(value, other).
    operator = cobble.field()
    # The value stored in the matcher; passed as the operator's first argument.
    value = cobble.field()

    def matches(self, other):
        """Return the result of applying the operator to the stored value and ``other``."""
        return self.operator(self.value, other)

View File

@@ -0,0 +1,286 @@
import cobble
class Element(object):
    """Base class of the document element classes defined in this module."""

    def copy(self, **kwargs):
        """Return a copy of this element with the given fields replaced."""
        return cobble.copy(self, **kwargs)
class HasChildren(Element):
    """Base class for elements that hold child elements."""
    children = cobble.field()
@cobble.data
class Document(HasChildren):
    """Root element: body children plus the document's notes and comments."""
    notes = cobble.field()
    comments = cobble.field()
@cobble.data
class Paragraph(HasChildren):
    """A paragraph with its style, numbering, alignment and indent."""
    style_id = cobble.field()
    style_name = cobble.field()
    numbering = cobble.field()
    alignment = cobble.field()
    indent = cobble.field()
@cobble.data
class ParagraphIndent(object):
    """Indent values of a paragraph, expressed as start/end."""
    start = cobble.field()
    end = cobble.field()
    first_line = cobble.field()
    hanging = cobble.field()
@cobble.data
class Indent(object):
    """Indent values expressed as left/right.

    NOTE(review): not referenced elsewhere in the visible part of this
    module — confirm where it is used.
    """
    left = cobble.field()
    right = cobble.field()
    first_line = cobble.field()
    hanging = cobble.field()
@cobble.data
class Run(HasChildren):
    """A run of inline content together with its character formatting.

    Instances are normally built through the ``run()`` factory, which
    coerces the ``is_*`` flags to bool and defaults ``vertical_alignment``.
    """
    style_id = cobble.field()
    style_name = cobble.field()
    is_bold = cobble.field()
    is_italic = cobble.field()
    is_underline = cobble.field()
    is_strikethrough = cobble.field()
    is_all_caps = cobble.field()
    is_small_caps = cobble.field()
    vertical_alignment = cobble.field()
    font = cobble.field()
    font_size = cobble.field()
    highlight = cobble.field()
@cobble.data
class Text(Element):
    """A piece of text; ``value`` holds the string."""
    value = cobble.field()
@cobble.data
class Hyperlink(HasChildren):
    """A hyperlink with an ``href`` and/or ``anchor`` and an optional target frame."""
    href = cobble.field()
    anchor = cobble.field()
    target_frame = cobble.field()
@cobble.data
class Checkbox(Element):
    """A checkbox; ``checked`` holds its state."""
    checked = cobble.field()


# Lower-case factory alias for Checkbox.
checkbox = Checkbox
@cobble.data
class Table(HasChildren):
    """A table with its style id and style name."""
    style_id = cobble.field()
    style_name = cobble.field()
@cobble.data
class TableRow(HasChildren):
    """A table row; ``is_header`` marks header rows."""
    is_header = cobble.field()
@cobble.data
class TableCell(HasChildren):
    """A table cell with its column and row span."""
    colspan = cobble.field()
    rowspan = cobble.field()
@cobble.data
class TableCellUnmerged:
    """A table cell that additionally carries a ``vmerge`` value.

    It does not derive from Element; instead it defines ``_accept1`` and
    ``copy`` itself, and visitors handle it through ``visit_table_cell``.
    NOTE(review): presumably an intermediate form used before vertically
    merged cells are combined — confirm against the reader.
    """
    children = cobble.field()
    colspan = cobble.field()
    rowspan = cobble.field()
    vmerge = cobble.field()

    def _accept1(self, visitor, arg0):
        # Dispatch to the same visitor method as a regular table cell.
        return visitor.visit_table_cell(self, arg0)

    def copy(self, **kwargs):
        """Return a copy of this cell with the given fields replaced."""
        return cobble.copy(self, **kwargs)
@cobble.data
class Break(Element):
    """A break, identified by its ``break_type``."""
    break_type = cobble.field()


# One shared instance per break type defined here.
line_break = Break("line")
page_break = Break("page")
column_break = Break("column")
@cobble.data
class Tab(Element):
    """A tab character; carries no data."""
    pass
@cobble.data
class Image(Element):
    """An image with alt text, content type and an ``open`` field.

    NOTE(review): ``open`` is presumably a callable returning a file-like
    context manager for the image bytes — confirm against the reader.
    """
    alt_text = cobble.field()
    content_type = cobble.field()
    open = cobble.field()
def document(children, notes=None, comments=None):
    """Create a Document, defaulting to no notes and no comments."""
    resolved_notes = Notes({}) if notes is None else notes
    resolved_comments = [] if comments is None else comments
    return Document(children, resolved_notes, comments=resolved_comments)
def paragraph(children, style_id=None, style_name=None, numbering=None, alignment=None, indent=None):
    """Create a Paragraph; a missing indent becomes an empty ParagraphIndent."""
    resolved_indent = paragraph_indent() if indent is None else indent
    return Paragraph(
        children,
        style_id,
        style_name,
        numbering,
        alignment=alignment,
        indent=resolved_indent,
    )
def paragraph_indent(start=None, end=None, first_line=None, hanging=None):
    """Create a ParagraphIndent; every value defaults to None."""
    return ParagraphIndent(start=start, end=end, first_line=first_line, hanging=hanging)
def run(
    children,
    style_id=None,
    style_name=None,
    is_bold=None,
    is_italic=None,
    is_underline=None,
    is_strikethrough=None,
    is_all_caps=None,
    is_small_caps=None,
    vertical_alignment=None,
    font=None,
    font_size=None,
    highlight=None,
):
    """Create a Run.

    The ``is_*`` flags are coerced to bool, and a missing
    ``vertical_alignment`` becomes ``VerticalAlignment.baseline``.
    """
    if vertical_alignment is None:
        vertical_alignment = VerticalAlignment.baseline

    formatting_flags = dict(
        is_bold=bool(is_bold),
        is_italic=bool(is_italic),
        is_underline=bool(is_underline),
        is_strikethrough=bool(is_strikethrough),
        is_all_caps=bool(is_all_caps),
        is_small_caps=bool(is_small_caps),
    )
    return Run(
        children=children,
        style_id=style_id,
        style_name=style_name,
        vertical_alignment=vertical_alignment,
        font=font,
        font_size=font_size,
        highlight=highlight,
        **formatting_flags
    )
class VerticalAlignment(object):
    """String constants for a run's vertical alignment."""
    baseline = "baseline"
    superscript = "superscript"
    subscript = "subscript"
# Lower-case factory alias for Text.
text = Text

# Single Tab instance returned by every call to tab().
_tab = Tab()


def tab():
    """Return the shared Tab instance."""
    return _tab


# Lower-case factory alias for Image.
image = Image
def hyperlink(children, href=None, anchor=None, target_frame=None):
    """Create a Hyperlink; href, anchor and target_frame default to None."""
    return Hyperlink(href=href, anchor=anchor, target_frame=target_frame, children=children)
@cobble.data
class Bookmark(Element):
    """A bookmark, identified by ``name``."""
    name = cobble.field()


# Lower-case factory alias for Bookmark.
bookmark = Bookmark
def table(children, style_id=None, style_name=None):
    """Create a Table; style_id and style_name default to None."""
    return Table(children=children, style_id=style_id, style_name=style_name)
def table_row(children, is_header=None):
    """Create a TableRow; ``is_header`` is coerced to bool."""
    return TableRow(children=children, is_header=bool(is_header))
def table_cell(children, colspan=None, rowspan=None):
    """Create a TableCell; missing spans default to 1."""
    effective_colspan = 1 if colspan is None else colspan
    effective_rowspan = 1 if rowspan is None else rowspan
    return TableCell(
        children=children,
        colspan=effective_colspan,
        rowspan=effective_rowspan,
    )
def table_cell_unmerged(children, colspan, rowspan, vmerge):
    """Create a TableCellUnmerged; all values are required and stored as given."""
    return TableCellUnmerged(children=children, colspan=colspan, rowspan=rowspan, vmerge=vmerge)
def numbering_level(level_index, is_ordered):
    """Create a numbering level; the index is stored as a string and
    ``is_ordered`` is coerced to bool."""
    return _NumberingLevel(str(level_index), bool(is_ordered))
@cobble.data
class _NumberingLevel(object):
    """A list numbering level; built through ``numbering_level()``."""
    level_index = cobble.field()
    is_ordered = cobble.field()
@cobble.data
class Note(Element):
    """A note, identified by ``(note_type, note_id)``, with its body."""
    note_type = cobble.field()
    note_id = cobble.field()
    body = cobble.field()


# Lower-case factory alias for Note.
note = Note
class Notes(object):
    """Lookup table of notes keyed by ``(note_type, note_id)``."""

    def __init__(self, notes):
        self._notes = notes

    def find_note(self, note_type, note_id):
        """Return the note stored under the key; raises KeyError if absent."""
        key = (note_type, note_id)
        return self._notes[key]

    def resolve(self, reference):
        """Return the note that ``reference`` points to."""
        return self.find_note(reference.note_type, reference.note_id)

    def __eq__(self, other):
        if not isinstance(other, Notes):
            return False
        return self._notes == other._notes

    def __ne__(self, other):
        return not (self == other)
def notes(notes_list):
    """Build a Notes lookup from an iterable of notes, keyed by ``_note_key``."""
    notes_by_key = {
        _note_key(note): note
        for note in notes_list
    }
    return Notes(notes_by_key)
def _note_key(note):
return (note.note_type, note.note_id)
@cobble.data
class NoteReference(Element):
    """A reference to a note, carrying the note's type and id."""
    note_type = cobble.field()
    note_id = cobble.field()


# Lower-case factory alias for NoteReference.
note_reference = NoteReference
@cobble.data
class Comment(object):
    """A comment with its id, body and author details."""
    comment_id = cobble.field()
    body = cobble.field()
    author_name = cobble.field()
    author_initials = cobble.field()
def comment(comment_id, body, author_name=None, author_initials=None):
    """Create a Comment; the author name and initials default to None."""
    return Comment(
        comment_id=comment_id,
        body=body,
        author_name=author_name,
        author_initials=author_initials,
    )
@cobble.data
class CommentReference(Element):
    """A reference to a comment, carrying the comment's id."""
    comment_id = cobble.field()


# Lower-case factory alias for CommentReference.
comment_reference = CommentReference
def element_visitor(args):
    """Return a cobble visitor base class for Element, created with ``args=args``."""
    return cobble.visitor(Element, args=args)

View File

@@ -0,0 +1,211 @@
from functools import partial
import os
import cobble
from .. import results, lists, zips
from .document_xml import read_document_xml_element
from .content_types_xml import empty_content_types, read_content_types_xml_element
from .relationships_xml import read_relationships_xml_element, Relationships
from .numbering_xml import read_numbering_xml_element, Numbering
from .styles_xml import read_styles_xml_element, Styles
from .notes_xml import read_endnotes_xml_element, read_footnotes_xml_element
from .comments_xml import read_comments_xml_element
from .files import Files
from . import body_xml, office_xml
from ..zips import open_zip
# Successful result holding an empty list; used as the default when the
# notes or comments parts are missing.
_empty_result = results.success([])
def read(fileobj, external_file_access=False):
    """Read a .docx file object.

    Notes and comments are read first because the main document part is
    read with both passed in.

    :param fileobj: file object of the .docx archive; its ``name`` attribute,
        when present, is handed to the part reader as the document path.
    :param external_file_access: forwarded to the part reader.
    :returns: the result of reading the main document part.
    """
    zip_file = open_zip(fileobj, "r")
    part_paths = _find_part_paths(zip_file)
    read_part_with_body = _part_with_body_reader(
        getattr(fileobj, "name", None),
        zip_file,
        part_paths=part_paths,
        external_file_access=external_file_access,
    )

    def read_main_document(referents):
        return _read_document(
            zip_file,
            read_part_with_body,
            notes=referents[0],
            comments=referents[1],
            part_paths=part_paths,
        )

    referents_result = results.combine([
        _read_notes(read_part_with_body, part_paths),
        _read_comments(read_part_with_body, part_paths),
    ])
    return referents_result.bind(read_main_document)
@cobble.data
class _PartPaths(object):
    """Paths of the package parts, as resolved by ``_find_part_paths``."""
    main_document = cobble.field()
    comments = cobble.field()
    endnotes = cobble.field()
    footnotes = cobble.field()
    numbering = cobble.field()
    styles = cobble.field()
def _find_part_paths(zip_file):
    """Resolve the paths of the main document and its related parts.

    The main document is located through the package relationships; every
    other part is looked up in the main document's relationships, falling
    back to ``word/<name>.xml``.
    """
    package_relationships = _read_relationships(zip_file, "_rels/.rels")
    document_filename = _find_document_filename(zip_file, package_relationships)
    document_relationships = _read_relationships(
        zip_file,
        _find_relationships_path_for(document_filename),
    )
    relationship_type_prefix = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/"

    def find(name):
        document_directory = zips.split_path(document_filename)[0]
        return _find_part_path(
            zip_file=zip_file,
            relationships=document_relationships,
            relationship_type=relationship_type_prefix + name,
            fallback_path="word/{0}.xml".format(name),
            base_path=document_directory,
        )

    comments_path = find("comments")
    endnotes_path = find("endnotes")
    footnotes_path = find("footnotes")
    numbering_path = find("numbering")
    styles_path = find("styles")
    return _PartPaths(
        main_document=document_filename,
        comments=comments_path,
        endnotes=endnotes_path,
        footnotes=footnotes_path,
        numbering=numbering_path,
        styles=styles_path,
    )
def _find_document_filename(zip_file, relationships):
    """Return the path of the main document part.

    :raises IOError: when neither the related part nor the
        ``word/document.xml`` fallback exists in the archive.
    """
    office_document_type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"
    path = _find_part_path(
        zip_file,
        relationships,
        relationship_type=office_document_type,
        base_path="",
        fallback_path="word/document.xml",
    )
    if not zip_file.exists(path):
        raise IOError("Could not find main document part. Are you sure this is a valid .docx file?")
    return path
def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
    """Return the first relationship target of the given type that exists in
    the archive, or ``fallback_path`` when none does.

    Targets are joined onto ``base_path`` and stripped of leading slashes
    before being checked.
    """
    candidates = [
        zips.join_path(base_path, target).lstrip("/")
        for target in relationships.find_targets_by_type(relationship_type)
    ]
    existing = [
        candidate
        for candidate in candidates
        if zip_file.exists(candidate)
    ]
    if existing:
        return existing[0]
    return fallback_path
def _read_notes(read_part_with_body, part_paths):
    """Read footnotes and endnotes and return them as one flattened result.

    A missing part contributes an empty list.
    """
    def read_notes_part(path, read_element):
        return read_part_with_body(
            path,
            lambda root, body_reader: read_element(root, body_reader=body_reader),
            default=_empty_result,
        )

    footnotes = read_notes_part(part_paths.footnotes, read_footnotes_xml_element)
    endnotes = read_notes_part(part_paths.endnotes, read_endnotes_xml_element)
    return results.combine([footnotes, endnotes]).map(lists.flatten)
def _read_comments(read_part_with_body, part_paths):
    """Read the comments part; a missing part yields the empty result."""
    return read_part_with_body(
        part_paths.comments,
        lambda root, body_reader: read_comments_xml_element(root, body_reader=body_reader),
        default=_empty_result,
    )
def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
    """Read the main document part, passing the already-read notes and comments.

    ``zip_file`` is accepted but not used here. No default is given to the
    part reader, so the part is read without an existence check.
    """
    return read_part_with_body(
        part_paths.main_document,
        partial(
            read_document_xml_element,
            notes=notes,
            comments=comments,
        ),
    )
def _part_with_body_reader(document_path, zip_file, part_paths, external_file_access):
    """Build a function that reads a package part with its own body reader.

    Content types, styles and numbering are read once up front and shared;
    relationships are read per part. The returned
    ``read_part(name, reader, default=...)`` calls ``reader`` with the part's
    root element and a ``body_reader`` keyword. When ``default`` is given it
    is returned for a part that does not exist in the archive.
    """
    content_types = _try_read_entry_or_default(
        zip_file,
        "[Content_Types].xml",
        read_content_types_xml_element,
        empty_content_types,
    )
    styles = _try_read_entry_or_default(
        zip_file,
        part_paths.styles,
        read_styles_xml_element,
        Styles.EMPTY,
    )

    def read_numbering(element):
        return read_numbering_xml_element(element, styles=styles)

    numbering = _try_read_entry_or_default(
        zip_file,
        part_paths.numbering,
        read_numbering,
        default=Numbering.EMPTY,
    )

    base_directory = None if document_path is None else os.path.dirname(document_path)
    files = Files(
        base_directory,
        external_file_access=external_file_access,
    )

    def read_part(name, reader, default=_undefined):
        relationships = _read_relationships(zip_file, _find_relationships_path_for(name))
        body_reader = body_xml.reader(
            numbering=numbering,
            content_types=content_types,
            relationships=relationships,
            styles=styles,
            docx_file=zip_file,
            files=files,
        )
        read_with_body = partial(reader, body_reader=body_reader)

        if default is _undefined:
            return _read_entry(zip_file, name, read_with_body)
        return _try_read_entry_or_default(zip_file, name, read_with_body, default=default)

    return read_part
def _find_relationships_path_for(name):
    """Return the relationships path for a part: ``<dir>/_rels/<basename>.rels``."""
    dirname, basename = zips.split_path(name)
    return zips.join_path(dirname, "_rels", basename + ".rels")
def _read_relationships(zip_file, name):
    """Read the relationships part ``name``, or return ``Relationships.EMPTY``
    when it does not exist."""
    return _try_read_entry_or_default(
        zip_file,
        name,
        read_relationships_xml_element,
        default=Relationships.EMPTY,
    )
def _try_read_entry_or_default(zip_file, name, reader, default):
if zip_file.exists(name):
return _read_entry(zip_file, name, reader)
else:
return default
def _read_entry(zip_file, name, reader):
    """Open entry ``name``, parse it with ``office_xml.read`` and return
    ``reader`` applied to the parsed content."""
    with zip_file.open(name) as fileobj:
        return reader(office_xml.read(fileobj))
# Sentinel default of read_part's ``default`` parameter, marking "no default
# supplied".
_undefined = object()

View File

@@ -0,0 +1,794 @@
import contextlib
import re
import sys
from .. import documents
from .. import results
from .. import lists
from .. import transforms
from . import complex_fields
from .dingbats import dingbats
from .xmlparser import node_types, XmlElement, null_xml_element
from .styles_xml import Styles
from .uris import replace_fragment, uri_to_zip_entry_name
# On Python 3, bind the name unichr to chr so this module can use one name
# on both major versions (presumably used further down the file).
if sys.version_info >= (3, ):
    unichr = chr
def reader(
    numbering=None,
    content_types=None,
    relationships=None,
    styles=None,
    docx_file=None,
    files=None
):
    """Create a body reader from the given package parts.

    ``styles`` defaults to ``Styles.EMPTY``; every other argument is passed
    to ``_create_reader`` unchanged.
    """
    effective_styles = Styles.EMPTY if styles is None else styles
    return _BodyReader(_create_reader(
        numbering=numbering,
        content_types=content_types,
        relationships=relationships,
        styles=effective_styles,
        docx_file=docx_file,
        files=files,
    ))
class _BodyReader(object):
    """Thin wrapper around the ``read_all`` function built by ``_create_reader``."""

    def __init__(self, read_all):
        self._read_all = read_all

    def read_all(self, elements):
        """Read ``elements`` and return a ``results.Result`` built from the
        read result's ``elements`` and ``messages``."""
        result = self._read_all(elements)
        return results.Result(result.elements, result.messages)
def _create_reader(numbering, content_types, relationships, styles, docx_file, files):
current_instr_text = []
complex_field_stack = []
# When a paragraph is marked as deleted, its contents should be combined
# with the following paragraph. See 17.13.5.15 del (Deleted Paragraph) of
# ECMA-376 4th edition Part 1.
deleted_paragraph_contents = []
_ignored_elements = set([
"office-word:wrap",
"v:shadow",
"v:shapetype",
"w:annotationRef",
"w:bookmarkEnd",
"w:sectPr",
"w:proofErr",
"w:lastRenderedPageBreak",
"w:commentRangeStart",
"w:commentRangeEnd",
"w:del",
"w:footnoteRef",
"w:endnoteRef",
"w:pPr",
"w:rPr",
"w:tblPr",
"w:tblGrid",
"w:trPr",
"w:tcPr",
])
def text(element):
    """Read a text element as a ``documents.Text`` holding its inner text."""
    return _success(documents.Text(_inner_text(element)))
def run(element):
    """Read a run element into a ``documents.run``.

    Formatting is taken from the run's ``w:rPr`` child. The children are
    wrapped in a hyperlink when ``current_hyperlink_kwargs()`` returns
    keyword arguments rather than None.
    """
    properties = element.find_child_or_null("w:rPr")
    vertical_alignment = properties \
        .find_child_or_null("w:vertAlign") \
        .attributes.get("w:val")
    font = properties.find_child_or_null("w:rFonts").attributes.get("w:ascii")

    font_size_string = properties.find_child_or_null("w:sz").attributes.get("w:val")
    if _is_int(font_size_string):
        # w:sz gives the font size in half points, so halve the value to get the size in points.
        # Divide by 2.0 so that odd values keep their half point on Python 2,
        # where int / int truncates; the result is unchanged on Python 3.
        font_size = int(font_size_string) / 2.0
    else:
        font_size = None

    is_bold = read_boolean_element(properties.find_child("w:b"))
    is_italic = read_boolean_element(properties.find_child("w:i"))
    is_underline = read_underline_element(properties.find_child("w:u"))
    is_strikethrough = read_boolean_element(properties.find_child("w:strike"))
    is_all_caps = read_boolean_element(properties.find_child("w:caps"))
    is_small_caps = read_boolean_element(properties.find_child("w:smallCaps"))
    highlight = read_highlight_value(properties.find_child_or_null("w:highlight").attributes.get("w:val"))

    def add_complex_field_hyperlink(children):
        hyperlink_kwargs = current_hyperlink_kwargs()
        if hyperlink_kwargs is None:
            return children
        else:
            return [documents.hyperlink(children=children, **hyperlink_kwargs)]

    return _ReadResult.map_results(
        _read_run_style(properties),
        _read_xml_elements(element.children).map(add_complex_field_hyperlink),
        lambda style, children: documents.run(
            children=children,
            style_id=style[0],
            style_name=style[1],
            is_bold=is_bold,
            is_italic=is_italic,
            is_underline=is_underline,
            is_strikethrough=is_strikethrough,
            is_all_caps=is_all_caps,
            is_small_caps=is_small_caps,
            vertical_alignment=vertical_alignment,
            font=font,
            font_size=font_size,
            highlight=highlight,
        ))
def _read_run_style(properties):
    """Resolve the character style referenced by w:rStyle in the run properties."""
    find_style = styles.find_character_style_by_id
    return _read_style(properties, "w:rStyle", "Run", find_style)
def read_boolean_element(element):
    """Return False for a missing element, otherwise interpret its w:val attribute."""
    if element is None:
        return False
    raw_value = element.attributes.get("w:val")
    return read_boolean_attribute_value(raw_value)
def read_boolean_attribute_value(value):
    """Return True unless the attribute value is explicitly "false" or "0"."""
    is_explicitly_off = value == "false" or value == "0"
    return not is_explicitly_off
def read_underline_element(element):
    """Tell whether a w:u element switches underlining on.

    A falsy element (e.g. None) is returned unchanged; otherwise the result is
    True unless w:val is missing, "false", "0" or "none".
    """
    if not element:
        return element
    underline_type = element.attributes.get("w:val")
    return underline_type not in [None, "false", "0", "none"]
def read_highlight_value(value):
    """Return the highlight value, or None when it is empty or "none"."""
    if value and value != "none":
        return value
    return None
def paragraph(element):
    """Read a w:p element into a paragraph.

    A paragraph whose w:pPr/w:rPr contains w:del is not emitted: its children
    are stashed in deleted_paragraph_contents and prepended to the children of
    the next paragraph that is not marked this way.
    """
    properties = element.find_child_or_null("w:pPr")
    is_deleted = properties.find_child_or_null("w:rPr").find_child("w:del")
    if is_deleted is not None:
        # Keep the contents so they can be merged into a following paragraph.
        for child in element.children:
            deleted_paragraph_contents.append(child)
        return _empty_result
    else:
        alignment = properties.find_child_or_null("w:jc").attributes.get("w:val")
        indent = _read_paragraph_indent(properties.find_child_or_null("w:ind"))
        children_xml = element.children
        if deleted_paragraph_contents:
            # Concatenation builds a new list, so clearing the stash afterwards
            # does not affect children_xml.
            children_xml = deleted_paragraph_contents + children_xml
            del deleted_paragraph_contents[:]
        # append_extra() moves any "extra" elements produced while reading the
        # children into the result's elements, after the paragraph itself.
        return _ReadResult.map_results(
            _read_paragraph_style(properties),
            _read_xml_elements(children_xml),
            lambda style, children: documents.paragraph(
                children=children,
                style_id=style[0],
                style_name=style[1],
                numbering=_read_numbering_properties(
                    paragraph_style_id=style[0],
                    element=properties.find_child_or_null("w:numPr"),
                ),
                alignment=alignment,
                indent=indent,
            )).append_extra()
def _read_paragraph_style(properties):
    """Resolve the paragraph style referenced by w:pStyle in the paragraph properties."""
    find_style = styles.find_paragraph_style_by_id
    return _read_style(properties, "w:pStyle", "Paragraph", find_style)
def current_hyperlink_kwargs():
    """Return the kwargs of the innermost open hyperlink complex field, or None."""
    innermost_first = complex_field_stack[::-1]
    hyperlink_fields = (
        field
        for field in innermost_first
        if isinstance(field, complex_fields.Hyperlink)
    )
    innermost_hyperlink = next(hyperlink_fields, None)
    if innermost_hyperlink is None:
        return None
    return innermost_hyperlink.kwargs
def read_fld_char(element):
    """Update the complex field stack for a w:fldChar element.

    "begin" pushes a new field and resets the collected instruction text,
    "separate" replaces the top of the stack with the parsed field, and "end"
    pops the field. Only a checkbox field produces an element (on "end");
    every other case returns the empty result.
    """
    fld_char_type = element.attributes.get("w:fldCharType")
    if fld_char_type == "begin":
        complex_field_stack.append(complex_fields.begin(fld_char=element))
        del current_instr_text[:]
    elif fld_char_type == "end":
        # NOTE(review): pop() presumably raises if an "end" arrives with no
        # matching "begin" - confirm whether malformed input should be tolerated.
        complex_field = complex_field_stack.pop()
        if isinstance(complex_field, complex_fields.Begin):
            # No "separate" was seen, so the instruction text is still unparsed.
            complex_field = parse_current_instr_text(complex_field)
        if isinstance(complex_field, complex_fields.Checkbox):
            return _success(documents.checkbox(checked=complex_field.checked))
    elif fld_char_type == "separate":
        complex_field_separate = complex_field_stack.pop()
        complex_field = parse_current_instr_text(complex_field_separate)
        # parse_instr_text returns None for unrecognised instructions, so None
        # may be pushed here as a placeholder.
        complex_field_stack.append(complex_field)
    return _empty_result
def parse_current_instr_text(complex_field):
    """Parse the instruction text collected so far for the given complex field.

    The w:fldChar element of a Begin field is passed along; any other field
    gets the null XML element instead.
    """
    is_begin = isinstance(complex_field, complex_fields.Begin)
    fld_char = complex_field.fld_char if is_begin else null_xml_element
    collected_text = "".join(current_instr_text)
    return parse_instr_text(collected_text, fld_char=fld_char)
def parse_instr_text(instr_text, *, fld_char):
    """Turn field instruction text into a complex field, or None if unrecognised.

    Recognises external hyperlinks, internal (\\l) hyperlinks and form
    checkboxes. For checkboxes, the checked state comes from w:checked when
    present and from w:default otherwise.
    """
    external_link = re.match(r'\s*HYPERLINK "(.*)"', instr_text)
    if external_link is not None:
        return complex_fields.hyperlink(dict(href=external_link.group(1)))

    internal_link = re.match(r'\s*HYPERLINK\s+\\l\s+"(.*)"', instr_text)
    if internal_link is not None:
        return complex_fields.hyperlink(dict(anchor=internal_link.group(1)))

    if re.match(r'\s*FORMCHECKBOX\s*', instr_text) is None:
        return None

    form_data = fld_char.find_child_or_null("w:ffData")
    checkbox_element = form_data.find_child_or_null("w:checkBox")
    state_element = checkbox_element.find_child("w:checked")
    if state_element is None:
        state_element = checkbox_element.find_child("w:default")
    return complex_fields.checkbox(checked=read_boolean_element(state_element))
def read_instr_text(element):
    """Collect the text of a w:instrText element; it produces no output elements."""
    fragment = _inner_text(element)
    current_instr_text.append(fragment)
    return _empty_result
def _read_style(properties, style_tag_name, style_type, find_style_by_id):
    """Read a style reference and return a result holding [style_id, style_name].

    A warning is attached when the referenced style ID cannot be found.
    """
    style_element = properties.find_child_or_null(style_tag_name)
    style_id = style_element.attributes.get("w:val")
    style_name = None
    warnings = []
    if style_id is not None:
        style = find_style_by_id(style_id)
        if style is not None:
            style_name = style.name
        else:
            warnings.append(_undefined_style_warning(style_type, style_id))
    return _ReadResult([style_id, style_name], [], warnings)
def _undefined_style_warning(style_type, style_id):
    """Build the warning emitted for a style reference that has no definition."""
    template = "{0} style with ID {1} was referenced but not defined in the document"
    return results.warning(template.format(style_type, style_id))
def _read_numbering_properties(paragraph_style_id, element):
    """Find the numbering level for a paragraph.

    An explicit numId/ilvl pair takes precedence, followed by a level bound to
    the paragraph style, followed by level "0" of the referenced numbering.
    """
    num_id = element.find_child_or_null("w:numId").attributes.get("w:val")
    level_index = element.find_child_or_null("w:ilvl").attributes.get("w:val")
    has_num_id = num_id is not None

    if has_num_id and level_index is not None:
        return numbering.find_level(num_id, level_index)

    if paragraph_style_id is not None:
        style_level = numbering.find_level_by_paragraph_style_id(paragraph_style_id)
        if style_level is not None:
            return style_level

    if not has_num_id:
        return None

    # Some malformed documents define numbering levels without an index, and
    # reference the numbering using a w:numPr element without a w:ilvl child.
    # To handle such cases, we assume a level of 0 as a fallback.
    return numbering.find_level(num_id, "0")
def _read_paragraph_indent(element):
    """Read the attributes of a w:ind element into a paragraph indent.

    w:start and w:end take precedence over w:left and w:right.
    """
    read = element.attributes.get
    start = read("w:start") or read("w:left")
    end = read("w:end") or read("w:right")
    return documents.paragraph_indent(
        start=start,
        end=end,
        first_line=read("w:firstLine"),
        hanging=read("w:hanging"),
    )
def tab(element):
    """Read a w:tab element as a tab."""
    tab_element = documents.tab()
    return _success(tab_element)
def no_break_hyphen(element):
    """Read a w:noBreakHyphen element as the character with code point 0x2011."""
    hyphen = unichr(0x2011)
    return _success(documents.text(hyphen))
def soft_hyphen(element):
    """Read a w:softHyphen element as the character U+00AD."""
    hyphen = documents.text(u"\u00ad")
    return _success(hyphen)
def symbol(element):
    """Read a w:sym element as text, using the dingbats lookup table.

    The character is looked up by (font, code point). If that fails and the
    code starts with "F0", the lookup is retried without that prefix. A w:sym
    whose character is missing, is not valid hexadecimal, or has no mapping
    is dropped with a warning rather than aborting the conversion.
    """
    # See 17.3.3.30 sym (Symbol Character) of ECMA-376 4th edition Part 1
    font = element.attributes.get("w:font")
    char = element.attributes.get("w:char")
    try:
        unicode_code_point = dingbats.get((font, int(char, 16)))
        if unicode_code_point is None and re.match("^F0..", char):
            unicode_code_point = dingbats.get((font, int(char[2:], 16)))
    except (TypeError, ValueError):
        # w:char is absent (None) or not a hexadecimal number: treat it like
        # any other unsupported character.
        unicode_code_point = None

    if unicode_code_point is None:
        warning = results.warning("A w:sym element with an unsupported character was ignored: char {0} in font {1}".format(
            char,
            font,
        ))
        return _empty_result_with_message(warning)
    else:
        return _success(documents.text(unichr(unicode_code_point)))
def table(element):
    """Read a w:tbl element into a table, computing row spans for its rows."""
    properties = element.find_child_or_null("w:tblPr")

    def build_table(style, children):
        # style is the [style_id, style_name] pair produced by _read_style.
        return documents.table(
            children=children,
            style_id=style[0],
            style_name=style[1],
        )

    style_result = read_table_style(properties)
    rows_result = _read_xml_elements(element.children).flat_map(calculate_row_spans)
    return _ReadResult.map_results(style_result, rows_result, build_table)
def read_table_style(properties):
    """Resolve the table style referenced by w:tblStyle in the table properties."""
    find_style = styles.find_table_style_by_id
    return _read_style(properties, "w:tblStyle", "Table", find_style)
def table_row(element):
    """Read a w:tr element into a table row; rows marked as deleted are dropped."""
    properties = element.find_child_or_null("w:trPr")

    # See 17.13.5.12 del (Deleted Table Row) of ECMA-376 4th edition Part 1
    if bool(properties.find_child("w:del")):
        return _empty_result

    is_header = bool(properties.find_child("w:tblHeader"))

    def build_row(children):
        return documents.table_row(children=children, is_header=is_header)

    return _read_xml_elements(element.children).map(build_row)
def table_cell(element):
    """Read a w:tc element into an unmerged table cell.

    The column span comes from w:gridSpan (default 1); the row span starts at 1
    and the vertical-merge flag is recorded on the cell.
    """
    properties = element.find_child_or_null("w:tcPr")
    gridspan = properties.find_child_or_null("w:gridSpan").attributes.get("w:val")
    colspan = 1 if gridspan is None else int(gridspan)

    def build_cell(children):
        return documents.table_cell_unmerged(
            children=children,
            colspan=colspan,
            rowspan=1,
            vmerge=read_vmerge(properties),
        )

    return _read_xml_elements(element.children).map(build_cell)
def read_vmerge(properties):
    """Tell whether a cell continues a vertical merge.

    True when w:vMerge is present with a value of "continue" or with no
    (or an empty) value; False otherwise.
    """
    vmerge_element = properties.find_child("w:vMerge")
    if vmerge_element is None:
        return False
    merge_type = vmerge_element.attributes.get("w:val")
    if merge_type == "continue":
        return True
    return not merge_type
def calculate_row_spans(rows):
    """Merge vertically-merged cells by computing row spans.

    If any child of the table is not a row, or any child of a row is not an
    unmerged cell, no merging is attempted: unmerged cells are converted to
    plain cells and a warning is attached. Otherwise cells flagged as vmerge
    extend the row span of the cell above them in the same column and are then
    removed from their row. The row and cell objects are modified in place.
    """
    unexpected_non_rows = any(
        not isinstance(row, documents.TableRow)
        for row in rows
    )
    if unexpected_non_rows:
        rows = remove_unmerged_table_cells(rows)
        return _elements_result_with_messages(rows, [results.warning(
            "unexpected non-row element in table, cell merging may be incorrect"
        )])
    unexpected_non_cells = any(
        not isinstance(cell, documents.TableCellUnmerged)
        for row in rows
        for cell in row.children
    )
    if unexpected_non_cells:
        rows = remove_unmerged_table_cells(rows)
        return _elements_result_with_messages(rows, [results.warning(
            "unexpected non-cell element in table row, cell merging may be incorrect"
        )])
    # Maps a starting column index to the most recent cell that begins a
    # (possibly merged) run in that column.
    columns = {}
    for row in rows:
        cell_index = 0
        for cell in row.children:
            if cell.vmerge and cell_index in columns:
                # Continuation of a merge: grow the cell that started it.
                columns[cell_index].rowspan += 1
            else:
                # Start of a new run; a vmerge cell with nothing above it is
                # treated as an ordinary cell.
                columns[cell_index] = cell
                cell.vmerge = False
            cell_index += cell.colspan
    for row in rows:
        # Cells still flagged as vmerge were absorbed into a cell above.
        row.children = [
            documents.table_cell(
                children=cell.children,
                colspan=cell.colspan,
                rowspan=cell.rowspan,
            )
            for cell in row.children
            if not cell.vmerge
        ]
    return _success(rows)
def remove_unmerged_table_cells(rows):
    """Apply to each row a transform that turns unmerged cells into plain table cells."""
    def to_table_cell(cell):
        return documents.table_cell(
            children=cell.children,
            colspan=cell.colspan,
            rowspan=cell.rowspan,
        )

    convert = transforms.element_of_type(documents.TableCellUnmerged, to_table_cell)
    return [convert(row) for row in rows]
def read_child_elements(element):
    """Read only the children of an element, discarding the element itself."""
    child_nodes = element.children
    return _read_xml_elements(child_nodes)
def pict(element):
    """Read the children of a w:pict element and move them into the result's extra list."""
    children_result = read_child_elements(element)
    return children_result.to_extra()
def hyperlink(element):
    """Read a w:hyperlink element.

    A relationship ID yields an href (with the anchor as its fragment when
    both are given); an anchor alone yields an internal link; with neither,
    the children are returned without a hyperlink wrapper.
    """
    attributes = element.attributes
    relationship_id = attributes.get("r:id")
    anchor = attributes.get("w:anchor")
    target_frame = attributes.get("w:tgtFrame") or None
    children_result = _read_xml_elements(element.children)

    if relationship_id is not None:
        href = relationships.find_target_by_relationship_id(relationship_id)
        if anchor is not None:
            href = replace_fragment(href, anchor)
        link_kwargs = dict(href=href)
    elif anchor is not None:
        link_kwargs = dict(anchor=anchor)
    else:
        return children_result

    def build_hyperlink(children):
        return documents.hyperlink(
            children=children,
            target_frame=target_frame,
            **link_kwargs
        )

    return children_result.map(build_hyperlink)
def bookmark_start(element):
    """Read a w:bookmarkStart element as a bookmark, skipping the "_GoBack" bookmark."""
    name = element.attributes.get("w:name")
    if name != "_GoBack":
        return _success(documents.bookmark(name))
    return _empty_result
def break_(element):
    """Read a w:br element as a line, page or column break.

    A missing type is treated like "textWrapping"; any other unknown type is
    dropped with a warning.
    """
    break_type = element.attributes.get("w:type")
    if not break_type or break_type == "textWrapping":
        return _success(documents.line_break)
    if break_type == "page":
        return _success(documents.page_break)
    if break_type == "column":
        return _success(documents.column_break)
    message = "Unsupported break type: {0}".format(break_type)
    return _empty_result_with_message(results.warning(message))
def inline(element):
    """Read the images referenced by a drawing element.

    The alt text is the wp:docPr description when it is non-blank, otherwise
    the title.
    """
    properties = element.find_child_or_null("wp:docPr").attributes
    has_description = bool(properties.get("descr", "").strip())
    alt_text = properties.get("descr") if has_description else properties.get("title")

    blips = element.find_children("a:graphic")
    for tag_name in ("a:graphicData", "pic:pic", "pic:blipFill", "a:blip"):
        blips = blips.find_children(tag_name)
    return _read_blips(blips, alt_text)
def _read_blips(blips, alt_text):
    """Read every a:blip element with the same alt text and concatenate the results."""
    def read_one(blip):
        return _read_blip(blip, alt_text)

    return _ReadResult.concat(lists.map(read_one, blips))
def _read_blip(element, alt_text):
    """Read one a:blip element as an image, warning when no image file is referenced."""
    blip_image = _find_blip_image(element)
    if blip_image is not None:
        return _read_image(blip_image, alt_text)
    warning = results.warning("Could not find image file for a:blip element")
    return _empty_result_with_message(warning)
def _read_image(image_file, alt_text):
    """Build an image element from an (image_path, open_image) pair.

    A warning is attached when the content type is not one of the listed
    web-friendly image types.
    """
    image_path, open_image = image_file
    content_type = content_types.find_content_type(image_path)
    image = documents.image(alt_text=alt_text, content_type=content_type, open=open_image)

    supported_types = ("image/png", "image/gif", "image/jpeg", "image/svg+xml", "image/tiff")
    messages = []
    if content_type not in supported_types:
        text = "Image of type {0} is unlikely to display in web browsers".format(content_type)
        messages.append(results.warning(text))
    return _element_result_with_messages(image, messages)
def _find_blip_image(element):
    """Locate the image of an a:blip element, preferring r:embed over r:link.

    Returns None when the element has neither attribute.
    """
    embed_relationship_id = element.attributes.get("r:embed")
    if embed_relationship_id is not None:
        return _find_embedded_image(embed_relationship_id)

    link_relationship_id = element.attributes.get("r:link")
    if link_relationship_id is not None:
        return _find_linked_image(link_relationship_id)

    return None
def _find_embedded_image(relationship_id):
    """Return (image_path, open_image) for an image stored inside the docx file."""
    target = relationships.find_target_by_relationship_id(relationship_id)
    image_path = uri_to_zip_entry_name("word", target)

    def open_image():
        image_file = docx_file.open(image_path)
        # Wrap file objects that are not context managers so callers can
        # always use the result in a with-statement.
        if not hasattr(image_file, "__exit__"):
            return contextlib.closing(image_file)
        return image_file

    return image_path, open_image
def _find_linked_image(relationship_id):
    """Return (image_path, open_image) for an image referenced outside the docx file."""
    image_path = relationships.find_target_by_relationship_id(relationship_id)
    return image_path, lambda: files.open(image_path)
def read_imagedata(element):
    """Read a v:imagedata element as an embedded image titled by its o:title attribute."""
    relationship_id = element.attributes.get("r:id")
    if relationship_id is not None:
        title = element.attributes.get("o:title")
        return _read_image(_find_embedded_image(relationship_id), title)
    warning = results.warning("A v:imagedata element without a relationship ID was ignored")
    return _empty_result_with_message(warning)
def note_reference_reader(note_type):
    """Create a handler that reads note references of the given note type."""
    def note_reference(element):
        note_id = element.attributes["w:id"]
        return _success(documents.note_reference(note_type, note_id))

    return note_reference
def read_comment_reference(element):
    """Read a w:commentReference element as a comment reference keyed by w:id."""
    comment_id = element.attributes["w:id"]
    return _success(documents.comment_reference(comment_id))
def alternate_content(element):
    """Read an mc:AlternateContent element by reading the children of its mc:Fallback."""
    fallback = element.find_child_or_null("mc:Fallback")
    return read_child_elements(fallback)
def read_sdt(element):
    """Read a w:sdt (structured document tag) element.

    The contents of w:sdtContent are read as usual. When the tag is a checkbox,
    the first non-empty text node in the content is replaced by a checkbox
    element; if there is no such text node, the checkbox alone is returned.
    """
    content_result = read_child_elements(element.find_child_or_null("w:sdtContent"))
    def handle_content(content):
        # From the WordML standard: https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/3350cb64-931f-41f7-8824-f18b2568ce66
        #
        # > A CT_SdtCheckbox element that specifies that the parent
        # > structured document tag is a checkbox when displayed in the
        # > document. The parent structured document tag contents MUST
        # > contain a single character and optionally an additional
        # > character in a deleted run.
        checkbox = element.find_child_or_null("w:sdtPr").find_child("wordml:checkbox")
        if checkbox is None:
            return content
        checked_element = checkbox.find_child("wordml:checked")
        is_checked = (
            checked_element is not None and
            read_boolean_attribute_value(checked_element.attributes.get("wordml:val"))
        )
        document_checkbox = documents.checkbox(checked=is_checked)
        # Set by transform_text once the first non-empty text has been replaced,
        # so that later text nodes are left untouched.
        has_checkbox = False
        def transform_text(text):
            nonlocal has_checkbox
            if len(text.value) > 0 and not has_checkbox:
                has_checkbox = True
                return document_checkbox
            else:
                return text
        # list() forces the transforms to run now, so has_checkbox is final
        # before it is inspected below.
        replaced_content = list(map(
            transforms.element_of_type(documents.Text, transform_text),
            content,
        ))
        if has_checkbox:
            return replaced_content
        else:
            # Not a list; _ReadResult.map wraps a non-list value in a list.
            return document_checkbox
    return content_result.map(handle_content)
# Dispatch table used by read(): maps an XML element name to the function that
# reads it. Elements mapped to read_child_elements contribute only their
# children. Names missing from this table are handled by read().
handlers = {
    "w:t": text,
    "w:r": run,
    "w:p": paragraph,
    "w:fldChar": read_fld_char,
    "w:instrText": read_instr_text,
    "w:tab": tab,
    "w:noBreakHyphen": no_break_hyphen,
    "w:softHyphen": soft_hyphen,
    "w:sym": symbol,
    "w:tbl": table,
    "w:tr": table_row,
    "w:tc": table_cell,
    "w:ins": read_child_elements,
    "w:object": read_child_elements,
    "w:smartTag": read_child_elements,
    "w:drawing": read_child_elements,
    "v:group": read_child_elements,
    "v:rect": read_child_elements,
    "v:roundrect": read_child_elements,
    "v:shape": read_child_elements,
    "v:textbox": read_child_elements,
    "w:txbxContent": read_child_elements,
    "w:pict": pict,
    "w:hyperlink": hyperlink,
    "w:bookmarkStart": bookmark_start,
    "w:br": break_,
    "wp:inline": inline,
    "wp:anchor": inline,
    "v:imagedata": read_imagedata,
    "w:footnoteReference": note_reference_reader("footnote"),
    "w:endnoteReference": note_reference_reader("endnote"),
    "w:commentReference": read_comment_reference,
    "mc:AlternateContent": alternate_content,
    "w:sdt": read_sdt
}
def read(element):
    """Read one XML element with its registered handler.

    Elements without a handler produce an empty result, plus a warning unless
    the element name is in the ignored set.
    """
    handler = handlers.get(element.name)
    if handler is not None:
        return handler(element)
    if element.name in _ignored_elements:
        return _empty_result
    message = "An unrecognised element was ignored: {0}".format(element.name)
    return _empty_result_with_message(results.warning(message))
def _read_xml_elements(nodes):
    """Read every XmlElement among the given nodes and concatenate the results."""
    elements = (node for node in nodes if isinstance(node, XmlElement))
    return _ReadResult.concat(lists.map(read, elements))
return _read_xml_elements
def _inner_text(node):
    """Return the concatenated text of a node and all of its descendants."""
    if node.node_type != node_types.text:
        return "".join(map(_inner_text, node.children))
    return node.value
class _ReadResult(object):
    """Outcome of reading XML: document elements, extra elements and messages.

    "extra" holds elements kept apart from the main ones until append_extra()
    is called.
    """

    @staticmethod
    def concat(results):
        """Combine several results by concatenating each of their three lists."""
        def gather(attribute_name):
            return lists.flat_map(lambda result: getattr(result, attribute_name), results)

        return _ReadResult(gather("elements"), gather("extra"), gather("messages"))

    @staticmethod
    def map_results(first, second, func):
        """Combine two results into one holding the single value func(first.elements, second.elements)."""
        combined = func(first.elements, second.elements)
        return _ReadResult(
            [combined],
            first.extra + second.extra,
            first.messages + second.messages)

    def __init__(self, elements, extra, messages):
        self.elements = elements
        self.extra = extra
        self.messages = messages

    def map(self, func):
        """Apply func to the elements; a non-list return value is wrapped in a list."""
        mapped = func(self.elements)
        elements = mapped if isinstance(mapped, list) else [mapped]
        return _ReadResult(elements, self.extra, self.messages)

    def flat_map(self, func):
        """Apply func (which returns a result) to the elements and merge its extra and messages."""
        inner = func(self.elements)
        return _ReadResult(
            inner.elements,
            self.extra + inner.extra,
            self.messages + inner.messages)

    def to_extra(self):
        """Move the elements to the end of the extra list, leaving no elements."""
        all_extra = _concat(self.extra, self.elements)
        return _ReadResult([], all_extra, self.messages)

    def append_extra(self):
        """Move the extra elements to the end of the element list, leaving no extra."""
        all_elements = _concat(self.elements, self.extra)
        return _ReadResult(all_elements, [], self.messages)
def _success(elements):
    """Wrap an element, or a list of elements, in a result with no extra and no messages."""
    element_list = elements if isinstance(elements, list) else [elements]
    return _ReadResult(element_list, [], [])
def _element_result_with_messages(element, messages):
    """Build a result holding a single element together with the given messages."""
    single_element = [element]
    return _elements_result_with_messages(single_element, messages)
def _elements_result_with_messages(elements, messages):
    """Build a result holding the given elements and messages, with no extra elements."""
    no_extra = []
    return _ReadResult(elements, no_extra, messages)
# Shared result with no elements, no extra elements and no messages; returned
# by handlers that produce nothing.
_empty_result = _ReadResult([], [], [])
def _empty_result_with_message(message):
    """Build a result that carries one message and no elements."""
    messages = [message]
    return _ReadResult([], [], messages)
def _concat(*values):
    """Return a new list with the items of every given iterable, in order."""
    return [element for value in values for element in value]
def _is_int(value):
    """Tell whether int(value) succeeds; None is never an int."""
    if value is None:
        return False
    try:
        int(value)
    except ValueError:
        return False
    else:
        return True

View File

@@ -0,0 +1,24 @@
from .. import lists
from .. import documents
from .. import results
def read_comments_xml_element(element, body_reader):
    """Read every w:comment child of the element into a combined result of comments.

    Each comment body is read with body_reader; blank author name and initials
    become None.
    """
    def read_comment(comment_element):
        def optional_attribute(name):
            return comment_element.attributes.get(name, "").strip() or None

        def build_comment(body):
            return documents.comment(
                comment_id=comment_element.attributes["w:id"],
                body=body,
                author_name=optional_attribute("w:author"),
                author_initials=optional_attribute("w:initials"),
            )

        return body_reader.read_all(comment_element.children).map(build_comment)

    comment_elements = element.find_children("w:comment")
    return results.combine(lists.map(read_comment, comment_elements))

View File

@@ -0,0 +1,29 @@
class unknown(object):
    """Marker class with no behaviour of its own."""
class Begin(object):
    """Complex field holding the fld_char value it was created with."""

    def __init__(self, *, fld_char):
        self.fld_char = fld_char
def begin(*, fld_char):
    """Create a Begin complex field for the given fld_char."""
    field = Begin(fld_char=fld_char)
    return field
class Hyperlink(object):
    """Complex field holding the kwargs it was created with."""

    def __init__(self, kwargs):
        self.kwargs = kwargs
def hyperlink(kwargs):
    """Create a Hyperlink complex field carrying the given kwargs."""
    field = Hyperlink(kwargs=kwargs)
    return field
class Checkbox(object):
    """Complex field holding the checked value it was created with."""

    def __init__(self, *, checked):
        self.checked = checked
def checkbox(*, checked):
    """Create a Checkbox complex field with the given checked state."""
    field = Checkbox(checked=checked)
    return field

View File

@@ -0,0 +1,58 @@
def read_content_types_xml_element(element):
    """Read the Default and Override children of the element into a _ContentTypes."""
    default_elements = element.find_children("content-types:Default")
    extension_defaults = dict(map(_read_default, default_elements))

    override_elements = element.find_children("content-types:Override")
    overrides = dict(map(_read_override, override_elements))

    return _ContentTypes(extension_defaults, overrides)
def _read_default(element):
    """Return the (extension, content type) pair of a Default element."""
    attributes = element.attributes
    return attributes["Extension"], attributes["ContentType"]
def _read_override(element):
    """Return the (part name without leading slashes, content type) pair of an Override element."""
    attributes = element.attributes
    relative_part_name = attributes["PartName"].lstrip("/")
    return relative_part_name, attributes["ContentType"]
class _ContentTypes(object):
    """Content type lookup by path override, extension default, then known image extension."""

    # Fallback used when the extension has no declared default; keys are
    # lower-case extensions, values are image subtypes.
    _image_content_types = {
        "png": "png",
        "gif": "gif",
        "jpeg": "jpeg",
        "jpg": "jpeg",
        "tif": "tiff",
        "tiff": "tiff",
        "bmp": "bmp",
    }

    def __init__(self, extension_defaults, overrides):
        self._extension_defaults = extension_defaults
        self._overrides = overrides

    def find_content_type(self, path):
        """Return the content type for the path, or None when it cannot be determined."""
        try:
            return self._overrides[path]
        except KeyError:
            pass

        extension = _get_extension(path)
        declared_type = self._extension_defaults.get(extension)
        if declared_type is not None:
            return declared_type

        image_subtype = self._image_content_types.get(extension.lower())
        if image_subtype is None:
            return None
        return "image/" + image_subtype
# Content types with no declared defaults or overrides; lookups fall back to
# the built-in image extension table.
empty_content_types = _ContentTypes({}, {})
def _get_extension(path):
    """Return the text after the last "." in the path, or the whole path when there is none."""
    _, _, extension = path.rpartition(".")
    return extension

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,25 @@
from .. import documents
def read_document_xml_element(
        element,
        body_reader,
        notes=None,
        comments=None):
    """Read the w:body of a document element into a document.

    notes and comments default to empty lists. Raises ValueError when the
    element has no w:body child.
    """
    notes = [] if notes is None else notes
    comments = [] if comments is None else comments

    body_element = element.find_child("w:body")
    if body_element is None:
        raise ValueError("Could not find the body element: are you sure this is a docx file?")

    def build_document(children):
        return documents.document(
            children,
            notes=documents.notes(notes),
            comments=comments
        )

    return body_reader.read_all(body_element.children).map(build_document)

View File

@@ -0,0 +1,46 @@
import os
import contextlib
try:
from urllib2 import urlopen
except ImportError:
from urllib.request import urlopen
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
class Files(object):
    """Opens files referenced from a document, if external file access is enabled."""

    def __init__(self, base, external_file_access):
        self._base = base
        self._external_file_access = external_file_access

    def open(self, uri):
        """Open the URI for reading.

        URIs with a scheme are opened with urlopen; others are opened in binary
        mode relative to the base directory. Raises
        ExternalFileAccessIsDisabledError when access is disabled, and
        InvalidFileReferenceError when there is no base directory or the file
        cannot be opened.
        """
        if not self._external_file_access:
            raise ExternalFileAccessIsDisabledError(
                "could not open external image '{0}', external file access is disabled".format(uri)
            )

        try:
            if _is_absolute(uri):
                return contextlib.closing(urlopen(uri))
            if self._base is None:
                raise InvalidFileReferenceError("could not find external image '{0}', fileobj has no name".format(uri))
            return open(os.path.join(self._base, uri), "rb")
        except IOError as error:
            template = "could not open external image: '{0}' (document directory: '{1}')\n{2}"
            raise InvalidFileReferenceError(template.format(uri, self._base, str(error)))
def _is_absolute(url):
    """Tell whether the URL has a scheme."""
    scheme = urlparse(url).scheme
    return scheme != ""
class InvalidFileReferenceError(ValueError):
    """Raised when a referenced external file cannot be found or opened."""
class ExternalFileAccessIsDisabledError(InvalidFileReferenceError):
    """Raised when an external file is requested while external file access is disabled."""

View File

@@ -0,0 +1,32 @@
import functools
from .. import lists
from .. import documents
from .. import results
def _read_notes(note_type, element, body_reader):
    """Read the "w:<note_type>" children of the element into a combined result of notes.

    Children whose w:type is "separator" or "continuationSeparator" are
    skipped; each remaining note body is read with body_reader.
    """
    def is_note(note_element):
        note_kind = note_element.attributes.get("w:type")
        return note_kind not in ["continuationSeparator", "separator"]

    def read_note(note_element):
        def build_note(body):
            return documents.note(
                note_type=note_type,
                note_id=note_element.attributes["w:id"],
                body=body
            )

        return body_reader.read_all(note_element.children).map(build_note)

    candidates = element.find_children("w:" + note_type)
    note_elements = lists.filter(is_note, candidates)
    return results.combine(lists.map(read_note, note_elements))
# Public readers: _read_notes with the note type fixed, so callers pass only
# (element, body_reader).
read_footnotes_xml_element = functools.partial(_read_notes, "footnote")
read_endnotes_xml_element = functools.partial(_read_notes, "endnote")

View File

@@ -0,0 +1,130 @@
import cobble
from ..documents import numbering_level
from .styles_xml import Styles
def read_numbering_xml_element(element, styles):
    """Read the abstract numberings and numberings of the element into a Numbering."""
    return Numbering(
        abstract_nums=_read_abstract_nums(element),
        nums=_read_nums(element),
        styles=styles,
    )
def _read_abstract_nums(element):
    """Return a dict of abstract numberings keyed by their w:abstractNumId."""
    return dict(
        _read_abstract_num(abstract_num_element)
        for abstract_num_element in element.find_children("w:abstractNum")
    )
def _read_abstract_num(element):
    """Return the (abstract num ID, _AbstractNum) pair of a w:abstractNum element."""
    abstract_num_id = element.attributes.get("w:abstractNumId")
    levels = _read_abstract_num_levels(element)
    style_link_element = element.find_child_or_null("w:numStyleLink")
    abstract_num = _AbstractNum(
        levels=levels,
        num_style_link=style_link_element.attributes.get("w:val"),
    )
    return abstract_num_id, abstract_num
# Abstract numbering definition read from a w:abstractNum element.
@cobble.data
class _AbstractNum(object):
    # Dict of _AbstractNumLevel keyed by level index.
    levels = cobble.field()
    # Value of w:numStyleLink, or None when the element is absent.
    num_style_link = cobble.field()
# One w:lvl entry of an abstract numbering definition.
@cobble.data
class _AbstractNumLevel(object):
    # Value of w:ilvl; may be None until a fallback index is assigned.
    level_index = cobble.field()
    # False only when the level's number format is "bullet".
    is_ordered = cobble.field()
    # Value of the level's w:pStyle, or None.
    paragraph_style_id = cobble.field()
def _read_abstract_num_levels(element):
    """Return a dict of the element's w:lvl levels keyed by level index.

    A level without an index is given index "0", but it never replaces a level
    that declares index "0" explicitly.
    """
    indexed_levels = {}

    # Some malformed documents define numbering levels without an index, and
    # reference the numbering using a w:numPr element without a w:ilvl child.
    # To handle such cases, we assume a level of 0 as a fallback.
    fallback_level = None

    for level_element in element.find_children("w:lvl"):
        level = _read_abstract_num_level(level_element)
        if level.level_index is not None:
            indexed_levels[level.level_index] = level
        else:
            level.level_index = "0"
            fallback_level = level

    if fallback_level is not None:
        indexed_levels.setdefault(fallback_level.level_index, fallback_level)

    return indexed_levels
def _read_abstract_num_level(element):
    """Read a w:lvl element; the level is ordered unless its number format is "bullet"."""
    def child_value(tag_name):
        return element.find_child_or_null(tag_name).attributes.get("w:val")

    level_index = element.attributes.get("w:ilvl")
    is_ordered = child_value("w:numFmt") != "bullet"
    return _AbstractNumLevel(
        level_index=level_index,
        is_ordered=is_ordered,
        paragraph_style_id=child_value("w:pStyle"),
    )
def _read_nums(element):
    """Return a dict of numberings keyed by their w:numId."""
    num_pairs = map(_read_num, element.find_children("w:num"))
    return dict(num_pairs)
def _read_num(element):
    """Return the (num ID, _Num) pair of a w:num element."""
    abstract_num_element = element.find_child_or_null("w:abstractNumId")
    num = _Num(abstract_num_id=abstract_num_element.attributes["w:val"])
    return element.attributes.get("w:numId"), num
# Concrete numbering (w:num) that refers to an abstract numbering definition.
@cobble.data
class _Num(object):
    # Value of the w:abstractNumId child.
    abstract_num_id = cobble.field()
class Numbering(object):
    """Lookup of numbering levels by numbering ID or by paragraph style ID."""

    def __init__(self, abstract_nums, nums, styles):
        """Store the numbering definitions and index levels by paragraph style.

        abstract_nums maps abstract num IDs to abstract numberings, nums maps
        num IDs to numberings, and styles is used to resolve numbering style
        links.
        """
        self._abstract_nums = abstract_nums
        self._levels_by_paragraph_style_id = dict(
            (level.paragraph_style_id, self._to_numbering_level(level))
            for abstract_num in abstract_nums.values()
            for level in abstract_num.levels.values()
            if level.paragraph_style_id is not None
        )
        self._nums = nums
        self._styles = styles

    def find_level(self, num_id, level):
        """Return the numbering level for a num ID and level index, or None.

        None is returned when the num, its abstract num, the level, or a
        linked numbering style cannot be found.
        """
        num = self._nums.get(num_id)
        if num is None:
            return None

        abstract_num = self._abstract_nums.get(num.abstract_num_id)
        if abstract_num is None:
            return None

        if abstract_num.num_style_link is None:
            return self._to_numbering_level(abstract_num.levels.get(level))

        style = self._styles.find_numbering_style_by_id(abstract_num.num_style_link)
        if style is None:
            # The document links to a numbering style that is not defined:
            # treat the level as not found instead of failing on style.num_id.
            return None
        return self.find_level(style.num_id, level)

    def find_level_by_paragraph_style_id(self, style_id):
        """Return the numbering level bound to the paragraph style, or None."""
        return self._levels_by_paragraph_style_id.get(style_id)

    def _to_numbering_level(self, abstract_num_level):
        """Convert an abstract level to a numbering level; None stays None."""
        if abstract_num_level is None:
            return None
        else:
            return numbering_level(
                level_index=abstract_num_level.level_index,
                is_ordered=abstract_num_level.is_ordered,
            )
# Numbering with no definitions; every lookup returns None.
Numbering.EMPTY = Numbering(abstract_nums={}, nums={}, styles=Styles.EMPTY)

View File

@@ -0,0 +1,45 @@
from ..lists import flat_map
from .xmlparser import parse_xml, XmlElement
# (prefix, namespace URI) pairs passed to parse_xml. The transitional and
# strict URIs share the same prefixes.
_namespaces = [
    # Transitional format
    ("w", "http://schemas.openxmlformats.org/wordprocessingml/2006/main"),
    ("r", "http://schemas.openxmlformats.org/officeDocument/2006/relationships"),
    ("wp", "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"),
    ("a", "http://schemas.openxmlformats.org/drawingml/2006/main"),
    ("pic", "http://schemas.openxmlformats.org/drawingml/2006/picture"),
    # Strict format
    ("w", "http://purl.oclc.org/ooxml/wordprocessingml/main"),
    ("r", "http://purl.oclc.org/ooxml/officeDocument/relationships"),
    ("wp", "http://purl.oclc.org/ooxml/drawingml/wordprocessingDrawing"),
    ("a", "http://purl.oclc.org/ooxml/drawingml/main"),
    ("pic", "http://purl.oclc.org/ooxml/drawingml/picture"),
    # Common
    ("content-types", "http://schemas.openxmlformats.org/package/2006/content-types"),
    ("relationships", "http://schemas.openxmlformats.org/package/2006/relationships"),
    ("mc", "http://schemas.openxmlformats.org/markup-compatibility/2006"),
    ("v", "urn:schemas-microsoft-com:vml"),
    ("office-word", "urn:schemas-microsoft-com:office:word"),
    # [MS-DOCX]: Word Extensions to the Office Open XML (.docx) File Format
    # https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/b839fe1f-e1ca-4fa6-8c26-5954d0abbccd
    ("wordml", "http://schemas.microsoft.com/office/word/2010/wordml"),
]
def read(fileobj):
    """Parse the XML in fileobj and return its root with mc:AlternateContent collapsed."""
    root = parse_xml(fileobj, _namespaces)
    collapsed_nodes = _collapse_alternate_content(root)
    return collapsed_nodes[0]
def _collapse_alternate_content(node):
    """Return the list of nodes that should replace the given node.

    An mc:AlternateContent element is replaced by the children of its
    mc:Fallback. Any other element is kept, with its children collapsed in
    place. Non-element nodes are returned unchanged.
    """
    if not isinstance(node, XmlElement):
        return [node]
    if node.name == "mc:AlternateContent":
        return node.find_child_or_null("mc:Fallback").children
    node.children = flat_map(_collapse_alternate_content, node.children)
    return [node]

View File

@@ -0,0 +1,38 @@
import collections
class Relationships(object):
    """Index of relationship targets by relationship ID and by relationship type."""

    def __init__(self, relationships):
        self._targets_by_id = {
            relationship.relationship_id: relationship.target
            for relationship in relationships
        }
        targets_by_type = collections.defaultdict(list)
        for relationship in relationships:
            targets_by_type[relationship.type].append(relationship.target)
        self._targets_by_type = targets_by_type

    def find_target_by_relationship_id(self, key):
        """Return the target for the relationship ID; raises KeyError when unknown."""
        targets = self._targets_by_id
        return targets[key]

    def find_targets_by_type(self, relationship_type):
        """Return the list of targets with the given type (empty when there are none)."""
        targets = self._targets_by_type
        return targets[relationship_type]
# Relationships with no entries.
Relationships.EMPTY = Relationships([])
# Record type for one relationship: its ID, target and type.
Relationship = collections.namedtuple("Relationship", ["relationship_id", "target", "type"])
def read_relationships_xml_element(element):
    """Read the Relationship children of the element into a Relationships index."""
    relationship_elements = element.find_children("relationships:Relationship")
    return Relationships([
        _read_relationship(relationship_element)
        for relationship_element in relationship_elements
    ])
def _read_relationship(element):
    """Build a Relationship from the Id, Target and Type attributes of the element."""
    attributes = element.attributes
    return Relationship(
        relationship_id=attributes["Id"],
        target=attributes["Target"],
        type=attributes["Type"],
    )

View File

@@ -0,0 +1,70 @@
from xml.etree import ElementTree
from ..zips import open_zip, update_zip
# Zip entry that holds the embedded style map, and the same path with a
# leading slash as written into the relationships and content types XML.
_style_map_path = "mammoth/style-map"
_style_map_absolute_path = "/" + _style_map_path
# Zip entries that are updated when a style map is embedded.
_relationships_path = "word/_rels/document.xml.rels"
_content_types_path = "[Content_Types].xml"
def write_style_map(fileobj, style_map):
    """Embed the style map in the docx file.

    Writes the style map entry (UTF-8 encoded) and updates the relationships
    and content types entries so that they reference it.
    """
    with open_zip(fileobj, "r") as zip_file:
        existing_relationships = zip_file.read_str(_relationships_path)
        relationships_xml = _generate_relationships_xml(existing_relationships)
        existing_content_types = zip_file.read_str(_content_types_path)
        content_types_xml = _generate_content_types_xml(existing_content_types)

    updated_entries = {
        _style_map_path: style_map.encode("utf8"),
        _relationships_path: relationships_xml,
        _content_types_path: content_types_xml,
    }
    update_zip(fileobj, updated_entries)
def _generate_relationships_xml(relationships_xml):
    """Return the relationships XML with the style map relationship added or updated."""
    relationships_uri = "http://schemas.openxmlformats.org/package/2006/relationships"
    relationship_element_name = "{" + relationships_uri + "}Relationship"
    style_map_relationship = {
        "Id": "rMammothStyleMap",
        "Type": "http://schemas.zwobble.org/mammoth/style-map",
        "Target": _style_map_absolute_path,
    }

    root = ElementTree.fromstring(relationships_xml)
    _add_or_update_element(root, relationship_element_name, "Id", style_map_relationship)
    return ElementTree.tostring(root, "UTF-8")
def _generate_content_types_xml(content_types_xml):
    """Return the content types XML with the style map override added or updated."""
    content_types_uri = "http://schemas.openxmlformats.org/package/2006/content-types"
    override_name = "{" + content_types_uri + "}Override"
    style_map_override = {
        "PartName": _style_map_absolute_path,
        "ContentType": "text/prs.mammoth.style-map",
    }

    root = ElementTree.fromstring(content_types_xml)
    _add_or_update_element(root, override_name, "PartName", style_map_override)
    return ElementTree.tostring(root, "UTF-8")
def _add_or_update_element(parent, name, identifying_attribute, attributes):
    """Replace the attributes of the matching element, or append a new child element."""
    match = _find_child(parent, name, identifying_attribute, attributes)
    if match is not None:
        match.attrib = attributes
        return
    ElementTree.SubElement(parent, name, attributes)
def _find_child(parent, name, identifying_attribute, attributes):
    """Return the first element under parent (parent included) matching tag and identifying attribute.

    Returns None when nothing matches.
    """
    wanted_value = attributes.get(identifying_attribute)
    matches = (
        candidate
        for candidate in parent.iter()
        if candidate.tag == name and candidate.get(identifying_attribute) == wanted_value
    )
    return next(matches, None)
def read_style_map(fileobj):
    """Return the style map embedded in the docx file, or None when there is none."""
    with open_zip(fileobj, "r") as zip_file:
        if not zip_file.exists(_style_map_path):
            return None
        return zip_file.read_str(_style_map_path)

View File

@@ -0,0 +1,117 @@
import collections
class Styles(object):
    """Lookup of paragraph, character, table and numbering styles by style ID."""

    @staticmethod
    def create(paragraph_styles=None, character_styles=None, table_styles=None, numbering_styles=None):
        """Create a Styles, using an empty dict for every omitted style set."""
        return Styles(
            paragraph_styles={} if paragraph_styles is None else paragraph_styles,
            character_styles={} if character_styles is None else character_styles,
            table_styles={} if table_styles is None else table_styles,
            numbering_styles={} if numbering_styles is None else numbering_styles,
        )

    def __init__(self, paragraph_styles, character_styles, table_styles, numbering_styles):
        self._paragraph_styles = paragraph_styles
        self._character_styles = character_styles
        self._table_styles = table_styles
        self._numbering_styles = numbering_styles

    def find_paragraph_style_by_id(self, style_id):
        """Return the paragraph style with the given ID, or None."""
        known_styles = self._paragraph_styles
        return known_styles.get(style_id)

    def find_character_style_by_id(self, style_id):
        """Return the character style with the given ID, or None."""
        known_styles = self._character_styles
        return known_styles.get(style_id)

    def find_table_style_by_id(self, style_id):
        """Return the table style with the given ID, or None."""
        known_styles = self._table_styles
        return known_styles.get(style_id)

    def find_numbering_style_by_id(self, style_id):
        """Return the numbering style with the given ID, or None."""
        known_styles = self._numbering_styles
        return known_styles.get(style_id)
# Shared instance holding no styles of any kind; every lookup on it returns None.
Styles.EMPTY = Styles(
paragraph_styles={},
character_styles={},
table_styles={},
numbering_styles={},
)
def read_styles_xml_element(element):
    """Read every ``w:style`` child of ``element`` into a ``Styles`` lookup.

    Styles are grouped by their ``w:type`` attribute. Styles of a type other
    than paragraph, character, table or numbering are read but not kept.
    """
    styles_by_type = {
        "paragraph": {},
        "character": {},
        "table": {},
        "numbering": {},
    }

    for style_element in element.find_children("w:style"):
        style_type = style_element.attributes["w:type"]
        if style_type == "numbering":
            style = _read_numbering_style_element(style_element)
        else:
            style = _read_style_element(style_element)

        bucket = styles_by_type.get(style_type)
        if bucket is None:
            continue

        # Per 17.7.4.17 style (Style Definition) of ECMA-376 4th edition Part 1:
        #
        # > If multiple style definitions each declare the same value for their
        # > styleId, then the first such instance shall keep its current
        # > identifier with all other instances being reassigned in any manner
        # > desired.
        #
        # For the purpose of conversion, there's no point holding onto styles
        # with reassigned style IDs, so only the first definition is kept.
        bucket.setdefault(style.style_id, style)

    return Styles(
        paragraph_styles=styles_by_type["paragraph"],
        character_styles=styles_by_type["character"],
        table_styles=styles_by_type["table"],
        numbering_styles=styles_by_type["numbering"],
    )
# A paragraph, character or table style: its ID and the value of its w:name child (None if absent).
Style = collections.namedtuple("Style", ["style_id", "name"])
def _read_style_element(element):
    """Build a ``Style`` from a ``w:style`` element.

    ``name`` is the ``w:val`` of the ``w:name`` child, or ``None`` when the
    child or the attribute is missing.
    """
    name_element = element.find_child_or_null("w:name")
    return Style(
        style_id=_read_style_id(element),
        name=name_element.attributes.get("w:val"),
    )
# A numbering style: its ID and the w:val of its w:pPr/w:numPr/w:numId descendant (None if absent).
NumberingStyle = collections.namedtuple("NumberingStyle", ["style_id", "num_id"])
def _read_numbering_style_element(element):
    """Build a ``NumberingStyle`` from a ``w:style`` element.

    ``num_id`` is read from ``w:pPr/w:numPr/w:numId``; it is ``None`` when
    any step of that path, or the ``w:val`` attribute, is missing.
    """
    style_id = _read_style_id(element)
    paragraph_properties = element.find_child_or_null("w:pPr")
    numbering_properties = paragraph_properties.find_child_or_null("w:numPr")
    num_id_element = numbering_properties.find_child_or_null("w:numId")
    return NumberingStyle(
        style_id=style_id,
        num_id=num_id_element.attributes.get("w:val"),
    )
def _read_style_id(element):
return element.attributes["w:styleId"]

View File

@@ -0,0 +1,12 @@
def uri_to_zip_entry_name(base, uri):
    """Resolve ``uri`` to a zip entry name.

    A URI starting with "/" is absolute and simply loses the leading slash;
    anything else is joined onto ``base`` with a "/".
    """
    if not uri.startswith("/"):
        return base + "/" + uri
    return uri[1:]
def replace_fragment(uri, fragment):
    """Return ``uri`` with everything from its first "#" replaced by ``#fragment``.

    A URI without a fragment just has ``#fragment`` appended.
    """
    without_fragment, _, _ = uri.partition("#")
    return without_fragment + "#" + fragment

View File

@@ -0,0 +1,121 @@
import xml.dom.minidom
import cobble
@cobble.data
class XmlElement(object):
    """An XML element: a name, an attribute mapping and a list of child nodes."""

    name = cobble.field()
    attributes = cobble.field()
    children = cobble.field()

    def find_child_or_null(self, name):
        """Return the first child element called ``name``, else the null element."""
        child = self.find_child(name)
        return child or null_xml_element

    def find_child(self, name):
        """Return the first child element called ``name``, or ``None``."""
        matches = (
            child
            for child in self.children
            if isinstance(child, XmlElement) and child.name == name
        )
        return next(matches, None)

    def find_children(self, name):
        """Return an ``XmlElementList`` of the child elements called ``name``."""
        def is_named_element(child):
            return child.node_type == node_types.element and child.name == name

        return XmlElementList(filter(is_named_element, self.children))
class XmlElementList(object):
    """A re-iterable collection of XML elements."""

    def __init__(self, elements):
        # Materialise the elements: callers may pass a one-shot iterator
        # (the built-in ``filter`` is lazy on Python 3), which would
        # otherwise appear empty on every iteration after the first.
        self._elements = list(elements)

    def __iter__(self):
        return iter(self._elements)

    def find_children(self, name):
        """Return the children called ``name`` of every element, in order."""
        return XmlElementList([
            child
            for element in self._elements
            for child in element.find_children(name)
        ])
class NullXmlElement(object):
    """Stand-in for a missing element so lookups can be chained safely.

    It has no attributes and no children; ``find_child_or_null`` keeps
    returning the same null element.
    """

    attributes = {}
    children = []

    def find_child_or_null(self, name):
        # A missing element has only missing children.
        return self

    def find_child(self, name):
        return None
# Module-level instance handed out whenever a requested child element is missing.
null_xml_element = NullXmlElement()
@cobble.data
class XmlText(object):
    """A text node; ``value`` holds its character data."""

    value = cobble.field()
def element(name, attributes=None, children=None):
    """Create an ``XmlElement``; falsy attributes/children become ``{}``/``[]``."""
    element_attributes = attributes or {}
    element_children = children or []
    return XmlElement(name, element_attributes, element_children)
# Factory alias: ``text(value)`` builds an XmlText node.
text = XmlText
class node_types(object):
    """Integer tags used to tell element nodes from text nodes."""

    element = 1  # assigned to XmlElement.node_type
    text = 3  # assigned to XmlText.node_type
# Tag each node class with its node type so callers can branch on ``node.node_type``.
XmlElement.node_type = node_types.element
XmlText.node_type = node_types.text
def parse_xml(fileobj, namespace_mapping=None):
    """Parse ``fileobj`` with minidom and return its root as an ``XmlElement``.

    ``namespace_mapping`` is an iterable of ``(prefix, uri)`` pairs. Names in
    a mapped namespace are rendered as ``prefix:local``, names in an
    unmapped namespace as ``{uri}local``, and names without a namespace as
    the bare local name. Only element and text nodes are kept, and xmlns
    declaration attributes are dropped.
    """
    # NOTE(review): minidom is not hardened against malicious XML -- confirm
    # callers only pass trusted documents.
    if namespace_mapping is None:
        prefixes_by_uri = {}
    else:
        prefixes_by_uri = {uri: prefix for prefix, uri in namespace_mapping}

    xmlns_uri = "http://www.w3.org/2000/xmlns/"

    def qualified_name(node):
        uri = node.namespaceURI
        if uri is None:
            return node.localName
        prefix = prefixes_by_uri.get(uri)
        if prefix is None:
            return "{%s}%s" % (uri, node.localName)
        return "%s:%s" % (prefix, node.localName)

    def to_element(dom_element):
        name = qualified_name(dom_element)
        attributes = dict(
            (qualified_name(attribute), attribute.value)
            for attribute in dom_element.attributes.values()
            if attribute.namespaceURI != xmlns_uri
        )
        children = []
        for dom_child in dom_element.childNodes:
            child = to_node(dom_child)
            if child is not None:
                children.append(child)
        return XmlElement(name, attributes, children)

    def to_node(dom_node):
        node_type = dom_node.nodeType
        if node_type == xml.dom.Node.ELEMENT_NODE:
            return to_element(dom_node)
        if node_type == xml.dom.Node.TEXT_NODE:
            return XmlText(dom_node.nodeValue)
        # Comments, processing instructions and the like are ignored.
        return None

    document = xml.dom.minidom.parse(fileobj)
    return to_node(document.documentElement)

View File

@@ -0,0 +1,135 @@
from ..lists import flat_map
from .nodes import TextNode, Tag, Element, ForceWrite, NodeVisitor
def text(value):
    """Create a text node holding ``value``."""
    node = TextNode(value)
    return node
def tag(tag_names, attributes=None, collapsible=None, separator=None):
    """Create a ``Tag``.

    A single tag name is wrapped in a list, ``attributes`` defaults to an
    empty dict, and ``collapsible`` is coerced to a bool.
    """
    names = tag_names if isinstance(tag_names, list) else [tag_names]
    return Tag(
        tag_names=names,
        attributes={} if attributes is None else attributes,
        collapsible=bool(collapsible),
        separator=separator,
    )
def element(tag_names, attributes=None, children=None, collapsible=None, separator=None):
    """Create an ``Element`` from tag details and optional children."""
    return Element(
        tag(
            tag_names=tag_names,
            attributes=attributes,
            collapsible=collapsible,
            separator=separator,
        ),
        [] if children is None else children,
    )
def collapsible_element(tag_names, attributes=None, children=None):
    """Create an ``Element`` whose tag is marked as collapsible."""
    collapsible = element(tag_names, attributes, children, collapsible=True)
    return collapsible
# Marker node: stripping keeps it (so its parent is not treated as empty) and the writer emits nothing for it.
force_write = ForceWrite()
def strip_empty(nodes):
    """Return ``nodes`` without empty text nodes and empty non-void elements."""
    stripped = flat_map(_strip_empty_node, nodes)
    return stripped
def _strip_empty_node(node):
    """Return ``[stripped node]``, or ``[]`` when the node is empty."""
    visitor = StripEmpty()
    return visitor.visit(node)
class StripEmpty(NodeVisitor):
    """Visitor mapping a node to a list: ``[node]`` to keep it, ``[]`` to drop it."""

    def visit_text_node(self, node):
        # Text is kept only when it is non-empty.
        return [node] if node.value else []

    def visit_element(self, element):
        kept_children = strip_empty(element.children)
        # Childless elements are dropped unless they are void (e.g. <br>).
        if kept_children or element.is_void():
            return [Element(element.tag, kept_children)]
        return []

    def visit_force_write(self, node):
        return [node]
def collapse(nodes):
    """Return a new list in which adjacent matching collapsible elements are merged."""
    merged = []
    for current in nodes:
        _collapsing_add(merged, current)
    return merged
class _CollapseNode(NodeVisitor):
    """Visitor that collapses the children of elements; other nodes pass through."""

    def visit_text_node(self, node):
        return node

    def visit_element(self, element):
        collapsed_children = collapse(element.children)
        return Element(element.tag, collapsed_children)

    def visit_force_write(self, node):
        return node
# Bound ``visit`` of a single shared _CollapseNode, used as a plain function on one node.
_collapse_node = _CollapseNode().visit
def _collapsing_add(collapsed, node):
    """Collapse ``node``, then merge it into the tail of ``collapsed`` or append it."""
    candidate = _collapse_node(node)
    if _try_collapse(collapsed, candidate):
        return
    collapsed.append(candidate)
def _try_collapse(collapsed, node):
    """Try to merge ``node`` into the last entry of ``collapsed``.

    Merging needs both to be elements, ``node`` to be collapsible, and the
    two to match. On success the separator (if any) and then the children
    of ``node`` are added to the last element, and ``True`` is returned;
    otherwise nothing changes and ``False`` is returned.
    """
    if not collapsed:
        return False

    previous = collapsed[-1]
    both_elements = isinstance(previous, Element) and isinstance(node, Element)
    if not (both_elements and node.collapsible and _is_match(previous, node)):
        return False

    if node.separator:
        previous.children.append(text(node.separator))
    for child in node.children:
        # Children are merged recursively so nested runs collapse as well.
        _collapsing_add(previous.children, child)
    return True
def _is_match(first, second):
return first.tag_name in second.tag_names and first.attributes == second.attributes
def write(writer, nodes):
    """Emit every node in ``nodes``, in order, through ``writer``."""
    _NodeWriter(writer).visit_all(nodes)
class _NodeWriter(NodeVisitor):
    """Visitor that serialises nodes by calling methods on a writer."""

    def __init__(self, writer):
        self._writer = writer

    def visit_text_node(self, node):
        self._writer.text(node.value)

    def visit_element(self, element):
        name = element.tag_name
        attributes = element.attributes
        if element.is_void():
            self._writer.self_closing(name, attributes)
            return
        self._writer.start(name, attributes)
        self.visit_all(element.children)
        self._writer.end(name)

    def visit_force_write(self, element):
        # Force-write markers produce no output of their own.
        pass

    def visit_all(self, nodes):
        for node in nodes:
            self.visit(node)

View File

@@ -0,0 +1,61 @@
import cobble
class Node(object):
    """Common base class of the HTML node types; also the root of NodeVisitor."""
@cobble.data
class TextNode(Node):
    """A run of text; ``value`` is the unescaped string."""

    value = cobble.field()
@cobble.data
class Tag(object):
    """Description of an HTML tag, without children."""

    tag_names = cobble.field()
    attributes = cobble.field()
    collapsible = cobble.field()
    separator = cobble.field()

    @property
    def tag_name(self):
        """The primary tag name: the first entry of ``tag_names``."""
        primary = self.tag_names[0]
        return primary
@cobble.data
class Element(Node):
    """An HTML element: a ``Tag`` plus a list of child nodes.

    The tag's fields are exposed as read-only properties for convenience.
    """

    tag = cobble.field()
    children = cobble.field()

    @property
    def tag_name(self):
        tag = self.tag
        return tag.tag_name

    @property
    def tag_names(self):
        tag = self.tag
        return tag.tag_names

    @property
    def attributes(self):
        tag = self.tag
        return tag.attributes

    @property
    def collapsible(self):
        tag = self.tag
        return tag.collapsible

    @property
    def separator(self):
        tag = self.tag
        return tag.separator

    _VOID_TAG_NAMES = {"br", "hr", "img", "input"}

    def is_void(self):
        """Return whether this is a childless element with a void tag name."""
        if self.children:
            return False
        return self.tag_name in self._VOID_TAG_NAMES
@cobble.visitable
class ForceWrite(Node):
    """Marker node with no data of its own."""
# Visitor base class generated by cobble for the Node hierarchy; subclasses define the visit_* methods.
NodeVisitor = cobble.visitor(Node)

View File

@@ -0,0 +1,58 @@
import cobble
from . import html
def path(elements):
    """Create an ``HtmlPath`` from a list of path elements, outermost first."""
    html_path = HtmlPath(elements)
    return html_path
def element(names, attributes=None, class_names=None, fresh=None, separator=None):
    """Create an ``HtmlPathElement``.

    ``names`` is the tag name, or list of acceptable tag names.
    ``class_names``, when non-empty, are joined with spaces and stored as
    the ``class`` attribute, replacing any ``class`` already present in
    ``attributes``. A fresh element is never collapsed into a preceding
    one; ``fresh`` defaults to ``False``.

    The ``attributes`` mapping passed by the caller is never modified.
    """
    if attributes is None:
        attributes = {}
    if fresh is None:
        fresh = False
    if class_names:
        # Work on a copy so that adding "class" does not leak back into the
        # caller's dict.
        attributes = dict(attributes)
        attributes["class"] = " ".join(class_names)

    return HtmlPathElement(html.tag(
        tag_names=names,
        attributes=attributes,
        collapsible=not fresh,
        separator=separator,
    ))
@cobble.data
class HtmlPath(object):
    """A sequence of path elements, outermost first."""

    elements = cobble.field()

    def wrap(self, generate_nodes):
        """Generate nodes and nest them inside every element of the path."""
        wrapped = generate_nodes()
        # Wrap from the innermost element outwards.
        for path_element in reversed(self.elements):
            wrapped = path_element.wrap_nodes(wrapped)
        return wrapped
@cobble.data
class HtmlPathElement(object):
    """A single step of an HTML path, described by its tag."""

    tag = cobble.field()

    def wrap(self, generate_nodes):
        """Generate nodes and wrap them in this element."""
        nodes = generate_nodes()
        return self.wrap_nodes(nodes)

    def wrap_nodes(self, nodes):
        """Return a one-item list holding an element with ``nodes`` as children."""
        return [html.Element(self.tag, nodes)]
# Path with no elements: wrapping with it returns the generated nodes unchanged.
empty = path([])
class ignore(object):
    """Path-like object that discards content instead of wrapping it."""

    @staticmethod
    def wrap(generate_nodes):
        # The generator is deliberately never called.
        return list()

View File

@@ -0,0 +1,28 @@
import base64
from . import html
def img_element(func):
    """Turn ``func(image) -> attributes`` into an image converter.

    The converter returns a one-item list holding an ``img`` element. Its
    attributes start with ``alt`` (when the image has alt text) and are then
    updated with whatever ``func`` returns, so ``func`` may override ``alt``.
    """
    def convert_image(image):
        attributes = {"alt": image.alt_text} if image.alt_text else {}
        attributes.update(func(image))
        return [html.element("img", attributes)]

    return convert_image
# Undocumented alias of img_element, retained for backwards-compatibility with 0.3.x
inline = img_element
@img_element
def data_uri(image):
    """Return a ``src`` attribute embedding the image as a base64 data URI."""
    with image.open() as image_bytes:
        raw = image_bytes.read()
    encoded_src = base64.b64encode(raw).decode("ascii")
    src = "data:{0};base64,{1}".format(image.content_type, encoded_src)
    return {"src": src}

View File

@@ -0,0 +1,40 @@
import sys
def flatten(values):
    """Concatenate an iterable of iterables into one list (one level deep)."""
    return [
        element
        for value in values
        for element in value
    ]
def unique(values):
    """Return the distinct items of ``values`` in order of first appearance.

    Items must be hashable.
    """
    already_seen = set()
    distinct = []
    for item in values:
        if item in already_seen:
            continue
        already_seen.add(item)
        distinct.append(item)
    return distinct
def flat_map(func, values):
    """Apply ``func`` to each value and concatenate the resulting iterables."""
    flattened = []
    for value in values:
        flattened.extend(func(value))
    return flattened
def find_index(predicate, values):
    """Return the index of the first value satisfying ``predicate``, else ``None``."""
    matching_indexes = (
        index
        for index, value in enumerate(values)
        if predicate(value)
    )
    return next(matching_indexes, None)
# ``map`` and ``filter`` that always return lists: on Python 2 the built-ins
# already do; on later versions the lazy built-ins are wrapped.
if sys.version_info[0] != 2:
    import builtins

    def map(*args, **kwargs):
        lazy = builtins.map(*args, **kwargs)
        return list(lazy)

    def filter(*args, **kwargs):
        lazy = builtins.filter(*args, **kwargs)
        return list(lazy)
else:
    map = map
    filter = filter

View File

@@ -0,0 +1,101 @@
from .styles.parser import read_style_mapping
from . import lists, results
def read_options(options):
    """Normalise conversion options; return them wrapped in a result.

    ``options`` is modified in place: the raw ``style_map``,
    ``embedded_style_map`` and ``include_default_style_map`` entries are
    removed, ``style_map`` is set to the parsed mappings (custom first, then
    embedded, then the defaults unless disabled) and
    ``ignore_empty_paragraphs`` defaults to ``True``. The result carries any
    messages produced while parsing the style maps.
    """
    custom_text = options.pop("style_map", "") or ""
    embedded_text = options.pop("embedded_style_map", "") or ""
    use_defaults = options.pop("include_default_style_map", True)

    parsed = results.combine([
        _read_style_map(custom_text),
        _read_style_map(embedded_text),
    ])
    custom_style_map, embedded_style_map = parsed.value

    # Order matters: earlier mappings take precedence over later ones.
    style_map = custom_style_map + embedded_style_map
    if use_defaults:
        style_map = style_map + _default_style_map

    options.setdefault("ignore_empty_paragraphs", True)
    options["style_map"] = style_map
    return parsed.map(lambda _: options)
def _read_style_map(style_text):
    """Parse style-map text, one mapping per line, into a result of mappings.

    Blank lines and ``#`` comment lines are skipped. Mappings that fail to
    parse are left out of the value.
    """
    candidate_lines = (_get_line(raw_line) for raw_line in style_text.split("\n"))
    lines = [line for line in candidate_lines if line]
    parsed_lines = results.combine(lists.map(read_style_mapping, lines))
    return parsed_lines.map(
        lambda style_mappings: lists.filter(None, style_mappings)
    )
def _get_line(line):
line = line.strip()
if line.startswith("#"):
return None
else:
return line
# Built-in style map, parsed once at import time. Lines starting with "#"
# inside the string are style-map comments, skipped by _get_line.
_default_style_map_result = _read_style_map("""
p.Heading1 => h1:fresh
p.Heading2 => h2:fresh
p.Heading3 => h3:fresh
p.Heading4 => h4:fresh
p.Heading5 => h5:fresh
p.Heading6 => h6:fresh
p[style-name='Heading 1'] => h1:fresh
p[style-name='Heading 2'] => h2:fresh
p[style-name='Heading 3'] => h3:fresh
p[style-name='Heading 4'] => h4:fresh
p[style-name='Heading 5'] => h5:fresh
p[style-name='Heading 6'] => h6:fresh
p[style-name='heading 1'] => h1:fresh
p[style-name='heading 2'] => h2:fresh
p[style-name='heading 3'] => h3:fresh
p[style-name='heading 4'] => h4:fresh
p[style-name='heading 5'] => h5:fresh
p[style-name='heading 6'] => h6:fresh
# Apple Pages
p.Heading => h1:fresh
p[style-name='Heading'] => h1:fresh
r[style-name='Strong'] => strong
p[style-name='footnote text'] => p:fresh
r[style-name='footnote reference'] =>
p[style-name='endnote text'] => p:fresh
r[style-name='endnote reference'] =>
p[style-name='annotation text'] => p:fresh
r[style-name='annotation reference'] =>
# LibreOffice
p[style-name='Footnote'] => p:fresh
r[style-name='Footnote anchor'] =>
p[style-name='Endnote'] => p:fresh
r[style-name='Endnote anchor'] =>
p:unordered-list(1) => ul > li:fresh
p:unordered-list(2) => ul|ol > li > ul > li:fresh
p:unordered-list(3) => ul|ol > li > ul|ol > li > ul > li:fresh
p:unordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh
p:unordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh
p:ordered-list(1) => ol > li:fresh
p:ordered-list(2) => ul|ol > li > ol > li:fresh
p:ordered-list(3) => ul|ol > li > ul|ol > li > ol > li:fresh
p:ordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh
p:ordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh
r[style-name='Hyperlink'] =>
p[style-name='Normal'] => p:fresh
# Apple Pages
p.Body => p:fresh
p[style-name='Body'] => p:fresh
""")
# Sanity check: every built-in mapping must parse without warnings.
assert not _default_style_map_result.messages
# The parsed default mappings, appended to user mappings by read_options.
_default_style_map = _default_style_map_result.value

View File

@@ -0,0 +1,14 @@
from . import documents
def extract_raw_text_from_element(element):
    """Return the plain text of ``element`` and its descendants.

    Text nodes contribute their value, tabs a tab character, and every
    paragraph is followed by two newlines. Elements without a ``children``
    attribute contribute nothing.
    """
    if isinstance(element, documents.Text):
        return element.value
    if isinstance(element, documents.Tab):
        return "\t"

    children = getattr(element, "children", [])
    text = "".join(extract_raw_text_from_element(child) for child in children)
    if isinstance(element, documents.Paragraph):
        return text + "\n\n"
    return text

View File

@@ -0,0 +1,42 @@
import collections
from .lists import unique
class Result(object):
    """A value together with the de-duplicated messages produced computing it."""

    def __init__(self, value, messages):
        self.value = value
        self.messages = unique(messages)

    def map(self, func):
        """Return a result holding ``func(value)`` and the same messages."""
        mapped_value = func(self.value)
        return Result(mapped_value, self.messages)

    def bind(self, func):
        """Chain ``func``, which must return a result, merging both message lists."""
        inner = func(self.value)
        merged_messages = self.messages + inner.messages
        return Result(inner.value, merged_messages)
# A message attached to a Result: ``type`` is a category string (e.g. "warning"), ``message`` the text.
Message = collections.namedtuple("Message", ["type", "message"])
def warning(message):
    """Create a ``Message`` of type ``"warning"``."""
    return Message(type="warning", message=message)
def success(value):
    """Wrap ``value`` in a ``Result`` with no messages."""
    no_messages = []
    return Result(value, no_messages)
def combine(results):
    """Merge results into one whose value is the list of their values.

    Messages are concatenated in order.
    """
    all_values = []
    all_messages = []
    for item in results:
        all_values.append(item.value)
        all_messages.extend(item.messages)
    return Result(all_values, all_messages)
def map(func, *args):
    """Apply ``func`` to the values of the given results, keeping all messages."""
    combined = combine(args)
    return combined.map(lambda values: func(*values))

View File

@@ -0,0 +1,8 @@
import collections
def style(document_matcher, html_path):
    """Create a ``Style`` pairing a document matcher with an HTML path."""
    return Style(document_matcher=document_matcher, html_path=html_path)
# One style mapping: the matcher selecting document elements and the HTML path they are mapped to.
Style = collections.namedtuple("Style", ["document_matcher", "html_path"])

View File

@@ -0,0 +1,14 @@
from .errors import LineParseError
from .style_mapping_parser import parse_style_mapping
from .tokeniser import tokenise
from .token_iterator import TokenIterator
from ... import results
def read_style_mapping(string):
    """Parse one style-mapping line into a result.

    A line that cannot be parsed yields a result whose value is ``None``
    and which carries a single warning quoting the line.
    """
    try:
        mapping = parse_style_mapping(TokenIterator(tokenise(string)))
    except LineParseError:
        message = "Did not understand this style mapping, so ignored it: " + string
        return results.Result(None, [results.warning(message)])
    return results.success(mapping)

View File

@@ -0,0 +1,130 @@
from ... import documents, document_matchers
from .errors import LineParseError
from .tokeniser import TokenType
from .token_parser import try_parse_class_name, parse_string
def parse_document_matcher(tokens):
    """Parse the left-hand side of a style mapping into a document matcher.

    The leading identifier selects the matcher kind. ``p``, ``r`` and
    ``table`` accept an optional ``.style-id`` and ``[style-name...]``;
    ``p`` additionally accepts a list selector. Raises ``LineParseError``
    for an unrecognised element.
    """
    def skip_identifier(value):
        return tokens.try_skip(TokenType.IDENTIFIER, value)

    if skip_identifier("p"):
        style_id = try_parse_class_name(tokens)
        style_name = _parse_style_name(tokens)
        numbering = _parse_numbering(tokens)
        return document_matchers.paragraph(
            style_id=style_id,
            style_name=style_name,
            numbering=numbering,
        )

    if skip_identifier("r"):
        style_id = try_parse_class_name(tokens)
        style_name = _parse_style_name(tokens)
        return document_matchers.run(
            style_id=style_id,
            style_name=style_name,
        )

    if skip_identifier("table"):
        style_id = try_parse_class_name(tokens)
        style_name = _parse_style_name(tokens)
        return document_matchers.table(
            style_id=style_id,
            style_name=style_name,
        )

    if skip_identifier("b"):
        return document_matchers.bold
    if skip_identifier("i"):
        return document_matchers.italic
    if skip_identifier("u"):
        return document_matchers.underline
    if skip_identifier("strike"):
        return document_matchers.strikethrough
    if skip_identifier("all-caps"):
        return document_matchers.all_caps
    if skip_identifier("small-caps"):
        return document_matchers.small_caps
    if skip_identifier("highlight"):
        return _parse_highlight(tokens)
    if skip_identifier("comment-reference"):
        return document_matchers.comment_reference
    if skip_identifier("br"):
        return _parse_break(tokens)

    raise LineParseError("Unrecognised document element: {0}".format(tokens.next_value(TokenType.IDENTIFIER)))
def _parse_style_name(tokens):
    """Parse an optional ``[style-name<op>'...']`` selector.

    Returns the string matcher, or ``None`` when the next token is not "[".
    """
    if not tokens.try_skip(TokenType.SYMBOL, "["):
        return None
    tokens.skip(TokenType.IDENTIFIER, "style-name")
    matcher = _parse_string_matcher(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    return matcher
def _parse_string_matcher(tokens):
    """Parse ``='...'`` (exact match) or ``^='...'`` (prefix match).

    Raises ``LineParseError`` for any other operator.
    """
    if tokens.try_skip(TokenType.SYMBOL, "="):
        return document_matchers.equal_to(parse_string(tokens))
    if tokens.try_skip(TokenType.SYMBOL, "^="):
        return document_matchers.starts_with(parse_string(tokens))
    raise LineParseError("Unrecognised string matcher: {0}".format(tokens.next_value()))
def _parse_numbering(tokens):
    """Parse an optional ``:ordered-list(n)`` / ``:unordered-list(n)`` selector.

    Returns ``None`` when the next token is not ":". The level written in
    the style map is 1-based and is converted to a 0-based level.
    """
    if not tokens.try_skip(TokenType.SYMBOL, ":"):
        return None
    is_ordered = _parse_list_type(tokens)
    tokens.skip(TokenType.SYMBOL, "(")
    one_based_level = int(tokens.next_value(TokenType.INTEGER))
    tokens.skip(TokenType.SYMBOL, ")")
    return documents.numbering_level(one_based_level - 1, is_ordered=is_ordered)
def _parse_list_type(tokens):
    """Return ``True`` for ``ordered-list`` and ``False`` for ``unordered-list``.

    Raises ``LineParseError`` for any other identifier.
    """
    list_type = tokens.next_value(TokenType.IDENTIFIER)
    if list_type == "ordered-list":
        return True
    if list_type == "unordered-list":
        return False
    raise LineParseError("Unrecognised list type: {0}".format(list_type))
def _parse_highlight(tokens):
    """Parse the optional ``[color='...']`` after ``highlight``.

    Without the selector the matcher is built with ``color=None``.
    """
    color = None
    if tokens.try_skip(TokenType.SYMBOL, "["):
        tokens.skip(TokenType.IDENTIFIER, "color")
        tokens.skip(TokenType.SYMBOL, "=")
        color = parse_string(tokens)
        tokens.skip(TokenType.SYMBOL, "]")
    return document_matchers.highlight(color=color)
def _parse_break(tokens):
    """Parse the mandatory ``[type='...']`` after ``br``.

    Accepts ``line``, ``page`` and ``column``; raises ``LineParseError``
    for any other break type.
    """
    tokens.skip(TokenType.SYMBOL, "[")
    tokens.skip(TokenType.IDENTIFIER, "type")
    tokens.skip(TokenType.SYMBOL, "=")
    type_name = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, "]")

    if type_name == "line":
        return document_matchers.line_break
    if type_name == "page":
        return document_matchers.page_break
    if type_name == "column":
        return document_matchers.column_break
    raise LineParseError("Unrecognised break type: {0}".format(type_name))

View File

@@ -0,0 +1,2 @@
class LineParseError(Exception):
    """Raised when a style-mapping line cannot be parsed."""

View File

@@ -0,0 +1,120 @@
import cobble
from ... import html_paths
from .tokeniser import TokenType
from .token_parser import parse_identifier, parse_string
@cobble.data
class _AttributeOrClassName(object):
    """One parsed ``[name='value']`` attribute or ``.class-name`` selector."""

    name = cobble.field()
    value = cobble.field()
    # True when the value is appended (space-separated) to an existing
    # non-empty value rather than replacing it.
    append = cobble.field()
def parse_html_path(tokens):
    """Parse the right-hand side of a style mapping.

    "!" yields the ignore path; anything else is parsed as a (possibly
    empty) sequence of path elements.
    """
    if not tokens.try_skip(TokenType.SYMBOL, "!"):
        return html_paths.path(_parse_html_path_elements(tokens))
    return html_paths.ignore
def _parse_html_path_elements(tokens):
    """Parse ``element ( > element)*`` and return the elements as a list.

    Returns an empty list when the next token is not an identifier.
    """
    if tokens.peek_token_type() != TokenType.IDENTIFIER:
        return []

    parsed = [_parse_element(tokens)]
    child_separator = ((TokenType.WHITESPACE, None), (TokenType.SYMBOL, ">"))
    while tokens.try_skip_many(child_separator):
        # Whitespace is required on both sides of ">".
        tokens.skip(TokenType.WHITESPACE)
        parsed.append(_parse_element(tokens))
    return parsed
def _parse_element(tokens):
    """Parse one path element: tag names, attributes/classes, ``:fresh``, separator."""
    tag_names = _parse_tag_names(tokens)
    parsed_attributes = _parse_attribute_or_class_names(tokens)
    is_fresh = _parse_is_fresh(tokens)
    separator = _parse_separator(tokens)

    attributes = {}
    for parsed in parsed_attributes:
        current = attributes.get(parsed.name) if parsed.append else None
        if current:
            # Class names accumulate, separated by spaces.
            attributes[parsed.name] = current + " " + parsed.value
        else:
            attributes[parsed.name] = parsed.value

    return html_paths.element(
        tag_names,
        attributes=attributes,
        fresh=is_fresh,
        separator=separator,
    )
def _parse_tag_names(tokens):
    """Parse ``name(|name)*`` and return the names as a list."""
    names = []
    names.append(parse_identifier(tokens))
    while tokens.try_skip(TokenType.SYMBOL, "|"):
        names.append(parse_identifier(tokens))
    return names
def _parse_attribute_or_class_names(tokens):
    """Parse consecutive attribute/class selectors until none is left."""
    parsed = []
    item = _try_parse_attribute_or_class_name(tokens)
    while item is not None:
        parsed.append(item)
        item = _try_parse_attribute_or_class_name(tokens)
    return parsed
def _try_parse_attribute_or_class_name(tokens):
    """Parse an attribute ("[") or class (".") selector, else return ``None``."""
    if tokens.is_next(TokenType.SYMBOL, "["):
        return _parse_attribute(tokens)
    elif tokens.is_next(TokenType.SYMBOL, "."):
        return _parse_class_name(tokens)
    return None
def _parse_attribute(tokens):
    """Parse ``[name='value']`` into a replacing attribute selector."""
    tokens.skip(TokenType.SYMBOL, "[")
    attribute_name = parse_identifier(tokens)
    tokens.skip(TokenType.SYMBOL, "=")
    attribute_value = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    return _AttributeOrClassName(
        name=attribute_name,
        value=attribute_value,
        append=False,
    )
def _parse_class_name(tokens):
    """Parse ``.class-name`` into an appending ``class`` attribute selector."""
    tokens.skip(TokenType.SYMBOL, ".")
    return _AttributeOrClassName(
        name="class",
        value=parse_identifier(tokens),
        append=True,
    )
def _parse_is_fresh(tokens):
    """Consume ``:fresh`` if present and return whether it was there."""
    fresh_suffix = (
        (TokenType.SYMBOL, ":"),
        (TokenType.IDENTIFIER, "fresh"),
    )
    return tokens.try_skip_many(fresh_suffix)
def _parse_separator(tokens):
    """Parse an optional ``:separator('...')``; return the string or ``None``."""
    separator_prefix = (
        (TokenType.SYMBOL, ":"),
        (TokenType.IDENTIFIER, "separator"),
    )
    if not tokens.try_skip_many(separator_prefix):
        return None
    tokens.skip(TokenType.SYMBOL, "(")
    separator = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, ")")
    return separator

View File

@@ -0,0 +1,15 @@
from .tokeniser import TokenType
from .document_matcher_parser import parse_document_matcher
from .html_path_parser import parse_html_path
from ...styles import Style
def parse_style_mapping(tokens):
    """Parse ``<document matcher> => <html path>`` into a ``Style``.

    Whitespace is required before "=>" and optional after it, and the whole
    line must be consumed.
    """
    matcher = parse_document_matcher(tokens)

    tokens.skip(TokenType.WHITESPACE)
    tokens.skip(TokenType.SYMBOL, "=>")
    tokens.try_skip(TokenType.WHITESPACE)

    path = parse_html_path(tokens)

    # Anything left over means the line is malformed.
    tokens.skip(TokenType.END)
    return Style(document_matcher=matcher, html_path=path)

View File

@@ -0,0 +1,59 @@
# TODO: check indices
# TODO: proper tests for unexpected tokens
from .errors import LineParseError
class TokenIterator(object):
    """Cursor over a list of tokens with ``type`` and ``value`` attributes.

    The cursor never checks for running off the end of the list; it reads
    ``tokens[index]`` directly, so callers must stop once the final token
    has been consumed.
    """

    def __init__(self, tokens):
        self._tokens = tokens
        self._index = 0

    def peek_token_type(self):
        """Return the type of the current token without consuming it."""
        return self._tokens[self._index].type

    def next_value(self, token_type=None):
        """Consume the current token and return its value.

        Raises ``LineParseError`` if ``token_type`` is given and the token
        is of a different type.
        """
        return self._next(token_type).value

    def _next(self, token_type=None):
        token = self._tokens[self._index]
        if token_type is not None and token.type != token_type:
            raise self._unexpected_token_type(token_type, token)
        self._index += 1
        return token

    def skip(self, token_type, token_value=None):
        """Consume the current token, which must match; return ``True``.

        Raises ``LineParseError`` if the token does not match.
        """
        if not self.is_next(token_type, token_value):
            raise self._unexpected_token_type(token_type, self._tokens[self._index])
        self._index += 1
        return True

    def try_skip(self, token_type, token_value=None):
        """Consume the current token if it matches; return whether it did."""
        if not self.is_next(token_type, token_value):
            return False
        self._index += 1
        return True

    def try_skip_many(self, tokens):
        """Consume a run of ``(type, value)`` matches, all or nothing.

        If any token in the run fails to match, the cursor is restored to
        where it started and ``False`` is returned.
        """
        start = self._index
        for token_type, token_value in tokens:
            if not self.is_next(token_type, token_value):
                self._index = start
                return False
            self._index += 1
        return True

    def is_next(self, token_type, token_value=None):
        """Return whether the current token has the given type (and value).

        A ``token_value`` of ``None`` matches any value.
        """
        token = self._tokens[self._index]
        return token.type == token_type and (token_value is None or token.value == token_value)

    def _unexpected_token_type(self, token_type, token):
        # Return (rather than raise) the error so that the ``raise`` at each
        # call site is the statement that actually raises, and describe
        # what was found to make failures diagnosable.
        return LineParseError(
            "Expected token of type {0} but got {1} token {2!r}".format(
                token_type, token.type, token.value
            )
        )

View File

@@ -0,0 +1,37 @@
import re
from .tokeniser import TokenType
def try_parse_class_name(tokens):
    """Parse an optional ``.identifier``; return the identifier or ``None``."""
    if not tokens.try_skip(TokenType.SYMBOL, "."):
        return None
    return parse_identifier(tokens)
def parse_identifier(tokens):
    """Consume an identifier token and return it with escapes decoded."""
    raw_identifier = tokens.next_value(TokenType.IDENTIFIER)
    return decode_escape_sequences(raw_identifier)
def parse_string(tokens):
    """Consume a string token and return its contents with escapes decoded."""
    quoted = tokens.next_value(TokenType.STRING)
    # Drop the first and last character (the surrounding quotes).
    unquoted = quoted[1:-1]
    return decode_escape_sequences(unquoted)
# A backslash followed by any single character other than a newline; group 1 is that character.
_ESCAPE_SEQUENCE_REGEX = re.compile(r"\\(.)")
def decode_escape_sequences(value):
    """Return ``value`` with every backslash escape replaced by what it denotes."""
    decoded = _ESCAPE_SEQUENCE_REGEX.sub(_decode_escape_sequence, value)
    return decoded
def _decode_escape_sequence(match):
code = match.group(1)
if code == "n":
return "\n"
elif code == "r":
return "\r"
elif code == "t":
return "\t"
else:
return code

View File

@@ -0,0 +1,61 @@
import collections
import re
# A lexed token: the index in the input where it starts, its TokenType, and the matched text.
Token = collections.namedtuple("Token", ["character_index", "type", "value"])
class TokenType(object):
    """String constants naming the kinds of token the tokeniser emits."""

    IDENTIFIER = "identifier"
    SYMBOL = "symbol"
    WHITESPACE = "whitespace"
    STRING = "string"
    # A string literal whose closing quote is missing.
    UNTERMINATED_STRING = "unterminated string"
    INTEGER = "integer"
    # Sentinel appended after the last real token.
    END = "end"
def regex_tokeniser(rules):
    """Build a tokeniser from ``(token_type, regex)`` rules.

    Rules are tried in order at each position and the first that matches
    wins. A final catch-all rule of type ``"unknown"`` consumes any single
    character other than a newline. The returned function maps a string to
    a list of ``Token`` objects ending with an END token.
    """
    compiled_rules = [(token_type, _to_regex(regex)) for token_type, regex in rules]
    compiled_rules.append(("unknown", re.compile(".")))

    def first_match(value, index):
        for token_type, regex in compiled_rules:
            match = regex.match(value, index)
            if match is not None:
                return token_type, match
        return None

    def tokenise(value):
        tokens = []
        index = 0
        while index < len(value):
            found = first_match(value, index)
            if found is None:
                # Should be impossible
                raise Exception("Remaining: " + value[index:])
            token_type, match = found
            tokens.append(Token(index, token_type, match.group(0)))
            index = match.end()

        tokens.append(Token(index, TokenType.END, ""))
        return tokens

    return tokenise
def _to_regex(value):
if hasattr(value, "match"):
return value
else:
return re.compile(value)
# An opening single quote followed by any run of escape pairs or non-quote characters (no closing quote).
_string_prefix = r"'(?:\\.|[^'])*"
# One identifier character: a letter, "-", "_", or a backslash escape.
_identifier_character = r"(?:[a-zA-Z\-_]|\\.)"
# Tokeniser for style-mapping lines. Rule order matters because the first
# matching rule wins: a terminated string is tried before an unterminated one.
tokenise = regex_tokeniser([
(TokenType.IDENTIFIER, _identifier_character + "(?:" + _identifier_character + "|[0-9])*"),
(TokenType.SYMBOL, r":|>|=>|\^=|=|\(|\)|\[|\]|\||!|\."),
(TokenType.WHITESPACE, r"\s+"),
(TokenType.STRING, _string_prefix + "'"),
(TokenType.UNTERMINATED_STRING, _string_prefix),
(TokenType.INTEGER, "([0-9]+)"),
])

View File

@@ -0,0 +1,56 @@
from . import documents
def paragraph(transform_paragraph):
    """Return a document transform applying ``transform_paragraph`` to every paragraph."""
    return element_of_type(documents.Paragraph, transform=transform_paragraph)
def run(transform_run):
    """Return a document transform applying ``transform_run`` to every run."""
    return element_of_type(documents.Run, transform=transform_run)
def element_of_type(element_type, transform):
    """Return a document transform applying ``transform`` to each ``element_type`` instance.

    Elements of other types are returned unchanged.
    """
    def transform_element(element):
        if not isinstance(element, element_type):
            return element
        return transform(element)

    return _each_element(transform_element)
def _each_element(transform_element):
    """Lift ``transform_element`` into a bottom-up transform of a whole tree.

    Children are transformed first; the element is then copied with its new
    children before being transformed itself.
    """
    container_types = (documents.HasChildren, documents.TableCellUnmerged)

    def transform_element_and_children(element):
        if isinstance(element, container_types):
            new_children = [
                transform_element_and_children(child)
                for child in element.children
            ]
            element = element.copy(children=new_children)
        return transform_element(element)

    return transform_element_and_children
def get_descendants_of_type(element, element_type):
    """Return the descendants of ``element`` that are instances of ``element_type``."""
    return [
        descendant
        for descendant in get_descendants(element)
        if isinstance(descendant, element_type)
    ]
def get_descendants(element):
    """Return all descendants of ``element`` (excluding itself).

    Each element appears after its own descendants.
    """
    collected = []
    _visit_descendants(element, collected.append)
    return collected
def _visit_descendants(element, visit):
    """Call ``visit`` on each descendant of ``element``, deepest first."""
    if not isinstance(element, documents.HasChildren):
        return
    for child in element.children:
        # Recurse before visiting so a child follows its own descendants.
        _visit_descendants(child, visit)
        visit(child)

View File

@@ -0,0 +1,8 @@
from . import html
def element(name):
    """Return an underline converter wrapping nodes in a collapsible ``name`` element."""
    def convert_underline(nodes):
        wrapper = html.collapsible_element(name, {}, nodes)
        return [wrapper]

    return convert_underline

View File

@@ -0,0 +1,19 @@
from .html import HtmlWriter
from .markdown import MarkdownWriter
def writer(output_format=None):
    """Create a new writer for ``output_format`` (``"html"`` when omitted).

    Raises ``KeyError`` for an unknown format.
    """
    chosen_format = "html" if output_format is None else output_format
    writer_class = _writers[chosen_format]
    return writer_class()
def formats():
    """Return the names of the supported output formats."""
    names = _writers.keys()
    return names
# Registry mapping each output format name to the writer class that produces it.
_writers = {
"html": HtmlWriter,
"markdown": MarkdownWriter,
}

View File

@@ -0,0 +1,31 @@
from __future__ import absolute_import
import abc
class Writer(object):
    """Interface implemented by the output writers."""

    # NOTE(review): ``__metaclass__`` is only honoured by Python 2; on
    # Python 3 it is an ordinary attribute, so the abstract methods below
    # are presumably not enforced there -- confirm before relying on it.
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def text(self, text):
        """Write ``text`` as character data."""

    @abc.abstractmethod
    def start(self, name, attributes=None):
        """Open an element called ``name``."""

    @abc.abstractmethod
    def end(self, name):
        """Close the element called ``name``."""

    @abc.abstractmethod
    def self_closing(self, name, attributes=None):
        """Write an element that has no content."""

    @abc.abstractmethod
    def append(self, html):
        """Append already-formatted output."""

    @abc.abstractmethod
    def as_string(self):
        """Return everything written so far as one string."""

View File

@@ -0,0 +1,43 @@
from __future__ import unicode_literals
from xml.sax.saxutils import escape
from .abc import Writer
class HtmlWriter(Writer):
    """Writer that accumulates HTML fragments and joins them on request."""

    def __init__(self):
        self._fragments = []

    def text(self, text):
        """Append ``text`` with HTML special characters escaped."""
        escaped = _escape_html(text)
        self._fragments.append(escaped)

    def start(self, name, attributes=None):
        """Append an opening tag."""
        opening = "<{0}{1}>".format(name, _generate_attribute_string(attributes))
        self._fragments.append(opening)

    def end(self, name):
        """Append a closing tag."""
        closing = "</{0}>".format(name)
        self._fragments.append(closing)

    def self_closing(self, name, attributes=None):
        """Append a self-closing tag."""
        tag = "<{0}{1} />".format(name, _generate_attribute_string(attributes))
        self._fragments.append(tag)

    def append(self, html):
        """Append ``html`` verbatim, without escaping."""
        self._fragments.append(html)

    def as_string(self):
        """Return the concatenation of everything written so far."""
        return "".join(self._fragments)
def _escape_html(text):
return escape(text, {'"': "&quot;"})
def _generate_attribute_string(attributes):
if attributes is None:
return ""
else:
return "".join(
' {0}="{1}"'.format(key, _escape_html(attributes[key]))
for key in sorted(attributes)
)

View File

@@ -0,0 +1,203 @@
from __future__ import unicode_literals
from .abc import Writer
import re
class _WriterOutput(object):
def __init__(self, start, end=None, generate_end=None, anchor_position=None):
if generate_end is None:
generate_end = _constant(end)
self.start = start
self.generate_end = generate_end
self.anchor_position = anchor_position
def _constant(value):
def get():
return value
return get
class _MarkdownState(object):
def __init__(self):
self._list_state_stack = []
self.list_state = None
self.list_item_has_closed = False
def update_list_state(self, list_state):
self._list_state_stack.append(self.list_state)
self.list_state = list_state
def pop_list_state(self):
self.list_state = self._list_state_stack.pop()
class _MarkdownListState(object):
def __init__(self, ordered, indentation):
self.ordered = ordered
self.count = 0
self.indentation = indentation
def _symmetric_wrapped(end):
    """Return a ``_Wrapped`` that writes the same marker before and after."""
    marker = end
    return _Wrapped(marker, marker)
class _Wrapped(object):
def __init__(self, start, end):
self._start = start
self._end = end
def __call__(self, attributes, markdown_state):
return _WriterOutput(self._start, self._end)
def _hyperlink(attributes, markdown_state):
    """Produce Markdown link output for an ``a`` element.

    With a non-empty ``href`` the content is wrapped as ``[...](href)`` and
    the anchor position is set to "before". Without one, the shared default
    output is returned.
    """
    href = attributes.get("href", "")
    if not href:
        return _default_output

    link_end = "]({0})".format(href)
    return _WriterOutput("[", link_end, anchor_position="before")
def _image(attributes, markdown_state):
    """Produce Markdown image output for an ``img`` element.

    Emits ``![alt](src)`` when either ``src`` or ``alt`` is non-empty;
    otherwise returns the shared default output.
    """
    src = attributes.get("src", "")
    alt_text = attributes.get("alt", "")
    if not (src or alt_text):
        return _default_output

    markup = "![{0}]({1})".format(alt_text, src)
    return _WriterOutput(markup, "")
def _list(ordered):
    """Build the element writer for an ordered or unordered list.

    The returned writer pushes a new list state when the list opens and
    pops it when the list's end text is generated.
    """
    def call(attributes, markdown_state):
        parent = markdown_state.list_state
        is_top_level = parent is None

        # A top-level list ends with a newline; a nested list instead
        # starts with one and is indented one level deeper than its parent.
        start = "" if is_top_level else "\n"
        end_text = "\n" if is_top_level else ""
        indentation = 0 if is_top_level else parent.indentation + 1

        markdown_state.update_list_state(
            _MarkdownListState(ordered=ordered, indentation=indentation)
        )

        def generate_end():
            markdown_state.pop_list_state()
            return end_text

        return _WriterOutput(start, generate_end=generate_end)

    return call
def _list_item(attributes, markdown_state):
    """Produce the Markdown output for an ``li`` element.

    The item starts with one tab per indentation level, then either a
    running number (ordered lists) or ``-``, then a space. Its end text is
    a newline unless a list item has already closed since the most recent
    item was opened.
    """
    markdown_state.list_item_has_closed = False

    current = markdown_state.list_state
    if not current:
        # No enclosing list: fall back to an unordered, unindented list.
        current = _MarkdownListState(ordered=False, indentation=0)
    current.count += 1

    bullet = "{0}.".format(current.count) if current.ordered else "-"
    prefix = ("\t" * current.indentation) + bullet + " "

    def generate_end():
        if markdown_state.list_item_has_closed:
            return ""
        markdown_state.list_item_has_closed = True
        return "\n"

    return _WriterOutput(start=prefix, generate_end=generate_end)
def _init_writers():
    """Build the mapping from HTML tag name to Markdown element writer."""
    writers = {
        "p": _Wrapped("", "\n\n"),
        "br": _Wrapped("", "  \n"),
        "strong": _symmetric_wrapped("__"),
        "em": _symmetric_wrapped("*"),
        "a": _hyperlink,
        "img": _image,
        "ol": _list(ordered=True),
        "ul": _list(ordered=False),
        "li": _list_item,
    }
    # h1..h6 become one to six "#" characters followed by a space.
    writers.update(
        ("h{0}".format(level), _Wrapped("#" * level + " ", "\n\n"))
        for level in range(1, 7)
    )
    return writers
# Tag name -> element writer table, built once at import time.
_writers = _init_writers()
# Shared output whose start and end text are both empty.
_default_output = _WriterOutput("", "")
def _default_writer(attributes, markdown_state):
    """Return the shared default output; both arguments are ignored."""
    output = _default_output
    return output
class MarkdownWriter(Writer):
    """Writer that builds a Markdown string from a list of fragments.

    Opening an element looks up its writer by tag name, emits the start
    text, and pushes the end-text generator on a stack; closing an element
    pops that generator and emits whatever it returns.
    """

    def __init__(self):
        self._fragments = []
        self._element_stack = []
        self._markdown_state = _MarkdownState()

    def text(self, text):
        """Append ``text`` with Markdown special characters escaped."""
        escaped = _escape_markdown(text)
        self._fragments.append(escaped)

    def start(self, name, attributes=None):
        """Open element ``name``, emitting its start text and any anchor."""
        attributes = {} if attributes is None else attributes

        element_writer = _writers.get(name, _default_writer)
        output = element_writer(attributes, self._markdown_state)
        self._element_stack.append(output.generate_end)

        # The anchor goes before the start text only when the element
        # writer asks for it; otherwise it follows the start text.
        if output.anchor_position == "before":
            self._write_anchor(attributes)
            self._fragments.append(output.start)
        else:
            self._fragments.append(output.start)
            self._write_anchor(attributes)

    def end(self, name):
        """Close the most recently opened element and emit its end text."""
        generate_end = self._element_stack.pop()
        self._fragments.append(generate_end())

    def self_closing(self, name, attributes=None):
        """Open and immediately close element ``name``."""
        self.start(name, attributes)
        self.end(name)

    def append(self, other):
        """Append ``other`` verbatim; no escaping is applied."""
        self._fragments.append(other)

    def as_string(self):
        """Return everything written so far as a single string."""
        return "".join(self._fragments)

    def _write_anchor(self, attributes):
        """Emit an empty ``<a id=...>`` tag when ``attributes`` has a truthy ``id``."""
        html_id = attributes.get("id")
        if not html_id:
            return
        self._fragments.append('<a id="{0}"></a>'.format(html_id))
def _escape_markdown(value):
    """Backslash-escape Markdown special characters in ``value``.

    Backslashes are doubled first so that the escapes added in the second
    pass are not themselves re-escaped.
    """
    backslashes_doubled = re.sub("\\\\", "\\\\\\\\", value)
    return re.sub(
        r"([\`\*_\{\}\[\]\(\)\#\+\-\.\!])",
        r"\\\1",
        backslashes_doubled,
    )

View File

@@ -0,0 +1,77 @@
import contextlib
import io
import shutil
from zipfile import ZipFile
def open_zip(fileobj, mode):
    """Open ``fileobj`` as a zip archive in ``mode`` and wrap it in a ``_Zip``."""
    archive = ZipFile(fileobj, mode)
    return _Zip(archive)
class _Zip(object):
    """Thin wrapper around an open zip archive, usable as a context manager."""

    def __init__(self, zip_file):
        self._zip_file = zip_file

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Close the underlying archive whether or not the body raised.
        self._zip_file.close()

    def open(self, name):
        """Open member ``name``; the result closes the member on context exit."""
        member = self._zip_file.open(name)
        return contextlib.closing(member)

    def exists(self, name):
        """Return True if the archive has a member called ``name``."""
        try:
            self._zip_file.getinfo(name)
        except KeyError:
            return False
        else:
            return True

    def read_str(self, name):
        """Return the contents of member ``name`` decoded as UTF-8."""
        raw = self._zip_file.read(name)
        return raw.decode("utf8")
def update_zip(fileobj, files):
    """Rewrite the zip archive held in ``fileobj`` in place.

    Every entry of the existing archive is copied across, except that an
    entry whose name is a key of ``files`` takes its contents from
    ``files``; keys with no matching entry are added as new entries.

    Existing entries keep their original order and new entries follow in
    the iteration order of ``files``, so the output is deterministic.

    Args:
        fileobj: A seekable, readable and writable binary file object
            containing a zip archive. It must support ``truncate()``.
        files: Mapping of entry name to the new contents (``bytes`` or
            ``str``, as accepted by ``ZipFile.writestr``).
    """
    destination_fileobj = io.BytesIO()

    with ZipFile(fileobj, "r") as source:
        # De-duplicate while preserving order: source entries first, then
        # any names that only appear in `files`.
        names = []
        seen = set()
        for name in source.namelist() + list(files):
            if name not in seen:
                seen.add(name)
                names.append(name)

        with ZipFile(destination_fileobj, "w") as destination:
            for name in names:
                if name in files:
                    contents = files[name]
                else:
                    contents = source.read(name)
                destination.writestr(name, contents)

    fileobj.seek(0)
    destination_fileobj.seek(0)
    shutil.copyfileobj(destination_fileobj, fileobj)
    # The rebuilt archive may be shorter than the original. Without this,
    # the stale tail (including the old end-of-central-directory record)
    # would remain and the result would be read as a corrupt archive.
    fileobj.truncate()
def split_path(path):
    """Split ``path`` at its last ``/`` into a ``(directory, name)`` tuple.

    When ``path`` contains no ``/`` the directory part is the empty string.
    """
    directory, _, name = path.rpartition("/")
    return (directory, name)
def join_path(*args):
    """Join path segments with ``/``, skipping empty ones.

    A segment that starts with ``/`` discards every segment before it, so
    the result begins at the last absolute segment.
    """
    segments = []
    for segment in args:
        if not segment:
            continue
        if segment.startswith("/"):
            segments = []
        segments.append(segment)
    return "/".join(segments)