Initial commit (Clean history)
This commit is contained in:
@@ -0,0 +1,58 @@
|
||||
from . import docx, conversion, options, images, transforms, underline
|
||||
from .raw_text import extract_raw_text_from_element
|
||||
from .docx.style_map import write_style_map, read_style_map
|
||||
|
||||
__all__ = ["convert_to_html", "extract_raw_text", "images", "transforms", "underline"]
|
||||
|
||||
|
||||
# Sentinel used to detect whether a keyword argument was explicitly supplied,
# so that None remains available as a meaningful caller value.
_undefined = object()
||||
def convert_to_html(*args, **kwargs):
    """Convert a .docx file object to an HTML string; see convert() for options."""
    return convert(*args, output_format="html", **kwargs)
|
||||
|
||||
def convert_to_markdown(*args, **kwargs):
    """Convert a .docx file object to a Markdown string; see convert() for options."""
    return convert(*args, output_format="markdown", **kwargs)
|
||||
|
||||
def convert(
    fileobj,
    transform_document=None,
    id_prefix=None,
    include_embedded_style_map=_undefined,
    external_file_access=_undefined,
    **kwargs
):
    """Convert the .docx file in *fileobj* to HTML (or another output format).

    Arguments:
        fileobj: binary file object containing the .docx document.
        transform_document: optional callable applied to the parsed document
            before conversion; defaults to the identity transform.
        id_prefix: optional prefix for generated HTML ids.
        include_embedded_style_map: when true (the default), a style map
            embedded in the .docx file is read and used.
        external_file_access: whether reading may follow references to
            external files; defaults to False.
        **kwargs: remaining conversion options, validated by
            options.read_options.

    Returns a result whose value is the converted output and whose messages
    describe any conversion warnings.
    """
    if include_embedded_style_map is _undefined:
        include_embedded_style_map = True

    if transform_document is None:
        transform_document = _identity

    if include_embedded_style_map:
        kwargs["embedded_style_map"] = read_style_map(fileobj)

    if external_file_access is _undefined:
        external_file_access = False

    return options.read_options(kwargs).bind(lambda convert_options:
        docx.read(fileobj, external_file_access=external_file_access).map(transform_document).bind(lambda document:
            conversion.convert_document_element_to_html(
                document,
                id_prefix=id_prefix,
                **convert_options
            )
        )
    )


def _identity(value):
    # Default document transform: return the document unchanged.
    return value
||||
|
||||
|
||||
def extract_raw_text(fileobj):
    """Return the raw text of the document in *fileobj*, ignoring all formatting."""
    parsed = docx.read(fileobj)
    return parsed.map(extract_raw_text_from_element)
||||
|
||||
|
||||
def embed_style_map(fileobj, style_map):
    """Embed *style_map* into the .docx file object so later conversions use it."""
    write_style_map(fileobj, style_map)
||||
|
||||
def read_embedded_style_map(fileobj):
    """Return the style map previously embedded in the .docx file object, if any."""
    return read_style_map(fileobj)
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
104
path/to/venv/lib/python3.12/site-packages/mammoth/cli.py
Normal file
104
path/to/venv/lib/python3.12/site-packages/mammoth/cli.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import argparse
|
||||
import io
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
|
||||
import mammoth
|
||||
from . import writers
|
||||
|
||||
|
||||
def main():
    """Entry point for the mammoth command-line tool.

    Reads the .docx file named on the command line, converts it, prints any
    conversion messages to stderr, and writes the output to a file or stdout.
    """
    args = _parse_args()

    style_map = None
    if args.style_map is not None:
        with open(args.style_map) as style_map_file:
            style_map = style_map_file.read()

    with open(args.path, "rb") as docx_file:
        if args.output_dir is None:
            # Images are inlined in the output document.
            convert_image = None
            output_path = args.output
        else:
            # Images are written as separate files next to the generated HTML.
            convert_image = mammoth.images.img_element(ImageWriter(args.output_dir))
            base_name = os.path.basename(args.path).rpartition(".")[0]
            output_path = os.path.join(args.output_dir, "{0}.html".format(base_name))

        result = mammoth.convert(
            docx_file,
            style_map=style_map,
            convert_image=convert_image,
            output_format=args.output_format,
        )
        for message in result.messages:
            sys.stderr.write(message.message)
            sys.stderr.write("\n")

        _write_output(output_path, result.value)
||||
|
||||
|
||||
class ImageWriter(object):
    """Writes each image to a numbered file (1.png, 2.png, ...) in a directory.

    Instances are callable with an image element and return the attributes
    for an <img> tag referencing the file that was written.
    """

    def __init__(self, output_dir):
        self._output_dir = output_dir
        self._image_number = 1

    def __call__(self, element):
        # Derive the file extension from the MIME subtype, e.g. image/png -> png.
        extension = element.content_type.partition("/")[2]
        image_filename = "{0}.{1}".format(self._image_number, extension)
        destination_path = os.path.join(self._output_dir, image_filename)
        with open(destination_path, "wb") as image_dest:
            with element.open() as image_source:
                shutil.copyfileobj(image_source, image_dest)

        self._image_number += 1
        return {"src": image_filename}
||||
|
||||
|
||||
def _write_output(path, contents):
|
||||
if path is None:
|
||||
if sys.version_info[0] <= 2:
|
||||
stdout = sys.stdout
|
||||
else:
|
||||
stdout = sys.stdout.buffer
|
||||
|
||||
stdout.write(contents.encode("utf-8"))
|
||||
stdout.flush()
|
||||
else:
|
||||
with io.open(path, "w", encoding="utf-8") as fileobj:
|
||||
fileobj.write(contents)
|
||||
|
||||
|
||||
def _parse_args():
    """Parse and return the command-line arguments for the mammoth CLI.

    The positional output path and --output-dir are mutually exclusive:
    the former inlines images, the latter writes them as separate files.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "path",
        metavar="docx-path",
        help="Path to the .docx file to convert.")

    output_group = parser.add_mutually_exclusive_group()
    output_group.add_argument(
        "output",
        nargs="?",
        metavar="output-path",
        help="Output path for the generated document. Images will be stored inline in the output document. Output is written to stdout if not set.")
    output_group.add_argument(
        "--output-dir",
        help="Output directory for generated HTML and images. Images will be stored in separate files. Mutually exclusive with output-path.")

    parser.add_argument(
        "--output-format",
        required=False,
        choices=writers.formats(),
        help="Output format.")
    parser.add_argument(
        "--style-map",
        required=False,
        # Fixed typo in user-facing help text: "containg" -> "containing".
        help="File containing a style map.")
    return parser.parse_args()
||||
|
||||
|
||||
# Allow running the CLI module directly as a script.
if __name__ == "__main__":
    main()
||||
|
||||
408
path/to/venv/lib/python3.12/site-packages/mammoth/conversion.py
Normal file
408
path/to/venv/lib/python3.12/site-packages/mammoth/conversion.py
Normal file
@@ -0,0 +1,408 @@
|
||||
# coding=utf-8
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from functools import partial
|
||||
|
||||
import cobble
|
||||
|
||||
from . import documents, results, html_paths, images, writers, html
|
||||
from .docx.files import InvalidFileReferenceError
|
||||
from .lists import find_index
|
||||
|
||||
|
||||
def convert_document_element_to_html(element,
        style_map=None,
        convert_image=None,
        id_prefix=None,
        output_format=None,
        ignore_empty_paragraphs=True):
    """Convert a parsed document element to HTML.

    Returns a results.Result whose value is the generated markup and whose
    messages list any warnings produced during conversion.
    """
    if style_map is None:
        style_map = []

    if id_prefix is None:
        id_prefix = ""

    if convert_image is None:
        # Default: embed images as data URIs.
        convert_image = images.data_uri

    # Index comments by id so comment references can be resolved; only a
    # full Document carries comments.
    comments = {}
    if isinstance(element, documents.Document):
        for comment in element.comments:
            comments[comment.comment_id] = comment

    messages = []
    converter = _DocumentConverter(
        messages=messages,
        style_map=style_map,
        convert_image=convert_image,
        id_prefix=id_prefix,
        ignore_empty_paragraphs=ignore_empty_paragraphs,
        note_references=[],
        comments=comments,
    )
    initial_context = _ConversionContext(is_table_header=False)
    nodes = converter.visit(element, initial_context)

    writer = writers.writer(output_format)
    html.write(writer, html.collapse(html.strip_empty(nodes)))
    return results.Result(writer.as_string(), messages)
||||
|
||||
|
||||
@cobble.data
class _ConversionContext(object):
    """Immutable per-visit conversion state threaded through the visitor."""
    # True while visiting rows that belong to a table header, so that cells
    # are rendered as <th> rather than <td>.
    is_table_header = cobble.field()

    def copy(self, **kwargs):
        """Return a copy of the context with the given fields replaced."""
        return cobble.copy(self, **kwargs)
||||
|
||||
|
||||
class _DocumentConverter(documents.element_visitor(args=1)):
    """Visitor that converts document elements to HTML nodes.

    Stateful across a single document: it accumulates warning messages,
    note references, and referenced comments while visiting, then renders
    collected notes and comments at the end of visit_document.
    """

    def __init__(self, messages, style_map, convert_image, id_prefix, ignore_empty_paragraphs, note_references, comments):
        self._messages = messages
        self._style_map = style_map
        self._id_prefix = id_prefix
        self._ignore_empty_paragraphs = ignore_empty_paragraphs
        self._note_references = note_references
        # (label, comment) pairs, appended as comment references are visited.
        self._referenced_comments = []
        self._convert_image = convert_image
        self._comments = comments

    def visit_image(self, image, context):
        """Convert an image, recording a warning (and emitting nothing) on invalid file references."""
        try:
            return self._convert_image(image)
        except InvalidFileReferenceError as error:
            self._messages.append(results.warning(str(error)))
            return []

    def visit_document(self, document, context):
        """Convert the document body, then append the collected notes and comments."""
        nodes = self._visit_all(document.children, context)
        notes = [
            document.notes.resolve(reference)
            for reference in self._note_references
        ]
        notes_list = html.element("ol", {}, self._visit_all(notes, context))
        comments = html.element("dl", {}, [
            html_node
            for referenced_comment in self._referenced_comments
            for html_node in self.visit_comment(referenced_comment, context)
        ])
        return nodes + [notes_list, comments]


    def visit_paragraph(self, paragraph, context):
        """Convert a paragraph, wrapping its children in the mapped HTML path."""
        def children():
            content = self._visit_all(paragraph.children, context)
            if self._ignore_empty_paragraphs:
                return content
            else:
                # force_write ensures an element is emitted even when empty.
                return [html.force_write] + content

        html_path = self._find_html_path_for_paragraph(paragraph)
        return html_path.wrap(children)


    def visit_run(self, run, context):
        """Convert a run, layering HTML paths for each active run property."""
        nodes = lambda: self._visit_all(run.children, context)
        paths = []
        if run.highlight is not None:
            style = self._find_style(Highlight(color=run.highlight), "highlight")
            if style is not None:
                paths.append(style.html_path)
        if run.is_small_caps:
            paths.append(self._find_style_for_run_property("small_caps"))
        if run.is_all_caps:
            paths.append(self._find_style_for_run_property("all_caps"))
        if run.is_strikethrough:
            paths.append(self._find_style_for_run_property("strikethrough", default="s"))
        if run.is_underline:
            paths.append(self._find_style_for_run_property("underline"))
        if run.vertical_alignment == documents.VerticalAlignment.subscript:
            paths.append(html_paths.element(["sub"], fresh=False))
        if run.vertical_alignment == documents.VerticalAlignment.superscript:
            paths.append(html_paths.element(["sup"], fresh=False))
        if run.is_italic:
            paths.append(self._find_style_for_run_property("italic", default="em"))
        if run.is_bold:
            paths.append(self._find_style_for_run_property("bold", default="strong"))
        paths.append(self._find_html_path_for_run(run))

        # Wrap from the inside out: the last path appended becomes outermost.
        for path in paths:
            nodes = partial(path.wrap, nodes)

        return nodes()


    def _find_style_for_run_property(self, element_type, default=None):
        """Return the mapped HTML path for a run property, falling back to a default element."""
        style = self._find_style(None, element_type)
        if style is not None:
            return style.html_path
        elif default is not None:
            return html_paths.element(default, fresh=False)
        else:
            return html_paths.empty


    def visit_text(self, text, context):
        return [html.text(text.value)]


    def visit_hyperlink(self, hyperlink, context):
        """Convert a hyperlink to an <a>, using either an external href or an internal anchor."""
        if hyperlink.anchor is None:
            href = hyperlink.href
        else:
            href = "#{0}".format(self._html_id(hyperlink.anchor))

        attributes = {"href": href}
        if hyperlink.target_frame is not None:
            attributes["target"] = hyperlink.target_frame

        nodes = self._visit_all(hyperlink.children, context)
        return [html.collapsible_element("a", attributes, nodes)]


    def visit_checkbox(self, checkbox, context):
        attributes = {"type": "checkbox"}

        if checkbox.checked:
            attributes["checked"] = "checked"

        return [html.element("input", attributes)]


    def visit_bookmark(self, bookmark, context):
        """Convert a bookmark to an empty anchor carrying the bookmark's id."""
        element = html.collapsible_element(
            "a",
            {"id": self._html_id(bookmark.name)},
            [html.force_write])
        return [element]


    def visit_tab(self, tab, context):
        return [html.text("\t")]

    # Fallback path used when no style mapping matches a table.
    _default_table_path = html_paths.path([html_paths.element(["table"], fresh=True)])

    def visit_table(self, table, context):
        return self._find_html_path(table, "table", self._default_table_path) \
            .wrap(lambda: self._convert_table_children(table, context))

    def _convert_table_children(self, table, context):
        """Convert table rows, splitting leading header rows into <thead>/<tbody>."""
        # Index of the first non-header row (None when every row is a header).
        body_index = find_index(
            lambda child: not isinstance(child, documents.TableRow) or not child.is_header,
            table.children,
        )
        if body_index is None:
            body_index = len(table.children)

        if body_index == 0:
            # No header rows: emit the rows directly without thead/tbody.
            children = self._visit_all(table.children, context.copy(is_table_header=False))
        else:
            head_rows = self._visit_all(table.children[:body_index], context.copy(is_table_header=True))
            body_rows = self._visit_all(table.children[body_index:], context.copy(is_table_header=False))
            children = [
                html.element("thead", {}, head_rows),
                html.element("tbody", {}, body_rows),
            ]

        return [html.force_write] + children


    def visit_table_row(self, table_row, context):
        return [html.element("tr", {}, [html.force_write] + self._visit_all(table_row.children, context))]


    def visit_table_cell(self, table_cell, context):
        """Convert a cell to <th> or <td> depending on header context, with span attributes."""
        if context.is_table_header:
            tag_name = "th"
        else:
            tag_name = "td"
        attributes = {}
        if table_cell.colspan != 1:
            attributes["colspan"] = str(table_cell.colspan)
        if table_cell.rowspan != 1:
            attributes["rowspan"] = str(table_cell.rowspan)
        nodes = [html.force_write] + self._visit_all(table_cell.children, context)
        return [
            html.element(tag_name, attributes, nodes)
        ]


    def visit_break(self, break_, context):
        return self._find_html_path_for_break(break_).wrap(lambda: [])


    def _find_html_path_for_break(self, break_):
        """Return the mapped HTML path for a break; line breaks default to <br>."""
        style = self._find_style(break_, "break")
        if style is not None:
            return style.html_path
        elif break_.break_type == "line":
            return html_paths.path([html_paths.element("br", fresh=True)])
        else:
            return html_paths.empty


    def visit_note_reference(self, note_reference, context):
        """Convert a footnote/endnote reference to a numbered superscript link."""
        self._note_references.append(note_reference)
        note_number = len(self._note_references)
        return [
            html.element("sup", {}, [
                html.element("a", {
                    "href": "#" + self._note_html_id(note_reference),
                    "id": self._note_ref_html_id(note_reference),
                }, [html.text("[{0}]".format(note_number))])
            ])
        ]


    def visit_note(self, note, context):
        """Convert a note body to a list item with a back-link to its reference."""
        note_body = self._visit_all(note.body, context) + [
            html.collapsible_element("p", {}, [
                html.text(" "),
                html.element("a", {"href": "#" + self._note_ref_html_id(note)}, [
                    html.text(_up_arrow)
                ]),
            ])
        ]
        return [
            html.element("li", {"id": self._note_html_id(note)}, note_body)
        ]


    def visit_comment_reference(self, reference, context):
        """Convert a comment reference to a labelled link, recording the comment for later rendering."""
        def nodes():
            comment = self._comments[reference.comment_id]
            count = len(self._referenced_comments) + 1
            label = "[{0}{1}]".format(_comment_author_label(comment), count)
            self._referenced_comments.append((label, comment))
            return [
                # TODO: remove duplication with note references
                html.element("a", {
                    "href": "#" + self._referent_html_id("comment", reference.comment_id),
                    "id": self._reference_html_id("comment", reference.comment_id),
                }, [html.text(label)])
            ]

        html_path = self._find_html_path(
            None,
            "comment_reference",
            default=html_paths.ignore,
        )

        return html_path.wrap(nodes)

    def visit_comment(self, referenced_comment, context):
        """Render a referenced comment as a <dt>/<dd> pair with a back-link."""
        label, comment = referenced_comment
        # TODO remove duplication with notes
        body = self._visit_all(comment.body, context) + [
            html.collapsible_element("p", {}, [
                html.text(" "),
                html.element("a", {"href": "#" + self._reference_html_id("comment", comment.comment_id)}, [
                    html.text(_up_arrow)
                ]),
            ])
        ]
        return [
            html.element(
                "dt",
                {"id": self._referent_html_id("comment", comment.comment_id)},
                [html.text("Comment {0}".format(label))],
            ),
            html.element("dd", {}, body),
        ]


    def _visit_all(self, elements, context):
        """Visit each element and flatten the resulting HTML node lists."""
        return [
            html_node
            for element in elements
            for html_node in self.visit(element, context)
        ]


    def _find_html_path_for_paragraph(self, paragraph):
        default = html_paths.path([html_paths.element("p", fresh=True)])
        return self._find_html_path(paragraph, "paragraph", default, warn_unrecognised=True)

    def _find_html_path_for_run(self, run):
        return self._find_html_path(run, "run", default=html_paths.empty, warn_unrecognised=True)


    def _find_html_path(self, element, element_type, default, warn_unrecognised=False):
        """Return the mapped HTML path for an element, warning on unmapped named styles."""
        style = self._find_style(element, element_type)
        if style is not None:
            return style.html_path

        if warn_unrecognised and getattr(element, "style_id", None) is not None:
            self._messages.append(results.warning(
                "Unrecognised {0} style: {1} (Style ID: {2})".format(
                    element_type, element.style_name, element.style_id)
            ))

        return default

    def _find_style(self, element, element_type):
        """Return the first style-map entry whose matcher matches, or None."""
        for style in self._style_map:
            document_matcher = style.document_matcher
            if _document_matcher_matches(document_matcher, element, element_type):
                return style

    def _note_html_id(self, note):
        return self._referent_html_id(note.note_type, note.note_id)

    def _note_ref_html_id(self, note):
        return self._reference_html_id(note.note_type, note.note_id)

    def _referent_html_id(self, reference_type, reference_id):
        # Id of the target (the note/comment body itself).
        return self._html_id("{0}-{1}".format(reference_type, reference_id))

    def _reference_html_id(self, reference_type, reference_id):
        # Id of the in-text reference pointing at the target.
        return self._html_id("{0}-ref-{1}".format(reference_type, reference_id))

    def _html_id(self, suffix):
        return "{0}{1}".format(self._id_prefix, suffix)
||||
|
||||
|
||||
@cobble.data
class Highlight:
    """Ad-hoc element used to match highlight styles by colour."""
    color = cobble.field()
||||
|
||||
|
||||
def _document_matcher_matches(matcher, element, element_type):
|
||||
if matcher.element_type in ["underline", "strikethrough", "all_caps", "small_caps", "bold", "italic", "comment_reference"]:
|
||||
return matcher.element_type == element_type
|
||||
elif matcher.element_type == "highlight":
|
||||
return (
|
||||
matcher.element_type == element_type and
|
||||
(matcher.color is None or matcher.color == element.color)
|
||||
)
|
||||
elif matcher.element_type == "break":
|
||||
return (
|
||||
matcher.element_type == element_type and
|
||||
matcher.break_type == element.break_type
|
||||
)
|
||||
else: # matcher.element_type in ["paragraph", "run"]:
|
||||
return (
|
||||
matcher.element_type == element_type and (
|
||||
matcher.style_id is None or
|
||||
matcher.style_id == element.style_id
|
||||
) and (
|
||||
matcher.style_name is None or
|
||||
element.style_name is not None and (matcher.style_name.matches(element.style_name))
|
||||
) and (
|
||||
element_type != "paragraph" or
|
||||
matcher.numbering is None or
|
||||
matcher.numbering == element.numbering
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def _comment_author_label(comment):
|
||||
return comment.author_initials or ""
|
||||
|
||||
|
||||
# Up-arrow character used for "back to reference" links in notes and comments.
_up_arrow = "↑"
|
||||
@@ -0,0 +1,95 @@
|
||||
import collections
|
||||
|
||||
import cobble
|
||||
|
||||
|
||||
def paragraph(style_id=None, style_name=None, numbering=None):
    """Create a matcher for paragraphs with the given style and numbering."""
    return ParagraphMatcher(
        style_id=style_id,
        style_name=style_name,
        numbering=numbering,
    )


ParagraphMatcher = collections.namedtuple("ParagraphMatcher", ["style_id", "style_name", "numbering"])
ParagraphMatcher.element_type = "paragraph"
||||
|
||||
|
||||
def run(style_id=None, style_name=None):
    """Create a matcher for runs with the given style."""
    return RunMatcher(
        style_id=style_id,
        style_name=style_name,
    )


RunMatcher = collections.namedtuple("RunMatcher", ["style_id", "style_name"])
RunMatcher.element_type = "run"
||||
|
||||
|
||||
def table(style_id=None, style_name=None):
    """Create a matcher for tables with the given style."""
    return TableMatcher(
        style_id=style_id,
        style_name=style_name,
    )


TableMatcher = collections.namedtuple("TableMatcher", ["style_id", "style_name"])
TableMatcher.element_type = "table"
||||
|
||||
|
||||
# Singleton-style matchers for run properties: each matches purely on its
# element_type, with no further configuration.

class bold(object):
    element_type = "bold"


class italic(object):
    element_type = "italic"


class underline(object):
    element_type = "underline"


class strikethrough(object):
    element_type = "strikethrough"


class all_caps(object):
    element_type = "all_caps"


class small_caps(object):
    element_type = "small_caps"
||||
|
||||
|
||||
def highlight(color=None):
    """Create a matcher for highlighted text, optionally restricted to a colour."""
    return HighlightMatcher(color)


HighlightMatcher = collections.namedtuple("HighlightMatcher", ["color"])
HighlightMatcher.element_type = "highlight"
||||
|
||||
# Matcher for comment references; matches on element_type alone.
class comment_reference(object):
    element_type = "comment_reference"
||||
|
||||
|
||||
BreakMatcher = collections.namedtuple("BreakMatcher", ["break_type"])
BreakMatcher.element_type = "break"


# Pre-built matchers for each supported break type.
line_break = BreakMatcher("line")
page_break = BreakMatcher("page")
column_break = BreakMatcher("column")
||||
|
||||
|
||||
def equal_to(value):
    """Create a string matcher that matches case-insensitive equality with *value*."""
    return StringMatcher(_operator_equal_to, value)
||||
|
||||
|
||||
def _operator_equal_to(first, second):
|
||||
return first.upper() == second.upper()
|
||||
|
||||
|
||||
def starts_with(value):
    """Create a string matcher that matches strings starting with *value*, case-insensitively."""
    return StringMatcher(_operator_starts_with, value)
||||
|
||||
def _operator_starts_with(first, second):
|
||||
return second.upper().startswith(first.upper())
|
||||
|
||||
|
||||
@cobble.data
class StringMatcher(object):
    """Matches strings using a binary operator applied to a stored value."""
    # Callable taking (self.value, other) and returning a bool.
    operator = cobble.field()
    value = cobble.field()

    def matches(self, other):
        """Return whether *other* satisfies this matcher."""
        return self.operator(self.value, other)
||||
286
path/to/venv/lib/python3.12/site-packages/mammoth/documents.py
Normal file
286
path/to/venv/lib/python3.12/site-packages/mammoth/documents.py
Normal file
@@ -0,0 +1,286 @@
|
||||
import cobble
|
||||
|
||||
|
||||
class Element(object):
    """Base class for all document elements."""

    def copy(self, **kwargs):
        """Return a copy of this element with the given fields replaced."""
        return cobble.copy(self, **kwargs)
||||
|
||||
|
||||
class HasChildren(Element):
    """Base class for elements that contain child elements."""
    children = cobble.field()
||||
|
||||
|
||||
@cobble.data
class Document(HasChildren):
    """Root element of a parsed .docx document."""
    # Notes lookup (footnotes/endnotes) for the document.
    notes = cobble.field()
    # List of comments attached to the document.
    comments = cobble.field()
||||
|
||||
@cobble.data
class Paragraph(HasChildren):
    """A paragraph, carrying its style, numbering, alignment and indentation."""
    style_id = cobble.field()
    style_name = cobble.field()
    numbering = cobble.field()
    alignment = cobble.field()
    indent = cobble.field()
||||
|
||||
|
||||
@cobble.data
class ParagraphIndent(object):
    """Indentation settings for a paragraph."""
    start = cobble.field()
    end = cobble.field()
    first_line = cobble.field()
    hanging = cobble.field()
||||
|
||||
|
||||
@cobble.data
class Indent(object):
    """Indentation expressed with left/right edges rather than start/end."""
    left = cobble.field()
    right = cobble.field()
    first_line = cobble.field()
    hanging = cobble.field()
||||
|
||||
|
||||
@cobble.data
class Run(HasChildren):
    """A run of text sharing one set of character formatting."""
    style_id = cobble.field()
    style_name = cobble.field()
    is_bold = cobble.field()
    is_italic = cobble.field()
    is_underline = cobble.field()
    is_strikethrough = cobble.field()
    is_all_caps = cobble.field()
    is_small_caps = cobble.field()
    # One of the VerticalAlignment values (baseline/superscript/subscript).
    vertical_alignment = cobble.field()
    font = cobble.field()
    font_size = cobble.field()
    # Highlight colour, or None when the run is not highlighted.
    highlight = cobble.field()
||||
|
||||
@cobble.data
class Text(Element):
    """A literal text node."""
    value = cobble.field()
||||
|
||||
@cobble.data
class Hyperlink(HasChildren):
    """A hyperlink: either an external href or an internal anchor."""
    href = cobble.field()
    anchor = cobble.field()
    target_frame = cobble.field()
||||
|
||||
@cobble.data
class Checkbox(Element):
    """A checkbox form element with a checked state."""
    checked = cobble.field()


# Factory alias: checkboxes are constructed directly from the class.
checkbox = Checkbox
||||
|
||||
@cobble.data
class Table(HasChildren):
    """A table, carrying its style."""
    style_id = cobble.field()
    style_name = cobble.field()
||||
|
||||
@cobble.data
class TableRow(HasChildren):
    """A table row; header rows are rendered inside <thead>."""
    is_header = cobble.field()
||||
|
||||
@cobble.data
class TableCell(HasChildren):
    """A table cell after merging, with its column and row spans."""
    colspan = cobble.field()
    rowspan = cobble.field()
||||
|
||||
@cobble.data
class TableCellUnmerged:
    """A table cell before vertical-merge resolution.

    Visited as a table cell (see _accept1); vmerge carries the raw
    vertical-merge state from the document.
    """
    children = cobble.field()
    colspan = cobble.field()
    rowspan = cobble.field()
    vmerge = cobble.field()

    def _accept1(self, visitor, arg0):
        # Dispatch to the same visitor method as merged table cells.
        return visitor.visit_table_cell(self, arg0)

    def copy(self, **kwargs):
        """Return a copy of this cell with the given fields replaced."""
        return cobble.copy(self, **kwargs)
||||
|
||||
@cobble.data
class Break(Element):
    """A break element; break_type is "line", "page" or "column"."""
    break_type = cobble.field()


# Shared instances for each supported break type.
line_break = Break("line")
page_break = Break("page")
column_break = Break("column")
||||
|
||||
|
||||
@cobble.data
class Tab(Element):
    """A tab character; carries no state."""
    pass
||||
|
||||
|
||||
@cobble.data
class Image(Element):
    """An embedded image."""
    alt_text = cobble.field()
    content_type = cobble.field()
    # Callable returning a file-like object with the image bytes.
    open = cobble.field()
||||
|
||||
|
||||
def document(children, notes=None, comments=None):
    """Create a Document, defaulting to no notes and no comments."""
    return Document(
        children,
        Notes({}) if notes is None else notes,
        comments=[] if comments is None else comments,
    )
||||
|
||||
def paragraph(children, style_id=None, style_name=None, numbering=None, alignment=None, indent=None):
    """Create a Paragraph; the indent defaults to an empty ParagraphIndent."""
    effective_indent = paragraph_indent() if indent is None else indent
    return Paragraph(children, style_id, style_name, numbering, alignment=alignment, indent=effective_indent)
||||
|
||||
def paragraph_indent(start=None, end=None, first_line=None, hanging=None):
    """Create a ParagraphIndent with all fields optional."""
    return ParagraphIndent(start=start, end=end, first_line=first_line, hanging=hanging)
||||
|
||||
def run(
    children,
    style_id=None,
    style_name=None,
    is_bold=None,
    is_italic=None,
    is_underline=None,
    is_strikethrough=None,
    is_all_caps=None,
    is_small_caps=None,
    vertical_alignment=None,
    font=None,
    font_size=None,
    highlight=None,
):
    """Create a Run.

    Boolean formatting flags are coerced with bool(); the vertical alignment
    defaults to baseline when not given.
    """
    effective_alignment = (
        VerticalAlignment.baseline
        if vertical_alignment is None
        else vertical_alignment
    )
    return Run(
        children=children,
        style_id=style_id,
        style_name=style_name,
        is_bold=bool(is_bold),
        is_italic=bool(is_italic),
        is_underline=bool(is_underline),
        is_strikethrough=bool(is_strikethrough),
        is_all_caps=bool(is_all_caps),
        is_small_caps=bool(is_small_caps),
        vertical_alignment=effective_alignment,
        font=font,
        font_size=font_size,
        highlight=highlight,
    )
||||
|
||||
class VerticalAlignment(object):
    """String constants for run vertical alignment."""
    baseline = "baseline"
    superscript = "superscript"
    subscript = "subscript"
||||
|
||||
# Factory alias: text nodes are constructed directly from the class.
text = Text

# A single shared Tab instance: tabs carry no state, so one suffices.
_tab = Tab()

def tab():
    """Return the shared Tab element."""
    return _tab


# Factory alias: images are constructed directly from the class.
image = Image
||||
|
||||
def hyperlink(children, href=None, anchor=None, target_frame=None):
    """Create a Hyperlink around *children*; href or anchor gives the target."""
    return Hyperlink(
        children=children,
        href=href,
        anchor=anchor,
        target_frame=target_frame,
    )
||||
|
||||
|
||||
@cobble.data
class Bookmark(Element):
    """A named bookmark that hyperlink anchors can target."""
    name = cobble.field()


# Factory alias: bookmarks are constructed directly from the class.
bookmark = Bookmark
||||
|
||||
|
||||
def table(children, style_id=None, style_name=None):
    """Create a Table with the given rows and optional style."""
    return Table(style_id=style_id, style_name=style_name, children=children)
||||
|
||||
def table_row(children, is_header=None):
    """Create a TableRow; is_header is coerced to a bool."""
    header = bool(is_header)
    return TableRow(children=children, is_header=header)
||||
|
||||
def table_cell(children, colspan=None, rowspan=None):
    """Create a TableCell; column and row spans default to 1."""
    return TableCell(
        children=children,
        colspan=1 if colspan is None else colspan,
        rowspan=1 if rowspan is None else rowspan,
    )
||||
|
||||
def table_cell_unmerged(children, colspan, rowspan, vmerge):
    """Create a pre-merge TableCellUnmerged with explicit spans and merge state."""
    return TableCellUnmerged(
        children=children,
        colspan=colspan,
        rowspan=rowspan,
        vmerge=vmerge,
    )
||||
|
||||
def numbering_level(level_index, is_ordered):
    """Create a numbering level; the index is stored as a string, ordered as a bool."""
    return _NumberingLevel(str(level_index), bool(is_ordered))
||||
|
||||
@cobble.data
class _NumberingLevel(object):
    """A list-numbering level: its depth (as a string) and whether it is ordered."""
    level_index = cobble.field()
    is_ordered = cobble.field()
|
||||
@cobble.data
class Note(Element):
    """A footnote or endnote body."""
    # "footnote" or "endnote" -- presumably; confirm against notes_xml readers.
    note_type = cobble.field()
    note_id = cobble.field()
    body = cobble.field()


# Factory alias: notes are constructed directly from the class.
note = Note
||||
|
||||
|
||||
class Notes(object):
    """Lookup of notes keyed by (note_type, note_id) pairs."""

    def __init__(self, notes):
        self._notes = notes

    def find_note(self, note_type, note_id):
        """Return the note with the given type and id (raises KeyError if absent)."""
        return self._notes[(note_type, note_id)]

    def resolve(self, reference):
        """Return the note that a note reference points at."""
        return self.find_note(reference.note_type, reference.note_id)

    def __eq__(self, other):
        if not isinstance(other, Notes):
            return False
        return self._notes == other._notes

    def __ne__(self, other):
        return not (self == other)
||||
|
||||
def notes(notes_list):
    """Build a Notes lookup from a list of note elements."""
    indexed = {}
    for note in notes_list:
        indexed[_note_key(note)] = note
    return Notes(indexed)


def _note_key(note):
    """Key a note by its (type, id) pair."""
    return (note.note_type, note.note_id)
||||
|
||||
@cobble.data
class NoteReference(Element):
    """An in-text reference to a footnote or endnote."""
    note_type = cobble.field()
    note_id = cobble.field()


# Factory alias: note references are constructed directly from the class.
note_reference = NoteReference
||||
|
||||
|
||||
@cobble.data
class Comment(object):
    """A document comment with its author metadata."""
    comment_id = cobble.field()
    body = cobble.field()
    author_name = cobble.field()
    author_initials = cobble.field()
||||
|
||||
def comment(comment_id, body, author_name=None, author_initials=None):
    """Create a Comment; author details are optional."""
    return Comment(
        comment_id=comment_id,
        author_name=author_name,
        author_initials=author_initials,
        body=body,
    )
||||
|
||||
@cobble.data
class CommentReference(Element):
    """An in-text reference to a comment."""
    comment_id = cobble.field()


# Factory alias: comment references are constructed directly from the class.
comment_reference = CommentReference
||||
|
||||
def element_visitor(args):
    """Return a visitor base class over Element subclasses taking *args* extra arguments."""
    return cobble.visitor(Element, args=args)
||||
@@ -0,0 +1,211 @@
|
||||
from functools import partial
|
||||
import os
|
||||
|
||||
import cobble
|
||||
|
||||
from .. import results, lists, zips
|
||||
from .document_xml import read_document_xml_element
|
||||
from .content_types_xml import empty_content_types, read_content_types_xml_element
|
||||
from .relationships_xml import read_relationships_xml_element, Relationships
|
||||
from .numbering_xml import read_numbering_xml_element, Numbering
|
||||
from .styles_xml import read_styles_xml_element, Styles
|
||||
from .notes_xml import read_endnotes_xml_element, read_footnotes_xml_element
|
||||
from .comments_xml import read_comments_xml_element
|
||||
from .files import Files
|
||||
from . import body_xml, office_xml
|
||||
from ..zips import open_zip
|
||||
|
||||
|
||||
# Shared success result with an empty value, used for parts absent from the archive.
_empty_result = results.success([])
||||
|
||||
|
||||
def read(fileobj, external_file_access=False):
    """Read a .docx file object into a parsed document, returned as a result.

    Notes and comments are read first, then the main document part is read
    with those referents available for resolution.
    """
    zip_file = open_zip(fileobj, "r")
    part_paths = _find_part_paths(zip_file)
    read_part_with_body = _part_with_body_reader(
        # The original file name (when available) is used for resolving
        # external references -- presumably; confirm against the reader.
        getattr(fileobj, "name", None),
        zip_file,
        part_paths=part_paths,
        external_file_access=external_file_access,
    )

    return results.combine([
        _read_notes(read_part_with_body, part_paths),
        _read_comments(read_part_with_body, part_paths),
    ]).bind(lambda referents:
        _read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], part_paths=part_paths)
    )
||||
|
||||
|
||||
@cobble.data
class _PartPaths(object):
    # Zip entry paths of the parts of a .docx package that the reader
    # knows how to interpret.
    main_document = cobble.field()
    comments = cobble.field()
    endnotes = cobble.field()
    footnotes = cobble.field()
    numbering = cobble.field()
    styles = cobble.field()
|
||||
|
||||
|
||||
def _find_part_paths(zip_file):
    """Locate the zip entry paths of the parts the reader uses.

    Paths are resolved through the package and document relationships
    where possible, falling back to the conventional word/<name>.xml
    locations otherwise.
    """
    package_relationships = _read_relationships(zip_file, "_rels/.rels")
    document_filename = _find_document_filename(zip_file, package_relationships)

    document_relationships = _read_relationships(
        zip_file,
        _find_relationships_path_for(document_filename),
    )

    document_base_path = zips.split_path(document_filename)[0]

    def find_part(name):
        relationship_type = (
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/" + name
        )
        return _find_part_path(
            zip_file=zip_file,
            relationships=document_relationships,
            relationship_type=relationship_type,
            fallback_path="word/{0}.xml".format(name),
            base_path=document_base_path,
        )

    return _PartPaths(
        main_document=document_filename,
        comments=find_part("comments"),
        endnotes=find_part("endnotes"),
        footnotes=find_part("footnotes"),
        numbering=find_part("numbering"),
        styles=find_part("styles"),
    )
|
||||
|
||||
|
||||
def _find_document_filename(zip_file, relationships):
    """Return the entry name of the main document part.

    Raises IOError when the resolved part does not exist in the archive,
    which usually indicates the file is not a valid .docx.
    """
    path = _find_part_path(
        zip_file,
        relationships,
        relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
        base_path="",
        fallback_path="word/document.xml",
    )
    if not zip_file.exists(path):
        raise IOError("Could not find main document part. Are you sure this is a valid .docx file?")
    return path
|
||||
|
||||
|
||||
def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
    """Resolve the zip entry path for a part of the package.

    Returns the first relationship target of the given type (joined to
    ``base_path``) that actually exists in the archive, or
    ``fallback_path`` when none of the targets exist.
    """
    # Return as soon as an existing target is found rather than probing
    # every target and filtering afterwards; the result is the same
    # (first valid target wins) with fewer existence checks.
    for target in relationships.find_targets_by_type(relationship_type):
        candidate = zips.join_path(base_path, target).lstrip("/")
        if zip_file.exists(candidate):
            return candidate
    return fallback_path
|
||||
|
||||
|
||||
def _read_notes(read_part_with_body, part_paths):
    """Read footnotes and endnotes, flattened into a single list."""
    def read_note_part(path, read_notes_xml):
        return read_part_with_body(
            path,
            lambda root, body_reader: read_notes_xml(root, body_reader=body_reader),
            default=_empty_result,
        )

    footnotes = read_note_part(part_paths.footnotes, read_footnotes_xml_element)
    endnotes = read_note_part(part_paths.endnotes, read_endnotes_xml_element)
    return results.combine([footnotes, endnotes]).map(lists.flatten)
|
||||
|
||||
|
||||
def _read_comments(read_part_with_body, part_paths):
    """Read the comments part, defaulting to no comments when absent."""
    def read_comments_root(root, body_reader):
        return read_comments_xml_element(root, body_reader=body_reader)

    return read_part_with_body(
        part_paths.comments,
        read_comments_root,
        default=_empty_result,
    )
|
||||
|
||||
|
||||
def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
    """Read the main document part, resolving note and comment references.

    NOTE(review): ``zip_file`` is not used in this body — it appears to be
    kept for signature symmetry with the other readers; confirm before
    removing.
    """
    return read_part_with_body(
        part_paths.main_document,
        partial(
            read_document_xml_element,
            notes=notes,
            comments=comments,
        ),
    )
|
||||
|
||||
|
||||
def _part_with_body_reader(document_path, zip_file, part_paths, external_file_access):
    """Build a ``read_part(name, reader, default)`` function.

    Shared state (content types, styles, numbering, file access) is read
    once up front; each call to the returned function then reads one part
    with a body reader wired to that shared state plus the part's own
    relationships.
    """
    content_types = _try_read_entry_or_default(
        zip_file,
        "[Content_Types].xml",
        read_content_types_xml_element,
        empty_content_types,
    )

    styles = _try_read_entry_or_default(
        zip_file,
        part_paths.styles,
        read_styles_xml_element,
        Styles.EMPTY,
    )

    # Numbering definitions may reference styles, so styles must be read first.
    numbering = _try_read_entry_or_default(
        zip_file,
        part_paths.numbering,
        lambda element: read_numbering_xml_element(element, styles=styles),
        default=Numbering.EMPTY,
    )

    files = Files(
        None if document_path is None else os.path.dirname(document_path),
        external_file_access=external_file_access,
    )

    def read_part(name, reader, default=_undefined):
        # Each part has its own relationships part (e.g. word/_rels/document.xml.rels).
        relationships = _read_relationships(zip_file, _find_relationships_path_for(name))

        body_reader = body_xml.reader(
            numbering=numbering,
            content_types=content_types,
            relationships=relationships,
            styles=styles,
            docx_file=zip_file,
            files=files,
        )

        # _undefined distinguishes "no default supplied" (missing part is an
        # error) from an explicit default (missing part is tolerated).
        if default is _undefined:
            return _read_entry(zip_file, name, partial(reader, body_reader=body_reader))
        else:
            return _try_read_entry_or_default(zip_file, name, partial(reader, body_reader=body_reader), default=default)

    return read_part
|
||||
|
||||
|
||||
|
||||
def _find_relationships_path_for(name):
    """Return the relationships part path for a part name.

    For example, the relationships for word/document.xml live at
    word/_rels/document.xml.rels.
    """
    directory, filename = zips.split_path(name)
    return zips.join_path(directory, "_rels", filename + ".rels")
|
||||
|
||||
|
||||
def _read_relationships(zip_file, name):
    """Read a relationships part, or an empty set when the part is missing."""
    return _try_read_entry_or_default(
        zip_file,
        name,
        reader=read_relationships_xml_element,
        default=Relationships.EMPTY,
    )
|
||||
|
||||
def _try_read_entry_or_default(zip_file, name, reader, default):
    """Parse the named zip entry with ``reader``, or return ``default`` when absent."""
    if not zip_file.exists(name):
        return default
    return _read_entry(zip_file, name, reader)
|
||||
|
||||
|
||||
def _read_entry(zip_file, name, reader):
    """Open the named zip entry, parse it as office XML, and hand the root to ``reader``."""
    with zip_file.open(name) as fileobj:
        return reader(office_xml.read(fileobj))
|
||||
|
||||
|
||||
_undefined = object()
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,794 @@
|
||||
import contextlib
|
||||
import re
|
||||
import sys
|
||||
|
||||
from .. import documents
|
||||
from .. import results
|
||||
from .. import lists
|
||||
from .. import transforms
|
||||
from . import complex_fields
|
||||
from .dingbats import dingbats
|
||||
from .xmlparser import node_types, XmlElement, null_xml_element
|
||||
from .styles_xml import Styles
|
||||
from .uris import replace_fragment, uri_to_zip_entry_name
|
||||
|
||||
# Python 2/3 compatibility: on Python 3, chr already covers the full
# Unicode range, so alias unichr to it.
if sys.version_info >= (3, ):
    unichr = chr
|
||||
|
||||
|
||||
def reader(
    numbering=None,
    content_types=None,
    relationships=None,
    styles=None,
    docx_file=None,
    files=None
):
    """Create a body reader for document body XML.

    Missing styles default to an empty style set; all other collaborators
    are passed through to the underlying element reader unchanged.
    """
    effective_styles = Styles.EMPTY if styles is None else styles
    return _BodyReader(_create_reader(
        numbering=numbering,
        content_types=content_types,
        relationships=relationships,
        styles=effective_styles,
        docx_file=docx_file,
        files=files,
    ))
|
||||
|
||||
|
||||
|
||||
class _BodyReader(object):
    """Thin wrapper turning the internal _ReadResult into the public results.Result."""

    def __init__(self, read_all):
        # read_all: the closure returned by _create_reader.
        self._read_all = read_all

    def read_all(self, elements):
        """Read a sequence of XML elements into document elements plus messages."""
        result = self._read_all(elements)
        return results.Result(result.elements, result.messages)
|
||||
|
||||
|
||||
def _create_reader(numbering, content_types, relationships, styles, docx_file, files):
    """Create the element-reading closure for document body XML.

    Returns a function mapping a sequence of XML nodes to a _ReadResult.
    The nested handlers share mutable state: current_instr_text and
    complex_field_stack track in-progress complex fields (w:fldChar /
    w:instrText), and deleted_paragraph_contents carries the children of
    deleted paragraphs forward into the following paragraph.
    """
    current_instr_text = []
    complex_field_stack = []

    # When a paragraph is marked as deleted, its contents should be combined
    # with the following paragraph. See 17.13.5.15 del (Deleted Paragraph) of
    # ECMA-376 4th edition Part 1.
    deleted_paragraph_contents = []

    # Elements that are deliberately skipped without emitting a warning.
    _ignored_elements = set([
        "office-word:wrap",
        "v:shadow",
        "v:shapetype",
        "w:annotationRef",
        "w:bookmarkEnd",
        "w:sectPr",
        "w:proofErr",
        "w:lastRenderedPageBreak",
        "w:commentRangeStart",
        "w:commentRangeEnd",
        "w:del",
        "w:footnoteRef",
        "w:endnoteRef",
        "w:pPr",
        "w:rPr",
        "w:tblPr",
        "w:tblGrid",
        "w:trPr",
        "w:tcPr",
    ])

    def text(element):
        return _success(documents.Text(_inner_text(element)))

    def run(element):
        properties = element.find_child_or_null("w:rPr")
        vertical_alignment = properties \
            .find_child_or_null("w:vertAlign") \
            .attributes.get("w:val")
        font = properties.find_child_or_null("w:rFonts").attributes.get("w:ascii")

        font_size_string = properties.find_child_or_null("w:sz").attributes.get("w:val")
        if _is_int(font_size_string):
            # w:sz gives the font size in half points, so halve the value to get the size in points
            font_size = int(font_size_string) / 2
        else:
            font_size = None

        is_bold = read_boolean_element(properties.find_child("w:b"))
        is_italic = read_boolean_element(properties.find_child("w:i"))
        is_underline = read_underline_element(properties.find_child("w:u"))
        is_strikethrough = read_boolean_element(properties.find_child("w:strike"))
        is_all_caps = read_boolean_element(properties.find_child("w:caps"))
        is_small_caps = read_boolean_element(properties.find_child("w:smallCaps"))
        highlight = read_highlight_value(properties.find_child_or_null("w:highlight").attributes.get("w:val"))

        # A run inside an open complex-field hyperlink gets wrapped in that
        # hyperlink.
        def add_complex_field_hyperlink(children):
            hyperlink_kwargs = current_hyperlink_kwargs()
            if hyperlink_kwargs is None:
                return children
            else:
                return [documents.hyperlink(children=children, **hyperlink_kwargs)]

        return _ReadResult.map_results(
            _read_run_style(properties),
            _read_xml_elements(element.children).map(add_complex_field_hyperlink),
            lambda style, children: documents.run(
                children=children,
                style_id=style[0],
                style_name=style[1],
                is_bold=is_bold,
                is_italic=is_italic,
                is_underline=is_underline,
                is_strikethrough=is_strikethrough,
                is_all_caps=is_all_caps,
                is_small_caps=is_small_caps,
                vertical_alignment=vertical_alignment,
                font=font,
                font_size=font_size,
                highlight=highlight,
            ))

    def _read_run_style(properties):
        return _read_style(properties, "w:rStyle", "Run", styles.find_character_style_by_id)

    def read_boolean_element(element):
        if element is None:
            return False
        else:
            return read_boolean_attribute_value(element.attributes.get("w:val"))

    def read_boolean_attribute_value(value):
        # An absent w:val attribute (None) counts as true.
        return value not in ["false", "0"]

    def read_underline_element(element):
        return element and element.attributes.get("w:val") not in [None, "false", "0", "none"]

    def read_highlight_value(value):
        if not value or value == "none":
            return None
        else:
            return value

    def paragraph(element):
        properties = element.find_child_or_null("w:pPr")

        is_deleted = properties.find_child_or_null("w:rPr").find_child("w:del")

        if is_deleted is not None:
            # Stash the children; they are prepended to the next paragraph.
            for child in element.children:
                deleted_paragraph_contents.append(child)
            return _empty_result

        else:
            alignment = properties.find_child_or_null("w:jc").attributes.get("w:val")
            indent = _read_paragraph_indent(properties.find_child_or_null("w:ind"))

            children_xml = element.children
            if deleted_paragraph_contents:
                children_xml = deleted_paragraph_contents + children_xml
                del deleted_paragraph_contents[:]

            return _ReadResult.map_results(
                _read_paragraph_style(properties),
                _read_xml_elements(children_xml),
                lambda style, children: documents.paragraph(
                    children=children,
                    style_id=style[0],
                    style_name=style[1],
                    numbering=_read_numbering_properties(
                        paragraph_style_id=style[0],
                        element=properties.find_child_or_null("w:numPr"),
                    ),
                    alignment=alignment,
                    indent=indent,
                )).append_extra()

    def _read_paragraph_style(properties):
        return _read_style(properties, "w:pStyle", "Paragraph", styles.find_paragraph_style_by_id)

    def current_hyperlink_kwargs():
        # Innermost open hyperlink complex field wins.
        for complex_field in reversed(complex_field_stack):
            if isinstance(complex_field, complex_fields.Hyperlink):
                return complex_field.kwargs

        return None

    def read_fld_char(element):
        fld_char_type = element.attributes.get("w:fldCharType")
        if fld_char_type == "begin":
            complex_field_stack.append(complex_fields.begin(fld_char=element))
            del current_instr_text[:]

        elif fld_char_type == "end":
            complex_field = complex_field_stack.pop()
            if isinstance(complex_field, complex_fields.Begin):
                complex_field = parse_current_instr_text(complex_field)

            if isinstance(complex_field, complex_fields.Checkbox):
                return _success(documents.checkbox(checked=complex_field.checked))

        elif fld_char_type == "separate":
            complex_field_separate = complex_field_stack.pop()
            complex_field = parse_current_instr_text(complex_field_separate)
            complex_field_stack.append(complex_field)

        return _empty_result

    def parse_current_instr_text(complex_field):
        instr_text = "".join(current_instr_text)

        if isinstance(complex_field, complex_fields.Begin):
            fld_char = complex_field.fld_char
        else:
            fld_char = null_xml_element

        return parse_instr_text(instr_text, fld_char=fld_char)

    def parse_instr_text(instr_text, *, fld_char):
        external_link_result = re.match(r'\s*HYPERLINK "(.*)"', instr_text)
        if external_link_result is not None:
            return complex_fields.hyperlink(dict(href=external_link_result.group(1)))

        internal_link_result = re.match(r'\s*HYPERLINK\s+\\l\s+"(.*)"', instr_text)
        if internal_link_result is not None:
            return complex_fields.hyperlink(dict(anchor=internal_link_result.group(1)))

        checkbox_result = re.match(r'\s*FORMCHECKBOX\s*', instr_text)
        if checkbox_result is not None:
            checkbox_element = fld_char \
                .find_child_or_null("w:ffData") \
                .find_child_or_null("w:checkBox")
            checked_element = checkbox_element.find_child("w:checked")

            if checked_element is None:
                checked = read_boolean_element(checkbox_element.find_child("w:default"))
            else:
                checked = read_boolean_element(checked_element)

            return complex_fields.checkbox(checked=checked)

        return None

    def read_instr_text(element):
        current_instr_text.append(_inner_text(element))
        return _empty_result

    def _read_style(properties, style_tag_name, style_type, find_style_by_id):
        """Resolve a style reference to [style_id, style_name], warning when undefined."""
        messages = []
        style_id = properties \
            .find_child_or_null(style_tag_name) \
            .attributes.get("w:val")

        if style_id is None:
            style_name = None
        else:
            style = find_style_by_id(style_id)
            if style is None:
                style_name = None
                messages.append(_undefined_style_warning(style_type, style_id))
            else:
                style_name = style.name

        return _ReadResult([style_id, style_name], [], messages)

    def _undefined_style_warning(style_type, style_id):
        return results.warning("{0} style with ID {1} was referenced but not defined in the document".format(style_type, style_id))

    def _read_numbering_properties(paragraph_style_id, element):
        num_id = element.find_child_or_null("w:numId").attributes.get("w:val")
        level_index = element.find_child_or_null("w:ilvl").attributes.get("w:val")
        if num_id is not None and level_index is not None:
            return numbering.find_level(num_id, level_index)

        if paragraph_style_id is not None:
            level = numbering.find_level_by_paragraph_style_id(paragraph_style_id)
            if level is not None:
                return level

        # Some malformed documents define numbering levels without an index, and
        # reference the numbering using a w:numPr element without a w:ilvl child.
        # To handle such cases, we assume a level of 0 as a fallback.
        if num_id is not None:
            return numbering.find_level(num_id, "0")

        return None

    def _read_paragraph_indent(element):
        attributes = element.attributes
        return documents.paragraph_indent(
            start=attributes.get("w:start") or attributes.get("w:left"),
            end=attributes.get("w:end") or attributes.get("w:right"),
            first_line=attributes.get("w:firstLine"),
            hanging=attributes.get("w:hanging"),
        )

    def tab(element):
        return _success(documents.tab())

    def no_break_hyphen(element):
        # U+2011 NON-BREAKING HYPHEN
        return _success(documents.text(unichr(0x2011)))

    def soft_hyphen(element):
        return _success(documents.text(u"\u00ad"))

    def symbol(element):
        # See 17.3.3.30 sym (Symbol Character) of ECMA-376 4th edition Part 1
        font = element.attributes.get("w:font")
        char = element.attributes.get("w:char")

        unicode_code_point = dingbats.get((font, int(char, 16)))

        if unicode_code_point is None and re.match("^F0..", char):
            unicode_code_point = dingbats.get((font, int(char[2:], 16)))

        if unicode_code_point is None:
            warning = results.warning("A w:sym element with an unsupported character was ignored: char {0} in font {1}".format(
                char,
                font,
            ))
            return _empty_result_with_message(warning)
        else:
            return _success(documents.text(unichr(unicode_code_point)))

    def table(element):
        properties = element.find_child_or_null("w:tblPr")
        return _ReadResult.map_results(
            read_table_style(properties),
            _read_xml_elements(element.children)
                .flat_map(calculate_row_spans),

            lambda style, children: documents.table(
                children=children,
                style_id=style[0],
                style_name=style[1],
            ),
        )

    def read_table_style(properties):
        return _read_style(properties, "w:tblStyle", "Table", styles.find_table_style_by_id)

    def table_row(element):
        properties = element.find_child_or_null("w:trPr")

        # See 17.13.5.12 del (Deleted Table Row) of ECMA-376 4th edition Part 1
        is_deleted = bool(properties.find_child("w:del"))
        if is_deleted:
            return _empty_result

        is_header = bool(properties.find_child("w:tblHeader"))
        return _read_xml_elements(element.children) \
            .map(lambda children: documents.table_row(
                children=children,
                is_header=is_header,
            ))

    def table_cell(element):
        properties = element.find_child_or_null("w:tcPr")
        gridspan = properties \
            .find_child_or_null("w:gridSpan") \
            .attributes.get("w:val")

        if gridspan is None:
            colspan = 1
        else:
            colspan = int(gridspan)

        return _read_xml_elements(element.children) \
            .map(lambda children: documents.table_cell_unmerged(
                children=children,
                colspan=colspan,
                rowspan=1,
                vmerge=read_vmerge(properties),
            ))

    def read_vmerge(properties):
        vmerge_element = properties.find_child("w:vMerge")
        if vmerge_element is None:
            return False
        else:
            # A w:vMerge without w:val (or with "continue") continues the
            # merge started above.
            val = vmerge_element.attributes.get("w:val")
            return val == "continue" or not val

    def calculate_row_spans(rows):
        """Fold vertically merged cells into rowspans on the merge-starting cell."""
        unexpected_non_rows = any(
            not isinstance(row, documents.TableRow)
            for row in rows
        )
        if unexpected_non_rows:
            rows = remove_unmerged_table_cells(rows)
            return _elements_result_with_messages(rows, [results.warning(
                "unexpected non-row element in table, cell merging may be incorrect"
            )])

        unexpected_non_cells = any(
            not isinstance(cell, documents.TableCellUnmerged)
            for row in rows
            for cell in row.children
        )
        if unexpected_non_cells:
            rows = remove_unmerged_table_cells(rows)
            return _elements_result_with_messages(rows, [results.warning(
                "unexpected non-cell element in table row, cell merging may be incorrect"
            )])

        # columns maps a starting column index to the cell that opened the
        # vertical merge in that column.
        columns = {}
        for row in rows:
            cell_index = 0
            for cell in row.children:
                if cell.vmerge and cell_index in columns:
                    columns[cell_index].rowspan += 1
                else:
                    columns[cell_index] = cell
                    cell.vmerge = False
                cell_index += cell.colspan

        for row in rows:
            row.children = [
                documents.table_cell(
                    children=cell.children,
                    colspan=cell.colspan,
                    rowspan=cell.rowspan,
                )
                for cell in row.children
                if not cell.vmerge
            ]

        return _success(rows)

    def remove_unmerged_table_cells(rows):
        return list(map(
            transforms.element_of_type(
                documents.TableCellUnmerged,
                lambda cell: documents.table_cell(
                    children=cell.children,
                    colspan=cell.colspan,
                    rowspan=cell.rowspan,
                ),
            ),
            rows,
        ))

    def read_child_elements(element):
        return _read_xml_elements(element.children)

    def pict(element):
        # Contents of w:pict are treated as "extra" elements appended after
        # the enclosing paragraph (see _ReadResult.to_extra/append_extra).
        return read_child_elements(element).to_extra()

    def hyperlink(element):
        relationship_id = element.attributes.get("r:id")
        anchor = element.attributes.get("w:anchor")
        target_frame = element.attributes.get("w:tgtFrame") or None
        children_result = _read_xml_elements(element.children)

        def create(**kwargs):
            return children_result.map(lambda children: documents.hyperlink(
                children=children,
                target_frame=target_frame,
                **kwargs
            ))

        if relationship_id is not None:
            href = relationships.find_target_by_relationship_id(relationship_id)
            if anchor is not None:
                href = replace_fragment(href, anchor)

            return create(href=href)
        elif anchor is not None:
            return create(anchor=anchor)
        else:
            # No target at all: keep the children without a hyperlink wrapper.
            return children_result

    def bookmark_start(element):
        name = element.attributes.get("w:name")
        # _GoBack is Word's internal "return to last edit" bookmark.
        if name == "_GoBack":
            return _empty_result
        else:
            return _success(documents.bookmark(name))

    def break_(element):
        break_type = element.attributes.get("w:type")

        if not break_type or break_type == "textWrapping":
            return _success(documents.line_break)
        elif break_type == "page":
            return _success(documents.page_break)
        elif break_type == "column":
            return _success(documents.column_break)
        else:
            warning = results.warning("Unsupported break type: {0}".format(break_type))
            return _empty_result_with_message(warning)

    def inline(element):
        properties = element.find_child_or_null("wp:docPr").attributes
        # Prefer a non-blank description over the title for alt text.
        if properties.get("descr", "").strip():
            alt_text = properties.get("descr")
        else:
            alt_text = properties.get("title")
        blips = element.find_children("a:graphic") \
            .find_children("a:graphicData") \
            .find_children("pic:pic") \
            .find_children("pic:blipFill") \
            .find_children("a:blip")
        return _read_blips(blips, alt_text)

    def _read_blips(blips, alt_text):
        return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text), blips))

    def _read_blip(element, alt_text):
        blip_image = _find_blip_image(element)

        if blip_image is None:
            warning = results.warning("Could not find image file for a:blip element")
            return _empty_result_with_message(warning)
        else:
            return _read_image(blip_image, alt_text)

    def _read_image(image_file, alt_text):
        # image_file is a pair of (path, zero-argument opener).
        image_path, open_image = image_file
        content_type = content_types.find_content_type(image_path)
        image = documents.image(alt_text=alt_text, content_type=content_type, open=open_image)

        if content_type in ["image/png", "image/gif", "image/jpeg", "image/svg+xml", "image/tiff"]:
            messages = []
        else:
            messages = [results.warning("Image of type {0} is unlikely to display in web browsers".format(content_type))]

        return _element_result_with_messages(image, messages)

    def _find_blip_image(element):
        embed_relationship_id = element.attributes.get("r:embed")
        link_relationship_id = element.attributes.get("r:link")
        if embed_relationship_id is not None:
            return _find_embedded_image(embed_relationship_id)
        elif link_relationship_id is not None:
            return _find_linked_image(link_relationship_id)
        else:
            return None

    def _find_embedded_image(relationship_id):
        target = relationships.find_target_by_relationship_id(relationship_id)
        image_path = uri_to_zip_entry_name("word", target)

        def open_image():
            image_file = docx_file.open(image_path)
            # Ensure the returned object is usable as a context manager.
            if hasattr(image_file, "__exit__"):
                return image_file
            else:
                return contextlib.closing(image_file)

        return image_path, open_image

    def _find_linked_image(relationship_id):
        image_path = relationships.find_target_by_relationship_id(relationship_id)

        def open_image():
            return files.open(image_path)

        return image_path, open_image

    def read_imagedata(element):
        relationship_id = element.attributes.get("r:id")
        if relationship_id is None:
            warning = results.warning("A v:imagedata element without a relationship ID was ignored")
            return _empty_result_with_message(warning)
        else:
            title = element.attributes.get("o:title")
            return _read_image(_find_embedded_image(relationship_id), title)

    def note_reference_reader(note_type):
        def note_reference(element):
            return _success(documents.note_reference(note_type, element.attributes["w:id"]))

        return note_reference

    def read_comment_reference(element):
        return _success(documents.comment_reference(element.attributes["w:id"]))

    def alternate_content(element):
        return read_child_elements(element.find_child_or_null("mc:Fallback"))

    def read_sdt(element):
        content_result = read_child_elements(element.find_child_or_null("w:sdtContent"))

        def handle_content(content):
            # From the WordML standard: https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/3350cb64-931f-41f7-8824-f18b2568ce66
            #
            # > A CT_SdtCheckbox element that specifies that the parent
            # > structured document tag is a checkbox when displayed in the
            # > document. The parent structured document tag contents MUST
            # > contain a single character and optionally an additional
            # > character in a deleted run.
            checkbox = element.find_child_or_null("w:sdtPr").find_child("wordml:checkbox")

            if checkbox is None:
                return content

            checked_element = checkbox.find_child("wordml:checked")
            is_checked = (
                checked_element is not None and
                read_boolean_attribute_value(checked_element.attributes.get("wordml:val"))
            )
            document_checkbox = documents.checkbox(checked=is_checked)

            # Replace the first non-empty text element with the checkbox.
            has_checkbox = False

            def transform_text(text):
                nonlocal has_checkbox
                if len(text.value) > 0 and not has_checkbox:
                    has_checkbox = True
                    return document_checkbox
                else:
                    return text

            replaced_content = list(map(
                transforms.element_of_type(documents.Text, transform_text),
                content,
            ))

            if has_checkbox:
                return replaced_content
            else:
                return document_checkbox

        return content_result.map(handle_content)

    # Dispatch table from XML element name to handler.
    handlers = {
        "w:t": text,
        "w:r": run,
        "w:p": paragraph,
        "w:fldChar": read_fld_char,
        "w:instrText": read_instr_text,
        "w:tab": tab,
        "w:noBreakHyphen": no_break_hyphen,
        "w:softHyphen": soft_hyphen,
        "w:sym": symbol,
        "w:tbl": table,
        "w:tr": table_row,
        "w:tc": table_cell,
        "w:ins": read_child_elements,
        "w:object": read_child_elements,
        "w:smartTag": read_child_elements,
        "w:drawing": read_child_elements,
        "v:group": read_child_elements,
        "v:rect": read_child_elements,
        "v:roundrect": read_child_elements,
        "v:shape": read_child_elements,
        "v:textbox": read_child_elements,
        "w:txbxContent": read_child_elements,
        "w:pict": pict,
        "w:hyperlink": hyperlink,
        "w:bookmarkStart": bookmark_start,
        "w:br": break_,
        "wp:inline": inline,
        "wp:anchor": inline,
        "v:imagedata": read_imagedata,
        "w:footnoteReference": note_reference_reader("footnote"),
        "w:endnoteReference": note_reference_reader("endnote"),
        "w:commentReference": read_comment_reference,
        "mc:AlternateContent": alternate_content,
        "w:sdt": read_sdt
    }

    def read(element):
        handler = handlers.get(element.name)
        if handler is None:
            if element.name not in _ignored_elements:
                warning = results.warning("An unrecognised element was ignored: {0}".format(element.name))
                return _empty_result_with_message(warning)
            else:
                return _empty_result
        else:
            return handler(element)

    def _read_xml_elements(nodes):
        # Non-element nodes (e.g. text nodes at this level) are skipped.
        elements = filter(lambda node: isinstance(node, XmlElement), nodes)
        return _ReadResult.concat(lists.map(read, elements))

    return _read_xml_elements
|
||||
|
||||
|
||||
def _inner_text(node):
    """Return the concatenated text of a node and all of its descendants."""
    if node.node_type != node_types.text:
        return "".join(_inner_text(child) for child in node.children)
    return node.value
|
||||
|
||||
|
||||
|
||||
class _ReadResult(object):
    """Accumulated outcome of reading XML elements.

    elements: document elements produced in place.
    extra: elements to be appended after the enclosing element
        (see to_extra/append_extra).
    messages: warnings collected while reading.
    """

    @staticmethod
    def concat(results):
        """Concatenate several results into one, preserving order."""
        return _ReadResult(
            lists.flat_map(lambda result: result.elements, results),
            lists.flat_map(lambda result: result.extra, results),
            lists.flat_map(lambda result: result.messages, results))


    @staticmethod
    def map_results(first, second, func):
        """Combine two results into one whose single element is func(a, b)."""
        return _ReadResult(
            [func(first.elements, second.elements)],
            first.extra + second.extra,
            first.messages + second.messages)

    def __init__(self, elements, extra, messages):
        self.elements = elements
        self.extra = extra
        self.messages = messages

    def map(self, func):
        """Apply func to the elements, wrapping a non-list return in a list."""
        elements = func(self.elements)
        if not isinstance(elements, list):
            elements = [elements]
        return _ReadResult(
            elements,
            self.extra,
            self.messages)

    def flat_map(self, func):
        """Apply func (returning a _ReadResult) and merge extras and messages."""
        result = func(self.elements)
        return _ReadResult(
            result.elements,
            self.extra + result.extra,
            self.messages + result.messages)


    def to_extra(self):
        """Move all elements into extra, leaving no in-place elements."""
        return _ReadResult([], _concat(self.extra, self.elements), self.messages)

    def append_extra(self):
        """Flush extra by appending it after the in-place elements."""
        return _ReadResult(_concat(self.elements, self.extra), [], self.messages)
|
||||
|
||||
def _success(elements):
    """Wrap elements in a _ReadResult with no extra elements or messages."""
    wrapped = elements if isinstance(elements, list) else [elements]
    return _ReadResult(wrapped, [], [])
|
||||
|
||||
def _element_result_with_messages(element, messages):
|
||||
return _elements_result_with_messages([element], messages)
|
||||
|
||||
def _elements_result_with_messages(elements, messages):
|
||||
return _ReadResult(elements, [], messages)
|
||||
|
||||
_empty_result = _ReadResult([], [], [])
|
||||
|
||||
def _empty_result_with_message(message):
|
||||
return _ReadResult([], [], [message])
|
||||
|
||||
def _concat(*values):
|
||||
result = []
|
||||
for value in values:
|
||||
for element in value:
|
||||
result.append(element)
|
||||
return result
|
||||
|
||||
|
||||
def _is_int(value):
|
||||
if value is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
int(value)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return True
|
||||
@@ -0,0 +1,24 @@
|
||||
from .. import lists
|
||||
from .. import documents
|
||||
from .. import results
|
||||
|
||||
|
||||
def read_comments_xml_element(element, body_reader):
    """Read comments.xml's root element into a combined result of comments."""
    def read_comment(comment_element):
        # Empty or whitespace-only attributes are treated as absent.
        def optional_attribute(name):
            return comment_element.attributes.get(name, "").strip() or None

        return body_reader.read_all(comment_element.children).map(lambda body:
            documents.comment(
                comment_id=comment_element.attributes["w:id"],
                body=body,
                author_name=optional_attribute("w:author"),
                author_initials=optional_attribute("w:initials"),
            ))

    comment_elements = element.find_children("w:comment")
    return results.combine(lists.map(read_comment, comment_elements))
|
||||
@@ -0,0 +1,29 @@
|
||||
class unknown(object):
|
||||
pass
|
||||
|
||||
|
||||
class Begin:
|
||||
def __init__(self, *, fld_char):
|
||||
self.fld_char = fld_char
|
||||
|
||||
|
||||
def begin(*, fld_char):
|
||||
return Begin(fld_char=fld_char)
|
||||
|
||||
|
||||
class Hyperlink(object):
    """Marker for a HYPERLINK field, holding the parsed field arguments."""

    def __init__(self, kwargs):
        self.kwargs = kwargs


def hyperlink(kwargs):
    """Create a Hyperlink marker from the parsed field arguments."""
    return Hyperlink(kwargs)
|
||||
|
||||
|
||||
class Checkbox:
    """Marker for a checkbox field and its checked state."""

    def __init__(self, *, checked):
        self.checked = checked


def checkbox(*, checked):
    """Create a Checkbox marker with the given checked state."""
    return Checkbox(checked=checked)
|
||||
@@ -0,0 +1,58 @@
|
||||
def read_content_types_xml_element(element):
    """Parse [Content_Types].xml into a _ContentTypes lookup table."""
    extension_defaults = dict(
        _read_default(default_element)
        for default_element in element.find_children("content-types:Default")
    )
    overrides = dict(
        _read_override(override_element)
        for override_element in element.find_children("content-types:Override")
    )
    return _ContentTypes(extension_defaults, overrides)
|
||||
|
||||
|
||||
def _read_default(element):
|
||||
extension = element.attributes["Extension"]
|
||||
content_type = element.attributes["ContentType"]
|
||||
return extension, content_type
|
||||
|
||||
|
||||
def _read_override(element):
|
||||
part_name = element.attributes["PartName"]
|
||||
content_type = element.attributes["ContentType"]
|
||||
return part_name.lstrip("/"), content_type
|
||||
|
||||
|
||||
class _ContentTypes(object):
|
||||
_image_content_types = {
|
||||
"png": "png",
|
||||
"gif": "gif",
|
||||
"jpeg": "jpeg",
|
||||
"jpg": "jpeg",
|
||||
"tif": "tiff",
|
||||
"tiff": "tiff",
|
||||
"bmp": "bmp",
|
||||
}
|
||||
|
||||
def __init__(self, extension_defaults, overrides):
|
||||
self._extension_defaults = extension_defaults
|
||||
self._overrides = overrides
|
||||
|
||||
def find_content_type(self, path):
|
||||
if path in self._overrides:
|
||||
return self._overrides[path]
|
||||
|
||||
extension = _get_extension(path)
|
||||
default_type = self._extension_defaults.get(extension)
|
||||
if default_type is not None:
|
||||
return default_type
|
||||
|
||||
image_type = self._image_content_types.get(extension.lower())
|
||||
if image_type is not None:
|
||||
return "image/" + image_type
|
||||
|
||||
return None
|
||||
|
||||
empty_content_types = _ContentTypes({}, {})
|
||||
|
||||
def _get_extension(path):
|
||||
return path.rpartition(".")[2]
|
||||
1065
path/to/venv/lib/python3.12/site-packages/mammoth/docx/dingbats.py
Normal file
1065
path/to/venv/lib/python3.12/site-packages/mammoth/docx/dingbats.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,25 @@
|
||||
from .. import documents
|
||||
|
||||
|
||||
def read_document_xml_element(
        element,
        body_reader,
        notes=None,
        comments=None):
    """Read document.xml's root element into a document value.

    Raises ValueError when there is no w:body element, which indicates
    the file is not a valid docx document.
    """
    if notes is None:
        notes = []
    if comments is None:
        comments = []

    body_element = element.find_child("w:body")
    if body_element is None:
        raise ValueError("Could not find the body element: are you sure this is a docx file?")

    return body_reader.read_all(body_element.children).map(
        lambda children: documents.document(
            children,
            notes=documents.notes(notes),
            comments=comments,
        ))
|
||||
@@ -0,0 +1,46 @@
|
||||
import os
|
||||
import contextlib
|
||||
try:
|
||||
from urllib2 import urlopen
|
||||
except ImportError:
|
||||
from urllib.request import urlopen
|
||||
try:
|
||||
from urllib.parse import urlparse
|
||||
except ImportError:
|
||||
from urlparse import urlparse
|
||||
|
||||
|
||||
class Files(object):
    """Opens externally-referenced files, relative to the document's directory."""

    def __init__(self, base, external_file_access):
        self._base = base
        self._external_file_access = external_file_access

    def open(self, uri):
        """Open uri and return a readable, closeable file-like object.

        Raises ExternalFileAccessIsDisabledError when external access is
        disabled, and InvalidFileReferenceError when the file cannot be
        found or opened.
        """
        if not self._external_file_access:
            raise ExternalFileAccessIsDisabledError(
                "could not open external image '{0}', external file access is disabled".format(uri)
            )

        try:
            if _is_absolute(uri):
                # Ensure the URL handle is closed when the caller is done.
                return contextlib.closing(urlopen(uri))
            if self._base is not None:
                return open(os.path.join(self._base, uri), "rb")
            raise InvalidFileReferenceError("could not find external image '{0}', fileobj has no name".format(uri))
        except IOError as error:
            message = "could not open external image: '{0}' (document directory: '{1}')\n{2}".format(
                uri, self._base, str(error))
            raise InvalidFileReferenceError(message)
|
||||
|
||||
|
||||
def _is_absolute(url):
|
||||
return urlparse(url).scheme != ""
|
||||
|
||||
|
||||
class InvalidFileReferenceError(ValueError):
|
||||
pass
|
||||
|
||||
|
||||
class ExternalFileAccessIsDisabledError(InvalidFileReferenceError):
|
||||
pass
|
||||
@@ -0,0 +1,32 @@
|
||||
import functools
|
||||
|
||||
from .. import lists
|
||||
from .. import documents
|
||||
from .. import results
|
||||
|
||||
|
||||
def _read_notes(note_type, element, body_reader):
|
||||
def read_notes_xml_element(element):
|
||||
note_elements = lists.filter(
|
||||
_is_note_element,
|
||||
element.find_children("w:" + note_type),
|
||||
)
|
||||
return results.combine(lists.map(_read_note_element, note_elements))
|
||||
|
||||
|
||||
def _is_note_element(element):
|
||||
return element.attributes.get("w:type") not in ["continuationSeparator", "separator"]
|
||||
|
||||
|
||||
def _read_note_element(element):
|
||||
return body_reader.read_all(element.children).map(lambda body:
|
||||
documents.note(
|
||||
note_type=note_type,
|
||||
note_id=element.attributes["w:id"],
|
||||
body=body
|
||||
))
|
||||
|
||||
return read_notes_xml_element(element)
|
||||
|
||||
read_footnotes_xml_element = functools.partial(_read_notes, "footnote")
|
||||
read_endnotes_xml_element = functools.partial(_read_notes, "endnote")
|
||||
@@ -0,0 +1,130 @@
|
||||
import cobble
|
||||
|
||||
from ..documents import numbering_level
|
||||
from .styles_xml import Styles
|
||||
|
||||
|
||||
def read_numbering_xml_element(element, styles):
|
||||
abstract_nums = _read_abstract_nums(element)
|
||||
nums = _read_nums(element)
|
||||
return Numbering(abstract_nums=abstract_nums, nums=nums, styles=styles)
|
||||
|
||||
|
||||
def _read_abstract_nums(element):
|
||||
abstract_num_elements = element.find_children("w:abstractNum")
|
||||
return dict(map(_read_abstract_num, abstract_num_elements))
|
||||
|
||||
|
||||
def _read_abstract_num(element):
|
||||
abstract_num_id = element.attributes.get("w:abstractNumId")
|
||||
levels = _read_abstract_num_levels(element)
|
||||
num_style_link = element.find_child_or_null("w:numStyleLink").attributes.get("w:val")
|
||||
return abstract_num_id, _AbstractNum(levels=levels, num_style_link=num_style_link)
|
||||
|
||||
|
||||
@cobble.data
|
||||
class _AbstractNum(object):
|
||||
levels = cobble.field()
|
||||
num_style_link = cobble.field()
|
||||
|
||||
|
||||
@cobble.data
|
||||
class _AbstractNumLevel(object):
|
||||
level_index = cobble.field()
|
||||
is_ordered = cobble.field()
|
||||
paragraph_style_id = cobble.field()
|
||||
|
||||
|
||||
def _read_abstract_num_levels(element):
|
||||
levels = {}
|
||||
|
||||
# Some malformed documents define numbering levels without an index, and
|
||||
# reference the numbering using a w:numPr element without a w:ilvl child.
|
||||
# To handle such cases, we assume a level of 0 as a fallback.
|
||||
level_without_index = None
|
||||
|
||||
for level_element in element.find_children("w:lvl"):
|
||||
level = _read_abstract_num_level(level_element)
|
||||
if level.level_index is None:
|
||||
level.level_index = "0"
|
||||
level_without_index = level
|
||||
else:
|
||||
levels[level.level_index] = level
|
||||
|
||||
if level_without_index is not None and level_without_index.level_index not in levels:
|
||||
levels[level_without_index.level_index] = level_without_index
|
||||
|
||||
return levels
|
||||
|
||||
|
||||
def _read_abstract_num_level(element):
|
||||
level_index = element.attributes.get("w:ilvl")
|
||||
num_fmt = element.find_child_or_null("w:numFmt").attributes.get("w:val")
|
||||
is_ordered = num_fmt != "bullet"
|
||||
paragraph_style_id = element.find_child_or_null("w:pStyle").attributes.get("w:val")
|
||||
return _AbstractNumLevel(
|
||||
level_index=level_index,
|
||||
is_ordered=is_ordered,
|
||||
paragraph_style_id=paragraph_style_id,
|
||||
)
|
||||
|
||||
|
||||
def _read_nums(element):
|
||||
num_elements = element.find_children("w:num")
|
||||
return dict(
|
||||
_read_num(num_element)
|
||||
for num_element in num_elements
|
||||
)
|
||||
|
||||
|
||||
def _read_num(element):
|
||||
num_id = element.attributes.get("w:numId")
|
||||
abstract_num_id = element.find_child_or_null("w:abstractNumId").attributes["w:val"]
|
||||
return num_id, _Num(abstract_num_id=abstract_num_id)
|
||||
|
||||
|
||||
@cobble.data
|
||||
class _Num(object):
|
||||
abstract_num_id = cobble.field()
|
||||
|
||||
|
||||
class Numbering(object):
    """Resolves numbering (list) definitions parsed from numbering.xml."""

    def __init__(self, abstract_nums, nums, styles):
        self._abstract_nums = abstract_nums
        # Index levels by the paragraph style that references them, so that
        # paragraphs styled as list items can be resolved without a w:numPr.
        self._levels_by_paragraph_style_id = dict(
            (level.paragraph_style_id, self._to_numbering_level(level))
            for abstract_num in abstract_nums.values()
            for level in abstract_num.levels.values()
            if level.paragraph_style_id is not None
        )
        self._nums = nums
        self._styles = styles

    def find_level(self, num_id, level):
        """Return the numbering level for (num_id, level), or None if undefined.

        Follows w:numStyleLink indirection through the styles part.
        """
        num = self._nums.get(num_id)
        if num is None:
            return None

        abstract_num = self._abstract_nums.get(num.abstract_num_id)
        if abstract_num is None:
            return None
        elif abstract_num.num_style_link is None:
            return self._to_numbering_level(abstract_num.levels.get(level))
        else:
            style = self._styles.find_numbering_style_by_id(abstract_num.num_style_link)
            # Malformed documents may link to a numbering style that is not
            # defined in styles.xml; treat the dangling reference as
            # unnumbered instead of raising AttributeError.
            if style is None:
                return None
            return self.find_level(style.num_id, level)

    def find_level_by_paragraph_style_id(self, style_id):
        """Return the numbering level associated with a paragraph style, if any."""
        return self._levels_by_paragraph_style_id.get(style_id)

    def _to_numbering_level(self, abstract_num_level):
        # None propagates so missing levels resolve to "no numbering".
        if abstract_num_level is None:
            return None
        return numbering_level(
            level_index=abstract_num_level.level_index,
            is_ordered=abstract_num_level.is_ordered,
        )
|
||||
|
||||
|
||||
Numbering.EMPTY = Numbering(abstract_nums={}, nums={}, styles=Styles.EMPTY)
|
||||
@@ -0,0 +1,45 @@
|
||||
from ..lists import flat_map
|
||||
from .xmlparser import parse_xml, XmlElement
|
||||
|
||||
|
||||
_namespaces = [
|
||||
# Transitional format
|
||||
("w", "http://schemas.openxmlformats.org/wordprocessingml/2006/main"),
|
||||
("r", "http://schemas.openxmlformats.org/officeDocument/2006/relationships"),
|
||||
("wp", "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"),
|
||||
("a", "http://schemas.openxmlformats.org/drawingml/2006/main"),
|
||||
("pic", "http://schemas.openxmlformats.org/drawingml/2006/picture"),
|
||||
|
||||
# Strict format
|
||||
("w", "http://purl.oclc.org/ooxml/wordprocessingml/main"),
|
||||
("r", "http://purl.oclc.org/ooxml/officeDocument/relationships"),
|
||||
("wp", "http://purl.oclc.org/ooxml/drawingml/wordprocessingDrawing"),
|
||||
("a", "http://purl.oclc.org/ooxml/drawingml/main"),
|
||||
("pic", "http://purl.oclc.org/ooxml/drawingml/picture"),
|
||||
|
||||
# Common
|
||||
("content-types", "http://schemas.openxmlformats.org/package/2006/content-types"),
|
||||
("relationships", "http://schemas.openxmlformats.org/package/2006/relationships"),
|
||||
("mc", "http://schemas.openxmlformats.org/markup-compatibility/2006"),
|
||||
("v", "urn:schemas-microsoft-com:vml"),
|
||||
("office-word", "urn:schemas-microsoft-com:office:word"),
|
||||
|
||||
# [MS-DOCX]: Word Extensions to the Office Open XML (.docx) File Format
|
||||
# https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/b839fe1f-e1ca-4fa6-8c26-5954d0abbccd
|
||||
("wordml", "http://schemas.microsoft.com/office/word/2010/wordml"),
|
||||
]
|
||||
|
||||
|
||||
def read(fileobj):
|
||||
return _collapse_alternate_content(parse_xml(fileobj, _namespaces))[0]
|
||||
|
||||
|
||||
def _collapse_alternate_content(node):
|
||||
if isinstance(node, XmlElement):
|
||||
if node.name == "mc:AlternateContent":
|
||||
return node.find_child_or_null("mc:Fallback").children
|
||||
else:
|
||||
node.children = flat_map(_collapse_alternate_content, node.children)
|
||||
return [node]
|
||||
else:
|
||||
return [node]
|
||||
@@ -0,0 +1,38 @@
|
||||
import collections
|
||||
|
||||
|
||||
class Relationships(object):
    """Lookup table over the relationships declared in a .rels part."""

    def __init__(self, relationships):
        self._targets_by_id = {}
        self._targets_by_type = collections.defaultdict(list)
        for relationship in relationships:
            # Later duplicates of an ID win, matching dict() construction.
            self._targets_by_id[relationship.relationship_id] = relationship.target
            self._targets_by_type[relationship.type].append(relationship.target)

    def find_target_by_relationship_id(self, key):
        """Return the target for a relationship ID; raises KeyError when absent."""
        return self._targets_by_id[key]

    def find_targets_by_type(self, relationship_type):
        """Return every target with the given type ([] when there are none)."""
        return self._targets_by_type[relationship_type]
|
||||
|
||||
|
||||
Relationships.EMPTY = Relationships([])
|
||||
|
||||
|
||||
Relationship = collections.namedtuple("Relationship", ["relationship_id", "target", "type"])
|
||||
|
||||
|
||||
def read_relationships_xml_element(element):
|
||||
children = element.find_children("relationships:Relationship")
|
||||
return Relationships(list(map(_read_relationship, children)))
|
||||
|
||||
|
||||
def _read_relationship(element):
|
||||
relationship = Relationship(
|
||||
relationship_id=element.attributes["Id"],
|
||||
target=element.attributes["Target"],
|
||||
type=element.attributes["Type"],
|
||||
)
|
||||
return relationship
|
||||
@@ -0,0 +1,70 @@
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from ..zips import open_zip, update_zip
|
||||
|
||||
|
||||
_style_map_path = "mammoth/style-map"
|
||||
_style_map_absolute_path = "/" + _style_map_path
|
||||
_relationships_path = "word/_rels/document.xml.rels"
|
||||
_content_types_path = "[Content_Types].xml"
|
||||
|
||||
|
||||
def write_style_map(fileobj, style_map):
|
||||
with open_zip(fileobj, "r") as zip_file:
|
||||
relationships_xml = _generate_relationships_xml(zip_file.read_str(_relationships_path))
|
||||
content_types_xml = _generate_content_types_xml(zip_file.read_str(_content_types_path))
|
||||
|
||||
update_zip(fileobj, {
|
||||
_style_map_path: style_map.encode("utf8"),
|
||||
_relationships_path: relationships_xml,
|
||||
_content_types_path: content_types_xml,
|
||||
})
|
||||
|
||||
def _generate_relationships_xml(relationships_xml):
|
||||
schema = "http://schemas.zwobble.org/mammoth/style-map"
|
||||
relationships_uri = "http://schemas.openxmlformats.org/package/2006/relationships"
|
||||
relationship_element_name = "{" + relationships_uri + "}Relationship"
|
||||
|
||||
relationships = ElementTree.fromstring(relationships_xml)
|
||||
_add_or_update_element(relationships, relationship_element_name, "Id", {
|
||||
"Id": "rMammothStyleMap",
|
||||
"Type": schema,
|
||||
"Target": _style_map_absolute_path,
|
||||
})
|
||||
|
||||
return ElementTree.tostring(relationships, "UTF-8")
|
||||
|
||||
|
||||
def _generate_content_types_xml(content_types_xml):
|
||||
content_types_uri = "http://schemas.openxmlformats.org/package/2006/content-types"
|
||||
override_name = "{" + content_types_uri + "}Override"
|
||||
|
||||
types = ElementTree.fromstring(content_types_xml)
|
||||
_add_or_update_element(types, override_name, "PartName", {
|
||||
"PartName": _style_map_absolute_path,
|
||||
"ContentType": "text/prs.mammoth.style-map",
|
||||
})
|
||||
|
||||
return ElementTree.tostring(types, "UTF-8")
|
||||
|
||||
|
||||
def _add_or_update_element(parent, name, identifying_attribute, attributes):
|
||||
existing_child = _find_child(parent, name, identifying_attribute, attributes)
|
||||
if existing_child is None:
|
||||
ElementTree.SubElement(parent, name, attributes)
|
||||
else:
|
||||
existing_child.attrib = attributes
|
||||
|
||||
|
||||
def _find_child(parent, name, identifying_attribute, attributes):
|
||||
for element in parent.iter():
|
||||
if element.tag == name and element.get(identifying_attribute) == attributes.get(identifying_attribute):
|
||||
return element
|
||||
|
||||
|
||||
def read_style_map(fileobj):
|
||||
with open_zip(fileobj, "r") as zip_file:
|
||||
if zip_file.exists(_style_map_path):
|
||||
return zip_file.read_str(_style_map_path)
|
||||
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
import collections
|
||||
|
||||
|
||||
class Styles(object):
    """Lookup tables for the styles defined in styles.xml, keyed by style ID."""

    @staticmethod
    def create(paragraph_styles=None, character_styles=None, table_styles=None, numbering_styles=None):
        """Build a Styles instance, defaulting any missing mapping to empty."""
        return Styles(
            paragraph_styles=paragraph_styles if paragraph_styles is not None else {},
            character_styles=character_styles if character_styles is not None else {},
            table_styles=table_styles if table_styles is not None else {},
            numbering_styles=numbering_styles if numbering_styles is not None else {},
        )

    def __init__(self, paragraph_styles, character_styles, table_styles, numbering_styles):
        self._paragraph_styles = paragraph_styles
        self._character_styles = character_styles
        self._table_styles = table_styles
        self._numbering_styles = numbering_styles

    def find_paragraph_style_by_id(self, style_id):
        """Return the paragraph style with the given ID, or None."""
        return self._paragraph_styles.get(style_id)

    def find_character_style_by_id(self, style_id):
        """Return the character style with the given ID, or None."""
        return self._character_styles.get(style_id)

    def find_table_style_by_id(self, style_id):
        """Return the table style with the given ID, or None."""
        return self._table_styles.get(style_id)

    def find_numbering_style_by_id(self, style_id):
        """Return the numbering style with the given ID, or None."""
        return self._numbering_styles.get(style_id)
|
||||
|
||||
|
||||
Styles.EMPTY = Styles(
|
||||
paragraph_styles={},
|
||||
character_styles={},
|
||||
table_styles={},
|
||||
numbering_styles={},
|
||||
)
|
||||
|
||||
|
||||
def read_styles_xml_element(element):
|
||||
paragraph_styles = {}
|
||||
character_styles = {}
|
||||
table_styles = {}
|
||||
numbering_styles = {}
|
||||
styles = {
|
||||
"paragraph": paragraph_styles,
|
||||
"character": character_styles,
|
||||
"table": table_styles,
|
||||
"numbering": numbering_styles,
|
||||
}
|
||||
|
||||
for style_element in element.find_children("w:style"):
|
||||
element_type = style_element.attributes["w:type"]
|
||||
if element_type == "numbering":
|
||||
style = _read_numbering_style_element(style_element)
|
||||
else:
|
||||
style = _read_style_element(style_element)
|
||||
|
||||
style_set = styles.get(element_type)
|
||||
|
||||
# Per 17.7.4.17 style (Style Definition) of ECMA-376 4th edition Part 1:
|
||||
#
|
||||
# > If multiple style definitions each declare the same value for their
|
||||
# > styleId, then the first such instance shall keep its current
|
||||
# > identifier with all other instances being reassigned in any manner
|
||||
# > desired.
|
||||
#
|
||||
# For the purpose of conversion, there's no point holding onto styles
|
||||
# with reassigned style IDs, so we ignore such style definitions.
|
||||
|
||||
if style_set is not None and style.style_id not in style_set:
|
||||
style_set[style.style_id] = style
|
||||
|
||||
return Styles(
|
||||
paragraph_styles=paragraph_styles,
|
||||
character_styles=character_styles,
|
||||
table_styles=table_styles,
|
||||
numbering_styles=numbering_styles,
|
||||
)
|
||||
|
||||
|
||||
Style = collections.namedtuple("Style", ["style_id", "name"])
|
||||
|
||||
|
||||
def _read_style_element(element):
|
||||
style_id = _read_style_id(element)
|
||||
name = element.find_child_or_null("w:name").attributes.get("w:val")
|
||||
return Style(style_id=style_id, name=name)
|
||||
|
||||
|
||||
NumberingStyle = collections.namedtuple("NumberingStyle", ["style_id", "num_id"])
|
||||
|
||||
|
||||
def _read_numbering_style_element(element):
|
||||
style_id = _read_style_id(element)
|
||||
|
||||
num_id = element \
|
||||
.find_child_or_null("w:pPr") \
|
||||
.find_child_or_null("w:numPr") \
|
||||
.find_child_or_null("w:numId") \
|
||||
.attributes.get("w:val")
|
||||
|
||||
return NumberingStyle(style_id=style_id, num_id=num_id)
|
||||
|
||||
|
||||
def _read_style_id(element):
|
||||
return element.attributes["w:styleId"]
|
||||
@@ -0,0 +1,12 @@
|
||||
def uri_to_zip_entry_name(base, uri):
    """Convert a relationship target URI into a zip entry name.

    Absolute URIs (leading slash) are rooted at the package; relative URIs
    are resolved against base.
    """
    if uri.startswith("/"):
        return uri[1:]
    return "{0}/{1}".format(base, uri)
|
||||
|
||||
|
||||
def replace_fragment(uri, fragment):
    """Return uri with its fragment replaced by fragment (appended if absent)."""
    base = uri.split("#", 1)[0]
    return base + "#" + fragment
|
||||
@@ -0,0 +1,121 @@
|
||||
import xml.dom.minidom
|
||||
|
||||
import cobble
|
||||
|
||||
|
||||
@cobble.data
|
||||
class XmlElement(object):
|
||||
name = cobble.field()
|
||||
attributes = cobble.field()
|
||||
children = cobble.field()
|
||||
|
||||
def find_child_or_null(self, name):
|
||||
return self.find_child(name) or null_xml_element
|
||||
|
||||
def find_child(self, name):
|
||||
for child in self.children:
|
||||
if isinstance(child, XmlElement) and child.name == name:
|
||||
return child
|
||||
|
||||
|
||||
def find_children(self, name):
|
||||
return XmlElementList(filter(
|
||||
lambda child: child.node_type == node_types.element and child.name == name,
|
||||
self.children
|
||||
))
|
||||
|
||||
|
||||
class XmlElementList(object):
    """A collection of XML elements supporting chained child searches."""

    def __init__(self, elements):
        self._elements = elements

    def __iter__(self):
        return iter(self._elements)

    def find_children(self, name):
        """Return the matching children of every element, flattened in order."""
        matches = [
            child
            for element in self._elements
            for child in element.find_children(name)
        ]
        return XmlElementList(matches)
|
||||
|
||||
|
||||
class NullXmlElement(object):
    """Null-object stand-in for a missing XML element.

    Has no attributes and no children; child lookups stay null so chained
    find_child_or_null calls are safe on absent elements.
    """

    attributes = {}
    children = []

    def find_child_or_null(self, name):
        # A missing element's children are also missing.
        return self

    def find_child(self, name):
        return None
|
||||
|
||||
|
||||
null_xml_element = NullXmlElement()
|
||||
|
||||
|
||||
@cobble.data
|
||||
class XmlText(object):
|
||||
value = cobble.field()
|
||||
|
||||
|
||||
def element(name, attributes=None, children=None):
|
||||
return XmlElement(name, attributes or {}, children or [])
|
||||
|
||||
text = XmlText
|
||||
|
||||
|
||||
class node_types(object):
|
||||
element = 1
|
||||
text = 3
|
||||
|
||||
|
||||
XmlElement.node_type = node_types.element
|
||||
XmlText.node_type = node_types.text
|
||||
|
||||
|
||||
|
||||
def parse_xml(fileobj, namespace_mapping=None):
|
||||
if namespace_mapping is None:
|
||||
namespace_prefixes = {}
|
||||
else:
|
||||
namespace_prefixes = dict((uri, prefix) for prefix, uri in namespace_mapping)
|
||||
|
||||
document = xml.dom.minidom.parse(fileobj)
|
||||
|
||||
def convert_node(node):
|
||||
if node.nodeType == xml.dom.Node.ELEMENT_NODE:
|
||||
return convert_element(node)
|
||||
elif node.nodeType == xml.dom.Node.TEXT_NODE:
|
||||
return XmlText(node.nodeValue)
|
||||
else:
|
||||
return None
|
||||
|
||||
def convert_element(element):
|
||||
converted_name = convert_name(element)
|
||||
|
||||
converted_attributes = dict(
|
||||
(convert_name(attribute), attribute.value)
|
||||
for attribute in element.attributes.values()
|
||||
if attribute.namespaceURI != "http://www.w3.org/2000/xmlns/"
|
||||
)
|
||||
|
||||
converted_children = []
|
||||
for child_node in element.childNodes:
|
||||
converted_child_node = convert_node(child_node)
|
||||
if converted_child_node is not None:
|
||||
converted_children.append(converted_child_node)
|
||||
|
||||
return XmlElement(converted_name, converted_attributes, converted_children)
|
||||
|
||||
def convert_name(node):
|
||||
if node.namespaceURI is None:
|
||||
return node.localName
|
||||
else:
|
||||
prefix = namespace_prefixes.get(node.namespaceURI)
|
||||
if prefix is None:
|
||||
return "{%s}%s" % (node.namespaceURI, node.localName)
|
||||
else:
|
||||
return "%s:%s" % (prefix, node.localName)
|
||||
|
||||
return convert_node(document.documentElement)
|
||||
@@ -0,0 +1,135 @@
|
||||
from ..lists import flat_map
|
||||
from .nodes import TextNode, Tag, Element, ForceWrite, NodeVisitor
|
||||
|
||||
|
||||
def text(value):
|
||||
return TextNode(value)
|
||||
|
||||
|
||||
def tag(tag_names, attributes=None, collapsible=None, separator=None):
|
||||
if not isinstance(tag_names, list):
|
||||
tag_names = [tag_names]
|
||||
if attributes is None:
|
||||
attributes = {}
|
||||
return Tag(tag_names=tag_names, attributes=attributes, collapsible=bool(collapsible), separator=separator)
|
||||
|
||||
|
||||
def element(tag_names, attributes=None, children=None, collapsible=None, separator=None):
|
||||
if children is None:
|
||||
children = []
|
||||
|
||||
element_tag = tag(tag_names=tag_names, attributes=attributes, collapsible=collapsible, separator=separator)
|
||||
return Element(element_tag, children)
|
||||
|
||||
|
||||
def collapsible_element(tag_names, attributes=None, children=None):
|
||||
return element(tag_names, attributes, children, collapsible=True)
|
||||
|
||||
|
||||
force_write = ForceWrite()
|
||||
|
||||
|
||||
def strip_empty(nodes):
|
||||
return flat_map(_strip_empty_node, nodes)
|
||||
|
||||
|
||||
def _strip_empty_node(node):
|
||||
return StripEmpty().visit(node)
|
||||
|
||||
|
||||
class StripEmpty(NodeVisitor):
|
||||
def visit_text_node(self, node):
|
||||
if node.value:
|
||||
return [node]
|
||||
else:
|
||||
return []
|
||||
|
||||
def visit_element(self, element):
|
||||
children = strip_empty(element.children)
|
||||
if len(children) == 0 and not element.is_void():
|
||||
return []
|
||||
else:
|
||||
return [Element(element.tag, children)]
|
||||
|
||||
def visit_force_write(self, node):
|
||||
return [node]
|
||||
|
||||
|
||||
def collapse(nodes):
|
||||
collapsed = []
|
||||
|
||||
for node in nodes:
|
||||
_collapsing_add(collapsed, node)
|
||||
|
||||
return collapsed
|
||||
|
||||
class _CollapseNode(NodeVisitor):
|
||||
def visit_text_node(self, node):
|
||||
return node
|
||||
|
||||
def visit_element(self, element):
|
||||
return Element(element.tag, collapse(element.children))
|
||||
|
||||
def visit_force_write(self, node):
|
||||
return node
|
||||
|
||||
_collapse_node = _CollapseNode().visit
|
||||
|
||||
|
||||
def _collapsing_add(collapsed, node):
|
||||
collapsed_node = _collapse_node(node)
|
||||
if not _try_collapse(collapsed, collapsed_node):
|
||||
collapsed.append(collapsed_node)
|
||||
|
||||
def _try_collapse(collapsed, node):
|
||||
if not collapsed:
|
||||
return False
|
||||
|
||||
last = collapsed[-1]
|
||||
if not isinstance(last, Element) or not isinstance(node, Element):
|
||||
return False
|
||||
|
||||
if not node.collapsible:
|
||||
return False
|
||||
|
||||
if not _is_match(last, node):
|
||||
return False
|
||||
|
||||
if node.separator:
|
||||
last.children.append(text(node.separator))
|
||||
|
||||
for child in node.children:
|
||||
_collapsing_add(last.children, child)
|
||||
|
||||
return True
|
||||
|
||||
def _is_match(first, second):
|
||||
return first.tag_name in second.tag_names and first.attributes == second.attributes
|
||||
|
||||
|
||||
def write(writer, nodes):
|
||||
visitor = _NodeWriter(writer)
|
||||
visitor.visit_all(nodes)
|
||||
|
||||
|
||||
class _NodeWriter(NodeVisitor):
|
||||
def __init__(self, writer):
|
||||
self._writer = writer
|
||||
|
||||
def visit_text_node(self, node):
|
||||
self._writer.text(node.value)
|
||||
|
||||
def visit_element(self, element):
|
||||
if element.is_void():
|
||||
self._writer.self_closing(element.tag_name, element.attributes)
|
||||
else:
|
||||
self._writer.start(element.tag_name, element.attributes)
|
||||
self.visit_all(element.children)
|
||||
self._writer.end(element.tag_name)
|
||||
|
||||
def visit_force_write(self, element):
|
||||
pass
|
||||
|
||||
def visit_all(self, nodes):
|
||||
for node in nodes:
|
||||
self.visit(node)
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,61 @@
|
||||
import cobble
|
||||
|
||||
|
||||
class Node(object):
|
||||
pass
|
||||
|
||||
|
||||
@cobble.data
|
||||
class TextNode(Node):
|
||||
value = cobble.field()
|
||||
|
||||
|
||||
@cobble.data
|
||||
class Tag(object):
|
||||
tag_names = cobble.field()
|
||||
attributes = cobble.field()
|
||||
collapsible = cobble.field()
|
||||
separator = cobble.field()
|
||||
|
||||
@property
|
||||
def tag_name(self):
|
||||
return self.tag_names[0]
|
||||
|
||||
|
||||
@cobble.data
|
||||
class Element(Node):
|
||||
tag = cobble.field()
|
||||
children = cobble.field()
|
||||
|
||||
@property
|
||||
def tag_name(self):
|
||||
return self.tag.tag_name
|
||||
|
||||
@property
|
||||
def tag_names(self):
|
||||
return self.tag.tag_names
|
||||
|
||||
@property
|
||||
def attributes(self):
|
||||
return self.tag.attributes
|
||||
|
||||
@property
|
||||
def collapsible(self):
|
||||
return self.tag.collapsible
|
||||
|
||||
@property
|
||||
def separator(self):
|
||||
return self.tag.separator
|
||||
|
||||
_VOID_TAG_NAMES = set(["br", "hr", "img", "input"])
|
||||
|
||||
def is_void(self):
|
||||
return not self.children and self.tag_name in self._VOID_TAG_NAMES
|
||||
|
||||
|
||||
@cobble.visitable
|
||||
class ForceWrite(Node):
|
||||
pass
|
||||
|
||||
|
||||
NodeVisitor = cobble.visitor(Node)
|
||||
@@ -0,0 +1,58 @@
|
||||
import cobble
|
||||
|
||||
from . import html
|
||||
|
||||
|
||||
def path(elements):
|
||||
return HtmlPath(elements)
|
||||
|
||||
|
||||
def element(names, attributes=None, class_names=None, fresh=None, separator=None):
|
||||
if attributes is None:
|
||||
attributes = {}
|
||||
if class_names is None:
|
||||
class_names = []
|
||||
if fresh is None:
|
||||
fresh = False
|
||||
if class_names:
|
||||
attributes["class"] = " ".join(class_names)
|
||||
|
||||
return HtmlPathElement(html.tag(
|
||||
tag_names=names,
|
||||
attributes=attributes,
|
||||
collapsible=not fresh,
|
||||
separator=separator,
|
||||
))
|
||||
|
||||
|
||||
@cobble.data
|
||||
class HtmlPath(object):
|
||||
elements = cobble.field()
|
||||
|
||||
def wrap(self, generate_nodes):
|
||||
nodes = generate_nodes()
|
||||
|
||||
for element in reversed(self.elements):
|
||||
nodes = element.wrap_nodes(nodes)
|
||||
|
||||
return nodes
|
||||
|
||||
|
||||
@cobble.data
|
||||
class HtmlPathElement(object):
|
||||
tag = cobble.field()
|
||||
|
||||
def wrap(self, generate_nodes):
|
||||
return self.wrap_nodes(generate_nodes())
|
||||
|
||||
def wrap_nodes(self, nodes):
|
||||
element = html.Element(self.tag, nodes)
|
||||
return [element]
|
||||
|
||||
empty = path([])
|
||||
|
||||
|
||||
class ignore(object):
|
||||
@staticmethod
|
||||
def wrap(generate_nodes):
|
||||
return []
|
||||
28
path/to/venv/lib/python3.12/site-packages/mammoth/images.py
Normal file
28
path/to/venv/lib/python3.12/site-packages/mammoth/images.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import base64
|
||||
|
||||
from . import html
|
||||
|
||||
|
||||
def img_element(func):
    """Wrap func (image -> dict of attributes) into an image converter.

    The returned converter builds a single <img> element: "alt" is taken
    from the image's alt text when present, then func's attributes are
    merged on top (so func may override "alt").
    """
    def convert_image(image):
        attributes = {}
        if image.alt_text:
            attributes["alt"] = image.alt_text
        attributes.update(func(image))

        return [html.element("img", attributes)]

    return convert_image

# Undocumented, but retained for backwards-compatibility with 0.3.x
inline = img_element
|
||||
|
||||
|
||||
@img_element
|
||||
def data_uri(image):
|
||||
with image.open() as image_bytes:
|
||||
encoded_src = base64.b64encode(image_bytes.read()).decode("ascii")
|
||||
|
||||
return {
|
||||
"src": "data:{0};base64,{1}".format(image.content_type, encoded_src)
|
||||
}
|
||||
40
path/to/venv/lib/python3.12/site-packages/mammoth/lists.py
Normal file
40
path/to/venv/lib/python3.12/site-packages/mammoth/lists.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import sys
|
||||
|
||||
|
||||
def flatten(values):
    """Concatenate an iterable of iterables into one flat list."""
    return flat_map(lambda nested: nested, values)


def unique(values):
    """Return the values with duplicates removed, keeping first-seen order."""
    seen = set()
    result = []
    for value in values:
        if value in seen:
            continue
        seen.add(value)
        result.append(value)
    return result


def flat_map(func, values):
    """Apply func (which returns an iterable) to each value and flatten."""
    result = []
    for value in values:
        result.extend(func(value))
    return result


def find_index(predicate, values):
    """Return the index of the first value matching predicate, or None."""
    for index, value in enumerate(values):
        if predicate(value):
            return index
    return None
|
||||
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
map = map
|
||||
filter = filter
|
||||
else:
|
||||
import builtins
|
||||
def map(*args, **kwargs):
|
||||
return list(builtins.map(*args, **kwargs))
|
||||
def filter(*args, **kwargs):
|
||||
return list(builtins.filter(*args, **kwargs))
|
||||
101
path/to/venv/lib/python3.12/site-packages/mammoth/options.py
Normal file
101
path/to/venv/lib/python3.12/site-packages/mammoth/options.py
Normal file
@@ -0,0 +1,101 @@
|
||||
from .styles.parser import read_style_mapping
|
||||
from . import lists, results
|
||||
|
||||
|
||||
def read_options(options):
    """Normalise raw convert() options in place.

    Pops the style-map-related keys, parses the custom and embedded style
    maps, optionally appends the default style map, and stores the combined
    list back under "style_map".  Returns the mutated dict wrapped in a
    Result carrying any style-map parse warnings.
    """
    # `or ""` also guards against an explicit None value for these keys.
    custom_style_map_text = options.pop("style_map", "") or ""
    embedded_style_map_text = options.pop("embedded_style_map", "") or ""
    include_default_style_map = options.pop("include_default_style_map", True)

    read_style_map_result = results.combine([
        _read_style_map(custom_style_map_text),
        _read_style_map(embedded_style_map_text),
    ])

    custom_style_map, embedded_style_map = read_style_map_result.value
    # Custom mappings come before embedded ones, which come before the
    # defaults -- presumably earlier mappings take precedence; confirm in
    # the conversion code.
    style_map = custom_style_map + embedded_style_map

    if include_default_style_map:
        style_map += _default_style_map

    options["ignore_empty_paragraphs"] = options.get("ignore_empty_paragraphs", True)
    options["style_map"] = style_map
    return read_style_map_result.map(lambda _: options)
|
||||
|
||||
|
||||
def _read_style_map(style_text):
|
||||
lines = filter(None, map(_get_line, style_text.split("\n")))
|
||||
return results.combine(lists.map(read_style_mapping, lines)) \
|
||||
.map(lambda style_mappings: lists.filter(None, style_mappings))
|
||||
|
||||
|
||||
def _get_line(line):
|
||||
line = line.strip()
|
||||
if line.startswith("#"):
|
||||
return None
|
||||
else:
|
||||
return line
|
||||
|
||||
|
||||
_default_style_map_result = _read_style_map("""
|
||||
p.Heading1 => h1:fresh
|
||||
p.Heading2 => h2:fresh
|
||||
p.Heading3 => h3:fresh
|
||||
p.Heading4 => h4:fresh
|
||||
p.Heading5 => h5:fresh
|
||||
p.Heading6 => h6:fresh
|
||||
p[style-name='Heading 1'] => h1:fresh
|
||||
p[style-name='Heading 2'] => h2:fresh
|
||||
p[style-name='Heading 3'] => h3:fresh
|
||||
p[style-name='Heading 4'] => h4:fresh
|
||||
p[style-name='Heading 5'] => h5:fresh
|
||||
p[style-name='Heading 6'] => h6:fresh
|
||||
p[style-name='heading 1'] => h1:fresh
|
||||
p[style-name='heading 2'] => h2:fresh
|
||||
p[style-name='heading 3'] => h3:fresh
|
||||
p[style-name='heading 4'] => h4:fresh
|
||||
p[style-name='heading 5'] => h5:fresh
|
||||
p[style-name='heading 6'] => h6:fresh
|
||||
|
||||
# Apple Pages
|
||||
p.Heading => h1:fresh
|
||||
p[style-name='Heading'] => h1:fresh
|
||||
|
||||
r[style-name='Strong'] => strong
|
||||
|
||||
p[style-name='footnote text'] => p:fresh
|
||||
r[style-name='footnote reference'] =>
|
||||
p[style-name='endnote text'] => p:fresh
|
||||
r[style-name='endnote reference'] =>
|
||||
p[style-name='annotation text'] => p:fresh
|
||||
r[style-name='annotation reference'] =>
|
||||
|
||||
# LibreOffice
|
||||
p[style-name='Footnote'] => p:fresh
|
||||
r[style-name='Footnote anchor'] =>
|
||||
p[style-name='Endnote'] => p:fresh
|
||||
r[style-name='Endnote anchor'] =>
|
||||
|
||||
p:unordered-list(1) => ul > li:fresh
|
||||
p:unordered-list(2) => ul|ol > li > ul > li:fresh
|
||||
p:unordered-list(3) => ul|ol > li > ul|ol > li > ul > li:fresh
|
||||
p:unordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh
|
||||
p:unordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh
|
||||
p:ordered-list(1) => ol > li:fresh
|
||||
p:ordered-list(2) => ul|ol > li > ol > li:fresh
|
||||
p:ordered-list(3) => ul|ol > li > ul|ol > li > ol > li:fresh
|
||||
p:ordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh
|
||||
p:ordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh
|
||||
|
||||
r[style-name='Hyperlink'] =>
|
||||
|
||||
p[style-name='Normal'] => p:fresh
|
||||
|
||||
# Apple Pages
|
||||
p.Body => p:fresh
|
||||
p[style-name='Body'] => p:fresh
|
||||
""")
|
||||
|
||||
|
||||
assert not _default_style_map_result.messages
|
||||
_default_style_map = _default_style_map_result.value
|
||||
@@ -0,0 +1,14 @@
|
||||
from . import documents
|
||||
|
||||
|
||||
def extract_raw_text_from_element(element):
    """Recursively extract plain text from a document element.

    Text nodes yield their value, tabs yield a tab character, and each
    paragraph is terminated by a blank line; all other structure is
    discarded.
    """
    if isinstance(element, documents.Text):
        return element.value
    if isinstance(element, documents.Tab):
        return "\t"
    children = getattr(element, "children", [])
    text = "".join(extract_raw_text_from_element(child) for child in children)
    if isinstance(element, documents.Paragraph):
        text += "\n\n"
    return text
|
||||
42
path/to/venv/lib/python3.12/site-packages/mammoth/results.py
Normal file
42
path/to/venv/lib/python3.12/site-packages/mammoth/results.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import collections
|
||||
|
||||
from .lists import unique
|
||||
|
||||
|
||||
class Result(object):
    """A value paired with accumulated warning messages.

    Messages are de-duplicated (first occurrence kept, order preserved)
    on construction.
    """
    def __init__(self, value, messages):
        self.value = value
        self.messages = unique(messages)

    def map(self, func):
        """Apply func to the value, keeping the messages unchanged."""
        return Result(func(self.value), self.messages)

    def bind(self, func):
        """Apply func (value -> Result), concatenating both results' messages."""
        result = func(self.value)
        return Result(result.value, self.messages + result.messages)
|
||||
|
||||
|
||||
Message = collections.namedtuple("Message", ["type", "message"])
|
||||
|
||||
|
||||
def warning(message):
|
||||
return Message("warning", message)
|
||||
|
||||
|
||||
def success(value):
|
||||
return Result(value, [])
|
||||
|
||||
|
||||
def combine(results):
    """Merge many Results into one.

    The combined value is the list of all values; the combined messages
    are all messages in order (de-duplicated by Result's constructor).
    """
    values = []
    messages = []
    for result in results:
        values.append(result.value)
        messages.extend(result.messages)
    return Result(values, messages)
|
||||
|
||||
|
||||
def map(func, *args):
|
||||
return combine(args).map(lambda values: func(*values))
|
||||
@@ -0,0 +1,8 @@
|
||||
import collections
|
||||
|
||||
|
||||
def style(document_matcher, html_path):
|
||||
return Style(document_matcher, html_path)
|
||||
|
||||
|
||||
Style = collections.namedtuple("Style", ["document_matcher", "html_path"])
|
||||
Binary file not shown.
@@ -0,0 +1,14 @@
|
||||
from .errors import LineParseError
|
||||
from .style_mapping_parser import parse_style_mapping
|
||||
from .tokeniser import tokenise
|
||||
from .token_iterator import TokenIterator
|
||||
from ... import results
|
||||
|
||||
|
||||
def read_style_mapping(string):
|
||||
try:
|
||||
tokens = tokenise(string)
|
||||
return results.success(parse_style_mapping(TokenIterator(tokens)))
|
||||
except LineParseError:
|
||||
warning = "Did not understand this style mapping, so ignored it: " + string
|
||||
return results.Result(None, [results.warning(warning)])
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,130 @@
|
||||
from ... import documents, document_matchers
|
||||
from .errors import LineParseError
|
||||
from .tokeniser import TokenType
|
||||
from .token_parser import try_parse_class_name, parse_string
|
||||
|
||||
|
||||
def parse_document_matcher(tokens):
    """Parse the document side (left of "=>") of a style mapping.

    Dispatches on the leading identifier: p/r/table accept optional
    .style-id and [style-name=...] qualifiers (p also accepts a
    :list-type(level) numbering qualifier); the remaining identifiers map
    directly to predefined matchers.  Raises LineParseError on anything
    unrecognised.
    """
    if tokens.try_skip(TokenType.IDENTIFIER, "p"):
        style_id = try_parse_class_name(tokens)
        style_name = _parse_style_name(tokens)
        numbering = _parse_numbering(tokens)

        return document_matchers.paragraph(
            style_id=style_id,
            style_name=style_name,
            numbering=numbering,
        )

    elif tokens.try_skip(TokenType.IDENTIFIER, "r"):
        style_id = try_parse_class_name(tokens)
        style_name = _parse_style_name(tokens)

        return document_matchers.run(
            style_id=style_id,
            style_name=style_name,
        )

    elif tokens.try_skip(TokenType.IDENTIFIER, "table"):
        style_id = try_parse_class_name(tokens)
        style_name = _parse_style_name(tokens)

        return document_matchers.table(
            style_id=style_id,
            style_name=style_name,
        )

    elif tokens.try_skip(TokenType.IDENTIFIER, "b"):
        return document_matchers.bold

    elif tokens.try_skip(TokenType.IDENTIFIER, "i"):
        return document_matchers.italic

    elif tokens.try_skip(TokenType.IDENTIFIER, "u"):
        return document_matchers.underline

    elif tokens.try_skip(TokenType.IDENTIFIER, "strike"):
        return document_matchers.strikethrough

    elif tokens.try_skip(TokenType.IDENTIFIER, "all-caps"):
        return document_matchers.all_caps

    elif tokens.try_skip(TokenType.IDENTIFIER, "small-caps"):
        return document_matchers.small_caps

    elif tokens.try_skip(TokenType.IDENTIFIER, "highlight"):
        # highlight takes an optional [color='...'] qualifier.
        return _parse_highlight(tokens)

    elif tokens.try_skip(TokenType.IDENTIFIER, "comment-reference"):
        return document_matchers.comment_reference

    elif tokens.try_skip(TokenType.IDENTIFIER, "br"):
        # br requires a [type='...'] qualifier.
        return _parse_break(tokens)

    else:
        raise LineParseError("Unrecognised document element: {0}".format(tokens.next_value(TokenType.IDENTIFIER)))
|
||||
|
||||
def _parse_style_name(tokens):
|
||||
if tokens.try_skip(TokenType.SYMBOL, "["):
|
||||
tokens.skip(TokenType.IDENTIFIER, "style-name")
|
||||
string_matcher = _parse_string_matcher(tokens)
|
||||
tokens.skip(TokenType.SYMBOL, "]")
|
||||
return string_matcher
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def _parse_string_matcher(tokens):
|
||||
if tokens.try_skip(TokenType.SYMBOL, "="):
|
||||
return document_matchers.equal_to(parse_string(tokens))
|
||||
elif tokens.try_skip(TokenType.SYMBOL, "^="):
|
||||
return document_matchers.starts_with(parse_string(tokens))
|
||||
else:
|
||||
raise LineParseError("Unrecognised string matcher: {0}".format(tokens.next_value()))
|
||||
|
||||
def _parse_numbering(tokens):
|
||||
if tokens.try_skip(TokenType.SYMBOL, ":"):
|
||||
is_ordered = _parse_list_type(tokens)
|
||||
tokens.skip(TokenType.SYMBOL, "(")
|
||||
level = int(tokens.next_value(TokenType.INTEGER)) - 1
|
||||
tokens.skip(TokenType.SYMBOL, ")")
|
||||
return documents.numbering_level(level, is_ordered=is_ordered)
|
||||
|
||||
|
||||
def _parse_list_type(tokens):
    """Read a list-type identifier and map it to is_ordered.

    "ordered-list" -> True, "unordered-list" -> False; anything else
    raises LineParseError.
    """
    list_type = tokens.next_value(TokenType.IDENTIFIER)
    is_ordered = {"ordered-list": True, "unordered-list": False}.get(list_type)
    if is_ordered is None:
        raise LineParseError("Unrecognised list type: {0}".format(list_type))
    return is_ordered
|
||||
|
||||
|
||||
def _parse_highlight(tokens):
    """Parse an optional [color='...'] qualifier after "highlight".

    Returns a highlight matcher; color is None when no qualifier is given.
    """
    if tokens.try_skip(TokenType.SYMBOL, "["):
        tokens.skip(TokenType.IDENTIFIER, "color")
        tokens.skip(TokenType.SYMBOL, "=")
        color = parse_string(tokens)
        # Stray C-style semicolon removed from the original.
        tokens.skip(TokenType.SYMBOL, "]")
    else:
        color = None

    return document_matchers.highlight(color=color)
|
||||
|
||||
|
||||
def _parse_break(tokens):
    """Parse br[type='...'] into the matching break matcher.

    Raises LineParseError for unrecognised break types.
    """
    tokens.skip(TokenType.SYMBOL, "[")
    tokens.skip(TokenType.IDENTIFIER, "type")
    tokens.skip(TokenType.SYMBOL, "=")
    type_name = parse_string(tokens)
    # Stray C-style semicolon removed from the original.
    tokens.skip(TokenType.SYMBOL, "]")

    if type_name == "line":
        return document_matchers.line_break
    elif type_name == "page":
        return document_matchers.page_break
    elif type_name == "column":
        return document_matchers.column_break
    else:
        raise LineParseError("Unrecognised break type: {0}".format(type_name))
|
||||
@@ -0,0 +1,2 @@
|
||||
class LineParseError(Exception):
|
||||
pass
|
||||
@@ -0,0 +1,120 @@
|
||||
import cobble
|
||||
|
||||
from ... import html_paths
|
||||
from .tokeniser import TokenType
|
||||
from .token_parser import parse_identifier, parse_string
|
||||
|
||||
|
||||
@cobble.data
|
||||
class _AttributeOrClassName(object):
|
||||
name = cobble.field()
|
||||
value = cobble.field()
|
||||
append = cobble.field()
|
||||
|
||||
|
||||
def parse_html_path(tokens):
|
||||
if tokens.try_skip(TokenType.SYMBOL, "!"):
|
||||
return html_paths.ignore
|
||||
else:
|
||||
return html_paths.path(_parse_html_path_elements(tokens))
|
||||
|
||||
|
||||
def _parse_html_path_elements(tokens):
|
||||
elements = []
|
||||
|
||||
if tokens.peek_token_type() == TokenType.IDENTIFIER:
|
||||
elements.append(_parse_element(tokens))
|
||||
|
||||
while tokens.try_skip_many(((TokenType.WHITESPACE, None), (TokenType.SYMBOL, ">"))):
|
||||
tokens.skip(TokenType.WHITESPACE)
|
||||
elements.append(_parse_element(tokens))
|
||||
|
||||
return elements
|
||||
|
||||
|
||||
def _parse_element(tokens):
|
||||
tag_names = _parse_tag_names(tokens)
|
||||
attributes_list = _parse_attribute_or_class_names(tokens)
|
||||
is_fresh = _parse_is_fresh(tokens)
|
||||
separator = _parse_separator(tokens)
|
||||
|
||||
attributes = {}
|
||||
for attribute in attributes_list:
|
||||
if attribute.append and attributes.get(attribute.name):
|
||||
attributes[attribute.name] += " " + attribute.value
|
||||
else:
|
||||
attributes[attribute.name] = attribute.value
|
||||
|
||||
return html_paths.element(
|
||||
tag_names,
|
||||
attributes=attributes,
|
||||
fresh=is_fresh,
|
||||
separator=separator,
|
||||
)
|
||||
|
||||
|
||||
def _parse_tag_names(tokens):
|
||||
tag_names = [parse_identifier(tokens)]
|
||||
|
||||
while tokens.try_skip(TokenType.SYMBOL, "|"):
|
||||
tag_names.append(parse_identifier(tokens))
|
||||
|
||||
return tag_names
|
||||
|
||||
|
||||
def _parse_attribute_or_class_names(tokens):
|
||||
attribute_or_class_names = []
|
||||
|
||||
while True:
|
||||
attribute_or_class_name = _try_parse_attribute_or_class_name(tokens)
|
||||
if attribute_or_class_name is None:
|
||||
break
|
||||
else:
|
||||
attribute_or_class_names.append(attribute_or_class_name)
|
||||
|
||||
return attribute_or_class_names
|
||||
|
||||
|
||||
def _try_parse_attribute_or_class_name(tokens):
    """Parse the next [attr='value'] or .class-name, or return None.

    The three-way branch is an explicit elif chain; the original had two
    separate ifs, with the else bound only to the second.
    """
    if tokens.is_next(TokenType.SYMBOL, "["):
        return _parse_attribute(tokens)
    elif tokens.is_next(TokenType.SYMBOL, "."):
        return _parse_class_name(tokens)
    else:
        return None
|
||||
|
||||
|
||||
def _parse_attribute(tokens):
|
||||
tokens.skip(TokenType.SYMBOL, "[")
|
||||
name = parse_identifier(tokens)
|
||||
tokens.skip(TokenType.SYMBOL, "=")
|
||||
value = parse_string(tokens)
|
||||
tokens.skip(TokenType.SYMBOL, "]")
|
||||
return _AttributeOrClassName(name=name, value=value, append=False)
|
||||
|
||||
|
||||
def _parse_class_name(tokens):
|
||||
tokens.skip(TokenType.SYMBOL, ".")
|
||||
class_name = parse_identifier(tokens)
|
||||
return _AttributeOrClassName(name="class", value=class_name, append=True)
|
||||
|
||||
|
||||
def _parse_is_fresh(tokens):
|
||||
return tokens.try_skip_many((
|
||||
(TokenType.SYMBOL, ":"),
|
||||
(TokenType.IDENTIFIER, "fresh"),
|
||||
))
|
||||
|
||||
|
||||
def _parse_separator(tokens):
|
||||
is_separator = tokens.try_skip_many((
|
||||
(TokenType.SYMBOL, ":"),
|
||||
(TokenType.IDENTIFIER, "separator"),
|
||||
))
|
||||
if is_separator:
|
||||
tokens.skip(TokenType.SYMBOL, "(")
|
||||
value = parse_string(tokens)
|
||||
tokens.skip(TokenType.SYMBOL, ")")
|
||||
return value
|
||||
else:
|
||||
return None
|
||||
@@ -0,0 +1,15 @@
|
||||
from .tokeniser import TokenType
|
||||
from .document_matcher_parser import parse_document_matcher
|
||||
from .html_path_parser import parse_html_path
|
||||
from ...styles import Style
|
||||
|
||||
|
||||
def parse_style_mapping(tokens):
|
||||
document_matcher = parse_document_matcher(tokens)
|
||||
tokens.skip(TokenType.WHITESPACE)
|
||||
tokens.skip(TokenType.SYMBOL, "=>")
|
||||
tokens.try_skip(TokenType.WHITESPACE)
|
||||
html_path = parse_html_path(tokens)
|
||||
tokens.skip(TokenType.END)
|
||||
|
||||
return Style(document_matcher, html_path)
|
||||
@@ -0,0 +1,59 @@
|
||||
# TODO: check indices
|
||||
# TODO: proper tests for unexpected tokens
|
||||
|
||||
from .errors import LineParseError
|
||||
|
||||
|
||||
class TokenIterator(object):
    """Cursor over the token list produced by the tokeniser.

    skip/next_value raise LineParseError on an unexpected token; the
    try_* variants return False and leave the cursor in place instead.
    """

    def __init__(self, tokens):
        self._tokens = tokens
        self._index = 0

    def peek_token_type(self):
        """Return the type of the next token without consuming it."""
        return self._tokens[self._index].type

    def next_value(self, token_type=None):
        """Consume the next token and return its value.

        Raises LineParseError when token_type is given and does not match.
        """
        return self._next(token_type).value

    def _next(self, token_type=None):
        token = self._tokens[self._index]
        if token_type is None or token.type == token_type:
            self._index += 1
            return token
        else:
            raise self._unexpected_token_type(token_type, token)

    def skip(self, token_type, token_value=None):
        """Consume one matching token or raise LineParseError."""
        token = self._tokens[self._index]
        if token.type == token_type and (token_value is None or token.value == token_value):
            self._index += 1
            return True
        else:
            raise self._unexpected_token_type(token_type, token)

    def try_skip(self, token_type, token_value=None):
        """Consume one matching token; return whether it matched."""
        if self.is_next(token_type, token_value):
            self._index += 1
            return True
        else:
            return False

    def try_skip_many(self, tokens):
        """Consume a sequence of (type, value) pairs atomically.

        A value of None acts as a wildcard.  Returns True when every pair
        matched; otherwise the cursor is rewound to its starting position
        and False is returned.
        """
        start = self._index
        for token_type, token_value in tokens:
            token = self._tokens[self._index]
            if not (token.type == token_type and (token_value is None or token.value == token_value)):
                self._index = start
                return False
            else:
                self._index += 1

        return True

    def is_next(self, token_type, token_value=None):
        """Return whether the next token matches, without consuming it."""
        token = self._tokens[self._index]
        return token.type == token_type and (token_value is None or token.value == token_value)

    def _unexpected_token_type(self, token_type, token):
        # Return (rather than raise) the error: callers say
        # `raise self._unexpected_token_type(...)`, and the previous
        # in-method raise made those caller `raise` statements dead code.
        return LineParseError()
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
import re
|
||||
|
||||
from .tokeniser import TokenType
|
||||
|
||||
|
||||
def try_parse_class_name(tokens):
|
||||
if tokens.try_skip(TokenType.SYMBOL, "."):
|
||||
return parse_identifier(tokens)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def parse_identifier(tokens):
|
||||
return decode_escape_sequences(tokens.next_value(TokenType.IDENTIFIER))
|
||||
|
||||
|
||||
def parse_string(tokens):
|
||||
return decode_escape_sequences(tokens.next_value(TokenType.STRING)[1:-1])
|
||||
|
||||
|
||||
_ESCAPE_SEQUENCE_REGEX = re.compile(r"\\(.)")


def decode_escape_sequences(value):
    """Decode backslash escapes in a style-map token.

    \\n, \\r and \\t become the corresponding control characters; any other
    escaped character decodes to the character itself.
    """
    return _ESCAPE_SEQUENCE_REGEX.sub(_decode_escape_sequence, value)


def _decode_escape_sequence(match):
    # Unknown escape codes decode to the raw character.
    code = match.group(1)
    return {"n": "\n", "r": "\r", "t": "\t"}.get(code, code)
|
||||
@@ -0,0 +1,61 @@
|
||||
import collections
|
||||
import re
|
||||
|
||||
|
||||
Token = collections.namedtuple("Token", ["character_index", "type", "value"])
|
||||
|
||||
|
||||
class TokenType(object):
|
||||
IDENTIFIER = "identifier"
|
||||
SYMBOL = "symbol"
|
||||
WHITESPACE = "whitespace"
|
||||
STRING = "string"
|
||||
UNTERMINATED_STRING = "unterminated string"
|
||||
INTEGER = "integer"
|
||||
END = "end"
|
||||
|
||||
|
||||
|
||||
def regex_tokeniser(rules):
    """Build a tokenise(value) function from (token_type, regex) rules.

    Rules are tried in order at each position; the first match wins.  A
    catch-all "unknown" single-character rule is appended so the scan
    always advances and terminates.
    """
    rules = [(token_type, _to_regex(regex)) for token_type, regex in rules]
    rules.append(("unknown", re.compile(".")))

    def tokenise(value):
        tokens = []
        index = 0
        while index < len(value):
            for token_type, regex in rules:
                # Anchored match starting exactly at the current index.
                match = regex.match(value, index)
                if match is not None:
                    tokens.append(Token(index, token_type, match.group(0)))
                    index = match.end()
                    break
            else:
                # Should be impossible
                raise Exception("Remaining: " + value[index:])

        # Every token stream ends with an explicit END token.
        tokens.append(Token(index, TokenType.END, ""))

        return tokens

    return tokenise
|
||||
|
||||
|
||||
def _to_regex(value):
|
||||
if hasattr(value, "match"):
|
||||
return value
|
||||
else:
|
||||
return re.compile(value)
|
||||
|
||||
|
||||
_string_prefix = r"'(?:\\.|[^'])*"
|
||||
_identifier_character = r"(?:[a-zA-Z\-_]|\\.)"
|
||||
|
||||
tokenise = regex_tokeniser([
|
||||
(TokenType.IDENTIFIER, _identifier_character + "(?:" + _identifier_character + "|[0-9])*"),
|
||||
(TokenType.SYMBOL, r":|>|=>|\^=|=|\(|\)|\[|\]|\||!|\."),
|
||||
(TokenType.WHITESPACE, r"\s+"),
|
||||
(TokenType.STRING, _string_prefix + "'"),
|
||||
(TokenType.UNTERMINATED_STRING, _string_prefix),
|
||||
(TokenType.INTEGER, "([0-9]+)"),
|
||||
])
|
||||
@@ -0,0 +1,56 @@
|
||||
from . import documents
|
||||
|
||||
|
||||
def paragraph(transform_paragraph):
|
||||
return element_of_type(documents.Paragraph, transform_paragraph)
|
||||
|
||||
|
||||
def run(transform_run):
|
||||
return element_of_type(documents.Run, transform_run)
|
||||
|
||||
|
||||
def element_of_type(element_type, transform):
|
||||
def transform_element(element):
|
||||
if isinstance(element, element_type):
|
||||
return transform(element)
|
||||
else:
|
||||
return element
|
||||
|
||||
return _each_element(transform_element)
|
||||
|
||||
|
||||
def _each_element(transform_element):
    """Return a transform that applies transform_element bottom-up.

    Children are transformed first; the parent is then rebuilt via copy()
    (never mutated in place) and transformed last.
    """
    def transform_element_and_children(element):
        if isinstance(element, (documents.HasChildren, documents.TableCellUnmerged)):
            children = list(map(transform_element_and_children, element.children))
            element = element.copy(children=children)

        return transform_element(element)

    return transform_element_and_children
|
||||
|
||||
|
||||
def get_descendants_of_type(element, element_type):
    """Return all descendants of element that are instances of element_type."""
    return [
        descendant
        for descendant in get_descendants(element)
        if isinstance(descendant, element_type)
    ]
|
||||
|
||||
|
||||
def get_descendants(element):
|
||||
descendants = []
|
||||
|
||||
def visit(element):
|
||||
descendants.append(element)
|
||||
|
||||
_visit_descendants(element, visit)
|
||||
|
||||
return descendants
|
||||
|
||||
|
||||
def _visit_descendants(element, visit):
    """Call visit on every descendant of element (element itself excluded).

    Each child's own descendants are visited before the child itself
    (post-order traversal).
    """
    if isinstance(element, documents.HasChildren):
        for child in element.children:
            _visit_descendants(child, visit)
            visit(child)
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
from . import html
|
||||
|
||||
|
||||
def element(name):
|
||||
def convert_underline(nodes):
|
||||
return [html.collapsible_element(name, {}, nodes)]
|
||||
|
||||
return convert_underline
|
||||
@@ -0,0 +1,19 @@
|
||||
from .html import HtmlWriter
|
||||
from .markdown import MarkdownWriter
|
||||
|
||||
|
||||
def writer(output_format=None):
|
||||
if output_format is None:
|
||||
output_format = "html"
|
||||
|
||||
return _writers[output_format]()
|
||||
|
||||
|
||||
def formats():
|
||||
return _writers.keys()
|
||||
|
||||
|
||||
_writers = {
|
||||
"html": HtmlWriter,
|
||||
"markdown": MarkdownWriter,
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,31 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import abc
|
||||
|
||||
|
||||
class Writer(object):
    """Abstract interface shared by the HTML and Markdown writers.

    NOTE(review): `__metaclass__` is the Python 2 spelling; Python 3
    ignores it, so the abstractmethod decorators are not enforced there.
    Confirm whether Python 2 support is still required before modernising.
    """
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def text(self, text):
        """Append text content (implementations escape it)."""
        pass

    @abc.abstractmethod
    def start(self, name, attributes=None):
        """Open an element with the given tag name and attributes."""
        pass

    @abc.abstractmethod
    def end(self, name):
        """Close the element with the given tag name."""
        pass

    @abc.abstractmethod
    def self_closing(self, name, attributes=None):
        """Write an element with no children."""
        pass

    @abc.abstractmethod
    def append(self, html):
        """Append raw, already-formatted output."""
        pass

    @abc.abstractmethod
    def as_string(self):
        """Return everything written so far as a single string."""
        pass
|
||||
@@ -0,0 +1,43 @@
|
||||
from __future__ import unicode_literals
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
from .abc import Writer
|
||||
|
||||
|
||||
class HtmlWriter(Writer):
|
||||
def __init__(self):
|
||||
self._fragments = []
|
||||
|
||||
def text(self, text):
|
||||
self._fragments.append(_escape_html(text))
|
||||
|
||||
def start(self, name, attributes=None):
|
||||
attribute_string = _generate_attribute_string(attributes)
|
||||
self._fragments.append("<{0}{1}>".format(name, attribute_string))
|
||||
|
||||
def end(self, name):
|
||||
self._fragments.append("</{0}>".format(name))
|
||||
|
||||
def self_closing(self, name, attributes=None):
|
||||
attribute_string = _generate_attribute_string(attributes)
|
||||
self._fragments.append("<{0}{1} />".format(name, attribute_string))
|
||||
|
||||
def append(self, html):
|
||||
self._fragments.append(html)
|
||||
|
||||
def as_string(self):
|
||||
return "".join(self._fragments)
|
||||
|
||||
|
||||
def _escape_html(text):
|
||||
return escape(text, {'"': """})
|
||||
|
||||
|
||||
def _generate_attribute_string(attributes):
    """Render attributes as ' key="value"' pairs, sorted by key for
    deterministic output, with values HTML-escaped."""
    if attributes is None:
        return ""
    parts = [
        ' {0}="{1}"'.format(key, _escape_html(attributes[key]))
        for key in sorted(attributes)
    ]
    return "".join(parts)
|
||||
@@ -0,0 +1,203 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .abc import Writer
|
||||
|
||||
import re
|
||||
|
||||
|
||||
class _WriterOutput(object):
|
||||
def __init__(self, start, end=None, generate_end=None, anchor_position=None):
|
||||
if generate_end is None:
|
||||
generate_end = _constant(end)
|
||||
|
||||
self.start = start
|
||||
self.generate_end = generate_end
|
||||
self.anchor_position = anchor_position
|
||||
|
||||
|
||||
def _constant(value):
|
||||
def get():
|
||||
return value
|
||||
|
||||
return get
|
||||
|
||||
|
||||
class _MarkdownState(object):
|
||||
def __init__(self):
|
||||
self._list_state_stack = []
|
||||
self.list_state = None
|
||||
self.list_item_has_closed = False
|
||||
|
||||
def update_list_state(self, list_state):
|
||||
self._list_state_stack.append(self.list_state)
|
||||
self.list_state = list_state
|
||||
|
||||
def pop_list_state(self):
|
||||
self.list_state = self._list_state_stack.pop()
|
||||
|
||||
|
||||
class _MarkdownListState(object):
|
||||
def __init__(self, ordered, indentation):
|
||||
self.ordered = ordered
|
||||
self.count = 0
|
||||
self.indentation = indentation
|
||||
|
||||
|
||||
def _symmetric_wrapped(end):
|
||||
return _Wrapped(end, end)
|
||||
|
||||
|
||||
class _Wrapped(object):
|
||||
def __init__(self, start, end):
|
||||
self._start = start
|
||||
self._end = end
|
||||
|
||||
def __call__(self, attributes, markdown_state):
|
||||
return _WriterOutput(self._start, self._end)
|
||||
|
||||
|
||||
def _hyperlink(attributes, markdown_state):
|
||||
href = attributes.get("href", "")
|
||||
if href:
|
||||
return _WriterOutput(
|
||||
"[", "]({0})".format(href),
|
||||
anchor_position="before",
|
||||
)
|
||||
else:
|
||||
return _default_output
|
||||
|
||||
|
||||
def _image(attributes, markdown_state):
    """Write an <img> as a Markdown image: .

    The original literal was garbled to an empty string (the image syntax
    was eaten when this source was dumped); restored here.  When both src
    and alt are empty, nothing is written.
    """
    src = attributes.get("src", "")
    alt_text = attributes.get("alt", "")
    if src or alt_text:
        return _WriterOutput("![{0}]({1})".format(alt_text, src), "")
    else:
        return _default_output
|
||||
|
||||
|
||||
def _list(ordered):
|
||||
def call(attributes, markdown_state):
|
||||
if markdown_state.list_state is None:
|
||||
start = ""
|
||||
end_text = "\n"
|
||||
indentation = 0
|
||||
else:
|
||||
start = "\n"
|
||||
end_text = ""
|
||||
indentation = markdown_state.list_state.indentation + 1
|
||||
|
||||
def generate_end():
|
||||
markdown_state.pop_list_state()
|
||||
return end_text
|
||||
|
||||
markdown_state.update_list_state(_MarkdownListState(
|
||||
ordered=ordered,
|
||||
indentation=indentation,
|
||||
))
|
||||
|
||||
return _WriterOutput(start, generate_end=generate_end)
|
||||
|
||||
return call
|
||||
|
||||
|
||||
def _list_item(attributes, markdown_state):
|
||||
markdown_state.list_item_has_closed = False
|
||||
|
||||
list_state = markdown_state.list_state or _MarkdownListState(ordered=False, indentation=0)
|
||||
list_state.count += 1
|
||||
|
||||
if list_state.ordered:
|
||||
bullet = "{0}.".format(list_state.count)
|
||||
else:
|
||||
bullet = "-"
|
||||
|
||||
def generate_end():
|
||||
if markdown_state.list_item_has_closed:
|
||||
return ""
|
||||
else:
|
||||
markdown_state.list_item_has_closed = True
|
||||
return "\n"
|
||||
|
||||
return _WriterOutput(
|
||||
start=("\t" * list_state.indentation) + bullet + " ",
|
||||
generate_end=generate_end
|
||||
)
|
||||
|
||||
|
||||
def _init_writers():
|
||||
writers = {
|
||||
"p": _Wrapped("", "\n\n"),
|
||||
"br": _Wrapped("", " \n"),
|
||||
"strong": _symmetric_wrapped("__"),
|
||||
"em": _symmetric_wrapped("*"),
|
||||
"a": _hyperlink,
|
||||
"img": _image,
|
||||
"ol": _list(ordered=True),
|
||||
"ul": _list(ordered=False),
|
||||
"li": _list_item,
|
||||
}
|
||||
|
||||
for level in range(1, 7):
|
||||
writers["h{0}".format(level)] = _Wrapped("#" * level + " ", "\n\n")
|
||||
|
||||
return writers
|
||||
|
||||
|
||||
_writers = _init_writers()
|
||||
_default_output = _WriterOutput("", "")
|
||||
|
||||
def _default_writer(attributes, markdown_state):
|
||||
return _default_output
|
||||
|
||||
|
||||
class MarkdownWriter(Writer):
    """Accumulates markdown text from a stream of HTML-like start/end/text events.

    Tag-specific output comes from the ``_writers`` table; unknown tags fall
    back to ``_default_writer`` and emit nothing.
    """

    def __init__(self):
        self._fragments = []
        # One generate_end callable per currently-open element, in open order.
        self._element_stack = []
        self._markdown_state = _MarkdownState()

    def text(self, text):
        """Append literal text with markdown punctuation escaped."""
        self._fragments.append(_escape_markdown(text))

    def start(self, name, attributes=None):
        """Open element *name*: record its end generator, emit its start text."""
        if attributes is None:
            attributes = {}

        make_output = _writers.get(name, _default_writer)
        output = make_output(attributes, self._markdown_state)
        self._element_stack.append(output.generate_end)

        # The writer chooses whether any <a id=...> anchor is emitted before
        # or after the element's start text.
        if output.anchor_position == "before":
            self._write_anchor(attributes)
            self._fragments.append(output.start)
        else:
            self._fragments.append(output.start)
            self._write_anchor(attributes)

    def end(self, name):
        """Close the most recently opened element and emit its end text."""
        generate_end = self._element_stack.pop()
        self._fragments.append(generate_end())

    def self_closing(self, name, attributes=None):
        """Treat a childless element as a start immediately followed by an end."""
        self.start(name, attributes)
        self.end(name)

    def append(self, other):
        """Append raw, pre-formatted markdown without any escaping."""
        self._fragments.append(other)

    def as_string(self):
        """Return everything written so far as one string."""
        return "".join(self._fragments)

    def _write_anchor(self, attributes):
        # Emit an inline HTML anchor so element ids survive the conversion
        # to markdown.
        html_id = attributes.get("id")
        if html_id:
            self._fragments.append('<a id="{0}"></a>'.format(html_id))
|
||||
|
||||
|
||||
def _escape_markdown(value):
|
||||
return re.sub(r"([\`\*_\{\}\[\]\(\)\#\+\-\.\!])", r"\\\1", re.sub("\\\\", "\\\\\\\\", value))
|
||||
77
path/to/venv/lib/python3.12/site-packages/mammoth/zips.py
Normal file
77
path/to/venv/lib/python3.12/site-packages/mammoth/zips.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import contextlib
|
||||
import io
|
||||
import shutil
|
||||
|
||||
from zipfile import ZipFile
|
||||
|
||||
|
||||
def open_zip(fileobj, mode):
    """Open the zip archive in *fileobj* with *mode* ("r"/"w"), wrapped in _Zip."""
    archive = ZipFile(fileobj, mode)
    return _Zip(archive)
|
||||
|
||||
|
||||
class _Zip(object):
|
||||
def __init__(self, zip_file):
|
||||
self._zip_file = zip_file
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, *args):
|
||||
self._zip_file.close()
|
||||
|
||||
def open(self, name):
|
||||
return contextlib.closing(self._zip_file.open(name))
|
||||
|
||||
def exists(self, name):
|
||||
try:
|
||||
self._zip_file.getinfo(name)
|
||||
return True
|
||||
except KeyError:
|
||||
return False
|
||||
|
||||
def read_str(self, name):
|
||||
return self._zip_file.read(name).decode("utf8")
|
||||
|
||||
|
||||
def update_zip(fileobj, files):
    """Rewrite the zip archive in *fileobj* in place, replacing or adding entries.

    :param fileobj: seekable binary file-like object containing a zip archive;
        its contents are rewritten in place.
    :param files: mapping from archive entry name to new contents (str or
        bytes). Entries not present in the mapping are copied over unchanged;
        names only present in the mapping are added.
    """
    destination_fileobj = io.BytesIO()
    with ZipFile(fileobj, "r") as source, ZipFile(destination_fileobj, "w") as destination:
        # Union of existing entries and replacements: untouched entries are
        # copied, replaced ones are overwritten, new ones are appended.
        names = set(source.namelist()) | set(files.keys())
        for name in names:
            if name in files:
                contents = files[name]
            else:
                contents = source.read(name)
            destination.writestr(name, contents)

    fileobj.seek(0)
    destination_fileobj.seek(0)
    shutil.copyfileobj(destination_fileobj, fileobj)
    # Discard any leftover bytes from the previous archive. Without this, a
    # rewrite shorter than the original leaves the stale end-of-central-
    # directory record at the tail, and zip readers (which locate that record
    # by scanning backwards from the end of the file) see a corrupt archive.
    fileobj.truncate()
|
||||
|
||||
|
||||
def split_path(path):
    """Split *path* into a (directory, name) pair at the last "/".

    A path with no "/" yields an empty directory component.
    """
    if "/" not in path:
        return ("", path)
    directory, _, name = path.rpartition("/")
    return (directory, name)
|
||||
|
||||
|
||||
def join_path(*args):
    """Join path segments with "/", where an absolute segment resets the path.

    Empty segments are ignored; a segment starting with "/" discards all
    segments accumulated before it.
    """
    segments = []
    for part in args:
        if not part:
            continue
        if part.startswith("/"):
            segments = [part]
        else:
            segments.append(part)
    return "/".join(segments)
|
||||
Reference in New Issue
Block a user