409 lines
14 KiB
Python
409 lines
14 KiB
Python
# coding=utf-8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
from functools import partial
|
|
|
|
import cobble
|
|
|
|
from . import documents, results, html_paths, images, writers, html
|
|
from .docx.files import InvalidFileReferenceError
|
|
from .lists import find_index
|
|
|
|
|
|
def convert_document_element_to_html(element,
|
|
style_map=None,
|
|
convert_image=None,
|
|
id_prefix=None,
|
|
output_format=None,
|
|
ignore_empty_paragraphs=True):
|
|
|
|
if style_map is None:
|
|
style_map = []
|
|
|
|
if id_prefix is None:
|
|
id_prefix = ""
|
|
|
|
if convert_image is None:
|
|
convert_image = images.data_uri
|
|
|
|
if isinstance(element, documents.Document):
|
|
comments = dict(
|
|
(comment.comment_id, comment)
|
|
for comment in element.comments
|
|
)
|
|
else:
|
|
comments = {}
|
|
|
|
messages = []
|
|
converter = _DocumentConverter(
|
|
messages=messages,
|
|
style_map=style_map,
|
|
convert_image=convert_image,
|
|
id_prefix=id_prefix,
|
|
ignore_empty_paragraphs=ignore_empty_paragraphs,
|
|
note_references=[],
|
|
comments=comments,
|
|
)
|
|
context = _ConversionContext(is_table_header=False)
|
|
nodes = converter.visit(element, context)
|
|
|
|
writer = writers.writer(output_format)
|
|
html.write(writer, html.collapse(html.strip_empty(nodes)))
|
|
return results.Result(writer.as_string(), messages)
|
|
|
|
|
|
@cobble.data
|
|
class _ConversionContext(object):
|
|
is_table_header = cobble.field()
|
|
|
|
def copy(self, **kwargs):
|
|
return cobble.copy(self, **kwargs)
|
|
|
|
|
|
class _DocumentConverter(documents.element_visitor(args=1)):
|
|
def __init__(self, messages, style_map, convert_image, id_prefix, ignore_empty_paragraphs, note_references, comments):
|
|
self._messages = messages
|
|
self._style_map = style_map
|
|
self._id_prefix = id_prefix
|
|
self._ignore_empty_paragraphs = ignore_empty_paragraphs
|
|
self._note_references = note_references
|
|
self._referenced_comments = []
|
|
self._convert_image = convert_image
|
|
self._comments = comments
|
|
|
|
def visit_image(self, image, context):
|
|
try:
|
|
return self._convert_image(image)
|
|
except InvalidFileReferenceError as error:
|
|
self._messages.append(results.warning(str(error)))
|
|
return []
|
|
|
|
def visit_document(self, document, context):
|
|
nodes = self._visit_all(document.children, context)
|
|
notes = [
|
|
document.notes.resolve(reference)
|
|
for reference in self._note_references
|
|
]
|
|
notes_list = html.element("ol", {}, self._visit_all(notes, context))
|
|
comments = html.element("dl", {}, [
|
|
html_node
|
|
for referenced_comment in self._referenced_comments
|
|
for html_node in self.visit_comment(referenced_comment, context)
|
|
])
|
|
return nodes + [notes_list, comments]
|
|
|
|
|
|
def visit_paragraph(self, paragraph, context):
|
|
def children():
|
|
content = self._visit_all(paragraph.children, context)
|
|
if self._ignore_empty_paragraphs:
|
|
return content
|
|
else:
|
|
return [html.force_write] + content
|
|
|
|
html_path = self._find_html_path_for_paragraph(paragraph)
|
|
return html_path.wrap(children)
|
|
|
|
|
|
def visit_run(self, run, context):
|
|
nodes = lambda: self._visit_all(run.children, context)
|
|
paths = []
|
|
if run.highlight is not None:
|
|
style = self._find_style(Highlight(color=run.highlight), "highlight")
|
|
if style is not None:
|
|
paths.append(style.html_path)
|
|
if run.is_small_caps:
|
|
paths.append(self._find_style_for_run_property("small_caps"))
|
|
if run.is_all_caps:
|
|
paths.append(self._find_style_for_run_property("all_caps"))
|
|
if run.is_strikethrough:
|
|
paths.append(self._find_style_for_run_property("strikethrough", default="s"))
|
|
if run.is_underline:
|
|
paths.append(self._find_style_for_run_property("underline"))
|
|
if run.vertical_alignment == documents.VerticalAlignment.subscript:
|
|
paths.append(html_paths.element(["sub"], fresh=False))
|
|
if run.vertical_alignment == documents.VerticalAlignment.superscript:
|
|
paths.append(html_paths.element(["sup"], fresh=False))
|
|
if run.is_italic:
|
|
paths.append(self._find_style_for_run_property("italic", default="em"))
|
|
if run.is_bold:
|
|
paths.append(self._find_style_for_run_property("bold", default="strong"))
|
|
paths.append(self._find_html_path_for_run(run))
|
|
|
|
for path in paths:
|
|
nodes = partial(path.wrap, nodes)
|
|
|
|
return nodes()
|
|
|
|
|
|
def _find_style_for_run_property(self, element_type, default=None):
|
|
style = self._find_style(None, element_type)
|
|
if style is not None:
|
|
return style.html_path
|
|
elif default is not None:
|
|
return html_paths.element(default, fresh=False)
|
|
else:
|
|
return html_paths.empty
|
|
|
|
|
|
def visit_text(self, text, context):
|
|
return [html.text(text.value)]
|
|
|
|
|
|
def visit_hyperlink(self, hyperlink, context):
|
|
if hyperlink.anchor is None:
|
|
href = hyperlink.href
|
|
else:
|
|
href = "#{0}".format(self._html_id(hyperlink.anchor))
|
|
|
|
attributes = {"href": href}
|
|
if hyperlink.target_frame is not None:
|
|
attributes["target"] = hyperlink.target_frame
|
|
|
|
nodes = self._visit_all(hyperlink.children, context)
|
|
return [html.collapsible_element("a", attributes, nodes)]
|
|
|
|
|
|
def visit_checkbox(self, checkbox, context):
|
|
attributes = {"type": "checkbox"}
|
|
|
|
if checkbox.checked:
|
|
attributes["checked"] = "checked"
|
|
|
|
return [html.element("input", attributes)]
|
|
|
|
|
|
def visit_bookmark(self, bookmark, context):
|
|
element = html.collapsible_element(
|
|
"a",
|
|
{"id": self._html_id(bookmark.name)},
|
|
[html.force_write])
|
|
return [element]
|
|
|
|
|
|
def visit_tab(self, tab, context):
|
|
return [html.text("\t")]
|
|
|
|
_default_table_path = html_paths.path([html_paths.element(["table"], fresh=True)])
|
|
|
|
def visit_table(self, table, context):
|
|
return self._find_html_path(table, "table", self._default_table_path) \
|
|
.wrap(lambda: self._convert_table_children(table, context))
|
|
|
|
def _convert_table_children(self, table, context):
|
|
body_index = find_index(
|
|
lambda child: not isinstance(child, documents.TableRow) or not child.is_header,
|
|
table.children,
|
|
)
|
|
if body_index is None:
|
|
body_index = len(table.children)
|
|
|
|
if body_index == 0:
|
|
children = self._visit_all(table.children, context.copy(is_table_header=False))
|
|
else:
|
|
head_rows = self._visit_all(table.children[:body_index], context.copy(is_table_header=True))
|
|
body_rows = self._visit_all(table.children[body_index:], context.copy(is_table_header=False))
|
|
children = [
|
|
html.element("thead", {}, head_rows),
|
|
html.element("tbody", {}, body_rows),
|
|
]
|
|
|
|
return [html.force_write] + children
|
|
|
|
|
|
def visit_table_row(self, table_row, context):
|
|
return [html.element("tr", {}, [html.force_write] + self._visit_all(table_row.children, context))]
|
|
|
|
|
|
def visit_table_cell(self, table_cell, context):
|
|
if context.is_table_header:
|
|
tag_name = "th"
|
|
else:
|
|
tag_name = "td"
|
|
attributes = {}
|
|
if table_cell.colspan != 1:
|
|
attributes["colspan"] = str(table_cell.colspan)
|
|
if table_cell.rowspan != 1:
|
|
attributes["rowspan"] = str(table_cell.rowspan)
|
|
nodes = [html.force_write] + self._visit_all(table_cell.children, context)
|
|
return [
|
|
html.element(tag_name, attributes, nodes)
|
|
]
|
|
|
|
|
|
def visit_break(self, break_, context):
|
|
return self._find_html_path_for_break(break_).wrap(lambda: [])
|
|
|
|
|
|
def _find_html_path_for_break(self, break_):
|
|
style = self._find_style(break_, "break")
|
|
if style is not None:
|
|
return style.html_path
|
|
elif break_.break_type == "line":
|
|
return html_paths.path([html_paths.element("br", fresh=True)])
|
|
else:
|
|
return html_paths.empty
|
|
|
|
|
|
def visit_note_reference(self, note_reference, context):
|
|
self._note_references.append(note_reference)
|
|
note_number = len(self._note_references)
|
|
return [
|
|
html.element("sup", {}, [
|
|
html.element("a", {
|
|
"href": "#" + self._note_html_id(note_reference),
|
|
"id": self._note_ref_html_id(note_reference),
|
|
}, [html.text("[{0}]".format(note_number))])
|
|
])
|
|
]
|
|
|
|
|
|
def visit_note(self, note, context):
|
|
note_body = self._visit_all(note.body, context) + [
|
|
html.collapsible_element("p", {}, [
|
|
html.text(" "),
|
|
html.element("a", {"href": "#" + self._note_ref_html_id(note)}, [
|
|
html.text(_up_arrow)
|
|
]),
|
|
])
|
|
]
|
|
return [
|
|
html.element("li", {"id": self._note_html_id(note)}, note_body)
|
|
]
|
|
|
|
|
|
def visit_comment_reference(self, reference, context):
|
|
def nodes():
|
|
comment = self._comments[reference.comment_id]
|
|
count = len(self._referenced_comments) + 1
|
|
label = "[{0}{1}]".format(_comment_author_label(comment), count)
|
|
self._referenced_comments.append((label, comment))
|
|
return [
|
|
# TODO: remove duplication with note references
|
|
html.element("a", {
|
|
"href": "#" + self._referent_html_id("comment", reference.comment_id),
|
|
"id": self._reference_html_id("comment", reference.comment_id),
|
|
}, [html.text(label)])
|
|
]
|
|
|
|
html_path = self._find_html_path(
|
|
None,
|
|
"comment_reference",
|
|
default=html_paths.ignore,
|
|
)
|
|
|
|
return html_path.wrap(nodes)
|
|
|
|
def visit_comment(self, referenced_comment, context):
|
|
label, comment = referenced_comment
|
|
# TODO remove duplication with notes
|
|
body = self._visit_all(comment.body, context) + [
|
|
html.collapsible_element("p", {}, [
|
|
html.text(" "),
|
|
html.element("a", {"href": "#" + self._reference_html_id("comment", comment.comment_id)}, [
|
|
html.text(_up_arrow)
|
|
]),
|
|
])
|
|
]
|
|
return [
|
|
html.element(
|
|
"dt",
|
|
{"id": self._referent_html_id("comment", comment.comment_id)},
|
|
[html.text("Comment {0}".format(label))],
|
|
),
|
|
html.element("dd", {}, body),
|
|
]
|
|
|
|
|
|
def _visit_all(self, elements, context):
|
|
return [
|
|
html_node
|
|
for element in elements
|
|
for html_node in self.visit(element, context)
|
|
]
|
|
|
|
|
|
def _find_html_path_for_paragraph(self, paragraph):
|
|
default = html_paths.path([html_paths.element("p", fresh=True)])
|
|
return self._find_html_path(paragraph, "paragraph", default, warn_unrecognised=True)
|
|
|
|
def _find_html_path_for_run(self, run):
|
|
return self._find_html_path(run, "run", default=html_paths.empty, warn_unrecognised=True)
|
|
|
|
|
|
def _find_html_path(self, element, element_type, default, warn_unrecognised=False):
|
|
style = self._find_style(element, element_type)
|
|
if style is not None:
|
|
return style.html_path
|
|
|
|
if warn_unrecognised and getattr(element, "style_id", None) is not None:
|
|
self._messages.append(results.warning(
|
|
"Unrecognised {0} style: {1} (Style ID: {2})".format(
|
|
element_type, element.style_name, element.style_id)
|
|
))
|
|
|
|
return default
|
|
|
|
def _find_style(self, element, element_type):
|
|
for style in self._style_map:
|
|
document_matcher = style.document_matcher
|
|
if _document_matcher_matches(document_matcher, element, element_type):
|
|
return style
|
|
|
|
def _note_html_id(self, note):
|
|
return self._referent_html_id(note.note_type, note.note_id)
|
|
|
|
def _note_ref_html_id(self, note):
|
|
return self._reference_html_id(note.note_type, note.note_id)
|
|
|
|
def _referent_html_id(self, reference_type, reference_id):
|
|
return self._html_id("{0}-{1}".format(reference_type, reference_id))
|
|
|
|
def _reference_html_id(self, reference_type, reference_id):
|
|
return self._html_id("{0}-ref-{1}".format(reference_type, reference_id))
|
|
|
|
def _html_id(self, suffix):
|
|
return "{0}{1}".format(self._id_prefix, suffix)
|
|
|
|
|
|
@cobble.data
|
|
class Highlight:
|
|
color = cobble.field()
|
|
|
|
|
|
def _document_matcher_matches(matcher, element, element_type):
|
|
if matcher.element_type in ["underline", "strikethrough", "all_caps", "small_caps", "bold", "italic", "comment_reference"]:
|
|
return matcher.element_type == element_type
|
|
elif matcher.element_type == "highlight":
|
|
return (
|
|
matcher.element_type == element_type and
|
|
(matcher.color is None or matcher.color == element.color)
|
|
)
|
|
elif matcher.element_type == "break":
|
|
return (
|
|
matcher.element_type == element_type and
|
|
matcher.break_type == element.break_type
|
|
)
|
|
else: # matcher.element_type in ["paragraph", "run"]:
|
|
return (
|
|
matcher.element_type == element_type and (
|
|
matcher.style_id is None or
|
|
matcher.style_id == element.style_id
|
|
) and (
|
|
matcher.style_name is None or
|
|
element.style_name is not None and (matcher.style_name.matches(element.style_name))
|
|
) and (
|
|
element_type != "paragraph" or
|
|
matcher.numbering is None or
|
|
matcher.numbering == element.numbering
|
|
)
|
|
)
|
|
|
|
|
|
def _comment_author_label(comment):
|
|
return comment.author_initials or ""
|
|
|
|
|
|
_up_arrow = "↑"
|