Files
2025-12-30 11:27:14 +07:00

409 lines
14 KiB
Python

# coding=utf-8
from __future__ import unicode_literals
from functools import partial
import cobble
from . import documents, results, html_paths, images, writers, html
from .docx.files import InvalidFileReferenceError
from .lists import find_index
def convert_document_element_to_html(
        element,
        style_map=None,
        convert_image=None,
        id_prefix=None,
        output_format=None,
        ignore_empty_paragraphs=True):
    """Convert a document element to markup and collect conversion messages.

    :param element: the document element to convert. When it is a
        ``documents.Document`` its comments are indexed by ``comment_id`` so
        that comment references can be resolved during conversion.
    :param style_map: styles consulted when choosing HTML paths; defaults to
        an empty list.
    :param convert_image: callable used to convert image elements; defaults
        to ``images.data_uri``.
    :param id_prefix: string prepended to every generated HTML id; defaults
        to the empty string.
    :param output_format: passed unchanged to ``writers.writer`` to select
        the output writer.
    :param ignore_empty_paragraphs: when false, paragraphs are force-written
        even if they have no content.
    :returns: ``results.Result`` holding the written output string and the
        list of messages produced during conversion.
    """
    style_map = [] if style_map is None else style_map
    id_prefix = "" if id_prefix is None else id_prefix
    convert_image = images.data_uri if convert_image is None else convert_image

    # Only whole documents carry comments; any other element gets no lookup.
    comments = {}
    if isinstance(element, documents.Document):
        comments = {
            comment.comment_id: comment
            for comment in element.comments
        }

    # The same list object is handed to the converter and to the result, so
    # warnings appended while visiting show up in the returned Result.
    messages = []
    converter = _DocumentConverter(
        messages=messages,
        style_map=style_map,
        convert_image=convert_image,
        id_prefix=id_prefix,
        ignore_empty_paragraphs=ignore_empty_paragraphs,
        note_references=[],
        comments=comments,
    )
    root_context = _ConversionContext(is_table_header=False)
    html_nodes = converter.visit(element, root_context)

    # Drop empty nodes, collapse adjacent ones, then serialise.
    tidied_nodes = html.collapse(html.strip_empty(html_nodes))
    writer = writers.writer(output_format)
    html.write(writer, tidied_nodes)
    return results.Result(writer.as_string(), messages)
@cobble.data
class _ConversionContext(object):
    """State threaded through the visitor while converting.

    The only field is ``is_table_header``, which ``visit_table_cell`` reads
    to choose between ``th`` and ``td``.
    """

    is_table_header = cobble.field()

    def copy(self, **changes):
        """Return a copy of this context with the given fields replaced."""
        return cobble.copy(self, **changes)
class _DocumentConverter(documents.element_visitor(args=1)):
    """Visitor that converts document elements into HTML nodes.

    Each ``visit_*`` method takes the element plus a ``_ConversionContext``
    and returns the HTML nodes for that element; ``_visit_all`` flattens the
    results for a sequence of elements. Warnings are appended to the
    ``messages`` list supplied at construction time.
    """

    # Path used for tables when the style map has no matching "table" style.
    _default_table_path = html_paths.path([html_paths.element(["table"], fresh=True)])

    def __init__(self, messages, style_map, convert_image, id_prefix, ignore_empty_paragraphs, note_references, comments):
        """Store conversion settings and the mutable state shared with the caller.

        :param messages: list that warnings are appended to.
        :param style_map: iterable of styles searched in order by ``_find_style``.
        :param convert_image: callable applied to image elements.
        :param id_prefix: prefix for every generated HTML id.
        :param ignore_empty_paragraphs: when false, paragraphs are force-written.
        :param note_references: list that visited note references are appended to.
        :param comments: mapping from comment id to comment.
        """
        # Settings.
        self._style_map = style_map
        self._convert_image = convert_image
        self._id_prefix = id_prefix
        self._ignore_empty_paragraphs = ignore_empty_paragraphs
        # Lookup table and accumulators filled in while visiting.
        self._comments = comments
        self._messages = messages
        self._note_references = note_references
        self._referenced_comments = []

    def visit_image(self, image, context):
        """Convert an image with the configured converter.

        An ``InvalidFileReferenceError`` is recorded as a warning and yields
        no nodes instead of aborting the conversion.
        """
        try:
            converted = self._convert_image(image)
        except InvalidFileReferenceError as error:
            self._messages.append(results.warning(str(error)))
            converted = []
        return converted

    def visit_document(self, document, context):
        """Convert the document body, followed by its notes and comments lists."""
        # The body goes first: visiting it is what records the note and
        # comment references that the two trailing lists are built from.
        body_nodes = self._visit_all(document.children, context)

        resolved_notes = [
            document.notes.resolve(reference)
            for reference in self._note_references
        ]
        notes_element = html.element("ol", {}, self._visit_all(resolved_notes, context))

        comment_nodes = []
        for referenced_comment in self._referenced_comments:
            comment_nodes.extend(self.visit_comment(referenced_comment, context))
        comments_element = html.element("dl", {}, comment_nodes)

        return body_nodes + [notes_element, comments_element]

    def visit_paragraph(self, paragraph, context):
        """Wrap the paragraph's converted children in the paragraph's HTML path."""
        def generate_children():
            converted = self._visit_all(paragraph.children, context)
            if not self._ignore_empty_paragraphs:
                # Keep the paragraph in the output even when it has no content.
                return [html.force_write] + converted
            return converted

        paragraph_path = self._find_html_path_for_paragraph(paragraph)
        return paragraph_path.wrap(generate_children)

    def visit_run(self, run, context):
        """Convert a run, wrapping its children once per active run property.

        Paths are collected in a fixed order (highlight, small caps, all caps,
        strikethrough, underline, sub/superscript, italic, bold, run style)
        and applied in that order, so the first collected path is the
        innermost wrapper.
        """
        def convert_children():
            return self._visit_all(run.children, context)

        wrappers = []

        highlight = run.highlight
        if highlight is not None:
            highlight_style = self._find_style(Highlight(color=highlight), "highlight")
            if highlight_style is not None:
                wrappers.append(highlight_style.html_path)

        if run.is_small_caps:
            wrappers.append(self._find_style_for_run_property("small_caps"))
        if run.is_all_caps:
            wrappers.append(self._find_style_for_run_property("all_caps"))
        if run.is_strikethrough:
            wrappers.append(self._find_style_for_run_property("strikethrough", default="s"))
        if run.is_underline:
            wrappers.append(self._find_style_for_run_property("underline"))
        if run.vertical_alignment == documents.VerticalAlignment.subscript:
            wrappers.append(html_paths.element(["sub"], fresh=False))
        if run.vertical_alignment == documents.VerticalAlignment.superscript:
            wrappers.append(html_paths.element(["sup"], fresh=False))
        if run.is_italic:
            wrappers.append(self._find_style_for_run_property("italic", default="em"))
        if run.is_bold:
            wrappers.append(self._find_style_for_run_property("bold", default="strong"))

        wrappers.append(self._find_html_path_for_run(run))

        # Each path wraps the generator built so far; nothing is converted
        # until the outermost generator is finally called.
        generate = convert_children
        for html_path in wrappers:
            generate = partial(html_path.wrap, generate)
        return generate()

    def _find_style_for_run_property(self, element_type, default=None):
        """Return the HTML path for a run property such as "bold".

        A matching style from the style map wins; otherwise an element named
        ``default`` is used, or the empty path when no default is given.
        """
        style = self._find_style(None, element_type)
        if style is not None:
            return style.html_path
        if default is None:
            return html_paths.empty
        return html_paths.element(default, fresh=False)

    def visit_text(self, text, context):
        """Convert a text element to a single HTML text node."""
        text_node = html.text(text.value)
        return [text_node]

    def visit_hyperlink(self, hyperlink, context):
        """Convert a hyperlink to a collapsible ``a`` element.

        An anchor, when present, takes precedence over ``href`` and is turned
        into a fragment link using the prefixed HTML id.
        """
        anchor = hyperlink.anchor
        if anchor is not None:
            href = "#{0}".format(self._html_id(anchor))
        else:
            href = hyperlink.href

        attributes = {"href": href}
        target_frame = hyperlink.target_frame
        if target_frame is not None:
            attributes["target"] = target_frame

        children = self._visit_all(hyperlink.children, context)
        return [html.collapsible_element("a", attributes, children)]

    def visit_checkbox(self, checkbox, context):
        """Convert a checkbox to an ``input`` element of type checkbox."""
        attributes = {"type": "checkbox"}
        if checkbox.checked:
            attributes["checked"] = "checked"
        checkbox_node = html.element("input", attributes)
        return [checkbox_node]

    def visit_bookmark(self, bookmark, context):
        """Convert a bookmark to a force-written ``a`` element carrying its id."""
        attributes = {"id": self._html_id(bookmark.name)}
        return [html.collapsible_element("a", attributes, [html.force_write])]

    def visit_tab(self, tab, context):
        """Convert a tab element to a text node containing a tab character."""
        tab_node = html.text("\t")
        return [tab_node]

    def visit_table(self, table, context):
        """Wrap the converted table rows in the table's HTML path."""
        def generate_children():
            return self._convert_table_children(table, context)

        table_path = self._find_html_path(table, "table", self._default_table_path)
        return table_path.wrap(generate_children)

    def _convert_table_children(self, table, context):
        """Convert table children, splitting leading header rows into ``thead``.

        The header is the run of ``TableRow`` children with ``is_header`` set
        at the start of the table. Without such rows the children are emitted
        directly, with no ``thead``/``tbody`` wrappers.
        """
        def starts_body(child):
            return not (isinstance(child, documents.TableRow) and child.is_header)

        body_index = find_index(starts_body, table.children)
        if body_index is None:
            # Every child is a header row.
            body_index = len(table.children)

        header_context = context.copy(is_table_header=True)
        if body_index == 0:
            children = self._visit_all(table.children, context.copy(is_table_header=False))
        else:
            head_rows = self._visit_all(table.children[:body_index], header_context)
            body_rows = self._visit_all(table.children[body_index:], context.copy(is_table_header=False))
            children = [
                html.element("thead", {}, head_rows),
                html.element("tbody", {}, body_rows),
            ]
        return [html.force_write] + children

    def visit_table_row(self, table_row, context):
        """Convert a table row to a force-written ``tr`` element."""
        row_children = [html.force_write] + self._visit_all(table_row.children, context)
        return [html.element("tr", {}, row_children)]

    def visit_table_cell(self, table_cell, context):
        """Convert a table cell to ``th`` or ``td`` depending on the context.

        ``colspan`` and ``rowspan`` attributes are only emitted when they
        differ from 1.
        """
        tag_name = "th" if context.is_table_header else "td"

        attributes = {}
        for span_name in ("colspan", "rowspan"):
            span = getattr(table_cell, span_name)
            if span != 1:
                attributes[span_name] = str(span)

        cell_children = [html.force_write] + self._visit_all(table_cell.children, context)
        return [html.element(tag_name, attributes, cell_children)]

    def visit_break(self, break_, context):
        """Convert a break using its HTML path; breaks have no children."""
        def no_children():
            return []

        return self._find_html_path_for_break(break_).wrap(no_children)

    def _find_html_path_for_break(self, break_):
        """Return the HTML path for a break.

        A matching style wins; otherwise line breaks become ``br`` and every
        other break type maps to the empty path.
        """
        style = self._find_style(break_, "break")
        if style is not None:
            return style.html_path
        if break_.break_type == "line":
            return html_paths.path([html_paths.element("br", fresh=True)])
        return html_paths.empty

    def visit_note_reference(self, note_reference, context):
        """Record a note reference and emit its superscript link.

        The displayed number is the reference's 1-based position among the
        note references seen so far.
        """
        self._note_references.append(note_reference)
        number = len(self._note_references)

        link_attributes = {
            "href": "#" + self._note_html_id(note_reference),
            "id": self._note_ref_html_id(note_reference),
        }
        link = html.element("a", link_attributes, [html.text("[{0}]".format(number))])
        return [html.element("sup", {}, [link])]

    def visit_note(self, note, context):
        """Convert a note to an ``li`` whose body ends with a link back to its reference."""
        body_nodes = self._visit_all(note.body, context)

        back_link = html.element(
            "a",
            {"href": "#" + self._note_ref_html_id(note)},
            [html.text(_up_arrow)],
        )
        trailer = html.collapsible_element("p", {}, [html.text(" "), back_link])

        item_attributes = {"id": self._note_html_id(note)}
        return [html.element("li", item_attributes, body_nodes + [trailer])]

    def visit_comment_reference(self, reference, context):
        """Convert a comment reference through the "comment_reference" HTML path.

        The link is generated lazily by the path's ``wrap``; the comment is
        only registered for output when that generator is actually called.
        With no matching style the path falls back to ``html_paths.ignore``
        (presumably dropping the reference - confirm in ``html_paths``).
        """
        def generate_nodes():
            comment_id = reference.comment_id
            comment = self._comments[comment_id]
            position = len(self._referenced_comments) + 1
            label = "[{0}{1}]".format(_comment_author_label(comment), position)
            self._referenced_comments.append((label, comment))

            # TODO: remove duplication with note references
            link_attributes = {
                "href": "#" + self._referent_html_id("comment", comment_id),
                "id": self._reference_html_id("comment", comment_id),
            }
            return [html.element("a", link_attributes, [html.text(label)])]

        reference_path = self._find_html_path(
            None,
            "comment_reference",
            default=html_paths.ignore,
        )
        return reference_path.wrap(generate_nodes)

    def visit_comment(self, referenced_comment, context):
        """Convert a ``(label, comment)`` pair to a ``dt``/``dd`` pair.

        The ``dd`` body ends with a link back to the comment's reference.
        """
        label, comment = referenced_comment
        comment_id = comment.comment_id

        # TODO remove duplication with notes
        body_nodes = self._visit_all(comment.body, context)
        back_link = html.element(
            "a",
            {"href": "#" + self._reference_html_id("comment", comment_id)},
            [html.text(_up_arrow)],
        )
        trailer = html.collapsible_element("p", {}, [html.text(" "), back_link])

        term = html.element(
            "dt",
            {"id": self._referent_html_id("comment", comment_id)},
            [html.text("Comment {0}".format(label))],
        )
        definition = html.element("dd", {}, body_nodes + [trailer])
        return [term, definition]

    def _visit_all(self, elements, context):
        """Visit each element in order and return all resulting nodes as one flat list."""
        collected = []
        for element in elements:
            collected.extend(self.visit(element, context))
        return collected

    def _find_html_path_for_paragraph(self, paragraph):
        """Return the paragraph's HTML path, defaulting to a fresh ``p``."""
        fallback = html_paths.path([html_paths.element("p", fresh=True)])
        return self._find_html_path(paragraph, "paragraph", fallback, warn_unrecognised=True)

    def _find_html_path_for_run(self, run):
        """Return the run's HTML path, defaulting to the empty path."""
        return self._find_html_path(run, "run", default=html_paths.empty, warn_unrecognised=True)

    def _find_html_path(self, element, element_type, default, warn_unrecognised=False):
        """Return the HTML path of the first matching style, else ``default``.

        When ``warn_unrecognised`` is set and the element has a non-None
        ``style_id`` that no style matched, a warning naming the style is
        appended to the messages before falling back.
        """
        style = self._find_style(element, element_type)
        if style is not None:
            return style.html_path

        if warn_unrecognised and getattr(element, "style_id", None) is not None:
            warning_text = "Unrecognised {0} style: {1} (Style ID: {2})".format(
                element_type, element.style_name, element.style_id)
            self._messages.append(results.warning(warning_text))

        return default

    def _find_style(self, element, element_type):
        """Return the first style whose matcher accepts the element, or None."""
        return next(
            (
                style
                for style in self._style_map
                if _document_matcher_matches(style.document_matcher, element, element_type)
            ),
            None,
        )

    def _note_html_id(self, note):
        """Return the HTML id of the note itself (the link target)."""
        return self._referent_html_id(note.note_type, note.note_id)

    def _note_ref_html_id(self, note):
        """Return the HTML id of the in-text reference to the note."""
        return self._reference_html_id(note.note_type, note.note_id)

    def _referent_html_id(self, reference_type, reference_id):
        """Build the prefixed id ``<type>-<id>`` for a referenced item."""
        unprefixed = "{0}-{1}".format(reference_type, reference_id)
        return self._html_id(unprefixed)

    def _reference_html_id(self, reference_type, reference_id):
        """Build the prefixed id ``<type>-ref-<id>`` for a reference link."""
        unprefixed = "{0}-ref-{1}".format(reference_type, reference_id)
        return self._html_id(unprefixed)

    def _html_id(self, suffix):
        """Prepend the configured id prefix to ``suffix``."""
        return "{0}{1}".format(self._id_prefix, suffix)
@cobble.data
class Highlight:
    """Stand-in element handed to the style matcher for a run's highlight.

    It carries only the highlight colour, which "highlight" matchers compare
    against their own ``color``.
    """

    color = cobble.field()
def _document_matcher_matches(matcher, element, element_type):
    """Report whether a style's document matcher applies to an element.

    The matcher must target ``element_type``. On top of that:

    * run-property and comment-reference matchers have no further conditions
      (``element`` is not inspected);
    * "highlight" matchers match any colour when ``matcher.color`` is None,
      otherwise the colours must be equal;
    * "break" matchers require equal ``break_type``;
    * all other matchers (e.g. paragraph, run) compare ``style_id`` and
      ``style_name`` when the matcher specifies them, and for paragraphs
      also ``numbering``.
    """
    matcher_type = matcher.element_type

    # Every kind of matcher requires the element types to agree.
    if matcher_type != element_type:
        return False

    property_types = (
        "underline", "strikethrough", "all_caps", "small_caps",
        "bold", "italic", "comment_reference",
    )
    if matcher_type in property_types:
        return True

    if matcher_type == "highlight":
        return matcher.color is None or matcher.color == element.color

    if matcher_type == "break":
        return matcher.break_type == element.break_type

    # matcher_type in ["paragraph", "run"] (and any other styled element).
    return (
        (
            matcher.style_id is None or
            matcher.style_id == element.style_id
        ) and (
            matcher.style_name is None or
            (element.style_name is not None and matcher.style_name.matches(element.style_name))
        ) and (
            element_type != "paragraph" or
            matcher.numbering is None or
            matcher.numbering == element.numbering
        )
    )
def _comment_author_label(comment):
    """Return the comment author's initials, or "" when they are missing or empty."""
    initials = comment.author_initials
    if initials:
        return initials
    return ""
# Link text for the "back to reference" anchors that end each note and
# comment body. It must be non-empty, otherwise those anchors render with no
# visible text. Written as an escape (U+2191 UPWARDS ARROW) so the character
# cannot be lost when the file is re-encoded; the module's unicode_literals
# import makes this a text string on Python 2 as well.
_up_arrow = "\u2191"