Initial commit (Clean history)

This commit is contained in:
anhduy-tech
2025-12-30 11:27:14 +07:00
commit ef48c93de0
19255 changed files with 3248867 additions and 0 deletions

View File

@@ -0,0 +1,8 @@
import collections
# A style mapping pairs a matcher for document elements with the HTML path
# that matched elements should be rendered as.
Style = collections.namedtuple("Style", ["document_matcher", "html_path"])


def style(document_matcher, html_path):
    """Create a Style linking *document_matcher* to *html_path*."""
    return Style(document_matcher=document_matcher, html_path=html_path)

View File

@@ -0,0 +1,14 @@
from .errors import LineParseError
from .style_mapping_parser import parse_style_mapping
from .tokeniser import tokenise
from .token_iterator import TokenIterator
from ... import results
def read_style_mapping(string):
    """Parse one style-mapping line into a result object.

    A line that cannot be tokenised or parsed is not an error: it yields a
    result with value None carrying a warning, so the caller can continue
    with the remaining mappings.
    """
    try:
        token_iterator = TokenIterator(tokenise(string))
        return results.success(parse_style_mapping(token_iterator))
    except LineParseError:
        warning = "Did not understand this style mapping, so ignored it: " + string
        return results.Result(None, [results.warning(warning)])

View File

@@ -0,0 +1,130 @@
from ... import documents, document_matchers
from .errors import LineParseError
from .tokeniser import TokenType
from .token_parser import try_parse_class_name, parse_string
def parse_document_matcher(tokens):
    """Parse the document-element half of a style mapping.

    Consumes the tokens for exactly one matcher -- a paragraph, run or
    table (each with optional class name and style-name constraint), a
    simple formatting flag, a highlight, a comment reference, or a break --
    and returns the corresponding document matcher.

    Raises LineParseError if the next identifier is not recognised.
    """
    if tokens.try_skip(TokenType.IDENTIFIER, "p"):
        # Keyword values are evaluated left to right, matching the token order.
        return document_matchers.paragraph(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
            numbering=_parse_numbering(tokens),
        )
    elif tokens.try_skip(TokenType.IDENTIFIER, "r"):
        return document_matchers.run(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
        )
    elif tokens.try_skip(TokenType.IDENTIFIER, "table"):
        return document_matchers.table(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
        )
    
    # Formatting flags take no further tokens.
    flag_matchers = (
        ("b", document_matchers.bold),
        ("i", document_matchers.italic),
        ("u", document_matchers.underline),
        ("strike", document_matchers.strikethrough),
        ("all-caps", document_matchers.all_caps),
        ("small-caps", document_matchers.small_caps),
    )
    for identifier, matcher in flag_matchers:
        if tokens.try_skip(TokenType.IDENTIFIER, identifier):
            return matcher
    
    if tokens.try_skip(TokenType.IDENTIFIER, "highlight"):
        return _parse_highlight(tokens)
    elif tokens.try_skip(TokenType.IDENTIFIER, "comment-reference"):
        return document_matchers.comment_reference
    elif tokens.try_skip(TokenType.IDENTIFIER, "br"):
        return _parse_break(tokens)
    else:
        raise LineParseError("Unrecognised document element: {0}".format(tokens.next_value(TokenType.IDENTIFIER)))
def _parse_style_name(tokens):
    """Parse an optional `[style-name=...]` constraint; None when absent."""
    if not tokens.try_skip(TokenType.SYMBOL, "["):
        return None
    tokens.skip(TokenType.IDENTIFIER, "style-name")
    matcher = _parse_string_matcher(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    return matcher
def _parse_string_matcher(tokens):
    """Parse `=value` (exact match) or `^=value` (prefix match)."""
    if tokens.try_skip(TokenType.SYMBOL, "="):
        return document_matchers.equal_to(parse_string(tokens))
    if tokens.try_skip(TokenType.SYMBOL, "^="):
        return document_matchers.starts_with(parse_string(tokens))
    raise LineParseError("Unrecognised string matcher: {0}".format(tokens.next_value()))
def _parse_numbering(tokens):
    """Parse an optional `:ordered-list(n)` / `:unordered-list(n)` suffix.

    Returns None when the paragraph matcher has no numbering constraint.
    Levels are written 1-based in mappings but stored 0-based.
    """
    if not tokens.try_skip(TokenType.SYMBOL, ":"):
        return None
    is_ordered = _parse_list_type(tokens)
    tokens.skip(TokenType.SYMBOL, "(")
    level = int(tokens.next_value(TokenType.INTEGER)) - 1
    tokens.skip(TokenType.SYMBOL, ")")
    return documents.numbering_level(level, is_ordered=is_ordered)
def _parse_list_type(tokens):
    """Return True for `ordered-list`, False for `unordered-list`."""
    list_type = tokens.next_value(TokenType.IDENTIFIER)
    try:
        return {"ordered-list": True, "unordered-list": False}[list_type]
    except KeyError:
        raise LineParseError("Unrecognised list type: {0}".format(list_type))
def _parse_highlight(tokens):
    """Parse `highlight`, optionally constrained as `[color='...']`."""
    color = None
    if tokens.try_skip(TokenType.SYMBOL, "["):
        tokens.skip(TokenType.IDENTIFIER, "color")
        tokens.skip(TokenType.SYMBOL, "=")
        color = parse_string(tokens)
        tokens.skip(TokenType.SYMBOL, "]")
    return document_matchers.highlight(color=color)
def _parse_break(tokens):
    """Parse `br[type='line'|'page'|'column']` into a break matcher."""
    tokens.skip(TokenType.SYMBOL, "[")
    tokens.skip(TokenType.IDENTIFIER, "type")
    tokens.skip(TokenType.SYMBOL, "=")
    type_name = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    
    break_matchers = {
        "line": document_matchers.line_break,
        "page": document_matchers.page_break,
        "column": document_matchers.column_break,
    }
    if type_name in break_matchers:
        return break_matchers[type_name]
    raise LineParseError("Unrecognised break type: {0}".format(type_name))

View File

@@ -0,0 +1,2 @@
class LineParseError(Exception):
    """Raised when a single style-mapping line cannot be parsed."""

View File

@@ -0,0 +1,120 @@
import cobble
from ... import html_paths
from .tokeniser import TokenType
from .token_parser import parse_identifier, parse_string
@cobble.data
class _AttributeOrClassName(object):
    # Intermediate parse result for either an [attr=value] attribute or a
    # .class-name annotation on an HTML path element.
    name = cobble.field()
    value = cobble.field()
    # True for class names, which accumulate into a space-separated "class"
    # attribute instead of overwriting earlier values.
    append = cobble.field()
def parse_html_path(tokens):
    """Parse the HTML half of a mapping; a leading `!` means ignore."""
    if tokens.try_skip(TokenType.SYMBOL, "!"):
        return html_paths.ignore
    return html_paths.path(_parse_html_path_elements(tokens))
def _parse_html_path_elements(tokens):
    """Parse zero or more HTML elements separated by ` > `."""
    elements = []
    if tokens.peek_token_type() == TokenType.IDENTIFIER:
        elements.append(_parse_element(tokens))
        # Each further element is introduced by whitespace, ">", whitespace.
        separator = ((TokenType.WHITESPACE, None), (TokenType.SYMBOL, ">"))
        while tokens.try_skip_many(separator):
            tokens.skip(TokenType.WHITESPACE)
            elements.append(_parse_element(tokens))
    return elements
def _parse_element(tokens):
    """Parse a single HTML path element.

    Reads the tag name alternatives, any [attr=value] / .class-name
    annotations, and the optional `:fresh` and `:separator(...)` suffixes,
    then folds the annotations into a single attribute dict: class names
    append (space-separated), plain attributes overwrite.
    """
    tag_names = _parse_tag_names(tokens)
    attributes_list = _parse_attribute_or_class_names(tokens)
    is_fresh = _parse_is_fresh(tokens)
    separator = _parse_separator(tokens)
    
    attributes = {}
    for attribute in attributes_list:
        # Membership test (not truthiness): an existing empty-string value
        # should still be appended to rather than silently overwritten.
        if attribute.append and attribute.name in attributes:
            attributes[attribute.name] += " " + attribute.value
        else:
            attributes[attribute.name] = attribute.value
    
    return html_paths.element(
        tag_names,
        attributes=attributes,
        fresh=is_fresh,
        separator=separator,
    )
def _parse_tag_names(tokens):
    """Parse `tag` or `tag1|tag2|...` alternatives into a list."""
    names = [parse_identifier(tokens)]
    while True:
        if not tokens.try_skip(TokenType.SYMBOL, "|"):
            return names
        names.append(parse_identifier(tokens))
def _parse_attribute_or_class_names(tokens):
    """Collect consecutive [attr=value] / .class-name annotations."""
    parsed = []
    next_item = _try_parse_attribute_or_class_name(tokens)
    while next_item is not None:
        parsed.append(next_item)
        next_item = _try_parse_attribute_or_class_name(tokens)
    return parsed
def _try_parse_attribute_or_class_name(tokens):
    """Parse one annotation, or return None if the next token starts neither."""
    if tokens.is_next(TokenType.SYMBOL, "["):
        return _parse_attribute(tokens)
    elif tokens.is_next(TokenType.SYMBOL, "."):
        return _parse_class_name(tokens)
    else:
        return None
def _parse_attribute(tokens):
    """Parse `[name=value]`; plain attributes replace rather than append."""
    tokens.skip(TokenType.SYMBOL, "[")
    attribute_name = parse_identifier(tokens)
    tokens.skip(TokenType.SYMBOL, "=")
    attribute_value = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    return _AttributeOrClassName(name=attribute_name, value=attribute_value, append=False)
def _parse_class_name(tokens):
    """Parse `.name` as an appending entry for the "class" attribute."""
    tokens.skip(TokenType.SYMBOL, ".")
    return _AttributeOrClassName(name="class", value=parse_identifier(tokens), append=True)
def _parse_is_fresh(tokens):
    """Return True when the element carries a `:fresh` flag."""
    flag_tokens = ((TokenType.SYMBOL, ":"), (TokenType.IDENTIFIER, "fresh"))
    return tokens.try_skip_many(flag_tokens)
def _parse_separator(tokens):
    """Parse an optional `:separator('...')` suffix; None when absent."""
    has_separator = tokens.try_skip_many((
        (TokenType.SYMBOL, ":"),
        (TokenType.IDENTIFIER, "separator"),
    ))
    if not has_separator:
        return None
    tokens.skip(TokenType.SYMBOL, "(")
    separator = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, ")")
    return separator

View File

@@ -0,0 +1,15 @@
from .tokeniser import TokenType
from .document_matcher_parser import parse_document_matcher
from .html_path_parser import parse_html_path
from ...styles import Style
def parse_style_mapping(tokens):
    """Parse a complete `matcher => html-path` line into a Style.

    Whitespace is required before `=>` and optional after it; the line
    must be fully consumed (END token) for the mapping to be accepted.
    """
    document_matcher = parse_document_matcher(tokens)
    tokens.skip(TokenType.WHITESPACE)
    tokens.skip(TokenType.SYMBOL, "=>")
    tokens.try_skip(TokenType.WHITESPACE)
    html_path = parse_html_path(tokens)
    tokens.skip(TokenType.END)
    return Style(document_matcher=document_matcher, html_path=html_path)

View File

@@ -0,0 +1,59 @@
# TODO: check indices
# TODO: proper tests for unexpected tokens
from .errors import LineParseError
class TokenIterator(object):
    """Cursor over a token list with peek, skip and backtracking helpers.

    Failed expectations raise LineParseError; the `try_*` methods instead
    return False and leave the cursor where it was.
    """
    
    def __init__(self, tokens):
        self._tokens = tokens
        self._index = 0
    
    def peek_token_type(self):
        """Return the type of the next token without consuming it."""
        return self._tokens[self._index].type
    
    def next_value(self, token_type=None):
        """Consume the next token and return its value.

        If token_type is given, raises LineParseError on a type mismatch.
        """
        return self._next(token_type).value
    
    def _next(self, token_type=None):
        token = self._tokens[self._index]
        if token_type is None or token.type == token_type:
            self._index += 1
            return token
        else:
            raise self._unexpected_token_type(token_type, token)
    
    def skip(self, token_type, token_value=None):
        """Consume one token matching type (and value, if given) or raise."""
        token = self._tokens[self._index]
        if token.type == token_type and (token_value is None or token.value == token_value):
            self._index += 1
            return True
        else:
            raise self._unexpected_token_type(token_type, token)
    
    def try_skip(self, token_type, token_value=None):
        """Consume one matching token; return False (no movement) otherwise."""
        if self.is_next(token_type, token_value):
            self._index += 1
            return True
        else:
            return False
    
    def try_skip_many(self, tokens):
        """Consume a whole sequence of (type, value) pairs atomically.

        On any mismatch the cursor is restored to its starting position
        and False is returned; value None matches any token value.
        """
        start = self._index
        for token_type, token_value in tokens:
            token = self._tokens[self._index]
            if not (token.type == token_type and (token_value is None or token.value == token_value)):
                self._index = start
                return False
            else:
                self._index += 1
        return True
    
    def is_next(self, token_type, token_value=None):
        """Test the next token without consuming it."""
        token = self._tokens[self._index]
        return token.type == token_type and (token_value is None or token.value == token_value)
    
    def _unexpected_token_type(self, token_type, token):
        # Return (rather than raise) the exception: callers are written as
        # `raise self._unexpected_token_type(...)`. Previously this method
        # raised itself, leaving the callers' raise statements dead code.
        return LineParseError()

View File

@@ -0,0 +1,37 @@
import re
from .tokeniser import TokenType
def try_parse_class_name(tokens):
    """Parse `.name` if present; return None otherwise."""
    if not tokens.try_skip(TokenType.SYMBOL, "."):
        return None
    return parse_identifier(tokens)
def parse_identifier(tokens):
    """Consume an identifier token and decode its escape sequences."""
    raw = tokens.next_value(TokenType.IDENTIFIER)
    return decode_escape_sequences(raw)
def parse_string(tokens):
    """Consume a string token, strip its quotes and decode escapes."""
    quoted = tokens.next_value(TokenType.STRING)
    return decode_escape_sequences(quoted[1:-1])
_ESCAPE_SEQUENCE_REGEX = re.compile(r"\\(.)")

# Recognised escape codes; any other escaped character decodes to itself
# (so e.g. \' yields a literal quote).
_ESCAPE_CODES = {
    "n": "\n",
    "r": "\r",
    "t": "\t",
}


def decode_escape_sequences(value):
    """Replace backslash escape sequences in *value* with their characters."""
    return _ESCAPE_SEQUENCE_REGEX.sub(_decode_escape_sequence, value)


def _decode_escape_sequence(match):
    code = match.group(1)
    return _ESCAPE_CODES.get(code, code)

View File

@@ -0,0 +1,61 @@
import collections
import re
# A single lexical token: the index in the input where it starts, its
# TokenType, and the matched text.
Token = collections.namedtuple("Token", ["character_index", "type", "value"])


class TokenType(object):
    # Token categories produced by the tokeniser. Values are plain strings
    # so they read well in debugging output and error messages.
    IDENTIFIER = "identifier"
    SYMBOL = "symbol"
    WHITESPACE = "whitespace"
    STRING = "string"
    # A string that opened with a quote but never closed before end of input.
    UNTERMINATED_STRING = "unterminated string"
    INTEGER = "integer"
    # Sentinel appended after the last real token.
    END = "end"
def regex_tokeniser(rules):
    """Build a `tokenise(value)` function from (token_type, regex) rules.

    Rules are tried in order at each position. A trailing catch-all
    "unknown" rule guarantees the tokeniser always makes progress, and an
    END token is appended after the input is exhausted.
    """
    compiled_rules = [(token_type, _to_regex(regex)) for token_type, regex in rules]
    compiled_rules.append(("unknown", re.compile(".")))
    
    def tokenise(value):
        tokens = []
        index = 0
        while index < len(value):
            for token_type, regex in compiled_rules:
                match = regex.match(value, index)
                if match is None:
                    continue
                tokens.append(Token(index, token_type, match.group(0)))
                index = match.end()
                break
            else:
                # Should be impossible
                raise Exception("Remaining: " + value[index:])
        tokens.append(Token(index, TokenType.END, ""))
        return tokens
    
    return tokenise
def _to_regex(value):
if hasattr(value, "match"):
return value
else:
return re.compile(value)
# Shared regex fragments: a string is single-quoted with backslash escaping
# any character; an identifier character is a letter, hyphen, underscore or
# any escaped character (digits are allowed after the first character).
_string_prefix = r"'(?:\\.|[^'])*"
_identifier_character = r"(?:[a-zA-Z\-_]|\\.)"

# Rule order matters: STRING (which requires the closing quote) must be
# tried before UNTERMINATED_STRING, which matches the same prefix.
tokenise = regex_tokeniser([
    (TokenType.IDENTIFIER, _identifier_character + "(?:" + _identifier_character + "|[0-9])*"),
    (TokenType.SYMBOL, r":|>|=>|\^=|=|\(|\)|\[|\]|\||!|\."),
    (TokenType.WHITESPACE, r"\s+"),
    (TokenType.STRING, _string_prefix + "'"),
    (TokenType.UNTERMINATED_STRING, _string_prefix),
    (TokenType.INTEGER, "([0-9]+)"),
])