Initial commit (Clean history)

This commit is contained in:
anhduy-tech
2025-12-30 11:27:14 +07:00
commit ef48c93de0
19255 changed files with 3248867 additions and 0 deletions

View File

@@ -0,0 +1,8 @@
import collections
# A style mapping pairs a matcher for document elements with the HTML path
# that matched elements should be rendered as.
Style = collections.namedtuple("Style", ["document_matcher", "html_path"])


def style(document_matcher, html_path):
    """Create a Style linking *document_matcher* to *html_path*."""
    return Style(document_matcher=document_matcher, html_path=html_path)

View File

@@ -0,0 +1,14 @@
from .errors import LineParseError
from .style_mapping_parser import parse_style_mapping
from .tokeniser import tokenise
from .token_iterator import TokenIterator
from ... import results
def read_style_mapping(string):
    """Parse one style-mapping line into a result object.

    A line that cannot be tokenised or parsed is not an error: it yields a
    result with value None carrying a warning, so the caller can continue
    with the remaining mappings.
    """
    try:
        token_iterator = TokenIterator(tokenise(string))
        return results.success(parse_style_mapping(token_iterator))
    except LineParseError:
        warning = "Did not understand this style mapping, so ignored it: " + string
        return results.Result(None, [results.warning(warning)])

View File

@@ -0,0 +1,130 @@
from ... import documents, document_matchers
from .errors import LineParseError
from .tokeniser import TokenType
from .token_parser import try_parse_class_name, parse_string
def parse_document_matcher(tokens):
    """Parse the document-element half of a style mapping.

    Consumes the tokens for exactly one matcher -- a paragraph, run or
    table (each with optional class name and style-name constraint), a
    simple formatting flag, a highlight, a comment reference, or a break --
    and returns the corresponding document matcher.

    Raises LineParseError if the next identifier is not recognised.
    """
    if tokens.try_skip(TokenType.IDENTIFIER, "p"):
        # Keyword values are evaluated left to right, matching the token order.
        return document_matchers.paragraph(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
            numbering=_parse_numbering(tokens),
        )
    elif tokens.try_skip(TokenType.IDENTIFIER, "r"):
        return document_matchers.run(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
        )
    elif tokens.try_skip(TokenType.IDENTIFIER, "table"):
        return document_matchers.table(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
        )
    
    # Formatting flags take no further tokens.
    flag_matchers = (
        ("b", document_matchers.bold),
        ("i", document_matchers.italic),
        ("u", document_matchers.underline),
        ("strike", document_matchers.strikethrough),
        ("all-caps", document_matchers.all_caps),
        ("small-caps", document_matchers.small_caps),
    )
    for identifier, matcher in flag_matchers:
        if tokens.try_skip(TokenType.IDENTIFIER, identifier):
            return matcher
    
    if tokens.try_skip(TokenType.IDENTIFIER, "highlight"):
        return _parse_highlight(tokens)
    elif tokens.try_skip(TokenType.IDENTIFIER, "comment-reference"):
        return document_matchers.comment_reference
    elif tokens.try_skip(TokenType.IDENTIFIER, "br"):
        return _parse_break(tokens)
    else:
        raise LineParseError("Unrecognised document element: {0}".format(tokens.next_value(TokenType.IDENTIFIER)))
def _parse_style_name(tokens):
    """Parse an optional `[style-name=...]` constraint; None when absent."""
    if not tokens.try_skip(TokenType.SYMBOL, "["):
        return None
    tokens.skip(TokenType.IDENTIFIER, "style-name")
    matcher = _parse_string_matcher(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    return matcher
def _parse_string_matcher(tokens):
    """Parse `=value` (exact match) or `^=value` (prefix match)."""
    if tokens.try_skip(TokenType.SYMBOL, "="):
        return document_matchers.equal_to(parse_string(tokens))
    if tokens.try_skip(TokenType.SYMBOL, "^="):
        return document_matchers.starts_with(parse_string(tokens))
    raise LineParseError("Unrecognised string matcher: {0}".format(tokens.next_value()))
def _parse_numbering(tokens):
    """Parse an optional `:ordered-list(n)` / `:unordered-list(n)` suffix.

    Returns None when the paragraph matcher has no numbering constraint.
    Levels are written 1-based in mappings but stored 0-based.
    """
    if not tokens.try_skip(TokenType.SYMBOL, ":"):
        return None
    is_ordered = _parse_list_type(tokens)
    tokens.skip(TokenType.SYMBOL, "(")
    level = int(tokens.next_value(TokenType.INTEGER)) - 1
    tokens.skip(TokenType.SYMBOL, ")")
    return documents.numbering_level(level, is_ordered=is_ordered)
def _parse_list_type(tokens):
    """Return True for `ordered-list`, False for `unordered-list`."""
    list_type = tokens.next_value(TokenType.IDENTIFIER)
    try:
        return {"ordered-list": True, "unordered-list": False}[list_type]
    except KeyError:
        raise LineParseError("Unrecognised list type: {0}".format(list_type))
def _parse_highlight(tokens):
    """Parse `highlight`, optionally constrained as `[color='...']`."""
    color = None
    if tokens.try_skip(TokenType.SYMBOL, "["):
        tokens.skip(TokenType.IDENTIFIER, "color")
        tokens.skip(TokenType.SYMBOL, "=")
        color = parse_string(tokens)
        tokens.skip(TokenType.SYMBOL, "]")
    return document_matchers.highlight(color=color)
def _parse_break(tokens):
    """Parse `br[type='line'|'page'|'column']` into a break matcher."""
    tokens.skip(TokenType.SYMBOL, "[")
    tokens.skip(TokenType.IDENTIFIER, "type")
    tokens.skip(TokenType.SYMBOL, "=")
    type_name = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    
    break_matchers = {
        "line": document_matchers.line_break,
        "page": document_matchers.page_break,
        "column": document_matchers.column_break,
    }
    if type_name in break_matchers:
        return break_matchers[type_name]
    raise LineParseError("Unrecognised break type: {0}".format(type_name))

View File

@@ -0,0 +1,2 @@
class LineParseError(Exception):
    """Raised when a single style-mapping line cannot be parsed."""

View File

@@ -0,0 +1,120 @@
import cobble
from ... import html_paths
from .tokeniser import TokenType
from .token_parser import parse_identifier, parse_string
@cobble.data
class _AttributeOrClassName(object):
    # Intermediate parse result for either an [attr=value] attribute or a
    # .class-name annotation on an HTML path element.
    name = cobble.field()
    value = cobble.field()
    # True for class names, which accumulate into a space-separated "class"
    # attribute instead of overwriting earlier values.
    append = cobble.field()
def parse_html_path(tokens):
    """Parse the HTML half of a mapping; a leading `!` means ignore."""
    if tokens.try_skip(TokenType.SYMBOL, "!"):
        return html_paths.ignore
    return html_paths.path(_parse_html_path_elements(tokens))
def _parse_html_path_elements(tokens):
    """Parse zero or more HTML elements separated by ` > `."""
    elements = []
    if tokens.peek_token_type() == TokenType.IDENTIFIER:
        elements.append(_parse_element(tokens))
        # Each further element is introduced by whitespace, ">", whitespace.
        separator = ((TokenType.WHITESPACE, None), (TokenType.SYMBOL, ">"))
        while tokens.try_skip_many(separator):
            tokens.skip(TokenType.WHITESPACE)
            elements.append(_parse_element(tokens))
    return elements
def _parse_element(tokens):
    """Parse a single HTML path element.

    Reads the tag name alternatives, any [attr=value] / .class-name
    annotations, and the optional `:fresh` and `:separator(...)` suffixes,
    then folds the annotations into a single attribute dict: class names
    append (space-separated), plain attributes overwrite.
    """
    tag_names = _parse_tag_names(tokens)
    attributes_list = _parse_attribute_or_class_names(tokens)
    is_fresh = _parse_is_fresh(tokens)
    separator = _parse_separator(tokens)
    
    attributes = {}
    for attribute in attributes_list:
        # Membership test (not truthiness): an existing empty-string value
        # should still be appended to rather than silently overwritten.
        if attribute.append and attribute.name in attributes:
            attributes[attribute.name] += " " + attribute.value
        else:
            attributes[attribute.name] = attribute.value
    
    return html_paths.element(
        tag_names,
        attributes=attributes,
        fresh=is_fresh,
        separator=separator,
    )
def _parse_tag_names(tokens):
    """Parse `tag` or `tag1|tag2|...` alternatives into a list."""
    names = [parse_identifier(tokens)]
    while True:
        if not tokens.try_skip(TokenType.SYMBOL, "|"):
            return names
        names.append(parse_identifier(tokens))
def _parse_attribute_or_class_names(tokens):
    """Collect consecutive [attr=value] / .class-name annotations."""
    parsed = []
    next_item = _try_parse_attribute_or_class_name(tokens)
    while next_item is not None:
        parsed.append(next_item)
        next_item = _try_parse_attribute_or_class_name(tokens)
    return parsed
def _try_parse_attribute_or_class_name(tokens):
    """Parse one annotation, or return None if the next token starts neither."""
    if tokens.is_next(TokenType.SYMBOL, "["):
        return _parse_attribute(tokens)
    elif tokens.is_next(TokenType.SYMBOL, "."):
        return _parse_class_name(tokens)
    else:
        return None
def _parse_attribute(tokens):
    """Parse `[name=value]`; plain attributes replace rather than append."""
    tokens.skip(TokenType.SYMBOL, "[")
    attribute_name = parse_identifier(tokens)
    tokens.skip(TokenType.SYMBOL, "=")
    attribute_value = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    return _AttributeOrClassName(name=attribute_name, value=attribute_value, append=False)
def _parse_class_name(tokens):
    """Parse `.name` as an appending entry for the "class" attribute."""
    tokens.skip(TokenType.SYMBOL, ".")
    return _AttributeOrClassName(name="class", value=parse_identifier(tokens), append=True)
def _parse_is_fresh(tokens):
    """Return True when the element carries a `:fresh` flag."""
    flag_tokens = ((TokenType.SYMBOL, ":"), (TokenType.IDENTIFIER, "fresh"))
    return tokens.try_skip_many(flag_tokens)
def _parse_separator(tokens):
    """Parse an optional `:separator('...')` suffix; None when absent."""
    has_separator = tokens.try_skip_many((
        (TokenType.SYMBOL, ":"),
        (TokenType.IDENTIFIER, "separator"),
    ))
    if not has_separator:
        return None
    tokens.skip(TokenType.SYMBOL, "(")
    separator = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, ")")
    return separator

View File

@@ -0,0 +1,15 @@
from .tokeniser import TokenType
from .document_matcher_parser import parse_document_matcher
from .html_path_parser import parse_html_path
from ...styles import Style
def parse_style_mapping(tokens):
    """Parse a complete `matcher => html-path` line into a Style.

    Whitespace is required before `=>` and optional after it; the line
    must be fully consumed (END token) for the mapping to be accepted.
    """
    document_matcher = parse_document_matcher(tokens)
    tokens.skip(TokenType.WHITESPACE)
    tokens.skip(TokenType.SYMBOL, "=>")
    tokens.try_skip(TokenType.WHITESPACE)
    html_path = parse_html_path(tokens)
    tokens.skip(TokenType.END)
    return Style(document_matcher=document_matcher, html_path=html_path)

View File

@@ -0,0 +1,59 @@
# TODO: check indices
# TODO: proper tests for unexpected tokens
from .errors import LineParseError
class TokenIterator(object):
    """Cursor over a token list with peek, skip and backtracking helpers.

    Failed expectations raise LineParseError; the `try_*` methods instead
    return False and leave the cursor where it was.
    """
    
    def __init__(self, tokens):
        self._tokens = tokens
        self._index = 0
    
    def peek_token_type(self):
        """Return the type of the next token without consuming it."""
        return self._tokens[self._index].type
    
    def next_value(self, token_type=None):
        """Consume the next token and return its value.

        If token_type is given, raises LineParseError on a type mismatch.
        """
        return self._next(token_type).value
    
    def _next(self, token_type=None):
        token = self._tokens[self._index]
        if token_type is None or token.type == token_type:
            self._index += 1
            return token
        else:
            raise self._unexpected_token_type(token_type, token)
    
    def skip(self, token_type, token_value=None):
        """Consume one token matching type (and value, if given) or raise."""
        token = self._tokens[self._index]
        if token.type == token_type and (token_value is None or token.value == token_value):
            self._index += 1
            return True
        else:
            raise self._unexpected_token_type(token_type, token)
    
    def try_skip(self, token_type, token_value=None):
        """Consume one matching token; return False (no movement) otherwise."""
        if self.is_next(token_type, token_value):
            self._index += 1
            return True
        else:
            return False
    
    def try_skip_many(self, tokens):
        """Consume a whole sequence of (type, value) pairs atomically.

        On any mismatch the cursor is restored to its starting position
        and False is returned; value None matches any token value.
        """
        start = self._index
        for token_type, token_value in tokens:
            token = self._tokens[self._index]
            if not (token.type == token_type and (token_value is None or token.value == token_value)):
                self._index = start
                return False
            else:
                self._index += 1
        return True
    
    def is_next(self, token_type, token_value=None):
        """Test the next token without consuming it."""
        token = self._tokens[self._index]
        return token.type == token_type and (token_value is None or token.value == token_value)
    
    def _unexpected_token_type(self, token_type, token):
        # Return (rather than raise) the exception: callers are written as
        # `raise self._unexpected_token_type(...)`. Previously this method
        # raised itself, leaving the callers' raise statements dead code.
        return LineParseError()

View File

@@ -0,0 +1,37 @@
import re
from .tokeniser import TokenType
def try_parse_class_name(tokens):
    """Parse `.name` if present; return None otherwise."""
    if not tokens.try_skip(TokenType.SYMBOL, "."):
        return None
    return parse_identifier(tokens)
def parse_identifier(tokens):
    """Consume an identifier token and decode its escape sequences."""
    raw = tokens.next_value(TokenType.IDENTIFIER)
    return decode_escape_sequences(raw)
def parse_string(tokens):
    """Consume a string token, strip its quotes and decode escapes."""
    quoted = tokens.next_value(TokenType.STRING)
    return decode_escape_sequences(quoted[1:-1])
_ESCAPE_SEQUENCE_REGEX = re.compile(r"\\(.)")

# Recognised escape codes; any other escaped character decodes to itself
# (so e.g. \' yields a literal quote).
_ESCAPE_CODES = {
    "n": "\n",
    "r": "\r",
    "t": "\t",
}


def decode_escape_sequences(value):
    """Replace backslash escape sequences in *value* with their characters."""
    return _ESCAPE_SEQUENCE_REGEX.sub(_decode_escape_sequence, value)


def _decode_escape_sequence(match):
    code = match.group(1)
    return _ESCAPE_CODES.get(code, code)

View File

@@ -0,0 +1,61 @@
import collections
import re
# A single lexical token: the index in the input where it starts, its
# TokenType, and the matched text.
Token = collections.namedtuple("Token", ["character_index", "type", "value"])


class TokenType(object):
    # Token categories produced by the tokeniser. Values are plain strings
    # so they read well in debugging output and error messages.
    IDENTIFIER = "identifier"
    SYMBOL = "symbol"
    WHITESPACE = "whitespace"
    STRING = "string"
    # A string that opened with a quote but never closed before end of input.
    UNTERMINATED_STRING = "unterminated string"
    INTEGER = "integer"
    # Sentinel appended after the last real token.
    END = "end"
def regex_tokeniser(rules):
    """Build a `tokenise(value)` function from (token_type, regex) rules.

    Rules are tried in order at each position. A trailing catch-all
    "unknown" rule guarantees the tokeniser always makes progress, and an
    END token is appended after the input is exhausted.
    """
    compiled_rules = [(token_type, _to_regex(regex)) for token_type, regex in rules]
    compiled_rules.append(("unknown", re.compile(".")))
    
    def tokenise(value):
        tokens = []
        index = 0
        while index < len(value):
            for token_type, regex in compiled_rules:
                match = regex.match(value, index)
                if match is None:
                    continue
                tokens.append(Token(index, token_type, match.group(0)))
                index = match.end()
                break
            else:
                # Should be impossible
                raise Exception("Remaining: " + value[index:])
        tokens.append(Token(index, TokenType.END, ""))
        return tokens
    
    return tokenise
def _to_regex(value):
if hasattr(value, "match"):
return value
else:
return re.compile(value)
# Shared regex fragments: a string is single-quoted with backslash escaping
# any character; an identifier character is a letter, hyphen, underscore or
# any escaped character (digits are allowed after the first character).
_string_prefix = r"'(?:\\.|[^'])*"
_identifier_character = r"(?:[a-zA-Z\-_]|\\.)"

# Rule order matters: STRING (which requires the closing quote) must be
# tried before UNTERMINATED_STRING, which matches the same prefix.
tokenise = regex_tokeniser([
    (TokenType.IDENTIFIER, _identifier_character + "(?:" + _identifier_character + "|[0-9])*"),
    (TokenType.SYMBOL, r":|>|=>|\^=|=|\(|\)|\[|\]|\||!|\."),
    (TokenType.WHITESPACE, r"\s+"),
    (TokenType.STRING, _string_prefix + "'"),
    (TokenType.UNTERMINATED_STRING, _string_prefix),
    (TokenType.INTEGER, "([0-9]+)"),
])