Initial commit (Clean history)

This commit is contained in:
anhduy-tech
2025-12-30 11:27:14 +07:00
commit ef48c93de0
19255 changed files with 3248867 additions and 0 deletions

View File

@@ -0,0 +1,331 @@
# pyright: reportAssignmentType=false
"""Custom element classes related to run properties (font)."""
from __future__ import annotations
from typing import TYPE_CHECKING, Callable
from docx.enum.dml import MSO_THEME_COLOR
from docx.enum.text import WD_COLOR_INDEX, WD_UNDERLINE
from docx.oxml.ns import nsdecls
from docx.oxml.parser import parse_xml
from docx.oxml.simpletypes import (
ST_HexColor,
ST_HpsMeasure,
ST_String,
ST_VerticalAlignRun,
)
from docx.oxml.xmlchemy import (
BaseOxmlElement,
OptionalAttribute,
RequiredAttribute,
ZeroOrOne,
)
from docx.shared import RGBColor
if TYPE_CHECKING:
from docx.oxml.shared import CT_OnOff, CT_String
from docx.shared import Length
class CT_Color(BaseOxmlElement):
    """`<w:color>` element, giving the color of a font and perhaps other objects."""

    # -- required `w:val` attribute, converted through the `ST_HexColor` simple-type --
    val: RGBColor | str = RequiredAttribute("w:val", ST_HexColor)
    # -- optional `w:themeColor` attribute, converted through `MSO_THEME_COLOR` --
    themeColor: MSO_THEME_COLOR | None = OptionalAttribute(
        "w:themeColor", MSO_THEME_COLOR
    )
class CT_Fonts(BaseOxmlElement):
    """`<w:rFonts>` element.

    Holds the typeface name used for the various language types. Only the `w:ascii`
    and `w:hAnsi` attributes are exposed by this class.
    """

    # -- both attributes are optional and converted through `ST_String` --
    ascii: str | None = OptionalAttribute("w:ascii", ST_String)
    hAnsi: str | None = OptionalAttribute("w:hAnsi", ST_String)
class CT_Highlight(BaseOxmlElement):
    """`<w:highlight>` element, giving the highlighting/background color of a font."""

    # -- required `w:val` attribute, expressed as a `WD_COLOR_INDEX` member --
    val: WD_COLOR_INDEX = RequiredAttribute("w:val", WD_COLOR_INDEX)
class CT_HpsMeasure(BaseOxmlElement):
    """Element type for `<w:sz>` and similar elements; a font size in half-points."""

    # -- required `w:val` attribute, converted through the `ST_HpsMeasure` simple-type --
    val: Length = RequiredAttribute("w:val", ST_HpsMeasure)
class CT_RPr(BaseOxmlElement):
    """`<w:rPr>` element, containing the properties for a run.

    Child elements are declared with `ZeroOrOne`. The value-style properties defined
    below (`highlight_val`, `rFonts_ascii`, `style`, `sz_val`, ...) wrap the recurring
    "read an attribute of an optional child / add-or-remove that child" pattern so a
    caller can work with plain values instead of child elements.
    """

    # -- Declared for the type-checker only; no value is assigned here. These helpers
    # -- are presumably supplied at class-creation time from the child-element
    # -- declarations further down (see `_new_color`, which overrides one of them).
    get_or_add_color: Callable[[], CT_Color]
    get_or_add_highlight: Callable[[], CT_Highlight]
    get_or_add_rFonts: Callable[[], CT_Fonts]
    get_or_add_sz: Callable[[], CT_HpsMeasure]
    get_or_add_vertAlign: Callable[[], CT_VerticalAlignRun]
    _add_rStyle: Callable[..., CT_String]
    _add_u: Callable[[], CT_Underline]
    _remove_color: Callable[[], None]
    _remove_highlight: Callable[[], None]
    _remove_rFonts: Callable[[], None]
    _remove_rStyle: Callable[[], None]
    _remove_sz: Callable[[], None]
    _remove_u: Callable[[], None]
    _remove_vertAlign: Callable[[], None]

    # -- Child tags in the order used to position them. Each declaration below passes
    # -- the slice that starts just after its own tag as its `successors`.
    _tag_seq = (
        "w:rStyle",
        "w:rFonts",
        "w:b",
        "w:bCs",
        "w:i",
        "w:iCs",
        "w:caps",
        "w:smallCaps",
        "w:strike",
        "w:dstrike",
        "w:outline",
        "w:shadow",
        "w:emboss",
        "w:imprint",
        "w:noProof",
        "w:snapToGrid",
        "w:vanish",
        "w:webHidden",
        "w:color",
        "w:spacing",
        "w:w",
        "w:kern",
        "w:position",
        "w:sz",
        "w:szCs",
        "w:highlight",
        "w:u",
        "w:effect",
        "w:bdr",
        "w:shd",
        "w:fitText",
        "w:vertAlign",
        "w:rtl",
        "w:cs",
        "w:em",
        "w:lang",
        "w:eastAsianLayout",
        "w:specVanish",
        "w:oMath",
    )
    rStyle: CT_String | None = ZeroOrOne("w:rStyle", successors=_tag_seq[1:])
    rFonts: CT_Fonts | None = ZeroOrOne("w:rFonts", successors=_tag_seq[2:])
    b: CT_OnOff | None = ZeroOrOne("w:b", successors=_tag_seq[3:])
    bCs = ZeroOrOne("w:bCs", successors=_tag_seq[4:])
    i = ZeroOrOne("w:i", successors=_tag_seq[5:])
    iCs = ZeroOrOne("w:iCs", successors=_tag_seq[6:])
    caps = ZeroOrOne("w:caps", successors=_tag_seq[7:])
    smallCaps = ZeroOrOne("w:smallCaps", successors=_tag_seq[8:])
    strike = ZeroOrOne("w:strike", successors=_tag_seq[9:])
    dstrike = ZeroOrOne("w:dstrike", successors=_tag_seq[10:])
    outline = ZeroOrOne("w:outline", successors=_tag_seq[11:])
    shadow = ZeroOrOne("w:shadow", successors=_tag_seq[12:])
    emboss = ZeroOrOne("w:emboss", successors=_tag_seq[13:])
    imprint = ZeroOrOne("w:imprint", successors=_tag_seq[14:])
    noProof = ZeroOrOne("w:noProof", successors=_tag_seq[15:])
    snapToGrid = ZeroOrOne("w:snapToGrid", successors=_tag_seq[16:])
    vanish = ZeroOrOne("w:vanish", successors=_tag_seq[17:])
    webHidden = ZeroOrOne("w:webHidden", successors=_tag_seq[18:])
    color: CT_Color | None = ZeroOrOne("w:color", successors=_tag_seq[19:])
    sz: CT_HpsMeasure | None = ZeroOrOne("w:sz", successors=_tag_seq[24:])
    highlight: CT_Highlight | None = ZeroOrOne("w:highlight", successors=_tag_seq[26:])
    u: CT_Underline | None = ZeroOrOne("w:u", successors=_tag_seq[27:])
    vertAlign: CT_VerticalAlignRun | None = ZeroOrOne("w:vertAlign", successors=_tag_seq[32:])
    rtl = ZeroOrOne("w:rtl", successors=_tag_seq[33:])
    cs = ZeroOrOne("w:cs", successors=_tag_seq[34:])
    specVanish = ZeroOrOne("w:specVanish", successors=_tag_seq[38:])
    oMath = ZeroOrOne("w:oMath", successors=_tag_seq[39:])
    # -- only needed while the class body executes; don't leave it as a class attribute --
    del _tag_seq

    def _new_color(self):
        """Override metaclass method to set `w:color/@val` to RGB black on create."""
        return parse_xml(f'<w:color {nsdecls("w")} w:val="000000"/>')

    @property
    def highlight_val(self) -> WD_COLOR_INDEX | None:
        """Value of `./w:highlight/@val`.

        This is the font's highlight color, or `None` when there is no `w:highlight`
        child (the text is not highlighted).
        """
        highlight = self.highlight
        return None if highlight is None else highlight.val

    @highlight_val.setter
    def highlight_val(self, value: WD_COLOR_INDEX | None) -> None:
        # -- `None` removes the child entirely rather than writing an attribute --
        if value is None:
            self._remove_highlight()
        else:
            self.get_or_add_highlight().val = value

    @property
    def rFonts_ascii(self) -> str | None:
        """The value of `w:rFonts/@w:ascii` or |None| if `w:rFonts` is not present.

        Represents the assigned typeface name. The `w:rFonts` element also carries
        other special-case typeface names; this accessor covers the case where only
        the common name is required.
        """
        rFonts = self.rFonts
        return None if rFonts is None else rFonts.ascii

    @rFonts_ascii.setter
    def rFonts_ascii(self, value: str | None) -> None:
        # -- assigning `None` removes the whole `w:rFonts` child, not just `w:ascii` --
        if value is None:
            self._remove_rFonts()
        else:
            self.get_or_add_rFonts().ascii = value

    @property
    def rFonts_hAnsi(self) -> str | None:
        """The value of `w:rFonts/@w:hAnsi` or |None| if `w:rFonts` is not present."""
        rFonts = self.rFonts
        return None if rFonts is None else rFonts.hAnsi

    @rFonts_hAnsi.setter
    def rFonts_hAnsi(self, value: str | None):
        # -- unlike `rFonts_ascii`, `None` never removes `w:rFonts`; it is only a no-op
        # -- when there is no `w:rFonts` child to clear the attribute on
        if value is None and self.rFonts is None:
            return
        self.get_or_add_rFonts().hAnsi = value

    @property
    def style(self) -> str | None:
        """String in `./w:rStyle/@val`, or None if `w:rStyle` is not present."""
        rStyle = self.rStyle
        return None if rStyle is None else rStyle.val

    @style.setter
    def style(self, style: str | None) -> None:
        """Set `./w:rStyle/@val` to `style`, adding the `w:rStyle` element if necessary.

        If `style` is |None|, remove the `w:rStyle` element if present.
        """
        if style is None:
            self._remove_rStyle()
            return
        rStyle = self.rStyle
        if rStyle is None:
            # -- new child is created with its `val` already set --
            self._add_rStyle(val=style)
        else:
            rStyle.val = style

    @property
    def subscript(self) -> bool | None:
        """|True| if `./w:vertAlign/@w:val` is "subscript".

        |False| if `w:vertAlign/@w:val` contains any other value. |None| if
        `w:vertAlign` is not present.
        """
        vertAlign = self.vertAlign
        if vertAlign is None:
            return None
        return vertAlign.val == ST_VerticalAlignRun.SUBSCRIPT

    @subscript.setter
    def subscript(self, value: bool | None) -> None:
        if value is None:
            self._remove_vertAlign()
            return
        if value:
            self.get_or_add_vertAlign().val = ST_VerticalAlignRun.SUBSCRIPT
            return
        # -- falsy value: only clear `w:vertAlign` when it currently says subscript, so
        # -- an existing superscript setting is left untouched
        vertAlign = self.vertAlign
        if vertAlign is not None and vertAlign.val == ST_VerticalAlignRun.SUBSCRIPT:
            self._remove_vertAlign()

    @property
    def superscript(self) -> bool | None:
        """|True| if `w:vertAlign/@w:val` is 'superscript'.

        |False| if `w:vertAlign/@w:val` contains any other value. |None| if
        `w:vertAlign` is not present.
        """
        vertAlign = self.vertAlign
        if vertAlign is None:
            return None
        return vertAlign.val == ST_VerticalAlignRun.SUPERSCRIPT

    @superscript.setter
    def superscript(self, value: bool | None):
        if value is None:
            self._remove_vertAlign()
            return
        if value:
            self.get_or_add_vertAlign().val = ST_VerticalAlignRun.SUPERSCRIPT
            return
        # -- falsy value: only clear `w:vertAlign` when it currently says superscript,
        # -- so an existing subscript setting is left untouched
        vertAlign = self.vertAlign
        if vertAlign is not None and vertAlign.val == ST_VerticalAlignRun.SUPERSCRIPT:
            self._remove_vertAlign()

    @property
    def sz_val(self) -> Length | None:
        """The value of `w:sz/@w:val` or |None| if `w:sz` is not present."""
        sz = self.sz
        return None if sz is None else sz.val

    @sz_val.setter
    def sz_val(self, value: Length | None):
        if value is None:
            self._remove_sz()
        else:
            self.get_or_add_sz().val = value

    @property
    def u_val(self) -> WD_UNDERLINE | None:
        """Value of `w:u/@w:val`, or None if the `w:u` child is not present.

        The attribute value is returned as read from the child element; this accessor
        itself performs no translation of `WD_UNDERLINE.SINGLE`/`WD_UNDERLINE.NONE` to
        `True`/`False`.
        """
        u = self.u
        return None if u is None else u.val

    @u_val.setter
    def u_val(self, value: WD_UNDERLINE | None):
        # -- always start from a clean slate, then add a fresh `w:u` only if needed --
        self._remove_u()
        if value is not None:
            self._add_u().val = value

    def _get_bool_val(self, name: str) -> bool | None:
        """Value of the boolean child stored in attribute `name`, or None if absent.

        `name` is the Python attribute name of the child on this class (it is looked
        up with `getattr`), e.g. "b", "i" or "smallCaps".
        """
        element = getattr(self, name)
        return None if element is None else element.val

    def _set_bool_val(self, name: str, value: bool | None):
        """Set the boolean child stored in attribute `name` to `value`.

        `None` removes the child via its `_remove_{name}` method; any other value is
        assigned to the `val` of the child returned by `get_or_add_{name}`.
        """
        if value is None:
            getattr(self, f"_remove_{name}")()
            return
        element = getattr(self, f"get_or_add_{name}")()
        element.val = value
class CT_Underline(BaseOxmlElement):
    """`<w:u>` element, giving the underlining style for a run."""

    # -- optional `w:val` attribute, expressed as a `WD_UNDERLINE` member --
    val: WD_UNDERLINE | None = OptionalAttribute("w:val", WD_UNDERLINE)
class CT_VerticalAlignRun(BaseOxmlElement):
    """`<w:vertAlign>` element, selecting subscript or superscript for a run."""

    # -- required `w:val` attribute, converted through `ST_VerticalAlignRun` --
    val: str = RequiredAttribute("w:val", ST_VerticalAlignRun)

View File

@@ -0,0 +1,45 @@
"""Custom element classes related to hyperlinks (CT_Hyperlink)."""
from __future__ import annotations
from typing import TYPE_CHECKING, List
from docx.oxml.simpletypes import ST_OnOff, ST_String, XsdString
from docx.oxml.text.run import CT_R
from docx.oxml.xmlchemy import (
BaseOxmlElement,
OptionalAttribute,
ZeroOrMore,
)
if TYPE_CHECKING:
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
class CT_Hyperlink(BaseOxmlElement):
    """`<w:hyperlink>` element, holding the text and address for a hyperlink."""

    # -- declared for the type-checker only; no value is assigned here --
    r_lst: List[CT_R]

    # -- optional attributes of the `w:hyperlink` element --
    rId: str | None = OptionalAttribute("r:id", XsdString)  # pyright: ignore[reportAssignmentType]
    anchor: str | None = OptionalAttribute(  # pyright: ignore[reportAssignmentType]
        "w:anchor", ST_String
    )
    # -- `w:history` is declared with a default of `True` --
    history: bool = OptionalAttribute(  # pyright: ignore[reportAssignmentType]
        "w:history", ST_OnOff, default=True
    )

    r = ZeroOrMore("w:r")

    @property
    def lastRenderedPageBreaks(self) -> List[CT_LastRenderedPageBreak]:
        """Every `w:lastRenderedPageBreak` found in a `w:r` child of this hyperlink."""
        return self.xpath("./w:r/w:lastRenderedPageBreak")

    @property
    def text(self) -> str:  # pyright: ignore[reportIncompatibleMethodOverride]
        """The textual content of this hyperlink.

        The hyperlink text lives in one or more `w:r` children; the `text` of each is
        concatenated in the order XPath returns them.
        """
        run_texts = [run.text for run in self.xpath("w:r")]
        return "".join(run_texts)

View File

@@ -0,0 +1,278 @@
"""Custom element class for rendered page-break (CT_LastRenderedPageBreak)."""
from __future__ import annotations
import copy
from typing import TYPE_CHECKING
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.shared import lazyproperty
if TYPE_CHECKING:
from docx.oxml.text.hyperlink import CT_Hyperlink
from docx.oxml.text.paragraph import CT_P
class CT_LastRenderedPageBreak(BaseOxmlElement):
    """`<w:lastRenderedPageBreak>` element, indicating page break inserted by renderer.

    A rendered page-break is one inserted by the renderer when it runs out of room on a
    page. It is an empty element (no attrs or children) and is a child of CT_R, peer to
    CT_Text.

    NOTE: this complex-type name does not exist in the schema, where
    `w:lastRenderedPageBreak` maps to `CT_Empty`. This name was added to give it
    distinguished behavior. CT_Empty is used for many elements.
    """

    @property
    def following_fragment_p(self) -> CT_P:
        """A "loose" `CT_P` containing only the paragraph content after this break.

        Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
        page-break in its paragraph.

        The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
        page-break with this `w:lastRenderedPageBreak` element and all content preceding
        it removed.

        NOTE: this `w:p` can itself contain one or more `w:lastRenderedPageBreak`
        elements (when the paragraph contained more than one). While this is rare, the
        caller should treat this paragraph the same as other paragraphs and split it if
        necessary in a following step or recursion.
        """
        first_lrpb = self._first_lrpb_in_p(self._enclosing_p)
        if not self == first_lrpb:
            raise ValueError("only defined on first rendered page-break in paragraph")

        # -- splitting approach is different when break is inside a hyperlink --
        if self._is_in_hyperlink:
            return self._following_frag_in_hlink
        return self._following_frag_in_run

    @property
    def follows_all_content(self) -> bool:
        """True when this page-break element is the last "content" in the paragraph.

        This is a very uncommon case and may only occur in contrived cases or where the
        XML is edited by hand, but it is not precluded by the spec.
        """
        # -- a page-break inside a hyperlink never meets these criteria (for our
        # -- purposes at least) because it is considered "atomic" and always associated
        # -- with the page it starts on.
        if self._is_in_hyperlink:
            return False

        # -- NOTE(review): the expression is evaluated from the enclosing paragraph and
        # -- does not single out this particular element; it is truthy when any
        # -- page-break in the last run has no content after it.
        matches = self._enclosing_p.xpath(
            # -- in last run of paragraph --
            "(./w:r)[last()]"
            # -- all page-breaks --
            "/w:lastRenderedPageBreak"
            # -- that are not followed by any content-bearing elements --
            f"[not(following-sibling::*[{self._run_inner_content_xpath}])]"
        )
        return bool(matches)

    @property
    def precedes_all_content(self) -> bool:
        """True when a `w:lastRenderedPageBreak` precedes all paragraph content.

        This is a common case; it occurs whenever the page breaks on an even paragraph
        boundary.
        """
        # -- a page-break inside a hyperlink never meets these criteria because there
        # -- is always part of the hyperlink text before the page-break.
        if self._is_in_hyperlink:
            return False

        # -- NOTE(review): as with `follows_all_content`, the expression is evaluated
        # -- from the enclosing paragraph rather than from this element.
        matches = self._enclosing_p.xpath(
            # -- in first run of paragraph --
            "./w:r[1]"
            # -- all page-breaks --
            "/w:lastRenderedPageBreak"
            # -- that are not preceded by any content-bearing elements --
            f"[not(preceding-sibling::*[{self._run_inner_content_xpath}])]"
        )
        return bool(matches)

    @property
    def preceding_fragment_p(self) -> CT_P:
        """A "loose" `CT_P` containing only the paragraph content before this break.

        Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
        page-break in its paragraph.

        The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
        page-break with this `w:lastRenderedPageBreak` element and all content
        following it removed. When the break is inside a hyperlink, the whole hyperlink
        is kept in this fragment.
        """
        first_lrpb = self._first_lrpb_in_p(self._enclosing_p)
        if not self == first_lrpb:
            raise ValueError("only defined on first rendered page-break in paragraph")

        # -- splitting approach is different when break is inside a hyperlink --
        if self._is_in_hyperlink:
            return self._preceding_frag_in_hlink
        return self._preceding_frag_in_run

    def _enclosing_hyperlink(self, lrpb: CT_LastRenderedPageBreak) -> CT_Hyperlink:
        """The `w:hyperlink` grandparent of the `lrpb` page-break.

        Note the lookup is made on the `lrpb` argument, not on `self`. Raises
        `IndexError` when `lrpb` has no `w:hyperlink` grandparent (e.g. its run is a
        direct child of `w:p`), so only call when the break is inside a hyperlink.
        """
        return lrpb.xpath("./parent::w:r/parent::w:hyperlink")[0]

    @property
    def _enclosing_p(self) -> CT_P:
        """The nearest `w:p` ancestor of this `w:lastRenderedPageBreak`."""
        return self.xpath("./ancestor::w:p[1]")[0]

    def _first_lrpb_in_p(self, p: CT_P) -> CT_LastRenderedPageBreak:
        """The first `w:lastRenderedPageBreak` element in `p`.

        Both page-breaks in a run directly under `p` and page-breaks in a run inside a
        hyperlink are considered. Raises `ValueError` if there are no rendered
        page-breaks in `p`.
        """
        lrpbs = p.xpath("./w:r/w:lastRenderedPageBreak | ./w:hyperlink/w:r/w:lastRenderedPageBreak")
        if not lrpbs:
            raise ValueError("no rendered page-breaks in paragraph element")
        return lrpbs[0]

    @lazyproperty
    def _following_frag_in_hlink(self) -> CT_P:
        """Following CT_P fragment when break occurs within a hyperlink.

        Note this is a *partial-function* and raises `ValueError` when this page-break
        is not inside a hyperlink.
        """
        if not self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
        hyperlink = lrpb._enclosing_hyperlink(lrpb)

        # -- delete all w:p inner-content preceding the hyperlink (but not w:pPr) --
        for e in hyperlink.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
            p.remove(e)

        # -- remove the whole hyperlink, it belongs to the preceding-fragment-p --
        hyperlink.getparent().remove(hyperlink)

        # -- that's it, return the remaining fragment of `w:p` clone --
        return p

    @lazyproperty
    def _following_frag_in_run(self) -> CT_P:
        """Following CT_P fragment when break does not occur in a hyperlink.

        Note this is a *partial-function* and raises `ValueError` when this page-break
        is inside a hyperlink.
        """
        if self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break not in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
        enclosing_r = lrpb.xpath("./parent::w:r")[0]

        # -- delete all w:p inner-content preceding that run (but not w:pPr) --
        for e in enclosing_r.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
            p.remove(e)

        # -- then remove all run inner-content preceding this lrpb in its run (but not
        # -- the `w:rPr`) and also remove the page-break itself
        for e in lrpb.xpath("./preceding-sibling::*[not(self::w:rPr)]"):
            enclosing_r.remove(e)
        enclosing_r.remove(lrpb)

        return p

    @lazyproperty
    def _is_in_hyperlink(self) -> bool:
        """True when this page-break is embedded in a hyperlink run."""
        return bool(self.xpath("./parent::w:r/parent::w:hyperlink"))

    @lazyproperty
    def _preceding_frag_in_hlink(self) -> CT_P:
        """Preceding CT_P fragment when break occurs within a hyperlink.

        Note this is a *partial-function* and raises `ValueError` when this page-break
        is not inside a hyperlink.
        """
        if not self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
        hyperlink = lrpb._enclosing_hyperlink(lrpb)

        # -- delete all w:p inner-content following the hyperlink --
        for e in hyperlink.xpath("./following-sibling::*"):
            p.remove(e)

        # -- remove this page-break from inside the hyperlink --
        lrpb.getparent().remove(lrpb)

        # -- that's it, the entire hyperlink goes into the preceding fragment so
        # -- the hyperlink is not "split".
        return p

    @lazyproperty
    def _preceding_frag_in_run(self) -> CT_P:
        """Preceding CT_P fragment when break does not occur in a hyperlink.

        Note this is a *partial-function* and raises `ValueError` when this page-break
        is inside a hyperlink.
        """
        if self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break not in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
        enclosing_r = lrpb.xpath("./parent::w:r")[0]

        # -- delete all `w:p` inner-content following that run --
        for e in enclosing_r.xpath("./following-sibling::*"):
            p.remove(e)

        # -- then delete all `w:r` inner-content following this lrpb in its run and
        # -- also remove the page-break itself
        for e in lrpb.xpath("./following-sibling::*"):
            enclosing_r.remove(e)
        enclosing_r.remove(lrpb)

        return p

    @lazyproperty
    def _run_inner_content_xpath(self) -> str:
        """XPath predicate fragment matching the run inner-content elements listed.

        Intended for use inside a `[...]` predicate on a sibling axis, as done in
        `follows_all_content` and `precedes_all_content`.
        """
        return (
            "self::w:br"
            " | self::w:cr"
            " | self::w:drawing"
            " | self::w:noBreakHyphen"
            " | self::w:ptab"
            " | self::w:t"
            " | self::w:tab"
        )

View File

@@ -0,0 +1,106 @@
# pyright: reportPrivateUsage=false
"""Custom element classes related to paragraphs (CT_P)."""
from __future__ import annotations
from typing import TYPE_CHECKING, Callable, List, cast
from docx.oxml.parser import OxmlElement
from docx.oxml.xmlchemy import BaseOxmlElement, ZeroOrMore, ZeroOrOne
if TYPE_CHECKING:
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.section import CT_SectPr
from docx.oxml.text.hyperlink import CT_Hyperlink
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
from docx.oxml.text.parfmt import CT_PPr
from docx.oxml.text.run import CT_R
class CT_P(BaseOxmlElement):
    """`<w:p>` element, holding the properties and text for a paragraph."""

    # -- declared for the type-checker only; no value is assigned here --
    add_r: Callable[[], CT_R]
    get_or_add_pPr: Callable[[], CT_PPr]
    hyperlink_lst: List[CT_Hyperlink]
    r_lst: List[CT_R]

    # -- child-element declarations --
    pPr: CT_PPr | None = ZeroOrOne("w:pPr")  # pyright: ignore[reportAssignmentType]
    hyperlink = ZeroOrMore("w:hyperlink")
    r = ZeroOrMore("w:r")

    def add_p_before(self) -> CT_P:
        """Create a new `<w:p>`, insert it immediately before this one and return it."""
        preceding_p = cast(CT_P, OxmlElement("w:p"))
        self.addprevious(preceding_p)
        return preceding_p

    @property
    def alignment(self) -> WD_PARAGRAPH_ALIGNMENT | None:
        """The `jc_val` of the `<w:pPr>` child, or |None| when there is no `<w:pPr>`."""
        properties = self.pPr
        return None if properties is None else properties.jc_val

    @alignment.setter
    def alignment(self, value: WD_PARAGRAPH_ALIGNMENT):
        # -- a `w:pPr` child is added when not already present --
        self.get_or_add_pPr().jc_val = value

    def clear_content(self):
        """Remove every child element other than `<w:pPr>`."""
        removable = self.xpath("./*[not(self::w:pPr)]")
        for element in removable:
            self.remove(element)

    @property
    def inner_content_elements(self) -> List[CT_R | CT_Hyperlink]:
        """The `w:r` and `w:hyperlink` children of this `w:p`, in document order."""
        return self.xpath("./w:r | ./w:hyperlink")

    @property
    def lastRenderedPageBreaks(self) -> List[CT_LastRenderedPageBreak]:
        """All `w:lastRenderedPageBreak` descendants of this paragraph.

        A rendered page-break can sit in a run directly under the paragraph or in a
        run inside a hyperlink; the XPath selects both.
        """
        return self.xpath(
            "./w:r/w:lastRenderedPageBreak | ./w:hyperlink/w:r/w:lastRenderedPageBreak"
        )

    def set_sectPr(self, sectPr: CT_SectPr):
        """Unconditionally replace or add `sectPr` as grandchild in correct sequence."""
        properties = self.get_or_add_pPr()
        # -- drop any existing `w:sectPr` first so the insert never duplicates it --
        properties._remove_sectPr()
        properties._insert_sectPr(sectPr)

    @property
    def style(self) -> str | None:
        """The `style` of the `<w:pPr>` child, or |None| when there is no `<w:pPr>`."""
        properties = self.pPr
        return None if properties is None else properties.style

    @style.setter
    def style(self, style: str | None):
        # -- a `w:pPr` child is added when not already present, even for `None` --
        self.get_or_add_pPr().style = style

    @property
    def text(self):  # pyright: ignore[reportIncompatibleMethodOverride]
        """The textual content of this paragraph.

        The `text` of each `w:r` and `w:hyperlink` child is concatenated in the order
        XPath returns them.
        """
        pieces = [element.text for element in self.xpath("w:r | w:hyperlink")]
        return "".join(pieces)

    def _insert_pPr(self, pPr: CT_PPr) -> CT_PPr:
        """Insert `pPr` as the first child of this paragraph and return it."""
        self.insert(0, pPr)
        return pPr

View File

@@ -0,0 +1,392 @@
"""Custom element classes related to paragraph properties (CT_PPr)."""
from __future__ import annotations
from typing import TYPE_CHECKING, Callable
from docx.enum.text import (
WD_ALIGN_PARAGRAPH,
WD_LINE_SPACING,
WD_TAB_ALIGNMENT,
WD_TAB_LEADER,
)
from docx.oxml.shared import CT_DecimalNumber
from docx.oxml.simpletypes import ST_SignedTwipsMeasure, ST_TwipsMeasure
from docx.oxml.xmlchemy import (
BaseOxmlElement,
OneOrMore,
OptionalAttribute,
RequiredAttribute,
ZeroOrOne,
)
from docx.shared import Length
if TYPE_CHECKING:
from docx.oxml.section import CT_SectPr
from docx.oxml.shared import CT_String
class CT_Ind(BaseOxmlElement):
    """``<w:ind>`` element, describing paragraph indentation."""

    # -- left/right use the signed twips simple-type --
    left: Length | None = OptionalAttribute(  # pyright: ignore[reportAssignmentType]
        "w:left", ST_SignedTwipsMeasure
    )
    right: Length | None = OptionalAttribute(  # pyright: ignore[reportAssignmentType]
        "w:right", ST_SignedTwipsMeasure
    )
    # -- firstLine/hanging use the (unsigned) twips simple-type --
    firstLine: Length | None = OptionalAttribute(  # pyright: ignore[reportAssignmentType]
        "w:firstLine", ST_TwipsMeasure
    )
    hanging: Length | None = OptionalAttribute(  # pyright: ignore[reportAssignmentType]
        "w:hanging", ST_TwipsMeasure
    )
class CT_Jc(BaseOxmlElement):
    """``<w:jc>`` element, giving the justification of a paragraph."""

    # -- required `w:val` attribute, expressed as a `WD_ALIGN_PARAGRAPH` member --
    val: WD_ALIGN_PARAGRAPH = RequiredAttribute(  # pyright: ignore[reportAssignmentType]
        "w:val", WD_ALIGN_PARAGRAPH
    )
class CT_PPr(BaseOxmlElement):
    """``<w:pPr>`` element, holding the properties for a paragraph.

    Child elements are declared with `ZeroOrOne`; the properties below expose selected
    attributes of those children as plain values and add or remove the child as
    required.
    """

    # -- declared for the type-checker only; no value is assigned here --
    get_or_add_ind: Callable[[], CT_Ind]
    get_or_add_pStyle: Callable[[], CT_String]
    get_or_add_sectPr: Callable[[], CT_SectPr]
    _insert_sectPr: Callable[[CT_SectPr], None]
    _remove_pStyle: Callable[[], None]
    _remove_sectPr: Callable[[], None]

    # -- Child tags in the order used to position them. Each declaration below passes
    # -- the slice that starts just after its own tag as its `successors`.
    _tag_seq = (
        "w:pStyle",
        "w:keepNext",
        "w:keepLines",
        "w:pageBreakBefore",
        "w:framePr",
        "w:widowControl",
        "w:numPr",
        "w:suppressLineNumbers",
        "w:pBdr",
        "w:shd",
        "w:tabs",
        "w:suppressAutoHyphens",
        "w:kinsoku",
        "w:wordWrap",
        "w:overflowPunct",
        "w:topLinePunct",
        "w:autoSpaceDE",
        "w:autoSpaceDN",
        "w:bidi",
        "w:adjustRightInd",
        "w:snapToGrid",
        "w:spacing",
        "w:ind",
        "w:contextualSpacing",
        "w:mirrorIndents",
        "w:suppressOverlap",
        "w:jc",
        "w:textDirection",
        "w:textAlignment",
        "w:textboxTightWrap",
        "w:outlineLvl",
        "w:divId",
        "w:cnfStyle",
        "w:rPr",
        "w:sectPr",
        "w:pPrChange",
    )
    pStyle: CT_String | None = ZeroOrOne(  # pyright: ignore[reportAssignmentType]
        "w:pStyle", successors=_tag_seq[1:]
    )
    keepNext = ZeroOrOne("w:keepNext", successors=_tag_seq[2:])
    keepLines = ZeroOrOne("w:keepLines", successors=_tag_seq[3:])
    pageBreakBefore = ZeroOrOne("w:pageBreakBefore", successors=_tag_seq[4:])
    widowControl = ZeroOrOne("w:widowControl", successors=_tag_seq[6:])
    numPr = ZeroOrOne("w:numPr", successors=_tag_seq[7:])
    tabs = ZeroOrOne("w:tabs", successors=_tag_seq[11:])
    spacing = ZeroOrOne("w:spacing", successors=_tag_seq[22:])
    ind: CT_Ind | None = ZeroOrOne(  # pyright: ignore[reportAssignmentType]
        "w:ind", successors=_tag_seq[23:]
    )
    jc = ZeroOrOne("w:jc", successors=_tag_seq[27:])
    outlineLvl: CT_DecimalNumber = ZeroOrOne(  # pyright: ignore[reportAssignmentType]
        "w:outlineLvl", successors=_tag_seq[31:]
    )
    sectPr = ZeroOrOne("w:sectPr", successors=_tag_seq[35:])
    # -- only needed while the class body executes; don't leave it as a class attribute --
    del _tag_seq

    @property
    def first_line_indent(self) -> Length | None:
        """A |Length| derived from `w:ind/@w:firstLine` and `w:ind/@w:hanging`.

        A `w:hanging` value takes precedence and is returned negated; otherwise the
        `w:firstLine` value is returned, which is |None| when that attribute is absent.
        Returns |None| if the `w:ind` child is not present.
        """
        ind = self.ind
        if ind is None:
            return None
        hanging = ind.hanging
        if hanging is not None:
            # -- a hanging indent is reported as a negative first-line indent --
            return Length(-hanging)
        return ind.firstLine

    @first_line_indent.setter
    def first_line_indent(self, value: Length | None):
        # -- nothing to clear and nothing to set --
        if self.ind is None and value is None:
            return
        ind = self.get_or_add_ind()
        # -- reset both so only one of the two attributes is present afterward --
        ind.firstLine = None
        ind.hanging = None
        if value is None:
            return
        if value < 0:
            # -- negative indent is stored as a positive `w:hanging` value --
            ind.hanging = -value
        else:
            ind.firstLine = value

    @property
    def ind_left(self) -> Length | None:
        """The value of `w:ind/@w:left` or |None| if `w:ind` is not present."""
        ind = self.ind
        return None if ind is None else ind.left

    @ind_left.setter
    def ind_left(self, value: Length | None):
        # -- don't add a `w:ind` child just to clear an attribute on it --
        if value is None and self.ind is None:
            return
        self.get_or_add_ind().left = value

    @property
    def ind_right(self) -> Length | None:
        """The value of `w:ind/@w:right` or |None| if `w:ind` is not present."""
        ind = self.ind
        return None if ind is None else ind.right

    @ind_right.setter
    def ind_right(self, value: Length | None):
        # -- don't add a `w:ind` child just to clear an attribute on it --
        if value is None and self.ind is None:
            return
        self.get_or_add_ind().right = value

    @property
    def jc_val(self) -> WD_ALIGN_PARAGRAPH | None:
        """The `val` of the `<w:jc>` child element or |None| if it is not present."""
        jc = self.jc
        return None if jc is None else jc.val

    @jc_val.setter
    def jc_val(self, value):
        # -- `None` removes the child entirely --
        if value is None:
            self._remove_jc()
        else:
            self.get_or_add_jc().val = value

    @property
    def keepLines_val(self):
        """The value of `keepLines/@val` or |None| if `keepLines` is not present."""
        keepLines = self.keepLines
        return None if keepLines is None else keepLines.val

    @keepLines_val.setter
    def keepLines_val(self, value):
        if value is None:
            self._remove_keepLines()
            return
        self.get_or_add_keepLines().val = value

    @property
    def keepNext_val(self):
        """The value of `keepNext/@val` or |None| if `keepNext` is not present."""
        keepNext = self.keepNext
        return None if keepNext is None else keepNext.val

    @keepNext_val.setter
    def keepNext_val(self, value):
        if value is None:
            self._remove_keepNext()
            return
        self.get_or_add_keepNext().val = value

    @property
    def pageBreakBefore_val(self):
        """The value of `pageBreakBefore/@val` or |None| if the child is not present."""
        pageBreakBefore = self.pageBreakBefore
        return None if pageBreakBefore is None else pageBreakBefore.val

    @pageBreakBefore_val.setter
    def pageBreakBefore_val(self, value):
        if value is None:
            self._remove_pageBreakBefore()
            return
        self.get_or_add_pageBreakBefore().val = value

    @property
    def spacing_after(self):
        """The value of `w:spacing/@w:after` or |None| if `w:spacing` is not present."""
        spacing = self.spacing
        return None if spacing is None else spacing.after

    @spacing_after.setter
    def spacing_after(self, value):
        # -- don't add a `w:spacing` child just to clear an attribute on it --
        if value is None and self.spacing is None:
            return
        spacing = self.get_or_add_spacing()
        spacing.after = value

    @property
    def spacing_before(self):
        """The value of `w:spacing/@w:before` or |None| if `w:spacing` is not present."""
        spacing = self.spacing
        return None if spacing is None else spacing.before

    @spacing_before.setter
    def spacing_before(self, value):
        # -- don't add a `w:spacing` child just to clear an attribute on it --
        if value is None and self.spacing is None:
            return
        spacing = self.get_or_add_spacing()
        spacing.before = value

    @property
    def spacing_line(self):
        """The value of `w:spacing/@w:line` or |None| if `w:spacing` is not present."""
        spacing = self.spacing
        return None if spacing is None else spacing.line

    @spacing_line.setter
    def spacing_line(self, value):
        # -- don't add a `w:spacing` child just to clear an attribute on it --
        if value is None and self.spacing is None:
            return
        spacing = self.get_or_add_spacing()
        spacing.line = value

    @property
    def spacing_lineRule(self):
        """The value of `w:spacing/@w:lineRule` as a member of the :ref:`WdLineSpacing`
        enumeration.

        Returns |None| when `w:spacing` is not present. When `w:lineRule` is absent but
        `w:line` is present, `WD_LINE_SPACING.MULTIPLE` is returned.

        Only the `MULTIPLE`, `EXACTLY`, and `AT_LEAST` members are used. It is the
        responsibility of the client to calculate the use of `SINGLE`, `DOUBLE`, and
        `MULTIPLE` based on the value of `w:spacing/@w:line` if that behavior is
        desired.
        """
        spacing = self.spacing
        if spacing is None:
            return None
        rule = spacing.lineRule
        # -- a line value with no explicit rule is reported as MULTIPLE --
        if rule is None and spacing.line is not None:
            return WD_LINE_SPACING.MULTIPLE
        return rule

    @spacing_lineRule.setter
    def spacing_lineRule(self, value):
        # -- don't add a `w:spacing` child just to clear an attribute on it --
        if value is None and self.spacing is None:
            return
        spacing = self.get_or_add_spacing()
        spacing.lineRule = value

    @property
    def style(self) -> str | None:
        """String contained in `./w:pStyle/@val`, or None if child is not present."""
        pStyle = self.pStyle
        return None if pStyle is None else pStyle.val

    @style.setter
    def style(self, style: str | None):
        """Set `./w:pStyle/@val` to `style`, adding a new element if necessary.

        If `style` is |None|, remove `./w:pStyle` when present.
        """
        if style is None:
            self._remove_pStyle()
        else:
            self.get_or_add_pStyle().val = style

    @property
    def widowControl_val(self):
        """The value of `widowControl/@val` or |None| if the child is not present."""
        widowControl = self.widowControl
        return None if widowControl is None else widowControl.val

    @widowControl_val.setter
    def widowControl_val(self, value):
        if value is None:
            self._remove_widowControl()
            return
        self.get_or_add_widowControl().val = value
class CT_Spacing(BaseOxmlElement):
    """``<w:spacing>`` element, carrying paragraph spacing attributes such as the space
    before and after a paragraph and its line spacing."""

    # -- after/before use the (unsigned) twips simple-type --
    after = OptionalAttribute("w:after", ST_TwipsMeasure)
    before = OptionalAttribute("w:before", ST_TwipsMeasure)
    # -- line uses the signed twips simple-type --
    line = OptionalAttribute("w:line", ST_SignedTwipsMeasure)
    # -- lineRule is expressed as a `WD_LINE_SPACING` member --
    lineRule = OptionalAttribute("w:lineRule", WD_LINE_SPACING)
class CT_TabStop(BaseOxmlElement):
    """`<w:tab>` element describing one individual tab stop.

    This class also stands in for the tab-character element in a run, which shares
    the `w:tab` tag and needs nothing from this class beyond `__str__`.
    """

    # -- `w:val`, required; a member of WD_TAB_ALIGNMENT --
    val: WD_TAB_ALIGNMENT = RequiredAttribute(  # pyright: ignore[reportAssignmentType]
        "w:val", WD_TAB_ALIGNMENT
    )

    # -- `w:leader`, optional; declared with WD_TAB_LEADER.SPACES as its default --
    leader: WD_TAB_LEADER | None = OptionalAttribute(  # pyright: ignore[reportAssignmentType]
        "w:leader", WD_TAB_LEADER, default=WD_TAB_LEADER.SPACES
    )

    # -- `w:pos`, required; converted with ST_SignedTwipsMeasure --
    pos: Length = RequiredAttribute(  # pyright: ignore[reportAssignmentType]
        "w:pos", ST_SignedTwipsMeasure
    )

    def __str__(self) -> str:
        """Plain-text equivalent of a `w:tab` element found in a run: one tab.

        Lets the text of run inner-content be read the same way for every kind of
        text inner-content element.
        """
        return "\t"
class CT_TabStops(BaseOxmlElement):
    """``<w:tabs>`` element, container for a sorted sequence of tab stops."""

    tab = OneOrMore("w:tab", successors=())

    def insert_tab_in_order(self, pos, align, leader):
        """Create a `w:tab` child, place it by ascending `pos`, and return it.

        The new element goes immediately before the first existing tab whose `pos`
        is greater than its own, or at the end when there is no such tab.
        """
        new_tab = self._new_tab()
        new_tab.pos = pos
        new_tab.val = align
        new_tab.leader = leader

        # -- first existing tab that must follow the new one, if any --
        successor = next(
            (existing for existing in self.tab_lst if new_tab.pos < existing.pos), None
        )
        if successor is None:
            self.append(new_tab)
        else:
            successor.addprevious(new_tab)
        return new_tab

View File

@@ -0,0 +1,307 @@
"""Custom element classes related to text runs (CT_R)."""
from __future__ import annotations
from typing import TYPE_CHECKING, Callable, Iterator, List, cast
from docx.oxml.drawing import CT_Drawing
from docx.oxml.ns import qn
from docx.oxml.parser import OxmlElement
from docx.oxml.simpletypes import ST_BrClear, ST_BrType
from docx.oxml.text.font import CT_RPr
from docx.oxml.xmlchemy import BaseOxmlElement, OptionalAttribute, ZeroOrMore, ZeroOrOne
from docx.shared import TextAccumulator
if TYPE_CHECKING:
from docx.oxml.shape import CT_Anchor, CT_Inline
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
from docx.oxml.text.parfmt import CT_TabStop
# ------------------------------------------------------------------------------------
# Run-level elements
class CT_R(BaseOxmlElement):
    """`<w:r>` element, holding the properties (`w:rPr`) and inner-content of a run."""

    # -- typing-only declarations for methods this class body does not define itself --
    add_br: Callable[[], CT_Br]
    add_tab: Callable[[], CT_TabStop]
    get_or_add_rPr: Callable[[], CT_RPr]
    _add_drawing: Callable[[], CT_Drawing]
    _add_t: Callable[..., CT_Text]

    # -- child-element declarations --
    rPr: CT_RPr | None = ZeroOrOne("w:rPr")  # pyright: ignore[reportAssignmentType]
    br = ZeroOrMore("w:br")
    cr = ZeroOrMore("w:cr")
    drawing = ZeroOrMore("w:drawing")
    t = ZeroOrMore("w:t")
    tab = ZeroOrMore("w:tab")

    def add_t(self, text: str) -> CT_Text:
        """Add a new `<w:t>` element containing `text` and return it.

        `xml:space="preserve"` is set on the new element whenever `text` has leading
        or trailing whitespace.
        """
        t_elm = self._add_t(text=text)
        has_edge_whitespace = text != text.strip()
        if has_edge_whitespace:
            t_elm.set(qn("xml:space"), "preserve")
        return t_elm

    def add_drawing(self, inline_or_anchor: CT_Inline | CT_Anchor) -> CT_Drawing:
        """Add a new `CT_Drawing` (`w:drawing`) child and return it.

        `inline_or_anchor` is appended as the child of the new `w:drawing` element.
        """
        drawing_elm = self._add_drawing()
        drawing_elm.append(inline_or_anchor)
        return drawing_elm

    def clear_content(self) -> None:
        """Remove every child element other than `w:rPr`."""
        removable = self.xpath("./*[not(self::w:rPr)]")
        for child in removable:
            self.remove(child)

    @property
    def inner_content_items(self) -> List[str | CT_Drawing | CT_LastRenderedPageBreak]:
        """Run content as text strings interleaved with non-text items.

        `w:drawing` and `w:lastRenderedPageBreak` elements appear in the list as the
        element objects themselves. The other selected elements contribute their
        `str()` value to the text accumulated around those items.
        """
        from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak

        selector = (
            "w:br"
            " | w:cr"
            " | w:drawing"
            " | w:lastRenderedPageBreak"
            " | w:noBreakHyphen"
            " | w:ptab"
            " | w:t"
            " | w:tab"
        )
        items: List[str | CT_Drawing | CT_LastRenderedPageBreak] = []
        pending_text = TextAccumulator()

        for elm in self.xpath(selector):
            if not isinstance(elm, (CT_Drawing, CT_LastRenderedPageBreak)):
                pending_text.push(str(elm))
                continue
            # -- a non-text item ends the current text; emit that text first --
            items.extend(pending_text.pop())
            items.append(elm)

        # -- text after the last non-text item (or all of it when there is none) --
        items.extend(pending_text.pop())
        return items

    def insert_comment_range_end_and_reference_below(self, comment_id: int) -> None:
        """Insert `w:commentRangeEnd` and a `w:commentReference` run after this run.

        The `w:commentRangeEnd` element becomes the immediate following sibling of
        this `w:r`, and the `w:r` holding the `w:commentReference` comes after it.
        """
        reference_run = self._new_comment_reference_run(comment_id)
        range_end = OxmlElement("w:commentRangeEnd", attrs={qn("w:id"): str(comment_id)})
        # -- each `addnext()` inserts directly after this run, so the element added
        # -- last ends up nearest to it
        self.addnext(reference_run)
        self.addnext(range_end)

    def insert_comment_range_start_above(self, comment_id: int) -> None:
        """Insert a `w:commentRangeStart` element for `comment_id` before this run."""
        range_start = OxmlElement("w:commentRangeStart", attrs={qn("w:id"): str(comment_id)})
        self.addprevious(range_start)

    @property
    def lastRenderedPageBreaks(self) -> List[CT_LastRenderedPageBreak]:
        """All `w:lastRenderedPageBreak` child elements of this run."""
        return self.xpath("./w:lastRenderedPageBreak")

    @property
    def style(self) -> str | None:
        """The `style` value of the `w:rPr` child, the `w:val` of its `w:rStyle`.

        |None| when this run has no `w:rPr` child.
        """
        rPr_elm = self.rPr
        return None if rPr_elm is None else rPr_elm.style

    @style.setter
    def style(self, style: str | None):
        """Set the character style of this `w:r` element to `style`.

        A `w:rPr` child is added when there is none. `style` is then assigned to
        its `style` property; None is meant to remove the style element.
        """
        self.get_or_add_rPr().style = style

    @property
    def text(self) -> str:
        """Textual content of this run.

        Inner-content children such as `w:tab` contribute their text equivalent.
        """
        text_elms = self.xpath("w:br | w:cr | w:noBreakHyphen | w:ptab | w:t | w:tab")
        return "".join(map(str, text_elms))

    @text.setter
    def text(self, text: str):  # pyright: ignore[reportIncompatibleMethodOverride]
        """Replace the content of this run, keeping any `w:rPr`, with `text`."""
        self.clear_content()
        _RunContentAppender.append_to_run_from_text(self, text)

    def _insert_rPr(self, rPr: CT_RPr) -> CT_RPr:
        """Insert `rPr` as the first child of this run and return it."""
        self.insert(0, rPr)
        return rPr

    def _new_comment_reference_run(self, comment_id: int) -> CT_R:
        """Build a new `w:r` whose `w:commentReference` refers to `comment_id`.

        The result is expected to look like this:

            <w:r>
              <w:rPr><w:rStyle w:val="CommentReference"/></w:rPr>
              <w:commentReference w:id="0"/>
            </w:r>
        """
        run = cast(CT_R, OxmlElement("w:r"))
        run.get_or_add_rPr().style = "CommentReference"
        reference = OxmlElement("w:commentReference", attrs={qn("w:id"): str(comment_id)})
        run.append(reference)
        return run
# ------------------------------------------------------------------------------------
# Run inner-content elements
class CT_Br(BaseOxmlElement):
    """`<w:br>` element, indicating a line, page, or column break in a run."""

    # -- `w:type`, optional; declared with "textWrapping" as its default --
    type: str | None = OptionalAttribute(  # pyright: ignore[reportAssignmentType]
        "w:type", ST_BrType, default="textWrapping"
    )
    # -- `w:clear`, optional --
    clear: str | None = OptionalAttribute("w:clear", ST_BrClear)  # pyright: ignore

    def __str__(self) -> str:
        """Text equivalent of this element, which depends on the break type.

        A "textWrapping" break maps to a newline ("\\n"); any other type, such as a
        column or page break, maps to the empty string.

        This lets the text of run inner-content be read in one consistent way for
        all run inner-content text elements.
        """
        if self.type == "textWrapping":
            return "\n"
        return ""
class CT_Cr(BaseOxmlElement):
    """`<w:cr>` element, representing a carriage-return (0x0D) character within a run.

    In Word this is a "soft carriage-return": unlike pressing Enter (aka. Return) on
    the keyboard, it does not end the paragraph. Its text equivalent here is a newline
    ("\\n"), the closest plain-text match in Python.

    NOTE: this complex-type name does not exist in the schema, where `w:cr` maps to
    `CT_Empty`. The name was added to give the element distinguished behavior, since
    CT_Empty is used for many elements.
    """

    def __str__(self) -> str:
        """Text equivalent of this element: exactly one newline character."""
        return "\n"
class CT_NoBreakHyphen(BaseOxmlElement):
    """`<w:noBreakHyphen>` element, a hyphen that is not eligible as a line-wrap position.

    Its plain-text equivalent is a dash ("-").

    NOTE: this complex-type name does not exist in the schema, where
    `w:noBreakHyphen` maps to `CT_Empty`. The name was added so this element can
    behave differently from the many other elements the schema represents with
    CT_Empty.
    """

    def __str__(self) -> str:
        """Text equivalent of this element: exactly one dash character."""
        return "-"
class CT_PTab(BaseOxmlElement):
    """`<w:ptab>` element, an absolute-position tab character within a run.

    The character moves the rendering position to the specified position regardless
    of any tab-stops, perhaps to lay out a table-of-contents (TOC) or similar.
    """

    def __str__(self) -> str:
        """Text equivalent of this element: exactly one tab character.

        This lets the text of run inner-content be read in one consistent way for
        all run inner-content text elements.
        """
        return "\t"
# -- CT_Tab functionality is provided by CT_TabStop which also uses `w:tab` tag. That
# -- element class provides the __str__() method for this empty element, unconditionally
# -- returning "\t".
class CT_Text(BaseOxmlElement):
    """`<w:t>` element, containing a sequence of characters within a run."""

    def __str__(self) -> str:
        """Text held by this element; the empty string when it has no content.

        Gives this element the same text-query interface as the other run
        inner-content elements. Unlike the `text` attribute of `etree._Element`,
        the result is never None.
        """
        content = self.text
        return content if content else ""
# ------------------------------------------------------------------------------------
# Utility
class _RunContentAppender:
    """Translates a Python string into run content elements appended to a `w:r` element.

    A contiguous run of regular characters becomes one `<w:t>` element, added with
    `add_t()`. Each tab character triggers a call to `add_tab()` on the run. Each
    newline or carriage-return character triggers a call to `add_br()`, so a
    carriage-return followed by a newline results in two calls.
    """

    def __init__(self, r: CT_R):
        self._r = r
        # -- regular characters collected since the last flush --
        self._bfr: List[str] = []

    @classmethod
    def append_to_run_from_text(cls, r: CT_R, text: str):
        """Append inner-content elements for `text` to the `r` element."""
        cls(r).add_text(text)

    def add_text(self, text: str):
        """Append inner-content elements for `text` to the `w:r` element."""
        for ch in text:
            self.add_char(ch)
        # -- write out whatever regular text is still buffered --
        self.flush()

    def add_char(self, char: str):
        """Process the next input character through the finite state machine (FSM).

        There are two states, buffer pending and buffer not pending, both hidden
        behind `.flush()`. That method must be called at the end of the text so any
        pending `<w:t>` element gets written.
        """
        is_tab = char == "\t"
        if not is_tab and char not in "\r\n":
            # -- regular character, keep collecting --
            self._bfr.append(char)
            return

        # -- a tab or line-ending closes the text collected so far --
        self.flush()
        if is_tab:
            self._r.add_tab()
        else:
            self._r.add_br()

    def flush(self):
        """Write buffered characters as one `<w:t>` element, then empty the buffer.

        Nothing is written when the buffered text is empty.
        """
        buffered = "".join(self._bfr)
        if len(buffered) > 0:
            self._r.add_t(buffered)
        self._bfr.clear()