Initial commit (Clean history)

This commit is contained in:
anhduy-tech
2025-12-30 11:27:14 +07:00
commit ef48c93de0
19255 changed files with 3248867 additions and 0 deletions

View File

@@ -0,0 +1,65 @@
"""Initialize `docx` package.
Export the `Document` constructor function and establish the mapping of part-type to
the part-class that implements that type.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Type
from docx.api import Document
if TYPE_CHECKING:
from docx.opc.part import Part
__version__ = "1.2.0"
__all__ = ["Document"]
# -- register custom Part classes with opc package reader --
from docx.opc.constants import CONTENT_TYPE as CT
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx.opc.part import PartFactory
from docx.opc.parts.coreprops import CorePropertiesPart
from docx.parts.comments import CommentsPart
from docx.parts.document import DocumentPart
from docx.parts.hdrftr import FooterPart, HeaderPart
from docx.parts.image import ImagePart
from docx.parts.numbering import NumberingPart
from docx.parts.settings import SettingsPart
from docx.parts.styles import StylesPart
def part_class_selector(content_type: str, reltype: str) -> Type[Part] | None:
    """Return the custom part class for a relationship type, or |None| for no override.

    Only image relationships are special-cased: any part reached via `RT.IMAGE` is
    loaded as an `ImagePart`. `content_type` is accepted but plays no role in the
    decision.
    """
    return ImagePart if reltype == RT.IMAGE else None
# -- install the relationship-type based selector defined above on the factory --
PartFactory.part_class_selector = part_class_selector
# -- map each content-type to the Part subclass that implements it --
PartFactory.part_type_for[CT.OPC_CORE_PROPERTIES] = CorePropertiesPart
PartFactory.part_type_for[CT.WML_COMMENTS] = CommentsPart
PartFactory.part_type_for[CT.WML_DOCUMENT_MAIN] = DocumentPart
PartFactory.part_type_for[CT.WML_FOOTER] = FooterPart
PartFactory.part_type_for[CT.WML_HEADER] = HeaderPart
PartFactory.part_type_for[CT.WML_NUMBERING] = NumberingPart
PartFactory.part_type_for[CT.WML_SETTINGS] = SettingsPart
PartFactory.part_type_for[CT.WML_STYLES] = StylesPart
# -- remove the registration helpers from the package namespace; `RT` and `ImagePart`
# -- are not in this list, `part_class_selector()` still looks them up when called --
del (
CT,
CorePropertiesPart,
CommentsPart,
DocumentPart,
FooterPart,
HeaderPart,
NumberingPart,
PartFactory,
SettingsPart,
StylesPart,
part_class_selector,
)

View File

@@ -0,0 +1,37 @@
"""Directly exposed API functions and classes, :func:`Document` for now.
Provides a syntactically more convenient API for interacting with the OpcPackage graph.
"""
from __future__ import annotations
import os
from typing import IO, TYPE_CHECKING, cast
from docx.opc.constants import CONTENT_TYPE as CT
from docx.package import Package
if TYPE_CHECKING:
from docx.document import Document as DocumentObject
from docx.parts.document import DocumentPart
def Document(docx: str | IO[bytes] | None = None) -> DocumentObject:
    """Open `docx` and return its |Document| object.

    `docx` may be a path to a ``.docx`` file (a string) or a file-like object. When it
    is omitted or ``None``, the built-in default document "template" is opened instead.

    Raises |ValueError| when the main document part of the package does not have the
    WordprocessingML main-document content type.
    """
    if docx is None:
        docx = _default_docx_path()

    package = Package.open(docx)
    document_part = cast("DocumentPart", package.main_document_part)

    if document_part.content_type == CT.WML_DOCUMENT_MAIN:
        return document_part.document

    raise ValueError(
        "file '%s' is not a Word file, content type is '%s'" % (docx, document_part.content_type)
    )
def _default_docx_path():
    """Return the path of the default .docx package bundled next to this module.

    The file is expected at ``templates/default.docx`` relative to this module's directory.
    """
    module_dir, _ = os.path.split(__file__)
    return os.path.join(module_dir, "templates", "default.docx")

View File

@@ -0,0 +1,101 @@
# pyright: reportImportCycles=false
"""Block item container, used by body, cell, header, etc.
Block level items are things like paragraph and table, although there are a few other
specialized ones like structured document tags.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Iterator
from typing_extensions import TypeAlias
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.shared import StoryChild
from docx.text.paragraph import Paragraph
if TYPE_CHECKING:
import docx.types as t
from docx.oxml.comments import CT_Comment
from docx.oxml.document import CT_Body
from docx.oxml.section import CT_HdrFtr
from docx.oxml.table import CT_Tc
from docx.shared import Length
from docx.styles.style import ParagraphStyle
from docx.table import Table
BlockItemElement: TypeAlias = "CT_Body | CT_Comment | CT_HdrFtr | CT_Tc"
class BlockItemContainer(StoryChild):
    """Base class for proxy objects that can contain block items.

    Containers of this kind include _Body, _Cell, header, footer, footnote, endnote,
    comment, and text box objects. The class supplies the behavior they share: adding
    and enumerating block-level items such as paragraphs and tables.
    """

    def __init__(self, element: BlockItemElement, parent: t.ProvidesStoryPart):
        super().__init__(parent)
        self._element = element

    def add_paragraph(self, text: str = "", style: str | ParagraphStyle | None = None) -> Paragraph:
        """Append a new paragraph to this container and return it.

        When `text` is non-empty it is placed in a single run. When `style` is not |None|
        it is assigned as the paragraph style; leaving it |None| applies no paragraph
        style, which has the same effect as the 'Normal' style.
        """
        new_paragraph = self._add_paragraph()
        if text:
            new_paragraph.add_run(text)
        if style is None:
            return new_paragraph
        new_paragraph.style = style
        return new_paragraph

    def add_table(self, rows: int, cols: int, width: Length) -> Table:
        """Append a table of `rows` rows and `cols` columns and return it.

        The table is added at the end of the content in this container. `width` is
        evenly distributed between the table columns.
        """
        # -- imported here rather than at module level (see reportImportCycles above) --
        from docx.table import Table

        tbl_elm = CT_Tbl.new_tbl(rows, cols, width)
        self._element._insert_tbl(tbl_elm)  # pyright: ignore[reportPrivateUsage]
        return Table(tbl_elm, self)

    def iter_inner_content(self) -> Iterator[Paragraph | Table]:
        """Generate each `Paragraph` or `Table` in this container in document order."""
        from docx.table import Table

        for child in self._element.inner_content_elements:
            if isinstance(child, CT_P):
                yield Paragraph(child, self)
            else:
                yield Table(child, self)

    @property
    def paragraphs(self):
        """List of the paragraphs in this container, in document order.

        Read-only.
        """
        container_elm = self._element
        return [Paragraph(p_elm, self) for p_elm in container_elm.p_lst]

    @property
    def tables(self):
        """List of the tables in this container, in document order.

        Read-only.
        """
        from docx.table import Table

        container_elm = self._element
        return [Table(tbl_elm, self) for tbl_elm in container_elm.tbl_lst]

    def _add_paragraph(self):
        """Append an empty paragraph to this container and return its proxy."""
        p_elm = self._element.add_p()
        return Paragraph(p_elm, self)

View File

@@ -0,0 +1,163 @@
"""Collection providing access to comments added to this document."""
from __future__ import annotations
import datetime as dt
from typing import TYPE_CHECKING, Iterator
from docx.blkcntnr import BlockItemContainer
if TYPE_CHECKING:
from docx.oxml.comments import CT_Comment, CT_Comments
from docx.parts.comments import CommentsPart
from docx.styles.style import ParagraphStyle
from docx.text.paragraph import Paragraph
class Comments:
    """Collection containing the comments added to this document."""

    def __init__(self, comments_elm: CT_Comments, comments_part: CommentsPart):
        self._comments_elm = comments_elm
        self._comments_part = comments_part

    def __iter__(self) -> Iterator[Comment]:
        """Iterator over the comments in this collection."""
        return (
            Comment(comment_elm, self._comments_part)
            for comment_elm in self._comments_elm.comment_lst
        )

    def __len__(self) -> int:
        """The number of comments in this collection."""
        return len(self._comments_elm.comment_lst)

    def add_comment(
        self, text: str | None = "", author: str = "", initials: str | None = ""
    ) -> Comment:
        """Add a new comment to the document and return it.

        The comment is added to the end of the comments collection and is assigned a unique
        comment-id.

        If `text` is provided, it is added to the comment. This option provides for the common
        case where a comment contains a modest passage of plain text. Multiple paragraphs can be
        added using the `text` argument by separating their text with newlines (`"\\n"`).
        Between newlines, text is interpreted as it is in `Document.add_paragraph(text=...)`.
        Passing |None| for `text` is treated the same as passing the empty string.

        The default is to place a single empty paragraph in the comment, which is the same
        behavior as the Word UI when you add a comment. New runs can be added to the first
        paragraph in the empty comment with `comment.paragraphs[0].add_run()` to add more
        complex text with emphasis or images. Additional paragraphs can be added using
        `.add_paragraph()`.

        `author` is a required attribute, set to the empty string by default.

        `initials` is an optional attribute, set to the empty string by default. Passing |None|
        for the `initials` parameter causes that attribute to be omitted from the XML.
        """
        comment_elm = self._comments_elm.add_comment()
        comment_elm.author = author
        comment_elm.initials = initials
        comment_elm.date = dt.datetime.now(dt.timezone.utc)
        comment = Comment(comment_elm, self._comments_part)

        # -- `None` is accepted because `Document.add_comment()` advertises `str | None`;
        # -- without this guard a `None` value would fail on `.split()` below --
        if not text:
            return comment

        # -- first line goes into the paragraph the new comment already contains, each
        # -- further line gets a paragraph of its own --
        first_para_text, *other_para_texts = text.split("\n")
        comment.paragraphs[0].add_run(first_para_text)
        for para_text in other_para_texts:
            comment.add_paragraph(text=para_text)

        return comment

    def get(self, comment_id: int) -> Comment | None:
        """Return the comment identified by `comment_id`, or |None| if not found."""
        comment_elm = self._comments_elm.get_comment_by_id(comment_id)
        return Comment(comment_elm, self._comments_part) if comment_elm is not None else None
class Comment(BlockItemContainer):
    """Proxy for a single comment in the document.

    Gives access to the comment metadata: author, initials, and date.

    Like a table cell, a comment is a block-item container, so it may hold paragraphs and
    tables, and its paragraphs may hold rich text, hyperlinks and images. The usual case
    though is a single paragraph of plain text such as a sentence or phrase.

    Certain content like tables may not be displayed in the Word comment sidebar because of
    space limitations. Such "over-sized" content can still be viewed in the review pane.
    """

    def __init__(self, comment_elm: CT_Comment, comments_part: CommentsPart):
        super().__init__(comment_elm, comments_part)
        self._comment_elm = comment_elm

    def add_paragraph(self, text: str = "", style: str | ParagraphStyle | None = None) -> Paragraph:
        """Append a new paragraph to this comment and return it.

        `text`, when present, is placed in a single run and `style` is applied as the
        paragraph style. When `style` is |None| or omitted, the "CommentText" paragraph
        style is applied, which is the default style for comments.
        """
        new_paragraph = super().add_paragraph(text, style)
        if style is not None:
            return new_paragraph
        # -- the style name is written directly on the element because `paragraph.style`
        # -- raises when a style is not present in the styles part
        new_paragraph._p.style = "CommentText"  # pyright: ignore[reportPrivateUsage]
        return new_paragraph

    @property
    def author(self) -> str:
        """Read/write. The recorded author of this comment.

        The field is required but may be the empty string.
        """
        return self._comment_elm.author

    @author.setter
    def author(self, value: str):
        self._comment_elm.author = value

    @property
    def comment_id(self) -> int:
        """The unique identifier of this comment."""
        return self._comment_elm.id

    @property
    def initials(self) -> str | None:
        """Read/write. The recorded initials of the comment author.

        The attribute is optional in the XML; |None| is returned when it is not set.
        Assigning |None| removes any existing initials from the XML.
        """
        return self._comment_elm.initials

    @initials.setter
    def initials(self, value: str | None):
        self._comment_elm.initials = value

    @property
    def text(self) -> str:
        """The text content of this comment as a string.

        Only paragraph content contributes; emphasis and styling are not represented.
        Paragraph boundaries are indicated with a newline (`"\\n"`).
        """
        paragraph_texts = [paragraph.text for paragraph in self.paragraphs]
        return "\n".join(paragraph_texts)

    @property
    def timestamp(self) -> dt.datetime | None:
        """The date and time this comment was authored.

        The attribute is optional in the XML; |None| is returned when it is not set.
        """
        return self._comment_elm.date

View File

@@ -0,0 +1,112 @@
"""DrawingML objects related to color, ColorFormat being the most prominent."""
from __future__ import annotations
from typing import TYPE_CHECKING, cast
from typing_extensions import TypeAlias
from docx.enum.dml import MSO_COLOR_TYPE
from docx.oxml.simpletypes import ST_HexColorAuto
from docx.shared import ElementProxy, RGBColor
if TYPE_CHECKING:
from docx.enum.dml import MSO_THEME_COLOR
from docx.oxml.text.font import CT_Color
from docx.oxml.text.run import CT_R
# -- other element types can be a parent of an `w:rPr` element, but for now only `w:r` is --
RPrParent: TypeAlias = "CT_R"
class ColorFormat(ElementProxy):
    """Access to color settings: RGB color, theme color, and luminance adjustments."""

    def __init__(self, rPr_parent: RPrParent):
        super().__init__(rPr_parent)
        self._element = rPr_parent

    @property
    def rgb(self) -> RGBColor | None:
        """An |RGBColor| value or |None| if no RGB color is specified.

        The value is always an |RGBColor| when :attr:`type` is `MSO_COLOR_TYPE.RGB`. It
        may also be an |RGBColor| when :attr:`type` is `MSO_COLOR_TYPE.THEME`, because Word
        writes the current value of a theme color when one is assigned. In that case treat
        the RGB value as no more than a good guess, since the theme color takes precedence
        at rendering time. The value is |None| whenever :attr:`type` is either |None| or
        `MSO_COLOR_TYPE.AUTO`.

        Assigning an |RGBColor| value makes :attr:`type` become `MSO_COLOR_TYPE.RGB` and
        removes any theme color. Assigning |None| removes any color so the effective color
        is inherited from the style hierarchy.
        """
        color_elm = self._color
        if color_elm is None or color_elm.val == ST_HexColorAuto.AUTO:
            return None
        return cast(RGBColor, color_elm.val)

    @rgb.setter
    def rgb(self, value: RGBColor | None):
        clearing = value is None
        # -- nothing to remove, and no need to create an empty `w:rPr` just to find out --
        if clearing and self._color is None:
            return
        rPr = self._element.get_or_add_rPr()
        # -- always start from a clean slate so no theme-color attributes survive --
        rPr._remove_color()  # pyright: ignore[reportPrivateUsage]
        if clearing:
            return
        rPr.get_or_add_color().val = value

    @property
    def theme_color(self) -> MSO_THEME_COLOR | None:
        """Member of :ref:`MsoThemeColorIndex` or |None| if no theme color is specified.

        The value is always a member of :ref:`MsoThemeColorIndex` when :attr:`type` is
        `MSO_COLOR_TYPE.THEME` and |None| for any other :attr:`type`.

        Assigning a member of :ref:`MsoThemeColorIndex` makes :attr:`type` become
        `MSO_COLOR_TYPE.THEME`. Any existing RGB value is retained but ignored by Word.
        Assigning |None| removes any color specification so the effective color is
        inherited from the style hierarchy.
        """
        color_elm = self._color
        return None if color_elm is None else color_elm.themeColor

    @theme_color.setter
    def theme_color(self, value: MSO_THEME_COLOR | None):
        if value is not None:
            self._element.get_or_add_rPr().get_or_add_color().themeColor = value
            return
        # -- assigning |None| removes the whole color element, when there is one --
        if self._color is None:
            return
        rPr = self._element.rPr
        if rPr is not None:
            rPr._remove_color()  # pyright: ignore[reportPrivateUsage]

    @property
    def type(self) -> MSO_COLOR_TYPE | None:
        """Read-only.

        A member of :ref:`MsoColorType`, one of RGB, THEME, or AUTO, describing how this
        color is defined. The value is |None| when no color is applied at this level, in
        which case the effective color is inherited from the style hierarchy.
        """
        color_elm = self._color
        if color_elm is None:
            return None
        # -- a theme color wins over whatever is in the `val` attribute --
        if color_elm.themeColor is not None:
            return MSO_COLOR_TYPE.THEME
        is_auto = color_elm.val == ST_HexColorAuto.AUTO
        return MSO_COLOR_TYPE.AUTO if is_auto else MSO_COLOR_TYPE.RGB

    @property
    def _color(self) -> CT_Color | None:
        """The `w:rPr/w:color` element, or |None| when it is not present.

        Helper to factor out repetitive element access.
        """
        rPr = self._element.rPr
        return None if rPr is None else rPr.color

View File

@@ -0,0 +1,265 @@
# pyright: reportImportCycles=false
# pyright: reportPrivateUsage=false
"""|Document| and closely related objects."""
from __future__ import annotations
from typing import IO, TYPE_CHECKING, Iterator, List, Sequence
from docx.blkcntnr import BlockItemContainer
from docx.enum.section import WD_SECTION
from docx.enum.text import WD_BREAK
from docx.section import Section, Sections
from docx.shared import ElementProxy, Emu, Inches, Length
from docx.text.run import Run
if TYPE_CHECKING:
import docx.types as t
from docx.comments import Comment, Comments
from docx.oxml.document import CT_Body, CT_Document
from docx.parts.document import DocumentPart
from docx.settings import Settings
from docx.styles.style import ParagraphStyle, _TableStyle
from docx.table import Table
from docx.text.paragraph import Paragraph
class Document(ElementProxy):
    """WordprocessingML (WML) document.

    Not intended to be constructed directly. Use :func:`docx.Document` to open or create
    a document.
    """

    def __init__(self, element: CT_Document, part: DocumentPart):
        super().__init__(element)
        self._element = element
        self._part = part
        # -- `_Body` proxy is created on first access, see `._body` --
        self.__body = None

    def add_comment(
        self,
        runs: Run | Sequence[Run],
        text: str | None = "",
        author: str = "",
        initials: str | None = "",
    ) -> Comment:
        """Add a comment to the document, anchored to the specified runs.

        `runs` can be a single `Run` object or a non-empty sequence of `Run` objects. Only the
        first and last run of a sequence are used, it's just more convenient to pass a whole
        sequence when that's what you have handy, like `paragraph.runs` for example. When `runs`
        contains a single `Run` object, that run serves as both the first and last run.

        A comment can be anchored only on an even run boundary, meaning the text the comment
        "references" must be a non-zero integer number of consecutive runs. The runs need not be
        _contiguous_ per se, like the first can be in one paragraph and the last in the next
        paragraph, but all runs between the first and the last will be included in the reference.

        The comment reference range is delimited by placing a `w:commentRangeStart` element before
        the first run and a `w:commentRangeEnd` element after the last run. This is why only the
        first and last run are required and why a single run can serve as both first and last.
        Word works out which text to highlight in the UI based on these range markers.

        `text` allows the contents of a simple comment to be provided in the call, providing for
        the common case where a comment is a single phrase or sentence without special formatting
        such as bold or italics. Passing |None| is the same as passing the empty string. More
        complex comments can be added using the returned `Comment` object in much the same way as
        a `Document` or (table) `Cell` object, using methods like `.add_paragraph()`,
        `.add_run()`, etc.

        The `author` and `initials` parameters allow that metadata to be set for the comment.
        `author` is a required attribute on a comment and is the empty string by default.
        `initials` is optional on a comment and may be omitted by passing |None|, but Word adds an
        `initials` attribute by default and we follow that convention by using the empty string
        when no `initials` argument is provided.
        """
        # -- normalize `runs` to a sequence of runs --
        runs = [runs] if isinstance(runs, Run) else runs
        first_run = runs[0]
        last_run = runs[-1]

        # -- Note that comments can only appear in the document part. `text` is normalized
        # -- to a `str` because `Comments.add_comment()` declares it as `str`, so a `None`
        # -- passed through unchanged would fail there --
        comment = self.comments.add_comment(text=text or "", author=author, initials=initials)

        # -- let the first run orchestrate placement of the comment range start and end --
        first_run.mark_comment_range(last_run, comment.comment_id)

        return comment

    def add_heading(self, text: str = "", level: int = 1):
        """Return a heading paragraph newly added to the end of the document.

        The heading paragraph will contain `text` and have its paragraph style
        determined by `level`. If `level` is 0, the style is set to `Title`. If `level`
        is 1 (or omitted), `Heading 1` is used. Otherwise the style is set to `Heading
        {level}`. Raises |ValueError| if `level` is outside the range 0-9.
        """
        if not 0 <= level <= 9:
            raise ValueError("level must be in range 0-9, got %d" % level)
        style = "Title" if level == 0 else "Heading %d" % level
        return self.add_paragraph(text, style)

    def add_page_break(self):
        """Return a new |Paragraph| object containing only a page break."""
        paragraph = self.add_paragraph()
        paragraph.add_run().add_break(WD_BREAK.PAGE)
        return paragraph

    def add_paragraph(self, text: str = "", style: str | ParagraphStyle | None = None) -> Paragraph:
        """Return paragraph newly added to the end of the document.

        The paragraph is populated with `text` and having paragraph style `style`.

        `text` can contain tab (``\\t``) characters, which are converted to the
        appropriate XML form for a tab. `text` can also include newline (``\\n``) or
        carriage return (``\\r``) characters, each of which is converted to a line
        break.
        """
        return self._body.add_paragraph(text, style)

    def add_picture(
        self,
        image_path_or_stream: str | IO[bytes],
        width: int | Length | None = None,
        height: int | Length | None = None,
    ):
        """Return new picture shape added in its own paragraph at end of the document.

        The picture contains the image at `image_path_or_stream`, scaled based on
        `width` and `height`. If neither width nor height is specified, the picture
        appears at its native size. If only one is specified, it is used to compute a
        scaling factor that is then applied to the unspecified dimension, preserving the
        aspect ratio of the image. The native size of the picture is calculated using
        the dots-per-inch (dpi) value specified in the image file, defaulting to 72 dpi
        if no value is specified, as is often the case.
        """
        run = self.add_paragraph().add_run()
        return run.add_picture(image_path_or_stream, width, height)

    def add_section(self, start_type: WD_SECTION = WD_SECTION.NEW_PAGE):
        """Return a |Section| object newly added at the end of the document.

        The optional `start_type` argument must be a member of the :ref:`WdSectionStart`
        enumeration, and defaults to ``WD_SECTION.NEW_PAGE`` if not provided.
        """
        new_sectPr = self._element.body.add_section_break()
        new_sectPr.start_type = start_type
        return Section(new_sectPr, self._part)

    def add_table(self, rows: int, cols: int, style: str | _TableStyle | None = None):
        """Add a table having row and column counts of `rows` and `cols` respectively.

        `style` may be a table style object or a table style name. If `style` is |None|,
        the table inherits the default table style of the document.
        """
        # -- the table spans the space between the margins of the last section --
        table = self._body.add_table(rows, cols, self._block_width)
        table.style = style
        return table

    @property
    def comments(self) -> Comments:
        """A |Comments| object providing access to comments added to the document."""
        return self._part.comments

    @property
    def core_properties(self):
        """A |CoreProperties| object providing Dublin Core properties of document."""
        return self._part.core_properties

    @property
    def inline_shapes(self):
        """The |InlineShapes| collection for this document.

        An inline shape is a graphical object, such as a picture, contained in a run of
        text and behaving like a character glyph, being flowed like other text in a
        paragraph.
        """
        return self._part.inline_shapes

    def iter_inner_content(self) -> Iterator[Paragraph | Table]:
        """Generate each `Paragraph` or `Table` in this document in document order."""
        return self._body.iter_inner_content()

    @property
    def paragraphs(self) -> List[Paragraph]:
        """The |Paragraph| instances in the document, in document order.

        Note that paragraphs within revision marks such as ``<w:ins>`` or ``<w:del>`` do
        not appear in this list.
        """
        return self._body.paragraphs

    @property
    def part(self) -> DocumentPart:
        """The |DocumentPart| object of this document."""
        return self._part

    def save(self, path_or_stream: str | IO[bytes]):
        """Save this document to `path_or_stream`.

        `path_or_stream` can be either a path to a filesystem location (a string) or a
        file-like object.
        """
        self._part.save(path_or_stream)

    @property
    def sections(self) -> Sections:
        """|Sections| object providing access to each section in this document."""
        return Sections(self._element, self._part)

    @property
    def settings(self) -> Settings:
        """A |Settings| object providing access to the document-level settings."""
        return self._part.settings

    @property
    def styles(self):
        """A |Styles| object providing access to the styles in this document."""
        return self._part.styles

    @property
    def tables(self) -> List[Table]:
        """All |Table| instances in the document, in document order.

        Note that only tables appearing at the top level of the document appear in this
        list; a table nested inside a table cell does not appear. A table within
        revision marks such as ``<w:ins>`` or ``<w:del>`` will also not appear in the
        list.
        """
        return self._body.tables

    @property
    def _block_width(self) -> Length:
        """A |Length| object specifying the space between margins in last section."""
        section = self.sections[-1]
        # -- fall back to 8.5" page width and 1" margins when a value is missing --
        page_width = section.page_width or Inches(8.5)
        left_margin = section.left_margin or Inches(1)
        right_margin = section.right_margin or Inches(1)
        return Emu(page_width - left_margin - right_margin)

    @property
    def _body(self) -> _Body:
        """The |_Body| instance containing the content for this document."""
        if self.__body is None:
            self.__body = _Body(self._element.body, self)
        return self.__body
class _Body(BlockItemContainer):
    """Proxy for the `<w:body>` element of this document.

    Its primary role is a container for document content.
    """

    def __init__(self, body_elm: CT_Body, parent: t.ProvidesStoryPart):
        super().__init__(body_elm, parent)
        self._body = body_elm

    def clear_content(self) -> _Body:
        """Clear all content from this |_Body| and return the same instance.

        Section properties for the main document story, if present, are preserved.
        """
        body_elm = self._body
        body_elm.clear_content()
        return self

View File

@@ -0,0 +1,59 @@
"""DrawingML-related objects are in this subpackage."""
from __future__ import annotations
from typing import TYPE_CHECKING
from docx.oxml.drawing import CT_Drawing
from docx.shared import Parented
if TYPE_CHECKING:
import docx.types as t
from docx.image.image import Image
class Drawing(Parented):
    """Container for a DrawingML object."""

    def __init__(self, drawing: CT_Drawing, parent: t.ProvidesStoryPart):
        super().__init__(parent)
        self._parent = parent
        self._element = drawing
        self._drawing = drawing

    @property
    def has_picture(self) -> bool:
        """True when `drawing` contains an embedded picture.

        Besides a picture, a drawing can hold a chart, SmartArt, or a drawing canvas.
        Picture-related members such as `.image` raise when the drawing does not contain
        a picture, so use this value to find out whether they will succeed.

        The value is `False` when a linked picture is present. This should be relatively
        rare and the image would only be retrievable from the filesystem.

        No distinction is made between inline and floating images; either one makes this
        value `True`.
        """
        inline_pic = "./wp:inline/a:graphic/a:graphicData/pic:pic"
        floating_pic = " | ./wp:anchor/a:graphic/a:graphicData/pic:pic"
        # -- xpath() returns a list, which is empty when there are no matches --
        matches = self._drawing.xpath(inline_pic + floating_pic)
        return True if matches else False

    @property
    def image(self) -> Image:
        """An `Image` proxy object for the image in this (picture) drawing.

        Raises `ValueError` when this drawing contains something other than a picture.
        Use `.has_picture` to qualify drawing objects before using this property.
        """
        embed_rIds = self._drawing.xpath(".//pic:blipFill/a:blip/@r:embed")
        if embed_rIds:
            # -- only the first embedded-image relationship id is used --
            return self.part.related_parts[embed_rIds[0]].image
        raise ValueError("drawing does not contain a picture")

View File

@@ -0,0 +1,150 @@
"""Base classes and other objects used by enumerations."""
from __future__ import annotations
import enum
import textwrap
from typing import TYPE_CHECKING, Any, Dict, Type, TypeVar
if TYPE_CHECKING:
from typing_extensions import Self
_T = TypeVar("_T", bound="BaseXmlEnum")
class BaseEnum(int, enum.Enum):
    """Base class for Enums that do not map XML attr values.

    Each member is declared as a `(ms_api_value, docstr)` pair. The member's value is the
    integer assigned to the corresponding member in the MS API enum of the same name.
    """

    def __new__(cls, ms_api_value: int, docstr: str):
        member = int.__new__(cls, ms_api_value)
        member._value_ = ms_api_value
        # -- each member carries its own description as its docstring --
        member.__doc__ = docstr.strip()
        return member

    def __str__(self):
        """The symbolic name and string value of this member, e.g. 'MIDDLE (3)'."""
        return "%s (%s)" % (self.name, self.value)
class BaseXmlEnum(int, enum.Enum):
    """Base class for Enums that also map XML attr values.

    Each member is declared as a `(ms_api_value, xml_value, docstr)` triple. The member's
    value is the integer assigned to the corresponding member in the MS API enum of the
    same name; `xml_value` is the string used for it in XML.
    """

    xml_value: str | None

    def __new__(cls, ms_api_value: int, xml_value: str | None, docstr: str):
        member = int.__new__(cls, ms_api_value)
        member._value_ = ms_api_value
        member.xml_value = xml_value
        member.__doc__ = docstr.strip()
        return member

    def __str__(self):
        """The symbolic name and string value of this member, e.g. 'MIDDLE (3)'."""
        return "%s (%s)" % (self.name, self.value)

    @classmethod
    def from_xml(cls, xml_value: str | None) -> Self:
        """Enumeration member corresponding to XML attribute value `xml_value`.

        The first member, in definition order, having a matching `xml_value` is returned.
        Raises |ValueError| when no member matches.

        Example::

            >>> WD_PARAGRAPH_ALIGNMENT.from_xml("center")
            WD_PARAGRAPH_ALIGNMENT.CENTER
        """
        for candidate in cls:
            if candidate.xml_value == xml_value:
                return candidate
        raise ValueError(f"{cls.__name__} has no XML mapping for '{xml_value}'")

    @classmethod
    def to_xml(cls: Type[_T], value: int | _T | None) -> str | None:
        """XML value of this enum member, generally an XML attribute value.

        `value` may be a member or its integer value. Raises |ValueError| when the member
        has an empty or |None| `xml_value`.
        """
        # -- presence of multi-arg `__new__()` method fools type-checker, but getting a
        # -- member by its value using EnumCls(val) works as usual.
        member = cls(value)
        xml_value = member.xml_value
        if xml_value:
            return xml_value
        raise ValueError(f"{cls.__name__}.{member.name} has no XML representation")
class DocsPageFormatter:
    """Generate an .rst doc page for an enumeration.

    Formats a RestructuredText documentation page (string) from the enumeration class
    name and class dict passed to the constructor. An immutable one-shot service object.
    """

    def __init__(self, clsname: str, clsdict: Dict[str, Any]):
        self._clsname = clsname
        self._clsdict = clsdict

    @property
    def page_str(self):
        """The RestructuredText documentation page for the enumeration.

        This is the only API member for the class.
        """
        # -- link target, title, intro, horizontal rule, then the member glossary --
        sections = [
            ".. _%s:" % self._ms_name,
            self._page_title,
            self._intro_text,
            "----",
            self._member_defs,
        ]
        return "\n\n".join(sections)

    @property
    def _intro_text(self):
        """Docstring of the enumeration, formatted for documentation page.

        The empty string when the class dict has no `__doc__` entry or that entry is None.
        """
        cls_docstring = self._clsdict.get("__doc__", "")
        if cls_docstring is None:
            return ""
        return textwrap.dedent(cls_docstring).strip()

    def _member_def(self, member: BaseEnum | BaseXmlEnum):
        """Return an individual member definition formatted as an RST glossary entry,
        wrapped to fit within 78 columns."""
        assert member.__doc__ is not None
        indent = " " * 4
        description = textwrap.fill(
            textwrap.dedent(member.__doc__).strip(),
            width=78,
            initial_indent=indent,
            subsequent_indent=indent,
        )
        return f"{member.name}\n{description}\n"

    @property
    def _member_defs(self):
        """A single string containing the aggregated member definitions section of the
        documentation page."""
        entries = []
        for member in self._clsdict["__members__"]:
            if member.name is None:
                continue
            entries.append(self._member_def(member))
        return "\n".join(entries)

    @property
    def _ms_name(self):
        """The Microsoft API name for this enumeration."""
        return self._clsdict["__ms_name__"]

    @property
    def _page_title(self):
        """The title for the documentation page, formatted as code (surrounded in
        double-backticks) and underlined with '=' characters."""
        # -- four extra characters account for the surrounding double-backticks --
        underline = "=" * (len(self._clsname) + 4)
        return f"``{self._clsname}``\n{underline}"

View File

@@ -0,0 +1,103 @@
"""Enumerations used by DrawingML objects."""
from .base import BaseEnum, BaseXmlEnum
class MSO_COLOR_TYPE(BaseEnum):
    """Specifies the color specification scheme.

    Each member is declared as an (integer value, description) pair.

    Example::

        from docx.enum.dml import MSO_COLOR_TYPE

        assert font.color.type == MSO_COLOR_TYPE.THEME

    MS API name: `MsoColorType`

    http://msdn.microsoft.com/en-us/library/office/ff864912(v=office.15).aspx
    """

    RGB = (1, "Color is specified by an |RGBColor| value.")
    """Color is specified by an |RGBColor| value."""

    THEME = (2, "Color is one of the preset theme colors.")
    """Color is one of the preset theme colors."""

    AUTO = (101, "Color is determined automatically by the application.")
    """Color is determined automatically by the application."""
class MSO_THEME_COLOR_INDEX(BaseXmlEnum):
    """Indicates the Office theme color, one of those shown in the color gallery on the
    formatting ribbon.

    Alias: ``MSO_THEME_COLOR``

    Example::

        from docx.enum.dml import MSO_THEME_COLOR

        font.color.theme_color = MSO_THEME_COLOR.ACCENT_1

    MS API name: `MsoThemeColorIndex`

    http://msdn.microsoft.com/en-us/library/office/ff860782(v=office.15).aspx
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `.base`.
    NOT_THEME_COLOR = (0, "UNMAPPED", "Indicates the color is not a theme color.")
    """Indicates the color is not a theme color."""
    ACCENT_1 = (5, "accent1", "Specifies the Accent 1 theme color.")
    """Specifies the Accent 1 theme color."""
    ACCENT_2 = (6, "accent2", "Specifies the Accent 2 theme color.")
    """Specifies the Accent 2 theme color."""
    ACCENT_3 = (7, "accent3", "Specifies the Accent 3 theme color.")
    """Specifies the Accent 3 theme color."""
    ACCENT_4 = (8, "accent4", "Specifies the Accent 4 theme color.")
    """Specifies the Accent 4 theme color."""
    ACCENT_5 = (9, "accent5", "Specifies the Accent 5 theme color.")
    """Specifies the Accent 5 theme color."""
    ACCENT_6 = (10, "accent6", "Specifies the Accent 6 theme color.")
    """Specifies the Accent 6 theme color."""
    BACKGROUND_1 = (14, "background1", "Specifies the Background 1 theme color.")
    """Specifies the Background 1 theme color."""
    BACKGROUND_2 = (16, "background2", "Specifies the Background 2 theme color.")
    """Specifies the Background 2 theme color."""
    DARK_1 = (1, "dark1", "Specifies the Dark 1 theme color.")
    """Specifies the Dark 1 theme color."""
    DARK_2 = (3, "dark2", "Specifies the Dark 2 theme color.")
    """Specifies the Dark 2 theme color."""
    FOLLOWED_HYPERLINK = (
        12,
        "followedHyperlink",
        "Specifies the theme color for a clicked hyperlink.",
    )
    """Specifies the theme color for a clicked hyperlink."""
    HYPERLINK = (11, "hyperlink", "Specifies the theme color for a hyperlink.")
    """Specifies the theme color for a hyperlink."""
    LIGHT_1 = (2, "light1", "Specifies the Light 1 theme color.")
    """Specifies the Light 1 theme color."""
    LIGHT_2 = (4, "light2", "Specifies the Light 2 theme color.")
    """Specifies the Light 2 theme color."""
    TEXT_1 = (13, "text1", "Specifies the Text 1 theme color.")
    """Specifies the Text 1 theme color."""
    TEXT_2 = (15, "text2", "Specifies the Text 2 theme color.")
    """Specifies the Text 2 theme color."""


# -- shorter alias; both names refer to the same enumeration class --
MSO_THEME_COLOR = MSO_THEME_COLOR_INDEX

View File

@@ -0,0 +1,86 @@
"""Enumerations related to the main document in WordprocessingML files."""
from .base import BaseXmlEnum
class WD_HEADER_FOOTER_INDEX(BaseXmlEnum):
    """Alias: **WD_HEADER_FOOTER**

    Specifies one of the three possible header/footer definitions for a section.

    For internal use only; not part of the python-docx API.

    MS API name: `WdHeaderFooterIndex`
    URL: https://docs.microsoft.com/en-us/office/vba/api/word.wdheaderfooterindex
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `.base`.
    PRIMARY = (1, "default", "Header for odd pages or all if no even header.")
    """Header for odd pages or all if no even header."""
    FIRST_PAGE = (2, "first", "Header for first page of section.")
    """Header for first page of section."""
    EVEN_PAGE = (3, "even", "Header for even pages of recto/verso section.")
    """Header for even pages of recto/verso section."""


# -- shorter alias; both names refer to the same enumeration class --
WD_HEADER_FOOTER = WD_HEADER_FOOTER_INDEX
class WD_ORIENTATION(BaseXmlEnum):
    """Alias: **WD_ORIENT**

    Specifies the page layout orientation.

    Example::

        from docx.enum.section import WD_ORIENT

        section = document.sections[-1]
        section.orientation = WD_ORIENT.LANDSCAPE

    MS API name: `WdOrientation`
    MS API URL: http://msdn.microsoft.com/en-us/library/office/ff837902.aspx
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `.base`.
    PORTRAIT = (0, "portrait", "Portrait orientation.")
    """Portrait orientation."""
    LANDSCAPE = (1, "landscape", "Landscape orientation.")
    """Landscape orientation."""


# -- shorter alias; both names refer to the same enumeration class --
WD_ORIENT = WD_ORIENTATION
class WD_SECTION_START(BaseXmlEnum):
    """Alias: **WD_SECTION**

    Specifies the start type of a section break.

    Example::

        from docx.enum.section import WD_SECTION

        section = document.sections[0]
        section.start_type = WD_SECTION.NEW_PAGE

    MS API name: `WdSectionStart`
    MS API URL: http://msdn.microsoft.com/en-us/library/office/ff840975.aspx
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `.base`.
    CONTINUOUS = (0, "continuous", "Continuous section break.")
    """Continuous section break."""
    NEW_COLUMN = (1, "nextColumn", "New column section break.")
    """New column section break."""
    NEW_PAGE = (2, "nextPage", "New page section break.")
    """New page section break."""
    EVEN_PAGE = (3, "evenPage", "Even pages section break.")
    """Even pages section break."""
    ODD_PAGE = (4, "oddPage", "Section begins on next odd page.")
    """Section begins on next odd page."""


# -- shorter alias; both names refer to the same enumeration class --
WD_SECTION = WD_SECTION_START

View File

@@ -0,0 +1,19 @@
"""Enumerations related to DrawingML shapes in WordprocessingML files."""
import enum
class WD_INLINE_SHAPE_TYPE(enum.Enum):
    """Corresponds to WdInlineShapeType enumeration.

    Alias: **WD_INLINE_SHAPE**

    http://msdn.microsoft.com/en-us/library/office/ff192587.aspx.
    """

    CHART = 12
    LINKED_PICTURE = 4
    PICTURE = 3
    SMART_ART = 15
    # -- the only negative value in this enumeration --
    NOT_IMPLEMENTED = -6


# -- shorter alias; both names refer to the same enumeration class --
WD_INLINE_SHAPE = WD_INLINE_SHAPE_TYPE

View File

@@ -0,0 +1,452 @@
"""Enumerations related to styles."""
from .base import BaseEnum, BaseXmlEnum
class WD_BUILTIN_STYLE(BaseEnum):
    """Alias: **WD_STYLE**

    Specifies a built-in Microsoft Word style.

    Example::

        from docx import Document
        from docx.enum.style import WD_STYLE

        document = Document()
        styles = document.styles
        style = styles[WD_STYLE.BODY_TEXT]

    MS API name: `WdBuiltinStyle`

    http://msdn.microsoft.com/en-us/library/office/ff835210.aspx
    """

    # NOTE(review): each member tuple looks like (value, description); confirm
    # against `BaseEnum` in `.base`. All values in this enumeration are negative.
    BLOCK_QUOTATION = (-85, "Block Text.")
    """Block Text."""
    BODY_TEXT = (-67, "Body Text.")
    """Body Text."""
    BODY_TEXT_2 = (-81, "Body Text 2.")
    """Body Text 2."""
    BODY_TEXT_3 = (-82, "Body Text 3.")
    """Body Text 3."""
    BODY_TEXT_FIRST_INDENT = (-78, "Body Text First Indent.")
    """Body Text First Indent."""
    BODY_TEXT_FIRST_INDENT_2 = (-79, "Body Text First Indent 2.")
    """Body Text First Indent 2."""
    BODY_TEXT_INDENT = (-68, "Body Text Indent.")
    """Body Text Indent."""
    BODY_TEXT_INDENT_2 = (-83, "Body Text Indent 2.")
    """Body Text Indent 2."""
    BODY_TEXT_INDENT_3 = (-84, "Body Text Indent 3.")
    """Body Text Indent 3."""
    BOOK_TITLE = (-265, "Book Title.")
    """Book Title."""
    CAPTION = (-35, "Caption.")
    """Caption."""
    CLOSING = (-64, "Closing.")
    """Closing."""
    COMMENT_REFERENCE = (-40, "Comment Reference.")
    """Comment Reference."""
    COMMENT_TEXT = (-31, "Comment Text.")
    """Comment Text."""
    DATE = (-77, "Date.")
    """Date."""
    DEFAULT_PARAGRAPH_FONT = (-66, "Default Paragraph Font.")
    """Default Paragraph Font."""
    EMPHASIS = (-89, "Emphasis.")
    """Emphasis."""
    ENDNOTE_REFERENCE = (-43, "Endnote Reference.")
    """Endnote Reference."""
    ENDNOTE_TEXT = (-44, "Endnote Text.")
    """Endnote Text."""
    ENVELOPE_ADDRESS = (-37, "Envelope Address.")
    """Envelope Address."""
    ENVELOPE_RETURN = (-38, "Envelope Return.")
    """Envelope Return."""
    FOOTER = (-33, "Footer.")
    """Footer."""
    FOOTNOTE_REFERENCE = (-39, "Footnote Reference.")
    """Footnote Reference."""
    FOOTNOTE_TEXT = (-30, "Footnote Text.")
    """Footnote Text."""
    HEADER = (-32, "Header.")
    """Header."""
    HEADING_1 = (-2, "Heading 1.")
    """Heading 1."""
    HEADING_2 = (-3, "Heading 2.")
    """Heading 2."""
    HEADING_3 = (-4, "Heading 3.")
    """Heading 3."""
    HEADING_4 = (-5, "Heading 4.")
    """Heading 4."""
    HEADING_5 = (-6, "Heading 5.")
    """Heading 5."""
    HEADING_6 = (-7, "Heading 6.")
    """Heading 6."""
    HEADING_7 = (-8, "Heading 7.")
    """Heading 7."""
    HEADING_8 = (-9, "Heading 8.")
    """Heading 8."""
    HEADING_9 = (-10, "Heading 9.")
    """Heading 9."""
    HTML_ACRONYM = (-96, "HTML Acronym.")
    """HTML Acronym."""
    HTML_ADDRESS = (-97, "HTML Address.")
    """HTML Address."""
    HTML_CITE = (-98, "HTML Cite.")
    """HTML Cite."""
    HTML_CODE = (-99, "HTML Code.")
    """HTML Code."""
    HTML_DFN = (-100, "HTML Definition.")
    """HTML Definition."""
    HTML_KBD = (-101, "HTML Keyboard.")
    """HTML Keyboard."""
    HTML_NORMAL = (-95, "Normal (Web).")
    """Normal (Web)."""
    HTML_PRE = (-102, "HTML Preformatted.")
    """HTML Preformatted."""
    HTML_SAMP = (-103, "HTML Sample.")
    """HTML Sample."""
    HTML_TT = (-104, "HTML Typewriter.")
    """HTML Typewriter."""
    HTML_VAR = (-105, "HTML Variable.")
    """HTML Variable."""
    HYPERLINK = (-86, "Hyperlink.")
    """Hyperlink."""
    HYPERLINK_FOLLOWED = (-87, "Followed Hyperlink.")
    """Followed Hyperlink."""
    INDEX_1 = (-11, "Index 1.")
    """Index 1."""
    INDEX_2 = (-12, "Index 2.")
    """Index 2."""
    INDEX_3 = (-13, "Index 3.")
    """Index 3."""
    INDEX_4 = (-14, "Index 4.")
    """Index 4."""
    INDEX_5 = (-15, "Index 5.")
    """Index 5."""
    INDEX_6 = (-16, "Index 6.")
    """Index 6."""
    INDEX_7 = (-17, "Index 7.")
    """Index 7."""
    INDEX_8 = (-18, "Index 8.")
    """Index 8."""
    INDEX_9 = (-19, "Index 9.")
    """Index 9."""
    INDEX_HEADING = (-34, "Index Heading")
    """Index Heading"""
    INTENSE_EMPHASIS = (-262, "Intense Emphasis.")
    """Intense Emphasis."""
    INTENSE_QUOTE = (-182, "Intense Quote.")
    """Intense Quote."""
    INTENSE_REFERENCE = (-264, "Intense Reference.")
    """Intense Reference."""
    LINE_NUMBER = (-41, "Line Number.")
    """Line Number."""
    LIST = (-48, "List.")
    """List."""
    LIST_2 = (-51, "List 2.")
    """List 2."""
    LIST_3 = (-52, "List 3.")
    """List 3."""
    LIST_4 = (-53, "List 4.")
    """List 4."""
    LIST_5 = (-54, "List 5.")
    """List 5."""
    LIST_BULLET = (-49, "List Bullet.")
    """List Bullet."""
    LIST_BULLET_2 = (-55, "List Bullet 2.")
    """List Bullet 2."""
    LIST_BULLET_3 = (-56, "List Bullet 3.")
    """List Bullet 3."""
    LIST_BULLET_4 = (-57, "List Bullet 4.")
    """List Bullet 4."""
    LIST_BULLET_5 = (-58, "List Bullet 5.")
    """List Bullet 5."""
    LIST_CONTINUE = (-69, "List Continue.")
    """List Continue."""
    LIST_CONTINUE_2 = (-70, "List Continue 2.")
    """List Continue 2."""
    LIST_CONTINUE_3 = (-71, "List Continue 3.")
    """List Continue 3."""
    LIST_CONTINUE_4 = (-72, "List Continue 4.")
    """List Continue 4."""
    LIST_CONTINUE_5 = (-73, "List Continue 5.")
    """List Continue 5."""
    LIST_NUMBER = (-50, "List Number.")
    """List Number."""
    LIST_NUMBER_2 = (-59, "List Number 2.")
    """List Number 2."""
    LIST_NUMBER_3 = (-60, "List Number 3.")
    """List Number 3."""
    LIST_NUMBER_4 = (-61, "List Number 4.")
    """List Number 4."""
    LIST_NUMBER_5 = (-62, "List Number 5.")
    """List Number 5."""
    LIST_PARAGRAPH = (-180, "List Paragraph.")
    """List Paragraph."""
    MACRO_TEXT = (-46, "Macro Text.")
    """Macro Text."""
    MESSAGE_HEADER = (-74, "Message Header.")
    """Message Header."""
    NAV_PANE = (-90, "Document Map.")
    """Document Map."""
    NORMAL = (-1, "Normal.")
    """Normal."""
    NORMAL_INDENT = (-29, "Normal Indent.")
    """Normal Indent."""
    NORMAL_OBJECT = (-158, "Normal (applied to an object).")
    """Normal (applied to an object)."""
    NORMAL_TABLE = (-106, "Normal (applied within a table).")
    """Normal (applied within a table)."""
    NOTE_HEADING = (-80, "Note Heading.")
    """Note Heading."""
    PAGE_NUMBER = (-42, "Page Number.")
    """Page Number."""
    PLAIN_TEXT = (-91, "Plain Text.")
    """Plain Text."""
    QUOTE = (-181, "Quote.")
    """Quote."""
    SALUTATION = (-76, "Salutation.")
    """Salutation."""
    SIGNATURE = (-65, "Signature.")
    """Signature."""
    STRONG = (-88, "Strong.")
    """Strong."""
    SUBTITLE = (-75, "Subtitle.")
    """Subtitle."""
    SUBTLE_EMPHASIS = (-261, "Subtle Emphasis.")
    """Subtle Emphasis."""
    SUBTLE_REFERENCE = (-263, "Subtle Reference.")
    """Subtle Reference."""
    TABLE_COLORFUL_GRID = (-172, "Colorful Grid.")
    """Colorful Grid."""
    TABLE_COLORFUL_LIST = (-171, "Colorful List.")
    """Colorful List."""
    TABLE_COLORFUL_SHADING = (-170, "Colorful Shading.")
    """Colorful Shading."""
    TABLE_DARK_LIST = (-169, "Dark List.")
    """Dark List."""
    TABLE_LIGHT_GRID = (-161, "Light Grid.")
    """Light Grid."""
    TABLE_LIGHT_GRID_ACCENT_1 = (-175, "Light Grid Accent 1.")
    """Light Grid Accent 1."""
    TABLE_LIGHT_LIST = (-160, "Light List.")
    """Light List."""
    TABLE_LIGHT_LIST_ACCENT_1 = (-174, "Light List Accent 1.")
    """Light List Accent 1."""
    TABLE_LIGHT_SHADING = (-159, "Light Shading.")
    """Light Shading."""
    TABLE_LIGHT_SHADING_ACCENT_1 = (-173, "Light Shading Accent 1.")
    """Light Shading Accent 1."""
    TABLE_MEDIUM_GRID_1 = (-166, "Medium Grid 1.")
    """Medium Grid 1."""
    TABLE_MEDIUM_GRID_2 = (-167, "Medium Grid 2.")
    """Medium Grid 2."""
    TABLE_MEDIUM_GRID_3 = (-168, "Medium Grid 3.")
    """Medium Grid 3."""
    TABLE_MEDIUM_LIST_1 = (-164, "Medium List 1.")
    """Medium List 1."""
    TABLE_MEDIUM_LIST_1_ACCENT_1 = (-178, "Medium List 1 Accent 1.")
    """Medium List 1 Accent 1."""
    TABLE_MEDIUM_LIST_2 = (-165, "Medium List 2.")
    """Medium List 2."""
    TABLE_MEDIUM_SHADING_1 = (-162, "Medium Shading 1.")
    """Medium Shading 1."""
    TABLE_MEDIUM_SHADING_1_ACCENT_1 = (-176, "Medium Shading 1 Accent 1.")
    """Medium Shading 1 Accent 1."""
    TABLE_MEDIUM_SHADING_2 = (-163, "Medium Shading 2.")
    """Medium Shading 2."""
    TABLE_MEDIUM_SHADING_2_ACCENT_1 = (-177, "Medium Shading 2 Accent 1.")
    """Medium Shading 2 Accent 1."""
    TABLE_OF_AUTHORITIES = (-45, "Table of Authorities.")
    """Table of Authorities."""
    TABLE_OF_FIGURES = (-36, "Table of Figures.")
    """Table of Figures."""
    TITLE = (-63, "Title.")
    """Title."""
    TOAHEADING = (-47, "TOA Heading.")
    """TOA Heading."""
    TOC_1 = (-20, "TOC 1.")
    """TOC 1."""
    TOC_2 = (-21, "TOC 2.")
    """TOC 2."""
    TOC_3 = (-22, "TOC 3.")
    """TOC 3."""
    TOC_4 = (-23, "TOC 4.")
    """TOC 4."""
    TOC_5 = (-24, "TOC 5.")
    """TOC 5."""
    TOC_6 = (-25, "TOC 6.")
    """TOC 6."""
    TOC_7 = (-26, "TOC 7.")
    """TOC 7."""
    TOC_8 = (-27, "TOC 8.")
    """TOC 8."""
    TOC_9 = (-28, "TOC 9.")
    """TOC 9."""


# -- shorter alias; both names refer to the same enumeration class --
WD_STYLE = WD_BUILTIN_STYLE
class WD_STYLE_TYPE(BaseXmlEnum):
    """Specifies one of the four style types: paragraph, character, list, or table.

    Example::

        from docx import Document
        from docx.enum.style import WD_STYLE_TYPE

        styles = Document().styles
        assert styles[0].type == WD_STYLE_TYPE.PARAGRAPH

    MS API name: `WdStyleType`

    http://msdn.microsoft.com/en-us/library/office/ff196870.aspx
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `.base`. Note that `LIST` is
    # paired with the XML value "numbering" rather than "list".
    CHARACTER = (2, "character", "Character style.")
    """Character style."""
    LIST = (4, "numbering", "List style.")
    """List style."""
    PARAGRAPH = (1, "paragraph", "Paragraph style.")
    """Paragraph style."""
    TABLE = (3, "table", "Table style.")
    """Table style."""

View File

@@ -0,0 +1,136 @@
"""Enumerations related to tables in WordprocessingML files."""
from docx.enum.base import BaseEnum, BaseXmlEnum
class WD_CELL_VERTICAL_ALIGNMENT(BaseXmlEnum):
    """Alias: **WD_ALIGN_VERTICAL**

    Specifies the vertical alignment of text in one or more cells of a table.

    Example::

        from docx.enum.table import WD_ALIGN_VERTICAL

        table = document.add_table(3, 3)
        table.cell(0, 0).vertical_alignment = WD_ALIGN_VERTICAL.BOTTOM

    MS API name: `WdCellVerticalAlignment`

    https://msdn.microsoft.com/en-us/library/office/ff193345.aspx
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `.base`.
    TOP = (0, "top", "Text is aligned to the top border of the cell.")
    """Text is aligned to the top border of the cell."""
    CENTER = (1, "center", "Text is aligned to the center of the cell.")
    """Text is aligned to the center of the cell."""
    BOTTOM = (3, "bottom", "Text is aligned to the bottom border of the cell.")
    """Text is aligned to the bottom border of the cell."""
    BOTH = (
        101,
        "both",
        "This is an option in the OpenXml spec, but not in Word itself. It's not"
        " clear what Word behavior this setting produces. If you find out please"
        " let us know and we'll update this documentation. Otherwise, probably best"
        " to avoid this option.",
    )
    """This is an option in the OpenXml spec, but not in Word itself.
    It's not clear what Word behavior this setting produces. If you find out please let
    us know and we'll update this documentation. Otherwise, probably best to avoid this
    option.
    """


# -- shorter alias; both names refer to the same enumeration class --
WD_ALIGN_VERTICAL = WD_CELL_VERTICAL_ALIGNMENT
class WD_ROW_HEIGHT_RULE(BaseXmlEnum):
    """Alias: **WD_ROW_HEIGHT**

    Specifies the rule for determining the height of a table row.

    Example::

        from docx.enum.table import WD_ROW_HEIGHT_RULE

        table = document.add_table(3, 3)
        table.rows[0].height_rule = WD_ROW_HEIGHT_RULE.EXACTLY

    MS API name: `WdRowHeightRule`

    https://msdn.microsoft.com/en-us/library/office/ff193620.aspx
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `.base`.
    AUTO = (
        0,
        "auto",
        "The row height is adjusted to accommodate the tallest value in the row.",
    )
    """The row height is adjusted to accommodate the tallest value in the row."""
    AT_LEAST = (1, "atLeast", "The row height is at least a minimum specified value.")
    """The row height is at least a minimum specified value."""
    EXACTLY = (2, "exact", "The row height is an exact value.")
    """The row height is an exact value."""


# -- shorter alias; both names refer to the same enumeration class --
WD_ROW_HEIGHT = WD_ROW_HEIGHT_RULE
class WD_TABLE_ALIGNMENT(BaseXmlEnum):
    """Specifies table justification type.

    Example::

        from docx.enum.table import WD_TABLE_ALIGNMENT

        table = document.add_table(3, 3)
        table.alignment = WD_TABLE_ALIGNMENT.CENTER

    MS API name: `WdRowAlignment`

    http://office.microsoft.com/en-us/word-help/HV080607259.aspx
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `.base`.
    LEFT = (0, "left", "Left-aligned")
    """Left-aligned"""
    CENTER = (1, "center", "Center-aligned.")
    """Center-aligned."""
    RIGHT = (2, "right", "Right-aligned.")
    """Right-aligned."""
class WD_TABLE_DIRECTION(BaseEnum):
    """Specifies the direction in which an application orders cells in the specified
    table or row.

    Example::

        from docx.enum.table import WD_TABLE_DIRECTION

        table = document.add_table(3, 3)
        table.direction = WD_TABLE_DIRECTION.RTL

    MS API name: `WdTableDirection`

    http://msdn.microsoft.com/en-us/library/ff835141.aspx
    """

    # NOTE(review): each member tuple looks like (value, description); confirm
    # against `BaseEnum` in `docx.enum.base`.
    LTR = (
        0,
        "The table or row is arranged with the first column in the leftmost position.",
    )
    """The table or row is arranged with the first column in the leftmost position."""
    RTL = (
        1,
        "The table or row is arranged with the first column in the rightmost position.",
    )
    """The table or row is arranged with the first column in the rightmost position."""

View File

@@ -0,0 +1,367 @@
"""Enumerations related to text in WordprocessingML files."""
from __future__ import annotations
import enum
from docx.enum.base import BaseXmlEnum
class WD_PARAGRAPH_ALIGNMENT(BaseXmlEnum):
    """Alias: **WD_ALIGN_PARAGRAPH**

    Specifies paragraph justification type.

    Example::

        from docx.enum.text import WD_ALIGN_PARAGRAPH

        paragraph = document.add_paragraph()
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `docx.enum.base`. The values
    # skip 6.
    LEFT = (0, "left", "Left-aligned")
    """Left-aligned"""
    CENTER = (1, "center", "Center-aligned.")
    """Center-aligned."""
    RIGHT = (2, "right", "Right-aligned.")
    """Right-aligned."""
    JUSTIFY = (3, "both", "Fully justified.")
    """Fully justified."""
    DISTRIBUTE = (
        4,
        "distribute",
        "Paragraph characters are distributed to fill entire width of paragraph.",
    )
    """Paragraph characters are distributed to fill entire width of paragraph."""
    JUSTIFY_MED = (
        5,
        "mediumKashida",
        "Justified with a medium character compression ratio.",
    )
    """Justified with a medium character compression ratio."""
    JUSTIFY_HI = (
        7,
        "highKashida",
        "Justified with a high character compression ratio.",
    )
    """Justified with a high character compression ratio."""
    JUSTIFY_LOW = (8, "lowKashida", "Justified with a low character compression ratio.")
    """Justified with a low character compression ratio."""
    THAI_JUSTIFY = (
        9,
        "thaiDistribute",
        "Justified according to Thai formatting layout.",
    )
    """Justified according to Thai formatting layout."""


# -- shorter alias; both names refer to the same enumeration class --
WD_ALIGN_PARAGRAPH = WD_PARAGRAPH_ALIGNMENT
class WD_BREAK_TYPE(enum.Enum):
    """Corresponds to WdBreakType enumeration.

    Alias: **WD_BREAK**

    http://msdn.microsoft.com/en-us/library/office/ff195905.aspx.
    """

    COLUMN = 8
    LINE = 6
    LINE_CLEAR_LEFT = 9
    LINE_CLEAR_RIGHT = 10
    LINE_CLEAR_ALL = 11  # -- added for consistency, not in MS version --
    PAGE = 7
    SECTION_CONTINUOUS = 3
    SECTION_EVEN_PAGE = 4
    SECTION_NEXT_PAGE = 2
    SECTION_ODD_PAGE = 5
    # -- same value as LINE_CLEAR_ALL; in a standard `enum.Enum` a repeated value
    # -- makes the later name an alias of the first, so `TEXT_WRAPPING` resolves
    # -- to the `LINE_CLEAR_ALL` member and is skipped when iterating the enum
    TEXT_WRAPPING = 11


# -- shorter alias; both names refer to the same enumeration class --
WD_BREAK = WD_BREAK_TYPE
class WD_COLOR_INDEX(BaseXmlEnum):
    """Specifies a standard preset color to apply.

    Alias: **WD_COLOR**

    Used for font highlighting and perhaps other applications.

    * MS API name: `WdColorIndex`
    * URL: https://msdn.microsoft.com/EN-US/library/office/ff195343.aspx
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `docx.enum.base`. `INHERITED`
    # is the only member whose second element is None.
    INHERITED = (-1, None, "Color is inherited from the style hierarchy.")
    """Color is inherited from the style hierarchy."""
    AUTO = (0, "default", "Automatic color. Default; usually black.")
    """Automatic color. Default; usually black."""
    BLACK = (1, "black", "Black color.")
    """Black color."""
    BLUE = (2, "blue", "Blue color")
    """Blue color"""
    BRIGHT_GREEN = (4, "green", "Bright green color.")
    """Bright green color."""
    DARK_BLUE = (9, "darkBlue", "Dark blue color.")
    """Dark blue color."""
    DARK_RED = (13, "darkRed", "Dark red color.")
    """Dark red color."""
    DARK_YELLOW = (14, "darkYellow", "Dark yellow color.")
    """Dark yellow color."""
    GRAY_25 = (16, "lightGray", "25% shade of gray color.")
    """25% shade of gray color."""
    GRAY_50 = (15, "darkGray", "50% shade of gray color.")
    """50% shade of gray color."""
    GREEN = (11, "darkGreen", "Green color.")
    """Green color."""
    PINK = (5, "magenta", "Pink color.")
    """Pink color."""
    RED = (6, "red", "Red color.")
    """Red color."""
    TEAL = (10, "darkCyan", "Teal color.")
    """Teal color."""
    TURQUOISE = (3, "cyan", "Turquoise color.")
    """Turquoise color."""
    VIOLET = (12, "darkMagenta", "Violet color.")
    """Violet color."""
    WHITE = (8, "white", "White color.")
    """White color."""
    YELLOW = (7, "yellow", "Yellow color.")
    """Yellow color."""


# -- shorter alias; both names refer to the same enumeration class --
WD_COLOR = WD_COLOR_INDEX
class WD_LINE_SPACING(BaseXmlEnum):
    """Specifies a line spacing format to be applied to a paragraph.

    Example::

        from docx.enum.text import WD_LINE_SPACING

        paragraph = document.add_paragraph()
        paragraph.line_spacing_rule = WD_LINE_SPACING.EXACTLY

    MS API name: `WdLineSpacing`
    URL: http://msdn.microsoft.com/en-us/library/office/ff844910.aspx
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `docx.enum.base`. The first
    # three members carry the placeholder "UNMAPPED" in the XML-value position.
    SINGLE = (0, "UNMAPPED", "Single spaced (default).")
    """Single spaced (default)."""
    ONE_POINT_FIVE = (1, "UNMAPPED", "Space-and-a-half line spacing.")
    """Space-and-a-half line spacing."""
    DOUBLE = (2, "UNMAPPED", "Double spaced.")
    """Double spaced."""
    AT_LEAST = (
        3,
        "atLeast",
        "Minimum line spacing is specified amount. Amount is specified separately.",
    )
    """Minimum line spacing is specified amount. Amount is specified separately."""
    EXACTLY = (
        4,
        "exact",
        "Line spacing is exactly specified amount. Amount is specified separately.",
    )
    """Line spacing is exactly specified amount. Amount is specified separately."""
    MULTIPLE = (
        5,
        "auto",
        "Line spacing is specified as multiple of line heights. Changing font size"
        " will change line spacing proportionately.",
    )
    """Line spacing is specified as multiple of line heights. Changing font size will
    change the line spacing proportionately."""
class WD_TAB_ALIGNMENT(BaseXmlEnum):
    """Specifies the tab stop alignment to apply.

    MS API name: `WdTabAlignment`
    URL: https://msdn.microsoft.com/EN-US/library/office/ff195609.aspx
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `docx.enum.base`. Values 0-6
    # (5 is unused) are followed by a separate run starting at 101.
    LEFT = (0, "left", "Left-aligned.")
    """Left-aligned."""
    CENTER = (1, "center", "Center-aligned.")
    """Center-aligned."""
    RIGHT = (2, "right", "Right-aligned.")
    """Right-aligned."""
    DECIMAL = (3, "decimal", "Decimal-aligned.")
    """Decimal-aligned."""
    BAR = (4, "bar", "Bar-aligned.")
    """Bar-aligned."""
    LIST = (6, "list", "List-aligned. (deprecated)")
    """List-aligned. (deprecated)"""
    CLEAR = (101, "clear", "Clear an inherited tab stop.")
    """Clear an inherited tab stop."""
    END = (102, "end", "Right-aligned. (deprecated)")
    """Right-aligned. (deprecated)"""
    NUM = (103, "num", "Left-aligned. (deprecated)")
    """Left-aligned. (deprecated)"""
    START = (104, "start", "Left-aligned. (deprecated)")
    """Left-aligned. (deprecated)"""
class WD_TAB_LEADER(BaseXmlEnum):
    """Specifies the character to use as the leader with formatted tabs.

    MS API name: `WdTabLeader`
    URL: https://msdn.microsoft.com/en-us/library/office/ff845050.aspx
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `docx.enum.base`.
    SPACES = (0, "none", "Spaces. Default.")
    """Spaces. Default."""
    DOTS = (1, "dot", "Dots.")
    """Dots."""
    DASHES = (2, "hyphen", "Dashes.")
    """Dashes."""
    # NOTE(review): description says "Double lines." while the XML value is
    # "underscore" -- confirm which wording is intended.
    LINES = (3, "underscore", "Double lines.")
    """Double lines."""
    HEAVY = (4, "heavy", "A heavy line.")
    """A heavy line."""
    MIDDLE_DOT = (5, "middleDot", "A vertically-centered dot.")
    """A vertically-centered dot."""
class WD_UNDERLINE(BaseXmlEnum):
    """Specifies the style of underline applied to a run of characters.

    MS API name: `WdUnderline`
    URL: http://msdn.microsoft.com/en-us/library/office/ff822388.aspx
    """

    # NOTE(review): each member tuple looks like (value, XML attribute value,
    # description); confirm against `BaseXmlEnum` in `docx.enum.base`. `INHERITED`
    # is the only member whose second element is None.
    INHERITED = (-1, None, "Inherit underline setting from containing paragraph.")
    """Inherit underline setting from containing paragraph."""
    NONE = (
        0,
        "none",
        "No underline.\n\nThis setting overrides any inherited underline value, so can"
        " be used to remove underline from a run that inherits underlining from its"
        " containing paragraph. Note this is not the same as assigning |None| to"
        " Run.underline. |None| is a valid assignment value, but causes the run to"
        " inherit its underline value. Assigning `WD_UNDERLINE.NONE` causes"
        " underlining to be unconditionally turned off.",
    )
    """No underline.
    This setting overrides any inherited underline value, so can be used to remove
    underline from a run that inherits underlining from its containing paragraph. Note
    this is not the same as assigning |None| to Run.underline. |None| is a valid
    assignment value, but causes the run to inherit its underline value. Assigning
    ``WD_UNDERLINE.NONE`` causes underlining to be unconditionally turned off.
    """
    SINGLE = (
        1,
        "single",
        "A single line.\n\nNote that this setting is write-only in the sense that"
        " |True| (rather than `WD_UNDERLINE.SINGLE`) is returned for a run having"
        " this setting.",
    )
    """A single line.
    Note that this setting is write-only in the sense that |True|
    (rather than ``WD_UNDERLINE.SINGLE``) is returned for a run having this setting.
    """
    WORDS = (2, "words", "Underline individual words only.")
    """Underline individual words only."""
    DOUBLE = (3, "double", "A double line.")
    """A double line."""
    DOTTED = (4, "dotted", "Dots.")
    """Dots."""
    THICK = (6, "thick", "A single thick line.")
    """A single thick line."""
    DASH = (7, "dash", "Dashes.")
    """Dashes."""
    DOT_DASH = (9, "dotDash", "Alternating dots and dashes.")
    """Alternating dots and dashes."""
    DOT_DOT_DASH = (10, "dotDotDash", "An alternating dot-dot-dash pattern.")
    """An alternating dot-dot-dash pattern."""
    WAVY = (11, "wave", "A single wavy line.")
    """A single wavy line."""
    DOTTED_HEAVY = (20, "dottedHeavy", "Heavy dots.")
    """Heavy dots."""
    DASH_HEAVY = (23, "dashedHeavy", "Heavy dashes.")
    """Heavy dashes."""
    DOT_DASH_HEAVY = (25, "dashDotHeavy", "Alternating heavy dots and heavy dashes.")
    """Alternating heavy dots and heavy dashes."""
    DOT_DOT_DASH_HEAVY = (
        26,
        "dashDotDotHeavy",
        "An alternating heavy dot-dot-dash pattern.",
    )
    """An alternating heavy dot-dot-dash pattern."""
    WAVY_HEAVY = (27, "wavyHeavy", "A heavy wavy line.")
    """A heavy wavy line."""
    DASH_LONG = (39, "dashLong", "Long dashes.")
    """Long dashes."""
    WAVY_DOUBLE = (43, "wavyDouble", "A double wavy line.")
    """A double wavy line."""
    DASH_LONG_HEAVY = (55, "dashLongHeavy", "Long heavy dashes.")
    """Long heavy dashes."""

View File

@@ -0,0 +1,18 @@
"""Exceptions used with python-docx.
The base exception class is PythonDocxError.
"""
class PythonDocxError(Exception):
    """Base class for the exceptions defined in this module."""
class InvalidSpanError(PythonDocxError):
    """Raised when a request to merge table cells specifies an invalid merge
    region."""
class InvalidXmlError(PythonDocxError):
    """Raised when invalid XML is encountered, for example when a required child
    element is accessed but is not present."""

View File

@@ -0,0 +1,23 @@
"""Provides objects that can characterize image streams.
That characterization is as to content type and size, as a required step in including
them in a document.
"""
from docx.image.bmp import Bmp
from docx.image.gif import Gif
from docx.image.jpeg import Exif, Jfif
from docx.image.png import Png
from docx.image.tiff import Tiff
# Table of (header-parser class, byte offset, signature bytes) triples. The two
# JPEG variants (Jfif, Exif) carry their marker text at offset 6; every other
# entry is anchored at offset 0.
# NOTE(review): presumably consulted to recognize an image stream by its leading
# bytes -- confirm against the code that reads SIGNATURES.
SIGNATURES = (
    # class, offset, signature_bytes
    (Png, 0, b"\x89PNG\x0d\x0a\x1a\x0a"),
    (Jfif, 6, b"JFIF"),
    (Exif, 6, b"Exif"),
    (Gif, 0, b"GIF87a"),
    (Gif, 0, b"GIF89a"),
    (Tiff, 0, b"MM\x00*"),  # big-endian (Motorola) TIFF
    (Tiff, 0, b"II*\x00"),  # little-endian (Intel) TIFF
    (Bmp, 0, b"BM"),
)

View File

@@ -0,0 +1,43 @@
from .constants import MIME_TYPE
from .helpers import LITTLE_ENDIAN, StreamReader
from .image import BaseImageHeader
class Bmp(BaseImageHeader):
    """Header parser for BMP image streams."""

    @classmethod
    def from_stream(cls, stream):
        """Return a |Bmp| instance built from the BMP image in `stream`.

        Width, height and the horizontal and vertical pixels-per-meter values are
        read little-endian from offsets 0x12, 0x16, 0x26 and 0x2A respectively;
        the two pixels-per-meter values are converted to DPI.
        """
        reader = StreamReader(stream, LITTLE_ENDIAN)
        # -- reads happen in the listed offset order --
        width, height, horz_ppm, vert_ppm = [
            reader.read_long(offset) for offset in (0x12, 0x16, 0x26, 0x2A)
        ]
        return cls(width, height, cls._dpi(horz_ppm), cls._dpi(vert_ppm))

    @property
    def content_type(self):
        """MIME content type for this image; always `image/bmp` for BMP images."""
        return MIME_TYPE.BMP

    @property
    def default_ext(self):
        """Default filename extension; always 'bmp' for BMP images."""
        return "bmp"

    @staticmethod
    def _dpi(px_per_meter):
        """Return `px_per_meter` converted to integer pixels per inch.

        A value of zero is treated as "not specified" and yields 96.
        """
        # -- 0.0254 meters per inch --
        return 96 if px_per_meter == 0 else int(round(px_per_meter * 0.0254))

View File

@@ -0,0 +1,172 @@
"""Constants specific to the image sub-package."""
class JPEG_MARKER_CODE:
    """JPEG marker codes.

    Each code is a single-byte `bytes` value. The class also provides two groupings
    of those codes (`STANDALONE_MARKERS` and `SOF_MARKER_CODES`) and a partial
    code-to-name mapping (`marker_names`).
    """

    TEM = b"\x01"
    DHT = b"\xc4"
    DAC = b"\xcc"
    JPG = b"\xc8"

    SOF0 = b"\xc0"
    SOF1 = b"\xc1"
    SOF2 = b"\xc2"
    SOF3 = b"\xc3"
    SOF5 = b"\xc5"
    SOF6 = b"\xc6"
    SOF7 = b"\xc7"
    SOF9 = b"\xc9"
    SOFA = b"\xca"
    SOFB = b"\xcb"
    SOFD = b"\xcd"
    SOFE = b"\xce"
    SOFF = b"\xcf"

    RST0 = b"\xd0"
    RST1 = b"\xd1"
    RST2 = b"\xd2"
    RST3 = b"\xd3"
    RST4 = b"\xd4"
    RST5 = b"\xd5"
    RST6 = b"\xd6"
    RST7 = b"\xd7"

    SOI = b"\xd8"
    EOI = b"\xd9"
    SOS = b"\xda"
    DQT = b"\xdb"  # Define Quantization Table(s)
    DNL = b"\xdc"
    DRI = b"\xdd"
    DHP = b"\xde"
    EXP = b"\xdf"

    APP0 = b"\xe0"
    APP1 = b"\xe1"
    APP2 = b"\xe2"
    APP3 = b"\xe3"
    APP4 = b"\xe4"
    APP5 = b"\xe5"
    APP6 = b"\xe6"
    APP7 = b"\xe7"
    APP8 = b"\xe8"
    APP9 = b"\xe9"
    APPA = b"\xea"
    APPB = b"\xeb"
    APPC = b"\xec"
    APPD = b"\xed"
    APPE = b"\xee"
    APPF = b"\xef"

    # -- the codes `is_standalone()` tests membership against --
    STANDALONE_MARKERS = (TEM, SOI, EOI, RST0, RST1, RST2, RST3, RST4, RST5, RST6, RST7)

    # -- every SOFn code defined above; DHT (0xc4), JPG (0xc8) and DAC (0xcc) fall
    # -- in the same numeric range but are not included
    SOF_MARKER_CODES = (
        SOF0,
        SOF1,
        SOF2,
        SOF3,
        SOF5,
        SOF6,
        SOF7,
        SOF9,
        SOFA,
        SOFB,
        SOFD,
        SOFE,
        SOFF,
    )

    # -- display names for a subset of the codes, plus b"\x00" as "UNKNOWN"; note
    # -- that 0xed and 0xee are named "APP13"/"APP14" here (decimal suffix) while
    # -- the matching constants above are APPD/APPE (hex suffix)
    marker_names = {
        b"\x00": "UNKNOWN",
        b"\xc0": "SOF0",
        b"\xc2": "SOF2",
        b"\xc4": "DHT",
        b"\xda": "SOS",  # start of scan
        b"\xd8": "SOI",  # start of image
        b"\xd9": "EOI",  # end of image
        b"\xdb": "DQT",
        b"\xe0": "APP0",
        b"\xe1": "APP1",
        b"\xe2": "APP2",
        b"\xed": "APP13",
        b"\xee": "APP14",
    }

    @classmethod
    def is_standalone(cls, marker_code):
        """Return True when `marker_code` is one of `STANDALONE_MARKERS`."""
        return marker_code in cls.STANDALONE_MARKERS
class MIME_TYPE:
    """Image content types.

    Each attribute is the MIME content-type string for the image format it is
    named after.
    """

    BMP = "image/bmp"
    GIF = "image/gif"
    JPEG = "image/jpeg"
    PNG = "image/png"
    TIFF = "image/tiff"
class PNG_CHUNK_TYPE:
    """PNG chunk type names.

    Each attribute's value is the same four-character string as its name. Note the
    values are `str`, not `bytes`.
    """

    IHDR = "IHDR"
    pHYs = "pHYs"
    IEND = "IEND"
class TIFF_FLD_TYPE:
    """Field type codes for TIFF Image File Directory (IFD) entries.

    Alias: `TIFF_FLD`
    """

    BYTE = 1
    ASCII = 2
    SHORT = 3
    LONG = 4
    RATIONAL = 5

    # -- display name for each field type code defined above --
    field_type_names = {
        1: "BYTE",
        2: "ASCII char",
        3: "SHORT",
        4: "LONG",
        5: "RATIONAL",
    }


# -- shorter alias; both names refer to the same class --
TIFF_FLD = TIFF_FLD_TYPE
class TIFF_TAG:
    """Tag codes for TIFF Image File Directory (IFD) entries.

    Only five tags are defined as named constants; `tag_names` maps a larger set
    of tag codes to display names.
    """

    IMAGE_WIDTH = 0x0100
    IMAGE_LENGTH = 0x0101
    X_RESOLUTION = 0x011A
    Y_RESOLUTION = 0x011B
    RESOLUTION_UNIT = 0x0128

    # -- display names keyed by tag code; includes the five constants above --
    tag_names = {
        0x00FE: "NewSubfileType",
        0x0100: "ImageWidth",
        0x0101: "ImageLength",
        0x0102: "BitsPerSample",
        0x0103: "Compression",
        0x0106: "PhotometricInterpretation",
        0x010E: "ImageDescription",
        0x010F: "Make",
        0x0110: "Model",
        0x0111: "StripOffsets",
        0x0112: "Orientation",
        0x0115: "SamplesPerPixel",
        0x0117: "StripByteCounts",
        0x011A: "XResolution",
        0x011B: "YResolution",
        0x011C: "PlanarConfiguration",
        0x0128: "ResolutionUnit",
        0x0131: "Software",
        0x0132: "DateTime",
        0x0213: "YCbCrPositioning",
        0x8769: "ExifTag",
        0x8825: "GPS IFD",
        0xC4A5: "PrintImageMatching",
    }

View File

@@ -0,0 +1,13 @@
"""Exceptions specific to the image sub-package."""
class InvalidImageStreamError(Exception):
    """The image stream was recognized but its content appears to be corrupted."""
class UnexpectedEndOfFileError(Exception):
    """Raised when end-of-file is reached before an image-stream read is satisfied."""
class UnrecognizedImageError(Exception):
    """Raised when the format of a provided image stream cannot be identified."""

View File

@@ -0,0 +1,38 @@
from struct import Struct
from .constants import MIME_TYPE
from .image import BaseImageHeader
class Gif(BaseImageHeader):
    """Header parser for GIF image streams.

    This parser reads no resolution (DPI) information from the stream; horizontal and
    vertical DPI are both reported as 72.
    """

    @classmethod
    def from_stream(cls, stream):
        """Return a |Gif| instance whose pixel dimensions are read from the GIF image
        in `stream`."""
        width_px, height_px = cls._dimensions_from_stream(stream)
        return cls(width_px, height_px, 72, 72)

    @property
    def content_type(self):
        """MIME content type for this image, always `image/gif`."""
        return MIME_TYPE.GIF

    @property
    def default_ext(self):
        """Default filename extension for this image type, always 'gif'."""
        return "gif"

    @classmethod
    def _dimensions_from_stream(cls, stream):
        """Return a (px_width, px_height) pair read from the GIF header in `stream`."""
        # -- width and height are two little-endian unsigned shorts stored right
        # -- after the first 6 bytes of the stream
        stream.seek(6)
        return Struct("<HH").unpack(stream.read(4))

View File

@@ -0,0 +1,86 @@
from struct import Struct
from .exceptions import UnexpectedEndOfFileError
# -- byte-order indicators; the values double as `struct` format-string prefixes --
BIG_ENDIAN = ">"
LITTLE_ENDIAN = "<"
class StreamReader:
    """Wraps a file-like object to provide access to structured data from a binary file.

    Byte-order is configurable. `base_offset` is added to every `base` value provided,
    to calculate the actual stream location for reads and seeks.
    """

    def __init__(self, stream, byte_order, base_offset=0):
        super().__init__()
        self._stream = stream
        # -- any value other than LITTLE_ENDIAN selects big-endian --
        self._byte_order = LITTLE_ENDIAN if byte_order == LITTLE_ENDIAN else BIG_ENDIAN
        self._base_offset = base_offset

    def read(self, count):
        """Allow pass-through read() call, reading from the current stream position."""
        return self._stream.read(count)

    def read_byte(self, base, offset=0):
        """Return the int value of the byte at the file position defined by
        self._base_offset + `base` + `offset`.

        Raises |UnexpectedEndOfFileError| if the stream ends before that byte.
        """
        return self._read_int("B", base, offset)

    def read_long(self, base, offset=0):
        """Return the int value of the four bytes at the file position defined by
        self._base_offset + `base` + `offset`.

        The endian setting of this instance is used to interpret the byte layout of
        the long. Raises |UnexpectedEndOfFileError| if fewer than four bytes remain.
        """
        # -- self._byte_order is itself the struct byte-order prefix ("<" or ">") --
        return self._read_int(self._byte_order + "L", base, offset)

    def read_short(self, base, offset=0):
        """Return the int value of the two bytes at the file position determined by
        `base` and `offset`, similarly to ``read_long()`` above."""
        return self._read_int(self._byte_order + "H", base, offset)

    def read_str(self, char_count, base, offset=0):
        """Return a string containing the `char_count` bytes at the file position
        determined by self._base_offset + `base` + `offset`, decoded as UTF-8."""
        struct = Struct("%ds" % char_count)
        chars = self._unpack_item(struct, base, offset)
        return chars.decode("UTF-8")

    def seek(self, base, offset=0):
        """Position the wrapped stream at self._base_offset + `base` + `offset`."""
        location = self._base_offset + base + offset
        self._stream.seek(location)

    def tell(self):
        """Allow pass-through tell() call."""
        return self._stream.tell()

    def _read_bytes(self, byte_count, base, offset):
        """Return exactly `byte_count` bytes read from the position given by `base` and
        `offset`, raising |UnexpectedEndOfFileError| on a short read."""
        self.seek(base, offset)
        bytes_ = self._stream.read(byte_count)
        if len(bytes_) < byte_count:
            raise UnexpectedEndOfFileError
        return bytes_

    def _read_int(self, fmt, base, offset):
        """Return the single value unpacked with struct format `fmt` at the position
        given by `base` and `offset`."""
        return self._unpack_item(Struct(fmt), base, offset)

    def _unpack_item(self, struct, base, offset):
        """Return the first item unpacked by `struct` from the bytes at the position
        given by `base` and `offset`."""
        bytes_ = self._read_bytes(struct.size, base, offset)
        return struct.unpack(bytes_)[0]

View File

@@ -0,0 +1,234 @@
"""Provides objects that can characterize image streams.
That characterization is as to content type and size, as a required step in including
them in a document.
"""
from __future__ import annotations
import hashlib
import io
import os
from typing import IO, Tuple
from docx.image.exceptions import UnrecognizedImageError
from docx.shared import Emu, Inches, Length, lazyproperty
class Image:
    """Graphical image stream such as JPEG, PNG, or GIF with properties and methods
    required by ImagePart."""

    def __init__(self, blob: bytes, filename: str, image_header: BaseImageHeader):
        super().__init__()
        self._blob = blob
        self._filename = filename
        self._image_header = image_header

    @classmethod
    def from_blob(cls, blob: bytes) -> Image:
        """Return a new |Image| subclass instance parsed from the image binary contained
        in `blob`."""
        stream = io.BytesIO(blob)
        return cls._from_stream(stream, blob)

    @classmethod
    def from_file(cls, image_descriptor: str | os.PathLike[str] | IO[bytes]):
        """Return a new |Image| subclass instance loaded from the image file identified
        by `image_descriptor`, a path (`str` or path-like object) or file-like object."""
        # -- accept `pathlib.Path` and other path-like objects as well as `str` --
        if isinstance(image_descriptor, (str, os.PathLike)):
            path = os.fspath(image_descriptor)
            with open(path, "rb") as f:
                blob = f.read()
            stream = io.BytesIO(blob)
            filename = os.path.basename(path)
        else:
            stream = image_descriptor
            stream.seek(0)
            blob = stream.read()
            filename = None
        return cls._from_stream(stream, blob, filename)

    @property
    def blob(self):
        """The bytes of the image 'file'."""
        return self._blob

    @property
    def content_type(self) -> str:
        """MIME content type for this image, e.g. ``'image/jpeg'`` for a JPEG image."""
        return self._image_header.content_type

    @lazyproperty
    def ext(self):
        """The file extension for the image.

        If an actual one is available from a load filename it is used. Otherwise a
        canonical extension is assigned based on the content type. Does not contain the
        leading period, e.g. 'jpg', not '.jpg'.
        """
        ext = os.path.splitext(self._filename)[1][1:]
        # -- a filename like "photo" has no extension; fall back to the canonical one
        # -- for the detected image type rather than returning an empty string
        return ext or self._image_header.default_ext

    @property
    def filename(self):
        """Original image file name, if loaded from disk, or a generic filename if
        loaded from an anonymous stream."""
        return self._filename

    @property
    def px_width(self) -> int:
        """The horizontal pixel dimension of the image."""
        return self._image_header.px_width

    @property
    def px_height(self) -> int:
        """The vertical pixel dimension of the image."""
        return self._image_header.px_height

    @property
    def horz_dpi(self) -> int:
        """Integer dots per inch for the width of this image.

        Defaults to 72 when not present in the file, as is often the case.
        """
        return self._image_header.horz_dpi

    @property
    def vert_dpi(self) -> int:
        """Integer dots per inch for the height of this image.

        Defaults to 72 when not present in the file, as is often the case.
        """
        return self._image_header.vert_dpi

    @property
    def width(self) -> Inches:
        """A |Length| value representing the native width of the image, calculated from
        the values of `px_width` and `horz_dpi`."""
        return Inches(self.px_width / self.horz_dpi)

    @property
    def height(self) -> Inches:
        """A |Length| value representing the native height of the image, calculated from
        the values of `px_height` and `vert_dpi`."""
        return Inches(self.px_height / self.vert_dpi)

    def scaled_dimensions(
        self, width: int | Length | None = None, height: int | Length | None = None
    ) -> Tuple[Length, Length]:
        """(cx, cy) pair representing scaled dimensions of this image.

        The native dimensions of the image are scaled by applying the following rules to
        the `width` and `height` arguments.

        * If both `width` and `height` are specified, the return value is (`width`,
          `height`); no scaling is performed.
        * If only one is specified, it is used to compute a scaling factor that is then
          applied to the unspecified dimension, preserving the aspect ratio of the image.
        * If both `width` and `height` are |None|, the native dimensions are returned.

        The native dimensions are calculated using the dots-per-inch (dpi) value
        embedded in the image, defaulting to 72 dpi if no value is specified, as is
        often the case. The returned values are both |Length| objects.
        """
        if width is None and height is None:
            return self.width, self.height

        if width is None:
            assert height is not None
            scaling_factor = float(height) / float(self.height)
            width = round(self.width * scaling_factor)

        if height is None:
            scaling_factor = float(width) / float(self.width)
            height = round(self.height * scaling_factor)

        return Emu(width), Emu(height)

    @lazyproperty
    def sha1(self):
        """SHA1 hash digest of the image blob."""
        return hashlib.sha1(self._blob).hexdigest()

    @classmethod
    def _from_stream(
        cls,
        stream: IO[bytes],
        blob: bytes,
        filename: str | None = None,
    ) -> Image:
        """Return an instance of the |Image| subclass corresponding to the format of the
        image in `stream`.

        When `filename` is |None|, a generic name of the form ``image.{ext}`` is
        assigned, using the default extension of the detected image type.
        """
        image_header = _ImageHeaderFactory(stream)
        if filename is None:
            filename = "image.%s" % image_header.default_ext
        return cls(blob, filename, image_header)
def _ImageHeaderFactory(stream: IO[bytes]):
    """Return a |BaseImageHeader| subclass instance parsed from the image in `stream`.

    The parser class is chosen by matching the first 32 bytes of the stream against
    each (class, offset, signature) entry in `SIGNATURES`, in order. Raises
    |UnrecognizedImageError| when no signature matches.
    """
    from docx.image import SIGNATURES

    stream.seek(0)
    leading_bytes = stream.read(32)

    for header_cls, sig_offset, sig_bytes in SIGNATURES:
        candidate = leading_bytes[sig_offset : sig_offset + len(sig_bytes)]
        if candidate == sig_bytes:
            return header_cls.from_stream(stream)

    raise UnrecognizedImageError
class BaseImageHeader:
    """Common base for format-specific image header classes like |Jpeg| and |Tiff|.

    Holds the pixel dimensions and resolution values passed to the constructor;
    subclasses supply `content_type` and `default_ext`.
    """

    def __init__(self, px_width: int, px_height: int, horz_dpi: int, vert_dpi: int):
        self._px_width, self._px_height = px_width, px_height
        self._horz_dpi, self._vert_dpi = horz_dpi, vert_dpi

    @property
    def content_type(self) -> str:
        """MIME content type of the image; abstract, every subclass must override."""
        raise NotImplementedError(
            "content_type property must be implemented by all subclasses of BaseImageHeader"
        )

    @property
    def default_ext(self) -> str:
        """Default filename extension for images of this type.

        Abstract; every subclass must override.
        """
        msg = "default_ext property must be implemented by all subclasses of BaseImageHeader"
        raise NotImplementedError(msg)

    @property
    def px_width(self):
        """Width of the image in pixels."""
        return self._px_width

    @property
    def px_height(self):
        """Height of the image in pixels."""
        return self._px_height

    @property
    def horz_dpi(self):
        """Horizontal resolution of this image, as given to the constructor.

        Format parsers supply 72 when the file specifies no value.
        """
        return self._horz_dpi

    @property
    def vert_dpi(self):
        """Vertical resolution of this image, as given to the constructor.

        Format parsers supply 72 when the file specifies no value.
        """
        return self._vert_dpi

View File

@@ -0,0 +1,425 @@
"""Objects related to parsing headers of JPEG image streams.
Includes both JFIF and Exif sub-formats.
"""
import io
from docx.image.constants import JPEG_MARKER_CODE, MIME_TYPE
from docx.image.helpers import BIG_ENDIAN, StreamReader
from docx.image.image import BaseImageHeader
from docx.image.tiff import Tiff
class Jpeg(BaseImageHeader):
    """Shared base for the |Jfif| and |Exif| header parsers."""

    @property
    def content_type(self):
        """MIME content type for this image, always `image/jpeg`."""
        return MIME_TYPE.JPEG

    @property
    def default_ext(self):
        """Default filename extension for this image type, always 'jpg'."""
        return "jpg"
class Exif(Jpeg):
    """Image header parser for Exif image format."""

    @classmethod
    def from_stream(cls, stream):
        """Return an |Exif| instance with header properties parsed from the Exif image
        in `stream`.

        Pixel dimensions come from the first SOFn marker, resolution from the first
        APP1 marker.
        """
        markers = _JfifMarkers.from_stream(stream)
        sof = markers.sof
        app1 = markers.app1
        return cls(sof.px_width, sof.px_height, app1.horz_dpi, app1.vert_dpi)
class Jfif(Jpeg):
    """Image header parser for JFIF image format."""

    @classmethod
    def from_stream(cls, stream):
        """Return a |Jfif| instance with header properties parsed from the image in
        `stream`.

        Pixel dimensions come from the first SOFn marker, resolution from the first
        APP0 marker.
        """
        markers = _JfifMarkers.from_stream(stream)
        sof = markers.sof
        app0 = markers.app0
        return cls(sof.px_width, sof.px_height, app0.horz_dpi, app0.vert_dpi)
class _JfifMarkers:
    """Sequence of the markers in a JPEG file, truncated after the first SOS marker."""

    def __init__(self, markers):
        super().__init__()
        self._markers = list(markers)

    def __str__(self):  # pragma: no cover
        """Return a tabular listing of the markers in this instance, one row per
        marker; handy for debugging."""
        header = " offset seglen mc name\n======= ====== == ====="
        tmpl = "%7d %6d %02X %s"
        rows = [
            tmpl % (m.offset, m.segment_length, ord(m.marker_code), m.name)
            for m in self._markers
        ]
        return "\n".join([header] + rows)

    @classmethod
    def from_stream(cls, stream):
        """Return a |_JfifMarkers| instance containing a |_Marker| (or subclass)
        instance for each marker in `stream`, up to and including the first SOS
        marker."""
        collected = []
        for marker in _MarkerParser.from_stream(stream).iter_markers():
            collected.append(marker)
            # -- stop parsing once the start-of-scan marker has been collected --
            if marker.marker_code == JPEG_MARKER_CODE.SOS:
                break
        return cls(collected)

    @property
    def app0(self):
        """First APP0 marker in image markers; raises |KeyError| if there is none."""
        found = next(
            (m for m in self._markers if m.marker_code == JPEG_MARKER_CODE.APP0), None
        )
        if found is None:
            raise KeyError("no APP0 marker in image")
        return found

    @property
    def app1(self):
        """First APP1 marker in image markers; raises |KeyError| if there is none."""
        found = next(
            (m for m in self._markers if m.marker_code == JPEG_MARKER_CODE.APP1), None
        )
        if found is None:
            raise KeyError("no APP1 marker in image")
        return found

    @property
    def sof(self):
        """First start of frame (SOFn) marker in this sequence; raises |KeyError| if
        there is none."""
        found = next(
            (m for m in self._markers if m.marker_code in JPEG_MARKER_CODE.SOF_MARKER_CODES),
            None,
        )
        if found is None:
            raise KeyError("no start of frame (SOFn) marker in image")
        return found
class _MarkerParser:
    """Service class that knows how to parse a JFIF stream and iterate over its
    markers."""

    def __init__(self, stream_reader):
        super().__init__()
        self._stream = stream_reader

    @classmethod
    def from_stream(cls, stream):
        """Return a |_MarkerParser| instance that reads `stream` as big-endian."""
        return cls(StreamReader(stream, BIG_ENDIAN))

    def iter_markers(self):
        """Generate a |_Marker| (or subclass) instance for each marker in the JPEG
        stream, in the order they occur in the stream.

        Iteration ends after the EOI marker has been generated.
        """
        finder = _MarkerFinder.from_stream(self._stream)
        search_from = 0
        while True:
            marker_code, segment_offset = finder.next(search_from)
            marker = _MarkerFactory(marker_code, self._stream, segment_offset)
            yield marker
            if marker_code == JPEG_MARKER_CODE.EOI:
                return
            # -- resume the search just past this marker's segment --
            search_from = segment_offset + marker.segment_length
class _MarkerFinder:
"""Service class that knows how to find the next JFIF marker in a stream."""
def __init__(self, stream):
super(_MarkerFinder, self).__init__()
self._stream = stream
@classmethod
def from_stream(cls, stream):
"""Return a |_MarkerFinder| instance to find JFIF markers in `stream`."""
return cls(stream)
def next(self, start):
"""Return a (marker_code, segment_offset) 2-tuple identifying and locating the
first marker in `stream` occuring after offset `start`.
The returned `segment_offset` points to the position immediately following the
2-byte marker code, the start of the marker segment, for those markers that have
a segment.
"""
position = start
while True:
# skip over any non-\xFF bytes
position = self._offset_of_next_ff_byte(start=position)
# skip over any \xFF padding bytes
position, byte_ = self._next_non_ff_byte(start=position + 1)
# 'FF 00' sequence is not a marker, start over if found
if byte_ == b"\x00":
continue
# this is a marker, gather return values and break out of scan
marker_code, segment_offset = byte_, position + 1
break
return marker_code, segment_offset
def _next_non_ff_byte(self, start):
"""Return an offset, byte 2-tuple for the next byte in `stream` that is not
'\xff', starting with the byte at offset `start`.
If the byte at offset `start` is not '\xff', `start` and the returned `offset`
will be the same.
"""
self._stream.seek(start)
byte_ = self._read_byte()
while byte_ == b"\xff":
byte_ = self._read_byte()
offset_of_non_ff_byte = self._stream.tell() - 1
return offset_of_non_ff_byte, byte_
def _offset_of_next_ff_byte(self, start):
"""Return the offset of the next '\xff' byte in `stream` starting with the byte
at offset `start`.
Returns `start` if the byte at that offset is a hex 255; it does not necessarily
advance in the stream.
"""
self._stream.seek(start)
byte_ = self._read_byte()
while byte_ != b"\xff":
byte_ = self._read_byte()
offset_of_ff_byte = self._stream.tell() - 1
return offset_of_ff_byte
def _read_byte(self):
"""Return the next byte read from stream.
Raise Exception if stream is at end of file.
"""
byte_ = self._stream.read(1)
if not byte_: # pragma: no cover
raise Exception("unexpected end of file")
return byte_
def _MarkerFactory(marker_code, stream, offset):
    """Return a |_Marker| or subclass instance appropriate for the marker at `offset`
    in `stream` having `marker_code`.

    APP0, APP1 and SOFn markers get their dedicated classes; every other marker code
    is represented by the generic |_Marker|.
    """

    def marker_class():
        if marker_code == JPEG_MARKER_CODE.APP0:
            return _App0Marker
        if marker_code == JPEG_MARKER_CODE.APP1:
            return _App1Marker
        if marker_code in JPEG_MARKER_CODE.SOF_MARKER_CODES:
            return _SofMarker
        return _Marker

    return marker_class().from_stream(stream, marker_code, offset)
class _Marker:
    """Base class for JFIF marker classes.

    Represents a marker and its segment occurring in a JPEG byte stream.
    """

    def __init__(self, marker_code, offset, segment_length):
        super().__init__()
        self._marker_code = marker_code
        self._offset = offset
        self._segment_length = segment_length

    @classmethod
    def from_stream(cls, stream, marker_code, offset):
        """Return a generic |_Marker| instance for the marker at `offset` in `stream`
        having `marker_code`.

        Stand-alone markers have no segment and get a segment length of 0; for all
        others the length is the short read at `offset`.
        """
        if JPEG_MARKER_CODE.is_standalone(marker_code):
            segment_length = 0
        else:
            segment_length = stream.read_short(offset)
        return cls(marker_code, offset, segment_length)

    @property
    def marker_code(self):
        """The single-byte code that identifies the type of this marker, e.g.
        ``b'\\xd8'`` for start of image (SOI)."""
        return self._marker_code

    @property
    def name(self):  # pragma: no cover
        """Display name of this marker, "UNKNOWN" when its code has no entry in
        `JPEG_MARKER_CODE.marker_names`."""
        # -- `marker_names` lists only some marker codes; don't let an unlisted one
        # -- raise KeyError and break the debug listing
        return JPEG_MARKER_CODE.marker_names.get(self._marker_code, "UNKNOWN")

    @property
    def offset(self):  # pragma: no cover
        """Offset of this marker's segment in the stream, as given at construction."""
        return self._offset

    @property
    def segment_length(self):
        """The length in bytes of this marker's segment."""
        return self._segment_length
class _App0Marker(_Marker):
    """Represents a JFIF APP0 marker segment."""

    def __init__(self, marker_code, offset, length, density_units, x_density, y_density):
        super().__init__(marker_code, offset, length)
        self._density_units = density_units
        self._x_density = x_density
        self._y_density = y_density

    @property
    def horz_dpi(self):
        """Horizontal dots per inch specified in this marker, defaults to 72 if not
        specified."""
        return self._dpi(self._x_density)

    @property
    def vert_dpi(self):
        """Vertical dots per inch specified in this marker, defaults to 72 if not
        specified."""
        return self._dpi(self._y_density)

    def _dpi(self, density):
        """Return dots per inch corresponding to `density` value.

        Density units of 1 mean dots-per-inch and 2 mean dots-per-centimeter. Any
        other units value, or a density that works out to zero dpi, gives the
        default of 72.
        """
        if self._density_units == 1:
            dpi = density
        elif self._density_units == 2:
            dpi = int(round(density * 2.54))
        else:
            return 72
        # -- a zero dpi would make |Image| divide by zero when it computes the native
        # -- size of the image, so treat it as "not specified"
        return dpi if dpi > 0 else 72

    @classmethod
    def from_stream(cls, stream, marker_code, offset):
        """Return an |_App0Marker| instance for the APP0 marker at `offset` in
        `stream`."""
        # field               off  type   notes
        # ------------------  ---  -----  -------------------
        # segment length        0  short
        # JFIF identifier       2  5 chr  'JFIF\x00'
        # major JPEG version    7  byte   typically 1
        # minor JPEG version    8  byte   typically 1 or 2
        # density units         9  byte   1=inches, 2=cm
        # horz dots per unit   10  short
        # vert dots per unit   12  short
        # ------------------  ---  -----  -------------------
        segment_length = stream.read_short(offset)
        density_units = stream.read_byte(offset, 9)
        x_density = stream.read_short(offset, 10)
        y_density = stream.read_short(offset, 12)
        return cls(marker_code, offset, segment_length, density_units, x_density, y_density)
class _App1Marker(_Marker):
    """Represents a JFIF APP1 (Exif) marker segment."""

    def __init__(self, marker_code, offset, length, horz_dpi, vert_dpi):
        super().__init__(marker_code, offset, length)
        self._horz_dpi = horz_dpi
        self._vert_dpi = vert_dpi

    @classmethod
    def from_stream(cls, stream, marker_code, offset):
        """Return an |_App1Marker| instance for the APP1 marker at `offset` in
        `stream`, with dpi values taken from its embedded TIFF header.

        Both dpi values are 72 when the segment is not an Exif segment.
        """
        # field              off  len  type   notes
        # -----------------  ---  ---  -----  ----------------------------
        # segment length       0    2  short
        # Exif identifier      2    6  6 chr  'Exif\x00\x00'
        # TIFF byte order      8    2  2 chr  'II'=little 'MM'=big endian
        # TIFF magic number   10    2  2 chr  '*\x00' or '\x00*' depending
        # IFD0 offset         12    4  long   relative to TIFF header (off 8)
        # -----------------  ---  ---  -----  ----------------------------
        segment_length = stream.read_short(offset)
        if cls._is_non_Exif_APP1_segment(stream, offset):
            horz_dpi = vert_dpi = 72
        else:
            tiff = cls._tiff_from_exif_segment(stream, offset, segment_length)
            horz_dpi, vert_dpi = tiff.horz_dpi, tiff.vert_dpi
        return cls(marker_code, offset, segment_length, horz_dpi, vert_dpi)

    @property
    def horz_dpi(self):
        """Horizontal dots per inch for this marker, 72 when the segment did not
        specify one."""
        return self._horz_dpi

    @property
    def vert_dpi(self):
        """Vertical dots per inch for this marker, 72 when the segment did not
        specify one."""
        return self._vert_dpi

    @classmethod
    def _is_non_Exif_APP1_segment(cls, stream, offset):
        """Return True if the APP1 segment at `offset` in `stream` is NOT an Exif
        segment, as determined by the ``'Exif\\x00\\x00'`` signature at offset 2 in
        the segment."""
        stream.seek(offset + 2)
        return stream.read(6) != b"Exif\x00\x00"

    @classmethod
    def _tiff_from_exif_segment(cls, stream, offset, segment_length):
        """Return a |Tiff| instance parsed from the Exif APP1 segment of
        `segment_length` at `offset` in `stream`."""
        # -- the TIFF data starts 8 bytes into the segment; copy it to a stream of its
        # -- own so the TIFF parser can use offsets relative to the TIFF header
        stream.seek(offset + 8)
        tiff_bytes = stream.read(segment_length - 8)
        return Tiff.from_stream(io.BytesIO(tiff_bytes))
class _SofMarker(_Marker):
    """Represents a JFIF start of frame (SOFx) marker segment."""

    def __init__(self, marker_code, offset, segment_length, px_width, px_height):
        super().__init__(marker_code, offset, segment_length)
        self._px_width, self._px_height = px_width, px_height

    @classmethod
    def from_stream(cls, stream, marker_code, offset):
        """Return a |_SofMarker| instance for the SOFn marker at `offset` in stream."""
        # field             off  type   notes
        # ----------------  ---  -----  ---------
        # segment length      0  short
        # Data precision      2  byte
        # Vertical lines      3  short  px_height
        # Horizontal lines    5  short  px_width
        # ----------------  ---  -----  ---------
        segment_length = stream.read_short(offset)
        height_px = stream.read_short(offset, 3)
        width_px = stream.read_short(offset, 5)
        return cls(marker_code, offset, segment_length, width_px, height_px)

    @property
    def px_height(self):
        """Image height in pixels."""
        return self._px_height

    @property
    def px_width(self):
        """Image width in pixels."""
        return self._px_width

View File

@@ -0,0 +1,253 @@
from .constants import MIME_TYPE, PNG_CHUNK_TYPE
from .exceptions import InvalidImageStreamError
from .helpers import BIG_ENDIAN, StreamReader
from .image import BaseImageHeader
class Png(BaseImageHeader):
    """Image header parser for PNG images."""

    @property
    def content_type(self):
        """MIME content type for this image, always `image/png`."""
        return MIME_TYPE.PNG

    @property
    def default_ext(self):
        """Default filename extension for this image type, always 'png'."""
        return "png"

    @classmethod
    def from_stream(cls, stream):
        """Return a |Png| instance with header properties parsed from the image in
        `stream`."""
        parsed = _PngParser.parse(stream)
        return cls(parsed.px_width, parsed.px_height, parsed.horz_dpi, parsed.vert_dpi)
class _PngParser:
"""Parses a PNG image stream to extract the image properties found in its chunks."""
def __init__(self, chunks):
super(_PngParser, self).__init__()
self._chunks = chunks
@classmethod
def parse(cls, stream):
"""Return a |_PngParser| instance containing the header properties parsed from
the PNG image in `stream`."""
chunks = _Chunks.from_stream(stream)
return cls(chunks)
@property
def px_width(self):
"""The number of pixels in each row of the image."""
IHDR = self._chunks.IHDR
return IHDR.px_width
@property
def px_height(self):
"""The number of stacked rows of pixels in the image."""
IHDR = self._chunks.IHDR
return IHDR.px_height
@property
def horz_dpi(self):
"""Integer dots per inch for the width of this image.
Defaults to 72 when not present in the file, as is often the case.
"""
pHYs = self._chunks.pHYs
if pHYs is None:
return 72
return self._dpi(pHYs.units_specifier, pHYs.horz_px_per_unit)
@property
def vert_dpi(self):
"""Integer dots per inch for the height of this image.
Defaults to 72 when not present in the file, as is often the case.
"""
pHYs = self._chunks.pHYs
if pHYs is None:
return 72
return self._dpi(pHYs.units_specifier, pHYs.vert_px_per_unit)
@staticmethod
def _dpi(units_specifier, px_per_unit):
"""Return dots per inch value calculated from `units_specifier` and
`px_per_unit`."""
if units_specifier == 1 and px_per_unit:
return int(round(px_per_unit * 0.0254))
return 72
class _Chunks:
"""Collection of the chunks parsed from a PNG image stream."""
def __init__(self, chunk_iterable):
super(_Chunks, self).__init__()
self._chunks = list(chunk_iterable)
@classmethod
def from_stream(cls, stream):
"""Return a |_Chunks| instance containing the PNG chunks in `stream`."""
chunk_parser = _ChunkParser.from_stream(stream)
chunks = list(chunk_parser.iter_chunks())
return cls(chunks)
@property
def IHDR(self):
"""IHDR chunk in PNG image."""
match = lambda chunk: chunk.type_name == PNG_CHUNK_TYPE.IHDR # noqa
IHDR = self._find_first(match)
if IHDR is None:
raise InvalidImageStreamError("no IHDR chunk in PNG image")
return IHDR
@property
def pHYs(self):
"""PHYs chunk in PNG image, or |None| if not present."""
match = lambda chunk: chunk.type_name == PNG_CHUNK_TYPE.pHYs # noqa
return self._find_first(match)
def _find_first(self, match):
"""Return first chunk in stream order returning True for function `match`."""
for chunk in self._chunks:
if match(chunk):
return chunk
return None
class _ChunkParser:
    """Extracts chunks from a PNG image stream."""

    def __init__(self, stream_rdr):
        super().__init__()
        self._stream_rdr = stream_rdr

    @classmethod
    def from_stream(cls, stream):
        """Return a |_ChunkParser| instance that reads the PNG image in `stream` as
        big-endian."""
        return cls(StreamReader(stream, BIG_ENDIAN))

    def iter_chunks(self):
        """Generate a |_Chunk| subclass instance for each chunk in this parser's PNG
        stream, in the order encountered in the stream."""
        rdr = self._stream_rdr
        for chunk_type, data_offset in self._iter_chunk_offsets():
            yield _ChunkFactory(chunk_type, rdr, data_offset)

    def _iter_chunk_offsets(self):
        """Generate a (chunk_type, chunk_offset) 2-tuple for each of the chunks in the
        PNG image stream, where `chunk_offset` locates the start of the chunk's data.

        Iteration stops after the IEND chunk is returned.
        """
        rdr = self._stream_rdr
        # -- scanning starts at offset 8, where the first chunk is expected --
        chunk_offset = 8
        while True:
            data_len = rdr.read_long(chunk_offset)
            chunk_type = rdr.read_str(4, chunk_offset, 4)
            yield chunk_type, chunk_offset + 8
            if chunk_type == "IEND":
                return
            # -- step over length long (4), chunk type (4), chunk data, and CRC (4) --
            chunk_offset += 12 + data_len
def _ChunkFactory(chunk_type, stream_rdr, offset):
    """Return a |_Chunk| subclass instance appropriate to `chunk_type` parsed from
    `stream_rdr` at `offset`.

    IHDR and pHYs chunks get their dedicated classes; every other chunk type is
    represented by the generic |_Chunk|.
    """
    specialized = {
        PNG_CHUNK_TYPE.IHDR: _IHDRChunk,
        PNG_CHUNK_TYPE.pHYs: _pHYsChunk,
    }
    return specialized.get(chunk_type, _Chunk).from_offset(chunk_type, stream_rdr, offset)
class _Chunk:
"""Base class for specific chunk types.
Also serves as the default chunk type.
"""
def __init__(self, chunk_type):
super(_Chunk, self).__init__()
self._chunk_type = chunk_type
@classmethod
def from_offset(cls, chunk_type, stream_rdr, offset):
"""Return a default _Chunk instance that only knows its chunk type."""
return cls(chunk_type)
@property
def type_name(self):
"""The chunk type name, e.g. 'IHDR', 'pHYs', etc."""
return self._chunk_type
class _IHDRChunk(_Chunk):
    """IHDR chunk, contains the image dimensions."""

    def __init__(self, chunk_type, px_width, px_height):
        super().__init__(chunk_type)
        self._px_width, self._px_height = px_width, px_height

    @classmethod
    def from_offset(cls, chunk_type, stream_rdr, offset):
        """Return an _IHDRChunk instance containing the image dimensions extracted from
        the IHDR chunk data in `stream_rdr` at `offset`."""
        # -- width is the first long of the chunk data, height the second --
        width_px = stream_rdr.read_long(offset)
        height_px = stream_rdr.read_long(offset, 4)
        return cls(chunk_type, width_px, height_px)

    @property
    def px_width(self):
        """Image width in pixels."""
        return self._px_width

    @property
    def px_height(self):
        """Image height in pixels."""
        return self._px_height
class _pHYsChunk(_Chunk):
    """pHYs chunk, contains the image dpi information."""

    def __init__(self, chunk_type, horz_px_per_unit, vert_px_per_unit, units_specifier):
        super().__init__(chunk_type)
        self._horz_px_per_unit = horz_px_per_unit
        self._vert_px_per_unit = vert_px_per_unit
        self._units_specifier = units_specifier

    @classmethod
    def from_offset(cls, chunk_type, stream_rdr, offset):
        """Return a _pHYsChunk instance containing the image resolution extracted from
        the pHYs chunk data in `stream_rdr` at `offset`."""
        # -- chunk data layout: horz long (off 0), vert long (off 4), units byte (off 8)
        horz = stream_rdr.read_long(offset)
        vert = stream_rdr.read_long(offset, 4)
        units = stream_rdr.read_byte(offset, 8)
        return cls(chunk_type, horz, vert, units)

    @property
    def horz_px_per_unit(self):
        """Horizontal pixels per unit, as read from the chunk."""
        return self._horz_px_per_unit

    @property
    def vert_px_per_unit(self):
        """Vertical pixels per unit, as read from the chunk."""
        return self._vert_px_per_unit

    @property
    def units_specifier(self):
        """Unit specifier byte, as read from the chunk."""
        return self._units_specifier

View File

@@ -0,0 +1,289 @@
from .constants import MIME_TYPE, TIFF_FLD, TIFF_TAG
from .helpers import BIG_ENDIAN, LITTLE_ENDIAN, StreamReader
from .image import BaseImageHeader
class Tiff(BaseImageHeader):
    """Image header parser for TIFF images.

    Handles both big and little endian byte ordering.
    """

    @property
    def content_type(self):
        """MIME content type for this image, always ``image/tiff``."""
        return MIME_TYPE.TIFF

    @property
    def default_ext(self):
        """Default filename extension for this image type, always 'tiff'."""
        return "tiff"

    @classmethod
    def from_stream(cls, stream):
        """Return a |Tiff| instance containing the properties of the TIFF image in
        `stream`."""
        parsed = _TiffParser.parse(stream)
        return cls(parsed.px_width, parsed.px_height, parsed.horz_dpi, parsed.vert_dpi)
class _TiffParser:
    """Parses a TIFF image stream to extract the image properties found in its main
    image file directory (IFD)."""

    def __init__(self, ifd_entries):
        super().__init__()
        self._ifd_entries = ifd_entries

    @classmethod
    def parse(cls, stream):
        """Return an instance of |_TiffParser| containing the properties parsed from the
        TIFF image in `stream`."""
        stream_rdr = cls._make_stream_reader(stream)
        # -- the offset of the first IFD is the long at offset 4 of the stream --
        ifd0_offset = stream_rdr.read_long(4)
        ifd_entries = _IfdEntries.from_stream(stream_rdr, ifd0_offset)
        return cls(ifd_entries)

    @property
    def horz_dpi(self):
        """The horizontal dots per inch value calculated from the XResolution and
        ResolutionUnit tags of the IFD; defaults to 72 if those tags are not present."""
        return self._dpi(TIFF_TAG.X_RESOLUTION)

    @property
    def vert_dpi(self):
        """The vertical dots per inch value calculated from the YResolution and
        ResolutionUnit tags of the IFD; defaults to 72 if those tags are not present."""
        return self._dpi(TIFF_TAG.Y_RESOLUTION)

    @property
    def px_height(self):
        """The number of stacked rows of pixels in the image, |None| if the IFD contains
        no ``ImageLength`` tag, the expected case when the TIFF is embedded in an Exif
        image."""
        return self._ifd_entries.get(TIFF_TAG.IMAGE_LENGTH)

    @property
    def px_width(self):
        """The number of pixels in each row in the image, |None| if the IFD contains no
        ``ImageWidth`` tag, the expected case when the TIFF is embedded in an Exif
        image."""
        return self._ifd_entries.get(TIFF_TAG.IMAGE_WIDTH)

    @classmethod
    def _detect_endian(cls, stream):
        """Return either BIG_ENDIAN or LITTLE_ENDIAN depending on the endian indicator
        found in the TIFF `stream` header, either 'MM' or 'II'.

        Anything other than 'MM' is treated as little-endian.
        """
        stream.seek(0)
        endian_str = stream.read(2)
        return BIG_ENDIAN if endian_str == b"MM" else LITTLE_ENDIAN

    def _dpi(self, resolution_tag):
        """Return the dpi value calculated for `resolution_tag`, which can be either
        TIFF_TAG.X_RESOLUTION or TIFF_TAG.Y_RESOLUTION.

        The calculation is based on the values of both that tag and the
        TIFF_TAG.RESOLUTION_UNIT tag in this parser's |_IfdEntries| instance. The
        default of 72 is returned when the resolution tag is absent, when the unit is
        1 (aspect ratio only), or when the computed value is not positive.
        """
        ifd_entries = self._ifd_entries

        if resolution_tag not in ifd_entries:
            return 72

        # -- resolution unit defaults to inches (2) --
        resolution_unit = ifd_entries.get(TIFF_TAG.RESOLUTION_UNIT, 2)

        if resolution_unit == 1:  # aspect ratio only
            return 72
        # -- resolution_unit == 2 for inches, 3 for centimeters --
        units_per_inch = 1 if resolution_unit == 2 else 2.54
        dots_per_unit = ifd_entries[resolution_tag]
        dpi = int(round(dots_per_unit * units_per_inch))
        # -- a zero dpi would make |Image| divide by zero when it computes the native
        # -- size of the image, so treat it as "not specified"
        return dpi if dpi > 0 else 72

    @classmethod
    def _make_stream_reader(cls, stream):
        """Return a |StreamReader| instance wrapping `stream` and having "endian-ness"
        determined by the 'MM' or 'II' indicator in the TIFF stream header."""
        endian = cls._detect_endian(stream)
        return StreamReader(stream, endian)
class _IfdEntries:
"""Image File Directory for a TIFF image, having mapping (dict) semantics allowing
"tag" values to be retrieved by tag code."""
def __init__(self, entries):
super(_IfdEntries, self).__init__()
self._entries = entries
def __contains__(self, key):
"""Provides ``in`` operator, e.g. ``tag in ifd_entries``"""
return self._entries.__contains__(key)
def __getitem__(self, key):
"""Provides indexed access, e.g. ``tag_value = ifd_entries[tag_code]``"""
return self._entries.__getitem__(key)
@classmethod
def from_stream(cls, stream, offset):
"""Return a new |_IfdEntries| instance parsed from `stream` starting at
`offset`."""
ifd_parser = _IfdParser(stream, offset)
entries = {e.tag: e.value for e in ifd_parser.iter_entries()}
return cls(entries)
def get(self, tag_code, default=None):
"""Return value of IFD entry having tag matching `tag_code`, or `default` if no
matching tag found."""
return self._entries.get(tag_code, default)
class _IfdParser:
"""Service object that knows how to extract directory entries from an Image File
Directory (IFD)"""
def __init__(self, stream_rdr, offset):
super(_IfdParser, self).__init__()
self._stream_rdr = stream_rdr
self._offset = offset
def iter_entries(self):
"""Generate an |_IfdEntry| instance corresponding to each entry in the
directory."""
for idx in range(self._entry_count):
dir_entry_offset = self._offset + 2 + (idx * 12)
ifd_entry = _IfdEntryFactory(self._stream_rdr, dir_entry_offset)
yield ifd_entry
@property
def _entry_count(self):
"""The count of directory entries, read from the top of the IFD header."""
return self._stream_rdr.read_short(self._offset)
def _IfdEntryFactory(stream_rdr, offset):
    """Return the IFD entry object for the directory entry at `offset` in `stream_rdr`.

    The entry class is chosen by the field type, the short int stored 2 bytes into the
    entry. A field type without a dedicated class gets the |_IfdEntry| base class.
    """
    entry_cls_by_field_type = {
        TIFF_FLD.ASCII: _AsciiIfdEntry,
        TIFF_FLD.SHORT: _ShortIfdEntry,
        TIFF_FLD.LONG: _LongIfdEntry,
        TIFF_FLD.RATIONAL: _RationalIfdEntry,
    }
    field_type = stream_rdr.read_short(offset, 2)
    if field_type in entry_cls_by_field_type:
        entry_cls = entry_cls_by_field_type[field_type]
    else:
        entry_cls = _IfdEntry
    return entry_cls.from_stream(stream_rdr, offset)
class _IfdEntry:
"""Base class for IFD entry classes.
Subclasses are differentiated by value type, e.g. ASCII, long int, etc.
"""
def __init__(self, tag_code, value):
super(_IfdEntry, self).__init__()
self._tag_code = tag_code
self._value = value
@classmethod
def from_stream(cls, stream_rdr, offset):
"""Return an |_IfdEntry| subclass instance containing the tag and value of the
tag parsed from `stream_rdr` at `offset`.
Note this method is common to all subclasses. Override the ``_parse_value()``
method to provide distinctive behavior based on field type.
"""
tag_code = stream_rdr.read_short(offset, 0)
value_count = stream_rdr.read_long(offset, 4)
value_offset = stream_rdr.read_long(offset, 8)
value = cls._parse_value(stream_rdr, offset, value_count, value_offset)
return cls(tag_code, value)
@classmethod
def _parse_value(cls, stream_rdr, offset, value_count, value_offset):
"""Return the value of this field parsed from `stream_rdr` at `offset`.
Intended to be overridden by subclasses.
"""
return "UNIMPLEMENTED FIELD TYPE" # pragma: no cover
@property
def tag(self):
"""Short int code that identifies this IFD entry."""
return self._tag_code
@property
def value(self):
"""Value of this tag, its type being dependent on the tag."""
return self._value
class _AsciiIfdEntry(_IfdEntry):
    """IFD entry having the form of a NULL-terminated ASCII string."""

    @classmethod
    def _parse_value(cls, stream_rdr, offset, value_count, value_offset):
        """Return the ASCII string of this entry, without its terminating NUL.

        `value_count` is the length of the string including the terminating NUL
        character. A string longer than four bytes is read from `value_offset`; a
        shorter one is read from the value field of the entry itself, 8 bytes past
        `offset`. An empty string is returned when `value_count` is less than 1.
        """
        if value_count < 1:
            # -- no characters at all, not even the terminator; nothing to read --
            return ""
        # -- TIFF stores a value that fits in four bytes directly in the entry's
        # -- value field instead of an offset to it, so in that case the number in
        # -- `value_offset` is the string bytes themselves, not a stream position --
        str_offset = offset + 8 if value_count <= 4 else value_offset
        return stream_rdr.read_str(value_count - 1, str_offset)
class _ShortIfdEntry(_IfdEntry):
    """IFD entry whose value is a short (2-byte) integer."""

    @classmethod
    def _parse_value(cls, stream_rdr, offset, value_count, value_offset):
        """Return the short int stored in the value field of this entry, 8 bytes past
        `offset`.

        Only a single value is supported at present; a placeholder string is returned
        for any other `value_count`.
        """
        if value_count != 1:  # pragma: no cover
            return "Multi-value short integer NOT IMPLEMENTED"
        return stream_rdr.read_short(offset, 8)
class _LongIfdEntry(_IfdEntry):
    """IFD entry whose value is a long (4-byte) integer."""

    @classmethod
    def _parse_value(cls, stream_rdr, offset, value_count, value_offset):
        """Return the long int stored in the value field of this entry, 8 bytes past
        `offset`.

        Only a single value is supported at present; a placeholder string is returned
        for any other `value_count`.
        """
        if value_count != 1:  # pragma: no cover
            return "Multi-value long integer NOT IMPLEMENTED"
        return stream_rdr.read_long(offset, 8)
class _RationalIfdEntry(_IfdEntry):
    """IFD entry expressed as a numerator, denominator pair."""

    @classmethod
    def _parse_value(cls, stream_rdr, offset, value_count, value_offset):
        """Return the rational (numerator / denominator) value at `value_offset` in
        `stream_rdr` as a floating-point number.

        The numerator is the long int at `value_offset` and the denominator is the
        long int 4 bytes after it. 0.0 is returned when the denominator is zero.

        Only supports single values at present; a placeholder string is returned for
        any other `value_count`.
        """
        if value_count == 1:
            numerator = stream_rdr.read_long(value_offset)
            denominator = stream_rdr.read_long(value_offset, 4)
            # -- a zero denominator has no defined value; report 0.0 rather than
            # -- raising so that one malformed tag does not abort parsing of every
            # -- other entry in the directory --
            if denominator == 0:
                return 0.0
            return numerator / denominator
        else:  # pragma: no cover
            return "Multi-value Rational NOT IMPLEMENTED"

View File

@@ -0,0 +1,306 @@
"""Constant values related to the Open Packaging Convention.
In particular it includes content types and relationship types.
"""
class CONTENT_TYPE:
    """Content type URIs (like MIME-types) that specify a part's format.

    Names are grouped by prefix, matching a segment of the value: `DML_` for
    "drawingml", `OFC_` for general "officedocument" types, `OPC_` for
    "openxmlformats-package", `PML_` for "presentationml", `SML_` for "spreadsheetml"
    and `WML_` for "wordprocessingml". Unprefixed names are image types and other
    general-purpose types.
    """

    BMP = "image/bmp"
    DML_CHART = "application/vnd.openxmlformats-officedocument.drawingml.chart+xml"
    DML_CHARTSHAPES = "application/vnd.openxmlformats-officedocument.drawingml.chartshapes+xml"
    DML_DIAGRAM_COLORS = "application/vnd.openxmlformats-officedocument.drawingml.diagramColors+xml"
    DML_DIAGRAM_DATA = "application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml"
    DML_DIAGRAM_LAYOUT = "application/vnd.openxmlformats-officedocument.drawingml.diagramLayout+xml"
    DML_DIAGRAM_STYLE = "application/vnd.openxmlformats-officedocument.drawingml.diagramStyle+xml"
    GIF = "image/gif"
    JPEG = "image/jpeg"
    MS_PHOTO = "image/vnd.ms-photo"
    OFC_CUSTOM_PROPERTIES = "application/vnd.openxmlformats-officedocument.custom-properties+xml"
    OFC_CUSTOM_XML_PROPERTIES = (
        "application/vnd.openxmlformats-officedocument.customXmlProperties+xml"
    )
    OFC_DRAWING = "application/vnd.openxmlformats-officedocument.drawing+xml"
    OFC_EXTENDED_PROPERTIES = (
        "application/vnd.openxmlformats-officedocument.extended-properties+xml"
    )
    OFC_OLE_OBJECT = "application/vnd.openxmlformats-officedocument.oleObject"
    OFC_PACKAGE = "application/vnd.openxmlformats-officedocument.package"
    OFC_THEME = "application/vnd.openxmlformats-officedocument.theme+xml"
    OFC_THEME_OVERRIDE = "application/vnd.openxmlformats-officedocument.themeOverride+xml"
    OFC_VML_DRAWING = "application/vnd.openxmlformats-officedocument.vmlDrawing"
    OPC_CORE_PROPERTIES = "application/vnd.openxmlformats-package.core-properties+xml"
    OPC_DIGITAL_SIGNATURE_CERTIFICATE = (
        "application/vnd.openxmlformats-package.digital-signature-certificate"
    )
    OPC_DIGITAL_SIGNATURE_ORIGIN = "application/vnd.openxmlformats-package.digital-signature-origin"
    OPC_DIGITAL_SIGNATURE_XMLSIGNATURE = (
        "application/vnd.openxmlformats-package.digital-signature-xmlsignature+xml"
    )
    OPC_RELATIONSHIPS = "application/vnd.openxmlformats-package.relationships+xml"
    PML_COMMENTS = "application/vnd.openxmlformats-officedocument.presentationml.comments+xml"
    PML_COMMENT_AUTHORS = (
        "application/vnd.openxmlformats-officedocument.presentationml.commentAuthors+xml"
    )
    PML_HANDOUT_MASTER = (
        "application/vnd.openxmlformats-officedocument.presentationml.handoutMaster+xml"
    )
    PML_NOTES_MASTER = (
        "application/vnd.openxmlformats-officedocument.presentationml.notesMaster+xml"
    )
    PML_NOTES_SLIDE = "application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml"
    PML_PRESENTATION_MAIN = (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"
    )
    PML_PRES_PROPS = "application/vnd.openxmlformats-officedocument.presentationml.presProps+xml"
    PML_PRINTER_SETTINGS = (
        "application/vnd.openxmlformats-officedocument.presentationml.printerSettings"
    )
    PML_SLIDE = "application/vnd.openxmlformats-officedocument.presentationml.slide+xml"
    PML_SLIDESHOW_MAIN = (
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow.main+xml"
    )
    PML_SLIDE_LAYOUT = (
        "application/vnd.openxmlformats-officedocument.presentationml.slideLayout+xml"
    )
    PML_SLIDE_MASTER = (
        "application/vnd.openxmlformats-officedocument.presentationml.slideMaster+xml"
    )
    PML_SLIDE_UPDATE_INFO = (
        "application/vnd.openxmlformats-officedocument.presentationml.slideUpdateInfo+xml"
    )
    PML_TABLE_STYLES = (
        "application/vnd.openxmlformats-officedocument.presentationml.tableStyles+xml"
    )
    PML_TAGS = "application/vnd.openxmlformats-officedocument.presentationml.tags+xml"
    PML_TEMPLATE_MAIN = (
        "application/vnd.openxmlformats-officedocument.presentationml.template.main+xml"
    )
    PML_VIEW_PROPS = "application/vnd.openxmlformats-officedocument.presentationml.viewProps+xml"
    PNG = "image/png"
    SML_CALC_CHAIN = "application/vnd.openxmlformats-officedocument.spreadsheetml.calcChain+xml"
    SML_CHARTSHEET = "application/vnd.openxmlformats-officedocument.spreadsheetml.chartsheet+xml"
    SML_COMMENTS = "application/vnd.openxmlformats-officedocument.spreadsheetml.comments+xml"
    SML_CONNECTIONS = "application/vnd.openxmlformats-officedocument.spreadsheetml.connections+xml"
    SML_CUSTOM_PROPERTY = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.customProperty"
    )
    SML_DIALOGSHEET = "application/vnd.openxmlformats-officedocument.spreadsheetml.dialogsheet+xml"
    SML_EXTERNAL_LINK = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.externalLink+xml"
    )
    SML_PIVOT_CACHE_DEFINITION = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.pivotCacheDefinition+xml"
    )
    SML_PIVOT_CACHE_RECORDS = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.pivotCacheRecords+xml"
    )
    SML_PIVOT_TABLE = "application/vnd.openxmlformats-officedocument.spreadsheetml.pivotTable+xml"
    SML_PRINTER_SETTINGS = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.printerSettings"
    )
    SML_QUERY_TABLE = "application/vnd.openxmlformats-officedocument.spreadsheetml.queryTable+xml"
    SML_REVISION_HEADERS = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.revisionHeaders+xml"
    )
    SML_REVISION_LOG = "application/vnd.openxmlformats-officedocument.spreadsheetml.revisionLog+xml"
    SML_SHARED_STRINGS = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml"
    )
    SML_SHEET = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    SML_SHEET_MAIN = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"
    SML_SHEET_METADATA = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheetMetadata+xml"
    )
    SML_STYLES = "application/vnd.openxmlformats-officedocument.spreadsheetml.styles+xml"
    SML_TABLE = "application/vnd.openxmlformats-officedocument.spreadsheetml.table+xml"
    SML_TABLE_SINGLE_CELLS = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.tableSingleCells+xml"
    )
    SML_TEMPLATE_MAIN = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.template.main+xml"
    )
    SML_USER_NAMES = "application/vnd.openxmlformats-officedocument.spreadsheetml.userNames+xml"
    SML_VOLATILE_DEPENDENCIES = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.volatileDependencies+xml"
    )
    SML_WORKSHEET = "application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"
    TIFF = "image/tiff"
    WML_COMMENTS = "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml"
    WML_DOCUMENT = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    WML_DOCUMENT_GLOSSARY = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml"
    )
    WML_DOCUMENT_MAIN = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"
    )
    WML_ENDNOTES = "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml"
    WML_FONT_TABLE = "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml"
    WML_FOOTER = "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml"
    WML_FOOTNOTES = "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml"
    WML_HEADER = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"
    WML_NUMBERING = "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml"
    WML_PRINTER_SETTINGS = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.printerSettings"
    )
    WML_SETTINGS = "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml"
    WML_STYLES = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"
    WML_WEB_SETTINGS = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml"
    )
    XML = "application/xml"
    X_EMF = "image/x-emf"
    X_FONTDATA = "application/x-fontdata"
    X_FONT_TTF = "application/x-font-ttf"
    X_WMF = "image/x-wmf"
class NAMESPACE:
    """XML namespace URIs used when reading and writing OPC package XML."""

    # -- DrawingML namespace for drawings placed in a word-processing document --
    DML_WORDPROCESSING_DRAWING = (
        "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
    )
    # -- relationship namespaces: `OFC_` is the officeDocument one, `OPC_` the package one --
    OFC_RELATIONSHIPS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    OPC_RELATIONSHIPS = "http://schemas.openxmlformats.org/package/2006/relationships"
    OPC_CONTENT_TYPES = "http://schemas.openxmlformats.org/package/2006/content-types"
    WML_MAIN = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
class RELATIONSHIP_TARGET_MODE:
    """The two target-mode values an Open XML relationship can have."""

    EXTERNAL = "External"
    INTERNAL = "Internal"
class RELATIONSHIP_TYPE:
    """Relationship type URIs, each identifying the kind of a relationship.

    Most values are in the ".../officeDocument/2006/relationships/" namespace; the
    digital-signature and metadata ones are in ".../package/2006/relationships/".
    """

    AUDIO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio"
    A_F_CHUNK = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/aFChunk"
    CALC_CHAIN = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/calcChain"
    CERTIFICATE = (
        "http://schemas.openxmlformats.org/package/2006/relationships/digital-signature/certificate"
    )
    CHART = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/chart"
    CHARTSHEET = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/chartsheet"
    CHART_USER_SHAPES = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/chartUserShapes"
    )
    COMMENTS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"
    COMMENT_AUTHORS = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/commentAuthors"
    )
    CONNECTIONS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/connections"
    CONTROL = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/control"
    CORE_PROPERTIES = (
        "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties"
    )
    CUSTOM_PROPERTIES = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties"
    )
    CUSTOM_PROPERTY = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/customProperty"
    )
    CUSTOM_XML = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXml"
    CUSTOM_XML_PROPS = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXmlProps"
    )
    DIAGRAM_COLORS = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramColors"
    )
    DIAGRAM_DATA = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData"
    DIAGRAM_LAYOUT = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramLayout"
    )
    DIAGRAM_QUICK_STYLE = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramQuickStyle"
    )
    DIALOGSHEET = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/dialogsheet"
    DRAWING = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/drawing"
    ENDNOTES = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes"
    EXTENDED_PROPERTIES = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties"
    )
    EXTERNAL_LINK = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/externalLink"
    )
    FONT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/font"
    FONT_TABLE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable"
    FOOTER = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer"
    FOOTNOTES = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes"
    GLOSSARY_DOCUMENT = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/glossaryDocument"
    )
    HANDOUT_MASTER = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/handoutMaster"
    )
    HEADER = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/header"
    HYPERLINK = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
    IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"
    NOTES_MASTER = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesMaster"
    NOTES_SLIDE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide"
    NUMBERING = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering"
    OFFICE_DOCUMENT = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"
    )
    OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject"
    ORIGIN = "http://schemas.openxmlformats.org/package/2006/relationships/digital-signature/origin"
    PACKAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/package"
    PIVOT_CACHE_DEFINITION = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotCacheDefinition"
    )
    # NOTE(review): unlike PIVOT_CACHE_DEFINITION and the other values here, this URI
    # has an extra "/spreadsheetml" path segment -- confirm against the specification.
    PIVOT_CACHE_RECORDS = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
        "/spreadsheetml/pivotCacheRecords"
    )
    PIVOT_TABLE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotTable"
    PRES_PROPS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/presProps"
    PRINTER_SETTINGS = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/printerSettings"
    )
    QUERY_TABLE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/queryTable"
    REVISION_HEADERS = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/revisionHeaders"
    )
    REVISION_LOG = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/revisionLog"
    SETTINGS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings"
    SHARED_STRINGS = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings"
    )
    SHEET_METADATA = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sheetMetadata"
    )
    SIGNATURE = (
        "http://schemas.openxmlformats.org/package/2006/relationships/digital-signature/signature"
    )
    SLIDE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide"
    SLIDE_LAYOUT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideLayout"
    SLIDE_MASTER = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideMaster"
    SLIDE_UPDATE_INFO = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideUpdateInfo"
    )
    STYLES = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"
    TABLE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/table"
    TABLE_SINGLE_CELLS = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/tableSingleCells"
    )
    TABLE_STYLES = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/tableStyles"
    TAGS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/tags"
    THEME = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme"
    THEME_OVERRIDE = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/themeOverride"
    )
    THUMBNAIL = "http://schemas.openxmlformats.org/package/2006/relationships/metadata/thumbnail"
    USERNAMES = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/usernames"
    VIDEO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/video"
    VIEW_PROPS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/viewProps"
    VML_DRAWING = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/vmlDrawing"
    VOLATILE_DEPENDENCIES = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/volatileDependencies"
    )
    WEB_SETTINGS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings"
    WORKSHEET_SOURCE = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheetSource"
    )
    XML_MAPS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/xmlMaps"

View File

@@ -0,0 +1,142 @@
"""Provides CoreProperties, Dublin-Core attributes of the document.
These are broadly-standardized attributes like author, last-modified, etc.
"""
from __future__ import annotations
import datetime as dt
from typing import TYPE_CHECKING
from docx.oxml.coreprops import CT_CoreProperties
if TYPE_CHECKING:
from docx.oxml.coreprops import CT_CoreProperties
class CoreProperties:
    """Proxy for the core document properties of a package, the content of the part
    named ``/docProps/core.xml``.

    Every property here reads from and writes to a like-named attribute of the
    wrapped `CT_CoreProperties` element.
    """

    def __init__(self, element: CT_CoreProperties):
        self._element = element

    # -- text properties -------------------------------------------------------

    @property
    def author(self):
        """Value of `author_text` on the underlying element."""
        return self._element.author_text

    @author.setter
    def author(self, value: str) -> None:
        self._element.author_text = value

    @property
    def category(self):
        """Value of `category_text` on the underlying element."""
        return self._element.category_text

    @category.setter
    def category(self, value: str) -> None:
        self._element.category_text = value

    @property
    def comments(self):
        """Value of `comments_text` on the underlying element."""
        return self._element.comments_text

    @comments.setter
    def comments(self, value: str) -> None:
        self._element.comments_text = value

    @property
    def content_status(self):
        """Value of `contentStatus_text` on the underlying element."""
        return self._element.contentStatus_text

    @content_status.setter
    def content_status(self, value: str) -> None:
        self._element.contentStatus_text = value

    @property
    def created(self):
        """Value of `created_datetime` on the underlying element."""
        return self._element.created_datetime

    @created.setter
    def created(self, value: dt.datetime) -> None:
        self._element.created_datetime = value

    @property
    def identifier(self):
        """Value of `identifier_text` on the underlying element."""
        return self._element.identifier_text

    @identifier.setter
    def identifier(self, value: str) -> None:
        self._element.identifier_text = value

    @property
    def keywords(self):
        """Value of `keywords_text` on the underlying element."""
        return self._element.keywords_text

    @keywords.setter
    def keywords(self, value: str) -> None:
        self._element.keywords_text = value

    @property
    def language(self):
        """Value of `language_text` on the underlying element."""
        return self._element.language_text

    @language.setter
    def language(self, value: str) -> None:
        self._element.language_text = value

    @property
    def last_modified_by(self):
        """Value of `lastModifiedBy_text` on the underlying element."""
        return self._element.lastModifiedBy_text

    @last_modified_by.setter
    def last_modified_by(self, value: str) -> None:
        self._element.lastModifiedBy_text = value

    @property
    def last_printed(self):
        """Value of `lastPrinted_datetime` on the underlying element."""
        return self._element.lastPrinted_datetime

    @last_printed.setter
    def last_printed(self, value: dt.datetime) -> None:
        self._element.lastPrinted_datetime = value

    @property
    def modified(self):
        """Value of `modified_datetime` on the underlying element."""
        return self._element.modified_datetime

    @modified.setter
    def modified(self, value: dt.datetime) -> None:
        self._element.modified_datetime = value

    @property
    def revision(self):
        """Value of `revision_number` on the underlying element."""
        return self._element.revision_number

    @revision.setter
    def revision(self, value: int) -> None:
        self._element.revision_number = value

    @property
    def subject(self):
        """Value of `subject_text` on the underlying element."""
        return self._element.subject_text

    @subject.setter
    def subject(self, value: str) -> None:
        self._element.subject_text = value

    @property
    def title(self):
        """Value of `title_text` on the underlying element."""
        return self._element.title_text

    @title.setter
    def title(self, value: str) -> None:
        self._element.title_text = value

    @property
    def version(self):
        """Value of `version_text` on the underlying element."""
        return self._element.version_text

    @version.setter
    def version(self, value: str) -> None:
        self._element.version_text = value

View File

@@ -0,0 +1,12 @@
"""Exceptions specific to python-opc.
The base exception class is OpcError.
"""
class OpcError(Exception):
    """Root of the python-opc exception hierarchy."""
class PackageNotFoundError(OpcError):
    """Raised when no package can be found at the path specified."""

View File

@@ -0,0 +1,247 @@
# pyright: reportPrivateUsage=false
"""Temporary stand-in for main oxml module.
This module came across with the PackageReader transplant. Probably much will get
replaced with objects from the pptx.oxml.core and then this module will either get
deleted or only hold the package related custom element classes.
"""
from __future__ import annotations
from typing import cast
from lxml import etree
from docx.opc.constants import NAMESPACE as NS
from docx.opc.constants import RELATIONSHIP_TARGET_MODE as RTM
# configure XML parser
# -- lookup in which the custom element classes are registered, by namespace and tag,
# -- at the bottom of this module --
element_class_lookup = etree.ElementNamespaceClassLookup()
# -- parser used by `parse_xml()`; configured to remove blank text and to leave
# -- entities unresolved --
oxml_parser = etree.XMLParser(remove_blank_text=True, resolve_entities=False)
oxml_parser.set_element_class_lookup(element_class_lookup)

# -- namespace prefix => namespace URI; these are the prefixes `qn()` accepts --
nsmap = {
    "ct": NS.OPC_CONTENT_TYPES,
    "pr": NS.OPC_RELATIONSHIPS,
    "r": NS.OFC_RELATIONSHIPS,
}
# ===========================================================================
# functions
# ===========================================================================
def parse_xml(text: str) -> etree._Element:
    """Parse `text` into an element using the module's `oxml_parser`.

    A drop-in replacement for `etree.fromstring()`.
    """
    root = etree.fromstring(text, parser=oxml_parser)
    return root
def qn(tag: str) -> str:
    """Return the Clark-notation qualified name for namespace-prefixed `tag`.

    "qn" stands for "qualified name". The prefix before the colon is looked up in
    `nsmap` and the result has the form lxml expects, e.g. ``qn('p:cSld')`` returns
    ``'{http://schemas.../main}cSld'``.
    """
    prefix, local_name = tag.split(":")
    namespace_uri = nsmap[prefix]
    return f"{{{namespace_uri}}}{local_name}"
def serialize_part_xml(part_elm: etree._Element) -> bytes:
    """Return `part_elm` serialized to XML in the form stored as an XML part.

    The output is UTF-8 encoded and produced with `standalone=True`; it is not
    pretty-printed, so no whitespace is added for readability.
    """
    xml_bytes = etree.tostring(part_elm, encoding="UTF-8", standalone=True)
    return xml_bytes
def serialize_for_reading(element: etree._Element) -> str:
    """Return `element` as pretty-printed XML text, a form convenient in tests.

    The result is a `str` and carries no XML declaration.
    """
    xml_text = etree.tostring(element, encoding="unicode", pretty_print=True)
    return xml_text
# ===========================================================================
# Custom element classes
# ===========================================================================
class BaseOxmlElement(etree.ElementBase):
    """Common base for every custom element class, the one place to add behavior all
    of them share."""

    @property
    def xml(self) -> str:
        """XML text for this element, in a form convenient for tests.

        It is pretty-printed and has no XML declaration at the top.
        """
        return etree.tostring(self, encoding="unicode", pretty_print=True)
class CT_Default(BaseOxmlElement):
    """`<Default>` element found in the `[Content_Types].xml` part.

    It names the content type to apply by default to any part having a given extension.
    """

    @property
    def content_type(self):
        """Value of the ``ContentType`` attribute of this ``<Default>`` element."""
        return self.get("ContentType")

    @property
    def extension(self):
        """Value of the ``Extension`` attribute of this ``<Default>`` element."""
        return self.get("Extension")

    @staticmethod
    def new(ext: str, content_type: str):
        """Return a new ``<Default>`` element whose ``Extension`` and ``ContentType``
        attributes are set from `ext` and `content_type`."""
        default_elm = parse_xml('<Default xmlns="%s"/>' % nsmap["ct"])
        default_elm.set("Extension", ext)
        default_elm.set("ContentType", content_type)
        return default_elm
class CT_Override(BaseOxmlElement):
    """``<Override>`` element, naming the content type to apply to the part having a
    particular partname."""

    @property
    def content_type(self):
        """Value of the ``ContentType`` attribute of this ``<Override>`` element."""
        return self.get("ContentType")

    @staticmethod
    def new(partname, content_type):
        """Return a new ``<Override>`` element whose ``PartName`` and ``ContentType``
        attributes are set from `partname` and `content_type`."""
        override_elm = parse_xml('<Override xmlns="%s"/>' % nsmap["ct"])
        override_elm.set("PartName", partname)
        override_elm.set("ContentType", content_type)
        return override_elm

    @property
    def partname(self):
        """Value of the ``PartName`` attribute of this ``<Override>`` element."""
        return self.get("PartName")
class CT_Relationship(BaseOxmlElement):
    """`<Relationship>` element, one relationship from a source to a target part."""

    @staticmethod
    def new(rId: str, reltype: str, target: str, target_mode: str = RTM.INTERNAL):
        """Return a new ``<Relationship>`` element.

        ``Id``, ``Type`` and ``Target`` are set from `rId`, `reltype` and `target`.
        ``TargetMode`` is written only for an external relationship.
        """
        rel_elm = parse_xml('<Relationship xmlns="%s"/>' % nsmap["pr"])
        rel_elm.set("Id", rId)
        rel_elm.set("Type", reltype)
        rel_elm.set("Target", target)
        if target_mode == RTM.EXTERNAL:
            rel_elm.set("TargetMode", RTM.EXTERNAL)
        return rel_elm

    @property
    def rId(self):
        """Value of the ``Id`` attribute of this ``<Relationship>`` element."""
        return self.get("Id")

    @property
    def reltype(self):
        """Value of the ``Type`` attribute of this ``<Relationship>`` element."""
        return self.get("Type")

    @property
    def target_ref(self):
        """Value of the ``Target`` attribute of this ``<Relationship>`` element."""
        return self.get("Target")

    @property
    def target_mode(self):
        """Value of the ``TargetMode`` attribute of this ``<Relationship>`` element,
        either ``Internal`` or ``External``.

        ``Internal`` is reported when the attribute is not present.
        """
        return self.get("TargetMode", RTM.INTERNAL)
class CT_Relationships(BaseOxmlElement):
    """``<Relationships>`` element, the root element of a .rels file."""

    def add_rel(self, rId: str, reltype: str, target: str, is_external: bool = False):
        """Append a new ``<Relationship>`` child built from the parameter values.

        The child's target mode is external when `is_external` is true, internal
        otherwise.
        """
        if is_external:
            target_mode = RTM.EXTERNAL
        else:
            target_mode = RTM.INTERNAL
        self.append(CT_Relationship.new(rId, reltype, target, target_mode))

    @staticmethod
    def new() -> CT_Relationships:
        """Return a new, empty ``<Relationships>`` element."""
        rels_elm = parse_xml('<Relationships xmlns="%s"/>' % nsmap["pr"])
        return cast(CT_Relationships, rels_elm)

    @property
    def Relationship_lst(self):
        """List of all the ``<Relationship>`` child elements."""
        return self.findall(qn("pr:Relationship"))

    @property
    def xml(self):
        """XML for this element in the form saved to a .rels stream, which is not
        pretty-printed and has an XML declaration at the top."""
        return serialize_part_xml(self)
class CT_Types(BaseOxmlElement):
    """``<Types>`` element, container for the Default and Override elements in
    [Content_Types].xml."""

    @staticmethod
    def new():
        """Return a new, empty ``<Types>`` element."""
        return parse_xml('<Types xmlns="%s"/>' % nsmap["ct"])

    def add_default(self, ext, content_type):
        """Append a child ``<Default>`` element created from `ext` and `content_type`."""
        self.append(CT_Default.new(ext, content_type))

    def add_override(self, partname, content_type):
        """Append a child ``<Override>`` element created from `partname` and
        `content_type`."""
        self.append(CT_Override.new(partname, content_type))

    @property
    def defaults(self):
        """List of the ``<Default>`` child elements of this element."""
        return self.findall(qn("ct:Default"))

    @property
    def overrides(self):
        """List of the ``<Override>`` child elements of this element."""
        return self.findall(qn("ct:Override"))
# -- associate element tag names in the "ct" namespace (the one used by `CT_Types.new()`)
# -- with their custom element classes --
ct_namespace = element_class_lookup.get_namespace(nsmap["ct"])
ct_namespace["Default"] = CT_Default
ct_namespace["Override"] = CT_Override
ct_namespace["Types"] = CT_Types
# -- same for the "pr" namespace used by the `<Relationship>`/`<Relationships>` elements --
pr_namespace = element_class_lookup.get_namespace(nsmap["pr"])
pr_namespace["Relationship"] = CT_Relationship
pr_namespace["Relationships"] = CT_Relationships

View File

@@ -0,0 +1,219 @@
"""Objects that implement reading and writing OPC packages."""
from __future__ import annotations
from typing import IO, TYPE_CHECKING, Iterator, cast
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx.opc.packuri import PACKAGE_URI, PackURI
from docx.opc.part import PartFactory
from docx.opc.parts.coreprops import CorePropertiesPart
from docx.opc.pkgreader import PackageReader
from docx.opc.pkgwriter import PackageWriter
from docx.opc.rel import Relationships
from docx.shared import lazyproperty
if TYPE_CHECKING:
from typing_extensions import Self
from docx.opc.coreprops import CoreProperties
from docx.opc.part import Part
from docx.opc.rel import _Relationship # pyright: ignore[reportPrivateUsage]
class OpcPackage:
    """Main API class for |python-opc|.

    A new instance is constructed by calling the :meth:`open` class method with a path
    to a package file or file-like object containing one.
    """

    def after_unmarshal(self):
        """Entry point for any post-unmarshaling processing.

        May be overridden by subclasses without forwarding call to super.
        """
        # -- intentionally empty, only catches the call when a subclass doesn't override --
        pass

    @property
    def core_properties(self) -> CoreProperties:
        """|CoreProperties| object providing read/write access to the Dublin Core
        properties for this document."""
        return self._core_properties_part.core_properties

    def iter_rels(self) -> Iterator[_Relationship]:
        """Generate exactly one reference to each relationship in the package by
        performing a depth-first traversal of the rels graph."""

        def walk_rels(
            source: OpcPackage | Part, visited: list[Part] | None = None
        ) -> Iterator[_Relationship]:
            visited = [] if visited is None else visited
            for rel in source.rels.values():
                # -- every relationship is reported, including external ones and ones
                # -- pointing at an already-visited part --
                yield rel
                if rel.is_external:
                    continue
                part = rel.target_part
                if part in visited:
                    continue
                visited.append(part)
                yield from walk_rels(part, visited)

        yield from walk_rels(self)

    def iter_parts(self) -> Iterator[Part]:
        """Generate exactly one reference to each of the parts in the package by
        performing a depth-first traversal of the rels graph."""

        def walk_parts(
            source: OpcPackage | Part, visited: list[Part] | None = None
        ) -> Iterator[Part]:
            # -- `None` sentinel rather than a mutable `[]` default, matching `walk_rels()`
            # -- above; the same list is threaded through every recursive call --
            visited = [] if visited is None else visited
            for rel in source.rels.values():
                if rel.is_external:
                    continue
                part = rel.target_part
                if part in visited:
                    continue
                visited.append(part)
                yield part
                yield from walk_parts(part, visited)

        yield from walk_parts(self)

    def load_rel(self, reltype: str, target: Part | str, rId: str, is_external: bool = False):
        """Return newly added |_Relationship| instance of `reltype` between this part
        and `target` with key `rId`.

        Target mode is set to ``RTM.EXTERNAL`` if `is_external` is |True|. Intended for
        use during load from a serialized package, where the rId is well known. Other
        methods exist for adding a new relationship to the package during processing.
        """
        return self.rels.add_relationship(reltype, target, rId, is_external)

    @property
    def main_document_part(self):
        """Return a reference to the main document part for this package.

        Examples include a document part for a WordprocessingML package, a presentation
        part for a PresentationML package, or a workbook part for a SpreadsheetML
        package.
        """
        return self.part_related_by(RT.OFFICE_DOCUMENT)

    def next_partname(self, template: str) -> PackURI:
        """Return a |PackURI| instance representing partname matching `template`.

        The returned part-name has the next available numeric suffix to distinguish it
        from other parts of its type. `template` is a printf (%)-style template string
        containing a single replacement item, a '%d' to be used to insert the integer
        portion of the partname. Example: "/word/header%d.xml"
        """
        partnames = {part.partname for part in self.iter_parts()}
        # -- `len(partnames) + 1` candidates are tried, so at least one of them cannot
        # -- already be in use --
        for n in range(1, len(partnames) + 2):
            candidate_partname = template % n
            if candidate_partname not in partnames:
                return PackURI(candidate_partname)

    @classmethod
    def open(cls, pkg_file: str | IO[bytes]) -> Self:
        """Return an |OpcPackage| instance loaded with the contents of `pkg_file`."""
        pkg_reader = PackageReader.from_file(pkg_file)
        package = cls()
        Unmarshaller.unmarshal(pkg_reader, package, PartFactory)
        return package

    def part_related_by(self, reltype: str) -> Part:
        """Return part to which this package has a relationship of `reltype`.

        Raises |KeyError| if no such relationship is found and |ValueError| if more than
        one such relationship is found.
        """
        return self.rels.part_with_reltype(reltype)

    @property
    def parts(self) -> list[Part]:
        """Return a list containing a reference to each of the parts in this package."""
        return list(self.iter_parts())

    def relate_to(self, part: Part, reltype: str):
        """Return rId key of new or existing relationship to `part`.

        If a relationship of `reltype` to `part` already exists, its rId is returned. Otherwise a
        new relationship is created and that rId is returned.
        """
        rel = self.rels.get_or_add(reltype, part)
        return rel.rId

    @lazyproperty
    def rels(self):
        """Return a reference to the |Relationships| instance holding the collection of
        relationships for this package."""
        return Relationships(PACKAGE_URI.baseURI)

    def save(self, pkg_file: str | IO[bytes]):
        """Save this package to `pkg_file`.

        `pkg_file` can be either a file-path or a file-like object.
        """
        # -- give each part its pre-serialization hook before anything is written --
        for part in self.parts:
            part.before_marshal()
        # -- the part list is gathered again here, after the hooks have run --
        PackageWriter.write(pkg_file, self.rels, self.parts)

    @property
    def _core_properties_part(self) -> CorePropertiesPart:
        """|CorePropertiesPart| object related to this package.

        Creates a default core properties part if one is not present (not common).
        """
        try:
            return cast(CorePropertiesPart, self.part_related_by(RT.CORE_PROPERTIES))
        except KeyError:
            # -- no such relationship yet; create the part and relate it to the package
            # -- so the next access finds it --
            core_properties_part = CorePropertiesPart.default(self)
            self.relate_to(core_properties_part, RT.CORE_PROPERTIES)
            return core_properties_part
class Unmarshaller:
    """Hosts static methods for unmarshalling a package from a |PackageReader|."""

    @staticmethod
    def unmarshal(pkg_reader, package, part_factory):
        """Build the graph of parts and relationships described by `pkg_reader`.

        Each part is constructed by calling `part_factory`. Relationships are loaded
        onto their source, which is `package` for package-level relationships. After
        loading, `.after_unmarshal()` is called on every part and then on `package`.
        """
        parts_by_name = Unmarshaller._unmarshal_parts(pkg_reader, package, part_factory)
        Unmarshaller._unmarshal_relationships(pkg_reader, package, parts_by_name)
        for loaded_part in parts_by_name.values():
            loaded_part.after_unmarshal()
        package.after_unmarshal()

    @staticmethod
    def _unmarshal_parts(pkg_reader, package, part_factory):
        """Return a dict of the parts in `pkg_reader`, keyed by partname.

        Each value is whatever `part_factory` returns when called with
        `(partname, content_type, reltype, blob, package)`.
        """
        return {
            partname: part_factory(partname, content_type, reltype, blob, package)
            for partname, content_type, reltype, blob in pkg_reader.iter_sparts()
        }

    @staticmethod
    def _unmarshal_relationships(pkg_reader, package, parts):
        """Load each relationship in `pkg_reader` onto its source object.

        The source is `package` when the source URI is "/", otherwise the part in
        `parts` with that partname. The target is the target-ref string for an external
        relationship and the actual part from `parts` for an internal one.
        """
        for source_uri, srel in pkg_reader.iter_srels():
            if source_uri == "/":
                source = package
            else:
                source = parts[source_uri]
            if srel.is_external:
                target = srel.target_ref
            else:
                target = parts[srel.target_partname]
            source.load_rel(srel.reltype, target, srel.rId, srel.is_external)

View File

@@ -0,0 +1,109 @@
"""Provides the PackURI value type.
Also some useful known pack URI strings such as PACKAGE_URI.
"""
from __future__ import annotations
import posixpath
import re
class PackURI(str):
    """Provides access to pack URI components such as the baseURI and the filename slice.

    Behaves as |str| otherwise.
    """

    # -- group 1 is the leading run of letters, optional group 2 the numeric suffix --
    _filename_re = re.compile("([a-zA-Z]+)([1-9][0-9]*)?")

    def __new__(cls, pack_uri_str: str):
        """Return a new |PackURI| for `pack_uri_str`.

        Raises |ValueError| if `pack_uri_str` does not begin with a slash, which
        includes the empty string.
        """
        # -- `startswith()` rather than `pack_uri_str[0]` so an empty string produces
        # -- the ValueError below instead of an IndexError --
        if not pack_uri_str.startswith("/"):
            tmpl = "PackURI must begin with slash, got '%s'"
            raise ValueError(tmpl % pack_uri_str)
        return str.__new__(cls, pack_uri_str)

    @staticmethod
    def from_rel_ref(baseURI: str, relative_ref: str) -> "PackURI":
        """The absolute PackURI formed by translating `relative_ref` onto `baseURI`."""
        joined_uri = posixpath.join(baseURI, relative_ref)
        # -- `abspath()` collapses any '..' segments in the joined reference --
        abs_uri = posixpath.abspath(joined_uri)
        return PackURI(abs_uri)

    @property
    def baseURI(self) -> str:
        """The base URI of this pack URI, the directory portion, roughly speaking.

        E.g. ``'/ppt/slides'`` for ``'/ppt/slides/slide1.xml'``. For the package pseudo-
        partname '/', baseURI is '/'.
        """
        return posixpath.split(self)[0]

    @property
    def ext(self) -> str:
        """The extension portion of this pack URI, e.g. ``'xml'`` for ``'/word/document.xml'``.

        Note the period is not included.
        """
        # -- raw_ext is either empty string or starts with period, e.g. '.xml' --
        raw_ext = posixpath.splitext(self)[1]
        return raw_ext[1:] if raw_ext.startswith(".") else raw_ext

    @property
    def filename(self):
        """The "filename" portion of this pack URI, e.g. ``'slide1.xml'`` for
        ``'/ppt/slides/slide1.xml'``.

        For the package pseudo-partname '/', filename is ''.
        """
        return posixpath.split(self)[1]

    @property
    def idx(self):
        """Return partname index as integer for tuple partname or None for singleton
        partname, e.g. ``21`` for ``'/ppt/slides/slide21.xml'`` and |None| for
        ``'/ppt/presentation.xml'``."""
        filename = self.filename
        if not filename:
            return None
        name_part = posixpath.splitext(filename)[0]  # filename w/ext removed
        match = self._filename_re.match(name_part)
        # -- no match at all, or a match without a numeric suffix, means "no index" --
        if match is None or not match.group(2):
            return None
        return int(match.group(2))

    @property
    def membername(self):
        """The pack URI with the leading slash stripped off, the form used as the Zip
        file membername for the package item.

        Returns '' for the package pseudo-partname '/'.
        """
        return self[1:]

    def relative_ref(self, baseURI: str):
        """Return string containing relative reference to package item from `baseURI`.

        E.g. PackURI('/ppt/slideLayouts/slideLayout1.xml') would return
        '../slideLayouts/slideLayout1.xml' for baseURI '/ppt/slides'.
        """
        # workaround for posixpath bug in 2.6, doesn't generate correct
        # relative path when `start` (second) parameter is root ('/')
        return self[1:] if baseURI == "/" else posixpath.relpath(self, baseURI)

    @property
    def rels_uri(self):
        """The pack URI of the .rels part corresponding to the current pack URI.

        Only produces sensible output if the pack URI is a partname or the package
        pseudo-partname '/'.
        """
        rels_filename = "%s.rels" % self.filename
        rels_uri_str = posixpath.join(self.baseURI, "_rels", rels_filename)
        return PackURI(rels_uri_str)
# -- pack URI of the package itself, the pseudo-partname '/' --
PACKAGE_URI = PackURI("/")
# -- pack URI of the `[Content_Types].xml` item at the package root --
CONTENT_TYPES_URI = PackURI("/[Content_Types].xml")

View File

@@ -0,0 +1,247 @@
# pyright: reportImportCycles=false
"""Open Packaging Convention (OPC) objects related to package parts."""
from __future__ import annotations
from typing import TYPE_CHECKING, Callable, Type, cast
from docx.opc.oxml import serialize_part_xml
from docx.opc.packuri import PackURI
from docx.opc.rel import Relationships
from docx.opc.shared import cls_method_fn
from docx.oxml.parser import parse_xml
from docx.shared import lazyproperty
if TYPE_CHECKING:
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.package import Package
class Part:
    """Base class for package parts.

    Supplies the properties and methods common to all parts. Client code is expected to
    subclass it to implement the behaviors of a specific part type.
    """

    def __init__(
        self,
        partname: PackURI,
        content_type: str,
        blob: bytes | None = None,
        package: Package | None = None,
    ):
        super().__init__()
        self._partname = partname
        self._content_type = content_type
        self._blob = blob
        self._package = package

    def after_unmarshal(self):
        """Hook called after unmarshaling, for example to parse the part XML.

        A subclass may override it without calling super.
        """
        # -- intentionally empty, only catches the call when a subclass doesn't override --
        pass

    def before_marshal(self):
        """Hook called before serialization, for example to finalize part naming.

        A subclass may override it without calling super.
        """
        # -- intentionally empty, only catches the call when a subclass doesn't override --
        pass

    @property
    def blob(self) -> bytes:
        """Contents of this package part as bytes.

        This base implementation returns the blob provided on construction, or empty
        bytes when that is |None| or empty. Intended to be overridden by subclasses.
        """
        return self._blob or b""

    @property
    def content_type(self):
        """Content type of this part."""
        return self._content_type

    def drop_rel(self, rId: str):
        """Remove the relationship identified by `rId` unless it is referenced two or
        more times.

        Relationships with a reference count of 0 are implicit relationships.
        """
        if self._rel_ref_count(rId) >= 2:
            return
        del self.rels[rId]

    @classmethod
    def load(cls, partname: PackURI, content_type: str, blob: bytes, package: Package):
        """Return an instance of this class constructed from the arguments."""
        return cls(partname, content_type, blob, package)

    def load_rel(self, reltype: str, target: Part | str, rId: str, is_external: bool = False):
        """Return newly added |_Relationship| instance of `reltype`.

        The relationship links this part to `target` under key `rId`. Target mode is
        ``RTM.EXTERNAL`` when `is_external` is |True|. Meant for use while loading a
        serialized package, where the rId is already known; other methods add a new
        relationship while manipulating a part.
        """
        return self.rels.add_relationship(reltype, target, rId, is_external)

    @property
    def package(self):
        """|OpcPackage| instance this part belongs to."""
        return self._package

    @property
    def partname(self):
        """|PackURI| instance holding partname of this part, e.g.
        '/ppt/slides/slide1.xml'."""
        return self._partname

    @partname.setter
    def partname(self, partname: str):
        # -- only a |PackURI| is accepted, a plain `str` is rejected --
        if isinstance(partname, PackURI):
            self._partname = partname
            return
        tmpl = "partname must be instance of PackURI, got '%s'"
        raise TypeError(tmpl % type(partname).__name__)

    def part_related_by(self, reltype: str) -> Part:
        """Return part to which this part has a relationship of `reltype`.

        Raises |KeyError| if no such relationship is found and |ValueError| if more than
        one such relationship is found. Allows an implicitly related part to be
        resolved, such as Slide -> SlideLayout.
        """
        return self.rels.part_with_reltype(reltype)

    def relate_to(self, target: Part | str, reltype: str, is_external: bool = False) -> str:
        """Return rId key of relationship of `reltype` to `target`.

        An existing relationship's rId is returned when there is one, otherwise a new
        relationship is created and its rId returned.
        """
        if not is_external:
            return self.rels.get_or_add(reltype, cast(Part, target)).rId
        return self.rels.get_or_add_ext_rel(reltype, cast(str, target))

    @property
    def related_parts(self):
        """Dictionary mapping related parts by rId, so child objects can resolve
        explicit relationships present in the part XML, e.g. sldIdLst to a specific
        |Slide| instance."""
        return self.rels.related_parts

    @lazyproperty
    def rels(self):
        """|Relationships| instance holding the relationships for this part."""
        # -- prevent breakage in `python-docx-template` by retaining legacy `._rels` attribute --
        rels = Relationships(self._partname.baseURI)
        self._rels = rels
        return rels

    def target_ref(self, rId: str) -> str:
        """Return URL contained in target ref of relationship identified by `rId`."""
        return self.rels[rId].target_ref

    def _rel_ref_count(self, rId: str) -> int:
        """Return the count of references in this part to the relationship identified by `rId`.

        Only an XML part can contain references, so this is 0 for `Part`.
        """
        return 0
class PartFactory:
    """Provides a way for client code to specify a subclass of |Part| to be constructed
    by |Unmarshaller| based on its content type and/or a custom callable.

    Setting ``PartFactory.part_class_selector`` to a callable object will cause that
    object to be called with the parameters ``content_type, reltype``, once for each
    part in the package. If the callable returns an object, it is used as the class for
    that part. If it returns |None|, or no selector is set, part class selection falls
    back to the content type map defined in ``PartFactory.part_type_for``. If no class
    is found there either, the class contained in ``PartFactory.default_part_type`` is
    used to construct the part, which is by default |Part|.
    """

    # -- explicit `None` default: a bare annotation does not create the class attribute,
    # -- so reading it before a selector was assigned would raise AttributeError --
    part_class_selector: Callable[[str, str], Type[Part] | None] | None = None
    # -- maps content-type string to the part class to use for it --
    part_type_for: dict[str, Type[Part]] = {}
    default_part_type = Part

    def __new__(
        cls,
        partname: PackURI,
        content_type: str,
        reltype: str,
        blob: bytes,
        package: Package,
    ):
        """Return a part loaded by the class selected for `content_type` and `reltype`.

        Note the return value is the result of calling `.load()` on the selected part
        class, not a |PartFactory| instance.
        """
        PartClass: Type[Part] | None = None
        if cls.part_class_selector is not None:
            part_class_selector = cls_method_fn(cls, "part_class_selector")
            PartClass = part_class_selector(content_type, reltype)
        # -- selector absent or undecided, fall back to the content-type map --
        if PartClass is None:
            PartClass = cls._part_cls_for(content_type)
        return PartClass.load(partname, content_type, blob, package)

    @classmethod
    def _part_cls_for(cls, content_type: str):
        """Return the custom part class registered for `content_type`, or the default
        part class if no custom class is registered for `content_type`."""
        return cls.part_type_for.get(content_type, cls.default_part_type)
class XmlPart(Part):
    """Base class for package parts containing an XML payload, which is most of them.

    Extends |Part| by holding the parsed root element in place of a blob, serializing
    it on demand and counting references to relationships in the XML.
    """

    def __init__(
        self, partname: PackURI, content_type: str, element: BaseOxmlElement, package: Package
    ):
        # -- no blob is passed on, `.blob` is computed from the element instead --
        super().__init__(partname, content_type, package=package)
        self._element = element

    @property
    def blob(self):
        """Root element of this part serialized by `serialize_part_xml()`."""
        return serialize_part_xml(self._element)

    @property
    def element(self):
        """The root XML element of this XML part."""
        return self._element

    @classmethod
    def load(cls, partname: PackURI, content_type: str, blob: bytes, package: Package):
        """Return an instance of this class whose element is parsed from `blob`."""
        root_element = parse_xml(blob)
        return cls(partname, content_type, root_element, package)

    @property
    def part(self):
        """This part itself.

        Part of the parent protocol: "children" of the document do not know the part
        that contains them so must ask their parent object. That chain of delegation
        ends here.
        """
        return self

    def _rel_ref_count(self, rId: str) -> int:
        """Return the count of references in this part's XML to the relationship
        identified by `rId`."""
        referenced_rIds = cast("list[str]", self._element.xpath("//@r:id"))
        return referenced_rIds.count(rId)

View File

@@ -0,0 +1,48 @@
"""Core properties part, corresponds to ``/docProps/core.xml`` part in package."""
from __future__ import annotations
import datetime as dt
from typing import TYPE_CHECKING
from docx.opc.constants import CONTENT_TYPE as CT
from docx.opc.coreprops import CoreProperties
from docx.opc.packuri import PackURI
from docx.opc.part import XmlPart
from docx.oxml.coreprops import CT_CoreProperties
if TYPE_CHECKING:
from docx.opc.package import OpcPackage
class CorePropertiesPart(XmlPart):
    """Corresponds to part named ``/docProps/core.xml``.

    "Core" is short for "Dublin Core"; the part holds document metadata that is relatively
    common across documents of all types, not just DOCX.
    """

    @classmethod
    def default(cls, package: OpcPackage):
        """Return a new |CorePropertiesPart| with default values set for its title,
        last-modified-by, revision and modified properties.

        The modified timestamp is the current time in UTC.
        """
        part = cls._new(package)
        props = part.core_properties
        props.title = "Word Document"
        props.last_modified_by = "python-docx"
        props.revision = 1
        props.modified = dt.datetime.now(dt.timezone.utc)
        return part

    @property
    def core_properties(self):
        """A |CoreProperties| object providing read/write access to the core properties
        contained in this core properties part."""
        return CoreProperties(self.element)

    @classmethod
    def _new(cls, package: OpcPackage) -> CorePropertiesPart:
        """Return a |CorePropertiesPart| named ``/docProps/core.xml`` wrapping a new
        core-properties element."""
        return CorePropertiesPart(
            PackURI("/docProps/core.xml"),
            CT.OPC_CORE_PROPERTIES,
            CT_CoreProperties.new(),
            package,
        )

View File

@@ -0,0 +1,119 @@
"""Provides a general interface to a `physical` OPC package, such as a zip file."""
import os
from zipfile import ZIP_DEFLATED, ZipFile, is_zipfile
from docx.opc.exceptions import PackageNotFoundError
from docx.opc.packuri import CONTENT_TYPES_URI
class PhysPkgReader:
    """Factory for physical package reader objects."""

    def __new__(cls, pkg_file):
        """Return a new instance of the reader subclass suited to `pkg_file`.

        A string is treated as a path: a directory gets a |_DirPkgReader|, a zip file a
        |_ZipPkgReader|, and anything else raises |PackageNotFoundError|. A non-string
        is assumed to be a stream and gets a |_ZipPkgReader|.
        """
        if not isinstance(pkg_file, str):
            # -- assume it's a stream and leave it to the zip reader to sort out --
            reader_cls = _ZipPkgReader
        elif os.path.isdir(pkg_file):
            reader_cls = _DirPkgReader
        elif is_zipfile(pkg_file):
            reader_cls = _ZipPkgReader
        else:
            raise PackageNotFoundError("Package not found at '%s'" % pkg_file)
        return super().__new__(reader_cls)
class PhysPkgWriter:
    """Factory for physical package writer objects."""

    def __new__(cls, pkg_file):
        """Return a new |_ZipPkgWriter| instance, whatever `pkg_file` is."""
        writer_cls = _ZipPkgWriter
        return super().__new__(writer_cls)
class _DirPkgReader(PhysPkgReader):
    """Implements |PhysPkgReader| interface for an OPC package extracted into a
    directory."""

    def __init__(self, path):
        """`path` is the path to a directory containing an expanded package."""
        super().__init__()
        self._path = os.path.abspath(path)

    def blob_for(self, pack_uri):
        """Return the bytes of the file in the package directory located by the
        membername of `pack_uri`."""
        item_path = os.path.join(self._path, pack_uri.membername)
        with open(item_path, "rb") as item_file:
            return item_file.read()

    def close(self):
        """Do nothing, a directory needs no closing.

        Present for interface consistency with the zip-based reader.
        """
        pass

    @property
    def content_types_xml(self):
        """Return the `[Content_Types].xml` blob from the package."""
        return self.blob_for(CONTENT_TYPES_URI)

    def rels_xml_for(self, source_uri):
        """Return rels item XML for source with `source_uri`, or None if the rels item
        cannot be read."""
        try:
            return self.blob_for(source_uri.rels_uri)
        except IOError:
            return None
class _ZipPkgReader(PhysPkgReader):
    """Implements |PhysPkgReader| interface for a zip file OPC package."""

    def __init__(self, pkg_file):
        """Open `pkg_file`, a path or file-like object, as a zip archive for reading."""
        super().__init__()
        self._zipf = ZipFile(pkg_file, "r")

    def blob_for(self, pack_uri):
        """Return blob corresponding to `pack_uri`.

        Raises |KeyError| if no matching member is present in zip archive.
        """
        return self._zipf.read(pack_uri.membername)

    def close(self):
        """Close the zip archive, releasing any resources it is using."""
        self._zipf.close()

    @property
    def content_types_xml(self):
        """Return the `[Content_Types].xml` blob from the zip package."""
        return self.blob_for(CONTENT_TYPES_URI)

    def rels_xml_for(self, source_uri):
        """Return rels item XML for source with `source_uri` or None if no rels item is
        present."""
        try:
            return self.blob_for(source_uri.rels_uri)
        except KeyError:
            # -- a missing archive member surfaces as KeyError, meaning "no rels item" --
            return None
class _ZipPkgWriter(PhysPkgWriter):
    """Implements |PhysPkgWriter| interface for a zip file OPC package."""

    def __init__(self, pkg_file):
        """Open `pkg_file` as a new zip archive for writing, using deflate compression."""
        super().__init__()
        self._zipf = ZipFile(pkg_file, "w", compression=ZIP_DEFLATED)

    def close(self):
        """Close the zip archive, flushing any pending physical writes and releasing
        the resources it holds."""
        self._zipf.close()

    def write(self, pack_uri, blob):
        """Write `blob` to the archive under the membername of `pack_uri`."""
        member_name = pack_uri.membername
        self._zipf.writestr(member_name, blob)

View File

@@ -0,0 +1,254 @@
"""Low-level, read-only API to a serialized Open Packaging Convention (OPC) package."""
from docx.opc.constants import RELATIONSHIP_TARGET_MODE as RTM
from docx.opc.oxml import parse_xml
from docx.opc.packuri import PACKAGE_URI, PackURI
from docx.opc.phys_pkg import PhysPkgReader
from docx.opc.shared import CaseInsensitiveDict
class PackageReader:
    """Provides access to the contents of a zip-format OPC package via its
    :meth:`iter_sparts` and :meth:`iter_srels` methods."""

    def __init__(self, content_types, pkg_srels, sparts):
        # -- `content_types` is accepted but not retained; it has already been applied
        # -- to each serialized part in `sparts` by the time `from_file()` calls this --
        super().__init__()
        self._pkg_srels = pkg_srels
        self._sparts = sparts

    @staticmethod
    def from_file(pkg_file):
        """Return a |PackageReader| instance loaded with contents of `pkg_file`.

        The physical reader opened on `pkg_file` is closed before returning, including
        when loading raises.
        """
        phys_reader = PhysPkgReader(pkg_file)
        # -- `finally` so the physical reader is released even when the package is
        # -- malformed and one of the load steps raises --
        try:
            content_types = _ContentTypeMap.from_xml(phys_reader.content_types_xml)
            pkg_srels = PackageReader._srels_for(phys_reader, PACKAGE_URI)
            sparts = PackageReader._load_serialized_parts(phys_reader, pkg_srels, content_types)
        finally:
            phys_reader.close()
        return PackageReader(content_types, pkg_srels, sparts)

    def iter_sparts(self):
        """Generate a 4-tuple `(partname, content_type, reltype, blob)` for each of the
        serialized parts in the package."""
        for s in self._sparts:
            yield (s.partname, s.content_type, s.reltype, s.blob)

    def iter_srels(self):
        """Generate a 2-tuple `(source_uri, srel)` for each of the relationships in the
        package.

        Package-level relationships come first, with `PACKAGE_URI` as their source,
        followed by the relationships of each serialized part.
        """
        for srel in self._pkg_srels:
            yield (PACKAGE_URI, srel)
        for spart in self._sparts:
            for srel in spart.srels:
                yield (spart.partname, srel)

    @staticmethod
    def _load_serialized_parts(phys_reader, pkg_srels, content_types):
        """Return a tuple of |_SerializedPart| instances corresponding to the parts in
        `phys_reader` accessible by walking the relationship graph starting with
        `pkg_srels`.

        The content type of each part is looked up in `content_types` by partname.
        """
        part_walker = PackageReader._walk_phys_parts(phys_reader, pkg_srels)
        return tuple(
            _SerializedPart(partname, content_types[partname], reltype, blob, srels)
            for partname, blob, reltype, srels in part_walker
        )

    @staticmethod
    def _srels_for(phys_reader, source_uri):
        """Return |_SerializedRelationships| instance populated with relationships for
        source identified by `source_uri`."""
        rels_xml = phys_reader.rels_xml_for(source_uri)
        return _SerializedRelationships.load_from_xml(source_uri.baseURI, rels_xml)

    @staticmethod
    def _walk_phys_parts(phys_reader, srels, visited_partnames=None):
        """Generate a 4-tuple `(partname, blob, reltype, srels)` for each of the parts
        in `phys_reader` by walking the relationship graph rooted at srels.

        Each partname is generated at most once; `visited_partnames` is the list used
        to track that and is shared across the recursive calls.
        """
        if visited_partnames is None:
            visited_partnames = []
        for srel in srels:
            # -- an external relationship has no part in the package to load --
            if srel.is_external:
                continue
            partname = srel.target_partname
            if partname in visited_partnames:
                continue
            visited_partnames.append(partname)
            # -- the reltype reported is that of the first relationship reaching the part --
            reltype = srel.reltype
            part_srels = PackageReader._srels_for(phys_reader, partname)
            blob = phys_reader.blob_for(partname)
            yield (partname, blob, reltype, part_srels)
            # -- depth-first: descend into this part's own relationships before moving on --
            yield from PackageReader._walk_phys_parts(phys_reader, part_srels, visited_partnames)
class _ContentTypeMap:
"""Value type providing dictionary semantics for looking up content type by part
name, e.g. ``content_type = cti['/ppt/presentation.xml']``."""
def __init__(self):
super(_ContentTypeMap, self).__init__()
self._overrides = CaseInsensitiveDict()
self._defaults = CaseInsensitiveDict()
def __getitem__(self, partname):
"""Return content type for part identified by `partname`."""
if not isinstance(partname, PackURI):
tmpl = "_ContentTypeMap key must be <type 'PackURI'>, got %s"
raise KeyError(tmpl % type(partname))
if partname in self._overrides:
return self._overrides[partname]
if partname.ext in self._defaults:
return self._defaults[partname.ext]
tmpl = "no content type for partname '%s' in [Content_Types].xml"
raise KeyError(tmpl % partname)
@staticmethod
def from_xml(content_types_xml):
"""Return a new |_ContentTypeMap| instance populated with the contents of
`content_types_xml`."""
types_elm = parse_xml(content_types_xml)
ct_map = _ContentTypeMap()
for o in types_elm.overrides:
ct_map._add_override(o.partname, o.content_type)
for d in types_elm.defaults:
ct_map._add_default(d.extension, d.content_type)
return ct_map
def _add_default(self, extension, content_type):
"""Add the default mapping of `extension` to `content_type` to this content type
mapping."""
self._defaults[extension] = content_type
def _add_override(self, partname, content_type):
"""Add the default mapping of `partname` to `content_type` to this content type
mapping."""
self._overrides[partname] = content_type
class _SerializedPart:
"""Value object for an OPC package part.
Provides access to the partname, content type, blob, and serialized relationships
for the part.
"""
def __init__(self, partname, content_type, reltype, blob, srels):
super(_SerializedPart, self).__init__()
self._partname = partname
self._content_type = content_type
self._reltype = reltype
self._blob = blob
self._srels = srels
@property
def partname(self):
return self._partname
@property
def content_type(self):
return self._content_type
@property
def blob(self):
return self._blob
@property
def reltype(self):
"""The referring relationship type of this part."""
return self._reltype
@property
def srels(self):
return self._srels
class _SerializedRelationship:
"""Value object representing a serialized relationship in an OPC package.
Serialized, in this case, means any target part is referred to via its partname
rather than a direct link to an in-memory |Part| object.
"""
def __init__(self, baseURI, rel_elm):
super(_SerializedRelationship, self).__init__()
self._baseURI = baseURI
self._rId = rel_elm.rId
self._reltype = rel_elm.reltype
self._target_mode = rel_elm.target_mode
self._target_ref = rel_elm.target_ref
@property
def is_external(self):
"""True if target_mode is ``RTM.EXTERNAL``"""
return self._target_mode == RTM.EXTERNAL
@property
def reltype(self):
"""Relationship type, like ``RT.OFFICE_DOCUMENT``"""
return self._reltype
@property
def rId(self):
"""Relationship id, like 'rId9', corresponds to the ``Id`` attribute on the
``CT_Relationship`` element."""
return self._rId
@property
def target_mode(self):
"""String in ``TargetMode`` attribute of ``CT_Relationship`` element, one of
``RTM.INTERNAL`` or ``RTM.EXTERNAL``."""
return self._target_mode
@property
def target_ref(self):
"""String in ``Target`` attribute of ``CT_Relationship`` element, a relative
part reference for internal target mode or an arbitrary URI, e.g. an HTTP URL,
for external target mode."""
return self._target_ref
@property
def target_partname(self):
"""|PackURI| instance containing partname targeted by this relationship.
Raises ``ValueError`` on reference if target_mode is ``'External'``. Use
:attr:`target_mode` to check before referencing.
"""
if self.is_external:
msg = (
"target_partname attribute on Relationship is undefined w"
'here TargetMode == "External"'
)
raise ValueError(msg)
# lazy-load _target_partname attribute
if not hasattr(self, "_target_partname"):
self._target_partname = PackURI.from_rel_ref(self._baseURI, self.target_ref)
return self._target_partname
class _SerializedRelationships:
    """Iterable collection of |_SerializedRelationship| instances built from the XML of
    a relationships item."""

    def __init__(self):
        super().__init__()
        self._srels = []

    def __iter__(self):
        """Iterate over the contained relationships, e.g. 'for x in srels:'."""
        return iter(self._srels)

    @staticmethod
    def load_from_xml(baseURI, rels_item_xml):
        """Return a |_SerializedRelationships| instance populated from `rels_item_xml`.

        The returned collection is empty when `rels_item_xml` is |None|.
        """
        collection = _SerializedRelationships()
        if rels_item_xml is None:
            return collection
        relationships_elm = parse_xml(rels_item_xml)
        collection._srels.extend(
            _SerializedRelationship(baseURI, rel_elm)
            for rel_elm in relationships_elm.Relationship_lst
        )
        return collection

View File

@@ -0,0 +1,115 @@
"""Provides low-level, write-only API to serialized (OPC) package.
OPC stands for Open Packaging Convention. This is, essentially, an implementation of
OpcPackage.save().
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Iterable
from docx.opc.constants import CONTENT_TYPE as CT
from docx.opc.oxml import CT_Types, serialize_part_xml
from docx.opc.packuri import CONTENT_TYPES_URI, PACKAGE_URI
from docx.opc.phys_pkg import PhysPkgWriter
from docx.opc.shared import CaseInsensitiveDict
from docx.opc.spec import default_content_types
if TYPE_CHECKING:
from docx.opc.part import Part
class PackageWriter:
    """Writes a zip-format OPC package to `pkg_file`, where `pkg_file` can be either a
    path to a zip file (a string) or a file-like object.

    Its single API method, :meth:`write`, is static, so this class is not intended to be
    instantiated.
    """

    @staticmethod
    def write(pkg_file, pkg_rels, parts):
        """Write a physical package (.docx file) to `pkg_file`.

        The package contains `pkg_rels`, `parts`, and a content types stream based on
        the content types of the parts.

        `parts` may be any iterable; it is materialized once because it is traversed
        twice (once for the content types stream and once to write the parts).

        The physical writer is closed even when one of the write steps raises, so the
        underlying file is not left open on error.
        """
        parts = tuple(parts)
        phys_writer = PhysPkgWriter(pkg_file)
        try:
            PackageWriter._write_content_types_stream(phys_writer, parts)
            PackageWriter._write_pkg_rels(phys_writer, pkg_rels)
            PackageWriter._write_parts(phys_writer, parts)
        finally:
            phys_writer.close()

    @staticmethod
    def _write_content_types_stream(phys_writer, parts):
        """Write ``[Content_Types].xml`` part to the physical package with an
        appropriate content type lookup target for each part in `parts`."""
        cti = _ContentTypesItem.from_parts(parts)
        phys_writer.write(CONTENT_TYPES_URI, cti.blob)

    @staticmethod
    def _write_parts(phys_writer: PhysPkgWriter, parts: Iterable[Part]):
        """Write the blob of each part in `parts` to the package, along with a rels item
        for its relationships if and only if it has any."""
        for part in parts:
            phys_writer.write(part.partname, part.blob)
            # -- a part with no relationships gets no .rels item --
            if len(part.rels):
                phys_writer.write(part.partname.rels_uri, part.rels.xml)

    @staticmethod
    def _write_pkg_rels(phys_writer, pkg_rels):
        """Write the XML rels item for `pkg_rels` ('/_rels/.rels') to the package."""
        phys_writer.write(PACKAGE_URI.rels_uri, pkg_rels.xml)
class _ContentTypesItem:
    """Service class that composes a content types item (``[Content_Types].xml``) from
    a list of parts.

    Build an instance with :meth:`from_parts` and read :attr:`blob` for the serialized
    form, e.g. ``_ContentTypesItem.from_parts(parts).blob``.
    """

    def __init__(self):
        self._overrides = {}
        # -- extensions are matched without regard to case --
        self._defaults = CaseInsensitiveDict()

    @property
    def blob(self):
        """Serialized XML of this content types item, suitable for storage as
        ``[Content_Types].xml`` in an OPC package."""
        element = self._element
        return serialize_part_xml(element)

    @classmethod
    def from_parts(cls, parts):
        """Return a content types item mapping each part in `parts` to its content
        type.

        Default entries for the "rels" and "xml" extensions are always present.
        """
        item = cls()
        defaults = item._defaults
        defaults["rels"] = CT.OPC_RELATIONSHIPS
        defaults["xml"] = CT.XML
        for part in parts:
            item._add_content_type(part.partname, part.content_type)
        return item

    def _add_content_type(self, partname, content_type):
        """Record `content_type` for the part named `partname`.

        A Default entry keyed by extension is used when the (extension, content-type)
        pair is one of `default_content_types`; otherwise an Override entry keyed by
        partname is used.
        """
        ext = partname.ext
        is_default = (ext.lower(), content_type) in default_content_types
        if not is_default:
            self._overrides[partname] = content_type
            return
        self._defaults[ext] = content_type

    @property
    def _element(self):
        """Newly built types element for this content types item.

        Element order is not strictly significant, but to aid testing and readability
        Default elements are emitted sorted by extension and Override elements sorted
        by partname.
        """
        types_elm = CT_Types.new()
        defaults, overrides = self._defaults, self._overrides
        for ext in sorted(defaults):
            types_elm.add_default(ext, defaults[ext])
        for partname in sorted(overrides):
            types_elm.add_override(partname, overrides[partname])
        return types_elm

View File

@@ -0,0 +1,153 @@
"""Relationship-related objects."""
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Dict, cast
from docx.opc.oxml import CT_Relationships
if TYPE_CHECKING:
from docx.opc.part import Part
class Relationships(Dict[str, "_Relationship"]):
    """Collection of |_Relationship| instances, a dict keyed by rId."""

    def __init__(self, baseURI: str):
        super().__init__()
        self._baseURI = baseURI
        # -- rId -> target, maintained for internal relationships only --
        self._target_parts_by_rId: dict[str, Any] = {}

    def add_relationship(
        self, reltype: str, target: Part | str, rId: str, is_external: bool = False
    ) -> "_Relationship":
        """Create a |_Relationship|, store it under `rId`, and return it."""
        new_rel = _Relationship(rId, reltype, target, self._baseURI, is_external)
        self[rId] = new_rel
        if is_external:
            return new_rel
        self._target_parts_by_rId[rId] = target
        return new_rel

    def get_or_add(self, reltype: str, target_part: Part) -> _Relationship:
        """Return the relationship of `reltype` to `target_part`, adding one under the
        next available rId when the collection has none."""
        existing = self._get_matching(reltype, target_part)
        if existing is not None:
            return existing
        return self.add_relationship(reltype, target_part, self._next_rId)

    def get_or_add_ext_rel(self, reltype: str, target_ref: str) -> str:
        """Return the rId of the external relationship of `reltype` to `target_ref`,
        adding one under the next available rId when the collection has none."""
        existing = self._get_matching(reltype, target_ref, is_external=True)
        if existing is not None:
            return existing.rId
        added = self.add_relationship(reltype, target_ref, self._next_rId, is_external=True)
        return added.rId

    def part_with_reltype(self, reltype: str) -> Part:
        """Return the target part of the single relationship having `reltype`.

        Raises |KeyError| when no relationship matches and |ValueError| when more than
        one does.
        """
        return self._get_rel_of_type(reltype).target_part

    @property
    def related_parts(self):
        """Dict of rId -> target part covering the internal relationships of this
        collection."""
        return self._target_parts_by_rId

    @property
    def xml(self) -> str:
        """XML serialization of this collection, suitable for storage as a .rels file
        in an OPC package."""
        relationships_elm = CT_Relationships.new()
        for item in self.values():
            relationships_elm.add_rel(item.rId, item.reltype, item.target_ref, item.is_external)
        return relationships_elm.xml

    def _get_matching(
        self, reltype: str, target: Part | str, is_external: bool = False
    ) -> _Relationship | None:
        """Return the first relationship agreeing on `reltype`, `target`, and
        `is_external`, or None when there is no such relationship."""
        for candidate in self.values():
            if candidate.reltype != reltype:
                continue
            if candidate.is_external != is_external:
                continue
            # -- external rels are compared by reference string, internal by part --
            if candidate.is_external:
                candidate_target = candidate.target_ref
            else:
                candidate_target = candidate.target_part
            if candidate_target == target:
                return candidate
        return None

    def _get_rel_of_type(self, reltype: str):
        """Return the one relationship in the collection having type `reltype`.

        Raises |KeyError| when there is no such relationship and |ValueError| when
        there is more than one.
        """
        found = [rel for rel in self.values() if rel.reltype == reltype]
        if not found:
            raise KeyError("no relationship of type '%s' in collection" % reltype)
        if len(found) > 1:
            raise ValueError("multiple relationships of type '%s' in collection" % reltype)
        return found[0]

    @property
    def _next_rId(self) -> str:  # pyright: ignore[reportReturnType]
        """Lowest unused rId, counting from 'rId1' and filling any gap in the numbering,
        e.g. 'rId2' when the collection holds ['rId1', 'rId3']."""
        # -- len(self) + 1 candidates always include at least one unused id --
        for number in range(1, len(self) + 2):
            candidate = f"rId{number}"  # like 'rId19'
            if candidate in self:
                continue
            return candidate
class _Relationship:
    """Value object describing one relationship, to a part or to an external
    reference."""

    def __init__(
        self, rId: str, reltype: str, target: Part | str, baseURI: str, external: bool = False
    ):
        super().__init__()
        self._rId, self._reltype = rId, reltype
        self._target = target
        self._baseURI = baseURI
        # -- normalize any truthy/falsy value to a real bool --
        self._is_external = bool(external)

    @property
    def is_external(self) -> bool:
        """True when this relationship targets an external reference."""
        return self._is_external

    @property
    def reltype(self) -> str:
        """Relationship type string given at construction."""
        return self._reltype

    @property
    def rId(self) -> str:
        """Relationship id given at construction."""
        return self._rId

    @property
    def target_part(self) -> Part:
        """Target of this relationship, for an internal relationship.

        Raises |ValueError| when the relationship is external.
        """
        if not self._is_external:
            return cast("Part", self._target)
        raise ValueError(
            "target_part property on _Relationship is undefined when target mode is External"
        )

    @property
    def target_ref(self) -> str:
        """Target reference string: the target itself for an external relationship,
        otherwise the target part's partname expressed relative to the base URI."""
        if not self._is_external:
            part = cast("Part", self._target)
            return part.partname.relative_ref(self._baseURI)
        return cast(str, self._target)

View File

@@ -0,0 +1,31 @@
"""Objects shared by opc modules."""
from __future__ import annotations
from typing import Any, Dict, TypeVar
_T = TypeVar("_T")
class CaseInsensitiveDict(Dict[str, Any]):
    """Mapping type that behaves like dict except that it matches without respect to the
    case of the key.

    E.g. cid['A'] == cid['a']. Keys are lower-cased on the way in and on lookup, so
    iteration yields the lower-case form. Membership tests, item get/set/delete,
    `get()`, `pop()`, and `setdefault()` are all case-insensitive.

    Note this is not general-purpose, just complete enough to satisfy opc package
    needs. It assumes str keys, and that it is created empty; keys passed to the
    constructor or to `update()` are stored as given, without case normalization.
    """

    def __contains__(self, key):
        return super().__contains__(key.lower())

    def __delitem__(self, key):
        super().__delitem__(key.lower())

    def __getitem__(self, key):
        return super().__getitem__(key.lower())

    def __setitem__(self, key, value):
        return super().__setitem__(key.lower(), value)

    # -- dict's own get(), pop(), and setdefault() do not route through the overridden
    # -- dunder methods above, so each is overridden here to apply the same key
    # -- normalization. Without these, `key in cid` could be True while
    # -- `cid.get(key)` returned None.

    def get(self, key, default=None):
        """Return value for `key` matched case-insensitively, else `default`."""
        return super().get(key.lower(), default)

    def pop(self, key, *default):
        """Remove `key` (matched case-insensitively) and return its value.

        Raises |KeyError| when `key` is absent and no default is given.
        """
        return super().pop(key.lower(), *default)

    def setdefault(self, key, default=None):
        """Return value for `key`, first storing `default` under the lower-cased key
        when `key` is not present."""
        return super().setdefault(key.lower(), default)
def cls_method_fn(cls: type, method_name: str):
    """Look up the attribute named `method_name` on `cls` and return it."""
    method = getattr(cls, method_name)
    return method

View File

@@ -0,0 +1,24 @@
"""Provides mappings that embody aspects of the Open XML spec ISO/IEC 29500."""
from docx.opc.constants import CONTENT_TYPE as CT
# -- Each entry pairs a lower-case filename extension with a content type. An extension
# -- can appear more than once (e.g. "bin") when more than one content type is listed
# -- for it. `_ContentTypesItem._add_content_type()` tests whether
# -- `(ext.lower(), content_type)` is in this tuple to choose between a Default entry
# -- (keyed by extension) and an Override entry (keyed by partname).
default_content_types = (
("bin", CT.PML_PRINTER_SETTINGS),
("bin", CT.SML_PRINTER_SETTINGS),
("bin", CT.WML_PRINTER_SETTINGS),
("bmp", CT.BMP),
("emf", CT.X_EMF),
("fntdata", CT.X_FONTDATA),
("gif", CT.GIF),
("jpe", CT.JPEG),
("jpeg", CT.JPEG),
("jpg", CT.JPEG),
("png", CT.PNG),
("rels", CT.OPC_RELATIONSHIPS),
("tif", CT.TIFF),
("tiff", CT.TIFF),
("wdp", CT.MS_PHOTO),
("wmf", CT.X_WMF),
("xlsx", CT.SML_SHEET),
("xml", CT.XML),
)

View File

@@ -0,0 +1,251 @@
# ruff: noqa: E402, I001
"""Initializes oxml sub-package.
This includes registering custom element classes corresponding to Open XML elements.
"""
from __future__ import annotations
from docx.oxml.drawing import CT_Drawing
from docx.oxml.parser import OxmlElement, parse_xml, register_element_cls
from docx.oxml.shape import (
CT_Anchor,
CT_Blip,
CT_BlipFillProperties,
CT_GraphicalObject,
CT_GraphicalObjectData,
CT_Inline,
CT_NonVisualDrawingProps,
CT_Picture,
CT_PictureNonVisual,
CT_Point2D,
CT_PositiveSize2D,
CT_ShapeProperties,
CT_Transform2D,
)
from docx.oxml.shared import CT_DecimalNumber, CT_OnOff, CT_String
from docx.oxml.text.hyperlink import CT_Hyperlink
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
from docx.oxml.text.run import (
CT_R,
CT_Br,
CT_Cr,
CT_NoBreakHyphen,
CT_PTab,
CT_Text,
)
# -- `OxmlElement` and `parse_xml()` are not used in this module but several downstream
# -- "extension" packages expect to find them here and there's no compelling reason
# -- not to republish them here so those keep working.
__all__ = ["OxmlElement", "parse_xml"]
# -- Every `register_element_cls()` call in this module pairs a namespace-prefixed tag
# -- name with the custom element class registered for that tag. One class can be
# -- registered for several tags, e.g. `CT_PositiveSize2D` for both `a:ext` and
# -- `wp:extent`, and `CT_NonVisualDrawingProps` for both `pic:cNvPr` and `wp:docPr`.
# ---------------------------------------------------------------------------
# DrawingML-related elements
register_element_cls("a:blip", CT_Blip)
register_element_cls("a:ext", CT_PositiveSize2D)
register_element_cls("a:graphic", CT_GraphicalObject)
register_element_cls("a:graphicData", CT_GraphicalObjectData)
register_element_cls("a:off", CT_Point2D)
register_element_cls("a:xfrm", CT_Transform2D)
register_element_cls("pic:blipFill", CT_BlipFillProperties)
register_element_cls("pic:cNvPr", CT_NonVisualDrawingProps)
register_element_cls("pic:nvPicPr", CT_PictureNonVisual)
register_element_cls("pic:pic", CT_Picture)
register_element_cls("pic:spPr", CT_ShapeProperties)
register_element_cls("w:drawing", CT_Drawing)
register_element_cls("wp:anchor", CT_Anchor)
register_element_cls("wp:docPr", CT_NonVisualDrawingProps)
register_element_cls("wp:extent", CT_PositiveSize2D)
register_element_cls("wp:inline", CT_Inline)
# ---------------------------------------------------------------------------
# hyperlink-related elements
register_element_cls("w:hyperlink", CT_Hyperlink)
# ---------------------------------------------------------------------------
# text-related elements
register_element_cls("w:br", CT_Br)
register_element_cls("w:cr", CT_Cr)
register_element_cls("w:lastRenderedPageBreak", CT_LastRenderedPageBreak)
register_element_cls("w:noBreakHyphen", CT_NoBreakHyphen)
register_element_cls("w:ptab", CT_PTab)
register_element_cls("w:r", CT_R)
register_element_cls("w:t", CT_Text)
# ---------------------------------------------------------------------------
# header/footer-related mappings
# -- both tags use the generic `CT_OnOff` class rather than a dedicated one --
register_element_cls("w:evenAndOddHeaders", CT_OnOff)
register_element_cls("w:titlePg", CT_OnOff)
# ---------------------------------------------------------------------------
# other custom element class mappings
# -- From here on, each group imports its element classes immediately before
# -- registering them instead of importing at the top of the module (the file-level
# -- `ruff: noqa: E402, I001` permits this). Statement order is left as found.
# -- Tags registered with `CT_DecimalNumber`, `CT_OnOff`, or `CT_String` use the
# -- generic classes imported from `docx.oxml.shared` near the top of the module.
from .comments import CT_Comments, CT_Comment
register_element_cls("w:comments", CT_Comments)
register_element_cls("w:comment", CT_Comment)
from .coreprops import CT_CoreProperties
register_element_cls("cp:coreProperties", CT_CoreProperties)
from .document import CT_Body, CT_Document
register_element_cls("w:body", CT_Body)
register_element_cls("w:document", CT_Document)
from .numbering import CT_Num, CT_Numbering, CT_NumLvl, CT_NumPr
register_element_cls("w:abstractNumId", CT_DecimalNumber)
register_element_cls("w:ilvl", CT_DecimalNumber)
register_element_cls("w:lvlOverride", CT_NumLvl)
register_element_cls("w:num", CT_Num)
register_element_cls("w:numId", CT_DecimalNumber)
register_element_cls("w:numPr", CT_NumPr)
register_element_cls("w:numbering", CT_Numbering)
register_element_cls("w:startOverride", CT_DecimalNumber)
from .section import (
CT_HdrFtr,
CT_HdrFtrRef,
CT_PageMar,
CT_PageSz,
CT_SectPr,
CT_SectType,
)
# -- header and footer tags share classes: `CT_HdrFtrRef` for the two reference tags
# -- and `CT_HdrFtr` for `w:hdr` and `w:ftr` --
register_element_cls("w:footerReference", CT_HdrFtrRef)
register_element_cls("w:ftr", CT_HdrFtr)
register_element_cls("w:hdr", CT_HdrFtr)
register_element_cls("w:headerReference", CT_HdrFtrRef)
register_element_cls("w:pgMar", CT_PageMar)
register_element_cls("w:pgSz", CT_PageSz)
register_element_cls("w:sectPr", CT_SectPr)
register_element_cls("w:type", CT_SectType)
from .settings import CT_Settings
register_element_cls("w:settings", CT_Settings)
from .styles import CT_LatentStyles, CT_LsdException, CT_Style, CT_Styles
register_element_cls("w:basedOn", CT_String)
register_element_cls("w:latentStyles", CT_LatentStyles)
register_element_cls("w:locked", CT_OnOff)
register_element_cls("w:lsdException", CT_LsdException)
register_element_cls("w:name", CT_String)
register_element_cls("w:next", CT_String)
register_element_cls("w:qFormat", CT_OnOff)
register_element_cls("w:semiHidden", CT_OnOff)
register_element_cls("w:style", CT_Style)
register_element_cls("w:styles", CT_Styles)
register_element_cls("w:uiPriority", CT_DecimalNumber)
register_element_cls("w:unhideWhenUsed", CT_OnOff)
# -- table-related element classes and their tag registrations --
from .table import (
CT_Height,
CT_Row,
CT_Tbl,
CT_TblGrid,
CT_TblGridCol,
CT_TblLayoutType,
CT_TblPr,
CT_TblPrEx,
CT_TblWidth,
CT_Tc,
CT_TcPr,
CT_TrPr,
CT_VMerge,
CT_VerticalJc,
)
# -- Several tags in this group are registered with the generic `CT_OnOff`,
# -- `CT_DecimalNumber`, and `CT_String` classes imported earlier in the module
# -- instead of a class from `.table`.
register_element_cls("w:bidiVisual", CT_OnOff)
register_element_cls("w:gridAfter", CT_DecimalNumber)
register_element_cls("w:gridBefore", CT_DecimalNumber)
register_element_cls("w:gridCol", CT_TblGridCol)
register_element_cls("w:gridSpan", CT_DecimalNumber)
register_element_cls("w:tbl", CT_Tbl)
register_element_cls("w:tblGrid", CT_TblGrid)
register_element_cls("w:tblLayout", CT_TblLayoutType)
register_element_cls("w:tblPr", CT_TblPr)
register_element_cls("w:tblPrEx", CT_TblPrEx)
register_element_cls("w:tblStyle", CT_String)
register_element_cls("w:tc", CT_Tc)
register_element_cls("w:tcPr", CT_TcPr)
register_element_cls("w:tcW", CT_TblWidth)
register_element_cls("w:tr", CT_Row)
register_element_cls("w:trHeight", CT_Height)
register_element_cls("w:trPr", CT_TrPr)
register_element_cls("w:vAlign", CT_VerticalJc)
register_element_cls("w:vMerge", CT_VMerge)
# -- font-related element classes and their tag registrations --
from .text.font import (
CT_Color,
CT_Fonts,
CT_Highlight,
CT_HpsMeasure,
CT_RPr,
CT_Underline,
CT_VerticalAlignRun,
)
# -- Many tags in this group (e.g. `w:b`, `w:i`, `w:caps`) are registered with the
# -- generic `CT_OnOff` class imported earlier in the module; only tags with a
# -- dedicated class use one from `.text.font`.
register_element_cls("w:b", CT_OnOff)
register_element_cls("w:bCs", CT_OnOff)
register_element_cls("w:caps", CT_OnOff)
register_element_cls("w:color", CT_Color)
register_element_cls("w:cs", CT_OnOff)
register_element_cls("w:dstrike", CT_OnOff)
register_element_cls("w:emboss", CT_OnOff)
register_element_cls("w:highlight", CT_Highlight)
register_element_cls("w:i", CT_OnOff)
register_element_cls("w:iCs", CT_OnOff)
register_element_cls("w:imprint", CT_OnOff)
register_element_cls("w:noProof", CT_OnOff)
register_element_cls("w:oMath", CT_OnOff)
register_element_cls("w:outline", CT_OnOff)
register_element_cls("w:rFonts", CT_Fonts)
register_element_cls("w:rPr", CT_RPr)
register_element_cls("w:rStyle", CT_String)
register_element_cls("w:rtl", CT_OnOff)
register_element_cls("w:shadow", CT_OnOff)
register_element_cls("w:smallCaps", CT_OnOff)
register_element_cls("w:snapToGrid", CT_OnOff)
register_element_cls("w:specVanish", CT_OnOff)
register_element_cls("w:strike", CT_OnOff)
register_element_cls("w:sz", CT_HpsMeasure)
register_element_cls("w:u", CT_Underline)
register_element_cls("w:vanish", CT_OnOff)
register_element_cls("w:vertAlign", CT_VerticalAlignRun)
register_element_cls("w:webHidden", CT_OnOff)
# -- paragraph element --
from .text.paragraph import CT_P
register_element_cls("w:p", CT_P)
# -- paragraph-formatting element classes and their tag registrations --
from .text.parfmt import (
CT_Ind,
CT_Jc,
CT_PPr,
CT_Spacing,
CT_TabStop,
CT_TabStops,
)
register_element_cls("w:ind", CT_Ind)
register_element_cls("w:jc", CT_Jc)
register_element_cls("w:keepLines", CT_OnOff)
register_element_cls("w:keepNext", CT_OnOff)
register_element_cls("w:outlineLvl", CT_DecimalNumber)
register_element_cls("w:pageBreakBefore", CT_OnOff)
register_element_cls("w:pPr", CT_PPr)
register_element_cls("w:pStyle", CT_String)
register_element_cls("w:spacing", CT_Spacing)
register_element_cls("w:tab", CT_TabStop)
register_element_cls("w:tabs", CT_TabStops)
register_element_cls("w:widowControl", CT_OnOff)

Some files were not shown because too many files have changed in this diff Show More