# document.py
  1. # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
  2. # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
  3. __all__ = ("PdfDocument", "PdfFormEnv", "PdfXObject", "PdfOutlineItem")
  4. import os
  5. import ctypes
  6. import logging
  7. import inspect
  8. import warnings
  9. from pathlib import Path
  10. from collections import namedtuple
  11. import multiprocessing as mp
  12. import pypdfium2.raw as pdfium_c
  13. import pypdfium2.internal as pdfium_i
  14. from pypdfium2.version import PDFIUM_INFO
  15. from pypdfium2._helpers.misc import PdfiumError
  16. from pypdfium2._helpers.page import PdfPage
  17. from pypdfium2._helpers.pageobjects import PdfObject
  18. from pypdfium2._helpers.attachment import PdfAttachment
  19. logger = logging.getLogger(__name__)
  20. class PdfDocument (pdfium_i.AutoCloseable):
  21. """
  22. Document helper class.
  23. Parameters:
  24. input_data (str | pathlib.Path | bytes | ctypes.Array | typing.BinaryIO | FPDF_DOCUMENT):
  25. The input PDF given as file path, bytes, ctypes array, byte buffer, or raw PDFium document handle.
  26. A byte buffer is defined as an object that implements ``seek() tell() read() readinto()``.
  27. password (str | None):
  28. A password to unlock the PDF, if encrypted. Otherwise, None or an empty string may be passed.
  29. If a password is given but the PDF is not encrypted, it will be ignored (as of PDFium 5418).
  30. autoclose (bool):
  31. Whether byte buffer input should be automatically closed on finalization.
  32. Raises:
  33. PdfiumError: Raised if the document failed to load. The exception message is annotated with the reason reported by PDFium.
  34. FileNotFoundError: Raised if an invalid or non-existent file path was given.
  35. Hint:
  36. * :func:`len` may be called to get a document's number of pages.
  37. * Looping over a document will yield its pages from beginning to end.
  38. * Pages may be loaded using list index access.
  39. * The ``del`` keyword and list index access may be used to delete pages.
  40. Attributes:
  41. raw (FPDF_DOCUMENT):
  42. The underlying PDFium document handle.
  43. formenv (PdfFormEnv | None):
  44. Form env, if the document has forms and :meth:`.init_forms` was called.
  45. """
  46. def __init__(self, input, password=None, autoclose=False):
  47. if isinstance(input, str):
  48. input = Path(input)
  49. if isinstance(input, Path):
  50. input = input.expanduser().resolve()
  51. if not input.is_file():
  52. raise FileNotFoundError(input)
  53. self._input = input
  54. self._password = password
  55. self._autoclose = autoclose
  56. self._data_holder = []
  57. self._data_closer = []
  58. # question: can we make attributes like formenv effectively immutable for the caller?
  59. self.formenv = None
  60. if isinstance(self._input, pdfium_c.FPDF_DOCUMENT):
  61. self.raw = self._input
  62. else:
  63. self.raw, to_hold, to_close = _open_pdf(self._input, self._password, self._autoclose)
  64. self._data_holder += to_hold
  65. self._data_closer += to_close
  66. super().__init__(PdfDocument._close_impl, self._data_holder, self._data_closer)
  67. def __repr__(self):
  68. if isinstance(self._input, Path):
  69. input_r = repr( str(self._input) )
  70. elif isinstance(self._input, bytes):
  71. input_r = f"<bytes object at {hex(id(self._input))}>"
  72. elif isinstance(self._input, pdfium_c.FPDF_DOCUMENT):
  73. input_r = f"<FPDF_DOCUMENT at {hex(id(self._input))}>"
  74. else:
  75. input_r = repr(self._input)
  76. return f"{super().__repr__()[:-1]} from {input_r}>"
  77. @property
  78. def parent(self): # AutoCloseable hook
  79. return None
  80. @staticmethod
  81. def _close_impl(raw, data_holder, data_closer):
  82. pdfium_c.FPDF_CloseDocument(raw)
  83. for data in data_holder:
  84. id(data)
  85. for data in data_closer:
  86. data.close()
  87. data_holder.clear()
  88. data_closer.clear()
  89. def __len__(self):
  90. return pdfium_c.FPDF_GetPageCount(self)
  91. def __iter__(self):
  92. for i in range( len(self) ):
  93. yield self[i]
  94. def __getitem__(self, i):
  95. return self.get_page(i)
  96. def __delitem__(self, i):
  97. self.del_page(i)
  98. @classmethod
  99. def new(cls):
  100. """
  101. Returns:
  102. PdfDocument: A new, empty document.
  103. """
  104. new_pdf = pdfium_c.FPDF_CreateNewDocument()
  105. return cls(new_pdf)
  106. def init_forms(self, config=None):
  107. """
  108. Initialize a form env, if the document has forms. If already initialized, nothing will be done.
  109. See the :attr:`formenv` attribute.
  110. Attention:
  111. If form rendering is desired, this method shall be called immediately after document construction, before getting document length or page handles.
  112. Parameters:
  113. config (FPDF_FORMFILLINFO | None):
  114. Custom form config interface to use (optional).
  115. """
  116. formtype = self.get_formtype()
  117. if formtype == pdfium_c.FORMTYPE_NONE or self.formenv:
  118. return
  119. # safety check for older binaries to prevent a segfault (could be removed at some point)
  120. # https://github.com/bblanchon/pdfium-binaries/issues/105
  121. if "V8" in PDFIUM_INFO.flags and PDFIUM_INFO.origin != "sourcebuild" and PDFIUM_INFO.build <= 5677:
  122. raise RuntimeError("V8 enabled pdfium-binaries builds <= 5677 crash on init_forms().")
  123. if not config:
  124. if "XFA" in PDFIUM_INFO.flags:
  125. js_platform = pdfium_c.IPDF_JSPLATFORM(version=3)
  126. config = pdfium_c.FPDF_FORMFILLINFO(version=2, xfa_disabled=False, m_pJsPlatform=ctypes.pointer(js_platform))
  127. else:
  128. config = pdfium_c.FPDF_FORMFILLINFO(version=2)
  129. raw = pdfium_c.FPDFDOC_InitFormFillEnvironment(self, config)
  130. if not raw:
  131. raise PdfiumError(f"Initializing form env failed for document {self}.")
  132. self.formenv = PdfFormEnv(raw, config, self)
  133. self._add_kid(self.formenv)
  134. if formtype in (pdfium_c.FORMTYPE_XFA_FULL, pdfium_c.FORMTYPE_XFA_FOREGROUND):
  135. if "XFA" in PDFIUM_INFO.flags:
  136. ok = pdfium_c.FPDF_LoadXFA(self)
  137. if not ok:
  138. err = pdfium_c.FPDF_GetLastError()
  139. logger.warning(f"FPDF_LoadXFA() failed with {pdfium_i.XFAErrorToStr.get(err)}")
  140. else:
  141. logger.warning(
  142. "init_forms() called on XFA pdf, but this pdfium binary was compiled without XFA support.\n"
  143. "Run `PDFIUM_PLATFORM=auto-v8 pip install -v pypdfium2 --no-binary pypdfium2` to get a build with XFA support."
  144. )
  145. # TODO?(v5) consider cached property
  146. def get_formtype(self):
  147. """
  148. Returns:
  149. int: PDFium form type that applies to the document (:attr:`FORMTYPE_*`).
  150. :attr:`FORMTYPE_NONE` if the document has no forms.
  151. """
  152. return pdfium_c.FPDF_GetFormType(self)
  153. # TODO?(v5) consider cached property
  154. def get_pagemode(self):
  155. """
  156. Returns:
  157. int: Page displaying mode (:attr:`PAGEMODE_*`).
  158. """
  159. return pdfium_c.FPDFDoc_GetPageMode(self)
  160. # TODO?(v5) consider cached property
  161. def is_tagged(self):
  162. """
  163. Returns:
  164. bool: Whether the document is tagged (cf. PDF 1.7, 10.7 "Tagged PDF").
  165. """
  166. return bool( pdfium_c.FPDFCatalog_IsTagged(self) )
  167. def save(self, dest, version=None, flags=pdfium_c.FPDF_NO_INCREMENTAL):
  168. """
  169. Save the document at its current state.
  170. Parameters:
  171. dest (str | pathlib.Path | io.BytesIO):
  172. File path or byte buffer the document shall be written to.
  173. version (int | None):
  174. The PDF version to use, given as an integer (14 for 1.4, 15 for 1.5, ...).
  175. If None (the default), PDFium will set a version automatically.
  176. flags (int):
  177. PDFium saving flags (defaults to :attr:`FPDF_NO_INCREMENTAL`).
  178. """
  179. if isinstance(dest, (str, Path)):
  180. buffer, need_close = open(dest, "wb"), True
  181. elif pdfium_i.is_buffer(dest, "w"):
  182. buffer, need_close = dest, False
  183. else:
  184. raise ValueError(f"Cannot save to '{dest}'")
  185. try:
  186. saveargs = (self, pdfium_i.get_bufwriter(buffer), flags)
  187. ok = pdfium_c.FPDF_SaveAsCopy(*saveargs) if version is None else pdfium_c.FPDF_SaveWithVersion(*saveargs, version)
  188. if not ok:
  189. raise PdfiumError("Failed to save document.")
  190. finally:
  191. if need_close:
  192. buffer.close()
  193. def get_identifier(self, type=pdfium_c.FILEIDTYPE_PERMANENT):
  194. """
  195. Parameters:
  196. type (int):
  197. The identifier type to retrieve (:attr:`FILEIDTYPE_*`), either permanent or changing.
  198. If the file was updated incrementally, the permanent identifier stays the same,
  199. while the changing identifier is re-calculated.
  200. Returns:
  201. bytes: Unique file identifier from the PDF's trailer dictionary.
  202. See PDF 1.7, Section 14.4 "File Identifiers".
  203. """
  204. n_bytes = pdfium_c.FPDF_GetFileIdentifier(self, type, None, 0)
  205. buffer = ctypes.create_string_buffer(n_bytes)
  206. pdfium_c.FPDF_GetFileIdentifier(self, type, buffer, n_bytes)
  207. return buffer.raw[:n_bytes-2]
  208. def get_version(self):
  209. """
  210. Returns:
  211. int | None: The PDF version of the document (14 for 1.4, 15 for 1.5, ...),
  212. or None if the document is new or its version could not be determined.
  213. """
  214. version = ctypes.c_int()
  215. ok = pdfium_c.FPDF_GetFileVersion(self, version)
  216. if not ok:
  217. return None
  218. return version.value
  219. def get_metadata_value(self, key):
  220. """
  221. Returns:
  222. str: Value of the given key in the PDF's metadata dictionary.
  223. If the key is not contained, an empty string will be returned.
  224. """
  225. enc_key = (key + "\x00").encode("utf-8")
  226. n_bytes = pdfium_c.FPDF_GetMetaText(self, enc_key, None, 0)
  227. buffer = ctypes.create_string_buffer(n_bytes)
  228. pdfium_c.FPDF_GetMetaText(self, enc_key, buffer, n_bytes)
  229. return buffer.raw[:n_bytes-2].decode("utf-16-le")
  230. METADATA_KEYS = ("Title", "Author", "Subject", "Keywords", "Creator", "Producer", "CreationDate", "ModDate")
  231. def get_metadata_dict(self, skip_empty=False):
  232. """
  233. Get the document's metadata as dictionary.
  234. Parameters:
  235. skip_empty (bool):
  236. If True, skip items whose value is an empty string.
  237. Returns:
  238. dict: PDF metadata.
  239. """
  240. metadata = {k: self.get_metadata_value(k) for k in self.METADATA_KEYS}
  241. if skip_empty:
  242. metadata = {k: v for k, v in metadata.items() if v}
  243. return metadata
  244. def count_attachments(self):
  245. """
  246. Returns:
  247. int: The number of embedded files in the document.
  248. """
  249. return pdfium_c.FPDFDoc_GetAttachmentCount(self)
  250. def get_attachment(self, index):
  251. """
  252. Returns:
  253. PdfAttachment: The attachment at *index* (zero-based).
  254. """
  255. raw_attachment = pdfium_c.FPDFDoc_GetAttachment(self, index)
  256. if not raw_attachment:
  257. raise PdfiumError(f"Failed to get attachment at index {index}.")
  258. return PdfAttachment(raw_attachment, self)
  259. def new_attachment(self, name):
  260. """
  261. Add a new attachment to the document. It may appear at an arbitrary index (as of PDFium 5418).
  262. Parameters:
  263. name (str):
  264. The name the attachment shall have. Usually a file name with extension.
  265. Returns:
  266. PdfAttachment: Handle to the new, empty attachment.
  267. """
  268. enc_name = (name + "\x00").encode("utf-16-le")
  269. enc_name_ptr = ctypes.cast(enc_name, pdfium_c.FPDF_WIDESTRING)
  270. raw_attachment = pdfium_c.FPDFDoc_AddAttachment(self, enc_name_ptr)
  271. if not raw_attachment:
  272. raise PdfiumError(f"Failed to create new attachment '{name}'.")
  273. return PdfAttachment(raw_attachment, self)
  274. def del_attachment(self, index):
  275. """
  276. Unlink the attachment at *index* (zero-based).
  277. It will be hidden from the viewer, but is still present in the file (as of PDFium 5418).
  278. Following attachments shift one slot to the left in the array representation used by PDFium's API.
  279. Handles to the attachment in question received from :meth:`.get_attachment`
  280. must not be accessed anymore after this method has been called.
  281. """
  282. ok = pdfium_c.FPDFDoc_DeleteAttachment(self, index)
  283. if not ok:
  284. raise PdfiumError(f"Failed to delete attachment at index {index}.")
  285. # TODO deprecate in favour of index access?
  286. def get_page(self, index):
  287. """
  288. Returns:
  289. PdfPage: The page at *index* (zero-based).
  290. Note:
  291. This calls ``FORM_OnAfterLoadPage()`` if the document has an active form env.
  292. The form env must not be closed before the page is closed!
  293. """
  294. raw_page = pdfium_c.FPDF_LoadPage(self, index)
  295. if not raw_page:
  296. raise PdfiumError("Failed to load page.")
  297. page = PdfPage(raw_page, self, self.formenv)
  298. if self.formenv:
  299. pdfium_c.FORM_OnAfterLoadPage(page, self.formenv)
  300. self.formenv._add_kid(page)
  301. else:
  302. self._add_kid(page)
  303. return page
  304. def new_page(self, width, height, index=None):
  305. """
  306. Insert a new, empty page into the document.
  307. Parameters:
  308. width (float):
  309. Target page width (horizontal size).
  310. height (float):
  311. Target page height (vertical size).
  312. index (int | None):
  313. Suggested zero-based index at which the page shall be inserted.
  314. If None or larger that the document's current last index, the page will be appended to the end.
  315. Returns:
  316. PdfPage: The newly created page.
  317. """
  318. if index is None:
  319. index = len(self)
  320. raw_page = pdfium_c.FPDFPage_New(self, index, width, height)
  321. page = PdfPage(raw_page, self, None)
  322. # not doing formenv calls for new pages as we don't see the point
  323. self._add_kid(page)
  324. return page
  325. def del_page(self, index):
  326. """
  327. Remove the page at *index* (zero-based).
  328. """
  329. # FIXME what if the caller still has a handle to the page?
  330. pdfium_c.FPDFPage_Delete(self, index)
  331. def import_pages(self, pdf, pages=None, index=None):
  332. """
  333. Import pages from a foreign document.
  334. Parameters:
  335. pdf (PdfDocument):
  336. The document from which to import pages.
  337. pages (list[int] | str | None):
  338. The pages to include. It may either be a list of zero-based page indices, or a string of one-based page numbers and ranges.
  339. If None, all pages will be included.
  340. index (int):
  341. Zero-based index at which to insert the given pages. If None, they are appended to the end of the document.
  342. """
  343. if index is None:
  344. index = len(self)
  345. if isinstance(pages, str):
  346. ok = pdfium_c.FPDF_ImportPages(self, pdf, pages.encode("ascii"), index)
  347. else:
  348. page_count = 0
  349. c_pages = None
  350. if pages:
  351. page_count = len(pages)
  352. c_pages = (ctypes.c_int * page_count)(*pages)
  353. ok = pdfium_c.FPDF_ImportPagesByIndex(self, pdf, c_pages, page_count, index)
  354. if not ok:
  355. raise PdfiumError("Failed to import pages.")
  356. def get_page_size(self, index):
  357. """
  358. Returns:
  359. (float, float): Width and height in PDF canvas units of the page at *index* (zero-based).
  360. """
  361. size = pdfium_c.FS_SIZEF()
  362. ok = pdfium_c.FPDF_GetPageSizeByIndexF(self, index, size)
  363. if not ok:
  364. raise PdfiumError("Failed to get page size by index.")
  365. return (size.width, size.height)
  366. def get_page_label(self, index):
  367. """
  368. Returns:
  369. str: Label of the page at *index* (zero-based).
  370. (A page label is essentially an alias that may be displayed instead of the page number.)
  371. """
  372. n_bytes = pdfium_c.FPDF_GetPageLabel(self, index, None, 0)
  373. buffer = ctypes.create_string_buffer(n_bytes)
  374. pdfium_c.FPDF_GetPageLabel(self, index, buffer, n_bytes)
  375. return buffer.raw[:n_bytes-2].decode("utf-16-le")
  376. def page_as_xobject(self, index, dest_pdf):
  377. """
  378. Capture a page as XObject and attach it to a document's resources.
  379. Parameters:
  380. index (int):
  381. Zero-based index of the page.
  382. dest_pdf (PdfDocument):
  383. Target document to which the XObject shall be added.
  384. Returns:
  385. PdfXObject: The page as XObject.
  386. """
  387. raw_xobject = pdfium_c.FPDF_NewXObjectFromPage(dest_pdf, self, index)
  388. if raw_xobject is None:
  389. raise PdfiumError(f"Failed to capture page at index {index} as FPDF_XOBJECT.")
  390. xobject = PdfXObject(raw=raw_xobject, pdf=dest_pdf)
  391. self._add_kid(xobject)
  392. return xobject
  393. # TODO(apibreak) consider switching to a wrapper class around the raw bookmark
  394. # (either with getter methods, or possibly cached properties)
  395. def _get_bookmark(self, bookmark, level):
  396. n_bytes = pdfium_c.FPDFBookmark_GetTitle(bookmark, None, 0)
  397. buffer = ctypes.create_string_buffer(n_bytes)
  398. pdfium_c.FPDFBookmark_GetTitle(bookmark, buffer, n_bytes)
  399. title = buffer.raw[:n_bytes-2].decode('utf-16-le')
  400. # TODO(apibreak) just expose count as-is rather than using two variables and doing extra work
  401. count = pdfium_c.FPDFBookmark_GetCount(bookmark)
  402. is_closed = True if count < 0 else None if count == 0 else False
  403. n_kids = abs(count)
  404. dest = pdfium_c.FPDFBookmark_GetDest(self, bookmark)
  405. page_index = pdfium_c.FPDFDest_GetDestPageIndex(self, dest)
  406. if page_index == -1:
  407. page_index = None
  408. n_params = ctypes.c_ulong()
  409. view_pos = (pdfium_c.FS_FLOAT * 4)()
  410. view_mode = pdfium_c.FPDFDest_GetView(dest, n_params, view_pos)
  411. view_pos = list(view_pos)[:n_params.value]
  412. return PdfOutlineItem(
  413. level = level,
  414. title = title,
  415. is_closed = is_closed,
  416. n_kids = n_kids,
  417. page_index = page_index,
  418. view_mode = view_mode,
  419. view_pos = view_pos,
  420. )
  421. # TODO(apibreak) change outline API (see above)
  422. def get_toc(
  423. self,
  424. max_depth = 15,
  425. parent = None,
  426. level = 0,
  427. seen = None,
  428. ):
  429. """
  430. Iterate through the bookmarks in the document's table of contents.
  431. Parameters:
  432. max_depth (int):
  433. Maximum recursion depth to consider.
  434. Yields:
  435. :class:`.PdfOutlineItem`: Bookmark information.
  436. """
  437. if seen is None:
  438. seen = set()
  439. bookmark = pdfium_c.FPDFBookmark_GetFirstChild(self, parent)
  440. while bookmark:
  441. address = ctypes.addressof(bookmark.contents)
  442. if address in seen:
  443. logger.warning("A circular bookmark reference was detected whilst parsing the table of contents.")
  444. break
  445. else:
  446. seen.add(address)
  447. yield self._get_bookmark(bookmark, level)
  448. if level < max_depth-1:
  449. yield from self.get_toc(
  450. max_depth = max_depth,
  451. parent = bookmark,
  452. level = level + 1,
  453. seen = seen,
  454. )
  455. bookmark = pdfium_c.FPDFBookmark_GetNextSibling(self, bookmark)
  456. def render(
  457. self,
  458. converter,
  459. renderer = PdfPage.render,
  460. page_indices = None,
  461. pass_info = False,
  462. n_processes = None, # ignored, retained for compat
  463. mk_formconfig = None, # ignored, retained for compat
  464. **kwargs
  465. ):
  466. """
  467. .. deprecated:: 4.19
  468. This method will be removed with the next major release due to serious issues rooted in the original API design. Use :meth:`.PdfPage.render()` instead.
  469. *Note that the CLI provides parallel rendering using a proper caller-side process pool with inline saving in rendering jobs.*
  470. .. versionchanged:: 4.25
  471. Removed the original process pool implementation and turned this into a wrapper for linear rendering, due to the serious conceptual issues and possible memory load escalation, especially with expensive receiving code (e.g. PNG encoding) or long documents. See the changelog for more info
  472. """
  473. warnings.warn("The document-level pdf.render() API is deprecated and uncored due to serious issues in the original concept. Use page.render() and a caller-side loop or process pool instead.", category=DeprecationWarning)
  474. if not page_indices:
  475. page_indices = [i for i in range(len(self))]
  476. for i in page_indices:
  477. bitmap = renderer(self[i], **kwargs)
  478. if pass_info:
  479. yield (converter(bitmap), bitmap.get_info())
  480. else:
  481. yield converter(bitmap)
  482. class PdfFormEnv (pdfium_i.AutoCloseable):
  483. """
  484. Form environment helper class.
  485. Attributes:
  486. raw (FPDF_FORMHANDLE):
  487. The underlying PDFium form env handle.
  488. config (FPDF_FORMFILLINFO):
  489. Accompanying form configuration interface, to be kept alive.
  490. pdf (PdfDocument):
  491. Parent document this form env belongs to.
  492. """
  493. def __init__(self, raw, config, pdf):
  494. self.raw, self.config, self.pdf = raw, config, pdf
  495. super().__init__(PdfFormEnv._close_impl, self.config, self.pdf)
  496. @property
  497. def parent(self): # AutoCloseable hook
  498. return self.pdf
  499. @staticmethod
  500. def _close_impl(raw, config, pdf):
  501. pdfium_c.FPDFDOC_ExitFormFillEnvironment(raw)
  502. id(config)
  503. pdf.formenv = None
  504. class PdfXObject (pdfium_i.AutoCloseable):
  505. """
  506. XObject helper class.
  507. Attributes:
  508. raw (FPDF_XOBJECT): The underlying PDFium XObject handle.
  509. pdf (PdfDocument): Reference to the document this XObject belongs to.
  510. """
  511. def __init__(self, raw, pdf):
  512. self.raw, self.pdf = raw, pdf
  513. super().__init__(pdfium_c.FPDF_CloseXObject)
  514. @property
  515. def parent(self): # AutoCloseable hook
  516. return self.pdf
  517. def as_pageobject(self):
  518. """
  519. Returns:
  520. PdfObject: An independent page object representation of the XObject.
  521. If multiple page objects are created from one XObject, they share resources.
  522. Page objects created from an XObject remain valid after the XObject is closed.
  523. """
  524. raw_pageobj = pdfium_c.FPDF_NewFormObjectFromXObject(self)
  525. return PdfObject( # not a child object (see above)
  526. raw = raw_pageobj,
  527. pdf = self.pdf,
  528. )
  529. def _open_pdf(input_data, password, autoclose):
  530. to_hold, to_close = (), ()
  531. if password is not None:
  532. password = (password+"\x00").encode("utf-8")
  533. if isinstance(input_data, Path):
  534. pdf = pdfium_c.FPDF_LoadDocument((str(input_data)+"\x00").encode("utf-8"), password)
  535. elif isinstance(input_data, (bytes, ctypes.Array)):
  536. pdf = pdfium_c.FPDF_LoadMemDocument64(input_data, len(input_data), password)
  537. to_hold = (input_data, )
  538. elif pdfium_i.is_buffer(input_data, "r"):
  539. bufaccess, to_hold = pdfium_i.get_bufreader(input_data)
  540. if autoclose:
  541. to_close = (input_data, )
  542. pdf = pdfium_c.FPDF_LoadCustomDocument(bufaccess, password)
  543. else:
  544. raise TypeError(f"Invalid input type '{type(input_data).__name__}'")
  545. if pdfium_c.FPDF_GetPageCount(pdf) < 1:
  546. err_code = pdfium_c.FPDF_GetLastError()
  547. raise PdfiumError(f"Failed to load document (PDFium: {pdfium_i.ErrorToStr.get(err_code)}).")
  548. return pdf, to_hold, to_close
  549. # TODO(apibreak) change outline API (see above)
  550. PdfOutlineItem = namedtuple("PdfOutlineItem", "level title is_closed n_kids page_index view_mode view_pos")
  551. """
  552. Bookmark information.
  553. Parameters:
  554. level (int):
  555. Number of parent items.
  556. title (str):
  557. Title string of the bookmark.
  558. is_closed (bool):
  559. True if child items shall be collapsed, False if they shall be expanded.
  560. None if the item has no descendants (i. e. ``n_kids == 0``).
  561. n_kids (int):
  562. Absolute number of child items, according to the PDF.
  563. page_index (int | None):
  564. Zero-based index of the page the bookmark points to.
  565. May be None if the bookmark has no target page (or it could not be determined).
  566. view_mode (int):
  567. A view mode constant (:data:`PDFDEST_VIEW_*`) defining how the coordinates of *view_pos* shall be interpreted.
  568. view_pos (list[float]):
  569. Target position on the page the viewport should jump to when the bookmark is clicked.
  570. It is a sequence of :class:`float` values in PDF canvas units.
  571. Depending on *view_mode*, it may contain between 0 and 4 coordinates.
  572. """