# pdfpage.py (8.1 KB)
  1. import itertools
  2. import logging
  3. from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple
  4. from pdfminer import settings
  5. from pdfminer.pdfdocument import (
  6. PDFDocument,
  7. PDFNoPageLabels,
  8. PDFTextExtractionNotAllowed,
  9. )
  10. from pdfminer.pdfexceptions import PDFObjectNotFound, PDFValueError
  11. from pdfminer.pdfparser import PDFParser
  12. from pdfminer.pdftypes import dict_value, int_value, list_value, resolve1
  13. from pdfminer.psparser import LIT
  14. from pdfminer.utils import Rect, parse_rect
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Some predefined literals and keywords: the /Type values that
# PDFPage.create_pages compares against by identity (`is`) to tell a single
# page ("Page") from a node with /Kids ("Pages").
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")
  19. class PDFPage:
  20. """An object that holds the information about a page.
  21. A PDFPage object is merely a convenience class that has a set
  22. of keys and values, which describe the properties of a page
  23. and point to its contents.
  24. Attributes
  25. ----------
  26. doc: a PDFDocument object.
  27. pageid: any Python object that can uniquely identify the page.
  28. attrs: a dictionary of page attributes.
  29. contents: a list of PDFStream objects that represents the page content.
  30. lastmod: the last modified time of the page.
  31. resources: a dictionary of resources used by the page.
  32. mediabox: the physical size of the page.
  33. cropbox: the crop rectangle of the page.
  34. rotate: the page rotation (in degree).
  35. annots: the page annotations.
  36. beads: a chain that represents natural reading order.
  37. label: the page's label (typically, the logical page number).
  38. """
  39. def __init__(
  40. self,
  41. doc: PDFDocument,
  42. pageid: object,
  43. attrs: object,
  44. label: Optional[str],
  45. ) -> None:
  46. """Initialize a page object.
  47. doc: a PDFDocument object.
  48. pageid: any Python object that can uniquely identify the page.
  49. attrs: a dictionary of page attributes.
  50. label: page label string.
  51. """
  52. self.doc = doc
  53. self.pageid = pageid
  54. self.attrs = dict_value(attrs)
  55. self.label = label
  56. self.lastmod = resolve1(self.attrs.get("LastModified"))
  57. self.resources: Dict[object, object] = resolve1(
  58. self.attrs.get("Resources", dict()),
  59. )
  60. self.mediabox = self._parse_mediabox(self.attrs.get("MediaBox"))
  61. self.cropbox = self._parse_cropbox(self.attrs.get("CropBox"), self.mediabox)
  62. self.contents = self._parse_contents(self.attrs.get("Contents"))
  63. self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
  64. self.annots = self.attrs.get("Annots")
  65. self.beads = self.attrs.get("B")
  66. def __repr__(self) -> str:
  67. return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"
  68. INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}
  69. @classmethod
  70. def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
  71. def depth_first_search(
  72. obj: Any,
  73. parent: Dict[str, Any],
  74. visited: Optional[Set[Any]] = None,
  75. ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]:
  76. if isinstance(obj, int):
  77. object_id = obj
  78. object_properties = dict_value(document.getobj(object_id)).copy()
  79. else:
  80. # This looks broken. obj.objid means obj could be either
  81. # PDFObjRef or PDFStream, but neither is valid for dict_value.
  82. object_id = obj.objid # type: ignore[attr-defined]
  83. object_properties = dict_value(obj).copy()
  84. # Avoid recursion errors by keeping track of visited nodes
  85. if visited is None:
  86. visited = set()
  87. if object_id in visited:
  88. return
  89. visited.add(object_id)
  90. for k, v in parent.items():
  91. if k in cls.INHERITABLE_ATTRS and k not in object_properties:
  92. object_properties[k] = v
  93. object_type = object_properties.get("Type")
  94. if object_type is None and not settings.STRICT: # See #64
  95. object_type = object_properties.get("type")
  96. if object_type is LITERAL_PAGES and "Kids" in object_properties:
  97. log.debug("Pages: Kids=%r", object_properties["Kids"])
  98. for child in list_value(object_properties["Kids"]):
  99. yield from depth_first_search(child, object_properties, visited)
  100. elif object_type is LITERAL_PAGE:
  101. log.debug("Page: %r", object_properties)
  102. yield (object_id, object_properties)
  103. try:
  104. page_labels: Iterator[Optional[str]] = document.get_page_labels()
  105. except PDFNoPageLabels:
  106. page_labels = itertools.repeat(None)
  107. pages = False
  108. if "Pages" in document.catalog:
  109. objects = depth_first_search(document.catalog["Pages"], document.catalog)
  110. for objid, tree in objects:
  111. yield cls(document, objid, tree, next(page_labels))
  112. pages = True
  113. if not pages:
  114. # fallback when /Pages is missing.
  115. for xref in document.xrefs:
  116. for objid in xref.get_objids():
  117. try:
  118. obj = document.getobj(objid)
  119. if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
  120. yield cls(document, objid, obj, next(page_labels))
  121. except PDFObjectNotFound:
  122. pass
  123. @classmethod
  124. def get_pages(
  125. cls,
  126. fp: BinaryIO,
  127. pagenos: Optional[Container[int]] = None,
  128. maxpages: int = 0,
  129. password: str = "",
  130. caching: bool = True,
  131. check_extractable: bool = False,
  132. ) -> Iterator["PDFPage"]:
  133. # Create a PDF parser object associated with the file object.
  134. parser = PDFParser(fp)
  135. # Create a PDF document object that stores the document structure.
  136. doc = PDFDocument(parser, password=password, caching=caching)
  137. # Check if the document allows text extraction.
  138. # If not, warn the user and proceed.
  139. if not doc.is_extractable:
  140. if check_extractable:
  141. error_msg = "Text extraction is not allowed: %r" % fp
  142. raise PDFTextExtractionNotAllowed(error_msg)
  143. else:
  144. warning_msg = (
  145. "The PDF %r contains a metadata field "
  146. "indicating that it should not allow "
  147. "text extraction. Ignoring this field "
  148. "and proceeding. Use the check_extractable "
  149. "if you want to raise an error in this case" % fp
  150. )
  151. log.warning(warning_msg)
  152. # Process each page contained in the document.
  153. for pageno, page in enumerate(cls.create_pages(doc)):
  154. if pagenos and (pageno not in pagenos):
  155. continue
  156. yield page
  157. if maxpages and maxpages <= pageno + 1:
  158. break
  159. def _parse_mediabox(self, value: Any) -> Rect:
  160. us_letter = (0.0, 0.0, 612.0, 792.0)
  161. if value is None:
  162. log.warning(
  163. "MediaBox missing from /Page (and not inherited), "
  164. "defaulting to US Letter"
  165. )
  166. return us_letter
  167. try:
  168. return parse_rect(resolve1(val) for val in resolve1(value))
  169. except PDFValueError:
  170. log.warning("Invalid MediaBox in /Page, defaulting to US Letter")
  171. return us_letter
  172. def _parse_cropbox(self, value: Any, mediabox: Rect) -> Rect:
  173. if value is None:
  174. # CropBox is optional, and MediaBox is used if not specified.
  175. return mediabox
  176. try:
  177. return parse_rect(resolve1(val) for val in resolve1(value))
  178. except PDFValueError:
  179. log.warning("Invalid CropBox in /Page, defaulting to MediaBox")
  180. return mediabox
  181. def _parse_contents(self, value: Any) -> List[Any]:
  182. contents: List[Any] = []
  183. if value is not None:
  184. contents = resolve1(value)
  185. if not isinstance(contents, list):
  186. contents = [contents]
  187. return contents