pdfparser.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. import logging
  2. from io import BytesIO
  3. from typing import TYPE_CHECKING, BinaryIO, Optional, Union
  4. from pdfminer import settings
  5. from pdfminer.casting import safe_int
  6. from pdfminer.pdfexceptions import PDFException
  7. from pdfminer.pdftypes import PDFObjRef, PDFStream, dict_value, int_value
  8. from pdfminer.psexceptions import PSEOF
  9. from pdfminer.psparser import KWD, PSKeyword, PSStackParser
  10. if TYPE_CHECKING:
  11. from pdfminer.pdfdocument import PDFDocument
  12. log = logging.getLogger(__name__)
  class PDFSyntaxError(PDFException):
      """Error raised by the PDF parsers in this module for malformed input.

      It is raised (only when ``settings.STRICT`` is enabled) for conditions
      such as a stream dictionary without ``/Length``, an unexpected end of
      file while reading a stream, or ``obj``/``endobj`` keywords appearing
      inside a content stream.
      """
  # PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
  class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
      """PDFParser fetches PDF objects from a file stream.

      It can handle indirect references by referring to
      a PDF document set by the set_document method.
      It also reads XRefs at the end of every PDF file.

      Typical usage:
        parser = PDFParser(fp)
        parser.read_xref()
        parser.read_xref(fallback=True) # optional
        parser.set_document(doc)
        parser.seek(offset)
        parser.nextobject()

      """

      KEYWORD_R = KWD(b"R")
      KEYWORD_NULL = KWD(b"null")
      KEYWORD_ENDOBJ = KWD(b"endobj")
      KEYWORD_STREAM = KWD(b"stream")
      KEYWORD_XREF = KWD(b"xref")
      KEYWORD_STARTXREF = KWD(b"startxref")

      def __init__(self, fp: BinaryIO) -> None:
          """Create a parser reading from the binary file object *fp*."""
          super().__init__(fp)
          # Document used to resolve indirect references and to decipher
          # stream data; stays None until set_document() is called.
          self.doc: Optional["PDFDocument"] = None
          # When True, /Length is ignored and stream data is collected by
          # scanning line by line up to the "endstream" marker.
          self.fallback = False

      def set_document(self, doc: "PDFDocument") -> None:
          """Associates the parser with a PDFDocument object."""
          self.doc = doc

      def do_keyword(self, pos: int, token: PSKeyword) -> None:
          """Handles PDF-related keywords.

          :param pos: position at which the keyword token was found.
          :param token: the keyword token to handle.
          :raises PDFSyntaxError: in strict mode, for a stream whose /Length
              is missing or negative, or that hits an unexpected end of file.
          """
          if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
              self.add_results(*self.pop(1))

          elif token is self.KEYWORD_ENDOBJ:
              self.add_results(*self.pop(4))

          elif token is self.KEYWORD_NULL:
              # null object
              self.push((pos, None))

          elif token is self.KEYWORD_R:
              # reference to indirect object; needs two operands on the
              # stack, otherwise the keyword is silently dropped.
              if len(self.curstack) >= 2:
                  (_, _object_id), _ = self.pop(2)
                  object_id = safe_int(_object_id)
                  if object_id is not None:
                      obj = PDFObjRef(self.doc, object_id)
                      self.push((pos, obj))

          elif token is self.KEYWORD_STREAM:
              # stream object
              self._do_stream(pos)

          else:
              # others
              self.push((pos, token))

      def _do_stream(self, pos: int) -> None:
          """Read the stream that starts at *pos* and push a PDFStream.

          The stream dictionary is popped from the stack. Unless
          ``self.fallback`` is set, its /Length entry gives the number of
          bytes to read; the lines that follow are then scanned until one
          contains ``endstream``. In fallback mode the scanned lines
          themselves become the stream data.
          """
          ((_, dic),) = self.pop(1)
          dic = dict_value(dic)
          objlen = 0
          if not self.fallback:
              try:
                  objlen = int_value(dic["Length"])
              except KeyError as err:
                  if settings.STRICT:
                      raise PDFSyntaxError(
                          "/Length is undefined: %r" % dic
                      ) from err
              if objlen < 0:
                  # A negative size would make fp.read() consume the rest
                  # of the file and move the parser backwards; treat it the
                  # same way as a missing /Length.
                  if settings.STRICT:
                      raise PDFSyntaxError("/Length is negative: %r" % dic)
                  objlen = 0
          self.seek(pos)
          try:
              (_, line) = self.nextline()  # 'stream'
          except PSEOF as err:
              if settings.STRICT:
                  raise PDFSyntaxError("Unexpected EOF") from err
              return
          # Stream data begins right after the line holding the keyword.
          pos += len(line)
          self.fp.seek(pos)
          data = bytearray(self.fp.read(objlen))
          self.seek(pos + objlen)
          while True:
              try:
                  (_, line) = self.nextline()
              except PSEOF as err:
                  if settings.STRICT:
                      raise PDFSyntaxError("Unexpected EOF") from err
                  break
              end = line.find(b"endstream")
              if end != -1:
                  objlen += end
                  if self.fallback:
                      data += line[:end]
                  break
              objlen += len(line)
              if self.fallback:
                  data += line
          self.seek(pos + objlen)
          # XXX limit objlen not to exceed object boundary
          log.debug(
              "Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
              pos,
              objlen,
              dic,
              data[:10],
          )
          # Kept as an assert so the exception type seen by callers does not
          # change; a document must have been attached via set_document().
          assert self.doc is not None
          stream = PDFStream(dic, bytes(data), self.doc.decipher)
          self.push((pos, stream))
  class PDFStreamParser(PDFParser):
      """PDFStreamParser is used to parse PDF content streams
      that are contained in each page and have instructions
      for rendering the page. A reference to a PDF document is
      needed because a PDF content stream can also have
      indirect references to other objects in the same document.
      """

      KEYWORD_OBJ = KWD(b"obj")

      def __init__(self, data: bytes) -> None:
          """Create a parser over the in-memory content stream *data*."""
          PDFParser.__init__(self, BytesIO(data))

      def flush(self) -> None:
          """Move everything currently on the stack to the results."""
          self.add_results(*self.popall())

      def do_keyword(self, pos: int, token: PSKeyword) -> None:
          """Handle a keyword found in a content stream.

          ``R`` builds an indirect reference from the two preceding
          operands, ``obj``/``endobj`` are rejected (strict mode) or
          ignored, and any other keyword is pushed onto the stack unchanged.

          :param pos: position at which the keyword token was found.
          :param token: the keyword token to handle.
          :raises PDFSyntaxError: in strict mode, when ``obj`` or ``endobj``
              appears in the stream.
          """
          if token is self.KEYWORD_R:
              # reference to indirect object
              # Require two operands, as PDFParser.do_keyword does; without
              # this check a stray "R" makes the unpacking below fail with
              # a bare ValueError.
              if len(self.curstack) >= 2:
                  (_, _object_id), _ = self.pop(2)
                  object_id = safe_int(_object_id)
                  if object_id is not None:
                      obj = PDFObjRef(self.doc, object_id)
                      self.push((pos, obj))
              return

          elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
              if settings.STRICT:
                  # See PDF Spec 3.4.6: Only the object values are stored in the
                  # stream; the obj and endobj keywords are not used.
                  # NOTE: the message names endobj even when the offending
                  # keyword is obj; left unchanged for compatibility.
                  raise PDFSyntaxError("Keyword endobj found in stream")
              return

          # others
          self.push((pos, token))