| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166 |
- import logging
- from io import BytesIO
- from typing import TYPE_CHECKING, BinaryIO, Optional, Union
- from pdfminer import settings
- from pdfminer.casting import safe_int
- from pdfminer.pdfexceptions import PDFException
- from pdfminer.pdftypes import PDFObjRef, PDFStream, dict_value, int_value
- from pdfminer.psexceptions import PSEOF
- from pdfminer.psparser import KWD, PSKeyword, PSStackParser
- if TYPE_CHECKING:
- from pdfminer.pdfdocument import PDFDocument
- log = logging.getLogger(__name__)
class PDFSyntaxError(PDFException):
    """Signals malformed PDF syntax met while parsing.

    The parsers in this module raise it when strict mode is enabled and
    they meet a stream without /Length, an unexpected end of file, or an
    ``obj``/``endobj`` keyword inside a stream.
    """
# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
    """Stack parser that turns a binary PDF stream into PDF objects.

    Indirect references (``<id> <gen> R``) become PDFObjRef instances bound
    to the document registered through set_document(), and ``stream``
    keywords are turned into PDFStream objects.

    Typical usage:
      parser = PDFParser(fp)
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    NOTE(review): older documentation of this class also listed
    ``read_xref()`` calls; no such method is defined here -- verify before
    relying on it.
    """

    # Keywords that get special treatment in do_keyword().
    KEYWORD_R = KWD(b"R")
    KEYWORD_NULL = KWD(b"null")
    KEYWORD_ENDOBJ = KWD(b"endobj")
    KEYWORD_STREAM = KWD(b"stream")
    KEYWORD_XREF = KWD(b"xref")
    KEYWORD_STARTXREF = KWD(b"startxref")

    def __init__(self, fp: BinaryIO) -> None:
        """Create a parser reading from the binary file object *fp*."""
        PSStackParser.__init__(self, fp)
        # Document used to build references and to decipher streams.
        self.doc: Optional[PDFDocument] = None
        # When True, /Length is ignored and stream data is collected line
        # by line up to the ``endstream`` marker.
        self.fallback = False

    def set_document(self, doc: "PDFDocument") -> None:
        """Bind this parser to the PDFDocument *doc*."""
        self.doc = doc

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """React to the keyword *token* found at offset *pos*.

        - ``xref`` / ``startxref``: the top stack entry is moved to the results.
        - ``endobj``: the top four stack entries are moved to the results.
        - ``null``: ``None`` is pushed.
        - ``R``: the top two entries are popped and replaced by a PDFObjRef
          (nothing is pushed when the id is not an integer; the keyword is
          ignored when fewer than two entries are on the stack).
        - ``stream``: a PDFStream is read and pushed.
        - anything else: the keyword itself is pushed.
        """
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))
            return

        if token is self.KEYWORD_ENDOBJ:
            self.add_results(*self.pop(4))
            return

        if token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))
            return

        if token is self.KEYWORD_R:
            # reference to indirect object
            if len(self.curstack) < 2:
                return
            (_, raw_id), _ = self.pop(2)
            ref_id = safe_int(raw_id)
            if ref_id is not None:
                self.push((pos, PDFObjRef(self.doc, ref_id)))
            return

        if token is self.KEYWORD_STREAM:
            self._push_stream_object(pos)
            return

        # others
        self.push((pos, token))

    def _push_stream_object(self, pos: int) -> None:
        """Read the stream whose ``stream`` keyword sits at *pos* and push it.

        The stream dictionary is taken from the top of the stack. Outside
        fallback mode its /Length entry gives the number of bytes to read;
        the parser then scans forward, line by line, for ``endstream``. In
        fallback mode every scanned line is appended to the data instead.

        Raises PDFSyntaxError in strict mode when /Length is missing or the
        file ends prematurely.
        """
        ((_, raw_dict),) = self.pop(1)
        stream_dict = dict_value(raw_dict)

        length = 0
        if not self.fallback:
            try:
                length = int_value(stream_dict["Length"])
            except KeyError:
                if settings.STRICT:
                    raise PDFSyntaxError("/Length is undefined: %r" % stream_dict)

        # Re-read the line holding the 'stream' keyword to find where the
        # payload starts.
        self.seek(pos)
        try:
            _, keyword_line = self.nextline()
        except PSEOF:
            if settings.STRICT:
                raise PDFSyntaxError("Unexpected EOF")
            return

        data_start = pos + len(keyword_line)
        self.fp.seek(data_start)
        payload = bytearray(self.fp.read(length))
        self.seek(data_start + length)

        # Scan forward until the 'endstream' marker (or EOF) is reached.
        while True:
            try:
                _, line = self.nextline()
            except PSEOF:
                if settings.STRICT:
                    raise PDFSyntaxError("Unexpected EOF")
                break

            marker_at = line.find(b"endstream")
            if marker_at >= 0:
                length += marker_at
                if self.fallback:
                    payload += line[:marker_at]
                break

            length += len(line)
            if self.fallback:
                payload += line

        self.seek(data_start + length)
        # XXX limit objlen not to exceed object boundary
        log.debug(
            "Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
            data_start,
            length,
            stream_dict,
            payload[:10],
        )
        assert self.doc is not None
        stream = PDFStream(stream_dict, bytes(payload), self.doc.decipher)
        self.push((data_start, stream))
class PDFStreamParser(PDFParser):
    """Parser for PDF content streams held entirely in memory.

    A content stream is contained in each page and holds the instructions
    for rendering that page. A reference to a PDF document is needed
    because a content stream can also contain indirect references to other
    objects in the same document.
    """

    KEYWORD_OBJ = KWD(b"obj")

    def __init__(self, data: bytes) -> None:
        """Create a parser over the in-memory stream bytes *data*."""
        PDFParser.__init__(self, BytesIO(data))

    def flush(self) -> None:
        """Move every entry still on the stack to the results."""
        self.add_results(*self.popall())

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """React to the keyword *token* found at offset *pos*.

        - ``R``: the top two stack entries are replaced by a PDFObjRef
          (nothing is pushed when the id is not an integer).
        - ``obj`` / ``endobj``: rejected with PDFSyntaxError in strict mode,
          silently dropped otherwise.
        - anything else: the keyword itself is pushed.
        """
        if token is self.KEYWORD_R:
            # reference to indirect object
            # A stray "R" with fewer than two operands is ignored, exactly
            # as PDFParser.do_keyword does, instead of failing while
            # unpacking a short result from pop(2).
            if len(self.curstack) >= 2:
                (_, _object_id), _ = self.pop(2)
                object_id = safe_int(_object_id)
                if object_id is not None:
                    obj = PDFObjRef(self.doc, object_id)
                    self.push((pos, obj))
            return

        if token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
            if settings.STRICT:
                # See PDF Spec 3.4.6: Only the object values are stored in the
                # stream; the obj and endobj keywords are not used.
                # NOTE: the message names "endobj" for either keyword.
                raise PDFSyntaxError("Keyword endobj found in stream")
            return

        # others
        self.push((pos, token))
|