pdfdocument.py 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081
  1. import itertools
  2. import logging
  3. import re
  4. import struct
  5. from hashlib import md5, sha256, sha384, sha512
  6. from typing import (
  7. Any,
  8. Callable,
  9. Dict,
  10. Iterable,
  11. Iterator,
  12. KeysView,
  13. List,
  14. Optional,
  15. Sequence,
  16. Tuple,
  17. Type,
  18. Union,
  19. cast,
  20. )
  21. from cryptography.hazmat.backends import default_backend
  22. from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
  23. from pdfminer import settings
  24. from pdfminer.arcfour import Arcfour
  25. from pdfminer.casting import safe_int
  26. from pdfminer.data_structures import NumberTree
  27. from pdfminer.pdfexceptions import (
  28. PDFException,
  29. PDFKeyError,
  30. PDFObjectNotFound,
  31. PDFTypeError,
  32. )
  33. from pdfminer.pdfparser import PDFParser, PDFStreamParser, PDFSyntaxError
  34. from pdfminer.pdftypes import (
  35. DecipherCallable,
  36. PDFStream,
  37. decipher_all,
  38. dict_value,
  39. int_value,
  40. list_value,
  41. str_value,
  42. stream_value,
  43. uint_value,
  44. )
  45. from pdfminer.psexceptions import PSEOF
  46. from pdfminer.psparser import KWD, LIT, literal_name
  47. from pdfminer.utils import (
  48. choplist,
  49. decode_text,
  50. format_int_alpha,
  51. format_int_roman,
  52. nunpack,
  53. unpad_aes,
  54. )
  55. log = logging.getLogger(__name__)
  56. class PDFNoValidXRef(PDFSyntaxError):
  57. pass
  58. class PDFNoValidXRefWarning(SyntaxWarning):
  59. """Legacy warning for missing xref.
  60. Not used anymore because warnings.warn is replaced by logger.Logger.warn.
  61. """
  62. class PDFNoOutlines(PDFException):
  63. pass
  64. class PDFNoPageLabels(PDFException):
  65. pass
  66. class PDFDestinationNotFound(PDFException):
  67. pass
  68. class PDFEncryptionError(PDFException):
  69. pass
  70. class PDFPasswordIncorrect(PDFEncryptionError):
  71. pass
  72. class PDFEncryptionWarning(UserWarning):
  73. """Legacy warning for failed decryption.
  74. Not used anymore because warnings.warn is replaced by logger.Logger.warn.
  75. """
  76. class PDFTextExtractionNotAllowedWarning(UserWarning):
  77. """Legacy warning for PDF that does not allow extraction.
  78. Not used anymore because warnings.warn is replaced by logger.Logger.warn.
  79. """
  80. class PDFTextExtractionNotAllowed(PDFEncryptionError):
  81. pass
  82. # some predefined literals and keywords.
  83. LITERAL_OBJSTM = LIT("ObjStm")
  84. LITERAL_XREF = LIT("XRef")
  85. LITERAL_CATALOG = LIT("Catalog")
  86. class PDFBaseXRef:
  87. def get_trailer(self) -> Dict[str, Any]:
  88. raise NotImplementedError
  89. def get_objids(self) -> Iterable[int]:
  90. return []
  91. # Must return
  92. # (strmid, index, genno)
  93. # or (None, pos, genno)
  94. def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
  95. raise PDFKeyError(objid)
  96. def load(self, parser: PDFParser) -> None:
  97. raise NotImplementedError
  98. class PDFXRef(PDFBaseXRef):
  99. def __init__(self) -> None:
  100. self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {}
  101. self.trailer: Dict[str, Any] = {}
  102. def __repr__(self) -> str:
  103. return "<PDFXRef: offsets=%r>" % (self.offsets.keys())
  104. def load(self, parser: PDFParser) -> None:
  105. while True:
  106. try:
  107. (pos, line) = parser.nextline()
  108. line = line.strip()
  109. if not line:
  110. continue
  111. except PSEOF:
  112. raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
  113. if line.startswith(b"trailer"):
  114. parser.seek(pos)
  115. break
  116. f = line.split(b" ")
  117. if len(f) != 2:
  118. error_msg = f"Trailer not found: {parser!r}: line={line!r}"
  119. raise PDFNoValidXRef(error_msg)
  120. try:
  121. (start, nobjs) = map(int, f)
  122. except ValueError:
  123. error_msg = f"Invalid line: {parser!r}: line={line!r}"
  124. raise PDFNoValidXRef(error_msg)
  125. for objid in range(start, start + nobjs):
  126. try:
  127. (_, line) = parser.nextline()
  128. line = line.strip()
  129. except PSEOF:
  130. raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
  131. f = line.split(b" ")
  132. if len(f) != 3:
  133. error_msg = f"Invalid XRef format: {parser!r}, line={line!r}"
  134. raise PDFNoValidXRef(error_msg)
  135. (pos_b, genno_b, use_b) = f
  136. if use_b != b"n":
  137. continue
  138. pos_i = safe_int(pos_b)
  139. genno_i = safe_int(genno_b)
  140. if pos_i is not None and genno_i is not None:
  141. self.offsets[objid] = (None, pos_i, genno_i)
  142. else:
  143. log.warning(
  144. f"Not adding object {objid} to xref because position {pos_b!r} "
  145. f"or generation number {genno_b!r} cannot be parsed as an int"
  146. )
  147. log.debug("xref objects: %r", self.offsets)
  148. self.load_trailer(parser)
  149. def load_trailer(self, parser: PDFParser) -> None:
  150. try:
  151. (_, kwd) = parser.nexttoken()
  152. assert kwd is KWD(b"trailer"), str(kwd)
  153. (_, dic) = parser.nextobject()
  154. except PSEOF:
  155. x = parser.pop(1)
  156. if not x:
  157. raise PDFNoValidXRef("Unexpected EOF - file corrupted")
  158. (_, dic) = x[0]
  159. self.trailer.update(dict_value(dic))
  160. log.debug("trailer=%r", self.trailer)
  161. def get_trailer(self) -> Dict[str, Any]:
  162. return self.trailer
  163. def get_objids(self) -> KeysView[int]:
  164. return self.offsets.keys()
  165. def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
  166. return self.offsets[objid]
  167. class PDFXRefFallback(PDFXRef):
  168. def __repr__(self) -> str:
  169. return "<PDFXRefFallback: offsets=%r>" % (self.offsets.keys())
  170. PDFOBJ_CUE = re.compile(r"^(\d+)\s+(\d+)\s+obj\b")
  171. def load(self, parser: PDFParser) -> None:
  172. parser.seek(0)
  173. while 1:
  174. try:
  175. (pos, line_bytes) = parser.nextline()
  176. except PSEOF:
  177. break
  178. if line_bytes.startswith(b"trailer"):
  179. parser.seek(pos)
  180. self.load_trailer(parser)
  181. log.debug("trailer: %r", self.trailer)
  182. break
  183. line = line_bytes.decode("latin-1") # default pdf encoding
  184. m = self.PDFOBJ_CUE.match(line)
  185. if not m:
  186. continue
  187. (objid_s, genno_s) = m.groups()
  188. objid = int(objid_s)
  189. genno = int(genno_s)
  190. self.offsets[objid] = (None, pos, genno)
  191. # expand ObjStm.
  192. parser.seek(pos)
  193. (_, obj) = parser.nextobject()
  194. if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM:
  195. stream = stream_value(obj)
  196. try:
  197. n = stream["N"]
  198. except KeyError:
  199. if settings.STRICT:
  200. raise PDFSyntaxError("N is not defined: %r" % stream)
  201. n = 0
  202. parser1 = PDFStreamParser(stream.get_data())
  203. objs: List[int] = []
  204. try:
  205. while 1:
  206. (_, obj) = parser1.nextobject()
  207. objs.append(cast(int, obj))
  208. except PSEOF:
  209. pass
  210. n = min(n, len(objs) // 2)
  211. for index in range(n):
  212. objid1 = objs[index * 2]
  213. self.offsets[objid1] = (objid, index, 0)
  214. class PDFXRefStream(PDFBaseXRef):
  215. def __init__(self) -> None:
  216. self.data: Optional[bytes] = None
  217. self.entlen: Optional[int] = None
  218. self.fl1: Optional[int] = None
  219. self.fl2: Optional[int] = None
  220. self.fl3: Optional[int] = None
  221. self.ranges: List[Tuple[int, int]] = []
  222. def __repr__(self) -> str:
  223. return "<PDFXRefStream: ranges=%r>" % (self.ranges)
  224. def load(self, parser: PDFParser) -> None:
  225. (_, objid) = parser.nexttoken() # ignored
  226. (_, genno) = parser.nexttoken() # ignored
  227. (_, kwd) = parser.nexttoken()
  228. (_, stream) = parser.nextobject()
  229. if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF:
  230. raise PDFNoValidXRef("Invalid PDF stream spec.")
  231. size = stream["Size"]
  232. index_array = stream.get("Index", (0, size))
  233. if len(index_array) % 2 != 0:
  234. raise PDFSyntaxError("Invalid index number")
  235. self.ranges.extend(cast(Iterator[Tuple[int, int]], choplist(2, index_array)))
  236. (self.fl1, self.fl2, self.fl3) = stream["W"]
  237. assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
  238. self.data = stream.get_data()
  239. self.entlen = self.fl1 + self.fl2 + self.fl3
  240. self.trailer = stream.attrs
  241. log.debug(
  242. "xref stream: objid=%s, fields=%d,%d,%d",
  243. ", ".join(map(repr, self.ranges)),
  244. self.fl1,
  245. self.fl2,
  246. self.fl3,
  247. )
  248. def get_trailer(self) -> Dict[str, Any]:
  249. return self.trailer
  250. def get_objids(self) -> Iterator[int]:
  251. for start, nobjs in self.ranges:
  252. for i in range(nobjs):
  253. assert self.entlen is not None
  254. assert self.data is not None
  255. offset = self.entlen * i
  256. ent = self.data[offset : offset + self.entlen]
  257. f1 = nunpack(ent[: self.fl1], 1)
  258. if f1 == 1 or f1 == 2:
  259. yield start + i
  260. def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
  261. index = 0
  262. for start, nobjs in self.ranges:
  263. if start <= objid and objid < start + nobjs:
  264. index += objid - start
  265. break
  266. else:
  267. index += nobjs
  268. else:
  269. raise PDFKeyError(objid)
  270. assert self.entlen is not None
  271. assert self.data is not None
  272. assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
  273. offset = self.entlen * index
  274. ent = self.data[offset : offset + self.entlen]
  275. f1 = nunpack(ent[: self.fl1], 1)
  276. f2 = nunpack(ent[self.fl1 : self.fl1 + self.fl2])
  277. f3 = nunpack(ent[self.fl1 + self.fl2 :])
  278. if f1 == 1:
  279. return (None, f2, f3)
  280. elif f1 == 2:
  281. return (f2, f3, 0)
  282. else:
  283. # this is a free object
  284. raise PDFKeyError(objid)
  285. class PDFStandardSecurityHandler:
  286. PASSWORD_PADDING = (
  287. b"(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08"
  288. b"..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz"
  289. )
  290. supported_revisions: Tuple[int, ...] = (2, 3)
  291. def __init__(
  292. self,
  293. docid: Sequence[bytes],
  294. param: Dict[str, Any],
  295. password: str = "",
  296. ) -> None:
  297. self.docid = docid
  298. self.param = param
  299. self.password = password
  300. self.init()
  301. def init(self) -> None:
  302. self.init_params()
  303. if self.r not in self.supported_revisions:
  304. error_msg = "Unsupported revision: param=%r" % self.param
  305. raise PDFEncryptionError(error_msg)
  306. self.init_key()
  307. def init_params(self) -> None:
  308. self.v = int_value(self.param.get("V", 0))
  309. self.r = int_value(self.param["R"])
  310. self.p = uint_value(self.param["P"], 32)
  311. self.o = str_value(self.param["O"])
  312. self.u = str_value(self.param["U"])
  313. self.length = int_value(self.param.get("Length", 40))
  314. def init_key(self) -> None:
  315. self.key = self.authenticate(self.password)
  316. if self.key is None:
  317. raise PDFPasswordIncorrect
  318. def is_printable(self) -> bool:
  319. return bool(self.p & 4)
  320. def is_modifiable(self) -> bool:
  321. return bool(self.p & 8)
  322. def is_extractable(self) -> bool:
  323. return bool(self.p & 16)
  324. def compute_u(self, key: bytes) -> bytes:
  325. if self.r == 2:
  326. # Algorithm 3.4
  327. return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2
  328. else:
  329. # Algorithm 3.5
  330. hash = md5(self.PASSWORD_PADDING) # 2
  331. hash.update(self.docid[0]) # 3
  332. result = Arcfour(key).encrypt(hash.digest()) # 4
  333. for i in range(1, 20): # 5
  334. k = b"".join(bytes((c ^ i,)) for c in iter(key))
  335. result = Arcfour(k).encrypt(result)
  336. result += result # 6
  337. return result
  338. def compute_encryption_key(self, password: bytes) -> bytes:
  339. # Algorithm 3.2
  340. password = (password + self.PASSWORD_PADDING)[:32] # 1
  341. hash = md5(password) # 2
  342. hash.update(self.o) # 3
  343. # See https://github.com/pdfminer/pdfminer.six/issues/186
  344. hash.update(struct.pack("<L", self.p)) # 4
  345. hash.update(self.docid[0]) # 5
  346. if self.r >= 4:
  347. if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata:
  348. hash.update(b"\xff\xff\xff\xff")
  349. result = hash.digest()
  350. n = 5
  351. if self.r >= 3:
  352. n = self.length // 8
  353. for _ in range(50):
  354. result = md5(result[:n]).digest()
  355. return result[:n]
  356. def authenticate(self, password: str) -> Optional[bytes]:
  357. password_bytes = password.encode("latin1")
  358. key = self.authenticate_user_password(password_bytes)
  359. if key is None:
  360. key = self.authenticate_owner_password(password_bytes)
  361. return key
  362. def authenticate_user_password(self, password: bytes) -> Optional[bytes]:
  363. key = self.compute_encryption_key(password)
  364. if self.verify_encryption_key(key):
  365. return key
  366. else:
  367. return None
  368. def verify_encryption_key(self, key: bytes) -> bool:
  369. # Algorithm 3.6
  370. u = self.compute_u(key)
  371. if self.r == 2:
  372. return u == self.u
  373. return u[:16] == self.u[:16]
  374. def authenticate_owner_password(self, password: bytes) -> Optional[bytes]:
  375. # Algorithm 3.7
  376. password = (password + self.PASSWORD_PADDING)[:32]
  377. hash = md5(password)
  378. if self.r >= 3:
  379. for _ in range(50):
  380. hash = md5(hash.digest())
  381. n = 5
  382. if self.r >= 3:
  383. n = self.length // 8
  384. key = hash.digest()[:n]
  385. if self.r == 2:
  386. user_password = Arcfour(key).decrypt(self.o)
  387. else:
  388. user_password = self.o
  389. for i in range(19, -1, -1):
  390. k = b"".join(bytes((c ^ i,)) for c in iter(key))
  391. user_password = Arcfour(k).decrypt(user_password)
  392. return self.authenticate_user_password(user_password)
  393. def decrypt(
  394. self,
  395. objid: int,
  396. genno: int,
  397. data: bytes,
  398. attrs: Optional[Dict[str, Any]] = None,
  399. ) -> bytes:
  400. return self.decrypt_rc4(objid, genno, data)
  401. def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes:
  402. assert self.key is not None
  403. key = self.key + struct.pack("<L", objid)[:3] + struct.pack("<L", genno)[:2]
  404. hash = md5(key)
  405. key = hash.digest()[: min(len(key), 16)]
  406. return Arcfour(key).decrypt(data)
  407. class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
  408. supported_revisions: Tuple[int, ...] = (4,)
  409. def init_params(self) -> None:
  410. super().init_params()
  411. self.length = 128
  412. self.cf = dict_value(self.param.get("CF"))
  413. self.stmf = literal_name(self.param["StmF"])
  414. self.strf = literal_name(self.param["StrF"])
  415. self.encrypt_metadata = bool(self.param.get("EncryptMetadata", True))
  416. if self.stmf != self.strf:
  417. error_msg = "Unsupported crypt filter: param=%r" % self.param
  418. raise PDFEncryptionError(error_msg)
  419. self.cfm = {}
  420. for k, v in self.cf.items():
  421. f = self.get_cfm(literal_name(v["CFM"]))
  422. if f is None:
  423. error_msg = "Unknown crypt filter method: param=%r" % self.param
  424. raise PDFEncryptionError(error_msg)
  425. self.cfm[k] = f
  426. self.cfm["Identity"] = self.decrypt_identity
  427. if self.strf not in self.cfm:
  428. error_msg = "Undefined crypt filter: param=%r" % self.param
  429. raise PDFEncryptionError(error_msg)
  430. def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
  431. if name == "V2":
  432. return self.decrypt_rc4
  433. elif name == "AESV2":
  434. return self.decrypt_aes128
  435. else:
  436. return None
  437. def decrypt(
  438. self,
  439. objid: int,
  440. genno: int,
  441. data: bytes,
  442. attrs: Optional[Dict[str, Any]] = None,
  443. name: Optional[str] = None,
  444. ) -> bytes:
  445. if not self.encrypt_metadata and attrs is not None:
  446. t = attrs.get("Type")
  447. if t is not None and literal_name(t) == "Metadata":
  448. return data
  449. if name is None:
  450. name = self.strf
  451. return self.cfm[name](objid, genno, data)
  452. def decrypt_identity(self, objid: int, genno: int, data: bytes) -> bytes:
  453. return data
  454. def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes:
  455. assert self.key is not None
  456. key = (
  457. self.key
  458. + struct.pack("<L", objid)[:3]
  459. + struct.pack("<L", genno)[:2]
  460. + b"sAlT"
  461. )
  462. hash = md5(key)
  463. key = hash.digest()[: min(len(key), 16)]
  464. initialization_vector = data[:16]
  465. ciphertext = data[16:]
  466. cipher = Cipher(
  467. algorithms.AES(key),
  468. modes.CBC(initialization_vector),
  469. backend=default_backend(),
  470. ) # type: ignore
  471. plaintext = cipher.decryptor().update(ciphertext) # type: ignore
  472. return unpad_aes(plaintext)
  473. class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
  474. supported_revisions = (5, 6)
  475. def init_params(self) -> None:
  476. super().init_params()
  477. self.length = 256
  478. self.oe = str_value(self.param["OE"])
  479. self.ue = str_value(self.param["UE"])
  480. self.o_hash = self.o[:32]
  481. self.o_validation_salt = self.o[32:40]
  482. self.o_key_salt = self.o[40:]
  483. self.u_hash = self.u[:32]
  484. self.u_validation_salt = self.u[32:40]
  485. self.u_key_salt = self.u[40:]
  486. def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
  487. if name == "AESV3":
  488. return self.decrypt_aes256
  489. else:
  490. return None
  491. def authenticate(self, password: str) -> Optional[bytes]:
  492. password_b = self._normalize_password(password)
  493. hash = self._password_hash(password_b, self.o_validation_salt, self.u)
  494. if hash == self.o_hash:
  495. hash = self._password_hash(password_b, self.o_key_salt, self.u)
  496. cipher = Cipher(
  497. algorithms.AES(hash),
  498. modes.CBC(b"\0" * 16),
  499. backend=default_backend(),
  500. ) # type: ignore
  501. return cipher.decryptor().update(self.oe) # type: ignore
  502. hash = self._password_hash(password_b, self.u_validation_salt)
  503. if hash == self.u_hash:
  504. hash = self._password_hash(password_b, self.u_key_salt)
  505. cipher = Cipher(
  506. algorithms.AES(hash),
  507. modes.CBC(b"\0" * 16),
  508. backend=default_backend(),
  509. ) # type: ignore
  510. return cipher.decryptor().update(self.ue) # type: ignore
  511. return None
  512. def _normalize_password(self, password: str) -> bytes:
  513. if self.r == 6:
  514. # saslprep expects non-empty strings, apparently
  515. if not password:
  516. return b""
  517. from pdfminer._saslprep import saslprep
  518. password = saslprep(password)
  519. return password.encode("utf-8")[:127]
  520. def _password_hash(
  521. self,
  522. password: bytes,
  523. salt: bytes,
  524. vector: Optional[bytes] = None,
  525. ) -> bytes:
  526. """Compute password hash depending on revision number"""
  527. if self.r == 5:
  528. return self._r5_password(password, salt, vector)
  529. return self._r6_password(password, salt[0:8], vector)
  530. def _r5_password(
  531. self,
  532. password: bytes,
  533. salt: bytes,
  534. vector: Optional[bytes] = None,
  535. ) -> bytes:
  536. """Compute the password for revision 5"""
  537. hash = sha256(password)
  538. hash.update(salt)
  539. if vector is not None:
  540. hash.update(vector)
  541. return hash.digest()
  542. def _r6_password(
  543. self,
  544. password: bytes,
  545. salt: bytes,
  546. vector: Optional[bytes] = None,
  547. ) -> bytes:
  548. """Compute the password for revision 6"""
  549. initial_hash = sha256(password)
  550. initial_hash.update(salt)
  551. if vector is not None:
  552. initial_hash.update(vector)
  553. k = initial_hash.digest()
  554. hashes = (sha256, sha384, sha512)
  555. round_no = last_byte_val = 0
  556. while round_no < 64 or last_byte_val > round_no - 32:
  557. k1 = (password + k + (vector or b"")) * 64
  558. e = self._aes_cbc_encrypt(key=k[:16], iv=k[16:32], data=k1)
  559. # compute the first 16 bytes of e,
  560. # interpreted as an unsigned integer mod 3
  561. next_hash = hashes[self._bytes_mod_3(e[:16])]
  562. k = next_hash(e).digest()
  563. last_byte_val = e[len(e) - 1]
  564. round_no += 1
  565. return k[:32]
  566. @staticmethod
  567. def _bytes_mod_3(input_bytes: bytes) -> int:
  568. # 256 is 1 mod 3, so we can just sum 'em
  569. return sum(b % 3 for b in input_bytes) % 3
  570. def _aes_cbc_encrypt(self, key: bytes, iv: bytes, data: bytes) -> bytes:
  571. cipher = Cipher(algorithms.AES(key), modes.CBC(iv))
  572. encryptor = cipher.encryptor() # type: ignore
  573. return encryptor.update(data) + encryptor.finalize() # type: ignore
  574. def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes:
  575. initialization_vector = data[:16]
  576. ciphertext = data[16:]
  577. assert self.key is not None
  578. cipher = Cipher(
  579. algorithms.AES(self.key),
  580. modes.CBC(initialization_vector),
  581. backend=default_backend(),
  582. ) # type: ignore
  583. plaintext = cipher.decryptor().update(ciphertext) # type: ignore
  584. return unpad_aes(plaintext)
  585. class PDFDocument:
  586. """PDFDocument object represents a PDF document.
  587. Since a PDF file can be very big, normally it is not loaded at
  588. once. So PDF document has to cooperate with a PDF parser in order to
  589. dynamically import the data as processing goes.
  590. Typical usage:
  591. doc = PDFDocument(parser, password)
  592. obj = doc.getobj(objid)
  593. """
  594. security_handler_registry: Dict[int, Type[PDFStandardSecurityHandler]] = {
  595. 1: PDFStandardSecurityHandler,
  596. 2: PDFStandardSecurityHandler,
  597. 4: PDFStandardSecurityHandlerV4,
  598. 5: PDFStandardSecurityHandlerV5,
  599. }
  600. def __init__(
  601. self,
  602. parser: PDFParser,
  603. password: str = "",
  604. caching: bool = True,
  605. fallback: bool = True,
  606. ) -> None:
  607. """Set the document to use a given PDFParser object."""
  608. self.caching = caching
  609. self.xrefs: List[PDFBaseXRef] = []
  610. self.info = []
  611. self.catalog: Dict[str, Any] = {}
  612. self.encryption: Optional[Tuple[Any, Any]] = None
  613. self.decipher: Optional[DecipherCallable] = None
  614. self._parser = None
  615. self._cached_objs: Dict[int, Tuple[object, int]] = {}
  616. self._parsed_objs: Dict[int, Tuple[List[object], int]] = {}
  617. self._parser = parser
  618. self._parser.set_document(self)
  619. self.is_printable = self.is_modifiable = self.is_extractable = True
  620. # Retrieve the information of each header that was appended
  621. # (maybe multiple times) at the end of the document.
  622. try:
  623. pos = self.find_xref(parser)
  624. self.read_xref_from(parser, pos, self.xrefs)
  625. except PDFNoValidXRef:
  626. if fallback:
  627. parser.fallback = True
  628. newxref = PDFXRefFallback()
  629. newxref.load(parser)
  630. self.xrefs.append(newxref)
  631. for xref in self.xrefs:
  632. trailer = xref.get_trailer()
  633. if not trailer:
  634. continue
  635. # If there's an encryption info, remember it.
  636. if "Encrypt" in trailer:
  637. if "ID" in trailer:
  638. id_value = list_value(trailer["ID"])
  639. else:
  640. # Some documents may not have a /ID, use two empty
  641. # byte strings instead. Solves
  642. # https://github.com/pdfminer/pdfminer.six/issues/594
  643. id_value = (b"", b"")
  644. self.encryption = (id_value, dict_value(trailer["Encrypt"]))
  645. self._initialize_password(password)
  646. if "Info" in trailer:
  647. self.info.append(dict_value(trailer["Info"]))
  648. if "Root" in trailer:
  649. # Every PDF file must have exactly one /Root dictionary.
  650. self.catalog = dict_value(trailer["Root"])
  651. break
  652. else:
  653. raise PDFSyntaxError("No /Root object! - Is this really a PDF?")
  654. if self.catalog.get("Type") is not LITERAL_CATALOG:
  655. if settings.STRICT:
  656. raise PDFSyntaxError("Catalog not found!")
  657. KEYWORD_OBJ = KWD(b"obj")
  658. # _initialize_password(password=b'')
  659. # Perform the initialization with a given password.
  660. def _initialize_password(self, password: str = "") -> None:
  661. assert self.encryption is not None
  662. (docid, param) = self.encryption
  663. if literal_name(param.get("Filter")) != "Standard":
  664. raise PDFEncryptionError("Unknown filter: param=%r" % param)
  665. v = int_value(param.get("V", 0))
  666. factory = self.security_handler_registry.get(v)
  667. if factory is None:
  668. raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
  669. handler = factory(docid, param, password)
  670. self.decipher = handler.decrypt
  671. self.is_printable = handler.is_printable()
  672. self.is_modifiable = handler.is_modifiable()
  673. self.is_extractable = handler.is_extractable()
  674. assert self._parser is not None
  675. self._parser.fallback = False # need to read streams with exact length
  676. def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
  677. if stream.objid in self._parsed_objs:
  678. (objs, n) = self._parsed_objs[stream.objid]
  679. else:
  680. (objs, n) = self._get_objects(stream)
  681. if self.caching:
  682. assert stream.objid is not None
  683. self._parsed_objs[stream.objid] = (objs, n)
  684. i = n * 2 + index
  685. try:
  686. obj = objs[i]
  687. except IndexError:
  688. raise PDFSyntaxError("index too big: %r" % index)
  689. return obj
  690. def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
  691. if stream.get("Type") is not LITERAL_OBJSTM:
  692. if settings.STRICT:
  693. raise PDFSyntaxError("Not a stream object: %r" % stream)
  694. try:
  695. n = cast(int, stream["N"])
  696. except KeyError:
  697. if settings.STRICT:
  698. raise PDFSyntaxError("N is not defined: %r" % stream)
  699. n = 0
  700. parser = PDFStreamParser(stream.get_data())
  701. parser.set_document(self)
  702. objs: List[object] = []
  703. try:
  704. while 1:
  705. (_, obj) = parser.nextobject()
  706. objs.append(obj)
  707. except PSEOF:
  708. pass
  709. return (objs, n)
  710. def _getobj_parse(self, pos: int, objid: int) -> object:
  711. assert self._parser is not None
  712. self._parser.seek(pos)
  713. (_, objid1) = self._parser.nexttoken() # objid
  714. (_, genno) = self._parser.nexttoken() # genno
  715. (_, kwd) = self._parser.nexttoken()
  716. # hack around malformed pdf files
  717. # copied from https://github.com/jaepil/pdfminer3k/blob/master/
  718. # pdfminer/pdfparser.py#L399
  719. # to solve https://github.com/pdfminer/pdfminer.six/issues/56
  720. # assert objid1 == objid, str((objid1, objid))
  721. if objid1 != objid:
  722. x = []
  723. while kwd is not self.KEYWORD_OBJ:
  724. (_, kwd) = self._parser.nexttoken()
  725. x.append(kwd)
  726. if len(x) >= 2:
  727. objid1 = x[-2]
  728. # #### end hack around malformed pdf files
  729. if objid1 != objid:
  730. raise PDFSyntaxError(f"objid mismatch: {objid1!r}={objid!r}")
  731. if kwd != KWD(b"obj"):
  732. raise PDFSyntaxError("Invalid object spec: offset=%r" % pos)
  733. (_, obj) = self._parser.nextobject()
  734. return obj
  735. # can raise PDFObjectNotFound
  736. def getobj(self, objid: int) -> object:
  737. """Get object from PDF
  738. :raises PDFException if PDFDocument is not initialized
  739. :raises PDFObjectNotFound if objid does not exist in PDF
  740. """
  741. if not self.xrefs:
  742. raise PDFException("PDFDocument is not initialized")
  743. log.debug("getobj: objid=%r", objid)
  744. if objid in self._cached_objs:
  745. (obj, genno) = self._cached_objs[objid]
  746. else:
  747. for xref in self.xrefs:
  748. try:
  749. (strmid, index, genno) = xref.get_pos(objid)
  750. except KeyError:
  751. continue
  752. try:
  753. if strmid is not None:
  754. stream = stream_value(self.getobj(strmid))
  755. obj = self._getobj_objstm(stream, index, objid)
  756. else:
  757. obj = self._getobj_parse(index, objid)
  758. if self.decipher:
  759. obj = decipher_all(self.decipher, objid, genno, obj)
  760. if isinstance(obj, PDFStream):
  761. obj.set_objid(objid, genno)
  762. break
  763. except (PSEOF, PDFSyntaxError):
  764. continue
  765. else:
  766. raise PDFObjectNotFound(objid)
  767. log.debug("register: objid=%r: %r", objid, obj)
  768. if self.caching:
  769. self._cached_objs[objid] = (obj, genno)
  770. return obj
  OutlineType = Tuple[Any, Any, Any, Any, Any]

  def get_outlines(self) -> Iterator[OutlineType]:
      """Iterate over the document outline in depth-first order.

      Every yielded item is a ``(level, title, dest, action, se)`` tuple
      built from an outline entry's ``Title``, ``Dest``, ``A`` and ``SE``
      keys. Only entries that have a ``Title`` and at least one of ``A`` or
      ``Dest`` are yielded; other entries are still traversed.

      Siblings (``Next``) are followed with a loop rather than recursion so
      that a level with very many items cannot exhaust the interpreter's
      recursion limit. Only nesting (``First``) recurses.

      :return: an iterator of outline tuples
      :raises PDFNoOutlines: if the catalog has no ``Outlines`` entry
      """
      if "Outlines" not in self.catalog:
          raise PDFNoOutlines

      # Identifiers of the indirect objects already traversed, used to stop
      # on malformed outlines whose First/Next links form a cycle.
      visited: set = set()

      def search(entry: object, level: int) -> Iterator[PDFDocument.OutlineType]:
          while True:
              # NOTE(review): assumes indirect references expose an `objid`
              # attribute; entries without one are simply not tracked.
              ref_id = getattr(entry, "objid", None)
              if ref_id is not None:
                  if ref_id in visited:
                      log.warning(
                          "Circular reference in outlines: objid=%r", ref_id
                      )
                      return
                  visited.add(ref_id)

              node = dict_value(entry)
              if "Title" in node and ("A" in node or "Dest" in node):
                  title = decode_text(str_value(node["Title"]))
                  dest = node.get("Dest")
                  action = node.get("A")
                  se = node.get("SE")
                  yield (level, title, dest, action, se)
              if "First" in node and "Last" in node:
                  # Children are emitted before the following siblings.
                  yield from search(node["First"], level + 1)
              if "Next" not in node:
                  return
              entry = node["Next"]

      return search(self.catalog["Outlines"], 0)
  def get_page_labels(self) -> Iterator[str]:
      """Return an iterator of page label strings, one per page.

      The labels come from the catalog's ``PageLabels`` entry. When that
      entry is absent or cannot be used, PDFNoPageLabels is raised.

      The resulting iteration is unbounded.

      :raises PDFNoPageLabels: if the document defines no usable page labels
      """
      assert self.catalog is not None

      try:
          label_tree = PageLabels(self.catalog["PageLabels"])
      except (PDFTypeError, KeyError):
          raise PDFNoPageLabels
      else:
          return label_tree.labels
  def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
      """Look up ``key`` in the ``cat`` name tree of the catalog's ``Names``.

      The tree is walked from its root: ``Limits`` is used to skip nodes
      whose key range excludes ``key``, ``Names`` holds flat key/value
      pairs, and ``Kids`` lists child nodes.

      :param cat: name of the tree inside the ``Names`` dictionary
      :param key: key to search for
      :return: the value stored under ``key``; ``None`` if the root node
          itself carries ``Limits`` that exclude ``key``
      :raises PDFKeyError: if the catalog has no usable ``Names`` entry or
          no node contains ``key``
      :raises KeyError: if ``cat`` is missing from ``Names`` or a root-level
          ``Names`` array lacks ``key``
      """
      try:
          names = dict_value(self.catalog["Names"])
      except (PDFTypeError, KeyError):
          raise PDFKeyError((cat, key))
      # may raise KeyError
      d0 = dict_value(names[cat])

      def lookup(d: Dict[str, Any]) -> Any:
          if "Limits" in d:
              (k1, k2) = list_value(d["Limits"])
              if key < k1 or k2 < key:
                  # key is outside this node's range
                  return None
          if "Names" in d:
              objs = list_value(d["Names"])
              # The array alternates key, value, key, value, ...
              pairs = dict(
                  cast(Iterator[Tuple[Union[str, bytes], Any]], choplist(2, objs)),
              )
              return pairs[key]
          if "Kids" in d:
              for c in list_value(d["Kids"]):
                  try:
                      v = lookup(dict_value(c))
                  except KeyError:
                      # A miss in one kid must not hide a hit in a later one
                      # (e.g. kids without Limits).
                      # NOTE(review): assumes PDFKeyError is a KeyError
                      # subclass, as get_dest's handler suggests.
                      continue
                  # Compare against None so that valid but falsy values
                  # (0, empty string, empty list) are not discarded.
                  if v is not None:
                      return v
          raise PDFKeyError((cat, key))

      return lookup(d0)
  def get_dest(self, name: Union[str, bytes]) -> Any:
      """Return the destination object registered under ``name``.

      The ``Dests`` name tree is consulted first; if that lookup raises
      KeyError, the catalog's own ``Dests`` dictionary is used instead.

      :param name: destination name to resolve
      :return: the object stored for ``name``
      :raises PDFDestinationNotFound: if neither location contains ``name``
      """
      try:
          # PDF-1.2 or later
          return self.lookup_name("Dests", name)
      except KeyError:
          # PDF-1.1 or prior
          if "Dests" in self.catalog:
              legacy_dests = dict_value(self.catalog["Dests"])
              if name in legacy_dests:
                  return legacy_dests[name]
          raise PDFDestinationNotFound(name)
  839. # find_xref
  def find_xref(self, parser: PDFParser) -> int:
      """Internal function used to locate the first XRef.

      The lines produced by ``parser.revreadlines()`` are scanned until a
      ``startxref`` line is met; the non-blank line seen just before it in
      that scan is taken as the xref position.

      :param parser: parser supplying the lines via ``revreadlines()``
      :return: the xref position as a non-negative integer
      :raises PDFNoValidXRef: if no ``startxref`` line is found or the
          position is not a valid number
      """
      # search the last xref table by scanning the file backwards.
      last_nonblank = b""
      for raw_line in parser.revreadlines():
          stripped = raw_line.strip()
          log.debug("find_xref: %r", stripped)

          if stripped != b"startxref":
              # Remember the most recent non-empty line of the scan.
              if stripped:
                  last_nonblank = stripped
              continue

          log.debug("xref found: pos=%r", last_nonblank)
          if not last_nonblank.isdigit():
              raise PDFNoValidXRef(f"Invalid xref position: {last_nonblank!r}")

          start = int(last_nonblank)
          # Defensive only: isdigit() has already ruled out a leading sign.
          if not start >= 0:
              raise PDFNoValidXRef(f"Invalid negative xref position: {start}")
          return start

      raise PDFNoValidXRef("Unexpected EOF")
  858. # read xref table
  def read_xref_from(
      self,
      parser: PDFParser,
      start: int,
      xrefs: List[PDFBaseXRef],
  ) -> None:
      """Reads XRefs from the given location.

      The xref section at ``start`` is loaded and appended to ``xrefs``;
      then the sections named by its trailer's ``XRefStm`` and ``Prev``
      entries are loaded the same way, ``XRefStm`` first.

      The chain is followed with an explicit work stack instead of
      recursion, and every offset is read at most once. A malformed file
      whose ``Prev``/``XRefStm`` links loop therefore terminates, and a long
      chain cannot hit the interpreter's recursion limit.

      :param parser: parser positioned over the PDF data
      :param start: offset of the first xref section to read
      :param xrefs: list that receives the loaded xref objects, in read order
      :raises PDFNoValidXRef: if no token can be read at an xref offset
      """
      # LIFO stack of offsets still to read, and the offsets already read.
      pending = [start]
      visited: set = set()

      while pending:
          offset = pending.pop()
          if offset in visited:
              log.warning("Ignoring repeated xref position: %r", offset)
              continue
          visited.add(offset)

          parser.seek(offset)
          parser.reset()
          try:
              (pos, token) = parser.nexttoken()
          except PSEOF:
              raise PDFNoValidXRef("Unexpected EOF")
          log.debug("read_xref_from: start=%d, token=%r", offset, token)

          if isinstance(token, int):
              # XRefStream: PDF-1.5
              parser.seek(pos)
              parser.reset()
              xref: PDFBaseXRef = PDFXRefStream()
              xref.load(parser)
          else:
              if token is parser.KEYWORD_XREF:
                  parser.nextline()
              xref = PDFXRef()
              xref.load(parser)
          xrefs.append(xref)

          trailer = xref.get_trailer()
          log.debug("trailer: %r", trailer)

          followups = []
          if "XRefStm" in trailer:
              followups.append(int_value(trailer["XRefStm"]))
          if "Prev" in trailer:
              # find previous xref
              followups.append(int_value(trailer["Prev"]))
          # Push in reverse so that XRefStm (and everything it links to) is
          # read before Prev, matching a depth-first walk.
          pending.extend(reversed(followups))
  class PageLabels(NumberTree):
      """PageLabels from the document catalog.

      See Section 8.3.1 in the PDF Reference.
      """

      @property
      def labels(self) -> Iterator[str]:
          """Generate the label string of every page, in page order.

          ``self.values`` is read as a list of ``(start page index, label
          dictionary)`` ranges. Each range yields one label per page; the
          last range never ends, so the iteration is unbounded.

          :raises PDFSyntaxError: in strict mode, if the first range does not
              start at page index 0
          """
          ranges = self.values

          # The tree must begin with page index 0
          starts_at_zero = len(ranges) > 0 and ranges[0][0] == 0
          if not starts_at_zero:
              if settings.STRICT:
                  raise PDFSyntaxError("PageLabels is missing page index 0")
              # Try to cope, by assuming empty labels for the initial pages
              ranges.insert(0, (0, {}))

          # `following` is the index of the range after the current one.
          for following, (start, raw_label_dict) in enumerate(ranges, 1):
              label_dict = dict_value(raw_label_dict)
              style = label_dict.get("S")
              prefix = decode_text(str_value(label_dict.get("P", b"")))
              first_value = int_value(label_dict.get("St", 1))

              values: Iterable[int]
              if following != len(ranges):
                  # Bounded range: it stops where the next range starts.
                  end, _ = ranges[following]
                  values = range(first_value, first_value + (end - start))
              else:
                  # This is the last specified range. It continues until the
                  # end of the document.
                  values = itertools.count(first_value)

              for value in values:
                  yield prefix + self._format_page_label(value, style)

      @staticmethod
      def _format_page_label(value: int, style: Any) -> str:
          """Format page label value in a specific style.

          :param value: numeric value of the label
          :param style: numbering style from the label dictionary, or None
          :return: the formatted label; empty when there is no style or the
              style is not recognised (the latter also logs a warning)
          """
          if style is None:
              return ""
          if style is LIT("D"):  # Decimal arabic numerals
              return str(value)
          if style is LIT("R"):  # Uppercase roman numerals
              return format_int_roman(value).upper()
          if style is LIT("r"):  # Lowercase roman numerals
              return format_int_roman(value)
          if style is LIT("A"):  # Uppercase letters A-Z, AA-ZZ...
              return format_int_alpha(value).upper()
          if style is LIT("a"):  # Lowercase letters a-z, aa-zz...
              return format_int_alpha(value)

          log.warning("Unknown page label style: %r", style)
          return ""