pdffont.py 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221
  1. import logging
  2. import struct
  3. from io import BytesIO
  4. from typing import (
  5. TYPE_CHECKING,
  6. Any,
  7. BinaryIO,
  8. Dict,
  9. Iterable,
  10. Iterator,
  11. List,
  12. Mapping,
  13. Optional,
  14. Tuple,
  15. Union,
  16. cast,
  17. )
  18. from pdfminer import settings
  19. from pdfminer.casting import safe_float, safe_rect_list
  20. from pdfminer.cmapdb import (
  21. CMap,
  22. CMapBase,
  23. CMapDB,
  24. CMapParser,
  25. FileUnicodeMap,
  26. IdentityUnicodeMap,
  27. UnicodeMap,
  28. )
  29. from pdfminer.encodingdb import EncodingDB, name2unicode
  30. from pdfminer.fontmetrics import FONT_METRICS
  31. from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError
  32. from pdfminer.pdftypes import (
  33. PDFStream,
  34. dict_value,
  35. int_value,
  36. list_value,
  37. num_value,
  38. resolve1,
  39. resolve_all,
  40. stream_value,
  41. )
  42. from pdfminer.psexceptions import PSEOF
  43. from pdfminer.psparser import (
  44. KWD,
  45. LIT,
  46. PSKeyword,
  47. PSLiteral,
  48. PSStackParser,
  49. literal_name,
  50. )
  51. from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack
  52. if TYPE_CHECKING:
  53. from pdfminer.pdfinterp import PDFResourceManager
  54. log = logging.getLogger(__name__)
  55. def get_widths(seq: Iterable[object]) -> Dict[Union[str, int], float]:
  56. """Build a mapping of character widths for horizontal writing."""
  57. widths: Dict[int, float] = {}
  58. r: List[float] = []
  59. for v in seq:
  60. v = resolve1(v)
  61. if isinstance(v, list):
  62. if r:
  63. char1 = r[-1]
  64. for i, w in enumerate(v):
  65. widths[cast(int, char1) + i] = w
  66. r = []
  67. elif isinstance(v, (int, float)): # == utils.isnumber(v)
  68. r.append(v)
  69. if len(r) == 3:
  70. (char1, char2, w) = r
  71. if isinstance(char1, int) and isinstance(char2, int):
  72. for i in range(cast(int, char1), cast(int, char2) + 1):
  73. widths[i] = w
  74. else:
  75. log.warning(
  76. f"Skipping invalid font width specification for {char1} to {char2} because either of them is not an int"
  77. )
  78. r = []
  79. else:
  80. log.warning(
  81. f"Skipping invalid font width specification for {v} because it is not a number or a list"
  82. )
  83. return cast(Dict[Union[str, int], float], widths)
  84. def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
  85. """Build a mapping of character widths for vertical writing."""
  86. widths: Dict[int, Tuple[float, Point]] = {}
  87. r: List[float] = []
  88. for v in seq:
  89. if isinstance(v, list):
  90. if r:
  91. char1 = r[-1]
  92. for i, (w, vx, vy) in enumerate(choplist(3, v)):
  93. widths[cast(int, char1) + i] = (w, (vx, vy))
  94. r = []
  95. elif isinstance(v, (int, float)): # == utils.isnumber(v)
  96. r.append(v)
  97. if len(r) == 5:
  98. (char1, char2, w, vx, vy) = r
  99. for i in range(cast(int, char1), cast(int, char2) + 1):
  100. widths[i] = (w, (vx, vy))
  101. r = []
  102. return widths
  103. class FontMetricsDB:
  104. @classmethod
  105. def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]:
  106. return FONT_METRICS[fontname]
  107. # int here means that we're not extending PSStackParser with additional types.
  108. class Type1FontHeaderParser(PSStackParser[int]):
  109. KEYWORD_BEGIN = KWD(b"begin")
  110. KEYWORD_END = KWD(b"end")
  111. KEYWORD_DEF = KWD(b"def")
  112. KEYWORD_PUT = KWD(b"put")
  113. KEYWORD_DICT = KWD(b"dict")
  114. KEYWORD_ARRAY = KWD(b"array")
  115. KEYWORD_READONLY = KWD(b"readonly")
  116. KEYWORD_FOR = KWD(b"for")
  117. def __init__(self, data: BinaryIO) -> None:
  118. PSStackParser.__init__(self, data)
  119. self._cid2unicode: Dict[int, str] = {}
  120. def get_encoding(self) -> Dict[int, str]:
  121. """Parse the font encoding.
  122. The Type1 font encoding maps character codes to character names. These
  123. character names could either be standard Adobe glyph names, or
  124. character names associated with custom CharStrings for this font. A
  125. CharString is a sequence of operations that describe how the character
  126. should be drawn. Currently, this function returns '' (empty string)
  127. for character names that are associated with a CharStrings.
  128. Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format
  129. :returns mapping of character identifiers (cid's) to unicode characters
  130. """
  131. while 1:
  132. try:
  133. (cid, name) = self.nextobject()
  134. except PSEOF:
  135. break
  136. try:
  137. self._cid2unicode[cid] = name2unicode(cast(str, name))
  138. except KeyError as e:
  139. log.debug(str(e))
  140. return self._cid2unicode
  141. def do_keyword(self, pos: int, token: PSKeyword) -> None:
  142. if token is self.KEYWORD_PUT:
  143. ((_, key), (_, value)) = self.pop(2)
  144. if isinstance(key, int) and isinstance(value, PSLiteral):
  145. self.add_results((key, literal_name(value)))
# Nibble -> text table used by getdict() when decoding a real-number operand
# (leading byte 30).  getdict() treats nibble 15 as the terminator and never
# looks it up here; index 13 has no text (None) and getdict() asserts that it
# does not occur.
NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")
# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    "DLIdent-H": "Identity-H",
    "DLIdent-V": "Identity-V",
}
  153. def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
  154. d: Dict[int, List[Union[float, int]]] = {}
  155. fp = BytesIO(data)
  156. stack: List[Union[float, int]] = []
  157. while 1:
  158. c = fp.read(1)
  159. if not c:
  160. break
  161. b0 = ord(c)
  162. if b0 <= 21:
  163. d[b0] = stack
  164. stack = []
  165. continue
  166. if b0 == 30:
  167. s = ""
  168. loop = True
  169. while loop:
  170. b = ord(fp.read(1))
  171. for n in (b >> 4, b & 15):
  172. if n == 15:
  173. loop = False
  174. else:
  175. nibble = NIBBLES[n]
  176. assert nibble is not None
  177. s += nibble
  178. value = float(s)
  179. elif b0 >= 32 and b0 <= 246:
  180. value = b0 - 139
  181. else:
  182. b1 = ord(fp.read(1))
  183. if b0 >= 247 and b0 <= 250:
  184. value = ((b0 - 247) << 8) + b1 + 108
  185. elif b0 >= 251 and b0 <= 254:
  186. value = -((b0 - 251) << 8) - b1 - 108
  187. else:
  188. b2 = ord(fp.read(1))
  189. if b1 >= 128:
  190. b1 -= 256
  191. if b0 == 28:
  192. value = b1 << 8 | b2
  193. else:
  194. value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0]
  195. stack.append(value)
  196. return d
  197. class CFFFont:
  198. STANDARD_STRINGS = (
  199. ".notdef",
  200. "space",
  201. "exclam",
  202. "quotedbl",
  203. "numbersign",
  204. "dollar",
  205. "percent",
  206. "ampersand",
  207. "quoteright",
  208. "parenleft",
  209. "parenright",
  210. "asterisk",
  211. "plus",
  212. "comma",
  213. "hyphen",
  214. "period",
  215. "slash",
  216. "zero",
  217. "one",
  218. "two",
  219. "three",
  220. "four",
  221. "five",
  222. "six",
  223. "seven",
  224. "eight",
  225. "nine",
  226. "colon",
  227. "semicolon",
  228. "less",
  229. "equal",
  230. "greater",
  231. "question",
  232. "at",
  233. "A",
  234. "B",
  235. "C",
  236. "D",
  237. "E",
  238. "F",
  239. "G",
  240. "H",
  241. "I",
  242. "J",
  243. "K",
  244. "L",
  245. "M",
  246. "N",
  247. "O",
  248. "P",
  249. "Q",
  250. "R",
  251. "S",
  252. "T",
  253. "U",
  254. "V",
  255. "W",
  256. "X",
  257. "Y",
  258. "Z",
  259. "bracketleft",
  260. "backslash",
  261. "bracketright",
  262. "asciicircum",
  263. "underscore",
  264. "quoteleft",
  265. "a",
  266. "b",
  267. "c",
  268. "d",
  269. "e",
  270. "f",
  271. "g",
  272. "h",
  273. "i",
  274. "j",
  275. "k",
  276. "l",
  277. "m",
  278. "n",
  279. "o",
  280. "p",
  281. "q",
  282. "r",
  283. "s",
  284. "t",
  285. "u",
  286. "v",
  287. "w",
  288. "x",
  289. "y",
  290. "z",
  291. "braceleft",
  292. "bar",
  293. "braceright",
  294. "asciitilde",
  295. "exclamdown",
  296. "cent",
  297. "sterling",
  298. "fraction",
  299. "yen",
  300. "florin",
  301. "section",
  302. "currency",
  303. "quotesingle",
  304. "quotedblleft",
  305. "guillemotleft",
  306. "guilsinglleft",
  307. "guilsinglright",
  308. "fi",
  309. "fl",
  310. "endash",
  311. "dagger",
  312. "daggerdbl",
  313. "periodcentered",
  314. "paragraph",
  315. "bullet",
  316. "quotesinglbase",
  317. "quotedblbase",
  318. "quotedblright",
  319. "guillemotright",
  320. "ellipsis",
  321. "perthousand",
  322. "questiondown",
  323. "grave",
  324. "acute",
  325. "circumflex",
  326. "tilde",
  327. "macron",
  328. "breve",
  329. "dotaccent",
  330. "dieresis",
  331. "ring",
  332. "cedilla",
  333. "hungarumlaut",
  334. "ogonek",
  335. "caron",
  336. "emdash",
  337. "AE",
  338. "ordfeminine",
  339. "Lslash",
  340. "Oslash",
  341. "OE",
  342. "ordmasculine",
  343. "ae",
  344. "dotlessi",
  345. "lslash",
  346. "oslash",
  347. "oe",
  348. "germandbls",
  349. "onesuperior",
  350. "logicalnot",
  351. "mu",
  352. "trademark",
  353. "Eth",
  354. "onehalf",
  355. "plusminus",
  356. "Thorn",
  357. "onequarter",
  358. "divide",
  359. "brokenbar",
  360. "degree",
  361. "thorn",
  362. "threequarters",
  363. "twosuperior",
  364. "registered",
  365. "minus",
  366. "eth",
  367. "multiply",
  368. "threesuperior",
  369. "copyright",
  370. "Aacute",
  371. "Acircumflex",
  372. "Adieresis",
  373. "Agrave",
  374. "Aring",
  375. "Atilde",
  376. "Ccedilla",
  377. "Eacute",
  378. "Ecircumflex",
  379. "Edieresis",
  380. "Egrave",
  381. "Iacute",
  382. "Icircumflex",
  383. "Idieresis",
  384. "Igrave",
  385. "Ntilde",
  386. "Oacute",
  387. "Ocircumflex",
  388. "Odieresis",
  389. "Ograve",
  390. "Otilde",
  391. "Scaron",
  392. "Uacute",
  393. "Ucircumflex",
  394. "Udieresis",
  395. "Ugrave",
  396. "Yacute",
  397. "Ydieresis",
  398. "Zcaron",
  399. "aacute",
  400. "acircumflex",
  401. "adieresis",
  402. "agrave",
  403. "aring",
  404. "atilde",
  405. "ccedilla",
  406. "eacute",
  407. "ecircumflex",
  408. "edieresis",
  409. "egrave",
  410. "iacute",
  411. "icircumflex",
  412. "idieresis",
  413. "igrave",
  414. "ntilde",
  415. "oacute",
  416. "ocircumflex",
  417. "odieresis",
  418. "ograve",
  419. "otilde",
  420. "scaron",
  421. "uacute",
  422. "ucircumflex",
  423. "udieresis",
  424. "ugrave",
  425. "yacute",
  426. "ydieresis",
  427. "zcaron",
  428. "exclamsmall",
  429. "Hungarumlautsmall",
  430. "dollaroldstyle",
  431. "dollarsuperior",
  432. "ampersandsmall",
  433. "Acutesmall",
  434. "parenleftsuperior",
  435. "parenrightsuperior",
  436. "twodotenleader",
  437. "onedotenleader",
  438. "zerooldstyle",
  439. "oneoldstyle",
  440. "twooldstyle",
  441. "threeoldstyle",
  442. "fouroldstyle",
  443. "fiveoldstyle",
  444. "sixoldstyle",
  445. "sevenoldstyle",
  446. "eightoldstyle",
  447. "nineoldstyle",
  448. "commasuperior",
  449. "threequartersemdash",
  450. "periodsuperior",
  451. "questionsmall",
  452. "asuperior",
  453. "bsuperior",
  454. "centsuperior",
  455. "dsuperior",
  456. "esuperior",
  457. "isuperior",
  458. "lsuperior",
  459. "msuperior",
  460. "nsuperior",
  461. "osuperior",
  462. "rsuperior",
  463. "ssuperior",
  464. "tsuperior",
  465. "ff",
  466. "ffi",
  467. "ffl",
  468. "parenleftinferior",
  469. "parenrightinferior",
  470. "Circumflexsmall",
  471. "hyphensuperior",
  472. "Gravesmall",
  473. "Asmall",
  474. "Bsmall",
  475. "Csmall",
  476. "Dsmall",
  477. "Esmall",
  478. "Fsmall",
  479. "Gsmall",
  480. "Hsmall",
  481. "Ismall",
  482. "Jsmall",
  483. "Ksmall",
  484. "Lsmall",
  485. "Msmall",
  486. "Nsmall",
  487. "Osmall",
  488. "Psmall",
  489. "Qsmall",
  490. "Rsmall",
  491. "Ssmall",
  492. "Tsmall",
  493. "Usmall",
  494. "Vsmall",
  495. "Wsmall",
  496. "Xsmall",
  497. "Ysmall",
  498. "Zsmall",
  499. "colonmonetary",
  500. "onefitted",
  501. "rupiah",
  502. "Tildesmall",
  503. "exclamdownsmall",
  504. "centoldstyle",
  505. "Lslashsmall",
  506. "Scaronsmall",
  507. "Zcaronsmall",
  508. "Dieresissmall",
  509. "Brevesmall",
  510. "Caronsmall",
  511. "Dotaccentsmall",
  512. "Macronsmall",
  513. "figuredash",
  514. "hypheninferior",
  515. "Ogoneksmall",
  516. "Ringsmall",
  517. "Cedillasmall",
  518. "questiondownsmall",
  519. "oneeighth",
  520. "threeeighths",
  521. "fiveeighths",
  522. "seveneighths",
  523. "onethird",
  524. "twothirds",
  525. "zerosuperior",
  526. "foursuperior",
  527. "fivesuperior",
  528. "sixsuperior",
  529. "sevensuperior",
  530. "eightsuperior",
  531. "ninesuperior",
  532. "zeroinferior",
  533. "oneinferior",
  534. "twoinferior",
  535. "threeinferior",
  536. "fourinferior",
  537. "fiveinferior",
  538. "sixinferior",
  539. "seveninferior",
  540. "eightinferior",
  541. "nineinferior",
  542. "centinferior",
  543. "dollarinferior",
  544. "periodinferior",
  545. "commainferior",
  546. "Agravesmall",
  547. "Aacutesmall",
  548. "Acircumflexsmall",
  549. "Atildesmall",
  550. "Adieresissmall",
  551. "Aringsmall",
  552. "AEsmall",
  553. "Ccedillasmall",
  554. "Egravesmall",
  555. "Eacutesmall",
  556. "Ecircumflexsmall",
  557. "Edieresissmall",
  558. "Igravesmall",
  559. "Iacutesmall",
  560. "Icircumflexsmall",
  561. "Idieresissmall",
  562. "Ethsmall",
  563. "Ntildesmall",
  564. "Ogravesmall",
  565. "Oacutesmall",
  566. "Ocircumflexsmall",
  567. "Otildesmall",
  568. "Odieresissmall",
  569. "OEsmall",
  570. "Oslashsmall",
  571. "Ugravesmall",
  572. "Uacutesmall",
  573. "Ucircumflexsmall",
  574. "Udieresissmall",
  575. "Yacutesmall",
  576. "Thornsmall",
  577. "Ydieresissmall",
  578. "001.000",
  579. "001.001",
  580. "001.002",
  581. "001.003",
  582. "Black",
  583. "Bold",
  584. "Book",
  585. "Light",
  586. "Medium",
  587. "Regular",
  588. "Roman",
  589. "Semibold",
  590. )
  591. class INDEX:
  592. def __init__(self, fp: BinaryIO) -> None:
  593. self.fp = fp
  594. self.offsets: List[int] = []
  595. (count, offsize) = struct.unpack(">HB", self.fp.read(3))
  596. for i in range(count + 1):
  597. self.offsets.append(nunpack(self.fp.read(offsize)))
  598. self.base = self.fp.tell() - 1
  599. self.fp.seek(self.base + self.offsets[-1])
  600. def __repr__(self) -> str:
  601. return "<INDEX: size=%d>" % len(self)
  602. def __len__(self) -> int:
  603. return len(self.offsets) - 1
  604. def __getitem__(self, i: int) -> bytes:
  605. self.fp.seek(self.base + self.offsets[i])
  606. return self.fp.read(self.offsets[i + 1] - self.offsets[i])
  607. def __iter__(self) -> Iterator[bytes]:
  608. return iter(self[i] for i in range(len(self)))
  609. def __init__(self, name: str, fp: BinaryIO) -> None:
  610. self.name = name
  611. self.fp = fp
  612. # Header
  613. (_major, _minor, hdrsize, offsize) = struct.unpack("BBBB", self.fp.read(4))
  614. self.fp.read(hdrsize - 4)
  615. # Name INDEX
  616. self.name_index = self.INDEX(self.fp)
  617. # Top DICT INDEX
  618. self.dict_index = self.INDEX(self.fp)
  619. # String INDEX
  620. self.string_index = self.INDEX(self.fp)
  621. # Global Subr INDEX
  622. self.subr_index = self.INDEX(self.fp)
  623. # Top DICT DATA
  624. self.top_dict = getdict(self.dict_index[0])
  625. (charset_pos,) = self.top_dict.get(15, [0])
  626. (encoding_pos,) = self.top_dict.get(16, [0])
  627. (charstring_pos,) = self.top_dict.get(17, [0])
  628. # CharStrings
  629. self.fp.seek(cast(int, charstring_pos))
  630. self.charstring = self.INDEX(self.fp)
  631. self.nglyphs = len(self.charstring)
  632. # Encodings
  633. self.code2gid = {}
  634. self.gid2code = {}
  635. self.fp.seek(cast(int, encoding_pos))
  636. format = self.fp.read(1)
  637. if format == b"\x00":
  638. # Format 0
  639. (n,) = struct.unpack("B", self.fp.read(1))
  640. for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):
  641. self.code2gid[code] = gid
  642. self.gid2code[gid] = code
  643. elif format == b"\x01":
  644. # Format 1
  645. (n,) = struct.unpack("B", self.fp.read(1))
  646. code = 0
  647. for i in range(n):
  648. (first, nleft) = struct.unpack("BB", self.fp.read(2))
  649. for gid in range(first, first + nleft + 1):
  650. self.code2gid[code] = gid
  651. self.gid2code[gid] = code
  652. code += 1
  653. else:
  654. raise PDFValueError("unsupported encoding format: %r" % format)
  655. # Charsets
  656. self.name2gid = {}
  657. self.gid2name = {}
  658. self.fp.seek(cast(int, charset_pos))
  659. format = self.fp.read(1)
  660. if format == b"\x00":
  661. # Format 0
  662. n = self.nglyphs - 1
  663. for gid, sid in enumerate(
  664. cast(
  665. Tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n))
  666. ),
  667. ):
  668. gid += 1
  669. sidname = self.getstr(sid)
  670. self.name2gid[sidname] = gid
  671. self.gid2name[gid] = sidname
  672. elif format == b"\x01":
  673. # Format 1
  674. (n,) = struct.unpack("B", self.fp.read(1))
  675. sid = 0
  676. for i in range(n):
  677. (first, nleft) = struct.unpack("BB", self.fp.read(2))
  678. for gid in range(first, first + nleft + 1):
  679. sidname = self.getstr(sid)
  680. self.name2gid[sidname] = gid
  681. self.gid2name[gid] = sidname
  682. sid += 1
  683. elif format == b"\x02":
  684. # Format 2
  685. assert False, str(("Unhandled", format))
  686. else:
  687. raise PDFValueError("unsupported charset format: %r" % format)
  688. def getstr(self, sid: int) -> Union[str, bytes]:
  689. # This returns str for one of the STANDARD_STRINGS but bytes otherwise,
  690. # and appears to be a needless source of type complexity.
  691. if sid < len(self.STANDARD_STRINGS):
  692. return self.STANDARD_STRINGS[sid]
  693. return self.string_index[sid - len(self.STANDARD_STRINGS)]
  694. class TrueTypeFont:
  695. class CMapNotFound(PDFException):
  696. pass
  697. def __init__(self, name: str, fp: BinaryIO) -> None:
  698. self.name = name
  699. self.fp = fp
  700. self.tables: Dict[bytes, Tuple[int, int]] = {}
  701. self.fonttype = fp.read(4)
  702. try:
  703. (ntables, _1, _2, _3) = cast(
  704. Tuple[int, int, int, int],
  705. struct.unpack(">HHHH", fp.read(8)),
  706. )
  707. for _ in range(ntables):
  708. (name_bytes, tsum, offset, length) = cast(
  709. Tuple[bytes, int, int, int],
  710. struct.unpack(">4sLLL", fp.read(16)),
  711. )
  712. self.tables[name_bytes] = (offset, length)
  713. except struct.error:
  714. # Do not fail if there are not enough bytes to read. Even for
  715. # corrupted PDFs we would like to get as much information as
  716. # possible, so continue.
  717. pass
  718. def create_unicode_map(self) -> FileUnicodeMap:
  719. if b"cmap" not in self.tables:
  720. raise TrueTypeFont.CMapNotFound
  721. (base_offset, length) = self.tables[b"cmap"]
  722. fp = self.fp
  723. fp.seek(base_offset)
  724. (version, nsubtables) = cast(Tuple[int, int], struct.unpack(">HH", fp.read(4)))
  725. subtables: List[Tuple[int, int, int]] = []
  726. for i in range(nsubtables):
  727. subtables.append(
  728. cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8))),
  729. )
  730. char2gid: Dict[int, int] = {}
  731. # Only supports subtable type 0, 2 and 4.
  732. for platform_id, encoding_id, st_offset in subtables:
  733. # Skip non-Unicode cmaps.
  734. # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
  735. if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
  736. continue
  737. fp.seek(base_offset + st_offset)
  738. (fmttype, fmtlen, fmtlang) = cast(
  739. Tuple[int, int, int],
  740. struct.unpack(">HHH", fp.read(6)),
  741. )
  742. if fmttype == 0:
  743. char2gid.update(
  744. enumerate(
  745. cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))),
  746. ),
  747. )
  748. elif fmttype == 2:
  749. subheaderkeys = cast(
  750. Tuple[int, ...],
  751. struct.unpack(">256H", fp.read(512)),
  752. )
  753. firstbytes = [0] * 8192
  754. for i, k in enumerate(subheaderkeys):
  755. firstbytes[k // 8] = i
  756. nhdrs = max(subheaderkeys) // 8 + 1
  757. hdrs: List[Tuple[int, int, int, int, int]] = []
  758. for i in range(nhdrs):
  759. (firstcode, entcount, delta, offset) = cast(
  760. Tuple[int, int, int, int],
  761. struct.unpack(">HHhH", fp.read(8)),
  762. )
  763. hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
  764. for i, firstcode, entcount, delta, pos in hdrs:
  765. if not entcount:
  766. continue
  767. first = firstcode + (firstbytes[i] << 8)
  768. fp.seek(pos)
  769. for c in range(entcount):
  770. gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
  771. if gid:
  772. gid += delta
  773. char2gid[first + c] = gid
  774. elif fmttype == 4:
  775. (segcount, _1, _2, _3) = cast(
  776. Tuple[int, int, int, int],
  777. struct.unpack(">HHHH", fp.read(8)),
  778. )
  779. segcount //= 2
  780. ecs = cast(
  781. Tuple[int, ...],
  782. struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
  783. )
  784. fp.read(2)
  785. scs = cast(
  786. Tuple[int, ...],
  787. struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
  788. )
  789. idds = cast(
  790. Tuple[int, ...],
  791. struct.unpack(">%dh" % segcount, fp.read(2 * segcount)),
  792. )
  793. pos = fp.tell()
  794. idrs = cast(
  795. Tuple[int, ...],
  796. struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
  797. )
  798. for ec, sc, idd, idr in zip(ecs, scs, idds, idrs):
  799. if idr:
  800. fp.seek(pos + idr)
  801. for c in range(sc, ec + 1):
  802. b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
  803. char2gid[c] = (b + idd) & 0xFFFF
  804. else:
  805. for c in range(sc, ec + 1):
  806. char2gid[c] = (c + idd) & 0xFFFF
  807. else:
  808. assert False, str(("Unhandled", fmttype))
  809. if not char2gid:
  810. raise TrueTypeFont.CMapNotFound
  811. # create unicode map
  812. unicode_map = FileUnicodeMap()
  813. for char, gid in char2gid.items():
  814. unicode_map.add_cid2unichr(gid, char)
  815. return unicode_map
  816. class PDFFontError(PDFException):
  817. pass
  818. class PDFUnicodeNotDefined(PDFFontError):
  819. pass
# Default encoding name used by PDFSimpleFont when a font spec has no
# /Encoding entry or an encoding dictionary has no /BaseEncoding.
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
# Literal for the "Type1C" name; not referenced in the part of the module
# shown here.
LITERAL_TYPE1C = LIT("Type1C")
# Font widths are maintained in a dict type that maps from *either* unicode
# chars or integer character IDs.
FontWidthDict = Dict[Union[int, str], float]
  825. class PDFFont:
  826. def __init__(
  827. self,
  828. descriptor: Mapping[str, Any],
  829. widths: FontWidthDict,
  830. default_width: Optional[float] = None,
  831. ) -> None:
  832. self.descriptor = descriptor
  833. self.widths: FontWidthDict = resolve_all(widths)
  834. self.fontname = resolve1(descriptor.get("FontName", "unknown"))
  835. if isinstance(self.fontname, PSLiteral):
  836. self.fontname = literal_name(self.fontname)
  837. self.flags = int_value(descriptor.get("Flags", 0))
  838. self.ascent = num_value(descriptor.get("Ascent", 0))
  839. self.descent = num_value(descriptor.get("Descent", 0))
  840. self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
  841. if default_width is None:
  842. self.default_width = num_value(descriptor.get("MissingWidth", 0))
  843. else:
  844. self.default_width = default_width
  845. self.default_width = resolve1(self.default_width)
  846. self.leading = num_value(descriptor.get("Leading", 0))
  847. self.bbox = self._parse_bbox(descriptor)
  848. self.hscale = self.vscale = 0.001
  849. # PDF RM 9.8.1 specifies /Descent should always be a negative number.
  850. # PScript5.dll seems to produce Descent with a positive number, but
  851. # text analysis will be wrong if this is taken as correct. So force
  852. # descent to negative.
  853. if self.descent > 0:
  854. self.descent = -self.descent
  855. def __repr__(self) -> str:
  856. return "<PDFFont>"
  857. def is_vertical(self) -> bool:
  858. return False
  859. def is_multibyte(self) -> bool:
  860. return False
  861. def decode(self, bytes: bytes) -> Iterable[int]:
  862. return bytearray(bytes) # map(ord, bytes)
  863. def get_ascent(self) -> float:
  864. """Ascent above the baseline, in text space units"""
  865. return self.ascent * self.vscale
  866. def get_descent(self) -> float:
  867. """Descent below the baseline, in text space units; always negative"""
  868. return self.descent * self.vscale
  869. def get_width(self) -> float:
  870. w = self.bbox[2] - self.bbox[0]
  871. if w == 0:
  872. w = -self.default_width
  873. return w * self.hscale
  874. def get_height(self) -> float:
  875. h = self.bbox[3] - self.bbox[1]
  876. if h == 0:
  877. h = self.ascent - self.descent
  878. return h * self.vscale
  879. def char_width(self, cid: int) -> float:
  880. # Because character widths may be mapping either IDs or strings,
  881. # we try to lookup the character ID first, then its str equivalent.
  882. cid_width = safe_float(self.widths.get(cid))
  883. if cid_width is not None:
  884. return cid_width * self.hscale
  885. try:
  886. str_cid = self.to_unichr(cid)
  887. cid_width = safe_float(self.widths.get(str_cid))
  888. if cid_width is not None:
  889. return cid_width * self.hscale
  890. except PDFUnicodeNotDefined:
  891. pass
  892. return self.default_width * self.hscale
  893. def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
  894. """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
  895. return 0
  896. def string_width(self, s: bytes) -> float:
  897. return sum(self.char_width(cid) for cid in self.decode(s))
  898. def to_unichr(self, cid: int) -> str:
  899. raise NotImplementedError
  900. @staticmethod
  901. def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
  902. """Parse FontBBox from the fonts descriptor"""
  903. font_bbox = resolve_all(descriptor.get("FontBBox"))
  904. bbox = safe_rect_list(font_bbox)
  905. if bbox is None:
  906. log.warning(
  907. f"Could get FontBBox from font descriptor because {font_bbox!r} cannot be parsed as 4 floats"
  908. )
  909. return 0.0, 0.0, 0.0, 0.0
  910. return bbox
  911. class PDFSimpleFont(PDFFont):
  912. def __init__(
  913. self,
  914. descriptor: Mapping[str, Any],
  915. widths: FontWidthDict,
  916. spec: Mapping[str, Any],
  917. ) -> None:
  918. # Font encoding is specified either by a name of
  919. # built-in encoding or a dictionary that describes
  920. # the differences.
  921. if "Encoding" in spec:
  922. encoding = resolve1(spec["Encoding"])
  923. else:
  924. encoding = LITERAL_STANDARD_ENCODING
  925. if isinstance(encoding, dict):
  926. name = literal_name(encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING))
  927. diff = list_value(encoding.get("Differences", []))
  928. self.cid2unicode = EncodingDB.get_encoding(name, diff)
  929. else:
  930. self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
  931. self.unicode_map: Optional[UnicodeMap] = None
  932. if "ToUnicode" in spec:
  933. strm = stream_value(spec["ToUnicode"])
  934. self.unicode_map = FileUnicodeMap()
  935. CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
  936. PDFFont.__init__(self, descriptor, widths)
  937. def to_unichr(self, cid: int) -> str:
  938. if self.unicode_map:
  939. try:
  940. return self.unicode_map.get_unichr(cid)
  941. except KeyError:
  942. pass
  943. try:
  944. return self.cid2unicode[cid]
  945. except KeyError:
  946. raise PDFUnicodeNotDefined(None, cid)
  947. class PDFType1Font(PDFSimpleFont):
  948. def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
  949. try:
  950. self.basefont = literal_name(spec["BaseFont"])
  951. except KeyError:
  952. if settings.STRICT:
  953. raise PDFFontError("BaseFont is missing")
  954. self.basefont = "unknown"
  955. widths: FontWidthDict
  956. try:
  957. (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont)
  958. widths = cast(
  959. Dict[Union[str, int], float], int_widths
  960. ) # implicit int->float
  961. except KeyError:
  962. descriptor = dict_value(spec.get("FontDescriptor", {}))
  963. firstchar = int_value(spec.get("FirstChar", 0))
  964. # lastchar = int_value(spec.get('LastChar', 255))
  965. width_list = list_value(spec.get("Widths", [0] * 256))
  966. widths = {i + firstchar: resolve1(w) for (i, w) in enumerate(width_list)}
  967. PDFSimpleFont.__init__(self, descriptor, widths, spec)
  968. if "Encoding" not in spec and "FontFile" in descriptor:
  969. # try to recover the missing encoding info from the font file.
  970. self.fontfile = stream_value(descriptor.get("FontFile"))
  971. length1 = int_value(self.fontfile["Length1"])
  972. data = self.fontfile.get_data()[:length1]
  973. parser = Type1FontHeaderParser(BytesIO(data))
  974. self.cid2unicode = parser.get_encoding()
  975. def __repr__(self) -> str:
  976. return "<PDFType1Font: basefont=%r>" % self.basefont
  977. class PDFTrueTypeFont(PDFType1Font):
  978. def __repr__(self) -> str:
  979. return "<PDFTrueTypeFont: basefont=%r>" % self.basefont
  980. class PDFType3Font(PDFSimpleFont):
  981. def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
  982. firstchar = int_value(spec.get("FirstChar", 0))
  983. # lastchar = int_value(spec.get('LastChar', 0))
  984. width_list = list_value(spec.get("Widths", [0] * 256))
  985. widths: Dict[Union[str, int], float] = {
  986. i + firstchar: w for (i, w) in enumerate(width_list)
  987. }
  988. if "FontDescriptor" in spec:
  989. descriptor = dict_value(spec["FontDescriptor"])
  990. else:
  991. descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
  992. PDFSimpleFont.__init__(self, descriptor, widths, spec)
  993. self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
  994. (_, self.descent, _, self.ascent) = self.bbox
  995. (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
  996. def __repr__(self) -> str:
  997. return "<PDFType3Font>"
  998. class PDFCIDFont(PDFFont):
  999. default_disp: Union[float, Tuple[Optional[float], float]]
  1000. def __init__(
  1001. self,
  1002. rsrcmgr: "PDFResourceManager",
  1003. spec: Mapping[str, Any],
  1004. strict: bool = settings.STRICT,
  1005. ) -> None:
  1006. try:
  1007. self.basefont = literal_name(spec["BaseFont"])
  1008. except KeyError:
  1009. if strict:
  1010. raise PDFFontError("BaseFont is missing")
  1011. self.basefont = "unknown"
  1012. self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
  1013. cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
  1014. "latin1",
  1015. )
  1016. cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
  1017. "latin1",
  1018. )
  1019. self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
  1020. self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)
  1021. try:
  1022. descriptor = dict_value(spec["FontDescriptor"])
  1023. except KeyError:
  1024. if strict:
  1025. raise PDFFontError("FontDescriptor is missing")
  1026. descriptor = {}
  1027. ttf = None
  1028. if "FontFile2" in descriptor:
  1029. self.fontfile = stream_value(descriptor.get("FontFile2"))
  1030. ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
  1031. self.unicode_map: Optional[UnicodeMap] = None
  1032. if "ToUnicode" in spec:
  1033. if isinstance(spec["ToUnicode"], PDFStream):
  1034. strm = stream_value(spec["ToUnicode"])
  1035. self.unicode_map = FileUnicodeMap()
  1036. CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
  1037. else:
  1038. cmap_name = literal_name(spec["ToUnicode"])
  1039. encoding = literal_name(spec["Encoding"])
  1040. if (
  1041. "Identity" in cid_ordering
  1042. or "Identity" in cmap_name
  1043. or "Identity" in encoding
  1044. ):
  1045. self.unicode_map = IdentityUnicodeMap()
  1046. elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
  1047. if ttf:
  1048. try:
  1049. self.unicode_map = ttf.create_unicode_map()
  1050. except TrueTypeFont.CMapNotFound:
  1051. pass
  1052. else:
  1053. try:
  1054. self.unicode_map = CMapDB.get_unicode_map(
  1055. self.cidcoding,
  1056. self.cmap.is_vertical(),
  1057. )
  1058. except CMapDB.CMapNotFound:
  1059. pass
  1060. self.vertical = self.cmap.is_vertical()
  1061. if self.vertical:
  1062. # writing mode: vertical
  1063. widths2 = get_widths2(list_value(spec.get("W2", [])))
  1064. self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
  1065. (vy, w) = resolve1(spec.get("DW2", [880, -1000]))
  1066. self.default_disp = (None, vy)
  1067. widths: Dict[Union[str, int], float] = {
  1068. cid: w for (cid, (w, _)) in widths2.items()
  1069. }
  1070. default_width = w
  1071. else:
  1072. # writing mode: horizontal
  1073. self.disps = {}
  1074. self.default_disp = 0
  1075. widths = get_widths(list_value(spec.get("W", [])))
  1076. default_width = spec.get("DW", 1000)
  1077. PDFFont.__init__(self, descriptor, widths, default_width=default_width)
  1078. def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
  1079. """Get cmap from font specification
  1080. For certain PDFs, Encoding Type isn't mentioned as an attribute of
  1081. Encoding but as an attribute of CMapName, where CMapName is an
  1082. attribute of spec['Encoding'].
  1083. The horizontal/vertical modes are mentioned with different name
  1084. such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.
  1085. """
  1086. cmap_name = self._get_cmap_name(spec, strict)
  1087. try:
  1088. return CMapDB.get_cmap(cmap_name)
  1089. except CMapDB.CMapNotFound as e:
  1090. if strict:
  1091. raise PDFFontError(e)
  1092. return CMap()
  1093. @staticmethod
  1094. def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
  1095. """Get cmap name from font specification"""
  1096. cmap_name = "unknown" # default value
  1097. try:
  1098. spec_encoding = spec["Encoding"]
  1099. if hasattr(spec_encoding, "name"):
  1100. cmap_name = literal_name(spec["Encoding"])
  1101. else:
  1102. cmap_name = literal_name(spec_encoding["CMapName"])
  1103. except KeyError:
  1104. if strict:
  1105. raise PDFFontError("Encoding is unspecified")
  1106. if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap]
  1107. cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
  1108. if "CMapName" in cmap_name_stream:
  1109. cmap_name = cmap_name_stream.get("CMapName").name
  1110. elif strict:
  1111. raise PDFFontError("CMapName unspecified for encoding")
  1112. return IDENTITY_ENCODER.get(cmap_name, cmap_name)
  1113. def __repr__(self) -> str:
  1114. return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"
  1115. def is_vertical(self) -> bool:
  1116. return self.vertical
  1117. def is_multibyte(self) -> bool:
  1118. return True
  1119. def decode(self, bytes: bytes) -> Iterable[int]:
  1120. return self.cmap.decode(bytes)
  1121. def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
  1122. """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
  1123. return self.disps.get(cid, self.default_disp)
  1124. def to_unichr(self, cid: int) -> str:
  1125. try:
  1126. if not self.unicode_map:
  1127. raise PDFKeyError(cid)
  1128. return self.unicode_map.get_unichr(cid)
  1129. except KeyError:
  1130. raise PDFUnicodeNotDefined(self.cidcoding, cid)