cmapdb.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471
  1. """Adobe character mapping (CMap) support.
  2. CMaps provide the mapping between character codes and Unicode
  3. code-points to character ids (CIDs).
  4. More information is available on:
  5. https://github.com/adobe-type-tools/cmap-resources
  6. """
  7. import gzip
  8. import logging
  9. import os
  10. import os.path
  11. import pickle as pickle
  12. import struct
  13. import sys
  14. from typing import (
  15. Any,
  16. BinaryIO,
  17. Dict,
  18. Iterable,
  19. Iterator,
  20. List,
  21. MutableMapping,
  22. Optional,
  23. Set,
  24. TextIO,
  25. Tuple,
  26. Union,
  27. cast,
  28. )
  29. from pdfminer.encodingdb import name2unicode
  30. from pdfminer.pdfexceptions import PDFException, PDFTypeError
  31. from pdfminer.psexceptions import PSEOF, PSSyntaxError
  32. from pdfminer.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name
  33. from pdfminer.utils import choplist, nunpack
# Module-level logger, named after this module; used by the CMap classes below.
log = logging.getLogger(__name__)
  35. class CMapError(PDFException):
  36. pass
  37. class CMapBase:
  38. debug = 0
  39. def __init__(self, **kwargs: object) -> None:
  40. self.attrs: MutableMapping[str, object] = kwargs.copy()
  41. def is_vertical(self) -> bool:
  42. return self.attrs.get("WMode", 0) != 0
  43. def set_attr(self, k: str, v: object) -> None:
  44. self.attrs[k] = v
  45. def add_code2cid(self, code: str, cid: int) -> None:
  46. pass
  47. def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
  48. pass
  49. def use_cmap(self, cmap: "CMapBase") -> None:
  50. pass
  51. def decode(self, code: bytes) -> Iterable[int]:
  52. raise NotImplementedError
  53. class CMap(CMapBase):
  54. def __init__(self, **kwargs: Union[str, int]) -> None:
  55. CMapBase.__init__(self, **kwargs)
  56. self.code2cid: Dict[int, object] = {}
  57. def __repr__(self) -> str:
  58. return "<CMap: %s>" % self.attrs.get("CMapName")
  59. def use_cmap(self, cmap: CMapBase) -> None:
  60. assert isinstance(cmap, CMap), str(type(cmap))
  61. def copy(dst: Dict[int, object], src: Dict[int, object]) -> None:
  62. for k, v in src.items():
  63. if isinstance(v, dict):
  64. d: Dict[int, object] = {}
  65. dst[k] = d
  66. copy(d, v)
  67. else:
  68. dst[k] = v
  69. copy(self.code2cid, cmap.code2cid)
  70. def decode(self, code: bytes) -> Iterator[int]:
  71. log.debug("decode: %r, %r", self, code)
  72. d = self.code2cid
  73. for i in iter(code):
  74. if i in d:
  75. x = d[i]
  76. if isinstance(x, int):
  77. yield x
  78. d = self.code2cid
  79. else:
  80. d = cast(Dict[int, object], x)
  81. else:
  82. d = self.code2cid
  83. def dump(
  84. self,
  85. out: TextIO = sys.stdout,
  86. code2cid: Optional[Dict[int, object]] = None,
  87. code: Tuple[int, ...] = (),
  88. ) -> None:
  89. if code2cid is None:
  90. code2cid = self.code2cid
  91. code = ()
  92. for k, v in sorted(code2cid.items()):
  93. c = code + (k,)
  94. if isinstance(v, int):
  95. out.write("code %r = cid %d\n" % (c, v))
  96. else:
  97. self.dump(out=out, code2cid=cast(Dict[int, object], v), code=c)
  98. class IdentityCMap(CMapBase):
  99. def decode(self, code: bytes) -> Tuple[int, ...]:
  100. n = len(code) // 2
  101. if n:
  102. return struct.unpack(">%dH" % n, code)
  103. else:
  104. return ()
  105. class IdentityCMapByte(IdentityCMap):
  106. def decode(self, code: bytes) -> Tuple[int, ...]:
  107. n = len(code)
  108. if n:
  109. return struct.unpack(">%dB" % n, code)
  110. else:
  111. return ()
  112. class UnicodeMap(CMapBase):
  113. def __init__(self, **kwargs: Union[str, int]) -> None:
  114. CMapBase.__init__(self, **kwargs)
  115. self.cid2unichr: Dict[int, str] = {}
  116. def __repr__(self) -> str:
  117. return "<UnicodeMap: %s>" % self.attrs.get("CMapName")
  118. def get_unichr(self, cid: int) -> str:
  119. log.debug("get_unichr: %r, %r", self, cid)
  120. return self.cid2unichr[cid]
  121. def dump(self, out: TextIO = sys.stdout) -> None:
  122. for k, v in sorted(self.cid2unichr.items()):
  123. out.write("cid %d = unicode %r\n" % (k, v))
  124. class IdentityUnicodeMap(UnicodeMap):
  125. def get_unichr(self, cid: int) -> str:
  126. """Interpret character id as unicode codepoint"""
  127. log.debug("get_unichr: %r, %r", self, cid)
  128. return chr(cid)
  129. class FileCMap(CMap):
  130. def add_code2cid(self, code: str, cid: int) -> None:
  131. assert isinstance(code, str) and isinstance(cid, int), str(
  132. (type(code), type(cid)),
  133. )
  134. d = self.code2cid
  135. for c in code[:-1]:
  136. ci = ord(c)
  137. if ci in d:
  138. d = cast(Dict[int, object], d[ci])
  139. else:
  140. t: Dict[int, object] = {}
  141. d[ci] = t
  142. d = t
  143. ci = ord(code[-1])
  144. d[ci] = cid
  145. class FileUnicodeMap(UnicodeMap):
  146. def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
  147. assert isinstance(cid, int), str(type(cid))
  148. if isinstance(code, PSLiteral):
  149. # Interpret as an Adobe glyph name.
  150. assert isinstance(code.name, str)
  151. unichr = name2unicode(code.name)
  152. elif isinstance(code, bytes):
  153. # Interpret as UTF-16BE.
  154. unichr = code.decode("UTF-16BE", "ignore")
  155. elif isinstance(code, int):
  156. unichr = chr(code)
  157. else:
  158. raise PDFTypeError(code)
  159. # A0 = non-breaking space, some weird fonts can have a collision on a cid here.
  160. if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ":
  161. return
  162. self.cid2unichr[cid] = unichr
  163. class PyCMap(CMap):
  164. def __init__(self, name: str, module: Any) -> None:
  165. super().__init__(CMapName=name)
  166. self.code2cid = module.CODE2CID
  167. if module.IS_VERTICAL:
  168. self.attrs["WMode"] = 1
  169. class PyUnicodeMap(UnicodeMap):
  170. def __init__(self, name: str, module: Any, vertical: bool) -> None:
  171. super().__init__(CMapName=name)
  172. if vertical:
  173. self.cid2unichr = module.CID2UNICHR_V
  174. self.attrs["WMode"] = 1
  175. else:
  176. self.cid2unichr = module.CID2UNICHR_H
  177. class CMapDB:
  178. _cmap_cache: Dict[str, PyCMap] = {}
  179. _umap_cache: Dict[str, List[PyUnicodeMap]] = {}
  180. class CMapNotFound(CMapError):
  181. pass
  182. @classmethod
  183. def _load_data(cls, name: str) -> Any:
  184. name = name.replace("\0", "")
  185. filename = "%s.pickle.gz" % name
  186. log.debug("loading: %r", name)
  187. cmap_paths = (
  188. os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
  189. os.path.join(os.path.dirname(__file__), "cmap"),
  190. )
  191. for directory in cmap_paths:
  192. path = os.path.join(directory, filename)
  193. if os.path.exists(path):
  194. gzfile = gzip.open(path)
  195. try:
  196. return type(str(name), (), pickle.loads(gzfile.read()))
  197. finally:
  198. gzfile.close()
  199. raise CMapDB.CMapNotFound(name)
  200. @classmethod
  201. def get_cmap(cls, name: str) -> CMapBase:
  202. if name == "Identity-H":
  203. return IdentityCMap(WMode=0)
  204. elif name == "Identity-V":
  205. return IdentityCMap(WMode=1)
  206. elif name == "OneByteIdentityH":
  207. return IdentityCMapByte(WMode=0)
  208. elif name == "OneByteIdentityV":
  209. return IdentityCMapByte(WMode=1)
  210. try:
  211. return cls._cmap_cache[name]
  212. except KeyError:
  213. pass
  214. data = cls._load_data(name)
  215. cls._cmap_cache[name] = cmap = PyCMap(name, data)
  216. return cmap
  217. @classmethod
  218. def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
  219. try:
  220. return cls._umap_cache[name][vertical]
  221. except KeyError:
  222. pass
  223. data = cls._load_data("to-unicode-%s" % name)
  224. cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
  225. return cls._umap_cache[name][vertical]
  226. class CMapParser(PSStackParser[PSKeyword]):
  227. def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
  228. PSStackParser.__init__(self, fp)
  229. self.cmap = cmap
  230. # some ToUnicode maps don't have "begincmap" keyword.
  231. self._in_cmap = True
  232. self._warnings: Set[str] = set()
  233. def run(self) -> None:
  234. try:
  235. self.nextobject()
  236. except PSEOF:
  237. pass
  238. KEYWORD_BEGINCMAP = KWD(b"begincmap")
  239. KEYWORD_ENDCMAP = KWD(b"endcmap")
  240. KEYWORD_USECMAP = KWD(b"usecmap")
  241. KEYWORD_DEF = KWD(b"def")
  242. KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
  243. KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
  244. KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
  245. KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
  246. KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
  247. KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
  248. KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
  249. KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
  250. KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
  251. KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
  252. KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
  253. KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")
  254. def do_keyword(self, pos: int, token: PSKeyword) -> None:
  255. """ToUnicode CMaps
  256. See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.
  257. """
  258. if token is self.KEYWORD_BEGINCMAP:
  259. self._in_cmap = True
  260. self.popall()
  261. return
  262. elif token is self.KEYWORD_ENDCMAP:
  263. self._in_cmap = False
  264. return
  265. if not self._in_cmap:
  266. return
  267. if token is self.KEYWORD_DEF:
  268. try:
  269. ((_, k), (_, v)) = self.pop(2)
  270. self.cmap.set_attr(literal_name(k), v)
  271. except PSSyntaxError:
  272. pass
  273. return
  274. if token is self.KEYWORD_USECMAP:
  275. try:
  276. ((_, cmapname),) = self.pop(1)
  277. self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
  278. except PSSyntaxError:
  279. pass
  280. except CMapDB.CMapNotFound:
  281. pass
  282. return
  283. if token is self.KEYWORD_BEGINCODESPACERANGE:
  284. self.popall()
  285. return
  286. if token is self.KEYWORD_ENDCODESPACERANGE:
  287. self.popall()
  288. return
  289. if token is self.KEYWORD_BEGINCIDRANGE:
  290. self.popall()
  291. return
  292. if token is self.KEYWORD_ENDCIDRANGE:
  293. objs = [obj for (__, obj) in self.popall()]
  294. for start_byte, end_byte, cid in choplist(3, objs):
  295. if not isinstance(start_byte, bytes):
  296. self._warn_once("The start object of begincidrange is not a byte.")
  297. continue
  298. if not isinstance(end_byte, bytes):
  299. self._warn_once("The end object of begincidrange is not a byte.")
  300. continue
  301. if not isinstance(cid, int):
  302. self._warn_once("The cid object of begincidrange is not a byte.")
  303. continue
  304. if len(start_byte) != len(end_byte):
  305. self._warn_once(
  306. "The start and end byte of begincidrange have "
  307. "different lengths.",
  308. )
  309. continue
  310. start_prefix = start_byte[:-4]
  311. end_prefix = end_byte[:-4]
  312. if start_prefix != end_prefix:
  313. self._warn_once(
  314. "The prefix of the start and end byte of "
  315. "begincidrange are not the same.",
  316. )
  317. continue
  318. svar = start_byte[-4:]
  319. evar = end_byte[-4:]
  320. start = nunpack(svar)
  321. end = nunpack(evar)
  322. vlen = len(svar)
  323. for i in range(end - start + 1):
  324. x = start_prefix + struct.pack(">L", start + i)[-vlen:]
  325. self.cmap.add_cid2unichr(cid + i, x)
  326. return
  327. if token is self.KEYWORD_BEGINCIDCHAR:
  328. self.popall()
  329. return
  330. if token is self.KEYWORD_ENDCIDCHAR:
  331. objs = [obj for (__, obj) in self.popall()]
  332. for cid, code in choplist(2, objs):
  333. if isinstance(code, bytes) and isinstance(cid, int):
  334. self.cmap.add_cid2unichr(cid, code)
  335. return
  336. if token is self.KEYWORD_BEGINBFRANGE:
  337. self.popall()
  338. return
  339. if token is self.KEYWORD_ENDBFRANGE:
  340. objs = [obj for (__, obj) in self.popall()]
  341. for start_byte, end_byte, code in choplist(3, objs):
  342. if not isinstance(start_byte, bytes):
  343. self._warn_once("The start object is not a byte.")
  344. continue
  345. if not isinstance(end_byte, bytes):
  346. self._warn_once("The end object is not a byte.")
  347. continue
  348. if len(start_byte) != len(end_byte):
  349. self._warn_once("The start and end byte have different lengths.")
  350. continue
  351. start = nunpack(start_byte)
  352. end = nunpack(end_byte)
  353. if isinstance(code, list):
  354. if len(code) != end - start + 1:
  355. self._warn_once(
  356. "The difference between the start and end "
  357. "offsets does not match the code length.",
  358. )
  359. for cid, unicode_value in zip(range(start, end + 1), code):
  360. self.cmap.add_cid2unichr(cid, unicode_value)
  361. else:
  362. assert isinstance(code, bytes)
  363. var = code[-4:]
  364. base = nunpack(var)
  365. prefix = code[:-4]
  366. vlen = len(var)
  367. for i in range(end - start + 1):
  368. x = prefix + struct.pack(">L", base + i)[-vlen:]
  369. self.cmap.add_cid2unichr(start + i, x)
  370. return
  371. if token is self.KEYWORD_BEGINBFCHAR:
  372. self.popall()
  373. return
  374. if token is self.KEYWORD_ENDBFCHAR:
  375. objs = [obj for (__, obj) in self.popall()]
  376. for cid, code in choplist(2, objs):
  377. if isinstance(cid, bytes) and isinstance(code, bytes):
  378. self.cmap.add_cid2unichr(nunpack(cid), code)
  379. return
  380. if token is self.KEYWORD_BEGINNOTDEFRANGE:
  381. self.popall()
  382. return
  383. if token is self.KEYWORD_ENDNOTDEFRANGE:
  384. self.popall()
  385. return
  386. self.push((pos, token))
  387. def _warn_once(self, msg: str) -> None:
  388. """Warn once for each unique message"""
  389. if msg not in self._warnings:
  390. self._warnings.add(msg)
  391. base_msg = (
  392. "Ignoring (part of) ToUnicode map because the PDF data "
  393. "does not conform to the format. This could result in "
  394. "(cid) values in the output. "
  395. )
  396. log.warning(base_msg + msg)