pdfdevice.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. import logging
  2. from typing import (
  3. TYPE_CHECKING,
  4. BinaryIO,
  5. Iterable,
  6. List,
  7. Optional,
  8. Sequence,
  9. Union,
  10. cast,
  11. )
  12. from pdfminer import utils
  13. from pdfminer.pdfcolor import PDFColorSpace
  14. from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined
  15. from pdfminer.pdfpage import PDFPage
  16. from pdfminer.pdftypes import PDFStream
  17. from pdfminer.psparser import PSLiteral
  18. from pdfminer.utils import Matrix, PathSegment, Point, Rect
  19. if TYPE_CHECKING:
  20. from pdfminer.pdfinterp import (
  21. PDFGraphicState,
  22. PDFResourceManager,
  23. PDFStackT,
  24. PDFTextState,
  25. )
# Operand sequence handed to the render_string() methods: numeric items shift
# the text position and bytes items carry character codes that are decoded
# with the current font.
PDFTextSeq = Iterable[Union[int, float, bytes]]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
  28. class PDFDevice:
  29. """Translate the output of PDFPageInterpreter to the output that is needed"""
  30. def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
  31. self.rsrcmgr = rsrcmgr
  32. self.ctm: Optional[Matrix] = None
  33. def __repr__(self) -> str:
  34. return "<PDFDevice>"
  35. def __enter__(self) -> "PDFDevice":
  36. return self
  37. def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
  38. self.close()
  39. def close(self) -> None:
  40. pass
  41. def set_ctm(self, ctm: Matrix) -> None:
  42. self.ctm = ctm
  43. def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
  44. pass
  45. def end_tag(self) -> None:
  46. pass
  47. def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
  48. pass
  49. def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
  50. pass
  51. def end_page(self, page: PDFPage) -> None:
  52. pass
  53. def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
  54. pass
  55. def end_figure(self, name: str) -> None:
  56. pass
  57. def paint_path(
  58. self,
  59. graphicstate: "PDFGraphicState",
  60. stroke: bool,
  61. fill: bool,
  62. evenodd: bool,
  63. path: Sequence[PathSegment],
  64. ) -> None:
  65. pass
  66. def render_image(self, name: str, stream: PDFStream) -> None:
  67. pass
  68. def render_string(
  69. self,
  70. textstate: "PDFTextState",
  71. seq: PDFTextSeq,
  72. ncs: PDFColorSpace,
  73. graphicstate: "PDFGraphicState",
  74. ) -> None:
  75. pass
  76. class PDFTextDevice(PDFDevice):
  77. def render_string(
  78. self,
  79. textstate: "PDFTextState",
  80. seq: PDFTextSeq,
  81. ncs: PDFColorSpace,
  82. graphicstate: "PDFGraphicState",
  83. ) -> None:
  84. assert self.ctm is not None
  85. matrix = utils.mult_matrix(textstate.matrix, self.ctm)
  86. font = textstate.font
  87. fontsize = textstate.fontsize
  88. scaling = textstate.scaling * 0.01
  89. charspace = textstate.charspace * scaling
  90. wordspace = textstate.wordspace * scaling
  91. rise = textstate.rise
  92. assert font is not None
  93. if font.is_multibyte():
  94. wordspace = 0
  95. dxscale = 0.001 * fontsize * scaling
  96. if font.is_vertical():
  97. textstate.linematrix = self.render_string_vertical(
  98. seq,
  99. matrix,
  100. textstate.linematrix,
  101. font,
  102. fontsize,
  103. scaling,
  104. charspace,
  105. wordspace,
  106. rise,
  107. dxscale,
  108. ncs,
  109. graphicstate,
  110. )
  111. else:
  112. textstate.linematrix = self.render_string_horizontal(
  113. seq,
  114. matrix,
  115. textstate.linematrix,
  116. font,
  117. fontsize,
  118. scaling,
  119. charspace,
  120. wordspace,
  121. rise,
  122. dxscale,
  123. ncs,
  124. graphicstate,
  125. )
  126. def render_string_horizontal(
  127. self,
  128. seq: PDFTextSeq,
  129. matrix: Matrix,
  130. pos: Point,
  131. font: PDFFont,
  132. fontsize: float,
  133. scaling: float,
  134. charspace: float,
  135. wordspace: float,
  136. rise: float,
  137. dxscale: float,
  138. ncs: PDFColorSpace,
  139. graphicstate: "PDFGraphicState",
  140. ) -> Point:
  141. (x, y) = pos
  142. needcharspace = False
  143. for obj in seq:
  144. if isinstance(obj, (int, float)):
  145. x -= obj * dxscale
  146. needcharspace = True
  147. elif isinstance(obj, bytes):
  148. for cid in font.decode(obj):
  149. if needcharspace:
  150. x += charspace
  151. x += self.render_char(
  152. utils.translate_matrix(matrix, (x, y)),
  153. font,
  154. fontsize,
  155. scaling,
  156. rise,
  157. cid,
  158. ncs,
  159. graphicstate,
  160. )
  161. if cid == 32 and wordspace:
  162. x += wordspace
  163. needcharspace = True
  164. else:
  165. logger.warning(
  166. f"Cannot render horizontal string because {obj!r} is not a valid int, float or bytes."
  167. )
  168. return (x, y)
  169. def render_string_vertical(
  170. self,
  171. seq: PDFTextSeq,
  172. matrix: Matrix,
  173. pos: Point,
  174. font: PDFFont,
  175. fontsize: float,
  176. scaling: float,
  177. charspace: float,
  178. wordspace: float,
  179. rise: float,
  180. dxscale: float,
  181. ncs: PDFColorSpace,
  182. graphicstate: "PDFGraphicState",
  183. ) -> Point:
  184. (x, y) = pos
  185. needcharspace = False
  186. for obj in seq:
  187. if isinstance(obj, (int, float)):
  188. y -= obj * dxscale
  189. needcharspace = True
  190. elif isinstance(obj, bytes):
  191. for cid in font.decode(obj):
  192. if needcharspace:
  193. y += charspace
  194. y += self.render_char(
  195. utils.translate_matrix(matrix, (x, y)),
  196. font,
  197. fontsize,
  198. scaling,
  199. rise,
  200. cid,
  201. ncs,
  202. graphicstate,
  203. )
  204. if cid == 32 and wordspace:
  205. y += wordspace
  206. needcharspace = True
  207. else:
  208. logger.warning(
  209. f"Cannot render vertical string because {obj!r} is not a valid int, float or bytes."
  210. )
  211. return (x, y)
  212. def render_char(
  213. self,
  214. matrix: Matrix,
  215. font: PDFFont,
  216. fontsize: float,
  217. scaling: float,
  218. rise: float,
  219. cid: int,
  220. ncs: PDFColorSpace,
  221. graphicstate: "PDFGraphicState",
  222. ) -> float:
  223. return 0
  224. class TagExtractor(PDFDevice):
  225. def __init__(
  226. self,
  227. rsrcmgr: "PDFResourceManager",
  228. outfp: BinaryIO,
  229. codec: str = "utf-8",
  230. ) -> None:
  231. PDFDevice.__init__(self, rsrcmgr)
  232. self.outfp = outfp
  233. self.codec = codec
  234. self.pageno = 0
  235. self._stack: List[PSLiteral] = []
  236. def render_string(
  237. self,
  238. textstate: "PDFTextState",
  239. seq: PDFTextSeq,
  240. ncs: PDFColorSpace,
  241. graphicstate: "PDFGraphicState",
  242. ) -> None:
  243. font = textstate.font
  244. assert font is not None
  245. text = ""
  246. for obj in seq:
  247. if isinstance(obj, str):
  248. obj = utils.make_compat_bytes(obj)
  249. if not isinstance(obj, bytes):
  250. continue
  251. chars = font.decode(obj)
  252. for cid in chars:
  253. try:
  254. char = font.to_unichr(cid)
  255. text += char
  256. except PDFUnicodeNotDefined:
  257. pass
  258. self._write(utils.enc(text))
  259. def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
  260. output = '<page id="%s" bbox="%s" rotate="%d">' % (
  261. self.pageno,
  262. utils.bbox2str(page.mediabox),
  263. page.rotate,
  264. )
  265. self._write(output)
  266. def end_page(self, page: PDFPage) -> None:
  267. self._write("</page>\n")
  268. self.pageno += 1
  269. def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
  270. s = ""
  271. if isinstance(props, dict):
  272. s = "".join(
  273. [
  274. f' {utils.enc(k)}="{utils.make_compat_str(v)}"'
  275. for (k, v) in sorted(props.items())
  276. ],
  277. )
  278. out_s = f"<{utils.enc(cast(str, tag.name))}{s}>"
  279. self._write(out_s)
  280. self._stack.append(tag)
  281. def end_tag(self) -> None:
  282. assert self._stack, str(self.pageno)
  283. tag = self._stack.pop(-1)
  284. out_s = "</%s>" % utils.enc(cast(str, tag.name))
  285. self._write(out_s)
  286. def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
  287. self.begin_tag(tag, props)
  288. self._stack.pop(-1)
  289. def _write(self, s: str) -> None:
  290. self.outfp.write(s.encode(self.codec))