# converter.py
  1. import io
  2. import logging
  3. import re
  4. from typing import (
  5. BinaryIO,
  6. Dict,
  7. Generic,
  8. List,
  9. Optional,
  10. Sequence,
  11. TextIO,
  12. Tuple,
  13. TypeVar,
  14. Union,
  15. cast,
  16. )
  17. from pdfminer import utils
  18. from pdfminer.image import ImageWriter
  19. from pdfminer.layout import (
  20. LAParams,
  21. LTAnno,
  22. LTChar,
  23. LTComponent,
  24. LTContainer,
  25. LTCurve,
  26. LTFigure,
  27. LTImage,
  28. LTItem,
  29. LTLayoutContainer,
  30. LTLine,
  31. LTPage,
  32. LTRect,
  33. LTText,
  34. LTTextBox,
  35. LTTextBoxVertical,
  36. LTTextGroup,
  37. LTTextLine,
  38. TextGroupElement,
  39. )
  40. from pdfminer.pdfcolor import PDFColorSpace
  41. from pdfminer.pdfdevice import PDFTextDevice
  42. from pdfminer.pdfexceptions import PDFValueError
  43. from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined
  44. from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
  45. from pdfminer.pdfpage import PDFPage
  46. from pdfminer.pdftypes import PDFStream
  47. from pdfminer.utils import (
  48. AnyIO,
  49. Matrix,
  50. PathSegment,
  51. Point,
  52. Rect,
  53. apply_matrix_pt,
  54. apply_matrix_rect,
  55. bbox2str,
  56. enc,
  57. make_compat_str,
  58. mult_matrix,
  59. )
  60. log = logging.getLogger(__name__)
  61. class PDFLayoutAnalyzer(PDFTextDevice):
  62. cur_item: LTLayoutContainer
  63. ctm: Matrix
  64. def __init__(
  65. self,
  66. rsrcmgr: PDFResourceManager,
  67. pageno: int = 1,
  68. laparams: Optional[LAParams] = None,
  69. ) -> None:
  70. PDFTextDevice.__init__(self, rsrcmgr)
  71. self.pageno = pageno
  72. self.laparams = laparams
  73. self._stack: List[LTLayoutContainer] = []
  74. def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
  75. (x0, y0, x1, y1) = apply_matrix_rect(ctm, page.mediabox)
  76. mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
  77. self.cur_item = LTPage(self.pageno, mediabox)
  78. def end_page(self, page: PDFPage) -> None:
  79. assert not self._stack, str(len(self._stack))
  80. assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
  81. if self.laparams is not None:
  82. self.cur_item.analyze(self.laparams)
  83. self.pageno += 1
  84. self.receive_layout(self.cur_item)
  85. def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
  86. self._stack.append(self.cur_item)
  87. self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
  88. def end_figure(self, _: str) -> None:
  89. fig = self.cur_item
  90. assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
  91. self.cur_item = self._stack.pop()
  92. self.cur_item.add(fig)
  93. def render_image(self, name: str, stream: PDFStream) -> None:
  94. assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
  95. item = LTImage(
  96. name,
  97. stream,
  98. (self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1),
  99. )
  100. self.cur_item.add(item)
  101. def paint_path(
  102. self,
  103. gstate: PDFGraphicState,
  104. stroke: bool,
  105. fill: bool,
  106. evenodd: bool,
  107. path: Sequence[PathSegment],
  108. ) -> None:
  109. """Paint paths described in section 4.4 of the PDF reference manual"""
  110. shape = "".join(x[0] for x in path)
  111. if shape[:1] != "m":
  112. # Per PDF Reference Section 4.4.1, "path construction operators may
  113. # be invoked in any sequence, but the first one invoked must be m
  114. # or re to begin a new subpath." Since pdfminer.six already
  115. # converts all `re` (rectangle) operators to their equivelent
  116. # `mlllh` representation, paths ingested by `.paint_path(...)` that
  117. # do not begin with the `m` operator are invalid.
  118. pass
  119. elif shape.count("m") > 1:
  120. # recurse if there are multiple m's in this shape
  121. for m in re.finditer(r"m[^m]+", shape):
  122. subpath = path[m.start(0) : m.end(0)]
  123. self.paint_path(gstate, stroke, fill, evenodd, subpath)
  124. else:
  125. # Although the 'h' command does not not literally provide a
  126. # point-position, its position is (by definition) equal to the
  127. # subpath's starting point.
  128. #
  129. # And, per Section 4.4's Table 4.9, all other path commands place
  130. # their point-position in their final two arguments. (Any preceding
  131. # arguments represent control points on Bézier curves.)
  132. raw_pts = [
  133. cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path
  134. ]
  135. pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
  136. operators = [str(operation[0]) for operation in path]
  137. transformed_points = [
  138. [
  139. apply_matrix_pt(self.ctm, (float(operand1), float(operand2)))
  140. for operand1, operand2 in zip(operation[1::2], operation[2::2])
  141. ]
  142. for operation in path
  143. ]
  144. transformed_path = [
  145. cast(PathSegment, (o, *p))
  146. for o, p in zip(operators, transformed_points)
  147. ]
  148. # Drop a redundant "l" on a path closed with "h"
  149. if len(shape) > 3 and shape[-2:] == "lh" and pts[-2] == pts[0]:
  150. shape = shape[:-2] + "h"
  151. pts.pop()
  152. if shape in {"mlh", "ml"}:
  153. # single line segment
  154. #
  155. # Note: 'ml', in conditional above, is a frequent anomaly
  156. # that we want to support.
  157. line = LTLine(
  158. gstate.linewidth,
  159. pts[0],
  160. pts[1],
  161. stroke,
  162. fill,
  163. evenodd,
  164. gstate.scolor,
  165. gstate.ncolor,
  166. original_path=transformed_path,
  167. dashing_style=gstate.dash,
  168. )
  169. self.cur_item.add(line)
  170. elif shape in {"mlllh", "mllll"}:
  171. (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
  172. is_closed_loop = pts[0] == pts[4]
  173. has_square_coordinates = (
  174. x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
  175. ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
  176. if is_closed_loop and has_square_coordinates:
  177. rect = LTRect(
  178. gstate.linewidth,
  179. (*pts[0], *pts[2]),
  180. stroke,
  181. fill,
  182. evenodd,
  183. gstate.scolor,
  184. gstate.ncolor,
  185. transformed_path,
  186. gstate.dash,
  187. )
  188. self.cur_item.add(rect)
  189. else:
  190. curve = LTCurve(
  191. gstate.linewidth,
  192. pts,
  193. stroke,
  194. fill,
  195. evenodd,
  196. gstate.scolor,
  197. gstate.ncolor,
  198. transformed_path,
  199. gstate.dash,
  200. )
  201. self.cur_item.add(curve)
  202. else:
  203. curve = LTCurve(
  204. gstate.linewidth,
  205. pts,
  206. stroke,
  207. fill,
  208. evenodd,
  209. gstate.scolor,
  210. gstate.ncolor,
  211. transformed_path,
  212. gstate.dash,
  213. )
  214. self.cur_item.add(curve)
  215. def render_char(
  216. self,
  217. matrix: Matrix,
  218. font: PDFFont,
  219. fontsize: float,
  220. scaling: float,
  221. rise: float,
  222. cid: int,
  223. ncs: PDFColorSpace,
  224. graphicstate: PDFGraphicState,
  225. ) -> float:
  226. try:
  227. text = font.to_unichr(cid)
  228. assert isinstance(text, str), str(type(text))
  229. except PDFUnicodeNotDefined:
  230. text = self.handle_undefined_char(font, cid)
  231. textwidth = font.char_width(cid)
  232. textdisp = font.char_disp(cid)
  233. item = LTChar(
  234. matrix,
  235. font,
  236. fontsize,
  237. scaling,
  238. rise,
  239. text,
  240. textwidth,
  241. textdisp,
  242. ncs,
  243. graphicstate,
  244. )
  245. self.cur_item.add(item)
  246. return item.adv
  247. def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
  248. log.debug("undefined: %r, %r", font, cid)
  249. return "(cid:%d)" % cid
  250. def receive_layout(self, ltpage: LTPage) -> None:
  251. pass
  252. class PDFPageAggregator(PDFLayoutAnalyzer):
  253. def __init__(
  254. self,
  255. rsrcmgr: PDFResourceManager,
  256. pageno: int = 1,
  257. laparams: Optional[LAParams] = None,
  258. ) -> None:
  259. PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
  260. self.result: Optional[LTPage] = None
  261. def receive_layout(self, ltpage: LTPage) -> None:
  262. self.result = ltpage
  263. def get_result(self) -> LTPage:
  264. assert self.result is not None
  265. return self.result
  266. # Some PDFConverter children support only binary I/O
  267. IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)
  268. class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
  269. def __init__(
  270. self,
  271. rsrcmgr: PDFResourceManager,
  272. outfp: IOType,
  273. codec: str = "utf-8",
  274. pageno: int = 1,
  275. laparams: Optional[LAParams] = None,
  276. ) -> None:
  277. PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
  278. self.outfp: IOType = outfp
  279. self.codec = codec
  280. self.outfp_binary = self._is_binary_stream(self.outfp)
  281. @staticmethod
  282. def _is_binary_stream(outfp: AnyIO) -> bool:
  283. """Test if an stream is binary or not"""
  284. if "b" in getattr(outfp, "mode", ""):
  285. return True
  286. elif hasattr(outfp, "mode"):
  287. # output stream has a mode, but it does not contain 'b'
  288. return False
  289. elif isinstance(outfp, io.BytesIO):
  290. return True
  291. elif isinstance(outfp, io.StringIO) or isinstance(outfp, io.TextIOBase):
  292. return False
  293. return True
  294. class TextConverter(PDFConverter[AnyIO]):
  295. def __init__(
  296. self,
  297. rsrcmgr: PDFResourceManager,
  298. outfp: AnyIO,
  299. codec: str = "utf-8",
  300. pageno: int = 1,
  301. laparams: Optional[LAParams] = None,
  302. showpageno: bool = False,
  303. imagewriter: Optional[ImageWriter] = None,
  304. ) -> None:
  305. super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
  306. self.showpageno = showpageno
  307. self.imagewriter = imagewriter
  308. def write_text(self, text: str) -> None:
  309. text = utils.compatible_encode_method(text, self.codec, "ignore")
  310. if self.outfp_binary:
  311. cast(BinaryIO, self.outfp).write(text.encode())
  312. else:
  313. cast(TextIO, self.outfp).write(text)
  314. def receive_layout(self, ltpage: LTPage) -> None:
  315. def render(item: LTItem) -> None:
  316. if isinstance(item, LTContainer):
  317. for child in item:
  318. render(child)
  319. elif isinstance(item, LTText):
  320. self.write_text(item.get_text())
  321. if isinstance(item, LTTextBox):
  322. self.write_text("\n")
  323. elif isinstance(item, LTImage):
  324. if self.imagewriter is not None:
  325. self.imagewriter.export_image(item)
  326. if self.showpageno:
  327. self.write_text("Page %s\n" % ltpage.pageid)
  328. render(ltpage)
  329. self.write_text("\f")
  330. # Some dummy functions to save memory/CPU when all that is wanted
  331. # is text. This stops all the image and drawing output from being
  332. # recorded and taking up RAM.
  333. def render_image(self, name: str, stream: PDFStream) -> None:
  334. if self.imagewriter is not None:
  335. PDFConverter.render_image(self, name, stream)
  336. def paint_path(
  337. self,
  338. gstate: PDFGraphicState,
  339. stroke: bool,
  340. fill: bool,
  341. evenodd: bool,
  342. path: Sequence[PathSegment],
  343. ) -> None:
  344. pass
  345. class HTMLConverter(PDFConverter[AnyIO]):
  346. RECT_COLORS = {
  347. "figure": "yellow",
  348. "textline": "magenta",
  349. "textbox": "cyan",
  350. "textgroup": "red",
  351. "curve": "black",
  352. "page": "gray",
  353. }
  354. TEXT_COLORS = {
  355. "textbox": "blue",
  356. "char": "black",
  357. }
  358. def __init__(
  359. self,
  360. rsrcmgr: PDFResourceManager,
  361. outfp: AnyIO,
  362. codec: str = "utf-8",
  363. pageno: int = 1,
  364. laparams: Optional[LAParams] = None,
  365. scale: float = 1,
  366. fontscale: float = 1.0,
  367. layoutmode: str = "normal",
  368. showpageno: bool = True,
  369. pagemargin: int = 50,
  370. imagewriter: Optional[ImageWriter] = None,
  371. debug: int = 0,
  372. rect_colors: Optional[Dict[str, str]] = None,
  373. text_colors: Optional[Dict[str, str]] = None,
  374. ) -> None:
  375. PDFConverter.__init__(
  376. self,
  377. rsrcmgr,
  378. outfp,
  379. codec=codec,
  380. pageno=pageno,
  381. laparams=laparams,
  382. )
  383. # write() assumes a codec for binary I/O, or no codec for text I/O.
  384. if self.outfp_binary and not self.codec:
  385. raise PDFValueError("Codec is required for a binary I/O output")
  386. if not self.outfp_binary and self.codec:
  387. raise PDFValueError("Codec must not be specified for a text I/O output")
  388. if text_colors is None:
  389. text_colors = {"char": "black"}
  390. if rect_colors is None:
  391. rect_colors = {"curve": "black", "page": "gray"}
  392. self.scale = scale
  393. self.fontscale = fontscale
  394. self.layoutmode = layoutmode
  395. self.showpageno = showpageno
  396. self.pagemargin = pagemargin
  397. self.imagewriter = imagewriter
  398. self.rect_colors = rect_colors
  399. self.text_colors = text_colors
  400. if debug:
  401. self.rect_colors.update(self.RECT_COLORS)
  402. self.text_colors.update(self.TEXT_COLORS)
  403. self._yoffset: float = self.pagemargin
  404. self._font: Optional[Tuple[str, float]] = None
  405. self._fontstack: List[Optional[Tuple[str, float]]] = []
  406. self.write_header()
  407. def write(self, text: str) -> None:
  408. if self.codec:
  409. cast(BinaryIO, self.outfp).write(text.encode(self.codec))
  410. else:
  411. cast(TextIO, self.outfp).write(text)
  412. def write_header(self) -> None:
  413. self.write("<html><head>\n")
  414. if self.codec:
  415. s = (
  416. '<meta http-equiv="Content-Type" content="text/html; '
  417. 'charset=%s">\n' % self.codec
  418. )
  419. else:
  420. s = '<meta http-equiv="Content-Type" content="text/html">\n'
  421. self.write(s)
  422. self.write("</head><body>\n")
  423. def write_footer(self) -> None:
  424. page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
  425. s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
  426. page_links,
  427. )
  428. self.write(s)
  429. self.write("</body></html>\n")
  430. def write_text(self, text: str) -> None:
  431. self.write(enc(text))
  432. def place_rect(
  433. self,
  434. color: str,
  435. borderwidth: int,
  436. x: float,
  437. y: float,
  438. w: float,
  439. h: float,
  440. ) -> None:
  441. color2 = self.rect_colors.get(color)
  442. if color2 is not None:
  443. s = (
  444. '<span style="position:absolute; border: %s %dpx solid; '
  445. 'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
  446. % (
  447. color2,
  448. borderwidth,
  449. x * self.scale,
  450. (self._yoffset - y) * self.scale,
  451. w * self.scale,
  452. h * self.scale,
  453. )
  454. )
  455. self.write(s)
  456. def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
  457. self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)
  458. def place_image(
  459. self,
  460. item: LTImage,
  461. borderwidth: int,
  462. x: float,
  463. y: float,
  464. w: float,
  465. h: float,
  466. ) -> None:
  467. if self.imagewriter is not None:
  468. name = self.imagewriter.export_image(item)
  469. s = (
  470. '<img src="%s" border="%d" style="position:absolute; '
  471. 'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
  472. % (
  473. enc(name),
  474. borderwidth,
  475. x * self.scale,
  476. (self._yoffset - y) * self.scale,
  477. w * self.scale,
  478. h * self.scale,
  479. )
  480. )
  481. self.write(s)
  482. def place_text(
  483. self,
  484. color: str,
  485. text: str,
  486. x: float,
  487. y: float,
  488. size: float,
  489. ) -> None:
  490. color2 = self.text_colors.get(color)
  491. if color2 is not None:
  492. s = (
  493. '<span style="position:absolute; color:%s; left:%dpx; '
  494. 'top:%dpx; font-size:%dpx;">'
  495. % (
  496. color2,
  497. x * self.scale,
  498. (self._yoffset - y) * self.scale,
  499. size * self.scale * self.fontscale,
  500. )
  501. )
  502. self.write(s)
  503. self.write_text(text)
  504. self.write("</span>\n")
  505. def begin_div(
  506. self,
  507. color: str,
  508. borderwidth: int,
  509. x: float,
  510. y: float,
  511. w: float,
  512. h: float,
  513. writing_mode: str = "False",
  514. ) -> None:
  515. self._fontstack.append(self._font)
  516. self._font = None
  517. s = (
  518. '<div style="position:absolute; border: %s %dpx solid; '
  519. "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
  520. 'height:%dpx;">'
  521. % (
  522. color,
  523. borderwidth,
  524. writing_mode,
  525. x * self.scale,
  526. (self._yoffset - y) * self.scale,
  527. w * self.scale,
  528. h * self.scale,
  529. )
  530. )
  531. self.write(s)
  532. def end_div(self, color: str) -> None:
  533. if self._font is not None:
  534. self.write("</span>")
  535. self._font = self._fontstack.pop()
  536. self.write("</div>")
  537. def put_text(self, text: str, fontname: str, fontsize: float) -> None:
  538. font = (fontname, fontsize)
  539. if font != self._font:
  540. if self._font is not None:
  541. self.write("</span>")
  542. # Remove subset tag from fontname, see PDF Reference 5.5.3
  543. fontname_without_subset_tag = fontname.split("+")[-1]
  544. self.write(
  545. '<span style="font-family: %s; font-size:%dpx">'
  546. % (fontname_without_subset_tag, fontsize * self.scale * self.fontscale),
  547. )
  548. self._font = font
  549. self.write_text(text)
  550. def put_newline(self) -> None:
  551. self.write("<br>")
  552. def receive_layout(self, ltpage: LTPage) -> None:
  553. def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
  554. if isinstance(item, LTTextGroup):
  555. self.place_border("textgroup", 1, item)
  556. for child in item:
  557. show_group(child)
  558. def render(item: LTItem) -> None:
  559. child: LTItem
  560. if isinstance(item, LTPage):
  561. self._yoffset += item.y1
  562. self.place_border("page", 1, item)
  563. if self.showpageno:
  564. self.write(
  565. '<div style="position:absolute; top:%dpx;">'
  566. % ((self._yoffset - item.y1) * self.scale),
  567. )
  568. self.write(
  569. f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
  570. )
  571. for child in item:
  572. render(child)
  573. if item.groups is not None:
  574. for group in item.groups:
  575. show_group(group)
  576. elif isinstance(item, LTCurve):
  577. self.place_border("curve", 1, item)
  578. elif isinstance(item, LTFigure):
  579. self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
  580. for child in item:
  581. render(child)
  582. self.end_div("figure")
  583. elif isinstance(item, LTImage):
  584. self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
  585. elif self.layoutmode == "exact":
  586. if isinstance(item, LTTextLine):
  587. self.place_border("textline", 1, item)
  588. for child in item:
  589. render(child)
  590. elif isinstance(item, LTTextBox):
  591. self.place_border("textbox", 1, item)
  592. self.place_text(
  593. "textbox",
  594. str(item.index + 1),
  595. item.x0,
  596. item.y1,
  597. 20,
  598. )
  599. for child in item:
  600. render(child)
  601. elif isinstance(item, LTChar):
  602. self.place_border("char", 1, item)
  603. self.place_text(
  604. "char",
  605. item.get_text(),
  606. item.x0,
  607. item.y1,
  608. item.size,
  609. )
  610. elif isinstance(item, LTTextLine):
  611. for child in item:
  612. render(child)
  613. if self.layoutmode != "loose":
  614. self.put_newline()
  615. elif isinstance(item, LTTextBox):
  616. self.begin_div(
  617. "textbox",
  618. 1,
  619. item.x0,
  620. item.y1,
  621. item.width,
  622. item.height,
  623. item.get_writing_mode(),
  624. )
  625. for child in item:
  626. render(child)
  627. self.end_div("textbox")
  628. elif isinstance(item, LTChar):
  629. fontname = make_compat_str(item.fontname)
  630. self.put_text(item.get_text(), fontname, item.size)
  631. elif isinstance(item, LTText):
  632. self.write_text(item.get_text())
  633. render(ltpage)
  634. self._yoffset += self.pagemargin
  635. def close(self) -> None:
  636. self.write_footer()
  637. class XMLConverter(PDFConverter[AnyIO]):
  638. CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")
  639. def __init__(
  640. self,
  641. rsrcmgr: PDFResourceManager,
  642. outfp: AnyIO,
  643. codec: str = "utf-8",
  644. pageno: int = 1,
  645. laparams: Optional[LAParams] = None,
  646. imagewriter: Optional[ImageWriter] = None,
  647. stripcontrol: bool = False,
  648. ) -> None:
  649. PDFConverter.__init__(
  650. self,
  651. rsrcmgr,
  652. outfp,
  653. codec=codec,
  654. pageno=pageno,
  655. laparams=laparams,
  656. )
  657. # write() assumes a codec for binary I/O, or no codec for text I/O.
  658. if self.outfp_binary == (not self.codec):
  659. raise PDFValueError("Codec is required for a binary I/O output")
  660. self.imagewriter = imagewriter
  661. self.stripcontrol = stripcontrol
  662. self.write_header()
  663. def write(self, text: str) -> None:
  664. if self.codec:
  665. cast(BinaryIO, self.outfp).write(text.encode(self.codec))
  666. else:
  667. cast(TextIO, self.outfp).write(text)
  668. def write_header(self) -> None:
  669. if self.codec:
  670. self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
  671. else:
  672. self.write('<?xml version="1.0" ?>\n')
  673. self.write("<pages>\n")
  674. def write_footer(self) -> None:
  675. self.write("</pages>\n")
  676. def write_text(self, text: str) -> None:
  677. if self.stripcontrol:
  678. text = self.CONTROL.sub("", text)
  679. self.write(enc(text))
  680. def receive_layout(self, ltpage: LTPage) -> None:
  681. def show_group(item: LTItem) -> None:
  682. if isinstance(item, LTTextBox):
  683. self.write(
  684. '<textbox id="%d" bbox="%s" />\n'
  685. % (item.index, bbox2str(item.bbox)),
  686. )
  687. elif isinstance(item, LTTextGroup):
  688. self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
  689. for child in item:
  690. show_group(child)
  691. self.write("</textgroup>\n")
  692. def render(item: LTItem) -> None:
  693. child: LTItem
  694. if isinstance(item, LTPage):
  695. s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
  696. item.pageid,
  697. bbox2str(item.bbox),
  698. item.rotate,
  699. )
  700. self.write(s)
  701. for child in item:
  702. render(child)
  703. if item.groups is not None:
  704. self.write("<layout>\n")
  705. for group in item.groups:
  706. show_group(group)
  707. self.write("</layout>\n")
  708. self.write("</page>\n")
  709. elif isinstance(item, LTLine):
  710. s = '<line linewidth="%d" bbox="%s" />\n' % (
  711. item.linewidth,
  712. bbox2str(item.bbox),
  713. )
  714. self.write(s)
  715. elif isinstance(item, LTRect):
  716. s = '<rect linewidth="%d" bbox="%s" />\n' % (
  717. item.linewidth,
  718. bbox2str(item.bbox),
  719. )
  720. self.write(s)
  721. elif isinstance(item, LTCurve):
  722. s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
  723. item.linewidth,
  724. bbox2str(item.bbox),
  725. item.get_pts(),
  726. )
  727. self.write(s)
  728. elif isinstance(item, LTFigure):
  729. s = f'<figure name="{item.name}" bbox="{bbox2str(item.bbox)}">\n'
  730. self.write(s)
  731. for child in item:
  732. render(child)
  733. self.write("</figure>\n")
  734. elif isinstance(item, LTTextLine):
  735. self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
  736. for child in item:
  737. render(child)
  738. self.write("</textline>\n")
  739. elif isinstance(item, LTTextBox):
  740. wmode = ""
  741. if isinstance(item, LTTextBoxVertical):
  742. wmode = ' wmode="vertical"'
  743. s = '<textbox id="%d" bbox="%s"%s>\n' % (
  744. item.index,
  745. bbox2str(item.bbox),
  746. wmode,
  747. )
  748. self.write(s)
  749. for child in item:
  750. render(child)
  751. self.write("</textbox>\n")
  752. elif isinstance(item, LTChar):
  753. s = (
  754. '<text font="%s" bbox="%s" colourspace="%s" '
  755. 'ncolour="%s" size="%.3f">'
  756. % (
  757. enc(item.fontname),
  758. bbox2str(item.bbox),
  759. item.ncs.name,
  760. item.graphicstate.ncolor,
  761. item.size,
  762. )
  763. )
  764. self.write(s)
  765. self.write_text(item.get_text())
  766. self.write("</text>\n")
  767. elif isinstance(item, LTText):
  768. self.write("<text>%s</text>\n" % item.get_text())
  769. elif isinstance(item, LTImage):
  770. if self.imagewriter is not None:
  771. name = self.imagewriter.export_image(item)
  772. self.write(
  773. '<image src="%s" width="%d" height="%d" />\n'
  774. % (enc(name), item.width, item.height),
  775. )
  776. else:
  777. self.write(
  778. '<image width="%d" height="%d" />\n'
  779. % (item.width, item.height),
  780. )
  781. else:
  782. assert False, str(("Unhandled", item))
  783. render(ltpage)
  784. def close(self) -> None:
  785. self.write_footer()
  786. class HOCRConverter(PDFConverter[AnyIO]):
  787. """Extract an hOCR representation from explicit text information within a PDF."""
  788. # Where text is being extracted from a variety of types of PDF within a
  789. # business process, those PDFs where the text is only present in image
  790. # form will need to be analysed using an OCR tool which will typically
  791. # output hOCR. This converter extracts the explicit text information from
  792. # those PDFs that do have it and uses it to genxerate a basic hOCR
  793. # representation that is designed to be used in conjunction with the image
  794. # of the PDF in the same way as genuine OCR output would be, but without the
  795. # inevitable OCR errors.
  796. # The converter does not handle images, diagrams or text colors.
  797. # In the examples processed by the contributor it was necessary to set
  798. # LAParams.all_texts to True.
  799. CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")
  800. def __init__(
  801. self,
  802. rsrcmgr: PDFResourceManager,
  803. outfp: AnyIO,
  804. codec: str = "utf8",
  805. pageno: int = 1,
  806. laparams: Optional[LAParams] = None,
  807. stripcontrol: bool = False,
  808. ):
  809. PDFConverter.__init__(
  810. self,
  811. rsrcmgr,
  812. outfp,
  813. codec=codec,
  814. pageno=pageno,
  815. laparams=laparams,
  816. )
  817. self.stripcontrol = stripcontrol
  818. self.within_chars = False
  819. self.write_header()
  820. def bbox_repr(self, bbox: Rect) -> str:
  821. (in_x0, in_y0, in_x1, in_y1) = bbox
  822. # PDF y-coordinates are the other way round from hOCR coordinates
  823. out_x0 = int(in_x0)
  824. out_y0 = int(self.page_bbox[3] - in_y1)
  825. out_x1 = int(in_x1)
  826. out_y1 = int(self.page_bbox[3] - in_y0)
  827. return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"
  828. def write(self, text: str) -> None:
  829. if self.codec:
  830. encoded_text = text.encode(self.codec)
  831. cast(BinaryIO, self.outfp).write(encoded_text)
  832. else:
  833. cast(TextIO, self.outfp).write(text)
  834. def write_header(self) -> None:
  835. if self.codec:
  836. self.write(
  837. "<html xmlns='http://www.w3.org/1999/xhtml' "
  838. "xml:lang='en' lang='en' charset='%s'>\n" % self.codec,
  839. )
  840. else:
  841. self.write(
  842. "<html xmlns='http://www.w3.org/1999/xhtml' "
  843. "xml:lang='en' lang='en'>\n",
  844. )
  845. self.write("<head>\n")
  846. self.write("<title></title>\n")
  847. self.write(
  848. "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
  849. )
  850. self.write(
  851. "<meta name='ocr-system' content='pdfminer.six HOCR Converter' />\n",
  852. )
  853. self.write(
  854. " <meta name='ocr-capabilities'"
  855. " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
  856. )
  857. self.write("</head>\n")
  858. self.write("<body>\n")
  859. def write_footer(self) -> None:
  860. self.write("<!-- comment in the following line to debug -->\n")
  861. self.write(
  862. "<!--script src='https://unpkg.com/hocrjs'></script--></body></html>\n",
  863. )
  864. def write_text(self, text: str) -> None:
  865. if self.stripcontrol:
  866. text = self.CONTROL.sub("", text)
  867. self.write(text)
  868. def write_word(self) -> None:
  869. if len(self.working_text) > 0:
  870. bold_and_italic_styles = ""
  871. if "Italic" in self.working_font:
  872. bold_and_italic_styles = "font-style: italic; "
  873. if "Bold" in self.working_font:
  874. bold_and_italic_styles += "font-weight: bold; "
  875. self.write(
  876. "<span style='font:\"%s\"; font-size:%d; %s' "
  877. "class='ocrx_word' title='%s; x_font %s; "
  878. "x_fsize %d'>%s</span>"
  879. % (
  880. (
  881. self.working_font,
  882. self.working_size,
  883. bold_and_italic_styles,
  884. self.bbox_repr(self.working_bbox),
  885. self.working_font,
  886. self.working_size,
  887. self.working_text.strip(),
  888. )
  889. ),
  890. )
  891. self.within_chars = False
  892. def receive_layout(self, ltpage: LTPage) -> None:
  893. def render(item: LTItem) -> None:
  894. if self.within_chars and isinstance(item, LTAnno):
  895. self.write_word()
  896. if isinstance(item, LTPage):
  897. self.page_bbox = item.bbox
  898. self.write(
  899. "<div class='ocr_page' id='%s' title='%s'>\n"
  900. % (item.pageid, self.bbox_repr(item.bbox)),
  901. )
  902. for child in item:
  903. render(child)
  904. self.write("</div>\n")
  905. elif isinstance(item, LTTextLine):
  906. self.write(
  907. "<span class='ocr_line' title='%s'>" % (self.bbox_repr(item.bbox)),
  908. )
  909. for child_line in item:
  910. render(child_line)
  911. self.write("</span>\n")
  912. elif isinstance(item, LTTextBox):
  913. self.write(
  914. "<div class='ocr_block' id='%d' title='%s'>\n"
  915. % (item.index, self.bbox_repr(item.bbox)),
  916. )
  917. for child in item:
  918. render(child)
  919. self.write("</div>\n")
  920. elif isinstance(item, LTChar):
  921. if not self.within_chars:
  922. self.within_chars = True
  923. self.working_text = item.get_text()
  924. self.working_bbox = item.bbox
  925. self.working_font = item.fontname
  926. self.working_size = item.size
  927. elif len(item.get_text().strip()) == 0:
  928. self.write_word()
  929. self.write(item.get_text())
  930. else:
  931. if (
  932. self.working_bbox[1] != item.bbox[1]
  933. or self.working_font != item.fontname
  934. or self.working_size != item.size
  935. ):
  936. self.write_word()
  937. self.working_bbox = item.bbox
  938. self.working_font = item.fontname
  939. self.working_size = item.size
  940. self.working_text += item.get_text()
  941. self.working_bbox = (
  942. self.working_bbox[0],
  943. self.working_bbox[1],
  944. item.bbox[2],
  945. self.working_bbox[3],
  946. )
  947. render(ltpage)
  948. def close(self) -> None:
  949. self.write_footer()