| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040 |
- import io
- import logging
- import re
- from typing import (
- BinaryIO,
- Dict,
- Generic,
- List,
- Optional,
- Sequence,
- TextIO,
- Tuple,
- TypeVar,
- Union,
- cast,
- )
- from pdfminer import utils
- from pdfminer.image import ImageWriter
- from pdfminer.layout import (
- LAParams,
- LTAnno,
- LTChar,
- LTComponent,
- LTContainer,
- LTCurve,
- LTFigure,
- LTImage,
- LTItem,
- LTLayoutContainer,
- LTLine,
- LTPage,
- LTRect,
- LTText,
- LTTextBox,
- LTTextBoxVertical,
- LTTextGroup,
- LTTextLine,
- TextGroupElement,
- )
- from pdfminer.pdfcolor import PDFColorSpace
- from pdfminer.pdfdevice import PDFTextDevice
- from pdfminer.pdfexceptions import PDFValueError
- from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined
- from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
- from pdfminer.pdfpage import PDFPage
- from pdfminer.pdftypes import PDFStream
- from pdfminer.utils import (
- AnyIO,
- Matrix,
- PathSegment,
- Point,
- Rect,
- apply_matrix_pt,
- apply_matrix_rect,
- bbox2str,
- enc,
- make_compat_str,
- mult_matrix,
- )
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
class PDFLayoutAnalyzer(PDFTextDevice):
    """Text device that assembles rendered content into a layout tree.

    Characters, images and paths are added to ``cur_item``, the container
    currently being filled (an ``LTPage``, or an ``LTFigure`` while a figure
    is open).  When a page ends it is optionally analysed with ``laparams``
    and then passed to :meth:`receive_layout`, which subclasses override.
    """

    cur_item: LTLayoutContainer
    ctm: Matrix

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Store the page counter, the layout parameters and an empty stack.

        :param rsrcmgr: resource manager forwarded to ``PDFTextDevice``.
        :param pageno: number given to the first page that is processed.
        :param laparams: layout-analysis parameters; ``None`` skips analysis.
        """
        PDFTextDevice.__init__(self, rsrcmgr)
        self.pageno = pageno
        self.laparams = laparams
        # Containers suspended while a nested figure is being filled.
        self._stack: List[LTLayoutContainer] = []

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Create the ``LTPage`` that will receive this page's content."""
        ax, ay, bx, by = apply_matrix_rect(ctm, page.mediabox)
        # The page box is re-anchored at the origin; only its size is kept.
        page_box = (0, 0, abs(ax - bx), abs(ay - by))
        self.cur_item = LTPage(self.pageno, page_box)

    def end_page(self, page: PDFPage) -> None:
        """Analyse the finished page (if requested) and publish it."""
        assert not self._stack, str(len(self._stack))
        finished_page = self.cur_item
        assert isinstance(finished_page, LTPage), str(type(finished_page))
        if self.laparams is not None:
            finished_page.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(finished_page)

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Suspend the current container and start filling a new figure."""
        self._stack.append(self.cur_item)
        figure_matrix = mult_matrix(matrix, self.ctm)
        self.cur_item = LTFigure(name, bbox, figure_matrix)

    def end_figure(self, _: str) -> None:
        """Close the open figure and attach it to the suspended container."""
        finished_figure = self.cur_item
        assert isinstance(finished_figure, LTFigure), str(type(finished_figure))
        parent = self._stack.pop()
        self.cur_item = parent
        parent.add(finished_figure)

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Add an image that spans the bounding box of the open figure."""
        figure = self.cur_item
        assert isinstance(figure, LTFigure), str(type(figure))
        figure_box = (figure.x0, figure.y0, figure.x1, figure.y1)
        figure.add(LTImage(name, stream, figure_box))

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Paint paths described in section 4.4 of the PDF reference manual

        The path is classified by the sequence of its operators and added to
        the current container as an ``LTLine``, ``LTRect`` or ``LTCurve``.
        """
        shape = "".join(segment[0] for segment in path)

        if not shape.startswith("m"):
            # Per PDF Reference Section 4.4.1, "path construction operators
            # may be invoked in any sequence, but the first one invoked must
            # be m or re to begin a new subpath." Since pdfminer.six already
            # converts all `re` (rectangle) operators to their equivalent
            # `mlllh` representation, paths ingested by `.paint_path(...)`
            # that do not begin with the `m` operator are invalid.
            return

        if shape.count("m") > 1:
            # Several subpaths: paint each "m..." run on its own.
            for match in re.finditer(r"m[^m]+", shape):
                begin, end = match.span(0)
                self.paint_path(gstate, stroke, fill, evenodd, path[begin:end])
            return

        # Although the 'h' command does not literally provide a
        # point-position, its position is (by definition) equal to the
        # subpath's starting point.
        #
        # And, per Section 4.4's Table 4.9, all other path commands place
        # their point-position in their final two arguments. (Any preceding
        # arguments represent control points on Bézier curves.)
        start_point = path[0][-2:]
        raw_pts = [
            cast(Point, start_point if segment[0] == "h" else segment[-2:])
            for segment in path
        ]
        pts = [apply_matrix_pt(self.ctm, raw_pt) for raw_pt in raw_pts]

        # Rebuild every segment with all of its operands (control points
        # included) mapped through the current transformation matrix.
        transformed_path = []
        for segment in path:
            operator = str(segment[0])
            mapped_points = [
                apply_matrix_pt(self.ctm, (float(px), float(py)))
                for px, py in zip(segment[1::2], segment[2::2])
            ]
            transformed_path.append(cast(PathSegment, (operator, *mapped_points)))

        # Drop a redundant "l" on a path closed with "h"
        if len(shape) > 3 and shape.endswith("lh") and pts[-2] == pts[0]:
            shape = shape[:-2] + "h"
            pts.pop()

        if shape in {"mlh", "ml"}:
            # single line segment
            #
            # Note: 'ml', in conditional above, is a frequent anomaly
            # that we want to support.
            self.cur_item.add(
                LTLine(
                    gstate.linewidth,
                    pts[0],
                    pts[1],
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    original_path=transformed_path,
                    dashing_style=gstate.dash,
                )
            )
            return

        if shape in {"mlllh", "mllll"}:
            (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
            is_closed_loop = pts[0] == pts[4]
            # Consecutive corners must alternate between sharing an x and
            # sharing a y coordinate, starting with either axis.
            starts_along_y = x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
            starts_along_x = y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0
            if is_closed_loop and (starts_along_y or starts_along_x):
                self.cur_item.add(
                    LTRect(
                        gstate.linewidth,
                        (*pts[0], *pts[2]),
                        stroke,
                        fill,
                        evenodd,
                        gstate.scolor,
                        gstate.ncolor,
                        transformed_path,
                        gstate.dash,
                    )
                )
                return

        # Everything else, including four-sided shapes that are not
        # rectangles, is kept as a generic curve.
        self.cur_item.add(
            LTCurve(
                gstate.linewidth,
                pts,
                stroke,
                fill,
                evenodd,
                gstate.scolor,
                gstate.ncolor,
                transformed_path,
                gstate.dash,
            )
        )

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add one character to the current container.

        :return: the ``adv`` value of the created ``LTChar``.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        char_item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            font.char_width(cid),
            font.char_disp(cid),
            ncs,
            graphicstate,
        )
        self.cur_item.add(char_item)
        return char_item.adv

    def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
        """Return the placeholder text used when ``cid`` has no Unicode value."""
        log.debug("undefined: %r, %r", font, cid)
        return "(cid:%d)" % cid

    def receive_layout(self, ltpage: LTPage) -> None:
        """Hook called with every finished page; does nothing here."""
class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that remembers the most recently finished page."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Initialise the analyzer with no page stored yet."""
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno, laparams)
        # Overwritten by receive_layout() each time a page is completed.
        self.result: Optional[LTPage] = None

    def receive_layout(self, ltpage: LTPage) -> None:
        """Keep ``ltpage`` so that :meth:`get_result` can return it."""
        self.result = ltpage

    def get_result(self) -> LTPage:
        """Return the stored page; asserts that one has been received."""
        latest_page = self.result
        assert latest_page is not None
        return latest_page
# Some PDFConverter children support only binary I/O, so the output stream
# type is a type variable constrained to TextIO, BinaryIO or AnyIO.
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)
class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Base class for layout analyzers that write their result to a stream.

    Besides the analyzer state it stores the output stream, the codec and a
    flag telling whether the stream is considered binary.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Store the output stream and detect whether it is binary.

        :param rsrcmgr: resource manager forwarded to the layout analyzer.
        :param outfp: stream that receives the converted output.
        :param codec: codec name stored for use by subclasses.
        :param pageno: number given to the first page that is processed.
        :param laparams: layout-analysis parameters; ``None`` skips analysis.
        """
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        self.outfp_binary = self._is_binary_stream(self.outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Test if a stream is binary or not.

        A string ``mode`` attribute decides on its own: the stream is binary
        exactly when the mode contains ``"b"``.  A missing or non-string
        ``mode`` is ignored, and the stream type is inspected instead.
        Streams that cannot be classified are treated as binary.
        """
        mode = getattr(outfp, "mode", None)
        if isinstance(mode, str):
            return "b" in mode
        # Some file-like objects expose a non-string mode (an integer, for
        # example); a substring test on it would raise TypeError, so fall
        # back to checking the stream type.
        if isinstance(outfp, io.BytesIO):
            return True
        if isinstance(outfp, (io.StringIO, io.TextIOBase)):
            return False
        return True
class TextConverter(PDFConverter[AnyIO]):
    """Converter that writes the text of each page to the output stream."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        showpageno: bool = False,
        imagewriter: Optional[ImageWriter] = None,
    ) -> None:
        """Set up the converter.

        :param showpageno: write a ``Page <id>`` line before each page.
        :param imagewriter: when given, images are exported through it.
        """
        super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.imagewriter = imagewriter

    def write_text(self, text: str) -> None:
        """Write ``text`` to the output, encoding it for binary streams."""
        text = utils.compatible_encode_method(text, self.codec, "ignore")
        if not self.outfp_binary:
            cast(TextIO, self.outfp).write(text)
            return
        # NOTE(review): encode() is called without arguments, so the bytes
        # are UTF-8 whatever ``codec`` is -- confirm that this is intended.
        cast(BinaryIO, self.outfp).write(text.encode())

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the page's text, followed by a form feed."""

        def render(item: LTItem) -> None:
            if isinstance(item, LTContainer):
                for member in item:
                    render(member)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())
            # Checked separately from the chain above: a text box has
            # already been walked as a container and still needs its
            # trailing newline.
            if isinstance(item, LTTextBox):
                self.write_text("\n")
            elif isinstance(item, LTImage) and self.imagewriter is not None:
                self.imagewriter.export_image(item)

        if self.showpageno:
            self.write_text("Page %s\n" % ltpage.pageid)
        render(ltpage)
        self.write_text("\f")

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text. This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name: str, stream: PDFStream) -> None:
        """Record the image only when an image writer is configured."""
        if self.imagewriter is None:
            return
        PDFConverter.render_image(self, name, stream)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Ignore path painting; nothing is recorded."""
class HTMLConverter(PDFConverter[AnyIO]):
    """Converter that writes pages as absolutely positioned HTML.

    Pages are stacked vertically in one document; ``_yoffset`` tracks the
    vertical position of the page being written.  The header is written by
    the constructor and the footer by :meth:`close`.
    """

    # Colours merged into the configured ones when ``debug`` is set.
    RECT_COLORS = {
        "figure": "yellow",
        "textline": "magenta",
        "textbox": "cyan",
        "textgroup": "red",
        "curve": "black",
        "page": "gray",
    }

    TEXT_COLORS = {
        "textbox": "blue",
        "char": "black",
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        scale: float = 1,
        fontscale: float = 1.0,
        layoutmode: str = "normal",
        showpageno: bool = True,
        pagemargin: int = 50,
        imagewriter: Optional[ImageWriter] = None,
        debug: int = 0,
        rect_colors: Optional[Dict[str, str]] = None,
        text_colors: Optional[Dict[str, str]] = None,
    ) -> None:
        """Configure the converter and write the HTML header.

        :param scale: factor applied to all positions and sizes.
        :param fontscale: additional factor applied to font sizes.
        :param layoutmode: ``"exact"`` places every character individually,
            ``"loose"`` suppresses the line break after each text line, any
            other value produces flowing text boxes with line breaks.
        :param showpageno: write a ``Page <id>`` anchor above each page.
        :param pagemargin: vertical gap before the first and after each page.
        :param imagewriter: when given, images are exported and referenced.
        :param debug: when truthy, the class-level colour tables are merged
            over the configured colours.
        :param rect_colors: border colour per element kind; kinds that are
            missing are not drawn.
        :param text_colors: text colour per element kind; kinds that are
            missing are not drawn.
        :raises PDFValueError: if a binary stream has no codec, or a text
            stream has one.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        if text_colors is None:
            text_colors = {"char": "black"}
        if rect_colors is None:
            rect_colors = {"curve": "black", "page": "gray"}

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        # Work on copies so that the debug merge below never modifies the
        # dictionaries owned by the caller.
        self.rect_colors = dict(rect_colors)
        self.text_colors = dict(text_colors)
        if debug:
            self.rect_colors.update(self.RECT_COLORS)
            self.text_colors.update(self.TEXT_COLORS)
        self._yoffset: float = self.pagemargin
        # Font of the currently open <span>, or None when no span is open.
        self._font: Optional[Tuple[str, float]] = None
        # Fonts saved by begin_div() and restored by end_div().
        self._fontstack: List[Optional[Tuple[str, float]]] = []
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding it first when a codec is configured."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the document head, naming the charset when a codec is set."""
        self.write("<html><head>\n")
        if self.codec:
            s = (
                '<meta http-equiv="Content-Type" content="text/html; '
                'charset=%s">\n' % self.codec
            )
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write("</head><body>\n")

    def write_footer(self) -> None:
        """Write links to all processed pages and close the document."""
        page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
        s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
            page_links,
        )
        self.write(s)
        self.write("</body></html>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` after passing it through ``enc``."""
        self.write(enc(text))

    def place_rect(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Draw a bordered box if ``color`` has an entry in ``rect_colors``.

        ``color`` is an element kind (a key of ``rect_colors``), not a CSS
        colour.
        """
        color2 = self.rect_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; border: %s %dpx solid; '
                'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
                % (
                    color2,
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
        """Draw a box around ``item`` via :meth:`place_rect`."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(
        self,
        item: LTImage,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Export ``item`` and reference it; no-op without an image writer."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = (
                '<img src="%s" border="%d" style="position:absolute; '
                'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
                % (
                    enc(name),
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_text(
        self,
        color: str,
        text: str,
        x: float,
        y: float,
        size: float,
    ) -> None:
        """Write positioned text if ``color`` has an entry in ``text_colors``."""
        color2 = self.text_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; color:%s; left:%dpx; '
                'top:%dpx; font-size:%dpx;">'
                % (
                    color2,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    size * self.scale * self.fontscale,
                )
            )
            self.write(s)
            self.write_text(text)
            self.write("</span>\n")

    def begin_div(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
        writing_mode: str = "False",
    ) -> None:
        """Open a positioned ``<div>`` and start a fresh font context.

        NOTE(review): unlike :meth:`place_rect`, ``color`` is written into
        the border style as given, and the default ``writing_mode`` is the
        literal string ``"False"``, which is emitted verbatim.  Both are
        kept unchanged for output compatibility.
        """
        self._fontstack.append(self._font)
        self._font = None
        s = (
            '<div style="position:absolute; border: %s %dpx solid; '
            "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
            'height:%dpx;">'
            % (
                color,
                borderwidth,
                writing_mode,
                x * self.scale,
                (self._yoffset - y) * self.scale,
                w * self.scale,
                h * self.scale,
            )
        )
        self.write(s)

    def end_div(self, color: str) -> None:
        """Close the open font span (if any) and the ``<div>``."""
        if self._font is not None:
            self.write("</span>")
        self._font = self._fontstack.pop()
        self.write("</div>")

    def put_text(self, text: str, fontname: str, fontsize: float) -> None:
        """Write flowing text, opening a new font span when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write("</span>")
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split("+")[-1]
            # The font name comes from the document, so it is escaped before
            # being placed inside the style attribute.
            self.write(
                '<span style="font-family: %s; font-size:%dpx">'
                % (
                    enc(fontname_without_subset_tag),
                    fontsize * self.scale * self.fontscale,
                ),
            )
            self._font = font
        self.write_text(text)

    def put_newline(self) -> None:
        """Write a line break."""
        self.write("<br>")

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one analysed page below the pages written so far."""

        def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
            # Outline text groups recursively; other elements are skipped.
            if isinstance(item, LTTextGroup):
                self.place_border("textgroup", 1, item)
                for child in item:
                    show_group(child)

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border("page", 1, item)
                if self.showpageno:
                    self.write(
                        '<div style="position:absolute; top:%dpx;">'
                        % ((self._yoffset - item.y1) * self.scale),
                    )
                    self.write(
                        f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                    )
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border("curve", 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div("figure")
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            elif self.layoutmode == "exact":
                # Exact mode: every line, box and character is positioned
                # individually instead of flowing inside a <div>.
                if isinstance(item, LTTextLine):
                    self.place_border("textline", 1, item)
                    for child in item:
                        render(child)
                elif isinstance(item, LTTextBox):
                    self.place_border("textbox", 1, item)
                    self.place_text(
                        "textbox",
                        str(item.index + 1),
                        item.x0,
                        item.y1,
                        20,
                    )
                    for child in item:
                        render(child)
                elif isinstance(item, LTChar):
                    self.place_border("char", 1, item)
                    self.place_text(
                        "char",
                        item.get_text(),
                        item.x0,
                        item.y1,
                        item.size,
                    )
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
                if self.layoutmode != "loose":
                    self.put_newline()
            elif isinstance(item, LTTextBox):
                self.begin_div(
                    "textbox",
                    1,
                    item.x0,
                    item.y1,
                    item.width,
                    item.height,
                    item.get_writing_mode(),
                )
                for child in item:
                    render(child)
                self.end_div("textbox")
            elif isinstance(item, LTChar):
                fontname = make_compat_str(item.fontname)
                self.put_text(item.get_text(), fontname, item.size)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()
class XMLConverter(PDFConverter[AnyIO]):
    """Converter that writes the layout tree of each page as XML.

    The XML declaration and the opening ``<pages>`` tag are written by the
    constructor; :meth:`close` writes the closing tag.
    """

    # Characters removed from text when ``stripcontrol`` is enabled.
    CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        imagewriter: Optional[ImageWriter] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Configure the converter and write the XML header.

        :param imagewriter: when given, images are exported and referenced
            by name; otherwise only their size is written.
        :param stripcontrol: remove the characters matched by ``CONTROL``
            from text content.
        :raises PDFValueError: if a binary stream has no codec, or a text
            stream has one.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        # The two failure cases get distinct messages so that the error
        # describes the actual mismatch.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding it first when a codec is configured."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the XML declaration and open the ``<pages>`` element."""
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write("<pages>\n")

    def write_footer(self) -> None:
        """Close the ``<pages>`` element."""
        self.write("</pages>\n")

    def write_text(self, text: str) -> None:
        """Write text content, optionally stripped, passed through ``enc``."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(enc(text))

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one analysed page as a ``<page>`` element."""

        def show_group(item: LTItem) -> None:
            # Emit the grouping hierarchy: text boxes as empty references,
            # text groups as nested elements.
            if isinstance(item, LTTextBox):
                self.write(
                    '<textbox id="%d" bbox="%s" />\n'
                    % (item.index, bbox2str(item.bbox)),
                )
            elif isinstance(item, LTTextGroup):
                self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    show_group(child)
                self.write("</textgroup>\n")

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
                    item.pageid,
                    bbox2str(item.bbox),
                    item.rotate,
                )
                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
            elif isinstance(item, LTLine):
                s = '<line linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTRect):
                s = '<rect linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTCurve):
                s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                    item.get_pts(),
                )
                self.write(s)
            elif isinstance(item, LTFigure):
                # The figure name comes from the document, so it is escaped
                # before being placed inside the attribute.
                s = f'<figure name="{enc(item.name)}" bbox="{bbox2str(item.bbox)}">\n'
                self.write(s)
                for child in item:
                    render(child)
                self.write("</figure>\n")
            elif isinstance(item, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    render(child)
                self.write("</textline>\n")
            elif isinstance(item, LTTextBox):
                wmode = ""
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = '<textbox id="%d" bbox="%s"%s>\n' % (
                    item.index,
                    bbox2str(item.bbox),
                    wmode,
                )
                self.write(s)
                for child in item:
                    render(child)
                self.write("</textbox>\n")
            elif isinstance(item, LTChar):
                s = (
                    '<text font="%s" bbox="%s" colourspace="%s" '
                    'ncolour="%s" size="%.3f">'
                    % (
                        enc(item.fontname),
                        bbox2str(item.bbox),
                        item.ncs.name,
                        item.graphicstate.ncolor,
                        item.size,
                    )
                )
                self.write(s)
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTText):
                # Other text items carry no attributes; their content goes
                # through write_text() like character content does, so it
                # is escaped and honours ``stripcontrol``.
                self.write("<text>")
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write(
                        '<image src="%s" width="%d" height="%d" />\n'
                        % (enc(name), item.width, item.height),
                    )
                else:
                    self.write(
                        '<image width="%d" height="%d" />\n'
                        % (item.width, item.height),
                    )
            else:
                assert False, str(("Unhandled", item))

        render(ltpage)

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()
class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Characters removed from text when ``stripcontrol`` is enabled.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Configure the converter and write the hOCR header.

        :param stripcontrol: remove the characters matched by ``CONTROL``
            from the text that is written.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )
        self.stripcontrol = stripcontrol
        # True while characters are being accumulated into a word.
        self.within_chars = False
        # Text of the word being accumulated; its font, size and bounding
        # box are set when the first character of a word arrives.
        self.working_text = ""
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Return the hOCR ``bbox`` property for a box in PDF coordinates.

        Uses the bounding box of the page currently being written, so it is
        only meaningful while a page is being rendered.
        """
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write raw markup, encoding it first when a codec is configured."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the document head and open the body."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec,
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n",
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        self.write(
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
        )
        self.write(
            "<meta name='ocr-system' content='pdfminer.six HOCR Converter' />\n",
        )
        self.write(
            "  <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write the closing markup, with a commented-out debug script."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'></script--></body></html>\n",
        )

    def write_text(self, text: str) -> None:
        """Write ``text`` as is, after optional control-character removal."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Write the accumulated word as an ``ocrx_word`` span and reset.

        Nothing is written when no text has been accumulated.  The text and
        the font name are passed through ``enc`` so that markup characters
        coming from the document cannot break the output.
        """
        if len(self.working_text) > 0:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            word = self.working_text.strip()
            if self.stripcontrol:
                word = self.CONTROL.sub("", word)
            escaped_font = enc(self.working_font)
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    escaped_font,
                    self.working_size,
                    bold_and_italic_styles,
                    self.bbox_repr(self.working_bbox),
                    escaped_font,
                    self.working_size,
                    enc(word),
                ),
            )
        # Forget the word so that it can never be written a second time.
        self.working_text = ""
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one analysed page as an ``ocr_page`` element."""

        def render(item: LTItem) -> None:
            # An annotation (inserted space or newline) ends the word.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                # Do not lose a word that was still open at the page end.
                if self.within_chars:
                    self.write_word()
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % (self.bbox_repr(item.bbox)),
                )
                for child_line in item:
                    render(child_line)
                # Do not lose a word that was still open at the line end.
                if self.within_chars:
                    self.write_word()
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                char_text = item.get_text()
                if not self.within_chars:
                    # First character of a new word.
                    self.within_chars = True
                    self.working_text = char_text
                    self.working_bbox = item.bbox
                    self.working_font = item.fontname
                    self.working_size = item.size
                elif len(char_text.strip()) == 0:
                    # Whitespace ends the word and is copied to the output.
                    self.write_word()
                    self.write_text(char_text)
                else:
                    if (
                        self.working_bbox[1] != item.bbox[1]
                        or self.working_font != item.fontname
                        or self.working_size != item.size
                    ):
                        # Baseline, font or size changed: flush the word
                        # and begin a new one with this character, which
                        # would otherwise be dropped.
                        self.write_word()
                        self.within_chars = True
                        self.working_bbox = item.bbox
                        self.working_font = item.fontname
                        self.working_size = item.size
                    self.working_text += char_text
                    # Extend the word's box to this character's right edge.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()
|