| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
2772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563 |
- """
- Copyright (C) 2023 Artifex Software, Inc.
- This file is part of PyMuPDF.
- PyMuPDF is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License as published by the Free
- Software Foundation, either version 3 of the License, or (at your option)
- any later version.
- PyMuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
- You should have received a copy of the GNU Affero General Public License
- along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
- Alternative licensing terms are available from the licensor.
- For commercial licensing, see <https://www.artifex.com/> or contact
- Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
- CA 94129, USA, for further information.
- ---------------------------------------------------------------------
- Portions of this code have been ported from pdfplumber, see
- https://pypi.org/project/pdfplumber/.
- The ported code is under the following MIT license:
- ---------------------------------------------------------------------
- The MIT License (MIT)
- Copyright (c) 2015, Jeremy Singer-Vine
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
- ---------------------------------------------------------------------
- Also see here: https://github.com/jsvine/pdfplumber/blob/stable/LICENSE.txt
- ---------------------------------------------------------------------
- The porting mainly pertains to files "table.py" and relevant parts of
- "utils/text.py" within pdfplumber's repository on Github.
- With respect to "text.py", we have removed functions or features that are not
- used by table processing. Examples are:
- * the text search function
- * simple text extraction
- * text extraction by lines
- Original pdfplumber code neither detects nor identifies table headers.
- This PyMuPDF port adds respective code to the 'Table' class as method '_get_header'.
- This is implemented as new class TableHeader with the properties:
- * bbox: A tuple for the header's bbox
- * cells: A tuple for each bbox of a column header
- * names: A list of strings with column header text
- * external: A bool indicating whether the header is outside the table cells.
- """
- import inspect
- import itertools
- import string
- import html
- from collections.abc import Sequence
- from dataclasses import dataclass
- from operator import itemgetter
- import weakref
- # -------------------------------------------------------------------
- # Start of PyMuPDF interface code
- # -------------------------------------------------------------------
- from . import (
- Rect,
- Matrix,
- TEXTFLAGS_TEXT,
- TEXT_FONT_BOLD,
- TEXT_FONT_ITALIC,
- TEXT_FONT_MONOSPACED,
- TEXT_FONT_SUPERSCRIPT,
- TEXT_COLLECT_STYLES,
- TOOLS,
- EMPTY_RECT,
- sRGB_to_pdf,
- Point,
- message,
- mupdf,
- )
# Module-level working storage. NOTE(review): presumably refilled for every
# page that is searched for tables -- confirm against the code that writes it.
CHARS = []  # text characters from PyMuPDF
EDGES = []  # vector graphics from PyMuPDF
TEXTPAGE = None

# Per-span character style bits, tested against span["char_flags"].
TEXT_BOLD = mupdf.FZ_STEXT_BOLD
TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT

# Extraction flags: plain text extraction plus collection of style details.
FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES

white_spaces = set(string.whitespace)  # for checking white space only cells
def extract_cells(textpage, cell, markdown=False):
    """Return the text located inside a rect-like 'cell'.

    Walks blocks, lines, spans and characters of the textpage's raw dict and
    keeps every character whose bbox overlaps the cell by more than 50%.
    Lines are separated by a line break ("\\n", or "<br>" for Markdown).

    With markdown=True, spans are wrapped in Markdown markers for
    strikeout (horizontal text only), bold, italic and monospaced text.
    This only works correctly if the textpage was created with extraction
    flag bit TEXT_COLLECT_STYLES set.

    Args:
        textpage: A PyMuPDF TextPage object. Must have been created with
            TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
        cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
        markdown: If True, return text formatted for Markdown.

    Returns:
        A string with the text extracted from the cell, stripped of leading
        and trailing white space.
    """

    def outside(box):
        # True if 'box' has no contact with the cell at all.
        return bool(
            box[0] > cell[2]
            or box[2] < cell[0]
            or box[1] > cell[3]
            or box[3] < cell[1]
        )

    line_break = "<br>" if markdown else "\n"
    text = ""

    for block in textpage.extractRAWDICT()["blocks"]:
        if block["type"] != 0 or outside(block["bbox"]):
            continue  # no text block, or block outside cell

        for line in block["lines"]:
            if outside(line["bbox"]):
                continue  # line outside cell

            if text:  # something was stored before: this is a new line
                text += line_break

            # strikeout detection only works with horizontal text
            direction = line["dir"]
            horizontal = direction == (0, 1) or direction == (1, 0)

            for span in line["spans"]:
                if outside(span["bbox"]):
                    continue  # span outside cell

                # collect the characters with more than 50% bbox overlap
                accepted = []
                for char in span["chars"]:
                    char_rect = Rect(char["bbox"])
                    if abs(char_rect & cell) > 0.5 * abs(char_rect):
                        accepted.append(char["c"])
                span_text = "".join(accepted)

                if not span_text:
                    continue  # nothing of this span is in the cell

                if not markdown:  # plain text: no styling needed
                    text += span_text
                    continue

                # Opening markers in nesting order; closing markers are the
                # same ones in reverse order.
                char_flags = span["char_flags"]
                font_flags = span["flags"]
                markers = []
                if horizontal and char_flags & TEXT_STRIKEOUT:
                    markers.append("~~")
                if char_flags & TEXT_BOLD:
                    markers.append("**")
                if font_flags & TEXT_FONT_ITALIC:
                    markers.append("_")
                if font_flags & TEXT_FONT_MONOSPACED:
                    markers.append("`")
                prefix = "".join(markers)
                suffix = "".join(reversed(markers))

                if len(span["chars"]) > 2:
                    span_text = span_text.rstrip()

                if suffix and text.endswith(suffix):
                    # span continues the previous styling: extend that run
                    text = text[: -len(suffix)] + span_text + suffix
                elif not span_text.strip():
                    text += " "  # white space only: no styling markers
                else:
                    text += prefix + span_text + suffix

    return text.strip()
- # -------------------------------------------------------------------
- # End of PyMuPDF interface code
- # -------------------------------------------------------------------
class UnsetFloat(float):
    """Marker subclass of float.

    The module-level instance UNSET is created from it. NOTE(review):
    presumably it stands for "setting not supplied" -- confirm against the
    code that resolves the table settings.
    """


# Names of settings that must not be negative.
NON_NEGATIVE_SETTINGS = [
    "snap_tolerance",
    "snap_x_tolerance",
    "snap_y_tolerance",
    "join_tolerance",
    "join_x_tolerance",
    "join_y_tolerance",
    "edge_min_length",
    "min_words_vertical",
    "min_words_horizontal",
    "intersection_tolerance",
    "intersection_x_tolerance",
    "intersection_y_tolerance",
]

TABLE_STRATEGIES = ["lines", "lines_strict", "text", "explicit"]
UNSET = UnsetFloat(0)
DEFAULT_SNAP_TOLERANCE = 3
DEFAULT_JOIN_TOLERANCE = 3
DEFAULT_MIN_WORDS_VERTICAL = 3
DEFAULT_MIN_WORDS_HORIZONTAL = 1
DEFAULT_X_TOLERANCE = 3
DEFAULT_Y_TOLERANCE = 3
DEFAULT_X_DENSITY = 7.25
DEFAULT_Y_DENSITY = 13

# Returns (x0, top, x1, bottom) of a dict that has these keys.
bbox_getter = itemgetter("x0", "top", "x1", "bottom")

# Expansion of single ligature code points into their letters.
# The keys are written as escapes on purpose: stored as literal characters
# they are easily normalized to plain ASCII by text tooling, which turns the
# entries into useless identity mappings ("ff" -> "ff").
LIGATURES = {
    "\ufb00": "ff",  # LATIN SMALL LIGATURE FF
    "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
    "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
    "\ufb01": "fi",  # LATIN SMALL LIGATURE FI
    "\ufb02": "fl",  # LATIN SMALL LIGATURE FL
    "\ufb06": "st",  # LATIN SMALL LIGATURE ST
    "\ufb05": "st",  # LATIN SMALL LIGATURE LONG S T
    "\u017ft": "st",  # decomposed form "long s" + "t", kept for compatibility
}
def to_list(collection) -> list:
    """Return 'collection' as a list.

    A list is handed back unchanged (same object). Any other sequence is
    copied into a new list. A non-sequence offering a 'to_dict' method is
    converted via to_dict("records"). Everything else is passed to list().
    """
    if isinstance(collection, list):
        return collection
    if not isinstance(collection, Sequence) and hasattr(collection, "to_dict"):
        return collection.to_dict("records")  # pragma: nocover
    return list(collection)
class TextMap:
    """
    A TextMap maps each unicode character in the text to an individual `char`
    object (or, in the case of layout-implied whitespace, `None`).
    """

    def __init__(self, tuples=None) -> None:
        """Store the (text, char) tuples and the string they spell.

        Args:
            tuples: Sequence of (text, char) pairs. When omitted, an empty
                map is created (previously the default None raised a
                TypeError because it was iterated unconditionally).
        """
        self.tuples = [] if tuples is None else tuples
        self.as_string = "".join(map(itemgetter(0), self.tuples))

    def match_to_dict(
        self,
        m,
        main_group: int = 0,
        return_groups: bool = True,
        return_chars: bool = True,
    ) -> dict:
        """Describe a regex match on `as_string` as a dict.

        Args:
            m: A match object whose positions refer to `as_string`.
            main_group: The match group that determines text and position.
            return_groups: If True, add key "groups" with `m.groups()`.
            return_chars: If True, add key "chars" with the matched chars.

        Returns:
            A dict with keys "text", "x0", "top", "x1", "bottom" and,
            depending on the flags, "groups" and "chars".
        """
        subset = self.tuples[m.start(main_group) : m.end(main_group)]
        # entries without a char (layout whitespace) have no position
        chars = [c for _, c in subset if c is not None]
        x0, top, x1, bottom = objects_to_bbox(chars)

        result = {
            "text": m.group(main_group),
            "x0": x0,
            "top": top,
            "x1": x1,
            "bottom": bottom,
        }

        if return_groups:
            result["groups"] = m.groups()

        if return_chars:
            result["chars"] = chars

        return result
class WordMap:
    """
    A WordMap maps words->chars.
    """

    def __init__(self, tuples) -> None:
        """Store the list of (word, chars) tuples."""
        self.tuples = tuples

    def to_textmap(
        self,
        layout: bool = False,
        layout_width=0,
        layout_height=0,
        layout_width_chars: int = 0,
        layout_height_chars: int = 0,
        x_density=DEFAULT_X_DENSITY,
        y_density=DEFAULT_Y_DENSITY,
        x_shift=0,
        y_shift=0,
        y_tolerance=DEFAULT_Y_TOLERANCE,
        use_text_flow: bool = False,
        presorted: bool = False,
        expand_ligatures: bool = True,
    ) -> TextMap:
        """
        Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
        (char-text, char) tuples (i.e., a TextMap) that can be used to mimic the
        structural layout of the text on the page(s), using the following approach:

        - Sort the words by doctop if not already sorted.

        - Calculate the initial doctop for the starting page.

        - Cluster the words by doctop (taking `y_tolerance` into account), and
          iterate through them.

        - For each cluster, calculate the distance between that doctop and the
          initial doctop, in points, minus `y_shift`. Divide that distance by
          `y_density` to calculate the minimum number of newlines that should come
          before this cluster. Append that number of newlines *minus* the number of
          newlines already appended, with a minimum of one.

        - Then for each cluster, iterate through each word in it. Divide each
          word's x0, minus `x_shift`, by `x_density` to calculate the minimum
          number of characters that should come before this cluster. Append that
          number of spaces *minus* the number of characters and spaces already
          appended, with a minimum of one. Then append the word's text.

        - At the termination of each line, add more spaces if necessary to
          mimic `layout_width`.

        - Finally, add newlines to the end if necessary to mimic
          `layout_height`.

        Raises ValueError if `layout` is set and both the point and the
        character variant of the layout width (or height) are given.

        Note: This approach currently works best for horizontal, left-to-right
        text, but will display all words regardless of orientation. There is room
        for improvement in better supporting right-to-left text, as well as
        vertical text.
        """
        _textmap = []

        if not len(self.tuples):
            return TextMap(_textmap)

        expansions = LIGATURES if expand_ligatures else {}

        if layout:
            if layout_width_chars:
                if layout_width:
                    raise ValueError(
                        "`layout_width` and `layout_width_chars` cannot both be set."
                    )
            else:
                layout_width_chars = int(round(layout_width / x_density))

            if layout_height_chars:
                if layout_height:
                    raise ValueError(
                        "`layout_height` and `layout_height_chars` cannot both be set."
                    )
            else:
                layout_height_chars = int(round(layout_height / y_density))

            blank_line = [(" ", None)] * layout_width_chars
        else:
            blank_line = []

        num_newlines = 0

        words_sorted_doctop = (
            self.tuples
            if presorted or use_text_flow
            else sorted(self.tuples, key=lambda x: float(x[0]["doctop"]))
        )

        first_word = words_sorted_doctop[0][0]
        doctop_start = first_word["doctop"] - first_word["top"]

        for i, ws in enumerate(
            cluster_objects(
                words_sorted_doctop, lambda x: float(x[0]["doctop"]), y_tolerance
            )
        ):
            y_dist = (
                (ws[0][0]["doctop"] - (doctop_start + y_shift)) / y_density
                if layout
                else 0
            )
            num_newlines_prepend = max(
                # At least one newline, unless this is the first line
                int(i > 0),
                # ... or as many as needed to get the imputed "distance" from the top
                round(y_dist) - num_newlines,
            )

            # "_" instead of "i": do not clobber the cluster index above
            for _ in range(num_newlines_prepend):
                if not len(_textmap) or _textmap[-1][0] == "\n":
                    _textmap += blank_line
                _textmap.append(("\n", None))

            num_newlines += num_newlines_prepend

            line_len = 0

            line_words_sorted_x0 = (
                ws
                if presorted or use_text_flow
                else sorted(ws, key=lambda x: float(x[0]["x0"]))
            )

            for word, chars in line_words_sorted_x0:
                x_dist = (word["x0"] - x_shift) / x_density if layout else 0
                # one separating space unless the word starts the line
                num_spaces_prepend = max(min(1, line_len), round(x_dist) - line_len)
                _textmap += [(" ", None)] * num_spaces_prepend
                line_len += num_spaces_prepend

                for c in chars:
                    letters = expansions.get(c["text"], c["text"])
                    for letter in letters:
                        _textmap.append((letter, c))
                        line_len += 1

            # Append spaces at end of line
            if layout:
                _textmap += [(" ", None)] * (layout_width_chars - line_len)

        # Append blank lines at end of text
        if layout:
            num_newlines_append = layout_height_chars - (num_newlines + 1)
            for k in range(num_newlines_append):
                if k > 0:
                    _textmap += blank_line
                _textmap.append(("\n", None))

            # Remove terminal newline (nothing to inspect if the map is empty)
            if _textmap and _textmap[-1] == ("\n", None):
                _textmap = _textmap[:-1]

        return TextMap(_textmap)
class WordExtractor:
    """Group character dicts into words.

    Characters are dicts with (at least) the keys "text", "x0", "x1", "top",
    "bottom", "doctop", "upright" and "matrix", plus any key named in
    `extra_attrs`.
    """

    def __init__(
        self,
        x_tolerance=DEFAULT_X_TOLERANCE,
        y_tolerance=DEFAULT_Y_TOLERANCE,
        keep_blank_chars: bool = False,
        use_text_flow=False,
        horizontal_ltr=True,  # Should words be read left-to-right?
        vertical_ttb=False,  # Should vertical words be read top-to-bottom?
        extra_attrs=None,
        split_at_punctuation=False,
        expand_ligatures=True,
    ):
        """Store the extraction settings.

        Args:
            x_tolerance: Maximum gap within a line between two characters of
                the same word.
            y_tolerance: Maximum offset across lines between two characters
                of the same word; also used to cluster characters into lines.
            keep_blank_chars: If True, white space characters do not end a
                word.
            use_text_flow: If True, characters are processed in the given
                order instead of being sorted by position.
            horizontal_ltr: Reading direction of upright text.
            vertical_ttb: Reading direction of non-upright text.
            extra_attrs: Character keys that must be equal within a word;
                their values are copied into the word dict.
            split_at_punctuation: True for all of string.punctuation, or a
                string of characters that each form a word of their own.
            expand_ligatures: If True, ligatures in the word text are
                replaced by their letters.
        """
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance
        self.keep_blank_chars = keep_blank_chars
        self.use_text_flow = use_text_flow
        self.horizontal_ltr = horizontal_ltr
        self.vertical_ttb = vertical_ttb
        self.extra_attrs = [] if extra_attrs is None else extra_attrs

        # Note: string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        self.split_at_punctuation = (
            string.punctuation
            if split_at_punctuation is True
            else (split_at_punctuation or "")
        )

        self.expansions = LIGATURES if expand_ligatures else {}

    def merge_chars(self, ordered_chars: list):
        """Build the word dict for the characters of one word.

        Args:
            ordered_chars: Non-empty list of character dicts in reading order.

        Returns:
            A dict with keys "text", "x0", "x1", "top", "doctop", "bottom",
            "upright", "direction", "rotation", plus one key per entry of
            `extra_attrs` (taken from the first character).
        """
        x0, top, x1, bottom = objects_to_bbox(ordered_chars)
        first_char = ordered_chars[0]
        doctop_adj = first_char["doctop"] - first_char["top"]
        upright = first_char["upright"]
        direction = 1 if (self.horizontal_ltr if upright else self.vertical_ttb) else -1

        matrix = first_char["matrix"]

        rotation = 0
        if not upright and matrix[1] < 0:
            # Make a reversed *list*: a bare reversed() iterator can be
            # walked only once and cannot be indexed.
            ordered_chars = list(reversed(ordered_chars))
            rotation = 270

        if matrix[0] < 0 and matrix[3] < 0:
            rotation = 180
        elif matrix[1] > 0:
            rotation = 90

        word = {
            "text": "".join(
                self.expansions.get(c["text"], c["text"]) for c in ordered_chars
            ),
            "x0": x0,
            "x1": x1,
            "top": top,
            "doctop": top + doctop_adj,
            "bottom": bottom,
            "upright": upright,
            "direction": direction,
            "rotation": rotation,
        }

        # All characters of a word agree on the extra attributes (they are
        # part of the grouping key), so the first character is representative.
        for key in self.extra_attrs:
            word[key] = first_char[key]

        return word

    def char_begins_new_word(
        self,
        prev_char,
        curr_char,
    ) -> bool:
        """This method takes several factors into account to determine if
        `curr_char` represents the beginning of a new word:

        - Whether the text is "upright" (i.e., non-rotated)

        - Whether the user has specified that horizontal text runs
          left-to-right (default) or right-to-left, as represented by
          self.horizontal_ltr

        - Whether the user has specified that vertical text runs
          top-to-bottom or bottom-to-top (default), as represented by
          self.vertical_ttb

        - The x0, top, x1, and bottom attributes of prev_char and
          curr_char

        - The self.x_tolerance and self.y_tolerance settings. Note: In
          this case, x/y refer to those directions for non-rotated text.
          For vertical text, they are flipped. A more accurate terminology
          might be "*intra*line character distance tolerance" and
          "*inter*line character distance tolerance"

        An important note: The *intra*line distance is measured from the
        *end* of the previous character to the *beginning* of the current
        character, while the *inter*line distance is measured from the
        *top* of the previous character to the *top* of the next
        character. The reasons for this are partly repository-historical,
        and partly logical, as successive text lines' bounding boxes often
        overlap slightly (and we don't want that overlap to be interpreted
        as the two lines being the same line).

        The upright-ness of the character determines the attributes to
        compare, while horizontal_ltr/vertical_ttb determine the direction
        of the comparison.
        """

        # Note: Due to the grouping step earlier in the process,
        # curr_char["upright"] will always equal prev_char["upright"].
        if curr_char["upright"]:
            x = self.x_tolerance
            y = self.y_tolerance
            ay = prev_char["top"]
            cy = curr_char["top"]
            if self.horizontal_ltr:
                ax = prev_char["x0"]
                bx = prev_char["x1"]
                cx = curr_char["x0"]
            else:
                # negate so that the comparisons below also fit right-to-left
                ax = -prev_char["x1"]
                bx = -prev_char["x0"]
                cx = -curr_char["x1"]

        else:
            x = self.y_tolerance
            y = self.x_tolerance
            ay = prev_char["x0"]
            cy = curr_char["x0"]
            if self.vertical_ttb:
                ax = prev_char["top"]
                bx = prev_char["bottom"]
                cx = curr_char["top"]
            else:
                ax = -prev_char["bottom"]
                bx = -prev_char["top"]
                cx = -curr_char["bottom"]

        return bool(
            # Intraline test
            (cx < ax)
            or (cx > bx + x)
            # Interline test
            or (cy > ay + y)
        )

    def iter_chars_to_words(self, ordered_chars):
        """Yield lists of characters, one list per word.

        White space ends the current word (unless keep_blank_chars is set),
        a character contained in split_at_punctuation becomes a word of its
        own, and char_begins_new_word() decides in all other cases.
        """
        current_word: list = []

        def start_next_word(new_char=None):
            # Emit the collected word (if any) and start a fresh one.
            nonlocal current_word

            if current_word:
                yield current_word

            current_word = [] if new_char is None else [new_char]

        for char in ordered_chars:
            text = char["text"]

            if not self.keep_blank_chars and text.isspace():
                yield from start_next_word(None)

            elif text in self.split_at_punctuation:
                yield from start_next_word(char)
                yield from start_next_word(None)

            elif current_word and self.char_begins_new_word(current_word[-1], char):
                yield from start_next_word(char)

            else:
                current_word.append(char)

        # Finally, after all chars processed
        if current_word:
            yield current_word

    def iter_sort_chars(self, chars):
        """Yield the characters line by line in reading order.

        Upright and non-upright characters are handled separately. Each
        group is clustered into lines (by "doctop", resp. "x0") and every
        line is sorted along its reading direction.
        """

        def upright_key(x) -> int:
            return -int(x["upright"])

        for upright_cluster in cluster_objects(list(chars), upright_key, 0):
            upright = upright_cluster[0]["upright"]
            cluster_key = "doctop" if upright else "x0"

            # Cluster by line
            subclusters = cluster_objects(
                upright_cluster, itemgetter(cluster_key), self.y_tolerance
            )

            for sc in subclusters:
                # Sort within line
                sort_key = "x0" if upright else "doctop"
                to_yield = sorted(sc, key=itemgetter(sort_key))

                # Reverse order if necessary
                if not (self.horizontal_ltr if upright else self.vertical_ttb):
                    yield from reversed(to_yield)
                else:
                    yield from to_yield

    def iter_extract_tuples(self, chars):
        """Yield (word, word_chars) tuples for the given characters."""
        ordered_chars = chars if self.use_text_flow else self.iter_sort_chars(chars)

        # consecutive characters only share a word if these values agree
        grouping_key = itemgetter("upright", *self.extra_attrs)
        grouped_chars = itertools.groupby(ordered_chars, grouping_key)

        for _, char_group in grouped_chars:
            for word_chars in self.iter_chars_to_words(char_group):
                yield (self.merge_chars(word_chars), word_chars)

    def extract_wordmap(self, chars) -> WordMap:
        """Return a WordMap of all (word, word_chars) tuples."""
        return WordMap(list(self.iter_extract_tuples(chars)))

    def extract_words(self, chars: list) -> list:
        """Return the list of word dicts for the given characters."""
        return [word for word, _ in self.iter_extract_tuples(chars)]
def extract_words(chars: list, **kwargs) -> list:
    """Return the words found in 'chars'.

    All keyword arguments are passed to the WordExtractor constructor.
    """
    extractor = WordExtractor(**kwargs)
    return extractor.extract_words(chars)
# Parameter names of the WordExtractor constructor and of WordMap.to_textmap.
# They are used to pick, from a common set of keyword arguments, those that
# each of the two callables understands.
WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()
TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
def chars_to_textmap(chars: list, **kwargs) -> TextMap:
    """Return a TextMap for the given characters.

    The keyword arguments are distributed to the WordExtractor constructor
    and to WordMap.to_textmap() according to the parameter names each one
    accepts. "presorted" is always forced to True.
    """
    kwargs["presorted"] = True

    extractor_args = {k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
    wordmap = WordExtractor(**extractor_args).extract_wordmap(chars)

    textmap_args = {k: kwargs[k] for k in TEXTMAP_KWARGS if k in kwargs}
    return wordmap.to_textmap(**textmap_args)
def extract_text(chars: list, **kwargs) -> str:
    """Return the text of the given characters as one string.

    With a truthy keyword "layout", the string of the layout-mimicking
    TextMap is returned. Otherwise the characters are grouped into words
    and joined according to the rotation of the first word:

    * 90 / 270 degrees: words sorted along the rotated reading direction
      and joined by spaces
    * 0 / 180 degrees: words clustered into lines (keyword "y_tolerance"),
      words joined by spaces and lines by line breaks; for 180 degrees the
      result is reversed and its line breaks are replaced by spaces
    """
    chars = to_list(chars)
    if not chars:
        return ""

    if kwargs.get("layout"):
        return chars_to_textmap(chars, **kwargs).as_string

    y_tolerance = kwargs.get("y_tolerance", DEFAULT_Y_TOLERANCE)
    extractor_args = {k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
    words = WordExtractor(**extractor_args).extract_words(chars)

    # rotation cannot change within a cell
    rotation = words[0]["rotation"] if words else 0

    if rotation == 90:
        words.sort(key=lambda w: (w["x1"], -w["top"]))
        return " ".join(w["text"] for w in words)

    if rotation == 270:
        words.sort(key=lambda w: (-w["x1"], w["top"]))
        return " ".join(w["text"] for w in words)

    word_lines = cluster_objects(words, itemgetter("doctop"), y_tolerance)
    text = "\n".join(" ".join(w["text"] for w in line) for line in word_lines)

    if rotation == 180:  # needs extra treatment
        text = "".join(" " if c == "\n" else c for c in reversed(text))

    return text
def collate_line(
    line_chars: list,
    tolerance=DEFAULT_X_TOLERANCE,
) -> str:
    """Join the characters of one line, sorted by "x0", into a string.

    A space is inserted wherever a character starts more than 'tolerance'
    to the right of the end ("x1") of its predecessor.
    """
    pieces = []
    prev_x1 = None
    for char in sorted(line_chars, key=itemgetter("x0")):
        if prev_x1 is not None and char["x0"] > prev_x1 + tolerance:
            pieces.append(" ")
        pieces.append(char["text"])
        prev_x1 = char["x1"]
    return "".join(pieces)
def dedupe_chars(chars: list, tolerance=1) -> list:
    """
    Removes duplicate chars — those sharing the same text, fontname, size,
    and positioning (within `tolerance`) as other characters in the set.

    The surviving characters are returned in the order in which they occur
    in `chars`.
    """
    style_key = itemgetter("fontname", "size", "upright", "text")
    position_key = itemgetter("doctop", "x0")

    survivors = []
    by_style = itertools.groupby(sorted(chars, key=style_key), key=style_key)
    for _, same_style in by_style:
        # characters of equal style: cluster vertically, then horizontally
        rows = cluster_objects(list(same_style), itemgetter("doctop"), tolerance)
        for row in rows:
            for spot in cluster_objects(row, itemgetter("x0"), tolerance):
                # keep the top-left most character of each cluster
                survivors.append(min(spot, key=position_key))

    survivors.sort(key=chars.index)
    return survivors
def line_to_edge(line):
    """Return a copy of 'line' with key "orientation" added.

    The orientation is "h" if "top" equals "bottom", otherwise "v".
    """
    is_horizontal = line["top"] == line["bottom"]
    return {**line, "orientation": "h" if is_horizontal else "v"}
def rect_to_edges(rect) -> list:
    """Return the four edges of a rectangle as [top, bottom, left, right].

    Each edge is a copy of 'rect' with "object_type" set to "rect_edge", an
    "orientation" ("h" or "v") and the coordinates collapsed onto the
    respective side of the rectangle.
    """
    top_edge = {
        **rect,
        "object_type": "rect_edge",
        "height": 0,
        "y0": rect["y1"],
        "bottom": rect["top"],
        "orientation": "h",
    }
    bottom_edge = {
        **rect,
        "object_type": "rect_edge",
        "height": 0,
        "y1": rect["y0"],
        "top": rect["top"] + rect["height"],
        "doctop": rect["doctop"] + rect["height"],
        "orientation": "h",
    }
    left_edge = {
        **rect,
        "object_type": "rect_edge",
        "width": 0,
        "x1": rect["x0"],
        "orientation": "v",
    }
    right_edge = {
        **rect,
        "object_type": "rect_edge",
        "width": 0,
        "x0": rect["x1"],
        "orientation": "v",
    }
    return [top_edge, bottom_edge, left_edge, right_edge]
def curve_to_edges(curve) -> list:
    """Convert a curve into one edge per pair of consecutive points.

    Every edge is the bounding box of the segment between two neighboring
    entries of `curve["pts"]`. Its orientation is "v" if both points share
    the x-coordinate, "h" if they share the y-coordinate, otherwise None.
    """
    pts = curve["pts"]
    edges = []
    for start, end in zip(pts, pts[1:]):
        if start[0] == end[0]:
            orientation = "v"
        elif start[1] == end[1]:
            orientation = "h"
        else:
            orientation = None
        upper = min(start[1], end[1])
        edges.append(
            {
                "object_type": "curve_edge",
                "x0": min(start[0], end[0]),
                "x1": max(start[0], end[0]),
                "top": upper,
                # shift by the curve's page offset within the document
                "doctop": upper + (curve["doctop"] - curve["top"]),
                "bottom": max(start[1], end[1]),
                "width": abs(start[0] - end[0]),
                "height": abs(start[1] - end[1]),
                "orientation": orientation,
            }
        )
    return edges
def obj_to_edges(obj) -> list:
    """Return the list of edges represented by `obj`.

    Objects whose "object_type" contains "_edge" are returned unchanged
    (wrapped in a list); "line", "rect" and "curve" objects are converted.
    Any other object type raises a KeyError.
    """
    kind = obj["object_type"]
    if "_edge" in kind:
        return [obj]
    if kind == "line":
        return [line_to_edge(obj)]
    converters = {"rect": rect_to_edges, "curve": curve_to_edges}
    return converters[kind](obj)
def filter_edges(
    edges,
    orientation=None,
    edge_type=None,
    min_length=1,
) -> list:
    """Return the edges matching the given criteria.

    An edge is kept if its "object_type" equals `edge_type` (when given),
    its "orientation" equals `orientation` (when given), and its length --
    "height" for vertical edges, "width" for all others -- is at least
    `min_length`.

    Raises ValueError if `orientation` is not "v", "h" or None.
    """
    if orientation not in ("v", "h", None):
        raise ValueError("Orientation must be 'v' or 'h'")

    def keep(edge) -> bool:
        edge_orientation = edge["orientation"]
        if edge_type is not None and not (edge["object_type"] == edge_type):
            return False
        if orientation is not None and not (edge_orientation == orientation):
            return False
        length_key = "height" if edge_orientation == "v" else "width"
        return bool(edge[length_key] >= min_length)

    return [edge for edge in edges if keep(edge)]
def cluster_list(xs, tolerance=0) -> list:
    """Group sorted values into clusters of neighbors.

    After sorting, a value joins the current cluster if it is at most
    `tolerance` larger than its predecessor; otherwise it starts a new
    cluster. With a tolerance of 0, or fewer than two values, every value
    forms its own cluster.
    """
    if tolerance == 0 or len(xs) < 2:
        return [[x] for x in sorted(xs)]

    ordered = sorted(xs)
    clusters = [[ordered[0]]]
    for previous, current in zip(ordered, ordered[1:]):
        if current <= previous + tolerance:
            clusters[-1].append(current)
        else:
            clusters.append([current])
    return clusters
def make_cluster_dict(values, tolerance) -> dict:
    """Map each distinct value to the index of the cluster it belongs to.

    The distinct values are clustered via `cluster_list` using `tolerance`;
    cluster indices follow the order of the clusters returned by it.
    """
    clusters = cluster_list(list(set(values)), tolerance)
    return {
        value: index
        for index, cluster in enumerate(clusters)
        for value in cluster
    }
def cluster_objects(xs, key_fn, tolerance) -> list:
    """Group objects whose key values lie within `tolerance` of each other.

    `key_fn` is either a callable or a key that is turned into an
    `itemgetter`. Returns a list of lists of objects, ordered by ascending
    cluster index; inside a cluster the objects keep their input order.
    """
    if not callable(key_fn):
        key_fn = itemgetter(key_fn)

    cluster_of = make_cluster_dict(map(key_fn, xs), tolerance)
    cluster_index = itemgetter(1)
    # pair every object with its cluster index; the sort is stable
    tagged = sorted(
        ((x, cluster_of.get(key_fn(x))) for x in xs), key=cluster_index
    )
    return [
        [obj for obj, _ in members]
        for _, members in itertools.groupby(tagged, key=cluster_index)
    ]
def move_object(obj, axis: str, value):
    """Return a copy of `obj` shifted by `value` along `axis`.

    For axis "h", "x0" and "x1" are increased by `value`. For axis "v",
    "top" and "bottom" (and "doctop" if present) are increased, while "y0"
    and "y1" (if "y0" is present) are decreased by `value`. The result is
    built with the class of `obj`; the input is left unchanged.
    """
    assert axis in ("h", "v")
    if axis == "h":
        shifted = {"x0": obj["x0"] + value, "x1": obj["x1"] + value}
    elif axis == "v":
        shifted = {"top": obj["top"] + value, "bottom": obj["bottom"] + value}
        if "doctop" in obj:
            shifted["doctop"] = obj["doctop"] + value
        if "y0" in obj:
            # y0 / y1 run in the opposite direction of top / bottom
            shifted["y0"] = obj["y0"] - value
            shifted["y1"] = obj["y1"] - value
    # later pairs override earlier ones with the same key
    return obj.__class__(tuple(obj.items()) + tuple(shifted.items()))
def snap_objects(objs, attr: str, tolerance) -> list:
    """Align objects whose `attr` values lie within `tolerance`.

    The objects are clustered by `attr` ("x0", "x1", "top" or "bottom"), and
    every member of a cluster is moved so that its `attr` equals the
    cluster average. Returns the moved copies, cluster by cluster.
    """
    axis = {"x0": "h", "x1": "h", "top": "v", "bottom": "v"}[attr]
    get_attr = itemgetter(attr)
    snapped = []
    for cluster in cluster_objects(list(objs), get_attr, tolerance):
        average = sum(map(get_attr, cluster)) / len(cluster)
        snapped.extend(
            move_object(obj, axis, average - obj[attr]) for obj in cluster
        )
    return snapped
def snap_edges(
    edges,
    x_tolerance=DEFAULT_SNAP_TOLERANCE,
    y_tolerance=DEFAULT_SNAP_TOLERANCE,
):
    """
    Given a list of edges, snap any within `tolerance` pixels of one another
    to their positional average.

    Vertical edges are snapped on "x0" using `x_tolerance`, horizontal edges
    on "top" using `y_tolerance`. The result lists the vertical edges first.
    An edge with any other orientation raises a KeyError.
    """
    vertical, horizontal = [], []
    buckets = {"v": vertical, "h": horizontal}
    for edge in edges:
        buckets[edge["orientation"]].append(edge)

    snapped_vertical = snap_objects(vertical, "x0", x_tolerance)
    snapped_horizontal = snap_objects(horizontal, "top", y_tolerance)
    return snapped_vertical + snapped_horizontal
def resize_object(obj, key: str, value):
    """Return a copy of `obj` with one side moved to `value`.

    `key` names the side ("x0", "x1", "top" or "bottom"). Dependent entries
    are adjusted as well: "width" for horizontal changes; "height" plus
    "doctop" / "y1" (for "top") or "y0" (for "bottom") for vertical changes.
    Assertions guard against a side being moved past its opposite side.
    """
    assert key in ("x0", "x1", "top", "bottom")
    delta = value - obj[key]
    changes = {key: value}

    if key == "x0":
        assert value <= obj["x1"]
        changes["width"] = obj["x1"] - value
    elif key == "x1":
        assert value >= obj["x0"]
        changes["width"] = value - obj["x0"]
    elif key == "top":
        assert value <= obj["bottom"]
        changes["doctop"] = obj["doctop"] + delta
        changes["height"] = obj["height"] - delta
        if "y1" in obj:
            changes["y1"] = obj["y1"] - delta
    elif key == "bottom":
        assert value >= obj["top"]
        changes["height"] = obj["height"] + delta
        if "y0" in obj:
            changes["y0"] = obj["y0"] - delta

    # later pairs override earlier ones with the same key
    return obj.__class__(tuple(obj.items()) + tuple(changes.items()))
def join_edge_group(edges, orientation: str, tolerance=DEFAULT_JOIN_TOLERANCE):
    """
    Given a list of edges along the same infinite line, join those that
    are within `tolerance` pixels of one another.

    `orientation` must be "h" or "v", otherwise a ValueError is raised.
    Returns the joined edges ordered by their start coordinate; an empty
    input yields an empty list.
    """
    if orientation == "h":
        min_prop, max_prop = "x0", "x1"
    elif orientation == "v":
        min_prop, max_prop = "top", "bottom"
    else:
        raise ValueError("Orientation must be 'v' or 'h'")

    sorted_edges = sorted(edges, key=itemgetter(min_prop))
    if not sorted_edges:  # nothing to join (previously an IndexError)
        return []

    joined = [sorted_edges[0]]
    for e in sorted_edges[1:]:
        last = joined[-1]
        if e[min_prop] <= (last[max_prop] + tolerance):
            if e[max_prop] > last[max_prop]:
                # Extend current edge to new extremity
                joined[-1] = resize_object(last, max_prop, e[max_prop])
            # otherwise `e` is fully covered by `last` and is dropped
        else:
            # Edge is separate from previous edges
            joined.append(e)
    return joined
def merge_edges(
    edges,
    snap_x_tolerance,
    snap_y_tolerance,
    join_x_tolerance,
    join_y_tolerance,
):
    """
    Using the `snap_edges` and `join_edge_group` methods above,
    merge a list of edges into a more "seamless" list.

    Snapping is only done if at least one snap tolerance is positive.
    Afterwards, edges on the same line (same orientation and same "top" for
    horizontal, same "x0" for vertical edges) are joined.
    """

    def line_key(edge):
        if edge["orientation"] == "h":
            return ("h", edge["top"])
        return ("v", edge["x0"])

    if snap_x_tolerance > 0 or snap_y_tolerance > 0:
        edges = snap_edges(edges, snap_x_tolerance, snap_y_tolerance)

    merged = []
    ordered = sorted(edges, key=line_key)
    for (orientation, _), group in itertools.groupby(ordered, key=line_key):
        tolerance = join_x_tolerance if orientation == "h" else join_y_tolerance
        merged.extend(join_edge_group(group, orientation, tolerance))
    return merged
def bbox_to_rect(bbox) -> dict:
    """
    Convert a bounding box (x0, top, x1, bottom) into a rectangle, i.e. a
    dict with the keys "x0", "top", "x1" and "bottom".
    """
    names = ("x0", "top", "x1", "bottom")
    return {name: bbox[position] for position, name in enumerate(names)}
def objects_to_rect(objects) -> dict:
    """
    Return the smallest rectangle (a dict with the keys "x0", "top", "x1"
    and "bottom") that contains every object of the given iterable.
    """
    enclosing_bbox = objects_to_bbox(objects)
    return bbox_to_rect(enclosing_bbox)
def merge_bboxes(bboxes):
    """
    Return the smallest bounding box (x0, top, x1, bottom) that contains
    all bounding boxes of the given iterable.
    """
    lefts, tops, rights, bottoms = zip(*bboxes)
    upper_left = (min(lefts), min(tops))
    lower_right = (max(rights), max(bottoms))
    return upper_left + lower_right
def objects_to_bbox(objects):
    """
    Return the smallest bounding box that contains every object of the
    given iterable. Each object's own box is obtained via `bbox_getter`.
    """
    return merge_bboxes(bbox_getter(obj) for obj in objects)
def words_to_edges_h(words, word_threshold: int = DEFAULT_MIN_WORDS_HORIZONTAL):
    """
    Find (imaginary) horizontal lines that connect the tops
    of at least `word_threshold` words.

    All returned edges span from the smallest "x0" to the largest "x1" of
    the qualifying word rows.
    """
    by_top = cluster_objects(words, itemgetter("top"), 1)
    rects = [
        objects_to_rect(cluster)
        for cluster in by_top
        if len(cluster) >= word_threshold
    ]
    if not rects:
        return []

    min_x0 = min(r["x0"] for r in rects)
    max_x1 = max(r["x1"] for r in rects)

    def h_edge(y):
        return {
            "x0": min_x0,
            "x1": max_x1,
            "top": y,
            "bottom": y,
            "width": max_x1 - min_x0,
            "orientation": "h",
        }

    edges = []
    for r in rects:
        # Top of text
        edges.append(h_edge(r["top"]))
        # For each detected row, we also add the 'bottom' line. This will
        # generate extra edges, (some will be redundant with the next row
        # 'top' line), but this catches the last row of every table.
        edges.append(h_edge(r["bottom"]))
    return edges
def get_bbox_overlap(a, b):
    """Return the intersection of two bounding boxes, or None.

    Both boxes are (left, top, right, bottom) sequences. Boxes that merely
    touch along a side still overlap (in a box of zero width or height);
    boxes that are apart or share only a single point yield None.
    """
    a_left, a_top, a_right, a_bottom = a
    b_left, b_top, b_right, b_bottom = b

    left, right = max(a_left, b_left), min(a_right, b_right)
    top, bottom = max(a_top, b_top), min(a_bottom, b_bottom)
    width = right - left
    height = bottom - top

    overlaps = height >= 0 and width >= 0 and height + width > 0
    return (left, top, right, bottom) if overlaps else None
def words_to_edges_v(words, word_threshold: int = DEFAULT_MIN_WORDS_VERTICAL):
    """
    Find (imaginary) vertical lines that connect the left, right, or
    center of at least `word_threshold` words.

    One edge is returned per retained word group (at the group's "x0"),
    plus one closing edge at the largest "x1". All edges share the same
    vertical extent.
    """

    def center_x(word):
        return float(word["x0"] + word["x1"]) / 2

    # Find words that share the same left, right, or centerpoints
    candidates = []
    for key_fn in (itemgetter("x0"), itemgetter("x1"), center_x):
        candidates.extend(cluster_objects(words, key_fn, 1))

    # Largest groups first (stable: equally sized groups keep their order)
    candidates.sort(key=len, reverse=True)

    # For each sufficiently large group, the bbox fitting all its words
    bboxes = [
        objects_to_bbox(group)
        for group in candidates
        if len(group) >= word_threshold
    ]

    # Drop every bbox overlapping one that was accepted before
    kept = []
    for bbox in bboxes:
        if not any(get_bbox_overlap(bbox, other) for other in kept):
            kept.append(bbox)
    if not kept:
        return []

    rects = sorted(map(bbox_to_rect, kept), key=itemgetter("x0"))
    max_x1 = max(r["x1"] for r in rects)
    min_top = min(r["top"] for r in rects)
    max_bottom = max(r["bottom"] for r in rects)

    def v_edge(x):
        return {
            "x0": x,
            "x1": x,
            "top": min_top,
            "bottom": max_bottom,
            "height": max_bottom - min_top,
            "orientation": "v",
        }

    return [v_edge(r["x0"]) for r in rects] + [v_edge(max_x1)]
def edges_to_intersections(edges, x_tolerance=1, y_tolerance=1) -> dict:
    """
    Given a list of edges, return the points at which they intersect
    within `tolerance` pixels.

    The result maps each intersection point (x0 of the vertical edge, top
    of the horizontal edge) to a dict {"v": [...], "h": [...]} holding the
    vertical and horizontal edges found at that point. Edges with an
    orientation other than "v" or "h" are ignored.
    """
    edges = list(edges)  # iterated twice below; also accept iterators
    v_edges = sorted(
        (e for e in edges if e["orientation"] == "v"),
        key=itemgetter("x0", "top"),
    )
    # sorted once here instead of once per vertical edge
    h_edges = sorted(
        (e for e in edges if e["orientation"] == "h"),
        key=itemgetter("top", "x0"),
    )

    intersections = {}
    for v in v_edges:
        for h in h_edges:
            if (
                (v["top"] <= (h["top"] + y_tolerance))
                and (v["bottom"] >= (h["top"] - y_tolerance))
                and (v["x0"] >= (h["x0"] - x_tolerance))
                and (v["x0"] <= (h["x1"] + x_tolerance))
            ):
                vertex = (v["x0"], h["top"])
                if vertex not in intersections:
                    intersections[vertex] = {"v": [], "h": []}
                intersections[vertex]["v"].append(v)
                intersections[vertex]["h"].append(h)
    return intersections
def obj_to_bbox(obj):
    """
    Return the bounding box of `obj`, as delivered by `bbox_getter`.
    """
    bbox = bbox_getter(obj)
    return bbox
def intersections_to_cells(intersections):
    """
    Given a list of points (`intersections`), return all rectangular "cells"
    that those points describe.

    `intersections` should be a dictionary with (x0, top) tuples as keys,
    and a list of edge objects as values. The edge objects should correspond
    to the edges that touch the intersection.

    Each cell is returned as a tuple (x0, top, x1, bottom).
    """

    def shares_edge(p, q) -> bool:
        # Points with the same x must share a vertical edge, points with
        # the same y a horizontal one.
        for axis, kind in ((0, "v"), (1, "h")):
            if p[axis] != q[axis]:
                continue
            p_boxes = {obj_to_bbox(e) for e in intersections[p][kind]}
            q_boxes = {obj_to_bbox(e) for e in intersections[q][kind]}
            if p_boxes & q_boxes:
                return True
        return False

    points = sorted(intersections.keys())
    last_index = len(points) - 1

    def smallest_cell_at(index: int):
        if index == last_index:
            return None
        origin = points[index]
        later = points[index + 1 :]
        # Get all the points directly below and directly right
        below = [p for p in later if p[0] == origin[0]]
        right = [p for p in later if p[1] == origin[1]]
        for below_pt in below:
            if not shares_edge(origin, below_pt):
                continue
            for right_pt in right:
                if not shares_edge(origin, right_pt):
                    continue
                corner = (right_pt[0], below_pt[1])
                if (
                    corner in intersections
                    and shares_edge(corner, right_pt)
                    and shares_edge(corner, below_pt)
                ):
                    return (origin[0], origin[1], corner[0], corner[1])
        return None

    cells = []
    for index in range(len(points)):
        cell = smallest_cell_at(index)
        if cell:
            cells.append(cell)
    return cells
def cells_to_tables(page, cells) -> list:
    """
    Given a list of bounding boxes (`cells`), return a list of tables that
    hold those cells most simply (and contiguously).

    Cells are grouped if they share at least one corner with a cell already
    in the group. Groups having fewer than two distinct left or right cell
    coordinates, or whose area contains no text, are dropped
    (PyMuPDF modification).
    """

    def corners_of(bbox) -> tuple:
        x0, top, x1, bottom = bbox
        return ((x0, top), (x0, bottom), (x1, top), (x1, bottom))

    pending = list(cells)
    group_corners = set()
    group_cells = []
    tables = []

    # Repeatedly sweep over the unassigned cells, growing the current group
    while pending:
        size_before = len(group_cells)
        for cell in list(pending):
            corners = corners_of(cell)
            # An empty group takes any cell; otherwise the cell must touch
            # the group on at least one corner.
            if not group_cells or any(c in group_corners for c in corners):
                group_corners.update(corners)
                group_cells.append(cell)
                pending.remove(cell)

        # A sweep without additions completes the current group
        if len(group_cells) == size_before:
            tables.append(list(group_cells))
            group_corners.clear()
            group_cells.clear()

    # Store a group still open after the last sweep
    if group_cells:
        tables.append(list(group_cells))

    def is_rejected(table_cells) -> bool:
        joined = EMPTY_RECT()
        lefts, rights = set(), set()
        for c in table_cells:
            joined |= c
            rights.add(c[2])
            lefts.add(c[0])
        if len(rights) < 2 or len(lefts) < 2:
            return True  # only one column
        # presumably `white_spaces` is the set of whitespace characters,
        # so this is true if the table area has no visible text
        return white_spaces.issuperset(
            page.get_textbox(
                joined,
                textpage=TEXTPAGE,
            )
        )

    # PyMuPDF modification:
    # Remove tables without text or having only 1 column
    for idx in reversed(range(len(tables))):
        if is_rejected(tables[idx]):
            del tables[idx]

    # Sort the tables top-to-bottom-left-to-right based on the value of the
    # topmost-and-then-leftmost coordinate of a table.
    return sorted(tables, key=lambda t: min((c[1], c[0]) for c in t))
class CellGroup:
    """A group of cell bounding boxes together with their joined bbox.

    `cells` may contain empty entries (e.g. None) for missing cells; these
    are ignored when computing `bbox`.
    """

    def __init__(self, cells):
        self.cells = cells
        present = [cell for cell in cells if cell]
        self.bbox = (
            min(cell[0] for cell in present),
            min(cell[1] for cell in present),
            max(cell[2] for cell in present),
            max(cell[3] for cell in present),
        )
class TableRow(CellGroup):
    """A table row: a `CellGroup` without any additional behavior."""
class TableHeader:
    """PyMuPDF extension containing the identified table header.

    Attributes:
        bbox: bounding box of the header.
        cells: list of header cell bounding boxes.
        names: list of column names.
        external: the value passed as `above`.
    """

    def __init__(self, bbox, cells, names, above):
        self.external = above
        self.names = names
        self.cells = cells
        self.bbox = bbox
class Table:
    """A table on a page, defined by the bounding boxes of its cells.

    Attributes:
        page: the page containing the table.
        cells: list of cell bounding boxes (x0, top, x1, bottom).
        header: the `TableHeader` identified on creation (PyMuPDF
            extension), or None if the table has no rows.
    """

    def __init__(self, page, cells):
        self.page = page
        self.cells = cells
        self.header = self._get_header()  # PyMuPDF extension

    @property
    def bbox(self):
        """Bounding box (x0, top, x1, bottom) enclosing all cells."""
        c = self.cells
        return (
            min(map(itemgetter(0), c)),
            min(map(itemgetter(1), c)),
            max(map(itemgetter(2), c)),
            max(map(itemgetter(3), c)),
        )

    @property
    def rows(self) -> list:
        """List of `TableRow` objects, top to bottom.

        Every row has one entry per distinct cell "x0" of the table; an
        entry is None where the row has no cell starting at that x.
        The list is computed anew on every access.
        """
        _sorted = sorted(self.cells, key=itemgetter(1, 0))
        xs = sorted(set(map(itemgetter(0), self.cells)))
        rows = []
        for _, row_cells in itertools.groupby(_sorted, itemgetter(1)):
            xdict = {cell[0]: cell for cell in row_cells}
            rows.append(TableRow([xdict.get(x) for x in xs]))
        return rows

    @property
    def row_count(self) -> int:  # PyMuPDF extension
        """Number of table rows."""
        return len(self.rows)

    @property
    def col_count(self) -> int:  # PyMuPDF extension
        """Number of table columns (largest cell count of any row)."""
        return max([len(r.cells) for r in self.rows])

    def extract(self, **kwargs) -> list:
        """Return the table text as a list of rows, each a list of strings.

        A character belongs to a cell if its midpoint lies inside the cell.
        Missing cells (None) yield None, cells without characters yield "".
        `kwargs` are passed on to `extract_text`.
        """
        chars = CHARS
        table_arr = []

        def char_in_bbox(char, bbox) -> bool:
            v_mid = (char["top"] + char["bottom"]) / 2
            h_mid = (char["x0"] + char["x1"]) / 2
            x0, top, x1, bottom = bbox
            return bool(
                (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
            )

        for row in self.rows:
            arr = []
            # pre-select the row's characters to shorten the per-cell scans
            row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]

            for cell in row.cells:
                if cell is None:
                    cell_text = None
                else:
                    cell_chars = [
                        char for char in row_chars if char_in_bbox(char, cell)
                    ]
                    if len(cell_chars):
                        kwargs["x_shift"] = cell[0]
                        kwargs["y_shift"] = cell[1]
                        if "layout" in kwargs:
                            kwargs["layout_width"] = cell[2] - cell[0]
                            kwargs["layout_height"] = cell[3] - cell[1]
                        cell_text = extract_text(cell_chars, **kwargs)
                    else:
                        cell_text = ""
                arr.append(cell_text)
            table_arr.append(arr)

        return table_arr

    def to_markdown(self, clean=False, fill_empty=True):
        """Output table content as a string in Github-markdown format.

        If "clean" then markdown syntax is removed from cell content.
        If "fill_empty" then cell content None is replaced by the values
        above (columns) or left (rows) in an effort to approximate row and
        columns spans.
        """

        def sanitize(text):
            # Escape HTML first, then neutralize hyphens. Escaping last
            # would turn the "&" of the entity into "&amp;".
            return html.escape(text).replace("-", "&#45;")

        output = "|"
        table_rows = self.rows  # computed once, the property is not cached
        rows = len(table_rows)
        cols = max([len(r.cells) for r in table_rows])

        # cell coordinates
        cell_boxes = [list(r.cells) for r in table_rows]

        # cell text strings
        cells = [[None for i in range(cols)] for j in range(rows)]
        for i, row in enumerate(cell_boxes):
            for j, cell in enumerate(row):
                if cell is not None:
                    cells[i][j] = extract_cells(TEXTPAGE, cell, markdown=True)

        if fill_empty:  # fill "None" cells where possible
            # for rows, copy content from left to right
            for j in range(rows):
                for i in range(cols - 1):
                    if cells[j][i + 1] is None:
                        cells[j][i + 1] = cells[j][i]

            # for columns, copy top to bottom
            for i in range(cols):
                for j in range(rows - 1):
                    if cells[j + 1][i] is None:
                        cells[j + 1][i] = cells[j][i]

        # generate header string and MD separator
        for i, name in enumerate(self.header.names):
            if not name:  # generate a name if empty
                name = f"Col{i+1}"
            name = name.replace("\n", "<br>")  # use HTML line breaks
            if clean:  # remove sensitive syntax
                name = sanitize(name)
            output += name + "|"

        output += "\n"
        # insert GitHub header line separator
        output += "|" + "|".join("---" for i in range(cols)) + "|\n"

        # skip first row in details if header is part of the table
        j = 0 if self.header.external else 1

        # iterate over detail rows
        for row in cells[j:]:
            line = "|"
            for cell in row:
                # replace None cells with empty string
                if cell is None:
                    cell = ""
                if clean:  # remove sensitive syntax
                    cell = sanitize(cell)
                line += cell + "|"
            line += "\n"
            output += line
        return output + "\n"

    def to_pandas(self, **kwargs):
        """Return a pandas DataFrame version of the table.

        Empty column names are replaced by "Col<index>"; if names are not
        unique, they are prefixed with their column index. The header
        object itself is left unchanged. `kwargs` is currently unused.

        Raises ModuleNotFoundError if pandas is not installed.
        """
        try:
            import pandas as pd
        except ModuleNotFoundError:
            message("Package 'pandas' is not installed")
            raise

        pd_dict = {}
        extract = self.extract()
        hdr = self.header
        # work on a copy: the generated names must not leak into the header
        names = list(hdr.names)
        hdr_len = len(names)

        # ensure uniqueness of column names
        for i, name in enumerate(names):
            if not name:
                names[i] = f"Col{i}"
        if hdr_len != len(set(names)):
            for i, name in enumerate(names):
                if name != f"Col{i}":
                    names[i] = f"{i}-{name}"

        if not hdr.external:  # header is part of 'extract'
            extract = extract[1:]

        for i, key in enumerate(names):
            pd_dict[key] = [row[i] for row in extract]

        return pd.DataFrame(pd_dict)

    def _get_header(self, y_tolerance=3):
        """Identify the table header.

        *** PyMuPDF extension. ***

        Starting from the first line above the table upwards, check if it
        qualifies to be part of the table header.

        Criteria include:
            * A one-line table never has an extra header.
            * Column borders must not intersect any word. If this happens, all
              text of this line and above of it is ignored.
            * No excess inter-line distance: If a line further up has a distance
              of more than 1.5 times of its font size, it will be ignored and
              all lines above of it.
            * Must have same text properties.
            * Starting with the top table line, a bold text property cannot change
              back to non-bold.

        If not all criteria are met (or there is no text above the table),
        the first table row is assumed to be the header.

        Returns a `TableHeader`, or None if the table has no rows.
        """
        page = self.page
        y_delta = y_tolerance
        rows = self.rows  # computed once, the property is not cached

        def top_row_bg_color():
            """
            Compare top row background color with color of same-sized bbox
            above. If different, return True indicating that the original
            table top row is already the header.
            """
            bbox0 = Rect(rows[0].bbox)
            bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height)  # area above
            top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1]
            top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1]
            return top_color0 != top_colort

        def row_has_bold(bbox):
            """Check if a row contains some bold text.

            If e.g. true for the top row, then it will be used as (internal)
            column header row if any of the following is true:
                * the previous (above) text line has no bold span
                * the second table row text has no bold span

            Returns True if any spans are bold else False.
            """
            blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]
            spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]
            return any(s["flags"] & TEXT_FONT_BOLD for s in spans)

        try:
            row = rows[0]
            cells = row.cells
            bbox = Rect(row.bbox)
        except IndexError:  # this table has no rows
            return None

        # return this if we determine that the top row is the header
        header_top_row = TableHeader(bbox, cells, self.extract()[0], False)

        # 1-line tables have no extra header
        if len(rows) < 2:
            return header_top_row

        # 1-column tables have no extra header
        if len(cells) < 2:
            return header_top_row

        # assume top row is the header if second row is empty
        row2 = rows[1]  # second row
        if all(c is None for c in row2.cells):  # no valid cell bboxes in row2
            return header_top_row

        # Special check: is top row bold?
        top_row_bold = row_has_bold(bbox)

        # assume top row is header if it is bold and the 2nd row
        # has no bold text
        if top_row_bold and not row_has_bold(row2.bbox):
            return header_top_row

        if top_row_bg_color():
            # if area above top row has a different background color,
            # then top row is already the header
            return header_top_row

        # column coordinates (x1 values) in top row
        col_x = [c[2] if c is not None else None for c in cells[:-1]]

        # clip = page area above the table
        # We will inspect this area for text qualifying as column header.
        clip = +bbox  # take row 0 bbox
        clip.y0 = 0  # start at top of page
        clip.y1 = bbox.y0  # end at top of table

        blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
        # non-empty, non-superscript spans above table, sorted descending by y1
        spans = sorted(
            [
                s
                for b in blocks
                for l in b["lines"]
                for s in l["spans"]
                if not (
                    white_spaces.issuperset(s["text"])
                    or s["flags"] & TEXT_FONT_SUPERSCRIPT
                )
            ],
            key=lambda s: s["bbox"][3],
            reverse=True,
        )

        select = []  # y1 coordinates above, sorted descending
        line_heights = []  # line heights above, sorted descending
        line_bolds = []  # bold indicator per line above, same sorting

        # walk through the spans and fill above 3 lists
        for i, s in enumerate(spans):
            y1 = s["bbox"][3]  # span bottom
            h = y1 - s["bbox"][1]  # span bbox height
            bold = s["flags"] & TEXT_FONT_BOLD

            # use first item to start the lists
            if i == 0:
                select.append(y1)
                line_heights.append(h)
                line_bolds.append(bold)
                continue

            # get previous items from the 3 lists
            y0 = select[-1]
            h0 = line_heights[-1]
            bold0 = line_bolds[-1]

            if bold0 and not bold:
                break  # stop if switching from bold to non-bold

            # if fitting in height of previous span, modify bbox
            if y0 - y1 <= y_delta or abs((y0 - h0) - s["bbox"][1]) <= y_delta:
                s["bbox"] = (s["bbox"][0], y0 - h0, s["bbox"][2], y0)
                if bold:
                    line_bolds[-1] = bold
                continue
            elif y0 - y1 > 1.5 * h0:
                break  # stop if distance to previous line too large
            select.append(y1)
            line_heights.append(h)
            line_bolds.append(bold)

        if not select:  # nothing above the table?
            return header_top_row

        select = select[:5]  # accept up to 5 lines for an external header

        # assume top row as header if text above is too far away
        if bbox.y0 - select[0] >= line_heights[0]:
            return header_top_row

        # accept top row as header if bold, but line above is not
        if top_row_bold and not line_bolds[0]:
            return header_top_row

        if not spans:  # nothing left above the table, return top row
            return header_top_row

        # re-compute clip above table
        nclip = EMPTY_RECT()
        for s in spans:
            if s["bbox"][3] >= select[-1]:
                nclip |= s["bbox"]
        if not nclip.is_empty:
            clip = nclip

        clip.y1 = bbox.y0  # make sure we still include every word above

        # Confirm that no word in clip is intersecting a column separator
        word_rects = [Rect(w[:4]) for w in page.get_text("words", clip=clip)]
        word_tops = sorted({r[1] for r in word_rects}, reverse=True)

        select = []

        # exclude lines with words that intersect a column border
        for top in word_tops:
            crossing = any(
                r[1] == top and r[0] < x and r[2] > x
                for x in col_x
                if x is not None
                for r in word_rects
            )
            if crossing:  # detected a word crossing a column border
                break
            select.append(top)

        if not select:  # nothing left over: return first row
            return header_top_row

        hdr_bbox = +clip  # compute the header cells
        hdr_bbox.y0 = select[-1]  # hdr_bbox top is smallest top coord of words
        hdr_cells = [
            (c[0], hdr_bbox.y0, c[2], hdr_bbox.y1) if c is not None else None
            for c in cells
        ]

        # adjust left/right of header bbox
        table_bbox = self.bbox
        hdr_bbox.x0 = table_bbox[0]
        hdr_bbox.x1 = table_bbox[2]

        # column names: no line breaks, no excess spaces
        hdr_names = [
            (
                page.get_textbox(c).replace("\n", " ").replace("  ", " ").strip()
                if c is not None
                else ""
            )
            for c in hdr_cells
        ]
        return TableHeader(tuple(hdr_bbox), hdr_cells, hdr_names, True)
@dataclass
class TableSettings:
    """Settings controlling table detection.

    The values are validated and completed in `__post_init__`. Use
    `TableSettings.resolve` to create an instance from None, a dict or an
    existing instance.
    """

    # edge detection strategies; each must be one of TABLE_STRATEGIES
    vertical_strategy: str = "lines"
    horizontal_strategy: str = "lines"
    # additional, user-provided lines
    explicit_vertical_lines: list = None
    explicit_horizontal_lines: list = None
    # tolerances; the x / y variants fall back to the generic value if UNSET
    snap_tolerance: float = DEFAULT_SNAP_TOLERANCE
    snap_x_tolerance: float = UNSET
    snap_y_tolerance: float = UNSET
    join_tolerance: float = DEFAULT_JOIN_TOLERANCE
    join_x_tolerance: float = UNSET
    join_y_tolerance: float = UNSET
    edge_min_length: float = 3
    min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL
    min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL
    intersection_tolerance: float = 3
    intersection_x_tolerance: float = UNSET
    intersection_y_tolerance: float = UNSET
    # settings used for word extraction (given without the "text_" prefix)
    text_settings: dict = None

    def __post_init__(self) -> "TableSettings":
        """Clean up user-provided table settings.

        Validates that the table settings provided consists of acceptable values and
        returns a cleaned up version. The cleaned up version fills out the missing
        values with the default values in the provided settings.

        TODO: Can be further used to validate that the values are of the correct
            type. For example, raising a value error when a non-boolean input is
            provided for the key ``keep_blank_chars``.

        :returns: This instance, after validation and completion.
        :raises ValueError: When a setting listed in NON_NEGATIVE_SETTINGS is
            negative, or a strategy is not one of TABLE_STRATEGIES.
        """

        for setting in NON_NEGATIVE_SETTINGS:
            if (getattr(self, setting) or 0) < 0:
                raise ValueError(f"Table setting '{setting}' cannot be negative")

        for orientation in ["horizontal", "vertical"]:
            strategy = getattr(self, orientation + "_strategy")
            if strategy not in TABLE_STRATEGIES:
                raise ValueError(
                    f"{orientation}_strategy must be one of "
                    f'{{{",".join(TABLE_STRATEGIES)}}}'
                )

        # Work on a private copy: the dict is modified below and may be
        # owned (and re-used) by the caller.
        self.text_settings = dict(self.text_settings or {})

        # This next section is for backwards compatibility
        for attr in ["x_tolerance", "y_tolerance"]:
            if attr not in self.text_settings:
                self.text_settings[attr] = self.text_settings.get("tolerance", 3)

        if "tolerance" in self.text_settings:
            del self.text_settings["tolerance"]
        # End of that section

        for attr, fallback in [
            ("snap_x_tolerance", "snap_tolerance"),
            ("snap_y_tolerance", "snap_tolerance"),
            ("join_x_tolerance", "join_tolerance"),
            ("join_y_tolerance", "join_tolerance"),
            ("intersection_x_tolerance", "intersection_tolerance"),
            ("intersection_y_tolerance", "intersection_tolerance"),
        ]:
            if getattr(self, attr) is UNSET:
                setattr(self, attr, getattr(self, fallback))

        return self

    @classmethod
    def resolve(cls, settings=None):
        """Create a `TableSettings` object from `settings`.

        None yields the default settings, an instance of this class is
        returned as is. A dict is split into text settings (keys starting
        with "text_", stored without that prefix) and core settings.

        :raises ValueError: For any other type of `settings`.
        """
        if settings is None:
            return cls()
        elif isinstance(settings, cls):
            return settings
        elif isinstance(settings, dict):
            core_settings = {}
            text_settings = {}
            for k, v in settings.items():
                if k[:5] == "text_":
                    text_settings[k[5:]] = v
                else:
                    core_settings[k] = v
            core_settings["text_settings"] = text_settings
            return cls(**core_settings)
        else:
            raise ValueError(f"Cannot resolve settings: {settings}")
class TableFinder:
    """
    Given a PDF page, find plausible table structures.

    Largely borrowed from Anssi Nurminen's master's thesis:
    http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    ... and inspired by Tabula:
    https://github.com/tabulapdf/tabula-extractor/issues/16

    Attributes:
        page: weak reference proxy of the page.
        settings: the resolved `TableSettings`.
        edges: edges used for table detection.
        intersections: intersection points of these edges.
        cells: cell bounding boxes derived from the intersections.
        tables: list of the `Table` objects found.
    """

    def __init__(self, page, settings=None):
        self.page = weakref.proxy(page)
        self.settings = TableSettings.resolve(settings)
        self.edges = self.get_edges()
        self.intersections = edges_to_intersections(
            self.edges,
            self.settings.intersection_x_tolerance,
            self.settings.intersection_y_tolerance,
        )
        self.cells = intersections_to_cells(self.intersections)
        self.tables = [
            Table(self.page, cell_group)
            for cell_group in cells_to_tables(self.page, self.cells)
        ]

    def get_edges(self) -> list:
        """Return the merged edges according to the strategies in use.

        Raises ValueError if a strategy is "explicit", but fewer than two
        explicit lines were provided for that orientation.
        """
        settings = self.settings

        for orientation in ["vertical", "horizontal"]:
            strategy = getattr(settings, orientation + "_strategy")
            if strategy == "explicit":
                lines = getattr(settings, "explicit_" + orientation + "_lines")
                # the setting defaults to None, which has no len()
                if lines is None or len(lines) < 2:
                    raise ValueError(
                        f"If {orientation}_strategy == 'explicit', "
                        f"explicit_{orientation}_lines "
                        f"must be specified as a list/tuple of two or more "
                        f"floats/ints."
                    )

        v_strat = settings.vertical_strategy
        h_strat = settings.horizontal_strategy

        # words are only needed by the "text" strategies
        if v_strat == "text" or h_strat == "text":
            words = extract_words(CHARS, **(settings.text_settings or {}))
        else:
            words = []

        page_rect = self.page.rect

        # explicit vertical lines: either objects or plain x-coordinates
        v_explicit = []
        for desc in settings.explicit_vertical_lines or []:
            if isinstance(desc, dict):
                for e in obj_to_edges(desc):
                    if e["orientation"] == "v":
                        v_explicit.append(e)
            else:
                v_explicit.append(
                    {
                        "x0": desc,
                        "x1": desc,
                        "top": page_rect[1],
                        "bottom": page_rect[3],
                        "height": page_rect[3] - page_rect[1],
                        "orientation": "v",
                    }
                )

        if v_strat == "lines":
            v_base = filter_edges(EDGES, "v")
        elif v_strat == "lines_strict":
            v_base = filter_edges(EDGES, "v", edge_type="line")
        elif v_strat == "text":
            v_base = words_to_edges_v(words, word_threshold=settings.min_words_vertical)
        else:  # "explicit": only the user-provided lines are used
            v_base = []

        v = v_base + v_explicit

        # explicit horizontal lines: either objects or plain y-coordinates
        h_explicit = []
        for desc in settings.explicit_horizontal_lines or []:
            if isinstance(desc, dict):
                for e in obj_to_edges(desc):
                    if e["orientation"] == "h":
                        h_explicit.append(e)
            else:
                h_explicit.append(
                    {
                        "x0": page_rect[0],
                        "x1": page_rect[2],
                        "width": page_rect[2] - page_rect[0],
                        "top": desc,
                        "bottom": desc,
                        "orientation": "h",
                    }
                )

        if h_strat == "lines":
            h_base = filter_edges(EDGES, "h")
        elif h_strat == "lines_strict":
            h_base = filter_edges(EDGES, "h", edge_type="line")
        elif h_strat == "text":
            h_base = words_to_edges_h(
                words, word_threshold=settings.min_words_horizontal
            )
        else:  # "explicit": only the user-provided lines are used
            h_base = []

        h = h_base + h_explicit

        edges = list(v) + list(h)

        edges = merge_edges(
            edges,
            snap_x_tolerance=settings.snap_x_tolerance,
            snap_y_tolerance=settings.snap_y_tolerance,
            join_x_tolerance=settings.join_x_tolerance,
            join_y_tolerance=settings.join_y_tolerance,
        )

        return filter_edges(edges, min_length=settings.edge_min_length)

    def __getitem__(self, i):
        """Return table number `i`.

        Negative indices count from the end (wrapping around if needed).
        Raises IndexError if there are no tables or `i` is too large.
        """
        tcount = len(self.tables)
        # Without tables every index is invalid. This also prevents an
        # endless loop for negative indices when tcount is 0.
        if tcount == 0 or i >= tcount:
            raise IndexError("table not on page")
        if i < 0:
            i %= tcount  # same result as repeatedly adding tcount
        return self.tables[i]
- """
- Start of PyMuPDF interface code.
- The following functions are executed when "page.find_tables()" is called.
- * make_chars: Fills the CHARS list with text character information extracted
- via "rawdict" text extraction. Items in CHARS are formatted
- as expected by the table code.
- * make_edges: Fills the EDGES list with vector graphic information extracted
- via "get_drawings". Items in EDGES are formatted as expected
- by the table code.
- The lists CHARS and EDGES replace the document access that pdfplumber
- (or, respectively, pdfminer) would otherwise perform.
- The table code has been modified to use these lists instead of accessing
- page information itself.
- """
- # -----------------------------------------------------------------------------
- # Extract all page characters to fill the CHARS list
- # -----------------------------------------------------------------------------
def make_chars(page, clip=None):
    """Fill the global CHARS list from the page's "rawdict" text extraction.

    The text page created for the extraction is stored in the global
    TEXTPAGE.  One dictionary per character is appended to CHARS.

    Args:
        page: the page to extract characters from.
        clip: passed on to ``page.get_textpage`` as its ``clip`` argument.
    """
    global TEXTPAGE
    number_1_based = page.number + 1
    height_of_page = page.rect.height
    to_pdf = page.transformation_matrix
    TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS)
    text_blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
    doctop_offset = height_of_page * page.number

    def first_bbox_value(item):
        """Sort key: first coordinate of the item's bbox."""
        return item["bbox"][0]

    for text_block in text_blocks:
        for text_line in text_block["lines"]:
            # line direction = (cosine, sine) of the writing angle
            direction = text_line["dir"]
            cosine = round(direction[0], 4)
            sine = round(direction[1], 4)
            char_matrix = Matrix(cosine, -sine, sine, cosine, 0, 0)
            upright = sine == 0

            for span in sorted(text_line["spans"], key=first_bbox_value):
                font = span["font"]
                size = span["size"]
                rgb = sRGB_to_pdf(span["color"])

                for item in sorted(span["chars"], key=first_bbox_value):
                    box = Rect(item["bbox"])
                    box_pdf = box * to_pdf
                    anchor = Point(item["origin"]) * to_pdf
                    # the same matrix object is re-used for every character
                    # of the line; only its translation part changes
                    char_matrix.e = anchor.x
                    char_matrix.f = anchor.y
                    box_width = box.x1 - box.x0
                    box_height = box.y1 - box.y0
                    CHARS.append(
                        {
                            "adv": box_width if upright else box_height,
                            "bottom": box.y1,
                            "doctop": box.y0 + doctop_offset,
                            "fontname": font,
                            "height": box_height,
                            "matrix": tuple(char_matrix),
                            "ncs": "DeviceRGB",
                            "non_stroking_color": rgb,
                            "non_stroking_pattern": None,
                            "object_type": "char",
                            "page_number": number_1_based,
                            "size": size if upright else box_height,
                            "stroking_color": rgb,
                            "stroking_pattern": None,
                            "text": item["c"],
                            "top": box.y0,
                            "upright": upright,
                            "width": box_width,
                            "x0": box.x0,
                            "x1": box.x1,
                            "y0": box_pdf.y0,
                            "y1": box_pdf.y1,
                        }
                    )
- # ------------------------------------------------------------------------
- # Extract all page vector graphics to fill the EDGES list.
- # We are ignoring Bézier curves completely and are converting everything
- # else to lines.
- # ------------------------------------------------------------------------
def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
    """Fill the global EDGES list with the page's axis-parallel lines.

    Lines, rectangles and quads of the vector graphics are converted to
    lines; all other drawing items (e.g. curves) are ignored.  In addition,
    border lines are added for every joined group of neighboring graphics
    that contains text, and for user-specified lines and rectangles.

    Args:
        page: the page to examine.
        clip: rect-like restricting the area; lines are cut at its borders.
            Defaults to the page rectangle.
        tset: table settings object.  Required in spite of its default:
            its tolerances, strategies and ``edge_min_length`` are read.
        paths: vector graphics to use instead of ``page.get_drawings()``.
            The passed-in dictionaries' item lists are not extended.
        add_lines: list / tuple of point pairs to add as lines.
        add_boxes: list / tuple of rect-likes whose borders are added.
    """
    snap_x = tset.snap_x_tolerance
    snap_y = tset.snap_y_tolerance
    min_length = tset.edge_min_length
    lines_strict = (
        tset.vertical_strategy == "lines_strict"
        or tset.horizontal_strategy == "lines_strict"
    )
    page_height = page.rect.height
    doctop_basis = page.number * page_height
    page_number = page.number + 1
    prect = page.rect
    if page.rotation in (90, 270):
        w, h = prect.br
        prect = Rect(0, 0, h, w)
    if clip is not None:
        clip = Rect(clip)
    else:
        clip = prect

    def are_neighbors(r1, r2):
        """Detect whether r1, r2 are neighbors.

        Defined as:
        The minimum distance between points of r1 and points of r2 is not
        larger than some delta.

        This check supports empty rect-likes and thus also lines.

        Note:
        This type of check is MUCH faster than native Rect containment checks.
        """
        if (  # check if x-coordinates of r1 are within those of r2
            r2.x0 - snap_x <= r1.x0 <= r2.x1 + snap_x
            or r2.x0 - snap_x <= r1.x1 <= r2.x1 + snap_x
        ) and (  # ... same for y-coordinates
            r2.y0 - snap_y <= r1.y0 <= r2.y1 + snap_y
            or r2.y0 - snap_y <= r1.y1 <= r2.y1 + snap_y
        ):
            return True

        # same check with r1 / r2 exchanging their roles (this is necessary!)
        if (
            r1.x0 - snap_x <= r2.x0 <= r1.x1 + snap_x
            or r1.x0 - snap_x <= r2.x1 <= r1.x1 + snap_x
        ) and (
            r1.y0 - snap_y <= r2.y0 <= r1.y1 + snap_y
            or r1.y0 - snap_y <= r2.y1 <= r1.y1 + snap_y
        ):
            return True
        return False

    def clean_graphics(npaths=None):
        """Detect and join rectangles of "connected" vector graphics.

        Returns the list of joined rectangles that contain text, and the
        list of paths relevant for table detection.
        """
        # accept passed-in vector graphics, else extract them from the page
        allpaths = page.get_drawings() if npaths is None else npaths

        relevant = []  # paths relevant for table detection
        for p in allpaths:
            # If only looking at lines, we ignore fill-only paths,
            # except simulated lines (i.e. small width or height).
            if (
                lines_strict
                and p["type"] == "f"
                and p["rect"].width > snap_x
                and p["rect"].height > snap_y
            ):
                continue
            relevant.append(p)

        # start with all vector graphics rectangles
        prects = sorted({p["rect"] for p in relevant}, key=lambda r: (r.y1, r.x0))
        new_rects = []  # the final list of joined rectangles
        # ----------------------------------------------------------------
        # Strategy: Join rectangles that "almost touch" each other.
        # Extend first rectangle with any other that is a "neighbor".
        # Then move it to the final list and continue with the rest.
        # ----------------------------------------------------------------
        while prects:  # the algorithm will empty this list
            prect0 = prects[0]  # first rectangle
            repeat = True
            while repeat:  # this loop extends first rect in list
                repeat = False  # set to true again if some other rect touches
                for i in range(len(prects) - 1, 0, -1):  # run backwards
                    if are_neighbors(prect0, prects[i]):  # close enough to rect 0?
                        prect0 |= prects[i].tl  # extend rect 0
                        prect0 |= prects[i].br  # extend rect 0
                        del prects[i]  # delete this rect
                        repeat = True  # keep checking the rest

            # move rect 0 over to result list if there is some text in it
            if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
                # contains text, so accept it as a table bbox candidate
                new_rects.append(prect0)
            del prects[0]  # remove from rect list

        return new_rects, relevant

    bboxes, paths = clean_graphics(npaths=paths)

    def is_parallel(p1, p2):
        """Check if line is roughly axis-parallel."""
        return abs(p1.x - p2.x) <= snap_x or abs(p1.y - p2.y) <= snap_y

    def make_line(p, p1, p2, clip):
        """Given 2 points, make a line dictionary for table detection.

        Returns an empty dictionary if the line is not axis-parallel, lies
        outside ``clip`` or degenerates to a point after clipping.
        """
        if not is_parallel(p1, p2):  # only accepting axis-parallel lines
            return {}
        # compute the extremal values
        x0 = min(p1.x, p2.x)
        x1 = max(p1.x, p2.x)
        y0 = min(p1.y, p2.y)
        y1 = max(p1.y, p2.y)

        # check for outside clip
        if x0 > clip.x1 or x1 < clip.x0 or y0 > clip.y1 or y1 < clip.y0:
            return {}

        # adjust to clip boundary
        x0 = max(x0, clip.x0)
        x1 = min(x1, clip.x1)
        y0 = max(y0, clip.y0)
        y1 = min(y1, clip.y1)

        width = x1 - x0  # from adjusted values
        height = y1 - y0  # from adjusted values
        if width == height == 0:
            return {}  # nothing left to deal with
        return {
            "x0": x0,
            "y0": page_height - y0,
            "x1": x1,
            "y1": page_height - y1,
            "width": width,
            "height": height,
            "pts": [(x0, y0), (x1, y1)],
            "linewidth": p["width"],
            "stroke": True,
            "fill": False,
            "evenodd": False,
            "stroking_color": p["color"] if p["color"] else p["fill"],
            "non_stroking_color": None,
            "object_type": "line",
            "page_number": page_number,
            "stroking_pattern": None,
            "non_stroking_pattern": None,
            "top": y0,
            "bottom": y1,
            "doctop": y0 + doctop_basis,
        }

    def add_edge(p, p1, p2):
        """Append the edge for line p1 -> p2 to EDGES if the line is usable."""
        line_dict = make_line(p, p1, p2, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

    def add_outline(p, corners):
        """Append the edges of the closed polygon through ``corners``."""
        for start, end in zip(corners, corners[1:] + corners[:1]):
            add_edge(p, start, end)

    for p in paths:
        # Work on a copy of the item list: the path dictionaries may have
        # been passed in by the caller and must not grow with every call.
        items = list(p["items"])

        # if 'closePath', add a line from last to first point
        if p["closePath"] and items and items[0][0] == "l" and items[-1][0] == "l":
            items.append(("l", items[-1][2], items[0][1]))

        for item in items:
            kind = item[0]
            if kind == "l":  # a line
                p1, p2 = item[1:]
                add_edge(p, p1, p2)

            elif kind == "re":
                # A rectangle: decompose into 4 lines, but filter out
                # the ones that simulate a line
                rect = item[1].normalize()  # normalize the rectangle

                if rect.width <= min_length and rect.width < rect.height:
                    # simulates a vertical line: take middle value for x
                    # (plain mean - also correct for negative coordinates)
                    x = (rect.x0 + rect.x1) / 2
                    add_edge(p, Point(x, rect.y0), Point(x, rect.y1))
                elif rect.height <= min_length and rect.height < rect.width:
                    # simulates a horizontal line: take middle value for y
                    y = (rect.y0 + rect.y1) / 2
                    add_edge(p, Point(rect.x0, y), Point(rect.x1, y))
                else:
                    add_outline(p, (rect.tl, rect.bl, rect.br, rect.tr))

            elif kind == "qu":
                # a quad: we convert it into (up to) 4 lines
                ul, ur, ll, lr = item[1]
                add_outline(p, (ul, ll, lr, ur))

            # anything else (e.g. curves) is ignored

    path = {"color": (0, 0, 0), "fill": None, "width": 1}
    for bbox in bboxes:  # add the border lines for all enveloping bboxes
        add_edge(path, bbox.tl, bbox.tr)
        add_edge(path, bbox.bl, bbox.br)
        add_edge(path, bbox.tl, bbox.bl)
        add_edge(path, bbox.tr, bbox.br)

    if add_lines is not None:  # add user-specified lines
        assert isinstance(add_lines, (tuple, list))
    else:
        add_lines = []
    for p1, p2 in add_lines:
        add_edge(path, Point(p1), Point(p2))

    if add_boxes is not None:  # add user-specified rectangles
        assert isinstance(add_boxes, (tuple, list))
    else:
        add_boxes = []
    for box in add_boxes:
        r = Rect(box)
        add_outline(path, (r.tl, r.bl, r.br, r.tr))
def page_rotation_set0(page):
    """Set the page rotation to zero.

    Table detection needs an unrotated page.  A matrix command is inserted
    into the page's contents, the mediabox coordinates are swapped for
    rotations 90 and 270, and the rotation is set to 0.

    Returns:
        A tuple ``(page, xref, rot, mediabox)``: the page re-loaded from its
        document, the value returned by inserting the matrix command, and
        the rotation and mediabox as read before the change - i.e. what is
        needed for reverting these changes.
    """
    original_mediabox = page.mediabox
    rotation = page.rotation  # contains normalized rotation value
    # Read separately from original_mediabox, because this one is modified
    # below.  NOTE(review): presumably every read delivers a new object -
    # confirm before merging the two reads.
    box = page.mediabox

    # the page content must be shifted before it is derotated
    if rotation == 90:  # horizontal shift
        shift_x, shift_y = box.y1 - box.x1 - box.x0 - box.y0, 0
    elif rotation == 270:  # vertical shift
        shift_x, shift_y = 0, box.x1 - box.y1 - box.y0 - box.x0
    else:
        shift_x, shift_y = -2 * box.x0, -2 * box.y0

    # shift first, then derotate
    derotate = Matrix(1, 0, 0, 1, shift_x, shift_y) * page.derotation_matrix
    xref = TOOLS._insert_contents(page, b"%g %g %g %g %g %g cm " % tuple(derotate), 0)

    if rotation in (90, 270):
        # swap x- and y-coordinates
        left, top, right, bottom = box
        box.x0 = top
        box.y0 = left
        box.x1 = bottom
        box.y1 = right
        page.set_mediabox(box)

    page.set_rotation(0)

    # re-load the page to apply these changes
    document = page.parent
    return document[page.number], xref, rotation, original_mediabox
def page_rotation_reset(page, xref, rot, mediabox):
    """Restore the page's original rotation and mediabox.

    To be used before the tables are returned.

    Args:
        page: the page whose rotation was set to zero.
        xref: xref of the stream holding the de-rotation matrix command.
        rot: rotation value to restore.
        mediabox: mediabox to restore.

    Returns:
        The page, re-loaded from its document.
    """
    document = page.parent
    # overwrite the de-rotation matrix command with a single space
    document.update_stream(xref, b" ")
    # put back the old values
    page.set_mediabox(mediabox)
    page.set_rotation(rot)
    # re-load the page so it reflects the restored values
    return document[page.number]
def find_tables(
    page,
    clip=None,
    vertical_strategy: str = "lines",
    horizontal_strategy: str = "lines",
    vertical_lines: list = None,
    horizontal_lines: list = None,
    snap_tolerance: float = DEFAULT_SNAP_TOLERANCE,
    snap_x_tolerance: float = None,
    snap_y_tolerance: float = None,
    join_tolerance: float = DEFAULT_JOIN_TOLERANCE,
    join_x_tolerance: float = None,
    join_y_tolerance: float = None,
    edge_min_length: float = 3,
    min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL,
    min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL,
    intersection_tolerance: float = 3,
    intersection_x_tolerance: float = None,
    intersection_y_tolerance: float = None,
    text_tolerance=3,
    text_x_tolerance=3,
    text_y_tolerance=3,
    strategy=None,  # offer abbreviation
    add_lines=None,  # user-specified lines
    add_boxes=None,  # user-specified rectangles
    paths=None,  # accept vector graphics as parameter
):
    """Find the tables on a page.

    Resets the global CHARS and EDGES lists, fills them via ``make_chars``
    and ``make_edges`` and runs a ``TableFinder`` on the result.  A rotated
    page is set to rotation 0 while this happens.  The previous small glyph
    heights setting and the page rotation are restored before returning,
    also when an exception occurs.

    Args:
        page: the page to examine.
        clip: passed on to ``make_chars`` and ``make_edges``.
        vertical_strategy, horizontal_strategy: detection strategy per
            orientation.
        vertical_lines, horizontal_lines: stored as the settings
            ``explicit_vertical_lines`` / ``explicit_horizontal_lines``.
        snap_*, join_*, intersection_*: the ``_x_`` / ``_y_`` variants are
            replaced by ``UNSET`` when given as None.
        edge_min_length, min_words_vertical, min_words_horizontal,
        text_tolerance, text_x_tolerance, text_y_tolerance: passed to the
            table settings under the same names.
        strategy: if not None, used for both the vertical and the
            horizontal strategy.
        add_lines, add_boxes, paths: passed on to ``make_edges``.

    Returns:
        The ``TableFinder`` object created for the page.
    """
    global CHARS, EDGES
    CHARS = []
    EDGES = []
    old_small = bool(TOOLS.set_small_glyph_heights())  # save old value
    TOOLS.set_small_glyph_heights(True)  # we need minimum bboxes
    old_xref, old_rot, old_mediabox = None, None, None
    try:
        if page.rotation != 0:
            page, old_xref, old_rot, old_mediabox = page_rotation_set0(page)

        if snap_x_tolerance is None:
            snap_x_tolerance = UNSET
        if snap_y_tolerance is None:
            snap_y_tolerance = UNSET
        if join_x_tolerance is None:
            join_x_tolerance = UNSET
        if join_y_tolerance is None:
            join_y_tolerance = UNSET
        if intersection_x_tolerance is None:
            intersection_x_tolerance = UNSET
        if intersection_y_tolerance is None:
            intersection_y_tolerance = UNSET
        if strategy is not None:
            vertical_strategy = strategy
            horizontal_strategy = strategy

        settings = {
            "vertical_strategy": vertical_strategy,
            "horizontal_strategy": horizontal_strategy,
            "explicit_vertical_lines": vertical_lines,
            "explicit_horizontal_lines": horizontal_lines,
            "snap_tolerance": snap_tolerance,
            "snap_x_tolerance": snap_x_tolerance,
            "snap_y_tolerance": snap_y_tolerance,
            "join_tolerance": join_tolerance,
            "join_x_tolerance": join_x_tolerance,
            "join_y_tolerance": join_y_tolerance,
            "edge_min_length": edge_min_length,
            "min_words_vertical": min_words_vertical,
            "min_words_horizontal": min_words_horizontal,
            "intersection_tolerance": intersection_tolerance,
            "intersection_x_tolerance": intersection_x_tolerance,
            "intersection_y_tolerance": intersection_y_tolerance,
            "text_tolerance": text_tolerance,
            "text_x_tolerance": text_x_tolerance,
            "text_y_tolerance": text_y_tolerance,
        }
        tset = TableSettings.resolve(settings=settings)
        page.table_settings = tset

        make_chars(page, clip=clip)  # create character list of page
        make_edges(
            page,
            clip=clip,
            tset=tset,
            paths=paths,
            add_lines=add_lines,
            add_boxes=add_boxes,
        )  # create lines and curves
        tables = TableFinder(page, settings=tset)
    finally:
        # Undo the temporary changes even if something above failed, so an
        # error neither leaves the global setting changed nor the page
        # de-rotated.
        TOOLS.set_small_glyph_heights(old_small)
        if old_xref is not None:
            page_rotation_reset(page, old_xref, old_rot, old_mediabox)
    return tables
|