table.py 87 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563
  1. """
  2. Copyright (C) 2023 Artifex Software, Inc.
  3. This file is part of PyMuPDF.
  4. PyMuPDF is free software: you can redistribute it and/or modify it under the
  5. terms of the GNU Affero General Public License as published by the Free
  6. Software Foundation, either version 3 of the License, or (at your option)
  7. any later version.
  8. PyMuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  9. WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  10. FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  11. details.
  12. You should have received a copy of the GNU Affero General Public License
  13. along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  14. Alternative licensing terms are available from the licensor.
  15. For commercial licensing, see <https://www.artifex.com/> or contact
  16. Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  17. CA 94129, USA, for further information.
  18. ---------------------------------------------------------------------
  19. Portions of this code have been ported from pdfplumber, see
  20. https://pypi.org/project/pdfplumber/.
  21. The ported code is under the following MIT license:
  22. ---------------------------------------------------------------------
  23. The MIT License (MIT)
  24. Copyright (c) 2015, Jeremy Singer-Vine
  25. Permission is hereby granted, free of charge, to any person obtaining a copy
  26. of this software and associated documentation files (the "Software"), to deal
  27. in the Software without restriction, including without limitation the rights
  28. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  29. copies of the Software, and to permit persons to whom the Software is
  30. furnished to do so, subject to the following conditions:
  31. The above copyright notice and this permission notice shall be included in all
  32. copies or substantial portions of the Software.
  33. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  34. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  35. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  36. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  37. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  38. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  39. SOFTWARE.
  40. ---------------------------------------------------------------------
  41. Also see here: https://github.com/jsvine/pdfplumber/blob/stable/LICENSE.txt
  42. ---------------------------------------------------------------------
  43. The porting mainly pertains to files "table.py" and relevant parts of
  44. "utils/text.py" within pdfplumber's repository on Github.
  45. With respect to "text.py", we have removed functions or features that are not
  46. used by table processing. Examples are:
  47. * the text search function
  48. * simple text extraction
  49. * text extraction by lines
  50. The original pdfplumber code neither detects nor identifies table headers.
  51. This PyMuPDF port adds respective code to the 'Table' class as method '_get_header'.
  52. This is implemented as new class TableHeader with the properties:
  53. * bbox: A tuple for the header's bbox
  54. * cells: A tuple for each bbox of a column header
  55. * names: A list of strings with column header text
  56. * external: A bool indicating whether the header is outside the table cells.
  57. """
  58. import inspect
  59. import itertools
  60. import string
  61. import html
  62. from collections.abc import Sequence
  63. from dataclasses import dataclass
  64. from operator import itemgetter
  65. import weakref
  66. # -------------------------------------------------------------------
  67. # Start of PyMuPDF interface code
  68. # -------------------------------------------------------------------
  69. from . import (
  70. Rect,
  71. Matrix,
  72. TEXTFLAGS_TEXT,
  73. TEXT_FONT_BOLD,
  74. TEXT_FONT_ITALIC,
  75. TEXT_FONT_MONOSPACED,
  76. TEXT_FONT_SUPERSCRIPT,
  77. TEXT_COLLECT_STYLES,
  78. TOOLS,
  79. EMPTY_RECT,
  80. sRGB_to_pdf,
  81. Point,
  82. message,
  83. mupdf,
  84. )
  85. EDGES = [] # vector graphics from PyMuPDF
  86. CHARS = [] # text characters from PyMuPDF
  87. TEXTPAGE = None
  88. TEXT_BOLD = mupdf.FZ_STEXT_BOLD
  89. TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
  90. FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES
  91. white_spaces = set(string.whitespace) # for checking white space only cells
  92. def extract_cells(textpage, cell, markdown=False):
  93. """Extract text from a rect-like 'cell' as plain or MD style text.
  94. This function should ultimately be used to extract text from a table cell.
  95. Markdown output will only work correctly if extraction flag bit
  96. TEXT_COLLECT_STYLES is set.
  97. Args:
  98. textpage: A PyMuPDF TextPage object. Must have been created with
  99. TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
  100. cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
  101. markdown: If True, return text formatted for Markdown.
  102. Returns:
  103. A string with the text extracted from the cell.
  104. """
  105. text = ""
  106. for block in textpage.extractRAWDICT()["blocks"]:
  107. if block["type"] != 0:
  108. continue
  109. block_bbox = block["bbox"]
  110. if (
  111. 0
  112. or block_bbox[0] > cell[2]
  113. or block_bbox[2] < cell[0]
  114. or block_bbox[1] > cell[3]
  115. or block_bbox[3] < cell[1]
  116. ):
  117. continue # skip block outside cell
  118. for line in block["lines"]:
  119. lbbox = line["bbox"]
  120. if (
  121. 0
  122. or lbbox[0] > cell[2]
  123. or lbbox[2] < cell[0]
  124. or lbbox[1] > cell[3]
  125. or lbbox[3] < cell[1]
  126. ):
  127. continue # skip line outside cell
  128. if text: # must be a new line in the cell
  129. text += "<br>" if markdown else "\n"
  130. # strikeout detection only works with horizontal text
  131. horizontal = line["dir"] == (0, 1) or line["dir"] == (1, 0)
  132. for span in line["spans"]:
  133. sbbox = span["bbox"]
  134. if (
  135. 0
  136. or sbbox[0] > cell[2]
  137. or sbbox[2] < cell[0]
  138. or sbbox[1] > cell[3]
  139. or sbbox[3] < cell[1]
  140. ):
  141. continue # skip spans outside cell
  142. # only include chars with more than 50% bbox overlap
  143. span_text = ""
  144. for char in span["chars"]:
  145. bbox = Rect(char["bbox"])
  146. if abs(bbox & cell) > 0.5 * abs(bbox):
  147. span_text += char["c"]
  148. if not span_text:
  149. continue # skip empty span
  150. if not markdown: # no MD styling
  151. text += span_text
  152. continue
  153. prefix = ""
  154. suffix = ""
  155. if horizontal and span["char_flags"] & TEXT_STRIKEOUT:
  156. prefix += "~~"
  157. suffix = "~~" + suffix
  158. if span["char_flags"] & TEXT_BOLD:
  159. prefix += "**"
  160. suffix = "**" + suffix
  161. if span["flags"] & TEXT_FONT_ITALIC:
  162. prefix += "_"
  163. suffix = "_" + suffix
  164. if span["flags"] & TEXT_FONT_MONOSPACED:
  165. prefix += "`"
  166. suffix = "`" + suffix
  167. if len(span["chars"]) > 2:
  168. span_text = span_text.rstrip()
  169. # if span continues previous styling: extend cell text
  170. if (ls := len(suffix)) and text.endswith(suffix):
  171. text = text[:-ls] + span_text + suffix
  172. else: # append the span with new styling
  173. if not span_text.strip():
  174. text += " "
  175. else:
  176. text += prefix + span_text + suffix
  177. return text.strip()
  178. # -------------------------------------------------------------------
  179. # End of PyMuPDF interface code
  180. # -------------------------------------------------------------------
  181. class UnsetFloat(float):
  182. pass
  183. NON_NEGATIVE_SETTINGS = [
  184. "snap_tolerance",
  185. "snap_x_tolerance",
  186. "snap_y_tolerance",
  187. "join_tolerance",
  188. "join_x_tolerance",
  189. "join_y_tolerance",
  190. "edge_min_length",
  191. "min_words_vertical",
  192. "min_words_horizontal",
  193. "intersection_tolerance",
  194. "intersection_x_tolerance",
  195. "intersection_y_tolerance",
  196. ]
  197. TABLE_STRATEGIES = ["lines", "lines_strict", "text", "explicit"]
  198. UNSET = UnsetFloat(0)
  199. DEFAULT_SNAP_TOLERANCE = 3
  200. DEFAULT_JOIN_TOLERANCE = 3
  201. DEFAULT_MIN_WORDS_VERTICAL = 3
  202. DEFAULT_MIN_WORDS_HORIZONTAL = 1
  203. DEFAULT_X_TOLERANCE = 3
  204. DEFAULT_Y_TOLERANCE = 3
  205. DEFAULT_X_DENSITY = 7.25
  206. DEFAULT_Y_DENSITY = 13
  207. bbox_getter = itemgetter("x0", "top", "x1", "bottom")
  208. LIGATURES = {
  209. "ff": "ff",
  210. "ffi": "ffi",
  211. "ffl": "ffl",
  212. "fi": "fi",
  213. "fl": "fl",
  214. "st": "st",
  215. "ſt": "st",
  216. }
  217. def to_list(collection) -> list:
  218. if isinstance(collection, list):
  219. return collection
  220. elif isinstance(collection, Sequence):
  221. return list(collection)
  222. elif hasattr(collection, "to_dict"):
  223. res = collection.to_dict("records") # pragma: nocover
  224. return res
  225. else:
  226. return list(collection)
  227. class TextMap:
  228. """
  229. A TextMap maps each unicode character in the text to an individual `char`
  230. object (or, in the case of layout-implied whitespace, `None`).
  231. """
  232. def __init__(self, tuples=None) -> None:
  233. self.tuples = tuples
  234. self.as_string = "".join(map(itemgetter(0), tuples))
  235. def match_to_dict(
  236. self,
  237. m,
  238. main_group: int = 0,
  239. return_groups: bool = True,
  240. return_chars: bool = True,
  241. ) -> dict:
  242. subset = self.tuples[m.start(main_group) : m.end(main_group)]
  243. chars = [c for (text, c) in subset if c is not None]
  244. x0, top, x1, bottom = objects_to_bbox(chars)
  245. result = {
  246. "text": m.group(main_group),
  247. "x0": x0,
  248. "top": top,
  249. "x1": x1,
  250. "bottom": bottom,
  251. }
  252. if return_groups:
  253. result["groups"] = m.groups()
  254. if return_chars:
  255. result["chars"] = chars
  256. return result
  257. class WordMap:
  258. """
  259. A WordMap maps words->chars.
  260. """
  261. def __init__(self, tuples) -> None:
  262. self.tuples = tuples
  263. def to_textmap(
  264. self,
  265. layout: bool = False,
  266. layout_width=0,
  267. layout_height=0,
  268. layout_width_chars: int = 0,
  269. layout_height_chars: int = 0,
  270. x_density=DEFAULT_X_DENSITY,
  271. y_density=DEFAULT_Y_DENSITY,
  272. x_shift=0,
  273. y_shift=0,
  274. y_tolerance=DEFAULT_Y_TOLERANCE,
  275. use_text_flow: bool = False,
  276. presorted: bool = False,
  277. expand_ligatures: bool = True,
  278. ) -> TextMap:
  279. """
  280. Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
  281. (char-text, char) tuples (i.e., a TextMap) that can be used to mimic the
  282. structural layout of the text on the page(s), using the following approach:
  283. - Sort the words by (doctop, x0) if not already sorted.
  284. - Calculate the initial doctop for the starting page.
  285. - Cluster the words by doctop (taking `y_tolerance` into account), and
  286. iterate through them.
  287. - For each cluster, calculate the distance between that doctop and the
  288. initial doctop, in points, minus `y_shift`. Divide that distance by
  289. `y_density` to calculate the minimum number of newlines that should come
  290. before this cluster. Append that number of newlines *minus* the number of
  291. newlines already appended, with a minimum of one.
  292. - Then for each cluster, iterate through each word in it. Divide each
  293. word's x0, minus `x_shift`, by `x_density` to calculate the minimum
  294. number of characters that should come before this cluster. Append that
  295. number of spaces *minus* the number of characters and spaces already
  296. appended, with a minimum of one. Then append the word's text.
  297. - At the termination of each line, add more spaces if necessary to
  298. mimic `layout_width`.
  299. - Finally, add newlines to the end if necessary to mimic to
  300. `layout_height`.
  301. Note: This approach currently works best for horizontal, left-to-right
  302. text, but will display all words regardless of orientation. There is room
  303. for improvement in better supporting right-to-left text, as well as
  304. vertical text.
  305. """
  306. _textmap = []
  307. if not len(self.tuples):
  308. return TextMap(_textmap)
  309. expansions = LIGATURES if expand_ligatures else {}
  310. if layout:
  311. if layout_width_chars:
  312. if layout_width:
  313. raise ValueError(
  314. "`layout_width` and `layout_width_chars` cannot both be set."
  315. )
  316. else:
  317. layout_width_chars = int(round(layout_width / x_density))
  318. if layout_height_chars:
  319. if layout_height:
  320. raise ValueError(
  321. "`layout_height` and `layout_height_chars` cannot both be set."
  322. )
  323. else:
  324. layout_height_chars = int(round(layout_height / y_density))
  325. blank_line = [(" ", None)] * layout_width_chars
  326. else:
  327. blank_line = []
  328. num_newlines = 0
  329. words_sorted_doctop = (
  330. self.tuples
  331. if presorted or use_text_flow
  332. else sorted(self.tuples, key=lambda x: float(x[0]["doctop"]))
  333. )
  334. first_word = words_sorted_doctop[0][0]
  335. doctop_start = first_word["doctop"] - first_word["top"]
  336. for i, ws in enumerate(
  337. cluster_objects(
  338. words_sorted_doctop, lambda x: float(x[0]["doctop"]), y_tolerance
  339. )
  340. ):
  341. y_dist = (
  342. (ws[0][0]["doctop"] - (doctop_start + y_shift)) / y_density
  343. if layout
  344. else 0
  345. )
  346. num_newlines_prepend = max(
  347. # At least one newline, unless this iis the first line
  348. int(i > 0),
  349. # ... or as many as needed to get the imputed "distance" from the top
  350. round(y_dist) - num_newlines,
  351. )
  352. for i in range(num_newlines_prepend):
  353. if not len(_textmap) or _textmap[-1][0] == "\n":
  354. _textmap += blank_line
  355. _textmap.append(("\n", None))
  356. num_newlines += num_newlines_prepend
  357. line_len = 0
  358. line_words_sorted_x0 = (
  359. ws
  360. if presorted or use_text_flow
  361. else sorted(ws, key=lambda x: float(x[0]["x0"]))
  362. )
  363. for word, chars in line_words_sorted_x0:
  364. x_dist = (word["x0"] - x_shift) / x_density if layout else 0
  365. num_spaces_prepend = max(min(1, line_len), round(x_dist) - line_len)
  366. _textmap += [(" ", None)] * num_spaces_prepend
  367. line_len += num_spaces_prepend
  368. for c in chars:
  369. letters = expansions.get(c["text"], c["text"])
  370. for letter in letters:
  371. _textmap.append((letter, c))
  372. line_len += 1
  373. # Append spaces at end of line
  374. if layout:
  375. _textmap += [(" ", None)] * (layout_width_chars - line_len)
  376. # Append blank lines at end of text
  377. if layout:
  378. num_newlines_append = layout_height_chars - (num_newlines + 1)
  379. for i in range(num_newlines_append):
  380. if i > 0:
  381. _textmap += blank_line
  382. _textmap.append(("\n", None))
  383. # Remove terminal newline
  384. if _textmap[-1] == ("\n", None):
  385. _textmap = _textmap[:-1]
  386. return TextMap(_textmap)
  387. class WordExtractor:
  388. def __init__(
  389. self,
  390. x_tolerance=DEFAULT_X_TOLERANCE,
  391. y_tolerance=DEFAULT_Y_TOLERANCE,
  392. keep_blank_chars: bool = False,
  393. use_text_flow=False,
  394. horizontal_ltr=True, # Should words be read left-to-right?
  395. vertical_ttb=False, # Should vertical words be read top-to-bottom?
  396. extra_attrs=None,
  397. split_at_punctuation=False,
  398. expand_ligatures=True,
  399. ):
  400. self.x_tolerance = x_tolerance
  401. self.y_tolerance = y_tolerance
  402. self.keep_blank_chars = keep_blank_chars
  403. self.use_text_flow = use_text_flow
  404. self.horizontal_ltr = horizontal_ltr
  405. self.vertical_ttb = vertical_ttb
  406. self.extra_attrs = [] if extra_attrs is None else extra_attrs
  407. # Note: string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
  408. self.split_at_punctuation = (
  409. string.punctuation
  410. if split_at_punctuation is True
  411. else (split_at_punctuation or "")
  412. )
  413. self.expansions = LIGATURES if expand_ligatures else {}
  414. def merge_chars(self, ordered_chars: list):
  415. x0, top, x1, bottom = objects_to_bbox(ordered_chars)
  416. doctop_adj = ordered_chars[0]["doctop"] - ordered_chars[0]["top"]
  417. upright = ordered_chars[0]["upright"]
  418. direction = 1 if (self.horizontal_ltr if upright else self.vertical_ttb) else -1
  419. matrix = ordered_chars[0]["matrix"]
  420. rotation = 0
  421. if not upright and matrix[1] < 0:
  422. ordered_chars = reversed(ordered_chars)
  423. rotation = 270
  424. if matrix[0] < 0 and matrix[3] < 0:
  425. rotation = 180
  426. elif matrix[1] > 0:
  427. rotation = 90
  428. word = {
  429. "text": "".join(
  430. self.expansions.get(c["text"], c["text"]) for c in ordered_chars
  431. ),
  432. "x0": x0,
  433. "x1": x1,
  434. "top": top,
  435. "doctop": top + doctop_adj,
  436. "bottom": bottom,
  437. "upright": upright,
  438. "direction": direction,
  439. "rotation": rotation,
  440. }
  441. for key in self.extra_attrs:
  442. word[key] = ordered_chars[0][key]
  443. return word
  444. def char_begins_new_word(
  445. self,
  446. prev_char,
  447. curr_char,
  448. ) -> bool:
  449. """This method takes several factors into account to determine if
  450. `curr_char` represents the beginning of a new word:
  451. - Whether the text is "upright" (i.e., non-rotated)
  452. - Whether the user has specified that horizontal text runs
  453. left-to-right (default) or right-to-left, as represented by
  454. self.horizontal_ltr
  455. - Whether the user has specified that vertical text the text runs
  456. top-to-bottom (default) or bottom-to-top, as represented by
  457. self.vertical_ttb
  458. - The x0, top, x1, and bottom attributes of prev_char and
  459. curr_char
  460. - The self.x_tolerance and self.y_tolerance settings. Note: In
  461. this case, x/y refer to those directions for non-rotated text.
  462. For vertical text, they are flipped. A more accurate terminology
  463. might be "*intra*line character distance tolerance" and
  464. "*inter*line character distance tolerance"
  465. An important note: The *intra*line distance is measured from the
  466. *end* of the previous character to the *beginning* of the current
  467. character, while the *inter*line distance is measured from the
  468. *top* of the previous character to the *top* of the next
  469. character. The reasons for this are partly repository-historical,
  470. and partly logical, as successive text lines' bounding boxes often
  471. overlap slightly (and we don't want that overlap to be interpreted
  472. as the two lines being the same line).
  473. The upright-ness of the character determines the attributes to
  474. compare, while horizontal_ltr/vertical_ttb determine the direction
  475. of the comparison.
  476. """
  477. # Note: Due to the grouping step earlier in the process,
  478. # curr_char["upright"] will always equal prev_char["upright"].
  479. if curr_char["upright"]:
  480. x = self.x_tolerance
  481. y = self.y_tolerance
  482. ay = prev_char["top"]
  483. cy = curr_char["top"]
  484. if self.horizontal_ltr:
  485. ax = prev_char["x0"]
  486. bx = prev_char["x1"]
  487. cx = curr_char["x0"]
  488. else:
  489. ax = -prev_char["x1"]
  490. bx = -prev_char["x0"]
  491. cx = -curr_char["x1"]
  492. else:
  493. x = self.y_tolerance
  494. y = self.x_tolerance
  495. ay = prev_char["x0"]
  496. cy = curr_char["x0"]
  497. if self.vertical_ttb:
  498. ax = prev_char["top"]
  499. bx = prev_char["bottom"]
  500. cx = curr_char["top"]
  501. else:
  502. ax = -prev_char["bottom"]
  503. bx = -prev_char["top"]
  504. cx = -curr_char["bottom"]
  505. return bool(
  506. # Intraline test
  507. (cx < ax)
  508. or (cx > bx + x)
  509. # Interline test
  510. or (cy > ay + y)
  511. )
  512. def iter_chars_to_words(self, ordered_chars):
  513. current_word: list = []
  514. def start_next_word(new_char=None):
  515. nonlocal current_word
  516. if current_word:
  517. yield current_word
  518. current_word = [] if new_char is None else [new_char]
  519. for char in ordered_chars:
  520. text = char["text"]
  521. if not self.keep_blank_chars and text.isspace():
  522. yield from start_next_word(None)
  523. elif text in self.split_at_punctuation:
  524. yield from start_next_word(char)
  525. yield from start_next_word(None)
  526. elif current_word and self.char_begins_new_word(current_word[-1], char):
  527. yield from start_next_word(char)
  528. else:
  529. current_word.append(char)
  530. # Finally, after all chars processed
  531. if current_word:
  532. yield current_word
  533. def iter_sort_chars(self, chars):
  534. def upright_key(x) -> int:
  535. return -int(x["upright"])
  536. for upright_cluster in cluster_objects(list(chars), upright_key, 0):
  537. upright = upright_cluster[0]["upright"]
  538. cluster_key = "doctop" if upright else "x0"
  539. # Cluster by line
  540. subclusters = cluster_objects(
  541. upright_cluster, itemgetter(cluster_key), self.y_tolerance
  542. )
  543. for sc in subclusters:
  544. # Sort within line
  545. sort_key = "x0" if upright else "doctop"
  546. to_yield = sorted(sc, key=itemgetter(sort_key))
  547. # Reverse order if necessary
  548. if not (self.horizontal_ltr if upright else self.vertical_ttb):
  549. yield from reversed(to_yield)
  550. else:
  551. yield from to_yield
  552. def iter_extract_tuples(self, chars):
  553. ordered_chars = chars if self.use_text_flow else self.iter_sort_chars(chars)
  554. grouping_key = itemgetter("upright", *self.extra_attrs)
  555. grouped_chars = itertools.groupby(ordered_chars, grouping_key)
  556. for keyvals, char_group in grouped_chars:
  557. for word_chars in self.iter_chars_to_words(char_group):
  558. yield (self.merge_chars(word_chars), word_chars)
  559. def extract_wordmap(self, chars) -> WordMap:
  560. return WordMap(list(self.iter_extract_tuples(chars)))
  561. def extract_words(self, chars: list) -> list:
  562. words = list(word for word, word_chars in self.iter_extract_tuples(chars))
  563. return words
  564. def extract_words(chars: list, **kwargs) -> list:
  565. return WordExtractor(**kwargs).extract_words(chars)
  566. TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
  567. WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()
  568. def chars_to_textmap(chars: list, **kwargs) -> TextMap:
  569. kwargs.update({"presorted": True})
  570. extractor = WordExtractor(
  571. **{k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
  572. )
  573. wordmap = extractor.extract_wordmap(chars)
  574. textmap = wordmap.to_textmap(
  575. **{k: kwargs[k] for k in TEXTMAP_KWARGS if k in kwargs}
  576. )
  577. return textmap
  578. def extract_text(chars: list, **kwargs) -> str:
  579. chars = to_list(chars)
  580. if len(chars) == 0:
  581. return ""
  582. if kwargs.get("layout"):
  583. return chars_to_textmap(chars, **kwargs).as_string
  584. else:
  585. y_tolerance = kwargs.get("y_tolerance", DEFAULT_Y_TOLERANCE)
  586. extractor = WordExtractor(
  587. **{k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
  588. )
  589. words = extractor.extract_words(chars)
  590. if words:
  591. rotation = words[0]["rotation"] # rotation cannot change within a cell
  592. else:
  593. rotation = 0
  594. if rotation == 90:
  595. words.sort(key=lambda w: (w["x1"], -w["top"]))
  596. lines = " ".join([w["text"] for w in words])
  597. elif rotation == 270:
  598. words.sort(key=lambda w: (-w["x1"], w["top"]))
  599. lines = " ".join([w["text"] for w in words])
  600. else:
  601. lines = cluster_objects(words, itemgetter("doctop"), y_tolerance)
  602. lines = "\n".join(" ".join(word["text"] for word in line) for line in lines)
  603. if rotation == 180: # needs extra treatment
  604. lines = "".join([(c if c != "\n" else " ") for c in reversed(lines)])
  605. return lines
  606. def collate_line(
  607. line_chars: list,
  608. tolerance=DEFAULT_X_TOLERANCE,
  609. ) -> str:
  610. coll = ""
  611. last_x1 = None
  612. for char in sorted(line_chars, key=itemgetter("x0")):
  613. if (last_x1 is not None) and (char["x0"] > (last_x1 + tolerance)):
  614. coll += " "
  615. last_x1 = char["x1"]
  616. coll += char["text"]
  617. return coll
  618. def dedupe_chars(chars: list, tolerance=1) -> list:
  619. """
  620. Removes duplicate chars — those sharing the same text, fontname, size,
  621. and positioning (within `tolerance`) as other characters in the set.
  622. """
  623. key = itemgetter("fontname", "size", "upright", "text")
  624. pos_key = itemgetter("doctop", "x0")
  625. def yield_unique_chars(chars: list):
  626. sorted_chars = sorted(chars, key=key)
  627. for grp, grp_chars in itertools.groupby(sorted_chars, key=key):
  628. for y_cluster in cluster_objects(
  629. list(grp_chars), itemgetter("doctop"), tolerance
  630. ):
  631. for x_cluster in cluster_objects(
  632. y_cluster, itemgetter("x0"), tolerance
  633. ):
  634. yield sorted(x_cluster, key=pos_key)[0]
  635. deduped = yield_unique_chars(chars)
  636. return sorted(deduped, key=chars.index)
  637. def line_to_edge(line):
  638. edge = dict(line)
  639. edge["orientation"] = "h" if (line["top"] == line["bottom"]) else "v"
  640. return edge
  641. def rect_to_edges(rect) -> list:
  642. top, bottom, left, right = [dict(rect) for x in range(4)]
  643. top.update(
  644. {
  645. "object_type": "rect_edge",
  646. "height": 0,
  647. "y0": rect["y1"],
  648. "bottom": rect["top"],
  649. "orientation": "h",
  650. }
  651. )
  652. bottom.update(
  653. {
  654. "object_type": "rect_edge",
  655. "height": 0,
  656. "y1": rect["y0"],
  657. "top": rect["top"] + rect["height"],
  658. "doctop": rect["doctop"] + rect["height"],
  659. "orientation": "h",
  660. }
  661. )
  662. left.update(
  663. {
  664. "object_type": "rect_edge",
  665. "width": 0,
  666. "x1": rect["x0"],
  667. "orientation": "v",
  668. }
  669. )
  670. right.update(
  671. {
  672. "object_type": "rect_edge",
  673. "width": 0,
  674. "x0": rect["x1"],
  675. "orientation": "v",
  676. }
  677. )
  678. return [top, bottom, left, right]
  679. def curve_to_edges(curve) -> list:
  680. point_pairs = zip(curve["pts"], curve["pts"][1:])
  681. return [
  682. {
  683. "object_type": "curve_edge",
  684. "x0": min(p0[0], p1[0]),
  685. "x1": max(p0[0], p1[0]),
  686. "top": min(p0[1], p1[1]),
  687. "doctop": min(p0[1], p1[1]) + (curve["doctop"] - curve["top"]),
  688. "bottom": max(p0[1], p1[1]),
  689. "width": abs(p0[0] - p1[0]),
  690. "height": abs(p0[1] - p1[1]),
  691. "orientation": "v" if p0[0] == p1[0] else ("h" if p0[1] == p1[1] else None),
  692. }
  693. for p0, p1 in point_pairs
  694. ]
  695. def obj_to_edges(obj) -> list:
  696. t = obj["object_type"]
  697. if "_edge" in t:
  698. return [obj]
  699. elif t == "line":
  700. return [line_to_edge(obj)]
  701. else:
  702. return {"rect": rect_to_edges, "curve": curve_to_edges}[t](obj)
  703. def filter_edges(
  704. edges,
  705. orientation=None,
  706. edge_type=None,
  707. min_length=1,
  708. ) -> list:
  709. if orientation not in ("v", "h", None):
  710. raise ValueError("Orientation must be 'v' or 'h'")
  711. def test(e) -> bool:
  712. dim = "height" if e["orientation"] == "v" else "width"
  713. et_correct = e["object_type"] == edge_type if edge_type is not None else True
  714. orient_correct = orientation is None or e["orientation"] == orientation
  715. return bool(et_correct and orient_correct and (e[dim] >= min_length))
  716. return list(filter(test, edges))
  717. def cluster_list(xs, tolerance=0) -> list:
  718. if tolerance == 0:
  719. return [[x] for x in sorted(xs)]
  720. if len(xs) < 2:
  721. return [[x] for x in sorted(xs)]
  722. groups = []
  723. xs = list(sorted(xs))
  724. current_group = [xs[0]]
  725. last = xs[0]
  726. for x in xs[1:]:
  727. if x <= (last + tolerance):
  728. current_group.append(x)
  729. else:
  730. groups.append(current_group)
  731. current_group = [x]
  732. last = x
  733. groups.append(current_group)
  734. return groups
  735. def make_cluster_dict(values, tolerance) -> dict:
  736. clusters = cluster_list(list(set(values)), tolerance)
  737. nested_tuples = [
  738. [(val, i) for val in value_cluster] for i, value_cluster in enumerate(clusters)
  739. ]
  740. return dict(itertools.chain(*nested_tuples))
  741. def cluster_objects(xs, key_fn, tolerance) -> list:
  742. if not callable(key_fn):
  743. key_fn = itemgetter(key_fn)
  744. values = map(key_fn, xs)
  745. cluster_dict = make_cluster_dict(values, tolerance)
  746. get_0, get_1 = itemgetter(0), itemgetter(1)
  747. cluster_tuples = sorted(((x, cluster_dict.get(key_fn(x))) for x in xs), key=get_1)
  748. grouped = itertools.groupby(cluster_tuples, key=get_1)
  749. return [list(map(get_0, v)) for k, v in grouped]
  750. def move_object(obj, axis: str, value):
  751. assert axis in ("h", "v")
  752. if axis == "h":
  753. new_items = [
  754. ("x0", obj["x0"] + value),
  755. ("x1", obj["x1"] + value),
  756. ]
  757. if axis == "v":
  758. new_items = [
  759. ("top", obj["top"] + value),
  760. ("bottom", obj["bottom"] + value),
  761. ]
  762. if "doctop" in obj:
  763. new_items += [("doctop", obj["doctop"] + value)]
  764. if "y0" in obj:
  765. new_items += [
  766. ("y0", obj["y0"] - value),
  767. ("y1", obj["y1"] - value),
  768. ]
  769. return obj.__class__(tuple(obj.items()) + tuple(new_items))
  770. def snap_objects(objs, attr: str, tolerance) -> list:
  771. axis = {"x0": "h", "x1": "h", "top": "v", "bottom": "v"}[attr]
  772. list_objs = list(objs)
  773. clusters = cluster_objects(list_objs, itemgetter(attr), tolerance)
  774. avgs = [sum(map(itemgetter(attr), cluster)) / len(cluster) for cluster in clusters]
  775. snapped_clusters = [
  776. [move_object(obj, axis, avg - obj[attr]) for obj in cluster]
  777. for cluster, avg in zip(clusters, avgs)
  778. ]
  779. return list(itertools.chain(*snapped_clusters))
  780. def snap_edges(
  781. edges,
  782. x_tolerance=DEFAULT_SNAP_TOLERANCE,
  783. y_tolerance=DEFAULT_SNAP_TOLERANCE,
  784. ):
  785. """
  786. Given a list of edges, snap any within `tolerance` pixels of one another
  787. to their positional average.
  788. """
  789. by_orientation = {"v": [], "h": []}
  790. for e in edges:
  791. by_orientation[e["orientation"]].append(e)
  792. snapped_v = snap_objects(by_orientation["v"], "x0", x_tolerance)
  793. snapped_h = snap_objects(by_orientation["h"], "top", y_tolerance)
  794. return snapped_v + snapped_h
  795. def resize_object(obj, key: str, value):
  796. assert key in ("x0", "x1", "top", "bottom")
  797. old_value = obj[key]
  798. diff = value - old_value
  799. new_items = [
  800. (key, value),
  801. ]
  802. if key == "x0":
  803. assert value <= obj["x1"]
  804. new_items.append(("width", obj["x1"] - value))
  805. elif key == "x1":
  806. assert value >= obj["x0"]
  807. new_items.append(("width", value - obj["x0"]))
  808. elif key == "top":
  809. assert value <= obj["bottom"]
  810. new_items.append(("doctop", obj["doctop"] + diff))
  811. new_items.append(("height", obj["height"] - diff))
  812. if "y1" in obj:
  813. new_items.append(("y1", obj["y1"] - diff))
  814. elif key == "bottom":
  815. assert value >= obj["top"]
  816. new_items.append(("height", obj["height"] + diff))
  817. if "y0" in obj:
  818. new_items.append(("y0", obj["y0"] - diff))
  819. return obj.__class__(tuple(obj.items()) + tuple(new_items))
  820. def join_edge_group(edges, orientation: str, tolerance=DEFAULT_JOIN_TOLERANCE):
  821. """
  822. Given a list of edges along the same infinite line, join those that
  823. are within `tolerance` pixels of one another.
  824. """
  825. if orientation == "h":
  826. min_prop, max_prop = "x0", "x1"
  827. elif orientation == "v":
  828. min_prop, max_prop = "top", "bottom"
  829. else:
  830. raise ValueError("Orientation must be 'v' or 'h'")
  831. sorted_edges = list(sorted(edges, key=itemgetter(min_prop)))
  832. joined = [sorted_edges[0]]
  833. for e in sorted_edges[1:]:
  834. last = joined[-1]
  835. if e[min_prop] <= (last[max_prop] + tolerance):
  836. if e[max_prop] > last[max_prop]:
  837. # Extend current edge to new extremity
  838. joined[-1] = resize_object(last, max_prop, e[max_prop])
  839. else:
  840. # Edge is separate from previous edges
  841. joined.append(e)
  842. return joined
  843. def merge_edges(
  844. edges,
  845. snap_x_tolerance,
  846. snap_y_tolerance,
  847. join_x_tolerance,
  848. join_y_tolerance,
  849. ):
  850. """
  851. Using the `snap_edges` and `join_edge_group` methods above,
  852. merge a list of edges into a more "seamless" list.
  853. """
  854. def get_group(edge):
  855. if edge["orientation"] == "h":
  856. return ("h", edge["top"])
  857. else:
  858. return ("v", edge["x0"])
  859. if snap_x_tolerance > 0 or snap_y_tolerance > 0:
  860. edges = snap_edges(edges, snap_x_tolerance, snap_y_tolerance)
  861. _sorted = sorted(edges, key=get_group)
  862. edge_groups = itertools.groupby(_sorted, key=get_group)
  863. edge_gen = (
  864. join_edge_group(
  865. items, k[0], (join_x_tolerance if k[0] == "h" else join_y_tolerance)
  866. )
  867. for k, items in edge_groups
  868. )
  869. edges = list(itertools.chain(*edge_gen))
  870. return edges
  871. def bbox_to_rect(bbox) -> dict:
  872. """
  873. Return the rectangle (i.e a dict with keys "x0", "top", "x1",
  874. "bottom") for an object.
  875. """
  876. return {"x0": bbox[0], "top": bbox[1], "x1": bbox[2], "bottom": bbox[3]}
  877. def objects_to_rect(objects) -> dict:
  878. """
  879. Given an iterable of objects, return the smallest rectangle (i.e. a
  880. dict with "x0", "top", "x1", and "bottom" keys) that contains them
  881. all.
  882. """
  883. return bbox_to_rect(objects_to_bbox(objects))
  884. def merge_bboxes(bboxes):
  885. """
  886. Given an iterable of bounding boxes, return the smallest bounding box
  887. that contains them all.
  888. """
  889. x0, top, x1, bottom = zip(*bboxes)
  890. return (min(x0), min(top), max(x1), max(bottom))
  891. def objects_to_bbox(objects):
  892. """
  893. Given an iterable of objects, return the smallest bounding box that
  894. contains them all.
  895. """
  896. return merge_bboxes(map(bbox_getter, objects))
  897. def words_to_edges_h(words, word_threshold: int = DEFAULT_MIN_WORDS_HORIZONTAL):
  898. """
  899. Find (imaginary) horizontal lines that connect the tops
  900. of at least `word_threshold` words.
  901. """
  902. by_top = cluster_objects(words, itemgetter("top"), 1)
  903. large_clusters = filter(lambda x: len(x) >= word_threshold, by_top)
  904. rects = list(map(objects_to_rect, large_clusters))
  905. if len(rects) == 0:
  906. return []
  907. min_x0 = min(map(itemgetter("x0"), rects))
  908. max_x1 = max(map(itemgetter("x1"), rects))
  909. edges = []
  910. for r in rects:
  911. edges += [
  912. # Top of text
  913. {
  914. "x0": min_x0,
  915. "x1": max_x1,
  916. "top": r["top"],
  917. "bottom": r["top"],
  918. "width": max_x1 - min_x0,
  919. "orientation": "h",
  920. },
  921. # For each detected row, we also add the 'bottom' line. This will
  922. # generate extra edges, (some will be redundant with the next row
  923. # 'top' line), but this catches the last row of every table.
  924. {
  925. "x0": min_x0,
  926. "x1": max_x1,
  927. "top": r["bottom"],
  928. "bottom": r["bottom"],
  929. "width": max_x1 - min_x0,
  930. "orientation": "h",
  931. },
  932. ]
  933. return edges
  934. def get_bbox_overlap(a, b):
  935. a_left, a_top, a_right, a_bottom = a
  936. b_left, b_top, b_right, b_bottom = b
  937. o_left = max(a_left, b_left)
  938. o_right = min(a_right, b_right)
  939. o_bottom = min(a_bottom, b_bottom)
  940. o_top = max(a_top, b_top)
  941. o_width = o_right - o_left
  942. o_height = o_bottom - o_top
  943. if o_height >= 0 and o_width >= 0 and o_height + o_width > 0:
  944. return (o_left, o_top, o_right, o_bottom)
  945. else:
  946. return None
  947. def words_to_edges_v(words, word_threshold: int = DEFAULT_MIN_WORDS_VERTICAL):
  948. """
  949. Find (imaginary) vertical lines that connect the left, right, or
  950. center of at least `word_threshold` words.
  951. """
  952. # Find words that share the same left, right, or centerpoints
  953. by_x0 = cluster_objects(words, itemgetter("x0"), 1)
  954. by_x1 = cluster_objects(words, itemgetter("x1"), 1)
  955. def get_center(word):
  956. return float(word["x0"] + word["x1"]) / 2
  957. by_center = cluster_objects(words, get_center, 1)
  958. clusters = by_x0 + by_x1 + by_center
  959. # Find the points that align with the most words
  960. sorted_clusters = sorted(clusters, key=lambda x: -len(x))
  961. large_clusters = filter(lambda x: len(x) >= word_threshold, sorted_clusters)
  962. # For each of those points, find the bboxes fitting all matching words
  963. bboxes = list(map(objects_to_bbox, large_clusters))
  964. # Iterate through those bboxes, condensing overlapping bboxes
  965. condensed_bboxes = []
  966. for bbox in bboxes:
  967. overlap = any(get_bbox_overlap(bbox, c) for c in condensed_bboxes)
  968. if not overlap:
  969. condensed_bboxes.append(bbox)
  970. if not condensed_bboxes:
  971. return []
  972. condensed_rects = map(bbox_to_rect, condensed_bboxes)
  973. sorted_rects = list(sorted(condensed_rects, key=itemgetter("x0")))
  974. max_x1 = max(map(itemgetter("x1"), sorted_rects))
  975. min_top = min(map(itemgetter("top"), sorted_rects))
  976. max_bottom = max(map(itemgetter("bottom"), sorted_rects))
  977. return [
  978. {
  979. "x0": b["x0"],
  980. "x1": b["x0"],
  981. "top": min_top,
  982. "bottom": max_bottom,
  983. "height": max_bottom - min_top,
  984. "orientation": "v",
  985. }
  986. for b in sorted_rects
  987. ] + [
  988. {
  989. "x0": max_x1,
  990. "x1": max_x1,
  991. "top": min_top,
  992. "bottom": max_bottom,
  993. "height": max_bottom - min_top,
  994. "orientation": "v",
  995. }
  996. ]
  997. def edges_to_intersections(edges, x_tolerance=1, y_tolerance=1) -> dict:
  998. """
  999. Given a list of edges, return the points at which they intersect
  1000. within `tolerance` pixels.
  1001. """
  1002. intersections = {}
  1003. v_edges, h_edges = [
  1004. list(filter(lambda x: x["orientation"] == o, edges)) for o in ("v", "h")
  1005. ]
  1006. for v in sorted(v_edges, key=itemgetter("x0", "top")):
  1007. for h in sorted(h_edges, key=itemgetter("top", "x0")):
  1008. if (
  1009. (v["top"] <= (h["top"] + y_tolerance))
  1010. and (v["bottom"] >= (h["top"] - y_tolerance))
  1011. and (v["x0"] >= (h["x0"] - x_tolerance))
  1012. and (v["x0"] <= (h["x1"] + x_tolerance))
  1013. ):
  1014. vertex = (v["x0"], h["top"])
  1015. if vertex not in intersections:
  1016. intersections[vertex] = {"v": [], "h": []}
  1017. intersections[vertex]["v"].append(v)
  1018. intersections[vertex]["h"].append(h)
  1019. return intersections
def obj_to_bbox(obj):
    """
    Return the bounding box for an object.

    Thin wrapper that delegates to the module-level `bbox_getter`.
    """
    return bbox_getter(obj)
  1025. def intersections_to_cells(intersections):
  1026. """
  1027. Given a list of points (`intersections`), return all rectangular "cells"
  1028. that those points describe.
  1029. `intersections` should be a dictionary with (x0, top) tuples as keys,
  1030. and a list of edge objects as values. The edge objects should correspond
  1031. to the edges that touch the intersection.
  1032. """
  1033. def edge_connects(p1, p2) -> bool:
  1034. def edges_to_set(edges):
  1035. return set(map(obj_to_bbox, edges))
  1036. if p1[0] == p2[0]:
  1037. common = edges_to_set(intersections[p1]["v"]).intersection(
  1038. edges_to_set(intersections[p2]["v"])
  1039. )
  1040. if len(common):
  1041. return True
  1042. if p1[1] == p2[1]:
  1043. common = edges_to_set(intersections[p1]["h"]).intersection(
  1044. edges_to_set(intersections[p2]["h"])
  1045. )
  1046. if len(common):
  1047. return True
  1048. return False
  1049. points = list(sorted(intersections.keys()))
  1050. n_points = len(points)
  1051. def find_smallest_cell(points, i: int):
  1052. if i == n_points - 1:
  1053. return None
  1054. pt = points[i]
  1055. rest = points[i + 1 :]
  1056. # Get all the points directly below and directly right
  1057. below = [x for x in rest if x[0] == pt[0]]
  1058. right = [x for x in rest if x[1] == pt[1]]
  1059. for below_pt in below:
  1060. if not edge_connects(pt, below_pt):
  1061. continue
  1062. for right_pt in right:
  1063. if not edge_connects(pt, right_pt):
  1064. continue
  1065. bottom_right = (right_pt[0], below_pt[1])
  1066. if (
  1067. (bottom_right in intersections)
  1068. and edge_connects(bottom_right, right_pt)
  1069. and edge_connects(bottom_right, below_pt)
  1070. ):
  1071. return (pt[0], pt[1], bottom_right[0], bottom_right[1])
  1072. return None
  1073. cell_gen = (find_smallest_cell(points, i) for i in range(len(points)))
  1074. return list(filter(None, cell_gen))
def cells_to_tables(page, cells) -> list:
    """
    Given a list of bounding boxes (`cells`), return a list of tables that
    hold those cells most simply (and contiguously).

    Cells are grouped into a table when they share at least one corner
    with a cell already in that table. Tables with fewer than two distinct
    left or right cell coordinates, or whose text consists of whitespace
    only, are dropped. The result is sorted by each table's smallest
    (top, x0) cell coordinate pair.
    """

    def bbox_to_corners(bbox) -> tuple:
        # the four corner points of an (x0, top, x1, bottom) box
        x0, top, x1, bottom = bbox
        return ((x0, top), (x0, bottom), (x1, top), (x1, bottom))

    remaining_cells = list(cells)

    # Iterate through the cells found above, and assign them
    # to contiguous tables

    current_corners = set()
    current_cells = []

    tables = []
    while len(remaining_cells):
        initial_cell_count = len(current_cells)
        # iterate over a copy, because cells are removed inside the loop
        for cell in list(remaining_cells):
            cell_corners = bbox_to_corners(cell)
            # If we're just starting a table ...
            if len(current_cells) == 0:
                # ... immediately assign it to the empty group
                current_corners |= set(cell_corners)
                current_cells.append(cell)
                remaining_cells.remove(cell)
            else:
                # How many corners does this cell share with the current group?
                corner_count = sum(c in current_corners for c in cell_corners)

                # If touching on at least one corner...
                if corner_count > 0:
                    # ... assign it to the current group
                    current_corners |= set(cell_corners)
                    current_cells.append(cell)
                    remaining_cells.remove(cell)

        # If this iteration did not find any more cells to append...
        if len(current_cells) == initial_cell_count:
            # ... start a new cell group
            tables.append(list(current_cells))
            current_corners.clear()
            current_cells.clear()

    # Once we have exhausted the list of cells ...

    # ... and we have a cell group that has not been stored
    if len(current_cells):
        # ... store it.
        tables.append(list(current_cells))

    # PyMuPDF modification:
    # Remove tables without text or having only 1 column
    # (iterating backwards, so deleting does not shift pending indices)
    for i in range(len(tables) - 1, -1, -1):
        # NOTE(review): EMPTY_RECT, white_spaces and TEXTPAGE are defined
        # outside this function; presumably `r` accumulates the union of
        # the table's cells - confirm against their definitions.
        r = EMPTY_RECT()
        x1_vals = set()
        x0_vals = set()
        for c in tables[i]:
            r |= c
            x1_vals.add(c[2])
            x0_vals.add(c[0])
        if (
            len(x1_vals) < 2
            or len(x0_vals) < 2
            or white_spaces.issuperset(
                page.get_textbox(
                    r,
                    textpage=TEXTPAGE,
                )
            )
        ):
            del tables[i]

    # Sort the tables top-to-bottom-left-to-right based on the value of the
    # topmost-and-then-leftmost coordinate of a table.
    _sorted = sorted(tables, key=lambda t: min((c[1], c[0]) for c in t))
    return _sorted
  1144. class CellGroup:
  1145. def __init__(self, cells):
  1146. self.cells = cells
  1147. self.bbox = (
  1148. min(map(itemgetter(0), filter(None, cells))),
  1149. min(map(itemgetter(1), filter(None, cells))),
  1150. max(map(itemgetter(2), filter(None, cells))),
  1151. max(map(itemgetter(3), filter(None, cells))),
  1152. )
class TableRow(CellGroup):
    """A CellGroup holding the cells of one table row; adds no behavior."""

    pass
  1155. class TableHeader:
  1156. """PyMuPDF extension containing the identified table header."""
  1157. def __init__(self, bbox, cells, names, above):
  1158. self.bbox = bbox
  1159. self.cells = cells
  1160. self.names = names
  1161. self.external = above
  1162. class Table:
    def __init__(self, page, cells):
        """Store the page and the table's cells, then determine the header.

        `cells` is indexed with positions 0-3 elsewhere in this class
        (see `bbox`), so each cell is expected to be a 4-item box.
        """
        self.page = page
        self.cells = cells
        # must come last: _get_header reads self.page and self.rows
        self.header = self._get_header()  # PyMuPDF extension
  1167. @property
  1168. def bbox(self):
  1169. c = self.cells
  1170. return (
  1171. min(map(itemgetter(0), c)),
  1172. min(map(itemgetter(1), c)),
  1173. max(map(itemgetter(2), c)),
  1174. max(map(itemgetter(3), c)),
  1175. )
  1176. @property
  1177. def rows(self) -> list:
  1178. _sorted = sorted(self.cells, key=itemgetter(1, 0))
  1179. xs = list(sorted(set(map(itemgetter(0), self.cells))))
  1180. rows = []
  1181. for y, row_cells in itertools.groupby(_sorted, itemgetter(1)):
  1182. xdict = {cell[0]: cell for cell in row_cells}
  1183. row = TableRow([xdict.get(x) for x in xs])
  1184. rows.append(row)
  1185. return rows
    @property
    def row_count(self) -> int:  # PyMuPDF extension
        """Number of table rows, i.e. the length of `self.rows`."""
        return len(self.rows)
  1189. @property
  1190. def col_count(self) -> int: # PyMuPDF extension
  1191. return max([len(r.cells) for r in self.rows])
  1192. def extract(self, **kwargs) -> list:
  1193. chars = CHARS
  1194. table_arr = []
  1195. def char_in_bbox(char, bbox) -> bool:
  1196. v_mid = (char["top"] + char["bottom"]) / 2
  1197. h_mid = (char["x0"] + char["x1"]) / 2
  1198. x0, top, x1, bottom = bbox
  1199. return bool(
  1200. (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
  1201. )
  1202. for row in self.rows:
  1203. arr = []
  1204. row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]
  1205. for cell in row.cells:
  1206. if cell is None:
  1207. cell_text = None
  1208. else:
  1209. cell_chars = [
  1210. char for char in row_chars if char_in_bbox(char, cell)
  1211. ]
  1212. if len(cell_chars):
  1213. kwargs["x_shift"] = cell[0]
  1214. kwargs["y_shift"] = cell[1]
  1215. if "layout" in kwargs:
  1216. kwargs["layout_width"] = cell[2] - cell[0]
  1217. kwargs["layout_height"] = cell[3] - cell[1]
  1218. cell_text = extract_text(cell_chars, **kwargs)
  1219. else:
  1220. cell_text = ""
  1221. arr.append(cell_text)
  1222. table_arr.append(arr)
  1223. return table_arr
  1224. def to_markdown(self, clean=False, fill_empty=True):
  1225. """Output table content as a string in Github-markdown format.
  1226. If "clean" then markdown syntax is removed from cell content.
  1227. If "fill_empty" then cell content None is replaced by the values
  1228. above (columns) or left (rows) in an effort to approximate row and
  1229. columns spans.
  1230. """
  1231. output = "|"
  1232. rows = self.row_count
  1233. cols = self.col_count
  1234. # cell coordinates
  1235. cell_boxes = [[c for c in r.cells] for r in self.rows]
  1236. # cell text strings
  1237. cells = [[None for i in range(cols)] for j in range(rows)]
  1238. for i, row in enumerate(cell_boxes):
  1239. for j, cell in enumerate(row):
  1240. if cell is not None:
  1241. cells[i][j] = extract_cells(
  1242. TEXTPAGE, cell_boxes[i][j], markdown=True
  1243. )
  1244. if fill_empty: # fill "None" cells where possible
  1245. # for rows, copy content from left to right
  1246. for j in range(rows):
  1247. for i in range(cols - 1):
  1248. if cells[j][i + 1] is None:
  1249. cells[j][i + 1] = cells[j][i]
  1250. # for columns, copy top to bottom
  1251. for i in range(cols):
  1252. for j in range(rows - 1):
  1253. if cells[j + 1][i] is None:
  1254. cells[j + 1][i] = cells[j][i]
  1255. # generate header string and MD separator
  1256. for i, name in enumerate(self.header.names):
  1257. if not name: # generate a name if empty
  1258. name = f"Col{i+1}"
  1259. name = name.replace("\n", "<br>") # use HTML line breaks
  1260. if clean: # remove sensitive syntax
  1261. name = html.escape(name.replace("-", "&#45;"))
  1262. output += name + "|"
  1263. output += "\n"
  1264. # insert GitHub header line separator
  1265. output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n"
  1266. # skip first row in details if header is part of the table
  1267. j = 0 if self.header.external else 1
  1268. # iterate over detail rows
  1269. for row in cells[j:]:
  1270. line = "|"
  1271. for i, cell in enumerate(row):
  1272. # replace None cells with empty string
  1273. # use HTML line break tag
  1274. if cell is None:
  1275. cell = ""
  1276. if clean: # remove sensitive syntax
  1277. cell = html.escape(cell.replace("-", "&#45;"))
  1278. line += cell + "|"
  1279. line += "\n"
  1280. output += line
  1281. return output + "\n"
  1282. def to_pandas(self, **kwargs):
  1283. """Return a pandas DataFrame version of the table."""
  1284. try:
  1285. import pandas as pd
  1286. except ModuleNotFoundError:
  1287. message("Package 'pandas' is not installed")
  1288. raise
  1289. pd_dict = {}
  1290. extract = self.extract()
  1291. hdr = self.header
  1292. names = self.header.names
  1293. hdr_len = len(names)
  1294. # ensure uniqueness of column names
  1295. for i in range(hdr_len):
  1296. name = names[i]
  1297. if not name:
  1298. names[i] = f"Col{i}"
  1299. if hdr_len != len(set(names)):
  1300. for i in range(hdr_len):
  1301. name = names[i]
  1302. if name != f"Col{i}":
  1303. names[i] = f"{i}-{name}"
  1304. if not hdr.external: # header is part of 'extract'
  1305. extract = extract[1:]
  1306. for i in range(hdr_len):
  1307. key = names[i]
  1308. value = []
  1309. for j in range(len(extract)):
  1310. value.append(extract[j][i])
  1311. pd_dict[key] = value
  1312. return pd.DataFrame(pd_dict)
  1313. def _get_header(self, y_tolerance=3):
  1314. """Identify the table header.
  1315. *** PyMuPDF extension. ***
  1316. Starting from the first line above the table upwards, check if it
  1317. qualifies to be part of the table header.
  1318. Criteria include:
  1319. * A one-line table never has an extra header.
  1320. * Column borders must not intersect any word. If this happens, all
  1321. text of this line and above of it is ignored.
  1322. * No excess inter-line distance: If a line further up has a distance
  1323. of more than 1.5 times of its font size, it will be ignored and
  1324. all lines above of it.
  1325. * Must have same text properties.
  1326. * Starting with the top table line, a bold text property cannot change
  1327. back to non-bold.
  1328. If not all criteria are met (or there is no text above the table),
  1329. the first table row is assumed to be the header.
  1330. """
  1331. page = self.page
  1332. y_delta = y_tolerance
  1333. def top_row_bg_color(self):
  1334. """
  1335. Compare top row background color with color of same-sized bbox
  1336. above. If different, return True indicating that the original
  1337. table top row is already the header.
  1338. """
  1339. bbox0 = Rect(self.rows[0].bbox)
  1340. bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height) # area above
  1341. top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1]
  1342. top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1]
  1343. if top_color0 != top_colort:
  1344. return True # top row is header
  1345. return False
  1346. def row_has_bold(bbox):
  1347. """Check if a row contains some bold text.
  1348. If e.g. true for the top row, then it will be used as (internal)
  1349. column header row if any of the following is true:
  1350. * the previous (above) text line has no bold span
  1351. * the second table row text has no bold span
  1352. Returns True if any spans are bold else False.
  1353. """
  1354. blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]
  1355. spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]
  1356. return any(s["flags"] & TEXT_FONT_BOLD for s in spans)
  1357. try:
  1358. row = self.rows[0]
  1359. cells = row.cells
  1360. bbox = Rect(row.bbox)
  1361. except IndexError: # this table has no rows
  1362. return None
  1363. # return this if we determine that the top row is the header
  1364. header_top_row = TableHeader(bbox, cells, self.extract()[0], False)
  1365. # 1-line tables have no extra header
  1366. if len(self.rows) < 2:
  1367. return header_top_row
  1368. # 1-column tables have no extra header
  1369. if len(cells) < 2:
  1370. return header_top_row
  1371. # assume top row is the header if second row is empty
  1372. row2 = self.rows[1] # second row
  1373. if all(c is None for c in row2.cells): # no valid cell bboxes in row2
  1374. return header_top_row
  1375. # Special check: is top row bold?
  1376. top_row_bold = row_has_bold(bbox)
  1377. # assume top row is header if it is bold and any cell
  1378. # of 2nd row is non-bold
  1379. if top_row_bold and not row_has_bold(row2.bbox):
  1380. return header_top_row
  1381. if top_row_bg_color(self):
  1382. # if area above top row has a different background color,
  1383. # then top row is already the header
  1384. return header_top_row
  1385. # column coordinates (x1 values) in top row
  1386. col_x = [c[2] if c is not None else None for c in cells[:-1]]
  1387. # clip = page area above the table
  1388. # We will inspect this area for text qualifying as column header.
  1389. clip = +bbox # take row 0 bbox
  1390. clip.y0 = 0 # start at top of page
  1391. clip.y1 = bbox.y0 # end at top of table
  1392. blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
  1393. # non-empty, non-superscript spans above table, sorted descending by y1
  1394. spans = sorted(
  1395. [
  1396. s
  1397. for b in blocks
  1398. for l in b["lines"]
  1399. for s in l["spans"]
  1400. if not (
  1401. white_spaces.issuperset(s["text"])
  1402. or s["flags"] & TEXT_FONT_SUPERSCRIPT
  1403. )
  1404. ],
  1405. key=lambda s: s["bbox"][3],
  1406. reverse=True,
  1407. )
  1408. select = [] # y1 coordinates above, sorted descending
  1409. line_heights = [] # line heights above, sorted descending
  1410. line_bolds = [] # bold indicator per line above, same sorting
  1411. # walk through the spans and fill above 3 lists
  1412. for i in range(len(spans)):
  1413. s = spans[i]
  1414. y1 = s["bbox"][3] # span bottom
  1415. h = y1 - s["bbox"][1] # span bbox height
  1416. bold = s["flags"] & TEXT_FONT_BOLD
  1417. # use first item to start the lists
  1418. if i == 0:
  1419. select.append(y1)
  1420. line_heights.append(h)
  1421. line_bolds.append(bold)
  1422. continue
  1423. # get previous items from the 3 lists
  1424. y0 = select[-1]
  1425. h0 = line_heights[-1]
  1426. bold0 = line_bolds[-1]
  1427. if bold0 and not bold:
  1428. break # stop if switching from bold to non-bold
  1429. # if fitting in height of previous span, modify bbox
  1430. if y0 - y1 <= y_delta or abs((y0 - h0) - s["bbox"][1]) <= y_delta:
  1431. s["bbox"] = (s["bbox"][0], y0 - h0, s["bbox"][2], y0)
  1432. spans[i] = s
  1433. if bold:
  1434. line_bolds[-1] = bold
  1435. continue
  1436. elif y0 - y1 > 1.5 * h0:
  1437. break # stop if distance to previous line too large
  1438. select.append(y1)
  1439. line_heights.append(h)
  1440. line_bolds.append(bold)
  1441. if select == []: # nothing above the table?
  1442. return header_top_row
  1443. select = select[:5] # accept up to 5 lines for an external header
  1444. # assume top row as header if text above is too far away
  1445. if bbox.y0 - select[0] >= line_heights[0]:
  1446. return header_top_row
  1447. # accept top row as header if bold, but line above is not
  1448. if top_row_bold and not line_bolds[0]:
  1449. return header_top_row
  1450. if spans == []: # nothing left above the table, return top row
  1451. return header_top_row
  1452. # re-compute clip above table
  1453. nclip = EMPTY_RECT()
  1454. for s in [s for s in spans if s["bbox"][3] >= select[-1]]:
  1455. nclip |= s["bbox"]
  1456. if not nclip.is_empty:
  1457. clip = nclip
  1458. clip.y1 = bbox.y0 # make sure we still include every word above
  1459. # Confirm that no word in clip is intersecting a column separator
  1460. word_rects = [Rect(w[:4]) for w in page.get_text("words", clip=clip)]
  1461. word_tops = sorted(list(set([r[1] for r in word_rects])), reverse=True)
  1462. select = []
  1463. # exclude lines with words that intersect a column border
  1464. for top in word_tops:
  1465. intersecting = [
  1466. (x, r)
  1467. for x in col_x
  1468. if x is not None
  1469. for r in word_rects
  1470. if r[1] == top and r[0] < x and r[2] > x
  1471. ]
  1472. if intersecting == []:
  1473. select.append(top)
  1474. else: # detected a word crossing a column border
  1475. break
  1476. if select == []: # nothing left over: return first row
  1477. return header_top_row
  1478. hdr_bbox = +clip # compute the header cells
  1479. hdr_bbox.y0 = select[-1] # hdr_bbox top is smallest top coord of words
  1480. hdr_cells = [
  1481. (c[0], hdr_bbox.y0, c[2], hdr_bbox.y1) if c is not None else None
  1482. for c in cells
  1483. ]
  1484. # adjust left/right of header bbox
  1485. hdr_bbox.x0 = self.bbox[0]
  1486. hdr_bbox.x1 = self.bbox[2]
  1487. # column names: no line breaks, no excess spaces
  1488. hdr_names = [
  1489. (
  1490. page.get_textbox(c).replace("\n", " ").replace(" ", " ").strip()
  1491. if c is not None
  1492. else ""
  1493. )
  1494. for c in hdr_cells
  1495. ]
  1496. return TableHeader(tuple(hdr_bbox), hdr_cells, hdr_names, True)
  1497. @dataclass
  1498. class TableSettings:
  1499. vertical_strategy: str = "lines"
  1500. horizontal_strategy: str = "lines"
  1501. explicit_vertical_lines: list = None
  1502. explicit_horizontal_lines: list = None
  1503. snap_tolerance: float = DEFAULT_SNAP_TOLERANCE
  1504. snap_x_tolerance: float = UNSET
  1505. snap_y_tolerance: float = UNSET
  1506. join_tolerance: float = DEFAULT_JOIN_TOLERANCE
  1507. join_x_tolerance: float = UNSET
  1508. join_y_tolerance: float = UNSET
  1509. edge_min_length: float = 3
  1510. min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL
  1511. min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL
  1512. intersection_tolerance: float = 3
  1513. intersection_x_tolerance: float = UNSET
  1514. intersection_y_tolerance: float = UNSET
  1515. text_settings: dict = None
  1516. def __post_init__(self) -> "TableSettings":
  1517. """Clean up user-provided table settings.
  1518. Validates that the table settings provided consists of acceptable values and
  1519. returns a cleaned up version. The cleaned up version fills out the missing
  1520. values with the default values in the provided settings.
  1521. TODO: Can be further used to validate that the values are of the correct
  1522. type. For example, raising a value error when a non-boolean input is
  1523. provided for the key ``keep_blank_chars``.
  1524. :param table_settings: User-provided table settings.
  1525. :returns: A cleaned up version of the user-provided table settings.
  1526. :raises ValueError: When an unrecognised key is provided.
  1527. """
  1528. for setting in NON_NEGATIVE_SETTINGS:
  1529. if (getattr(self, setting) or 0) < 0:
  1530. raise ValueError(f"Table setting '{setting}' cannot be negative")
  1531. for orientation in ["horizontal", "vertical"]:
  1532. strategy = getattr(self, orientation + "_strategy")
  1533. if strategy not in TABLE_STRATEGIES:
  1534. raise ValueError(
  1535. f"{orientation}_strategy must be one of"
  1536. f'{{{",".join(TABLE_STRATEGIES)}}}'
  1537. )
  1538. if self.text_settings is None:
  1539. self.text_settings = {}
  1540. # This next section is for backwards compatibility
  1541. for attr in ["x_tolerance", "y_tolerance"]:
  1542. if attr not in self.text_settings:
  1543. self.text_settings[attr] = self.text_settings.get("tolerance", 3)
  1544. if "tolerance" in self.text_settings:
  1545. del self.text_settings["tolerance"]
  1546. # End of that section
  1547. for attr, fallback in [
  1548. ("snap_x_tolerance", "snap_tolerance"),
  1549. ("snap_y_tolerance", "snap_tolerance"),
  1550. ("join_x_tolerance", "join_tolerance"),
  1551. ("join_y_tolerance", "join_tolerance"),
  1552. ("intersection_x_tolerance", "intersection_tolerance"),
  1553. ("intersection_y_tolerance", "intersection_tolerance"),
  1554. ]:
  1555. if getattr(self, attr) is UNSET:
  1556. setattr(self, attr, getattr(self, fallback))
  1557. return self
  1558. @classmethod
  1559. def resolve(cls, settings=None):
  1560. if settings is None:
  1561. return cls()
  1562. elif isinstance(settings, cls):
  1563. return settings
  1564. elif isinstance(settings, dict):
  1565. core_settings = {}
  1566. text_settings = {}
  1567. for k, v in settings.items():
  1568. if k[:5] == "text_":
  1569. text_settings[k[5:]] = v
  1570. else:
  1571. core_settings[k] = v
  1572. core_settings["text_settings"] = text_settings
  1573. return cls(**core_settings)
  1574. else:
  1575. raise ValueError(f"Cannot resolve settings: {settings}")
  1576. class TableFinder:
  1577. """
  1578. Given a PDF page, find plausible table structures.
  1579. Largely borrowed from Anssi Nurminen's master's thesis:
  1580. http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
  1581. ... and inspired by Tabula:
  1582. https://github.com/tabulapdf/tabula-extractor/issues/16
  1583. """
  1584. def __init__(self, page, settings=None):
  1585. self.page = weakref.proxy(page)
  1586. self.settings = TableSettings.resolve(settings)
  1587. self.edges = self.get_edges()
  1588. self.intersections = edges_to_intersections(
  1589. self.edges,
  1590. self.settings.intersection_x_tolerance,
  1591. self.settings.intersection_y_tolerance,
  1592. )
  1593. self.cells = intersections_to_cells(self.intersections)
  1594. self.tables = [
  1595. Table(self.page, cell_group)
  1596. for cell_group in cells_to_tables(self.page, self.cells)
  1597. ]
  1598. def get_edges(self) -> list:
  1599. settings = self.settings
  1600. for orientation in ["vertical", "horizontal"]:
  1601. strategy = getattr(settings, orientation + "_strategy")
  1602. if strategy == "explicit":
  1603. lines = getattr(settings, "explicit_" + orientation + "_lines")
  1604. if len(lines) < 2:
  1605. raise ValueError(
  1606. f"If {orientation}_strategy == 'explicit', "
  1607. f"explicit_{orientation}_lines "
  1608. f"must be specified as a list/tuple of two or more "
  1609. f"floats/ints."
  1610. )
  1611. v_strat = settings.vertical_strategy
  1612. h_strat = settings.horizontal_strategy
  1613. if v_strat == "text" or h_strat == "text":
  1614. words = extract_words(CHARS, **(settings.text_settings or {}))
  1615. else:
  1616. words = []
  1617. v_explicit = []
  1618. for desc in settings.explicit_vertical_lines or []:
  1619. if isinstance(desc, dict):
  1620. for e in obj_to_edges(desc):
  1621. if e["orientation"] == "v":
  1622. v_explicit.append(e)
  1623. else:
  1624. v_explicit.append(
  1625. {
  1626. "x0": desc,
  1627. "x1": desc,
  1628. "top": self.page.rect[1],
  1629. "bottom": self.page.rect[3],
  1630. "height": self.page.rect[3] - self.page.rect[1],
  1631. "orientation": "v",
  1632. }
  1633. )
  1634. if v_strat == "lines":
  1635. v_base = filter_edges(EDGES, "v")
  1636. elif v_strat == "lines_strict":
  1637. v_base = filter_edges(EDGES, "v", edge_type="line")
  1638. elif v_strat == "text":
  1639. v_base = words_to_edges_v(words, word_threshold=settings.min_words_vertical)
  1640. elif v_strat == "explicit":
  1641. v_base = []
  1642. else:
  1643. v_base = []
  1644. v = v_base + v_explicit
  1645. h_explicit = []
  1646. for desc in settings.explicit_horizontal_lines or []:
  1647. if isinstance(desc, dict):
  1648. for e in obj_to_edges(desc):
  1649. if e["orientation"] == "h":
  1650. h_explicit.append(e)
  1651. else:
  1652. h_explicit.append(
  1653. {
  1654. "x0": self.page.rect[0],
  1655. "x1": self.page.rect[2],
  1656. "width": self.page.rect[2] - self.page.rect[0],
  1657. "top": desc,
  1658. "bottom": desc,
  1659. "orientation": "h",
  1660. }
  1661. )
  1662. if h_strat == "lines":
  1663. h_base = filter_edges(EDGES, "h")
  1664. elif h_strat == "lines_strict":
  1665. h_base = filter_edges(EDGES, "h", edge_type="line")
  1666. elif h_strat == "text":
  1667. h_base = words_to_edges_h(
  1668. words, word_threshold=settings.min_words_horizontal
  1669. )
  1670. elif h_strat == "explicit":
  1671. h_base = []
  1672. else:
  1673. h_base = []
  1674. h = h_base + h_explicit
  1675. edges = list(v) + list(h)
  1676. edges = merge_edges(
  1677. edges,
  1678. snap_x_tolerance=settings.snap_x_tolerance,
  1679. snap_y_tolerance=settings.snap_y_tolerance,
  1680. join_x_tolerance=settings.join_x_tolerance,
  1681. join_y_tolerance=settings.join_y_tolerance,
  1682. )
  1683. return filter_edges(edges, min_length=settings.edge_min_length)
  1684. def __getitem__(self, i):
  1685. tcount = len(self.tables)
  1686. if i >= tcount:
  1687. raise IndexError("table not on page")
  1688. while i < 0:
  1689. i += tcount
  1690. return self.tables[i]
"""
Start of PyMuPDF interface code.

The following functions are executed when "page.find_tables()" is called.

* make_chars: Fills the CHARS list with text character information extracted
              via "rawdict" text extraction. Items in CHARS are formatted
              as expected by the table code.
* make_edges: Fills the EDGES list with vector graphic information extracted
              via "get_drawings". Items in EDGES are formatted as expected
              by the table code.

The lists CHARS and EDGES are used in place of the document access performed
by pdfplumber or pdfminer.
The table code has been modified to use these lists instead of accessing
page information itself.
"""
  1705. # -----------------------------------------------------------------------------
  1706. # Extract all page characters to fill the CHARS list
  1707. # -----------------------------------------------------------------------------
  1708. def make_chars(page, clip=None):
  1709. """Extract text as "rawdict" to fill CHARS."""
  1710. global TEXTPAGE
  1711. page_number = page.number + 1
  1712. page_height = page.rect.height
  1713. ctm = page.transformation_matrix
  1714. TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS)
  1715. blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
  1716. doctop_base = page_height * page.number
  1717. for block in blocks:
  1718. for line in block["lines"]:
  1719. ldir = line["dir"] # = (cosine, sine) of angle
  1720. ldir = (round(ldir[0], 4), round(ldir[1], 4))
  1721. matrix = Matrix(ldir[0], -ldir[1], ldir[1], ldir[0], 0, 0)
  1722. if ldir[1] == 0:
  1723. upright = True
  1724. else:
  1725. upright = False
  1726. for span in sorted(line["spans"], key=lambda s: s["bbox"][0]):
  1727. fontname = span["font"]
  1728. fontsize = span["size"]
  1729. color = sRGB_to_pdf(span["color"])
  1730. for char in sorted(span["chars"], key=lambda c: c["bbox"][0]):
  1731. bbox = Rect(char["bbox"])
  1732. bbox_ctm = bbox * ctm
  1733. origin = Point(char["origin"]) * ctm
  1734. matrix.e = origin.x
  1735. matrix.f = origin.y
  1736. text = char["c"]
  1737. char_dict = {
  1738. "adv": bbox.x1 - bbox.x0 if upright else bbox.y1 - bbox.y0,
  1739. "bottom": bbox.y1,
  1740. "doctop": bbox.y0 + doctop_base,
  1741. "fontname": fontname,
  1742. "height": bbox.y1 - bbox.y0,
  1743. "matrix": tuple(matrix),
  1744. "ncs": "DeviceRGB",
  1745. "non_stroking_color": color,
  1746. "non_stroking_pattern": None,
  1747. "object_type": "char",
  1748. "page_number": page_number,
  1749. "size": fontsize if upright else bbox.y1 - bbox.y0,
  1750. "stroking_color": color,
  1751. "stroking_pattern": None,
  1752. "text": text,
  1753. "top": bbox.y0,
  1754. "upright": upright,
  1755. "width": bbox.x1 - bbox.x0,
  1756. "x0": bbox.x0,
  1757. "x1": bbox.x1,
  1758. "y0": bbox_ctm.y0,
  1759. "y1": bbox_ctm.y1,
  1760. }
  1761. CHARS.append(char_dict)
  1762. # ------------------------------------------------------------------------
  1763. # Extract all page vector graphics to fill the EDGES list.
  1764. # We are ignoring Bézier curves completely and are converting everything
  1765. # else to lines.
  1766. # ------------------------------------------------------------------------
  1767. def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
  1768. snap_x = tset.snap_x_tolerance
  1769. snap_y = tset.snap_y_tolerance
  1770. min_length = tset.edge_min_length
  1771. lines_strict = (
  1772. tset.vertical_strategy == "lines_strict"
  1773. or tset.horizontal_strategy == "lines_strict"
  1774. )
  1775. page_height = page.rect.height
  1776. doctop_basis = page.number * page_height
  1777. page_number = page.number + 1
  1778. prect = page.rect
  1779. if page.rotation in (90, 270):
  1780. w, h = prect.br
  1781. prect = Rect(0, 0, h, w)
  1782. if clip is not None:
  1783. clip = Rect(clip)
  1784. else:
  1785. clip = prect
  1786. def are_neighbors(r1, r2):
  1787. """Detect whether r1, r2 are neighbors.
  1788. Defined as:
  1789. The minimum distance between points of r1 and points of r2 is not
  1790. larger than some delta.
  1791. This check supports empty rect-likes and thus also lines.
  1792. Note:
  1793. This type of check is MUCH faster than native Rect containment checks.
  1794. """
  1795. if ( # check if x-coordinates of r1 are within those of r2
  1796. r2.x0 - snap_x <= r1.x0 <= r2.x1 + snap_x
  1797. or r2.x0 - snap_x <= r1.x1 <= r2.x1 + snap_x
  1798. ) and ( # ... same for y-coordinates
  1799. r2.y0 - snap_y <= r1.y0 <= r2.y1 + snap_y
  1800. or r2.y0 - snap_y <= r1.y1 <= r2.y1 + snap_y
  1801. ):
  1802. return True
  1803. # same check with r1 / r2 exchanging their roles (this is necessary!)
  1804. if (
  1805. r1.x0 - snap_x <= r2.x0 <= r1.x1 + snap_x
  1806. or r1.x0 - snap_x <= r2.x1 <= r1.x1 + snap_x
  1807. ) and (
  1808. r1.y0 - snap_y <= r2.y0 <= r1.y1 + snap_y
  1809. or r1.y0 - snap_y <= r2.y1 <= r1.y1 + snap_y
  1810. ):
  1811. return True
  1812. return False
  1813. def clean_graphics(npaths=None):
  1814. """Detect and join rectangles of "connected" vector graphics."""
  1815. if npaths is None:
  1816. allpaths = page.get_drawings()
  1817. else: # accept passed-in vector graphics
  1818. allpaths = npaths[:] # paths relevant for table detection
  1819. paths = []
  1820. for p in allpaths:
  1821. # If only looking at lines, we ignore fill-only paths,
  1822. # except simulated lines (i.e. small width or height).
  1823. if (
  1824. lines_strict
  1825. and p["type"] == "f"
  1826. and p["rect"].width > snap_x
  1827. and p["rect"].height > snap_y
  1828. ):
  1829. continue
  1830. paths.append(p)
  1831. # start with all vector graphics rectangles
  1832. prects = sorted(set([p["rect"] for p in paths]), key=lambda r: (r.y1, r.x0))
  1833. new_rects = [] # the final list of joined rectangles
  1834. # ----------------------------------------------------------------
  1835. # Strategy: Join rectangles that "almost touch" each other.
  1836. # Extend first rectangle with any other that is a "neighbor".
  1837. # Then move it to the final list and continue with the rest.
  1838. # ----------------------------------------------------------------
  1839. while prects: # the algorithm will empty this list
  1840. prect0 = prects[0] # copy of first rectangle (performance reasons!)
  1841. repeat = True
  1842. while repeat: # this loop extends first rect in list
  1843. repeat = False # set to true again if some other rect touches
  1844. for i in range(len(prects) - 1, 0, -1): # run backwards
  1845. if are_neighbors(prect0, prects[i]): # close enough to rect 0?
  1846. prect0 |= prects[i].tl # extend rect 0
  1847. prect0 |= prects[i].br # extend rect 0
  1848. del prects[i] # delete this rect
  1849. repeat = True # keep checking the rest
  1850. # move rect 0 over to result list if there is some text in it
  1851. if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
  1852. # contains text, so accept it as a table bbox candidate
  1853. new_rects.append(prect0)
  1854. del prects[0] # remove from rect list
  1855. return new_rects, paths
  1856. bboxes, paths = clean_graphics(npaths=paths)
  1857. def is_parallel(p1, p2):
  1858. """Check if line is roughly axis-parallel."""
  1859. if abs(p1.x - p2.x) <= snap_x or abs(p1.y - p2.y) <= snap_y:
  1860. return True
  1861. return False
  1862. def make_line(p, p1, p2, clip):
  1863. """Given 2 points, make a line dictionary for table detection."""
  1864. if not is_parallel(p1, p2): # only accepting axis-parallel lines
  1865. return {}
  1866. # compute the extremal values
  1867. x0 = min(p1.x, p2.x)
  1868. x1 = max(p1.x, p2.x)
  1869. y0 = min(p1.y, p2.y)
  1870. y1 = max(p1.y, p2.y)
  1871. # check for outside clip
  1872. if x0 > clip.x1 or x1 < clip.x0 or y0 > clip.y1 or y1 < clip.y0:
  1873. return {}
  1874. if x0 < clip.x0:
  1875. x0 = clip.x0 # adjust to clip boundary
  1876. if x1 > clip.x1:
  1877. x1 = clip.x1 # adjust to clip boundary
  1878. if y0 < clip.y0:
  1879. y0 = clip.y0 # adjust to clip boundary
  1880. if y1 > clip.y1:
  1881. y1 = clip.y1 # adjust to clip boundary
  1882. width = x1 - x0 # from adjusted values
  1883. height = y1 - y0 # from adjusted values
  1884. if width == height == 0:
  1885. return {} # nothing left to deal with
  1886. line_dict = {
  1887. "x0": x0,
  1888. "y0": page_height - y0,
  1889. "x1": x1,
  1890. "y1": page_height - y1,
  1891. "width": width,
  1892. "height": height,
  1893. "pts": [(x0, y0), (x1, y1)],
  1894. "linewidth": p["width"],
  1895. "stroke": True,
  1896. "fill": False,
  1897. "evenodd": False,
  1898. "stroking_color": p["color"] if p["color"] else p["fill"],
  1899. "non_stroking_color": None,
  1900. "object_type": "line",
  1901. "page_number": page_number,
  1902. "stroking_pattern": None,
  1903. "non_stroking_pattern": None,
  1904. "top": y0,
  1905. "bottom": y1,
  1906. "doctop": y0 + doctop_basis,
  1907. }
  1908. return line_dict
  1909. for p in paths:
  1910. items = p["items"] # items in this path
  1911. # if 'closePath', add a line from last to first point
  1912. if p["closePath"] and items[0][0] == "l" and items[-1][0] == "l":
  1913. items.append(("l", items[-1][2], items[0][1]))
  1914. for i in items:
  1915. if i[0] not in ("l", "re", "qu"):
  1916. continue # ignore anything else
  1917. if i[0] == "l": # a line
  1918. p1, p2 = i[1:]
  1919. line_dict = make_line(p, p1, p2, clip)
  1920. if line_dict:
  1921. EDGES.append(line_to_edge(line_dict))
  1922. elif i[0] == "re":
  1923. # A rectangle: decompose into 4 lines, but filter out
  1924. # the ones that simulate a line
  1925. rect = i[1].normalize() # normalize the rectangle
  1926. if (
  1927. rect.width <= min_length and rect.width < rect.height
  1928. ): # simulates a vertical line
  1929. x = abs(rect.x1 + rect.x0) / 2 # take middle value for x
  1930. p1 = Point(x, rect.y0)
  1931. p2 = Point(x, rect.y1)
  1932. line_dict = make_line(p, p1, p2, clip)
  1933. if line_dict:
  1934. EDGES.append(line_to_edge(line_dict))
  1935. continue
  1936. if (
  1937. rect.height <= min_length and rect.height < rect.width
  1938. ): # simulates a horizontal line
  1939. y = abs(rect.y1 + rect.y0) / 2 # take middle value for y
  1940. p1 = Point(rect.x0, y)
  1941. p2 = Point(rect.x1, y)
  1942. line_dict = make_line(p, p1, p2, clip)
  1943. if line_dict:
  1944. EDGES.append(line_to_edge(line_dict))
  1945. continue
  1946. line_dict = make_line(p, rect.tl, rect.bl, clip)
  1947. if line_dict:
  1948. EDGES.append(line_to_edge(line_dict))
  1949. line_dict = make_line(p, rect.bl, rect.br, clip)
  1950. if line_dict:
  1951. EDGES.append(line_to_edge(line_dict))
  1952. line_dict = make_line(p, rect.br, rect.tr, clip)
  1953. if line_dict:
  1954. EDGES.append(line_to_edge(line_dict))
  1955. line_dict = make_line(p, rect.tr, rect.tl, clip)
  1956. if line_dict:
  1957. EDGES.append(line_to_edge(line_dict))
  1958. else: # must be a quad
  1959. # we convert it into (up to) 4 lines
  1960. ul, ur, ll, lr = i[1]
  1961. line_dict = make_line(p, ul, ll, clip)
  1962. if line_dict:
  1963. EDGES.append(line_to_edge(line_dict))
  1964. line_dict = make_line(p, ll, lr, clip)
  1965. if line_dict:
  1966. EDGES.append(line_to_edge(line_dict))
  1967. line_dict = make_line(p, lr, ur, clip)
  1968. if line_dict:
  1969. EDGES.append(line_to_edge(line_dict))
  1970. line_dict = make_line(p, ur, ul, clip)
  1971. if line_dict:
  1972. EDGES.append(line_to_edge(line_dict))
  1973. path = {"color": (0, 0, 0), "fill": None, "width": 1}
  1974. for bbox in bboxes: # add the border lines for all enveloping bboxes
  1975. line_dict = make_line(path, bbox.tl, bbox.tr, clip)
  1976. if line_dict:
  1977. EDGES.append(line_to_edge(line_dict))
  1978. line_dict = make_line(path, bbox.bl, bbox.br, clip)
  1979. if line_dict:
  1980. EDGES.append(line_to_edge(line_dict))
  1981. line_dict = make_line(path, bbox.tl, bbox.bl, clip)
  1982. if line_dict:
  1983. EDGES.append(line_to_edge(line_dict))
  1984. line_dict = make_line(path, bbox.tr, bbox.br, clip)
  1985. if line_dict:
  1986. EDGES.append(line_to_edge(line_dict))
  1987. if add_lines is not None: # add user-specified lines
  1988. assert isinstance(add_lines, (tuple, list))
  1989. else:
  1990. add_lines = []
  1991. for p1, p2 in add_lines:
  1992. p1 = Point(p1)
  1993. p2 = Point(p2)
  1994. line_dict = make_line(path, p1, p2, clip)
  1995. if line_dict:
  1996. EDGES.append(line_to_edge(line_dict))
  1997. if add_boxes is not None: # add user-specified rectangles
  1998. assert isinstance(add_boxes, (tuple, list))
  1999. else:
  2000. add_boxes = []
  2001. for box in add_boxes:
  2002. r = Rect(box)
  2003. line_dict = make_line(path, r.tl, r.bl, clip)
  2004. if line_dict:
  2005. EDGES.append(line_to_edge(line_dict))
  2006. line_dict = make_line(path, r.bl, r.br, clip)
  2007. if line_dict:
  2008. EDGES.append(line_to_edge(line_dict))
  2009. line_dict = make_line(path, r.br, r.tr, clip)
  2010. if line_dict:
  2011. EDGES.append(line_to_edge(line_dict))
  2012. line_dict = make_line(path, r.tr, r.tl, clip)
  2013. if line_dict:
  2014. EDGES.append(line_to_edge(line_dict))
  2015. def page_rotation_set0(page):
  2016. """Nullify page rotation.
  2017. To correctly detect tables, page rotation must be zero.
  2018. This function performs the necessary adjustments and returns information
  2019. for reverting this changes.
  2020. """
  2021. mediabox = page.mediabox
  2022. rot = page.rotation # contains normalized rotation value
  2023. # need to derotate the page's content
  2024. mb = page.mediabox # current mediabox
  2025. if rot == 90:
  2026. # before derotation, shift content horizontally
  2027. mat0 = Matrix(1, 0, 0, 1, mb.y1 - mb.x1 - mb.x0 - mb.y0, 0)
  2028. elif rot == 270:
  2029. # before derotation, shift content vertically
  2030. mat0 = Matrix(1, 0, 0, 1, 0, mb.x1 - mb.y1 - mb.y0 - mb.x0)
  2031. else:
  2032. mat0 = Matrix(1, 0, 0, 1, -2 * mb.x0, -2 * mb.y0)
  2033. # prefix with derotation matrix
  2034. mat = mat0 * page.derotation_matrix
  2035. cmd = b"%g %g %g %g %g %g cm " % tuple(mat)
  2036. xref = TOOLS._insert_contents(page, cmd, 0)
  2037. # swap x- and y-coordinates
  2038. if rot in (90, 270):
  2039. x0, y0, x1, y1 = mb
  2040. mb.x0 = y0
  2041. mb.y0 = x0
  2042. mb.x1 = y1
  2043. mb.y1 = x1
  2044. page.set_mediabox(mb)
  2045. page.set_rotation(0)
  2046. # refresh the page to apply these changes
  2047. doc = page.parent
  2048. pno = page.number
  2049. page = doc[pno]
  2050. return page, xref, rot, mediabox
  2051. def page_rotation_reset(page, xref, rot, mediabox):
  2052. """Reset page rotation to original values.
  2053. To be used before we return tables."""
  2054. doc = page.parent # document of the page
  2055. doc.update_stream(xref, b" ") # remove de-rotation matrix
  2056. page.set_mediabox(mediabox) # set mediabox to old value
  2057. page.set_rotation(rot) # set rotation to old value
  2058. pno = page.number
  2059. page = doc[pno] # update page info
  2060. return page
  2061. def find_tables(
  2062. page,
  2063. clip=None,
  2064. vertical_strategy: str = "lines",
  2065. horizontal_strategy: str = "lines",
  2066. vertical_lines: list = None,
  2067. horizontal_lines: list = None,
  2068. snap_tolerance: float = DEFAULT_SNAP_TOLERANCE,
  2069. snap_x_tolerance: float = None,
  2070. snap_y_tolerance: float = None,
  2071. join_tolerance: float = DEFAULT_JOIN_TOLERANCE,
  2072. join_x_tolerance: float = None,
  2073. join_y_tolerance: float = None,
  2074. edge_min_length: float = 3,
  2075. min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL,
  2076. min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL,
  2077. intersection_tolerance: float = 3,
  2078. intersection_x_tolerance: float = None,
  2079. intersection_y_tolerance: float = None,
  2080. text_tolerance=3,
  2081. text_x_tolerance=3,
  2082. text_y_tolerance=3,
  2083. strategy=None, # offer abbreviation
  2084. add_lines=None, # user-specified lines
  2085. add_boxes=None, # user-specified rectangles
  2086. paths=None, # accept vector graphics as parameter
  2087. ):
  2088. global CHARS, EDGES
  2089. CHARS = []
  2090. EDGES = []
  2091. old_small = bool(TOOLS.set_small_glyph_heights()) # save old value
  2092. TOOLS.set_small_glyph_heights(True) # we need minimum bboxes
  2093. if page.rotation != 0:
  2094. page, old_xref, old_rot, old_mediabox = page_rotation_set0(page)
  2095. else:
  2096. old_xref, old_rot, old_mediabox = None, None, None
  2097. if snap_x_tolerance is None:
  2098. snap_x_tolerance = UNSET
  2099. if snap_y_tolerance is None:
  2100. snap_y_tolerance = UNSET
  2101. if join_x_tolerance is None:
  2102. join_x_tolerance = UNSET
  2103. if join_y_tolerance is None:
  2104. join_y_tolerance = UNSET
  2105. if intersection_x_tolerance is None:
  2106. intersection_x_tolerance = UNSET
  2107. if intersection_y_tolerance is None:
  2108. intersection_y_tolerance = UNSET
  2109. if strategy is not None:
  2110. vertical_strategy = strategy
  2111. horizontal_strategy = strategy
  2112. settings = {
  2113. "vertical_strategy": vertical_strategy,
  2114. "horizontal_strategy": horizontal_strategy,
  2115. "explicit_vertical_lines": vertical_lines,
  2116. "explicit_horizontal_lines": horizontal_lines,
  2117. "snap_tolerance": snap_tolerance,
  2118. "snap_x_tolerance": snap_x_tolerance,
  2119. "snap_y_tolerance": snap_y_tolerance,
  2120. "join_tolerance": join_tolerance,
  2121. "join_x_tolerance": join_x_tolerance,
  2122. "join_y_tolerance": join_y_tolerance,
  2123. "edge_min_length": edge_min_length,
  2124. "min_words_vertical": min_words_vertical,
  2125. "min_words_horizontal": min_words_horizontal,
  2126. "intersection_tolerance": intersection_tolerance,
  2127. "intersection_x_tolerance": intersection_x_tolerance,
  2128. "intersection_y_tolerance": intersection_y_tolerance,
  2129. "text_tolerance": text_tolerance,
  2130. "text_x_tolerance": text_x_tolerance,
  2131. "text_y_tolerance": text_y_tolerance,
  2132. }
  2133. tset = TableSettings.resolve(settings=settings)
  2134. page.table_settings = tset
  2135. make_chars(page, clip=clip) # create character list of page
  2136. make_edges(
  2137. page,
  2138. clip=clip,
  2139. tset=tset,
  2140. paths=paths,
  2141. add_lines=add_lines,
  2142. add_boxes=add_boxes,
  2143. ) # create lines and curves
  2144. tables = TableFinder(page, settings=tset)
  2145. TOOLS.set_small_glyph_heights(old_small)
  2146. if old_xref is not None:
  2147. page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
  2148. return tables