textpage.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
  2. # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
  3. __all__ = ("PdfTextPage", "PdfTextSearcher")
  4. import ctypes
  5. import logging
  6. import warnings
  7. import pypdfium2.raw as pdfium_c
  8. import pypdfium2.internal as pdfium_i
  9. from pypdfium2._helpers.misc import PdfiumError
  10. from pypdfium2.version import PDFIUM_INFO
  11. c_double = ctypes.c_double
  12. logger = logging.getLogger(__name__)
  13. class PdfTextPage (pdfium_i.AutoCloseable):
  14. """
  15. Text page helper class.
  16. Attributes:
  17. raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle.
  18. page (PdfPage): Reference to the page this textpage belongs to.
  19. """
  20. def __init__(self, raw, page):
  21. self.raw = raw
  22. self.page = page
  23. super().__init__(pdfium_c.FPDFText_ClosePage)
  24. @property
  25. def parent(self): # AutoCloseable hook
  26. return self.page
  27. def _get_active_text_range(self, c_start, c_end, l_passive=0, r_passive=0):
  28. if c_start > c_end:
  29. return 0 # no active chars in range
  30. t_start = pdfium_c.FPDFText_GetTextIndexFromCharIndex(self, c_start)
  31. if t_start == -1:
  32. return self._get_active_text_range(c_start+1, c_end, l_passive+1, r_passive)
  33. t_end = pdfium_c.FPDFText_GetTextIndexFromCharIndex(self, c_end)
  34. if t_end == -1:
  35. return self._get_active_text_range(c_start, c_end-1, l_passive, r_passive+1)
  36. return t_start, t_end, l_passive, r_passive
  37. def get_text_range(self, index=0, count=-1, errors="ignore", force_this=False):
  38. """
  39. Warning:
  40. .. versionchanged:: 4.28
  41. For various reasons, calling this method with default params now implicitly translates to :meth:`.get_text_bounded` (pass ``force_this=True`` to circumvent).
  42. Extract text from a given range.
  43. Parameters:
  44. index (int): Index of the first char to include.
  45. count (int): Number of chars to cover, relative to the internal char list. Defaults to -1 for all remaining chars after *index*.
  46. errors (str): Error handling when decoding the data (see :meth:`bytes.decode`).
  47. Returns:
  48. str: The text in the range in question, or an empty string if no text was found.
  49. Note:
  50. * The returned text's length does not have to match *count*, even if it will for most PDFs.
  51. This is because the underlying API may exclude/insert chars compared to the internal list, although rare in practice.
  52. This means, if the char at ``i`` is excluded, ``get_text_range(i, 2)[1]`` will raise an index error.
  53. Pdfium provides raw APIs ``FPDFText_GetTextIndexFromCharIndex()`` / ``FPDFText_GetCharIndexFromTextIndex()`` to translate between the two views and identify excluded/inserted chars.
  54. * In case of leading/trailing excluded characters, pypdfium2 modifies *index* and *count* accordingly to prevent pdfium from unexpectedly reading beyond ``range(index, index+count)``.
  55. """
  56. # https://github.com/pypdfium2-team/pypdfium2/issues/298
  57. # https://crbug.com/pdfium/2133
  58. if (index, count) == (0, -1) and not force_this:
  59. warnings.warn("get_text_range() call with default params will be implicitly redirected to get_text_bounded()")
  60. return self.get_text_bounded(errors=errors)
  61. if count == -1:
  62. count = self.count_chars() - index
  63. # https://github.com/pypdfium2-team/pypdfium2/issues/261
  64. # https://crbug.com/pdfium/2079
  65. active_range = self._get_active_text_range(index, index+count-1)
  66. if active_range == 0:
  67. return ""
  68. # NOTE since we have converted indices from char to text, they will shift accordingly for inserted/excluded chars, so this will calculate the exact output count
  69. t_start, t_end, l_passive, r_passive = active_range
  70. index += l_passive
  71. count -= l_passive + r_passive
  72. in_count = t_end+1 - t_start
  73. # pdfium fea01fa9e2 (>6167) to d6a4b27d80 (<6415) requires assuming 4 bytes per character
  74. # https://github.com/pypdfium2-team/pypdfium2/issues/298
  75. # https://crbug.com/pdfium/2133
  76. if 6167 < PDFIUM_INFO.build < 6415:
  77. in_count *= 2
  78. in_count += 1 # null terminator
  79. buffer = ctypes.create_string_buffer(in_count * 2)
  80. buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort))
  81. out_count = pdfium_c.FPDFText_GetText(self, index, count, buffer_ptr)
  82. assert in_count >= out_count, f"Buffer too small: {in_count} vs {out_count}"
  83. return buffer.raw[:(out_count-1)*2].decode("utf-16-le", errors=errors)
  84. def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore"):
  85. """
  86. Extract text from given boundaries in PDF coordinates.
  87. If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`.
  88. Parameters:
  89. errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`).
  90. Returns:
  91. str: The text on the page area in question, or an empty string if no text was found.
  92. """
  93. bbox = self.page.get_bbox()
  94. if left is None:
  95. left = bbox[0]
  96. if bottom is None:
  97. bottom = bbox[1]
  98. if right is None:
  99. right = bbox[2]
  100. if top is None:
  101. top = bbox[3]
  102. args = (self, left, top, right, bottom)
  103. n_chars = pdfium_c.FPDFText_GetBoundedText(*args, None, 0)
  104. if n_chars <= 0:
  105. return ""
  106. buffer = ctypes.create_string_buffer(n_chars * 2)
  107. buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort))
  108. pdfium_c.FPDFText_GetBoundedText(*args, buffer_ptr, n_chars)
  109. return buffer.raw.decode("utf-16-le", errors=errors)
  110. def count_chars(self):
  111. """
  112. Returns:
  113. int: The number of characters on the text page.
  114. """
  115. n_chars = pdfium_c.FPDFText_CountChars(self)
  116. if n_chars == -1:
  117. raise PdfiumError("Failed to get character count.")
  118. return n_chars
  119. def count_rects(self, index=0, count=-1):
  120. """
  121. Parameters:
  122. index (int): Start character index.
  123. count (int): Character count to consider (defaults to -1 for all remaining).
  124. Returns:
  125. int: The number of text rectangles in the given character range.
  126. """
  127. n_rects = pdfium_c.FPDFText_CountRects(self, index, count)
  128. if n_rects == -1:
  129. raise PdfiumError("Failed to count rectangles.")
  130. return n_rects
  131. def get_index(self, x, y, x_tol, y_tol):
  132. """
  133. Get the index of a character by position.
  134. Parameters:
  135. x (float): Horizontal position (in PDF canvas units).
  136. y (float): Vertical position.
  137. x_tol (float): Horizontal tolerance.
  138. y_tol (float): Vertical tolerance.
  139. Returns:
  140. int | None: The index of the character at or nearby the point (x, y).
  141. May be None if there is no character or an error occurred.
  142. """
  143. index = pdfium_c.FPDFText_GetCharIndexAtPos(self, x, y, x_tol, y_tol)
  144. if index < 0:
  145. return None
  146. return index
  147. def get_charbox(self, index, loose=False):
  148. """
  149. Get the bounding box of a single character.
  150. Parameters:
  151. index (int):
  152. Index of the character to work with, in the page's character array.
  153. loose (bool):
  154. Get a more comprehensive box covering the entire font bounds, as opposed to the default tight box specific to the one character.
  155. Returns:
  156. Float values for left, bottom, right and top in PDF canvas units.
  157. """
  158. if loose:
  159. rect = pdfium_c.FS_RECTF()
  160. ok = pdfium_c.FPDFText_GetLooseCharBox(self, index, rect)
  161. l, b, r, t = rect.left, rect.bottom, rect.right, rect.top
  162. else:
  163. l, b, r, t = c_double(), c_double(), c_double(), c_double()
  164. ok = pdfium_c.FPDFText_GetCharBox(self, index, l, r, b, t) # yes, lrbt!
  165. l, b, r, t = l.value, b.value, r.value, t.value
  166. if not ok:
  167. raise PdfiumError("Failed to get charbox.")
  168. return l, b, r, t
  169. def get_rect(self, index):
  170. """
  171. Get the bounding box of a text rectangle at the given index.
  172. Note that :meth:`.count_rects` must be called once with default parameters
  173. before subsequent :meth:`.get_rect` calls for this function to work (due to PDFium's API).
  174. Returns:
  175. Float values for left, bottom, right and top in PDF canvas units.
  176. """
  177. l, b, r, t = c_double(), c_double(), c_double(), c_double()
  178. ok = pdfium_c.FPDFText_GetRect(self, index, l, t, r, b) # yes, ltrb!
  179. if not ok:
  180. raise PdfiumError("Failed to get rectangle. (Make sure count_rects() was called with default params once before subsequent get_rect() calls.)")
  181. return (l.value, b.value, r.value, t.value)
  182. def search(self, text, index=0, match_case=False, match_whole_word=False, consecutive=False):
  183. """
  184. Locate text on the page.
  185. Parameters:
  186. text (str):
  187. The string to search for.
  188. index (int):
  189. Character index at which to start searching.
  190. match_case (bool):
  191. If True, the search will be case-specific (upper and lower letters treated as different characters).
  192. match_whole_word (bool):
  193. If True, substring occurrences will be ignored (e. g. `cat` would not match `category`).
  194. consecutive (bool):
  195. If False (the default), :meth:`.search` will skip past the current match to look for the next match.
  196. If True, parts of the previous match may be caught again (e. g. searching for `aa` in `aaaa` would match 3 rather than 2 times).
  197. Returns:
  198. PdfTextSearcher: A helper object to search text.
  199. """
  200. if len(text) == 0:
  201. raise ValueError("Text length must be greater than 0.")
  202. flags = 0
  203. if match_case:
  204. flags |= pdfium_c.FPDF_MATCHCASE
  205. if match_whole_word:
  206. flags |= pdfium_c.FPDF_MATCHWHOLEWORD
  207. if consecutive:
  208. flags |= pdfium_c.FPDF_CONSECUTIVE
  209. enc_text = (text + "\x00").encode("utf-16-le")
  210. enc_text_ptr = ctypes.cast(enc_text, ctypes.POINTER(ctypes.c_ushort))
  211. raw_searcher = pdfium_c.FPDFText_FindStart(self, enc_text_ptr, flags, index)
  212. searcher = PdfTextSearcher(raw_searcher, self)
  213. self._add_kid(searcher)
  214. return searcher
  215. class PdfTextSearcher (pdfium_i.AutoCloseable):
  216. """
  217. Text searcher helper class.
  218. Attributes:
  219. raw (FPDF_SCHHANDLE): The underlying PDFium searcher handle.
  220. textpage (PdfTextPage): Reference to the textpage this searcher belongs to.
  221. """
  222. def __init__(self, raw, textpage):
  223. self.raw = raw
  224. self.textpage = textpage
  225. super().__init__(pdfium_c.FPDFText_FindClose)
  226. @property
  227. def parent(self): # AutoCloseable hook
  228. return self.textpage
  229. def _get_occurrence(self, find_func):
  230. ok = find_func(self)
  231. if not ok:
  232. return None
  233. index = pdfium_c.FPDFText_GetSchResultIndex(self)
  234. count = pdfium_c.FPDFText_GetSchCount(self)
  235. return index, count
  236. def get_next(self):
  237. """
  238. Returns:
  239. (int, int): Start character index and count of the next occurrence,
  240. or None if the last occurrence was passed.
  241. """
  242. return self._get_occurrence(pdfium_c.FPDFText_FindNext)
  243. def get_prev(self):
  244. """
  245. Returns:
  246. (int, int): Start character index and count of the previous occurrence (i. e. the one before the last valid occurrence),
  247. or None if the last occurrence was passed.
  248. """
  249. return self._get_occurrence(pdfium_c.FPDFText_FindPrev)