utils.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887
  1. """Miscellaneous Routines."""
  2. import io
  3. import pathlib
  4. import string
  5. from html import escape
  6. from typing import (
  7. TYPE_CHECKING,
  8. Any,
  9. BinaryIO,
  10. Callable,
  11. Dict,
  12. Generic,
  13. Iterable,
  14. Iterator,
  15. List,
  16. Optional,
  17. Set,
  18. TextIO,
  19. Tuple,
  20. TypeVar,
  21. Union,
  22. cast,
  23. )
  24. from pdfminer.pdfexceptions import PDFTypeError, PDFValueError
  25. if TYPE_CHECKING:
  26. from pdfminer.layout import LTComponent
  27. import charset_normalizer # For str encoding detection
  28. # from sys import maxint as INF doesn't work anymore under Python3, but PDF
  29. # still uses 32 bits ints
  30. INF = (1 << 31) - 1
  31. FileOrName = Union[pathlib.PurePath, str, io.IOBase]
  32. AnyIO = Union[TextIO, BinaryIO]
  33. class open_filename:
  34. """Context manager that allows opening a filename
  35. (str or pathlib.PurePath type is supported) and closes it on exit,
  36. (just like `open`), but does nothing for file-like objects.
  37. """
  38. def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any) -> None:
  39. if isinstance(filename, pathlib.PurePath):
  40. filename = str(filename)
  41. if isinstance(filename, str):
  42. self.file_handler: AnyIO = open(filename, *args, **kwargs)
  43. self.closing = True
  44. elif isinstance(filename, io.IOBase):
  45. self.file_handler = cast(AnyIO, filename)
  46. self.closing = False
  47. else:
  48. raise PDFTypeError("Unsupported input type: %s" % type(filename))
  49. def __enter__(self) -> AnyIO:
  50. return self.file_handler
  51. def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
  52. if self.closing:
  53. self.file_handler.close()
  54. def make_compat_bytes(in_str: str) -> bytes:
  55. """Converts to bytes, encoding to unicode."""
  56. assert isinstance(in_str, str), str(type(in_str))
  57. return in_str.encode()
  58. def make_compat_str(o: object) -> str:
  59. """Converts everything to string, if bytes guessing the encoding."""
  60. if isinstance(o, bytes):
  61. enc = charset_normalizer.detect(o)
  62. try:
  63. return o.decode(enc["encoding"])
  64. except UnicodeDecodeError:
  65. return str(o)
  66. else:
  67. return str(o)
  68. def shorten_str(s: str, size: int) -> str:
  69. if size < 7:
  70. return s[:size]
  71. if len(s) > size:
  72. length = (size - 5) // 2
  73. return f"{s[:length]} ... {s[-length:]}"
  74. else:
  75. return s
  76. def compatible_encode_method(
  77. bytesorstring: Union[bytes, str],
  78. encoding: str = "utf-8",
  79. erraction: str = "ignore",
  80. ) -> str:
  81. """When Py2 str.encode is called, it often means bytes.encode in Py3.
  82. This does either.
  83. """
  84. if isinstance(bytesorstring, str):
  85. return bytesorstring
  86. assert isinstance(bytesorstring, bytes), str(type(bytesorstring))
  87. return bytesorstring.decode(encoding, erraction)
  88. def paeth_predictor(left: int, above: int, upper_left: int) -> int:
  89. # From http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
  90. # Initial estimate
  91. p = left + above - upper_left
  92. # Distances to a,b,c
  93. pa = abs(p - left)
  94. pb = abs(p - above)
  95. pc = abs(p - upper_left)
  96. # Return nearest of a,b,c breaking ties in order a,b,c
  97. if pa <= pb and pa <= pc:
  98. return left
  99. elif pb <= pc:
  100. return above
  101. else:
  102. return upper_left
  103. def apply_tiff_predictor(
  104. colors: int, columns: int, bitspercomponent: int, data: bytes
  105. ) -> bytes:
  106. """Reverse the effect of the TIFF predictor 2
  107. Documentation: https://www.itu.int/itudoc/itu-t/com16/tiff-fx/docs/tiff6.pdf (Section 14, page 64)
  108. """
  109. if bitspercomponent != 8:
  110. error_msg = f"Unsupported `bitspercomponent': {bitspercomponent}"
  111. raise PDFValueError(error_msg)
  112. bpp = colors * (bitspercomponent // 8)
  113. nbytes = columns * bpp
  114. buf: List[int] = []
  115. for scanline_i in range(0, len(data), nbytes):
  116. raw: List[int] = []
  117. for i in range(nbytes):
  118. new_value = data[scanline_i + i]
  119. if i >= bpp:
  120. new_value += raw[i - bpp]
  121. new_value %= 256
  122. raw.append(new_value)
  123. buf.extend(raw)
  124. return bytes(buf)
  125. def apply_png_predictor(
  126. pred: int,
  127. colors: int,
  128. columns: int,
  129. bitspercomponent: int,
  130. data: bytes,
  131. ) -> bytes:
  132. """Reverse the effect of the PNG predictor
  133. Documentation: http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
  134. """
  135. if bitspercomponent not in [8, 1]:
  136. msg = "Unsupported `bitspercomponent': %d" % bitspercomponent
  137. raise PDFValueError(msg)
  138. nbytes = colors * columns * bitspercomponent // 8
  139. bpp = colors * bitspercomponent // 8 # number of bytes per complete pixel
  140. buf = []
  141. line_above = list(b"\x00" * columns)
  142. for scanline_i in range(0, len(data), nbytes + 1):
  143. filter_type = data[scanline_i]
  144. line_encoded = data[scanline_i + 1 : scanline_i + 1 + nbytes]
  145. raw = []
  146. if filter_type == 0:
  147. # Filter type 0: None
  148. raw = list(line_encoded)
  149. elif filter_type == 1:
  150. # Filter type 1: Sub
  151. # To reverse the effect of the Sub() filter after decompression,
  152. # output the following value:
  153. # Raw(x) = Sub(x) + Raw(x - bpp)
  154. # (computed mod 256), where Raw() refers to the bytes already
  155. # decoded.
  156. for j, sub_x in enumerate(line_encoded):
  157. if j - bpp < 0:
  158. raw_x_bpp = 0
  159. else:
  160. raw_x_bpp = int(raw[j - bpp])
  161. raw_x = (sub_x + raw_x_bpp) & 255
  162. raw.append(raw_x)
  163. elif filter_type == 2:
  164. # Filter type 2: Up
  165. # To reverse the effect of the Up() filter after decompression,
  166. # output the following value:
  167. # Raw(x) = Up(x) + Prior(x)
  168. # (computed mod 256), where Prior() refers to the decoded bytes of
  169. # the prior scanline.
  170. for up_x, prior_x in zip(line_encoded, line_above):
  171. raw_x = (up_x + prior_x) & 255
  172. raw.append(raw_x)
  173. elif filter_type == 3:
  174. # Filter type 3: Average
  175. # To reverse the effect of the Average() filter after
  176. # decompression, output the following value:
  177. # Raw(x) = Average(x) + floor((Raw(x-bpp)+Prior(x))/2)
  178. # where the result is computed mod 256, but the prediction is
  179. # calculated in the same way as for encoding. Raw() refers to the
  180. # bytes already decoded, and Prior() refers to the decoded bytes of
  181. # the prior scanline.
  182. for j, average_x in enumerate(line_encoded):
  183. if j - bpp < 0:
  184. raw_x_bpp = 0
  185. else:
  186. raw_x_bpp = int(raw[j - bpp])
  187. prior_x = int(line_above[j])
  188. raw_x = (average_x + (raw_x_bpp + prior_x) // 2) & 255
  189. raw.append(raw_x)
  190. elif filter_type == 4:
  191. # Filter type 4: Paeth
  192. # To reverse the effect of the Paeth() filter after decompression,
  193. # output the following value:
  194. # Raw(x) = Paeth(x)
  195. # + PaethPredictor(Raw(x-bpp), Prior(x), Prior(x-bpp))
  196. # (computed mod 256), where Raw() and Prior() refer to bytes
  197. # already decoded. Exactly the same PaethPredictor() function is
  198. # used by both encoder and decoder.
  199. for j, paeth_x in enumerate(line_encoded):
  200. if j - bpp < 0:
  201. raw_x_bpp = 0
  202. prior_x_bpp = 0
  203. else:
  204. raw_x_bpp = int(raw[j - bpp])
  205. prior_x_bpp = int(line_above[j - bpp])
  206. prior_x = int(line_above[j])
  207. paeth = paeth_predictor(raw_x_bpp, prior_x, prior_x_bpp)
  208. raw_x = (paeth_x + paeth) & 255
  209. raw.append(raw_x)
  210. else:
  211. raise PDFValueError("Unsupported predictor value: %d" % filter_type)
  212. buf.extend(raw)
  213. line_above = raw
  214. return bytes(buf)
  215. Point = Tuple[float, float]
  216. Rect = Tuple[float, float, float, float]
  217. Matrix = Tuple[float, float, float, float, float, float]
  218. PathSegment = Union[
  219. Tuple[str], # Literal['h']
  220. Tuple[str, float, float], # Literal['m', 'l']
  221. Tuple[str, float, float, float, float], # Literal['v', 'y']
  222. Tuple[str, float, float, float, float, float, float],
  223. ] # Literal['c']
  224. # Matrix operations
  225. MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)
  226. def parse_rect(o: Any) -> Rect:
  227. try:
  228. (x0, y0, x1, y1) = o
  229. return float(x0), float(y0), float(x1), float(y1)
  230. except ValueError:
  231. raise PDFValueError("Could not parse rectangle")
  232. def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:
  233. (a1, b1, c1, d1, e1, f1) = m1
  234. (a0, b0, c0, d0, e0, f0) = m0
  235. """Returns the multiplication of two matrices."""
  236. return (
  237. a0 * a1 + c0 * b1,
  238. b0 * a1 + d0 * b1,
  239. a0 * c1 + c0 * d1,
  240. b0 * c1 + d0 * d1,
  241. a0 * e1 + c0 * f1 + e0,
  242. b0 * e1 + d0 * f1 + f0,
  243. )
  244. def translate_matrix(m: Matrix, v: Point) -> Matrix:
  245. """Translates a matrix by (x, y) inside the projection.
  246. The matrix is changed so that its origin is at the specified point in its own
  247. coordinate system. Note that this is different from translating it within the
  248. original coordinate system."""
  249. (a, b, c, d, e, f) = m
  250. (x, y) = v
  251. return a, b, c, d, x * a + y * c + e, x * b + y * d + f
  252. def apply_matrix_pt(m: Matrix, v: Point) -> Point:
  253. """Applies a matrix to a point."""
  254. (a, b, c, d, e, f) = m
  255. (x, y) = v
  256. return a * x + c * y + e, b * x + d * y + f
  257. def apply_matrix_rect(m: Matrix, rect: Rect) -> Rect:
  258. """Applies a matrix to a rectangle.
  259. Note that the result is not a rotated rectangle, but a rectangle with the same
  260. orientation that tightly fits the outside of the rotated content.
  261. :param m: The rotation matrix.
  262. :param rect: The rectangle coordinates (x0, y0, x1, y1), where x0 < x1 and y0 < y1.
  263. :returns a rectangle with the same orientation, but that would fit the rotated
  264. content.
  265. """
  266. (x0, y0, x1, y1) = rect
  267. left_bottom = (x0, y0)
  268. right_bottom = (x1, y0)
  269. right_top = (x1, y1)
  270. left_top = (x0, y1)
  271. (left1, bottom1) = apply_matrix_pt(m, left_bottom)
  272. (right1, bottom2) = apply_matrix_pt(m, right_bottom)
  273. (right2, top1) = apply_matrix_pt(m, right_top)
  274. (left2, top2) = apply_matrix_pt(m, left_top)
  275. return (
  276. min(left1, left2, right1, right2),
  277. min(bottom1, bottom2, top1, top2),
  278. max(left1, left2, right1, right2),
  279. max(bottom1, bottom2, top1, top2),
  280. )
  281. def apply_matrix_norm(m: Matrix, v: Point) -> Point:
  282. """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
  283. (a, b, c, d, e, f) = m
  284. (p, q) = v
  285. return a * p + c * q, b * p + d * q
  286. # Utility functions
  287. def isnumber(x: object) -> bool:
  288. return isinstance(x, (int, float))
  289. _T = TypeVar("_T")
  290. def uniq(objs: Iterable[_T]) -> Iterator[_T]:
  291. """Eliminates duplicated elements."""
  292. done = set()
  293. for obj in objs:
  294. if obj in done:
  295. continue
  296. done.add(obj)
  297. yield obj
  298. def fsplit(pred: Callable[[_T], bool], objs: Iterable[_T]) -> Tuple[List[_T], List[_T]]:
  299. """Split a list into two classes according to the predicate."""
  300. t = []
  301. f = []
  302. for obj in objs:
  303. if pred(obj):
  304. t.append(obj)
  305. else:
  306. f.append(obj)
  307. return t, f
  308. def drange(v0: float, v1: float, d: int) -> range:
  309. """Returns a discrete range."""
  310. return range(int(v0) // d, int(v1 + d) // d)
  311. def get_bound(pts: Iterable[Point]) -> Rect:
  312. """Compute a minimal rectangle that covers all the points."""
  313. limit: Rect = (INF, INF, -INF, -INF)
  314. (x0, y0, x1, y1) = limit
  315. for x, y in pts:
  316. x0 = min(x0, x)
  317. y0 = min(y0, y)
  318. x1 = max(x1, x)
  319. y1 = max(y1, y)
  320. return x0, y0, x1, y1
  321. def pick(
  322. seq: Iterable[_T],
  323. func: Callable[[_T], float],
  324. maxobj: Optional[_T] = None,
  325. ) -> Optional[_T]:
  326. """Picks the object obj where func(obj) has the highest value."""
  327. maxscore = None
  328. for obj in seq:
  329. score = func(obj)
  330. if maxscore is None or maxscore < score:
  331. (maxscore, maxobj) = (score, obj)
  332. return maxobj
  333. def choplist(n: int, seq: Iterable[_T]) -> Iterator[Tuple[_T, ...]]:
  334. """Groups every n elements of the list."""
  335. r = []
  336. for x in seq:
  337. r.append(x)
  338. if len(r) == n:
  339. yield tuple(r)
  340. r = []
  341. def nunpack(s: bytes, default: int = 0) -> int:
  342. """Unpacks variable-length unsigned integers (big endian)."""
  343. length = len(s)
  344. if not length:
  345. return default
  346. else:
  347. return int.from_bytes(s, byteorder="big", signed=False)
  348. PDFDocEncoding = "".join(
  349. chr(x)
  350. for x in (
  351. 0x0000,
  352. 0x0001,
  353. 0x0002,
  354. 0x0003,
  355. 0x0004,
  356. 0x0005,
  357. 0x0006,
  358. 0x0007,
  359. 0x0008,
  360. 0x0009,
  361. 0x000A,
  362. 0x000B,
  363. 0x000C,
  364. 0x000D,
  365. 0x000E,
  366. 0x000F,
  367. 0x0010,
  368. 0x0011,
  369. 0x0012,
  370. 0x0013,
  371. 0x0014,
  372. 0x0015,
  373. 0x0017,
  374. 0x0017,
  375. 0x02D8,
  376. 0x02C7,
  377. 0x02C6,
  378. 0x02D9,
  379. 0x02DD,
  380. 0x02DB,
  381. 0x02DA,
  382. 0x02DC,
  383. 0x0020,
  384. 0x0021,
  385. 0x0022,
  386. 0x0023,
  387. 0x0024,
  388. 0x0025,
  389. 0x0026,
  390. 0x0027,
  391. 0x0028,
  392. 0x0029,
  393. 0x002A,
  394. 0x002B,
  395. 0x002C,
  396. 0x002D,
  397. 0x002E,
  398. 0x002F,
  399. 0x0030,
  400. 0x0031,
  401. 0x0032,
  402. 0x0033,
  403. 0x0034,
  404. 0x0035,
  405. 0x0036,
  406. 0x0037,
  407. 0x0038,
  408. 0x0039,
  409. 0x003A,
  410. 0x003B,
  411. 0x003C,
  412. 0x003D,
  413. 0x003E,
  414. 0x003F,
  415. 0x0040,
  416. 0x0041,
  417. 0x0042,
  418. 0x0043,
  419. 0x0044,
  420. 0x0045,
  421. 0x0046,
  422. 0x0047,
  423. 0x0048,
  424. 0x0049,
  425. 0x004A,
  426. 0x004B,
  427. 0x004C,
  428. 0x004D,
  429. 0x004E,
  430. 0x004F,
  431. 0x0050,
  432. 0x0051,
  433. 0x0052,
  434. 0x0053,
  435. 0x0054,
  436. 0x0055,
  437. 0x0056,
  438. 0x0057,
  439. 0x0058,
  440. 0x0059,
  441. 0x005A,
  442. 0x005B,
  443. 0x005C,
  444. 0x005D,
  445. 0x005E,
  446. 0x005F,
  447. 0x0060,
  448. 0x0061,
  449. 0x0062,
  450. 0x0063,
  451. 0x0064,
  452. 0x0065,
  453. 0x0066,
  454. 0x0067,
  455. 0x0068,
  456. 0x0069,
  457. 0x006A,
  458. 0x006B,
  459. 0x006C,
  460. 0x006D,
  461. 0x006E,
  462. 0x006F,
  463. 0x0070,
  464. 0x0071,
  465. 0x0072,
  466. 0x0073,
  467. 0x0074,
  468. 0x0075,
  469. 0x0076,
  470. 0x0077,
  471. 0x0078,
  472. 0x0079,
  473. 0x007A,
  474. 0x007B,
  475. 0x007C,
  476. 0x007D,
  477. 0x007E,
  478. 0x0000,
  479. 0x2022,
  480. 0x2020,
  481. 0x2021,
  482. 0x2026,
  483. 0x2014,
  484. 0x2013,
  485. 0x0192,
  486. 0x2044,
  487. 0x2039,
  488. 0x203A,
  489. 0x2212,
  490. 0x2030,
  491. 0x201E,
  492. 0x201C,
  493. 0x201D,
  494. 0x2018,
  495. 0x2019,
  496. 0x201A,
  497. 0x2122,
  498. 0xFB01,
  499. 0xFB02,
  500. 0x0141,
  501. 0x0152,
  502. 0x0160,
  503. 0x0178,
  504. 0x017D,
  505. 0x0131,
  506. 0x0142,
  507. 0x0153,
  508. 0x0161,
  509. 0x017E,
  510. 0x0000,
  511. 0x20AC,
  512. 0x00A1,
  513. 0x00A2,
  514. 0x00A3,
  515. 0x00A4,
  516. 0x00A5,
  517. 0x00A6,
  518. 0x00A7,
  519. 0x00A8,
  520. 0x00A9,
  521. 0x00AA,
  522. 0x00AB,
  523. 0x00AC,
  524. 0x0000,
  525. 0x00AE,
  526. 0x00AF,
  527. 0x00B0,
  528. 0x00B1,
  529. 0x00B2,
  530. 0x00B3,
  531. 0x00B4,
  532. 0x00B5,
  533. 0x00B6,
  534. 0x00B7,
  535. 0x00B8,
  536. 0x00B9,
  537. 0x00BA,
  538. 0x00BB,
  539. 0x00BC,
  540. 0x00BD,
  541. 0x00BE,
  542. 0x00BF,
  543. 0x00C0,
  544. 0x00C1,
  545. 0x00C2,
  546. 0x00C3,
  547. 0x00C4,
  548. 0x00C5,
  549. 0x00C6,
  550. 0x00C7,
  551. 0x00C8,
  552. 0x00C9,
  553. 0x00CA,
  554. 0x00CB,
  555. 0x00CC,
  556. 0x00CD,
  557. 0x00CE,
  558. 0x00CF,
  559. 0x00D0,
  560. 0x00D1,
  561. 0x00D2,
  562. 0x00D3,
  563. 0x00D4,
  564. 0x00D5,
  565. 0x00D6,
  566. 0x00D7,
  567. 0x00D8,
  568. 0x00D9,
  569. 0x00DA,
  570. 0x00DB,
  571. 0x00DC,
  572. 0x00DD,
  573. 0x00DE,
  574. 0x00DF,
  575. 0x00E0,
  576. 0x00E1,
  577. 0x00E2,
  578. 0x00E3,
  579. 0x00E4,
  580. 0x00E5,
  581. 0x00E6,
  582. 0x00E7,
  583. 0x00E8,
  584. 0x00E9,
  585. 0x00EA,
  586. 0x00EB,
  587. 0x00EC,
  588. 0x00ED,
  589. 0x00EE,
  590. 0x00EF,
  591. 0x00F0,
  592. 0x00F1,
  593. 0x00F2,
  594. 0x00F3,
  595. 0x00F4,
  596. 0x00F5,
  597. 0x00F6,
  598. 0x00F7,
  599. 0x00F8,
  600. 0x00F9,
  601. 0x00FA,
  602. 0x00FB,
  603. 0x00FC,
  604. 0x00FD,
  605. 0x00FE,
  606. 0x00FF,
  607. )
  608. )
  609. def decode_text(s: bytes) -> str:
  610. """Decodes a PDFDocEncoding string to Unicode."""
  611. if s.startswith(b"\xfe\xff"):
  612. return str(s[2:], "utf-16be", "ignore")
  613. else:
  614. return "".join(PDFDocEncoding[c] for c in s)
  615. def enc(x: str) -> str:
  616. """Encodes a string for SGML/XML/HTML"""
  617. if isinstance(x, bytes):
  618. return ""
  619. return escape(x)
  620. def bbox2str(bbox: Rect) -> str:
  621. (x0, y0, x1, y1) = bbox
  622. return f"{x0:.3f},{y0:.3f},{x1:.3f},{y1:.3f}"
  623. def matrix2str(m: Matrix) -> str:
  624. (a, b, c, d, e, f) = m
  625. return f"[{a:.2f},{b:.2f},{c:.2f},{d:.2f}, ({e:.2f},{f:.2f})]"
  626. def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
  627. """A distance function between two TextBoxes.
  628. Consider the bounding rectangle for obj1 and obj2.
  629. Return vector between 2 boxes boundaries if they don't overlap, otherwise
  630. returns vector betweeen boxes centers
  631. +------+..........+ (x1, y1)
  632. | obj1 | :
  633. +------+www+------+
  634. : | obj2 |
  635. (x0, y0) +..........+------+
  636. """
  637. (x0, y0) = (min(obj1.x0, obj2.x0), min(obj1.y0, obj2.y0))
  638. (x1, y1) = (max(obj1.x1, obj2.x1), max(obj1.y1, obj2.y1))
  639. (ow, oh) = (x1 - x0, y1 - y0)
  640. (iw, ih) = (ow - obj1.width - obj2.width, oh - obj1.height - obj2.height)
  641. if iw < 0 and ih < 0:
  642. # if one is inside another we compute euclidean distance
  643. (xc1, yc1) = ((obj1.x0 + obj1.x1) / 2, (obj1.y0 + obj1.y1) / 2)
  644. (xc2, yc2) = ((obj2.x0 + obj2.x1) / 2, (obj2.y0 + obj2.y1) / 2)
  645. return xc1 - xc2, yc1 - yc2
  646. else:
  647. return max(0, iw), max(0, ih)
  648. LTComponentT = TypeVar("LTComponentT", bound="LTComponent")
  649. class Plane(Generic[LTComponentT]):
  650. """A set-like data structure for objects placed on a plane.
  651. Can efficiently find objects in a certain rectangular area.
  652. It maintains two parallel lists of objects, each of
  653. which is sorted by its x or y coordinate.
  654. """
  655. def __init__(self, bbox: Rect, gridsize: int = 50) -> None:
  656. self._seq: List[LTComponentT] = [] # preserve the object order.
  657. self._objs: Set[LTComponentT] = set()
  658. self._grid: Dict[Point, List[LTComponentT]] = {}
  659. self.gridsize = gridsize
  660. (self.x0, self.y0, self.x1, self.y1) = bbox
  661. def __repr__(self) -> str:
  662. return "<Plane objs=%r>" % list(self)
  663. def __iter__(self) -> Iterator[LTComponentT]:
  664. return (obj for obj in self._seq if obj in self._objs)
  665. def __len__(self) -> int:
  666. return len(self._objs)
  667. def __contains__(self, obj: object) -> bool:
  668. return obj in self._objs
  669. def _getrange(self, bbox: Rect) -> Iterator[Point]:
  670. (x0, y0, x1, y1) = bbox
  671. if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0:
  672. return
  673. x0 = max(self.x0, x0)
  674. y0 = max(self.y0, y0)
  675. x1 = min(self.x1, x1)
  676. y1 = min(self.y1, y1)
  677. for grid_y in drange(y0, y1, self.gridsize):
  678. for grid_x in drange(x0, x1, self.gridsize):
  679. yield (grid_x, grid_y)
  680. def extend(self, objs: Iterable[LTComponentT]) -> None:
  681. for obj in objs:
  682. self.add(obj)
  683. def add(self, obj: LTComponentT) -> None:
  684. """Place an object."""
  685. for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
  686. if k not in self._grid:
  687. r: List[LTComponentT] = []
  688. self._grid[k] = r
  689. else:
  690. r = self._grid[k]
  691. r.append(obj)
  692. self._seq.append(obj)
  693. self._objs.add(obj)
  694. def remove(self, obj: LTComponentT) -> None:
  695. """Displace an object."""
  696. for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
  697. try:
  698. self._grid[k].remove(obj)
  699. except (KeyError, ValueError):
  700. pass
  701. self._objs.remove(obj)
  702. def find(self, bbox: Rect) -> Iterator[LTComponentT]:
  703. """Finds objects that are in a certain area."""
  704. (x0, y0, x1, y1) = bbox
  705. done = set()
  706. for k in self._getrange(bbox):
  707. if k not in self._grid:
  708. continue
  709. for obj in self._grid[k]:
  710. if obj in done:
  711. continue
  712. done.add(obj)
  713. if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0:
  714. continue
  715. yield obj
  716. ROMAN_ONES = ["i", "x", "c", "m"]
  717. ROMAN_FIVES = ["v", "l", "d"]
  718. def format_int_roman(value: int) -> str:
  719. """Format a number as lowercase Roman numerals."""
  720. assert 0 < value < 4000
  721. result: List[str] = []
  722. index = 0
  723. while value != 0:
  724. value, remainder = divmod(value, 10)
  725. if remainder == 9:
  726. result.insert(0, ROMAN_ONES[index])
  727. result.insert(1, ROMAN_ONES[index + 1])
  728. elif remainder == 4:
  729. result.insert(0, ROMAN_ONES[index])
  730. result.insert(1, ROMAN_FIVES[index])
  731. else:
  732. over_five = remainder >= 5
  733. if over_five:
  734. result.insert(0, ROMAN_FIVES[index])
  735. remainder -= 5
  736. result.insert(1 if over_five else 0, ROMAN_ONES[index] * remainder)
  737. index += 1
  738. return "".join(result)
  739. def format_int_alpha(value: int) -> str:
  740. """Format a number as lowercase letters a-z, aa-zz, etc."""
  741. assert value > 0
  742. result: List[str] = []
  743. while value != 0:
  744. value, remainder = divmod(value - 1, len(string.ascii_lowercase))
  745. result.append(string.ascii_lowercase[remainder])
  746. result.reverse()
  747. return "".join(result)
  748. def unpad_aes(padded: bytes) -> bytes:
  749. """Remove block padding as described in PDF 1.7 section 7.6.2:
  750. > For an original message length of M, the pad shall consist of 16 -
  751. (M mod 16) bytes whose value shall also be 16 - (M mod 16).
  752. > Note that the pad is present when M is evenly divisible by 16;
  753. it contains 16 bytes of 0x10.
  754. """
  755. if len(padded) == 0:
  756. return padded
  757. # Check for a potential padding byte (bytes are unsigned)
  758. padding = padded[-1]
  759. if padding > 16:
  760. return padded
  761. # A valid padding byte is the length of the padding
  762. if padding > len(padded): # Obviously invalid
  763. return padded
  764. # Every byte of padding is equal to the length of padding
  765. if all(x == padding for x in padded[-padding:]):
  766. return padded[:-padding]
  767. return padded