data_structures.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. from typing import Any, Iterable, List, Optional, Tuple
  2. from pdfminer import settings
  3. from pdfminer.pdfparser import PDFSyntaxError
  4. from pdfminer.pdftypes import dict_value, int_value, list_value
  5. from pdfminer.utils import choplist
  6. class NumberTree:
  7. """A PDF number tree.
  8. See Section 3.8.6 of the PDF Reference.
  9. """
  10. def __init__(self, obj: Any):
  11. self._obj = dict_value(obj)
  12. self.nums: Optional[Iterable[Any]] = None
  13. self.kids: Optional[Iterable[Any]] = None
  14. self.limits: Optional[Iterable[Any]] = None
  15. if "Nums" in self._obj:
  16. self.nums = list_value(self._obj["Nums"])
  17. if "Kids" in self._obj:
  18. self.kids = list_value(self._obj["Kids"])
  19. if "Limits" in self._obj:
  20. self.limits = list_value(self._obj["Limits"])
  21. def _parse(self) -> List[Tuple[int, Any]]:
  22. items = []
  23. if self.nums: # Leaf node
  24. for k, v in choplist(2, self.nums):
  25. items.append((int_value(k), v))
  26. if self.kids: # Root or intermediate node
  27. for child_ref in self.kids:
  28. items += NumberTree(child_ref)._parse()
  29. return items
  30. values: List[Tuple[int, Any]] # workaround decorators unsupported by mypy
  31. @property # type: ignore[no-redef,misc]
  32. def values(self) -> List[Tuple[int, Any]]:
  33. values = self._parse()
  34. if settings.STRICT:
  35. if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
  36. raise PDFSyntaxError("Number tree elements are out of order")
  37. else:
  38. values.sort(key=lambda t: t[0])
  39. return values