pageobjects.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
  2. # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
  3. # TODO test-confirm filter and info params
  4. from enum import Enum
  5. import pypdfium2._helpers as pdfium
  6. import pypdfium2.raw as pdfium_c
  7. import pypdfium2.internal as pdfium_i
  8. # TODO? consider dotted access
  9. from pypdfium2._cli._parsers import (
  10. add_input,
  11. add_n_digits,
  12. get_input,
  13. round_list,
  14. )
  15. class InfoParams (Enum):
  16. pos = 0
  17. imageinfo = 1
  18. def attach(parser):
  19. add_input(parser, pages=True)
  20. add_n_digits(parser)
  21. # TODO think out strategy for choices (see https://github.com/python/cpython/issues/69247)
  22. obj_types = list( pdfium_i.ObjectTypeToConst.keys() )
  23. parser.add_argument(
  24. "--filter",
  25. nargs = "+",
  26. metavar = "T",
  27. choices = obj_types,
  28. help = f"Object types to include. Choices: {obj_types}",
  29. )
  30. parser.add_argument(
  31. "--max-depth",
  32. type = int,
  33. default = 2,
  34. help = "Maximum recursion depth to consider when descending into Form XObjects.",
  35. )
  36. parser.add_argument(
  37. "--info",
  38. nargs = "*",
  39. type = lambda s: InfoParams[s.lower()],
  40. default = (InfoParams.pos, InfoParams.imageinfo),
  41. help = "Object details to show (pos, imageinfo).",
  42. )
  43. def print_img_metadata(metadata, pad=""):
  44. for attr in pdfium_c.FPDF_IMAGEOBJ_METADATA.__slots__:
  45. value = getattr(metadata, attr)
  46. if attr == "colorspace":
  47. value = pdfium_i.ColorspaceToStr.get(value)
  48. elif attr == "marked_content_id" and value == -1:
  49. continue
  50. print(pad + f"{attr}: {value}\n", end="")
  51. def main(args):
  52. pdf = get_input(args)
  53. # if no filter is given, leave it at None (make a difference in case of unhandled object types)
  54. if args.filter:
  55. args.filter = [pdfium_i.ObjectTypeToConst[t] for t in args.filter]
  56. show_pos = (InfoParams.pos in args.info)
  57. show_imageinfo = (InfoParams.imageinfo in args.info)
  58. total_count = 0
  59. for i in args.pages:
  60. page = pdf[i]
  61. obj_searcher = page.get_objects(
  62. filter = args.filter,
  63. max_depth = args.max_depth,
  64. )
  65. preamble = f"# Page {i+1}\n"
  66. count = 0
  67. for obj in obj_searcher:
  68. pad_0 = " " * obj.level
  69. pad_1 = pad_0 + " "
  70. print(preamble + pad_0 + pdfium_i.ObjectTypeToStr.get(obj.type))
  71. if show_pos:
  72. pos = round_list(obj.get_pos(), args.n_digits)
  73. print(pad_1 + f"Position: {pos}")
  74. # TODO? also call get_size() for coverage
  75. if show_imageinfo and isinstance(obj, pdfium.PdfImage):
  76. print(pad_1 + f"Filters: {obj.get_filters()}")
  77. metadata = obj.get_metadata()
  78. print_img_metadata(metadata, pad=pad_1)
  79. count += 1
  80. preamble = ""
  81. if count > 0:
  82. print(f"-> Count: {count}\n")
  83. total_count += count
  84. if total_count > 0:
  85. print(f"-> Total count: {total_count}")