ascii85.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. """Python implementation of ASCII85/ASCIIHex decoder (Adobe version)."""
  2. import re
  3. from base64 import a85decode
  4. from binascii import unhexlify
  5. start_re = re.compile(rb"^\s*<?\s*~\s*")
  6. end_re = re.compile(rb"\s*~\s*>?\s*$")
  7. def ascii85decode(data: bytes) -> bytes:
  8. """In ASCII85 encoding, every four bytes are encoded with five ASCII
  9. letters, using 85 different types of characters (as 256**4 < 85**5).
  10. When the length of the original bytes is not a multiple of 4, a special
  11. rule is used for round up.
  12. Adobe's ASCII85 implementation expects the input to be terminated
  13. by `b"~>"`, and (though this is absent from the PDF spec) it can
  14. also begin with `b"<~"`. We can't reliably expect this to be the
  15. case, and there can be off-by-one errors in stream lengths which
  16. mean we only see `~` at the end. Worse yet, `<` and `>` are
  17. ASCII85 digits, so we can't strip them. We settle on a compromise
  18. where we strip leading `<~` or `~` and trailing `~` or `~>`.
  19. """
  20. data = start_re.sub(b"", data)
  21. data = end_re.sub(b"", data)
  22. return a85decode(data)
  23. bws_re = re.compile(rb"\s")
  24. def asciihexdecode(data: bytes) -> bytes:
  25. """ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
  26. For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
  27. ASCIIHexDecode filter produces one byte of binary data. All white-space
  28. characters are ignored. A right angle bracket character (>) indicates
  29. EOD. Any other characters will cause an error. If the filter encounters
  30. the EOD marker after reading an odd number of hexadecimal digits, it
  31. will behave as if a 0 followed the last digit.
  32. """
  33. data = bws_re.sub(b"", data)
  34. idx = data.find(b">")
  35. if idx != -1:
  36. data = data[:idx]
  37. if idx % 2 == 1:
  38. data += b"0"
  39. return unhexlify(data)