| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291 |
- import os
- import os.path
- import struct
- from io import BytesIO
- from typing import BinaryIO, Literal, Tuple
- from pdfminer.jbig2 import JBIG2StreamReader, JBIG2StreamWriter
- from pdfminer.layout import LTImage
- from pdfminer.pdfcolor import (
- LITERAL_DEVICE_CMYK,
- LITERAL_DEVICE_GRAY,
- LITERAL_DEVICE_RGB,
- LITERAL_INLINE_DEVICE_GRAY,
- LITERAL_INLINE_DEVICE_RGB,
- )
- from pdfminer.pdfexceptions import PDFValueError
- from pdfminer.pdftypes import (
- LITERALS_DCT_DECODE,
- LITERALS_FLATE_DECODE,
- LITERALS_JBIG2_DECODE,
- LITERALS_JPX_DECODE,
- )
- PIL_ERROR_MESSAGE = (
- "Could not import Pillow. This dependency of pdfminer.six is not "
- "installed by default. You need it to to save jpg images to a file. Install it "
- "with `pip install 'pdfminer.six[image]'`"
- )
- def align32(x: int) -> int:
- return ((x + 3) // 4) * 4
- class BMPWriter:
- def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
- self.fp = fp
- self.bits = bits
- self.width = width
- self.height = height
- if bits == 1:
- ncols = 2
- elif bits == 8:
- ncols = 256
- elif bits == 24:
- ncols = 0
- else:
- raise PDFValueError(bits)
- self.linesize = align32((self.width * self.bits + 7) // 8)
- self.datasize = self.linesize * self.height
- headersize = 14 + 40 + ncols * 4
- info = struct.pack(
- "<IiiHHIIIIII",
- 40,
- self.width,
- self.height,
- 1,
- self.bits,
- 0,
- self.datasize,
- 0,
- 0,
- ncols,
- 0,
- )
- assert len(info) == 40, str(len(info))
- header = struct.pack(
- "<ccIHHI",
- b"B",
- b"M",
- headersize + self.datasize,
- 0,
- 0,
- headersize,
- )
- assert len(header) == 14, str(len(header))
- self.fp.write(header)
- self.fp.write(info)
- if ncols == 2:
- # B&W color table
- for i in (0, 255):
- self.fp.write(struct.pack("BBBx", i, i, i))
- elif ncols == 256:
- # grayscale color table
- for i in range(256):
- self.fp.write(struct.pack("BBBx", i, i, i))
- self.pos0 = self.fp.tell()
- self.pos1 = self.pos0 + self.datasize
- def write_line(self, y: int, data: bytes) -> None:
- self.fp.seek(self.pos1 - (y + 1) * self.linesize)
- self.fp.write(data)
- class ImageWriter:
- """Write image to a file
- Supports various image types: JPEG, JBIG2 and bitmaps
- """
- def __init__(self, outdir: str) -> None:
- self.outdir = outdir
- if not os.path.exists(self.outdir):
- os.makedirs(self.outdir)
- def export_image(self, image: LTImage) -> str:
- """Save an LTImage to disk"""
- (width, height) = image.srcsize
- filters = image.stream.get_filters()
- if filters[-1][0] in LITERALS_DCT_DECODE:
- name = self._save_jpeg(image)
- elif filters[-1][0] in LITERALS_JPX_DECODE:
- name = self._save_jpeg2000(image)
- elif self._is_jbig2_iamge(image):
- name = self._save_jbig2(image)
- elif image.bits == 1:
- name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)
- elif image.bits == 8 and (
- LITERAL_DEVICE_RGB in image.colorspace
- or LITERAL_INLINE_DEVICE_RGB in image.colorspace
- ):
- name = self._save_bmp(image, width, height, width * 3, image.bits * 3)
- elif image.bits == 8 and (
- LITERAL_DEVICE_GRAY in image.colorspace
- or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
- ):
- name = self._save_bmp(image, width, height, width, image.bits)
- elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
- name = self._save_bytes(image)
- else:
- name = self._save_raw(image)
- return name
- def _save_jpeg(self, image: LTImage) -> str:
- """Save a JPEG encoded image"""
- data = image.stream.get_data()
- name, path = self._create_unique_image_name(image, ".jpg")
- with open(path, "wb") as fp:
- if LITERAL_DEVICE_CMYK in image.colorspace:
- try:
- from PIL import Image, ImageChops # type: ignore[import]
- except ImportError:
- raise ImportError(PIL_ERROR_MESSAGE)
- ifp = BytesIO(data)
- i = Image.open(ifp)
- i = ImageChops.invert(i)
- i = i.convert("RGB")
- i.save(fp, "JPEG")
- else:
- fp.write(data)
- return name
- def _save_jpeg2000(self, image: LTImage) -> str:
- """Save a JPEG 2000 encoded image"""
- data = image.stream.get_data()
- name, path = self._create_unique_image_name(image, ".jp2")
- with open(path, "wb") as fp:
- try:
- from PIL import Image # type: ignore[import]
- except ImportError:
- raise ImportError(PIL_ERROR_MESSAGE)
- # if we just write the raw data, most image programs
- # that I have tried cannot open the file. However,
- # open and saving with PIL produces a file that
- # seems to be easily opened by other programs
- ifp = BytesIO(data)
- i = Image.open(ifp)
- i.save(fp, "JPEG2000")
- return name
- def _save_jbig2(self, image: LTImage) -> str:
- """Save a JBIG2 encoded image"""
- name, path = self._create_unique_image_name(image, ".jb2")
- with open(path, "wb") as fp:
- input_stream = BytesIO()
- global_streams = []
- filters = image.stream.get_filters()
- for filter_name, params in filters:
- if filter_name in LITERALS_JBIG2_DECODE:
- global_streams.append(params["JBIG2Globals"].resolve())
- if len(global_streams) > 1:
- msg = (
- "There should never be more than one JBIG2Globals "
- "associated with a JBIG2 embedded image"
- )
- raise PDFValueError(msg)
- if len(global_streams) == 1:
- input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
- input_stream.write(image.stream.get_data())
- input_stream.seek(0)
- reader = JBIG2StreamReader(input_stream)
- segments = reader.get_segments()
- writer = JBIG2StreamWriter(fp)
- writer.write_file(segments)
- return name
- def _save_bmp(
- self,
- image: LTImage,
- width: int,
- height: int,
- bytes_per_line: int,
- bits: int,
- ) -> str:
- """Save a BMP encoded image"""
- name, path = self._create_unique_image_name(image, ".bmp")
- with open(path, "wb") as fp:
- bmp = BMPWriter(fp, bits, width, height)
- data = image.stream.get_data()
- i = 0
- for y in range(height):
- bmp.write_line(y, data[i : i + bytes_per_line])
- i += bytes_per_line
- return name
- def _save_bytes(self, image: LTImage) -> str:
- """Save an image without encoding, just bytes"""
- name, path = self._create_unique_image_name(image, ".jpg")
- width, height = image.srcsize
- channels = len(image.stream.get_data()) / width / height / (image.bits / 8)
- with open(path, "wb") as fp:
- try:
- from PIL import (
- Image, # type: ignore[import]
- ImageOps,
- )
- except ImportError:
- raise ImportError(PIL_ERROR_MESSAGE)
- mode: Literal["1", "L", "RGB", "CMYK"]
- if image.bits == 1:
- mode = "1"
- elif image.bits == 8 and channels == 1:
- mode = "L"
- elif image.bits == 8 and channels == 3:
- mode = "RGB"
- elif image.bits == 8 and channels == 4:
- mode = "CMYK"
- img = Image.frombytes(mode, image.srcsize, image.stream.get_data(), "raw")
- if mode == "L":
- img = ImageOps.invert(img)
- img.save(fp)
- return name
- def _save_raw(self, image: LTImage) -> str:
- """Save an image with unknown encoding"""
- ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1])
- name, path = self._create_unique_image_name(image, ext)
- with open(path, "wb") as fp:
- fp.write(image.stream.get_data())
- return name
- @staticmethod
- def _is_jbig2_iamge(image: LTImage) -> bool:
- filters = image.stream.get_filters()
- for filter_name, params in filters:
- if filter_name in LITERALS_JBIG2_DECODE:
- return True
- return False
- def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]:
- name = image.name + ext
- path = os.path.join(self.outdir, name)
- img_index = 0
- while os.path.exists(path):
- name = "%s.%d%s" % (image.name, img_index, ext)
- path = os.path.join(self.outdir, name)
- img_index += 1
- return name, path
|