| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221 |
- import logging
- import struct
- from io import BytesIO
- from typing import (
- TYPE_CHECKING,
- Any,
- BinaryIO,
- Dict,
- Iterable,
- Iterator,
- List,
- Mapping,
- Optional,
- Tuple,
- Union,
- cast,
- )
- from pdfminer import settings
- from pdfminer.casting import safe_float, safe_rect_list
- from pdfminer.cmapdb import (
- CMap,
- CMapBase,
- CMapDB,
- CMapParser,
- FileUnicodeMap,
- IdentityUnicodeMap,
- UnicodeMap,
- )
- from pdfminer.encodingdb import EncodingDB, name2unicode
- from pdfminer.fontmetrics import FONT_METRICS
- from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError
- from pdfminer.pdftypes import (
- PDFStream,
- dict_value,
- int_value,
- list_value,
- num_value,
- resolve1,
- resolve_all,
- stream_value,
- )
- from pdfminer.psexceptions import PSEOF
- from pdfminer.psparser import (
- KWD,
- LIT,
- PSKeyword,
- PSLiteral,
- PSStackParser,
- literal_name,
- )
- from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack
- if TYPE_CHECKING:
- from pdfminer.pdfinterp import PDFResourceManager
- log = logging.getLogger(__name__)
def get_widths(seq: Iterable[object]) -> Dict[Union[str, int], float]:
    """Build a mapping of character widths for horizontal writing.

    Two entry shapes are recognised in ``seq`` (each element is passed
    through ``resolve1`` first):

    * ``c [w1 w2 ...]`` -- consecutive widths starting at character ``c``
    * ``c_first c_last w`` -- one width shared by an inclusive range

    Anything that is neither a number nor a list is reported and skipped.
    """
    widths: Dict[int, float] = {}
    pending: List[float] = []
    for v in seq:
        v = resolve1(v)

        if isinstance(v, list):
            # A list applies to the most recently seen number, if any.
            if pending:
                char1 = pending[-1]
                for offset, width in enumerate(v):
                    widths[cast(int, char1) + offset] = width
                pending = []
            continue

        if not isinstance(v, (int, float)):  # == not utils.isnumber(v)
            log.warning(
                f"Skipping invalid font width specification for {v} because it is not a number or a list"
            )
            continue

        pending.append(v)
        if len(pending) != 3:
            continue

        # Three bare numbers in a row form a "first last width" range.
        (char1, char2, w) = pending
        pending = []
        if isinstance(char1, int) and isinstance(char2, int):
            for cid in range(cast(int, char1), cast(int, char2) + 1):
                widths[cid] = w
        else:
            log.warning(
                f"Skipping invalid font width specification for {char1} to {char2} because either of them is not an int"
            )
    return cast(Dict[Union[str, int], float], widths)
def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    Two entry shapes are recognised in ``seq``:

    * ``c [w1 vx1 vy1  w2 vx2 vy2 ...]`` -- consecutive metrics starting at
      character ``c``
    * ``c_first c_last w vx vy`` -- one metric shared by an inclusive range

    Each element is passed through ``resolve1`` before inspection, and
    malformed entries are reported and skipped, in the same way as
    ``get_widths`` does for horizontal widths.

    :returns: mapping of character id to ``(w, (vx, vy))``
    """
    widths: Dict[int, Tuple[float, Point]] = {}
    r: List[float] = []
    for v in seq:
        # Resolve first so that indirectly referenced numbers/arrays are not
        # silently dropped by the isinstance checks below.
        v = resolve1(v)
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for i, (w, vx, vy) in enumerate(choplist(3, v)):
                    widths[cast(int, char1) + i] = (w, (vx, vy))
                r = []
        elif isinstance(v, (int, float)):  # == utils.isnumber(v)
            r.append(v)
            if len(r) == 5:
                (char1, char2, w, vx, vy) = r
                # range() rejects floats, so validate instead of crashing.
                if isinstance(char1, int) and isinstance(char2, int):
                    for i in range(char1, char2 + 1):
                        widths[i] = (w, (vx, vy))
                else:
                    log.warning(
                        f"Skipping invalid font width specification for {char1} to {char2} because either of them is not an int"
                    )
                r = []
        else:
            log.warning(
                f"Skipping invalid font width specification for {v} because it is not a number or a list"
            )
    return widths
class FontMetricsDB:
    """Name-based access to the bundled ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]:
        """Return the ``FONT_METRICS`` entry stored under ``fontname``.

        The lookup is a plain subscript, so a missing name propagates the
        table's lookup error to the caller.
        """
        metrics = FONT_METRICS[fontname]
        return metrics
# The ``int`` type argument signals that PSStackParser is not being extended
# with any additional token types.
class Type1FontHeaderParser(PSStackParser[int]):
    """Stack parser that collects ``code -> glyph name`` pairs on ``put``."""

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        PSStackParser.__init__(self, data)
        # Filled in by get_encoding().
        self._cid2unicode: Dict[int, str] = {}

    def get_encoding(self) -> Dict[int, str]:
        """Parse the font encoding.

        A Type1 font encoding maps character codes to character names.
        Those names are either standard Adobe glyph names or names tied to
        custom CharStrings of this font (a CharString being the sequence of
        drawing operations for one character).

        Every ``(cid, name)`` result produced by the parser is translated
        with ``name2unicode``; names for which that raises ``KeyError`` are
        logged at debug level and left out of the mapping.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        while True:
            try:
                cid, name = self.nextobject()
            except PSEOF:
                break
            try:
                unichr = name2unicode(cast(str, name))
            except KeyError as e:
                log.debug(str(e))
            else:
                self._cid2unicode[cid] = unichr
        return self._cid2unicode

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """React to ``put`` only: record ``(int key, literal name)`` pairs."""
        if token is not self.KEYWORD_PUT:
            return
        (_, key), (_, value) = self.pop(2)
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))
# Text contributed by each 4-bit nibble of a CFF DICT real number operand.
# Nibble 0xd is reserved (None); nibble 0xf terminates the number and is
# therefore not part of this table.
NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    "DLIdent-H": "Identity-H",
    "DLIdent-V": "Identity-V",
}


def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
    """Decode CFF DICT data into a mapping of operator -> operand list.

    Operands are pushed on a stack until an operator byte (0..21) is met;
    the stack then becomes that operator's value and is reset.

    One-byte operators are stored under their byte value.  The escape byte
    12 introduces a two-byte operator ``12 x``; it is stored under the key
    ``0x0C00 | x`` so that the second byte is consumed as part of the
    operator rather than being re-interpreted as an operator or operand.

    Truncated input (an operand or escape operator cut off by the end of
    ``data``) ends the parse; the entries decoded so far are returned.

    :param data: raw DICT bytes
    :returns: mapping of operator key to its operands, in order
    """
    d: Dict[int, List[Union[float, int]]] = {}
    stack: List[Union[float, int]] = []
    size = len(data)
    pos = 0
    while pos < size:
        b0 = data[pos]
        pos += 1

        if b0 <= 21:
            key = b0
            if b0 == 12:
                # Escape: the operator is completed by the following byte.
                if pos >= size:
                    break
                key = (12 << 8) | data[pos]
                pos += 1
            d[key] = stack
            stack = []
            continue

        value: Union[float, int]
        if b0 == 30:
            # Real number: packed nibbles, terminated by nibble 0xf.
            digits: List[str] = []
            terminated = False
            while not terminated:
                if pos >= size:
                    return d  # unterminated real number
                b = data[pos]
                pos += 1
                for n in (b >> 4, b & 15):
                    if n == 15:
                        terminated = True
                    else:
                        nibble = NIBBLES[n]
                        assert nibble is not None
                        digits.append(nibble)
            value = float("".join(digits))
        elif 32 <= b0 <= 246:
            value = b0 - 139
        else:
            # Multi-byte integers: how many bytes follow depends on b0.
            if 247 <= b0 <= 254:
                need = 1
            elif b0 == 28:
                need = 2
            else:
                need = 4
            if pos + need > size:
                break  # operand cut off by end of data
            operand = data[pos : pos + need]
            pos += need
            if 247 <= b0 <= 250:
                value = ((b0 - 247) << 8) + operand[0] + 108
            elif 251 <= b0 <= 254:
                value = -((b0 - 251) << 8) - operand[0] - 108
            else:
                # Big-endian two's complement (16-bit for 28, else 32-bit).
                value = int.from_bytes(operand, "big", signed=True)
        stack.append(value)
    return d
class CFFFont:
    """Minimal reader for a Compact Font Format (CFF) font program.

    After construction the following attributes are available:

    * ``name_index``, ``dict_index``, ``string_index``, ``subr_index``,
      ``charstring`` -- the INDEX structures read from the file
    * ``top_dict`` -- the first Top DICT, decoded by ``getdict``
    * ``nglyphs`` -- number of entries in the CharStrings INDEX
    * ``name2gid`` / ``gid2name`` -- charset (glyph name <-> glyph id)
    * ``code2gid`` / ``gid2code`` -- encoding (character code <-> glyph id)

    Glyph id 0 (``.notdef``) is never listed in charset or encoding data,
    so glyph numbering in both tables starts at 1.
    """

    # Predefined strings; string ids (SIDs) below len(STANDARD_STRINGS) index
    # into this tuple, higher SIDs index into the font's String INDEX.
    STANDARD_STRINGS = (
        ".notdef", "space", "exclam", "quotedbl", "numbersign", "dollar",
        "percent", "ampersand", "quoteright", "parenleft", "parenright",
        "asterisk", "plus", "comma", "hyphen", "period", "slash",
        "zero", "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "colon", "semicolon", "less", "equal", "greater",
        "question", "at",
        "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
        "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
        "bracketleft", "backslash", "bracketright", "asciicircum",
        "underscore", "quoteleft",
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
        "braceleft", "bar", "braceright", "asciitilde",
        "exclamdown", "cent", "sterling", "fraction", "yen", "florin",
        "section", "currency", "quotesingle", "quotedblleft",
        "guillemotleft", "guilsinglleft", "guilsinglright", "fi", "fl",
        "endash", "dagger", "daggerdbl", "periodcentered", "paragraph",
        "bullet", "quotesinglbase", "quotedblbase", "quotedblright",
        "guillemotright", "ellipsis", "perthousand", "questiondown",
        "grave", "acute", "circumflex", "tilde", "macron", "breve",
        "dotaccent", "dieresis", "ring", "cedilla", "hungarumlaut",
        "ogonek", "caron", "emdash",
        "AE", "ordfeminine", "Lslash", "Oslash", "OE", "ordmasculine",
        "ae", "dotlessi", "lslash", "oslash", "oe", "germandbls",
        "onesuperior", "logicalnot", "mu", "trademark", "Eth", "onehalf",
        "plusminus", "Thorn", "onequarter", "divide", "brokenbar",
        "degree", "thorn", "threequarters", "twosuperior", "registered",
        "minus", "eth", "multiply", "threesuperior", "copyright",
        "Aacute", "Acircumflex", "Adieresis", "Agrave", "Aring", "Atilde",
        "Ccedilla", "Eacute", "Ecircumflex", "Edieresis", "Egrave",
        "Iacute", "Icircumflex", "Idieresis", "Igrave", "Ntilde",
        "Oacute", "Ocircumflex", "Odieresis", "Ograve", "Otilde",
        "Scaron", "Uacute", "Ucircumflex", "Udieresis", "Ugrave",
        "Yacute", "Ydieresis", "Zcaron",
        "aacute", "acircumflex", "adieresis", "agrave", "aring", "atilde",
        "ccedilla", "eacute", "ecircumflex", "edieresis", "egrave",
        "iacute", "icircumflex", "idieresis", "igrave", "ntilde",
        "oacute", "ocircumflex", "odieresis", "ograve", "otilde",
        "scaron", "uacute", "ucircumflex", "udieresis", "ugrave",
        "yacute", "ydieresis", "zcaron",
        "exclamsmall", "Hungarumlautsmall", "dollaroldstyle",
        "dollarsuperior", "ampersandsmall", "Acutesmall",
        "parenleftsuperior", "parenrightsuperior", "twodotenleader",
        "onedotenleader", "zerooldstyle", "oneoldstyle", "twooldstyle",
        "threeoldstyle", "fouroldstyle", "fiveoldstyle", "sixoldstyle",
        "sevenoldstyle", "eightoldstyle", "nineoldstyle", "commasuperior",
        "threequartersemdash", "periodsuperior", "questionsmall",
        "asuperior", "bsuperior", "centsuperior", "dsuperior", "esuperior",
        "isuperior", "lsuperior", "msuperior", "nsuperior", "osuperior",
        "rsuperior", "ssuperior", "tsuperior", "ff", "ffi", "ffl",
        "parenleftinferior", "parenrightinferior", "Circumflexsmall",
        "hyphensuperior", "Gravesmall",
        "Asmall", "Bsmall", "Csmall", "Dsmall", "Esmall", "Fsmall",
        "Gsmall", "Hsmall", "Ismall", "Jsmall", "Ksmall", "Lsmall",
        "Msmall", "Nsmall", "Osmall", "Psmall", "Qsmall", "Rsmall",
        "Ssmall", "Tsmall", "Usmall", "Vsmall", "Wsmall", "Xsmall",
        "Ysmall", "Zsmall",
        "colonmonetary", "onefitted", "rupiah", "Tildesmall",
        "exclamdownsmall", "centoldstyle", "Lslashsmall", "Scaronsmall",
        "Zcaronsmall", "Dieresissmall", "Brevesmall", "Caronsmall",
        "Dotaccentsmall", "Macronsmall", "figuredash", "hypheninferior",
        "Ogoneksmall", "Ringsmall", "Cedillasmall", "questiondownsmall",
        "oneeighth", "threeeighths", "fiveeighths", "seveneighths",
        "onethird", "twothirds", "zerosuperior", "foursuperior",
        "fivesuperior", "sixsuperior", "sevensuperior", "eightsuperior",
        "ninesuperior", "zeroinferior", "oneinferior", "twoinferior",
        "threeinferior", "fourinferior", "fiveinferior", "sixinferior",
        "seveninferior", "eightinferior", "nineinferior", "centinferior",
        "dollarinferior", "periodinferior", "commainferior",
        "Agravesmall", "Aacutesmall", "Acircumflexsmall", "Atildesmall",
        "Adieresissmall", "Aringsmall", "AEsmall", "Ccedillasmall",
        "Egravesmall", "Eacutesmall", "Ecircumflexsmall", "Edieresissmall",
        "Igravesmall", "Iacutesmall", "Icircumflexsmall", "Idieresissmall",
        "Ethsmall", "Ntildesmall", "Ogravesmall", "Oacutesmall",
        "Ocircumflexsmall", "Otildesmall", "Odieresissmall", "OEsmall",
        "Oslashsmall", "Ugravesmall", "Uacutesmall", "Ucircumflexsmall",
        "Udieresissmall", "Yacutesmall", "Thornsmall", "Ydieresissmall",
        "001.000", "001.001", "001.002", "001.003",
        "Black", "Bold", "Book", "Light", "Medium", "Regular", "Roman",
        "Semibold",
    )

    # Number of glyph slots (including .notdef) covered by the predefined
    # ISOAdobe charset, in which glyph i simply has SID i (1..228).
    _ISOADOBE_NGLYPHS = 229

    class INDEX:
        """Lazy view on a CFF INDEX: an array of variable-sized byte objects.

        Construction reads the count and offset array and leaves ``fp``
        positioned just past the INDEX data.
        """

        def __init__(self, fp: BinaryIO) -> None:
            self.fp = fp
            self.offsets: List[int] = []
            (count,) = struct.unpack(">H", self.fp.read(2))
            if count == 0:
                # An empty INDEX consists of the count field only: there is
                # no offSize byte and no offset array to read.  A single
                # offset of 1 keeps len() == 0 and base consistent.
                self.offsets.append(1)
                self.base = self.fp.tell() - 1
                return
            (offsize,) = struct.unpack("B", self.fp.read(1))
            for _ in range(count + 1):
                self.offsets.append(nunpack(self.fp.read(offsize)))
            # Offsets are 1-based relative to the byte preceding the data.
            self.base = self.fp.tell() - 1
            self.fp.seek(self.base + self.offsets[-1])

        def __repr__(self) -> str:
            return "<INDEX: size=%d>" % len(self)

        def __len__(self) -> int:
            return len(self.offsets) - 1

        def __getitem__(self, i: int) -> bytes:
            self.fp.seek(self.base + self.offsets[i])
            return self.fp.read(self.offsets[i + 1] - self.offsets[i])

        def __iter__(self) -> Iterator[bytes]:
            return iter(self[i] for i in range(len(self)))

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Parse the CFF data readable (and seekable) through ``fp``.

        :raises PDFValueError: for an unknown charset or encoding format
        """
        self.name = name
        self.fp = fp
        # Header
        (_major, _minor, hdrsize, _offsize) = struct.unpack("BBBB", self.fp.read(4))
        # Skip any header bytes beyond the four just read; never pass a
        # negative size to read(), which would consume the whole stream.
        self.fp.read(max(hdrsize - 4, 0))
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Top DICT DATA
        self.top_dict = getdict(self.dict_index[0])
        (charset_pos,) = self.top_dict.get(15, [0])
        (encoding_pos,) = self.top_dict.get(16, [0])
        (charstring_pos,) = self.top_dict.get(17, [0])
        # CharStrings
        self.fp.seek(cast(int, charstring_pos))
        self.charstring = self.INDEX(self.fp)
        self.nglyphs = len(self.charstring)
        # Charsets (parsed first: supplemental encodings refer to names)
        self.name2gid: Dict[Union[str, bytes], int] = {}
        self.gid2name: Dict[int, Union[str, bytes]] = {}
        self._parse_charset(cast(int, charset_pos))
        # Encodings
        self.code2gid: Dict[int, int] = {}
        self.gid2code: Dict[int, int] = {}
        self._parse_encoding(cast(int, encoding_pos))

    def _add_glyph_name(self, gid: int, sid: int) -> None:
        """Record that glyph ``gid`` is named by string id ``sid``."""
        sidname = self.getstr(sid)
        self.name2gid[sidname] = gid
        self.gid2name[gid] = sidname

    def _parse_charset(self, charset_pos: int) -> None:
        """Fill ``name2gid``/``gid2name`` from the charset at ``charset_pos``.

        Values 0, 1 and 2 are ids of predefined charsets rather than file
        offsets.  ISOAdobe (0) is expanded here; the Expert charsets (1, 2)
        need tables that are not available in this module and are skipped.
        """
        if charset_pos == 0:
            for gid in range(1, min(self.nglyphs, self._ISOADOBE_NGLYPHS)):
                self._add_glyph_name(gid, gid)
            return
        if charset_pos in (1, 2):
            log.debug("Predefined CFF charset %r is not supported", charset_pos)
            return

        self.fp.seek(charset_pos)
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0: one SID per glyph, .notdef omitted.
            n = max(self.nglyphs - 1, 0)
            sids = cast(
                Tuple[int, ...],
                struct.unpack(">%dH" % n, self.fp.read(2 * n)),
            )
            for gid, sid in enumerate(sids, start=1):
                self._add_glyph_name(gid, sid)
        elif fmt in (b"\x01", b"\x02"):
            # Formats 1 and 2: ranges of consecutive SIDs.  There is no
            # range count; ranges continue until every glyph is covered.
            # nLeft is one byte in format 1 and two bytes in format 2.
            range_fmt = ">HB" if fmt == b"\x01" else ">HH"
            range_size = struct.calcsize(range_fmt)
            gid = 1
            while gid < self.nglyphs:
                (first, nleft) = struct.unpack(range_fmt, self.fp.read(range_size))
                for sid in range(first, first + nleft + 1):
                    if gid >= self.nglyphs:
                        break
                    self._add_glyph_name(gid, sid)
                    gid += 1
        else:
            raise PDFValueError("unsupported charset format: %r" % fmt)

    def _parse_encoding(self, encoding_pos: int) -> None:
        """Fill ``code2gid``/``gid2code`` from the encoding at ``encoding_pos``.

        Values 0 and 1 are ids of the predefined Standard and Expert
        encodings rather than file offsets; their tables are not available
        in this module, so the maps are left empty for them.
        """
        if encoding_pos in (0, 1):
            log.debug("Predefined CFF encoding %r is not supported", encoding_pos)
            return

        self.fp.seek(encoding_pos)
        fmt_byte = self.fp.read(1)
        if not fmt_byte:
            raise PDFValueError("unsupported encoding format: %r" % fmt_byte)
        # The high bit flags supplemental data following the main table.
        fmt = fmt_byte[0] & 0x7F
        has_supplement = bool(fmt_byte[0] & 0x80)

        if fmt == 0:
            # Format 0: code[i] is the character code of glyph i + 1.
            (n,) = struct.unpack("B", self.fp.read(1))
            codes = struct.unpack("%dB" % n, self.fp.read(n))
            for gid, code in enumerate(codes, start=1):
                self.code2gid[code] = gid
                self.gid2code[gid] = code
        elif fmt == 1:
            # Format 1: ranges of consecutive codes, assigned to glyphs in
            # glyph-id order starting at 1.
            (n,) = struct.unpack("B", self.fp.read(1))
            gid = 1
            for _ in range(n):
                (first, nleft) = struct.unpack("BB", self.fp.read(2))
                for code in range(first, first + nleft + 1):
                    self.code2gid[code] = gid
                    self.gid2code[gid] = code
                    gid += 1
        else:
            raise PDFValueError("unsupported encoding format: %r" % fmt_byte)

        if has_supplement:
            # Extra (code, SID) pairs giving additional codes to glyphs.
            (nsups,) = struct.unpack("B", self.fp.read(1))
            for _ in range(nsups):
                (code, sid) = struct.unpack(">BH", self.fp.read(3))
                gid_for_name = self.name2gid.get(self.getstr(sid))
                if gid_for_name is not None:
                    self.code2gid[code] = gid_for_name
                    self.gid2code.setdefault(gid_for_name, code)

    def getstr(self, sid: int) -> Union[str, bytes]:
        """Return the string for string id ``sid``.

        This returns str for one of the STANDARD_STRINGS but bytes (the raw
        String INDEX entry) otherwise, which appears to be a needless source
        of type complexity.
        """
        if sid < len(self.STANDARD_STRINGS):
            return self.STANDARD_STRINGS[sid]
        return self.string_index[sid - len(self.STANDARD_STRINGS)]
class TrueTypeFont:
    """Reader for the table directory and ``cmap`` table of a TrueType font."""

    class CMapNotFound(PDFException):
        """Raised when no usable character map can be extracted."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the table directory from ``fp`` into ``self.tables``.

        ``self.tables`` maps a 4-byte table tag to ``(offset, length)``.
        """
        self.name = name
        self.fp = fp
        self.tables: Dict[bytes, Tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                Tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, _tsum, offset, length) = cast(
                    Tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id -> character map from the font's ``cmap`` table.

        Only Unicode subtables (platform 0, or platform 3 with encoding 1
        or 10) in format 0, 2 or 4 are used; subtables in any other format
        are logged and skipped.

        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table or
            no supported subtable yields any mapping
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = cast(Tuple[int, int], struct.unpack(">HH", fp.read(4)))
        subtables: List[Tuple[int, int, int]] = [
            cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8)))
            for _ in range(nsubtables)
        ]
        char2gid: Dict[int, int] = {}
        # Only supports subtable type 0, 2 and 4.
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype, _fmtlen, _fmtlang) = cast(
                Tuple[int, int, int],
                struct.unpack(">HHH", fp.read(6)),
            )
            if fmttype == 0:
                # Byte encoding table: 256 glyph ids indexed by char code.
                char2gid.update(
                    enumerate(
                        cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))),
                    ),
                )
            elif fmttype == 2:
                # High-byte mapping through table.
                subheaderkeys = cast(
                    Tuple[int, ...],
                    struct.unpack(">256H", fp.read(512)),
                )
                firstbytes = [0] * 8192
                for i, k in enumerate(subheaderkeys):
                    firstbytes[k // 8] = i
                nhdrs = max(subheaderkeys) // 8 + 1
                hdrs: List[Tuple[int, int, int, int, int]] = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = cast(
                        Tuple[int, int, int, int],
                        struct.unpack(">HHhH", fp.read(8)),
                    )
                    hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
                for i, firstcode, entcount, delta, pos in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                        if gid:
                            gid += delta
                        char2gid[first + c] = gid
            elif fmttype == 4:
                # Segment mapping to delta values.
                (segcount, _1, _2, _3) = cast(
                    Tuple[int, int, int, int],
                    struct.unpack(">HHHH", fp.read(8)),
                )
                segcount //= 2
                ecs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                fp.read(2)  # reservedPad
                scs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                idds = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dh" % segcount, fp.read(2 * segcount)),
                )
                pos = fp.tell()
                idrs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                for seg, (ec, sc, idd, idr) in enumerate(zip(ecs, scs, idds, idrs)):
                    if idr:
                        # idRangeOffset is relative to the location of this
                        # segment's own idRangeOffset entry (pos + 2 * seg),
                        # not to the start of the idRangeOffset array.
                        fp.seek(pos + 2 * seg + idr)
                        for c in range(sc, ec + 1):
                            b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                            # A stored 0 means "missing glyph"; idDelta is
                            # only applied to non-zero glyph ids.
                            if b:
                                b = (b + idd) & 0xFFFF
                            char2gid[c] = b
                    else:
                        for c in range(sc, ec + 1):
                            char2gid[c] = (c + idd) & 0xFFFF
            else:
                # Unsupported subtable format: skip it so that another
                # subtable can still provide a mapping.
                log.warning("Skipping unsupported cmap subtable format %r", fmttype)
                continue
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map
class PDFFontError(PDFException):
    """Base class for the font-related errors raised in this module."""
class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""
LITERAL_TYPE1C = LIT("Type1C")
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")

# A font width table is one dict whose keys may be *either* integer
# character IDs or unicode character strings.
FontWidthDict = Dict[Union[int, str], float]
class PDFFont:
    """Base class holding the metrics shared by all PDF font types.

    Subclasses provide ``to_unichr`` and may override the writing-mode and
    decoding hooks (``is_vertical``, ``is_multibyte``, ``decode``,
    ``char_disp``).
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: Optional[float] = None,
    ) -> None:
        """Initialise metrics from a font descriptor and a width table.

        :param descriptor: font descriptor dictionary; missing entries
            default to 0 (``FontName`` defaults to ``"unknown"``)
        :param widths: glyph widths keyed by character id or unicode string
        :param default_width: width for characters absent from ``widths``;
            when None the descriptor's ``MissingWidth`` (default 0) is used
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)
        self.fontname = resolve1(descriptor.get("FontName", "unknown"))
        if isinstance(self.fontname, PSLiteral):
            self.fontname = literal_name(self.fontname)
        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
        if default_width is None:
            self.default_width = num_value(descriptor.get("MissingWidth", 0))
        else:
            self.default_width = default_width
        self.default_width = resolve1(self.default_width)
        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = self._parse_bbox(descriptor)
        # Scale applied by the get_* / char_width accessors to raw metrics.
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        """Whether the font uses vertical writing mode (never, by default)."""
        return False

    def is_multibyte(self) -> bool:
        """Whether character codes may span several bytes (no, by default)."""
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split a byte string into character ids: one id per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self) -> float:
        """Scaled bounding-box width; falls back to the negated default
        width when the bounding box has zero width."""
        w = self.bbox[2] - self.bbox[0]
        if w == 0:
            w = -self.default_width
        return w * self.hscale

    def get_height(self) -> float:
        """Scaled bounding-box height; falls back to ``ascent - descent``
        when the bounding box has zero height."""
        h = self.bbox[3] - self.bbox[1]
        if h == 0:
            h = self.ascent - self.descent
        return h * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the scaled width of character ``cid``.

        The default width is used when neither the id nor its unicode
        string has a usable entry in the width table.
        """
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        cid_width = safe_float(self.widths.get(cid))
        if cid_width is not None:
            return cid_width * self.hscale

        try:
            str_cid = self.to_unichr(cid)
            cid_width = safe_float(self.widths.get(str_cid))
            if cid_width is not None:
                return cid_width * self.hscale

        except PDFUnicodeNotDefined:
            pass

        return self.default_width * self.hscale

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Sum of ``char_width`` over the character ids decoded from ``s``."""
        return sum(self.char_width(cid) for cid in self.decode(s))

    def to_unichr(self, cid: int) -> str:
        """Map a character id to unicode text; provided by subclasses."""
        raise NotImplementedError

    @staticmethod
    def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
        """Parse FontBBox from the fonts descriptor

        Falls back to an all-zero rectangle (with a warning) when the entry
        cannot be parsed as 4 floats.
        """
        font_bbox = resolve_all(descriptor.get("FontBBox"))
        bbox = safe_rect_list(font_bbox)
        if bbox is None:
            log.warning(
                f"Could not get FontBBox from font descriptor because {font_bbox!r} cannot be parsed as 4 floats"
            )
            return 0.0, 0.0, 0.0, 0.0
        return bbox
class PDFSimpleFont(PDFFont):
    """Font whose character ids map to unicode through an encoding table,
    optionally overridden by a ToUnicode CMap."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        # The encoding is given either as the name of a built-in encoding
        # or as a dictionary describing differences from a base encoding.
        # StandardEncoding is assumed when nothing is specified.
        encoding = (
            resolve1(spec["Encoding"])
            if "Encoding" in spec
            else LITERAL_STANDARD_ENCODING
        )
        if not isinstance(encoding, dict):
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        else:
            base_name = literal_name(
                encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING)
            )
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)

        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            to_unicode_stream = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            parser = CMapParser(self.unicode_map, BytesIO(to_unicode_stream.get_data()))
            parser.run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode text for ``cid``.

        The ToUnicode map (if any) takes precedence over the encoding
        table; PDFUnicodeNotDefined is raised when neither knows ``cid``.
        """
        to_unicode = self.unicode_map
        if to_unicode:
            try:
                return to_unicode.get_unichr(cid)
            except KeyError:
                # Fall through to the encoding table.
                pass
        try:
            unichr = self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
        return unichr
class PDFType1Font(PDFSimpleFont):
    """Type 1 font, using bundled metrics when the base font is known."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            metrics = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            # No bundled metrics: take descriptor and widths from the spec.
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {
                index + firstchar: resolve1(width)
                for index, width in enumerate(width_list)
            }
        else:
            descriptor, int_widths = metrics
            # implicit int->float
            widths = cast(Dict[Union[str, int], float], int_widths)
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        if "Encoding" in spec or "FontFile" not in descriptor:
            return
        # try to recover the missing encoding info from the font file.
        self.fontfile = stream_value(descriptor.get("FontFile"))
        length1 = int_value(self.fontfile["Length1"])
        header = self.fontfile.get_data()[:length1]
        self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self) -> str:
        return "<PDFType1Font: basefont=%r>" % (self.basefont,)
class PDFTrueTypeFont(PDFType1Font):
    """Same construction as PDFType1Font; only the repr differs."""

    def __repr__(self) -> str:
        return "<PDFTrueTypeFont: basefont=%r>" % (self.basefont,)
class PDFType3Font(PDFSimpleFont):
    """Type 3 font: metrics are taken from the font dictionary itself and
    scaled by its FontMatrix."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        firstchar = int_value(spec.get("FirstChar", 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get("Widths", [0] * 256))
        widths: Dict[Union[str, int], float] = {}
        for index, width in enumerate(width_list):
            widths[index + firstchar] = width

        # Without a FontDescriptor, synthesise one from the font's FontBBox.
        descriptor = (
            dict_value(spec["FontDescriptor"])
            if "FontDescriptor" in spec
            else {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        )
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
        # Descent and ascent come from the bounding box's y extent.
        _, self.descent, _, self.ascent = self.bbox
        self.hscale, self.vscale = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self) -> str:
        return "<PDFType3Font>"
class PDFCIDFont(PDFFont):
    """CID-keyed font: byte strings are decoded to character ids through a
    CMap, and ids are mapped to unicode through a separate unicode map."""

    default_disp: Union[float, Tuple[Optional[float], float]]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        # Character collection, rendered as "<registry>-<ordering>".
        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        raw_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown"))
        cid_registry = raw_registry.decode("latin1")
        raw_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown"))
        cid_ordering = raw_ordering.decode("latin1")
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"

        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing")
            descriptor = {}

        # An embedded TrueType program can supply a unicode map later on.
        ttf = None
        if "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))

        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                strm = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
            else:
                cmap_name = literal_name(spec["ToUnicode"])
                encoding = literal_name(spec["Encoding"])
                mentions_identity = any(
                    "Identity" in text
                    for text in (cid_ordering, cmap_name, encoding)
                )
                if mentions_identity:
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )
            except CMapDB.CMapNotFound:
                pass

        self.vertical = self.cmap.is_vertical()
        widths: Dict[Union[str, int], float]
        if not self.vertical:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        else:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {
                cid: (disp_x, disp_y)
                for (cid, (_, (disp_x, disp_y))) in widths2.items()
            }
            (default_vy, default_w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, default_vy)
            widths = {cid: glyph_w for (cid, (glyph_w, _)) in widths2.items()}
            default_width = default_w
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        An unknown cmap name raises PDFFontError in strict mode and yields
        an empty ``CMap`` otherwise.
        """
        cmap_name = self._get_cmap_name(spec, strict)
        try:
            found = CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e)
            return CMap()
        return found

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification"""
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            # Objects with a ``name`` attribute are named directly;
            # anything else is looked up through its CMapName entry.
            if not hasattr(spec_encoding, "name"):
                cmap_name = literal_name(spec_encoding["CMapName"])
            else:
                cmap_name = literal_name(spec["Encoding"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified")

        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        # Translate aliases (e.g. DLIdent-H) to their Identity counterpart.
        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        return self.vertical

    def is_multibyte(self) -> bool:
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Decode a byte string into character ids using the font's CMap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode text for ``cid``.

        PDFUnicodeNotDefined is raised when there is no unicode map or the
        map has no entry for ``cid``.
        """
        try:
            if not self.unicode_map:
                raise PDFKeyError(cid)
            unichr = self.unicode_map.get_unichr(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
        return unichr
|