__main__.py 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140
  1. # -----------------------------------------------------------------------------
  2. # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
  3. # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
  4. # Part of "PyMuPDF", Python bindings for "MuPDF" (http://mupdf.com), a
  5. # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
  6. # maintained and developed by Artifex Software, Inc. https://artifex.com.
  7. # -----------------------------------------------------------------------------
  8. import argparse
  9. import bisect
  10. import os
  11. import sys
  12. import statistics
  13. from typing import Dict, List, Set
  14. from . import pymupdf
  15. def mycenter(x):
  16. return (" %s " % x).center(75, "-")
  17. def recoverpix(doc, item):
  18. """Return image for a given XREF."""
  19. x = item[0] # xref of PDF image
  20. s = item[1] # xref of its /SMask
  21. if s == 0: # no smask: use direct image output
  22. return doc.extract_image(x)
  23. def getimage(pix):
  24. if pix.colorspace.n != 4:
  25. return pix
  26. tpix = pymupdf.Pixmap(pymupdf.csRGB, pix)
  27. return tpix
  28. # we need to reconstruct the alpha channel with the smask
  29. pix1 = pymupdf.Pixmap(doc, x)
  30. pix2 = pymupdf.Pixmap(doc, s) # create pixmap of the /SMask entry
  31. """Sanity check:
  32. - both pixmaps must have the same rectangle
  33. - both pixmaps must have alpha=0
  34. - pix2 must consist of 1 byte per pixel
  35. """
  36. if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
  37. pymupdf.message("Warning: unsupported /SMask %i for %i:" % (s, x))
  38. pymupdf.message(pix2)
  39. pix2 = None
  40. return getimage(pix1) # return the pixmap as is
  41. pix = pymupdf.Pixmap(pix1) # copy of pix1, with an alpha channel added
  42. pix.set_alpha(pix2.samples) # treat pix2.samples as the alpha values
  43. pix1 = pix2 = None # free temp pixmaps
  44. # we may need to adjust something for CMYK pixmaps here:
  45. return getimage(pix)
  46. def open_file(filename, password, show=False, pdf=True):
  47. """Open and authenticate a document."""
  48. doc = pymupdf.open(filename)
  49. if not doc.is_pdf and pdf is True:
  50. sys.exit("this command supports PDF files only")
  51. rc = -1
  52. if not doc.needs_pass:
  53. return doc
  54. if password:
  55. rc = doc.authenticate(password)
  56. if not rc:
  57. sys.exit("authentication unsuccessful")
  58. if show is True:
  59. pymupdf.message("authenticated as %s" % "owner" if rc > 2 else "user")
  60. else:
  61. sys.exit("'%s' requires a password" % doc.name)
  62. return doc
  63. def print_dict(item):
  64. """Print a Python dictionary."""
  65. l = max([len(k) for k in item.keys()]) + 1
  66. for k, v in item.items():
  67. msg = "%s: %s" % (k.rjust(l), v)
  68. pymupdf.message(msg)
  69. def print_xref(doc, xref):
  70. """Print an object given by XREF number.
  71. Simulate the PDF source in "pretty" format.
  72. For a stream also print its size.
  73. """
  74. pymupdf.message("%i 0 obj" % xref)
  75. xref_str = doc.xref_object(xref)
  76. pymupdf.message(xref_str)
  77. if doc.xref_is_stream(xref):
  78. temp = xref_str.split()
  79. try:
  80. idx = temp.index("/Length") + 1
  81. size = temp[idx]
  82. if size.endswith("0 R"):
  83. size = "unknown"
  84. except Exception:
  85. size = "unknown"
  86. pymupdf.message("stream\n...%s bytes" % size)
  87. pymupdf.message("endstream")
  88. pymupdf.message("endobj")
  89. def get_list(rlist, limit, what="page"):
  90. """Transform a page / xref specification into a list of integers.
  91. Args
  92. ----
  93. rlist: (str) the specification
  94. limit: maximum number, i.e. number of pages, number of objects
  95. what: a string to be used in error messages
  96. Returns
  97. -------
  98. A list of integers representing the specification.
  99. """
  100. N = str(limit - 1)
  101. rlist = rlist.replace("N", N).replace(" ", "")
  102. rlist_arr = rlist.split(",")
  103. out_list = []
  104. for seq, item in enumerate(rlist_arr):
  105. n = seq + 1
  106. if item.isdecimal(): # a single integer
  107. i = int(item)
  108. if 1 <= i < limit:
  109. out_list.append(int(item))
  110. else:
  111. sys.exit("bad %s specification at item %i" % (what, n))
  112. continue
  113. try: # this must be a range now, and all of the following must work:
  114. i1, i2 = item.split("-") # will fail if not 2 items produced
  115. i1 = int(i1) # will fail on non-integers
  116. i2 = int(i2)
  117. except Exception:
  118. sys.exit("bad %s range specification at item %i" % (what, n))
  119. if not (1 <= i1 < limit and 1 <= i2 < limit):
  120. sys.exit("bad %s range specification at item %i" % (what, n))
  121. if i1 == i2: # just in case: a range of equal numbers
  122. out_list.append(i1)
  123. continue
  124. if i1 < i2: # first less than second
  125. out_list += list(range(i1, i2 + 1))
  126. else: # first larger than second
  127. out_list += list(range(i1, i2 - 1, -1))
  128. return out_list
  129. def show(args):
  130. doc = open_file(args.input, args.password, True)
  131. size = os.path.getsize(args.input) / 1024
  132. flag = "KB"
  133. if size > 1000:
  134. size /= 1024
  135. flag = "MB"
  136. size = round(size, 1)
  137. meta = doc.metadata # pylint: disable=no-member
  138. pymupdf.message(
  139. "'%s', pages: %i, objects: %i, %g %s, %s, encryption: %s"
  140. % (
  141. args.input,
  142. doc.page_count,
  143. doc.xref_length() - 1,
  144. size,
  145. flag,
  146. meta["format"],
  147. meta["encryption"],
  148. )
  149. )
  150. n = doc.is_form_pdf
  151. if n > 0:
  152. s = doc.get_sigflags()
  153. pymupdf.message(
  154. "document contains %i root form fields and is %ssigned"
  155. % (n, "not " if s != 3 else "")
  156. )
  157. n = doc.embfile_count()
  158. if n > 0:
  159. pymupdf.message("document contains %i embedded files" % n)
  160. pymupdf.message()
  161. if args.catalog:
  162. pymupdf.message(mycenter("PDF catalog"))
  163. xref = doc.pdf_catalog()
  164. print_xref(doc, xref)
  165. pymupdf.message()
  166. if args.metadata:
  167. pymupdf.message(mycenter("PDF metadata"))
  168. print_dict(doc.metadata) # pylint: disable=no-member
  169. pymupdf.message()
  170. if args.xrefs:
  171. pymupdf.message(mycenter("object information"))
  172. xrefl = get_list(args.xrefs, doc.xref_length(), what="xref")
  173. for xref in xrefl:
  174. print_xref(doc, xref)
  175. pymupdf.message()
  176. if args.pages:
  177. pymupdf.message(mycenter("page information"))
  178. pagel = get_list(args.pages, doc.page_count + 1)
  179. for pno in pagel:
  180. n = pno - 1
  181. xref = doc.page_xref(n)
  182. pymupdf.message("Page %i:" % pno)
  183. print_xref(doc, xref)
  184. pymupdf.message()
  185. if args.trailer:
  186. pymupdf.message(mycenter("PDF trailer"))
  187. pymupdf.message(doc.pdf_trailer())
  188. pymupdf.message()
  189. doc.close()
  190. def clean(args):
  191. doc = open_file(args.input, args.password, pdf=True)
  192. encryption = args.encryption
  193. encrypt = ("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256").index(
  194. encryption
  195. )
  196. if not args.pages: # simple cleaning
  197. doc.save(
  198. args.output,
  199. garbage=args.garbage,
  200. deflate=args.compress,
  201. pretty=args.pretty,
  202. clean=args.sanitize,
  203. ascii=args.ascii,
  204. linear=args.linear,
  205. encryption=encrypt,
  206. owner_pw=args.owner,
  207. user_pw=args.user,
  208. permissions=args.permission,
  209. )
  210. return
  211. # create sub document from page numbers
  212. pages = get_list(args.pages, doc.page_count + 1)
  213. outdoc = pymupdf.open()
  214. for pno in pages:
  215. n = pno - 1
  216. outdoc.insert_pdf(doc, from_page=n, to_page=n)
  217. outdoc.save(
  218. args.output,
  219. garbage=args.garbage,
  220. deflate=args.compress,
  221. pretty=args.pretty,
  222. clean=args.sanitize,
  223. ascii=args.ascii,
  224. linear=args.linear,
  225. encryption=encrypt,
  226. owner_pw=args.owner,
  227. user_pw=args.user,
  228. permissions=args.permission,
  229. )
  230. doc.close()
  231. outdoc.close()
  232. return
  233. def doc_join(args):
  234. """Join pages from several PDF documents."""
  235. doc_list = args.input # a list of input PDFs
  236. doc = pymupdf.open() # output PDF
  237. for src_item in doc_list: # process one input PDF
  238. src_list = src_item.split(",")
  239. password = src_list[1] if len(src_list) > 1 else None
  240. src = open_file(src_list[0], password, pdf=True)
  241. pages = ",".join(src_list[2:]) # get 'pages' specifications
  242. if pages: # if anything there, retrieve a list of desired pages
  243. page_list = get_list(",".join(src_list[2:]), src.page_count + 1)
  244. else: # take all pages
  245. page_list = range(1, src.page_count + 1)
  246. for i in page_list:
  247. doc.insert_pdf(src, from_page=i - 1, to_page=i - 1) # copy each source page
  248. src.close()
  249. doc.save(args.output, garbage=4, deflate=True)
  250. doc.close()
  251. def embedded_copy(args):
  252. """Copy embedded files between PDFs."""
  253. doc = open_file(args.input, args.password, pdf=True)
  254. if not doc.can_save_incrementally() and (
  255. not args.output or args.output == args.input
  256. ):
  257. sys.exit("cannot save PDF incrementally")
  258. src = open_file(args.source, args.pwdsource)
  259. names = set(args.name) if args.name else set()
  260. src_names = set(src.embfile_names())
  261. if names:
  262. if not names <= src_names:
  263. sys.exit("not all names are contained in source")
  264. else:
  265. names = src_names
  266. if not names:
  267. sys.exit("nothing to copy")
  268. intersect = names & set(doc.embfile_names()) # any equal name already in target?
  269. if intersect:
  270. sys.exit("following names already exist in receiving PDF: %s" % str(intersect))
  271. for item in names:
  272. info = src.embfile_info(item)
  273. buff = src.embfile_get(item)
  274. doc.embfile_add(
  275. item,
  276. buff,
  277. filename=info["filename"],
  278. ufilename=info["ufilename"],
  279. desc=info["desc"],
  280. )
  281. pymupdf.message("copied entry '%s' from '%s'" % (item, src.name))
  282. src.close()
  283. if args.output and args.output != args.input:
  284. doc.save(args.output, garbage=3)
  285. else:
  286. doc.saveIncr()
  287. doc.close()
  288. def embedded_del(args):
  289. """Delete an embedded file entry."""
  290. doc = open_file(args.input, args.password, pdf=True)
  291. if not doc.can_save_incrementally() and (
  292. not args.output or args.output == args.input
  293. ):
  294. sys.exit("cannot save PDF incrementally")
  295. try:
  296. doc.embfile_del(args.name)
  297. except (ValueError, pymupdf.mupdf.FzErrorBase) as e:
  298. sys.exit(f'no such embedded file {args.name!r}: {e}')
  299. if not args.output or args.output == args.input:
  300. doc.saveIncr()
  301. else:
  302. doc.save(args.output, garbage=1)
  303. doc.close()
  304. def embedded_get(args):
  305. """Retrieve contents of an embedded file."""
  306. doc = open_file(args.input, args.password, pdf=True)
  307. try:
  308. stream = doc.embfile_get(args.name)
  309. d = doc.embfile_info(args.name)
  310. except (ValueError, pymupdf.mupdf.FzErrorBase) as e:
  311. sys.exit(f'no such embedded file {args.name!r}: {e}')
  312. filename = args.output if args.output else d["filename"]
  313. with open(filename, "wb") as output:
  314. output.write(stream)
  315. pymupdf.message("saved entry '%s' as '%s'" % (args.name, filename))
  316. doc.close()
  317. def embedded_add(args):
  318. """Insert a new embedded file."""
  319. doc = open_file(args.input, args.password, pdf=True)
  320. if not doc.can_save_incrementally() and (
  321. args.output is None or args.output == args.input
  322. ):
  323. sys.exit("cannot save PDF incrementally")
  324. try:
  325. doc.embfile_del(args.name)
  326. sys.exit("entry '%s' already exists" % args.name)
  327. except Exception:
  328. pass
  329. if not os.path.exists(args.path) or not os.path.isfile(args.path):
  330. sys.exit("no such file '%s'" % args.path)
  331. with open(args.path, "rb") as f:
  332. stream = f.read()
  333. filename = args.path
  334. ufilename = filename
  335. if not args.desc:
  336. desc = filename
  337. else:
  338. desc = args.desc
  339. doc.embfile_add(
  340. args.name, stream, filename=filename, ufilename=ufilename, desc=desc
  341. )
  342. if not args.output or args.output == args.input:
  343. doc.saveIncr()
  344. else:
  345. doc.save(args.output, garbage=3)
  346. doc.close()
  347. def embedded_upd(args):
  348. """Update contents or metadata of an embedded file."""
  349. doc = open_file(args.input, args.password, pdf=True)
  350. if not doc.can_save_incrementally() and (
  351. args.output is None or args.output == args.input
  352. ):
  353. sys.exit("cannot save PDF incrementally")
  354. try:
  355. doc.embfile_info(args.name)
  356. except Exception:
  357. sys.exit("no such embedded file '%s'" % args.name)
  358. if (
  359. args.path is not None
  360. and os.path.exists(args.path)
  361. and os.path.isfile(args.path)
  362. ):
  363. with open(args.path, "rb") as f:
  364. stream = f.read()
  365. else:
  366. stream = None
  367. if args.filename:
  368. filename = args.filename
  369. else:
  370. filename = None
  371. if args.ufilename:
  372. ufilename = args.ufilename
  373. elif args.filename:
  374. ufilename = args.filename
  375. else:
  376. ufilename = None
  377. if args.desc:
  378. desc = args.desc
  379. else:
  380. desc = None
  381. doc.embfile_upd(
  382. args.name, stream, filename=filename, ufilename=ufilename, desc=desc
  383. )
  384. if args.output is None or args.output == args.input:
  385. doc.saveIncr()
  386. else:
  387. doc.save(args.output, garbage=3)
  388. doc.close()
  389. def embedded_list(args):
  390. """List embedded files."""
  391. doc = open_file(args.input, args.password, pdf=True)
  392. names = doc.embfile_names()
  393. if args.name is not None:
  394. if args.name not in names:
  395. sys.exit("no such embedded file '%s'" % args.name)
  396. else:
  397. pymupdf.message()
  398. pymupdf.message(
  399. "printing 1 of %i embedded file%s:"
  400. % (len(names), "s" if len(names) > 1 else "")
  401. )
  402. pymupdf.message()
  403. print_dict(doc.embfile_info(args.name))
  404. pymupdf.message()
  405. return
  406. if not names:
  407. pymupdf.message("'%s' contains no embedded files" % doc.name)
  408. return
  409. if len(names) > 1:
  410. msg = "'%s' contains the following %i embedded files" % (doc.name, len(names))
  411. else:
  412. msg = "'%s' contains the following embedded file" % doc.name
  413. pymupdf.message(msg)
  414. pymupdf.message()
  415. for name in names:
  416. if not args.detail:
  417. pymupdf.message(name)
  418. continue
  419. _ = doc.embfile_info(name)
  420. print_dict(doc.embfile_info(name))
  421. pymupdf.message()
  422. doc.close()
  423. def extract_objects(args):
  424. """Extract images and / or fonts from a PDF."""
  425. if not args.fonts and not args.images:
  426. sys.exit("neither fonts nor images requested")
  427. doc = open_file(args.input, args.password, pdf=True)
  428. if args.pages:
  429. pages = get_list(args.pages, doc.page_count + 1)
  430. else:
  431. pages = range(1, doc.page_count + 1)
  432. if not args.output:
  433. out_dir = os.path.abspath(os.curdir)
  434. else:
  435. out_dir = args.output
  436. if not (os.path.exists(out_dir) and os.path.isdir(out_dir)):
  437. sys.exit("output directory %s does not exist" % out_dir)
  438. font_xrefs = set() # already saved fonts
  439. image_xrefs = set() # already saved images
  440. for pno in pages:
  441. if args.fonts:
  442. itemlist = doc.get_page_fonts(pno - 1)
  443. for item in itemlist:
  444. xref = item[0]
  445. if xref not in font_xrefs:
  446. font_xrefs.add(xref)
  447. fontname, ext, _, buffer = doc.extract_font(xref)
  448. if ext == "n/a" or not buffer:
  449. continue
  450. outname = os.path.join(
  451. out_dir, f"{fontname.replace(' ', '-')}-{xref}.{ext}"
  452. )
  453. with open(outname, "wb") as outfile:
  454. outfile.write(buffer)
  455. buffer = None
  456. if args.images:
  457. itemlist = doc.get_page_images(pno - 1)
  458. for item in itemlist:
  459. xref = item[0]
  460. if xref not in image_xrefs:
  461. image_xrefs.add(xref)
  462. pix = recoverpix(doc, item)
  463. if type(pix) is dict:
  464. ext = pix["ext"]
  465. imgdata = pix["image"]
  466. outname = os.path.join(out_dir, "img-%i.%s" % (xref, ext))
  467. with open(outname, "wb") as outfile:
  468. outfile.write(imgdata)
  469. else:
  470. outname = os.path.join(out_dir, "img-%i.png" % xref)
  471. pix2 = (
  472. pix
  473. if pix.colorspace.n < 4
  474. else pymupdf.Pixmap(pymupdf.csRGB, pix)
  475. )
  476. pix2.save(outname)
  477. if args.fonts:
  478. pymupdf.message("saved %i fonts to '%s'" % (len(font_xrefs), out_dir))
  479. if args.images:
  480. pymupdf.message("saved %i images to '%s'" % (len(image_xrefs), out_dir))
  481. doc.close()
  482. def page_simple(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
  483. eop = b"\n" if noformfeed else bytes([12])
  484. text = page.get_text("text", flags=flags)
  485. if not text:
  486. if not skip_empty:
  487. textout.write(eop) # write formfeed
  488. return
  489. textout.write(text.encode("utf8", errors="surrogatepass"))
  490. textout.write(eop)
  491. return
  492. def page_blocksort(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
  493. eop = b"\n" if noformfeed else bytes([12])
  494. blocks = page.get_text("blocks", flags=flags)
  495. if blocks == []:
  496. if not skip_empty:
  497. textout.write(eop) # write formfeed
  498. return
  499. blocks.sort(key=lambda b: (b[3], b[0]))
  500. for b in blocks:
  501. textout.write(b[4].encode("utf8", errors="surrogatepass"))
  502. textout.write(eop)
  503. return
  504. def page_layout(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
  505. eop = b"\n" if noformfeed else bytes([12])
  506. # --------------------------------------------------------------------
  507. def find_line_index(values: List[int], value: int) -> int:
  508. """Find the right row coordinate.
  509. Args:
  510. values: (list) y-coordinates of rows.
  511. value: (int) lookup for this value (y-origin of char).
  512. Returns:
  513. y-ccordinate of appropriate line for value.
  514. """
  515. i = bisect.bisect_right(values, value)
  516. if i:
  517. return values[i - 1]
  518. raise RuntimeError("Line for %g not found in %s" % (value, values))
  519. # --------------------------------------------------------------------
  520. def curate_rows(rows: Set[int], GRID) -> List:
  521. rows = list(rows)
  522. rows.sort() # sort ascending
  523. nrows = [rows[0]]
  524. for h in rows[1:]:
  525. if h >= nrows[-1] + GRID: # only keep significant differences
  526. nrows.append(h)
  527. return nrows # curated list of line bottom coordinates
  528. def process_blocks(blocks: List[Dict], page: pymupdf.Page):
  529. rows = set()
  530. page_width = page.rect.width
  531. page_height = page.rect.height
  532. rowheight = page_height
  533. left = page_width
  534. right = 0
  535. chars = []
  536. for block in blocks:
  537. for line in block["lines"]:
  538. if line["dir"] != (1, 0): # ignore non-horizontal text
  539. continue
  540. x0, y0, x1, y1 = line["bbox"]
  541. if y1 < 0 or y0 > page.rect.height: # ignore if outside CropBox
  542. continue
  543. # upd row height
  544. height = y1 - y0
  545. if rowheight > height:
  546. rowheight = height
  547. for span in line["spans"]:
  548. if span["size"] <= fontsize:
  549. continue
  550. for c in span["chars"]:
  551. x0, _, x1, _ = c["bbox"]
  552. cwidth = x1 - x0
  553. ox, oy = c["origin"]
  554. oy = int(round(oy))
  555. rows.add(oy)
  556. ch = c["c"]
  557. if left > ox and ch != " ":
  558. left = ox # update left coordinate
  559. if right < x1:
  560. right = x1 # update right coordinate
  561. # handle ligatures:
  562. if cwidth == 0 and chars != []: # potential ligature
  563. old_ch, old_ox, old_oy, old_cwidth = chars[-1]
  564. if old_oy == oy: # ligature
  565. if old_ch != chr(0xFB00): # previous "ff" char lig?
  566. lig = joinligature(old_ch + ch) # no
  567. # convert to one of the 3-char ligatures:
  568. elif ch == "i":
  569. lig = chr(0xFB03) # "ffi"
  570. elif ch == "l":
  571. lig = chr(0xFB04) # "ffl"
  572. else: # something wrong, leave old char in place
  573. lig = old_ch
  574. chars[-1] = (lig, old_ox, old_oy, old_cwidth)
  575. continue
  576. chars.append((ch, ox, oy, cwidth)) # all chars on page
  577. return chars, rows, left, right, rowheight
  578. def joinligature(lig: str) -> str:
  579. """Return ligature character for a given pair / triple of characters.
  580. Args:
  581. lig: (str) 2/3 characters, e.g. "ff"
  582. Returns:
  583. Ligature, e.g. "ff" -> chr(0xFB00)
  584. """
  585. if lig == "ff":
  586. return chr(0xFB00)
  587. elif lig == "fi":
  588. return chr(0xFB01)
  589. elif lig == "fl":
  590. return chr(0xFB02)
  591. elif lig == "ffi":
  592. return chr(0xFB03)
  593. elif lig == "ffl":
  594. return chr(0xFB04)
  595. elif lig == "ft":
  596. return chr(0xFB05)
  597. elif lig == "st":
  598. return chr(0xFB06)
  599. return lig
  600. # --------------------------------------------------------------------
  601. def make_textline(left, slot, minslot, lchars):
  602. """Produce the text of one output line.
  603. Args:
  604. left: (float) left most coordinate used on page
  605. slot: (float) avg width of one character in any font in use.
  606. minslot: (float) min width for the characters in this line.
  607. chars: (list[tuple]) characters of this line.
  608. Returns:
  609. text: (str) text string for this line
  610. """
  611. text = "" # we output this
  612. old_char = ""
  613. old_x1 = 0 # end coordinate of last char
  614. old_ox = 0 # x-origin of last char
  615. if minslot <= pymupdf.EPSILON:
  616. raise RuntimeError("program error: minslot too small = %g" % minslot)
  617. for c in lchars: # loop over characters
  618. char, ox, _, cwidth = c
  619. ox = ox - left # its (relative) start coordinate
  620. x1 = ox + cwidth # ending coordinate
  621. # eliminate overprint effect
  622. if old_char == char and ox - old_ox <= cwidth * 0.2:
  623. continue
  624. # omit spaces overlapping previous char
  625. if char == " " and (old_x1 - ox) / cwidth > 0.8:
  626. continue
  627. old_char = char
  628. # close enough to previous?
  629. if ox < old_x1 + minslot: # assume char adjacent to previous
  630. text += char # append to output
  631. old_x1 = x1 # new end coord
  632. old_ox = ox # new origin.x
  633. continue
  634. # else next char starts after some gap:
  635. # fill in right number of spaces, so char is positioned
  636. # in the right slot of the line
  637. if char == " ": # rest relevant for non-space only
  638. continue
  639. delta = int(ox / slot) - len(text)
  640. if ox > old_x1 and delta > 1:
  641. text += " " * delta
  642. # now append char
  643. text += char
  644. old_x1 = x1 # new end coordinate
  645. old_ox = ox # new origin
  646. return text.rstrip()
  647. # extract page text by single characters ("rawdict")
  648. blocks = page.get_text("rawdict", flags=flags)["blocks"]
  649. chars, rows, left, right, rowheight = process_blocks(blocks, page)
  650. if chars == []:
  651. if not skip_empty:
  652. textout.write(eop) # write formfeed
  653. return
  654. # compute list of line coordinates - ignoring small (GRID) differences
  655. rows = curate_rows(rows, GRID)
  656. # sort all chars by x-coordinates, so every line will receive char info,
  657. # sorted from left to right.
  658. chars.sort(key=lambda c: c[1])
  659. # populate the lines with their char info
  660. lines = {} # key: y1-ccordinate, value: char list
  661. for c in chars:
  662. _, _, oy, _ = c
  663. y = find_line_index(rows, oy) # y-coord of the right line
  664. lchars = lines.get(y, []) # read line chars so far
  665. lchars.append(c) # append this char
  666. lines[y] = lchars # write back to line
  667. # ensure line coordinates are ascending
  668. keys = list(lines.keys())
  669. keys.sort()
  670. # -------------------------------------------------------------------------
  671. # Compute "char resolution" for the page: the char width corresponding to
  672. # 1 text char position on output - call it 'slot'.
  673. # For each line, compute median of its char widths. The minimum across all
  674. # lines is 'slot'.
  675. # The minimum char width of each line is used to determine if spaces must
  676. # be inserted in between two characters.
  677. # -------------------------------------------------------------------------
  678. slot = right - left
  679. minslots = {}
  680. for k in keys:
  681. lchars = lines[k]
  682. ccount = len(lchars)
  683. if ccount < 2:
  684. minslots[k] = 1
  685. continue
  686. widths = [c[3] for c in lchars]
  687. widths.sort()
  688. this_slot = statistics.median(widths) # take median value
  689. if this_slot < slot:
  690. slot = this_slot
  691. minslots[k] = widths[0]
  692. # compute line advance in text output
  693. rowheight = rowheight * (rows[-1] - rows[0]) / (rowheight * len(rows)) * 1.2
  694. rowpos = rows[0] # first line positioned here
  695. textout.write(b"\n")
  696. for k in keys: # walk through the lines
  697. while rowpos < k: # honor distance between lines
  698. textout.write(b"\n")
  699. rowpos += rowheight
  700. text = make_textline(left, slot, minslots[k], lines[k])
  701. textout.write((text + "\n").encode("utf8", errors="surrogatepass"))
  702. rowpos = k + rowheight
  703. textout.write(eop) # write formfeed
  704. def gettext(args):
  705. doc = open_file(args.input, args.password, pdf=False)
  706. pagel = get_list(args.pages, doc.page_count + 1)
  707. output = args.output
  708. if output is None:
  709. filename, _ = os.path.splitext(doc.name)
  710. output = filename + ".txt"
  711. with open(output, "wb") as textout:
  712. flags = pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_PRESERVE_WHITESPACE
  713. if args.convert_white:
  714. flags ^= pymupdf.TEXT_PRESERVE_WHITESPACE
  715. if args.noligatures:
  716. flags ^= pymupdf.TEXT_PRESERVE_LIGATURES
  717. if args.extra_spaces:
  718. flags ^= pymupdf.TEXT_INHIBIT_SPACES
  719. func = {
  720. "simple": page_simple,
  721. "blocks": page_blocksort,
  722. "layout": page_layout,
  723. }
  724. for pno in pagel:
  725. page = doc[pno - 1]
  726. func[args.mode](
  727. page,
  728. textout,
  729. args.grid,
  730. args.fontsize,
  731. args.noformfeed,
  732. args.skip_empty,
  733. flags=flags,
  734. )
  735. def _internal(args):
  736. pymupdf.message('This is from PyMuPDF message().')
  737. pymupdf.log('This is from PyMuPDF log().')
  738. def main():
  739. """Define command configurations."""
  740. parser = argparse.ArgumentParser(
  741. prog="pymupdf",
  742. description=mycenter("Basic PyMuPDF Functions"),
  743. )
  744. subps = parser.add_subparsers(
  745. title="Subcommands", help="Enter 'command -h' for subcommand specific help"
  746. )
  747. # -------------------------------------------------------------------------
  748. # 'show' command
  749. # -------------------------------------------------------------------------
  750. ps_show = subps.add_parser("show", description=mycenter("display PDF information"))
  751. ps_show.add_argument("input", type=str, help="PDF filename")
  752. ps_show.add_argument("-password", help="password")
  753. ps_show.add_argument("-catalog", action="store_true", help="show PDF catalog")
  754. ps_show.add_argument("-trailer", action="store_true", help="show PDF trailer")
  755. ps_show.add_argument("-metadata", action="store_true", help="show PDF metadata")
  756. ps_show.add_argument(
  757. "-xrefs", type=str, help="show selected objects, format: 1,5-7,N"
  758. )
  759. ps_show.add_argument(
  760. "-pages", type=str, help="show selected pages, format: 1,5-7,50-N"
  761. )
  762. ps_show.set_defaults(func=show)
  763. # -------------------------------------------------------------------------
  764. # 'clean' command
  765. # -------------------------------------------------------------------------
  766. ps_clean = subps.add_parser(
  767. "clean", description=mycenter("optimize PDF, or create sub-PDF if pages given")
  768. )
  769. ps_clean.add_argument("input", type=str, help="PDF filename")
  770. ps_clean.add_argument("output", type=str, help="output PDF filename")
  771. ps_clean.add_argument("-password", help="password")
  772. ps_clean.add_argument(
  773. "-encryption",
  774. help="encryption method",
  775. choices=("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256"),
  776. default="none",
  777. )
  778. ps_clean.add_argument("-owner", type=str, help="owner password")
  779. ps_clean.add_argument("-user", type=str, help="user password")
  780. ps_clean.add_argument(
  781. "-garbage",
  782. type=int,
  783. help="garbage collection level",
  784. choices=range(5),
  785. default=0,
  786. )
  787. ps_clean.add_argument(
  788. "-compress",
  789. action="store_true",
  790. default=False,
  791. help="compress (deflate) output",
  792. )
  793. ps_clean.add_argument(
  794. "-ascii", action="store_true", default=False, help="ASCII encode binary data"
  795. )
  796. ps_clean.add_argument(
  797. "-linear",
  798. action="store_true",
  799. default=False,
  800. help="format for fast web display",
  801. )
  802. ps_clean.add_argument(
  803. "-permission", type=int, default=-1, help="integer with permission levels"
  804. )
  805. ps_clean.add_argument(
  806. "-sanitize",
  807. action="store_true",
  808. default=False,
  809. help="sanitize / clean contents",
  810. )
  811. ps_clean.add_argument(
  812. "-pretty", action="store_true", default=False, help="prettify PDF structure"
  813. )
  814. ps_clean.add_argument(
  815. "-pages", help="output selected pages pages, format: 1,5-7,50-N"
  816. )
  817. ps_clean.set_defaults(func=clean)
  818. # -------------------------------------------------------------------------
  819. # 'join' command
  820. # -------------------------------------------------------------------------
  821. ps_join = subps.add_parser(
  822. "join",
  823. description=mycenter("join PDF documents"),
  824. epilog="specify each input as 'filename[,password[,pages]]'",
  825. )
  826. ps_join.add_argument("input", nargs="*", help="input filenames")
  827. ps_join.add_argument("-output", required=True, help="output filename")
  828. ps_join.set_defaults(func=doc_join)
  829. # -------------------------------------------------------------------------
  830. # 'extract' command
  831. # -------------------------------------------------------------------------
  832. ps_extract = subps.add_parser(
  833. "extract", description=mycenter("extract images and fonts to disk")
  834. )
  835. ps_extract.add_argument("input", type=str, help="PDF filename")
  836. ps_extract.add_argument("-images", action="store_true", help="extract images")
  837. ps_extract.add_argument("-fonts", action="store_true", help="extract fonts")
  838. ps_extract.add_argument(
  839. "-output", help="folder to receive output, defaults to current"
  840. )
  841. ps_extract.add_argument("-password", help="password")
  842. ps_extract.add_argument(
  843. "-pages", type=str, help="consider these pages only, format: 1,5-7,50-N"
  844. )
  845. ps_extract.set_defaults(func=extract_objects)
  846. # -------------------------------------------------------------------------
  847. # 'embed-info'
  848. # -------------------------------------------------------------------------
  849. ps_show = subps.add_parser(
  850. "embed-info", description=mycenter("list embedded files")
  851. )
  852. ps_show.add_argument("input", help="PDF filename")
  853. ps_show.add_argument("-name", help="if given, report only this one")
  854. ps_show.add_argument("-detail", action="store_true", help="detail information")
  855. ps_show.add_argument("-password", help="password")
  856. ps_show.set_defaults(func=embedded_list)
  857. # -------------------------------------------------------------------------
  858. # 'embed-add' command
  859. # -------------------------------------------------------------------------
  860. ps_embed_add = subps.add_parser(
  861. "embed-add", description=mycenter("add embedded file")
  862. )
  863. ps_embed_add.add_argument("input", help="PDF filename")
  864. ps_embed_add.add_argument("-password", help="password")
  865. ps_embed_add.add_argument(
  866. "-output", help="output PDF filename, incremental save if none"
  867. )
  868. ps_embed_add.add_argument("-name", required=True, help="name of new entry")
  869. ps_embed_add.add_argument("-path", required=True, help="path to data for new entry")
  870. ps_embed_add.add_argument("-desc", help="description of new entry")
  871. ps_embed_add.set_defaults(func=embedded_add)
  872. # -------------------------------------------------------------------------
  873. # 'embed-del' command
  874. # -------------------------------------------------------------------------
  875. ps_embed_del = subps.add_parser(
  876. "embed-del", description=mycenter("delete embedded file")
  877. )
  878. ps_embed_del.add_argument("input", help="PDF filename")
  879. ps_embed_del.add_argument("-password", help="password")
  880. ps_embed_del.add_argument(
  881. "-output", help="output PDF filename, incremental save if none"
  882. )
  883. ps_embed_del.add_argument("-name", required=True, help="name of entry to delete")
  884. ps_embed_del.set_defaults(func=embedded_del)
  885. # -------------------------------------------------------------------------
  886. # 'embed-upd' command
  887. # -------------------------------------------------------------------------
  888. ps_embed_upd = subps.add_parser(
  889. "embed-upd",
  890. description=mycenter("update embedded file"),
  891. epilog="except '-name' all parameters are optional",
  892. )
  893. ps_embed_upd.add_argument("input", help="PDF filename")
  894. ps_embed_upd.add_argument("-name", required=True, help="name of entry")
  895. ps_embed_upd.add_argument("-password", help="password")
  896. ps_embed_upd.add_argument(
  897. "-output", help="Output PDF filename, incremental save if none"
  898. )
  899. ps_embed_upd.add_argument("-path", help="path to new data for entry")
  900. ps_embed_upd.add_argument("-filename", help="new filename to store in entry")
  901. ps_embed_upd.add_argument(
  902. "-ufilename", help="new unicode filename to store in entry"
  903. )
  904. ps_embed_upd.add_argument("-desc", help="new description to store in entry")
  905. ps_embed_upd.set_defaults(func=embedded_upd)
  906. # -------------------------------------------------------------------------
  907. # 'embed-extract' command
  908. # -------------------------------------------------------------------------
  909. ps_embed_extract = subps.add_parser(
  910. "embed-extract", description=mycenter("extract embedded file to disk")
  911. )
  912. ps_embed_extract.add_argument("input", type=str, help="PDF filename")
  913. ps_embed_extract.add_argument("-name", required=True, help="name of entry")
  914. ps_embed_extract.add_argument("-password", help="password")
  915. ps_embed_extract.add_argument(
  916. "-output", help="output filename, default is stored name"
  917. )
  918. ps_embed_extract.set_defaults(func=embedded_get)
  919. # -------------------------------------------------------------------------
  920. # 'embed-copy' command
  921. # -------------------------------------------------------------------------
  922. ps_embed_copy = subps.add_parser(
  923. "embed-copy", description=mycenter("copy embedded files between PDFs")
  924. )
  925. ps_embed_copy.add_argument("input", type=str, help="PDF to receive embedded files")
  926. ps_embed_copy.add_argument("-password", help="password of input")
  927. ps_embed_copy.add_argument(
  928. "-output", help="output PDF, incremental save to 'input' if omitted"
  929. )
  930. ps_embed_copy.add_argument(
  931. "-source", required=True, help="copy embedded files from here"
  932. )
  933. ps_embed_copy.add_argument("-pwdsource", help="password of 'source' PDF")
  934. ps_embed_copy.add_argument(
  935. "-name", nargs="*", help="restrict copy to these entries"
  936. )
  937. ps_embed_copy.set_defaults(func=embedded_copy)
  938. # -------------------------------------------------------------------------
  939. # 'textlayout' command
  940. # -------------------------------------------------------------------------
  941. ps_gettext = subps.add_parser(
  942. "gettext", description=mycenter("extract text in various formatting modes")
  943. )
  944. ps_gettext.add_argument("input", type=str, help="input document filename")
  945. ps_gettext.add_argument("-password", help="password for input document")
  946. ps_gettext.add_argument(
  947. "-mode",
  948. type=str,
  949. help="mode: simple, block sort, or layout (default)",
  950. choices=("simple", "blocks", "layout"),
  951. default="layout",
  952. )
  953. ps_gettext.add_argument(
  954. "-pages",
  955. type=str,
  956. help="select pages, format: 1,5-7,50-N",
  957. default="1-N",
  958. )
  959. ps_gettext.add_argument(
  960. "-noligatures",
  961. action="store_true",
  962. help="expand ligature characters (default False)",
  963. default=False,
  964. )
  965. ps_gettext.add_argument(
  966. "-convert-white",
  967. action="store_true",
  968. help="convert whitespace characters to white (default False)",
  969. default=False,
  970. )
  971. ps_gettext.add_argument(
  972. "-extra-spaces",
  973. action="store_true",
  974. help="fill gaps with spaces (default False)",
  975. default=False,
  976. )
  977. ps_gettext.add_argument(
  978. "-noformfeed",
  979. action="store_true",
  980. help="write linefeeds, no formfeeds (default False)",
  981. default=False,
  982. )
  983. ps_gettext.add_argument(
  984. "-skip-empty",
  985. action="store_true",
  986. help="suppress pages with no text (default False)",
  987. default=False,
  988. )
  989. ps_gettext.add_argument(
  990. "-output",
  991. help="store text in this file (default inputfilename.txt)",
  992. )
  993. ps_gettext.add_argument(
  994. "-grid",
  995. type=float,
  996. help="merge lines if closer than this (default 2)",
  997. default=2,
  998. )
  999. ps_gettext.add_argument(
  1000. "-fontsize",
  1001. type=float,
  1002. help="only include text with a larger fontsize (default 3)",
  1003. default=3,
  1004. )
  1005. ps_gettext.set_defaults(func=gettext)
  1006. # -------------------------------------------------------------------------
  1007. # '_internal' command
  1008. # -------------------------------------------------------------------------
  1009. ps_internal = subps.add_parser(
  1010. "internal", description=mycenter("internal testing")
  1011. )
  1012. ps_internal.set_defaults(func=_internal)
  1013. # -------------------------------------------------------------------------
  1014. # start program
  1015. # -------------------------------------------------------------------------
  1016. args = parser.parse_args() # create parameter arguments class
  1017. if not hasattr(args, "func"): # no function selected
  1018. parser.print_help() # so print top level help
  1019. else:
  1020. args.func(args) # execute requested command
# Run the command line interface only when this file is executed as a
# script; importing the module must not parse arguments or run a command.
if __name__ == "__main__":
    main()