utils.py 190 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
7527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679
  1. # ------------------------------------------------------------------------
  2. # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
  3. # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
  4. #
  5. # Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a
  6. # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
  7. # maintained and developed by Artifex Software, Inc. https://artifex.com.
  8. # ------------------------------------------------------------------------
  9. import io
  10. import math
  11. import os
  12. import typing
  13. import weakref
  14. try:
  15. from . import pymupdf
  16. except Exception:
  17. import pymupdf
  18. try:
  19. from . import mupdf
  20. except Exception:
  21. import mupdf
  # Module-level shortcuts to two objects provided by the pymupdf package.
  # Note that g_exceptions_verbose is read once, at import time of this module.
  _format_g, g_exceptions_verbose = pymupdf.format_g, pymupdf.g_exceptions_verbose

  # String constants whose values equal their own names
  # (presumably used as annotation placeholders for geometry arguments -
  # TODO confirm against the rest of the file).
  point_like, rect_like, matrix_like, quad_like = (
      "point_like",
      "rect_like",
      "matrix_like",
      "quad_like",
  )
  # ByteString is gone from typing in 3.14.
  # collections.abc.Buffer available from 3.12 only
  # Look the name up once; fall back to an explicit union when it is missing.
  ByteString = getattr(typing, "ByteString", None)
  if ByteString is None:
      # pylint: disable=unsupported-binary-operation
      ByteString = bytes | bytearray | memoryview

  # Shorthand aliases for annotations: "Opt*" means "this type or None".
  AnyType = typing.Any
  OptInt = typing.Optional[int]
  OptFloat = typing.Optional[float]
  OptStr = typing.Optional[str]
  OptDict = typing.Optional[dict]
  OptBytes = typing.Optional[ByteString]
  OptSeq = typing.Optional[typing.Sequence]
  42. """
  43. This is a collection of functions to extend PyMuPDF.
  44. """
  def write_text(
      page: pymupdf.Page,
      rect=None,
      writers=None,
      overlay=True,
      color=None,
      opacity=None,
      keep_proportion=True,
      rotate=0,
      oc=0,
  ) -> None:
      """Write the text of one or more pymupdf.TextWriter objects.

      A single TextWriter with no rotation and no target rectangle is written
      directly onto the page. In every other case the writers are first
      written to a page of a temporary document which has the same width and
      height as 'page', and that temporary page is then placed on 'page' via
      'page.show_pdf_page'.

      Args:
          page: the target page.
          rect: target rectangle. If None, the union of the text writers is used.
          writers: one or more pymupdf.TextWriter objects.
          overlay: put in foreground or background.
          color: passed on to the 'write_text' method of each writer.
          opacity: passed on to the 'write_text' method of each writer.
          keep_proportion: maintain aspect ratio of rectangle sides.
          rotate: arbitrary rotation angle.
          oc: the xref of an optional content object
      Raises:
          ValueError: if 'writers' is empty / None.
      """
      assert isinstance(page, pymupdf.Page)
      if not writers:
          raise ValueError("need at least one pymupdf.TextWriter")
      # isinstance (not an exact type comparison) so that TextWriter
      # subclasses are also recognised as a single writer.
      if isinstance(writers, pymupdf.TextWriter):
          if rotate == 0 and rect is None:
              # Fast path: write directly. 'oc' must be handed over here too,
              # otherwise the optional content setting would be lost.
              writers.write_text(
                  page, opacity=opacity, color=color, overlay=overlay, oc=oc
              )
              return None
          writers = (writers,)

      # Start with the first writer's text rectangle and join all the others.
      clip = writers[0].text_rect
      textdoc = pymupdf.Document()
      tpage = textdoc.new_page(width=page.rect.width, height=page.rect.height)
      for writer in writers:
          clip |= writer.text_rect
          writer.write_text(tpage, opacity=opacity, color=color)
      if rect is None:
          rect = clip
      page.show_pdf_page(
          rect,
          textdoc,
          0,
          overlay=overlay,
          keep_proportion=keep_proportion,
          rotate=rotate,
          clip=clip,
          oc=oc,
      )
      # drop the references to the temporary document and its page
      textdoc = None
      tpage = None
  def show_pdf_page(
      page,
      rect,
      docsrc,
      pno=0,
      keep_proportion=True,
      overlay=True,
      oc=0,
      rotate=0,
      clip=None,
  ) -> int:
      """Show page number 'pno' of PDF 'docsrc' in rectangle 'rect'.

      Args:
          page: the target page.
          rect: where to place the source image. Must offer the attributes
              'is_empty' and 'is_infinite'.
          docsrc: (document) source PDF
          pno: (int) source page number. Negative values count from the end.
          keep_proportion: (bool) do not change width-height-ratio
          overlay: (bool) put in foreground
          oc: (xref) make visibility dependent on this OCG / OCMD (which must
              be defined in the target PDF)
          rotate: rotation angle in degrees (no multiple-of-90 check is made
              in this function).
          clip: (rect-like) part of source page rectangle
      Returns:
          xref of inserted object (for reuse)
      Raises:
          ValueError: if either document is no PDF, if 'rect' or the clipped
              source rectangle is empty or infinite, if a negative 'pno' is
              given for a source without pages, or if source and target are
              the same document.
      """

      def calc_matrix(sr, tr, keep=True, rotate=0):
          """Calculate transformation matrix from source to target rect.

          Notes:
              The product of four matrices in this sequence: (1) translate
              the center of the source rect to the origin, (2) rotate,
              (3) scale, (4) translate to the center of the target rect.
          Args:
              sr: source rect in PDF (!) coordinate system
              tr: target rect in PDF coordinate system
              keep: whether to keep source ratio of width to height
              rotate: rotation angle in degrees
          Returns:
              Transformation matrix, converted by pymupdf.JM_TUPLE.
          """
          # calc center point of source rect
          smp = (sr.tl + sr.br) / 2.0
          # calc center point of target rect
          tmp = (tr.tl + tr.br) / 2.0
          # m moves to (0, 0), then rotates
          m = pymupdf.Matrix(1, 0, 0, 1, -smp.x, -smp.y) * pymupdf.Matrix(rotate)
          sr1 = sr * m  # resulting source rect to calculate scale factors
          fw = tr.width / sr1.width  # scale the width
          fh = tr.height / sr1.height  # scale the height
          if keep:
              fw = fh = min(fw, fh)  # take min if keeping aspect ratio
          m *= pymupdf.Matrix(fw, fh)  # concat scale matrix
          m *= pymupdf.Matrix(1, 0, 0, 1, tmp.x, tmp.y)  # concat move to target center
          return pymupdf.JM_TUPLE(m)

      pymupdf.CheckParent(page)
      doc = page.parent
      if not doc.is_pdf or not docsrc.is_pdf:
          raise ValueError("is no PDF")
      if rect.is_empty or rect.is_infinite:
          raise ValueError("rect must be finite and not empty")

      if pno < 0:  # support negative page numbers
          page_count = docsrc.page_count
          if page_count < 1:
              # Adding a page count of zero repeatedly would never end.
              raise ValueError("source document has no pages")
          pno %= page_count  # same result as adding page_count until >= 0

      src_page = docsrc[pno]  # load source page
      tar_rect = rect * ~page.transformation_matrix  # target rect in PDF coordinates
      src_rect = src_page.rect if not clip else src_page.rect & clip  # source rect
      if src_rect.is_empty or src_rect.is_infinite:
          raise ValueError("clip must be finite and not empty")
      src_rect = src_rect * ~src_page.transformation_matrix  # ... in PDF coord
      matrix = calc_matrix(src_rect, tar_rect, keep=keep_proportion, rotate=rotate)

      # names already taken by xobjects, images and fonts of the target page
      used_names = {item[1] for item in doc.get_page_xobjects(page.number)}
      used_names.update(item[7] for item in doc.get_page_images(page.number))
      used_names.update(item[4] for item in doc.get_page_fonts(page.number))

      # create a name not in that collection
      n = "fzFrm"
      i = 0
      _imgname = n + "0"
      while _imgname in used_names:
          i += 1
          _imgname = n + str(i)

      isrc = docsrc._graft_id  # used as key for graftmaps
      if doc._graft_id == isrc:
          raise ValueError("source document must not equal target")

      # retrieve / make pymupdf.Graftmap for source PDF
      gmap = doc.Graftmaps.get(isrc, None)
      if gmap is None:
          gmap = pymupdf.Graftmap(doc)
          doc.Graftmaps[isrc] = gmap

      # take note of generated xref for automatic reuse
      pno_id = (isrc, pno)  # id of docsrc[pno]
      xref = doc.ShownPages.get(pno_id, 0)

      if overlay:
          page.wrap_contents()  # ensure a balanced graphics state
      xref = page._show_pdf_page(
          src_page,
          overlay=overlay,
          matrix=matrix,
          xref=xref,
          oc=oc,
          clip=src_rect,
          graftmap=gmap,
          _imgname=_imgname,
      )
      doc.ShownPages[pno_id] = xref
      return xref
  def replace_image(page: pymupdf.Page, xref: int, *, filename=None, pixmap=None, stream=None):
      """Replace the image referred to by xref.

      The object definition stored under 'xref' is overwritten with a new
      image. The page's appearance instructions are left intact, so the new
      image is displayed with the same bbox, rotation etc.

      By providing a small fully transparent image, an effect as if the image
      had been deleted can be achieved.

      A typical use may include replacing large images by a smaller version,
      e.g. with a lower resolution or graylevel instead of colored.

      Args:
          xref: the xref of the image to replace.
          filename, pixmap, stream: exactly one of these must be provided. The
              meaning being the same as in Page.insert_image.
      Raises:
          ValueError: if 'xref' is not an image, or if not exactly one of
              filename / stream / pixmap is given.
      """
      owner = page.parent  # the document this page belongs to
      if not owner.xref_is_image(xref):
          raise ValueError("xref not an image")

      provided = sum(1 for source in (filename, stream, pixmap) if source)
      if provided != 1:
          raise ValueError("Exactly one of filename/stream/pixmap must be given")

      # Insert the new image anywhere on the page, just to obtain an xref ...
      inserted_xref = page.insert_image(
          page.rect, filename=filename, stream=stream, pixmap=pixmap
      )
      # ... then copy the new object over the old one.
      owner.xref_copy(inserted_xref, xref)

      # The insertion has created a new /Contents source as a side effect.
      # It is the last one of the page and is set to spaces now.
      trailing_contents = page.get_contents()[-1]
      owner.update_stream(trailing_contents, b" ")

      page._image_info = None  # clear cache of extracted image information
  def delete_image(page: pymupdf.Page, xref: int):
      """Delete the image referred to by xref.

      The image is not physically removed: it is replaced by a small
      transparent Pixmap using method Page.replace_image.

      Args:
          xref: xref of the image to delete.
      """
      # A 1 x 1 gray pixmap with alpha channel; any dimension would do.
      transparent = pymupdf.Pixmap(pymupdf.csGRAY, (0, 0, 1, 1), 1)
      # Clear all sample bytes to 0x00, which makes it 100% transparent.
      transparent.clear_with()
      page.replace_image(xref, pixmap=transparent)
  235. def insert_image(
  236. page,
  237. rect,
  238. *,
  239. alpha=-1,
  240. filename=None,
  241. height=0,
  242. keep_proportion=True,
  243. mask=None,
  244. oc=0,
  245. overlay=True,
  246. pixmap=None,
  247. rotate=0,
  248. stream=None,
  249. width=0,
  250. xref=0,
  251. ):
  252. """Insert an image for display in a rectangle.
  253. Args:
  254. rect: (rect_like) position of image on the page.
  255. alpha: (int, optional) set to 0 if image has no transparency.
  256. filename: (str, Path, file object) image filename.
  257. height: (int)
  258. keep_proportion: (bool) keep width / height ratio (default).
  259. mask: (bytes, optional) image consisting of alpha values to use.
  260. oc: (int) xref of OCG or OCMD to declare as Optional Content.
  261. overlay: (bool) put in foreground (default) or background.
  262. pixmap: (pymupdf.Pixmap) use this as image.
  263. rotate: (int) rotate by 0, 90, 180 or 270 degrees.
  264. stream: (bytes) use this as image.
  265. width: (int)
  266. xref: (int) use this as image.
  267. 'page' and 'rect' are positional, all other parameters are keywords.
  268. If 'xref' is given, that image is used. Other input options are ignored.
  269. Else, exactly one of pixmap, stream or filename must be given.
  270. 'alpha=0' for non-transparent images improves performance significantly.
  271. Affects stream and filename only.
  272. Optimum transparent insertions are possible by using filename / stream in
  273. conjunction with a 'mask' image of alpha values.
  274. Returns:
  275. xref (int) of inserted image. Re-use as argument for multiple insertions.
  276. """
  277. pymupdf.CheckParent(page)
  278. doc = page.parent
  279. if not doc.is_pdf:
  280. raise ValueError("is no PDF")
  281. if xref == 0 and (bool(filename) + bool(stream) + bool(pixmap) != 1):
  282. raise ValueError("xref=0 needs exactly one of filename, pixmap, stream")
  283. if filename:
  284. if type(filename) is str:
  285. pass
  286. elif hasattr(filename, "absolute"):
  287. filename = str(filename)
  288. elif hasattr(filename, "name"):
  289. filename = filename.name
  290. else:
  291. raise ValueError("bad filename")
  292. if filename and not os.path.exists(filename):
  293. raise FileNotFoundError("No such file: '%s'" % filename)
  294. elif stream and type(stream) not in (bytes, bytearray, io.BytesIO):
  295. raise ValueError("stream must be bytes-like / BytesIO")
  296. elif pixmap and type(pixmap) is not pymupdf.Pixmap:
  297. raise ValueError("pixmap must be a pymupdf.Pixmap")
  298. if mask and not (stream or filename):
  299. raise ValueError("mask requires stream or filename")
  300. if mask and type(mask) not in (bytes, bytearray, io.BytesIO):
  301. raise ValueError("mask must be bytes-like / BytesIO")
  302. while rotate < 0:
  303. rotate += 360
  304. while rotate >= 360:
  305. rotate -= 360
  306. if rotate not in (0, 90, 180, 270):
  307. raise ValueError("bad rotate value")
  308. r = pymupdf.Rect(rect)
  309. if r.is_empty or r.is_infinite:
  310. raise ValueError("rect must be finite and not empty")
  311. clip = r * ~page.transformation_matrix
  312. # Create a unique image reference name.
  313. ilst = [i[7] for i in doc.get_page_images(page.number)]
  314. ilst += [i[1] for i in doc.get_page_xobjects(page.number)]
  315. ilst += [i[4] for i in doc.get_page_fonts(page.number)]
  316. n = "fzImg" # 'pymupdf image'
  317. i = 0
  318. _imgname = n + "0" # first name candidate
  319. while _imgname in ilst:
  320. i += 1
  321. _imgname = n + str(i) # try new name
  322. if overlay:
  323. page.wrap_contents() # ensure a balanced graphics state
  324. digests = doc.InsertedImages
  325. xref, digests = page._insert_image(
  326. filename=filename,
  327. pixmap=pixmap,
  328. stream=stream,
  329. imask=mask,
  330. clip=clip,
  331. overlay=overlay,
  332. oc=oc,
  333. xref=xref,
  334. rotate=rotate,
  335. keep_proportion=keep_proportion,
  336. width=width,
  337. height=height,
  338. alpha=alpha,
  339. _imgname=_imgname,
  340. digests=digests,
  341. )
  342. if digests is not None:
  343. doc.InsertedImages = digests
  344. return xref
  345. def search_for(
  346. page,
  347. text,
  348. *,
  349. clip=None,
  350. quads=False,
  351. flags=pymupdf.TEXT_DEHYPHENATE
  352. | pymupdf.TEXT_PRESERVE_WHITESPACE
  353. | pymupdf.TEXT_PRESERVE_LIGATURES
  354. | pymupdf.TEXT_MEDIABOX_CLIP
  355. ,
  356. textpage=None,
  357. ) -> list:
  358. """Search for a string on a page.
  359. Args:
  360. text: string to be searched for
  361. clip: restrict search to this rectangle
  362. quads: (bool) return quads instead of rectangles
  363. flags: bit switches, default: join hyphened words
  364. textpage: a pre-created pymupdf.TextPage
  365. Returns:
  366. a list of rectangles or quads, each containing one occurrence.
  367. """
  368. if clip is not None:
  369. clip = pymupdf.Rect(clip)
  370. pymupdf.CheckParent(page)
  371. tp = textpage
  372. if tp is None:
  373. tp = page.get_textpage(clip=clip, flags=flags) # create pymupdf.TextPage
  374. elif getattr(tp, "parent") != page:
  375. raise ValueError("not a textpage of this page")
  376. rlist = tp.search(text, quads=quads)
  377. if textpage is None:
  378. del tp
  379. return rlist
  380. def search_page_for(
  381. doc: pymupdf.Document,
  382. pno: int,
  383. text: str,
  384. quads: bool = False,
  385. clip: rect_like = None,
  386. flags: int = pymupdf.TEXT_DEHYPHENATE
  387. | pymupdf.TEXT_PRESERVE_LIGATURES
  388. | pymupdf.TEXT_PRESERVE_WHITESPACE
  389. | pymupdf.TEXT_MEDIABOX_CLIP
  390. ,
  391. textpage: pymupdf.TextPage = None,
  392. ) -> list:
  393. """Search for a string on a page.
  394. Args:
  395. pno: page number
  396. text: string to be searched for
  397. clip: restrict search to this rectangle
  398. quads: (bool) return quads instead of rectangles
  399. flags: bit switches, default: join hyphened words
  400. textpage: reuse a prepared textpage
  401. Returns:
  402. a list of rectangles or quads, each containing an occurrence.
  403. """
  404. return doc[pno].search_for(
  405. text,
  406. quads=quads,
  407. clip=clip,
  408. flags=flags,
  409. textpage=textpage,
  410. )
  411. def get_text_blocks(
  412. page: pymupdf.Page,
  413. clip: rect_like = None,
  414. flags: OptInt = None,
  415. textpage: pymupdf.TextPage = None,
  416. sort: bool = False,
  417. ) -> list:
  418. """Return the text blocks on a page.
  419. Notes:
  420. Lines in a block are concatenated with line breaks.
  421. Args:
  422. flags: (int) control the amount of data parsed into the textpage.
  423. Returns:
  424. A list of the blocks. Each item contains the containing rectangle
  425. coordinates, text lines, running block number and block type.
  426. """
  427. pymupdf.CheckParent(page)
  428. if flags is None:
  429. flags = pymupdf.TEXTFLAGS_BLOCKS
  430. tp = textpage
  431. if tp is None:
  432. tp = page.get_textpage(clip=clip, flags=flags)
  433. elif getattr(tp, "parent") != page:
  434. raise ValueError("not a textpage of this page")
  435. blocks = tp.extractBLOCKS()
  436. if textpage is None:
  437. del tp
  438. if sort:
  439. blocks.sort(key=lambda b: (b[3], b[0]))
  440. return blocks
  441. def get_text_words(
  442. page: pymupdf.Page,
  443. clip: rect_like = None,
  444. flags: OptInt = None,
  445. textpage: pymupdf.TextPage = None,
  446. sort: bool = False,
  447. delimiters=None,
  448. tolerance=3,
  449. ) -> list:
  450. """Return the text words as a list with the bbox for each word.
  451. Args:
  452. page: pymupdf.Page
  453. clip: (rect-like) area on page to consider
  454. flags: (int) control the amount of data parsed into the textpage.
  455. textpage: (pymupdf.TextPage) either passed-in or None.
  456. sort: (bool) sort the words in reading sequence.
  457. delimiters: (str,list) characters to use as word delimiters.
  458. tolerance: (float) consider words to be part of the same line if
  459. top or bottom coordinate are not larger than this. Relevant
  460. only if sort=True.
  461. Returns:
  462. Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).
  463. """
  464. def sort_words(words):
  465. """Sort words line-wise, forgiving small deviations."""
  466. words.sort(key=lambda w: (w[3], w[0]))
  467. nwords = [] # final word list
  468. line = [words[0]] # collects words roughly in same line
  469. lrect = pymupdf.Rect(words[0][:4]) # start the line rectangle
  470. for w in words[1:]:
  471. wrect = pymupdf.Rect(w[:4])
  472. if (
  473. abs(wrect.y0 - lrect.y0) <= tolerance
  474. or abs(wrect.y1 - lrect.y1) <= tolerance
  475. ):
  476. line.append(w)
  477. lrect |= wrect
  478. else:
  479. line.sort(key=lambda w: w[0]) # sort words in line l-t-r
  480. nwords.extend(line) # append to final words list
  481. line = [w] # start next line
  482. lrect = wrect # start next line rect
  483. line.sort(key=lambda w: w[0]) # sort words in line l-t-r
  484. nwords.extend(line) # append to final words list
  485. return nwords
  486. pymupdf.CheckParent(page)
  487. if flags is None:
  488. flags = pymupdf.TEXTFLAGS_WORDS
  489. tp = textpage
  490. if tp is None:
  491. tp = page.get_textpage(clip=clip, flags=flags)
  492. elif getattr(tp, "parent") != page:
  493. raise ValueError("not a textpage of this page")
  494. words = tp.extractWORDS(delimiters)
  495. # if textpage was given, we subselect the words in clip
  496. if textpage is not None and clip is not None:
  497. # sub-select words contained in clip
  498. clip = pymupdf.Rect(clip)
  499. words = [
  500. w for w in words if abs(clip & w[:4]) >= 0.5 * abs(pymupdf.Rect(w[:4]))
  501. ]
  502. if textpage is None:
  503. del tp
  504. if words and sort:
  505. # advanced sort if any words found
  506. words = sort_words(words)
  507. return words
  508. def get_sorted_text(
  509. page: pymupdf.Page,
  510. clip: rect_like = None,
  511. flags: OptInt = None,
  512. textpage: pymupdf.TextPage = None,
  513. tolerance=3,
  514. ) -> str:
  515. """Extract plain text avoiding unacceptable line breaks.
  516. Text contained in clip will be sorted in reading sequence. Some effort
  517. is also spent to simulate layout vertically and horizontally.
  518. Args:
  519. page: pymupdf.Page
  520. clip: (rect-like) only consider text inside
  521. flags: (int) text extraction flags
  522. textpage: pymupdf.TextPage
  523. tolerance: (float) consider words to be on the same line if their top
  524. or bottom coordinates do not differ more than this.
  525. Notes:
  526. If a TextPage is provided, all text is checked for being inside clip
  527. with at least 50% of its bbox.
  528. This allows to use some "global" TextPage in conjunction with sub-
  529. selecting words in parts of the defined TextPage rectangle.
  530. Returns:
  531. A text string in reading sequence. Left indentation of each line,
  532. inter-line and inter-word distances strive to reflect the layout.
  533. """
  534. def line_text(clip, line):
  535. """Create the string of one text line.
  536. We are trying to simulate some horizontal layout here, too.
  537. Args:
  538. clip: (pymupdf.Rect) the area from which all text is being read.
  539. line: (list) word tuples (rect, text) contained in the line
  540. Returns:
  541. Text in this line. Generated from words in 'line'. Distance from
  542. predecessor is translated to multiple spaces, thus simulating
  543. text indentations and large horizontal distances.
  544. """
  545. line.sort(key=lambda w: w[0].x0)
  546. ltext = "" # text in the line
  547. x1 = clip.x0 # end coordinate of ltext
  548. lrect = pymupdf.EMPTY_RECT() # bbox of this line
  549. for r, t in line:
  550. lrect |= r # update line bbox
  551. # convert distance to previous word to multiple spaces
  552. dist = max(
  553. int(round((r.x0 - x1) / r.width * len(t))),
  554. 0 if (x1 == clip.x0 or r.x0 <= x1) else 1,
  555. ) # number of space characters
  556. ltext += " " * dist + t # append word string
  557. x1 = r.x1 # update new end position
  558. return ltext
  559. # Extract words in correct sequence first.
  560. words = [
  561. (pymupdf.Rect(w[:4]), w[4])
  562. for w in get_text_words(
  563. page,
  564. clip=clip,
  565. flags=flags,
  566. textpage=textpage,
  567. sort=True,
  568. tolerance=tolerance,
  569. )
  570. ]
  571. if not words: # no text present
  572. return ""
  573. totalbox = pymupdf.EMPTY_RECT() # area covering all text
  574. for wr, text in words:
  575. totalbox |= wr
  576. lines = [] # list of reconstituted lines
  577. line = [words[0]] # current line
  578. lrect = words[0][0] # the line's rectangle
  579. # walk through the words
  580. for wr, text in words[1:]: # start with second word
  581. w0r, _ = line[-1] # read previous word in current line
  582. # if this word matches top or bottom of the line, append it
  583. if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance:
  584. line.append((wr, text))
  585. lrect |= wr
  586. else:
  587. # output current line and re-initialize
  588. ltext = line_text(totalbox, line)
  589. lines.append((lrect, ltext))
  590. line = [(wr, text)]
  591. lrect = wr
  592. # also append unfinished last line
  593. ltext = line_text(totalbox, line)
  594. lines.append((lrect, ltext))
  595. # sort all lines vertically
  596. lines.sort(key=lambda l: (l[0].y1))
  597. text = lines[0][1] # text of first line
  598. y1 = lines[0][0].y1 # its bottom coordinate
  599. for lrect, ltext in lines[1:]:
  600. distance = min(int(round((lrect.y0 - y1) / lrect.height)), 5)
  601. breaks = "\n" * (distance + 1)
  602. text += breaks + ltext
  603. y1 = lrect.y1
  604. # return text in clip
  605. return text
  606. def get_textbox(
  607. page: pymupdf.Page,
  608. rect: rect_like,
  609. textpage: pymupdf.TextPage = None,
  610. ) -> str:
  611. tp = textpage
  612. if tp is None:
  613. tp = page.get_textpage()
  614. elif getattr(tp, "parent") != page:
  615. raise ValueError("not a textpage of this page")
  616. rc = tp.extractTextbox(rect)
  617. if textpage is None:
  618. del tp
  619. return rc
  620. def get_text_selection(
  621. page: pymupdf.Page,
  622. p1: point_like,
  623. p2: point_like,
  624. clip: rect_like = None,
  625. textpage: pymupdf.TextPage = None,
  626. ):
  627. pymupdf.CheckParent(page)
  628. tp = textpage
  629. if tp is None:
  630. tp = page.get_textpage(clip=clip, flags=pymupdf.TEXT_DEHYPHENATE)
  631. elif getattr(tp, "parent") != page:
  632. raise ValueError("not a textpage of this page")
  633. rc = tp.extractSelection(p1, p2)
  634. if textpage is None:
  635. del tp
  636. return rc
  637. def get_textpage_ocr(
  638. page: pymupdf.Page,
  639. flags: int = 0,
  640. language: str = "eng",
  641. dpi: int = 72,
  642. full: bool = False,
  643. tessdata: str = None,
  644. ) -> pymupdf.TextPage:
  645. """Create a Textpage from combined results of normal and OCR text parsing.
  646. Args:
  647. flags: (int) control content becoming part of the result.
  648. language: (str) specify expected language(s). Default is "eng" (English).
  649. dpi: (int) resolution in dpi, default 72.
  650. full: (bool) whether to OCR the full page image, or only its images (default)
  651. """
  652. pymupdf.CheckParent(page)
  653. tessdata = pymupdf.get_tessdata(tessdata)
  654. def full_ocr(page, dpi, language, flags):
  655. zoom = dpi / 72
  656. mat = pymupdf.Matrix(zoom, zoom)
  657. pix = page.get_pixmap(matrix=mat)
  658. ocr_pdf = pymupdf.Document(
  659. "pdf",
  660. pix.pdfocr_tobytes(
  661. compress=False,
  662. language=language,
  663. tessdata=tessdata,
  664. ),
  665. )
  666. ocr_page = ocr_pdf.load_page(0)
  667. unzoom = page.rect.width / ocr_page.rect.width
  668. ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix
  669. tpage = ocr_page.get_textpage(flags=flags, matrix=ctm)
  670. ocr_pdf.close()
  671. pix = None
  672. tpage.parent = weakref.proxy(page)
  673. return tpage
  674. # if OCR for the full page, OCR its pixmap @ desired dpi
  675. if full:
  676. return full_ocr(page, dpi, language, flags)
  677. # For partial OCR, make a normal textpage, then extend it with text that
  678. # is OCRed from each image.
  679. # Because of this, we need the images flag bit set ON.
  680. tpage = page.get_textpage(flags=flags)
  681. for block in page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]:
  682. if block["type"] != 1: # only look at images
  683. continue
  684. bbox = pymupdf.Rect(block["bbox"])
  685. if bbox.width <= 3 or bbox.height <= 3: # ignore tiny stuff
  686. continue
  687. try:
  688. pix = pymupdf.Pixmap(block["image"]) # get image pixmap
  689. if pix.n - pix.alpha != 3: # we need to convert this to RGB!
  690. pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
  691. if pix.alpha: # must remove alpha channel
  692. pix = pymupdf.Pixmap(pix, 0)
  693. imgdoc = pymupdf.Document(
  694. "pdf",
  695. pix.pdfocr_tobytes(language=language, tessdata=tessdata),
  696. ) # pdf with OCRed page
  697. imgpage = imgdoc.load_page(0) # read image as a page
  698. pix = None
  699. # compute matrix to transform coordinates back to that of 'page'
  700. imgrect = imgpage.rect # page size of image PDF
  701. shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height)
  702. mat = shrink * block["transform"]
  703. imgpage.extend_textpage(tpage, flags=0, matrix=mat)
  704. imgdoc.close()
  705. except (RuntimeError, mupdf.FzErrorBase):
  706. if 0 and g_exceptions_verbose:
  707. # Don't show exception info here because it can happen in
  708. # normal operation (see test_3842b).
  709. pymupdf.exception_info()
  710. tpage = None
  711. pymupdf.message("Falling back to full page OCR")
  712. return full_ocr(page, dpi, language, flags)
  713. return tpage
  714. def get_image_info(page: pymupdf.Page, hashes: bool = False, xrefs: bool = False) -> list:
  715. """Extract image information only from a pymupdf.TextPage.
  716. Args:
  717. hashes: (bool) include MD5 hash for each image.
  718. xrefs: (bool) try to find the xref for each image. Sets hashes to true.
  719. """
  720. doc = page.parent
  721. if xrefs and doc.is_pdf:
  722. hashes = True
  723. if not doc.is_pdf:
  724. xrefs = False
  725. imginfo = getattr(page, "_image_info", None)
  726. if imginfo and not xrefs:
  727. return imginfo
  728. if not imginfo:
  729. tp = page.get_textpage(flags=pymupdf.TEXT_PRESERVE_IMAGES)
  730. imginfo = tp.extractIMGINFO(hashes=hashes)
  731. del tp
  732. if hashes:
  733. page._image_info = imginfo
  734. if not xrefs or not doc.is_pdf:
  735. return imginfo
  736. imglist = page.get_images()
  737. digests = {}
  738. for item in imglist:
  739. xref = item[0]
  740. pix = pymupdf.Pixmap(doc, xref)
  741. digests[pix.digest] = xref
  742. del pix
  743. for i in range(len(imginfo)):
  744. item = imginfo[i]
  745. xref = digests.get(item["digest"], 0)
  746. item["xref"] = xref
  747. imginfo[i] = item
  748. return imginfo
  749. def get_image_rects(page: pymupdf.Page, name, transform=False) -> list:
  750. """Return list of image positions on a page.
  751. Args:
  752. name: (str, list, int) image identification. May be reference name, an
  753. item of the page's image list or an xref.
  754. transform: (bool) whether to also return the transformation matrix.
  755. Returns:
  756. A list of pymupdf.Rect objects or tuples of (pymupdf.Rect, pymupdf.Matrix)
  757. for all image locations on the page.
  758. """
  759. if type(name) in (list, tuple):
  760. xref = name[0]
  761. elif type(name) is int:
  762. xref = name
  763. else:
  764. imglist = [i for i in page.get_images() if i[7] == name]
  765. if imglist == []:
  766. raise ValueError("bad image name")
  767. elif len(imglist) != 1:
  768. raise ValueError("multiple image names found")
  769. xref = imglist[0][0]
  770. pix = pymupdf.Pixmap(page.parent, xref) # make pixmap of the image to compute MD5
  771. digest = pix.digest
  772. del pix
  773. infos = page.get_image_info(hashes=True)
  774. if not transform:
  775. bboxes = [pymupdf.Rect(im["bbox"]) for im in infos if im["digest"] == digest]
  776. else:
  777. bboxes = [
  778. (pymupdf.Rect(im["bbox"]), pymupdf.Matrix(im["transform"]))
  779. for im in infos
  780. if im["digest"] == digest
  781. ]
  782. return bboxes
  783. def get_text(
  784. page: pymupdf.Page,
  785. option: str = "text",
  786. *,
  787. clip: rect_like = None,
  788. flags: OptInt = None,
  789. textpage: pymupdf.TextPage = None,
  790. sort: bool = False,
  791. delimiters=None,
  792. tolerance=3,
  793. ):
  794. """Extract text from a page or an annotation.
  795. This is a unifying wrapper for various methods of the pymupdf.TextPage class.
  796. Args:
  797. option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
  798. clip: (rect-like) restrict output to this area.
  799. flags: bit switches to e.g. exclude images or decompose ligatures.
  800. textpage: reuse this pymupdf.TextPage and make no new one. If specified,
  801. 'flags' and 'clip' are ignored.
  802. Returns:
  803. the output of methods get_text_words / get_text_blocks or pymupdf.TextPage
  804. methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT,
  805. extractXHTML or etractXML respectively.
  806. Default and misspelling choice is "text".
  807. """
  808. formats = {
  809. "text": pymupdf.TEXTFLAGS_TEXT,
  810. "html": pymupdf.TEXTFLAGS_HTML,
  811. "json": pymupdf.TEXTFLAGS_DICT,
  812. "rawjson": pymupdf.TEXTFLAGS_RAWDICT,
  813. "xml": pymupdf.TEXTFLAGS_XML,
  814. "xhtml": pymupdf.TEXTFLAGS_XHTML,
  815. "dict": pymupdf.TEXTFLAGS_DICT,
  816. "rawdict": pymupdf.TEXTFLAGS_RAWDICT,
  817. "words": pymupdf.TEXTFLAGS_WORDS,
  818. "blocks": pymupdf.TEXTFLAGS_BLOCKS,
  819. }
  820. option = option.lower()
  821. assert option in formats
  822. if option not in formats:
  823. option = "text"
  824. if flags is None:
  825. flags = formats[option]
  826. if option == "words":
  827. return get_text_words(
  828. page,
  829. clip=clip,
  830. flags=flags,
  831. textpage=textpage,
  832. sort=sort,
  833. delimiters=delimiters,
  834. )
  835. if option == "blocks":
  836. return get_text_blocks(
  837. page, clip=clip, flags=flags, textpage=textpage, sort=sort
  838. )
  839. if option == "text" and sort:
  840. return get_sorted_text(
  841. page,
  842. clip=clip,
  843. flags=flags,
  844. textpage=textpage,
  845. tolerance=tolerance,
  846. )
  847. pymupdf.CheckParent(page)
  848. cb = None
  849. if option in ("html", "xml", "xhtml"): # no clipping for MuPDF functions
  850. clip = page.cropbox
  851. if clip is not None:
  852. clip = pymupdf.Rect(clip)
  853. cb = None
  854. elif type(page) is pymupdf.Page:
  855. cb = page.cropbox
  856. # pymupdf.TextPage with or without images
  857. tp = textpage
  858. #pymupdf.exception_info()
  859. if tp is None:
  860. tp = page.get_textpage(clip=clip, flags=flags)
  861. elif getattr(tp, "parent") != page:
  862. raise ValueError("not a textpage of this page")
  863. #pymupdf.log( '{option=}')
  864. if option == "json":
  865. t = tp.extractJSON(cb=cb, sort=sort)
  866. elif option == "rawjson":
  867. t = tp.extractRAWJSON(cb=cb, sort=sort)
  868. elif option == "dict":
  869. t = tp.extractDICT(cb=cb, sort=sort)
  870. elif option == "rawdict":
  871. t = tp.extractRAWDICT(cb=cb, sort=sort)
  872. elif option == "html":
  873. t = tp.extractHTML()
  874. elif option == "xml":
  875. t = tp.extractXML()
  876. elif option == "xhtml":
  877. t = tp.extractXHTML()
  878. else:
  879. t = tp.extractText(sort=sort)
  880. if textpage is None:
  881. del tp
  882. return t
  883. def get_page_text(
  884. doc: pymupdf.Document,
  885. pno: int,
  886. option: str = "text",
  887. clip: rect_like = None,
  888. flags: OptInt = None,
  889. textpage: pymupdf.TextPage = None,
  890. sort: bool = False,
  891. ) -> typing.Any:
  892. """Extract a document page's text by page number.
  893. Notes:
  894. Convenience function calling page.get_text().
  895. Args:
  896. pno: page number
  897. option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
  898. Returns:
  899. output from page.TextPage().
  900. """
  901. return doc[pno].get_text(option, clip=clip, flags=flags, sort=sort)
  902. def get_pixmap(
  903. page: pymupdf.Page,
  904. *,
  905. matrix: matrix_like=pymupdf.Identity,
  906. dpi=None,
  907. colorspace: pymupdf.Colorspace=pymupdf.csRGB,
  908. clip: rect_like=None,
  909. alpha: bool=False,
  910. annots: bool=True,
  911. ) -> pymupdf.Pixmap:
  912. """Create pixmap of page.
  913. Keyword args:
  914. matrix: Matrix for transformation (default: Identity).
  915. dpi: desired dots per inch. If given, matrix is ignored.
  916. colorspace: (str/Colorspace) cmyk, rgb, gray - case ignored, default csRGB.
  917. clip: (irect-like) restrict rendering to this area.
  918. alpha: (bool) whether to include alpha channel
  919. annots: (bool) whether to also render annotations
  920. """
  921. if dpi:
  922. zoom = dpi / 72
  923. matrix = pymupdf.Matrix(zoom, zoom)
  924. if type(colorspace) is str:
  925. if colorspace.upper() == "GRAY":
  926. colorspace = pymupdf.csGRAY
  927. elif colorspace.upper() == "CMYK":
  928. colorspace = pymupdf.csCMYK
  929. else:
  930. colorspace = pymupdf.csRGB
  931. if colorspace.n not in (1, 3, 4):
  932. raise ValueError("unsupported colorspace")
  933. dl = page.get_displaylist(annots=annots)
  934. pix = dl.get_pixmap(matrix=matrix, colorspace=colorspace, alpha=alpha, clip=clip)
  935. dl = None
  936. if dpi:
  937. pix.set_dpi(dpi, dpi)
  938. return pix
  939. def get_page_pixmap(
  940. doc: pymupdf.Document,
  941. pno: int,
  942. *,
  943. matrix: matrix_like = pymupdf.Identity,
  944. dpi=None,
  945. colorspace: pymupdf.Colorspace = pymupdf.csRGB,
  946. clip: rect_like = None,
  947. alpha: bool = False,
  948. annots: bool = True,
  949. ) -> pymupdf.Pixmap:
  950. """Create pixmap of document page by page number.
  951. Notes:
  952. Convenience function calling page.get_pixmap.
  953. Args:
  954. pno: (int) page number
  955. matrix: pymupdf.Matrix for transformation (default: pymupdf.Identity).
  956. colorspace: (str,pymupdf.Colorspace) rgb, rgb, gray - case ignored, default csRGB.
  957. clip: (irect-like) restrict rendering to this area.
  958. alpha: (bool) include alpha channel
  959. annots: (bool) also render annotations
  960. """
  961. return doc[pno].get_pixmap(
  962. matrix=matrix,
  963. dpi=dpi, colorspace=colorspace,
  964. clip=clip,
  965. alpha=alpha,
  966. annots=annots
  967. )
  968. def getLinkDict(ln, document=None) -> dict:
  969. if isinstance(ln, pymupdf.Outline):
  970. dest = ln.destination(document)
  971. elif isinstance(ln, pymupdf.Link):
  972. dest = ln.dest
  973. else:
  974. assert 0, f'Unexpected {type(ln)=}.'
  975. nl = {"kind": dest.kind, "xref": 0}
  976. try:
  977. if hasattr(ln, 'rect'):
  978. nl["from"] = ln.rect
  979. except Exception:
  980. # This seems to happen quite often in PyMuPDF/tests.
  981. if g_exceptions_verbose >= 2: pymupdf.exception_info()
  982. pass
  983. pnt = pymupdf.Point(0, 0)
  984. if dest.flags & pymupdf.LINK_FLAG_L_VALID:
  985. pnt.x = dest.lt.x
  986. if dest.flags & pymupdf.LINK_FLAG_T_VALID:
  987. pnt.y = dest.lt.y
  988. if dest.kind == pymupdf.LINK_URI:
  989. nl["uri"] = dest.uri
  990. elif dest.kind == pymupdf.LINK_GOTO:
  991. nl["page"] = dest.page
  992. nl["to"] = pnt
  993. if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM:
  994. nl["zoom"] = dest.rb.x
  995. else:
  996. nl["zoom"] = 0.0
  997. elif dest.kind == pymupdf.LINK_GOTOR:
  998. nl["file"] = dest.file_spec.replace("\\", "/")
  999. nl["page"] = dest.page
  1000. if dest.page < 0:
  1001. nl["to"] = dest.dest
  1002. else:
  1003. nl["to"] = pnt
  1004. if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM:
  1005. nl["zoom"] = dest.rb.x
  1006. else:
  1007. nl["zoom"] = 0.0
  1008. elif dest.kind == pymupdf.LINK_LAUNCH:
  1009. nl["file"] = dest.file_spec.replace("\\", "/")
  1010. elif dest.kind == pymupdf.LINK_NAMED:
  1011. # The dicts should not have same key(s).
  1012. assert not (dest.named.keys() & nl.keys())
  1013. nl.update(dest.named)
  1014. if 'to' in nl:
  1015. nl['to'] = pymupdf.Point(nl['to'])
  1016. else:
  1017. nl["page"] = dest.page
  1018. return nl
  1019. def get_links(page: pymupdf.Page) -> list:
  1020. """Create a list of all links contained in a PDF page.
  1021. Notes:
  1022. see PyMuPDF ducmentation for details.
  1023. """
  1024. pymupdf.CheckParent(page)
  1025. ln = page.first_link
  1026. links = []
  1027. while ln:
  1028. nl = getLinkDict(ln, page.parent)
  1029. links.append(nl)
  1030. ln = ln.next
  1031. if links != [] and page.parent.is_pdf:
  1032. linkxrefs = [x for x in
  1033. #page.annot_xrefs()
  1034. pymupdf.JM_get_annot_xref_list2(page)
  1035. if x[1] == pymupdf.PDF_ANNOT_LINK # pylint: disable=no-member
  1036. ]
  1037. if len(linkxrefs) == len(links):
  1038. for i in range(len(linkxrefs)):
  1039. links[i]["xref"] = linkxrefs[i][0]
  1040. links[i]["id"] = linkxrefs[i][2]
  1041. return links
  1042. def get_toc(
  1043. doc: pymupdf.Document,
  1044. simple: bool = True,
  1045. ) -> list:
  1046. """Create a table of contents.
  1047. Args:
  1048. simple: a bool to control output. Returns a list, where each entry consists of outline level, title, page number and link destination (if simple = False). For details see PyMuPDF's documentation.
  1049. """
  1050. def recurse(olItem, liste, lvl):
  1051. """Recursively follow the outline item chain and record item information in a list."""
  1052. while olItem and olItem.this.m_internal:
  1053. if olItem.title:
  1054. title = olItem.title
  1055. else:
  1056. title = " "
  1057. if not olItem.is_external:
  1058. if olItem.uri:
  1059. if olItem.page == -1:
  1060. resolve = doc.resolve_link(olItem.uri)
  1061. page = resolve[0] + 1
  1062. else:
  1063. page = olItem.page + 1
  1064. else:
  1065. page = -1
  1066. else:
  1067. page = -1
  1068. if not simple:
  1069. link = getLinkDict(olItem, doc)
  1070. liste.append([lvl, title, page, link])
  1071. else:
  1072. liste.append([lvl, title, page])
  1073. if olItem.down:
  1074. liste = recurse(olItem.down, liste, lvl + 1)
  1075. olItem = olItem.next
  1076. return liste
  1077. # ensure document is open
  1078. if doc.is_closed:
  1079. raise ValueError("document closed")
  1080. doc.init_doc()
  1081. olItem = doc.outline
  1082. if not olItem:
  1083. return []
  1084. lvl = 1
  1085. liste = []
  1086. toc = recurse(olItem, liste, lvl)
  1087. if doc.is_pdf and not simple:
  1088. doc._extend_toc_items(toc)
  1089. return toc
  1090. def del_toc_item(
  1091. doc: pymupdf.Document,
  1092. idx: int,
  1093. ) -> None:
  1094. """Delete TOC / bookmark item by index."""
  1095. xref = doc.get_outline_xrefs()[idx]
  1096. doc._remove_toc_item(xref)
  1097. def set_toc_item(
  1098. doc: pymupdf.Document,
  1099. idx: int,
  1100. dest_dict: OptDict = None,
  1101. kind: OptInt = None,
  1102. pno: OptInt = None,
  1103. uri: OptStr = None,
  1104. title: OptStr = None,
  1105. to: point_like = None,
  1106. filename: OptStr = None,
  1107. zoom: float = 0,
  1108. ) -> None:
  1109. """Update TOC item by index.
  1110. It allows changing the item's title and link destination.
  1111. Args:
  1112. idx:
  1113. (int) desired index of the TOC list, as created by get_toc.
  1114. dest_dict:
  1115. (dict) destination dictionary as created by get_toc(False).
  1116. Outrules all other parameters. If None, the remaining parameters
  1117. are used to make a dest dictionary.
  1118. kind:
  1119. (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only
  1120. the title will be updated. If pymupdf.LINK_NONE, the TOC item will
  1121. be deleted.
  1122. pno:
  1123. (int) page number (1-based like in get_toc). Required if
  1124. pymupdf.LINK_GOTO.
  1125. uri:
  1126. (str) the URL, required if pymupdf.LINK_URI.
  1127. title:
  1128. (str) the new title. No change if None.
  1129. to:
  1130. (point-like) destination on the target page. If omitted, (72, 36)
  1131. will be used as target coordinates.
  1132. filename:
  1133. (str) destination filename, required for pymupdf.LINK_GOTOR and
  1134. pymupdf.LINK_LAUNCH.
  1135. name:
  1136. (str) a destination name for pymupdf.LINK_NAMED.
  1137. zoom:
  1138. (float) a zoom factor for the target location (pymupdf.LINK_GOTO).
  1139. """
  1140. xref = doc.get_outline_xrefs()[idx]
  1141. page_xref = 0
  1142. if type(dest_dict) is dict:
  1143. if dest_dict["kind"] == pymupdf.LINK_GOTO:
  1144. pno = dest_dict["page"]
  1145. page_xref = doc.page_xref(pno)
  1146. page_height = doc.page_cropbox(pno).height
  1147. to = dest_dict.get('to', pymupdf.Point(72, 36))
  1148. to.y = page_height - to.y
  1149. dest_dict["to"] = to
  1150. action = getDestStr(page_xref, dest_dict)
  1151. if not action.startswith("/A"):
  1152. raise ValueError("bad bookmark dest")
  1153. color = dest_dict.get("color")
  1154. if color:
  1155. color = list(map(float, color))
  1156. if len(color) != 3 or min(color) < 0 or max(color) > 1:
  1157. raise ValueError("bad color value")
  1158. bold = dest_dict.get("bold", False)
  1159. italic = dest_dict.get("italic", False)
  1160. flags = italic + 2 * bold
  1161. collapse = dest_dict.get("collapse")
  1162. return doc._update_toc_item(
  1163. xref,
  1164. action=action[2:],
  1165. title=title,
  1166. color=color,
  1167. flags=flags,
  1168. collapse=collapse,
  1169. )
  1170. if kind == pymupdf.LINK_NONE: # delete bookmark item
  1171. return doc.del_toc_item(idx)
  1172. if kind is None and title is None: # treat as no-op
  1173. return None
  1174. if kind is None: # only update title text
  1175. return doc._update_toc_item(xref, action=None, title=title)
  1176. if kind == pymupdf.LINK_GOTO:
  1177. if pno is None or pno not in range(1, doc.page_count + 1):
  1178. raise ValueError("bad page number")
  1179. page_xref = doc.page_xref(pno - 1)
  1180. page_height = doc.page_cropbox(pno - 1).height
  1181. if to is None:
  1182. to = pymupdf.Point(72, page_height - 36)
  1183. else:
  1184. to = pymupdf.Point(to)
  1185. to.y = page_height - to.y
  1186. ddict = {
  1187. "kind": kind,
  1188. "to": to,
  1189. "uri": uri,
  1190. "page": pno,
  1191. "file": filename,
  1192. "zoom": zoom,
  1193. }
  1194. action = getDestStr(page_xref, ddict)
  1195. if action == "" or not action.startswith("/A"):
  1196. raise ValueError("bad bookmark dest")
  1197. return doc._update_toc_item(xref, action=action[2:], title=title)
  1198. def get_area(*args) -> float:
  1199. """Calculate area of rectangle.\nparameter is one of 'px' (default), 'in', 'cm', or 'mm'."""
  1200. rect = args[0]
  1201. if len(args) > 1:
  1202. unit = args[1]
  1203. else:
  1204. unit = "px"
  1205. u = {"px": (1, 1), "in": (1.0, 72.0), "cm": (2.54, 72.0), "mm": (25.4, 72.0)}
  1206. f = (u[unit][0] / u[unit][1]) ** 2
  1207. return f * rect.width * rect.height
  1208. def set_metadata(doc: pymupdf.Document, m: dict = None) -> None:
  1209. """Update the PDF /Info object.
  1210. Args:
  1211. m: a dictionary like doc.metadata.
  1212. """
  1213. if not doc.is_pdf:
  1214. raise ValueError("is no PDF")
  1215. if doc.is_closed or doc.is_encrypted:
  1216. raise ValueError("document closed or encrypted")
  1217. if m is None:
  1218. m = {}
  1219. elif type(m) is not dict:
  1220. raise ValueError("bad metadata")
  1221. keymap = {
  1222. "author": "Author",
  1223. "producer": "Producer",
  1224. "creator": "Creator",
  1225. "title": "Title",
  1226. "format": None,
  1227. "encryption": None,
  1228. "creationDate": "CreationDate",
  1229. "modDate": "ModDate",
  1230. "subject": "Subject",
  1231. "keywords": "Keywords",
  1232. "trapped": "Trapped",
  1233. }
  1234. valid_keys = set(keymap.keys())
  1235. diff_set = set(m.keys()).difference(valid_keys)
  1236. if diff_set != set():
  1237. msg = "bad dict key(s): %s" % diff_set
  1238. raise ValueError(msg)
  1239. t, temp = doc.xref_get_key(-1, "Info")
  1240. if t != "xref":
  1241. info_xref = 0
  1242. else:
  1243. info_xref = int(temp.replace("0 R", ""))
  1244. if m == {} and info_xref == 0: # nothing to do
  1245. return
  1246. if info_xref == 0: # no prev metadata: get new xref
  1247. info_xref = doc.get_new_xref()
  1248. doc.update_object(info_xref, "<<>>") # fill it with empty object
  1249. doc.xref_set_key(-1, "Info", "%i 0 R" % info_xref)
  1250. elif m == {}: # remove existing metadata
  1251. doc.xref_set_key(-1, "Info", "null")
  1252. doc.init_doc()
  1253. return
  1254. for key, val in [(k, v) for k, v in m.items() if keymap[k] is not None]:
  1255. pdf_key = keymap[key]
  1256. if not bool(val) or val in ("none", "null"):
  1257. val = "null"
  1258. else:
  1259. val = pymupdf.get_pdf_str(val)
  1260. doc.xref_set_key(info_xref, pdf_key, val)
  1261. doc.init_doc()
  1262. return
  1263. def getDestStr(xref: int, ddict: dict) -> str:
  1264. """Calculate the PDF action string.
  1265. Notes:
  1266. Supports Link annotations and outline items (bookmarks).
  1267. """
  1268. if not ddict:
  1269. return ""
  1270. str_goto = lambda a, b, c, d: f"/A<</S/GoTo/D[{a} 0 R/XYZ {_format_g((b, c, d))}]>>"
  1271. str_gotor1 = lambda a, b, c, d, e, f: f"/A<</S/GoToR/D[{a} /XYZ {_format_g((b, c, d))}]/F<</F{e}/UF{f}/Type/Filespec>>>>"
  1272. str_gotor2 = lambda a, b, c: f"/A<</S/GoToR/D{a}/F<</F{b}/UF{c}/Type/Filespec>>>>"
  1273. str_launch = lambda a, b: f"/A<</S/Launch/F<</F{a}/UF{b}/Type/Filespec>>>>"
  1274. str_uri = lambda a: f"/A<</S/URI/URI{a}>>"
  1275. if type(ddict) in (int, float):
  1276. dest = str_goto(xref, 0, ddict, 0)
  1277. return dest
  1278. d_kind = ddict.get("kind", pymupdf.LINK_NONE)
  1279. if d_kind == pymupdf.LINK_NONE:
  1280. return ""
  1281. if ddict["kind"] == pymupdf.LINK_GOTO:
  1282. d_zoom = ddict.get("zoom", 0)
  1283. to = ddict.get("to", pymupdf.Point(0, 0))
  1284. d_left, d_top = to
  1285. dest = str_goto(xref, d_left, d_top, d_zoom)
  1286. return dest
  1287. if ddict["kind"] == pymupdf.LINK_URI:
  1288. dest = str_uri(pymupdf.get_pdf_str(ddict["uri"]),)
  1289. return dest
  1290. if ddict["kind"] == pymupdf.LINK_LAUNCH:
  1291. fspec = pymupdf.get_pdf_str(ddict["file"])
  1292. dest = str_launch(fspec, fspec)
  1293. return dest
  1294. if ddict["kind"] == pymupdf.LINK_GOTOR and ddict["page"] < 0:
  1295. fspec = pymupdf.get_pdf_str(ddict["file"])
  1296. dest = str_gotor2(pymupdf.get_pdf_str(ddict["to"]), fspec, fspec)
  1297. return dest
  1298. if ddict["kind"] == pymupdf.LINK_GOTOR and ddict["page"] >= 0:
  1299. fspec = pymupdf.get_pdf_str(ddict["file"])
  1300. dest = str_gotor1(
  1301. ddict["page"],
  1302. ddict["to"].x,
  1303. ddict["to"].y,
  1304. ddict["zoom"],
  1305. fspec,
  1306. fspec,
  1307. )
  1308. return dest
  1309. return ""
  1310. def set_toc(
  1311. doc: pymupdf.Document,
  1312. toc: list,
  1313. collapse: int = 1,
  1314. ) -> int:
  1315. """Create new outline tree (table of contents, TOC).
  1316. Args:
  1317. toc: (list, tuple) each entry must contain level, title, page and
  1318. optionally top margin on the page. None or '()' remove the TOC.
  1319. collapse: (int) collapses entries beyond this level. Zero or None
  1320. shows all entries unfolded.
  1321. Returns:
  1322. the number of inserted items, or the number of removed items respectively.
  1323. """
  1324. if doc.is_closed or doc.is_encrypted:
  1325. raise ValueError("document closed or encrypted")
  1326. if not doc.is_pdf:
  1327. raise ValueError("is no PDF")
  1328. if not toc: # remove all entries
  1329. return len(doc._delToC())
  1330. # validity checks --------------------------------------------------------
  1331. if type(toc) not in (list, tuple):
  1332. raise ValueError("'toc' must be list or tuple")
  1333. toclen = len(toc)
  1334. page_count = doc.page_count
  1335. t0 = toc[0]
  1336. if type(t0) not in (list, tuple):
  1337. raise ValueError("items must be sequences of 3 or 4 items")
  1338. if t0[0] != 1:
  1339. raise ValueError("hierarchy level of item 0 must be 1")
  1340. for i in list(range(toclen - 1)):
  1341. t1 = toc[i]
  1342. t2 = toc[i + 1]
  1343. if not -1 <= t1[2] <= page_count:
  1344. raise ValueError("row %i: page number out of range" % i)
  1345. if (type(t2) not in (list, tuple)) or len(t2) not in (3, 4):
  1346. raise ValueError("bad row %i" % (i + 1))
  1347. if (type(t2[0]) is not int) or t2[0] < 1:
  1348. raise ValueError("bad hierarchy level in row %i" % (i + 1))
  1349. if t2[0] > t1[0] + 1:
  1350. raise ValueError("bad hierarchy level in row %i" % (i + 1))
  1351. # no formal errors in toc --------------------------------------------------
  1352. # --------------------------------------------------------------------------
  1353. # make a list of xref numbers, which we can use for our TOC entries
  1354. # --------------------------------------------------------------------------
  1355. old_xrefs = doc._delToC() # del old outlines, get their xref numbers
  1356. # prepare table of xrefs for new bookmarks
  1357. old_xrefs = []
  1358. xref = [0] + old_xrefs
  1359. xref[0] = doc._getOLRootNumber() # entry zero is outline root xref number
  1360. if toclen > len(old_xrefs): # too few old xrefs?
  1361. for i in range((toclen - len(old_xrefs))):
  1362. xref.append(doc.get_new_xref()) # acquire new ones
  1363. lvltab = {0: 0} # to store last entry per hierarchy level
  1364. # ------------------------------------------------------------------------------
  1365. # contains new outline objects as strings - first one is the outline root
  1366. # ------------------------------------------------------------------------------
  1367. olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}]
  1368. # ------------------------------------------------------------------------------
  1369. # build olitems as a list of PDF-like connected dictionaries
  1370. # ------------------------------------------------------------------------------
  1371. for i in range(toclen):
  1372. o = toc[i]
  1373. lvl = o[0] # level
  1374. title = pymupdf.get_pdf_str(o[1]) # title
  1375. pno = min(doc.page_count - 1, max(0, o[2] - 1)) # page number
  1376. page_xref = doc.page_xref(pno)
  1377. page_height = doc.page_cropbox(pno).height
  1378. top = pymupdf.Point(72, page_height - 36)
  1379. dest_dict = {"to": top, "kind": pymupdf.LINK_GOTO} # fall back target
  1380. if o[2] < 0:
  1381. dest_dict["kind"] = pymupdf.LINK_NONE
  1382. if len(o) > 3: # some target is specified
  1383. if type(o[3]) in (int, float): # convert a number to a point
  1384. dest_dict["to"] = pymupdf.Point(72, page_height - o[3])
  1385. else: # if something else, make sure we have a dict
  1386. # We make a copy of o[3] to avoid modifying our caller's data.
  1387. dest_dict = o[3].copy() if type(o[3]) is dict else dest_dict
  1388. if "to" not in dest_dict: # target point not in dict?
  1389. dest_dict["to"] = top # put default in
  1390. else: # transform target to PDF coordinates
  1391. page = doc[pno]
  1392. point = pymupdf.Point(dest_dict["to"])
  1393. point.y = page.cropbox.height - point.y
  1394. point = point * page.rotation_matrix
  1395. dest_dict["to"] = (point.x, point.y)
  1396. d = {}
  1397. d["first"] = -1
  1398. d["count"] = 0
  1399. d["last"] = -1
  1400. d["prev"] = -1
  1401. d["next"] = -1
  1402. d["dest"] = getDestStr(page_xref, dest_dict)
  1403. d["top"] = dest_dict["to"]
  1404. d["title"] = title
  1405. d["parent"] = lvltab[lvl - 1]
  1406. d["xref"] = xref[i + 1]
  1407. d["color"] = dest_dict.get("color")
  1408. d["flags"] = dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0)
  1409. lvltab[lvl] = i + 1
  1410. parent = olitems[lvltab[lvl - 1]] # the parent entry
  1411. if (
  1412. dest_dict.get("collapse") or collapse and lvl > collapse
  1413. ): # suppress expansion
  1414. parent["count"] -= 1 # make /Count negative
  1415. else:
  1416. parent["count"] += 1 # positive /Count
  1417. if parent["first"] == -1:
  1418. parent["first"] = i + 1
  1419. parent["last"] = i + 1
  1420. else:
  1421. d["prev"] = parent["last"]
  1422. prev = olitems[parent["last"]]
  1423. prev["next"] = i + 1
  1424. parent["last"] = i + 1
  1425. olitems.append(d)
  1426. # ------------------------------------------------------------------------------
  1427. # now create each outline item as a string and insert it in the PDF
  1428. # ------------------------------------------------------------------------------
  1429. for i, ol in enumerate(olitems):
  1430. txt = "<<"
  1431. if ol["count"] != 0:
  1432. txt += "/Count %i" % ol["count"]
  1433. try:
  1434. txt += ol["dest"]
  1435. except Exception:
  1436. # Verbose in PyMuPDF/tests.
  1437. if g_exceptions_verbose >= 2: pymupdf.exception_info()
  1438. pass
  1439. try:
  1440. if ol["first"] > -1:
  1441. txt += "/First %i 0 R" % xref[ol["first"]]
  1442. except Exception:
  1443. if g_exceptions_verbose >= 2: pymupdf.exception_info()
  1444. pass
  1445. try:
  1446. if ol["last"] > -1:
  1447. txt += "/Last %i 0 R" % xref[ol["last"]]
  1448. except Exception:
  1449. if g_exceptions_verbose >= 2: pymupdf.exception_info()
  1450. pass
  1451. try:
  1452. if ol["next"] > -1:
  1453. txt += "/Next %i 0 R" % xref[ol["next"]]
  1454. except Exception:
  1455. # Verbose in PyMuPDF/tests.
  1456. if g_exceptions_verbose >= 2: pymupdf.exception_info()
  1457. pass
  1458. try:
  1459. if ol["parent"] > -1:
  1460. txt += "/Parent %i 0 R" % xref[ol["parent"]]
  1461. except Exception:
  1462. # Verbose in PyMuPDF/tests.
  1463. if g_exceptions_verbose >= 2: pymupdf.exception_info()
  1464. pass
  1465. try:
  1466. if ol["prev"] > -1:
  1467. txt += "/Prev %i 0 R" % xref[ol["prev"]]
  1468. except Exception:
  1469. # Verbose in PyMuPDF/tests.
  1470. if g_exceptions_verbose >= 2: pymupdf.exception_info()
  1471. pass
  1472. try:
  1473. txt += "/Title" + ol["title"]
  1474. except Exception:
  1475. # Verbose in PyMuPDF/tests.
  1476. if g_exceptions_verbose >= 2: pymupdf.exception_info()
  1477. pass
  1478. if ol.get("color") and len(ol["color"]) == 3:
  1479. txt += f"/C[ {_format_g(tuple(ol['color']))}]"
  1480. if ol.get("flags", 0) > 0:
  1481. txt += "/F %i" % ol["flags"]
  1482. if i == 0: # special: this is the outline root
  1483. txt += "/Type/Outlines" # so add the /Type entry
  1484. txt += ">>"
  1485. doc.update_object(xref[i], txt) # insert the PDF object
  1486. doc.init_doc()
  1487. return toclen
def do_widgets(
    tar: pymupdf.Document,
    src: pymupdf.Document,
    graftmap,
    from_page: int = -1,
    to_page: int = -1,
    start_at: int = -1,
    join_duplicates=0,
) -> None:
    """Insert widgets of copied page range into target PDF.

    Parameter values **must** equal those of method insert_pdf() which
    must have been previously executed.

    Args:
        tar: target PDF, which already contains the copied pages.
        src: source PDF. Nothing is done if it has no form fields.
            Note that the "P" entry is deleted from the source widgets.
        graftmap: graft map object passed to pdf_graft_mapped_object().
        from_page: first source page number. If larger than 'to_page', the
            pages are processed in descending order.
        to_page: last source page number.
        start_at: number of the target page corresponding to 'from_page'.
        join_duplicates: if true, fields with equal names are joined under
            one parent; otherwise the name of the second one is made unique.
    """
    if not src.is_form_pdf:  # nothing to do: source PDF has no fields
        return

    def clean_kid_parents(acro_fields):
        """Make sure all kids have correct "Parent" pointers."""
        for i in range(acro_fields.pdf_array_len()):
            parent = acro_fields.pdf_array_get(i)
            kids = parent.pdf_dict_get(pymupdf.PDF_NAME("Kids"))
            for j in range(kids.pdf_array_len()):
                kid = kids.pdf_array_get(j)
                kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), parent)

    def join_widgets(pdf, acro_fields, xref1, xref2, name):
        """Called for each pair of widgets having the same name.

        Args:
            pdf: target MuPDF document
            acro_fields: object Root/AcroForm/Fields
            xref1, xref2: widget xrefs having same names
            name: (str) the name

        Result:
            Defined or updated widget parent that points to both widgets.
        """

        def re_target(pdf, acro_fields, xref1, kids1, xref2, kids2):
            """Merge widget in xref2 into "Kids" list of widget xref1.

            Args:
                xref1, kids1: target widget and its "Kids" array.
                xref2, kids2: source widget and its "Kids" array (may be empty).
            """
            # make indirect objects from widgets
            w1_ind = mupdf.pdf_new_indirect(pdf, xref1, 0)
            w2_ind = mupdf.pdf_new_indirect(pdf, xref2, 0)
            # find source widget in "Fields" array
            idx = acro_fields.pdf_array_find(w2_ind)
            acro_fields.pdf_array_delete(idx)

            if not kids2.pdf_is_array():  # source widget has no kids
                widget = mupdf.pdf_load_object(pdf, xref2)

                # delete name from widget and insert target as parent
                widget.pdf_dict_del(pymupdf.PDF_NAME("T"))
                widget.pdf_dict_put(pymupdf.PDF_NAME("Parent"), w1_ind)

                # put in target Kids
                kids1.pdf_array_push(w2_ind)
            else:  # copy source kids to target kids
                for i in range(kids2.pdf_array_len()):
                    kid = kids2.pdf_array_get(i)
                    kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), w1_ind)
                    kid_ind = mupdf.pdf_new_indirect(pdf, kid.pdf_to_num(), 0)
                    kids1.pdf_array_push(kid_ind)

        def new_target(pdf, acro_fields, xref1, w1, xref2, w2, name):
            """Make new "Parent" for two widgets with same name.

            Args:
                xref1, w1: first widget
                xref2, w2: second widget
                name: field name

            Result:
                Both widgets have no "Kids". We create a new object with the
                name and a "Kids" array containing the widgets.
                Original widgets must be removed from AcroForm/Fields.
            """
            # make new "Parent" object
            new = mupdf.pdf_new_dict(pdf, 5)
            new.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), name)
            kids = new.pdf_dict_put_array(pymupdf.PDF_NAME("Kids"), 2)
            new_obj = mupdf.pdf_add_object(pdf, new)
            new_obj_xref = new_obj.pdf_to_num()
            new_ind = mupdf.pdf_new_indirect(pdf, new_obj_xref, 0)

            # copy over some required source widget properties:
            # "FT" and "AA" are moved from the first widget to the new parent
            ft = w1.pdf_dict_get(pymupdf.PDF_NAME("FT"))
            w1.pdf_dict_del(pymupdf.PDF_NAME("FT"))
            new_obj.pdf_dict_put(pymupdf.PDF_NAME("FT"), ft)
            aa = w1.pdf_dict_get(pymupdf.PDF_NAME("AA"))
            w1.pdf_dict_del(pymupdf.PDF_NAME("AA"))
            new_obj.pdf_dict_put(pymupdf.PDF_NAME("AA"), aa)

            # remove name field, insert "Parent" field in source widgets
            w1.pdf_dict_del(pymupdf.PDF_NAME("T"))
            w1.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind)
            w2.pdf_dict_del(pymupdf.PDF_NAME("T"))
            w2.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind)

            # put source widgets in "kids" array
            ind1 = mupdf.pdf_new_indirect(pdf, xref1, 0)
            ind2 = mupdf.pdf_new_indirect(pdf, xref2, 0)
            kids.pdf_array_push(ind1)
            kids.pdf_array_push(ind2)

            # remove source widgets from "AcroForm/Fields"
            idx = acro_fields.pdf_array_find(ind1)
            acro_fields.pdf_array_delete(idx)
            idx = acro_fields.pdf_array_find(ind2)
            acro_fields.pdf_array_delete(idx)

            # the new parent replaces them in "AcroForm/Fields"
            acro_fields.pdf_array_push(new_ind)

        w1 = mupdf.pdf_load_object(pdf, xref1)
        w2 = mupdf.pdf_load_object(pdf, xref2)
        kids1 = w1.pdf_dict_get(pymupdf.PDF_NAME("Kids"))
        kids2 = w2.pdf_dict_get(pymupdf.PDF_NAME("Kids"))

        # check which widget has a suitable "Kids" array:
        # the one having it becomes the merge target, else a new parent is made
        if kids1.pdf_is_array():
            re_target(pdf, acro_fields, xref1, kids1, xref2, kids2)  # pylint: disable=arguments-out-of-order
        elif kids2.pdf_is_array():
            re_target(pdf, acro_fields, xref2, kids2, xref1, kids1)  # pylint: disable=arguments-out-of-order
        else:
            new_target(pdf, acro_fields, xref1, w1, xref2, w2, name)  # pylint: disable=arguments-out-of-order

    def get_kids(parent, kids_list):
        """Return xref list of leaf kids for a parent.

        Call with an empty list.
        """
        kids = mupdf.pdf_dict_get(parent, pymupdf.PDF_NAME("Kids"))
        if not kids.pdf_is_array():
            return kids_list
        for i in range(kids.pdf_array_len()):
            kid = kids.pdf_array_get(i)
            # NOTE(review): everywhere else in this function "Kids" is tested
            # with pdf_is_array(); here pdf_is_dict() decides whether to
            # recurse - confirm that this is intended.
            if mupdf.pdf_is_dict(mupdf.pdf_dict_get(kid, pymupdf.PDF_NAME("Kids"))):
                kids_list = get_kids(kid, kids_list)
            else:
                kids_list.append(kid.pdf_to_num())
        return kids_list

    def kids_xrefs(widget):
        """Get the xref of the widget's "Parent" and the list of its leaf widgets.

        Returns (0, []) if the widget has no "Parent".
        """
        kids_list = []
        parent = mupdf.pdf_dict_get(widget, pymupdf.PDF_NAME("Parent"))
        parent_xref = parent.pdf_to_num()
        if parent_xref == 0:
            return parent_xref, kids_list
        kids_list = get_kids(parent, kids_list)
        return parent_xref, kids_list

    def deduplicate_names(pdf, acro_fields, join_duplicates=False):
        """Handle any widget name duplicates caused by the merge."""
        names = {}  # key is a widget name, value a list of widgets having it.

        # extract all names and widgets in "AcroForm/Fields"
        for i in range(mupdf.pdf_array_len(acro_fields)):
            wobject = mupdf.pdf_array_get(acro_fields, i)
            xref = wobject.pdf_to_num()

            # extract widget name and collect widget(s) using it
            T = mupdf.pdf_dict_get_text_string(wobject, pymupdf.PDF_NAME("T"))
            xrefs = names.get(T, [])
            xrefs.append(xref)
            names[T] = xrefs

        for name, xrefs in names.items():
            if len(xrefs) < 2:
                continue
            # only the first two xrefs of a name are handled
            xref0, xref1 = xrefs[:2]  # only exactly 2 should occur!
            if join_duplicates:  # combine fields with equal names
                join_widgets(pdf, acro_fields, xref0, xref1, name)
            else:  # make field names unique
                newname = name + f" [{xref1}]"  # append this to the name
                wobject = mupdf.pdf_load_object(pdf, xref1)
                wobject.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), newname)

        clean_kid_parents(acro_fields)

    def get_acroform(doc):
        """Retrieve the AcroForm dictionary from a PDF."""
        pdf = mupdf.pdf_document_from_fz_document(doc)
        # AcroForm (= central form field info)
        return mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), "Root/AcroForm")

    tarpdf = mupdf.pdf_document_from_fz_document(tar)
    srcpdf = mupdf.pdf_document_from_fz_document(src)

    if tar.is_form_pdf:
        # target is a Form PDF, so use it to include source fields
        acro = get_acroform(tar)
        # Important arrays in AcroForm
        acro_fields = acro.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
        tar_co = acro.pdf_dict_get(pymupdf.PDF_NAME("CO"))
        if not tar_co.pdf_is_array():  # no "CO" array yet: create one
            tar_co = acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)
    else:
        # target is no Form PDF, so copy over source AcroForm
        acro = mupdf.pdf_deep_copy_obj(get_acroform(src))  # make a copy

        # Clear "Fields" and "CO" arrays: will be populated by page fields.
        # This is required to avoid copying unneeded objects.
        acro.pdf_dict_del(pymupdf.PDF_NAME("Fields"))
        acro.pdf_dict_put_array(pymupdf.PDF_NAME("Fields"), 5)
        acro.pdf_dict_del(pymupdf.PDF_NAME("CO"))
        acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)

        # Enrich AcroForm for copying to target
        acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro)

        # Insert AcroForm into target PDF
        acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft)
        acro_fields = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
        tar_co = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("CO"))

        # get its xref and insert it into target catalog
        tar_xref = acro_tar.pdf_to_num()
        acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
        root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), pymupdf.PDF_NAME("Root"))
        root.pdf_dict_put(pymupdf.PDF_NAME("AcroForm"), acro_tar_ind)

    # source page numbers in copy order (descending if from_page > to_page)
    if from_page <= to_page:
        src_range = range(from_page, to_page + 1)
    else:
        src_range = range(from_page, to_page - 1, -1)

    # information about widget parents, keyed by the parent xref in the source
    parents = {}

    # remove "P" owning page reference from all widgets of all source pages
    for i in src_range:
        src_page = src[i]
        for xref in [
            xref
            for xref, wtype, _ in src_page.annot_xrefs()
            if wtype == pymupdf.PDF_ANNOT_WIDGET  # pylint: disable=no-member
        ]:
            w_obj = mupdf.pdf_load_object(srcpdf, xref)
            w_obj.pdf_dict_del(pymupdf.PDF_NAME("P"))

            # get the widget's parent structure
            parent_xref, old_kids = kids_xrefs(w_obj)
            if parent_xref:
                parents[parent_xref] = {
                    "new_xref": 0,  # xref of the parent copy in the target
                    "old_kids": old_kids,  # leaf kid xrefs in the source
                    "new_kids": [],  # leaf kid xrefs in the target
                }

    # Copy over Parent widgets first - they are not page-dependent
    for xref in parents.keys():  # pylint: disable=consider-using-dict-items
        parent = mupdf.pdf_load_object(srcpdf, xref)
        parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent)
        parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft)
        kids_xrefs_new = get_kids(parent_tar, [])
        parent_xref_new = parent_tar.pdf_to_num()
        parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 0)
        acro_fields.pdf_array_push(parent_ind)
        parents[xref]["new_xref"] = parent_xref_new
        parents[xref]["new_kids"] = kids_xrefs_new

    for i in range(len(src_range)):
        # read the target page corresponding to the i-th copied source page
        tar_page = tar[start_at + i]

        # read the original page in the source PDF
        src_page = src[src_range[i]]

        # now walk through source page widgets and copy over
        w_xrefs = [  # widget xrefs of the source page
            xref
            for xref, wtype, _ in src_page.annot_xrefs()
            if wtype == pymupdf.PDF_ANNOT_WIDGET  # pylint: disable=no-member
        ]
        if not w_xrefs:  # no widgets on this source page
            continue

        # convert to formal PDF page
        tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page)

        # extract annotations array (create it if the page has none)
        tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"))
        if not mupdf.pdf_is_array(tar_annots):
            tar_annots = mupdf.pdf_dict_put_array(
                tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"), 5
            )

        for xref in w_xrefs:
            w_obj = mupdf.pdf_load_object(srcpdf, xref)

            # check if field takes part in inter-field validations
            is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C"))

            # check if parent of widget already in target
            parent_xref = mupdf.pdf_to_num(
                w_obj.pdf_dict_get(pymupdf.PDF_NAME("Parent"))
            )
            if parent_xref == 0:  # widget has no parent: copy the widget itself
                try:
                    w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj)
                except Exception as e:
                    # best effort: warn and skip a widget that cannot be copied
                    pymupdf.message_warning(f"cannot copy widget at {xref=}: {e}")
                    continue
                w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft)
                tar_xref = w_obj_tar.pdf_to_num()
                w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
                mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)
                mupdf.pdf_array_push(acro_fields, w_obj_tar_ind)
            else:
                # The parent was copied above: find the target xref at the
                # same position in "new_kids" as the widget has in "old_kids".
                parent = parents[parent_xref]
                idx = parent["old_kids"].index(xref)  # search for xref in parent
                tar_xref = parent["new_kids"][idx]
                w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
                mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)

            # Into "AcroForm/CO" if a computation field.
            if is_aac:
                mupdf.pdf_array_push(tar_co, w_obj_tar_ind)

    deduplicate_names(tarpdf, acro_fields, join_duplicates=join_duplicates)
  1763. def do_links(
  1764. doc1: pymupdf.Document,
  1765. doc2: pymupdf.Document,
  1766. from_page: int = -1,
  1767. to_page: int = -1,
  1768. start_at: int = -1,
  1769. ) -> None:
  1770. """Insert links contained in copied page range into destination PDF.
  1771. Parameter values **must** equal those of method insert_pdf(), which must
  1772. have been previously executed.
  1773. """
  1774. #pymupdf.log( 'utils.do_links()')
  1775. # --------------------------------------------------------------------------
  1776. # internal function to create the actual "/Annots" object string
  1777. # --------------------------------------------------------------------------
  1778. def cre_annot(lnk, xref_dst, pno_src, ctm):
  1779. """Create annotation object string for a passed-in link."""
  1780. r = lnk["from"] * ctm # rect in PDF coordinates
  1781. rect = _format_g(tuple(r))
  1782. if lnk["kind"] == pymupdf.LINK_GOTO:
  1783. txt = pymupdf.annot_skel["goto1"] # annot_goto
  1784. idx = pno_src.index(lnk["page"])
  1785. p = lnk["to"] * ctm # target point in PDF coordinates
  1786. annot = txt(xref_dst[idx], p.x, p.y, lnk["zoom"], rect)
  1787. elif lnk["kind"] == pymupdf.LINK_GOTOR:
  1788. if lnk["page"] >= 0:
  1789. txt = pymupdf.annot_skel["gotor1"] # annot_gotor
  1790. pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point
  1791. if type(pnt) is not pymupdf.Point:
  1792. pnt = pymupdf.Point(0, 0)
  1793. annot = txt(
  1794. lnk["page"],
  1795. pnt.x,
  1796. pnt.y,
  1797. lnk["zoom"],
  1798. lnk["file"],
  1799. lnk["file"],
  1800. rect,
  1801. )
  1802. else:
  1803. txt = pymupdf.annot_skel["gotor2"] # annot_gotor_n
  1804. to = pymupdf.get_pdf_str(lnk["to"])
  1805. to = to[1:-1]
  1806. f = lnk["file"]
  1807. annot = txt(to, f, rect)
  1808. elif lnk["kind"] == pymupdf.LINK_LAUNCH:
  1809. txt = pymupdf.annot_skel["launch"] # annot_launch
  1810. annot = txt(lnk["file"], lnk["file"], rect)
  1811. elif lnk["kind"] == pymupdf.LINK_URI:
  1812. txt = pymupdf.annot_skel["uri"] # annot_uri
  1813. annot = txt(lnk["uri"], rect)
  1814. else:
  1815. annot = ""
  1816. return annot
  1817. # --------------------------------------------------------------------------
  1818. # validate & normalize parameters
  1819. if from_page < 0:
  1820. fp = 0
  1821. elif from_page >= doc2.page_count:
  1822. fp = doc2.page_count - 1
  1823. else:
  1824. fp = from_page
  1825. if to_page < 0 or to_page >= doc2.page_count:
  1826. tp = doc2.page_count - 1
  1827. else:
  1828. tp = to_page
  1829. if start_at < 0:
  1830. raise ValueError("'start_at' must be >= 0")
  1831. sa = start_at
  1832. incr = 1 if fp <= tp else -1 # page range could be reversed
  1833. # lists of source / destination page numbers
  1834. pno_src = list(range(fp, tp + incr, incr))
  1835. pno_dst = [sa + i for i in range(len(pno_src))]
  1836. # lists of source / destination page xrefs
  1837. xref_src = []
  1838. xref_dst = []
  1839. for i in range(len(pno_src)):
  1840. p_src = pno_src[i]
  1841. p_dst = pno_dst[i]
  1842. old_xref = doc2.page_xref(p_src)
  1843. new_xref = doc1.page_xref(p_dst)
  1844. xref_src.append(old_xref)
  1845. xref_dst.append(new_xref)
  1846. # create the links for each copied page in destination PDF
  1847. for i in range(len(xref_src)):
  1848. page_src = doc2[pno_src[i]] # load source page
  1849. links = page_src.get_links() # get all its links
  1850. #pymupdf.log( '{pno_src=}')
  1851. #pymupdf.log( '{type(page_src)=}')
  1852. #pymupdf.log( '{page_src=}')
  1853. #pymupdf.log( '{=i len(links)}')
  1854. if len(links) == 0: # no links there
  1855. page_src = None
  1856. continue
  1857. ctm = ~page_src.transformation_matrix # calc page transformation matrix
  1858. page_dst = doc1[pno_dst[i]] # load destination page
  1859. link_tab = [] # store all link definitions here
  1860. for l in links:
  1861. if l["kind"] == pymupdf.LINK_GOTO and (l["page"] not in pno_src):
  1862. continue # GOTO link target not in copied pages
  1863. annot_text = cre_annot(l, xref_dst, pno_src, ctm)
  1864. if annot_text:
  1865. link_tab.append(annot_text)
  1866. if link_tab != []:
  1867. page_dst._addAnnot_FromString( tuple(link_tab))
  1868. #pymupdf.log( 'utils.do_links() returning.')
  1869. def getLinkText(page: pymupdf.Page, lnk: dict) -> str:
  1870. # --------------------------------------------------------------------------
  1871. # define skeletons for /Annots object texts
  1872. # --------------------------------------------------------------------------
  1873. ctm = page.transformation_matrix
  1874. ictm = ~ctm
  1875. r = lnk["from"]
  1876. rect = _format_g(tuple(r * ictm))
  1877. annot = ""
  1878. if lnk["kind"] == pymupdf.LINK_GOTO:
  1879. if lnk["page"] >= 0:
  1880. txt = pymupdf.annot_skel["goto1"] # annot_goto
  1881. pno = lnk["page"]
  1882. xref = page.parent.page_xref(pno)
  1883. pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point
  1884. dest_page = page.parent[pno]
  1885. dest_ctm = dest_page.transformation_matrix
  1886. dest_ictm = ~dest_ctm
  1887. ipnt = pnt * dest_ictm
  1888. annot = txt(xref, ipnt.x, ipnt.y, lnk.get("zoom", 0), rect)
  1889. else:
  1890. txt = pymupdf.annot_skel["goto2"] # annot_goto_n
  1891. annot = txt(pymupdf.get_pdf_str(lnk["to"]), rect)
  1892. elif lnk["kind"] == pymupdf.LINK_GOTOR:
  1893. if lnk["page"] >= 0:
  1894. txt = pymupdf.annot_skel["gotor1"] # annot_gotor
  1895. pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point
  1896. if type(pnt) is not pymupdf.Point:
  1897. pnt = pymupdf.Point(0, 0)
  1898. annot = txt(
  1899. lnk["page"],
  1900. pnt.x,
  1901. pnt.y,
  1902. lnk.get("zoom", 0),
  1903. lnk["file"],
  1904. lnk["file"],
  1905. rect,
  1906. )
  1907. else:
  1908. txt = pymupdf.annot_skel["gotor2"] # annot_gotor_n
  1909. annot = txt(pymupdf.get_pdf_str(lnk["to"]), lnk["file"], rect)
  1910. elif lnk["kind"] == pymupdf.LINK_LAUNCH:
  1911. txt = pymupdf.annot_skel["launch"] # annot_launch
  1912. annot = txt(lnk["file"], lnk["file"], rect)
  1913. elif lnk["kind"] == pymupdf.LINK_URI:
  1914. txt = pymupdf.annot_skel["uri"] # txt = annot_uri
  1915. annot = txt(lnk["uri"], rect)
  1916. elif lnk["kind"] == pymupdf.LINK_NAMED:
  1917. txt = pymupdf.annot_skel["named"] # annot_named
  1918. lname = lnk.get("name") # check presence of key
  1919. if lname is None: # if missing, fall back to alternative
  1920. lname = lnk["nameddest"]
  1921. annot = txt(lname, rect)
  1922. if not annot:
  1923. return annot
  1924. # add a /NM PDF key to the object definition
  1925. link_names = dict( # existing ids and their xref
  1926. [(x[0], x[2]) for x in page.annot_xrefs() if x[1] == pymupdf.PDF_ANNOT_LINK] # pylint: disable=no-member
  1927. )
  1928. old_name = lnk.get("id", "") # id value in the argument
  1929. if old_name and (lnk["xref"], old_name) in link_names.items():
  1930. name = old_name # no new name if this is an update only
  1931. else:
  1932. i = 0
  1933. stem = pymupdf.TOOLS.set_annot_stem() + "-L%i"
  1934. while True:
  1935. name = stem % i
  1936. if name not in link_names.values():
  1937. break
  1938. i += 1
  1939. # add /NM key to object definition
  1940. annot = annot.replace("/Link", "/Link/NM(%s)" % name)
  1941. return annot
  1942. def delete_widget(page: pymupdf.Page, widget: pymupdf.Widget) -> pymupdf.Widget:
  1943. """Delete widget from page and return the next one."""
  1944. pymupdf.CheckParent(page)
  1945. annot = getattr(widget, "_annot", None)
  1946. if annot is None:
  1947. raise ValueError("bad type: widget")
  1948. nextwidget = widget.next
  1949. page.delete_annot(annot)
  1950. widget._annot.parent = None
  1951. keylist = list(widget.__dict__.keys())
  1952. for key in keylist:
  1953. del widget.__dict__[key]
  1954. return nextwidget
  1955. def update_link(page: pymupdf.Page, lnk: dict) -> None:
  1956. """Update a link on the current page."""
  1957. pymupdf.CheckParent(page)
  1958. annot = getLinkText(page, lnk)
  1959. if annot == "":
  1960. raise ValueError("link kind not supported")
  1961. page.parent.update_object(lnk["xref"], annot, page=page)
  1962. def insert_link(page: pymupdf.Page, lnk: dict, mark: bool = True) -> None:
  1963. """Insert a new link for the current page."""
  1964. pymupdf.CheckParent(page)
  1965. annot = getLinkText(page, lnk)
  1966. if annot == "":
  1967. raise ValueError("link kind not supported")
  1968. page._addAnnot_FromString((annot,))
  1969. def insert_textbox(
  1970. page: pymupdf.Page,
  1971. rect: rect_like,
  1972. buffer: typing.Union[str, list],
  1973. *,
  1974. fontname: str = "helv",
  1975. fontfile: OptStr = None,
  1976. set_simple: int = 0,
  1977. encoding: int = 0,
  1978. fontsize: float = 11,
  1979. lineheight: OptFloat = None,
  1980. color: OptSeq = None,
  1981. fill: OptSeq = None,
  1982. expandtabs: int = 1,
  1983. align: int = 0,
  1984. rotate: int = 0,
  1985. render_mode: int = 0,
  1986. miter_limit: float = 1,
  1987. border_width: float = 0.05,
  1988. morph: OptSeq = None,
  1989. overlay: bool = True,
  1990. stroke_opacity: float = 1,
  1991. fill_opacity: float = 1,
  1992. oc: int = 0,
  1993. ) -> float:
  1994. """Insert text into a given rectangle.
  1995. Notes:
  1996. Creates a Shape object, uses its same-named method and commits it.
  1997. Parameters:
  1998. rect: (rect-like) area to use for text.
  1999. buffer: text to be inserted
  2000. fontname: a Base-14 font, font name or '/name'
  2001. fontfile: name of a font file
  2002. fontsize: font size
  2003. lineheight: overwrite the font property
  2004. color: RGB color triple
  2005. expandtabs: handles tabulators with string function
  2006. align: left, center, right, justified
  2007. rotate: 0, 90, 180, or 270 degrees
  2008. morph: morph box with a matrix and a fixpoint
  2009. overlay: put text in foreground or background
  2010. Returns:
  2011. unused or deficit rectangle area (float)
  2012. """
  2013. img = page.new_shape()
  2014. rc = img.insert_textbox(
  2015. rect,
  2016. buffer,
  2017. fontsize=fontsize,
  2018. lineheight=lineheight,
  2019. fontname=fontname,
  2020. fontfile=fontfile,
  2021. set_simple=set_simple,
  2022. encoding=encoding,
  2023. color=color,
  2024. fill=fill,
  2025. expandtabs=expandtabs,
  2026. render_mode=render_mode,
  2027. miter_limit=miter_limit,
  2028. border_width=border_width,
  2029. align=align,
  2030. rotate=rotate,
  2031. morph=morph,
  2032. stroke_opacity=stroke_opacity,
  2033. fill_opacity=fill_opacity,
  2034. oc=oc,
  2035. )
  2036. if rc >= 0:
  2037. img.commit(overlay)
  2038. return rc
  2039. def insert_text(
  2040. page: pymupdf.Page,
  2041. point: point_like,
  2042. text: typing.Union[str, list],
  2043. *,
  2044. fontsize: float = 11,
  2045. lineheight: OptFloat = None,
  2046. fontname: str = "helv",
  2047. fontfile: OptStr = None,
  2048. set_simple: int = 0,
  2049. encoding: int = 0,
  2050. color: OptSeq = None,
  2051. fill: OptSeq = None,
  2052. border_width: float = 0.05,
  2053. miter_limit: float = 1,
  2054. render_mode: int = 0,
  2055. rotate: int = 0,
  2056. morph: OptSeq = None,
  2057. overlay: bool = True,
  2058. stroke_opacity: float = 1,
  2059. fill_opacity: float = 1,
  2060. oc: int = 0,
  2061. ):
  2062. img = page.new_shape()
  2063. rc = img.insert_text(
  2064. point,
  2065. text,
  2066. fontsize=fontsize,
  2067. lineheight=lineheight,
  2068. fontname=fontname,
  2069. fontfile=fontfile,
  2070. set_simple=set_simple,
  2071. encoding=encoding,
  2072. color=color,
  2073. fill=fill,
  2074. border_width=border_width,
  2075. render_mode=render_mode,
  2076. miter_limit=miter_limit,
  2077. rotate=rotate,
  2078. morph=morph,
  2079. stroke_opacity=stroke_opacity,
  2080. fill_opacity=fill_opacity,
  2081. oc=oc,
  2082. )
  2083. if rc >= 0:
  2084. img.commit(overlay)
  2085. return rc
  2086. def insert_htmlbox(
  2087. page,
  2088. rect,
  2089. text,
  2090. *,
  2091. css=None,
  2092. scale_low=0,
  2093. archive=None,
  2094. rotate=0,
  2095. oc=0,
  2096. opacity=1,
  2097. overlay=True,
  2098. ) -> float:
  2099. """Insert text with optional HTML tags and stylings into a rectangle.
  2100. Args:
  2101. rect: (rect-like) rectangle into which the text should be placed.
  2102. text: (str) text with optional HTML tags and stylings.
  2103. css: (str) CSS styling commands.
  2104. scale_low: (float) force-fit content by scaling it down. Must be in
  2105. range [0, 1]. If 1, no scaling will take place. If 0, arbitrary
  2106. down-scaling is acceptable. A value of 0.1 would mean that content
  2107. may be scaled down by at most 90%.
  2108. archive: Archive object pointing to locations of used fonts or images
  2109. rotate: (int) rotate the text in the box by a multiple of 90 degrees.
  2110. oc: (int) the xref of an OCG / OCMD (Optional Content).
  2111. opacity: (float) set opacity of inserted content.
  2112. overlay: (bool) put text on top of page content.
  2113. Returns:
  2114. A tuple of floats (spare_height, scale).
  2115. spare_height: -1 if content did not fit, else >= 0. It is the height of the
  2116. unused (still available) rectangle stripe. Positive only if
  2117. scale_min = 1 (no down scaling).
  2118. scale: downscaling factor, 0 < scale <= 1. Set to 0 if spare_height = -1 (no fit).
  2119. """
  2120. # normalize rotation angle
  2121. if not rotate % 90 == 0:
  2122. raise ValueError("bad rotation angle")
  2123. while rotate < 0:
  2124. rotate += 360
  2125. while rotate >= 360:
  2126. rotate -= 360
  2127. if not 0 <= scale_low <= 1:
  2128. raise ValueError("'scale_low' must be in [0, 1]")
  2129. if css is None:
  2130. css = ""
  2131. rect = pymupdf.Rect(rect)
  2132. if rotate in (90, 270):
  2133. temp_rect = pymupdf.Rect(0, 0, rect.height, rect.width)
  2134. else:
  2135. temp_rect = pymupdf.Rect(0, 0, rect.width, rect.height)
  2136. # use a small border by default
  2137. mycss = "body {margin:1px;}" + css # append user CSS
  2138. # either make a story, or accept a given one
  2139. if isinstance(text, str): # if a string, convert to a Story
  2140. story = pymupdf.Story(html=text, user_css=mycss, archive=archive)
  2141. elif isinstance(text, pymupdf.Story):
  2142. story = text
  2143. else:
  2144. raise ValueError("'text' must be a string or a Story")
  2145. # ----------------------------------------------------------------
  2146. # Find a scaling factor that lets our story fit in
  2147. # ----------------------------------------------------------------
  2148. scale_max = None if scale_low == 0 else 1 / scale_low
  2149. fit = story.fit_scale(temp_rect, scale_min=1, scale_max=scale_max)
  2150. if not fit.big_enough: # there was no fit
  2151. return (-1, scale_low)
  2152. filled = fit.filled
  2153. scale = 1 / fit.parameter # shrink factor
  2154. spare_height = fit.rect.y1 - filled[3] # unused room at rectangle bottom
  2155. # Note: due to MuPDF's logic this may be negative even for successful fits.
  2156. if scale != 1 or spare_height < 0: # if scaling occurred, set spare_height to 0
  2157. spare_height = 0
  2158. def rect_function(*args):
  2159. return fit.rect, fit.rect, pymupdf.Identity
  2160. # draw story on temp PDF page
  2161. doc = story.write_with_links(rect_function)
  2162. # Insert opacity if requested.
  2163. # For this, we prepend a command to the /Contents.
  2164. if 0 <= opacity < 1:
  2165. tpage = doc[0] # load page
  2166. # generate /ExtGstate for the page
  2167. alp0 = tpage._set_opacity(CA=opacity, ca=opacity)
  2168. s = f"/{alp0} gs\n" # generate graphic state command
  2169. pymupdf.TOOLS._insert_contents(tpage, s.encode(), 0)
  2170. # put result in target page
  2171. page.show_pdf_page(rect, doc, 0, rotate=rotate, oc=oc, overlay=overlay)
  2172. # -------------------------------------------------------------------------
  2173. # re-insert links in target rect (show_pdf_page cannot copy annotations)
  2174. # -------------------------------------------------------------------------
  2175. # scaled center point of fit.rect
  2176. mp1 = (fit.rect.tl + fit.rect.br) / 2 * scale
  2177. # center point of target rect
  2178. mp2 = (rect.tl + rect.br) / 2
  2179. # compute link positioning matrix:
  2180. # - move center of scaled-down fit.rect to (0,0)
  2181. # - rotate
  2182. # - move (0,0) to center of target rect
  2183. mat = (
  2184. pymupdf.Matrix(scale, 0, 0, scale, -mp1.x, -mp1.y)
  2185. * pymupdf.Matrix(-rotate)
  2186. * pymupdf.Matrix(1, 0, 0, 1, mp2.x, mp2.y)
  2187. )
  2188. # copy over links
  2189. for link in doc[0].get_links():
  2190. link["from"] *= mat
  2191. page.insert_link(link)
  2192. return spare_height, scale
  2193. def new_page(
  2194. doc: pymupdf.Document,
  2195. pno: int = -1,
  2196. width: float = 595,
  2197. height: float = 842,
  2198. ) -> pymupdf.Page:
  2199. """Create and return a new page object.
  2200. Args:
  2201. pno: (int) insert before this page. Default: after last page.
  2202. width: (float) page width in points. Default: 595 (ISO A4 width).
  2203. height: (float) page height in points. Default 842 (ISO A4 height).
  2204. Returns:
  2205. A pymupdf.Page object.
  2206. """
  2207. doc._newPage(pno, width=width, height=height)
  2208. return doc[pno]
  2209. def insert_page(
  2210. doc: pymupdf.Document,
  2211. pno: int,
  2212. text: typing.Union[str, list, None] = None,
  2213. fontsize: float = 11,
  2214. width: float = 595,
  2215. height: float = 842,
  2216. fontname: str = "helv",
  2217. fontfile: OptStr = None,
  2218. color: OptSeq = (0,),
  2219. ) -> int:
  2220. """Create a new PDF page and insert some text.
  2221. Notes:
  2222. Function combining pymupdf.Document.new_page() and pymupdf.Page.insert_text().
  2223. For parameter details see these methods.
  2224. """
  2225. page = doc.new_page(pno=pno, width=width, height=height)
  2226. if not bool(text):
  2227. return 0
  2228. rc = page.insert_text(
  2229. (50, 72),
  2230. text,
  2231. fontsize=fontsize,
  2232. fontname=fontname,
  2233. fontfile=fontfile,
  2234. color=color,
  2235. )
  2236. return rc
  2237. def draw_line(
  2238. page: pymupdf.Page,
  2239. p1: point_like,
  2240. p2: point_like,
  2241. color: OptSeq = (0,),
  2242. dashes: OptStr = None,
  2243. width: float = 1,
  2244. lineCap: int = 0,
  2245. lineJoin: int = 0,
  2246. overlay: bool = True,
  2247. morph: OptSeq = None,
  2248. stroke_opacity: float = 1,
  2249. fill_opacity: float = 1,
  2250. oc=0,
  2251. ) -> pymupdf.Point:
  2252. """Draw a line from point p1 to point p2."""
  2253. img = page.new_shape()
  2254. p = img.draw_line(pymupdf.Point(p1), pymupdf.Point(p2))
  2255. img.finish(
  2256. color=color,
  2257. dashes=dashes,
  2258. width=width,
  2259. closePath=False,
  2260. lineCap=lineCap,
  2261. lineJoin=lineJoin,
  2262. morph=morph,
  2263. stroke_opacity=stroke_opacity,
  2264. fill_opacity=fill_opacity,
  2265. oc=oc,
  2266. )
  2267. img.commit(overlay)
  2268. return p
  2269. def draw_squiggle(
  2270. page: pymupdf.Page,
  2271. p1: point_like,
  2272. p2: point_like,
  2273. breadth: float = 2,
  2274. color: OptSeq = (0,),
  2275. dashes: OptStr = None,
  2276. width: float = 1,
  2277. lineCap: int = 0,
  2278. lineJoin: int = 0,
  2279. overlay: bool = True,
  2280. morph: OptSeq = None,
  2281. stroke_opacity: float = 1,
  2282. fill_opacity: float = 1,
  2283. oc: int = 0,
  2284. ) -> pymupdf.Point:
  2285. """Draw a squiggly line from point p1 to point p2."""
  2286. img = page.new_shape()
  2287. p = img.draw_squiggle(pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth)
  2288. img.finish(
  2289. color=color,
  2290. dashes=dashes,
  2291. width=width,
  2292. closePath=False,
  2293. lineCap=lineCap,
  2294. lineJoin=lineJoin,
  2295. morph=morph,
  2296. stroke_opacity=stroke_opacity,
  2297. fill_opacity=fill_opacity,
  2298. oc=oc,
  2299. )
  2300. img.commit(overlay)
  2301. return p
  2302. def draw_zigzag(
  2303. page: pymupdf.Page,
  2304. p1: point_like,
  2305. p2: point_like,
  2306. breadth: float = 2,
  2307. color: OptSeq = (0,),
  2308. dashes: OptStr = None,
  2309. width: float = 1,
  2310. lineCap: int = 0,
  2311. lineJoin: int = 0,
  2312. overlay: bool = True,
  2313. morph: OptSeq = None,
  2314. stroke_opacity: float = 1,
  2315. fill_opacity: float = 1,
  2316. oc: int = 0,
  2317. ) -> pymupdf.Point:
  2318. """Draw a zigzag line from point p1 to point p2."""
  2319. img = page.new_shape()
  2320. p = img.draw_zigzag(pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth)
  2321. img.finish(
  2322. color=color,
  2323. dashes=dashes,
  2324. width=width,
  2325. closePath=False,
  2326. lineCap=lineCap,
  2327. lineJoin=lineJoin,
  2328. morph=morph,
  2329. stroke_opacity=stroke_opacity,
  2330. fill_opacity=fill_opacity,
  2331. oc=oc,
  2332. )
  2333. img.commit(overlay)
  2334. return p
  2335. def draw_rect(
  2336. page: pymupdf.Page,
  2337. rect: rect_like,
  2338. color: OptSeq = (0,),
  2339. fill: OptSeq = None,
  2340. dashes: OptStr = None,
  2341. width: float = 1,
  2342. lineCap: int = 0,
  2343. lineJoin: int = 0,
  2344. morph: OptSeq = None,
  2345. overlay: bool = True,
  2346. stroke_opacity: float = 1,
  2347. fill_opacity: float = 1,
  2348. oc: int = 0,
  2349. radius=None,
  2350. ) -> pymupdf.Point:
  2351. '''
  2352. Draw a rectangle. See Shape class method for details.
  2353. '''
  2354. img = page.new_shape()
  2355. Q = img.draw_rect(pymupdf.Rect(rect), radius=radius)
  2356. img.finish(
  2357. color=color,
  2358. fill=fill,
  2359. dashes=dashes,
  2360. width=width,
  2361. lineCap=lineCap,
  2362. lineJoin=lineJoin,
  2363. morph=morph,
  2364. stroke_opacity=stroke_opacity,
  2365. fill_opacity=fill_opacity,
  2366. oc=oc,
  2367. )
  2368. img.commit(overlay)
  2369. return Q
  2370. def draw_quad(
  2371. page: pymupdf.Page,
  2372. quad: quad_like,
  2373. color: OptSeq = (0,),
  2374. fill: OptSeq = None,
  2375. dashes: OptStr = None,
  2376. width: float = 1,
  2377. lineCap: int = 0,
  2378. lineJoin: int = 0,
  2379. morph: OptSeq = None,
  2380. overlay: bool = True,
  2381. stroke_opacity: float = 1,
  2382. fill_opacity: float = 1,
  2383. oc: int = 0,
  2384. ) -> pymupdf.Point:
  2385. """Draw a quadrilateral."""
  2386. img = page.new_shape()
  2387. Q = img.draw_quad(pymupdf.Quad(quad))
  2388. img.finish(
  2389. color=color,
  2390. fill=fill,
  2391. dashes=dashes,
  2392. width=width,
  2393. lineCap=lineCap,
  2394. lineJoin=lineJoin,
  2395. morph=morph,
  2396. stroke_opacity=stroke_opacity,
  2397. fill_opacity=fill_opacity,
  2398. oc=oc,
  2399. )
  2400. img.commit(overlay)
  2401. return Q
  2402. def draw_polyline(
  2403. page: pymupdf.Page,
  2404. points: list,
  2405. color: OptSeq = (0,),
  2406. fill: OptSeq = None,
  2407. dashes: OptStr = None,
  2408. width: float = 1,
  2409. morph: OptSeq = None,
  2410. lineCap: int = 0,
  2411. lineJoin: int = 0,
  2412. overlay: bool = True,
  2413. closePath: bool = False,
  2414. stroke_opacity: float = 1,
  2415. fill_opacity: float = 1,
  2416. oc: int = 0,
  2417. ) -> pymupdf.Point:
  2418. """Draw multiple connected line segments."""
  2419. img = page.new_shape()
  2420. Q = img.draw_polyline(points)
  2421. img.finish(
  2422. color=color,
  2423. fill=fill,
  2424. dashes=dashes,
  2425. width=width,
  2426. lineCap=lineCap,
  2427. lineJoin=lineJoin,
  2428. morph=morph,
  2429. closePath=closePath,
  2430. stroke_opacity=stroke_opacity,
  2431. fill_opacity=fill_opacity,
  2432. oc=oc,
  2433. )
  2434. img.commit(overlay)
  2435. return Q
  2436. def draw_circle(
  2437. page: pymupdf.Page,
  2438. center: point_like,
  2439. radius: float,
  2440. color: OptSeq = (0,),
  2441. fill: OptSeq = None,
  2442. morph: OptSeq = None,
  2443. dashes: OptStr = None,
  2444. width: float = 1,
  2445. lineCap: int = 0,
  2446. lineJoin: int = 0,
  2447. overlay: bool = True,
  2448. stroke_opacity: float = 1,
  2449. fill_opacity: float = 1,
  2450. oc: int = 0,
  2451. ) -> pymupdf.Point:
  2452. """Draw a circle given its center and radius."""
  2453. img = page.new_shape()
  2454. Q = img.draw_circle(pymupdf.Point(center), radius)
  2455. img.finish(
  2456. color=color,
  2457. fill=fill,
  2458. dashes=dashes,
  2459. width=width,
  2460. lineCap=lineCap,
  2461. lineJoin=lineJoin,
  2462. morph=morph,
  2463. stroke_opacity=stroke_opacity,
  2464. fill_opacity=fill_opacity,
  2465. oc=oc,
  2466. )
  2467. img.commit(overlay)
  2468. return Q
  2469. def draw_oval(
  2470. page: pymupdf.Page,
  2471. rect: typing.Union[rect_like, quad_like],
  2472. color: OptSeq = (0,),
  2473. fill: OptSeq = None,
  2474. dashes: OptStr = None,
  2475. morph: OptSeq = None,
  2476. width: float = 1,
  2477. lineCap: int = 0,
  2478. lineJoin: int = 0,
  2479. overlay: bool = True,
  2480. stroke_opacity: float = 1,
  2481. fill_opacity: float = 1,
  2482. oc: int = 0,
  2483. ) -> pymupdf.Point:
  2484. """Draw an oval given its containing rectangle or quad."""
  2485. img = page.new_shape()
  2486. Q = img.draw_oval(rect)
  2487. img.finish(
  2488. color=color,
  2489. fill=fill,
  2490. dashes=dashes,
  2491. width=width,
  2492. lineCap=lineCap,
  2493. lineJoin=lineJoin,
  2494. morph=morph,
  2495. stroke_opacity=stroke_opacity,
  2496. fill_opacity=fill_opacity,
  2497. oc=oc,
  2498. )
  2499. img.commit(overlay)
  2500. return Q
  2501. def draw_curve(
  2502. page: pymupdf.Page,
  2503. p1: point_like,
  2504. p2: point_like,
  2505. p3: point_like,
  2506. color: OptSeq = (0,),
  2507. fill: OptSeq = None,
  2508. dashes: OptStr = None,
  2509. width: float = 1,
  2510. morph: OptSeq = None,
  2511. closePath: bool = False,
  2512. lineCap: int = 0,
  2513. lineJoin: int = 0,
  2514. overlay: bool = True,
  2515. stroke_opacity: float = 1,
  2516. fill_opacity: float = 1,
  2517. oc: int = 0,
  2518. ) -> pymupdf.Point:
  2519. """Draw a special Bezier curve from p1 to p3, generating control points on lines p1 to p2 and p2 to p3."""
  2520. img = page.new_shape()
  2521. Q = img.draw_curve(pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3))
  2522. img.finish(
  2523. color=color,
  2524. fill=fill,
  2525. dashes=dashes,
  2526. width=width,
  2527. lineCap=lineCap,
  2528. lineJoin=lineJoin,
  2529. morph=morph,
  2530. closePath=closePath,
  2531. stroke_opacity=stroke_opacity,
  2532. fill_opacity=fill_opacity,
  2533. oc=oc,
  2534. )
  2535. img.commit(overlay)
  2536. return Q
  2537. def draw_bezier(
  2538. page: pymupdf.Page,
  2539. p1: point_like,
  2540. p2: point_like,
  2541. p3: point_like,
  2542. p4: point_like,
  2543. color: OptSeq = (0,),
  2544. fill: OptSeq = None,
  2545. dashes: OptStr = None,
  2546. width: float = 1,
  2547. morph: OptStr = None,
  2548. closePath: bool = False,
  2549. lineCap: int = 0,
  2550. lineJoin: int = 0,
  2551. overlay: bool = True,
  2552. stroke_opacity: float = 1,
  2553. fill_opacity: float = 1,
  2554. oc: int = 0,
  2555. ) -> pymupdf.Point:
  2556. """Draw a general cubic Bezier curve from p1 to p4 using control points p2 and p3."""
  2557. img = page.new_shape()
  2558. Q = img.draw_bezier(pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3), pymupdf.Point(p4))
  2559. img.finish(
  2560. color=color,
  2561. fill=fill,
  2562. dashes=dashes,
  2563. width=width,
  2564. lineCap=lineCap,
  2565. lineJoin=lineJoin,
  2566. morph=morph,
  2567. closePath=closePath,
  2568. stroke_opacity=stroke_opacity,
  2569. fill_opacity=fill_opacity,
  2570. oc=oc,
  2571. )
  2572. img.commit(overlay)
  2573. return Q
  2574. def draw_sector(
  2575. page: pymupdf.Page,
  2576. center: point_like,
  2577. point: point_like,
  2578. beta: float,
  2579. color: OptSeq = (0,),
  2580. fill: OptSeq = None,
  2581. dashes: OptStr = None,
  2582. fullSector: bool = True,
  2583. morph: OptSeq = None,
  2584. width: float = 1,
  2585. closePath: bool = False,
  2586. lineCap: int = 0,
  2587. lineJoin: int = 0,
  2588. overlay: bool = True,
  2589. stroke_opacity: float = 1,
  2590. fill_opacity: float = 1,
  2591. oc: int = 0,
  2592. ) -> pymupdf.Point:
  2593. """Draw a circle sector given circle center, one arc end point and the angle of the arc.
  2594. Parameters:
  2595. center -- center of circle
  2596. point -- arc end point
  2597. beta -- angle of arc (degrees)
  2598. fullSector -- connect arc ends with center
  2599. """
  2600. img = page.new_shape()
  2601. Q = img.draw_sector(pymupdf.Point(center), pymupdf.Point(point), beta, fullSector=fullSector)
  2602. img.finish(
  2603. color=color,
  2604. fill=fill,
  2605. dashes=dashes,
  2606. width=width,
  2607. lineCap=lineCap,
  2608. lineJoin=lineJoin,
  2609. morph=morph,
  2610. closePath=closePath,
  2611. stroke_opacity=stroke_opacity,
  2612. fill_opacity=fill_opacity,
  2613. oc=oc,
  2614. )
  2615. img.commit(overlay)
  2616. return Q
  2617. # ----------------------------------------------------------------------
  2618. # Name: wx.lib.colourdb.py
  2619. # Purpose: Adds a bunch of colour names and RGB values to the
  2620. # colour database so they can be found by name
  2621. #
  2622. # Author: Robin Dunn
  2623. #
  2624. # Created: 13-March-2001
  2625. # Copyright: (c) 2001-2017 by Total Control Software
  2626. # Licence: wxWindows license
  2627. # Tags: phoenix-port, unittest, documented
  2628. # ----------------------------------------------------------------------
  2629. def getColorList() -> list:
  2630. """
  2631. Returns a list of upper-case colour names.
  2632. :rtype: list of strings
  2633. """
  2634. return [name for name, r, g, b in pymupdf.colors_wx_list()]
  2635. def getColorInfoList() -> list:
  2636. """
  2637. Returns list of (name, red, gree, blue) tuples, where:
  2638. name: upper-case color name.
  2639. read, green, blue: integers in range 0..255.
  2640. :rtype: list of tuples
  2641. """
  2642. return pymupdf.colors_wx_list()
  2643. def getColor(name: str) -> tuple:
  2644. """Retrieve RGB color in PDF format by name.
  2645. Returns:
  2646. a triple of floats in range 0 to 1. In case of name-not-found, "white" is returned.
  2647. """
  2648. return pymupdf.colors_pdf_dict().get(name.lower(), (1, 1, 1))
  2649. def getColorHSV(name: str) -> tuple:
  2650. """Retrieve the hue, saturation, value triple of a color name.
  2651. Returns:
  2652. a triple (degree, percent, percent). If not found (-1, -1, -1) is returned.
  2653. """
  2654. try:
  2655. x = getColorInfoList()[getColorList().index(name.upper())]
  2656. except Exception:
  2657. if g_exceptions_verbose: pymupdf.exception_info()
  2658. return (-1, -1, -1)
  2659. r = x[1] / 255.0
  2660. g = x[2] / 255.0
  2661. b = x[3] / 255.0
  2662. cmax = max(r, g, b)
  2663. V = round(cmax * 100, 1)
  2664. cmin = min(r, g, b)
  2665. delta = cmax - cmin
  2666. if delta == 0:
  2667. hue = 0
  2668. elif cmax == r:
  2669. hue = 60.0 * (((g - b) / delta) % 6)
  2670. elif cmax == g:
  2671. hue = 60.0 * (((b - r) / delta) + 2)
  2672. else:
  2673. hue = 60.0 * (((r - g) / delta) + 4)
  2674. H = int(round(hue))
  2675. if cmax == 0:
  2676. sat = 0
  2677. else:
  2678. sat = delta / cmax
  2679. S = int(round(sat * 100))
  2680. return (H, S, V)
  2681. def _get_font_properties(doc: pymupdf.Document, xref: int) -> tuple:
  2682. fontname, ext, stype, buffer = doc.extract_font(xref)
  2683. asc = 0.8
  2684. dsc = -0.2
  2685. if ext == "":
  2686. return fontname, ext, stype, asc, dsc
  2687. if buffer:
  2688. try:
  2689. font = pymupdf.Font(fontbuffer=buffer)
  2690. asc = font.ascender
  2691. dsc = font.descender
  2692. bbox = font.bbox
  2693. if asc - dsc < 1:
  2694. if bbox.y0 < dsc:
  2695. dsc = bbox.y0
  2696. asc = 1 - dsc
  2697. except Exception:
  2698. pymupdf.exception_info()
  2699. asc *= 1.2
  2700. dsc *= 1.2
  2701. return fontname, ext, stype, asc, dsc
  2702. if ext != "n/a":
  2703. try:
  2704. font = pymupdf.Font(fontname)
  2705. asc = font.ascender
  2706. dsc = font.descender
  2707. except Exception:
  2708. pymupdf.exception_info()
  2709. asc *= 1.2
  2710. dsc *= 1.2
  2711. else:
  2712. asc *= 1.2
  2713. dsc *= 1.2
  2714. return fontname, ext, stype, asc, dsc
  2715. def get_char_widths(
  2716. doc: pymupdf.Document, xref: int, limit: int = 256, idx: int = 0, fontdict: OptDict = None
  2717. ) -> list:
  2718. """Get list of glyph information of a font.
  2719. Notes:
  2720. Must be provided by its XREF number. If we already dealt with the
  2721. font, it will be recorded in doc.FontInfos. Otherwise we insert an
  2722. entry there.
  2723. Finally we return the glyphs for the font. This is a list of
  2724. (glyph, width) where glyph is an integer controlling the char
  2725. appearance, and width is a float controlling the char's spacing:
  2726. width * fontsize is the actual space.
  2727. For 'simple' fonts, glyph == ord(char) will usually be true.
  2728. Exceptions are 'Symbol' and 'ZapfDingbats'. We are providing data for these directly here.
  2729. """
  2730. fontinfo = pymupdf.CheckFontInfo(doc, xref)
  2731. if fontinfo is None: # not recorded yet: create it
  2732. if fontdict is None:
  2733. name, ext, stype, asc, dsc = _get_font_properties(doc, xref)
  2734. fontdict = {
  2735. "name": name,
  2736. "type": stype,
  2737. "ext": ext,
  2738. "ascender": asc,
  2739. "descender": dsc,
  2740. }
  2741. else:
  2742. name = fontdict["name"]
  2743. ext = fontdict["ext"]
  2744. stype = fontdict["type"]
  2745. ordering = fontdict["ordering"]
  2746. simple = fontdict["simple"]
  2747. if ext == "":
  2748. raise ValueError("xref is not a font")
  2749. # check for 'simple' fonts
  2750. if stype in ("Type1", "MMType1", "TrueType"):
  2751. simple = True
  2752. else:
  2753. simple = False
  2754. # check for CJK fonts
  2755. if name in ("Fangti", "Ming"):
  2756. ordering = 0
  2757. elif name in ("Heiti", "Song"):
  2758. ordering = 1
  2759. elif name in ("Gothic", "Mincho"):
  2760. ordering = 2
  2761. elif name in ("Dotum", "Batang"):
  2762. ordering = 3
  2763. else:
  2764. ordering = -1
  2765. fontdict["simple"] = simple
  2766. if name == "ZapfDingbats":
  2767. glyphs = pymupdf.zapf_glyphs
  2768. elif name == "Symbol":
  2769. glyphs = pymupdf.symbol_glyphs
  2770. else:
  2771. glyphs = None
  2772. fontdict["glyphs"] = glyphs
  2773. fontdict["ordering"] = ordering
  2774. fontinfo = [xref, fontdict]
  2775. doc.FontInfos.append(fontinfo)
  2776. else:
  2777. fontdict = fontinfo[1]
  2778. glyphs = fontdict["glyphs"]
  2779. simple = fontdict["simple"]
  2780. ordering = fontdict["ordering"]
  2781. if glyphs is None:
  2782. oldlimit = 0
  2783. else:
  2784. oldlimit = len(glyphs)
  2785. mylimit = max(256, limit)
  2786. if mylimit <= oldlimit:
  2787. return glyphs
  2788. if ordering < 0: # not a CJK font
  2789. glyphs = doc._get_char_widths(
  2790. xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], mylimit, idx
  2791. )
  2792. else: # CJK fonts use char codes and width = 1
  2793. glyphs = None
  2794. fontdict["glyphs"] = glyphs
  2795. fontinfo[1] = fontdict
  2796. pymupdf.UpdateFontInfo(doc, fontinfo)
  2797. return glyphs
  2798. class Shape:
  2799. """Create a new shape."""
  2800. @staticmethod
  2801. def horizontal_angle(C, P):
  2802. """Return the angle to the horizontal for the connection from C to P.
  2803. This uses the arcus sine function and resolves its inherent ambiguity by
  2804. looking up in which quadrant vector S = P - C is located.
  2805. """
  2806. S = pymupdf.Point(P - C).unit # unit vector 'C' -> 'P'
  2807. alfa = math.asin(abs(S.y)) # absolute angle from horizontal
  2808. if S.x < 0: # make arcsin result unique
  2809. if S.y <= 0: # bottom-left
  2810. alfa = -(math.pi - alfa)
  2811. else: # top-left
  2812. alfa = math.pi - alfa
  2813. else:
  2814. if S.y >= 0: # top-right
  2815. pass
  2816. else: # bottom-right
  2817. alfa = -alfa
  2818. return alfa
  2819. def __init__(self, page: pymupdf.Page):
  2820. pymupdf.CheckParent(page)
  2821. self.page = page
  2822. self.doc = page.parent
  2823. if not self.doc.is_pdf:
  2824. raise ValueError("is no PDF")
  2825. self.height = page.mediabox_size.y
  2826. self.width = page.mediabox_size.x
  2827. self.x = page.cropbox_position.x
  2828. self.y = page.cropbox_position.y
  2829. self.pctm = page.transformation_matrix # page transf. matrix
  2830. self.ipctm = ~self.pctm # inverted transf. matrix
  2831. self.draw_cont = ""
  2832. self.text_cont = ""
  2833. self.totalcont = ""
  2834. self.last_point = None
  2835. self.rect = None
  2836. def updateRect(self, x):
  2837. if self.rect is None:
  2838. if len(x) == 2:
  2839. self.rect = pymupdf.Rect(x, x)
  2840. else:
  2841. self.rect = pymupdf.Rect(x)
  2842. else:
  2843. if len(x) == 2:
  2844. x = pymupdf.Point(x)
  2845. self.rect.x0 = min(self.rect.x0, x.x)
  2846. self.rect.y0 = min(self.rect.y0, x.y)
  2847. self.rect.x1 = max(self.rect.x1, x.x)
  2848. self.rect.y1 = max(self.rect.y1, x.y)
  2849. else:
  2850. x = pymupdf.Rect(x)
  2851. self.rect.x0 = min(self.rect.x0, x.x0)
  2852. self.rect.y0 = min(self.rect.y0, x.y0)
  2853. self.rect.x1 = max(self.rect.x1, x.x1)
  2854. self.rect.y1 = max(self.rect.y1, x.y1)
  2855. def draw_line(self, p1: point_like, p2: point_like) -> pymupdf.Point:
  2856. """Draw a line between two points."""
  2857. p1 = pymupdf.Point(p1)
  2858. p2 = pymupdf.Point(p2)
  2859. if not (self.last_point == p1):
  2860. self.draw_cont += _format_g(pymupdf.JM_TUPLE(p1 * self.ipctm)) + " m\n"
  2861. self.last_point = p1
  2862. self.updateRect(p1)
  2863. self.draw_cont += _format_g(pymupdf.JM_TUPLE(p2 * self.ipctm)) + " l\n"
  2864. self.updateRect(p2)
  2865. self.last_point = p2
  2866. return self.last_point
  2867. def draw_polyline(self, points: list) -> pymupdf.Point:
  2868. """Draw several connected line segments."""
  2869. for i, p in enumerate(points):
  2870. if i == 0:
  2871. if not (self.last_point == pymupdf.Point(p)):
  2872. self.draw_cont += _format_g(pymupdf.JM_TUPLE(pymupdf.Point(p) * self.ipctm)) + " m\n"
  2873. self.last_point = pymupdf.Point(p)
  2874. else:
  2875. self.draw_cont += _format_g(pymupdf.JM_TUPLE(pymupdf.Point(p) * self.ipctm)) + " l\n"
  2876. self.updateRect(p)
  2877. self.last_point = pymupdf.Point(points[-1])
  2878. return self.last_point
  2879. def draw_bezier(
  2880. self,
  2881. p1: point_like,
  2882. p2: point_like,
  2883. p3: point_like,
  2884. p4: point_like,
  2885. ) -> pymupdf.Point:
  2886. """Draw a standard cubic Bezier curve."""
  2887. p1 = pymupdf.Point(p1)
  2888. p2 = pymupdf.Point(p2)
  2889. p3 = pymupdf.Point(p3)
  2890. p4 = pymupdf.Point(p4)
  2891. if not (self.last_point == p1):
  2892. self.draw_cont += _format_g(pymupdf.JM_TUPLE(p1 * self.ipctm)) + " m\n"
  2893. args = pymupdf.JM_TUPLE(list(p2 * self.ipctm) + list(p3 * self.ipctm) + list(p4 * self.ipctm))
  2894. self.draw_cont += _format_g(args) + " c\n"
  2895. self.updateRect(p1)
  2896. self.updateRect(p2)
  2897. self.updateRect(p3)
  2898. self.updateRect(p4)
  2899. self.last_point = p4
  2900. return self.last_point
  2901. def draw_oval(self, tetra: typing.Union[quad_like, rect_like]) -> pymupdf.Point:
  2902. """Draw an ellipse inside a tetrapod."""
  2903. if len(tetra) != 4:
  2904. raise ValueError("invalid arg length")
  2905. if hasattr(tetra[0], "__float__"):
  2906. q = pymupdf.Rect(tetra).quad
  2907. else:
  2908. q = pymupdf.Quad(tetra)
  2909. mt = q.ul + (q.ur - q.ul) * 0.5
  2910. mr = q.ur + (q.lr - q.ur) * 0.5
  2911. mb = q.ll + (q.lr - q.ll) * 0.5
  2912. ml = q.ul + (q.ll - q.ul) * 0.5
  2913. if not (self.last_point == ml):
  2914. self.draw_cont += _format_g(pymupdf.JM_TUPLE(ml * self.ipctm)) + " m\n"
  2915. self.last_point = ml
  2916. self.draw_curve(ml, q.ll, mb)
  2917. self.draw_curve(mb, q.lr, mr)
  2918. self.draw_curve(mr, q.ur, mt)
  2919. self.draw_curve(mt, q.ul, ml)
  2920. self.updateRect(q.rect)
  2921. self.last_point = ml
  2922. return self.last_point
  2923. def draw_circle(self, center: point_like, radius: float) -> pymupdf.Point:
  2924. """Draw a circle given its center and radius."""
  2925. if not radius > pymupdf.EPSILON:
  2926. raise ValueError("radius must be positive")
  2927. center = pymupdf.Point(center)
  2928. p1 = center - (radius, 0)
  2929. return self.draw_sector(center, p1, 360, fullSector=False)
  2930. def draw_curve(
  2931. self,
  2932. p1: point_like,
  2933. p2: point_like,
  2934. p3: point_like,
  2935. ) -> pymupdf.Point:
  2936. """Draw a curve between points using one control point."""
  2937. kappa = 0.55228474983
  2938. p1 = pymupdf.Point(p1)
  2939. p2 = pymupdf.Point(p2)
  2940. p3 = pymupdf.Point(p3)
  2941. k1 = p1 + (p2 - p1) * kappa
  2942. k2 = p3 + (p2 - p3) * kappa
  2943. return self.draw_bezier(p1, k1, k2, p3)
  2944. def draw_sector(
  2945. self,
  2946. center: point_like,
  2947. point: point_like,
  2948. beta: float,
  2949. fullSector: bool = True,
  2950. ) -> pymupdf.Point:
  2951. """Draw a circle sector."""
  2952. center = pymupdf.Point(center)
  2953. point = pymupdf.Point(point)
  2954. l3 = lambda a, b: _format_g((a, b)) + " m\n"
  2955. l4 = lambda a, b, c, d, e, f: _format_g((a, b, c, d, e, f)) + " c\n"
  2956. l5 = lambda a, b: _format_g((a, b)) + " l\n"
  2957. betar = math.radians(-beta)
  2958. w360 = math.radians(math.copysign(360, betar)) * (-1)
  2959. w90 = math.radians(math.copysign(90, betar))
  2960. w45 = w90 / 2
  2961. while abs(betar) > 2 * math.pi:
  2962. betar += w360 # bring angle below 360 degrees
  2963. if not (self.last_point == point):
  2964. self.draw_cont += l3(*pymupdf.JM_TUPLE(point * self.ipctm))
  2965. self.last_point = point
  2966. Q = pymupdf.Point(0, 0) # just make sure it exists
  2967. C = center
  2968. P = point
  2969. S = P - C # vector 'center' -> 'point'
  2970. rad = abs(S) # circle radius
  2971. if not rad > pymupdf.EPSILON:
  2972. raise ValueError("radius must be positive")
  2973. alfa = self.horizontal_angle(center, point)
  2974. while abs(betar) > abs(w90): # draw 90 degree arcs
  2975. q1 = C.x + math.cos(alfa + w90) * rad
  2976. q2 = C.y + math.sin(alfa + w90) * rad
  2977. Q = pymupdf.Point(q1, q2) # the arc's end point
  2978. r1 = C.x + math.cos(alfa + w45) * rad / math.cos(w45)
  2979. r2 = C.y + math.sin(alfa + w45) * rad / math.cos(w45)
  2980. R = pymupdf.Point(r1, r2) # crossing point of tangents
  2981. kappah = (1 - math.cos(w45)) * 4 / 3 / abs(R - Q)
  2982. kappa = kappah * abs(P - Q)
  2983. cp1 = P + (R - P) * kappa # control point 1
  2984. cp2 = Q + (R - Q) * kappa # control point 2
  2985. self.draw_cont += l4(*pymupdf.JM_TUPLE(
  2986. list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
  2987. ))
  2988. betar -= w90 # reduce param angle by 90 deg
  2989. alfa += w90 # advance start angle by 90 deg
  2990. P = Q # advance to arc end point
  2991. # draw (remaining) arc
  2992. if abs(betar) > 1e-3: # significant degrees left?
  2993. beta2 = betar / 2
  2994. q1 = C.x + math.cos(alfa + betar) * rad
  2995. q2 = C.y + math.sin(alfa + betar) * rad
  2996. Q = pymupdf.Point(q1, q2) # the arc's end point
  2997. r1 = C.x + math.cos(alfa + beta2) * rad / math.cos(beta2)
  2998. r2 = C.y + math.sin(alfa + beta2) * rad / math.cos(beta2)
  2999. R = pymupdf.Point(r1, r2) # crossing point of tangents
  3000. # kappa height is 4/3 of segment height
  3001. kappah = (1 - math.cos(beta2)) * 4 / 3 / abs(R - Q) # kappa height
  3002. kappa = kappah * abs(P - Q) / (1 - math.cos(betar))
  3003. cp1 = P + (R - P) * kappa # control point 1
  3004. cp2 = Q + (R - Q) * kappa # control point 2
  3005. self.draw_cont += l4(*pymupdf.JM_TUPLE(
  3006. list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
  3007. ))
  3008. if fullSector:
  3009. self.draw_cont += l3(*pymupdf.JM_TUPLE(point * self.ipctm))
  3010. self.draw_cont += l5(*pymupdf.JM_TUPLE(center * self.ipctm))
  3011. self.draw_cont += l5(*pymupdf.JM_TUPLE(Q * self.ipctm))
  3012. self.last_point = Q
  3013. return self.last_point
  3014. def draw_rect(self, rect: rect_like, *, radius=None) -> pymupdf.Point:
  3015. """Draw a rectangle.
  3016. Args:
  3017. radius: if not None, the rectangle will have rounded corners.
  3018. This is the radius of the curvature, given as percentage of
  3019. the rectangle width or height. Valid are values 0 < v <= 0.5.
  3020. For a sequence of two values, the corners will have different
  3021. radii. Otherwise, the percentage will be computed from the
  3022. shorter side. A value of (0.5, 0.5) will draw an ellipse.
  3023. """
  3024. r = pymupdf.Rect(rect)
  3025. if radius is None: # standard rectangle
  3026. self.draw_cont += _format_g(pymupdf.JM_TUPLE(
  3027. list(r.bl * self.ipctm) + [r.width, r.height]
  3028. )) + " re\n"
  3029. self.updateRect(r)
  3030. self.last_point = r.tl
  3031. return self.last_point
  3032. # rounded corners requested. This requires 1 or 2 values, each
  3033. # with 0 < value <= 0.5
  3034. if hasattr(radius, "__float__"):
  3035. if radius <= 0 or radius > 0.5:
  3036. raise ValueError(f"bad radius value {radius}.")
  3037. d = min(r.width, r.height) * radius
  3038. px = (d, 0)
  3039. py = (0, d)
  3040. elif hasattr(radius, "__len__") and len(radius) == 2:
  3041. rx, ry = radius
  3042. px = (rx * r.width, 0)
  3043. py = (0, ry * r.height)
  3044. if min(rx, ry) <= 0 or max(rx, ry) > 0.5:
  3045. raise ValueError(f"bad radius value {radius}.")
  3046. else:
  3047. raise ValueError(f"bad radius value {radius}.")
  3048. lp = self.draw_line(r.tl + py, r.bl - py)
  3049. lp = self.draw_curve(lp, r.bl, r.bl + px)
  3050. lp = self.draw_line(lp, r.br - px)
  3051. lp = self.draw_curve(lp, r.br, r.br - py)
  3052. lp = self.draw_line(lp, r.tr + py)
  3053. lp = self.draw_curve(lp, r.tr, r.tr - px)
  3054. lp = self.draw_line(lp, r.tl + px)
  3055. self.last_point = self.draw_curve(lp, r.tl, r.tl + py)
  3056. self.updateRect(r)
  3057. return self.last_point
  3058. def draw_quad(self, quad: quad_like) -> pymupdf.Point:
  3059. """Draw a Quad."""
  3060. q = pymupdf.Quad(quad)
  3061. return self.draw_polyline([q.ul, q.ll, q.lr, q.ur, q.ul])
  3062. def draw_zigzag(
  3063. self,
  3064. p1: point_like,
  3065. p2: point_like,
  3066. breadth: float = 2,
  3067. ) -> pymupdf.Point:
  3068. """Draw a zig-zagged line from p1 to p2."""
  3069. p1 = pymupdf.Point(p1)
  3070. p2 = pymupdf.Point(p2)
  3071. S = p2 - p1 # vector start - end
  3072. rad = abs(S) # distance of points
  3073. cnt = 4 * int(round(rad / (4 * breadth), 0)) # always take full phases
  3074. if cnt < 4:
  3075. raise ValueError("points too close")
  3076. mb = rad / cnt # revised breadth
  3077. matrix = pymupdf.Matrix(pymupdf.util_hor_matrix(p1, p2)) # normalize line to x-axis
  3078. i_mat = ~matrix # get original position
  3079. points = [] # stores edges
  3080. for i in range(1, cnt):
  3081. if i % 4 == 1: # point "above" connection
  3082. p = pymupdf.Point(i, -1) * mb
  3083. elif i % 4 == 3: # point "below" connection
  3084. p = pymupdf.Point(i, 1) * mb
  3085. else: # ignore others
  3086. continue
  3087. points.append(p * i_mat)
  3088. self.draw_polyline([p1] + points + [p2]) # add start and end points
  3089. return p2
  3090. def draw_squiggle(
  3091. self,
  3092. p1: point_like,
  3093. p2: point_like,
  3094. breadth=2,
  3095. ) -> pymupdf.Point:
  3096. """Draw a squiggly line from p1 to p2."""
  3097. p1 = pymupdf.Point(p1)
  3098. p2 = pymupdf.Point(p2)
  3099. S = p2 - p1 # vector start - end
  3100. rad = abs(S) # distance of points
  3101. cnt = 4 * int(round(rad / (4 * breadth), 0)) # always take full phases
  3102. if cnt < 4:
  3103. raise ValueError("points too close")
  3104. mb = rad / cnt # revised breadth
  3105. matrix = pymupdf.Matrix(pymupdf.util_hor_matrix(p1, p2)) # normalize line to x-axis
  3106. i_mat = ~matrix # get original position
  3107. k = 2.4142135623765633 # y of draw_curve helper point
  3108. points = [] # stores edges
  3109. for i in range(1, cnt):
  3110. if i % 4 == 1: # point "above" connection
  3111. p = pymupdf.Point(i, -k) * mb
  3112. elif i % 4 == 3: # point "below" connection
  3113. p = pymupdf.Point(i, k) * mb
  3114. else: # else on connection line
  3115. p = pymupdf.Point(i, 0) * mb
  3116. points.append(p * i_mat)
  3117. points = [p1] + points + [p2]
  3118. cnt = len(points)
  3119. i = 0
  3120. while i + 2 < cnt:
  3121. self.draw_curve(points[i], points[i + 1], points[i + 2])
  3122. i += 2
  3123. return p2
  3124. # ==============================================================================
  3125. # Shape.insert_text
  3126. # ==============================================================================
  3127. def insert_text(
  3128. self,
  3129. point: point_like,
  3130. buffer: typing.Union[str, list],
  3131. *,
  3132. fontsize: float = 11,
  3133. lineheight: OptFloat = None,
  3134. fontname: str = "helv",
  3135. fontfile: OptStr = None,
  3136. set_simple: bool = 0,
  3137. encoding: int = 0,
  3138. color: OptSeq = None,
  3139. fill: OptSeq = None,
  3140. render_mode: int = 0,
  3141. border_width: float = 0.05,
  3142. miter_limit: float = 1,
  3143. rotate: int = 0,
  3144. morph: OptSeq = None,
  3145. stroke_opacity: float = 1,
  3146. fill_opacity: float = 1,
  3147. oc: int = 0,
  3148. ) -> int:
  3149. # ensure 'text' is a list of strings, worth dealing with
  3150. if not bool(buffer):
  3151. return 0
  3152. if type(buffer) not in (list, tuple):
  3153. text = buffer.splitlines()
  3154. else:
  3155. text = buffer
  3156. if not len(text) > 0:
  3157. return 0
  3158. point = pymupdf.Point(point)
  3159. try:
  3160. maxcode = max([ord(c) for c in " ".join(text)])
  3161. except Exception:
  3162. pymupdf.exception_info()
  3163. return 0
  3164. # ensure valid 'fontname'
  3165. fname = fontname
  3166. if fname.startswith("/"):
  3167. fname = fname[1:]
  3168. xref = self.page.insert_font(
  3169. fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
  3170. )
  3171. fontinfo = pymupdf.CheckFontInfo(self.doc, xref)
  3172. fontdict = fontinfo[1]
  3173. ordering = fontdict["ordering"]
  3174. simple = fontdict["simple"]
  3175. bfname = fontdict["name"]
  3176. ascender = fontdict["ascender"]
  3177. descender = fontdict["descender"]
  3178. if lineheight:
  3179. lheight = fontsize * lineheight
  3180. elif ascender - descender <= 1:
  3181. lheight = fontsize * 1.2
  3182. else:
  3183. lheight = fontsize * (ascender - descender)
  3184. if maxcode > 255:
  3185. glyphs = self.doc.get_char_widths(xref, maxcode + 1)
  3186. else:
  3187. glyphs = fontdict["glyphs"]
  3188. tab = []
  3189. for t in text:
  3190. if simple and bfname not in ("Symbol", "ZapfDingbats"):
  3191. g = None
  3192. else:
  3193. g = glyphs
  3194. tab.append(pymupdf.getTJstr(t, g, simple, ordering))
  3195. text = tab
  3196. color_str = pymupdf.ColorCode(color, "c")
  3197. fill_str = pymupdf.ColorCode(fill, "f")
  3198. if not fill and render_mode == 0: # ensure fill color when 0 Tr
  3199. fill = color
  3200. fill_str = pymupdf.ColorCode(color, "f")
  3201. morphing = pymupdf.CheckMorph(morph)
  3202. rot = rotate
  3203. if rot % 90 != 0:
  3204. raise ValueError("bad rotate value")
  3205. while rot < 0:
  3206. rot += 360
  3207. rot = rot % 360 # text rotate = 0, 90, 270, 180
  3208. templ1 = lambda a, b, c, d, e, f, g: f"\nq\n{a}{b}BT\n{c}1 0 0 1 {_format_g((d, e))} Tm\n/{f} {_format_g(g)} Tf "
  3209. templ2 = lambda a: f"TJ\n0 -{_format_g(a)} TD\n"
  3210. cmp90 = "0 1 -1 0 0 0 cm\n" # rotates 90 deg counter-clockwise
  3211. cmm90 = "0 -1 1 0 0 0 cm\n" # rotates 90 deg clockwise
  3212. cm180 = "-1 0 0 -1 0 0 cm\n" # rotates by 180 deg.
  3213. height = self.height
  3214. width = self.width
  3215. # setting up for standard rotation directions
  3216. # case rotate = 0
  3217. if morphing:
  3218. m1 = pymupdf.Matrix(1, 0, 0, 1, morph[0].x + self.x, height - morph[0].y - self.y)
  3219. mat = ~m1 * morph[1] * m1
  3220. cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n"
  3221. else:
  3222. cm = ""
  3223. top = height - point.y - self.y # start of 1st char
  3224. left = point.x + self.x # start of 1. char
  3225. space = top # space available
  3226. #headroom = point.y + self.y # distance to page border
  3227. if rot == 90:
  3228. left = height - point.y - self.y
  3229. top = -point.x - self.x
  3230. cm += cmp90
  3231. space = width - abs(top)
  3232. #headroom = point.x + self.x
  3233. elif rot == 270:
  3234. left = -height + point.y + self.y
  3235. top = point.x + self.x
  3236. cm += cmm90
  3237. space = abs(top)
  3238. #headroom = width - point.x - self.x
  3239. elif rot == 180:
  3240. left = -point.x - self.x
  3241. top = -height + point.y + self.y
  3242. cm += cm180
  3243. space = abs(point.y + self.y)
  3244. #headroom = height - point.y - self.y
  3245. optcont = self.page._get_optional_content(oc)
  3246. if optcont is not None:
  3247. bdc = "/OC /%s BDC\n" % optcont
  3248. emc = "EMC\n"
  3249. else:
  3250. bdc = emc = ""
  3251. alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
  3252. if alpha is None:
  3253. alpha = ""
  3254. else:
  3255. alpha = "/%s gs\n" % alpha
  3256. nres = templ1(bdc, alpha, cm, left, top, fname, fontsize)
  3257. if render_mode > 0:
  3258. nres += "%i Tr " % render_mode
  3259. nres += _format_g(border_width * fontsize) + " w "
  3260. if miter_limit is not None:
  3261. nres += _format_g(miter_limit) + " M "
  3262. if color is not None:
  3263. nres += color_str
  3264. if fill is not None:
  3265. nres += fill_str
  3266. # =========================================================================
  3267. # start text insertion
  3268. # =========================================================================
  3269. nres += text[0]
  3270. nlines = 1 # set output line counter
  3271. if len(text) > 1:
  3272. nres += templ2(lheight) # line 1
  3273. else:
  3274. nres += 'TJ'
  3275. for i in range(1, len(text)):
  3276. if space < lheight:
  3277. break # no space left on page
  3278. if i > 1:
  3279. nres += "\nT* "
  3280. nres += text[i] + 'TJ'
  3281. space -= lheight
  3282. nlines += 1
  3283. nres += "\nET\n%sQ\n" % emc
  3284. # =========================================================================
  3285. # end of text insertion
  3286. # =========================================================================
  3287. # update the /Contents object
  3288. self.text_cont += nres
  3289. return nlines
  3290. # ==============================================================================
  3291. # Shape.insert_textbox
  3292. # ==============================================================================
    def insert_textbox(
        self,
        rect: rect_like,
        buffer: typing.Union[str, list],
        *,
        fontname: OptStr = "helv",
        fontfile: OptStr = None,
        fontsize: float = 11,
        lineheight: OptFloat = None,
        set_simple: bool = 0,
        encoding: int = 0,
        color: OptSeq = None,
        fill: OptSeq = None,
        expandtabs: int = 1,
        border_width: float = 0.05,
        miter_limit: float = 1,
        align: int = 0,
        render_mode: int = 0,
        rotate: int = 0,
        morph: OptSeq = None,
        stroke_opacity: float = 1,
        fill_opacity: float = 1,
        oc: int = 0,
    ) -> float:
        """Insert text into a given rectangle.

        The text is wrapped at spaces so that every line fits the box width;
        words longer than one line are split character by character. Nothing
        is written if the wrapped text needs more height than available.

        Args:
            rect -- the textbox to fill
            buffer -- text to be inserted (string, or list / tuple of lines)
            fontname -- a Base-14 font, font name or '/name'
            fontfile -- name of a font file
            fontsize -- font size
            lineheight -- overwrite the font property
            set_simple, encoding -- passed to the page's insert_font
            color -- RGB stroke color triple
            fill -- RGB fill color triple
            render_mode -- text rendering control
            border_width -- thickness of glyph borders as percentage of fontsize
            miter_limit -- miter limit, only written if render_mode > 0
            expandtabs -- handles tabulators with string function
            align -- left, center, right, justified
            rotate -- 0, 90, 180, or 270 degrees
            morph -- morph box with a matrix and a fixpoint
            stroke_opacity, fill_opacity -- passed to the page's opacity setup
            oc -- optional content reference
        Returns:
            Unused height of the box (float, >= 0) if the text was written.
            A negative value is the missing height; in this case nothing was
            written. For an empty buffer, the full box extent in line
            progress direction is returned.
        Raises:
            ValueError -- for an empty or infinite rect, or if 'rotate' is
            not a multiple of 90.
        """
        rect = pymupdf.Rect(rect)
        if rect.is_empty or rect.is_infinite:
            raise ValueError("text box must be finite and not empty")

        color_str = pymupdf.ColorCode(color, "c")
        fill_str = pymupdf.ColorCode(fill, "f")
        if fill is None and render_mode == 0:  # ensure fill color for 0 Tr
            fill = color
            fill_str = pymupdf.ColorCode(color, "f")

        # optional content: wrap the output in BDC / EMC if requested
        optcont = self.page._get_optional_content(oc)
        if optcont is not None:
            bdc = "/OC /%s BDC\n" % optcont
            emc = "EMC\n"
        else:
            bdc = emc = ""

        # determine opacity / transparency
        alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
        if alpha is None:
            alpha = ""
        else:
            alpha = "/%s gs\n" % alpha

        if rotate % 90 != 0:
            raise ValueError("rotate must be multiple of 90")

        # normalize rotation to one of 0, 90, 180, 270
        rot = rotate
        while rot < 0:
            rot += 360
        rot = rot % 360

        # is buffer worth of dealing with?
        if not bool(buffer):
            return rect.height if rot in (0, 180) else rect.width

        cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates counter-clockwise
        cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates clockwise
        cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.

        height = self.height

        # font reference name without a leading slash
        fname = fontname
        if fname.startswith("/"):
            fname = fname[1:]

        xref = self.page.insert_font(
            fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
        )
        fontinfo = pymupdf.CheckFontInfo(self.doc, xref)

        fontdict = fontinfo[1]
        ordering = fontdict["ordering"]
        simple = fontdict["simple"]
        glyphs = fontdict["glyphs"]
        bfname = fontdict["name"]
        ascender = fontdict["ascender"]
        descender = fontdict["descender"]

        # line height as a factor of the font size
        if lineheight:
            lheight_factor = lineheight
        elif ascender - descender <= 1:
            lheight_factor = 1.2
        else:
            lheight_factor = ascender - descender
        lheight = fontsize * lheight_factor

        # create a list from buffer, split into its lines
        if type(buffer) in (list, tuple):
            t0 = "\n".join(buffer)
        else:
            t0 = buffer

        maxcode = max([ord(c) for c in t0])
        # replace invalid char codes for simple fonts
        if simple and maxcode > 255:
            t0 = "".join([c if ord(c) < 256 else "?" for c in t0])

        t0 = t0.splitlines()

        # widths for all needed characters (replaces the value read above)
        glyphs = self.doc.get_char_widths(xref, maxcode + 1)
        if simple and bfname not in ("Symbol", "ZapfDingbats"):
            tj_glyphs = None
        else:
            tj_glyphs = glyphs

        # ----------------------------------------------------------------------
        # calculate pixel length of a string
        # ----------------------------------------------------------------------
        def pixlen(x):
            """Calculate pixel length of x."""
            if ordering < 0:
                return sum([glyphs[ord(c)][1] for c in x]) * fontsize
            else:
                return len(x) * fontsize

        # ---------------------------------------------------------------------

        if ordering < 0:
            blen = glyphs[32][1] * fontsize  # pixel size of space character
        else:
            blen = fontsize

        text = ""  # output buffer

        if pymupdf.CheckMorph(morph):
            m1 = pymupdf.Matrix(
                1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
            )
            mat = ~m1 * morph[1] * m1
            cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n"
        else:
            cm = ""

        # ---------------------------------------------------------------------
        # adjust for text orientation / rotation
        # ---------------------------------------------------------------------
        progr = 1  # direction of line progress
        c_pnt = pymupdf.Point(0, fontsize * ascender)  # used for line progress
        if rot == 0:  # normal orientation
            point = rect.tl + c_pnt  # line 1 is 'lheight' below top
            maxwidth = rect.width  # pixels available in one line
            maxheight = rect.height  # available text height

        elif rot == 90:  # rotate counter clockwise
            c_pnt = pymupdf.Point(fontsize * ascender, 0)  # progress in x-direction
            point = rect.bl + c_pnt  # line 1 'lheight' away from left
            maxwidth = rect.height  # pixels available in one line
            maxheight = rect.width  # available text height
            cm += cmp90

        elif rot == 180:  # text upside down
            # progress upwards in y direction
            c_pnt = -pymupdf.Point(0, fontsize * ascender)
            point = rect.br + c_pnt  # line 1 'lheight' above bottom
            maxwidth = rect.width  # pixels available in one line
            progr = -1  # subtract lheight for next line
            maxheight = rect.height  # available text height
            cm += cm180

        else:  # rotate clockwise (270 or -90)
            # progress from right to left
            c_pnt = -pymupdf.Point(fontsize * ascender, 0)
            point = rect.tr + c_pnt  # line 1 'lheight' left of right
            maxwidth = rect.height  # pixels available in one line
            progr = -1  # subtract lheight for next line
            maxheight = rect.width  # available text height
            cm += cmm90

        # =====================================================================
        # line loop
        # =====================================================================
        just_tab = []  # 'justify' indicators per line

        for i, line in enumerate(t0):
            line_t = line.expandtabs(expandtabs).split(" ")  # split into words
            num_words = len(line_t)
            lbuff = ""  # init line buffer
            rest = maxwidth  # available line pixels
            # =================================================================
            # word loop
            # =================================================================
            for j in range(num_words):
                word = line_t[j]
                pl_w = pixlen(word)  # pixel len of word
                if rest >= pl_w:  # does it fit on the line?
                    lbuff += word + " "  # yes, append word
                    rest -= pl_w + blen  # update available line space
                    continue  # next word

                # word doesn't fit - output line (if not empty)
                if lbuff:
                    lbuff = lbuff.rstrip() + "\n"  # line full, append line break
                    text += lbuff  # append to total text
                    just_tab.append(True)  # can align-justify

                lbuff = ""  # re-init line buffer
                rest = maxwidth  # re-init avail. space

                if pl_w <= maxwidth:  # word shorter than 1 line?
                    lbuff = word + " "  # start the line with it
                    rest = maxwidth - pl_w - blen  # update free space
                    continue

                # long word: split across multiple lines - char by char ...
                if len(just_tab) > 0:
                    just_tab[-1] = False  # cannot align-justify
                for c in word:
                    if pixlen(lbuff) <= maxwidth - pixlen(c):
                        lbuff += c
                    else:  # line full
                        lbuff += "\n"  # close line
                        text += lbuff  # append to text
                        just_tab.append(False)  # cannot align-justify
                        lbuff = c  # start new line with this char

                lbuff += " "  # finish long word
                rest = maxwidth - pixlen(lbuff)  # long word stored

            if lbuff:  # unprocessed line content?
                text += lbuff.rstrip()  # append to text
                just_tab.append(False)  # cannot align-justify

            if i < len(t0) - 1:  # not the last line?
                text += "\n"  # insert line break

        # compute used part of the textbox
        if text.endswith("\n"):
            text = text[:-1]
        lb_count = text.count("\n") + 1  # number of lines written

        # text height = line count * line height plus one descender value
        text_height = lheight * lb_count - descender * fontsize

        more = text_height - maxheight  # difference to height limit
        if more > pymupdf.EPSILON:  # landed too much outside rect
            return (-1) * more  # return deficit, don't output

        more = abs(more)
        if more < pymupdf.EPSILON:
            more = 0  # don't bother with epsilons
        nres = "\nq\n%s%sBT\n" % (bdc, alpha) + cm  # initialize output buffer
        # positions the line start and selects font and size
        templ = lambda a, b, c, d: f"1 0 0 1 {_format_g((a, b))} Tm /{c} {_format_g(d)} Tf "
        # center, right, justify: output each line with its own specifics
        text_t = text.splitlines()  # split text in lines again
        just_tab[-1] = False  # never justify last line
        for i, t in enumerate(text_t):
            spacing = 0
            pl = maxwidth - pixlen(t)  # length of empty line part
            pnt = point + c_pnt * (i * lheight_factor)  # text start of line
            if align == 1:  # center: right shift by half width
                if rot in (0, 180):
                    pnt = pnt + pymupdf.Point(pl / 2, 0) * progr
                else:
                    pnt = pnt - pymupdf.Point(0, pl / 2) * progr
            elif align == 2:  # right: right shift by full width
                if rot in (0, 180):
                    pnt = pnt + pymupdf.Point(pl, 0) * progr
                else:
                    pnt = pnt - pymupdf.Point(0, pl) * progr
            elif align == 3:  # justify
                spaces = t.count(" ")  # number of spaces in line
                if spaces > 0 and just_tab[i]:  # if any, and we may justify
                    spacing = pl / spaces  # make every space this much larger
                else:
                    spacing = 0  # keep normal space length

            # line start position, depending on the rotation
            top = height - pnt.y - self.y
            left = pnt.x + self.x
            if rot == 90:
                left = height - pnt.y - self.y
                top = -pnt.x - self.x
            elif rot == 270:
                left = -height + pnt.y + self.y
                top = pnt.x + self.x
            elif rot == 180:
                left = -pnt.x - self.x
                top = -height + pnt.y + self.y

            nres += templ(left, top, fname, fontsize)

            if render_mode > 0:
                nres += "%i Tr " % render_mode
                nres += _format_g(border_width * fontsize) + " w "
                if miter_limit is not None:
                    nres += _format_g(miter_limit) + " M "

            if align == 3:
                nres += _format_g(spacing) + " Tw "

            if color is not None:
                nres += color_str
            if fill is not None:
                nres += fill_str
            nres += "%sTJ\n" % pymupdf.getTJstr(t, tj_glyphs, simple, ordering)

        nres += "ET\n%sQ\n" % emc

        # add the text object to the shape and extend the covered area
        self.text_cont += nres
        self.updateRect(rect)
        return more
  3572. def finish(
  3573. self,
  3574. width: float = 1,
  3575. color: OptSeq = (0,),
  3576. fill: OptSeq = None,
  3577. lineCap: int = 0,
  3578. lineJoin: int = 0,
  3579. dashes: OptStr = None,
  3580. even_odd: bool = False,
  3581. morph: OptSeq = None,
  3582. closePath: bool = True,
  3583. fill_opacity: float = 1,
  3584. stroke_opacity: float = 1,
  3585. oc: int = 0,
  3586. ) -> None:
  3587. """Finish the current drawing segment.
  3588. Notes:
  3589. Apply colors, opacity, dashes, line style and width, or
  3590. morphing. Also whether to close the path
  3591. by connecting last to first point.
  3592. """
  3593. if self.draw_cont == "": # treat empty contents as no-op
  3594. return
  3595. if width == 0: # border color makes no sense then
  3596. color = None
  3597. elif color is None: # vice versa
  3598. width = 0
  3599. # if color == None and fill == None:
  3600. # raise ValueError("at least one of 'color' or 'fill' must be given")
  3601. color_str = pymupdf.ColorCode(color, "c") # ensure proper color string
  3602. fill_str = pymupdf.ColorCode(fill, "f") # ensure proper fill string
  3603. optcont = self.page._get_optional_content(oc)
  3604. if optcont is not None:
  3605. self.draw_cont = "/OC /%s BDC\n" % optcont + self.draw_cont
  3606. emc = "EMC\n"
  3607. else:
  3608. emc = ""
  3609. alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
  3610. if alpha is not None:
  3611. self.draw_cont = "/%s gs\n" % alpha + self.draw_cont
  3612. if width != 1 and width != 0:
  3613. self.draw_cont += _format_g(width) + " w\n"
  3614. if lineCap != 0:
  3615. self.draw_cont = "%i J\n" % lineCap + self.draw_cont
  3616. if lineJoin != 0:
  3617. self.draw_cont = "%i j\n" % lineJoin + self.draw_cont
  3618. if dashes not in (None, "", "[] 0"):
  3619. self.draw_cont = "%s d\n" % dashes + self.draw_cont
  3620. if closePath:
  3621. self.draw_cont += "h\n"
  3622. self.last_point = None
  3623. if color is not None:
  3624. self.draw_cont += color_str
  3625. if fill is not None:
  3626. self.draw_cont += fill_str
  3627. if color is not None:
  3628. if not even_odd:
  3629. self.draw_cont += "B\n"
  3630. else:
  3631. self.draw_cont += "B*\n"
  3632. else:
  3633. if not even_odd:
  3634. self.draw_cont += "f\n"
  3635. else:
  3636. self.draw_cont += "f*\n"
  3637. else:
  3638. self.draw_cont += "S\n"
  3639. self.draw_cont += emc
  3640. if pymupdf.CheckMorph(morph):
  3641. m1 = pymupdf.Matrix(
  3642. 1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
  3643. )
  3644. mat = ~m1 * morph[1] * m1
  3645. self.draw_cont = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n" + self.draw_cont
  3646. self.totalcont += "\nq\n" + self.draw_cont + "Q\n"
  3647. self.draw_cont = ""
  3648. self.last_point = None
  3649. return
  3650. def commit(self, overlay: bool = True) -> None:
  3651. """Update the page's /Contents object with Shape data.
  3652. The argument controls whether data appear in foreground (default)
  3653. or background.
  3654. """
  3655. pymupdf.CheckParent(self.page) # doc may have died meanwhile
  3656. self.totalcont += self.text_cont
  3657. self.totalcont = self.totalcont.encode()
  3658. if self.totalcont:
  3659. if overlay:
  3660. self.page.wrap_contents() # ensure a balanced graphics state
  3661. # make /Contents object with dummy stream
  3662. xref = pymupdf.TOOLS._insert_contents(self.page, b" ", overlay)
  3663. # update it with potential compression
  3664. self.doc.update_stream(xref, self.totalcont)
  3665. self.last_point = None # clean up ...
  3666. self.rect = None #
  3667. self.draw_cont = "" # for potential ...
  3668. self.text_cont = "" # ...
  3669. self.totalcont = "" # re-use
  3670. def apply_redactions(
  3671. page: pymupdf.Page, images: int = 2, graphics: int = 1, text: int = 0
  3672. ) -> bool:
  3673. """Apply the redaction annotations of the page.
  3674. Args:
  3675. page: the PDF page.
  3676. images:
  3677. 0 - ignore images
  3678. 1 - remove all overlapping images
  3679. 2 - blank out overlapping image parts
  3680. 3 - remove image unless invisible
  3681. graphics:
  3682. 0 - ignore graphics
  3683. 1 - remove graphics if contained in rectangle
  3684. 2 - remove all overlapping graphics
  3685. text:
  3686. 0 - remove text
  3687. 1 - ignore text
  3688. """
  3689. def center_rect(annot_rect, new_text, font, fsize):
  3690. """Calculate minimal sub-rectangle for the overlay text.
  3691. Notes:
  3692. Because 'insert_textbox' supports no vertical text centering,
  3693. we calculate an approximate number of lines here and return a
  3694. sub-rect with smaller height, which should still be sufficient.
  3695. Args:
  3696. annot_rect: the annotation rectangle
  3697. new_text: the text to insert.
  3698. font: the fontname. Must be one of the CJK or Base-14 set, else
  3699. the rectangle is returned unchanged.
  3700. fsize: the fontsize
  3701. Returns:
  3702. A rectangle to use instead of the annot rectangle.
  3703. """
  3704. if not new_text or annot_rect.width <= pymupdf.EPSILON:
  3705. return annot_rect
  3706. try:
  3707. text_width = pymupdf.get_text_length(new_text, font, fsize)
  3708. except (ValueError, mupdf.FzErrorBase): # unsupported font
  3709. if g_exceptions_verbose:
  3710. pymupdf.exception_info()
  3711. return annot_rect
  3712. line_height = fsize * 1.2
  3713. limit = annot_rect.width
  3714. h = math.ceil(text_width / limit) * line_height # estimate rect height
  3715. if h >= annot_rect.height:
  3716. return annot_rect
  3717. r = annot_rect
  3718. y = (annot_rect.tl.y + annot_rect.bl.y - h) * 0.5
  3719. r.y0 = y
  3720. return r
  3721. pymupdf.CheckParent(page)
  3722. doc = page.parent
  3723. if doc.is_encrypted or doc.is_closed:
  3724. raise ValueError("document closed or encrypted")
  3725. if not doc.is_pdf:
  3726. raise ValueError("is no PDF")
  3727. redact_annots = [] # storage of annot values
  3728. for annot in page.annots(
  3729. types=(pymupdf.PDF_ANNOT_REDACT,) # pylint: disable=no-member
  3730. ):
  3731. # loop redactions
  3732. redact_annots.append(annot._get_redact_values()) # save annot values
  3733. if redact_annots == []: # any redactions on this page?
  3734. return False # no redactions
  3735. rc = page._apply_redactions(text, images, graphics) # call MuPDF
  3736. if not rc: # should not happen really
  3737. raise ValueError("Error applying redactions.")
  3738. # now write replacement text in old redact rectangles
  3739. shape = page.new_shape()
  3740. for redact in redact_annots:
  3741. annot_rect = redact["rect"]
  3742. fill = redact["fill"]
  3743. if fill:
  3744. shape.draw_rect(annot_rect) # colorize the rect background
  3745. shape.finish(fill=fill, color=fill)
  3746. if "text" in redact.keys(): # if we also have text
  3747. new_text = redact["text"]
  3748. align = redact.get("align", 0)
  3749. fname = redact["fontname"]
  3750. fsize = redact["fontsize"]
  3751. color = redact["text_color"]
  3752. # try finding vertical centered sub-rect
  3753. trect = center_rect(annot_rect, new_text, fname, fsize)
  3754. rc = -1
  3755. while rc < 0 and fsize >= 4: # while not enough room
  3756. # (re-) try insertion
  3757. rc = shape.insert_textbox(
  3758. trect,
  3759. new_text,
  3760. fontname=fname,
  3761. fontsize=fsize,
  3762. color=color,
  3763. align=align,
  3764. )
  3765. fsize -= 0.5 # reduce font if unsuccessful
  3766. shape.commit() # append new contents object
  3767. return True
  3768. # ------------------------------------------------------------------------------
  3769. # Remove potentially sensitive data from a PDF. Similar to the Adobe
  3770. # Acrobat 'sanitize' function
  3771. # ------------------------------------------------------------------------------
  3772. def scrub(
  3773. doc: pymupdf.Document,
  3774. attached_files: bool = True,
  3775. clean_pages: bool = True,
  3776. embedded_files: bool = True,
  3777. hidden_text: bool = True,
  3778. javascript: bool = True,
  3779. metadata: bool = True,
  3780. redactions: bool = True,
  3781. redact_images: int = 0,
  3782. remove_links: bool = True,
  3783. reset_fields: bool = True,
  3784. reset_responses: bool = True,
  3785. thumbnails: bool = True,
  3786. xml_metadata: bool = True,
  3787. ) -> None:
  3788. def remove_hidden(cont_lines):
  3789. """Remove hidden text from a PDF page.
  3790. Args:
  3791. cont_lines: list of lines with /Contents content. Should have status
  3792. from after page.cleanContents().
  3793. Returns:
  3794. List of /Contents lines from which hidden text has been removed.
  3795. Notes:
  3796. The input must have been created after the page's /Contents object(s)
  3797. have been cleaned with page.cleanContents(). This ensures a standard
  3798. formatting: one command per line, single spaces between operators.
  3799. This allows for drastic simplification of this code.
  3800. """
  3801. out_lines = [] # will return this
  3802. in_text = False # indicate if within BT/ET object
  3803. suppress = False # indicate text suppression active
  3804. make_return = False
  3805. for line in cont_lines:
  3806. if line == b"BT": # start of text object
  3807. in_text = True # switch on
  3808. out_lines.append(line) # output it
  3809. continue
  3810. if line == b"ET": # end of text object
  3811. in_text = False # switch off
  3812. out_lines.append(line) # output it
  3813. continue
  3814. if line == b"3 Tr": # text suppression operator
  3815. suppress = True # switch on
  3816. make_return = True
  3817. continue
  3818. if line[-2:] == b"Tr" and line[0] != b"3":
  3819. suppress = False # text rendering changed
  3820. out_lines.append(line)
  3821. continue
  3822. if line == b"Q": # unstack command also switches off
  3823. suppress = False
  3824. out_lines.append(line)
  3825. continue
  3826. if suppress and in_text: # suppress hidden lines
  3827. continue
  3828. out_lines.append(line)
  3829. if make_return:
  3830. return out_lines
  3831. else:
  3832. return None
  3833. if not doc.is_pdf: # only works for PDF
  3834. raise ValueError("is no PDF")
  3835. if doc.is_encrypted or doc.is_closed:
  3836. raise ValueError("closed or encrypted doc")
  3837. if not clean_pages:
  3838. hidden_text = False
  3839. redactions = False
  3840. if metadata:
  3841. doc.set_metadata({}) # remove standard metadata
  3842. for page in doc:
  3843. if reset_fields:
  3844. # reset form fields (widgets)
  3845. for widget in page.widgets():
  3846. widget.reset()
  3847. if remove_links:
  3848. links = page.get_links() # list of all links on page
  3849. for link in links: # remove all links
  3850. page.delete_link(link)
  3851. found_redacts = False
  3852. for annot in page.annots():
  3853. if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files:
  3854. annot.update_file(buffer_=b" ") # set file content to empty
  3855. if reset_responses:
  3856. annot.delete_responses()
  3857. if annot.type[0] == pymupdf.PDF_ANNOT_REDACT: # pylint: disable=no-member
  3858. found_redacts = True
  3859. if redactions and found_redacts:
  3860. page.apply_redactions(images=redact_images)
  3861. if not (clean_pages or hidden_text):
  3862. continue # done with the page
  3863. page.clean_contents()
  3864. if not page.get_contents():
  3865. continue
  3866. if hidden_text:
  3867. xref = page.get_contents()[0] # only one b/o cleaning!
  3868. cont = doc.xref_stream(xref)
  3869. cont_lines = remove_hidden(cont.splitlines()) # remove hidden text
  3870. if cont_lines: # something was actually removed
  3871. cont = b"\n".join(cont_lines)
  3872. doc.update_stream(xref, cont) # rewrite the page /Contents
  3873. if thumbnails: # remove page thumbnails?
  3874. if doc.xref_get_key(page.xref, "Thumb")[0] != "null":
  3875. doc.xref_set_key(page.xref, "Thumb", "null")
  3876. # pages are scrubbed, now perform document-wide scrubbing
  3877. # remove embedded files
  3878. if embedded_files:
  3879. for name in doc.embfile_names():
  3880. doc.embfile_del(name)
  3881. if xml_metadata:
  3882. doc.del_xml_metadata()
  3883. if not (xml_metadata or javascript):
  3884. xref_limit = 0
  3885. else:
  3886. xref_limit = doc.xref_length()
  3887. for xref in range(1, xref_limit):
  3888. if not doc.xref_object(xref):
  3889. msg = "bad xref %i - clean PDF before scrubbing" % xref
  3890. raise ValueError(msg)
  3891. if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript":
  3892. # a /JavaScript action object
  3893. obj = "<</S/JavaScript/JS()>>" # replace with a null JavaScript
  3894. doc.update_object(xref, obj) # update this object
  3895. continue # no further handling
  3896. if not xml_metadata:
  3897. continue
  3898. if doc.xref_get_key(xref, "Type")[1] == "/Metadata":
  3899. # delete any metadata object directly
  3900. doc.update_object(xref, "<<>>")
  3901. doc.update_stream(xref, b"deleted", new=True)
  3902. continue
  3903. if doc.xref_get_key(xref, "Metadata")[0] != "null":
  3904. doc.xref_set_key(xref, "Metadata", "null")
  3905. def _show_fz_text( text):
  3906. #if mupdf_cppyy:
  3907. # assert isinstance( text, cppyy.gbl.mupdf.Text)
  3908. #else:
  3909. # assert isinstance( text, mupdf.Text)
  3910. num_spans = 0
  3911. num_chars = 0
  3912. span = text.m_internal.head
  3913. while 1:
  3914. if not span:
  3915. break
  3916. num_spans += 1
  3917. num_chars += span.len
  3918. span = span.next
  3919. return f'num_spans={num_spans} num_chars={num_chars}'
  3920. def fill_textbox(
  3921. writer: pymupdf.TextWriter,
  3922. rect: rect_like,
  3923. text: typing.Union[str, list],
  3924. pos: point_like = None,
  3925. font: typing.Optional[pymupdf.Font] = None,
  3926. fontsize: float = 11,
  3927. lineheight: OptFloat = None,
  3928. align: int = 0,
  3929. warn: bool = None,
  3930. right_to_left: bool = False,
  3931. small_caps: bool = False,
  3932. ) -> tuple:
  3933. """Fill a rectangle with text.
  3934. Args:
  3935. writer: pymupdf.TextWriter object (= "self")
  3936. rect: rect-like to receive the text.
  3937. text: string or list/tuple of strings.
  3938. pos: point-like start position of first word.
  3939. font: pymupdf.Font object (default pymupdf.Font('helv')).
  3940. fontsize: the fontsize.
  3941. lineheight: overwrite the font property
  3942. align: (int) 0 = left, 1 = center, 2 = right, 3 = justify
  3943. warn: (bool) text overflow action: none, warn, or exception
  3944. right_to_left: (bool) indicate right-to-left language.
  3945. """
  3946. rect = pymupdf.Rect(rect)
  3947. if rect.is_empty:
  3948. raise ValueError("fill rect must not empty.")
  3949. if type(font) is not pymupdf.Font:
  3950. font = pymupdf.Font("helv")
  3951. def textlen(x):
  3952. """Return length of a string."""
  3953. return font.text_length(
  3954. x, fontsize=fontsize, small_caps=small_caps
  3955. ) # abbreviation
  3956. def char_lengths(x):
  3957. """Return list of single character lengths for a string."""
  3958. return font.char_lengths(x, fontsize=fontsize, small_caps=small_caps)
  3959. def append_this(pos, text):
  3960. ret = writer.append(
  3961. pos, text, font=font, fontsize=fontsize, small_caps=small_caps
  3962. )
  3963. return ret
  3964. tolerance = fontsize * 0.2 # extra distance to left border
  3965. space_len = textlen(" ")
  3966. std_width = rect.width - tolerance
  3967. std_start = rect.x0 + tolerance
  3968. def norm_words(width, words):
  3969. """Cut any word in pieces no longer than 'width'."""
  3970. nwords = []
  3971. word_lengths = []
  3972. for w in words:
  3973. wl_lst = char_lengths(w)
  3974. wl = sum(wl_lst)
  3975. if wl <= width: # nothing to do - copy over
  3976. nwords.append(w)
  3977. word_lengths.append(wl)
  3978. continue
  3979. # word longer than rect width - split it in parts
  3980. n = len(wl_lst)
  3981. while n > 0:
  3982. wl = sum(wl_lst[:n])
  3983. if wl <= width:
  3984. nwords.append(w[:n])
  3985. word_lengths.append(wl)
  3986. w = w[n:]
  3987. wl_lst = wl_lst[n:]
  3988. n = len(wl_lst)
  3989. else:
  3990. n -= 1
  3991. return nwords, word_lengths
  3992. def output_justify(start, line):
  3993. """Justified output of a line."""
  3994. # ignore leading / trailing / multiple spaces
  3995. words = [w for w in line.split(" ") if w != ""]
  3996. nwords = len(words)
  3997. if nwords == 0:
  3998. return
  3999. if nwords == 1: # single word cannot be justified
  4000. append_this(start, words[0])
  4001. return
  4002. tl = sum([textlen(w) for w in words]) # total word lengths
  4003. gaps = nwords - 1 # number of word gaps
  4004. gapl = (std_width - tl) / gaps # width of each gap
  4005. for w in words:
  4006. _, lp = append_this(start, w) # output one word
  4007. start.x = lp.x + gapl # next start at word end plus gap
  4008. return
  4009. asc = font.ascender
  4010. dsc = font.descender
  4011. if not lineheight:
  4012. if asc - dsc <= 1:
  4013. lheight = 1.2
  4014. else:
  4015. lheight = asc - dsc
  4016. else:
  4017. lheight = lineheight
  4018. LINEHEIGHT = fontsize * lheight # effective line height
  4019. width = std_width # available horizontal space
  4020. # starting point of text
  4021. if pos is not None:
  4022. pos = pymupdf.Point(pos)
  4023. else: # default is just below rect top-left
  4024. pos = rect.tl + (tolerance, fontsize * asc)
  4025. if pos not in rect:
  4026. raise ValueError("Text must start in rectangle.")
  4027. # calculate displacement factor for alignment
  4028. if align == pymupdf.TEXT_ALIGN_CENTER:
  4029. factor = 0.5
  4030. elif align == pymupdf.TEXT_ALIGN_RIGHT:
  4031. factor = 1.0
  4032. else:
  4033. factor = 0
  4034. # split in lines if just a string was given
  4035. if type(text) is str:
  4036. textlines = text.splitlines()
  4037. else:
  4038. textlines = []
  4039. for line in text:
  4040. textlines.extend(line.splitlines())
  4041. max_lines = int((rect.y1 - pos.y) / LINEHEIGHT) + 1
  4042. new_lines = [] # the final list of textbox lines
  4043. no_justify = [] # no justify for these line numbers
  4044. for i, line in enumerate(textlines):
  4045. if line in ("", " "):
  4046. new_lines.append((line, space_len))
  4047. width = rect.width - tolerance
  4048. no_justify.append((len(new_lines) - 1))
  4049. continue
  4050. if i == 0:
  4051. width = rect.x1 - pos.x
  4052. else:
  4053. width = rect.width - tolerance
  4054. if right_to_left: # reverses Arabic / Hebrew text front to back
  4055. line = writer.clean_rtl(line)
  4056. tl = textlen(line)
  4057. if tl <= width: # line short enough
  4058. new_lines.append((line, tl))
  4059. no_justify.append((len(new_lines) - 1))
  4060. continue
  4061. # we need to split the line in fitting parts
  4062. words = line.split(" ") # the words in the line
  4063. # cut in parts any words that are longer than rect width
  4064. words, word_lengths = norm_words(width, words)
  4065. n = len(words)
  4066. while True:
  4067. line0 = " ".join(words[:n])
  4068. wl = sum(word_lengths[:n]) + space_len * (n - 1)
  4069. if wl <= width:
  4070. new_lines.append((line0, wl))
  4071. words = words[n:]
  4072. word_lengths = word_lengths[n:]
  4073. n = len(words)
  4074. line0 = None
  4075. else:
  4076. n -= 1
  4077. if len(words) == 0:
  4078. break
  4079. assert n
  4080. # -------------------------------------------------------------------------
  4081. # List of lines created. Each item is (text, tl), where 'tl' is the PDF
  4082. # output length (float) and 'text' is the text. Except for justified text,
  4083. # this is output-ready.
  4084. # -------------------------------------------------------------------------
  4085. nlines = len(new_lines)
  4086. if nlines > max_lines:
  4087. msg = "Only fitting %i of %i lines." % (max_lines, nlines)
  4088. if warn is None:
  4089. pass
  4090. elif warn:
  4091. pymupdf.message("Warning: " + msg)
  4092. else:
  4093. raise ValueError(msg)
  4094. start = pymupdf.Point()
  4095. no_justify += [len(new_lines) - 1] # no justifying of last line
  4096. for i in range(max_lines):
  4097. try:
  4098. line, tl = new_lines.pop(0)
  4099. except IndexError:
  4100. if g_exceptions_verbose >= 2: pymupdf.exception_info()
  4101. break
  4102. if right_to_left: # Arabic, Hebrew
  4103. line = "".join(reversed(line))
  4104. if i == 0: # may have different start for first line
  4105. start = pos
  4106. if align == pymupdf.TEXT_ALIGN_JUSTIFY and i not in no_justify and tl < std_width:
  4107. output_justify(start, line)
  4108. start.x = std_start
  4109. start.y += LINEHEIGHT
  4110. continue
  4111. if i > 0 or pos.x == std_start: # left, center, right alignments
  4112. start.x += (width - tl) * factor
  4113. append_this(start, line)
  4114. start.x = std_start
  4115. start.y += LINEHEIGHT
  4116. return new_lines # return non-written lines
  4117. # ------------------------------------------------------------------------
  4118. # Optional Content functions
  4119. # ------------------------------------------------------------------------
  4120. def get_oc(doc: pymupdf.Document, xref: int) -> int:
  4121. """Return optional content object xref for an image or form xobject.
  4122. Args:
  4123. xref: (int) xref number of an image or form xobject.
  4124. """
  4125. if doc.is_closed or doc.is_encrypted:
  4126. raise ValueError("document close or encrypted")
  4127. t, name = doc.xref_get_key(xref, "Subtype")
  4128. if t != "name" or name not in ("/Image", "/Form"):
  4129. raise ValueError("bad object type at xref %i" % xref)
  4130. t, oc = doc.xref_get_key(xref, "OC")
  4131. if t != "xref":
  4132. return 0
  4133. rc = int(oc.replace("0 R", ""))
  4134. return rc
  4135. def set_oc(doc: pymupdf.Document, xref: int, oc: int) -> None:
  4136. """Attach optional content object to image or form xobject.
  4137. Args:
  4138. xref: (int) xref number of an image or form xobject
  4139. oc: (int) xref number of an OCG or OCMD
  4140. """
  4141. if doc.is_closed or doc.is_encrypted:
  4142. raise ValueError("document close or encrypted")
  4143. t, name = doc.xref_get_key(xref, "Subtype")
  4144. if t != "name" or name not in ("/Image", "/Form"):
  4145. raise ValueError("bad object type at xref %i" % xref)
  4146. if oc > 0:
  4147. t, name = doc.xref_get_key(oc, "Type")
  4148. if t != "name" or name not in ("/OCG", "/OCMD"):
  4149. raise ValueError("bad object type at xref %i" % oc)
  4150. if oc == 0 and "OC" in doc.xref_get_keys(xref):
  4151. doc.xref_set_key(xref, "OC", "null")
  4152. return None
  4153. doc.xref_set_key(xref, "OC", "%i 0 R" % oc)
  4154. return None
  4155. def set_ocmd(
  4156. doc: pymupdf.Document,
  4157. xref: int = 0,
  4158. ocgs: typing.Union[list, None] = None,
  4159. policy: OptStr = None,
  4160. ve: typing.Union[list, None] = None,
  4161. ) -> int:
  4162. """Create or update an OCMD object in a PDF document.
  4163. Args:
  4164. xref: (int) 0 for creating a new object, otherwise update existing one.
  4165. ocgs: (list) OCG xref numbers, which shall be subject to 'policy'.
  4166. policy: one of 'AllOn', 'AllOff', 'AnyOn', 'AnyOff' (any casing).
  4167. ve: (list) visibility expression. Use instead of 'ocgs' with 'policy'.
  4168. Returns:
  4169. Xref of the created or updated OCMD.
  4170. """
  4171. all_ocgs = set(doc.get_ocgs().keys())
  4172. def ve_maker(ve):
  4173. if type(ve) not in (list, tuple) or len(ve) < 2:
  4174. raise ValueError("bad 've' format: %s" % ve)
  4175. if ve[0].lower() not in ("and", "or", "not"):
  4176. raise ValueError("bad operand: %s" % ve[0])
  4177. if ve[0].lower() == "not" and len(ve) != 2:
  4178. raise ValueError("bad 've' format: %s" % ve)
  4179. item = "[/%s" % ve[0].title()
  4180. for x in ve[1:]:
  4181. if type(x) is int:
  4182. if x not in all_ocgs:
  4183. raise ValueError("bad OCG %i" % x)
  4184. item += " %i 0 R" % x
  4185. else:
  4186. item += " %s" % ve_maker(x)
  4187. item += "]"
  4188. return item
  4189. text = "<</Type/OCMD"
  4190. if ocgs and type(ocgs) in (list, tuple): # some OCGs are provided
  4191. s = set(ocgs).difference(all_ocgs) # contains illegal xrefs
  4192. if s != set():
  4193. msg = "bad OCGs: %s" % s
  4194. raise ValueError(msg)
  4195. text += "/OCGs[" + " ".join(map(lambda x: "%i 0 R" % x, ocgs)) + "]"
  4196. if policy:
  4197. policy = str(policy).lower()
  4198. pols = {
  4199. "anyon": "AnyOn",
  4200. "allon": "AllOn",
  4201. "anyoff": "AnyOff",
  4202. "alloff": "AllOff",
  4203. }
  4204. if policy not in ("anyon", "allon", "anyoff", "alloff"):
  4205. raise ValueError("bad policy: %s" % policy)
  4206. text += "/P/%s" % pols[policy]
  4207. if ve:
  4208. text += "/VE%s" % ve_maker(ve)
  4209. text += ">>"
  4210. # make new object or replace old OCMD (check type first)
  4211. if xref == 0:
  4212. xref = doc.get_new_xref()
  4213. elif "/Type/OCMD" not in doc.xref_object(xref, compressed=True):
  4214. raise ValueError("bad xref or not an OCMD")
  4215. doc.update_object(xref, text)
  4216. return xref
  4217. def get_ocmd(doc: pymupdf.Document, xref: int) -> dict:
  4218. """Return the definition of an OCMD (optional content membership dictionary).
  4219. Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy string) and
  4220. /VE (visibility expression, PDF array). Via string manipulation, this
  4221. info is converted to a Python dictionary with keys "xref", "ocgs", "policy"
  4222. and "ve" - ready to recycle as input for 'set_ocmd()'.
  4223. """
  4224. if xref not in range(doc.xref_length()):
  4225. raise ValueError("bad xref")
  4226. text = doc.xref_object(xref, compressed=True)
  4227. if "/Type/OCMD" not in text:
  4228. raise ValueError("bad object type")
  4229. textlen = len(text)
  4230. p0 = text.find("/OCGs[") # look for /OCGs key
  4231. p1 = text.find("]", p0)
  4232. if p0 < 0 or p1 < 0: # no OCGs found
  4233. ocgs = None
  4234. else:
  4235. ocgs = text[p0 + 6 : p1].replace("0 R", " ").split()
  4236. ocgs = list(map(int, ocgs))
  4237. p0 = text.find("/P/") # look for /P policy key
  4238. if p0 < 0:
  4239. policy = None
  4240. else:
  4241. p1 = text.find("ff", p0)
  4242. if p1 < 0:
  4243. p1 = text.find("on", p0)
  4244. if p1 < 0: # some irregular syntax
  4245. raise ValueError("bad object at xref")
  4246. else:
  4247. policy = text[p0 + 3 : p1 + 2]
  4248. p0 = text.find("/VE[") # look for /VE visibility expression key
  4249. if p0 < 0: # no visibility expression found
  4250. ve = None
  4251. else:
  4252. lp = rp = 0 # find end of /VE by finding last ']'.
  4253. p1 = p0
  4254. while lp < 1 or lp != rp:
  4255. p1 += 1
  4256. if not p1 < textlen: # some irregular syntax
  4257. raise ValueError("bad object at xref")
  4258. if text[p1] == "[":
  4259. lp += 1
  4260. if text[p1] == "]":
  4261. rp += 1
  4262. # p1 now positioned at the last "]"
  4263. ve = text[p0 + 3 : p1 + 1] # the PDF /VE array
  4264. ve = (
  4265. ve.replace("/And", '"and",')
  4266. .replace("/Not", '"not",')
  4267. .replace("/Or", '"or",')
  4268. )
  4269. ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[")
  4270. import json
  4271. try:
  4272. ve = json.loads(ve)
  4273. except Exception:
  4274. pymupdf.exception_info()
  4275. pymupdf.message(f"bad /VE key: {ve!r}")
  4276. raise
  4277. return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve}
  4278. """
  4279. Handle page labels for PDF documents.
  4280. Reading
  4281. -------
  4282. * compute the label of a page
  4283. * find page number(s) having the given label.
  4284. Writing
  4285. -------
  4286. Supports setting (defining) page labels for PDF documents.
  4287. A big Thank You goes to WILLIAM CHAPMAN who contributed the idea and
  4288. significant parts of the following code during late December 2020
  4289. through early January 2021.
  4290. """
  4291. def rule_dict(item):
  4292. """Make a Python dict from a PDF page label rule.
  4293. Args:
  4294. item -- a tuple (pno, rule) with the start page number and the rule
  4295. string like <</S/D...>>.
  4296. Returns:
  4297. A dict like
  4298. {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
  4299. """
  4300. # Jorj McKie, 2021-01-06
  4301. pno, rule = item
  4302. rule = rule[2:-2].split("/")[1:] # strip "<<" and ">>"
  4303. d = {"startpage": pno, "prefix": "", "firstpagenum": 1}
  4304. skip = False
  4305. for i, item in enumerate(rule): # pylint: disable=redefined-argument-from-local
  4306. if skip: # this item has already been processed
  4307. skip = False # deactivate skipping again
  4308. continue
  4309. if item == "S": # style specification
  4310. d["style"] = rule[i + 1] # next item has the style
  4311. skip = True # do not process next item again
  4312. continue
  4313. if item.startswith("P"): # prefix specification: extract the string
  4314. x = item[1:].replace("(", "").replace(")", "")
  4315. d["prefix"] = x
  4316. continue
  4317. if item.startswith("St"): # start page number specification
  4318. x = int(item[2:])
  4319. d["firstpagenum"] = x
  4320. return d
  4321. def get_label_pno(pgNo, labels):
  4322. """Return the label for this page number.
  4323. Args:
  4324. pgNo: page number, 0-based.
  4325. labels: result of doc._get_page_labels().
  4326. Returns:
  4327. The label (str) of the page number. Errors return an empty string.
  4328. """
  4329. # Jorj McKie, 2021-01-06
  4330. item = [x for x in labels if x[0] <= pgNo][-1]
  4331. rule = rule_dict(item)
  4332. prefix = rule.get("prefix", "")
  4333. style = rule.get("style", "")
  4334. # make sure we start at 0 when enumerating the alphabet
  4335. delta = -1 if style in ("a", "A") else 0
  4336. pagenumber = pgNo - rule["startpage"] + rule["firstpagenum"] + delta
  4337. return construct_label(style, prefix, pagenumber)
  4338. def get_label(page):
  4339. """Return the label for this PDF page.
  4340. Args:
  4341. page: page object.
  4342. Returns:
  4343. The label (str) of the page. Errors return an empty string.
  4344. """
  4345. # Jorj McKie, 2021-01-06
  4346. labels = page.parent._get_page_labels()
  4347. if not labels:
  4348. return ""
  4349. labels.sort()
  4350. return get_label_pno(page.number, labels)
  4351. def get_page_numbers(doc, label, only_one=False):
  4352. """Return a list of page numbers with the given label.
  4353. Args:
  4354. doc: PDF document object (resp. 'self').
  4355. label: (str) label.
  4356. only_one: (bool) stop searching after first hit.
  4357. Returns:
  4358. List of page numbers having this label.
  4359. """
  4360. # Jorj McKie, 2021-01-06
  4361. numbers = []
  4362. if not label:
  4363. return numbers
  4364. labels = doc._get_page_labels()
  4365. if labels == []:
  4366. return numbers
  4367. for i in range(doc.page_count):
  4368. plabel = get_label_pno(i, labels)
  4369. if plabel == label:
  4370. numbers.append(i)
  4371. if only_one:
  4372. break
  4373. return numbers
  4374. def construct_label(style, prefix, pno) -> str:
  4375. """Construct a label based on style, prefix and page number."""
  4376. # William Chapman, 2021-01-06
  4377. n_str = ""
  4378. if style == "D":
  4379. n_str = str(pno)
  4380. elif style == "r":
  4381. n_str = integerToRoman(pno).lower()
  4382. elif style == "R":
  4383. n_str = integerToRoman(pno).upper()
  4384. elif style == "a":
  4385. n_str = integerToLetter(pno).lower()
  4386. elif style == "A":
  4387. n_str = integerToLetter(pno).upper()
  4388. result = prefix + n_str
  4389. return result
  4390. def integerToLetter(i) -> str:
  4391. """Returns letter sequence string for integer i."""
  4392. # William Chapman, Jorj McKie, 2021-01-06
  4393. import string
  4394. ls = string.ascii_uppercase
  4395. n, a = 1, i
  4396. while pow(26, n) <= a:
  4397. a -= int(math.pow(26, n))
  4398. n += 1
  4399. str_t = ""
  4400. for j in reversed(range(n)):
  4401. f, g = divmod(a, int(math.pow(26, j)))
  4402. str_t += ls[f]
  4403. a = g
  4404. return str_t
  4405. def integerToRoman(num: int) -> str:
  4406. """Return roman numeral for an integer."""
  4407. # William Chapman, Jorj McKie, 2021-01-06
  4408. roman = (
  4409. (1000, "M"),
  4410. (900, "CM"),
  4411. (500, "D"),
  4412. (400, "CD"),
  4413. (100, "C"),
  4414. (90, "XC"),
  4415. (50, "L"),
  4416. (40, "XL"),
  4417. (10, "X"),
  4418. (9, "IX"),
  4419. (5, "V"),
  4420. (4, "IV"),
  4421. (1, "I"),
  4422. )
  4423. def roman_num(num):
  4424. for r, ltr in roman:
  4425. x, _ = divmod(num, r)
  4426. yield ltr * x
  4427. num -= r * x
  4428. if num <= 0:
  4429. break
  4430. return "".join([a for a in roman_num(num)])
  4431. def get_page_labels(doc):
  4432. """Return page label definitions in PDF document.
  4433. Args:
  4434. doc: PDF document (resp. 'self').
  4435. Returns:
  4436. A list of dictionaries with the following format:
  4437. {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
  4438. """
  4439. # Jorj McKie, 2021-01-10
  4440. return [rule_dict(item) for item in doc._get_page_labels()]
  4441. def set_page_labels(doc, labels):
  4442. """Add / replace page label definitions in PDF document.
  4443. Args:
  4444. doc: PDF document (resp. 'self').
  4445. labels: list of label dictionaries like:
  4446. {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int},
  4447. as returned by get_page_labels().
  4448. """
  4449. # William Chapman, 2021-01-06
  4450. def create_label_str(label):
  4451. """Convert Python label dict to corresponding PDF rule string.
  4452. Args:
  4453. label: (dict) build rule for the label.
  4454. Returns:
  4455. PDF label rule string wrapped in "<<", ">>".
  4456. """
  4457. s = "%i<<" % label["startpage"]
  4458. if label.get("prefix", "") != "":
  4459. s += "/P(%s)" % label["prefix"]
  4460. if label.get("style", "") != "":
  4461. s += "/S/%s" % label["style"]
  4462. if label.get("firstpagenum", 1) > 1:
  4463. s += "/St %i" % label["firstpagenum"]
  4464. s += ">>"
  4465. return s
  4466. def create_nums(labels):
  4467. """Return concatenated string of all labels rules.
  4468. Args:
  4469. labels: (list) dictionaries as created by function 'rule_dict'.
  4470. Returns:
  4471. PDF compatible string for page label definitions, ready to be
  4472. enclosed in PDF array 'Nums[...]'.
  4473. """
  4474. labels.sort(key=lambda x: x["startpage"])
  4475. s = "".join([create_label_str(label) for label in labels])
  4476. return s
  4477. doc._set_page_labels(create_nums(labels))
  4478. # End of Page Label Code -------------------------------------------------
  4479. def has_links(doc: pymupdf.Document) -> bool:
  4480. """Check whether there are links on any page."""
  4481. if doc.is_closed:
  4482. raise ValueError("document closed")
  4483. if not doc.is_pdf:
  4484. raise ValueError("is no PDF")
  4485. for i in range(doc.page_count):
  4486. for item in doc.page_annot_xrefs(i):
  4487. if item[1] == pymupdf.PDF_ANNOT_LINK: # pylint: disable=no-member
  4488. return True
  4489. return False
  4490. def has_annots(doc: pymupdf.Document) -> bool:
  4491. """Check whether there are annotations on any page."""
  4492. if doc.is_closed:
  4493. raise ValueError("document closed")
  4494. if not doc.is_pdf:
  4495. raise ValueError("is no PDF")
  4496. for i in range(doc.page_count):
  4497. for item in doc.page_annot_xrefs(i):
  4498. # pylint: disable=no-member
  4499. if not (item[1] == pymupdf.PDF_ANNOT_LINK or item[1] == pymupdf.PDF_ANNOT_WIDGET): # pylint: disable=no-member
  4500. return True
  4501. return False
  4502. # -------------------------------------------------------------------
  4503. # Functions to recover the quad contained in a text extraction bbox
  4504. # -------------------------------------------------------------------
  4505. def recover_bbox_quad(line_dir: tuple, span: dict, bbox: tuple) -> pymupdf.Quad:
  4506. """Compute the quad located inside the bbox.
  4507. The bbox may be any of the resp. tuples occurring inside the given span.
  4508. Args:
  4509. line_dir: (tuple) 'line["dir"]' of the owning line or None.
  4510. span: (dict) the span. May be from get_texttrace() method.
  4511. bbox: (tuple) the bbox of the span or any of its characters.
  4512. Returns:
  4513. The quad which is wrapped by the bbox.
  4514. """
  4515. if line_dir is None:
  4516. line_dir = span["dir"]
  4517. cos, sin = line_dir
  4518. bbox = pymupdf.Rect(bbox) # make it a rect
  4519. if pymupdf.TOOLS.set_small_glyph_heights(): # ==> just fontsize as height
  4520. d = 1
  4521. else:
  4522. d = span["ascender"] - span["descender"]
  4523. height = d * span["size"] # the quad's rectangle height
  4524. # The following are distances from the bbox corners, at which we find the
  4525. # respective quad points. The computation depends on in which quadrant the
  4526. # text writing angle is located.
  4527. hs = height * sin
  4528. hc = height * cos
  4529. if hc >= 0 and hs <= 0: # quadrant 1
  4530. ul = bbox.bl - (0, hc)
  4531. ur = bbox.tr + (hs, 0)
  4532. ll = bbox.bl - (hs, 0)
  4533. lr = bbox.tr + (0, hc)
  4534. elif hc <= 0 and hs <= 0: # quadrant 2
  4535. ul = bbox.br + (hs, 0)
  4536. ur = bbox.tl - (0, hc)
  4537. ll = bbox.br + (0, hc)
  4538. lr = bbox.tl - (hs, 0)
  4539. elif hc <= 0 and hs >= 0: # quadrant 3
  4540. ul = bbox.tr - (0, hc)
  4541. ur = bbox.bl + (hs, 0)
  4542. ll = bbox.tr - (hs, 0)
  4543. lr = bbox.bl + (0, hc)
  4544. else: # quadrant 4
  4545. ul = bbox.tl + (hs, 0)
  4546. ur = bbox.br - (0, hc)
  4547. ll = bbox.tl + (0, hc)
  4548. lr = bbox.br - (hs, 0)
  4549. return pymupdf.Quad(ul, ur, ll, lr)
  4550. def recover_quad(line_dir: tuple, span: dict) -> pymupdf.Quad:
  4551. """Recover the quadrilateral of a text span.
  4552. Args:
  4553. line_dir: (tuple) 'line["dir"]' of the owning line.
  4554. span: the span.
  4555. Returns:
  4556. The quadrilateral enveloping the span's text.
  4557. """
  4558. if type(line_dir) is not tuple or len(line_dir) != 2:
  4559. raise ValueError("bad line dir argument")
  4560. if type(span) is not dict:
  4561. raise ValueError("bad span argument")
  4562. return recover_bbox_quad(line_dir, span, span["bbox"])
  4563. def recover_line_quad(line: dict, spans: list = None) -> pymupdf.Quad:
  4564. """Calculate the line quad for 'dict' / 'rawdict' text extractions.
  4565. The lower quad points are those of the first, resp. last span quad.
  4566. The upper points are determined by the maximum span quad height.
  4567. From this, compute a rect with bottom-left in (0, 0), convert this to a
  4568. quad and rotate and shift back to cover the text of the spans.
  4569. Args:
  4570. spans: (list, optional) sub-list of spans to consider.
  4571. Returns:
  4572. pymupdf.Quad covering selected spans.
  4573. """
  4574. if spans is None: # no sub-selection
  4575. spans = line["spans"] # all spans
  4576. if len(spans) == 0:
  4577. raise ValueError("bad span list")
  4578. line_dir = line["dir"] # text direction
  4579. cos, sin = line_dir
  4580. q0 = recover_quad(line_dir, spans[0]) # quad of first span
  4581. if len(spans) > 1: # get quad of last span
  4582. q1 = recover_quad(line_dir, spans[-1])
  4583. else:
  4584. q1 = q0 # last = first
  4585. line_ll = q0.ll # lower-left of line quad
  4586. line_lr = q1.lr # lower-right of line quad
  4587. mat0 = pymupdf.planish_line(line_ll, line_lr)
  4588. # map base line to x-axis such that line_ll goes to (0, 0)
  4589. x_lr = line_lr * mat0
  4590. small = pymupdf.TOOLS.set_small_glyph_heights() # small glyph heights?
  4591. h = max(
  4592. [s["size"] * (1 if small else (s["ascender"] - s["descender"])) for s in spans]
  4593. )
  4594. line_rect = pymupdf.Rect(0, -h, x_lr.x, 0) # line rectangle
  4595. line_quad = line_rect.quad # make it a quad and:
  4596. line_quad *= ~mat0
  4597. return line_quad
  4598. def recover_span_quad(line_dir: tuple, span: dict, chars: list = None) -> pymupdf.Quad:
  4599. """Calculate the span quad for 'dict' / 'rawdict' text extractions.
  4600. Notes:
  4601. There are two execution paths:
  4602. 1. For the full span quad, the result of 'recover_quad' is returned.
  4603. 2. For the quad of a sub-list of characters, the char quads are
  4604. computed and joined. This is only supported for the "rawdict"
  4605. extraction option.
  4606. Args:
  4607. line_dir: (tuple) 'line["dir"]' of the owning line.
  4608. span: (dict) the span.
  4609. chars: (list, optional) sub-list of characters to consider.
  4610. Returns:
  4611. pymupdf.Quad covering selected characters.
  4612. """
  4613. if line_dir is None: # must be a span from get_texttrace()
  4614. line_dir = span["dir"]
  4615. if chars is None: # no sub-selection
  4616. return recover_quad(line_dir, span)
  4617. if "chars" not in span.keys():
  4618. raise ValueError("need 'rawdict' option to sub-select chars")
  4619. q0 = recover_char_quad(line_dir, span, chars[0]) # quad of first char
  4620. if len(chars) > 1: # get quad of last char
  4621. q1 = recover_char_quad(line_dir, span, chars[-1])
  4622. else:
  4623. q1 = q0 # last = first
  4624. span_ll = q0.ll # lower-left of span quad
  4625. span_lr = q1.lr # lower-right of span quad
  4626. mat0 = pymupdf.planish_line(span_ll, span_lr)
  4627. # map base line to x-axis such that span_ll goes to (0, 0)
  4628. x_lr = span_lr * mat0
  4629. small = pymupdf.TOOLS.set_small_glyph_heights() # small glyph heights?
  4630. h = span["size"] * (1 if small else (span["ascender"] - span["descender"]))
  4631. span_rect = pymupdf.Rect(0, -h, x_lr.x, 0) # line rectangle
  4632. span_quad = span_rect.quad # make it a quad and:
  4633. span_quad *= ~mat0 # rotate back and shift back
  4634. return span_quad
  4635. def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad:
  4636. """Recover the quadrilateral of a text character.
  4637. This requires the "rawdict" option of text extraction.
  4638. Args:
  4639. line_dir: (tuple) 'line["dir"]' of the span's line.
  4640. span: (dict) the span dict.
  4641. char: (dict) the character dict.
  4642. Returns:
  4643. The quadrilateral enveloping the character.
  4644. """
  4645. if line_dir is None:
  4646. line_dir = span["dir"]
  4647. if type(line_dir) is not tuple or len(line_dir) != 2:
  4648. raise ValueError("bad line dir argument")
  4649. if type(span) is not dict:
  4650. raise ValueError("bad span argument")
  4651. if type(char) is dict:
  4652. bbox = pymupdf.Rect(char["bbox"])
  4653. elif type(char) is tuple:
  4654. bbox = pymupdf.Rect(char[3])
  4655. else:
  4656. raise ValueError("bad span argument")
  4657. return recover_bbox_quad(line_dir, span, bbox)
  4658. # -------------------------------------------------------------------
  4659. # Building font subsets using fontTools
  4660. # -------------------------------------------------------------------
  4661. def subset_fonts(doc: pymupdf.Document, verbose: bool = False, fallback: bool = False) -> OptInt:
  4662. """Build font subsets in a PDF.
  4663. Eligible fonts are potentially replaced by smaller versions. Page text is
  4664. NOT rewritten and thus should retain properties like being hidden or
  4665. controlled by optional content.
  4666. This method by default uses MuPDF's own internal feature to create subset
  4667. fonts. As this is a new function, errors may still occur. In this case,
  4668. please fall back to using the previous version by using "fallback=True".
  4669. Fallback mode requires the external package 'fontTools'.
  4670. Args:
  4671. fallback: use the older deprecated implementation.
  4672. verbose: only used by fallback mode.
  4673. Returns:
  4674. The new MuPDF-based code returns None. The deprecated fallback
  4675. mode returns 0 if there are no fonts to subset. Otherwise, it
  4676. returns the decrease in fontsize (the difference in fontsize),
  4677. measured in bytes.
  4678. """
  4679. # Font binaries: - "buffer" -> (names, xrefs, (unicodes, glyphs))
  4680. # An embedded font is uniquely defined by its fontbuffer only. It may have
  4681. # multiple names and xrefs.
  4682. # Once the sets of used unicodes and glyphs are known, we compute a
  4683. # smaller version of the buffer user package fontTools.
  4684. if not fallback: # by default use MuPDF function
  4685. pdf = mupdf.pdf_document_from_fz_document(doc)
  4686. mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count)))
  4687. return
  4688. font_buffers = {}
  4689. def get_old_widths(xref):
  4690. """Retrieve old font '/W' and '/DW' values."""
  4691. df = doc.xref_get_key(xref, "DescendantFonts")
  4692. if df[0] != "array": # only handle xref specifications
  4693. return None, None
  4694. df_xref = int(df[1][1:-1].replace("0 R", ""))
  4695. widths = doc.xref_get_key(df_xref, "W")
  4696. if widths[0] != "array": # no widths key found
  4697. widths = None
  4698. else:
  4699. widths = widths[1]
  4700. dwidths = doc.xref_get_key(df_xref, "DW")
  4701. if dwidths[0] != "int":
  4702. dwidths = None
  4703. else:
  4704. dwidths = dwidths[1]
  4705. return widths, dwidths
  4706. def set_old_widths(xref, widths, dwidths):
  4707. """Restore the old '/W' and '/DW' in subsetted font.
  4708. If either parameter is None or evaluates to False, the corresponding
  4709. dictionary key will be set to null.
  4710. """
  4711. df = doc.xref_get_key(xref, "DescendantFonts")
  4712. if df[0] != "array": # only handle xref specs
  4713. return None
  4714. df_xref = int(df[1][1:-1].replace("0 R", ""))
  4715. if (type(widths) is not str or not widths) and doc.xref_get_key(df_xref, "W")[
  4716. 0
  4717. ] != "null":
  4718. doc.xref_set_key(df_xref, "W", "null")
  4719. else:
  4720. doc.xref_set_key(df_xref, "W", widths)
  4721. if (type(dwidths) is not str or not dwidths) and doc.xref_get_key(
  4722. df_xref, "DW"
  4723. )[0] != "null":
  4724. doc.xref_set_key(df_xref, "DW", "null")
  4725. else:
  4726. doc.xref_set_key(df_xref, "DW", dwidths)
  4727. return None
  4728. def set_subset_fontname(new_xref):
  4729. """Generate a name prefix to tag a font as subset.
  4730. We use a random generator to select 6 upper case ASCII characters.
  4731. The prefixed name must be put in the font xref as the "/BaseFont" value
  4732. and in the FontDescriptor object as the '/FontName' value.
  4733. """
  4734. # The following generates a prefix like 'ABCDEF+'
  4735. import random
  4736. import string
  4737. prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+"
  4738. font_str = doc.xref_object(new_xref, compressed=True)
  4739. font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix)
  4740. df = doc.xref_get_key(new_xref, "DescendantFonts")
  4741. if df[0] == "array":
  4742. df_xref = int(df[1][1:-1].replace("0 R", ""))
  4743. fd = doc.xref_get_key(df_xref, "FontDescriptor")
  4744. if fd[0] == "xref":
  4745. fd_xref = int(fd[1].replace("0 R", ""))
  4746. fd_str = doc.xref_object(fd_xref, compressed=True)
  4747. fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix)
  4748. doc.update_object(fd_xref, fd_str)
  4749. doc.update_object(new_xref, font_str)
  4750. def build_subset(buffer, unc_set, gid_set):
  4751. """Build font subset using fontTools.
  4752. Args:
  4753. buffer: (bytes) the font given as a binary buffer.
  4754. unc_set: (set) required glyph ids.
  4755. Returns:
  4756. Either None if subsetting is unsuccessful or the subset font buffer.
  4757. """
  4758. try:
  4759. import fontTools.subset as fts
  4760. except ImportError:
  4761. if g_exceptions_verbose: pymupdf.exception_info()
  4762. pymupdf.message("This method requires fontTools to be installed.")
  4763. raise
  4764. import tempfile
  4765. with tempfile.TemporaryDirectory() as tmp_dir:
  4766. oldfont_path = f"{tmp_dir}/oldfont.ttf"
  4767. newfont_path = f"{tmp_dir}/newfont.ttf"
  4768. uncfile_path = f"{tmp_dir}/uncfile.txt"
  4769. args = [
  4770. oldfont_path,
  4771. "--retain-gids",
  4772. f"--output-file={newfont_path}",
  4773. "--layout-features=*",
  4774. "--passthrough-tables",
  4775. "--ignore-missing-glyphs",
  4776. "--ignore-missing-unicodes",
  4777. "--symbol-cmap",
  4778. ]
  4779. # store glyph ids or unicodes as file
  4780. with open(f"{tmp_dir}/uncfile.txt", "w", encoding='utf8') as unc_file:
  4781. if 0xFFFD in unc_set: # error unicode exists -> use glyphs
  4782. args.append(f"--gids-file={uncfile_path}")
  4783. gid_set.add(189)
  4784. unc_list = list(gid_set)
  4785. for unc in unc_list:
  4786. unc_file.write("%i\n" % unc)
  4787. else:
  4788. args.append(f"--unicodes-file={uncfile_path}")
  4789. unc_set.add(255)
  4790. unc_list = list(unc_set)
  4791. for unc in unc_list:
  4792. unc_file.write("%04x\n" % unc)
  4793. # store fontbuffer as a file
  4794. with open(oldfont_path, "wb") as fontfile:
  4795. fontfile.write(buffer)
  4796. try:
  4797. os.remove(newfont_path) # remove old file
  4798. except Exception:
  4799. pass
  4800. try: # invoke fontTools subsetter
  4801. fts.main(args)
  4802. font = pymupdf.Font(fontfile=newfont_path)
  4803. new_buffer = font.buffer # subset font binary
  4804. if font.glyph_count == 0: # intercept empty font
  4805. new_buffer = None
  4806. except Exception:
  4807. pymupdf.exception_info()
  4808. new_buffer = None
  4809. return new_buffer
  4810. def repl_fontnames(doc):
  4811. """Populate 'font_buffers'.
  4812. For each font candidate, store its xref and the list of names
  4813. by which PDF text may refer to it (there may be multiple).
  4814. """
  4815. def norm_name(name):
  4816. """Recreate font name that contains PDF hex codes.
  4817. E.g. #20 -> space, chr(32)
  4818. """
  4819. while "#" in name:
  4820. p = name.find("#")
  4821. c = int(name[p + 1 : p + 3], 16)
  4822. name = name.replace(name[p : p + 3], chr(c))
  4823. return name
  4824. def get_fontnames(doc, item):
  4825. """Return a list of fontnames for an item of page.get_fonts().
  4826. There may be multiple names e.g. for Type0 fonts.
  4827. """
  4828. fontname = item[3]
  4829. names = [fontname]
  4830. fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:]
  4831. fontname = norm_name(fontname)
  4832. if fontname not in names:
  4833. names.append(fontname)
  4834. descendents = doc.xref_get_key(item[0], "DescendantFonts")
  4835. if descendents[0] != "array":
  4836. return names
  4837. descendents = descendents[1][1:-1]
  4838. if descendents.endswith(" 0 R"):
  4839. xref = int(descendents[:-4])
  4840. descendents = doc.xref_object(xref, compressed=True)
  4841. p1 = descendents.find("/BaseFont")
  4842. if p1 >= 0:
  4843. p2 = descendents.find("/", p1 + 1)
  4844. p1 = min(descendents.find("/", p2 + 1), descendents.find(">>", p2 + 1))
  4845. fontname = descendents[p2 + 1 : p1]
  4846. fontname = norm_name(fontname)
  4847. if fontname not in names:
  4848. names.append(fontname)
  4849. return names
  4850. for i in range(doc.page_count):
  4851. for f in doc.get_page_fonts(i, full=True):
  4852. font_xref = f[0] # font xref
  4853. font_ext = f[1] # font file extension
  4854. basename = f[3] # font basename
  4855. if font_ext not in ( # skip if not supported by fontTools
  4856. "otf",
  4857. "ttf",
  4858. "woff",
  4859. "woff2",
  4860. ):
  4861. continue
  4862. # skip fonts which already are subsets
  4863. if len(basename) > 6 and basename[6] == "+":
  4864. continue
  4865. extr = doc.extract_font(font_xref)
  4866. fontbuffer = extr[-1]
  4867. names = get_fontnames(doc, f)
  4868. name_set, xref_set, subsets = font_buffers.get(
  4869. fontbuffer, (set(), set(), (set(), set()))
  4870. )
  4871. xref_set.add(font_xref)
  4872. for name in names:
  4873. name_set.add(name)
  4874. font = pymupdf.Font(fontbuffer=fontbuffer)
  4875. name_set.add(font.name)
  4876. del font
  4877. font_buffers[fontbuffer] = (name_set, xref_set, subsets)
  4878. def find_buffer_by_name(name):
  4879. for buffer, (name_set, _, _) in font_buffers.items():
  4880. if name in name_set:
  4881. return buffer
  4882. return None
  4883. # -----------------
  4884. # main function
  4885. # -----------------
  4886. repl_fontnames(doc) # populate font information
  4887. if not font_buffers: # nothing found to do
  4888. if verbose:
  4889. pymupdf.message(f'No fonts to subset.')
  4890. return 0
  4891. old_fontsize = 0
  4892. new_fontsize = 0
  4893. for fontbuffer in font_buffers.keys():
  4894. old_fontsize += len(fontbuffer)
  4895. # Scan page text for usage of subsettable fonts
  4896. for page in doc:
  4897. # go through the text and extend set of used glyphs by font
  4898. # we use a modified MuPDF trace device, which delivers us glyph ids.
  4899. for span in page.get_texttrace():
  4900. if type(span) is not dict: # skip useless information
  4901. continue
  4902. fontname = span["font"][:33] # fontname for the span
  4903. buffer = find_buffer_by_name(fontname)
  4904. if buffer is None:
  4905. continue
  4906. name_set, xref_set, (set_ucs, set_gid) = font_buffers[buffer]
  4907. for c in span["chars"]:
  4908. set_ucs.add(c[0]) # unicode
  4909. set_gid.add(c[1]) # glyph id
  4910. font_buffers[buffer] = (name_set, xref_set, (set_ucs, set_gid))
  4911. # build the font subsets
  4912. for old_buffer, (name_set, xref_set, subsets) in font_buffers.items():
  4913. new_buffer = build_subset(old_buffer, subsets[0], subsets[1])
  4914. fontname = list(name_set)[0]
  4915. if new_buffer is None or len(new_buffer) >= len(old_buffer):
  4916. # subset was not created or did not get smaller
  4917. if verbose:
  4918. pymupdf.message(f'Cannot subset {fontname!r}.')
  4919. continue
  4920. if verbose:
  4921. pymupdf.message(f"Built subset of font {fontname!r}.")
  4922. val = doc._insert_font(fontbuffer=new_buffer) # store subset font in PDF
  4923. new_xref = val[0] # get its xref
  4924. set_subset_fontname(new_xref) # tag fontname as subset font
  4925. font_str = doc.xref_object( # get its object definition
  4926. new_xref,
  4927. compressed=True,
  4928. )
  4929. # walk through the original font xrefs and replace each by the subset def
  4930. for font_xref in xref_set:
  4931. # we need the original '/W' and '/DW' width values
  4932. width_table, def_width = get_old_widths(font_xref)
  4933. # ... and replace original font definition at xref with it
  4934. doc.update_object(font_xref, font_str)
  4935. # now copy over old '/W' and '/DW' values
  4936. if width_table or def_width:
  4937. set_old_widths(font_xref, width_table, def_width)
  4938. # 'new_xref' remains unused in the PDF and must be removed
  4939. # by garbage collection.
  4940. new_fontsize += len(new_buffer)
  4941. return old_fontsize - new_fontsize
  4942. # -------------------------------------------------------------------
  4943. # Copy XREF object to another XREF
  4944. # -------------------------------------------------------------------
  4945. def xref_copy(doc: pymupdf.Document, source: int, target: int, *, keep: list = None) -> None:
  4946. """Copy a PDF dictionary object to another one given their xref numbers.
  4947. Args:
  4948. doc: PDF document object
  4949. source: source xref number
  4950. target: target xref number, the xref must already exist
  4951. keep: an optional list of 1st level keys in target that should not be
  4952. removed before copying.
  4953. Notes:
  4954. This works similar to the copy() method of dictionaries in Python. The
  4955. source may be a stream object.
  4956. """
  4957. if doc.xref_is_stream(source):
  4958. # read new xref stream, maintaining compression
  4959. stream = doc.xref_stream_raw(source)
  4960. doc.update_stream(
  4961. target,
  4962. stream,
  4963. compress=False, # keeps source compression
  4964. new=True, # in case target is no stream
  4965. )
  4966. # empty the target completely, observe exceptions
  4967. if keep is None:
  4968. keep = []
  4969. for key in doc.xref_get_keys(target):
  4970. if key in keep:
  4971. continue
  4972. doc.xref_set_key(target, key, "null")
  4973. # copy over all source dict items
  4974. for key in doc.xref_get_keys(source):
  4975. item = doc.xref_get_key(source, key)
  4976. doc.xref_set_key(target, key, item[1])