| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
27722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762
77727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763
27732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763
77737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764
27742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696469746984699470047014702470347044705470647074708470947104711471247134714471547164717471847194720472147224723472447254726472747284729473047314732473347344735473647374738473947404741474247434744474547464747474847494750475147524753475447554756475747584759476047614762476347644765476647674768476947704771477247734774477547764
77747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765
277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679 |
- # ------------------------------------------------------------------------
- # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
- # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
- #
- # Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a
- # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
- # maintained and developed by Artifex Software, Inc. https://artifex.com.
- # ------------------------------------------------------------------------
- import io
- import math
- import os
- import typing
- import weakref
- try:
- from . import pymupdf
- except Exception:
- import pymupdf
- try:
- from . import mupdf
- except Exception:
- import mupdf
# Module-level shortcuts to objects provided by the main pymupdf module.
_format_g = pymupdf.format_g
g_exceptions_verbose = pymupdf.g_exceptions_verbose

# String placeholders naming the "*_like" argument kinds.
# NOTE(review): presumably used as annotations by functions elsewhere in
# this file - not visible in this section, confirm.
point_like = "point_like"
rect_like = "rect_like"
matrix_like = "matrix_like"
quad_like = "quad_like"

# ByteString is gone from typing in 3.14.
# collections.abc.Buffer available from 3.12 only
try:
    ByteString = typing.ByteString
except AttributeError:
    # Fallback when typing no longer offers ByteString: a union of the
    # three built-in binary types.
    # pylint: disable=unsupported-binary-operation
    ByteString = bytes | bytearray | memoryview

# Shorthand type aliases: "Opt..." means "the given type or None".
AnyType = typing.Any
OptInt = typing.Union[int, None]
OptFloat = typing.Optional[float]
OptStr = typing.Optional[str]
OptDict = typing.Optional[dict]
OptBytes = typing.Optional[ByteString]
OptSeq = typing.Optional[typing.Sequence]
- """
- This is a collection of functions to extend PyMuPDF.
- """
def write_text(
    page: pymupdf.Page,
    rect=None,
    writers=None,
    overlay=True,
    color=None,
    opacity=None,
    keep_proportion=True,
    rotate=0,
    oc=0,
) -> None:
    """Write the text of one or more pymupdf.TextWriter objects.

    A single writer with no rotation and no target rectangle is written
    directly to the page. In every other case the writers are first written
    to a page of a temporary PDF, which is then shown on the target page via
    Page.show_pdf_page.

    Args:
        page: the target page.
        rect: target rectangle. If None, the union of the text writers is used.
        writers: one or more pymupdf.TextWriter objects.
        overlay: put in foreground or background.
        color: forwarded to each writer's write_text().
        opacity: forwarded to each writer's write_text().
        keep_proportion: maintain aspect ratio of rectangle sides.
        rotate: arbitrary rotation angle.
        oc: the xref of an optional content object

    Raises:
        ValueError: if no text writer was given.
    """
    assert isinstance(page, pymupdf.Page)
    if not writers:
        raise ValueError("need at least one pymupdf.TextWriter")

    if isinstance(writers, pymupdf.TextWriter):
        if rotate == 0 and rect is None:
            # Fast path: write straight onto the page. 'oc' must be passed
            # along here too, otherwise the optional-content setting is lost
            # whenever this shortcut is taken.
            writers.write_text(
                page, opacity=opacity, color=color, overlay=overlay, oc=oc
            )
            return None
        writers = (writers,)  # treat the single writer like a sequence

    # Union of all writers' text rectangles = the part of the temporary page
    # that is shown on the target page.
    clip = writers[0].text_rect
    textdoc = pymupdf.Document()
    tpage = textdoc.new_page(width=page.rect.width, height=page.rect.height)
    for writer in writers:
        clip |= writer.text_rect
        writer.write_text(tpage, opacity=opacity, color=color)

    if rect is None:
        rect = clip

    page.show_pdf_page(
        rect,
        textdoc,
        0,
        overlay=overlay,
        keep_proportion=keep_proportion,
        rotate=rotate,
        clip=clip,
        oc=oc,
    )
    # drop the references to the temporary document and its page
    textdoc = None
    tpage = None
def show_pdf_page(
    page,
    rect,
    docsrc,
    pno=0,
    keep_proportion=True,
    overlay=True,
    oc=0,
    rotate=0,
    clip=None,
) -> int:
    """Show page number 'pno' of PDF 'docsrc' in rectangle 'rect'.

    Args:
        rect: (rect-like) where to place the source image
        docsrc: (document) source PDF
        pno: (int) source page number; negative values count from the end
        keep_proportion: (bool) do not change width-height-ratio
        overlay: (bool) put in foreground
        oc: (xref) make visibility dependent on this OCG / OCMD (which must be defined in the target PDF)
        rotate: (float) rotation angle in degrees
        clip: (rect-like) part of source page rectangle

    Returns:
        xref of inserted object (for reuse)

    Raises:
        ValueError: if either document is no PDF, 'rect' or the resulting
            source rectangle is empty or infinite, the source document has
            no pages while 'pno' is negative, or source and target are the
            same document.
    """

    def calc_matrix(sr, tr, keep=True, rotate=0):
        """Calculate transformation matrix from source to target rect.

        Notes:
            The product of four matrices in this sequence: (1) translate
            the source rect's center to the origin, (2) rotate, (3) scale,
            (4) translate to the target rect's center.

        Args:
            sr: source rect in PDF (!) coordinate system
            tr: target rect in PDF coordinate system
            keep: whether to keep source ratio of width to height
            rotate: rotation angle in degrees

        Returns:
            Transformation matrix, converted by pymupdf.JM_TUPLE.
        """
        # center points of source and target rect
        smp = (sr.tl + sr.br) / 2.0
        tmp = (tr.tl + tr.br) / 2.0
        # m moves to (0, 0), then rotates
        m = pymupdf.Matrix(1, 0, 0, 1, -smp.x, -smp.y) * pymupdf.Matrix(rotate)
        sr1 = sr * m  # resulting source rect to calculate scale factors
        fw = tr.width / sr1.width  # scale the width
        fh = tr.height / sr1.height  # scale the height
        if keep:
            fw = fh = min(fw, fh)  # take min if keeping aspect ratio
        m *= pymupdf.Matrix(fw, fh)  # concat scale matrix
        m *= pymupdf.Matrix(1, 0, 0, 1, tmp.x, tmp.y)  # concat move to target center
        return pymupdf.JM_TUPLE(m)

    pymupdf.CheckParent(page)
    doc = page.parent
    if not doc.is_pdf or not docsrc.is_pdf:
        raise ValueError("is no PDF")

    # 'rect' is documented as rect-like: coerce it so that plain sequences
    # work with the attribute access and matrix multiplication below.
    rect = pymupdf.Rect(rect)
    if rect.is_empty or rect.is_infinite:
        raise ValueError("rect must be finite and not empty")

    if pno < 0:  # support negative page numbers
        page_count = docsrc.page_count
        if page_count < 1:
            # repeatedly adding a page count of 0 would never terminate
            raise ValueError("source document has no pages")
        pno %= page_count

    src_page = docsrc[pno]  # load source page
    tar_rect = rect * ~page.transformation_matrix  # target rect in PDF coordinates

    src_rect = src_page.rect if not clip else src_page.rect & clip  # source rect
    if src_rect.is_empty or src_rect.is_infinite:
        raise ValueError("clip must be finite and not empty")
    src_rect = src_rect * ~src_page.transformation_matrix  # ... in PDF coord

    matrix = calc_matrix(src_rect, tar_rect, keep=keep_proportion, rotate=rotate)

    # resource names already used on the page: XObjects, images and fonts
    used_names = {item[1] for item in doc.get_page_xobjects(page.number)}
    used_names.update(item[7] for item in doc.get_page_images(page.number))
    used_names.update(item[4] for item in doc.get_page_fonts(page.number))

    # create a name not in that collection
    prefix = "fzFrm"
    counter = 0
    _imgname = prefix + "0"
    while _imgname in used_names:
        counter += 1
        _imgname = prefix + str(counter)

    isrc = docsrc._graft_id  # used as key for graftmaps
    if doc._graft_id == isrc:
        raise ValueError("source document must not equal target")

    # retrieve / make pymupdf.Graftmap for source PDF
    gmap = doc.Graftmaps.get(isrc, None)
    if gmap is None:
        gmap = pymupdf.Graftmap(doc)
        doc.Graftmaps[isrc] = gmap

    # take note of generated xref for automatic reuse
    pno_id = (isrc, pno)  # id of docsrc[pno]
    xref = doc.ShownPages.get(pno_id, 0)

    if overlay:
        page.wrap_contents()  # ensure a balanced graphics state

    xref = page._show_pdf_page(
        src_page,
        overlay=overlay,
        matrix=matrix,
        xref=xref,
        oc=oc,
        clip=src_rect,
        graftmap=gmap,
        _imgname=_imgname,
    )
    doc.ShownPages[pno_id] = xref

    return xref
def replace_image(page: pymupdf.Page, xref: int, *, filename=None, pixmap=None, stream=None):
    """Replace the image stored under 'xref' by a new one.

    Only the object definition behind 'xref' is exchanged. The page's
    appearance instructions are not touched, so the new image shows up with
    the same bbox, rotation etc. as the old one.

    Supplying a small, fully transparent image makes the old image
    effectively disappear. Another typical use is swapping a large image
    for a smaller version, e.g. lower resolution or gray instead of color.

    Args:
        xref: the xref of the image to replace.
        filename, pixmap, stream: exactly one of these must be provided. The
            meaning being the same as in Page.insert_image.

    Raises:
        ValueError: if 'xref' is no image, or if not exactly one of
            filename / stream / pixmap was given.
    """
    owner = page.parent  # the document owning the page
    if not owner.xref_is_image(xref):
        raise ValueError("xref not an image")

    provided = sum(1 for source in (filename, stream, pixmap) if source)
    if provided != 1:
        raise ValueError("Exactly one of filename/stream/pixmap must be given")

    # Insert the new image anywhere on the page, just to obtain an image
    # object that can be copied over the old definition.
    inserted_xref = page.insert_image(
        page.rect,
        filename=filename,
        stream=stream,
        pixmap=pixmap,
    )
    owner.xref_copy(inserted_xref, xref)  # copy over new to old

    # The insertion appended a new /Contents source - overwrite it with a
    # single space.
    contents_xrefs = page.get_contents()
    owner.update_stream(contents_xrefs[-1], b" ")

    # clear cache of extracted image information
    page._image_info = None
def delete_image(page: pymupdf.Page, xref: int):
    """Delete the image referred to by xref.

    The image is not physically removed: it is replaced by a small, fully
    transparent Pixmap via Page.replace_image.

    Args:
        xref: xref of the image to delete.
    """
    # a tiny gray pixmap with alpha channel (the dimension is arbitrary) ...
    transparent = pymupdf.Pixmap(pymupdf.csGRAY, (0, 0, 1, 1), 1)
    # ... with all sample bytes cleared to 0x00, i.e. 100% transparent
    transparent.clear_with()
    page.replace_image(xref, pixmap=transparent)
def insert_image(
    page,
    rect,
    *,
    alpha=-1,
    filename=None,
    height=0,
    keep_proportion=True,
    mask=None,
    oc=0,
    overlay=True,
    pixmap=None,
    rotate=0,
    stream=None,
    width=0,
    xref=0,
):
    """Insert an image for display in a rectangle.

    Args:
        rect: (rect_like) position of image on the page.
        alpha: (int, optional) set to 0 if image has no transparency.
        filename: (str, Path, file object) image filename.
        height: (int) forwarded unchanged to the low-level insertion
            (presumably the image height - confirm).
        keep_proportion: (bool) keep width / height ratio (default).
        mask: (bytes, optional) image consisting of alpha values to use.
        oc: (int) xref of OCG or OCMD to declare as Optional Content.
        overlay: (bool) put in foreground (default) or background.
        pixmap: (pymupdf.Pixmap) use this as image.
        rotate: (int) rotate by 0, 90, 180 or 270 degrees. Other values are
            first reduced modulo 360.
        stream: (bytes) use this as image.
        width: (int) forwarded unchanged to the low-level insertion
            (presumably the image width - confirm).
        xref: (int) use this as image.

    'page' and 'rect' are positional, all other parameters are keywords.

    If 'xref' is given, that image is used. Other input options are ignored.
    Else, exactly one of pixmap, stream or filename must be given.

    'alpha=0' for non-transparent images improves performance significantly.
    Affects stream and filename only.

    Optimum transparent insertions are possible by using filename / stream in
    conjunction with a 'mask' image of alpha values.

    Returns:
        xref (int) of inserted image. Re-use as argument for multiple insertions.

    Raises:
        ValueError: on a non-PDF document or invalid / inconsistent arguments.
        FileNotFoundError: if 'filename' does not exist.
    """
    pymupdf.CheckParent(page)
    doc = page.parent
    if not doc.is_pdf:
        raise ValueError("is no PDF")

    if xref == 0 and (bool(filename) + bool(stream) + bool(pixmap) != 1):
        raise ValueError("xref=0 needs exactly one of filename, pixmap, stream")

    binary_types = (bytes, bytearray, io.BytesIO)

    if filename:
        # reduce Path-like and file-like objects to a plain file name
        if isinstance(filename, str):
            pass
        elif hasattr(filename, "absolute"):
            filename = str(filename)
        elif hasattr(filename, "name"):
            filename = filename.name
        else:
            raise ValueError("bad filename")

    # isinstance (not exact type identity) so that subclasses are accepted
    if filename and not os.path.exists(filename):
        raise FileNotFoundError("No such file: '%s'" % filename)
    elif stream and not isinstance(stream, binary_types):
        raise ValueError("stream must be bytes-like / BytesIO")
    elif pixmap and not isinstance(pixmap, pymupdf.Pixmap):
        raise ValueError("pixmap must be a pymupdf.Pixmap")

    if mask and not (stream or filename):
        raise ValueError("mask requires stream or filename")
    if mask and not isinstance(mask, binary_types):
        raise ValueError("mask must be bytes-like / BytesIO")

    # Bring the angle into 0 <= rotate < 360 in one step; stepping by 360 in
    # a loop needs time proportional to the value and never ends for inf.
    rotate %= 360
    if rotate not in (0, 90, 180, 270):
        raise ValueError("bad rotate value")

    r = pymupdf.Rect(rect)
    if r.is_empty or r.is_infinite:
        raise ValueError("rect must be finite and not empty")
    clip = r * ~page.transformation_matrix

    # Create a unique image reference name: collect the names already used
    # by images, XObjects and fonts of the page ...
    used_names = {item[7] for item in doc.get_page_images(page.number)}
    used_names.update(item[1] for item in doc.get_page_xobjects(page.number))
    used_names.update(item[4] for item in doc.get_page_fonts(page.number))

    # ... and count up until a free one is found.
    prefix = "fzImg"  # 'pymupdf image'
    counter = 0
    _imgname = prefix + "0"  # first name candidate
    while _imgname in used_names:
        counter += 1
        _imgname = prefix + str(counter)  # try new name

    if overlay:
        page.wrap_contents()  # ensure a balanced graphics state

    digests = doc.InsertedImages
    xref, digests = page._insert_image(
        filename=filename,
        pixmap=pixmap,
        stream=stream,
        imask=mask,
        clip=clip,
        overlay=overlay,
        oc=oc,
        xref=xref,
        rotate=rotate,
        keep_proportion=keep_proportion,
        width=width,
        height=height,
        alpha=alpha,
        _imgname=_imgname,
        digests=digests,
    )
    if digests is not None:
        # store the digests returned by the insertion back on the document
        doc.InsertedImages = digests

    return xref
def search_for(
    page,
    text,
    *,
    clip=None,
    quads=False,
    flags=pymupdf.TEXT_DEHYPHENATE
    | pymupdf.TEXT_PRESERVE_WHITESPACE
    | pymupdf.TEXT_PRESERVE_LIGATURES
    | pymupdf.TEXT_MEDIABOX_CLIP
    ,
    textpage=None,
) -> list:
    """Search for a string on a page.

    Args:
        text: string to be searched for
        clip: restrict search to this rectangle (used only when a new
            TextPage has to be created)
        quads: (bool) return quads instead of rectangles
        flags: bit switches for creating the TextPage, default: join
            hyphened words
        textpage: a pre-created pymupdf.TextPage
    Returns:
        a list of rectangles or quads, each containing one occurrence.
    Raises:
        ValueError: 'textpage' does not belong to 'page'.
    """
    if clip is not None:
        clip = pymupdf.Rect(clip)

    pymupdf.CheckParent(page)
    if textpage is None:
        # No TextPage supplied: build a temporary one just for this search.
        temp_tp = page.get_textpage(clip=clip, flags=flags)
        return temp_tp.search(text, quads=quads)
    if textpage.parent != page:
        raise ValueError("not a textpage of this page")
    return textpage.search(text, quads=quads)
def search_page_for(
    doc: pymupdf.Document,
    pno: int,
    text: str,
    quads: bool = False,
    clip: rect_like = None,
    flags: int = pymupdf.TEXT_DEHYPHENATE
    | pymupdf.TEXT_PRESERVE_LIGATURES
    | pymupdf.TEXT_PRESERVE_WHITESPACE
    | pymupdf.TEXT_MEDIABOX_CLIP
    ,
    textpage: pymupdf.TextPage = None,
) -> list:
    """Search for a string on a document page given by its number.

    Convenience wrapper: loads page 'pno' and calls its 'search_for'.

    Args:
        pno: page number
        text: string to be searched for
        clip: restrict search to this rectangle
        quads: (bool) return quads instead of rectangles
        flags: bit switches, default: join hyphened words
        textpage: reuse a prepared textpage
    Returns:
        a list of rectangles or quads, each containing an occurrence.
    """
    target_page = doc[pno]
    return target_page.search_for(
        text,
        quads=quads,
        clip=clip,
        flags=flags,
        textpage=textpage,
    )
def get_text_blocks(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
) -> list:
    """Return the text blocks on a page.

    Notes:
        Lines in a block are concatenated with line breaks.
    Args:
        clip: (rect-like) area passed on when a new TextPage is created.
        flags: (int) control the amount of data parsed into the textpage.
            Defaults to pymupdf.TEXTFLAGS_BLOCKS.
        textpage: (pymupdf.TextPage) reuse this one instead of making a new one.
        sort: (bool) sort the blocks by their item 3, then item 0.
    Returns:
        A list of the blocks. Each item contains the containing rectangle
        coordinates, text lines, running block number and block type.
    Raises:
        ValueError: 'textpage' does not belong to 'page'.
    """
    pymupdf.CheckParent(page)
    if flags is None:
        flags = pymupdf.TEXTFLAGS_BLOCKS

    if textpage is None:
        source_tp = page.get_textpage(clip=clip, flags=flags)
    elif textpage.parent != page:
        raise ValueError("not a textpage of this page")
    else:
        source_tp = textpage

    blocks = source_tp.extractBLOCKS()
    if sort:
        blocks.sort(key=lambda blk: (blk[3], blk[0]))
    return blocks
def get_text_words(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
) -> list:
    """Return the text words as a list with the bbox for each word.

    Args:
        page: pymupdf.Page
        clip: (rect-like) area on page to consider
        flags: (int) control the amount of data parsed into the textpage.
            Defaults to pymupdf.TEXTFLAGS_WORDS.
        textpage: (pymupdf.TextPage) either passed-in or None. If passed in
            together with 'clip', only words whose bbox lies in 'clip' by at
            least 50% of its area are kept.
        sort: (bool) sort the words in reading sequence.
        delimiters: (str,list) characters to use as word delimiters.
        tolerance: (float) consider words to be part of the same line if
            top or bottom coordinate are not larger than this. Relevant
            only if sort=True.
    Returns:
        Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).
    Raises:
        ValueError: 'textpage' does not belong to 'page'.
    """

    def sort_words(words):
        """Sort words line-wise, forgiving small deviations."""
        ordered = sorted(words, key=lambda w: (w[3], w[0]))
        result = []  # final word list
        current = [ordered[0]]  # words roughly on the same line
        line_box = pymupdf.Rect(ordered[0][:4])  # bbox of the current line
        for word in ordered[1:]:
            word_box = pymupdf.Rect(word[:4])
            on_same_line = (
                abs(word_box.y0 - line_box.y0) <= tolerance
                or abs(word_box.y1 - line_box.y1) <= tolerance
            )
            if on_same_line:
                current.append(word)
                line_box |= word_box
                continue
            # Line finished: emit it left-to-right and start the next one.
            current.sort(key=lambda w: w[0])
            result.extend(current)
            current = [word]
            line_box = word_box
        # Emit the last, still open line.
        current.sort(key=lambda w: w[0])
        result.extend(current)
        return result

    pymupdf.CheckParent(page)
    if flags is None:
        flags = pymupdf.TEXTFLAGS_WORDS

    if textpage is None:
        # Temporary TextPage: clipping is applied when it is created.
        words = page.get_textpage(clip=clip, flags=flags).extractWORDS(delimiters)
    elif textpage.parent != page:
        raise ValueError("not a textpage of this page")
    else:
        words = textpage.extractWORDS(delimiters)
        if clip is not None:
            # A given TextPage is not re-clipped, so sub-select here.
            clip = pymupdf.Rect(clip)
            words = [
                w for w in words if abs(clip & w[:4]) >= 0.5 * abs(pymupdf.Rect(w[:4]))
            ]

    if words and sort:
        # advanced sort if any words found
        words = sort_words(words)
    return words
def get_sorted_text(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    tolerance=3,
) -> str:
    """Extract plain text avoiding unacceptable line breaks.

    Text contained in clip will be sorted in reading sequence. Some effort
    is also spent to simulate layout vertically and horizontally.

    Args:
        page: pymupdf.Page
        clip: (rect-like) only consider text inside
        flags: (int) text extraction flags
        textpage: pymupdf.TextPage
        tolerance: (float) consider words to be on the same line if their top
            or bottom coordinates do not differ more than this.

    Notes:
        If a TextPage is provided, all text is checked for being inside clip
        with at least 50% of its bbox.
        This allows to use some "global" TextPage in conjunction with sub-
        selecting words in parts of the defined TextPage rectangle.

    Returns:
        A text string in reading sequence. Left indentation of each line,
        inter-line and inter-word distances strive to reflect the layout.
        An empty string if no words were found.
    """

    def line_text(clip, line):
        """Create the string of one text line.

        We are trying to simulate some horizontal layout here, too.

        Args:
            clip: (pymupdf.Rect) the area from which all text is being read.
            line: (list) word tuples (rect, text) contained in the line
        Returns:
            Text in this line. Generated from words in 'line'. Distance from
            predecessor is translated to multiple spaces, thus simulating
            text indentations and large horizontal distances.
        """
        line.sort(key=lambda w: w[0].x0)
        pieces = []  # padded word strings of the line
        x1 = clip.x0  # end coordinate of the text so far
        for r, t in line:
            # At least one space between separated words, none at line start
            # or when the word touches / overlaps its predecessor.
            min_dist = 0 if (x1 == clip.x0 or r.x0 <= x1) else 1
            if r.width:
                # Gap expressed in units of this word's average char width.
                dist = max(int(round((r.x0 - x1) / r.width * len(t))), min_dist)
            else:
                # Zero-width box: no char width available, avoid dividing by 0.
                dist = min_dist
            pieces.append(" " * dist + t)
            x1 = r.x1  # update new end position
        return "".join(pieces)

    # Extract words in correct sequence first.
    words = [
        (pymupdf.Rect(w[:4]), w[4])
        for w in get_text_words(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            sort=True,
            tolerance=tolerance,
        )
    ]

    if not words:  # no text present
        return ""

    totalbox = pymupdf.EMPTY_RECT()  # area covering all text
    for wr, _ in words:
        totalbox |= wr

    lines = []  # list of reconstituted lines as (rect, text)
    line = [words[0]]  # current line
    lrect = words[0][0]  # the line's rectangle

    # walk through the words, starting with the second one
    for wr, text in words[1:]:
        # if this word matches top or bottom of the line, append it
        if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance:
            line.append((wr, text))
            lrect |= wr
        else:
            # output current line and re-initialize
            lines.append((lrect, line_text(totalbox, line)))
            line = [(wr, text)]
            lrect = wr

    # also append unfinished last line
    lines.append((lrect, line_text(totalbox, line)))

    # sort all lines vertically
    lines.sort(key=lambda l: (l[0].y1))

    chunks = [lines[0][1]]  # text of first line
    y1 = lines[0][0].y1  # its bottom coordinate
    for lrect, ltext in lines[1:]:
        if lrect.height:
            # Vertical gap in units of this line's height, capped at 5.
            distance = min(int(round((lrect.y0 - y1) / lrect.height)), 5)
        else:
            # Zero-height line: cannot scale the gap, use a single break.
            distance = 0
        chunks.append("\n" * (distance + 1) + ltext)
        y1 = lrect.y1

    # return text in clip
    return "".join(chunks)
def get_textbox(
    page: pymupdf.Page,
    rect: rect_like,
    textpage: pymupdf.TextPage = None,
) -> str:
    """Return what TextPage.extractTextbox delivers for a rectangle.

    Args:
        page: pymupdf.Page
        rect: (rect-like) passed on to 'extractTextbox'.
        textpage: (pymupdf.TextPage) reuse this one; if None, a temporary
            TextPage is created with default settings.
    Returns:
        The result of 'extractTextbox(rect)'.
    Raises:
        ValueError: 'textpage' does not belong to 'page'.
    """
    if textpage is None:
        return page.get_textpage().extractTextbox(rect)
    if textpage.parent != page:
        raise ValueError("not a textpage of this page")
    return textpage.extractTextbox(rect)
def get_text_selection(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    clip: rect_like = None,
    textpage: pymupdf.TextPage = None,
):
    """Return what TextPage.extractSelection delivers for two points.

    Args:
        page: pymupdf.Page
        p1: (point-like) first argument of 'extractSelection'.
        p2: (point-like) second argument of 'extractSelection'.
        clip: (rect-like) used only when a new TextPage is created.
        textpage: (pymupdf.TextPage) reuse this one; if None, a temporary
            TextPage is created with flag pymupdf.TEXT_DEHYPHENATE.
    Returns:
        The result of 'extractSelection(p1, p2)'.
    Raises:
        ValueError: 'textpage' does not belong to 'page'.
    """
    pymupdf.CheckParent(page)
    if textpage is None:
        temp_tp = page.get_textpage(clip=clip, flags=pymupdf.TEXT_DEHYPHENATE)
        return temp_tp.extractSelection(p1, p2)
    if textpage.parent != page:
        raise ValueError("not a textpage of this page")
    return textpage.extractSelection(p1, p2)
def get_textpage_ocr(
    page: pymupdf.Page,
    flags: int = 0,
    language: str = "eng",
    dpi: int = 72,
    full: bool = False,
    tessdata: str = None,
) -> pymupdf.TextPage:
    """Create a Textpage from combined results of normal and OCR text parsing.

    Args:
        flags: (int) control content becoming part of the result.
        language: (str) specify expected language(s). Default is "eng" (English).
        dpi: (int) resolution in dpi, default 72.
        full: (bool) whether to OCR the full page image, or only its images (default)
        tessdata: (str) resolved via pymupdf.get_tessdata() and handed to the
            OCR calls.
    Returns:
        A pymupdf.TextPage. If OCR of an individual image fails, the result
        of a full page OCR is returned instead.
    """
    pymupdf.CheckParent(page)
    tessdata = pymupdf.get_tessdata(tessdata)

    def full_ocr(page, dpi, language, flags):
        """OCR a rendering of the whole page and return its TextPage."""
        zoom = dpi / 72
        pix = page.get_pixmap(matrix=pymupdf.Matrix(zoom, zoom))
        ocr_bytes = pix.pdfocr_tobytes(
            compress=False,
            language=language,
            tessdata=tessdata,
        )
        ocr_pdf = pymupdf.Document("pdf", ocr_bytes)
        ocr_page = ocr_pdf.load_page(0)
        # Scale the OCR page back to the original page width and undo the
        # page rotation.
        unzoom = page.rect.width / ocr_page.rect.width
        ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix
        tpage = ocr_page.get_textpage(flags=flags, matrix=ctm)
        ocr_pdf.close()
        pix = None
        # Make the TextPage report 'page' as its parent.
        tpage.parent = weakref.proxy(page)
        return tpage

    # if OCR for the full page, OCR its pixmap @ desired dpi
    if full:
        return full_ocr(page, dpi, language, flags)

    # For partial OCR, make a normal textpage, then extend it with text that
    # is OCRed from each image.
    # Because of this, we need the images flag bit set ON.
    tpage = page.get_textpage(flags=flags)
    page_blocks = page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]
    for block in page_blocks:
        if block["type"] != 1:  # only look at images
            continue
        bbox = pymupdf.Rect(block["bbox"])
        if bbox.width <= 3 or bbox.height <= 3:  # ignore tiny stuff
            continue
        try:
            pix = pymupdf.Pixmap(block["image"])  # get image pixmap
            if pix.n - pix.alpha != 3:  # we need to convert this to RGB!
                pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
            if pix.alpha:  # must remove alpha channel
                pix = pymupdf.Pixmap(pix, 0)
            ocr_bytes = pix.pdfocr_tobytes(language=language, tessdata=tessdata)
            imgdoc = pymupdf.Document("pdf", ocr_bytes)  # pdf with OCRed page
            imgpage = imgdoc.load_page(0)  # read image as a page
            pix = None
            # compute matrix to transform coordinates back to that of 'page'
            imgrect = imgpage.rect  # page size of image PDF
            shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height)
            imgpage.extend_textpage(tpage, flags=0, matrix=shrink * block["transform"])
            imgdoc.close()
        except (RuntimeError, mupdf.FzErrorBase):
            if 0 and g_exceptions_verbose:
                # Don't show exception info here because it can happen in
                # normal operation (see test_3842b).
                pymupdf.exception_info()
            tpage = None
            pymupdf.message("Falling back to full page OCR")
            return full_ocr(page, dpi, language, flags)

    return tpage
def get_image_info(page: pymupdf.Page, hashes: bool = False, xrefs: bool = False) -> list:
    """Extract image information only from a pymupdf.TextPage.

    The result is stored on the page as '_image_info' when hashes were
    computed and is reused by later calls.

    Args:
        hashes: (bool) include MD5 hash for each image.
        xrefs: (bool) try to find the xref for each image. Sets hashes to true.
            Ignored for non-PDF documents.
    Returns:
        The list delivered by TextPage.extractIMGINFO. With 'xrefs', each
        item additionally gets an "xref" entry (0 if no image of the page
        has the same digest).
    """
    doc = page.parent
    if not doc.is_pdf:
        xrefs = False
    elif xrefs:
        hashes = True  # matching xrefs is done via the image digests

    imginfo = getattr(page, "_image_info", None)
    if imginfo and not xrefs:
        return imginfo

    if not imginfo:
        tp = page.get_textpage(flags=pymupdf.TEXT_PRESERVE_IMAGES)
        imginfo = tp.extractIMGINFO(hashes=hashes)
        del tp
        if hashes:
            page._image_info = imginfo

    # 'xrefs' can only still be true for PDF documents at this point.
    if not xrefs:
        return imginfo

    # Map the digest of every image in the page's image list to its xref.
    digests = {}
    for entry in page.get_images():
        img_xref = entry[0]
        digests[pymupdf.Pixmap(doc, img_xref).digest] = img_xref

    # Items are updated in place.
    for info in imginfo:
        info["xref"] = digests.get(info["digest"], 0)
    return imginfo
def get_image_rects(page: pymupdf.Page, name, transform=False) -> list:
    """Return list of image positions on a page.

    The image is identified by its digest, which is compared against the
    digests in the page's image info.

    Args:
        name: (str, list, int) image identification. May be reference name, an
            item of the page's image list or an xref.
        transform: (bool) whether to also return the transformation matrix.
    Returns:
        A list of pymupdf.Rect objects or tuples of (pymupdf.Rect, pymupdf.Matrix)
        for all image locations on the page.
    Raises:
        ValueError: a reference name matches no or more than one image.
    """
    if type(name) in (list, tuple):
        xref = name[0]
    elif type(name) is int:
        xref = name
    else:
        # Reference name: must identify exactly one image of the page.
        candidates = [img for img in page.get_images() if img[7] == name]
        if not candidates:
            raise ValueError("bad image name")
        if len(candidates) != 1:
            raise ValueError("multiple image names found")
        xref = candidates[0][0]

    # make pixmap of the image to compute MD5
    digest = pymupdf.Pixmap(page.parent, xref).digest

    matches = [
        info for info in page.get_image_info(hashes=True) if info["digest"] == digest
    ]
    if transform:
        return [
            (pymupdf.Rect(info["bbox"]), pymupdf.Matrix(info["transform"]))
            for info in matches
        ]
    return [pymupdf.Rect(info["bbox"]) for info in matches]
def get_text(
    page: pymupdf.Page,
    option: str = "text",
    *,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
):
    """Extract text from a page or an annotation.

    This is a unifying wrapper for various methods of the pymupdf.TextPage class.

    Args:
        option: (str) text, words, blocks, html, dict, json, rawdict, rawjson,
            xhtml or xml. Case is ignored.
        clip: (rect-like) restrict output to this area. For html, xml and
            xhtml the page's cropbox is used instead.
        flags: bit switches to e.g. exclude images or decompose ligatures.
            If None, a default depending on 'option' is used.
        textpage: reuse this pymupdf.TextPage and make no new one. If specified,
            'flags' and 'clip' are ignored.
        sort: (bool) forwarded to the word / block functions and to the
            extraction methods that accept it; with option "text" it selects
            'get_sorted_text'.
        delimiters: forwarded to 'get_text_words' (option "words").
        tolerance: forwarded to 'get_text_words' (option "words") and to
            'get_sorted_text' (option "text" with sort=True).

    Returns:
        the output of methods get_text_words / get_text_blocks or pymupdf.TextPage
        methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT,
        extractXHTML or extractXML respectively.
        Default and misspelling choice is "text".

    Raises:
        ValueError: 'textpage' does not belong to 'page'.
    """
    formats = {
        "text": pymupdf.TEXTFLAGS_TEXT,
        "html": pymupdf.TEXTFLAGS_HTML,
        "json": pymupdf.TEXTFLAGS_DICT,
        "rawjson": pymupdf.TEXTFLAGS_RAWDICT,
        "xml": pymupdf.TEXTFLAGS_XML,
        "xhtml": pymupdf.TEXTFLAGS_XHTML,
        "dict": pymupdf.TEXTFLAGS_DICT,
        "rawdict": pymupdf.TEXTFLAGS_RAWDICT,
        "words": pymupdf.TEXTFLAGS_WORDS,
        "blocks": pymupdf.TEXTFLAGS_BLOCKS,
    }
    option = option.lower()
    # Unknown options fall back to plain text, as documented. (An assert here
    # would make this fallback unreachable and would vanish under "-O".)
    if option not in formats:
        option = "text"
    if flags is None:
        flags = formats[option]

    if option == "words":
        return get_text_words(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            sort=sort,
            delimiters=delimiters,
            tolerance=tolerance,
        )
    if option == "blocks":
        return get_text_blocks(
            page, clip=clip, flags=flags, textpage=textpage, sort=sort
        )

    if option == "text" and sort:
        return get_sorted_text(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            tolerance=tolerance,
        )

    pymupdf.CheckParent(page)
    cb = None
    if option in ("html", "xml", "xhtml"):  # no clipping for MuPDF functions
        clip = page.cropbox
    if clip is not None:
        clip = pymupdf.Rect(clip)
    elif type(page) is pymupdf.Page:
        # No clip on a real page: hand the cropbox to the extraction methods.
        cb = page.cropbox

    # pymupdf.TextPage with or without images
    tp = textpage
    if tp is None:
        tp = page.get_textpage(clip=clip, flags=flags)
    elif tp.parent != page:
        raise ValueError("not a textpage of this page")

    if option == "json":
        t = tp.extractJSON(cb=cb, sort=sort)
    elif option == "rawjson":
        t = tp.extractRAWJSON(cb=cb, sort=sort)
    elif option == "dict":
        t = tp.extractDICT(cb=cb, sort=sort)
    elif option == "rawdict":
        t = tp.extractRAWDICT(cb=cb, sort=sort)
    elif option == "html":
        t = tp.extractHTML()
    elif option == "xml":
        t = tp.extractXML()
    elif option == "xhtml":
        t = tp.extractXHTML()
    else:
        t = tp.extractText(sort=sort)

    if textpage is None:
        del tp  # drop the temporary TextPage
    return t
def get_page_text(
    doc: pymupdf.Document,
    pno: int,
    option: str = "text",
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
) -> typing.Any:
    """Extract a document page's text by page number.

    Notes:
        Convenience function calling page.get_text().
    Args:
        pno: page number
        option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
        clip: (rect-like) forwarded to page.get_text().
        flags: (int) forwarded to page.get_text().
        textpage: (pymupdf.TextPage) forwarded to page.get_text(), which
            rejects a TextPage that does not belong to the page.
        sort: (bool) forwarded to page.get_text().
    Returns:
        output from page.get_text().
    """
    # 'textpage' used to be accepted but silently dropped; forward it like
    # search_page_for does.
    return doc[pno].get_text(
        option,
        clip=clip,
        flags=flags,
        textpage=textpage,
        sort=sort,
    )
def get_pixmap(
    page: pymupdf.Page,
    *,
    matrix: matrix_like=pymupdf.Identity,
    dpi=None,
    colorspace: pymupdf.Colorspace=pymupdf.csRGB,
    clip: rect_like=None,
    alpha: bool=False,
    annots: bool=True,
) -> pymupdf.Pixmap:
    """Create pixmap of page.

    Keyword args:
        matrix: Matrix for transformation (default: Identity).
        dpi: desired dots per inch. If given, matrix is ignored.
        colorspace: (str/Colorspace) cmyk, rgb, gray - case ignored, default csRGB.
            Any other string selects csRGB.
        clip: (irect-like) restrict rendering to this area.
        alpha: (bool) whether to include alpha channel
        annots: (bool) whether to also render annotations
    Returns:
        The rendered pymupdf.Pixmap. If 'dpi' was given, it is also stored
        in the pixmap.
    Raises:
        ValueError: the colorspace has a component count other than 1, 3 or 4.
    """
    if dpi:
        # A resolution overrides the matrix: 72 dpi is zoom factor 1.
        scale = dpi / 72
        matrix = pymupdf.Matrix(scale, scale)

    if type(colorspace) is str:
        by_name = {"GRAY": pymupdf.csGRAY, "CMYK": pymupdf.csCMYK}
        colorspace = by_name.get(colorspace.upper(), pymupdf.csRGB)
    if colorspace.n not in (1, 3, 4):
        raise ValueError("unsupported colorspace")

    display_list = page.get_displaylist(annots=annots)
    pix = display_list.get_pixmap(
        matrix=matrix, colorspace=colorspace, alpha=alpha, clip=clip
    )
    display_list = None
    if dpi:
        pix.set_dpi(dpi, dpi)
    return pix
def get_page_pixmap(
    doc: pymupdf.Document,
    pno: int,
    *,
    matrix: matrix_like = pymupdf.Identity,
    dpi=None,
    colorspace: pymupdf.Colorspace = pymupdf.csRGB,
    clip: rect_like = None,
    alpha: bool = False,
    annots: bool = True,
) -> pymupdf.Pixmap:
    """Create pixmap of document page by page number.

    Notes:
        Convenience function calling page.get_pixmap.
    Args:
        pno: (int) page number
        matrix: pymupdf.Matrix for transformation (default: pymupdf.Identity).
        dpi: forwarded to page.get_pixmap.
        colorspace: (str,pymupdf.Colorspace) cmyk, rgb, gray - case ignored, default csRGB.
        clip: (irect-like) restrict rendering to this area.
        alpha: (bool) include alpha channel
        annots: (bool) also render annotations
    Returns:
        The pixmap returned by page.get_pixmap.
    """
    target_page = doc[pno]
    return target_page.get_pixmap(
        matrix=matrix,
        dpi=dpi,
        colorspace=colorspace,
        clip=clip,
        alpha=alpha,
        annots=annots,
    )
def getLinkDict(ln, document=None) -> dict:
    """Convert a link or outline item into a link dictionary.

    Args:
        ln: a pymupdf.Outline or pymupdf.Link object.
        document: passed to 'ln.destination()' for outline items.
    Returns:
        A dict with at least the keys "kind" and "xref" (always 0 here).
        "from" is added if 'ln' has a 'rect'. The remaining keys depend on
        the destination kind: "uri", "page", "to", "zoom", "file", or the
        entries of a named destination.
    """
    if isinstance(ln, pymupdf.Outline):
        dest = ln.destination(document)
    elif isinstance(ln, pymupdf.Link):
        dest = ln.dest
    else:
        assert 0, f'Unexpected {type(ln)=}.'

    kind = dest.kind
    nl = {"kind": kind, "xref": 0}
    try:
        if hasattr(ln, 'rect'):
            nl["from"] = ln.rect
    except Exception:
        # This seems to happen quite often in PyMuPDF/tests.
        if g_exceptions_verbose >= 2:
            pymupdf.exception_info()

    # Target point: take only the coordinates flagged as valid.
    pnt = pymupdf.Point(0, 0)
    if dest.flags & pymupdf.LINK_FLAG_L_VALID:
        pnt.x = dest.lt.x
    if dest.flags & pymupdf.LINK_FLAG_T_VALID:
        pnt.y = dest.lt.y

    if kind == pymupdf.LINK_URI:
        nl["uri"] = dest.uri

    elif kind == pymupdf.LINK_GOTO:
        nl["page"] = dest.page
        nl["to"] = pnt
        is_zoom = dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM
        nl["zoom"] = dest.rb.x if is_zoom else 0.0

    elif kind == pymupdf.LINK_GOTOR:
        nl["file"] = dest.file_spec.replace("\\", "/")
        nl["page"] = dest.page
        if dest.page < 0:
            nl["to"] = dest.dest
        else:
            nl["to"] = pnt
            is_zoom = dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM
            nl["zoom"] = dest.rb.x if is_zoom else 0.0

    elif kind == pymupdf.LINK_LAUNCH:
        nl["file"] = dest.file_spec.replace("\\", "/")

    elif kind == pymupdf.LINK_NAMED:
        # The dicts should not have same key(s).
        assert not (dest.named.keys() & nl.keys())
        nl.update(dest.named)
        if 'to' in nl:
            nl['to'] = pymupdf.Point(nl['to'])

    else:
        nl["page"] = dest.page
    return nl
def get_links(page: pymupdf.Page) -> list:
    """Create a list of all links contained in a PDF page.

    Notes:
        see PyMuPDF documentation for details.
    Returns:
        A list of link dictionaries as made by 'getLinkDict'. For PDF pages,
        "xref" and "id" are filled in from the page's link annotations if
        their number equals the number of links found.
    """
    pymupdf.CheckParent(page)

    links = []
    current = page.first_link
    while current:
        links.append(getLinkDict(current, page.parent))
        current = current.next

    if links != [] and page.parent.is_pdf:
        linkxrefs = [
            entry
            for entry in pymupdf.JM_get_annot_xref_list2(page)
            if entry[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
        ]
        # Only pair them up if both lists have the same length.
        if len(linkxrefs) == len(links):
            for link, entry in zip(links, linkxrefs):
                link["xref"] = entry[0]
                link["id"] = entry[2]
    return links
def get_toc(
    doc: pymupdf.Document,
    simple: bool = True,
) -> list:
    """Create a table of contents.

    Args:
        simple: a bool to control output. Returns a list, where each entry
            consists of outline level, title, page number and link
            destination (if simple = False). For details see PyMuPDF's
            documentation.
    Returns:
        The list of entries, or an empty list if the document has no outline.
    Raises:
        ValueError: the document is closed.
    """
    entries = []

    def walk(item, lvl):
        """Follow the outline item chain, recording each item in 'entries'."""
        while item and item.this.m_internal:
            title = item.title if item.title else " "

            # Page numbers are 1-based; -1 for external items or no uri.
            page = -1
            if not item.is_external and item.uri:
                if item.page == -1:
                    page = doc.resolve_link(item.uri)[0] + 1
                else:
                    page = item.page + 1

            if simple:
                entries.append([lvl, title, page])
            else:
                entries.append([lvl, title, page, getLinkDict(item, doc)])

            if item.down:
                walk(item.down, lvl + 1)
            item = item.next

    # ensure document is open
    if doc.is_closed:
        raise ValueError("document closed")
    doc.init_doc()
    root = doc.outline
    if not root:
        return []

    walk(root, 1)
    if doc.is_pdf and not simple:
        doc._extend_toc_items(entries)
    return entries
def del_toc_item(
    doc: pymupdf.Document,
    idx: int,
) -> None:
    """Delete TOC / bookmark item by index.

    Args:
        idx: (int) index into the list returned by doc.get_outline_xrefs().
    """
    outline_xrefs = doc.get_outline_xrefs()
    doc._remove_toc_item(outline_xrefs[idx])
def set_toc_item(
    doc: pymupdf.Document,
    idx: int,
    dest_dict: OptDict = None,
    kind: OptInt = None,
    pno: OptInt = None,
    uri: OptStr = None,
    title: OptStr = None,
    to: point_like = None,
    filename: OptStr = None,
    zoom: float = 0,
) -> None:
    """Update TOC item by index.

    It allows changing the item's title and link destination.

    Args:
        idx:
            (int) desired index of the TOC list, as created by get_toc.
        dest_dict:
            (dict) destination dictionary as created by get_toc(False).
            Outrules all other parameters except 'title'. If None, the
            remaining parameters are used to make a dest dictionary.
            The dictionary is not modified.
        kind:
            (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only
            the title will be updated. If pymupdf.LINK_NONE, the TOC item will
            be deleted.
        pno:
            (int) page number (1-based like in get_toc). Required if
            pymupdf.LINK_GOTO.
        uri:
            (str) the URL, required if pymupdf.LINK_URI.
        title:
            (str) the new title. No change if None.
        to:
            (point-like) destination on the target page. If omitted, (72, 36)
            will be used as target coordinates.
        filename:
            (str) destination filename, required for pymupdf.LINK_GOTOR and
            pymupdf.LINK_LAUNCH.
        zoom:
            (float) a zoom factor for the target location (pymupdf.LINK_GOTO).

    Raises:
        ValueError: bad page number, bad color value, or a destination that
            cannot be converted to a PDF action.
    """
    xref = doc.get_outline_xrefs()[idx]
    page_xref = 0
    if type(dest_dict) is dict:
        if dest_dict["kind"] == pymupdf.LINK_GOTO:
            pno = dest_dict["page"]
            page_xref = doc.page_xref(pno)
            page_height = doc.page_cropbox(pno).height
            # Work on copies: flipping 'to.y' in the caller's dictionary would
            # flip it back again when the same dictionary is passed in twice.
            to = pymupdf.Point(dest_dict.get("to", (72, 36)))
            to.y = page_height - to.y  # y is measured from the other edge
            dest_dict = dict(dest_dict)
            dest_dict["to"] = to
        action = getDestStr(page_xref, dest_dict)
        if not action.startswith("/A"):
            raise ValueError("bad bookmark dest")

        color = dest_dict.get("color")
        if color:
            color = list(map(float, color))
            if len(color) != 3 or min(color) < 0 or max(color) > 1:
                raise ValueError("bad color value")
        bold = dest_dict.get("bold", False)
        italic = dest_dict.get("italic", False)
        flags = italic + 2 * bold  # bit 0: italic, bit 1: bold
        collapse = dest_dict.get("collapse")
        return doc._update_toc_item(
            xref,
            action=action[2:],  # drop the leading "/A"
            title=title,
            color=color,
            flags=flags,
            collapse=collapse,
        )

    if kind == pymupdf.LINK_NONE:  # delete bookmark item
        return doc.del_toc_item(idx)
    if kind is None and title is None:  # treat as no-op
        return None
    if kind is None:  # only update title text
        return doc._update_toc_item(xref, action=None, title=title)

    if kind == pymupdf.LINK_GOTO:
        if pno is None or pno not in range(1, doc.page_count + 1):
            raise ValueError("bad page number")
        page_xref = doc.page_xref(pno - 1)
        page_height = doc.page_cropbox(pno - 1).height
        if to is None:
            to = pymupdf.Point(72, page_height - 36)
        else:
            to = pymupdf.Point(to)
            to.y = page_height - to.y

    ddict = {
        "kind": kind,
        "to": to,
        "uri": uri,
        "page": pno,
        "file": filename,
        "zoom": zoom,
    }
    action = getDestStr(page_xref, ddict)
    if action == "" or not action.startswith("/A"):
        raise ValueError("bad bookmark dest")

    return doc._update_toc_item(xref, action=action[2:], title=title)
def get_area(*args) -> float:
    """Calculate area of rectangle.

    The first argument is the rectangle (an object with 'width' and
    'height'). The optional second argument is the unit: one of 'px'
    (default), 'in', 'cm', or 'mm'.
    """
    rect = args[0]
    unit = args[1] if len(args) > 1 else "px"
    # unit -> (numerator, denominator) of the linear conversion factor
    factors = {"px": (1, 1), "in": (1.0, 72.0), "cm": (2.54, 72.0), "mm": (25.4, 72.0)}
    numerator, denominator = factors[unit]
    f = (numerator / denominator) ** 2  # squared, because this is an area
    return f * rect.width * rect.height
def set_metadata(doc: pymupdf.Document, m: dict = None) -> None:
    """Update the PDF /Info object.

    Args:
        m: a dictionary like doc.metadata. None or an empty dictionary
            removes existing metadata. Values that are false, "none" or
            "null" set the respective entry to null. The keys "format" and
            "encryption" are accepted but not written.
    Raises:
        ValueError: not a PDF, document closed or encrypted, 'm' is not a
            dictionary, or 'm' contains unknown keys.
    """
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    if m is None:
        m = {}
    elif type(m) is not dict:
        raise ValueError("bad metadata")

    # metadata key -> key in the PDF /Info dictionary (None: not stored)
    keymap = {
        "author": "Author",
        "producer": "Producer",
        "creator": "Creator",
        "title": "Title",
        "format": None,
        "encryption": None,
        "creationDate": "CreationDate",
        "modDate": "ModDate",
        "subject": "Subject",
        "keywords": "Keywords",
        "trapped": "Trapped",
    }
    unknown = set(m.keys()).difference(set(keymap.keys()))
    if unknown:
        raise ValueError("bad dict key(s): %s" % unknown)

    # Look up the /Info entry in the PDF trailer (xref -1).
    key_type, key_value = doc.xref_get_key(-1, "Info")
    info_xref = int(key_value.replace("0 R", "")) if key_type == "xref" else 0

    nothing_given = m == {}
    if nothing_given and info_xref == 0:  # nothing to do
        return

    if info_xref == 0:  # no prev metadata: get new xref
        info_xref = doc.get_new_xref()
        doc.update_object(info_xref, "<<>>")  # fill it with empty object
        doc.xref_set_key(-1, "Info", "%i 0 R" % info_xref)
    elif nothing_given:  # remove existing metadata
        doc.xref_set_key(-1, "Info", "null")
        doc.init_doc()
        return

    for key, val in m.items():
        pdf_key = keymap[key]
        if pdf_key is None:
            continue
        if not bool(val) or val in ("none", "null"):
            val = "null"
        else:
            val = pymupdf.get_pdf_str(val)
        doc.xref_set_key(info_xref, pdf_key, val)
    doc.init_doc()
    return
def getDestStr(xref: int, ddict: dict) -> str:
    """Calculate the PDF action string.

    Notes:
        Supports Link annotations and outline items (bookmarks).
    Args:
        xref: xref of the target page, used for pymupdf.LINK_GOTO.
        ddict: destination dictionary. A non-zero int or float is taken as
            the second of the three XYZ values of a GoTo action.
    Returns:
        A string starting with "/A", or an empty string if 'ddict' is empty,
        its kind is pymupdf.LINK_NONE or the kind is not supported.
    """
    if not ddict:
        return ""

    if type(ddict) in (int, float):
        return f"/A<</S/GoTo/D[{xref} 0 R/XYZ {_format_g((0, ddict, 0))}]>>"

    kind = ddict.get("kind", pymupdf.LINK_NONE)
    if kind == pymupdf.LINK_NONE:
        return ""

    if kind == pymupdf.LINK_GOTO:
        zoom = ddict.get("zoom", 0)
        left, top = ddict.get("to", pymupdf.Point(0, 0))
        return f"/A<</S/GoTo/D[{xref} 0 R/XYZ {_format_g((left, top, zoom))}]>>"

    if kind == pymupdf.LINK_URI:
        target = pymupdf.get_pdf_str(ddict["uri"])
        return f"/A<</S/URI/URI{target}>>"

    if kind == pymupdf.LINK_LAUNCH:
        fspec = pymupdf.get_pdf_str(ddict["file"])
        return f"/A<</S/Launch/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"

    if kind == pymupdf.LINK_GOTOR and ddict["page"] < 0:
        # Negative page: 'to' is written as a PDF string.
        fspec = pymupdf.get_pdf_str(ddict["file"])
        target = pymupdf.get_pdf_str(ddict["to"])
        return f"/A<</S/GoToR/D{target}/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"

    if kind == pymupdf.LINK_GOTOR and ddict["page"] >= 0:
        fspec = pymupdf.get_pdf_str(ddict["file"])
        page_no = ddict["page"]
        left = ddict["to"].x
        top = ddict["to"].y
        zoom = ddict["zoom"]
        return f"/A<</S/GoToR/D[{page_no} /XYZ {_format_g((left, top, zoom))}]/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"

    return ""
def set_toc(
    doc: pymupdf.Document,
    toc: list,
    collapse: int = 1,
) -> int:
    """Create new outline tree (table of contents, TOC).

    Args:
        doc: the PDF document whose outline is replaced.
        toc: (list, tuple) each entry must contain level, title, page and
            optionally a 4th item. The 4th item may be a number (top margin
            on the page) or a dict describing the destination; the keys read
            here are "to", "color", "italic", "bold" and "collapse", the
            remaining keys are interpreted by getDestStr(). Any other 4th
            item is ignored and the default target (top of page) is used.
            None or '()' remove the TOC.
        collapse: (int) collapses entries beyond this level. Zero or None
            shows all entries unfolded.

    Returns:
        the number of inserted items, or the number of removed items respectively.

    Raises:
        ValueError: if the document is closed, encrypted or no PDF, or if
            'toc' fails the formal checks below.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    if not toc:  # remove all entries
        return len(doc._delToC())

    # validity checks --------------------------------------------------------
    if type(toc) not in (list, tuple):
        raise ValueError("'toc' must be list or tuple")
    toclen = len(toc)
    page_count = doc.page_count
    t0 = toc[0]
    if type(t0) not in (list, tuple):
        raise ValueError("items must be sequences of 3 or 4 items")
    if t0[0] != 1:
        raise ValueError("hierarchy level of item 0 must be 1")
    # Rows are checked pairwise: the page number of row i and the shape /
    # level of row i + 1. The page number of the last row is not
    # range-checked here; it is clamped to a valid page further down.
    for i in range(toclen - 1):
        t1 = toc[i]
        t2 = toc[i + 1]
        if not -1 <= t1[2] <= page_count:
            raise ValueError("row %i: page number out of range" % i)
        if (type(t2) not in (list, tuple)) or len(t2) not in (3, 4):
            raise ValueError("bad row %i" % (i + 1))
        if (type(t2[0]) is not int) or t2[0] < 1:
            raise ValueError("bad hierarchy level in row %i" % (i + 1))
        if t2[0] > t1[0] + 1:  # a level may grow by at most 1 per row
            raise ValueError("bad hierarchy level in row %i" % (i + 1))
    # no formal errors in toc --------------------------------------------------

    # --------------------------------------------------------------------------
    # make a list of xref numbers, which we can use for our TOC entries
    # --------------------------------------------------------------------------
    # Delete the old outlines. The xref numbers returned by _delToC() are
    # deliberately not reused: every new item gets a freshly acquired xref.
    doc._delToC()
    xref = [doc._getOLRootNumber()]  # entry zero is outline root xref number
    xref.extend(doc.get_new_xref() for _ in range(toclen))

    lvltab = {0: 0}  # to store last entry per hierarchy level

    # --------------------------------------------------------------------------
    # build olitems as a list of PDF-like connected dictionaries;
    # the first one is the outline root
    # --------------------------------------------------------------------------
    olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}]
    for i in range(toclen):
        o = toc[i]
        lvl = o[0]  # level
        title = pymupdf.get_pdf_str(o[1])  # title
        pno = min(doc.page_count - 1, max(0, o[2] - 1))  # page number
        page_xref = doc.page_xref(pno)
        page_height = doc.page_cropbox(pno).height
        top = pymupdf.Point(72, page_height - 36)
        dest_dict = {"to": top, "kind": pymupdf.LINK_GOTO}  # fall back target
        if o[2] < 0:
            dest_dict["kind"] = pymupdf.LINK_NONE
        if len(o) > 3:  # some target is specified
            target = o[3]
            if type(target) in (int, float):  # convert a number to a point
                dest_dict["to"] = pymupdf.Point(72, page_height - target)
            elif type(target) is dict:
                # We make a copy to avoid modifying our caller's data.
                dest_dict = target.copy()
                if "to" not in dest_dict:  # target point not in dict?
                    dest_dict["to"] = top  # put default in
                else:  # transform target to PDF coordinates
                    page = doc[pno]
                    point = pymupdf.Point(dest_dict["to"])
                    point.y = page.cropbox.height - point.y
                    point = point * page.rotation_matrix
                    dest_dict["to"] = (point.x, point.y)
            # Anything else: keep the fall back target unchanged. Its "to"
            # is already in PDF coordinates and must not be transformed again.

        d = {
            "first": -1,
            "count": 0,
            "last": -1,
            "prev": -1,
            "next": -1,
            "dest": getDestStr(page_xref, dest_dict),
            "top": dest_dict["to"],
            "title": title,
            "parent": lvltab[lvl - 1],
            "xref": xref[i + 1],
            "color": dest_dict.get("color"),
            "flags": dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0),
        }
        lvltab[lvl] = i + 1
        parent = olitems[lvltab[lvl - 1]]  # the parent entry

        if dest_dict.get("collapse") or (collapse and lvl > collapse):
            parent["count"] -= 1  # suppress expansion: make /Count negative
        else:
            parent["count"] += 1  # positive /Count

        if parent["first"] == -1:  # first child of this parent
            parent["first"] = i + 1
        else:  # chain to the parent's previous last child
            d["prev"] = parent["last"]
            olitems[parent["last"]]["next"] = i + 1
        parent["last"] = i + 1
        olitems.append(d)

    # --------------------------------------------------------------------------
    # now create each outline item as a string and insert it in the PDF
    # --------------------------------------------------------------------------
    # (dictionary key, PDF key) of the item references, in output order
    ref_keys = (
        ("first", "First"),
        ("last", "Last"),
        ("next", "Next"),
        ("parent", "Parent"),
        ("prev", "Prev"),
    )
    for i, ol in enumerate(olitems):
        parts = ["<<"]
        if ol["count"] != 0:
            parts.append("/Count %i" % ol["count"])
        # The outline root has no "dest", "next", "parent", "prev" or
        # "title" entries, so missing keys are simply skipped.
        parts.append(ol.get("dest", ""))
        for key, pdf_key in ref_keys:
            idx = ol.get(key, -1)
            if idx > -1:
                parts.append("/%s %i 0 R" % (pdf_key, xref[idx]))
        if "title" in ol:
            parts.append("/Title" + ol["title"])
        color = ol.get("color")
        if color and len(color) == 3:
            parts.append(f"/C[ {_format_g(tuple(color))}]")
        if ol.get("flags", 0) > 0:
            parts.append("/F %i" % ol["flags"])
        if i == 0:  # special: this is the outline root
            parts.append("/Type/Outlines")  # so add the /Type entry
        parts.append(">>")
        doc.update_object(xref[i], "".join(parts))  # insert the PDF object

    doc.init_doc()
    return toclen
def do_widgets(
    tar: pymupdf.Document,
    src: pymupdf.Document,
    graftmap,
    from_page: int = -1,
    to_page: int = -1,
    start_at: int = -1,
    join_duplicates=0,
) -> None:
    """Insert widgets of copied page range into target PDF.

    Parameter values **must** equal those of method insert_pdf() which
    must have been previously executed.

    Args:
        tar: target PDF, receives the widgets.
        src: source PDF, whose pages have been copied. Returns immediately
            if it is no Form PDF.
        graftmap: passed to mupdf.pdf_graft_mapped_object() for every
            object that is copied to the target.
        from_page, to_page: first and last source page number (inclusive).
            If from_page > to_page, the range is walked backwards.
        start_at: target page number of the first copied page.
        join_duplicates: if true, widgets with equal names are joined under
            a common parent, otherwise the second widget is renamed.

    Note:
        The source PDF is modified: key "P" is deleted from every widget
        on the source pages.
    """
    if not src.is_form_pdf:  # nothing to do: source PDF has no fields
        return

    def clean_kid_parents(acro_fields):
        """Make sure all kids have correct "Parent" pointers.

        Only the direct kids of the entries in array "acro_fields" are
        updated.
        """
        for i in range(acro_fields.pdf_array_len()):
            parent = acro_fields.pdf_array_get(i)
            kids = parent.pdf_dict_get(pymupdf.PDF_NAME("Kids"))
            for j in range(kids.pdf_array_len()):
                kid = kids.pdf_array_get(j)
                kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), parent)

    def join_widgets(pdf, acro_fields, xref1, xref2, name):
        """Called for each pair of widgets having the same name.

        Args:
            pdf: target MuPDF document
            acro_fields: object Root/AcroForm/Fields
            xref1, xref2: widget xrefs having same names
            name: (str) the name

        Result:
            Defined or updated widget parent that points to both widgets.
        """

        def re_target(pdf, acro_fields, xref1, kids1, xref2, kids2):
            """Merge widget in xref2 into "Kids" list of widget xref1.

            Args:
                xref1, kids1: target widget and its "Kids" array.
                xref2, kids2: source widget and its "Kids" array (may be empty).
            """
            # make indirect objects from widgets
            w1_ind = mupdf.pdf_new_indirect(pdf, xref1, 0)
            w2_ind = mupdf.pdf_new_indirect(pdf, xref2, 0)

            # find source widget in "Fields" array and remove it there
            idx = acro_fields.pdf_array_find(w2_ind)
            acro_fields.pdf_array_delete(idx)

            if not kids2.pdf_is_array():  # source widget has no kids
                widget = mupdf.pdf_load_object(pdf, xref2)

                # delete name from widget and insert target as parent
                widget.pdf_dict_del(pymupdf.PDF_NAME("T"))
                widget.pdf_dict_put(pymupdf.PDF_NAME("Parent"), w1_ind)

                # put in target Kids
                kids1.pdf_array_push(w2_ind)
            else:  # copy source kids to target kids
                for i in range(kids2.pdf_array_len()):
                    kid = kids2.pdf_array_get(i)
                    kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), w1_ind)
                    kid_ind = mupdf.pdf_new_indirect(pdf, kid.pdf_to_num(), 0)
                    kids1.pdf_array_push(kid_ind)

        def new_target(pdf, acro_fields, xref1, w1, xref2, w2, name):
            """Make new "Parent" for two widgets with same name.

            Args:
                xref1, w1: first widget
                xref2, w2: second widget
                name: field name

            Result:
                Both widgets have no "Kids". We create a new object with the
                name and a "Kids" array containing the widgets.
                Original widgets must be removed from AcroForm/Fields.
            """
            # make new "Parent" object
            new = mupdf.pdf_new_dict(pdf, 5)
            new.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), name)
            kids = new.pdf_dict_put_array(pymupdf.PDF_NAME("Kids"), 2)
            new_obj = mupdf.pdf_add_object(pdf, new)
            new_obj_xref = new_obj.pdf_to_num()
            new_ind = mupdf.pdf_new_indirect(pdf, new_obj_xref, 0)

            # copy over some required source widget properties:
            # "FT" and "AA" are moved from the first widget to the new parent
            ft = w1.pdf_dict_get(pymupdf.PDF_NAME("FT"))
            w1.pdf_dict_del(pymupdf.PDF_NAME("FT"))
            new_obj.pdf_dict_put(pymupdf.PDF_NAME("FT"), ft)

            aa = w1.pdf_dict_get(pymupdf.PDF_NAME("AA"))
            w1.pdf_dict_del(pymupdf.PDF_NAME("AA"))
            new_obj.pdf_dict_put(pymupdf.PDF_NAME("AA"), aa)

            # remove name field, insert "Parent" field in source widgets
            w1.pdf_dict_del(pymupdf.PDF_NAME("T"))
            w1.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind)
            w2.pdf_dict_del(pymupdf.PDF_NAME("T"))
            w2.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind)

            # put source widgets in "kids" array
            ind1 = mupdf.pdf_new_indirect(pdf, xref1, 0)
            ind2 = mupdf.pdf_new_indirect(pdf, xref2, 0)
            kids.pdf_array_push(ind1)
            kids.pdf_array_push(ind2)

            # remove source widgets from "AcroForm/Fields"
            idx = acro_fields.pdf_array_find(ind1)
            acro_fields.pdf_array_delete(idx)
            idx = acro_fields.pdf_array_find(ind2)
            acro_fields.pdf_array_delete(idx)

            # the new parent replaces both widgets in "AcroForm/Fields"
            acro_fields.pdf_array_push(new_ind)

        w1 = mupdf.pdf_load_object(pdf, xref1)
        w2 = mupdf.pdf_load_object(pdf, xref2)
        kids1 = w1.pdf_dict_get(pymupdf.PDF_NAME("Kids"))
        kids2 = w2.pdf_dict_get(pymupdf.PDF_NAME("Kids"))

        # check which widget has a suitable "Kids" array:
        # the one having it becomes the target of the merge
        if kids1.pdf_is_array():
            re_target(pdf, acro_fields, xref1, kids1, xref2, kids2)  # pylint: disable=arguments-out-of-order
        elif kids2.pdf_is_array():
            re_target(pdf, acro_fields, xref2, kids2, xref1, kids1)  # pylint: disable=arguments-out-of-order
        else:  # neither has kids: create a common parent
            new_target(pdf, acro_fields, xref1, w1, xref2, w2, name)  # pylint: disable=arguments-out-of-order

    def get_kids(parent, kids_list):
        """Return xref list of leaf kids for a parent.

        Call with an empty list.
        """
        kids = mupdf.pdf_dict_get(parent, pymupdf.PDF_NAME("Kids"))
        if not kids.pdf_is_array():
            return kids_list
        for i in range(kids.pdf_array_len()):
            kid = kids.pdf_array_get(i)
            # NOTE(review): recursion happens only if the kid's "Kids" entry
            # is a dict, whereas the check above expects "Kids" to be an
            # array - confirm whether pdf_is_array was intended here.
            if mupdf.pdf_is_dict(mupdf.pdf_dict_get(kid, pymupdf.PDF_NAME("Kids"))):
                kids_list = get_kids(kid, kids_list)
            else:
                kids_list.append(kid.pdf_to_num())
        return kids_list

    def kids_xrefs(widget):
        """Get the xref of the widget's "Parent" and the list of its kids.

        Returns (0, []) if the widget has no "Parent".
        NOTE(review): only the direct "Parent" entry is read - no walk up
        to a top-level parent takes place. Confirm this is intended.
        """
        kids_list = []
        parent = mupdf.pdf_dict_get(widget, pymupdf.PDF_NAME("Parent"))
        parent_xref = parent.pdf_to_num()
        if parent_xref == 0:
            return parent_xref, kids_list
        kids_list = get_kids(parent, kids_list)
        return parent_xref, kids_list

    def deduplicate_names(pdf, acro_fields, join_duplicates=False):
        """Handle any widget name duplicates caused by the merge.

        Depending on "join_duplicates", equally named entries of
        "acro_fields" are either joined or the second one is renamed.
        """
        names = {}  # key is a widget name, value a list of widgets having it.

        # extract all names and widgets in "AcroForm/Fields"
        for i in range(mupdf.pdf_array_len(acro_fields)):
            wobject = mupdf.pdf_array_get(acro_fields, i)
            xref = wobject.pdf_to_num()

            # extract widget name and collect widget(s) using it
            T = mupdf.pdf_dict_get_text_string(wobject, pymupdf.PDF_NAME("T"))
            xrefs = names.get(T, [])
            xrefs.append(xref)
            names[T] = xrefs

        for name, xrefs in names.items():
            if len(xrefs) < 2:  # name is unique
                continue
            # Only the first two xrefs are handled, any further ones
            # are ignored.
            xref0, xref1 = xrefs[:2]  # only exactly 2 should occur!
            if join_duplicates:  # combine fields with equal names
                join_widgets(pdf, acro_fields, xref0, xref1, name)
            else:  # make field names unique
                newname = name + f" [{xref1}]"  # append this to the name
                wobject = mupdf.pdf_load_object(pdf, xref1)
                wobject.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), newname)

        clean_kid_parents(acro_fields)

    def get_acroform(doc):
        """Retrieve the AcroForm dictionary from a PDF."""
        pdf = mupdf.pdf_document_from_fz_document(doc)
        # AcroForm (= central form field info)
        return mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), "Root/AcroForm")

    tarpdf = mupdf.pdf_document_from_fz_document(tar)
    srcpdf = mupdf.pdf_document_from_fz_document(src)

    if tar.is_form_pdf:
        # target is a Form PDF, so use it to include source fields
        acro = get_acroform(tar)
        # Important arrays in AcroForm
        acro_fields = acro.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
        tar_co = acro.pdf_dict_get(pymupdf.PDF_NAME("CO"))
        if not tar_co.pdf_is_array():  # create "CO" if not present
            tar_co = acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)
    else:
        # target is no Form PDF, so copy over source AcroForm
        acro = mupdf.pdf_deep_copy_obj(get_acroform(src))  # make a copy

        # Clear "Fields" and "CO" arrays: will be populated by page fields.
        # This is required to avoid copying unneeded objects.
        acro.pdf_dict_del(pymupdf.PDF_NAME("Fields"))
        acro.pdf_dict_put_array(pymupdf.PDF_NAME("Fields"), 5)
        acro.pdf_dict_del(pymupdf.PDF_NAME("CO"))
        acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)

        # Enrich AcroForm for copying to target
        acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro)

        # Insert AcroForm into target PDF
        acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft)
        acro_fields = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
        tar_co = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("CO"))

        # get its xref and insert it into target catalog
        tar_xref = acro_tar.pdf_to_num()
        acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
        root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), pymupdf.PDF_NAME("Root"))
        root.pdf_dict_put(pymupdf.PDF_NAME("AcroForm"), acro_tar_ind)

    # source page numbers in copy sequence (descending if range is reversed)
    if from_page <= to_page:
        src_range = range(from_page, to_page + 1)
    else:
        src_range = range(from_page, to_page - 1, -1)

    parents = {}  # information about widget parents

    # remove "P" owning page reference from all widgets of all source pages
    for i in src_range:
        src_page = src[i]
        for xref in [
            xref
            for xref, wtype, _ in src_page.annot_xrefs()
            if wtype == pymupdf.PDF_ANNOT_WIDGET  # pylint: disable=no-member
        ]:
            w_obj = mupdf.pdf_load_object(srcpdf, xref)
            w_obj.pdf_dict_del(pymupdf.PDF_NAME("P"))

            # get the widget's parent structure
            parent_xref, old_kids = kids_xrefs(w_obj)
            if parent_xref:
                # key is the source xref of the parent; "new_xref" and
                # "new_kids" are filled in when the parent is copied
                parents[parent_xref] = {
                    "new_xref": 0,
                    "old_kids": old_kids,
                    "new_kids": [],
                }

    # Copy over Parent widgets first - they are not page-dependent
    for xref in parents.keys():  # pylint: disable=consider-using-dict-items
        parent = mupdf.pdf_load_object(srcpdf, xref)
        parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent)
        parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft)
        kids_xrefs_new = get_kids(parent_tar, [])
        parent_xref_new = parent_tar.pdf_to_num()
        parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 0)
        acro_fields.pdf_array_push(parent_ind)
        parents[xref]["new_xref"] = parent_xref_new
        parents[xref]["new_kids"] = kids_xrefs_new

    for i in range(len(src_range)):
        # read the i-th copied page in the target PDF
        tar_page = tar[start_at + i]

        # read the original page in the source PDF
        src_page = src[src_range[i]]

        # now walk through source page widgets and copy over
        w_xrefs = [  # widget xrefs of the source page
            xref
            for xref, wtype, _ in src_page.annot_xrefs()
            if wtype == pymupdf.PDF_ANNOT_WIDGET  # pylint: disable=no-member
        ]
        if not w_xrefs:  # no widgets on this source page
            continue

        # convert to formal PDF page
        tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page)

        # extract annotations array - create it if not present
        tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"))
        if not mupdf.pdf_is_array(tar_annots):
            tar_annots = mupdf.pdf_dict_put_array(
                tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"), 5
            )

        for xref in w_xrefs:
            w_obj = mupdf.pdf_load_object(srcpdf, xref)

            # check if field takes part in inter-field validations
            is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C"))

            # check if the widget has a parent (then already copied above)
            parent_xref = mupdf.pdf_to_num(
                w_obj.pdf_dict_get(pymupdf.PDF_NAME("Parent"))
            )
            if parent_xref == 0:  # no parent: copy the widget itself
                try:
                    w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj)
                except Exception as e:
                    # skip this widget, but continue with the others
                    pymupdf.message_warning(f"cannot copy widget at {xref=}: {e}")
                    continue
                w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft)
                tar_xref = w_obj_tar.pdf_to_num()
                w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
                mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)
                mupdf.pdf_array_push(acro_fields, w_obj_tar_ind)
            else:
                # The widget was copied together with its parent: find its
                # new xref via its position in the parent's old kids list.
                parent = parents[parent_xref]
                idx = parent["old_kids"].index(xref)  # search for xref in parent
                tar_xref = parent["new_kids"][idx]
                w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
                mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)

            # Into "AcroForm/CO" if a computation field.
            if is_aac:
                mupdf.pdf_array_push(tar_co, w_obj_tar_ind)

    deduplicate_names(tarpdf, acro_fields, join_duplicates=join_duplicates)
def do_links(
    doc1: pymupdf.Document,
    doc2: pymupdf.Document,
    from_page: int = -1,
    to_page: int = -1,
    start_at: int = -1,
) -> None:
    """Recreate the links of a copied page range in the destination PDF.

    Parameter values **must** equal those of method insert_pdf(), which must
    have been previously executed.

    Args:
        doc1: destination PDF (receives the links).
        doc2: source PDF (the links are read from its pages).
        from_page, to_page: source page range. Values outside the source
            page numbers are adjusted; the range may be reversed.
        start_at: destination page number of the first copied page.

    Raises:
        ValueError: if 'start_at' is negative.
    """

    def cre_annot(lnk, xref_dst, pno_src, ctm):
        """Return the annotation object string of a link ('' if unsupported)."""
        rect = _format_g(tuple(lnk["from"] * ctm))  # rect in PDF coordinates
        kind = lnk["kind"]

        if kind == pymupdf.LINK_GOTO:
            skel = pymupdf.annot_skel["goto1"]
            idx = pno_src.index(lnk["page"])  # position in the copied range
            p = lnk["to"] * ctm  # target point in PDF coordinates
            return skel(xref_dst[idx], p.x, p.y, lnk["zoom"], rect)

        if kind == pymupdf.LINK_GOTOR:
            if lnk["page"] >= 0:
                skel = pymupdf.annot_skel["gotor1"]
                pnt = lnk.get("to", pymupdf.Point(0, 0))  # destination point
                if type(pnt) is not pymupdf.Point:
                    pnt = pymupdf.Point(0, 0)
                return skel(
                    lnk["page"],
                    pnt.x,
                    pnt.y,
                    lnk["zoom"],
                    lnk["file"],
                    lnk["file"],
                    rect,
                )
            skel = pymupdf.annot_skel["gotor2"]
            # drop first and last character of the PDF string
            to = pymupdf.get_pdf_str(lnk["to"])[1:-1]
            return skel(to, lnk["file"], rect)

        if kind == pymupdf.LINK_LAUNCH:
            skel = pymupdf.annot_skel["launch"]
            return skel(lnk["file"], lnk["file"], rect)

        if kind == pymupdf.LINK_URI:
            skel = pymupdf.annot_skel["uri"]
            return skel(lnk["uri"], rect)

        return ""

    # --------------------------------------------------------------------------
    # validate & normalize parameters
    # --------------------------------------------------------------------------
    src_count = doc2.page_count
    fp = 0 if from_page < 0 else min(from_page, src_count - 1)
    tp = src_count - 1 if (to_page < 0 or to_page >= src_count) else to_page
    if start_at < 0:
        raise ValueError("'start_at' must be >= 0")
    sa = start_at
    step = -1 if fp > tp else 1  # page range could be reversed

    # lists of source / destination page numbers
    pno_src = list(range(fp, tp + step, step))
    pno_dst = list(range(sa, sa + len(pno_src)))

    # lists of source / destination page xrefs
    xref_src = []
    xref_dst = []
    for p_src, p_dst in zip(pno_src, pno_dst):
        xref_src.append(doc2.page_xref(p_src))
        xref_dst.append(doc1.page_xref(p_dst))

    # create the links for each copied page in destination PDF
    for p_src, p_dst in zip(pno_src, pno_dst):
        page_src = doc2[p_src]  # load source page
        links = page_src.get_links()  # get all its links
        if not links:  # no links there
            page_src = None
            continue
        ctm = ~page_src.transformation_matrix  # calc page transformation matrix
        page_dst = doc1[p_dst]  # load destination page
        link_tab = []  # store all link definitions here
        for lnk in links:
            goes_outside = (
                lnk["kind"] == pymupdf.LINK_GOTO and lnk["page"] not in pno_src
            )
            if goes_outside:  # GOTO link target not in copied pages
                continue
            annot_text = cre_annot(lnk, xref_dst, pno_src, ctm)
            if annot_text:
                link_tab.append(annot_text)
        if link_tab:
            page_dst._addAnnot_FromString(tuple(link_tab))
def getLinkText(page: pymupdf.Page, lnk: dict) -> str:
    """Return the annotation object string for a link dictionary.

    The string is built from the matching entry of pymupdf.annot_skel and
    receives a /NM key. An empty string is returned if the link kind is
    none of GOTO, GOTOR, LAUNCH, URI or NAMED.

    Args:
        page: the page owning the link.
        lnk: the link dictionary. Keys "from" and "kind" are always read,
            the other keys depend on the kind.
    """
    ictm = ~page.transformation_matrix
    rect = _format_g(tuple(lnk["from"] * ictm))  # rect in PDF coordinates
    skeletons = pymupdf.annot_skel
    kind = lnk["kind"]
    annot = ""

    if kind == pymupdf.LINK_GOTO:
        if lnk["page"] >= 0:
            make = skeletons["goto1"]
            pno = lnk["page"]
            xref = page.parent.page_xref(pno)
            pnt = lnk.get("to", pymupdf.Point(0, 0))  # destination point
            # express the point in PDF coordinates of the destination page
            ipnt = pnt * ~page.parent[pno].transformation_matrix
            annot = make(xref, ipnt.x, ipnt.y, lnk.get("zoom", 0), rect)
        else:
            annot = skeletons["goto2"](pymupdf.get_pdf_str(lnk["to"]), rect)

    elif kind == pymupdf.LINK_GOTOR:
        if lnk["page"] >= 0:
            make = skeletons["gotor1"]
            pnt = lnk.get("to", pymupdf.Point(0, 0))  # destination point
            if type(pnt) is not pymupdf.Point:
                pnt = pymupdf.Point(0, 0)
            annot = make(
                lnk["page"],
                pnt.x,
                pnt.y,
                lnk.get("zoom", 0),
                lnk["file"],
                lnk["file"],
                rect,
            )
        else:
            annot = skeletons["gotor2"](
                pymupdf.get_pdf_str(lnk["to"]), lnk["file"], rect
            )

    elif kind == pymupdf.LINK_LAUNCH:
        annot = skeletons["launch"](lnk["file"], lnk["file"], rect)

    elif kind == pymupdf.LINK_URI:
        annot = skeletons["uri"](lnk["uri"], rect)

    elif kind == pymupdf.LINK_NAMED:
        make = skeletons["named"]
        lname = lnk.get("name")  # check presence of key
        if lname is None:  # if missing, fall back to alternative
            lname = lnk["nameddest"]
        annot = make(lname, rect)

    if not annot:  # unsupported link kind
        return annot

    # add a /NM PDF key to the object definition
    link_names = {  # xrefs of existing links and their ids
        x[0]: x[2]
        for x in page.annot_xrefs()
        if x[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
    }

    old_name = lnk.get("id", "")  # id value in the argument
    if old_name and link_names.get(lnk["xref"]) == old_name:
        name = old_name  # no new name if this is an update only
    else:
        # take the first "<stem>-L<number>" not in use on this page
        stem = pymupdf.TOOLS.set_annot_stem() + "-L%i"
        i = 0
        name = stem % i
        while name in link_names.values():
            i += 1
            name = stem % i

    # add /NM key to object definition
    return annot.replace("/Link", "/Link/NM(%s)" % name)
def delete_widget(page: pymupdf.Page, widget: pymupdf.Widget) -> pymupdf.Widget:
    """Delete widget from page and return the next one.

    The passed-in widget object is emptied: all its attributes are removed.

    Raises:
        ValueError: if 'widget' has no '_annot' attribute or it is None.
    """
    pymupdf.CheckParent(page)
    try:
        annot = widget._annot
    except AttributeError:
        annot = None
    if annot is None:
        raise ValueError("bad type: widget")

    successor = widget.next  # must be read before the deletion
    page.delete_annot(annot)
    widget._annot.parent = None
    widget.__dict__.clear()  # remove all attributes of the widget object
    return successor
def update_link(page: pymupdf.Page, lnk: dict) -> None:
    """Update a link on the current page.

    The object with xref lnk["xref"] is replaced by the annotation
    string generated from 'lnk'.

    Raises:
        ValueError: if no annotation string can be made for the link kind.
    """
    pymupdf.CheckParent(page)
    link_source = getLinkText(page, lnk)
    if not link_source:
        raise ValueError("link kind not supported")

    page.parent.update_object(lnk["xref"], link_source, page=page)
def insert_link(page: pymupdf.Page, lnk: dict, mark: bool = True) -> None:
    """Insert a new link for the current page.

    Args:
        lnk: the link dictionary.
        mark: not used by this function.

    Raises:
        ValueError: if no annotation string can be made for the link kind.
    """
    pymupdf.CheckParent(page)
    link_source = getLinkText(page, lnk)
    if not link_source:
        raise ValueError("link kind not supported")

    page._addAnnot_FromString((link_source,))
def insert_textbox(
    page: pymupdf.Page,
    rect: rect_like,
    buffer: typing.Union[str, list],
    *,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: int = 0,
    encoding: int = 0,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    color: OptSeq = None,
    fill: OptSeq = None,
    expandtabs: int = 1,
    align: int = 0,
    rotate: int = 0,
    render_mode: int = 0,
    miter_limit: float = 1,
    border_width: float = 0.05,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> float:
    """Insert text into a given rectangle.

    Notes:
        Makes a new Shape for the page and calls its method of the same
        name. The Shape is committed only if that call returns a
        non-negative value.

    Parameters:
        rect: (rect-like) area to use for text.
        buffer: text to be inserted
        fontname: a Base-14 font, font name or '/name'
        fontfile: name of a font file
        fontsize: font size
        lineheight: overwrite the font property
        color: RGB color triple
        expandtabs: handles tabulators with string function
        align: left, center, right, justified
        rotate: 0, 90, 180, or 270 degrees
        morph: morph box with a matrix and a fixpoint
        overlay: put text in foreground or background
        (all other parameters are passed on to the Shape method unchanged)

    Returns:
        unused or deficit rectangle area (float)
    """
    shape = page.new_shape()
    text_options = {
        "fontsize": fontsize,
        "lineheight": lineheight,
        "fontname": fontname,
        "fontfile": fontfile,
        "set_simple": set_simple,
        "encoding": encoding,
        "color": color,
        "fill": fill,
        "expandtabs": expandtabs,
        "render_mode": render_mode,
        "miter_limit": miter_limit,
        "border_width": border_width,
        "align": align,
        "rotate": rotate,
        "morph": morph,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    result = shape.insert_textbox(rect, buffer, **text_options)
    if result >= 0:  # nothing is written to the page otherwise
        shape.commit(overlay)
    return result
def insert_text(
    page: pymupdf.Page,
    point: point_like,
    text: typing.Union[str, list],
    *,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: int = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    border_width: float = 0.05,
    miter_limit: float = 1,
    render_mode: int = 0,
    rotate: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
):
    """Insert text at a given point.

    Notes:
        Makes a new Shape for the page and calls its method of the same
        name. The Shape is committed only if that call returns a
        non-negative value.

    Parameters:
        point: (point-like) passed to the Shape method as the position.
        text: text to be inserted
        overlay: passed to the Shape's commit().
        (all other parameters are passed on to the Shape method unchanged)

    Returns:
        The return value of the Shape method.
    """
    shape = page.new_shape()
    text_options = {
        "fontsize": fontsize,
        "lineheight": lineheight,
        "fontname": fontname,
        "fontfile": fontfile,
        "set_simple": set_simple,
        "encoding": encoding,
        "color": color,
        "fill": fill,
        "border_width": border_width,
        "render_mode": render_mode,
        "miter_limit": miter_limit,
        "rotate": rotate,
        "morph": morph,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    result = shape.insert_text(point, text, **text_options)
    if result >= 0:  # nothing is written to the page otherwise
        shape.commit(overlay)
    return result
def insert_htmlbox(
    page,
    rect,
    text,
    *,
    css=None,
    scale_low=0,
    archive=None,
    rotate=0,
    oc=0,
    opacity=1,
    overlay=True,
) -> tuple:
    """Insert text with optional HTML tags and stylings into a rectangle.

    Args:
        rect: (rect-like) rectangle into which the text should be placed.
        text: (str) text with optional HTML tags and stylings, or a
            ready-made Story object. 'css' and 'archive' are only used
            when a string is given.
        css: (str) CSS styling commands.
        scale_low: (float) force-fit content by scaling it down. Must be in
            range [0, 1]. If 1, no scaling will take place. If 0, arbitrary
            down-scaling is acceptable. A value of 0.1 would mean that content
            may be scaled down by at most 90%.
        archive: Archive object pointing to locations of used fonts or images
        rotate: (int) rotate the text in the box by a multiple of 90 degrees.
        oc: (int) the xref of an OCG / OCMD (Optional Content).
        opacity: (float) set opacity of inserted content. Only values in
            range [0, 1) have an effect.
        overlay: (bool) put text on top of page content.

    Returns:
        A tuple of floats (spare_height, scale).
        spare_height: -1 if content did not fit, else >= 0. It is the height of the
               unused (still available) rectangle stripe. Positive only if
               no down scaling took place (scale = 1).
        scale: downscaling factor, 0 < scale <= 1. If the content did not
               fit (spare_height = -1), the value of 'scale_low' is returned.

    Raises:
        ValueError: if 'rotate' is no multiple of 90, 'scale_low' is outside
            [0, 1], or 'text' is neither a string nor a Story.
    """
    # normalize rotation angle to one of 0, 90, 180, 270
    if rotate % 90 != 0:
        raise ValueError("bad rotation angle")
    rotate %= 360

    if not 0 <= scale_low <= 1:
        raise ValueError("'scale_low' must be in [0, 1]")

    if css is None:
        css = ""

    rect = pymupdf.Rect(rect)
    # The story is laid out unrotated: for 90 / 270 degrees width and
    # height of the target rectangle swap roles.
    if rotate in (90, 270):
        temp_rect = pymupdf.Rect(0, 0, rect.height, rect.width)
    else:
        temp_rect = pymupdf.Rect(0, 0, rect.width, rect.height)

    # use a small border by default - user CSS follows, so it can override
    mycss = "body {margin:1px;}" + css

    # either make a story, or accept a given one
    if isinstance(text, str):  # if a string, convert to a Story
        story = pymupdf.Story(html=text, user_css=mycss, archive=archive)
    elif isinstance(text, pymupdf.Story):
        story = text
    else:
        raise ValueError("'text' must be a string or a Story")

    # ----------------------------------------------------------------
    # Find a scaling factor that lets our story fit in
    # ----------------------------------------------------------------
    # The rectangle is enlarged by a factor between 1 and 1 / scale_low
    # (unlimited if scale_low is 0).
    scale_max = None if scale_low == 0 else 1 / scale_low

    fit = story.fit_scale(temp_rect, scale_min=1, scale_max=scale_max)
    if not fit.big_enough:  # there was no fit
        return (-1, scale_low)

    filled = fit.filled
    scale = 1 / fit.parameter  # shrink factor
    spare_height = fit.rect.y1 - filled[3]  # unused room at rectangle bottom

    # Note: due to MuPDF's logic this may be negative even for successful fits.
    if scale != 1 or spare_height < 0:  # if scaling occurred, set spare_height to 0
        spare_height = 0

    def rect_function(*args):
        """Always offer the fitting rectangle to the story writer."""
        return fit.rect, fit.rect, pymupdf.Identity

    # draw story on temp PDF page
    doc = story.write_with_links(rect_function)

    # Insert opacity if requested.
    # For this, we prepend a command to the /Contents.
    if 0 <= opacity < 1:
        tpage = doc[0]  # load page
        # generate /ExtGstate for the page
        alp0 = tpage._set_opacity(CA=opacity, ca=opacity)
        s = f"/{alp0} gs\n"  # generate graphic state command
        pymupdf.TOOLS._insert_contents(tpage, s.encode(), 0)

    # put result in target page
    page.show_pdf_page(rect, doc, 0, rotate=rotate, oc=oc, overlay=overlay)

    # -------------------------------------------------------------------------
    # re-insert links in target rect (show_pdf_page cannot copy annotations)
    # -------------------------------------------------------------------------
    # scaled center point of fit.rect
    mp1 = (fit.rect.tl + fit.rect.br) / 2 * scale

    # center point of target rect
    mp2 = (rect.tl + rect.br) / 2

    # compute link positioning matrix:
    # - move center of scaled-down fit.rect to (0,0)
    # - rotate
    # - move (0,0) to center of target rect
    mat = (
        pymupdf.Matrix(scale, 0, 0, scale, -mp1.x, -mp1.y)
        * pymupdf.Matrix(-rotate)
        * pymupdf.Matrix(1, 0, 0, 1, mp2.x, mp2.y)
    )

    # copy over links
    for link in doc[0].get_links():
        link["from"] *= mat
        page.insert_link(link)

    return spare_height, scale
def new_page(
    doc: pymupdf.Document,
    pno: int = -1,
    width: float = 595,
    height: float = 842,
) -> pymupdf.Page:
    """Create and return a new page object.

    Args:
        pno: (int) insert before this page. Default: after last page.
        width: (float) page width in points. Default: 595 (ISO A4 width).
        height: (float) page height in points. Default 842 (ISO A4 height).

    Returns:
        A pymupdf.Page object: the page found at number 'pno' after
        the insertion.
    """
    doc._newPage(pno, width=width, height=height)
    created_page = doc[pno]
    return created_page
def insert_page(
    doc: pymupdf.Document,
    pno: int,
    text: typing.Union[str, list, None] = None,
    fontsize: float = 11,
    width: float = 595,
    height: float = 842,
    fontname: str = "helv",
    fontfile: OptStr = None,
    color: OptSeq = (0,),
) -> int:
    """Add a new PDF page and optionally write text onto it.

    Notes:
        Combines pymupdf.Document.new_page() and pymupdf.Page.insert_text().
        For parameter details see these methods.

    Returns:
        0 when ``text`` is empty or None, otherwise whatever
        ``Page.insert_text`` returns for the text placed at (50, 72).
    """
    page = doc.new_page(pno=pno, width=width, height=height)
    if text:
        # text always starts at the fixed position (50, 72)
        return page.insert_text(
            (50, 72),
            text,
            fontsize=fontsize,
            fontname=fontname,
            fontfile=fontfile,
            color=color,
        )
    return 0
def draw_line(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc=0,
) -> pymupdf.Point:
    """Draw a straight line from p1 to p2 on the page.

    A fresh shape is created via ``page.new_shape()``, the line is drawn,
    the given properties are applied with ``finish()`` and the shape is
    committed with the ``overlay`` flag.

    Returns:
        The value returned by ``Shape.draw_line``.
    """
    shape = page.new_shape()
    end_point = shape.draw_line(pymupdf.Point(p1), pymupdf.Point(p2))
    finish_options = {
        "color": color,
        "dashes": dashes,
        "width": width,
        "closePath": False,  # a single line is never closed
        "lineCap": lineCap,
        "lineJoin": lineJoin,
        "morph": morph,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    shape.finish(**finish_options)
    shape.commit(overlay)
    return end_point
def draw_squiggle(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a squiggly (wavy) line from p1 to p2 on the page.

    Uses a temporary shape: ``Shape.draw_squiggle`` creates the path,
    ``finish()`` applies the properties, ``commit(overlay)`` stores it.

    Returns:
        The value returned by ``Shape.draw_squiggle``.
    """
    shape = page.new_shape()
    end_point = shape.draw_squiggle(
        pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth
    )
    finish_options = {
        "color": color,
        "dashes": dashes,
        "width": width,
        "closePath": False,  # open path
        "lineCap": lineCap,
        "lineJoin": lineJoin,
        "morph": morph,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    shape.finish(**finish_options)
    shape.commit(overlay)
    return end_point
def draw_zigzag(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a zigzag line from p1 to p2 on the page.

    Uses a temporary shape: ``Shape.draw_zigzag`` creates the path,
    ``finish()`` applies the properties, ``commit(overlay)`` stores it.

    Returns:
        The value returned by ``Shape.draw_zigzag``.
    """
    shape = page.new_shape()
    end_point = shape.draw_zigzag(
        pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth
    )
    finish_options = {
        "color": color,
        "dashes": dashes,
        "width": width,
        "closePath": False,  # open path
        "lineCap": lineCap,
        "lineJoin": lineJoin,
        "morph": morph,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    shape.finish(**finish_options)
    shape.commit(overlay)
    return end_point
def draw_rect(
    page: pymupdf.Page,
    rect: rect_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
    radius=None,
) -> pymupdf.Point:
    """Draw a rectangle on the page.

    ``rect`` is converted to a ``pymupdf.Rect`` and handed, together with
    ``radius``, to ``Shape.draw_rect`` (see that method for details).
    The properties are then applied with ``finish()`` and the shape is
    committed with the ``overlay`` flag.

    Returns:
        The value returned by ``Shape.draw_rect``.
    """
    shape = page.new_shape()
    result = shape.draw_rect(pymupdf.Rect(rect), radius=radius)
    finish_options = {
        "color": color,
        "fill": fill,
        "dashes": dashes,
        "width": width,
        "lineCap": lineCap,
        "lineJoin": lineJoin,
        "morph": morph,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    shape.finish(**finish_options)
    shape.commit(overlay)
    return result
def draw_quad(
    page: pymupdf.Page,
    quad: quad_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a quadrilateral on the page.

    ``quad`` is converted to a ``pymupdf.Quad`` and drawn with a temporary
    shape, which is finished with the given properties and committed with
    the ``overlay`` flag.

    Returns:
        The value returned by ``Shape.draw_quad``.
    """
    shape = page.new_shape()
    result = shape.draw_quad(pymupdf.Quad(quad))
    finish_options = {
        "color": color,
        "fill": fill,
        "dashes": dashes,
        "width": width,
        "lineCap": lineCap,
        "lineJoin": lineJoin,
        "morph": morph,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    shape.finish(**finish_options)
    shape.commit(overlay)
    return result
def draw_polyline(
    page: pymupdf.Page,
    points: list,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    closePath: bool = False,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw several connected line segments on the page.

    ``points`` is passed unchanged to ``Shape.draw_polyline``; the shape
    is then finished with the given properties (including ``closePath``)
    and committed with the ``overlay`` flag.

    Returns:
        The value returned by ``Shape.draw_polyline``.
    """
    shape = page.new_shape()
    result = shape.draw_polyline(points)
    finish_options = {
        "color": color,
        "fill": fill,
        "dashes": dashes,
        "width": width,
        "lineCap": lineCap,
        "lineJoin": lineJoin,
        "morph": morph,
        "closePath": closePath,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    shape.finish(**finish_options)
    shape.commit(overlay)
    return result
def draw_circle(
    page: pymupdf.Page,
    center: point_like,
    radius: float,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    morph: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a circle on the page, given its center and radius.

    ``center`` is converted to a ``pymupdf.Point`` and drawn with a
    temporary shape, which is finished with the given properties and
    committed with the ``overlay`` flag.

    Returns:
        The value returned by ``Shape.draw_circle``.
    """
    shape = page.new_shape()
    result = shape.draw_circle(pymupdf.Point(center), radius)
    finish_options = {
        "color": color,
        "fill": fill,
        "dashes": dashes,
        "width": width,
        "lineCap": lineCap,
        "lineJoin": lineJoin,
        "morph": morph,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    shape.finish(**finish_options)
    shape.commit(overlay)
    return result
def draw_oval(
    page: pymupdf.Page,
    rect: typing.Union[rect_like, quad_like],
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    morph: OptSeq = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw an oval on the page, given its containing rectangle or quad.

    ``rect`` is passed unchanged to ``Shape.draw_oval``; the shape is then
    finished with the given properties and committed with the ``overlay``
    flag.

    Returns:
        The value returned by ``Shape.draw_oval``.
    """
    shape = page.new_shape()
    result = shape.draw_oval(rect)
    finish_options = {
        "color": color,
        "fill": fill,
        "dashes": dashes,
        "width": width,
        "lineCap": lineCap,
        "lineJoin": lineJoin,
        "morph": morph,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    shape.finish(**finish_options)
    shape.commit(overlay)
    return result
def draw_curve(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a special Bezier curve from p1 to p3 on the page.

    The control points are generated on the lines p1 to p2 and p2 to p3
    by ``Shape.draw_curve``. The shape is then finished with the given
    properties and committed with the ``overlay`` flag.

    Returns:
        The value returned by ``Shape.draw_curve``.
    """
    shape = page.new_shape()
    result = shape.draw_curve(
        pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3)
    )
    finish_options = {
        "color": color,
        "fill": fill,
        "dashes": dashes,
        "width": width,
        "lineCap": lineCap,
        "lineJoin": lineJoin,
        "morph": morph,
        "closePath": closePath,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    shape.finish(**finish_options)
    shape.commit(overlay)
    return result
def draw_bezier(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    p4: point_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a general cubic Bezier curve from p1 to p4 on the page.

    p2 and p3 are the control points. All four points are converted to
    ``pymupdf.Point`` and drawn with a temporary shape, which is finished
    with the given properties and committed with the ``overlay`` flag.

    Note:
        ``morph`` is annotated as ``OptSeq`` (it was ``OptStr`` before) to
        agree with the sibling drawing functions, which pass the same
        value to ``Shape.finish(morph=...)``.

    Returns:
        The value returned by ``Shape.draw_bezier``.
    """
    shape = page.new_shape()
    result = shape.draw_bezier(
        pymupdf.Point(p1),
        pymupdf.Point(p2),
        pymupdf.Point(p3),
        pymupdf.Point(p4),
    )
    shape.finish(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.commit(overlay)
    return result
def draw_sector(
    page: pymupdf.Page,
    center: point_like,
    point: point_like,
    beta: float,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    fullSector: bool = True,
    morph: OptSeq = None,
    width: float = 1,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a circle sector on the page.

    The sector is defined by the circle center, one end point of the arc
    and the angle of the arc.

    Parameters:
        center -- center of circle
        point -- arc end point
        beta -- angle of arc (degrees)
        fullSector -- connect arc ends with center

    Returns:
        The value returned by ``Shape.draw_sector``.
    """
    shape = page.new_shape()
    result = shape.draw_sector(
        pymupdf.Point(center),
        pymupdf.Point(point),
        beta,
        fullSector=fullSector,
    )
    finish_options = {
        "color": color,
        "fill": fill,
        "dashes": dashes,
        "width": width,
        "lineCap": lineCap,
        "lineJoin": lineJoin,
        "morph": morph,
        "closePath": closePath,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    shape.finish(**finish_options)
    shape.commit(overlay)
    return result
- # ----------------------------------------------------------------------
- # Name: wx.lib.colourdb.py
- # Purpose: Adds a bunch of colour names and RGB values to the
- # colour database so they can be found by name
- #
- # Author: Robin Dunn
- #
- # Created: 13-March-2001
- # Copyright: (c) 2001-2017 by Total Control Software
- # Licence: wxWindows license
- # Tags: phoenix-port, unittest, documented
- # ----------------------------------------------------------------------
def getColorList() -> list:
    """
    Return the names of all colours known to ``pymupdf.colors_wx_list()``.

    The names are returned in the order of that list; they are documented
    as upper-case strings.

    :rtype: list of strings
    """
    # each entry is a (name, red, green, blue) 4-tuple; keep only the name
    return [name for name, _red, _green, _blue in pymupdf.colors_wx_list()]
def getColorInfoList() -> list:
    """
    Return the list of (name, red, green, blue) tuples, where:

        name: upper-case color name.
        red, green, blue: integers in range 0..255.

    The list is whatever ``pymupdf.colors_wx_list()`` delivers.

    :rtype: list of tuples
    """
    color_table = pymupdf.colors_wx_list()
    return color_table
def getColor(name: str) -> tuple:
    """Look up a color by name and return it in PDF format.

    The name is lower-cased before the lookup in
    ``pymupdf.colors_pdf_dict()``.

    Returns:
        a triple of floats in range 0 to 1. In case of name-not-found,
        "white" (1, 1, 1) is returned.
    """
    white = (1, 1, 1)
    return pymupdf.colors_pdf_dict().get(name.lower(), white)
def getColorHSV(name: str) -> tuple:
    """Compute the hue, saturation, value triple of a named color.

    The name is upper-cased and searched in ``getColorList()``; the RGB
    values are taken from the matching ``getColorInfoList()`` entry.

    Returns:
        a triple (degree, percent, percent): hue and saturation are
        rounded to integers, value is rounded to one decimal place.
        If the name is not found, (-1, -1, -1) is returned.
    """
    try:
        infos = getColorInfoList()
        entry = infos[getColorList().index(name.upper())]
    except Exception:
        if g_exceptions_verbose:
            pymupdf.exception_info()
        return (-1, -1, -1)

    # scale the 0..255 integers to floats in 0..1
    red, green, blue = (entry[k] / 255.0 for k in (1, 2, 3))

    brightest = max(red, green, blue)
    darkest = min(red, green, blue)
    spread = brightest - darkest
    value = round(brightest * 100, 1)

    if spread == 0:  # a gray shade: hue is undefined, use 0
        hue = 0
    elif brightest == red:
        hue = 60.0 * (((green - blue) / spread) % 6)
    elif brightest == green:
        hue = 60.0 * (((blue - red) / spread) + 2)
    else:
        hue = 60.0 * (((red - green) / spread) + 4)

    saturation = 0 if brightest == 0 else spread / brightest

    return (int(round(hue)), int(round(saturation * 100)), value)
def _get_font_properties(doc: pymupdf.Document, xref: int) -> tuple:
    """Determine name, extension, type, ascender and descender of a font.

    The font is extracted with ``doc.extract_font(xref)``. Ascender and
    descender start out as 0.8 and -0.2 and are replaced by the values of
    a ``pymupdf.Font`` built from the font buffer (if present) or from the
    font name (if the extension is not "n/a"). Whenever no font object
    can be used, the current values are multiplied by 1.2 instead.

    Returns:
        The tuple (fontname, ext, stype, ascender, descender).
    """
    fontname, ext, stype, buffer = doc.extract_font(xref)
    asc, dsc = 0.8, -0.2  # fallback values

    if ext == "":  # NOTE(review): presumably "not a font" - confirm
        return fontname, ext, stype, asc, dsc

    if not buffer and ext == "n/a":
        # neither a buffer nor a usable name: just enlarge the defaults
        asc *= 1.2
        dsc *= 1.2
        return fontname, ext, stype, asc, dsc

    try:
        if buffer:
            font = pymupdf.Font(fontbuffer=buffer)
            asc = font.ascender
            dsc = font.descender
            bbox = font.bbox
            if asc - dsc < 1:
                # total height below 1: extend to a height of exactly 1,
                # using the bbox bottom if it is lower than the descender
                if bbox.y0 < dsc:
                    dsc = bbox.y0
                asc = 1 - dsc
        else:
            font = pymupdf.Font(fontname)
            asc = font.ascender
            dsc = font.descender
    except Exception:
        pymupdf.exception_info()
        asc *= 1.2
        dsc *= 1.2

    return fontname, ext, stype, asc, dsc
def get_char_widths(
    doc: pymupdf.Document, xref: int, limit: int = 256, idx: int = 0, fontdict: OptDict = None
) -> list:
    """Get list of glyph information of a font.

    Notes:
        The font must be provided by its XREF number. If it was already
        dealt with, it is recorded in doc.FontInfos. Otherwise an entry is
        inserted there.
        Finally the glyphs of the font are returned. This is a list of
        (glyph, width) where glyph is an integer controlling the char
        appearance, and width is a float controlling the char's spacing:
        width * fontsize is the actual space.
        For 'simple' fonts, glyph == ord(char) will usually be true.
        Exceptions are 'Symbol' and 'ZapfDingbats', for which data are
        provided directly here.

    Raises:
        ValueError: if the font extension is the empty string.
    """
    fontinfo = pymupdf.CheckFontInfo(doc, xref)
    if fontinfo is not None:  # font already recorded
        fontdict = fontinfo[1]
        glyphs = fontdict["glyphs"]
        simple = fontdict["simple"]
        ordering = fontdict["ordering"]
    else:  # not recorded yet: create the entry
        if fontdict is not None:
            name = fontdict["name"]
            ext = fontdict["ext"]
            stype = fontdict["type"]
            # NOTE(review): both values are recomputed further down
            ordering = fontdict["ordering"]
            simple = fontdict["simple"]
        else:
            name, ext, stype, asc, dsc = _get_font_properties(doc, xref)
            fontdict = {
                "name": name,
                "type": stype,
                "ext": ext,
                "ascender": asc,
                "descender": dsc,
            }

        if ext == "":
            raise ValueError("xref is not a font")

        # 'simple' fonts are recognized by their type
        simple = stype in ("Type1", "MMType1", "TrueType")

        # CJK fonts are recognized by their name
        ordering = -1
        cjk_families = (
            (("Fangti", "Ming"), 0),
            (("Heiti", "Song"), 1),
            (("Gothic", "Mincho"), 2),
            (("Dotum", "Batang"), 3),
        )
        for family_names, family_ordering in cjk_families:
            if name in family_names:
                ordering = family_ordering
                break

        fontdict["simple"] = simple

        if name == "ZapfDingbats":
            glyphs = pymupdf.zapf_glyphs
        elif name == "Symbol":
            glyphs = pymupdf.symbol_glyphs
        else:
            glyphs = None

        fontdict["glyphs"] = glyphs
        fontdict["ordering"] = ordering
        fontinfo = [xref, fontdict]
        doc.FontInfos.append(fontinfo)

    known = 0 if glyphs is None else len(glyphs)
    wanted = max(256, limit)  # never work with less than 256 entries
    if wanted <= known:
        return glyphs

    if ordering < 0:  # not a CJK font
        glyphs = doc._get_char_widths(
            xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], wanted, idx
        )
    else:  # CJK fonts use char codes and width = 1
        glyphs = None

    fontdict["glyphs"] = glyphs
    fontinfo[1] = fontdict
    pymupdf.UpdateFontInfo(doc, fontinfo)
    return glyphs
- class Shape:
- """Create a new shape."""
@staticmethod
def horizontal_angle(C, P):
    """Return the angle to the horizontal for the connection from C to P.

    The arcus sine of the unit vector's absolute y-component gives the
    absolute angle. Its inherent ambiguity is resolved by looking at the
    signs of the components of the unit vector of P - C.
    """
    direction = pymupdf.Point(P - C).unit  # unit vector 'C' -> 'P'
    angle = math.asin(abs(direction.y))  # absolute angle from horizontal

    if direction.x < 0:  # left half-plane
        if direction.y <= 0:
            return -(math.pi - angle)
        return math.pi - angle

    # right half-plane (including the vertical axis)
    if direction.y >= 0:
        return angle
    return -angle
def __init__(self, page: pymupdf.Page):
    """Set up an empty shape that belongs to a PDF page.

    Stores the page, its parent document, the mediabox size, the cropbox
    position and the page transformation matrix together with its
    inverse. All content buffers start out empty.

    Raises:
        ValueError: if the page's parent document is not a PDF.
    """
    pymupdf.CheckParent(page)
    self.page = page
    self.doc = page.parent
    if not self.doc.is_pdf:
        raise ValueError("is no PDF")

    mediabox_size = page.mediabox_size
    self.height = mediabox_size.y
    self.width = mediabox_size.x

    cropbox_position = page.cropbox_position
    self.x = cropbox_position.x
    self.y = cropbox_position.y

    self.pctm = page.transformation_matrix  # page transf. matrix
    self.ipctm = ~self.pctm  # inverted transf. matrix

    # nothing drawn or written yet
    self.draw_cont = self.text_cont = self.totalcont = ""
    self.last_point = self.rect = None
def updateRect(self, x):
    """Extend self.rect so that it also covers x.

    x is treated as a point if len(x) == 2 and as a rectangle otherwise.
    If no rectangle exists yet, it is created from x.
    """
    is_point = len(x) == 2

    if self.rect is None:
        self.rect = pymupdf.Rect(x, x) if is_point else pymupdf.Rect(x)
        return

    if is_point:
        pt = pymupdf.Point(x)
        left = right = pt.x
        top = bottom = pt.y
    else:
        box = pymupdf.Rect(x)
        left, top, right, bottom = box.x0, box.y0, box.x1, box.y1

    # enlarge the existing rectangle in place
    bounds = self.rect
    bounds.x0 = min(bounds.x0, left)
    bounds.y0 = min(bounds.y0, top)
    bounds.x1 = max(bounds.x1, right)
    bounds.y1 = max(bounds.y1, bottom)
def draw_line(self, p1: point_like, p2: point_like) -> pymupdf.Point:
    """Append a line from p1 to p2 to the draw commands.

    A "move to" (m) is only emitted when p1 differs from the point where
    the previous draw command ended.

    Returns:
        The new last point, i.e. p2 as a pymupdf.Point.
    """
    start = pymupdf.Point(p1)
    end = pymupdf.Point(p2)

    if not (self.last_point == start):
        move = _format_g(pymupdf.JM_TUPLE(start * self.ipctm)) + " m\n"
        self.draw_cont += move
        self.last_point = start
        self.updateRect(start)

    line = _format_g(pymupdf.JM_TUPLE(end * self.ipctm)) + " l\n"
    self.draw_cont += line
    self.updateRect(end)
    self.last_point = end
    return self.last_point
def draw_polyline(self, points: list) -> pymupdf.Point:
    """Append several connected line segments to the draw commands.

    The first point only causes a "move to" (m) if it differs from the
    current last point; every following point adds a "line to" (l).

    Returns:
        The new last point, i.e. the final entry of ``points``.
    """
    for index, raw in enumerate(points):
        vertex = pymupdf.Point(raw)
        if index > 0:
            self.draw_cont += _format_g(pymupdf.JM_TUPLE(vertex * self.ipctm)) + " l\n"
        elif not (self.last_point == vertex):
            self.draw_cont += _format_g(pymupdf.JM_TUPLE(vertex * self.ipctm)) + " m\n"
            self.last_point = vertex
        self.updateRect(raw)

    self.last_point = pymupdf.Point(points[-1])
    return self.last_point
def draw_bezier(
    self,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    p4: point_like,
) -> pymupdf.Point:
    """Append a standard cubic Bezier curve from p1 to p4.

    p2 and p3 are the control points. A "move to" (m) is only emitted
    when p1 differs from the current last point.

    Returns:
        The new last point, i.e. p4 as a pymupdf.Point.
    """
    start, ctrl1, ctrl2, end = (pymupdf.Point(p) for p in (p1, p2, p3, p4))

    if not (self.last_point == start):
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(start * self.ipctm)) + " m\n"

    # the "c" operator takes both control points and the end point
    operands = (
        list(ctrl1 * self.ipctm)
        + list(ctrl2 * self.ipctm)
        + list(end * self.ipctm)
    )
    self.draw_cont += _format_g(pymupdf.JM_TUPLE(operands)) + " c\n"

    for pt in (start, ctrl1, ctrl2, end):
        self.updateRect(pt)

    self.last_point = end
    return self.last_point
def draw_oval(self, tetra: typing.Union[quad_like, rect_like]) -> pymupdf.Point:
    """Append an ellipse drawn inside a rectangle or quad.

    The ellipse consists of four curves which connect the midpoints of
    the quad's edges, using the quad's corners as helper points.

    Raises:
        ValueError: if ``tetra`` does not have exactly four items.

    Returns:
        The new last point: the midpoint of the quad's left edge.
    """
    if len(tetra) != 4:
        raise ValueError("invalid arg length")

    # four numbers mean a rectangle, otherwise four points of a quad
    if hasattr(tetra[0], "__float__"):
        quad = pymupdf.Rect(tetra).quad
    else:
        quad = pymupdf.Quad(tetra)

    mid_top = quad.ul + (quad.ur - quad.ul) * 0.5
    mid_right = quad.ur + (quad.lr - quad.ur) * 0.5
    mid_bottom = quad.ll + (quad.lr - quad.ll) * 0.5
    mid_left = quad.ul + (quad.ll - quad.ul) * 0.5

    if not (self.last_point == mid_left):
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(mid_left * self.ipctm)) + " m\n"
        self.last_point = mid_left

    quarters = (
        (mid_left, quad.ll, mid_bottom),
        (mid_bottom, quad.lr, mid_right),
        (mid_right, quad.ur, mid_top),
        (mid_top, quad.ul, mid_left),
    )
    for begin, corner, finish in quarters:
        self.draw_curve(begin, corner, finish)

    self.updateRect(quad.rect)
    self.last_point = mid_left
    return self.last_point
def draw_circle(self, center: point_like, radius: float) -> pymupdf.Point:
    """Append a circle given by its center and radius.

    The circle is drawn as a 360 degree sector without connecting lines,
    starting at the point ``radius`` to the left of the center.

    Raises:
        ValueError: if the radius does not exceed pymupdf.EPSILON.

    Returns:
        The value returned by ``draw_sector``.
    """
    if not radius > pymupdf.EPSILON:
        raise ValueError("radius must be positive")
    middle = pymupdf.Point(center)
    start = middle - (radius, 0)
    return self.draw_sector(middle, start, 360, fullSector=False)
def draw_curve(
    self,
    p1: point_like,
    p2: point_like,
    p3: point_like,
) -> pymupdf.Point:
    """Append a curve from p1 to p3 using p2 as the only helper point.

    The two Bezier control points are placed on the lines p1 -> p2 and
    p3 -> p2, at the fraction ``kappa`` of the respective distance.

    Returns:
        The value returned by ``draw_bezier``.
    """
    kappa = 0.55228474983
    start = pymupdf.Point(p1)
    helper = pymupdf.Point(p2)
    end = pymupdf.Point(p3)
    ctrl1 = start + (helper - start) * kappa
    ctrl2 = end + (helper - end) * kappa
    return self.draw_bezier(start, ctrl1, ctrl2, end)
def draw_sector(
    self,
    center: point_like,
    point: point_like,
    beta: float,
    fullSector: bool = True,
) -> pymupdf.Point:
    """Append a circle sector to the draw commands.

    The arc starts at ``point``, belongs to the circle around ``center``
    and covers the angle ``beta``. It is approximated by Bezier curves:
    as many 90 degree pieces as fit, followed by one piece for the rest.

    Args:
        center: center of the circle.
        point: start point of the arc.
        beta: angle of the arc in degrees.
        fullSector: if true, additionally connect both arc ends with the
            center.

    Raises:
        ValueError: if the distance between ``center`` and ``point`` does
            not exceed pymupdf.EPSILON. The check happens before anything
            is appended, so the shape is left unchanged in this case.

    Returns:
        The new last point, i.e. the end point of the arc.
    """
    center = pymupdf.Point(center)
    point = pymupdf.Point(point)
    C = center
    P = point
    rad = abs(P - C)  # circle radius

    # validate first: a failing call must not leave a dangling "m" command
    # in draw_cont or a changed last_point behind
    if not rad > pymupdf.EPSILON:
        raise ValueError("radius must be positive")

    def command(operator, *operands):
        """Format the operands followed by a PDF path operator."""
        return _format_g(operands) + operator

    betar = math.radians(-beta)
    w360 = math.radians(math.copysign(360, betar)) * (-1)
    w90 = math.radians(math.copysign(90, betar))
    w45 = w90 / 2
    while abs(betar) > 2 * math.pi:
        betar += w360  # bring angle below 360 degrees

    if not (self.last_point == point):
        self.draw_cont += command(" m\n", *pymupdf.JM_TUPLE(point * self.ipctm))
        self.last_point = point

    Q = pymupdf.Point(0, 0)  # just make sure it exists
    alfa = self.horizontal_angle(center, point)

    while abs(betar) > abs(w90):  # draw 90 degree arcs
        q1 = C.x + math.cos(alfa + w90) * rad
        q2 = C.y + math.sin(alfa + w90) * rad
        Q = pymupdf.Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + w45) * rad / math.cos(w45)
        r2 = C.y + math.sin(alfa + w45) * rad / math.cos(w45)
        R = pymupdf.Point(r1, r2)  # crossing point of tangents
        kappah = (1 - math.cos(w45)) * 4 / 3 / abs(R - Q)
        kappa = kappah * abs(P - Q)
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += command(" c\n", *pymupdf.JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))
        betar -= w90  # reduce param angle by 90 deg
        alfa += w90  # advance start angle by 90 deg
        P = Q  # advance to arc end point

    # draw (remaining) arc
    if abs(betar) > 1e-3:  # significant degrees left?
        beta2 = betar / 2
        q1 = C.x + math.cos(alfa + betar) * rad
        q2 = C.y + math.sin(alfa + betar) * rad
        Q = pymupdf.Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + beta2) * rad / math.cos(beta2)
        r2 = C.y + math.sin(alfa + beta2) * rad / math.cos(beta2)
        R = pymupdf.Point(r1, r2)  # crossing point of tangents
        # kappa height is 4/3 of segment height
        kappah = (1 - math.cos(beta2)) * 4 / 3 / abs(R - Q)  # kappa height
        kappa = kappah * abs(P - Q) / (1 - math.cos(betar))
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += command(" c\n", *pymupdf.JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))

    if fullSector:
        # go back to the arc start, then to the center, then to the arc end
        self.draw_cont += command(" m\n", *pymupdf.JM_TUPLE(point * self.ipctm))
        self.draw_cont += command(" l\n", *pymupdf.JM_TUPLE(center * self.ipctm))
        self.draw_cont += command(" l\n", *pymupdf.JM_TUPLE(Q * self.ipctm))

    self.last_point = Q
    return self.last_point
def draw_rect(self, rect: rect_like, *, radius=None) -> pymupdf.Point:
    """Append a rectangle, optionally with rounded corners.

    Args:
        rect: the rectangle to draw.
        radius: if not None, the rectangle will have rounded corners.
            This is the radius of the curvature, given as percentage of
            the rectangle width or height. Valid are values 0 < v <= 0.5.
            For a sequence of two values, the corners will have different
            radii. Otherwise, the percentage will be computed from the
            shorter side. A value of (0.5, 0.5) will draw an ellipse.

    Raises:
        ValueError: if ``radius`` is neither a valid number nor a valid
            pair of numbers.

    Returns:
        The new last point.
    """
    r = pymupdf.Rect(rect)

    if radius is None:  # standard rectangle: one "re" command suffices
        operands = list(r.bl * self.ipctm) + [r.width, r.height]
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(operands)) + " re\n"
        self.updateRect(r)
        self.last_point = r.tl
        return self.last_point

    # rounded corners requested. This requires 1 or 2 values, each
    # with 0 < value <= 0.5
    if hasattr(radius, "__float__"):
        if radius <= 0 or radius > 0.5:
            raise ValueError(f"bad radius value {radius}.")
        d = min(r.width, r.height) * radius
        px, py = (d, 0), (0, d)
    elif hasattr(radius, "__len__") and len(radius) == 2:
        rx, ry = radius
        px = (rx * r.width, 0)
        py = (0, ry * r.height)
        if min(rx, ry) <= 0 or max(rx, ry) > 0.5:
            raise ValueError(f"bad radius value {radius}.")
    else:
        raise ValueError(f"bad radius value {radius}.")

    tl, bl, br, tr = r.tl, r.bl, r.br, r.tr
    # each edge is a line followed by the curve around the next corner:
    # (end of line, corner used as helper point, end of curve)
    edges = (
        (bl - py, bl, bl + px),
        (br - px, br, br - py),
        (tr + py, tr, tr - px),
        (tl + px, tl, tl + py),
    )
    lp = tl + py  # start below the top-left corner
    for line_end, corner, curve_end in edges:
        lp = self.draw_line(lp, line_end)
        lp = self.draw_curve(lp, corner, curve_end)

    self.last_point = lp
    self.updateRect(r)
    return self.last_point
def draw_quad(self, quad: quad_like) -> pymupdf.Point:
    """Append a quadrilateral, drawn as a polyline.

    The corners are visited in the order ul, ll, lr, ur and back to ul.

    Returns:
        The value returned by ``draw_polyline``.
    """
    q = pymupdf.Quad(quad)
    corners = [q.ul, q.ll, q.lr, q.ur, q.ul]
    return self.draw_polyline(corners)
def draw_zigzag(
    self,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
) -> pymupdf.Point:
    """Append a zig-zagged line from p1 to p2.

    The distance is divided into a multiple of four steps. Every odd step
    yields an edge, alternating between the two sides of the connection.

    Raises:
        ValueError: if the points are too close for one full phase.

    Returns:
        p2 as a pymupdf.Point.
    """
    p1 = pymupdf.Point(p1)
    p2 = pymupdf.Point(p2)
    rad = abs(p2 - p1)  # distance of points
    cnt = 4 * int(round(rad / (4 * breadth), 0))  # always take full phases
    if cnt < 4:
        raise ValueError("points too close")
    mb = rad / cnt  # revised breadth
    matrix = pymupdf.Matrix(pymupdf.util_hor_matrix(p1, p2))  # normalize line to x-axis
    i_mat = ~matrix  # get original position

    # only odd steps produce edges: i % 4 == 1 "above", i % 4 == 3 "below"
    edges = [
        pymupdf.Point(i, -1 if i % 4 == 1 else 1) * mb * i_mat
        for i in range(1, cnt, 2)
    ]
    self.draw_polyline([p1] + edges + [p2])  # add start and end points
    return p2
def draw_squiggle(
    self,
    p1: point_like,
    p2: point_like,
    breadth=2,
) -> pymupdf.Point:
    """Append a squiggly line from p1 to p2.

    The distance is divided into a multiple of four steps. Points on the
    connection alternate with helper points "above" and "below" it, and
    consecutive point triples are joined with ``draw_curve``.

    Raises:
        ValueError: if the points are too close for one full phase.

    Returns:
        p2 as a pymupdf.Point.
    """
    p1 = pymupdf.Point(p1)
    p2 = pymupdf.Point(p2)
    rad = abs(p2 - p1)  # distance of points
    cnt = 4 * int(round(rad / (4 * breadth), 0))  # always take full phases
    if cnt < 4:
        raise ValueError("points too close")
    mb = rad / cnt  # revised breadth
    matrix = pymupdf.Matrix(pymupdf.util_hor_matrix(p1, p2))  # normalize line to x-axis
    i_mat = ~matrix  # get original position
    k = 2.4142135623765633  # y of draw_curve helper point

    waypoints = [p1]
    for i in range(1, cnt):
        phase = i % 4
        if phase == 1:  # point "above" connection
            p = pymupdf.Point(i, -k) * mb
        elif phase == 3:  # point "below" connection
            p = pymupdf.Point(i, k) * mb
        else:  # else on connection line
            p = pymupdf.Point(i, 0) * mb
        waypoints.append(p * i_mat)
    waypoints.append(p2)

    # every curve uses three consecutive points and ends where the next starts
    for first in range(0, len(waypoints) - 2, 2):
        self.draw_curve(
            waypoints[first], waypoints[first + 1], waypoints[first + 2]
        )
    return p2
- # ==============================================================================
- # Shape.insert_text
- # ==============================================================================
def insert_text(
    self,
    point: point_like,
    buffer: typing.Union[str, list],
    *,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: bool = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    render_mode: int = 0,
    border_width: float = 0.05,
    miter_limit: float = 1,
    rotate: int = 0,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> int:
    """Append text lines starting at a point to the shape's text content.

    Args:
        point: position of the first character.
        buffer: the text, either one string (split into lines) or a
            list / tuple of lines.
        fontsize: font size.
        lineheight: factor applied to fontsize to get the line height.
            If omitted, the font's ascender and descender are used.
        fontname: font reference name, a leading "/" is removed.
        fontfile, set_simple, encoding: passed on to Page.insert_font.
        color: stroke color.
        fill: fill color. Defaults to ``color`` for render mode 0.
        render_mode: text rendering mode (PDF operator "Tr").
        border_width: line width as a fraction of the font size, only
            written for render modes > 0.
        miter_limit: miter limit, only written for render modes > 0.
        rotate: rotation, must be a multiple of 90.
        morph: morph the text with a fixpoint and a matrix.
        stroke_opacity, fill_opacity: passed on to Page._set_opacity.
        oc: passed on to Page._get_optional_content.

    Raises:
        ValueError: if ``rotate`` is not a multiple of 90.

    Returns:
        Number of lines appended to ``self.text_cont``. Zero if the
        buffer is empty or its characters cannot be analyzed.
    """
    # ensure 'text' is a list of strings, worth dealing with
    if not buffer:
        return 0
    text = buffer if type(buffer) in (list, tuple) else buffer.splitlines()
    if len(text) == 0:
        return 0

    point = pymupdf.Point(point)
    try:
        maxcode = max(ord(c) for c in " ".join(text))
    except Exception:
        pymupdf.exception_info()
        return 0

    # ensure valid 'fontname'
    fname = fontname[1:] if fontname.startswith("/") else fontname

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontdict = pymupdf.CheckFontInfo(self.doc, xref)[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]

    if lineheight:
        lheight = fontsize * lineheight
    elif ascender - descender <= 1:
        lheight = fontsize * 1.2
    else:
        lheight = fontsize * (ascender - descender)

    if maxcode > 255:
        glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    else:
        glyphs = fontdict["glyphs"]

    # simple fonts other than Symbol / ZapfDingbats need no glyph table
    if simple and bfname not in ("Symbol", "ZapfDingbats"):
        glyph_table = None
    else:
        glyph_table = glyphs
    text = [pymupdf.getTJstr(t, glyph_table, simple, ordering) for t in text]

    color_str = pymupdf.ColorCode(color, "c")
    fill_str = pymupdf.ColorCode(fill, "f")
    if not fill and render_mode == 0:  # ensure fill color when 0 Tr
        fill = color
        fill_str = pymupdf.ColorCode(color, "f")

    morphing = pymupdf.CheckMorph(morph)
    if rotate % 90 != 0:
        raise ValueError("bad rotate value")
    rot = rotate % 360  # text rotate = 0, 90, 270, 180

    height = self.height
    width = self.width

    if morphing:
        m1 = pymupdf.Matrix(1, 0, 0, 1, morph[0].x + self.x, height - morph[0].y - self.y)
        mat = ~m1 * morph[1] * m1
        cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""

    # start of the first character and available space per rotation
    if rot == 90:
        left = height - point.y - self.y
        top = -point.x - self.x
        cm += "0 1 -1 0 0 0 cm\n"  # rotates 90 deg counter-clockwise
        space = width - abs(top)
    elif rot == 270:
        left = -height + point.y + self.y
        top = point.x + self.x
        cm += "0 -1 1 0 0 0 cm\n"  # rotates 90 deg clockwise
        space = abs(top)
    elif rot == 180:
        left = -point.x - self.x
        top = -height + point.y + self.y
        cm += "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
        space = abs(point.y + self.y)
    else:  # no rotation
        top = height - point.y - self.y
        left = point.x + self.x
        space = top

    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    alpha = "" if alpha is None else "/%s gs\n" % alpha

    pieces = [
        f"\nq\n{bdc}{alpha}BT\n{cm}1 0 0 1 {_format_g((left, top))} Tm\n/{fname} {_format_g(fontsize)} Tf "
    ]
    if render_mode > 0:
        pieces.append("%i Tr " % render_mode)
        pieces.append(_format_g(border_width * fontsize) + " w ")
        if miter_limit is not None:
            pieces.append(_format_g(miter_limit) + " M ")
    if color is not None:
        pieces.append(color_str)
    if fill is not None:
        pieces.append(fill_str)

    # =========================================================================
    # start text insertion
    # =========================================================================
    pieces.append(text[0])
    nlines = 1  # set output line counter
    if len(text) > 1:
        # the first line also sets the line distance for the "T*" operator
        pieces.append(f"TJ\n0 -{_format_g(lheight)} TD\n")
    else:
        pieces.append('TJ')

    for i, line in enumerate(text[1:], start=1):
        if space < lheight:
            break  # no space left on page
        if i > 1:
            pieces.append("\nT* ")
        pieces.append(line + 'TJ')
        space -= lheight
        nlines += 1

    pieces.append("\nET\n%sQ\n" % emc)
    # =========================================================================
    # end of text insertion
    # =========================================================================
    # update the text content of the shape
    self.text_cont += "".join(pieces)
    return nlines
- # ==============================================================================
- # Shape.insert_textbox
- # ==============================================================================
def insert_textbox(
    self,
    rect: rect_like,
    buffer: typing.Union[str, list],
    *,
    fontname: OptStr = "helv",
    fontfile: OptStr = None,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    set_simple: bool = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    expandtabs: int = 1,
    border_width: float = 0.05,
    miter_limit: float = 1,
    align: int = 0,
    render_mode: int = 0,
    rotate: int = 0,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> float:
    """Insert text into a given rectangle.

    The text is broken into lines that fit the rectangle width (words that
    are too long are split character by character). Nothing is written if
    the resulting lines do not fit the rectangle height.

    Args:
        rect -- the textbox to fill
        buffer -- text to be inserted (string, or list / tuple of strings
            which are joined with line breaks)
        fontname -- a Base-14 font, font name or '/name'
        fontfile -- name of a font file
        fontsize -- font size
        lineheight -- overwrite the font property
        set_simple -- passed through to page.insert_font
        encoding -- passed through to page.insert_font
        color -- RGB stroke color triple
        fill -- RGB fill color triple
        expandtabs -- handles tabulators with string function
        border_width -- thickness of glyph borders as percentage of fontsize
            (only emitted for render_mode > 0)
        miter_limit -- value for the 'M' operator (only emitted for
            render_mode > 0, skipped if None)
        align -- left, center, right, justified
        render_mode -- text rendering control
        rotate -- 0, 90, 180, or 270 degrees
        morph -- morph box with a matrix and a fixpoint
        stroke_opacity, fill_opacity -- passed to page._set_opacity
        oc -- passed to page._get_optional_content
    Returns:
        Non-negative float: unused space of the rectangle in line progress
        direction. Negative float: the deficit - in this case no text has
        been written.
    Raises:
        ValueError: rect is empty or infinite, or rotate is no multiple
            of 90.
    """
    rect = pymupdf.Rect(rect)
    if rect.is_empty or rect.is_infinite:
        raise ValueError("text box must be finite and not empty")

    color_str = pymupdf.ColorCode(color, "c")
    fill_str = pymupdf.ColorCode(fill, "f")
    if fill is None and render_mode == 0:  # ensure fill color for 0 Tr
        fill = color
        fill_str = pymupdf.ColorCode(color, "f")

    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    # determine opacity / transparency
    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is None:
        alpha = ""
    else:
        alpha = "/%s gs\n" % alpha

    if rotate % 90 != 0:
        raise ValueError("rotate must be multiple of 90")

    rot = rotate % 360  # Python's modulo already maps negatives to 0..359

    # is buffer worth of dealing with?
    if not bool(buffer):
        return rect.height if rot in (0, 180) else rect.width

    # create one string from buffer - it is split into lines further down
    if isinstance(buffer, (list, tuple)):
        t0 = "\n".join(buffer)
    else:
        t0 = buffer

    # A non-empty sequence may still contain no text at all (e.g. [""]).
    # Treat this like an empty buffer instead of failing in max() below.
    if not t0:
        return rect.height if rot in (0, 180) else rect.width

    cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates counter-clockwise
    cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates clockwise
    cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
    height = self.height

    fname = fontname
    if fname.startswith("/"):
        fname = fname[1:]

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontinfo = pymupdf.CheckFontInfo(self.doc, xref)

    fontdict = fontinfo[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]

    if lineheight:
        lheight_factor = lineheight
    elif ascender - descender <= 1:
        lheight_factor = 1.2
    else:
        lheight_factor = ascender - descender
    lheight = fontsize * lheight_factor

    maxcode = max(ord(c) for c in t0)
    # replace invalid char codes for simple fonts
    if simple and maxcode > 255:
        t0 = "".join([c if ord(c) < 256 else "?" for c in t0])

    t0 = t0.splitlines()

    glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    if simple and bfname not in ("Symbol", "ZapfDingbats"):
        tj_glyphs = None
    else:
        tj_glyphs = glyphs

    # ----------------------------------------------------------------------
    # calculate pixel length of a string
    # ----------------------------------------------------------------------
    def pixlen(x):
        """Calculate pixel length of x."""
        if ordering < 0:
            return sum([glyphs[ord(c)][1] for c in x]) * fontsize
        else:
            return len(x) * fontsize

    # ---------------------------------------------------------------------

    if ordering < 0:
        blen = glyphs[32][1] * fontsize  # pixel size of space character
    else:
        blen = fontsize

    text = ""  # output buffer

    if pymupdf.CheckMorph(morph):
        m1 = pymupdf.Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        mat = ~m1 * morph[1] * m1
        cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""

    # ---------------------------------------------------------------------
    # adjust for text orientation / rotation
    # ---------------------------------------------------------------------
    progr = 1  # direction of line progress
    c_pnt = pymupdf.Point(0, fontsize * ascender)  # used for line progress
    if rot == 0:  # normal orientation
        point = rect.tl + c_pnt  # line 1 is 'lheight' below top
        maxwidth = rect.width  # pixels available in one line
        maxheight = rect.height  # available text height

    elif rot == 90:  # rotate counter clockwise
        c_pnt = pymupdf.Point(fontsize * ascender, 0)  # progress in x-direction
        point = rect.bl + c_pnt  # line 1 'lheight' away from left
        maxwidth = rect.height  # pixels available in one line
        maxheight = rect.width  # available text height
        cm += cmp90

    elif rot == 180:  # text upside down
        # progress upwards in y direction
        c_pnt = -pymupdf.Point(0, fontsize * ascender)
        point = rect.br + c_pnt  # line 1 'lheight' above bottom
        maxwidth = rect.width  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight = rect.height  # available text height
        cm += cm180

    else:  # rotate clockwise (270 or -90)
        # progress from right to left
        c_pnt = -pymupdf.Point(fontsize * ascender, 0)
        point = rect.tr + c_pnt  # line 1 'lheight' left of right
        maxwidth = rect.height  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight = rect.width  # available text height
        cm += cmm90

    # =====================================================================
    # line loop
    # =====================================================================
    just_tab = []  # 'justify' indicators per line

    for i, line in enumerate(t0):
        line_t = line.expandtabs(expandtabs).split(" ")  # split into words
        lbuff = ""  # init line buffer
        rest = maxwidth  # available line pixels
        # =================================================================
        # word loop
        # =================================================================
        for word in line_t:
            pl_w = pixlen(word)  # pixel len of word
            if rest >= pl_w:  # does it fit on the line?
                lbuff += word + " "  # yes, append word
                rest -= pl_w + blen  # update available line space
                continue  # next word

            # word doesn't fit - output line (if not empty)
            if lbuff:
                lbuff = lbuff.rstrip() + "\n"  # line full, append line break
                text += lbuff  # append to total text
                just_tab.append(True)  # can align-justify

            lbuff = ""  # re-init line buffer
            rest = maxwidth  # re-init avail. space

            if pl_w <= maxwidth:  # word shorter than 1 line?
                lbuff = word + " "  # start the line with it
                rest = maxwidth - pl_w - blen  # update free space
                continue

            # long word: split across multiple lines - char by char ...
            if len(just_tab) > 0:
                just_tab[-1] = False  # cannot align-justify
            for c in word:
                if pixlen(lbuff) <= maxwidth - pixlen(c):
                    lbuff += c
                else:  # line full
                    lbuff += "\n"  # close line
                    text += lbuff  # append to text
                    just_tab.append(False)  # cannot align-justify
                    lbuff = c  # start new line with this char

            lbuff += " "  # finish long word
            rest = maxwidth - pixlen(lbuff)  # long word stored

        if lbuff:  # unprocessed line content?
            text += lbuff.rstrip()  # append to text
            just_tab.append(False)  # cannot align-justify

        if i < len(t0) - 1:  # not the last line?
            text += "\n"  # insert line break

    # compute used part of the textbox
    if text.endswith("\n"):
        text = text[:-1]
    lb_count = text.count("\n") + 1  # number of lines written

    # text height = line count * line height plus one descender value
    text_height = lheight * lb_count - descender * fontsize

    more = text_height - maxheight  # difference to height limit
    if more > pymupdf.EPSILON:  # landed too much outside rect
        return (-1) * more  # return deficit, don't output

    more = abs(more)
    if more < pymupdf.EPSILON:
        more = 0  # don't bother with epsilons

    nres = "\nq\n%s%sBT\n" % (bdc, alpha) + cm  # initialize output buffer

    # center, right, justify: output each line with its own specifics
    text_t = text.splitlines()  # split text in lines again
    just_tab[-1] = False  # never justify last line
    for i, t in enumerate(text_t):
        spacing = 0
        pl = maxwidth - pixlen(t)  # length of empty line part
        pnt = point + c_pnt * (i * lheight_factor)  # text start of line
        if align == 1:  # center: right shift by half width
            if rot in (0, 180):
                pnt = pnt + pymupdf.Point(pl / 2, 0) * progr
            else:
                pnt = pnt - pymupdf.Point(0, pl / 2) * progr
        elif align == 2:  # right: right shift by full width
            if rot in (0, 180):
                pnt = pnt + pymupdf.Point(pl, 0) * progr
            else:
                pnt = pnt - pymupdf.Point(0, pl) * progr
        elif align == 3:  # justify
            spaces = t.count(" ")  # number of spaces in line
            if spaces > 0 and just_tab[i]:  # if any, and we may justify
                spacing = pl / spaces  # make every space this much larger
            else:
                spacing = 0  # keep normal space length

        # express the line start in the (possibly rotated) PDF coordinates
        top = height - pnt.y - self.y
        left = pnt.x + self.x
        if rot == 90:
            left = height - pnt.y - self.y
            top = -pnt.x - self.x
        elif rot == 270:
            left = -height + pnt.y + self.y
            top = pnt.x + self.x
        elif rot == 180:
            left = -pnt.x - self.x
            top = -height + pnt.y + self.y

        nres += f"1 0 0 1 {_format_g((left, top))} Tm /{fname} {_format_g(fontsize)} Tf "

        if render_mode > 0:
            nres += "%i Tr " % render_mode
            nres += _format_g(border_width * fontsize) + " w "
            if miter_limit is not None:
                nres += _format_g(miter_limit) + " M "
        if align == 3:
            nres += _format_g(spacing) + " Tw "

        if color is not None:
            nres += color_str
        if fill is not None:
            nres += fill_str
        nres += "%sTJ\n" % pymupdf.getTJstr(t, tj_glyphs, simple, ordering)

    nres += "ET\n%sQ\n" % emc

    self.text_cont += nres
    self.updateRect(rect)
    return more
def finish(
    self,
    width: float = 1,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    lineCap: int = 0,
    lineJoin: int = 0,
    dashes: OptStr = None,
    even_odd: bool = False,
    morph: OptSeq = None,
    closePath: bool = True,
    fill_opacity: float = 1,
    stroke_opacity: float = 1,
    oc: int = 0,
) -> None:
    """Finish the current drawing segment.

    Notes:
        Wraps the pending draw commands with colors, opacity, dashes,
        line style and width, or morphing, optionally closes the path by
        connecting last to first point, and moves the result to the
        shape's total content. Does nothing if no draw commands are
        pending.
    """
    if self.draw_cont == "":  # nothing was drawn: no-op
        return

    # a zero width and a missing stroke color imply each other
    if width == 0:
        color = None
    elif color is None:
        width = 0

    stroke_cmd = pymupdf.ColorCode(color, "c")  # stroke color command
    fill_cmd = pymupdf.ColorCode(fill, "f")  # fill color command

    oc_name = self.page._get_optional_content(oc)
    if oc_name is None:
        emc = ""
    else:
        self.draw_cont = "/OC /%s BDC\n" % oc_name + self.draw_cont
        emc = "EMC\n"

    gstate = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if gstate is not None:
        self.draw_cont = "/%s gs\n" % gstate + self.draw_cont

    if width not in (0, 1):
        self.draw_cont += _format_g(width) + " w\n"

    # line cap, line join and dashing go in front of the path commands
    if lineCap != 0:
        self.draw_cont = "%i J\n" % lineCap + self.draw_cont
    if lineJoin != 0:
        self.draw_cont = "%i j\n" % lineJoin + self.draw_cont
    if dashes not in (None, "", "[] 0"):
        self.draw_cont = "%s d\n" % dashes + self.draw_cont

    if closePath:
        self.draw_cont += "h\n"
        self.last_point = None

    if color is not None:
        self.draw_cont += stroke_cmd

    # select the path painting operator
    if fill is None:
        paint = "S\n"
    else:
        self.draw_cont += fill_cmd
        if color is None:
            paint = "f*\n" if even_odd else "f\n"
        else:
            paint = "B*\n" if even_odd else "B\n"
    self.draw_cont += paint

    self.draw_cont += emc

    if pymupdf.CheckMorph(morph):
        shift = pymupdf.Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        morph_mat = ~shift * morph[1] * shift
        self.draw_cont = (
            _format_g(pymupdf.JM_TUPLE(morph_mat)) + " cm\n" + self.draw_cont
        )

    # store the segment and get ready for the next one
    self.totalcont += "\nq\n" + self.draw_cont + "Q\n"
    self.draw_cont = ""
    self.last_point = None
def commit(self, overlay: bool = True) -> None:
    """Update the page's /Contents object with Shape data.

    The argument controls whether data appear in foreground (default)
    or background.

    Notes:
        The finished draw segments plus the text content are written as a
        new /Contents stream of the page. Afterwards the Shape buffers are
        reset, so the object can be re-used. If updating the page raises
        an exception, the buffers are left untouched.
    """
    pymupdf.CheckParent(self.page)  # doc may have died meanwhile

    # Encode into a local variable: the string buffers must stay strings,
    # otherwise a failure below would leave the Shape unusable.
    content = (self.totalcont + self.text_cont).encode()

    if content:
        if overlay:
            self.page.wrap_contents()  # ensure a balanced graphics state
        # make /Contents object with dummy stream
        xref = pymupdf.TOOLS._insert_contents(self.page, b" ", overlay)
        # update it with potential compression
        self.doc.update_stream(xref, content)

    # clean up for potential re-use
    self.last_point = None
    self.rect = None
    self.draw_cont = ""
    self.text_cont = ""
    self.totalcont = ""
def apply_redactions(
    page: pymupdf.Page, images: int = 2, graphics: int = 1, text: int = 0
) -> bool:
    """Apply the redaction annotations of the page.

    Args:
        page: the PDF page.
        images:
            0 - ignore images
            1 - remove all overlapping images
            2 - blank out overlapping image parts
            3 - remove image unless invisible
        graphics:
            0 - ignore graphics
            1 - remove graphics if contained in rectangle
            2 - remove all overlapping graphics
        text:
            0 - remove text
            1 - ignore text
    Returns:
        False if the page has no redaction annotations, else True.
    Raises:
        ValueError: document is closed, encrypted or no PDF, or the
            redactions could not be applied.
    """

    def center_rect(annot_rect, new_text, font, fsize):
        """Calculate minimal sub-rectangle for the overlay text.

        Notes:
            Because 'insert_textbox' supports no vertical text centering,
            we calculate an approximate number of lines here and return a
            sub-rect with smaller height, which should still be sufficient.
        Args:
            annot_rect: the annotation rectangle
            new_text: the text to insert.
            font: the fontname. Must be one of the CJK or Base-14 set, else
                the rectangle is returned unchanged.
            fsize: the fontsize
        Returns:
            A rectangle to use instead of the annot rectangle. The annot
            rectangle itself is never modified.
        """
        if not new_text or annot_rect.width <= pymupdf.EPSILON:
            return annot_rect
        try:
            text_width = pymupdf.get_text_length(new_text, font, fsize)
        except (ValueError, mupdf.FzErrorBase):  # unsupported font
            if g_exceptions_verbose:
                pymupdf.exception_info()
            return annot_rect
        line_height = fsize * 1.2
        limit = annot_rect.width
        h = math.ceil(text_width / limit) * line_height  # estimate rect height
        if h >= annot_rect.height:
            return annot_rect
        # work on a copy - do not change the caller's rectangle
        r = pymupdf.Rect(annot_rect)
        r.y0 = (annot_rect.tl.y + annot_rect.bl.y - h) * 0.5
        return r

    pymupdf.CheckParent(page)
    doc = page.parent
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("document closed or encrypted")
    if not doc.is_pdf:
        raise ValueError("is no PDF")

    # save the values of all redaction annotations before they are applied
    redact_annots = [
        annot._get_redact_values()
        for annot in page.annots(
            types=(pymupdf.PDF_ANNOT_REDACT,)  # pylint: disable=no-member
        )
    ]

    if not redact_annots:  # any redactions on this page?
        return False  # no redactions

    rc = page._apply_redactions(text, images, graphics)  # call MuPDF
    if not rc:  # should not happen really
        raise ValueError("Error applying redactions.")

    # now write replacement text in old redact rectangles
    shape = page.new_shape()
    for redact in redact_annots:
        annot_rect = redact["rect"]
        fill = redact["fill"]
        if fill:
            shape.draw_rect(annot_rect)  # colorize the rect background
            shape.finish(fill=fill, color=fill)
        if "text" in redact:  # if we also have text
            new_text = redact["text"]
            align = redact.get("align", 0)
            fname = redact["fontname"]
            fsize = redact["fontsize"]
            color = redact["text_color"]
            # try finding vertical centered sub-rect
            trect = center_rect(annot_rect, new_text, fname, fsize)

            rc = -1
            while rc < 0 and fsize >= 4:  # while not enough room
                # (re-) try insertion
                rc = shape.insert_textbox(
                    trect,
                    new_text,
                    fontname=fname,
                    fontsize=fsize,
                    color=color,
                    align=align,
                )
                fsize -= 0.5  # reduce font if unsuccessful
    shape.commit()  # append new contents object
    return True
- # ------------------------------------------------------------------------------
- # Remove potentially sensitive data from a PDF. Similar to the Adobe
- # Acrobat 'sanitize' function
- # ------------------------------------------------------------------------------
def scrub(
    doc: pymupdf.Document,
    attached_files: bool = True,
    clean_pages: bool = True,
    embedded_files: bool = True,
    hidden_text: bool = True,
    javascript: bool = True,
    metadata: bool = True,
    redactions: bool = True,
    redact_images: int = 0,
    remove_links: bool = True,
    reset_fields: bool = True,
    reset_responses: bool = True,
    thumbnails: bool = True,
    xml_metadata: bool = True,
) -> None:
    """Remove potentially sensitive data from a PDF.

    Args:
        doc: the PDF document.
        attached_files: empty the content of file attachment annotations.
        clean_pages: clean the pages' /Contents. If False, 'hidden_text'
            and 'redactions' are switched off, too.
        embedded_files: delete embedded files.
        hidden_text: remove text written with render mode 3 ("3 Tr").
        javascript: replace /JavaScript actions with an empty script.
        metadata: remove the standard metadata.
        redactions: apply redaction annotations of the pages.
        redact_images: 'images' argument for page.apply_redactions.
        remove_links: delete all links of the pages.
        reset_fields: reset the form fields (widgets) of the pages.
        reset_responses: delete responses of the annotations.
        thumbnails: remove page thumbnails (/Thumb).
        xml_metadata: remove XML metadata.
    Raises:
        ValueError: document is no PDF, closed or encrypted, or an xref
            without object definition is encountered.
    """

    def remove_hidden(cont_lines):
        """Remove hidden text from a PDF page.

        Args:
            cont_lines: list of lines with /Contents content. Should have status
                from after page.cleanContents().

        Returns:
            List of /Contents lines from which hidden text has been removed,
            or None if no text suppression operator was found.

        Notes:
            The input must have been created after the page's /Contents object(s)
            have been cleaned with page.cleanContents(). This ensures a standard
            formatting: one command per line, single spaces between operators.
            This allows for drastic simplification of this code.
        """
        out_lines = []  # will return this
        in_text = False  # indicate if within BT/ET object
        suppress = False  # indicate text suppression active
        make_return = False
        for line in cont_lines:
            if line == b"BT":  # start of text object
                in_text = True  # switch on
                out_lines.append(line)  # output it
                continue
            if line == b"ET":  # end of text object
                in_text = False  # switch off
                out_lines.append(line)  # output it
                continue
            if line == b"3 Tr":  # text suppression operator
                suppress = True  # switch on
                make_return = True
                continue
            # Note: indexing bytes yields an int, so compare via startswith
            if line[-2:] == b"Tr" and not line.startswith(b"3"):
                suppress = False  # text rendering changed
                out_lines.append(line)
                continue
            if line == b"Q":  # unstack command also switches off
                suppress = False
                out_lines.append(line)
                continue
            if suppress and in_text:  # suppress hidden lines
                continue
            out_lines.append(line)
        if make_return:
            return out_lines
        return None

    if not doc.is_pdf:  # only works for PDF
        raise ValueError("is no PDF")
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("closed or encrypted doc")

    if not clean_pages:
        hidden_text = False
        redactions = False

    if metadata:
        doc.set_metadata({})  # remove standard metadata

    for page in doc:
        if reset_fields:
            # reset form fields (widgets)
            for widget in page.widgets():
                widget.reset()

        if remove_links:
            for link in page.get_links():  # remove all links
                page.delete_link(link)

        found_redacts = False
        for annot in page.annots():
            if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files:
                annot.update_file(buffer_=b" ")  # set file content to empty
            if reset_responses:
                annot.delete_responses()
            if annot.type[0] == pymupdf.PDF_ANNOT_REDACT:  # pylint: disable=no-member
                found_redacts = True

        if redactions and found_redacts:
            page.apply_redactions(images=redact_images)

        # Remove the page thumbnail first: this must not depend on whether
        # the page contents are cleaned or whether the page has contents.
        if thumbnails and doc.xref_get_key(page.xref, "Thumb")[0] != "null":
            doc.xref_set_key(page.xref, "Thumb", "null")

        if not clean_pages:  # implies hidden_text is off
            continue  # done with the page

        page.clean_contents()
        if not hidden_text:
            continue
        contents = page.get_contents()
        if not contents:
            continue
        xref = contents[0]  # only one b/o cleaning!
        cont_lines = remove_hidden(doc.xref_stream(xref).splitlines())
        if cont_lines:  # something was actually removed
            doc.update_stream(xref, b"\n".join(cont_lines))  # rewrite /Contents

    # pages are scrubbed, now perform document-wide scrubbing
    # remove embedded files
    if embedded_files:
        for name in doc.embfile_names():
            doc.embfile_del(name)

    if xml_metadata:
        doc.del_xml_metadata()

    # walk through all objects only if there is something to look for
    if not (xml_metadata or javascript):
        xref_limit = 0
    else:
        xref_limit = doc.xref_length()

    for xref in range(1, xref_limit):
        if not doc.xref_object(xref):
            msg = "bad xref %i - clean PDF before scrubbing" % xref
            raise ValueError(msg)

        if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript":
            # a /JavaScript action object
            obj = "<</S/JavaScript/JS()>>"  # replace with a null JavaScript
            doc.update_object(xref, obj)  # update this object
            continue  # no further handling

        if not xml_metadata:
            continue

        if doc.xref_get_key(xref, "Type")[1] == "/Metadata":
            # delete any metadata object directly
            doc.update_object(xref, "<<>>")
            doc.update_stream(xref, b"deleted", new=True)
            continue

        if doc.xref_get_key(xref, "Metadata")[0] != "null":
            doc.xref_set_key(xref, "Metadata", "null")
def _show_fz_text(text):
    """Return a summary string with the span and character counts of 'text'.

    Walks the linked list of spans starting at 'text.m_internal.head' via
    each span's 'next' attribute and adds up the 'len' attributes.
    """
    span_count = 0
    char_count = 0
    current = text.m_internal.head
    while current:
        span_count += 1
        char_count += current.len
        current = current.next
    return f'num_spans={span_count} num_chars={char_count}'
def fill_textbox(
    writer: pymupdf.TextWriter,
    rect: rect_like,
    text: typing.Union[str, list],
    pos: point_like = None,
    font: typing.Optional[pymupdf.Font] = None,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    align: int = 0,
    warn: bool = None,
    right_to_left: bool = False,
    small_caps: bool = False,
) -> tuple:
    """Fill a rectangle with text.

    Args:
        writer: pymupdf.TextWriter object (= "self")
        rect: rect-like to receive the text.
        text: string or list/tuple of strings.
        pos: point-like start position of first word.
        font: pymupdf.Font object (default pymupdf.Font('helv')).
        fontsize: the fontsize.
        lineheight: overwrite the font property
        align: (int) 0 = left, 1 = center, 2 = right, 3 = justify
        warn: (bool) text overflow action: none, warn, or exception
        right_to_left: (bool) indicate right-to-left language.
        small_caps: (bool) passed to the font's length functions and to
            writer.append.
    Returns:
        List of the lines that did not fit in the rectangle. Each item
        is a tuple (text, length).
    Raises:
        ValueError: the rectangle is empty, 'pos' is outside of the
            rectangle, or lines do not fit and 'warn' is False.
    """
    rect = pymupdf.Rect(rect)
    if rect.is_empty:
        raise ValueError("fill rect must not empty.")
    if not isinstance(font, pymupdf.Font):
        font = pymupdf.Font("helv")

    def textlen(x):
        """Return length of a string."""
        return font.text_length(
            x, fontsize=fontsize, small_caps=small_caps
        )  # abbreviation

    def char_lengths(x):
        """Return list of single character lengths for a string."""
        return font.char_lengths(x, fontsize=fontsize, small_caps=small_caps)

    def append_this(pos, text):
        """Append 'text' at 'pos' and return the result of writer.append."""
        return writer.append(
            pos, text, font=font, fontsize=fontsize, small_caps=small_caps
        )

    tolerance = fontsize * 0.2  # extra distance to left border
    space_len = textlen(" ")
    std_width = rect.width - tolerance
    std_start = rect.x0 + tolerance

    def norm_words(width, words):
        """Cut any word in pieces no longer than 'width'."""
        nwords = []
        word_lengths = []
        for w in words:
            wl_lst = char_lengths(w)
            wl = sum(wl_lst)
            if wl <= width:  # nothing to do - copy over
                nwords.append(w)
                word_lengths.append(wl)
                continue

            # word longer than rect width - split it in parts:
            # repeatedly take the longest prefix that still fits
            n = len(wl_lst)
            while n > 0:
                wl = sum(wl_lst[:n])
                if wl <= width:
                    nwords.append(w[:n])
                    word_lengths.append(wl)
                    w = w[n:]
                    wl_lst = wl_lst[n:]
                    n = len(wl_lst)
                else:
                    n -= 1
        return nwords, word_lengths

    def output_justify(start, line):
        """Justified output of a line."""
        # ignore leading / trailing / multiple spaces
        words = [w for w in line.split(" ") if w != ""]
        nwords = len(words)
        if nwords == 0:
            return
        if nwords == 1:  # single word cannot be justified
            append_this(start, words[0])
            return
        tl = sum([textlen(w) for w in words])  # total word lengths
        gaps = nwords - 1  # number of word gaps
        gapl = (std_width - tl) / gaps  # width of each gap
        for w in words:
            _, lp = append_this(start, w)  # output one word
            start.x = lp.x + gapl  # next start at word end plus gap

    asc = font.ascender
    dsc = font.descender
    if not lineheight:
        if asc - dsc <= 1:
            lheight = 1.2
        else:
            lheight = asc - dsc
    else:
        lheight = lineheight

    LINEHEIGHT = fontsize * lheight  # effective line height
    width = std_width  # available horizontal space

    # starting point of text
    if pos is not None:
        pos = pymupdf.Point(pos)
    else:  # default is just below rect top-left
        pos = rect.tl + (tolerance, fontsize * asc)
    if pos not in rect:
        raise ValueError("Text must start in rectangle.")

    # calculate displacement factor for alignment
    if align == pymupdf.TEXT_ALIGN_CENTER:
        factor = 0.5
    elif align == pymupdf.TEXT_ALIGN_RIGHT:
        factor = 1.0
    else:
        factor = 0

    # split in lines if just a string was given
    if isinstance(text, str):
        textlines = text.splitlines()
    else:
        textlines = []
        for line in text:
            textlines.extend(line.splitlines())

    max_lines = int((rect.y1 - pos.y) / LINEHEIGHT) + 1

    new_lines = []  # the final list of textbox lines
    no_justify = []  # no justify for these line numbers
    for i, line in enumerate(textlines):
        if line in ("", " "):
            new_lines.append((line, space_len))
            width = rect.width - tolerance
            no_justify.append((len(new_lines) - 1))
            continue
        if i == 0:  # first line may start right of the standard start
            width = rect.x1 - pos.x
        else:
            width = rect.width - tolerance

        if right_to_left:  # reverses Arabic / Hebrew text front to back
            line = writer.clean_rtl(line)
        tl = textlen(line)
        if tl <= width:  # line short enough
            new_lines.append((line, tl))
            no_justify.append((len(new_lines) - 1))
            continue

        # we need to split the line in fitting parts
        words = line.split(" ")  # the words in the line

        # cut in parts any words that are longer than rect width
        words, word_lengths = norm_words(width, words)

        n = len(words)
        while True:
            line0 = " ".join(words[:n])
            wl = sum(word_lengths[:n]) + space_len * (n - 1)
            if wl <= width:
                new_lines.append((line0, wl))
                words = words[n:]
                word_lengths = word_lengths[n:]
                n = len(words)
                line0 = None
            else:
                n -= 1

            if len(words) == 0:
                break
            assert n

    # -------------------------------------------------------------------------
    # List of lines created. Each item is (text, tl), where 'tl' is the PDF
    # output length (float) and 'text' is the text. Except for justified text,
    # this is output-ready.
    # -------------------------------------------------------------------------
    nlines = len(new_lines)
    if nlines > max_lines:
        msg = "Only fitting %i of %i lines." % (max_lines, nlines)
        if warn is None:
            pass
        elif warn:
            pymupdf.message("Warning: " + msg)
        else:
            raise ValueError(msg)

    start = pymupdf.Point()
    no_justify += [len(new_lines) - 1]  # no justifying of last line

    # number of lines we can actually output (never negative)
    n_out = max(0, min(max_lines, nlines))
    for i in range(n_out):
        line, tl = new_lines[i]

        if right_to_left:  # Arabic, Hebrew
            line = "".join(reversed(line))

        if i == 0:  # may have different start for first line
            start = pos

        if align == pymupdf.TEXT_ALIGN_JUSTIFY and i not in no_justify and tl < std_width:
            output_justify(start, line)
            start.x = std_start
            start.y += LINEHEIGHT
            continue

        if i > 0 or pos.x == std_start:  # left, center, right alignments
            start.x += (width - tl) * factor

        append_this(start, line)
        start.x = std_start
        start.y += LINEHEIGHT

    return new_lines[n_out:]  # return non-written lines
- # ------------------------------------------------------------------------
- # Optional Content functions
- # ------------------------------------------------------------------------
def get_oc(doc: pymupdf.Document, xref: int) -> int:
    """Return optional content object xref for an image or form xobject.

    Args:
        xref: (int) xref number of an image or form xobject.
    Returns:
        The xref number stored under the object's /OC key, or 0 if that
        key is no indirect reference.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document close or encrypted")

    kind, subtype = doc.xref_get_key(xref, "Subtype")
    if not (kind == "name" and subtype in ("/Image", "/Form")):
        raise ValueError("bad object type at xref %i" % xref)

    kind, value = doc.xref_get_key(xref, "OC")
    # strip the reference suffix of the value to get the xref number
    return int(value.replace("0 R", "")) if kind == "xref" else 0
def set_oc(doc: pymupdf.Document, xref: int, oc: int) -> None:
    """Attach optional content object to image or form xobject.

    Args:
        xref: (int) xref number of an image or form xobject
        oc: (int) xref number of an OCG or OCMD. Use 0 to remove an
            existing /OC entry.
    Raises:
        ValueError: document is closed or encrypted, 'xref' is no image
            or form xobject, or 'oc' is no OCG / OCMD.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document close or encrypted")
    t, name = doc.xref_get_key(xref, "Subtype")
    if t != "name" or name not in ("/Image", "/Form"):
        raise ValueError("bad object type at xref %i" % xref)

    if oc < 0:  # can never be a valid object number
        raise ValueError("bad object type at xref %i" % oc)

    if oc == 0:
        # Remove an existing entry. If there is none, there is nothing to
        # do - in particular do not write a "0 0 R" reference.
        if "OC" in doc.xref_get_keys(xref):
            doc.xref_set_key(xref, "OC", "null")
        return None

    t, name = doc.xref_get_key(oc, "Type")
    if t != "name" or name not in ("/OCG", "/OCMD"):
        raise ValueError("bad object type at xref %i" % oc)
    doc.xref_set_key(xref, "OC", "%i 0 R" % oc)
    return None
def set_ocmd(
    doc: pymupdf.Document,
    xref: int = 0,
    ocgs: typing.Union[list, None] = None,
    policy: OptStr = None,
    ve: typing.Union[list, None] = None,
) -> int:
    """Create or update an OCMD object in a PDF document.

    Args:
        xref: (int) 0 for creating a new object, otherwise update existing one.
        ocgs: (list) OCG xref numbers, which shall be subject to 'policy'.
        policy: one of 'AllOn', 'AllOff', 'AnyOn', 'AnyOff' (any casing).
        ve: (list) visibility expression. Use instead of 'ocgs' with 'policy'.

    Returns:
        Xref of the created or updated OCMD.

    Raises:
        ValueError: unknown OCG xref, bad policy, malformed visibility
            expression, or 'xref' is no OCMD.
    """

    all_ocgs = set(doc.get_ocgs().keys())

    def ve_maker(ve):
        """Convert a (nested) visibility expression to a PDF array string."""
        # Note: 've' may be a tuple, so wrap it for the '%' operator -
        # otherwise formatting raises TypeError instead of our ValueError.
        if not isinstance(ve, (list, tuple)) or len(ve) < 2:
            raise ValueError("bad 've' format: %s" % (ve,))
        if not isinstance(ve[0], str) or ve[0].lower() not in ("and", "or", "not"):
            raise ValueError("bad operand: %s" % (ve[0],))
        if ve[0].lower() == "not" and len(ve) != 2:
            raise ValueError("bad 've' format: %s" % (ve,))
        item = "[/%s" % ve[0].title()
        for x in ve[1:]:
            if type(x) is int:  # an OCG xref
                if x not in all_ocgs:
                    raise ValueError("bad OCG %i" % x)
                item += " %i 0 R" % x
            else:  # must be a sub-expression
                item += " %s" % ve_maker(x)
        item += "]"
        return item

    text = "<</Type/OCMD"

    if ocgs and isinstance(ocgs, (list, tuple)):  # some OCGs are provided
        bad = set(ocgs).difference(all_ocgs)  # contains illegal xrefs
        if bad:
            msg = "bad OCGs: %s" % bad
            raise ValueError(msg)
        text += "/OCGs[" + " ".join("%i 0 R" % x for x in ocgs) + "]"

    if policy:
        policy = str(policy).lower()
        pols = {
            "anyon": "AnyOn",
            "allon": "AllOn",
            "anyoff": "AnyOff",
            "alloff": "AllOff",
        }
        if policy not in pols:
            raise ValueError("bad policy: %s" % policy)
        text += "/P/%s" % pols[policy]

    if ve:
        text += "/VE%s" % ve_maker(ve)

    text += ">>"

    # make new object or replace old OCMD (check type first)
    if xref == 0:
        xref = doc.get_new_xref()
    elif "/Type/OCMD" not in doc.xref_object(xref, compressed=True):
        raise ValueError("bad xref or not an OCMD")
    doc.update_object(xref, text)
    return xref
def get_ocmd(doc: pymupdf.Document, xref: int) -> dict:
    """Return the definition of an OCMD (optional content membership dictionary).

    Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy string) and
    /VE (visibility expression, PDF array). Via string manipulation, this
    info is converted to a Python dictionary with keys "xref", "ocgs", "policy"
    and "ve" - ready to recycle as input for 'set_ocmd()'.

    Raises:
        ValueError: 'xref' is out of range or no OCMD, or the object has
            an irregular syntax.
    """
    if xref not in range(doc.xref_length()):
        raise ValueError("bad xref")
    text = doc.xref_object(xref, compressed=True)
    if "/Type/OCMD" not in text:
        raise ValueError("bad object type")
    textlen = len(text)

    p0 = text.find("/OCGs[")  # look for /OCGs key
    p1 = text.find("]", p0)
    if p0 < 0 or p1 < 0:  # no OCGs found
        ocgs = None
    else:
        ocgs = text[p0 + 6 : p1].replace("0 R", " ").split()
        ocgs = list(map(int, ocgs))

    p0 = text.find("/P/")  # look for /P policy key
    if p0 < 0:
        policy = None
    else:
        # The policy is the PDF name following "/P/". Note that searching
        # for "on" would never find the names "AnyOn" / "AllOn".
        p1 = p0 + 3
        while p1 < textlen and text[p1].isalpha():
            p1 += 1
        policy = text[p0 + 3 : p1]
        if policy not in ("AnyOn", "AllOn", "AnyOff", "AllOff"):
            raise ValueError("bad object at xref")  # some irregular syntax

    p0 = text.find("/VE[")  # look for /VE visibility expression key
    if p0 < 0:  # no visibility expression found
        ve = None
    else:
        lp = rp = 0  # find end of /VE by finding last ']'.
        p1 = p0
        while lp < 1 or lp != rp:
            p1 += 1
            if not p1 < textlen:  # some irregular syntax
                raise ValueError("bad object at xref")
            if text[p1] == "[":
                lp += 1
            if text[p1] == "]":
                rp += 1
        # p1 now positioned at the last "]"
        ve = text[p0 + 3 : p1 + 1]  # the PDF /VE array
        # convert the PDF array syntax to a JSON array
        ve = (
            ve.replace("/And", '"and",')
            .replace("/Not", '"not",')
            .replace("/Or", '"or",')
        )
        ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[")
        import json

        try:
            ve = json.loads(ve)
        except Exception:
            pymupdf.exception_info()
            pymupdf.message(f"bad /VE key: {ve!r}")
            raise
    return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve}
- """
- Handle page labels for PDF documents.
- Reading
- -------
- * compute the label of a page
- * find page number(s) having the given label.
- Writing
- -------
- Supports setting (defining) page labels for PDF documents.
- A big Thank You goes to WILLIAM CHAPMAN who contributed the idea and
- significant parts of the following code during late December 2020
- through early January 2021.
- """
def rule_dict(item):
    """Make a Python dict from a PDF page label rule.

    Args:
        item -- a tuple (pno, rule) with the start page number and the rule
                string like <</S/D...>>.
    Returns:
        A dict like
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
        Key 'style' is only present if the rule contains an /S entry.
    """
    # Jorj McKie, 2021-01-06
    pno, rule = item
    tokens = rule[2:-2].split("/")[1:]  # strip "<<" and ">>"
    d = {"startpage": pno, "prefix": "", "firstpagenum": 1}
    skip = False
    for i, token in enumerate(tokens):
        if skip:  # this token has already been processed
            skip = False  # deactivate skipping again
            continue
        if token == "S":  # style specification
            d["style"] = tokens[i + 1]  # next token has the style
            skip = True  # do not process next token again
            continue
        if token == "Type":
            # The value of /Type (i.e. "PageLabel") follows as the next token.
            # It must be skipped, otherwise it is mistaken for a /P prefix
            # because it starts with "P".
            skip = True
            continue
        if token.startswith("P"):  # prefix specification: extract the string
            d["prefix"] = token[1:].replace("(", "").replace(")", "")
            continue
        if token.startswith("St"):  # start page number specification
            d["firstpagenum"] = int(token[2:])
    return d
def get_label_pno(pgNo, labels):
    """Return the label for this page number.

    Args:
        pgNo: page number, 0-based.
        labels: result of doc._get_page_labels(), i.e. (start page, rule)
            items. The last item starting at or before pgNo is used, so
            the list is expected to be sorted by start page.
    Returns:
        The label (str) of the page number. Errors return an empty string.
    """
    # Jorj McKie, 2021-01-06
    applicable = [x for x in labels if x[0] <= pgNo]
    if not applicable:  # no rule starts at or before this page
        return ""
    rule = rule_dict(applicable[-1])
    prefix = rule.get("prefix", "")
    style = rule.get("style", "")
    # make sure we start at 0 when enumerating the alphabet
    delta = -1 if style in ("a", "A") else 0
    pagenumber = pgNo - rule["startpage"] + rule["firstpagenum"] + delta
    return construct_label(style, prefix, pagenumber)
def get_label(page):
    """Return the label for this PDF page.

    Args:
        page: page object.
    Returns:
        The label (str) of the page. An empty string is returned if the
        owning document defines no page labels.
    """
    # Jorj McKie, 2021-01-06
    label_rules = page.parent._get_page_labels()
    if label_rules:
        label_rules.sort()  # the lookup needs rules ordered by start page
        return get_label_pno(page.number, label_rules)
    return ""
def get_page_numbers(doc, label, only_one=False):
    """Return a list of page numbers with the given label.

    Args:
        doc: PDF document object (resp. 'self').
        label: (str) label.
        only_one: (bool) stop searching after first hit.
    Returns:
        List of page numbers having this label. The list is empty if no
        label is given or the document defines no page labels.
    """
    # Jorj McKie, 2021-01-06
    numbers = []
    if not label:
        return numbers
    labels = doc._get_page_labels()
    if not labels:  # no page labels defined
        return numbers
    labels.sort()  # same ordering that get_label() establishes before lookup
    for i in range(doc.page_count):
        if get_label_pno(i, labels) == label:
            numbers.append(i)
            if only_one:
                break
    return numbers
def construct_label(style, prefix, pno) -> str:
    """Build a page label from numbering style, prefix and page number.

    Styles "D" (decimal), "r" / "R" (roman, lower / upper case) and
    "a" / "A" (letters, lower / upper case) produce a numeric part; any
    other style yields the prefix only.
    """
    # William Chapman, 2021-01-06
    if style == "D":
        numeric_part = str(pno)
    elif style in ("r", "R"):
        roman = integerToRoman(pno)
        numeric_part = roman.lower() if style == "r" else roman.upper()
    elif style in ("a", "A"):
        letters = integerToLetter(pno)
        numeric_part = letters.lower() if style == "a" else letters.upper()
    else:
        numeric_part = ""
    return prefix + numeric_part
def integerToLetter(i) -> str:
    """Returns letter sequence string for integer i.

    The sequence is 0-based: 0 -> "A", 25 -> "Z", 26 -> "AA", 27 -> "AB", ...
    Only exact integer arithmetic is used (no float powers), so arbitrarily
    large values are converted without rounding errors.
    """
    # William Chapman, Jorj McKie, 2021-01-06
    import string

    letters = string.ascii_uppercase
    base = len(letters)
    # determine the number of letters (n) and the offset within that group
    n, rest = 1, i
    while base ** n <= rest:
        rest -= base ** n
        n += 1

    result = []
    for j in reversed(range(n)):
        digit, rest = divmod(rest, base ** j)
        result.append(letters[digit])
    return "".join(result)
def integerToRoman(num: int) -> str:
    """Return roman numeral for an integer.

    Args:
        num: the number to convert.
    Returns:
        The upper case roman numeral. Zero and negative numbers have no
        roman representation and yield an empty string.
    """
    # William Chapman, Jorj McKie, 2021-01-06
    if num <= 0:
        # Without this guard, floor division of a negative number would
        # produce the numeral of an unrelated positive value.
        return ""

    roman = (
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    )

    parts = []
    for value, numeral in roman:
        count, num = divmod(num, value)
        parts.append(numeral * count)
        if num == 0:  # nothing left to convert
            break
    return "".join(parts)
def get_page_labels(doc):
    """Return page label definitions in PDF document.

    Args:
        doc: PDF document (resp. 'self').
    Returns:
        A list with one dictionary per label rule of the document, each
        formatted like
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
    """
    # Jorj McKie, 2021-01-10
    definitions = []
    for raw_rule in doc._get_page_labels():
        definitions.append(rule_dict(raw_rule))
    return definitions
def set_page_labels(doc, labels):
    """Add / replace page label definitions in PDF document.

    Args:
        doc: PDF document (resp. 'self').
        labels: list of label dictionaries like:
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int},
        as returned by get_page_labels(). The list itself is not modified.
    """
    # William Chapman, 2021-01-06

    def create_label_str(label):
        """Convert Python label dict to corresponding PDF rule string.

        Args:
            label: (dict) build rule for the label.
        Returns:
            The start page number followed by the PDF label rule wrapped
            in "<<", ">>".
        """
        parts = ["%i<<" % label["startpage"]]
        if label.get("prefix", "") != "":
            parts.append("/P(%s)" % label["prefix"])
        if label.get("style", "") != "":
            parts.append("/S/%s" % label["style"])
        if label.get("firstpagenum", 1) > 1:  # 1 is the default: omit it
            parts.append("/St %i" % label["firstpagenum"])
        parts.append(">>")
        return "".join(parts)

    def create_nums(labels):
        """Return concatenated string of all labels rules.

        Args:
            labels: (list) dictionaries as created by function 'rule_dict'.
        Returns:
            PDF compatible string for page label definitions, ready to be
            enclosed in PDF array 'Nums[...]'.
        """
        # sort a copy: the caller's list must not be reordered as a side effect
        ordered = sorted(labels, key=lambda x: x["startpage"])
        return "".join(create_label_str(label) for label in ordered)

    doc._set_page_labels(create_nums(labels))
- # End of Page Label Code -------------------------------------------------
def has_links(doc: pymupdf.Document) -> bool:
    """Check whether there are links on any page.

    Raises:
        ValueError: if the document is closed or is not a PDF.
    """
    if doc.is_closed:
        raise ValueError("document closed")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    # stop at the first annotation item whose type is 'link'
    return any(
        item[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
        for pno in range(doc.page_count)
        for item in doc.page_annot_xrefs(pno)
    )
def has_annots(doc: pymupdf.Document) -> bool:
    """Check whether there are annotations on any page.

    Items of type 'link' or 'widget' are not counted as annotations.

    Raises:
        ValueError: if the document is closed or is not a PDF.
    """
    if doc.is_closed:
        raise ValueError("document closed")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    for pno in range(doc.page_count):
        for item in doc.page_annot_xrefs(pno):
            # pylint: disable=no-member
            if item[1] not in (pymupdf.PDF_ANNOT_LINK, pymupdf.PDF_ANNOT_WIDGET):
                return True
    return False
- # -------------------------------------------------------------------
- # Functions to recover the quad contained in a text extraction bbox
- # -------------------------------------------------------------------
def recover_bbox_quad(line_dir: tuple, span: dict, bbox: tuple) -> pymupdf.Quad:
    """Compute the quad located inside the bbox.

    The bbox may be any of the resp. tuples occurring inside the given span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line or None. If None,
            'span["dir"]' is used instead.
        span: (dict) the span. May be from get_texttrace() method.
        bbox: (tuple) the bbox of the span or any of its characters.
    Returns:
        The quad which is wrapped by the bbox.
    """
    if line_dir is None:
        line_dir = span["dir"]
    cos, sin = line_dir
    rect = pymupdf.Rect(bbox)  # make it a rect

    # With small glyph heights active, the fontsize alone is the height;
    # otherwise it is scaled by the ascender - descender distance.
    if pymupdf.TOOLS.set_small_glyph_heights():
        factor = 1
    else:
        factor = span["ascender"] - span["descender"]
    height = factor * span["size"]  # the quad's rectangle height

    # hs and hc are the distances from the bbox corners at which the quad
    # points are found. Which corners are used depends on the quadrant in
    # which the text writing angle is located.
    hs = height * sin
    hc = height * cos

    if hc >= 0 and hs <= 0:  # quadrant 1
        return pymupdf.Quad(
            rect.bl - (0, hc),  # upper left
            rect.tr + (hs, 0),  # upper right
            rect.bl - (hs, 0),  # lower left
            rect.tr + (0, hc),  # lower right
        )
    if hc <= 0 and hs <= 0:  # quadrant 2
        return pymupdf.Quad(
            rect.br + (hs, 0),
            rect.tl - (0, hc),
            rect.br + (0, hc),
            rect.tl - (hs, 0),
        )
    if hc <= 0 and hs >= 0:  # quadrant 3
        return pymupdf.Quad(
            rect.tr - (0, hc),
            rect.bl + (hs, 0),
            rect.tr - (hs, 0),
            rect.bl + (0, hc),
        )
    # quadrant 4
    return pymupdf.Quad(
        rect.tl + (hs, 0),
        rect.br - (0, hc),
        rect.tl + (0, hc),
        rect.br - (hs, 0),
    )
def recover_quad(line_dir: tuple, span: dict) -> pymupdf.Quad:
    """Recover the quadrilateral of a text span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line.
        span: the span.
    Returns:
        The quadrilateral enveloping the span's text.
    Raises:
        ValueError: if line_dir is not a tuple of length 2 or span is not
            a dict.
    """
    # isinstance() also accepts subclasses (e.g. named tuples)
    if not isinstance(line_dir, tuple) or len(line_dir) != 2:
        raise ValueError("bad line dir argument")
    if not isinstance(span, dict):
        raise ValueError("bad span argument")
    return recover_bbox_quad(line_dir, span, span["bbox"])
def recover_line_quad(line: dict, spans: list = None) -> pymupdf.Quad:
    """Calculate the line quad for 'dict' / 'rawdict' text extractions.

    The lower quad points are those of the first, resp. last span quad.
    The upper points are determined by the maximum span quad height.
    From this, compute a rect with bottom-left in (0, 0), convert this to a
    quad and rotate and shift back to cover the text of the spans.

    Args:
        line: (dict) the line; its "dir" and (by default) "spans" are used.
        spans: (list, optional) sub-list of spans to consider.
    Returns:
        pymupdf.Quad covering selected spans.
    Raises:
        ValueError: if the list of spans is empty.
    """
    selected = line["spans"] if spans is None else spans  # default: all spans
    if len(selected) == 0:
        raise ValueError("bad span list")
    line_dir = line["dir"]  # text direction
    cos, sin = line_dir  # values are not used; unpacking kept from the original

    first_quad = recover_quad(line_dir, selected[0])
    if len(selected) > 1:
        last_quad = recover_quad(line_dir, selected[-1])
    else:
        last_quad = first_quad  # last = first

    lower_left = first_quad.ll  # lower-left of line quad
    lower_right = last_quad.lr  # lower-right of line quad

    # map base line to x-axis such that lower_left goes to (0, 0)
    mat0 = pymupdf.planish_line(lower_left, lower_right)
    x_lr = lower_right * mat0

    small = pymupdf.TOOLS.set_small_glyph_heights()  # small glyph heights?
    height = max(
        s["size"] * (1 if small else (s["ascender"] - s["descender"]))
        for s in selected
    )

    quad = pymupdf.Rect(0, -height, x_lr.x, 0).quad  # line rectangle as a quad
    quad *= ~mat0  # rotate back and shift back
    return quad
def recover_span_quad(line_dir: tuple, span: dict, chars: list = None) -> pymupdf.Quad:
    """Calculate the span quad for 'dict' / 'rawdict' text extractions.

    Notes:
        There are two execution paths:
        1. For the full span quad, the result of 'recover_quad' is returned.
        2. For the quad of a sub-list of characters, the char quads are
           computed and joined. This is only supported for the "rawdict"
           extraction option.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line. If None,
            'span["dir"]' is used (span from get_texttrace()).
        span: (dict) the span.
        chars: (list, optional) sub-list of characters to consider.
    Returns:
        pymupdf.Quad covering selected characters.
    Raises:
        ValueError: if chars are given but the span has no "chars" key.
    """
    if line_dir is None:  # must be a span from get_texttrace()
        line_dir = span["dir"]

    if chars is None:  # no sub-selection: quad of the complete span
        return recover_quad(line_dir, span)

    if "chars" not in span:
        raise ValueError("need 'rawdict' option to sub-select chars")

    first_quad = recover_char_quad(line_dir, span, chars[0])
    if len(chars) > 1:
        last_quad = recover_char_quad(line_dir, span, chars[-1])
    else:
        last_quad = first_quad  # last = first

    lower_left = first_quad.ll  # lower-left of span quad
    lower_right = last_quad.lr  # lower-right of span quad

    # map base line to x-axis such that lower_left goes to (0, 0)
    mat0 = pymupdf.planish_line(lower_left, lower_right)
    x_lr = lower_right * mat0

    small = pymupdf.TOOLS.set_small_glyph_heights()  # small glyph heights?
    if small:
        scale = 1
    else:
        scale = span["ascender"] - span["descender"]
    height = span["size"] * scale

    quad = pymupdf.Rect(0, -height, x_lr.x, 0).quad  # span rectangle as a quad
    quad *= ~mat0  # rotate back and shift back
    return quad
def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad:
    """Recover the quadrilateral of a text character.

    This requires the "rawdict" option of text extraction.

    Args:
        line_dir: (tuple) 'line["dir"]' of the span's line. If None,
            'span["dir"]' is used instead.
        span: (dict) the span dict.
        char: (dict) the character dict. A tuple is accepted as well, in
            which case its item 3 is taken as the bbox.
    Returns:
        The quadrilateral enveloping the character.
    Raises:
        ValueError: if one of the arguments has an unsupported type.
    """
    if line_dir is None:
        line_dir = span["dir"]
    # isinstance() also accepts subclasses (e.g. named tuples)
    if not isinstance(line_dir, tuple) or len(line_dir) != 2:
        raise ValueError("bad line dir argument")
    if not isinstance(span, dict):
        raise ValueError("bad span argument")
    if isinstance(char, dict):
        bbox = pymupdf.Rect(char["bbox"])
    elif isinstance(char, tuple):
        bbox = pymupdf.Rect(char[3])
    else:
        # the offending argument is the character, not the span
        raise ValueError("bad char argument")

    return recover_bbox_quad(line_dir, span, bbox)
- # -------------------------------------------------------------------
- # Building font subsets using fontTools
- # -------------------------------------------------------------------
def subset_fonts(doc: pymupdf.Document, verbose: bool = False, fallback: bool = False) -> OptInt:
    """Build font subsets in a PDF.

    Eligible fonts are potentially replaced by smaller versions. Page text is
    NOT rewritten and thus should retain properties like being hidden or
    controlled by optional content.

    This method by default uses MuPDF's own internal feature to create subset
    fonts. As this is a new function, errors may still occur. In this case,
    please fall back to using the previous version by using "fallback=True".
    Fallback mode requires the external package 'fontTools'.

    Args:
        doc: the PDF document.
        fallback: use the older deprecated implementation.
        verbose: only used by fallback mode.

    Returns:
        The new MuPDF-based code returns None.  The deprecated fallback
        mode returns 0 if there are no fonts to subset.  Otherwise, it
        returns the decrease in fontsize (the difference in fontsize),
        measured in bytes. Fonts that could not be subset count with their
        unchanged size.
    """
    # Font binaries: -  "buffer" -> (names, xrefs, (unicodes, glyphs))
    # An embedded font is uniquely defined by its fontbuffer only. It may have
    # multiple names and xrefs.
    # Once the sets of used unicodes and glyphs are known, we compute a
    # smaller version of the buffer using package fontTools.

    if not fallback:  # by default use MuPDF function
        pdf = mupdf.pdf_document_from_fz_document(doc)
        mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count)))
        return None

    font_buffers = {}

    def get_old_widths(xref):
        """Retrieve old font '/W' and '/DW' values."""
        df = doc.xref_get_key(xref, "DescendantFonts")
        if df[0] != "array":  # only handle xref specifications
            return None, None
        df_xref = int(df[1][1:-1].replace("0 R", ""))
        widths = doc.xref_get_key(df_xref, "W")
        if widths[0] != "array":  # no widths key found
            widths = None
        else:
            widths = widths[1]
        dwidths = doc.xref_get_key(df_xref, "DW")
        if dwidths[0] != "int":
            dwidths = None
        else:
            dwidths = dwidths[1]
        return widths, dwidths

    def set_old_widths(xref, widths, dwidths):
        """Restore the old '/W' and '/DW' in subsetted font.

        If either parameter is None or evaluates to False, the corresponding
        dictionary key will be set to null.
        """
        df = doc.xref_get_key(xref, "DescendantFonts")
        if df[0] != "array":  # only handle xref specs
            return None
        df_xref = int(df[1][1:-1].replace("0 R", ""))

        def restore(key, value):
            if isinstance(value, str) and value:
                doc.xref_set_key(df_xref, key, value)
            elif doc.xref_get_key(df_xref, key)[0] != "null":
                doc.xref_set_key(df_xref, key, "null")
            # else: no old value and the key already is null - nothing to do.
            # (Previously the missing value itself, e.g. None, was passed
            # to xref_set_key() in this situation.)

        restore("W", widths)
        restore("DW", dwidths)
        return None

    def set_subset_fontname(new_xref):
        """Generate a name prefix to tag a font as subset.

        We use a random generator to select 6 upper case ASCII characters.
        The prefixed name must be put in the font xref as the "/BaseFont" value
        and in the FontDescriptor object as the '/FontName' value.
        """
        # The following generates a prefix like 'ABCDEF+'
        import random
        import string

        prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+"
        font_str = doc.xref_object(new_xref, compressed=True)
        font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix)
        df = doc.xref_get_key(new_xref, "DescendantFonts")
        if df[0] == "array":
            df_xref = int(df[1][1:-1].replace("0 R", ""))
            fd = doc.xref_get_key(df_xref, "FontDescriptor")
            if fd[0] == "xref":
                fd_xref = int(fd[1].replace("0 R", ""))
                fd_str = doc.xref_object(fd_xref, compressed=True)
                fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix)
                doc.update_object(fd_xref, fd_str)
        doc.update_object(new_xref, font_str)

    def build_subset(buffer, unc_set, gid_set):
        """Build font subset using fontTools.

        Args:
            buffer: (bytes) the font given as a binary buffer.
            unc_set: (set) required unicodes.
            gid_set: (set) required glyph ids. Used instead of the unicodes
                if the error unicode 0xFFFD occurs in 'unc_set'.
        Returns:
            Either None if subsetting is unsuccessful or the subset font buffer.
        """
        try:
            import fontTools.subset as fts
        except ImportError:
            if g_exceptions_verbose:
                pymupdf.exception_info()
            pymupdf.message("This method requires fontTools to be installed.")
            raise
        import tempfile

        with tempfile.TemporaryDirectory() as tmp_dir:
            oldfont_path = f"{tmp_dir}/oldfont.ttf"
            newfont_path = f"{tmp_dir}/newfont.ttf"
            uncfile_path = f"{tmp_dir}/uncfile.txt"
            args = [
                oldfont_path,
                "--retain-gids",
                f"--output-file={newfont_path}",
                "--layout-features=*",
                "--passthrough-tables",
                "--ignore-missing-glyphs",
                "--ignore-missing-unicodes",
                "--symbol-cmap",
            ]

            # store glyph ids or unicodes as file
            with open(uncfile_path, "w", encoding='utf8') as unc_file:
                if 0xFFFD in unc_set:  # error unicode exists -> use glyphs
                    args.append(f"--gids-file={uncfile_path}")
                    gid_set.add(189)
                    for gid in list(gid_set):
                        unc_file.write("%i\n" % gid)
                else:
                    args.append(f"--unicodes-file={uncfile_path}")
                    unc_set.add(255)
                    for unc in list(unc_set):
                        unc_file.write("%04x\n" % unc)

            # store fontbuffer as a file
            with open(oldfont_path, "wb") as fontfile:
                fontfile.write(buffer)
            try:
                os.remove(newfont_path)  # remove old file
            except OSError:  # best effort only, e.g. file does not exist
                pass
            try:  # invoke fontTools subsetter
                fts.main(args)
                font = pymupdf.Font(fontfile=newfont_path)
                new_buffer = font.buffer  # subset font binary
                if font.glyph_count == 0:  # intercept empty font
                    new_buffer = None
            except Exception:
                pymupdf.exception_info()
                new_buffer = None
        return new_buffer

    def repl_fontnames(doc):
        """Populate 'font_buffers'.

        For each font candidate, store its xref and the list of names
        by which PDF text may refer to it (there may be multiple).
        """

        def norm_name(name):
            """Recreate font name that contains PDF hex codes.

            E.g. #20 -> space, chr(32)

            Each '#xx' escape is decoded exactly once, from left to right,
            so a decoded '#' (from '#23') is not interpreted again. A '#'
            not followed by two hex digits is kept unchanged.
            """
            pieces = name.split("#")
            decoded = [pieces[0]]
            for piece in pieces[1:]:
                code = piece[:2]
                if len(code) == 2 and all(
                    c in "0123456789abcdefABCDEF" for c in code
                ):
                    decoded.append(chr(int(code, 16)) + piece[2:])
                else:
                    decoded.append("#" + piece)
            return "".join(decoded)

        def get_fontnames(doc, item):
            """Return a list of fontnames for an item of page.get_fonts().

            There may be multiple names e.g. for Type0 fonts.
            """
            fontname = item[3]
            names = [fontname]
            fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:]
            fontname = norm_name(fontname)
            if fontname not in names:
                names.append(fontname)
            descendents = doc.xref_get_key(item[0], "DescendantFonts")
            if descendents[0] != "array":
                return names
            descendents = descendents[1][1:-1]
            if descendents.endswith(" 0 R"):
                xref = int(descendents[:-4])
                descendents = doc.xref_object(xref, compressed=True)
            p1 = descendents.find("/BaseFont")
            if p1 >= 0:
                p2 = descendents.find("/", p1 + 1)  # start of the name value
                if p2 >= 0:
                    # The name ends at the next key ("/") or at the end of
                    # the dictionary (">>"), whichever comes first. Only
                    # consider markers that were actually found: find()
                    # returns -1 otherwise, which must not win the min().
                    ends = [
                        p
                        for p in (
                            descendents.find("/", p2 + 1),
                            descendents.find(">>", p2 + 1),
                        )
                        if p >= 0
                    ]
                    p1 = min(ends) if ends else len(descendents)
                    fontname = descendents[p2 + 1 : p1]
                    fontname = norm_name(fontname)
                    if fontname not in names:
                        names.append(fontname)
            return names

        for i in range(doc.page_count):
            for f in doc.get_page_fonts(i, full=True):
                font_xref = f[0]  # font xref
                font_ext = f[1]  # font file extension
                basename = f[3]  # font basename

                if font_ext not in (  # skip if not supported by fontTools
                    "otf",
                    "ttf",
                    "woff",
                    "woff2",
                ):
                    continue
                # skip fonts which already are subsets
                if len(basename) > 6 and basename[6] == "+":
                    continue

                extr = doc.extract_font(font_xref)
                fontbuffer = extr[-1]
                names = get_fontnames(doc, f)
                name_set, xref_set, subsets = font_buffers.get(
                    fontbuffer, (set(), set(), (set(), set()))
                )
                xref_set.add(font_xref)
                for name in names:
                    name_set.add(name)
                font = pymupdf.Font(fontbuffer=fontbuffer)
                name_set.add(font.name)
                del font
                font_buffers[fontbuffer] = (name_set, xref_set, subsets)

    def find_buffer_by_name(name):
        """Return the font buffer known under this name, or None."""
        for buffer, (name_set, _, _) in font_buffers.items():
            if name in name_set:
                return buffer
        return None

    # -----------------
    # main function
    # -----------------
    repl_fontnames(doc)  # populate font information
    if not font_buffers:  # nothing found to do
        if verbose:
            pymupdf.message('No fonts to subset.')
        return 0

    old_fontsize = sum(len(fontbuffer) for fontbuffer in font_buffers)
    new_fontsize = 0

    # Scan page text for usage of subsettable fonts
    for page in doc:
        # go through the text and extend set of used glyphs by font
        # we use a modified MuPDF trace device, which delivers us glyph ids.
        for span in page.get_texttrace():
            if not isinstance(span, dict):  # skip useless information
                continue
            fontname = span["font"][:33]  # fontname for the span
            buffer = find_buffer_by_name(fontname)
            if buffer is None:
                continue
            # the sets are updated in place, no need to store them back
            _, _, (set_ucs, set_gid) = font_buffers[buffer]
            for c in span["chars"]:
                set_ucs.add(c[0])  # unicode
                set_gid.add(c[1])  # glyph id

    # build the font subsets
    for old_buffer, (name_set, xref_set, subsets) in font_buffers.items():
        new_buffer = build_subset(old_buffer, subsets[0], subsets[1])
        fontname = list(name_set)[0]
        if new_buffer is None or len(new_buffer) >= len(old_buffer):
            # subset was not created or did not get smaller
            if verbose:
                pymupdf.message(f'Cannot subset {fontname!r}.')
            # the font stays as it is: its size is part of the new total
            new_fontsize += len(old_buffer)
            continue
        if verbose:
            pymupdf.message(f"Built subset of font {fontname!r}.")
        val = doc._insert_font(fontbuffer=new_buffer)  # store subset font in PDF
        new_xref = val[0]  # get its xref
        set_subset_fontname(new_xref)  # tag fontname as subset font
        font_str = doc.xref_object(  # get its object definition
            new_xref,
            compressed=True,
        )
        # walk through the original font xrefs and replace each by the subset def
        for font_xref in xref_set:
            # we need the original '/W' and '/DW' width values
            width_table, def_width = get_old_widths(font_xref)
            # ... and replace original font definition at xref with it
            doc.update_object(font_xref, font_str)
            # now copy over old '/W' and '/DW' values
            if width_table or def_width:
                set_old_widths(font_xref, width_table, def_width)
        # 'new_xref' remains unused in the PDF and must be removed
        # by garbage collection.
        new_fontsize += len(new_buffer)

    return old_fontsize - new_fontsize
- # -------------------------------------------------------------------
- # Copy XREF object to another XREF
- # -------------------------------------------------------------------
def xref_copy(doc: pymupdf.Document, source: int, target: int, *, keep: list = None) -> None:
    """Copy a PDF dictionary object to another one given their xref numbers.

    Args:
        doc: PDF document object
        source: source xref number
        target: target xref number, the xref must already exist
        keep: an optional list of 1st level keys in target that should not be
              removed before copying.
    Notes:
        This works similar to the copy() method of dictionaries in Python. The
        source may be a stream object. Copying an object onto itself is a
        no-operation.
    """
    if source == target:
        # Nothing to copy. Without this guard, emptying the target below
        # would remove the keys of the source as well.
        return

    if doc.xref_is_stream(source):
        # read new xref stream, maintaining compression
        stream = doc.xref_stream_raw(source)
        doc.update_stream(
            target,
            stream,
            compress=False,  # keeps source compression
            new=True,  # in case target is no stream
        )

    # empty the target completely, observe exceptions
    if keep is None:
        keep = []
    for key in doc.xref_get_keys(target):
        if key in keep:
            continue
        doc.xref_set_key(target, key, "null")

    # copy over all source dict items
    for key in doc.xref_get_keys(source):
        item = doc.xref_get_key(source, key)
        doc.xref_set_key(target, key, item[1])
|