简述

浏览PDF文档时,有时会遇到底部文字水印,如果能简单地去掉,应该是不错的。

翻阅 PDF 规范,并通过 pdf-inspect 工具验证后,写了下面这个简易的验证程序,用来去除由简单的水印添加工具加上的水印。

操作

使用 pypdf==4.2.0 库。

from pypdf import PdfReader, PdfWriter
from pypdf.generic import ContentStream, NameObject

def _trailing_block_start(operations):
    """Return the index of the ``q`` that opens the final ``q ... Q`` block.

    Returns ``None`` when the stream is empty, does not end with ``Q``, or
    the closing ``Q`` has no matching ``q``.
    """
    if not operations or operations[-1][-1] != b"Q":
        return None

    # Walk backwards and balance q/Q so that a nested save/restore pair
    # inside the watermark block does not end the search too early.
    depth = 0
    for pos in range(len(operations) - 1, -1, -1):
        operator = operations[pos][-1]
        if operator == b"Q":
            depth += 1
        elif operator == b"q":
            depth -= 1
            if depth == 0:
                return pos
    return None


def _names_used(operations, operator):
    """Collect the resource names passed to *operator* within *operations*."""
    return {
        operands[0]
        for operands, op in operations
        if op == operator and operands and isinstance(operands[0], str)
    }


def _drop_unused_resources(page, removed, kept):
    """Delete the XObject/ExtGState entries only the removed block used.

    A name that the remaining operations still reference is left in place,
    because deleting it would break the rest of the page.
    """
    if "/Resources" not in page:
        return
    resources = page["/Resources"]

    # NOTE(review): a resource dictionary may be shared between pages; only
    # this page's own content stream is checked here.
    for operator, category in ((b"Do", "/XObject"), (b"gs", "/ExtGState")):
        if category not in resources:
            continue
        table = resources[category]
        for name in _names_used(removed, operator) - _names_used(kept, operator):
            table.pop(name, None)


def _strip_trailing_watermark(page, reader):
    """Remove the last ``q ... Q`` block of *page* and its private resources.

    The page is left untouched when it has no content stream or when the
    stream does not end with a balanced ``q ... Q`` block.
    """
    if "/Contents" not in page:
        return

    content = ContentStream(page["/Contents"].get_object(), reader)
    operations = content.operations
    start = _trailing_block_start(operations)
    if start is None:
        return

    removed = operations[start:]
    del operations[start:]
    _drop_unused_resources(page, removed, operations)

    content.get_data()  # this ensures ._data is rebuilt from the .operations
    page[NameObject("/Contents")] = content.flate_encode()


def remove(pdf_file, output_file):
    """Copy *pdf_file* to *output_file* without the trailing watermark block.

    For every page, the last graphics-state block of the content stream is
    treated as the watermark and removed, e.g.::

        q
          0 g
          /GS0 gs
          1 0 0 1 146.72 28.2107 cm
          0 Tc
          /X22 Do
        Q

    The ``/XObject`` and ``/ExtGState`` entries referenced by that block are
    removed as well, unless the rest of the page still uses them.  Pages
    whose content does not end with such a block are copied unchanged.

    Args:
        pdf_file: Path of the PDF to read.
        output_file: Path the cleaned PDF is written to.
    """
    with open(pdf_file, "rb") as f:
        # The second positional parameter of PdfReader is ``strict``, not a
        # file mode, so nothing but the stream is passed here.
        source = PdfReader(f)
        output = PdfWriter()

        for idx, page in enumerate(source.pages):
            print("page: %d" % idx)
            _strip_trailing_watermark(page, source)
            output.add_page(page)

        # Written while the input file is still open, as in the original
        # layout of this function.
        with open(output_file, "wb") as outputStream:
            output.write(outputStream)


# Strip the watermark from the sample document and save the cleaned copy.
remove(pdf_file="target.pdf", output_file="output.pdf")

参考


Related Posts


Published

Category

misc

Tags

Contact