python pdf
2018-07-21 23:54
120 查看
# 从pdf中读取文本 # 写pdf # 加密解密pdf # 合并pdf,加水印
# pip install PyPDF2 %cd D:\python全站\office import PyPDF2
D:\python全站\office
# Open the sample PDF in binary mode and wrap it in a PyPDF2 reader.
# The handle is kept in `pdf_obj` (not a `with` block) because the reader
# is still used after this cell; it is closed explicitly with pdf_obj.close().
pdf_obj = open('coop.pdf', 'rb')
pdf = PyPDF2.PdfFileReader(pdf_obj)

# Last expression of the cell: the notebook echoes the page count.
pdf.numPages
3
# Fetch the first page (page indexes start at 0) and extract its text.
# The echoed result for this sample keeps only the ASCII words; the Chinese
# text of the page does not come through extractText().
page = pdf.getPage(0)
page.extractText() # extract the page's text
'\n\n \n \n1\\\n1\nN¥\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \n\n\n \n \n\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \n \n\n\n \n \n\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \n \n'
# Extract Chinese text.
# Requires: pip install pdfminer3k  (handles Chinese text)
from pdfminer.pdfinterp import PDFResourceManager, process_pdf  # resource management
from pdfminer.converter import TextConverter  # text conversion
from pdfminer.layout import LAParams  # layout parameters
from io import StringIO  # in-memory buffer used in place of a temporary file


def convert_pdf(path):
    """Return the text content of the PDF file at ``path`` as one string.

    The PDF is run through pdfminer's ``process_pdf`` with a
    ``TextConverter`` that writes into an in-memory ``StringIO`` buffer;
    the buffer's contents are returned.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).

    Returns:
        The text accumulated by the converter.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # Context managers / finally guarantee that the buffer, the converter
    # device and the file handle are released even if parsing raises.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Read the text out before the buffer is closed on leaving the block.
        return retstr.getvalue()
# Convert the whole document to text in one go.
s = convert_pdf('coop.pdf')

# Break the text apart on "newline + form feed"; the notebook echoes the
# resulting list (one entry per page, plus a trailing empty string).
s.split('\n\x0c')
['测试语句 \n\n第 1 页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n测试语句 \n\n第一页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n测试语句 \n\n第一页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n \n \n \n \n ', '测试语句 \n\n第 2 页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n \n \n \n \n ', 'de8ug word \n\n测试语句 \n\n第 3 页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n \n \n ', '']
# Write a PDF: take the second page (index 1) of the PDF opened earlier
# and save it as a new one-page document.
pdf_writer = PyPDF2.PdfFileWriter()
page = pdf.getPage(1)
pdf_writer.addPage(page)

with open('coop-1.pdf', 'wb') as out_file:
    pdf_writer.write(out_file)

# The source handle is only released here, after the new file is written.
pdf_obj.close()
# Encrypt a PDF: copy every page of coop.pdf into a writer, set a
# password on the writer, and save the result as coop-s.pdf.
with open('coop.pdf', 'rb') as src:
    pdf = PyPDF2.PdfFileReader(src)
    pdf_writer = PyPDF2.PdfFileWriter()

    total_pages = pdf.numPages
    for index in range(total_pages):
        source_page = pdf.getPage(index)
        pdf_writer.addPage(source_page)

    pdf_writer.encrypt('hicoop')

    # Written while the source file is still open.
    with open('coop-s.pdf', 'wb') as dst:
        pdf_writer.write(dst)
# Decrypt: open the encrypted copy, report whether it is encrypted,
# then unlock it with the password before touching any page.
with open('coop-s.pdf', 'rb') as encrypted_file:
    pdf = PyPDF2.PdfFileReader(encrypted_file)
    print(pdf.isEncrypted)
    pdf.decrypt('hicoop')
    # Pages can only be used normally once the data has been decrypted.
    pdf.getPage(0)
True
# Merge PDFs / add a watermark: overlay the first page of a watermark PDF
# onto every page of coop.pdf and save the result as coop-watermarked.pdf.
#
# The watermark must come from a different file than the output. Opening
# the output path with 'wb' truncates it, so reading the watermark from
# 'coop-watermarked.pdf' leaves an empty file behind, and PdfFileReader then
# fails on it with "OSError: [Errno 22] Invalid argument" (stream.seek(-1, 2)).
# NOTE(review): 'watermark.pdf' is an assumed name for the one-page
# watermark file - substitute the real path.
with open('coop.pdf', 'rb') as f_in:
    with open('watermark.pdf', 'rb') as f_w:
        pdf = PyPDF2.PdfFileReader(f_in)
        pdf_w = PyPDF2.PdfFileReader(f_w)

        # The same watermark page is stamped on every page; fetch it once.
        watermark_page = pdf_w.getPage(0)

        pdf_write = PyPDF2.PdfFileWriter()
        for page_num in range(pdf.numPages):
            page = pdf.getPage(page_num)
            page.mergePage(watermark_page)
            pdf_write.addPage(page)

        # Written while both input files are still open.
        with open('coop-watermarked.pdf', 'wb') as f_out:
            pdf_write.write(f_out)
--------------------------------------------------------------------------- OSError Traceback (most recent call last) <ipython-input-39-b87325251ec9> in <module>() 3 with open('coop-watermarked.pdf', 'rb') as f_w: 4 pdf = PyPDF2.PdfFileReader(f_in) ----> 5 pdf_w = PyPDF2.PdfFileReader(f_w) 6 7 pdf_write = PyPDF2.PdfFileWriter() c:\users\coop\miniconda3\envs\coop\lib\site-packages\PyPDF2\pdf.py in __init__(self, stream, strict, warndest, overwriteWarnings) 1082 stream = BytesIO(b_(fileobj.read())) 1083 fileobj.close() -> 1084 self.read(stream) 1085 self.stream = stream 1086 c:\users\coop\miniconda3\envs\coop\lib\site-packages\PyPDF2\pdf.py in read(self, stream) 1687 if debug: print(">>read", stream) 1688 # start at the end: -> 1689 stream.seek(-1, 2) 1690 if not stream.tell(): 1691 raise utils.PdfReadError('Cannot read an empty file') OSError: [Errno 22] Invalid argument
相关文章推荐
- python与reportlab类库技术用例-pdf报表1
- Python学习——爬虫之pdfkit用法
- python转pdf到文本
- 利用Python进行数据分析.pdf
- 【Python小工具】Python实现批量解析PDF文件提取内容并写入到Excel中
- python将HTML转PDF
- Python3基础(十二) 学习总结·附PDF
- 贝叶斯思维 统计建模的Python学习法pdf
- Python解析并读取PDF文件内容的方法
- 安装html5转化为pdf的python库pisa 安装matplotlab把数据转化为图形的python库
- Python爬虫下载PDF文件
- 【Python库】ReportLab生成PDF文档
- python 将pdf转换成txt
- PDF自动改名的Python程序
- Python+Django+SAE系列教程15-----输出非HTML内容(图片/PDF)
- python使用reportlab实现图片转换成pdf的方法
- 【Python】pdf文件处理之“PyPDF2”库简易安装笔记
- 【Python】pdf文件逐页转图片/修改图片存储大小脚本
- 《Expert Python Programming》pdf
- Python3基础(十二) 学习总结·附PDF