如何从PDF文件中提取文本?

我想使用Python提取这个PDF文件中包含的文本。

我正在使用PyPDF2包(版本为1.27.2),并有以下脚本:

import PyPDF2


# Open the PDF in binary mode and keep every read inside the `with` block
# so the file is still open while the page is being parsed.
with open("sample.pdf", "rb") as pdf_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    number_of_pages = read_pdf.getNumPages()
    page = read_pdf.pages[0]  # first page only
    page_content = page.extractText()
    print(page_content)

当我运行代码时,我得到以下输出,这与PDF文档中包含的输出不同:

 ! " # $ % # $ % &% $ &' ( ) * % + , - % . / 0 1 ' * 2 3% 4
5
' % 1 $ # 2 6 % 3/ % 7 / ) ) / 8 % &) / 2 6 % 8 # 3" % 3" * % 31 3/ 9 # &)
%

如何提取PDF文档中的文本?

625668 次浏览

你可能想要使用久经考验的xPDF及其派生工具来提取文本,因为pyPDF2在文本提取方面似乎仍然存在各种问题。

长的答案是,文本如何在PDF中编码有很多变化,它可能需要解码PDF字符串本身,然后可能需要与CMAP映射,然后可能需要分析单词和字母之间的距离等。

如果PDF被损坏(即显示正确的文本,但复制时产生垃圾),并且你确实需要提取文本,那么你可以考虑将PDF转换为图像(使用ImageMagick),然后使用Tesseract通过OCR从图像中获取文本。

看看这段代码:

import PyPDF2

# Same extraction as in the question, written for Python 3 and with the
# file closed deterministically.
with open('sample.pdf', 'rb') as pdf_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    page = read_pdf.getPage(0)
    page_content = page.extractText()

# print() is a function in Python 3 and accepts the str directly, so the
# Python 2 style `.encode('utf-8')` is no longer needed.
print(page_content)

输出结果为:

!"#$%#$%&%$&'()*%+,-%./01'*23%4
5'%1$#26%3/%7/))/8%&)/26%8#3"%3"*%313/9#&)
%

使用相同的代码读取201308FCR.pdf,输出是正常的。

它的文档解释了为什么:

def extractText(self):
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text.  This works well for some PDF
    files, but poorly for others, depending on the generator used.  This will
    be refined in the future.  Do not rely on the order of text coming out of
    this function, as it will change if this function is made more
    sophisticated.
    :return: a unicode string object.
    """

使用textract。

它支持包括pdf在内的多种文件类型

import textract
# Extract the text of the file at the given path; per the answer text, PDF is
# one of several supported file types.
# NOTE(review): process() presumably returns bytes rather than str — confirm
# against the textract docs before treating the result as text.
text = textract.process("path/to/file.extension")

可以使用pdftotext:https://github.com/jalan/pdftotext

PDF到文本保持文本格式缩进,不管你是否有表格。

我在寻找一个适用于Python 3.x和Windows的简单解决方案。textract似乎不支持,这很遗憾;但如果你在为Windows/Python 3寻找简单的解决方案,可以试试tika包,它读取pdf非常直接。

Tika-Python是绑定到Apache Tika™REST服务的Python,允许在Python社区中本地调用Tika。

from tika import parser # pip install tika


# from_file() returns a mapping; the extracted text is stored under 'content'.
raw = parser.from_file('sample.pdf')
print(raw['content'])

注意,Tika是用Java编写的,因此需要安装Java运行时

我正在添加代码来实现这一点: 它为我工作很好:

# This works in python 3
# required python packages
# tabula-py==1.0.0
# PyPDF2==1.26.0
# Pillow==4.0.0
# pdfminer.six==20170720


import os
import shutil
import struct
import warnings
from io import StringIO


import requests
import tabula
from PIL import Image
from PyPDF2 import PdfFileWriter, PdfFileReader
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage


# Silence every Python warning for the rest of the run.
# NOTE(review): presumably meant to hide noisy library warnings — confirm this is wanted.
warnings.filterwarnings("ignore")




def download_file(url):
    """Download *url* into the current directory and return the local file name.

    The local name is the last path segment of the URL with every ``%20``
    replaced by ``_``.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never written to disk as if it were the PDF.
    """
    local_filename = url.split('/')[-1].replace("%20", "_")
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True) as response:
        print(response)
        response.raise_for_status()
        # Have urllib3 undo any Content-Encoding (gzip/deflate) while copying
        # the raw stream; otherwise the compressed bytes would be saved.
        response.raw.decode_content = True
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(response.raw, f)
    return local_filename




class PDFExtractor():
    """Extract text, tables and embedded images from a PDF fetched by URL.

    ``read_pages`` is the entry point: it downloads the PDF, splits the
    requested page range into one-page PDF files, runs text, image and table
    extraction on each of them, and removes the temporary files afterwards.
    """

    def __init__(self, url):
        """Store the URL of the PDF; nothing is downloaded until ``read_pages``."""
        self.url = url

    @staticmethod
    def _resolve_page_range(start_page, end_page, total_pages):
        """Turn a 1-based inclusive page selection into ``range()`` bounds.

        ``-1`` means "from the first page" / "to the last page".

        Returns:
            ``(start_index, stop_index)`` suitable for ``range()``.

        Raises:
            ValueError: with the message callers return to the user when a
                bound lies outside ``1..total_pages``.
        """
        if start_page == -1:
            start_index = 0
        elif 1 <= start_page <= total_pages:
            start_index = start_page - 1
        else:
            raise ValueError("Start Page Selection Is Wrong")

        if end_page == -1:
            stop_index = total_pages
        elif 1 <= end_page <= total_pages:
            # The last page is a valid end; the earlier check
            # (`end_page > total_pages - 1`) wrongly rejected it.
            stop_index = end_page
        else:
            raise ValueError("End Page Selection Is Wrong")

        return start_index, stop_index

    def break_pdf(self, filename, start_page=-1, end_page=-1):
        """Split *filename* into one-page PDFs named ``"<page>_<filename>"``.

        Args:
            filename: PDF to split; output files are written next to it.
            start_page: first page (1-based), or -1 for the first page.
            end_page: last page (1-based, inclusive), or -1 for the last page.

        Returns:
            ``None`` on success, or an error message string when the page
            selection is invalid.
        """
        with open(filename, "rb") as pdf_file:
            pdf_reader = PdfFileReader(pdf_file)
            try:
                start_index, stop_index = self._resolve_page_range(
                    start_page, end_page, pdf_reader.numPages)
            except ValueError as err:
                return str(err)

            for i in range(start_index, stop_index):
                output = PdfFileWriter()
                output.addPage(pdf_reader.getPage(i))
                with open(str(i + 1) + "_" + filename, "wb") as output_stream:
                    output.write(output_stream)

    def extract_text_algo_1(self, file):
        """Return the text of the first page of *file* using PyPDF2, without newlines or tabs."""
        with open(file, 'rb') as pdf_file:
            page_obj = PdfFileReader(pdf_file).getPage(0)
            text = page_obj.extractText()
        return text.replace("\n", "").replace("\t", "")

    def extract_text_algo_2(self, file):
        """Return the text of all pages of *file* using pdfminer, without newlines or tabs."""
        resource_manager = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(resource_manager, retstr, codec='utf-8', laparams=LAParams())
        try:
            with open(file, 'rb') as fp:
                interpreter = PDFPageInterpreter(resource_manager, device)
                # Empty page set and maxpages=0 are passed exactly as before.
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
            text = retstr.getvalue()
        finally:
            # Close the converter and buffer even if a page fails to parse.
            device.close()
            retstr.close()
        return text.replace("\t", "").replace("\n", "")

    def extract_text(self, file):
        """Run both text extractors on *file* and return the longer result."""
        text1 = self.extract_text_algo_1(file)
        text2 = self.extract_text_algo_2(file)
        return text2 if len(text2) > len(str(text1)) else text1

    def extract_table(self, file):
        """Print the tables tabula finds in *file*; print an error if reading fails."""
        # Read pdf into DataFrame
        try:
            df = tabula.read_pdf(file, output_format="csv")
        except Exception as err:
            # Best effort: a failure here must not stop text/image extraction.
            print("Error Reading Table")
            print(err)
            return

        print("\nPrinting Table Content: \n", df)
        print("\nDone Printing Table Content\n")

    # Original (misspelled) name kept so existing callers continue to work.
    extarct_table = extract_table

    def tiff_header_for_CCITT(self, width, height, img_size, CCITT_group=4):
        """Build a little-endian TIFF header for a single CCITT-compressed strip.

        Args:
            width: image width in pixels.
            height: image height in pixels.
            img_size: length in bytes of the compressed image data that will
                be appended after the header.
            CCITT_group: value written to the Compression tag.

        Returns:
            The packed header as ``bytes``.
        """
        tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'h'
        return struct.pack(
            tiff_header_struct,
            b'II',  # Byte order indication: little-endian
            42,  # Version number (always 42)
            8,  # Offset to first IFD
            8,  # Number of tags in IFD
            256, 4, 1, width,  # ImageWidth, LONG, 1, width
            257, 4, 1, height,  # ImageLength, LONG, 1, length
            258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
            259, 3, 1, CCITT_group,  # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
            262, 3, 1, 0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
            273, 4, 1, struct.calcsize(tiff_header_struct),  # StripOffsets, LONG, 1, len of header
            278, 4, 1, height,  # RowsPerStrip, LONG, 1, length
            279, 4, 1, img_size,  # StripByteCounts, LONG, 1, size of image data
            0,  # last IFD
        )

    def _save_image(self, x_object, filename, number):
        """Write one image XObject to disk; return True if a file was written."""
        if x_object['/Subtype'] != '/Image':
            return False

        size = (x_object['/Width'], x_object['/Height'])
        data = x_object._data
        mode = "RGB" if x_object['/ColorSpace'] == '/DeviceRGB' else "P"
        image_name = filename.split(".")[0] + str(number)

        image_filter = x_object['/Filter']
        print(image_filter)

        if image_filter == '/FlateDecode':
            img = Image.frombytes(mode, size, x_object.getData())
            img.save(image_name + "_Flate.png")
            print("Image_Saved")
        elif image_filter == '/DCTDecode':
            # The stream already is a JPEG file; write it out unchanged.
            with open(image_name + "_DCT.jpg", "wb") as img_file:
                img_file.write(data)
        elif image_filter == '/JPXDecode':
            # Likewise the stream is written unchanged as a JPEG 2000 file.
            with open(image_name + "_JPX.jp2", "wb") as img_file:
                img_file.write(data)
        elif image_filter == '/CCITTFaxDecode':
            ccitt_group = 4 if x_object['/DecodeParms']['/K'] == -1 else 3
            # getData() does not work for CCITTFaxDecode, so the raw stream is
            # wrapped in a hand-built TIFF header instead.
            tiff_header = self.tiff_header_for_CCITT(
                x_object['/Width'], x_object['/Height'], len(data), ccitt_group)
            with open(image_name + '_CCITT.tiff', 'wb') as img_file:
                img_file.write(tiff_header + data)
        else:
            return False
        return True

    def extract_image(self, filename):
        """Save every supported embedded image of *filename* next to it.

        Pages without XObjects are reported and skipped. An image that cannot
        be saved is reported and skipped as well.

        Returns:
            1 + the number of images written.
        """
        number = 1
        with open(filename, 'rb') as pdf_file:
            pdf_reader = PdfFileReader(pdf_file)
            for i in range(pdf_reader.numPages):
                page = pdf_reader.getPage(i)
                try:
                    x_objects = page['/Resources']['/XObject'].getObject()
                except KeyError:
                    print("No XObject Found")
                    # Keep scanning: later pages may still contain images.
                    continue

                for name in x_objects:
                    try:
                        if self._save_image(x_objects[name], filename, number):
                            number += 1
                    except Exception as err:
                        # Best effort per image, but say what was skipped
                        # instead of swallowing the error silently.
                        print("Skipping image", name, "-", err)
        return number

    def read_pages(self, start_page=-1, end_page=-1):
        """Download the PDF and print text, images info and tables page by page.

        Args:
            start_page: first page (1-based), or -1 for the first page.
            end_page: last page (1-based, inclusive), or -1 for the last page.

        Returns:
            ``None`` on success, or an error message string when the page
            selection is invalid.
        """
        # Downloading file locally
        downloaded_file = download_file(self.url)
        print(downloaded_file)

        try:
            with open(downloaded_file, 'rb') as pdf_file:
                total_pages = PdfFileReader(pdf_file).numPages

            try:
                start_index, stop_index = self._resolve_page_range(
                    start_page, end_page, total_pages)
            except ValueError as err:
                return str(err)

            # breaking PDF into number of pages in diff pdf files
            self.break_pdf(downloaded_file, start_page, end_page)

            for i in range(start_index, stop_index):
                # creating a page based filename
                page_file = str(i + 1) + "_" + downloaded_file

                print("\nStarting to Read Page: ", i + 1, "\n -----------===-------------")
                try:
                    file_text = self.extract_text(page_file)
                    print(file_text)
                    self.extract_image(page_file)
                    self.extract_table(page_file)
                finally:
                    # All handles on the page file are closed by now, so the
                    # removal also works on Windows.
                    os.remove(page_file)
                print("Stopped Reading Page: ", i + 1, "\n -----------===-------------")
        finally:
            os.remove(downloaded_file)




# Run the extraction only when executed as a script, so importing this module
# does not trigger a download.
if __name__ == "__main__":
    # I have tested on these 3 pdf files
    # url = "http://s3.amazonaws.com/NLP_Project/Original_Documents/Healthcare-January-2017.pdf"
    url = "http://s3.amazonaws.com/NLP_Project/Original_Documents/Sample_Test.pdf"
    # url = "http://s3.amazonaws.com/NLP_Project/Original_Documents/Sazerac_FS_2017_06_30%20Annual.pdf"
    # creating the instance of class
    pdf_extractor = PDFExtractor(url)

    # Getting desired data out: pages 15 to 23 (1-based, inclusive)
    pdf_extractor.read_pages(15, 23)

在尝试textract(似乎有太多依赖项)和pypdf2(无法从我测试的pdf中提取文本)和tika(太慢)后,我最终使用了xpdf中的pdftotext(正如已经在另一个答案中建议的那样),并直接从python中调用二进制(您可能需要调整路径到pdftotext):

import os
import subprocess

# Directory that holds this script; the PDF is looked up next to it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# '-' as the output argument makes pdftotext write the text to stdout.
pdf_path = "{}/my-pdf.pdf".format(SCRIPT_DIR)
args = ["/usr/local/bin/pdftotext", '-enc', 'UTF-8', pdf_path, '-']

# Capture both streams; only stdout is decoded and used.
res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output = res.stdout.decode('utf-8')

pdftotext,它基本上是相同的,但这假设pdftotext在/usr/local/bin中,而我在AWS lambda中使用这个,并希望从当前目录使用它。

顺便说一句:为了在lambda上使用这个,你需要把二进制文件和对libstdc++.so的依赖放在你的lambda函数中。我个人需要编译xpdf。由于这个指令会把这个答案放大,我把它们放在在我的个人博客上

下面的代码是Python 3中问题的解决方案。在运行代码之前,请确保已经在您的环境中安装了PyPDF2库。如果未安装,打开命令提示符,执行以下命令:

pip3 install PyPDF2

解决方案的代码:

import PyPDF2

# Print the extracted text of every page; the `with` block closes the file
# once all pages have been read.
with open('sample.pdf', 'rb') as pdfFileObject:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
    count = pdfReader.numPages
    for i in range(count):
        page = pdfReader.getPage(i)
        print(page.extractText())

下面是提取文本的最简单代码

代码:

# importing required modules
import PyPDF2


# Zero-based index of the page to extract (5 is the sixth page); the PDF must
# have at least that many pages.
PAGE_INDEX = 5

# creating a pdf file object; the `with` block closes it even if extraction fails
with open('filename.pdf', 'rb') as pdfFileObj:
    # creating a pdf reader object
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # printing number of pages in pdf file
    print(pdfReader.numPages)

    # creating a page object
    pageObj = pdfReader.getPage(PAGE_INDEX)

    # extracting text from page
    print(pageObj.extractText())

使用下面的代码可以一次性提取多页pdf的全部文本,而不必把各个页码逐一作为参数传入:

import PyPDF2
import collections

# Print the text of every page in order (Python 3).
with open('samples.pdf', 'rb') as pdf_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    number_of_pages = read_pdf.getNumPages()
    # A plain range visits the same page indices, in the same order, as the
    # Counter built from that range did.
    for i in range(number_of_pages):
        page = read_pdf.getPage(i)
        page_content = page.extractText()
        # print() is a function in Python 3 and takes the str as-is.
        print(page_content)
PyPDF2在某些情况下会忽略空白,使结果文本混乱;我改用了PyMuPDF,对它非常满意。你可以通过链接获取更多信息。

你可以从这里下载tika-app-xxx.jar(最新版)。

然后将这个.jar文件放在python脚本文件的同一文件夹中。

然后在脚本中插入如下代码:

import os
import os.path
import subprocess


# Path of the Tika application jar, expected next to this script. Replace the
# placeholder with the real file name.
tika_dir=os.path.join(os.path.dirname(__file__),'<tika-app-xxx>.jar')


def extract_pdf(source_pdf:str,target_txt:str):
    """Write the plain text of *source_pdf* to *target_txt* using the Tika jar.

    Args:
        source_pdf: path of the document to convert.
        target_txt: path of the text file to create (overwritten if present).

    The command is passed as an argument list, so paths containing spaces or
    shell metacharacters are handled safely. As before, a non-zero exit status
    from java is not treated as an error.
    """
    # Binary mode passes Tika's stdout through untouched, like the shell
    # redirection did.
    with open(target_txt, 'wb') as out:
        subprocess.run(['java', '-jar', tika_dir, '-t', source_pdf], stdout=out)

该方法的优点:

更少的依赖。单个.jar文件比python包更容易管理。

多格式支持。source_pdf可以指向任何类型的文档(.doc、.html、.odt等)。

最新的。tika-app.jar始终先于相关版本的tika python包发布。

稳定。它比PyPDF更加稳定和维护良好(由Apache提供支持)。

劣势:

需要安装无头版的Java运行时(jre-headless)。

PyPDF2确实有效,但结果可能有所不同。我从其结果提取中看到了相当不一致的结果。

# Fragment taken from inside a method: `self._path` is supplied by the
# enclosing object, whose class is not shown in the answer.
reader=PyPDF2.pdf.PdfFileReader(self._path)
eachPageText=[]
for i in range(0,reader.getNumPages()):
    pageText=reader.getPage(i).extractText()
    print(pageText)
    eachPageText.append(pageText)

我尝试过许多Python PDF转换器,想更新一下这篇评测。Tika是其中最好的之一。不过用户@ehsaneha推荐的PyMuPDF也是个好消息。

我做了一个代码来比较它们:https://github.com/erfelipe/PDFtextExtraction,我希望能帮助你。

Tika-Python是Apache Tika™REST服务的Python绑定 允许在Python社区中本地调用Tika

from tika import parser


raw = parser.from_file("///Users/Documents/Textos/Texto1.pdf")

# Work on the extracted text only. Stringifying the whole result mapping (and
# later the encoded bytes) printed a bytes repr with mangled escape sequences
# instead of the document text.
# NOTE(review): 'content' is assumed to be None when no text was found — confirm.
content = raw['content'] or ""


# Drop anything that cannot be encoded as UTF-8, then go back to str.
safe_text = content.encode('utf-8', errors='ignore').decode('utf-8')


safe_text = safe_text.replace("\n", "").replace("\\", "")
print('--- safe text ---' )
print( safe_text )

pdftotext是最好和最简单的一个! Pdftotext也保留结构

我尝试了PyPDF2, PDFMiner和其他一些程序,但没有一个能给出令人满意的结果。

我找到了一个解决方案在这里PDFLayoutTextStripper

它很好,因为它可以保留原始PDF的布局

它是用Java编写的,但我已经添加了一个网关来支持Python。

示例代码:

from py4j.java_gateway import JavaGateway


# Connect to the already running Java gateway and ask it to strip the PDF.
gw = JavaGateway()
result = gw.entry_point.strip('samples/bus.pdf')


# result is a dict of {
#   'success': 'true' or 'false',
#   'payload': pdf file content if 'success' is 'true'
#   'error': error message if 'success' is 'false'
# }


# 'payload' is only present on success, so check the flag before reading it.
if result['success'] == 'true':
    print(result['payload'])
else:
    print(result['error'])

PDFLayoutTextStripper的示例输出: enter image description here

更多细节可以在这里看到:Stripper with Python

如果您在Windows上的Anaconda中尝试它,PyPDF2可能无法处理一些具有非标准结构或unicode字符的pdf。如果你需要打开并阅读大量pdf文件,我建议使用以下代码-相对路径为.//pdfs//的文件夹中所有pdf文件的文本将存储在列表pdf_text_list中。

from tika import parser
import glob
import os


def read_pdf(filename):
    """Parse *filename* with Tika and return the parser's result mapping."""
    return parser.from_file(filename)


# Every PDF in the ./pdfs folder, relative to the working directory. On
# Windows the joined pattern is identical to ".\\pdfs\\*.pdf".
all_files = glob.glob(os.path.join(".", "pdfs", "*.pdf"))
pdf_text_list=[]
for file in all_files:
    text=read_pdf(file)
    pdf_text_list.append(text['content'])


print(pdf_text_list)

从PDF中提取文本使用下面的代码

import PyPDF2

# Print the page count and the text of the first page; the `with` block
# closes the PDF afterwards.
with open('mypdf.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    print(pdfReader.numPages)

    pageObj = pdfReader.getPage(0)

    a = pageObj.extractText()


print(a)

我有一个比OCR更好的变通办法,它在从PDF中提取文本的同时还能保持页面对齐。应该会有帮助:

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO


def convert_pdf_to_txt(path):
    """Return the text of every page of the PDF at *path* using pdfminer.

    Args:
        path: file system path of the PDF.

    Returns:
        The text collected by the converter, as a single string.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()

            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)

        # Read the buffer before it is closed below.
        text = retstr.getvalue()
    finally:
        # Release the converter and the buffer even when a page fails.
        device.close()
        retstr.close()
    return text


# Run the example only when executed as a script, not on import.
if __name__ == "__main__":
    text = convert_pdf_to_txt('test.pdf')
    print(text)

在2020年,上述解决方案并不适用于我正在使用的特定pdf。下面是诀窍。我用的是Windows 10和Python 3.8

测试pdf文件:https://drive.google.com/file/d/1aUfQAlvq5hA9kz2c9CyJADiY3KpY3-Vn/view?usp=sharing

#pip install pdfminer.six
import io


from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage




def convert_pdf_to_txt(path):
    '''Convert pdf content from a file path to text

    :path the file path
    :return the text of all pages as one string
    '''
    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()

    # Nested context managers close the file, the converter and the buffer in
    # that order, even if a page fails to parse.
    with io.StringIO() as retstr:
        with TextConverter(rsrcmgr, retstr, codec=codec,
                           laparams=laparams) as device:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                password = ""
                maxpages = 0
                caching = True
                pagenos = set()

                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)

                # Must be read while the StringIO is still open.
                return retstr.getvalue()




# Example invocation; adjust the path to a PDF on your machine.
if __name__ == "__main__":
    print(convert_pdf_to_txt('C:\\Path\\To\\Test_PDF.pdf'))

一种更健壮的方法,假设有多个PDF或只有一个!

import os
from PyPDF2 import PdfFileWriter, PdfFileReader
from io import BytesIO


# Directory that holds the PDF (or PDFs); replace "." with your own path.
mydir = "."


# Open the output once so the text of every PDF is kept. Reopening it in "w"
# mode for each PDF kept only the last one.
with open("myfile.txt", "w", encoding="utf-8") as file1:
    for arch in os.listdir(mydir):
        # Skip anything in the directory that is not a PDF.
        if not arch.lower().endswith(".pdf"):
            continue
        archpath = os.path.join(mydir, arch)
        with open(archpath, 'rb') as pdfFileObj:
            pdfReader = PdfFileReader(pdfFileObj)
            # Only the first page of each PDF is extracted.
            pageObj = pdfReader.getPage(0)
            ley = pageObj.extractText()
        file1.writelines(ley)
            

PyPDF2最近改进了很多。根据数据的不同,它与pdfminer.six相当或更好。

pymupdf / tika / PDFium优于PyPDF2,但差异变得相当小 (主要是在设置新行时)。最核心的部分是它们要快得多。但它们不是纯python,这意味着你不能执行它。有些可能有太严格的许可,所以你可能不能使用它

可以看一下基准测试

截至2022年11月:

enter image description here

enter image description here

PyPDF2

编辑:我最近成为了PyPDF2的维护者!😁社区对文本提取进行了大量改进。试一试:-)

from PyPDF2 import PdfReader


reader = PdfReader("example.pdf")
# Concatenate the text of all pages, each followed by a newline.
text = "".join(page.extract_text() + "\n" for page in reader.pages)

请注意,这些包是不维护的:

  • PyPDF3, PyPDF4
  • pdfminer(不带 .six 的版本)

pymupdf

import fitz # install using: pip install PyMuPDF


# The document is closed automatically when the `with` block ends.
with fitz.open("my.pdf") as doc:
    text = ""
    for page in doc:
        text += page.get_text()


print(text)

其他PDF库

  • pikepdf不支持文本提取

如何从PDF文件中提取文本?

首先要理解的是PDF格式。它有一个用英文编写的公共规范,请参阅ISO 32000 - 2:2017并阅读超过700页的PDF 1.7规范。你当然至少需要阅读维基百科关于PDF的页面

一旦你理解了PDF格式的细节,提取文本或多或少是容易的(但是出现在图形或图像中的文本呢?它的数字1)?不要指望在几周内单独编写一个完美的软件文本提取器....

在Linux上,你也可以使用pdf2text,你可以从你的Python代码中popen

一般来说,从PDF文件中提取文本是一个定义不清的问题。对于人类读者来说,一些文本可以由不同的点制成(图形),或者一张照片等等。

谷歌搜索引擎能够从PDF中提取文本,但据传需要超过5亿行的源代码。你有必要的资源(人力和预算)来发展一个竞争对手吗?

一种可能是将PDF打印到某个虚拟打印机(例如使用内容火狐),然后使用光学字符识别技术提取文本。

相反,我建议处理生成该PDF文件的数据表示,例如原始的LaTeX代码(或Lout代码)或OOXML代码。

在所有情况下,您都需要为至少几个人年的软件开发预算。

如果想要从表格中提取文本,我发现tabula很容易实现,准确且快速:

获取pandas DataFrame:

import tabula


# Read the table found in the PDF; per the answer text the result is a
# pandas DataFrame.
df = tabula.read_pdf('your.pdf')


# Bare expression: shows the result in an interactive session or notebook and
# has no effect when run as a script.
df

默认情况下,它忽略表之外的页面内容。到目前为止,我只在单页、单表文件上进行了测试,但是有一些kwarg可以容纳多页和/或多表。

安装通过:

pip install tabula-py
# or
conda install -c conda-forge tabula-py
在直接的文本提取方面,参见: https://stackoverflow.com/a/63190886/9249533

使用pdfminer.six。下面是文档:https://pdfminersix.readthedocs.io/en/latest/index.html

将pdf转换为文本:

    def pdf_to_text(path='test.pdf'):
        """Print and return the text of the PDF at *path* using pdfminer.six.

        Args:
            path: PDF to read; defaults to 'test.pdf' as in the original example.

        Returns:
            The extracted text.
        """
        from pdfminer.high_level import extract_text

        text = extract_text(path)
        print(text)
        return text

Camelot似乎是Python中从pdf中提取表格的一个相当强大的解决方案。

乍一看,它似乎实现了几乎和CreekGeek建议的tabula-py包一样准确的提取,它已经超过了今天在可靠性方面发布的任何其他解决方案,但它应该是更具可配置性。此外,它有自己的精度指示器(results.parsing_report),和伟大的调试功能。

Camelot和Tabula都将结果作为Pandas的dataframe提供,因此之后很容易调整表。

pip install camelot-py

(不要与camelot包混淆。)

import camelot


df_list = []
# Add further read_pdf() keyword options after the path as needed. The `...`
# placeholder was removed because it would be passed as a real argument.
results = camelot.read_pdf("file.pdf")
for table in results:
    print(table.parsing_report)
    # Collect the DataFrame of the current table; `results[0].df` appended
    # the first table on every iteration.
    df_list.append(table.df)

它还可以输出结果为CSV, JSON, HTML或Excel。

Camelot的代价是需要安装许多依赖项。

NB :由于我的输入非常复杂,有许多不同的表,我最终使用这两个 Camelot和Tabula,根据表,以实现最佳结果。

试试borb,一个纯python PDF库

import typing
from borb.pdf.document import Document
from borb.pdf.pdf import PDF
from borb.toolkit.text.simple_text_extraction import SimpleTextExtraction




def main():
    """Read input.pdf with borb and print the text found on page 0."""

    # variable to hold Document instance
    doc: typing.Optional[Document] = None

    # this implementation of EventListener handles text-rendering instructions
    l: SimpleTextExtraction = SimpleTextExtraction()

    # open the document, passing along the array of listeners
    with open("input.pdf", "rb") as in_file_handle:
        doc = PDF.loads(in_file_handle, [l])

    # were we able to read the document?
    assert doc is not None

    # print the text on page 0
    print(l.get_text(0))


# Run the example only when executed as a script.
if __name__ == "__main__":
    main()


你可以使用pytesseract和OpenCV简单地做到这一点。参考下面的代码。你可以从这篇文章中获得更多细节。

import os
from PIL import Image
from pdf2image import convert_from_path
import pytesseract


# Plain ASCII quotes: the typographic quotes from the article are not valid
# Python string delimiters.
filePath = '021-DO-YOU-WONDER-ABOUT-RAIN-SNOW-SLEET-AND-HAIL-Free-Childrens-Book-By-Monkey-Pen.pdf'
# Render the PDF to images; the loop below treats each item as one page.
doc = convert_from_path(filePath)


path, fileName = os.path.split(filePath)
fileBaseName, fileExtension = os.path.splitext(fileName)


for page_number, page_data in enumerate(doc):
    # Keep the OCR result as str; encoding it first made Python 3 print the
    # bytes repr (b'...') instead of the text.
    txt = pytesseract.image_to_string(page_data)
    print("Page # {} — {}".format(str(page_number), txt))

截至2021年,我想推荐pdfreader,因为PyPDF2/3现在似乎很麻烦,而tika实际上是用java编写的,需要在后台安装jre。pdfreader是python的,目前维护良好,并有广泛的文档在这里

照常安装:pip install pdfreader

用法的简短例子:

from pdfreader import PDFDocument, SimplePDFViewer


# Low-level access: parse the raw document structure.
# `file_name` must hold the path of the PDF to open.
fd = open(file_name, "rb")
doc = PDFDocument(fd)


# doc.pages() yields the pages one at a time
page_one = next(doc.pages())
all_pages = list(doc.pages())


# High-level access: a viewer over a fresh handle on the same file
fd = open(file_name, "rb")
viewer = SimplePDFViewer(fd)

它包括根据文档中的页数动态设置为每个PDF页创建一个新工作表。

import PyPDF2 as p2
import xlsxwriter


pdfFileName = "sample.pdf"
workbook = xlsxwriter.Workbook('pdftoexcel.xlsx')

# One worksheet per PDF page, one row per extracted text line.
with open(pdfFileName, 'rb') as pdfFile:
    pdfread = p2.PdfFileReader(pdfFile)
    number_of_pages = pdfread.getNumPages()

    for page_number in range(number_of_pages):
        print(f'Sheet{page_number}')
        pageinfo = pdfread.getPage(page_number)
        rawInfo = pageinfo.extractText().split('\n')

        column = 0
        worksheet = workbook.add_worksheet(f'Sheet{page_number}')

        for row, line in enumerate(rawInfo):
            worksheet.write(row, column, line)

# Closing the workbook once, after all sheets are filled, writes the xlsx file.
workbook.close()

Pdfplumber是一个更好的从pdf中读取和提取数据的库。它还提供了读取表数据的方法,在经历了大量这样的库之后,pdfplumber最适合我。

请注意,它最适合机器编写的pdf,而不是扫描的pdf。

import pdfplumber

# Print the text of the first page; the PDF is closed when the block ends.
with pdfplumber.open(r'D:\examplepdf.pdf') as pdf:
    first_page = pdf.pages[0]
    print(first_page.extract_text())

目的:从PDF中提取文本

所需工具:

  1. Poppler for Windows:pdftotext在Windows上的封装。对于Anaconda:conda install -c conda-forge poppler

  2. pdftotext实用程序转换PDF到文本。

步骤:安装Poppler。Windows操作系统:在环境变量PATH中添加“xxx/bin/”。然后执行 pip install pdftotext

import pdftotext


# Load your PDF
with open("Target.pdf", "rb") as f:
    pdf = pdftotext.PDF(f)


# Save all text to a txt file, pages separated by a blank line.
with open('output.txt', 'w') as f:
    f.write("\n\n".join(pdf))