使用Python从HTML文件中提取文本

小开

最佳答案

html2text是一个Python程序，它在这方面做得很好。

小开

PyParsing做得很好。PyParsing wiki被杀死了，所以这里有另一个位置，其中有使用PyParsing (例子链接)的示例。花点时间在pyparsing上的一个原因是，他还写了一本非常简短、组织良好的O'Reilly捷径手册，而且价格便宜。

话虽如此，我经常使用BeautifulSoup，处理实体问题并不难，你可以在运行BeautifulSoup之前转换它们。

祝你好运

小开

你也可以在stripogram库中使用html2text方法。

# stripogram is a third-party package (see the install note that follows).
from stripogram import html2text
# your_html_string is the HTML markup to convert; it is not defined in this snippet.
text = html2text(your_html_string)

需要安装stripogram，请执行sudo easy_install stripogram命令

小开

今天我发现自己面临着同样的问题。我编写了一个非常简单的HTML解析器来剥离传入内容中的所有标记，仅以最小的格式返回剩余的文本。

from HTMLParser import HTMLParser
from re import sub
from sys import stderr
from traceback import print_exc


class _DeHTMLParser(HTMLParser):
    """Strip all markup from HTML, keeping the text with minimal formatting.

    Text nodes are whitespace-normalised and collected in order.  A ``<p>``
    start tag inserts a blank line and a ``<br>`` (written either as ``<br>``
    or ``<br/>``) inserts a single line break; every other tag is dropped.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Output fragments in document order; joined by text().
        self.__text = []

    def handle_data(self, data):
        """Collect a text node, collapsing whitespace runs to one space."""
        text = data.strip()
        if text:
            text = sub('[ \t\r\n]+', ' ', text)
            # The trailing space separates this node from the next one.
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        """Translate paragraph and line-break tags into newlines."""
        if tag == 'p':
            self.__text.append('\n\n')
        elif tag == 'br':
            self.__text.append('\n')

    def handle_startendtag(self, tag, attrs):
        """Treat the self-closing ``<br/>`` exactly like ``<br>``."""
        if tag == 'br':
            self.__text.append('\n')

    def text(self):
        """Return the collected text with surrounding whitespace removed."""
        return ''.join(self.__text).strip()




def dehtml(text):
    """Return *text* with its HTML markup removed.

    The conversion is best-effort: if parsing fails, the traceback is
    written to stderr and the input is returned unchanged.
    """
    try:
        parser = _DeHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        print_exc(file=stderr)
        return text




def main():
    """Convert a small sample HTML document with dehtml and print the result."""
    sample_markup = r'''
<html>
<body>
<b>Project:</b> DeHTML<br>
<b>Description</b>:<br>
This small script is intended to allow conversion from HTML markup to
plain text.
</body>
</html>
'''
    plain = dehtml(sample_markup)
    print(plain)


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

小开

注意: NLTK不再支持clean_html函数

下面是原始答案，评论部分有备选答案。

使用NLTK

我浪费了4-5个小时来修复html2text的问题。幸运的是我能遇到NLTK 它神奇地起作用。

import nltk
# Python 2 import location; on Python 3 urlopen lives in urllib.request.
from urllib import urlopen


url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# Download the raw page.
html = urlopen(url).read()
# NOTE: per the note preceding this answer, NLTK no longer supports
# clean_html; the call is kept as originally posted.
raw = nltk.clean_html(html)
print(raw)

小开

而不是HTMLParser模块，签出htmllib。它有一个类似的界面，但是为您做了更多的工作。(它非常古老，所以在摆脱javascript和css方面没有多大帮助。你可以创建一个派生类，但是可以添加start_script和end_style这样的方法(详见python文档)，但对于格式不正确的html来说，很难可靠地做到这一点。)不管怎样，这里有一些简单的东西，它将纯文本打印到控制台

# Python 2 snippet: note the print statement below.
from htmllib import HTMLParser, HTMLParseError
from formatter import AbstractFormatter, DumbWriter
# The parser hands text to the formatter/writer pair; per the surrounding
# text this prints the plain text to the console.
p = HTMLParser(AbstractFormatter(DumbWriter()))
try: p.feed('hello<br>there'); p.close() #calling close is not usually needed, but let's play it safe
except HTMLParseError: print ':(' #the html is badly malformed (or you found a bug)

小开

这不是一个完全的Python解决方案，但它会将Javascript生成的文本转换为文本，我认为这是重要的(例如google.com)。浏览器Links(不是Lynx)有一个Javascript引擎，可以通过-dump选项将源代码转换为文本。

所以你可以这样做:

import os
import subprocess
import tempfile

# Write the markup to a real temporary file. os.tmpnam() only returned a
# file *name* (a string with no write method) and was race-prone.
with tempfile.NamedTemporaryFile('w', suffix='.html', delete=False) as tmp:
    tmp.write(html_source)
    fname = tmp.name

try:
    # Discard the browser's stderr; the with-block closes the handle again.
    with open(os.devnull, 'w') as devnull:
        proc = subprocess.Popen(['links', '-dump', fname],
                                stdout=subprocess.PIPE,
                                stderr=devnull)
        # communicate() reads all of stdout and waits for the process to exit.
        text = proc.communicate()[0]
finally:
    os.remove(fname)

小开

有用于数据挖掘的模式库。

http://www.clips.ua.ac.be/pages/pattern-web

你甚至可以决定保留什么标签:

# URL and plaintext are presumably imported from pattern.web (import not shown) — confirm.
s = URL('http://www.clips.ua.ac.be').download()
# keep maps each tag to retain to the attributes to retain on it (per the text above).
s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
# Python 2 print statement.
print s

小开

Beautiful soup可以转换html实体。考虑到HTML经常有bug并且充满unicode和HTML编码问题，这可能是您最好的选择。这是我用来将html转换为原始文本的代码:

import BeautifulSoup
def getsoup(data, to_unicode=False):
    """Parse *data* with BeautifulSoup 3 after patching known-bad markup.

    ``&nbsp;`` is replaced by a plain space before parsing.  When
    *to_unicode* is true, all entities are converted during parsing;
    otherwise entity conversion is left off.
    """
    cleaned = data.replace("&nbsp;", " ")
    # Fixes for bad markup I've seen in the wild.  Remove if not applicable.
    comment_fixes = [
        (re.compile('<!-([^-])'), lambda match: '<!--' + match.group(1)),
        (re.compile('<!WWWAnswer T[=\w\d\s]*>'), lambda match: '<!--' + match.group(0) + '-->'),
    ]
    # Work on a copy so the library's default massage list is not mutated.
    massage_rules = copy.copy(BeautifulSoup.BeautifulSoup.MARKUP_MASSAGE)
    massage_rules.extend(comment_fixes)
    if to_unicode:
        entity_mode = BeautifulSoup.BeautifulSoup.ALL_ENTITIES
    else:
        entity_mode = None
    return BeautifulSoup.BeautifulSoup(cleaned, markupMassage=massage_rules,
                                       convertEntities=entity_mode)


def remove_html(c):
    """Return the text of HTML string *c*, or "" when *c* is empty or None.

    Text nodes are joined with a single space and entities are converted.
    """
    if not c:
        return ""
    return getsoup(c, to_unicode=True).getText(separator=u' ')

小开

下面是xperroni回答的一个更完整的版本。它会跳过script和style部分，并转换字符引用(例如 &#39;)和HTML实体(例如 &amp;)。

它还包括一个简单的纯文本到html的反向转换器。

"""
HTML <-> text conversions.
"""
from HTMLParser import HTMLParser, HTMLParseError
from htmlentitydefs import name2codepoint
import re


try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr already returns a unicode character


class _HTMLToText(HTMLParser):
    """Collect the visible text of an HTML document.

    ``<p>`` and ``<br>`` become newlines, entity and character references
    are decoded, and everything inside ``<script>`` / ``<style>`` is
    suppressed.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Output fragments in document order.
        self._buf = []
        # True while inside a <script> or <style> element.
        self.hide_output = False

    def handle_starttag(self, tag, attrs):
        if tag in ('p', 'br') and not self.hide_output:
            self._buf.append('\n')
        elif tag in ('script', 'style'):
            self.hide_output = True

    def handle_startendtag(self, tag, attrs):
        if tag == 'br':
            self._buf.append('\n')

    def handle_endtag(self, tag):
        if tag == 'p':
            self._buf.append('\n')
        elif tag in ('script', 'style'):
            self.hide_output = False

    def handle_data(self, text):
        if text and not self.hide_output:
            self._buf.append(re.sub(r'\s+', ' ', text))

    def handle_entityref(self, name):
        """Decode a named entity such as ``&amp;``; unknown names are dropped."""
        if name in name2codepoint and not self.hide_output:
            c = _unichr(name2codepoint[name])
            self._buf.append(c)

    def handle_charref(self, name):
        """Decode a numeric reference such as ``&#39;``, ``&#x27;`` or ``&#X27;``.

        References that do not denote a valid code point are dropped rather
        than aborting the whole conversion.
        """
        if self.hide_output:
            return
        try:
            # The hex prefix may be written in either case.
            if name[:1] in ('x', 'X'):
                n = int(name[1:], 16)
            else:
                n = int(name)
            c = _unichr(n)
        except (ValueError, OverflowError):
            return
        self._buf.append(c)

    def get_text(self):
        """Return the collected text with runs of spaces collapsed."""
        return re.sub(r' +', ' ', ''.join(self._buf))


def html_to_text(html):
    """
    Return the plain text contained in the markup *html*.

    Entities and character references are decoded; the contents of
    <script> and <style> elements are left out of the result.
    """
    extractor = _HTMLToText()
    try:
        extractor.feed(html)
        extractor.close()
    except HTMLParseError:
        # Best effort: keep whatever text was collected before the failure.
        pass
    return extractor.get_text()


def text_to_html(text):
    """
    Convert the given text to html, wrapping what looks like URLs with <a> tags,
    converting newlines to <br> tags and converting confusing chars into html
    entities.

    A URL ends at the first whitespace character, bracket, quote, semicolon
    or angle bracket, and is escaped before it is placed in the link, so
    the input can never inject markup.
    """
    replacements = {'&': '&amp;', "'": '&#39;', '"': '&quot;',
                    '<': '&lt;', '>': '&gt;', '\n': '<br>'}

    def escape(s):
        return re.sub(r'[&\'"<>]', lambda m: replacements[m.group()], s)

    def f(mo):
        t = mo.group()
        if len(t) == 1:
            return replacements[t]
        # Multi-character matches are URLs; '&' is the only special
        # character the URL pattern lets through, and it must be escaped.
        url = escape(t)
        return '<a href="%s">%s</a>' % (url, url)

    return re.sub(r'https?://[^\s\]()"\';<>]+|[&\'"<>\n]', f, text)

小开

在Python 3.x中，你可以通过导入imaplib和email包，以非常简单的方式做到这一点。虽然这是一个老帖子，但也许我的答案可以帮助到这个帖子的新人。

status, data = self.imap.fetch(num, '(RFC822)')
# data[0][1] holds the raw message bytes; use email.message_from_string
# instead if the payload is already a str.
email_msg = email.message_from_bytes(data[0][1])

# Only the text/plain version of the body is wanted.  A multipart message
# is walked part by part; a single-part message is inspected directly.
# body stays None when the message has no text/plain part.
body = None
parts = email_msg.walk() if email_msg.is_multipart() else [email_msg]
for part in parts:
    if part.get_content_type() == "text/plain":
        # decode=True undoes the transfer encoding (Base64, quoted-printable, uuencode).
        payload = part.get_payload(decode=True)
        # Honour the charset declared on the part, falling back to UTF-8.
        body = payload.decode(part.get_content_charset() or 'utf-8', errors='replace')

现在你可以打印主体变量，它将是明文格式:)如果它对你来说足够好，那么它将很好地选择它作为接受的答案。

小开

我发现的最好的一段代码提取文本没有javascript或不想要的东西:

from urllib.request import urlopen
from bs4 import BeautifulSoup


url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read()
soup = BeautifulSoup(html, features="html.parser")

# Remove every <script> and <style> element from the tree.
for unwanted in soup(["script", "style"]):
    unwanted.extract()

text = soup.get_text()

# Strip each line, split lines on double spaces into separate chunks,
# then drop the chunks that are empty.
stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
chunks = (piece.strip() for stripped in stripped_lines for piece in stripped.split("  "))
text = '\n'.join(filter(None, chunks))

print(text)

你只需要安装BeautifulSoup:

pip install beautifulsoup4

小开

另一种选择是通过基于文本的web浏览器运行html并转储它。例如(使用Lynx):

lynx -dump html_to_convert.html > converted_html.txt

这可以在python脚本中完成，如下所示:

import subprocess


# Send lynx's rendered text straight into the output file.  stdout must be
# the handle opened by the with-statement; the original passed an
# undefined name.
with open('converted_html.txt', 'w') as outputFile:
    subprocess.call(['lynx', '-dump', 'html_to_convert.html'], stdout=outputFile)

它不会精确地为您提供HTML文件中的文本，但根据您的用例，它可能比html2text的输出更好。

小开

我推荐一个名为goose-extractor的Python包 Goose将尝试提取以下信息:

文章正文、文章主图、文章中嵌入的任何Youtube/Vimeo视频、元描述(meta description)、元标签(meta tags)

小开

另一个非python解决方案:Libre Office:

soffice --headless --invisible --convert-to txt input1.html

我更喜欢这种方法的原因是，每个HTML段落都转换为单个文本行(没有换行符)，这正是我所寻找的。其他方法需要后处理。Lynx的输出确实不错，但并不是我想要的。此外，Libre Office可以用来从各种格式转换…

小开

用一种简单的方式

import re


# Read the file inside a with-block so the handle is closed promptly.
with open('html_file.html') as html_file:
    html_text = html_file.read()
# Remove everything between '<' and the nearest following '>' on the same line.
text_filtered = re.sub(r'<(.*?)>', '', html_text)

这段代码找到html_text中以'<'开头，以'>'结尾的所有部分，并将所有找到的部分替换为空字符串

小开

我得到的结果是这样的。

>>> import requests
>>> url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
>>> res = requests.get(url)
>>> # NOTE(review): res.text is presumably the decoded response body, i.e. still HTML markup — confirm
>>> text = res.text

小开

如果您需要更高的速度和更低的准确性，那么您可以使用原始lxml。

import lxml.html as lh
from lxml.html.clean import clean_html


def lxml_to_text(html):
    """Parse *html* with lxml, run clean_html over it, and return its text content."""
    return clean_html(lh.fromstring(html)).text_content()

小开

我知道已经有很多答案了，但我发现的最优雅、最Pythonic的解决方案在这里有部分描述。

from bs4 import BeautifulSoup


# some_html_string is the markup to convert; it is not defined in this snippet.
# Joins every text node of the document with single spaces.
text = ' '.join(BeautifulSoup(some_html_string, "html.parser").findAll(text=True))

更新

根据弗雷泽的评论，这里有一个更优雅的解决方案:

from bs4 import BeautifulSoup


# some_html_string is the markup to convert; it is not defined in this snippet.
# NOTE(review): stripped_strings presumably yields each text node with surrounding whitespace removed — confirm
clean_text = ' '.join(BeautifulSoup(some_html_string, "html.parser").stripped_strings)

小开

@PeYoTIL使用BeautifulSoup并删除样式和脚本内容的回答对我来说并不管用。我尝试使用decompose而不是extract，但它仍然不工作。所以我创建了自己的，它也使用标记格式化文本，并用href链接替换<a>标记。也处理文本内的链接。可在这个要点中使用，并嵌入了测试文档。

from bs4 import BeautifulSoup, NavigableString


def html_to_text(html):
    """Creates a formatted text email message as a string from a rendered html template (page).

    Walks every descendant of <body>, keeps plain text nodes (skipping
    those directly inside <script>/<style>), replaces link text with the
    link's href, and joins the collected pieces with newlines.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Ignore anything in head
    # NOTE(review): soup.body is presumably None when the markup has no
    # <body>, which would make the loop below fail — confirm callers always
    # pass a full page.
    body, text = soup.body, []
    for element in body.descendants:
        # We use type and not isinstance since comments, cdata, etc are subclasses that we don't want
        if type(element) == NavigableString:
            # We use the assumption that other tags can't be inside a script or style
            if element.parent.name in ('script', 'style'):
                continue


            # remove any multiple and leading/trailing whitespace
            string = ' '.join(element.string.split())
            if string:
                if element.parent.name == 'a':
                    a_tag = element.parent
                    # replace link text with the link
                    # NOTE(review): raises KeyError for an <a> without an href attribute.
                    string = a_tag['href']
                    # concatenate with any non-empty immediately previous string
                    if (    type(a_tag.previous_sibling) == NavigableString and
                            a_tag.previous_sibling.string.strip() ):
                        text[-1] = text[-1] + ' ' + string
                        continue
                elif element.previous_sibling and element.previous_sibling.name == 'a':
                    # Text directly following a link is appended to the link's line.
                    # NOTE(review): text may still be empty here (e.g. the link had
                    # no text), in which case text[-1] raises IndexError — confirm.
                    text[-1] = text[-1] + ' ' + string
                    continue
                elif element.parent.name == 'p':
                    # Add extra paragraph formatting newline
                    string = '\n' + string
                text += [string]
    doc = '\n'.join(text)
    return doc

小开

有人尝试过用bleach库的bleach.clean(html, tags=[], strip=True)吗?这对我很有用。

小开

使用以下命令安装html2text:

pip install html2text

然后,

>>> import html2text
>>>
>>> h = html2text.HTML2Text()
>>> # Ignore converting links from HTML
>>> h.ignore_links = True
>>> # Python 2 print statement; on Python 3 write print(h.handle(...)).
>>> print h.handle("<p>Hello, <a href='http://earth.google.com/'>world</a>!")
Hello, world!

小开

下面是我经常使用的代码。

from bs4 import BeautifulSoup
import urllib.request




def processText(webpage):
    """Download a page and return the text of its paragraphs.

    webpage: an object whose .group() returns the URL to fetch (for
        example a regular-expression match).

    Returns a list with one whitespace-normalised string per <p> element
    that matched find_all("p", text=True).  The list is empty when the
    server answers with an HTTP error.
    """
    try:
        # The with-block closes the HTTP response once the page is parsed.
        with urllib.request.urlopen(webpage.group()) as news_open:
            news_soup = BeautifulSoup(news_open, "lxml")
    except urllib.error.HTTPError:
        # Deliberate best effort: an unreachable page yields no text.
        return []

    news_para = news_soup.find_all("p", text=True)
    # Split and re-join the words to remove extra whitespace.
    return [' '.join(item.text.split()) for item in news_para]

我希望这对你有所帮助。

小开

我知道这里已经有很多答案了，但我认为newspaper3k也值得一提。我最近需要完成一个类似的任务，即从网络上的文章中提取文本，到目前为止，这个库在我的测试中完成了出色的工作。它忽略菜单项和边栏中的文本，以及OP请求时出现在页面上的任何JavaScript。

from newspaper import Article


# url is the address of the article; it is not defined in this snippet.
article = Article(url)
article.download()
article.parse()
# Bare expression: only displays the text in an interactive session.
article.text

如果你已经下载了HTML文件，你可以这样做:

# Empty URL: the markup is supplied directly instead of being downloaded.
article = Article('')
# html holds the already-downloaded markup; it is not defined in this snippet.
article.set_html(html)
article.parse()
article.text

它甚至有一些NLP功能来总结文章的主题:

# NOTE(review): presumably requires the article to be downloaded and parsed first — confirm
article.nlp()
article.summary

小开

对我来说效果最好的是inscriptis。

https://github.com/weblyzard/inscriptis

import urllib.request
from inscriptis import get_text


url = "http://www.informationscience.ch"
# The response body is decoded as UTF-8 regardless of the charset the server declares.
html = urllib.request.urlopen(url).read().decode('utf-8')


text = get_text(html)
print(text)

结果真的很好

小开

你可以用BeautifulSoup从HTML中提取文本

# urlopen and BeautifulSoup are used without an import in this snippet;
# presumably urllib.request.urlopen and bs4.BeautifulSoup — confirm.
url = "https://www.geeksforgeeks.org/extracting-email-addresses-using-regular-expressions-python/"
con = urlopen(url).read()
soup = BeautifulSoup(con,'html.parser')
texts = soup.get_text()
print(texts)

小开

我用Apache Tika得到了很好的结果。它的目的是从内容中提取元数据和文本，因此底层解析器要进行相应的开箱即用调优。

Tika可以作为服务器运行，在Docker容器中运行/部署是很简单的，从那里可以通过Python绑定访问。

小开

LibreOffice writer注释有其优点，因为应用程序可以使用python宏。它似乎为回答这个问题和进一步扩展LibreOffice的宏观基础提供了多种好处。如果这个解决方案是一次性实现，而不是作为更大的生产程序的一部分使用，那么在writer中打开HTML并将页面保存为文本似乎可以解决这里讨论的问题。

小开

Perl方式(对不起妈妈，我永远不会在生产中这样做)。

import re


def html2text(html):
    """Crudely convert *html* to text using regular expressions only.

    Every tag is replaced by a space, carriage returns are dropped, runs
    of newlines are collapsed to one, and runs of tabs/spaces to a single
    space.  Entities are not decoded.
    """
    # DOTALL lets a tag that spans several lines match as a whole.
    res = re.sub(r'<.*?>', ' ', html, flags=re.DOTALL)
    # Drop carriage returns first so CRLF blank lines collapse like LF ones.
    res = res.replace('\r', '')
    res = re.sub(r'\n+', '\n', res)
    res = re.sub(r'[\t ]+', ' ', res)
    # Collapse lines that now hold nothing but a single space.
    res = re.sub(r'(\n )+', '\n ', res)
    return res

小开

虽然很多人提到使用regex来剥离html标记，但它有很多缺点。

例如:

<p>hello&nbsp;world</p>I love you

应该解析为:

Hello world
I love you

这是我想到的一个片段，你可以根据你的特定需求定制它，它就像一个魅力

import re
import html
def html2text(htm):
    """Convert the HTML fragment *htm* to plain text.

    Line breaks and the end of paragraphs, divs and headings become
    newlines, all other tags become a space, entities are decoded, and a
    few typographic characters are replaced by ASCII equivalents.

    Tags are removed *before* entities are decoded, so escaped text such
    as ``&lt;b&gt;`` survives as literal ``<b>`` instead of being stripped
    as if it were markup.
    """
    # Non-breaking hyphen, curly quotes and non-breaking space.
    ascii_equivalents = {
        8209: ord('-'),
        8220: ord('"'),
        8221: ord('"'),
        160: ord(' '),
    }
    # Whitespace in the markup carries no meaning; newlines come from tags only.
    ret = re.sub(r"\s", " ", htm)
    ret = re.sub(r"<br\s*/?>|</p>|</div>|</h\d>", "\n", ret, flags=re.IGNORECASE)
    ret = re.sub(r"<.*?>", " ", ret)
    lines = []
    for line in ret.split("\n"):
        line = html.unescape(line).translate(ascii_equivalents)
        # Whitespace produced by entities (e.g. &#10;) is flattened too.
        line = re.sub(r"\s", " ", line)
        lines.append(re.sub(r"  +", " ", line))
    return "\n".join(lines)

小开

另一个在Python 2.7.9+中使用BeautifulSoup4的例子

包括:

import urllib2
from bs4 import BeautifulSoup

代码:

def read_website_to_text(url):
    """Fetch *url* and return its visible text, UTF-8 encoded.

    Script and style elements are removed, every line is stripped, lines
    are split on double spaces, and empty chunks are dropped.
    """
    soup = BeautifulSoup(urllib2.urlopen(url), 'html.parser')
    for unwanted in soup(["script", "style"]):
        unwanted.extract()
    stripped_lines = (raw.strip() for raw in soup.get_text().splitlines())
    chunks = (part.strip() for entry in stripped_lines for part in entry.split("  "))
    joined = '\n'.join(filter(None, chunks))
    return str(joined.encode('utf-8'))

解释道:

将url数据读入为html(使用BeautifulSoup)，删除所有脚本和样式元素，并使用.get_text()仅获取文本。先分割成行并删除每行开头和结尾的空格，然后用chunks = (phrase.strip() for line in lines for phrase in line.split("  "))把含有多个标题的行拆分成每个标题一行。接着使用text = '\n'.join(...)去掉空行，最后以utf-8编码返回。

指出:

一些系统这是运行在https://连接失败，因为SSL问题，你可以关闭验证来解决这个问题。修正示例:http://blog.pengyifan.com/how-to-fix-python-ssl-certificate_verify_failed/

在Python < 2.7.9上运行这个可能会有一些问题

text.encode('utf-8')可能会留下奇怪的编码，可能只需要返回str(text)即可。

小开

我也有一个类似的问题，实际上我用了BeautifulSoup的一个答案。问题是它真的很慢。我最终使用了一个叫做selectolax的库。虽然它的功能很有限，但它对这个任务很有效。唯一的问题是我手动删除了不必要的空白。但它的工作速度似乎比BeautifulSoup解决方案快得多

from selectolax.parser import HTMLParser


def get_text_selectolax(html):
    """Return the body text of *html* with whitespace runs collapsed.

    Returns None when the parsed document has no body.  <script> and
    <style> elements are removed before the text is read.
    """
    tree = HTMLParser(html)

    if tree.body is None:
        return None

    for selector in ('script', 'style'):
        for tag in tree.css(selector):
            tag.decompose()

    raw_text = tree.body.text(separator='')
    # split()/join collapses every run of whitespace to one space and trims the ends
    return " ".join(raw_text.split())

小开

这里的所有方法在一些网站上都不能很好地工作。由JS代码生成的段落可以抵抗上述所有问题。下面是受这个答案和这启发最终为我工作的方法。

这个想法是在webdriver中加载页面，并滚动到页面的末尾，让JS做它的事情来生成/加载页面的其余部分。然后插入键击命令选择全部复制/粘贴整个页面:

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pyperclip
import time


# Scrolls to the bottom of the page and reports the resulting page height.
SCROLL_TO_END_JS = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"

driver = webdriver.Chrome()
driver.get("https://www.lazada.com.ph/products/nike-womens-revolution-5-running-shoes-black-i1262506154-s4552606107.html?spm=a2o4l.seller.list.3.6f5d7b6cHO8G2Y&mp=1&freeshipping=1")


# Scroll down to end of the page to let all javascript code load its content.
# Keep scrolling until the page height stops growing between two attempts.
lenOfPage = driver.execute_script(SCROLL_TO_END_JS)
while True:
    lastCount = lenOfPage
    time.sleep(1)
    lenOfPage = driver.execute_script(SCROLL_TO_END_JS)
    if lastCount == lenOfPage:
        break


# copy from the webpage: select all, copy, then read the system clipboard.
# find_element(By.TAG_NAME, ...) replaces find_element_by_tag_name, which
# Selenium 4 removed.
element = driver.find_element(By.TAG_NAME, 'body')
element.send_keys(Keys.CONTROL,'a')
element.send_keys(Keys.CONTROL,'c')
alltext = pyperclip.paste()
alltext = alltext.replace("\n", " ").replace("\r", " ")  # cleaning the copied text
print(alltext )

它很慢。但其他的都不奏效。

更新:一个更好的方法是在滚动到页面末尾后使用inscriptis库加载页面的源代码:

from inscriptis import get_text
# driver is the webdriver from the script above, after it has scrolled to the end of the page.
text = get_text(driver.page_source)

仍然不能与无头驱动程序一起工作(页面检测到它不是由real显示，滚动到末尾不会使JS代码加载它的东西)，但至少我们不需要疯狂的复制/粘贴，这阻碍了我们在共享剪贴板的机器上运行多个脚本。

小开

使用Pandas从HTML中获取表数据。

如果您想从HTML中快速提取表数据，可以使用read_html函数，其文档在这里。在使用此函数之前，您应该阅读有关BeautifulSoup4/html5lib/lxml这些HTML解析库的陷阱和问题(gotchas/issues)。

import pandas as pd


# Despite its name, http holds the URL of the page to read.
http = r'https://www.ibm.com/docs/en/cmofz/10.1.0?topic=SSQHWE_10.1.0/com.ibm.ondemand.mp.doc/arsa0257.htm'
# The result is indexed below, so read_html returns a sequence; its first item is taken.
table = pd.read_html(http)
df = table[0]
# Bare expression: only displays the frame in an interactive session or notebook.
df

输出

有许多选项可以玩，见在这里和在这里。

小开

如果你想从网页中自动提取文本段落，有一些可用的python包，如Trafilatura。作为基准测试的一部分，比较了几个python包:

https://github.com/adbar/trafilatura#evaluation-and-alternatives

html_text https://github.com/TeamHG-Memex/html-text
inscriptis https://github.com/weblyzard/inscriptis
newspaper3k
justext
boilerpy3 https://github.com/jmriebold/BoilerPy3
基线
goose3 https://github.com/goose3/goose3
readability-lxml https://github.com/predatell/python-readability-lxml
news-please https://github.com/fhamborg/news-please
readabilipy https://github.com/alan-turing-institute/ReadabiliPy
trafilatura