html = open('21storm.html').read()
soup = BeautifulSoup(html)
[s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
visible_text = soup.getText()
<html>
<title> Title here</title>
<body>
lots of text here <p> <br>
<h1> even headings </h1>
<style type="text/css">
<div > this will not be visible </div>
</style>
</body>
</html>
上面提到的一个解决方案是:
html = Utilities.ReadFile('simple.html')
soup = BeautifulSoup.BeautifulSoup(html)
texts = soup.findAll(text=True)
visible_texts = filter(visible, texts)
print(visible_texts)
[u'\n', u'\n', u'\n\n lots of text here ', u' ', u'\n', u' even headings ', u'\n', u' this will not be visible ', u'\n', u'\n']
这个解决方案当然在许多情况下都有应用程序,并且在一般情况下都能很好地完成工作,但是在上面贴出的 html 中,它保留了未呈现的文本。经过搜索,有几个解决方案出现在这里和这里
import urllib
from bs4 import BeautifulSoup
url = "https://www.yahoo.com"
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
# kill all script and style elements
for script in soup(["script", "style"]):
script.extract() # rip it out
# get text
text = soup.get_text()
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
print(text.encode('utf-8'))
import re
INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
RE_SPACES = re.compile(r'\s{3,}')
def visible_texts(soup):
""" get visible text from a document """
text = ' '.join([
s for s in soup.strings
if s.parent.name not in INVISIBLE_ELEMS
])
# collapse multiple spaces to two spaces.
return RE_SPACES.sub(' ', text)
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import re
import ssl
def tag_visible(element):
if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
return False
if isinstance(element, Comment):
return False
if re.match(r"[\n]+",str(element)): return False
return True
def text_from_html(url):
body = urllib.request.urlopen(url,context=ssl._create_unverified_context()).read()
soup = BeautifulSoup(body ,"lxml")
texts = soup.findAll(text=True)
visible_texts = filter(tag_visible, texts)
text = u",".join(t.strip() for t in visible_texts)
text = text.lstrip().rstrip()
text = text.split(',')
clean_text = ''
for sen in text:
if sen:
sen = sen.rstrip().lstrip()
clean_text += sen+','
return clean_text
url = 'http://www.nytimes.com/2009/12/21/us/21storm.html'
print(text_from_html(url))