import urllib2
import BeautifulSoup
# Fetch the page and parse it with BeautifulSoup 3 (Python 2 only: urllib2).
# NOTE(review): the URL literal is empty in this copy of the snippet; supply
# the address of the page to scan.
request = urllib2.Request("")
response = urllib2.urlopen(request)
soup = BeautifulSoup.BeautifulSoup(response)

# Report every anchor whose target mentions "national-park".
for a in soup.findAll('a'):
    # Anchors without an href attribute would raise KeyError with a['href'],
    # so fall back to an empty string for them.
    if 'national-park' in a.get('href', ''):
        print('found a url with national-park in the link')
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
# Download the page; http.request returns a (response headers, body) pair.
http = httplib2.Http()
status, response = http.request('')

# SoupStrainer('a') restricts parsing to <a> elements.
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        print(link['href'])
没有理由再使用 BeautifulSoup 了,除非你使用的是 Google App Engine,或者其他不允许使用非纯 Python 代码的环境。
lxml.html 还支持 CSS3 选择器,所以这类任务非常简单。
使用 lxml 和 XPath 的示例如下:
import urllib
import lxml.html
# Download the page (Python 2 urllib) and parse it into an lxml HTML tree.
connection = urllib.urlopen('')
try:
    dom = lxml.html.fromstring(connection.read())
finally:
    # Release the socket even if reading or parsing fails.
    connection.close()

# Select the url in href for all <a> tags (links).
for link in dom.xpath('//a/@href'):
    print(link)
import requests
import lxml.html
# Domain of the site being scraped; links that contain it count as internal.
# NOTE(review): both this literal and the request URL are empty in this copy
# of the snippet; fill them in before use.
site_domain = ''

dom = lxml.html.fromstring(requests.get('').content)

# Keep only links that contain '//' and do not point back at the site itself.
# The domain test is skipped while site_domain is empty, because
# `'' not in href` is False for every string and would discard every link.
external_links = [
    href
    for href in dom.xpath('//a/@href')
    if '//' in href and (not site_domain or site_domain not in href)
]
列表推导式中的条件 `'//' in x and '<本站域名>' not in x` 是一种简单的方法,用来从 URL 列表中剔除站点自身的内部导航链接等。
import urllib2
from bs4 import BeautifulSoup
# Download the raw HTML of the page (Python 2: urllib2).
html = urllib2.urlopen("").read()
soup = BeautifulSoup(html)

# Print the target of every anchor; .get() yields None when href is absent.
for line in soup.find_all('a'):
    print(line.get('href'))
from bs4 import BeautifulSoup
import urllib.request
parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib.request.urlopen("")

# Pass the charset parameter of the response headers on to BeautifulSoup;
# get_param returns None when the header carries no charset.
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))

# href=True skips anchors that have no href attribute.
for link in soup.find_all('a', href=True):
    print(link['href'])
或 Python 2版本:
from bs4 import BeautifulSoup
import urllib2
parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib2.urlopen("")

# Pass the charset parameter of the response headers on to BeautifulSoup
# (Python 2 spelling: getparam).
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().getparam('charset'))

# href=True skips anchors that have no href attribute.
for link in soup.find_all('a', href=True):
    print(link['href'])
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import requests
parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
resp = requests.get("")

# Use the HTTP-level encoding only when the Content-Type header explicitly
# names a charset; otherwise leave it as None.
http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
# An encoding declared inside the document itself takes precedence.
html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
encoding = html_encoding or http_encoding
soup = BeautifulSoup(resp.content, parser, from_encoding=encoding)

# href=True skips anchors that have no href attribute.
for link in soup.find_all('a', href=True):
    print(link['href'])
import urllib
import lxml.html
import urlparse
def get_dom(url):
    """Download *url* and return the result of ``lxml.html.fromstring`` on its body."""
    connection = urllib.urlopen(url)
    try:
        return lxml.html.fromstring(connection.read())
    finally:
        # Release the socket even if reading or parsing fails.
        connection.close()
def get_links(url):
    """Return the links of the page at *url* as produced by ``resolve_links``."""
    # Pass a list, not a one-shot generator: resolve_links walks the links to
    # guess the site root and then walks them again to resolve each one, so a
    # generator would already be partly consumed on the second pass.
    hrefs = list(get_dom(url).xpath('//a/@href'))
    return resolve_links(hrefs)
def guess_root(links):
    """Return ``scheme://netloc`` of the first link that starts with 'http'.

    Returns None when *links* contains no such link.
    """
    for link in links:
        if link.startswith('http'):
            parsed_link = urlparse.urlparse(link)
            return parsed_link.scheme + '://' + parsed_link.netloc
    return None
def resolve_links(links):
    """Yield every link in *links*, joining non-'http' ones onto the guessed root.

    Links are yielded unchanged when no root can be guessed.
    """
    # Materialise first: guess_root iterates the links as well, so a one-shot
    # iterator would otherwise be partly consumed before the loop below.
    links = list(links)
    root = guess_root(links)
    for link in links:
        # Without a root there is nothing to join onto; keep the link as is.
        if root and not link.startswith('http'):
            link = urlparse.urljoin(root, link)
        yield link
# Print every resolved link of the page.
# NOTE(review): the URL literal is empty in this copy of the snippet.
for link in get_links(''):
    print(link)
import urllib2
import re
# Connect to a URL.
# NOTE(review): `url` is not assigned anywhere in this snippet; it must be
# defined before this line runs.
website = urllib2.urlopen(url)

# Read the HTML code.
html = website.read()

# Use re.findall to get all the links. The scheme alternation is a
# non-capturing group, so findall returns the quoted URLs themselves rather
# than (url, scheme) tuples.
links = re.findall(r'"((?:http|ftp)s?://.*?)"', html)
print(links)
import requests
import wget
import os
from bs4 import BeautifulSoup, SoupStrainer
url = ''
file_type = '.tar.gz'
response = requests.get(url)

# Download every linked file whose href contains the wanted file type.
for link in BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        if file_type in link['href']:
            # Plain concatenation: only valid when the href is relative to url.
            full_path = url + link['href']
            wget.download(full_path)
# urljoin lives in different modules on Python 2 and Python 3.
try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2

# Download every linked file whose href contains the wanted file type.
for link in BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        if file_type in link['href']:
            # urljoin handles both relative and absolute href values.
            full_path = urljoin(url, link['href'])
            wget.download(full_path)
例如,同时匹配 src 和 href 属性(这里我使用“以……开头”的 ^= 运算符,要求这两个属性中任意一个的值以 http 开头):
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('')
soup = bs(r.content, 'lxml')

# Select every element whose href or src value starts with "http", then take
# href when the element has one and src otherwise.
links = [
    item['href'] if item.get('href') is not None else item['src']
    for item in soup.select('[href^="http"], [src^="http"]')
]
# Python 3.
import urllib
from bs4 import BeautifulSoup
# A bare `import urllib` does not load these submodules, so import them
# explicitly before use.
import urllib.parse
import urllib.request

url = ""
resp = urllib.request.urlopen(url)
# Get server encoding per recommendation of Martijn Pieters.
soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'))

external_links = set()
internal_links = set()
for line in soup.find_all('a'):
    link = line.get('href')
    if not link:
        # Anchor without a usable href: nothing to record.
        continue
    if link.startswith('http'):
        external_links.add(link)
    else:
        internal_links.add(link)

# Depending on usage, full internal links may be preferred.
full_internal_links = {
    urllib.parse.urljoin(url, internal_link)
    for internal_link in internal_links
}

# Print all unique external and full internal links.
for link in external_links.union(full_internal_links):
    print(link)