So, I'm trying to open websites in new tabs within my WebDriver. I want to do this because opening a fresh WebDriver for every website takes about 3.5 seconds with PhantomJS, and I want it to be faster...
I'm using a multiprocessing Python script, and I want to grab some elements from each page, so the workflow is like this:
Open Browser
Loop through my array
For element in array -> Open website in new tab -> do my business -> close it
But I can't find any way to achieve this.
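The closest I've found is opening a tab with JavaScript and switching between window handles, roughly like the sketch below (urls, selector and attribute are placeholders; older Selenium versions spell switch_to.window as switch_to_window, and I'm not sure PhantomJS handles multiple window handles the same way Firefox does):

from selenium import webdriver

def scrape_in_tabs(urls, selector, attribute):
    # One browser for the whole run; each URL gets its own tab
    browser = webdriver.Firefox()
    main = browser.current_window_handle
    results = []
    for url in urls:
        # Open a blank tab via JavaScript and switch to it
        # (the new handle is usually, but not guaranteed to be, last)
        browser.execute_script("window.open('about:blank', '_blank');")
        browser.switch_to.window(browser.window_handles[-1])
        browser.get(url)
        for el in browser.find_elements_by_css_selector(selector):
            results.append(el.get_attribute(attribute))
        # Close this tab and return to the original window
        browser.close()
        browser.switch_to.window(main)
    browser.quit()
    return results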
Here is the code I'm using; the full listing follows the sketch below. It takes a long time between websites, and I need it to be fast... Other tools are allowed, but I don't know of many that can scrape the content of websites rendered with JavaScript (divs created when some event is triggered on load, etc.), which is why I need Selenium... BeautifulSoup can't be used for some of my pages.
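One thing I do know how to handle is the JavaScript-built content itself: instead of reading elements immediately, I can block until the page's scripts have created them. A minimal sketch with WebDriverWait (the URL and '.my-selector' are placeholders):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Firefox()
browser.get('http://example.com')  # placeholder URL
# Wait up to 10 seconds until the page's JavaScript has created
# at least one element matching the selector
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.my-selector'))
)
links = browser.find_elements_by_css_selector('.my-selector')
browser.quit()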
#!/usr/bin/env python
import multiprocessing, time, json, pika
from selenium import webdriver
def getPhantomData(parameters):
    browser = None
    try:
        # Create the WebDriver
        browser = webdriver.Firefox()
        # Navigate to URL
        browser.get(parameters['target_url'])
        # Find all links matching the CSS selector
        links = browser.find_elements_by_css_selector(parameters['selector'])
        # Extract the requested attribute from each link
        result = [link.get_attribute(parameters['attribute']) for link in links]
        # Return a dict, not a JSON string, so the caller can index it
        return {'data': result}
    except Exception, err:
        print err
        return {'data': None}
    finally:
        # quit() closes all windows and ends the driver process,
        # so a separate close() first is unnecessary
        if browser is not None:
            browser.quit()
def callback(ch, method, properties, body):
    parameters = json.loads(body)
    message = getPhantomData(parameters)
    # getPhantomData returns a dict: ack on success,
    # requeue the message on failure or an empty result
    if message['data']:
        ch.basic_ack(delivery_tag=method.delivery_tag)
    else:
        ch.basic_reject(delivery_tag=method.delivery_tag, requeue=True)
def consume():
    credentials = pika.PlainCredentials('invitado', 'invitado')
    rabbit = pika.ConnectionParameters('localhost', 5672, '/', credentials)
    connection = pika.BlockingConnection(rabbit)
    channel = connection.channel()
    # Declare the queue and attach the consumer callback
    channel.queue_declare(queue='com.stuff.images', durable=True)
    channel.basic_consume(callback, queue='com.stuff.images')
    print ' [*] Waiting for messages. To exit press CTRL^C'
    try:
        channel.start_consuming()
    except KeyboardInterrupt:
        pass
workers = 5
pool = multiprocessing.Pool(processes=workers)
for i in xrange(0, workers):
    pool.apply_async(consume)
try:
    while True:
        # Sleep instead of spinning a CPU core at 100%
        time.sleep(1)
except KeyboardInterrupt:
    print ' [*] Exiting...'
    pool.terminate()
    pool.join()
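I suspect the real cost is that getPhantomData starts and quits a fresh browser for every single message. What I think I want is one long-lived browser per worker process, reused across messages; a sketch of that shape (same imports, credentials and queue name as above, with the callback inlined so it can capture the browser):

def consume_with_persistent_browser():
    # One browser per worker process, paid for once and reused
    browser = webdriver.Firefox()

    def callback(ch, method, properties, body):
        parameters = json.loads(body)
        try:
            browser.get(parameters['target_url'])
            links = browser.find_elements_by_css_selector(parameters['selector'])
            result = [l.get_attribute(parameters['attribute']) for l in links]
            # ... do my business with result here ...
            ch.basic_ack(delivery_tag=method.delivery_tag)
        except Exception, err:
            print err
            ch.basic_reject(delivery_tag=method.delivery_tag, requeue=True)

    credentials = pika.PlainCredentials('invitado', 'invitado')
    rabbit = pika.ConnectionParameters('localhost', 5672, '/', credentials)
    connection = pika.BlockingConnection(rabbit)
    channel = connection.channel()
    channel.queue_declare(queue='com.stuff.images', durable=True)
    channel.basic_consume(callback, queue='com.stuff.images')
    try:
        channel.start_consuming()
    finally:
        # Only shut the browser down when the worker itself exits
        browser.quit()

That way the ~3.5 s start-up would be paid once per worker instead of once per site. Is this approach sound?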