edit proxymiddleware

commit 6834609725
parent 8b156e18ed
@@ -9,7 +9,9 @@ from scrapy import signals
 from .myutils import ProxyPool
 from .settings import USERNAME, PASSWORD
 from faker import Faker
-from scrapy.core.downloader.handlers.http11 import TunnelError
+from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler, TunnelError
+from scrapy.utils.defer import mustbe_deferred
+from scrapy.exceptions import NotConfigured
 
 
 class ProxyMiddleware:
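Note: the new NotConfigured import is not used in the lines shown in this diff. In Scrapy it is conventionally raised from from_crawler to disable a middleware when required settings are missing. A minimal sketch of that pattern follows; the setting name is an assumption for illustration, not this project's code:

```python
from scrapy.exceptions import NotConfigured

class SomeProxyMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # Hypothetical guard: disable the middleware up front when
        # credentials are absent, instead of failing at request time.
        if not crawler.settings.get('PROXY_USERNAME'):
            raise NotConfigured('proxy credentials missing')
        return cls()
```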
@@ -19,7 +21,6 @@ class ProxyMiddleware:
         self.proxy_failures = {proxy: 0 for proxy in self.proxy_pool.proxy_list}
         self.fake = Faker()
-
 
     def process_request(self, request, spider):
         # Pick a random proxy for each request
         proxy = self.proxy_pool.get_one()
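ProxyPool comes from .myutils, which this diff does not show. From its usage here (a proxy_list attribute and a get_one() that returns a proxy or a falsy value), a hypothetical stand-in would look like the sketch below; this is an assumption for reference, not the project's implementation:

```python
import random

class ProxyPool:
    def __init__(self, proxy_list=None):
        # Plain "host:port" strings, e.g. "1.2.3.4:8888"
        self.proxy_list = list(proxy_list or [])

    def get_one(self):
        # Random choice; None signals an exhausted pool
        return random.choice(self.proxy_list) if self.proxy_list else None
```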
@@ -30,25 +31,28 @@ class ProxyMiddleware:
         request.headers['User-Agent'] = ua
         spider.logger.info(f'Using proxy: {proxy}\nUsing UA: {ua}')
 
 
     def process_response(self, request, response, spider):
         # If the response is fine, return it as-is
         if response.status in [200, 301, 302]:
             return response
         # Otherwise, record the failure for the current proxy
         else:
             self._handle_proxy_failure(request.meta['proxy'], spider)
-            # Reschedule the request
-            return request
+
+            # Reschedule the request, retrying with a new proxy
+            new_request = request.copy()
+            self._retry_with_new_proxy(new_request, spider)
+            return new_request
 
     def process_exception(self, request, exception, spider):
         # Handle requests that raised an exception
         if isinstance(exception, TunnelError):
             spider.logger.info(f"Proxy {request.meta['proxy']} failed due to TunnelError: {exception}")
         else:
             spider.logger.info(f"Proxy {request.meta['proxy']} failed due to exception: {exception}")
         self._handle_proxy_failure(request.meta['proxy'], spider)
-        spider.logger.info(f"Changing proxy to {request.meta['proxy']} due to exception: {exception}")
-        # Reschedule the request
-        return request
+
+        # Reschedule the request, retrying with a new proxy
+        new_request = request.copy()
+        self._retry_with_new_proxy(new_request, spider)
+        return new_request
 
     def _handle_proxy_failure(self, http_proxy, spider):
         # Increment the failure count for the given proxy
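Returning a Request from process_response or process_exception sends it back through the scheduler. One caveat: a plain request.copy() keeps the same URL and fingerprint, so Scrapy's default duplicate filter can silently drop the retried copy. A minimal sketch of the usual workaround, using Request.replace(); whether this project needs it depends on its dupefilter settings:

```python
# Instead of: new_request = request.copy()
new_request = request.replace(dont_filter=True)  # retry bypasses the dupefilter
```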
@@ -63,6 +67,15 @@ class ProxyMiddleware:
             del self.proxy_failures[proxy]
             spider.logger.error(f'Removed proxy {proxy} after consecutive failures.')
 
+    def _retry_with_new_proxy(self, request, spider):
+        proxy = self.proxy_pool.get_one()
+        if proxy:
+            request.meta['proxy'] = "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": USERNAME, "pwd": PASSWORD, "proxy": proxy}
+            ua = self.fake.user_agent()
+            request.headers['User-Agent'] = ua
+            spider.logger.info(f'Retrying with new proxy: {proxy}\nUsing UA: {ua}')
+        else:
+            spider.logger.error("No proxies available, cannot retry request.")
 
     @classmethod
     def from_crawler(cls, crawler):
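The user:pass@host form written into request.meta['proxy'] works because Scrapy's built-in HttpProxyMiddleware strips the credentials from the proxy URL and turns them into a basic-auth Proxy-Authorization header. Doing the same explicitly, as a sketch (the helper name is made up for illustration):

```python
import base64

def set_authenticated_proxy(request, proxy, username, password):
    # Equivalent to embedding user:pass in the proxy URL
    request.meta['proxy'] = f'http://{proxy}'
    token = base64.b64encode(f'{username}:{password}'.encode()).decode()
    request.headers['Proxy-Authorization'] = f'Basic {token}'
```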
@@ -71,6 +84,5 @@ class ProxyMiddleware:
         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
         return s
 
-
     def spider_opened(self, spider):
         spider.logger.info('Spider opened: %s' % spider.name)
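For completeness: the middleware only runs if it is enabled in the project settings. The module path below is an assumption about this project's layout:

```python
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyMiddleware': 543,
}
```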