edit proxymiddleware

parent 8b156e18ed
commit 6834609725
@@ -9,7 +9,9 @@ from scrapy import signals
 from .myutils import ProxyPool
 from .settings import USERNAME, PASSWORD
 from faker import Faker
-from scrapy.core.downloader.handlers.http11 import TunnelError
+from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler, TunnelError
+from scrapy.utils.defer import mustbe_deferred
+from scrapy.exceptions import NotConfigured
 
 
 class ProxyMiddleware:
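Of the new imports, HTTP11DownloadHandler, mustbe_deferred, and NotConfigured are not exercised in the hunks shown here, so they presumably serve code elsewhere in the file. NotConfigured is conventionally raised from from_crawler to make Scrapy skip a component whose prerequisites are missing; a minimal sketch of that pattern, with the PROXY_ENABLED setting name as a pure assumption rather than anything this project defines:

    from scrapy.exceptions import NotConfigured

    class ProxyMiddlewareSketch:
        # Hypothetical guard showing the NotConfigured convention;
        # PROXY_ENABLED is an assumed setting, not one this project defines.
        @classmethod
        def from_crawler(cls, crawler):
            if not crawler.settings.getbool("PROXY_ENABLED", True):
                raise NotConfigured("proxy middleware disabled via settings")
            return cls()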
@@ -19,7 +21,6 @@ class ProxyMiddleware:
         self.proxy_failures = {proxy: 0 for proxy in self.proxy_pool.proxy_list}
         self.fake = Faker()
 
-
     def process_request(self, request, spider):
         # Pick a random proxy for each request
         proxy = self.proxy_pool.get_one()
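ProxyPool comes from the project-internal .myutils module and is not part of this diff; the middleware only relies on a proxy_list attribute and a get_one() method. A hypothetical stand-in with that surface, for readers without access to myutils:

    import random

    class ProxyPool:
        # Hypothetical stand-in for .myutils.ProxyPool; only the
        # interface the middleware touches is reproduced here.
        def __init__(self, proxy_list):
            self.proxy_list = list(proxy_list)  # e.g. ["203.0.113.7:3128", ...]

        def get_one(self):
            # A random live proxy, or None once the pool is exhausted
            return random.choice(self.proxy_list) if self.proxy_list else None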
@@ -30,25 +31,28 @@ class ProxyMiddleware:
         request.headers['User-Agent'] = ua
         spider.logger.info(f'Using proxy: {proxy}\nUsing UA: {ua}')
 
 
     def process_response(self, request, response, spider):
         # Return the response unchanged if it looks healthy
         if response.status in [200, 301, 302]:
             return response
-        # On a bad response, update the failure count
         else:
             self._handle_proxy_failure(request.meta['proxy'], spider)
-            # Reschedule the request
-            return request
+            # Reschedule the request, retrying with a different proxy
+            new_request = request.copy()
+            self._retry_with_new_proxy(new_request, spider)
+            return new_request
 
     def process_exception(self, request, exception, spider):
         # Handle requests that raised an exception
+        if isinstance(exception, TunnelError):
+            spider.logger.info(f"Proxy {request.meta['proxy']} failed due to TunnelError: {exception}")
+        else:
+            spider.logger.info(f"Proxy {request.meta['proxy']} failed due to exception: {exception}")
         self._handle_proxy_failure(request.meta['proxy'], spider)
-        spider.logger.info(f"Changing proxy to {request.meta['proxy']} due to exception: {exception}")
-        # Reschedule the request
-        return request
+        # Reschedule the request, retrying with a different proxy
+        new_request = request.copy()
+        self._retry_with_new_proxy(new_request, spider)
+        return new_request
 
 
     def _handle_proxy_failure(self, http_proxy, spider):
         # Increment the failure count for the given proxy
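A Request returned from process_response or process_exception is handed back to the engine and rescheduled, where the duplicate filter may drop a copy whose fingerprint matches the request that just failed. Retry-style middlewares therefore often flag the copy; a one-line variant of the copy step above that does this (an editor's suggestion, not part of this commit):

    # Variant of new_request = request.copy(): replace() returns a copy
    # with dont_filter=True so the dupefilter will not drop the retry.
    new_request = request.replace(dont_filter=True)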
@@ -63,6 +67,15 @@ class ProxyMiddleware:
             del self.proxy_failures[proxy]
             spider.logger.error(f'Removed proxy {proxy} after consecutive failures.')
 
+    def _retry_with_new_proxy(self, request, spider):
+        proxy = self.proxy_pool.get_one()
+        if proxy:
+            request.meta['proxy'] = "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": USERNAME, "pwd": PASSWORD, "proxy": proxy}
+            ua = self.fake.user_agent()
+            request.headers['User-Agent'] = ua
+            spider.logger.info(f'Retrying with new proxy: {proxy}\nUsing UA: {ua}')
+        else:
+            spider.logger.error("No proxies available, cannot retry request.")
 
     @classmethod
     def from_crawler(cls, crawler):
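The %-format string in _retry_with_new_proxy builds a credentialed proxy URL in user:password@host form; a worked example with placeholder values (the credentials and address are illustrative only):

    # Placeholder values, for illustration only
    USERNAME, PASSWORD, proxy = "user", "secret", "203.0.113.7:3128"
    url = "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": USERNAME, "pwd": PASSWORD, "proxy": proxy}
    print(url)  # http://user:secret@203.0.113.7:3128/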
@@ -71,6 +84,5 @@ class ProxyMiddleware:
         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
         return s
 
-
     def spider_opened(self, spider):
         spider.logger.info('Spider opened: %s' % spider.name)
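For the middleware to take effect, it has to be registered in the project's DOWNLOADER_MIDDLEWARES setting; a minimal sketch, assuming the class lives in a module named middlewares.py inside a package called myproject (the module path, priority, and credential values are assumptions, not taken from this commit):

    # settings.py -- hypothetical project layout
    USERNAME = "proxy_user"  # placeholder credentials read by the middleware
    PASSWORD = "proxy_pass"
    DOWNLOADER_MIDDLEWARES = {
        "myproject.middlewares.ProxyMiddleware": 543,  # 543: a common slot for custom middleware
    }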