edit proxymiddleware

cooper 2024-06-18 23:07:53 +08:00
parent 8b156e18ed
commit 6834609725


@@ -9,7 +9,9 @@ from scrapy import signals
 from .myutils import ProxyPool
 from .settings import USERNAME, PASSWORD
 from faker import Faker
-from scrapy.core.downloader.handlers.http11 import TunnelError
+from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler, TunnelError
+from scrapy.utils.defer import mustbe_deferred
+from scrapy.exceptions import NotConfigured
 class ProxyMiddleware:
@@ -19,7 +21,6 @@ class ProxyMiddleware:
         self.proxy_failures = {proxy: 0 for proxy in self.proxy_pool.proxy_list}
         self.fake = Faker()
     def process_request(self, request, spider):
         # Pick a random proxy for each request
         proxy = self.proxy_pool.get_one()
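Note: the middleware assumes a ProxyPool helper in .myutils that exposes at least a proxy_list attribute and a get_one() method; that implementation is not part of this diff. A minimal sketch of the assumed interface, names hypothetical:

import random

class ProxyPool:
    # Hypothetical sketch of the interface ProxyMiddleware relies on;
    # the real .myutils implementation is not shown in this commit.
    def __init__(self, proxy_list):
        self.proxy_list = list(proxy_list)  # e.g. ["1.2.3.4:8080", "5.6.7.8:3128"]

    def get_one(self):
        # Return a random proxy, or None once the pool is exhausted.
        return random.choice(self.proxy_list) if self.proxy_list else None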
@@ -30,25 +31,28 @@ class ProxyMiddleware:
         request.headers['User-Agent'] = ua
         spider.logger.info(f'Using proxy: {proxy}\nUsing UA: {ua}')
     def process_response(self, request, response, spider):
         # If the response is normal, return it
         if response.status in [200, 301, 302]:
             return response
         # On a bad response, update the failure count
         else:
             self._handle_proxy_failure(request.meta['proxy'], spider)
-            # Reschedule the request
-            return request
+            # Reschedule the request, retrying with a new proxy
+            new_request = request.copy()
+            self._retry_with_new_proxy(new_request, spider)
+            return new_request
     def process_exception(self, request, exception, spider):
-        # Handle requests that raised an exception
+        if isinstance(exception, TunnelError):
+            spider.logger.info(f"Proxy {request.meta['proxy']} failed due to TunnelError: {exception}")
+        else:
+            spider.logger.info(f"Proxy {request.meta['proxy']} failed due to exception: {exception}")
         self._handle_proxy_failure(request.meta['proxy'], spider)
-        spider.logger.info(f"Changing proxy to {request.meta['proxy']} due to exception: {exception}")
-        # Reschedule the request
-        return request
+        # Reschedule the request, retrying with a new proxy
+        new_request = request.copy()
+        self._retry_with_new_proxy(new_request, spider)
+        return new_request
     def _handle_proxy_failure(self, http_proxy, spider):
         # Increment the failure count for the given proxy
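One caveat with the retry pattern above: a request returned from process_response or process_exception is rescheduled through the scheduler, where Scrapy's duplicate filter can drop the copy because it shares the original request's fingerprint. A hedged sketch of one way to avoid that, using the standard Request.replace API instead of request.copy():

# Sketch only: carry dont_filter=True on the retry copy so the
# dupefilter does not discard it; replace() returns a copy of the
# request with the given attributes overridden.
new_request = request.replace(dont_filter=True)
self._retry_with_new_proxy(new_request, spider)
return new_request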
@@ -63,6 +67,15 @@ class ProxyMiddleware:
             del self.proxy_failures[proxy]
             spider.logger.error(f'Removed proxy {proxy} after consecutive failures.')
+    def _retry_with_new_proxy(self, request, spider):
+        proxy = self.proxy_pool.get_one()
+        if proxy:
+            request.meta['proxy'] = "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": USERNAME, "pwd": PASSWORD, "proxy": proxy}
+            ua = self.fake.user_agent()
+            request.headers['User-Agent'] = ua
+            spider.logger.info(f'Retrying with new proxy: {proxy}\nUsing UA: {ua}')
+        else:
+            spider.logger.error("No proxies available, cannot retry request.")
     @classmethod
     def from_crawler(cls, crawler):
@@ -71,6 +84,5 @@ class ProxyMiddleware:
         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
         return s
     def spider_opened(self, spider):
         spider.logger.info('Spider opened: %s' % spider.name)
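For the middleware to take effect it must be registered in the project's Scrapy settings. A minimal sketch, assuming the class lives in a module named middlewares inside a package named myproject (the actual module path is not shown in this commit):

# settings.py -- module path and credential values are placeholders.
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.ProxyMiddleware": 543,
}
USERNAME = "proxy_user"  # read by the middleware via `from .settings import USERNAME, PASSWORD`
PASSWORD = "proxy_pass"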