edit proxymiddleware

This commit is contained in:
cooper 2024-06-18 23:07:53 +08:00
parent 8b156e18ed
commit 6834609725


@@ -9,7 +9,9 @@ from scrapy import signals
 from .myutils import ProxyPool
 from .settings import USERNAME, PASSWORD
 from faker import Faker
-from scrapy.core.downloader.handlers.http11 import TunnelError
+from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler, TunnelError
+from scrapy.utils.defer import mustbe_deferred
+from scrapy.exceptions import NotConfigured


 class ProxyMiddleware:
@@ -19,7 +21,6 @@ class ProxyMiddleware:
         self.proxy_failures = {proxy: 0 for proxy in self.proxy_pool.proxy_list}
         self.fake = Faker()
-
     def process_request(self, request, spider):
         # Pick a random proxy for each request
         proxy = self.proxy_pool.get_one()
@@ -30,25 +31,28 @@ class ProxyMiddleware:
         request.headers['User-Agent'] = ua
         spider.logger.info(f'Using proxy: {proxy}\nUsing UA: {ua}')

     def process_response(self, request, response, spider):
         # If the response looks fine, return it unchanged
         if response.status in [200, 301, 302]:
             return response
-        # On a bad response, update the failure count
         else:
             self._handle_proxy_failure(request.meta['proxy'], spider)
-            # Reschedule the request
-            return request
+            # Reschedule the request, retrying with a different proxy
+            new_request = request.copy()
+            self._retry_with_new_proxy(new_request, spider)
+            return new_request

     def process_exception(self, request, exception, spider):
         # Handle requests that raised an exception
+        if isinstance(exception, TunnelError):
+            spider.logger.info(f"Proxy {request.meta['proxy']} failed due to TunnelError: {exception}")
+        else:
+            spider.logger.info(f"Proxy {request.meta['proxy']} failed due to exception: {exception}")
         self._handle_proxy_failure(request.meta['proxy'], spider)
-        spider.logger.info(f"Changing proxy to {request.meta['proxy']} due to exception: {exception}")
-        # Reschedule the request
-        return request
+        # Reschedule the request, retrying with a different proxy
+        new_request = request.copy()
+        self._retry_with_new_proxy(new_request, spider)
+        return new_request

     def _handle_proxy_failure(self, http_proxy, spider):
         # Increment the failure count for the given proxy
@@ -63,6 +67,15 @@ class ProxyMiddleware:
             del self.proxy_failures[proxy]
             spider.logger.error(f'Removed proxy {proxy} after consecutive failures.')

+    def _retry_with_new_proxy(self, request, spider):
+        proxy = self.proxy_pool.get_one()
+        if proxy:
+            request.meta['proxy'] = "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": USERNAME, "pwd": PASSWORD, "proxy": proxy}
+            ua = self.fake.user_agent()
+            request.headers['User-Agent'] = ua
+            spider.logger.info(f'Retrying with new proxy: {proxy}\nUsing UA: {ua}')
+        else:
+            spider.logger.error("No proxies available, cannot retry request.")

     @classmethod
     def from_crawler(cls, crawler):
@@ -71,6 +84,5 @@ class ProxyMiddleware:
         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
         return s
-
     def spider_opened(self, spider):
         spider.logger.info('Spider opened: %s' % spider.name)
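
For reference, the middleware leans on a ProxyPool helper imported from .myutils, which is not part of this diff. From the calls above (get_one() and the proxy_list attribute), a minimal sketch could look like the following; the remove() method is an assumption about how _handle_proxy_failure might evict a dead proxy:

import random

class ProxyPool:
    def __init__(self, proxy_list):
        # proxy_list holds "host:port" strings
        self.proxy_list = list(proxy_list)

    def get_one(self):
        # Return a random proxy, or None once the pool is exhausted
        return random.choice(self.proxy_list) if self.proxy_list else None

    def remove(self, proxy):
        # Assumed helper: drop a proxy that has failed too often
        if proxy in self.proxy_list:
            self.proxy_list.remove(proxy)

One caveat with the retry path: a Request returned from process_response or process_exception goes back through the scheduler and its dupefilter, so the copied request may also need new_request.dont_filter = True, or the retried URL can be silently dropped as a duplicate.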
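
The middleware only takes effect once it is enabled in the project settings. A hypothetical wiring, where the module path, project name, and priority are illustrative assumptions rather than values taken from this repo:

# settings.py -- the names below are placeholders
USERNAME = 'proxy_user'   # credentials interpolated into the proxy URL
PASSWORD = 'proxy_pass'

DOWNLOADER_MIDDLEWARES = {
    'myproject.proxymiddleware.ProxyMiddleware': 543,
}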