From 6834609725c7365bba1e4030af4cf0263e10f166 Mon Sep 17 00:00:00 2001
From: cooper
Date: Tue, 18 Jun 2024 23:07:53 +0800
Subject: [PATCH] edit proxymiddleware

---
 newsspider/middlewares.py | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/newsspider/middlewares.py b/newsspider/middlewares.py
index 2962913..1599f4d 100644
--- a/newsspider/middlewares.py
+++ b/newsspider/middlewares.py
@@ -9,7 +9,9 @@ from scrapy import signals
 from .myutils import ProxyPool
 from .settings import USERNAME, PASSWORD
 from faker import Faker
-from scrapy.core.downloader.handlers.http11 import TunnelError
+from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler, TunnelError
+from scrapy.utils.defer import mustbe_deferred
+from scrapy.exceptions import NotConfigured
 
 
 class ProxyMiddleware:
@@ -19,7 +21,6 @@ class ProxyMiddleware:
         self.proxy_failures = {proxy: 0 for proxy in self.proxy_pool.proxy_list}
         self.fake = Faker()
 
-
     def process_request(self, request, spider):
         # Pick a random proxy for each request
         proxy = self.proxy_pool.get_one()
@@ -30,25 +31,28 @@ class ProxyMiddleware:
             request.headers['User-Agent'] = ua
             spider.logger.info(f'Using proxy: {proxy}\nUsing UA: {ua}')
 
-
     def process_response(self, request, response, spider):
         # If the response is healthy, return it as-is
         if response.status in [200, 301, 302]:
             return response
-        # On an abnormal response, record the failure
         else:
             self._handle_proxy_failure(request.meta['proxy'], spider)
-            # Reschedule the request
-            return request
-
+            # Reschedule the request, retrying with a different proxy
+            new_request = request.copy()
+            self._retry_with_new_proxy(new_request, spider)
+            return new_request
 
     def process_exception(self, request, exception, spider):
         # Handle requests that raised an exception
+        if isinstance(exception, TunnelError):
+            spider.logger.info(f"Proxy {request.meta['proxy']} failed due to TunnelError: {exception}")
+        else:
+            spider.logger.info(f"Proxy {request.meta['proxy']} failed due to exception: {exception}")
         self._handle_proxy_failure(request.meta['proxy'], spider)
-        spider.logger.info(f"Changing proxy to {request.meta['proxy']} due to exception: {exception}")
-        # Reschedule the request
-        return request
-
+        # Reschedule the request, retrying with a different proxy
+        new_request = request.copy()
+        self._retry_with_new_proxy(new_request, spider)
+        return new_request
 
     def _handle_proxy_failure(self, http_proxy, spider):
         # Increment the failure count for the given proxy
@@ -63,6 +67,15 @@ class ProxyMiddleware:
                 del self.proxy_failures[proxy]
                 spider.logger.error(f'Removed proxy {proxy} after consecutive failures.')
 
+    def _retry_with_new_proxy(self, request, spider):
+        proxy = self.proxy_pool.get_one()
+        if proxy:
+            request.meta['proxy'] = "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": USERNAME, "pwd": PASSWORD, "proxy": proxy}
+            ua = self.fake.user_agent()
+            request.headers['User-Agent'] = ua
+            spider.logger.info(f'Retrying with new proxy: {proxy}\nUsing UA: {ua}')
+        else:
+            spider.logger.error("No proxies available, cannot retry request.")
 
     @classmethod
     def from_crawler(cls, crawler):
@@ -71,6 +84,5 @@ class ProxyMiddleware:
         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
         return s
 
-
     def spider_opened(self, spider):
-        spider.logger.info('Spider opened: %s' % spider.name)
\ No newline at end of file
+        spider.logger.info('Spider opened: %s' % spider.name)
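
Note for reviewers (not part of the patch): a minimal sketch of how this
middleware might be enabled in the project's settings.py so requests are
actually routed through it. The priority value 543 and the placeholder
credentials are assumptions for illustration, not taken from this change:

    # newsspider/settings.py (sketch; 543 is an assumed priority)
    DOWNLOADER_MIDDLEWARES = {
        "newsspider.middlewares.ProxyMiddleware": 543,
    }

    # Consumed by the middleware via `from .settings import USERNAME, PASSWORD`
    USERNAME = "proxy_user"  # placeholder, replace with real credentials
    PASSWORD = "proxy_pass"  # placeholder, replace with real credentials

One behavior worth double-checking: requests returned from process_response
or process_exception go back through the scheduler, so the retried copy can
be dropped by the duplicate filter; Scrapy's built-in RetryMiddleware sets
dont_filter = True on its retry copies for exactly this reason.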