diff --git a/newsspider/middlewares.py b/newsspider/middlewares.py
index 3cf598f..f13b799 100644
--- a/newsspider/middlewares.py
+++ b/newsspider/middlewares.py
@@ -6,106 +6,11 @@
 from scrapy import signals

 # useful for handling different item types with a single interface
-from itemadapter import is_item, ItemAdapter

 from .myutils import ProxyPool
 from .settings import USERNAME, PASSWORD
 from faker import Faker
-
-
-class NewsspiderSpiderMiddleware:
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the spider middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_spider_input(self, response, spider):
-        # Called for each response that goes through the spider
-        # middleware and into the spider.
-
-        # Should return None or raise an exception.
-        return None
-
-    def process_spider_output(self, response, result, spider):
-        # Called with the results returned from the Spider, after
-        # it has processed the response.
-
-        # Must return an iterable of Request, or item objects.
-        for i in result:
-            yield i
-
-    def process_spider_exception(self, response, exception, spider):
-        # Called when a spider or process_spider_input() method
-        # (from other spider middleware) raises an exception.
-
-        # Should return either None or an iterable of Request or item objects.
-        pass
-
-    def process_start_requests(self, start_requests, spider):
-        # Called with the start requests of the spider, and works
-        # similarly to the process_spider_output() method, except
-        # that it doesn't have a response associated.
-
-        # Must return only requests (not items).
-        for r in start_requests:
-            yield r
-
-    def spider_opened(self, spider):
-        spider.logger.info("Spider opened: %s" % spider.name)
-
-
-class DecspiderDownloaderMiddleware:
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the downloader middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_request(self, request, spider):
-        # Called for each request that goes through the downloader
-        # middleware.
-
-        # Must either:
-        # - return None: continue processing this request
-        # - or return a Response object
-        # - or return a Request object
-        # - or raise IgnoreRequest: process_exception() methods of
-        #   installed downloader middleware will be called
-        return None
-
-    def process_response(self, request, response, spider):
-        # Called with the response returned from the downloader.
-
-        # Must either;
-        # - return a Response object
-        # - return a Request object
-        # - or raise IgnoreRequest
-        return response
-
-    def process_exception(self, request, exception, spider):
-        # Called when a download handler or a process_request()
-        # (from other downloader middleware) raises an exception.
-
-        # Must either:
-        # - return None: continue processing this exception
-        # - return a Response object: stops process_exception() chain
-        # - return a Request object: stops process_exception() chain
-        pass
-
-    def spider_opened(self, spider):
-        spider.logger.info("Spider opened: %s" % spider.name)
-

 class ProxyMiddleware:
     def __init__(self):
         # Initialize the proxy list and a per-proxy failure counter
diff --git a/newsspider/settings.py b/newsspider/settings.py
index 7055880..1e2de7e 100644
--- a/newsspider/settings.py
+++ b/newsspider/settings.py
@@ -105,8 +105,8 @@ PROXYPOOL_MIN_DURATION = 1
 # MySQL Configuration
 MYSQL_USERNAME = "root"
 MYSQL_PASSWORD = "yGWptA_tX4bZ2q"
-MYSQL_HOST = "10.18.30.148"
-MYSQL_PORT = 3307
+MYSQL_HOST = "chenbingyuan.com"
+MYSQL_PORT = 7033
 MYSQL_DATABASE = "crawler_data"
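
Note: the middlewares.py hunk keeps only the class header, __init__, and its first comment of ProxyMiddleware, so the retained imports (ProxyPool, USERNAME/PASSWORD, Faker) are the main clue to its behavior. The following is a minimal sketch of how a proxy-rotating downloader middleware built on those imports could look; the ProxyPool method names (get/remove), the failure threshold, the Proxy-Authorization scheme for USERNAME/PASSWORD, and the Faker-based User-Agent rotation are all assumptions for illustration, not the project's confirmed implementation.

import base64

from faker import Faker

from .myutils import ProxyPool
from .settings import USERNAME, PASSWORD


class ProxyMiddleware:
    MAX_FAILURES = 3  # hypothetical threshold before a proxy is retired

    def __init__(self):
        # Initialize the proxy list and a per-proxy failure counter
        self.pool = ProxyPool()
        self.failures = {}
        self.faker = Faker()

    def process_request(self, request, spider):
        # Attach a proxy and basic-auth credentials to each outgoing request
        proxy = self.pool.get()  # assumed ProxyPool method
        request.meta["proxy"] = f"http://{proxy}"
        auth = base64.b64encode(f"{USERNAME}:{PASSWORD}".encode()).decode()
        request.headers["Proxy-Authorization"] = f"Basic {auth}"
        # Rotate the User-Agent with Faker to reduce fingerprinting
        request.headers["User-Agent"] = self.faker.user_agent()

    def process_exception(self, request, exception, spider):
        # Count failures per proxy and retire proxies that keep failing
        proxy = request.meta.get("proxy")
        if proxy:
            self.failures[proxy] = self.failures.get(proxy, 0) + 1
            if self.failures[proxy] >= self.MAX_FAILURES:
                self.pool.remove(proxy)  # assumed ProxyPool method
        # Returning None lets Scrapy's retry middleware handle the request
        return None

If the middleware follows this shape, it would be enabled in settings.py via the standard DOWNLOADER_MIDDLEWARES mapping, e.g. {"newsspider.middlewares.ProxyMiddleware": 543}.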