diff --git a/newsspider/spiders/bjxpv.py b/newsspider/spiders/bjxpv.py
index c839399..8c37aad 100644
--- a/newsspider/spiders/bjxpv.py
+++ b/newsspider/spiders/bjxpv.py
@@ -1,11 +1,59 @@
 import scrapy
+import mysql.connector
+from mysql.connector import errorcode
 from ..items import NewsItem
-
+from ..settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE
 
 class BjxpvSpider(scrapy.Spider):
     name = "bjxpv"
     allowed_domains = ["guangfu.bjx.com.cn"]
-    start_urls = ["https://guangfu.bjx.com.cn/yw/"]
+
+
+    def __init__(self, *args, **kwargs):
+        super(BjxpvSpider, self).__init__(*args, **kwargs)
+        self.crawled_urls = set()
+
+
+    def start_requests(self):
+        # Connect to the database
+        self.conn = mysql.connector.connect(user=MYSQL_USERNAME, password=MYSQL_PASSWORD, host=MYSQL_HOST, database=MYSQL_DATABASE, port=MYSQL_PORT)
+        self.cursor = self.conn.cursor()
+
+        # Build the table name dynamically from the bot and spider names
+        dataset_name = f'{self.settings.get("BOT_NAME")}_{self.name}'
+
+        # Fetch the URLs already crawled from the database
+        try:
+            self.cursor.execute(f"SELECT url FROM `{dataset_name}`")
+            self.crawled_urls = {row[0] for row in self.cursor.fetchall()}
+        except mysql.connector.Error as err:
+            if err.errno == errorcode.ER_NO_SUCH_TABLE:
+                self.log(f"Table `{dataset_name}` does not exist. Initializing crawled URLs as an empty set.")
+                self.crawled_urls = set()
+            else:
+                self.log(f"Error fetching URLs from `{dataset_name}`: {err}")
+                self.crawled_urls = set()
+
+        # Close the database connection
+        self.conn.close()
+
+        # Start the requests
+        start_urls = [
+            'https://guangfu.bjx.com.cn/yw/',
+            'https://guangfu.bjx.com.cn/zc/',
+            'https://guangfu.bjx.com.cn/sc/',
+            'https://guangfu.bjx.com.cn/mq/',
+            'https://guangfu.bjx.com.cn/dj/',
+            'https://guangfu.bjx.com.cn/xm/',
+            'https://guangfu.bjx.com.cn/zb/',
+            'https://guangfu.bjx.com.cn/cj/',
+            'https://guangfu.bjx.com.cn/gj/',
+            'https://guangfu.bjx.com.cn/sj/',
+            'https://guangfu.bjx.com.cn/js/',
+        ]
+        for url in start_urls:
+            yield scrapy.Request(url, self.parse)
+
 
     def parse(self, response):
         news_list = response.css('.cc-list-content a')
@@ -18,6 +66,7 @@ class BjxpvSpider(scrapy.Spider):
             url = next_page.attrib['href']
             yield response.follow(url, self.parse)
 
+
     def news_parse(self, response):
         news_item = NewsItem()
         news_item['website'] = '北极星太阳能光伏网'
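
Note: the hunks above populate `self.crawled_urls` from MySQL but never show it being consulted; the dedup check presumably lives in code outside these hunks. A minimal sketch of how `parse` could use it, assuming the same `.cc-list-content a` selector and `news_parse` callback seen in the context lines (an illustration, not the author's confirmed implementation):

    def parse(self, response):
        news_list = response.css('.cc-list-content a')
        for news in news_list:
            # Resolve the relative link, then skip URLs already stored in MySQL
            url = response.urljoin(news.attrib['href'])
            if url in self.crawled_urls:
                continue
            yield scrapy.Request(url, self.news_parse)

Loading the seen-URL set once in `start_requests` and closing the connection immediately keeps the crawl itself free of per-request database round-trips; the trade-off is that URLs inserted by other processes during the run are not seen.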