add urls to bjxpv

2024-05-20 14:44:50 +08:00 · 2024-05-20 14:44:50 +08:00 · 63df475acd
commit 63df475acd
parent dc35f3f990
1 changed files with 51 additions and 2 deletions
--- a/newsspider/spiders/bjxpv.py
+++ b/newsspider/spiders/bjxpv.py
@ -1,11 +1,59 @@
 import scrapy
+import mysql.connector
+from mysql.connector import errorcode
 from ..items import NewsItem
-
+from ..settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE

 class BjxpvSpider(scrapy.Spider):
    name = "bjxpv"
    allowed_domains = ["guangfu.bjx.com.cn"]
-    start_urls = ["https://guangfu.bjx.com.cn/yw/"]
+
+
+    def __init__(self, *args, **kwargs):
+        """Initialise the spider with an empty set of already-crawled URLs.
+
+        All positional and keyword arguments are forwarded unchanged to the
+        parent class constructor.
+        """
+        super().__init__(*args, **kwargs)
+        # Starts empty here and is reassigned in start_requests().
+        self.crawled_urls = set()
+
+
+    def start_requests(self):
+        """Load already-crawled URLs from MySQL, then request each section page.
+
+        Every ``url`` value of the table named ``<BOT_NAME>_<spider name>`` is
+        read into ``self.crawled_urls``. If the query fails with a
+        ``mysql.connector.Error`` (including the table not existing yet), the
+        failure is logged and ``self.crawled_urls`` stays an empty set.
+        Errors raised while opening the connection are not caught here.
+
+        Yields:
+            scrapy.Request: one request per section listing page, with
+            ``self.parse`` as the callback.
+        """
+        # Build the table name from the project name and the spider name.
+        dataset_name = f'{self.settings.get("BOT_NAME")}_{self.name}'
+
+        # Fallback used whenever the lookup below does not succeed.
+        self.crawled_urls = set()
+
+        # Connect to the database.
+        self.conn = mysql.connector.connect(
+            user=MYSQL_USERNAME,
+            password=MYSQL_PASSWORD,
+            host=MYSQL_HOST,
+            database=MYSQL_DATABASE,
+            port=MYSQL_PORT,
+        )
+        try:
+            self.cursor = self.conn.cursor()
+            try:
+                # Fetch the URLs that are already stored. A table name cannot
+                # be passed as a bound query parameter, so it is interpolated;
+                # it is built from settings and the spider name only.
+                self.cursor.execute(f"SELECT url FROM `{dataset_name}`")
+                self.crawled_urls = {row[0] for row in self.cursor.fetchall()}
+            except mysql.connector.Error as err:
+                if err.errno == errorcode.ER_NO_SUCH_TABLE:
+                    self.logger.info(f"Table `{dataset_name}` does not exist. Initializing crawled URLs as an empty set.")
+                else:
+                    # Logged at error level so it is visible with default log settings.
+                    self.logger.error(f"Error fetching URLs from `{dataset_name}`: {err}")
+        finally:
+            # Close the connection even if an unexpected exception escapes.
+            self.conn.close()
+
+        # Section listing pages to start crawling from.
+        start_urls = [
+            'https://guangfu.bjx.com.cn/yw/',
+            'https://guangfu.bjx.com.cn/zc/',
+            'https://guangfu.bjx.com.cn/sc/',
+            'https://guangfu.bjx.com.cn/mq/',
+            'https://guangfu.bjx.com.cn/dj/',
+            'https://guangfu.bjx.com.cn/xm/',
+            'https://guangfu.bjx.com.cn/zb/',
+            'https://guangfu.bjx.com.cn/cj/',
+            'https://guangfu.bjx.com.cn/gj/',
+            'https://guangfu.bjx.com.cn/sj/',
+            'https://guangfu.bjx.com.cn/js/',
+        ]
+        for url in start_urls:
+            yield scrapy.Request(url, self.parse)
+

    def parse(self, response):
        news_list = response.css('.cc-list-content a')
@ -18,6 +66,7 @@ class BjxpvSpider(scrapy.Spider):
            url = next_page.attrib['href']
            yield response.follow(url, self.parse)

+
    def news_parse(self, response):
        news_item = NewsItem()
        news_item['website'] = '北极星太阳能光伏网'