add urls to bjxpv

parent dc35f3f990
commit 63df475acd
@@ -1,11 +1,59 @@
 import scrapy
+import mysql.connector
+from mysql.connector import errorcode
 from ..items import NewsItem
+from ..settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE
 
 
 class BjxpvSpider(scrapy.Spider):
     name = "bjxpv"
     allowed_domains = ["guangfu.bjx.com.cn"]
-    start_urls = ["https://guangfu.bjx.com.cn/yw/"]
+
+    def __init__(self, *args, **kwargs):
+        super(BjxpvSpider, self).__init__(*args, **kwargs)
+        self.crawled_urls = set()
+
+
+    def start_requests(self):
+        # Connect to the database
+        self.conn = mysql.connector.connect(user=MYSQL_USERNAME, password=MYSQL_PASSWORD, host=MYSQL_HOST, database=MYSQL_DATABASE, port=MYSQL_PORT)
+        self.cursor = self.conn.cursor()
+
+        # Dynamically generate the table name
+        dataset_name = f'{self.settings.get("BOT_NAME")}_{self.name}'
+
+        # Fetch the URLs already crawled from the database
+        try:
+            self.cursor.execute(f"SELECT url FROM `{dataset_name}`")
+            self.crawled_urls = {row[0] for row in self.cursor.fetchall()}
+        except mysql.connector.Error as err:
+            if err.errno == errorcode.ER_NO_SUCH_TABLE:
+                self.log(f"Table `{dataset_name}` does not exist. Initializing crawled URLs as an empty set.")
+                self.crawled_urls = set()
+            else:
+                self.log(f"Error fetching URLs from `{dataset_name}`: {err}")
+                self.crawled_urls = set()
+
+        # Disconnect from the database
+        self.conn.close()
+
+        # Start the requests
+        start_urls = [
+            'https://guangfu.bjx.com.cn/yw/',
+            'https://guangfu.bjx.com.cn/zc/',
+            'https://guangfu.bjx.com.cn/sc/',
+            'https://guangfu.bjx.com.cn/mq/',
+            'https://guangfu.bjx.com.cn/dj/',
+            'https://guangfu.bjx.com.cn/xm/',
+            'https://guangfu.bjx.com.cn/zb/',
+            'https://guangfu.bjx.com.cn/cj/',
+            'https://guangfu.bjx.com.cn/gj/',
+            'https://guangfu.bjx.com.cn/sj/',
+            'https://guangfu.bjx.com.cn/js/',
+        ]
+        for url in start_urls:
+            yield scrapy.Request(url, self.parse)
+
 
     def parse(self, response):
         news_list = response.css('.cc-list-content a')
@@ -18,6 +66,7 @@ class BjxpvSpider(scrapy.Spider):
             url = next_page.attrib['href']
             yield response.follow(url, self.parse)
+
 
     def news_parse(self, response):
         news_item = NewsItem()
         news_item['website'] = '北极星太阳能光伏网'
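
Note: the new import from ..settings implies that the MySQL connection constants are defined in the project's settings.py. A minimal sketch of what those definitions might look like; the names come from the diff, but the values here are placeholders, not from the source:

# settings.py -- assumed MySQL connection constants (all values are placeholders)
MYSQL_USERNAME = "scrapy_user"   # placeholder
MYSQL_PASSWORD = "change-me"     # placeholder
MYSQL_HOST = "127.0.0.1"         # placeholder
MYSQL_PORT = 3306                # placeholder
MYSQL_DATABASE = "news"          # placeholder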
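Note: the commit populates self.crawled_urls from the database, but the hunks shown never consult it. A plausible sketch of how parse might use the set to skip already-stored articles; the loop body, the response.urljoin call, and the duplicate check are illustrative assumptions, not taken from the commit:

def parse(self, response):
    news_list = response.css('.cc-list-content a')   # selector from the diff
    for link in news_list:
        # Assumed: hrefs may be relative, so normalize to absolute URLs
        url = response.urljoin(link.attrib['href'])
        if url in self.crawled_urls:                  # skip URLs already saved in MySQL
            continue
        self.crawled_urls.add(url)                    # avoid re-requesting within this run
        yield response.follow(url, self.news_parse)

Loading the whole url column into a Python set trades memory for an O(1) membership check per link. With name = "bjxpv", the spider runs as usual via scrapy crawl bjxpv.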