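# Scrapy spider for the news pages on www.dongfang.com.
# Assuming a standard Scrapy project layout (this file lives in the project's
# spiders/ package, as the relative imports of ..items and ..settings require),
# it can be run with the usual Scrapy CLI:
#
#     scrapy crawl corpnews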
import scrapy
import mysql.connector
from mysql.connector import errorcode
from urllib.parse import urljoin

from ..items import NewsItem
from ..settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE
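
# NOTE: a sketch of what this spider assumes about the rest of the project.
# The real definitions live in ../items.py and ../settings.py; the field and
# constant names below are inferred from how they are used in this file, and
# the example values are hypothetical.
#
#     class NewsItem(scrapy.Item):
#         title = scrapy.Field()
#         collection = scrapy.Field()
#         url = scrapy.Field()
#         date = scrapy.Field()
#         source = scrapy.Field()
#         source_url = scrapy.Field()
#         content = scrapy.Field()
#         image_urls = scrapy.Field()
#
#     # settings.py (MySQL connection constants):
#     MYSQL_USERNAME = "user"
#     MYSQL_PASSWORD = "secret"
#     MYSQL_HOST = "localhost"
#     MYSQL_PORT = 3306
#     MYSQL_DATABASE = "news"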


class CorpnewsSpider(scrapy.Spider):
    """Crawl news articles from www.dongfang.com, skipping URLs already stored in MySQL."""

    name = "corpnews"
    allowed_domains = ["dongfang.com"]

    def __init__(self, *args, **kwargs):
        super(CorpnewsSpider, self).__init__(*args, **kwargs)
        # URLs already scraped; loaded from the database in start_requests()
        self.crawled_urls = set()

    def start_requests(self):
        # Connect to the database
        self.conn = mysql.connector.connect(
            user=MYSQL_USERNAME,
            password=MYSQL_PASSWORD,
            host=MYSQL_HOST,
            database=MYSQL_DATABASE,
            port=MYSQL_PORT,
        )
        self.cursor = self.conn.cursor()

        # Build the table name dynamically from the bot name and the spider name
        dataset_name = f'{self.settings.get("BOT_NAME")}_{self.name}'
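
        # Assumption: the table has (at least) a `url` column listing the article URLs
        # that were already scraped, and is presumably created and populated elsewhere
        # (e.g. by an item pipeline); only the SELECT below relies on it here. With a
        # hypothetical BOT_NAME of "news", the table name would be `news_corpnews`.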
        # Fetch the URLs that have already been crawled from the database
        try:
            self.cursor.execute(f"SELECT url FROM `{dataset_name}`")
            self.crawled_urls = {row[0] for row in self.cursor.fetchall()}
        except mysql.connector.Error as err:
            if err.errno == errorcode.ER_NO_SUCH_TABLE:
                self.log(f"Table `{dataset_name}` does not exist. Initializing crawled URLs as an empty set.")
                self.crawled_urls = set()
            else:
                self.log(f"Error fetching URLs from `{dataset_name}`: {err}")
                self.crawled_urls = set()

        # Close the database connection
        self.conn.close()

        # Start issuing requests
        start_urls = [
            "https://www.dongfang.com/xwzx/jtyw1/qb.htm",
            "https://www.dongfang.com/xwzx/jcdt.htm",
            "https://www.dongfang.com/xwzx/mtzs.htm",
        ]
        for url in start_urls:
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        # The featured article at the top of the listing page
        first_news = response.xpath('//div[@class="news_top"]/div[@class="news_img"]/a')
        self.log(f'crawled_urls: {self.crawled_urls}')
        if first_news:
            first_news_url = first_news.attrib['href']
            full_url = urljoin(response.url, first_news_url)
            if full_url not in self.crawled_urls:
                yield scrapy.Request(full_url, self.news_parse)

        # The remaining articles in the listing
        news_list = response.xpath('//div[contains(@class,"swiper-slide")]/dl/dd/a')
        for news in news_list:
            news_url = news.attrib['href']
            full_url = urljoin(response.url, news_url)
            if full_url not in self.crawled_urls:
                self.log(f'full_url: {full_url}')
                yield scrapy.Request(full_url, self.news_parse)

        # Follow the pagination link, if any
        next_page = response.xpath('//span[contains(@class, "p_next")]/a')
        if next_page:
            next_page_url = next_page.attrib['href']
            yield response.follow(next_page_url, self.parse)

    def news_parse(self, response):
        news_item = NewsItem()
        news_item['title'] = response.xpath('//div[@class="xq_nr_hd"]/h5/text()').get()
        news_item['collection'] = response.xpath('//div[@class="nysubsc"]/ul/li[@class="on"]/a/text()').get()
        news_item['url'] = response.url
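
        # The header <span> is assumed to contain text roughly like
        # "时间:2024-05-20  来源: <source name>" (a date label followed by an optional
        # plain-text source); the exact wording is inferred from the regexes below.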
        news_info = response.xpath('//div[@class="xq_nr_hd"]/span/text()')
        news_item['date'] = news_info.re(r'时间:(\d{4}-\d{2}-\d{2}) ')[0]

        # Prefer a plain-text source label; fall back to the linked source otherwise
        source_label = news_info.re(r'来源: (.*) ')
        if source_label:
            news_item['source'] = source_label[0]
            news_item['source_url'] = ''
        else:
            news_item['source'] = response.xpath('//div[@class="xq_nr_hd"]/span/a/text()').get()
            news_item['source_url'] = response.xpath('//div[@class="xq_nr_hd"]/span/a').attrib['href']

        # Article body: plain-text paragraphs joined with newlines
        news_text_list = response.xpath('//div[@class="v_news_content"]/p/text()')
        news_item['content'] = '\n'.join([t.get() for t in news_text_list])

        # Image URLs embedded in the article body, joined into one string
        news_image_urls = response.xpath('//div[@class="v_news_content"]/p/img')
        news_item['image_urls'] = ';\n'.join([i.attrib['src'] for i in news_image_urls])

        yield news_item