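# Scrapy spider for the news pages on www.dongfang.com.
# Assuming a standard Scrapy project layout (this file lives in the project's
# spiders/ package, as the relative imports of ..items and ..settings require),
# it can be run with the usual Scrapy CLI:
#
#     scrapy crawl corpnews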
import scrapy
import mysql.connector
from mysql.connector import errorcode
from urllib.parse import urljoin

from ..items import NewsItem
from ..settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE
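
# NOTE: a sketch of what this spider assumes about the rest of the project.
# The real definitions live in ../items.py and ../settings.py; the field and
# constant names below are inferred from how they are used in this file, and
# the example values are hypothetical.
#
#     class NewsItem(scrapy.Item):
#         title = scrapy.Field()
#         collection = scrapy.Field()
#         url = scrapy.Field()
#         date = scrapy.Field()
#         source = scrapy.Field()
#         source_url = scrapy.Field()
#         content = scrapy.Field()
#         image_urls = scrapy.Field()
#
#     # settings.py (MySQL connection constants):
#     MYSQL_USERNAME = "user"
#     MYSQL_PASSWORD = "secret"
#     MYSQL_HOST = "localhost"
#     MYSQL_PORT = 3306
#     MYSQL_DATABASE = "news"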


class CorpnewsSpider(scrapy.Spider):
    """Crawl news articles from www.dongfang.com, skipping URLs already stored in MySQL."""

    name = "corpnews"
    allowed_domains = ["dongfang.com"]

    def __init__(self, *args, **kwargs):
        super(CorpnewsSpider, self).__init__(*args, **kwargs)
        # URLs already scraped; loaded from the database in start_requests()
        self.crawled_urls = set()

    def start_requests(self):
        # Connect to the database
        self.conn = mysql.connector.connect(
            user=MYSQL_USERNAME,
            password=MYSQL_PASSWORD,
            host=MYSQL_HOST,
            database=MYSQL_DATABASE,
            port=MYSQL_PORT,
        )
        self.cursor = self.conn.cursor()

        # Build the table name dynamically from the bot name and the spider name
        dataset_name = f'{self.settings.get("BOT_NAME")}_{self.name}'
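
        # Assumption: the table has (at least) a `url` column listing the article URLs
        # that were already scraped, and is presumably created and populated elsewhere
        # (e.g. by an item pipeline); only the SELECT below relies on it here. With a
        # hypothetical BOT_NAME of "news", the table name would be `news_corpnews`.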
        # Fetch the URLs that have already been crawled from the database
        try:
            self.cursor.execute(f"SELECT url FROM `{dataset_name}`")
            self.crawled_urls = {row[0] for row in self.cursor.fetchall()}
        except mysql.connector.Error as err:
            if err.errno == errorcode.ER_NO_SUCH_TABLE:
                self.log(f"Table `{dataset_name}` does not exist. Initializing crawled URLs as an empty set.")
                self.crawled_urls = set()
            else:
                self.log(f"Error fetching URLs from `{dataset_name}`: {err}")
                self.crawled_urls = set()

        # Close the database connection
        self.conn.close()

        # Start issuing requests
        start_urls = [
            "https://www.dongfang.com/xwzx/jtyw1/qb.htm",
            "https://www.dongfang.com/xwzx/jcdt.htm",
            "https://www.dongfang.com/xwzx/mtzs.htm",
        ]
        for url in start_urls:
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        # The featured article at the top of the listing page
        first_news = response.xpath('//div[@class="news_top"]/div[@class="news_img"]/a')
        self.log(f'crawled_urls: {self.crawled_urls}')
        if first_news:
            first_news_url = first_news.attrib['href']
            full_url = urljoin(response.url, first_news_url)
            if full_url not in self.crawled_urls:
                yield scrapy.Request(full_url, self.news_parse)

        # The remaining articles in the listing
        news_list = response.xpath('//div[contains(@class,"swiper-slide")]/dl/dd/a')
        for news in news_list:
            news_url = news.attrib['href']
            full_url = urljoin(response.url, news_url)
            if full_url not in self.crawled_urls:
                self.log(f'full_url: {full_url}')
                yield scrapy.Request(full_url, self.news_parse)

        # Follow the pagination link, if any
        next_page = response.xpath('//span[contains(@class, "p_next")]/a')
        if next_page:
            next_page_url = next_page.attrib['href']
            yield response.follow(next_page_url, self.parse)

    def news_parse(self, response):
        news_item = NewsItem()
        news_item['title'] = response.xpath('//div[@class="xq_nr_hd"]/h5/text()').get()
        news_item['collection'] = response.xpath('//div[@class="nysubsc"]/ul/li[@class="on"]/a/text()').get()
        news_item['url'] = response.url
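
        # The header <span> is assumed to contain text roughly like
        # "时间:2024-05-20  来源: <source name>" (a date label followed by an optional
        # plain-text source); the exact wording is inferred from the regexes below.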
        news_info = response.xpath('//div[@class="xq_nr_hd"]/span/text()')
        news_item['date'] = news_info.re(r'时间:(\d{4}-\d{2}-\d{2}) ')[0]

        # Prefer a plain-text source label; fall back to the linked source otherwise
        source_label = news_info.re(r'来源: (.*) ')
        if source_label:
            news_item['source'] = source_label[0]
            news_item['source_url'] = ''
        else:
            news_item['source'] = response.xpath('//div[@class="xq_nr_hd"]/span/a/text()').get()
            news_item['source_url'] = response.xpath('//div[@class="xq_nr_hd"]/span/a').attrib['href']

        # Article body: plain-text paragraphs joined with newlines
        news_text_list = response.xpath('//div[@class="v_news_content"]/p/text()')
        news_item['content'] = '\n'.join([t.get() for t in news_text_list])

        # Image URLs embedded in the article body, joined into one string
        news_image_urls = response.xpath('//div[@class="v_news_content"]/p/img')
        news_item['image_urls'] = ';\n'.join([i.attrib['src'] for i in news_image_urls])

        yield news_item