rename spider and create bjxspider

cooper 2024-05-20 14:07:39 +08:00
parent 94536a09c1
commit d1e3d99a76
10 changed files with 56 additions and 17 deletions

View File

@@ -18,3 +18,5 @@ class NewsItem(scrapy.Item):
     collection = scrapy.Field()
     url = scrapy.Field()
     source_url = scrapy.Field()
+    website = scrapy.Field()
+    keywords = scrapy.Field()

View File

@@ -12,7 +12,7 @@ from .settings import USERNAME, PASSWORD
 from faker import Faker
-class DecspiderSpiderMiddleware:
+class NewsspiderSpiderMiddleware:
     # Not all methods need to be defined. If a method is not defined,
     # scrapy acts as if the spider middleware does not modify the
     # passed objects.

View File

@@ -11,7 +11,7 @@ from .items import NewsItem
 from .settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE
-class DecspiderPipeline:
+class NewsspiderPipeline:
     def open_spider(self, spider):
         # Connect to the database
         self.conn = mysql.connector.connect(user=MYSQL_USERNAME, password=MYSQL_PASSWORD, host=MYSQL_HOST, database=MYSQL_DATABASE, port=MYSQL_PORT)
@@ -52,11 +52,11 @@ class DecspiderPipeline:
             self.cursor.execute(f"ALTER TABLE `{self.table_name}` ADD COLUMN `updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
             spider.log(f'Added column `updated_at` to `{self.table_name}` table')
-        # Drop columns from the table that are not present in NewsItem
-        for column in existing_columns:
-            if column not in item_columns and column not in {'id', 'created_at', 'updated_at'}:
-                self.cursor.execute(f"ALTER TABLE `{self.table_name}` DROP COLUMN `{column}`")
-                spider.log(f'Dropped column `{column}` from `{self.table_name}` table')
+        # # Drop columns from the table that are not present in NewsItem
+        # for column in existing_columns:
+        #     if column not in item_columns and column not in {'id', 'created_at', 'updated_at'}:
+        #         self.cursor.execute(f"ALTER TABLE `{self.table_name}` DROP COLUMN `{column}`")
+        #         spider.log(f'Dropped column `{column}` from `{self.table_name}` table')
         self.conn.commit()

View File

@@ -1,4 +1,4 @@
-# Scrapy settings for decspider project
+# Scrapy settings for newsspider project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
@@ -7,14 +7,14 @@
 # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-BOT_NAME = "decspider"
+BOT_NAME = "newsspider"
-SPIDER_MODULES = ["decspider.spiders"]
-NEWSPIDER_MODULE = "decspider.spiders"
+SPIDER_MODULES = ["newsspider.spiders"]
+NEWSPIDER_MODULE = "newsspider.spiders"
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = "decspider (+http://www.yourdomain.com)"
+#USER_AGENT = "newsspider (+http://www.yourdomain.com)"
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
@@ -45,11 +45,11 @@ ROBOTSTXT_OBEY = False
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
-#    "decspider.middlewares.DecspiderSpiderMiddleware": 543,
+#    "newsspider.middlewares.NewsspiderSpiderMiddleware": 543,
 #}
 DOWNLOADER_MIDDLEWARES = {
-    "decspider.middlewares.ProxyMiddleware": 543,
+    "newsspider.middlewares.ProxyMiddleware": 543,
 }
 # Enable or disable extensions
@@ -61,7 +61,7 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'decspider.pipelines.DecspiderPipeline': 300,
+    'newsspider.pipelines.NewsspiderPipeline': 300,
     'crawlab.CrawlabPipeline': 888,
 }

View File

@@ -0,0 +1,36 @@
+import scrapy
+from ..items import NewsItem
+class BjxpvSpider(scrapy.Spider):
+    name = "bjxpv"
+    allowed_domains = ["guangfu.bjx.com.cn"]
+    start_urls = ["https://guangfu.bjx.com.cn/yw/"]
+    def parse(self, response):
+        news_list = response.css('.cc-list-content a')
+        for news in news_list:
+            url = news.attrib['href']
+            yield scrapy.Request(url, self.news_parse)
+        next_page = response.xpath('//a[contains(text(), "下一页")]')
+        if next_page.attrib['class'] != 'disable':
+            url = next_page.attrib['href']
+            yield response.follow(url, self.parse)
+    def news_parse(self, response):
+        news_item = NewsItem()
+        news_item['website'] = '北极星太阳能光伏网'
+        news_item['url'] = response.url
+        news_item['collection'] = ';'.join([c.get() for c in response.xpath('//div[@class="cc-crumbs"]/div[@class="box"]/em/a/text()')])
+        news_item['title'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/h1/text()').get()
+        news_item['date'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/p/span/text()').re(r'(\d{4}-\d{2}-\d{2})')[0]
+        news_item['source'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/p/span[2]/text()').re(r'来源:(.*)')[0]
+        news_item['keywords'] = ';'.join([k.get() for k in response.xpath('//span[@id="key_word"]/em/a/text()')])
+        paragraphs = response.css('.cc-article p')
+        news_item['content'] = '\n'.join([''.join([c.get() for c in p.xpath('.//text()')]) for p in paragraphs])
+        yield news_item
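
A quick way to smoke-test the new bjxpv spider outside of Crawlab is Scrapy's own CLI; a minimal sketch, assuming the project's dependencies and the MySQL settings used by the pipeline are configured (the output filename here is arbitrary):

    scrapy crawl bjxpv -o bjxpv_sample.json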

View File

@@ -70,6 +70,7 @@ class CorpnewsSpider(scrapy.Spider):
     def news_parse(self, response):
         news_item = NewsItem()
+        news_item['website'] = '东方电气集团官网'
         news_item['title'] = response.xpath('//div[@class="xq_nr_hd"]/h5/text()').get()
         news_item['collection'] = response.xpath('//div[@class="nysubsc"]/ul/li[@class="on"]/a/text()').get()
         news_item['url'] = response.url

View File

@@ -4,9 +4,9 @@
 # https://scrapyd.readthedocs.io/en/latest/deploy.html
 [settings]
-default = decspider.settings
+default = newsspider.settings
 shell = ipython
 [deploy]
 #url = http://localhost:6800/
-project = decspider
+project = newsspider