diff --git a/decspider/__init__.py b/newsspider/__init__.py
similarity index 100%
rename from decspider/__init__.py
rename to newsspider/__init__.py
diff --git a/decspider/items.py b/newsspider/items.py
similarity index 86%
rename from decspider/items.py
rename to newsspider/items.py
index c44adb5..fd7b77e 100644
--- a/decspider/items.py
+++ b/newsspider/items.py
@@ -18,3 +18,5 @@ class NewsItem(scrapy.Item):
     collection = scrapy.Field()
     url = scrapy.Field()
     source_url = scrapy.Field()
+    website = scrapy.Field()
+    keywords = scrapy.Field()
diff --git a/decspider/middlewares.py b/newsspider/middlewares.py
similarity index 96%
rename from decspider/middlewares.py
rename to newsspider/middlewares.py
index 828e145..3cf598f 100644
--- a/decspider/middlewares.py
+++ b/newsspider/middlewares.py
@@ -12,7 +12,7 @@ from .settings import USERNAME, PASSWORD
 from faker import Faker
 
 
-class DecspiderSpiderMiddleware:
+class NewsspiderSpiderMiddleware:
     # Not all methods need to be defined. If a method is not defined,
     # scrapy acts as if the spider middleware does not modify the
     # passed objects.
diff --git a/decspider/myutils.py b/newsspider/myutils.py
similarity index 100%
rename from decspider/myutils.py
rename to newsspider/myutils.py
diff --git a/decspider/pipelines.py b/newsspider/pipelines.py
similarity index 86%
rename from decspider/pipelines.py
rename to newsspider/pipelines.py
index bf76928..5c4ea15 100644
--- a/decspider/pipelines.py
+++ b/newsspider/pipelines.py
@@ -11,7 +11,7 @@ from .items import NewsItem
 from .settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE
 
 
-class DecspiderPipeline:
+class NewsspiderPipeline:
     def open_spider(self, spider):
         # 连接数据库
         self.conn = mysql.connector.connect(user=MYSQL_USERNAME, password=MYSQL_PASSWORD, host=MYSQL_HOST, database=MYSQL_DATABASE, port=MYSQL_PORT)
@@ -52,11 +52,11 @@ class DecspiderPipeline:
             self.cursor.execute(f"ALTER TABLE `{self.table_name}` ADD COLUMN `updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
             spider.log(f'Added column `updated_at` to `{self.table_name}` table')
 
-        # 删除表中不存在于 NewsItem 中的字段
-        for column in existing_columns:
-            if column not in item_columns and column not in {'id', 'created_at', 'updated_at'}:
-                self.cursor.execute(f"ALTER TABLE `{self.table_name}` DROP COLUMN `{column}`")
-                spider.log(f'Dropped column `{column}` from `{self.table_name}` table')
+        # # 删除表中不存在于 NewsItem 中的字段
+        # for column in existing_columns:
+        #     if column not in item_columns and column not in {'id', 'created_at', 'updated_at'}:
+        #         self.cursor.execute(f"ALTER TABLE `{self.table_name}` DROP COLUMN `{column}`")
+        #         spider.log(f'Dropped column `{column}` from `{self.table_name}` table')
 
         self.conn.commit()
 
diff --git a/decspider/settings.py b/newsspider/settings.py
similarity index 87%
rename from decspider/settings.py
rename to newsspider/settings.py
index 24c0aef..7055880 100644
--- a/decspider/settings.py
+++ b/newsspider/settings.py
@@ -1,4 +1,4 @@
-# Scrapy settings for decspider project
+# Scrapy settings for newsspider project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
@@ -7,14 +7,14 @@
 #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = "decspider"
+BOT_NAME = "newsspider"
 
-SPIDER_MODULES = ["decspider.spiders"]
-NEWSPIDER_MODULE = "decspider.spiders"
+SPIDER_MODULES = ["newsspider.spiders"]
+NEWSPIDER_MODULE = "newsspider.spiders"
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = "decspider (+http://www.yourdomain.com)"
+#USER_AGENT = "newsspider (+http://www.yourdomain.com)"
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
@@ -45,11 +45,11 @@ ROBOTSTXT_OBEY = False
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
-#    "decspider.middlewares.DecspiderSpiderMiddleware": 543,
+#    "newsspider.middlewares.NewsspiderSpiderMiddleware": 543,
 #}
 
 DOWNLOADER_MIDDLEWARES = {
-    "decspider.middlewares.ProxyMiddleware": 543,
+    "newsspider.middlewares.ProxyMiddleware": 543,
 }
 
 # Enable or disable extensions
@@ -61,7 +61,7 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'decspider.pipelines.DecspiderPipeline': 300,
+    'newsspider.pipelines.NewsspiderPipeline': 300,
     'crawlab.CrawlabPipeline': 888,
 }
 
diff --git a/decspider/spiders/__init__.py b/newsspider/spiders/__init__.py
similarity index 100%
rename from decspider/spiders/__init__.py
rename to newsspider/spiders/__init__.py
diff --git a/newsspider/spiders/bjxpv.py b/newsspider/spiders/bjxpv.py
new file mode 100644
index 0000000..059f0a2
--- /dev/null
+++ b/newsspider/spiders/bjxpv.py
@@ -0,0 +1,36 @@
+import scrapy
+from ..items import NewsItem
+
+
+class BjxpvSpider(scrapy.Spider):
+    name = "bjxpv"
+    allowed_domains = ["guangfu.bjx.com.cn"]
+    start_urls = ["https://guangfu.bjx.com.cn/yw/"]
+
+    def parse(self, response):
+        news_list = response.css('.cc-list-content a')
+        for news in news_list:
+            url = news.attrib['href']
+            yield scrapy.Request(url, self.news_parse)
+
+        next_page = response.xpath('//a[contains(text(), "下一页")]')
+        if next_page.attrib['class'] != 'disable':
+            url = next_page.attrib['href']
+            yield response.follow(url, self.parse)
+
+
+    def news_parse(self, response):
+        news_item = NewsItem()
+        news_item['website'] = '北极星太阳能光伏网'
+        news_item['url'] = response.url
+        news_item['collection'] = ';'.join([c.get() for c in response.xpath('//div[@class="cc-crumbs"]/div[@class="box"]/em/a/text()')])
+
+        news_item['title'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/h1/text()').get()
+        news_item['date'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/p/span/text()').re(r'(\d{4}-\d{2}-\d{2})')[0]
+        news_item['source'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/p/span[2]/text()').re(r'来源:(.*)')[0]
+        news_item['keywords'] = ';'.join([k.get() for k in response.xpath('//span[@id="key_word"]/em/a/text()')])
+
+        paragraphs = response.css('.cc-article p')
+        news_item['content'] = '\n'.join([''.join([c.get() for c in p.xpath('.//text()')]) for p in paragraphs])
+
+        yield news_item
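
Note on the new spider (not part of the patch): BjxpvSpider.parse assumes the "下一页" link always exists and always carries a class attribute; SelectorList.attrib is an empty dict when nothing matches, so the ['class'] lookup raises KeyError on the last listing page or after a markup change. A minimal sketch of a guarded variant, keeping the patch's 'disable' convention (the class and spider name here are hypothetical so they do not clash with the real spider):

import scrapy


class BjxpvSpiderSketch(scrapy.Spider):
    """Sketch only: same crawl flow as BjxpvSpider, with a guarded pagination check."""
    name = "bjxpv_sketch"  # hypothetical name, not the spider added by this patch
    allowed_domains = ["guangfu.bjx.com.cn"]
    start_urls = ["https://guangfu.bjx.com.cn/yw/"]

    def parse(self, response):
        for news in response.css('.cc-list-content a'):
            # response.follow resolves relative hrefs and accepts the <a> selector directly
            yield response.follow(news, callback=self.news_parse)

        next_page = response.xpath('//a[contains(text(), "下一页")]')
        # SelectorList.attrib is {} when no link matched, so .get() avoids a KeyError
        # on the last page or after a layout change.
        if next_page and next_page.attrib.get('class') != 'disable':
            href = next_page.attrib.get('href')
            if href:
                yield response.follow(href, callback=self.parse)

    def news_parse(self, response):
        # the item extraction from the patch above would go here unchanged
        pass

The same idea applies to the date/source extraction, which indexes [0] into .re() results; Scrapy's .re_first() is the equivalent guard there.
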
diff --git a/decspider/spiders/corpnews.py b/newsspider/spiders/corpnews.py
similarity index 96%
rename from decspider/spiders/corpnews.py
rename to newsspider/spiders/corpnews.py
index ee68a94..9aa47a6 100644
--- a/decspider/spiders/corpnews.py
+++ b/newsspider/spiders/corpnews.py
@@ -70,6 +70,7 @@ class CorpnewsSpider(scrapy.Spider):
 
     def news_parse(self, response):
         news_item = NewsItem()
+        news_item['website'] = '东方电气集团官网'
         news_item['title'] = response.xpath('//div[@class="xq_nr_hd"]/h5/text()').get()
         news_item['collection'] = response.xpath('//div[@class="nysubsc"]/ul/li[@class="on"]/a/text()').get()
         news_item['url'] = response.url
diff --git a/scrapy.cfg b/scrapy.cfg
index 33de24a..6c1cacf 100644
--- a/scrapy.cfg
+++ b/scrapy.cfg
@@ -4,9 +4,9 @@
 # https://scrapyd.readthedocs.io/en/latest/deploy.html
 
 [settings]
-default = decspider.settings
+default = newsspider.settings
 shell = ipython
 
 [deploy]
 #url = http://localhost:6800/
-project = decspider
+project = newsspider
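
With the column-dropping block in NewsspiderPipeline commented out, columns removed from NewsItem now linger in the MySQL table, and the two new fields (website, keywords) only matter once items carrying them pass through the pipeline. A quick way to see which columns the table actually has after these changes (a sketch, not part of the patch; the table name "news" is a placeholder for whatever self.table_name resolves to in the pipeline, and the MYSQL_* constants are the same ones the pipeline imports):

import mysql.connector

from newsspider.settings import (MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST,
                                 MYSQL_PORT, MYSQL_DATABASE)

TABLE_NAME = "news"  # placeholder: substitute the pipeline's actual table_name

conn = mysql.connector.connect(user=MYSQL_USERNAME, password=MYSQL_PASSWORD,
                               host=MYSQL_HOST, port=MYSQL_PORT, database=MYSQL_DATABASE)
cursor = conn.cursor()
cursor.execute(f"SHOW COLUMNS FROM `{TABLE_NAME}`")
for column_name, *_ in cursor.fetchall():
    # compare against the NewsItem fields (including the new website / keywords)
    print(column_name)
cursor.close()
conn.close()

Running the new spider once first (scrapy crawl bjxpv) lets the pipeline apply its schema adjustments before the check.
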