rename spider and create bjxspider

cooper 2024-05-20 14:07:39 +08:00
parent 94536a09c1
commit d1e3d99a76
10 changed files with 56 additions and 17 deletions

View File

@@ -18,3 +18,5 @@ class NewsItem(scrapy.Item):
     collection = scrapy.Field()
     url = scrapy.Field()
     source_url = scrapy.Field()
+    website = scrapy.Field()
+    keywords = scrapy.Field()

View File

@@ -12,7 +12,7 @@ from .settings import USERNAME, PASSWORD
 from faker import Faker
-class DecspiderSpiderMiddleware:
+class NewsspiderSpiderMiddleware:
     # Not all methods need to be defined. If a method is not defined,
     # scrapy acts as if the spider middleware does not modify the
     # passed objects.

View File

@@ -11,7 +11,7 @@ from .items import NewsItem
 from .settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE
-class DecspiderPipeline:
+class NewsspiderPipeline:
     def open_spider(self, spider):
         # Connect to the database
         self.conn = mysql.connector.connect(user=MYSQL_USERNAME, password=MYSQL_PASSWORD, host=MYSQL_HOST, database=MYSQL_DATABASE, port=MYSQL_PORT)
@@ -52,11 +52,11 @@ class DecspiderPipeline:
             self.cursor.execute(f"ALTER TABLE `{self.table_name}` ADD COLUMN `updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
             spider.log(f'Added column `updated_at` to `{self.table_name}` table')
-        # Drop columns from the table that are not present in NewsItem
-        for column in existing_columns:
-            if column not in item_columns and column not in {'id', 'created_at', 'updated_at'}:
-                self.cursor.execute(f"ALTER TABLE `{self.table_name}` DROP COLUMN `{column}`")
-                spider.log(f'Dropped column `{column}` from `{self.table_name}` table')
+        # # Drop columns from the table that are not present in NewsItem
+        # for column in existing_columns:
+        #     if column not in item_columns and column not in {'id', 'created_at', 'updated_at'}:
+        #         self.cursor.execute(f"ALTER TABLE `{self.table_name}` DROP COLUMN `{column}`")
+        #         spider.log(f'Dropped column `{column}` from `{self.table_name}` table')
         self.conn.commit()

View File

@@ -1,4 +1,4 @@
-# Scrapy settings for decspider project
+# Scrapy settings for newsspider project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
@@ -7,14 +7,14 @@
 # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-BOT_NAME = "decspider"
+BOT_NAME = "newsspider"
-SPIDER_MODULES = ["decspider.spiders"]
-NEWSPIDER_MODULE = "decspider.spiders"
+SPIDER_MODULES = ["newsspider.spiders"]
+NEWSPIDER_MODULE = "newsspider.spiders"
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = "decspider (+http://www.yourdomain.com)"
+#USER_AGENT = "newsspider (+http://www.yourdomain.com)"
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
@@ -45,11 +45,11 @@ ROBOTSTXT_OBEY = False
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
-#    "decspider.middlewares.DecspiderSpiderMiddleware": 543,
+#    "newsspider.middlewares.NewsspiderSpiderMiddleware": 543,
 #}
 DOWNLOADER_MIDDLEWARES = {
-    "decspider.middlewares.ProxyMiddleware": 543,
+    "newsspider.middlewares.ProxyMiddleware": 543,
 }
 # Enable or disable extensions
@@ -61,7 +61,7 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'decspider.pipelines.DecspiderPipeline': 300,
+    'newsspider.pipelines.NewsspiderPipeline': 300,
     'crawlab.CrawlabPipeline': 888,
 }

View File

@@ -0,0 +1,36 @@
+import scrapy
+from ..items import NewsItem
+class BjxpvSpider(scrapy.Spider):
+    name = "bjxpv"
+    allowed_domains = ["guangfu.bjx.com.cn"]
+    start_urls = ["https://guangfu.bjx.com.cn/yw/"]
+    def parse(self, response):
+        news_list = response.css('.cc-list-content a')
+        for news in news_list:
+            url = news.attrib['href']
+            yield scrapy.Request(url, self.news_parse)
+        next_page = response.xpath('//a[contains(text(), "下一页")]')
+        if next_page.attrib['class'] != 'disable':
+            url = next_page.attrib['href']
+            yield response.follow(url, self.parse)
+    def news_parse(self, response):
+        news_item = NewsItem()
+        news_item['website'] = '北极星太阳能光伏网'
+        news_item['url'] = response.url
+        news_item['collection'] = ';'.join([c.get() for c in response.xpath('//div[@class="cc-crumbs"]/div[@class="box"]/em/a/text()')])
+        news_item['title'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/h1/text()').get()
+        news_item['date'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/p/span/text()').re(r'(\d{4}-\d{2}-\d{2})')[0]
+        news_item['source'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/p/span[2]/text()').re(r'来源:(.*)')[0]
+        news_item['keywords'] = ';'.join([k.get() for k in response.xpath('//span[@id="key_word"]/em/a/text()')])
+        paragraphs = response.css('.cc-article p')
+        news_item['content'] = '\n'.join([''.join([c.get() for c in p.xpath('.//text()')]) for p in paragraphs])
+        yield news_item
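
A quick way to smoke-test the new bjxpv spider outside of Crawlab is Scrapy's own CLI; a minimal sketch, assuming the project's dependencies and the MySQL settings used by the pipeline are configured (the output filename here is arbitrary):

    scrapy crawl bjxpv -o bjxpv_sample.json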

View File

@@ -70,6 +70,7 @@ class CorpnewsSpider(scrapy.Spider):
     def news_parse(self, response):
         news_item = NewsItem()
+        news_item['website'] = '东方电气集团官网'
         news_item['title'] = response.xpath('//div[@class="xq_nr_hd"]/h5/text()').get()
         news_item['collection'] = response.xpath('//div[@class="nysubsc"]/ul/li[@class="on"]/a/text()').get()
         news_item['url'] = response.url

View File

@@ -4,9 +4,9 @@
 # https://scrapyd.readthedocs.io/en/latest/deploy.html
 [settings]
-default = decspider.settings
+default = newsspider.settings
 shell = ipython
 [deploy]
 #url = http://localhost:6800/
-project = decspider
+project = newsspider