rename spider and create bjxspider
parent 94536a09c1
commit d1e3d99a76
newsspider/items.py

@@ -18,3 +18,5 @@ class NewsItem(scrapy.Item):
     collection = scrapy.Field()
     url = scrapy.Field()
     source_url = scrapy.Field()
+    website = scrapy.Field()
+    keywords = scrapy.Field()
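For context, the complete NewsItem after this change presumably looks roughly like the sketch below. The title, date, source and content fields are not visible in the hunk above and are inferred from the spiders further down, so their exact names and order in items.py are assumptions:

import scrapy


class NewsItem(scrapy.Item):
    # Fields inferred from the spiders in this commit; order is assumed.
    title = scrapy.Field()
    date = scrapy.Field()
    source = scrapy.Field()
    content = scrapy.Field()
    collection = scrapy.Field()
    url = scrapy.Field()
    source_url = scrapy.Field()
    website = scrapy.Field()    # added in this commit
    keywords = scrapy.Field()   # added in this commit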
newsspider/middlewares.py

@@ -12,7 +12,7 @@ from .settings import USERNAME, PASSWORD
 from faker import Faker
 
 
-class DecspiderSpiderMiddleware:
+class NewsspiderSpiderMiddleware:
     # Not all methods need to be defined. If a method is not defined,
     # scrapy acts as if the spider middleware does not modify the
     # passed objects.
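middlewares.py also defines the ProxyMiddleware that the settings below register, but its body is not part of this diff. Given that the file imports USERNAME and PASSWORD from settings and uses faker, it is presumably something along the lines of this sketch; the proxy endpoint, header handling and class internals here are assumptions, not the actual implementation:

from faker import Faker

from .settings import USERNAME, PASSWORD


class ProxyMiddleware:
    # Hypothetical sketch of a downloader middleware that routes requests
    # through an authenticated proxy and randomizes the User-Agent.
    def __init__(self):
        self.faker = Faker()

    def process_request(self, request, spider):
        # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy'];
        # the proxy host below is a placeholder.
        request.meta['proxy'] = f'http://{USERNAME}:{PASSWORD}@proxy.example.com:8000'
        request.headers['User-Agent'] = self.faker.user_agent()
        return None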
newsspider/pipelines.py

@@ -11,7 +11,7 @@ from .items import NewsItem
 from .settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE
 
 
-class DecspiderPipeline:
+class NewsspiderPipeline:
     def open_spider(self, spider):
         # Connect to the database
         self.conn = mysql.connector.connect(user=MYSQL_USERNAME, password=MYSQL_PASSWORD, host=MYSQL_HOST, database=MYSQL_DATABASE, port=MYSQL_PORT)
@@ -52,11 +52,11 @@ class DecspiderPipeline:
             self.cursor.execute(f"ALTER TABLE `{self.table_name}` ADD COLUMN `updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
             spider.log(f'Added column `updated_at` to `{self.table_name}` table')
 
-        # Drop columns from the table that no longer exist in NewsItem
-        for column in existing_columns:
-            if column not in item_columns and column not in {'id', 'created_at', 'updated_at'}:
-                self.cursor.execute(f"ALTER TABLE `{self.table_name}` DROP COLUMN `{column}`")
-                spider.log(f'Dropped column `{column}` from `{self.table_name}` table')
+        # # Drop columns from the table that no longer exist in NewsItem
+        # for column in existing_columns:
+        #     if column not in item_columns and column not in {'id', 'created_at', 'updated_at'}:
+        #         self.cursor.execute(f"ALTER TABLE `{self.table_name}` DROP COLUMN `{column}`")
+        #         spider.log(f'Dropped column `{column}` from `{self.table_name}` table')
 
         self.conn.commit()
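This hunk only disables the column-dropping half of the schema sync in open_spider, so columns missing from NewsItem are now kept rather than dropped. The process_item method that actually writes each item is not part of the diff; a minimal sketch of what such an insert typically looks like with mysql-connector, assuming the same self.conn, self.cursor and self.table_name attributes used above:

    def process_item(self, item, spider):
        # Minimal sketch (not from this diff): insert one NewsItem row and
        # let MySQL fill in id / created_at / updated_at itself.
        columns = list(item.keys())
        column_list = ', '.join(f'`{c}`' for c in columns)
        placeholders = ', '.join(['%s'] * len(columns))
        sql = f"INSERT INTO `{self.table_name}` ({column_list}) VALUES ({placeholders})"
        self.cursor.execute(sql, [item[c] for c in columns])
        self.conn.commit()
        return item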
newsspider/settings.py

@@ -1,4 +1,4 @@
-# Scrapy settings for decspider project
+# Scrapy settings for newsspider project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
@@ -7,14 +7,14 @@
 # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = "decspider"
+BOT_NAME = "newsspider"
 
-SPIDER_MODULES = ["decspider.spiders"]
-NEWSPIDER_MODULE = "decspider.spiders"
+SPIDER_MODULES = ["newsspider.spiders"]
+NEWSPIDER_MODULE = "newsspider.spiders"
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = "decspider (+http://www.yourdomain.com)"
+#USER_AGENT = "newsspider (+http://www.yourdomain.com)"
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
@@ -45,11 +45,11 @@ ROBOTSTXT_OBEY = False
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
-#    "decspider.middlewares.DecspiderSpiderMiddleware": 543,
+#    "newsspider.middlewares.NewsspiderSpiderMiddleware": 543,
 #}
 
 DOWNLOADER_MIDDLEWARES = {
-    "decspider.middlewares.ProxyMiddleware": 543,
+    "newsspider.middlewares.ProxyMiddleware": 543,
 }
 
 # Enable or disable extensions
@@ -61,7 +61,7 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'decspider.pipelines.DecspiderPipeline': 300,
+    'newsspider.pipelines.NewsspiderPipeline': 300,
     'crawlab.CrawlabPipeline': 888,
 }
 
newsspider/spiders/bjxpv.py  (new file, 36 lines)

@@ -0,0 +1,36 @@
+import scrapy
+from ..items import NewsItem
+
+
+class BjxpvSpider(scrapy.Spider):
+    name = "bjxpv"
+    allowed_domains = ["guangfu.bjx.com.cn"]
+    start_urls = ["https://guangfu.bjx.com.cn/yw/"]
+
+    def parse(self, response):
+        news_list = response.css('.cc-list-content a')
+        for news in news_list:
+            url = news.attrib['href']
+            yield scrapy.Request(url, self.news_parse)
+
+        next_page = response.xpath('//a[contains(text(), "下一页")]')
+        if next_page.attrib['class'] != 'disable':
+            url = next_page.attrib['href']
+            yield response.follow(url, self.parse)
+
+
+    def news_parse(self, response):
+        news_item = NewsItem()
+        news_item['website'] = '北极星太阳能光伏网'
+        news_item['url'] = response.url
+        news_item['collection'] = ';'.join([c.get() for c in response.xpath('//div[@class="cc-crumbs"]/div[@class="box"]/em/a/text()')])
+
+        news_item['title'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/h1/text()').get()
+        news_item['date'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/p/span/text()').re(r'(\d{4}-\d{2}-\d{2})')[0]
+        news_item['source'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/p/span[2]/text()').re(r'来源:(.*)')[0]
+        news_item['keywords'] = ';'.join([k.get() for k in response.xpath('//span[@id="key_word"]/em/a/text()')])
+
+        paragraphs = response.css('.cc-article p')
+        news_item['content'] = '\n'.join([''.join([c.get() for c in p.xpath('.//text()')]) for p in paragraphs])
+
+        yield news_item
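As a usage note (not part of the commit): once the project is renamed, the new spider would normally be started from the project root with the Scrapy CLI (scrapy crawl bjxpv), or programmatically, for example:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Run the new bjxpv spider with the project's own settings, so the
# pipelines and middlewares from newsspider/settings.py apply.
process = CrawlerProcess(get_project_settings())
process.crawl("bjxpv")
process.start()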
newsspider/spiders/corpnews.py

@@ -70,6 +70,7 @@ class CorpnewsSpider(scrapy.Spider):
 
     def news_parse(self, response):
         news_item = NewsItem()
+        news_item['website'] = '东方电气集团官网'
         news_item['title'] = response.xpath('//div[@class="xq_nr_hd"]/h5/text()').get()
         news_item['collection'] = response.xpath('//div[@class="nysubsc"]/ul/li[@class="on"]/a/text()').get()
         news_item['url'] = response.url
scrapy.cfg

@@ -4,9 +4,9 @@
 # https://scrapyd.readthedocs.io/en/latest/deploy.html
 
 [settings]
-default = decspider.settings
+default = newsspider.settings
 shell = ipython
 
 [deploy]
 #url = http://localhost:6800/
-project = decspider
+project = newsspider