rename spider and create bjxspider
parent 94536a09c1
commit d1e3d99a76
newsspider/items.py
@@ -18,3 +18,5 @@ class NewsItem(scrapy.Item):
     collection = scrapy.Field()
     url = scrapy.Field()
     source_url = scrapy.Field()
+    website = scrapy.Field()
+    keywords = scrapy.Field()
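For reference, the item class after this hunk plausibly reads as the sketch below. The title, date, source, and content fields are inferred from the spider code later in this commit; their exact placement in the class is an assumption.

    import scrapy


    class NewsItem(scrapy.Item):
        # Fields visible in this hunk
        collection = scrapy.Field()   # breadcrumb categories, ';'-joined by the spiders
        url = scrapy.Field()          # article URL
        source_url = scrapy.Field()
        website = scrapy.Field()      # human-readable site name (added here)
        keywords = scrapy.Field()     # article tags, ';'-joined (added here)
        # Assumed fields, inferred from bjxpv.py and corpnews.py below
        title = scrapy.Field()
        date = scrapy.Field()
        source = scrapy.Field()
        content = scrapy.Field()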
newsspider/middlewares.py
@@ -12,7 +12,7 @@ from .settings import USERNAME, PASSWORD
 from faker import Faker


-class DecspiderSpiderMiddleware:
+class NewsspiderSpiderMiddleware:
     # Not all methods need to be defined. If a method is not defined,
     # scrapy acts as if the spider middleware does not modify the
     # passed objects.
newsspider/pipelines.py
@@ -11,7 +11,7 @@ from .items import NewsItem
 from .settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE


-class DecspiderPipeline:
+class NewsspiderPipeline:
    def open_spider(self, spider):
        # Connect to the database
        self.conn = mysql.connector.connect(user=MYSQL_USERNAME, password=MYSQL_PASSWORD, host=MYSQL_HOST, database=MYSQL_DATABASE, port=MYSQL_PORT)
@@ -52,11 +52,11 @@ class DecspiderPipeline:
         self.cursor.execute(f"ALTER TABLE `{self.table_name}` ADD COLUMN `updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
         spider.log(f'Added column `updated_at` to `{self.table_name}` table')

-        # Drop table columns that are not defined in NewsItem
-        for column in existing_columns:
-            if column not in item_columns and column not in {'id', 'created_at', 'updated_at'}:
-                self.cursor.execute(f"ALTER TABLE `{self.table_name}` DROP COLUMN `{column}`")
-                spider.log(f'Dropped column `{column}` from `{self.table_name}` table')
+        # # Drop table columns that are not defined in NewsItem
+        # for column in existing_columns:
+        #     if column not in item_columns and column not in {'id', 'created_at', 'updated_at'}:
+        #         self.cursor.execute(f"ALTER TABLE `{self.table_name}` DROP COLUMN `{column}`")
+        #         spider.log(f'Dropped column `{column}` from `{self.table_name}` table')

         self.conn.commit()
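The `existing_columns` and `item_columns` sets referenced by the now-disabled block are built earlier in the same method, outside this hunk. A minimal sketch of how such a schema comparison is typically derived, assuming only the cursor and table name seen in the hunk; everything else is illustrative:

    # Columns currently present in the MySQL table
    self.cursor.execute(f"SHOW COLUMNS FROM `{self.table_name}`")
    existing_columns = {row[0] for row in self.cursor.fetchall()}

    # Columns declared on the Scrapy item (Item.fields maps field name -> Field)
    item_columns = set(NewsItem.fields.keys())

    # Add item columns the table is missing; the commented-out loop above was
    # the inverse step, dropping table columns the item no longer declares.
    for column in item_columns - existing_columns:
        self.cursor.execute(f"ALTER TABLE `{self.table_name}` ADD COLUMN `{column}` TEXT")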
newsspider/settings.py
@@ -1,4 +1,4 @@
-# Scrapy settings for decspider project
+# Scrapy settings for newsspider project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
@@ -7,14 +7,14 @@
 #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

-BOT_NAME = "decspider"
+BOT_NAME = "newsspider"

-SPIDER_MODULES = ["decspider.spiders"]
-NEWSPIDER_MODULE = "decspider.spiders"
+SPIDER_MODULES = ["newsspider.spiders"]
+NEWSPIDER_MODULE = "newsspider.spiders"


 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = "decspider (+http://www.yourdomain.com)"
+#USER_AGENT = "newsspider (+http://www.yourdomain.com)"

 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
@@ -45,11 +45,11 @@ ROBOTSTXT_OBEY = False
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
-#    "decspider.middlewares.DecspiderSpiderMiddleware": 543,
+#    "newsspider.middlewares.NewsspiderSpiderMiddleware": 543,
 #}

 DOWNLOADER_MIDDLEWARES = {
-    "decspider.middlewares.ProxyMiddleware": 543,
+    "newsspider.middlewares.ProxyMiddleware": 543,
 }

 # Enable or disable extensions
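The ProxyMiddleware registered here is not shown in this diff. Judging from the middlewares.py context above (Faker plus USERNAME and PASSWORD imported from settings), it plausibly rotates the User-Agent and attaches an authenticated proxy per request. A hedged sketch only; the proxy endpoint below is a placeholder, not the project's real one:

    from faker import Faker

    from .settings import USERNAME, PASSWORD


    class ProxyMiddleware:
        def __init__(self):
            self.faker = Faker()

        def process_request(self, request, spider):
            # Randomize the User-Agent and route through an authenticated proxy.
            # proxy.example.com:8000 is a placeholder assumption.
            request.headers['User-Agent'] = self.faker.user_agent()
            request.meta['proxy'] = f'http://{USERNAME}:{PASSWORD}@proxy.example.com:8000'
            return None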
@@ -61,7 +61,7 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'decspider.pipelines.DecspiderPipeline': 300,
+    'newsspider.pipelines.NewsspiderPipeline': 300,
     'crawlab.CrawlabPipeline': 888,
 }
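Scrapy runs item pipelines in ascending priority order, so items pass through the renamed NewsspiderPipeline (300) before Crawlab's result collector (888).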
newsspider/spiders/bjxpv.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+import scrapy
+from ..items import NewsItem
+
+
+class BjxpvSpider(scrapy.Spider):
+    name = "bjxpv"
+    allowed_domains = ["guangfu.bjx.com.cn"]
+    start_urls = ["https://guangfu.bjx.com.cn/yw/"]
+
+    def parse(self, response):
+        news_list = response.css('.cc-list-content a')
+        for news in news_list:
+            url = news.attrib['href']
+            yield scrapy.Request(url, self.news_parse)
+
+        next_page = response.xpath('//a[contains(text(), "下一页")]')
+        if next_page.attrib['class'] != 'disable':
+            url = next_page.attrib['href']
+            yield response.follow(url, self.parse)
+
+
+    def news_parse(self, response):
+        news_item = NewsItem()
+        news_item['website'] = '北极星太阳能光伏网'
+        news_item['url'] = response.url
+        news_item['collection'] = ';'.join([c.get() for c in response.xpath('//div[@class="cc-crumbs"]/div[@class="box"]/em/a/text()')])
+
+        news_item['title'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/h1/text()').get()
+        news_item['date'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/p/span/text()').re(r'(\d{4}-\d{2}-\d{2})')[0]
+        news_item['source'] = response.xpath('//div[@class="cc-headline"]/div[@class="box"]/p/span[2]/text()').re(r'来源:(.*)')[0]
+        news_item['keywords'] = ';'.join([k.get() for k in response.xpath('//span[@id="key_word"]/em/a/text()')])
+
+        paragraphs = response.css('.cc-article p')
+        news_item['content'] = '\n'.join([''.join([c.get() for c in p.xpath('.//text()')]) for p in paragraphs])
+
+        yield news_item
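Once the project is installed, the new spider runs with the standard CLI:

    scrapy crawl bjxpv

One caution, offered as a suggestion rather than part of the commit: `.attrib` on an empty SelectorList returns {}, so `next_page.attrib['class']` raises KeyError on a page without a 下一页 ("next page") link, or on an anchor lacking a class attribute. A defensive variant of the pagination step:

    next_page = response.xpath('//a[contains(text(), "下一页")]')
    if next_page and next_page.attrib.get('class') != 'disable':
        yield response.follow(next_page.attrib['href'], self.parse)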
newsspider/spiders/corpnews.py
@@ -70,6 +70,7 @@ class CorpnewsSpider(scrapy.Spider):

     def news_parse(self, response):
         news_item = NewsItem()
+        news_item['website'] = '东方电气集团官网'
         news_item['title'] = response.xpath('//div[@class="xq_nr_hd"]/h5/text()').get()
         news_item['collection'] = response.xpath('//div[@class="nysubsc"]/ul/li[@class="on"]/a/text()').get()
         news_item['url'] = response.url
scrapy.cfg
@@ -4,9 +4,9 @@
 # https://scrapyd.readthedocs.io/en/latest/deploy.html

 [settings]
-default = decspider.settings
+default = newsspider.settings
 shell = ipython

 [deploy]
 #url = http://localhost:6800/
-project = decspider
+project = newsspider
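With both the settings module and the deploy target renamed, a scrapyd deployment would follow the usual scrapyd-client workflow, assuming scrapyd is the intended deploy path (the commented url line suggests a local daemon) and its url is uncommented:

    pip install scrapyd-client
    scrapyd-deploy    # reads project = newsspider from the [deploy] section of scrapy.cfg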