2024-05-17 13:49:44 +08:00
|
|
|
# Define your item pipelines here
|
|
|
|
#
|
|
|
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
|
|
|
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
|
|
|
|
|
|
|
|
|
|
|
# useful for handling different item types with a single interface
|
|
|
|
import mysql.connector
|
|
|
|
from scrapy.exceptions import DropItem
|
|
|
|
from .items import NewsItem
|
|
|
|
from .settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE
|
|
|
|
|
|
|
|
|
|
|
|
class DecspiderPipeline:
    """Scrapy item pipeline that persists NewsItem objects into MySQL.

    On spider start-up it connects to the database, derives a table name of
    the form ``<BOT_NAME>_<spider.name>``, and synchronizes the table schema
    with the fields declared on ``NewsItem`` (adding missing columns as TEXT
    and dropping columns no longer declared, except the bookkeeping columns
    ``id``/``created_at``/``updated_at``).
    """

    def open_spider(self, spider):
        """Connect to MySQL and reconcile the destination table's schema.

        :param spider: the running spider; its name and settings determine
            the target table name.
        """
        # Connect to the database using credentials from settings.py.
        self.conn = mysql.connector.connect(
            user=MYSQL_USERNAME,
            password=MYSQL_PASSWORD,
            host=MYSQL_HOST,
            database=MYSQL_DATABASE,
            port=MYSQL_PORT,
        )
        self.cursor = self.conn.cursor()

        # Dynamically generate the table name from bot + spider names.
        self.table_name = f'{spider.settings.get("BOT_NAME")}_{spider.name}'
        spider.log(f'Dataset name: {self.table_name}')

        # Create the table if it does not exist (id plus audit timestamps).
        self.cursor.execute(f"""
            CREATE TABLE IF NOT EXISTS `{self.table_name}` (
                id INT AUTO_INCREMENT PRIMARY KEY,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
            )
        """)

        # Read back the columns currently present in the table.
        self.cursor.execute(f"SHOW COLUMNS FROM `{self.table_name}`")
        existing_columns = {row[0] for row in self.cursor.fetchall()}

        # Columns required by the NewsItem declaration.
        item_columns = set(NewsItem.fields.keys())

        # Add every NewsItem field the table is missing (stored as TEXT).
        for column in item_columns:
            if column not in existing_columns:
                self.cursor.execute(f"ALTER TABLE `{self.table_name}` ADD COLUMN `{column}` TEXT")
                spider.log(f'Added column `{column}` to `{self.table_name}` table')

        # Ensure the audit columns exist on tables created by older runs.
        if 'created_at' not in existing_columns:
            self.cursor.execute(f"ALTER TABLE `{self.table_name}` ADD COLUMN `created_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP")
            spider.log(f'Added column `created_at` to `{self.table_name}` table')

        if 'updated_at' not in existing_columns:
            self.cursor.execute(f"ALTER TABLE `{self.table_name}` ADD COLUMN `updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
            spider.log(f'Added column `updated_at` to `{self.table_name}` table')

        # Drop columns no longer declared on NewsItem. NOTE: this discards
        # any data stored in those columns -- deliberate schema sync.
        for column in existing_columns:
            if column not in item_columns and column not in {'id', 'created_at', 'updated_at'}:
                self.cursor.execute(f"ALTER TABLE `{self.table_name}` DROP COLUMN `{column}`")
                spider.log(f'Dropped column `{column}` from `{self.table_name}` table')

        self.conn.commit()

    def close_spider(self, spider):
        """Release the cursor and the database connection.

        Bug fix: the cursor was previously never closed, leaking the
        server-side resource until the connection object was collected.
        """
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        """Insert a NewsItem into the table; other item types pass through.

        :param item: the scraped item; only ``NewsItem`` instances are stored.
        :param spider: the spider that produced the item (used for logging).
        :returns: the item, unchanged, for downstream pipelines.
        :raises DropItem: when the INSERT fails; the transaction is rolled
            back first so the connection stays usable.
        """
        if isinstance(item, NewsItem):
            # Backtick-quote column names so a field that collides with a SQL
            # keyword cannot break the statement (consistent with the quoting
            # used everywhere else in this pipeline). Values always go
            # through %s placeholders, never string interpolation.
            columns = ', '.join(f'`{key}`' for key in item.keys())
            placeholders = ', '.join(['%s'] * len(item))
            sql = f"INSERT INTO `{self.table_name}` ({columns}) VALUES ({placeholders})"

            try:
                self.cursor.execute(sql, list(item.values()))
                self.conn.commit()
            except mysql.connector.Error as e:
                spider.log(f"Error when inserting item: {e}")
                self.conn.rollback()
                raise DropItem(f"Error when inserting item: {e}")

        return item
|
|
|
|
|
|
|
|
|