commit
This commit is contained in:
107
db/QUERY_TASK_README.md
Normal file
107
db/QUERY_TASK_README.md
Normal file
@@ -0,0 +1,107 @@
|
||||
# AI MIP Query Task 表创建说明
|
||||
|
||||
## 1. 创建表
|
||||
|
||||
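在 MySQL 数据库中执行以下命令导入建表脚本(注意:脚本包含 `DROP TABLE IF EXISTS`,会先删除已存在的 `ai_mip_query_task` 表及其全部数据,并在建表后插入 3 条示例数据,请勿在已有业务数据的库上直接执行):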
|
||||
|
||||
```bash
|
||||
mysql -u your_user -p your_database < db/ai_mip_query_task.sql
|
||||
```
|
||||
|
||||
或者在MySQL客户端中直接执行 `db/ai_mip_query_task.sql` 文件内容。
|
||||
|
||||
## 2. 表结构说明
|
||||
|
||||
### 字段列表
|
||||
|
||||
| 字段名 | 类型 | 说明 |
|
||||
|--------|------|------|
|
||||
| id | int | 主键ID |
|
||||
| query_word | varchar(512) | 查询词/关键词 |
|
||||
| query_type | enum | 查询类型:keyword/phrase/long_tail |
|
||||
| task_date | char(8) | 任务日期 YYYYMMDD |
|
||||
| threshold_max | int | 最大抓取数量阈值 |
|
||||
| current_count | int | 当前已抓取数量 |
|
||||
| status | enum | 任务状态:ready/doing/failed/finished/closed |
|
||||
| priority | tinyint | 优先级 1-10 |
|
||||
| category | varchar(64) | 分类标签 |
|
||||
| source_platform | varchar(64) | 来源平台 |
|
||||
| crawl_url_count | int | 已爬取URL数量 |
|
||||
| valid_url_count | int | 有效URL数量(带广告) |
|
||||
| error_message | text | 错误信息 |
|
||||
| started_at | timestamp | 开始执行时间 |
|
||||
| finished_at | timestamp | 完成时间 |
|
||||
| closed_at | timestamp | 达到阈值关闭时间 |
|
||||
| created_at | timestamp | 创建时间 |
|
||||
| updated_at | timestamp | 更新时间 |
|
||||
| created_by | varchar(64) | 创建人 |
|
||||
| remark | varchar(512) | 备注信息 |
|
||||
|
||||
### 索引
|
||||
|
||||
- `uniq_query_date`: 同一查询词每天只有一个任务
|
||||
- `idx_date_status`: 按日期和状态查询
|
||||
- `idx_status_priority`: 按状态和优先级查询
|
||||
- `idx_category`: 按分类查询
|
||||
- `idx_threshold`: 阈值监控
|
||||
- `idx_closed`: 关闭时间索引
|
||||
|
||||
## 3. 使用示例
|
||||
|
||||
### Python代码
|
||||
|
||||
```python
|
||||
from db_manager import QueryTaskManager

# Set up the task manager.
manager = QueryTaskManager()

# Register a new query task and keep its id for the calls below.
new_task_id = manager.create_task(
    query_word="糖尿病治疗",
    query_type="keyword",
    threshold_max=50,
    priority=3,
    category="医疗",
)

# Fetch up to 10 tasks that are in the 'ready' state.
pending_tasks = manager.get_ready_tasks(limit=10)

# Move the task to the 'doing' state.
manager.update_task_status(new_task_id, 'doing')

# Record 5 crawled URLs, 3 of which were valid.
manager.increment_crawl_count(new_task_id, crawl_count=5, valid_count=3)

# Run the threshold check for the task.
manager.check_threshold(new_task_id)

# Retrieve statistics for the given task date (YYYYMMDD).
daily_stats = manager.get_task_statistics('20260119')
|
||||
```
|
||||
|
||||
## 4. 测试
|
||||
|
||||
运行测试脚本:
|
||||
|
||||
```bash
|
||||
python test_query_task.py
|
||||
```
|
||||
|
||||
## 5. 任务状态流转
|
||||
|
||||
```
|
||||
ready (准备中)
|
||||
↓
|
||||
doing (执行中)
|
||||
↓
|
||||
finished (完成) / failed (失败) / closed (达到阈值关闭)
|
||||
```
|
||||
|
||||
## 6. 注意事项
|
||||
|
||||
1. **唯一约束**:同一查询词在同一天只能有一个任务
|
||||
2. **阈值检查**:达到threshold_max时自动关闭任务
|
||||
3. **优先级**:数字越小优先级越高(1-10)
|
||||
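4. **时间戳**:状态变更会自动更新对应的时间字段(其中 `updated_at` 由数据库通过 `ON UPDATE CURRENT_TIMESTAMP` 自动维护;`started_at`、`finished_at`、`closed_at` 在表结构中没有自动更新机制,需由应用层在状态变更时写入)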
|
||||
60
db/ai_mip_query_task.sql
Normal file
60
db/ai_mip_query_task.sql
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
 MIP Query Task Table
 Stores query-word tasks that are used to crawl the URLs which need to be
 auto-clicked.

 Date: 2026-01-19

 WARNING: this script drops and recreates `ai_mip_query_task`; every row
 already stored in that table is lost. It then inserts three sample rows.
*/

SET NAMES utf8mb4;
-- Foreign key checks are switched off for the script and restored at the end.
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for ai_mip_query_task
-- ----------------------------
DROP TABLE IF EXISTS `ai_mip_query_task`;
CREATE TABLE `ai_mip_query_task` (
  -- Task identity.
  `id` int NOT NULL AUTO_INCREMENT COMMENT '主键ID',
  `query_word` varchar(512) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT '查询词/关键词',
  `query_type` enum('keyword','phrase','long_tail') CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT 'keyword' COMMENT '查询类型:关键词/短语/长尾词',
  `task_date` char(8) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '任务日期,格式:YYYYMMDD',
  -- Crawl limit and progress towards it.
  `threshold_max` int NOT NULL DEFAULT 100 COMMENT '最大抓取数量阈值',
  `current_count` int NOT NULL DEFAULT 0 COMMENT '当前已抓取数量',
  -- Scheduling state; a smaller priority number means a higher priority.
  `status` enum('ready','doing','failed','finished','closed') CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT 'ready' COMMENT '任务状态:准备中/执行中/失败/完成/已关闭',
  `priority` tinyint NOT NULL DEFAULT 5 COMMENT '优先级(1-10,数字越小优先级越高)',
  -- Classification.
  `category` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '分类标签(如:医疗、教育、法律等)',
  `source_platform` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT 'baidu' COMMENT '来源平台:baidu/sogou/360等',
  -- Result counters and last error.
  `crawl_url_count` int NOT NULL DEFAULT 0 COMMENT '已爬取URL数量',
  `valid_url_count` int NOT NULL DEFAULT 0 COMMENT '有效URL数量(带广告)',
  `error_message` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL COMMENT '错误信息',
  -- Lifecycle timestamps. These three default to NULL and have no ON UPDATE
  -- clause, so MySQL never fills them in on its own.
  `started_at` timestamp NULL DEFAULT NULL COMMENT '开始执行时间',
  `finished_at` timestamp NULL DEFAULT NULL COMMENT '完成时间',
  `closed_at` timestamp NULL DEFAULT NULL COMMENT '达到阈值关闭时间',
  -- Audit columns; only these two are maintained automatically by MySQL.
  `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
  `updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
  `created_by` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT 'system' COMMENT '创建人',
  `remark` varchar(512) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '备注信息',
  PRIMARY KEY (`id`) USING BTREE,
  -- Uniqueness covers the FULL query_word. With a (191) prefix, two distinct
  -- query words sharing their first 191 characters on the same task_date
  -- would be rejected as duplicates. Key size is 512*4 + 8*4 = 2080 bytes,
  -- within InnoDB's 3072-byte limit for ROW_FORMAT = DYNAMIC.
  UNIQUE INDEX `uniq_query_date`(`query_word` ASC, `task_date` ASC) USING BTREE COMMENT '同一查询词每天只有一个任务',
  INDEX `idx_date_status`(`task_date` ASC, `status` ASC) USING BTREE COMMENT '按日期和状态查询',
  INDEX `idx_status_priority`(`status` ASC, `priority` ASC) USING BTREE COMMENT '按状态和优先级查询',
  INDEX `idx_category`(`category` ASC) USING BTREE COMMENT '按分类查询',
  INDEX `idx_threshold`(`threshold_max` ASC, `current_count` ASC) USING BTREE COMMENT '阈值监控',
  INDEX `idx_closed`(`closed_at` ASC) USING BTREE COMMENT '关闭时间索引'
  -- NOTE(review): the varchar/text columns use utf8mb4_0900_ai_ci while the
  -- enum/char columns and the table default use utf8mb4_general_ci. Confirm
  -- this mix is intentional before comparing or joining across them.
) ENGINE = InnoDB
  AUTO_INCREMENT = 1
  CHARACTER SET = utf8mb4
  COLLATE = utf8mb4_general_ci
  COMMENT = 'MIP查询任务表 - 用于存储查询词抓取网址任务'
  ROW_FORMAT = DYNAMIC;

-- ----------------------------
-- Sample data
-- ----------------------------
INSERT INTO `ai_mip_query_task`
  (`query_word`, `query_type`, `task_date`, `threshold_max`, `priority`, `category`, `source_platform`, `remark`)
VALUES
  ('糖尿病治疗', 'keyword', '20260119', 50, 3, '医疗', 'baidu', '医疗类关键词测试'),
  ('在线教育平台', 'phrase', '20260119', 30, 5, '教育', 'baidu', '教育类短语测试'),
  ('法律咨询免费在线', 'long_tail', '20260119', 20, 7, '法律', 'baidu', '法律类长尾词测试');

SET FOREIGN_KEY_CHECKS = 1;
|
||||
14
db/alter_add_query_word.sql
Normal file
14
db/alter_add_query_word.sql
Normal file
@@ -0,0 +1,14 @@
|
||||
/*
 Adds a `query_word` column to the `ai_mip_site` table.
 The column records which query word a URL was crawled from.

 Date: 2026-01-19
*/

-- Step 1: add the nullable column, positioned right after `site_dimension`.
ALTER TABLE `ai_mip_site`
  ADD COLUMN `query_word` varchar(512)
    CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci
    NULL DEFAULT NULL
    COMMENT '来源查询词(从哪个关键词抓取)'
    AFTER `site_dimension`;

-- Step 2: add an index on the first 191 characters for lookups by query word.
ALTER TABLE `ai_mip_site`
  ADD INDEX `idx_query_word`(`query_word`(191) ASC) USING BTREE
    COMMENT '按查询词查询';
|
||||
Reference in New Issue
Block a user