From 318344585793940eed8579dda143e615174e588c Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Tue, 7 Apr 2026 15:11:53 +0800 Subject: [PATCH 01/14] =?UTF-8?q?=E5=91=8A=E8=AD=A6=E6=88=90=E5=8A=9F?= =?UTF-8?q?=E7=8E=87=E9=98=88=E5=80=BC=E4=BB=8E=E7=A1=AC=E7=BC=96=E7=A0=81?= =?UTF-8?q?50%=E7=A7=BB=E5=85=A5=E9=85=8D=E7=BD=AE=E6=96=87=E4=BB=B6?= =?UTF-8?q?=EF=BC=8C=E4=BB=A5=E6=94=AF=E6=8C=81=E8=87=AA=E5=AE=9A=E4=B9=89?= =?UTF-8?q?;=E7=9B=B8=E5=85=B3=E6=97=A5=E5=BF=97=E8=B0=83=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- feapder/core/parser_control.py | 4 ++-- feapder/core/scheduler.py | 14 +++++++------- feapder/setting.py | 3 ++- feapder/templates/project_template/setting.py | 3 ++- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/feapder/core/parser_control.py b/feapder/core/parser_control.py index 021d2956..1da45921 100644 --- a/feapder/core/parser_control.py +++ b/feapder/core/parser_control.py @@ -33,7 +33,7 @@ class ParserControl(threading.Thread): is_show_tip = False - # 实时统计已做任务数及失败任务数,若失败任务数/已做任务数>0.5 则报警 + # 实时统计请求成功数及失败数,用于计算请求成功率报警 _success_task_count = 0 _failed_task_count = 0 _total_task_count = 0 @@ -455,7 +455,7 @@ def add_parser(self, parser: BaseParser): class AirSpiderParserControl(ParserControl): is_show_tip = False - # 实时统计已做任务数及失败任务数,若失败任务数/已做任务数>0.5 则报警 + # 实时统计请求成功数及失败数,用于计算请求成功率报警 _success_task_count = 0 _failed_task_count = 0 diff --git a/feapder/core/scheduler.py b/feapder/core/scheduler.py index 0177d185..caff471d 100644 --- a/feapder/core/scheduler.py +++ b/feapder/core/scheduler.py @@ -330,19 +330,19 @@ def check_task_status(self): else: return - # 检查失败任务数量 超过1000 报警, + # 检查失败请求数量,超过阈值则报警 failed_count = self._redisdb.zget_count(self._tab_failed_requests) if failed_count > setting.WARNING_FAILED_COUNT: # 发送报警 - msg = "《%s》爬虫当前失败任务数:%s, 请检查爬虫是否正常" % (self._spider_name, failed_count) + msg = "《%s》爬虫当前失败请求数:%s, 请检查爬虫是否正常" % (self._spider_name, failed_count) log.error(msg) self.send_msg( msg, level="error", - message_prefix="《%s》爬虫当前失败任务数报警" % (self._spider_name), + message_prefix="《%s》爬虫当前失败请求数报警" % (self._spider_name), ) - # parser_control实时统计已做任务数及失败任务数,若成功率<0.5 则报警 + # parser_control实时统计请求成功数及失败数,若请求成功率低于阈值则报警 ( failed_task_count, success_task_count, @@ -351,9 +351,9 @@ def check_task_status(self): total_count = success_task_count + failed_task_count if total_count > 0: task_success_rate = success_task_count / total_count - if task_success_rate < 0.5: + if task_success_rate < setting.WARNING_SUCCESS_RATE: # 发送报警 - msg = "《%s》爬虫当前任务成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % ( + msg = "《%s》爬虫当前请求成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % ( self._spider_name, success_task_count, failed_task_count, @@ -363,7 +363,7 @@ def check_task_status(self): self.send_msg( msg, level="error", - message_prefix="《%s》爬虫当前任务成功率报警" % (self._spider_name), + message_prefix="《%s》爬虫当前请求成功率报警" % (self._spider_name), ) # 判断任务数是否变化 diff --git a/feapder/setting.py b/feapder/setting.py index c52b318c..b7b127bf 100644 --- a/feapder/setting.py +++ b/feapder/setting.py @@ -193,7 +193,8 @@ # 时间间隔 WARNING_INTERVAL = 3600 # 相同报警的报警时间间隔,防止刷屏; 0表示不去重 WARNING_LEVEL = "DEBUG" # 报警级别, DEBUG / INFO / ERROR -WARNING_FAILED_COUNT = 1000 # 任务失败数 超过WARNING_FAILED_COUNT则报警 +WARNING_FAILED_COUNT = 1000 # 失败请求数 超过WARNING_FAILED_COUNT则报警 +WARNING_SUCCESS_RATE = 0.5 # 请求成功率低于WARNING_SUCCESS_RATE则报警 WARNING_CHECK_TASK_COUNT_INTERVAL = 1200 # 检查已做任务数量的时间间隔,若两次时间间隔之间,任务数无变化则报警 # 日志 diff --git a/feapder/templates/project_template/setting.py b/feapder/templates/project_template/setting.py index 140aaa07..890ec9d1 100644 --- a/feapder/templates/project_template/setting.py +++ b/feapder/templates/project_template/setting.py @@ -175,7 +175,8 @@ # # 时间间隔 # WARNING_INTERVAL = 3600 # 相同报警的报警时间间隔,防止刷屏; 0表示不去重 # WARNING_LEVEL = "DEBUG" # 报警级别, DEBUG / INFO / ERROR -# WARNING_FAILED_COUNT = 1000 # 任务失败数 超过WARNING_FAILED_COUNT则报警 +# WARNING_FAILED_COUNT = 1000 # 失败请求数 超过WARNING_FAILED_COUNT则报警 +# WARNING_SUCCESS_RATE = 0.5 # 请求成功率低于WARNING_SUCCESS_RATE则报警 # # LOG_NAME = os.path.basename(os.getcwd()) # LOG_PATH = "log/%s.log" % LOG_NAME # log存储路径 From e8426e7c333b6070e5d2bc824d44d74da4654562 Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Tue, 7 Apr 2026 15:32:31 +0800 Subject: [PATCH 02/14] =?UTF-8?q?=E6=8F=90=E9=AB=98redis=E4=BE=9D=E8=B5=96?= =?UTF-8?q?=E5=85=BC=E5=AE=B9=E6=80=A7=EF=BC=9A=E6=9C=80=E5=A4=A7=E7=89=88?= =?UTF-8?q?=E6=9C=AC=E6=94=AF=E6=8C=81=E4=BB=8E<4.0=E6=8F=90=E5=8D=87?= =?UTF-8?q?=E5=88=B0<6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- feapder/db/redisdb.py | 22 ++++++++++++++-------- feapder/requirements.txt | 4 ++-- setup.py | 2 +- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/feapder/db/redisdb.py b/feapder/db/redisdb.py index d882e687..97b7d943 100644 --- a/feapder/db/redisdb.py +++ b/feapder/db/redisdb.py @@ -137,15 +137,14 @@ def get_connect(self): else self._ip_ports.split(",") ) if len(ip_ports) > 1: - startup_nodes = [] + parsed_nodes = [] for ip_port in ip_ports: ip, port = ip_port.split(":") - startup_nodes.append({"host": ip, "port": port}) + parsed_nodes.append((ip, int(port))) if self._service_name: # log.debug("使用redis哨兵模式") - hosts = [(node["host"], node["port"]) for node in startup_nodes] - sentinel = Sentinel(hosts, socket_timeout=3, **self._kwargs) + sentinel = Sentinel(parsed_nodes, socket_timeout=3, **self._kwargs) self._redis = sentinel.master_for( self._service_name, password=self._user_pass, @@ -158,10 +157,17 @@ def get_connect(self): else: try: - from rediscluster import RedisCluster - except ModuleNotFoundError as e: - log.error('请安装 pip install "feapder[all]"') - os._exit(0) + from redis.cluster import RedisCluster, ClusterNode + startup_nodes = [ClusterNode(host=ip, port=port) for ip, port in parsed_nodes] + except ModuleNotFoundError: + try: + from rediscluster import RedisCluster + startup_nodes = [{"host": ip, "port": port} for ip, port in parsed_nodes] + except ModuleNotFoundError: + log.error( + '请安装 pip install "feapder[all]",或升级 redis>=4.0,或安装 redis-py-cluster' + ) + os._exit(0) # log.debug("使用redis集群模式") self._redis = RedisCluster( diff --git a/feapder/requirements.txt b/feapder/requirements.txt index 21717674..6b2a5230 100644 --- a/feapder/requirements.txt +++ b/feapder/requirements.txt @@ -4,7 +4,7 @@ parsel>=1.5.2 PyExecJS>=1.5.1 pymongo>=3.10.1 PyMySQL>=0.9.3 -redis>=2.10.6,<4.0.0 +redis>=2.10.6,<6.0.0 requests>=2.22.0 selenium>=3.141.0 bs4>=0.0.1 @@ -18,4 +18,4 @@ influxdb>=5.3.1 pyperclip>=1.8.2 webdriver-manager>=4.0.0 terminal-layout>=2.1.3 -playwright \ No newline at end of file +playwright diff --git a/setup.py b/setup.py index cf4fe542..090efad1 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ "DBUtils>=2.0", "parsel>=1.5.2", "PyMySQL>=0.9.3", - "redis>=2.10.6,<4.0.0", + "redis>=2.10.6,<6.0.0", "requests>=2.22.0", "bs4>=0.0.1", "ipython>=7.14.0", From f471fbf6d060350343dec850ec110e068add7b62 Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Tue, 7 Apr 2026 15:58:34 +0800 Subject: [PATCH 03/14] =?UTF-8?q?=E6=8F=90=E9=AB=98redis=E4=BE=9D=E8=B5=96?= =?UTF-8?q?=E5=85=BC=E5=AE=B9=E6=80=A7=EF=BC=9A=E6=9C=80=E5=A4=A7=E7=89=88?= =?UTF-8?q?=E6=9C=AC=E6=94=AF=E6=8C=81=E4=BB=8E<4.0=E6=8F=90=E5=8D=87?= =?UTF-8?q?=E5=88=B0<6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- feapder/requirements.txt | 1 - setup.py | 1 - 2 files changed, 2 deletions(-) diff --git a/feapder/requirements.txt b/feapder/requirements.txt index 6b2a5230..a882ee80 100644 --- a/feapder/requirements.txt +++ b/feapder/requirements.txt @@ -10,7 +10,6 @@ selenium>=3.141.0 bs4>=0.0.1 ipython>=7.14.0 bitarray>=1.5.3 -redis-py-cluster>=2.1.0 cryptography>=3.3.2 urllib3>=1.25.8 loguru>=0.5.3 diff --git a/setup.py b/setup.py index 090efad1..70baf38c 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,6 @@ "bitarray>=1.5.3", "PyExecJS>=1.5.1", "pymongo>=3.10.1", - "redis-py-cluster>=2.1.0", ] + render_requires setuptools.setup( From b1c5afa6a5ca8ca562eacc89cfcae3f22d36c133 Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Tue, 7 Apr 2026 18:16:00 +0800 Subject: [PATCH 04/14] =?UTF-8?q?=E6=96=87=E4=BB=B6=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?=E7=88=AC=E8=99=AB=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/_sidebar.md | 1 + ...37\350\203\275\346\246\202\350\247\210.md" | 14 +- docs/usage/FileSpider.md | 301 ++++++++++ feapder/__init__.py | 6 +- feapder/core/base_parser.py | 84 +++ feapder/core/spiders/__init__.py | 3 +- feapder/core/spiders/file_spider.py | 513 ++++++++++++++++++ feapder/dedup/file_dedup.py | 118 ++++ feapder/setting.py | 6 + tests/file-spider/table.sql | 20 + tests/file-spider/test_dedup_file_spider.py | 51 ++ tests/file-spider/test_local_file_spider.py | 47 ++ tests/file-spider/test_oss_file_spider.py | 62 +++ tests/file-spider/test_oss_result_spider.py | 92 ++++ 14 files changed, 1310 insertions(+), 8 deletions(-) create mode 100644 docs/usage/FileSpider.md create mode 100644 feapder/core/spiders/file_spider.py create mode 100644 feapder/dedup/file_dedup.py create mode 100644 tests/file-spider/table.sql create mode 100644 tests/file-spider/test_dedup_file_spider.py create mode 100644 tests/file-spider/test_local_file_spider.py create mode 100644 tests/file-spider/test_oss_file_spider.py create mode 100644 tests/file-spider/test_oss_result_spider.py diff --git a/docs/_sidebar.md b/docs/_sidebar.md index bef51b37..2b8efd4a 100644 --- a/docs/_sidebar.md +++ b/docs/_sidebar.md @@ -13,6 +13,7 @@ * [分布式爬虫-Spider](usage/Spider.md) * [任务爬虫-TaskSpider](usage/TaskSpider.md) * [批次爬虫-BatchSpider](usage/BatchSpider.md) + * [文件爬虫-FileSpider](usage/FileSpider.md) * [爬虫集成](usage/爬虫集成.md) * 使用进阶 diff --git "a/docs/foreword/\345\212\237\350\203\275\346\246\202\350\247\210.md" "b/docs/foreword/\345\212\237\350\203\275\346\246\202\350\247\210.md" index 9c714a34..4302064f 100644 --- "a/docs/foreword/\345\212\237\350\203\275\346\246\202\350\247\210.md" +++ "b/docs/foreword/\345\212\237\350\203\275\346\246\202\350\247\210.md" @@ -24,19 +24,23 @@ 2. 内存去重:处理一万条数据约0.5秒。 去重一亿条数据占用内存约285MB 3. 永久去重:处理一万条数据约3.5秒。去重一亿条数据占用内存约285MB -## 5. 数据采集完整性 +## 5. 支持批量文件下载 + +FileSpider 专用于批量下载文件/图片场景。一个任务包含多个待下载文件的 URL 列表,框架自动遍历生成下载请求,追踪下载进度,支持保存到本地磁盘或直接上传云存储。内置可选的文件去重机制,同一 URL 跨任务不重复下载。 + +## 6. 数据采集完整性 feapder对于每一条URL数据的抓取采取了强状态的控制,做到采集任务中URL抓取100%不丢失,即使多次尝试失败的URL也会进入错误队列并记录失败原因日志。这一特性对于很多强依赖采集数据的业务场景非常重要,保证数据用的放心。 -## 6. 数据自动入库 +## 7. 数据自动入库 只需要根据数据库表自动生成item,然后给item属性赋值,直接yield 返回即可批量入库 -## 7. 支持Debug模式 +## 8. 支持Debug模式 爬虫支持debug模式,debug模式下默认数据不入库、不修改任务状态。可针对某个任务进行调试,方便开发 -## 8. 完善的报警机制 +## 9. 完善的报警机制 为了保证数据的全量性、准确性、时效性,本框架内置报警机制,有了这些报警,我们可以实时掌握爬虫状态 @@ -53,7 +57,7 @@ feapder对于每一条URL数据的抓取采取了强状态的控制,做到采 ![-w416](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/12/29/16092335882158.jpg) -## 9. 下载监控 +## 10. 下载监控 框架对请求总数、成功数、失败数、解析异常数进行监控,将数据点打入到infuxdb,结合Grafana面板,可方便掌握抓取情况 diff --git a/docs/usage/FileSpider.md b/docs/usage/FileSpider.md new file mode 100644 index 00000000..733d262f --- /dev/null +++ b/docs/usage/FileSpider.md @@ -0,0 +1,301 @@ +# FileSpider + +FileSpider 是一款分布式文件下载爬虫,专用于批量下载文件/图片的场景。 + +核心特征: +- **一对多**: 一个任务包含多个待下载文件的 URL 列表,框架自动遍历生成下载请求 +- **进度追踪**: 框架自动追踪每个任务的下载进度(成功数/失败数/总数) +- **结果有序**: 下载结果列表与原始 URL 列表严格位置对应 +- **灵活存储**: 默认保存到本地磁盘,可重写为上传云存储(OSS/S3 等),不落盘 +- **文件去重**: 可选功能,同一 URL 不重复下载,支持 Redis / MySQL 两种策略 +- **用户控制**: 任务成功/失败由用户在回调中显式决定 + +FileSpider 继承自 TaskSpider,复用了全部任务管理能力(MySQL 任务表、Redis 队列、断点续爬、丢失任务回收、分布式支持等)。 + +## 1. 任务表 + +### MySQL 任务表(建议结构) + +```sql +CREATE TABLE `file_task` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `file_urls` text COMMENT '待下载文件URL列表,JSON数组格式', + `state` int(11) DEFAULT 0 COMMENT '任务状态: 0待做 2下载中 1完成 -1失败', + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; +``` + +字段说明: +- `id`: 任务主键,必须有 +- `file_urls`: 存放待下载文件 URL 的 JSON 数组,字段名可自定义 +- `state`: 任务状态字段,字段名可通过 `task_state` 参数配置。0=待做,2=已下发(框架自动设置),1=完成,-1=失败(由用户代码设置) + +## 2. 用户需实现的方法 + +### 必须实现 + +| 方法 | 说明 | +|------|------| +| `get_download_urls(task)` | 从 task 中提取文件 URL 列表,返回 `List[str]` | +| `on_task_all_done(task_id, success_count, fail_count, total_count, results)` | 任务所有文件处理完毕的回调,在此 yield Item 或 update_task_batch 更新状态 | + +### 可选重写 + +| 方法 | 说明 | 默认行为 | +|------|------|----------| +| `get_file_path(task, url)` | 返回文件保存路径/存储标识 | `{save_dir}/{task_id}/{filename}` | +| `process_file(task_id, url, file_path, response)` | 处理文件内容,返回最终存储位置 | 保存到本地磁盘,返回本地路径 | +| `on_file_downloaded(task_id, url, file_path)` | 单个文件下载成功回调 | 无 | +| `on_file_failed(task_id, url, error)` | 单个文件下载失败回调 | 无 | + +### 方法分层 + +``` +save_file (框架层,不应重写) + ├── process_file (用户层,按需重写) + │ ├── 默认: 保存到本地磁盘,返回本地路径 + │ └── 重写: 上传云存储,返回云存储 URL + ├── Redis 进度追踪 (自动) + ├── on_file_downloaded 回调 + └── 检查是否所有文件完成 + └── on_task_all_done (用户实现) + ├── yield Item → 写入结果表 + └── yield update_task_batch → 更新任务状态 +``` + +### `on_task_all_done` 参数说明 + +```python +def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + """ + results: List[str|None] + - 与 get_download_urls 返回的列表严格位置对应 + - 成功: 文件存储位置(本地路径或云存储 URL) + - 失败: None + 例: ["https://oss.com/a.jpg", "https://oss.com/b.jpg", None, "https://oss.com/d.jpg"] + """ +``` + +## 3. 构造参数 + +| 参数 | 类型 | 说明 | +|------|------|------| +| `redis_key` | str | Redis key 前缀(必填) | +| `task_table` | str | MySQL 任务表名(必填) | +| `task_keys` | list | 需要获取的任务字段列表(必填) | +| `save_dir` | str | 文件保存根目录,默认 `./downloads` | +| `file_dedup` | None/str/FileDedup | 文件去重策略:None 不去重,`"redis"` / `"mysql"` / FileDedup 实例 | +| `file_dedup_expire` | int | Redis 去重缓存过期时间(秒),仅 `file_dedup="redis"` 时生效 | +| `task_state` | str | 任务状态字段名,默认 `state` | +| `min_task_count` | int | Redis 中最少任务数,默认 10000 | +| `check_task_interval` | int | 检查任务间隔(秒),默认 5 | +| `task_limit` | int | 每次取任务数量,默认 10000 | +| `task_condition` | str | 任务筛选条件(WHERE 后的 SQL) | +| `task_order_by` | str | 取任务排序条件 | +| `thread_count` | int | 线程数 | +| `keep_alive` | bool | 是否常驻 | + +## 4. 使用示例 + +### 场景一:保存到本地磁盘 + +最简单的用法,下载文件保存到本地: + +```python +import json +import feapder + + +class LocalFileSpider(feapder.FileSpider): + def get_download_urls(self, task): + return json.loads(task.file_urls) + + def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + if fail_count == 0: + yield self.update_task_batch(task_id, 1) + else: + yield self.update_task_batch(task_id, -1) + + +if __name__ == "__main__": + spider = LocalFileSpider( + redis_key="local_file_spider", + task_table="file_task", + task_keys=["id", "file_urls"], + save_dir="./downloads", + ) + spider.start_monitor_task() +``` + +### 场景二:上传云存储(不落盘) + +重写 `process_file` 实现直接上传云存储: + +```python +import json +import os +import feapder +from urllib.parse import urlparse, unquote + + +class OssFileSpider(feapder.FileSpider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # 初始化云存储客户端 + # self.oss_client = OSSClient(bucket="my-bucket") + + def get_download_urls(self, task): + return json.loads(task.file_urls) + + def get_file_path(self, task, url): + """返回 OSS 存储 key(不是本地路径)""" + filename = os.path.basename(unquote(urlparse(url).path)) + return f"images/{task.id}/{filename}" + + def process_file(self, task_id, url, file_path, response): + """上传 OSS,返回云存储 URL""" + # self.oss_client.put_object(file_path, response.content) + return f"https://my-bucket.oss.aliyuncs.com/{file_path}" + + def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + if success_count > 0: + yield self.update_task_batch(task_id, 1) + else: + yield self.update_task_batch(task_id, -1) + + +if __name__ == "__main__": + spider = OssFileSpider( + redis_key="oss_file_spider", + task_table="file_task", + task_keys=["id", "file_urls"], + ) + spider.start_monitor_task() +``` + +### 场景三:上传云存储 + 结果入库 + +先创建结果 Item: + +```bash +feapder create -i file_result +``` + +编辑生成的 `items/file_result_item.py`,添加所需字段,然后在爬虫中引用: + +```python +import json +import os +import feapder +from urllib.parse import urlparse, unquote +from items.file_result_item import FileResultItem + + +class OssResultSpider(feapder.FileSpider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # self.oss_client = OSSClient(bucket="my-bucket") + + def get_download_urls(self, task): + return json.loads(task.file_urls) + + def get_file_path(self, task, url): + filename = os.path.basename(unquote(urlparse(url).path)) + return f"images/{task.id}/{filename}" + + def process_file(self, task_id, url, file_path, response): + # self.oss_client.put_object(file_path, response.content) + return f"https://my-bucket.oss.aliyuncs.com/{file_path}" + + def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + # results 与 get_download_urls 返回的列表严格位置对应 + item = FileResultItem() + item.task_id = task_id + item.result_urls = json.dumps(results) + yield item + + if fail_count == 0: + yield self.update_task_batch(task_id, 1) + else: + yield self.update_task_batch(task_id, -1) +``` + +### 场景四:启用文件去重 + +通过 `file_dedup` 参数启用,同一 URL 跨任务不重复下载: + +```python +import json +import feapder + + +class DedupFileSpider(feapder.FileSpider): + def get_download_urls(self, task): + return json.loads(task.file_urls) + + def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + yield self.update_task_batch(task_id, 1 if fail_count == 0 else -1) + + +if __name__ == "__main__": + spider = DedupFileSpider( + redis_key="dedup_file_spider", + task_table="file_task", + task_keys=["id", "file_urls"], + save_dir="./downloads", + file_dedup="redis", # "redis" / "mysql" / FileDedup 实例 + ) + spider.start_monitor_task() +``` + +去重行为: +- `start_requests` 中遍历 URL 列表时,先查去重缓存 +- 缓存命中:直接复用已有结果,不生成 Request,不重复下载 +- 缓存未命中:正常下载,成功后自动写入去重缓存 +- 跨任务共享:不同任务中出现的相同 URL 只下载一次 + +## 5. 文件去重 + +### 去重策略 + +| 策略 | 参数值 | 存储 | 适用场景 | +|------|--------|------|----------| +| 不去重 | `None`(默认) | - | 每次都重新下载 | +| Redis 去重 | `"redis"` | Redis Hash | 分布式共享,多进程安全 | +| MySQL 去重 | `"mysql"` | MySQL 表(自动建表) | 持久化,长期缓存 | +| 自定义去重 | `FileDedup` 实例 | 用户自定义 | 特殊需求 | + +### 自定义去重 + +继承 `FileDedup` 接口: + +```python +from feapder.dedup.file_dedup import FileDedup + +class MyFileDedup(FileDedup): + def get(self, url): + """返回缓存结果,无缓存返回 None""" + ... + + def set(self, url, result_url): + """缓存处理结果""" + ... +``` + +## 6. Debug 模式 + +支持 Debug 模式,可针对单个任务调试: + +```python +if __name__ == "__main__": + spider = MyFileSpider.to_DebugFileSpider( + task_id=1, + redis_key="my_file_spider", + task_table="file_task", + task_keys=["id", "file_urls"], + save_dir="./downloads", + ) + spider.start() +``` + +Debug 模式下默认不入库、不更新任务状态。 diff --git a/feapder/__init__.py b/feapder/__init__.py index 565be4b9..ed5acf90 100644 --- a/feapder/__init__.py +++ b/feapder/__init__.py @@ -18,9 +18,11 @@ "Spider", "TaskSpider", "BatchSpider", + "FileSpider", "BaseParser", "TaskParser", "BatchParser", + "FileParser", "Request", "Response", "Item", @@ -28,8 +30,8 @@ "ArgumentParser", ] -from feapder.core.spiders import AirSpider, Spider, TaskSpider, BatchSpider -from feapder.core.base_parser import BaseParser, TaskParser, BatchParser +from feapder.core.spiders import AirSpider, Spider, TaskSpider, BatchSpider, FileSpider +from feapder.core.base_parser import BaseParser, TaskParser, BatchParser, FileParser from feapder.network.request import Request from feapder.network.response import Response from feapder.network.item import Item, UpdateItem diff --git a/feapder/core/base_parser.py b/feapder/core/base_parser.py index a06f9c44..1a379bad 100644 --- a/feapder/core/base_parser.py +++ b/feapder/core/base_parser.py @@ -191,6 +191,90 @@ def update_task_batch(self, task_id, state=1, **kwargs): return update_item +class FileParser(TaskParser): + """ + @summary: 文件下载爬虫模版 + --------- + """ + + def __init__(self, task_table, task_state, mysqldb=None, save_dir="./downloads"): + super(FileParser, self).__init__( + task_table=task_table, task_state=task_state, mysqldb=mysqldb + ) + self._save_dir = save_dir + + def get_download_urls(self, task): + """ + 从 task 中获取需要下载的文件 URL 列表,用户必须实现 + @param task: 任务信息 + @return: List[str] - URL 列表 + """ + raise NotImplementedError("必须实现 get_download_urls 方法") + + def get_file_path(self, task, url): + """ + 返回文件保存路径/标识,用户可重写 + 本地场景: 返回本地文件路径,如 ./downloads/123/image.jpg + 云存储场景: 返回存储标识/key,如 bucket/prefix/123/image.jpg + @param task: 任务信息 + @param url: 文件 URL + @return: str - 文件路径或存储标识 + """ + from urllib.parse import urlparse, unquote + + parsed = urlparse(url) + filename = os.path.basename(unquote(parsed.path)) or "unknown" + return os.path.join(self._save_dir, str(task.id), filename) + + def process_file(self, task_id, url, file_path, response): + """ + 处理下载的文件内容,返回文件最终存储位置。用户按需重写 + 默认实现: 保存到本地磁盘,返回本地路径 + 云存储场景: 重写此方法上传到 OSS/S3 等,返回云存储 URL + @param task_id: 任务 ID + @param url: 文件原始 URL + @param file_path: get_file_path 返回的路径/标识 + @param response: 下载响应 + @return: str - 文件最终存储位置(本地路径或云存储 URL) + """ + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "wb") as f: + f.write(response.content) + return file_path + + def on_file_downloaded(self, task_id, url, file_path): + """ + 单个文件下载成功的回调,用户可重写 + @param task_id: 任务 ID + @param url: 文件原始 URL + @param file_path: 文件存储位置 + """ + pass + + def on_file_failed(self, task_id, url, error): + """ + 单个文件下载失败的回调,用户可重写 + @param task_id: 任务 ID + @param url: 文件原始 URL + @param error: 异常信息 + """ + pass + + def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + """ + 任务所有文件处理完毕的回调 + 用户应在此方法中 yield Item 写入结果表、yield self.update_task_batch() 更新任务状态 + @param task_id: 任务 ID + @param success_count: 成功数 + @param fail_count: 失败数 + @param total_count: 总数 + @param results: List[str|None] - 每个文件的处理结果, + 顺序与 get_download_urls 返回的列表一致。 + 成功为文件存储位置(本地路径或云存储 URL),失败为 None + """ + pass + + class BatchParser(TaskParser): """ @summary: 批次爬虫模版 diff --git a/feapder/core/spiders/__init__.py b/feapder/core/spiders/__init__.py index a32ba668..041e47fe 100644 --- a/feapder/core/spiders/__init__.py +++ b/feapder/core/spiders/__init__.py @@ -8,9 +8,10 @@ @email: boris_liu@foxmail.com """ -__all__ = ["AirSpider", "TaskSpider", "Spider", "BatchSpider"] +__all__ = ["AirSpider", "TaskSpider", "Spider", "BatchSpider", "FileSpider"] from feapder.core.spiders.air_spider import AirSpider from feapder.core.spiders.spider import Spider from feapder.core.spiders.task_spider import TaskSpider from feapder.core.spiders.batch_spider import BatchSpider +from feapder.core.spiders.file_spider import FileSpider diff --git a/feapder/core/spiders/file_spider.py b/feapder/core/spiders/file_spider.py new file mode 100644 index 00000000..9c4bde06 --- /dev/null +++ b/feapder/core/spiders/file_spider.py @@ -0,0 +1,513 @@ +# -*- coding: utf-8 -*- +""" +Created on 2026/4/7 +--------- +@summary: 文件下载爬虫 +--------- +""" + +import os +import warnings + +import feapder.setting as setting +import feapder.utils.tools as tools +from feapder.core.spiders.task_spider import TaskSpider +from feapder.dedup.file_dedup import FileDedup, RedisFileDedup, MysqlFileDedup +from feapder.network.item import UpdateItem +from feapder.network.request import Request +from feapder.utils.log import log + +CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline" + + +class FileSpider(TaskSpider): + """ + 文件下载爬虫 + + 基于 TaskSpider,专用于批量下载文件/图片的场景。 + - 一个任务包含多个待下载文件的 URL 列表(一对多) + - 框架自动追踪每个任务的下载进度 + - 支持保存到本地磁盘或上传云存储 + - 任务成功/失败由用户在 on_task_all_done 中显式决定 + - 可选文件去重,同一 URL 不重复下载 + """ + + def __init__( + self, + redis_key, + task_table, + task_keys, + save_dir="./downloads", + file_dedup=None, + file_dedup_expire=None, + task_table_type="mysql", + task_state="state", + min_task_count=10000, + check_task_interval=5, + task_limit=10000, + related_redis_key=None, + related_batch_record=None, + task_condition="", + task_order_by="", + thread_count=None, + begin_callback=None, + end_callback=None, + delete_keys=(), + keep_alive=None, + batch_interval=0, + use_mysql=True, + **kwargs, + ): + """ + @summary: 文件下载爬虫 + --------- + @param redis_key: 任务等数据存放在 redis 中的 key 前缀 + @param task_table: mysql 中的任务表 + @param task_keys: 需要获取的任务字段 列表 + @param save_dir: 文件保存根目录,默认 ./downloads + @param file_dedup: 文件去重策略。 + None: 不去重(默认) + "redis": 使用 Redis Hash 去重 + "mysql": 使用 MySQL 表去重 + FileDedup 实例: 自定义去重实现 + @param file_dedup_expire: Redis 去重缓存过期时间(秒),仅 file_dedup="redis" 时生效 + @param task_table_type: 任务表类型 支持 redis、mysql + @param task_state: mysql 中任务表的任务状态字段 + @param min_task_count: redis 中最少任务数,少于这个数量会从种子表中取任务 + @param check_task_interval: 检查是否还有任务的时间间隔 + @param task_limit: 每次从数据库中取任务的数量 + @param related_redis_key: 有关联的其他爬虫任务表(redis) + @param related_batch_record: 有关联的其他爬虫批次表(mysql) + @param task_condition: 任务条件,用于筛选任务 + @param task_order_by: 取任务时的排序条件 + @param thread_count: 线程数 + @param begin_callback: 爬虫开始回调函数 + @param end_callback: 爬虫结束回调函数 + @param delete_keys: 爬虫启动时删除的 key + @param keep_alive: 爬虫是否常驻 + @param batch_interval: 抓取时间间隔(天) + @param use_mysql: 是否使用 mysql 数据库 + --------- + """ + + super(FileSpider, self).__init__( + redis_key=redis_key, + task_table=task_table, + task_table_type=task_table_type, + task_keys=task_keys, + task_state=task_state, + min_task_count=min_task_count, + check_task_interval=check_task_interval, + task_limit=task_limit, + related_redis_key=related_redis_key, + related_batch_record=related_batch_record, + task_condition=task_condition, + task_order_by=task_order_by, + thread_count=thread_count, + begin_callback=begin_callback, + end_callback=end_callback, + delete_keys=delete_keys, + keep_alive=keep_alive, + batch_interval=batch_interval, + use_mysql=use_mysql, + **kwargs, + ) + + self._save_dir = save_dir + + if file_dedup == "redis": + dedup_table = setting.TAB_FILE_DEDUP.format(redis_key=self._redis_key) + self._file_dedup = RedisFileDedup(dedup_table, file_dedup_expire) + elif file_dedup == "mysql": + self._file_dedup = MysqlFileDedup() + elif isinstance(file_dedup, FileDedup): + self._file_dedup = file_dedup + else: + self._file_dedup = None + + # ===================== 用户需实现/可重写的方法 ===================== + + def get_download_urls(self, task): + """ + 从 task 中获取需要下载的文件 URL 列表,用户必须实现 + @param task: 任务信息 + @return: List[str] - URL 列表 + """ + raise NotImplementedError("必须实现 get_download_urls 方法") + + def get_file_path(self, task, url): + """ + 返回文件保存路径/标识,用户可重写 + 本地场景: 返回本地文件路径 + 云存储场景: 返回存储标识/key + @param task: 任务信息 + @param url: 文件 URL + @return: str + """ + from urllib.parse import urlparse, unquote + + parsed = urlparse(url) + filename = os.path.basename(unquote(parsed.path)) or "unknown" + return os.path.join(self._save_dir, str(task.id), filename) + + def process_file(self, task_id, url, file_path, response): + """ + 处理下载的文件内容,返回文件最终存储位置。用户按需重写 + 默认实现: 保存到本地磁盘,返回本地路径 + 云存储场景: 重写此方法上传到 OSS/S3 等,返回云存储 URL + @param task_id: 任务 ID + @param url: 文件原始 URL + @param file_path: get_file_path 返回的路径/标识 + @param response: 下载响应 + @return: str - 文件最终存储位置 + """ + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "wb") as f: + f.write(response.content) + return file_path + + def on_file_downloaded(self, task_id, url, file_path): + """ + 单个文件下载成功的回调,用户可重写 + @param task_id: 任务 ID + @param url: 文件原始 URL + @param file_path: 文件存储位置 + """ + pass + + def on_file_failed(self, task_id, url, error): + """ + 单个文件下载失败的回调,用户可重写 + @param task_id: 任务 ID + @param url: 文件原始 URL + @param error: 异常信息 + """ + pass + + def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + """ + 任务所有文件处理完毕的回调 + 用户应在此方法中 yield Item 写入结果表、yield self.update_task_batch() 更新任务状态 + @param task_id: 任务 ID + @param success_count: 成功数 + @param fail_count: 失败数 + @param total_count: 总数 + @param results: List[str|None] - 每个文件的处理结果, + 顺序与 get_download_urls 返回的列表一致。 + 成功为文件存储位置,失败为 None + """ + pass + + # ===================== 框架内部方法 ===================== + + def start_requests(self, task): + """ + 遍历 URL 列表生成下载请求。 + 去重缓存命中的 URL 直接复用结果,不生成 Request。 + """ + urls = self.get_download_urls(task) + if not urls: + log.warning(f"任务{task.id}无下载URL") + return + + total = len(urls) + task_id = task.id + progress_key = setting.TAB_FILE_PROGRESS.format( + redis_key=self._redis_key, task_id=task_id + ) + result_key = setting.TAB_FILE_RESULT.format( + redis_key=self._redis_key, task_id=task_id + ) + + self._redisdb.hset(progress_key, "total", total) + self._redisdb.hset(progress_key, "success", 0) + self._redisdb.hset(progress_key, "fail", 0) + + cached_count = 0 + for index, url in enumerate(urls): + # 去重缓存检查 + if self._file_dedup: + cached_result = self._file_dedup.get(url) + if cached_result is not None: + self._redisdb.hset(result_key, str(index), cached_result) + self._redisdb.hincrby(progress_key, "success", 1) + cached_count += 1 + log.debug(f"任务{task_id} 文件去重命中 url={url}") + self.on_file_downloaded(task_id, url, cached_result) + continue + + file_path = self.get_file_path(task, url) + yield Request( + url, + task_id=task_id, + file_index=index, + file_path=file_path, + callback=self.save_file, + ) + + if cached_count > 0: + log.info(f"任务{task_id} 去重命中{cached_count}/{total}个文件") + + # 全部命中缓存,直接触发 on_task_all_done + if cached_count >= total: + results = self._assemble_results(task_id, total) + for result in self.on_task_all_done( + task_id, cached_count, 0, total, results + ) or []: + yield result + self._cleanup_task_redis(task_id) + + def save_file(self, request, response): + """ + 框架内部回调,处理文件保存和进度追踪。用户不应重写此方法。 + """ + task_id = request.task_id + file_index = request.file_index + url = request.url + file_path = request.file_path + + result_url = self.process_file(task_id, url, file_path, response) + + # 写入去重缓存 + if self._file_dedup and result_url: + self._file_dedup.set(url, result_url) + + # 记录结果 + result_key = setting.TAB_FILE_RESULT.format( + redis_key=self._redis_key, task_id=task_id + ) + self._redisdb.hset(result_key, str(file_index), result_url or "") + + # 更新进度 + progress_key = setting.TAB_FILE_PROGRESS.format( + redis_key=self._redis_key, task_id=task_id + ) + success = self._redisdb.hincrby(progress_key, "success", 1) + total = int(self._redisdb.hget(progress_key, "total") or 0) + fail = int(self._redisdb.hget(progress_key, "fail") or 0) + + log.info(f"任务{task_id} 文件下载成功 [{success + fail}/{total}] url={url}") + self.on_file_downloaded(task_id, url, result_url) + + # 检查任务是否全部完成 + if success + fail >= total: + results = self._assemble_results(task_id, total) + for result in self.on_task_all_done( + task_id, success, fail, total, results + ) or []: + yield result + self._cleanup_task_redis(task_id) + + def failed_request(self, request, response, e): + """ + 文件下载失败(重试耗尽)的处理。 + """ + task_id = getattr(request, "task_id", None) + file_index = getattr(request, "file_index", None) + + if task_id is None or file_index is None: + yield request + return + + # 记录失败结果 + result_key = setting.TAB_FILE_RESULT.format( + redis_key=self._redis_key, task_id=task_id + ) + self._redisdb.hset(result_key, str(file_index), "") + + # 更新进度 + progress_key = setting.TAB_FILE_PROGRESS.format( + redis_key=self._redis_key, task_id=task_id + ) + fail = self._redisdb.hincrby(progress_key, "fail", 1) + total = int(self._redisdb.hget(progress_key, "total") or 0) + success = int(self._redisdb.hget(progress_key, "success") or 0) + + log.error(f"任务{task_id} 文件下载失败 [{success + fail}/{total}] url={request.url}") + self.on_file_failed(task_id, request.url, e) + + # 检查任务是否全部完成 + if success + fail >= total: + results = self._assemble_results(task_id, total) + for result in self.on_task_all_done( + task_id, success, fail, total, results + ) or []: + yield result + self._cleanup_task_redis(task_id) + + yield request + + def _assemble_results(self, task_id, total): + """ + 从 Redis 结果 Hash 中按 0~total-1 顺序读取所有文件处理结果, + 组装为有序列表返回。 + """ + result_key = setting.TAB_FILE_RESULT.format( + redis_key=self._redis_key, task_id=task_id + ) + results = [] + for i in range(total): + value = self._redisdb.hget(result_key, str(i)) + if value is None or value == "" or value == b"": + results.append(None) + else: + if isinstance(value, bytes): + value = value.decode() + results.append(value) + return results + + def _cleanup_task_redis(self, task_id): + """清理任务相关的 Redis 进度和结果 key""" + progress_key = setting.TAB_FILE_PROGRESS.format( + redis_key=self._redis_key, task_id=task_id + ) + result_key = setting.TAB_FILE_RESULT.format( + redis_key=self._redis_key, task_id=task_id + ) + self._redisdb.clear(progress_key) + self._redisdb.clear(result_key) + + @classmethod + def to_DebugFileSpider(cls, *args, **kwargs): + DebugFileSpider.__bases__ = (cls,) + DebugFileSpider.__name__ = cls.__name__ + return DebugFileSpider(*args, **kwargs) + + +class DebugFileSpider(FileSpider): + """ + Debug 文件下载爬虫 + """ + + __debug_custom_setting__ = dict( + COLLECTOR_TASK_COUNT=1, + SPIDER_THREAD_COUNT=1, + SPIDER_SLEEP_TIME=0, + SPIDER_MAX_RETRY_TIMES=10, + REQUEST_LOST_TIMEOUT=600, + PROXY_ENABLE=False, + RETRY_FAILED_REQUESTS=False, + SAVE_FAILED_REQUEST=False, + ITEM_FILTER_ENABLE=False, + REQUEST_FILTER_ENABLE=False, + OSS_UPLOAD_TABLES=(), + DELETE_KEYS=True, + ) + + def __init__( + self, + task_id=None, + task=None, + save_to_db=False, + update_task=False, + *args, + **kwargs, + ): + """ + @param task_id: 任务 id + @param task: 任务,task 与 task_id 二者选一即可。如 task = {"url":""} + @param save_to_db: 数据是否入库,默认否 + @param update_task: 是否更新任务,默认否 + """ + warnings.warn( + "您正处于debug模式下,该模式下不会更新任务状态及数据入库,仅用于调试。" + "正式发布前请更改为正常模式", + category=Warning, + ) + + if not task and not task_id: + raise Exception("task_id 与 task 不能同时为空") + + kwargs["redis_key"] = kwargs["redis_key"] + "_debug" + if not save_to_db: + self.__class__.__debug_custom_setting__["ITEM_PIPELINES"] = [ + CONSOLE_PIPELINE_PATH + ] + self.__class__.__custom_setting__.update( + self.__class__.__debug_custom_setting__ + ) + + super(DebugFileSpider, self).__init__(*args, **kwargs) + + self._task_id = task_id + self._task = task + self._update_task = update_task + + def start_monitor_task(self): + if not self._parsers: + self._is_more_parsers = False + self._parsers.append(self) + elif len(self._parsers) <= 1: + self._is_more_parsers = False + + if self._task: + self.distribute_task([self._task]) + else: + tasks = self.get_todo_task_from_mysql() + if not tasks: + raise Exception( + f"未获取到任务 请检查 task_id: {self._task_id} 是否存在" + ) + self.distribute_task(tasks) + + log.debug("下发任务完毕") + + def get_todo_task_from_mysql(self): + task_keys = ", ".join([f"`{key}`" for key in self._task_keys]) + sql = "select %s from %s where id=%s" % ( + task_keys, + self._task_table, + self._task_id, + ) + tasks = self._mysqldb.find(sql) + return tasks + + def save_cached(self, request, response, table): + pass + + def update_task_state(self, task_id, state=1, *args, **kwargs): + if self._update_task: + kwargs["id"] = task_id + kwargs[self._task_state] = state + + sql = tools.make_update_sql( + self._task_table, + kwargs, + condition=f"id = {task_id}", + ) + + if self._mysqldb.update(sql): + log.debug(f"置任务{task_id}状态成功") + else: + log.error(f"置任务{task_id}状态失败 sql={sql}") + + def update_task_batch(self, task_id, state=1, *args, **kwargs): + if self._update_task: + kwargs["id"] = task_id + kwargs[self._task_state] = state + + update_item = UpdateItem(**kwargs) + update_item.table_name = self._task_table + update_item.name_underline = self._task_table + "_item" + + return update_item + + def run(self): + self.start_monitor_task() + + if not self._parsers: + self._parsers.append(self) + + self._start() + + while True: + try: + if self.all_thread_is_done(): + self._stop_all_thread() + break + except Exception as e: + log.exception(e) + + tools.delay_time(1) + + self.delete_tables([self._redis_key + "*"]) diff --git a/feapder/dedup/file_dedup.py b/feapder/dedup/file_dedup.py new file mode 100644 index 00000000..dd6fd6dc --- /dev/null +++ b/feapder/dedup/file_dedup.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +""" +文件去重缓存 + +与现有 Dedup(布隆过滤器等,只判断存在性)不同, +FileDedup 存储 URL -> result_url 的完整映射,用于直接复用下载结果。 +""" + +from feapder.db.mysqldb import MysqlDB +from feapder.db.redisdb import RedisDB +from feapder.utils.log import log + + +class FileDedup: + """文件去重缓存接口 + + 用于存储和检索文件下载结果的缓存。 + 子类需实现 get / set 方法。 + """ + + def get(self, url): + """获取 URL 对应的缓存结果 + + Args: + url: 文件原始 URL + + Returns: + str or None: 缓存的文件存储位置,无缓存返回 None + """ + return None + + def set(self, url, result_url): + """缓存 URL 的处理结果 + + Args: + url: 文件原始 URL + result_url: 文件最终存储位置(本地路径或云存储 URL) + """ + pass + + def close(self): + pass + + +class RedisFileDedup(FileDedup): + """基于 Redis Hash 的文件去重 + + 适合分布式场景,多进程共享。 + """ + + def __init__(self, table, expire_time=None): + """ + Args: + table: Redis Hash 的 key + expire_time: 过期时间(秒),None 表示不过期 + """ + self._redisdb = RedisDB() + self._table = table + self._expire_time = expire_time + + def get(self, url): + result = self._redisdb.hget(self._table, url) + if result is None: + return None + if isinstance(result, bytes): + result = result.decode() + return result or None + + def set(self, url, result_url): + self._redisdb.hset(self._table, url, result_url) + + +class MysqlFileDedup(FileDedup): + """基于 MySQL 表的文件去重 + + 持久化可靠,适合长期缓存。 + 首次使用时会自动建表。 + """ + + _table_ensured = set() + + def __init__(self, table="file_dedup", mysqldb=None): + """ + Args: + table: MySQL 表名 + mysqldb: MysqlDB 实例,默认使用全局配置 + """ + self._mysqldb = mysqldb or MysqlDB() + self._table = table + self._ensure_table() + + def _ensure_table(self): + if self._table in self.__class__._table_ensured: + return + sql = ( + f"CREATE TABLE IF NOT EXISTS `{self._table}` (" + f" `id` int(11) NOT NULL AUTO_INCREMENT," + f" `url` varchar(2048) NOT NULL COMMENT '文件原始URL'," + f" `result_url` text COMMENT '文件存储位置'," + f" `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP," + f" PRIMARY KEY (`id`)," + f" UNIQUE KEY `uk_url` (`url`) USING BTREE" + f") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4" + ) + self._mysqldb.execute(sql) + self.__class__._table_ensured.add(self._table) + + def get(self, url): + sql = f"SELECT result_url FROM `{self._table}` WHERE `url` = %s LIMIT 1" + result = self._mysqldb.find(sql, (url,)) + return result[0][0] if result else None + + def set(self, url, result_url): + sql = ( + f"INSERT INTO `{self._table}` (`url`, `result_url`) VALUES (%s, %s) " + f"ON DUPLICATE KEY UPDATE `result_url` = VALUES(`result_url`)" + ) + self._mysqldb.execute(sql, (url, result_url)) diff --git a/feapder/setting.py b/feapder/setting.py index c52b318c..91d6e4ca 100644 --- a/feapder/setting.py +++ b/feapder/setting.py @@ -11,6 +11,12 @@ TAB_FAILED_ITEMS = "{redis_key}:s_failed_items" # 爬虫状态表模版 TAB_SPIDER_STATUS = "{redis_key}:h_spider_status" +# 文件爬虫 - 进度追踪 +TAB_FILE_PROGRESS = "{redis_key}:h_file_progress:{task_id}" +# 文件爬虫 - 文件结果 +TAB_FILE_RESULT = "{redis_key}:h_file_result:{task_id}" +# 文件爬虫 - 去重缓存 +TAB_FILE_DEDUP = "{redis_key}:h_file_dedup" # 用户池 TAB_USER_POOL = "{redis_key}:h_{user_type}_pool" diff --git a/tests/file-spider/table.sql b/tests/file-spider/table.sql new file mode 100644 index 00000000..772d4e39 --- /dev/null +++ b/tests/file-spider/table.sql @@ -0,0 +1,20 @@ +-- FileSpider 任务表 +CREATE TABLE IF NOT EXISTS `file_task` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `file_urls` text COMMENT '待下载文件URL列表,JSON数组格式', + `state` int(11) DEFAULT 0 COMMENT '任务状态: 0待做 2下载中 1完成 -1失败', + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; + +-- 结果表(场景三使用) +CREATE TABLE IF NOT EXISTS `file_result` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `task_id` int(11) DEFAULT NULL COMMENT '任务ID', + `result_urls` text COMMENT '文件存储位置列表,JSON数组,与file_urls位置对应', + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; + +-- 示例数据 +INSERT INTO `file_task` (`file_urls`, `state`) VALUES +('["https://httpbin.org/image/png", "https://httpbin.org/image/jpeg"]', 0), +('["https://httpbin.org/image/svg", "https://httpbin.org/image/webp", "https://httpbin.org/image/png"]', 0); diff --git a/tests/file-spider/test_dedup_file_spider.py b/tests/file-spider/test_dedup_file_spider.py new file mode 100644 index 00000000..f5128b9b --- /dev/null +++ b/tests/file-spider/test_dedup_file_spider.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +""" +场景四:启用文件去重 + +通过 file_dedup 参数启用去重,同一 URL 跨任务不重复下载。 + +去重行为: +- start_requests 中遍历 URL 列表时,先查去重缓存 +- 缓存命中:直接复用已有结果,不生成 Request,不重复下载 +- 缓存未命中:正常下载,成功后自动写入去重缓存 +- 跨任务共享:不同任务中出现的相同 URL 只下载一次 +""" + +import json + +import feapder +from feapder.utils.log import log + + +class DedupFileSpider(feapder.FileSpider): + __custom_setting__ = dict( + REDISDB_IP_PORTS="localhost:6379", + REDISDB_USER_PASS="", + REDISDB_DB=0, + MYSQL_IP="localhost", + MYSQL_PORT=3306, + MYSQL_DB="feapder", + MYSQL_USER_NAME="feapder", + MYSQL_USER_PASS="feapder123", + ) + + def get_download_urls(self, task): + return json.loads(task.file_urls) + + def on_file_downloaded(self, task_id, url, file_path): + log.info(f"任务{task_id} 文件就绪 path={file_path}") + + def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + log.info(f"任务{task_id} 完成 成功={success_count} 失败={fail_count}") + yield self.update_task_batch(task_id, 1 if fail_count == 0 else -1) + + +if __name__ == "__main__": + spider = DedupFileSpider( + redis_key="dedup_file_spider", + task_table="file_task", + task_keys=["id", "file_urls"], + save_dir="./downloads", + file_dedup="redis", # "redis" / "mysql" / FileDedup 实例 + ) + spider.start_monitor_task() diff --git a/tests/file-spider/test_local_file_spider.py b/tests/file-spider/test_local_file_spider.py new file mode 100644 index 00000000..5875ca1e --- /dev/null +++ b/tests/file-spider/test_local_file_spider.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" +场景一:保存到本地磁盘 + +最简单的用法,下载文件保存到本地。 +任务表结构见 table.sql +""" + +import json + +import feapder +from feapder.utils.log import log + + +class LocalFileSpider(feapder.FileSpider): + __custom_setting__ = dict( + REDISDB_IP_PORTS="localhost:6379", + REDISDB_USER_PASS="", + REDISDB_DB=0, + MYSQL_IP="localhost", + MYSQL_PORT=3306, + MYSQL_DB="feapder", + MYSQL_USER_NAME="feapder", + MYSQL_USER_PASS="feapder123", + ) + + def get_download_urls(self, task): + return json.loads(task.file_urls) + + def on_file_downloaded(self, task_id, url, file_path): + log.info(f"任务{task_id} 文件保存成功 path={file_path}") + + def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + if fail_count == 0: + yield self.update_task_batch(task_id, 1) + else: + yield self.update_task_batch(task_id, -1) + + +if __name__ == "__main__": + spider = LocalFileSpider( + redis_key="local_file_spider", + task_table="file_task", + task_keys=["id", "file_urls"], + save_dir="./downloads", + ) + spider.start_monitor_task() diff --git a/tests/file-spider/test_oss_file_spider.py b/tests/file-spider/test_oss_file_spider.py new file mode 100644 index 00000000..249a9f22 --- /dev/null +++ b/tests/file-spider/test_oss_file_spider.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +""" +场景二:上传云存储(不落盘) + +重写 process_file 实现直接上传云存储,文件不保存到本地磁盘。 +""" + +import json +import os +from urllib.parse import urlparse, unquote + +import feapder +from feapder.utils.log import log + + +class OssFileSpider(feapder.FileSpider): + __custom_setting__ = dict( + REDISDB_IP_PORTS="localhost:6379", + REDISDB_USER_PASS="", + REDISDB_DB=0, + MYSQL_IP="localhost", + MYSQL_PORT=3306, + MYSQL_DB="feapder", + MYSQL_USER_NAME="feapder", + MYSQL_USER_PASS="feapder123", + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # 初始化云存储客户端 + # self.oss_client = OSSClient(bucket="my-bucket") + + def get_download_urls(self, task): + return json.loads(task.file_urls) + + def get_file_path(self, task, url): + """返回 OSS 存储 key(不是本地路径)""" + filename = os.path.basename(unquote(urlparse(url).path)) + return f"images/{task.id}/{filename}" + + def process_file(self, task_id, url, file_path, response): + """上传到 OSS,返回云存储 URL""" + # self.oss_client.put_object(file_path, response.content) + cloud_url = f"https://my-bucket.oss.aliyuncs.com/{file_path}" + log.info(f"任务{task_id} 上传成功 url={cloud_url}") + return cloud_url + + def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + log.info(f"任务{task_id} 完成 成功={success_count} 失败={fail_count}") + if success_count > 0: + yield self.update_task_batch(task_id, 1) + else: + yield self.update_task_batch(task_id, -1) + + +if __name__ == "__main__": + spider = OssFileSpider( + redis_key="oss_file_spider", + task_table="file_task", + task_keys=["id", "file_urls"], + ) + spider.start_monitor_task() diff --git a/tests/file-spider/test_oss_result_spider.py b/tests/file-spider/test_oss_result_spider.py new file mode 100644 index 00000000..3a906152 --- /dev/null +++ b/tests/file-spider/test_oss_result_spider.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +""" +场景三:上传云存储 + 结果入库 + +下载文件上传到云存储后,将有序的云存储 URL 列表组装成 Item 写入结果表。 + +使用前先创建结果 Item: + feapder create -i file_result + +然后编辑 items/file_result_item.py 添加 task_id、result_urls 字段。 +""" + +import json +import os +from urllib.parse import urlparse, unquote + +import feapder +from feapder.network.item import Item +from feapder.utils.log import log + + +class FileResultItem(Item): + """ + 结果表 Item(实际项目中应通过 feapder create -i 生成) + 对应的 MySQL 表: + CREATE TABLE `file_result` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `task_id` int(11) DEFAULT NULL, + `result_urls` text COMMENT '云存储URL列表,JSON数组', + PRIMARY KEY (`id`) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.table_name = "file_result" + self.task_id = None + self.result_urls = None + + +class OssResultSpider(feapder.FileSpider): + __custom_setting__ = dict( + REDISDB_IP_PORTS="localhost:6379", + REDISDB_USER_PASS="", + REDISDB_DB=0, + MYSQL_IP="localhost", + MYSQL_PORT=3306, + MYSQL_DB="feapder", + MYSQL_USER_NAME="feapder", + MYSQL_USER_PASS="feapder123", + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # self.oss_client = OSSClient(bucket="my-bucket") + + def get_download_urls(self, task): + return json.loads(task.file_urls) + + def get_file_path(self, task, url): + filename = os.path.basename(unquote(urlparse(url).path)) + return f"images/{task.id}/{filename}" + + def process_file(self, task_id, url, file_path, response): + # self.oss_client.put_object(file_path, response.content) + return f"https://my-bucket.oss.aliyuncs.com/{file_path}" + + def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + # results 与 get_download_urls 返回的列表严格位置对应 + # 例: ["https://oss.com/a.jpg", "https://oss.com/b.jpg", None, "https://oss.com/d.jpg"] + log.info(f"任务{task_id} 完成 成功={success_count} 失败={fail_count}") + + # 组装结果 Item 写入结果表 + item = FileResultItem() + item.task_id = task_id + item.result_urls = json.dumps(results) + yield item + + # 更新任务状态 + if fail_count == 0: + yield self.update_task_batch(task_id, 1) + else: + yield self.update_task_batch(task_id, -1) + + +if __name__ == "__main__": + spider = OssResultSpider( + redis_key="oss_result_spider", + task_table="file_task", + task_keys=["id", "file_urls"], + ) + spider.start_monitor_task() From 5c0e78aa4d99df15036a03dfaa7e2cc7eb866806 Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Tue, 7 Apr 2026 18:45:28 +0800 Subject: [PATCH 05/14] =?UTF-8?q?=E6=96=87=E4=BB=B6=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?=E7=88=AC=E8=99=AB=E7=AC=AC=E4=BA=8C=E6=AC=A1=E6=8F=90=E4=BA=A4?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E5=A4=8Dbug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/usage/FileSpider.md | 4 +- feapder/core/base_parser.py | 17 ++-- feapder/core/spiders/file_spider.py | 117 +++++++++++++++++++++------- feapder/dedup/file_dedup.py | 4 + 4 files changed, 106 insertions(+), 36 deletions(-) diff --git a/docs/usage/FileSpider.md b/docs/usage/FileSpider.md index 733d262f..e7539974 100644 --- a/docs/usage/FileSpider.md +++ b/docs/usage/FileSpider.md @@ -43,8 +43,8 @@ CREATE TABLE `file_task` ( | 方法 | 说明 | 默认行为 | |------|------|----------| -| `get_file_path(task, url)` | 返回文件保存路径/存储标识 | `{save_dir}/{task_id}/{filename}` | -| `process_file(task_id, url, file_path, response)` | 处理文件内容,返回最终存储位置 | 保存到本地磁盘,返回本地路径 | +| `get_file_path(task, url, index)` | 返回文件保存路径/存储标识 | `{save_dir}/{task_id}/{index}_{filename}` | +| `process_file(task_id, url, file_path, response)` | 处理文件内容,返回最终存储位置 | 流式保存到本地磁盘,返回本地路径 | | `on_file_downloaded(task_id, url, file_path)` | 单个文件下载成功回调 | 无 | | `on_file_failed(task_id, url, error)` | 单个文件下载失败回调 | 无 | diff --git a/feapder/core/base_parser.py b/feapder/core/base_parser.py index 1a379bad..bd5b0778 100644 --- a/feapder/core/base_parser.py +++ b/feapder/core/base_parser.py @@ -8,6 +8,7 @@ @email: boris_liu@foxmail.com """ import os +from urllib.parse import urlparse, unquote import feapder.utils.tools as tools from feapder.db.mysqldb import MysqlDB @@ -211,25 +212,25 @@ def get_download_urls(self, task): """ raise NotImplementedError("必须实现 get_download_urls 方法") - def get_file_path(self, task, url): + def get_file_path(self, task, url, index): """ 返回文件保存路径/标识,用户可重写 - 本地场景: 返回本地文件路径,如 ./downloads/123/image.jpg - 云存储场景: 返回存储标识/key,如 bucket/prefix/123/image.jpg + 本地场景: 返回本地文件路径,如 ./downloads/123/0_image.jpg + 云存储场景: 返回存储标识/key,如 bucket/prefix/123/0_image.jpg @param task: 任务信息 @param url: 文件 URL + @param index: 文件在 URL 列表中的索引,默认实现用于避免同名文件覆盖 @return: str - 文件路径或存储标识 """ - from urllib.parse import urlparse, unquote - parsed = urlparse(url) filename = os.path.basename(unquote(parsed.path)) or "unknown" + filename = f"{index}_{filename}" return os.path.join(self._save_dir, str(task.id), filename) def process_file(self, task_id, url, file_path, response): """ 处理下载的文件内容,返回文件最终存储位置。用户按需重写 - 默认实现: 保存到本地磁盘,返回本地路径 + 默认实现: 流式保存到本地磁盘,返回本地路径 云存储场景: 重写此方法上传到 OSS/S3 等,返回云存储 URL @param task_id: 任务 ID @param url: 文件原始 URL @@ -239,7 +240,9 @@ def process_file(self, task_id, url, file_path, response): """ os.makedirs(os.path.dirname(file_path), exist_ok=True) with open(file_path, "wb") as f: - f.write(response.content) + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) return file_path def on_file_downloaded(self, task_id, url, file_path): diff --git a/feapder/core/spiders/file_spider.py b/feapder/core/spiders/file_spider.py index 9c4bde06..670c4456 100644 --- a/feapder/core/spiders/file_spider.py +++ b/feapder/core/spiders/file_spider.py @@ -8,6 +8,7 @@ import os import warnings +from urllib.parse import urlparse, unquote import feapder.setting as setting import feapder.utils.tools as tools @@ -135,25 +136,25 @@ def get_download_urls(self, task): """ raise NotImplementedError("必须实现 get_download_urls 方法") - def get_file_path(self, task, url): + def get_file_path(self, task, url, index): """ 返回文件保存路径/标识,用户可重写 本地场景: 返回本地文件路径 云存储场景: 返回存储标识/key @param task: 任务信息 @param url: 文件 URL + @param index: 文件在 URL 列表中的索引,默认实现用于避免同名文件覆盖 @return: str """ - from urllib.parse import urlparse, unquote - parsed = urlparse(url) filename = os.path.basename(unquote(parsed.path)) or "unknown" + filename = f"{index}_{filename}" return os.path.join(self._save_dir, str(task.id), filename) def process_file(self, task_id, url, file_path, response): """ 处理下载的文件内容,返回文件最终存储位置。用户按需重写 - 默认实现: 保存到本地磁盘,返回本地路径 + 默认实现: 流式保存到本地磁盘,返回本地路径 云存储场景: 重写此方法上传到 OSS/S3 等,返回云存储 URL @param task_id: 任务 ID @param url: 文件原始 URL @@ -163,7 +164,9 @@ def process_file(self, task_id, url, file_path, response): """ os.makedirs(os.path.dirname(file_path), exist_ok=True) with open(file_path, "wb") as f: - f.write(response.content) + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) return file_path def on_file_downloaded(self, task_id, url, file_path): @@ -200,6 +203,36 @@ def on_task_all_done(self, task_id, success_count, fail_count, total_count, resu # ===================== 框架内部方法 ===================== + # Lua 脚本: 原子递增计数并判断是否首次达到完成条件 + # 返回值: 0=未完成或已触发过, 1=首次达到完成条件 + _LUA_INCR_AND_CHECK = """ +local key = KEYS[1] +local field = ARGV[1] +redis.call('hincrby', key, field, 1) +local total = tonumber(redis.call('hget', key, 'total') or 0) +local success = tonumber(redis.call('hget', key, 'success') or 0) +local fail = tonumber(redis.call('hget', key, 'fail') or 0) +if success + fail >= total and total > 0 then + local done = redis.call('hsetnx', key, 'done', 1) + if done == 1 then + return 1 + end +end +return 0 +""" + _lua_incr_and_check_sha = None + + def _incr_and_check_done(self, progress_key, field): + """原子递增计数并检查是否首次达到完成条件""" + redis_client = self._redisdb._redis + if self.__class__._lua_incr_and_check_sha is None: + self.__class__._lua_incr_and_check_sha = redis_client.script_load( + self._LUA_INCR_AND_CHECK + ) + return redis_client.evalsha( + self.__class__._lua_incr_and_check_sha, 1, progress_key, field + ) + def start_requests(self, task): """ 遍历 URL 列表生成下载请求。 @@ -208,6 +241,8 @@ def start_requests(self, task): urls = self.get_download_urls(task) if not urls: log.warning(f"任务{task.id}无下载URL") + for result in self.on_task_all_done(task.id, 0, 0, 0, []) or []: + yield result return total = len(urls) @@ -224,7 +259,15 @@ def start_requests(self, task): self._redisdb.hset(progress_key, "fail", 0) cached_count = 0 + skipped_count = 0 for index, url in enumerate(urls): + if not url or not isinstance(url, str) or not url.strip(): + self._redisdb.hset(result_key, str(index), "") + self._redisdb.hincrby(progress_key, "fail", 1) + skipped_count += 1 + log.warning(f"任务{task_id} 跳过无效URL index={index}") + continue + # 去重缓存检查 if self._file_dedup: cached_result = self._file_dedup.get(url) @@ -233,10 +276,21 @@ def start_requests(self, task): self._redisdb.hincrby(progress_key, "success", 1) cached_count += 1 log.debug(f"任务{task_id} 文件去重命中 url={url}") - self.on_file_downloaded(task_id, url, cached_result) + try: + self.on_file_downloaded(task_id, url, cached_result) + except Exception as e: + log.error(f"任务{task_id} on_file_downloaded回调异常 url={url} error={e}") continue - file_path = self.get_file_path(task, url) + try: + file_path = self.get_file_path(task, url, index) + except Exception as e: + self._redisdb.hset(result_key, str(index), "") + self._redisdb.hincrby(progress_key, "fail", 1) + skipped_count += 1 + log.error(f"任务{task_id} get_file_path异常 url={url} error={e}") + continue + yield Request( url, task_id=task_id, @@ -248,11 +302,11 @@ def start_requests(self, task): if cached_count > 0: log.info(f"任务{task_id} 去重命中{cached_count}/{total}个文件") - # 全部命中缓存,直接触发 on_task_all_done - if cached_count >= total: + # 全部命中缓存或跳过,直接触发 on_task_all_done + if cached_count + skipped_count >= total: results = self._assemble_results(task_id, total) for result in self.on_task_all_done( - task_id, cached_count, 0, total, results + task_id, cached_count, skipped_count, total, results ) or []: yield result self._cleanup_task_redis(task_id) @@ -278,19 +332,24 @@ def save_file(self, request, response): ) self._redisdb.hset(result_key, str(file_index), result_url or "") - # 更新进度 + # 原子递增成功计数并检查是否首次完成 progress_key = setting.TAB_FILE_PROGRESS.format( redis_key=self._redis_key, task_id=task_id ) - success = self._redisdb.hincrby(progress_key, "success", 1) + is_first_done = self._incr_and_check_done(progress_key, "success") + total = int(self._redisdb.hget(progress_key, "total") or 0) + success = int(self._redisdb.hget(progress_key, "success") or 0) fail = int(self._redisdb.hget(progress_key, "fail") or 0) log.info(f"任务{task_id} 文件下载成功 [{success + fail}/{total}] url={url}") - self.on_file_downloaded(task_id, url, result_url) - # 检查任务是否全部完成 - if success + fail >= total: + try: + self.on_file_downloaded(task_id, url, result_url) + except Exception as e: + log.error(f"任务{task_id} on_file_downloaded回调异常 url={url} error={e}") + + if is_first_done: results = self._assemble_results(task_id, total) for result in self.on_task_all_done( task_id, success, fail, total, results @@ -315,19 +374,24 @@ def failed_request(self, request, response, e): ) self._redisdb.hset(result_key, str(file_index), "") - # 更新进度 + # 原子递增失败计数并检查是否首次完成 progress_key = setting.TAB_FILE_PROGRESS.format( redis_key=self._redis_key, task_id=task_id ) - fail = self._redisdb.hincrby(progress_key, "fail", 1) + is_first_done = self._incr_and_check_done(progress_key, "fail") + total = int(self._redisdb.hget(progress_key, "total") or 0) success = int(self._redisdb.hget(progress_key, "success") or 0) + fail = int(self._redisdb.hget(progress_key, "fail") or 0) log.error(f"任务{task_id} 文件下载失败 [{success + fail}/{total}] url={request.url}") - self.on_file_failed(task_id, request.url, e) - # 检查任务是否全部完成 - if success + fail >= total: + try: + self.on_file_failed(task_id, request.url, e) + except Exception as e_cb: + log.error(f"任务{task_id} on_file_failed回调异常 url={request.url} error={e_cb}") + + if is_first_done: results = self._assemble_results(task_id, total) for result in self.on_task_all_done( task_id, success, fail, total, results @@ -339,21 +403,20 @@ def failed_request(self, request, response, e): def _assemble_results(self, task_id, total): """ - 从 Redis 结果 Hash 中按 0~total-1 顺序读取所有文件处理结果, - 组装为有序列表返回。 + 从 Redis 结果 Hash 中一次性拉取所有文件处理结果, + 按 0~total-1 顺序组装为有序列表返回。 """ result_key = setting.TAB_FILE_RESULT.format( redis_key=self._redis_key, task_id=task_id ) + all_data = self._redisdb.hgetall(result_key) results = [] for i in range(total): - value = self._redisdb.hget(result_key, str(i)) - if value is None or value == "" or value == b"": + value = all_data.get(str(i)) or all_data.get(str(i).encode()) + if value is None or value == b"" or value == "": results.append(None) else: - if isinstance(value, bytes): - value = value.decode() - results.append(value) + results.append(value.decode() if isinstance(value, bytes) else value) return results def _cleanup_task_redis(self, task_id): diff --git a/feapder/dedup/file_dedup.py b/feapder/dedup/file_dedup.py index dd6fd6dc..1848ea49 100644 --- a/feapder/dedup/file_dedup.py +++ b/feapder/dedup/file_dedup.py @@ -57,6 +57,7 @@ def __init__(self, table, expire_time=None): self._redisdb = RedisDB() self._table = table self._expire_time = expire_time + self._expire_set = False def get(self, url): result = self._redisdb.hget(self._table, url) @@ -68,6 +69,9 @@ def get(self, url): def set(self, url, result_url): self._redisdb.hset(self._table, url, result_url) + if self._expire_time and not self._expire_set: + self._redisdb._redis.expire(self._table, self._expire_time) + self._expire_set = True class MysqlFileDedup(FileDedup): From edaeee4c6e1351b84604fa0642ebef2da1275b70 Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Tue, 7 Apr 2026 19:10:50 +0800 Subject: [PATCH 06/14] =?UTF-8?q?=E6=96=87=E4=BB=B6=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?=E7=88=AC=E8=99=AB=E7=AC=AC=E4=B8=89=E6=AC=A1=E6=8F=90=E4=BA=A4?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E5=A4=8D=E7=BB=86=E8=8A=82bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/usage/FileSpider.md | 8 ++++---- feapder/core/parser_control.py | 14 ++++++++++++++ feapder/core/spiders/file_spider.py | 16 +++++++++++++--- feapder/dedup/file_dedup.py | 4 +--- tests/file-spider/test_oss_file_spider.py | 4 ++-- tests/file-spider/test_oss_result_spider.py | 4 ++-- 6 files changed, 36 insertions(+), 14 deletions(-) diff --git a/docs/usage/FileSpider.md b/docs/usage/FileSpider.md index e7539974..b4cae73a 100644 --- a/docs/usage/FileSpider.md +++ b/docs/usage/FileSpider.md @@ -147,10 +147,10 @@ class OssFileSpider(feapder.FileSpider): def get_download_urls(self, task): return json.loads(task.file_urls) - def get_file_path(self, task, url): + def get_file_path(self, task, url, index): """返回 OSS 存储 key(不是本地路径)""" filename = os.path.basename(unquote(urlparse(url).path)) - return f"images/{task.id}/{filename}" + return f"images/{task.id}/{index}_{filename}" def process_file(self, task_id, url, file_path, response): """上传 OSS,返回云存储 URL""" @@ -199,9 +199,9 @@ class OssResultSpider(feapder.FileSpider): def get_download_urls(self, task): return json.loads(task.file_urls) - def get_file_path(self, task, url): + def get_file_path(self, task, url, index): filename = os.path.basename(unquote(urlparse(url).path)) - return f"images/{task.id}/{filename}" + return f"images/{task.id}/{index}_{filename}" def process_file(self, task_id, url, file_path, response): # self.oss_client.put_object(file_path, response.content) diff --git a/feapder/core/parser_control.py b/feapder/core/parser_control.py index 021d2956..278f9b57 100644 --- a/feapder/core/parser_control.py +++ b/feapder/core/parser_control.py @@ -396,6 +396,13 @@ def deal_request(self, request): if response and getattr(response, "browser", None): request.render_downloader.put_back(response.browser) + # 释放连接(stream=True 时未消费完 body 会占用连接池) + if response and hasattr(response, "close"): + try: + response.close() + except Exception: + pass + break # 删除正在做的request 跟随item优先 @@ -732,6 +739,13 @@ def deal_request(self, request): if response and getattr(response, "browser", None): request.render_downloader.put_back(response.browser) + # 释放连接(stream=True 时未消费完 body 会占用连接池) + if response and hasattr(response, "close"): + try: + response.close() + except Exception: + pass + break if setting.SPIDER_SLEEP_TIME: diff --git a/feapder/core/spiders/file_spider.py b/feapder/core/spiders/file_spider.py index 670c4456..3109f5cf 100644 --- a/feapder/core/spiders/file_spider.py +++ b/feapder/core/spiders/file_spider.py @@ -10,6 +10,8 @@ import warnings from urllib.parse import urlparse, unquote +from redis.exceptions import NoScriptError + import feapder.setting as setting import feapder.utils.tools as tools from feapder.core.spiders.task_spider import TaskSpider @@ -229,9 +231,17 @@ def _incr_and_check_done(self, progress_key, field): self.__class__._lua_incr_and_check_sha = redis_client.script_load( self._LUA_INCR_AND_CHECK ) - return redis_client.evalsha( - self.__class__._lua_incr_and_check_sha, 1, progress_key, field - ) + try: + return redis_client.evalsha( + self.__class__._lua_incr_and_check_sha, 1, progress_key, field + ) + except NoScriptError: + self.__class__._lua_incr_and_check_sha = redis_client.script_load( + self._LUA_INCR_AND_CHECK + ) + return redis_client.evalsha( + self.__class__._lua_incr_and_check_sha, 1, progress_key, field + ) def start_requests(self, task): """ diff --git a/feapder/dedup/file_dedup.py b/feapder/dedup/file_dedup.py index 1848ea49..21c7b2a6 100644 --- a/feapder/dedup/file_dedup.py +++ b/feapder/dedup/file_dedup.py @@ -57,7 +57,6 @@ def __init__(self, table, expire_time=None): self._redisdb = RedisDB() self._table = table self._expire_time = expire_time - self._expire_set = False def get(self, url): result = self._redisdb.hget(self._table, url) @@ -69,9 +68,8 @@ def get(self, url): def set(self, url, result_url): self._redisdb.hset(self._table, url, result_url) - if self._expire_time and not self._expire_set: + if self._expire_time: self._redisdb._redis.expire(self._table, self._expire_time) - self._expire_set = True class MysqlFileDedup(FileDedup): diff --git a/tests/file-spider/test_oss_file_spider.py b/tests/file-spider/test_oss_file_spider.py index 249a9f22..9115a8e3 100644 --- a/tests/file-spider/test_oss_file_spider.py +++ b/tests/file-spider/test_oss_file_spider.py @@ -33,10 +33,10 @@ def __init__(self, *args, **kwargs): def get_download_urls(self, task): return json.loads(task.file_urls) - def get_file_path(self, task, url): + def get_file_path(self, task, url, index): """返回 OSS 存储 key(不是本地路径)""" filename = os.path.basename(unquote(urlparse(url).path)) - return f"images/{task.id}/{filename}" + return f"images/{task.id}/{index}_{filename}" def process_file(self, task_id, url, file_path, response): """上传到 OSS,返回云存储 URL""" diff --git a/tests/file-spider/test_oss_result_spider.py b/tests/file-spider/test_oss_result_spider.py index 3a906152..ff8aa339 100644 --- a/tests/file-spider/test_oss_result_spider.py +++ b/tests/file-spider/test_oss_result_spider.py @@ -57,9 +57,9 @@ def __init__(self, *args, **kwargs): def get_download_urls(self, task): return json.loads(task.file_urls) - def get_file_path(self, task, url): + def get_file_path(self, task, url, index): filename = os.path.basename(unquote(urlparse(url).path)) - return f"images/{task.id}/{filename}" + return f"images/{task.id}/{index}_{filename}" def process_file(self, task_id, url, file_path, response): # self.oss_client.put_object(file_path, response.content) From 744f17247f2b8c1f1069550bfe50b98dde97ff17 Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Tue, 7 Apr 2026 19:29:48 +0800 Subject: [PATCH 07/14] =?UTF-8?q?=E6=96=87=E4=BB=B6=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?=E7=88=AC=E8=99=AB=E7=AC=AC=E5=9B=9B=E6=AC=A1=E6=8F=90=E4=BA=A4?= =?UTF-8?q?=EF=BC=8C=E6=96=87=E6=A1=A3=E7=BB=86=E8=8A=82=E8=B0=83=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/usage/FileSpider.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/usage/FileSpider.md b/docs/usage/FileSpider.md index b4cae73a..c4aa79e8 100644 --- a/docs/usage/FileSpider.md +++ b/docs/usage/FileSpider.md @@ -7,7 +7,7 @@ FileSpider 是一款分布式文件下载爬虫,专用于批量下载文件/ - **进度追踪**: 框架自动追踪每个任务的下载进度(成功数/失败数/总数) - **结果有序**: 下载结果列表与原始 URL 列表严格位置对应 - **灵活存储**: 默认保存到本地磁盘,可重写为上传云存储(OSS/S3 等),不落盘 -- **文件去重**: 可选功能,同一 URL 不重复下载,支持 Redis / MySQL 两种策略 +- **文件去重**: 可选功能,同一 URL 不重复下载,支持 Redis / MySQL / 自定义 三种策略 - **用户控制**: 任务成功/失败由用户在回调中显式决定 FileSpider 继承自 TaskSpider,复用了全部任务管理能力(MySQL 任务表、Redis 队列、断点续爬、丢失任务回收、分布式支持等)。 @@ -127,7 +127,7 @@ if __name__ == "__main__": spider.start_monitor_task() ``` -### 场景二:上传云存储(不落盘) +### 场景二:上传云存储 重写 `process_file` 实现直接上传云存储: @@ -142,7 +142,7 @@ class OssFileSpider(feapder.FileSpider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # 初始化云存储客户端 - # self.oss_client = OSSClient(bucket="my-bucket") + self.oss_client = OSSClient(bucket="my-bucket") def get_download_urls(self, task): return json.loads(task.file_urls) @@ -154,7 +154,7 @@ class OssFileSpider(feapder.FileSpider): def process_file(self, task_id, url, file_path, response): """上传 OSS,返回云存储 URL""" - # self.oss_client.put_object(file_path, response.content) + self.oss_client.put_object(file_path, response.content) return f"https://my-bucket.oss.aliyuncs.com/{file_path}" def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): @@ -194,7 +194,7 @@ from items.file_result_item import FileResultItem class OssResultSpider(feapder.FileSpider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # self.oss_client = OSSClient(bucket="my-bucket") + self.oss_client = OSSClient(bucket="my-bucket") def get_download_urls(self, task): return json.loads(task.file_urls) @@ -204,11 +204,11 @@ class OssResultSpider(feapder.FileSpider): return f"images/{task.id}/{index}_{filename}" def process_file(self, task_id, url, file_path, response): - # self.oss_client.put_object(file_path, response.content) + self.oss_client.put_object(file_path, response.content) return f"https://my-bucket.oss.aliyuncs.com/{file_path}" def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): - # results 与 get_download_urls 返回的列表严格位置对应 + # results 与 get_download_urls 返回的列表严格位置对应,下载失败的用null占位。如需去空,手动lamda表达式过滤即可。 item = FileResultItem() item.task_id = task_id item.result_urls = json.dumps(results) From 0bfd88daee8ce668954cc4a0b1d9a60284719245 Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Wed, 8 Apr 2026 10:42:38 +0800 Subject: [PATCH 08/14] =?UTF-8?q?=E6=96=87=E4=BB=B6=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?=E7=88=AC=E8=99=AB=E6=94=AF=E6=8C=81=E4=BB=BB=E5=8A=A1=E5=AD=97?= =?UTF-8?q?=E6=AE=B5=E9=80=8F=E4=BC=A0=E7=BB=84=E8=A3=85=E6=9C=80=E7=BB=88?= =?UTF-8?q?item=EF=BC=9Bon=5Ftask=5Fall=5Fdone=E6=96=B9=E6=B3=95=E7=AD=BE?= =?UTF-8?q?=E5=90=8D=E4=BF=AE=E6=94=B9=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/usage/FileSpider.md | 39 +++++++++---------- feapder/core/base_parser.py | 10 ++--- feapder/core/spiders/file_spider.py | 42 +++++++++++---------- tests/file-spider/test_dedup_file_spider.py | 6 +-- tests/file-spider/test_local_file_spider.py | 6 +-- tests/file-spider/test_oss_file_spider.py | 10 ++--- tests/file-spider/test_oss_result_spider.py | 16 ++++---- 7 files changed, 67 insertions(+), 62 deletions(-) diff --git a/docs/usage/FileSpider.md b/docs/usage/FileSpider.md index c4aa79e8..d008dc4a 100644 --- a/docs/usage/FileSpider.md +++ b/docs/usage/FileSpider.md @@ -37,7 +37,7 @@ CREATE TABLE `file_task` ( | 方法 | 说明 | |------|------| | `get_download_urls(task)` | 从 task 中提取文件 URL 列表,返回 `List[str]` | -| `on_task_all_done(task_id, success_count, fail_count, total_count, results)` | 任务所有文件处理完毕的回调,在此 yield Item 或 update_task_batch 更新状态 | +| `on_task_all_done(task, result, success_count, fail_count, total_count)` | 任务所有文件处理完毕的回调,在此 yield Item 或 update_task_batch 更新状态 | ### 可选重写 @@ -66,9 +66,10 @@ save_file (框架层,不应重写) ### `on_task_all_done` 参数说明 ```python -def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): +def on_task_all_done(self, task, result, success_count, fail_count, total_count): """ - results: List[str|None] + task: PerfectDict - 任务对象,包含 task_keys 指定的字段,可通过 task.id 获取任务 ID + result: List[str|None] - 与 get_download_urls 返回的列表严格位置对应 - 成功: 文件存储位置(本地路径或云存储 URL) - 失败: None @@ -110,11 +111,11 @@ class LocalFileSpider(feapder.FileSpider): def get_download_urls(self, task): return json.loads(task.file_urls) - def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + def on_task_all_done(self, task, result, success_count, fail_count, total_count): if fail_count == 0: - yield self.update_task_batch(task_id, 1) + yield self.update_task_batch(task.id, 1) else: - yield self.update_task_batch(task_id, -1) + yield self.update_task_batch(task.id, -1) if __name__ == "__main__": @@ -150,18 +151,18 @@ class OssFileSpider(feapder.FileSpider): def get_file_path(self, task, url, index): """返回 OSS 存储 key(不是本地路径)""" filename = os.path.basename(unquote(urlparse(url).path)) - return f"images/{task.id}/{index}_{filename}" + return f"files/{task.id}/{index}_{filename}" def process_file(self, task_id, url, file_path, response): """上传 OSS,返回云存储 URL""" self.oss_client.put_object(file_path, response.content) return f"https://my-bucket.oss.aliyuncs.com/{file_path}" - def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + def on_task_all_done(self, task, result, success_count, fail_count, total_count): if success_count > 0: - yield self.update_task_batch(task_id, 1) + yield self.update_task_batch(task.id, 1) else: - yield self.update_task_batch(task_id, -1) + yield self.update_task_batch(task.id, -1) if __name__ == "__main__": @@ -201,23 +202,23 @@ class OssResultSpider(feapder.FileSpider): def get_file_path(self, task, url, index): filename = os.path.basename(unquote(urlparse(url).path)) - return f"images/{task.id}/{index}_{filename}" + return f"files/{task.id}/{index}_{filename}" def process_file(self, task_id, url, file_path, response): self.oss_client.put_object(file_path, response.content) return f"https://my-bucket.oss.aliyuncs.com/{file_path}" - def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): - # results 与 get_download_urls 返回的列表严格位置对应,下载失败的用null占位。如需去空,手动lamda表达式过滤即可。 + def on_task_all_done(self, task, result, success_count, fail_count, total_count): + # result 与 get_download_urls 返回的列表严格位置对应,下载失败的用 None 占位 item = FileResultItem() - item.task_id = task_id - item.result_urls = json.dumps(results) + item.task_id = task.id + item.result_urls = result yield item if fail_count == 0: - yield self.update_task_batch(task_id, 1) + yield self.update_task_batch(task.id, 1) else: - yield self.update_task_batch(task_id, -1) + yield self.update_task_batch(task.id, -1) ``` ### 场景四:启用文件去重 @@ -233,8 +234,8 @@ class DedupFileSpider(feapder.FileSpider): def get_download_urls(self, task): return json.loads(task.file_urls) - def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): - yield self.update_task_batch(task_id, 1 if fail_count == 0 else -1) + def on_task_all_done(self, task, result, success_count, fail_count, total_count): + yield self.update_task_batch(task.id, 1 if fail_count == 0 else -1) if __name__ == "__main__": diff --git a/feapder/core/base_parser.py b/feapder/core/base_parser.py index bd5b0778..e3a4dcd4 100644 --- a/feapder/core/base_parser.py +++ b/feapder/core/base_parser.py @@ -263,17 +263,17 @@ def on_file_failed(self, task_id, url, error): """ pass - def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + def on_task_all_done(self, task, result, success_count, fail_count, total_count): """ 任务所有文件处理完毕的回调 用户应在此方法中 yield Item 写入结果表、yield self.update_task_batch() 更新任务状态 - @param task_id: 任务 ID + @param task: PerfectDict - 任务对象,包含 task_keys 指定的字段 + @param result: List[str|None] - 每个文件的处理结果, + 顺序与 get_download_urls 返回的列表一致。 + 成功为文件存储位置(本地路径或云存储 URL),失败为 None @param success_count: 成功数 @param fail_count: 失败数 @param total_count: 总数 - @param results: List[str|None] - 每个文件的处理结果, - 顺序与 get_download_urls 返回的列表一致。 - 成功为文件存储位置(本地路径或云存储 URL),失败为 None """ pass diff --git a/feapder/core/spiders/file_spider.py b/feapder/core/spiders/file_spider.py index 3109f5cf..dff8c9ce 100644 --- a/feapder/core/spiders/file_spider.py +++ b/feapder/core/spiders/file_spider.py @@ -19,6 +19,7 @@ from feapder.network.item import UpdateItem from feapder.network.request import Request from feapder.utils.log import log +from feapder.utils.perfect_dict import PerfectDict CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline" @@ -189,17 +190,17 @@ def on_file_failed(self, task_id, url, error): """ pass - def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + def on_task_all_done(self, task, result, success_count, fail_count, total_count): """ 任务所有文件处理完毕的回调 用户应在此方法中 yield Item 写入结果表、yield self.update_task_batch() 更新任务状态 - @param task_id: 任务 ID + @param task: PerfectDict - 任务对象,包含 task_keys 指定的字段 + @param result: List[str|None] - 每个文件的处理结果, + 顺序与 get_download_urls 返回的列表一致。 + 成功为文件存储位置,失败为 None @param success_count: 成功数 @param fail_count: 失败数 @param total_count: 总数 - @param results: List[str|None] - 每个文件的处理结果, - 顺序与 get_download_urls 返回的列表一致。 - 成功为文件存储位置,失败为 None """ pass @@ -251,8 +252,8 @@ def start_requests(self, task): urls = self.get_download_urls(task) if not urls: log.warning(f"任务{task.id}无下载URL") - for result in self.on_task_all_done(task.id, 0, 0, 0, []) or []: - yield result + for item in self.on_task_all_done(task, [], 0, 0, 0) or []: + yield item return total = len(urls) @@ -306,6 +307,7 @@ def start_requests(self, task): task_id=task_id, file_index=index, file_path=file_path, + task_data=dict(task), callback=self.save_file, ) @@ -314,11 +316,11 @@ def start_requests(self, task): # 全部命中缓存或跳过,直接触发 on_task_all_done if cached_count + skipped_count >= total: - results = self._assemble_results(task_id, total) - for result in self.on_task_all_done( - task_id, cached_count, skipped_count, total, results + result = self._assemble_results(task_id, total) + for item in self.on_task_all_done( + task, result, cached_count, skipped_count, total ) or []: - yield result + yield item self._cleanup_task_redis(task_id) def save_file(self, request, response): @@ -360,11 +362,12 @@ def save_file(self, request, response): log.error(f"任务{task_id} on_file_downloaded回调异常 url={url} error={e}") if is_first_done: - results = self._assemble_results(task_id, total) - for result in self.on_task_all_done( - task_id, success, fail, total, results + task = PerfectDict(_dict=request.task_data) + result = self._assemble_results(task_id, total) + for item in self.on_task_all_done( + task, result, success, fail, total ) or []: - yield result + yield item self._cleanup_task_redis(task_id) def failed_request(self, request, response, e): @@ -402,11 +405,12 @@ def failed_request(self, request, response, e): log.error(f"任务{task_id} on_file_failed回调异常 url={request.url} error={e_cb}") if is_first_done: - results = self._assemble_results(task_id, total) - for result in self.on_task_all_done( - task_id, success, fail, total, results + task = PerfectDict(_dict=request.task_data) + result = self._assemble_results(task_id, total) + for item in self.on_task_all_done( + task, result, success, fail, total ) or []: - yield result + yield item self._cleanup_task_redis(task_id) yield request diff --git a/tests/file-spider/test_dedup_file_spider.py b/tests/file-spider/test_dedup_file_spider.py index f5128b9b..e11ecdc9 100644 --- a/tests/file-spider/test_dedup_file_spider.py +++ b/tests/file-spider/test_dedup_file_spider.py @@ -35,9 +35,9 @@ def get_download_urls(self, task): def on_file_downloaded(self, task_id, url, file_path): log.info(f"任务{task_id} 文件就绪 path={file_path}") - def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): - log.info(f"任务{task_id} 完成 成功={success_count} 失败={fail_count}") - yield self.update_task_batch(task_id, 1 if fail_count == 0 else -1) + def on_task_all_done(self, task, result, success_count, fail_count, total_count): + log.info(f"任务{task.id} 完成 成功={success_count} 失败={fail_count}") + yield self.update_task_batch(task.id, 1 if fail_count == 0 else -1) if __name__ == "__main__": diff --git a/tests/file-spider/test_local_file_spider.py b/tests/file-spider/test_local_file_spider.py index 5875ca1e..f0a87454 100644 --- a/tests/file-spider/test_local_file_spider.py +++ b/tests/file-spider/test_local_file_spider.py @@ -30,11 +30,11 @@ def get_download_urls(self, task): def on_file_downloaded(self, task_id, url, file_path): log.info(f"任务{task_id} 文件保存成功 path={file_path}") - def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): + def on_task_all_done(self, task, result, success_count, fail_count, total_count): if fail_count == 0: - yield self.update_task_batch(task_id, 1) + yield self.update_task_batch(task.id, 1) else: - yield self.update_task_batch(task_id, -1) + yield self.update_task_batch(task.id, -1) if __name__ == "__main__": diff --git a/tests/file-spider/test_oss_file_spider.py b/tests/file-spider/test_oss_file_spider.py index 9115a8e3..58a2c00c 100644 --- a/tests/file-spider/test_oss_file_spider.py +++ b/tests/file-spider/test_oss_file_spider.py @@ -36,7 +36,7 @@ def get_download_urls(self, task): def get_file_path(self, task, url, index): """返回 OSS 存储 key(不是本地路径)""" filename = os.path.basename(unquote(urlparse(url).path)) - return f"images/{task.id}/{index}_{filename}" + return f"files/{task.id}/{index}_{filename}" def process_file(self, task_id, url, file_path, response): """上传到 OSS,返回云存储 URL""" @@ -45,12 +45,12 @@ def process_file(self, task_id, url, file_path, response): log.info(f"任务{task_id} 上传成功 url={cloud_url}") return cloud_url - def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): - log.info(f"任务{task_id} 完成 成功={success_count} 失败={fail_count}") + def on_task_all_done(self, task, result, success_count, fail_count, total_count): + log.info(f"任务{task.id} 完成 成功={success_count} 失败={fail_count}") if success_count > 0: - yield self.update_task_batch(task_id, 1) + yield self.update_task_batch(task.id, 1) else: - yield self.update_task_batch(task_id, -1) + yield self.update_task_batch(task.id, -1) if __name__ == "__main__": diff --git a/tests/file-spider/test_oss_result_spider.py b/tests/file-spider/test_oss_result_spider.py index ff8aa339..96143712 100644 --- a/tests/file-spider/test_oss_result_spider.py +++ b/tests/file-spider/test_oss_result_spider.py @@ -59,28 +59,28 @@ def get_download_urls(self, task): def get_file_path(self, task, url, index): filename = os.path.basename(unquote(urlparse(url).path)) - return f"images/{task.id}/{index}_{filename}" + return f"files/{task.id}/{index}_{filename}" def process_file(self, task_id, url, file_path, response): # self.oss_client.put_object(file_path, response.content) return f"https://my-bucket.oss.aliyuncs.com/{file_path}" - def on_task_all_done(self, task_id, success_count, fail_count, total_count, results): - # results 与 get_download_urls 返回的列表严格位置对应 + def on_task_all_done(self, task, result, success_count, fail_count, total_count): + # result 与 get_download_urls 返回的列表严格位置对应 # 例: ["https://oss.com/a.jpg", "https://oss.com/b.jpg", None, "https://oss.com/d.jpg"] - log.info(f"任务{task_id} 完成 成功={success_count} 失败={fail_count}") + log.info(f"任务{task.id} 完成 成功={success_count} 失败={fail_count}") # 组装结果 Item 写入结果表 item = FileResultItem() - item.task_id = task_id - item.result_urls = json.dumps(results) + item.task_id = task.id + item.result_urls = result yield item # 更新任务状态 if fail_count == 0: - yield self.update_task_batch(task_id, 1) + yield self.update_task_batch(task.id, 1) else: - yield self.update_task_batch(task_id, -1) + yield self.update_task_batch(task.id, -1) if __name__ == "__main__": From e25915c792d453281fe51b81a925ed9f42b5e516 Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Wed, 8 Apr 2026 14:50:47 +0800 Subject: [PATCH 09/14] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E2=80=9C=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E4=B8=8B=E8=BD=BD=E7=88=AC=E8=99=AB=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E4=BB=BB=E5=8A=A1=E5=AD=97=E6=AE=B5=E9=80=8F=E4=BC=A0=E7=BB=84?= =?UTF-8?q?=E8=A3=85=E6=9C=80=E7=BB=88item=E2=80=9D=E7=9A=84bug=E2=80=94?= =?UTF-8?q?=E2=80=94=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/usage/FileSpider.md | 2 +- feapder/core/spiders/file_spider.py | 159 ++++++++++++++++++---------- 2 files changed, 106 insertions(+), 55 deletions(-) diff --git a/docs/usage/FileSpider.md b/docs/usage/FileSpider.md index d008dc4a..d0a24bed 100644 --- a/docs/usage/FileSpider.md +++ b/docs/usage/FileSpider.md @@ -43,7 +43,7 @@ CREATE TABLE `file_task` ( | 方法 | 说明 | 默认行为 | |------|------|----------| -| `get_file_path(task, url, index)` | 返回文件保存路径/存储标识 | `{save_dir}/{task_id}/{index}_{filename}` | +| `get_file_path(task, url, index)` | 返回文件保存路径/存储标识 | `{save_dir}/{task_id}/{index}_{md5(filename)}{ext}` | | `process_file(task_id, url, file_path, response)` | 处理文件内容,返回最终存储位置 | 流式保存到本地磁盘,返回本地路径 | | `on_file_downloaded(task_id, url, file_path)` | 单个文件下载成功回调 | 无 | | `on_file_failed(task_id, url, error)` | 单个文件下载失败回调 | 无 | diff --git a/feapder/core/spiders/file_spider.py b/feapder/core/spiders/file_spider.py index dff8c9ce..506c10fb 100644 --- a/feapder/core/spiders/file_spider.py +++ b/feapder/core/spiders/file_spider.py @@ -6,6 +6,7 @@ --------- """ +import hashlib import os import warnings from urllib.parse import urlparse, unquote @@ -19,7 +20,6 @@ from feapder.network.item import UpdateItem from feapder.network.request import Request from feapder.utils.log import log -from feapder.utils.perfect_dict import PerfectDict CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline" @@ -123,12 +123,16 @@ def __init__( dedup_table = setting.TAB_FILE_DEDUP.format(redis_key=self._redis_key) self._file_dedup = RedisFileDedup(dedup_table, file_dedup_expire) elif file_dedup == "mysql": + if file_dedup_expire is not None: + log.warning("file_dedup_expire仅在file_dedup='redis'时生效") self._file_dedup = MysqlFileDedup() elif isinstance(file_dedup, FileDedup): self._file_dedup = file_dedup else: self._file_dedup = None + self._lua_incr_and_check_sha = None + # ===================== 用户需实现/可重写的方法 ===================== def get_download_urls(self, task): @@ -150,8 +154,10 @@ def get_file_path(self, task, url, index): @return: str """ parsed = urlparse(url) - filename = os.path.basename(unquote(parsed.path)) or "unknown" - filename = f"{index}_{filename}" + raw_name = os.path.basename(unquote(parsed.path)) or "unknown" + _, ext = os.path.splitext(raw_name) + name_hash = hashlib.md5(raw_name.encode()).hexdigest() + filename = f"{index}_{name_hash}{ext}" return os.path.join(self._save_dir, str(task.id), filename) def process_file(self, task_id, url, file_path, response): @@ -212,9 +218,9 @@ def on_task_all_done(self, task, result, success_count, fail_count, total_count) local key = KEYS[1] local field = ARGV[1] redis.call('hincrby', key, field, 1) -local total = tonumber(redis.call('hget', key, 'total') or 0) -local success = tonumber(redis.call('hget', key, 'success') or 0) -local fail = tonumber(redis.call('hget', key, 'fail') or 0) +local total = tonumber(redis.call('hget', key, 'total')) or 0 +local success = tonumber(redis.call('hget', key, 'success')) or 0 +local fail = tonumber(redis.call('hget', key, 'fail')) or 0 if success + fail >= total and total > 0 then local done = redis.call('hsetnx', key, 'done', 1) if done == 1 then @@ -223,33 +229,36 @@ def on_task_all_done(self, task, result, success_count, fail_count, total_count) end return 0 """ - _lua_incr_and_check_sha = None def _incr_and_check_done(self, progress_key, field): """原子递增计数并检查是否首次达到完成条件""" redis_client = self._redisdb._redis - if self.__class__._lua_incr_and_check_sha is None: - self.__class__._lua_incr_and_check_sha = redis_client.script_load( + if self._lua_incr_and_check_sha is None: + self._lua_incr_and_check_sha = redis_client.script_load( self._LUA_INCR_AND_CHECK ) try: return redis_client.evalsha( - self.__class__._lua_incr_and_check_sha, 1, progress_key, field + self._lua_incr_and_check_sha, 1, progress_key, field ) except NoScriptError: - self.__class__._lua_incr_and_check_sha = redis_client.script_load( + self._lua_incr_and_check_sha = redis_client.script_load( self._LUA_INCR_AND_CHECK ) return redis_client.evalsha( - self.__class__._lua_incr_and_check_sha, 1, progress_key, field + self._lua_incr_and_check_sha, 1, progress_key, field ) def start_requests(self, task): """ 遍历 URL 列表生成下载请求。 去重缓存命中的 URL 直接复用结果,不生成 Request。 + 先在本地收集所有缓存/跳过结果,通过 pipeline 一次性写入 Redis, + 再 yield Request,避免 worker 线程与初始化之间的竞态。 """ urls = self.get_download_urls(task) + if isinstance(urls, str): + raise TypeError(f"get_download_urls应返回列表, 实际返回了字符串: {urls[:100]}") if not urls: log.warning(f"任务{task.id}无下载URL") for item in self.on_task_all_done(task, [], 0, 0, 0) or []: @@ -265,26 +274,26 @@ def start_requests(self, task): redis_key=self._redis_key, task_id=task_id ) - self._redisdb.hset(progress_key, "total", total) - self._redisdb.hset(progress_key, "success", 0) - self._redisdb.hset(progress_key, "fail", 0) - cached_count = 0 skipped_count = 0 + result_mapping = {} + pending_requests = [] + for index, url in enumerate(urls): if not url or not isinstance(url, str) or not url.strip(): - self._redisdb.hset(result_key, str(index), "") - self._redisdb.hincrby(progress_key, "fail", 1) + result_mapping[str(index)] = "" skipped_count += 1 log.warning(f"任务{task_id} 跳过无效URL index={index}") continue - # 去重缓存检查 if self._file_dedup: - cached_result = self._file_dedup.get(url) + try: + cached_result = self._file_dedup.get(url) + except Exception as e: + log.error(f"任务{task_id} 去重缓存查询异常 url={url} error={e}") + cached_result = None if cached_result is not None: - self._redisdb.hset(result_key, str(index), cached_result) - self._redisdb.hincrby(progress_key, "success", 1) + result_mapping[str(index)] = cached_result cached_count += 1 log.debug(f"任务{task_id} 文件去重命中 url={url}") try: @@ -296,21 +305,35 @@ def start_requests(self, task): try: file_path = self.get_file_path(task, url, index) except Exception as e: - self._redisdb.hset(result_key, str(index), "") - self._redisdb.hincrby(progress_key, "fail", 1) + result_mapping[str(index)] = "" skipped_count += 1 log.error(f"任务{task_id} get_file_path异常 url={url} error={e}") continue - yield Request( - url, - task_id=task_id, - file_index=index, - file_path=file_path, - task_data=dict(task), - callback=self.save_file, + pending_requests.append( + Request( + url, + task_id=task_id, + file_index=index, + file_path=file_path, + task=task, + callback=self.save_file, + ) ) + # 清理旧 key 并通过 pipeline 原子写入初始状态 + pipe = self._redisdb._redis.pipeline() + pipe.delete(progress_key) + pipe.delete(result_key) + for field, value in {"total": total, "success": cached_count, "fail": skipped_count}.items(): + pipe.hset(progress_key, field, value) + pipe.expire(progress_key, 86400) + if result_mapping: + for field, value in result_mapping.items(): + pipe.hset(result_key, field, value) + pipe.expire(result_key, 86400) + pipe.execute() + if cached_count > 0: log.info(f"任务{task_id} 去重命中{cached_count}/{total}个文件") @@ -321,7 +344,12 @@ def start_requests(self, task): task, result, cached_count, skipped_count, total ) or []: yield item - self._cleanup_task_redis(task_id) + yield lambda: self._cleanup_task_redis(task_id) + return + + # Redis 状态就绪后再下发请求 + for request in pending_requests: + yield request def save_file(self, request, response): """ @@ -332,11 +360,30 @@ def save_file(self, request, response): url = request.url file_path = request.file_path - result_url = self.process_file(task_id, url, file_path, response) + try: + result_url = self.process_file(task_id, url, file_path, response) + except Exception as e: + log.error(f"任务{task_id} process_file异常 url={url} error={e}") + raise - # 写入去重缓存 + # 写入去重缓存(异常不影响主流程) if self._file_dedup and result_url: - self._file_dedup.set(url, result_url) + try: + self._file_dedup.set(url, result_url) + except Exception as e: + log.error(f"任务{task_id} 去重缓存写入异常 url={url} error={e}") + + # 晚到回调检查:若 progress_key 已被清理,跳过 Redis 写入避免重建脏 key + progress_key = setting.TAB_FILE_PROGRESS.format( + redis_key=self._redis_key, task_id=task_id + ) + if not self._redisdb._redis.exists(progress_key): + log.debug(f"任务{task_id} 进度key已清理, 跳过晚到回调的Redis写入") + try: + self.on_file_downloaded(task_id, url, result_url) + except Exception as e: + log.error(f"任务{task_id} on_file_downloaded回调异常 url={url} error={e}") + return # 记录结果 result_key = setting.TAB_FILE_RESULT.format( @@ -345,9 +392,6 @@ def save_file(self, request, response): self._redisdb.hset(result_key, str(file_index), result_url or "") # 原子递增成功计数并检查是否首次完成 - progress_key = setting.TAB_FILE_PROGRESS.format( - redis_key=self._redis_key, task_id=task_id - ) is_first_done = self._incr_and_check_done(progress_key, "success") total = int(self._redisdb.hget(progress_key, "total") or 0) @@ -362,13 +406,13 @@ def save_file(self, request, response): log.error(f"任务{task_id} on_file_downloaded回调异常 url={url} error={e}") if is_first_done: - task = PerfectDict(_dict=request.task_data) + task = request.task result = self._assemble_results(task_id, total) for item in self.on_task_all_done( task, result, success, fail, total ) or []: yield item - self._cleanup_task_redis(task_id) + yield lambda: self._cleanup_task_redis(task_id) def failed_request(self, request, response, e): """ @@ -381,6 +425,19 @@ def failed_request(self, request, response, e): yield request return + # 晚到回调检查:若 progress_key 已被清理,跳过 Redis 写入避免重建脏 key + progress_key = setting.TAB_FILE_PROGRESS.format( + redis_key=self._redis_key, task_id=task_id + ) + if not self._redisdb._redis.exists(progress_key): + log.debug(f"任务{task_id} 进度key已清理, 跳过晚到回调的Redis写入") + try: + self.on_file_failed(task_id, request.url, e) + except Exception as e_cb: + log.error(f"任务{task_id} on_file_failed回调异常 url={request.url} error={e_cb}") + yield request + return + # 记录失败结果 result_key = setting.TAB_FILE_RESULT.format( redis_key=self._redis_key, task_id=task_id @@ -388,9 +445,6 @@ def failed_request(self, request, response, e): self._redisdb.hset(result_key, str(file_index), "") # 原子递增失败计数并检查是否首次完成 - progress_key = setting.TAB_FILE_PROGRESS.format( - redis_key=self._redis_key, task_id=task_id - ) is_first_done = self._incr_and_check_done(progress_key, "fail") total = int(self._redisdb.hget(progress_key, "total") or 0) @@ -405,13 +459,13 @@ def failed_request(self, request, response, e): log.error(f"任务{task_id} on_file_failed回调异常 url={request.url} error={e_cb}") if is_first_done: - task = PerfectDict(_dict=request.task_data) + task = request.task result = self._assemble_results(task_id, total) for item in self.on_task_all_done( task, result, success, fail, total ) or []: yield item - self._cleanup_task_redis(task_id) + yield lambda: self._cleanup_task_redis(task_id) yield request @@ -423,15 +477,12 @@ def _assemble_results(self, task_id, total): result_key = setting.TAB_FILE_RESULT.format( redis_key=self._redis_key, task_id=task_id ) - all_data = self._redisdb.hgetall(result_key) - results = [] - for i in range(total): - value = all_data.get(str(i)) or all_data.get(str(i).encode()) - if value is None or value == b"" or value == "": - results.append(None) - else: - results.append(value.decode() if isinstance(value, bytes) else value) - return results + raw_data = self._redisdb.hgetall(result_key) + all_data = { + (k.decode() if isinstance(k, bytes) else k): (v.decode() if isinstance(v, bytes) else v) + for k, v in raw_data.items() + } + return [all_data.get(str(i)) or None for i in range(total)] def _cleanup_task_redis(self, task_id): """清理任务相关的 Redis 进度和结果 key""" From 5f4ddb4608beeae4380941e8603fbaa84aa066a3 Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Wed, 8 Apr 2026 15:34:21 +0800 Subject: [PATCH 10/14] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E2=80=9C=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E4=B8=8B=E8=BD=BD=E7=88=AC=E8=99=AB=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E4=BB=BB=E5=8A=A1=E5=AD=97=E6=AE=B5=E9=80=8F=E4=BC=A0=E7=BB=84?= =?UTF-8?q?=E8=A3=85=E6=9C=80=E7=BB=88item=E2=80=9D=E7=9A=84bug=E2=80=94?= =?UTF-8?q?=E2=80=94=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/usage/FileSpider.md | 9 +++- feapder/core/spiders/file_spider.py | 75 +++++++++++++++++------------ 2 files changed, 52 insertions(+), 32 deletions(-) diff --git a/docs/usage/FileSpider.md b/docs/usage/FileSpider.md index d0a24bed..17454ec0 100644 --- a/docs/usage/FileSpider.md +++ b/docs/usage/FileSpider.md @@ -44,7 +44,7 @@ CREATE TABLE `file_task` ( | 方法 | 说明 | 默认行为 | |------|------|----------| | `get_file_path(task, url, index)` | 返回文件保存路径/存储标识 | `{save_dir}/{task_id}/{index}_{md5(filename)}{ext}` | -| `process_file(task_id, url, file_path, response)` | 处理文件内容,返回最终存储位置 | 流式保存到本地磁盘,返回本地路径 | +| `process_file(task_id, url, file_path, response)` | 处理文件内容,返回最终存储位置(需保证幂等) | 流式保存到本地磁盘,返回本地路径 | | `on_file_downloaded(task_id, url, file_path)` | 单个文件下载成功回调 | 无 | | `on_file_failed(task_id, url, error)` | 单个文件下载失败回调 | 无 | @@ -63,6 +63,13 @@ save_file (框架层,不应重写) └── yield update_task_batch → 更新任务状态 ``` +### `process_file` 幂等性要求 + +`process_file` 在下载失败重试时可能被多次调用(同一 URL、同一 `file_path`),实现需保证幂等性: +- 默认实现使用 `"wb"` 模式覆盖写入,天然幂等 +- 重写时避免使用追加模式(`"ab"`) +- 云存储场景建议使用 `put_object` 等覆盖语义的 API + ### `on_task_all_done` 参数说明 ```python diff --git a/feapder/core/spiders/file_spider.py b/feapder/core/spiders/file_spider.py index 506c10fb..62eff8bf 100644 --- a/feapder/core/spiders/file_spider.py +++ b/feapder/core/spiders/file_spider.py @@ -128,6 +128,11 @@ def __init__( self._file_dedup = MysqlFileDedup() elif isinstance(file_dedup, FileDedup): self._file_dedup = file_dedup + elif file_dedup is not None: + raise ValueError( + f"file_dedup参数无效: {file_dedup!r}, " + f"支持: None, 'redis', 'mysql', 或 FileDedup 实例" + ) else: self._file_dedup = None @@ -165,6 +170,7 @@ def process_file(self, task_id, url, file_path, response): 处理下载的文件内容,返回文件最终存储位置。用户按需重写 默认实现: 流式保存到本地磁盘,返回本地路径 云存储场景: 重写此方法上传到 OSS/S3 等,返回云存储 URL + 注意: 此方法在下载失败重试时可能被多次调用,实现需保证幂等性 @param task_id: 任务 ID @param url: 文件原始 URL @param file_path: get_file_path 返回的路径/标识 @@ -213,7 +219,7 @@ def on_task_all_done(self, task, result, success_count, fail_count, total_count) # ===================== 框架内部方法 ===================== # Lua 脚本: 原子递增计数并判断是否首次达到完成条件 - # 返回值: 0=未完成或已触发过, 1=首次达到完成条件 + # 返回值: {is_done, total, success, fail} _LUA_INCR_AND_CHECK = """ local key = KEYS[1] local field = ARGV[1] @@ -224,30 +230,33 @@ def on_task_all_done(self, task, result, success_count, fail_count, total_count) if success + fail >= total and total > 0 then local done = redis.call('hsetnx', key, 'done', 1) if done == 1 then - return 1 + return {1, total, success, fail} end end -return 0 +return {0, total, success, fail} """ def _incr_and_check_done(self, progress_key, field): - """原子递增计数并检查是否首次达到完成条件""" + """原子递增计数并检查是否首次达到完成条件 + @return: (is_first_done, total, success, fail) + """ redis_client = self._redisdb._redis if self._lua_incr_and_check_sha is None: self._lua_incr_and_check_sha = redis_client.script_load( self._LUA_INCR_AND_CHECK ) try: - return redis_client.evalsha( + result = redis_client.evalsha( self._lua_incr_and_check_sha, 1, progress_key, field ) except NoScriptError: self._lua_incr_and_check_sha = redis_client.script_load( self._LUA_INCR_AND_CHECK ) - return redis_client.evalsha( + result = redis_client.evalsha( self._lua_incr_and_check_sha, 1, progress_key, field ) + return result[0], result[1], result[2], result[3] def start_requests(self, task): """ @@ -340,11 +349,15 @@ def start_requests(self, task): # 全部命中缓存或跳过,直接触发 on_task_all_done if cached_count + skipped_count >= total: result = self._assemble_results(task_id, total) - for item in self.on_task_all_done( - task, result, cached_count, skipped_count, total - ) or []: - yield item - yield lambda: self._cleanup_task_redis(task_id) + try: + for item in self.on_task_all_done( + task, result, cached_count, skipped_count, total + ) or []: + yield item + except Exception as e: + log.error(f"任务{task_id} on_task_all_done异常 error={e}") + finally: + yield lambda: self._cleanup_task_redis(task_id) return # Redis 状态就绪后再下发请求 @@ -392,11 +405,7 @@ def save_file(self, request, response): self._redisdb.hset(result_key, str(file_index), result_url or "") # 原子递增成功计数并检查是否首次完成 - is_first_done = self._incr_and_check_done(progress_key, "success") - - total = int(self._redisdb.hget(progress_key, "total") or 0) - success = int(self._redisdb.hget(progress_key, "success") or 0) - fail = int(self._redisdb.hget(progress_key, "fail") or 0) + is_first_done, total, success, fail = self._incr_and_check_done(progress_key, "success") log.info(f"任务{task_id} 文件下载成功 [{success + fail}/{total}] url={url}") @@ -408,11 +417,15 @@ def save_file(self, request, response): if is_first_done: task = request.task result = self._assemble_results(task_id, total) - for item in self.on_task_all_done( - task, result, success, fail, total - ) or []: - yield item - yield lambda: self._cleanup_task_redis(task_id) + try: + for item in self.on_task_all_done( + task, result, success, fail, total + ) or []: + yield item + except Exception as e: + log.error(f"任务{task_id} on_task_all_done异常 error={e}") + finally: + yield lambda: self._cleanup_task_redis(task_id) def failed_request(self, request, response, e): """ @@ -445,11 +458,7 @@ def failed_request(self, request, response, e): self._redisdb.hset(result_key, str(file_index), "") # 原子递增失败计数并检查是否首次完成 - is_first_done = self._incr_and_check_done(progress_key, "fail") - - total = int(self._redisdb.hget(progress_key, "total") or 0) - success = int(self._redisdb.hget(progress_key, "success") or 0) - fail = int(self._redisdb.hget(progress_key, "fail") or 0) + is_first_done, total, success, fail = self._incr_and_check_done(progress_key, "fail") log.error(f"任务{task_id} 文件下载失败 [{success + fail}/{total}] url={request.url}") @@ -461,11 +470,15 @@ def failed_request(self, request, response, e): if is_first_done: task = request.task result = self._assemble_results(task_id, total) - for item in self.on_task_all_done( - task, result, success, fail, total - ) or []: - yield item - yield lambda: self._cleanup_task_redis(task_id) + try: + for item in self.on_task_all_done( + task, result, success, fail, total + ) or []: + yield item + except Exception as e_done: + log.error(f"任务{task_id} on_task_all_done异常 error={e_done}") + finally: + yield lambda: self._cleanup_task_redis(task_id) yield request From 716276336d53186f045afb1195da9bb688f06562 Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Wed, 8 Apr 2026 16:21:57 +0800 Subject: [PATCH 11/14] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E2=80=9C=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E4=B8=8B=E8=BD=BD=E7=88=AC=E8=99=AB=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E4=BB=BB=E5=8A=A1=E5=AD=97=E6=AE=B5=E9=80=8F=E4=BC=A0=E7=BB=84?= =?UTF-8?q?=E8=A3=85=E6=9C=80=E7=BB=88item=E2=80=9D=E7=9A=84bug=E2=80=94?= =?UTF-8?q?=E2=80=94=E7=AC=AC=E4=BA=8C=E6=AC=A1=E6=8F=90=E4=BA=A4=E2=80=94?= =?UTF-8?q?=E2=80=94=E4=BF=AE=E5=A4=8D=E7=AB=9E=E6=80=81=E6=9D=A1=E4=BB=B6?= =?UTF-8?q?bug=EF=BC=8C=E6=96=B0=E5=A2=9Eskipped=5Fcount=E5=AD=97=E6=AE=B5?= =?UTF-8?q?=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/usage/FileSpider.md | 26 +++--- feapder/core/spiders/file_spider.py | 139 +++++++++++++++------------- feapder/dedup/file_dedup.py | 21 +++-- 3 files changed, 105 insertions(+), 81 deletions(-) diff --git a/docs/usage/FileSpider.md b/docs/usage/FileSpider.md index 17454ec0..c34ce147 100644 --- a/docs/usage/FileSpider.md +++ b/docs/usage/FileSpider.md @@ -4,7 +4,7 @@ FileSpider 是一款分布式文件下载爬虫,专用于批量下载文件/ 核心特征: - **一对多**: 一个任务包含多个待下载文件的 URL 列表,框架自动遍历生成下载请求 -- **进度追踪**: 框架自动追踪每个任务的下载进度(成功数/失败数/总数) +- **进度追踪**: 框架自动追踪每个任务的下载进度(成功数/失败数/跳过数/总数) - **结果有序**: 下载结果列表与原始 URL 列表严格位置对应 - **灵活存储**: 默认保存到本地磁盘,可重写为上传云存储(OSS/S3 等),不落盘 - **文件去重**: 可选功能,同一 URL 不重复下载,支持 Redis / MySQL / 自定义 三种策略 @@ -37,7 +37,7 @@ CREATE TABLE `file_task` ( | 方法 | 说明 | |------|------| | `get_download_urls(task)` | 从 task 中提取文件 URL 列表,返回 `List[str]` | -| `on_task_all_done(task, result, success_count, fail_count, total_count)` | 任务所有文件处理完毕的回调,在此 yield Item 或 update_task_batch 更新状态 | +| `on_task_all_done(task, result, success_count, fail_count, skipped_count, total_count)` | 任务所有文件处理完毕的回调,在此 yield Item 或 update_task_batch 更新状态 | ### 可选重写 @@ -73,14 +73,18 @@ save_file (框架层,不应重写) ### `on_task_all_done` 参数说明 ```python -def on_task_all_done(self, task, result, success_count, fail_count, total_count): +def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, total_count): """ task: PerfectDict - 任务对象,包含 task_keys 指定的字段,可通过 task.id 获取任务 ID result: List[str|None] - 与 get_download_urls 返回的列表严格位置对应 - 成功: 文件存储位置(本地路径或云存储 URL) - - 失败: None + - 失败/跳过: None 例: ["https://oss.com/a.jpg", "https://oss.com/b.jpg", None, "https://oss.com/d.jpg"] + success_count: 成功数(含去重缓存命中) + fail_count: 下载失败数(重试耗尽) + skipped_count: 跳过数(无效URL、get_file_path异常等) + total_count: 总数 """ ``` @@ -118,7 +122,7 @@ class LocalFileSpider(feapder.FileSpider): def get_download_urls(self, task): return json.loads(task.file_urls) - def on_task_all_done(self, task, result, success_count, fail_count, total_count): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, total_count): if fail_count == 0: yield self.update_task_batch(task.id, 1) else: @@ -132,7 +136,7 @@ if __name__ == "__main__": task_keys=["id", "file_urls"], save_dir="./downloads", ) - spider.start_monitor_task() + spider.start() ``` ### 场景二:上传云存储 @@ -165,7 +169,7 @@ class OssFileSpider(feapder.FileSpider): self.oss_client.put_object(file_path, response.content) return f"https://my-bucket.oss.aliyuncs.com/{file_path}" - def on_task_all_done(self, task, result, success_count, fail_count, total_count): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, total_count): if success_count > 0: yield self.update_task_batch(task.id, 1) else: @@ -178,7 +182,7 @@ if __name__ == "__main__": task_table="file_task", task_keys=["id", "file_urls"], ) - spider.start_monitor_task() + spider.start() ``` ### 场景三:上传云存储 + 结果入库 @@ -215,7 +219,7 @@ class OssResultSpider(feapder.FileSpider): self.oss_client.put_object(file_path, response.content) return f"https://my-bucket.oss.aliyuncs.com/{file_path}" - def on_task_all_done(self, task, result, success_count, fail_count, total_count): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, total_count): # result 与 get_download_urls 返回的列表严格位置对应,下载失败的用 None 占位 item = FileResultItem() item.task_id = task.id @@ -241,7 +245,7 @@ class DedupFileSpider(feapder.FileSpider): def get_download_urls(self, task): return json.loads(task.file_urls) - def on_task_all_done(self, task, result, success_count, fail_count, total_count): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, total_count): yield self.update_task_batch(task.id, 1 if fail_count == 0 else -1) @@ -253,7 +257,7 @@ if __name__ == "__main__": save_dir="./downloads", file_dedup="redis", # "redis" / "mysql" / FileDedup 实例 ) - spider.start_monitor_task() + spider.start() ``` 去重行为: diff --git a/feapder/core/spiders/file_spider.py b/feapder/core/spiders/file_spider.py index 62eff8bf..956023a4 100644 --- a/feapder/core/spiders/file_spider.py +++ b/feapder/core/spiders/file_spider.py @@ -136,7 +136,9 @@ def __init__( else: self._file_dedup = None - self._lua_incr_and_check_sha = None + self._lua_record_and_check_sha = self._redisdb._redis.script_load( + self._LUA_RECORD_AND_CHECK + ) # ===================== 用户需实现/可重写的方法 ===================== @@ -202,7 +204,7 @@ def on_file_failed(self, task_id, url, error): """ pass - def on_task_all_done(self, task, result, success_count, fail_count, total_count): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, total_count): """ 任务所有文件处理完毕的回调 用户应在此方法中 yield Item 写入结果表、yield self.update_task_batch() 更新任务状态 @@ -210,53 +212,59 @@ def on_task_all_done(self, task, result, success_count, fail_count, total_count) @param result: List[str|None] - 每个文件的处理结果, 顺序与 get_download_urls 返回的列表一致。 成功为文件存储位置,失败为 None - @param success_count: 成功数 - @param fail_count: 失败数 + @param success_count: 成功数(含去重缓存命中) + @param fail_count: 下载失败数(重试耗尽) + @param skipped_count: 跳过数(无效URL、get_file_path异常等) @param total_count: 总数 """ pass # ===================== 框架内部方法 ===================== - # Lua 脚本: 原子递增计数并判断是否首次达到完成条件 - # 返回值: {is_done, total, success, fail} - _LUA_INCR_AND_CHECK = """ -local key = KEYS[1] -local field = ARGV[1] -redis.call('hincrby', key, field, 1) -local total = tonumber(redis.call('hget', key, 'total')) or 0 -local success = tonumber(redis.call('hget', key, 'success')) or 0 -local fail = tonumber(redis.call('hget', key, 'fail')) or 0 -if success + fail >= total and total > 0 then - local done = redis.call('hsetnx', key, 'done', 1) + # Lua 脚本: 原子操作 - 检查key存在 + 写入结果 + 递增计数 + 设置TTL + 检查完成 + # KEYS[1]=progress_key KEYS[2]=result_key + # ARGV[1]=field("success"/"fail") ARGV[2]=file_index ARGV[3]=result_value + # 返回值: {status, total, success, fail, skipped} + # status: -1=key不存在(晚到回调), 0=未完成, 1=首次完成 + _LUA_RECORD_AND_CHECK = """ +if redis.call('exists', KEYS[1]) == 0 then + return {-1, 0, 0, 0, 0} +end +redis.call('hset', KEYS[2], ARGV[2], ARGV[3]) +redis.call('expire', KEYS[2], 86400) +redis.call('hincrby', KEYS[1], ARGV[1], 1) +local total = tonumber(redis.call('hget', KEYS[1], 'total')) or 0 +local success = tonumber(redis.call('hget', KEYS[1], 'success')) or 0 +local fail = tonumber(redis.call('hget', KEYS[1], 'fail')) or 0 +local skipped = tonumber(redis.call('hget', KEYS[1], 'skipped')) or 0 +if success + fail + skipped >= total and total > 0 then + local done = redis.call('hsetnx', KEYS[1], 'done', 1) if done == 1 then - return {1, total, success, fail} + return {1, total, success, fail, skipped} end end -return {0, total, success, fail} +return {0, total, success, fail, skipped} """ - def _incr_and_check_done(self, progress_key, field): - """原子递增计数并检查是否首次达到完成条件 - @return: (is_first_done, total, success, fail) + def _record_and_check_done(self, progress_key, result_key, field, file_index, result_value): + """原子操作: 检查key存在 + 写入结果 + 递增计数 + 检查完成 + @return: (status, total, success, fail, skipped) + status: -1=key不存在(晚到回调), 0=未完成, 1=首次完成 """ - redis_client = self._redisdb._redis - if self._lua_incr_and_check_sha is None: - self._lua_incr_and_check_sha = redis_client.script_load( - self._LUA_INCR_AND_CHECK - ) try: - result = redis_client.evalsha( - self._lua_incr_and_check_sha, 1, progress_key, field + result = self._redisdb._redis.evalsha( + self._lua_record_and_check_sha, 2, + progress_key, result_key, field, file_index, result_value, ) except NoScriptError: - self._lua_incr_and_check_sha = redis_client.script_load( - self._LUA_INCR_AND_CHECK + self._lua_record_and_check_sha = self._redisdb._redis.script_load( + self._LUA_RECORD_AND_CHECK ) - result = redis_client.evalsha( - self._lua_incr_and_check_sha, 1, progress_key, field + result = self._redisdb._redis.evalsha( + self._lua_record_and_check_sha, 2, + progress_key, result_key, field, file_index, result_value, ) - return result[0], result[1], result[2], result[3] + return result[0], result[1], result[2], result[3], result[4] def start_requests(self, task): """ @@ -270,7 +278,7 @@ def start_requests(self, task): raise TypeError(f"get_download_urls应返回列表, 实际返回了字符串: {urls[:100]}") if not urls: log.warning(f"任务{task.id}无下载URL") - for item in self.on_task_all_done(task, [], 0, 0, 0) or []: + for item in self.on_task_all_done(task, [], 0, 0, 0, 0) or []: yield item return @@ -334,13 +342,17 @@ def start_requests(self, task): pipe = self._redisdb._redis.pipeline() pipe.delete(progress_key) pipe.delete(result_key) - for field, value in {"total": total, "success": cached_count, "fail": skipped_count}.items(): + progress_fields = { + "total": total, "success": cached_count, + "fail": 0, "skipped": skipped_count, + } + for field, value in progress_fields.items(): pipe.hset(progress_key, field, value) pipe.expire(progress_key, 86400) if result_mapping: for field, value in result_mapping.items(): pipe.hset(result_key, field, value) - pipe.expire(result_key, 86400) + pipe.expire(result_key, 86400) pipe.execute() if cached_count > 0: @@ -351,11 +363,12 @@ def start_requests(self, task): result = self._assemble_results(task_id, total) try: for item in self.on_task_all_done( - task, result, cached_count, skipped_count, total + task, result, cached_count, 0, skipped_count, total ) or []: yield item except Exception as e: log.error(f"任务{task_id} on_task_all_done异常 error={e}") + log.warning(f"任务{task_id} 状态未更新, 请检查on_task_all_done实现") finally: yield lambda: self._cleanup_task_redis(task_id) return @@ -386,11 +399,18 @@ def save_file(self, request, response): except Exception as e: log.error(f"任务{task_id} 去重缓存写入异常 url={url} error={e}") - # 晚到回调检查:若 progress_key 已被清理,跳过 Redis 写入避免重建脏 key + # 原子操作: 检查key存在 + 写入结果 + 递增计数 + 检查完成 progress_key = setting.TAB_FILE_PROGRESS.format( redis_key=self._redis_key, task_id=task_id ) - if not self._redisdb._redis.exists(progress_key): + result_key = setting.TAB_FILE_RESULT.format( + redis_key=self._redis_key, task_id=task_id + ) + status, total, success, fail, skipped = self._record_and_check_done( + progress_key, result_key, "success", str(file_index), result_url or "", + ) + + if status == -1: log.debug(f"任务{task_id} 进度key已清理, 跳过晚到回调的Redis写入") try: self.on_file_downloaded(task_id, url, result_url) @@ -398,32 +418,24 @@ def save_file(self, request, response): log.error(f"任务{task_id} on_file_downloaded回调异常 url={url} error={e}") return - # 记录结果 - result_key = setting.TAB_FILE_RESULT.format( - redis_key=self._redis_key, task_id=task_id - ) - self._redisdb.hset(result_key, str(file_index), result_url or "") - - # 原子递增成功计数并检查是否首次完成 - is_first_done, total, success, fail = self._incr_and_check_done(progress_key, "success") - - log.info(f"任务{task_id} 文件下载成功 [{success + fail}/{total}] url={url}") + log.info(f"任务{task_id} 文件下载成功 [{success + fail + skipped}/{total}] url={url}") try: self.on_file_downloaded(task_id, url, result_url) except Exception as e: log.error(f"任务{task_id} on_file_downloaded回调异常 url={url} error={e}") - if is_first_done: + if status == 1: task = request.task result = self._assemble_results(task_id, total) try: for item in self.on_task_all_done( - task, result, success, fail, total + task, result, success, fail, skipped, total ) or []: yield item except Exception as e: log.error(f"任务{task_id} on_task_all_done异常 error={e}") + log.warning(f"任务{task_id} 状态未更新, 请检查on_task_all_done实现") finally: yield lambda: self._cleanup_task_redis(task_id) @@ -438,11 +450,18 @@ def failed_request(self, request, response, e): yield request return - # 晚到回调检查:若 progress_key 已被清理,跳过 Redis 写入避免重建脏 key + # 原子操作: 检查key存在 + 写入结果 + 递增计数 + 检查完成 progress_key = setting.TAB_FILE_PROGRESS.format( redis_key=self._redis_key, task_id=task_id ) - if not self._redisdb._redis.exists(progress_key): + result_key = setting.TAB_FILE_RESULT.format( + redis_key=self._redis_key, task_id=task_id + ) + status, total, success, fail, skipped = self._record_and_check_done( + progress_key, result_key, "fail", str(file_index), "", + ) + + if status == -1: log.debug(f"任务{task_id} 进度key已清理, 跳过晚到回调的Redis写入") try: self.on_file_failed(task_id, request.url, e) @@ -451,32 +470,24 @@ def failed_request(self, request, response, e): yield request return - # 记录失败结果 - result_key = setting.TAB_FILE_RESULT.format( - redis_key=self._redis_key, task_id=task_id - ) - self._redisdb.hset(result_key, str(file_index), "") - - # 原子递增失败计数并检查是否首次完成 - is_first_done, total, success, fail = self._incr_and_check_done(progress_key, "fail") - - log.error(f"任务{task_id} 文件下载失败 [{success + fail}/{total}] url={request.url}") + log.error(f"任务{task_id} 文件下载失败 [{success + fail + skipped}/{total}] url={request.url}") try: self.on_file_failed(task_id, request.url, e) except Exception as e_cb: log.error(f"任务{task_id} on_file_failed回调异常 url={request.url} error={e_cb}") - if is_first_done: + if status == 1: task = request.task result = self._assemble_results(task_id, total) try: for item in self.on_task_all_done( - task, result, success, fail, total + task, result, success, fail, skipped, total ) or []: yield item except Exception as e_done: log.error(f"任务{task_id} on_task_all_done异常 error={e_done}") + log.warning(f"任务{task_id} 状态未更新, 请检查on_task_all_done实现") finally: yield lambda: self._cleanup_task_redis(task_id) diff --git a/feapder/dedup/file_dedup.py b/feapder/dedup/file_dedup.py index 21c7b2a6..5616cad2 100644 --- a/feapder/dedup/file_dedup.py +++ b/feapder/dedup/file_dedup.py @@ -6,6 +6,8 @@ FileDedup 存储 URL -> result_url 的完整映射,用于直接复用下载结果。 """ +import hashlib + from feapder.db.mysqldb import MysqlDB from feapder.db.redisdb import RedisDB from feapder.utils.log import log @@ -97,24 +99,31 @@ def _ensure_table(self): sql = ( f"CREATE TABLE IF NOT EXISTS `{self._table}` (" f" `id` int(11) NOT NULL AUTO_INCREMENT," - f" `url` varchar(2048) NOT NULL COMMENT '文件原始URL'," + f" `url` text NOT NULL COMMENT '文件原始URL'," + f" `url_hash` char(32) NOT NULL COMMENT 'URL的MD5哈希'," f" `result_url` text COMMENT '文件存储位置'," f" `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP," f" PRIMARY KEY (`id`)," - f" UNIQUE KEY `uk_url` (`url`) USING BTREE" + f" UNIQUE KEY `uk_url_hash` (`url_hash`) USING BTREE" f") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4" ) self._mysqldb.execute(sql) self.__class__._table_ensured.add(self._table) + @staticmethod + def _hash_url(url): + return hashlib.md5(url.encode()).hexdigest() + def get(self, url): - sql = f"SELECT result_url FROM `{self._table}` WHERE `url` = %s LIMIT 1" - result = self._mysqldb.find(sql, (url,)) + url_hash = self._hash_url(url) + sql = f"SELECT result_url FROM `{self._table}` WHERE `url_hash` = %s LIMIT 1" + result = self._mysqldb.find(sql, (url_hash,)) return result[0][0] if result else None def set(self, url, result_url): + url_hash = self._hash_url(url) sql = ( - f"INSERT INTO `{self._table}` (`url`, `result_url`) VALUES (%s, %s) " + f"INSERT INTO `{self._table}` (`url`, `url_hash`, `result_url`) VALUES (%s, %s, %s) " f"ON DUPLICATE KEY UPDATE `result_url` = VALUES(`result_url`)" ) - self._mysqldb.execute(sql, (url, result_url)) + self._mysqldb.execute(sql, (url, url_hash, result_url)) From 6c04fc07141f517dde90315b5ad3a207e6bb3f7b Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Wed, 8 Apr 2026 18:51:37 +0800 Subject: [PATCH 12/14] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E2=80=9C=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E4=B8=8B=E8=BD=BD=E7=88=AC=E8=99=AB=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E4=BB=BB=E5=8A=A1=E5=AD=97=E6=AE=B5=E9=80=8F=E4=BC=A0=E7=BB=84?= =?UTF-8?q?=E8=A3=85=E6=9C=80=E7=BB=88item=E2=80=9D=E7=9A=84bug=E2=80=94?= =?UTF-8?q?=E2=80=94=E7=AC=AC=E5=9B=9B=E6=AC=A1=E6=8F=90=E4=BA=A4=E2=80=94?= =?UTF-8?q?=E2=80=94=E6=B7=BB=E5=8A=A0run=5Fid=E6=A0=87=E8=AF=86=EF=BC=8C?= =?UTF-8?q?=E9=81=BF=E5=85=8D=E5=B0=8F=E6=A6=82=E7=8E=87=E6=83=85=E5=86=B5?= =?UTF-8?q?=E4=B8=8B=E8=B7=A8=E6=89=B9=E6=AC=A1=E8=AF=B7=E6=B1=82=E9=80=A0?= =?UTF-8?q?=E6=88=90=E7=BB=9F=E8=AE=A1=E9=94=99=E8=AF=AF=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/usage/FileSpider.md | 98 +++++++--- feapder/commands/create/create_spider.py | 4 +- feapder/commands/create_builder.py | 2 +- feapder/core/spiders/file_spider.py | 188 +++++++++++++------- feapder/setting.py | 2 + feapder/templates/file_spider_template.tmpl | 71 ++++++++ tests/file-spider/test_dedup_file_spider.py | 6 +- tests/file-spider/test_local_file_spider.py | 6 +- tests/file-spider/test_oss_file_spider.py | 6 +- 9 files changed, 288 insertions(+), 95 deletions(-) create mode 100644 feapder/templates/file_spider_template.tmpl diff --git a/docs/usage/FileSpider.md b/docs/usage/FileSpider.md index c34ce147..d669e705 100644 --- a/docs/usage/FileSpider.md +++ b/docs/usage/FileSpider.md @@ -4,10 +4,11 @@ FileSpider 是一款分布式文件下载爬虫,专用于批量下载文件/ 核心特征: - **一对多**: 一个任务包含多个待下载文件的 URL 列表,框架自动遍历生成下载请求 -- **进度追踪**: 框架自动追踪每个任务的下载进度(成功数/失败数/跳过数/总数) +- **进度追踪**: 框架自动追踪每个任务的下载进度(成功数/失败数/跳过数/去重数/总数) - **结果有序**: 下载结果列表与原始 URL 列表严格位置对应 - **灵活存储**: 默认保存到本地磁盘,可重写为上传云存储(OSS/S3 等),不落盘 -- **文件去重**: 可选功能,同一 URL 不重复下载,支持 Redis / MySQL / 自定义 三种策略 +- **文件去重**: 任务内相同 URL 自动去重;可选跨任务去重(Redis / MySQL / 自定义) +- **HTTP 校验**: 默认对 4xx/5xx 响应触发重试,用户可重写 `validate` 自定义校验 - **用户控制**: 任务成功/失败由用户在回调中显式决定 FileSpider 继承自 TaskSpider,复用了全部任务管理能力(MySQL 任务表、Redis 队列、断点续爬、丢失任务回收、分布式支持等)。 @@ -37,7 +38,7 @@ CREATE TABLE `file_task` ( | 方法 | 说明 | |------|------| | `get_download_urls(task)` | 从 task 中提取文件 URL 列表,返回 `List[str]` | -| `on_task_all_done(task, result, success_count, fail_count, skipped_count, total_count)` | 任务所有文件处理完毕的回调,在此 yield Item 或 update_task_batch 更新状态 | +| `on_task_all_done(task, result, success_count, fail_count, skipped_count, dup_count, total_count)` | 任务所有文件处理完毕的回调,在此 yield Item 或 update_task_batch 更新状态 | ### 可选重写 @@ -45,6 +46,7 @@ CREATE TABLE `file_task` ( |------|------|----------| | `get_file_path(task, url, index)` | 返回文件保存路径/存储标识 | `{save_dir}/{task_id}/{index}_{md5(filename)}{ext}` | | `process_file(task_id, url, file_path, response)` | 处理文件内容,返回最终存储位置(需保证幂等) | 流式保存到本地磁盘,返回本地路径 | +| `validate(request, response)` | 校验下载响应 | 4xx/5xx抛异常触发重试,3xx自动跟随 | | `on_file_downloaded(task_id, url, file_path)` | 单个文件下载成功回调 | 无 | | `on_file_failed(task_id, url, error)` | 单个文件下载失败回调 | 无 | @@ -55,7 +57,7 @@ save_file (框架层,不应重写) ├── process_file (用户层,按需重写) │ ├── 默认: 保存到本地磁盘,返回本地路径 │ └── 重写: 上传云存储,返回云存储 URL - ├── Redis 进度追踪 (自动) + ├── Redis 进度追踪 (自动,幂等计数) ├── on_file_downloaded 回调 └── 检查是否所有文件完成 └── on_task_all_done (用户实现) @@ -73,21 +75,71 @@ save_file (框架层,不应重写) ### `on_task_all_done` 参数说明 ```python -def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, total_count): +def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): """ task: PerfectDict - 任务对象,包含 task_keys 指定的字段,可通过 task.id 获取任务 ID result: List[str|None] - 与 get_download_urls 返回的列表严格位置对应 - 成功: 文件存储位置(本地路径或云存储 URL) - 失败/跳过: None - 例: ["https://oss.com/a.jpg", "https://oss.com/b.jpg", None, "https://oss.com/d.jpg"] + - 任务内重复URL: 继承首次出现的结果 + 例: ["https://oss.com/a.jpg", "https://oss.com/b.jpg", None, "https://oss.com/a.jpg"] success_count: 成功数(含去重缓存命中) fail_count: 下载失败数(重试耗尽) skipped_count: 跳过数(无效URL、get_file_path异常等) - total_count: 总数 + dup_count: 任务内重复URL数 + total_count: 总数(success + fail + skipped + dup = total) """ ``` +### `on_task_all_done` 设计约定与实现建议 + +`on_task_all_done` 是业务回调,**任务状态由用户代码显式控制**(通常通过 `yield self.update_task_batch(...)`)。 + +- 若该方法抛异常,框架不会自动改写任务状态;任务可能保持 `doing(2)` +- 后续会由 TaskSpider 的丢失任务恢复机制重新下发任务 +- 因此该方法建议按“可重试、可重入”方式实现,保证幂等 + +推荐实践: +- 先产出结果数据,再更新任务状态,避免状态先行导致结果缺失 +- 对外部副作用(通知、回调第三方、写非幂等系统)增加幂等保护 +- 异常日志要包含 `task.id`、计数信息和关键上下文,便于快速排障 + +#### 新手解释:什么是“幂等” + +幂等可以理解为:**同一个操作执行 1 次和执行多次,最终结果一致**。 + +在 `FileSpider` 中,常见重试来源有网络重试、进程重启、丢失任务回收。 +因此 `on_task_all_done` 需要按“可能被重复执行”来设计: + +- 幂等写法:`state` 直接设置为目标值(如 1 或 -1) +- 非幂等写法:每次执行都做自增/重复插入/重复通知 + +#### 推荐写法案例(可重试、可重入) + +```python +from feapder.utils.log import log + + +class MyFileSpider(feapder.FileSpider): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): + task_id = task.id + log.info( + f"任务{task_id}完成 success={success_count} fail={fail_count} " + f"skipped={skipped_count} dup={dup_count} total={total_count}" + ) + + # 1) 先写业务结果(示例:可按需 yield Item) + # item = FileResultItem() + # item.task_id = task_id + # item.result_urls = result + # yield item + + # 2) 最后更新任务状态(设置目标值,天然幂等) + done_state = 1 if fail_count == 0 and success_count > 0 else -1 + yield self.update_task_batch(task_id, done_state) +``` + ## 3. 构造参数 | 参数 | 类型 | 说明 | @@ -122,8 +174,9 @@ class LocalFileSpider(feapder.FileSpider): def get_download_urls(self, task): return json.loads(task.file_urls) - def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, total_count): - if fail_count == 0: + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): + # fail_count == 0 且有实际成功下载则标记完成;全部跳过或无有效URL标记失败 + if fail_count == 0 and success_count > 0: yield self.update_task_batch(task.id, 1) else: yield self.update_task_batch(task.id, -1) @@ -169,7 +222,7 @@ class OssFileSpider(feapder.FileSpider): self.oss_client.put_object(file_path, response.content) return f"https://my-bucket.oss.aliyuncs.com/{file_path}" - def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, total_count): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): if success_count > 0: yield self.update_task_batch(task.id, 1) else: @@ -219,14 +272,14 @@ class OssResultSpider(feapder.FileSpider): self.oss_client.put_object(file_path, response.content) return f"https://my-bucket.oss.aliyuncs.com/{file_path}" - def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, total_count): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): # result 与 get_download_urls 返回的列表严格位置对应,下载失败的用 None 占位 item = FileResultItem() item.task_id = task.id item.result_urls = result yield item - if fail_count == 0: + if fail_count == 0 and success_count > 0: yield self.update_task_batch(task.id, 1) else: yield self.update_task_batch(task.id, -1) @@ -234,7 +287,7 @@ class OssResultSpider(feapder.FileSpider): ### 场景四:启用文件去重 -通过 `file_dedup` 参数启用,同一 URL 跨任务不重复下载: +通过 `file_dedup` 参数启用跨任务去重,同一 URL 跨任务不重复下载: ```python import json @@ -245,8 +298,8 @@ class DedupFileSpider(feapder.FileSpider): def get_download_urls(self, task): return json.loads(task.file_urls) - def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, total_count): - yield self.update_task_batch(task.id, 1 if fail_count == 0 else -1) + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): + yield self.update_task_batch(task.id, 1 if fail_count == 0 and success_count > 0 else -1) if __name__ == "__main__": @@ -260,15 +313,16 @@ if __name__ == "__main__": spider.start() ``` -去重行为: -- `start_requests` 中遍历 URL 列表时,先查去重缓存 -- 缓存命中:直接复用已有结果,不生成 Request,不重复下载 -- 缓存未命中:正常下载,成功后自动写入去重缓存 -- 跨任务共享:不同任务中出现的相同 URL 只下载一次 - ## 5. 文件去重 -### 去重策略 +### 去重层级 + +FileSpider 提供两级去重: + +1. **任务内去重(自动)**: 同一任务的 URL 列表中出现的重复 URL,只下载一次,重复项继承首次出现的结果 +2. **跨任务去重(可选)**: 通过 `file_dedup` 参数启用,不同任务中出现的相同 URL 只下载一次 + +### 跨任务去重策略 | 策略 | 参数值 | 存储 | 适用场景 | |------|--------|------|----------| diff --git a/feapder/commands/create/create_spider.py b/feapder/commands/create/create_spider.py index f464e059..13243312 100644 --- a/feapder/commands/create/create_spider.py +++ b/feapder/commands/create/create_spider.py @@ -57,8 +57,10 @@ def get_spider_template(self, spider_type): template_path = "task_spider_template.tmpl" elif spider_type == "BatchSpider": template_path = "batch_spider_template.tmpl" + elif spider_type == "FileSpider": + template_path = "file_spider_template.tmpl" else: - raise ValueError("spider type error, only support AirSpider、 Spider、TaskSpider、BatchSpider") + raise ValueError("spider type error, only support AirSpider、Spider、TaskSpider、BatchSpider、FileSpider") template_path = os.path.abspath( os.path.join(__file__, "../../../templates", template_path) diff --git a/feapder/commands/create_builder.py b/feapder/commands/create_builder.py index dec0ba05..1703ebf9 100644 --- a/feapder/commands/create_builder.py +++ b/feapder/commands/create_builder.py @@ -87,7 +87,7 @@ def main(): elif args.spider: c = Choice( "请选择爬虫模板", - ["AirSpider", "Spider", "TaskSpider", "BatchSpider"], + ["AirSpider", "Spider", "TaskSpider", "BatchSpider", "FileSpider"], icon_style=StringStyle(fore=Fore.green), selected_style=StringStyle(fore=Fore.green), ) diff --git a/feapder/core/spiders/file_spider.py b/feapder/core/spiders/file_spider.py index 956023a4..45c40d39 100644 --- a/feapder/core/spiders/file_spider.py +++ b/feapder/core/spiders/file_spider.py @@ -186,6 +186,13 @@ def process_file(self, task_id, url, file_path, response): f.write(chunk) return file_path + def validate(self, request, response): + """文件下载默认校验: 4xx/5xx响应抛异常触发重试,3xx由requests自动跟随。用户可重写""" + if response and response.status_code >= 400: + raise Exception( + f"文件下载HTTP {response.status_code} url={request.url}" + ) + def on_file_downloaded(self, task_id, url, file_path): """ 单个文件下载成功的回调,用户可重写 @@ -204,57 +211,68 @@ def on_file_failed(self, task_id, url, error): """ pass - def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, total_count): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): """ 任务所有文件处理完毕的回调 用户应在此方法中 yield Item 写入结果表、yield self.update_task_batch() 更新任务状态 @param task: PerfectDict - 任务对象,包含 task_keys 指定的字段 @param result: List[str|None] - 每个文件的处理结果, 顺序与 get_download_urls 返回的列表一致。 - 成功为文件存储位置,失败为 None + 成功为文件存储位置,失败为 None。 + 任务内重复URL的结果继承首次出现的结果 @param success_count: 成功数(含去重缓存命中) @param fail_count: 下载失败数(重试耗尽) @param skipped_count: 跳过数(无效URL、get_file_path异常等) - @param total_count: 总数 + @param dup_count: 任务内重复URL数 + @param total_count: 总数(success + fail + skipped + dup = total) """ pass # ===================== 框架内部方法 ===================== - # Lua 脚本: 原子操作 - 检查key存在 + 写入结果 + 递增计数 + 设置TTL + 检查完成 + # Lua 脚本: 原子操作 - 轮次校验 + 幂等写入结果 + 递增计数 + 设置TTL + 检查完成 # KEYS[1]=progress_key KEYS[2]=result_key - # ARGV[1]=field("success"/"fail") ARGV[2]=file_index ARGV[3]=result_value - # 返回值: {status, total, success, fail, skipped} - # status: -1=key不存在(晚到回调), 0=未完成, 1=首次完成 + # ARGV[1]=field("success"/"fail") ARGV[2]=file_index ARGV[3]=result_value ARGV[4]=run_id + # 返回值: {status, total, success, fail, skipped, dup} + # status: -1=key不存在或run_id不匹配(过期回调), 0=未完成, 1=首次完成 _LUA_RECORD_AND_CHECK = """ if redis.call('exists', KEYS[1]) == 0 then - return {-1, 0, 0, 0, 0} + return {-1, 0, 0, 0, 0, 0} +end +if redis.call('hget', KEYS[1], 'run_id') ~= ARGV[4] then + return {-1, 0, 0, 0, 0, 0} +end +local is_new = redis.call('hsetnx', KEYS[2], ARGV[2], ARGV[3]) +if is_new == 1 then + redis.call('hincrby', KEYS[1], ARGV[1], 1) end -redis.call('hset', KEYS[2], ARGV[2], ARGV[3]) redis.call('expire', KEYS[2], 86400) -redis.call('hincrby', KEYS[1], ARGV[1], 1) +redis.call('expire', KEYS[1], 86400) local total = tonumber(redis.call('hget', KEYS[1], 'total')) or 0 local success = tonumber(redis.call('hget', KEYS[1], 'success')) or 0 local fail = tonumber(redis.call('hget', KEYS[1], 'fail')) or 0 local skipped = tonumber(redis.call('hget', KEYS[1], 'skipped')) or 0 -if success + fail + skipped >= total and total > 0 then +local dup = tonumber(redis.call('hget', KEYS[1], 'dup')) or 0 +if success + fail + skipped + dup >= total and total > 0 then local done = redis.call('hsetnx', KEYS[1], 'done', 1) if done == 1 then - return {1, total, success, fail, skipped} + return {1, total, success, fail, skipped, dup} end end -return {0, total, success, fail, skipped} +return {0, total, success, fail, skipped, dup} """ - def _record_and_check_done(self, progress_key, result_key, field, file_index, result_value): - """原子操作: 检查key存在 + 写入结果 + 递增计数 + 检查完成 - @return: (status, total, success, fail, skipped) - status: -1=key不存在(晚到回调), 0=未完成, 1=首次完成 + def _record_and_check_done(self, progress_key, result_key, field, file_index, result_value, run_id): + """原子操作: 轮次校验 + 幂等写入结果 + 递增计数 + 检查完成 + run_id 不匹配时视为过期回调直接丢弃,防止跨轮次数据污染。 + 同一 file_index 仅首次写入时递增计数器。 + @return: (status, total, success, fail, skipped, dup) + status: -1=key不存在或run_id不匹配(过期回调), 0=未完成, 1=首次完成 """ try: result = self._redisdb._redis.evalsha( self._lua_record_and_check_sha, 2, - progress_key, result_key, field, file_index, result_value, + progress_key, result_key, field, file_index, result_value, run_id, ) except NoScriptError: self._lua_record_and_check_sha = self._redisdb._redis.script_load( @@ -262,23 +280,24 @@ def _record_and_check_done(self, progress_key, result_key, field, file_index, re ) result = self._redisdb._redis.evalsha( self._lua_record_and_check_sha, 2, - progress_key, result_key, field, file_index, result_value, + progress_key, result_key, field, file_index, result_value, run_id, ) - return result[0], result[1], result[2], result[3], result[4] + return result[0], result[1], result[2], result[3], result[4], result[5] def start_requests(self, task): """ 遍历 URL 列表生成下载请求。 - 去重缓存命中的 URL 直接复用结果,不生成 Request。 - 先在本地收集所有缓存/跳过结果,通过 pipeline 一次性写入 Redis, - 再 yield Request,避免 worker 线程与初始化之间的竞态。 + - 任务内重复 URL 自动去重,结果继承首次出现的下载结果 + - 跨任务去重缓存命中的 URL 直接复用结果,不生成 Request + - 先在本地收集所有结果,通过 pipeline 一次性写入 Redis, + 再 yield Request,避免 worker 线程与初始化之间的竞态 """ urls = self.get_download_urls(task) if isinstance(urls, str): raise TypeError(f"get_download_urls应返回列表, 实际返回了字符串: {urls[:100]}") if not urls: log.warning(f"任务{task.id}无下载URL") - for item in self.on_task_all_done(task, [], 0, 0, 0, 0) or []: + for item in self.on_task_all_done(task, [], 0, 0, 0, 0, 0) or []: yield item return @@ -291,9 +310,14 @@ def start_requests(self, task): redis_key=self._redis_key, task_id=task_id ) + run_id = os.urandom(8).hex() + cached_count = 0 skipped_count = 0 + dup_count = 0 result_mapping = {} + dup_to_source = {} + seen_urls = {} pending_requests = [] for index, url in enumerate(urls): @@ -303,6 +327,14 @@ def start_requests(self, task): log.warning(f"任务{task_id} 跳过无效URL index={index}") continue + url = url.strip() + if url in seen_urls: + dup_to_source[index] = seen_urls[url] + dup_count += 1 + log.debug(f"任务{task_id} URL任务内去重 index={index} -> {seen_urls[url]}") + continue + seen_urls[url] = index + if self._file_dedup: try: cached_result = self._file_dedup.get(url) @@ -334,17 +366,23 @@ def start_requests(self, task): file_index=index, file_path=file_path, task=task, + run_id=run_id, callback=self.save_file, ) ) # 清理旧 key 并通过 pipeline 原子写入初始状态 + dup_key = setting.TAB_FILE_DUP.format( + redis_key=self._redis_key, task_id=task_id + ) pipe = self._redisdb._redis.pipeline() pipe.delete(progress_key) pipe.delete(result_key) + pipe.delete(dup_key) progress_fields = { "total": total, "success": cached_count, - "fail": 0, "skipped": skipped_count, + "fail": 0, "skipped": skipped_count, "dup": dup_count, + "run_id": run_id, } for field, value in progress_fields.items(): pipe.hset(progress_key, field, value) @@ -353,17 +391,23 @@ def start_requests(self, task): for field, value in result_mapping.items(): pipe.hset(result_key, field, value) pipe.expire(result_key, 86400) + if dup_to_source: + for dup_idx, src_idx in dup_to_source.items(): + pipe.hset(dup_key, str(dup_idx), str(src_idx)) + pipe.expire(dup_key, 86400) pipe.execute() + if dup_count > 0: + log.info(f"任务{task_id} 任务内URL去重{dup_count}个") if cached_count > 0: - log.info(f"任务{task_id} 去重命中{cached_count}/{total}个文件") + log.info(f"任务{task_id} 去重缓存命中{cached_count}/{total}个文件") - # 全部命中缓存或跳过,直接触发 on_task_all_done - if cached_count + skipped_count >= total: - result = self._assemble_results(task_id, total) + # 全部命中缓存/跳过/去重,直接触发 on_task_all_done + if cached_count + skipped_count + dup_count >= total: try: + result = self._assemble_results(task_id, total) for item in self.on_task_all_done( - task, result, cached_count, 0, skipped_count, total + task, result, cached_count, 0, skipped_count, dup_count, total ) or []: yield item except Exception as e: @@ -385,6 +429,7 @@ def save_file(self, request, response): file_index = request.file_index url = request.url file_path = request.file_path + run_id = getattr(request, "run_id", "") try: result_url = self.process_file(task_id, url, file_path, response) @@ -392,6 +437,9 @@ def save_file(self, request, response): log.error(f"任务{task_id} process_file异常 url={url} error={e}") raise + if not result_url: + log.warning(f"任务{task_id} process_file返回空值 url={url}, 将计为成功但结果为None") + # 写入去重缓存(异常不影响主流程) if self._file_dedup and result_url: try: @@ -399,26 +447,22 @@ def save_file(self, request, response): except Exception as e: log.error(f"任务{task_id} 去重缓存写入异常 url={url} error={e}") - # 原子操作: 检查key存在 + 写入结果 + 递增计数 + 检查完成 + # 原子操作: 轮次校验 + 幂等写入结果 + 递增计数 + 检查完成 progress_key = setting.TAB_FILE_PROGRESS.format( redis_key=self._redis_key, task_id=task_id ) result_key = setting.TAB_FILE_RESULT.format( redis_key=self._redis_key, task_id=task_id ) - status, total, success, fail, skipped = self._record_and_check_done( - progress_key, result_key, "success", str(file_index), result_url or "", + status, total, success, fail, skipped, dup = self._record_and_check_done( + progress_key, result_key, "success", str(file_index), result_url or "", run_id, ) if status == -1: - log.debug(f"任务{task_id} 进度key已清理, 跳过晚到回调的Redis写入") - try: - self.on_file_downloaded(task_id, url, result_url) - except Exception as e: - log.error(f"任务{task_id} on_file_downloaded回调异常 url={url} error={e}") + log.debug(f"任务{task_id} 过期回调已丢弃 url={url}") return - log.info(f"任务{task_id} 文件下载成功 [{success + fail + skipped}/{total}] url={url}") + log.info(f"任务{task_id} 文件下载成功 [{success + fail + skipped + dup}/{total}] url={url}") try: self.on_file_downloaded(task_id, url, result_url) @@ -427,10 +471,10 @@ def save_file(self, request, response): if status == 1: task = request.task - result = self._assemble_results(task_id, total) try: + result = self._assemble_results(task_id, total) for item in self.on_task_all_done( - task, result, success, fail, skipped, total + task, result, success, fail, skipped, dup, total ) or []: yield item except Exception as e: @@ -450,27 +494,24 @@ def failed_request(self, request, response, e): yield request return - # 原子操作: 检查key存在 + 写入结果 + 递增计数 + 检查完成 + run_id = getattr(request, "run_id", "") + + # 原子操作: 轮次校验 + 幂等写入结果 + 递增计数 + 检查完成 progress_key = setting.TAB_FILE_PROGRESS.format( redis_key=self._redis_key, task_id=task_id ) result_key = setting.TAB_FILE_RESULT.format( redis_key=self._redis_key, task_id=task_id ) - status, total, success, fail, skipped = self._record_and_check_done( - progress_key, result_key, "fail", str(file_index), "", + status, total, success, fail, skipped, dup = self._record_and_check_done( + progress_key, result_key, "fail", str(file_index), "", run_id, ) if status == -1: - log.debug(f"任务{task_id} 进度key已清理, 跳过晚到回调的Redis写入") - try: - self.on_file_failed(task_id, request.url, e) - except Exception as e_cb: - log.error(f"任务{task_id} on_file_failed回调异常 url={request.url} error={e_cb}") - yield request + log.debug(f"任务{task_id} 过期回调已丢弃 url={request.url}") return - log.error(f"任务{task_id} 文件下载失败 [{success + fail + skipped}/{total}] url={request.url}") + log.error(f"任务{task_id} 文件下载失败 [{success + fail + skipped + dup}/{total}] url={request.url}") try: self.on_file_failed(task_id, request.url, e) @@ -479,10 +520,10 @@ def failed_request(self, request, response, e): if status == 1: task = request.task - result = self._assemble_results(task_id, total) try: + result = self._assemble_results(task_id, total) for item in self.on_task_all_done( - task, result, success, fail, skipped, total + task, result, success, fail, skipped, dup, total ) or []: yield item except Exception as e_done: @@ -495,29 +536,52 @@ def failed_request(self, request, response, e): def _assemble_results(self, task_id, total): """ - 从 Redis 结果 Hash 中一次性拉取所有文件处理结果, - 按 0~total-1 顺序组装为有序列表返回。 + 从 Redis 中拉取文件处理结果和任务内重复映射, + 按 0~total-1 顺序组装为有序列表,重复索引继承首次出现的结果。 + 使用 hscan_iter 分批读取,避免超大任务时 hgetall 的内存峰值。 """ result_key = setting.TAB_FILE_RESULT.format( redis_key=self._redis_key, task_id=task_id ) - raw_data = self._redisdb.hgetall(result_key) - all_data = { - (k.decode() if isinstance(k, bytes) else k): (v.decode() if isinstance(v, bytes) else v) - for k, v in raw_data.items() - } - return [all_data.get(str(i)) or None for i in range(total)] + all_data = {} + for k, v in self._redisdb._redis.hscan_iter(result_key, count=1000): + key = k.decode() if isinstance(k, bytes) else k + val = v.decode() if isinstance(v, bytes) else v + all_data[key] = val + result = [all_data.get(str(i)) or None for i in range(total)] + + dup_key = setting.TAB_FILE_DUP.format( + redis_key=self._redis_key, task_id=task_id + ) + for dup_idx_raw, src_idx_raw in self._redisdb._redis.hscan_iter(dup_key, count=1000): + dup_idx = int(dup_idx_raw.decode() if isinstance(dup_idx_raw, bytes) else dup_idx_raw) + src_idx = int(src_idx_raw.decode() if isinstance(src_idx_raw, bytes) else src_idx_raw) + result[dup_idx] = result[src_idx] + + return result def _cleanup_task_redis(self, task_id): - """清理任务相关的 Redis 进度和结果 key""" + """清理任务相关的 Redis 进度、结果和重复映射 key""" progress_key = setting.TAB_FILE_PROGRESS.format( redis_key=self._redis_key, task_id=task_id ) result_key = setting.TAB_FILE_RESULT.format( redis_key=self._redis_key, task_id=task_id ) + dup_key = setting.TAB_FILE_DUP.format( + redis_key=self._redis_key, task_id=task_id + ) self._redisdb.clear(progress_key) self._redisdb.clear(result_key) + self._redisdb.clear(dup_key) + + def close(self): + """释放文件去重缓存资源""" + if self._file_dedup: + try: + self._file_dedup.close() + except Exception as e: + log.error(f"文件去重缓存关闭异常 error={e}") @classmethod def to_DebugFileSpider(cls, *args, **kwargs): diff --git a/feapder/setting.py b/feapder/setting.py index 91d6e4ca..34a38ded 100644 --- a/feapder/setting.py +++ b/feapder/setting.py @@ -15,6 +15,8 @@ TAB_FILE_PROGRESS = "{redis_key}:h_file_progress:{task_id}" # 文件爬虫 - 文件结果 TAB_FILE_RESULT = "{redis_key}:h_file_result:{task_id}" +# 文件爬虫 - 任务内重复URL映射 +TAB_FILE_DUP = "{redis_key}:h_file_dup:{task_id}" # 文件爬虫 - 去重缓存 TAB_FILE_DEDUP = "{redis_key}:h_file_dedup" # 用户池 diff --git a/feapder/templates/file_spider_template.tmpl b/feapder/templates/file_spider_template.tmpl new file mode 100644 index 00000000..4c6f68dd --- /dev/null +++ b/feapder/templates/file_spider_template.tmpl @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +""" +Created on {DATE} +--------- +@summary: +--------- +@author: {USER} +""" + +import json + +import feapder +from feapder import ArgumentParser + + +class ${spider_name}(feapder.FileSpider): + # 自定义数据库,若项目中有setting.py文件,此自定义可删除 + __custom_setting__ = dict( + REDISDB_IP_PORTS="localhost:6379", + REDISDB_USER_PASS="", + REDISDB_DB=0, + MYSQL_IP="localhost", + MYSQL_PORT=3306, + MYSQL_DB="", + MYSQL_USER_NAME="", + MYSQL_USER_PASS="", + ) + + def get_download_urls(self, task): + return json.loads(task.file_urls) + + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): + # 任务状态需在此显式更新;实现需保证幂等,异常可能触发任务重试 + # fail_count == 0 且有实际成功下载则标记完成;全部跳过或无有效URL标记失败 + if fail_count == 0 and success_count > 0: + yield self.update_task_batch(task.id, 1) + else: + yield self.update_task_batch(task.id, -1) + + +if __name__ == "__main__": + spider = ${spider_name}( + redis_key="xxx:xxx", # 分布式爬虫调度信息存储位置 + task_table="", # mysql中的任务表 + task_keys=["id", "file_urls"], # 需要获取任务表里的字段名,可添加多个 + task_state="state", # mysql中任务状态字段 + save_dir="./downloads", # 文件保存根目录 + # file_dedup="redis", # 跨任务去重策略: None / "redis" / "mysql" + ) + + parser = ArgumentParser(description="${spider_name}爬虫") + + parser.add_argument( + "--start_master", + action="store_true", + help="添加任务", + function=spider.start_monitor_task, + ) + parser.add_argument( + "--start_worker", action="store_true", help="启动爬虫", function=spider.start + ) + + parser.start() + + # 直接启动 + # spider.start() # 启动爬虫 + # spider.start_monitor_task() # 添加任务 + + # 通过命令行启动 + # python ${file_name} --start_master # 添加任务 + # python ${file_name} --start_worker # 启动爬虫 diff --git a/tests/file-spider/test_dedup_file_spider.py b/tests/file-spider/test_dedup_file_spider.py index e11ecdc9..ca64ab5f 100644 --- a/tests/file-spider/test_dedup_file_spider.py +++ b/tests/file-spider/test_dedup_file_spider.py @@ -35,9 +35,9 @@ def get_download_urls(self, task): def on_file_downloaded(self, task_id, url, file_path): log.info(f"任务{task_id} 文件就绪 path={file_path}") - def on_task_all_done(self, task, result, success_count, fail_count, total_count): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): log.info(f"任务{task.id} 完成 成功={success_count} 失败={fail_count}") - yield self.update_task_batch(task.id, 1 if fail_count == 0 else -1) + yield self.update_task_batch(task.id, 1 if fail_count == 0 and success_count > 0 else -1) if __name__ == "__main__": @@ -48,4 +48,4 @@ def on_task_all_done(self, task, result, success_count, fail_count, total_count) save_dir="./downloads", file_dedup="redis", # "redis" / "mysql" / FileDedup 实例 ) - spider.start_monitor_task() + spider.start() diff --git a/tests/file-spider/test_local_file_spider.py b/tests/file-spider/test_local_file_spider.py index f0a87454..3c2a4127 100644 --- a/tests/file-spider/test_local_file_spider.py +++ b/tests/file-spider/test_local_file_spider.py @@ -30,8 +30,8 @@ def get_download_urls(self, task): def on_file_downloaded(self, task_id, url, file_path): log.info(f"任务{task_id} 文件保存成功 path={file_path}") - def on_task_all_done(self, task, result, success_count, fail_count, total_count): - if fail_count == 0: + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): + if fail_count == 0 and success_count > 0: yield self.update_task_batch(task.id, 1) else: yield self.update_task_batch(task.id, -1) @@ -44,4 +44,4 @@ def on_task_all_done(self, task, result, success_count, fail_count, total_count) task_keys=["id", "file_urls"], save_dir="./downloads", ) - spider.start_monitor_task() + spider.start() diff --git a/tests/file-spider/test_oss_file_spider.py b/tests/file-spider/test_oss_file_spider.py index 58a2c00c..9811d9cc 100644 --- a/tests/file-spider/test_oss_file_spider.py +++ b/tests/file-spider/test_oss_file_spider.py @@ -45,9 +45,9 @@ def process_file(self, task_id, url, file_path, response): log.info(f"任务{task_id} 上传成功 url={cloud_url}") return cloud_url - def on_task_all_done(self, task, result, success_count, fail_count, total_count): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): log.info(f"任务{task.id} 完成 成功={success_count} 失败={fail_count}") - if success_count > 0: + if fail_count == 0 and success_count > 0: yield self.update_task_batch(task.id, 1) else: yield self.update_task_batch(task.id, -1) @@ -59,4 +59,4 @@ def on_task_all_done(self, task, result, success_count, fail_count, total_count) task_table="file_task", task_keys=["id", "file_urls"], ) - spider.start_monitor_task() + spider.start() From 517042a08c695fe290ff103eb9f4e605e31831ce Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Wed, 8 Apr 2026 19:14:34 +0800 Subject: [PATCH 13/14] =?UTF-8?q?=E7=AC=AC=E4=BA=94=E6=AC=A1=E6=8F=90?= =?UTF-8?q?=E4=BA=A4=EF=BC=9A=E4=BF=AE=E5=A4=8D=E6=96=B9=E6=B3=95=E7=AD=BE?= =?UTF-8?q?=E5=90=8D=E4=B8=8D=E4=B8=80=E8=87=B4=E9=97=AE=E9=A2=98=EF=BC=8C?= =?UTF-8?q?=E6=96=87=E6=A1=A3=E6=9B=B4=E6=96=B0=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/usage/FileSpider.md | 6 ++++-- feapder/core/base_parser.py | 18 ++++++++++++------ feapder/core/spiders/file_spider.py | 16 +++++++++++----- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/docs/usage/FileSpider.md b/docs/usage/FileSpider.md index d669e705..14380ae3 100644 --- a/docs/usage/FileSpider.md +++ b/docs/usage/FileSpider.md @@ -45,7 +45,7 @@ CREATE TABLE `file_task` ( | 方法 | 说明 | 默认行为 | |------|------|----------| | `get_file_path(task, url, index)` | 返回文件保存路径/存储标识 | `{save_dir}/{task_id}/{index}_{md5(filename)}{ext}` | -| `process_file(task_id, url, file_path, response)` | 处理文件内容,返回最终存储位置(需保证幂等) | 流式保存到本地磁盘,返回本地路径 | +| `process_file(task_id, url, file_path, response)` | 处理文件内容,返回最终存储位置(需保证幂等,不可返回空值) | 流式保存到本地磁盘,返回本地路径 | | `validate(request, response)` | 校验下载响应 | 4xx/5xx抛异常触发重试,3xx自动跟随 | | `on_file_downloaded(task_id, url, file_path)` | 单个文件下载成功回调 | 无 | | `on_file_failed(task_id, url, error)` | 单个文件下载失败回调 | 无 | @@ -65,13 +65,15 @@ save_file (框架层,不应重写) └── yield update_task_batch → 更新任务状态 ``` -### `process_file` 幂等性要求 +### `process_file` 约束 `process_file` 在下载失败重试时可能被多次调用(同一 URL、同一 `file_path`),实现需保证幂等性: - 默认实现使用 `"wb"` 模式覆盖写入,天然幂等 - 重写时避免使用追加模式(`"ab"`) - 云存储场景建议使用 `put_object` 等覆盖语义的 API +**返回值要求**: 必须返回非空字符串(文件最终存储位置)。返回 `None` 或空字符串 `""` 会被视为处理失败,触发框架重试,直至重试次数耗尽后计入失败。 + ### `on_task_all_done` 参数说明 ```python diff --git a/feapder/core/base_parser.py b/feapder/core/base_parser.py index e3a4dcd4..42d76bd0 100644 --- a/feapder/core/base_parser.py +++ b/feapder/core/base_parser.py @@ -232,11 +232,14 @@ def process_file(self, task_id, url, file_path, response): 处理下载的文件内容,返回文件最终存储位置。用户按需重写 默认实现: 流式保存到本地磁盘,返回本地路径 云存储场景: 重写此方法上传到 OSS/S3 等,返回云存储 URL + 注意: + - 此方法在下载失败重试时可能被多次调用,实现需保证幂等性 + - 必须返回非空字符串,返回空值会触发重试直至失败 @param task_id: 任务 ID @param url: 文件原始 URL @param file_path: get_file_path 返回的路径/标识 @param response: 下载响应 - @return: str - 文件最终存储位置(本地路径或云存储 URL) + @return: str - 文件最终存储位置(不可为空) """ os.makedirs(os.path.dirname(file_path), exist_ok=True) with open(file_path, "wb") as f: @@ -263,17 +266,20 @@ def on_file_failed(self, task_id, url, error): """ pass - def on_task_all_done(self, task, result, success_count, fail_count, total_count): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): """ 任务所有文件处理完毕的回调 用户应在此方法中 yield Item 写入结果表、yield self.update_task_batch() 更新任务状态 @param task: PerfectDict - 任务对象,包含 task_keys 指定的字段 @param result: List[str|None] - 每个文件的处理结果, 顺序与 get_download_urls 返回的列表一致。 - 成功为文件存储位置(本地路径或云存储 URL),失败为 None - @param success_count: 成功数 - @param fail_count: 失败数 - @param total_count: 总数 + 成功为文件存储位置(本地路径或云存储 URL),失败为 None。 + 任务内重复URL的结果继承首次出现的结果 + @param success_count: 成功数(含去重缓存命中) + @param fail_count: 下载失败数(重试耗尽) + @param skipped_count: 跳过数(无效URL、get_file_path异常等) + @param dup_count: 任务内重复URL数 + @param total_count: 总数(success + fail + skipped + dup = total) """ pass diff --git a/feapder/core/spiders/file_spider.py b/feapder/core/spiders/file_spider.py index 45c40d39..5e1e0b34 100644 --- a/feapder/core/spiders/file_spider.py +++ b/feapder/core/spiders/file_spider.py @@ -172,12 +172,14 @@ def process_file(self, task_id, url, file_path, response): 处理下载的文件内容,返回文件最终存储位置。用户按需重写 默认实现: 流式保存到本地磁盘,返回本地路径 云存储场景: 重写此方法上传到 OSS/S3 等,返回云存储 URL - 注意: 此方法在下载失败重试时可能被多次调用,实现需保证幂等性 + 注意: + - 此方法在下载失败重试时可能被多次调用,实现需保证幂等性 + - 必须返回非空字符串,返回空值会触发重试直至失败 @param task_id: 任务 ID @param url: 文件原始 URL @param file_path: get_file_path 返回的路径/标识 @param response: 下载响应 - @return: str - 文件最终存储位置 + @return: str - 文件最终存储位置(不可为空) """ os.makedirs(os.path.dirname(file_path), exist_ok=True) with open(file_path, "wb") as f: @@ -297,8 +299,12 @@ def start_requests(self, task): raise TypeError(f"get_download_urls应返回列表, 实际返回了字符串: {urls[:100]}") if not urls: log.warning(f"任务{task.id}无下载URL") - for item in self.on_task_all_done(task, [], 0, 0, 0, 0, 0) or []: - yield item + try: + for item in self.on_task_all_done(task, [], 0, 0, 0, 0, 0) or []: + yield item + except Exception as e: + log.error(f"任务{task.id} on_task_all_done异常 error={e}") + log.warning(f"任务{task.id} 状态未更新, 请检查on_task_all_done实现") return total = len(urls) @@ -438,7 +444,7 @@ def save_file(self, request, response): raise if not result_url: - log.warning(f"任务{task_id} process_file返回空值 url={url}, 将计为成功但结果为None") + raise Exception(f"process_file返回空值 url={url}, 请检查实现是否正确返回了文件存储位置") # 写入去重缓存(异常不影响主流程) if self._file_dedup and result_url: From b1db0c9bf0b0ece98f3850d20f5e045bb4a76df0 Mon Sep 17 00:00:00 2001 From: gaoyunjian Date: Wed, 8 Apr 2026 19:28:21 +0800 Subject: [PATCH 14/14] =?UTF-8?q?=E7=AC=AC=E5=85=AD=E6=AC=A1=E6=8F=90?= =?UTF-8?q?=E4=BA=A4=EF=BC=9A=E4=BF=AE=E5=A4=8D=E6=96=B9=E6=B3=95=E7=AD=BE?= =?UTF-8?q?=E5=90=8D=E4=B8=8D=E4=B8=80=E8=87=B4=E9=97=AE=E9=A2=98=EF=BC=9B?= =?UTF-8?q?=E9=80=89=E6=8B=A9mysql=E4=BD=9C=E4=B8=BA=E7=BC=93=E5=AD=98?= =?UTF-8?q?=E6=97=B6=EF=BC=8C=E6=8C=89=20redis=5Fkey=E5=88=86=E8=A1=A8?= =?UTF-8?q?=EF=BC=8C=E5=87=8F=E5=B0=91=E8=B7=A8=E4=B8=9A=E5=8A=A1=E4=B8=B2?= =?UTF-8?q?=E6=89=B0=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/usage/FileSpider.md | 42 ++++++++++++++++++++- feapder/core/spiders/file_spider.py | 5 ++- feapder/templates/file_spider_template.tmpl | 11 +++--- tests/file-spider/test_oss_result_spider.py | 24 ++++++++++-- 4 files changed, 71 insertions(+), 11 deletions(-) diff --git a/docs/usage/FileSpider.md b/docs/usage/FileSpider.md index 14380ae3..74c143e4 100644 --- a/docs/usage/FileSpider.md +++ b/docs/usage/FileSpider.md @@ -163,6 +163,46 @@ class MyFileSpider(feapder.FileSpider): ## 4. 使用示例 +### 启动方式(单进程 / master-worker 分离) + +FileSpider 支持两种启动方式: + +1. 单进程:`spider.start()`,适合本地调试 +2. 分离运行:master 仅负责派发任务,worker 仅负责下载处理,适合生产部署 + +```python +from feapder import ArgumentParser + +if __name__ == "__main__": + spider = MyFileSpider( + redis_key="my_file_spider", + task_table="file_task", + task_keys=["id", "file_urls"], + ) + + parser = ArgumentParser(description="MyFileSpider 文件下载爬虫") + parser.add_argument( + "--start_master", + action="store_true", + help="添加任务", + function=spider.start_monitor_task, + ) + parser.add_argument( + "--start_worker", + action="store_true", + help="启动爬虫", + function=spider.start, + ) + parser.start() +``` + +命令行启动: + +```bash +uv run my_file_spider.py --start_master +uv run my_file_spider.py --start_worker +``` + ### 场景一:保存到本地磁盘 最简单的用法,下载文件保存到本地: @@ -330,7 +370,7 @@ FileSpider 提供两级去重: |------|--------|------|----------| | 不去重 | `None`(默认) | - | 每次都重新下载 | | Redis 去重 | `"redis"` | Redis Hash | 分布式共享,多进程安全 | -| MySQL 去重 | `"mysql"` | MySQL 表(自动建表) | 持久化,长期缓存 | +| MySQL 去重 | `"mysql"` | MySQL 表(按 `redis_key` 自动分表) | 持久化,隔离不同业务 | | 自定义去重 | `FileDedup` 实例 | 用户自定义 | 特殊需求 | ### 自定义去重 diff --git a/feapder/core/spiders/file_spider.py b/feapder/core/spiders/file_spider.py index 5e1e0b34..d1c7e54f 100644 --- a/feapder/core/spiders/file_spider.py +++ b/feapder/core/spiders/file_spider.py @@ -8,6 +8,7 @@ import hashlib import os +import re import warnings from urllib.parse import urlparse, unquote @@ -125,7 +126,9 @@ def __init__( elif file_dedup == "mysql": if file_dedup_expire is not None: log.warning("file_dedup_expire仅在file_dedup='redis'时生效") - self._file_dedup = MysqlFileDedup() + redis_namespace = re.sub(r"[^0-9a-zA-Z_]+", "_", self._redis_key).strip("_") + dedup_table = f"file_dedup_{redis_namespace}" if redis_namespace else "file_dedup_default" + self._file_dedup = MysqlFileDedup(table=dedup_table) elif isinstance(file_dedup, FileDedup): self._file_dedup = file_dedup elif file_dedup is not None: diff --git a/feapder/templates/file_spider_template.tmpl b/feapder/templates/file_spider_template.tmpl index 4c6f68dd..4c261233 100644 --- a/feapder/templates/file_spider_template.tmpl +++ b/feapder/templates/file_spider_template.tmpl @@ -62,10 +62,9 @@ if __name__ == "__main__": parser.start() - # 直接启动 - # spider.start() # 启动爬虫 - # spider.start_monitor_task() # 添加任务 + # 启动方式一:单进程(调试方便) + # spider.start() - # 通过命令行启动 - # python ${file_name} --start_master # 添加任务 - # python ${file_name} --start_worker # 启动爬虫 + # 启动方式二:分离 master/worker(生产推荐) + # uv run ${file_name} --start_master # 仅负责派发任务 + # uv run ${file_name} --start_worker # 仅负责消费下载 diff --git a/tests/file-spider/test_oss_result_spider.py b/tests/file-spider/test_oss_result_spider.py index 96143712..d8a91838 100644 --- a/tests/file-spider/test_oss_result_spider.py +++ b/tests/file-spider/test_oss_result_spider.py @@ -15,6 +15,7 @@ from urllib.parse import urlparse, unquote import feapder +from feapder import ArgumentParser from feapder.network.item import Item from feapder.utils.log import log @@ -65,10 +66,13 @@ def process_file(self, task_id, url, file_path, response): # self.oss_client.put_object(file_path, response.content) return f"https://my-bucket.oss.aliyuncs.com/{file_path}" - def on_task_all_done(self, task, result, success_count, fail_count, total_count): + def on_task_all_done(self, task, result, success_count, fail_count, skipped_count, dup_count, total_count): # result 与 get_download_urls 返回的列表严格位置对应 # 例: ["https://oss.com/a.jpg", "https://oss.com/b.jpg", None, "https://oss.com/d.jpg"] - log.info(f"任务{task.id} 完成 成功={success_count} 失败={fail_count}") + log.info( + f"任务{task.id} 完成 成功={success_count} 失败={fail_count} " + f"跳过={skipped_count} 去重={dup_count}" + ) # 组装结果 Item 写入结果表 item = FileResultItem() @@ -89,4 +93,18 @@ def on_task_all_done(self, task, result, success_count, fail_count, total_count) task_table="file_task", task_keys=["id", "file_urls"], ) - spider.start_monitor_task() + + parser = ArgumentParser(description="OssResultSpider 文件下载爬虫") + parser.add_argument( + "--start_master", + action="store_true", + help="添加任务", + function=spider.start_monitor_task, + ) + parser.add_argument( + "--start_worker", + action="store_true", + help="启动爬虫", + function=spider.start, + ) + parser.start()