当前位置: 代码迷 >> 综合 >> scrapy----下载文件
  详细解决方案

scrapy----下载文件

热度:80   发布时间:2023-12-24 20:53:55.0

1.qishu.py

# 需要下载的文件地址,需要是一个列表
# 如果不下载,只是将地址保存在数据库中,则不需要设置成列表
qishu['download_url'] = [download_url]

2.在pipelines.py中自定义自己的pipeline

from scrapy.pipelines.files import FilesPipeline
from scrapy.http import Request
class QishuxiazaiPipeline(FilesPipeline):
    """Files pipeline that saves each download under the last segment of its URL.

    The item is expected to carry a list of file URLs in ``item['download_url']``.
    """

    def get_media_requests(self, item, info):
        """Yield one download request for every URL in ``item['download_url']``.

        Each request carries the item and its own source URL in ``meta`` so
        that ``file_path`` can name the file after the URL that produced it.
        An empty list simply yields nothing.
        """
        for url in item['download_url']:
            yield Request(url, meta={'item': item, 'download_url': url})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to FILES_STORE) for ``request``.

        The path is the text after the last ``/`` of the download URL.
        ``item`` is keyword-only and optional; recent Scrapy versions pass it,
        while older callers that omit it keep working.
        """
        url = request.meta.get('download_url')
        if url is None:
            # Request was not built by get_media_requests above: fall back to
            # the first URL of the item, as the original implementation did.
            source = item if item is not None else request.meta['item']
            url = source['download_url'][0]
        return url.split('/')[-1]

    def item_completed(self, results, item, info):
        """Print the download results for the item and pass the item on."""
        print(results)
        return item

3.在settings.py中开启自己的pipeline

# Enable the custom pipeline; the integer is its order among item pipelines.
ITEM_PIPELINES = {
    'Qishu.pipelines.QishuxiazaiPipeline': 3,
    # When a custom FilesPipeline subclass is used, the built-in
    # FilesPipeline is switched off by mapping it to None.
    'scrapy.pipelines.files.FilesPipeline': None,
}
# Directory where downloaded files are stored.
FILES_STORE = 'files'
# Name of the item field that holds the list of file URLs.
FILES_URLS_FIELD = 'download_url'