Scrapy:自己在 pipelines 中实现图片下载,速度很慢,为什么?
我没有使用文档中所说的图片管道(ImagesPipeline),而是自己实现的,为什么这样实现就很慢呢?
以下附上代码:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import io
import logging
import os
import urllib2

import requests
class boboCrawlPipeline(object):
    """Item pipeline that logs thread titles and saves their images to disk.

    For each item, the title (the part of ``item['headTitle'][0]`` before the
    first ``-``) is appended to ``data.txt`` and every distinct URL in
    ``item['imgurl']`` is downloaded into a directory named after the title.

    NOTE: the downloads are blocking ``requests`` calls made from
    ``process_item``, so item processing waits while each image is fetched.
    Scrapy's built-in ImagesPipeline is the non-blocking alternative.
    """

    FORUM_URL = "http://38.103.161.185"
    # Relative attachment paths containing one of these belong to the forum.
    FORUM_ATTACHMENT_MARKERS = ("attachments/month", "attachments/day")
    MAX_ATTEMPTS = 10
    REQUEST_TIMEOUT = 10  # seconds, passed to requests as ``timeout``

    _log = logging.getLogger(__name__)

    def __init__(self):
        """Open the title log, create the HTTP session and enter ``av/``."""
        # Opened before the chdir below, so data.txt stays in the start
        # directory. Titles are stored GBK-encoded; characters that GBK
        # cannot represent are replaced instead of aborting the item.
        self.f = io.open('data.txt', 'w+', encoding='gbk', errors='replace')
        self.browse_headers ={'User-Agent':'Mozilla/5.0 (Windows NT 5.1; rv:22.0) Gecko/20100101 Firefox/22.0'}
        # One shared session lets requests reuse connections between images
        # instead of opening a new connection for every download attempt.
        self.session = requests.Session()
        if not os.path.exists('av'):
            os.makedirs('av')
        # NOTE: this changes the working directory of the whole process; all
        # image directories below are created relative to av/.
        os.chdir('av')

    def process_item(self, item, spider):
        """Record the item's title and download all of its images.

        Args:
            item: mapping with ``headTitle`` (sequence whose first element is
                the page title) and ``imgurl`` (iterable of image URLs).
            spider: the spider that produced the item (unused).

        Returns:
            The unmodified ``item`` so later pipelines can process it.
        """
        title = item['headTitle'][0].split('-')[0].rstrip()
        if not os.path.exists(title):
            os.makedirs(title)
        self.f.write(title + u'\n')
        # set() removes duplicate URLs so each image is fetched only once.
        for img in set(item['imgurl']):
            self.down_link(img, os.path.join(title, os.path.basename(img)))
        return item

    def down_link(self, url, filename, istorrent=0):
        """Download ``url`` to ``filename``, retrying on errors.

        Nothing is downloaded when ``filename`` already exists and is not
        empty. Failures are logged and never raised, so one broken image
        does not stop the item from being processed.

        Args:
            url: absolute URL, or a forum-relative attachment path.
            filename: destination path of the downloaded file.
            istorrent: unused; kept for backward compatibility.
        """
        if os.path.exists(filename) and os.path.getsize(filename) > 0:  # TODO MD5
            return
        # Attachments hosted on the forum are given as relative paths.
        if any(marker in url for marker in self.FORUM_ATTACHMENT_MARKERS):
            url = self.FORUM_URL + "/forum/" + url

        for attempt in range(1, self.MAX_ATTEMPTS + 1):
            try:
                response = self.session.get(
                    url,
                    headers=self.browse_headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                if not response.ok:
                    # Do not store an HTTP error page as if it were an image.
                    self._log.warning(
                        "HTTP %s for %s, skipping", response.status_code, url)
                    return
                if not response.content:
                    return
                # write() returns None, so its result must not be kept as
                # the file handle; ``with`` closes the real handle reliably.
                with open(filename, "wb") as out:
                    out.write(response.content)
                return
            except Exception as exc:
                # Best-effort download: log the failure and try again.
                self._log.warning(
                    "attempt %d/%d failed for %s -> %s: %s",
                    attempt, self.MAX_ATTEMPTS, url, filename, exc)
        self._log.error(
            "giving up on %s after %d attempts", url, self.MAX_ATTEMPTS)

    def close_spider(self, spider):
        """Release the title log and the HTTP session when the spider ends."""
        self.f.close()
        self.session.close()
Rasho
9 years, 7 months ago