在 Scrapy 的 pipelines 中自己实现图片下载,速度很慢,为什么?


没有用文档中所说的图片管道来实现,为什么我这么实现就很慢呢?

以下附上代码:


 # -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import logging
import os
import urllib2

import requests

class boboCrawlPipeline(object):
    """Item pipeline that records item titles and downloads their images.

    For every item the title is appended to ``data.txt`` and each distinct
    URL in ``item['imgurl']`` is saved into a directory named after the
    title, below the ``av`` directory.

    NOTE(review): the downloads use blocking ``requests`` calls inside
    ``process_item``; presumably this stalls Scrapy's event loop while an
    image is being fetched -- confirm, and consider Scrapy's built-in
    images pipeline if throughput matters.
    """

    # Base address prepended to forum-relative attachment paths.
    forum_url = "http://38.103.161.185"
    # Relative path fragments that identify forum-hosted attachments.
    forum_markers = ('attachments/month', 'attachments/day')
    # Upper bound on download attempts for a single image.
    max_attempts = 10

    logger = logging.getLogger(__name__)

    def __init__(self):
        """Open the title log, create a shared HTTP session, enter ``av``."""
        # Opened before the chdir below, so data.txt lives in the directory
        # the crawl was started from, not inside ``av``.
        self.f = open('data.txt', 'w+')
        self.browse_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:22.0) Gecko/20100101 Firefox/22.0'}
        # One session for the whole crawl lets consecutive downloads reuse
        # connections instead of reconnecting for every image.
        self.session = requests.Session()
        if not os.path.exists('av'):
            os.makedirs('av')
        # All later relative paths (image directories) resolve inside ``av``.
        os.chdir('av')

    def process_item(self, item, spider):
        """Log the item's title and download every distinct image URL.

        Args:
            item: mapping with ``headTitle`` (sequence whose first element
                is the title text) and ``imgurl`` (iterable of image URLs).
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged ``item``.
        """
        # Only the part of the title before the first '-' is kept; it is
        # written to data.txt as GBK bytes and used (decoded) as the
        # directory name.
        title = item['headTitle'][0].split('-')[0].encode('gbk').rstrip()
        dirname = title.decode('gbk')
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        self.f.write(title + '\n')
        # set() drops duplicate URLs so each image is requested once.
        for img in set(item['imgurl']):
            self.down_link(img, os.path.join(dirname, os.path.basename(img)))
        return item

    def down_link(self, url, filename, istorrent=0):
        """Download ``url`` to ``filename`` unless a non-empty file exists.

        Forum-relative attachment paths are completed with ``forum_url``.
        A failed attempt is logged and retried, up to ``max_attempts``
        times; after that the download is abandoned without raising.

        Args:
            url: absolute URL, or a forum-relative attachment path.
            filename: destination path of the downloaded file.
            istorrent: unused; kept for backward compatibility.

        Returns:
            None.
        """
        # TODO: compare an MD5 checksum instead of only checking the size.
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            return
        if any(marker in url for marker in self.forum_markers):
            url = self.forum_url + "/forum/" + url

        for attempt in range(1, self.max_attempts + 1):
            try:
                response = self.session.get(
                    url, headers=self.browse_headers, timeout=10)
                content = response.content
                if content is None:
                    return
                # The context manager closes the file.  The previous code
                # called .close() on the return value of write(), which
                # raised on every successful download and caused each
                # image to be fetched max_attempts times.
                with open(filename, "wb") as out:
                    out.write(content)
                return
            except Exception as e:
                # Best effort: record the failure and try again.
                self.logger.warning(
                    "download attempt %d/%d failed for %s: %s",
                    attempt, self.max_attempts, url, e)
        self.logger.error("giving up on %s after %d attempts",
                          url, self.max_attempts)

    def close_spider(self, spider):
        """Release the title log and the HTTP session when the spider ends."""
        self.f.close()
        self.session.close()

python scrapy

Rasho 9 years, 7 months ago

Your Answer