scrapy自动翻页采集,第二页跳转后,爬虫自动结束
# -*- coding: utf-8 -*-
import scrapy
from weather.items import WeatherItem
from scrapy.http import Request
class WeatherSpider(scrapy.Spider):
name = "myweather"
allowed_domains = ["http://xjh.haitou.cc/nj/uni-21"]
start_urls = ["http://xjh.haitou.cc/nj/uni-21/page-2"]
url="http://xjh.haitou.cc"
def parse(self, response):
item = WeatherItem()
preachs=response.xpath('//table[@id="mainInfoTable"]/tbody/tr')
for preach in preachs:
item['corp']=preach.xpath('.//div[@class="text-success company"]/text()').extract()
item['date']=preach.xpath('.//span[@class="hold-ymd"]/text()').extract()
item['location']=preach.xpath('.//td[@class="text-ellipsis"]/span/text()').extract()
item['click']=preach.xpath('.//td[@class="text-right"]/text()').extract()
yield item
nextlink=response.xpath('//li[@class="next"]/a/@href').extract()
if nextlink:
link=nextlink[0]
print "##############"
print self.url+link
print "##############"
yield Request(self.url+link,callback=self.parse )
##############
http://xjh.haitou.cc/nj/uni-21/page-3
##############
2015-10-23 22:05:57 [scrapy] DEBUG: Filtered offsite request to 'xjh.haitou.cc': <GET http://xjh.haitou.cc/nj/uni-21/page-3>
2015-10-23 22:05:57 [scrapy] INFO: Closing spider (finished)
2015-10-23 22:05:57 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 261,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 10508,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 10, 23, 14, 5, 57, 9032),
'item_scraped_count': 20,
'log_count/DEBUG': 23,
'log_count/INFO': 7,
'offsite/domains': 1,
'offsite/filtered': 1,
'request_depth_max': 1,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2015, 10, 23, 14, 5, 56, 662979)}
2015-10-23 22:05:57 [scrapy] INFO: Spider closed (finished)
王讨厌与小吉祥
9 years, 4 months ago