scrapy中为什么不能进入第二层爬取?
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc
from scrapy.http import Request,FormRequest
from SCR.items import ScrItem
class ScrSpider(BaseSpider):
name = "scr"
allowed_domains = ["http://pubs.rsc.org"]
def start_requests(self):
return [FormRequest("http://pubs.rsc.org/en/search/journalresult",
formdata={'k1': 'v1', 'k2':
'v2','k3':'v3','k4':'v4'},
callback=self.parse)]
def parse2(self,response):
print 'The second step'
def parse(self, response):
sel = Selector(response)
base_url=get_base_url(response)
sites = sel.xpath(u'//h2[@class="title_text_s4_jrnls"]')
for site in sites:
urls=site.xpath('a/@href').extract()
links=[]
links = [urljoin_rfc(base_url,link) for link in urls]
for link in links:
print 'The first step'
yield Request(link,callback=self.parse2)
输出: The first step
但,没有输出 The second step
请教什么原因?
..............解决分割线.................
allowed_domains = ["http://pubs.rsc.org"]
应该是:allowed_domains = ["pubs.rsc.org"]
就是进不到第二层的原因
-July-
9 years, 8 months ago