使用python和beautifulSoup爬数据,爬到第三条就爬不动了
#coding=utf-8
from __future__ import print_function
from bs4 import BeautifulSoup
import urllib2
import sys
import string
reload(sys)
sys.setdefaultencoding("gbk")
filename="top500.csv"
f=open(filename,'w')
f.write('num,name,country,address,indroduce'+'\n')
a=1
for ye in range(0,20):
url="http://www.alexa.com/topsites/global;"+str(ye)
page=urllib2.urlopen(url)
soup=BeautifulSoup(page)
nameLists=soup.findAll(attrs={"class":"site-listing"})
for names in nameLists:
name=names.find('a').text
print(name)
siteUrl="http://www.alexa.com/siteinfo/"+name
ipUrl="http://www.ip.cn/index.php?ip=www."+name
print(siteUrl)
#获取网站所在国家
pageSite=urllib2.urlopen(siteUrl)
soup=BeautifulSoup(pageSite)
titleLists=soup.findAll(attrs={"class":"metrics-title"})
country=titleLists[1].find('a').text
print(country)
#获取网站服务器所在位置
ipSite=urllib2.urlopen(ipUrl)
soup=BeautifulSoup(ipSite)
addressList=soup.find(attrs={"class":"well"})
addresses=addressList.findAll('p')
address=addresses[1].text
address=address.replace(',',' ')#替换掉','
print(address)
name1="www."+name
print(name1)
introduce=names.find(attrs={"class":"description"}).text
introduce=introduce.replace('... More','')
introduce=introduce.replace(',','.')#替换掉','
paiming=str(a)
f.write(paiming+','+name+','+country+','+address+','+introduce.encode('gbk','ignore')+'\n')
a=string.atoi(paiming)+1
f.close()
print("\nover")
贴出代码,爬到第四个数据就爬不动了,光标一直闪没有继续下去,请教一下是因为打开的url太多了么,并且有循环嵌套,不是的话是什么原因呢,说明一下我的是10M宽带应该不是网速的问题吧。
python beautifulsoup python-爬虫
摸摸你的胸
10 years, 3 months ago