用BS4将信息写入文件时遇到的问题
代码如下:
import requests
import codecs
from bs4 import BeautifulSoup
i = 0
fp1 = codecs.open('D:/Program Files/python/abcd11.txt', 'w', 'utf-8')
answer_soup = BeautifulSoup(open('D:/Program Files/python/abcd2.txt'))
author_tag = answer_soup.find_all("h3", class_ = "zm-item-answer-author-wrap")
print type(author_tag)
for s in author_tag:
fp1.write(s.text)
print s.text
print i
i = i + 1
fp1.close
运行结果如下:
为什么都能print出来,但却不能写入到文件中呢?
打开文件是空白的。
我自己实在是不知道是为什么了,在另一段代码中是可行的啊。(第一段的写入文件是正常的,第二段写入就是空白的了)
- -coding:utf-8- -
import requests
import codecs
import ConfigParser
import json
from bs4 import BeautifulSoup
global session
global cookies
i = 0
url = '
http://www.zhihu.com/question/20899988
'
fp = codecs.open('D:/Program Files/python/abcd.txt', 'w', 'utf-8')
fp1 = codecs.open('D:/Program Files/python/abcd1.txt', 'w', 'utf-8')
fp2 = codecs.open('D:/Program Files/python/abcd2.txt', 'w', 'utf-8')
cf = ConfigParser.ConfigParser()
cf.read("D:/Program Files/python/config.ini")
cookies = cf._sections["cookies"]
email = cf.get("info", "email")
password = cf.get("info", "password")
cookies = dict(cookies)
print cookies
s = requests.session()
login_data = {"email": email, "password": password}
header = {
'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
'Host': "www.zhihu.com",
'Referer': "
http://www.zhihu.com/
",
'X-Requested-With': "XMLHttpRequest"
}
r = s.post('
http://www.zhihu.com/login
', data = login_data, headers = header)
if r.json()["r"] == 1:
print "Login Failed, reason is:"
for m in r.json()["msg"]:
print r.json()["msg"][m]
print "Use cookies"
has_cookies = False
for key in cookies:
if key != '
name
' and cookies[key] != '':
has_cookies = True
break
if has_cookies == False:
raise ValueError("请填写config.ini文件中的cookies项.")
session = s
print r.text
soup = BeautifulSoup(r.content)
if session == None:
create_session()
s2 = session
has_cookies = False
for key in cookies:
if key != '
name
' and cookies[key] != '':
has_cookies = True
r2 = s2.get(url,cookies = cookies)
break
if has_cookies == False:
r2 = s2.get(url)
# print "aaaaaaaaaaaaaaaaaaaa"
soup2 = BeautifulSoup(r2.content)
author_tag = soup2.find_all("h3", class_ = "zm-item-answer-author-wrap")
for ss in author_tag:
fp.write(ss.text)
print ss.text
print type(ss.text)
i = i + 1
s3 = session
post_url = "
http://www.zhihu.com/node/QuestionAnswerListV2
"
_xsrf = soup2.find("input", attrs = {'name': '_xsrf'})["value"]
i = 1
offset = i * 50
params = json.dumps({"url_token":int(url[-8:-1] + url[-1]), "pagesize":50, "offset": offset})
data = {
'_xsrf': _xsrf,
'method': "next",
'params': params
}
header = {
'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
'Host': "www.zhihu.com",
'Referer': url
}
has_cookies = False
for key in cookies:
if key != '
name
' and cookies[key] != '':
has_cookies = True
r3 = s3.post(post_url, data = data, headers = header, cookies = cookies)
break
if has_cookies == False:
r3 = s3.post(post_url, data = data, headers = header)
answer_list = r3.json()["msg"]
for j in range(7):
fp2.write(answer_list[j])
soup = BeautifulSoup(soup.encode("utf-8"))
print j
answer_soup = BeautifulSoup(open('D:/Program Files/python/abcd2.txt'))
i = 0
author_tag = answer_soup.find_all("h3", class_ = "zm-item-answer-author-wrap")
for ss in author_tag:
fp1.write(ss.text)
print ss.text
print i
i = i + 1
fp.close
fp1.close
fp2.close
小白第一次提问,python爬虫也是感兴趣刚学……
有问题大家尽管批评,我改……