用BS4将信息写入文件时遇到的问题


代码如下:

import requests
import codecs
from bs4 import BeautifulSoup

i = 0

fp1 = codecs.open('D:/Program Files/python/abcd11.txt', 'w', 'utf-8')

answer_soup = BeautifulSoup(open('D:/Program Files/python/abcd2.txt'))

author_tag = answer_soup.find_all("h3", class_ = "zm-item-answer-author-wrap")
print type(author_tag)
for s in author_tag:
fp1.write(s.text)
print s.text
print i
i = i + 1

fp1.close

运行结果如下:
图片描述

为什么都能print出来,但却不能写入到文件中呢?
打开文件的空白的。
我自己实在是不知道是为什么了,在另一段代码中是可行的的啊。(第一段的写入文件是正常的,第二段写入就是空白的了)

- -coding:utf-8- -

import requests
import codecs
import ConfigParser
import json
from bs4 import BeautifulSoup

global session
global cookies
i = 0

url = ' http://www.zhihu.com/question/20899988 '
fp = codecs.open('D:/Program Files/python/abcd.txt', 'w', 'utf-8')
fp1 = codecs.open('D:/Program Files/python/abcd1.txt', 'w', 'utf-8')
fp2 = codecs.open('D:/Program Files/python/abcd2.txt', 'w', 'utf-8')

cf = ConfigParser.ConfigParser()
cf.read("D:/Program Files/python/config.ini")

cookies = cf._sections["cookies"]

email = cf.get("info", "email")
password = cf.get("info", "password")

cookies = dict(cookies)
print cookies

s = requests.session()
login_data = {"email": email, "password": password}
header = {
'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
'Host': "www.zhihu.com",
'Referer': " http://www.zhihu.com/ ",
'X-Requested-With': "XMLHttpRequest"
}

r = s.post(' http://www.zhihu.com/login ', data = login_data, headers = header)
if r.json()["r"] == 1:
print "Login Failed, reason is:"
for m in r.json()["msg"]:
print r.json()["msg"][m]
print "Use cookies"
has_cookies = False
for key in cookies:
if key != ' name ' and cookies[key] != '':
has_cookies = True
break
if has_cookies == False:
raise ValueError("请填写config.ini文件中的cookies项.")
session = s
print r.text
soup = BeautifulSoup(r.content)

if session == None:
create_session()
s2 = session
has_cookies = False
for key in cookies:
if key != ' name ' and cookies[key] != '':
has_cookies = True
r2 = s2.get(url,cookies = cookies)
break
if has_cookies == False:
r2 = s2.get(url)
# print "aaaaaaaaaaaaaaaaaaaa"
soup2 = BeautifulSoup(r2.content)
author_tag = soup2.find_all("h3", class_ = "zm-item-answer-author-wrap")
for ss in author_tag:
fp.write(ss.text)
print ss.text
print type(ss.text)
i = i + 1

s3 = session
post_url = " http://www.zhihu.com/node/QuestionAnswerListV2 "
_xsrf = soup2.find("input", attrs = {'name': '_xsrf'})["value"]
i = 1
offset = i * 50
params = json.dumps({"url_token":int(url[-8:-1] + url[-1]), "pagesize":50, "offset": offset})
data = {
'_xsrf': _xsrf,
'method': "next",
'params': params
}
header = {
'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
'Host': "www.zhihu.com",
'Referer': url
}
has_cookies = False
for key in cookies:
if key != ' name ' and cookies[key] != '':
has_cookies = True
r3 = s3.post(post_url, data = data, headers = header, cookies = cookies)
break
if has_cookies == False:
r3 = s3.post(post_url, data = data, headers = header)

answer_list = r3.json()["msg"]

for j in range(7):
fp2.write(answer_list[j])
soup = BeautifulSoup(soup.encode("utf-8"))


 print j

answer_soup = BeautifulSoup(open('D:/Program Files/python/abcd2.txt'))

i = 0
author_tag = answer_soup.find_all("h3", class_ = "zm-item-answer-author-wrap")
for ss in author_tag:
fp1.write(ss.text)
print ss.text
print i
i = i + 1

fp.close
fp1.close
fp2.close

小白第一次提问,python爬虫也是感兴趣刚学……
有问题大家尽管批评,我改……

python python-爬虫 python2.7

Stefan 10 years, 5 months ago

Your Answer