使用了知乎的一个旧API:该接口目前已被知乎修改,但仍能使用。
目前使用3个线程;token是问题URL的后缀。
效率不是很高。

#encoding=utf-8
import requests
import urllib2
from bs4 import BeautifulSoup
import urllib
import threading
# Lock shared by all download threads; inputPage holds it while it writes an image file.
mutex = threading.Lock()
# Running counter used by inputPage to name output files (./test/<Num>.jpg).
Num=0
def getMsg(token, x, timeout=30):
    """Fetch the HTML of one answer entry of a Zhihu question.

    POSTs to the QuestionAnswerListV2 endpoint and returns the first
    element of the ``msg`` list found in the JSON reply.

    Args:
        token: Question id (the suffix of the question URL), sent as
            ``url_token``.
        x: Value sent as ``offset`` in the request parameters.
        timeout: Seconds to wait for the server.  Without a timeout a
            stalled connection would block the calling thread forever;
            with one, requests raises a timeout error instead.

    Returns:
        The first element of ``msg``, or ``"<html></html>"`` when ``msg``
        is empty.
    """
    import json

    url = "https://www.zhihu.com/node/QuestionAnswerListV2"
    payload = {
        'method': 'next',
        # Change url_token to target a different Zhihu question.
        'params': '{"url_token":' + str(token) + ',"pagesize":0,"offset":' + str(x) + '}',
    }
    # NOTE(review): the referer is pinned to one fixed question regardless
    # of `token` -- confirm whether the server cares.
    headers = {
        'accept': "*/*",
        'accept-language': "zh-CN,zh;q=0.8,zh-TW;q=0.6",
        'content-type': "application/x-www-form-urlencoded",
        'host': "www.zhihu.com",
        'origin': "https://www.zhihu.com",
        'referer': "https://www.zhihu.com/question/41155042",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
        'x-requested-with': "XMLHttpRequest",
        'cache-control': "no-cache",
    }
    post_data = urllib.urlencode(payload)
    response = requests.request(
        "POST", url, data=post_data, headers=headers, timeout=timeout
    )
    # Keep the parsed reply under its own name so the json module is not
    # shadowed inside this function.
    reply = json.loads(response.text)
    msg = reply['msg']
    if not msg:
        return "<html></html>"
    return msg[0]
def inputPage(token, page):
    """Download every image referenced by one answer entry into ./test/.

    The HTML returned by ``getMsg(token, page)`` is parsed, and for each
    ``<img>`` nested in a ``<noscript>`` element the URL stored in its
    ``data-original`` attribute is fetched and saved as
    ``./test/<Num>.jpg``, where ``Num`` is the module-level counter.

    Args:
        token: Question id forwarded to getMsg.
        page: Offset forwarded to getMsg.
    """
    global Num
    msg = getMsg(token, page)
    soup = BeautifulSoup(msg, 'html.parser')
    for noscript in soup.find_all('noscript'):
        for img in noscript.find_all('img'):
            # Skip images whose source URL is empty or absent.
            img_url = img.get('data-original')
            if not img_url:
                continue
            print(img_url)
            # A timeout keeps a stalled download from blocking this
            # thread forever.
            req = urllib2.urlopen(img_url, timeout=30)
            try:
                content = req.read()
            finally:
                req.close()
            # Pick the file name, write the file and advance the counter
            # under one lock, so two threads can never write to the same
            # file.  The `with` forms release the lock and close the file
            # even when the write raises.
            with mutex:
                index = Num
                # Image data is binary; 'wb' also avoids appending to a
                # file left over from a previous run.
                with open("./test/" + str(index) + '.jpg', 'wb') as f:
                    f.write(content)
                Num = index + 1
            print(index)
class myThread(threading.Thread):
    """Worker thread that downloads a run of consecutive pages.

    Pages are processed starting at ``star`` and stopping before ``end``.
    """

    def __init__(self, token, star, end):
        """Store the question token and the page range to process."""
        super(myThread, self).__init__()
        self.token, self.star, self.end = token, star, end

    def run(self):
        """Download pages one at a time, advancing ``self.star`` as it goes."""
        while self.end > self.star:
            inputPage(self.token, self.star)
            self.star += 1
            # The progress line reports the already-advanced counter.
            finished = str(self.star) + '页面下载完成'
            print(finished)
# Question id: the suffix of the Zhihu question URL.
token = 23252018
# myThread.run() stops before `end`, so every range is half-open.
# Consecutive ranges share their boundary value so that offsets 30 and 50
# are downloaded instead of being skipped.
thread1 = myThread(token, 0, 30)
thread2 = myThread(token, 30, 50)
thread3 = myThread(token, 50, 60)
thread1.start()
thread2.start()
thread3.start()