# -*- coding:utf-8 -*-
"""Scrape the Toutiao (今日头条) news feed with ``urllib.request``.

From the article "python之urllib.request抓取今日头条" by Ma Yumin (马育民),
published 2018-07-23 00:00 (10242 views when captured).
Prerequisite reading: http://www.malaoshi.top/show_1EF2cBxMheSI.html
Original source: http://www.malaoshi.top/show_1EF1YeXZlCtz.html

``main`` downloads one page of the feed API, prints each entry's title and
group id, and calls ``getarticle`` for that entry.  ``getarticle`` downloads
the article page and prints the title and content found in the script tag
that mentions ``BASE_DATA``.
"""
import html
import json
import re
from urllib import request

import demjson
from bs4 import BeautifulSoup

url = 'https://www.toutiao.com/api/pc/feed/?max_behot_time=1532249666&category=__all__&utm_source=toutiao&widen=1&tadrequire=true&as=A1C59BD594A866B&cp=5B541806360B9E1&_signature=SQ1puxAZElvQD8mGi2rOw0kNaa'
articleurl = 'https://www.toutiao.com/group/%s'

# Browser-like User-Agent sent with every request.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
)

# Seconds to wait on a request before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# Matches a JavaScript ``.replace(/regex/flags, '...')`` call appended to a
# string literal.  The published snippet removed the literal text
# ``.replace(//ig, '')``, whose regex body appears to have been stripped when
# the article was extracted; this pattern matches that form as well as a call
# that still has a regex body.
_JS_REPLACE_CALL = re.compile(r"\.replace\(/.*?/[a-z]*,\s*'[^']*'\)")


def _fetch(target):
    """Return the raw response body (bytes) of a GET request to *target*."""
    req = request.Request(target, headers={'User-Agent': USER_AGENT})
    with request.urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
        return resp.read()


def main():
    """Print title and group id of every feed entry, then fetch each article.

    Raises:
        KeyError: if the response has no ``data`` list or an entry lacks
            ``title`` / ``group_id``.
        urllib.error.URLError: if a request fails or times out.
    """
    # Decode as UTF-8 and let the JSON parser resolve ``\uXXXX`` escapes.
    # Decoding with 'unicode_escape' would read the bytes as Latin-1 and turn
    # ``\"`` into a bare quote, producing invalid JSON.
    payload = _fetch(url).decode('utf-8')
    # strict=False tolerates raw control characters inside string values.
    feed = json.loads(payload, strict=False)
    for item in feed['data']:
        print(item['title'])
        print(item['group_id'])
        getarticle(articleurl % item['group_id'])


def getarticle(url):
    """Download the article page at *url* and print its title and content.

    Every ``<script>`` whose text contains ``BASE_DATA`` is parsed as a
    JavaScript object literal with ``demjson``; the ``articleInfo`` entry's
    ``title`` and HTML-unescaped ``content`` are printed.

    Raises:
        KeyError: if the parsed object has no ``articleInfo`` / ``title`` /
            ``content`` entry.
        urllib.error.URLError: if the request fails or times out.
    """
    print('article----', url)
    page = _fetch(url).decode('utf-8')
    soup = BeautifulSoup(page, 'html.parser')
    for script in soup.select("script"):
        txt = script.text
        if 'BASE_DATA' not in txt:
            continue
        # Keep the span from the first '{' to the last '}' so a trailing ';'
        # or trailing whitespace after the object does not end up in the
        # text handed to the parser.
        start = txt.find('{')
        end = txt.rfind('}')
        if start == -1 or end < start:
            continue
        # Drop JavaScript ``.replace(...)`` calls so only literals remain.
        jsonstr = _JS_REPLACE_CALL.sub('', txt[start:end + 1])
        j = demjson.decode(jsonstr)
        info = j['articleInfo']
        print('title:', info['title'])
        # The published snippet chained ``.replace`` calls that map each
        # character to itself (the entity names were evidently decoded during
        # extraction), so nothing was unescaped; html.unescape converts the
        # HTML character references in one pass.
        print('content:', html.unescape(info['content']))


if __name__ == '__main__':
    main()