python之urllib.request抓取今日头条-马育民老师

需了解：http://www.malaoshi.top/show_1EF2cBxMheSI.html

```python
# -*- coding:utf-8 -*-
from urllib import request
import json
import demjson
from bs4 import BeautifulSoup

# Feed endpoint requested by main(); every query parameter is hard-coded.
url = (
    'https://www.toutiao.com/api/pc/feed/'
    '?max_behot_time=1532249666'
    '&category=__all__'
    '&utm_source=toutiao'
    '&widen=1'
    '&tadrequire=true'
    '&as=A1C59BD594A866B'
    '&cp=5B541806360B9E1'
    '&_signature=SQ1puxAZElvQD8mGi2rOw0kNaa'
)
# Article page template; the %s placeholder is filled with an item's group_id.
articleurl = 'https://www.toutiao.com/group/%s'
def main():
    """Fetch the feed and print each entry, then fetch its article page.

    Requests the module-level ``url`` with a browser User-Agent header,
    parses the response body as JSON and, for every element of its
    ``data`` list, prints the ``title`` and ``group_id`` values and calls
    ``getarticle`` with the URL built from ``articleurl``.

    Raises:
        urllib.error.URLError: if the request fails.
        json.JSONDecodeError: if the response body is not valid JSON.
        KeyError: if ``data``, ``title`` or ``group_id`` is missing.
    """
    req = request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36')

    # The request has to be opened inside the function: ``req`` is a local,
    # so opening it at module level fails with NameError at import time.
    with request.urlopen(req) as f:
        data = f.read()

    # Decode the bytes as UTF-8 (assumed response encoding - confirm) and let
    # json.loads resolve the \uXXXX escapes. Decoding with 'unicode_escape'
    # reads the bytes as Latin-1 and also unescapes \" inside strings, which
    # corrupts non-ASCII text and can break the JSON syntax.
    feed = json.loads(data.decode('utf-8'), strict=False)
    for item in feed['data']:
        print(item['title'])
        print(item['group_id'])
        getarticle(articleurl % item['group_id'])

def getarticle(url):
    """Download one article page and print its title and content.

    The page is fetched with a browser User-Agent header and parsed with
    BeautifulSoup. For every ``<script>`` element whose text contains
    ``BASE_DATA``, the object literal in that text is decoded with demjson
    and the ``articleInfo`` title and content are printed.

    Args:
        url: Address of the article page to download.

    Raises:
        urllib.error.URLError: if the request fails.
        UnicodeDecodeError: if the page is not valid UTF-8.
    """
    # Function-local import: ``html`` is not among the file's imports.
    from html import unescape

    print('article----', url)
    req = request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36')

    # The request has to be opened inside the function: ``req`` is a local,
    # so opening it at module level fails with NameError at import time.
    with request.urlopen(req) as f:
        page = f.read().decode('utf-8')

    soup = BeautifulSoup(page, 'html.parser')
    for script in soup.select("script"):
        txt = script.text
        if 'BASE_DATA' not in txt:
            continue
        # Keep everything from the first '{' and drop the final character
        # (presumably a trailing ';' - confirm against a live page).
        jsonstr = txt[txt.find('{'):-1]
        # Strip the JavaScript call that follows the content string so that
        # what remains is a plain object literal demjson can decode.
        jsonstr = jsonstr.replace(".replace(/<br \/>/ig, '')", "")
        j = demjson.decode(jsonstr)
        print('title:', j['articleInfo']['title'])
        # The content is entity-escaped HTML; unescape() restores '<', '>',
        # '"', '=' and every other character reference in one pass.
        print('content:', unescape(j['articleInfo']['content']))

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
```

原文出处：http://www.malaoshi.top/show_1EF1YeXZlCtz.html