#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import urllib3

if __name__ == '__main__':
    # Fetch a page with urllib3; replace 'IP' with the real target URL.
    # (Original paste lost the indentation under `if __name__`, which is a
    # SyntaxError, so it is restored here.)
    http = urllib3.PoolManager()
    r = http.request('GET', 'IP')
    # The response body is raw bytes; the original called decode(gbk) with an
    # undefined name `gbk` — the codec must be passed as the string 'gbk'.
    print(r.data.decode('gbk'))
可以正常抓取。需要安装 urllib3,Python 版本 3.4.3
如何用Python爬虫抓取网页内容?
首先,你要安装requests和BeautifulSoup4,然后执行如下代码.
import requests
from bs4 import BeautifulSoup

# Scrape a Sina-style news article: title, source line, original title,
# body paragraphs, and responsible editor.
# (The pasted original had all spaces stripped — `importrequests`,
# `forparagraph inraw_content`, etc. — none of it parsed; restored here.)

# Target article URL — must be filled in before running.
iurl = ''
res = requests.get(iurl)
# Force UTF-8 so Chinese text decodes correctly regardless of the
# server-advertised charset.
res.encoding = 'utf-8'
# print(len(res.text))
soup = BeautifulSoup(res.text, 'html.parser')
# Article title
H1 = soup.select('#artibodyTitle')[0].text
# Publication time / source line
time_source = soup.select('.time-source')[0].text
# Source (first paragraph of the article body)
origin = soup.select('#artibody p')[0].text.strip()
# Original title (second paragraph)
oriTitle = soup.select('#artibody p')[1].text.strip()
# Body content: paragraphs 3..18 of the article body
raw_content = soup.select('#artibody p')[2:19]
content = [paragraph.text.strip() for paragraph in raw_content]
# The original discarded the join result; keep it in a variable so the
# assembled article text is actually usable.
article_text = '@'.join(content)
# Responsible editor
ae = soup.select('.article-editor')[0].text
这样就可以了
- 上一篇: python爬虫怎么获取动态的网页源码
- 下一篇: 返回列表