Tag:bs4

Results for tag "bs4":

Python 爬取糗事百科笑话

# coding:utf-8
"""Print the jokes found on one page of Qiushibaike's "hot" listing.

The script prompts for a page number, downloads that page, and prints the
content of the first ``<span>`` inside every ``<div class="content">``
element, numbered in page order.
"""
import contextlib
import re

try:  # Python 2
    import urllib2 as urllib_request
    read_input = raw_input
except ImportError:  # Python 3
    import urllib.request as urllib_request
    read_input = input

BASE_URL = 'http://www.qiushibaike.com/hot/page/'

# Only the header *value* belongs here; the header name is supplied as the
# dictionary key when the request is built.
USER_AGENT = 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'

# Compiled once at import time. DOTALL lets a joke span several lines.
SPAN_PATTERN = re.compile(r'<span>(.*?)</span>', re.DOTALL)


def build_page_url(page):
    """Return the listing URL for *page* (a number or a numeric string)."""
    return BASE_URL + str(page).strip()


def extract_joke(item_html):
    """Return the content of the first ``<span>`` in *item_html*.

    *item_html* is the markup of one content ``<div>``, as text or as
    UTF-8 encoded bytes. Surrounding whitespace is stripped from the result.
    Returns ``None`` when the markup contains no ``<span>...</span>`` pair.
    """
    if isinstance(item_html, bytes):
        item_html = item_html.decode('utf-8')
    match = SPAN_PATTERN.search(item_html)
    if match is None:
        return None
    return match.group(1).strip()


def fetch_page(page):
    """Download the listing page *page* and return its raw source."""
    request = urllib_request.Request(
        build_page_url(page),
        headers={'User-Agent': USER_AGENT},
    )
    # closing() releases the connection on both Python 2 and Python 3.
    with contextlib.closing(urllib_request.urlopen(request)) as response:
        return response.read()  # page source


def main():
    """Prompt for a page number and print every joke found on that page."""
    # Imported here because bs4 is a third-party package; the helpers above
    # remain importable without it.
    from bs4 import BeautifulSoup

    page = read_input("Please input page:")
    soup = BeautifulSoup(fetch_page(page), "html.parser")
    # Select the <div> tags that carry class="content".
    items = soup.find_all('div', attrs={"class": "content"})

    extracted = (extract_joke(str(item)) for item in items)
    # Items without a <span> are skipped rather than aborting the run.
    jokes = [joke for joke in extracted if joke is not None]

    # Running sequence number, starting at 1.
    for number, joke in enumerate(jokes, 1):
        print('NO %d : %s \n' % (number, joke))
    print("Over……")


if __name__ == '__main__':
    main()