首页 » Program » Python » 正文

Python 爬取糗事百科笑话

# coding:utf-8

import urllib2
import re
from bs4 import BeautifulSoup


number = 0   #定义序号变量
page = raw_input("Please input page:")
user_agent = 'User-Agent:Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'
headers = { 'User-Agent' : user_agent }
request = urllib2.Request('http://www.qiushibaike.com/hot/page/'+str(page),headers = headers)
response = urllib2.urlopen(request)
html = response.read()   #获取页面源码
soup = BeautifulSoup(html,"html.parser")
items = soup.find_all('div',attrs={"class":"content"})   #搜索div标签并且同时含有class=content内容

for item in items:
    number = number + 1
    pattern = re.compile('<span>(.*?)</span>')
    lists = re.findall(pattern,str(item))
    print 'NO',number,':',lists[0].decode('utf-8'),'\n'
    
print "Over......"

Comment