Python书籍采集程序 | LSABLOG

首页 » Program » Python » 正文

Python书籍采集程序

用 Python 写的一个采集豆瓣(douban)书籍信息的小爬虫,并用 Tk 做了个简单的 GUI。目前采集是单线程的,改成多线程速度会更快,可能会在 2.0 版加上多线程。

#coding:utf-8
#author:LSA
#date:20160730
#description:collect douban's book,single thread.

from ScrolledText import ScrolledText
from Tkinter import *
import urllib,re,threading


# Compiled once at import time instead of on every page fetch.
# Group 1 captures the text before the price unit, group 2 the link text.
_BOOK_RE = re.compile(r'<span class="price-tag ">(.*?)元</span>.*? \'read.douban.com\'\)">(.*?)</a>')


def get_book(ID):
    """Fetch one listing page and return the (price, title) pairs found on it.

    ID is the ``start`` offset appended to the listing URL; the page number
    shown in the status label is derived from it assuming 20 items per page.

    Returns the list produced by ``re.findall``: one 2-tuple of strings
    ``(price, title)`` per match, or an empty list when nothing matches.
    """
    # Floor division keeps the page number an integer on any Python version.
    varl.set('collecting %s page' % (ID // 20 + 1))
    html = urllib.urlopen('https://read.douban.com/tag/%E7%88%B1%E6%83%85/?cat=article&sort=top&start='+str(ID))
    try:
        bookmsg = html.read()
    finally:
        # Release the connection even if reading the body fails.
        html.close()
    return _BOOK_RE.findall(bookmsg)


def get_msg():
    """Collect every listing page and write the books plus totals to ``text``.

    Walks the start offsets 0, 20, ..., 380, appends one line per book to the
    text widget, then appends a summary with the number of books, the sum of
    their prices and the average price.
    """
    prices = []
    for start in range(0, 400, 20):
        for price, name in get_book(start):
            prices.append(float(price))
            text.insert(END,'book_name:%s      price:%s\n'%(name,price))

    count = len(prices)
    total = sum(prices)
    # No matches at all (e.g. the page markup changed) must not crash the
    # worker with ZeroDivisionError; report an average of 0 instead.
    average = total / count if count else 0.0

    text.insert(END,'===============statistical======================\n')
    text.insert(END,'book_sum_number: %s\n'%count)
    text.insert(END,'book_sum_price: %s\n'%total)
    text.insert(END,'book_average_price: %f\n'%average)

def startthread():
    """Run get_msg on a newly started thread and return immediately."""
    threading.Thread(target=get_msg).start()
    




    
# --- GUI setup: main window, output area, start button, status label ---
root = Tk()
root.title('douban_book_collection')
#root.geometry('+2100+300')

# Status text shown in the label at the bottom; get_book updates it.
varl = StringVar()
varl.set('ready')

# Widgets are gridded in creation order, so they stack top to bottom:
# scrolled output, start button, status label.
text = ScrolledText(root, font=('新宋体', 10))
text.grid()

b = Button(
    root,
    text='start',
    font=('新宋体', 10),
    command=startthread,
)
b.grid()

l = Label(root, textvariable=varl, fg='green')
l.grid()

# Hand control to Tk's event loop; returns when the window is closed.
root.mainloop()

Comment