Tag:信息采集

Tag (信息采集)'s result:

Python书籍采集程序

#coding:utf-8
# A simple program that collects books from douban with Python: a small
# crawler with a basic Tk GUI. The crawl is single-threaded; multithreading
# would be faster and may be added in version 2.0.
#author:LSA
#date:20160730
#description:collect douban's book,single thread.
# NOTE: targets Python 2 (ScrolledText / Tkinter module names, urllib.urlopen).
from ScrolledText import ScrolledText
from Tkinter import *
import urllib
import re
import threading

# Listing URL; the result offset is appended to the end.
_LIST_URL = 'https://read.douban.com/tag/%E7%88%B1%E6%83%85/?cat=article&sort=top&start='

# Two capture groups: the text before "元" inside the price-tag span, then the
# text of the following link.  Compiled once instead of on every page fetch.
_BOOK_RE = re.compile(
    r'<span class="price-tag ">(.*?)元</span>.*? \'read.douban.com\'\)">(.*?)</a>')

_PAGE_SIZE = 20     # offset step between consecutive listing pages
_START_LIMIT = 400  # crawl offsets 0, 20, ..., 380


def get_book(ID):
    """Fetch one listing page and return its (price, title) matches.

    ID is the result offset appended to the listing URL.  The status label
    is updated with the 1-based page number before the request is made.

    Returns the list of 2-tuples produced by the book regex.
    """
    varl.set('collecting %s page' % str((ID // _PAGE_SIZE) + 1))
    html = urllib.urlopen(_LIST_URL + str(ID))
    try:
        bookmsg = html.read()
    finally:
        # Release the connection even if reading fails.
        html.close()
    return _BOOK_RE.findall(bookmsg)


def get_msg():
    """Crawl every listing page and write the books plus totals to the text box.

    NOTE(review): this runs on the worker thread started by startthread() and
    touches Tk widgets directly; Tkinter calls from a non-main thread may be
    unsafe -- confirm.
    """
    prices = []
    for start in range(0, _START_LIMIT, _PAGE_SIZE):
        for price, title in get_book(start):
            prices.append(float(price))
            text.insert(END, 'book_name:%s price:%s\n' % (title, price))

    count = len(prices)
    total = sum(prices)
    # Guard against ZeroDivisionError when no page produced a match.
    average = total / count if count else 0.0

    text.insert(END, '===============statistical======================\n')
    text.insert(END, 'book_sum_number: %s\n' % count)
    text.insert(END, 'book_sum_price: %s\n' % total)
    text.insert(END, 'book_average_price: %f\n' % average)


def startthread():
    """Run get_msg on a background thread so the button callback returns at once."""
    t = threading.Thread(target=get_msg)
    t.start()


root = Tk()
root.title('douban_book_collection')
# Optional fixed window position, left disabled:
#root.geometry('+2100+300')
text = ScrolledText(root, font=('新宋体', 10))
text.grid()
b = Button(root, text='start', font=('新宋体', 10), command=startthread)
b.grid()
varl = StringVar()  # backs the status label; updated by get_book
l = Label(root, textvariable=varl, fg='green')
l.grid()
varl.set('ready')
root.mainloop()