Tag: crawler

Posts tagged "crawler":

Collecting Baidu URLs with Python

I recently wrote a Python script that collects URLs from Baidu search results: each thread crawls all the URLs on one results page at a time. It uses multithreading, a queue, and bs4, and bs4 turns out to be quite powerful.

Features: collect Baidu result URLs, with a configurable page count, thread count, keyword, and output file name. (Note: a thread count less than or equal to the page count is enough; because a queue is used, a thread count larger than the page count behaves the same as one equal to it.)

Source code:

#coding: utf-8
import requests, re, threading, time
from bs4 import BeautifulSoup as bs
from Queue import Queue
from argparse import ArgumentParser

arg = ArgumentParser(description='baidu_url_collection')
arg.add_argument('keyword', help='inurl:.asp?id=1')
arg.add_argument('-p', '--page', help='page count', dest='pagecount', type=int)
arg.add_argument('-t', '--thread', help='the thread_count', dest='thread_count', type=int, default=10)
arg.add_argument('-o', '--outfile', help='the file save result', dest='outfile', default='result.txt')
result = arg.parse_args()

headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)'}

class Bd_url(threading.Thread):
    def __init__(self, que):
        threading.Thread.__init__(self)
        self._que = que

    def run(self):
        # keep pulling result-page URLs until the queue is empty
        while not self._que.empty():
            URL = self._que.get()
            try:
                self.bd_url_collect(URL)
            except Exception, e:
                print e
                pass

    def bd_url_collect(self, url):
        # parse one Baidu results page and follow every result link
        r = requests.get(url, headers=headers, timeout=5)
        soup = bs(r.content, 'lxml', from_encoding='utf-8')
        bqs = soup.find_all(name='a', attrs={'data-click': re.compile(r'.'), 'class': None})
        for bq in bqs:
            r = requests.get(bq['href'], headers=headers, timeout=5)
            if r.status_code == 200:
                print r.url
                with open(result.outfile, 'a') as f:
                    f.write(r.url + '\n')

def main():
    thread = []
    thread_count = result.thread_count
    que = Queue()
    for i in range(0, (result.pagecount)):
        que.put('https://www.baidu.com/s?wd=' + result.keyword + '&pn=' + str(i))
    for i in range(thread_count):
        thread.append(Bd_url(que))
    for i in thread:
        i.start()
    for i in thread:
        i.join()

if __name__ == '__main__':
    start = time.clock()……
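The listing above is Python 2 (Queue module, print statements). Purely as a point of reference, here is a minimal Python 3 sketch of the same queue-plus-worker-threads pattern; the Baidu URL format and the link-selection rule are carried over from the original script and are not verified against current Baidu markup.

# coding: utf-8
# Minimal Python 3 sketch of the queue + worker-thread pattern used above.
# The result-page URL format and the a[data-click] selection are assumptions
# copied from the original Python 2 script.
import threading
from queue import Queue

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def worker(que, outfile):
    # each thread keeps pulling result-page URLs until the queue is empty
    while not que.empty():
        page_url = que.get()
        try:
            r = requests.get(page_url, headers=HEADERS, timeout=5)
            soup = BeautifulSoup(r.content, 'html.parser')
            for a in soup.find_all('a', attrs={'data-click': True, 'class': None}):
                resp = requests.get(a['href'], headers=HEADERS, timeout=5)
                if resp.status_code == 200:
                    with open(outfile, 'a') as f:
                        f.write(resp.url + '\n')
        except Exception as e:
            print(e)
        finally:
            que.task_done()

def main(keyword, pagecount, thread_count, outfile):
    que = Queue()
    for i in range(pagecount):
        que.put('https://www.baidu.com/s?wd=' + keyword + '&pn=' + str(i))
    threads = [threading.Thread(target=worker, args=(que, outfile))
               for _ in range(thread_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    main('inurl:.asp?id=1', pagecount=5, thread_count=5, outfile='result.txt')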

A Python proxy-scraping script

I recently wrote a Python script that scrapes proxies, borrowing some ideas from other people's code; having a large pool of proxies at hand is convenient. This is v1.0. It uses multithreading (one thread scrapes the proxies on one page), and it was also a chance to get more familiar with queues and bs4; bs4 really is powerful and saves a lot of work. There are still many rough edges, and I will keep improving the script when I have time.

Features: scrape proxies from www.xicidaili.com, verify each one against 1212.ip138.com/ic.asp, and write the working proxies to useful_proxies.txt.

Code:

#coding:utf-8
import requests
from bs4 import BeautifulSoup as bs
import re
import Queue
import threading
import time
import optparse

url = 'http://www.xicidaili.com/nn/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko'}

class Proxy_collection(threading.Thread):  # subclass Thread to get multithreading
    def __init__(self, que):
        threading.Thread.__init__(self)  # reuse the parent Thread's __init__()
        self._que = que

    def run(self):
        while not self._que.empty():
            url = self._que.get()
            r = requests.get(url, headers=headers, timeout=5)
            soup = bs(r.content, 'lxml', from_encoding='utf-8')
            bqs = soup.find_all(name='tr', attrs={'class': re.compile(r'|[^odd]')})
            for bq in bqs:
                us = bq.find_all(name='td')
                try:
                    # pass protocol, ip and port on for verification
                    self.proxies_confirm(str(us[5].string), str(us[1].string), str(us[2].string))
                except Exception, e:
                    #print e
                    pass

    def proxies_confirm(self, type_self, ip, port):
        ip_dic = {}
        ip_dic[type_self.lower()] = ip + ':' + port
        r = requests.get('http://1212.ip138.com/ic.asp', headers=headers, proxies=ip_dic, timeout=5)
        result = re.findall(r'\d+\.\d+\.\d+\.\d+', r.content)
        result_ip = ''.join(result)  # join the matches into one string
        if ip == result_ip:
            print type_self + '---' + ip + ':' + port + ' is useful!!!\n'
            with open('useful_proxies.txt', 'a') as f:
                f.write(type_self.lower() + '---' + ip + ':' + port + '\n')

if __name__ == '__main__':
    thread = []
    que =……
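The core of the script is the verification step: fetch an IP-echo page through the candidate proxy and keep the proxy only if the echoed address matches. A minimal Python 3 sketch of that check is given below, assuming requests is installed; the echo URL is the one used in the post, and the example proxy values are hypothetical.

# coding: utf-8
# Minimal Python 3 sketch of the proxy check described above: request an
# IP-echo page through the candidate proxy and accept the proxy only if the
# page reports the proxy's own IP. The echo URL is taken from the post.
import re
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def proxy_is_useful(scheme, ip, port):
    proxies = {scheme.lower(): ip + ':' + port}
    try:
        r = requests.get('http://1212.ip138.com/ic.asp',
                         headers=HEADERS, proxies=proxies, timeout=5)
    except requests.RequestException:
        return False
    echoed = re.findall(r'\d+\.\d+\.\d+\.\d+', r.text)
    # working proxy: the echo page reports the proxy's IP, not ours
    return ip in echoed

if __name__ == '__main__':
    # hypothetical example values; real entries come from the scraped list
    if proxy_is_useful('HTTP', '1.2.3.4', '8080'):
        with open('useful_proxies.txt', 'a') as f:
            f.write('http---1.2.3.4:8080\n')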

Collecting animal pictures with Python

I used Python to collect some animal pictures; this is a small crawler. Python really is very useful!

#coding:utf-8
import urllib, re

def gethtml(url):
    page = urllib.urlopen(url)
    html = page.read()
    return html

x = 0

def getimg(html):
    # find every <img src="..."> on the page and download it
    imgre = re.compile(r'<img src="(.*?)" ')
    imglist = re.findall(imgre, html)
    for imgurl in imglist:
        print imgurl
        global x
        urllib.urlretrieve(imgurl, 'E:\\python_pic_vid_etc\get_animal_pic\%s.jpg' % x)
        print 'downloading %s' % x
        x += 1

for i in range(1, 10):
    url = 'http://www.ivsky.com/tupian/dongwutupian/index_%s.html' % i
    html = gethtml(url)
    getimg(html)
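The same idea in Python 3, parsing the img tags with bs4 instead of a regex, might look roughly like the sketch below; the listing-page URL pattern is taken from the post, while the output directory name and the use of requests are my own assumptions.

# coding: utf-8
# Rough Python 3 equivalent of the image crawler above: fetch each listing
# page, pull the src of every <img> tag with bs4, and save the images to disk.
# 'animal_pics' is a placeholder output directory.
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

os.makedirs('animal_pics', exist_ok=True)
count = 0
for i in range(1, 10):
    url = 'http://www.ivsky.com/tupian/dongwutupian/index_%s.html' % i
    html = requests.get(url, timeout=5).text
    soup = BeautifulSoup(html, 'html.parser')
    for img in soup.find_all('img', src=True):
        src = urljoin(url, img['src'])  # handle relative image paths
        print('downloading %s: %s' % (count, src))
        data = requests.get(src, timeout=5).content
        with open(os.path.join('animal_pics', '%s.jpg' % count), 'wb') as f:
            f.write(data)
        count += 1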

A Python book-collection program

A simple Python program that collects book listings from douban: a small crawler with a basic Tk GUI. It is single-threaded; multithreading would make it faster and may be added in version 2.0 (a possible shape for that is sketched after the listing).

#coding:utf-8
#author:LSA
#date:20160730
#description: collect douban's books, single thread.
from ScrolledText import ScrolledText
from Tkinter import *
import urllib, re, threading

def get_book(ID):
    varl.set('collecting %s page' % str((ID/20)+1))
    html = urllib.urlopen('https://read.douban.com/tag/%E7%88%B1%E6%83%85/?cat=article&sort=top&start=' + str(ID))
    bookmsg = html.read()
    reg = r'<span class="price-tag ">(.*?)元</span>.*? \'read.douban.com\'\)">(.*?)</a>'
    reg = re.compile(reg)
    h = re.findall(reg, bookmsg)
    return h

def get_msg():
    ID = 0
    p = []
    x = 0
    while ID < 400:
        List = get_book(ID)
        ID += 20
        for i in List:
            x = x + 1
            p.append(float(i[0]))
            text.insert(END, 'book_name:%s price:%s\n' % (i[1], i[0]))
    text.insert(END, '===============statistical======================\n')
    text.insert(END, 'book_sum_number: %s\n' % x)
    text.insert(END, 'book_sum_price: %s\n' % sum(p))
    text.insert(END, 'book_average_price: %f\n' % (sum(p)/x))

def startthread():
    # run the scrape in a background thread so the Tk mainloop stays responsive
    t = threading.Thread(target=get_msg)
    t.start()

root = Tk()
root.title('douban_book_collection')
#root.geometry('+2100+300')
text = ScrolledText(root, font=('新宋体', 10))
text.grid()
b = Button(root, text='start', font=('新宋体', 10), command=startthread)
b.grid()
varl = StringVar()
l = Label(root, textvariable=varl, fg='green')
l.grid()
varl.set('ready')
root.mainloop()
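On the multithreading mentioned for v2.0: one way the page fetches could be parallelized is with a thread pool, sketched below in Python 3 purely as an illustration, not as the author's planned design. The regex and URL are copied from the listing above and may no longer match the live douban page; the worker-count and decoding choices are assumptions.

# coding: utf-8
# Illustration only: fetching the douban tag pages in parallel with a thread
# pool, as one possible shape for the multithreaded v2.0 mentioned above.
# The URL and regex are copied from the original single-threaded version.
import re
import urllib.request
from concurrent.futures import ThreadPoolExecutor

BOOK_RE = re.compile(r'<span class="price-tag ">(.*?)元</span>.*?'
                     r' \'read.douban.com\'\)">(.*?)</a>', re.S)

def get_book(start):
    # fetch one page of the tag listing and return (price, title) tuples
    url = ('https://read.douban.com/tag/%E7%88%B1%E6%83%85/'
           '?cat=article&sort=top&start=' + str(start))
    html = urllib.request.urlopen(url).read().decode('utf-8')
    return BOOK_RE.findall(html)

if __name__ == '__main__':
    offsets = range(0, 400, 20)  # 20 books per page, same as the original loop
    with ThreadPoolExecutor(max_workers=5) as pool:
        pages = pool.map(get_book, offsets)
    books = [book for page in pages for book in page]
    if books:
        prices = [float(price) for price, title in books]
        print('book_sum_number:', len(books))
        print('book_sum_price:', sum(prices))
        print('book_average_price:', sum(prices) / len(prices))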

Scraping Qiushibaike jokes with Python

# coding:utf-8
import urllib2
import re
from bs4 import BeautifulSoup

number = 0  # running counter for the jokes
page = raw_input("Please input page:")
user_agent = 'User-Agent:Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'
headers = {'User-Agent': user_agent}
request = urllib2.Request('http://www.qiushibaike.com/hot/page/' + str(page), headers=headers)
response = urllib2.urlopen(request)
html = response.read()  # fetch the page source
soup = BeautifulSoup(html, "html.parser")
items = soup.find_all('div', attrs={"class": "content"})  # find the div tags with class=content
for item in items:
    number = number + 1
    pattern = re.compile('<span>(.*?)</span>')
    lists = re.findall(pattern, str(item))
    print 'NO', number, ':', lists[0].decode('utf-8'), '\n'
print "Over……"
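Since the page is already parsed with BeautifulSoup, the joke text can also be pulled out without the extra span regex. A small Python 3 variant is sketched below; it assumes the page still marks jokes with div class="content" and swaps urllib2 for requests.

# coding: utf-8
# Variant of the extraction step above without the regex: let BeautifulSoup
# return the text of each content block directly. Python 3 / requests version;
# assumes the page still uses <div class="content"> for each joke.
import requests
from bs4 import BeautifulSoup

page = input("Please input page:")
headers = {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}
html = requests.get('http://www.qiushibaike.com/hot/page/' + str(page),
                    headers=headers, timeout=5).text
soup = BeautifulSoup(html, 'html.parser')
for number, item in enumerate(soup.find_all('div', attrs={'class': 'content'}), start=1):
    print('NO', number, ':', item.get_text(strip=True), '\n')
print('Over......')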