I recently wrote a Python script that scrapes proxies, borrowing ideas from other people's code; having a large pool of proxies makes a lot of things easier. This is version 1.0. It uses multithreading (one thread per page of proxies), and writing it was also a chance to get more familiar with queues and bs4 — bs4 really is powerful and makes the parsing much more convenient. There are still plenty of rough edges, and I will keep improving the script when I have time.
Functionality: scrape proxies from www.xicidaili.com, verify each one's usability against 1212.ip138.com/ic.asp, and finally write the working proxies to useful_proxies.txt.
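The core check is straightforward: fetch the IP-echo page through the candidate proxy and compare the IP it reports with the proxy's own IP. Below is a minimal standalone sketch of just that idea; the function name check_proxy and the placeholder proxy values are mine, not part of the script further down.

#coding:utf-8
import re
import requests

def check_proxy(protocol, ip, port):
    # Return True if the IP echoed by ic.asp matches the proxy's IP
    proxies = {protocol.lower(): ip + ':' + port}
    try:
        r = requests.get('http://1212.ip138.com/ic.asp',
                         proxies=proxies, timeout=5)
    except requests.RequestException:
        return False  # dead proxy or timeout
    echoed = re.findall(r'\d+\.\d+\.\d+\.\d+', r.content)
    return ip in echoed

# example call (placeholder values):
# print(check_proxy('HTTP', '1.2.3.4', '8080'))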
Code:
#coding:utf-8
import requests
from bs4 import BeautifulSoup as bs
import re
import Queue
import threading
import time
import optparse

url = 'http://www.xicidaili.com/nn/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko'}


class Proxy_collection(threading.Thread):  # inherit from Thread for multithreading
    def __init__(self, que):
        threading.Thread.__init__(self)  # reuse the parent Thread's __init__()
        self._que = que  # shared queue of page URLs

    def run(self):
        while not self._que.empty():
            url = self._que.get()
            r = requests.get(url, headers=headers, timeout=5)
            soup = bs(r.content, 'lxml', from_encoding='utf-8')
            # the proxy table rows alternate between class="odd" and class="";
            # this loose regex matches both kinds of rows
            bqs = soup.find_all(name='tr', attrs={'class': re.compile(r'|[^odd]')})
            for bq in bqs:
                us = bq.find_all(name='td')
                try:
                    # pass protocol, IP and port to the verifier
                    self.proxies_confirm(str(us[5].string), str(us[1].string), str(us[2].string))
                except Exception as e:
                    # print e
                    pass

    def proxies_confirm(self, type_self, ip, port):
        ip_dic = {}
        ip_dic[type_self.lower()] = ip + ':' + port
        # request the IP-echo page through the proxy; if the echoed IP matches
        # the proxy's own IP, the proxy is usable
        r = requests.get('http://1212.ip138.com/ic.asp', headers=headers, proxies=ip_dic, timeout=5)
        result = re.findall(r'\d+\.\d+\.\d+\.\d+', r.content)
        result_ip = ''.join(result)  # join the matches into a string
        if ip == result_ip:
            print type_self + '---' + ip + ':' + port + ' is useful!!!\n'
            with open('useful_proxies.txt', 'a') as f:
                f.write(type_self.lower() + '---' + ip + ':' + port + '\n')


if __name__ == '__main__':
    thread = []
    que = Queue.Queue()
    parser = optparse.OptionParser('usage %prog -p <page num>')
    parser.add_option('-p', dest='pagenum', type='int',
                      help='specify page nums, default 5', default=5)
    (options, args) = parser.parse_args()
    pagenum = options.pagenum
    for i in range(1, pagenum + 1):
        que.put('http://www.xicidaili.com/nn/' + str(i))
    for i in range(pagenum):  # one worker thread per page
        thread.append(Proxy_collection(que))
    start = time.clock()
    for i in thread:
        i.start()
    for i in thread:
        i.join()
    end = time.clock()
    print end - start  # report how long the run took
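To run it, save the script under any name (collect_proxies.py below is just a name I picked) and pass the number of pages to crawl with -p; if the option is omitted, 5 pages are crawled by default. Working proxies accumulate in useful_proxies.txt in the current directory.

python collect_proxies.py -p 10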