Tag: 爬虫 (crawler)

Posts tagged 爬虫 (crawler):

Honor of Kings (王者荣耀) HD hero artwork giveaway

I hear xx荣耀 has been pretty hot lately, so why not... write a little crawler for fun and get my touch back.

Feature description: crawl the HD hero artwork from the xx荣耀 official site and save it locally. This is the single-threaded version; change the save path to suit your own setup. I also wrote a multi-threaded version, but it got blocked by the anti-crawling measures as soon as I opened 5 threads; I'll optimize it and post it when I have time.

Source:

#coding:utf-8
#Author:LSA
#Description:crawler for wzry hero pics
#Date:20170827
import requests, os, urllib2, json
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

heroListUrl = 'http://pvp.qq.com/web201605/js/herolist.json'

headers = {
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
}

def main():
    print 'Starting......'
    # fetch the hero list (JSON) from the official site
    heroListHtml = requests.get(heroListUrl, headers=headers)
    heroListHtml.encoding = 'utf-8'
    heroListJson = heroListHtml.json()
    #print heroListJson
    heroName = list(map(lambda x: x['cname'], heroListJson))    # Chinese hero names
    heroNumber = list(map(lambda x: x['ename'], heroListJson))  # numeric hero ids used in image URLs
    print '--------Total hero number ' + str(len(heroNumber)) + '---------'
    print 'Crawler......'
    n = 0
    for i in heroNumber:
        # one directory per hero; adjust this path to your own machine
        os.mkdir("/lsa/Pictures/kingheropics/" + heroName[n])
        os.chdir("/lsa/Pictures/kingheropics/" + heroName[n])
        print '===Get ' + heroName[n] + ' pics==='
        for j in range(1, 16):
            heroUrl = "http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/%s/%s-bigskin-%s.jpg" % (str(i), str(i), str(j))
            heroPic = requests.get(heroUrl, headers=headers)
            if heroPic.status_code == 200 or heroPic.status_code == 304:
                open(heroUrl.split('-')[-1], 'wb').write(heroPic.content)
                print 'Got ' + str(heroName[n]) + ' pic' + ' ' + str(j)
            else:
                # no more skins for this hero
                break
        n = n + 1
    print '=========================All over!======================='

if __name__ == '__main__':
    main()

Screenshots:

I ran into a Chinese character-encoding (mojibake) problem while writing this program; I'll go through it in detail in the next post.
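The multi-threaded version mentioned above isn't posted yet, so here is only a minimal sketch of one way such a version could be throttled to avoid tripping the anti-crawling measures; the 3 worker threads, the 0.5 s pause, and downloading just the first skin of each hero are my own assumptions, not the unpublished version.

#coding:utf-8
# Throttled multi-threaded sketch (assumed: 3 workers, 0.5 s delay, first skin only);
# it reuses the same hero list URL, headers, and image URL pattern as the script above.
import requests, threading, time
from Queue import Queue, Empty

heroListUrl = 'http://pvp.qq.com/web201605/js/herolist.json'
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"}

def worker(que):
    while True:
        try:
            ename = que.get_nowait()   # numeric hero id
        except Empty:
            return
        url = "http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/%s/%s-bigskin-1.jpg" % (ename, ename)
        pic = requests.get(url, headers=headers)
        if pic.status_code == 200:
            open(str(ename) + '-bigskin-1.jpg', 'wb').write(pic.content)
        time.sleep(0.5)   # pause per request so a handful of threads do not hammer the server
        que.task_done()

if __name__ == '__main__':
    que = Queue()
    for hero in requests.get(heroListUrl, headers=headers).json():
        que.put(hero['ename'])
    threads = [threading.Thread(target=worker, args=(que,)) for _ in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()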

Collecting Baidu URLs with Python

I recently wrote a Python script that collects Baidu search result URLs. Each thread crawls all the URLs on one results page at a time; it uses multithreading, a queue, and bs4, and bs4 feels quite powerful.

Feature description: collect Baidu URLs, with a configurable page count, thread count, keyword, and output file name. (Note: a thread count less than or equal to the page count is enough; because a queue is used, a thread count greater than the page count behaves the same as one equal to it.)

Source code:

#coding: utf-8
import requests, re, threading, time
from bs4 import BeautifulSoup as bs
from Queue import Queue
from argparse import ArgumentParser

arg = ArgumentParser(description='baidu_url_collection')
arg.add_argument('keyword', help='inurl:.asp?id=1')
arg.add_argument('-p', '--page', help='page count', dest='pagecount', type=int)
arg.add_argument('-t', '--thread', help='the thread_count', dest='thread_count', type=int, default=10)
arg.add_argument('-o', '--outfile', help='the file save result', dest='outfile', default='result.txt')
result = arg.parse_args()

headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)'}

class Bd_url(threading.Thread):
    def __init__(self, que):
        threading.Thread.__init__(self)
        self._que = que

    def run(self):
        # each worker keeps taking result pages off the queue until it is empty
        while not self._que.empty():
            URL = self._que.get()
            try:
                self.bd_url_collect(URL)
            except Exception, e:
                print e
                pass

    def bd_url_collect(self, url):
        r = requests.get(url, headers=headers, timeout=5)
        soup = bs(r.content, 'lxml', from_encoding='utf-8')
        # result links on a Baidu page carry a data-click attribute
        bqs = soup.find_all(name='a', attrs={'data-click': re.compile(r'.'), 'class': None})
        for bq in bqs:
            # follow the redirect link to obtain the real target URL
            r = requests.get(bq['href'], headers=headers, timeout=5)
            if r.status_code == 200:
                print r.url
                with open(result.outfile, 'a') as f:
                    f.write(r.url + '\n')

def main():
    thread = []
    thread_count = result.thread_count
    que = Queue()
    for i in range(0, result.pagecount):
        que.put('https://www.baidu.com/s?wd=' + result.keyword + '&pn=' + str(i))
    for i in range(thread_count):
        thread.append(Bd_url(que))
    for i in thread:
        i.start()
    for i in thread:
        i.join()

if __name__ == '__main__':
    start = time.clock()......
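Assuming the script above is saved as baidu_url.py (the filename is just an example), the argparse options it defines make a typical run look like this:

python baidu_url.py "inurl:.asp?id=1" -p 10 -t 5 -o result.txt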

A Python book-scraping program

A simple Python program that collects book data from douban, a little crawler with a simple GUI built with Tk. It is single-threaded; multithreading would make it faster and may be added in version 2.0.

#coding:utf-8
#author:LSA
#date:20160730
#description:collect douban's books, single thread.
from ScrolledText import ScrolledText
from Tkinter import *
import urllib, re, threading

def get_book(ID):
    # ID is the start offset; the page lists 20 books at a time
    varl.set('collecting %s page' % str((ID/20)+1))
    html = urllib.urlopen('https://read.douban.com/tag/%E7%88%B1%E6%83%85/?cat=article&sort=top&start=' + str(ID))
    bookmsg = html.read()
    # capture (price, book name) pairs from the page markup
    reg = r'<span class="price-tag ">(.*?)元</span>.*? \'read.douban.com\'\)">(.*?)</a>'
    reg = re.compile(reg)
    h = re.findall(reg, bookmsg)
    return h

def get_msg():
    ID = 0
    p = []
    x = 0
    while ID < 400:
        List = get_book(ID)
        ID += 20
        for i in List:
            x = x + 1
            p.append(float(i[0]))
            text.insert(END, 'book_name:%s price:%s\n' % (i[1], i[0]))
    text.insert(END, '===============statistical======================\n')
    text.insert(END, 'book_sum_number: %s\n' % x)
    text.insert(END, 'book_sum_price: %s\n' % sum(p))
    text.insert(END, 'book_average_price: %f\n' % (sum(p)/x))

def startthread():
    # run the crawl in a worker thread so the GUI stays responsive
    t = threading.Thread(target=get_msg)
    t.start()

root = Tk()
root.title('douban_book_collection')
#root.geometry('+2100+300')
text = ScrolledText(root, font=('新宋体', 10))
text.grid()
b = Button(root, text='start', font=('新宋体', 10), command=startthread)
b.grid()
varl = StringVar()
l = Label(root, textvariable=varl, fg='green')
l.grid()
varl.set('ready')
root.mainloop()
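To make the (price, book name) extraction easier to follow, this tiny sketch runs the same regex from get_book() against a made-up, simplified fragment of page markup; the snippet is hypothetical, not real douban HTML.

#coding:utf-8
import re

# hypothetical, simplified fragment of a book entry
sample = ('<span class="price-tag ">1.99元</span> ... '
          '<a onclick="moreurl(this, \'read.douban.com\')">BookTitle</a>')
reg = re.compile(r'<span class="price-tag ">(.*?)元</span>.*? \'read.douban.com\'\)">(.*?)</a>')
print reg.findall(sample)   # -> [('1.99', 'BookTitle')]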

A Python batch-registration script

Last time I wrote an extremely primitive register/login flow in PHP + MySQL. This Python script targets that registration page and performs batch registration.

Related knowledge points: 1. packet capture to get the request data; 2. basic usage of urllib and urllib2; 3. simple crawler operations; 4. writing to a file.

Data obtained from the packet capture:

#coding:utf-8
import random, urllib, urllib2
import re, time

nums = raw_input("input the number for reg:")

def reg(num, nums):
    # random username and password for each registration
    username = str(random.randrange(100000, 9999999))
    password = str(random.randrange(1000000, 99999999))
    url = "http://192.168.43.106/reg_check.php"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36',
        'Origin': 'http://192.168.43.106',
        'Referer': 'http://192.168.43.106/regtest.php',
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    data = {"username": username,
            "password": password,
            "pwd_again": password,
            }
    data = urllib.urlencode(data)
    req = urllib2.Request(url, headers=headers, data=data)
    html = urllib2.urlopen(req).read()
    # the page returns "Success reg!" when registration succeeds
    reg = u'Success reg!'
    reg = re.compile(reg)
    r = re.findall(reg, html)
    if r != []:
        print "success reg!--->username=%s,password=%s------(%s/%s)" % (username, password, num+1, nums)
        f = open(r'E:\python_projects\regusers.txt', 'a')
        f.write("%s---%s(%s)\n" % (username, password, str(num+1)))
        f.close()

for num in range(int(nums)):
    reg(num, nums)
    time.sleep(1)

Check the database and the site to confirm the accounts were created. Registered successfully! This registration script is fairly simple; I'll optimize it later when I have time.
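For the urllib part, the short sketch below shows what urlencode does to the form dict before it is POSTed as application/x-www-form-urlencoded; the username and password values are made up, not output of the script.

#coding:utf-8
import urllib

# made-up example values, only to show the encoded POST body format
data = urllib.urlencode({"username": "1234567",
                         "password": "7654321",
                         "pwd_again": "7654321"})
print data   # e.g. username=1234567&pwd_again=7654321&password=7654321 (key order is not fixed in Python 2)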