您现在的位置：主页 > 上位机技术 > python > python百度图片爬虫

本文所属标签:

python百度图片爬虫

来源：网络整理，网络用户发布，如有版权问题请联系网管删除　2018-08-13　

上一篇我写了如何爬取百度网盘的爬虫，在这里还是重温一下，把链接附上:

http://www.cnblogs.com/huangxie/p/5473273.html

这一篇我想写写如何爬取百度图片的爬虫，这个爬虫也是:搜搜gif(在线制作功能点我) 的爬虫代码，其实爬虫整体框架还是差不多的，但就是会涉及到图片的一些处理，还是花费了我不少时间的，所以我请阅读本爬虫的孩子还是认真一些，毕竟程序猿都不容易啊。好的，我也不想多说，爬虫的代码我会分享到去转盘网，想下载本爬虫代码的孩子请点我下载，如果没有下载到，请点击这个链接。

PS:不会python的孩子赶快去补补吧，先把基础搞清楚再说

"""

Created on 2015-9-17

"""

import time,math,os,re,urllib,urllib2,cookielib
import json
import sys
import threading
import time
import uuid
from threading import Thread
from Queue import Queue

from bs4 import BeautifulSoup
import MySQLdb as mdb
from MySQLdb.constants.REFRESH import STATUS

import imitate_browser

# Python 2 only: site.py deletes sys.setdefaultencoding at start-up, so the
# module has to be reloaded before the call is possible.  The guard keeps the
# module importable on interpreters where neither name exists.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# MySQL connection settings.
DB_HOST = '127.0.0.1'
# NOTE(review): DB_USER and DB_PASS are passed to mdb.connect() by the crawler
# but were not defined anywhere in the visible source.  They are read from the
# environment here; the fallback values are placeholders -- confirm the real
# credentials before running.
DB_USER = os.environ.get('SOSOGIF_DB_USER', 'root')
DB_PASS = os.environ.get('SOSOGIF_DB_PASS', '')

# Proxy mapping handed to urllib2.ProxyHandler.
proxy = {u'http':u'222.39.64.13:8118'}

# URL templates: image-search JSON endpoint and ordinary web search.
TOP_URL="http://image.baidu.com/i?tn=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn}"

KEYWORD_URL="https://www.baidu.com/s?ie=utf-8&f=8&tn=baidu&wd={wd}"

# Full browser-like header set.  It is superseded by the minimal assignment
# right below, which is the value the requests actually use.
i_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'json;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
    # NOTE: if fetching still fails, this can be set to the host of the
    # site being crawled.
    'Referer': None,
}

i_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}

def GetDateString():
    """Return today's local date as ``YEAR-MONTH-DAY``.

    Month and day are NOT zero-padded (e.g. ``2018-8-13``); the string is
    used as a folder name, so the format must stay stable.
    """
    now = time.localtime()
    return "%d-%d-%d" % (now.tm_year, now.tm_mon, now.tm_mday)

# Crawler object for Baidu image search, declared as a threading.Thread
# subclass.
# NOTE(review): this copy of the source has lost all indentation, so the
# statements below are presumably the class body / the body of __init__.
class BaiduImage(threading.Thread):

# Set up the browser helper, the two work queues, the lock, the counters and
# the MySQL connection (database 'sosogif', autocommit off), then call
# get_pic() once.
# NOTE(review): get_pic() reads self.key_word_queue, self.chance,
# self.chance1 and self.ID, none of which is assigned in the lines visible
# here; DB_USER and DB_PASS are not defined in the visible source either.
def __init__(self):

Thread.__init__(self)

self.browser=imitate_browser.BrowserBase()

# (id, word, page_num) tuples waiting to be fetched.
self.request_queue=Queue()

self.wait_ana_queue=Queue()

#self.key_word_queue.put((("动态图", 0, 24)))

self.count=0

self.mutex = threading.RLock() # re-entrant lock: the thread that holds it may acquire it again

# Number of pic_info inserts since the counter was last reset.
self.commit_count=0

# Candidate (protocol, "ip:port") proxies to switch to.
self.next_proxy_set = set()

self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'sosogif', charset='utf8')

self.dbconn.autocommit(False)

self.dbcurr = self.dbconn.cursor()

self.dbcurr.execute('SET NAMES utf8')

self.get_pic()

# Worker-thread entry point: print a start message for `item`, then call
# prepare_request() in an endless loop.
# NOTE(review): indentation was lost in this copy; the statements after the
# `def` line are presumably the method body and prepare_request() the loop
# body.  Lines may also be missing -- confirm against the original.
def work(self,item):

print "start thread",item

while True: # wait when there are MAX_REQUEST or more entries (NOTE(review): no such wait is visible here)

self.prepare_request()

def format_keyword_url(self,keyword):
    """Return the UTF-8 encoded web-search URL for *keyword*.

    The URL is built from the module-level ``KEYWORD_URL`` template; the
    keyword is inserted as-is (no percent-encoding is applied).
    """
    return KEYWORD_URL.format(wd=keyword).encode('utf-8')

def generateSeed(self,url):
    """Collect new seed keywords from a search-result page into ``info``.

    Downloads *url*, walks every ``<th>`` of the table inside ``div#rs``
    (presumably the related-searches box -- confirm), and for each linked
    keyword containing "动态图" or "gif" inserts a fresh ``info`` row unless
    the word is already present.  All inserts are committed together.

    Returns None.  Does nothing when the page has no ``div#rs`` table.
    """
    html = self.browser.openurl(url).read()
    soup = BeautifulSoup(html)

    related = soup.find('div', id='rs')
    table = related.find('table') if related is not None else None
    if table is None:
        return

    for tr in table.find_all('tr'):  # every table row
        for th in tr.find_all('th'):
            links = th.find_all('a')
            if not links:
                continue
            keyword = links[0].text.strip()
            if "动态图" not in keyword and "gif" not in keyword:
                continue
            print("keyword %s" % keyword)
            # Parameters are passed as a real 1-tuple so the driver does the
            # quoting; only insert words that are not stored yet.
            self.dbcurr.execute('select id from info where word=%s', (keyword,))
            if self.dbcurr.fetchone() is None:
                self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,0,0,0,0)', (keyword,))
    self.dbconn.commit()

# Take one `info` row whose status is 0, queue it for downloading and, for a
# keyword that has not been expanded yet, create the per-page `info` rows.
# NOTE(review): indentation was lost and lines are missing in this copy: the
# `try:` that pairs with the `except` below, the assignment of `html`, the
# definition of `num`, and the body of `if how_many==None:` are not visible.
def prepare_request(self):

self.dbcurr.execute('select * from info where status=0')

result = self.dbcurr.fetchone()

# NOTE(review): fetchone() result is unpacked directly; no visible guard for
# the no-row case.
id,word,status,page_num,left_num,how_many=result

self.request_queue.put((id,word,page_num))

# All three counters at zero marks a keyword row that was never expanded.
if page_num==0 and left_num==0 and how_many==0:

url=self.format_keyword_url(word)

self.generateSeed(url)

url=self.format_top_url(word, page_num, 24)

except Exception as err:

print "err",err

if html!="":

how_many=self.how_many(html)

print "how_many",how_many

if how_many==None:

t=math.ceil(how_many/24*100) # only the first 1/100 is needed (NOTE(review): the expression multiplies by 100 -- confirm intent)

# One row per result page of 24 items; page_num is the item offset.
for i in xrange(0,num-1):

self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,%s,%s,%s,%s)',(word,0,i*24,num-i,how_many))

self.dbcurr.execute('update info SET status=1 WHERE id=%s',(id)) # mark as visited

def start_work(self,req_max):
    """Start *req_max* worker threads, each running ``self.work(index)``.

    ``index`` runs from 0 to ``req_max - 1``.  The threads are started
    immediately (the visible source only constructed them); they are not
    joined here.  Returns None.
    """
    for item in range(req_max):
        t = threading.Thread(target=self.work, args=(item,))
        t.start()

def lock(self): # acquire the lock
    """Acquire ``self.mutex``; pair every call with ``unlock()``."""
    self.mutex.acquire()

def unlock(self): # release the lock
    """Release ``self.mutex`` previously taken with ``lock()``."""
    self.mutex.release()

def get_para(self,url,key):
    """Return the raw value of query parameter *key* in *url*.

    Everything after the last ``?`` is treated as the query string and
    split on ``&``.  The first pair whose name equals *key* wins; its value
    is returned as written (no percent-decoding).  A parameter without
    ``=`` yields ``""``.  Returns None when *key* is not present.
    """
    query = url.split('?')[-1]
    for pair in query.split('&'):
        # partition keeps any further '=' inside the value intact.
        name, _, value = pair.partition('=')
        if name == key:
            return value
    return None

def makeDateFolder( self,par,child):
    """Return ``par//<date>//child``, creating both folders when needed.

    ``<date>`` is the value of ``GetDateString()``.  The ``//`` separators
    are kept exactly as in the original because the resulting path is
    stored alongside the downloaded file.

    Returns None when *par* is not an existing directory.
    NOTE(review): the visible source shows no branch for that case --
    confirm the intended fallback.
    """
    if not os.path.isdir(par):
        return None

    path = par + '//' + GetDateString()
    newFolderName = path + '//' + child
    # Create the date folder first, then the child folder inside it.
    for folder in (path, newFolderName):
        if os.path.isdir(folder):
            continue
        try:
            os.mkdir(folder)
        except OSError:
            # Another worker thread may have created it in the meantime;
            # only re-raise when the folder really is not there.
            if not os.path.isdir(folder):
                raise
    return newFolderName

# Parse a JSON response body and, for every entry of ipdata['imgs'] that has
# an objURL, download the image to disk and pass it to anaylis_info().
# NOTE(review): indentation was lost and lines are missing in this copy: the
# `try:` statements that pair with the `except` clauses, the lock() call that
# pairs with unlock(), the fetch/test of the duplicate query result (`y`),
# the `continue`, the `else:` branches and the body of the second
# `except IOError` are not visible.  `utils` is used but not imported in the
# visible source.
def parse_json(self,data):

ipdata = json.loads(data)

if ipdata['imgs']:

for n in ipdata['imgs']: # each entry of the imgs list

if n['objURL']:

# Install a process-wide urllib2 opener that goes through the module-level
# `proxy` mapping.
proxy_support = urllib2.ProxyHandler(proxy)

opener = urllib2.build_opener(proxy_support)

urllib2.install_opener(opener)

#print "proxy",proxy

# Look the URL up in pic_info (presumably to skip already stored pictures).
self.dbcurr.execute('select ID from pic_info where objURL=%s', (n['objURL']))

#print "y=",y

print "database exist"

self.unlock() # unlock before continue

real_extension=utils.get_extension(n['objURL'])

# Download the image with a 5 second timeout.
req = urllib2.Request(n['objURL'],headers=i_headers)

resp = urllib2.urlopen(req,None,5)

dataimg=resp.read()

# A fresh UUID is used as the base of the file name.
name=str(uuid.uuid1())

filename=""

# Extensions longer than 4 characters are replaced by ".gif".
if len(real_extension)>4:

real_extension=".gif"

real_extension=real_extension.lower()

# The folder is chosen from self.count: "d<count % 60>" here, "o<count % 20>"
# in the second assignment (presumably the non-gif branch -- the `else:` is
# not visible).
if real_extension==".gif":

filename =self.makeDateFolder("E://sosogif", "d"+str(self.count % 60))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension

filename =self.makeDateFolder("E://sosogif", "o"+str(self.count % 20))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension

if not os.path.exists(filename):

file_object = open(filename,'w+b')

file_object.write(dataimg)

file_object.close()

self.anaylis_info(n,filename,real_extension) # store the record in the database

print "file exist"

except IOError,e1:

print "e1=",e1

except IOError,e2:

#print "e2=",e2

except Exception as parse_error:

print "parse_error",parse_error

def title_dealwith(self,title):
    """Return *title* with ``<strong>``/``</strong>`` tags removed.

    Only the tags are dropped; the text they enclose is kept.  The result
    is stripped of leading and trailing whitespace.  Titles without any tag
    (or with several tagged spans) are handled, which the original
    index-based slicing did not do.
    """
    #print "title",title
    return title.replace("<strong>", "").replace("</strong>", "").strip()

# Store the metadata of one downloaded picture in the pic_info table.
#   n              -- one entry of the 'imgs' list from the JSON response
#   filename       -- path the image was written to
#   real_extension -- file extension chosen by the caller
# NOTE(review): indentation was lost and lines are missing in this copy:
# `cs` and `os` are used below but never assigned in the visible lines (so
# `os` would refer to the imported module), the result of the duplicate query
# is not read, and the final `if self.commit_count==10:` has no visible body.
def anaylis_info(self,n,filename,real_extension):

print "success."

#if self.wait_ana_queue.qsize()!=0:

#n,filename,real_extension=self.wait.ana_queue.get()

objURL=n['objURL'] # image URL

fromURLHost=n['fromURLHost'] # source site

width=n['width'] # width

height=n['height'] # height

di=n['di'] # used as a unique identifier

type=n['type'] # format (note: shadows the builtin `type` in this scope)

fromPageTitle=n['fromPageTitle'] # taken from the source site (original comment: "from website")

# The stored keyword is the page title with <strong> markup removed.
keyword=self.title_dealwith(fromPageTitle)

temp = time.time()

x = time.localtime(float(temp))

acTime = time.strftime("%Y-%m-%d %H:%M:%S",x) # crawl time

self.dbcurr.execute('select ID from pic_info where cs=%s', (cs))

print 'add pic',filename

self.commit_count+=1

self.dbcurr.execute('INSERT INTO pic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension))

# NOTE(review): presumably commits every 10 inserts -- the body is missing.
if self.commit_count==10:

def format_top_url(self,word,pn,rn):
    """Return the UTF-8 encoded image-search URL for one result page.

    The URL is built from the module-level ``TOP_URL`` template, whose
    placeholders are *word*, *pn* and *rn* (passed through unchanged).
    Callers use the return value, so the URL must be returned rather than
    only bound locally as in the visible source.
    """
    return TOP_URL.format(word=word, pn=pn, rn=rn).encode('utf-8')

def how_many(self,data):
    """Return the ``displayNum`` value of a JSON response body.

    Returns the value as an int when it is positive, 0 when it is zero or
    negative, and None when *data* is not valid JSON or has no usable
    ``displayNum`` field.
    """
    try:
        ipdata = json.loads(data)
        display_num = int(ipdata['displayNum'])
    except (ValueError, TypeError, KeyError):
        # Malformed body or missing / non-numeric field.
        return None
    return display_num if display_num > 0 else 0

# Fetch image-search result pages and hand each response body to
# parse_json(); also contains proxy-rotation logic driven by the `proxy`
# table.
# NOTE(review): this block is heavily truncated in this copy (indentation
# lost, `try:` statements, loop/branch bodies and several assignments
# missing).  Names used below but not assigned in the visible lines include
# how_many, num, count, protocol, ip, port, e and the second `response`;
# self.key_word_queue, self.chance, self.chance1 and self.ID are not set in
# the visible __init__ either.
def get_pic(self):

# Default search word; replaced when the keyword queue has an entry.
word="gif"

if self.key_word_queue.qsize()!=0:

word,pn,rn=self.key_word_queue.get()

url=self.format_top_url(word,pn,rn)

# Request the page with a 5 second timeout.
req = urllib2.Request(url,headers=i_headers)

response = urllib2.urlopen(req, None,5)

#print "url",url

# Read the search word and page size back out of the URL.
word=self.get_para(url,"word")

rn=int(self.get_para(url,"rn"))

t=math.ceil(how_many/rn)

for item in xrange(0,num-1):

print "size of queue",self.request_queue.qsize()

# Take the next queued (id, word, page_num), build its URL with a page size
# of 24 and mark the row as visited.
if self.request_queue.qsize()!=0:

id,word,page_num = self.request_queue.get()

u=self.format_top_url(word,page_num,24)

self.dbcurr.execute('update info SET status=1 WHERE id=%s',(id))

if self.chance >0 or self.chance1>1: # switch proxy as soon as either one reports a problem

if self.ID % 100==0:

self.dbcurr.execute("select count(*) from proxy")

for r in self.dbcurr:

if self.ID>count:

self.dbcurr.execute("select * from proxy where ID=%s",(self.ID))

results = self.dbcurr.fetchall()

# Candidate proxies are stored as (protocol, "ip:port") tuples.
pro=(protocol,ip+":"+port)

if pro not in self.next_proxy_set:

self.next_proxy_set.add(pro)

req = urllib2.Request(u,headers=i_headers)

#print "u=",u

html = response.read()

#print "html",type(html)

self.parse_json(html)

except Exception as ex1:

#print "error=",ex1

if self.chance>0 or self.chance1>1:

if len(self.next_proxy_set)>0:

protocol,socket=self.next_proxy_set.pop()

# NOTE(review): no `global proxy` declaration is visible, so this may bind a
# local name rather than the module-level `proxy` -- confirm.
proxy= {protocol:socket}

print "change proxy finished<<",proxy,self.ID

print "error1",e

if __name__ == '__main__':
    # Script entry point: build the crawler, then start 80 worker threads.
    app = BaiduImage()
    app.start_work(80)
    #app.generateSeed()

本人建个qq群，欢迎大家一起交流技术，群号:512245829 喜欢微博的朋友关注:转盘娱乐即可

　　　　　　　　　　　　　查看评论回复

嵌入式交流网主页 > 上位机技术 > python > python百度图片爬虫

代码还是爬虫

"python百度图片爬虫"的相关文章

上一篇：Python3是趋势还是坑!5年内能普及吗?
下一篇：那些诡异难调的Bug

热门文章

♦: Django之模板系统

♦: 为什么我们程序员工作得这么累？

♦: Pandas基础(11)-用melt做格式转换

♦: Python爬虫是怎么回事？一张图告诉你爬

♦: Python高级爬虫（四）：动态加载页面的

♦: python通过pil为png图片填充上背景颜色

相关文章

python百度图片爬虫

"python百度图片爬虫"的相关文章

热门文章

论坛热帖

相关图文

网站地图