2012年6月25日 星期一

Python 實用的抓圖片程式碼


#-*-coding:utf8 -*-
import re
import time
import urllib2
# Page whose embedded picture links will be harvested.
target = 'http://www.twbbs.net.tw/760449.html'

# Browser-like request headers, intended to make the script's requests look
# like an ordinary visitor's. The Referer is the page that embeds the pictures.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    'Referer': target,
}

# Fetch the page with the same headers that are later used for the pictures
# (previously it went out with urllib2's default User-Agent), and make sure
# the response is closed even if reading fails.
page = urllib2.urlopen(urllib2.Request(url=target, headers=headers))
try:
    content = page.read()
finally:
    page.close()


# Collect the value of every src=... attribute in the page markup; the
# surrounding quotes are optional.
src_pattern = re.compile(r'src=[\'"]?([^\'" >]+)')
urls = src_pattern.findall(content)

# Keep only links under the tinypic host whose text ends with one of the
# recognised picture extensions.
image_suffixes = ('jpg', 'png', 'gif')
picurls = [
    link for link in urls
    if link.startswith('http://tinypic.com/') and link.endswith(image_suffixes)
]








# Download every collected picture and save it as a sequentially numbered file.
for idx, eachpic in enumerate(picurls):
    req = urllib2.Request(
        url=str(eachpic),
        headers=headers
    )
    # Fetch once and reuse the bytes: the picture used to be requested twice,
    # with the first response thrown away.
    response = urllib2.urlopen(req)
    try:
        result = response.read()
    finally:
        response.close()
    filename = str(idx + 1) + '.jpg'
    # Binary mode so the image bytes are written unmodified; `with` closes the
    # file even if the write fails.
    # NOTE: the name always ends in .jpg, even when the URL ends in png/gif.
    with open("D:\\onepiecePics\\" + filename, "wb") as picf:
        picf.write(result)
    print(filename + ' picture Saved.')
    # Pause one second between requests to lower the chance of being flagged
    # as a robot; a randomised delay per iteration would likely be better.
    time.sleep(1)

沒有留言:

張貼留言