# -*- coding: utf-8 -*-
import re
import time
import urllib2
# Headers that mimic a normal browser visit, to avoid trivial bot blocking
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    'Referer': 'http://www.twbbs.net.tw/760449.html'
}
# Target webpage; fetch it using the browser-like headers above
target = 'http://www.twbbs.net.tw/760449.html'
content = urllib2.urlopen(urllib2.Request(target, headers=headers)).read()
# Grab every src attribute on the page; each match is appended to the urls list
urls = re.findall(r'src=[\'"]?([^\'" >]+)', content)
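# For example, a tag like <img src="http://tinypic.com/abc.jpg"> (hypothetical
# markup) would contribute 'http://tinypic.com/abc.jpg' to urls.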
picurls = []
# Keep only tinypic-hosted URLs with a known picture extension
for url in urls:
    if url.startswith('http://tinypic.com/') and url.endswith(('jpg', 'png', 'gif')):
        picurls.append(url)
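
# open() below fails if the output folder does not exist; this small guard
# (a hedged addition, not in the original post) creates it first.
import os
if not os.path.isdir('D:\\onepiecePics'):
    os.makedirs('D:\\onepiecePics')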
# Start the scraping loop
for idx, eachpic in enumerate(picurls):
    req = urllib2.Request(
        url=eachpic,
        headers=headers
    )
    result = urllib2.urlopen(req).read()
    # Open the file in binary mode ("wb") so the image bytes are written as-is
    ext = eachpic[-3:]  # the filter above guarantees jpg/png/gif here
    picf = open('D:\\onepiecePics\\' + str(idx + 1) + '.' + ext, 'wb')
    picf.write(result)  # reuse the bytes already fetched instead of downloading twice
    picf.close()
    print str(idx + 1) + '.' + ext + ' picture saved.'
    # Pause between requests to avoid robot detection;
    # a random delay per iteration would be even better.
    time.sleep(1)
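    # A minimal sketch of the random-delay idea above (assumes `import random`
    # beside the other imports); the 1-5 second range is an arbitrary choice:
    #     time.sleep(random.uniform(1, 5))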