2013年1月7日 星期一

利用BeautifulSoup抓取所有網頁中的子連結


from BeautifulSoup import BeautifulSoup as bs
import urllib

#建立BeautifulSoup物件bsen
url = "http://kilfu0701.blogspot.tw/2011/03/python-beautifulsoup.html" 
handle  = urllib.urlopen(url)
content= handle.read()
bsen = bs(content)

#處理編碼問題,利用UnicodeDammit模組
print "Process the dammit strings into appropriate encodings...."
from bs4 import UnicodeDammit
dammit = UnicodeDammit(content,["utf-8","cp950"])
print "dammit strings have cleaned."

#取出Dammit物件dammit,其中屬性unicode_markup為處理完後的文字串
print "The length of dammit message is ",len(dammit.unicode_markup)
bsgood = bs(dammit.unicode_markup)
print "The links of this html have ",len(bsgood.findAll('a'))

#以for迴圈取出標籤lnktag,並使用lnktag.get('href')取出連結
for lnktag in bsgood.findAll('a',href=True):
    if lnktag.get('href').startswith('http') or lnktag.get('href').startswith(r'//'):
        print lnktag.get('href')