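# 2013年1月7日 星期一 (Monday, 7 January 2013)
# 利用BeautifulSoup抓取所有網頁中的子連結
# (Use BeautifulSoup to extract all the sub-links contained in a web page)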
from BeautifulSoup import BeautifulSoup as bs
import urllib
#建立BeautifulSoup物件bsen
url = "http://kilfu0701.blogspot.tw/2011/03/python-beautifulsoup.html"
handle = urllib.urlopen(url)
content= handle.read()
bsen = bs(content)
#處理編碼問題,利用UnicodeDammit模組
print "Process the dammit strings into appropriate encodings...."
from bs4 import UnicodeDammit
dammit = UnicodeDammit(content,["utf-8","cp950"])
print "dammit strings have cleaned."
#取出Dammit物件dammit,其中屬性unicode_markup為處理完後的文字串
print "The length of dammit message is ",len(dammit.unicode_markup)
bsgood = bs(dammit.unicode_markup)
print "The links of this html have ",len(bsgood.findAll('a'))
#以for迴圈取出標籤lnktag,並使用lnktag.get('href')取出連結
for lnktag in bsgood.findAll('a',href=True):
if lnktag.get('href').startswith('http') or lnktag.get('href').startswith(r'//'):
print lnktag.get('href')
# 訂閱: 文章 (Atom)  -- blog footer: "Subscribe to: Posts (Atom)"