stayhigh, Python: 1月 2013

2013年1月7日星期一

利用BeautifulSoup抓取網頁中的所有子連結

from BeautifulSoup import BeautifulSoup as bs
import urllib

# Fetch one web page, decode it robustly, and print every absolute link
# (href starting with "http" or the protocol-relative "//") found in it.
#
# NOTE: this script targets Python 2 (urllib.urlopen) and parses with the
# BeautifulSoup class imported above as `bs`. Every print below takes a single
# parenthesised argument, so the emitted text is the same whether print is a
# statement or a function.
from bs4 import UnicodeDammit

# Download the raw bytes of the page; release the connection even if the
# read fails.
url = "http://kilfu0701.blogspot.tw/2011/03/python-beautifulsoup.html"
handle = urllib.urlopen(url)
try:
    content = handle.read()
finally:
    handle.close()

# Handle encoding problems: let UnicodeDammit decode the bytes, with UTF-8
# and cp950 supplied as candidate encodings.
print("Process the dammit strings into appropriate encodings....")
dammit = UnicodeDammit(content, ["utf-8", "cp950"])
print("dammit strings have cleaned.")

# dammit.unicode_markup holds the decoded text; parse that text (once) rather
# than the raw bytes.
print("The length of dammit message is  %d" % len(dammit.unicode_markup))
bsgood = bs(dammit.unicode_markup)

# Collect the <a> tags a single time and reuse the list for both the count
# and the link loop. The count includes anchors without an href.
anchors = bsgood.findAll('a')
print("The links of this html have  %d" % len(anchors))

# Walk the anchor tags and print the href of each one that carries an
# absolute or protocol-relative URL; anchors without an href are skipped.
for lnktag in anchors:
    href = lnktag.get('href')
    if href is not None and href.startswith(('http', '//')):
        print(href)

2013年1月7日 星期一

利用BeautifulSoup抓取網頁中的所有子連結

2013年1月7日星期一