# 安裝google chrome plugin,SelectorGadget快速取得selector與xpath等相關資訊,網頁爬蟲特別實用
# 安裝firefox plugin,sqlite manager觀看資料庫內容:http://www.minwt.com/website/server/4964.html
#開啟apple新的scrapy專案
stayhigh@stayhighnet:/Users/stayhigh/projects/apple $ scrapy startproject apple
#執行apple的scrapy專案
stayhigh@stayhighnet:/Users/stayhigh/projects/apple $ scrapy crawl apple
#執行apple的scrapy專案,並且將輸出a.json的json格式檔案
stayhigh@stayhighnet:/Users/stayhigh/projects/apple $ scrapy crawl apple -o a.json -t json
#執行分段爬蟲任務並放置相關資料於job1目錄
stayhigh@stayhighnet:/Users/stayhigh/projects/apple $ scrapy crawl apple -s JOBDIR=job1
#觀看apple專案目錄結構
- crawler.py為使用者自行定義的爬蟲程式,藉由繼承scrapy.Spider類別進行網頁抓取
- items.py 用於定義資料欄位
- pipelines.py 用於定義item pipeline,處理爬蟲抓取到的資料(如清洗、驗證、儲存)
- settings.py 設定檔,用於設定啟用的功能,如常見的pipeline功能,並切記設定時指定pipelines.py當中的apple.pipelines.ApplePipeline
ITEM_PIPELINES = {
    'apple.pipelines.ApplePipeline': 300,
}
stayhigh@stayhighnet:/Users/stayhigh/projects/apple $ tree
.
├── a.json
├── apple
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── items.py
│ ├── items.pyc
│ ├── pipelines.py
│ ├── settings.py
│ ├── settings.pyc
│ └── spiders
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── crawler.py
│ └── crawler.pyc
└── scrapy.cfg
#如何實現多網頁爬取功能
from scrapy.spiders import CrawlSpider
# crawler.py內的爬蟲類別繼承CrawlSpider
class AppleCrawler(CrawlSpider):
沒有留言:
張貼留言