Web Scraping Practice Tutorial 0

At its core, web scraping means extracting the information we need from web pages.
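
As a minimal sketch of that idea (requests, BeautifulSoup, and lxml are assumed to be installed; http://example.com is only a placeholder page, not part of this tutorial):

# coding=utf-8
import requests
from bs4 import BeautifulSoup

# Fetch one page and pull a single piece of text out of it.
resp = requests.get('http://example.com')
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')
print(soup.select_one('title').get_text())

The full program below applies the same fetch-parse-extract pattern to real listing pages and adds MongoDB storage.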

# coding=utf-8
from bs4 import BeautifulSoup
import requests
from pymongo import MongoClient

# Connect to the local MongoDB instance and pick the target database/collection.
client = MongoClient('localhost', 27017)
db = client['guazigz']
item_info = db['item_info']


def detailOper(url):
    # Fetch the listing page and parse it with the lxml parser.
    web_data = requests.get(url)
    web_data.encoding = 'utf-8'
    soup = BeautifulSoup(web_data.text, 'lxml')
    # CSS selectors for each listing's title link and its price element.
    titles = soup.select('div.list > ul > li > div > p.infoBox > a')
    prices = soup.select('div.list > ul > li > div > p.priType-s > span > i.fc-org.priType')
    for title, price in zip(titles, prices):
        data = {
            'title': title.get_text(),
            'detailHref': title.get('href'),
            # Strip the 万 (ten-thousand yuan) unit and surrounding whitespace.
            'price': price.get_text().replace(u'万', '').replace(' ', '').replace('\n', '')
        }
        item_info.insert_one(data)
        print(data)  # For inspection only; can be removed.


def start():
    # Pages 1..29 of the Guazi Guangzhou used-car listings.
    urls = ['http://www.guazi.com/gz/buy/o{}/'.format(i) for i in range(1, 30)]
    for url in urls:
        detailOper(url)


if __name__ == '__main__':
    start()

The program above is fairly easy to read. It scrapes 29 pages of listings from the Guazi used-car site (瓜子二手车), 40 listings per page, and stores the resulting 1160 records in a MongoDB database named guazigz, in a collection named item_info. Note that print(data) is only there to display the data clearly as it is collected; in practice it can be removed or commented out.
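
To sanity-check the run afterwards, a quick query against the collection is enough. A minimal sketch, assuming pymongo 3.7 or later (where count_documents is available):

# coding=utf-8
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['guazigz']
# Expect about 1160 documents if all 29 pages yielded 40 listings each.
print(db['item_info'].count_documents({}))
# Show one stored record to confirm the field layout.
print(db['item_info'].find_one())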