At its core, a crawler simply extracts the information we need from web pages.
# coding=utf-8
from bs4 import BeautifulSoup
import requests
from pymongo import MongoClient

# Connect to a local MongoDB instance; use the guazigz database
# and its item_info collection.
client = MongoClient('localhost', 27017)
db = client['guazigz']
item_info = db['item_info']

def detailOper(url):
    # Fetch one listing page and parse it with the lxml parser.
    web_data = requests.get(url)
    web_data.encoding = 'utf-8'
    soup = BeautifulSoup(web_data.text, 'lxml')
    # CSS selectors for each car's title/link and its price.
    titles = soup.select('div.list > ul > li > div > p.infoBox > a')
    prices = soup.select('div.list > ul > li > div > p.priType-s > span > i.fc-org.priType')
    for title, price in zip(titles, prices):
        data = {
            'title': title.get_text(),
            'detailHref': title.get('href'),
            # Strip the '万' unit (10,000 yuan) plus spaces and newlines.
            'price': price.get_text().replace(u'万', '').replace(' ', '').replace('\n', '')
        }
        item_info.insert_one(data)
        print(data)

def start():
    # Listing pages 1 through 29: http://www.guazi.com/gz/buy/o1/ ... o29/
    urls = ['http://www.guazi.com/gz/buy/o{}/'.format(i) for i in range(1, 30)]
    for url in urls:
        detailOper(url)

if __name__ == '__main__':
    start()
The program above is fairly easy to read. It scrapes 29 pages of listings from the Guazi used-car site, 40 items per page, and stores all 1160 records in a MongoDB database named guazigz, in a collection named item_info. Note that print(data) is only there to show the data clearly as it is scraped; in practice it can be removed or commented out.
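To confirm what actually landed in the collection, you can query it back with pymongo. The sketch below is not part of the original script; it assumes the same local MongoDB instance and a pymongo version recent enough (3.7+) to provide count_documents. It counts the stored records and prints the five cheapest listings; since prices were stored as strings in units of 万 (10,000 yuan), they are cast to float before sorting.

# coding=utf-8
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
item_info = client['guazigz']['item_info']

# Total number of stored listings (should be 1160 for 29 pages x 40 items).
print(item_info.count_documents({}))

# Prices were stored as strings, so convert to float before sorting;
# skip any document with a missing or malformed price.
listings = []
for doc in item_info.find({}, {'_id': 0, 'title': 1, 'price': 1}):
    try:
        listings.append((float(doc['price']), doc['title']))
    except (KeyError, ValueError):
        continue

for price, title in sorted(listings)[:5]:
    print(u'{} 万: {}'.format(price, title))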