lagou-spider

很久没有写爬虫了

今天试着爬了一下 拉勾网站 深圳 python 的前 150 个职位

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import time
import requests
from pprint import pprint
from fake_useragent import UserAgent
ua = UserAgent()
import pymongo
client = pymongo.MongoClient("localhost", 27017)
db = client.job

user_agent = ua.random

url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false&isSchoolJob=0'

headers = {
"Cookie": "user_trace_token=20171105173814-0b824fa6-c20d-11e7-978b-5254005c3644; LGUID=20171105173814-0b825535-c20d-11e7-978b-5254005c3644; index_location_city=%E6%B7%B1%E5%9C%B3; TG-TRACK-CODE=index_search; SEARCH_ID=5bb5150d5d314f32a34bdb68f6e2dbbc; _gid=GA1.2.1886275699.1509874693; _ga=GA1.2.1197843841.1509874693; LGRID=20171105173909-2cac9846-c20d-11e7-978b-5254005c3644; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1509874693; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1509874749; JSESSIONID=ABAAABAAAFCAAEG674AE53B34CFB49996425F9659B657CF",
"Referer": "https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3",
"User-Agent": user_agent
}

def get_job_data(num):

for i in range(num):
payload = {
"first": "false",
"pn": str(i+1),
"kd": "python"
}

response = requests.post(url, data=payload, headers=headers)
time.sleep(5)
# pprint(response.json()['content']['positionResult']['result'])
for item in response.json()['content']['positionResult']['result']:
db.my_collection.insert_one(item)


if __name__ == '__main__':
get_job_data(10)

还想着做一些可视化的东西。。。