# 爬取需求分析
#
# 说明:以下为分析笔记。标注为"示例片段"的代码仅作示意,依赖下方完整脚本中的
# session、re 以及循环变量(company_link、positionId、companyShortName),
# 因此全部以注释形式保留,不会被执行。
#
# 第一步:访问登录页,拿到 X_Anti_Forge_Token、X_Anti_Forge_Code
#   1、请求url:https://passport.lagou.com/login/login.html
#   2、请求方法:GET
#   3、请求头:
#        User-agent
#   示例片段:
#       r1 = session.get('https://passport.lagou.com/login/login.html',
#                        headers={
#                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
#                        },
#                        )
#       X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
#       X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
#
# 第二步:登录
#   1、请求url:https://passport.lagou.com/login/login.json
#   2、请求方法:POST
#   3、请求头:
#        cookie
#        User-agent
#        Referer:https://passport.lagou.com/login/login.html
#        X-Anit-Forge-Code:53165984
#        X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
#        X-Requested-With:XMLHttpRequest
#   4、请求体:
#        isValidate:true
#        username:18611453110
#        password:70621c64832c4d4d66a47be6150b4a8e
#        request_form_verifyCode:''
#        submit:''
#   示例片段:
#       r2 = session.post('https://passport.lagou.com/login/login.json',
#                         headers={
#                             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
#                             'Referer': 'https://passport.lagou.com/login/login.html',
#                             'X-Anit-Forge-Code': X_Anti_Forge_Code,
#                             'X-Anit-Forge-Token': X_Anti_Forge_Token,
#                             'X-Requested-With': 'XMLHttpRequest'
#                         },
#                         data={
#                             "isValidate": True,
#                             'username': '18611453110',
#                             'password': '70621c64832c4d4d66a47be6150b4a8e',
#                             'request_form_verifyCode': '',
#                             'submit': ''
#                         }
#                         )
#
# 第三步:授权
#   1、请求url:https://passport.lagou.com/grantServiceTicket/grant.html
#   2、请求方法:GET
#   3、请求头:
#        User-agent
#        Referer:https://passport.lagou.com/login/login.html
#   示例片段:
#       r3 = session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
#                        headers={
#                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
#                            'Referer': 'https://passport.lagou.com/login/login.html',
#                        }
#                        )
#
# 第四步:验证
#   示例片段:
#       r4 = session.get('https://www.lagou.com/resume/myresume.html',
#                        headers={
#                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
#                        }
#                        )
#
# 第五步:筛选职位信息
#   请求url:https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
#   请求方法:GET
#   请求头:
#        User-Agent
#   请求参数:
#        gj:3年及以下
#        px:default
#        yx:25k-50k
#        city:北京
#
# 第六步:详细地筛选出职位信息条件
#   请求参数:
#       params={
#           'gj': '3年及以下',
#           'px': 'default',
#           'yx': '25k-50k',
#           'city': '北京',
#           'needAddtionalResult': False,
#           'isSchoolJob': 0
#       }
#
# 第七步:访问详情页,拿到 X_Anti_Forge_Token、X_Anti_Forge_Code
#   请求url:详情页地址
#   请求方式:GET
#   请求头:User-Agent
#   示例片段(位于职位循环内部):
#       r7 = session.get(company_link,
#                        headers={
#                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
#                        }
#                        )
#       X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r7.text, re.S)[0]
#       X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r7.text, re.S)[0]
#
# 第八步:投递简历
#   请求url:https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
#   请求方式:POST
#   请求头:
#        Referer:详情页地址
#        User-agent
#        X-Anit-Forge-Code:53165984
#        X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
#        X-Requested-With:XMLHttpRequest
#   请求体:
#        positionId:职位ID
#        type:1
#        force:true
#   示例片段(位于职位循环内部):
#       session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
#                    headers={
#                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
#                        'Referer': company_link,
#                        'X-Anit-Forge-Code': X_Anti_Forge_Code,
#                        'X-Anit-Forge-Token': X_Anti_Forge_Token,
#                        'X-Requested-With': 'XMLHttpRequest'
#                    },
#                    data={
#                        'positionId': positionId,
#                        'type': 1,
#                        'force': True
#                    }
#                    )
#       print('%s 投递成功' %(companyShortName))
# Script: log in to lagou.com, search for matching positions through the site's
# AJAX endpoint, and submit the account's resume to every position returned.
import re
from pprint import pprint  # unused here; kept for ad-hoc inspection of r6.json()
from urllib.parse import urlencode

import requests

# One browser User-Agent for the whole session. The original copies of this
# string ended in a stray "Name" suffix (copy/paste residue), removed here.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
)

LOGIN_PAGE_URL = "https://passport.lagou.com/login/login.html"
LOGIN_API_URL = "https://passport.lagou.com/login/login.json"
GRANT_URL = "https://passport.lagou.com/grantServiceTicket/grant.html"
RESUME_URL = "https://www.lagou.com/resume/myresume.html"
JOB_LIST_URL_PREFIX = "https://www.lagou.com/jobs/list_"
POSITION_AJAX_URL = "https://www.lagou.com/jobs/positionAjax.json"
JOB_DETAIL_URL = "https://www.lagou.com/jobs/{pos_id}.html"
DELIVER_URL = "https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json"

# NOTE(security): credentials are hard-coded exactly as in the original script.
# Move them to environment variables or a config file before sharing this file.
USERNAME = '18611453110'
# NOTE(review): looks like a pre-hashed password as submitted by the login
# form -- confirm against the site's login request.
PASSWORD = '70621c64832c4d4d66a47be6150b4a8e'

SEARCH_KEYWORD = "java高级开发"

# Compiled once; the same two values are scraped from the login page and from
# every job detail page.
_TOKEN_RE = re.compile(r"X_Anti_Forge_Token = '(.*?)'", re.S)
_CODE_RE = re.compile(r"X_Anti_Forge_Code = '(.*?)'", re.S)


def _extract_anti_forge(html):
    """Return ``(token, code)`` scraped from the inline script of *html*.

    Raises:
        ValueError: if either ``X_Anti_Forge_Token`` or ``X_Anti_Forge_Code``
            is not present in the page (instead of an opaque ``IndexError``).
    """
    token_match = _TOKEN_RE.search(html)
    code_match = _CODE_RE.search(html)
    if token_match is None or code_match is None:
        raise ValueError(
            "X_Anti_Forge_Token / X_Anti_Forge_Code not found in page"
        )
    return token_match.group(1), code_match.group(1)


def _headers(referer=None, anti_forge=None):
    """Build request headers.

    Args:
        referer: value for the ``Referer`` header, or ``None`` to omit it.
        anti_forge: ``(token, code)`` pair; when given, the anti-forgery
            headers and ``X-Requested-With`` are added.
    """
    headers = {"User-Agent": USER_AGENT}
    if referer is not None:
        headers["Referer"] = referer
    if anti_forge is not None:
        token, code = anti_forge
        # "Anit" is the header spelling used throughout the original script
        # and its analysis notes; it is kept as-is on purpose.
        headers["X-Anit-Forge-Code"] = code
        headers["X-Anit-Forge-Token"] = token
        headers["X-Requested-With"] = "XMLHttpRequest"
    return headers


session = requests.session()

# Step 1: fetch the login page and scrape the anti-forgery token/code.
# The original used the misspelled pattern "X_Anit_Forge_... ='" (which never
# matches) and passed the whole findall() list as a header value.
r1 = session.get(LOGIN_PAGE_URL, headers=_headers())
login_anti_forge = _extract_anti_forge(r1.text)

# Step 2: log in.
r2 = session.post(
    LOGIN_API_URL,
    headers=_headers(referer=LOGIN_PAGE_URL, anti_forge=login_anti_forge),
    data={
        "isValidate": True,
        'username': USERNAME,
        'password': PASSWORD,
        'request_form_verifyCode': '',
        'submit': '',
    },
)

# Step 3: request the service ticket grant.
r3 = session.get(GRANT_URL, headers=_headers(referer=LOGIN_PAGE_URL))

# Step 4: verify the login by checking that the resume page shows the username.
r4 = session.get(RESUME_URL, headers=_headers())
print(USERNAME in r4.text)

# Steps 5-6: search positions.
# A plain GET on the listing page with the filter parameters yields no usable
# data, because the page loads its results through an AJAX request; the AJAX
# endpoint is therefore called directly, with the listing page as Referer.
res = urlencode({"k": SEARCH_KEYWORD}, encoding="utf-8").split("=")[-1]
url = JOB_LIST_URL_PREFIX + res
r6 = session.post(
    POSITION_AJAX_URL,
    headers=_headers(referer=url),
    data={
        "first": True,
        "pn": 1,
        "kd": SEARCH_KEYWORD,
    },
    params={
        "gj": "3年及以下",
        "px": "default",  # was mistyped as "gx"
        "yx": "15k-25k",
        "city": "北京",
        "needAddtionalResult": False,  # was mistyped as "needAddtionResult"
        "isSchoolJob": 0,
    },
)

positions = r6.json()['content']['positionResult']['result']
for position in positions:
    position_id = position['positionId']
    company_link = JOB_DETAIL_URL.format(pos_id=position_id)
    company_short_name = position['companyShortName']
    position_name = position['positionName']
    salary = position['salary']
    print('''
    详情连接:%s
    公司名:%s
    职位名:%s
    薪资:%s
    ''' % (company_link, company_short_name, position_name, salary))

    # Step 7: each detail page carries its own anti-forgery token/code.
    r7 = session.get(company_link, headers=_headers())
    detail_anti_forge = _extract_anti_forge(r7.text)

    # Step 8: submit the resume for this position.
    # NOTE(review): the response is not inspected, so the message below is
    # printed even if the site rejected the submission.
    session.post(
        DELIVER_URL,
        headers=_headers(referer=company_link, anti_forge=detail_anti_forge),
        data={
            'positionId': position_id,
            'type': 1,
            'force': True,
        },
    )
    print('%s 投递成功' % (company_short_name))