博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
爬取拉勾网示例
阅读量:6643 次
发布时间:2019-06-25

本文共 9043 字,大约阅读时间需要 30 分钟。

爬取需求分析

# Annotated walkthrough of the Lagou login / search / resume-delivery flow.
#
# NOTE: this is a fragment, not a runnable script. It expects `re` to be
# imported and `session` (an object exposing `.get()` / `.post()`) to be
# created by the surrounding script. Steps 7 and 8 additionally expect
# `company_link`, `positionId` and `companyShortName` to be supplied for
# each job posting.

# Step 1: visit the login page and read X_Anti_Forge_Token / X_Anti_Forge_Code.
#   1. Request URL:    https://passport.lagou.com/login/login.html
#   2. Request method: GET
#   3. Request headers:
#        User-Agent
r1 = session.get(
    'https://passport.lagou.com/login/login.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)
# [0] takes the first match; an IndexError here means the page did not
# contain the expected assignment.
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

# Step 2: log in.
#   1. Request URL:    https://passport.lagou.com/login/login.json
#   2. Request method: POST
#   3. Request headers:
#        cookie
#        User-Agent
#        Referer: https://passport.lagou.com/login/login.html
#        X-Anit-Forge-Code: 53165984
#        X-Anit-Forge-Token: 3b6a2f62-80f0-428b-8efb-ef72fc100d78
#        X-Requested-With: XMLHttpRequest
#   4. Request body:
#        isValidate: true
#        username: 18611453110
#        password: 70621c64832c4d4d66a47be6150b4a8e
#        request_form_verifyCode: ''
#        submit: ''
# NOTE(review): the account name and password are hard-coded below;
# consider loading them from configuration instead.
r2 = session.post(
    'https://passport.lagou.com/login/login.json',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
        # The header names are spelled "Anit" while the page variables read
        # in step 1 are spelled "Anti"; both spellings are intentional here.
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest'
    },
    data={
        "isValidate": True,
        'username': '18611453110',
        'password': '70621c64832c4d4d66a47be6150b4a8e',
        'request_form_verifyCode': '',
        'submit': ''
    },
)

# Step 3: authorize (grant the service ticket).
#   1. Request URL:    https://passport.lagou.com/grantServiceTicket/grant.html
#   2. Request method: GET
#   3. Request headers:
#        User-Agent
#        Referer: https://passport.lagou.com/login/login.html
r3 = session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
    },
)

# Step 4: verify the login by requesting the resume page.
r4 = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)

# Step 5: filter job postings.
#   Request URL:    https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
#   Request method: GET
#   Request headers:
#        User-Agent
#   Request parameters:
#        gj: 3年及以下
#        px: default
#        yx: 25k-50k
#        city: 北京

# Step 6: the detailed filter conditions for job postings.
#   Request parameters:
#   params={
#       'gj': '3年及以下',
#       'px': 'default',
#       'yx': '25k-50k',
#       'city': '北京',
#       'needAddtionalResult': False,
#       'isSchoolJob': 0
#   }

# Step 7: visit the job detail page and read a fresh
# X_Anti_Forge_Token / X_Anti_Forge_Code pair.
#   Request URL:    the detail page address
#   Request method: GET
#   Request headers: User-Agent
# NOTE: in the original fragment steps 7 and 8 were indented as a loop body;
# they are meant to run once per job posting.
r7 = session.get(
    company_link,
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r7.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r7.text, re.S)[0]

# Step 8: deliver the resume.
#   Request URL:    https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
#   Request method: POST
#   Request headers:
#        Referer: the detail page address
#        User-Agent
#        X-Anit-Forge-Code: 53165984
#        X-Anit-Forge-Token: 3b6a2f62-80f0-428b-8efb-ef72fc100d78
#        X-Requested-With: XMLHttpRequest
#   Request body:
#        positionId: the job posting ID
#        type: 1
#        force: true
session.post(
    'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': company_link,
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest'
    },
    data={
        'positionId': positionId,
        'type': 1,
        'force': True
    },
)
# The response of the POST above is not inspected; this line is printed
# unconditionally.
print('%s 投递成功' % (companyShortName))

 

# Complete example script: log in to Lagou, search for job postings through
# the AJAX endpoint, print each result and submit a resume to it.
import re
from pprint import pprint  # kept for inspecting r6.json() while debugging
from urllib.parse import urlencode

import requests

# User-Agent sent with the login flow and the job search request.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name"
# User-Agent sent with the job detail page and the resume delivery request.
DETAIL_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

LOGIN_PAGE_URL = "https://passport.lagou.com/login/login.html"

# NOTE(review): credentials are hard-coded; consider loading them from
# configuration instead of keeping them in the source.
USERNAME = '18611453110'
PASSWORD = '70621c64832c4d4d66a47be6150b4a8e'

# One session is shared by every request so cookies carry over between steps.
session = requests.session()

# Step 1: fetch the login page and read the anti-forgery values from it.
r1 = session.get(
    LOGIN_PAGE_URL,
    headers={
        "User-Agent": USER_AGENT,
    },
)
# Same pattern (and [0]) as the per-posting extraction further down, so the
# header values sent in step 2 are strings rather than lists of matches.
X_Anti_Forge_Code = re.findall(r"X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Token = re.findall(r"X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]

# Step 2: log in.
r2 = session.post(
    "https://passport.lagou.com/login/login.json",
    headers={
        "User-Agent": USER_AGENT,
        "Referer": LOGIN_PAGE_URL,
        # Header names are spelled "Anit" on purpose; only the page
        # variables read above are spelled "Anti".
        "X-Anit-Forge-Code": X_Anti_Forge_Code,
        "X-Anit-Forge-Token": X_Anti_Forge_Token,
        "X-Requested-With": "XMLHttpRequest",
    },
    data={
        "isValidate": True,
        'username': USERNAME,
        'password': PASSWORD,
        'request_form_verifyCode': '',
        'submit': '',
    },
)

# Step 3: authorize (grant the service ticket).
r3 = session.get(
    "https://passport.lagou.com/grantServiceTicket/grant.html",
    headers={
        "User-Agent": USER_AGENT,
        'Referer': LOGIN_PAGE_URL,
    },
)

# Step 4: verify the login; prints True when the account name appears in the
# resume page.
r4 = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers={
        "User-Agent": USER_AGENT,
    },
)
print(USERNAME in r4.text)

# ============================
# NOTE: a plain GET of the listing page ("https://www.lagou.com/jobs/list_<kw>")
# with the filter params (gj / px / yx / city) was tried first as "r5" but
# returned no job data, because the site loads the results through an AJAX
# request. The POST below (r6) is used instead.
keyword = "java高级开发"
# urlencode() is used only to percent-encode the keyword; the text after
# "=" is the encoded value.
res = urlencode({"k": keyword}, encoding="utf-8").split("=")[-1]
url = "https://www.lagou.com/jobs/list_" + res

r6 = session.post(
    'https://www.lagou.com/jobs/positionAjax.json',
    headers={
        'Referer': url,
        "User-Agent": USER_AGENT,
    },
    data={
        "first": True,
        "pn": 1,
        "kd": keyword,
    },
    params={
        "gj": "3年及以下",
        "px": "default",
        "yx": "15k-25k",
        "city": "北京",
        "needAddtionalResult": False,
        "isSchoolJob": 0,
    },
)

# print(r6.json())
companies = r6.json()['content']['positionResult']['result']
for company in companies:
    positionId = company['positionId']
    company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=positionId)
    companyShortName = company['companyShortName']
    positionName = company['positionName']
    salary = company['salary']
    print('''
    详情连接:%s
    公司名:%s
    职位名:%s
    薪资:%s
    ''' % (company_link, companyShortName, positionName, salary))

    # Visit the detail page and read a fresh anti-forgery pair from it.
    r7 = session.get(
        company_link,
        headers={
            'User-Agent': DETAIL_USER_AGENT,
        },
    )
    X_Anti_Forge_Token = re.findall(r"X_Anti_Forge_Token = '(.*?)'", r7.text, re.S)[0]
    X_Anti_Forge_Code = re.findall(r"X_Anti_Forge_Code = '(.*?)'", r7.text, re.S)[0]
    # print(X_Anti_Forge_Token, X_Anti_Forge_Code)

    # Deliver the resume for this posting.
    session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': DETAIL_USER_AGENT,
            'Referer': company_link,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': positionId,
            'type': 1,
            'force': True,
        },
    )
    # The delivery response is not inspected; this is printed unconditionally.
    print('%s 投递成功' % (companyShortName))
代码示例

 

转载于:https://www.cnblogs.com/shaojiafeng/p/8310306.html

你可能感兴趣的文章
【Xamarin挖墙脚系列:Xamarin.Android的API设计准则】
查看>>
CodeFirst时使用T4模板
查看>>
MyBatis2:config.xml文件
查看>>
Linux redis 安装配置,以及 redis php 扩展
查看>>
CSS中常见的6种文本样式
查看>>
【简易版】IOS仿periscope自制狂赞飘桃心
查看>>
Touch Devices
查看>>
python中的反射
查看>>
IOS各种集合遍历效率对比
查看>>
IL指令大全
查看>>
开源:ASP.NET Aries 开发框架(已支持.NET Core)
查看>>
Atitit.100% 多个子元素自适应布局属性
查看>>
spring aop源码实现分析
查看>>
sublim3常用插件安装
查看>>
Arduino可穿戴开发入门教程LilyPad和LilyPad Simple的介绍
查看>>
《软件小设计》推出
查看>>
Config
查看>>
Scanner和BufferedReader
查看>>
java.lang.NoClassDefFoundError: org/jaxen/JaxenException
查看>>
.htaccess 基础教程(三)RewriteCond标志符,RewriteRule适用的标志符
查看>>