
3.4 [Hands-On] Scraping Zhilian Zhaopin Job Listings


Lesson Objectives

  • Scrape job listings from Zhilian Zhaopin (zhaopin.com)

Lesson Content

Implementation

cURL-to-requests conversion tool: https://www.spidertools.cn/#/curl2Request
Target URL: https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1
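The conversion tool takes a cURL command copied from the browser's DevTools (Network panel, "Copy as cURL" on the listing request) and emits an equivalent requests call. A minimal sketch of the pattern it produces, with placeholder header and cookie values (the real values used in this lesson appear in the full script further down):

import requests

# Placeholder values; the converter fills these in from the copied cURL command.
headers = {"user-agent": "Mozilla/5.0 (placeholder)"}
cookies = {"acw_tc": "placeholder"}

response = requests.get(
    "https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1",
    headers=headers,
    cookies=cookies,
)
print(response.status_code)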

1. Obtain cookies with Playwright

If the cookies expire, re-acquire them with the following script.

from playwright.sync_api import sync_playwright
import time
with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    context = browser.new_context()
    page = context.new_page()
    page.goto("https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1")
    # wait for the page to finish loading, then collect the cookies
    time.sleep(5)
    cookies = context.cookies()
    cookie_dict = {}
    for cookie in cookies:
        cookie_dict[cookie["name"]] = cookie["value"]
    print(cookie_dict)
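The printed cookie_dict can be pasted directly into the cookies variable of the scraper in step 2. As a minimal sketch, it could also be persisted to a file and reloaded without re-running Playwright (the file name zhilian_cookies.json is my own choice, not part of the course):

import json

# Save the cookies collected above; assumes cookie_dict from the Playwright script is in scope.
with open("zhilian_cookies.json", "w", encoding="utf-8") as f:
    json.dump(cookie_dict, f, ensure_ascii=False, indent=2)

# Later, reload them and pass to requests.get(..., cookies=loaded_cookies):
# with open("zhilian_cookies.json", encoding="utf-8") as f:
#     loaded_cookies = json.load(f)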
2. Scrape the listings and export to Excel
from bs4 import BeautifulSoup
import requests
import time
from tqdm import tqdm
import pandas as pd
headers = {"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7","accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6","cache-control": "max-age=0","priority": "u=0, i","referer": "https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1","sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Microsoft Edge\";v=\"128\"","sec-ch-ua-mobile": "?0","sec-ch-ua-platform": "\"Windows\"","sec-fetch-dest": "document","sec-fetch-mode": "navigate","sec-fetch-site": "same-origin","sec-fetch-user": "?1","upgrade-insecure-requests": "1","user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",}
# Session cookies captured with the Playwright script in step 1; replace them with fresh values if they expire.
cookies = {
    '_uab_collina': '172603833692980471099565',
    'acw_tc': '1a0c638f17260383368666189e00347e6ac270373627cb2b28b40bf7747fb6',
    'acw_sc__v2': '66e1413f368267fe14db235d7102befafd592806',
    'x-zp-client-id': '62556d7c-493b-4b3f-b799-4852d97e40a5',
    'FSSBBIl1UgzbN7NS': '5KhbTbuE1pM96Ujhm28DlCLRGMQdDm8ev2w9yTmAl1TxcLpfH.32gn9jHWAG63XY5m.zwJzIpwjpsWQypqhJRqq',
    'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%22191dfe6e4a11254-01e41e0f4f92ff8-26001151-921600-191dfe6e4a220c%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTkxZGZlNmU0YTExMjU0LTAxZTQxZTBmNGY5MmZmOC0yNjAwMTE1MS05MjE2MDAtMTkxZGZlNmU0YTIyMGMifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%22191dfe6e4a11254-01e41e0f4f92ff8-26001151-921600-191dfe6e4a220c%22%7D',
    'sajssdk_2015_cross_new_user': '1',
    'HMACCOUNT_BFESS': '5CE1833FAA2F8083',
    'Hm_lvt_21a348fada873bdc2f7f75015beeefeb': '1726038337',
    'Hm_lpvt_21a348fada873bdc2f7f75015beeefeb': '1726038337',
    'HMACCOUNT': '5CE1833FAA2F8083',
    'locationInfo_search': '{%22code%22:%22801%22%2C%22name%22:%22%E6%88%90%E9%83%BD%22%2C%22message%22:%22%E5%8C%B9%E9%85%8D%E5%88%B0%E5%B8%82%E7%BA%A7%E7%BC%96%E7%A0%81%22}',
    'FSSBBIl1UgzbN7NT': '5RXmrHCMfAsZqqqDp65MjXayj2UoH1vC_Yw1wF8F4tofi7LSoYbUF_om_EhdyKyFar.pUPc.OI1g19MJPU8CPbKI8.Vq4R8.hdQZHCwBvXRaYKUYo1Dfqp6ROvOHQT4H2sELfG01VM5i6tXFohtYVF4ZSM8pg2nMp_5_JaC43_YOU2BTFOv.E8c6bC9i1J93sERm3jiefBNCllZtt4G0f1q_6dRW6PC1QPW6e1X5oNctO5dsUGf1cOs0hW5mUWzlkWUsW5vR29qXbPFhZ4X.S6p',
    '1420ba6bb40c9512e9642a1f8c243891': 'bb4a9dc0-b1d0-4824-98c2-ef484a86f8a5',
}
infos = []
for i in range(1, 6):
    print(f"开始爬取{i}页")  # progress message: scraping page i
    time.sleep(1)
    url = f"https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p{i}"
    referer = i if i == 1 else i - 1
    headers["referer"] = f"https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p{referer}"
    response = requests.get(url, headers=headers, cookies=cookies)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')  # parse the listing page
    divs = soup.find_all('div', class_='joblist-box__item')  # all job-card nodes
    for div in tqdm(divs):
        job_name = div.find("a", class_="jobinfo__name").text.strip()
        salary = div.find("p", class_="jobinfo__salary").text.strip()
        skills = []
        skill_father = div.find('div', class_='jobinfo__tag')  # container of all skill tags
        if skill_father:
            skill_items = skill_father.find_all('div', class_="joblist-box__item-tag")
            for skill_item in skill_items:
                skills.append(skill_item.text)
        # Recruiter block has the form "name·title"; the title may be missing
        recruiter_name_recruiter_position = div.find('div', class_="companyinfo__staff-name").text.strip()
        if "·" in recruiter_name_recruiter_position:
            recruiter_name, recruiter_position = recruiter_name_recruiter_position.split('·')
        else:
            recruiter_name, recruiter_position = recruiter_name_recruiter_position, ""
        # Company tags: financing stage, company size, industry (up to three items)
        companyinfo_tag = div.find('div', class_="companyinfo__tag")
        companyinfo_items = companyinfo_tag.find_all('div', class_='joblist-box__item-tag')
        if companyinfo_items:
            financing = companyinfo_items[0].text.strip()
        else:
            financing = ""
        if len(companyinfo_items) >= 2:
            scale = companyinfo_items[1].text.strip()
        else:
            scale = ""
        if len(companyinfo_items) == 3:
            industry = companyinfo_items[2].text.strip()
        else:
            industry = ""
        # Other info: location ("city·district·area"), experience requirement, education requirement
        otherinfo_father = div.find('div', class_='jobinfo__other-info')
        companyinfo_items = otherinfo_father.find_all('div', class_='jobinfo__other-info-item')
        areas = companyinfo_items[0].text.strip()
        areas = areas.split('·')
        try:
            area_grandfather = areas[0]
        except IndexError:
            area_grandfather = ""
        try:
            area_pather = areas[1]
        except IndexError:
            area_pather = ""
        try:
            area_son = areas[2]
        except IndexError:
            area_son = ""
        experience_requirement = companyinfo_items[1].text.strip()
        education_background_requirement = companyinfo_items[2].text.strip()
        # Excel column names are kept in Chinese: job title, salary, required skills, recruiter,
        # recruiter title, financing stage, company size, industry, work location (area/district/city)
        info = {"岗位名称": job_name, "工资": salary, "技能要求": skills,
                "招聘人": recruiter_name, "招聘人职位": recruiter_position,
                "公司融资信息": financing, "公司规模": scale, "公司行业": industry,
                "工作地点": area_son, "工作区域": area_pather, "工作大区": area_grandfather}
        infos.append(info)
df = pd.DataFrame(infos)
df.to_excel("智联招聘招聘信息.xlsx",index=False)
