
Crawler Notes 13: Writing Scraped Web Data into a MySQL Database, Using Alibaba Recruitment as an Example


Install the pymysql library

You need to install the pymysql library so that your program can connect to a MySQL database:

pip install pymysql
# Or install from a mirror in China, which is usually faster
pip install pymysql -i https://pypi.douban.com/simple
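To confirm the install worked, a one-line sanity check (the version tuple shown in the comment is only an example; yours will differ by release):

import pymysql
print(pymysql.VERSION)  # e.g. (1, 1, 1, 'final', 1)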

Install MySQL and create a database

For how to install MySQL, see here.
Create a database:

create database db_name charset=utf8;
use db_name;
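If you prefer to do this from Python instead of the mysql shell, here is a minimal sketch (the host, credentials, and the py_spider database name are assumptions; substitute your own):

import pymysql

# Minimal sketch: create the database from Python rather than the mysql shell.
# Credentials and the database name are assumptions; use your own.
conn = pymysql.connect(host='127.0.0.1', user='root', password='root')
try:
    with conn.cursor() as cursor:
        cursor.execute('create database if not exists py_spider charset=utf8;')
finally:
    conn.close()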

In a Python program, we can connect to the database like this:

import pymysql


def connectMySql():
    # Connect to the database; the context manager closes the connection automatically
    with pymysql.connect(host='127.0.0.1', user='root', password='root', database='testdb') as db:
        # Create a cursor (think of it as a pointer) that executes SQL statements
        with db.cursor() as cursor:
            cursor.execute('select * from users;')
            data = cursor.fetchall()
            print(data)
    print('Done!')


connectMySql()
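One thing the snippet above glosses over: the users table must already exist, and writes (unlike the select) need an explicit commit. A minimal sketch covering both, with hypothetical columns purely for illustration:

import pymysql

with pymysql.connect(host='127.0.0.1', user='root', password='root', database='testdb') as db:
    with db.cursor() as cursor:
        # Hypothetical demo table so the select above has something to read
        cursor.execute(
            'create table if not exists users ('
            'id int primary key auto_increment, name varchar(50));'
        )
        # %s placeholders let pymysql escape the values safely
        cursor.execute('insert into users (name) values (%s);', ('Alice',))
    db.commit()  # unlike selects, writes take effect only after commit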

That covers the basic database operations. The example below is a crawler that scrapes Alibaba's recruitment listings and writes them to MySQL:

import pymysql
import requests
import time


class SpiderALi:
    # __init__ runs automatically when an instance is created
    def __init__(self):
        self.url = 'https://careers.aliyun.com/position/search'
        # Only 'User-Agent' is really needed in headers; the rest was copied from cURL and can be dropped
        self.headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9",
            "bx-v": "2.5.11",
            "cache-control": "no-cache",
            "content-type": "application/json",
            "origin": "https://careers.aliyun.com",
            "pragma": "no-cache",
            "priority": "u=1, i",
            "referer": "https://careers.aliyun.com/off-campus/position-list?lang=zh",
            "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
        }
        self.requestData = {
            'channel': 'group_official_site',
            'language': 'zh',
            'batchId': '',
            'categories': '',
            'deptCodes': [],
            'key': '',
            'pageIndex': 1,
            'pageSize': 10,
            'regions': '',
            'subCategories': '',
        }
        self.params = {
            '_csrf': '378212c5-83ef-42bb-a25c-3780f7f3ab09',
        }
        # Copy the cookies from the request headers in your browser's dev tools;
        # requests expects a dict here, not a raw string
        self.cookies = {}
        self.connectMySql = pymysql.connect(host='127.0.0.1', user='root',
                                            password='root',  # the password you set, often root
                                            database='py_spider')
        self.cursor = self.connectMySql.cursor()

    # Lifecycle hook: this magic method runs automatically when the instance is destroyed
    def __del__(self):
        print('Crawl finished, closing the database!')
        self.cursor.close()
        self.connectMySql.close()

    # Fetch the job listings page by page
    def get_works(self):
        for i in range(1, 53):
            self.requestData["pageIndex"] = i
            response = requests.post(
                url=self.url,
                headers=self.headers,
                json=self.requestData,
                params=self.params,
                cookies=self.cookies
            ).json()
            yield response["content"]["datas"]

    # Create the table if it does not already exist
    def create_table(self):
        sql = """
            create table if not exists ali_works (
                id int primary key auto_increment,
                work_name varchar(100) not null,
                categories varchar(50),
                workLocations varchar(50),
                requirement text,
                description text,
                nowtime date
            );
        """
        try:
            self.cursor.execute(sql)
            print('Table created!')
        except Exception as e:
            print('Failed to create table:', e)

    # Insert one row
    def insert_info(self, *args):
        """
        :param args:
            id int primary key auto_increment,
            work_name varchar(100) not null,
            categories varchar(50),
            workLocations varchar(50),
            requirement text,
            description text,
            nowtime date
        :return:
        """
        sql = """insert into ali_works values (%s, %s, %s, %s, %s, %s, %s)"""
        try:
            self.cursor.execute(sql, args)
            print('Row inserted')
            self.connectMySql.commit()
        except Exception as e:
            print('Insert failed:', e)
            self.connectMySql.rollback()

    # Main routine
    def main(self):
        self.create_table()
        time.sleep(2)
        allWorkLists = self.get_works()
        work_id = 0  # inserting 0 lets MySQL assign the auto_increment id
        # Get the current timestamp
        timestamp = time.time()
        # Convert to local time
        local_time = time.localtime(timestamp)
        # Format as a YYYY-MM-DD date
        date = time.strftime('%Y-%m-%d', local_time)
        for index, workLists in enumerate(allWorkLists):
            print(f'Inserting page {index + 1}')
            time.sleep(1.5)
            for work in workLists:
                work_name = work["name"]
                categories = work["categories"][0] if work["categories"] else 'null'
                workLocations = work["workLocations"]
                requirement = work["requirement"]
                description = work["description"]
                self.insert_info(work_id, work_name, categories, workLocations, requirement, description, date)


if __name__ == '__main__':
    spiderALi = SpiderALi()
    spiderALi.main()
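The class above commits once per row. If that proves slow, cursor.executemany can batch one page per commit; the sketch below uses made-up rows purely for illustration (they are not real API output):

import pymysql

# Minimal sketch: batch one page of rows with executemany instead of calling
# insert_info once per job. The rows below are hypothetical sample data.
rows = [
    (None, 'Data Engineer', 'Tech', 'Hangzhou', 'requirements...', 'description...', '2024-06-19'),
    (None, 'Backend Developer', 'Tech', 'Beijing', 'requirements...', 'description...', '2024-06-19'),
]
connection = pymysql.connect(host='127.0.0.1', user='root', password='root', database='py_spider')
try:
    with connection.cursor() as cursor:
        # None in the id slot lets MySQL assign the auto_increment value
        cursor.executemany(
            'insert into ali_works values (%s, %s, %s, %s, %s, %s, %s)',
            rows,
        )
    connection.commit()  # one commit per page instead of one per row
finally:
    connection.close()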
