您的位置:首页 > 财经 > 产业 > liburing和Linux io_uring源码阅读

liburing和Linux io_uring源码阅读

2024/10/6 22:19:26 来源:https://blog.csdn.net/DefiniteGoal/article/details/139962203  浏览:    关键词:liburing和Linux io_uring源码阅读
liburing
版本

系统内核版本: Linux 5.15.0-107-generic #117~20.04.1-Ubuntu x86_64 GNU/Linux

源代码版本: git@github.com:torvalds/linux.git v5.15

阅读入口

从最简单的代码看起,即阅读入口:examples/io_uring-test.c
此源文件内部调用liburing相关函数包含
io_uring_queue_init、io_uring_get_sqe、io_uring_prep_readv、io_uring_submit、io_uring_wait_cqe、io_uring_cqe_seen、io_uring_queue_exit
内核提供的函数仅有三个(@后为系统调用号):io_uring_setup@425、io_uring_enter@426、io_uring_register@427
其中io_uring_queue_init调用的内核函数为io_uring_setup
以此为起点,阅读内核代码。

liburing调用栈
// Calling code from the example program
struct io_uring_params p;
memset(&p, 0, sizeof(p));   // zero every field of the parameter block
p.flags = 0;                // no setup flags requested
struct io_uring ring;
// Request 64 entries. NOTE(review): the return value is not checked in this excerpt.
io_uring_queue_init_params(64, &ring, &p);
/*
 * Set up an io_uring instance using the caller-supplied parameter block.
 *
 * Delegates to io_uring_queue_init_try_nosqarr() with no caller-provided
 * buffer (NULL, 0).
 *
 * Returns 0 when the helper reports a non-negative result, otherwise the
 * helper's negative return value unchanged.
 */
int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
			       struct io_uring_params *p)
{
	const int rc = io_uring_queue_init_try_nosqarr(entries, ring, p, NULL, 0);

	if (rc < 0)
		return rc;
	return 0;
}
static int io_uring_queue_init_try_nosqarr(unsigned entries, struct io_uring *ring,struct io_uring_params *p, void *buf,size_t buf_size)
{unsigned flags = p->flags;int ret;// 重点在__io_uring_queue_init_params函数p->flags |= IORING_SETUP_NO_SQARRAY;ret = __io_uring_queue_init_params(entries, ring, p, buf, buf_size);// 5.15内核不支持IORING_SETUP_NO_SQARRAY, 故会返回 -EINVAL// 所以真正执行的是下面的__io_uring_queue_init_paramsif (ret != -EINVAL || (flags & IORING_SETUP_NO_SQARRAY))return ret;p->flags = flags;return __io_uring_queue_init_params(entries, ring, p, buf, buf_size);
}
/*
 * Trimmed excerpt of the liburing ring-initialisation routine, shown up to
 * the point where it enters the kernel.
 *
 * Clears *ring, then issues the io_uring_setup system call through
 * __sys_io_uring_setup().
 *
 * Returns the value of __sys_io_uring_setup(): negative on failure, the
 * file descriptor otherwise. buf and buf_size are unused in this excerpt.
 */
int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
				 struct io_uring_params *p, void *buf,
				 size_t buf_size)
{
	int fd;

	memset(ring, 0, sizeof(*ring));

	/* Code that this walkthrough never reaches has been omitted. */

	fd = __sys_io_uring_setup(entries, p);
	if (fd < 0) {
		return fd;
	}

	return fd;
}

__sys_io_uring_setup进入内核代码

/*
 * Syscall number 425 leads to io_uring_setup. The definition can be located
 * with:
 *   grep -rn "SYSCALL_DEFINE" $(find ./ -name '*.c')
 *
 * Validates the user-supplied parameter block and hands it to
 * io_uring_create().
 *
 * Returns -EFAULT when the parameters cannot be copied from user space,
 * -EINVAL when a reserved field is non-zero or an unsupported flag is set,
 * otherwise the result of io_uring_create().
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
	struct io_uring_params p;
	int i;

	/* Copy the parameter block from user space into kernel space. */
	if (copy_from_user(&p, params, sizeof(p)))
		return -EFAULT;

	/* Every reserved field must be zero. */
	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
		if (p.resv[i])
			return -EINVAL;
	}

	/* Reject any flag outside the set listed here. */
	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
			IORING_SETUP_R_DISABLED))
		return -EINVAL;

	return io_uring_create(entries, &p, params);
}

由于io_uring_create篇幅过长,下面的代码裁剪掉了本例执行不到的部分

static int io_uring_create(unsigned entries, struct io_uring_params *p,struct io_uring_params *params)
{// p为内核空间地址数据// params为用户空间地址数据struct io_ring_ctx *ctx;struct file *file;int ret;// 此处检查提交队列SQ大小是否会超过最大限制IORING_MAX_ENTRIES(0x8000)if (entries > IORING_MAX_ENTRIES) {if (!(p->flags & IORING_SETUP_CLAMP))return -EINVAL;entries = IORING_MAX_ENTRIES;}// 保证队列容量是2的幂次方p->sq_entries = roundup_pow_of_two(entries);// 完成队列CQ是提交队列容量的两倍p->cq_entries = 2 * p->sq_entries;// 初始化io_ring_ctx对象ctx = io_ring_ctx_alloc(p);if (!ctx)return -ENOMEM;ctx->compat = in_compat_syscall();if (!capable(CAP_IPC_LOCK))ctx->user = get_uid(current_user());mmgrab(current->mm);ctx->mm_account = current->mm;// 申请ctx->rings和ctx->sq_sqes内存ret = io_allocate_scq_urings(ctx, p);if (ret)goto err;// 设置了轮询后创建轮询线程函数, 由于参数未设置, 不在展开ret = io_sq_offload_create(ctx, p);if (ret)goto err;ret = io_rsrc_node_switch_start(ctx);if (ret)goto err;io_rsrc_node_switch(ctx, NULL);// 保存结构体成员偏移地址// 用于用户空间创建内存映射时获取到偏移地址, 将用户空间结构体内部变量和内核空间结构体内部变量一一对应// liburing io_uring_setup_ring_pointersmemset(&p->sq_off, 0, sizeof(p->sq_off));/*** 由于代码中设置了按照cacheline字节对齐* 所以p->sq_off.tail = 64, 即虽然head是4字节, 但为了防止`错误共享`, 让其占据一个cacheline大小* cacheline一般为64, 也可能是128, 博主PC是64的* 错误共享知识可查看* https://github.com/TonyBeen/study/blob/master/false_sharing/false_sharing.cc* * struct io_uring {*     u32 head __attribute__((__aligned__(64)));*     u32 tail __attribute__((__aligned__(64)));* };*/p->sq_off.head = offsetof(struct io_rings, sq.head);                    // 0p->sq_off.tail = offsetof(struct io_rings, sq.tail);                    // 64p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);          // 256p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);    // 264p->sq_off.flags = offsetof(struct io_rings, sq_flags);                  // 276p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);              // 272/*** 具体细节在函数io_allocate_scq_urings内部, 如下* 下面介绍如何计算得到2368的(代码在rings_size中)** sizeof(struct io_rings) = 320 (64字节对齐结果)* sizeof(struct io_uring_cqe) = 16* cq_entries = 128* * p->sq_off.array = cq_entries * 
sizeof(struct io_uring_cqe) + sizeof(struct io_rings)* 得到结果满足在cacheline字节边界上, 不满足的情况下不会是2368* */p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;           // 2368memset(&p->cq_off, 0, sizeof(p->cq_off));p->cq_off.head = offsetof(struct io_rings, cq.head);p->cq_off.tail = offsetof(struct io_rings, cq.tail);p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);p->cq_off.cqes = offsetof(struct io_rings, cqes);p->cq_off.flags = offsetof(struct io_rings, cq_flags);p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |IORING_FEAT_RSRC_TAGS;// 拷贝到用户空间if (copy_to_user(params, p, sizeof(*p))) {ret = -EFAULT;goto err;}// 以下是创建一个匿名 inode, 并将文件描述符返回// 可通过ll /proc/{pid}/fd/ 查看到io_uring的文件描述 3 -> 'anon_inode:[io_uring]'// 用户可通过返回文件描述符进行mmap, 以获取对SQ/CQ的内存访问file = io_uring_get_file(ctx);if (IS_ERR(file)) {ret = PTR_ERR(file);goto err;}/** Install ring fd as the very last thing, so we don't risk someone* having closed it before we finish setup*/ret = io_uring_install_fd(ctx, file);if (ret < 0) {/* fput will clean it up */fput(file);return ret;}trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);return ret;
err:io_ring_ctx_wait_and_kill(ctx);return ret;
}
static int io_allocate_scq_urings(struct io_ring_ctx *ctx,struct io_uring_params *p)
{struct io_rings *rings;size_t size, sq_array_offset;/* make sure these are sane, as we already accounted them */ctx->sq_entries = p->sq_entries;ctx->cq_entries = p->cq_entries;// rings_size计算的是一块存储struct io_rings + struct io_uring_cqe + sq_array的连续内存// sq_array是ctx->sq_sqes的索引数组, 内存大小 = p->sq_entries * sizeof(u32) = 256// sq_array_offset = 2368// size = 2368 + 256 = 2624size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);if (size == SIZE_MAX)return -EOVERFLOW;rings = io_mem_alloc(size);if (!rings)return -ENOMEM;ctx->rings = rings;ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);rings->sq_ring_mask = p->sq_entries - 1;rings->cq_ring_mask = p->cq_entries - 1;rings->sq_ring_entries = p->sq_entries;rings->cq_ring_entries = p->cq_entries;size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);if (size == SIZE_MAX) {io_mem_free(ctx->rings);ctx->rings = NULL;return -EOVERFLOW;}ctx->sq_sqes = io_mem_alloc(size);if (!ctx->sq_sqes) {io_mem_free(ctx->rings);ctx->rings = NULL;return -ENOMEM;}return 0;
}

返回用户空间

int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring,struct io_uring_params *p, void *buf,size_t buf_size)
{int fd, ret = 0;unsigned *sq_array;unsigned sq_entries, index;memset(ring, 0, sizeof(*ring));// 省去一些无法走到的代码fd = __sys_io_uring_setup(entries, p);if (fd < 0) {return fd;}// ------> 从此处执行 <------// 未设置IORING_SETUP_NO_MMAP(不使用内存映射)标志if (!(p->flags & IORING_SETUP_NO_MMAP)) {// io_uring_queue_mmap透传, 直接看io_uring_mmapret = io_uring_queue_mmap(fd, p, ring);if (ret) {__sys_close(fd);return ret;}}return ret;
}
static int io_uring_mmap(int fd, struct io_uring_params *p,struct io_uring_sq *sq, struct io_uring_cq *cq)
{size_t size;int ret;size = sizeof(struct io_uring_cqe); // 16// 通过gdb查看/*** p->sq_off.array = 2368* p->sq_entries = 64* * p->cq_off.cqes = 320 // 固定320* p->cq_entries = 128* * 计算出结果* sq->ring_sz = 2624* cq->ring_sz = 2368*/sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);cq->ring_sz = p->cq_off.cqes + p->cq_entries * size;// fs/io_uring.c line:10328 会设置IORING_FEAT_SINGLE_MMAP标志if (p->features & IORING_FEAT_SINGLE_MMAP) {if (cq->ring_sz > sq->ring_sz)sq->ring_sz = cq->ring_sz;cq->ring_sz = sq->ring_sz;}/*** sq->ring_sz = 2624* cq->ring_sz = 2624*/// 创建与内核结构体io_rings的内存映射sq->ring_ptr = __sys_mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,MAP_SHARED | MAP_POPULATE, fd,IORING_OFF_SQ_RING);if (IS_ERR(sq->ring_ptr))return PTR_ERR(sq->ring_ptr);// 共用一份数据if (p->features & IORING_FEAT_SINGLE_MMAP) {cq->ring_ptr = sq->ring_ptr;}size = sizeof(struct io_uring_sqe); // 64// 创建提交队列的内存映射// 此结构体数据用于给用户填充数据, 如通过io_uring_get_sqe获取一个结构体// 并通过io_uring_prep_read填充此结构体, 最后通过io_uring_submit提交// NOTE 需要注意的是, 通过io_uring_prep_read设置的数据必须保证内存生命周期在完成后sq->sqes = __sys_mmap(0, size * p->sq_entries, PROT_READ | PROT_WRITE,MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);// 初始化提交队列和完成队列io_uring_setup_ring_pointers(p, sq, cq);return 0;
}

版权声明:

本网仅为发布的内容提供存储空间,不对发表、转载的内容提供任何形式的保证。凡本网注明“来源:XXX网络”的作品,均转载自其它媒体,著作权归作者所有,商业转载请联系作者获得授权,非商业转载请注明出处。

我们尊重并感谢每一位作者,均已注明文章来源和作者。如因作品内容、版权或其它问题,请及时与我们联系,联系邮箱:809451989@qq.com,投稿邮箱:809451989@qq.com