aboutsummaryrefslogtreecommitdiff
path: root/virt
diff options
context:
space:
mode:
Diffstat (limited to 'virt')
-rw-r--r--virt/kvm/Kconfig6
-rw-r--r--virt/kvm/assigned-dev.c367
-rw-r--r--virt/kvm/async_pf.c11
-rw-r--r--virt/kvm/coalesced_mmio.c137
-rw-r--r--virt/kvm/coalesced_mmio.h7
-rw-r--r--virt/kvm/eventfd.c176
-rw-r--r--virt/kvm/ioapic.c78
-rw-r--r--virt/kvm/ioapic.h5
-rw-r--r--virt/kvm/iommu.c81
-rw-r--r--virt/kvm/irq_comm.c63
-rw-r--r--virt/kvm/kvm_main.c1107
11 files changed, 1366 insertions, 672 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index f63ccb0a598..d01b24b72c6 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -18,3 +18,9 @@ config KVM_MMIO
config KVM_ASYNC_PF
bool
+
+config HAVE_KVM_MSI
+ bool
+
+config HAVE_KVM_CPU_RELAX_INTERCEPT
+ bool
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 4e9eaeb518c..23a41a9f8db 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -17,6 +17,8 @@
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
#include "irq.h"
static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
@@ -47,71 +49,131 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
index = i;
break;
}
- if (index < 0) {
+ if (index < 0)
printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
- return 0;
- }
return index;
}
-static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
+static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
- u32 vector;
- int index;
+ int ret;
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
- spin_lock(&assigned_dev->intx_lock);
- disable_irq_nosync(irq);
+ spin_lock(&assigned_dev->intx_lock);
+ if (pci_check_and_mask_intx(assigned_dev->dev)) {
assigned_dev->host_irq_disabled = true;
- spin_unlock(&assigned_dev->intx_lock);
- }
+ ret = IRQ_WAKE_THREAD;
+ } else
+ ret = IRQ_NONE;
+ spin_unlock(&assigned_dev->intx_lock);
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
- index = find_index_from_host_irq(assigned_dev, irq);
- if (index >= 0) {
- vector = assigned_dev->
- guest_msix_entries[index].vector;
+ return ret;
+}
+
+static void
+kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
+ int vector)
+{
+ if (unlikely(assigned_dev->irq_requested_type &
+ KVM_DEV_IRQ_GUEST_INTX)) {
+ spin_lock(&assigned_dev->intx_mask_lock);
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
kvm_set_irq(assigned_dev->kvm,
assigned_dev->irq_source_id, vector, 1);
- }
+ spin_unlock(&assigned_dev->intx_mask_lock);
} else
kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
- assigned_dev->guest_irq, 1);
+ vector, 1);
+}
+
+static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ spin_lock_irq(&assigned_dev->intx_lock);
+ disable_irq_nosync(irq);
+ assigned_dev->host_irq_disabled = true;
+ spin_unlock_irq(&assigned_dev->intx_lock);
+ }
+
+ kvm_assigned_dev_raise_guest_irq(assigned_dev,
+ assigned_dev->guest_irq);
return IRQ_HANDLED;
}
-/* Ack the irq line for an assigned device */
-static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+#ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+ kvm_assigned_dev_raise_guest_irq(assigned_dev,
+ assigned_dev->guest_irq);
+
+ return IRQ_HANDLED;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
{
- struct kvm_assigned_dev_kernel *dev;
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int index = find_index_from_host_irq(assigned_dev, irq);
+ u32 vector;
+
+ if (index >= 0) {
+ vector = assigned_dev->guest_msix_entries[index].vector;
+ kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
+ }
- if (kian->gsi == -1)
- return;
+ return IRQ_HANDLED;
+}
+#endif
- dev = container_of(kian, struct kvm_assigned_dev_kernel,
- ack_notifier);
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+ struct kvm_assigned_dev_kernel *dev =
+ container_of(kian, struct kvm_assigned_dev_kernel,
+ ack_notifier);
kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
- /* The guest irq may be shared so this ack may be
- * from another device.
- */
- spin_lock(&dev->intx_lock);
- if (dev->host_irq_disabled) {
- enable_irq(dev->host_irq);
- dev->host_irq_disabled = false;
+ spin_lock(&dev->intx_mask_lock);
+
+ if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
+ bool reassert = false;
+
+ spin_lock_irq(&dev->intx_lock);
+ /*
+ * The guest IRQ may be shared so this ack can come from an
+ * IRQ for another guest device.
+ */
+ if (dev->host_irq_disabled) {
+ if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
+ enable_irq(dev->host_irq);
+ else if (!pci_check_and_unmask_intx(dev->dev))
+ reassert = true;
+ dev->host_irq_disabled = reassert;
+ }
+ spin_unlock_irq(&dev->intx_lock);
+
+ if (reassert)
+ kvm_set_irq(dev->kvm, dev->irq_source_id,
+ dev->guest_irq, 1);
}
- spin_unlock(&dev->intx_lock);
+
+ spin_unlock(&dev->intx_mask_lock);
}
static void deassign_guest_irq(struct kvm *kvm,
struct kvm_assigned_dev_kernel *assigned_dev)
{
- kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
- assigned_dev->ack_notifier.gsi = -1;
+ if (assigned_dev->ack_notifier.gsi != -1)
+ kvm_unregister_irq_ack_notifier(kvm,
+ &assigned_dev->ack_notifier);
kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
assigned_dev->guest_irq, 0);
@@ -143,7 +205,7 @@ static void deassign_host_irq(struct kvm *kvm,
for (i = 0; i < assigned_dev->entries_nr; i++)
free_irq(assigned_dev->host_msix_entries[i].vector,
- (void *)assigned_dev);
+ assigned_dev);
assigned_dev->entries_nr = 0;
kfree(assigned_dev->host_msix_entries);
@@ -151,9 +213,17 @@ static void deassign_host_irq(struct kvm *kvm,
pci_disable_msix(assigned_dev->dev);
} else {
/* Deal with MSI and INTx */
- disable_irq(assigned_dev->host_irq);
-
- free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+ if ((assigned_dev->irq_requested_type &
+ KVM_DEV_IRQ_HOST_INTX) &&
+ (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ spin_lock_irq(&assigned_dev->intx_lock);
+ pci_intx(assigned_dev->dev, false);
+ spin_unlock_irq(&assigned_dev->intx_lock);
+ synchronize_irq(assigned_dev->host_irq);
+ } else
+ disable_irq(assigned_dev->host_irq);
+
+ free_irq(assigned_dev->host_irq, assigned_dev);
if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
pci_disable_msi(assigned_dev->dev);
@@ -205,6 +275,8 @@ static void kvm_free_assigned_device(struct kvm *kvm,
else
pci_restore_state(assigned_dev->dev);
+ assigned_dev->dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
+
pci_release_regions(assigned_dev->dev);
pci_disable_device(assigned_dev->dev);
pci_dev_put(assigned_dev->dev);
@@ -230,19 +302,43 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
static int assigned_device_enable_host_intx(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
+ irq_handler_t irq_handler;
+ unsigned long flags;
+
dev->host_irq = dev->dev->irq;
- /* Even though this is PCI, we don't want to use shared
- * interrupts. Sharing host devices with guest-assigned devices
- * on the same interrupt line is not a happy situation: there
- * are going to be long delays in accepting, acking, etc.
+
+ /*
+ * We can only share the IRQ line with other host devices if we are
+ * able to disable the IRQ source at device-level - independently of
+ * the guest driver. Otherwise host devices may suffer from unbounded
+ * IRQ latencies when the guest keeps the line asserted.
*/
- if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
- IRQF_ONESHOT, dev->irq_name, (void *)dev))
+ if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ irq_handler = kvm_assigned_dev_intx;
+ flags = IRQF_SHARED;
+ } else {
+ irq_handler = NULL;
+ flags = IRQF_ONESHOT;
+ }
+ if (request_threaded_irq(dev->host_irq, irq_handler,
+ kvm_assigned_dev_thread_intx, flags,
+ dev->irq_name, dev))
return -EIO;
+
+ if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ spin_lock_irq(&dev->intx_lock);
+ pci_intx(dev->dev, true);
+ spin_unlock_irq(&dev->intx_lock);
+ }
return 0;
}
#ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
+{
+ return IRQ_WAKE_THREAD;
+}
+
static int assigned_device_enable_host_msi(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
@@ -255,8 +351,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
}
dev->host_irq = dev->dev->irq;
- if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
- 0, dev->irq_name, (void *)dev)) {
+ if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi,
+ kvm_assigned_dev_thread_msi, 0,
+ dev->irq_name, dev)) {
pci_disable_msi(dev->dev);
return -EIO;
}
@@ -266,6 +363,11 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
#endif
#ifdef __KVM_HAVE_MSIX
+static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
+{
+ return IRQ_WAKE_THREAD;
+}
+
static int assigned_device_enable_host_msix(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
@@ -282,8 +384,9 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
for (i = 0; i < dev->entries_nr; i++) {
r = request_threaded_irq(dev->host_msix_entries[i].vector,
- NULL, kvm_assigned_dev_thread,
- 0, dev->irq_name, (void *)dev);
+ kvm_assigned_dev_msix,
+ kvm_assigned_dev_thread_msix,
+ 0, dev->irq_name, dev);
if (r)
goto err;
}
@@ -291,7 +394,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
return 0;
err:
for (i -= 1; i >= 0; i--)
- free_irq(dev->host_msix_entries[i].vector, (void *)dev);
+ free_irq(dev->host_msix_entries[i].vector, dev);
pci_disable_msix(dev->dev);
return r;
}
@@ -314,7 +417,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
{
dev->guest_irq = irq->guest_irq;
dev->ack_notifier.gsi = -1;
- dev->host_irq_disabled = false;
return 0;
}
#endif
@@ -326,7 +428,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
{
dev->guest_irq = irq->guest_irq;
dev->ack_notifier.gsi = -1;
- dev->host_irq_disabled = false;
return 0;
}
#endif
@@ -360,6 +461,7 @@ static int assign_host_irq(struct kvm *kvm,
default:
r = -EINVAL;
}
+ dev->host_irq_disabled = false;
if (!r)
dev->irq_requested_type |= host_irq_type;
@@ -404,7 +506,8 @@ static int assign_guest_irq(struct kvm *kvm,
if (!r) {
dev->irq_requested_type |= guest_irq_type;
- kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+ if (dev->ack_notifier.gsi != -1)
+ kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
} else
kvm_free_irq_source_id(kvm, dev->irq_source_id);
@@ -460,6 +563,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
{
int r = -ENODEV;
struct kvm_assigned_dev_kernel *match;
+ unsigned long irq_type;
mutex_lock(&kvm->lock);
@@ -468,12 +572,74 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
if (!match)
goto out;
- r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+ irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
+ KVM_DEV_IRQ_GUEST_MASK);
+ r = kvm_deassign_irq(kvm, match, irq_type);
out:
mutex_unlock(&kvm->lock);
return r;
}
+/*
+ * We want to test whether the caller has been granted permissions to
+ * use this device. To be able to configure and control the device,
+ * the user needs access to PCI configuration space and BAR resources.
+ * These are accessed through PCI sysfs. PCI config space is often
+ * passed to the process calling this ioctl via file descriptor, so we
+ * can't rely on access to that file. We can check for permissions
+ * on each of the BAR resource files, which is a pretty clear
+ * indicator that the user has been granted access to the device.
+ */
+static int probe_sysfs_permissions(struct pci_dev *dev)
+{
+#ifdef CONFIG_SYSFS
+ int i;
+ bool bar_found = false;
+
+ for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
+ char *kpath, *syspath;
+ struct path path;
+ struct inode *inode;
+ int r;
+
+ if (!pci_resource_len(dev, i))
+ continue;
+
+ kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
+ if (!kpath)
+ return -ENOMEM;
+
+ /* Per sysfs-rules, sysfs is always at /sys */
+ syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
+ kfree(kpath);
+ if (!syspath)
+ return -ENOMEM;
+
+ r = kern_path(syspath, LOOKUP_FOLLOW, &path);
+ kfree(syspath);
+ if (r)
+ return r;
+
+ inode = path.dentry->d_inode;
+
+ r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
+ path_put(&path);
+ if (r)
+ return r;
+
+ bar_found = true;
+ }
+
+ /* If no resources, probably something special */
+ if (!bar_found)
+ return -EPERM;
+
+ return 0;
+#else
+ return -EINVAL; /* No way to control the device without sysfs */
+#endif
+}
+
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
struct kvm_assigned_pci_dev *assigned_dev)
{
@@ -481,6 +647,9 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
struct kvm_assigned_dev_kernel *match;
struct pci_dev *dev;
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
+ return -EINVAL;
+
mutex_lock(&kvm->lock);
idx = srcu_read_lock(&kvm->srcu);
@@ -507,6 +676,17 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
r = -EINVAL;
goto out_free;
}
+
+ /* Don't allow bridges to be assigned */
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) {
+ r = -EPERM;
+ goto out_put;
+ }
+
+ r = probe_sysfs_permissions(dev);
+ if (r)
+ goto out_put;
+
if (pci_enable_device(dev)) {
printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
r = -EBUSY;
@@ -525,6 +705,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
if (!match->pci_saved_state)
printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
__func__, dev_name(&dev->dev));
+
+ if (!pci_intx_mask_supported(dev))
+ assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
+
match->assigned_dev_id = assigned_dev->assigned_dev_id;
match->host_segnr = assigned_dev->segnr;
match->host_busnr = assigned_dev->busnr;
@@ -532,22 +716,21 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
match->flags = assigned_dev->flags;
match->dev = dev;
spin_lock_init(&match->intx_lock);
+ spin_lock_init(&match->intx_mask_lock);
match->irq_source_id = -1;
match->kvm = kvm;
match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
list_add(&match->list, &kvm->arch.assigned_dev_head);
- if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
- if (!kvm->arch.iommu_domain) {
- r = kvm_iommu_map_guest(kvm);
- if (r)
- goto out_list_del;
- }
- r = kvm_assign_device(kvm, match);
+ if (!kvm->arch.iommu_domain) {
+ r = kvm_iommu_map_guest(kvm);
if (r)
goto out_list_del;
}
+ r = kvm_assign_device(kvm, match);
+ if (r)
+ goto out_list_del;
out:
srcu_read_unlock(&kvm->srcu, idx);
@@ -587,8 +770,7 @@ static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
goto out;
}
- if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
- kvm_deassign_device(kvm, match);
+ kvm_deassign_device(kvm, match);
kvm_free_assigned_device(kvm, match);
@@ -680,6 +862,55 @@ msix_entry_out:
}
#endif
+static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *match;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (!match) {
+ r = -ENODEV;
+ goto out;
+ }
+
+ spin_lock(&match->intx_mask_lock);
+
+ match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
+ match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
+
+ if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
+ if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
+ kvm_set_irq(match->kvm, match->irq_source_id,
+ match->guest_irq, 0);
+ /*
+ * Masking at hardware-level is performed on demand,
+ * i.e. when an IRQ actually arrives at the host.
+ */
+ } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ /*
+ * Unmask the IRQ line if required. Unmasking at
+ * device level will be performed by user space.
+ */
+ spin_lock_irq(&match->intx_lock);
+ if (match->host_irq_disabled) {
+ enable_irq(match->host_irq);
+ match->host_irq_disabled = false;
+ }
+ spin_unlock_irq(&match->intx_lock);
+ }
+ }
+
+ spin_unlock(&match->intx_mask_lock);
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
unsigned long arg)
{
@@ -787,6 +1018,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
break;
}
#endif
+ case KVM_ASSIGN_SET_INTX_MASK: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
+ break;
+ }
default:
r = -ENOTTY;
break;
@@ -794,4 +1034,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
out:
return r;
}
-
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 74268b4c2ee..ea475cd0351 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -111,8 +111,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
list_entry(vcpu->async_pf.done.next,
typeof(*work), link);
list_del(&work->link);
- if (work->page)
- put_page(work->page);
+ if (!is_error_page(work->page))
+ kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
spin_unlock(&vcpu->async_pf.lock);
@@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
list_del(&work->queue);
vcpu->async_pf.queued--;
- if (work->page)
- put_page(work->page);
+ if (!is_error_page(work->page))
+ kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
}
@@ -203,8 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
if (!work)
return -ENOMEM;
- work->page = bad_page;
- get_page(bad_page);
+ work->page = KVM_ERR_PTR_BAD_PAGE;
INIT_LIST_HEAD(&work->queue); /* for list_del to work */
spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index fc8487564d1..88b2fe3ddf4 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -24,10 +24,25 @@ static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
gpa_t addr, int len)
{
- struct kvm_coalesced_mmio_zone *zone;
+ /* is it in a batchable area ?
+ * (addr,len) is fully included in
+ * (zone->addr, zone->size)
+ */
+ if (len < 0)
+ return 0;
+ if (addr + len < addr)
+ return 0;
+ if (addr < dev->zone.addr)
+ return 0;
+ if (addr + len > dev->zone.addr + dev->zone.size)
+ return 0;
+ return 1;
+}
+
+static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
+{
struct kvm_coalesced_mmio_ring *ring;
unsigned avail;
- int i;
/* Are we able to batch it ? */
@@ -37,25 +52,12 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
*/
ring = dev->kvm->coalesced_mmio_ring;
avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX;
- if (avail < KVM_MAX_VCPUS) {
+ if (avail == 0) {
/* full */
return 0;
}
- /* is it in a batchable area ? */
-
- for (i = 0; i < dev->nb_zones; i++) {
- zone = &dev->zone[i];
-
- /* (addr,len) is fully included in
- * (zone->addr, zone->size)
- */
-
- if (zone->addr <= addr &&
- addr + len <= zone->addr + zone->size)
- return 1;
- }
- return 0;
+ return 1;
}
static int coalesced_mmio_write(struct kvm_io_device *this,
@@ -63,10 +65,16 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
{
struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
+
if (!coalesced_mmio_in_range(dev, addr, len))
return -EOPNOTSUPP;
- spin_lock(&dev->lock);
+ spin_lock(&dev->kvm->ring_lock);
+
+ if (!coalesced_mmio_has_room(dev)) {
+ spin_unlock(&dev->kvm->ring_lock);
+ return -EOPNOTSUPP;
+ }
/* copy data in first free entry of the ring */
@@ -75,7 +83,7 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
memcpy(ring->coalesced_mmio[ring->last].data, val, len);
smp_wmb();
ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
- spin_unlock(&dev->lock);
+ spin_unlock(&dev->kvm->ring_lock);
return 0;
}
@@ -83,6 +91,8 @@ static void coalesced_mmio_destructor(struct kvm_io_device *this)
{
struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
+ list_del(&dev->list);
+
kfree(dev);
}
@@ -93,7 +103,6 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
int kvm_coalesced_mmio_init(struct kvm *kvm)
{
- struct kvm_coalesced_mmio_dev *dev;
struct page *page;
int ret;
@@ -101,31 +110,18 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page)
goto out_err;
- kvm->coalesced_mmio_ring = page_address(page);
- ret = -ENOMEM;
- dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
- if (!dev)
- goto out_free_page;
- spin_lock_init(&dev->lock);
- kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
- dev->kvm = kvm;
- kvm->coalesced_mmio_dev = dev;
-
- mutex_lock(&kvm->slots_lock);
- ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
- mutex_unlock(&kvm->slots_lock);
- if (ret < 0)
- goto out_free_dev;
+ ret = 0;
+ kvm->coalesced_mmio_ring = page_address(page);
- return ret;
+ /*
+ * We're using this spinlock to sync access to the coalesced ring.
+ * The list doesn't need it's own lock since device registration and
+ * unregistration should only happen when kvm->slots_lock is held.
+ */
+ spin_lock_init(&kvm->ring_lock);
+ INIT_LIST_HEAD(&kvm->coalesced_zones);
-out_free_dev:
- kvm->coalesced_mmio_dev = NULL;
- kfree(dev);
-out_free_page:
- kvm->coalesced_mmio_ring = NULL;
- __free_page(page);
out_err:
return ret;
}
@@ -139,51 +135,50 @@ void kvm_coalesced_mmio_free(struct kvm *kvm)
int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
struct kvm_coalesced_mmio_zone *zone)
{
- struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+ int ret;
+ struct kvm_coalesced_mmio_dev *dev;
- if (dev == NULL)
- return -ENXIO;
+ dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+
+ kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
+ dev->kvm = kvm;
+ dev->zone = *zone;
mutex_lock(&kvm->slots_lock);
- if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
- mutex_unlock(&kvm->slots_lock);
- return -ENOBUFS;
- }
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr,
+ zone->size, &dev->dev);
+ if (ret < 0)
+ goto out_free_dev;
+ list_add_tail(&dev->list, &kvm->coalesced_zones);
+ mutex_unlock(&kvm->slots_lock);
- dev->zone[dev->nb_zones] = *zone;
- dev->nb_zones++;
+ return ret;
+out_free_dev:
mutex_unlock(&kvm->slots_lock);
+
+ kfree(dev);
+
+ if (dev == NULL)
+ return -ENXIO;
+
return 0;
}
int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
struct kvm_coalesced_mmio_zone *zone)
{
- int i;
- struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
- struct kvm_coalesced_mmio_zone *z;
-
- if (dev == NULL)
- return -ENXIO;
+ struct kvm_coalesced_mmio_dev *dev, *tmp;
mutex_lock(&kvm->slots_lock);
- i = dev->nb_zones;
- while (i) {
- z = &dev->zone[i - 1];
-
- /* unregister all zones
- * included in (zone->addr, zone->size)
- */
-
- if (zone->addr <= z->addr &&
- z->addr + z->size <= zone->addr + zone->size) {
- dev->nb_zones--;
- *z = dev->zone[dev->nb_zones];
+ list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
+ if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+ kvm_iodevice_destructor(&dev->dev);
}
- i--;
- }
mutex_unlock(&kvm->slots_lock);
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h
index 8a5959e3535..b280c20444d 100644
--- a/virt/kvm/coalesced_mmio.h
+++ b/virt/kvm/coalesced_mmio.h
@@ -12,14 +12,13 @@
#ifdef CONFIG_KVM_MMIO
-#define KVM_COALESCED_MMIO_ZONE_MAX 100
+#include <linux/list.h>
struct kvm_coalesced_mmio_dev {
+ struct list_head list;
struct kvm_io_device dev;
struct kvm *kvm;
- spinlock_t lock;
- int nb_zones;
- struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
+ struct kvm_coalesced_mmio_zone zone;
};
int kvm_coalesced_mmio_init(struct kvm *kvm);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 73358d256fa..9718e98d6d2 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -43,6 +43,31 @@
* --------------------------------------------------------------------
*/
+/*
+ * Resampling irqfds are a special variety of irqfds used to emulate
+ * level triggered interrupts. The interrupt is asserted on eventfd
+ * trigger. On acknowledgement through the irq ack notifier, the
+ * interrupt is de-asserted and userspace is notified through the
+ * resamplefd. All resamplers on the same gsi are de-asserted
+ * together, so we don't need to track the state of each individual
+ * user. We can also therefore share the same irq source ID.
+ */
+struct _irqfd_resampler {
+ struct kvm *kvm;
+ /*
+ * List of resampling struct _irqfd objects sharing this gsi.
+ * RCU list modified under kvm->irqfds.resampler_lock
+ */
+ struct list_head list;
+ struct kvm_irq_ack_notifier notifier;
+ /*
+ * Entry in list of kvm->irqfd.resampler_list. Use for sharing
+ * resamplers among irqfds on the same gsi.
+ * Accessed and modified under kvm->irqfds.resampler_lock
+ */
+ struct list_head link;
+};
+
struct _irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
@@ -52,6 +77,12 @@ struct _irqfd {
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
+ /* The resampler used by this irqfd (resampler-only) */
+ struct _irqfd_resampler *resampler;
+ /* Eventfd notified on resample (resampler-only) */
+ struct eventfd_ctx *resamplefd;
+ /* Entry in list of irqfds for a resampler (resampler-only) */
+ struct list_head resampler_link;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
@@ -67,8 +98,58 @@ irqfd_inject(struct work_struct *work)
struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
struct kvm *kvm = irqfd->kvm;
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+ if (!irqfd->resampler) {
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+ } else
+ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ irqfd->gsi, 1);
+}
+
+/*
+ * Since resampler irqfds share an IRQ source ID, we de-assert once
+ * then notify all of the resampler irqfds using this GSI. We can't
+ * do multiple de-asserts or we risk racing with incoming re-asserts.
+ */
+static void
+irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
+{
+ struct _irqfd_resampler *resampler;
+ struct _irqfd *irqfd;
+
+ resampler = container_of(kian, struct _irqfd_resampler, notifier);
+
+ kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ resampler->notifier.gsi, 0);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
+ eventfd_signal(irqfd->resamplefd, 1);
+
+ rcu_read_unlock();
+}
+
+static void
+irqfd_resampler_shutdown(struct _irqfd *irqfd)
+{
+ struct _irqfd_resampler *resampler = irqfd->resampler;
+ struct kvm *kvm = resampler->kvm;
+
+ mutex_lock(&kvm->irqfds.resampler_lock);
+
+ list_del_rcu(&irqfd->resampler_link);
+ synchronize_rcu();
+
+ if (list_empty(&resampler->list)) {
+ list_del(&resampler->link);
+ kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
+ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ resampler->notifier.gsi, 0);
+ kfree(resampler);
+ }
+
+ mutex_unlock(&kvm->irqfds.resampler_lock);
}
/*
@@ -90,7 +171,12 @@ irqfd_shutdown(struct work_struct *work)
* We know no new events will be scheduled at this point, so block
* until all previously outstanding events have completed
*/
- flush_work_sync(&irqfd->inject);
+ flush_work(&irqfd->inject);
+
+ if (irqfd->resampler) {
+ irqfd_resampler_shutdown(irqfd);
+ eventfd_ctx_put(irqfd->resamplefd);
+ }
/*
* It is now safe to release the object's resources
@@ -198,12 +284,12 @@ static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
}
static int
-kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
+kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
struct kvm_irq_routing_table *irq_rt;
struct _irqfd *irqfd, *tmp;
struct file *file = NULL;
- struct eventfd_ctx *eventfd = NULL;
+ struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
int ret;
unsigned int events;
@@ -212,12 +298,12 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
return -ENOMEM;
irqfd->kvm = kvm;
- irqfd->gsi = gsi;
+ irqfd->gsi = args->gsi;
INIT_LIST_HEAD(&irqfd->list);
INIT_WORK(&irqfd->inject, irqfd_inject);
INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
- file = eventfd_fget(fd);
+ file = eventfd_fget(args->fd);
if (IS_ERR(file)) {
ret = PTR_ERR(file);
goto fail;
@@ -231,6 +317,54 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
irqfd->eventfd = eventfd;
+ if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
+ struct _irqfd_resampler *resampler;
+
+ resamplefd = eventfd_ctx_fdget(args->resamplefd);
+ if (IS_ERR(resamplefd)) {
+ ret = PTR_ERR(resamplefd);
+ goto fail;
+ }
+
+ irqfd->resamplefd = resamplefd;
+ INIT_LIST_HEAD(&irqfd->resampler_link);
+
+ mutex_lock(&kvm->irqfds.resampler_lock);
+
+ list_for_each_entry(resampler,
+ &kvm->irqfds.resampler_list, list) {
+ if (resampler->notifier.gsi == irqfd->gsi) {
+ irqfd->resampler = resampler;
+ break;
+ }
+ }
+
+ if (!irqfd->resampler) {
+ resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
+ if (!resampler) {
+ ret = -ENOMEM;
+ mutex_unlock(&kvm->irqfds.resampler_lock);
+ goto fail;
+ }
+
+ resampler->kvm = kvm;
+ INIT_LIST_HEAD(&resampler->list);
+ resampler->notifier.gsi = irqfd->gsi;
+ resampler->notifier.irq_acked = irqfd_resampler_ack;
+ INIT_LIST_HEAD(&resampler->link);
+
+ list_add(&resampler->link, &kvm->irqfds.resampler_list);
+ kvm_register_irq_ack_notifier(kvm,
+ &resampler->notifier);
+ irqfd->resampler = resampler;
+ }
+
+ list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
+ synchronize_rcu();
+
+ mutex_unlock(&kvm->irqfds.resampler_lock);
+ }
+
/*
* Install our own custom wake-up handling so we are notified via
* a callback whenever someone signals the underlying eventfd
@@ -276,6 +410,12 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
return 0;
fail:
+ if (irqfd->resampler)
+ irqfd_resampler_shutdown(irqfd);
+
+ if (resamplefd && !IS_ERR(resamplefd))
+ eventfd_ctx_put(resamplefd);
+
if (eventfd && !IS_ERR(eventfd))
eventfd_ctx_put(eventfd);
@@ -291,6 +431,8 @@ kvm_eventfd_init(struct kvm *kvm)
{
spin_lock_init(&kvm->irqfds.lock);
INIT_LIST_HEAD(&kvm->irqfds.items);
+ INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
+ mutex_init(&kvm->irqfds.resampler_lock);
INIT_LIST_HEAD(&kvm->ioeventfds);
}
@@ -298,19 +440,19 @@ kvm_eventfd_init(struct kvm *kvm)
* shutdown any irqfd's that match fd+gsi
*/
static int
-kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
+kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
struct _irqfd *irqfd, *tmp;
struct eventfd_ctx *eventfd;
- eventfd = eventfd_ctx_fdget(fd);
+ eventfd = eventfd_ctx_fdget(args->fd);
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
spin_lock_irq(&kvm->irqfds.lock);
list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
- if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
+ if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
/*
* This rcu_assign_pointer is needed for when
* another thread calls kvm_irq_routing_update before
@@ -338,12 +480,15 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
}
int
-kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
+kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
- if (flags & KVM_IRQFD_FLAG_DEASSIGN)
- return kvm_irqfd_deassign(kvm, fd, gsi);
+ if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
+ return -EINVAL;
+
+ if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
+ return kvm_irqfd_deassign(kvm, args);
- return kvm_irqfd_assign(kvm, fd, gsi);
+ return kvm_irqfd_assign(kvm, args);
}
/*
@@ -586,7 +731,8 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
kvm_iodevice_init(&p->dev, &ioeventfd_ops);
- ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev);
+ ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
+ &p->dev);
if (ret < 0)
goto unlock_fail;
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 8df1ca104a7..cfb7e4d52dc 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -185,42 +185,56 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
irqe.dest_mode = 0; /* Physical mode. */
/* need to read apic_id from apic regiest since
* it can be rewritten */
- irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id;
+ irqe.dest_id = ioapic->kvm->bsp_vcpu_id;
}
#endif
return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
}
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level)
{
u32 old_irr;
u32 mask = 1 << irq;
union kvm_ioapic_redirect_entry entry;
- int ret = 1;
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
spin_lock(&ioapic->lock);
old_irr = ioapic->irr;
- if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
- entry = ioapic->redirtbl[irq];
- level ^= entry.fields.polarity;
- if (!level)
- ioapic->irr &= ~mask;
- else {
- int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
- ioapic->irr |= mask;
- if ((edge && old_irr != ioapic->irr) ||
- (!edge && !entry.fields.remote_irr))
- ret = ioapic_service(ioapic, irq);
- else
- ret = 0; /* report coalesced interrupt */
- }
- trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+ irq_source_id, level);
+ entry = ioapic->redirtbl[irq];
+ irq_level ^= entry.fields.polarity;
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ } else {
+ int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+ ioapic->irr |= mask;
+ if ((edge && old_irr != ioapic->irr) ||
+ (!edge && !entry.fields.remote_irr))
+ ret = ioapic_service(ioapic, irq);
+ else
+ ret = 0; /* report coalesced interrupt */
}
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
spin_unlock(&ioapic->lock);
return ret;
}
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
+{
+ int i;
+
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
+ __clear_bit(irq_source_id, &ioapic->irq_states[i]);
+ spin_unlock(&ioapic->lock);
+}
+
static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
int trigger_mode)
{
@@ -254,13 +268,17 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
}
}
+bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ smp_rmb();
+ return test_bit(vector, ioapic->handled_vectors);
+}
+
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- smp_rmb();
- if (!test_bit(vector, ioapic->handled_vectors))
- return;
spin_lock(&ioapic->lock);
__kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
spin_unlock(&ioapic->lock);
@@ -332,9 +350,18 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
(void*)addr, len, val);
ASSERT(!(addr & 0xf)); /* check alignment */
- if (len == 4 || len == 8)
+ switch (len) {
+ case 8:
+ case 4:
data = *(u32 *) val;
- else {
+ break;
+ case 2:
+ data = *(u16 *) val;
+ break;
+ case 1:
+ data = *(u8 *) val;
+ break;
+ default:
printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
return 0;
}
@@ -343,7 +370,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
spin_lock(&ioapic->lock);
switch (addr) {
case IOAPIC_REG_SELECT:
- ioapic->ioregsel = data;
+ ioapic->ioregsel = data & 0xFF; /* 8-bit register */
break;
case IOAPIC_REG_WINDOW:
@@ -394,7 +421,8 @@ int kvm_ioapic_init(struct kvm *kvm)
kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
ioapic->kvm = kvm;
mutex_lock(&kvm->slots_lock);
- ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
+ IOAPIC_MEM_LENGTH, &ioapic->dev);
mutex_unlock(&kvm->slots_lock);
if (ret < 0) {
kvm->arch.vioapic = NULL;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 0b190c34ccc..a30abfe6ed1 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -71,9 +71,12 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
+bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
int kvm_ioapic_init(struct kvm *kvm);
void kvm_ioapic_destroy(struct kvm *kvm);
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level);
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq);
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 78c80f67f53..037cb6730e6 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -25,12 +25,14 @@
#include <linux/list.h>
#include <linux/kvm_host.h>
+#include <linux/module.h>
#include <linux/pci.h>
+#include <linux/stat.h>
#include <linux/dmar.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
-static int allow_unsafe_assigned_interrupts;
+static bool allow_unsafe_assigned_interrupts;
module_param_named(allow_unsafe_assigned_interrupts,
allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
@@ -40,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
-static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, unsigned long size)
+static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+ unsigned long size)
{
gfn_t end_gfn;
pfn_t pfn;
- pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
+ pfn = gfn_to_pfn_memslot(slot, gfn);
end_gfn = gfn + (size >> PAGE_SHIFT);
gfn += 1;
@@ -54,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
return pfn;
while (gfn < end_gfn)
- gfn_to_pfn_memslot(kvm, slot, gfn++);
+ gfn_to_pfn_memslot(slot, gfn++);
return pfn;
}
@@ -103,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
* Pin all pages we are about to map in memory. This is
* important because we unmap and unpin in 4kb steps later.
*/
- pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
+ pfn = kvm_pin_pages(slot, gfn, page_size);
if (is_error_pfn(pfn)) {
gfn += 1;
continue;
@@ -111,7 +113,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
/* Map into IO address space */
r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
- get_order(page_size), flags);
+ page_size, flags);
if (r) {
printk(KERN_ERR "kvm_iommu_map_address:"
"iommu failed to map pfn=%llx\n", pfn);
@@ -132,14 +134,15 @@ unmap_pages:
static int kvm_iommu_map_memslots(struct kvm *kvm)
{
- int i, idx, r = 0;
+ int idx, r = 0;
struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
idx = srcu_read_lock(&kvm->srcu);
slots = kvm_memslots(kvm);
- for (i = 0; i < slots->nmemslots; i++) {
- r = kvm_iommu_map_pages(kvm, &slots->memslots[i]);
+ kvm_for_each_memslot(memslot, slots) {
+ r = kvm_iommu_map_pages(kvm, memslot);
if (r)
break;
}
@@ -187,6 +190,8 @@ int kvm_assign_device(struct kvm *kvm,
goto out_unmap;
}
+ pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED;
+
printk(KERN_DEBUG "assign device %x:%x:%x.%x\n",
assigned_dev->host_segnr,
assigned_dev->host_busnr,
@@ -215,6 +220,8 @@ int kvm_deassign_device(struct kvm *kvm,
iommu_detach_device(domain, &pdev->dev);
+ pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
+
printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n",
assigned_dev->host_segnr,
assigned_dev->host_busnr,
@@ -228,14 +235,18 @@ int kvm_iommu_map_guest(struct kvm *kvm)
{
int r;
- if (!iommu_found()) {
+ if (!iommu_present(&pci_bus_type)) {
printk(KERN_ERR "%s: iommu not found\n", __func__);
return -ENODEV;
}
- kvm->arch.iommu_domain = iommu_domain_alloc();
- if (!kvm->arch.iommu_domain)
- return -ENOMEM;
+ mutex_lock(&kvm->slots_lock);
+
+ kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type);
+ if (!kvm->arch.iommu_domain) {
+ r = -ENOMEM;
+ goto out_unlock;
+ }
if (!allow_unsafe_assigned_interrupts &&
!iommu_domain_has_cap(kvm->arch.iommu_domain,
@@ -246,17 +257,16 @@ int kvm_iommu_map_guest(struct kvm *kvm)
" module option.\n", __func__);
iommu_domain_free(kvm->arch.iommu_domain);
kvm->arch.iommu_domain = NULL;
- return -EPERM;
+ r = -EPERM;
+ goto out_unlock;
}
r = kvm_iommu_map_memslots(kvm);
if (r)
- goto out_unmap;
-
- return 0;
+ kvm_iommu_unmap_memslots(kvm);
-out_unmap:
- kvm_iommu_unmap_memslots(kvm);
+out_unlock:
+ mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -286,15 +296,21 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
while (gfn < end_gfn) {
unsigned long unmap_pages;
- int order;
+ size_t size;
/* Get physical address */
phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
+
+ if (!phys) {
+ gfn++;
+ continue;
+ }
+
pfn = phys >> PAGE_SHIFT;
/* Unmap address from IO address space */
- order = iommu_unmap(domain, gfn_to_gpa(gfn), 0);
- unmap_pages = 1ULL << order;
+ size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE);
+ unmap_pages = 1ULL << get_order(size);
/* Unpin all pages we just unmapped to not leak any memory */
kvm_unpin_pages(kvm, pfn, unmap_pages);
@@ -303,18 +319,23 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
}
}
+void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages);
+}
+
static int kvm_iommu_unmap_memslots(struct kvm *kvm)
{
- int i, idx;
+ int idx;
struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
idx = srcu_read_lock(&kvm->srcu);
slots = kvm_memslots(kvm);
- for (i = 0; i < slots->nmemslots; i++) {
- kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
- slots->memslots[i].npages);
- }
+ kvm_for_each_memslot(memslot, slots)
+ kvm_iommu_unmap_pages(kvm, memslot);
+
srcu_read_unlock(&kvm->srcu, idx);
return 0;
@@ -328,7 +349,11 @@ int kvm_iommu_unmap_guest(struct kvm *kvm)
if (!domain)
return 0;
+ mutex_lock(&kvm->slots_lock);
kvm_iommu_unmap_memslots(kvm);
+ kvm->arch.iommu_domain = NULL;
+ mutex_unlock(&kvm->slots_lock);
+
iommu_domain_free(domain);
return 0;
}
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 9f614b4e365..2eb58af7ee9 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -33,26 +33,12 @@
#include "ioapic.h"
-static inline int kvm_irq_line_state(unsigned long *irq_state,
- int irq_source_id, int level)
-{
- /* Logical OR for level trig interrupt */
- if (level)
- set_bit(irq_source_id, irq_state);
- else
- clear_bit(irq_source_id, irq_state);
-
- return !!(*irq_state);
-}
-
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level)
{
#ifdef CONFIG_X86
struct kvm_pic *pic = pic_irqchip(kvm);
- level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
- irq_source_id, level);
- return kvm_pic_set_irq(pic, e->irqchip.pin, level);
+ return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
#else
return -1;
#endif
@@ -62,10 +48,7 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
- irq_source_id, level);
-
- return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
+ return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level);
}
inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
@@ -85,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_vcpu *vcpu, *lowest = NULL;
if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
- kvm_is_dm_lowest_prio(irq))
+ kvm_is_dm_lowest_prio(irq)) {
printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
+ return r;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_apic_present(vcpu))
@@ -138,6 +126,20 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
}
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+ struct kvm_kernel_irq_routing_entry route;
+
+ if (!irqchip_in_kernel(kvm) || msi->flags != 0)
+ return -EINVAL;
+
+ route.msi.address_lo = msi->address_lo;
+ route.msi.address_hi = msi->address_hi;
+ route.msi.data = msi->data;
+
+ return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
+}
+
/*
* Return value:
* < 0 Interrupt was ignored (masked or not delivered for other reasons)
@@ -226,6 +228,9 @@ int kvm_request_irq_source_id(struct kvm *kvm)
}
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+#ifdef CONFIG_X86
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+#endif
set_bit(irq_source_id, bitmap);
unlock:
mutex_unlock(&kvm->irq_lock);
@@ -235,9 +240,10 @@ unlock:
void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
{
- int i;
-
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+#ifdef CONFIG_X86
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+#endif
mutex_lock(&kvm->irq_lock);
if (irq_source_id < 0 ||
@@ -249,14 +255,10 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
if (!irqchip_in_kernel(kvm))
goto unlock;
- for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) {
- clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]);
- if (i >= 16)
- continue;
+ kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
#ifdef CONFIG_X86
- clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]);
+ kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id);
#endif
- }
unlock:
mutex_unlock(&kvm->irq_lock);
}
@@ -318,6 +320,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
*/
hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link)
if (ei->type == KVM_IRQ_ROUTING_MSI ||
+ ue->type == KVM_IRQ_ROUTING_MSI ||
ue->u.irqchip.irqchip == ei->irqchip.irqchip)
return r;
@@ -329,11 +332,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_MASTER:
e->set = kvm_set_pic_irq;
- max_pin = 16;
+ max_pin = PIC_NUM_PINS;
break;
case KVM_IRQCHIP_PIC_SLAVE:
e->set = kvm_set_pic_irq;
- max_pin = 16;
+ max_pin = PIC_NUM_PINS;
delta = 8;
break;
case KVM_IRQCHIP_IOAPIC:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index aefdda390f5..be70035fd42 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -47,6 +47,8 @@
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/bsearch.h>
#include <asm/processor.h>
#include <asm/io.h>
@@ -98,13 +100,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
static bool largepages_enabled = true;
-static struct page *hwpoison_page;
-static pfn_t hwpoison_pfn;
-
-struct page *fault_page;
-pfn_t fault_pfn;
-
-inline int kvm_is_mmio_pfn(pfn_t pfn)
+bool kvm_is_mmio_pfn(pfn_t pfn)
{
if (pfn_valid(pfn)) {
int reserved;
@@ -135,11 +131,12 @@ inline int kvm_is_mmio_pfn(pfn_t pfn)
/*
* Switches to specified vcpu, until a matching vcpu_put()
*/
-void vcpu_load(struct kvm_vcpu *vcpu)
+int vcpu_load(struct kvm_vcpu *vcpu)
{
int cpu;
- mutex_lock(&vcpu->mutex);
+ if (mutex_lock_killable(&vcpu->mutex))
+ return -EINTR;
if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
/* The thread running this VCPU changed. */
struct pid *oldpid = vcpu->pid;
@@ -152,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
+ return 0;
}
void vcpu_put(struct kvm_vcpu *vcpu)
@@ -201,7 +199,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
- int dirty_count = kvm->tlbs_dirty;
+ long dirty_count = kvm->tlbs_dirty;
smp_mb();
if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
@@ -234,6 +232,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
}
vcpu->run = page_address(page);
+ kvm_vcpu_set_in_spin_loop(vcpu, false);
+ kvm_vcpu_set_dy_eligible(vcpu, false);
+
r = kvm_arch_vcpu_init(vcpu);
if (r < 0)
goto fail_free_run;
@@ -287,15 +288,15 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
*/
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
+
kvm->mmu_notifier_seq++;
need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
- spin_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
-
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
kvm_flush_remote_tlbs(kvm);
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
}
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
@@ -330,15 +331,14 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
- for (; start < end; start += PAGE_SIZE)
- need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
need_tlb_flush |= kvm->tlbs_dirty;
- spin_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
-
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
kvm_flush_remote_tlbs(kvm);
+
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
}
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -355,11 +355,11 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
* been freed.
*/
kvm->mmu_notifier_seq++;
+ smp_wmb();
/*
* The above sequence increase must be visible before the
- * below count decrease but both values are read by the kvm
- * page fault under mmu_lock spinlock so we don't need to add
- * a smb_wmb() here in between the two.
+ * below count decrease, which is ensured by the smp_wmb above
+ * in conjunction with the smp_rmb in mmu_notifier_retry().
*/
kvm->mmu_notifier_count--;
spin_unlock(&kvm->mmu_lock);
@@ -376,13 +376,14 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
- young = kvm_age_hva(kvm, address);
- spin_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
+ young = kvm_age_hva(kvm, address);
if (young)
kvm_flush_remote_tlbs(kvm);
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+
return young;
}
@@ -409,7 +410,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
int idx;
idx = srcu_read_lock(&kvm->srcu);
- kvm_arch_flush_shadow(kvm);
+ kvm_arch_flush_shadow_all(kvm);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -438,7 +439,16 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
-static struct kvm *kvm_create_vm(void)
+static void kvm_init_memslots_id(struct kvm *kvm)
+{
+ int i;
+ struct kvm_memslots *slots = kvm->memslots;
+
+ for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
+ slots->id_to_index[i] = slots->memslots[i].id = i;
+}
+
+static struct kvm *kvm_create_vm(unsigned long type)
{
int r, i;
struct kvm *kvm = kvm_arch_alloc_vm();
@@ -446,7 +456,7 @@ static struct kvm *kvm_create_vm(void)
if (!kvm)
return ERR_PTR(-ENOMEM);
- r = kvm_arch_init_vm(kvm);
+ r = kvm_arch_init_vm(kvm, type);
if (r)
goto out_err_nodisable;
@@ -463,6 +473,7 @@ static struct kvm *kvm_create_vm(void)
kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
if (!kvm->memslots)
goto out_err_nosrcu;
+ kvm_init_memslots_id(kvm);
if (init_srcu_struct(&kvm->srcu))
goto out_err_nosrcu;
for (i = 0; i < KVM_NR_BUSES; i++) {
@@ -503,18 +514,33 @@ out_err_nodisable:
return ERR_PTR(r);
}
+/*
+ * Avoid using vmalloc for a small buffer.
+ * Should not be used when the size is statically known.
+ */
+void *kvm_kvzalloc(unsigned long size)
+{
+ if (size > PAGE_SIZE)
+ return vzalloc(size);
+ else
+ return kzalloc(size, GFP_KERNEL);
+}
+
+void kvm_kvfree(const void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+}
+
static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
if (!memslot->dirty_bitmap)
return;
- if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
- vfree(memslot->dirty_bitmap_head);
- else
- kfree(memslot->dirty_bitmap_head);
-
+ kvm_kvfree(memslot->dirty_bitmap);
memslot->dirty_bitmap = NULL;
- memslot->dirty_bitmap_head = NULL;
}
/*
@@ -523,33 +549,21 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
- int i;
-
- if (!dont || free->rmap != dont->rmap)
- vfree(free->rmap);
-
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
kvm_destroy_dirty_bitmap(free);
-
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
- vfree(free->lpage_info[i]);
- free->lpage_info[i] = NULL;
- }
- }
+ kvm_arch_free_memslot(free, dont);
free->npages = 0;
- free->rmap = NULL;
}
void kvm_free_physmem(struct kvm *kvm)
{
- int i;
struct kvm_memslots *slots = kvm->memslots;
+ struct kvm_memory_slot *memslot;
- for (i = 0; i < slots->nmemslots; ++i)
- kvm_free_physmem_slot(&slots->memslots[i], NULL);
+ kvm_for_each_memslot(memslot, slots)
+ kvm_free_physmem_slot(memslot, NULL);
kfree(kvm->memslots);
}
@@ -570,7 +584,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
- kvm_arch_flush_shadow(kvm);
+ kvm_arch_flush_shadow_all(kvm);
#endif
kvm_arch_destroy_vm(kvm);
kvm_free_physmem(kvm);
@@ -604,28 +618,81 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
return 0;
}
-#ifndef CONFIG_S390
/*
* Allocation size is twice as large as the actual dirty bitmap size.
- * This makes it possible to do double buffering: see x86's
- * kvm_vm_ioctl_get_dirty_log().
+ * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
*/
static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
{
+#ifndef CONFIG_S390
unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
- if (dirty_bytes > PAGE_SIZE)
- memslot->dirty_bitmap = vzalloc(dirty_bytes);
- else
- memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
-
+ memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes);
if (!memslot->dirty_bitmap)
return -ENOMEM;
- memslot->dirty_bitmap_head = memslot->dirty_bitmap;
+#endif /* !CONFIG_S390 */
+ return 0;
+}
+
+static int cmp_memslot(const void *slot1, const void *slot2)
+{
+ struct kvm_memory_slot *s1, *s2;
+
+ s1 = (struct kvm_memory_slot *)slot1;
+ s2 = (struct kvm_memory_slot *)slot2;
+
+ if (s1->npages < s2->npages)
+ return 1;
+ if (s1->npages > s2->npages)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Sort the memslots base on its size, so the larger slots
+ * will get better fit.
+ */
+static void sort_memslots(struct kvm_memslots *slots)
+{
+ int i;
+
+ sort(slots->memslots, KVM_MEM_SLOTS_NUM,
+ sizeof(struct kvm_memory_slot), cmp_memslot, NULL);
+
+ for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
+ slots->id_to_index[slots->memslots[i].id] = i;
+}
+
+void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
+{
+ if (new) {
+ int id = new->id;
+ struct kvm_memory_slot *old = id_to_memslot(slots, id);
+ unsigned long npages = old->npages;
+
+ *old = *new;
+ if (new->npages != npages)
+ sort_memslots(slots);
+ }
+
+ slots->generation++;
+}
+
+static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
+{
+ u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+
+#ifdef KVM_CAP_READONLY_MEM
+ valid_flags |= KVM_MEM_READONLY;
+#endif
+
+ if (mem->flags & ~valid_flags)
+ return -EINVAL;
+
return 0;
}
-#endif /* !CONFIG_S390 */
/*
* Allocate some memory and give it an address in the guest physical address
@@ -647,6 +714,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_memory_slot old, new;
struct kvm_memslots *slots, *old_memslots;
+ r = check_memory_region_flags(mem);
+ if (r)
+ goto out;
+
r = -EINVAL;
/* General sanity checks */
if (mem->memory_size & (PAGE_SIZE - 1))
@@ -660,12 +731,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
(void __user *)(unsigned long)mem->userspace_addr,
mem->memory_size)))
goto out;
- if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+ if (mem->slot >= KVM_MEM_SLOTS_NUM)
goto out;
if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
goto out;
- memslot = &kvm->memslots->memslots[mem->slot];
+ memslot = id_to_memslot(kvm->memslots, mem->slot);
base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
npages = mem->memory_size >> PAGE_SHIFT;
@@ -707,92 +778,45 @@ int __kvm_set_memory_region(struct kvm *kvm,
r = -ENOMEM;
/* Allocate if a slot is being created */
-#ifndef CONFIG_S390
- if (npages && !new.rmap) {
- new.rmap = vzalloc(npages * sizeof(*new.rmap));
-
- if (!new.rmap)
- goto out_free;
-
+ if (npages && !old.npages) {
new.user_alloc = user_alloc;
new.userspace_addr = mem->userspace_addr;
- }
- if (!npages)
- goto skip_lpage;
-
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- unsigned long ugfn;
- unsigned long j;
- int lpages;
- int level = i + 2;
-
- /* Avoid unused variable warning if no large pages */
- (void)level;
- if (new.lpage_info[i])
- continue;
-
- lpages = 1 + ((base_gfn + npages - 1)
- >> KVM_HPAGE_GFN_SHIFT(level));
- lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
-
- new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
-
- if (!new.lpage_info[i])
+ if (kvm_arch_create_memslot(&new, npages))
goto out_free;
-
- if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
- new.lpage_info[i][0].write_count = 1;
- if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
- new.lpage_info[i][lpages - 1].write_count = 1;
- ugfn = new.userspace_addr >> PAGE_SHIFT;
- /*
- * If the gfn and userspace address are not aligned wrt each
- * other, or if explicitly asked to, disable large page
- * support for this slot
- */
- if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
- !largepages_enabled)
- for (j = 0; j < lpages; ++j)
- new.lpage_info[i][j].write_count = 1;
}
-skip_lpage:
-
/* Allocate page dirty bitmap if needed */
if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
if (kvm_create_dirty_bitmap(&new) < 0)
goto out_free;
/* destroy any largepage mappings for dirty tracking */
}
-#else /* not defined CONFIG_S390 */
- new.user_alloc = user_alloc;
- if (user_alloc)
- new.userspace_addr = mem->userspace_addr;
-#endif /* not defined CONFIG_S390 */
- if (!npages) {
+ if (!npages || base_gfn != old.base_gfn) {
+ struct kvm_memory_slot *slot;
+
r = -ENOMEM;
- slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
+ GFP_KERNEL);
if (!slots)
goto out_free;
- memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
- if (mem->slot >= slots->nmemslots)
- slots->nmemslots = mem->slot + 1;
- slots->generation++;
- slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
+ slot = id_to_memslot(slots, mem->slot);
+ slot->flags |= KVM_MEMSLOT_INVALID;
+
+ update_memslots(slots, NULL);
old_memslots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
synchronize_srcu_expedited(&kvm->srcu);
- /* From this point no new shadow pages pointing to a deleted
- * memslot will be created.
+ /* From this point no new shadow pages pointing to a deleted,
+ * or moved, memslot will be created.
*
* validation of sp->gfn happens in:
* - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
* - kvm_is_visible_gfn (mmu_check_roots)
*/
- kvm_arch_flush_shadow(kvm);
+ kvm_arch_flush_shadow_memslot(kvm, slot);
kfree(old_memslots);
}
@@ -800,44 +824,33 @@ skip_lpage:
if (r)
goto out_free;
- /* map the pages in iommu page table */
+ /* map/unmap the pages in iommu page table */
if (npages) {
r = kvm_iommu_map_pages(kvm, &new);
if (r)
goto out_free;
- }
+ } else
+ kvm_iommu_unmap_pages(kvm, &old);
r = -ENOMEM;
- slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
+ GFP_KERNEL);
if (!slots)
goto out_free;
- memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
- if (mem->slot >= slots->nmemslots)
- slots->nmemslots = mem->slot + 1;
- slots->generation++;
/* actual memory is freed via old in kvm_free_physmem_slot below */
if (!npages) {
- new.rmap = NULL;
new.dirty_bitmap = NULL;
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
- new.lpage_info[i] = NULL;
+ memset(&new.arch, 0, sizeof(new.arch));
}
- slots->memslots[mem->slot] = new;
+ update_memslots(slots, &new);
old_memslots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
synchronize_srcu_expedited(&kvm->srcu);
kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
- /*
- * If the new memory slot is created, we need to clear all
- * mmio sptes.
- */
- if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
- kvm_arch_flush_shadow(kvm);
-
kvm_free_physmem_slot(&old, &new);
kfree(old_memslots);
@@ -886,7 +899,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
if (log->slot >= KVM_MEMORY_SLOTS)
goto out;
- memslot = &kvm->memslots->memslots[log->slot];
+ memslot = id_to_memslot(kvm->memslots, log->slot);
r = -ENOENT;
if (!memslot->dirty_bitmap)
goto out;
@@ -908,73 +921,16 @@ out:
return r;
}
-void kvm_disable_largepages(void)
+bool kvm_largepages_enabled(void)
{
- largepages_enabled = false;
+ return largepages_enabled;
}
-EXPORT_SYMBOL_GPL(kvm_disable_largepages);
-int is_error_page(struct page *page)
-{
- return page == bad_page || page == hwpoison_page || page == fault_page;
-}
-EXPORT_SYMBOL_GPL(is_error_page);
-
-int is_error_pfn(pfn_t pfn)
-{
- return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_error_pfn);
-
-int is_hwpoison_pfn(pfn_t pfn)
-{
- return pfn == hwpoison_pfn;
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
-
-int is_fault_pfn(pfn_t pfn)
-{
- return pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_fault_pfn);
-
-int is_noslot_pfn(pfn_t pfn)
-{
- return pfn == bad_pfn;
-}
-EXPORT_SYMBOL_GPL(is_noslot_pfn);
-
-int is_invalid_pfn(pfn_t pfn)
-{
- return pfn == hwpoison_pfn || pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_invalid_pfn);
-
-static inline unsigned long bad_hva(void)
-{
- return PAGE_OFFSET;
-}
-
-int kvm_is_error_hva(unsigned long addr)
-{
- return addr == bad_hva();
-}
-EXPORT_SYMBOL_GPL(kvm_is_error_hva);
-
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
- gfn_t gfn)
+void kvm_disable_largepages(void)
{
- int i;
-
- for (i = 0; i < slots->nmemslots; ++i) {
- struct kvm_memory_slot *memslot = &slots->memslots[i];
-
- if (gfn >= memslot->base_gfn
- && gfn < memslot->base_gfn + memslot->npages)
- return memslot;
- }
- return NULL;
+ largepages_enabled = false;
}
+EXPORT_SYMBOL_GPL(kvm_disable_largepages);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
@@ -984,20 +940,13 @@ EXPORT_SYMBOL_GPL(gfn_to_memslot);
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
- int i;
- struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
- for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *memslot = &slots->memslots[i];
-
- if (memslot->flags & KVM_MEMSLOT_INVALID)
- continue;
+ if (!memslot || memslot->id >= KVM_MEMORY_SLOTS ||
+ memslot->flags & KVM_MEMSLOT_INVALID)
+ return 0;
- if (gfn >= memslot->base_gfn
- && gfn < memslot->base_gfn + memslot->npages)
- return 1;
- }
- return 0;
+ return 1;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
@@ -1025,17 +974,38 @@ out:
return size;
}
-static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
- gfn_t *nr_pages)
+static bool memslot_is_readonly(struct kvm_memory_slot *slot)
+{
+ return slot->flags & KVM_MEM_READONLY;
+}
+
+static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ gfn_t *nr_pages, bool write)
{
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
- return bad_hva();
+ return KVM_HVA_ERR_BAD;
+
+ if (memslot_is_readonly(slot) && write)
+ return KVM_HVA_ERR_RO_BAD;
if (nr_pages)
*nr_pages = slot->npages - (gfn - slot->base_gfn);
- return gfn_to_hva_memslot(slot, gfn);
+ return __gfn_to_hva_memslot(slot, gfn);
+}
+
+static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ gfn_t *nr_pages)
+{
+ return __gfn_to_hva_many(slot, gfn, nr_pages, true);
+}
+
+unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+ gfn_t gfn)
+{
+ return gfn_to_hva_many(slot, gfn, NULL);
}
+EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
@@ -1043,10 +1013,23 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
-static pfn_t get_fault_pfn(void)
+/*
+ * The hva returned by this function is only allowed to be read.
+ * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
+ */
+static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
{
- get_page(fault_page);
- return fault_pfn;
+ return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
+}
+
+static int kvm_read_hva(void *data, void __user *hva, int len)
+{
+ return __copy_from_user(data, hva, len);
+}
+
+static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
+{
+ return __copy_from_user_inatomic(data, hva, len);
}
int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
@@ -1069,108 +1052,186 @@ static inline int check_user_page_hwpoison(unsigned long addr)
return rc == -EHWPOISON;
}
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
- bool *async, bool write_fault, bool *writable)
+/*
+ * The atomic path to get the writable pfn which will be stored in @pfn,
+ * true indicates success, otherwise false is returned.
+ */
+static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
+ bool write_fault, bool *writable, pfn_t *pfn)
{
struct page *page[1];
- int npages = 0;
- pfn_t pfn;
+ int npages;
- /* we can do it either atomically or asynchronously, not both */
- BUG_ON(atomic && async);
+ if (!(async || atomic))
+ return false;
- BUG_ON(!write_fault && !writable);
+ /*
+ * Fast pin a writable pfn only if it is a write fault request
+ * or the caller allows to map a writable pfn for a read fault
+ * request.
+ */
+ if (!(write_fault || writable))
+ return false;
- if (writable)
- *writable = true;
+ npages = __get_user_pages_fast(addr, 1, 1, page);
+ if (npages == 1) {
+ *pfn = page_to_pfn(page[0]);
- if (atomic || async)
- npages = __get_user_pages_fast(addr, 1, 1, page);
+ if (writable)
+ *writable = true;
+ return true;
+ }
- if (unlikely(npages != 1) && !atomic) {
- might_sleep();
+ return false;
+}
- if (writable)
- *writable = write_fault;
+/*
+ * The slow path to get the pfn of the specified host virtual address,
+ * 1 indicates success, -errno is returned if error is detected.
+ */
+static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
+ bool *writable, pfn_t *pfn)
+{
+ struct page *page[1];
+ int npages = 0;
- if (async) {
- down_read(&current->mm->mmap_sem);
- npages = get_user_page_nowait(current, current->mm,
- addr, write_fault, page);
- up_read(&current->mm->mmap_sem);
- } else
- npages = get_user_pages_fast(addr, 1, write_fault,
- page);
-
- /* map read fault as writable if possible */
- if (unlikely(!write_fault) && npages == 1) {
- struct page *wpage[1];
-
- npages = __get_user_pages_fast(addr, 1, 1, wpage);
- if (npages == 1) {
- *writable = true;
- put_page(page[0]);
- page[0] = wpage[0];
- }
- npages = 1;
+ might_sleep();
+
+ if (writable)
+ *writable = write_fault;
+
+ if (async) {
+ down_read(&current->mm->mmap_sem);
+ npages = get_user_page_nowait(current, current->mm,
+ addr, write_fault, page);
+ up_read(&current->mm->mmap_sem);
+ } else
+ npages = get_user_pages_fast(addr, 1, write_fault,
+ page);
+ if (npages != 1)
+ return npages;
+
+ /* map read fault as writable if possible */
+ if (unlikely(!write_fault) && writable) {
+ struct page *wpage[1];
+
+ npages = __get_user_pages_fast(addr, 1, 1, wpage);
+ if (npages == 1) {
+ *writable = true;
+ put_page(page[0]);
+ page[0] = wpage[0];
}
+
+ npages = 1;
}
+ *pfn = page_to_pfn(page[0]);
+ return npages;
+}
- if (unlikely(npages != 1)) {
- struct vm_area_struct *vma;
+static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
+{
+ if (unlikely(!(vma->vm_flags & VM_READ)))
+ return false;
- if (atomic)
- return get_fault_pfn();
+ if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
+ return false;
- down_read(&current->mm->mmap_sem);
- if (npages == -EHWPOISON ||
- (!async && check_user_page_hwpoison(addr))) {
- up_read(&current->mm->mmap_sem);
- get_page(hwpoison_page);
- return page_to_pfn(hwpoison_page);
- }
+ return true;
+}
- vma = find_vma_intersection(current->mm, addr, addr+1);
-
- if (vma == NULL)
- pfn = get_fault_pfn();
- else if ((vma->vm_flags & VM_PFNMAP)) {
- pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff;
- BUG_ON(!kvm_is_mmio_pfn(pfn));
- } else {
- if (async && (vma->vm_flags & VM_WRITE))
- *async = true;
- pfn = get_fault_pfn();
- }
- up_read(&current->mm->mmap_sem);
- } else
- pfn = page_to_pfn(page[0]);
+/*
+ * Pin guest page in memory and return its pfn.
+ * @addr: host virtual address which maps memory to the guest
+ * @atomic: whether this function can sleep
+ * @async: whether this function need to wait IO complete if the
+ * host page is not in the memory
+ * @write_fault: whether we should get a writable host page
+ * @writable: whether it allows to map a writable host page for !@write_fault
+ *
+ * The function will map a writable host page for these two cases:
+ * 1): @write_fault = true
+ * 2): @write_fault = false && @writable, @writable will tell the caller
+ * whether the mapping is writable.
+ */
+static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+ bool write_fault, bool *writable)
+{
+ struct vm_area_struct *vma;
+ pfn_t pfn = 0;
+ int npages;
+ /* we can do it either atomically or asynchronously, not both */
+ BUG_ON(atomic && async);
+
+ if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
+ return pfn;
+
+ if (atomic)
+ return KVM_PFN_ERR_FAULT;
+
+ npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
+ if (npages == 1)
+ return pfn;
+
+ down_read(&current->mm->mmap_sem);
+ if (npages == -EHWPOISON ||
+ (!async && check_user_page_hwpoison(addr))) {
+ pfn = KVM_PFN_ERR_HWPOISON;
+ goto exit;
+ }
+
+ vma = find_vma_intersection(current->mm, addr, addr + 1);
+
+ if (vma == NULL)
+ pfn = KVM_PFN_ERR_FAULT;
+ else if ((vma->vm_flags & VM_PFNMAP)) {
+ pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ BUG_ON(!kvm_is_mmio_pfn(pfn));
+ } else {
+ if (async && vma_is_valid(vma, write_fault))
+ *async = true;
+ pfn = KVM_PFN_ERR_FAULT;
+ }
+exit:
+ up_read(&current->mm->mmap_sem);
return pfn;
}
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+static pfn_t
+__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+ bool *async, bool write_fault, bool *writable)
{
- return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
+ unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
+
+ if (addr == KVM_HVA_ERR_RO_BAD)
+ return KVM_PFN_ERR_RO_FAULT;
+
+ if (kvm_is_error_hva(addr))
+ return KVM_PFN_ERR_BAD;
+
+ /* Do not map writable pfn in the readonly memslot. */
+ if (writable && memslot_is_readonly(slot)) {
+ *writable = false;
+ writable = NULL;
+ }
+
+ return hva_to_pfn(addr, atomic, async, write_fault,
+ writable);
}
-EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
bool write_fault, bool *writable)
{
- unsigned long addr;
+ struct kvm_memory_slot *slot;
if (async)
*async = false;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr)) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
- }
+ slot = gfn_to_memslot(kvm, gfn);
- return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
+ return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+ writable);
}
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1199,13 +1260,17 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn)
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
- unsigned long addr = gfn_to_hva_memslot(slot, gfn);
- return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+ return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
}
+pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
+
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
int nr_pages)
{
@@ -1223,37 +1288,49 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+static struct page *kvm_pfn_to_page(pfn_t pfn)
+{
+ if (is_error_pfn(pfn))
+ return KVM_ERR_PTR_BAD_PAGE;
+
+ if (kvm_is_mmio_pfn(pfn)) {
+ WARN_ON(1);
+ return KVM_ERR_PTR_BAD_PAGE;
+ }
+
+ return pfn_to_page(pfn);
+}
+
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
pfn_t pfn;
pfn = gfn_to_pfn(kvm, gfn);
- if (!kvm_is_mmio_pfn(pfn))
- return pfn_to_page(pfn);
-
- WARN_ON(kvm_is_mmio_pfn(pfn));
- get_page(bad_page);
- return bad_page;
+ return kvm_pfn_to_page(pfn);
}
EXPORT_SYMBOL_GPL(gfn_to_page);
void kvm_release_page_clean(struct page *page)
{
+ WARN_ON(is_error_page(page));
+
kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
void kvm_release_pfn_clean(pfn_t pfn)
{
- if (!kvm_is_mmio_pfn(pfn))
+ if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
void kvm_release_page_dirty(struct page *page)
{
+ WARN_ON(is_error_page(page));
+
kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
@@ -1309,10 +1386,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int r;
unsigned long addr;
- addr = gfn_to_hva(kvm, gfn);
+ addr = gfn_to_hva_read(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
- r = __copy_from_user(data, (void __user *)addr + offset, len);
+ r = kvm_read_hva(data, (void __user *)addr + offset, len);
if (r)
return -EFAULT;
return 0;
@@ -1347,11 +1424,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
gfn_t gfn = gpa >> PAGE_SHIFT;
int offset = offset_in_page(gpa);
- addr = gfn_to_hva(kvm, gfn);
+ addr = gfn_to_hva_read(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
pagefault_disable();
- r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+ r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
pagefault_enable();
if (r)
return -EFAULT;
@@ -1405,7 +1482,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
ghc->gpa = gpa;
ghc->generation = slots->generation;
- ghc->memslot = __gfn_to_memslot(slots, gfn);
+ ghc->memslot = gfn_to_memslot(kvm, gfn);
ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
if (!kvm_is_error_hva(ghc->hva))
ghc->hva += offset;
@@ -1489,7 +1566,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
- __set_bit_le(rel_gfn, memslot->dirty_bitmap);
+ set_bit_le(rel_gfn, memslot->dirty_bitmap);
}
}
@@ -1526,6 +1603,30 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
finish_wait(&vcpu->wq, &wait);
}
+#ifndef CONFIG_S390
+/*
+ * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
+ */
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+ int me;
+ int cpu = vcpu->cpu;
+ wait_queue_head_t *wqp;
+
+ wqp = kvm_arch_vcpu_wq(vcpu);
+ if (waitqueue_active(wqp)) {
+ wake_up_interruptible(wqp);
+ ++vcpu->stat.halt_wakeup;
+ }
+
+ me = get_cpu();
+ if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
+ if (kvm_arch_vcpu_should_kick(vcpu))
+ smp_send_reschedule(cpu);
+ put_cpu();
+}
+#endif /* !CONFIG_S390 */
+
void kvm_resched(struct kvm_vcpu *vcpu)
{
if (!need_resched())
@@ -1534,6 +1635,68 @@ void kvm_resched(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_resched);
+bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
+{
+ struct pid *pid;
+ struct task_struct *task = NULL;
+
+ rcu_read_lock();
+ pid = rcu_dereference(target->pid);
+ if (pid)
+ task = get_pid_task(target->pid, PIDTYPE_PID);
+ rcu_read_unlock();
+ if (!task)
+ return false;
+ if (task->flags & PF_VCPU) {
+ put_task_struct(task);
+ return false;
+ }
+ if (yield_to(task, 1)) {
+ put_task_struct(task);
+ return true;
+ }
+ put_task_struct(task);
+ return false;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
+
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+/*
+ * Helper that checks whether a VCPU is eligible for directed yield.
+ * Most eligible candidate to yield is decided by following heuristics:
+ *
+ * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
+ * (preempted lock holder), indicated by @in_spin_loop.
+ * Set at the beiginning and cleared at the end of interception/PLE handler.
+ *
+ * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
+ * chance last time (mostly it has become eligible now since we have probably
+ * yielded to lockholder in last iteration. This is done by toggling
+ * @dy_eligible each time a VCPU checked for eligibility.)
+ *
+ * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
+ * to preempted lock-holder could result in wrong VCPU selection and CPU
+ * burning. Giving priority for a potential lock-holder increases lock
+ * progress.
+ *
+ * Since algorithm is based on heuristics, accessing another VCPU data without
+ * locking does not harm. It may result in trying to yield to same VCPU, fail
+ * and continue with next VCPU and so on.
+ */
+bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+ bool eligible;
+
+ eligible = !vcpu->spin_loop.in_spin_loop ||
+ (vcpu->spin_loop.in_spin_loop &&
+ vcpu->spin_loop.dy_eligible);
+
+ if (vcpu->spin_loop.in_spin_loop)
+ kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+
+ return eligible;
+}
+#endif
void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
struct kvm *kvm = me->kvm;
@@ -1543,6 +1706,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
int pass;
int i;
+ kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
* currently running, because it got preempted by something
@@ -1552,9 +1716,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
*/
for (pass = 0; pass < 2 && !yielded; pass++) {
kvm_for_each_vcpu(i, vcpu, kvm) {
- struct task_struct *task = NULL;
- struct pid *pid;
- if (!pass && i < last_boosted_vcpu) {
+ if (!pass && i <= last_boosted_vcpu) {
i = last_boosted_vcpu;
continue;
} else if (pass && i > last_boosted_vcpu)
@@ -1563,26 +1725,19 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
continue;
if (waitqueue_active(&vcpu->wq))
continue;
- rcu_read_lock();
- pid = rcu_dereference(vcpu->pid);
- if (pid)
- task = get_pid_task(vcpu->pid, PIDTYPE_PID);
- rcu_read_unlock();
- if (!task)
- continue;
- if (task->flags & PF_VCPU) {
- put_task_struct(task);
+ if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
continue;
- }
- if (yield_to(task, 1)) {
- put_task_struct(task);
+ if (kvm_vcpu_yield_to(vcpu)) {
kvm->last_boosted_vcpu = i;
yielded = 1;
break;
}
- put_task_struct(task);
}
}
+ kvm_vcpu_set_in_spin_loop(me, false);
+
+ /* Ensure vcpu is not eligible during next spinloop */
+ kvm_vcpu_set_dy_eligible(me, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
@@ -1602,7 +1757,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
else
- return VM_FAULT_SIGBUS;
+ return kvm_arch_vcpu_fault(vcpu, vmf);
get_page(page);
vmf->page = page;
return 0;
@@ -1663,6 +1818,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
goto vcpu_destroy;
mutex_lock(&kvm->lock);
+ if (!kvm_vcpu_compatible(vcpu)) {
+ r = -EINVAL;
+ goto unlock_vcpu_destroy;
+ }
if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
r = -EINVAL;
goto unlock_vcpu_destroy;
@@ -1688,10 +1847,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
smp_wmb();
atomic_inc(&kvm->online_vcpus);
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
- if (kvm->bsp_vcpu_id == id)
- kvm->bsp_vcpu = vcpu;
-#endif
mutex_unlock(&kvm->lock);
return r;
@@ -1735,7 +1890,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
#endif
- vcpu_load(vcpu);
+ r = vcpu_load(vcpu);
+ if (r)
+ return r;
switch (ioctl) {
case KVM_RUN:
r = -EINVAL;
@@ -1766,12 +1923,11 @@ out_free1:
struct kvm_regs *kvm_regs;
r = -ENOMEM;
- kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
- if (!kvm_regs)
+ kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
+ if (IS_ERR(kvm_regs)) {
+ r = PTR_ERR(kvm_regs);
goto out;
- r = -EFAULT;
- if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
- goto out_free2;
+ }
r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
if (r)
goto out_free2;
@@ -1795,13 +1951,11 @@ out_free2:
break;
}
case KVM_SET_SREGS: {
- kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
- r = -ENOMEM;
- if (!kvm_sregs)
- goto out;
- r = -EFAULT;
- if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
+ kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
+ if (IS_ERR(kvm_sregs)) {
+ r = PTR_ERR(kvm_sregs);
goto out;
+ }
r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
if (r)
goto out;
@@ -1897,13 +2051,11 @@ out_free2:
break;
}
case KVM_SET_FPU: {
- fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
- r = -ENOMEM;
- if (!fpu)
- goto out;
- r = -EFAULT;
- if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
+ fpu = memdup_user(argp, sizeof(*fpu));
+ if (IS_ERR(fpu)) {
+ r = PTR_ERR(fpu);
goto out;
+ }
r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
if (r)
goto out;
@@ -1950,9 +2102,10 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
if (copy_from_user(&csigset, sigmask_arg->sigset,
sizeof csigset))
goto out;
- }
- sigset_from_compat(&sigset, &csigset);
- r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+ sigset_from_compat(&sigset, &csigset);
+ r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+ } else
+ r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
break;
}
default:
@@ -2033,7 +2186,7 @@ static long kvm_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&data, argp, sizeof data))
goto out;
- r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
+ r = kvm_irqfd(kvm, &data);
break;
}
case KVM_IOEVENTFD: {
@@ -2056,6 +2209,40 @@ static long kvm_vm_ioctl(struct file *filp,
mutex_unlock(&kvm->lock);
break;
#endif
+#ifdef CONFIG_HAVE_KVM_MSI
+ case KVM_SIGNAL_MSI: {
+ struct kvm_msi msi;
+
+ r = -EFAULT;
+ if (copy_from_user(&msi, argp, sizeof msi))
+ goto out;
+ r = kvm_send_userspace_msi(kvm, &msi);
+ break;
+ }
+#endif
+#ifdef __KVM_HAVE_IRQ_LINE
+ case KVM_IRQ_LINE_STATUS:
+ case KVM_IRQ_LINE: {
+ struct kvm_irq_level irq_event;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq_event, argp, sizeof irq_event))
+ goto out;
+
+ r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (ioctl == KVM_IRQ_LINE_STATUS) {
+ if (copy_to_user(argp, &irq_event, sizeof irq_event))
+ goto out;
+ }
+
+ r = 0;
+ break;
+ }
+#endif
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
if (r == -ENOTTY)
@@ -2152,12 +2339,12 @@ static struct file_operations kvm_vm_fops = {
.llseek = noop_llseek,
};
-static int kvm_dev_ioctl_create_vm(void)
+static int kvm_dev_ioctl_create_vm(unsigned long type)
{
int r;
struct kvm *kvm;
- kvm = kvm_create_vm();
+ kvm = kvm_create_vm(type);
if (IS_ERR(kvm))
return PTR_ERR(kvm);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
@@ -2184,8 +2371,11 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
case KVM_CAP_SET_BOOT_CPU_ID:
#endif
case KVM_CAP_INTERNAL_ERROR_DATA:
+#ifdef CONFIG_HAVE_KVM_MSI
+ case KVM_CAP_SIGNAL_MSI:
+#endif
return 1;
-#ifdef CONFIG_HAVE_KVM_IRQCHIP
+#ifdef KVM_CAP_IRQ_ROUTING
case KVM_CAP_IRQ_ROUTING:
return KVM_MAX_IRQ_ROUTES;
#endif
@@ -2208,10 +2398,7 @@ static long kvm_dev_ioctl(struct file *filp,
r = KVM_API_VERSION;
break;
case KVM_CREATE_VM:
- r = -EINVAL;
- if (arg)
- goto out;
- r = kvm_dev_ioctl_create_vm();
+ r = kvm_dev_ioctl_create_vm(arg);
break;
case KVM_CHECK_EXTENSION:
r = kvm_dev_ioctl_check_extension_generic(arg);
@@ -2391,24 +2578,89 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
int i;
for (i = 0; i < bus->dev_count; i++) {
- struct kvm_io_device *pos = bus->devs[i];
+ struct kvm_io_device *pos = bus->range[i].dev;
kvm_iodevice_destructor(pos);
}
kfree(bus);
}
+int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+{
+ const struct kvm_io_range *r1 = p1;
+ const struct kvm_io_range *r2 = p2;
+
+ if (r1->addr < r2->addr)
+ return -1;
+ if (r1->addr + r1->len > r2->addr + r2->len)
+ return 1;
+ return 0;
+}
+
+int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
+ gpa_t addr, int len)
+{
+ bus->range[bus->dev_count++] = (struct kvm_io_range) {
+ .addr = addr,
+ .len = len,
+ .dev = dev,
+ };
+
+ sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
+ kvm_io_bus_sort_cmp, NULL);
+
+ return 0;
+}
+
+int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
+ gpa_t addr, int len)
+{
+ struct kvm_io_range *range, key;
+ int off;
+
+ key = (struct kvm_io_range) {
+ .addr = addr,
+ .len = len,
+ };
+
+ range = bsearch(&key, bus->range, bus->dev_count,
+ sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
+ if (range == NULL)
+ return -ENOENT;
+
+ off = range - bus->range;
+
+ while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
+ off--;
+
+ return off;
+}
+
/* kvm_io_bus_write - called under kvm->slots_lock */
int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, const void *val)
{
- int i;
+ int idx;
struct kvm_io_bus *bus;
+ struct kvm_io_range range;
+
+ range = (struct kvm_io_range) {
+ .addr = addr,
+ .len = len,
+ };
bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
- for (i = 0; i < bus->dev_count; i++)
- if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+ idx = kvm_io_bus_get_first_dev(bus, addr, len);
+ if (idx < 0)
+ return -EOPNOTSUPP;
+
+ while (idx < bus->dev_count &&
+ kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
+ if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
return 0;
+ idx++;
+ }
+
return -EOPNOTSUPP;
}
@@ -2416,31 +2668,47 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, void *val)
{
- int i;
+ int idx;
struct kvm_io_bus *bus;
+ struct kvm_io_range range;
+
+ range = (struct kvm_io_range) {
+ .addr = addr,
+ .len = len,
+ };
bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
- for (i = 0; i < bus->dev_count; i++)
- if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
+ idx = kvm_io_bus_get_first_dev(bus, addr, len);
+ if (idx < 0)
+ return -EOPNOTSUPP;
+
+ while (idx < bus->dev_count &&
+ kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
+ if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
return 0;
+ idx++;
+ }
+
return -EOPNOTSUPP;
}
/* Caller must hold slots_lock. */
-int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
- struct kvm_io_device *dev)
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+ int len, struct kvm_io_device *dev)
{
struct kvm_io_bus *new_bus, *bus;
bus = kvm->buses[bus_idx];
- if (bus->dev_count > NR_IOBUS_DEVS-1)
+ if (bus->dev_count > NR_IOBUS_DEVS - 1)
return -ENOSPC;
- new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
+ new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
+ sizeof(struct kvm_io_range)), GFP_KERNEL);
if (!new_bus)
return -ENOMEM;
- memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
- new_bus->devs[new_bus->dev_count++] = dev;
+ memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count *
+ sizeof(struct kvm_io_range)));
+ kvm_io_bus_insert_dev(new_bus, dev, addr, len);
rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
synchronize_srcu_expedited(&kvm->srcu);
kfree(bus);
@@ -2455,25 +2723,26 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
int i, r;
struct kvm_io_bus *new_bus, *bus;
- new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
- if (!new_bus)
- return -ENOMEM;
-
bus = kvm->buses[bus_idx];
- memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
-
r = -ENOENT;
- for (i = 0; i < new_bus->dev_count; i++)
- if (new_bus->devs[i] == dev) {
+ for (i = 0; i < bus->dev_count; i++)
+ if (bus->range[i].dev == dev) {
r = 0;
- new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
break;
}
- if (r) {
- kfree(new_bus);
+ if (r)
return r;
- }
+
+ new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
+ sizeof(struct kvm_io_range)), GFP_KERNEL);
+ if (!new_bus)
+ return -ENOMEM;
+
+ memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
+ new_bus->dev_count--;
+ memcpy(new_bus->range + i, bus->range + i + 1,
+ (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
synchronize_srcu_expedited(&kvm->srcu);
@@ -2524,15 +2793,29 @@ static const struct file_operations *stat_fops[] = {
[KVM_STAT_VM] = &vm_stat_fops,
};
-static void kvm_init_debug(void)
+static int kvm_init_debug(void)
{
+ int r = -EFAULT;
struct kvm_stats_debugfs_item *p;
kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
- for (p = debugfs_entries; p->name; ++p)
+ if (kvm_debugfs_dir == NULL)
+ goto out;
+
+ for (p = debugfs_entries; p->name; ++p) {
p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
(void *)(long)p->offset,
stat_fops[p->kind]);
+ if (p->dentry == NULL)
+ goto out_dir;
+ }
+
+ return 0;
+
+out_dir:
+ debugfs_remove_recursive(kvm_debugfs_dir);
+out:
+ return r;
}
static void kvm_exit_debug(void)
@@ -2564,9 +2847,6 @@ static struct syscore_ops kvm_syscore_ops = {
.resume = kvm_resume,
};
-struct page *bad_page;
-pfn_t bad_pfn;
-
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
@@ -2598,33 +2878,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
if (r)
goto out_fail;
- bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (bad_page == NULL) {
- r = -ENOMEM;
- goto out;
- }
-
- bad_pfn = page_to_pfn(bad_page);
-
- hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (hwpoison_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- hwpoison_pfn = page_to_pfn(hwpoison_page);
-
- fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (fault_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- fault_pfn = page_to_pfn(fault_page);
-
if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
r = -ENOMEM;
goto out_free_0;
@@ -2676,10 +2929,16 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;
- kvm_init_debug();
+ r = kvm_init_debug();
+ if (r) {
+ printk(KERN_ERR "kvm: create debugfs files failed\n");
+ goto out_undebugfs;
+ }
return 0;
+out_undebugfs:
+ unregister_syscore_ops(&kvm_syscore_ops);
out_unreg:
kvm_async_pf_deinit();
out_free:
@@ -2693,12 +2952,6 @@ out_free_1:
out_free_0a:
free_cpumask_var(cpus_hardware_enabled);
out_free_0:
- if (fault_page)
- __free_page(fault_page);
- if (hwpoison_page)
- __free_page(hwpoison_page);
- __free_page(bad_page);
-out:
kvm_arch_exit();
out_fail:
return r;
@@ -2718,7 +2971,5 @@ void kvm_exit(void)
kvm_arch_hardware_unsetup();
kvm_arch_exit();
free_cpumask_var(cpus_hardware_enabled);
- __free_page(hwpoison_page);
- __free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);