From 3deda5e50c893be38c1b6b3a73f8f8fb5560baa4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 20 Dec 2016 14:02:29 +1100 Subject: [PATCH 01/42] KVM: PPC: Book3S HV: Don't try to signal cpu -1 If the target vcpu for kvmppc_fast_vcpu_kick_hv() is not running on any CPU, then we will have vcpu->arch.thread_cpu == -1, and as it happens, kvmppc_fast_vcpu_kick_hv will call kvmppc_ipi_thread with -1 as the cpu argument. Although this is not meaningful, in the past, before commit 1704a81ccebc ("KVM: PPC: Book3S HV: Use msgsnd for IPIs to other cores on POWER9", 2016-11-18), it was harmless because CPU -1 is not in the same core as any real CPU thread. On a POWER9, however, we don't do the "same core" check, so we were trying to do a msgsnd to thread -1, which is invalid. To avoid this, we add a check to see that vcpu->arch.thread_cpu is >= 0 before calling kvmppc_ipi_thread() with it. Since vcpu->arch.thread_cpu can change asynchronously, we use READ_ONCE to ensure that the value we check is the same value that we use as the argument to kvmppc_ipi_thread(). 
Fixes: 1704a81ccebc ("KVM: PPC: Book3S HV: Use msgsnd for IPIs to other cores on POWER9") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ec34e39471a7..8d9cc07b1e9c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -182,7 +182,8 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) ++vcpu->stat.halt_wakeup; } - if (kvmppc_ipi_thread(vcpu->arch.thread_cpu)) + cpu = READ_ONCE(vcpu->arch.thread_cpu); + if (cpu >= 0 && kvmppc_ipi_thread(cpu)) return; /* CPU points to the first thread of the core */ From 5efa6605151b84029edeb2e07f2d2d74b52c106f Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:32 +0800 Subject: [PATCH 02/42] KVM: PPC: Book 3S: XICS cleanup: remove XICS_RM_REJECT Commit b0221556dbd3 ("KVM: PPC: Book3S HV: Move virtual mode ICP functions to real-mode") removed the setting of the XICS_RM_REJECT flag. And since that commit, nothing else sets the flag any more, so we can remove the flag and the remaining code that handles it, including the counter that counts how many times it gets set. 
Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xics.c | 12 +++--------- arch/powerpc/kvm/book3s_xics.h | 2 -- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 20dff102a06f..debda779a240 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -832,10 +832,6 @@ int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) icp->n_rm_check_resend++; icp_check_resend(xics, icp->rm_resend_icp); } - if (icp->rm_action & XICS_RM_REJECT) { - icp->n_rm_reject++; - icp_deliver_irq(xics, icp, icp->rm_reject); - } if (icp->rm_action & XICS_RM_NOTIFY_EOI) { icp->n_rm_notify_eoi++; kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq); @@ -920,7 +916,7 @@ static int xics_debug_show(struct seq_file *m, void *private) int icsid, i; unsigned long flags; unsigned long t_rm_kick_vcpu, t_rm_check_resend; - unsigned long t_rm_reject, t_rm_notify_eoi; + unsigned long t_rm_notify_eoi; unsigned long t_reject, t_check_resend; if (!kvm) @@ -929,7 +925,6 @@ static int xics_debug_show(struct seq_file *m, void *private) t_rm_kick_vcpu = 0; t_rm_notify_eoi = 0; t_rm_check_resend = 0; - t_rm_reject = 0; t_check_resend = 0; t_reject = 0; @@ -952,14 +947,13 @@ static int xics_debug_show(struct seq_file *m, void *private) t_rm_kick_vcpu += icp->n_rm_kick_vcpu; t_rm_notify_eoi += icp->n_rm_notify_eoi; t_rm_check_resend += icp->n_rm_check_resend; - t_rm_reject += icp->n_rm_reject; t_check_resend += icp->n_check_resend; t_reject += icp->n_reject; } - seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", + seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu notify_eoi=%lu\n", t_rm_kick_vcpu, t_rm_check_resend, - t_rm_reject, t_rm_notify_eoi); + t_rm_notify_eoi); seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n", t_check_resend, t_reject); for (icsid = 0; icsid <= 
KVMPPC_XICS_MAX_ICS_ID; icsid++) { diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 2a50320b55ca..1d5fac80b706 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -73,7 +73,6 @@ struct kvmppc_icp { */ #define XICS_RM_KICK_VCPU 0x1 #define XICS_RM_CHECK_RESEND 0x2 -#define XICS_RM_REJECT 0x4 #define XICS_RM_NOTIFY_EOI 0x8 u32 rm_action; struct kvm_vcpu *rm_kick_target; @@ -84,7 +83,6 @@ struct kvmppc_icp { /* Counters for each reason we exited real mode */ unsigned long n_rm_kick_vcpu; unsigned long n_rm_check_resend; - unsigned long n_rm_reject; unsigned long n_rm_notify_eoi; /* Counters for handling ICP processing in real mode */ unsigned long n_check_resend; From 37451bc95dee0e666927d6ffdda302dbbaaae6fa Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:33 +0800 Subject: [PATCH 03/42] KVM: PPC: Book 3S: XICS: correct the real mode ICP rejecting counter Some counters are added in Commit 6e0365b78273 ("KVM: PPC: Book3S HV: Add ICP real mode counters"), to provide some performance statistics to determine whether further optimizing is needed for real mode functions. The n_reject counter counts how many times ICP rejects an irq because of priority in real mode. The redelivery of an lsi that is still asserted after eoi doesn't fall into this category, so the increment there is removed. Also, it needs to be incremented in icp_rm_deliver_irq() if it rejects another one. 
Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 06edc4366639..9f6c8fe88db6 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -380,6 +380,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ if (reject && reject != XICS_IPI) { arch_spin_unlock(&ics->lock); + icp->n_reject++; new_irq = reject; goto again; } @@ -711,10 +712,8 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) state = &ics->irq_state[src]; /* Still asserted, resend it */ - if (state->asserted) { - icp->n_reject++; + if (state->asserted) icp_rm_deliver_irq(xics, icp, irq); - } if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { icp->rm_action |= XICS_RM_NOTIFY_EOI; From bf5a71d53835110d46d33eb5335713ffdbff9ab6 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:34 +0800 Subject: [PATCH 04/42] KVM: PPC: Book 3S: XICS: Fix potential issue with duplicate IRQ resends It is possible that in the following order, one irq is resent twice: CPU 1 CPU 2 ics_check_resend() lock ics_lock see resend set unlock ics_lock /* change affinity of the irq */ kvmppc_xics_set_xive() write_xive() lock ics_lock see resend set unlock ics_lock icp_deliver_irq() /* resend */ icp_deliver_irq() /* resend again */ It doesn't have any user-visible effect at present, but needs to be avoided when the following patch implementing the P/Q stuff is applied. This patch clears the resend flag before releasing the ics lock, when we know we will do a re-delivery after checking the flag, or setting the flag. 
Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 3 +++ arch/powerpc/kvm/book3s_xics.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 9f6c8fe88db6..16349c9e86ba 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -52,6 +52,8 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics, if (!state->resend) continue; + state->resend = 0; + arch_spin_unlock(&ics->lock); icp_rm_deliver_irq(xics, icp, state->number); arch_spin_lock(&ics->lock); @@ -400,6 +402,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ smp_mb(); if (!icp->state.need_resend) { + state->resend = 0; arch_spin_unlock(&ics->lock); goto again; } diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index debda779a240..cdfb4ed73601 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -125,6 +125,8 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, if (!state->resend) continue; + state->resend = 0; + XICS_DBG("resend %#x prio %#x\n", state->number, state->priority); @@ -155,6 +157,7 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, deliver = false; if ((state->masked_pending || state->resend) && priority != MASKED) { state->masked_pending = 0; + state->resend = 0; deliver = true; } @@ -488,6 +491,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ smp_mb(); if (!icp->state.need_resend) { + state->resend = 0; arch_spin_unlock(&ics->lock); local_irq_restore(flags); goto again; From 17d48610ae0fa218aa386b16a538c792991a3652 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:35 +0800 Subject: [PATCH 05/42] KVM: PPC: Book 3S: XICS: Implement ICS P/Q states This patch implements P(Presented)/Q(Queued) states for ICS irqs. 
When the interrupt is presented, set P. Present if P was not set. If P is already set, don't present again, set Q. When the interrupt is EOI'ed, move Q into P (and clear Q). If it is set, re-present. The asserted flag used by LSI is also incorporated into the P bit. When the irq state is saved, P/Q bits are also saved, they need some qemu modifications to be recognized and passed around to be restored. KVM_XICS_PENDING bit set and saved should also indicate KVM_XICS_PRESENTED bit set and saved. But it is possible some old code doesn't have/recognize the P bit, so when we restore, we set P for PENDING bit, too. The idea and much of the code come from Ben. Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/include/uapi/asm/kvm.h | 2 + arch/powerpc/kvm/book3s_hv_rm_xics.c | 100 ++++++++++++++------- arch/powerpc/kvm/book3s_xics.c | 125 ++++++++++++++++++--------- arch/powerpc/kvm/book3s_xics.h | 5 +- 4 files changed, 161 insertions(+), 71 deletions(-) diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 3603b6f51b11..e3db3a50127b 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -613,5 +613,7 @@ struct kvm_get_htab_header { #define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40) #define KVM_XICS_MASKED (1ULL << 41) #define KVM_XICS_PENDING (1ULL << 42) +#define KVM_XICS_PRESENTED (1ULL << 43) +#define KVM_XICS_QUEUED (1ULL << 44) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 16349c9e86ba..30f82c79de5d 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -672,51 +672,39 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) return check_too_hard(xics, icp); } -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct 
kvmppc_icp *icp = vcpu->arch.icp; struct kvmppc_ics *ics; struct ics_irq_state *state; - u32 irq = xirr & 0x00ffffff; u16 src; - - if (!xics || !xics->real_mode) - return H_TOO_HARD; + u32 pq_old, pq_new; /* - * ICP State: EOI + * ICS EOI handling: For LSI, if P bit is still set, we need to + * resend it. * - * Note: If EOI is incorrectly used by SW to lower the CPPR - * value (ie more favored), we do not check for rejection of - * a pending interrupt, this is a SW error and PAPR sepcifies - * that we don't have to deal with it. - * - * The sending of an EOI to the ICS is handled after the - * CPPR update - * - * ICP State: Down_CPPR which we handle - * in a separate function as it's shared with H_CPPR. + * For MSI, we move Q bit into P (and clear Q). If it is set, + * resend it. */ - icp_rm_down_cppr(xics, icp, xirr >> 24); - /* IPIs have no EOI */ - if (irq == XICS_IPI) - goto bail; - /* - * EOI handling: If the interrupt is still asserted, we need to - * resend it. We can take a lockless "peek" at the ICS state here. 
- * - * "Message" interrupts will never have "asserted" set - */ ics = kvmppc_xics_find_ics(xics, irq, &src); if (!ics) goto bail; + state = &ics->irq_state[src]; - /* Still asserted, resend it */ - if (state->asserted) - icp_rm_deliver_irq(xics, icp, irq); + if (state->lsi) + pq_new = state->pq_state; + else + do { + pq_old = state->pq_state; + pq_new = pq_old >> 1; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + if (pq_new & PQ_PRESENTED) + icp_rm_deliver_irq(xics, NULL, irq); if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { icp->rm_action |= XICS_RM_NOTIFY_EOI; @@ -737,10 +725,43 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) state->intr_cpu = -1; } } + bail: return check_too_hard(xics, icp); } +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 irq = xirr & 0x00ffffff; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: EOI + * + * Note: If EOI is incorrectly used by SW to lower the CPPR + * value (ie more favored), we do not check for rejection of + * a pending interrupt, this is a SW error and PAPR specifies + * that we don't have to deal with it. + * + * The sending of an EOI to the ICS is handled after the + * CPPR update + * + * ICP State: Down_CPPR which we handle + * in a separate function as it's shared with H_CPPR. 
+ */ + icp_rm_down_cppr(xics, icp, xirr >> 24); + + /* IPIs have no EOI */ + if (irq == XICS_IPI) + return check_too_hard(xics, icp); + + return ics_rm_eoi(vcpu, irq); +} + unsigned long eoi_rc; static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) @@ -827,14 +848,33 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, { struct kvmppc_xics *xics; struct kvmppc_icp *icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; u32 irq; + u16 src; + u32 pq_old, pq_new; irq = irq_map->v_hwirq; xics = vcpu->kvm->arch.xics; icp = vcpu->arch.icp; kvmppc_rm_handle_irq_desc(irq_map->desc); - icp_rm_deliver_irq(xics, icp, irq); + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return 2; + + state = &ics->irq_state[src]; + + /* only MSIs register bypass producers, so it must be MSI here */ + do { + pq_old = state->pq_state; + pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + /* Test P=1, Q=0, this is the only case where we present */ + if (pq_new == PQ_PRESENTED) + icp_rm_deliver_irq(xics, icp, irq); /* EOI the interrupt */ icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index cdfb4ed73601..c7620622c846 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -75,6 +75,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) struct ics_irq_state *state; struct kvmppc_ics *ics; u16 src; + u32 pq_old, pq_new; XICS_DBG("ics deliver %#x (level: %d)\n", irq, level); @@ -87,25 +88,41 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) if (!state->exists) return -EINVAL; + if (level == KVM_INTERRUPT_SET_LEVEL || level == KVM_INTERRUPT_SET) + level = 1; + else if (level == KVM_INTERRUPT_UNSET) + level = 0; /* - * We set state->asserted locklessly. 
This should be fine as - * we are the only setter, thus concurrent access is undefined - * to begin with. + * Take other values the same as 1, consistent with original code. + * maybe WARN here? */ - if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL) - state->asserted = 1; - else if (level == 0 || level == KVM_INTERRUPT_UNSET) { - state->asserted = 0; + + if (!state->lsi && level == 0) /* noop for MSI */ return 0; - } + + do { + pq_old = state->pq_state; + if (state->lsi) { + if (level) { + if (pq_old & PQ_PRESENTED) + /* Setting already set LSI ... */ + return 0; + + pq_new = PQ_PRESENTED; + } else + pq_new = 0; + } else + pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + /* Test P=1, Q=0, this is the only case where we present */ + if (pq_new == PQ_PRESENTED) + icp_deliver_irq(xics, NULL, irq); /* Record which CPU this arrived on for passed-through interrupts */ if (state->host_irq) state->intr_cpu = raw_smp_processor_id(); - /* Attempt delivery */ - icp_deliver_irq(xics, NULL, irq); - return 0; } @@ -768,14 +785,51 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) icp_deliver_irq(xics, icp, reject); } -static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; struct kvmppc_ics *ics; struct ics_irq_state *state; - u32 irq = xirr & 0x00ffffff; u16 src; + u32 pq_old, pq_new; + + /* + * ICS EOI handling: For LSI, if P bit is still set, we need to + * resend it. + * + * For MSI, we move Q bit into P (and clear Q). If it is set, + * resend it. 
+ */ + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) { + XICS_DBG("ios_eoi: IRQ 0x%06x not found !\n", irq); + return H_PARAMETER; + } + state = &ics->irq_state[src]; + + if (state->lsi) + pq_new = state->pq_state; + else + do { + pq_old = state->pq_state; + pq_new = pq_old >> 1; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + if (pq_new & PQ_PRESENTED) + icp_deliver_irq(xics, icp, irq); + + kvm_notify_acked_irq(vcpu->kvm, 0, irq); + + return H_SUCCESS; +} + +static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 irq = xirr & 0x00ffffff; XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr); @@ -798,26 +852,8 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) /* IPIs have no EOI */ if (irq == XICS_IPI) return H_SUCCESS; - /* - * EOI handling: If the interrupt is still asserted, we need to - * resend it. We can take a lockless "peek" at the ICS state here. 
- * - * "Message" interrupts will never have "asserted" set - */ - ics = kvmppc_xics_find_ics(xics, irq, &src); - if (!ics) { - XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq); - return H_PARAMETER; - } - state = &ics->irq_state[src]; - /* Still asserted, resend it */ - if (state->asserted) - icp_deliver_irq(xics, icp, irq); - - kvm_notify_acked_irq(vcpu->kvm, 0, irq); - - return H_SUCCESS; + return ics_eoi(vcpu, irq); } int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) @@ -975,9 +1011,9 @@ static int xics_debug_show(struct seq_file *m, void *private) for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *irq = &ics->irq_state[i]; - seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n", + seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x pq_state %d resend %d masked pending %d\n", irq->number, irq->server, irq->priority, - irq->saved_priority, irq->asserted, + irq->saved_priority, irq->pq_state, irq->resend, irq->masked_pending); } @@ -1196,10 +1232,17 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) val |= prio << KVM_XICS_PRIORITY_SHIFT; if (irqp->lsi) { val |= KVM_XICS_LEVEL_SENSITIVE; - if (irqp->asserted) + if (irqp->pq_state & PQ_PRESENTED) val |= KVM_XICS_PENDING; } else if (irqp->masked_pending || irqp->resend) val |= KVM_XICS_PENDING; + + if (irqp->pq_state & PQ_PRESENTED) + val |= KVM_XICS_PRESENTED; + + if (irqp->pq_state & PQ_QUEUED) + val |= KVM_XICS_QUEUED; + ret = 0; } arch_spin_unlock(&ics->lock); @@ -1251,12 +1294,14 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) irqp->resend = 0; irqp->masked_pending = 0; irqp->lsi = 0; - irqp->asserted = 0; - if (val & KVM_XICS_LEVEL_SENSITIVE) { + irqp->pq_state = 0; + if (val & KVM_XICS_LEVEL_SENSITIVE) irqp->lsi = 1; - if (val & KVM_XICS_PENDING) - irqp->asserted = 1; - } + /* If PENDING, set P in case P is not saved because of old code */ + if (val & 
KVM_XICS_PRESENTED || val & KVM_XICS_PENDING) + irqp->pq_state |= PQ_PRESENTED; + if (val & KVM_XICS_QUEUED) + irqp->pq_state |= PQ_QUEUED; irqp->exists = 1; arch_spin_unlock(&ics->lock); local_irq_restore(flags); diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 1d5fac80b706..ec5474cf70c6 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -31,16 +31,19 @@ /* Priority value to use for disabling an interrupt */ #define MASKED 0xff +#define PQ_PRESENTED 1 +#define PQ_QUEUED 2 + /* State for one irq source */ struct ics_irq_state { u32 number; u32 server; + u32 pq_state; u8 priority; u8 saved_priority; u8 resend; u8 masked_pending; u8 lsi; /* level-sensitive interrupt */ - u8 asserted; /* Only for LSI */ u8 exists; int intr_cpu; u32 host_irq; From 21acd0e4df04f02176e773468658c3cebff096bb Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:36 +0800 Subject: [PATCH 06/42] KVM: PPC: Book 3S: XICS: Don't lock twice when checking for resend This patch improves the code that takes lock twice to check the resend flag and do the actual resending, by checking the resend flag locklessly, and add a boolean parameter check_resend to icp_[rm_]deliver_irq(), so the resend flag can be checked in the lock when doing the delivery. We need make sure when we clear the ics's bit in the icp's resend_map, we don't miss the resend flag of the irqs that set the bit. It could be ordered through the barrier in test_and_clear_bit(), and a newly added wmb between setting irq's resend flag, and icp's resend_map. 
Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 40 ++++++++++--------- arch/powerpc/kvm/book3s_xics.c | 59 +++++++++++++--------------- 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 30f82c79de5d..44cfdd281fa1 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -35,7 +35,7 @@ int kvm_irq_bypass = 1; EXPORT_SYMBOL(kvm_irq_bypass); static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq); + u32 new_irq, bool check_resend); static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu); /* -- ICS routines -- */ @@ -44,22 +44,12 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics, { int i; - arch_spin_lock(&ics->lock); - for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *state = &ics->irq_state[i]; - - if (!state->resend) - continue; - - state->resend = 0; - - arch_spin_unlock(&ics->lock); - icp_rm_deliver_irq(xics, icp, state->number); - arch_spin_lock(&ics->lock); + if (state->resend) + icp_rm_deliver_irq(xics, icp, state->number, true); } - arch_spin_unlock(&ics->lock); } /* -- ICP routines -- */ @@ -292,7 +282,7 @@ static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, } static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq) + u32 new_irq, bool check_resend) { struct ics_irq_state *state; struct kvmppc_ics *ics; @@ -337,6 +327,10 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } } + if (check_resend) + if (!state->resend) + goto out; + /* Clear the resend bit of that interrupt */ state->resend = 0; @@ -384,6 +378,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, arch_spin_unlock(&ics->lock); icp->n_reject++; new_irq = reject; + check_resend = 0; goto again; } } else { 
@@ -391,9 +386,15 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * We failed to deliver the interrupt we need to set the * resend map bit and mark the ICS state as needing a resend */ - set_bit(ics->icsid, icp->resend_map); state->resend = 1; + /* + * Make sure when checking resend, we don't miss the resend + * if resend_map bit is seen and cleared. + */ + smp_wmb(); + set_bit(ics->icsid, icp->resend_map); + /* * If the need_resend flag got cleared in the ICP some time * between icp_rm_try_to_deliver() atomic update and now, then @@ -404,6 +405,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, if (!icp->state.need_resend) { state->resend = 0; arch_spin_unlock(&ics->lock); + check_resend = 0; goto again; } } @@ -598,7 +600,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, /* Handle reject in real mode */ if (reject && reject != XICS_IPI) { this_icp->n_reject++; - icp_rm_deliver_irq(xics, icp, reject); + icp_rm_deliver_irq(xics, icp, reject, false); } /* Handle resends in real mode */ @@ -666,7 +668,7 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) */ if (reject && reject != XICS_IPI) { icp->n_reject++; - icp_rm_deliver_irq(xics, icp, reject); + icp_rm_deliver_irq(xics, icp, reject, false); } bail: return check_too_hard(xics, icp); @@ -704,7 +706,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); if (pq_new & PQ_PRESENTED) - icp_rm_deliver_irq(xics, NULL, irq); + icp_rm_deliver_irq(xics, NULL, irq, false); if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { icp->rm_action |= XICS_RM_NOTIFY_EOI; @@ -874,7 +876,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, /* Test P=1, Q=0, this is the only case where we present */ if (pq_new == PQ_PRESENTED) - icp_rm_deliver_irq(xics, icp, irq); + icp_rm_deliver_irq(xics, icp, irq, false); /* EOI the interrupt */ 
icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index c7620622c846..e48803e2918d 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -63,7 +63,7 @@ /* -- ICS routines -- */ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq); + u32 new_irq, bool check_resend); /* * Return value ideally indicates how the interrupt was handled, but no @@ -117,7 +117,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) /* Test P=1, Q=0, this is the only case where we present */ if (pq_new == PQ_PRESENTED) - icp_deliver_irq(xics, NULL, irq); + icp_deliver_irq(xics, NULL, irq, false); /* Record which CPU this arrived on for passed-through interrupts */ if (state->host_irq) @@ -131,31 +131,14 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, { int i; - unsigned long flags; - - local_irq_save(flags); - arch_spin_lock(&ics->lock); - for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *state = &ics->irq_state[i]; - - if (!state->resend) - continue; - - state->resend = 0; - - XICS_DBG("resend %#x prio %#x\n", state->number, - state->priority); - - arch_spin_unlock(&ics->lock); - local_irq_restore(flags); - icp_deliver_irq(xics, icp, state->number); - local_irq_save(flags); - arch_spin_lock(&ics->lock); + if (state->resend) { + XICS_DBG("resend %#x prio %#x\n", state->number, + state->priority); + icp_deliver_irq(xics, icp, state->number, true); + } } - - arch_spin_unlock(&ics->lock); - local_irq_restore(flags); } static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, @@ -209,7 +192,7 @@ int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) state->masked_pending, state->resend); if (write_xive(xics, ics, state, server, priority, priority)) - icp_deliver_irq(xics, icp, irq); + icp_deliver_irq(xics, icp, irq, false); 
return 0; } @@ -262,7 +245,7 @@ int kvmppc_xics_int_on(struct kvm *kvm, u32 irq) if (write_xive(xics, ics, state, state->server, state->saved_priority, state->saved_priority)) - icp_deliver_irq(xics, icp, irq); + icp_deliver_irq(xics, icp, irq, false); return 0; } @@ -396,7 +379,7 @@ static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, } static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq) + u32 new_irq, bool check_resend) { struct ics_irq_state *state; struct kvmppc_ics *ics; @@ -442,6 +425,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } } + if (check_resend) + if (!state->resend) + goto out; + /* Clear the resend bit of that interrupt */ state->resend = 0; @@ -490,6 +477,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, arch_spin_unlock(&ics->lock); local_irq_restore(flags); new_irq = reject; + check_resend = 0; goto again; } } else { @@ -497,9 +485,15 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * We failed to deliver the interrupt we need to set the * resend map bit and mark the ICS state as needing a resend */ - set_bit(ics->icsid, icp->resend_map); state->resend = 1; + /* + * Make sure when checking resend, we don't miss the resend + * if resend_map bit is seen and cleared. 
+ */ + smp_wmb(); + set_bit(ics->icsid, icp->resend_map); + /* * If the need_resend flag got cleared in the ICP some time * between icp_try_to_deliver() atomic update and now, then @@ -511,6 +505,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, state->resend = 0; arch_spin_unlock(&ics->lock); local_irq_restore(flags); + check_resend = 0; goto again; } } @@ -702,7 +697,7 @@ static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, /* Handle reject */ if (reject && reject != XICS_IPI) - icp_deliver_irq(xics, icp, reject); + icp_deliver_irq(xics, icp, reject, false); /* Handle resend */ if (resend) @@ -782,7 +777,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) * attempt (see comments in icp_deliver_irq). */ if (reject && reject != XICS_IPI) - icp_deliver_irq(xics, icp, reject); + icp_deliver_irq(xics, icp, reject, false); } static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq) @@ -818,7 +813,7 @@ static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq) } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); if (pq_new & PQ_PRESENTED) - icp_deliver_irq(xics, icp, irq); + icp_deliver_irq(xics, icp, irq, false); kvm_notify_acked_irq(vcpu->kvm, 0, irq); @@ -1307,7 +1302,7 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) local_irq_restore(flags); if (val & KVM_XICS_PENDING) - icp_deliver_irq(xics, NULL, irqp->number); + icp_deliver_irq(xics, NULL, irqp->number, false); return 0; } From d3918e7fd4a27564f93ec46d0359a9739c5deb8d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 22 Dec 2016 04:29:25 +1000 Subject: [PATCH 07/42] KVM: PPC: Book3S: Change interrupt call to reduce scratch space use on HV Change the calling convention to put the trap number together with CR in two halves of r12, which frees up HSTATE_SCRATCH2 in the HV handler. 
The 64-bit PR handler entry translates the calling convention back to match the previous call convention (i.e., shared with 32-bit), for simplicity. Signed-off-by: Nicholas Piggin Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 24 +++++++++++------------ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 16 ++++++++------- arch/powerpc/kvm/book3s_segment.S | 25 +++++++++++++++++------- 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 9a3eee661297..a02a268bde6b 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -233,7 +233,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #endif -#define __KVM_HANDLER_PROLOG(area, n) \ +#define __KVM_HANDLER(area, h, n) \ BEGIN_FTR_SECTION_NESTED(947) \ ld r10,area+EX_CFAR(r13); \ std r10,HSTATE_CFAR(r13); \ @@ -243,30 +243,28 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) std r10,HSTATE_PPR(r13); \ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948); \ ld r10,area+EX_R10(r13); \ - stw r9,HSTATE_SCRATCH1(r13); \ - ld r9,area+EX_R9(r13); \ std r12,HSTATE_SCRATCH0(r13); \ - -#define __KVM_HANDLER(area, h, n) \ - __KVM_HANDLER_PROLOG(area, n) \ - li r12,n; \ + sldi r12,r9,32; \ + ori r12,r12,(n); \ + ld r9,area+EX_R9(r13); \ b kvmppc_interrupt #define __KVM_HANDLER_SKIP(area, h, n) \ cmpwi r10,KVM_GUEST_MODE_SKIP; \ - ld r10,area+EX_R10(r13); \ beq 89f; \ - stw r9,HSTATE_SCRATCH1(r13); \ BEGIN_FTR_SECTION_NESTED(948) \ - ld r9,area+EX_PPR(r13); \ - std r9,HSTATE_PPR(r13); \ + ld r10,area+EX_PPR(r13); \ + std r10,HSTATE_PPR(r13); \ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948); \ - ld r9,area+EX_R9(r13); \ + ld r10,area+EX_R10(r13); \ std r12,HSTATE_SCRATCH0(r13); \ - li r12,n; \ + sldi r12,r9,32; \ + ori r12,r12,(n); \ + ld r9,area+EX_R9(r13); \ b kvmppc_interrupt; \ 89: mtocrf 0x80,r9; \ ld r9,area+EX_R9(r13); \ + ld 
r10,area+EX_R10(r13); \ b kvmppc_skip_##h##interrupt #ifdef CONFIG_KVM_BOOK3S_64_HANDLER diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9338a818e05c..11882aac8216 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1057,19 +1057,18 @@ hdec_soon: kvmppc_interrupt_hv: /* * Register contents: - * R12 = interrupt vector + * R12 = (guest CR << 32) | interrupt vector * R13 = PACA - * guest CR, R12 saved in shadow VCPU SCRATCH1/0 + * guest R12 saved in shadow VCPU SCRATCH0 * guest R13 saved in SPRN_SCRATCH0 */ - std r9, HSTATE_SCRATCH2(r13) - + std r9, HSTATE_SCRATCH1(r13) lbz r9, HSTATE_IN_GUEST(r13) cmpwi r9, KVM_GUEST_MODE_HOST_HV beq kvmppc_bad_host_intr #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE cmpwi r9, KVM_GUEST_MODE_GUEST - ld r9, HSTATE_SCRATCH2(r13) + ld r9, HSTATE_SCRATCH1(r13) beq kvmppc_interrupt_pr #endif /* We're now back in the host but in guest MMU context */ @@ -1089,13 +1088,14 @@ kvmppc_interrupt_hv: std r6, VCPU_GPR(R6)(r9) std r7, VCPU_GPR(R7)(r9) std r8, VCPU_GPR(R8)(r9) - ld r0, HSTATE_SCRATCH2(r13) + ld r0, HSTATE_SCRATCH1(r13) std r0, VCPU_GPR(R9)(r9) std r10, VCPU_GPR(R10)(r9) std r11, VCPU_GPR(R11)(r9) ld r3, HSTATE_SCRATCH0(r13) - lwz r4, HSTATE_SCRATCH1(r13) std r3, VCPU_GPR(R12)(r9) + /* CR is in the high half of r12 */ + srdi r4, r12, 32 stw r4, VCPU_CR(r9) BEGIN_FTR_SECTION ld r3, HSTATE_CFAR(r13) @@ -1114,6 +1114,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) mfspr r11, SPRN_SRR1 std r10, VCPU_SRR0(r9) std r11, VCPU_SRR1(r9) + /* trap is in the low half of r12, clear CR from the high half */ + clrldi r12, r12, 32 andi. r0, r12, 2 /* need to read HSRR0/1? 
*/ beq 1f mfspr r10, SPRN_HSRR0 diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index ca8f174289bb..68e45080cf93 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -167,20 +167,31 @@ kvmppc_handler_trampoline_enter_end: * * *****************************************************************************/ -.global kvmppc_handler_trampoline_exit -kvmppc_handler_trampoline_exit: - .global kvmppc_interrupt_pr kvmppc_interrupt_pr: + /* 64-bit entry. Register usage at this point: + * + * SPRG_SCRATCH0 = guest R13 + * R12 = (guest CR << 32) | exit handler id + * R13 = PACA + * HSTATE.SCRATCH0 = guest R12 + */ +#ifdef CONFIG_PPC64 + /* Match 32-bit entry */ + rotldi r12, r12, 32 /* Flip R12 halves for stw */ + stw r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */ + srdi r12, r12, 32 /* shift trap into low half */ +#endif +.global kvmppc_handler_trampoline_exit +kvmppc_handler_trampoline_exit: /* Register usage at this point: * - * SPRG_SCRATCH0 = guest R13 - * R12 = exit handler id - * R13 = shadow vcpu (32-bit) or PACA (64-bit) + * SPRG_SCRATCH0 = guest R13 + * R12 = exit handler id + * R13 = shadow vcpu (32-bit) or PACA (64-bit) * HSTATE.SCRATCH0 = guest R12 * HSTATE.SCRATCH1 = guest CR - * */ /* Save registers */ From 7ede531773ea69fa56b02a873ed83ce3507eb8d5 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 22 Dec 2016 04:29:26 +1000 Subject: [PATCH 08/42] KVM: PPC: Book3S: Move 64-bit KVM interrupt handler out from alt section A subsequent patch to make KVM handlers relocation-safe makes them unusable from within alt section "else" cases (due to the way fixed addresses are taken from within fixed section head code). Stop open-coding the KVM handlers, and add them both as normal. A more optimal fix may be to allow some level of alternate feature patching in the exception macros themselves, but for now this will do. 
The TRAMP_KVM handlers must be moved to the "virt" fixed section area (name is arbitrary) in order to be closer to .text and avoid the dreaded "relocation truncated to fit" error. Signed-off-by: Nicholas Piggin Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/head-64.h | 2 +- arch/powerpc/kernel/exceptions-64s.S | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index fca7033839a9..9bd81619d090 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -218,7 +218,7 @@ name: #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #define TRAMP_KVM_BEGIN(name) \ - TRAMP_REAL_BEGIN(name) + TRAMP_VIRT_BEGIN(name) #else #define TRAMP_KVM_BEGIN(name) #endif diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d39d6118c6e9..89b4f122aec6 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -717,13 +717,9 @@ hardware_interrupt_hv: BEGIN_FTR_SECTION _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common, EXC_HV, SOFTEN_TEST_HV) -do_kvm_H0x500: - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502) FTR_SECTION_ELSE _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common, EXC_STD, SOFTEN_TEST_PR) -do_kvm_0x500: - KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500) ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) EXC_REAL_END(hardware_interrupt, 0x500, 0x600) @@ -737,6 +733,8 @@ hardware_interrupt_relon_hv: ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) EXC_VIRT_END(hardware_interrupt, 0x4500, 0x4600) +TRAMP_KVM(PACA_EXGEN, 0x500) +TRAMP_KVM_HV(PACA_EXGEN, 0x500) EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) From 8464c8842de2249061d3c5abc2ccce1bbbd10e7a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 6 Dec 2016 20:42:05 +1100 Subject: [PATCH 09/42] KVM: PPC: Book3S HV: Fix H_PROD to actually wake the target vcpu The H_PROD hypercall is 
supposed to wake up an idle vcpu. We have an implementation, but because Linux doesn't use it except when doing cpu hotplug, it was never tested properly. AIX does use it, and reported it broken. It turns out we were waking the wrong vcpu (the one doing H_PROD, not the target of the prod) and we weren't handling the case where the target needs an IPI to wake it. Fix it by using the existing kvmppc_fast_vcpu_kick_hv() function, which is intended for this kind of thing, and by using the target vcpu not the current vcpu. We were also not looking at the prodded flag when checking whether a ceded vcpu should wake up, so this adds checks for the prodded flag alongside the checks for pending exceptions. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8d9cc07b1e9c..856cc9d38efd 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -774,12 +774,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) } tvcpu->arch.prodded = 1; smp_mb(); - if (vcpu->arch.ceded) { - if (swait_active(&vcpu->wq)) { - swake_up(&vcpu->wq); - vcpu->stat.halt_wakeup++; - } - } + if (tvcpu->arch.ceded) + kvmppc_fast_vcpu_kick_hv(tvcpu); break; case H_CONFER: target = kvmppc_get_gpr(vcpu, 4); @@ -2621,7 +2617,8 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) int i; for_each_runnable_thread(i, vcpu, vc) { - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) + if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded || + vcpu->arch.prodded) return 1; } @@ -2807,7 +2804,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) break; n_ceded = 0; for_each_runnable_thread(i, v, vc) { - if (!v->arch.pending_exceptions) + if (!v->arch.pending_exceptions && !v->arch.prodded) n_ceded += v->arch.ceded; else v->arch.ceded = 0; From fcd4f3c6d150357a02af8526e69bfebb82dd5d46 Mon Sep 17 
00:00:00 2001 From: Thomas Huth Date: Wed, 25 Jan 2017 13:27:22 +0100 Subject: [PATCH 10/42] KVM: PPC: Book3S PR: Refactor program interrupt related code into separate function The function kvmppc_handle_exit_pr() is quite huge and thus hard to read, and even contains a "spaghetti-code"-like goto between the different case labels of the big switch statement. This can be made much more readable by moving the code related to injecting program interrupts / instruction emulation into a separate function instead. Signed-off-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_pr.c | 130 +++++++++++++++++------------------ 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 1482961ceb4d..d4dfc0ca2a44 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -902,6 +902,69 @@ static void kvmppc_clear_debug(struct kvm_vcpu *vcpu) } } +static int kvmppc_exit_pr_progint(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) +{ + enum emulation_result er; + ulong flags; + u32 last_inst; + int emul, r; + + /* + * shadow_srr1 only contains valid flags if we came here via a program + * exception. The other exceptions (emulation assist, FP unavailable, + * etc.) do not provide flags in SRR1, so use an illegal-instruction + * exception when injecting a program interrupt into the guest. 
+ */ + if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + else + flags = SRR1_PROGILL; + + emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); + if (emul != EMULATE_DONE) + return RESUME_GUEST; + + if (kvmppc_get_msr(vcpu) & MSR_PR) { +#ifdef EXIT_DEBUG + pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n", + kvmppc_get_pc(vcpu), last_inst); +#endif + if ((last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) { + kvmppc_core_queue_program(vcpu, flags); + return RESUME_GUEST; + } + } + + vcpu->stat.emulated_inst_exits++; + er = kvmppc_emulate_instruction(run, vcpu); + switch (er) { + case EMULATE_DONE: + r = RESUME_GUEST_NV; + break; + case EMULATE_AGAIN: + r = RESUME_GUEST; + break; + case EMULATE_FAIL: + pr_crit("%s: emulation at %lx failed (%08x)\n", + __func__, kvmppc_get_pc(vcpu), last_inst); + kvmppc_core_queue_program(vcpu, flags); + r = RESUME_GUEST; + break; + case EMULATE_DO_MMIO: + run->exit_reason = KVM_EXIT_MMIO; + r = RESUME_HOST_NV; + break; + case EMULATE_EXIT_USER: + r = RESUME_HOST_NV; + break; + default: + BUG(); + } + + return r; +} + int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { @@ -1044,71 +1107,8 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOK3S_INTERRUPT_PROGRAM: case BOOK3S_INTERRUPT_H_EMUL_ASSIST: - { - enum emulation_result er; - ulong flags; - u32 last_inst; - int emul; - -program_interrupt: - /* - * shadow_srr1 only contains valid flags if we came here via - * a program exception. The other exceptions (emulation assist, - * FP unavailable, etc.) do not provide flags in SRR1, so use - * an illegal-instruction exception when injecting a program - * interrupt into the guest. 
- */ - if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) - flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; - else - flags = SRR1_PROGILL; - - emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); - if (emul != EMULATE_DONE) { - r = RESUME_GUEST; - break; - } - - if (kvmppc_get_msr(vcpu) & MSR_PR) { -#ifdef EXIT_DEBUG - pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n", - kvmppc_get_pc(vcpu), last_inst); -#endif - if ((last_inst & 0xff0007ff) != - (INS_DCBZ & 0xfffffff7)) { - kvmppc_core_queue_program(vcpu, flags); - r = RESUME_GUEST; - break; - } - } - - vcpu->stat.emulated_inst_exits++; - er = kvmppc_emulate_instruction(run, vcpu); - switch (er) { - case EMULATE_DONE: - r = RESUME_GUEST_NV; - break; - case EMULATE_AGAIN: - r = RESUME_GUEST; - break; - case EMULATE_FAIL: - printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", - __func__, kvmppc_get_pc(vcpu), last_inst); - kvmppc_core_queue_program(vcpu, flags); - r = RESUME_GUEST; - break; - case EMULATE_DO_MMIO: - run->exit_reason = KVM_EXIT_MMIO; - r = RESUME_HOST_NV; - break; - case EMULATE_EXIT_USER: - r = RESUME_HOST_NV; - break; - default: - BUG(); - } + r = kvmppc_exit_pr_progint(run, vcpu, exit_nr); break; - } case BOOK3S_INTERRUPT_SYSCALL: { u32 last_sc; @@ -1185,7 +1185,7 @@ program_interrupt: emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); if (emul == EMULATE_DONE) - goto program_interrupt; + r = kvmppc_exit_pr_progint(run, vcpu, exit_nr); else r = RESUME_GUEST; From a97a65d53d9f53b6897dc1b2aed381bc1707136b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 27 Jan 2017 14:00:34 +1000 Subject: [PATCH 11/42] KVM: PPC: Book3S: 64-bit CONFIG_RELOCATABLE support for interrupts 64-bit Book3S exception handlers must find the dynamic kernel base to add to the target address when branching beyond __end_interrupts, in order to support kernel running at non-0 physical address. Support this in KVM by branching with CTR, similarly to regular interrupt handlers. 
The guest CTR is saved in HSTATE_SCRATCH1 and restored after the branch. Without this, the host kernel hangs and crashes randomly when it is running at a non-0 address and a KVM guest is started. Signed-off-by: Nicholas Piggin Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 45 +++++++++++++++++++++--- arch/powerpc/kernel/exceptions-64s.S | 2 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 12 +++++-- arch/powerpc/kvm/book3s_segment.S | 7 ++++ 4 files changed, 58 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index a02a268bde6b..9a5dbfb2d9f2 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -97,6 +97,15 @@ ld reg,PACAKBASE(r13); \ ori reg,reg,(ABS_ADDR(label))@l; +/* + * Branches from unrelocated code (e.g., interrupts) to labels outside + * head-y require >64K offsets. + */ +#define __LOAD_FAR_HANDLER(reg, label) \ + ld reg,PACAKBASE(r13); \ + ori reg,reg,(ABS_ADDR(label))@l; \ + addis reg,reg,(ABS_ADDR(label))@h; + /* Exception register prefixes */ #define EXC_HV H #define EXC_STD @@ -227,12 +236,40 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtctr reg; \ bctr +/* + * KVM requires __LOAD_FAR_HANDLER. + * + * __BRANCH_TO_KVM_EXIT branches are also a special case because they + * explicitly use r9 then reload it from PACA before branching. Hence + * the double-underscore.
+ */ +#define __BRANCH_TO_KVM_EXIT(area, label) \ + mfctr r9; \ + std r9,HSTATE_SCRATCH1(r13); \ + __LOAD_FAR_HANDLER(r9, label); \ + mtctr r9; \ + ld r9,area+EX_R9(r13); \ + bctr + +#define BRANCH_TO_KVM(reg, label) \ + __LOAD_FAR_HANDLER(reg, label); \ + mtctr reg; \ + bctr + #else #define BRANCH_TO_COMMON(reg, label) \ b label +#define BRANCH_TO_KVM(reg, label) \ + b label + +#define __BRANCH_TO_KVM_EXIT(area, label) \ + ld r9,area+EX_R9(r13); \ + b label + #endif + #define __KVM_HANDLER(area, h, n) \ BEGIN_FTR_SECTION_NESTED(947) \ ld r10,area+EX_CFAR(r13); \ @@ -246,8 +283,8 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) std r12,HSTATE_SCRATCH0(r13); \ sldi r12,r9,32; \ ori r12,r12,(n); \ - ld r9,area+EX_R9(r13); \ - b kvmppc_interrupt + /* This reloads r9 before branching to kvmppc_interrupt */ \ + __BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt) #define __KVM_HANDLER_SKIP(area, h, n) \ cmpwi r10,KVM_GUEST_MODE_SKIP; \ @@ -260,8 +297,8 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) std r12,HSTATE_SCRATCH0(r13); \ sldi r12,r9,32; \ ori r12,r12,(n); \ - ld r9,area+EX_R9(r13); \ - b kvmppc_interrupt; \ + /* This reloads r9 before branching to kvmppc_interrupt */ \ + __BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt); \ 89: mtocrf 0x80,r9; \ ld r9,area+EX_R9(r13); \ ld r10,area+EX_R10(r13); \ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 89b4f122aec6..65a2559eeb7f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -142,7 +142,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) lbz r0,HSTATE_HWTHREAD_REQ(r13) cmpwi r0,0 beq 1f - b kvm_start_guest + BRANCH_TO_KVM(r10, kvm_start_guest) 1: #endif diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 11882aac8216..264ac9ad4585 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1060,15 +1060,16 @@ kvmppc_interrupt_hv: * R12 = (guest CR << 32) | interrupt vector 
* R13 = PACA * guest R12 saved in shadow VCPU SCRATCH0 + * guest CTR saved in shadow VCPU SCRATCH1 if RELOCATABLE * guest R13 saved in SPRN_SCRATCH0 */ - std r9, HSTATE_SCRATCH1(r13) + std r9, HSTATE_SCRATCH2(r13) lbz r9, HSTATE_IN_GUEST(r13) cmpwi r9, KVM_GUEST_MODE_HOST_HV beq kvmppc_bad_host_intr #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE cmpwi r9, KVM_GUEST_MODE_GUEST - ld r9, HSTATE_SCRATCH1(r13) + ld r9, HSTATE_SCRATCH2(r13) beq kvmppc_interrupt_pr #endif /* We're now back in the host but in guest MMU context */ @@ -1088,7 +1089,7 @@ kvmppc_interrupt_hv: std r6, VCPU_GPR(R6)(r9) std r7, VCPU_GPR(R7)(r9) std r8, VCPU_GPR(R8)(r9) - ld r0, HSTATE_SCRATCH1(r13) + ld r0, HSTATE_SCRATCH2(r13) std r0, VCPU_GPR(R9)(r9) std r10, VCPU_GPR(R10)(r9) std r11, VCPU_GPR(R11)(r9) @@ -1151,7 +1152,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) 11: stw r3,VCPU_HEIR(r9) /* these are volatile across C function calls */ +#ifdef CONFIG_RELOCATABLE + ld r3, HSTATE_SCRATCH1(r13) + mtctr r3 +#else mfctr r3 +#endif mfxer r4 std r3, VCPU_CTR(r9) std r4, VCPU_XER(r9) diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 68e45080cf93..2a2b96d53999 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -175,9 +175,16 @@ kvmppc_interrupt_pr: * R12 = (guest CR << 32) | exit handler id * R13 = PACA * HSTATE.SCRATCH0 = guest R12 + * HSTATE.SCRATCH1 = guest CTR if RELOCATABLE */ #ifdef CONFIG_PPC64 /* Match 32-bit entry */ +#ifdef CONFIG_RELOCATABLE + std r9, HSTATE_SCRATCH2(r13) + ld r9, HSTATE_SCRATCH1(r13) + mtctr r9 + ld r9, HSTATE_SCRATCH2(r13) +#endif rotldi r12, r12, 32 /* Flip R12 halves for stw */ stw r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */ srdi r12, r12, 32 /* shift trap into low half */ From 18569c1f134e1c5c88228f043c09678ae6052b7c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:34 +1100 Subject: [PATCH 12/42] powerpc/64: Don't try to use radix MMU under a hypervisor Currently, 
if the kernel is running on a POWER9 processor under a hypervisor, it will try to use the radix MMU even though it doesn't have the necessary code to use radix under a hypervisor (it doesn't negotiate use of radix, and it doesn't do the H_REGISTER_PROC_TBL hcall). The result is that the guest kernel will crash when it tries to turn on the MMU. This fixes it by looking for the /chosen/ibm,architecture-vec-5 property, and if it exists, clears the radix MMU feature bit, before we decide whether to initialize for radix or HPT. This property is created by the hypervisor as a result of the guest calling the ibm,client-architecture-support method to indicate its capabilities, so it will indicate whether the hypervisor agreed to us using radix. Systems without a hypervisor may have this property also (for example, skiboot creates it), so we check the HV bit in the MSR to see whether we are running as a guest or not. If we are in hypervisor mode, then we can do whatever we like including using the radix MMU. The reason for using this property is that in future, when we have support for using radix under a hypervisor, we will need to check this property to see whether the hypervisor agreed to us using radix. 
Fixes: 2bfd65e45e87 ("powerpc/mm/radix: Add radix callbacks for early init routines") Cc: stable@vger.kernel.org # v4.7+ Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/mm/init_64.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 93abf8a9813d..4d9481ec2468 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -42,6 +42,8 @@ #include #include #include +#include +#include #include #include @@ -344,12 +346,43 @@ static int __init parse_disable_radix(char *p) } early_param("disable_radix", parse_disable_radix); +/* + * If we're running under a hypervisor, we currently can't do radix + * since we don't have the code to do the H_REGISTER_PROC_TBL hcall. + * We tell that we're running under a hypervisor by looking for the + * /chosen/ibm,architecture-vec-5 property. + */ +static void early_check_vec5(void) +{ + unsigned long root, chosen; + int size; + const u8 *vec5; + + root = of_get_flat_dt_root(); + chosen = of_get_flat_dt_subnode_by_name(root, "chosen"); + if (chosen == -FDT_ERR_NOTFOUND) + return; + vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size); + if (!vec5) + return; + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; +} + void __init mmu_early_init_devtree(void) { /* Disable radix mode based on kernel command line. */ if (disable_radix) cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + /* + * Check /chosen/ibm,architecture-vec-5 if running as a guest. + * When running bare-metal, we can use radix if we like + * even though the ibm,architecture-vec-5 property created by + * skiboot doesn't have the necessary bits set. 
+ */ + if (early_radix_enabled() && !(mfmsr() & MSR_HV)) + early_check_vec5(); + if (early_radix_enabled()) radix__early_init_devtree(); else From 3f4ab2f83b4e443c66549206eb88a9fa5a85d647 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:35 +1100 Subject: [PATCH 13/42] powerpc/pseries: Fixes for the "ibm,architecture-vec-5" options This fixes the byte index values for some of the option bits in the "ibm,architecture-vec-5" property. The "platform facilities options" bits are in byte 17 not byte 14, so the upper 8 bits of their definitions need to be 0x11 not 0x0E. The "sub processor support" option is in byte 21 not byte 15. Note none of these options are actually looked up in "ibm,architecture-vec-5" at this time, so there is no bug. When checking whether option bits are set, we should check that the offset of the byte being checked is less than the vector length that we got from the hypervisor. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/prom.h | 8 ++++---- arch/powerpc/platforms/pseries/firmware.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 5e57705b4759..e6d83d0fada7 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -151,10 +151,10 @@ struct of_drconf_cell { #define OV5_XCMO 0x0440 /* Page Coalescing */ #define OV5_TYPE1_AFFINITY 0x0580 /* Type 1 NUMA affinity */ #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ -#define OV5_PFO_HW_RNG 0x0E80 /* PFO Random Number Generator */ -#define OV5_PFO_HW_842 0x0E40 /* PFO Compression Accelerator */ -#define OV5_PFO_HW_ENCR 0x0E20 /* PFO Encryption Accelerator */ -#define OV5_SUB_PROCESSORS 0x0F01 /* 1,2,or 4 Sub-Processors supported */ +#define OV5_PFO_HW_RNG 0x1180 /* PFO Random Number Generator */ +#define OV5_PFO_HW_842 0x1140 /* PFO Compression Accelerator */ +#define OV5_PFO_HW_ENCR 0x1120 /* PFO
Encryption Accelerator */ +#define OV5_SUB_PROCESSORS 0x1501 /* 1,2,or 4 Sub-Processors supported */ /* Option Vector 6: IBM PAPR hints */ #define OV6_LINUX 0x02 /* Linux is our OS */ diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index ea7f09bd73b1..7d67623203b8 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -126,7 +126,7 @@ static void __init fw_vec5_feature_init(const char *vec5, unsigned long len) index = OV5_INDX(vec5_fw_features_table[i].feature); feat = OV5_FEAT(vec5_fw_features_table[i].feature); - if (vec5[index] & feat) + if (index < len && (vec5[index] & feat)) powerpc_firmware_features |= vec5_fw_features_table[i].val; } From cc3d2940133d24000e2866b21e03ce32adfead0a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:36 +1100 Subject: [PATCH 14/42] powerpc/64: Enable use of radix MMU under hypervisor on POWER9 To use radix as a guest, we first need to tell the hypervisor via the ibm,client-architecture-support call that we support POWER9 and architecture v3.00, and that we can do either radix or hash and that we would like to choose later using an hcall (the H_REGISTER_PROC_TBL hcall). Then we need to check whether the hypervisor agreed to us using radix. We need to do this very early on in the kernel boot process before any of the MMU initialization is done. If the hypervisor doesn't agree, we can't use radix and therefore clear the radix MMU feature bit. Later, when we have set up our process table, which points to the radix tree for each process, we need to install that using the H_REGISTER_PROC_TBL hcall.
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 6 +++++ arch/powerpc/include/asm/hvcall.h | 11 +++++++++ arch/powerpc/include/asm/prom.h | 9 ++++++++ arch/powerpc/kernel/prom_init.c | 18 ++++++++++++++- arch/powerpc/mm/init_64.c | 12 ++++++---- arch/powerpc/mm/pgtable-radix.c | 2 ++ arch/powerpc/platforms/pseries/lpar.c | 29 ++++++++++++++++++++++++ 7 files changed, 81 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 8afb0e00f7d9..cea522c3bcae 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -138,5 +138,11 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, extern int (*register_process_table)(unsigned long base, unsigned long page_size, unsigned long tbl_size); +#ifdef CONFIG_PPC_PSERIES +extern void radix_init_pseries(void); +#else +static inline void radix_init_pseries(void) { }; +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 77ff1ba99d1f..54d11b3a6bf7 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -276,6 +276,7 @@ #define H_GET_MPP_X 0x314 #define H_SET_MODE 0x31C #define H_CLEAR_HPT 0x358 +#define H_REGISTER_PROC_TBL 0x37C #define H_SIGNAL_SYS_RESET 0x380 #define MAX_HCALL_OPCODE H_SIGNAL_SYS_RESET @@ -313,6 +314,16 @@ #define H_SIGNAL_SYS_RESET_ALL_OTHERS -2 /* >= 0 values are CPU number */ +/* Flag values used in H_REGISTER_PROC_TBL hcall */ +#define PROC_TABLE_OP_MASK 0x18 +#define PROC_TABLE_DEREG 0x10 +#define PROC_TABLE_NEW 0x18 +#define PROC_TABLE_TYPE_MASK 0x06 +#define PROC_TABLE_HPT_SLB 0x00 +#define PROC_TABLE_HPT_PT 0x02 +#define PROC_TABLE_RADIX 0x04 +#define PROC_TABLE_GTSE 0x01 + #ifndef __ASSEMBLY__ /** diff --git a/arch/powerpc/include/asm/prom.h 
b/arch/powerpc/include/asm/prom.h index e6d83d0fada7..8af2546ea593 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -121,6 +121,8 @@ struct of_drconf_cell { #define OV1_PPC_2_06 0x02 /* set if we support PowerPC 2.06 */ #define OV1_PPC_2_07 0x01 /* set if we support PowerPC 2.07 */ +#define OV1_PPC_3_00 0x80 /* set if we support PowerPC 3.00 */ + /* Option vector 2: Open Firmware options supported */ #define OV2_REAL_MODE 0x20 /* set if we want OF in real mode */ @@ -155,6 +157,13 @@ struct of_drconf_cell { #define OV5_PFO_HW_842 0x1140 /* PFO Compression Accelerator */ #define OV5_PFO_HW_ENCR 0x1120 /* PFO Encryption Accelerator */ #define OV5_SUB_PROCESSORS 0x1501 /* 1,2,or 4 Sub-Processors supported */ +#define OV5_XIVE_EXPLOIT 0x1701 /* XIVE exploitation supported */ +#define OV5_MMU_RADIX_300 0x1880 /* ISA v3.00 radix MMU supported */ +#define OV5_MMU_HASH_300 0x1840 /* ISA v3.00 hash MMU supported */ +#define OV5_MMU_SEGM_RADIX 0x1820 /* radix mode (no segmentation) */ +#define OV5_MMU_PROC_TBL 0x1810 /* hcall selects SLB or proc table */ +#define OV5_MMU_SLB 0x1800 /* always use SLB */ +#define OV5_MMU_GTSE 0x1808 /* Guest translation shootdown */ /* Option Vector 6: IBM PAPR hints */ #define OV6_LINUX 0x02 /* Linux is our OS */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index ec47a939cbdd..358d43f8f84f 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -649,6 +649,7 @@ static void __init early_cmdline_parse(void) struct option_vector1 { u8 byte1; u8 arch_versions; + u8 arch_versions3; } __packed; struct option_vector2 { @@ -691,6 +692,9 @@ struct option_vector5 { u8 reserved2; __be16 reserved3; u8 subprocessors; + u8 byte22; + u8 intarch; + u8 mmu; } __packed; struct option_vector6 { @@ -700,7 +704,7 @@ struct option_vector6 { } __packed; struct ibm_arch_vec { - struct { u32 mask, val; } pvrs[10]; + struct { u32 mask, val; } pvrs[12]; u8 num_vectors; 
@@ -749,6 +753,14 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .mask = cpu_to_be32(0xffff0000), /* POWER8 */ .val = cpu_to_be32(0x004d0000), }, + { + .mask = cpu_to_be32(0xffff0000), /* POWER9 */ + .val = cpu_to_be32(0x004e0000), + }, + { + .mask = cpu_to_be32(0xffffffff), /* all 3.00-compliant */ + .val = cpu_to_be32(0x0f000005), + }, { .mask = cpu_to_be32(0xffffffff), /* all 2.07-compliant */ .val = cpu_to_be32(0x0f000004), @@ -774,6 +786,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .byte1 = 0, .arch_versions = OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 | OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07, + .arch_versions3 = OV1_PPC_3_00, }, .vec2_len = VECTOR_LENGTH(sizeof(struct option_vector2)), @@ -836,6 +849,9 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .reserved2 = 0, .reserved3 = 0, .subprocessors = 1, + .intarch = 0, + .mmu = OV5_FEAT(OV5_MMU_RADIX_300) | OV5_FEAT(OV5_MMU_HASH_300) | + OV5_FEAT(OV5_MMU_PROC_TBL) | OV5_FEAT(OV5_MMU_GTSE), }, /* option vector 6: IBM PAPR hints */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 4d9481ec2468..10c9a545a646 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -347,10 +347,9 @@ static int __init parse_disable_radix(char *p) early_param("disable_radix", parse_disable_radix); /* - * If we're running under a hypervisor, we currently can't do radix - * since we don't have the code to do the H_REGISTER_PROC_TBL hcall. - * We tell that we're running under a hypervisor by looking for the - * /chosen/ibm,architecture-vec-5 property. + * If we're running under a hypervisor, we need to check the contents of + * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do + * radix. If not, we clear the radix feature bit so we fall back to hash. 
*/ static void early_check_vec5(void) { @@ -365,7 +364,10 @@ static void early_check_vec5(void) vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size); if (!vec5) return; - cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + if (size <= OV5_INDX(OV5_MMU_RADIX_300) || + !(vec5[OV5_INDX(OV5_MMU_RADIX_300)] & OV5_FEAT(OV5_MMU_RADIX_300))) + /* Hypervisor doesn't support radix */ + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; } void __init mmu_early_init_devtree(void) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index cfa53ccc8baf..94323c4ececc 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -401,6 +401,8 @@ void __init radix__early_init_mmu(void) mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); radix_init_amor(); + } else { + radix_init_pseries(); } memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 5dc1c3c6e716..0587655aea69 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -609,6 +609,29 @@ static int __init disable_bulk_remove(char *str) __setup("bulk_remove=", disable_bulk_remove); +/* Actually only used for radix, so far */ +static int pseries_lpar_register_process_table(unsigned long base, + unsigned long page_size, unsigned long table_size) +{ + long rc; + unsigned long flags = PROC_TABLE_NEW; + + if (radix_enabled()) + flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE; + for (;;) { + rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base, + page_size, table_size); + if (!H_IS_LONG_BUSY(rc)) + break; + mdelay(get_longbusy_msecs(rc)); + } + if (rc != H_SUCCESS) { + pr_err("Failed to register process table (rc=%ld)\n", rc); + BUG(); + } + return rc; +} + void __init hpte_init_pseries(void) { mmu_hash_ops.hpte_invalidate = pSeries_lpar_hpte_invalidate; @@ -622,6 +645,12 @@ void __init 
hpte_init_pseries(void) mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; } +void radix_init_pseries(void) +{ + pr_info("Using radix MMU under hypervisor\n"); + register_process_table = pseries_lpar_register_process_table; +} + #ifdef CONFIG_PPC_SMLPAR #define CMO_FREE_HINT_DEFAULT 1 static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT; From dbcbfee0c81c7938e40d7d6bc659a5191f490b50 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:37 +1100 Subject: [PATCH 15/42] powerpc/64: More definitions for POWER9 This adds definitions for bits in the DSISR register which are used by POWER9 for various translation-related exception conditions, and for some more bits in the partition table entry that will be needed by KVM. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 12 +++++++++++- arch/powerpc/include/asm/reg.h | 4 ++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index cea522c3bcae..d73e9dfa5237 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -44,10 +44,20 @@ struct patb_entry { }; extern struct patb_entry *partition_tb; +/* Bits in patb0 field */ #define PATB_HR (1UL << 63) -#define PATB_GR (1UL << 63) #define RPDB_MASK 0x0ffffffffffff00fUL #define RPDB_SHIFT (1UL << 8) +#define RTS1_SHIFT 61 /* top 2 bits of radix tree size */ +#define RTS1_MASK (3UL << RTS1_SHIFT) +#define RTS2_SHIFT 5 /* bottom 3 bits of radix tree size */ +#define RTS2_MASK (7UL << RTS2_SHIFT) +#define RPDS_MASK 0x1f /* root page dir. size field */ + +/* Bits in patb1 field */ +#define PATB_GR (1UL << 63) /* guest uses radix; must match HR */ +#define PRTS_MASK 0x1f /* process table size field */ + /* * Limit process table to PAGE_SIZE table. This * also limit the max pid we can support. 
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 0d4531aa2052..aa44a83ad3ec 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -274,10 +274,14 @@ #define SPRN_DSISR 0x012 /* Data Storage Interrupt Status Register */ #define DSISR_NOHPTE 0x40000000 /* no translation found */ #define DSISR_PROTFAULT 0x08000000 /* protection fault */ +#define DSISR_BADACCESS 0x04000000 /* bad access to CI or G */ #define DSISR_ISSTORE 0x02000000 /* access was a store */ #define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint */ #define DSISR_NOSEGMENT 0x00200000 /* SLB miss */ #define DSISR_KEYFAULT 0x00200000 /* Key fault */ +#define DSISR_UNSUPP_MMU 0x00080000 /* Unsupported MMU config */ +#define DSISR_SET_RC 0x00040000 /* Failed setting of R/C bits */ +#define DSISR_PGDIRFAULT 0x00020000 /* Fault on page directory */ #define SPRN_TBRL 0x10C /* Time Base Read Lower Register (user, R/O) */ #define SPRN_TBRU 0x10D /* Time Base Read Upper Register (user, R/O) */ #define SPRN_CIR 0x11B /* Chip Information Register (hyper, R/0) */ From ba9b399aee6fb70cbe988f0750d6dd9f6677293b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:38 +1100 Subject: [PATCH 16/42] powerpc/64: Export pgtable_cache and pgtable_cache_add for KVM This exports the pgtable_cache array and the pgtable_cache_add function so that HV KVM can use them for allocating radix page tables for guests. 
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/mm/init-common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index a175cd82ae8c..2be5dc242832 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -41,6 +41,7 @@ static void pmd_ctor(void *addr) } struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; +EXPORT_SYMBOL_GPL(pgtable_cache); /* used by kvm_hv module */ /* * Create a kmem_cache() for pagetables. This is not used for PTE @@ -82,7 +83,7 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) pgtable_cache[shift - 1] = new; pr_debug("Allocated pgtable cache for order %d\n", shift); } - +EXPORT_SYMBOL_GPL(pgtable_cache_add); /* used by kvm_hv module */ void pgtable_cache_init(void) { From 16ed141677c5a1a796408e74ccd0a6f6554c3f21 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:39 +1100 Subject: [PATCH 17/42] powerpc/64: Make type of partition table flush depend on partition type When changing a partition table entry on POWER9, we do a particular form of the tlbie instruction which flushes all TLBs and caches of the partition table for a given logical partition ID (LPID). This instruction has a field in the instruction word, labelled R (radix), which should be 1 if the partition was previously a radix partition and 0 if it was a HPT partition. This implements that logic. 
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable_64.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 8bca7f58afc4..d6b5e5cde412 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -454,13 +454,23 @@ void __init mmu_partition_table_init(void) void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, unsigned long dw1) { + unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); + partition_tb[lpid].patb0 = cpu_to_be64(dw0); partition_tb[lpid].patb1 = cpu_to_be64(dw1); - /* Global flush of TLBs and partition table caches for this lpid */ + /* + * Global flush of TLBs and partition table caches for this lpid. + * The type of flush (hash or radix) depends on what the previous + * use of this partition ID was, not the new use. + */ asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + if (old & PATB_HR) + asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + else + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); From bc3551257af837fc603d295e59f9e32953525b98 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:40 +1100 Subject: [PATCH 18/42] powerpc/64: Allow for relocation-on interrupts from guest to host With host and guest both using radix translation, it is feasible for the host to take interrupts that come from the guest with relocation on, and that is in fact what the POWER9 hardware will do when LPCR[AIL] = 3. All such interrupts use HSRR0/1 not SRR0/1 except for system call with LEV=1 (hcall). 
Therefore this adds the KVM tests to the _HV variants of the relocation-on interrupt handlers, and adds the KVM test to the relocation-on system call entry point. We also instantiate the relocation-on versions of the hypervisor data storage and instruction interrupt handlers, since these can occur with relocation on in radix guests. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 10 ++--- arch/powerpc/kernel/exceptions-64s.S | 53 +++++++++++++----------- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 9a5dbfb2d9f2..8fa09fa500f0 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -428,12 +428,12 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_STD) #define STD_RELON_EXCEPTION_HV(loc, vec, label) \ - /* No guest interrupts come through here */ \ SET_SCRATCH0(r13); /* save r13 */ \ - EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, EXC_HV, NOTEST, vec); + EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, \ + EXC_HV, KVMTEST_HV, vec); #define STD_RELON_EXCEPTION_HV_OOL(vec, label) \ - EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec); \ + EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, vec); \ EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV) /* This associate vector numbers with bits in paca->irq_happened */ @@ -510,10 +510,10 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define MASKABLE_RELON_EXCEPTION_HV(loc, vec, label) \ _MASKABLE_RELON_EXCEPTION_PSERIES(vec, label, \ - EXC_HV, SOFTEN_NOTEST_HV) + EXC_HV, SOFTEN_TEST_HV) #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label) \ - EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_NOTEST_HV, vec); \ + EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec); \ EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV) /* diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 
65a2559eeb7f..34a04a5fa468 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -830,6 +830,31 @@ EXC_VIRT(trap_0b, 0x4b00, 0x4c00, 0xb00) TRAMP_KVM(PACA_EXGEN, 0xb00) EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + /* + * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems + * that support it) before changing to HMT_MEDIUM. That allows the KVM + * code to save that value into the guest state (it is the guest's PPR + * value). Otherwise just change to HMT_MEDIUM as userspace has + * already saved the PPR. + */ +#define SYSCALL_KVMTEST \ + SET_SCRATCH0(r13); \ + GET_PACA(r13); \ + std r9,PACA_EXGEN+EX_R9(r13); \ + OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \ + HMT_MEDIUM; \ + std r10,PACA_EXGEN+EX_R10(r13); \ + OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); \ + mfcr r9; \ + KVMTEST_PR(0xc00); \ + GET_SCRATCH0(r13) + +#else +#define SYSCALL_KVMTEST \ + HMT_MEDIUM +#endif + #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) @@ -883,34 +908,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ #endif EXC_REAL_BEGIN(system_call, 0xc00, 0xd00) - /* - * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems - * that support it) before changing to HMT_MEDIUM. That allows the KVM - * code to save that value into the guest state (it is the guest's PPR - * value). Otherwise just change to HMT_MEDIUM as userspace has - * already saved the PPR. 
- */ -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER - SET_SCRATCH0(r13) - GET_PACA(r13) - std r9,PACA_EXGEN+EX_R9(r13) - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); - HMT_MEDIUM; - std r10,PACA_EXGEN+EX_R10(r13) - OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); - mfcr r9 - KVMTEST_PR(0xc00) - GET_SCRATCH0(r13) -#else - HMT_MEDIUM; -#endif + SYSCALL_KVMTEST SYSCALL_PSERIES_1 SYSCALL_PSERIES_2_RFID SYSCALL_PSERIES_3 EXC_REAL_END(system_call, 0xc00, 0xd00) EXC_VIRT_BEGIN(system_call, 0x4c00, 0x4d00) - HMT_MEDIUM + SYSCALL_KVMTEST SYSCALL_PSERIES_1 SYSCALL_PSERIES_2_DIRECT SYSCALL_PSERIES_3 @@ -925,7 +930,7 @@ TRAMP_KVM(PACA_EXGEN, 0xd00) EXC_COMMON(single_step_common, 0xd00, single_step_exception) EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0xe20) -EXC_VIRT_NONE(0x4e00, 0x4e20) +EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x4e20, 0xe00) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00) EXC_COMMON_BEGIN(h_data_storage_common) mfspr r10,SPRN_HDAR @@ -941,7 +946,7 @@ EXC_COMMON_BEGIN(h_data_storage_common) EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0xe40) -EXC_VIRT_NONE(0x4e20, 0x4e40) +EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x4e40, 0xe20) TRAMP_KVM_HV(PACA_EXGEN, 0xe20) EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception) From c92701322711682de89b2bd0f32affad040b6e86 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:41 +1100 Subject: [PATCH 19/42] KVM: PPC: Book3S HV: Add userspace interfaces for POWER9 MMU This adds two capabilities and two ioctls to allow userspace to find out about and configure the POWER9 MMU in a guest. The two capabilities tell userspace whether KVM can support a guest using the radix MMU, or using the hashed page table (HPT) MMU with a process table and segment tables. (Note that the MMUs in the POWER9 processor cores do not use the process and segment tables when in HPT mode, but the nest MMU does). 
The KVM_PPC_CONFIGURE_V3_MMU ioctl allows userspace to specify whether a guest will use the radix MMU or the HPT MMU, and to specify the size and location (in guest space) of the process table. The KVM_PPC_GET_RMMU_INFO ioctl gives userspace information about the radix MMU. It returns a list of supported radix tree geometries (base page size and number of bits indexed at each level of the radix tree) and the encoding used to specify the various page sizes for the TLB invalidate entry instruction. Initially, both capabilities return 0 and the ioctls return -EINVAL, until the necessary infrastructure for them to operate correctly is added. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- Documentation/virtual/kvm/api.txt | 83 +++++++++++++++++++++++++++++ arch/powerpc/include/asm/kvm_ppc.h | 2 + arch/powerpc/include/uapi/asm/kvm.h | 20 +++++++ arch/powerpc/kvm/book3s_hv.c | 13 +++++ arch/powerpc/kvm/powerpc.c | 32 +++++++++++ include/uapi/linux/kvm.h | 6 +++ 6 files changed, 156 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 03145b7cafaa..4470671b0c26 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3201,6 +3201,71 @@ struct kvm_reinject_control { pit_reinject = 0 (!reinject mode) is recommended, unless running an old operating system that uses the PIT for timing (e.g. Linux 2.4.x). +4.99 KVM_PPC_CONFIGURE_V3_MMU + +Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_mmuv3_cfg (in) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read, + -EINVAL if the configuration is invalid + +This ioctl controls whether the guest will use radix or HPT (hashed +page table) translation, and sets the pointer to the process table for +the guest. 
+ +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; +}; + +There are two bits that can be set in flags; KVM_PPC_MMUV3_RADIX and +KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest +to use radix tree translation, and if clear, to use HPT translation. +KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest +to be able to use the global TLB and SLB invalidation instructions; +if clear, the guest may not use these instructions. + +The process_table field specifies the address and size of the guest +process table, which is in the guest's space. This field is formatted +as the second doubleword of the partition table entry, as defined in +the Power ISA V3.00, Book III section 5.7.6.1. + +4.100 KVM_PPC_GET_RMMU_INFO + +Capability: KVM_CAP_PPC_RADIX_MMU +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_rmmu_info (out) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_rmmu_info cannot be written, + -EINVAL if no useful information can be returned + +This ioctl returns a structure containing two things: (a) a list +containing supported radix tree geometries, and (b) a list that maps +page sizes to put in the "AP" (actual page size) field for the tlbie +(TLB invalidate entry) instruction. + +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + +The geometries[] field gives up to 8 supported geometries for the +radix page table, in terms of the log base 2 of the smallest page +size, and the number of bits indexed at each level of the tree, from +the PTE level up to the PGD level in that order. Any unused entries +will have 0 in the page_shift field. + +The ap_encodings gives the supported page sizes and their AP field +encodings, encoded with the AP value in the top 3 bits and the log +base 2 of the page size in the bottom 6 bits. + 5. 
The kvm_run structure ------------------------ @@ -3942,3 +4007,21 @@ In order to use SynIC, it has to be activated by setting this capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this will disable the use of APIC hardware virtualization even if supported by the CPU, as it's incompatible with SynIC auto-EOI behavior. + +8.3 KVM_CAP_PPC_RADIX_MMU + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel can support guests using the +radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 +processor). + +8.4 KVM_CAP_PPC_HASH_MMU_V3 + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel can support guests using the +hashed page table MMU defined in Power ISA V3.00 (as implemented in +the POWER9 processor), including in-memory segment tables. diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2da67bf1f2ec..48c760f89590 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -291,6 +291,8 @@ struct kvmppc_ops { struct irq_bypass_producer *); void (*irq_bypass_del_producer)(struct irq_bypass_consumer *, struct irq_bypass_producer *); + int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg); + int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); }; extern struct kvmppc_ops *kvmppc_hv_ops; diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 3603b6f51b11..cc0908b6c2a0 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -413,6 +413,26 @@ struct kvm_get_htab_header { __u16 n_invalid; }; +/* For KVM_PPC_CONFIGURE_V3_MMU */ +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; /* second doubleword of partition table entry */ +}; + +/* Flag values for KVM_PPC_CONFIGURE_V3_MMU */ +#define KVM_PPC_MMUV3_RADIX 1 /* 1 = 
radix mode, 0 = HPT */ +#define KVM_PPC_MMUV3_GTSE 2 /* global translation shootdown enb. */ + +/* For KVM_PPC_GET_RMMU_INFO */ +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + /* Per-vcpu XICS interrupt controller state */ #define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ec34e39471a7..5f08ed070ae5 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3657,6 +3657,17 @@ static void init_default_hcalls(void) } } +/* dummy implementations for now */ +static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) +{ + return -EINVAL; +} + +static int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info) +{ + return -EINVAL; +} + static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, @@ -3694,6 +3705,8 @@ static struct kvmppc_ops kvm_ops_hv = { .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv, .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv, #endif + .configure_mmu = kvmhv_configure_mmu, + .get_rmmu_info = kvmhv_get_rmmu_info, }; static int kvm_init_subcore_bitmap(void) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index cd892dec7cb6..38c0d154c01e 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -565,6 +565,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_HWRNG: r = kvmppc_hwrng_present(); break; + case KVM_CAP_PPC_MMU_RADIX: + r = !!(0 && hv_enabled && radix_enabled()); + break; + case KVM_CAP_PPC_MMU_HASH_V3: + r = !!(0 && hv_enabled && !radix_enabled() && + cpu_has_feature(CPU_FTR_ARCH_300)); + break; #endif case KVM_CAP_SYNC_MMU: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -1468,6 +1475,31 @@ long 
kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_rtas_define_token(kvm, argp); break; } + case KVM_PPC_CONFIGURE_V3_MMU: { + struct kvm *kvm = filp->private_data; + struct kvm_ppc_mmuv3_cfg cfg; + + r = -EINVAL; + if (!kvm->arch.kvm_ops->configure_mmu) + goto out; + r = -EFAULT; + if (copy_from_user(&cfg, argp, sizeof(cfg))) + goto out; + r = kvm->arch.kvm_ops->configure_mmu(kvm, &cfg); + break; + } + case KVM_PPC_GET_RMMU_INFO: { + struct kvm *kvm = filp->private_data; + struct kvm_ppc_rmmu_info info; + + r = -EINVAL; + if (!kvm->arch.kvm_ops->get_rmmu_info) + goto out; + r = kvm->arch.kvm_ops->get_rmmu_info(kvm, &info); + if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) + r = -EFAULT; + break; + } default: { struct kvm *kvm = filp->private_data; r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index cac48eda1075..e0035808c814 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -871,6 +871,8 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_USER_INSTR0 130 #define KVM_CAP_MSI_DEVID 131 #define KVM_CAP_PPC_HTM 132 +#define KVM_CAP_PPC_MMU_RADIX 134 +#define KVM_CAP_PPC_MMU_HASH_V3 135 #ifdef KVM_CAP_IRQ_ROUTING @@ -1187,6 +1189,10 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) /* Available with KVM_CAP_PPC_RTAS */ #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args) +/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ +#define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) +/* Available with KVM_CAP_PPC_RADIX_MMU */ +#define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) From 468808bd35c4aa3cf7d9fde0ebb010270038734b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:42 +1100 Subject: [PATCH 20/42] 
KVM: PPC: Book3S HV: Set process table for HPT guests on POWER9 This adds the implementation of the KVM_PPC_CONFIGURE_V3_MMU ioctl for HPT guests on POWER9. With this, we can return 1 for the KVM_CAP_PPC_MMU_HASH_V3 capability. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_hv.c | 35 +++++++++++++++++++++++++---- arch/powerpc/kvm/powerpc.c | 2 +- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e59b172666cd..944532dc4a57 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -264,6 +264,7 @@ struct kvm_arch { atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; int hpt_cma_alloc; + u64 process_table; struct dentry *debugfs_dir; struct dentry *htab_dentry; #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 5f08ed070ae5..bb2854314ba4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3091,8 +3091,8 @@ static void kvmppc_setup_partition_table(struct kvm *kvm) /* HTABSIZE and HTABORG fields */ dw0 |= kvm->arch.sdr1; - /* Second dword has GR=0; other fields are unused since UPRT=0 */ - dw1 = 0; + /* Second dword as set by userspace */ + dw1 = kvm->arch.process_table; mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); } @@ -3657,10 +3657,37 @@ static void init_default_hcalls(void) } } -/* dummy implementations for now */ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) { - return -EINVAL; + unsigned long lpcr; + + /* If not on a POWER9, reject it */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -ENODEV; + + /* If any unknown flags set, reject it */ + if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) + return -EINVAL; + + /* We can't do radix yet */ + if (cfg->flags & KVM_PPC_MMUV3_RADIX) + return -EINVAL; 
+ + /* GR (guest radix) bit in process_table field must match */ + if (cfg->process_table & PATB_GR) + return -EINVAL; + + /* Process table size field must be reasonable, i.e. <= 24 */ + if ((cfg->process_table & PRTS_MASK) > 24) + return -EINVAL; + + kvm->arch.process_table = cfg->process_table; + kvmppc_setup_partition_table(kvm); + + lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0; + kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); + + return 0; } static int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 38c0d154c01e..1476a480745e 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -569,7 +569,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!(0 && hv_enabled && radix_enabled()); break; case KVM_CAP_PPC_MMU_HASH_V3: - r = !!(0 && hv_enabled && !radix_enabled() && + r = !!(hv_enabled && !radix_enabled() && cpu_has_feature(CPU_FTR_ARCH_300)); break; #endif From ef8c640cb9cc865a461827b698fcc55b0ecaa600 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:43 +1100 Subject: [PATCH 21/42] KVM: PPC: Book3S HV: Use ASDR for HPT guests on POWER9 POWER9 adds a register called ASDR (Access Segment Descriptor Register), which is set by hypervisor data/instruction storage interrupts to contain the segment descriptor for the address being accessed, assuming the guest is using HPT translation. (For radix guests, it contains the guest real address of the access.) Thus, for HPT guests on POWER9, we can use this register rather than looking up the SLB with the slbfee. instruction. 
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 264ac9ad4585..01f4392a284d 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1715,6 +1715,10 @@ kvmppc_hdsi: /* HPTE not found fault or protection fault? */ andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h beq 1f /* if not, send it to the guest */ +BEGIN_FTR_SECTION + mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */ + b 4f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) andi. r0, r11, MSR_DR /* data relocation enabled? */ beq 3f clrrdi r0, r4, 28 @@ -1791,6 +1795,10 @@ fast_interrupt_c_return: kvmppc_hisi: andis. r0, r11, SRR1_ISI_NOPT@h beq 1f +BEGIN_FTR_SECTION + mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */ + b 4f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) andi. r0, r11, MSR_IR /* instruction relocation enabled? */ beq 3f clrrdi r0, r10, 28 From 9e04ba69beec372ddf857c700ff922e95f50b0d0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:44 +1100 Subject: [PATCH 22/42] KVM: PPC: Book3S HV: Add basic infrastructure for radix guests This adds a field in struct kvm_arch and an inline helper to indicate whether a guest is a radix guest or not, plus a new file to contain the radix MMU code, which currently contains just a translate function which knows how to traverse the guest page tables to translate an address. 
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s.h | 3 + arch/powerpc/include/asm/kvm_book3s_64.h | 6 + arch/powerpc/include/asm/kvm_host.h | 2 + arch/powerpc/kvm/Makefile | 3 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 10 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 139 +++++++++++++++++++++++ 6 files changed, 160 insertions(+), 3 deletions(-) create mode 100644 arch/powerpc/kvm/book3s_64_mmu_radix.c diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 5cf306ae0ac3..7adfcc03a35f 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -182,6 +182,9 @@ extern void kvmppc_mmu_hpte_sysexit(void); extern int kvmppc_mmu_hv_init(void); extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc); +extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, bool iswrite); + /* XXX remove this export when load_last_inst() is generic */ extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec); diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 848292176908..0db010cc4e65 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -36,6 +36,12 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) #endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + +static inline bool kvm_is_radix(struct kvm *kvm) +{ + return kvm->arch.radix; +} + #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 944532dc4a57..fb73518bd03b 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -264,6 +264,8 @@ struct kvm_arch { atomic_t 
hpte_mod_interest; cpumask_t need_tlb_flush; int hpt_cma_alloc; + u8 radix; + pgd_t *pgtable; u64 process_table; struct dentry *debugfs_dir; struct dentry *htab_dentry; diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 7dd89b79d038..b87ccde2137a 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -70,7 +70,8 @@ endif kvm-hv-y += \ book3s_hv.o \ book3s_hv_interrupts.o \ - book3s_64_mmu_hv.o + book3s_64_mmu_hv.o \ + book3s_64_mmu_radix.o kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ book3s_hv_rm_xics.o diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index b795dd1ac2ef..c208bf3b252f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -119,6 +119,9 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) long err = -EBUSY; long order; + if (kvm_is_radix(kvm)) + return -EINVAL; + mutex_lock(&kvm->lock); if (kvm->arch.hpte_setup_done) { kvm->arch.hpte_setup_done = 0; @@ -157,7 +160,7 @@ void kvmppc_free_hpt(struct kvm *kvm) if (kvm->arch.hpt_cma_alloc) kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); - else + else if (kvm->arch.hpt_virt) free_pages(kvm->arch.hpt_virt, kvm->arch.hpt_order - PAGE_SHIFT); } @@ -1675,7 +1678,10 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ - mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; + if (kvm_is_radix(vcpu->kvm)) + mmu->xlate = kvmppc_mmu_radix_xlate; + else + mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c new file mode 100644 index 000000000000..9091407fbfd4 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -0,0 +1,139 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under 
the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * Copyright 2016 Paul Mackerras, IBM Corp. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * Supported radix tree geometry. + * Like p9, we support either 5 or 9 bits at the first (lowest) level, + * for a page size of 64k or 4k. + */ +static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; + +int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, bool iswrite) +{ + struct kvm *kvm = vcpu->kvm; + u32 pid; + int ret, level, ps; + __be64 prte, rpte; + unsigned long root, pte, index; + unsigned long rts, bits, offset; + unsigned long gpa; + unsigned long proc_tbl_size; + + /* Work out effective PID */ + switch (eaddr >> 62) { + case 0: + pid = vcpu->arch.pid; + break; + case 3: + pid = 0; + break; + default: + return -EINVAL; + } + proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12); + if (pid * 16 >= proc_tbl_size) + return -EINVAL; + + /* Read partition table to find root of tree for effective PID */ + ret = kvm_read_guest(kvm, kvm->arch.process_table + pid * 16, + &prte, sizeof(prte)); + if (ret) + return ret; + + root = be64_to_cpu(prte); + rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) | + ((root & RTS2_MASK) >> RTS2_SHIFT); + bits = root & RPDS_MASK; + root = root & RPDB_MASK; + + /* P9 DD1 interprets RTS (radix tree size) differently */ + offset = rts + 31; + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + offset -= 3; + + /* current implementations only support 52-bit space */ + if (offset != 52) + return -EINVAL; + + for (level = 3; level >= 0; --level) { + if (level && bits != p9_supported_radix_bits[level]) + return -EINVAL; + if (level == 0 && !(bits == 5 || bits == 9)) + return -EINVAL; + offset -= bits; + index = (eaddr >> offset) & ((1UL << bits) - 1); + /* check that low bits of page table base are zero */ + if (root & 
((1UL << (bits + 3)) - 1)) + return -EINVAL; + ret = kvm_read_guest(kvm, root + index * 8, + &rpte, sizeof(rpte)); + if (ret) + return ret; + pte = __be64_to_cpu(rpte); + if (!(pte & _PAGE_PRESENT)) + return -ENOENT; + if (pte & _PAGE_PTE) + break; + bits = pte & 0x1f; + root = pte & 0x0fffffffffffff00ul; + } + /* need a leaf at lowest level; 512GB pages not supported */ + if (level < 0 || level == 3) + return -EINVAL; + + /* offset is now log base 2 of the page size */ + gpa = pte & 0x01fffffffffff000ul; + if (gpa & ((1ul << offset) - 1)) + return -EINVAL; + gpa += eaddr & ((1ul << offset) - 1); + for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps) + if (offset == mmu_psize_defs[ps].shift) + break; + gpte->page_size = ps; + + gpte->eaddr = eaddr; + gpte->raddr = gpa; + + /* Work out permissions */ + gpte->may_read = !!(pte & _PAGE_READ); + gpte->may_write = !!(pte & _PAGE_WRITE); + gpte->may_execute = !!(pte & _PAGE_EXEC); + if (kvmppc_get_msr(vcpu) & MSR_PR) { + if (pte & _PAGE_PRIVILEGED) { + gpte->may_read = 0; + gpte->may_write = 0; + gpte->may_execute = 0; + } + } else { + if (!(pte & _PAGE_PRIVILEGED)) { + /* Check AMR/IAMR to see if strict mode is in force */ + if (vcpu->arch.amr & (1ul << 62)) + gpte->may_read = 0; + if (vcpu->arch.amr & (1ul << 63)) + gpte->may_write = 0; + if (vcpu->arch.iamr & (1ul << 62)) + gpte->may_execute = 0; + } + } + + return 0; +} + From f4c51f841d2ac7d36cacb84efbc383190861f87c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:45 +1100 Subject: [PATCH 23/42] KVM: PPC: Book3S HV: Modify guest entry/exit paths to handle radix guests This adds code to branch around the parts that radix guests don't need - clearing and loading the SLB with the guest SLB contents, saving the guest SLB contents on exit, and restoring the host SLB contents. Since the host is now using radix, we need to save and restore the host value for the PID register. 
On hypervisor data/instruction storage interrupts, we don't do the guest HPT lookup on radix, but just save the guest physical address for the fault (from the ASDR register) in the vcpu struct. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kernel/asm-offsets.c | 2 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 57 ++++++++++++++++++++----- 3 files changed, 49 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index fb73518bd03b..da1421a4d6f2 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -606,6 +606,7 @@ struct kvm_vcpu_arch { ulong fault_dar; u32 fault_dsisr; unsigned long intr_msr; + ulong fault_gpa; /* guest real address of page fault (POWER9) */ #endif #ifdef CONFIG_BOOKE diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0601e6a7297c..3afa0ad9837f 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -498,6 +498,7 @@ int main(void) DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls)); DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); + DEFINE(KVM_RADIX, offsetof(struct kvm, arch.radix)); DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr)); @@ -537,6 +538,7 @@ int main(void) DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); + DEFINE(VCPU_FAULT_GPA, offsetof(struct kvm_vcpu, arch.fault_gpa)); DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr)); DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); 
DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 01f4392a284d..7fc7a9221509 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -518,6 +518,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Stack frame offsets */ #define STACK_SLOT_TID (112-16) #define STACK_SLOT_PSSCR (112-24) +#define STACK_SLOT_PID (112-32) .global kvmppc_hv_entry kvmppc_hv_entry: @@ -530,6 +531,7 @@ kvmppc_hv_entry: * R1 = host R1 * R2 = TOC * all other volatile GPRS = free + * Does not preserve non-volatile GPRs or CR fields */ mflr r0 std r0, PPC_LR_STKOFF(r1) @@ -549,32 +551,38 @@ kvmppc_hv_entry: bl kvmhv_start_timing 1: #endif - /* Clear out SLB */ + + /* Use cr7 as an indication of radix mode */ + ld r5, HSTATE_KVM_VCORE(r13) + ld r9, VCORE_KVM(r5) /* pointer to struct kvm */ + lbz r0, KVM_RADIX(r9) + cmpwi cr7, r0, 0 + + /* Clear out SLB if hash */ + bne cr7, 2f li r6,0 slbmte r6,r6 slbia ptesync - +2: /* * POWER7/POWER8 host -> guest partition switch code. * We don't have to lock against concurrent tlbies, * but we do have to coordinate across hardware threads. */ /* Set bit in entry map iff exit map is zero. */ - ld r5, HSTATE_KVM_VCORE(r13) li r7, 1 lbz r6, HSTATE_PTID(r13) sld r7, r7, r6 - addi r9, r5, VCORE_ENTRY_EXIT -21: lwarx r3, 0, r9 + addi r8, r5, VCORE_ENTRY_EXIT +21: lwarx r3, 0, r8 cmpwi r3, 0x100 /* any threads starting to exit? */ bge secondary_too_late /* if so we're too late to the party */ or r3, r3, r7 - stwcx. r3, 0, r9 + stwcx. r3, 0, r8 bne 21b /* Primary thread switches to guest partition. */ - ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ cmpwi r6,0 bne 10f lwz r7,KVM_LPID(r9) @@ -658,7 +666,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) beq kvmppc_primary_no_guest kvmppc_got_guest: - /* Load up guest SLB entries */ + /* Load up guest SLB entries (N.B. 
slb_max will be 0 for radix) */ lwz r5,VCPU_SLB_MAX(r4) cmpwi r5,0 beq 9f @@ -696,8 +704,10 @@ kvmppc_got_guest: BEGIN_FTR_SECTION mfspr r5, SPRN_TIDR mfspr r6, SPRN_PSSCR + mfspr r7, SPRN_PID std r5, STACK_SLOT_TID(r1) std r6, STACK_SLOT_PSSCR(r1) + std r7, STACK_SLOT_PID(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) BEGIN_FTR_SECTION @@ -1293,11 +1303,15 @@ mc_cont: mtspr SPRN_CTRLT,r6 4: /* Read the guest SLB and save it away */ + ld r5, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r5) + cmpwi r0, 0 + li r5, 0 + bne 3f /* for radix, save 0 entries */ lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ mtctr r0 li r6,0 addi r7,r9,VCPU_SLB - li r5,0 1: slbmfee r8,r6 andis. r0,r8,SLB_ESID_V@h beq 2f @@ -1309,7 +1323,7 @@ mc_cont: addi r5,r5,1 2: addi r6,r6,1 bdnz 1b - stw r5,VCPU_SLB_MAX(r9) +3: stw r5,VCPU_SLB_MAX(r9) /* * Save the guest PURR/SPURR @@ -1558,8 +1572,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) BEGIN_FTR_SECTION ld r5, STACK_SLOT_TID(r1) ld r6, STACK_SLOT_PSSCR(r1) + ld r7, STACK_SLOT_PID(r1) mtspr SPRN_TIDR, r5 mtspr SPRN_PSSCR, r6 + mtspr SPRN_PID, r7 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) /* @@ -1671,6 +1687,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) isync /* load host SLB entries */ +BEGIN_MMU_FTR_SECTION + b 0f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) ld r8,PACA_SLBSHADOWPTR(r13) .rept SLB_NUM_BOLTED @@ -1683,7 +1702,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) slbmte r6,r5 1: addi r8,r8,16 .endr - +0: #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING /* Finish timing, if we have a vcpu */ ld r4, HSTATE_KVM_VCPU(r13) @@ -1710,8 +1729,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * reflect the HDSI to the guest as a DSI. */ kvmppc_hdsi: + ld r3, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r3) + cmpwi r0, 0 mfspr r4, SPRN_HDAR mfspr r6, SPRN_HDSISR + bne .Lradix_hdsi /* on radix, just save DAR/DSISR/ASDR */ /* HPTE not found fault or protection fault? */ andis. 
r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h beq 1f /* if not, send it to the guest */ @@ -1788,11 +1811,23 @@ fast_interrupt_c_return: stb r0, HSTATE_IN_GUEST(r13) b guest_exit_cont +.Lradix_hdsi: + std r4, VCPU_FAULT_DAR(r9) + stw r6, VCPU_FAULT_DSISR(r9) +.Lradix_hisi: + mfspr r5, SPRN_ASDR + std r5, VCPU_FAULT_GPA(r9) + b guest_exit_cont + /* * Similarly for an HISI, reflect it to the guest as an ISI unless * it is an HPTE not found fault for a page that we have paged out. */ kvmppc_hisi: + ld r3, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r3) + cmpwi r0, 0 + bne .Lradix_hisi /* for radix, just save ASDR */ andis. r0, r11, SRR1_ISI_NOPT@h beq 1f BEGIN_FTR_SECTION From 5a319350a46572d073042a3194676099dd2c135d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:46 +1100 Subject: [PATCH 24/42] KVM: PPC: Book3S HV: Page table construction and page faults for radix guests This adds the code to construct the second-level ("partition-scoped" in architecturese) page tables for guests using the radix MMU. Apart from the PGD level, which is allocated when the guest is created, the rest of the tree is all constructed in response to hypervisor page faults. As well as hypervisor page faults for missing pages, we also get faults for reference/change (RC) bits needing to be set, as well as various other error conditions. For now, we only set the R or C bit in the guest page table if the same bit is set in the host PTE for the backing page. This code can take advantage of the guest being backed with either transparent or ordinary 2MB huge pages, and insert 2MB page entries into the guest page tables. There is no support for 1GB huge pages yet. 
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s.h | 8 + arch/powerpc/kvm/book3s.c | 1 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 7 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 385 +++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 17 +- 5 files changed, 415 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 7adfcc03a35f..ff5cd5c5ce8d 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -170,6 +170,8 @@ extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run, unsigned long status); extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, unsigned long valid); +extern int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, gva_t ea, int is_store); extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte); extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu); @@ -182,8 +184,14 @@ extern void kvmppc_mmu_hpte_sysexit(void); extern int kvmppc_mmu_hv_init(void); extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc); +extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr); extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, bool data, bool iswrite); +extern void kvmppc_free_radix(struct kvm *kvm); +extern int kvmppc_radix_init(void); +extern void kvmppc_radix_exit(void); /* XXX remove this export when load_last_inst() is generic */ extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 019f008775b9..b6b5c185bd92 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -239,6 +239,7 @@ void 
kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar, kvmppc_set_dsisr(vcpu, flags); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_data_storage); /* used by kvm_hv */ void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags) { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index c208bf3b252f..57690c22716d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -395,8 +395,8 @@ static int instruction_is_store(unsigned int instr) return (instr & mask) != 0; } -static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned long gpa, gva_t ea, int is_store) +int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, gva_t ea, int is_store) { u32 last_inst; @@ -461,6 +461,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long rcbits; long mmio_update; + if (kvm_is_radix(kvm)) + return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr); + /* * Real-mode code has already searched the HPT and found the * entry we're interested in. 
Lock the entry and check that diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 9091407fbfd4..865ea9bca364 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -137,3 +137,388 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, return 0; } +#ifdef CONFIG_PPC_64K_PAGES +#define MMU_BASE_PSIZE MMU_PAGE_64K +#else +#define MMU_BASE_PSIZE MMU_PAGE_4K +#endif + +static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, + unsigned int pshift) +{ + int psize = MMU_BASE_PSIZE; + + if (pshift >= PMD_SHIFT) + psize = MMU_PAGE_2M; + addr &= ~0xfffUL; + addr |= mmu_psize_defs[psize].ap << 5; + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1) + : : "r" (addr), "r" (kvm->arch.lpid) : "memory"); + asm volatile("ptesync": : :"memory"); +} + +void kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, unsigned long clr, + unsigned long set, unsigned long addr, + unsigned int shift) +{ + if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) && + pte_present(*ptep)) { + /* have to invalidate it first */ + __radix_pte_update(ptep, _PAGE_PRESENT, 0); + kvmppc_radix_tlbie_page(kvm, addr, shift); + set |= _PAGE_PRESENT; + } + __radix_pte_update(ptep, clr, set); +} + +void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + radix__set_pte_at(kvm->mm, addr, ptep, pte, 0); +} + +static struct kmem_cache *kvm_pte_cache; + +static pte_t *kvmppc_pte_alloc(void) +{ + return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL); +} + +static void kvmppc_pte_free(pte_t *ptep) +{ + kmem_cache_free(kvm_pte_cache, ptep); +} + +static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, + unsigned int level, unsigned long mmu_seq) +{ + pgd_t *pgd; + pud_t *pud, *new_pud = NULL; + pmd_t *pmd, *new_pmd = NULL; + pte_t *ptep, *new_ptep = NULL; + int ret; + + /* Traverse the guest's 2nd-level 
tree, allocate new levels needed */ + pgd = kvm->arch.pgtable + pgd_index(gpa); + pud = NULL; + if (pgd_present(*pgd)) + pud = pud_offset(pgd, gpa); + else + new_pud = pud_alloc_one(kvm->mm, gpa); + + pmd = NULL; + if (pud && pud_present(*pud)) + pmd = pmd_offset(pud, gpa); + else + new_pmd = pmd_alloc_one(kvm->mm, gpa); + + if (level == 0 && !(pmd && pmd_present(*pmd))) + new_ptep = kvmppc_pte_alloc(); + + /* Check if we might have been invalidated; let the guest retry if so */ + spin_lock(&kvm->mmu_lock); + ret = -EAGAIN; + if (mmu_notifier_retry(kvm, mmu_seq)) + goto out_unlock; + + /* Now traverse again under the lock and change the tree */ + ret = -ENOMEM; + if (pgd_none(*pgd)) { + if (!new_pud) + goto out_unlock; + pgd_populate(kvm->mm, pgd, new_pud); + new_pud = NULL; + } + pud = pud_offset(pgd, gpa); + if (pud_none(*pud)) { + if (!new_pmd) + goto out_unlock; + pud_populate(kvm->mm, pud, new_pmd); + new_pmd = NULL; + } + pmd = pmd_offset(pud, gpa); + if (pmd_large(*pmd)) { + /* Someone else has instantiated a large page here; retry */ + ret = -EAGAIN; + goto out_unlock; + } + if (level == 1 && !pmd_none(*pmd)) { + /* + * There's a page table page here, but we wanted + * to install a large page. Tell the caller and let + * it try installing a normal page if it wants. 
+ */ + ret = -EBUSY; + goto out_unlock; + } + if (level == 0) { + if (pmd_none(*pmd)) { + if (!new_ptep) + goto out_unlock; + pmd_populate(kvm->mm, pmd, new_ptep); + new_ptep = NULL; + } + ptep = pte_offset_kernel(pmd, gpa); + if (pte_present(*ptep)) { + /* PTE was previously valid, so invalidate it */ + kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, + 0, gpa, 0); + kvmppc_radix_tlbie_page(kvm, gpa, 0); + } + kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); + } else { + kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); + } + ret = 0; + + out_unlock: + spin_unlock(&kvm->mmu_lock); + if (new_pud) + pud_free(kvm->mm, new_pud); + if (new_pmd) + pmd_free(kvm->mm, new_pmd); + if (new_ptep) + kvmppc_pte_free(new_ptep); + return ret; +} + +int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long mmu_seq, pte_size; + unsigned long gpa, gfn, hva, pfn; + struct kvm_memory_slot *memslot; + struct page *page = NULL, *pages[1]; + long ret, npages, ok; + unsigned int writing; + struct vm_area_struct *vma; + unsigned long flags; + pte_t pte, *ptep; + unsigned long pgflags; + unsigned int shift, level; + + /* Check for unusual errors */ + if (dsisr & DSISR_UNSUPP_MMU) { + pr_err("KVM: Got unsupported MMU fault\n"); + return -EFAULT; + } + if (dsisr & DSISR_BADACCESS) { + /* Reflect to the guest as DSI */ + pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr); + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + + /* Translate the logical address and get the page */ + gpa = vcpu->arch.fault_gpa & ~0xfffUL; + gpa &= ~0xF000000000000000ul; + gfn = gpa >> PAGE_SHIFT; + if (!(dsisr & DSISR_PGDIRFAULT)) + gpa |= ea & 0xfff; + memslot = gfn_to_memslot(kvm, gfn); + + /* No memslot means it's an emulated MMIO region */ + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { + if (dsisr & (DSISR_PGDIRFAULT | DSISR_BADACCESS | + 
DSISR_SET_RC)) { + /* + * Bad address in guest page table tree, or other + * unusual error - reflect it to the guest as DSI. + */ + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, + dsisr & DSISR_ISSTORE); + } + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + writing = (dsisr & DSISR_ISSTORE) != 0; + hva = gfn_to_hva_memslot(memslot, gfn); + if (dsisr & DSISR_SET_RC) { + /* + * Need to set an R or C bit in the 2nd-level tables; + * if the relevant bits aren't already set in the linux + * page tables, fall through to do the gup_fast to + * set them in the linux page tables too. + */ + ok = 0; + pgflags = _PAGE_ACCESSED; + if (writing) + pgflags |= _PAGE_DIRTY; + local_irq_save(flags); + ptep = __find_linux_pte_or_hugepte(current->mm->pgd, hva, + NULL, NULL); + if (ptep) { + pte = READ_ONCE(*ptep); + if (pte_present(pte) && + (pte_val(pte) & pgflags) == pgflags) + ok = 1; + } + local_irq_restore(flags); + if (ok) { + spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { + spin_unlock(&kvm->mmu_lock); + return RESUME_GUEST; + } + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, + gpa, NULL, &shift); + if (ptep && pte_present(*ptep)) { + kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, + gpa, shift); + spin_unlock(&kvm->mmu_lock); + return RESUME_GUEST; + } + spin_unlock(&kvm->mmu_lock); + } + } + + ret = -EFAULT; + pfn = 0; + pte_size = PAGE_SIZE; + pgflags = _PAGE_READ | _PAGE_EXEC; + level = 0; + npages = get_user_pages_fast(hva, 1, writing, pages); + if (npages < 1) { + /* Check if it's an I/O mapping */ + down_read(&current->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (vma && vma->vm_start <= hva && hva < vma->vm_end && + (vma->vm_flags & VM_PFNMAP)) { + pfn = vma->vm_pgoff + + ((hva - vma->vm_start) >> PAGE_SHIFT); + pgflags = pgprot_val(vma->vm_page_prot); + } + up_read(&current->mm->mmap_sem); + if 
(!pfn) + return -EFAULT; + } else { + page = pages[0]; + pfn = page_to_pfn(page); + if (PageHuge(page)) { + page = compound_head(page); + pte_size <<= compound_order(page); + /* See if we can insert a 2MB large-page PTE here */ + if (pte_size >= PMD_SIZE && + (gpa & PMD_MASK & PAGE_MASK) == + (hva & PMD_MASK & PAGE_MASK)) { + level = 1; + pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); + } + } + /* See if we can provide write access */ + if (writing) { + /* + * We assume gup_fast has set dirty on the host PTE. + */ + pgflags |= _PAGE_WRITE; + } else { + local_irq_save(flags); + ptep = __find_linux_pte_or_hugepte(current->mm->pgd, + hva, NULL, NULL); + if (ptep && pte_write(*ptep) && pte_dirty(*ptep)) + pgflags |= _PAGE_WRITE; + local_irq_restore(flags); + } + } + + /* + * Compute the PTE value that we need to insert. + */ + pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED; + if (pgflags & _PAGE_WRITE) + pgflags |= _PAGE_DIRTY; + pte = pfn_pte(pfn, __pgprot(pgflags)); + + /* Allocate space in the tree and write the PTE */ + ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); + if (ret == -EBUSY) { + /* + * There's already a PMD where we wanted to install a large page; + * for now, fall back to installing a small page. 
+ */ + level = 0; + pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1); + pte = pfn_pte(pfn, __pgprot(pgflags)); + ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); + } + if (ret == 0 || ret == -EAGAIN) + ret = RESUME_GUEST; + + if (page) { + /* + * We drop pages[0] here, not page because page might + * have been set to the head page of a compound, but + * we have to drop the reference on the correct tail + * page to match the get inside gup() + */ + put_page(pages[0]); + } + return ret; +} + +void kvmppc_free_radix(struct kvm *kvm) +{ + unsigned long ig, iu, im; + pte_t *pte; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + + if (!kvm->arch.pgtable) + return; + pgd = kvm->arch.pgtable; + for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) { + if (!pgd_present(*pgd)) + continue; + pud = pud_offset(pgd, 0); + for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) { + if (!pud_present(*pud)) + continue; + pmd = pmd_offset(pud, 0); + for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) { + if (pmd_huge(*pmd)) { + pmd_clear(pmd); + continue; + } + if (!pmd_present(*pmd)) + continue; + pte = pte_offset_map(pmd, 0); + memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE); + kvmppc_pte_free(pte); + pmd_clear(pmd); + } + pmd_free(kvm->mm, pmd_offset(pud, 0)); + pud_clear(pud); + } + pud_free(kvm->mm, pud_offset(pgd, 0)); + pgd_clear(pgd); + } + pgd_free(kvm->mm, kvm->arch.pgtable); +} + +static void pte_ctor(void *addr) +{ + memset(addr, 0, PTE_TABLE_SIZE); +} + +int kvmppc_radix_init(void) +{ + unsigned long size = sizeof(void *) << PTE_INDEX_SIZE; + + kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor); + if (!kvm_pte_cache) + return -ENOMEM; + return 0; +} + +void kvmppc_radix_exit(void) +{ + kmem_cache_destroy(kvm_pte_cache); +} diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bb2854314ba4..d50251f9a3c9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3356,7 +3356,10 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) 
kvmppc_free_vcores(kvm); - kvmppc_free_hpt(kvm); + if (kvm_is_radix(kvm)) + kvmppc_free_radix(kvm); + else + kvmppc_free_hpt(kvm); kvmppc_free_pimap(kvm); } @@ -3768,6 +3771,11 @@ static int kvm_init_subcore_bitmap(void) return 0; } +static int kvmppc_radix_possible(void) +{ + return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); +} + static int kvmppc_book3s_init_hv(void) { int r; @@ -3807,12 +3815,19 @@ static int kvmppc_book3s_init_hv(void) init_vcore_lists(); r = kvmppc_mmu_hv_init(); + if (r) + return r; + + if (kvmppc_radix_possible()) + r = kvmppc_radix_init(); return r; } static void kvmppc_book3s_exit_hv(void) { kvmppc_free_host_rm_ops(); + if (kvmppc_radix_possible()) + kvmppc_radix_exit(); kvmppc_hv_ops = NULL; } From 01756099e0a5f431bbada9693d566269acfb51f9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:47 +1100 Subject: [PATCH 25/42] KVM: PPC: Book3S HV: MMU notifier callbacks for radix guests This adapts our implementations of the MMU notifier callbacks (unmap_hva, unmap_hva_range, age_hva, test_age_hva, set_spte_hva) to call radix functions when the guest is using radix. These implementations are much simpler than for HPT guests because we have only one PTE to deal with, so we don't need to traverse rmap chains. 
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s.h | 6 +++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 64 +++++++++++++++++--------- arch/powerpc/kvm/book3s_64_mmu_radix.c | 54 ++++++++++++++++++++++ 3 files changed, 103 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index ff5cd5c5ce8d..952cc4b954a1 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -192,6 +192,12 @@ extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, extern void kvmppc_free_radix(struct kvm *kvm); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); +extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); /* XXX remove this export when load_last_inst() is generic */ extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 57690c22716d..088c82bb7ba4 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -701,12 +701,13 @@ static void kvmppc_rmap_reset(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, srcu_idx); } +typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); + static int kvm_handle_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - int (*handler)(struct kvm *kvm, - unsigned long *rmapp, - unsigned long gfn)) + hva_handler_fn handler) { int ret; int retval = 0; @@ -731,9 +732,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); for (; gfn < gfn_end; ++gfn) { - 
gfn_t gfn_offset = gfn - memslot->base_gfn; - - ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn); + ret = handler(kvm, memslot, gfn); retval |= ret; } } @@ -742,20 +741,21 @@ static int kvm_handle_hva_range(struct kvm *kvm, } static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, - unsigned long gfn)) + hva_handler_fn handler) { return kvm_handle_hva_range(kvm, hva, hva + 1, handler); } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { struct revmap_entry *rev = kvm->arch.revmap; unsigned long h, i, j; __be64 *hptep; unsigned long ptel, psize, rcbits; + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; for (;;) { lock_rmap(rmapp); if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { @@ -816,26 +816,36 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) { - kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; + kvm_handle_hva(kvm, hva, handler); return 0; } int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) { - kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? 
kvm_unmap_radix : kvm_unmap_rmapp; + kvm_handle_hva_range(kvm, start, end, handler); return 0; } void kvmppc_core_flush_memslot_hv(struct kvm *kvm, struct kvm_memory_slot *memslot) { - unsigned long *rmapp; unsigned long gfn; unsigned long n; + unsigned long *rmapp; - rmapp = memslot->arch.rmap; gfn = memslot->base_gfn; - for (n = memslot->npages; n; --n) { + rmapp = memslot->arch.rmap; + for (n = memslot->npages; n; --n, ++gfn) { + if (kvm_is_radix(kvm)) { + kvm_unmap_radix(kvm, memslot, gfn); + continue; + } /* * Testing the present bit without locking is OK because * the memslot has been marked invalid already, and hence @@ -843,20 +853,21 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm, * thus the present bit can't go from 0 to 1. */ if (*rmapp & KVMPPC_RMAP_PRESENT) - kvm_unmap_rmapp(kvm, rmapp, gfn); + kvm_unmap_rmapp(kvm, memslot, gfn); ++rmapp; - ++gfn; } } -static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { struct revmap_entry *rev = kvm->arch.revmap; unsigned long head, i, j; __be64 *hptep; int ret = 0; + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; retry: lock_rmap(rmapp); if (*rmapp & KVMPPC_RMAP_REFERENCED) { @@ -904,17 +915,22 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? 
kvm_age_radix : kvm_age_rmapp; + return kvm_handle_hva_range(kvm, start, end, handler); } -static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { struct revmap_entry *rev = kvm->arch.revmap; unsigned long head, i, j; unsigned long *hp; int ret = 1; + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; if (*rmapp & KVMPPC_RMAP_REFERENCED) return 1; @@ -940,12 +956,18 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp; + return kvm_handle_hva(kvm, hva, handler); } void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) { - kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? 
kvm_unmap_radix : kvm_unmap_rmapp; + kvm_handle_hva(kvm, hva, handler); } static int vcpus_running(struct kvm *kvm) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 865ea9bca364..69cabadc121a 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -463,6 +463,60 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return ret; } +/* Called with kvm->lock held */ +int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) +{ + pte_t *ptep; + unsigned long gpa = gfn << PAGE_SHIFT; + unsigned int shift; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep)) { + kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, + gpa, shift); + kvmppc_radix_tlbie_page(kvm, gpa, shift); + } + return 0; +} + +/* Called with kvm->lock held */ +int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) +{ + pte_t *ptep; + unsigned long gpa = gfn << PAGE_SHIFT; + unsigned int shift; + int ref = 0; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep) && pte_young(*ptep)) { + kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, + gpa, shift); + /* XXX need to flush tlb here? 
*/ + ref = 1; + } + return ref; +} + +/* Called with kvm->lock held */ +int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) +{ + pte_t *ptep; + unsigned long gpa = gfn << PAGE_SHIFT; + unsigned int shift; + int ref = 0; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep) && pte_young(*ptep)) + ref = 1; + return ref; +} + void kvmppc_free_radix(struct kvm *kvm) { unsigned long ig, iu, im; From 8f7b79b8379a85fb8dd0c3f42d9f452ec5552161 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:48 +1100 Subject: [PATCH 26/42] KVM: PPC: Book3S HV: Implement dirty page logging for radix guests This adds code to keep track of dirty pages when requested (that is, when memslot->dirty_bitmap is non-NULL) for radix guests. We use the dirty bits in the PTEs in the second-level (partition-scoped) page tables, together with a bitmap of pages that were dirty when their PTE was invalidated (e.g., when the page was paged out). This bitmap is stored in the first half of the memslot->dirty_bitmap area, and kvm_vm_ioctl_get_dirty_log_hv() now uses the second half for the bitmap that gets returned to userspace. 
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s.h | 7 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 28 +++---- arch/powerpc/kvm/book3s_64_mmu_radix.c | 111 +++++++++++++++++++++++-- arch/powerpc/kvm/book3s_hv.c | 31 +++++-- 4 files changed, 144 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 952cc4b954a1..57dc407cec4a 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -198,6 +198,8 @@ extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); +extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long *map); /* XXX remove this export when load_last_inst() is generic */ extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); @@ -228,8 +230,11 @@ extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, unsigned long pte_index, unsigned long avpn, unsigned long *hpret); -extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, +extern long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); +extern void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, + struct kvm_memory_slot *memslot, + unsigned long *map); extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask); extern void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 088c82bb7ba4..c9e587a2849d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1068,7 +1068,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long 
*rmapp) return npages_dirty; } -static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, +void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, struct kvm_memory_slot *memslot, unsigned long *map) { @@ -1086,12 +1086,11 @@ static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, __set_bit_le(gfn - memslot->base_gfn, map); } -long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long *map) +long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long *map) { unsigned long i, j; unsigned long *rmapp; - struct kvm_vcpu *vcpu; preempt_disable(); rmapp = memslot->arch.rmap; @@ -1107,15 +1106,6 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, __set_bit_le(j, map); ++rmapp; } - - /* Harvest dirty bits from VPA and DTL updates */ - /* Note: we never modify the SLB shadow buffer areas */ - kvm_for_each_vcpu(i, vcpu, kvm) { - spin_lock(&vcpu->arch.vpa_update_lock); - harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map); - harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map); - spin_unlock(&vcpu->arch.vpa_update_lock); - } preempt_enable(); return 0; } @@ -1170,10 +1160,14 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, srcu_idx = srcu_read_lock(&kvm->srcu); memslot = gfn_to_memslot(kvm, gfn); if (memslot) { - rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; - lock_rmap(rmap); - *rmap |= KVMPPC_RMAP_CHANGED; - unlock_rmap(rmap); + if (!kvm_is_radix(kvm)) { + rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; + lock_rmap(rmap); + *rmap |= KVMPPC_RMAP_CHANGED; + unlock_rmap(rmap); + } else if (memslot->dirty_bitmap) { + mark_page_dirty(kvm, gfn); + } } srcu_read_unlock(&kvm->srcu, srcu_idx); } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 69cabadc121a..125cc7ce1525 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -158,18 +158,21 @@ static void 
kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, asm volatile("ptesync": : :"memory"); } -void kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, unsigned long clr, - unsigned long set, unsigned long addr, - unsigned int shift) +unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, + unsigned long clr, unsigned long set, + unsigned long addr, unsigned int shift) { + unsigned long old = 0; + if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) && pte_present(*ptep)) { /* have to invalidate it first */ - __radix_pte_update(ptep, _PAGE_PRESENT, 0); + old = __radix_pte_update(ptep, _PAGE_PRESENT, 0); kvmppc_radix_tlbie_page(kvm, addr, shift); set |= _PAGE_PRESENT; + old &= _PAGE_PRESENT; } - __radix_pte_update(ptep, clr, set); + return __radix_pte_update(ptep, clr, set) | old; } void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, @@ -197,6 +200,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, pud_t *pud, *new_pud = NULL; pmd_t *pmd, *new_pmd = NULL; pte_t *ptep, *new_ptep = NULL; + unsigned long old; int ret; /* Traverse the guest's 2nd-level tree, allocate new levels needed */ @@ -262,9 +266,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, ptep = pte_offset_kernel(pmd, gpa); if (pte_present(*ptep)) { /* PTE was previously valid, so invalidate it */ - kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, - 0, gpa, 0); + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, + 0, gpa, 0); kvmppc_radix_tlbie_page(kvm, gpa, 0); + if (old & _PAGE_DIRTY) + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); } kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); } else { @@ -463,6 +469,26 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return ret; } +static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn, unsigned int order) +{ + unsigned long i, limit; + unsigned long *dp; + + if 
(!memslot->dirty_bitmap) + return; + limit = 1ul << order; + if (limit < BITS_PER_LONG) { + for (i = 0; i < limit; ++i) + mark_page_dirty(kvm, gfn + i); + return; + } + dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn); + limit /= BITS_PER_LONG; + for (i = 0; i < limit; ++i) + *dp++ = ~0ul; +} + /* Called with kvm->lock held */ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) @@ -470,13 +496,21 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; + unsigned long old; ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep)) { - kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, - gpa, shift); + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, + gpa, shift); kvmppc_radix_tlbie_page(kvm, gpa, shift); + if (old & _PAGE_DIRTY) { + if (!shift) + mark_page_dirty(kvm, gfn); + else + mark_pages_dirty(kvm, memslot, + gfn, shift - PAGE_SHIFT); + } } return 0; } @@ -517,6 +551,65 @@ int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, return ref; } +/* Returns the number of PAGE_SIZE pages that are dirty */ +static int kvm_radix_test_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot, int pagenum) +{ + unsigned long gfn = memslot->base_gfn + pagenum; + unsigned long gpa = gfn << PAGE_SHIFT; + pte_t *ptep; + unsigned int shift; + int ret = 0; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { + ret = 1; + if (shift) + ret = 1 << (shift - PAGE_SHIFT); + kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, + gpa, shift); + kvmppc_radix_tlbie_page(kvm, gpa, shift); + } + return ret; +} + +long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long *map) +{ + unsigned long i, j; + unsigned long n, *p; + int npages; + + /* + * Radix 
accumulates dirty bits in the first half of the + * memslot's dirty_bitmap area, for when pages are paged + * out or modified by the host directly. Pick up these + * bits and add them to the map. + */ + n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long); + p = memslot->dirty_bitmap; + for (i = 0; i < n; ++i) + map[i] |= xchg(&p[i], 0); + + for (i = 0; i < memslot->npages; i = j) { + npages = kvm_radix_test_clear_dirty(kvm, memslot, i); + + /* + * Note that if npages > 0 then i must be a multiple of npages, + * since huge pages are only used to back the guest at guest + * real addresses that are a multiple of their size. + * Since we have at most one PTE covering any given guest + * real address, if npages > 1 we can skip to i + npages. + */ + j = i + 1; + if (npages) + for (j = i; npages; ++j, --npages) + __set_bit_le(j, map); + } + return 0; +} + void kvmppc_free_radix(struct kvm *kvm) { unsigned long ig, iu, im; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index d50251f9a3c9..401e4cc8a91f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2961,8 +2961,10 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int r; + int i, r; unsigned long n; + unsigned long *buf; + struct kvm_vcpu *vcpu; mutex_lock(&kvm->slots_lock); @@ -2976,15 +2978,32 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, if (!memslot->dirty_bitmap) goto out; + /* + * Use second half of bitmap area because radix accumulates + * bits in the first half. 
+ */ n = kvm_dirty_bitmap_bytes(memslot); - memset(memslot->dirty_bitmap, 0, n); + buf = memslot->dirty_bitmap + n / sizeof(long); + memset(buf, 0, n); - r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap); + if (kvm_is_radix(kvm)) + r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf); + else + r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf); if (r) goto out; + /* Harvest dirty bits from VPA and DTL updates */ + /* Note: we never modify the SLB shadow buffer areas */ + kvm_for_each_vcpu(i, vcpu, kvm) { + spin_lock(&vcpu->arch.vpa_update_lock); + kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf); + kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf); + spin_unlock(&vcpu->arch.vpa_update_lock); + } + r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + if (copy_to_user(log->dirty_bitmap, buf, n)) goto out; r = 0; @@ -3037,7 +3056,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, if (npages) atomic64_inc(&kvm->arch.mmio_update); - if (npages && old->npages) { + if (npages && old->npages && !kvm_is_radix(kvm)) { /* * If modifying a memslot, reset all the rmap dirty bits. * If this is a new memslot, we don't need to do anything @@ -3046,7 +3065,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, */ slots = kvm_memslots(kvm); memslot = id_to_memslot(slots, mem->slot); - kvmppc_hv_get_dirty_log(kvm, memslot, NULL); + kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL); } } From 65dae5403a162fe6ef7cd8b2835de9d23c303891 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:49 +1100 Subject: [PATCH 27/42] KVM: PPC: Book3S HV: Make HPT-specific hypercalls return error in radix mode If the guest is in radix mode, then it doesn't have a hashed page table (HPT), so all of the hypercalls that manipulate the HPT can't work and should return an error. This adds checks to make them return H_FUNCTION ("function not supported"). 
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 9ef3c4be952f..6c1ac3d21b91 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -182,6 +182,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, unsigned long mmu_seq; unsigned long rcbits, irq_flags = 0; + if (kvm_is_radix(kvm)) + return H_FUNCTION; psize = hpte_page_size(pteh, ptel); if (!psize) return H_PARAMETER; @@ -458,6 +460,8 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, struct revmap_entry *rev; u64 pte, orig_pte, pte_r; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); @@ -529,6 +533,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) struct revmap_entry *rev, *revs[4]; u64 hp0, hp1; + if (kvm_is_radix(kvm)) + return H_FUNCTION; global = global_invalidates(kvm, 0); for (i = 0; i < 4 && ret == H_SUCCESS; ) { n = 0; @@ -642,6 +648,8 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long v, r, rb, mask, bits; u64 pte_v, pte_r; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; @@ -711,6 +719,8 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, int i, n = 1; struct revmap_entry *rev = NULL; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; if (flags & H_READ_4) { @@ -750,6 +760,8 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long *rmap; long ret = H_NOT_FOUND; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; @@ -796,6 +808,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, 
unsigned long *rmap; long ret = H_NOT_FOUND; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; From a29ebeaf5575d03eef178bb87c425a1e46cae1ca Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:50 +1100 Subject: [PATCH 28/42] KVM: PPC: Book3S HV: Invalidate TLB on radix guest vcpu movement With radix, the guest can do TLB invalidations itself using the tlbie (global) and tlbiel (local) TLB invalidation instructions. Linux guests use local TLB invalidations for translations that have only ever been accessed on one vcpu. However, that doesn't mean that the translations have only been accessed on one physical cpu (pcpu) since vcpus can move around from one pcpu to another. Thus a tlbiel might leave behind stale TLB entries on a pcpu where the vcpu previously ran, and if that task then moves back to that previous pcpu, it could see those stale TLB entries and thus access memory incorrectly. The usual symptom of this is random segfaults in userspace programs in the guest. To cope with this, we detect when a vcpu is about to start executing on a thread in a core that is a different core from the last time it executed. If that is the case, then we mark the core as needing a TLB flush and then send an interrupt to any thread in the core that is currently running a vcpu from the same guest. This will get those vcpus out of the guest, and the first one to re-enter the guest will do the TLB flush. 
The reason for interrupting the vcpus executing on the old core is to cope with the following scenario:

    CPU 0                   CPU 1                   CPU 4
    (core 0)                (core 0)                (core 1)

    VCPU 0 runs task X      VCPU 1 runs
    core 0 TLB gets
    entries from task X
    VCPU 0 moves to CPU 4
                                                    VCPU 0 runs task X
                                                    Unmap pages of task X
                                                    tlbiel

                            (still VCPU 1)          task X moves to VCPU 1
                            task X runs
                            task X sees stale TLB entries

That is, as soon as the VCPU starts executing on the new core, it could unmap and tlbiel some page table entries, and then the task could migrate to one of the VCPUs running on the old core and potentially see stale TLB entries.

Since the TLB is shared between all the threads in a core, we only use the bit of kvm->arch.need_tlb_flush corresponding to the first thread in the core. To ensure that we don't have a window where we can miss a flush, this moves the clearing of the bit from before the actual flush to after it. This way, two threads might both do the flush, but we prevent the situation where one thread can enter the guest before the flush is finished.
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/kvm/book3s_hv.c | 45 +++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv_rm_mmu.c | 11 ++++-- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 38 ++++++++++++++------- 4 files changed, 82 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index da1421a4d6f2..b2dbeac3f450 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -263,6 +263,7 @@ struct kvm_arch { unsigned long hpt_mask; atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; + cpumask_t cpu_in_guest; int hpt_cma_alloc; u8 radix; pgd_t *pgtable; @@ -661,6 +662,7 @@ struct kvm_vcpu_arch { int state; int ptid; int thread_cpu; + int prev_cpu; bool timer_running; wait_queue_head_t cpu_run; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 401e4cc8a91f..50c230e83f9b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1821,6 +1821,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, vcpu->arch.vcore = vcore; vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid; vcpu->arch.thread_cpu = -1; + vcpu->arch.prev_cpu = -1; vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); @@ -1950,11 +1951,33 @@ static void kvmppc_release_hwthread(int cpu) tpaca->kvm_hstate.kvm_split_mode = NULL; } +static void do_nothing(void *x) +{ +} + +static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) +{ + int i; + + cpu = cpu_first_thread_sibling(cpu); + cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); + /* + * Make sure setting of bit in need_tlb_flush precedes + * testing of cpu_in_guest bits. The matching barrier on + * the other side is the first smp_mb() in kvmppc_run_core(). 
+ */ + smp_mb(); + for (i = 0; i < threads_per_core; ++i) + if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest)) + smp_call_function_single(cpu + i, do_nothing, NULL, 1); +} + static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { int cpu; struct paca_struct *tpaca; struct kvmppc_vcore *mvc = vc->master_vcore; + struct kvm *kvm = vc->kvm; cpu = vc->pcpu; if (vcpu) { @@ -1965,6 +1988,27 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) cpu += vcpu->arch.ptid; vcpu->cpu = mvc->pcpu; vcpu->arch.thread_cpu = cpu; + + /* + * With radix, the guest can do TLB invalidations itself, + * and it could choose to use the local form (tlbiel) if + * it is invalidating a translation that has only ever been + * used on one vcpu. However, that doesn't mean it has + * only ever been used on one physical cpu, since vcpus + * can move around between pcpus. To cope with this, when + * a vcpu moves from one pcpu to another, we need to tell + * any vcpus running on the same core as this vcpu previously + * ran to flush the TLB. The TLB is shared between threads, + * so we use a single bit in .need_tlb_flush for all 4 threads. 
+ */ + if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) { + if (vcpu->arch.prev_cpu >= 0 && + cpu_first_thread_sibling(vcpu->arch.prev_cpu) != + cpu_first_thread_sibling(cpu)) + radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); + vcpu->arch.prev_cpu = cpu; + } + cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest); } tpaca = &paca[cpu]; tpaca->kvm_hstate.kvm_vcpu = vcpu; @@ -2552,6 +2596,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_release_hwthread(pcpu + i); if (sip && sip->napped[i]) kvmppc_ipi_thread(pcpu + i); + cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest); } kvmppc_set_host_core(pcpu); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 6c1ac3d21b91..b095afcd4309 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -43,6 +43,7 @@ static void *real_vmalloc_addr(void *x) static int global_invalidates(struct kvm *kvm, unsigned long flags) { int global; + int cpu; /* * If there is only one vcore, and it's currently running, @@ -60,8 +61,14 @@ static int global_invalidates(struct kvm *kvm, unsigned long flags) /* any other core might now have stale TLB entries... */ smp_wmb(); cpumask_setall(&kvm->arch.need_tlb_flush); - cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu, - &kvm->arch.need_tlb_flush); + cpu = local_paca->kvm_hstate.kvm_vcore->pcpu; + /* + * On POWER9, threads are independent but the TLB is shared, + * so use the bit for the first thread to represent the core. 
+ */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + cpu = cpu_first_thread_sibling(cpu); + cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush); } return global; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7fc7a9221509..dcc67a87d688 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -598,30 +598,44 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* See if we need to flush the TLB */ lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ +BEGIN_FTR_SECTION + /* + * On POWER9, individual threads can come in here, but the + * TLB is shared between the 4 threads in a core, hence + * invalidating on one thread invalidates for all. + * Thus we make all 4 threads use the same bit here. + */ + clrrdi r6,r6,2 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) clrldi r7,r6,64-6 /* extract bit number (6 bits) */ srdi r6,r6,6 /* doubleword number */ sldi r6,r6,3 /* address offset */ add r6,r6,r9 addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */ - li r0,1 - sld r0,r0,r7 + li r8,1 + sld r8,r8,r7 ld r7,0(r6) - and. r7,r7,r0 + and. r7,r7,r8 beq 22f -23: ldarx r7,0,r6 /* if set, clear the bit */ - andc r7,r7,r0 - stdcx. r7,0,r6 - bne 23b /* Flush the TLB of any entries for this LPID */ - lwz r6,KVM_TLB_SETS(r9) - li r0,0 /* RS for P9 version of tlbiel */ - mtctr r6 + lwz r0,KVM_TLB_SETS(r9) + mtctr r0 li r7,0x800 /* IS field = 0b10 */ ptesync -28: tlbiel r7 + li r0,0 /* RS for P9 version of tlbiel */ + bne cr7, 29f +28: tlbiel r7 /* On P9, rs=0, RIC=0, PRS=0, R=0 */ addi r7,r7,0x1000 bdnz 28b - ptesync + b 30f +29: PPC_TLBIEL(7,0,2,1,1) /* for radix, RIC=2, PRS=1, R=1 */ + addi r7,r7,0x1000 + bdnz 29b +30: ptesync +23: ldarx r7,0,r6 /* clear the bit after TLB flushed */ + andc r7,r7,r8 + stdcx. 
r7,0,r6 + bne 23b /* Add timebase offset onto timebase */ 22: ld r8,VCORE_TB_OFFSET(r5) From 53af3ba2e8195f504d6a3a0667ccb5e7d4c57599 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:51 +1100 Subject: [PATCH 29/42] KVM: PPC: Book3S HV: Allow guest exit path to have MMU on If we allow LPCR[AIL] to be set for radix guests, then interrupts from the guest to the host can be delivered by the hardware with relocation on, and thus the code path starting at kvmppc_interrupt_hv can be executed in virtual mode (MMU on) for radix guests (previously it was only ever executed in real mode). Most of the code is indifferent to whether the MMU is on or off, but the calls to OPAL that use the real-mode OPAL entry code need to be switched to use the virtual-mode code instead. The affected calls are the calls to the OPAL XICS emulation functions in kvmppc_read_one_intr() and related functions. We test the MSR[IR] bit to detect whether we are in real or virtual mode, and call the opal_rm_* or opal_* function as appropriate. The other place that depends on the MMU being off is the optimization where the guest exit code jumps to the external interrupt vector or hypervisor doorbell interrupt vector, or returns to its caller (which is __kvmppc_vcore_entry). If the MMU is on and we are returning to the caller, then we don't need to use an rfid instruction since the MMU is already on; a simple blr suffices. If there is an external or hypervisor doorbell interrupt to handle, we branch to the relocation-on version of the interrupt vector. 
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/kvm/book3s_hv_builtin.c | 38 +++++++++++++++++-------- arch/powerpc/kvm/book3s_hv_rm_xics.c | 8 ++---- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 29 +++++++++++++++++++ 3 files changed, 58 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 5bb24be0b346..fe08fea54b70 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -29,6 +29,11 @@ #include #include +static bool in_realmode(void) +{ + return !(mfmsr() & MSR_IR); +} + #define KVM_CMA_CHUNK_ORDER 18 /* @@ -200,7 +205,6 @@ static inline void rm_writeb(unsigned long paddr, u8 val) /* * Send an interrupt or message to another CPU. - * This can only be called in real mode. * The caller needs to include any barrier needed to order writes * to memory vs. the IPI/message. */ @@ -226,7 +230,9 @@ void kvmhv_rm_send_ipi(int cpu) /* Else poke the target with an IPI */ xics_phys = paca[cpu].kvm_hstate.xics_phys; - if (xics_phys) + if (!in_realmode()) + opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY); + else if (xics_phys) rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); else opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu), @@ -412,14 +418,15 @@ static long kvmppc_read_one_intr(bool *again) /* Now read the interrupt from the ICP */ xics_phys = local_paca->kvm_hstate.xics_phys; - if (!xics_phys) { - /* Use OPAL to read the XIRR */ + rc = 0; + if (!in_realmode()) + rc = opal_int_get_xirr(&xirr, false); + else if (!xics_phys) rc = opal_rm_int_get_xirr(&xirr, false); - if (rc < 0) - return 1; - } else { + else xirr = _lwzcix(xics_phys + XICS_XIRR); - } + if (rc < 0) + return 1; /* * Save XIRR for later. Since we get control in reverse endian @@ -445,15 +452,19 @@ static long kvmppc_read_one_intr(bool *again) * If it is an IPI, clear the MFRR and EOI it. 
*/ if (xisr == XICS_IPI) { - if (xics_phys) { + rc = 0; + if (!in_realmode()) { + opal_int_set_mfrr(hard_smp_processor_id(), 0xff); + rc = opal_int_eoi(h_xirr); + } else if (xics_phys) { _stbcix(xics_phys + XICS_MFRR, 0xff); _stwcix(xics_phys + XICS_XIRR, xirr); } else { opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff); rc = opal_rm_int_eoi(h_xirr); - /* If rc > 0, there is another interrupt pending */ - *again = rc > 0; } + /* If rc > 0, there is another interrupt pending */ + *again = rc > 0; /* * Need to ensure side effects of above stores @@ -471,7 +482,10 @@ static long kvmppc_read_one_intr(bool *again) /* We raced with the host, * we need to resend that IPI, bummer */ - if (xics_phys) + if (!in_realmode()) + opal_int_set_mfrr(hard_smp_processor_id(), + IPI_PRIORITY); + else if (xics_phys) _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); else opal_rm_int_set_mfrr(hard_smp_processor_id(), diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 06edc4366639..7e2eb3e865b3 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -70,11 +70,9 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) hcpu = hcore << threads_shift; kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); - if (paca[hcpu].kvm_hstate.xics_phys) - icp_native_cause_ipi_rm(hcpu); - else - opal_rm_int_set_mfrr(get_hard_smp_processor_id(hcpu), - IPI_PRIORITY); + kvmppc_set_host_ipi(hcpu, 1); + smp_mb(); + kvmhv_rm_send_ipi(hcpu); } #else static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index dcc67a87d688..46c1c1fe55c8 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -148,6 +148,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) addi r1, r1, 112 ld r7, HSTATE_HOST_MSR(r13) + /* 
+ * If we came back from the guest via a relocation-on interrupt, + * we will be in virtual mode at this point, which makes it a + * little easier to get back to the caller. + */ + mfmsr r0 + andi. r0, r0, MSR_IR /* in real mode? */ + bne .Lvirt_return + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL beq 11f @@ -181,6 +190,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_HSRR1, r7 ba 0xe80 + /* Virtual-mode return - can't get here for HMI or machine check */ +.Lvirt_return: + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + beq 16f + cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL + beq 17f + andi. r0, r7, MSR_EE /* were interrupts hard-enabled? */ + beq 18f + mtmsrd r7, 1 /* if so then re-enable them */ +18: mtlr r8 + blr + +16: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */ + mtspr SPRN_HSRR1, r7 + b exc_virt_0x4500_hardware_interrupt + +17: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + b exc_virt_0x4e80_h_doorbell + kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ From f11f6f79b606fb54bb388d0ea652ed889b2fdf86 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:52 +1100 Subject: [PATCH 30/42] KVM: PPC: Book3S HV: Invalidate ERAT on guest entry/exit for POWER9 DD1 On POWER9 DD1, we need to invalidate the ERAT (effective to real address translation cache) when changing the PIDR register, which we do as part of guest entry and exit. 
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 46c1c1fe55c8..47414a6fe2dd 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -876,6 +876,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_BESCR, r6 mtspr SPRN_PID, r7 mtspr SPRN_WORT, r8 +BEGIN_FTR_SECTION + PPC_INVALIDATE_ERAT +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) BEGIN_FTR_SECTION /* POWER8-only registers */ ld r5, VCPU_TCSCR(r4) @@ -1620,6 +1623,9 @@ BEGIN_FTR_SECTION mtspr SPRN_PSSCR, r6 mtspr SPRN_PID, r7 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) +BEGIN_FTR_SECTION + PPC_INVALIDATE_ERAT +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) /* * POWER7/POWER8 guest -> host partition switch code. From 8cf4ecc0ca9bd9bdc9b4ca0a99f7445a1e74afed Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:53 +1100 Subject: [PATCH 31/42] KVM: PPC: Book3S HV: Enable radix guest support This adds a few last pieces of the support for radix guests: * Implement the backends for the KVM_PPC_CONFIGURE_V3_MMU and KVM_PPC_GET_RMMU_INFO ioctls for radix guests * On POWER9, allow secondary threads to be on/off-lined while guests are running. * Set up LPCR and the partition table entry for radix guests. * Don't allocate the rmap array in the kvm_memory_slot structure on radix. * Don't try to initialize the HPT for radix guests, since they don't have an HPT. * Take out the code that prevents the HV KVM module from initializing on radix hosts. At this stage, we only support radix guests if the host is running in radix mode, and only support HPT guests if the host is running in HPT mode. Thus a guest cannot switch from one mode to the other, which enables some simplifications. 
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s.h | 2 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 1 - arch/powerpc/kvm/book3s_64_mmu_radix.c | 45 +++++++++++++ arch/powerpc/kvm/book3s_hv.c | 88 ++++++++++++++++++-------- arch/powerpc/kvm/powerpc.c | 2 +- 5 files changed, 111 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 57dc407cec4a..2bf35017ffc0 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -189,6 +189,7 @@ extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, unsigned long ea, unsigned long dsisr); extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, bool data, bool iswrite); +extern int kvmppc_init_vm_radix(struct kvm *kvm); extern void kvmppc_free_radix(struct kvm *kvm); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); @@ -200,6 +201,7 @@ extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); +extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); /* XXX remove this export when load_last_inst() is generic */ extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index c9e587a2849d..9df3d940acec 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -155,7 +155,6 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) void kvmppc_free_hpt(struct kvm *kvm) { - kvmppc_free_lpid(kvm->arch.lpid); vfree(kvm->arch.revmap); if (kvm->arch.hpt_cma_alloc) kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 125cc7ce1525..4344651f408c 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -610,6 +610,51 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, return 0; } +static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info, + int psize, int *indexp) +{ + if (!mmu_psize_defs[psize].shift) + return; + info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift | + (mmu_psize_defs[psize].ap << 29); + ++(*indexp); +} + +int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info) +{ + int i; + + if (!radix_enabled()) + return -EINVAL; + memset(info, 0, sizeof(*info)); + + /* 4k page size */ + info->geometries[0].page_shift = 12; + info->geometries[0].level_bits[0] = 9; + for (i = 1; i < 4; ++i) + info->geometries[0].level_bits[i] = p9_supported_radix_bits[i]; + /* 64k page size */ + info->geometries[1].page_shift = 16; + for (i = 0; i < 4; ++i) + info->geometries[1].level_bits[i] = p9_supported_radix_bits[i]; + + i = 0; + add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i); + add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i); + add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i); + add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i); + + return 0; +} + +int kvmppc_init_vm_radix(struct kvm *kvm) +{ + kvm->arch.pgtable = pgd_alloc(kvm->mm); + if (!kvm->arch.pgtable) + return -ENOMEM; + return 0; +} + void kvmppc_free_radix(struct kvm *kvm) { unsigned long ig, iu, im; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 50c230e83f9b..e4a79679342e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1135,7 +1135,7 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, /* * Userspace can only modify DPFD (default prefetch depth), * ILE (interrupt little-endian) and TC (translation control). - * On POWER8 userspace can also modify AIL (alt. interrupt loc.) + * On POWER8 and POWER9 userspace can also modify AIL (alt. 
interrupt loc.). */ mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; if (cpu_has_feature(CPU_FTR_ARCH_207S)) @@ -2922,7 +2922,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) smp_mb(); /* On the first time here, set up HTAB and VRMA */ - if (!vcpu->kvm->arch.hpte_setup_done) { + if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) { r = kvmppc_hv_setup_htab_rma(vcpu); if (r) goto out; @@ -2984,6 +2984,13 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, { struct kvm_ppc_one_seg_page_size *sps; + /* + * Since we don't yet support HPT guests on a radix host, + * return an error if the host uses radix. + */ + if (radix_enabled()) + return -EINVAL; + info->flags = KVM_PPC_PAGE_SIZES_REAL; if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) info->flags |= KVM_PPC_1T_SEGMENTS; @@ -3069,6 +3076,15 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, unsigned long npages) { + /* + * For now, if radix_enabled() then we only support radix guests, + * and in that case we don't need the rmap array. 
+ */ + if (radix_enabled()) { + slot->arch.rmap = NULL; + return 0; + } + slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); if (!slot->arch.rmap) return -ENOMEM; @@ -3149,14 +3165,20 @@ static void kvmppc_setup_partition_table(struct kvm *kvm) { unsigned long dw0, dw1; - /* PS field - page size for VRMA */ - dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | - ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); - /* HTABSIZE and HTABORG fields */ - dw0 |= kvm->arch.sdr1; + if (!kvm_is_radix(kvm)) { + /* PS field - page size for VRMA */ + dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | + ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); + /* HTABSIZE and HTABORG fields */ + dw0 |= kvm->arch.sdr1; - /* Second dword as set by userspace */ - dw1 = kvm->arch.process_table; + /* Second dword as set by userspace */ + dw1 = kvm->arch.process_table; + } else { + dw0 = PATB_HR | radix__get_tree_size() | + __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE; + dw1 = PATB_GR | kvm->arch.process_table; + } mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); } @@ -3326,6 +3348,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; char buf[32]; + int ret; /* Allocate the guest's logical partition ID */ @@ -3373,13 +3396,30 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) lpcr |= LPCR_HVICE; } + /* + * For now, if the host uses radix, the guest must be radix. + */ + if (radix_enabled()) { + kvm->arch.radix = 1; + lpcr &= ~LPCR_VPM1; + lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR; + ret = kvmppc_init_vm_radix(kvm); + if (ret) { + kvmppc_free_lpid(kvm->arch.lpid); + return ret; + } + kvmppc_setup_partition_table(kvm); + } + kvm->arch.lpcr = lpcr; /* * Work out how many sets the TLB has, for the use of * the TLB invalidation loop in book3s_hv_rmhandlers.S. 
*/ - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (kvm_is_radix(kvm)) + kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */ + else if (cpu_has_feature(CPU_FTR_ARCH_300)) kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ else if (cpu_has_feature(CPU_FTR_ARCH_207S)) kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */ @@ -3389,8 +3429,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) /* * Track that we now have a HV mode VM active. This blocks secondary * CPU threads from coming online. + * On POWER9, we only need to do this for HPT guests on a radix + * host, which is not yet supported. */ - kvm_hv_vm_activated(); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + kvm_hv_vm_activated(); /* * Create a debugfs directory for the VM @@ -3416,10 +3459,13 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { debugfs_remove_recursive(kvm->arch.debugfs_dir); - kvm_hv_vm_deactivated(); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + kvm_hv_vm_deactivated(); kvmppc_free_vcores(kvm); + kvmppc_free_lpid(kvm->arch.lpid); + if (kvm_is_radix(kvm)) kvmppc_free_radix(kvm); else @@ -3452,11 +3498,6 @@ static int kvmppc_core_check_processor_compat_hv(void) if (!cpu_has_feature(CPU_FTR_HVMODE) || !cpu_has_feature(CPU_FTR_ARCH_206)) return -EIO; - /* - * Disable KVM for Power9 in radix mode. 
- */ - if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled()) - return -EIO; return 0; } @@ -3727,6 +3768,7 @@ static void init_default_hcalls(void) static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) { unsigned long lpcr; + int radix; /* If not on a POWER9, reject it */ if (!cpu_has_feature(CPU_FTR_ARCH_300)) @@ -3736,12 +3778,13 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) return -EINVAL; - /* We can't do radix yet */ - if (cfg->flags & KVM_PPC_MMUV3_RADIX) + /* We can't change a guest to/from radix yet */ + radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); + if (radix != kvm_is_radix(kvm)) return -EINVAL; /* GR (guest radix) bit in process_table field must match */ - if (cfg->process_table & PATB_GR) + if (!!(cfg->process_table & PATB_GR) != radix) return -EINVAL; /* Process table size field must be reasonable, i.e. <= 24 */ @@ -3757,11 +3800,6 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) return 0; } -static int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info) -{ - return -EINVAL; -} - static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1476a480745e..40a5b2d75ed1 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -566,7 +566,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = kvmppc_hwrng_present(); break; case KVM_CAP_PPC_MMU_RADIX: - r = !!(0 && hv_enabled && radix_enabled()); + r = !!(hv_enabled && radix_enabled()); break; case KVM_CAP_PPC_MMU_HASH_V3: r = !!(hv_enabled && !radix_enabled() && From ccc4df4e2c3825919456c13b153d2a67bbf328dc Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:48:57 +1100 Subject: [PATCH 32/42] Documentation: Correct duplicate section 
number in kvm/api.txt Both KVM_CREATE_SPAPR_TCE_64 and KVM_REINJECT_CONTROL have section number 4.98 in Documentation/virtual/kvm/api.txt, presumably due to a naive merge. This corrects the duplication. [paulus@ozlabs.org - correct section numbers for following sections, KVM_PPC_CONFIGURE_V3_MMU and KVM_PPC_GET_RMMU_INFO, as well.] Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4470671b0c26..aca994a90355 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3177,7 +3177,7 @@ of IOMMU pages. The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. -4.98 KVM_REINJECT_CONTROL +4.99 KVM_REINJECT_CONTROL Capability: KVM_CAP_REINJECT_CONTROL Architectures: x86 @@ -3201,7 +3201,7 @@ struct kvm_reinject_control { pit_reinject = 0 (!reinject mode) is recommended, unless running an old operating system that uses the PIT for timing (e.g. Linux 2.4.x). -4.99 KVM_PPC_CONFIGURE_V3_MMU +4.100 KVM_PPC_CONFIGURE_V3_MMU Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 Architectures: ppc @@ -3232,7 +3232,7 @@ process table, which is in the guest's space. This field is formatted as the second doubleword of the partition table entry, as defined in the Power ISA V3.00, Book III section 5.7.6.1. -4.100 KVM_PPC_GET_RMMU_INFO +4.101 KVM_PPC_GET_RMMU_INFO Capability: KVM_CAP_PPC_RADIX_MMU Architectures: ppc From ef1ead0c3b1dfb43d33caa4f50c8d214f86b6bc8 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:48:58 +1100 Subject: [PATCH 33/42] KVM: PPC: Book3S HV: HPT resizing documentation and reserved numbers This adds a new powerpc-specific KVM_CAP_SPAPR_RESIZE_HPT capability to advertise whether KVM is capable of handling the PAPR extensions for resizing the hashed page table during guest runtime. 
It also adds definitions for two new VM ioctl()s to implement this extension, and documentation of the same. Note that HPT resizing is already possible with KVM PR without kernel modification, since the HPT is managed within userspace (qemu). The capability defined here will only be set where an in-kernel implementation of resizing is necessary, i.e. for KVM HV. To determine if the userspace resize implementation can be used, it's necessary to check KVM_CAP_PPC_ALLOC_HTAB. Unfortunately older kernels incorrectly set KVM_CAP_PPC_ALLOC_HTAB even with KVM PR. If userspace wants to support resizing with KVM PR on such kernels, it will need a workaround. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 95 +++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 11 ++++ 2 files changed, 106 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index aca994a90355..64f217af0416 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3266,6 +3266,101 @@ The ap_encodings gives the supported page sizes and their AP field encodings, encoded with the AP value in the top 3 bits and the log base 2 of the page size in the bottom 6 bits. +4.102 KVM_PPC_RESIZE_HPT_PREPARE + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + >0 if a new HPT is being prepared, the value is an estimated + number of milliseconds until preparation is complete + -EFAULT if struct kvm_ppc_resize_hpt cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENOMEM if unable to allocate the new HPT + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT).
Specifically this starts, stops or monitors +the preparation of a new potential HPT for the guest, essentially +implementing the H_RESIZE_HPT_PREPARE hypercall. + +If called with shift > 0 when there is no pending HPT for the guest, +this begins preparation of a new pending HPT of size 2^(shift) bytes. +It then returns a positive integer with the estimated number of +milliseconds until preparation is complete. + +If called when there is a pending HPT whose size does not match that +requested in the parameters, discards the existing pending HPT and +creates a new one as above. + +If called when there is a pending HPT of the size requested, will: + * If preparation of the pending HPT is already complete, return 0 + * If preparation of the pending HPT has failed, return an error + code, then discard the pending HPT. + * If preparation of the pending HPT is still in progress, return an + estimated number of milliseconds until preparation is complete. + +If called with shift == 0, discards any currently pending HPT and +returns 0 (i.e. cancels any in-progress preparation). + +flags is reserved for future expansion, currently setting any bits in +flags will result in an -EINVAL. + +Normally this will be called repeatedly with the same parameters until +it returns <= 0. The first call will initiate preparation, subsequent +ones will monitor preparation until it completes or fails. 
+ +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + +4.103 KVM_PPC_RESIZE_HPT_COMMIT + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + -EFAULT if struct kvm_ppc_resize_hpt cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENXIO if there is no pending HPT, or the pending HPT doesn't + have the requested size + -EBUSY if the pending HPT is not fully prepared + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this requests that the guest be +transferred to working with the new HPT, essentially implementing the +H_RESIZE_HPT_COMMIT hypercall. + +This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has +returned 0 with the same parameters. In other cases +KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or +-EBUSY, though others may be possible if the preparation was started, +but failed). + +This will have undefined effects on the guest if it has not already +placed itself in a quiescent state where no vcpu will make MMU enabled +memory accesses. + +On successful completion, the pending HPT will become the guest's active +HPT and the previous HPT will be discarded. + +On failure, the guest will still be operating on its previous HPT. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + 5.
The kvm_run structure ------------------------ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e0035808c814..7964b970b9ad 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -685,6 +685,13 @@ struct kvm_ppc_smmu_info { struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; }; +/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + #define KVMIO 0xAE /* machine type bits, to be used as argument to KVM_CREATE_VM */ @@ -871,6 +878,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_USER_INSTR0 130 #define KVM_CAP_MSI_DEVID 131 #define KVM_CAP_PPC_HTM 132 +#define KVM_CAP_SPAPR_RESIZE_HPT 133 #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 @@ -1189,6 +1197,9 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) /* Available with KVM_CAP_PPC_RTAS */ #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args) +/* Available with KVM_CAP_SPAPR_RESIZE_HPT */ +#define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) +#define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) /* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) /* Available with KVM_CAP_PPC_RADIX_MMU */ From db9a290d9c3c596e5325e2a42133594435e5de46 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:48:59 +1100 Subject: [PATCH 34/42] KVM: PPC: Book3S HV: Rename kvm_alloc_hpt() for clarity The difference between kvm_alloc_hpt() and kvmppc_alloc_hpt() is not at all obvious from the name. In practice kvmppc_alloc_hpt() allocates an HPT by whatever means, and calls kvm_alloc_hpt() which will attempt to allocate it with CMA only. To make this less confusing, rename kvm_alloc_hpt() to kvm_alloc_hpt_cma(). 
Similarly, kvm_release_hpt() is renamed kvm_free_hpt_cma(). Signed-off-by: David Gibson Reviewed-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 4 ++-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 8 ++++---- arch/powerpc/kvm/book3s_hv_builtin.c | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 48c760f89590..6fad1f12e9ec 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -186,8 +186,8 @@ extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, unsigned long tce_value, unsigned long npages); extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba); -extern struct page *kvm_alloc_hpt(unsigned long nr_pages); -extern void kvm_release_hpt(struct page *page, unsigned long nr_pages); +extern struct page *kvm_alloc_hpt_cma(unsigned long nr_pages); +extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); extern void kvmppc_core_free_memslot(struct kvm *kvm, diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 9df3d940acec..16f278417c69 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -62,7 +62,7 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) } kvm->arch.hpt_cma_alloc = 0; - page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT)); + page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); memset((void *)hpt, 0, (1ul << order)); @@ -108,7 +108,7 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) out_freehpt: if (kvm->arch.hpt_cma_alloc) - kvm_release_hpt(page, 1 << (order - PAGE_SHIFT)); + kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); else free_pages(hpt, order - PAGE_SHIFT); return 
-ENOMEM; @@ -157,8 +157,8 @@ void kvmppc_free_hpt(struct kvm *kvm) { vfree(kvm->arch.revmap); if (kvm->arch.hpt_cma_alloc) - kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), - 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); + kvm_free_hpt_cma(virt_to_page(kvm->arch.hpt_virt), + 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); else if (kvm->arch.hpt_virt) free_pages(kvm->arch.hpt_virt, kvm->arch.hpt_order - PAGE_SHIFT); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fe08fea54b70..96e7e609f621 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -57,19 +57,19 @@ static int __init early_parse_kvm_cma_resv(char *p) } early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv); -struct page *kvm_alloc_hpt(unsigned long nr_pages) +struct page *kvm_alloc_hpt_cma(unsigned long nr_pages) { VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES)); } -EXPORT_SYMBOL_GPL(kvm_alloc_hpt); +EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma); -void kvm_release_hpt(struct page *page, unsigned long nr_pages) +void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages) { cma_release(kvm_cma, page, nr_pages); } -EXPORT_SYMBOL_GPL(kvm_release_hpt); +EXPORT_SYMBOL_GPL(kvm_free_hpt_cma); /** * kvm_cma_reserve() - reserve area for kvm hash pagetable From 3f9d4f5a5f35e402e91bedf0c15e29cef187a29d Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:00 +1100 Subject: [PATCH 35/42] KVM: PPC: Book3S HV: Gather HPT related variables into sub-structure Currently, the powerpc kvm_arch structure contains a number of variables tracking the state of the guest's hashed page table (HPT) in KVM HV. This patch gathers them all together into a single kvm_hpt_info substructure. This makes life more convenient for the upcoming HPT resizing implementation. 
Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 20 +++++-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 92 ++++++++++++++--------------- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 62 +++++++++---------- 4 files changed, 92 insertions(+), 84 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index b2dbeac3f450..ea6f0c659936 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -241,12 +241,24 @@ struct kvm_arch_memory_slot { #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ }; +struct kvm_hpt_info { + /* Host virtual (linear mapping) address of guest HPT */ + unsigned long virt; + /* Array of reverse mapping entries for each guest HPTE */ + struct revmap_entry *rev; + unsigned long npte; + unsigned long mask; + /* Guest HPT size is 2**(order) bytes */ + u32 order; + /* 1 if HPT allocated with CMA, 0 otherwise */ + int cma; +}; + struct kvm_arch { unsigned int lpid; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE unsigned int tlb_sets; - unsigned long hpt_virt; - struct revmap_entry *revmap; + struct kvm_hpt_info hpt; atomic64_t mmio_update; unsigned int host_lpid; unsigned long host_lpcr; @@ -256,15 +268,11 @@ struct kvm_arch { unsigned long lpcr; unsigned long vrma_slb_v; int hpte_setup_done; - u32 hpt_order; atomic_t vcpus_running; u32 online_vcores; - unsigned long hpt_npte; - unsigned long hpt_mask; atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; cpumask_t cpu_in_guest; - int hpt_cma_alloc; u8 radix; pgd_t *pgtable; u64 process_table; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 16f278417c69..2af63ce129bc 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -61,12 +61,12 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) order = PPC_MIN_HPT_ORDER; } - kvm->arch.hpt_cma_alloc = 0; + kvm->arch.hpt.cma = 0; page = 
kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); memset((void *)hpt, 0, (1ul << order)); - kvm->arch.hpt_cma_alloc = 1; + kvm->arch.hpt.cma = 1; } /* Lastly try successively smaller sizes from the page allocator */ @@ -81,22 +81,22 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) if (!hpt) return -ENOMEM; - kvm->arch.hpt_virt = hpt; - kvm->arch.hpt_order = order; + kvm->arch.hpt.virt = hpt; + kvm->arch.hpt.order = order; /* HPTEs are 2**4 bytes long */ - kvm->arch.hpt_npte = 1ul << (order - 4); + kvm->arch.hpt.npte = 1ul << (order - 4); /* 128 (2**7) bytes in each HPTEG */ - kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; + kvm->arch.hpt.mask = (1ul << (order - 7)) - 1; atomic64_set(&kvm->arch.mmio_update, 0); /* Allocate reverse map array */ - rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); + rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt.npte); if (!rev) { pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); goto out_freehpt; } - kvm->arch.revmap = rev; + kvm->arch.hpt.rev = rev; kvm->arch.sdr1 = __pa(hpt) | (order - 18); pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", @@ -107,7 +107,7 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) return 0; out_freehpt: - if (kvm->arch.hpt_cma_alloc) + if (kvm->arch.hpt.cma) kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); else free_pages(hpt, order - PAGE_SHIFT); @@ -132,10 +132,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) goto out; } } - if (kvm->arch.hpt_virt) { - order = kvm->arch.hpt_order; + if (kvm->arch.hpt.virt) { + order = kvm->arch.hpt.order; /* Set the entire HPT to 0, i.e. 
invalid HPTEs */ - memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); + memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); /* * Reset all the reverse-mapping chains for all memslots */ @@ -155,13 +155,13 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) void kvmppc_free_hpt(struct kvm *kvm) { - vfree(kvm->arch.revmap); - if (kvm->arch.hpt_cma_alloc) - kvm_free_hpt_cma(virt_to_page(kvm->arch.hpt_virt), - 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); - else if (kvm->arch.hpt_virt) - free_pages(kvm->arch.hpt_virt, - kvm->arch.hpt_order - PAGE_SHIFT); + vfree(kvm->arch.hpt.rev); + if (kvm->arch.hpt.cma) + kvm_free_hpt_cma(virt_to_page(kvm->arch.hpt.virt), + 1 << (kvm->arch.hpt.order - PAGE_SHIFT)); + else if (kvm->arch.hpt.virt) + free_pages(kvm->arch.hpt.virt, + kvm->arch.hpt.order - PAGE_SHIFT); } /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ @@ -196,8 +196,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, if (npages > 1ul << (40 - porder)) npages = 1ul << (40 - porder); /* Can't use more than 1 HPTE per HPTEG */ - if (npages > kvm->arch.hpt_mask + 1) - npages = kvm->arch.hpt_mask + 1; + if (npages > kvm->arch.hpt.mask + 1) + npages = kvm->arch.hpt.mask + 1; hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); @@ -207,7 +207,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, for (i = 0; i < npages; ++i) { addr = i << porder; /* can't use hpt_hash since va > 64 bits */ - hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask; + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt.mask; /* * We assume that the hash table is empty and no * vcpus are using it at this stage. 
Since we create @@ -340,11 +340,11 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, preempt_enable(); return -ENOENT; } - hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; if (cpu_has_feature(CPU_FTR_ARCH_300)) v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); - gr = kvm->arch.revmap[index].guest_rpte; + gr = kvm->arch.hpt.rev[index].guest_rpte; unlock_hpte(hptep, orig_v); preempt_enable(); @@ -485,8 +485,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } } index = vcpu->arch.pgfault_index; - hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); - rev = &kvm->arch.revmap[index]; + hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); + rev = &kvm->arch.hpt.rev[index]; preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); @@ -748,7 +748,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long h, i, j; __be64 *hptep; unsigned long ptel, psize, rcbits; @@ -768,7 +768,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, * rmap chain lock. 
*/ i = *rmapp & KVMPPC_RMAP_INDEX; - hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); @@ -860,7 +860,7 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm, static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; __be64 *hptep; int ret = 0; @@ -880,7 +880,7 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); j = rev[i].forw; /* If this HPTE isn't referenced, ignore it */ @@ -923,7 +923,7 @@ int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; unsigned long *hp; int ret = 1; @@ -940,7 +940,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, if (*rmapp & KVMPPC_RMAP_PRESENT) { i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); + hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4)); j = rev[i].forw; if (be64_to_cpu(hp[1]) & HPTE_R_R) goto out; @@ -980,7 +980,7 @@ static int vcpus_running(struct kvm *kvm) */ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; unsigned long n; unsigned long v, r; @@ -1005,7 +1005,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) i = 
head = *rmapp & KVMPPC_RMAP_INDEX; do { unsigned long hptep1; - hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); j = rev[i].forw; /* @@ -1311,8 +1311,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, flags = ctx->flags; i = ctx->index; - hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); - revp = kvm->arch.revmap + i; + hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); + revp = kvm->arch.hpt.rev + i; lbuf = (unsigned long __user *)buf; nb = 0; @@ -1327,7 +1327,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, /* Skip uninteresting entries, i.e. clean on not-first pass */ if (!first_pass) { - while (i < kvm->arch.hpt_npte && + while (i < kvm->arch.hpt.npte && !hpte_dirty(revp, hptp)) { ++i; hptp += 2; @@ -1337,7 +1337,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, hdr.index = i; /* Grab a series of valid entries */ - while (i < kvm->arch.hpt_npte && + while (i < kvm->arch.hpt.npte && hdr.n_valid < 0xffff && nb + HPTE_SIZE < count && record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { @@ -1353,7 +1353,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, ++revp; } /* Now skip invalid entries while we can */ - while (i < kvm->arch.hpt_npte && + while (i < kvm->arch.hpt.npte && hdr.n_invalid < 0xffff && record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { /* found an invalid entry */ @@ -1374,7 +1374,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, } /* Check if we've wrapped around the hash table */ - if (i >= kvm->arch.hpt_npte) { + if (i >= kvm->arch.hpt.npte) { i = 0; ctx->first_pass = 0; break; @@ -1433,11 +1433,11 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, err = -EINVAL; i = hdr.index; - if (i >= kvm->arch.hpt_npte || - i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) + if (i >= kvm->arch.hpt.npte || + i + hdr.n_valid + hdr.n_invalid > 
kvm->arch.hpt.npte) break; - hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); lbuf = (unsigned long __user *)buf; for (j = 0; j < hdr.n_valid; ++j) { __be64 hpte_v; @@ -1624,8 +1624,8 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, kvm = p->kvm; i = p->hpt_index; - hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); - for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) { + hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); + for (; len != 0 && i < kvm->arch.hpt.npte; ++i, hptp += 2) { if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) continue; @@ -1635,7 +1635,7 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, cpu_relax(); v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; hr = be64_to_cpu(hptp[1]); - gr = kvm->arch.revmap[i].guest_rpte; + gr = kvm->arch.hpt.rev[i].guest_rpte; unlock_hpte(hptp, v); preempt_enable(); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bdf281cc88c0..02607128a4d4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3197,7 +3197,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out; /* another vcpu beat us to it */ /* Allocate hashed page table (if not done already) and reset it */ - if (!kvm->arch.hpt_virt) { + if (!kvm->arch.hpt.virt) { err = kvmppc_alloc_hpt(kvm, NULL); if (err) { pr_err("KVM: Couldn't alloc HPT\n"); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index b095afcd4309..175748acc9a1 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -86,10 +86,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, if (*rmap & KVMPPC_RMAP_PRESENT) { i = *rmap & KVMPPC_RMAP_INDEX; - head = &kvm->arch.revmap[i]; + head = &kvm->arch.hpt.rev[i]; if (realmode) head = real_vmalloc_addr(head); - tail = &kvm->arch.revmap[head->back]; 
+ tail = &kvm->arch.hpt.rev[head->back]; if (realmode) tail = real_vmalloc_addr(tail); rev->forw = i; @@ -154,8 +154,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, lock_rmap(rmap); head = *rmap & KVMPPC_RMAP_INDEX; - next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]); - prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]); + next = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->forw]); + prev = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->back]); next->back = rev->back; prev->forw = rev->forw; if (head == pte_index) { @@ -292,11 +292,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Find and lock the HPTEG slot to use */ do_insert: - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); for (i = 0; i < 8; ++i) { if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 && try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | @@ -327,7 +327,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, } pte_index += i; } else { - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | HPTE_V_ABSENT)) { /* Lock the slot and check again */ @@ -344,7 +344,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, } /* Save away the guest's idea of the second HPTE dword */ - rev = &kvm->arch.revmap[pte_index]; + rev = &kvm->arch.hpt.rev[pte_index]; if (realmode) rev = real_vmalloc_addr(rev); if (rev) { @@ -469,9 +469,9 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 
*)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); pte = orig_pte = be64_to_cpu(hpte[0]); @@ -487,7 +487,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, return H_NOT_FOUND; } - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); v = pte & ~HPTE_V_HVLOCK; if (v & HPTE_V_VALID) { hpte[0] &= ~cpu_to_be64(HPTE_V_VALID); @@ -557,13 +557,13 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) break; } if (req != 1 || flags == 3 || - pte_index >= kvm->arch.hpt_npte) { + pte_index >= kvm->arch.hpt.npte) { /* parameter error */ args[j] = ((0xa0 | flags) << 56) + pte_index; ret = H_PARAMETER; break; } - hp = (__be64 *) (kvm->arch.hpt_virt + (pte_index << 4)); + hp = (__be64 *) (kvm->arch.hpt.virt + (pte_index << 4)); /* to avoid deadlock, don't spin except for first */ if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) { if (n) @@ -600,7 +600,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) } args[j] = ((0x80 | flags) << 56) + pte_index; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); note_hpte_modification(kvm, rev); if (!(hp0 & HPTE_V_VALID)) { @@ -657,10 +657,10 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); v = pte_v = be64_to_cpu(hpte[0]); @@ -680,7 +680,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, /* Update guest view of 2nd HPTE dword */ mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI | HPTE_R_KEY_LO; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); if 
(rev) { r = (rev->guest_rpte & ~mask) | bits; rev->guest_rpte = r; @@ -728,15 +728,15 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; if (flags & H_READ_4) { pte_index &= ~3; n = 4; } - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); for (i = 0; i < n; ++i, ++pte_index) { - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; r = be64_to_cpu(hpte[1]); if (cpu_has_feature(CPU_FTR_ARCH_300)) { @@ -769,11 +769,11 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); v = be64_to_cpu(hpte[0]); @@ -817,11 +817,11 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); v = be64_to_cpu(hpte[0]); @@ -970,7 +970,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, somask = (1UL << 28) - 1; vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; } 
- hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask; + hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt.mask; avpn = slb_v & ~(somask >> 16); /* also includes B */ avpn |= (eaddr & somask) >> 16; @@ -981,7 +981,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, val |= avpn; for (;;) { - hpte = (__be64 *)(kvm->arch.hpt_virt + (hash << 7)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (hash << 7)); for (i = 0; i < 16; i += 2) { /* Read the PTE racily */ @@ -1017,7 +1017,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, if (val & HPTE_V_SECONDARY) break; val |= HPTE_V_SECONDARY; - hash = hash ^ kvm->arch.hpt_mask; + hash = hash ^ kvm->arch.hpt.mask; } return -1; } @@ -1066,14 +1066,14 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, return status; /* there really was no HPTE */ return 0; /* for prot fault, HPTE disappeared */ } - hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; r = be64_to_cpu(hpte[1]); if (cpu_has_feature(CPU_FTR_ARCH_300)) { v = hpte_new_to_old_v(v, r); r = hpte_new_to_old_r(r); } - rev = real_vmalloc_addr(&kvm->arch.revmap[index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[index]); gr = rev->guest_rpte; unlock_hpte(hpte, orig_v); From 3d089f84c6f9b7b0eda993142d73961a44b553d2 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:01 +1100 Subject: [PATCH 36/42] KVM: PPC: Book3S HV: Don't store values derivable from HPT order Currently the kvm_hpt_info structure stores the hashed page table's order, and also the number of HPTEs it contains and a mask for its size. The last two can be easily derived from the order, so remove them and just calculate them as necessary with a couple of helper inlines. 
Signed-off-by: David Gibson Reviewed-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s_64.h | 12 ++++++++++ arch/powerpc/include/asm/kvm_host.h | 2 -- arch/powerpc/kvm/book3s_64_mmu_hv.c | 28 +++++++++++------------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 18 +++++++-------- 4 files changed, 34 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 0db010cc4e65..ecc3b33bcc59 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -356,6 +356,18 @@ extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); extern void kvmhv_rm_send_ipi(int cpu); +static inline unsigned long kvmppc_hpt_npte(struct kvm_hpt_info *hpt) +{ + /* HPTEs are 2**4 bytes long */ + return 1UL << (hpt->order - 4); +} + +static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt) +{ + /* 128 (2**7) bytes in each HPTEG */ + return (1UL << (hpt->order - 7)) - 1; +} + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ea6f0c659936..0aa0f22d775a 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -246,8 +246,6 @@ struct kvm_hpt_info { unsigned long virt; /* Array of reverse mapping entries for each guest HPTE */ struct revmap_entry *rev; - unsigned long npte; - unsigned long mask; /* Guest HPT size is 2**(order) bytes */ u32 order; /* 1 if HPT allocated with CMA, 0 otherwise */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 2af63ce129bc..d89995ef6c7b 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -83,15 +83,11 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) kvm->arch.hpt.virt = hpt; kvm->arch.hpt.order = order; - /* HPTEs are 2**4 bytes long */ - kvm->arch.hpt.npte = 1ul << 
(order - 4); - /* 128 (2**7) bytes in each HPTEG */ - kvm->arch.hpt.mask = (1ul << (order - 7)) - 1; atomic64_set(&kvm->arch.mmio_update, 0); /* Allocate reverse map array */ - rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt.npte); + rev = vmalloc(sizeof(struct revmap_entry) * kvmppc_hpt_npte(&kvm->arch.hpt)); if (!rev) { pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); goto out_freehpt; @@ -196,8 +192,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, if (npages > 1ul << (40 - porder)) npages = 1ul << (40 - porder); /* Can't use more than 1 HPTE per HPTEG */ - if (npages > kvm->arch.hpt.mask + 1) - npages = kvm->arch.hpt.mask + 1; + if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1) + npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1; hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); @@ -207,7 +203,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, for (i = 0; i < npages; ++i) { addr = i << porder; /* can't use hpt_hash since va > 64 bits */ - hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt.mask; + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) + & kvmppc_hpt_mask(&kvm->arch.hpt); /* * We assume that the hash table is empty and no * vcpus are using it at this stage. Since we create @@ -1327,7 +1324,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, /* Skip uninteresting entries, i.e. 
clean on not-first pass */ if (!first_pass) { - while (i < kvm->arch.hpt.npte && + while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && !hpte_dirty(revp, hptp)) { ++i; hptp += 2; @@ -1337,7 +1334,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, hdr.index = i; /* Grab a series of valid entries */ - while (i < kvm->arch.hpt.npte && + while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && hdr.n_valid < 0xffff && nb + HPTE_SIZE < count && record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { @@ -1353,7 +1350,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, ++revp; } /* Now skip invalid entries while we can */ - while (i < kvm->arch.hpt.npte && + while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && hdr.n_invalid < 0xffff && record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { /* found an invalid entry */ @@ -1374,7 +1371,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, } /* Check if we've wrapped around the hash table */ - if (i >= kvm->arch.hpt.npte) { + if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) { i = 0; ctx->first_pass = 0; break; @@ -1433,8 +1430,8 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, err = -EINVAL; i = hdr.index; - if (i >= kvm->arch.hpt.npte || - i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt.npte) + if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) || + i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt)) break; hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); @@ -1625,7 +1622,8 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, kvm = p->kvm; i = p->hpt_index; hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); - for (; len != 0 && i < kvm->arch.hpt.npte; ++i, hptp += 2) { + for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); + ++i, hptp += 2) { if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) continue; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 175748acc9a1..6fca970373ee 
100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -292,7 +292,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Find and lock the HPTEG slot to use */ do_insert: - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; @@ -469,7 +469,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) @@ -557,7 +557,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) break; } if (req != 1 || flags == 3 || - pte_index >= kvm->arch.hpt.npte) { + pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) { /* parameter error */ args[j] = ((0xa0 | flags) << 56) + pte_index; ret = H_PARAMETER; @@ -657,7 +657,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); @@ -728,7 +728,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; if (flags & H_READ_4) { pte_index &= ~3; @@ -769,7 +769,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); @@ -817,7 +817,7 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) 
return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); @@ -970,7 +970,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, somask = (1UL << 28) - 1; vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; } - hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt.mask; + hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvmppc_hpt_mask(&kvm->arch.hpt); avpn = slb_v & ~(somask >> 16); /* also includes B */ avpn |= (eaddr & somask) >> 16; @@ -1017,7 +1017,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, if (val & HPTE_V_SECONDARY) break; val |= HPTE_V_SECONDARY; - hash = hash ^ kvm->arch.hpt.mask; + hash = hash ^ kvmppc_hpt_mask(&kvm->arch.hpt); } return -1; } From aae0777f1e8224b4fbb78b2c692060852ee750c8 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:02 +1100 Subject: [PATCH 37/42] KVM: PPC: Book3S HV: Split HPT allocation from activation Currently, kvmppc_alloc_hpt() both allocates a new hashed page table (HPT) and sets it up as the active page table for a VM. For the upcoming HPT resize implementation we're going to want to allocate HPTs separately from activating them. So, split the allocation itself out into kvmppc_allocate_hpt() and perform the activation with a new kvmppc_set_hpt() function. Likewise we split kvmppc_free_hpt(), which just frees the HPT, from kvmppc_release_hpt() which unsets it as an active HPT, then frees it. We also move the logic to fall back to smaller HPT sizes if the first try fails into the single caller which used that behaviour, kvmppc_hv_setup_htab_rma(). This introduces a slight semantic change, in that previously if the initial attempt at CMA allocation failed, we would fall back to attempting smaller sizes with the page allocator. Now, we try first CMA, then the page allocator at each size. 
As far as I can tell this change should be harmless. To match, we make kvmppc_free_hpt() just free the actual HPT itself. The call to kvmppc_free_lpid() that was there, we move to the single caller. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s_64.h | 4 + arch/powerpc/include/asm/kvm_ppc.h | 5 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 101 +++++++++++------------ arch/powerpc/kvm/book3s_hv.c | 17 +++- 4 files changed, 70 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index ecc3b33bcc59..d9b48f5bb606 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -22,6 +22,10 @@ #include +/* Power architecture requires HPT is at least 256kiB, at most 64TiB */ +#define PPC_MIN_HPT_ORDER 18 +#define PPC_MAX_HPT_ORDER 46 + #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 6fad1f12e9ec..ba61dec72089 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -155,9 +155,10 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); -extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp); +extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order); +extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info); extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); -extern void kvmppc_free_hpt(struct kvm *kvm); +extern void kvmppc_free_hpt(struct kvm_hpt_info *info); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d89995ef6c7b..62d132a3cec5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -40,74 +40,66 @@ #include "trace_hv.h" -/* Power architecture requires HPT is at least 256kB */ -#define PPC_MIN_HPT_ORDER 18 - static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, unsigned long *pte_idx_ret); static void kvmppc_rmap_reset(struct kvm *kvm); -long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) +int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) { unsigned long hpt = 0; - struct revmap_entry *rev; + int cma = 0; struct page *page = NULL; - long order = KVM_DEFAULT_HPT_ORDER; + struct revmap_entry *rev; + unsigned long npte; - if (htab_orderp) { - order = *htab_orderp; - if (order < PPC_MIN_HPT_ORDER) - order = PPC_MIN_HPT_ORDER; - } + if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER)) + return -EINVAL; - kvm->arch.hpt.cma = 0; page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); memset((void *)hpt, 0, (1ul << order)); - kvm->arch.hpt.cma = 1; + cma = 1; } - /* Lastly try successively smaller sizes from the page allocator */ - /* Only do this if userspace didn't specify a size via ioctl */ - while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) { - hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| - __GFP_NOWARN, order - PAGE_SHIFT); - if (!hpt) - --order; - } + if (!hpt) + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT + |__GFP_NOWARN, order - PAGE_SHIFT); if (!hpt) return -ENOMEM; - kvm->arch.hpt.virt = hpt; - kvm->arch.hpt.order = order; - - atomic64_set(&kvm->arch.mmio_update, 0); + /* HPTEs are 2**4 bytes long */ + npte = 1ul << (order - 4); /* Allocate reverse map array */ - rev = vmalloc(sizeof(struct revmap_entry) * kvmppc_hpt_npte(&kvm->arch.hpt)); + rev = vmalloc(sizeof(struct 
revmap_entry) * npte); if (!rev) { - pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); - goto out_freehpt; + pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n"); + if (cma) + kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); + else + free_pages(hpt, order - PAGE_SHIFT); + return -ENOMEM; } - kvm->arch.hpt.rev = rev; - kvm->arch.sdr1 = __pa(hpt) | (order - 18); + + info->order = order; + info->virt = hpt; + info->cma = cma; + info->rev = rev; + + return 0; +} + +void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) +{ + atomic64_set(&kvm->arch.mmio_update, 0); + kvm->arch.hpt = *info; + kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18); pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", - hpt, order, kvm->arch.lpid); - - if (htab_orderp) - *htab_orderp = order; - return 0; - - out_freehpt: - if (kvm->arch.hpt.cma) - kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); - else - free_pages(hpt, order - PAGE_SHIFT); - return -ENOMEM; + info->virt, (long)info->order, kvm->arch.lpid); } long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) @@ -141,23 +133,28 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) *htab_orderp = order; err = 0; } else { - err = kvmppc_alloc_hpt(kvm, htab_orderp); - order = *htab_orderp; + struct kvm_hpt_info info; + + err = kvmppc_allocate_hpt(&info, *htab_orderp); + if (err < 0) + goto out; + kvmppc_set_hpt(kvm, &info); } out: mutex_unlock(&kvm->lock); return err; } -void kvmppc_free_hpt(struct kvm *kvm) +void kvmppc_free_hpt(struct kvm_hpt_info *info) { - vfree(kvm->arch.hpt.rev); - if (kvm->arch.hpt.cma) - kvm_free_hpt_cma(virt_to_page(kvm->arch.hpt.virt), - 1 << (kvm->arch.hpt.order - PAGE_SHIFT)); - else if (kvm->arch.hpt.virt) - free_pages(kvm->arch.hpt.virt, - kvm->arch.hpt.order - PAGE_SHIFT); + vfree(info->rev); + if (info->cma) + kvm_free_hpt_cma(virt_to_page(info->virt), + 1 << (info->order - PAGE_SHIFT)); + else if (info->virt) + free_pages(info->virt, 
info->order - PAGE_SHIFT); + info->virt = 0; + info->order = 0; } /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 02607128a4d4..19987e4343c3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3198,11 +3198,22 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Allocate hashed page table (if not done already) and reset it */ if (!kvm->arch.hpt.virt) { - err = kvmppc_alloc_hpt(kvm, NULL); - if (err) { + int order = KVM_DEFAULT_HPT_ORDER; + struct kvm_hpt_info info; + + err = kvmppc_allocate_hpt(&info, order); + /* If we get here, it means userspace didn't specify a + * size explicitly. So, try successively smaller + * sizes if the default failed. */ + while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER) + err = kvmppc_allocate_hpt(&info, order); + + if (err < 0) { pr_err("KVM: Couldn't alloc HPT\n"); goto out; } + + kvmppc_set_hpt(kvm, &info); } /* Look up the memslot for guest physical address 0 */ @@ -3467,7 +3478,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) if (kvm_is_radix(kvm)) kvmppc_free_radix(kvm); else - kvmppc_free_hpt(kvm); + kvmppc_free_hpt(&kvm->arch.hpt); kvmppc_free_pimap(kvm); } From f98a8bf9ee201b7e22fc05e27150b1e481d4949f Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:03 +1100 Subject: [PATCH 38/42] KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB ioctl() to change HPT size The KVM_PPC_ALLOCATE_HTAB ioctl() is used to set the size of hashed page table (HPT) that userspace expects a guest VM to have, and is also used to clear that HPT when necessary (e.g. guest reboot). At present, once the ioctl() is called for the first time, the HPT size can never be changed thereafter - it will be cleared but always sized as from the first call. 
With upcoming HPT resize implementation, we're going to need to allow userspace to resize the HPT at reset (to change it back to the default size if the guest changed it). So, we need to allow this ioctl() to change the HPT size. This patch also updates Documentation/virtual/kvm/api.txt to reflect the new behaviour. In fact the documentation was already slightly incorrect since 572abd5 "KVM: PPC: Book3S HV: Don't fall back to smaller HPT size in allocation ioctl" Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 14 ++++++++------ arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 29 ++++++++++++++++------------- arch/powerpc/kvm/book3s_hv.c | 5 +---- 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 64f217af0416..f1945d8cbccb 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2443,18 +2443,20 @@ are, it will do nothing and return an EBUSY error. The parameter is a pointer to a 32-bit unsigned integer variable containing the order (log base 2) of the desired size of the hash table, which must be between 18 and 46. On successful return from the -ioctl, it will have been updated with the order of the hash table that -was allocated. +ioctl, the value will not be changed by the kernel. If no hash table has been allocated when any vcpu is asked to run (with the KVM_RUN ioctl), the host kernel will allocate a default-sized hash table (16 MB). If this ioctl is called when a hash table has already been allocated, -the kernel will clear out the existing hash table (zero all HPTEs) and -return the hash table order in the parameter. (If the guest is using -the virtualized real-mode area (VRMA) facility, the kernel will -re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) 
+with a different order from the existing hash table, the existing hash +table will be freed and a new one allocated. If this ioctl is +called when a hash table has already been allocated of the same order +as specified, the kernel will clear out the existing hash table (zero +all HPTEs). In either case, if the guest is using the virtualized +real-mode area (VRMA) facility, the kernel will re-create the VRMA +HPTEs on the next KVM_RUN of any vcpu. 4.77 KVM_S390_INTERRUPT diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ba61dec72089..cf3ef8d75910 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -157,7 +157,7 @@ extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order); extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info); -extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); +extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order); extern void kvmppc_free_hpt(struct kvm_hpt_info *info); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 62d132a3cec5..3a607faf0f9f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -102,10 +102,10 @@ void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) info->virt, (long)info->order, kvm->arch.lpid); } -long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) +long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) { long err = -EBUSY; - long order; + struct kvm_hpt_info info; if (kvm_is_radix(kvm)) return -EINVAL; @@ -120,8 +120,9 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) goto out; } } - if (kvm->arch.hpt.virt) { - order = kvm->arch.hpt.order; + if (kvm->arch.hpt.order == order) { + /* We already have a suitable HPT */ + /* Set
the entire HPT to 0, i.e. invalid HPTEs */ memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); /* @@ -130,17 +131,19 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) kvmppc_rmap_reset(kvm); /* Ensure that each vcpu will flush its TLB on next entry. */ cpumask_setall(&kvm->arch.need_tlb_flush); - *htab_orderp = order; err = 0; - } else { - struct kvm_hpt_info info; - - err = kvmppc_allocate_hpt(&info, *htab_orderp); - if (err < 0) - goto out; - kvmppc_set_hpt(kvm, &info); + goto out; } - out: + + if (kvm->arch.hpt.virt) + kvmppc_free_hpt(&kvm->arch.hpt); + + err = kvmppc_allocate_hpt(&info, order); + if (err < 0) + goto out; + kvmppc_set_hpt(kvm, &info); + +out: mutex_unlock(&kvm->lock); return err; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 19987e4343c3..fbc901746304 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3704,12 +3704,9 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, r = -EFAULT; if (get_user(htab_order, (u32 __user *)argp)) break; - r = kvmppc_alloc_reset_hpt(kvm, &htab_order); + r = kvmppc_alloc_reset_hpt(kvm, htab_order); if (r) break; - r = -EFAULT; - if (put_user(htab_order, (u32 __user *)argp)) - break; r = 0; break; } From 639e459768845924705933db9142baef545ff5fc Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:04 +1100 Subject: [PATCH 39/42] KVM: PPC: Book3S HV: Create kvmppc_unmap_hpte_helper() The kvm_unmap_rmapp() function, called from certain MMU notifiers, is used to force all guest mappings of a particular host page to be set ABSENT, and removed from the reverse mappings. For HPT resizing, we will have some cases where we want to set just a single guest HPTE ABSENT and remove its reverse mappings. To prepare for this, we split out the logic from kvm_unmap_rmapp() to evict a single HPTE, moving it to a new helper function.
Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 77 ++++++++++++++++------------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 3a607faf0f9f..6d70989686a7 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -742,13 +742,53 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, return kvm_handle_hva_range(kvm, hva, hva + 1, handler); } +/* Must be called with both HPTE and rmap locked */ +static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, + unsigned long *rmapp, unsigned long gfn) +{ + __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); + struct revmap_entry *rev = kvm->arch.hpt.rev; + unsigned long j, h; + unsigned long ptel, psize, rcbits; + + j = rev[i].forw; + if (j == i) { + /* chain is now empty */ + *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + } else { + /* remove i from chain */ + h = rev[i].back; + rev[h].forw = j; + rev[j].back = h; + rev[i].forw = rev[i].back = i; + *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; + } + + /* Now check and modify the HPTE */ + ptel = rev[i].guest_rpte; + psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && + hpte_rpn(ptel, psize) == gfn) { + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); + kvmppc_invalidate_hpte(kvm, hptep, i); + hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); + /* Harvest R and C */ + rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); + *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; + if (rcbits & HPTE_R_C) + kvmppc_update_rmap_change(rmapp, psize); + if (rcbits & ~rev[i].guest_rpte) { + rev[i].guest_rpte = ptel | rcbits; + note_hpte_modification(kvm, &rev[i]); + } + } +} + static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.hpt.rev; - unsigned long 
h, i, j; + unsigned long i; __be64 *hptep; - unsigned long ptel, psize, rcbits; unsigned long *rmapp; rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; @@ -773,37 +813,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, cpu_relax(); continue; } - j = rev[i].forw; - if (j == i) { - /* chain is now empty */ - *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); - } else { - /* remove i from chain */ - h = rev[i].back; - rev[h].forw = j; - rev[j].back = h; - rev[i].forw = rev[i].back = i; - *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; - } - /* Now check and modify the HPTE */ - ptel = rev[i].guest_rpte; - psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); - if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && - hpte_rpn(ptel, psize) == gfn) { - hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); - kvmppc_invalidate_hpte(kvm, hptep, i); - hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); - /* Harvest R and C */ - rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); - *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; - if (rcbits & HPTE_R_C) - kvmppc_update_rmap_change(rmapp, psize); - if (rcbits & ~rev[i].guest_rpte) { - rev[i].guest_rpte = ptel | rcbits; - note_hpte_modification(kvm, &rev[i]); - } - } + kvmppc_unmap_hpte(kvm, i, rmapp, gfn); unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } From 5e9859699aba74c0e297645e7d1734cd4b964de7 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:05 +1100 Subject: [PATCH 40/42] KVM: PPC: Book3S HV: Outline of KVM-HV HPT resizing implementation This adds a not yet working outline of the HPT resizing PAPR extension. Specifically it adds the necessary ioctl() functions, their basic steps, the work function which will handle preparation for the resize, and synchronization between these, the guest page fault path and guest HPT update path. The actual guts of the implementation isn't here yet, so for now the calls will always fail. 
Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 3 + arch/powerpc/include/asm/kvm_ppc.h | 4 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 191 ++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 25 ++++ 4 files changed, 223 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0aa0f22d775a..7bba8f415627 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -252,6 +252,8 @@ struct kvm_hpt_info { int cma; }; +struct kvm_resize_hpt; + struct kvm_arch { unsigned int lpid; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -276,6 +278,7 @@ struct kvm_arch { u64 process_table; struct dentry *debugfs_dir; struct dentry *htab_dentry; + struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */ #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE struct mutex hpt_mutex; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index cf3ef8d75910..dd11c4c8c56a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -215,6 +215,10 @@ extern void kvmppc_bookehv_exit(void); extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *); +extern long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt); +extern long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt); int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 6d70989686a7..323287fc0c01 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -40,9 +40,34 @@ #include "trace_hv.h" +//#define DEBUG_RESIZE_HPT 1 + +#ifdef DEBUG_RESIZE_HPT +#define resize_hpt_debug(resize, ...) 
\ + do { \ + printk(KERN_DEBUG "RESIZE HPT %p: ", resize); \ + printk(__VA_ARGS__); \ + } while (0) +#else +#define resize_hpt_debug(resize, ...) \ + do { } while (0) +#endif + static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, unsigned long *pte_idx_ret); + +struct kvm_resize_hpt { + /* These fields read-only after init */ + struct kvm *kvm; + struct work_struct work; + u32 order; + + /* These fields protected by kvm->lock */ + int error; + bool prepare_done; +}; + static void kvmppc_rmap_reset(struct kvm *kvm); int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) @@ -1179,6 +1204,172 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, srcu_read_unlock(&kvm->srcu, srcu_idx); } +/* + * HPT resizing + */ +static int resize_hpt_allocate(struct kvm_resize_hpt *resize) +{ + return 0; +} + +static int resize_hpt_rehash(struct kvm_resize_hpt *resize) +{ + return -EIO; +} + +static void resize_hpt_pivot(struct kvm_resize_hpt *resize) +{ +} + +static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) +{ + BUG_ON(kvm->arch.resize_hpt != resize); + kvm->arch.resize_hpt = NULL; + kfree(resize); +} + +static void resize_hpt_prepare_work(struct work_struct *work) +{ + struct kvm_resize_hpt *resize = container_of(work, + struct kvm_resize_hpt, + work); + struct kvm *kvm = resize->kvm; + int err; + + resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", + resize->order); + + err = resize_hpt_allocate(resize); + + mutex_lock(&kvm->lock); + + resize->error = err; + resize->prepare_done = true; + + mutex_unlock(&kvm->lock); +} + +long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt) +{ + unsigned long flags = rhpt->flags; + unsigned long shift = rhpt->shift; + struct kvm_resize_hpt *resize; + int ret; + + if (flags != 0) + return -EINVAL; + + if (shift && ((shift < 18) || (shift > 46))) + return 
-EINVAL; + + mutex_lock(&kvm->lock); + + resize = kvm->arch.resize_hpt; + + if (resize) { + if (resize->order == shift) { + /* Suitable resize in progress */ + if (resize->prepare_done) { + ret = resize->error; + if (ret != 0) + resize_hpt_release(kvm, resize); + } else { + ret = 100; /* estimated time in ms */ + } + + goto out; + } + + /* not suitable, cancel it */ + resize_hpt_release(kvm, resize); + } + + ret = 0; + if (!shift) + goto out; /* nothing to do */ + + /* start new resize */ + + resize = kzalloc(sizeof(*resize), GFP_KERNEL); + resize->order = shift; + resize->kvm = kvm; + INIT_WORK(&resize->work, resize_hpt_prepare_work); + kvm->arch.resize_hpt = resize; + + schedule_work(&resize->work); + + ret = 100; /* estimated time in ms */ + +out: + mutex_unlock(&kvm->lock); + return ret; +} + +static void resize_hpt_boot_vcpu(void *opaque) +{ + /* Nothing to do, just force a KVM exit */ +} + +long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt) +{ + unsigned long flags = rhpt->flags; + unsigned long shift = rhpt->shift; + struct kvm_resize_hpt *resize; + long ret; + + if (flags != 0) + return -EINVAL; + + if (shift && ((shift < 18) || (shift > 46))) + return -EINVAL; + + mutex_lock(&kvm->lock); + + resize = kvm->arch.resize_hpt; + + /* This shouldn't be possible */ + ret = -EIO; + if (WARN_ON(!kvm->arch.hpte_setup_done)) + goto out_no_hpt; + + /* Stop VCPUs from running while we mess with the HPT */ + kvm->arch.hpte_setup_done = 0; + smp_mb(); + + /* Boot all CPUs out of the guest so they re-read + * hpte_setup_done */ + on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); + + ret = -ENXIO; + if (!resize || (resize->order != shift)) + goto out; + + ret = -EBUSY; + if (!resize->prepare_done) + goto out; + + ret = resize->error; + if (ret != 0) + goto out; + + ret = resize_hpt_rehash(resize); + if (ret != 0) + goto out; + + resize_hpt_pivot(resize); + +out: + /* Let VCPUs run again */ + kvm->arch.hpte_setup_done = 1; + smp_mb(); 
+out_no_hpt: + resize_hpt_release(kvm, resize); + mutex_unlock(&kvm->lock); + return ret; +} + /* * Functions for reading and writing the hash table via reads and * writes on a file descriptor. diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index fbc901746304..1e107ece4e37 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3422,6 +3422,9 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvm->arch.lpcr = lpcr; + /* Initialization for future HPT resizes */ + kvm->arch.resize_hpt = NULL; + /* * Work out how many sets the TLB has, for the use of * the TLB invalidation loop in book3s_hv_rmhandlers.S. @@ -3721,6 +3724,28 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, break; } + case KVM_PPC_RESIZE_HPT_PREPARE: { + struct kvm_ppc_resize_hpt rhpt; + + r = -EFAULT; + if (copy_from_user(&rhpt, argp, sizeof(rhpt))) + break; + + r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt); + break; + } + + case KVM_PPC_RESIZE_HPT_COMMIT: { + struct kvm_ppc_resize_hpt rhpt; + + r = -EFAULT; + if (copy_from_user(&rhpt, argp, sizeof(rhpt))) + break; + + r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt); + break; + } + default: r = -ENOTTY; } From b5baa68773150772c275b4af1bb31327200cfc05 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:06 +1100 Subject: [PATCH 41/42] KVM: PPC: Book3S HV: KVM-HV HPT resizing implementation This adds the "guts" of the implementation for the HPT resizing PAPR extension. It has the code to allocate and clear a new HPT, rehash an existing HPT's entries into it, and accomplish the switchover for a KVM guest from the old HPT to the new one. 
Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 188 +++++++++++++++++++++++++++- 1 file changed, 187 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 323287fc0c01..013552f05182 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -66,6 +66,10 @@ struct kvm_resize_hpt { /* These fields protected by kvm->lock */ int error; bool prepare_done; + + /* Private to the work thread, until prepare_done is true, + * then protected by kvm->resize_hpt_sem */ + struct kvm_hpt_info hpt; }; static void kvmppc_rmap_reset(struct kvm *kvm); @@ -1209,21 +1213,203 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, */ static int resize_hpt_allocate(struct kvm_resize_hpt *resize) { + int rc; + + rc = kvmppc_allocate_hpt(&resize->hpt, resize->order); + if (rc < 0) + return rc; + + resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n", + resize->hpt.virt); + return 0; } +static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, + unsigned long idx) +{ + struct kvm *kvm = resize->kvm; + struct kvm_hpt_info *old = &kvm->arch.hpt; + struct kvm_hpt_info *new = &resize->hpt; + unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1; + unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1; + __be64 *hptep, *new_hptep; + unsigned long vpte, rpte, guest_rpte; + int ret; + struct revmap_entry *rev; + unsigned long apsize, psize, avpn, pteg, hash; + unsigned long new_idx, new_pteg, replace_vpte; + + hptep = (__be64 *)(old->virt + (idx << 4)); + + /* Guest is stopped, so new HPTEs can't be added or faulted + * in, only unmapped or altered by host actions. 
So, it's + * safe to check this before we take the HPTE lock */ + vpte = be64_to_cpu(hptep[0]); + if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) + return 0; /* nothing to do */ + + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + + vpte = be64_to_cpu(hptep[0]); + + ret = 0; + if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) + /* Nothing to do */ + goto out; + + /* Unmap */ + rev = &old->rev[idx]; + guest_rpte = rev->guest_rpte; + + ret = -EIO; + apsize = hpte_page_size(vpte, guest_rpte); + if (!apsize) + goto out; + + if (vpte & HPTE_V_VALID) { + unsigned long gfn = hpte_rpn(guest_rpte, apsize); + int srcu_idx = srcu_read_lock(&kvm->srcu); + struct kvm_memory_slot *memslot = + __gfn_to_memslot(kvm_memslots(kvm), gfn); + + if (memslot) { + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; + + lock_rmap(rmapp); + kvmppc_unmap_hpte(kvm, idx, rmapp, gfn); + unlock_rmap(rmapp); + } + + srcu_read_unlock(&kvm->srcu, srcu_idx); + } + + /* Reload PTE after unmap */ + vpte = be64_to_cpu(hptep[0]); + + BUG_ON(vpte & HPTE_V_VALID); + BUG_ON(!(vpte & HPTE_V_ABSENT)); + + ret = 0; + if (!(vpte & HPTE_V_BOLTED)) + goto out; + + rpte = be64_to_cpu(hptep[1]); + psize = hpte_base_page_size(vpte, rpte); + avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23); + pteg = idx / HPTES_PER_GROUP; + if (vpte & HPTE_V_SECONDARY) + pteg = ~pteg; + + if (!(vpte & HPTE_V_1TB_SEG)) { + unsigned long offset, vsid; + + /* We only have 28 - 23 bits of offset in avpn */ + offset = (avpn & 0x1f) << 23; + vsid = avpn >> 5; + /* We can find more bits from the pteg value */ + if (psize < (1ULL << 23)) + offset |= ((vsid ^ pteg) & old_hash_mask) * psize; + + hash = vsid ^ (offset / psize); + } else { + unsigned long offset, vsid; + + /* We only have 40 - 23 bits of seg_off in avpn */ + offset = (avpn & 0x1ffff) << 23; + vsid = avpn >> 17; + if (psize < (1ULL << 23)) + offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize; + + hash = vsid ^ 
(vsid << 25) ^ (offset / psize); + } + + new_pteg = hash & new_hash_mask; + if (vpte & HPTE_V_SECONDARY) { + BUG_ON(~pteg != (hash & old_hash_mask)); + new_pteg = ~new_pteg; + } else { + BUG_ON(pteg != (hash & old_hash_mask)); + } + + new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP); + new_hptep = (__be64 *)(new->virt + (new_idx << 4)); + + replace_vpte = be64_to_cpu(new_hptep[0]); + + if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { + BUG_ON(new->order >= old->order); + + if (replace_vpte & HPTE_V_BOLTED) { + if (vpte & HPTE_V_BOLTED) + /* Bolted collision, nothing we can do */ + ret = -ENOSPC; + /* Discard the new HPTE */ + goto out; + } + + /* Discard the previous HPTE */ + } + + new_hptep[1] = cpu_to_be64(rpte); + new->rev[new_idx].guest_rpte = guest_rpte; + /* No need for a barrier, since new HPT isn't active */ + new_hptep[0] = cpu_to_be64(vpte); + unlock_hpte(new_hptep, vpte); + +out: + unlock_hpte(hptep, vpte); + return ret; +} + static int resize_hpt_rehash(struct kvm_resize_hpt *resize) { - return -EIO; + struct kvm *kvm = resize->kvm; + unsigned long i; + int rc; + + for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) { + rc = resize_hpt_rehash_hpte(resize, i); + if (rc != 0) + return rc; + } + + return 0; } static void resize_hpt_pivot(struct kvm_resize_hpt *resize) { + struct kvm *kvm = resize->kvm; + struct kvm_hpt_info hpt_tmp; + + /* Exchange the pending tables in the resize structure with + * the active tables */ + + resize_hpt_debug(resize, "resize_hpt_pivot()\n"); + + spin_lock(&kvm->mmu_lock); + asm volatile("ptesync" : : : "memory"); + + hpt_tmp = kvm->arch.hpt; + kvmppc_set_hpt(kvm, &resize->hpt); + resize->hpt = hpt_tmp; + + spin_unlock(&kvm->mmu_lock); + + synchronize_srcu_expedited(&kvm->srcu); + + resize_hpt_debug(resize, "resize_hpt_pivot() done\n"); } static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) { BUG_ON(kvm->arch.resize_hpt != resize); + + if (resize->hpt.virt) + 
kvmppc_free_hpt(&resize->hpt); + kvm->arch.resize_hpt = NULL; kfree(resize); } From 050f23390f6bdbfa7dd2800884d32490489851b7 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:07 +1100 Subject: [PATCH 42/42] KVM: PPC: Book3S HV: Advertise availability of HPT resizing on KVM HV This updates the KVM_CAP_SPAPR_RESIZE_HPT capability to advertise the presence of in-kernel HPT resizing on KVM HV. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/powerpc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 40a5b2d75ed1..2b3e4e620078 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -612,6 +612,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPAPR_MULTITCE: r = 1; break; + case KVM_CAP_SPAPR_RESIZE_HPT: + r = !!hv_enabled; + break; #endif case KVM_CAP_PPC_HTM: r = cpu_has_feature(CPU_FTR_TM_COMP) &&