// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 1999, 2023
 */

#include <linux/cpuhotplug.h>
#include <linux/sched/task.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <asm/asm-extable.h>
#include <asm/pfault.h>
#include <asm/diag.h>

#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000UL

/*
 * 'pfault' pseudo page faults routines.
 */

/*
 * Non-zero when pfault handling must not be used: set by the "nopfault"
 * early parameter and by pfault_irq_init() when its setup fails. Checked by
 * __pfault_init() and __pfault_fini() before they issue the diag.
 */
static int pfault_disable;

/* Handler for the "nopfault" early parameter: switch pfault handling off. */
static int __init nopfault(char *str)
{
	pfault_disable = 1;
	return 1;
}

early_param("nopfault", nopfault);

/*
 * Parameter block whose address is handed to the diag 0x258 instruction in
 * __pfault_init() and __pfault_fini().
 * NOTE(review): the meaning of the individual fields is defined by the
 * hypervisor interface and is not visible in this file - confirm against
 * the DIAGNOSE 0x258 specification before changing any value.
 */
struct pfault_refbk {
	u16 refdiagc;	/* 0x258 in both parameter blocks below */
	u16 reffcode;	/* 0 in the init block, 1 in the fini block */
	u16 refdwlen;
	u16 refversn;
	u64 refgaddr;
	u64 refselmk;
	u64 refcmpmk;
	u64 reserved;
};

/*
 * Parameter block used by __pfault_init() (reffcode 0).
 * NOTE(review): refgaddr presumably names the lowcore LPP field whose value
 * comes back as the interrupt token; pfault_interrupt() extracts a pid from
 * the token with LPP_PID_MASK - confirm.
 */
static struct pfault_refbk pfault_init_refbk = {
	.refdiagc = 0x258,
	.reffcode = 0,
	.refdwlen = 5,
	.refversn = 2,
	.refgaddr = __LC_LPP,
	.refselmk = 1UL << 48,
	.refcmpmk = 1UL << 48,
	.reserved = __PF_RES_FIELD
};

/*
 * Issue diag 0x258 with the init parameter block.
 *
 * Returns -EOPNOTSUPP without issuing the diag when pfault_disable is set.
 * Otherwise returns whatever the diag leaves in rc; rc is an in/out operand
 * preloaded with -EOPNOTSUPP, so that value is returned unchanged if the
 * instruction does not update it.
 */
int __pfault_init(void)
{
	int rc = -EOPNOTSUPP;

	if (pfault_disable)
		return rc;
	diag_stat_inc(DIAG_STAT_X258);
	/*
	 * NOTE(review): the EX_TABLE entry presumably lets an exception raised
	 * by the diag resume at label 0 with rc still at its preset value -
	 * confirm against the s390 extable handling.
	 */
	asm volatile(
		" diag %[refbk],%[rc],0x258\n"
		"0: nopr %%r7\n"
		EX_TABLE(0b, 0b)
		: [rc] "+d" (rc)
		: [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk)
		: "cc");
	return rc;
}

/*
 * Parameter block used by __pfault_fini() (reffcode 1). The fields that are
 * not listed here are zero-initialized.
 */
static struct pfault_refbk pfault_fini_refbk = {
	.refdiagc = 0x258,
	.reffcode = 1,
	.refdwlen = 5,
	.refversn = 2,
};

/*
 * Issue diag 0x258 with the fini parameter block. Does nothing when
 * pfault_disable is set. No result is collected from the diag.
 */
void __pfault_fini(void)
{
	if (pfault_disable)
		return;
	diag_stat_inc(DIAG_STAT_X258);
	asm volatile(
		" diag %[refbk],0,0x258\n"
		"0: nopr %%r7\n"
		EX_TABLE(0b, 0b)
		:
		: [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk)
		: "cc");
}

/*
 * pfault_list holds the thread_structs of tasks that got an initial pfault
 * interrupt and are waiting for the completion interrupt. pfault_lock is
 * held by pfault_interrupt() and pfault_cpu_dead() while they change the
 * list and thread.pfault_wait.
 */
static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);

/*
 * Bit of the external interrupt subcode tested by pfault_interrupt():
 * set for a completion interrupt, clear for an initial interrupt.
 */
#define PF_COMPLETE 0x0080

/*
 * The mechanism of our pfault code: if Linux is running as guest, runs a user
 * space process and the user space process accesses a page that the host has
 * paged out we get a pfault interrupt.
 *
 * This allows us, within the guest, to schedule a different process. Without
 * this mechanism the host would have to suspend the whole virtual cpu until
 * the page has been paged in.
 *
 * So when we get such an interrupt then we set the state of the current task
 * to uninterruptible and also set the need_resched flag. Both happens within
 * interrupt context(!). If we later on want to return to user space we
 * recognize the need_resched flag and then call schedule(). It's not very
 * obvious how this works...
* * Of course we have a lot of additional fun with the completion interrupt (-> * host signals that a page of a process has been paged in and the process can * continue to run). This interrupt can arrive on any cpu and, since we have * virtual cpus, actually appear before the interrupt that signals that a page * is missing. */ static void pfault_interrupt(struct ext_code ext_code, unsigned int param32, unsigned long param64) { struct task_struct *tsk; __u16 subcode; pid_t pid; /* * Get the external interruption subcode & pfault initial/completion * signal bit. VM stores this in the 'cpu address' field associated * with the external interrupt. */ subcode = ext_code.subcode; if ((subcode & 0xff00) != __SUBCODE_MASK) return; inc_irq_stat(IRQEXT_PFL); /* Get the token (= pid of the affected task). */ pid = param64 & LPP_PID_MASK; rcu_read_lock(); tsk = find_task_by_pid_ns(pid, &init_pid_ns); if (tsk) get_task_struct(tsk); rcu_read_unlock(); if (!tsk) return; spin_lock(&pfault_lock); if (subcode & PF_COMPLETE) { /* signal bit is set -> a page has been swapped in by VM */ if (tsk->thread.pfault_wait == 1) { /* * Initial interrupt was faster than the completion * interrupt. pfault_wait is valid. Set pfault_wait * back to zero and wake up the process. This can * safely be done because the task is still sleeping * and can't produce new pfaults. */ tsk->thread.pfault_wait = 0; list_del(&tsk->thread.list); wake_up_process(tsk); put_task_struct(tsk); } else { /* * Completion interrupt was faster than initial * interrupt. Set pfault_wait to -1 so the initial * interrupt doesn't put the task to sleep. * If the task is not running, ignore the completion * interrupt since it must be a leftover of a PFAULT * CANCEL operation which didn't remove all pending * completion interrupts. */ if (task_is_running(tsk)) tsk->thread.pfault_wait = -1; } } else { /* signal bit not set -> a real page is missing. 
*/ if (WARN_ON_ONCE(tsk != current)) goto out; if (tsk->thread.pfault_wait == 1) { /* Already on the list with a reference: put to sleep */ goto block; } else if (tsk->thread.pfault_wait == -1) { /* * Completion interrupt was faster than the initial * interrupt (pfault_wait == -1). Set pfault_wait * back to zero and exit. */ tsk->thread.pfault_wait = 0; } else { /* * Initial interrupt arrived before completion * interrupt. Let the task sleep. * An extra task reference is needed since a different * cpu may set the task state to TASK_RUNNING again * before the scheduler is reached. */ get_task_struct(tsk); tsk->thread.pfault_wait = 1; list_add(&tsk->thread.list, &pfault_list); block: /* * Since this must be a userspace fault, there * is no kernel task state to trample. Rely on the * return to userspace schedule() to block. */ __set_current_state(TASK_UNINTERRUPTIBLE); set_tsk_need_resched(tsk); set_preempt_need_resched(); } } out: spin_unlock(&pfault_lock); put_task_struct(tsk); } static int pfault_cpu_dead(unsigned int cpu) { struct thread_struct *thread, *next; struct task_struct *tsk; spin_lock_irq(&pfault_lock); list_for_each_entry_safe(thread, next, &pfault_list, list) { thread->pfault_wait = 0; list_del(&thread->list); tsk = container_of(thread, struct task_struct, thread); wake_up_process(tsk); put_task_struct(tsk); } spin_unlock_irq(&pfault_lock); return 0; } static int __init pfault_irq_init(void) { int rc; rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); if (rc) goto out_extint; rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; if (rc) goto out_pfault; irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", NULL, pfault_cpu_dead); return 0; out_pfault: unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); out_extint: pfault_disable = 1; return rc; } early_initcall(pfault_irq_init); |