// SPDX-License-Identifier: GPL-2.0
/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/pvclock_gtod.h>
#include <linux/timekeeper_internal.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

/* Minimum amount of time until the next clock event fires, in nanoseconds */
#define TIMER_SLOP 100000

static u64 xen_sched_clock_offset __read_mostly;

/* Get the TSC speed from Xen */
static unsigned long xen_tsc_khz(void)
{
        struct pvclock_vcpu_time_info *info =
                &HYPERVISOR_shared_info->vcpu_info[0].time;

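        /* Xen reports a reliable TSC frequency, so skip the native TSC recalibration. */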
        setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
        return pvclock_tsc_khz(info);
}

static u64 xen_clocksource_read(void)
{
        struct pvclock_vcpu_time_info *src;
        u64 ret;

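        /* Prevent preemption (and thus CPU migration) while reading this vCPU's pvclock data. */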
        preempt_disable_notrace();
        src = &__this_cpu_read(xen_vcpu)->time;
        ret = pvclock_clocksource_read(src);
        preempt_enable_notrace();
        return ret;
}

static u64 xen_clocksource_get_cycles(struct clocksource *cs)
{
        return xen_clocksource_read();
}

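/* sched_clock counts ns from the point where xen_init_time_common() sampled the offset. */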
static u64 xen_sched_clock(void)
{
        return xen_clocksource_read() - xen_sched_clock_offset;
}

static void xen_read_wallclock(struct timespec64 *ts)
{
        struct shared_info *s = HYPERVISOR_shared_info;
        struct pvclock_wall_clock *wall_clock = &(s->wc);
        struct pvclock_vcpu_time_info *vcpu_time;

        vcpu_time = &get_cpu_var(xen_vcpu)->time;
        pvclock_read_wallclock(wall_clock, vcpu_time, ts);
        put_cpu_var(xen_vcpu);
}

static void xen_get_wallclock(struct timespec64 *now)
{
        xen_read_wallclock(now);
}

static int xen_set_wallclock(const struct timespec64 *now)
{
        return -ENODEV;
}

static int xen_pvclock_gtod_notify(struct notifier_block *nb,
                                   unsigned long was_set, void *priv)
{
        /* Protected by the calling core code serialization */
        static struct timespec64 next_sync;

        struct xen_platform_op op;
        struct timespec64 now;
        struct timekeeper *tk = priv;
        static bool settime64_supported = true;
        int ret;

        now.tv_sec = tk->xtime_sec;
        now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);

        /*
         * We only take the expensive HV call when the clock was set
         * or when the 11 minute RTC synchronization period has elapsed.
         */
        if (!was_set && timespec64_compare(&now, &next_sync) < 0)
                return NOTIFY_OK;

again:
        if (settime64_supported) {
                op.cmd = XENPF_settime64;
                op.u.settime64.mbz = 0;
                op.u.settime64.secs = now.tv_sec;
                op.u.settime64.nsecs = now.tv_nsec;
                op.u.settime64.system_time = xen_clocksource_read();
        } else {
                op.cmd = XENPF_settime32;
                op.u.settime32.secs = now.tv_sec;
                op.u.settime32.nsecs = now.tv_nsec;
                op.u.settime32.system_time = xen_clocksource_read();
        }

        ret = HYPERVISOR_platform_op(&op);

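        /* Older hypervisors lack XENPF_settime64; fall back to the 32-bit call once. */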
        if (ret == -ENOSYS && settime64_supported) {
                settime64_supported = false;
                goto again;
        }
        if (ret < 0)
                return NOTIFY_BAD;

        /*
         * Move the next drift compensation time 11 minutes
         * ahead. That's emulating the sync_cmos_clock() update for
         * the hardware RTC.
         */
        next_sync = now;
        next_sync.tv_sec += 11 * 60;

        return NOTIFY_OK;
}

static struct notifier_block xen_pvclock_gtod_notifier = {
        .notifier_call = xen_pvclock_gtod_notify,
};

static int xen_cs_enable(struct clocksource *cs)
{
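        /* Record that the pvclock vDSO clock mode is in use. */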
        vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
        return 0;
}

static struct clocksource xen_clocksource __read_mostly = {
        .name = "xen",
        .rating = 400,
        .read = xen_clocksource_get_cycles,
        .mask = CLOCKSOURCE_MASK(64),
        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
        .enable = xen_cs_enable,
};

/*
 Xen clockevent implementation

 Xen has two clockevent implementations:

 The old timer_op one works with all released versions of Xen prior
 to version 3.0.4.  This version of the hypervisor provides a
 single-shot timer with nanosecond resolution.  However, sharing the
 same event channel is a 100Hz tick which is delivered while the
 vcpu is running.  We don't care about or use this tick, but it will
 cause the core time code to think the timer fired too soon, and
 will end up resetting it each time.  It could be filtered, but
 doing so has complications when the ktime clocksource is not yet
 the xen clocksource (ie, at boot time).

 The new vcpu_op-based timer interface allows the tick timer period
 to be changed or turned off.  The tick timer is not useful as a
 periodic timer because events are only delivered to running vcpus.
 The one-shot timer can report when a timeout is in the past, so
 set_next_event is capable of returning -ETIME when appropriate.
 This interface is used when available.
*/


/*
 Get a hypervisor absolute time.  In theory we could maintain an
 offset between the kernel's time and the hypervisor's time, and
 apply that to a kernel's absolute timeout.  Unfortunately the
 hypervisor and kernel times can drift even if the kernel is using
 the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
        return xen_clocksource_read() + delta;
}

static int xen_timerop_shutdown(struct clock_event_device *evt)
{
        /* cancel timeout */
        HYPERVISOR_set_timer_op(0);

        return 0;
}

static int xen_timerop_set_next_event(unsigned long delta,
                                      struct clock_event_device *evt)
{
        WARN_ON(!clockevent_state_oneshot(evt));

        if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
                BUG();

        /* We may have missed the deadline, but there's no real way of
           knowing for sure.  If the event was in the past, then we'll
           get an immediate interrupt. */

        return 0;
}

static struct clock_event_device xen_timerop_clockevent __ro_after_init = {
        .name = "xen",
        .features = CLOCK_EVT_FEAT_ONESHOT,

        .max_delta_ns = 0xffffffff,
        .max_delta_ticks = 0xffffffff,
        .min_delta_ns = TIMER_SLOP,
        .min_delta_ticks = TIMER_SLOP,

        .mult = 1,
        .shift = 0,
        .rating = 500,

        .set_state_shutdown = xen_timerop_shutdown,
        .set_next_event = xen_timerop_set_next_event,
};

static int xen_vcpuop_shutdown(struct clock_event_device *evt)
{
        int cpu = smp_processor_id();

        if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu),
                               NULL) ||
            HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
                               NULL))
                BUG();

        return 0;
}

static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
{
        int cpu = smp_processor_id();

        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
                               NULL))
                BUG();

        return 0;
}

static int xen_vcpuop_set_next_event(unsigned long delta,
                                     struct clock_event_device *evt)
{
        int cpu = smp_processor_id();
        struct vcpu_set_singleshot_timer single;
        int ret;

        WARN_ON(!clockevent_state_oneshot(evt));

        single.timeout_abs_ns = get_abs_timeout(delta);
        /* Get an event anyway, even if the timeout is already expired */
        single.flags = 0;

        ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu),
                                 &single);
        BUG_ON(ret != 0);

        return ret;
}

static struct clock_event_device xen_vcpuop_clockevent __ro_after_init = {
        .name = "xen",
        .features = CLOCK_EVT_FEAT_ONESHOT,

        .max_delta_ns = 0xffffffff,
        .max_delta_ticks = 0xffffffff,
        .min_delta_ns = TIMER_SLOP,
        .min_delta_ticks = TIMER_SLOP,

        .mult = 1,
        .shift = 0,
        .rating = 500,

        .set_state_shutdown = xen_vcpuop_shutdown,
        .set_state_oneshot = xen_vcpuop_set_oneshot,
        .set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
        &xen_timerop_clockevent;

struct xen_clock_event_device {
        struct clock_event_device evt;
        char name[16];
};
static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
        struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
        irqreturn_t ret;

        ret = IRQ_NONE;
        if (evt->event_handler) {
                evt->event_handler(evt);
                ret = IRQ_HANDLED;
        }

        return ret;
}

void xen_teardown_timer(int cpu)
{
        struct clock_event_device *evt;
        evt = &per_cpu(xen_clock_events, cpu).evt;

        if (evt->irq >= 0) {
                unbind_from_irqhandler(evt->irq, NULL);
                evt->irq = -1;
        }
}

void xen_setup_timer(int cpu)
{
        struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
        struct clock_event_device *evt = &xevt->evt;
        int irq;

        WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
        if (evt->irq >= 0)
                xen_teardown_timer(cpu);

        printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

        snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);

        irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
                                      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
                                      IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
                                      xevt->name, NULL);
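        /* Give the timer event channel the highest Xen event priority. */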
        (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);

        memcpy(evt, xen_clockevent, sizeof(*evt));

        evt->cpumask = cpumask_of(cpu);
        evt->irq = irq;
}


void xen_setup_cpu_clockevents(void)
{
        clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
}

void xen_timer_resume(void)
{
        int cpu;

        if (xen_clockevent != &xen_vcpuop_clockevent)
                return;

        for_each_online_cpu(cpu) {
                if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
                                       xen_vcpu_nr(cpu), NULL))
                        BUG();
        }
}

static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
static u64 xen_clock_value_saved;

void xen_save_time_memory_area(void)
{
        struct vcpu_register_time_memory_area t;
        int ret;

        xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;

        if (!xen_clock)
                return;

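        /* Registering a NULL address unregisters the secondary time info area. */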
        t.addr.v = NULL;

        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
        if (ret != 0)
                pr_notice("Cannot save secondary vcpu_time_info (err %d)",
                          ret);
        else
                clear_page(xen_clock);
}

void xen_restore_time_memory_area(void)
{
        struct vcpu_register_time_memory_area t;
        int ret;

        if (!xen_clock)
                goto out;

        t.addr.v = &xen_clock->pvti;

        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);

        /*
         * We don't disable VDSO_CLOCKMODE_PVCLOCK entirely if we fail to
         * register the secondary time info with Xen or if we migrated to a
         * host without the necessary flags. In both of these cases a
         * process either sees a zeroed out pvti or does not see
         * PVCLOCK_TSC_STABLE_BIT set. Userspace checks the latter and,
         * if it is 0, discards the data in pvti and falls back to a
         * system call for a reliable timestamp.
         */
        if (ret != 0)
                pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
                          ret);

out:
        /* Need pvclock_resume() before using xen_clocksource_read(). */
        pvclock_resume();
        xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
}

static void xen_setup_vsyscall_time_info(void)
{
        struct vcpu_register_time_memory_area t;
        struct pvclock_vsyscall_time_info *ti;
        int ret;

        ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
        if (!ti)
                return;

        t.addr.v = &ti->pvti;

        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
        if (ret) {
                pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (err %d)\n", ret);
                free_page((unsigned long)ti);
                return;
        }

        /*
         * If the primary time info had this bit set, the secondary one
         * should have it too, since it's the same data in both, just in
         * different memory regions. But we still check it in case the
         * hypervisor is buggy.
         */
        if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
                t.addr.v = NULL;
                ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
                                         0, &t);
                if (!ret)
                        free_page((unsigned long)ti);

                pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (tsc unstable)\n");
                return;
        }

        xen_clock = ti;
        pvclock_set_pvti_cpu0_va(xen_clock);

        xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}

static void __init xen_time_init(void)
{
        struct pvclock_vcpu_time_info *pvti;
        int cpu = smp_processor_id();
        struct timespec64 tp;

        /* As Dom0 is never moved, no penalty on using TSC there */
        if (xen_initial_domain())
                xen_clocksource.rating = 275;

        clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);

        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
                               NULL) == 0) {
                /* Successfully turned off 100Hz tick, so we have the
                   vcpuop-based timer interface */
                printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
                xen_clockevent = &xen_vcpuop_clockevent;
        }

        /* Set initial system time with full resolution */
        xen_read_wallclock(&tp);
        do_settimeofday64(&tp);

        setup_force_cpu_cap(X86_FEATURE_TSC);

        /*
         * Check ahead of time on the primary time info whether the
         * TSC-stable bit is set; if so, the faster vsyscall path for
         * the Xen clocksource can be set up as well.
         */
        pvti = &__this_cpu_read(xen_vcpu)->time;
        if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
                pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
                xen_setup_vsyscall_time_info();
        }

        xen_setup_runstate_info(cpu);
        xen_setup_timer(cpu);
        xen_setup_cpu_clockevents();

        xen_time_setup_guest();

        if (xen_initial_domain())
                pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}

static void __init xen_init_time_common(void)
{
        xen_sched_clock_offset = xen_clocksource_read();
        static_call_update(pv_steal_clock, xen_steal_clock);
        paravirt_set_sched_clock(xen_sched_clock);

        x86_platform.calibrate_tsc = xen_tsc_khz;
        x86_platform.get_wallclock = xen_get_wallclock;
}

void __init xen_init_time_ops(void)
{
        xen_init_time_common();

        x86_init.timers.timer_init = xen_time_init;
        x86_init.timers.setup_percpu_clockev = x86_init_noop;
        x86_cpuinit.setup_percpu_clockev = x86_init_noop;

        /* Dom0 uses the native method to set the hardware RTC. */
        if (!xen_initial_domain())
                x86_platform.set_wallclock = xen_set_wallclock;
}

#ifdef CONFIG_XEN_PVHVM
static void xen_hvm_setup_cpu_clockevents(void)
{
        int cpu = smp_processor_id();
        xen_setup_runstate_info(cpu);
        /*
         * xen_setup_timer(cpu) is not called here - snprintf is bad in atomic
         * context. Hence it is done in xen_hvm_cpu_notify (which gets called
         * by smp_init during early bootup and also during CPU hotplug events).
         */
        xen_setup_cpu_clockevents();
}

void __init xen_hvm_init_time_ops(void)
{
        static bool hvm_time_initialized;

        if (hvm_time_initialized)
                return;

        /*
         * vector callback is needed otherwise we cannot receive interrupts
         * on cpu > 0 and at this point we don't know how many cpus are
         * available.
         */
        if (!xen_have_vector_callback)
                return;

        if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
                pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer");
                return;
        }

        /*
         * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'.
         * __this_cpu_read(xen_vcpu) is still NULL when a Xen HVM guest
         * boots on a vcpu >= MAX_VIRT_CPUS (e.g., kexec), so accessing
         * __this_cpu_read(xen_vcpu) via xen_clocksource_read() would panic.
         *
         * xen_hvm_init_time_ops() should be called again later, after
         * __this_cpu_read(xen_vcpu) is available.
         */
        if (!__this_cpu_read(xen_vcpu)) {
                pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n",
                        xen_vcpu_nr(0));
                return;
        }

        xen_init_time_common();

        x86_init.timers.setup_percpu_clockev = xen_time_init;
        x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;

        x86_platform.set_wallclock = xen_set_wallclock;

        hvm_time_initialized = true;
}
#endif

/* Kernel parameter to specify the Xen timer slop, in nanoseconds */
static int __init parse_xen_timer_slop(char *ptr)
{
        unsigned long slop = memparse(ptr, NULL);

        xen_timerop_clockevent.min_delta_ns = slop;
        xen_timerop_clockevent.min_delta_ticks = slop;
        xen_vcpuop_clockevent.min_delta_ns = slop;
        xen_vcpuop_clockevent.min_delta_ticks = slop;

        return 0;
}
early_param("xen_timer_slop", parse_xen_timer_slop);