Loading...
Note: File does not exist in v3.1.
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Virtual PTP 1588 clock for use with LM-safe VMclock device.
4 *
5 * Copyright © 2024 Amazon.com, Inc. or its affiliates.
6 */
7
8#include <linux/acpi.h>
9#include <linux/device.h>
10#include <linux/err.h>
11#include <linux/file.h>
12#include <linux/fs.h>
13#include <linux/init.h>
14#include <linux/kernel.h>
15#include <linux/miscdevice.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <linux/platform_device.h>
19#include <linux/slab.h>
20
21#include <uapi/linux/vmclock-abi.h>
22
23#include <linux/ptp_clock_kernel.h>
24
25#ifdef CONFIG_X86
26#include <asm/pvclock.h>
27#include <asm/kvmclock.h>
28#endif
29
30#ifdef CONFIG_KVM_GUEST
31#define SUPPORT_KVMCLOCK
32#endif
33
/* Allocator for the per-device index that forms the "vmclock%d" name. */
static DEFINE_IDA(vmclock_ida);

ACPI_MODULE_NAME("vmclock");
37
38struct vmclock_state {
39 struct resource res;
40 struct vmclock_abi *clk;
41 struct miscdevice miscdev;
42 struct ptp_clock_info ptp_clock_info;
43 struct ptp_clock *ptp_clock;
44 enum clocksource_ids cs_id, sys_cs_id;
45 int index;
46 char *name;
47};
48
/* Upper bound on how long readers keep retrying while seq_count changes. */
#define VMCLOCK_MAX_WAIT ms_to_ktime(100)

/* Require at least the flags field to be present. All else can be optional. */
#define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad)

/*
 * True when the size advertised in (_c)->size is large enough to contain
 * the whole of field _f of struct vmclock_abi.
 */
#define VMCLOCK_FIELD_PRESENT(_c, _f)			  \
	(le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) + \
				     sizeof((_c)->_f)))
57
/*
 * Compute ((delta * period) >> shift) + frac_sec using 128-bit arithmetic.
 *
 * 'period' is a tick period in units of (seconds >> 64), so the 128-bit
 * result holds whole seconds in its upper 64 bits and the fraction of a
 * second, again in units of (seconds >> 64), in its lower 64 bits.
 *
 * @res_hi:   receives the upper 64 bits (seconds) of the result
 * @delta:    counter ticks to convert
 * @period:   tick period, as above
 * @shift:    right shift applied to the 128-bit product
 * @frac_sec: fractional second of the reference time, added after the shift
 *
 * Returns the lower 64 bits (fractional second) of the result.
 */
static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta,
					uint64_t period, uint8_t shift,
					uint64_t frac_sec)
{
	unsigned __int128 product = (unsigned __int128)delta * period;
	unsigned __int128 total = (product >> shift) + frac_sec;

	*res_hi = (uint64_t)(total >> 64);

	return (uint64_t)total;
}
76
77static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec)
78{
79 if (likely(clk->time_type == VMCLOCK_TIME_UTC))
80 return true;
81
82 if (clk->time_type == VMCLOCK_TIME_TAI &&
83 (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) {
84 if (sec)
85 *sec += (int16_t)le16_to_cpu(clk->tai_offset_sec);
86 return true;
87 }
88 return false;
89}
90
91static int vmclock_get_crosststamp(struct vmclock_state *st,
92 struct ptp_system_timestamp *sts,
93 struct system_counterval_t *system_counter,
94 struct timespec64 *tspec)
95{
96 ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
97 struct system_time_snapshot systime_snapshot;
98 uint64_t cycle, delta, seq, frac_sec;
99
100#ifdef CONFIG_X86
101 /*
102 * We'd expect the hypervisor to know this and to report the clock
103 * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid.
104 */
105 if (check_tsc_unstable())
106 return -EINVAL;
107#endif
108
109 while (1) {
110 seq = le32_to_cpu(st->clk->seq_count) & ~1ULL;
111
112 /*
113 * This pairs with a write barrier in the hypervisor
114 * which populates this structure.
115 */
116 virt_rmb();
117
118 if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE)
119 return -EINVAL;
120
121 /*
122 * When invoked for gettimex64(), fill in the pre/post system
123 * times. The simple case is when system time is based on the
124 * same counter as st->cs_id, in which case all three times
125 * will be derived from the *same* counter value.
126 *
127 * If the system isn't using the same counter, then the value
128 * from ktime_get_snapshot() will still be used as pre_ts, and
129 * ptp_read_system_postts() is called to populate postts after
130 * calling get_cycles().
131 *
132 * The conversion to timespec64 happens further down, outside
133 * the seq_count loop.
134 */
135 if (sts) {
136 ktime_get_snapshot(&systime_snapshot);
137 if (systime_snapshot.cs_id == st->cs_id) {
138 cycle = systime_snapshot.cycles;
139 } else {
140 cycle = get_cycles();
141 ptp_read_system_postts(sts);
142 }
143 } else {
144 cycle = get_cycles();
145 }
146
147 delta = cycle - le64_to_cpu(st->clk->counter_value);
148
149 frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta,
150 le64_to_cpu(st->clk->counter_period_frac_sec),
151 st->clk->counter_period_shift,
152 le64_to_cpu(st->clk->time_frac_sec));
153 tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64);
154 tspec->tv_sec += le64_to_cpu(st->clk->time_sec);
155
156 if (!tai_adjust(st->clk, &tspec->tv_sec))
157 return -EINVAL;
158
159 /*
160 * This pairs with a write barrier in the hypervisor
161 * which populates this structure.
162 */
163 virt_rmb();
164 if (seq == le32_to_cpu(st->clk->seq_count))
165 break;
166
167 if (ktime_after(ktime_get(), deadline))
168 return -ETIMEDOUT;
169 }
170
171 if (system_counter) {
172 system_counter->cycles = cycle;
173 system_counter->cs_id = st->cs_id;
174 }
175
176 if (sts) {
177 sts->pre_ts = ktime_to_timespec64(systime_snapshot.real);
178 if (systime_snapshot.cs_id == st->cs_id)
179 sts->post_ts = sts->pre_ts;
180 }
181
182 return 0;
183}
184
#ifdef SUPPORT_KVMCLOCK
/*
 * In the case where the system is using the KVM clock for timekeeping, convert
 * the TSC value into a KVM clock time in order to return a paired reading that
 * get_device_system_crosststamp() can cope with.
 *
 * @system_counter must be non-NULL: on success its cycles field is replaced
 * by the KVM clock value derived from the TSC reading and its cs_id is set to
 * CSID_X86_KVM_CLK. @sts and @tspec are passed through to
 * vmclock_get_crosststamp().
 *
 * Returns whatever vmclock_get_crosststamp() returned.
 */
static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st,
					    struct ptp_system_timestamp *sts,
					    struct system_counterval_t *system_counter,
					    struct timespec64 *tspec)
{
	struct pvclock_vcpu_time_info *pvti;
	unsigned int pvti_ver;
	int ret;

	preempt_disable_notrace();

	/*
	 * Look up the per-CPU pvti only after preemption is disabled, so
	 * that it is guaranteed to belong to the CPU whose TSC is read below.
	 */
	pvti = this_cpu_pvti();

	do {
		pvti_ver = pvclock_read_begin(pvti);

		ret = vmclock_get_crosststamp(st, sts, system_counter, tspec);
		if (ret)
			break;

		system_counter->cycles = __pvclock_read_cycles(pvti,
							       system_counter->cycles);
		system_counter->cs_id = CSID_X86_KVM_CLK;

		/*
		 * This retry should never really happen; if the TSC is
		 * stable and reliable enough across vCPUS that it is sane
		 * for the hypervisor to expose a VMCLOCK device which uses
		 * it as the reference counter, then the KVM clock should be
		 * in 'master clock mode' and basically never changed. But
		 * the KVM clock is a fickle and often broken thing, so do
		 * it "properly" just in case.
		 */
	} while (pvclock_read_retry(pvti, pvti_ver));

	preempt_enable_notrace();

	return ret;
}
#endif
229
230static int ptp_vmclock_get_time_fn(ktime_t *device_time,
231 struct system_counterval_t *system_counter,
232 void *ctx)
233{
234 struct vmclock_state *st = ctx;
235 struct timespec64 tspec;
236 int ret;
237
238#ifdef SUPPORT_KVMCLOCK
239 if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK)
240 ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter,
241 &tspec);
242 else
243#endif
244 ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec);
245
246 if (!ret)
247 *device_time = timespec64_to_ktime(tspec);
248
249 return ret;
250}
251
252static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp,
253 struct system_device_crosststamp *xtstamp)
254{
255 struct vmclock_state *st = container_of(ptp, struct vmclock_state,
256 ptp_clock_info);
257 int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st,
258 NULL, xtstamp);
259#ifdef SUPPORT_KVMCLOCK
260 /*
261 * On x86, the KVM clock may be used for the system time. We can
262 * actually convert a TSC reading to that, and return a paired
263 * timestamp that get_device_system_crosststamp() *can* handle.
264 */
265 if (ret == -ENODEV) {
266 struct system_time_snapshot systime_snapshot;
267
268 ktime_get_snapshot(&systime_snapshot);
269
270 if (systime_snapshot.cs_id == CSID_X86_TSC ||
271 systime_snapshot.cs_id == CSID_X86_KVM_CLK) {
272 WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id);
273 ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn,
274 st, NULL, xtstamp);
275 }
276 }
277#endif
278 return ret;
279}
280
281/*
282 * PTP clock operations
283 */
284
/* PTP .adjfine handler: not implemented, always fails with -EOPNOTSUPP. */
static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta)
{
	return -EOPNOTSUPP;
}
289
290static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
291{
292 return -EOPNOTSUPP;
293}
294
/* PTP .settime64 handler: not implemented, always fails with -EOPNOTSUPP. */
static int ptp_vmclock_settime(struct ptp_clock_info *ptp,
			       const struct timespec64 *ts)
{
	return -EOPNOTSUPP;
}
300
301static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
302 struct ptp_system_timestamp *sts)
303{
304 struct vmclock_state *st = container_of(ptp, struct vmclock_state,
305 ptp_clock_info);
306
307 return vmclock_get_crosststamp(st, sts, NULL, ts);
308}
309
/* PTP .enable handler: not implemented, always fails with -EOPNOTSUPP. */
static int ptp_vmclock_enable(struct ptp_clock_info *ptp,
			      struct ptp_clock_request *rq, int on)
{
	return -EOPNOTSUPP;
}
315
316static const struct ptp_clock_info ptp_vmclock_info = {
317 .owner = THIS_MODULE,
318 .max_adj = 0,
319 .n_ext_ts = 0,
320 .n_pins = 0,
321 .pps = 0,
322 .adjfine = ptp_vmclock_adjfine,
323 .adjtime = ptp_vmclock_adjtime,
324 .gettimex64 = ptp_vmclock_gettimex,
325 .settime64 = ptp_vmclock_settime,
326 .enable = ptp_vmclock_enable,
327 .getcrosststamp = ptp_vmclock_getcrosststamp,
328};
329
330static struct ptp_clock *vmclock_ptp_register(struct device *dev,
331 struct vmclock_state *st)
332{
333 enum clocksource_ids cs_id;
334
335 if (IS_ENABLED(CONFIG_ARM64) &&
336 st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) {
337 /* Can we check it's the virtual counter? */
338 cs_id = CSID_ARM_ARCH_COUNTER;
339 } else if (IS_ENABLED(CONFIG_X86) &&
340 st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) {
341 cs_id = CSID_X86_TSC;
342 } else {
343 return NULL;
344 }
345
346 /* Only UTC, or TAI with offset */
347 if (!tai_adjust(st->clk, NULL)) {
348 dev_info(dev, "vmclock does not provide unambiguous UTC\n");
349 return NULL;
350 }
351
352 st->sys_cs_id = cs_id;
353 st->cs_id = cs_id;
354 st->ptp_clock_info = ptp_vmclock_info;
355 strscpy(st->ptp_clock_info.name, st->name);
356
357 return ptp_clock_register(&st->ptp_clock_info, dev);
358}
359
360static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
361{
362 struct vmclock_state *st = container_of(fp->private_data,
363 struct vmclock_state, miscdev);
364
365 if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
366 return -EROFS;
367
368 if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff)
369 return -EINVAL;
370
371 if (io_remap_pfn_range(vma, vma->vm_start,
372 st->res.start >> PAGE_SHIFT, PAGE_SIZE,
373 vma->vm_page_prot))
374 return -EAGAIN;
375
376 return 0;
377}
378
379static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
380 size_t count, loff_t *ppos)
381{
382 struct vmclock_state *st = container_of(fp->private_data,
383 struct vmclock_state, miscdev);
384 ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
385 size_t max_count;
386 uint32_t seq;
387
388 if (*ppos >= PAGE_SIZE)
389 return 0;
390
391 max_count = PAGE_SIZE - *ppos;
392 if (count > max_count)
393 count = max_count;
394
395 while (1) {
396 seq = le32_to_cpu(st->clk->seq_count) & ~1U;
397 /* Pairs with hypervisor wmb */
398 virt_rmb();
399
400 if (copy_to_user(buf, ((char *)st->clk) + *ppos, count))
401 return -EFAULT;
402
403 /* Pairs with hypervisor wmb */
404 virt_rmb();
405 if (seq == le32_to_cpu(st->clk->seq_count))
406 break;
407
408 if (ktime_after(ktime_get(), deadline))
409 return -ETIMEDOUT;
410 }
411
412 *ppos += count;
413 return count;
414}
415
416static const struct file_operations vmclock_miscdev_fops = {
417 .owner = THIS_MODULE,
418 .mmap = vmclock_miscdev_mmap,
419 .read = vmclock_miscdev_read,
420};
421
422/* module operations */
423
424static void vmclock_remove(struct platform_device *pdev)
425{
426 struct device *dev = &pdev->dev;
427 struct vmclock_state *st = dev_get_drvdata(dev);
428
429 if (st->ptp_clock)
430 ptp_clock_unregister(st->ptp_clock);
431
432 if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
433 misc_deregister(&st->miscdev);
434}
435
436static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
437{
438 struct vmclock_state *st = data;
439 struct resource_win win;
440 struct resource *res = &win.res;
441
442 if (ares->type == ACPI_RESOURCE_TYPE_END_TAG)
443 return AE_OK;
444
445 /* There can be only one */
446 if (resource_type(&st->res) == IORESOURCE_MEM)
447 return AE_ERROR;
448
449 if (acpi_dev_resource_memory(ares, res) ||
450 acpi_dev_resource_address_space(ares, &win)) {
451
452 if (resource_type(res) != IORESOURCE_MEM ||
453 resource_size(res) < sizeof(st->clk))
454 return AE_ERROR;
455
456 st->res = *res;
457 return AE_OK;
458 }
459
460 return AE_ERROR;
461}
462
463static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
464{
465 struct acpi_device *adev = ACPI_COMPANION(dev);
466 acpi_status status;
467
468 /*
469 * This should never happen as this function is only called when
470 * has_acpi_companion(dev) is true, but the logic is sufficiently
471 * complex that Coverity can't see the tautology.
472 */
473 if (!adev)
474 return -ENODEV;
475
476 status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS,
477 vmclock_acpi_resources, st);
478 if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) {
479 dev_err(dev, "failed to get resources\n");
480 return -ENODEV;
481 }
482
483 return 0;
484}
485
486static void vmclock_put_idx(void *data)
487{
488 struct vmclock_state *st = data;
489
490 ida_free(&vmclock_ida, st->index);
491}
492
493static int vmclock_probe(struct platform_device *pdev)
494{
495 struct device *dev = &pdev->dev;
496 struct vmclock_state *st;
497 int ret;
498
499 st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL);
500 if (!st)
501 return -ENOMEM;
502
503 if (has_acpi_companion(dev))
504 ret = vmclock_probe_acpi(dev, st);
505 else
506 ret = -EINVAL; /* Only ACPI for now */
507
508 if (ret) {
509 dev_info(dev, "Failed to obtain physical address: %d\n", ret);
510 goto out;
511 }
512
513 if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) {
514 dev_info(dev, "Region too small (0x%llx)\n",
515 resource_size(&st->res));
516 ret = -EINVAL;
517 goto out;
518 }
519 st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res),
520 MEMREMAP_WB | MEMREMAP_DEC);
521 if (IS_ERR(st->clk)) {
522 ret = PTR_ERR(st->clk);
523 dev_info(dev, "failed to map shared memory\n");
524 st->clk = NULL;
525 goto out;
526 }
527
528 dev_set_drvdata(dev, st);
529
530 if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC ||
531 le32_to_cpu(st->clk->size) > resource_size(&st->res) ||
532 le16_to_cpu(st->clk->version) != 1) {
533 dev_info(dev, "vmclock magic fields invalid\n");
534 ret = -EINVAL;
535 goto out;
536 }
537
538 ret = ida_alloc(&vmclock_ida, GFP_KERNEL);
539 if (ret < 0)
540 goto out;
541
542 st->index = ret;
543 ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st);
544 if (ret)
545 goto out;
546
547 st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index);
548 if (!st->name) {
549 ret = -ENOMEM;
550 goto out;
551 }
552
553 st->miscdev.minor = MISC_DYNAMIC_MINOR;
554
555 /*
556 * If the structure is big enough, it can be mapped to userspace.
557 * Theoretically a guest OS even using larger pages could still
558 * use 4KiB PTEs to map smaller MMIO regions like this, but let's
559 * cross that bridge if/when we come to it.
560 */
561 if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) {
562 st->miscdev.fops = &vmclock_miscdev_fops;
563 st->miscdev.name = st->name;
564
565 ret = misc_register(&st->miscdev);
566 if (ret)
567 goto out;
568 }
569
570 /* If there is valid clock information, register a PTP clock */
571 if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) {
572 /* Can return a silent NULL, or an error. */
573 st->ptp_clock = vmclock_ptp_register(dev, st);
574 if (IS_ERR(st->ptp_clock)) {
575 ret = PTR_ERR(st->ptp_clock);
576 st->ptp_clock = NULL;
577 vmclock_remove(pdev);
578 goto out;
579 }
580 }
581
582 if (!st->miscdev.minor && !st->ptp_clock) {
583 /* Neither miscdev nor PTP registered */
584 dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n");
585 ret = -ENODEV;
586 goto out;
587 }
588
589 dev_info(dev, "%s: registered %s%s%s\n", st->name,
590 st->miscdev.minor ? "miscdev" : "",
591 (st->miscdev.minor && st->ptp_clock) ? ", " : "",
592 st->ptp_clock ? "PTP" : "");
593
594 out:
595 return ret;
596}
597
598static const struct acpi_device_id vmclock_acpi_ids[] = {
599 { "AMZNC10C", 0 },
600 {}
601};
602MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids);
603
604static struct platform_driver vmclock_platform_driver = {
605 .probe = vmclock_probe,
606 .remove = vmclock_remove,
607 .driver = {
608 .name = "vmclock",
609 .acpi_match_table = vmclock_acpi_ids,
610 },
611};
612
613module_platform_driver(vmclock_platform_driver)
614
615MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
616MODULE_DESCRIPTION("PTP clock using VMCLOCK");
617MODULE_LICENSE("GPL");