Linux Audio

Check our new training course

Loading...
Note: File does not exist in v6.2.
  1// SPDX-License-Identifier: GPL-2.0-or-later
  2/* vnode and volume validity verification.
  3 *
  4 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
  5 * Written by David Howells (dhowells@redhat.com)
  6 */
  7
  8#include <linux/kernel.h>
  9#include <linux/module.h>
 10#include <linux/sched.h>
 11#include "internal.h"
 12
 13/*
 14 * Data validation is managed through a number of mechanisms from the server:
 15 *
 16 *  (1) On first contact with a server (such as if it has just been rebooted),
 17 *      the server sends us a CB.InitCallBackState* request.
 18 *
 19 *  (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
 20 *      calls, the server maintains a time-limited per-vnode promise that it
 21 *      will send us a CB.CallBack request if a third party alters the vnodes
 22 *      accessed.
 23 *
  24 *      Note that vnode-level callbacks may also be sent for other reasons,
  25 *      such as filelock release.
 26 *
 27 *  (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
 28 *      calls, each server maintains a time-limited per-volume promise that it
 29 *      will send us a CB.CallBack request if the RO volume is updated to a
 30 *      snapshot of the RW volume ("vos release").  This is an atomic event
 31 *      that cuts over all instances of the RO volume across multiple servers
 32 *      simultaneously.
 33 *
  34 *	Note that volume-level callbacks may also be sent for other reasons,
  35 *	such as the volumeserver taking over control of the volume from the
  36 *	fileserver.
 37 *
 38 *	Note also that each server maintains an independent time limit on an
 39 *	independent callback.
 40 *
 41 *  (4) Certain RPC calls include a volume information record "VolSync" in
 42 *      their reply.  This contains a creation date for the volume that should
 43 *      remain unchanged for a RW volume (but will be changed if the volume is
 44 *      restored from backup) or will be bumped to the time of snapshotting
 45 *      when a RO volume is released.
 46 *
  47 * In order to track these events, the following are provided:
 48 *
 49 *	->cb_v_break.  A counter of events that might mean that the contents of
 50 *	a volume have been altered since we last checked a vnode.
 51 *
 52 *	->cb_v_check.  A counter of the number of events that we've sent a
 53 *	query to the server for.  Everything's up to date if this equals
 54 *	cb_v_break.
 55 *
 56 *	->cb_scrub.  A counter of the number of regression events for which we
 57 *	have to completely wipe the cache.
 58 *
 59 *	->cb_ro_snapshot.  A counter of the number of times that we've
 60 *      recognised that a RO volume has been updated.
 61 *
 62 *	->cb_break.  A counter of events that might mean that the contents of a
 63 *      vnode have been altered.
 64 *
 65 *	->cb_expires_at.  The time at which the callback promise expires or
 66 *      AFS_NO_CB_PROMISE if we have no promise.
 67 *
 68 * The way we manage things is:
 69 *
 70 *  (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
 71 *      the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
 72 *      volume and volume's server record.
 73 *
 74 *  (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 75 *	callback break on all the volumes that have been using that volume
 76 *	(ie. increment ->cb_v_break and reset ->cb_expires_at).
 77 *
 78 *  (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
 79 *	vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
 80 *	dispatch a work item to unmap all PTEs to the vnode's pagecache to
 81 *	force reentry to the filesystem for revalidation.
 82 *
 83 *  (4) When entering the filesystem, we call afs_validate() to check the
 84 *	validity of a vnode.  This first checks to see if ->cb_v_check and
 85 *	->cb_v_break match, and if they don't, we lock volume->cb_check_lock
 86 *	exclusively and perform an FS.FetchStatus on the vnode.
 87 *
 88 *	After checking the volume, we check the vnode.  If there's a mismatch
 89 *	between the volume counters and the vnode's mirrors of those counters,
 90 *	we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
 91 *
 92 *  (5) When the reply from FS.FetchStatus arrives, the VolSync record is
 93 *      parsed:
 94 *
 95 *	(A) If the Creation timestamp has changed on a RW volume or regressed
 96 *	    on a RO volume, we try to increment ->cb_scrub; if it advances on a
 97 *	    RO volume, we assume "vos release" happened and try to increment
 98 *	    ->cb_ro_snapshot.
 99 *
100 *      (B) If the Update timestamp has regressed, we try to increment
101 *	    ->cb_scrub.
102 *
103 *      Note that in both of these cases, we only do the increment if we can
104 *      cmpxchg the value of the timestamp from the value we noted before the
105 *      op.  This tries to prevent parallel ops from fighting one another.
106 *
107 *	volume->cb_v_check is then set to ->cb_v_break.
108 *
109 *  (6) The AFSCallBack record included in the FS.FetchStatus reply is also
110 *	parsed and used to set the promise in ->cb_expires_at for the vnode,
111 *	the volume and the volume's server record.
112 *
113 *  (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
114 *      the vnode.
115 */
116
117/*
118 * Check the validity of a vnode/inode and its parent volume.
119 */
120bool afs_check_validity(const struct afs_vnode *vnode)
121{
122	const struct afs_volume *volume = vnode->volume;
123	time64_t deadline = ktime_get_real_seconds() + 10;
124
125	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
126	    atomic64_read(&vnode->cb_expires_at)  <= deadline ||
127	    volume->cb_expires_at <= deadline ||
128	    vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
129	    vnode->cb_scrub	  != atomic_read(&volume->cb_scrub) ||
130	    test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
131		_debug("inval");
132		return false;
133	}
134
135	return true;
136}
137
138/*
139 * See if the server we've just talked to is currently excluded.
140 */
141static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
142{
143	const struct afs_server_entry *se;
144	const struct afs_server_list *slist;
145	bool is_excluded = true;
146	int i;
147
148	rcu_read_lock();
149
150	slist = rcu_dereference(volume->servers);
151	for (i = 0; i < slist->nr_servers; i++) {
152		se = &slist->servers[i];
153		if (op->server == se->server) {
154			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
155			break;
156		}
157	}
158
159	rcu_read_unlock();
160	return is_excluded;
161}
162
163/*
164 * Update the volume's server list when the creation time changes and see if
165 * the server we've just talked to is currently excluded.
166 */
167static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
168{
169	int ret;
170
171	if (__afs_is_server_excluded(op, volume))
172		return 1;
173
174	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
175	ret = afs_check_volume_status(op->volume, op);
176	if (ret < 0)
177		return ret;
178
179	return __afs_is_server_excluded(op, volume);
180}
181
182/*
183 * Handle a change to the volume creation time in the VolSync record.
184 */
static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{
	unsigned int snap;
	time64_t cur = volume->creation_time;	/* What we currently have recorded */
	time64_t old = op->pre_volsync.creation; /* What we saw before issuing the op */
	time64_t new = op->volsync.creation;	/* What the server just told us */
	int ret;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	/* First creation time we've seen for this volume: just record it. */
	if (cur == TIME64_MIN) {
		volume->creation_time = new;
		return 0;
	}

	/* No change from what we have recorded: nothing to do. */
	if (new == cur)
		return 0;

	/* Try to advance the creation timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur != old)
		return 0;

	/* If the creation time changes in an unexpected way, we need to scrub
	 * our caches.  For a RW vol, this will only change if the volume is
	 * restored from a backup; for a RO/Backup vol, this will advance when
	 * the volume is updated to a new snapshot (eg. "vos release").
	 */
	if (volume->type == AFSVL_RWVOL)
		goto regressed;
	if (volume->type == AFSVL_BACKVOL) {
		if (new < old)
			goto regressed;
		goto advance;
	}

	/* We have an RO volume, we need to query the VL server and look at the
	 * server flags to see if RW->RO replication is in progress.
	 */
	ret = afs_is_server_excluded(op, volume);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		/* Server is excluded (replication in progress): trace but
		 * don't bump the snapshot counter; caller sees ret > 0.
		 */
		snap = atomic_read(&volume->cb_ro_snapshot);
		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
		return ret;
	}

advance:
	/* Assume "vos release" happened: note a new RO snapshot. */
	snap = atomic_inc_return(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
	volume->creation_time = new;
	return 0;

regressed:
	/* Unexpected change (eg. restore from backup): wipe the cache. */
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
	volume->creation_time = new;
	return 0;
}
248
249/*
250 * Handle a change to the volume update time in the VolSync record.
251 */
252static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
253{
254	enum afs_cb_break_reason reason = afs_cb_break_no_break;
255	time64_t cur = volume->update_time;
256	time64_t old = op->pre_volsync.update;
257	time64_t new = op->volsync.update;
258
259	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
260
261	if (cur == TIME64_MIN) {
262		volume->update_time = new;
263		return;
264	}
265
266	if (new == cur)
267		return;
268
269	/* If the volume update time changes in an unexpected way, we need to
270	 * scrub our caches.  For a RW vol, this will advance on every
271	 * modification op; for a RO/Backup vol, this will advance when the
272	 * volume is updated to a new snapshot (eg. "vos release").
273	 */
274	if (new < old)
275		reason = afs_cb_break_for_update_regress;
276
277	/* Try to advance the update timestamp from what we had before the
278	 * operation to what we got back from the server.  This should
279	 * hopefully ensure that in a race between multiple operations only one
280	 * of them will do this.
281	 */
282	if (cur == old) {
283		if (reason == afs_cb_break_for_update_regress) {
284			atomic_inc(&volume->cb_scrub);
285			trace_afs_cb_v_break(volume->vid, 0, reason);
286		}
287		volume->update_time = new;
288	}
289}
290
291static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
292{
293	int ret = 0;
294
295	if (likely(op->volsync.creation == volume->creation_time &&
296		   op->volsync.update == volume->update_time))
297		return 0;
298
299	mutex_lock(&volume->volsync_lock);
300	if (op->volsync.creation != volume->creation_time) {
301		ret = afs_update_volume_creation_time(op, volume);
302		if (ret < 0)
303			goto out;
304	}
305	if (op->volsync.update != volume->update_time)
306		afs_update_volume_update_time(op, volume);
307out:
308	mutex_unlock(&volume->volsync_lock);
309	return ret;
310}
311
312/*
313 * Update the state of a volume, including recording the expiration time of the
314 * callback promise.  Returns 1 to redo the operation from the start.
315 */
int afs_update_volume_state(struct afs_operation *op)
{
	struct afs_server_list *slist = op->server_list;
	struct afs_server_entry *se = &slist->servers[op->server_index];
	struct afs_callback *cb = &op->file[0].scb.callback;
	struct afs_volume *volume = op->volume;
	unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
	unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
	int ret;

	_enter("%llx", op->volume->vid);

	/* If the server supplied a VolSync record, reconcile the volume's
	 * creation/update timestamps with it; a nonzero return (error or
	 * redo-request) is passed straight back to the caller.
	 */
	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
		ret = afs_update_volume_times(op, volume);
		if (ret != 0) {
			_leave(" = %d", ret);
			return ret;
		}
	}

	/* Record the new callback-promise expiry time on the server entry and
	 * the volume, but only if no volume-level break occurred whilst the op
	 * was in flight and at least one file in the op carried a callback.
	 */
	if (op->cb_v_break == cb_v_break &&
	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
		time64_t expires_at = cb->expires_at;

		/* Fall back to the second file's callback if the first
		 * didn't have one.
		 */
		if (!op->file[0].scb.have_cb)
			expires_at = op->file[1].scb.callback.expires_at;

		se->cb_expires_at = expires_at;
		volume->cb_expires_at = expires_at;
	}
	/* Mark the v_break we queried as checked; cmpxchg so that a
	 * concurrent updater that saw a later break wins.
	 */
	if (cb_v_check < op->cb_v_break)
		atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
	return 0;
}
350
351/*
352 * mark the data attached to an inode as obsolete due to a write on the server
353 * - might also want to ditch all the outstanding writes and dirty pages
354 */
355static void afs_zap_data(struct afs_vnode *vnode)
356{
357	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
358
359	afs_invalidate_cache(vnode, 0);
360
361	/* nuke all the non-dirty pages that aren't locked, mapped or being
362	 * written back in a regular file and completely discard the pages in a
363	 * directory or symlink */
364	if (S_ISREG(vnode->netfs.inode.i_mode))
365		invalidate_remote_inode(&vnode->netfs.inode);
366	else
367		invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
368}
369
370/*
371 * validate a vnode/inode
372 * - there are several things we need to check
373 *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
374 *     symlink)
375 *   - parent dir metadata changed (security changes)
376 *   - dentry data changed (write, truncate)
377 *   - dentry metadata changed (security changes)
378 */
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
	struct afs_volume *volume = vnode->volume;
	unsigned int cb_ro_snapshot, cb_scrub;
	time64_t deadline = ktime_get_real_seconds() + 10;
	bool zap = false, locked_vol = false;
	int ret;

	_enter("{v={%llx:%llu} fl=%lx},%x",
	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
	       key_serial(key));

	/* Quick, lockless check first; bail out if everything is current. */
	if (afs_check_validity(vnode))
		return 0;

	ret = down_write_killable(&vnode->validate_lock);
	if (ret < 0)
		goto error;

	/* Validate a volume after the v_break has changed or the volume
	 * callback expired.  We only want to do this once per volume per
	 * v_break change.  The actual work will be done when parsing the
	 * status fetch reply.
	 */
	if (volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
		ret = mutex_lock_interruptible(&volume->cb_check_lock);
		if (ret < 0)
			goto error_unlock;
		locked_vol = true;
	}

	/* If the volume's scrub or RO-snapshot counters moved on from the
	 * vnode's mirrors of them, unmap all PTEs on the vnode's pagecache to
	 * force reentry to the filesystem for revalidation.
	 */
	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub	  != cb_scrub)
		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);

	/* Issue a status fetch if any counter mismatch remains or either
	 * callback promise expires within the next ten seconds; the reply
	 * parsing does the volume/vnode state updates.
	 */
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub	  != cb_scrub ||
	    volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline
	    ) {
		ret = afs_fetch_status(vnode, key, false, NULL);
		if (ret < 0) {
			if (ret == -ENOENT) {
				/* Server says the file is gone: mark it
				 * deleted and report it stale.
				 */
				set_bit(AFS_VNODE_DELETED, &vnode->flags);
				ret = -ESTALE;
			}
			goto error_unlock;
		}

		_debug("new promise [fl=%lx]", vnode->flags);
	}

	/* We can drop the volume lock now as the status fetch (if one was
	 * needed) has been issued and parsed under it.
	 */
	if (locked_vol) {
		mutex_unlock(&volume->cb_check_lock);
		locked_vol = false;
	}

	/* Bring the vnode's mirrors of the volume counters up to date; a
	 * scrub counter change means the pagecache must be invalidated.
	 */
	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	_debug("vnode inval %x==%x %x==%x",
	       vnode->cb_ro_snapshot, cb_ro_snapshot,
	       vnode->cb_scrub, cb_scrub);
	if (vnode->cb_scrub != cb_scrub)
		zap = true;
	vnode->cb_ro_snapshot = cb_ro_snapshot;
	vnode->cb_scrub = cb_scrub;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
		_debug("file already deleted");
		ret = -ESTALE;
		goto error_unlock;
	}

	/* if the vnode's data version number changed then its contents are
	 * different */
	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
	if (zap)
		afs_zap_data(vnode);
	up_write(&vnode->validate_lock);
	_leave(" = 0");
	return 0;

error_unlock:
	if (locked_vol)
		mutex_unlock(&volume->cb_check_lock);
	up_write(&vnode->validate_lock);
error:
	_leave(" = %d", ret);
	return ret;
}