// SPDX-License-Identifier: GPL-2.0-or-later
/* Handle fileserver selection and rotation.
 *
 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/sched/signal.h>
#include "internal.h"
#include "afs_fs.h"
#include "protocol_uae.h"

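/*
 * Drop the per-server state records attached to an operation, putting the
 * endpoint state that each one holds a reference on.
 */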
void afs_clear_server_states(struct afs_operation *op)
{
	unsigned int i;

	if (op->server_states) {
		for (i = 0; i < op->server_list->nr_servers; i++)
			afs_put_endpoint_state(op->server_states[i].endpoint_state,
					       afs_estate_trace_put_server_state);
		kfree(op->server_states);
	}
}

/*
 * Begin iteration through a server list, starting with the vnode's last used
 * server if possible, or the last recorded good server if not.
 */
static bool afs_start_fs_iteration(struct afs_operation *op,
				   struct afs_vnode *vnode)
{
	struct afs_server *server;
	void *cb_server;
	int i;

	trace_afs_rotate(op, afs_rotate_trace_start, 0);

	read_lock(&op->volume->servers_lock);
	op->server_list = afs_get_serverlist(
		rcu_dereference_protected(op->volume->servers,
					  lockdep_is_held(&op->volume->servers_lock)));
	read_unlock(&op->volume->servers_lock);

	op->server_states = kcalloc(op->server_list->nr_servers, sizeof(op->server_states[0]),
				    GFP_KERNEL);
	if (!op->server_states) {
		afs_op_nomem(op);
		trace_afs_rotate(op, afs_rotate_trace_nomem, 0);
		return false;
	}

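	/* Take a ref on each server's current endpoint state, note its probe
	 * sequence and mark all of its addresses as untried.
	 */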
	rcu_read_lock();
	for (i = 0; i < op->server_list->nr_servers; i++) {
		struct afs_endpoint_state *estate;
		struct afs_server_state *s = &op->server_states[i];

		server = op->server_list->servers[i].server;
		estate = rcu_dereference(server->endpoint_state);
		s->endpoint_state = afs_get_endpoint_state(estate,
							   afs_estate_trace_get_server_state);
		s->probe_seq = estate->probe_seq;
		s->untried_addrs = (1UL << estate->addresses->nr_addrs) - 1;
		init_waitqueue_entry(&s->probe_waiter, current);
		afs_get_address_preferences(op->net, estate->addresses);
	}
	rcu_read_unlock();

	op->untried_servers = (1UL << op->server_list->nr_servers) - 1;
	op->server_index = -1;

	cb_server = vnode->cb_server;
	if (cb_server) {
		/* See if the vnode's preferred record is still available */
		for (i = 0; i < op->server_list->nr_servers; i++) {
			server = op->server_list->servers[i].server;
			if (server == cb_server) {
				op->server_index = i;
				goto found_interest;
			}
		}

		/* If we have a lock outstanding on a server that's no longer
		 * serving this vnode, then we can't switch to another server
		 * and have to return an error.
		 */
		if (op->flags & AFS_OPERATION_CUR_ONLY) {
			afs_op_set_error(op, -ESTALE);
			trace_afs_rotate(op, afs_rotate_trace_stale_lock, 0);
			return false;
		}

		/* Note that the callback promise is effectively broken */
		write_seqlock(&vnode->cb_lock);
		ASSERTCMP(cb_server, ==, vnode->cb_server);
		vnode->cb_server = NULL;
		if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE)
			vnode->cb_break++;
		write_sequnlock(&vnode->cb_lock);
	}

found_interest:
	return true;
}

/*
 * Post volume busy note.
 */
static void afs_busy(struct afs_operation *op, u32 abort_code)
{
	const char *m;

	switch (abort_code) {
	case VOFFLINE:		m = "offline";		break;
	case VRESTARTING:	m = "restarting";	break;
	case VSALVAGING:	m = "being salvaged";	break;
	default:		m = "busy";		break;
	}

	pr_notice("kAFS: Volume %llu '%s' on server %pU is %s\n",
		  op->volume->vid, op->volume->name, &op->server->uuid, m);
}

/*
 * Sleep and retry the operation to the same fileserver.
 */
static bool afs_sleep_and_retry(struct afs_operation *op)
{
	trace_afs_rotate(op, afs_rotate_trace_busy_sleep, 0);
	if (!(op->flags & AFS_OPERATION_UNINTR)) {
		msleep_interruptible(1000);
		if (signal_pending(current)) {
			afs_op_set_error(op, -ERESTARTSYS);
			return false;
		}
	} else {
		msleep(1000);
	}

	return true;
}

/*
 * Select the fileserver to use.  May be called multiple times to rotate
 * through the fileservers.
 */
bool afs_select_fileserver(struct afs_operation *op)
{
	struct afs_addr_list *alist;
	struct afs_server *server;
	struct afs_vnode *vnode = op->file[0].vnode;
	unsigned long set, failed;
	s32 abort_code = op->call_abort_code;
	int best_prio = 0;
	int error = op->call_error, addr_index, i, j;

	op->nr_iterations++;

	_enter("OP=%x+%x,%llx,%u{%lx},%u{%lx},%d,%d",
	       op->debug_id, op->nr_iterations, op->volume->vid,
	       op->server_index, op->untried_servers,
	       op->addr_index, op->addr_tried,
	       error, abort_code);

	if (op->flags & AFS_OPERATION_STOP) {
		trace_afs_rotate(op, afs_rotate_trace_stopped, 0);
		_leave(" = f [stopped]");
		return false;
	}

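	/* On the first pass there's no result from a previous call to assess,
	 * so go straight to selecting a server.
	 */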
	if (op->nr_iterations == 0)
		goto start;

	WRITE_ONCE(op->estate->addresses->addrs[op->addr_index].last_error, error);
	trace_afs_rotate(op, afs_rotate_trace_iter, op->call_error);

	/* Evaluate the result of the previous operation, if there was one. */
	switch (op->call_error) {
	case 0:
		clear_bit(AFS_SE_VOLUME_OFFLINE,
			  &op->server_list->servers[op->server_index].flags);
		clear_bit(AFS_SE_VOLUME_BUSY,
			  &op->server_list->servers[op->server_index].flags);
		op->cumul_error.responded = true;

		/* We succeeded, but we may need to redo the op from another
		 * server if we're looking at a set of RO volumes where some of
		 * the servers have not yet been brought up to date lest we
		 * regress the data.  We only switch to the new version once
		 * >=50% of the servers are updated.
		 */
		error = afs_update_volume_state(op);
		if (error != 0) {
			if (error == 1) {
				afs_sleep_and_retry(op);
				goto restart_from_beginning;
			}
			afs_op_set_error(op, error);
			goto failed;
		}
		fallthrough;
	default:
		/* Success or local failure.  Stop. */
		afs_op_set_error(op, error);
		op->flags |= AFS_OPERATION_STOP;
		trace_afs_rotate(op, afs_rotate_trace_stop, error);
		_leave(" = f [okay/local %d]", error);
		return false;

	case -ECONNABORTED:
		/* The far side rejected the operation on some grounds.  This
		 * might involve the server being busy or the volume having been moved.
		 *
		 * Note that various V* errors should not be sent to a cache manager
		 * by a fileserver as they should be translated to more modern UAE*
		 * errors instead.  IBM AFS and OpenAFS fileservers, however, do leak
		 * these abort codes.
		 */
		trace_afs_rotate(op, afs_rotate_trace_aborted, abort_code);
		op->cumul_error.responded = true;
		switch (abort_code) {
		case VNOVOL:
			/* This fileserver doesn't know about the volume.
			 * - May indicate that the VL is wrong - retry once and compare
			 *   the results.
			 * - May indicate that the fileserver couldn't attach to the vol.
			 * - The volume might have been temporarily removed so that it can
			 *   be replaced by a volume restore.  "vos" might have ended one
			 *   transaction and has yet to create the next.
			 * - The volume might not be blessed or might not be in-service
			 *   (administrative action).
			 */
			if (op->flags & AFS_OPERATION_VNOVOL) {
				afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
				goto next_server;
			}

			write_lock(&op->volume->servers_lock);
			op->server_list->vnovol_mask |= 1 << op->server_index;
			write_unlock(&op->volume->servers_lock);

			set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
			error = afs_check_volume_status(op->volume, op);
			if (error < 0) {
				afs_op_set_error(op, error);
				goto failed;
			}

			if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) {
				afs_op_set_error(op, -ENOMEDIUM);
				goto failed;
			}

			/* If the server list didn't change, then assume that
			 * it's the fileserver having trouble.
			 */
			if (rcu_access_pointer(op->volume->servers) == op->server_list) {
				afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
				goto next_server;
			}

			/* Try again */
			op->flags |= AFS_OPERATION_VNOVOL;
			_leave(" = t [vnovol]");
			return true;

		case VVOLEXISTS:
		case VONLINE:
			/* These should not be returned from the fileserver. */
			pr_warn("Fileserver returned unexpected abort %d\n",
				abort_code);
			afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
			goto next_server;

		case VNOSERVICE:
			/* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver
			 * if the volume was neither in-service nor administratively
			 * blessed.  All usage was replaced by VNOVOL because AFS 3.1 and
			 * earlier cache managers did not handle VNOSERVICE and assumed
			 * it was the client OSes errno 105.
			 *
			 * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the
			 * fileserver idle dead time error which was sent in place of
			 * RX_CALL_TIMEOUT (-3).  The error was intended to be sent if the
			 * fileserver took too long to send a reply to the client.
			 * RX_CALL_TIMEOUT would have caused the cache manager to mark the
			 * server down whereas VNOSERVICE since AFS 3.2 would cause the
			 * cache manager to temporarily (up to 15 minutes) mark the volume
			 * instance as unusable.
			 *
			 * The idle dead logic resulted in cache inconsistency since a
			 * state changing call that the cache manager assumed was dead
			 * could still be processed to completion by the fileserver.  This
			 * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer
			 * returned.  However, many 1.4.8 through 1.6.24 fileservers are
			 * still in existence.
			 *
			 * AuriStorFS fileservers have never returned VNOSERVICE.
			 *
			 * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
			 */
		case RX_CALL_TIMEOUT:
			afs_op_accumulate_error(op, -ETIMEDOUT, abort_code);
			goto next_server;

		case VSALVAGING: /* This error should not be leaked to cache managers
				  * but is from OpenAFS demand attach fileservers.
				  * It should be treated as an alias for VOFFLINE.
				  */
		case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */
		case VOFFLINE:
			/* The volume is in use by the volserver or another volume utility
			 * for an operation that might alter the contents.  The volume is
			 * expected to come back but it might take a long time (could be
			 * days).
			 */
			if (!test_and_set_bit(AFS_SE_VOLUME_OFFLINE,
					      &op->server_list->servers[op->server_index].flags)) {
				afs_busy(op, abort_code);
				clear_bit(AFS_SE_VOLUME_BUSY,
					  &op->server_list->servers[op->server_index].flags);
			}
			if (op->flags & AFS_OPERATION_NO_VSLEEP) {
				afs_op_set_error(op, -EADV);
				goto failed;
			}
			goto busy;

		case VRESTARTING: /* The fileserver is either shutting down or starting up. */
		case VBUSY:
			/* The volume is in use by the volserver or another volume
			 * utility for an operation that is not expected to alter the
			 * contents of the volume.  VBUSY does not need to be returned
			 * for a ROVOL or BACKVOL bound to an ITBusy volserver
			 * transaction.  The fileserver is permitted to continue serving
			 * content from ROVOLs and BACKVOLs during an ITBusy transaction
			 * because the content will not change.  However, many fileserver
			 * releases do return VBUSY for ROVOL and BACKVOL instances under
			 * many circumstances.
			 *
			 * Retry after going round all the servers unless we have a file
			 * lock we need to maintain.
			 */
			if (op->flags & AFS_OPERATION_NO_VSLEEP) {
				afs_op_set_error(op, -EBUSY);
				goto failed;
			}
			if (!test_and_set_bit(AFS_SE_VOLUME_BUSY,
					      &op->server_list->servers[op->server_index].flags)) {
				afs_busy(op, abort_code);
				clear_bit(AFS_SE_VOLUME_OFFLINE,
					  &op->server_list->servers[op->server_index].flags);
			}
		busy:
			if (op->flags & AFS_OPERATION_CUR_ONLY) {
				if (!afs_sleep_and_retry(op))
					goto failed;

				/* Retry with same server & address */
				_leave(" = t [vbusy]");
				return true;
			}

			op->flags |= AFS_OPERATION_VBUSY;
			goto next_server;

		case VMOVED:
			/* The volume migrated to another server.  We consider all locks
			 * and callbacks broken and request an update from the VLDB.
			 *
			 * We also limit the number of VMOVED hops we will honour, just
			 * in case someone sets up a loop.
			 */
			if (op->flags & AFS_OPERATION_VMOVED) {
				afs_op_set_error(op, -EREMOTEIO);
				goto failed;
			}
			op->flags |= AFS_OPERATION_VMOVED;

			set_bit(AFS_VOLUME_WAIT, &op->volume->flags);
			set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
			error = afs_check_volume_status(op->volume, op);
			if (error < 0) {
				afs_op_set_error(op, error);
				goto failed;
			}

			/* If the server list didn't change, then the VLDB is
			 * out of sync with the fileservers.  This is hopefully
			 * a temporary condition, however, so we don't want to
			 * permanently block access to the file.
			 *
			 * TODO: Try other fileservers if we can.
			 *
			 * TODO: Retry a few times with sleeps.
			 */
			if (rcu_access_pointer(op->volume->servers) == op->server_list) {
				afs_op_accumulate_error(op, -ENOMEDIUM, abort_code);
				goto failed;
			}

			goto restart_from_beginning;

		case UAEIO:
		case VIO:
			afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
			if (op->volume->type != AFSVL_RWVOL)
				goto next_server;
			goto failed;

		case VDISKFULL:
		case UAENOSPC:
			/* The partition is full.  Only applies to RWVOLs.
			 * Translate locally and return ENOSPC.
			 * No replicas to failover to.
			 */
			afs_op_set_error(op, -ENOSPC);
			goto failed_but_online;

		case VOVERQUOTA:
		case UAEDQUOT:
			/* Volume is full.  Only applies to RWVOLs.
			 * Translate locally and return EDQUOT.
			 * No replicas to failover to.
			 */
			afs_op_set_error(op, -EDQUOT);
			goto failed_but_online;

		default:
			afs_op_accumulate_error(op, error, abort_code);
		failed_but_online:
			clear_bit(AFS_SE_VOLUME_OFFLINE,
				  &op->server_list->servers[op->server_index].flags);
			clear_bit(AFS_SE_VOLUME_BUSY,
				  &op->server_list->servers[op->server_index].flags);
			goto failed;
		}

	case -ETIMEDOUT:
	case -ETIME:
		if (afs_op_error(op) != -EDESTADDRREQ)
			goto iterate_address;
		fallthrough;
	case -ERFKILL:
	case -EADDRNOTAVAIL:
	case -ENETUNREACH:
	case -EHOSTUNREACH:
	case -EHOSTDOWN:
	case -ECONNREFUSED:
		_debug("no conn");
		afs_op_accumulate_error(op, error, 0);
		goto iterate_address;

	case -ENETRESET:
		pr_warn("kAFS: Peer reset %s (op=%x)\n",
			op->type ? op->type->name : "???", op->debug_id);
		fallthrough;
	case -ECONNRESET:
		_debug("call reset");
		afs_op_set_error(op, error);
		goto failed;
	}

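	/* Come here to throw away the server and address state we've
	 * accumulated and begin the rotation again from scratch.
	 */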
restart_from_beginning:
	trace_afs_rotate(op, afs_rotate_trace_restart, 0);
	_debug("restart");
	op->estate = NULL;
	op->server = NULL;
	afs_clear_server_states(op);
	op->server_states = NULL;
	afs_put_serverlist(op->net, op->server_list);
	op->server_list = NULL;
start:
	_debug("start");
	ASSERTCMP(op->estate, ==, NULL);
	/* See if we need to do an update of the volume record.  Note that the
	 * volume may have moved or even have been deleted.
	 */
	error = afs_check_volume_status(op->volume, op);
	trace_afs_rotate(op, afs_rotate_trace_check_vol_status, error);
	if (error < 0) {
		afs_op_set_error(op, error);
		goto failed;
	}

	if (!afs_start_fs_iteration(op, vnode))
		goto failed;

	_debug("__ VOL %llx __", op->volume->vid);

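	/* Come here to pick another server, either at the start of the
	 * rotation or after giving up on the current one.
	 */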
pick_server:
	_debug("pick [%lx]", op->untried_servers);
	ASSERTCMP(op->estate, ==, NULL);

	error = afs_wait_for_fs_probes(op, op->server_states,
				       !(op->flags & AFS_OPERATION_UNINTR));
	switch (error) {
	case 0: /* No untried responsive servers and no outstanding probes */
		trace_afs_rotate(op, afs_rotate_trace_probe_none, 0);
		goto no_more_servers;
	case 1: /* Got a response */
		trace_afs_rotate(op, afs_rotate_trace_probe_response, 0);
		break;
	case 2: /* Probe data superseded */
		trace_afs_rotate(op, afs_rotate_trace_probe_superseded, 0);
		goto restart_from_beginning;
	default:
		trace_afs_rotate(op, afs_rotate_trace_probe_error, error);
		afs_op_set_error(op, error);
		goto failed;
	}

	/* Pick the untried server with the highest priority untried endpoint.
	 * If we have outstanding callbacks, we stick with the server we're
	 * already using if we can.
	 */
	if (op->server) {
		_debug("server %u", op->server_index);
		if (test_bit(op->server_index, &op->untried_servers))
			goto selected_server;
		op->server = NULL;
		_debug("no server");
	}

	rcu_read_lock();
	op->server_index = -1;
	best_prio = -1;
	for (i = 0; i < op->server_list->nr_servers; i++) {
		struct afs_endpoint_state *es;
		struct afs_server_entry *se = &op->server_list->servers[i];
		struct afs_addr_list *sal;
		struct afs_server *s = se->server;

		if (!test_bit(i, &op->untried_servers) ||
		    test_bit(AFS_SE_EXCLUDED, &se->flags) ||
		    !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
			continue;
		es = op->server_states[i].endpoint_state;
		sal = es->addresses;

		afs_get_address_preferences_rcu(op->net, sal);
		for (j = 0; j < sal->nr_addrs; j++) {
			if (es->failed_set & (1 << j))
				continue;
			if (!sal->addrs[j].peer)
				continue;
			if (sal->addrs[j].prio > best_prio) {
				op->server_index = i;
				best_prio = sal->addrs[j].prio;
			}
		}
	}
	rcu_read_unlock();

	if (op->server_index == -1)
		goto no_more_servers;

selected_server:
	trace_afs_rotate(op, afs_rotate_trace_selected_server, best_prio);
	_debug("use %d prio %u", op->server_index, best_prio);
	__clear_bit(op->server_index, &op->untried_servers);

	/* We're starting on a different fileserver from the list.  We need to
	 * check it, create a callback intercept, find its address list and
	 * probe its capabilities before we use it.
	 */
	ASSERTCMP(op->estate, ==, NULL);
	server = op->server_list->servers[op->server_index].server;

	if (!afs_check_server_record(op, server, op->key))
		goto failed;

	_debug("USING SERVER: %pU", &server->uuid);

	op->flags |= AFS_OPERATION_RETRY_SERVER;
	op->server = server;
	if (vnode->cb_server != server) {
		vnode->cb_server = server;
		vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
		atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
	}

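	/* Reset the record of which of this server's addresses we've tried. */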
retry_server:
	op->addr_tried = 0;
	op->addr_index = -1;

iterate_address:
	/* Iterate over the current server's address list to try and find an
	 * address on which it will respond to us.
	 */
	op->estate = op->server_states[op->server_index].endpoint_state;
	set = READ_ONCE(op->estate->responsive_set);
	failed = READ_ONCE(op->estate->failed_set);
	_debug("iterate ES=%x rs=%lx fs=%lx", op->estate->probe_seq, set, failed);
	set &= ~(failed | op->addr_tried);
	trace_afs_rotate(op, afs_rotate_trace_iterate_addr, set);
	if (!set)
		goto wait_for_more_probe_results;

	alist = op->estate->addresses;
	best_prio = -1;
	addr_index = 0;
	for (i = 0; i < alist->nr_addrs; i++) {
		if (!(set & (1 << i)))
			continue;
		if (alist->addrs[i].prio > best_prio) {
			addr_index = i;
			best_prio = alist->addrs[i].prio;
		}
	}

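	/* Record the chosen address as the preferred one for this address
	 * list.
	 */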
	alist->preferred = addr_index;

	op->addr_index = addr_index;
	set_bit(addr_index, &op->addr_tried);

	op->volsync.creation = TIME64_MIN;
	op->volsync.update = TIME64_MIN;
	op->call_responded = false;
	_debug("address [%u] %u/%u %pISp",
	       op->server_index, addr_index, alist->nr_addrs,
	       rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer));
	_leave(" = t");
	return true;

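	/* There are no responsive, unfailed, untried addresses left, so wait
	 * for further probe results before deciding what to do next.
	 */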
wait_for_more_probe_results:
	error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
					  !(op->flags & AFS_OPERATION_UNINTR));
	if (error == 1)
		goto iterate_address;
	if (!error)
		goto restart_from_beginning;

	/* We've now had a failure to respond on all of a server's addresses -
	 * immediately probe them again and consider retrying the server.
	 */
	trace_afs_rotate(op, afs_rotate_trace_probe_fileserver, 0);
	afs_probe_fileserver(op->net, op->server);
	if (op->flags & AFS_OPERATION_RETRY_SERVER) {
		error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
						  !(op->flags & AFS_OPERATION_UNINTR));
		switch (error) {
		case 1:
			op->flags &= ~AFS_OPERATION_RETRY_SERVER;
			trace_afs_rotate(op, afs_rotate_trace_retry_server, 1);
			goto retry_server;
		case 0:
			trace_afs_rotate(op, afs_rotate_trace_retry_server, 0);
			goto restart_from_beginning;
		case -ERESTARTSYS:
			afs_op_set_error(op, error);
			goto failed;
		case -ETIME:
		case -EDESTADDRREQ:
			goto next_server;
		}
	}

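	/* Give up on the current server and go back to pick another untried
	 * one.
	 */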
next_server:
	trace_afs_rotate(op, afs_rotate_trace_next_server, 0);
	_debug("next");
	op->estate = NULL;
	goto pick_server;

no_more_servers:
	/* That's all the servers poked to no good effect.  Try again if some
	 * of them were busy.
	 */
	trace_afs_rotate(op, afs_rotate_trace_no_more_servers, 0);
	if (op->flags & AFS_OPERATION_VBUSY) {
		afs_sleep_and_retry(op);
		op->flags &= ~AFS_OPERATION_VBUSY;
		goto restart_from_beginning;
	}

	rcu_read_lock();
	for (i = 0; i < op->server_list->nr_servers; i++) {
		struct afs_endpoint_state *estate;

		estate = op->server_states[i].endpoint_state;
		error = READ_ONCE(estate->error);
		if (error < 0)
			afs_op_accumulate_error(op, error, estate->abort_code);
	}
	rcu_read_unlock();

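	/* Stop the rotation, leaving the accumulated error for the caller to
	 * pick up.
	 */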
failed:
	trace_afs_rotate(op, afs_rotate_trace_failed, 0);
	op->flags |= AFS_OPERATION_STOP;
	op->estate = NULL;
	_leave(" = f [failed %d]", afs_op_error(op));
	return false;
}

/*
 * Dump cursor state in the case of the error being EDESTADDRREQ.
 */
void afs_dump_edestaddrreq(const struct afs_operation *op)
{
	static int count;
	int i;

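	/* Only dump if cursor debugging is configured, and only for the first
	 * few occurrences.
	 */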
	if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
		return;
	count++;

	rcu_read_lock();

	pr_notice("EDESTADDR occurred\n");
	pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n",
		  op->file[0].cb_break_before,
		  op->file[1].cb_break_before, op->flags, op->cumul_error.error);
	pr_notice("OP: ut=%lx ix=%d ni=%u\n",
		  op->untried_servers, op->server_index, op->nr_iterations);
	pr_notice("OP: call er=%d ac=%d r=%u\n",
		  op->call_error, op->call_abort_code, op->call_responded);

	if (op->server_list) {
		const struct afs_server_list *sl = op->server_list;

		pr_notice("FC: SL nr=%u vnov=%hx\n",
			  sl->nr_servers, sl->vnovol_mask);
		for (i = 0; i < sl->nr_servers; i++) {
			const struct afs_server *s = sl->servers[i].server;
			const struct afs_endpoint_state *e =
				rcu_dereference(s->endpoint_state);
			const struct afs_addr_list *a = e->addresses;

			pr_notice("FC: server fl=%lx av=%u %pU\n",
				  s->flags, s->addr_version, &s->uuid);
			pr_notice("FC:  - pq=%x R=%lx F=%lx\n",
				  e->probe_seq, e->responsive_set, e->failed_set);
			if (a) {
				pr_notice("FC:  - av=%u nr=%u/%u/%u pr=%u\n",
					  a->version,
					  a->nr_ipv4, a->nr_addrs, a->max_addrs,
					  a->preferred);
				if (a == e->addresses)
					pr_notice("FC:  - current\n");
			}
		}
	}

	pr_notice("AC: t=%lx ax=%d\n", op->addr_tried, op->addr_index);
	rcu_read_unlock();
}