v6.13.7
  1// SPDX-License-Identifier: GPL-2.0-or-later
  2/* Handle fileserver selection and rotation.
  3 *
  4 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
  5 * Written by David Howells (dhowells@redhat.com)
  6 */
  7
  8#include <linux/kernel.h>
  9#include <linux/slab.h>
 10#include <linux/fs.h>
 11#include <linux/sched.h>
 12#include <linux/delay.h>
 13#include <linux/sched/signal.h>
 14#include "internal.h"
 15#include "afs_fs.h"
 16#include "protocol_uae.h"
 17
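    /*
     * Drop the endpoint-state references taken for each server when the
     * rotation state was set up and free the per-server state array.
     */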
 18void afs_clear_server_states(struct afs_operation *op)
 19{
 20	unsigned int i;
 21
 22	if (op->server_states) {
 23		for (i = 0; i < op->server_list->nr_servers; i++)
 24			afs_put_endpoint_state(op->server_states[i].endpoint_state,
 25					       afs_estate_trace_put_server_state);
 26		kfree(op->server_states);
 27	}
 28}
 29
 30/*
 31 * Begin iteration through a server list, starting with the vnode's last used
 32 * server if possible, or the last recorded good server if not.
 33 */
 34static bool afs_start_fs_iteration(struct afs_operation *op,
 35				   struct afs_vnode *vnode)
 36{
 37	struct afs_server *server;
 38	void *cb_server;
 39	int i;
 40
 41	trace_afs_rotate(op, afs_rotate_trace_start, 0);
 42
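    	/* Pin a reference to the volume's current server list for this operation. */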
 43	read_lock(&op->volume->servers_lock);
 44	op->server_list = afs_get_serverlist(
 45		rcu_dereference_protected(op->volume->servers,
 46					  lockdep_is_held(&op->volume->servers_lock)));
 47	read_unlock(&op->volume->servers_lock);
 48
 49	op->server_states = kcalloc(op->server_list->nr_servers, sizeof(op->server_states[0]),
 50				    GFP_KERNEL);
 51	if (!op->server_states) {
 52		afs_op_nomem(op);
 53		trace_afs_rotate(op, afs_rotate_trace_nomem, 0);
 54		return false;
 55	}
 56
 57	rcu_read_lock();
 58	for (i = 0; i < op->server_list->nr_servers; i++) {
 59		struct afs_endpoint_state *estate;
 60		struct afs_server_state *s = &op->server_states[i];
 61
 62		server = op->server_list->servers[i].server;
 63		estate = rcu_dereference(server->endpoint_state);
 64		s->endpoint_state = afs_get_endpoint_state(estate,
 65							   afs_estate_trace_get_server_state);
 66		s->probe_seq = estate->probe_seq;
 67		s->untried_addrs = (1UL << estate->addresses->nr_addrs) - 1;
 68		init_waitqueue_entry(&s->probe_waiter, current);
 69		afs_get_address_preferences(op->net, estate->addresses);
 70	}
 71	rcu_read_unlock();
 72
 73
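    	/* Mark every server in the list as untried; no server is selected yet. */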
 74	op->untried_servers = (1UL << op->server_list->nr_servers) - 1;
 75	op->server_index = -1;
 76
 77	cb_server = vnode->cb_server;
 78	if (cb_server) {
 79		/* See if the vnode's preferred record is still available */
 80		for (i = 0; i < op->server_list->nr_servers; i++) {
 81			server = op->server_list->servers[i].server;
 82			if (server == cb_server) {
 83				op->server_index = i;
 84				goto found_interest;
 85			}
 86		}
 87
 88		/* If we have a lock outstanding on a server that's no longer
 89		 * serving this vnode, then we can't switch to another server
 90		 * and have to return an error.
 91		 */
 92		if (op->flags & AFS_OPERATION_CUR_ONLY) {
 93			afs_op_set_error(op, -ESTALE);
 94			trace_afs_rotate(op, afs_rotate_trace_stale_lock, 0);
 95			return false;
 96		}
 97
 98		/* Note that the callback promise is effectively broken */
 99		write_seqlock(&vnode->cb_lock);
100		ASSERTCMP(cb_server, ==, vnode->cb_server);
101		vnode->cb_server = NULL;
102		if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE)
103			vnode->cb_break++;
104		write_sequnlock(&vnode->cb_lock);
105	}
106
107found_interest:
108	return true;
109}
110
111/*
112 * Post volume busy note.
113 */
114static void afs_busy(struct afs_operation *op, u32 abort_code)
115{
116	const char *m;
117
118	switch (abort_code) {
119	case VOFFLINE:		m = "offline";		break;
120	case VRESTARTING:	m = "restarting";	break;
121	case VSALVAGING:	m = "being salvaged";	break;
122	default:		m = "busy";		break;
123	}
124
125	pr_notice("kAFS: Volume %llu '%s' on server %pU is %s\n",
126		  op->volume->vid, op->volume->name, &op->server->uuid, m);
127}
128
129/*
130 * Sleep and retry the operation to the same fileserver.
131 */
132static bool afs_sleep_and_retry(struct afs_operation *op)
133{
134	trace_afs_rotate(op, afs_rotate_trace_busy_sleep, 0);
135	if (!(op->flags & AFS_OPERATION_UNINTR)) {
136		msleep_interruptible(1000);
137		if (signal_pending(current)) {
138			afs_op_set_error(op, -ERESTARTSYS);
139			return false;
140		}
141	} else {
142		msleep(1000);
143	}
144
145	return true;
146}
147
148/*
149 * Select the fileserver to use.  May be called multiple times to rotate
150 * through the fileservers.
151 */
152bool afs_select_fileserver(struct afs_operation *op)
153{
154	struct afs_addr_list *alist;
155	struct afs_server *server;
156	struct afs_vnode *vnode = op->file[0].vnode;
157	unsigned long set, failed;
158	s32 abort_code = op->call_abort_code;
159	int best_prio = 0;
160	int error = op->call_error, addr_index, i, j;
161
162	op->nr_iterations++;
163
164	_enter("OP=%x+%x,%llx,%u{%lx},%u{%lx},%d,%d",
165	       op->debug_id, op->nr_iterations, op->volume->vid,
166	       op->server_index, op->untried_servers,
167	       op->addr_index, op->addr_tried,
168	       error, abort_code);
169
170	if (op->flags & AFS_OPERATION_STOP) {
171		trace_afs_rotate(op, afs_rotate_trace_stopped, 0);
172		_leave(" = f [stopped]");
173		return false;
174	}
175
176	if (op->nr_iterations == 0)
177		goto start;
178
179	WRITE_ONCE(op->estate->addresses->addrs[op->addr_index].last_error, error);
180	trace_afs_rotate(op, afs_rotate_trace_iter, op->call_error);
181
182	/* Evaluate the result of the previous operation, if there was one. */
183	switch (op->call_error) {
184	case 0:
185		clear_bit(AFS_SE_VOLUME_OFFLINE,
186			  &op->server_list->servers[op->server_index].flags);
187		clear_bit(AFS_SE_VOLUME_BUSY,
188			  &op->server_list->servers[op->server_index].flags);
189		op->cumul_error.responded = true;
190
191		/* We succeeded, but we may need to redo the op from another
192		 * server if we're looking at a set of RO volumes where some of
193		 * the servers have not yet been brought up to date lest we
194		 * regress the data.  We only switch to the new version once
195		 * >=50% of the servers are updated.
196		 */
197		error = afs_update_volume_state(op);
198		if (error != 0) {
199			if (error == 1) {
200				afs_sleep_and_retry(op);
201				goto restart_from_beginning;
202			}
203			afs_op_set_error(op, error);
204			goto failed;
205		}
206		fallthrough;
207	default:
208		/* Success or local failure.  Stop. */
209		afs_op_set_error(op, error);
210		op->flags |= AFS_OPERATION_STOP;
211		trace_afs_rotate(op, afs_rotate_trace_stop, error);
212		_leave(" = f [okay/local %d]", error);
213		return false;
214
215	case -ECONNABORTED:
216		/* The far side rejected the operation on some grounds.  This
217		 * might involve the server being busy or the volume having been moved.
218		 *
219		 * Note that various V* errors should not be sent to a cache manager
220		 * by a fileserver as they should be translated to more modern UAE*
221		 * errors instead.  IBM AFS and OpenAFS fileservers, however, do leak
222		 * these abort codes.
223		 */
224		trace_afs_rotate(op, afs_rotate_trace_aborted, abort_code);
225		op->cumul_error.responded = true;
226		switch (abort_code) {
227		case VNOVOL:
228			/* This fileserver doesn't know about the volume.
229			 * - May indicate that the VL is wrong - retry once and compare
230			 *   the results.
231			 * - May indicate that the fileserver couldn't attach to the vol.
232			 * - The volume might have been temporarily removed so that it can
233			 *   be replaced by a volume restore.  "vos" might have ended one
234			 *   transaction and has yet to create the next.
235			 * - The volume might not be blessed or might not be in-service
236			 *   (administrative action).
237			 */
238			if (op->flags & AFS_OPERATION_VNOVOL) {
239				afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
240				goto next_server;
241			}
242
243			write_lock(&op->volume->servers_lock);
244			op->server_list->vnovol_mask |= 1 << op->server_index;
245			write_unlock(&op->volume->servers_lock);
246
247			set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
248			error = afs_check_volume_status(op->volume, op);
249			if (error < 0) {
250				afs_op_set_error(op, error);
251				goto failed;
252			}
253
254			if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) {
255				afs_op_set_error(op, -ENOMEDIUM);
256				goto failed;
257			}
258
259			/* If the server list didn't change, then assume that
260			 * it's the fileserver having trouble.
261			 */
262			if (rcu_access_pointer(op->volume->servers) == op->server_list) {
263				afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
264				goto next_server;
265			}
266
267			/* Try again */
268			op->flags |= AFS_OPERATION_VNOVOL;
269			_leave(" = t [vnovol]");
270			return true;
271
272		case VVOLEXISTS:
273		case VONLINE:
274			/* These should not be returned from the fileserver. */
275			pr_warn("Fileserver returned unexpected abort %d\n",
276				abort_code);
277			afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
278			goto next_server;
279
280		case VNOSERVICE:
281			/* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver
282			 * if the volume was neither in-service nor administratively
283			 * blessed.  All usage was replaced by VNOVOL because AFS 3.1 and
284			 * earlier cache managers did not handle VNOSERVICE and assumed
285			 * it was the client OSes errno 105.
286			 *
287			 * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the
288			 * fileserver idle dead time error which was sent in place of
289			 * RX_CALL_TIMEOUT (-3).  The error was intended to be sent if the
290			 * fileserver took too long to send a reply to the client.
291			 * RX_CALL_TIMEOUT would have caused the cache manager to mark the
 292			 * server down whereas VNOSERVICE since AFS 3.2 would cause the cache
293			 * manager to temporarily (up to 15 minutes) mark the volume
294			 * instance as unusable.
295			 *
296			 * The idle dead logic resulted in cache inconsistency since a
297			 * state changing call that the cache manager assumed was dead
298			 * could still be processed to completion by the fileserver.  This
299			 * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer
300			 * returned.  However, many 1.4.8 through 1.6.24 fileservers are
301			 * still in existence.
302			 *
303			 * AuriStorFS fileservers have never returned VNOSERVICE.
304			 *
305			 * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
306			 */
307		case RX_CALL_TIMEOUT:
308			afs_op_accumulate_error(op, -ETIMEDOUT, abort_code);
309			goto next_server;
310
311		case VSALVAGING: /* This error should not be leaked to cache managers
312				  * but is from OpenAFS demand attach fileservers.
313				  * It should be treated as an alias for VOFFLINE.
314				  */
315		case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */
316		case VOFFLINE:
317			/* The volume is in use by the volserver or another volume utility
318			 * for an operation that might alter the contents.  The volume is
319			 * expected to come back but it might take a long time (could be
320			 * days).
321			 */
322			if (!test_and_set_bit(AFS_SE_VOLUME_OFFLINE,
323					      &op->server_list->servers[op->server_index].flags)) {
324				afs_busy(op, abort_code);
325				clear_bit(AFS_SE_VOLUME_BUSY,
326					  &op->server_list->servers[op->server_index].flags);
327			}
328			if (op->flags & AFS_OPERATION_NO_VSLEEP) {
329				afs_op_set_error(op, -EADV);
330				goto failed;
331			}
332			goto busy;
333
334		case VRESTARTING: /* The fileserver is either shutting down or starting up. */
335		case VBUSY:
336			/* The volume is in use by the volserver or another volume
337			 * utility for an operation that is not expected to alter the
338			 * contents of the volume.  VBUSY does not need to be returned
339			 * for a ROVOL or BACKVOL bound to an ITBusy volserver
340			 * transaction.  The fileserver is permitted to continue serving
341			 * content from ROVOLs and BACKVOLs during an ITBusy transaction
342			 * because the content will not change.  However, many fileserver
343			 * releases do return VBUSY for ROVOL and BACKVOL instances under
344			 * many circumstances.
345			 *
346			 * Retry after going round all the servers unless we have a file
347			 * lock we need to maintain.
348			 */
349			if (op->flags & AFS_OPERATION_NO_VSLEEP) {
350				afs_op_set_error(op, -EBUSY);
351				goto failed;
352			}
353			if (!test_and_set_bit(AFS_SE_VOLUME_BUSY,
354					      &op->server_list->servers[op->server_index].flags)) {
355				afs_busy(op, abort_code);
356				clear_bit(AFS_SE_VOLUME_OFFLINE,
357					  &op->server_list->servers[op->server_index].flags);
358			}
359		busy:
360			if (op->flags & AFS_OPERATION_CUR_ONLY) {
361				if (!afs_sleep_and_retry(op))
362					goto failed;
363
364				/* Retry with same server & address */
365				_leave(" = t [vbusy]");
366				return true;
367			}
368
369			op->flags |= AFS_OPERATION_VBUSY;
370			goto next_server;
371
372		case VMOVED:
373			/* The volume migrated to another server.  We consider
 374			 * all locks and callbacks broken and request
375			 * an update from the VLDB.
376			 *
377			 * We also limit the number of VMOVED hops we will
378			 * honour, just in case someone sets up a loop.
379			 */
380			if (op->flags & AFS_OPERATION_VMOVED) {
381				afs_op_set_error(op, -EREMOTEIO);
382				goto failed;
383			}
384			op->flags |= AFS_OPERATION_VMOVED;
385
386			set_bit(AFS_VOLUME_WAIT, &op->volume->flags);
387			set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
388			error = afs_check_volume_status(op->volume, op);
389			if (error < 0) {
390				afs_op_set_error(op, error);
391				goto failed;
392			}
393
394			/* If the server list didn't change, then the VLDB is
395			 * out of sync with the fileservers.  This is hopefully
396			 * a temporary condition, however, so we don't want to
397			 * permanently block access to the file.
398			 *
399			 * TODO: Try other fileservers if we can.
400			 *
401			 * TODO: Retry a few times with sleeps.
402			 */
403			if (rcu_access_pointer(op->volume->servers) == op->server_list) {
404				afs_op_accumulate_error(op, -ENOMEDIUM, abort_code);
405				goto failed;
406			}
407
408			goto restart_from_beginning;
409
410		case UAEIO:
411		case VIO:
412			afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
413			if (op->volume->type != AFSVL_RWVOL)
414				goto next_server;
415			goto failed;
416
417		case VDISKFULL:
418		case UAENOSPC:
419			/* The partition is full.  Only applies to RWVOLs.
420			 * Translate locally and return ENOSPC.
421			 * No replicas to failover to.
422			 */
423			afs_op_set_error(op, -ENOSPC);
424			goto failed_but_online;
425
426		case VOVERQUOTA:
427		case UAEDQUOT:
428			/* Volume is full.  Only applies to RWVOLs.
429			 * Translate locally and return EDQUOT.
430			 * No replicas to failover to.
431			 */
432			afs_op_set_error(op, -EDQUOT);
433			goto failed_but_online;
434
435		default:
436			afs_op_accumulate_error(op, error, abort_code);
437		failed_but_online:
438			clear_bit(AFS_SE_VOLUME_OFFLINE,
439				  &op->server_list->servers[op->server_index].flags);
440			clear_bit(AFS_SE_VOLUME_BUSY,
441				  &op->server_list->servers[op->server_index].flags);
442			goto failed;
443		}
444
445	case -ETIMEDOUT:
446	case -ETIME:
447		if (afs_op_error(op) != -EDESTADDRREQ)
448			goto iterate_address;
449		fallthrough;
450	case -ERFKILL:
451	case -EADDRNOTAVAIL:
452	case -ENETUNREACH:
453	case -EHOSTUNREACH:
454	case -EHOSTDOWN:
455	case -ECONNREFUSED:
456		_debug("no conn");
457		afs_op_accumulate_error(op, error, 0);
458		goto iterate_address;
459
460	case -ENETRESET:
461		pr_warn("kAFS: Peer reset %s (op=%x)\n",
462			op->type ? op->type->name : "???", op->debug_id);
463		fallthrough;
464	case -ECONNRESET:
465		_debug("call reset");
466		afs_op_set_error(op, error);
467		goto failed;
468	}
469
470restart_from_beginning:
471	trace_afs_rotate(op, afs_rotate_trace_restart, 0);
472	_debug("restart");
473	op->estate = NULL;
474	op->server = NULL;
475	afs_clear_server_states(op);
476	op->server_states = NULL;
477	afs_put_serverlist(op->net, op->server_list);
478	op->server_list = NULL;
479start:
480	_debug("start");
481	ASSERTCMP(op->estate, ==, NULL);
482	/* See if we need to do an update of the volume record.  Note that the
483	 * volume may have moved or even have been deleted.
484	 */
485	error = afs_check_volume_status(op->volume, op);
486	trace_afs_rotate(op, afs_rotate_trace_check_vol_status, error);
487	if (error < 0) {
488		afs_op_set_error(op, error);
489		goto failed;
490	}
491
492	if (!afs_start_fs_iteration(op, vnode))
493		goto failed;
494
495	_debug("__ VOL %llx __", op->volume->vid);
496
497pick_server:
498	_debug("pick [%lx]", op->untried_servers);
499	ASSERTCMP(op->estate, ==, NULL);
500
501	error = afs_wait_for_fs_probes(op, op->server_states,
502				       !(op->flags & AFS_OPERATION_UNINTR));
503	switch (error) {
504	case 0: /* No untried responsive servers and no outstanding probes */
505		trace_afs_rotate(op, afs_rotate_trace_probe_none, 0);
506		goto no_more_servers;
507	case 1: /* Got a response */
508		trace_afs_rotate(op, afs_rotate_trace_probe_response, 0);
509		break;
510	case 2: /* Probe data superseded */
511		trace_afs_rotate(op, afs_rotate_trace_probe_superseded, 0);
512		goto restart_from_beginning;
513	default:
514		trace_afs_rotate(op, afs_rotate_trace_probe_error, error);
515		afs_op_set_error(op, error);
516		goto failed;
517	}
518
519	/* Pick the untried server with the highest priority untried endpoint.
520	 * If we have outstanding callbacks, we stick with the server we're
521	 * already using if we can.
522	 */
523	if (op->server) {
524		_debug("server %u", op->server_index);
525		if (test_bit(op->server_index, &op->untried_servers))
526			goto selected_server;
527		op->server = NULL;
528		_debug("no server");
529	}
530
531	rcu_read_lock();
532	op->server_index = -1;
533	best_prio = -1;
534	for (i = 0; i < op->server_list->nr_servers; i++) {
535		struct afs_endpoint_state *es;
536		struct afs_server_entry *se = &op->server_list->servers[i];
537		struct afs_addr_list *sal;
538		struct afs_server *s = se->server;
539
540		if (!test_bit(i, &op->untried_servers) ||
541		    test_bit(AFS_SE_EXCLUDED, &se->flags) ||
542		    !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
543			continue;
544		es = op->server_states[i].endpoint_state;
545		sal = es->addresses;
546
547		afs_get_address_preferences_rcu(op->net, sal);
548		for (j = 0; j < sal->nr_addrs; j++) {
549			if (es->failed_set & (1 << j))
550				continue;
551			if (!sal->addrs[j].peer)
552				continue;
553			if (sal->addrs[j].prio > best_prio) {
554				op->server_index = i;
555				best_prio = sal->addrs[j].prio;
556			}
557		}
558	}
559	rcu_read_unlock();
560
561	if (op->server_index == -1)
562		goto no_more_servers;
563
564selected_server:
565	trace_afs_rotate(op, afs_rotate_trace_selected_server, best_prio);
566	_debug("use %d prio %u", op->server_index, best_prio);
567	__clear_bit(op->server_index, &op->untried_servers);
568
569	/* We're starting on a different fileserver from the list.  We need to
570	 * check it, create a callback intercept, find its address list and
571	 * probe its capabilities before we use it.
572	 */
573	ASSERTCMP(op->estate, ==, NULL);
574	server = op->server_list->servers[op->server_index].server;
575
576	if (!afs_check_server_record(op, server, op->key))
577		goto failed;
578
579	_debug("USING SERVER: %pU", &server->uuid);
580
581	op->flags |= AFS_OPERATION_RETRY_SERVER;
582	op->server = server;
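    	/* Moving to a different server means any callback promise held for
    	 * this vnode is no longer valid.
    	 */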
583	if (vnode->cb_server != server) {
584		vnode->cb_server = server;
585		vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
586		atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
587	}
588
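    /* Reset the address cursor before trying this server's endpoints. */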
589retry_server:
590	op->addr_tried = 0;
591	op->addr_index = -1;
592
593iterate_address:
594	/* Iterate over the current server's address list to try and find an
595	 * address on which it will respond to us.
596	 */
597	op->estate = op->server_states[op->server_index].endpoint_state;
598	set = READ_ONCE(op->estate->responsive_set);
599	failed = READ_ONCE(op->estate->failed_set);
600	_debug("iterate ES=%x rs=%lx fs=%lx", op->estate->probe_seq, set, failed);
601	set &= ~(failed | op->addr_tried);
602	trace_afs_rotate(op, afs_rotate_trace_iterate_addr, set);
603	if (!set)
604		goto wait_for_more_probe_results;
605
606	alist = op->estate->addresses;
607	best_prio = -1;
608	addr_index = 0;
609	for (i = 0; i < alist->nr_addrs; i++) {
610		if (!(set & (1 << i)))
611			continue;
612		if (alist->addrs[i].prio > best_prio) {
613			addr_index = i;
614			best_prio = alist->addrs[i].prio;
615		}
616	}
617
618	alist->preferred = addr_index;
619
620	op->addr_index = addr_index;
621	set_bit(addr_index, &op->addr_tried);
622
623	op->volsync.creation = TIME64_MIN;
624	op->volsync.update = TIME64_MIN;
625	op->call_responded = false;
626	_debug("address [%u] %u/%u %pISp",
627	       op->server_index, addr_index, alist->nr_addrs,
628	       rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer));
629	_leave(" = t");
630	return true;
631
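    /* Every usable address has been tried or has failed.  Wait for another
     * probe result: 1 means rescan this server's address list and 0 means
     * restart the rotation; a negative error is treated as the server
     * failing to respond on all of its addresses (see below).
     */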
632wait_for_more_probe_results:
633	error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
634					  !(op->flags & AFS_OPERATION_UNINTR));
635	if (error == 1)
636		goto iterate_address;
637	if (!error)
638		goto restart_from_beginning;
639
640	/* We've now had a failure to respond on all of a server's addresses -
641	 * immediately probe them again and consider retrying the server.
642	 */
643	trace_afs_rotate(op, afs_rotate_trace_probe_fileserver, 0);
644	afs_probe_fileserver(op->net, op->server);
645	if (op->flags & AFS_OPERATION_RETRY_SERVER) {
646		error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
647						  !(op->flags & AFS_OPERATION_UNINTR));
648		switch (error) {
649		case 1:
650			op->flags &= ~AFS_OPERATION_RETRY_SERVER;
651			trace_afs_rotate(op, afs_rotate_trace_retry_server, 1);
652			goto retry_server;
653		case 0:
654			trace_afs_rotate(op, afs_rotate_trace_retry_server, 0);
655			goto restart_from_beginning;
656		case -ERESTARTSYS:
657			afs_op_set_error(op, error);
658			goto failed;
659		case -ETIME:
660		case -EDESTADDRREQ:
661			goto next_server;
662		}
663	}
664
665next_server:
666	trace_afs_rotate(op, afs_rotate_trace_next_server, 0);
667	_debug("next");
668	op->estate = NULL;
669	goto pick_server;
670
671no_more_servers:
672	/* That's all the servers poked to no good effect.  Try again if some
673	 * of them were busy.
674	 */
675	trace_afs_rotate(op, afs_rotate_trace_no_more_servers, 0);
676	if (op->flags & AFS_OPERATION_VBUSY) {
677		afs_sleep_and_retry(op);
678		op->flags &= ~AFS_OPERATION_VBUSY;
679		goto restart_from_beginning;
680	}
681
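    	/* Fold any probe errors recorded against each server into the
    	 * operation's cumulative error.
    	 */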
682	rcu_read_lock();
683	for (i = 0; i < op->server_list->nr_servers; i++) {
684		struct afs_endpoint_state *estate;
685
686		estate = op->server_states[i].endpoint_state;
687		error = READ_ONCE(estate->error);
688		if (error < 0)
689			afs_op_accumulate_error(op, error, estate->abort_code);
690	}
691	rcu_read_unlock();
692
693failed:
694	trace_afs_rotate(op, afs_rotate_trace_failed, 0);
695	op->flags |= AFS_OPERATION_STOP;
696	op->estate = NULL;
697	_leave(" = f [failed %d]", afs_op_error(op));
698	return false;
699}
700
701/*
702 * Dump cursor state in the case of the error being EDESTADDRREQ.
703 */
704void afs_dump_edestaddrreq(const struct afs_operation *op)
705{
706	static int count;
707	int i;
708
709	if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
710		return;
711	count++;
712
713	rcu_read_lock();
714
715	pr_notice("EDESTADDR occurred\n");
716	pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n",
717		  op->file[0].cb_break_before,
718		  op->file[1].cb_break_before, op->flags, op->cumul_error.error);
719	pr_notice("OP: ut=%lx ix=%d ni=%u\n",
720		  op->untried_servers, op->server_index, op->nr_iterations);
721	pr_notice("OP: call  er=%d ac=%d r=%u\n",
722		  op->call_error, op->call_abort_code, op->call_responded);
723
724	if (op->server_list) {
725		const struct afs_server_list *sl = op->server_list;
726
727		pr_notice("FC: SL nr=%u vnov=%hx\n",
728			  sl->nr_servers, sl->vnovol_mask);
729		for (i = 0; i < sl->nr_servers; i++) {
730			const struct afs_server *s = sl->servers[i].server;
731			const struct afs_endpoint_state *e =
732				rcu_dereference(s->endpoint_state);
733			const struct afs_addr_list *a = e->addresses;
734
735			pr_notice("FC: server fl=%lx av=%u %pU\n",
736				  s->flags, s->addr_version, &s->uuid);
737			pr_notice("FC:  - pq=%x R=%lx F=%lx\n",
738				  e->probe_seq, e->responsive_set, e->failed_set);
739			if (a) {
740				pr_notice("FC:  - av=%u nr=%u/%u/%u pr=%u\n",
741					  a->version,
742					  a->nr_ipv4, a->nr_addrs, a->max_addrs,
743					  a->preferred);
744				if (a == e->addresses)
745					pr_notice("FC:  - current\n");
746			}
747		}
748	}
749
750	pr_notice("AC: t=%lx ax=%d\n", op->addr_tried, op->addr_index);
751	rcu_read_unlock();
752}