fs.c - security/landlock/fs.c - Linux source code v5.4

Note: File does not exist in v5.4.
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Landlock LSM - Filesystem management and hooks
   4 *
   5 * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
   6 * Copyright © 2018-2020 ANSSI
   7 * Copyright © 2021-2022 Microsoft Corporation
   8 */
   9
  10#include <linux/atomic.h>
  11#include <linux/bitops.h>
  12#include <linux/bits.h>
  13#include <linux/compiler_types.h>
  14#include <linux/dcache.h>
  15#include <linux/err.h>
  16#include <linux/fs.h>
  17#include <linux/init.h>
  18#include <linux/kernel.h>
  19#include <linux/limits.h>
  20#include <linux/list.h>
  21#include <linux/lsm_hooks.h>
  22#include <linux/mount.h>
  23#include <linux/namei.h>
  24#include <linux/path.h>
  25#include <linux/rcupdate.h>
  26#include <linux/spinlock.h>
  27#include <linux/stat.h>
  28#include <linux/types.h>
  29#include <linux/wait_bit.h>
  30#include <linux/workqueue.h>
  31#include <uapi/linux/landlock.h>
  32
  33#include "common.h"
  34#include "cred.h"
  35#include "fs.h"
  36#include "limits.h"
  37#include "object.h"
  38#include "ruleset.h"
  39#include "setup.h"
  40
  41/* Underlying object management */
  42
  43static void release_inode(struct landlock_object *const object)
  44	__releases(object->lock)
  45{
  46	struct inode *const inode = object->underobj;
  47	struct super_block *sb;
  48
  49	if (!inode) {
  50		spin_unlock(&object->lock);
  51		return;
  52	}
  53
  54	/*
  55	 * Protects against concurrent use by hook_sb_delete() of the reference
  56	 * to the underlying inode.
  57	 */
  58	object->underobj = NULL;
  59	/*
  60	 * Makes sure that if the filesystem is concurrently unmounted,
  61	 * hook_sb_delete() will wait for us to finish iput().
  62	 */
  63	sb = inode->i_sb;
  64	atomic_long_inc(&landlock_superblock(sb)->inode_refs);
  65	spin_unlock(&object->lock);
  66	/*
  67	 * Because object->underobj was not NULL, hook_sb_delete() and
  68	 * get_inode_object() guarantee that it is safe to reset
  69	 * landlock_inode(inode)->object while it is not NULL.  It is therefore
  70	 * not necessary to lock inode->i_lock.
  71	 */
  72	rcu_assign_pointer(landlock_inode(inode)->object, NULL);
  73	/*
  74	 * Now, new rules can safely be tied to @inode with get_inode_object().
  75	 */
  76
  77	iput(inode);
  78	if (atomic_long_dec_and_test(&landlock_superblock(sb)->inode_refs))
  79		wake_up_var(&landlock_superblock(sb)->inode_refs);
  80}
  81
  82static const struct landlock_object_underops landlock_fs_underops = {
  83	.release = release_inode
  84};
  85
  86/* Ruleset management */
  87
  88static struct landlock_object *get_inode_object(struct inode *const inode)
  89{
  90	struct landlock_object *object, *new_object;
  91	struct landlock_inode_security *inode_sec = landlock_inode(inode);
  92
  93	rcu_read_lock();
  94retry:
  95	object = rcu_dereference(inode_sec->object);
  96	if (object) {
  97		if (likely(refcount_inc_not_zero(&object->usage))) {
  98			rcu_read_unlock();
  99			return object;
 100		}
 101		/*
 102		 * We are racing with release_inode(), the object is going
 103		 * away.  Wait for release_inode(), then retry.
 104		 */
 105		spin_lock(&object->lock);
 106		spin_unlock(&object->lock);
 107		goto retry;
 108	}
 109	rcu_read_unlock();
 110
 111	/*
 112	 * If there is no object tied to @inode, then create a new one (without
 113	 * holding any locks).
 114	 */
 115	new_object = landlock_create_object(&landlock_fs_underops, inode);
 116	if (IS_ERR(new_object))
 117		return new_object;
 118
 119	/*
 120	 * Protects against concurrent calls to get_inode_object() or
 121	 * hook_sb_delete().
 122	 */
 123	spin_lock(&inode->i_lock);
 124	if (unlikely(rcu_access_pointer(inode_sec->object))) {
 125		/* Someone else just created the object, bail out and retry. */
 126		spin_unlock(&inode->i_lock);
 127		kfree(new_object);
 128
 129		rcu_read_lock();
 130		goto retry;
 131	}
 132
 133	/*
 134	 * @inode will be released by hook_sb_delete() on its superblock
 135	 * shutdown, or by release_inode() when no more ruleset references the
 136	 * related object.
 137	 */
 138	ihold(inode);
 139	rcu_assign_pointer(inode_sec->object, new_object);
 140	spin_unlock(&inode->i_lock);
 141	return new_object;
 142}
 143
 144/* All access rights that can be tied to files. */
 145/* clang-format off */
 146#define ACCESS_FILE ( \
 147	LANDLOCK_ACCESS_FS_EXECUTE | \
 148	LANDLOCK_ACCESS_FS_WRITE_FILE | \
 149	LANDLOCK_ACCESS_FS_READ_FILE | \
 150	LANDLOCK_ACCESS_FS_TRUNCATE)
 151/* clang-format on */
 152
 153/*
 154 * @path: Should have been checked by get_path_from_fd().
 155 */
 156int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
 157			    const struct path *const path,
 158			    access_mask_t access_rights)
 159{
 160	int err;
 161	struct landlock_id id = {
 162		.type = LANDLOCK_KEY_INODE,
 163	};
 164
 165	/* Files only get access rights that make sense. */
 166	if (!d_is_dir(path->dentry) &&
 167	    (access_rights | ACCESS_FILE) != ACCESS_FILE)
 168		return -EINVAL;
 169	if (WARN_ON_ONCE(ruleset->num_layers != 1))
 170		return -EINVAL;
 171
 172	/* Transforms relative access rights to absolute ones. */
 173	access_rights |= LANDLOCK_MASK_ACCESS_FS &
 174			 ~landlock_get_fs_access_mask(ruleset, 0);
 175	id.key.object = get_inode_object(d_backing_inode(path->dentry));
 176	if (IS_ERR(id.key.object))
 177		return PTR_ERR(id.key.object);
 178	mutex_lock(&ruleset->lock);
 179	err = landlock_insert_rule(ruleset, id, access_rights);
 180	mutex_unlock(&ruleset->lock);
 181	/*
 182	 * No need to check for an error because landlock_insert_rule()
 183	 * increments the refcount for the new object if needed.
 184	 */
 185	landlock_put_object(id.key.object);
 186	return err;
 187}
 188
 189/* Access-control management */
 190
 191/*
 192 * The lifetime of the returned rule is tied to @domain.
 193 *
 194 * Returns NULL if no rule is found or if @dentry is negative.
 195 */
 196static const struct landlock_rule *
 197find_rule(const struct landlock_ruleset *const domain,
 198	  const struct dentry *const dentry)
 199{
 200	const struct landlock_rule *rule;
 201	const struct inode *inode;
 202	struct landlock_id id = {
 203		.type = LANDLOCK_KEY_INODE,
 204	};
 205
 206	/* Ignores nonexistent leafs. */
 207	if (d_is_negative(dentry))
 208		return NULL;
 209
 210	inode = d_backing_inode(dentry);
 211	rcu_read_lock();
 212	id.key.object = rcu_dereference(landlock_inode(inode)->object);
 213	rule = landlock_find_rule(domain, id);
 214	rcu_read_unlock();
 215	return rule;
 216}
 217
 218/*
 219 * Allows access to pseudo filesystems that will never be mountable (e.g.
 220 * sockfs, pipefs), but can still be reachable through
 221 * /proc/<pid>/fd/<file-descriptor>
 222 */
 223static bool is_nouser_or_private(const struct dentry *dentry)
 224{
 225	return (dentry->d_sb->s_flags & SB_NOUSER) ||
 226	       (d_is_positive(dentry) &&
 227		unlikely(IS_PRIVATE(d_backing_inode(dentry))));
 228}
 229
 230static access_mask_t
 231get_raw_handled_fs_accesses(const struct landlock_ruleset *const domain)
 232{
 233	access_mask_t access_dom = 0;
 234	size_t layer_level;
 235
 236	for (layer_level = 0; layer_level < domain->num_layers; layer_level++)
 237		access_dom |=
 238			landlock_get_raw_fs_access_mask(domain, layer_level);
 239	return access_dom;
 240}
 241
 242static access_mask_t
 243get_handled_fs_accesses(const struct landlock_ruleset *const domain)
 244{
 245	/* Handles all initially denied by default access rights. */
 246	return get_raw_handled_fs_accesses(domain) |
 247	       LANDLOCK_ACCESS_FS_INITIALLY_DENIED;
 248}
 249
 250static const struct landlock_ruleset *get_current_fs_domain(void)
 251{
 252	const struct landlock_ruleset *const dom =
 253		landlock_get_current_domain();
 254
 255	if (!dom || !get_raw_handled_fs_accesses(dom))
 256		return NULL;
 257
 258	return dom;
 259}
 260
 261/*
 262 * Check that a destination file hierarchy has more restrictions than a source
 263 * file hierarchy.  This is only used for link and rename actions.
 264 *
 265 * @layer_masks_child2: Optional child masks.
 266 */
 267static bool no_more_access(
 268	const layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS],
 269	const layer_mask_t (*const layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS],
 270	const bool child1_is_directory,
 271	const layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS],
 272	const layer_mask_t (*const layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS],
 273	const bool child2_is_directory)
 274{
 275	unsigned long access_bit;
 276
 277	for (access_bit = 0; access_bit < ARRAY_SIZE(*layer_masks_parent2);
 278	     access_bit++) {
 279		/* Ignores accesses that only make sense for directories. */
 280		const bool is_file_access =
 281			!!(BIT_ULL(access_bit) & ACCESS_FILE);
 282
 283		if (child1_is_directory || is_file_access) {
 284			/*
 285			 * Checks if the destination restrictions are a
 286			 * superset of the source ones (i.e. inherited access
 287			 * rights without child exceptions):
 288			 * restrictions(parent2) >= restrictions(child1)
 289			 */
 290			if ((((*layer_masks_parent1)[access_bit] &
 291			      (*layer_masks_child1)[access_bit]) |
 292			     (*layer_masks_parent2)[access_bit]) !=
 293			    (*layer_masks_parent2)[access_bit])
 294				return false;
 295		}
 296
 297		if (!layer_masks_child2)
 298			continue;
 299		if (child2_is_directory || is_file_access) {
 300			/*
 301			 * Checks inverted restrictions for RENAME_EXCHANGE:
 302			 * restrictions(parent1) >= restrictions(child2)
 303			 */
 304			if ((((*layer_masks_parent2)[access_bit] &
 305			      (*layer_masks_child2)[access_bit]) |
 306			     (*layer_masks_parent1)[access_bit]) !=
 307			    (*layer_masks_parent1)[access_bit])
 308				return false;
 309		}
 310	}
 311	return true;
 312}
 313
 314/*
 315 * Removes @layer_masks accesses that are not requested.
 316 *
 317 * Returns true if the request is allowed, false otherwise.
 318 */
 319static bool
 320scope_to_request(const access_mask_t access_request,
 321		 layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
 322{
 323	const unsigned long access_req = access_request;
 324	unsigned long access_bit;
 325
 326	if (WARN_ON_ONCE(!layer_masks))
 327		return true;
 328
 329	for_each_clear_bit(access_bit, &access_req, ARRAY_SIZE(*layer_masks))
 330		(*layer_masks)[access_bit] = 0;
 331	return !memchr_inv(layer_masks, 0, sizeof(*layer_masks));
 332}
 333
 334/*
 335 * Returns true if there is at least one access right different than
 336 * LANDLOCK_ACCESS_FS_REFER.
 337 */
 338static bool
 339is_eacces(const layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS],
 340	  const access_mask_t access_request)
 341{
 342	unsigned long access_bit;
 343	/* LANDLOCK_ACCESS_FS_REFER alone must return -EXDEV. */
 344	const unsigned long access_check = access_request &
 345					   ~LANDLOCK_ACCESS_FS_REFER;
 346
 347	if (!layer_masks)
 348		return false;
 349
 350	for_each_set_bit(access_bit, &access_check, ARRAY_SIZE(*layer_masks)) {
 351		if ((*layer_masks)[access_bit])
 352			return true;
 353	}
 354	return false;
 355}
 356
 357/**
 358 * is_access_to_paths_allowed - Check accesses for requests with a common path
 359 *
 360 * @domain: Domain to check against.
 361 * @path: File hierarchy to walk through.
 362 * @access_request_parent1: Accesses to check, once @layer_masks_parent1 is
 363 *     equal to @layer_masks_parent2 (if any).  This is tied to the unique
 364 *     requested path for most actions, or the source in case of a refer action
 365 *     (i.e. rename or link), or the source and destination in case of
 366 *     RENAME_EXCHANGE.
 367 * @layer_masks_parent1: Pointer to a matrix of layer masks per access
 368 *     masks, identifying the layers that forbid a specific access.  Bits from
 369 *     this matrix can be unset according to the @path walk.  An empty matrix
 370 *     means that @domain allows all possible Landlock accesses (i.e. not only
 371 *     those identified by @access_request_parent1).  This matrix can
 372 *     initially refer to domain layer masks and, when the accesses for the
 373 *     destination and source are the same, to requested layer masks.
 374 * @dentry_child1: Dentry to the initial child of the parent1 path.  This
 375 *     pointer must be NULL for non-refer actions (i.e. not link nor rename).
 376 * @access_request_parent2: Similar to @access_request_parent1 but for a
 377 *     request involving a source and a destination.  This refers to the
 378 *     destination, except in case of RENAME_EXCHANGE where it also refers to
 379 *     the source.  Must be set to 0 when using a simple path request.
 380 * @layer_masks_parent2: Similar to @layer_masks_parent1 but for a refer
 381 *     action.  This must be NULL otherwise.
 382 * @dentry_child2: Dentry to the initial child of the parent2 path.  This
 383 *     pointer is only set for RENAME_EXCHANGE actions and must be NULL
 384 *     otherwise.
 385 *
 386 * This helper first checks that the destination has a superset of restrictions
 387 * compared to the source (if any) for a common path.  Because of
 388 * RENAME_EXCHANGE actions, source and destinations may be swapped.  It then
 389 * checks that the collected accesses and the remaining ones are enough to
 390 * allow the request.
 391 *
 392 * Returns:
 393 * - true if the access request is granted;
 394 * - false otherwise.
 395 */
 396static bool is_access_to_paths_allowed(
 397	const struct landlock_ruleset *const domain,
 398	const struct path *const path,
 399	const access_mask_t access_request_parent1,
 400	layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS],
 401	const struct dentry *const dentry_child1,
 402	const access_mask_t access_request_parent2,
 403	layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS],
 404	const struct dentry *const dentry_child2)
 405{
 406	bool allowed_parent1 = false, allowed_parent2 = false, is_dom_check,
 407	     child1_is_directory = true, child2_is_directory = true;
 408	struct path walker_path;
 409	access_mask_t access_masked_parent1, access_masked_parent2;
 410	layer_mask_t _layer_masks_child1[LANDLOCK_NUM_ACCESS_FS],
 411		_layer_masks_child2[LANDLOCK_NUM_ACCESS_FS];
 412	layer_mask_t(*layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS] = NULL,
 413	(*layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS] = NULL;
 414
 415	if (!access_request_parent1 && !access_request_parent2)
 416		return true;
 417	if (WARN_ON_ONCE(!domain || !path))
 418		return true;
 419	if (is_nouser_or_private(path->dentry))
 420		return true;
 421	if (WARN_ON_ONCE(domain->num_layers < 1 || !layer_masks_parent1))
 422		return false;
 423
 424	if (unlikely(layer_masks_parent2)) {
 425		if (WARN_ON_ONCE(!dentry_child1))
 426			return false;
 427		/*
 428		 * For a double request, first check for potential privilege
 429		 * escalation by looking at domain handled accesses (which are
 430		 * a superset of the meaningful requested accesses).
 431		 */
 432		access_masked_parent1 = access_masked_parent2 =
 433			get_handled_fs_accesses(domain);
 434		is_dom_check = true;
 435	} else {
 436		if (WARN_ON_ONCE(dentry_child1 || dentry_child2))
 437			return false;
 438		/* For a simple request, only check for requested accesses. */
 439		access_masked_parent1 = access_request_parent1;
 440		access_masked_parent2 = access_request_parent2;
 441		is_dom_check = false;
 442	}
 443
 444	if (unlikely(dentry_child1)) {
 445		landlock_unmask_layers(
 446			find_rule(domain, dentry_child1),
 447			landlock_init_layer_masks(
 448				domain, LANDLOCK_MASK_ACCESS_FS,
 449				&_layer_masks_child1, LANDLOCK_KEY_INODE),
 450			&_layer_masks_child1, ARRAY_SIZE(_layer_masks_child1));
 451		layer_masks_child1 = &_layer_masks_child1;
 452		child1_is_directory = d_is_dir(dentry_child1);
 453	}
 454	if (unlikely(dentry_child2)) {
 455		landlock_unmask_layers(
 456			find_rule(domain, dentry_child2),
 457			landlock_init_layer_masks(
 458				domain, LANDLOCK_MASK_ACCESS_FS,
 459				&_layer_masks_child2, LANDLOCK_KEY_INODE),
 460			&_layer_masks_child2, ARRAY_SIZE(_layer_masks_child2));
 461		layer_masks_child2 = &_layer_masks_child2;
 462		child2_is_directory = d_is_dir(dentry_child2);
 463	}
 464
 465	walker_path = *path;
 466	path_get(&walker_path);
 467	/*
 468	 * We need to walk through all the hierarchy to not miss any relevant
 469	 * restriction.
 470	 */
 471	while (true) {
 472		struct dentry *parent_dentry;
 473		const struct landlock_rule *rule;
 474
 475		/*
 476		 * If at least all accesses allowed on the destination are
 477		 * already allowed on the source, respectively if there is at
 478		 * least as much as restrictions on the destination than on the
 479		 * source, then we can safely refer files from the source to
 480		 * the destination without risking a privilege escalation.
 481		 * This also applies in the case of RENAME_EXCHANGE, which
 482		 * implies checks on both direction.  This is crucial for
 483		 * standalone multilayered security policies.  Furthermore,
 484		 * this helps avoid policy writers to shoot themselves in the
 485		 * foot.
 486		 */
 487		if (unlikely(is_dom_check &&
 488			     no_more_access(
 489				     layer_masks_parent1, layer_masks_child1,
 490				     child1_is_directory, layer_masks_parent2,
 491				     layer_masks_child2,
 492				     child2_is_directory))) {
 493			allowed_parent1 = scope_to_request(
 494				access_request_parent1, layer_masks_parent1);
 495			allowed_parent2 = scope_to_request(
 496				access_request_parent2, layer_masks_parent2);
 497
 498			/* Stops when all accesses are granted. */
 499			if (allowed_parent1 && allowed_parent2)
 500				break;
 501
 502			/*
 503			 * Now, downgrades the remaining checks from domain
 504			 * handled accesses to requested accesses.
 505			 */
 506			is_dom_check = false;
 507			access_masked_parent1 = access_request_parent1;
 508			access_masked_parent2 = access_request_parent2;
 509		}
 510
 511		rule = find_rule(domain, walker_path.dentry);
 512		allowed_parent1 = landlock_unmask_layers(
 513			rule, access_masked_parent1, layer_masks_parent1,
 514			ARRAY_SIZE(*layer_masks_parent1));
 515		allowed_parent2 = landlock_unmask_layers(
 516			rule, access_masked_parent2, layer_masks_parent2,
 517			ARRAY_SIZE(*layer_masks_parent2));
 518
 519		/* Stops when a rule from each layer grants access. */
 520		if (allowed_parent1 && allowed_parent2)
 521			break;
 522jump_up:
 523		if (walker_path.dentry == walker_path.mnt->mnt_root) {
 524			if (follow_up(&walker_path)) {
 525				/* Ignores hidden mount points. */
 526				goto jump_up;
 527			} else {
 528				/*
 529				 * Stops at the real root.  Denies access
 530				 * because not all layers have granted access.
 531				 */
 532				break;
 533			}
 534		}
 535		if (unlikely(IS_ROOT(walker_path.dentry))) {
 536			/*
 537			 * Stops at disconnected root directories.  Only allows
 538			 * access to internal filesystems (e.g. nsfs, which is
 539			 * reachable through /proc/<pid>/ns/<namespace>).
 540			 */
 541			allowed_parent1 = allowed_parent2 =
 542				!!(walker_path.mnt->mnt_flags & MNT_INTERNAL);
 543			break;
 544		}
 545		parent_dentry = dget_parent(walker_path.dentry);
 546		dput(walker_path.dentry);
 547		walker_path.dentry = parent_dentry;
 548	}
 549	path_put(&walker_path);
 550
 551	return allowed_parent1 && allowed_parent2;
 552}
 553
 554static int check_access_path(const struct landlock_ruleset *const domain,
 555			     const struct path *const path,
 556			     access_mask_t access_request)
 557{
 558	layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
 559
 560	access_request = landlock_init_layer_masks(
 561		domain, access_request, &layer_masks, LANDLOCK_KEY_INODE);
 562	if (is_access_to_paths_allowed(domain, path, access_request,
 563				       &layer_masks, NULL, 0, NULL, NULL))
 564		return 0;
 565	return -EACCES;
 566}
 567
 568static int current_check_access_path(const struct path *const path,
 569				     const access_mask_t access_request)
 570{
 571	const struct landlock_ruleset *const dom = get_current_fs_domain();
 572
 573	if (!dom)
 574		return 0;
 575	return check_access_path(dom, path, access_request);
 576}
 577
 578static access_mask_t get_mode_access(const umode_t mode)
 579{
 580	switch (mode & S_IFMT) {
 581	case S_IFLNK:
 582		return LANDLOCK_ACCESS_FS_MAKE_SYM;
 583	case 0:
 584		/* A zero mode translates to S_IFREG. */
 585	case S_IFREG:
 586		return LANDLOCK_ACCESS_FS_MAKE_REG;
 587	case S_IFDIR:
 588		return LANDLOCK_ACCESS_FS_MAKE_DIR;
 589	case S_IFCHR:
 590		return LANDLOCK_ACCESS_FS_MAKE_CHAR;
 591	case S_IFBLK:
 592		return LANDLOCK_ACCESS_FS_MAKE_BLOCK;
 593	case S_IFIFO:
 594		return LANDLOCK_ACCESS_FS_MAKE_FIFO;
 595	case S_IFSOCK:
 596		return LANDLOCK_ACCESS_FS_MAKE_SOCK;
 597	default:
 598		WARN_ON_ONCE(1);
 599		return 0;
 600	}
 601}
 602
 603static access_mask_t maybe_remove(const struct dentry *const dentry)
 604{
 605	if (d_is_negative(dentry))
 606		return 0;
 607	return d_is_dir(dentry) ? LANDLOCK_ACCESS_FS_REMOVE_DIR :
 608				  LANDLOCK_ACCESS_FS_REMOVE_FILE;
 609}
 610
 611/**
 612 * collect_domain_accesses - Walk through a file path and collect accesses
 613 *
 614 * @domain: Domain to check against.
 615 * @mnt_root: Last directory to check.
 616 * @dir: Directory to start the walk from.
 617 * @layer_masks_dom: Where to store the collected accesses.
 618 *
 619 * This helper is useful to begin a path walk from the @dir directory to a
 620 * @mnt_root directory used as a mount point.  This mount point is the common
 621 * ancestor between the source and the destination of a renamed and linked
 622 * file.  While walking from @dir to @mnt_root, we record all the domain's
 623 * allowed accesses in @layer_masks_dom.
 624 *
 625 * This is similar to is_access_to_paths_allowed() but much simpler because it
 626 * only handles walking on the same mount point and only checks one set of
 627 * accesses.
 628 *
 629 * Returns:
 630 * - true if all the domain access rights are allowed for @dir;
 631 * - false if the walk reached @mnt_root.
 632 */
 633static bool collect_domain_accesses(
 634	const struct landlock_ruleset *const domain,
 635	const struct dentry *const mnt_root, struct dentry *dir,
 636	layer_mask_t (*const layer_masks_dom)[LANDLOCK_NUM_ACCESS_FS])
 637{
 638	unsigned long access_dom;
 639	bool ret = false;
 640
 641	if (WARN_ON_ONCE(!domain || !mnt_root || !dir || !layer_masks_dom))
 642		return true;
 643	if (is_nouser_or_private(dir))
 644		return true;
 645
 646	access_dom = landlock_init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
 647					       layer_masks_dom,
 648					       LANDLOCK_KEY_INODE);
 649
 650	dget(dir);
 651	while (true) {
 652		struct dentry *parent_dentry;
 653
 654		/* Gets all layers allowing all domain accesses. */
 655		if (landlock_unmask_layers(find_rule(domain, dir), access_dom,
 656					   layer_masks_dom,
 657					   ARRAY_SIZE(*layer_masks_dom))) {
 658			/*
 659			 * Stops when all handled accesses are allowed by at
 660			 * least one rule in each layer.
 661			 */
 662			ret = true;
 663			break;
 664		}
 665
 666		/* We should not reach a root other than @mnt_root. */
 667		if (dir == mnt_root || WARN_ON_ONCE(IS_ROOT(dir)))
 668			break;
 669
 670		parent_dentry = dget_parent(dir);
 671		dput(dir);
 672		dir = parent_dentry;
 673	}
 674	dput(dir);
 675	return ret;
 676}
 677
 678/**
 679 * current_check_refer_path - Check if a rename or link action is allowed
 680 *
 681 * @old_dentry: File or directory requested to be moved or linked.
 682 * @new_dir: Destination parent directory.
 683 * @new_dentry: Destination file or directory.
 684 * @removable: Sets to true if it is a rename operation.
 685 * @exchange: Sets to true if it is a rename operation with RENAME_EXCHANGE.
 686 *
 687 * Because of its unprivileged constraints, Landlock relies on file hierarchies
 688 * (and not only inodes) to tie access rights to files.  Being able to link or
 689 * rename a file hierarchy brings some challenges.  Indeed, moving or linking a
 690 * file (i.e. creating a new reference to an inode) can have an impact on the
 691 * actions allowed for a set of files if it would change its parent directory
 692 * (i.e. reparenting).
 693 *
 694 * To avoid trivial access right bypasses, Landlock first checks if the file or
 695 * directory requested to be moved would gain new access rights inherited from
 696 * its new hierarchy.  Before returning any error, Landlock then checks that
 697 * the parent source hierarchy and the destination hierarchy would allow the
 698 * link or rename action.  If it is not the case, an error with EACCES is
 699 * returned to inform user space that there is no way to remove or create the
 700 * requested source file type.  If it should be allowed but the new inherited
 701 * access rights would be greater than the source access rights, then the
 702 * kernel returns an error with EXDEV.  Prioritizing EACCES over EXDEV enables
 703 * user space to abort the whole operation if there is no way to do it, or to
 704 * manually copy the source to the destination if this remains allowed, e.g.
 705 * because file creation is allowed on the destination directory but not direct
 706 * linking.
 707 *
 708 * To achieve this goal, the kernel needs to compare two file hierarchies: the
 709 * one identifying the source file or directory (including itself), and the
 710 * destination one.  This can be seen as a multilayer partial ordering problem.
 711 * The kernel walks through these paths and collects in a matrix the access
 712 * rights that are denied per layer.  These matrices are then compared to see
 713 * if the destination one has more (or the same) restrictions as the source
 714 * one.  If this is the case, the requested action will not return EXDEV, which
 715 * doesn't mean the action is allowed.  The parent hierarchy of the source
 716 * (i.e. parent directory), and the destination hierarchy must also be checked
 717 * to verify that they explicitly allow such action (i.e.  referencing,
 718 * creation and potentially removal rights).  The kernel implementation is then
 719 * required to rely on potentially four matrices of access rights: one for the
 720 * source file or directory (i.e. the child), a potentially other one for the
 721 * other source/destination (in case of RENAME_EXCHANGE), one for the source
 722 * parent hierarchy and a last one for the destination hierarchy.  These
 723 * ephemeral matrices take some space on the stack, which limits the number of
 724 * layers to a deemed reasonable number: 16.
 725 *
 726 * Returns:
 727 * - 0 if access is allowed;
 728 * - -EXDEV if @old_dentry would inherit new access rights from @new_dir;
 729 * - -EACCES if file removal or creation is denied.
 730 */
 731static int current_check_refer_path(struct dentry *const old_dentry,
 732				    const struct path *const new_dir,
 733				    struct dentry *const new_dentry,
 734				    const bool removable, const bool exchange)
 735{
 736	const struct landlock_ruleset *const dom = get_current_fs_domain();
 737	bool allow_parent1, allow_parent2;
 738	access_mask_t access_request_parent1, access_request_parent2;
 739	struct path mnt_dir;
 740	layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS] = {},
 741		     layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS] = {};
 742
 743	if (!dom)
 744		return 0;
 745	if (WARN_ON_ONCE(dom->num_layers < 1))
 746		return -EACCES;
 747	if (unlikely(d_is_negative(old_dentry)))
 748		return -ENOENT;
 749	if (exchange) {
 750		if (unlikely(d_is_negative(new_dentry)))
 751			return -ENOENT;
 752		access_request_parent1 =
 753			get_mode_access(d_backing_inode(new_dentry)->i_mode);
 754	} else {
 755		access_request_parent1 = 0;
 756	}
 757	access_request_parent2 =
 758		get_mode_access(d_backing_inode(old_dentry)->i_mode);
 759	if (removable) {
 760		access_request_parent1 |= maybe_remove(old_dentry);
 761		access_request_parent2 |= maybe_remove(new_dentry);
 762	}
 763
 764	/* The mount points are the same for old and new paths, cf. EXDEV. */
 765	if (old_dentry->d_parent == new_dir->dentry) {
 766		/*
 767		 * The LANDLOCK_ACCESS_FS_REFER access right is not required
 768		 * for same-directory referer (i.e. no reparenting).
 769		 */
 770		access_request_parent1 = landlock_init_layer_masks(
 771			dom, access_request_parent1 | access_request_parent2,
 772			&layer_masks_parent1, LANDLOCK_KEY_INODE);
 773		if (is_access_to_paths_allowed(
 774			    dom, new_dir, access_request_parent1,
 775			    &layer_masks_parent1, NULL, 0, NULL, NULL))
 776			return 0;
 777		return -EACCES;
 778	}
 779
 780	access_request_parent1 |= LANDLOCK_ACCESS_FS_REFER;
 781	access_request_parent2 |= LANDLOCK_ACCESS_FS_REFER;
 782
 783	/* Saves the common mount point. */
 784	mnt_dir.mnt = new_dir->mnt;
 785	mnt_dir.dentry = new_dir->mnt->mnt_root;
 786
 787	/* new_dir->dentry is equal to new_dentry->d_parent */
 788	allow_parent1 = collect_domain_accesses(dom, mnt_dir.dentry,
 789						old_dentry->d_parent,
 790						&layer_masks_parent1);
 791	allow_parent2 = collect_domain_accesses(
 792		dom, mnt_dir.dentry, new_dir->dentry, &layer_masks_parent2);
 793
 794	if (allow_parent1 && allow_parent2)
 795		return 0;
 796
 797	/*
 798	 * To be able to compare source and destination domain access rights,
 799	 * take into account the @old_dentry access rights aggregated with its
 800	 * parent access rights.  This will be useful to compare with the
 801	 * destination parent access rights.
 802	 */
 803	if (is_access_to_paths_allowed(
 804		    dom, &mnt_dir, access_request_parent1, &layer_masks_parent1,
 805		    old_dentry, access_request_parent2, &layer_masks_parent2,
 806		    exchange ? new_dentry : NULL))
 807		return 0;
 808
 809	/*
 810	 * This prioritizes EACCES over EXDEV for all actions, including
 811	 * renames with RENAME_EXCHANGE.
 812	 */
 813	if (likely(is_eacces(&layer_masks_parent1, access_request_parent1) ||
 814		   is_eacces(&layer_masks_parent2, access_request_parent2)))
 815		return -EACCES;
 816
 817	/*
 818	 * Gracefully forbids reparenting if the destination directory
 819	 * hierarchy is not a superset of restrictions of the source directory
 820	 * hierarchy, or if LANDLOCK_ACCESS_FS_REFER is not allowed by the
 821	 * source or the destination.
 822	 */
 823	return -EXDEV;
 824}
 825
 826/* Inode hooks */
 827
 828static void hook_inode_free_security(struct inode *const inode)
 829{
 830	/*
 831	 * All inodes must already have been untied from their object by
 832	 * release_inode() or hook_sb_delete().
 833	 */
 834	WARN_ON_ONCE(landlock_inode(inode)->object);
 835}
 836
 837/* Super-block hooks */
 838
 839/*
 840 * Release the inodes used in a security policy.
 841 *
 842 * Cf. fsnotify_unmount_inodes() and invalidate_inodes()
 843 */
 844static void hook_sb_delete(struct super_block *const sb)
 845{
 846	struct inode *inode, *prev_inode = NULL;
 847
 848	if (!landlock_initialized)
 849		return;
 850
 851	spin_lock(&sb->s_inode_list_lock);
 852	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 853		struct landlock_object *object;
 854
 855		/* Only handles referenced inodes. */
 856		if (!atomic_read(&inode->i_count))
 857			continue;
 858
 859		/*
 860		 * Protects against concurrent modification of inode (e.g.
 861		 * from get_inode_object()).
 862		 */
 863		spin_lock(&inode->i_lock);
 864		/*
 865		 * Checks I_FREEING and I_WILL_FREE  to protect against a race
 866		 * condition when release_inode() just called iput(), which
 867		 * could lead to a NULL dereference of inode->security or a
 868		 * second call to iput() for the same Landlock object.  Also
 869		 * checks I_NEW because such inode cannot be tied to an object.
 870		 */
 871		if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
 872			spin_unlock(&inode->i_lock);
 873			continue;
 874		}
 875
 876		rcu_read_lock();
 877		object = rcu_dereference(landlock_inode(inode)->object);
 878		if (!object) {
 879			rcu_read_unlock();
 880			spin_unlock(&inode->i_lock);
 881			continue;
 882		}
 883		/* Keeps a reference to this inode until the next loop walk. */
 884		__iget(inode);
 885		spin_unlock(&inode->i_lock);
 886
 887		/*
 888		 * If there is no concurrent release_inode() ongoing, then we
 889		 * are in charge of calling iput() on this inode, otherwise we
 890		 * will just wait for it to finish.
 891		 */
 892		spin_lock(&object->lock);
 893		if (object->underobj == inode) {
 894			object->underobj = NULL;
 895			spin_unlock(&object->lock);
 896			rcu_read_unlock();
 897
 898			/*
 899			 * Because object->underobj was not NULL,
 900			 * release_inode() and get_inode_object() guarantee
 901			 * that it is safe to reset
 902			 * landlock_inode(inode)->object while it is not NULL.
 903			 * It is therefore not necessary to lock inode->i_lock.
 904			 */
 905			rcu_assign_pointer(landlock_inode(inode)->object, NULL);
 906			/*
 907			 * At this point, we own the ihold() reference that was
 908			 * originally set up by get_inode_object() and the
 909			 * __iget() reference that we just set in this loop
 910			 * walk.  Therefore the following call to iput() will
 911			 * not sleep nor drop the inode because there is now at
 912			 * least two references to it.
 913			 */
 914			iput(inode);
 915		} else {
 916			spin_unlock(&object->lock);
 917			rcu_read_unlock();
 918		}
 919
 920		if (prev_inode) {
 921			/*
 922			 * At this point, we still own the __iget() reference
 923			 * that we just set in this loop walk.  Therefore we
 924			 * can drop the list lock and know that the inode won't
 925			 * disappear from under us until the next loop walk.
 926			 */
 927			spin_unlock(&sb->s_inode_list_lock);
 928			/*
 929			 * We can now actually put the inode reference from the
 930			 * previous loop walk, which is not needed anymore.
 931			 */
 932			iput(prev_inode);
 933			cond_resched();
 934			spin_lock(&sb->s_inode_list_lock);
 935		}
 936		prev_inode = inode;
 937	}
 938	spin_unlock(&sb->s_inode_list_lock);
 939
 940	/* Puts the inode reference from the last loop walk, if any. */
 941	if (prev_inode)
 942		iput(prev_inode);
 943	/* Waits for pending iput() in release_inode(). */
 944	wait_var_event(&landlock_superblock(sb)->inode_refs,
 945		       !atomic_long_read(&landlock_superblock(sb)->inode_refs));
 946}
 947
 948/*
 949 * Because a Landlock security policy is defined according to the filesystem
 950 * topology (i.e. the mount namespace), changing it may grant access to files
 951 * not previously allowed.
 952 *
 953 * To make it simple, deny any filesystem topology modification by landlocked
 954 * processes.  Non-landlocked processes may still change the namespace of a
 955 * landlocked process, but this kind of threat must be handled by a system-wide
 956 * access-control security policy.
 957 *
 958 * This could be lifted in the future if Landlock can safely handle mount
 959 * namespace updates requested by a landlocked process.  Indeed, we could
 960 * update the current domain (which is currently read-only) by taking into
 961 * account the accesses of the source and the destination of a new mount point.
 962 * However, it would also require to make all the child domains dynamically
 963 * inherit these new constraints.  Anyway, for backward compatibility reasons,
 964 * a dedicated user space option would be required (e.g. as a ruleset flag).
 965 */
 966static int hook_sb_mount(const char *const dev_name,
 967			 const struct path *const path, const char *const type,
 968			 const unsigned long flags, void *const data)
 969{
 970	if (!get_current_fs_domain())
 971		return 0;
 972	return -EPERM;
 973}
 974
 975static int hook_move_mount(const struct path *const from_path,
 976			   const struct path *const to_path)
 977{
 978	if (!get_current_fs_domain())
 979		return 0;
 980	return -EPERM;
 981}
 982
 983/*
 984 * Removing a mount point may reveal a previously hidden file hierarchy, which
 985 * may then grant access to files, which may have previously been forbidden.
 986 */
 987static int hook_sb_umount(struct vfsmount *const mnt, const int flags)
 988{
 989	if (!get_current_fs_domain())
 990		return 0;
 991	return -EPERM;
 992}
 993
 994static int hook_sb_remount(struct super_block *const sb, void *const mnt_opts)
 995{
 996	if (!get_current_fs_domain())
 997		return 0;
 998	return -EPERM;
 999}
1000
1001/*
1002 * pivot_root(2), like mount(2), changes the current mount namespace.  It must
1003 * then be forbidden for a landlocked process.
1004 *
1005 * However, chroot(2) may be allowed because it only changes the relative root
1006 * directory of the current process.  Moreover, it can be used to restrict the
1007 * view of the filesystem.
1008 */
1009static int hook_sb_pivotroot(const struct path *const old_path,
1010			     const struct path *const new_path)
1011{
1012	if (!get_current_fs_domain())
1013		return 0;
1014	return -EPERM;
1015}
1016
1017/* Path hooks */
1018
1019static int hook_path_link(struct dentry *const old_dentry,
1020			  const struct path *const new_dir,
1021			  struct dentry *const new_dentry)
1022{
1023	return current_check_refer_path(old_dentry, new_dir, new_dentry, false,
1024					false);
1025}
1026
1027static int hook_path_rename(const struct path *const old_dir,
1028			    struct dentry *const old_dentry,
1029			    const struct path *const new_dir,
1030			    struct dentry *const new_dentry,
1031			    const unsigned int flags)
1032{
1033	/* old_dir refers to old_dentry->d_parent and new_dir->mnt */
1034	return current_check_refer_path(old_dentry, new_dir, new_dentry, true,
1035					!!(flags & RENAME_EXCHANGE));
1036}
1037
1038static int hook_path_mkdir(const struct path *const dir,
1039			   struct dentry *const dentry, const umode_t mode)
1040{
1041	return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_DIR);
1042}
1043
1044static int hook_path_mknod(const struct path *const dir,
1045			   struct dentry *const dentry, const umode_t mode,
1046			   const unsigned int dev)
1047{
1048	const struct landlock_ruleset *const dom = get_current_fs_domain();
1049
1050	if (!dom)
1051		return 0;
1052	return check_access_path(dom, dir, get_mode_access(mode));
1053}
1054
1055static int hook_path_symlink(const struct path *const dir,
1056			     struct dentry *const dentry,
1057			     const char *const old_name)
1058{
1059	return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_SYM);
1060}
1061
1062static int hook_path_unlink(const struct path *const dir,
1063			    struct dentry *const dentry)
1064{
1065	return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_FILE);
1066}
1067
1068static int hook_path_rmdir(const struct path *const dir,
1069			   struct dentry *const dentry)
1070{
1071	return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_DIR);
1072}
1073
1074static int hook_path_truncate(const struct path *const path)
1075{
1076	return current_check_access_path(path, LANDLOCK_ACCESS_FS_TRUNCATE);
1077}
1078
1079/* File hooks */
1080
1081/**
1082 * get_required_file_open_access - Get access needed to open a file
1083 *
1084 * @file: File being opened.
1085 *
1086 * Returns the access rights that are required for opening the given file,
1087 * depending on the file type and open mode.
1088 */
1089static access_mask_t
1090get_required_file_open_access(const struct file *const file)
1091{
1092	access_mask_t access = 0;
1093
1094	if (file->f_mode & FMODE_READ) {
1095		/* A directory can only be opened in read mode. */
1096		if (S_ISDIR(file_inode(file)->i_mode))
1097			return LANDLOCK_ACCESS_FS_READ_DIR;
1098		access = LANDLOCK_ACCESS_FS_READ_FILE;
1099	}
1100	if (file->f_mode & FMODE_WRITE)
1101		access |= LANDLOCK_ACCESS_FS_WRITE_FILE;
1102	/* __FMODE_EXEC is indeed part of f_flags, not f_mode. */
1103	if (file->f_flags & __FMODE_EXEC)
1104		access |= LANDLOCK_ACCESS_FS_EXECUTE;
1105	return access;
1106}
1107
1108static int hook_file_alloc_security(struct file *const file)
1109{
1110	/*
1111	 * Grants all access rights, even if most of them are not checked later
1112	 * on. It is more consistent.
1113	 *
1114	 * Notably, file descriptors for regular files can also be acquired
1115	 * without going through the file_open hook, for example when using
1116	 * memfd_create(2).
1117	 */
1118	landlock_file(file)->allowed_access = LANDLOCK_MASK_ACCESS_FS;
1119	return 0;
1120}
1121
1122static int hook_file_open(struct file *const file)
1123{
1124	layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
1125	access_mask_t open_access_request, full_access_request, allowed_access;
1126	const access_mask_t optional_access = LANDLOCK_ACCESS_FS_TRUNCATE;
1127	const struct landlock_ruleset *const dom = get_current_fs_domain();
1128
1129	if (!dom)
1130		return 0;
1131
1132	/*
1133	 * Because a file may be opened with O_PATH, get_required_file_open_access()
1134	 * may return 0.  This case will be handled with a future Landlock
1135	 * evolution.
1136	 */
1137	open_access_request = get_required_file_open_access(file);
1138
1139	/*
1140	 * We look up more access than what we immediately need for open(), so
1141	 * that we can later authorize operations on opened files.
1142	 */
1143	full_access_request = open_access_request | optional_access;
1144
1145	if (is_access_to_paths_allowed(
1146		    dom, &file->f_path,
1147		    landlock_init_layer_masks(dom, full_access_request,
1148					      &layer_masks, LANDLOCK_KEY_INODE),
1149		    &layer_masks, NULL, 0, NULL, NULL)) {
1150		allowed_access = full_access_request;
1151	} else {
1152		unsigned long access_bit;
1153		const unsigned long access_req = full_access_request;
1154
1155		/*
1156		 * Calculate the actual allowed access rights from layer_masks.
1157		 * Add each access right to allowed_access which has not been
1158		 * vetoed by any layer.
1159		 */
1160		allowed_access = 0;
1161		for_each_set_bit(access_bit, &access_req,
1162				 ARRAY_SIZE(layer_masks)) {
1163			if (!layer_masks[access_bit])
1164				allowed_access |= BIT_ULL(access_bit);
1165		}
1166	}
1167
1168	/*
1169	 * For operations on already opened files (i.e. ftruncate()), it is the
1170	 * access rights at the time of open() which decide whether the
1171	 * operation is permitted. Therefore, we record the relevant subset of
1172	 * file access rights in the opened struct file.
1173	 */
1174	landlock_file(file)->allowed_access = allowed_access;
1175
1176	if ((open_access_request & allowed_access) == open_access_request)
1177		return 0;
1178
1179	return -EACCES;
1180}
1181
1182static int hook_file_truncate(struct file *const file)
1183{
1184	/*
1185	 * Allows truncation if the truncate right was available at the time of
1186	 * opening the file, to get a consistent access check as for read, write
1187	 * and execute operations.
1188	 *
1189	 * Note: For checks done based on the file's Landlock allowed access, we
1190	 * enforce them independently of whether the current thread is in a
1191	 * Landlock domain, so that open files passed between independent
1192	 * processes retain their behaviour.
1193	 */
1194	if (landlock_file(file)->allowed_access & LANDLOCK_ACCESS_FS_TRUNCATE)
1195		return 0;
1196	return -EACCES;
1197}
1198
1199static struct security_hook_list landlock_hooks[] __ro_after_init = {
1200	LSM_HOOK_INIT(inode_free_security, hook_inode_free_security),
1201
1202	LSM_HOOK_INIT(sb_delete, hook_sb_delete),
1203	LSM_HOOK_INIT(sb_mount, hook_sb_mount),
1204	LSM_HOOK_INIT(move_mount, hook_move_mount),
1205	LSM_HOOK_INIT(sb_umount, hook_sb_umount),
1206	LSM_HOOK_INIT(sb_remount, hook_sb_remount),
1207	LSM_HOOK_INIT(sb_pivotroot, hook_sb_pivotroot),
1208
1209	LSM_HOOK_INIT(path_link, hook_path_link),
1210	LSM_HOOK_INIT(path_rename, hook_path_rename),
1211	LSM_HOOK_INIT(path_mkdir, hook_path_mkdir),
1212	LSM_HOOK_INIT(path_mknod, hook_path_mknod),
1213	LSM_HOOK_INIT(path_symlink, hook_path_symlink),
1214	LSM_HOOK_INIT(path_unlink, hook_path_unlink),
1215	LSM_HOOK_INIT(path_rmdir, hook_path_rmdir),
1216	LSM_HOOK_INIT(path_truncate, hook_path_truncate),
1217
1218	LSM_HOOK_INIT(file_alloc_security, hook_file_alloc_security),
1219	LSM_HOOK_INIT(file_open, hook_file_open),
1220	LSM_HOOK_INIT(file_truncate, hook_file_truncate),
1221};
1222
1223__init void landlock_add_fs_hooks(void)
1224{
1225	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
1226			   &landlock_lsmid);
1227}