// SPDX-License-Identifier: GPL-2.0-or-later
/* Network filesystem high-level buffered read support.
 *
 * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/*
 * Unlock the folios in a read operation. We need to set PG_fscache on any
 * folios we're going to write back before we unlock them.
 */
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
        struct netfs_io_subrequest *subreq;
        struct netfs_folio *finfo;
        struct folio *folio;
        pgoff_t start_page = rreq->start / PAGE_SIZE;
        pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
        size_t account = 0;
        bool subreq_failed = false;

        XA_STATE(xas, &rreq->mapping->i_pages, start_page);

        if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
                __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
                list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
                        __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
                }
        }

        /* Walk through the pagecache and the I/O request lists simultaneously.
         * We may have a mixture of cached and uncached sections and we only
         * really want to write out the uncached sections. This is slightly
         * complicated by the possibility that we might have huge pages with a
         * mixture inside.
         */
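        /* For instance (illustrative numbers only): a 16KiB folio at file
         * offset 0 may be backed by one 8KiB subrequest served from the cache
         * and a second 8KiB subrequest fetched from the server; the loop below
         * keeps advancing through the subrequest list until the subrequest
         * covering the end of the current folio has been consumed.
         */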
        subreq = list_first_entry(&rreq->subrequests,
                                  struct netfs_io_subrequest, rreq_link);
        subreq_failed = (subreq->error < 0);

        trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);

        rcu_read_lock();
        xas_for_each(&xas, folio, last_page) {
                loff_t pg_end;
                bool pg_failed = false;
                bool folio_started;

                if (xas_retry(&xas, folio))
                        continue;

                pg_end = folio_pos(folio) + folio_size(folio) - 1;

                folio_started = false;
                for (;;) {
                        loff_t sreq_end;

                        if (!subreq) {
                                pg_failed = true;
                                break;
                        }
                        if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
                                trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
                                folio_start_fscache(folio);
                                folio_started = true;
                        }
                        pg_failed |= subreq_failed;
                        sreq_end = subreq->start + subreq->len - 1;
                        if (pg_end < sreq_end)
                                break;

                        account += subreq->transferred;
                        if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
                                subreq = list_next_entry(subreq, rreq_link);
                                subreq_failed = (subreq->error < 0);
                        } else {
                                subreq = NULL;
                                subreq_failed = false;
                        }

                        if (pg_end == sreq_end)
                                break;
                }

                if (!pg_failed) {
                        flush_dcache_folio(folio);
                        finfo = netfs_folio_info(folio);
                        if (finfo) {
                                trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
                                if (finfo->netfs_group)
                                        folio_change_private(folio, finfo->netfs_group);
                                else
                                        folio_detach_private(folio);
                                kfree(finfo);
                        }
                        folio_mark_uptodate(folio);
                }

                if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
                        if (folio->index == rreq->no_unlock_folio &&
                            test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
                                _debug("no unlock");
                        else
                                folio_unlock(folio);
                }
        }
        rcu_read_unlock();

        task_io_account_read(account);
        if (rreq->netfs_ops->done)
                rreq->netfs_ops->done(rreq);
}

static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
                                         loff_t *_start, size_t *_len, loff_t i_size)
{
        struct netfs_cache_resources *cres = &rreq->cache_resources;

        if (cres->ops && cres->ops->expand_readahead)
                cres->ops->expand_readahead(cres, _start, _len, i_size);
}

static void netfs_rreq_expand(struct netfs_io_request *rreq,
                              struct readahead_control *ractl)
{
        /* Give the cache a chance to change the request parameters. The
         * resultant request must contain the original region.
         */
        netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);

        /* Give the netfs a chance to change the request parameters. The
         * resultant request must contain the original region.
         */
        if (rreq->netfs_ops->expand_readahead)
                rreq->netfs_ops->expand_readahead(rreq);

        /* Expand the request if the cache wants it to start earlier. Note
         * that the expansion may get further extended if the VM wishes to
         * insert THPs and the preferred start and/or end wind up in the middle
         * of THPs.
         *
         * If this is the case, however, the THP size should be an integer
         * multiple of the cache granule size, so we get a whole number of
         * granules to deal with.
         */
        if (rreq->start != readahead_pos(ractl) ||
            rreq->len != readahead_length(ractl)) {
                readahead_expand(ractl, rreq->start, rreq->len);
                rreq->start = readahead_pos(ractl);
                rreq->len = readahead_length(ractl);

                trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
                                 netfs_read_trace_expanded);
        }
}
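
/*
 * Worked example (illustrative figures only): if the cache stores data in,
 * say, 256KiB granules and the VM asks to read 16KiB at file offset 260KiB,
 * the cache may widen the request to cover the whole granule [256KiB, 512KiB)
 * so that whatever is downloaded can also be stored; readahead_expand() is
 * then asked to grow the readahead window to match the widened request.
 */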

/*
 * Begin an operation, and fetch the stored zero point value from the cookie if
 * available.
 */
static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
{
        return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
}

/**
 * netfs_readahead - Helper to manage a read request
 * @ractl: The description of the readahead request
 *
 * Fulfil a readahead request by drawing data from the cache if possible, or
 * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O
 * requests from different sources will get munged together. If necessary, the
 * readahead window can be expanded in either direction to a more convenient
 * alignment for RPC efficiency or to make storage in the cache feasible.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
void netfs_readahead(struct readahead_control *ractl)
{
        struct netfs_io_request *rreq;
        struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
        int ret;

        _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));

        if (readahead_count(ractl) == 0)
                return;

        rreq = netfs_alloc_request(ractl->mapping, ractl->file,
                                   readahead_pos(ractl),
                                   readahead_length(ractl),
                                   NETFS_READAHEAD);
        if (IS_ERR(rreq))
                return;

        ret = netfs_begin_cache_read(rreq, ctx);
        if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
                goto cleanup_free;

        netfs_stat(&netfs_n_rh_readahead);
        trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
                         netfs_read_trace_readahead);

        netfs_rreq_expand(rreq, ractl);

        /* Set up the output buffer */
        iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
                        rreq->start, rreq->len);

        /* Drop the refs on the folios here rather than in the cache or
         * filesystem. The locks will be dropped in netfs_rreq_unlock_folios().
         */
        while (readahead_folio(ractl))
                ;

        netfs_begin_read(rreq, false);
        netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
        return;

cleanup_free:
        netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
        return;
}
EXPORT_SYMBOL(netfs_readahead);
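
/*
 * Illustrative sketch (not part of the upstream file): a network filesystem
 * would normally plug the read helpers above into its address_space
 * operations roughly as below. The "myfs" name is hypothetical and the set
 * of operations is abridged; the exact table varies per filesystem.
 */
static const struct address_space_operations myfs_file_aops_example = {
        .read_folio     = netfs_read_folio,
        .readahead      = netfs_readahead,
        .dirty_folio    = filemap_dirty_folio,
};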

/**
 * netfs_read_folio - Helper to manage a read_folio request
 * @file: The file to read from
 * @folio: The folio to read
 *
 * Fulfil a read_folio request by drawing data from the cache if
 * possible, or the netfs if not. Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_read_folio(struct file *file, struct folio *folio)
{
        struct address_space *mapping = folio->mapping;
        struct netfs_io_request *rreq;
        struct netfs_inode *ctx = netfs_inode(mapping->host);
        struct folio *sink = NULL;
        int ret;

        _enter("%lx", folio->index);

        rreq = netfs_alloc_request(mapping, file,
                                   folio_file_pos(folio), folio_size(folio),
                                   NETFS_READPAGE);
        if (IS_ERR(rreq)) {
                ret = PTR_ERR(rreq);
                goto alloc_error;
        }

        ret = netfs_begin_cache_read(rreq, ctx);
        if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
                goto discard;

        netfs_stat(&netfs_n_rh_readpage);
        trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);

        /* Set up the output buffer */
        if (folio_test_dirty(folio)) {
                /* Handle someone trying to read from an unflushed streaming
                 * write. We fiddle the buffer so that a gap at the beginning
                 * and/or a gap at the end get copied to, but the middle is
                 * discarded.
                 */
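                /* Illustrative layout (example numbers only): for a 16KiB
                 * folio carrying a streaming write at bytes [from, to) =
                 * [4096, 8192), the iterator built below maps [0, 4096) to
                 * the folio, [4096, 8192) to the sink folio (so the fetched
                 * data is discarded and the dirty data is preserved), and
                 * [8192, 16384) back to the folio.
                 */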
                struct netfs_folio *finfo = netfs_folio_info(folio);
                struct bio_vec *bvec;
                unsigned int from = finfo->dirty_offset;
                unsigned int to = from + finfo->dirty_len;
                unsigned int off = 0, i = 0;
                size_t flen = folio_size(folio);
                size_t nr_bvec = flen / PAGE_SIZE + 2;
                size_t part;

                ret = -ENOMEM;
                bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
                if (!bvec)
                        goto discard;

                sink = folio_alloc(GFP_KERNEL, 0);
                if (!sink)
                        goto discard;

                trace_netfs_folio(folio, netfs_folio_trace_read_gaps);

                rreq->direct_bv = bvec;
                rreq->direct_bv_count = nr_bvec;
                if (from > 0) {
                        bvec_set_folio(&bvec[i++], folio, from, 0);
                        off = from;
                }
                while (off < to) {
                        part = min_t(size_t, to - off, PAGE_SIZE);
                        bvec_set_folio(&bvec[i++], sink, part, 0);
                        off += part;
                }
                if (to < flen)
                        bvec_set_folio(&bvec[i++], folio, flen - to, to);
                iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
        } else {
                iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
                                rreq->start, rreq->len);
        }

        ret = netfs_begin_read(rreq, true);
        if (sink)
                folio_put(sink);
        netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
        return ret < 0 ? ret : 0;

discard:
        netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
alloc_error:
        folio_unlock(folio);
        return ret;
}
EXPORT_SYMBOL(netfs_read_folio);

/*
 * Prepare a folio for writing without reading first
 * @folio: The folio being prepared
 * @pos: starting position for the write
 * @len: length of write
 * @always_fill: T if the folio should always be completely filled/cleared
 *
 * In some cases, write_begin doesn't need to read at all:
 * - full folio write
 * - write that lies in a folio that is completely beyond EOF
 * - write that covers the folio from start to EOF or beyond it
 *
 * If any of these criteria are met, then zero out the unwritten parts
 * of the folio and return true. Otherwise, return false.
 */
static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
                                  bool always_fill)
{
        struct inode *inode = folio_inode(folio);
        loff_t i_size = i_size_read(inode);
        size_t offset = offset_in_folio(folio, pos);
        size_t plen = folio_size(folio);

        if (unlikely(always_fill)) {
                if (pos - offset + len <= i_size)
                        return false; /* Page entirely before EOF */
                zero_user_segment(&folio->page, 0, plen);
                folio_mark_uptodate(folio);
                return true;
        }

        /* Full folio write */
        if (offset == 0 && len >= plen)
                return true;

        /* Page entirely beyond the end of the file */
        if (pos - offset >= i_size)
                goto zero_out;

        /* Write that covers from the start of the folio to EOF or beyond */
        if (offset == 0 && (pos + len) >= i_size)
                goto zero_out;

        return false;
zero_out:
        zero_user_segments(&folio->page, 0, offset, offset + len, plen);
        return true;
}
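
/*
 * Worked example for netfs_skip_folio_read() (illustrative numbers only):
 * take a 4KiB folio covering file bytes [8192, 12288) on a file with
 * i_size = 10000, and a write of len = 2000 at pos = 8192. Then offset = 0
 * and pos + len = 10192 >= i_size, so the third criterion applies: the tail
 * [2000, 4096) of the folio is zeroed and true is returned, letting the
 * caller skip the read because the write itself will fill bytes [0, 2000).
 */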

/**
 * netfs_write_begin - Helper to prepare for writing
 * @ctx: The netfs context
 * @file: The file to read from
 * @mapping: The mapping to read from
 * @pos: File position at which the write will begin
 * @len: The length of the write (may extend beyond the end of the folio chosen)
 * @_folio: Where to put the resultant folio
 * @_fsdata: Place for the netfs to store a cookie
 *
 * Pre-read data for a write-begin request by drawing data from the cache if
 * possible, or the netfs if not. Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together. If
 * necessary, the readahead window can be expanded in either direction to a
 * more convenient alignment for RPC efficiency or to make storage in the cache
 * feasible.
 *
 * The calling netfs must provide a table of operations, only one of which,
 * issue_op, is mandatory.
 *
 * The check_write_begin() operation can be provided to check for and flush
 * conflicting writes once the folio is grabbed and locked. It is passed a
 * pointer to the fsdata cookie that gets returned to the VM to be passed to
 * write_end. It is permitted to sleep. It should return 0 if the request
 * should go ahead or it may return an error. It may also unlock and put the
 * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
 * will cause the folio to be re-got and the process to be retried.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_write_begin(struct netfs_inode *ctx,
                      struct file *file, struct address_space *mapping,
                      loff_t pos, unsigned int len, struct folio **_folio,
                      void **_fsdata)
{
        struct netfs_io_request *rreq;
        struct folio *folio;
        pgoff_t index = pos >> PAGE_SHIFT;
        int ret;

        DEFINE_READAHEAD(ractl, file, NULL, mapping, index);

retry:
        folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
                                    mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        if (ctx->ops->check_write_begin) {
                /* Allow the netfs (eg. ceph) to flush conflicts. */
                ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
                if (ret < 0) {
                        trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
                        goto error;
                }
                if (!folio)
                        goto retry;
        }

        if (folio_test_uptodate(folio))
                goto have_folio;

        /* If the page is beyond the EOF, we want to clear it - unless it's
         * within the cache granule containing the EOF, in which case we need
         * to preload the granule.
         */
        if (!netfs_is_cache_enabled(ctx) &&
            netfs_skip_folio_read(folio, pos, len, false)) {
                netfs_stat(&netfs_n_rh_write_zskip);
                goto have_folio_no_wait;
        }

        rreq = netfs_alloc_request(mapping, file,
                                   folio_file_pos(folio), folio_size(folio),
                                   NETFS_READ_FOR_WRITE);
        if (IS_ERR(rreq)) {
                ret = PTR_ERR(rreq);
                goto error;
        }
        rreq->no_unlock_folio = folio->index;
        __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);

        ret = netfs_begin_cache_read(rreq, ctx);
        if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
                goto error_put;

        netfs_stat(&netfs_n_rh_write_begin);
        trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);

        /* Expand the request to meet caching requirements and download
         * preferences.
         */
        ractl._nr_pages = folio_nr_pages(folio);
        netfs_rreq_expand(rreq, &ractl);

        /* Set up the output buffer */
        iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
                        rreq->start, rreq->len);

        /* We hold the folio locks, so we can drop the references */
        folio_get(folio);
        while (readahead_folio(&ractl))
                ;

        ret = netfs_begin_read(rreq, true);
        if (ret < 0)
                goto error;
        netfs_put_request(rreq, false, netfs_rreq_trace_put_return);

have_folio:
        ret = folio_wait_fscache_killable(folio);
        if (ret < 0)
                goto error;
have_folio_no_wait:
        *_folio = folio;
        _leave(" = 0");
        return 0;

error_put:
        netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
        if (folio) {
                folio_unlock(folio);
                folio_put(folio);
        }
        _leave(" = %d", ret);
        return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
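
/*
 * Illustrative usage sketch (not part of the upstream file): a filesystem's
 * write path might obtain a prepared folio roughly as below. The "myfs"
 * helper name is hypothetical and error handling is reduced to a minimum.
 */
static int myfs_grab_folio_for_write_example(struct file *file, loff_t pos,
                                             unsigned int len,
                                             struct folio **foliop)
{
        struct address_space *mapping = file->f_mapping;
        void *fsdata = NULL;

        /* On success, *foliop is returned locked and uptodate (or zeroed). */
        return netfs_write_begin(netfs_inode(mapping->host), file, mapping,
                                 pos, len, foliop, &fsdata);
}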

/*
 * Preload the data into a page we're proposing to write into.
 */
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
                             size_t offset, size_t len)
{
        struct netfs_io_request *rreq;
        struct address_space *mapping = folio->mapping;
        struct netfs_inode *ctx = netfs_inode(mapping->host);
        unsigned long long start = folio_pos(folio);
        size_t flen = folio_size(folio);
        int ret;

        _enter("%zx @%llx", flen, start);

        ret = -ENOMEM;

        rreq = netfs_alloc_request(mapping, file, start, flen,
                                   NETFS_READ_FOR_WRITE);
        if (IS_ERR(rreq)) {
                ret = PTR_ERR(rreq);
                goto error;
        }

        rreq->no_unlock_folio = folio->index;
        __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
        ret = netfs_begin_cache_read(rreq, ctx);
        if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
                goto error_put;

        netfs_stat(&netfs_n_rh_write_begin);
        trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);

        /* Set up the output buffer */
        iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
                        rreq->start, rreq->len);

        ret = netfs_begin_read(rreq, true);
        netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
        return ret;

error_put:
        netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
error:
        _leave(" = %d", ret);
        return ret;
}
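
/*
 * Typical use (illustrative note, no call is made from this file): the
 * buffered write path calls netfs_prefetch_for_write() when a write only
 * partially covers a folio that is not yet uptodate and cannot simply be
 * zero-filled, so the rest of the folio must be read in before the copy
 * proceeds.
 */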

/**
 * netfs_buffered_read_iter - Filesystem buffered I/O read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the ->read_iter() routine for all filesystems that can use the page
 * cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
 * returned when no data can be read without waiting for I/O requests to
 * complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
 * shall be made for the read or for readahead. When no data can be read,
 * -EAGAIN shall be returned. When readahead would be triggered, a partial,
 * possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        struct netfs_inode *ictx = netfs_inode(inode);
        ssize_t ret;

        if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
                         test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
                return -EINVAL;

        ret = netfs_start_io_read(inode);
        if (ret == 0) {
                ret = filemap_read(iocb, iter, 0);
                netfs_end_io_read(inode);
        }
        return ret;
}
EXPORT_SYMBOL(netfs_buffered_read_iter);

/**
 * netfs_file_read_iter - Generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the ->read_iter() routine for all filesystems that can use the page
 * cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
 * returned when no data can be read without waiting for I/O requests to
 * complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
 * shall be made for the read or for readahead. When no data can be read,
 * -EAGAIN shall be returned. When readahead would be triggered, a partial,
 * possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);

        if ((iocb->ki_flags & IOCB_DIRECT) ||
            test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
                return netfs_unbuffered_read_iter(iocb, iter);

        return netfs_buffered_read_iter(iocb, iter);
}
EXPORT_SYMBOL(netfs_file_read_iter);
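
/*
 * Illustrative sketch (not part of the upstream file): a filesystem would
 * normally expose the dispatch routine above via its file_operations, e.g.
 * as below. The "myfs" name is hypothetical and the table is abridged.
 */
static const struct file_operations myfs_file_ops_example = {
        .llseek         = generic_file_llseek,
        .read_iter      = netfs_file_read_iter,
        .mmap           = generic_file_mmap,
};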