root / drivers / block-llcache.c @ abdb293f
History | View | Annotate | Download (13.8 kB)
1 |
/*
|
---|---|
2 |
* Copyright (c) 2010, XenSource Inc.
|
3 |
* All rights reserved.
|
4 |
*
|
5 |
* Redistribution and use in source and binary forms, with or without
|
6 |
* modification, are permitted provided that the following conditions are met:
|
7 |
* * Redistributions of source code must retain the above copyright
|
8 |
* notice, this list of conditions and the following disclaimer.
|
9 |
* * Redistributions in binary form must reproduce the above copyright
|
10 |
* notice, this list of conditions and the following disclaimer in the
|
11 |
* documentation and/or other materials provided with the distribution.
|
12 |
* * Neither the name of XenSource Inc. nor the names of its contributors
|
13 |
* may be used to endorse or promote products derived from this software
|
14 |
* without specific prior written permission.
|
15 |
*
|
16 |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
17 |
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
18 |
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
19 |
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
20 |
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
21 |
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
22 |
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
23 |
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
24 |
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
25 |
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
26 |
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
27 |
*/
|
28 |
|
29 |
#ifdef HAVE_CONFIG_H
|
30 |
#include "config.h" |
31 |
#endif
|
32 |
|
33 |
#include <errno.h> |
34 |
|
35 |
#include "tapdisk.h" |
36 |
#include "tapdisk-vbd.h" |
37 |
#include "tapdisk-driver.h" |
38 |
#include "tapdisk-interface.h" |
39 |
#include "tapdisk-disktype.h" |
40 |
|
41 |
/* Logging helpers; each forwards to tlog_syslog() at a fixed level. */
#define DBG(_f, _a...)            tlog_syslog(TLOG_DBG, _f, ##_a)
#define INFO(_f, _a...)           tlog_syslog(TLOG_INFO, _f, ##_a)
/* Prefixes "WARNING: " and appends the calling function and line. */
#define WARN(_f, _a...)           tlog_syslog(TLOG_WARN, "WARNING: "_f "in %s:%d", \
					      ##_a, __func__, __LINE__)

/* Fatal assertions: both end up in td_panic(). */
#define BUG()                     td_panic()
#define BUG_ON(_cond)             if (unlikely(_cond)) { td_panic(); }
/*
 * Non-fatal assertion: log the stringified condition when _p holds.
 * The macro previously tested and printed `_cond`, which is not its
 * parameter, so any expansion referred to an undeclared identifier.
 */
#define WARN_ON(_p)               if (unlikely(_p)) { WARN("%s ", #_p); }
49 |
|
50 |
/*
 * Fold the result of one completed write into the error recorded so far.
 *
 * @curr:  error recorded so far (0 if none, otherwise a negative errno).
 * @error: result of the write that just completed (0 on success).
 *
 * A new failure replaces the recorded value only when nothing has been
 * recorded yet or when the recorded value is -ENOSPC. In every other
 * case the recorded value is returned unchanged, so that a later
 * success (or a later, different failure) cannot erase the first
 * error. Returning 0 here, as the code used to, dropped it.
 */
int ll_write_error(int curr, int error)
{
	if (error && (!curr || curr == -ENOSPC))
		return error;

	return curr;
}
57 |
|
58 |
/*
 * Log that a write failure on @local makes the driver switch to @shared.
 *
 * @type:   disk type of the reporting driver; currently not used.
 * @error:  negative errno describing the failure (negated for strerror()).
 * @local:  image the failure occurred on.
 * @shared: image the driver continues on.
 *
 * WARN() already prepends "WARNING: " and appends "in <func>:<line>",
 * so the message carries no prefix of its own and ends in a space to
 * keep the suffix from running into the final sentence.
 */
void ll_log_switch(int type, int error,
		   td_image_t *local, td_image_t *shared)
{
	(void)type;

	WARN("%s, on %s:%s. Switching to %s:%s. ",
	     strerror(-error),
	     tapdisk_disk_types[local->type]->name, local->name,
	     tapdisk_disk_types[shared->type]->name, shared->name);
}
66 |
|
67 |
/*
|
68 |
* LLP: Local leaf persistent cache
|
69 |
* -- Persistent write caching in local storage.
|
70 |
*
|
71 |
* VBD
|
72 |
* \
|
73 |
* +--r/w--> llp+vhd:/local/leaf
|
74 |
* \
|
75 |
* +--r/w--> vhd:/shared/leaf
|
76 |
* \
|
77 |
* +--r/o--> vhd:/shared/parent
|
78 |
*
|
79 |
* We drive two 'leaf' (r/w) images: One LOCAL (i.e. on local storage,
|
80 |
* unreliable and prone to out-of-space failures), and one SHARED
|
81 |
* (i.e. in shared storage with plenty of physical backing).
|
82 |
*
|
83 |
* All images are on a linear read chain: LOCAL inherits from SHARED,
|
84 |
* which inherits from a shared master image. This filter driver
|
85 |
* aggregates LOCAL. SHARED is our immediate parent, forced into R/W
|
86 |
* mode.
|
87 |
*
|
88 |
* Unless LOCAL failed, reads are issued to LOCAL, to save shared
|
89 |
* storage bandwidth. In case of failure, SHARED provides continued
|
90 |
* VDI consistency.
|
91 |
*
|
92 |
*/
|
93 |
/* Operating modes of the LLP driver, kept in struct llpcache.mode. */
enum {
	/*
	 * Writes are mirrored to both LOCAL and SHARED. Reads are
	 * issued to LOCAL.
	 *
	 * A failure to write LOCAL is recoverable: the driver
	 * transitions to LLP_SHARED.
	 *
	 * A failure to write SHARED is irrecoverable and is signaled
	 * to the original issuer.
	 */
	LLP_MIRROR = 1,

	/*
	 * Writes are issued to SHARED only, and so are reads.
	 *
	 * A failure to write SHARED is irrecoverable.
	 */
	LLP_SHARED = 2,
};
117 |
|
118 |
/* Number of entries in the per-driver LLP request pool. */
#define TD_LLPCACHE_MAX_REQ       (MAX_REQUESTS*2)

/* Short names for the LLP driver state and its pooled request. */
typedef struct llpcache_request   td_llpcache_req_t;
typedef struct llpcache           td_llpcache_t;
121 |
|
122 |
struct llpcache_vreq {
|
123 |
enum { LOCAL = 0, SHARED = 1 } target; |
124 |
td_vbd_request_t vreq; |
125 |
}; |
126 |
|
127 |
struct llpcache_request {
|
128 |
td_request_t treq; |
129 |
|
130 |
struct td_iovec iov;
|
131 |
int error;
|
132 |
|
133 |
struct llpcache_vreq lvr[2]; |
134 |
|
135 |
unsigned int pending; |
136 |
int mode;
|
137 |
}; |
138 |
|
139 |
struct llpcache {
|
140 |
td_image_t *local; |
141 |
int mode;
|
142 |
|
143 |
td_llpcache_req_t reqv[TD_LLPCACHE_MAX_REQ]; |
144 |
td_llpcache_req_t *free[TD_LLPCACHE_MAX_REQ]; |
145 |
int n_free;
|
146 |
}; |
147 |
|
148 |
static td_llpcache_req_t *
|
149 |
llpcache_alloc_request(td_llpcache_t *s) |
150 |
{ |
151 |
td_llpcache_req_t *req = NULL;
|
152 |
|
153 |
if (likely(s->n_free))
|
154 |
req = s->free[--s->n_free]; |
155 |
|
156 |
return req;
|
157 |
} |
158 |
|
159 |
static void |
160 |
llpcache_free_request(td_llpcache_t *s, td_llpcache_req_t *req) |
161 |
{ |
162 |
BUG_ON(s->n_free >= TD_LLPCACHE_MAX_REQ); |
163 |
s->free[s->n_free++] = req; |
164 |
} |
165 |
|
166 |
static void |
167 |
__llpcache_write_cb(td_vbd_request_t *vreq, int error,
|
168 |
void *token, int final) |
169 |
{ |
170 |
td_llpcache_t *s = token; |
171 |
struct llpcache_vreq *lvr;
|
172 |
td_llpcache_req_t *req; |
173 |
int mask;
|
174 |
|
175 |
lvr = containerof(vreq, struct llpcache_vreq, vreq);
|
176 |
req = containerof(lvr, td_llpcache_req_t, lvr[lvr->target]); |
177 |
|
178 |
mask = 1U << lvr->target;
|
179 |
BUG_ON(!(req->pending & mask)) |
180 |
|
181 |
if (lvr->target == LOCAL && error == -ENOSPC) {
|
182 |
td_image_t *shared = |
183 |
containerof(req->treq.image->next.next, |
184 |
td_image_t, next); |
185 |
ll_log_switch(DISK_TYPE_LLPCACHE, error, |
186 |
s->local, shared); |
187 |
s->mode = LLP_SHARED; |
188 |
error = 0;
|
189 |
} |
190 |
|
191 |
req->pending &= ~mask; |
192 |
req->error = ll_write_error(req->error, error); |
193 |
|
194 |
if (!req->pending) {
|
195 |
/* FIXME: Make sure this won't retry. */
|
196 |
td_complete_request(req->treq, req->error); |
197 |
llpcache_free_request(s, req); |
198 |
} |
199 |
} |
200 |
|
201 |
/*
|
202 |
* NB. Write mirroring. Lacking per-image queues, it's still a
|
203 |
* hack. But shall do for now:
|
204 |
*
|
205 |
* 1. Store the treq, thereby blocking the original vreq.
|
206 |
* 2. Reissue, as two clone vreqs. One local, one shared.
|
207 |
* 3. Clones seen again then get forwarded.
|
208 |
* 4. Treq completes after both vreqs.
|
209 |
*
|
210 |
* We can recognize clones by matching the vreq->token field.
|
211 |
*/
|
212 |
|
213 |
static int |
214 |
llpcache_requeue_treq(td_llpcache_t *s, td_llpcache_req_t *req, int target)
|
215 |
{ |
216 |
struct llpcache_vreq *lvr;
|
217 |
td_vbd_request_t *vreq; |
218 |
int err;
|
219 |
|
220 |
lvr = &req->lvr[target]; |
221 |
lvr->target = target; |
222 |
|
223 |
vreq = &lvr->vreq; |
224 |
vreq->op = TD_OP_WRITE; |
225 |
vreq->sec = req->treq.sec; |
226 |
vreq->iov = &req->iov; |
227 |
vreq->iovcnt = 1;
|
228 |
vreq->cb = __llpcache_write_cb; |
229 |
vreq->token = s; |
230 |
|
231 |
err = tapdisk_vbd_queue_request(req->treq.vreq->vbd, vreq); |
232 |
if (err)
|
233 |
goto fail;
|
234 |
|
235 |
req->pending |= 1UL << target;
|
236 |
return 0; |
237 |
|
238 |
fail:
|
239 |
req->error = req->error ? : err; |
240 |
return err;
|
241 |
} |
242 |
|
243 |
static void |
244 |
llpcache_fork_write(td_llpcache_t *s, td_request_t treq) |
245 |
{ |
246 |
td_llpcache_req_t *req; |
247 |
struct td_iovec *iov;
|
248 |
int err;
|
249 |
|
250 |
req = llpcache_alloc_request(s); |
251 |
if (!req) {
|
252 |
td_complete_request(treq, -EBUSY); |
253 |
return;
|
254 |
} |
255 |
|
256 |
memset(req, 0, sizeof(req)); |
257 |
|
258 |
req->treq = treq; |
259 |
|
260 |
iov = &req->iov; |
261 |
iov->base = treq.buf; |
262 |
iov->secs = treq.secs; |
263 |
|
264 |
err = llpcache_requeue_treq(s, req, LOCAL); |
265 |
if (err)
|
266 |
goto fail;
|
267 |
|
268 |
err = llpcache_requeue_treq(s, req, SHARED); |
269 |
if (err)
|
270 |
goto fail;
|
271 |
|
272 |
return;
|
273 |
|
274 |
fail:
|
275 |
if (!req->pending) {
|
276 |
td_complete_request(treq, req->error); |
277 |
llpcache_free_request(s, req); |
278 |
} |
279 |
} |
280 |
|
281 |
static void |
282 |
llpcache_forward_write(td_llpcache_t *s, td_request_t treq) |
283 |
{ |
284 |
const td_vbd_request_t *vreq = treq.vreq;
|
285 |
struct llpcache_vreq *lvr;
|
286 |
|
287 |
lvr = containerof(vreq, struct llpcache_vreq, vreq);
|
288 |
|
289 |
switch (lvr->target) {
|
290 |
case SHARED:
|
291 |
td_forward_request(treq); |
292 |
break;
|
293 |
case LOCAL:
|
294 |
td_queue_write(s->local, treq); |
295 |
break;
|
296 |
default:
|
297 |
BUG(); |
298 |
} |
299 |
} |
300 |
|
301 |
static void |
302 |
llpcache_queue_write(td_driver_t *driver, td_request_t treq) |
303 |
{ |
304 |
td_llpcache_t *s = driver->data; |
305 |
|
306 |
if (treq.vreq->token == s)
|
307 |
llpcache_forward_write(s, treq); |
308 |
else
|
309 |
llpcache_fork_write(s, treq); |
310 |
} |
311 |
|
312 |
static void |
313 |
llpcache_queue_read(td_driver_t *driver, td_request_t treq) |
314 |
{ |
315 |
td_llpcache_t *s = driver->data; |
316 |
|
317 |
switch (s->mode) {
|
318 |
case LLP_MIRROR:
|
319 |
td_queue_read(s->local, treq); |
320 |
break;
|
321 |
case LLP_SHARED:
|
322 |
td_forward_request(treq); |
323 |
default:
|
324 |
BUG(); |
325 |
} |
326 |
} |
327 |
|
328 |
static int |
329 |
llpcache_close(td_driver_t *driver) |
330 |
{ |
331 |
td_llpcache_t *s = driver->data; |
332 |
|
333 |
if (s->local) {
|
334 |
tapdisk_image_close(s->local); |
335 |
s->local = NULL;
|
336 |
} |
337 |
|
338 |
return 0; |
339 |
} |
340 |
|
341 |
static int |
342 |
llpcache_open(td_driver_t *driver, const char *name, td_flag_t flags) |
343 |
{ |
344 |
td_llpcache_t *s = driver->data; |
345 |
int i, err;
|
346 |
|
347 |
s->mode = LLP_MIRROR; |
348 |
|
349 |
for (i = 0; i < TD_LLPCACHE_MAX_REQ; i++) |
350 |
llpcache_free_request(s, &s->reqv[i]); |
351 |
|
352 |
err = tapdisk_image_open(DISK_TYPE_VHD, name, flags, &s->local); |
353 |
if (err)
|
354 |
goto fail;
|
355 |
|
356 |
driver->info = s->local->driver->info; |
357 |
|
358 |
return 0; |
359 |
|
360 |
fail:
|
361 |
llpcache_close(driver); |
362 |
return err;
|
363 |
} |
364 |
|
365 |
static int |
366 |
llcache_get_parent_id(td_driver_t *driver, td_disk_id_t *id) |
367 |
{ |
368 |
td_llpcache_t *s = driver->data; |
369 |
int err;
|
370 |
|
371 |
err = td_get_parent_id(s->local, id); |
372 |
if (!err)
|
373 |
id->flags &= ~TD_OPEN_RDONLY; |
374 |
|
375 |
return err;
|
376 |
} |
377 |
|
378 |
static int |
379 |
llcache_validate_parent(td_driver_t *driver, |
380 |
td_driver_t *pdriver, td_flag_t flags) |
381 |
{ |
382 |
return -ENOSYS;
|
383 |
} |
384 |
|
385 |
|
386 |
struct tap_disk tapdisk_llpcache = {
|
387 |
.disk_type = "tapdisk_llpcache",
|
388 |
.flags = 0,
|
389 |
.private_data_size = sizeof(td_llpcache_t),
|
390 |
.td_open = llpcache_open, |
391 |
.td_close = llpcache_close, |
392 |
.td_queue_read = llpcache_queue_read, |
393 |
.td_queue_write = llpcache_queue_write, |
394 |
.td_get_parent_id = llcache_get_parent_id, |
395 |
.td_validate_parent = llcache_validate_parent, |
396 |
}; |
397 |
|
398 |
/*
|
399 |
* LLE: Local Leaf Ephemeral Cache
|
400 |
* -- Non-persistent write caching in local storage.
|
401 |
*
|
402 |
* VBD
|
403 |
* \
|
404 |
* +--r/w--> lle+vhd:/shared/leaf
|
405 |
* \
|
406 |
* +--r/w--> vhd:/local/leaf
|
407 |
* \
|
408 |
* +--r/o--> vhd:/shared/parent
|
409 |
*
|
410 |
* Note that LOCAL and SHARED chain order differs from LLP. Shared
|
411 |
* storage data masks local data.
|
412 |
*
|
413 |
* This means VDI state in shared storage state alone is
|
414 |
* inconsistent. Wherever local is unavailable, SHARED must be
|
415 |
* discarded too.
|
416 |
*/
|
417 |
/* Operating modes of the LLE driver, kept in struct llecache.mode. */
enum {
	/*
	 * Writes are forwarded to LOCAL only, and so are reads. This
	 * reduces network overhead.
	 *
	 * A failure to write LOCAL is recoverable: the driver
	 * transitions to LLE_SHARED.
	 *
	 * A failure to write SHARED is irrecoverable and is signaled
	 * to the original issuer.
	 */
	LLE_LOCAL = 1,

	/*
	 * Writes are issued to SHARED, and so are reads.
	 *
	 * A failure to write SHARED is irrecoverable.
	 */
	LLE_SHARED = 2,
};
441 |
|
442 |
/* Number of entries in the per-driver LLE request pool. */
#define TD_LLECACHE_MAX_REQ       (MAX_REQUESTS*2)

/* Short names for the LLE driver state and its pooled request. */
typedef struct llecache_request   td_llecache_req_t;
typedef struct llecache           td_llecache_t;
445 |
|
446 |
struct llecache_request {
|
447 |
td_llecache_t *s; |
448 |
td_request_t treq; |
449 |
int pending;
|
450 |
int error;
|
451 |
}; |
452 |
|
453 |
struct llecache {
|
454 |
td_image_t *shared; |
455 |
int mode;
|
456 |
|
457 |
td_llecache_req_t reqv[TD_LLECACHE_MAX_REQ]; |
458 |
td_llecache_req_t *free[TD_LLECACHE_MAX_REQ]; |
459 |
int n_free;
|
460 |
}; |
461 |
|
462 |
static td_llecache_req_t *
|
463 |
llecache_alloc_request(td_llecache_t *s) |
464 |
{ |
465 |
td_llecache_req_t *req = NULL;
|
466 |
|
467 |
if (likely(s->n_free))
|
468 |
req = s->free[--s->n_free]; |
469 |
|
470 |
return req;
|
471 |
} |
472 |
|
473 |
static void |
474 |
llecache_free_request(td_llecache_t *s, td_llecache_req_t *req) |
475 |
{ |
476 |
BUG_ON(s->n_free >= TD_LLECACHE_MAX_REQ); |
477 |
s->free[s->n_free++] = req; |
478 |
} |
479 |
|
480 |
static int |
481 |
llecache_close(td_driver_t *driver) |
482 |
{ |
483 |
td_llecache_t *s = driver->data; |
484 |
|
485 |
if (s->shared) {
|
486 |
tapdisk_image_close(s->shared); |
487 |
s->shared = NULL;
|
488 |
} |
489 |
|
490 |
return 0; |
491 |
} |
492 |
|
493 |
static int |
494 |
llecache_open(td_driver_t *driver, const char *name, td_flag_t flags) |
495 |
{ |
496 |
td_llecache_t *s = driver->data; |
497 |
int i, err;
|
498 |
|
499 |
s->mode = LLE_LOCAL; |
500 |
|
501 |
for (i = 0; i < TD_LLECACHE_MAX_REQ; i++) |
502 |
llecache_free_request(s, &s->reqv[i]); |
503 |
|
504 |
err = tapdisk_image_open(DISK_TYPE_VHD, name, flags, &s->shared); |
505 |
if (err)
|
506 |
goto fail;
|
507 |
|
508 |
driver->info = s->shared->driver->info; |
509 |
|
510 |
return 0; |
511 |
|
512 |
fail:
|
513 |
llecache_close(driver); |
514 |
return err;
|
515 |
} |
516 |
|
517 |
static void |
518 |
__llecache_write_cb(td_request_t treq, int error)
|
519 |
{ |
520 |
td_llecache_req_t *req = treq.cb_data; |
521 |
td_llecache_t *s = req->s; |
522 |
|
523 |
BUG_ON(req->pending < treq.secs); |
524 |
|
525 |
req->pending -= treq.secs; |
526 |
req->error = ll_write_error(req->error, error); |
527 |
|
528 |
if (req->pending)
|
529 |
return;
|
530 |
|
531 |
if (req->error == -ENOSPC) {
|
532 |
ll_log_switch(DISK_TYPE_LLECACHE, req->error, |
533 |
treq.image, s->shared); |
534 |
|
535 |
s->mode = LLE_SHARED; |
536 |
td_queue_write(s->shared, req->treq); |
537 |
|
538 |
} else
|
539 |
td_complete_request(req->treq, error); |
540 |
|
541 |
llecache_free_request(s, req); |
542 |
} |
543 |
|
544 |
static void |
545 |
llecache_forward_write(td_llecache_t *s, td_request_t treq) |
546 |
{ |
547 |
td_llecache_req_t *req; |
548 |
td_request_t clone; |
549 |
|
550 |
req = llecache_alloc_request(s); |
551 |
if (!req) {
|
552 |
td_complete_request(treq, -EBUSY); |
553 |
return;
|
554 |
} |
555 |
|
556 |
memset(req, 0, sizeof(req)); |
557 |
|
558 |
req->treq = treq; |
559 |
req->pending = treq.secs; |
560 |
req->s = s; |
561 |
|
562 |
clone = treq; |
563 |
clone.cb = __llecache_write_cb; |
564 |
clone.cb_data = req; |
565 |
|
566 |
td_forward_request(clone); |
567 |
} |
568 |
|
569 |
static void |
570 |
llecache_queue_write(td_driver_t *driver, td_request_t treq) |
571 |
{ |
572 |
td_llecache_t *s = driver->data; |
573 |
|
574 |
switch (s->mode) {
|
575 |
case LLE_LOCAL:
|
576 |
llecache_forward_write(s, treq); |
577 |
break;
|
578 |
case LLE_SHARED:
|
579 |
td_queue_write(s->shared, treq); |
580 |
break;
|
581 |
} |
582 |
} |
583 |
|
584 |
static void |
585 |
llecache_queue_read(td_driver_t *driver, td_request_t treq) |
586 |
{ |
587 |
td_llecache_t *s = driver->data; |
588 |
|
589 |
switch (s->mode) {
|
590 |
case LLE_LOCAL:
|
591 |
td_forward_request(treq); |
592 |
break;
|
593 |
case LLE_SHARED:
|
594 |
td_queue_read(s->shared, treq); |
595 |
break;
|
596 |
default:
|
597 |
BUG(); |
598 |
} |
599 |
} |
600 |
|
601 |
struct tap_disk tapdisk_llecache = {
|
602 |
.disk_type = "tapdisk_llecache",
|
603 |
.flags = 0,
|
604 |
.private_data_size = sizeof(td_llecache_t),
|
605 |
.td_open = llecache_open, |
606 |
.td_close = llecache_close, |
607 |
.td_queue_read = llecache_queue_read, |
608 |
.td_queue_write = llecache_queue_write, |
609 |
.td_get_parent_id = llcache_get_parent_id, |
610 |
.td_validate_parent = llcache_validate_parent, |
611 |
}; |