Statistics
| Branch: | Revision:

root / drivers / block-llcache.c @ abdb293f

History | View | Annotate | Download (13.8 kB)

1
/*
2
 * Copyright (c) 2010, XenSource Inc.
3
 * All rights reserved.
4
 *
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions are met:
7
 *     * Redistributions of source code must retain the above copyright
8
 *       notice, this list of conditions and the following disclaimer.
9
 *     * Redistributions in binary form must reproduce the above copyright
10
 *       notice, this list of conditions and the following disclaimer in the
11
 *       documentation and/or other materials provided with the distribution.
12
 *     * Neither the name of XenSource Inc. nor the names of its contributors
13
 *       may be used to endorse or promote products derived from this software
14
 *       without specific prior written permission.
15
 *
16
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
 */
28

    
29
#ifdef HAVE_CONFIG_H
30
#include "config.h"
31
#endif
32

    
33
#include <errno.h>
34

    
35
#include "tapdisk.h"
36
#include "tapdisk-vbd.h"
37
#include "tapdisk-driver.h"
38
#include "tapdisk-interface.h"
39
#include "tapdisk-disktype.h"
40

    
41
/* Logging shorthands around tlog_syslog(). */
#define DBG(_f, _a...)  tlog_syslog(TLOG_DBG, _f, ##_a)
#define INFO(_f, _a...) tlog_syslog(TLOG_INFO, _f, ##_a)
/* Prefixes the message with "WARNING: " and appends the call site. */
#define WARN(_f, _a...) tlog_syslog(TLOG_WARN, "WARNING: " _f " in %s:%d", \
                                    ##_a, __func__, __LINE__)

/*
 * Assertion helpers. BUG_ON is kept as a plain `if` statement (rather
 * than a do/while wrapper) so that existing call sites keep compiling
 * unchanged.
 */
#define BUG()           td_panic()
#define BUG_ON(_cond)   if (unlikely(_cond)) { td_panic(); }
/* Logs the text of the condition _p when it holds; does not panic. */
#define WARN_ON(_p)     if (unlikely(_p)) { WARN("%s", #_p); }
49

    
50
/*
 * Fold the result of one completed sub-write into the error accumulated
 * for the whole request.
 *
 * @curr:  error recorded so far (0, or a negative errno value).
 * @error: result of the sub-write that just completed (0, or a negative
 *         errno value).
 *
 * Returns the new accumulated error. A new failure is adopted when
 * nothing has failed yet, or when the only failure so far is -ENOSPC.
 * In every other case the previously recorded error is kept, so that a
 * later success (or a later -ENOSPC) cannot erase an earlier failure.
 */
int ll_write_error(int curr, int error)
{
        if (error && (!curr || curr == -ENOSPC))
                return error;

        return curr;
}
57

    
58
/*
 * Log that writes are being redirected from one image to another.
 *
 * @type:   disk type of the calling driver; currently not used.
 * @error:  negative errno value that triggered the switch.
 * @local:  image named first in the message (the one being left).
 * @shared: image named second in the message (the one switched to).
 *
 * The WARN macro already prepends "WARNING: ", so the format string
 * here does not repeat it.
 */
void ll_log_switch(int type, int error,
                   td_image_t *local, td_image_t *shared)
{
        (void)type;

        WARN("%s, on %s:%s. Switching to %s:%s.",
             strerror(-error),
             tapdisk_disk_types[local->type]->name, local->name,
             tapdisk_disk_types[shared->type]->name, shared->name);
}
66

    
67
/*
68
 * LLP: Local leaf persistent cache
69
 *      -- Persistent write caching in local storage.
70
 *
71
 *    VBD
72
 *      \
73
 *       +--r/w--> llp+vhd:/local/leaf
74
 *        \
75
 *         +--r/w--> vhd:/shared/leaf
76
 *          \
77
 *           +--r/o--> vhd:/shared/parent
78
 *
79
 * We drive two 'leaf' (r/w) images: One LOCAL (i.e. on local storage,
80
 * unreliable and prone to out-of-space failures), and one SHARED
81
 * (i.e. in shared storage with plenty of physical backing).
82
 *
83
 * All images are on a linear read chain: LOCAL inherits from SHARED,
84
 * which inherits from a shared master image. This filter driver
85
 * aggregates LOCAL. SHARED is our immediate parent, forced into R/W
86
 * mode.
87
 *
88
 * Unless LOCAL failed, reads are issued to LOCAL, to save shared
89
 * storage bandwidth. In case of failure, SHARED provides continued
90
 * VDI consistency.
91
 *
92
 */
93
/* Operating modes of the LLP driver (stored in llpcache.mode). */
enum {
        /*
         * LLP_MIRROR:
         *
         * Writes are mirrored to both LOCAL and SHARED. Reads are
         * issued to LOCAL.
         *
         * Failures to write LOCAL are recoverable. The driver will
         * transition to LLP_SHARED.
         *
         * Failure to write SHARED is irrecoverable, and signaled to
         * the original issuer.
         */
        LLP_MIRROR = 1,

        /*
         * LLP_SHARED:
         *
         * Writes are issued to SHARED only. As are reads.
         *
         * Failure to write SHARED is irrecoverable.
         */
        LLP_SHARED = 2,
};
117

    
118
/* Short names for the LLP per-write context and the LLP driver state. */
typedef struct llpcache_request         td_llpcache_req_t;
typedef struct llpcache                 td_llpcache_t;

/* Number of entries in the LLP request pool: twice MAX_REQUESTS. */
#define TD_LLPCACHE_MAX_REQ             (MAX_REQUESTS * 2)
121

    
122
struct llpcache_vreq {
123
        enum { LOCAL = 0, SHARED = 1 }  target;
124
        td_vbd_request_t                vreq;
125
};
126

    
127
struct llpcache_request {
128
        td_request_t            treq;
129

    
130
        struct td_iovec         iov;
131
        int                     error;
132

    
133
        struct llpcache_vreq    lvr[2];
134

    
135
        unsigned int            pending;
136
        int                     mode;
137
};
138

    
139
struct llpcache {
140
        td_image_t             *local;
141
        int                     mode;
142

    
143
        td_llpcache_req_t       reqv[TD_LLPCACHE_MAX_REQ];
144
        td_llpcache_req_t      *free[TD_LLPCACHE_MAX_REQ];
145
        int                     n_free;
146
};
147

    
148
static td_llpcache_req_t *
149
llpcache_alloc_request(td_llpcache_t *s)
150
{
151
        td_llpcache_req_t *req = NULL;
152

    
153
        if (likely(s->n_free))
154
                req = s->free[--s->n_free];
155

    
156
        return req;
157
}
158

    
159
static void
160
llpcache_free_request(td_llpcache_t *s, td_llpcache_req_t *req)
161
{
162
        BUG_ON(s->n_free >= TD_LLPCACHE_MAX_REQ);
163
        s->free[s->n_free++] = req;
164
}
165

    
166
static void
167
__llpcache_write_cb(td_vbd_request_t *vreq, int error,
168
                   void *token, int final)
169
{
170
        td_llpcache_t *s = token;
171
        struct llpcache_vreq *lvr;
172
        td_llpcache_req_t *req;
173
        int mask;
174

    
175
        lvr = containerof(vreq, struct llpcache_vreq, vreq);
176
        req = containerof(lvr, td_llpcache_req_t, lvr[lvr->target]);
177

    
178
        mask = 1U << lvr->target;
179
        BUG_ON(!(req->pending & mask))
180

    
181
        if (lvr->target == LOCAL && error == -ENOSPC) {
182
                td_image_t *shared =
183
                        containerof(req->treq.image->next.next,
184
                                    td_image_t, next);
185
                ll_log_switch(DISK_TYPE_LLPCACHE, error,
186
                              s->local, shared);
187
                s->mode = LLP_SHARED;
188
                error = 0;
189
        }
190

    
191
        req->pending &= ~mask;
192
        req->error    = ll_write_error(req->error, error);
193

    
194
        if (!req->pending) {
195
                /* FIXME: Make sure this won't retry. */
196
                td_complete_request(req->treq, req->error);
197
                llpcache_free_request(s, req);
198
        }
199
}
200

    
201
/*
202
 * NB. Write mirroring. Lacking per-image queues, it's still a
203
 * hack. But shall do for now:
204
 *
205
 *   1. Store the treq, thereby blocking the original vreq.
206
 *   2. Reissue, as two clone vreqs. One local, one shared.
207
 *   3. Clones seen again then get forwarded.
208
 *   4. Treq completes after both vreqs.
209
 *
210
 * We can recognize clones by matching the vreq->token field.
211
 */
212

    
213
static int
214
llpcache_requeue_treq(td_llpcache_t *s, td_llpcache_req_t *req, int target)
215
{
216
        struct llpcache_vreq *lvr;
217
        td_vbd_request_t *vreq;
218
        int err;
219

    
220
        lvr           = &req->lvr[target];
221
        lvr->target   = target;
222

    
223
        vreq          = &lvr->vreq;
224
        vreq->op      = TD_OP_WRITE;
225
        vreq->sec     = req->treq.sec;
226
        vreq->iov     = &req->iov;
227
        vreq->iovcnt  = 1;
228
        vreq->cb      = __llpcache_write_cb;
229
        vreq->token   = s;
230

    
231
        err = tapdisk_vbd_queue_request(req->treq.vreq->vbd, vreq);
232
        if (err)
233
                goto fail;
234

    
235
        req->pending |= 1UL << target;
236
        return 0;
237

    
238
fail:
239
        req->error   = req->error ? : err;
240
        return err;
241
}
242

    
243
static void
244
llpcache_fork_write(td_llpcache_t *s, td_request_t treq)
245
{
246
        td_llpcache_req_t *req;
247
        struct td_iovec *iov;
248
        int err;
249

    
250
        req = llpcache_alloc_request(s);
251
        if (!req) {
252
                td_complete_request(treq, -EBUSY);
253
                return;
254
        }
255

    
256
        memset(req, 0, sizeof(req));
257

    
258
        req->treq     = treq;
259

    
260
        iov           = &req->iov;
261
        iov->base     = treq.buf;
262
        iov->secs     = treq.secs;
263

    
264
        err = llpcache_requeue_treq(s, req, LOCAL);
265
        if (err)
266
                goto fail;
267

    
268
        err = llpcache_requeue_treq(s, req, SHARED);
269
        if (err)
270
                goto fail;
271

    
272
        return;
273

    
274
fail:
275
        if (!req->pending) {
276
                td_complete_request(treq, req->error);
277
                llpcache_free_request(s, req);
278
        }
279
}
280

    
281
static void
282
llpcache_forward_write(td_llpcache_t *s, td_request_t treq)
283
{
284
        const td_vbd_request_t *vreq = treq.vreq;
285
        struct llpcache_vreq *lvr;
286

    
287
        lvr = containerof(vreq, struct llpcache_vreq, vreq);
288

    
289
        switch (lvr->target) {
290
        case SHARED:
291
                td_forward_request(treq);
292
                break;
293
        case LOCAL:
294
                td_queue_write(s->local, treq);
295
                break;
296
        default:
297
                BUG();
298
        }
299
}
300

    
301
static void
302
llpcache_queue_write(td_driver_t *driver, td_request_t treq)
303
{
304
        td_llpcache_t *s = driver->data;
305

    
306
        if (treq.vreq->token == s)
307
                llpcache_forward_write(s, treq);
308
        else
309
                llpcache_fork_write(s, treq);
310
}
311

    
312
static void
313
llpcache_queue_read(td_driver_t *driver, td_request_t treq)
314
{
315
        td_llpcache_t *s = driver->data;
316

    
317
        switch (s->mode) {
318
        case LLP_MIRROR:
319
                td_queue_read(s->local, treq);
320
                break;
321
        case LLP_SHARED:
322
                td_forward_request(treq);
323
        default:
324
                BUG();
325
        }
326
}
327

    
328
static int
329
llpcache_close(td_driver_t *driver)
330
{
331
        td_llpcache_t *s = driver->data;
332

    
333
        if (s->local) {
334
                tapdisk_image_close(s->local);
335
                s->local = NULL;
336
        }
337

    
338
        return 0;
339
}
340

    
341
static int
342
llpcache_open(td_driver_t *driver, const char *name, td_flag_t flags)
343
{
344
        td_llpcache_t *s = driver->data;
345
        int i, err;
346

    
347
        s->mode = LLP_MIRROR;
348

    
349
        for (i = 0; i < TD_LLPCACHE_MAX_REQ; i++)
350
                llpcache_free_request(s, &s->reqv[i]);
351

    
352
        err = tapdisk_image_open(DISK_TYPE_VHD, name, flags, &s->local);
353
        if (err)
354
                goto fail;
355

    
356
        driver->info = s->local->driver->info;
357

    
358
        return 0;
359

    
360
fail:
361
        llpcache_close(driver);
362
        return err;
363
}
364

    
365
static int
366
llcache_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
367
{
368
        td_llpcache_t *s = driver->data;
369
        int err;
370

    
371
        err = td_get_parent_id(s->local, id);
372
        if (!err)
373
                id->flags &= ~TD_OPEN_RDONLY;
374

    
375
        return err;
376
}
377

    
378
static int
379
llcache_validate_parent(td_driver_t *driver,
380
                        td_driver_t *pdriver, td_flag_t flags)
381
{
382
        return -ENOSYS;
383
}
384

    
385

    
386
struct tap_disk tapdisk_llpcache = {
387
        .disk_type                  = "tapdisk_llpcache",
388
        .flags                      = 0,
389
        .private_data_size          = sizeof(td_llpcache_t),
390
        .td_open                    = llpcache_open,
391
        .td_close                   = llpcache_close,
392
        .td_queue_read              = llpcache_queue_read,
393
        .td_queue_write             = llpcache_queue_write,
394
        .td_get_parent_id           = llcache_get_parent_id,
395
        .td_validate_parent         = llcache_validate_parent,
396
};
397

    
398
/*
399
 * LLE: Local Leaf Ephemeral Cache
400
 *      -- Non-persistent write caching in local storage.
401
 *
402
 *    VBD
403
 *      \
404
 *       +--r/w--> lle+vhd:/shared/leaf
405
 *        \
406
 *         +--r/w--> vhd:/local/leaf
407
 *          \
408
 *           +--r/o--> vhd:/shared/parent
409
 *
410
 * Note that LOCAL and SHARED chain order differs from LLP. Shared
411
 * storage data masks local data.
412
 *
413
 * This means that the VDI state held in shared storage alone is
 * inconsistent. Wherever LOCAL is unavailable, SHARED must be
 * discarded too.
416
 */
417
/* Operating modes of the LLE driver (stored in llecache.mode). */
enum {
        /*
         * LLE_LOCAL:
         *
         * Writes are forwarded to LOCAL only. As are reads. This
         * reduces network overhead.
         *
         * Failure to write LOCAL is recoverable. The driver will
         * transition to LLE_SHARED.
         *
         * Failures to write SHARED are irrecoverable and signaled
         * to the original issuer.
         */
        LLE_LOCAL = 1,

        /*
         * LLE_SHARED:
         *
         * Writes are issued to SHARED. As are reads.
         *
         * Failure to write to SHARED is irrecoverable.
         */
        LLE_SHARED = 2,
};
441

    
442
/* Short names for the LLE per-write context and the LLE driver state. */
typedef struct llecache_request         td_llecache_req_t;
typedef struct llecache                 td_llecache_t;

/* Number of entries in the LLE request pool: twice MAX_REQUESTS. */
#define TD_LLECACHE_MAX_REQ             (MAX_REQUESTS * 2)
445

    
446
struct llecache_request {
447
        td_llecache_t          *s;
448
        td_request_t            treq;
449
        int                     pending;
450
        int                     error;
451
};
452

    
453
struct llecache {
454
        td_image_t             *shared;
455
        int                     mode;
456

    
457
        td_llecache_req_t       reqv[TD_LLECACHE_MAX_REQ];
458
        td_llecache_req_t      *free[TD_LLECACHE_MAX_REQ];
459
        int                     n_free;
460
};
461

    
462
static td_llecache_req_t *
463
llecache_alloc_request(td_llecache_t *s)
464
{
465
        td_llecache_req_t *req = NULL;
466

    
467
        if (likely(s->n_free))
468
                req = s->free[--s->n_free];
469

    
470
        return req;
471
}
472

    
473
static void
474
llecache_free_request(td_llecache_t *s, td_llecache_req_t *req)
475
{
476
        BUG_ON(s->n_free >= TD_LLECACHE_MAX_REQ);
477
        s->free[s->n_free++] = req;
478
}
479

    
480
static int
481
llecache_close(td_driver_t *driver)
482
{
483
        td_llecache_t *s = driver->data;
484

    
485
        if (s->shared) {
486
                tapdisk_image_close(s->shared);
487
                s->shared = NULL;
488
        }
489

    
490
        return 0;
491
}
492

    
493
static int
494
llecache_open(td_driver_t *driver, const char *name, td_flag_t flags)
495
{
496
        td_llecache_t *s = driver->data;
497
        int i, err;
498

    
499
        s->mode = LLE_LOCAL;
500

    
501
        for (i = 0; i < TD_LLECACHE_MAX_REQ; i++)
502
                llecache_free_request(s, &s->reqv[i]);
503

    
504
        err = tapdisk_image_open(DISK_TYPE_VHD, name, flags, &s->shared);
505
        if (err)
506
                goto fail;
507

    
508
        driver->info = s->shared->driver->info;
509

    
510
        return 0;
511

    
512
fail:
513
        llecache_close(driver);
514
        return err;
515
}
516

    
517
static void
518
__llecache_write_cb(td_request_t treq, int error)
519
{
520
        td_llecache_req_t *req = treq.cb_data;
521
        td_llecache_t *s = req->s;
522

    
523
        BUG_ON(req->pending < treq.secs);
524

    
525
        req->pending -= treq.secs;
526
        req->error    = ll_write_error(req->error, error);
527

    
528
        if (req->pending)
529
                return;
530

    
531
        if (req->error == -ENOSPC) {
532
                ll_log_switch(DISK_TYPE_LLECACHE, req->error,
533
                              treq.image, s->shared);
534

    
535
                s->mode = LLE_SHARED;
536
                td_queue_write(s->shared, req->treq);
537

    
538
        } else
539
                td_complete_request(req->treq, error);
540

    
541
        llecache_free_request(s, req);
542
}
543

    
544
static void
545
llecache_forward_write(td_llecache_t *s, td_request_t treq)
546
{
547
        td_llecache_req_t *req;
548
        td_request_t clone;
549

    
550
        req = llecache_alloc_request(s);
551
        if (!req) {
552
                td_complete_request(treq, -EBUSY);
553
                return;
554
        }
555

    
556
        memset(req, 0, sizeof(req));
557

    
558
        req->treq       = treq;
559
        req->pending    = treq.secs;
560
        req->s          = s;
561

    
562
        clone           = treq;
563
        clone.cb        = __llecache_write_cb;
564
        clone.cb_data   = req;
565

    
566
        td_forward_request(clone);
567
}
568

    
569
static void
570
llecache_queue_write(td_driver_t *driver, td_request_t treq)
571
{
572
        td_llecache_t *s = driver->data;
573

    
574
        switch (s->mode) {
575
        case LLE_LOCAL:
576
                llecache_forward_write(s, treq);
577
                break;
578
        case LLE_SHARED:
579
                td_queue_write(s->shared, treq);
580
                break;
581
        }
582
}
583

    
584
static void
585
llecache_queue_read(td_driver_t *driver, td_request_t treq)
586
{
587
        td_llecache_t *s = driver->data;
588

    
589
        switch (s->mode) {
590
        case LLE_LOCAL:
591
                td_forward_request(treq);
592
                break;
593
        case LLE_SHARED:
594
                td_queue_read(s->shared, treq);
595
                break;
596
        default:
597
                BUG();
598
        }
599
}
600

    
601
struct tap_disk tapdisk_llecache = {
602
        .disk_type                  = "tapdisk_llecache",
603
        .flags                      = 0,
604
        .private_data_size          = sizeof(td_llecache_t),
605
        .td_open                    = llecache_open,
606
        .td_close                   = llecache_close,
607
        .td_queue_read              = llecache_queue_read,
608
        .td_queue_write             = llecache_queue_write,
609
        .td_get_parent_id           = llcache_get_parent_id,
610
        .td_validate_parent         = llcache_validate_parent,
611
};