Statistics
| Branch: | Revision:

root / block-migration.c @ 6ea44308

History | View | Annotate | Download (13.9 kB)

/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
    
14
#include "qemu-common.h"
15
#include "block_int.h"
16
#include "hw/hw.h"
17
#include "block-migration.h"
18
#include <assert.h>
19

    
20
/* Size in bytes of one migration chunk: one dirty-bitmap granule
 * (BDRV_SECTORS_PER_DIRTY_CHUNK sectors of 1 << BDRV_SECTOR_BITS bytes). */
#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)

/* Flags carried in the low bits of the on-the-wire sector address
 * (the address itself is chunk-aligned, so the low bits are free). */
#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02

/* Upper bound on sectors scanned per bdrv_is_allocated() call. */
#define MAX_IS_ALLOCATED_SEARCH 65536
/* NOTE(review): the three constants below are never referenced in this
 * file — candidates for removal. */
#define MAX_BLOCKS_READ 10000
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100

//#define DEBUG_BLK_MIGRATION

/* dprintf() expands to nothing unless DEBUG_BLK_MIGRATION is defined, so
 * its arguments are not even compiled in a normal build. */
#ifdef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define dprintf(fmt, ...) \
    do { } while (0)
#endif
39

    
40
/* Per-device migration state, kept on a singly linked list headed by
 * block_mig_state->bmds_first. */
typedef struct BlkMigDevState {
    BlockDriverState *bs;        /* device being migrated */
    int bulk_completed;          /* 1 once the bulk phase finished for bs */
    int shared_base;             /* skip sectors that live in the shared
                                    backing image */
    struct BlkMigDevState *next; /* next device on the list */
    int64_t cur_sector;          /* next sector the bulk phase will send */
    int64_t total_sectors;       /* device length in sectors */
    int64_t dirty;               /* NOTE(review): never referenced in this
                                    file — confirm before removing */
} BlkMigDevState;
49

    
50
/* One asynchronous read of a BLOCK_SIZE chunk; after completion it sits on
 * block_mig_state's first_blk/last_blk queue until flush_blks() sends it. */
typedef struct BlkMigBlock {
    uint8_t *buf;              /* BLOCK_SIZE data buffer */
    BlkMigDevState *bmds;      /* owning device */
    int64_t sector;            /* first sector covered by buf */
    struct iovec iov;          /* single-element iovec over buf */
    QEMUIOVector qiov;         /* wraps iov for bdrv_aio_readv() */
    BlockDriverAIOCB *aiocb;   /* in-flight AIO request handle */
    int ret;                   /* completion status, set by blk_mig_read_cb() */
    struct BlkMigBlock *next;  /* next block on the send queue */
} BlkMigBlock;
60

    
61
/* Global block-migration state (singleton; see block_mig_state). */
typedef struct BlkMigState {
    int bulk_completed;          /* 1 once every device finished bulk phase */
    int blk_enable;              /* migrate storage at all? */
    int shared_base;             /* both sides share a base image */
    int no_dirty;                /* NOTE(review): never referenced here */
    QEMUFile *load_file;         /* NOTE(review): never referenced here */
    BlkMigDevState *bmds_first;  /* head of per-device state list */
    BlkMigBlock *first_blk;      /* head of completed-read send queue */
    BlkMigBlock *last_blk;       /* tail of completed-read send queue */
    int submitted;               /* AIO reads issued, not yet completed */
    int read_done;               /* blocks read and waiting to be sent */
    int transferred;             /* blocks already written to the stream */
    int64_t print_completion;    /* next progress-print threshold, sectors */
} BlkMigState;
75

    
76
static BlkMigState *block_mig_state = NULL;
77

    
78
static void blk_mig_read_cb(void *opaque, int ret)
79
{
80
    BlkMigBlock *blk = opaque;
81

    
82
    blk->ret = ret;
83

    
84
    /* insert at the end */
85
    if (block_mig_state->last_blk == NULL) {
86
        block_mig_state->first_blk = blk;
87
        block_mig_state->last_blk = blk;
88
    } else {
89
        block_mig_state->last_blk->next = blk;
90
        block_mig_state->last_blk = blk;
91
    }
92

    
93
    block_mig_state->submitted--;
94
    block_mig_state->read_done++;
95
    assert(block_mig_state->submitted >= 0);
96
}
97

    
98
/* Issue one asynchronous BLOCK_SIZE read for the bulk phase of @bms and
 * advance bms->cur_sector past the submitted chunk.  The data itself is
 * sent later by flush_blks() once blk_mig_read_cb() queues the block.
 *
 * Returns 1 when the device's bulk phase is complete (nothing left to
 * read), 0 otherwise.
 *
 * NOTE(review): 0 is also returned when bdrv_aio_readv() fails, so the
 * caller will simply retry the same chunk — confirm this is intended. */
static int mig_read_device_bulk(QEMUFile *f, BlkMigDevState *bms)
{
    int nr_sectors;
    int64_t total_sectors, cur_sector = 0;
    BlockDriverState *bs = bms->bs;
    BlkMigBlock *blk;

    blk = qemu_malloc(sizeof(BlkMigBlock));
    blk->buf = qemu_malloc(BLOCK_SIZE);

    cur_sector = bms->cur_sector;
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;

    if (bms->shared_base) {
        /* Skip over runs of sectors that live in the shared base image. */
        while (cur_sector < bms->total_sectors &&
               !bdrv_is_allocated(bms->bs, cur_sector,
                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        /* bulk phase finished for this device */
        bms->cur_sector = total_sectors;
        qemu_free(blk->buf);
        qemu_free(blk);
        return 1;
    }

    /* periodic progress output on the monitor/console */
    if (cur_sector >= block_mig_state->print_completion) {
        printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
        fflush(stdout);
        block_mig_state->print_completion +=
            (BDRV_SECTORS_PER_DIRTY_CHUNK * 10000);
    }

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    /* align down to a dirty-chunk boundary */
    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* clamp the final, possibly partial, chunk to the device end */
    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = (total_sectors - cur_sector);
    }

    bms->cur_sector = cur_sector + nr_sectors;
    blk->sector = cur_sector;
    blk->bmds = bms;
    blk->next = NULL;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    if (!blk->aiocb) {
        printf("Error reading sector %" PRId64 "\n", cur_sector);
        qemu_free(blk->buf);
        qemu_free(blk);
        return 0;
    }

    /* Mark the chunk clean now; a guest write before it is sent will
     * re-dirty it and stage 3 will resend it. */
    bdrv_reset_dirty(bms->bs, cur_sector, nr_sectors);
    block_mig_state->submitted++;

    return (bms->cur_sector >= total_sectors);
}
166

    
167
/* Synchronously read one chunk of the bulk phase of @bmds and write it to
 * the migration stream (used at stage 3, and by the non-async path).
 *
 * Returns 1 when the device's bulk phase is complete, 0 otherwise.
 *
 * Fix: the buffer is now zero-allocated.  A full BLOCK_SIZE is always put
 * on the wire, but only nr_sectors are read from disk; with plain
 * qemu_malloc() a short final chunk leaked uninitialized heap memory into
 * the migration stream. */
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int len, nr_sectors;
    int64_t total_sectors = bmds->total_sectors, cur_sector = 0;
    uint8_t *tmp_buf = NULL;
    BlockDriverState *bs = bmds->bs;

    /* zero-filled so the unread tail of a partial chunk is not garbage */
    tmp_buf = qemu_mallocz(BLOCK_SIZE);

    cur_sector = bmds->cur_sector;

    if (bmds->shared_base) {
        /* Skip over runs of sectors that live in the shared base image. */
        while (cur_sector < bmds->total_sectors &&
               !bdrv_is_allocated(bmds->bs, cur_sector,
                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        /* bulk phase finished for this device */
        bmds->cur_sector = total_sectors;
        qemu_free(tmp_buf);
        return 1;
    }

    /* periodic progress output */
    if (cur_sector >= block_mig_state->print_completion) {
        printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
        fflush(stdout);
        block_mig_state->print_completion +=
            (BDRV_SECTORS_PER_DIRTY_CHUNK * 10000);
    }

    /* align down to a dirty-chunk boundary */
    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    /* clamp the final, possibly partial, chunk to the device end */
    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = (total_sectors - cur_sector);
    }

    if (bdrv_read(bs, cur_sector, tmp_buf, nr_sectors) < 0) {
        printf("Error reading sector %" PRId64 "\n", cur_sector);
        /* FIXME: the (zeroed) buffer is still sent below */
    }

    /* clean now; a guest write before migration ends re-dirties it */
    bdrv_reset_dirty(bs, cur_sector, nr_sectors);

    /* sector number and flags */
    qemu_put_be64(f, (cur_sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)bs->device_name, len);

    /* payload: always a full chunk, matching block_load() */
    qemu_put_buffer(f, tmp_buf, BLOCK_SIZE);

    bmds->cur_sector = cur_sector + BDRV_SECTORS_PER_DIRTY_CHUNK;

    qemu_free(tmp_buf);

    return (bmds->cur_sector >= total_sectors);
}
231

    
232
/* Write one completed block to the migration stream: flag-tagged sector
 * address, length-prefixed device name, then BLOCK_SIZE of payload. */
static void send_blk(QEMUFile *f, BlkMigBlock * blk)
{
    BlockDriverState *bs = blk->bmds->bs;
    int name_len;

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* length-prefixed device name */
    name_len = strlen(bs->device_name);
    qemu_put_byte(f, name_len);
    qemu_put_buffer(f, (uint8_t *)bs->device_name, name_len);

    /* payload */
    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}
247

    
248
/* Placeholder: per-device header information is not sent yet. */
static void blk_mig_save_dev_info(QEMUFile *f, BlkMigDevState *bmds)
{
}
251

    
252
static void set_dirty_tracking(int enable)
253
{
254
    BlkMigDevState *bmds;
255
    for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
256
        bdrv_set_dirty_tracking(bmds->bs, enable);
257
    }
258
}
259

    
260
/* Stage-1 setup: build a BlkMigDevState for every hard-disk drive,
 * appending each to the global device list, and announce the migration
 * mode chosen for it. */
static void init_blk_migration(QEMUFile *f)
{
    BlkMigDevState **pbmds, *bmds;
    BlockDriverState *bs;

    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
        if (bs->type != BDRV_TYPE_HD) {
            continue;           /* only hard disks are migrated */
        }

        bmds = qemu_mallocz(sizeof(BlkMigDevState));
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
        bmds->shared_base = block_mig_state->shared_base;

        if (bmds->shared_base) {
            printf("Start migration for %s with shared base image\n",
                   bs->device_name);
        } else {
            printf("Start full migration for %s\n", bs->device_name);
        }

        /* walk to the tail of the device list and append */
        for (pbmds = &block_mig_state->bmds_first;
             *pbmds != NULL;
             pbmds = &(*pbmds)->next) {
            /* nothing */
        }
        *pbmds = bmds;

        blk_mig_save_dev_info(f, bmds);
    }
}
291

    
292
/* Advance the bulk phase by one chunk on the first device that still has
 * bulk work pending.  is_async selects the AIO path (stage 2) or the
 * synchronous path (stage 3).
 *
 * Returns 1 while bulk work remains on some device; returns 0 — and sets
 * block_mig_state->bulk_completed — once all devices are done. */
static int blk_mig_save_bulked_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *dev;

    for (dev = block_mig_state->bmds_first; dev != NULL; dev = dev->next) {
        int finished;

        if (dev->bulk_completed != 0) {
            continue;
        }

        finished = is_async ? mig_read_device_bulk(f, dev)
                            : mig_save_device_bulk(f, dev);
        if (finished == 1) {
            /* completed bulk section for this device */
            dev->bulk_completed = 1;
        }
        return 1;
    }

    /* reaching here means every device finished its bulk section */
    block_mig_state->bulk_completed = 1;

    return 0;
}
318

    
319
#define MAX_NUM_BLOCKS 4
320

    
321
/* Stage-3 pass: synchronously read and send every chunk that the guest
 * dirtied after the bulk phase cleaned it.
 *
 * Fix: the scratch buffer is heap-allocated.  The previous
 * `uint8_t buf[BLOCK_SIZE]` placed a full dirty-chunk-sized array on the
 * stack, risking stack overflow for large chunk sizes.
 *
 * NOTE(review): near the device end the full-chunk bdrv_read() may extend
 * past the last sector — confirm against the block layer's behavior. */
static void blk_mig_save_dirty_blocks(QEMUFile *f)
{
    BlkMigDevState *bmds;
    uint8_t *buf;
    int64_t sector;
    int len;

    /* heap scratch buffer, one full chunk */
    buf = qemu_malloc(BLOCK_SIZE);

    for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
        for (sector = 0; sector < bmds->cur_sector;) {
            if (bdrv_get_dirty(bmds->bs, sector)) {
                if (bdrv_read(bmds->bs, sector, buf,
                              BDRV_SECTORS_PER_DIRTY_CHUNK) < 0) {
                    /* FIXME: add error handling */
                }

                /* sector number and flags */
                qemu_put_be64(f, (sector << BDRV_SECTOR_BITS)
                                 | BLK_MIG_FLAG_DEVICE_BLOCK);

                /* device name */
                len = strlen(bmds->bs->device_name);
                qemu_put_byte(f, len);
                qemu_put_buffer(f, (uint8_t *)bmds->bs->device_name, len);

                qemu_put_buffer(f, buf, BLOCK_SIZE);

                bdrv_reset_dirty(bmds->bs, sector,
                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
            }
            sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        }
    }

    qemu_free(buf);
}
354

    
355
/* Drain the completed-read queue onto the migration stream, stopping early
 * when the stream's rate limit is hit.  Each sent block is freed and the
 * read_done/transferred counters are updated.
 *
 * Fix: the entry trace referenced bare `submitted'/`read_done'/`transfered'
 * (identifiers not in scope) and passed three arguments to a format string
 * with only two conversions; it only compiled because dprintf() expands to
 * nothing unless DEBUG_BLK_MIGRATION is defined. */
static void flush_blks(QEMUFile* f)
{
    BlkMigBlock *blk, *next;

    dprintf("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state->submitted,
            block_mig_state->read_done, block_mig_state->transferred);

    for (blk = block_mig_state->first_blk;
         blk != NULL && !qemu_file_rate_limit(f);
         blk = next) {
        send_blk(f, blk);

        next = blk->next;
        qemu_free(blk->buf);
        qemu_free(blk);

        block_mig_state->read_done--;
        block_mig_state->transferred++;
        assert(block_mig_state->read_done >= 0);
    }
    /* blk is the first unsent block (or NULL if the queue drained) */
    block_mig_state->first_blk = blk;

    if (block_mig_state->first_blk == NULL) {
        block_mig_state->last_blk = NULL;
    }

    dprintf("%s Exit submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state->submitted,
            block_mig_state->read_done, block_mig_state->transferred);
}
385

    
386
static int is_stage2_completed(void)
387
{
388
    BlkMigDevState *bmds;
389

    
390
    if (block_mig_state->submitted > 0) {
391
        return 0;
392
    }
393

    
394
    for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
395
        if (bmds->bulk_completed == 0) {
396
            return 0;
397
        }
398
    }
399

    
400
    return 1;
401
}
402

    
403
/* Live-migration save callback for the "block" section.
 *
 * Stage 1 builds the device list and starts dirty tracking; stage 2
 * streams bulk chunks under the bandwidth limit; stage 3 (VM stopped)
 * synchronously finishes the bulk phase and sends all remaining dirty
 * chunks.  Returns nonzero when this section has no more data to send.
 *
 * Fix: the entry trace referenced bare `submitted'/`transferred', which
 * are not in scope; it only compiled because dprintf() is a no-op unless
 * DEBUG_BLK_MIGRATION is defined. */
static int block_save_live(QEMUFile *f, int stage, void *opaque)
{
    dprintf("Enter save live stage %d submitted %d transferred %d\n", stage,
            block_mig_state->submitted, block_mig_state->transferred);

    if (block_mig_state->blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(f);

        /* start track dirty blocks */
        set_dirty_tracking(1);
    }

    /* send whatever earlier AIO reads have already completed */
    flush_blks(f);

    /* control the rate of transfer */
    while ((block_mig_state->submitted +
            block_mig_state->read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        if (blk_mig_save_bulked_block(f, 1) == 0) {
            /* no more bulk blocks for now */
            break;
        }
    }

    flush_blks(f);

    if (stage == 3) {
        /* VM is stopped: finish the bulk phase synchronously */
        while (blk_mig_save_bulked_block(f, 0) != 0) {
            /* empty */
        }

        blk_mig_save_dirty_blocks(f);

        /* stop track dirty blocks */
        set_dirty_tracking(0);

        printf("\nBlock migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}
452

    
453
/* Incoming side: parse the "block" section stream produced by
 * block_save_live()/send_blk() and write each chunk to the named device.
 * Records are processed until one carrying BLK_MIG_FLAG_EOS is seen. */
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    char device_name[256];
    BlockDriverState *bs;
    int64_t addr;
    int flags, len;
    uint8_t *buf = qemu_malloc(BLOCK_SIZE);

    for (;;) {
        addr = qemu_get_be64(f);

        /* low bits carry the record flags, the rest the sector number */
        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* length-prefixed device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);

            /* always consume the payload, even for an unknown device */
            qemu_get_buffer(f, buf, BLOCK_SIZE);
            if (bs == NULL) {
                printf("Error unknown block device %s\n", device_name);
                /* FIXME: add error handling */
            } else {
                bdrv_write(bs, addr, buf, BDRV_SECTORS_PER_DIRTY_CHUNK);
            }
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            printf("Unknown flags\n");
            /* FIXME: add error handling */
        }

        if (flags & BLK_MIG_FLAG_EOS) {
            break;
        }
    }

    qemu_free(buf);

    return 0;
}
495

    
496
static void block_set_params(int blk_enable, int shared_base, void *opaque)
497
{
498
    assert(opaque == block_mig_state);
499

    
500
    block_mig_state->blk_enable = blk_enable;
501
    block_mig_state->shared_base = shared_base;
502

    
503
    /* shared base means that blk_enable = 1 */
504
    block_mig_state->blk_enable |= shared_base;
505
}
506

    
507
void blk_mig_info(void)
508
{
509
    BlockDriverState *bs;
510

    
511
    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
512
        printf("Device %s\n", bs->device_name);
513
        if (bs->type == BDRV_TYPE_HD) {
514
            printf("device %s format %s\n",
515
                   bs->device_name, bs->drv->format_name);
516
        }
517
    }
518
}
519

    
520
/* Allocate the global migration state and register the "block" live
 * savevm section (params: block_set_params, save: block_save_live,
 * load: block_load). */
void blk_mig_init(void)
{
    block_mig_state = qemu_mallocz(sizeof(BlkMigState));

    register_savevm_live("block", 0, 1, block_set_params, block_save_live,
                         NULL, block_load, block_mig_state);
}