/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "block/block_int.h"
#include "hw/hw.h"
#include "qemu/queue.h"
#include "qemu/timer.h"
#include "migration/block.h"
#include "migration/migration.h"
#include "sysemu/blockdev.h"
#include <assert.h>

#define BLOCK_SIZE                       (1 << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08

#define MAX_IS_ALLOCATED_SEARCH 65536

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockDriverState *bs;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Protected by block migration lock.  */
    unsigned long *aio_bitmap;
    int64_t completed_sectors;
    BdrvDirtyBitmap *dirty_bitmap;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    /* Written during setup phase.  Can be read without a lock.  */
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

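/* Stream format written by blk_send(): a big-endian 64-bit word holding
 * (sector << BDRV_SECTOR_BITS) | flags, a one-byte device name length,
 * the device name itself, followed by BLOCK_SIZE bytes of data unless
 * BLK_MIG_FLAG_ZERO_BLOCK is set, in which case the payload is omitted.
 * block_load() below is the matching reader.
 */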
static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    /* if a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * thus if we queue zero blocks we slow down the migration */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}


/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if ((sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                             int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

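/* Allocate one bit per BDRV_SECTORS_PER_DIRTY_CHUNK-sized chunk of the
 * device, rounded up to whole bytes; bmds_aio_inflight() and
 * bmds_set_aio_inflight() use these bits to track chunks that have an
 * asynchronous read in flight.
 */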
static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

/* Never hold migration lock when yielding to the main loop!  */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */

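/* Submit one asynchronous read for the bulk phase of a single device:
 * skip unallocated sectors when migrating on top of a shared base image,
 * align the cursor down to a chunk boundary, read up to
 * BDRV_SECTORS_PER_DIRTY_CHUNK sectors, and return 1 once the whole
 * device has been covered.
 */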
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_malloc(sizeof(BlkMigBlock));
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    qemu_mutex_lock_iothread();
    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}

/* Called with iothread lock taken.  */

static void set_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE);
    }
}

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
    }
}

static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
{
    BlkMigDevState *bmds;
    int64_t sectors;

    if (!bdrv_is_read_only(bs)) {
        sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
        if (sectors <= 0) {
            return;
        }

        bmds = g_malloc0(sizeof(BlkMigDevState));
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;
        alloc_aio_bitmap(bmds);
        bdrv_set_in_use(bs, 1);
        bdrv_ref(bs);

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            DPRINTF("Start migration for %s with shared base image\n",
                    bs->device_name);
        } else {
            DPRINTF("Start full migration for %s\n", bs->device_name);
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

static void init_blk_migration(QEMUFile *f)
{
    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    bdrv_iterate(init_blk_migration_it, NULL);
}

/* Called with no lock taken.  */

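/* Drive the bulk phase forward by one chunk on the first device that
 * still has bulk data, emit a BLK_MIG_FLAG_PROGRESS record whenever the
 * completed percentage changes, and return 1 while any device still has
 * bulk data left (0 once the bulk phase is finished on all devices).
 */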
static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        DPRINTF("Completed %d %%\r", progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock taken.  */

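/* Scan the dirty bitmap from bmds->cur_dirty and transfer at most one
 * dirty chunk per call: queue it as an asynchronous read when is_async
 * is set, otherwise read and send it synchronously.  If an asynchronous
 * read of that chunk is already in flight, all outstanding I/O is
 * drained first.  Returns 1 once the cursor has reached the end of the
 * device, 0 otherwise, and a negative errno on a synchronous read error.
 */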
static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            bdrv_drain_all();
        } else {
            blk_mig_unlock();
        }
        if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = g_malloc(sizeof(BlkMigBlock));
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    DPRINTF("Error reading sector %" PRId64 "\n", sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: few enough data for max_downtime
*/
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        ret = mig_save_device_dirty(f, bmds, is_async);
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

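/* Drain the queue of completed asynchronous reads onto the migration
 * stream: send each buffer with blk_send() until the queue is empty,
 * the file's rate limit is hit, or a failed read is found, in which
 * case its error code is returned.
 */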
static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        dirty += bdrv_get_dirty_count(bmds->bs, bmds->dirty_bitmap);
    }

    return dirty << BDRV_SECTOR_BITS;
}

/* Called with iothread lock taken.  */

static void blk_mig_cleanup(void)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    bdrv_drain_all();

    unset_dirty_tracking();

    blk_mig_lock();
    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_set_in_use(bmds->bs, 0);
        bdrv_unref(bmds->bs);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static void block_migration_cancel(void *opaque)
{
    blk_mig_cleanup();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live setup submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    init_blk_migration(f);

    /* start tracking dirty blocks */
    set_dirty_tracking();
    qemu_mutex_unlock_iothread();

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

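/* One iteration of the live phase: flush completed reads, then keep
 * issuing new reads while the amount of data in flight,
 * (submitted + read_done) * BLOCK_SIZE, stays below the file's rate
 * limit; the bulk phase is finished first, then dirty chunks are sent.
 */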
static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_ftell = qemu_ftell(f);

    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    return qemu_ftell(f) - last_ftell;
}

/* Called with iothread lock taken.  */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live complete submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* we know for sure that the bulk save has completed and
       all async reads have completed */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    DPRINTF("Block migration completed\n");

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    blk_mig_cleanup();
    return 0;
}

static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    blk_mig_lock();
    pending = get_remaining_dirty() +
                       block_mig_state.submitted * BLOCK_SIZE +
                       block_mig_state.read_done * BLOCK_SIZE;

    /* Report at least one block pending during bulk phase */
    if (pending == 0 && !block_mig_state.bulk_completed) {
        pending = BLOCK_SIZE;
    }
    blk_mig_unlock();
    qemu_mutex_unlock_iothread();

    DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
    return pending;
}

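/* Receiving side: each record starts with a big-endian 64-bit word of
 * (sector << BDRV_SECTOR_BITS) | flags.  DEVICE_BLOCK records carry a
 * device name plus either BLOCK_SIZE bytes of data or, for ZERO_BLOCK,
 * no payload; PROGRESS records carry a percentage in the sector field;
 * the loop terminates on EOS.
 */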
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs, *bs_prev = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (bs != bs_prev) {
                bs_prev = bs;
                total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = bdrv_write_zeroes(bs, addr, nr_sectors,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                buf = g_malloc(BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLOCK_SIZE);
                ret = bdrv_write(bs, addr, buf, nr_sectors);
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(const MigrationParams *params, void *opaque)
{
    block_mig_state.blk_enable = params->blk;
    block_mig_state.shared_base = params->shared;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= params->shared;
}

static bool block_is_active(void *opaque)
{
    return block_mig_state.blk_enable == 1;
}

SaveVMHandlers savevm_block_handlers = {
    .set_params = block_set_params,
    .save_live_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .cancel = block_migration_cancel,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}