/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "block/block_int.h"
#include "hw/hw.h"
#include "qemu/queue.h"
#include "qemu/timer.h"
#include "migration/block.h"
#include "migration/migration.h"
#include "sysemu/blockdev.h"
#include <assert.h>

#define BLOCK_SIZE                       (1 << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)

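/* The flags below travel in the low bits of the sector-aligned, 64-bit
 * offset that precedes each block on the wire (see blk_send() and
 * block_load()). */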
#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08

#define MAX_IS_ALLOCATED_SEARCH 65536

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockDriverState *bs;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Protected by block migration lock.  */
    unsigned long *aio_bitmap;
    int64_t completed_sectors;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    /* Written during setup phase.  Can be read without a lock.  */
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    /* If a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * Thus, if we queue zero blocks we slow the migration down. */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

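/* Accounting helpers used by the generic migration code to report block
 * migration progress; all values are in bytes. */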
int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}


/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if ((sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                             int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

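/* Allocate one bit per BDRV_SECTORS_PER_DIRTY_CHUNK chunk, rounded up to
 * whole bytes; the bitmap records which chunks have an AIO read in flight. */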
static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

/* Never hold migration lock when yielding to the main loop!  */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */

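/* Queue an asynchronous read for the next chunk of the bulk phase and
 * advance this device's cursor.  Returns 1 once the whole device has been
 * queued, 0 otherwise. */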
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_malloc(sizeof(BlkMigBlock));
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    qemu_mutex_lock_iothread();
    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}

/* Called with iothread lock taken.  */

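/* Enable or disable dirty tracking, at BLOCK_SIZE granularity, on every
 * device queued for migration. */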
static void set_dirty_tracking(int enable)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_set_dirty_tracking(bmds->bs, enable ? BLOCK_SIZE : 0);
    }
}

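/* bdrv_iterate() callback: queue a BlkMigDevState for every writable block
 * device so that it is included in the migration. */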
static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
{
    BlkMigDevState *bmds;
    int64_t sectors;

    if (!bdrv_is_read_only(bs)) {
        sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
        if (sectors <= 0) {
            return;
        }

        bmds = g_malloc0(sizeof(BlkMigDevState));
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;
        alloc_aio_bitmap(bmds);
        drive_get_ref(drive_get_by_blockdev(bs));
        bdrv_set_in_use(bs, 1);

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            DPRINTF("Start migration for %s with shared base image\n",
                    bs->device_name);
        } else {
            DPRINTF("Start full migration for %s\n", bs->device_name);
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

static void init_blk_migration(QEMUFile *f)
{
    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    bdrv_iterate(init_blk_migration_it, NULL);
}

/* Called with no lock taken.  */

static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        DPRINTF("Completed %d %%\r", progress);
    }

    return ret;
}

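/* Restart the dirty-chunk scan from the beginning of every device. */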
static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock taken.  */

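/* Scan forward from bmds->cur_dirty for a dirty chunk and transfer it,
 * either synchronously or as an asynchronous read (is_async).  Returns 1
 * once the scan has reached the end of the device, 0 after handling one
 * chunk, or a negative errno on a read error. */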
static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            bdrv_drain_all();
        } else {
            blk_mig_unlock();
        }
        if (bdrv_get_dirty(bmds->bs, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = g_malloc(sizeof(BlkMigBlock));
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    DPRINTF("Error reading sector %" PRId64 "\n", sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: little enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        ret = mig_save_device_dirty(f, bmds, is_async);
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

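/* Send the reads that have completed so far, stopping at the rate limit;
 * returns the first error reported by any completed request. */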
static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        dirty += bdrv_get_dirty_count(bmds->bs);
    }

    return dirty << BDRV_SECTOR_BITS;
}

/* Called with iothread lock taken.  */

static void blk_mig_cleanup(void)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    bdrv_drain_all();

    set_dirty_tracking(0);

    blk_mig_lock();
    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_set_in_use(bmds->bs, 0);
        drive_put_ref(drive_get_by_blockdev(bmds->bs));
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static void block_migration_cancel(void *opaque)
{
    blk_mig_cleanup();
}

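/* SaveVM "setup" stage: enumerate the devices to migrate, enable dirty
 * tracking on them, then flush any completed reads and send an EOS marker. */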
static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live setup submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    init_blk_migration(f);

    /* start tracking dirty blocks */
    set_dirty_tracking(1);
    qemu_mutex_unlock_iothread();

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

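/* SaveVM iterative stage: drain completed reads to the stream, then keep
 * queueing bulk or dirty reads until the rate limit for this round is hit.
 * Returns the number of bytes written, or a negative error. */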
static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_ftell = qemu_ftell(f);

    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    return qemu_ftell(f) - last_ftell;
}

/* Called with iothread lock taken.  */

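/* Final pass: flush any outstanding reads, send every remaining dirty chunk
 * synchronously, report 100% progress and EOS, then release all block
 * migration state. */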
static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live complete submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* we know for sure that the bulk save has completed and
       all async reads have completed */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    DPRINTF("Block migration completed\n");

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    blk_mig_cleanup();
    return 0;
}

static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    blk_mig_lock();
    pending = get_remaining_dirty() +
                       block_mig_state.submitted * BLOCK_SIZE +
                       block_mig_state.read_done * BLOCK_SIZE;

    /* Report at least one block pending during bulk phase */
    if (pending == 0 && !block_mig_state.bulk_completed) {
        pending = BLOCK_SIZE;
    }
    blk_mig_unlock();
    qemu_mutex_unlock_iothread();

    DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
    return pending;
}

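/* Destination side: read blocks from the stream and write them into the
 * named local devices until the EOS flag is seen. */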
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs, *bs_prev = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (bs != bs_prev) {
                bs_prev = bs;
                total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = bdrv_write_zeroes(bs, addr, nr_sectors);
            } else {
                buf = g_malloc(BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLOCK_SIZE);
                ret = bdrv_write(bs, addr, buf, nr_sectors);
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

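/* Block migration is only active when a full (-b) or incremental (-i) disk
 * copy was requested on the migrate command. */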
static void block_set_params(const MigrationParams *params, void *opaque)
{
    block_mig_state.blk_enable = params->blk;
    block_mig_state.shared_base = params->shared;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= params->shared;
}

static bool block_is_active(void *opaque)
{
    return block_mig_state.blk_enable == 1;
}

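/* Hooks wiring block migration into the generic savevm/migration code. */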
SaveVMHandlers savevm_block_handlers = {
    .set_params = block_set_params,
    .save_live_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .cancel = block_migration_cancel,
    .is_active = block_is_active,
};

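/* Initialise the queues and lock, and register the "block" live-savevm
 * section. */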
void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}
}