root / block-migration.c @ a55eb92c
History | View | Annotate | Download (14.3 kB)
1 |
/*
|
---|---|
2 |
* QEMU live block migration
|
3 |
*
|
4 |
* Copyright IBM, Corp. 2009
|
5 |
*
|
6 |
* Authors:
|
7 |
* Liran Schour <lirans@il.ibm.com>
|
8 |
*
|
9 |
* This work is licensed under the terms of the GNU GPL, version 2. See
|
10 |
* the COPYING file in the top-level directory.
|
11 |
*
|
12 |
*/
|
13 |
|
14 |
#include "qemu-common.h" |
15 |
#include "block_int.h" |
16 |
#include "hw/hw.h" |
17 |
#include "block-migration.h" |
18 |
#include <assert.h> |
19 |
|
20 |
/* Sector geometry used by the migration stream: 512-byte sectors. */
#define SECTOR_BITS 9
#define SECTOR_SIZE (1 << SECTOR_BITS)
/*
 * Mask that clears the sub-sector bits of a byte address.  Fully
 * parenthesised and without a trailing semicolon so that it can be used
 * anywhere inside an expression.
 */
#define SECTOR_MASK (~(SECTOR_SIZE - 1))

/* Size in bytes of one migration chunk; depends on the runtime state. */
#define BLOCK_SIZE (block_mig_state->sectors_per_block << SECTOR_BITS)

/* Flags carried in the low (sub-sector) bits of each 64-bit stream header. */
#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02

/* Upper bound on the sector count passed to one bdrv_is_allocated() query. */
#define MAX_IS_ALLOCATED_SEARCH 65536
#define MAX_BLOCKS_READ 10000
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100
33 |
|
34 |
/* Uncomment to get debug tracing on stdout. */
//#define DEBUG_BLK_MIGRATION

/*
 * dprintf(): printf-style trace prefixed with "blk_migration: ".
 * Expands to an empty statement (arguments are not evaluated) unless
 * DEBUG_BLK_MIGRATION is defined.
 */
#ifndef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { } while (0)
#else
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#endif
|
43 |
|
44 |
typedef struct BlkMigDevState { |
45 |
BlockDriverState *bs; |
46 |
int bulk_completed;
|
47 |
int shared_base;
|
48 |
struct BlkMigDevState *next;
|
49 |
int64_t cur_sector; |
50 |
int64_t total_sectors; |
51 |
int64_t dirty; |
52 |
} BlkMigDevState; |
53 |
|
54 |
typedef struct BlkMigBlock { |
55 |
uint8_t *buf; |
56 |
BlkMigDevState *bmds; |
57 |
int64_t sector; |
58 |
struct iovec iov;
|
59 |
QEMUIOVector qiov; |
60 |
BlockDriverAIOCB *aiocb; |
61 |
int ret;
|
62 |
struct BlkMigBlock *next;
|
63 |
} BlkMigBlock; |
64 |
|
65 |
typedef struct BlkMigState { |
66 |
int bulk_completed;
|
67 |
int blk_enable;
|
68 |
int shared_base;
|
69 |
int no_dirty;
|
70 |
QEMUFile *load_file; |
71 |
BlkMigDevState *bmds_first; |
72 |
int sectors_per_block;
|
73 |
BlkMigBlock *first_blk; |
74 |
BlkMigBlock *last_blk; |
75 |
int submitted;
|
76 |
int read_done;
|
77 |
int transferred;
|
78 |
int64_t print_completion; |
79 |
} BlkMigState; |
80 |
|
81 |
static BlkMigState *block_mig_state = NULL; |
82 |
|
83 |
static void blk_mig_read_cb(void *opaque, int ret) |
84 |
{ |
85 |
BlkMigBlock *blk = opaque; |
86 |
|
87 |
blk->ret = ret; |
88 |
|
89 |
/* insert at the end */
|
90 |
if (block_mig_state->last_blk == NULL) { |
91 |
block_mig_state->first_blk = blk; |
92 |
block_mig_state->last_blk = blk; |
93 |
} else {
|
94 |
block_mig_state->last_blk->next = blk; |
95 |
block_mig_state->last_blk = blk; |
96 |
} |
97 |
|
98 |
block_mig_state->submitted--; |
99 |
block_mig_state->read_done++; |
100 |
assert(block_mig_state->submitted >= 0);
|
101 |
} |
102 |
|
103 |
/*
 * Submit an asynchronous read for the next bulk chunk of a device.
 *
 * When the device uses a shared base image, sectors reported as
 * unallocated are skipped first.  The chunk start is aligned down to a
 * multiple of sectors_per_block and the chunk is shortened if it would
 * run past the end of the device.  On completion blk_mig_read_cb() queues
 * the block for sending.
 *
 * f:   migration stream (unused here)
 * bms: per-device state; cur_sector is advanced past the submitted chunk
 *
 * Returns 1 when the bulk pass has reached the end of the device, 0
 * otherwise (including when the read request could not be submitted).
 */
static int mig_read_device_bulk(QEMUFile *f, BlkMigDevState *bms)
{
    int nr_sectors;
    int64_t total_sectors, cur_sector = 0;
    BlockDriverState *bs = bms->bs;
    BlkMigBlock *blk;

    cur_sector = bms->cur_sector;
    total_sectors = bdrv_getlength(bs) >> SECTOR_BITS;

    if (bms->shared_base) {
        while (cur_sector < bms->total_sectors &&
               !bdrv_is_allocated(bms->bs, cur_sector,
                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    /* nothing left to read: return before allocating anything */
    if (cur_sector >= total_sectors) {
        bms->cur_sector = total_sectors;
        return 1;
    }

    if (cur_sector >= block_mig_state->print_completion) {
        printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
        fflush(stdout);
        block_mig_state->print_completion +=
            (block_mig_state->sectors_per_block * 10000);
    }

    /* we are going to transfer BLOCK_SIZE anyway even if it is not
       allocated */
    nr_sectors = block_mig_state->sectors_per_block;

    /* align the chunk start down to a chunk boundary */
    cur_sector &= ~((int64_t)block_mig_state->sectors_per_block - 1);

    if (total_sectors - cur_sector < block_mig_state->sectors_per_block) {
        nr_sectors = (total_sectors - cur_sector);
    }

    blk = qemu_malloc(sizeof(BlkMigBlock));
    blk->buf = qemu_malloc(BLOCK_SIZE);
    /*
     * send_blk() always transmits BLOCK_SIZE bytes.  Zero the buffer so
     * that a short final chunk or a failed read never puts uninitialised
     * heap memory on the wire.
     */
    memset(blk->buf, 0, BLOCK_SIZE);

    bms->cur_sector = cur_sector + nr_sectors;
    blk->sector = cur_sector;
    blk->bmds = bms;
    blk->next = NULL;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    if (!blk->aiocb) {
        printf("Error reading sector %" PRId64 "\n", cur_sector);
        qemu_free(blk->buf);
        qemu_free(blk);
        return 0;
    }

    bdrv_reset_dirty(bms->bs, cur_sector, nr_sectors);
    block_mig_state->submitted++;

    return (bms->cur_sector >= total_sectors);
}
171 |
|
172 |
/*
 * Synchronously read the next bulk chunk of a device and write it to the
 * migration stream.
 *
 * Stream record: a 64-bit header (byte address of the chunk with
 * BLK_MIG_FLAG_DEVICE_BLOCK in the low bits), one length byte, the device
 * name, and BLOCK_SIZE bytes of data.
 *
 * f:    migration stream
 * bmds: per-device state; cur_sector is advanced by one full chunk
 *
 * Returns 1 when the bulk pass has reached the end of the device,
 * 0 otherwise.
 */
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int len, nr_sectors;
    int64_t total_sectors = bmds->total_sectors, cur_sector = 0;
    uint8_t *tmp_buf = NULL;
    BlockDriverState *bs = bmds->bs;

    tmp_buf = qemu_malloc(BLOCK_SIZE);
    /*
     * BLOCK_SIZE bytes are always transmitted below.  Zero the buffer so
     * that a short final chunk or a failed read never puts uninitialised
     * heap memory on the wire.
     */
    memset(tmp_buf, 0, BLOCK_SIZE);

    cur_sector = bmds->cur_sector;

    if (bmds->shared_base) {
        while (cur_sector < bmds->total_sectors &&
               !bdrv_is_allocated(bmds->bs, cur_sector,
                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = total_sectors;
        qemu_free(tmp_buf);
        return 1;
    }

    if (cur_sector >= block_mig_state->print_completion) {
        printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
        fflush(stdout);
        block_mig_state->print_completion +=
            (block_mig_state->sectors_per_block * 10000);
    }

    /* align the chunk start down to a chunk boundary */
    cur_sector &= ~((int64_t)block_mig_state->sectors_per_block - 1);

    /* we are going to transfer BLOCK_SIZE anyway even if it is not
       allocated */
    nr_sectors = block_mig_state->sectors_per_block;

    if (total_sectors - cur_sector < block_mig_state->sectors_per_block) {
        nr_sectors = (total_sectors - cur_sector);
    }

    /* a read failure is reported but the (zero-padded) chunk is still sent */
    if (bdrv_read(bs, cur_sector, tmp_buf, nr_sectors) < 0) {
        printf("Error reading sector %" PRId64 "\n", cur_sector);
    }

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);

    /* sector number and flags */
    qemu_put_be64(f, (cur_sector << SECTOR_BITS) | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)bs->device_name, len);

    qemu_put_buffer(f, tmp_buf, BLOCK_SIZE);

    bmds->cur_sector = cur_sector + block_mig_state->sectors_per_block;

    qemu_free(tmp_buf);

    return (bmds->cur_sector >= total_sectors);
}
235 |
|
236 |
/*
 * Write one previously read chunk to the migration stream: a 64-bit
 * header (byte address with BLK_MIG_FLAG_DEVICE_BLOCK in the low bits),
 * one length byte, the device name, then BLOCK_SIZE bytes of data.
 */
static void send_blk(QEMUFile *f, BlkMigBlock * blk)
{
    BlockDriverState *dev = blk->bmds->bs;
    int name_len = strlen(dev->device_name);

    /* header: sector address and flags */
    qemu_put_be64(f, (blk->sector << SECTOR_BITS) | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* length-prefixed device name */
    qemu_put_byte(f, name_len);
    qemu_put_buffer(f, (uint8_t *)dev->device_name, name_len);

    /* payload */
    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}
250 |
|
251 |
/*
 * Hook called once per device from init_blk_migration().
 * Currently a no-op: nothing is written to the stream.
 */
static void blk_mig_save_dev_info(QEMUFile *f, BlkMigDevState *bmds)
{
}
254 |
|
255 |
static void set_dirty_tracking(int enable) |
256 |
{ |
257 |
BlkMigDevState *bmds; |
258 |
for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { |
259 |
bdrv_set_dirty_tracking(bmds->bs, enable); |
260 |
} |
261 |
} |
262 |
|
263 |
/*
 * Build the list of devices to migrate: one BlkMigDevState is appended to
 * block_mig_state->bmds_first for every block device of type BDRV_TYPE_HD.
 * Finally the chunk size (sectors_per_block) is fetched from the block
 * layer.
 */
static void init_blk_migration(QEMUFile *f)
{
    BlockDriverState *drive;
    BlkMigDevState *entry;
    /* insertion point; advanced to the list end before each append */
    BlkMigDevState **tail = &block_mig_state->bmds_first;

    for (drive = bdrv_first; drive != NULL; drive = drive->next) {
        if (drive->type != BDRV_TYPE_HD) {
            continue;
        }

        entry = qemu_mallocz(sizeof(BlkMigDevState));
        entry->bs = drive;
        entry->bulk_completed = 0;
        entry->total_sectors = bdrv_getlength(drive) >> SECTOR_BITS;
        entry->shared_base = block_mig_state->shared_base;

        if (!entry->shared_base) {
            printf("Start full migration for %s\n", drive->device_name);
        } else {
            printf("Start migration for %s with shared base image\n",
                   drive->device_name);
        }

        /* insert at the end */
        while (*tail != NULL) {
            tail = &(*tail)->next;
        }
        *tail = entry;

        blk_mig_save_dev_info(f, entry);
    }

    block_mig_state->sectors_per_block = bdrv_get_sectors_per_chunk();
}
296 |
|
297 |
/*
 * Process one bulk chunk of the first device whose bulk pass is not yet
 * finished.
 *
 * is_async: non-zero submits an asynchronous read
 *           (mig_read_device_bulk), zero reads and sends synchronously
 *           (mig_save_device_bulk).
 *
 * Returns 1 if a device was processed, 0 if every device has completed
 * its bulk pass (the global bulk_completed flag is then set).
 */
static int blk_mig_save_bulked_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *dev = block_mig_state->bmds_first;
    int finished;

    /* skip the devices that are already done */
    while (dev != NULL && dev->bulk_completed != 0) {
        dev = dev->next;
    }

    if (dev == NULL) {
        /* we reached here means bulk is completed */
        block_mig_state->bulk_completed = 1;
        return 0;
    }

    finished = is_async ? mig_read_device_bulk(f, dev)
                        : mig_save_device_bulk(f, dev);
    if (finished == 1) {
        /* completed bulk section for this device */
        dev->bulk_completed = 1;
    }

    return 1;
}
323 |
|
324 |
#define MAX_NUM_BLOCKS 4 |
325 |
|
326 |
static void blk_mig_save_dirty_blocks(QEMUFile *f) |
327 |
{ |
328 |
BlkMigDevState *bmds; |
329 |
uint8_t buf[BLOCK_SIZE]; |
330 |
int64_t sector; |
331 |
int len;
|
332 |
|
333 |
for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { |
334 |
for (sector = 0; sector < bmds->cur_sector;) { |
335 |
if (bdrv_get_dirty(bmds->bs, sector)) {
|
336 |
if (bdrv_read(bmds->bs, sector, buf,
|
337 |
block_mig_state->sectors_per_block) < 0) {
|
338 |
/* FIXME: add error handling */
|
339 |
} |
340 |
|
341 |
/* sector number and flags */
|
342 |
qemu_put_be64(f, (sector << SECTOR_BITS) |
343 |
| BLK_MIG_FLAG_DEVICE_BLOCK); |
344 |
|
345 |
/* device name */
|
346 |
len = strlen(bmds->bs->device_name); |
347 |
qemu_put_byte(f, len); |
348 |
qemu_put_buffer(f, (uint8_t *)bmds->bs->device_name, len); |
349 |
|
350 |
qemu_put_buffer(f, buf, |
351 |
(block_mig_state->sectors_per_block * |
352 |
SECTOR_SIZE)); |
353 |
|
354 |
bdrv_reset_dirty(bmds->bs, sector, |
355 |
block_mig_state->sectors_per_block); |
356 |
} |
357 |
sector += block_mig_state->sectors_per_block; |
358 |
} |
359 |
} |
360 |
} |
361 |
|
362 |
/*
 * Send the chunks whose reads have completed, in queue order, until the
 * queue is empty or the stream's rate limit is hit.  Every sent chunk is
 * freed and moved from the "read_done" counter to "transferred".
 */
static void flush_blks(QEMUFile* f)
{
    BlkMigBlock *blk, *next;

    dprintf("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state->submitted,
            block_mig_state->read_done, block_mig_state->transferred);

    for (blk = block_mig_state->first_blk;
         blk != NULL && !qemu_file_rate_limit(f);
         blk = next) {
        send_blk(f, blk);

        /* fetch the successor before the block is freed */
        next = blk->next;
        qemu_free(blk->buf);
        qemu_free(blk);

        block_mig_state->read_done--;
        block_mig_state->transferred++;
        assert(block_mig_state->read_done >= 0);
    }
    /* blk is the first unsent block, or NULL if the queue was drained */
    block_mig_state->first_blk = blk;

    if (block_mig_state->first_blk == NULL) {
        block_mig_state->last_blk = NULL;
    }

    dprintf("%s Exit submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state->submitted,
            block_mig_state->read_done, block_mig_state->transferred);
}
392 |
|
393 |
static int is_stage2_completed(void) |
394 |
{ |
395 |
BlkMigDevState *bmds; |
396 |
|
397 |
if (block_mig_state->submitted > 0) { |
398 |
return 0; |
399 |
} |
400 |
|
401 |
for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { |
402 |
if (bmds->bulk_completed == 0) { |
403 |
return 0; |
404 |
} |
405 |
} |
406 |
|
407 |
return 1; |
408 |
} |
409 |
|
410 |
/*
 * Live-save handler registered for the "block" section.
 *
 * stage 1: build the device list and enable dirty tracking.
 * every stage: send completed reads, submit new asynchronous bulk reads
 *              while the amount of pending data stays under the stream's
 *              rate limit, then send completed reads again.
 * stage 3: finish the bulk pass synchronously, send the dirty chunks and
 *          disable dirty tracking.
 *
 * Each call ends the section with a BLK_MIG_FLAG_EOS marker.
 *
 * Returns 1 if block migration is disabled, or if this is stage 2 and the
 * bulk pass is complete; 0 otherwise.
 */
static int block_save_live(QEMUFile *f, int stage, void *opaque)
{
    dprintf("Enter save live stage %d submitted %d transferred %d\n", stage,
            block_mig_state->submitted, block_mig_state->transferred);

    if (block_mig_state->blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(f);

        /* start track dirty blocks */
        set_dirty_tracking(1);
    }

    flush_blks(f);

    /* control the rate of transfer */
    while ((block_mig_state->submitted +
            block_mig_state->read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        if (blk_mig_save_bulked_block(f, 1) == 0) {
            /* no more bulk blocks for now */
            break;
        }
    }

    flush_blks(f);

    if (stage == 3) {
        while (blk_mig_save_bulked_block(f, 0) != 0) {
            /* empty */
        }

        blk_mig_save_dirty_blocks(f);

        /* stop track dirty blocks */
        set_dirty_tracking(0);

        printf("\nBlock migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}
459 |
|
460 |
/*
 * Load handler for the "block" section: reads records until a header
 * carrying BLK_MIG_FLAG_EOS is seen.
 *
 * Each 64-bit header holds a byte address in its sector-aligned bits and
 * flags in the low (sub-sector) bits.  A DEVICE_BLOCK record is followed
 * by a length-prefixed device name and BLOCK_SIZE bytes of data, which
 * are written to the named device.
 *
 * Returns 0 on success, or a negative value if the stream names an unknown
 * device or contains a header with neither DEVICE_BLOCK nor EOS set.  The
 * second case would otherwise keep the loop running on a stream that
 * cannot be parsed any further.
 */
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    int len, flags;
    int ret = 0;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs;
    uint8_t *buf;

    block_mig_state->sectors_per_block = bdrv_get_sectors_per_chunk();
    buf = qemu_malloc(BLOCK_SIZE);

    do {
        addr = qemu_get_be64(f);

        /* low bits carry the flags, the rest is the byte address */
        flags = addr & (SECTOR_SIZE - 1);
        addr &= ~(int64_t)(SECTOR_SIZE - 1);

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name; len comes from one byte, so it always
               fits device_name including the terminator */
            len = qemu_get_byte(f);

            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);

            qemu_get_buffer(f, buf, BLOCK_SIZE);
            if (bs == NULL) {
                printf("Error unknown block device %s\n", device_name);
                ret = -1;
                break;
            }

            /* Write failures are reported but do not abort the load,
               preserving the original best-effort behaviour. */
            if (bdrv_write(bs, (addr >> SECTOR_BITS),
                           buf, block_mig_state->sectors_per_block) < 0) {
                printf("Error writing sector %" PRId64 "\n",
                       addr >> SECTOR_BITS);
            }
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            printf("Unknown flags\n");
            ret = -1;
            break;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    qemu_free(buf);

    return ret;
}
504 |
|
505 |
static void block_set_params(int blk_enable, int shared_base, void *opaque) |
506 |
{ |
507 |
assert(opaque == block_mig_state); |
508 |
|
509 |
block_mig_state->blk_enable = blk_enable; |
510 |
block_mig_state->shared_base = shared_base; |
511 |
|
512 |
/* shared base means that blk_enable = 1 */
|
513 |
block_mig_state->blk_enable |= shared_base; |
514 |
} |
515 |
|
516 |
void blk_mig_info(void) |
517 |
{ |
518 |
BlockDriverState *bs; |
519 |
|
520 |
for (bs = bdrv_first; bs != NULL; bs = bs->next) { |
521 |
printf("Device %s\n", bs->device_name);
|
522 |
if (bs->type == BDRV_TYPE_HD) {
|
523 |
printf("device %s format %s\n",
|
524 |
bs->device_name, bs->drv->format_name); |
525 |
} |
526 |
} |
527 |
} |
528 |
|
529 |
void blk_mig_init(void) |
530 |
{ |
531 |
block_mig_state = qemu_mallocz(sizeof(BlkMigState));
|
532 |
|
533 |
register_savevm_live("block", 0, 1, block_set_params, block_save_live, |
534 |
NULL, block_load, block_mig_state);
|
535 |
} |