root / block-migration.c @ 6ea44308
History | View | Annotate | Download (13.9 kB)
1 |
/*
|
---|---|
2 |
* QEMU live block migration
|
3 |
*
|
4 |
* Copyright IBM, Corp. 2009
|
5 |
*
|
6 |
* Authors:
|
7 |
* Liran Schour <lirans@il.ibm.com>
|
8 |
*
|
9 |
* This work is licensed under the terms of the GNU GPL, version 2. See
|
10 |
* the COPYING file in the top-level directory.
|
11 |
*
|
12 |
*/
|
13 |
|
14 |
#include "qemu-common.h" |
15 |
#include "block_int.h" |
16 |
#include "hw/hw.h" |
17 |
#include "block-migration.h" |
18 |
#include <assert.h> |
19 |
|
20 |
#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
|
21 |
|
22 |
#define BLK_MIG_FLAG_DEVICE_BLOCK 0x01 |
23 |
#define BLK_MIG_FLAG_EOS 0x02 |
24 |
|
25 |
#define MAX_IS_ALLOCATED_SEARCH 65536 |
26 |
#define MAX_BLOCKS_READ 10000 |
27 |
#define BLOCKS_READ_CHANGE 100 |
28 |
#define INITIAL_BLOCKS_READ 100 |
29 |
|
30 |
//#define DEBUG_BLK_MIGRATION
|
31 |
|
32 |
#ifdef DEBUG_BLK_MIGRATION
|
33 |
#define dprintf(fmt, ...) \
|
34 |
do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0) |
35 |
#else
|
36 |
#define dprintf(fmt, ...) \
|
37 |
do { } while (0) |
38 |
#endif
|
39 |
|
40 |
/* Per-device migration state; nodes of a singly linked list rooted at
 * block_mig_state->bmds_first. */
typedef struct BlkMigDevState {
    BlockDriverState *bs;        /* the block device being migrated */
    int bulk_completed;          /* non-zero once the bulk pass finished */
    int shared_base;             /* skip sectors unallocated in the base image */
    struct BlkMigDevState *next; /* next device in the list */
    int64_t cur_sector;          /* bulk progress: next sector to transfer */
    int64_t total_sectors;       /* device length in sectors */
    int64_t dirty;               /* NOTE(review): never read or written in this
                                  * file — purpose unclear, confirm elsewhere */
} BlkMigDevState;
49 |
|
50 |
/* One in-flight (or completed) asynchronous chunk read. */
typedef struct BlkMigBlock {
    uint8_t *buf;             /* BLOCK_SIZE data buffer, owned by this block */
    BlkMigDevState *bmds;     /* device the chunk was read from */
    int64_t sector;           /* first sector covered by buf */
    struct iovec iov;         /* single-element iovec over buf */
    QEMUIOVector qiov;        /* wraps iov for bdrv_aio_readv() */
    BlockDriverAIOCB *aiocb;  /* pending AIO request handle */
    int ret;                  /* completion status set by blk_mig_read_cb() */
    struct BlkMigBlock *next; /* next on the completed-blocks queue */
} BlkMigBlock;
60 |
|
61 |
/* Global block-migration state (singleton). */
typedef struct BlkMigState {
    int bulk_completed;         /* all devices finished their bulk pass */
    int blk_enable;             /* block migration was requested */
    int shared_base;            /* destination shares the base image */
    int no_dirty;               /* NOTE(review): never referenced in this file */
    QEMUFile *load_file;        /* NOTE(review): never referenced in this file */
    BlkMigDevState *bmds_first; /* head of the per-device state list */
    BlkMigBlock *first_blk;     /* head of the completed-read queue */
    BlkMigBlock *last_blk;      /* tail of the completed-read queue */
    int submitted;              /* AIO reads currently in flight */
    int read_done;              /* completed reads not yet sent */
    int transferred;            /* blocks already written to the stream */
    int64_t print_completion;   /* next progress-print threshold, in sectors */
} BlkMigState;

/* Singleton instance; allocated in blk_mig_init(). */
static BlkMigState *block_mig_state = NULL;
77 |
|
78 |
static void blk_mig_read_cb(void *opaque, int ret) |
79 |
{ |
80 |
BlkMigBlock *blk = opaque; |
81 |
|
82 |
blk->ret = ret; |
83 |
|
84 |
/* insert at the end */
|
85 |
if (block_mig_state->last_blk == NULL) { |
86 |
block_mig_state->first_blk = blk; |
87 |
block_mig_state->last_blk = blk; |
88 |
} else {
|
89 |
block_mig_state->last_blk->next = blk; |
90 |
block_mig_state->last_blk = blk; |
91 |
} |
92 |
|
93 |
block_mig_state->submitted--; |
94 |
block_mig_state->read_done++; |
95 |
assert(block_mig_state->submitted >= 0);
|
96 |
} |
97 |
|
98 |
/*
 * Submit one asynchronous bulk read of up to BLOCK_SIZE from device 'bms',
 * starting at bms->cur_sector (used during the iterative stage-2 phase).
 *
 * On success, ownership of the BlkMigBlock passes to the AIO layer; it is
 * delivered to blk_mig_read_cb() on completion and freed in flush_blks().
 *
 * Returns 1 when the device's bulk section is finished, 0 otherwise
 * (including the aio-submission-failure path).
 */
static int mig_read_device_bulk(QEMUFile *f, BlkMigDevState *bms)
{
    int nr_sectors;
    int64_t total_sectors, cur_sector = 0;
    BlockDriverState *bs = bms->bs;
    BlkMigBlock *blk;

    blk = qemu_malloc(sizeof(BlkMigBlock));
    blk->buf = qemu_malloc(BLOCK_SIZE);

    cur_sector = bms->cur_sector;
    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;

    /* With a shared base image, skip over runs of unallocated sectors:
     * the destination already has that data in the base. */
    if (bms->shared_base) {
        while (cur_sector < bms->total_sectors &&
               !bdrv_is_allocated(bms->bs, cur_sector,
                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        /* device fully covered; nothing was submitted, free the block */
        bms->cur_sector = total_sectors;
        qemu_free(blk->buf);
        qemu_free(blk);
        return 1;
    }

    /* progress indicator, printed roughly every 10000 chunks */
    if (cur_sector >= block_mig_state->print_completion) {
        printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
        fflush(stdout);
        block_mig_state->print_completion +=
            (BDRV_SECTORS_PER_DIRTY_CHUNK * 10000);
    }

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    /* align the start down to a dirty-chunk boundary */
    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* clamp the final chunk to the end of the device */
    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = (total_sectors - cur_sector);
    }

    bms->cur_sector = cur_sector + nr_sectors;
    blk->sector = cur_sector;
    blk->bmds = bms;
    blk->next = NULL;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    if (!blk->aiocb) {
        /* submission failed: the callback will never run, free here.
         * NOTE(review): returns 0 ("not finished") without reporting the
         * error to the caller — the chunk is silently retried later since
         * cur_sector was already advanced above; confirm intended. */
        printf("Error reading sector %" PRId64 "\n", cur_sector);
        qemu_free(blk->buf);
        qemu_free(blk);
        return 0;
    }

    bdrv_reset_dirty(bms->bs, cur_sector, nr_sectors);
    block_mig_state->submitted++;

    return (bms->cur_sector >= total_sectors);
}
166 |
|
167 |
/*
 * Synchronously read and transmit one BLOCK_SIZE chunk of device 'bmds'
 * (used during the final stage-3 pass, where blocking reads are fine).
 * Advances bmds->cur_sector past the chunk just sent.
 *
 * Returns 1 when the whole device has been covered, 0 otherwise.
 */
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int len, nr_sectors;
    int64_t total_sectors = bmds->total_sectors, cur_sector = 0;
    uint8_t *tmp_buf = NULL;
    BlockDriverState *bs = bmds->bs;

    /* Fix: zero-allocate the bounce buffer.  The final chunk of a device
     * may be shorter than BLOCK_SIZE, yet a full BLOCK_SIZE is always
     * written to the stream below; with plain qemu_malloc() the tail of
     * that chunk (and the whole buffer after a failed bdrv_read) would
     * transmit uninitialized heap memory. */
    tmp_buf = qemu_mallocz(BLOCK_SIZE);

    cur_sector = bmds->cur_sector;

    /* With a shared base image, skip runs of unallocated sectors — the
     * destination already has them in the base. */
    if (bmds->shared_base) {
        while (cur_sector < bmds->total_sectors &&
               !bdrv_is_allocated(bmds->bs, cur_sector,
                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        /* device finished */
        bmds->cur_sector = total_sectors;
        qemu_free(tmp_buf);
        return 1;
    }

    /* progress indicator, printed roughly every 10000 chunks */
    if (cur_sector >= block_mig_state->print_completion) {
        printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
        fflush(stdout);
        block_mig_state->print_completion +=
            (BDRV_SECTORS_PER_DIRTY_CHUNK * 10000);
    }

    /* align the start down to a dirty-chunk boundary */
    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    /* clamp the last chunk to the end of the device */
    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = (total_sectors - cur_sector);
    }

    if (bdrv_read(bs, cur_sector, tmp_buf, nr_sectors) < 0) {
        printf("Error reading sector %" PRId64 "\n", cur_sector);
        /* FIXME: add error handling; a zero-filled chunk is sent as-is */
    }

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);

    /* sector number and flags */
    qemu_put_be64(f, (cur_sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)bs->device_name, len);

    qemu_put_buffer(f, tmp_buf, BLOCK_SIZE);

    bmds->cur_sector = cur_sector + BDRV_SECTORS_PER_DIRTY_CHUNK;

    qemu_free(tmp_buf);

    return (bmds->cur_sector >= total_sectors);
}
231 |
|
232 |
/*
 * Serialize one completed block to the migration stream:
 * 64-bit (sector << BDRV_SECTOR_BITS) | flags, then the length-prefixed
 * device name, then the BLOCK_SIZE payload.
 */
static void send_blk(QEMUFile *f, BlkMigBlock * blk)
{
    char *name = blk->bmds->bs->device_name;
    int name_len = strlen(name);

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* length-prefixed device name */
    qemu_put_byte(f, name_len);
    qemu_put_buffer(f, (uint8_t *)name, name_len);

    /* chunk payload */
    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}
247 |
|
248 |
/* Placeholder: no per-device metadata is transmitted yet; the stream
 * identifies each block by device name instead (see send_blk()). */
static void blk_mig_save_dev_info(QEMUFile *f, BlkMigDevState *bmds)
{
}
251 |
|
252 |
static void set_dirty_tracking(int enable) |
253 |
{ |
254 |
BlkMigDevState *bmds; |
255 |
for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { |
256 |
bdrv_set_dirty_tracking(bmds->bs, enable); |
257 |
} |
258 |
} |
259 |
|
260 |
/*
 * Build the per-device migration state for every attached hard disk and
 * append it to the global device list; invoked once at the start of
 * stage 1.
 */
static void init_blk_migration(QEMUFile *f)
{
    BlockDriverState *bs;

    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
        BlkMigDevState *bmds, **link;

        if (bs->type != BDRV_TYPE_HD) {
            continue;
        }

        bmds = qemu_mallocz(sizeof(BlkMigDevState));
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
        bmds->shared_base = block_mig_state->shared_base;

        if (bmds->shared_base) {
            printf("Start migration for %s with shared base image\n",
                   bs->device_name);
        } else {
            printf("Start full migration for %s\n", bs->device_name);
        }

        /* walk to the end of the device list and append */
        for (link = &block_mig_state->bmds_first; *link != NULL;
             link = &(*link)->next) {
            /* empty */
        }
        *link = bmds;

        blk_mig_save_dev_info(f, bmds);
    }
}
291 |
|
292 |
/*
 * Advance the bulk phase by one chunk on the first device that has not
 * finished it, using async reads when is_async is set and synchronous
 * reads otherwise.
 *
 * Returns 1 if work was attempted, 0 once every device's bulk phase is
 * complete (and sets block_mig_state->bulk_completed).
 */
static int blk_mig_save_bulked_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *dev;

    for (dev = block_mig_state->bmds_first; dev != NULL; dev = dev->next) {
        int finished;

        if (dev->bulk_completed) {
            continue;
        }

        finished = is_async ? mig_read_device_bulk(f, dev)
                            : mig_save_device_bulk(f, dev);
        if (finished == 1) {
            /* completed bulk section for this device */
            dev->bulk_completed = 1;
        }
        return 1;
    }

    /* reaching here means every device finished its bulk section */
    block_mig_state->bulk_completed = 1;

    return 0;
}
318 |
|
319 |
#define MAX_NUM_BLOCKS 4 |
320 |
|
321 |
/*
 * Stage-3 pass: synchronously send every chunk that was dirtied after its
 * bulk transfer, clearing each chunk's dirty bit once sent.
 *
 * Only sectors below cur_sector are examined — later sectors have not
 * been transferred yet and will be (or were) covered by the bulk pass.
 */
static void blk_mig_save_dirty_blocks(QEMUFile *f)
{
    BlkMigDevState *bmds;
    uint8_t *buf;
    int64_t sector;
    int len;

    /* Fix: BLOCK_SIZE (a full dirty chunk) is far too large for an
     * automatic stack array and risks stack overflow; use a heap-allocated
     * bounce buffer instead. */
    buf = qemu_malloc(BLOCK_SIZE);

    for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
        for (sector = 0; sector < bmds->cur_sector;) {
            if (bdrv_get_dirty(bmds->bs, sector)) {
                if (bdrv_read(bmds->bs, sector, buf,
                              BDRV_SECTORS_PER_DIRTY_CHUNK) < 0) {
                    /* FIXME: add error handling */
                }

                /* sector number and flags */
                qemu_put_be64(f, (sector << BDRV_SECTOR_BITS)
                                 | BLK_MIG_FLAG_DEVICE_BLOCK);

                /* length-prefixed device name */
                len = strlen(bmds->bs->device_name);
                qemu_put_byte(f, len);
                qemu_put_buffer(f, (uint8_t *)bmds->bs->device_name, len);

                qemu_put_buffer(f, buf, BLOCK_SIZE);

                bdrv_reset_dirty(bmds->bs, sector,
                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
            }
            sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        }
    }

    qemu_free(buf);
}
354 |
|
355 |
/*
 * Drain completed AIO reads to the migration stream, stopping early if
 * the file's rate limit is reached.  Frees each block after sending and
 * moves its accounting from read_done to transferred.
 */
static void flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk, *next;

    /* Fix: the original entry trace referenced bare 'submitted',
     * 'read_done' and 'transfered' — undeclared identifiers with a
     * missing %d specifier — which broke compilation whenever
     * DEBUG_BLK_MIGRATION was defined. */
    dprintf("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state->submitted,
            block_mig_state->read_done, block_mig_state->transferred);

    for (blk = block_mig_state->first_blk;
         blk != NULL && !qemu_file_rate_limit(f);
         blk = next) {
        send_blk(f, blk);

        next = blk->next;
        qemu_free(blk->buf);
        qemu_free(blk);

        block_mig_state->read_done--;
        block_mig_state->transferred++;
        assert(block_mig_state->read_done >= 0);
    }
    block_mig_state->first_blk = blk;

    /* queue fully drained — reset the tail pointer too */
    if (block_mig_state->first_blk == NULL) {
        block_mig_state->last_blk = NULL;
    }

    /* also add the missing space before %d ("transferred%d") */
    dprintf("%s Exit submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state->submitted,
            block_mig_state->read_done, block_mig_state->transferred);
}
385 |
|
386 |
static int is_stage2_completed(void) |
387 |
{ |
388 |
BlkMigDevState *bmds; |
389 |
|
390 |
if (block_mig_state->submitted > 0) { |
391 |
return 0; |
392 |
} |
393 |
|
394 |
for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) { |
395 |
if (bmds->bulk_completed == 0) { |
396 |
return 0; |
397 |
} |
398 |
} |
399 |
|
400 |
return 1; |
401 |
} |
402 |
|
403 |
/*
 * Live-migration save handler for the "block" section.
 *
 * Stage 1 builds the device list and starts dirty tracking; stage 2 is
 * called iteratively and pushes bulk chunks within the stream's rate
 * limit; stage 3 finishes the bulk pass synchronously, sends all dirty
 * chunks, and stops tracking.  Every call ends with an EOS marker.
 *
 * Returns 1 when this section needs no further stage-2 iterations.
 */
static int block_save_live(QEMUFile *f, int stage, void *opaque)
{
    /* Fix: the original trace referenced bare 'submitted'/'transferred'
     * (undeclared identifiers), breaking the DEBUG_BLK_MIGRATION build. */
    dprintf("Enter save live stage %d submitted %d transferred %d\n",
            stage, block_mig_state->submitted, block_mig_state->transferred);

    if (block_mig_state->blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(f);

        /* start track dirty blocks */
        set_dirty_tracking(1);
    }

    flush_blks(f);

    /* control the rate of transfer: keep submitting async bulk reads
     * while the data in flight fits under the stream's rate limit */
    while ((block_mig_state->submitted +
            block_mig_state->read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        if (blk_mig_save_bulked_block(f, 1) == 0) {
            /* no more bulk blocks for now */
            break;
        }
    }

    flush_blks(f);

    if (stage == 3) {
        /* finish any remaining bulk work synchronously */
        while (blk_mig_save_bulked_block(f, 0) != 0) {
            /* empty */
        }

        blk_mig_save_dirty_blocks(f);

        /* stop track dirty blocks */
        set_dirty_tracking(0);

        printf("\nBlock migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}
452 |
|
453 |
/*
 * Load handler for the "block" section: read (sector|flags, device name,
 * BLOCK_SIZE payload) records from the stream and write each chunk to
 * the named local device, until the EOS flag is seen.
 */
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    char device_name[256];
    uint8_t *buf = qemu_malloc(BLOCK_SIZE);
    int64_t addr;
    int flags;

    do {
        /* low bits carry the flags, the rest is the sector number */
        addr = qemu_get_be64(f);
        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            BlockDriverState *bs;
            int len;

            /* length-prefixed device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);

            /* always consume the payload, even for an unknown device */
            qemu_get_buffer(f, buf, BLOCK_SIZE);
            if (bs == NULL) {
                printf("Error unknown block device %s\n", device_name);
                /* FIXME: add error handling */
            } else {
                bdrv_write(bs, addr, buf, BDRV_SECTORS_PER_DIRTY_CHUNK);
            }
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            printf("Unknown flags\n");
            /* FIXME: add error handling */
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    qemu_free(buf);

    return 0;
}
495 |
|
496 |
static void block_set_params(int blk_enable, int shared_base, void *opaque) |
497 |
{ |
498 |
assert(opaque == block_mig_state); |
499 |
|
500 |
block_mig_state->blk_enable = blk_enable; |
501 |
block_mig_state->shared_base = shared_base; |
502 |
|
503 |
/* shared base means that blk_enable = 1 */
|
504 |
block_mig_state->blk_enable |= shared_base; |
505 |
} |
506 |
|
507 |
void blk_mig_info(void) |
508 |
{ |
509 |
BlockDriverState *bs; |
510 |
|
511 |
for (bs = bdrv_first; bs != NULL; bs = bs->next) { |
512 |
printf("Device %s\n", bs->device_name);
|
513 |
if (bs->type == BDRV_TYPE_HD) {
|
514 |
printf("device %s format %s\n",
|
515 |
bs->device_name, bs->drv->format_name); |
516 |
} |
517 |
} |
518 |
} |
519 |
|
520 |
/* Allocate the global migration state and register the "block" live
 * savevm section (version 1) with block_set_params / block_save_live /
 * block_load as its handlers. */
void blk_mig_init(void)
{
    block_mig_state = qemu_mallocz(sizeof(BlkMigState));

    register_savevm_live("block", 0, 1, block_set_params, block_save_live,
                         NULL, block_load, block_mig_state);
}