root / block / backup.c @ 793ed47a

/*
 * QEMU backup
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <stdio.h>
#include <errno.h>
#include <unistd.h>

#include "trace.h"
#include "block/block.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/ratelimit.h"

#define BACKUP_CLUSTER_BITS 16
#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
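/* Data is copied between source and target in units of one cluster:
 * 1 << 16 = 64 KiB, i.e. 128 sectors with QEMU's 512-byte BDRV_SECTOR_SIZE. */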

#define SLICE_TIME 100000000ULL /* ns */

typedef struct CowRequest {
    int64_t start;
    int64_t end;
    QLIST_ENTRY(CowRequest) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} CowRequest;

typedef struct BackupBlockJob {
    BlockJob common;
    BlockDriverState *target;
    MirrorSyncMode sync_mode;
    RateLimit limit;
    BlockdevOnError on_source_error;
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t sectors_read;
    HBitmap *bitmap;
    QLIST_HEAD(, CowRequest) inflight_reqs;
} BackupBlockJob;
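
/* How the pieces above fit together: 'bitmap' has one bit per cluster and
 * marks clusters that have already been copied to the target;
 * 'inflight_reqs' serialises copy-on-write requests that touch overlapping
 * cluster ranges; 'flush_rwlock' is taken for reading by every
 * backup_do_cow() call and for writing once at the end of backup_run(), so
 * the job only completes after all in-flight copies have finished. */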

/* See if in-flight requests overlap and wait for them to complete */
static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                       int64_t start,
                                                       int64_t end)
{
    CowRequest *req;
    bool retry;

    do {
        retry = false;
        QLIST_FOREACH(req, &job->inflight_reqs, list) {
            if (end > req->start && start < req->end) {
                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/* Keep track of an in-flight request */
static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
                                     int64_t start, int64_t end)
{
    req->start = start;
    req->end = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
}

/* Forget about a completed request */
static void cow_request_end(CowRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}
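
/* Copy the clusters touched by [sector_num, sector_num + nb_sectors) from
 * the source to the target, unless they have been copied before.  Called
 * both from backup_run() and from the before-write notifier.  Each cluster
 * is read into a bounce buffer; all-zero clusters are written out with
 * bdrv_co_write_zeroes() so the target can stay sparse.  On failure,
 * *error_is_read tells the caller whether the read or the write side
 * failed. */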
static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      bool *error_is_read)
{
    BackupBlockJob *job = (BackupBlockJob *)bs->job;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    void *bounce_buffer = NULL;
    int ret = 0;
    int64_t start, end;
    int n;

    qemu_co_rwlock_rdlock(&job->flush_rwlock);

    start = sector_num / BACKUP_SECTORS_PER_CLUSTER;
    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER);
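    /* From here on start and end are in whole clusters: the request is
     * widened to cluster boundaries, so partially touched clusters are
     * copied in full. */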

    trace_backup_do_cow_enter(job, start, sector_num, nb_sectors);

    wait_for_overlapping_requests(job, start, end);
    cow_request_begin(&cow_request, job, start, end);

    for (; start < end; start++) {
        if (hbitmap_get(job->bitmap, start)) {
            trace_backup_do_cow_skip(job, start);
            continue; /* already copied */
        }

        trace_backup_do_cow_process(job, start);

        n = MIN(BACKUP_SECTORS_PER_CLUSTER,
                job->common.len / BDRV_SECTOR_SIZE -
                start * BACKUP_SECTORS_PER_CLUSTER);

        if (!bounce_buffer) {
            bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

        ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
                            &bounce_qiov);
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = true;
            }
            goto out;
        }
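
        /* Write clusters that turn out to be all zeroes as zeroes (and
         * allow the target to unmap them) instead of copying the bounce
         * buffer. */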
        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
            ret = bdrv_co_write_zeroes(job->target,
                                       start * BACKUP_SECTORS_PER_CLUSTER,
                                       n, BDRV_REQ_MAY_UNMAP);
        } else {
            ret = bdrv_co_writev(job->target,
                                 start * BACKUP_SECTORS_PER_CLUSTER, n,
                                 &bounce_qiov);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
            goto out;
        }

        hbitmap_set(job->bitmap, start, 1);

        /* Publish progress; guest I/O counts as progress too.  Note that the
         * offset field is an opaque progress value; it is not a disk offset.
         */
        job->sectors_read += n;
        job->common.offset += n * BDRV_SECTOR_SIZE;
    }

out:
    if (bounce_buffer) {
        qemu_vfree(bounce_buffer);
    }

    cow_request_end(&cow_request);

    trace_backup_do_cow_return(job, sector_num, nb_sectors, ret);

    qemu_co_rwlock_unlock(&job->flush_rwlock);

    return ret;
}
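
/* Before-write notifier: called for every guest write to the source device
 * before the new data lands.  It copies the affected clusters to the target
 * first, so the backup keeps the point-in-time content from when the job
 * was started. */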
static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
{
    BdrvTrackedRequest *req = opaque;
    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;

    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL);
}

static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    if (speed < 0) {
        error_set(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

static void backup_iostatus_reset(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    bdrv_iostatus_reset(s->target);
}

static const BlockJobDriver backup_job_driver = {
    .instance_size  = sizeof(BackupBlockJob),
    .job_type       = BLOCK_JOB_TYPE_BACKUP,
    .set_speed      = backup_set_speed,
    .iostatus_reset = backup_iostatus_reset,
};

static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
{
    if (read) {
        return block_job_error_action(&job->common, job->common.bs,
                                      job->on_source_error, true, error);
    } else {
        return block_job_error_action(&job->common, job->target,
                                      job->on_target_error, false, error);
    }
}
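
/* Main job coroutine.  In FULL and TOP modes it walks the device cluster by
 * cluster and copies it to the target, honouring the configured speed limit
 * and error policy; TOP mode skips clusters that are not allocated in the
 * topmost image.  In NONE mode it merely sits in the background while the
 * before-write notifier copies old data ahead of guest writes. */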
static void coroutine_fn backup_run(void *opaque)
{
    BackupBlockJob *job = opaque;
    BlockDriverState *bs = job->common.bs;
    BlockDriverState *target = job->target;
    BlockdevOnError on_target_error = job->on_target_error;
    NotifierWithReturn before_write = {
        .notify = backup_before_write_notify,
    };
    int64_t start, end;
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

    start = 0;
    end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE,
                       BACKUP_SECTORS_PER_CLUSTER);
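
    /* One bit per cluster; a set bit means the cluster has already been
     * copied to the target and backup_do_cow() will skip it. */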
    job->bitmap = hbitmap_alloc(end, 0);

    bdrv_set_enable_write_cache(target, true);
    bdrv_set_on_error(target, on_target_error, on_target_error);
    bdrv_iostatus_enable(target);

    bdrv_add_before_write_notifier(bs, &before_write);
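    /* From this point on, every guest write to bs first triggers
     * backup_do_cow() for the clusters it touches (see
     * backup_before_write_notify() above). */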

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled.  We just let our before_write
             * notify callback service CoW requests. */
            job->common.busy = false;
            qemu_coroutine_yield();
            job->common.busy = true;
        }
    } else {
        /* Both FULL and TOP sync modes require copying. */
        for (; start < end; start++) {
            bool error_is_read;

            if (block_job_is_cancelled(&job->common)) {
                break;
            }

            /* We need to yield so that qemu_aio_flush() returns
             * (without this, the VM does not reboot).
             */
            if (job->common.speed) {
                uint64_t delay_ns = ratelimit_calculate_delay(
                        &job->limit, job->sectors_read);
                job->sectors_read = 0;
                block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
            } else {
                block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
            }

            if (block_job_is_cancelled(&job->common)) {
                break;
            }

            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
                int i, n;
                int alloced = 0;

                /* Check to see if these blocks are already in the
                 * backing file. */

                for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) {
                    /* bdrv_is_allocated() only returns true/false based
                     * on the first set of sectors it comes across that
                     * are all in the same state.
                     * For that reason we must verify each sector in the
                     * backup cluster length.  We end up copying more than
                     * needed but at some point that is always the case. */
                    alloced =
                        bdrv_is_allocated(bs,
                                start * BACKUP_SECTORS_PER_CLUSTER + i,
                                BACKUP_SECTORS_PER_CLUSTER - i, &n);
                    i += n;

                    if (alloced == 1) {
                        break;
                    }
                }

                /* If the above loop never found any sectors that are in
                 * the topmost image, skip this backup. */
                if (alloced == 0) {
                    continue;
                }
            }
            /* In FULL sync mode we copy the whole drive. */
            ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
                    backup_error_action(job, error_is_read, -ret);
                if (action == BDRV_ACTION_REPORT) {
                    break;
                } else {
                    start--;
                    continue;
                }
            }
        }
    }

    notifier_with_return_remove(&before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);

    hbitmap_free(job->bitmap);

    bdrv_iostatus_disable(target);
    bdrv_unref(target);

    block_job_completed(&job->common, ret);
}
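
/* Public entry point: validate the on-source-error policy, create the block
 * job and kick off the backup_run() coroutine; 'cb' runs once the job
 * finishes or is cancelled. */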
void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  int64_t speed, MirrorSyncMode sync_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  BlockDriverCompletionFunc *cb, void *opaque,
                  Error **errp)
{
    int64_t len;

    assert(bs);
    assert(target);
    assert(cb);

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
        !bdrv_iostatus_is_enabled(bs)) {
        error_set(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }

    len = bdrv_getlength(bs);
    if (len < 0) {
        error_setg_errno(errp, -len, "unable to get length for '%s'",
                         bdrv_get_device_name(bs));
        return;
    }

    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
                                           cb, opaque, errp);
    if (!job) {
        return;
    }

    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->target = target;
    job->sync_mode = sync_mode;
    job->common.len = len;
    job->common.co = qemu_coroutine_create(backup_run);
    qemu_coroutine_enter(job->common.co, job);
}