root / block / rbd.c @ fab5cf59
History | View | Annotate | Download (20 kB)
1 |
/*
|
---|---|
2 |
* QEMU Block driver for RADOS (Ceph)
|
3 |
*
|
4 |
* Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>,
|
5 |
* Josh Durgin <josh.durgin@dreamhost.com>
|
6 |
*
|
7 |
* This work is licensed under the terms of the GNU GPL, version 2. See
|
8 |
* the COPYING file in the top-level directory.
|
9 |
*
|
10 |
*/
|
11 |
|
12 |
#include <inttypes.h> |
13 |
|
14 |
#include "qemu-common.h" |
15 |
#include "qemu-error.h" |
16 |
|
17 |
#include "block_int.h" |
18 |
|
19 |
#include <rbd/librbd.h> |
20 |
|
21 |
|
22 |
|
23 |
/*
|
24 |
* When specifying the image filename use:
|
25 |
*
|
26 |
* rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]]
|
27 |
*
|
28 |
* poolname must be the name of an existing rados pool
|
29 |
*
|
30 |
* devicename is the basename for all objects used to
|
31 |
* emulate the raw device.
|
32 |
*
|
33 |
* Each option given is used to configure rados, and may be
|
34 |
* any Ceph option, or "conf". The "conf" option specifies
|
35 |
* a Ceph configuration file to read.
|
36 |
*
|
37 |
* Metadata information (image size, ...) is stored in an
|
38 |
* object with the name "devicename.rbd".
|
39 |
*
|
40 |
* The raw device is split into 4MB sized objects by default.
|
41 |
* The sequence number is encoded as a 12-byte-long hex string,
|
42 |
* and is attached to the devicename, separated by a dot.
|
43 |
* e.g. "devicename.1234567890ab"
|
44 |
*
|
45 |
*/
|
46 |
|
47 |
/*
 * NOTE(review): OBJ_DEFAULT_OBJ_ORDER is not defined anywhere in this file;
 * confirm it is still provided by an included header.
 */
#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)

/* Sizes of the fixed buffers that hold the parsed pieces of an rbd: filename */
#define RBD_MAX_CONF_NAME_SIZE 128
#define RBD_MAX_CONF_VAL_SIZE 512
#define RBD_MAX_CONF_SIZE 1024
#define RBD_MAX_POOL_NAME_SIZE 128
#define RBD_MAX_SNAP_NAME_SIZE 128
/* Initial number of entries allocated when listing snapshots */
#define RBD_MAX_SNAPS 100
55 |
|
56 |
typedef struct RBDAIOCB { |
57 |
BlockDriverAIOCB common; |
58 |
QEMUBH *bh; |
59 |
int ret;
|
60 |
QEMUIOVector *qiov; |
61 |
char *bounce;
|
62 |
int write;
|
63 |
int64_t sector_num; |
64 |
int error;
|
65 |
struct BDRVRBDState *s;
|
66 |
int cancelled;
|
67 |
} RBDAIOCB; |
68 |
|
69 |
typedef struct RADOSCB { |
70 |
int rcbid;
|
71 |
RBDAIOCB *acb; |
72 |
struct BDRVRBDState *s;
|
73 |
int done;
|
74 |
int64_t size; |
75 |
char *buf;
|
76 |
int ret;
|
77 |
} RADOSCB; |
78 |
|
79 |
#define RBD_FD_READ 0 |
80 |
#define RBD_FD_WRITE 1 |
81 |
|
82 |
typedef struct BDRVRBDState { |
83 |
int fds[2]; |
84 |
rados_t cluster; |
85 |
rados_ioctx_t io_ctx; |
86 |
rbd_image_t image; |
87 |
char name[RBD_MAX_IMAGE_NAME_SIZE];
|
88 |
int qemu_aio_count;
|
89 |
char *snap;
|
90 |
int event_reader_pos;
|
91 |
RADOSCB *event_rcb; |
92 |
} BDRVRBDState; |
93 |
|
94 |
/* Bottom-half handler; declared early because qemu_rbd_complete_aio() uses it */
static void rbd_aio_bh_cb(void *opaque);
95 |
|
96 |
/*
 * Extract the next token from @src into @dst.
 *
 * If @delim is not '\0' and occurs in @src, the first occurrence is
 * overwritten with a NUL byte and *@p is set to the character following it;
 * otherwise *@p is set to NULL and the whole of @src is the token.
 *
 * @dst_len is the size of @dst, @name is used in error messages only.
 *
 * Returns 0 on success, -EINVAL if the token is empty or does not fit
 * into @dst (including its terminating NUL).
 */
static int qemu_rbd_next_tok(char *dst, int dst_len,
                             char *src, char delim,
                             const char *name,
                             char **p)
{
    char *sep = NULL;
    int len;

    if (delim != '\0') {
        sep = strchr(src, delim);
    }

    if (sep) {
        /* terminate the token in place and remember where the rest starts */
        *sep = '\0';
        *p = sep + 1;
    } else {
        *p = NULL;
    }

    len = strlen(src);
    if (len >= dst_len) {
        error_report("%s too long", name);
        return -EINVAL;
    }
    if (len == 0) {
        error_report("%s too short", name);
        return -EINVAL;
    }

    pstrcpy(dst, dst_len, src);
    return 0;
}
126 |
|
127 |
/*
 * Split "rbd:pool/name[@snap][:conf]" into its components.
 *
 * @snap and @conf are set to the empty string when the corresponding part
 * is absent. Each *_len argument is the size of the matching output buffer.
 *
 * Returns 0 on success or a negative errno value if @filename does not
 * start with "rbd:", has no '/' after the pool name, or a component is
 * empty or too long.
 */
static int qemu_rbd_parsename(const char *filename,
                              char *pool, int pool_len,
                              char *snap, int snap_len,
                              char *name, int name_len,
                              char *conf, int conf_len)
{
    const char *spec;
    char *copy, *cursor;
    int has_snap;
    int rc;

    if (!strstart(filename, "rbd:", &spec)) {
        return -EINVAL;
    }

    /* the tokenizer writes NUL bytes into its input, so work on a copy */
    copy = qemu_strdup(spec);
    cursor = copy;
    *snap = '\0';
    *conf = '\0';

    rc = qemu_rbd_next_tok(pool, pool_len, cursor, '/', "pool name", &cursor);
    if (rc < 0 || !cursor) {
        rc = -EINVAL;
        goto done;
    }

    /* the image name ends at '@' when a snapshot follows, else at ':' */
    has_snap = strchr(cursor, '@') != NULL;
    rc = qemu_rbd_next_tok(name, name_len, cursor, has_snap ? '@' : ':',
                           "object name", &cursor);
    if (rc == 0 && has_snap) {
        rc = qemu_rbd_next_tok(snap, snap_len, cursor, ':',
                               "snap name", &cursor);
    }
    if (rc < 0 || !cursor) {
        /* either an error, or nothing follows and we are finished */
        goto done;
    }

    /* everything that remains is the option string */
    rc = qemu_rbd_next_tok(conf, conf_len, cursor, '\0',
                           "configuration", &cursor);

done:
    qemu_free(copy);
    return rc;
}
171 |
|
172 |
/*
 * Apply a ':'-separated list of "name=value" options to @cluster.
 *
 * The option "conf" names a configuration file that is loaded with
 * rados_conf_read_file(); every other option is passed to rados_conf_set().
 *
 * Returns 0 on success or a negative value on the first malformed or
 * rejected option.
 */
static int qemu_rbd_set_conf(rados_t cluster, const char *conf)
{
    char name[RBD_MAX_CONF_NAME_SIZE];
    char value[RBD_MAX_CONF_VAL_SIZE];
    char *copy, *cursor;
    int rc = 0;

    /* the tokenizer modifies its input, so work on a private copy */
    copy = qemu_strdup(conf);

    for (cursor = copy; cursor != NULL; ) {
        rc = qemu_rbd_next_tok(name, sizeof(name), cursor,
                               '=', "conf option name", &cursor);
        if (rc < 0) {
            break;
        }

        if (cursor == NULL) {
            error_report("conf option %s has no value", name);
            rc = -EINVAL;
            break;
        }

        rc = qemu_rbd_next_tok(value, sizeof(value), cursor,
                               ':', "conf option value", &cursor);
        if (rc < 0) {
            break;
        }

        if (strcmp(name, "conf") == 0) {
            rc = rados_conf_read_file(cluster, value);
            if (rc < 0) {
                error_report("error reading conf file %s", value);
                break;
            }
        } else {
            rc = rados_conf_set(cluster, name, value);
            if (rc < 0) {
                error_report("invalid conf option %s", name);
                rc = -EINVAL;
                break;
            }
        }
    }

    qemu_free(copy);
    return rc;
}
220 |
|
221 |
/*
 * Create a new rbd image.
 *
 * @filename: "rbd:pool/name[@snap][:options]" specification of the image
 * @options:  creation options; BLOCK_OPT_SIZE gives the image size in bytes
 *            and BLOCK_OPT_CLUSTER_SIZE the object size, which must be a
 *            power of two and at least 4096
 *
 * Returns the result of rbd_create() once the pool has been opened,
 * -EINVAL for a malformed filename or object size, and -EIO if the
 * cluster could not be set up or connected.
 */
static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options)
{
    int64_t bytes = 0;
    int64_t objsize;
    int obj_order = 0;
    char pool[RBD_MAX_POOL_NAME_SIZE];
    char name[RBD_MAX_IMAGE_NAME_SIZE];
    char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
    char conf[RBD_MAX_CONF_SIZE];
    rados_t cluster;
    rados_ioctx_t io_ctx;
    int ret;

    /* the snapshot part is parsed for validation only; it is not used */
    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                           snap_buf, sizeof(snap_buf),
                           name, sizeof(name),
                           conf, sizeof(conf)) < 0) {
        return -EINVAL;
    }

    /* Read out options */
    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
            bytes = options->value.n;
        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
            if (options->value.n) {
                objsize = options->value.n;
                if ((objsize - 1) & objsize) {    /* not a power of 2? */
                    error_report("obj size needs to be power of 2");
                    return -EINVAL;
                }
                if (objsize < 4096) {
                    error_report("obj size too small");
                    return -EINVAL;
                }
                /*
                 * objsize is a power of two here, so its order is log2.
                 * ffs() must not be used: it takes an int and would
                 * truncate 64-bit sizes, yielding an order of -1.
                 */
                obj_order = 0;
                while ((objsize >> obj_order) > 1) {
                    obj_order++;
                }
            }
        }
        options++;
    }

    if (rados_create(&cluster, NULL) < 0) {
        error_report("error initializing");
        return -EIO;
    }

    /* fall back to the default config file unless one was given explicitly */
    if (strstr(conf, "conf=") == NULL) {
        if (rados_conf_read_file(cluster, NULL) < 0) {
            error_report("error reading config file");
            rados_shutdown(cluster);
            return -EIO;
        }
    }

    if (conf[0] != '\0' &&
        qemu_rbd_set_conf(cluster, conf) < 0) {
        error_report("error setting config options");
        rados_shutdown(cluster);
        return -EIO;
    }

    if (rados_connect(cluster) < 0) {
        error_report("error connecting");
        rados_shutdown(cluster);
        return -EIO;
    }

    if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) {
        error_report("error opening pool %s", pool);
        rados_shutdown(cluster);
        return -EIO;
    }

    ret = rbd_create(io_ctx, name, bytes, &obj_order);
    rados_ioctx_destroy(io_ctx);
    rados_shutdown(cluster);

    return ret;
}
304 |
|
305 |
/*
|
306 |
* This aio completion is being called from qemu_rbd_aio_event_reader()
|
307 |
* and runs in qemu context. It schedules a bh, but just in case the aio
|
308 |
* was not cancelled before.
|
309 |
*/
|
310 |
static void qemu_rbd_complete_aio(RADOSCB *rcb) |
311 |
{ |
312 |
RBDAIOCB *acb = rcb->acb; |
313 |
int64_t r; |
314 |
|
315 |
if (acb->cancelled) {
|
316 |
qemu_vfree(acb->bounce); |
317 |
qemu_aio_release(acb); |
318 |
goto done;
|
319 |
} |
320 |
|
321 |
r = rcb->ret; |
322 |
|
323 |
if (acb->write) {
|
324 |
if (r < 0) { |
325 |
acb->ret = r; |
326 |
acb->error = 1;
|
327 |
} else if (!acb->error) { |
328 |
acb->ret = rcb->size; |
329 |
} |
330 |
} else {
|
331 |
if (r < 0) { |
332 |
memset(rcb->buf, 0, rcb->size);
|
333 |
acb->ret = r; |
334 |
acb->error = 1;
|
335 |
} else if (r < rcb->size) { |
336 |
memset(rcb->buf + r, 0, rcb->size - r);
|
337 |
if (!acb->error) {
|
338 |
acb->ret = rcb->size; |
339 |
} |
340 |
} else if (!acb->error) { |
341 |
acb->ret = r; |
342 |
} |
343 |
} |
344 |
/* Note that acb->bh can be NULL in case where the aio was cancelled */
|
345 |
acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb); |
346 |
qemu_bh_schedule(acb->bh); |
347 |
done:
|
348 |
qemu_free(rcb); |
349 |
} |
350 |
|
351 |
/*
|
352 |
* aio fd read handler. It runs in the qemu context and calls the
|
353 |
* completion handling of completed rados aio operations.
|
354 |
*/
|
355 |
static void qemu_rbd_aio_event_reader(void *opaque) |
356 |
{ |
357 |
BDRVRBDState *s = opaque; |
358 |
|
359 |
ssize_t ret; |
360 |
|
361 |
do {
|
362 |
char *p = (char *)&s->event_rcb; |
363 |
|
364 |
/* now read the rcb pointer that was sent from a non qemu thread */
|
365 |
if ((ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos,
|
366 |
sizeof(s->event_rcb) - s->event_reader_pos)) > 0) { |
367 |
if (ret > 0) { |
368 |
s->event_reader_pos += ret; |
369 |
if (s->event_reader_pos == sizeof(s->event_rcb)) { |
370 |
s->event_reader_pos = 0;
|
371 |
qemu_rbd_complete_aio(s->event_rcb); |
372 |
s->qemu_aio_count--; |
373 |
} |
374 |
} |
375 |
} |
376 |
} while (ret < 0 && errno == EINTR); |
377 |
} |
378 |
|
379 |
static int qemu_rbd_aio_flush_cb(void *opaque) |
380 |
{ |
381 |
BDRVRBDState *s = opaque; |
382 |
|
383 |
return (s->qemu_aio_count > 0); |
384 |
} |
385 |
|
386 |
/*
 * Open an rbd image.
 *
 * @bs:       block driver state; bs->opaque is the BDRVRBDState to fill in
 * @filename: "rbd:pool/name[@snap][:options]" specification of the image
 * @flags:    open flags (not used here)
 *
 * Connects to the cluster, opens the pool and the image, and sets up the
 * pipe through which librbd completions are handed to the qemu thread.
 * An image opened at a snapshot is marked read-only.
 *
 * Returns 0 on success or a negative value on failure, in which case
 * everything acquired so far (including the snapshot name) is released.
 */
static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVRBDState *s = bs->opaque;
    char pool[RBD_MAX_POOL_NAME_SIZE];
    char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
    char conf[RBD_MAX_CONF_SIZE];
    int r;

    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                           snap_buf, sizeof(snap_buf),
                           s->name, sizeof(s->name),
                           conf, sizeof(conf)) < 0) {
        return -EINVAL;
    }
    s->snap = NULL;
    if (snap_buf[0] != '\0') {
        s->snap = qemu_strdup(snap_buf);
    }

    r = rados_create(&s->cluster, NULL);
    if (r < 0) {
        error_report("error initializing");
        goto failed_snap;
    }

    /* fall back to the default config file unless one was given explicitly */
    if (strstr(conf, "conf=") == NULL) {
        r = rados_conf_read_file(s->cluster, NULL);
        if (r < 0) {
            error_report("error reading config file");
            goto failed_shutdown;
        }
    }

    if (conf[0] != '\0') {
        r = qemu_rbd_set_conf(s->cluster, conf);
        if (r < 0) {
            error_report("error setting config options");
            goto failed_shutdown;
        }
    }

    r = rados_connect(s->cluster);
    if (r < 0) {
        error_report("error connecting");
        goto failed_shutdown;
    }

    r = rados_ioctx_create(s->cluster, pool, &s->io_ctx);
    if (r < 0) {
        error_report("error opening pool %s", pool);
        goto failed_shutdown;
    }

    r = rbd_open(s->io_ctx, s->name, &s->image, s->snap);
    if (r < 0) {
        error_report("error reading header from %s", s->name);
        goto failed_ioctx;
    }

    bs->read_only = (s->snap != NULL);

    s->event_reader_pos = 0;
    r = qemu_pipe(s->fds);
    if (r < 0) {
        error_report("error opening eventfd");
        goto failed_image;
    }
    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader,
                            NULL, qemu_rbd_aio_flush_cb, NULL, s);

    return 0;

    /* unwind in reverse order of acquisition */
failed_image:
    rbd_close(s->image);
failed_ioctx:
    rados_ioctx_destroy(s->io_ctx);
failed_shutdown:
    rados_shutdown(s->cluster);
failed_snap:
    /* the snapshot name was strdup'ed above and must not leak */
    qemu_free(s->snap);
    s->snap = NULL;
    return r;
}
473 |
|
474 |
/*
 * Close an rbd image and release everything qemu_rbd_open() acquired:
 * the notification pipe and its handler, the image, the I/O context,
 * the snapshot name and the cluster handle.
 */
static void qemu_rbd_close(BlockDriverState *bs)
{
    BDRVRBDState *s = bs->opaque;

    /*
     * Unregister the handler while the descriptor is still open, so the
     * handler table never refers to a closed (and reusable) fd number.
     */
    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL , NULL, NULL, NULL,
                            NULL);
    close(s->fds[0]);
    close(s->fds[1]);

    rbd_close(s->image);
    rados_ioctx_destroy(s->io_ctx);
    qemu_free(s->snap);
    rados_shutdown(s->cluster);
}
488 |
|
489 |
/*
|
490 |
* Cancel aio. Since we don't reference acb in a non qemu threads,
|
491 |
* it is safe to access it here.
|
492 |
*/
|
493 |
static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb) |
494 |
{ |
495 |
RBDAIOCB *acb = (RBDAIOCB *) blockacb; |
496 |
acb->cancelled = 1;
|
497 |
} |
498 |
|
499 |
static AIOPool rbd_aio_pool = {
|
500 |
.aiocb_size = sizeof(RBDAIOCB),
|
501 |
.cancel = qemu_rbd_aio_cancel, |
502 |
}; |
503 |
|
504 |
/*
 * Write the @rcb pointer to the notification pipe so that the qemu thread
 * can do the completion handling for it.
 *
 * Retries when the write is interrupted, and waits with select() for the
 * pipe to become writable when it is full.
 *
 * Returns the (non-negative) result of write() on success, or a negative
 * value if the write failed for any other reason.
 */
static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb)
{
    int fd = s->fds[RBD_FD_WRITE];
    int ret;

    while ((ret = write(fd, (void *)&rcb, sizeof(rcb))) < 0) {
        fd_set wfd;

        if (errno == EINTR) {
            continue;
        }
        if (errno != EAGAIN) {
            break;
        }

        /* pipe is full: block until there is room, then try again */
        FD_ZERO(&wfd);
        FD_SET(fd, &wfd);
        do {
            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
        } while (ret < 0 && errno == EINTR);
    }

    return ret;
}
533 |
|
534 |
/*
|
535 |
* This is the callback function for rbd_aio_read and _write
|
536 |
*
|
537 |
* Note: this function is being called from a non qemu thread so
|
538 |
* we need to be careful about what we do here. Generally we only
|
539 |
* write to the block notification pipe, and do the rest of the
|
540 |
* io completion handling from qemu_rbd_aio_event_reader() which
|
541 |
* runs in a qemu context.
|
542 |
*/
|
543 |
static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) |
544 |
{ |
545 |
int ret;
|
546 |
rcb->ret = rbd_aio_get_return_value(c); |
547 |
rbd_aio_release(c); |
548 |
ret = qemu_rbd_send_pipe(rcb->s, rcb); |
549 |
if (ret < 0) { |
550 |
error_report("failed writing to acb->s->fds");
|
551 |
qemu_free(rcb); |
552 |
} |
553 |
} |
554 |
|
555 |
/* Callback when all queued rbd_aio requests are complete */
|
556 |
|
557 |
static void rbd_aio_bh_cb(void *opaque) |
558 |
{ |
559 |
RBDAIOCB *acb = opaque; |
560 |
|
561 |
if (!acb->write) {
|
562 |
qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size); |
563 |
} |
564 |
qemu_vfree(acb->bounce); |
565 |
acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); |
566 |
qemu_bh_delete(acb->bh); |
567 |
acb->bh = NULL;
|
568 |
|
569 |
qemu_aio_release(acb); |
570 |
} |
571 |
|
572 |
static BlockDriverAIOCB *rbd_aio_rw_vector(BlockDriverState *bs,
|
573 |
int64_t sector_num, |
574 |
QEMUIOVector *qiov, |
575 |
int nb_sectors,
|
576 |
BlockDriverCompletionFunc *cb, |
577 |
void *opaque, int write) |
578 |
{ |
579 |
RBDAIOCB *acb; |
580 |
RADOSCB *rcb; |
581 |
rbd_completion_t c; |
582 |
int64_t off, size; |
583 |
char *buf;
|
584 |
|
585 |
BDRVRBDState *s = bs->opaque; |
586 |
|
587 |
acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque); |
588 |
acb->write = write; |
589 |
acb->qiov = qiov; |
590 |
acb->bounce = qemu_blockalign(bs, qiov->size); |
591 |
acb->ret = 0;
|
592 |
acb->error = 0;
|
593 |
acb->s = s; |
594 |
acb->cancelled = 0;
|
595 |
acb->bh = NULL;
|
596 |
|
597 |
if (write) {
|
598 |
qemu_iovec_to_buffer(acb->qiov, acb->bounce); |
599 |
} |
600 |
|
601 |
buf = acb->bounce; |
602 |
|
603 |
off = sector_num * BDRV_SECTOR_SIZE; |
604 |
size = nb_sectors * BDRV_SECTOR_SIZE; |
605 |
|
606 |
s->qemu_aio_count++; /* All the RADOSCB */
|
607 |
|
608 |
rcb = qemu_malloc(sizeof(RADOSCB));
|
609 |
rcb->done = 0;
|
610 |
rcb->acb = acb; |
611 |
rcb->buf = buf; |
612 |
rcb->s = acb->s; |
613 |
rcb->size = size; |
614 |
|
615 |
if (write) {
|
616 |
rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c); |
617 |
rbd_aio_write(s->image, off, size, buf, c); |
618 |
} else {
|
619 |
rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c); |
620 |
rbd_aio_read(s->image, off, size, buf, c); |
621 |
} |
622 |
|
623 |
return &acb->common;
|
624 |
} |
625 |
|
626 |
static BlockDriverAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
|
627 |
int64_t sector_num, |
628 |
QEMUIOVector *qiov, |
629 |
int nb_sectors,
|
630 |
BlockDriverCompletionFunc *cb, |
631 |
void *opaque)
|
632 |
{ |
633 |
return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); |
634 |
} |
635 |
|
636 |
static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
|
637 |
int64_t sector_num, |
638 |
QEMUIOVector *qiov, |
639 |
int nb_sectors,
|
640 |
BlockDriverCompletionFunc *cb, |
641 |
void *opaque)
|
642 |
{ |
643 |
return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); |
644 |
} |
645 |
|
646 |
/*
 * Fill in @bdi for the image: the cluster size is the rbd object size.
 * Returns 0 on success or the negative value returned by rbd_stat().
 */
static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVRBDState *s = bs->opaque;
    rbd_image_info_t info;
    int r = rbd_stat(s->image, &info, sizeof(info));

    if (r >= 0) {
        bdi->cluster_size = info.obj_size;
        r = 0;
    }
    return r;
}
660 |
|
661 |
/*
 * Return the image size as reported by rbd_stat(), or the negative value
 * returned by rbd_stat() if it fails.
 */
static int64_t qemu_rbd_getlength(BlockDriverState *bs)
{
    BDRVRBDState *s = bs->opaque;
    rbd_image_info_t info;
    int r = rbd_stat(s->image, &info, sizeof(info));

    if (r >= 0) {
        return info.size;
    }
    return r;
}
674 |
|
675 |
static int qemu_rbd_snap_create(BlockDriverState *bs, |
676 |
QEMUSnapshotInfo *sn_info) |
677 |
{ |
678 |
BDRVRBDState *s = bs->opaque; |
679 |
int r;
|
680 |
|
681 |
if (sn_info->name[0] == '\0') { |
682 |
return -EINVAL; /* we need a name for rbd snapshots */ |
683 |
} |
684 |
|
685 |
/*
|
686 |
* rbd snapshots are using the name as the user controlled unique identifier
|
687 |
* we can't use the rbd snapid for that purpose, as it can't be set
|
688 |
*/
|
689 |
if (sn_info->id_str[0] != '\0' && |
690 |
strcmp(sn_info->id_str, sn_info->name) != 0) {
|
691 |
return -EINVAL;
|
692 |
} |
693 |
|
694 |
if (strlen(sn_info->name) >= sizeof(sn_info->id_str)) { |
695 |
return -ERANGE;
|
696 |
} |
697 |
|
698 |
r = rbd_snap_create(s->image, sn_info->name); |
699 |
if (r < 0) { |
700 |
error_report("failed to create snap: %s", strerror(-r));
|
701 |
return r;
|
702 |
} |
703 |
|
704 |
return 0; |
705 |
} |
706 |
|
707 |
static int qemu_rbd_snap_list(BlockDriverState *bs, |
708 |
QEMUSnapshotInfo **psn_tab) |
709 |
{ |
710 |
BDRVRBDState *s = bs->opaque; |
711 |
QEMUSnapshotInfo *sn_info, *sn_tab = NULL;
|
712 |
int i, snap_count;
|
713 |
rbd_snap_info_t *snaps; |
714 |
int max_snaps = RBD_MAX_SNAPS;
|
715 |
|
716 |
do {
|
717 |
snaps = qemu_malloc(sizeof(*snaps) * max_snaps);
|
718 |
snap_count = rbd_snap_list(s->image, snaps, &max_snaps); |
719 |
if (snap_count < 0) { |
720 |
qemu_free(snaps); |
721 |
} |
722 |
} while (snap_count == -ERANGE);
|
723 |
|
724 |
if (snap_count <= 0) { |
725 |
return snap_count;
|
726 |
} |
727 |
|
728 |
sn_tab = qemu_mallocz(snap_count * sizeof(QEMUSnapshotInfo));
|
729 |
|
730 |
for (i = 0; i < snap_count; i++) { |
731 |
const char *snap_name = snaps[i].name; |
732 |
|
733 |
sn_info = sn_tab + i; |
734 |
pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), snap_name);
|
735 |
pstrcpy(sn_info->name, sizeof(sn_info->name), snap_name);
|
736 |
|
737 |
sn_info->vm_state_size = snaps[i].size; |
738 |
sn_info->date_sec = 0;
|
739 |
sn_info->date_nsec = 0;
|
740 |
sn_info->vm_clock_nsec = 0;
|
741 |
} |
742 |
rbd_snap_list_end(snaps); |
743 |
|
744 |
*psn_tab = sn_tab; |
745 |
return snap_count;
|
746 |
} |
747 |
|
748 |
static QEMUOptionParameter qemu_rbd_create_options[] = {
|
749 |
{ |
750 |
.name = BLOCK_OPT_SIZE, |
751 |
.type = OPT_SIZE, |
752 |
.help = "Virtual disk size"
|
753 |
}, |
754 |
{ |
755 |
.name = BLOCK_OPT_CLUSTER_SIZE, |
756 |
.type = OPT_SIZE, |
757 |
.help = "RBD object size"
|
758 |
}, |
759 |
{NULL}
|
760 |
}; |
761 |
|
762 |
static BlockDriver bdrv_rbd = {
|
763 |
.format_name = "rbd",
|
764 |
.instance_size = sizeof(BDRVRBDState),
|
765 |
.bdrv_file_open = qemu_rbd_open, |
766 |
.bdrv_close = qemu_rbd_close, |
767 |
.bdrv_create = qemu_rbd_create, |
768 |
.bdrv_get_info = qemu_rbd_getinfo, |
769 |
.create_options = qemu_rbd_create_options, |
770 |
.bdrv_getlength = qemu_rbd_getlength, |
771 |
.protocol_name = "rbd",
|
772 |
|
773 |
.bdrv_aio_readv = qemu_rbd_aio_readv, |
774 |
.bdrv_aio_writev = qemu_rbd_aio_writev, |
775 |
|
776 |
.bdrv_snapshot_create = qemu_rbd_snap_create, |
777 |
.bdrv_snapshot_list = qemu_rbd_snap_list, |
778 |
}; |
779 |
|
780 |
static void bdrv_rbd_init(void) |
781 |
{ |
782 |
bdrv_register(&bdrv_rbd); |
783 |
} |
784 |
|
785 |
block_init(bdrv_rbd_init); |