root / block / rbd.c @ d5124c00
History | View | Annotate | Download (26.6 kB)
1 |
/*
|
---|---|
2 |
* QEMU Block driver for RADOS (Ceph)
|
3 |
*
|
4 |
* Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>,
|
5 |
* Josh Durgin <josh.durgin@dreamhost.com>
|
6 |
*
|
7 |
* This work is licensed under the terms of the GNU GPL, version 2. See
|
8 |
* the COPYING file in the top-level directory.
|
9 |
*
|
10 |
* Contributions after 2012-01-13 are licensed under the terms of the
|
11 |
* GNU GPL, version 2 or (at your option) any later version.
|
12 |
*/
|
13 |
|
14 |
#include <inttypes.h> |
15 |
|
16 |
#include "qemu-common.h" |
17 |
#include "qemu/error-report.h" |
18 |
#include "block/block_int.h" |
19 |
|
20 |
#include <rbd/librbd.h> |
21 |
|
22 |
/*
|
23 |
* When specifying the image filename use:
|
24 |
*
|
25 |
* rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]]
|
26 |
*
|
27 |
* poolname must be the name of an existing rados pool.
|
28 |
*
|
29 |
* devicename is the name of the rbd image.
|
30 |
*
|
31 |
* Each option given is used to configure rados, and may be any valid
|
32 |
* Ceph option, "id", or "conf".
|
33 |
*
|
34 |
* The "id" option indicates what user we should authenticate as to
|
35 |
* the Ceph cluster. If it is excluded we will use the Ceph default
|
36 |
* (normally 'admin').
|
37 |
*
|
38 |
* The "conf" option specifies a Ceph configuration file to read. If
|
39 |
* it is not specified, we will read from the default Ceph locations
|
40 |
* (e.g., /etc/ceph/ceph.conf). To avoid reading _any_ configuration
|
41 |
* file, specify conf=/dev/null.
|
42 |
*
|
43 |
* Configuration values containing :, @, or = can be escaped with a
|
44 |
* leading "\".
|
45 |
*/
|
46 |
|
47 |
/*
 * Feature detection: rbd_aio_discard() was added in librbd 0.1.2, so
 * discard support is only compiled in for that version or newer.
 */
#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 2)
#define LIBRBD_SUPPORTS_DISCARD
#else
#undef LIBRBD_SUPPORTS_DISCARD
#endif

/*
 * NOTE(review): OBJ_DEFAULT_OBJ_ORDER is not defined in the visible part of
 * this file and OBJ_MAX_SIZE is not used here - confirm it is still needed.
 */
#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)

/* Buffer sizes (including the terminating NUL) used while parsing filenames */
#define RBD_MAX_CONF_NAME_SIZE 128   /* one configuration option name */
#define RBD_MAX_CONF_VAL_SIZE 512    /* one configuration option value */
#define RBD_MAX_CONF_SIZE 1024       /* the whole option string */
#define RBD_MAX_POOL_NAME_SIZE 128   /* pool name */
#define RBD_MAX_SNAP_NAME_SIZE 128   /* snapshot name */

/* Initial capacity of the array handed to rbd_snap_list() */
#define RBD_MAX_SNAPS 100
62 |
|
63 |
/* Kind of asynchronous request described by an RBDAIOCB. */
typedef enum {
    RBD_AIO_READ,       /* read into the bounce buffer */
    RBD_AIO_WRITE,      /* write from the bounce buffer */
    RBD_AIO_DISCARD,    /* discard a range, no data buffer */
    RBD_AIO_FLUSH       /* flush, no data buffer */
} RBDAIOCmd;
69 |
|
70 |
typedef struct RBDAIOCB { |
71 |
BlockDriverAIOCB common; |
72 |
QEMUBH *bh; |
73 |
int64_t ret; |
74 |
QEMUIOVector *qiov; |
75 |
char *bounce;
|
76 |
RBDAIOCmd cmd; |
77 |
int64_t sector_num; |
78 |
int error;
|
79 |
struct BDRVRBDState *s;
|
80 |
int cancelled;
|
81 |
int status;
|
82 |
} RBDAIOCB; |
83 |
|
84 |
typedef struct RADOSCB { |
85 |
int rcbid;
|
86 |
RBDAIOCB *acb; |
87 |
struct BDRVRBDState *s;
|
88 |
int done;
|
89 |
int64_t size; |
90 |
char *buf;
|
91 |
int64_t ret; |
92 |
} RADOSCB; |
93 |
|
94 |
#define RBD_FD_READ 0 |
95 |
#define RBD_FD_WRITE 1 |
96 |
|
97 |
typedef struct BDRVRBDState { |
98 |
int fds[2]; |
99 |
rados_t cluster; |
100 |
rados_ioctx_t io_ctx; |
101 |
rbd_image_t image; |
102 |
char name[RBD_MAX_IMAGE_NAME_SIZE];
|
103 |
char *snap;
|
104 |
int event_reader_pos;
|
105 |
RADOSCB *event_rcb; |
106 |
} BDRVRBDState; |
107 |
|
108 |
static void rbd_aio_bh_cb(void *opaque); |
109 |
|
110 |
/*
 * Split the next token off @src and copy it into @dst.
 *
 * When @delim is not '\0', @src is scanned for the first occurrence of
 * @delim that is not escaped by a preceding backslash.  If one is found it
 * is overwritten with '\0' (so @src is modified in place) and *@p is set to
 * the character following it; otherwise *@p is set to NULL.
 *
 * @dst/@dst_len: destination buffer and its size
 * @name:         human readable token name used in error messages
 *
 * Returns 0 on success, -EINVAL if the token is empty or does not fit
 * into @dst.
 */
static int qemu_rbd_next_tok(char *dst, int dst_len,
                             char *src, char delim,
                             const char *name,
                             char **p)
{
    char *cursor;
    int token_len;

    *p = NULL;

    if (delim != '\0') {
        cursor = src;
        while (*cursor != '\0' && *cursor != delim) {
            /* a backslash protects the following character */
            if (*cursor == '\\' && cursor[1] != '\0') {
                cursor++;
            }
            cursor++;
        }
        if (*cursor == delim) {
            *cursor = '\0';
            *p = cursor + 1;
        }
    }

    token_len = strlen(src);
    if (token_len >= dst_len) {
        error_report("%s too long", name);
        return -EINVAL;
    }
    if (token_len == 0) {
        error_report("%s too short", name);
        return -EINVAL;
    }

    pstrcpy(dst, dst_len, src);
    return 0;
}
147 |
|
148 |
/*
 * Remove backslash escapes from @src in place: every backslash that is
 * followed by another character is dropped and that character is kept
 * literally.  A backslash at the very end of the string is kept as is.
 */
static void qemu_rbd_unescape(char *src)
{
    char *out = src;

    while (*src != '\0') {
        if (*src == '\\' && src[1] != '\0') {
            src++;
        }
        *out++ = *src++;
    }
    *out = '\0';
}
160 |
|
161 |
/*
 * Return 1 if @s contains @delim at a position that is not escaped by a
 * preceding backslash, 0 otherwise.  Uses the same escaping rule as
 * qemu_rbd_next_tok().
 */
static int qemu_rbd_has_unescaped(const char *s, char delim)
{
    for (; *s != '\0'; ++s) {
        if (*s == delim) {
            return 1;
        }
        if (*s == '\\' && s[1] != '\0') {
            ++s;
        }
    }
    return 0;
}

/*
 * Parse "rbd:pool/image[@snap][:conf]" into its components.
 *
 * @filename:       full filename, must start with "rbd:"
 * @pool/@pool_len: receives the unescaped pool name
 * @snap/@snap_len: receives the unescaped snapshot name ("" if none)
 * @name/@name_len: receives the unescaped image name
 * @conf/@conf_len: receives the still escaped option string ("" if none)
 *
 * Returns 0 on success and a negative errno value on malformed input.
 */
static int qemu_rbd_parsename(const char *filename,
                              char *pool, int pool_len,
                              char *snap, int snap_len,
                              char *name, int name_len,
                              char *conf, int conf_len)
{
    const char *start;
    char *p, *buf;
    int ret;

    if (!strstart(filename, "rbd:", &start)) {
        return -EINVAL;
    }

    /* the tokenizer writes into its input, so work on a private copy */
    buf = g_strdup(start);
    p = buf;
    *snap = '\0';
    *conf = '\0';

    ret = qemu_rbd_next_tok(pool, pool_len, p, '/', "pool name", &p);
    if (ret < 0 || !p) {
        ret = -EINVAL;
        goto done;
    }
    qemu_rbd_unescape(pool);

    /*
     * Only an *unescaped* '@' introduces a snapshot name.  A plain
     * strchr() would also match "\@"; the tokenizer then finds no
     * delimiter, leaves p == NULL and the snapshot tokenization below
     * would dereference a NULL pointer.
     */
    if (qemu_rbd_has_unescaped(p, '@')) {
        ret = qemu_rbd_next_tok(name, name_len, p, '@', "object name", &p);
        if (ret < 0) {
            goto done;
        }
        ret = qemu_rbd_next_tok(snap, snap_len, p, ':', "snap name", &p);
        qemu_rbd_unescape(snap);
    } else {
        ret = qemu_rbd_next_tok(name, name_len, p, ':', "object name", &p);
    }
    qemu_rbd_unescape(name);
    if (ret < 0 || !p) {
        /* error, or no option string follows */
        goto done;
    }

    /* everything that is left is the option string */
    ret = qemu_rbd_next_tok(conf, conf_len, p, '\0', "configuration", &p);

done:
    g_free(buf);
    return ret;
}
208 |
|
209 |
/*
 * Extract the value of the first "id=" option from the ':'-separated
 * option string @conf.
 *
 * Backslash escapes are honoured the same way qemu_rbd_set_conf() does: an
 * escaped ':' does not end an option, and the escapes are removed from the
 * value that is copied out.
 *
 * @conf:       option string (still escaped)
 * @clientname: output buffer; must be at least as large as @conf
 *
 * Returns @clientname if an "id=" option was found, NULL otherwise.
 */
static char *qemu_rbd_parse_clientname(const char *conf, char *clientname)
{
    const char *p = conf;

    while (*p) {
        const char *end = p;

        /* find the end of this option: the next unescaped ':' or the NUL */
        while (*end != '\0' && *end != ':') {
            if (*end == '\\' && end[1] != '\0') {
                end++;
            }
            end++;
        }

        if (strncmp(p, "id=", 3) == 0) {
            const char *in;
            char *out = clientname;

            /* copy the value, dropping the escaping backslashes */
            for (in = p + 3; in < end; in++) {
                if (*in == '\\' && in + 1 < end) {
                    in++;
                }
                *out++ = *in;
            }
            *out = '\0';
            return clientname;
        }

        if (*end == '\0') {
            break;
        }
        p = end + 1;
    }
    return NULL;
}
236 |
|
237 |
/*
 * Apply every "name=value" pair of the ':'-separated option string @conf
 * to @cluster.
 *
 * "conf" names a configuration file that is read with
 * rados_conf_read_file(); "id" is skipped because it is handled by
 * qemu_rbd_parse_clientname(); every other pair is passed to
 * rados_conf_set().
 *
 * Returns a negative value on the first failure; otherwise the result of
 * the last operation performed.
 */
static int qemu_rbd_set_conf(rados_t cluster, const char *conf)
{
    char key[RBD_MAX_CONF_NAME_SIZE];
    char val[RBD_MAX_CONF_VAL_SIZE];
    char *copy = g_strdup(conf);    /* the tokenizer modifies its input */
    char *rest = copy;
    int rc = 0;

    while (rest != NULL) {
        rc = qemu_rbd_next_tok(key, sizeof(key), rest,
                               '=', "conf option name", &rest);
        if (rc < 0) {
            break;
        }
        qemu_rbd_unescape(key);

        if (rest == NULL) {
            error_report("conf option %s has no value", key);
            rc = -EINVAL;
            break;
        }

        rc = qemu_rbd_next_tok(val, sizeof(val), rest,
                               ':', "conf option value", &rest);
        if (rc < 0) {
            break;
        }
        qemu_rbd_unescape(val);

        if (strcmp(key, "id") == 0) {
            /* ignore, this is parsed by qemu_rbd_parse_clientname() */
            continue;
        }

        if (strcmp(key, "conf") == 0) {
            rc = rados_conf_read_file(cluster, val);
            if (rc < 0) {
                error_report("error reading conf file %s", val);
                break;
            }
        } else {
            rc = rados_conf_set(cluster, key, val);
            if (rc < 0) {
                error_report("invalid conf option %s", key);
                rc = -EINVAL;
                break;
            }
        }
    }

    g_free(copy);
    return rc;
}
289 |
|
290 |
static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options, |
291 |
Error **errp) |
292 |
{ |
293 |
int64_t bytes = 0;
|
294 |
int64_t objsize; |
295 |
int obj_order = 0; |
296 |
char pool[RBD_MAX_POOL_NAME_SIZE];
|
297 |
char name[RBD_MAX_IMAGE_NAME_SIZE];
|
298 |
char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
|
299 |
char conf[RBD_MAX_CONF_SIZE];
|
300 |
char clientname_buf[RBD_MAX_CONF_SIZE];
|
301 |
char *clientname;
|
302 |
rados_t cluster; |
303 |
rados_ioctx_t io_ctx; |
304 |
int ret;
|
305 |
|
306 |
if (qemu_rbd_parsename(filename, pool, sizeof(pool), |
307 |
snap_buf, sizeof(snap_buf),
|
308 |
name, sizeof(name),
|
309 |
conf, sizeof(conf)) < 0) { |
310 |
return -EINVAL;
|
311 |
} |
312 |
|
313 |
/* Read out options */
|
314 |
while (options && options->name) {
|
315 |
if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
|
316 |
bytes = options->value.n; |
317 |
} else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { |
318 |
if (options->value.n) {
|
319 |
objsize = options->value.n; |
320 |
if ((objsize - 1) & objsize) { /* not a power of 2? */ |
321 |
error_report("obj size needs to be power of 2");
|
322 |
return -EINVAL;
|
323 |
} |
324 |
if (objsize < 4096) { |
325 |
error_report("obj size too small");
|
326 |
return -EINVAL;
|
327 |
} |
328 |
obj_order = ffs(objsize) - 1;
|
329 |
} |
330 |
} |
331 |
options++; |
332 |
} |
333 |
|
334 |
clientname = qemu_rbd_parse_clientname(conf, clientname_buf); |
335 |
if (rados_create(&cluster, clientname) < 0) { |
336 |
error_report("error initializing");
|
337 |
return -EIO;
|
338 |
} |
339 |
|
340 |
if (strstr(conf, "conf=") == NULL) { |
341 |
/* try default location, but ignore failure */
|
342 |
rados_conf_read_file(cluster, NULL);
|
343 |
} |
344 |
|
345 |
if (conf[0] != '\0' && |
346 |
qemu_rbd_set_conf(cluster, conf) < 0) {
|
347 |
error_report("error setting config options");
|
348 |
rados_shutdown(cluster); |
349 |
return -EIO;
|
350 |
} |
351 |
|
352 |
if (rados_connect(cluster) < 0) { |
353 |
error_report("error connecting");
|
354 |
rados_shutdown(cluster); |
355 |
return -EIO;
|
356 |
} |
357 |
|
358 |
if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) { |
359 |
error_report("error opening pool %s", pool);
|
360 |
rados_shutdown(cluster); |
361 |
return -EIO;
|
362 |
} |
363 |
|
364 |
ret = rbd_create(io_ctx, name, bytes, &obj_order); |
365 |
rados_ioctx_destroy(io_ctx); |
366 |
rados_shutdown(cluster); |
367 |
|
368 |
return ret;
|
369 |
} |
370 |
|
371 |
/*
|
372 |
* This aio completion is being called from qemu_rbd_aio_event_reader()
|
373 |
* and runs in qemu context. It schedules a bh, but just in case the aio
|
374 |
* was not cancelled before.
|
375 |
*/
|
376 |
static void qemu_rbd_complete_aio(RADOSCB *rcb) |
377 |
{ |
378 |
RBDAIOCB *acb = rcb->acb; |
379 |
int64_t r; |
380 |
|
381 |
r = rcb->ret; |
382 |
|
383 |
if (acb->cmd != RBD_AIO_READ) {
|
384 |
if (r < 0) { |
385 |
acb->ret = r; |
386 |
acb->error = 1;
|
387 |
} else if (!acb->error) { |
388 |
acb->ret = rcb->size; |
389 |
} |
390 |
} else {
|
391 |
if (r < 0) { |
392 |
memset(rcb->buf, 0, rcb->size);
|
393 |
acb->ret = r; |
394 |
acb->error = 1;
|
395 |
} else if (r < rcb->size) { |
396 |
memset(rcb->buf + r, 0, rcb->size - r);
|
397 |
if (!acb->error) {
|
398 |
acb->ret = rcb->size; |
399 |
} |
400 |
} else if (!acb->error) { |
401 |
acb->ret = r; |
402 |
} |
403 |
} |
404 |
/* Note that acb->bh can be NULL in case where the aio was cancelled */
|
405 |
acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb); |
406 |
qemu_bh_schedule(acb->bh); |
407 |
g_free(rcb); |
408 |
} |
409 |
|
410 |
/*
|
411 |
* aio fd read handler. It runs in the qemu context and calls the
|
412 |
* completion handling of completed rados aio operations.
|
413 |
*/
|
414 |
static void qemu_rbd_aio_event_reader(void *opaque) |
415 |
{ |
416 |
BDRVRBDState *s = opaque; |
417 |
|
418 |
ssize_t ret; |
419 |
|
420 |
do {
|
421 |
char *p = (char *)&s->event_rcb; |
422 |
|
423 |
/* now read the rcb pointer that was sent from a non qemu thread */
|
424 |
ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos, |
425 |
sizeof(s->event_rcb) - s->event_reader_pos);
|
426 |
if (ret > 0) { |
427 |
s->event_reader_pos += ret; |
428 |
if (s->event_reader_pos == sizeof(s->event_rcb)) { |
429 |
s->event_reader_pos = 0;
|
430 |
qemu_rbd_complete_aio(s->event_rcb); |
431 |
} |
432 |
} |
433 |
} while (ret < 0 && errno == EINTR); |
434 |
} |
435 |
|
436 |
/* TODO Convert to fine grained options */
|
437 |
static QemuOptsList runtime_opts = {
|
438 |
.name = "rbd",
|
439 |
.head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), |
440 |
.desc = { |
441 |
{ |
442 |
.name = "filename",
|
443 |
.type = QEMU_OPT_STRING, |
444 |
.help = "Specification of the rbd image",
|
445 |
}, |
446 |
{ /* end of list */ }
|
447 |
}, |
448 |
}; |
449 |
|
450 |
static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, |
451 |
Error **errp) |
452 |
{ |
453 |
BDRVRBDState *s = bs->opaque; |
454 |
char pool[RBD_MAX_POOL_NAME_SIZE];
|
455 |
char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
|
456 |
char conf[RBD_MAX_CONF_SIZE];
|
457 |
char clientname_buf[RBD_MAX_CONF_SIZE];
|
458 |
char *clientname;
|
459 |
QemuOpts *opts; |
460 |
Error *local_err = NULL;
|
461 |
const char *filename; |
462 |
int r;
|
463 |
|
464 |
opts = qemu_opts_create_nofail(&runtime_opts); |
465 |
qemu_opts_absorb_qdict(opts, options, &local_err); |
466 |
if (error_is_set(&local_err)) {
|
467 |
qerror_report_err(local_err); |
468 |
error_free(local_err); |
469 |
qemu_opts_del(opts); |
470 |
return -EINVAL;
|
471 |
} |
472 |
|
473 |
filename = qemu_opt_get(opts, "filename");
|
474 |
|
475 |
if (qemu_rbd_parsename(filename, pool, sizeof(pool), |
476 |
snap_buf, sizeof(snap_buf),
|
477 |
s->name, sizeof(s->name),
|
478 |
conf, sizeof(conf)) < 0) { |
479 |
r = -EINVAL; |
480 |
goto failed_opts;
|
481 |
} |
482 |
|
483 |
clientname = qemu_rbd_parse_clientname(conf, clientname_buf); |
484 |
r = rados_create(&s->cluster, clientname); |
485 |
if (r < 0) { |
486 |
error_report("error initializing");
|
487 |
goto failed_opts;
|
488 |
} |
489 |
|
490 |
s->snap = NULL;
|
491 |
if (snap_buf[0] != '\0') { |
492 |
s->snap = g_strdup(snap_buf); |
493 |
} |
494 |
|
495 |
/*
|
496 |
* Fallback to more conservative semantics if setting cache
|
497 |
* options fails. Ignore errors from setting rbd_cache because the
|
498 |
* only possible error is that the option does not exist, and
|
499 |
* librbd defaults to no caching. If write through caching cannot
|
500 |
* be set up, fall back to no caching.
|
501 |
*/
|
502 |
if (flags & BDRV_O_NOCACHE) {
|
503 |
rados_conf_set(s->cluster, "rbd_cache", "false"); |
504 |
} else {
|
505 |
rados_conf_set(s->cluster, "rbd_cache", "true"); |
506 |
} |
507 |
|
508 |
if (strstr(conf, "conf=") == NULL) { |
509 |
/* try default location, but ignore failure */
|
510 |
rados_conf_read_file(s->cluster, NULL);
|
511 |
} |
512 |
|
513 |
if (conf[0] != '\0') { |
514 |
r = qemu_rbd_set_conf(s->cluster, conf); |
515 |
if (r < 0) { |
516 |
error_report("error setting config options");
|
517 |
goto failed_shutdown;
|
518 |
} |
519 |
} |
520 |
|
521 |
r = rados_connect(s->cluster); |
522 |
if (r < 0) { |
523 |
error_report("error connecting");
|
524 |
goto failed_shutdown;
|
525 |
} |
526 |
|
527 |
r = rados_ioctx_create(s->cluster, pool, &s->io_ctx); |
528 |
if (r < 0) { |
529 |
error_report("error opening pool %s", pool);
|
530 |
goto failed_shutdown;
|
531 |
} |
532 |
|
533 |
r = rbd_open(s->io_ctx, s->name, &s->image, s->snap); |
534 |
if (r < 0) { |
535 |
error_report("error reading header from %s", s->name);
|
536 |
goto failed_open;
|
537 |
} |
538 |
|
539 |
bs->read_only = (s->snap != NULL);
|
540 |
|
541 |
s->event_reader_pos = 0;
|
542 |
r = qemu_pipe(s->fds); |
543 |
if (r < 0) { |
544 |
error_report("error opening eventfd");
|
545 |
goto failed;
|
546 |
} |
547 |
fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
|
548 |
fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
|
549 |
qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader, |
550 |
NULL, s);
|
551 |
|
552 |
|
553 |
qemu_opts_del(opts); |
554 |
return 0; |
555 |
|
556 |
failed:
|
557 |
rbd_close(s->image); |
558 |
failed_open:
|
559 |
rados_ioctx_destroy(s->io_ctx); |
560 |
failed_shutdown:
|
561 |
rados_shutdown(s->cluster); |
562 |
g_free(s->snap); |
563 |
failed_opts:
|
564 |
qemu_opts_del(opts); |
565 |
return r;
|
566 |
} |
567 |
|
568 |
/*
 * Close the image and release everything acquired by qemu_rbd_open().
 */
static void qemu_rbd_close(BlockDriverState *bs)
{
    BDRVRBDState *s = bs->opaque;

    /*
     * Unregister the read handler while the descriptor is still open.
     * Once it is closed, the same fd number may be handed out again to
     * another thread, and the handler removal would then refer to an fd
     * this driver no longer owns.
     */
    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL, NULL, NULL);
    close(s->fds[0]);
    close(s->fds[1]);

    rbd_close(s->image);
    rados_ioctx_destroy(s->io_ctx);
    g_free(s->snap);
    rados_shutdown(s->cluster);
}
581 |
|
582 |
/*
|
583 |
* Cancel aio. Since we don't reference acb in a non qemu threads,
|
584 |
* it is safe to access it here.
|
585 |
*/
|
586 |
static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb) |
587 |
{ |
588 |
RBDAIOCB *acb = (RBDAIOCB *) blockacb; |
589 |
acb->cancelled = 1;
|
590 |
|
591 |
while (acb->status == -EINPROGRESS) {
|
592 |
qemu_aio_wait(); |
593 |
} |
594 |
|
595 |
qemu_aio_release(acb); |
596 |
} |
597 |
|
598 |
static const AIOCBInfo rbd_aiocb_info = { |
599 |
.aiocb_size = sizeof(RBDAIOCB),
|
600 |
.cancel = qemu_rbd_aio_cancel, |
601 |
}; |
602 |
|
603 |
/*
 * Send the pointer @rcb through the write end of the notification pipe so
 * that qemu_rbd_aio_event_reader() can complete the operation in qemu
 * context.
 *
 * The write is retried on EINTR; on EAGAIN the function waits with
 * select() until the pipe is writable and tries again.
 *
 * Returns the (non-negative) result of write() on success, a negative
 * value if write() failed with any other error.
 */
static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb)
{
    int wr_fd = s->fds[RBD_FD_WRITE];
    int rc = 0;

    for (;;) {
        fd_set writable;

        /* the pointer value itself is the message */
        rc = write(wr_fd, (void *)&rcb, sizeof(rcb));
        if (rc >= 0) {
            break;
        }
        if (errno == EINTR) {
            continue;
        }
        if (errno != EAGAIN) {
            break;
        }

        /* pipe is full: block until there is room again */
        FD_ZERO(&writable);
        FD_SET(wr_fd, &writable);
        do {
            rc = select(wr_fd + 1, NULL, &writable, NULL, NULL);
        } while (rc < 0 && errno == EINTR);
    }

    return rc;
}
632 |
|
633 |
/*
|
634 |
* This is the callback function for rbd_aio_read and _write
|
635 |
*
|
636 |
* Note: this function is being called from a non qemu thread so
|
637 |
* we need to be careful about what we do here. Generally we only
|
638 |
* write to the block notification pipe, and do the rest of the
|
639 |
* io completion handling from qemu_rbd_aio_event_reader() which
|
640 |
* runs in a qemu context.
|
641 |
*/
|
642 |
static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) |
643 |
{ |
644 |
int ret;
|
645 |
rcb->ret = rbd_aio_get_return_value(c); |
646 |
rbd_aio_release(c); |
647 |
ret = qemu_rbd_send_pipe(rcb->s, rcb); |
648 |
if (ret < 0) { |
649 |
error_report("failed writing to acb->s->fds");
|
650 |
g_free(rcb); |
651 |
} |
652 |
} |
653 |
|
654 |
/* Callback when all queued rbd_aio requests are complete */
|
655 |
|
656 |
static void rbd_aio_bh_cb(void *opaque) |
657 |
{ |
658 |
RBDAIOCB *acb = opaque; |
659 |
|
660 |
if (acb->cmd == RBD_AIO_READ) {
|
661 |
qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
|
662 |
} |
663 |
qemu_vfree(acb->bounce); |
664 |
acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); |
665 |
qemu_bh_delete(acb->bh); |
666 |
acb->bh = NULL;
|
667 |
acb->status = 0;
|
668 |
|
669 |
if (!acb->cancelled) {
|
670 |
qemu_aio_release(acb); |
671 |
} |
672 |
} |
673 |
|
674 |
static int rbd_aio_discard_wrapper(rbd_image_t image, |
675 |
uint64_t off, |
676 |
uint64_t len, |
677 |
rbd_completion_t comp) |
678 |
{ |
679 |
#ifdef LIBRBD_SUPPORTS_DISCARD
|
680 |
return rbd_aio_discard(image, off, len, comp);
|
681 |
#else
|
682 |
return -ENOTSUP;
|
683 |
#endif
|
684 |
} |
685 |
|
686 |
static int rbd_aio_flush_wrapper(rbd_image_t image, |
687 |
rbd_completion_t comp) |
688 |
{ |
689 |
#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
|
690 |
return rbd_aio_flush(image, comp);
|
691 |
#else
|
692 |
return -ENOTSUP;
|
693 |
#endif
|
694 |
} |
695 |
|
696 |
static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
|
697 |
int64_t sector_num, |
698 |
QEMUIOVector *qiov, |
699 |
int nb_sectors,
|
700 |
BlockDriverCompletionFunc *cb, |
701 |
void *opaque,
|
702 |
RBDAIOCmd cmd) |
703 |
{ |
704 |
RBDAIOCB *acb; |
705 |
RADOSCB *rcb; |
706 |
rbd_completion_t c; |
707 |
int64_t off, size; |
708 |
char *buf;
|
709 |
int r;
|
710 |
|
711 |
BDRVRBDState *s = bs->opaque; |
712 |
|
713 |
acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque); |
714 |
acb->cmd = cmd; |
715 |
acb->qiov = qiov; |
716 |
if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
|
717 |
acb->bounce = NULL;
|
718 |
} else {
|
719 |
acb->bounce = qemu_blockalign(bs, qiov->size); |
720 |
} |
721 |
acb->ret = 0;
|
722 |
acb->error = 0;
|
723 |
acb->s = s; |
724 |
acb->cancelled = 0;
|
725 |
acb->bh = NULL;
|
726 |
acb->status = -EINPROGRESS; |
727 |
|
728 |
if (cmd == RBD_AIO_WRITE) {
|
729 |
qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
|
730 |
} |
731 |
|
732 |
buf = acb->bounce; |
733 |
|
734 |
off = sector_num * BDRV_SECTOR_SIZE; |
735 |
size = nb_sectors * BDRV_SECTOR_SIZE; |
736 |
|
737 |
rcb = g_malloc(sizeof(RADOSCB));
|
738 |
rcb->done = 0;
|
739 |
rcb->acb = acb; |
740 |
rcb->buf = buf; |
741 |
rcb->s = acb->s; |
742 |
rcb->size = size; |
743 |
r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c); |
744 |
if (r < 0) { |
745 |
goto failed;
|
746 |
} |
747 |
|
748 |
switch (cmd) {
|
749 |
case RBD_AIO_WRITE:
|
750 |
r = rbd_aio_write(s->image, off, size, buf, c); |
751 |
break;
|
752 |
case RBD_AIO_READ:
|
753 |
r = rbd_aio_read(s->image, off, size, buf, c); |
754 |
break;
|
755 |
case RBD_AIO_DISCARD:
|
756 |
r = rbd_aio_discard_wrapper(s->image, off, size, c); |
757 |
break;
|
758 |
case RBD_AIO_FLUSH:
|
759 |
r = rbd_aio_flush_wrapper(s->image, c); |
760 |
break;
|
761 |
default:
|
762 |
r = -EINVAL; |
763 |
} |
764 |
|
765 |
if (r < 0) { |
766 |
goto failed;
|
767 |
} |
768 |
|
769 |
return &acb->common;
|
770 |
|
771 |
failed:
|
772 |
g_free(rcb); |
773 |
qemu_aio_release(acb); |
774 |
return NULL; |
775 |
} |
776 |
|
777 |
static BlockDriverAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
|
778 |
int64_t sector_num, |
779 |
QEMUIOVector *qiov, |
780 |
int nb_sectors,
|
781 |
BlockDriverCompletionFunc *cb, |
782 |
void *opaque)
|
783 |
{ |
784 |
return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
|
785 |
RBD_AIO_READ); |
786 |
} |
787 |
|
788 |
static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
|
789 |
int64_t sector_num, |
790 |
QEMUIOVector *qiov, |
791 |
int nb_sectors,
|
792 |
BlockDriverCompletionFunc *cb, |
793 |
void *opaque)
|
794 |
{ |
795 |
return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
|
796 |
RBD_AIO_WRITE); |
797 |
} |
798 |
|
799 |
#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
|
800 |
static BlockDriverAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
|
801 |
BlockDriverCompletionFunc *cb, |
802 |
void *opaque)
|
803 |
{ |
804 |
return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH); |
805 |
} |
806 |
|
807 |
#else
|
808 |
|
809 |
/*
 * Synchronous flush used when asynchronous flush is not available.
 * rbd_flush() was added in librbd 0.1.1; with older versions this is a
 * no-op that reports success.
 */
static int qemu_rbd_co_flush(BlockDriverState *bs)
{
#if LIBRBD_VERSION_CODE < LIBRBD_VERSION(0, 1, 1)
    return 0;
#else
    /* rbd_flush added in 0.1.1 */
    BDRVRBDState *s = bs->opaque;
    return rbd_flush(s->image);
#endif
}
819 |
#endif
|
820 |
|
821 |
/*
 * Fill @bdi->cluster_size with the object size reported by rbd_stat().
 * Returns 0 on success or the negative result of rbd_stat().
 */
static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVRBDState *s = bs->opaque;
    rbd_image_info_t info;
    int rc = rbd_stat(s->image, &info, sizeof(info));

    if (rc >= 0) {
        bdi->cluster_size = info.obj_size;
        rc = 0;
    }
    return rc;
}
835 |
|
836 |
/*
 * Return the image size reported by rbd_stat(), or the negative result
 * of rbd_stat() on failure.
 */
static int64_t qemu_rbd_getlength(BlockDriverState *bs)
{
    BDRVRBDState *s = bs->opaque;
    rbd_image_info_t info;
    int rc = rbd_stat(s->image, &info, sizeof(info));

    if (rc < 0) {
        return rc;
    }
    return info.size;
}
849 |
|
850 |
/*
 * Resize the image to @offset bytes with rbd_resize().
 * Returns 0 on success or the negative result of rbd_resize().
 */
static int qemu_rbd_truncate(BlockDriverState *bs, int64_t offset)
{
    BDRVRBDState *s = bs->opaque;
    int rc = rbd_resize(s->image, offset);

    return rc < 0 ? rc : 0;
}
862 |
|
863 |
static int qemu_rbd_snap_create(BlockDriverState *bs, |
864 |
QEMUSnapshotInfo *sn_info) |
865 |
{ |
866 |
BDRVRBDState *s = bs->opaque; |
867 |
int r;
|
868 |
|
869 |
if (sn_info->name[0] == '\0') { |
870 |
return -EINVAL; /* we need a name for rbd snapshots */ |
871 |
} |
872 |
|
873 |
/*
|
874 |
* rbd snapshots are using the name as the user controlled unique identifier
|
875 |
* we can't use the rbd snapid for that purpose, as it can't be set
|
876 |
*/
|
877 |
if (sn_info->id_str[0] != '\0' && |
878 |
strcmp(sn_info->id_str, sn_info->name) != 0) {
|
879 |
return -EINVAL;
|
880 |
} |
881 |
|
882 |
if (strlen(sn_info->name) >= sizeof(sn_info->id_str)) { |
883 |
return -ERANGE;
|
884 |
} |
885 |
|
886 |
r = rbd_snap_create(s->image, sn_info->name); |
887 |
if (r < 0) { |
888 |
error_report("failed to create snap: %s", strerror(-r));
|
889 |
return r;
|
890 |
} |
891 |
|
892 |
return 0; |
893 |
} |
894 |
|
895 |
static int qemu_rbd_snap_remove(BlockDriverState *bs, |
896 |
const char *snapshot_id, |
897 |
const char *snapshot_name, |
898 |
Error **errp) |
899 |
{ |
900 |
BDRVRBDState *s = bs->opaque; |
901 |
int r;
|
902 |
|
903 |
if (!snapshot_name) {
|
904 |
error_setg(errp, "rbd need a valid snapshot name");
|
905 |
return -EINVAL;
|
906 |
} |
907 |
|
908 |
/* If snapshot_id is specified, it must be equal to name, see
|
909 |
qemu_rbd_snap_list() */
|
910 |
if (snapshot_id && strcmp(snapshot_id, snapshot_name)) {
|
911 |
error_setg(errp, |
912 |
"rbd do not support snapshot id, it should be NULL or "
|
913 |
"equal to snapshot name");
|
914 |
return -EINVAL;
|
915 |
} |
916 |
|
917 |
r = rbd_snap_remove(s->image, snapshot_name); |
918 |
if (r < 0) { |
919 |
error_setg_errno(errp, -r, "Failed to remove the snapshot");
|
920 |
} |
921 |
return r;
|
922 |
} |
923 |
|
924 |
/*
 * Roll the image back to snapshot @snapshot_name.
 * Returns the result of rbd_snap_rollback().
 */
static int qemu_rbd_snap_rollback(BlockDriverState *bs,
                                  const char *snapshot_name)
{
    BDRVRBDState *s = bs->opaque;

    return rbd_snap_rollback(s->image, snapshot_name);
}
933 |
|
934 |
static int qemu_rbd_snap_list(BlockDriverState *bs, |
935 |
QEMUSnapshotInfo **psn_tab) |
936 |
{ |
937 |
BDRVRBDState *s = bs->opaque; |
938 |
QEMUSnapshotInfo *sn_info, *sn_tab = NULL;
|
939 |
int i, snap_count;
|
940 |
rbd_snap_info_t *snaps; |
941 |
int max_snaps = RBD_MAX_SNAPS;
|
942 |
|
943 |
do {
|
944 |
snaps = g_malloc(sizeof(*snaps) * max_snaps);
|
945 |
snap_count = rbd_snap_list(s->image, snaps, &max_snaps); |
946 |
if (snap_count < 0) { |
947 |
g_free(snaps); |
948 |
} |
949 |
} while (snap_count == -ERANGE);
|
950 |
|
951 |
if (snap_count <= 0) { |
952 |
goto done;
|
953 |
} |
954 |
|
955 |
sn_tab = g_malloc0(snap_count * sizeof(QEMUSnapshotInfo));
|
956 |
|
957 |
for (i = 0; i < snap_count; i++) { |
958 |
const char *snap_name = snaps[i].name; |
959 |
|
960 |
sn_info = sn_tab + i; |
961 |
pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), snap_name);
|
962 |
pstrcpy(sn_info->name, sizeof(sn_info->name), snap_name);
|
963 |
|
964 |
sn_info->vm_state_size = snaps[i].size; |
965 |
sn_info->date_sec = 0;
|
966 |
sn_info->date_nsec = 0;
|
967 |
sn_info->vm_clock_nsec = 0;
|
968 |
} |
969 |
rbd_snap_list_end(snaps); |
970 |
|
971 |
done:
|
972 |
*psn_tab = sn_tab; |
973 |
return snap_count;
|
974 |
} |
975 |
|
976 |
#ifdef LIBRBD_SUPPORTS_DISCARD
|
977 |
static BlockDriverAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
|
978 |
int64_t sector_num, |
979 |
int nb_sectors,
|
980 |
BlockDriverCompletionFunc *cb, |
981 |
void *opaque)
|
982 |
{ |
983 |
return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque, |
984 |
RBD_AIO_DISCARD); |
985 |
} |
986 |
#endif
|
987 |
|
988 |
static QEMUOptionParameter qemu_rbd_create_options[] = {
|
989 |
{ |
990 |
.name = BLOCK_OPT_SIZE, |
991 |
.type = OPT_SIZE, |
992 |
.help = "Virtual disk size"
|
993 |
}, |
994 |
{ |
995 |
.name = BLOCK_OPT_CLUSTER_SIZE, |
996 |
.type = OPT_SIZE, |
997 |
.help = "RBD object size"
|
998 |
}, |
999 |
{NULL}
|
1000 |
}; |
1001 |
|
1002 |
static BlockDriver bdrv_rbd = {
|
1003 |
.format_name = "rbd",
|
1004 |
.instance_size = sizeof(BDRVRBDState),
|
1005 |
.bdrv_file_open = qemu_rbd_open, |
1006 |
.bdrv_close = qemu_rbd_close, |
1007 |
.bdrv_create = qemu_rbd_create, |
1008 |
.bdrv_has_zero_init = bdrv_has_zero_init_1, |
1009 |
.bdrv_get_info = qemu_rbd_getinfo, |
1010 |
.create_options = qemu_rbd_create_options, |
1011 |
.bdrv_getlength = qemu_rbd_getlength, |
1012 |
.bdrv_truncate = qemu_rbd_truncate, |
1013 |
.protocol_name = "rbd",
|
1014 |
|
1015 |
.bdrv_aio_readv = qemu_rbd_aio_readv, |
1016 |
.bdrv_aio_writev = qemu_rbd_aio_writev, |
1017 |
|
1018 |
#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
|
1019 |
.bdrv_aio_flush = qemu_rbd_aio_flush, |
1020 |
#else
|
1021 |
.bdrv_co_flush_to_disk = qemu_rbd_co_flush, |
1022 |
#endif
|
1023 |
|
1024 |
#ifdef LIBRBD_SUPPORTS_DISCARD
|
1025 |
.bdrv_aio_discard = qemu_rbd_aio_discard, |
1026 |
#endif
|
1027 |
|
1028 |
.bdrv_snapshot_create = qemu_rbd_snap_create, |
1029 |
.bdrv_snapshot_delete = qemu_rbd_snap_remove, |
1030 |
.bdrv_snapshot_list = qemu_rbd_snap_list, |
1031 |
.bdrv_snapshot_goto = qemu_rbd_snap_rollback, |
1032 |
}; |
1033 |
|
1034 |
static void bdrv_rbd_init(void) |
1035 |
{ |
1036 |
bdrv_register(&bdrv_rbd); |
1037 |
} |
1038 |
|
1039 |
block_init(bdrv_rbd_init); |