Statistics
| Branch: | Revision:

root / block / rbd.c @ 6a1751b7

History | View | Annotate | Download (25.9 kB)

1 f27aaf4b Christian Brunner
/*
2 f27aaf4b Christian Brunner
 * QEMU Block driver for RADOS (Ceph)
3 f27aaf4b Christian Brunner
 *
4 ad32e9c0 Josh Durgin
 * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>,
5 ad32e9c0 Josh Durgin
 *                         Josh Durgin <josh.durgin@dreamhost.com>
6 f27aaf4b Christian Brunner
 *
7 f27aaf4b Christian Brunner
 * This work is licensed under the terms of the GNU GPL, version 2.  See
8 f27aaf4b Christian Brunner
 * the COPYING file in the top-level directory.
9 f27aaf4b Christian Brunner
 *
10 6b620ca3 Paolo Bonzini
 * Contributions after 2012-01-13 are licensed under the terms of the
11 6b620ca3 Paolo Bonzini
 * GNU GPL, version 2 or (at your option) any later version.
12 f27aaf4b Christian Brunner
 */
13 f27aaf4b Christian Brunner
14 ad32e9c0 Josh Durgin
#include <inttypes.h>
15 ad32e9c0 Josh Durgin
16 f27aaf4b Christian Brunner
#include "qemu-common.h"
17 1de7afc9 Paolo Bonzini
#include "qemu/error-report.h"
18 737e150e Paolo Bonzini
#include "block/block_int.h"
19 f27aaf4b Christian Brunner
20 ad32e9c0 Josh Durgin
#include <rbd/librbd.h>
21 f27aaf4b Christian Brunner
22 f27aaf4b Christian Brunner
/*
23 f27aaf4b Christian Brunner
 * When specifying the image filename use:
24 f27aaf4b Christian Brunner
 *
25 fab5cf59 Josh Durgin
 * rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]]
26 f27aaf4b Christian Brunner
 *
27 9e1fbcde Sage Weil
 * poolname must be the name of an existing rados pool.
28 f27aaf4b Christian Brunner
 *
29 9e1fbcde Sage Weil
 * devicename is the name of the rbd image.
30 f27aaf4b Christian Brunner
 *
31 9e1fbcde Sage Weil
 * Each option given is used to configure rados, and may be any valid
32 9e1fbcde Sage Weil
 * Ceph option, "id", or "conf".
33 fab5cf59 Josh Durgin
 *
34 9e1fbcde Sage Weil
 * The "id" option indicates what user we should authenticate as to
35 9e1fbcde Sage Weil
 * the Ceph cluster.  If it is excluded we will use the Ceph default
36 9e1fbcde Sage Weil
 * (normally 'admin').
37 f27aaf4b Christian Brunner
 *
38 9e1fbcde Sage Weil
 * The "conf" option specifies a Ceph configuration file to read.  If
39 9e1fbcde Sage Weil
 * it is not specified, we will read from the default Ceph locations
40 9e1fbcde Sage Weil
 * (e.g., /etc/ceph/ceph.conf).  To avoid reading _any_ configuration
41 9e1fbcde Sage Weil
 * file, specify conf=/dev/null.
42 f27aaf4b Christian Brunner
 *
43 9e1fbcde Sage Weil
 * Configuration values containing :, @, or = can be escaped with a
44 9e1fbcde Sage Weil
 * leading "\".
45 f27aaf4b Christian Brunner
 */
46 f27aaf4b Christian Brunner
47 787f3133 Josh Durgin
/* rbd_aio_discard added in 0.1.2 */
48 787f3133 Josh Durgin
#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 2)
49 787f3133 Josh Durgin
#define LIBRBD_SUPPORTS_DISCARD
50 787f3133 Josh Durgin
#else
51 787f3133 Josh Durgin
#undef LIBRBD_SUPPORTS_DISCARD
52 787f3133 Josh Durgin
#endif
53 787f3133 Josh Durgin
54 f27aaf4b Christian Brunner
#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
55 f27aaf4b Christian Brunner
56 ad32e9c0 Josh Durgin
#define RBD_MAX_CONF_NAME_SIZE 128
57 ad32e9c0 Josh Durgin
#define RBD_MAX_CONF_VAL_SIZE 512
58 ad32e9c0 Josh Durgin
#define RBD_MAX_CONF_SIZE 1024
59 ad32e9c0 Josh Durgin
#define RBD_MAX_POOL_NAME_SIZE 128
60 ad32e9c0 Josh Durgin
#define RBD_MAX_SNAP_NAME_SIZE 128
61 ad32e9c0 Josh Durgin
#define RBD_MAX_SNAPS 100
62 ad32e9c0 Josh Durgin
63 787f3133 Josh Durgin
typedef enum {
64 787f3133 Josh Durgin
    RBD_AIO_READ,
65 787f3133 Josh Durgin
    RBD_AIO_WRITE,
66 dc7588c1 Josh Durgin
    RBD_AIO_DISCARD,
67 dc7588c1 Josh Durgin
    RBD_AIO_FLUSH
68 787f3133 Josh Durgin
} RBDAIOCmd;
69 787f3133 Josh Durgin
70 f27aaf4b Christian Brunner
typedef struct RBDAIOCB {
71 f27aaf4b Christian Brunner
    BlockDriverAIOCB common;
72 f27aaf4b Christian Brunner
    QEMUBH *bh;
73 08448d51 Stefan Priebe
    int64_t ret;
74 f27aaf4b Christian Brunner
    QEMUIOVector *qiov;
75 f27aaf4b Christian Brunner
    char *bounce;
76 787f3133 Josh Durgin
    RBDAIOCmd cmd;
77 f27aaf4b Christian Brunner
    int64_t sector_num;
78 f27aaf4b Christian Brunner
    int error;
79 f27aaf4b Christian Brunner
    struct BDRVRBDState *s;
80 f27aaf4b Christian Brunner
    int cancelled;
81 473c7f02 Stefan Priebe
    int status;
82 f27aaf4b Christian Brunner
} RBDAIOCB;
83 f27aaf4b Christian Brunner
84 f27aaf4b Christian Brunner
typedef struct RADOSCB {
85 f27aaf4b Christian Brunner
    int rcbid;
86 f27aaf4b Christian Brunner
    RBDAIOCB *acb;
87 f27aaf4b Christian Brunner
    struct BDRVRBDState *s;
88 f27aaf4b Christian Brunner
    int done;
89 ad32e9c0 Josh Durgin
    int64_t size;
90 f27aaf4b Christian Brunner
    char *buf;
91 08448d51 Stefan Priebe
    int64_t ret;
92 f27aaf4b Christian Brunner
} RADOSCB;
93 f27aaf4b Christian Brunner
94 f27aaf4b Christian Brunner
#define RBD_FD_READ 0
95 f27aaf4b Christian Brunner
#define RBD_FD_WRITE 1
96 f27aaf4b Christian Brunner
97 f27aaf4b Christian Brunner
typedef struct BDRVRBDState {
98 f27aaf4b Christian Brunner
    int fds[2];
99 ad32e9c0 Josh Durgin
    rados_t cluster;
100 ad32e9c0 Josh Durgin
    rados_ioctx_t io_ctx;
101 ad32e9c0 Josh Durgin
    rbd_image_t image;
102 ad32e9c0 Josh Durgin
    char name[RBD_MAX_IMAGE_NAME_SIZE];
103 ad32e9c0 Josh Durgin
    char *snap;
104 f27aaf4b Christian Brunner
    int event_reader_pos;
105 f27aaf4b Christian Brunner
    RADOSCB *event_rcb;
106 f27aaf4b Christian Brunner
} BDRVRBDState;
107 f27aaf4b Christian Brunner
108 f27aaf4b Christian Brunner
static void rbd_aio_bh_cb(void *opaque);
109 f27aaf4b Christian Brunner
110 ad32e9c0 Josh Durgin
/*
 * Copy the next token of @src into @dst.
 *
 * When @delim is not '\0', the token ends at the first occurrence of @delim
 * that is not preceded by a backslash; that delimiter is overwritten with
 * '\0' in @src and *@p is set to the character following it.  When no
 * delimiter is found (or @delim is '\0'), the token is the rest of @src and
 * *@p is set to NULL.
 *
 * @name is only used in the error messages.
 *
 * Returns 0 on success, -EINVAL if the token is empty or does not fit
 * (including its terminator) into @dst_len bytes.
 */
static int qemu_rbd_next_tok(char *dst, int dst_len,
                             char *src, char delim,
                             const char *name,
                             char **p)
{
    char *cursor;
    int tok_len;

    *p = NULL;

    if (delim != '\0') {
        cursor = src;
        while (*cursor != '\0' && *cursor != delim) {
            /* a backslash protects the following character */
            if (*cursor == '\\' && cursor[1] != '\0') {
                cursor++;
            }
            cursor++;
        }
        if (*cursor == delim) {
            *cursor = '\0';
            *p = cursor + 1;
        }
    }

    tok_len = strlen(src);
    if (tok_len == 0) {
        error_report("%s too short", name);
        return -EINVAL;
    }
    if (tok_len >= dst_len) {
        error_report("%s too long", name);
        return -EINVAL;
    }

    pstrcpy(dst, dst_len, src);

    return 0;
}
147 f27aaf4b Christian Brunner
148 16a06b24 Sage Weil
/*
 * Remove backslash escapes from @src in place.
 *
 * Every backslash that is followed by another character is dropped and
 * that following character is kept literally.  A backslash that is the
 * last character of the string is kept as is.
 */
static void qemu_rbd_unescape(char *src)
{
    const char *in = src;
    char *out = src;

    while (*in != '\0') {
        if (in[0] == '\\' && in[1] != '\0') {
            in++;
        }
        *out++ = *in++;
    }
    *out = '\0';
}
160 16a06b24 Sage Weil
161 ad32e9c0 Josh Durgin
/*
 * Return non-zero if @s contains an unescaped '@' before its first
 * unescaped ':' (or before the end of the string if there is no ':').
 */
static int qemu_rbd_has_snap(const char *s)
{
    for (; *s != '\0' && *s != ':'; s++) {
        if (*s == '@') {
            return 1;
        }
        if (*s == '\\' && s[1] != '\0') {
            s++;
        }
    }
    return 0;
}

/*
 * Split an "rbd:pool/image[@snap][:options]" filename into its parts.
 *
 * @pool, @name, @snap and @conf receive the unescaped pool name, image
 * name, snapshot name and the raw (still escaped) option string; the
 * matching *_len arguments give the sizes of those buffers.  @snap and
 * @conf are set to the empty string when the filename does not contain
 * them.
 *
 * Returns 0 on success and a negative errno value if the filename does not
 * start with "rbd:", a mandatory part is missing, or a part does not fit
 * into its buffer.  The output buffers must not be relied upon on failure.
 */
static int qemu_rbd_parsename(const char *filename,
                              char *pool, int pool_len,
                              char *snap, int snap_len,
                              char *name, int name_len,
                              char *conf, int conf_len)
{
    const char *start;
    char *p, *buf;
    int ret;

    if (!strstart(filename, "rbd:", &start)) {
        return -EINVAL;
    }

    /* the tokenizer writes into its input, so work on a private copy */
    buf = g_strdup(start);
    p = buf;
    *snap = '\0';
    *conf = '\0';

    ret = qemu_rbd_next_tok(pool, pool_len, p, '/', "pool name", &p);
    if (ret < 0 || !p) {
        ret = -EINVAL;
        goto done;
    }
    qemu_rbd_unescape(pool);

    /*
     * Only look for a snapshot separator in the image part: an escaped '@'
     * or an '@' inside the option list must not be mistaken for one,
     * otherwise the '@' tokenizer below may find no delimiter and leave
     * p == NULL for the snapshot tokenizer.
     */
    if (qemu_rbd_has_snap(p)) {
        ret = qemu_rbd_next_tok(name, name_len, p, '@', "object name", &p);
        if (ret < 0) {
            goto done;
        }
        if (!p) {
            ret = -EINVAL;
            goto done;
        }
        qemu_rbd_unescape(name);

        ret = qemu_rbd_next_tok(snap, snap_len, p, ':', "snap name", &p);
        if (ret < 0) {
            goto done;
        }
        qemu_rbd_unescape(snap);
    } else {
        ret = qemu_rbd_next_tok(name, name_len, p, ':', "object name", &p);
        if (ret < 0) {
            /* name was not written; do not touch it */
            goto done;
        }
        qemu_rbd_unescape(name);
    }
    if (!p) {
        /* no option list present */
        goto done;
    }

    ret = qemu_rbd_next_tok(conf, conf_len, p, '\0', "configuration", &p);

done:
    g_free(buf);
    return ret;
}
208 f27aaf4b Christian Brunner
209 7c7e9df0 Sage Weil
/*
 * Look for an "id=<value>" option in the ':'-separated option string @conf.
 *
 * Options are separated by ':' characters that are not escaped with a
 * backslash, matching the escaping rules used for the rest of the option
 * string.  The value of the first "id" option is copied, with escapes
 * removed, into @clientname, which must be large enough to hold it (a
 * buffer as large as @conf always is).
 *
 * Returns @clientname if an "id" option was found, NULL otherwise.
 */
static char *qemu_rbd_parse_clientname(const char *conf, char *clientname)
{
    const char *p = conf;

    while (*p) {
        const char *end = p;

        /* find the end of this option: the next unescaped ':' or NUL */
        while (*end != '\0' && *end != ':') {
            if (*end == '\\' && end[1] != '\0') {
                end++;
            }
            end++;
        }

        if (strncmp(p, "id=", 3) == 0) {
            const char *src;
            char *dst = clientname;

            /* copy the value, dropping the escaping backslashes */
            for (src = p + 3; src < end; src++) {
                if (*src == '\\' && src + 1 < end) {
                    src++;
                }
                *dst++ = *src;
            }
            *dst = '\0';
            return clientname;
        }
        if (*end == '\0') {
            break;
        }
        p = end + 1;
    }
    return NULL;
}
236 7c7e9df0 Sage Weil
237 fab5cf59 Josh Durgin
/*
 * Apply the "name=value[:name=value...]" option string @conf to @cluster.
 *
 * "conf" names a configuration file that is read with
 * rados_conf_read_file(), "id" is skipped here, and every other option is
 * handed to rados_conf_set().
 *
 * Returns a negative value on the first option that cannot be parsed or
 * applied; otherwise the (non-negative) result of the last step.
 */
static int qemu_rbd_set_conf(rados_t cluster, const char *conf)
{
    char name[RBD_MAX_CONF_NAME_SIZE];
    char value[RBD_MAX_CONF_VAL_SIZE];
    char *copy = g_strdup(conf);    /* the tokenizer modifies its input */
    char *cur = copy;
    int ret = 0;

    while (cur != NULL) {
        ret = qemu_rbd_next_tok(name, sizeof(name), cur,
                                '=', "conf option name", &cur);
        if (ret < 0) {
            goto out;
        }
        qemu_rbd_unescape(name);

        if (cur == NULL) {
            error_report("conf option %s has no value", name);
            ret = -EINVAL;
            goto out;
        }

        ret = qemu_rbd_next_tok(value, sizeof(value), cur,
                                ':', "conf option value", &cur);
        if (ret < 0) {
            goto out;
        }
        qemu_rbd_unescape(value);

        if (!strcmp(name, "id")) {
            /* ignore, this is parsed by qemu_rbd_parse_clientname() */
            continue;
        }

        if (!strcmp(name, "conf")) {
            ret = rados_conf_read_file(cluster, value);
            if (ret < 0) {
                error_report("error reading conf file %s", value);
                goto out;
            }
            continue;
        }

        ret = rados_conf_set(cluster, name, value);
        if (ret < 0) {
            error_report("invalid conf option %s", name);
            ret = -EINVAL;
            goto out;
        }
    }

out:
    g_free(copy);
    return ret;
}
289 fab5cf59 Josh Durgin
290 ad32e9c0 Josh Durgin
/*
 * Create a new rbd image.
 *
 * @filename is an "rbd:pool/image[:options]" specification.  From @options
 * the image size (BLOCK_OPT_SIZE) and the object size
 * (BLOCK_OPT_CLUSTER_SIZE) are used; the object size must be a power of
 * two and at least 4096, and is passed to rbd_create() as its base-2
 * logarithm.  An object size of 0 leaves the order at 0.
 *
 * Returns the result of rbd_create() on success, -EINVAL for a malformed
 * filename or object size, and -EIO if the cluster cannot be set up.
 */
static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options)
{
    int64_t bytes = 0;
    int64_t objsize;
    int obj_order = 0;
    char pool[RBD_MAX_POOL_NAME_SIZE];
    /* start out empty so the buffer always holds a valid string */
    char name[RBD_MAX_IMAGE_NAME_SIZE] = "";
    char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
    char conf[RBD_MAX_CONF_SIZE];
    char clientname_buf[RBD_MAX_CONF_SIZE];
    char *clientname;
    rados_t cluster;
    rados_ioctx_t io_ctx;
    int ret;

    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                           snap_buf, sizeof(snap_buf),
                           name, sizeof(name),
                           conf, sizeof(conf)) < 0) {
        return -EINVAL;
    }

    /* Read out options */
    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
            bytes = options->value.n;
        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
            if (options->value.n) {
                objsize = options->value.n;
                if ((objsize - 1) & objsize) {    /* not a power of 2? */
                    error_report("obj size needs to be power of 2");
                    return -EINVAL;
                }
                if (objsize < 4096) {
                    error_report("obj size too small");
                    return -EINVAL;
                }
                /*
                 * log2 of a power of two.  ffs() takes an int and would
                 * truncate object sizes of 4 GiB and above, so count the
                 * shifts on the full 64-bit value instead.
                 */
                obj_order = 0;
                while ((objsize >> obj_order) > 1) {
                    obj_order++;
                }
            }
        }
        options++;
    }

    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
    if (rados_create(&cluster, clientname) < 0) {
        error_report("error initializing");
        return -EIO;
    }

    if (strstr(conf, "conf=") == NULL) {
        /* try default location, but ignore failure */
        rados_conf_read_file(cluster, NULL);
    }

    if (conf[0] != '\0' &&
        qemu_rbd_set_conf(cluster, conf) < 0) {
        error_report("error setting config options");
        rados_shutdown(cluster);
        return -EIO;
    }

    if (rados_connect(cluster) < 0) {
        error_report("error connecting");
        rados_shutdown(cluster);
        return -EIO;
    }

    if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) {
        error_report("error opening pool %s", pool);
        rados_shutdown(cluster);
        return -EIO;
    }

    ret = rbd_create(io_ctx, name, bytes, &obj_order);
    rados_ioctx_destroy(io_ctx);
    rados_shutdown(cluster);

    return ret;
}
369 f27aaf4b Christian Brunner
370 f27aaf4b Christian Brunner
/*
371 ad32e9c0 Josh Durgin
 * This aio completion is being called from qemu_rbd_aio_event_reader()
372 ad32e9c0 Josh Durgin
 * and runs in qemu context. It stores the result of the request in the
373 f27aaf4b Christian Brunner
 * acb and schedules a bh that finishes the request.
374 f27aaf4b Christian Brunner
 */
375 ad32e9c0 Josh Durgin
/*
 * Transfer the outcome of a finished rados operation from @rcb to its acb.
 *
 * A negative result is stored in acb->ret and marks the acb as failed.
 * Otherwise, unless the acb already carries an error, acb->ret is set to
 * the request size, or to the result itself for a read that returned at
 * least the request size.  For reads, the part of the buffer that was not
 * filled (all of it on error) is zeroed.
 *
 * Finally a bottom half running rbd_aio_bh_cb() is created and scheduled
 * for the acb, and @rcb is freed.
 */
static void qemu_rbd_complete_aio(RADOSCB *rcb)
{
    RBDAIOCB *acb = rcb->acb;
    const int64_t result = rcb->ret;
    const int is_read = (acb->cmd == RBD_AIO_READ);

    if (result < 0) {
        if (is_read) {
            memset(rcb->buf, 0, rcb->size);
        }
        acb->ret = result;
        acb->error = 1;
    } else {
        int64_t done = rcb->size;

        if (is_read) {
            if (result < rcb->size) {
                /* short read: zero the tail that was not filled */
                memset(rcb->buf + result, 0, rcb->size - result);
            } else {
                done = result;
            }
        }
        if (!acb->error) {
            acb->ret = done;
        }
    }

    acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
    qemu_bh_schedule(acb->bh);
    g_free(rcb);
}
408 f27aaf4b Christian Brunner
409 f27aaf4b Christian Brunner
/*
410 f27aaf4b Christian Brunner
 * aio fd read handler. It runs in the qemu context and calls the
411 f27aaf4b Christian Brunner
 * completion handling of completed rados aio operations.
412 f27aaf4b Christian Brunner
 */
413 ad32e9c0 Josh Durgin
static void qemu_rbd_aio_event_reader(void *opaque)
414 f27aaf4b Christian Brunner
{
415 f27aaf4b Christian Brunner
    BDRVRBDState *s = opaque;
416 f27aaf4b Christian Brunner
417 f27aaf4b Christian Brunner
    ssize_t ret;
418 f27aaf4b Christian Brunner
419 f27aaf4b Christian Brunner
    do {
420 f27aaf4b Christian Brunner
        char *p = (char *)&s->event_rcb;
421 f27aaf4b Christian Brunner
422 f27aaf4b Christian Brunner
        /* now read the rcb pointer that was sent from a non qemu thread */
423 dfe80b07 Sage Weil
        ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos,
424 dfe80b07 Sage Weil
                   sizeof(s->event_rcb) - s->event_reader_pos);
425 dfe80b07 Sage Weil
        if (ret > 0) {
426 dfe80b07 Sage Weil
            s->event_reader_pos += ret;
427 dfe80b07 Sage Weil
            if (s->event_reader_pos == sizeof(s->event_rcb)) {
428 dfe80b07 Sage Weil
                s->event_reader_pos = 0;
429 dfe80b07 Sage Weil
                qemu_rbd_complete_aio(s->event_rcb);
430 f27aaf4b Christian Brunner
            }
431 f27aaf4b Christian Brunner
        }
432 f27aaf4b Christian Brunner
    } while (ret < 0 && errno == EINTR);
433 f27aaf4b Christian Brunner
}
434 f27aaf4b Christian Brunner
435 a9ccedc3 Kevin Wolf
/* TODO Convert to fine grained options */
436 a9ccedc3 Kevin Wolf
static QemuOptsList runtime_opts = {
437 a9ccedc3 Kevin Wolf
    .name = "rbd",
438 a9ccedc3 Kevin Wolf
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
439 a9ccedc3 Kevin Wolf
    .desc = {
440 a9ccedc3 Kevin Wolf
        {
441 a9ccedc3 Kevin Wolf
            .name = "filename",
442 a9ccedc3 Kevin Wolf
            .type = QEMU_OPT_STRING,
443 a9ccedc3 Kevin Wolf
            .help = "Specification of the rbd image",
444 a9ccedc3 Kevin Wolf
        },
445 a9ccedc3 Kevin Wolf
        { /* end of list */ }
446 a9ccedc3 Kevin Wolf
    },
447 a9ccedc3 Kevin Wolf
};
448 a9ccedc3 Kevin Wolf
449 56d1b4d2 Kevin Wolf
/*
 * Open the rbd image described by the "filename" entry of @options.
 *
 * Sets up the rados cluster handle, the pool I/O context and the image in
 * the driver state, and creates the non-blocking pipe whose read end is
 * watched by qemu_rbd_aio_event_reader().  rbd caching is switched off when
 * BDRV_O_NOCACHE is set in @flags and on otherwise.  Opening a snapshot
 * makes the device read-only.
 *
 * Returns 0 on success and a negative value on failure, in which case
 * everything acquired so far has been released again.
 */
static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags)
{
    BDRVRBDState *s = bs->opaque;
    char pool[RBD_MAX_POOL_NAME_SIZE];
    char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
    char conf[RBD_MAX_CONF_SIZE];
    char clientname_buf[RBD_MAX_CONF_SIZE];
    char *clientname;
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename;
    int r;

    opts = qemu_opts_create_nofail(&runtime_opts);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (error_is_set(&local_err)) {
        qerror_report_err(local_err);
        error_free(local_err);
        qemu_opts_del(opts);
        return -EINVAL;
    }

    filename = qemu_opt_get(opts, "filename");
    if (!filename) {
        /* the option is absent; do not hand NULL to the filename parser */
        error_report("no rbd image filename given");
        r = -EINVAL;
        goto failed_opts;
    }

    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                           snap_buf, sizeof(snap_buf),
                           s->name, sizeof(s->name),
                           conf, sizeof(conf)) < 0) {
        r = -EINVAL;
        goto failed_opts;
    }

    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
    r = rados_create(&s->cluster, clientname);
    if (r < 0) {
        error_report("error initializing");
        goto failed_opts;
    }

    s->snap = NULL;
    if (snap_buf[0] != '\0') {
        s->snap = g_strdup(snap_buf);
    }

    /*
     * Fallback to more conservative semantics if setting cache
     * options fails. Ignore errors from setting rbd_cache because the
     * only possible error is that the option does not exist, and
     * librbd defaults to no caching. If write through caching cannot
     * be set up, fall back to no caching.
     */
    if (flags & BDRV_O_NOCACHE) {
        rados_conf_set(s->cluster, "rbd_cache", "false");
    } else {
        rados_conf_set(s->cluster, "rbd_cache", "true");
    }

    if (strstr(conf, "conf=") == NULL) {
        /* try default location, but ignore failure */
        rados_conf_read_file(s->cluster, NULL);
    }

    if (conf[0] != '\0') {
        r = qemu_rbd_set_conf(s->cluster, conf);
        if (r < 0) {
            error_report("error setting config options");
            goto failed_shutdown;
        }
    }

    r = rados_connect(s->cluster);
    if (r < 0) {
        error_report("error connecting");
        goto failed_shutdown;
    }

    r = rados_ioctx_create(s->cluster, pool, &s->io_ctx);
    if (r < 0) {
        error_report("error opening pool %s", pool);
        goto failed_shutdown;
    }

    r = rbd_open(s->io_ctx, s->name, &s->image, s->snap);
    if (r < 0) {
        error_report("error reading header from %s", s->name);
        goto failed_open;
    }

    /* snapshots cannot be written to */
    bs->read_only = (s->snap != NULL);

    s->event_reader_pos = 0;
    r = qemu_pipe(s->fds);
    if (r < 0) {
        error_report("error opening eventfd");
        goto failed;
    }
    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader,
                            NULL, s);


    qemu_opts_del(opts);
    return 0;

    /* error exits: each label undoes one more step than the one below it */
failed:
    rbd_close(s->image);
failed_open:
    rados_ioctx_destroy(s->io_ctx);
failed_shutdown:
    rados_shutdown(s->cluster);
    g_free(s->snap);
failed_opts:
    qemu_opts_del(opts);
    return r;
}
565 f27aaf4b Christian Brunner
566 ad32e9c0 Josh Durgin
/*
 * Release everything qemu_rbd_open() set up: the notification pipe and its
 * read handler, the image, the pool I/O context, the snapshot name and the
 * cluster handle.
 */
static void qemu_rbd_close(BlockDriverState *bs)
{
    BDRVRBDState *s = bs->opaque;

    /*
     * Unregister the handler while the descriptor is still open: once it is
     * closed its number may be handed out again, and removing the handler
     * afterwards could then hit an unrelated descriptor.
     */
    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL, NULL, NULL);
    close(s->fds[0]);
    close(s->fds[1]);

    rbd_close(s->image);
    rados_ioctx_destroy(s->io_ctx);
    g_free(s->snap);
    rados_shutdown(s->cluster);
}
579 f27aaf4b Christian Brunner
580 f27aaf4b Christian Brunner
/*
581 f27aaf4b Christian Brunner
 * Cancel aio. Since we don't reference acb in non-qemu threads,
582 f27aaf4b Christian Brunner
 * it is safe to access it here.
583 f27aaf4b Christian Brunner
 */
584 ad32e9c0 Josh Durgin
/*
 * Cancel callback for rbd requests.
 *
 * Marks the request as cancelled, then keeps calling qemu_aio_wait() for as
 * long as its status is -EINPROGRESS, and finally releases the acb.
 */
static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb)
{
    RBDAIOCB *request = (RBDAIOCB *) blockacb;

    request->cancelled = 1;

    for (;;) {
        if (request->status != -EINPROGRESS) {
            break;
        }
        qemu_aio_wait();
    }

    qemu_aio_release(request);
}
595 f27aaf4b Christian Brunner
596 d7331bed Stefan Hajnoczi
static const AIOCBInfo rbd_aiocb_info = {
597 f27aaf4b Christian Brunner
    .aiocb_size = sizeof(RBDAIOCB),
598 ad32e9c0 Josh Durgin
    .cancel = qemu_rbd_aio_cancel,
599 f27aaf4b Christian Brunner
};
600 f27aaf4b Christian Brunner
601 ad32e9c0 Josh Durgin
/*
 * Hand a completed request over to the qemu thread by writing the rcb
 * pointer value into the write end of the notification pipe.
 *
 * EINTR is retried immediately; on EAGAIN we block in select() until the
 * pipe is writable again and then retry.  Any other write error ends the
 * loop.
 *
 * Returns the result of the last write()/select() call: >= 0 once the
 * pointer has been written, negative on an unrecoverable write error.
 */
static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb)
{
    const int wr_fd = s->fds[RBD_FD_WRITE];
    int rc;

    for (;;) {
        fd_set writable;

        /* send the op pointer to the qemu thread that is responsible
           for the aio/op completion. Must do it in a qemu thread context */
        rc = write(wr_fd, (void *)&rcb, sizeof(rcb));
        if (rc >= 0 || (errno != EINTR && errno != EAGAIN)) {
            break;
        }
        if (errno == EINTR) {
            continue;
        }

        /* EAGAIN: pipe is full, wait until it can take more data */
        FD_ZERO(&writable);
        FD_SET(wr_fd, &writable);
        do {
            rc = select(wr_fd + 1, NULL, &writable, NULL, NULL);
        } while (rc < 0 && errno == EINTR);
    }

    return rc;
}
630 ad32e9c0 Josh Durgin
631 ad32e9c0 Josh Durgin
/*
632 ad32e9c0 Josh Durgin
 * This is the callback function for rbd_aio_read and _write
633 ad32e9c0 Josh Durgin
 *
634 ad32e9c0 Josh Durgin
 * Note: this function is being called from a non qemu thread so
635 ad32e9c0 Josh Durgin
 * we need to be careful about what we do here. Generally we only
636 ad32e9c0 Josh Durgin
 * write to the block notification pipe, and do the rest of the
637 ad32e9c0 Josh Durgin
 * io completion handling from qemu_rbd_aio_event_reader() which
638 ad32e9c0 Josh Durgin
 * runs in a qemu context.
639 ad32e9c0 Josh Durgin
 */
640 ad32e9c0 Josh Durgin
static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
641 ad32e9c0 Josh Durgin
{
642 ad32e9c0 Josh Durgin
    int ret;
643 ad32e9c0 Josh Durgin
    rcb->ret = rbd_aio_get_return_value(c);
644 ad32e9c0 Josh Durgin
    rbd_aio_release(c);
645 ad32e9c0 Josh Durgin
    ret = qemu_rbd_send_pipe(rcb->s, rcb);
646 f27aaf4b Christian Brunner
    if (ret < 0) {
647 ad32e9c0 Josh Durgin
        error_report("failed writing to acb->s->fds");
648 7267c094 Anthony Liguori
        g_free(rcb);
649 f27aaf4b Christian Brunner
    }
650 f27aaf4b Christian Brunner
}
651 f27aaf4b Christian Brunner
652 ad32e9c0 Josh Durgin
/* Callback when all queued rbd_aio requests are complete */
653 f27aaf4b Christian Brunner
654 f27aaf4b Christian Brunner
static void rbd_aio_bh_cb(void *opaque)
655 f27aaf4b Christian Brunner
{
656 f27aaf4b Christian Brunner
    RBDAIOCB *acb = opaque;
657 f27aaf4b Christian Brunner
658 787f3133 Josh Durgin
    if (acb->cmd == RBD_AIO_READ) {
659 03396148 Michael Tokarev
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
660 f27aaf4b Christian Brunner
    }
661 f27aaf4b Christian Brunner
    qemu_vfree(acb->bounce);
662 f27aaf4b Christian Brunner
    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
663 f27aaf4b Christian Brunner
    qemu_bh_delete(acb->bh);
664 f27aaf4b Christian Brunner
    acb->bh = NULL;
665 473c7f02 Stefan Priebe
    acb->status = 0;
666 f27aaf4b Christian Brunner
667 473c7f02 Stefan Priebe
    if (!acb->cancelled) {
668 473c7f02 Stefan Priebe
        qemu_aio_release(acb);
669 473c7f02 Stefan Priebe
    }
670 f27aaf4b Christian Brunner
}
671 f27aaf4b Christian Brunner
672 787f3133 Josh Durgin
static int rbd_aio_discard_wrapper(rbd_image_t image,
673 787f3133 Josh Durgin
                                   uint64_t off,
674 787f3133 Josh Durgin
                                   uint64_t len,
675 787f3133 Josh Durgin
                                   rbd_completion_t comp)
676 787f3133 Josh Durgin
{
677 787f3133 Josh Durgin
#ifdef LIBRBD_SUPPORTS_DISCARD
678 787f3133 Josh Durgin
    return rbd_aio_discard(image, off, len, comp);
679 787f3133 Josh Durgin
#else
680 787f3133 Josh Durgin
    return -ENOTSUP;
681 787f3133 Josh Durgin
#endif
682 787f3133 Josh Durgin
}
683 787f3133 Josh Durgin
684 dc7588c1 Josh Durgin
static int rbd_aio_flush_wrapper(rbd_image_t image,
685 dc7588c1 Josh Durgin
                                 rbd_completion_t comp)
686 dc7588c1 Josh Durgin
{
687 dc7588c1 Josh Durgin
#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
688 dc7588c1 Josh Durgin
    return rbd_aio_flush(image, comp);
689 dc7588c1 Josh Durgin
#else
690 dc7588c1 Josh Durgin
    return -ENOTSUP;
691 dc7588c1 Josh Durgin
#endif
692 dc7588c1 Josh Durgin
}
693 dc7588c1 Josh Durgin
694 787f3133 Josh Durgin
static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
695 787f3133 Josh Durgin
                                       int64_t sector_num,
696 787f3133 Josh Durgin
                                       QEMUIOVector *qiov,
697 787f3133 Josh Durgin
                                       int nb_sectors,
698 787f3133 Josh Durgin
                                       BlockDriverCompletionFunc *cb,
699 787f3133 Josh Durgin
                                       void *opaque,
700 787f3133 Josh Durgin
                                       RBDAIOCmd cmd)
701 f27aaf4b Christian Brunner
{
702 f27aaf4b Christian Brunner
    RBDAIOCB *acb;
703 f27aaf4b Christian Brunner
    RADOSCB *rcb;
704 ad32e9c0 Josh Durgin
    rbd_completion_t c;
705 f27aaf4b Christian Brunner
    int64_t off, size;
706 f27aaf4b Christian Brunner
    char *buf;
707 51a13528 Josh Durgin
    int r;
708 f27aaf4b Christian Brunner
709 f27aaf4b Christian Brunner
    BDRVRBDState *s = bs->opaque;
710 f27aaf4b Christian Brunner
711 d7331bed Stefan Hajnoczi
    acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
712 787f3133 Josh Durgin
    acb->cmd = cmd;
713 f27aaf4b Christian Brunner
    acb->qiov = qiov;
714 dc7588c1 Josh Durgin
    if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
715 787f3133 Josh Durgin
        acb->bounce = NULL;
716 787f3133 Josh Durgin
    } else {
717 787f3133 Josh Durgin
        acb->bounce = qemu_blockalign(bs, qiov->size);
718 787f3133 Josh Durgin
    }
719 f27aaf4b Christian Brunner
    acb->ret = 0;
720 f27aaf4b Christian Brunner
    acb->error = 0;
721 f27aaf4b Christian Brunner
    acb->s = s;
722 f27aaf4b Christian Brunner
    acb->cancelled = 0;
723 f27aaf4b Christian Brunner
    acb->bh = NULL;
724 473c7f02 Stefan Priebe
    acb->status = -EINPROGRESS;
725 f27aaf4b Christian Brunner
726 787f3133 Josh Durgin
    if (cmd == RBD_AIO_WRITE) {
727 d5e6b161 Michael Tokarev
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
728 f27aaf4b Christian Brunner
    }
729 f27aaf4b Christian Brunner
730 f27aaf4b Christian Brunner
    buf = acb->bounce;
731 f27aaf4b Christian Brunner
732 f27aaf4b Christian Brunner
    off = sector_num * BDRV_SECTOR_SIZE;
733 f27aaf4b Christian Brunner
    size = nb_sectors * BDRV_SECTOR_SIZE;
734 f27aaf4b Christian Brunner
735 7267c094 Anthony Liguori
    rcb = g_malloc(sizeof(RADOSCB));
736 ad32e9c0 Josh Durgin
    rcb->done = 0;
737 ad32e9c0 Josh Durgin
    rcb->acb = acb;
738 ad32e9c0 Josh Durgin
    rcb->buf = buf;
739 ad32e9c0 Josh Durgin
    rcb->s = acb->s;
740 ad32e9c0 Josh Durgin
    rcb->size = size;
741 51a13528 Josh Durgin
    r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c);
742 51a13528 Josh Durgin
    if (r < 0) {
743 51a13528 Josh Durgin
        goto failed;
744 51a13528 Josh Durgin
    }
745 f27aaf4b Christian Brunner
746 787f3133 Josh Durgin
    switch (cmd) {
747 787f3133 Josh Durgin
    case RBD_AIO_WRITE:
748 51a13528 Josh Durgin
        r = rbd_aio_write(s->image, off, size, buf, c);
749 787f3133 Josh Durgin
        break;
750 787f3133 Josh Durgin
    case RBD_AIO_READ:
751 51a13528 Josh Durgin
        r = rbd_aio_read(s->image, off, size, buf, c);
752 787f3133 Josh Durgin
        break;
753 787f3133 Josh Durgin
    case RBD_AIO_DISCARD:
754 787f3133 Josh Durgin
        r = rbd_aio_discard_wrapper(s->image, off, size, c);
755 787f3133 Josh Durgin
        break;
756 dc7588c1 Josh Durgin
    case RBD_AIO_FLUSH:
757 dc7588c1 Josh Durgin
        r = rbd_aio_flush_wrapper(s->image, c);
758 dc7588c1 Josh Durgin
        break;
759 787f3133 Josh Durgin
    default:
760 787f3133 Josh Durgin
        r = -EINVAL;
761 51a13528 Josh Durgin
    }
762 51a13528 Josh Durgin
763 51a13528 Josh Durgin
    if (r < 0) {
764 51a13528 Josh Durgin
        goto failed;
765 f27aaf4b Christian Brunner
    }
766 f27aaf4b Christian Brunner
767 f27aaf4b Christian Brunner
    return &acb->common;
768 51a13528 Josh Durgin
769 51a13528 Josh Durgin
failed:
770 7267c094 Anthony Liguori
    g_free(rcb);
771 51a13528 Josh Durgin
    qemu_aio_release(acb);
772 51a13528 Josh Durgin
    return NULL;
773 f27aaf4b Christian Brunner
}
774 f27aaf4b Christian Brunner
775 ad32e9c0 Josh Durgin
static BlockDriverAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
776 ad32e9c0 Josh Durgin
                                            int64_t sector_num,
777 ad32e9c0 Josh Durgin
                                            QEMUIOVector *qiov,
778 ad32e9c0 Josh Durgin
                                            int nb_sectors,
779 ad32e9c0 Josh Durgin
                                            BlockDriverCompletionFunc *cb,
780 ad32e9c0 Josh Durgin
                                            void *opaque)
781 f27aaf4b Christian Brunner
{
782 787f3133 Josh Durgin
    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
783 787f3133 Josh Durgin
                         RBD_AIO_READ);
784 f27aaf4b Christian Brunner
}
785 f27aaf4b Christian Brunner
786 ad32e9c0 Josh Durgin
static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
787 ad32e9c0 Josh Durgin
                                             int64_t sector_num,
788 ad32e9c0 Josh Durgin
                                             QEMUIOVector *qiov,
789 ad32e9c0 Josh Durgin
                                             int nb_sectors,
790 ad32e9c0 Josh Durgin
                                             BlockDriverCompletionFunc *cb,
791 ad32e9c0 Josh Durgin
                                             void *opaque)
792 f27aaf4b Christian Brunner
{
793 787f3133 Josh Durgin
    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
794 787f3133 Josh Durgin
                         RBD_AIO_WRITE);
795 f27aaf4b Christian Brunner
}
796 f27aaf4b Christian Brunner
797 dc7588c1 Josh Durgin
#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
798 dc7588c1 Josh Durgin
static BlockDriverAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
799 dc7588c1 Josh Durgin
                                            BlockDriverCompletionFunc *cb,
800 dc7588c1 Josh Durgin
                                            void *opaque)
801 dc7588c1 Josh Durgin
{
802 dc7588c1 Josh Durgin
    return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH);
803 dc7588c1 Josh Durgin
}
804 dc7588c1 Josh Durgin
805 dc7588c1 Josh Durgin
#else
806 dc7588c1 Josh Durgin
807 8b94ff85 Paolo Bonzini
static int qemu_rbd_co_flush(BlockDriverState *bs)
808 7a3f5fe9 Sage Weil
{
809 7a3f5fe9 Sage Weil
#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1)
810 7a3f5fe9 Sage Weil
    /* rbd_flush added in 0.1.1 */
811 7a3f5fe9 Sage Weil
    BDRVRBDState *s = bs->opaque;
812 7a3f5fe9 Sage Weil
    return rbd_flush(s->image);
813 7a3f5fe9 Sage Weil
#else
814 7a3f5fe9 Sage Weil
    return 0;
815 7a3f5fe9 Sage Weil
#endif
816 7a3f5fe9 Sage Weil
}
817 dc7588c1 Josh Durgin
#endif
818 7a3f5fe9 Sage Weil
819 ad32e9c0 Josh Durgin
/*
 * Fill in @bdi for the image: cluster_size is set to the rbd object size
 * reported by rbd_stat().
 *
 * Returns 0 on success, or the negative value returned by rbd_stat().
 */
static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVRBDState *s = bs->opaque;
    rbd_image_info_t image_info;
    int rc = rbd_stat(s->image, &image_info, sizeof(image_info));

    if (rc >= 0) {
        bdi->cluster_size = image_info.obj_size;
        rc = 0;
    }
    return rc;
}
833 f27aaf4b Christian Brunner
834 ad32e9c0 Josh Durgin
/*
 * Return the image size reported by rbd_stat(), or the negative value
 * returned by rbd_stat() on failure.
 */
static int64_t qemu_rbd_getlength(BlockDriverState *bs)
{
    BDRVRBDState *s = bs->opaque;
    rbd_image_info_t image_info;
    int rc = rbd_stat(s->image, &image_info, sizeof(image_info));

    if (rc >= 0) {
        return image_info.size;
    }
    return rc;
}
847 f27aaf4b Christian Brunner
848 30cdc48c Josh Durgin
/*
 * Resize the image to @offset via rbd_resize().
 *
 * Returns 0 on success, or the negative value returned by rbd_resize().
 */
static int qemu_rbd_truncate(BlockDriverState *bs, int64_t offset)
{
    BDRVRBDState *s = bs->opaque;
    int rc = rbd_resize(s->image, offset);

    return rc < 0 ? rc : 0;
}
860 30cdc48c Josh Durgin
861 ad32e9c0 Josh Durgin
static int qemu_rbd_snap_create(BlockDriverState *bs,
862 ad32e9c0 Josh Durgin
                                QEMUSnapshotInfo *sn_info)
863 f27aaf4b Christian Brunner
{
864 f27aaf4b Christian Brunner
    BDRVRBDState *s = bs->opaque;
865 f27aaf4b Christian Brunner
    int r;
866 f27aaf4b Christian Brunner
867 f27aaf4b Christian Brunner
    if (sn_info->name[0] == '\0') {
868 f27aaf4b Christian Brunner
        return -EINVAL; /* we need a name for rbd snapshots */
869 f27aaf4b Christian Brunner
    }
870 f27aaf4b Christian Brunner
871 f27aaf4b Christian Brunner
    /*
872 f27aaf4b Christian Brunner
     * rbd snapshots are using the name as the user controlled unique identifier
873 f27aaf4b Christian Brunner
     * we can't use the rbd snapid for that purpose, as it can't be set
874 f27aaf4b Christian Brunner
     */
875 f27aaf4b Christian Brunner
    if (sn_info->id_str[0] != '\0' &&
876 f27aaf4b Christian Brunner
        strcmp(sn_info->id_str, sn_info->name) != 0) {
877 f27aaf4b Christian Brunner
        return -EINVAL;
878 f27aaf4b Christian Brunner
    }
879 f27aaf4b Christian Brunner
880 f27aaf4b Christian Brunner
    if (strlen(sn_info->name) >= sizeof(sn_info->id_str)) {
881 f27aaf4b Christian Brunner
        return -ERANGE;
882 f27aaf4b Christian Brunner
    }
883 f27aaf4b Christian Brunner
884 ad32e9c0 Josh Durgin
    r = rbd_snap_create(s->image, sn_info->name);
885 f27aaf4b Christian Brunner
    if (r < 0) {
886 ad32e9c0 Josh Durgin
        error_report("failed to create snap: %s", strerror(-r));
887 f27aaf4b Christian Brunner
        return r;
888 f27aaf4b Christian Brunner
    }
889 f27aaf4b Christian Brunner
890 f27aaf4b Christian Brunner
    return 0;
891 f27aaf4b Christian Brunner
}
892 f27aaf4b Christian Brunner
893 bd603247 Gregory Farnum
/* Delete the named snapshot; returns the result of rbd_snap_remove(). */
static int qemu_rbd_snap_remove(BlockDriverState *bs,
                                const char *snapshot_name)
{
    BDRVRBDState *s = bs->opaque;

    return rbd_snap_remove(s->image, snapshot_name);
}
902 bd603247 Gregory Farnum
903 bd603247 Gregory Farnum
/* Roll the image back to the named snapshot; returns the result of
 * rbd_snap_rollback(). */
static int qemu_rbd_snap_rollback(BlockDriverState *bs,
                                  const char *snapshot_name)
{
    BDRVRBDState *s = bs->opaque;

    return rbd_snap_rollback(s->image, snapshot_name);
}
912 bd603247 Gregory Farnum
913 ad32e9c0 Josh Durgin
static int qemu_rbd_snap_list(BlockDriverState *bs,
914 ad32e9c0 Josh Durgin
                              QEMUSnapshotInfo **psn_tab)
915 f27aaf4b Christian Brunner
{
916 f27aaf4b Christian Brunner
    BDRVRBDState *s = bs->opaque;
917 f27aaf4b Christian Brunner
    QEMUSnapshotInfo *sn_info, *sn_tab = NULL;
918 ad32e9c0 Josh Durgin
    int i, snap_count;
919 ad32e9c0 Josh Durgin
    rbd_snap_info_t *snaps;
920 ad32e9c0 Josh Durgin
    int max_snaps = RBD_MAX_SNAPS;
921 f27aaf4b Christian Brunner
922 ad32e9c0 Josh Durgin
    do {
923 7267c094 Anthony Liguori
        snaps = g_malloc(sizeof(*snaps) * max_snaps);
924 ad32e9c0 Josh Durgin
        snap_count = rbd_snap_list(s->image, snaps, &max_snaps);
925 ad32e9c0 Josh Durgin
        if (snap_count < 0) {
926 7267c094 Anthony Liguori
            g_free(snaps);
927 f27aaf4b Christian Brunner
        }
928 ad32e9c0 Josh Durgin
    } while (snap_count == -ERANGE);
929 f27aaf4b Christian Brunner
930 ad32e9c0 Josh Durgin
    if (snap_count <= 0) {
931 b9c53290 Josh Durgin
        goto done;
932 f27aaf4b Christian Brunner
    }
933 f27aaf4b Christian Brunner
934 7267c094 Anthony Liguori
    sn_tab = g_malloc0(snap_count * sizeof(QEMUSnapshotInfo));
935 f27aaf4b Christian Brunner
936 ad32e9c0 Josh Durgin
    for (i = 0; i < snap_count; i++) {
937 ad32e9c0 Josh Durgin
        const char *snap_name = snaps[i].name;
938 f27aaf4b Christian Brunner
939 f27aaf4b Christian Brunner
        sn_info = sn_tab + i;
940 f27aaf4b Christian Brunner
        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), snap_name);
941 f27aaf4b Christian Brunner
        pstrcpy(sn_info->name, sizeof(sn_info->name), snap_name);
942 f27aaf4b Christian Brunner
943 ad32e9c0 Josh Durgin
        sn_info->vm_state_size = snaps[i].size;
944 f27aaf4b Christian Brunner
        sn_info->date_sec = 0;
945 f27aaf4b Christian Brunner
        sn_info->date_nsec = 0;
946 f27aaf4b Christian Brunner
        sn_info->vm_clock_nsec = 0;
947 f27aaf4b Christian Brunner
    }
948 ad32e9c0 Josh Durgin
    rbd_snap_list_end(snaps);
949 ad32e9c0 Josh Durgin
950 b9c53290 Josh Durgin
 done:
951 f27aaf4b Christian Brunner
    *psn_tab = sn_tab;
952 f27aaf4b Christian Brunner
    return snap_count;
953 f27aaf4b Christian Brunner
}
954 f27aaf4b Christian Brunner
955 787f3133 Josh Durgin
#ifdef LIBRBD_SUPPORTS_DISCARD
/*
 * Start an asynchronous discard; forwards to rbd_start_aio() with
 * RBD_AIO_DISCARD and no iovec.
 */
static BlockDriverAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
                                              int64_t sector_num,
                                              int nb_sectors,
                                              BlockDriverCompletionFunc *cb,
                                              void *opaque)
{
    const RBDAIOCmd op = RBD_AIO_DISCARD;

    return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque, op);
}
#endif
966 787f3133 Josh Durgin
967 ad32e9c0 Josh Durgin
static QEMUOptionParameter qemu_rbd_create_options[] = {
968 f27aaf4b Christian Brunner
    {
969 f27aaf4b Christian Brunner
     .name = BLOCK_OPT_SIZE,
970 f27aaf4b Christian Brunner
     .type = OPT_SIZE,
971 f27aaf4b Christian Brunner
     .help = "Virtual disk size"
972 f27aaf4b Christian Brunner
    },
973 f27aaf4b Christian Brunner
    {
974 f27aaf4b Christian Brunner
     .name = BLOCK_OPT_CLUSTER_SIZE,
975 f27aaf4b Christian Brunner
     .type = OPT_SIZE,
976 f27aaf4b Christian Brunner
     .help = "RBD object size"
977 f27aaf4b Christian Brunner
    },
978 f27aaf4b Christian Brunner
    {NULL}
979 f27aaf4b Christian Brunner
};
980 f27aaf4b Christian Brunner
981 f27aaf4b Christian Brunner
static BlockDriver bdrv_rbd = {
982 f27aaf4b Christian Brunner
    .format_name        = "rbd",
983 f27aaf4b Christian Brunner
    .instance_size      = sizeof(BDRVRBDState),
984 ad32e9c0 Josh Durgin
    .bdrv_file_open     = qemu_rbd_open,
985 ad32e9c0 Josh Durgin
    .bdrv_close         = qemu_rbd_close,
986 ad32e9c0 Josh Durgin
    .bdrv_create        = qemu_rbd_create,
987 3ac21627 Peter Lieven
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
988 ad32e9c0 Josh Durgin
    .bdrv_get_info      = qemu_rbd_getinfo,
989 ad32e9c0 Josh Durgin
    .create_options     = qemu_rbd_create_options,
990 ad32e9c0 Josh Durgin
    .bdrv_getlength     = qemu_rbd_getlength,
991 30cdc48c Josh Durgin
    .bdrv_truncate      = qemu_rbd_truncate,
992 f27aaf4b Christian Brunner
    .protocol_name      = "rbd",
993 f27aaf4b Christian Brunner
994 c68b89ac Kevin Wolf
    .bdrv_aio_readv         = qemu_rbd_aio_readv,
995 c68b89ac Kevin Wolf
    .bdrv_aio_writev        = qemu_rbd_aio_writev,
996 dc7588c1 Josh Durgin
997 dc7588c1 Josh Durgin
#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
998 dc7588c1 Josh Durgin
    .bdrv_aio_flush         = qemu_rbd_aio_flush,
999 dc7588c1 Josh Durgin
#else
1000 c68b89ac Kevin Wolf
    .bdrv_co_flush_to_disk  = qemu_rbd_co_flush,
1001 dc7588c1 Josh Durgin
#endif
1002 f27aaf4b Christian Brunner
1003 787f3133 Josh Durgin
#ifdef LIBRBD_SUPPORTS_DISCARD
1004 787f3133 Josh Durgin
    .bdrv_aio_discard       = qemu_rbd_aio_discard,
1005 787f3133 Josh Durgin
#endif
1006 787f3133 Josh Durgin
1007 c68b89ac Kevin Wolf
    .bdrv_snapshot_create   = qemu_rbd_snap_create,
1008 bd603247 Gregory Farnum
    .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
1009 c68b89ac Kevin Wolf
    .bdrv_snapshot_list     = qemu_rbd_snap_list,
1010 bd603247 Gregory Farnum
    .bdrv_snapshot_goto     = qemu_rbd_snap_rollback,
1011 f27aaf4b Christian Brunner
};
1012 f27aaf4b Christian Brunner
1013 f27aaf4b Christian Brunner
static void bdrv_rbd_init(void)
1014 f27aaf4b Christian Brunner
{
1015 f27aaf4b Christian Brunner
    bdrv_register(&bdrv_rbd);
1016 f27aaf4b Christian Brunner
}
1017 f27aaf4b Christian Brunner
1018 f27aaf4b Christian Brunner
block_init(bdrv_rbd_init);