Statistics
| Branch: | Revision:

root / block / rbd.c @ 3d1807ac

History | View | Annotate | Download (24.5 kB)

1 f27aaf4b Christian Brunner
/*
 * QEMU Block driver for RADOS (Ceph)
 *
 * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>,
 *                         Josh Durgin <josh.durgin@dreamhost.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */
13 f27aaf4b Christian Brunner
14 ad32e9c0 Josh Durgin
#include <inttypes.h>
15 ad32e9c0 Josh Durgin
16 f27aaf4b Christian Brunner
#include "qemu-common.h"
17 f27aaf4b Christian Brunner
#include "qemu-error.h"
18 f27aaf4b Christian Brunner
#include "block_int.h"
19 f27aaf4b Christian Brunner
20 ad32e9c0 Josh Durgin
#include <rbd/librbd.h>
21 f27aaf4b Christian Brunner
22 f27aaf4b Christian Brunner
/*
 * When specifying the image filename use:
 *
 * rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]]
 *
 * poolname must be the name of an existing rados pool.
 *
 * devicename is the name of the rbd image.
 *
 * Each option given is used to configure rados, and may be any valid
 * Ceph option, "id", or "conf".
 *
 * The "id" option indicates what user we should authenticate as to
 * the Ceph cluster.  If it is excluded we will use the Ceph default
 * (normally 'admin').
 *
 * The "conf" option specifies a Ceph configuration file to read.  If
 * it is not specified, we will read from the default Ceph locations
 * (e.g., /etc/ceph/ceph.conf).  To avoid reading _any_ configuration
 * file, specify conf=/dev/null.
 *
 * Configuration values containing :, @, or = can be escaped with a
 * leading "\".
 */
46 f27aaf4b Christian Brunner
47 787f3133 Josh Durgin
/* rbd_aio_discard added in 0.1.2 */
#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 2)
#define LIBRBD_SUPPORTS_DISCARD
#else
#undef LIBRBD_SUPPORTS_DISCARD
#endif

/* NOTE(review): OBJ_DEFAULT_OBJ_ORDER is not defined in this part of the
 * file; confirm it is still provided somewhere before using OBJ_MAX_SIZE. */
#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)

/* Buffer sizes (in bytes, including the terminating NUL) used when parsing
 * the "rbd:..." filename and its option list. */
#define RBD_MAX_CONF_NAME_SIZE 128
#define RBD_MAX_CONF_VAL_SIZE 512
#define RBD_MAX_CONF_SIZE 1024
#define RBD_MAX_POOL_NAME_SIZE 128
#define RBD_MAX_SNAP_NAME_SIZE 128
#define RBD_MAX_SNAPS 100
62 ad32e9c0 Josh Durgin
63 787f3133 Josh Durgin
/* Kind of asynchronous request carried by an RBDAIOCB. */
typedef enum {
    RBD_AIO_READ,
    RBD_AIO_WRITE,
    RBD_AIO_DISCARD
} RBDAIOCmd;
68 787f3133 Josh Durgin
69 f27aaf4b Christian Brunner
typedef struct RBDAIOCB {
70 f27aaf4b Christian Brunner
    BlockDriverAIOCB common;
71 f27aaf4b Christian Brunner
    QEMUBH *bh;
72 f27aaf4b Christian Brunner
    int ret;
73 f27aaf4b Christian Brunner
    QEMUIOVector *qiov;
74 f27aaf4b Christian Brunner
    char *bounce;
75 787f3133 Josh Durgin
    RBDAIOCmd cmd;
76 f27aaf4b Christian Brunner
    int64_t sector_num;
77 f27aaf4b Christian Brunner
    int error;
78 f27aaf4b Christian Brunner
    struct BDRVRBDState *s;
79 f27aaf4b Christian Brunner
    int cancelled;
80 f27aaf4b Christian Brunner
} RBDAIOCB;
81 f27aaf4b Christian Brunner
82 f27aaf4b Christian Brunner
typedef struct RADOSCB {
83 f27aaf4b Christian Brunner
    int rcbid;
84 f27aaf4b Christian Brunner
    RBDAIOCB *acb;
85 f27aaf4b Christian Brunner
    struct BDRVRBDState *s;
86 f27aaf4b Christian Brunner
    int done;
87 ad32e9c0 Josh Durgin
    int64_t size;
88 f27aaf4b Christian Brunner
    char *buf;
89 f27aaf4b Christian Brunner
    int ret;
90 f27aaf4b Christian Brunner
} RADOSCB;
91 f27aaf4b Christian Brunner
92 f27aaf4b Christian Brunner
#define RBD_FD_READ 0
93 f27aaf4b Christian Brunner
#define RBD_FD_WRITE 1
94 f27aaf4b Christian Brunner
95 f27aaf4b Christian Brunner
typedef struct BDRVRBDState {
96 f27aaf4b Christian Brunner
    int fds[2];
97 ad32e9c0 Josh Durgin
    rados_t cluster;
98 ad32e9c0 Josh Durgin
    rados_ioctx_t io_ctx;
99 ad32e9c0 Josh Durgin
    rbd_image_t image;
100 ad32e9c0 Josh Durgin
    char name[RBD_MAX_IMAGE_NAME_SIZE];
101 f27aaf4b Christian Brunner
    int qemu_aio_count;
102 ad32e9c0 Josh Durgin
    char *snap;
103 f27aaf4b Christian Brunner
    int event_reader_pos;
104 f27aaf4b Christian Brunner
    RADOSCB *event_rcb;
105 f27aaf4b Christian Brunner
} BDRVRBDState;
106 f27aaf4b Christian Brunner
107 f27aaf4b Christian Brunner
/* Bottom-half handler that reports a finished request; defined below. */
static void rbd_aio_bh_cb(void *opaque);
108 f27aaf4b Christian Brunner
109 ad32e9c0 Josh Durgin
/*
 * Split the next token off @src and copy it into @dst.
 *
 * @dst, @dst_len: destination buffer and its size in bytes
 * @src:           string to tokenize; modified in place (the delimiter
 *                 that ends the token is overwritten with NUL)
 * @delim:         delimiter character, or '\0' to take the rest of @src
 * @name:          human-readable token name used in error messages
 * @p:             set to the character after the delimiter, or to NULL
 *                 when no unescaped delimiter was found
 *
 * A delimiter preceded by a backslash does not end the token.  Escape
 * characters are left in the copied token; callers strip them with
 * qemu_rbd_unescape().
 *
 * Returns 0 on success, -EINVAL if the token is empty or does not fit in
 * @dst (in which case @dst is left untouched).
 */
static int qemu_rbd_next_tok(char *dst, int dst_len,
                             char *src, char delim,
                             const char *name,
                             char **p)
{
    int l;
    char *end;

    *p = NULL;

    if (delim != '\0') {
        for (end = src; *end; ++end) {
            if (*end == delim) {
                break;
            }
            /* skip the character following a backslash */
            if (*end == '\\' && end[1] != '\0') {
                end++;
            }
        }
        if (*end == delim) {
            *p = end + 1;
            *end = '\0';
        }
    }

    l = strlen(src);
    if (l >= dst_len) {
        error_report("%s too long", name);
        return -EINVAL;
    } else if (l == 0) {
        error_report("%s too short", name);
        return -EINVAL;
    }

    pstrcpy(dst, dst_len, src);

    return 0;
}
146 f27aaf4b Christian Brunner
147 16a06b24 Sage Weil
/*
 * Remove backslash escapes from @src in place: every "\x" pair becomes
 * "x".  A lone backslash at the very end of the string is kept as is.
 */
static void qemu_rbd_unescape(char *src)
{
    char *p;

    for (p = src; *src; ++src, ++p) {
        if (*src == '\\' && src[1] != '\0') {
            src++;
        }
        *p = *src;
    }
    *p = '\0';
}
159 16a06b24 Sage Weil
160 ad32e9c0 Josh Durgin
/*
 * Parse "rbd:pool/image[@snap][:options]" into its components.
 *
 * @filename: full filename, which must start with "rbd:"
 * @pool, @snap, @name, @conf: output buffers with their sizes in the
 *            matching *_len parameter.  @snap and @conf are set to the
 *            empty string when the filename does not contain them.  The
 *            pool, image and snapshot names are unescaped; @conf is
 *            returned verbatim for qemu_rbd_set_conf() to parse.
 *
 * Returns 0 on success and a negative errno value on failure; the
 * contents of @pool and @name are unspecified on failure.
 */
static int qemu_rbd_parsename(const char *filename,
                              char *pool, int pool_len,
                              char *snap, int snap_len,
                              char *name, int name_len,
                              char *conf, int conf_len)
{
    const char *start;
    char *p, *buf;
    int ret;

    if (!strstart(filename, "rbd:", &start)) {
        return -EINVAL;
    }

    /* the tokenizer writes into its input, so work on a private copy */
    buf = g_strdup(start);
    p = buf;
    *snap = '\0';
    *conf = '\0';

    ret = qemu_rbd_next_tok(pool, pool_len, p, '/', "pool name", &p);
    if (ret < 0 || !p) {
        ret = -EINVAL;
        goto done;
    }
    qemu_rbd_unescape(pool);

    /*
     * Only unescape a token after it was parsed successfully: on failure
     * qemu_rbd_next_tok() leaves the destination unwritten, and the
     * caller's buffer may be uninitialized.
     */
    if (strchr(p, '@')) {
        ret = qemu_rbd_next_tok(name, name_len, p, '@', "object name", &p);
        if (ret < 0) {
            goto done;
        }
        qemu_rbd_unescape(name);

        ret = qemu_rbd_next_tok(snap, snap_len, p, ':', "snap name", &p);
        if (ret < 0) {
            goto done;
        }
        qemu_rbd_unescape(snap);
    } else {
        ret = qemu_rbd_next_tok(name, name_len, p, ':', "object name", &p);
        if (ret < 0) {
            goto done;
        }
        qemu_rbd_unescape(name);
    }

    /* no ':' after the image/snapshot name: there are no options */
    if (!p) {
        goto done;
    }

    ret = qemu_rbd_next_tok(conf, conf_len, p, '\0', "configuration", &p);

done:
    g_free(buf);
    return ret;
}
207 f27aaf4b Christian Brunner
208 7c7e9df0 Sage Weil
/*
 * Find the first "id=<value>" option in the ':'-separated option string
 * @conf and copy its unescaped value into @clientname.
 *
 * Options are split on unescaped ':' only, so a "\:" inside a value
 * neither ends that value nor starts a new option.
 *
 * @clientname must be able to hold strlen(@conf) bytes; the visible
 * callers pass a buffer as large as the one holding @conf.
 *
 * Returns @clientname if an id option was found, NULL otherwise.
 */
static char *qemu_rbd_parse_clientname(const char *conf, char *clientname)
{
    const char *opt = conf;

    while (*opt) {
        const char *end = opt;

        /* locate the unescaped ':' (or the NUL) that ends this option */
        while (*end && *end != ':') {
            if (*end == '\\' && end[1] != '\0') {
                end++;
            }
            end++;
        }

        if (strncmp(opt, "id=", 3) == 0) {
            const char *src = opt + 3;
            char *dst = clientname;

            /* copy the value, dropping the backslash of each escape */
            while (src < end) {
                if (*src == '\\' && src + 1 < end) {
                    src++;
                }
                *dst++ = *src++;
            }
            *dst = '\0';
            return clientname;
        }

        if (*end == '\0') {
            break;
        }
        opt = end + 1;
    }
    return NULL;
}
235 7c7e9df0 Sage Weil
236 fab5cf59 Josh Durgin
/*
 * Apply the "name=value[:name=value...]" option string @conf to @cluster.
 *
 * "conf" names a configuration file that is read with
 * rados_conf_read_file(); "id" is skipped here because it is handled by
 * qemu_rbd_parse_clientname(); every other option is passed to
 * rados_conf_set().
 *
 * Returns 0 (or the non-negative result of the last librados call) on
 * success and a negative value on the first error, after reporting it.
 */
static int qemu_rbd_set_conf(rados_t cluster, const char *conf)
{
    char *p, *buf;
    char name[RBD_MAX_CONF_NAME_SIZE];
    char value[RBD_MAX_CONF_VAL_SIZE];
    int ret = 0;

    /* the tokenizer writes into its input, so work on a private copy */
    buf = g_strdup(conf);
    p = buf;

    while (p) {
        ret = qemu_rbd_next_tok(name, sizeof(name), p,
                                '=', "conf option name", &p);
        if (ret < 0) {
            break;
        }
        qemu_rbd_unescape(name);

        if (!p) {
            error_report("conf option %s has no value", name);
            ret = -EINVAL;
            break;
        }

        ret = qemu_rbd_next_tok(value, sizeof(value), p,
                                ':', "conf option value", &p);
        if (ret < 0) {
            break;
        }
        qemu_rbd_unescape(value);

        if (strcmp(name, "conf") == 0) {
            ret = rados_conf_read_file(cluster, value);
            if (ret < 0) {
                error_report("error reading conf file %s", value);
                break;
            }
        } else if (strcmp(name, "id") == 0) {
            /* ignore, this is parsed by qemu_rbd_parse_clientname() */
        } else {
            ret = rados_conf_set(cluster, name, value);
            if (ret < 0) {
                error_report("invalid conf option %s", name);
                ret = -EINVAL;
                break;
            }
        }
    }

    g_free(buf);
    return ret;
}
288 fab5cf59 Josh Durgin
289 ad32e9c0 Josh Durgin
/*
 * Create a new rbd image.
 *
 * @filename: "rbd:pool/image[:options]" specification of the image
 * @options:  creation options; BLOCK_OPT_SIZE gives the image size in
 *            bytes and BLOCK_OPT_CLUSTER_SIZE the object size, which must
 *            be a power of two and at least 4096.  When no object size is
 *            given, an order of 0 is passed to rbd_create().
 *
 * Returns the result of rbd_create() on success, -EINVAL for a bad
 * filename or object size, and -EIO if the cluster cannot be set up.
 */
static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options)
{
    int64_t bytes = 0;
    int64_t objsize;
    int obj_order = 0;
    char pool[RBD_MAX_POOL_NAME_SIZE];
    char name[RBD_MAX_IMAGE_NAME_SIZE];
    char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
    char conf[RBD_MAX_CONF_SIZE];
    char clientname_buf[RBD_MAX_CONF_SIZE];
    char *clientname;
    rados_t cluster;
    rados_ioctx_t io_ctx;
    int ret;

    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                           snap_buf, sizeof(snap_buf),
                           name, sizeof(name),
                           conf, sizeof(conf)) < 0) {
        return -EINVAL;
    }

    /* Read out options */
    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
            bytes = options->value.n;
        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
            if (options->value.n) {
                objsize = options->value.n;
                /* unsigned arithmetic: no signed overflow for any input */
                if (((uint64_t)objsize - 1) & (uint64_t)objsize) {
                    /* not a power of 2 */
                    error_report("obj size needs to be power of 2");
                    return -EINVAL;
                }
                if (objsize < 4096) {
                    error_report("obj size too small");
                    return -EINVAL;
                }
                /*
                 * log2 of the object size.  ffs() only looks at an int,
                 * so it would yield -1 for sizes of 4 GiB and above.
                 */
                obj_order = 0;
                while (((uint64_t)1 << obj_order) < (uint64_t)objsize) {
                    obj_order++;
                }
            }
        }
        options++;
    }

    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
    if (rados_create(&cluster, clientname) < 0) {
        error_report("error initializing");
        return -EIO;
    }

    if (strstr(conf, "conf=") == NULL) {
        /* try default location, but ignore failure */
        rados_conf_read_file(cluster, NULL);
    }

    if (conf[0] != '\0' &&
        qemu_rbd_set_conf(cluster, conf) < 0) {
        error_report("error setting config options");
        rados_shutdown(cluster);
        return -EIO;
    }

    if (rados_connect(cluster) < 0) {
        error_report("error connecting");
        rados_shutdown(cluster);
        return -EIO;
    }

    if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) {
        error_report("error opening pool %s", pool);
        rados_shutdown(cluster);
        return -EIO;
    }

    ret = rbd_create(io_ctx, name, bytes, &obj_order);
    rados_ioctx_destroy(io_ctx);
    rados_shutdown(cluster);

    return ret;
}
368 f27aaf4b Christian Brunner
369 f27aaf4b Christian Brunner
/*
370 ad32e9c0 Josh Durgin
 * This aio completion is being called from qemu_rbd_aio_event_reader()
371 ad32e9c0 Josh Durgin
 * and runs in qemu context. It schedules a bh, but just in case the aio
372 f27aaf4b Christian Brunner
 * was not cancelled before.
373 f27aaf4b Christian Brunner
 */
374 ad32e9c0 Josh Durgin
static void qemu_rbd_complete_aio(RADOSCB *rcb)
375 f27aaf4b Christian Brunner
{
376 f27aaf4b Christian Brunner
    RBDAIOCB *acb = rcb->acb;
377 f27aaf4b Christian Brunner
    int64_t r;
378 f27aaf4b Christian Brunner
379 f27aaf4b Christian Brunner
    if (acb->cancelled) {
380 ad32e9c0 Josh Durgin
        qemu_vfree(acb->bounce);
381 ad32e9c0 Josh Durgin
        qemu_aio_release(acb);
382 f27aaf4b Christian Brunner
        goto done;
383 f27aaf4b Christian Brunner
    }
384 f27aaf4b Christian Brunner
385 f27aaf4b Christian Brunner
    r = rcb->ret;
386 f27aaf4b Christian Brunner
387 787f3133 Josh Durgin
    if (acb->cmd == RBD_AIO_WRITE ||
388 787f3133 Josh Durgin
        acb->cmd == RBD_AIO_DISCARD) {
389 f27aaf4b Christian Brunner
        if (r < 0) {
390 f27aaf4b Christian Brunner
            acb->ret = r;
391 f27aaf4b Christian Brunner
            acb->error = 1;
392 f27aaf4b Christian Brunner
        } else if (!acb->error) {
393 ad32e9c0 Josh Durgin
            acb->ret = rcb->size;
394 f27aaf4b Christian Brunner
        }
395 f27aaf4b Christian Brunner
    } else {
396 ad32e9c0 Josh Durgin
        if (r < 0) {
397 ad32e9c0 Josh Durgin
            memset(rcb->buf, 0, rcb->size);
398 f27aaf4b Christian Brunner
            acb->ret = r;
399 f27aaf4b Christian Brunner
            acb->error = 1;
400 ad32e9c0 Josh Durgin
        } else if (r < rcb->size) {
401 ad32e9c0 Josh Durgin
            memset(rcb->buf + r, 0, rcb->size - r);
402 f27aaf4b Christian Brunner
            if (!acb->error) {
403 ad32e9c0 Josh Durgin
                acb->ret = rcb->size;
404 f27aaf4b Christian Brunner
            }
405 f27aaf4b Christian Brunner
        } else if (!acb->error) {
406 ad32e9c0 Josh Durgin
            acb->ret = r;
407 f27aaf4b Christian Brunner
        }
408 f27aaf4b Christian Brunner
    }
409 f27aaf4b Christian Brunner
    /* Note that acb->bh can be NULL in case where the aio was cancelled */
410 ad32e9c0 Josh Durgin
    acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
411 ad32e9c0 Josh Durgin
    qemu_bh_schedule(acb->bh);
412 f27aaf4b Christian Brunner
done:
413 7267c094 Anthony Liguori
    g_free(rcb);
414 f27aaf4b Christian Brunner
}
415 f27aaf4b Christian Brunner
416 f27aaf4b Christian Brunner
/*
417 f27aaf4b Christian Brunner
 * aio fd read handler. It runs in the qemu context and calls the
418 f27aaf4b Christian Brunner
 * completion handling of completed rados aio operations.
419 f27aaf4b Christian Brunner
 */
420 ad32e9c0 Josh Durgin
static void qemu_rbd_aio_event_reader(void *opaque)
421 f27aaf4b Christian Brunner
{
422 f27aaf4b Christian Brunner
    BDRVRBDState *s = opaque;
423 f27aaf4b Christian Brunner
424 f27aaf4b Christian Brunner
    ssize_t ret;
425 f27aaf4b Christian Brunner
426 f27aaf4b Christian Brunner
    do {
427 f27aaf4b Christian Brunner
        char *p = (char *)&s->event_rcb;
428 f27aaf4b Christian Brunner
429 f27aaf4b Christian Brunner
        /* now read the rcb pointer that was sent from a non qemu thread */
430 dfe80b07 Sage Weil
        ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos,
431 dfe80b07 Sage Weil
                   sizeof(s->event_rcb) - s->event_reader_pos);
432 dfe80b07 Sage Weil
        if (ret > 0) {
433 dfe80b07 Sage Weil
            s->event_reader_pos += ret;
434 dfe80b07 Sage Weil
            if (s->event_reader_pos == sizeof(s->event_rcb)) {
435 dfe80b07 Sage Weil
                s->event_reader_pos = 0;
436 dfe80b07 Sage Weil
                qemu_rbd_complete_aio(s->event_rcb);
437 dfe80b07 Sage Weil
                s->qemu_aio_count--;
438 f27aaf4b Christian Brunner
            }
439 f27aaf4b Christian Brunner
        }
440 f27aaf4b Christian Brunner
    } while (ret < 0 && errno == EINTR);
441 f27aaf4b Christian Brunner
}
442 f27aaf4b Christian Brunner
443 ad32e9c0 Josh Durgin
static int qemu_rbd_aio_flush_cb(void *opaque)
444 f27aaf4b Christian Brunner
{
445 f27aaf4b Christian Brunner
    BDRVRBDState *s = opaque;
446 f27aaf4b Christian Brunner
447 f27aaf4b Christian Brunner
    return (s->qemu_aio_count > 0);
448 f27aaf4b Christian Brunner
}
449 f27aaf4b Christian Brunner
450 ad32e9c0 Josh Durgin
/*
 * Open the rbd image named by @filename ("rbd:pool/image[@snap][:opts]").
 *
 * Connects to the cluster, opens the pool and the image, and sets up the
 * pipe through which librbd completions are handed to the qemu thread.
 * The device is marked read-only when a snapshot was requested.
 *
 * Returns 0 on success and a negative value on failure, after releasing
 * everything acquired so far.
 */
static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVRBDState *s = bs->opaque;
    char pool[RBD_MAX_POOL_NAME_SIZE];
    char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
    char conf[RBD_MAX_CONF_SIZE];
    char clientname_buf[RBD_MAX_CONF_SIZE];
    char *clientname;
    int r;

    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                           snap_buf, sizeof(snap_buf),
                           s->name, sizeof(s->name),
                           conf, sizeof(conf)) < 0) {
        return -EINVAL;
    }

    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
    r = rados_create(&s->cluster, clientname);
    if (r < 0) {
        error_report("error initializing");
        return r;
    }

    s->snap = NULL;
    if (snap_buf[0] != '\0') {
        s->snap = g_strdup(snap_buf);
    }

    /*
     * Fallback to more conservative semantics if setting cache
     * options fails. Ignore errors from setting rbd_cache because the
     * only possible error is that the option does not exist, and
     * librbd defaults to no caching. If write through caching cannot
     * be set up, fall back to no caching.
     */
    if (flags & BDRV_O_NOCACHE) {
        rados_conf_set(s->cluster, "rbd_cache", "false");
    } else {
        rados_conf_set(s->cluster, "rbd_cache", "true");
    }

    if (strstr(conf, "conf=") == NULL) {
        /* try default location, but ignore failure */
        rados_conf_read_file(s->cluster, NULL);
    }

    /* options from the filename are applied last */
    if (conf[0] != '\0') {
        r = qemu_rbd_set_conf(s->cluster, conf);
        if (r < 0) {
            error_report("error setting config options");
            goto failed_shutdown;
        }
    }

    r = rados_connect(s->cluster);
    if (r < 0) {
        error_report("error connecting");
        goto failed_shutdown;
    }

    r = rados_ioctx_create(s->cluster, pool, &s->io_ctx);
    if (r < 0) {
        error_report("error opening pool %s", pool);
        goto failed_shutdown;
    }

    r = rbd_open(s->io_ctx, s->name, &s->image, s->snap);
    if (r < 0) {
        error_report("error reading header from %s", s->name);
        goto failed_open;
    }

    bs->read_only = (s->snap != NULL);

    s->event_reader_pos = 0;
    r = qemu_pipe(s->fds);
    if (r < 0) {
        error_report("error opening eventfd");
        goto failed;
    }
    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader,
                            NULL, qemu_rbd_aio_flush_cb, s);


    return 0;

    /* unwind in reverse order of acquisition */
failed:
    rbd_close(s->image);
failed_open:
    rados_ioctx_destroy(s->io_ctx);
failed_shutdown:
    rados_shutdown(s->cluster);
    g_free(s->snap);
    return r;
}
548 f27aaf4b Christian Brunner
549 ad32e9c0 Josh Durgin
/*
 * Close the image and release everything qemu_rbd_open() acquired.
 *
 * The fd handler is removed before the pipe is closed so that it is
 * never looked up by a descriptor number that is no longer ours.
 */
static void qemu_rbd_close(BlockDriverState *bs)
{
    BDRVRBDState *s = bs->opaque;

    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL, NULL, NULL, NULL);
    close(s->fds[0]);
    close(s->fds[1]);

    rbd_close(s->image);
    rados_ioctx_destroy(s->io_ctx);
    g_free(s->snap);
    rados_shutdown(s->cluster);
}
562 f27aaf4b Christian Brunner
563 f27aaf4b Christian Brunner
/*
564 f27aaf4b Christian Brunner
 * Cancel aio. Since we don't reference acb in a non qemu threads,
565 f27aaf4b Christian Brunner
 * it is safe to access it here.
566 f27aaf4b Christian Brunner
 */
567 ad32e9c0 Josh Durgin
static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb)
568 f27aaf4b Christian Brunner
{
569 f27aaf4b Christian Brunner
    RBDAIOCB *acb = (RBDAIOCB *) blockacb;
570 f27aaf4b Christian Brunner
    acb->cancelled = 1;
571 f27aaf4b Christian Brunner
}
572 f27aaf4b Christian Brunner
573 f27aaf4b Christian Brunner
static AIOPool rbd_aio_pool = {
574 f27aaf4b Christian Brunner
    .aiocb_size = sizeof(RBDAIOCB),
575 ad32e9c0 Josh Durgin
    .cancel = qemu_rbd_aio_cancel,
576 f27aaf4b Christian Brunner
};
577 f27aaf4b Christian Brunner
578 ad32e9c0 Josh Durgin
/*
 * Send the @rcb pointer through the notification pipe of @s.
 *
 * The write is retried on EINTR; on EAGAIN the function waits in
 * select() until the pipe is writable and tries again.
 *
 * Returns the non-negative result of write() on success, or a negative
 * value if the write failed for any other reason.
 */
static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb)
{
    int ret = 0;
    while (1) {
        fd_set wfd;
        int fd = s->fds[RBD_FD_WRITE];

        /* send the op pointer to the qemu thread that is responsible
           for the aio/op completion; the completion itself must be
           handled in a qemu thread context */
        ret = write(fd, (void *)&rcb, sizeof(rcb));
        if (ret >= 0) {
            break;
        }
        if (errno == EINTR) {
            continue;
        }
        if (errno != EAGAIN) {
            break;
        }

        /* pipe is full: wait until it becomes writable */
        FD_ZERO(&wfd);
        FD_SET(fd, &wfd);
        do {
            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
        } while (ret < 0 && errno == EINTR);
    }

    return ret;
}
607 ad32e9c0 Josh Durgin
608 ad32e9c0 Josh Durgin
/*
609 ad32e9c0 Josh Durgin
 * This is the callback function for rbd_aio_read and _write
610 ad32e9c0 Josh Durgin
 *
611 ad32e9c0 Josh Durgin
 * Note: this function is being called from a non qemu thread so
612 ad32e9c0 Josh Durgin
 * we need to be careful about what we do here. Generally we only
613 ad32e9c0 Josh Durgin
 * write to the block notification pipe, and do the rest of the
614 ad32e9c0 Josh Durgin
 * io completion handling from qemu_rbd_aio_event_reader() which
615 ad32e9c0 Josh Durgin
 * runs in a qemu context.
616 ad32e9c0 Josh Durgin
 */
617 ad32e9c0 Josh Durgin
static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
618 ad32e9c0 Josh Durgin
{
619 ad32e9c0 Josh Durgin
    int ret;
620 ad32e9c0 Josh Durgin
    rcb->ret = rbd_aio_get_return_value(c);
621 ad32e9c0 Josh Durgin
    rbd_aio_release(c);
622 ad32e9c0 Josh Durgin
    ret = qemu_rbd_send_pipe(rcb->s, rcb);
623 f27aaf4b Christian Brunner
    if (ret < 0) {
624 ad32e9c0 Josh Durgin
        error_report("failed writing to acb->s->fds");
625 7267c094 Anthony Liguori
        g_free(rcb);
626 f27aaf4b Christian Brunner
    }
627 f27aaf4b Christian Brunner
}
628 f27aaf4b Christian Brunner
629 ad32e9c0 Josh Durgin
/* Callback when all queued rbd_aio requests are complete */
630 f27aaf4b Christian Brunner
631 f27aaf4b Christian Brunner
static void rbd_aio_bh_cb(void *opaque)
632 f27aaf4b Christian Brunner
{
633 f27aaf4b Christian Brunner
    RBDAIOCB *acb = opaque;
634 f27aaf4b Christian Brunner
635 787f3133 Josh Durgin
    if (acb->cmd == RBD_AIO_READ) {
636 03396148 Michael Tokarev
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
637 f27aaf4b Christian Brunner
    }
638 f27aaf4b Christian Brunner
    qemu_vfree(acb->bounce);
639 f27aaf4b Christian Brunner
    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
640 f27aaf4b Christian Brunner
    qemu_bh_delete(acb->bh);
641 f27aaf4b Christian Brunner
    acb->bh = NULL;
642 f27aaf4b Christian Brunner
643 f27aaf4b Christian Brunner
    qemu_aio_release(acb);
644 f27aaf4b Christian Brunner
}
645 f27aaf4b Christian Brunner
646 787f3133 Josh Durgin
static int rbd_aio_discard_wrapper(rbd_image_t image,
647 787f3133 Josh Durgin
                                   uint64_t off,
648 787f3133 Josh Durgin
                                   uint64_t len,
649 787f3133 Josh Durgin
                                   rbd_completion_t comp)
650 787f3133 Josh Durgin
{
651 787f3133 Josh Durgin
#ifdef LIBRBD_SUPPORTS_DISCARD
652 787f3133 Josh Durgin
    return rbd_aio_discard(image, off, len, comp);
653 787f3133 Josh Durgin
#else
654 787f3133 Josh Durgin
    return -ENOTSUP;
655 787f3133 Josh Durgin
#endif
656 787f3133 Josh Durgin
}
657 787f3133 Josh Durgin
658 787f3133 Josh Durgin
static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
659 787f3133 Josh Durgin
                                       int64_t sector_num,
660 787f3133 Josh Durgin
                                       QEMUIOVector *qiov,
661 787f3133 Josh Durgin
                                       int nb_sectors,
662 787f3133 Josh Durgin
                                       BlockDriverCompletionFunc *cb,
663 787f3133 Josh Durgin
                                       void *opaque,
664 787f3133 Josh Durgin
                                       RBDAIOCmd cmd)
665 f27aaf4b Christian Brunner
{
666 f27aaf4b Christian Brunner
    RBDAIOCB *acb;
667 f27aaf4b Christian Brunner
    RADOSCB *rcb;
668 ad32e9c0 Josh Durgin
    rbd_completion_t c;
669 f27aaf4b Christian Brunner
    int64_t off, size;
670 f27aaf4b Christian Brunner
    char *buf;
671 51a13528 Josh Durgin
    int r;
672 f27aaf4b Christian Brunner
673 f27aaf4b Christian Brunner
    BDRVRBDState *s = bs->opaque;
674 f27aaf4b Christian Brunner
675 f27aaf4b Christian Brunner
    acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque);
676 787f3133 Josh Durgin
    acb->cmd = cmd;
677 f27aaf4b Christian Brunner
    acb->qiov = qiov;
678 787f3133 Josh Durgin
    if (cmd == RBD_AIO_DISCARD) {
679 787f3133 Josh Durgin
        acb->bounce = NULL;
680 787f3133 Josh Durgin
    } else {
681 787f3133 Josh Durgin
        acb->bounce = qemu_blockalign(bs, qiov->size);
682 787f3133 Josh Durgin
    }
683 f27aaf4b Christian Brunner
    acb->ret = 0;
684 f27aaf4b Christian Brunner
    acb->error = 0;
685 f27aaf4b Christian Brunner
    acb->s = s;
686 f27aaf4b Christian Brunner
    acb->cancelled = 0;
687 f27aaf4b Christian Brunner
    acb->bh = NULL;
688 f27aaf4b Christian Brunner
689 787f3133 Josh Durgin
    if (cmd == RBD_AIO_WRITE) {
690 d5e6b161 Michael Tokarev
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
691 f27aaf4b Christian Brunner
    }
692 f27aaf4b Christian Brunner
693 f27aaf4b Christian Brunner
    buf = acb->bounce;
694 f27aaf4b Christian Brunner
695 f27aaf4b Christian Brunner
    off = sector_num * BDRV_SECTOR_SIZE;
696 f27aaf4b Christian Brunner
    size = nb_sectors * BDRV_SECTOR_SIZE;
697 f27aaf4b Christian Brunner
698 ad32e9c0 Josh Durgin
    s->qemu_aio_count++; /* All the RADOSCB */
699 f27aaf4b Christian Brunner
700 7267c094 Anthony Liguori
    rcb = g_malloc(sizeof(RADOSCB));
701 ad32e9c0 Josh Durgin
    rcb->done = 0;
702 ad32e9c0 Josh Durgin
    rcb->acb = acb;
703 ad32e9c0 Josh Durgin
    rcb->buf = buf;
704 ad32e9c0 Josh Durgin
    rcb->s = acb->s;
705 ad32e9c0 Josh Durgin
    rcb->size = size;
706 51a13528 Josh Durgin
    r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c);
707 51a13528 Josh Durgin
    if (r < 0) {
708 51a13528 Josh Durgin
        goto failed;
709 51a13528 Josh Durgin
    }
710 f27aaf4b Christian Brunner
711 787f3133 Josh Durgin
    switch (cmd) {
712 787f3133 Josh Durgin
    case RBD_AIO_WRITE:
713 51a13528 Josh Durgin
        r = rbd_aio_write(s->image, off, size, buf, c);
714 787f3133 Josh Durgin
        break;
715 787f3133 Josh Durgin
    case RBD_AIO_READ:
716 51a13528 Josh Durgin
        r = rbd_aio_read(s->image, off, size, buf, c);
717 787f3133 Josh Durgin
        break;
718 787f3133 Josh Durgin
    case RBD_AIO_DISCARD:
719 787f3133 Josh Durgin
        r = rbd_aio_discard_wrapper(s->image, off, size, c);
720 787f3133 Josh Durgin
        break;
721 787f3133 Josh Durgin
    default:
722 787f3133 Josh Durgin
        r = -EINVAL;
723 51a13528 Josh Durgin
    }
724 51a13528 Josh Durgin
725 51a13528 Josh Durgin
    if (r < 0) {
726 51a13528 Josh Durgin
        goto failed;
727 f27aaf4b Christian Brunner
    }
728 f27aaf4b Christian Brunner
729 f27aaf4b Christian Brunner
    return &acb->common;
730 51a13528 Josh Durgin
731 51a13528 Josh Durgin
failed:
732 7267c094 Anthony Liguori
    g_free(rcb);
733 51a13528 Josh Durgin
    s->qemu_aio_count--;
734 51a13528 Josh Durgin
    qemu_aio_release(acb);
735 51a13528 Josh Durgin
    return NULL;
736 f27aaf4b Christian Brunner
}
737 f27aaf4b Christian Brunner
738 ad32e9c0 Josh Durgin
static BlockDriverAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
739 ad32e9c0 Josh Durgin
                                            int64_t sector_num,
740 ad32e9c0 Josh Durgin
                                            QEMUIOVector *qiov,
741 ad32e9c0 Josh Durgin
                                            int nb_sectors,
742 ad32e9c0 Josh Durgin
                                            BlockDriverCompletionFunc *cb,
743 ad32e9c0 Josh Durgin
                                            void *opaque)
744 f27aaf4b Christian Brunner
{
745 787f3133 Josh Durgin
    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
746 787f3133 Josh Durgin
                         RBD_AIO_READ);
747 f27aaf4b Christian Brunner
}
748 f27aaf4b Christian Brunner
749 ad32e9c0 Josh Durgin
static BlockDriverAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
750 ad32e9c0 Josh Durgin
                                             int64_t sector_num,
751 ad32e9c0 Josh Durgin
                                             QEMUIOVector *qiov,
752 ad32e9c0 Josh Durgin
                                             int nb_sectors,
753 ad32e9c0 Josh Durgin
                                             BlockDriverCompletionFunc *cb,
754 ad32e9c0 Josh Durgin
                                             void *opaque)
755 f27aaf4b Christian Brunner
{
756 787f3133 Josh Durgin
    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
757 787f3133 Josh Durgin
                         RBD_AIO_WRITE);
758 f27aaf4b Christian Brunner
}
759 f27aaf4b Christian Brunner
760 8b94ff85 Paolo Bonzini
/*
 * Flush the image.
 *
 * Returns the result of rbd_flush() when building against librbd 0.1.1
 * or newer; with older versions nothing is done and 0 is returned.
 */
static int qemu_rbd_co_flush(BlockDriverState *bs)
{
#if LIBRBD_VERSION_CODE < LIBRBD_VERSION(0, 1, 1)
    /* rbd_flush was only added in 0.1.1 */
    return 0;
#else
    BDRVRBDState *s = bs->opaque;

    return rbd_flush(s->image);
#endif
}
770 7a3f5fe9 Sage Weil
771 ad32e9c0 Josh Durgin
/*
 * Fill in bdi->cluster_size with the object size reported by rbd_stat().
 *
 * Returns 0 on success or the negative value returned by rbd_stat(), in
 * which case bdi is left untouched.
 */
static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVRBDState *s = bs->opaque;
    rbd_image_info_t info;
    int r = rbd_stat(s->image, &info, sizeof(info));

    if (r >= 0) {
        bdi->cluster_size = info.obj_size;
        r = 0;
    }

    return r;
}
785 f27aaf4b Christian Brunner
786 ad32e9c0 Josh Durgin
/*
 * Return the size reported by rbd_stat() for the image, or the negative
 * value returned by rbd_stat() if it fails.
 */
static int64_t qemu_rbd_getlength(BlockDriverState *bs)
{
    BDRVRBDState *s = bs->opaque;
    rbd_image_info_t info;
    int r = rbd_stat(s->image, &info, sizeof(info));

    if (r >= 0) {
        return info.size;
    }

    return r;
}
799 f27aaf4b Christian Brunner
800 30cdc48c Josh Durgin
/*
 * Resize the image to offset via rbd_resize().
 *
 * Returns 0 on success or the negative value returned by rbd_resize().
 */
static int qemu_rbd_truncate(BlockDriverState *bs, int64_t offset)
{
    BDRVRBDState *s = bs->opaque;
    int r = rbd_resize(s->image, offset);

    return r < 0 ? r : 0;
}
812 30cdc48c Josh Durgin
813 ad32e9c0 Josh Durgin
static int qemu_rbd_snap_create(BlockDriverState *bs,
814 ad32e9c0 Josh Durgin
                                QEMUSnapshotInfo *sn_info)
815 f27aaf4b Christian Brunner
{
816 f27aaf4b Christian Brunner
    BDRVRBDState *s = bs->opaque;
817 f27aaf4b Christian Brunner
    int r;
818 f27aaf4b Christian Brunner
819 f27aaf4b Christian Brunner
    if (sn_info->name[0] == '\0') {
820 f27aaf4b Christian Brunner
        return -EINVAL; /* we need a name for rbd snapshots */
821 f27aaf4b Christian Brunner
    }
822 f27aaf4b Christian Brunner
823 f27aaf4b Christian Brunner
    /*
824 f27aaf4b Christian Brunner
     * rbd snapshots are using the name as the user controlled unique identifier
825 f27aaf4b Christian Brunner
     * we can't use the rbd snapid for that purpose, as it can't be set
826 f27aaf4b Christian Brunner
     */
827 f27aaf4b Christian Brunner
    if (sn_info->id_str[0] != '\0' &&
828 f27aaf4b Christian Brunner
        strcmp(sn_info->id_str, sn_info->name) != 0) {
829 f27aaf4b Christian Brunner
        return -EINVAL;
830 f27aaf4b Christian Brunner
    }
831 f27aaf4b Christian Brunner
832 f27aaf4b Christian Brunner
    if (strlen(sn_info->name) >= sizeof(sn_info->id_str)) {
833 f27aaf4b Christian Brunner
        return -ERANGE;
834 f27aaf4b Christian Brunner
    }
835 f27aaf4b Christian Brunner
836 ad32e9c0 Josh Durgin
    r = rbd_snap_create(s->image, sn_info->name);
837 f27aaf4b Christian Brunner
    if (r < 0) {
838 ad32e9c0 Josh Durgin
        error_report("failed to create snap: %s", strerror(-r));
839 f27aaf4b Christian Brunner
        return r;
840 f27aaf4b Christian Brunner
    }
841 f27aaf4b Christian Brunner
842 f27aaf4b Christian Brunner
    return 0;
843 f27aaf4b Christian Brunner
}
844 f27aaf4b Christian Brunner
845 bd603247 Gregory Farnum
/*
 * Remove the snapshot called snapshot_name.
 * Returns the result of rbd_snap_remove() unchanged.
 */
static int qemu_rbd_snap_remove(BlockDriverState *bs,
                                const char *snapshot_name)
{
    BDRVRBDState *s = bs->opaque;

    return rbd_snap_remove(s->image, snapshot_name);
}
854 bd603247 Gregory Farnum
855 bd603247 Gregory Farnum
/*
 * Roll the image back to the snapshot called snapshot_name.
 * Returns the result of rbd_snap_rollback() unchanged.
 */
static int qemu_rbd_snap_rollback(BlockDriverState *bs,
                                  const char *snapshot_name)
{
    BDRVRBDState *s = bs->opaque;

    return rbd_snap_rollback(s->image, snapshot_name);
}
864 bd603247 Gregory Farnum
865 ad32e9c0 Josh Durgin
static int qemu_rbd_snap_list(BlockDriverState *bs,
866 ad32e9c0 Josh Durgin
                              QEMUSnapshotInfo **psn_tab)
867 f27aaf4b Christian Brunner
{
868 f27aaf4b Christian Brunner
    BDRVRBDState *s = bs->opaque;
869 f27aaf4b Christian Brunner
    QEMUSnapshotInfo *sn_info, *sn_tab = NULL;
870 ad32e9c0 Josh Durgin
    int i, snap_count;
871 ad32e9c0 Josh Durgin
    rbd_snap_info_t *snaps;
872 ad32e9c0 Josh Durgin
    int max_snaps = RBD_MAX_SNAPS;
873 f27aaf4b Christian Brunner
874 ad32e9c0 Josh Durgin
    do {
875 7267c094 Anthony Liguori
        snaps = g_malloc(sizeof(*snaps) * max_snaps);
876 ad32e9c0 Josh Durgin
        snap_count = rbd_snap_list(s->image, snaps, &max_snaps);
877 ad32e9c0 Josh Durgin
        if (snap_count < 0) {
878 7267c094 Anthony Liguori
            g_free(snaps);
879 f27aaf4b Christian Brunner
        }
880 ad32e9c0 Josh Durgin
    } while (snap_count == -ERANGE);
881 f27aaf4b Christian Brunner
882 ad32e9c0 Josh Durgin
    if (snap_count <= 0) {
883 b9c53290 Josh Durgin
        goto done;
884 f27aaf4b Christian Brunner
    }
885 f27aaf4b Christian Brunner
886 7267c094 Anthony Liguori
    sn_tab = g_malloc0(snap_count * sizeof(QEMUSnapshotInfo));
887 f27aaf4b Christian Brunner
888 ad32e9c0 Josh Durgin
    for (i = 0; i < snap_count; i++) {
889 ad32e9c0 Josh Durgin
        const char *snap_name = snaps[i].name;
890 f27aaf4b Christian Brunner
891 f27aaf4b Christian Brunner
        sn_info = sn_tab + i;
892 f27aaf4b Christian Brunner
        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), snap_name);
893 f27aaf4b Christian Brunner
        pstrcpy(sn_info->name, sizeof(sn_info->name), snap_name);
894 f27aaf4b Christian Brunner
895 ad32e9c0 Josh Durgin
        sn_info->vm_state_size = snaps[i].size;
896 f27aaf4b Christian Brunner
        sn_info->date_sec = 0;
897 f27aaf4b Christian Brunner
        sn_info->date_nsec = 0;
898 f27aaf4b Christian Brunner
        sn_info->vm_clock_nsec = 0;
899 f27aaf4b Christian Brunner
    }
900 ad32e9c0 Josh Durgin
    rbd_snap_list_end(snaps);
901 ad32e9c0 Josh Durgin
902 b9c53290 Josh Durgin
 done:
903 f27aaf4b Christian Brunner
    *psn_tab = sn_tab;
904 f27aaf4b Christian Brunner
    return snap_count;
905 f27aaf4b Christian Brunner
}
906 f27aaf4b Christian Brunner
907 787f3133 Josh Durgin
#ifdef LIBRBD_SUPPORTS_DISCARD
/*
 * Asynchronous discard: forwards to rbd_start_aio() with RBD_AIO_DISCARD
 * and no I/O vector, and returns whatever it returns. Only built when
 * LIBRBD_SUPPORTS_DISCARD is defined.
 */
static BlockDriverAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
                                              int64_t sector_num,
                                              int nb_sectors,
                                              BlockDriverCompletionFunc *cb,
                                              void *opaque)
{
    BlockDriverAIOCB *req;

    req = rbd_start_aio(bs, sector_num, NULL, nb_sectors,
                        cb, opaque, RBD_AIO_DISCARD);
    return req;
}
#endif
918 787f3133 Josh Durgin
919 ad32e9c0 Josh Durgin
static QEMUOptionParameter qemu_rbd_create_options[] = {
920 f27aaf4b Christian Brunner
    {
921 f27aaf4b Christian Brunner
     .name = BLOCK_OPT_SIZE,
922 f27aaf4b Christian Brunner
     .type = OPT_SIZE,
923 f27aaf4b Christian Brunner
     .help = "Virtual disk size"
924 f27aaf4b Christian Brunner
    },
925 f27aaf4b Christian Brunner
    {
926 f27aaf4b Christian Brunner
     .name = BLOCK_OPT_CLUSTER_SIZE,
927 f27aaf4b Christian Brunner
     .type = OPT_SIZE,
928 f27aaf4b Christian Brunner
     .help = "RBD object size"
929 f27aaf4b Christian Brunner
    },
930 f27aaf4b Christian Brunner
    {NULL}
931 f27aaf4b Christian Brunner
};
932 f27aaf4b Christian Brunner
933 f27aaf4b Christian Brunner
static BlockDriver bdrv_rbd = {
934 f27aaf4b Christian Brunner
    .format_name        = "rbd",
935 f27aaf4b Christian Brunner
    .instance_size      = sizeof(BDRVRBDState),
936 ad32e9c0 Josh Durgin
    .bdrv_file_open     = qemu_rbd_open,
937 ad32e9c0 Josh Durgin
    .bdrv_close         = qemu_rbd_close,
938 ad32e9c0 Josh Durgin
    .bdrv_create        = qemu_rbd_create,
939 ad32e9c0 Josh Durgin
    .bdrv_get_info      = qemu_rbd_getinfo,
940 ad32e9c0 Josh Durgin
    .create_options     = qemu_rbd_create_options,
941 ad32e9c0 Josh Durgin
    .bdrv_getlength     = qemu_rbd_getlength,
942 30cdc48c Josh Durgin
    .bdrv_truncate      = qemu_rbd_truncate,
943 f27aaf4b Christian Brunner
    .protocol_name      = "rbd",
944 f27aaf4b Christian Brunner
945 c68b89ac Kevin Wolf
    .bdrv_aio_readv         = qemu_rbd_aio_readv,
946 c68b89ac Kevin Wolf
    .bdrv_aio_writev        = qemu_rbd_aio_writev,
947 c68b89ac Kevin Wolf
    .bdrv_co_flush_to_disk  = qemu_rbd_co_flush,
948 f27aaf4b Christian Brunner
949 787f3133 Josh Durgin
#ifdef LIBRBD_SUPPORTS_DISCARD
950 787f3133 Josh Durgin
    .bdrv_aio_discard       = qemu_rbd_aio_discard,
951 787f3133 Josh Durgin
#endif
952 787f3133 Josh Durgin
953 c68b89ac Kevin Wolf
    .bdrv_snapshot_create   = qemu_rbd_snap_create,
954 bd603247 Gregory Farnum
    .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
955 c68b89ac Kevin Wolf
    .bdrv_snapshot_list     = qemu_rbd_snap_list,
956 bd603247 Gregory Farnum
    .bdrv_snapshot_goto     = qemu_rbd_snap_rollback,
957 f27aaf4b Christian Brunner
};
958 f27aaf4b Christian Brunner
959 f27aaf4b Christian Brunner
static void bdrv_rbd_init(void)
960 f27aaf4b Christian Brunner
{
961 f27aaf4b Christian Brunner
    bdrv_register(&bdrv_rbd);
962 f27aaf4b Christian Brunner
}
963 f27aaf4b Christian Brunner
964 f27aaf4b Christian Brunner
block_init(bdrv_rbd_init);