root / block / nbd.c @ fc19f8a0
History | View | Annotate | Download (13.8 kB)
1 |
/*
|
---|---|
2 |
* QEMU Block driver for NBD
|
3 |
*
|
4 |
* Copyright (C) 2008 Bull S.A.S.
|
5 |
* Author: Laurent Vivier <Laurent.Vivier@bull.net>
|
6 |
*
|
7 |
* Some parts:
|
8 |
* Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
|
9 |
*
|
10 |
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
11 |
* of this software and associated documentation files (the "Software"), to deal
|
12 |
* in the Software without restriction, including without limitation the rights
|
13 |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
14 |
* copies of the Software, and to permit persons to whom the Software is
|
15 |
* furnished to do so, subject to the following conditions:
|
16 |
*
|
17 |
* The above copyright notice and this permission notice shall be included in
|
18 |
* all copies or substantial portions of the Software.
|
19 |
*
|
20 |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21 |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22 |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
23 |
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24 |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
25 |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
26 |
* THE SOFTWARE.
|
27 |
*/
|
28 |
|
29 |
#include "qemu-common.h" |
30 |
#include "nbd.h" |
31 |
#include "block_int.h" |
32 |
#include "module.h" |
33 |
#include "qemu_socket.h" |
34 |
|
35 |
#include <sys/types.h> |
36 |
#include <unistd.h> |
37 |
|
38 |
/* Separator that introduces the export name inside an "nbd:" filename. */
#define EN_OPTSTR ":exportname="
39 |
|
40 |
/* #define DEBUG_NBD */
|
41 |
|
42 |
/*
 * Debug tracing helper.  When DEBUG_NBD is defined, logout() prints to
 * stderr, prefixing the message with "nbd" and the calling function's
 * name; otherwise it expands to a no-op expression.
 */
#ifdef DEBUG_NBD
#define logout(fmt, ...) \
    fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__)
#else
#define logout(fmt, ...) ((void)0)
#endif
|
48 |
|
49 |
/* Number of slots in BDRVNBDState.recv_coroutine[]. */
#define MAX_NBD_REQUESTS 16

/*
 * Request handles are the slot index XORed with the address of the state
 * object, so the two conversions are inverses of each other.  Every macro
 * argument is parenthesised so that compound expressions expand correctly.
 */
#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)(bs)))
#define INDEX_TO_HANDLE(bs, index)  ((index) ^ ((uint64_t)(intptr_t)(bs)))
|
52 |
|
53 |
typedef struct BDRVNBDState { |
54 |
int sock;
|
55 |
uint32_t nbdflags; |
56 |
off_t size; |
57 |
size_t blocksize; |
58 |
char *export_name; /* An NBD server may export several devices */ |
59 |
|
60 |
CoMutex send_mutex; |
61 |
CoMutex free_sema; |
62 |
Coroutine *send_coroutine; |
63 |
int in_flight;
|
64 |
|
65 |
Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; |
66 |
struct nbd_reply reply;
|
67 |
|
68 |
/* If it begins with '/', this is a UNIX domain socket. Otherwise,
|
69 |
* it's a string of the form <hostname|ip4|\[ip6\]>:port
|
70 |
*/
|
71 |
char *host_spec;
|
72 |
} BDRVNBDState; |
73 |
|
74 |
/*
 * Parse an "nbd:" filename into the connection settings stored in @s.
 *
 * Accepted forms, as implemented below:
 *   nbd:<host_spec>[:exportname=<name>]
 *   nbd:unix:<absolute path>[:exportname=<name>]
 *
 * On success s->host_spec (and s->export_name, when the option is present)
 * hold newly allocated copies and 0 is returned.  On failure -EINVAL is
 * returned, both strings are freed and the pointers are reset to NULL so
 * that no dangling pointer is left behind in the state object.
 *
 * @flags is accepted for interface compatibility and is not used.
 */
static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
{
    char *file;
    char *export_name;
    const char *host_spec;
    const char *unixpath;
    int err = -EINVAL;

    /* Work on a private copy: the export-name suffix is cut off in place. */
    file = g_strdup(filename);

    export_name = strstr(file, EN_OPTSTR);
    if (export_name) {
        /* An empty export name is rejected. */
        if (export_name[strlen(EN_OPTSTR)] == 0) {
            goto out;
        }
        export_name[0] = 0; /* truncate 'file' */
        export_name += strlen(EN_OPTSTR);
        s->export_name = g_strdup(export_name);
    }

    /* extract the host_spec - fail if it's not nbd:... */
    if (!strstart(file, "nbd:", &host_spec)) {
        goto out;
    }

    /* are we a UNIX or TCP socket? */
    if (strstart(host_spec, "unix:", &unixpath)) {
        if (unixpath[0] != '/') { /* We demand an absolute path */
            goto out;
        }
        s->host_spec = g_strdup(unixpath);
    } else {
        s->host_spec = g_strdup(host_spec);
    }

    err = 0;

out:
    g_free(file);
    if (err != 0) {
        /* Release partial results and clear the now-stale pointers. */
        g_free(s->export_name);
        s->export_name = NULL;
        g_free(s->host_spec);
        s->host_spec = NULL;
    }
    return err;
}
119 |
|
120 |
/*
 * Reserve a receive slot for the calling coroutine and derive the request
 * handle from the slot index.  May block on free_sema when the number of
 * requests in flight is at the limit.
 */
static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request)
{
    int slot = 0;

    /* Poor man semaphore.  The free_sema is locked when no other request
     * can be accepted, and unlocked after receiving one reply. */
    if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
        qemu_co_mutex_lock(&s->free_sema);
        assert(s->in_flight < MAX_NBD_REQUESTS);
    }
    s->in_flight++;

    /* Look for the first unused entry in the receive table. */
    while (slot < MAX_NBD_REQUESTS && s->recv_coroutine[slot] != NULL) {
        slot++;
    }
    if (slot < MAX_NBD_REQUESTS) {
        s->recv_coroutine[slot] = qemu_coroutine_self();
    }

    assert(slot < MAX_NBD_REQUESTS);
    request->handle = INDEX_TO_HANDLE(s, slot);
}
142 |
|
143 |
static int nbd_have_request(void *opaque) |
144 |
{ |
145 |
BDRVNBDState *s = opaque; |
146 |
|
147 |
return s->in_flight > 0; |
148 |
} |
149 |
|
150 |
static void nbd_reply_ready(void *opaque) |
151 |
{ |
152 |
BDRVNBDState *s = opaque; |
153 |
uint64_t i; |
154 |
|
155 |
if (s->reply.handle == 0) { |
156 |
/* No reply already in flight. Fetch a header. */
|
157 |
if (nbd_receive_reply(s->sock, &s->reply) < 0) { |
158 |
s->reply.handle = 0;
|
159 |
goto fail;
|
160 |
} |
161 |
} |
162 |
|
163 |
/* There's no need for a mutex on the receive side, because the
|
164 |
* handler acts as a synchronization point and ensures that only
|
165 |
* one coroutine is called until the reply finishes. */
|
166 |
i = HANDLE_TO_INDEX(s, s->reply.handle); |
167 |
if (i >= MAX_NBD_REQUESTS) {
|
168 |
goto fail;
|
169 |
} |
170 |
|
171 |
if (s->recv_coroutine[i]) {
|
172 |
qemu_coroutine_enter(s->recv_coroutine[i], NULL);
|
173 |
return;
|
174 |
} |
175 |
|
176 |
fail:
|
177 |
for (i = 0; i < MAX_NBD_REQUESTS; i++) { |
178 |
if (s->recv_coroutine[i]) {
|
179 |
qemu_coroutine_enter(s->recv_coroutine[i], NULL);
|
180 |
} |
181 |
} |
182 |
} |
183 |
|
184 |
static void nbd_restart_write(void *opaque) |
185 |
{ |
186 |
BDRVNBDState *s = opaque; |
187 |
qemu_coroutine_enter(s->send_coroutine, NULL);
|
188 |
} |
189 |
|
190 |
/*
 * Send one request header and, when @iov is non-NULL, request->len bytes of
 * payload taken from @iov starting at @offset.
 *
 * The send is serialised with send_mutex.  While it is in progress the
 * calling coroutine is recorded in send_coroutine and nbd_restart_write is
 * installed as the socket's write handler; both are undone before returning.
 *
 * Returns the non-negative result of nbd_send_request() on success, or a
 * negative value on failure.  On failure errno holds a positive error code
 * (EIO for a short payload send), which is what callers in this file read
 * and negate.
 */
static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request,
                               struct iovec *iov, int offset)
{
    int rc, ret;
    int saved_errno = 0;

    qemu_co_mutex_lock(&s->send_mutex);
    s->send_coroutine = qemu_coroutine_self();
    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write,
                            nbd_have_request, NULL, s);
    rc = nbd_send_request(s->sock, request);
    if (rc >= 0 && iov) {
        ret = qemu_co_sendv(s->sock, iov, request->len, offset);
        if (ret != request->len) {
            /* errno values are positive; callers negate them themselves. */
            errno = EIO;
            rc = -1;
        }
    }
    if (rc < 0) {
        /* The cleanup calls below may overwrite errno. */
        saved_errno = errno;
    }
    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
                            nbd_have_request, NULL, s);
    s->send_coroutine = NULL;
    qemu_co_mutex_unlock(&s->send_mutex);
    if (rc < 0) {
        errno = saved_errno;
    }
    return rc;
}
213 |
|
214 |
/*
 * Yield until the read handler re-enters this coroutine, then copy the
 * pending reply header into @reply.  If the header belongs to @request and
 * reports no error, request->len bytes of payload are received into @iov
 * (when non-NULL) at @offset.  A handle mismatch or a short receive is
 * reported as EIO in reply->error.
 */
static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request,
                                 struct nbd_reply *reply,
                                 struct iovec *iov, int offset)
{
    /* Wait until we're woken up by the read handler.  TODO: perhaps
     * peek at the next reply and avoid yielding if it's ours? */
    qemu_coroutine_yield();
    *reply = s->reply;

    if (reply->handle != request->handle) {
        /* Not our reply: leave the pending header untouched. */
        reply->error = EIO;
        return;
    }

    if (iov && reply->error == 0) {
        int received = qemu_co_recvv(s->sock, iov, request->len, offset);

        if (received != request->len) {
            reply->error = EIO;
        }
    }

    /* Tell the read handler to read another header. */
    s->reply.handle = 0;
}
238 |
|
239 |
/*
 * Release the receive slot encoded in request->handle and decrement the
 * in-flight counter.  free_sema is unlocked when the counter was at
 * MAX_NBD_REQUESTS before the decrement.
 */
static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request)
{
    int slot = HANDLE_TO_INDEX(s, request->handle);
    int previously_in_flight = s->in_flight;

    s->recv_coroutine[slot] = NULL;
    s->in_flight = previously_in_flight - 1;
    if (previously_in_flight == MAX_NBD_REQUESTS) {
        qemu_co_mutex_unlock(&s->free_sema);
    }
}
247 |
|
248 |
/*
 * Connect to the server named by s->host_spec, run the NBD handshake and
 * register the reply handler on the new socket.
 *
 * A host_spec starting with '/' is opened as a UNIX socket, anything else
 * is handed to tcp_socket_outgoing_spec().
 *
 * Returns 0 on success.  On failure returns the negated errno observed
 * right after the failing call; the socket is closed if it had been opened
 * and the state object is left unmodified apart from nbdflags.
 */
static int nbd_establish_connection(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    int sock;
    int ret;
    int saved_errno;
    off_t size;
    size_t blocksize;

    if (s->host_spec[0] == '/') {
        sock = unix_socket_outgoing(s->host_spec);
    } else {
        sock = tcp_socket_outgoing_spec(s->host_spec);
    }

    /* Failed to establish connection */
    if (sock < 0) {
        /* Capture errno before the debug output can overwrite it. */
        saved_errno = errno;
        logout("Failed to establish connection to NBD server\n");
        return -saved_errno;
    }

    /* NBD handshake */
    ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size,
                                &blocksize);
    if (ret < 0) {
        /* closesocket() and the debug output may overwrite errno. */
        saved_errno = errno;
        logout("Failed to negotiate with the NBD server\n");
        closesocket(sock);
        return -saved_errno;
    }

    /* Now that we're connected, set the socket to be non-blocking and
     * kick the reply mechanism. */
    socket_set_nonblock(sock);

    s->sock = sock;
    s->size = size;
    s->blocksize = blocksize;

    /* Register on the socket just opened; s->sock is valid only from here. */
    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
                            nbd_have_request, NULL, s);

    logout("Established connection with NBD server\n");
    return 0;
}
290 |
|
291 |
/*
 * Send a disconnect request to the server (best effort: the result of the
 * send is not checked), unregister the socket's handlers and close it.
 */
static void nbd_teardown_connection(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    /*
     * Designated initialisers zero every member that is not named, so the
     * handle is not left holding indeterminate stack contents when the
     * request is handed to nbd_send_request().
     */
    struct nbd_request request = {
        .type = NBD_CMD_DISC,
        .from = 0,
        .len = 0,
    };

    nbd_send_request(s->sock, &request);

    qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL, NULL);
    closesocket(s->sock);
}
304 |
|
305 |
/*
 * Open an NBD device: initialise the coroutine mutexes, parse @filename
 * into the state object and connect to the server.
 *
 * Returns 0 on success or the negative error from nbd_config() /
 * nbd_establish_connection().  When the connection cannot be established,
 * the strings allocated by nbd_config() are released here and the pointers
 * cleared, so the failure path does not leak them and a later g_free() on
 * them stays harmless.
 */
static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
{
    BDRVNBDState *s = bs->opaque;
    int result;

    qemu_co_mutex_init(&s->send_mutex);
    qemu_co_mutex_init(&s->free_sema);

    /* Pop the config into our state object. Exit if invalid. */
    result = nbd_config(s, filename, flags);
    if (result != 0) {
        return result;
    }

    /* establish TCP connection, return error if it fails
     * TODO: Configurable retry-until-timeout behaviour.
     */
    result = nbd_establish_connection(bs);
    if (result != 0) {
        g_free(s->export_name);
        s->export_name = NULL;
        g_free(s->host_spec);
        s->host_spec = NULL;
    }

    return result;
}
326 |
|
327 |
/*
 * Issue a single NBD_CMD_READ for @nb_sectors 512-byte sectors starting at
 * @sector_num and receive the data into @qiov at byte @offset.
 *
 * Returns the negated reply error: 0 on success, otherwise the negated
 * value of errno (send failure) or of the reply's error field.
 */
static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors, QEMUIOVector *qiov,
                          int offset)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;

    request.type = NBD_CMD_READ;
    request.from = sector_num * 512;
    request.len = nb_sectors * 512;

    nbd_coroutine_start(s, &request);
    if (nbd_co_send_request(s, &request, NULL, 0) < 0) {
        reply.error = errno;
    } else {
        nbd_co_receive_reply(s, &request, &reply, qiov->iov, offset);
    }
    nbd_coroutine_end(s, &request);

    return -reply.error;
}
351 |
|
352 |
/*
 * Issue a single NBD_CMD_WRITE for @nb_sectors 512-byte sectors starting at
 * @sector_num, sending the data from @qiov at byte @offset.  The FUA flag
 * is added when the write cache is disabled and the server advertises
 * NBD_FLAG_SEND_FUA.
 *
 * Returns the negated reply error: 0 on success, otherwise the negated
 * value of errno (send failure) or of the reply's error field.
 */
static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
                           int nb_sectors, QEMUIOVector *qiov,
                           int offset)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;

    request.type = NBD_CMD_WRITE;
    if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) {
        request.type |= NBD_CMD_FLAG_FUA;
    }
    request.from = sector_num * 512;
    request.len = nb_sectors * 512;

    nbd_coroutine_start(s, &request);
    if (nbd_co_send_request(s, &request, qiov->iov, offset) < 0) {
        reply.error = errno;
    } else {
        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(s, &request);

    return -reply.error;
}
379 |
|
380 |
/* qemu-nbd has a limit of slightly less than 1M per request.  Try to
 * remain aligned to 4K: 2040 sectors * 512 bytes = 255 * 4096 bytes. */
#define NBD_MAX_SECTORS 2040
383 |
|
384 |
/*
 * Read @nb_sectors sectors starting at @sector_num into @qiov, splitting
 * the transfer into pieces of at most NBD_MAX_SECTORS sectors.  Stops at
 * the first piece that fails and returns its negative result; otherwise
 * returns the result of the final piece.
 */
static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
                        int nb_sectors, QEMUIOVector *qiov)
{
    int byte_offset = 0;

    for (; nb_sectors > NBD_MAX_SECTORS;
         byte_offset += NBD_MAX_SECTORS * 512,
         sector_num += NBD_MAX_SECTORS,
         nb_sectors -= NBD_MAX_SECTORS) {
        int rc = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov,
                                byte_offset);

        if (rc < 0) {
            return rc;
        }
    }

    /* Remaining tail (or the whole request when it was small enough). */
    return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, byte_offset);
}
400 |
|
401 |
/*
 * Write @nb_sectors sectors starting at @sector_num from @qiov, splitting
 * the transfer into pieces of at most NBD_MAX_SECTORS sectors.  Stops at
 * the first piece that fails and returns its negative result; otherwise
 * returns the result of the final piece.
 */
static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
{
    int byte_offset = 0;

    for (; nb_sectors > NBD_MAX_SECTORS;
         byte_offset += NBD_MAX_SECTORS * 512,
         sector_num += NBD_MAX_SECTORS,
         nb_sectors -= NBD_MAX_SECTORS) {
        int rc = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov,
                                 byte_offset);

        if (rc < 0) {
            return rc;
        }
    }

    /* Remaining tail (or the whole request when it was small enough). */
    return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, byte_offset);
}
417 |
|
418 |
/*
 * Send NBD_CMD_FLUSH to the server and wait for its reply.  Returns 0
 * without contacting the server when NBD_FLAG_SEND_FLUSH was not
 * negotiated; otherwise returns the negated reply error.
 */
static int nbd_co_flush(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;

    if ((s->nbdflags & NBD_FLAG_SEND_FLUSH) == 0) {
        return 0;
    }

    request.type = NBD_CMD_FLUSH;
    if (s->nbdflags & NBD_FLAG_SEND_FUA) {
        request.type |= NBD_CMD_FLAG_FUA;
    }
    request.len = 0;
    request.from = 0;

    nbd_coroutine_start(s, &request);
    if (nbd_co_send_request(s, &request, NULL, 0) < 0) {
        reply.error = errno;
    } else {
        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(s, &request);

    return -reply.error;
}
447 |
|
448 |
/*
 * Send NBD_CMD_TRIM for @nb_sectors 512-byte sectors starting at
 * @sector_num and wait for the reply.  Returns 0 without contacting the
 * server when NBD_FLAG_SEND_TRIM was not negotiated; otherwise returns the
 * negated reply error.
 */
static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;

    if ((s->nbdflags & NBD_FLAG_SEND_TRIM) == 0) {
        return 0;
    }

    request.type = NBD_CMD_TRIM;
    request.from = sector_num * 512;
    request.len = nb_sectors * 512;

    nbd_coroutine_start(s, &request);
    if (nbd_co_send_request(s, &request, NULL, 0) < 0) {
        reply.error = errno;
    } else {
        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(s, &request);

    return -reply.error;
}
473 |
|
474 |
/*
 * Close the device: free the strings stored by nbd_config() and tear down
 * the server connection.
 */
static void nbd_close(BlockDriverState *bs)
{
    BDRVNBDState *state = bs->opaque;

    g_free(state->export_name);
    g_free(state->host_spec);

    nbd_teardown_connection(bs);
}
482 |
|
483 |
/* Return the device size recorded during connection setup. */
static int64_t nbd_getlength(BlockDriverState *bs)
{
    const BDRVNBDState *state = bs->opaque;

    return state->size;
}
489 |
|
490 |
static BlockDriver bdrv_nbd = {
|
491 |
.format_name = "nbd",
|
492 |
.instance_size = sizeof(BDRVNBDState),
|
493 |
.bdrv_file_open = nbd_open, |
494 |
.bdrv_co_readv = nbd_co_readv, |
495 |
.bdrv_co_writev = nbd_co_writev, |
496 |
.bdrv_close = nbd_close, |
497 |
.bdrv_co_flush_to_os = nbd_co_flush, |
498 |
.bdrv_co_discard = nbd_co_discard, |
499 |
.bdrv_getlength = nbd_getlength, |
500 |
.protocol_name = "nbd",
|
501 |
}; |
502 |
|
503 |
static void bdrv_nbd_init(void) |
504 |
{ |
505 |
bdrv_register(&bdrv_nbd); |
506 |
} |
507 |
|
508 |
block_init(bdrv_nbd_init); |