Statistics
| Branch: | Revision:

root / nbd.c @ 0ddf08db

History | View | Annotate | Download (26 kB)

1
/*
2
 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
3
 *
4
 *  Network Block Device
5
 *
6
 *  This program is free software; you can redistribute it and/or modify
7
 *  it under the terms of the GNU General Public License as published by
8
 *  the Free Software Foundation; under version 2 of the License.
9
 *
10
 *  This program is distributed in the hope that it will be useful,
11
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 *  GNU General Public License for more details.
14
 *
15
 *  You should have received a copy of the GNU General Public License
16
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
17
 */
18

    
19
#include "nbd.h"
20
#include "block.h"
21

    
22
#include "qemu-coroutine.h"
23

    
24
#include <errno.h>
25
#include <string.h>
26
#ifndef _WIN32
27
#include <sys/ioctl.h>
28
#endif
29
#if defined(__sun__) || defined(__HAIKU__)
30
#include <sys/ioccom.h>
31
#endif
32
#include <ctype.h>
33
#include <inttypes.h>
34

    
35
#ifdef __linux__
36
#include <linux/fs.h>
37
#endif
38

    
39
#include "qemu_socket.h"
40
#include "qemu-queue.h"
41

    
42
//#define DEBUG_NBD
43

    
44
/*
 * Diagnostic helpers.
 *
 * LOG() always prints to stderr, prefixing the message with the file,
 * function and line of the call site and appending a newline.
 * TRACE() forwards to LOG() only when DEBUG_NBD is defined; otherwise it
 * expands to an empty statement and its arguments are never evaluated.
 */
#define LOG(msg, ...) do { \
    fprintf(stderr, "%s:%s():L%d: " msg "\n", \
            __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__); \
} while(0)

#ifndef DEBUG_NBD
#define TRACE(msg, ...) \
    do { } while (0)
#else
#define TRACE(msg, ...) do { \
    LOG(msg, ## __VA_ARGS__); \
} while(0)
#endif
57

    
58
/* This is all part of the "official" NBD API */
59

    
60
/*
 * Wire sizes, in bytes:
 *   request = magic(4) + type(4) + handle(8) + from(8) + len(4)
 *   reply   = magic(4) + error(4) + handle(8)
 */
#define NBD_REQUEST_SIZE        (4 + 4 + 8 + 8 + 4)
#define NBD_REPLY_SIZE          (4 + 4 + 8)

/* Magic numbers that open every request and every reply. */
#define NBD_REQUEST_MAGIC       0x25609513
#define NBD_REPLY_MAGIC         0x67446698

/*
 * Handshake magics.  NBD_OPTS_MAGIC (the ASCII bytes "IHAVEOPT") is what
 * nbd_receive_negotiate() expects when an export name is supplied;
 * NBD_CLIENT_MAGIC is used when there is no export name.
 */
#define NBD_OPTS_MAGIC          0x49484156454F5054LL
#define NBD_CLIENT_MAGIC        0x0000420281861253LL

/* ioctl request numbers issued on the NBD device file descriptor. */
#define NBD_SET_SOCK            _IO(0xab, 0)
#define NBD_SET_BLKSIZE         _IO(0xab, 1)
#define NBD_SET_SIZE            _IO(0xab, 2)
#define NBD_DO_IT               _IO(0xab, 3)
#define NBD_CLEAR_SOCK          _IO(0xab, 4)
#define NBD_CLEAR_QUE           _IO(0xab, 5)
#define NBD_PRINT_DEBUG         _IO(0xab, 6)
#define NBD_SET_SIZE_BLOCKS     _IO(0xab, 7)
#define NBD_DISCONNECT          _IO(0xab, 8)
#define NBD_SET_TIMEOUT         _IO(0xab, 9)
#define NBD_SET_FLAGS           _IO(0xab, 10)

/* Option code sent by the client to select an export by name. */
#define NBD_OPT_EXPORT_NAME     (1 << 0)
80

    
81
/* Definitions for opaque data types */
82

    
83
typedef struct NBDRequest NBDRequest;
84

    
85
struct NBDRequest {
86
    QSIMPLEQ_ENTRY(NBDRequest) entry;
87
    NBDClient *client;
88
    uint8_t *data;
89
};
90

    
91
struct NBDExport {
92
    int refcount;
93
    void (*close)(NBDExport *exp);
94

    
95
    BlockDriverState *bs;
96
    off_t dev_offset;
97
    off_t size;
98
    uint32_t nbdflags;
99
    QTAILQ_HEAD(, NBDClient) clients;
100
    QSIMPLEQ_HEAD(, NBDRequest) requests;
101
};
102

    
103
struct NBDClient {
104
    int refcount;
105
    void (*close)(NBDClient *client);
106

    
107
    NBDExport *exp;
108
    int sock;
109

    
110
    Coroutine *recv_coroutine;
111

    
112
    CoMutex send_lock;
113
    Coroutine *send_coroutine;
114

    
115
    QTAILQ_ENTRY(NBDClient) next;
116
    int nb_requests;
117
    bool closing;
118
};
119

    
120
/* That's all folks */
121

    
122
/*
 * Transfer exactly @size bytes between @fd and @buffer.
 *
 * @do_read selects the direction: true receives into @buffer, false sends
 * from it.  When called from a coroutine the whole transfer is delegated to
 * qemu_co_recv()/qemu_co_send().
 *
 * Returns the number of bytes transferred (less than @size if the peer
 * closed the connection), or a negative errno value on an unrecoverable
 * error.  -EAGAIN is only returned when nothing has been transferred yet.
 */
ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
{
    char *cursor = buffer;
    size_t done = 0;

    if (qemu_in_coroutine()) {
        return do_read ? qemu_co_recv(fd, buffer, size)
                       : qemu_co_send(fd, buffer, size);
    }

    while (done < size) {
        ssize_t n;

        if (do_read) {
            n = qemu_recv(fd, cursor + done, size - done, 0);
        } else {
            n = send(fd, cursor + done, size - done, 0);
        }

        if (n == 0) {
            /* Peer closed the connection: report the short count. */
            break;
        }

        if (n < 0) {
            int err = socket_error();

            /* Retry on EINTR, and on EAGAIN once a transfer has started. */
            if (err == EINTR || (done > 0 && err == EAGAIN)) {
                continue;
            }
            return -err;
        }

        done += n;
    }

    return done;
}
166

    
167
/*
 * Receive @size bytes from @fd into @buffer via nbd_wr_sync().
 *
 * The socket is blocking during negotiation.  Afterwards, finding it
 * non-readable just means someone else consumed our request/reply;
 * recv_coroutine provides the synchronization that keeps this
 * coroutine-safe.
 */
static ssize_t read_sync(int fd, void *buffer, size_t size)
{
    const bool receiving = true;

    return nbd_wr_sync(fd, buffer, size, receiving);
}
176

    
177
/*
 * Send @size bytes from @buffer to @fd, retrying for as long as
 * nbd_wr_sync() reports -EAGAIN (for writes the socket is expected to
 * become writable).
 *
 * Returns the number of bytes written or a negative errno value.
 */
static ssize_t write_sync(int fd, void *buffer, size_t size)
{
    /* ssize_t, not int: nbd_wr_sync() returns ssize_t and must not be
     * truncated for large transfers. */
    ssize_t ret;

    do {
        /* For writes, we do expect the socket to be writable.  */
        ret = nbd_wr_sync(fd, buffer, size, false);
    } while (ret == -EAGAIN);
    return ret;
}
186

    
187
/*
 * Format @address and @port as "host:port" into @buf (at most @len bytes,
 * always NUL-terminated when @len > 0).  An address that contains a colon
 * is taken to be an IPv6 literal and is wrapped in square brackets.
 */
static void combine_addr(char *buf, size_t len, const char* address,
                         uint16_t port)
{
    if (strchr(address, ':') == NULL) {
        snprintf(buf, len, "%s:%u", address, port);
        return;
    }

    snprintf(buf, len, "[%s]:%u", address, port);
}
197

    
198
/*
 * Connect to @address:@port.  Returns whatever
 * tcp_socket_outgoing_spec() returns for the combined "host:port" string.
 */
int tcp_socket_outgoing(const char *address, uint16_t port)
{
    char spec[128];

    combine_addr(spec, sizeof(spec), address, port);
    return tcp_socket_outgoing_spec(spec);
}
204

    
205
/*
 * Connect to the endpoint described by @address_and_port ("host:port").
 * Returns the result of inet_connect().
 */
int tcp_socket_outgoing_spec(const char *address_and_port)
{
    int fd = inet_connect(address_and_port, true, NULL, NULL);

    return fd;
}
209

    
210
/*
 * Listen on @address:@port.  Returns whatever
 * tcp_socket_incoming_spec() returns for the combined "host:port" string.
 */
int tcp_socket_incoming(const char *address, uint16_t port)
{
    char spec[128];

    combine_addr(spec, sizeof(spec), address, port);
    return tcp_socket_incoming_spec(spec);
}
216

    
217
/*
 * Create a listening TCP socket for @address_and_port ("host:port").
 * No output buffer is passed to inet_listen() for the bound address.
 * Returns the result of inet_listen().
 */
int tcp_socket_incoming_spec(const char *address_and_port)
{
    return inet_listen(address_and_port, NULL, 0, SOCK_STREAM, 0, NULL);
}
223

    
224
/*
 * Create a listening UNIX-domain socket at @path.  No output buffer is
 * passed to unix_listen().  Returns the result of unix_listen().
 */
int unix_socket_incoming(const char *path)
{
    return unix_listen(path, NULL, 0);
}
231

    
232
/*
 * Connect to the UNIX-domain socket at @path.
 * Returns the result of unix_connect().
 */
int unix_socket_outgoing(const char *path)
{
    int fd = unix_connect(path);

    return fd;
}
236

    
237
/* Basic flow
238

239
   Server         Client
240

241
   Negotiate
242
                  Request
243
   Response
244
                  Request
245
   Response
246
                  ...
247
   ...
248
                  Request (type == 2)
249
*/
250

    
251
/*
 * Send the server greeting to @client.
 *
 * The socket is switched to blocking mode for the write and back to
 * non-blocking before returning.  Returns 0 on success, -EINVAL if the
 * greeting could not be written in full.
 */
static int nbd_send_negotiate(NBDClient *client)
{
    /* Greeting layout:
        [ 0 ..   7]   passwd   ("NBDMAGIC")
        [ 8 ..  15]   magic    (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  27]   flags
        [28 .. 151]   reserved (0)
     */
    char greeting[8 + 8 + 8 + 128];
    int csock = client->sock;
    uint32_t advertised;
    int rc;

    socket_set_block(csock);

    TRACE("Beginning negotiation.");

    /* Zero everything first so the reserved tail needs no separate pass. */
    memset(greeting, 0, sizeof(greeting));
    memcpy(greeting, "NBDMAGIC", 8);
    cpu_to_be64w((uint64_t*)(greeting + 8), NBD_CLIENT_MAGIC);
    cpu_to_be64w((uint64_t*)(greeting + 16), client->exp->size);

    advertised = client->exp->nbdflags | NBD_FLAG_HAS_FLAGS |
                 NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_FLUSH |
                 NBD_FLAG_SEND_FUA;
    cpu_to_be32w((uint32_t*)(greeting + 24), advertised);

    if (write_sync(csock, greeting, sizeof(greeting)) == sizeof(greeting)) {
        TRACE("Negotiation succeeded.");
        rc = 0;
    } else {
        LOG("write failed");
        rc = -EINVAL;
    }

    socket_set_nonblock(csock);
    return rc;
}
288

    
289
int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
290
                          off_t *size, size_t *blocksize)
291
{
292
    char buf[256];
293
    uint64_t magic, s;
294
    uint16_t tmp;
295
    int rc;
296

    
297
    TRACE("Receiving negotiation.");
298

    
299
    socket_set_block(csock);
300
    rc = -EINVAL;
301

    
302
    if (read_sync(csock, buf, 8) != 8) {
303
        LOG("read failed");
304
        goto fail;
305
    }
306

    
307
    buf[8] = '\0';
308
    if (strlen(buf) == 0) {
309
        LOG("server connection closed");
310
        goto fail;
311
    }
312

    
313
    TRACE("Magic is %c%c%c%c%c%c%c%c",
314
          qemu_isprint(buf[0]) ? buf[0] : '.',
315
          qemu_isprint(buf[1]) ? buf[1] : '.',
316
          qemu_isprint(buf[2]) ? buf[2] : '.',
317
          qemu_isprint(buf[3]) ? buf[3] : '.',
318
          qemu_isprint(buf[4]) ? buf[4] : '.',
319
          qemu_isprint(buf[5]) ? buf[5] : '.',
320
          qemu_isprint(buf[6]) ? buf[6] : '.',
321
          qemu_isprint(buf[7]) ? buf[7] : '.');
322

    
323
    if (memcmp(buf, "NBDMAGIC", 8) != 0) {
324
        LOG("Invalid magic received");
325
        goto fail;
326
    }
327

    
328
    if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
329
        LOG("read failed");
330
        goto fail;
331
    }
332
    magic = be64_to_cpu(magic);
333
    TRACE("Magic is 0x%" PRIx64, magic);
334

    
335
    if (name) {
336
        uint32_t reserved = 0;
337
        uint32_t opt;
338
        uint32_t namesize;
339

    
340
        TRACE("Checking magic (opts_magic)");
341
        if (magic != NBD_OPTS_MAGIC) {
342
            LOG("Bad magic received");
343
            goto fail;
344
        }
345
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
346
            LOG("flags read failed");
347
            goto fail;
348
        }
349
        *flags = be16_to_cpu(tmp) << 16;
350
        /* reserved for future use */
351
        if (write_sync(csock, &reserved, sizeof(reserved)) !=
352
            sizeof(reserved)) {
353
            LOG("write failed (reserved)");
354
            goto fail;
355
        }
356
        /* write the export name */
357
        magic = cpu_to_be64(magic);
358
        if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
359
            LOG("write failed (magic)");
360
            goto fail;
361
        }
362
        opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
363
        if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
364
            LOG("write failed (opt)");
365
            goto fail;
366
        }
367
        namesize = cpu_to_be32(strlen(name));
368
        if (write_sync(csock, &namesize, sizeof(namesize)) !=
369
            sizeof(namesize)) {
370
            LOG("write failed (namesize)");
371
            goto fail;
372
        }
373
        if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
374
            LOG("write failed (name)");
375
            goto fail;
376
        }
377
    } else {
378
        TRACE("Checking magic (cli_magic)");
379

    
380
        if (magic != NBD_CLIENT_MAGIC) {
381
            LOG("Bad magic received");
382
            goto fail;
383
        }
384
    }
385

    
386
    if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
387
        LOG("read failed");
388
        goto fail;
389
    }
390
    *size = be64_to_cpu(s);
391
    *blocksize = 1024;
392
    TRACE("Size is %" PRIu64, *size);
393

    
394
    if (!name) {
395
        if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
396
            LOG("read failed (flags)");
397
            goto fail;
398
        }
399
        *flags = be32_to_cpup(flags);
400
    } else {
401
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
402
            LOG("read failed (tmp)");
403
            goto fail;
404
        }
405
        *flags |= be32_to_cpu(tmp);
406
    }
407
    if (read_sync(csock, &buf, 124) != 124) {
408
        LOG("read failed (buf)");
409
        goto fail;
410
    }
411
    rc = 0;
412

    
413
fail:
414
    socket_set_nonblock(csock);
415
    return rc;
416
}
417

    
418
#ifdef __linux__
419
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
420
{
421
    TRACE("Setting NBD socket");
422

    
423
    if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
424
        int serrno = errno;
425
        LOG("Failed to set NBD socket");
426
        return -serrno;
427
    }
428

    
429
    TRACE("Setting block size to %lu", (unsigned long)blocksize);
430

    
431
    if (ioctl(fd, NBD_SET_BLKSIZE, blocksize) < 0) {
432
        int serrno = errno;
433
        LOG("Failed setting NBD block size");
434
        return -serrno;
435
    }
436

    
437
        TRACE("Setting size to %zd block(s)", (size_t)(size / blocksize));
438

    
439
    if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / blocksize) < 0) {
440
        int serrno = errno;
441
        LOG("Failed setting size (in blocks)");
442
        return -serrno;
443
    }
444

    
445
    if (flags & NBD_FLAG_READ_ONLY) {
446
        int read_only = 1;
447
        TRACE("Setting readonly attribute");
448

    
449
        if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
450
            int serrno = errno;
451
            LOG("Failed setting read-only attribute");
452
            return -serrno;
453
        }
454
    }
455

    
456
    if (ioctl(fd, NBD_SET_FLAGS, flags) < 0
457
        && errno != ENOTTY) {
458
        int serrno = errno;
459
        LOG("Failed setting flags");
460
        return -serrno;
461
    }
462

    
463
    TRACE("Negotiation ended");
464

    
465
    return 0;
466
}
467

    
468
int nbd_disconnect(int fd)
469
{
470
    ioctl(fd, NBD_CLEAR_QUE);
471
    ioctl(fd, NBD_DISCONNECT);
472
    ioctl(fd, NBD_CLEAR_SOCK);
473
    return 0;
474
}
475

    
476
int nbd_client(int fd)
477
{
478
    int ret;
479
    int serrno;
480

    
481
    TRACE("Doing NBD loop");
482

    
483
    ret = ioctl(fd, NBD_DO_IT);
484
    if (ret < 0 && errno == EPIPE) {
485
        /* NBD_DO_IT normally returns EPIPE when someone has disconnected
486
         * the socket via NBD_DISCONNECT.  We do not want to return 1 in
487
         * that case.
488
         */
489
        ret = 0;
490
    }
491
    serrno = errno;
492

    
493
    TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
494

    
495
    TRACE("Clearing NBD queue");
496
    ioctl(fd, NBD_CLEAR_QUE);
497

    
498
    TRACE("Clearing NBD socket");
499
    ioctl(fd, NBD_CLEAR_SOCK);
500

    
501
    errno = serrno;
502
    return ret;
503
}
504
#else
505
/*
 * Stubs for builds where __linux__ is not defined: each of these entry
 * points unconditionally returns -ENOTSUP.
 */
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
{
    return -ENOTSUP;
}

/* Not supported on this platform; always returns -ENOTSUP. */
int nbd_disconnect(int fd)
{
    return -ENOTSUP;
}

/* Not supported on this platform; always returns -ENOTSUP. */
int nbd_client(int fd)
{
    return -ENOTSUP;
}
519
#endif
520

    
521
ssize_t nbd_send_request(int csock, struct nbd_request *request)
522
{
523
    uint8_t buf[NBD_REQUEST_SIZE];
524
    ssize_t ret;
525

    
526
    cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
527
    cpu_to_be32w((uint32_t*)(buf + 4), request->type);
528
    cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
529
    cpu_to_be64w((uint64_t*)(buf + 16), request->from);
530
    cpu_to_be32w((uint32_t*)(buf + 24), request->len);
531

    
532
    TRACE("Sending request to client: "
533
          "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
534
          request->from, request->len, request->handle, request->type);
535

    
536
    ret = write_sync(csock, buf, sizeof(buf));
537
    if (ret < 0) {
538
        return ret;
539
    }
540

    
541
    if (ret != sizeof(buf)) {
542
        LOG("writing to socket failed");
543
        return -EINVAL;
544
    }
545
    return 0;
546
}
547

    
548
static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
549
{
550
    uint8_t buf[NBD_REQUEST_SIZE];
551
    uint32_t magic;
552
    ssize_t ret;
553

    
554
    ret = read_sync(csock, buf, sizeof(buf));
555
    if (ret < 0) {
556
        return ret;
557
    }
558

    
559
    if (ret != sizeof(buf)) {
560
        LOG("read failed");
561
        return -EINVAL;
562
    }
563

    
564
    /* Request
565
       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
566
       [ 4 ..  7]   type    (0 == READ, 1 == WRITE)
567
       [ 8 .. 15]   handle
568
       [16 .. 23]   from
569
       [24 .. 27]   len
570
     */
571

    
572
    magic = be32_to_cpup((uint32_t*)buf);
573
    request->type  = be32_to_cpup((uint32_t*)(buf + 4));
574
    request->handle = be64_to_cpup((uint64_t*)(buf + 8));
575
    request->from  = be64_to_cpup((uint64_t*)(buf + 16));
576
    request->len   = be32_to_cpup((uint32_t*)(buf + 24));
577

    
578
    TRACE("Got request: "
579
          "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
580
          magic, request->type, request->from, request->len);
581

    
582
    if (magic != NBD_REQUEST_MAGIC) {
583
        LOG("invalid magic (got 0x%x)", magic);
584
        return -EINVAL;
585
    }
586
    return 0;
587
}
588

    
589
ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
590
{
591
    uint8_t buf[NBD_REPLY_SIZE];
592
    uint32_t magic;
593
    ssize_t ret;
594

    
595
    ret = read_sync(csock, buf, sizeof(buf));
596
    if (ret < 0) {
597
        return ret;
598
    }
599

    
600
    if (ret != sizeof(buf)) {
601
        LOG("read failed");
602
        return -EINVAL;
603
    }
604

    
605
    /* Reply
606
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
607
       [ 4 ..  7]    error   (0 == no error)
608
       [ 7 .. 15]    handle
609
     */
610

    
611
    magic = be32_to_cpup((uint32_t*)buf);
612
    reply->error  = be32_to_cpup((uint32_t*)(buf + 4));
613
    reply->handle = be64_to_cpup((uint64_t*)(buf + 8));
614

    
615
    TRACE("Got reply: "
616
          "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
617
          magic, reply->error, reply->handle);
618

    
619
    if (magic != NBD_REPLY_MAGIC) {
620
        LOG("invalid magic (got 0x%x)", magic);
621
        return -EINVAL;
622
    }
623
    return 0;
624
}
625

    
626
static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
627
{
628
    uint8_t buf[NBD_REPLY_SIZE];
629
    ssize_t ret;
630

    
631
    /* Reply
632
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
633
       [ 4 ..  7]    error   (0 == no error)
634
       [ 7 .. 15]    handle
635
     */
636
    cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
637
    cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
638
    cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);
639

    
640
    TRACE("Sending response to client");
641

    
642
    ret = write_sync(csock, buf, sizeof(buf));
643
    if (ret < 0) {
644
        return ret;
645
    }
646

    
647
    if (ret != sizeof(buf)) {
648
        LOG("writing to socket failed");
649
        return -EINVAL;
650
    }
651
    return 0;
652
}
653

    
654
#define MAX_NBD_REQUESTS 16
655

    
656
/* Take an additional reference on @client. */
void nbd_client_get(NBDClient *client)
{
    client->refcount += 1;
}
660

    
661
/*
 * Drop one reference on @client.  When the last reference goes away the
 * fd handlers are removed, the socket is closed, the client is unlinked
 * from its export (releasing the export reference it held) and freed.
 */
void nbd_client_put(NBDClient *client)
{
    client->refcount--;
    if (client->refcount != 0) {
        return;
    }

    /* The last reference should be dropped by client->close,
     * which is called by nbd_client_close.
     */
    assert(client->closing);

    qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
    close(client->sock);
    client->sock = -1;

    QTAILQ_REMOVE(&client->exp->clients, client, next);
    nbd_export_put(client->exp);
    g_free(client);
}
677

    
678
/*
 * Begin closing @client.  Calling it again on a client that is already
 * closing does nothing.
 *
 * The socket is shut down in both directions so that in-flight requests
 * finish and drop their references; the socket is actually closed and the
 * NBDClient freed by nbd_client_put() once the last reference is gone.
 */
void nbd_client_close(NBDClient *client)
{
    if (!client->closing) {
        client->closing = true;

        /* Force requests to finish; they release their own references. */
        shutdown(client->sock, 2);

        /* Notify the owner so that it releases its reference too. */
        if (client->close) {
            client->close(client);
        }
    }
}
696

    
697
/*
 * Obtain a request object for @client, reusing one from the export's pool
 * when available and allocating a new one (with an NBD_BUFFER_SIZE data
 * buffer) otherwise.  Bumps the client's in-flight counter and takes a
 * client reference that nbd_request_put() releases.
 */
static NBDRequest *nbd_request_get(NBDClient *client)
{
    NBDExport *exp = client->exp;
    NBDRequest *slot;

    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
    client->nb_requests++;

    if (!QSIMPLEQ_EMPTY(&exp->requests)) {
        slot = QSIMPLEQ_FIRST(&exp->requests);
        QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
    } else {
        slot = g_malloc0(sizeof(NBDRequest));
        slot->data = qemu_blockalign(exp->bs, NBD_BUFFER_SIZE);
    }

    nbd_client_get(client);
    slot->client = client;
    return slot;
}
716

    
717
/*
 * Return @req to its export's pool and release the client reference taken
 * by nbd_request_get().  If the client was at MAX_NBD_REQUESTS before the
 * decrement, qemu_notify_event() is called.
 */
static void nbd_request_put(NBDRequest *req)
{
    NBDClient *client = req->client;
    bool was_saturated;

    QSIMPLEQ_INSERT_HEAD(&client->exp->requests, req, entry);

    was_saturated = (client->nb_requests == MAX_NBD_REQUESTS);
    client->nb_requests--;
    if (was_saturated) {
        qemu_notify_event();
    }

    nbd_client_put(client);
}
726

    
727
/*
 * Create an export for block device @bs.
 *
 * @dev_offset: byte offset into @bs at which the export starts.
 * @size:       export size in bytes, or -1 to use bdrv_getlength(@bs).
 * @nbdflags:   NBD_FLAG_* bits to advertise.
 * @close:      optional callback run when the last reference is dropped.
 *
 * The returned export starts with a reference count of 1.
 */
NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
                          off_t size, uint32_t nbdflags,
                          void (*close)(NBDExport *))
{
    NBDExport *exp = g_malloc0(sizeof(NBDExport));

    exp->refcount = 1;
    exp->close = close;
    exp->bs = bs;
    exp->dev_offset = dev_offset;
    exp->nbdflags = nbdflags;
    if (size == -1) {
        exp->size = bdrv_getlength(bs);
    } else {
        exp->size = size;
    }

    QSIMPLEQ_INIT(&exp->requests);
    QTAILQ_INIT(&exp->clients);
    return exp;
}
742

    
743
/*
 * Start closing every client attached to @exp.  A temporary reference is
 * held on the export for the duration of the walk.
 */
void nbd_export_close(NBDExport *exp)
{
    NBDClient *cur;
    NBDClient *following;

    nbd_export_get(exp);
    QTAILQ_FOREACH_SAFE(cur, &exp->clients, next, following) {
        nbd_client_close(cur);
    }
    nbd_export_put(exp);
}
753

    
754
/* Take an additional reference on @exp, which must still be alive. */
void nbd_export_get(NBDExport *exp)
{
    assert(exp->refcount > 0);
    exp->refcount += 1;
}
759

    
760
/*
 * Drop one reference on @exp.
 *
 * If this is the last reference, attached clients are closed first.  Once
 * the count reaches zero the close callback (if any) is invoked, pooled
 * requests and their data buffers are released, and the export is freed.
 */
void nbd_export_put(NBDExport *exp)
{
    assert(exp->refcount > 0);

    if (exp->refcount == 1) {
        nbd_export_close(exp);
    }

    exp->refcount--;
    if (exp->refcount != 0) {
        return;
    }

    if (exp->close) {
        exp->close(exp);
    }

    /* Drain the pool of recycled request objects. */
    while (!QSIMPLEQ_EMPTY(&exp->requests)) {
        NBDRequest *pooled = QSIMPLEQ_FIRST(&exp->requests);

        QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
        qemu_vfree(pooled->data);
        g_free(pooled);
    }

    g_free(exp);
}
782

    
783
static int nbd_can_read(void *opaque);
784
static void nbd_read(void *opaque);
785
static void nbd_restart_write(void *opaque);
786

    
787
/*
 * Send @reply for @req, followed by @len bytes of req->data when @len is
 * non-zero.  The whole send runs under client->send_lock, with the write
 * handler installed so that nbd_restart_write() can re-enter this
 * coroutine.  When a payload follows, the socket is corked around the two
 * writes.
 *
 * Returns the result of nbd_send_reply(), or -EIO if the header went out
 * but the payload was not sent in full.
 */
static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
                                 int len)
{
    NBDClient *client = req->client;
    int csock = client->sock;
    const bool has_payload = (len != 0);
    ssize_t rc;

    qemu_co_mutex_lock(&client->send_lock);
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
                         nbd_restart_write, client);
    client->send_coroutine = qemu_coroutine_self();

    if (has_payload) {
        socket_set_cork(csock, 1);
    }

    rc = nbd_send_reply(csock, reply);

    if (has_payload) {
        if (rc >= 0) {
            ssize_t sent = qemu_co_send(csock, req->data, len);

            if (sent != len) {
                rc = -EIO;
            }
        }
        socket_set_cork(csock, 0);
    }

    client->send_coroutine = NULL;
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
    qemu_co_mutex_unlock(&client->send_lock);
    return rc;
}
818

    
819
/*
 * Receive one request (header plus, for writes, the payload) for @req.
 * client->recv_coroutine points at the current coroutine for the duration
 * of the call.
 *
 * Returns 0 on success; -EAGAIN if nbd_receive_request() returned it;
 * -EIO for any other header failure or a short payload read; -EINVAL if
 * the length exceeds NBD_BUFFER_SIZE or from + len wraps around.
 */
static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
{
    NBDClient *client = req->client;
    int csock = client->sock;
    ssize_t rc;

    client->recv_coroutine = qemu_coroutine_self();

    rc = nbd_receive_request(csock, request);
    if (rc < 0) {
        /* Everything except "try again" is reported as an I/O error. */
        if (rc != -EAGAIN) {
            rc = -EIO;
        }
    } else if (request->len > NBD_BUFFER_SIZE) {
        LOG("len (%u) is larger than max len (%u)",
            request->len, NBD_BUFFER_SIZE);
        rc = -EINVAL;
    } else if ((request->from + request->len) < request->from) {
        LOG("integer overflow detected! "
            "you're probably being attacked");
        rc = -EINVAL;
    } else {
        TRACE("Decoding type");

        rc = 0;
        if ((request->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
            TRACE("Reading %u byte(s)", request->len);

            if (qemu_co_recv(csock, req->data, request->len) != request->len) {
                LOG("reading from socket failed");
                rc = -EIO;
            }
        }
    }

    client->recv_coroutine = NULL;
    return rc;
}
865

    
866
static void nbd_trip(void *opaque)
867
{
868
    NBDClient *client = opaque;
869
    NBDExport *exp = client->exp;
870
    NBDRequest *req;
871
    struct nbd_request request;
872
    struct nbd_reply reply;
873
    ssize_t ret;
874

    
875
    TRACE("Reading request.");
876
    if (client->closing) {
877
        return;
878
    }
879

    
880
    req = nbd_request_get(client);
881
    ret = nbd_co_receive_request(req, &request);
882
    if (ret == -EAGAIN) {
883
        goto done;
884
    }
885
    if (ret == -EIO) {
886
        goto out;
887
    }
888

    
889
    reply.handle = request.handle;
890
    reply.error = 0;
891

    
892
    if (ret < 0) {
893
        reply.error = -ret;
894
        goto error_reply;
895
    }
896

    
897
    if ((request.from + request.len) > exp->size) {
898
            LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
899
            ", Offset: %" PRIu64 "\n",
900
                    request.from, request.len,
901
                    (uint64_t)exp->size, (uint64_t)exp->dev_offset);
902
        LOG("requested operation past EOF--bad client?");
903
        goto invalid_request;
904
    }
905

    
906
    switch (request.type & NBD_CMD_MASK_COMMAND) {
907
    case NBD_CMD_READ:
908
        TRACE("Request type is READ");
909

    
910
        if (request.type & NBD_CMD_FLAG_FUA) {
911
            ret = bdrv_co_flush(exp->bs);
912
            if (ret < 0) {
913
                LOG("flush failed");
914
                reply.error = -ret;
915
                goto error_reply;
916
            }
917
        }
918

    
919
        ret = bdrv_read(exp->bs, (request.from + exp->dev_offset) / 512,
920
                        req->data, request.len / 512);
921
        if (ret < 0) {
922
            LOG("reading from file failed");
923
            reply.error = -ret;
924
            goto error_reply;
925
        }
926

    
927
        TRACE("Read %u byte(s)", request.len);
928
        if (nbd_co_send_reply(req, &reply, request.len) < 0)
929
            goto out;
930
        break;
931
    case NBD_CMD_WRITE:
932
        TRACE("Request type is WRITE");
933

    
934
        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
935
            TRACE("Server is read-only, return error");
936
            reply.error = EROFS;
937
            goto error_reply;
938
        }
939

    
940
        TRACE("Writing to device");
941

    
942
        ret = bdrv_write(exp->bs, (request.from + exp->dev_offset) / 512,
943
                         req->data, request.len / 512);
944
        if (ret < 0) {
945
            LOG("writing to file failed");
946
            reply.error = -ret;
947
            goto error_reply;
948
        }
949

    
950
        if (request.type & NBD_CMD_FLAG_FUA) {
951
            ret = bdrv_co_flush(exp->bs);
952
            if (ret < 0) {
953
                LOG("flush failed");
954
                reply.error = -ret;
955
                goto error_reply;
956
            }
957
        }
958

    
959
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
960
            goto out;
961
        }
962
        break;
963
    case NBD_CMD_DISC:
964
        TRACE("Request type is DISCONNECT");
965
        errno = 0;
966
        goto out;
967
    case NBD_CMD_FLUSH:
968
        TRACE("Request type is FLUSH");
969

    
970
        ret = bdrv_co_flush(exp->bs);
971
        if (ret < 0) {
972
            LOG("flush failed");
973
            reply.error = -ret;
974
        }
975
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
976
            goto out;
977
        }
978
        break;
979
    case NBD_CMD_TRIM:
980
        TRACE("Request type is TRIM");
981
        ret = bdrv_co_discard(exp->bs, (request.from + exp->dev_offset) / 512,
982
                              request.len / 512);
983
        if (ret < 0) {
984
            LOG("discard failed");
985
            reply.error = -ret;
986
        }
987
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
988
            goto out;
989
        }
990
        break;
991
    default:
992
        LOG("invalid request type (%u) received", request.type);
993
    invalid_request:
994
        reply.error = -EINVAL;
995
    error_reply:
996
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
997
            goto out;
998
        }
999
        break;
1000
    }
1001

    
1002
    TRACE("Request/Reply complete");
1003

    
1004
done:
1005
    nbd_request_put(req);
1006
    return;
1007

    
1008
out:
1009
    nbd_request_put(req);
1010
    nbd_client_close(client);
1011
}
1012

    
1013
static int nbd_can_read(void *opaque)
1014
{
1015
    NBDClient *client = opaque;
1016

    
1017
    return client->recv_coroutine || client->nb_requests < MAX_NBD_REQUESTS;
1018
}
1019

    
1020
static void nbd_read(void *opaque)
1021
{
1022
    NBDClient *client = opaque;
1023

    
1024
    if (client->recv_coroutine) {
1025
        qemu_coroutine_enter(client->recv_coroutine, NULL);
1026
    } else {
1027
        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1028
    }
1029
}
1030

    
1031
static void nbd_restart_write(void *opaque)
1032
{
1033
    NBDClient *client = opaque;
1034

    
1035
    qemu_coroutine_enter(client->send_coroutine, NULL);
1036
}
1037

    
1038
/*
 * Create a client for @exp on the connected socket @csock and send it the
 * greeting.
 *
 * @close: optional callback invoked by nbd_client_close().
 *
 * Returns the new client (reference count 1, linked into @exp's client
 * list, holding a reference on @exp), or NULL if negotiation failed.  On
 * failure @csock is not closed here.
 */
NBDClient *nbd_client_new(NBDExport *exp, int csock,
                          void (*close)(NBDClient *))
{
    NBDClient *client = g_malloc0(sizeof(NBDClient));

    client->refcount = 1;
    client->exp = exp;
    client->sock = csock;

    if (nbd_send_negotiate(client) < 0) {
        g_free(client);
        return NULL;
    }

    client->close = close;
    qemu_co_mutex_init(&client->send_lock);
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);

    QTAILQ_INSERT_TAIL(&exp->clients, client, next);
    nbd_export_get(exp);
    return client;
}