Statistics
| Branch: | Revision:

root / nbd.c @ 5a37532d

History | View | Annotate | Download (30.8 kB)

1
/*
2
 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
3
 *
4
 *  Network Block Device
5
 *
6
 *  This program is free software; you can redistribute it and/or modify
7
 *  it under the terms of the GNU General Public License as published by
8
 *  the Free Software Foundation; under version 2 of the License.
9
 *
10
 *  This program is distributed in the hope that it will be useful,
11
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 *  GNU General Public License for more details.
14
 *
15
 *  You should have received a copy of the GNU General Public License
16
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
17
 */
18

    
19
#include "block/nbd.h"
20
#include "block/block.h"
21

    
22
#include "block/coroutine.h"
23

    
24
#include <errno.h>
25
#include <string.h>
26
#ifndef _WIN32
27
#include <sys/ioctl.h>
28
#endif
29
#if defined(__sun__) || defined(__HAIKU__)
30
#include <sys/ioccom.h>
31
#endif
32
#include <ctype.h>
33
#include <inttypes.h>
34

    
35
#ifdef __linux__
36
#include <linux/fs.h>
37
#endif
38

    
39
#include "qemu/sockets.h"
40
#include "qemu/queue.h"
41

    
42
//#define DEBUG_NBD
43

    
44
/*
 * LOG() always prints to stderr, prefixing the message with the source file,
 * function name and line number and appending a newline.
 */
#define LOG(msg, ...) do { \
    fprintf(stderr, "%s:%s():L%d: " msg "\n", \
            __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__); \
} while(0)

/*
 * TRACE() forwards to LOG() only when DEBUG_NBD is defined; otherwise it
 * expands to an empty statement and its arguments are not evaluated.
 */
#ifndef DEBUG_NBD
#define TRACE(msg, ...) \
    do { } while (0)
#else
#define TRACE(msg, ...) do { \
    LOG(msg, ## __VA_ARGS__); \
} while(0)
#endif
57

    
58
/* This is all part of the "official" NBD API */
59

    
60
/* Wire sizes: request = magic + type + handle + from + len,
 * reply = magic + error + handle. */
#define NBD_REQUEST_SIZE        (4 + 4 + 8 + 8 + 4)
#define NBD_REPLY_SIZE          (4 + 4 + 8)

/* Magic numbers carried in requests, replies and the negotiation header. */
#define NBD_REQUEST_MAGIC       0x25609513
#define NBD_REPLY_MAGIC         0x67446698
#define NBD_OPTS_MAGIC          0x49484156454F5054LL
#define NBD_CLIENT_MAGIC        0x0000420281861253LL

/* ioctl request codes (type 0xab) issued on the NBD device descriptor. */
#define NBD_SET_SOCK            _IO(0xab, 0)
#define NBD_SET_BLKSIZE         _IO(0xab, 1)
#define NBD_SET_SIZE            _IO(0xab, 2)
#define NBD_DO_IT               _IO(0xab, 3)
#define NBD_CLEAR_SOCK          _IO(0xab, 4)
#define NBD_CLEAR_QUE           _IO(0xab, 5)
#define NBD_PRINT_DEBUG         _IO(0xab, 6)
#define NBD_SET_SIZE_BLOCKS     _IO(0xab, 7)
#define NBD_DISCONNECT          _IO(0xab, 8)
#define NBD_SET_TIMEOUT         _IO(0xab, 9)
#define NBD_SET_FLAGS           _IO(0xab, 10)

/* Option code sent by the client to select an export by name. */
#define NBD_OPT_EXPORT_NAME     (1 << 0)
80

    
81
/* Definitions for opaque data types */
82

    
83
typedef struct NBDRequest NBDRequest;
84

    
85
struct NBDRequest {
86
    QSIMPLEQ_ENTRY(NBDRequest) entry;
87
    NBDClient *client;
88
    uint8_t *data;
89
};
90

    
91
struct NBDExport {
92
    int refcount;
93
    void (*close)(NBDExport *exp);
94

    
95
    BlockDriverState *bs;
96
    char *name;
97
    off_t dev_offset;
98
    off_t size;
99
    uint32_t nbdflags;
100
    QTAILQ_HEAD(, NBDClient) clients;
101
    QSIMPLEQ_HEAD(, NBDRequest) requests;
102
    QTAILQ_ENTRY(NBDExport) next;
103
};
104

    
105
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
106

    
107
struct NBDClient {
108
    int refcount;
109
    void (*close)(NBDClient *client);
110

    
111
    NBDExport *exp;
112
    int sock;
113

    
114
    Coroutine *recv_coroutine;
115

    
116
    CoMutex send_lock;
117
    Coroutine *send_coroutine;
118

    
119
    QTAILQ_ENTRY(NBDClient) next;
120
    int nb_requests;
121
    bool closing;
122
};
123

    
124
/* That's all folks */
125

    
126
/*
 * Read (do_read == true) or write (do_read == false) up to @size bytes
 * between @fd and @buffer.
 *
 * Inside a coroutine the transfer is delegated to qemu_co_recv() or
 * qemu_co_send().  Otherwise the function loops until @size bytes have been
 * moved, the peer reports end of file, or an unrecoverable error occurs.
 *
 * Returns the number of bytes transferred (short on EOF), or a negative
 * errno value on an unrecoverable error.
 */
ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
{
    size_t done = 0;

    if (qemu_in_coroutine()) {
        if (!do_read) {
            return qemu_co_send(fd, buffer, size);
        }
        return qemu_co_recv(fd, buffer, size);
    }

    while (done < size) {
        ssize_t n;
        int err;

        n = do_read ? qemu_recv(fd, buffer + done, size - done, 0)
                    : send(fd, buffer + done, size - done, 0);

        if (n > 0) {
            done += n;
            continue;
        }

        /* eof */
        if (n == 0) {
            break;
        }

        err = socket_error();

        /* EINTR is always retried; EAGAIN only once some data has moved. */
        if (err == EINTR || (done > 0 && err == EAGAIN)) {
            continue;
        }

        /* unrecoverable error */
        return -err;
    }

    return done;
}
170

    
171
/*
 * Read @size bytes from @fd into @buffer via nbd_wr_sync().
 *
 * During negotiation the socket is in blocking mode.  Afterwards a socket
 * that is not readable just means another thread took our request/reply;
 * recv_coroutine provides the synchronization that keeps this
 * coroutine-safe.
 */
static ssize_t read_sync(int fd, void *buffer, size_t size)
{
    ssize_t nread = nbd_wr_sync(fd, buffer, size, true);

    return nread;
}
180

    
181
/*
 * Write @size bytes from @buffer to @fd via nbd_wr_sync(), retrying for as
 * long as the attempt fails with -EAGAIN.
 *
 * Returns what nbd_wr_sync() returned on the final attempt: the byte count
 * or a negative errno value.
 */
static ssize_t write_sync(int fd, void *buffer, size_t size)
{
    /* ssize_t, not int: keep the full width of nbd_wr_sync()'s result. */
    ssize_t ret;

    do {
        /* For writes, we do expect the socket to be writable.  */
        ret = nbd_wr_sync(fd, buffer, size, false);
    } while (ret == -EAGAIN);
    return ret;
}
190

    
191
/*
 * Format "address:port" into @buf (at most @len bytes, NUL-terminated by
 * snprintf).  An address containing a colon is treated as an IPv6 literal
 * and wrapped in square brackets.
 */
static void combine_addr(char *buf, size_t len, const char* address,
                         uint16_t port)
{
    if (strchr(address, ':') == NULL) {
        snprintf(buf, len, "%s:%u", address, port);
        return;
    }

    /* IPv6 literal: needs [] so the port separator is unambiguous. */
    snprintf(buf, len, "[%s]:%u", address, port);
}
201

    
202
/*
 * Connect using inet_connect_opts() with the given options.  Any error it
 * sets is reported through qerror_report_err() and freed.
 *
 * Returns whatever inet_connect_opts() returned.
 */
int tcp_socket_outgoing_opts(QemuOpts *opts)
{
    Error *err = NULL;
    int sock;

    sock = inet_connect_opts(opts, &err, NULL, NULL);
    if (err != NULL) {
        qerror_report_err(err);
        error_free(err);
    }

    return sock;
}
213

    
214
/*
 * Build an "address:port" string with combine_addr() and hand it to
 * tcp_socket_incoming_spec(), whose result is returned.
 */
int tcp_socket_incoming(const char *address, uint16_t port)
{
    char spec[128];

    combine_addr(spec, sizeof(spec), address, port);
    return tcp_socket_incoming_spec(spec);
}
220

    
221
int tcp_socket_incoming_spec(const char *address_and_port)
222
{
223
    Error *local_err = NULL;
224
    int fd = inet_listen(address_and_port, NULL, 0, SOCK_STREAM, 0, &local_err);
225

    
226
    if (local_err != NULL) {
227
        qerror_report_err(local_err);
228
        error_free(local_err);
229
    }
230
    return fd;
231
}
232

    
233
int unix_socket_incoming(const char *path)
234
{
235
    Error *local_err = NULL;
236
    int fd = unix_listen(path, NULL, 0, &local_err);
237

    
238
    if (local_err != NULL) {
239
        qerror_report_err(local_err);
240
        error_free(local_err);
241
    }
242
    return fd;
243
}
244

    
245
int unix_socket_outgoing(const char *path)
246
{
247
    Error *local_err = NULL;
248
    int fd = unix_connect(path, &local_err);
249

    
250
    if (local_err != NULL) {
251
        qerror_report_err(local_err);
252
        error_free(local_err);
253
    }
254
    return fd;
255
}
256

    
257
/* Basic flow for negotiation

   Server         Client
   Negotiate

   or

   Server         Client
   Negotiate #1
                  Option
   Negotiate #2

   ----

   followed by

   Server         Client
                  Request
   Response
                  Request
   Response
                  ...
   ...
                  Request (type == 2)

*/
283

    
284
/*
 * Server side of the option exchange in the negotiation with options.
 *
 * Reads the client's option block from client->sock, validates the reserved
 * word, NBD_OPTS_MAGIC and NBD_OPT_EXPORT_NAME, reads an export name of at
 * most 255 bytes and looks it up with nbd_export_find().  On success the
 * client is appended to the export's client list, a reference is taken on
 * the export and 0 is returned.  A failed read, a malformed field or an
 * unknown export name returns -EINVAL.
 */
static int nbd_receive_options(NBDClient *client)
{
    int csock = client->sock;
    char name[256];
    uint32_t tmp, length;
    uint64_t magic;
    int rc;

    /* Client sends:
        [ 0 ..   3]   reserved (0)
        [ 4 ..  11]   NBD_OPTS_MAGIC
        [12 ..  15]   NBD_OPT_EXPORT_NAME
        [16 ..  19]   length
        [20 ..  xx]   export name (length bytes)
     */

    rc = -EINVAL;
    if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
        LOG("read failed");
        goto fail;
    }
    TRACE("Checking reserved");
    if (tmp != 0) {
        LOG("Bad reserved received");
        goto fail;
    }

    if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
        LOG("read failed");
        goto fail;
    }
    /* This trace used to repeat "Checking reserved" (copy-paste slip). */
    TRACE("Checking opts magic");
    /* The value is still in wire order; swapping the constant is equivalent. */
    if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
        LOG("Bad magic received");
        goto fail;
    }

    if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
        LOG("read failed");
        goto fail;
    }
    TRACE("Checking option");
    if (tmp != be32_to_cpu(NBD_OPT_EXPORT_NAME)) {
        LOG("Bad option received");
        goto fail;
    }

    if (read_sync(csock, &length, sizeof(length)) != sizeof(length)) {
        LOG("read failed");
        goto fail;
    }
    TRACE("Checking length");
    length = be32_to_cpu(length);
    /* name[] holds 255 characters plus the terminating NUL. */
    if (length > 255) {
        LOG("Bad length received");
        goto fail;
    }
    /* length <= 255 here, so the cast to ssize_t cannot change its value. */
    if (read_sync(csock, name, length) != (ssize_t)length) {
        LOG("read failed");
        goto fail;
    }
    name[length] = '\0';

    client->exp = nbd_export_find(name);
    if (!client->exp) {
        LOG("export not found");
        goto fail;
    }

    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    nbd_export_get(client->exp);

    TRACE("Option negotiation succeeded.");
    rc = 0;
fail:
    return rc;
}
361

    
362
/*
 * Server side of the negotiation.
 *
 * If the client already has an export (client->exp != NULL) the complete
 * header without options is sent in one piece.  Otherwise part 1 of the
 * header with options is sent, the client's options are read with
 * nbd_receive_options() (which selects the export), and part 2 follows.
 *
 * The socket is switched to blocking mode for the exchange and back to
 * non-blocking mode before returning.
 *
 * Returns 0 on success and a negative errno value on failure.
 */
static int nbd_send_negotiate(NBDClient *client)
{
    int csock = client->sock;
    char buf[8 + 8 + 8 + 128];
    int rc;
    const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
                         NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);

    /* Negotiation header without options:
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  25]   server flags (0)
        [26 ..  27]   export flags
        [28 .. 151]   reserved     (0)

       Negotiation header with options, part 1:
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
        [16 ..  17]   server flags (0)

       part 2 (after options are sent):
        [18 ..  25]   size
        [26 ..  27]   export flags
        [28 .. 151]   reserved     (0)
     */

    qemu_set_block(csock);
    rc = -EINVAL;

    TRACE("Beginning negotiation.");
    memset(buf, 0, sizeof(buf));
    memcpy(buf, "NBDMAGIC", 8);

    if (client->exp) {
        /* Export flags occupy 16 bits on the wire. */
        assert ((client->exp->nbdflags & ~65535) == 0);
        cpu_to_be64w((uint64_t*)(buf + 8), NBD_CLIENT_MAGIC);
        cpu_to_be64w((uint64_t*)(buf + 16), client->exp->size);
        cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);

        if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
            LOG("write failed");
            goto fail;
        }
    } else {
        cpu_to_be64w((uint64_t*)(buf + 8), NBD_OPTS_MAGIC);

        if (write_sync(csock, buf, 18) != 18) {
            LOG("write failed");
            goto fail;
        }
        rc = nbd_receive_options(client);
        if (rc < 0) {
            LOG("option negotiation failed");
            goto fail;
        }

        /*
         * rc is 0 at this point.  Re-arm the error code so that a failed
         * write of part 2 is not reported to the caller as success.
         */
        rc = -EINVAL;

        assert ((client->exp->nbdflags & ~65535) == 0);
        cpu_to_be64w((uint64_t*)(buf + 18), client->exp->size);
        cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
        if (write_sync(csock, buf + 18, sizeof(buf) - 18) != sizeof(buf) - 18) {
            LOG("write failed");
            goto fail;
        }
    }

    TRACE("Negotiation succeeded.");
    rc = 0;
fail:
    qemu_set_nonblock(csock);
    return rc;
}
435

    
436
int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
437
                          off_t *size, size_t *blocksize)
438
{
439
    char buf[256];
440
    uint64_t magic, s;
441
    uint16_t tmp;
442
    int rc;
443

    
444
    TRACE("Receiving negotiation.");
445

    
446
    qemu_set_block(csock);
447
    rc = -EINVAL;
448

    
449
    if (read_sync(csock, buf, 8) != 8) {
450
        LOG("read failed");
451
        goto fail;
452
    }
453

    
454
    buf[8] = '\0';
455
    if (strlen(buf) == 0) {
456
        LOG("server connection closed");
457
        goto fail;
458
    }
459

    
460
    TRACE("Magic is %c%c%c%c%c%c%c%c",
461
          qemu_isprint(buf[0]) ? buf[0] : '.',
462
          qemu_isprint(buf[1]) ? buf[1] : '.',
463
          qemu_isprint(buf[2]) ? buf[2] : '.',
464
          qemu_isprint(buf[3]) ? buf[3] : '.',
465
          qemu_isprint(buf[4]) ? buf[4] : '.',
466
          qemu_isprint(buf[5]) ? buf[5] : '.',
467
          qemu_isprint(buf[6]) ? buf[6] : '.',
468
          qemu_isprint(buf[7]) ? buf[7] : '.');
469

    
470
    if (memcmp(buf, "NBDMAGIC", 8) != 0) {
471
        LOG("Invalid magic received");
472
        goto fail;
473
    }
474

    
475
    if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
476
        LOG("read failed");
477
        goto fail;
478
    }
479
    magic = be64_to_cpu(magic);
480
    TRACE("Magic is 0x%" PRIx64, magic);
481

    
482
    if (name) {
483
        uint32_t reserved = 0;
484
        uint32_t opt;
485
        uint32_t namesize;
486

    
487
        TRACE("Checking magic (opts_magic)");
488
        if (magic != NBD_OPTS_MAGIC) {
489
            LOG("Bad magic received");
490
            goto fail;
491
        }
492
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
493
            LOG("flags read failed");
494
            goto fail;
495
        }
496
        *flags = be16_to_cpu(tmp) << 16;
497
        /* reserved for future use */
498
        if (write_sync(csock, &reserved, sizeof(reserved)) !=
499
            sizeof(reserved)) {
500
            LOG("write failed (reserved)");
501
            goto fail;
502
        }
503
        /* write the export name */
504
        magic = cpu_to_be64(magic);
505
        if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
506
            LOG("write failed (magic)");
507
            goto fail;
508
        }
509
        opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
510
        if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
511
            LOG("write failed (opt)");
512
            goto fail;
513
        }
514
        namesize = cpu_to_be32(strlen(name));
515
        if (write_sync(csock, &namesize, sizeof(namesize)) !=
516
            sizeof(namesize)) {
517
            LOG("write failed (namesize)");
518
            goto fail;
519
        }
520
        if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
521
            LOG("write failed (name)");
522
            goto fail;
523
        }
524
    } else {
525
        TRACE("Checking magic (cli_magic)");
526

    
527
        if (magic != NBD_CLIENT_MAGIC) {
528
            LOG("Bad magic received");
529
            goto fail;
530
        }
531
    }
532

    
533
    if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
534
        LOG("read failed");
535
        goto fail;
536
    }
537
    *size = be64_to_cpu(s);
538
    *blocksize = 1024;
539
    TRACE("Size is %" PRIu64, *size);
540

    
541
    if (!name) {
542
        if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
543
            LOG("read failed (flags)");
544
            goto fail;
545
        }
546
        *flags = be32_to_cpup(flags);
547
    } else {
548
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
549
            LOG("read failed (tmp)");
550
            goto fail;
551
        }
552
        *flags |= be32_to_cpu(tmp);
553
    }
554
    if (read_sync(csock, &buf, 124) != 124) {
555
        LOG("read failed (buf)");
556
        goto fail;
557
    }
558
    rc = 0;
559

    
560
fail:
561
    qemu_set_nonblock(csock);
562
    return rc;
563
}
564

    
565
#ifdef __linux__
566
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
567
{
568
    TRACE("Setting NBD socket");
569

    
570
    if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
571
        int serrno = errno;
572
        LOG("Failed to set NBD socket");
573
        return -serrno;
574
    }
575

    
576
    TRACE("Setting block size to %lu", (unsigned long)blocksize);
577

    
578
    if (ioctl(fd, NBD_SET_BLKSIZE, blocksize) < 0) {
579
        int serrno = errno;
580
        LOG("Failed setting NBD block size");
581
        return -serrno;
582
    }
583

    
584
        TRACE("Setting size to %zd block(s)", (size_t)(size / blocksize));
585

    
586
    if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / blocksize) < 0) {
587
        int serrno = errno;
588
        LOG("Failed setting size (in blocks)");
589
        return -serrno;
590
    }
591

    
592
    if (ioctl(fd, NBD_SET_FLAGS, flags) < 0) {
593
        if (errno == ENOTTY) {
594
            int read_only = (flags & NBD_FLAG_READ_ONLY) != 0;
595
            TRACE("Setting readonly attribute");
596

    
597
            if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
598
                int serrno = errno;
599
                LOG("Failed setting read-only attribute");
600
                return -serrno;
601
            }
602
        } else {
603
            int serrno = errno;
604
            LOG("Failed setting flags");
605
            return -serrno;
606
        }
607
    }
608

    
609
    TRACE("Negotiation ended");
610

    
611
    return 0;
612
}
613

    
614
int nbd_disconnect(int fd)
615
{
616
    ioctl(fd, NBD_CLEAR_QUE);
617
    ioctl(fd, NBD_DISCONNECT);
618
    ioctl(fd, NBD_CLEAR_SOCK);
619
    return 0;
620
}
621

    
622
int nbd_client(int fd)
623
{
624
    int ret;
625
    int serrno;
626

    
627
    TRACE("Doing NBD loop");
628

    
629
    ret = ioctl(fd, NBD_DO_IT);
630
    if (ret < 0 && errno == EPIPE) {
631
        /* NBD_DO_IT normally returns EPIPE when someone has disconnected
632
         * the socket via NBD_DISCONNECT.  We do not want to return 1 in
633
         * that case.
634
         */
635
        ret = 0;
636
    }
637
    serrno = errno;
638

    
639
    TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
640

    
641
    TRACE("Clearing NBD queue");
642
    ioctl(fd, NBD_CLEAR_QUE);
643

    
644
    TRACE("Clearing NBD socket");
645
    ioctl(fd, NBD_CLEAR_SOCK);
646

    
647
    errno = serrno;
648
    return ret;
649
}
650
#else
651
/* Stub used when the NBD device interface is not compiled in (#else branch). */
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
{
    return -ENOTSUP;
}

/* Stub: always reports -ENOTSUP. */
int nbd_disconnect(int fd)
{
    return -ENOTSUP;
}

/* Stub: always reports -ENOTSUP. */
int nbd_client(int fd)
{
    return -ENOTSUP;
}
665
#endif
666

    
667
ssize_t nbd_send_request(int csock, struct nbd_request *request)
668
{
669
    uint8_t buf[NBD_REQUEST_SIZE];
670
    ssize_t ret;
671

    
672
    cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
673
    cpu_to_be32w((uint32_t*)(buf + 4), request->type);
674
    cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
675
    cpu_to_be64w((uint64_t*)(buf + 16), request->from);
676
    cpu_to_be32w((uint32_t*)(buf + 24), request->len);
677

    
678
    TRACE("Sending request to client: "
679
          "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
680
          request->from, request->len, request->handle, request->type);
681

    
682
    ret = write_sync(csock, buf, sizeof(buf));
683
    if (ret < 0) {
684
        return ret;
685
    }
686

    
687
    if (ret != sizeof(buf)) {
688
        LOG("writing to socket failed");
689
        return -EINVAL;
690
    }
691
    return 0;
692
}
693

    
694
static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
695
{
696
    uint8_t buf[NBD_REQUEST_SIZE];
697
    uint32_t magic;
698
    ssize_t ret;
699

    
700
    ret = read_sync(csock, buf, sizeof(buf));
701
    if (ret < 0) {
702
        return ret;
703
    }
704

    
705
    if (ret != sizeof(buf)) {
706
        LOG("read failed");
707
        return -EINVAL;
708
    }
709

    
710
    /* Request
711
       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
712
       [ 4 ..  7]   type    (0 == READ, 1 == WRITE)
713
       [ 8 .. 15]   handle
714
       [16 .. 23]   from
715
       [24 .. 27]   len
716
     */
717

    
718
    magic = be32_to_cpup((uint32_t*)buf);
719
    request->type  = be32_to_cpup((uint32_t*)(buf + 4));
720
    request->handle = be64_to_cpup((uint64_t*)(buf + 8));
721
    request->from  = be64_to_cpup((uint64_t*)(buf + 16));
722
    request->len   = be32_to_cpup((uint32_t*)(buf + 24));
723

    
724
    TRACE("Got request: "
725
          "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
726
          magic, request->type, request->from, request->len);
727

    
728
    if (magic != NBD_REQUEST_MAGIC) {
729
        LOG("invalid magic (got 0x%x)", magic);
730
        return -EINVAL;
731
    }
732
    return 0;
733
}
734

    
735
ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
736
{
737
    uint8_t buf[NBD_REPLY_SIZE];
738
    uint32_t magic;
739
    ssize_t ret;
740

    
741
    ret = read_sync(csock, buf, sizeof(buf));
742
    if (ret < 0) {
743
        return ret;
744
    }
745

    
746
    if (ret != sizeof(buf)) {
747
        LOG("read failed");
748
        return -EINVAL;
749
    }
750

    
751
    /* Reply
752
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
753
       [ 4 ..  7]    error   (0 == no error)
754
       [ 7 .. 15]    handle
755
     */
756

    
757
    magic = be32_to_cpup((uint32_t*)buf);
758
    reply->error  = be32_to_cpup((uint32_t*)(buf + 4));
759
    reply->handle = be64_to_cpup((uint64_t*)(buf + 8));
760

    
761
    TRACE("Got reply: "
762
          "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
763
          magic, reply->error, reply->handle);
764

    
765
    if (magic != NBD_REPLY_MAGIC) {
766
        LOG("invalid magic (got 0x%x)", magic);
767
        return -EINVAL;
768
    }
769
    return 0;
770
}
771

    
772
static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
773
{
774
    uint8_t buf[NBD_REPLY_SIZE];
775
    ssize_t ret;
776

    
777
    /* Reply
778
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
779
       [ 4 ..  7]    error   (0 == no error)
780
       [ 7 .. 15]    handle
781
     */
782
    cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
783
    cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
784
    cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);
785

    
786
    TRACE("Sending response to client");
787

    
788
    ret = write_sync(csock, buf, sizeof(buf));
789
    if (ret < 0) {
790
        return ret;
791
    }
792

    
793
    if (ret != sizeof(buf)) {
794
        LOG("writing to socket failed");
795
        return -EINVAL;
796
    }
797
    return 0;
798
}
799

    
800
#define MAX_NBD_REQUESTS 16
801

    
802
void nbd_client_get(NBDClient *client)
803
{
804
    client->refcount++;
805
}
806

    
807
/*
 * Drop a reference on @client.  When the count reaches zero the fd handlers
 * are removed, the socket is closed, the client is detached from its export
 * (dropping the export reference) and the structure is freed.
 */
void nbd_client_put(NBDClient *client)
{
    if (--client->refcount != 0) {
        return;
    }

    /* The last reference should be dropped by client->close,
     * which is called by nbd_client_close.
     */
    assert(client->closing);

    qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
    close(client->sock);
    client->sock = -1;

    if (client->exp) {
        QTAILQ_REMOVE(&client->exp->clients, client, next);
        nbd_export_put(client->exp);
    }

    g_free(client);
}
825

    
826
/*
 * Start closing @client.  Only the first call has any effect: it marks the
 * client as closing, shuts the socket down in both directions and invokes
 * the client's close callback if one is set.
 */
void nbd_client_close(NBDClient *client)
{
    if (!client->closing) {
        client->closing = true;

        /* Force requests to finish.  They will drop their own references,
         * then we'll close the socket and free the NBDClient.
         */
        shutdown(client->sock, 2);

        /* Also tell the client, so that they release their reference.  */
        if (client->close) {
            client->close(client);
        }
    }
}
844

    
845
/*
 * Obtain an NBDRequest for @client: reuse one from the export's idle queue
 * if any, otherwise allocate a new one with an NBD_BUFFER_SIZE data buffer.
 * Counts the request against the client and takes a client reference.
 */
static NBDRequest *nbd_request_get(NBDClient *client)
{
    NBDExport *exp = client->exp;
    NBDRequest *req;

    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
    client->nb_requests++;

    if (!QSIMPLEQ_EMPTY(&exp->requests)) {
        req = QSIMPLEQ_FIRST(&exp->requests);
        QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
    } else {
        req = g_malloc0(sizeof(NBDRequest));
        req->data = qemu_blockalign(exp->bs, NBD_BUFFER_SIZE);
    }

    nbd_client_get(client);
    req->client = client;
    return req;
}
864

    
865
/*
 * Return @req to its export's idle queue, decrement the client's in-flight
 * count and drop the client reference taken by nbd_request_get().  If the
 * client was at MAX_NBD_REQUESTS before the decrement, qemu_notify_event()
 * is called.
 */
static void nbd_request_put(NBDRequest *req)
{
    NBDClient *client = req->client;
    int in_flight;

    QSIMPLEQ_INSERT_HEAD(&client->exp->requests, req, entry);

    in_flight = client->nb_requests;
    client->nb_requests = in_flight - 1;
    if (in_flight == MAX_NBD_REQUESTS) {
        qemu_notify_event();
    }

    nbd_client_put(client);
}
874

    
875
/*
 * Allocate an export for @bs with reference count 1.
 *
 * @dev_offset: stored in the export's dev_offset field
 * @size:       export size; -1 means use bdrv_getlength(@bs)
 * @nbdflags:   export flags
 * @close:      optional callback stored in the export
 */
NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
                          off_t size, uint32_t nbdflags,
                          void (*close)(NBDExport *))
{
    NBDExport *exp = g_malloc0(sizeof(NBDExport));

    exp->refcount = 1;
    exp->close = close;
    exp->bs = bs;
    exp->dev_offset = dev_offset;
    exp->nbdflags = nbdflags;
    if (size == -1) {
        exp->size = bdrv_getlength(bs);
    } else {
        exp->size = size;
    }

    QSIMPLEQ_INIT(&exp->requests);
    QTAILQ_INIT(&exp->clients);
    return exp;
}
890

    
891
NBDExport *nbd_export_find(const char *name)
892
{
893
    NBDExport *exp;
894
    QTAILQ_FOREACH(exp, &exports, next) {
895
        if (strcmp(name, exp->name) == 0) {
896
            return exp;
897
        }
898
    }
899

    
900
    return NULL;
901
}
902

    
903
/*
 * Rename @exp.  A named export is linked in the global exports list and
 * that list holds one reference on it; a NULL @name removes it from the
 * list.  Nothing happens if @name is the very pointer already stored.
 */
void nbd_export_set_name(NBDExport *exp, const char *name)
{
    if (name == exp->name) {
        return;
    }

    /* Keep the export alive while the list reference is swapped. */
    nbd_export_get(exp);

    if (exp->name) {
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
        nbd_export_put(exp);
    }

    if (name) {
        nbd_export_get(exp);
        exp->name = g_strdup(name);
        QTAILQ_INSERT_TAIL(&exports, exp, next);
    }

    nbd_export_put(exp);
}
923

    
924
/*
 * Close every client attached to @exp and clear the export's name, which
 * removes it from the global exports list.  A temporary reference keeps
 * @exp valid for the duration of the call.
 */
void nbd_export_close(NBDExport *exp)
{
    NBDClient *cur, *tmp;

    nbd_export_get(exp);

    QTAILQ_FOREACH_SAFE(cur, &exp->clients, next, tmp) {
        nbd_client_close(cur);
    }
    nbd_export_set_name(exp, NULL);

    nbd_export_put(exp);
}
935

    
936
/* Take a reference on @exp, which must already have at least one. */
void nbd_export_get(NBDExport *exp)
{
    assert(exp->refcount > 0);
    exp->refcount += 1;
}
941

    
942
/*
 * Drop a reference on @exp.  If this is the last one, the export is first
 * closed via nbd_export_close(); then the close callback (if any) runs,
 * the idle request pool is freed and the export itself is released.
 */
void nbd_export_put(NBDExport *exp)
{
    NBDRequest *req;

    assert(exp->refcount > 0);
    if (exp->refcount == 1) {
        /* Must run before the final decrement below. */
        nbd_export_close(exp);
    }

    if (--exp->refcount != 0) {
        return;
    }

    assert(exp->name == NULL);

    if (exp->close) {
        exp->close(exp);
    }

    while ((req = QSIMPLEQ_FIRST(&exp->requests)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
        qemu_vfree(req->data);
        g_free(req);
    }

    g_free(exp);
}
966

    
967
/* Return the block device stored in @exp. */
BlockDriverState *nbd_export_get_blockdev(NBDExport *exp)
{
    BlockDriverState *bs = exp->bs;

    return bs;
}
971

    
972
void nbd_export_close_all(void)
973
{
974
    NBDExport *exp, *next;
975

    
976
    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
977
        nbd_export_close(exp);
978
    }
979
}
980

    
981
/* fd-handler callbacks, declared here because nbd_co_send_reply()
 * registers them with qemu_set_fd_handler2(). */
static int nbd_can_read(void *opaque);
static void nbd_read(void *opaque);
static void nbd_restart_write(void *opaque);
984

    
985
/*
 * Send @reply, followed by @len bytes of req->data when @len is non-zero,
 * on the client's socket.
 *
 * The send lock is held for the whole transfer; nbd_restart_write is
 * installed as the write handler and this coroutine is recorded in
 * send_coroutine for that time.  With a payload, the socket is corked
 * around header and data.
 *
 * Returns the result of nbd_send_reply(), or -EIO if the payload could not
 * be sent completely.
 */
static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
                                 int len)
{
    NBDClient *client = req->client;
    int csock = client->sock;
    ssize_t rc, sent;

    qemu_co_mutex_lock(&client->send_lock);
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
                         nbd_restart_write, client);
    client->send_coroutine = qemu_coroutine_self();

    if (len) {
        socket_set_cork(csock, 1);
    }

    rc = nbd_send_reply(csock, reply);

    if (len) {
        if (rc >= 0) {
            sent = qemu_co_send(csock, req->data, len);
            if (sent != len) {
                rc = -EIO;
            }
        }
        socket_set_cork(csock, 0);
    }

    client->send_coroutine = NULL;
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
    qemu_co_mutex_unlock(&client->send_lock);
    return rc;
}
1016

    
1017
/*
 * Receive one request for @req's client, including the payload of a write.
 *
 * recv_coroutine is set to the current coroutine while the request is
 * being read and cleared on every exit path.
 *
 * Returns 0 on success; -EAGAIN if nbd_receive_request() returned it; -EIO
 * for any other header error or a failed payload read; -EINVAL if the
 * length exceeds NBD_BUFFER_SIZE or from + len overflows.
 */
static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
{
    NBDClient *client = req->client;
    int csock = client->sock;
    ssize_t rc;

    client->recv_coroutine = qemu_coroutine_self();

    rc = nbd_receive_request(csock, request);
    if (rc < 0) {
        /* Every failure other than -EAGAIN is reported as -EIO. */
        rc = (rc == -EAGAIN) ? -EAGAIN : -EIO;
        goto out;
    }

    if (request->len > NBD_BUFFER_SIZE) {
        LOG("len (%u) is larger than max len (%u)",
            request->len, NBD_BUFFER_SIZE);
        rc = -EINVAL;
        goto out;
    }

    if ((request->from + request->len) < request->from) {
        LOG("integer overflow detected! "
            "you're probably being attacked");
        rc = -EINVAL;
        goto out;
    }

    TRACE("Decoding type");

    rc = 0;
    if ((request->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
        TRACE("Reading %u byte(s)", request->len);

        if (qemu_co_recv(csock, req->data, request->len) != request->len) {
            LOG("reading from socket failed");
            rc = -EIO;
        }
    }

out:
    client->recv_coroutine = NULL;
    return rc;
}
1063

    
1064
static void nbd_trip(void *opaque)
1065
{
1066
    NBDClient *client = opaque;
1067
    NBDExport *exp = client->exp;
1068
    NBDRequest *req;
1069
    struct nbd_request request;
1070
    struct nbd_reply reply;
1071
    ssize_t ret;
1072

    
1073
    TRACE("Reading request.");
1074
    if (client->closing) {
1075
        return;
1076
    }
1077

    
1078
    req = nbd_request_get(client);
1079
    ret = nbd_co_receive_request(req, &request);
1080
    if (ret == -EAGAIN) {
1081
        goto done;
1082
    }
1083
    if (ret == -EIO) {
1084
        goto out;
1085
    }
1086

    
1087
    reply.handle = request.handle;
1088
    reply.error = 0;
1089

    
1090
    if (ret < 0) {
1091
        reply.error = -ret;
1092
        goto error_reply;
1093
    }
1094

    
1095
    if ((request.from + request.len) > exp->size) {
1096
            LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
1097
            ", Offset: %" PRIu64 "\n",
1098
                    request.from, request.len,
1099
                    (uint64_t)exp->size, (uint64_t)exp->dev_offset);
1100
        LOG("requested operation past EOF--bad client?");
1101
        goto invalid_request;
1102
    }
1103

    
1104
    switch (request.type & NBD_CMD_MASK_COMMAND) {
1105
    case NBD_CMD_READ:
1106
        TRACE("Request type is READ");
1107

    
1108
        if (request.type & NBD_CMD_FLAG_FUA) {
1109
            ret = bdrv_co_flush(exp->bs);
1110
            if (ret < 0) {
1111
                LOG("flush failed");
1112
                reply.error = -ret;
1113
                goto error_reply;
1114
            }
1115
        }
1116

    
1117
        ret = bdrv_read(exp->bs, (request.from + exp->dev_offset) / 512,
1118
                        req->data, request.len / 512);
1119
        if (ret < 0) {
1120
            LOG("reading from file failed");
1121
            reply.error = -ret;
1122
            goto error_reply;
1123
        }
1124

    
1125
        TRACE("Read %u byte(s)", request.len);
1126
        if (nbd_co_send_reply(req, &reply, request.len) < 0)
1127
            goto out;
1128
        break;
1129
    case NBD_CMD_WRITE:
1130
        TRACE("Request type is WRITE");
1131

    
1132
        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
1133
            TRACE("Server is read-only, return error");
1134
            reply.error = EROFS;
1135
            goto error_reply;
1136
        }
1137

    
1138
        TRACE("Writing to device");
1139

    
1140
        ret = bdrv_write(exp->bs, (request.from + exp->dev_offset) / 512,
1141
                         req->data, request.len / 512);
1142
        if (ret < 0) {
1143
            LOG("writing to file failed");
1144
            reply.error = -ret;
1145
            goto error_reply;
1146
        }
1147

    
1148
        if (request.type & NBD_CMD_FLAG_FUA) {
1149
            ret = bdrv_co_flush(exp->bs);
1150
            if (ret < 0) {
1151
                LOG("flush failed");
1152
                reply.error = -ret;
1153
                goto error_reply;
1154
            }
1155
        }
1156

    
1157
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
1158
            goto out;
1159
        }
1160
        break;
1161
    case NBD_CMD_DISC:
1162
        TRACE("Request type is DISCONNECT");
1163
        errno = 0;
1164
        goto out;
1165
    case NBD_CMD_FLUSH:
1166
        TRACE("Request type is FLUSH");
1167

    
1168
        ret = bdrv_co_flush(exp->bs);
1169
        if (ret < 0) {
1170
            LOG("flush failed");
1171
            reply.error = -ret;
1172
        }
1173
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
1174
            goto out;
1175
        }
1176
        break;
1177
    case NBD_CMD_TRIM:
1178
        TRACE("Request type is TRIM");
1179
        ret = bdrv_co_discard(exp->bs, (request.from + exp->dev_offset) / 512,
1180
                              request.len / 512);
1181
        if (ret < 0) {
1182
            LOG("discard failed");
1183
            reply.error = -ret;
1184
        }
1185
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
1186
            goto out;
1187
        }
1188
        break;
1189
    default:
1190
        LOG("invalid request type (%u) received", request.type);
1191
    invalid_request:
1192
        reply.error = -EINVAL;
1193
    error_reply:
1194
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
1195
            goto out;
1196
        }
1197
        break;
1198
    }
1199

    
1200
    TRACE("Request/Reply complete");
1201

    
1202
done:
1203
    nbd_request_put(req);
1204
    return;
1205

    
1206
out:
1207
    nbd_request_put(req);
1208
    nbd_client_close(client);
1209
}
1210

    
1211
static int nbd_can_read(void *opaque)
1212
{
1213
    NBDClient *client = opaque;
1214

    
1215
    return client->recv_coroutine || client->nb_requests < MAX_NBD_REQUESTS;
1216
}
1217

    
1218
static void nbd_read(void *opaque)
1219
{
1220
    NBDClient *client = opaque;
1221

    
1222
    if (client->recv_coroutine) {
1223
        qemu_coroutine_enter(client->recv_coroutine, NULL);
1224
    } else {
1225
        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1226
    }
1227
}
1228

    
1229
static void nbd_restart_write(void *opaque)
1230
{
1231
    NBDClient *client = opaque;
1232

    
1233
    qemu_coroutine_enter(client->send_coroutine, NULL);
1234
}
1235

    
1236
/*
 * Allocate a client for socket @csock and run the NBD negotiation on it.
 *
 * @exp:   export to attach the client to; may be NULL, in which case the
 *         client is not added to any export's client list here.
 * @csock: connected client socket.
 * @close: callback stored in the client's close field.
 *
 * The new client starts with a reference count of 1.  Once negotiation
 * succeeds, the send lock is initialised and nbd_can_read()/nbd_read() are
 * installed as the socket's handlers.  When @exp is non-NULL the client is
 * appended to exp->clients and a reference on the export is taken.
 *
 * Returns the new client, or NULL (after freeing it) when negotiation
 * fails.
 */
NBDClient *nbd_client_new(NBDExport *exp, int csock,
                          void (*close)(NBDClient *))
{
    NBDClient *c = g_malloc0(sizeof(NBDClient));

    c->refcount = 1;
    c->exp = exp;
    c->sock = csock;

    if (nbd_send_negotiate(c) >= 0) {
        c->close = close;
        qemu_co_mutex_init(&c->send_lock);
        qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, c);

        if (exp != NULL) {
            QTAILQ_INSERT_TAIL(&exp->clients, c, next);
            nbd_export_get(exp);
        }
        return c;
    }

    g_free(c);
    return NULL;
}