Statistics
| Branch: | Revision:

root / nbd.c @ a4aab7b4

History | View | Annotate | Download (24.6 kB)

1
/*
2
 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
3
 *
4
 *  Network Block Device
5
 *
6
 *  This program is free software; you can redistribute it and/or modify
7
 *  it under the terms of the GNU General Public License as published by
8
 *  the Free Software Foundation; under version 2 of the License.
9
 *
10
 *  This program is distributed in the hope that it will be useful,
11
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 *  GNU General Public License for more details.
14
 *
15
 *  You should have received a copy of the GNU General Public License
16
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
17
 */
18

    
19
#include "nbd.h"
20
#include "block.h"
21

    
22
#include "qemu-coroutine.h"
23

    
24
#include <errno.h>
25
#include <string.h>
26
#ifndef _WIN32
27
#include <sys/ioctl.h>
28
#endif
29
#if defined(__sun__) || defined(__HAIKU__)
30
#include <sys/ioccom.h>
31
#endif
32
#include <ctype.h>
33
#include <inttypes.h>
34

    
35
#ifdef __linux__
36
#include <linux/fs.h>
37
#endif
38

    
39
#include "qemu_socket.h"
40
#include "qemu-queue.h"
41

    
42
//#define DEBUG_NBD
43

    
44
#ifdef DEBUG_NBD
45
#define TRACE(msg, ...) do { \
46
    LOG(msg, ## __VA_ARGS__); \
47
} while(0)
48
#else
49
#define TRACE(msg, ...) \
50
    do { } while (0)
51
#endif
52

    
53
#define LOG(msg, ...) do { \
54
    fprintf(stderr, "%s:%s():L%d: " msg "\n", \
55
            __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__); \
56
} while(0)
57

    
58
/* This is all part of the "official" NBD API */
59

    
60
#define NBD_REQUEST_SIZE        (4 + 4 + 8 + 8 + 4)
61
#define NBD_REPLY_SIZE          (4 + 4 + 8)
62
#define NBD_REQUEST_MAGIC       0x25609513
63
#define NBD_REPLY_MAGIC         0x67446698
64
#define NBD_OPTS_MAGIC          0x49484156454F5054LL
65
#define NBD_CLIENT_MAGIC        0x0000420281861253LL
66

    
67
#define NBD_SET_SOCK            _IO(0xab, 0)
68
#define NBD_SET_BLKSIZE         _IO(0xab, 1)
69
#define NBD_SET_SIZE            _IO(0xab, 2)
70
#define NBD_DO_IT               _IO(0xab, 3)
71
#define NBD_CLEAR_SOCK          _IO(0xab, 4)
72
#define NBD_CLEAR_QUE           _IO(0xab, 5)
73
#define NBD_PRINT_DEBUG         _IO(0xab, 6)
74
#define NBD_SET_SIZE_BLOCKS     _IO(0xab, 7)
75
#define NBD_DISCONNECT          _IO(0xab, 8)
76
#define NBD_SET_TIMEOUT         _IO(0xab, 9)
77
#define NBD_SET_FLAGS           _IO(0xab, 10)
78

    
79
#define NBD_OPT_EXPORT_NAME     (1 << 0)
80

    
81
/* Definitions for opaque data types */
82

    
83
typedef struct NBDRequest NBDRequest;
84

    
85
struct NBDRequest {
86
    QSIMPLEQ_ENTRY(NBDRequest) entry;
87
    NBDClient *client;
88
    uint8_t *data;
89
};
90

    
91
struct NBDExport {
92
    BlockDriverState *bs;
93
    off_t dev_offset;
94
    off_t size;
95
    uint32_t nbdflags;
96
    QSIMPLEQ_HEAD(, NBDRequest) requests;
97
};
98

    
99
struct NBDClient {
100
    int refcount;
101
    void (*close)(NBDClient *client);
102

    
103
    NBDExport *exp;
104
    int sock;
105

    
106
    Coroutine *recv_coroutine;
107

    
108
    CoMutex send_lock;
109
    Coroutine *send_coroutine;
110

    
111
    int nb_requests;
112
};
113

    
114
/* That's all folks */
115

    
116
ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
117
{
118
    size_t offset = 0;
119
    int err;
120

    
121
    if (qemu_in_coroutine()) {
122
        if (do_read) {
123
            return qemu_co_recv(fd, buffer, size);
124
        } else {
125
            return qemu_co_send(fd, buffer, size);
126
        }
127
    }
128

    
129
    while (offset < size) {
130
        ssize_t len;
131

    
132
        if (do_read) {
133
            len = qemu_recv(fd, buffer + offset, size - offset, 0);
134
        } else {
135
            len = send(fd, buffer + offset, size - offset, 0);
136
        }
137

    
138
        if (len < 0) {
139
            err = socket_error();
140

    
141
            /* recoverable error */
142
            if (err == EINTR || (offset > 0 && err == EAGAIN)) {
143
                continue;
144
            }
145

    
146
            /* unrecoverable error */
147
            return -err;
148
        }
149

    
150
        /* eof */
151
        if (len == 0) {
152
            break;
153
        }
154

    
155
        offset += len;
156
    }
157

    
158
    return offset;
159
}
160

    
161
static ssize_t read_sync(int fd, void *buffer, size_t size)
162
{
163
    /* Sockets are kept in blocking mode in the negotiation phase.  After
164
     * that, a non-readable socket simply means that another thread stole
165
     * our request/reply.  Synchronization is done with recv_coroutine, so
166
     * that this is coroutine-safe.
167
     */
168
    return nbd_wr_sync(fd, buffer, size, true);
169
}
170

    
171
static ssize_t write_sync(int fd, void *buffer, size_t size)
172
{
173
    int ret;
174
    do {
175
        /* For writes, we do expect the socket to be writable.  */
176
        ret = nbd_wr_sync(fd, buffer, size, false);
177
    } while (ret == -EAGAIN);
178
    return ret;
179
}
180

    
181
static void combine_addr(char *buf, size_t len, const char* address,
182
                         uint16_t port)
183
{
184
    /* If the address-part contains a colon, it's an IPv6 IP so needs [] */
185
    if (strstr(address, ":")) {
186
        snprintf(buf, len, "[%s]:%u", address, port);
187
    } else {
188
        snprintf(buf, len, "%s:%u", address, port);
189
    }
190
}
191

    
192
int tcp_socket_outgoing(const char *address, uint16_t port)
193
{
194
    char address_and_port[128];
195
    combine_addr(address_and_port, 128, address, port);
196
    return tcp_socket_outgoing_spec(address_and_port);
197
}
198

    
199
int tcp_socket_outgoing_spec(const char *address_and_port)
200
{
201
    return inet_connect(address_and_port, true, NULL, NULL);
202
}
203

    
204
int tcp_socket_incoming(const char *address, uint16_t port)
205
{
206
    char address_and_port[128];
207
    combine_addr(address_and_port, 128, address, port);
208
    return tcp_socket_incoming_spec(address_and_port);
209
}
210

    
211
int tcp_socket_incoming_spec(const char *address_and_port)
212
{
213
    char *ostr  = NULL;
214
    int olen = 0;
215
    return inet_listen(address_and_port, ostr, olen, SOCK_STREAM, 0, NULL);
216
}
217

    
218
int unix_socket_incoming(const char *path)
219
{
220
    char *ostr = NULL;
221
    int olen = 0;
222

    
223
    return unix_listen(path, ostr, olen);
224
}
225

    
226
int unix_socket_outgoing(const char *path)
227
{
228
    return unix_connect(path);
229
}
230

    
231
/* Basic flow
232

233
   Server         Client
234

235
   Negotiate
236
                  Request
237
   Response
238
                  Request
239
   Response
240
                  ...
241
   ...
242
                  Request (type == 2)
243
*/
244

    
245
static int nbd_send_negotiate(NBDClient *client)
246
{
247
    int csock = client->sock;
248
    char buf[8 + 8 + 8 + 128];
249
    int rc;
250

    
251
    /* Negotiate
252
        [ 0 ..   7]   passwd   ("NBDMAGIC")
253
        [ 8 ..  15]   magic    (NBD_CLIENT_MAGIC)
254
        [16 ..  23]   size
255
        [24 ..  27]   flags
256
        [28 .. 151]   reserved (0)
257
     */
258

    
259
    socket_set_block(csock);
260
    rc = -EINVAL;
261

    
262
    TRACE("Beginning negotiation.");
263
    memcpy(buf, "NBDMAGIC", 8);
264
    cpu_to_be64w((uint64_t*)(buf + 8), NBD_CLIENT_MAGIC);
265
    cpu_to_be64w((uint64_t*)(buf + 16), client->exp->size);
266
    cpu_to_be32w((uint32_t*)(buf + 24),
267
                 client->exp->nbdflags | NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
268
                 NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
269
    memset(buf + 28, 0, 124);
270

    
271
    if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
272
        LOG("write failed");
273
        goto fail;
274
    }
275

    
276
    TRACE("Negotiation succeeded.");
277
    rc = 0;
278
fail:
279
    socket_set_nonblock(csock);
280
    return rc;
281
}
282

    
283
int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
284
                          off_t *size, size_t *blocksize)
285
{
286
    char buf[256];
287
    uint64_t magic, s;
288
    uint16_t tmp;
289
    int rc;
290

    
291
    TRACE("Receiving negotiation.");
292

    
293
    socket_set_block(csock);
294
    rc = -EINVAL;
295

    
296
    if (read_sync(csock, buf, 8) != 8) {
297
        LOG("read failed");
298
        goto fail;
299
    }
300

    
301
    buf[8] = '\0';
302
    if (strlen(buf) == 0) {
303
        LOG("server connection closed");
304
        goto fail;
305
    }
306

    
307
    TRACE("Magic is %c%c%c%c%c%c%c%c",
308
          qemu_isprint(buf[0]) ? buf[0] : '.',
309
          qemu_isprint(buf[1]) ? buf[1] : '.',
310
          qemu_isprint(buf[2]) ? buf[2] : '.',
311
          qemu_isprint(buf[3]) ? buf[3] : '.',
312
          qemu_isprint(buf[4]) ? buf[4] : '.',
313
          qemu_isprint(buf[5]) ? buf[5] : '.',
314
          qemu_isprint(buf[6]) ? buf[6] : '.',
315
          qemu_isprint(buf[7]) ? buf[7] : '.');
316

    
317
    if (memcmp(buf, "NBDMAGIC", 8) != 0) {
318
        LOG("Invalid magic received");
319
        goto fail;
320
    }
321

    
322
    if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
323
        LOG("read failed");
324
        goto fail;
325
    }
326
    magic = be64_to_cpu(magic);
327
    TRACE("Magic is 0x%" PRIx64, magic);
328

    
329
    if (name) {
330
        uint32_t reserved = 0;
331
        uint32_t opt;
332
        uint32_t namesize;
333

    
334
        TRACE("Checking magic (opts_magic)");
335
        if (magic != NBD_OPTS_MAGIC) {
336
            LOG("Bad magic received");
337
            goto fail;
338
        }
339
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
340
            LOG("flags read failed");
341
            goto fail;
342
        }
343
        *flags = be16_to_cpu(tmp) << 16;
344
        /* reserved for future use */
345
        if (write_sync(csock, &reserved, sizeof(reserved)) !=
346
            sizeof(reserved)) {
347
            LOG("write failed (reserved)");
348
            goto fail;
349
        }
350
        /* write the export name */
351
        magic = cpu_to_be64(magic);
352
        if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
353
            LOG("write failed (magic)");
354
            goto fail;
355
        }
356
        opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
357
        if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
358
            LOG("write failed (opt)");
359
            goto fail;
360
        }
361
        namesize = cpu_to_be32(strlen(name));
362
        if (write_sync(csock, &namesize, sizeof(namesize)) !=
363
            sizeof(namesize)) {
364
            LOG("write failed (namesize)");
365
            goto fail;
366
        }
367
        if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
368
            LOG("write failed (name)");
369
            goto fail;
370
        }
371
    } else {
372
        TRACE("Checking magic (cli_magic)");
373

    
374
        if (magic != NBD_CLIENT_MAGIC) {
375
            LOG("Bad magic received");
376
            goto fail;
377
        }
378
    }
379

    
380
    if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
381
        LOG("read failed");
382
        goto fail;
383
    }
384
    *size = be64_to_cpu(s);
385
    *blocksize = 1024;
386
    TRACE("Size is %" PRIu64, *size);
387

    
388
    if (!name) {
389
        if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
390
            LOG("read failed (flags)");
391
            goto fail;
392
        }
393
        *flags = be32_to_cpup(flags);
394
    } else {
395
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
396
            LOG("read failed (tmp)");
397
            goto fail;
398
        }
399
        *flags |= be32_to_cpu(tmp);
400
    }
401
    if (read_sync(csock, &buf, 124) != 124) {
402
        LOG("read failed (buf)");
403
        goto fail;
404
    }
405
    rc = 0;
406

    
407
fail:
408
    socket_set_nonblock(csock);
409
    return rc;
410
}
411

    
412
#ifdef __linux__
413
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
414
{
415
    TRACE("Setting NBD socket");
416

    
417
    if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
418
        int serrno = errno;
419
        LOG("Failed to set NBD socket");
420
        return -serrno;
421
    }
422

    
423
    TRACE("Setting block size to %lu", (unsigned long)blocksize);
424

    
425
    if (ioctl(fd, NBD_SET_BLKSIZE, blocksize) < 0) {
426
        int serrno = errno;
427
        LOG("Failed setting NBD block size");
428
        return -serrno;
429
    }
430

    
431
        TRACE("Setting size to %zd block(s)", (size_t)(size / blocksize));
432

    
433
    if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / blocksize) < 0) {
434
        int serrno = errno;
435
        LOG("Failed setting size (in blocks)");
436
        return -serrno;
437
    }
438

    
439
    if (flags & NBD_FLAG_READ_ONLY) {
440
        int read_only = 1;
441
        TRACE("Setting readonly attribute");
442

    
443
        if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
444
            int serrno = errno;
445
            LOG("Failed setting read-only attribute");
446
            return -serrno;
447
        }
448
    }
449

    
450
    if (ioctl(fd, NBD_SET_FLAGS, flags) < 0
451
        && errno != ENOTTY) {
452
        int serrno = errno;
453
        LOG("Failed setting flags");
454
        return -serrno;
455
    }
456

    
457
    TRACE("Negotiation ended");
458

    
459
    return 0;
460
}
461

    
462
int nbd_disconnect(int fd)
463
{
464
    ioctl(fd, NBD_CLEAR_QUE);
465
    ioctl(fd, NBD_DISCONNECT);
466
    ioctl(fd, NBD_CLEAR_SOCK);
467
    return 0;
468
}
469

    
470
int nbd_client(int fd)
471
{
472
    int ret;
473
    int serrno;
474

    
475
    TRACE("Doing NBD loop");
476

    
477
    ret = ioctl(fd, NBD_DO_IT);
478
    if (ret < 0 && errno == EPIPE) {
479
        /* NBD_DO_IT normally returns EPIPE when someone has disconnected
480
         * the socket via NBD_DISCONNECT.  We do not want to return 1 in
481
         * that case.
482
         */
483
        ret = 0;
484
    }
485
    serrno = errno;
486

    
487
    TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
488

    
489
    TRACE("Clearing NBD queue");
490
    ioctl(fd, NBD_CLEAR_QUE);
491

    
492
    TRACE("Clearing NBD socket");
493
    ioctl(fd, NBD_CLEAR_SOCK);
494

    
495
    errno = serrno;
496
    return ret;
497
}
498
#else
499
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
500
{
501
    return -ENOTSUP;
502
}
503

    
504
int nbd_disconnect(int fd)
505
{
506
    return -ENOTSUP;
507
}
508

    
509
int nbd_client(int fd)
510
{
511
    return -ENOTSUP;
512
}
513
#endif
514

    
515
ssize_t nbd_send_request(int csock, struct nbd_request *request)
516
{
517
    uint8_t buf[NBD_REQUEST_SIZE];
518
    ssize_t ret;
519

    
520
    cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
521
    cpu_to_be32w((uint32_t*)(buf + 4), request->type);
522
    cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
523
    cpu_to_be64w((uint64_t*)(buf + 16), request->from);
524
    cpu_to_be32w((uint32_t*)(buf + 24), request->len);
525

    
526
    TRACE("Sending request to client: "
527
          "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
528
          request->from, request->len, request->handle, request->type);
529

    
530
    ret = write_sync(csock, buf, sizeof(buf));
531
    if (ret < 0) {
532
        return ret;
533
    }
534

    
535
    if (ret != sizeof(buf)) {
536
        LOG("writing to socket failed");
537
        return -EINVAL;
538
    }
539
    return 0;
540
}
541

    
542
static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
543
{
544
    uint8_t buf[NBD_REQUEST_SIZE];
545
    uint32_t magic;
546
    ssize_t ret;
547

    
548
    ret = read_sync(csock, buf, sizeof(buf));
549
    if (ret < 0) {
550
        return ret;
551
    }
552

    
553
    if (ret != sizeof(buf)) {
554
        LOG("read failed");
555
        return -EINVAL;
556
    }
557

    
558
    /* Request
559
       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
560
       [ 4 ..  7]   type    (0 == READ, 1 == WRITE)
561
       [ 8 .. 15]   handle
562
       [16 .. 23]   from
563
       [24 .. 27]   len
564
     */
565

    
566
    magic = be32_to_cpup((uint32_t*)buf);
567
    request->type  = be32_to_cpup((uint32_t*)(buf + 4));
568
    request->handle = be64_to_cpup((uint64_t*)(buf + 8));
569
    request->from  = be64_to_cpup((uint64_t*)(buf + 16));
570
    request->len   = be32_to_cpup((uint32_t*)(buf + 24));
571

    
572
    TRACE("Got request: "
573
          "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
574
          magic, request->type, request->from, request->len);
575

    
576
    if (magic != NBD_REQUEST_MAGIC) {
577
        LOG("invalid magic (got 0x%x)", magic);
578
        return -EINVAL;
579
    }
580
    return 0;
581
}
582

    
583
ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
584
{
585
    uint8_t buf[NBD_REPLY_SIZE];
586
    uint32_t magic;
587
    ssize_t ret;
588

    
589
    ret = read_sync(csock, buf, sizeof(buf));
590
    if (ret < 0) {
591
        return ret;
592
    }
593

    
594
    if (ret != sizeof(buf)) {
595
        LOG("read failed");
596
        return -EINVAL;
597
    }
598

    
599
    /* Reply
600
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
601
       [ 4 ..  7]    error   (0 == no error)
602
       [ 7 .. 15]    handle
603
     */
604

    
605
    magic = be32_to_cpup((uint32_t*)buf);
606
    reply->error  = be32_to_cpup((uint32_t*)(buf + 4));
607
    reply->handle = be64_to_cpup((uint64_t*)(buf + 8));
608

    
609
    TRACE("Got reply: "
610
          "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
611
          magic, reply->error, reply->handle);
612

    
613
    if (magic != NBD_REPLY_MAGIC) {
614
        LOG("invalid magic (got 0x%x)", magic);
615
        return -EINVAL;
616
    }
617
    return 0;
618
}
619

    
620
static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
621
{
622
    uint8_t buf[NBD_REPLY_SIZE];
623
    ssize_t ret;
624

    
625
    /* Reply
626
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
627
       [ 4 ..  7]    error   (0 == no error)
628
       [ 7 .. 15]    handle
629
     */
630
    cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
631
    cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
632
    cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);
633

    
634
    TRACE("Sending response to client");
635

    
636
    ret = write_sync(csock, buf, sizeof(buf));
637
    if (ret < 0) {
638
        return ret;
639
    }
640

    
641
    if (ret != sizeof(buf)) {
642
        LOG("writing to socket failed");
643
        return -EINVAL;
644
    }
645
    return 0;
646
}
647

    
648
#define MAX_NBD_REQUESTS 16
649

    
650
static void nbd_client_get(NBDClient *client)
651
{
652
    client->refcount++;
653
}
654

    
655
static void nbd_client_put(NBDClient *client)
656
{
657
    if (--client->refcount == 0) {
658
        g_free(client);
659
    }
660
}
661

    
662
static void nbd_client_close(NBDClient *client)
663
{
664
    qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
665
    close(client->sock);
666
    client->sock = -1;
667
    if (client->close) {
668
        client->close(client);
669
    }
670
    nbd_client_put(client);
671
}
672

    
673
static NBDRequest *nbd_request_get(NBDClient *client)
674
{
675
    NBDRequest *req;
676
    NBDExport *exp = client->exp;
677

    
678
    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
679
    client->nb_requests++;
680

    
681
    if (QSIMPLEQ_EMPTY(&exp->requests)) {
682
        req = g_malloc0(sizeof(NBDRequest));
683
        req->data = qemu_blockalign(exp->bs, NBD_BUFFER_SIZE);
684
    } else {
685
        req = QSIMPLEQ_FIRST(&exp->requests);
686
        QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
687
    }
688
    nbd_client_get(client);
689
    req->client = client;
690
    return req;
691
}
692

    
693
static void nbd_request_put(NBDRequest *req)
694
{
695
    NBDClient *client = req->client;
696
    QSIMPLEQ_INSERT_HEAD(&client->exp->requests, req, entry);
697
    if (client->nb_requests-- == MAX_NBD_REQUESTS) {
698
        qemu_notify_event();
699
    }
700
    nbd_client_put(client);
701
}
702

    
703
NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
704
                          off_t size, uint32_t nbdflags)
705
{
706
    NBDExport *exp = g_malloc0(sizeof(NBDExport));
707
    QSIMPLEQ_INIT(&exp->requests);
708
    exp->bs = bs;
709
    exp->dev_offset = dev_offset;
710
    exp->nbdflags = nbdflags;
711
    exp->size = size == -1 ? bdrv_getlength(bs) : size;
712
    return exp;
713
}
714

    
715
void nbd_export_close(NBDExport *exp)
716
{
717
    while (!QSIMPLEQ_EMPTY(&exp->requests)) {
718
        NBDRequest *first = QSIMPLEQ_FIRST(&exp->requests);
719
        QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
720
        qemu_vfree(first->data);
721
        g_free(first);
722
    }
723

    
724
    g_free(exp);
725
}
726

    
727
static int nbd_can_read(void *opaque);
728
static void nbd_read(void *opaque);
729
static void nbd_restart_write(void *opaque);
730

    
731
static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
732
                                 int len)
733
{
734
    NBDClient *client = req->client;
735
    int csock = client->sock;
736
    ssize_t rc, ret;
737

    
738
    qemu_co_mutex_lock(&client->send_lock);
739
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
740
                         nbd_restart_write, client);
741
    client->send_coroutine = qemu_coroutine_self();
742

    
743
    if (!len) {
744
        rc = nbd_send_reply(csock, reply);
745
    } else {
746
        socket_set_cork(csock, 1);
747
        rc = nbd_send_reply(csock, reply);
748
        if (rc >= 0) {
749
            ret = qemu_co_send(csock, req->data, len);
750
            if (ret != len) {
751
                rc = -EIO;
752
            }
753
        }
754
        socket_set_cork(csock, 0);
755
    }
756

    
757
    client->send_coroutine = NULL;
758
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
759
    qemu_co_mutex_unlock(&client->send_lock);
760
    return rc;
761
}
762

    
763
static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
764
{
765
    NBDClient *client = req->client;
766
    int csock = client->sock;
767
    ssize_t rc;
768

    
769
    client->recv_coroutine = qemu_coroutine_self();
770
    rc = nbd_receive_request(csock, request);
771
    if (rc < 0) {
772
        if (rc != -EAGAIN) {
773
            rc = -EIO;
774
        }
775
        goto out;
776
    }
777

    
778
    if (request->len > NBD_BUFFER_SIZE) {
779
        LOG("len (%u) is larger than max len (%u)",
780
            request->len, NBD_BUFFER_SIZE);
781
        rc = -EINVAL;
782
        goto out;
783
    }
784

    
785
    if ((request->from + request->len) < request->from) {
786
        LOG("integer overflow detected! "
787
            "you're probably being attacked");
788
        rc = -EINVAL;
789
        goto out;
790
    }
791

    
792
    TRACE("Decoding type");
793

    
794
    if ((request->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
795
        TRACE("Reading %u byte(s)", request->len);
796

    
797
        if (qemu_co_recv(csock, req->data, request->len) != request->len) {
798
            LOG("reading from socket failed");
799
            rc = -EIO;
800
            goto out;
801
        }
802
    }
803
    rc = 0;
804

    
805
out:
806
    client->recv_coroutine = NULL;
807
    return rc;
808
}
809

    
810
static void nbd_trip(void *opaque)
811
{
812
    NBDClient *client = opaque;
813
    NBDRequest *req = nbd_request_get(client);
814
    NBDExport *exp = client->exp;
815
    struct nbd_request request;
816
    struct nbd_reply reply;
817
    ssize_t ret;
818

    
819
    TRACE("Reading request.");
820

    
821
    ret = nbd_co_receive_request(req, &request);
822
    if (ret == -EAGAIN) {
823
        goto done;
824
    }
825
    if (ret == -EIO) {
826
        goto out;
827
    }
828

    
829
    reply.handle = request.handle;
830
    reply.error = 0;
831

    
832
    if (ret < 0) {
833
        reply.error = -ret;
834
        goto error_reply;
835
    }
836

    
837
    if ((request.from + request.len) > exp->size) {
838
            LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
839
            ", Offset: %" PRIu64 "\n",
840
                    request.from, request.len,
841
                    (uint64_t)exp->size, (uint64_t)exp->dev_offset);
842
        LOG("requested operation past EOF--bad client?");
843
        goto invalid_request;
844
    }
845

    
846
    switch (request.type & NBD_CMD_MASK_COMMAND) {
847
    case NBD_CMD_READ:
848
        TRACE("Request type is READ");
849

    
850
        if (request.type & NBD_CMD_FLAG_FUA) {
851
            ret = bdrv_co_flush(exp->bs);
852
            if (ret < 0) {
853
                LOG("flush failed");
854
                reply.error = -ret;
855
                goto error_reply;
856
            }
857
        }
858

    
859
        ret = bdrv_read(exp->bs, (request.from + exp->dev_offset) / 512,
860
                        req->data, request.len / 512);
861
        if (ret < 0) {
862
            LOG("reading from file failed");
863
            reply.error = -ret;
864
            goto error_reply;
865
        }
866

    
867
        TRACE("Read %u byte(s)", request.len);
868
        if (nbd_co_send_reply(req, &reply, request.len) < 0)
869
            goto out;
870
        break;
871
    case NBD_CMD_WRITE:
872
        TRACE("Request type is WRITE");
873

    
874
        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
875
            TRACE("Server is read-only, return error");
876
            reply.error = EROFS;
877
            goto error_reply;
878
        }
879

    
880
        TRACE("Writing to device");
881

    
882
        ret = bdrv_write(exp->bs, (request.from + exp->dev_offset) / 512,
883
                         req->data, request.len / 512);
884
        if (ret < 0) {
885
            LOG("writing to file failed");
886
            reply.error = -ret;
887
            goto error_reply;
888
        }
889

    
890
        if (request.type & NBD_CMD_FLAG_FUA) {
891
            ret = bdrv_co_flush(exp->bs);
892
            if (ret < 0) {
893
                LOG("flush failed");
894
                reply.error = -ret;
895
                goto error_reply;
896
            }
897
        }
898

    
899
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
900
            goto out;
901
        }
902
        break;
903
    case NBD_CMD_DISC:
904
        TRACE("Request type is DISCONNECT");
905
        errno = 0;
906
        goto out;
907
    case NBD_CMD_FLUSH:
908
        TRACE("Request type is FLUSH");
909

    
910
        ret = bdrv_co_flush(exp->bs);
911
        if (ret < 0) {
912
            LOG("flush failed");
913
            reply.error = -ret;
914
        }
915
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
916
            goto out;
917
        }
918
        break;
919
    case NBD_CMD_TRIM:
920
        TRACE("Request type is TRIM");
921
        ret = bdrv_co_discard(exp->bs, (request.from + exp->dev_offset) / 512,
922
                              request.len / 512);
923
        if (ret < 0) {
924
            LOG("discard failed");
925
            reply.error = -ret;
926
        }
927
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
928
            goto out;
929
        }
930
        break;
931
    default:
932
        LOG("invalid request type (%u) received", request.type);
933
    invalid_request:
934
        reply.error = -EINVAL;
935
    error_reply:
936
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
937
            goto out;
938
        }
939
        break;
940
    }
941

    
942
    TRACE("Request/Reply complete");
943

    
944
done:
945
    nbd_request_put(req);
946
    return;
947

    
948
out:
949
    nbd_request_put(req);
950
    nbd_client_close(client);
951
}
952

    
953
static int nbd_can_read(void *opaque)
954
{
955
    NBDClient *client = opaque;
956

    
957
    return client->recv_coroutine || client->nb_requests < MAX_NBD_REQUESTS;
958
}
959

    
960
static void nbd_read(void *opaque)
961
{
962
    NBDClient *client = opaque;
963

    
964
    if (client->recv_coroutine) {
965
        qemu_coroutine_enter(client->recv_coroutine, NULL);
966
    } else {
967
        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
968
    }
969
}
970

    
971
static void nbd_restart_write(void *opaque)
972
{
973
    NBDClient *client = opaque;
974

    
975
    qemu_coroutine_enter(client->send_coroutine, NULL);
976
}
977

    
978
NBDClient *nbd_client_new(NBDExport *exp, int csock,
979
                          void (*close)(NBDClient *))
980
{
981
    NBDClient *client;
982
    client = g_malloc0(sizeof(NBDClient));
983
    client->refcount = 1;
984
    client->exp = exp;
985
    client->sock = csock;
986
    if (nbd_send_negotiate(client) < 0) {
987
        g_free(client);
988
        return NULL;
989
    }
990
    client->close = close;
991
    qemu_co_mutex_init(&client->send_lock);
992
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
993
    return client;
994
}