Statistics
| Branch: | Revision:

root / nbd.c @ 2c8d9f06

History | View | Annotate | Download (25.5 kB)

1
/*
2
 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
3
 *
4
 *  Network Block Device
5
 *
6
 *  This program is free software; you can redistribute it and/or modify
7
 *  it under the terms of the GNU General Public License as published by
8
 *  the Free Software Foundation; under version 2 of the License.
9
 *
10
 *  This program is distributed in the hope that it will be useful,
11
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 *  GNU General Public License for more details.
14
 *
15
 *  You should have received a copy of the GNU General Public License
16
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
17
 */
18

    
19
#include "nbd.h"
20
#include "block.h"
21

    
22
#include "qemu-coroutine.h"
23

    
24
#include <errno.h>
25
#include <string.h>
26
#ifndef _WIN32
27
#include <sys/ioctl.h>
28
#endif
29
#if defined(__sun__) || defined(__HAIKU__)
30
#include <sys/ioccom.h>
31
#endif
32
#include <ctype.h>
33
#include <inttypes.h>
34

    
35
#ifdef __linux__
36
#include <linux/fs.h>
37
#endif
38

    
39
#include "qemu_socket.h"
40
#include "qemu-queue.h"
41

    
42
//#define DEBUG_NBD
43

    
44
/*
 * TRACE() forwards its arguments to LOG() when DEBUG_NBD is defined.
 * Otherwise it expands to an empty statement and the arguments are dropped.
 */
#ifndef DEBUG_NBD
#define TRACE(msg, ...) do { } while (0)
#else
#define TRACE(msg, ...) do { LOG(msg, ## __VA_ARGS__); } while (0)
#endif
52

    
53
/*
 * LOG() writes a message to stderr, prefixed with the file, function and
 * line of the call site and followed by a newline.  `msg` must be a string
 * literal because it is pasted into the format string.
 */
#define LOG(msg, ...)                                                  \
    do {                                                               \
        fprintf(stderr, "%s:%s():L%d: " msg "\n",                      \
                __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__);     \
    } while (0)
57

    
58
/* This is all part of the "official" NBD API */
59

    
60
/* Size in bytes of the fixed headers exchanged on the wire. */
#define NBD_REQUEST_SIZE        (4 + 4 + 8 + 8 + 4) /* magic, type, handle, from, len */
#define NBD_REPLY_SIZE          (4 + 4 + 8)         /* magic, error, handle */

/* Magic numbers identifying requests, replies and the two handshakes. */
#define NBD_REQUEST_MAGIC       0x25609513
#define NBD_REPLY_MAGIC         0x67446698
#define NBD_OPTS_MAGIC          0x49484156454F5054LL /* ASCII "IHAVEOPT" */
#define NBD_CLIENT_MAGIC        0x0000420281861253LL

/* ioctl request codes issued on the NBD device node (see nbd_init() etc.). */
#define NBD_SET_SOCK            _IO(0xab, 0)
#define NBD_SET_BLKSIZE         _IO(0xab, 1)
#define NBD_SET_SIZE            _IO(0xab, 2)
#define NBD_DO_IT               _IO(0xab, 3)
#define NBD_CLEAR_SOCK          _IO(0xab, 4)
#define NBD_CLEAR_QUE           _IO(0xab, 5)
#define NBD_PRINT_DEBUG         _IO(0xab, 6)
#define NBD_SET_SIZE_BLOCKS     _IO(0xab, 7)
#define NBD_DISCONNECT          _IO(0xab, 8)
#define NBD_SET_TIMEOUT         _IO(0xab, 9)
#define NBD_SET_FLAGS           _IO(0xab, 10)

/* Option code sent by nbd_receive_negotiate() to select a named export. */
#define NBD_OPT_EXPORT_NAME     (1 << 0)
80

    
81
/* Definitions for opaque data types */
82

    
83
typedef struct NBDRequest NBDRequest;
84

    
85
struct NBDRequest {
86
    QSIMPLEQ_ENTRY(NBDRequest) entry;
87
    NBDClient *client;
88
    uint8_t *data;
89
};
90

    
91
struct NBDExport {
92
    int refcount;
93
    BlockDriverState *bs;
94
    off_t dev_offset;
95
    off_t size;
96
    uint32_t nbdflags;
97
    QSIMPLEQ_HEAD(, NBDRequest) requests;
98
};
99

    
100
struct NBDClient {
101
    int refcount;
102
    void (*close)(NBDClient *client);
103

    
104
    NBDExport *exp;
105
    int sock;
106

    
107
    Coroutine *recv_coroutine;
108

    
109
    CoMutex send_lock;
110
    Coroutine *send_coroutine;
111

    
112
    int nb_requests;
113
    bool closing;
114
};
115

    
116
/* That's all folks */
117

    
118
/*
 * Transfer up to `size` bytes between `fd` and `buffer`.
 *
 * Inside a coroutine the work is delegated to qemu_co_recv()/qemu_co_send().
 * Otherwise the function loops until everything is transferred, the peer
 * reports end-of-file, or an unrecoverable error occurs.
 *
 * @do_read: true to receive into `buffer`, false to send from it.
 *
 * Returns the number of bytes transferred (possibly short on EOF), or a
 * negative errno value on failure.
 */
ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
{
    size_t done = 0;

    if (qemu_in_coroutine()) {
        return do_read ? qemu_co_recv(fd, buffer, size)
                       : qemu_co_send(fd, buffer, size);
    }

    while (done < size) {
        ssize_t n;
        int err;

        if (do_read) {
            n = qemu_recv(fd, buffer + done, size - done, 0);
        } else {
            n = send(fd, buffer + done, size - done, 0);
        }

        if (n == 0) {
            /* eof */
            break;
        }

        if (n > 0) {
            done += n;
            continue;
        }

        err = socket_error();

        /* EINTR is always retried; EAGAIN only once some data has moved. */
        if (err == EINTR || (err == EAGAIN && done > 0)) {
            continue;
        }

        /* unrecoverable error */
        return -err;
    }

    return done;
}
162

    
163
/*
 * Receive `size` bytes from `fd` into `buffer` via nbd_wr_sync().
 *
 * During negotiation the socket is in blocking mode.  Afterwards, a socket
 * that is not readable means somebody else consumed our request/reply;
 * recv_coroutine provides the synchronization that keeps this coroutine-safe.
 */
static ssize_t read_sync(int fd, void *buffer, size_t size)
{
    const bool do_read = true;

    return nbd_wr_sync(fd, buffer, size, do_read);
}
172

    
173
/*
 * Send `size` bytes from `buffer` to `fd`, retrying for as long as
 * nbd_wr_sync() reports -EAGAIN (writes are expected to find the socket
 * writable).
 *
 * Returns what nbd_wr_sync() returned: the number of bytes written or a
 * negative errno value.  The result is kept in an ssize_t so that it is not
 * truncated on its way back to the caller.
 */
static ssize_t write_sync(int fd, void *buffer, size_t size)
{
    ssize_t ret;

    do {
        /* For writes, we do expect the socket to be writable.  */
        ret = nbd_wr_sync(fd, buffer, size, false);
    } while (ret == -EAGAIN);

    return ret;
}
182

    
183
/*
 * Format `address` and `port` as "address:port" into `buf` (at most `len`
 * bytes including the terminator).  An address that itself contains a colon
 * is taken to be an IPv6 literal and is wrapped in square brackets.
 */
static void combine_addr(char *buf, size_t len, const char* address,
                         uint16_t port)
{
    if (strchr(address, ':') != NULL) {
        snprintf(buf, len, "[%s]:%u", address, port);
        return;
    }

    snprintf(buf, len, "%s:%u", address, port);
}
193

    
194
/*
 * Connect to `address`:`port`.  Returns whatever
 * tcp_socket_outgoing_spec() returns for the combined "host:port" string.
 */
int tcp_socket_outgoing(const char *address, uint16_t port)
{
    char spec[128];

    combine_addr(spec, sizeof(spec), address, port);

    return tcp_socket_outgoing_spec(spec);
}
200

    
201
/*
 * Connect to the endpoint described by `address_and_port` and return the
 * result of inet_connect().
 */
int tcp_socket_outgoing_spec(const char *address_and_port)
{
    int fd = inet_connect(address_and_port, true, NULL, NULL);

    return fd;
}
205

    
206
/*
 * Listen on `address`:`port`.  Returns whatever
 * tcp_socket_incoming_spec() returns for the combined "host:port" string.
 */
int tcp_socket_incoming(const char *address, uint16_t port)
{
    char spec[128];

    combine_addr(spec, sizeof(spec), address, port);

    return tcp_socket_incoming_spec(spec);
}
212

    
213
/*
 * Create a listening stream socket for `address_and_port` and return the
 * result of inet_listen().  No output buffer is supplied for the address
 * actually bound.
 */
int tcp_socket_incoming_spec(const char *address_and_port)
{
    return inet_listen(address_and_port, NULL, 0, SOCK_STREAM, 0, NULL);
}
219

    
220
/*
 * Create a listening UNIX socket at `path` and return the result of
 * unix_listen().  No output buffer is supplied for the resulting path.
 */
int unix_socket_incoming(const char *path)
{
    return unix_listen(path, NULL, 0);
}
227

    
228
/* Connect to the UNIX socket at `path`; returns the result of unix_connect(). */
int unix_socket_outgoing(const char *path)
{
    int fd = unix_connect(path);

    return fd;
}
232

    
233
/* Basic flow
234

235
   Server         Client
236

237
   Negotiate
238
                  Request
239
   Response
240
                  Request
241
   Response
242
                  ...
243
   ...
244
                  Request (type == 2)
245
*/
246

    
247
/*
 * Send the server greeting to a freshly connected client.
 *
 * The socket is switched to blocking mode for the duration of the write and
 * back to non-blocking mode before returning.
 *
 * Returns 0 on success, -EINVAL if the greeting could not be written in full.
 */
static int nbd_send_negotiate(NBDClient *client)
{
    int csock = client->sock;
    char buf[8 + 8 + 8 + 128];
    uint32_t flags;
    int rc = -EINVAL;

    /* Negotiate
        [ 0 ..   7]   passwd   ("NBDMAGIC")
        [ 8 ..  15]   magic    (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  27]   flags
        [28 .. 151]   reserved (0)
     */

    socket_set_block(csock);

    TRACE("Beginning negotiation.");

    flags = client->exp->nbdflags | NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
            NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA;

    /* Zero everything first; this also covers the reserved tail. */
    memset(buf, 0, sizeof(buf));
    memcpy(buf, "NBDMAGIC", 8);
    cpu_to_be64w((uint64_t *)(buf + 8), NBD_CLIENT_MAGIC);
    cpu_to_be64w((uint64_t *)(buf + 16), client->exp->size);
    cpu_to_be32w((uint32_t *)(buf + 24), flags);

    if (write_sync(csock, buf, sizeof(buf)) == sizeof(buf)) {
        TRACE("Negotiation succeeded.");
        rc = 0;
    } else {
        LOG("write failed");
    }

    socket_set_nonblock(csock);
    return rc;
}
284

    
285
int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
286
                          off_t *size, size_t *blocksize)
287
{
288
    char buf[256];
289
    uint64_t magic, s;
290
    uint16_t tmp;
291
    int rc;
292

    
293
    TRACE("Receiving negotiation.");
294

    
295
    socket_set_block(csock);
296
    rc = -EINVAL;
297

    
298
    if (read_sync(csock, buf, 8) != 8) {
299
        LOG("read failed");
300
        goto fail;
301
    }
302

    
303
    buf[8] = '\0';
304
    if (strlen(buf) == 0) {
305
        LOG("server connection closed");
306
        goto fail;
307
    }
308

    
309
    TRACE("Magic is %c%c%c%c%c%c%c%c",
310
          qemu_isprint(buf[0]) ? buf[0] : '.',
311
          qemu_isprint(buf[1]) ? buf[1] : '.',
312
          qemu_isprint(buf[2]) ? buf[2] : '.',
313
          qemu_isprint(buf[3]) ? buf[3] : '.',
314
          qemu_isprint(buf[4]) ? buf[4] : '.',
315
          qemu_isprint(buf[5]) ? buf[5] : '.',
316
          qemu_isprint(buf[6]) ? buf[6] : '.',
317
          qemu_isprint(buf[7]) ? buf[7] : '.');
318

    
319
    if (memcmp(buf, "NBDMAGIC", 8) != 0) {
320
        LOG("Invalid magic received");
321
        goto fail;
322
    }
323

    
324
    if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
325
        LOG("read failed");
326
        goto fail;
327
    }
328
    magic = be64_to_cpu(magic);
329
    TRACE("Magic is 0x%" PRIx64, magic);
330

    
331
    if (name) {
332
        uint32_t reserved = 0;
333
        uint32_t opt;
334
        uint32_t namesize;
335

    
336
        TRACE("Checking magic (opts_magic)");
337
        if (magic != NBD_OPTS_MAGIC) {
338
            LOG("Bad magic received");
339
            goto fail;
340
        }
341
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
342
            LOG("flags read failed");
343
            goto fail;
344
        }
345
        *flags = be16_to_cpu(tmp) << 16;
346
        /* reserved for future use */
347
        if (write_sync(csock, &reserved, sizeof(reserved)) !=
348
            sizeof(reserved)) {
349
            LOG("write failed (reserved)");
350
            goto fail;
351
        }
352
        /* write the export name */
353
        magic = cpu_to_be64(magic);
354
        if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
355
            LOG("write failed (magic)");
356
            goto fail;
357
        }
358
        opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
359
        if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
360
            LOG("write failed (opt)");
361
            goto fail;
362
        }
363
        namesize = cpu_to_be32(strlen(name));
364
        if (write_sync(csock, &namesize, sizeof(namesize)) !=
365
            sizeof(namesize)) {
366
            LOG("write failed (namesize)");
367
            goto fail;
368
        }
369
        if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
370
            LOG("write failed (name)");
371
            goto fail;
372
        }
373
    } else {
374
        TRACE("Checking magic (cli_magic)");
375

    
376
        if (magic != NBD_CLIENT_MAGIC) {
377
            LOG("Bad magic received");
378
            goto fail;
379
        }
380
    }
381

    
382
    if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
383
        LOG("read failed");
384
        goto fail;
385
    }
386
    *size = be64_to_cpu(s);
387
    *blocksize = 1024;
388
    TRACE("Size is %" PRIu64, *size);
389

    
390
    if (!name) {
391
        if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
392
            LOG("read failed (flags)");
393
            goto fail;
394
        }
395
        *flags = be32_to_cpup(flags);
396
    } else {
397
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
398
            LOG("read failed (tmp)");
399
            goto fail;
400
        }
401
        *flags |= be32_to_cpu(tmp);
402
    }
403
    if (read_sync(csock, &buf, 124) != 124) {
404
        LOG("read failed (buf)");
405
        goto fail;
406
    }
407
    rc = 0;
408

    
409
fail:
410
    socket_set_nonblock(csock);
411
    return rc;
412
}
413

    
414
#ifdef __linux__
415
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
416
{
417
    TRACE("Setting NBD socket");
418

    
419
    if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
420
        int serrno = errno;
421
        LOG("Failed to set NBD socket");
422
        return -serrno;
423
    }
424

    
425
    TRACE("Setting block size to %lu", (unsigned long)blocksize);
426

    
427
    if (ioctl(fd, NBD_SET_BLKSIZE, blocksize) < 0) {
428
        int serrno = errno;
429
        LOG("Failed setting NBD block size");
430
        return -serrno;
431
    }
432

    
433
        TRACE("Setting size to %zd block(s)", (size_t)(size / blocksize));
434

    
435
    if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / blocksize) < 0) {
436
        int serrno = errno;
437
        LOG("Failed setting size (in blocks)");
438
        return -serrno;
439
    }
440

    
441
    if (flags & NBD_FLAG_READ_ONLY) {
442
        int read_only = 1;
443
        TRACE("Setting readonly attribute");
444

    
445
        if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
446
            int serrno = errno;
447
            LOG("Failed setting read-only attribute");
448
            return -serrno;
449
        }
450
    }
451

    
452
    if (ioctl(fd, NBD_SET_FLAGS, flags) < 0
453
        && errno != ENOTTY) {
454
        int serrno = errno;
455
        LOG("Failed setting flags");
456
        return -serrno;
457
    }
458

    
459
    TRACE("Negotiation ended");
460

    
461
    return 0;
462
}
463

    
464
int nbd_disconnect(int fd)
465
{
466
    ioctl(fd, NBD_CLEAR_QUE);
467
    ioctl(fd, NBD_DISCONNECT);
468
    ioctl(fd, NBD_CLEAR_SOCK);
469
    return 0;
470
}
471

    
472
int nbd_client(int fd)
473
{
474
    int ret;
475
    int serrno;
476

    
477
    TRACE("Doing NBD loop");
478

    
479
    ret = ioctl(fd, NBD_DO_IT);
480
    if (ret < 0 && errno == EPIPE) {
481
        /* NBD_DO_IT normally returns EPIPE when someone has disconnected
482
         * the socket via NBD_DISCONNECT.  We do not want to return 1 in
483
         * that case.
484
         */
485
        ret = 0;
486
    }
487
    serrno = errno;
488

    
489
    TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
490

    
491
    TRACE("Clearing NBD queue");
492
    ioctl(fd, NBD_CLEAR_QUE);
493

    
494
    TRACE("Clearing NBD socket");
495
    ioctl(fd, NBD_CLEAR_SOCK);
496

    
497
    errno = serrno;
498
    return ret;
499
}
500
#else
501
/* Fallback used when __linux__ is not defined: always fails with -ENOTSUP. */
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
{
    const int unsupported = -ENOTSUP;

    return unsupported;
}
505

    
506
/* Fallback used when __linux__ is not defined: always fails with -ENOTSUP. */
int nbd_disconnect(int fd)
{
    const int unsupported = -ENOTSUP;

    return unsupported;
}
510

    
511
/* Fallback used when __linux__ is not defined: always fails with -ENOTSUP. */
int nbd_client(int fd)
{
    const int unsupported = -ENOTSUP;

    return unsupported;
}
515
#endif
516

    
517
ssize_t nbd_send_request(int csock, struct nbd_request *request)
518
{
519
    uint8_t buf[NBD_REQUEST_SIZE];
520
    ssize_t ret;
521

    
522
    cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
523
    cpu_to_be32w((uint32_t*)(buf + 4), request->type);
524
    cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
525
    cpu_to_be64w((uint64_t*)(buf + 16), request->from);
526
    cpu_to_be32w((uint32_t*)(buf + 24), request->len);
527

    
528
    TRACE("Sending request to client: "
529
          "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
530
          request->from, request->len, request->handle, request->type);
531

    
532
    ret = write_sync(csock, buf, sizeof(buf));
533
    if (ret < 0) {
534
        return ret;
535
    }
536

    
537
    if (ret != sizeof(buf)) {
538
        LOG("writing to socket failed");
539
        return -EINVAL;
540
    }
541
    return 0;
542
}
543

    
544
static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
545
{
546
    uint8_t buf[NBD_REQUEST_SIZE];
547
    uint32_t magic;
548
    ssize_t ret;
549

    
550
    ret = read_sync(csock, buf, sizeof(buf));
551
    if (ret < 0) {
552
        return ret;
553
    }
554

    
555
    if (ret != sizeof(buf)) {
556
        LOG("read failed");
557
        return -EINVAL;
558
    }
559

    
560
    /* Request
561
       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
562
       [ 4 ..  7]   type    (0 == READ, 1 == WRITE)
563
       [ 8 .. 15]   handle
564
       [16 .. 23]   from
565
       [24 .. 27]   len
566
     */
567

    
568
    magic = be32_to_cpup((uint32_t*)buf);
569
    request->type  = be32_to_cpup((uint32_t*)(buf + 4));
570
    request->handle = be64_to_cpup((uint64_t*)(buf + 8));
571
    request->from  = be64_to_cpup((uint64_t*)(buf + 16));
572
    request->len   = be32_to_cpup((uint32_t*)(buf + 24));
573

    
574
    TRACE("Got request: "
575
          "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
576
          magic, request->type, request->from, request->len);
577

    
578
    if (magic != NBD_REQUEST_MAGIC) {
579
        LOG("invalid magic (got 0x%x)", magic);
580
        return -EINVAL;
581
    }
582
    return 0;
583
}
584

    
585
ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
586
{
587
    uint8_t buf[NBD_REPLY_SIZE];
588
    uint32_t magic;
589
    ssize_t ret;
590

    
591
    ret = read_sync(csock, buf, sizeof(buf));
592
    if (ret < 0) {
593
        return ret;
594
    }
595

    
596
    if (ret != sizeof(buf)) {
597
        LOG("read failed");
598
        return -EINVAL;
599
    }
600

    
601
    /* Reply
602
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
603
       [ 4 ..  7]    error   (0 == no error)
604
       [ 7 .. 15]    handle
605
     */
606

    
607
    magic = be32_to_cpup((uint32_t*)buf);
608
    reply->error  = be32_to_cpup((uint32_t*)(buf + 4));
609
    reply->handle = be64_to_cpup((uint64_t*)(buf + 8));
610

    
611
    TRACE("Got reply: "
612
          "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
613
          magic, reply->error, reply->handle);
614

    
615
    if (magic != NBD_REPLY_MAGIC) {
616
        LOG("invalid magic (got 0x%x)", magic);
617
        return -EINVAL;
618
    }
619
    return 0;
620
}
621

    
622
static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
623
{
624
    uint8_t buf[NBD_REPLY_SIZE];
625
    ssize_t ret;
626

    
627
    /* Reply
628
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
629
       [ 4 ..  7]    error   (0 == no error)
630
       [ 7 .. 15]    handle
631
     */
632
    cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
633
    cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
634
    cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);
635

    
636
    TRACE("Sending response to client");
637

    
638
    ret = write_sync(csock, buf, sizeof(buf));
639
    if (ret < 0) {
640
        return ret;
641
    }
642

    
643
    if (ret != sizeof(buf)) {
644
        LOG("writing to socket failed");
645
        return -EINVAL;
646
    }
647
    return 0;
648
}
649

    
650
/* Upper bound on NBDClient.nb_requests, i.e. requests in flight per client. */
#define MAX_NBD_REQUESTS 16
651

    
652
/* Take an additional reference on `client`. */
void nbd_client_get(NBDClient *client)
{
    client->refcount += 1;
}
656

    
657
/*
 * Drop one reference on `client`.  When the last reference goes away the
 * fd handlers are removed, the socket is closed, the reference on the
 * export is released and the client is freed.
 */
void nbd_client_put(NBDClient *client)
{
    client->refcount--;
    if (client->refcount != 0) {
        return;
    }

    /* The last reference should be dropped by client->close,
     * which is called by nbd_client_close.
     */
    assert(client->closing);

    qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
    close(client->sock);
    client->sock = -1;
    nbd_export_put(client->exp);
    g_free(client);
}
672

    
673
/*
 * Start closing `client`.  Only the first call has any effect: it marks the
 * client as closing, shuts the socket down in both directions and invokes
 * the optional close callback.
 */
void nbd_client_close(NBDClient *client)
{
    if (!client->closing) {
        client->closing = true;

        /* Force requests to finish.  They will drop their own references,
         * then we'll close the socket and free the NBDClient.
         */
        shutdown(client->sock, 2);

        /* Also tell the client, so that they release their reference.  */
        if (client->close) {
            client->close(client);
        }
    }
}
691

    
692
/*
 * Obtain a request object for `client`, reusing an idle one from the
 * export's list when available and allocating a new one (with an
 * NBD_BUFFER_SIZE data buffer) otherwise.  The request holds a reference on
 * the client and counts towards client->nb_requests.
 */
static NBDRequest *nbd_request_get(NBDClient *client)
{
    NBDExport *exp = client->exp;
    NBDRequest *req;

    assert(client->nb_requests < MAX_NBD_REQUESTS);
    client->nb_requests++;

    if (!QSIMPLEQ_EMPTY(&exp->requests)) {
        req = QSIMPLEQ_FIRST(&exp->requests);
        QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
    } else {
        req = g_malloc0(sizeof(NBDRequest));
        req->data = qemu_blockalign(exp->bs, NBD_BUFFER_SIZE);
    }

    nbd_client_get(client);
    req->client = client;
    return req;
}
711

    
712
/*
 * Return `req` to its export's idle list and release the client reference
 * it held.  If the client was at MAX_NBD_REQUESTS before this call, the
 * main loop is notified.
 */
static void nbd_request_put(NBDRequest *req)
{
    NBDClient *client = req->client;
    bool was_full = (client->nb_requests == MAX_NBD_REQUESTS);

    QSIMPLEQ_INSERT_HEAD(&client->exp->requests, req, entry);

    client->nb_requests--;
    if (was_full) {
        qemu_notify_event();
    }

    nbd_client_put(client);
}
721

    
722
/*
 * Create an export for `bs` with an initial reference count of 1.
 *
 * @dev_offset: byte offset into the device at which the export starts.
 * @size:       export size in bytes, or -1 to use bdrv_getlength(bs).
 * @nbdflags:   flags to advertise to clients.
 */
NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
                          off_t size, uint32_t nbdflags)
{
    NBDExport *exp = g_malloc0(sizeof(*exp));

    QSIMPLEQ_INIT(&exp->requests);
    exp->refcount = 1;
    exp->bs = bs;
    exp->dev_offset = dev_offset;
    exp->nbdflags = nbdflags;

    if (size == -1) {
        exp->size = bdrv_getlength(bs);
    } else {
        exp->size = size;
    }

    return exp;
}
734

    
735
/*
 * Placeholder: currently only checks that the caller holds the sole
 * remaining reference on `exp`.
 */
void nbd_export_close(NBDExport *exp)
{
    assert(exp->refcount == 1);

    /* stub */
}
741

    
742
/* Take an additional reference on `exp`, which must still be alive. */
void nbd_export_get(NBDExport *exp)
{
    assert(exp->refcount > 0);
    exp->refcount += 1;
}
747

    
748
/*
 * Drop one reference on `exp`.  nbd_export_close() is called first when
 * this is the last reference; once the count reaches zero all idle request
 * objects and the export itself are freed.
 */
void nbd_export_put(NBDExport *exp)
{
    assert(exp->refcount > 0);

    if (exp->refcount == 1) {
        nbd_export_close(exp);
    }

    exp->refcount--;
    if (exp->refcount != 0) {
        return;
    }

    while (!QSIMPLEQ_EMPTY(&exp->requests)) {
        NBDRequest *idle = QSIMPLEQ_FIRST(&exp->requests);

        QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
        qemu_vfree(idle->data);
        g_free(idle);
    }

    g_free(exp);
}
766

    
767
/* fd handlers installed on the client socket; defined further below. */
static int nbd_can_read(void *opaque);
static void nbd_read(void *opaque);
static void nbd_restart_write(void *opaque);
770

    
771
/*
 * Send `reply`, followed by `len` bytes of req->data when `len` is
 * non-zero, while holding the client's send lock.  The socket is corked
 * around a header-plus-payload reply.  The write handler is installed only
 * for the duration of the send.
 *
 * Returns the result of nbd_send_reply(), or -EIO if the payload could not
 * be sent in full.
 */
static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
                                 int len)
{
    NBDClient *client = req->client;
    int csock = client->sock;
    ssize_t rc;

    qemu_co_mutex_lock(&client->send_lock);
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
                         nbd_restart_write, client);
    client->send_coroutine = qemu_coroutine_self();

    if (len) {
        socket_set_cork(csock, 1);
    }

    rc = nbd_send_reply(csock, reply);

    if (len) {
        if (rc >= 0) {
            ssize_t sent = qemu_co_send(csock, req->data, len);

            if (sent != len) {
                rc = -EIO;
            }
        }
        socket_set_cork(csock, 0);
    }

    client->send_coroutine = NULL;
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
    qemu_co_mutex_unlock(&client->send_lock);
    return rc;
}
802

    
803
/*
 * Receive one request header and, for writes, its payload into req->data.
 * client->recv_coroutine points at the running coroutine for the duration
 * of the call.
 *
 * Returns 0 on success, -EAGAIN if the header read reported it, -EIO on any
 * other header or payload read failure, and -EINVAL for a request whose
 * length exceeds NBD_BUFFER_SIZE or whose from + len wraps around.
 */
static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
{
    NBDClient *client = req->client;
    int csock = client->sock;
    ssize_t rc;

    client->recv_coroutine = qemu_coroutine_self();

    rc = nbd_receive_request(csock, request);
    if (rc < 0) {
        /* Everything except "try again" is reported as an I/O error. */
        rc = (rc == -EAGAIN) ? -EAGAIN : -EIO;
    } else if (request->len > NBD_BUFFER_SIZE) {
        LOG("len (%u) is larger than max len (%u)",
            request->len, NBD_BUFFER_SIZE);
        rc = -EINVAL;
    } else if ((request->from + request->len) < request->from) {
        LOG("integer overflow detected! "
            "you're probably being attacked");
        rc = -EINVAL;
    } else {
        TRACE("Decoding type");

        rc = 0;
        if ((request->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
            TRACE("Reading %u byte(s)", request->len);

            if (qemu_co_recv(csock, req->data, request->len) != request->len) {
                LOG("reading from socket failed");
                rc = -EIO;
            }
        }
    }

    client->recv_coroutine = NULL;
    return rc;
}
849

    
850
static void nbd_trip(void *opaque)
851
{
852
    NBDClient *client = opaque;
853
    NBDExport *exp = client->exp;
854
    NBDRequest *req;
855
    struct nbd_request request;
856
    struct nbd_reply reply;
857
    ssize_t ret;
858

    
859
    TRACE("Reading request.");
860
    if (client->closing) {
861
        return;
862
    }
863

    
864
    req = nbd_request_get(client);
865
    ret = nbd_co_receive_request(req, &request);
866
    if (ret == -EAGAIN) {
867
        goto done;
868
    }
869
    if (ret == -EIO) {
870
        goto out;
871
    }
872

    
873
    reply.handle = request.handle;
874
    reply.error = 0;
875

    
876
    if (ret < 0) {
877
        reply.error = -ret;
878
        goto error_reply;
879
    }
880

    
881
    if ((request.from + request.len) > exp->size) {
882
            LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
883
            ", Offset: %" PRIu64 "\n",
884
                    request.from, request.len,
885
                    (uint64_t)exp->size, (uint64_t)exp->dev_offset);
886
        LOG("requested operation past EOF--bad client?");
887
        goto invalid_request;
888
    }
889

    
890
    switch (request.type & NBD_CMD_MASK_COMMAND) {
891
    case NBD_CMD_READ:
892
        TRACE("Request type is READ");
893

    
894
        if (request.type & NBD_CMD_FLAG_FUA) {
895
            ret = bdrv_co_flush(exp->bs);
896
            if (ret < 0) {
897
                LOG("flush failed");
898
                reply.error = -ret;
899
                goto error_reply;
900
            }
901
        }
902

    
903
        ret = bdrv_read(exp->bs, (request.from + exp->dev_offset) / 512,
904
                        req->data, request.len / 512);
905
        if (ret < 0) {
906
            LOG("reading from file failed");
907
            reply.error = -ret;
908
            goto error_reply;
909
        }
910

    
911
        TRACE("Read %u byte(s)", request.len);
912
        if (nbd_co_send_reply(req, &reply, request.len) < 0)
913
            goto out;
914
        break;
915
    case NBD_CMD_WRITE:
916
        TRACE("Request type is WRITE");
917

    
918
        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
919
            TRACE("Server is read-only, return error");
920
            reply.error = EROFS;
921
            goto error_reply;
922
        }
923

    
924
        TRACE("Writing to device");
925

    
926
        ret = bdrv_write(exp->bs, (request.from + exp->dev_offset) / 512,
927
                         req->data, request.len / 512);
928
        if (ret < 0) {
929
            LOG("writing to file failed");
930
            reply.error = -ret;
931
            goto error_reply;
932
        }
933

    
934
        if (request.type & NBD_CMD_FLAG_FUA) {
935
            ret = bdrv_co_flush(exp->bs);
936
            if (ret < 0) {
937
                LOG("flush failed");
938
                reply.error = -ret;
939
                goto error_reply;
940
            }
941
        }
942

    
943
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
944
            goto out;
945
        }
946
        break;
947
    case NBD_CMD_DISC:
948
        TRACE("Request type is DISCONNECT");
949
        errno = 0;
950
        goto out;
951
    case NBD_CMD_FLUSH:
952
        TRACE("Request type is FLUSH");
953

    
954
        ret = bdrv_co_flush(exp->bs);
955
        if (ret < 0) {
956
            LOG("flush failed");
957
            reply.error = -ret;
958
        }
959
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
960
            goto out;
961
        }
962
        break;
963
    case NBD_CMD_TRIM:
964
        TRACE("Request type is TRIM");
965
        ret = bdrv_co_discard(exp->bs, (request.from + exp->dev_offset) / 512,
966
                              request.len / 512);
967
        if (ret < 0) {
968
            LOG("discard failed");
969
            reply.error = -ret;
970
        }
971
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
972
            goto out;
973
        }
974
        break;
975
    default:
976
        LOG("invalid request type (%u) received", request.type);
977
    invalid_request:
978
        reply.error = -EINVAL;
979
    error_reply:
980
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
981
            goto out;
982
        }
983
        break;
984
    }
985

    
986
    TRACE("Request/Reply complete");
987

    
988
done:
989
    nbd_request_put(req);
990
    return;
991

    
992
out:
993
    nbd_request_put(req);
994
    nbd_client_close(client);
995
}
996

    
997
static int nbd_can_read(void *opaque)
998
{
999
    NBDClient *client = opaque;
1000

    
1001
    return client->recv_coroutine || client->nb_requests < MAX_NBD_REQUESTS;
1002
}
1003

    
1004
static void nbd_read(void *opaque)
1005
{
1006
    NBDClient *client = opaque;
1007

    
1008
    if (client->recv_coroutine) {
1009
        qemu_coroutine_enter(client->recv_coroutine, NULL);
1010
    } else {
1011
        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1012
    }
1013
}
1014

    
1015
static void nbd_restart_write(void *opaque)
1016
{
1017
    NBDClient *client = opaque;
1018

    
1019
    qemu_coroutine_enter(client->send_coroutine, NULL);
1020
}
1021

    
1022
/*
 * Create a client for `exp` on the connected socket `csock` and send it the
 * server greeting.
 *
 * @close: optional callback invoked by nbd_client_close().
 *
 * On success the new client (refcount 1) holds a reference on `exp` and has
 * its read handlers installed.  Returns NULL if negotiation fails.
 */
NBDClient *nbd_client_new(NBDExport *exp, int csock,
                          void (*close)(NBDClient *))
{
    NBDClient *client = g_malloc0(sizeof(*client));

    client->refcount = 1;
    client->exp = exp;
    client->sock = csock;

    if (nbd_send_negotiate(client) >= 0) {
        client->close = close;
        qemu_co_mutex_init(&client->send_lock);
        qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);

        nbd_export_get(exp);
        return client;
    }

    g_free(client);
    return NULL;
}