Statistics
| Branch: | Revision:

root / block / sheepdog.c @ 6defcc37

History | View | Annotate | Download (51.8 kB)

1
/*
2
 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3
 *
4
 * This program is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU General Public License version
6
 * 2 as published by the Free Software Foundation.
7
 *
8
 * You should have received a copy of the GNU General Public License
9
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
10
 */
11
#ifdef _WIN32
12
#include <windows.h>
13
#include <winsock2.h>
14
#include <ws2tcpip.h>
15
#else
16
#include <netdb.h>
17
#include <netinet/tcp.h>
18

    
19
#define closesocket(s) close(s)
20
#endif
21

    
22
#include "qemu-common.h"
23
#include "qemu-error.h"
24
#include "qemu_socket.h"
25
#include "block_int.h"
26

    
27
/* Protocol version placed in the proto_ver field of every request header. */
#define SD_PROTO_VER                0x01

/* Server address and port used when the caller supplies no address. */
#define SD_DEFAULT_ADDR             "localhost"
#define SD_DEFAULT_PORT             "7000"

/* Opcodes of the requests that operate on objects. */
#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
#define SD_OP_READ_OBJ              0x02
#define SD_OP_WRITE_OBJ             0x03

/* Opcodes of the requests that operate on vdis. */
#define SD_OP_NEW_VDI               0x11
#define SD_OP_LOCK_VDI              0x12
#define SD_OP_RELEASE_VDI           0x13
#define SD_OP_GET_VDI_INFO          0x14
#define SD_OP_READ_VDIS             0x15

/* Bits of the flags field of a request header. */
#define SD_FLAG_CMD_WRITE           0x01
#define SD_FLAG_CMD_COW             0x02
44

    
45
/*
 * Result codes carried in the `result' field of a response header.
 */
#define SD_RES_SUCCESS          0x00    /* success */
#define SD_RES_UNKNOWN          0x01    /* unknown error */
#define SD_RES_NO_OBJ           0x02    /* no object found */
#define SD_RES_EIO              0x03    /* I/O error */
#define SD_RES_VDI_EXIST        0x04    /* the vdi exists already */
#define SD_RES_INVALID_PARMS    0x05    /* invalid parameters */
#define SD_RES_SYSTEM_ERROR     0x06    /* system error */
#define SD_RES_VDI_LOCKED       0x07    /* the vdi is locked */
#define SD_RES_NO_VDI           0x08    /* no vdi found */
#define SD_RES_NO_BASE_VDI      0x09    /* no base vdi found */
#define SD_RES_VDI_READ         0x0A    /* cannot read the requested vdi */
#define SD_RES_VDI_WRITE        0x0B    /* cannot write the requested vdi */
#define SD_RES_BASE_VDI_READ    0x0C    /* cannot read the base vdi */
#define SD_RES_BASE_VDI_WRITE   0x0D    /* cannot write the base vdi */
#define SD_RES_NO_TAG           0x0E    /* the requested tag is not found */
#define SD_RES_STARTUP          0x0F    /* sheepdog is starting up */
#define SD_RES_VDI_NOT_LOCKED   0x10    /* the vdi is not locked */
#define SD_RES_SHUTDOWN         0x11    /* sheepdog is shutting down */
#define SD_RES_NO_MEM           0x12    /* cannot allocate memory */
#define SD_RES_FULL_VDI         0x13    /* the maximum number of vdis exists */
#define SD_RES_VER_MISMATCH     0x14    /* protocol version mismatch */
#define SD_RES_NO_SPACE         0x15    /* server has no room for new objects */
#define SD_RES_WAIT_FOR_FORMAT  0x16    /* waiting for a format operation */
#define SD_RES_WAIT_FOR_JOIN    0x17    /* waiting for other nodes to join */
#define SD_RES_JOIN_FAILED      0x18    /* target node failed to join sheepdog */
70

    
71
/*
 * Object ID rules
 *
 *  0 - 19 (20 bits): data object space
 * 20 - 31 (12 bits): reserved data object space
 * 32 - 55 (24 bits): vdi object space
 * 56 - 59 ( 4 bits): reserved vdi object space
 * 60 - 63 ( 4 bits): object type identifier space
 */

/* Bit position at which the vdi id starts inside an object id. */
#define VDI_SPACE_SHIFT     32

/* Object type bits: bit 63 marks a vdi object, bit 62 a vmstate object. */
#define VDI_BIT             (UINT64_C(1) << 63)
#define VMSTATE_BIT         (UINT64_C(1) << 62)

/* Limits that size the tables of SheepdogInode. */
#define MAX_DATA_OBJS       (UINT64_C(1) << 20)
#define MAX_CHILDREN        1024

/* Sizes of the fixed-width vdi name and tag buffers. */
#define SD_MAX_VDI_LEN      256
#define SD_MAX_VDI_TAG_LEN  256

#define SD_NR_VDIS          (1U << 24)

/* Size of one data object, and the vdi size that MAX_DATA_OBJS of them give. */
#define SD_DATA_OBJ_SIZE    (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE     (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)

#define SECTOR_SIZE         512

#define SD_INODE_SIZE       (sizeof(SheepdogInode))

/* Snapshot id that selects the current (non-snapshot) vdi. */
#define CURRENT_VDI_ID      0
95

    
96
/*
 * Generic request header.
 *
 * The specialised request headers (SheepdogObjReq, SheepdogVdiReq) have
 * the same size and share the first four 32-bit words with this one.
 */
typedef struct SheepdogReq {
    uint8_t proto_ver;              /* SD_PROTO_VER */
    uint8_t opcode;                 /* one of SD_OP_* */
    uint16_t flags;                 /* SD_FLAG_CMD_* bits */
    uint32_t epoch;
    uint32_t id;                    /* request id */
    uint32_t data_length;           /* length of the data that follows */
    uint32_t opcode_specific[8];    /* meaning depends on the opcode */
} SheepdogReq;
105

    
106
/*
 * Generic response header; it has the same size as SheepdogReq.
 */
typedef struct SheepdogRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;                    /* id of the request being answered */
    uint32_t data_length;           /* length of the data that follows */
    uint32_t result;                /* one of SD_RES_* */
    uint32_t opcode_specific[7];    /* meaning depends on the opcode */
} SheepdogRsp;
116

    
117
/*
 * Request header for the object operations (SD_OP_*_OBJ).
 */
typedef struct SheepdogObjReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;            /* echoed back in the response */
    uint32_t data_length;   /* number of bytes to read or write */
    uint64_t oid;           /* object to operate on */
    uint64_t cow_oid;       /* filled from AIOReq.base_oid */
    uint32_t copies;        /* filled from the inode's nr_copies */
    uint32_t rsvd;
    uint64_t offset;        /* byte offset inside the object */
} SheepdogObjReq;
130

    
131
/*
 * Response header for the object operations.
 */
typedef struct SheepdogObjRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;            /* id of the request being answered */
    uint32_t data_length;   /* length of the data that follows */
    uint32_t result;        /* one of SD_RES_* */
    uint32_t copies;
    uint32_t pad[6];        /* pads the header to the common size */
} SheepdogObjRsp;
142

    
143
/*
 * Request header for the vdi operations (SD_OP_*_VDI and friends).
 */
typedef struct SheepdogVdiReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;   /* length of the data that follows */
    uint64_t vdi_size;
    uint32_t base_vdi_id;
    uint32_t copies;
    uint32_t snapid;        /* snapshot id to look up */
    uint32_t pad[3];        /* pads the header to the common size */
} SheepdogVdiReq;
156

    
157
/*
 * Response header for the vdi operations.
 */
typedef struct SheepdogVdiRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;   /* length of the data that follows */
    uint32_t result;        /* one of SD_RES_* */
    uint32_t rsvd;
    uint32_t vdi_id;        /* id of the vdi the request resolved to */
    uint32_t pad[5];        /* pads the header to the common size */
} SheepdogVdiRsp;
169

    
170
typedef struct SheepdogInode {
171
    char name[SD_MAX_VDI_LEN];
172
    char tag[SD_MAX_VDI_TAG_LEN];
173
    uint64_t ctime;
174
    uint64_t snap_ctime;
175
    uint64_t vm_clock_nsec;
176
    uint64_t vdi_size;
177
    uint64_t vm_state_size;
178
    uint16_t copy_policy;
179
    uint8_t nr_copies;
180
    uint8_t block_size_shift;
181
    uint32_t snap_id;
182
    uint32_t vdi_id;
183
    uint32_t parent_vdi_id;
184
    uint32_t child_vdi_id[MAX_CHILDREN];
185
    uint32_t data_vdi_id[MAX_DATA_OBJS];
186
} SheepdogInode;
187

    
188
/*
 * 64 bit FNV-1a non-zero initial basis
 */
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)

/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
 *
 * Folds the `len' bytes at `buf' into the running hash `hval' and
 * returns the new hash.  Pass FNV1A_64_INIT as `hval' for the first
 * buffer and the previous return value to continue a hash.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bytes = buf;
    size_t i;

    for (i = 0; i < len; i++) {
        /* xor the low 8 bits with the next byte ... */
        hval ^= (uint64_t)bytes[i];
        /* ... then multiply by the 64 bit FNV prime (2^40 + 2^8 + 0xb3) */
        hval *= UINT64_C(0x100000001b3);
    }

    return hval;
}
207

    
208
/*
 * Return non-zero when the data object at index `idx' is recorded in
 * the inode as belonging to this very vdi.
 */
static inline int is_data_obj_writeable(SheepdogInode *inode, unsigned int idx)
{
    uint32_t owner = inode->data_vdi_id[idx];

    return owner == inode->vdi_id;
}
212

    
213
static inline int is_data_obj(uint64_t oid)
214
{
215
    return !(VDI_BIT & oid);
216
}
217

    
218
static inline uint64_t data_oid_to_idx(uint64_t oid)
219
{
220
    return oid & (MAX_DATA_OBJS - 1);
221
}
222

    
223
static inline uint64_t vid_to_vdi_oid(uint32_t vid)
224
{
225
    return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
226
}
227

    
228
static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
229
{
230
    return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
231
}
232

    
233
static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
234
{
235
    return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
236
}
237

    
238
static inline int is_snapshot(struct SheepdogInode *inode)
239
{
240
    return !!inode->snap_ctime;
241
}
242

    
243
/*
 * Debug trace macro.  It expands to nothing (its arguments are not
 * evaluated) unless DEBUG_SDOG is defined, in which case the message is
 * written to stdout prefixed with the function name and line number.
 */
#undef dprintf
#ifndef DEBUG_SDOG
#define dprintf(fmt, args...)
#else
#define dprintf(fmt, args...)                                       \
    do {                                                            \
        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
    } while (0)
#endif
252

    
253
typedef struct SheepdogAIOCB SheepdogAIOCB;
254

    
255
typedef struct AIOReq {
256
    SheepdogAIOCB *aiocb;
257
    unsigned int iov_offset;
258

    
259
    uint64_t oid;
260
    uint64_t base_oid;
261
    uint64_t offset;
262
    unsigned int data_len;
263
    uint8_t flags;
264
    uint32_t id;
265

    
266
    QLIST_ENTRY(AIOReq) outstanding_aio_siblings;
267
    QLIST_ENTRY(AIOReq) aioreq_siblings;
268
} AIOReq;
269

    
270
/* Kind of user I/O an AIOCB carries. */
enum AIOCBState {
    AIOCB_WRITE_UDATA = 0,  /* write of user data */
    AIOCB_READ_UDATA = 1,   /* read of user data */
};
274

    
275
struct SheepdogAIOCB {
276
    BlockDriverAIOCB common;
277

    
278
    QEMUIOVector *qiov;
279

    
280
    int64_t sector_num;
281
    int nb_sectors;
282

    
283
    int ret;
284
    enum AIOCBState aiocb_type;
285

    
286
    QEMUBH *bh;
287
    void (*aio_done_func)(SheepdogAIOCB *);
288

    
289
    int canceled;
290

    
291
    QLIST_HEAD(aioreq_head, AIOReq) aioreq_head;
292
};
293

    
294
typedef struct BDRVSheepdogState {
295
    SheepdogInode inode;
296

    
297
    uint32_t min_dirty_data_idx;
298
    uint32_t max_dirty_data_idx;
299

    
300
    char name[SD_MAX_VDI_LEN];
301
    int is_snapshot;
302

    
303
    char *addr;
304
    char *port;
305
    int fd;
306

    
307
    uint32_t aioreq_seq_num;
308
    QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
309
} BDRVSheepdogState;
310

    
311
static const char * sd_strerror(int err)
312
{
313
    int i;
314

    
315
    static const struct {
316
        int err;
317
        const char *desc;
318
    } errors[] = {
319
        {SD_RES_SUCCESS, "Success"},
320
        {SD_RES_UNKNOWN, "Unknown error"},
321
        {SD_RES_NO_OBJ, "No object found"},
322
        {SD_RES_EIO, "I/O error"},
323
        {SD_RES_VDI_EXIST, "VDI exists already"},
324
        {SD_RES_INVALID_PARMS, "Invalid parameters"},
325
        {SD_RES_SYSTEM_ERROR, "System error"},
326
        {SD_RES_VDI_LOCKED, "VDI is already locked"},
327
        {SD_RES_NO_VDI, "No vdi found"},
328
        {SD_RES_NO_BASE_VDI, "No base VDI found"},
329
        {SD_RES_VDI_READ, "Failed read the requested VDI"},
330
        {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
331
        {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
332
        {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
333
        {SD_RES_NO_TAG, "Failed to find the requested tag"},
334
        {SD_RES_STARTUP, "The system is still booting"},
335
        {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
336
        {SD_RES_SHUTDOWN, "The system is shutting down"},
337
        {SD_RES_NO_MEM, "Out of memory on the server"},
338
        {SD_RES_FULL_VDI, "We already have the maximum vdis"},
339
        {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
340
        {SD_RES_NO_SPACE, "Server has no space for new objects"},
341
        {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
342
        {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
343
        {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
344
    };
345

    
346
    for (i = 0; i < ARRAY_SIZE(errors); ++i) {
347
        if (errors[i].err == err) {
348
            return errors[i].desc;
349
        }
350
    }
351

    
352
    return "Invalid error code";
353
}
354

    
355
/*
 * Sheepdog I/O handling:
 *
 * 1. In sd_aio_readv/writev, read/write requests are added to the
 *    QEMU Bottom Halves.
 *
 * 2. In sd_readv_writev_bh_cb, the callback of the BHs, we send the I/O
 *    requests to the server and link the requests to the
 *    outstanding_aio_head list in the BDRVSheepdogState.  We exit the
 *    function without waiting for the response.
 *
 * 3. We receive the response in aio_read_response, the fd handler of
 *    the sheepdog connection.  If a metadata update is needed, we send
 *    the write request to the vdi object in sd_write_done, the write
 *    completion function.  The AIOCB callback is not called until all
 *    the requests belonging to the AIOCB are finished.
 */
372

    
373
/*
 * Allocate a request for `acb', fill it from the arguments, give it the
 * next sequence number of `s' as id, and link it into both the list of
 * outstanding requests of `s' and the request list of `acb'.
 *
 * The request is released with free_aio_req().
 */
static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
                                    uint64_t oid, unsigned int data_len,
                                    uint64_t offset, uint8_t flags,
                                    uint64_t base_oid, unsigned int iov_offset)
{
    AIOReq *req = qemu_malloc(sizeof(*req));

    /* owner and position of the data inside the owner's qiov */
    req->aiocb = acb;
    req->iov_offset = iov_offset;

    /* what to do on the server */
    req->oid = oid;
    req->base_oid = base_oid;
    req->offset = offset;
    req->data_len = data_len;
    req->flags = flags;

    req->id = s->aioreq_seq_num++;

    QLIST_INSERT_HEAD(&s->outstanding_aio_head, req,
                      outstanding_aio_siblings);
    QLIST_INSERT_HEAD(&acb->aioreq_head, req, aioreq_siblings);

    return req;
}
396

    
397
/*
 * Unlink `aio_req' from both lists it is on and free it.
 *
 * Returns 1 when the owning AIOCB still has other requests linked to
 * it, 0 when this was the last one.
 */
static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
{
    SheepdogAIOCB *owner = aio_req->aiocb;

    QLIST_REMOVE(aio_req, aioreq_siblings);
    QLIST_REMOVE(aio_req, outstanding_aio_siblings);
    qemu_free(aio_req);

    if (QLIST_EMPTY(&owner->aioreq_head)) {
        return 0;
    }
    return 1;
}
406

    
407
/*
 * Complete `acb': call the completion callback with acb->ret, unless
 * the AIOCB was canceled, and release the AIOCB.
 */
static void sd_finish_aiocb(SheepdogAIOCB *acb)
{
    if (acb->canceled) {
        /* sd_aio_cancel has already called the callback */
        qemu_aio_release(acb);
        return;
    }

    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_aio_release(acb);
}
414

    
415
/*
 * Cancel hook of sd_aio_pool.
 *
 * Sheepdog cannot cancel the requests which are already sent to the
 * servers, so the request is completed with -EIO right away and marked
 * as canceled; sd_finish_aiocb then skips the callback.
 */
static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
{
    /* `common' is the first member of SheepdogAIOCB */
    SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;

    blockacb->cb(blockacb->opaque, -EIO);
    acb->canceled = 1;
}
426

    
427
static AIOPool sd_aio_pool = {
428
    .aiocb_size = sizeof(SheepdogAIOCB),
429
    .cancel = sd_aio_cancel,
430
};
431

    
432
static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
433
                                   int64_t sector_num, int nb_sectors,
434
                                   BlockDriverCompletionFunc *cb, void *opaque)
435
{
436
    SheepdogAIOCB *acb;
437

    
438
    acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
439

    
440
    acb->qiov = qiov;
441

    
442
    acb->sector_num = sector_num;
443
    acb->nb_sectors = nb_sectors;
444

    
445
    acb->aio_done_func = NULL;
446
    acb->canceled = 0;
447
    acb->bh = NULL;
448
    acb->ret = 0;
449
    QLIST_INIT(&acb->aioreq_head);
450
    return acb;
451
}
452

    
453
/*
 * Create a bottom half that runs `cb' with `acb' as argument, remember
 * it in acb->bh and schedule it.
 *
 * Returns 0 on success, -EIO when the AIOCB already has a bottom half
 * or a new one cannot be created.
 */
static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
{
    QEMUBH *bh;

    if (acb->bh != NULL) {
        error_report("bug: %d %d\n", acb->aiocb_type, acb->aiocb_type);
        return -EIO;
    }

    bh = qemu_bh_new(cb, acb);
    acb->bh = bh;
    if (bh == NULL) {
        error_report("oom: %d %d\n", acb->aiocb_type, acb->aiocb_type);
        return -EIO;
    }

    qemu_bh_schedule(bh);
    return 0;
}
470

    
471
#ifdef _WIN32
472

    
473
/*
 * Cut-down `struct msghdr' for the _WIN32 build: only the scatter/gather
 * fields used by the sendmsg()/recvmsg() emulation below are provided.
 */
struct msghdr {
    struct iovec *msg_iov;      /* array of buffers */
    size_t        msg_iovlen;   /* number of elements in msg_iov */
};
477

    
478
static ssize_t sendmsg(int s, const struct msghdr *msg, int flags)
479
{
480
    size_t size = 0;
481
    char *buf, *p;
482
    int i, ret;
483

    
484
    /* count the msg size */
485
    for (i = 0; i < msg->msg_iovlen; i++) {
486
        size += msg->msg_iov[i].iov_len;
487
    }
488
    buf = qemu_malloc(size);
489

    
490
    p = buf;
491
    for (i = 0; i < msg->msg_iovlen; i++) {
492
        memcpy(p, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len);
493
        p += msg->msg_iov[i].iov_len;
494
    }
495

    
496
    ret = send(s, buf, size, flags);
497

    
498
    qemu_free(buf);
499
    return ret;
500
}
501

    
502
static ssize_t recvmsg(int s, struct msghdr *msg, int flags)
503
{
504
    size_t size = 0;
505
    char *buf, *p;
506
    int i, ret;
507

    
508
    /* count the msg size */
509
    for (i = 0; i < msg->msg_iovlen; i++) {
510
        size += msg->msg_iov[i].iov_len;
511
    }
512
    buf = qemu_malloc(size);
513

    
514
    ret = recv(s, buf, size, flags);
515
    if (ret < 0) {
516
        goto out;
517
    }
518

    
519
    p = buf;
520
    for (i = 0; i < msg->msg_iovlen; i++) {
521
        memcpy(msg->msg_iov[i].iov_base, p, msg->msg_iov[i].iov_len);
522
        p += msg->msg_iov[i].iov_len;
523
    }
524
out:
525
    qemu_free(buf);
526
    return ret;
527
}
528

    
529
#endif
530

    
531
/*
 * Send/recv data with iovec buffers
 *
 * This function sends/receives data from/to the iovec buffers directly,
 * with a single sendmsg()/recvmsg() call.  The first `offset' bytes of
 * the iovec array are skipped and the next `len' bytes are used.
 *
 * For example,
 *
 *   do_send_recv(sockfd, iov, len, offset, 1);
 *
 * transfers the same bytes as
 *
 *   char *buf = malloc(len);
 *   iov_to_buf(iov, iovcnt, buf, offset, len);
 *   send(sockfd, buf, len, 0);
 *   free(buf);
 *
 * The iovec array is temporarily modified to describe exactly the
 * requested window and is restored before returning.  The caller must
 * make sure the array covers at least `offset' + `len' bytes; the number
 * of elements is not passed in and cannot be checked here.
 *
 * Returns the number of bytes transferred (possibly fewer than `len'),
 * 0 when `len' is 0, or -1 with errno set on error (EINVAL for a
 * negative `len' or `offset').
 */
static int do_send_recv(int sockfd, struct iovec *iov, int len, int offset,
                        int write)
{
    struct msghdr msg;
    int ret, diff;

    if (len < 0 || offset < 0) {
        errno = EINVAL;
        return -1;
    }
    if (len == 0) {
        /*
         * Nothing to transfer.  Returning early also keeps the skip
         * loop below from stepping past the end of the array, which
         * could happen once the last element was trimmed to length 0.
         */
        return 0;
    }

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;

    /* from here on `len' is the end of the window, counted from iov[0] */
    len += offset;

    /* advance `iov' to the element that holds the end of the window */
    while (iov->iov_len < len) {
        len -= iov->iov_len;

        iov++;
        msg.msg_iovlen++;
    }

    /* trim the last element so that it ends where the window ends */
    diff = iov->iov_len - len;
    iov->iov_len -= diff;

    /* drop the leading elements that lie entirely before the window */
    while (msg.msg_iov->iov_len <= offset) {
        offset -= msg.msg_iov->iov_len;

        msg.msg_iov++;
        msg.msg_iovlen--;
    }

    /* make the first element start where the window starts */
    msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base + offset;
    msg.msg_iov->iov_len -= offset;

    if (write) {
        ret = sendmsg(sockfd, &msg, 0);
    } else {
        ret = recvmsg(sockfd, &msg, 0);
    }

    /* undo the adjustments of the first and the last element */
    msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base - offset;
    msg.msg_iov->iov_len += offset;

    iov->iov_len += diff;
    return ret;
}
593

    
594
/*
 * Open a TCP connection to the sheepdog server at `addr':`port'.
 *
 * When `addr' is NULL, SD_DEFAULT_ADDR and SD_DEFAULT_PORT are used
 * (and `port' is ignored).  Every address returned by getaddrinfo() is
 * tried in turn until one connection succeeds.
 *
 * Returns the connected socket, which the caller must close with
 * closesocket(), or -1 on failure.
 */
static int connect_to_sdog(const char *addr, const char *port)
{
    char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
    int fd, ret, saved_errno;
    struct addrinfo hints, *res, *res0;

    if (!addr) {
        addr = SD_DEFAULT_ADDR;
        port = SD_DEFAULT_PORT;
    }

    memset(&hints, 0, sizeof(hints));
    hints.ai_socktype = SOCK_STREAM;

    ret = getaddrinfo(addr, port, &hints, &res0);
    if (ret) {
        /* getaddrinfo reports through its return value, not errno */
        error_report("unable to get address info %s, %s\n",
                     addr, gai_strerror(ret));
        return -1;
    }

    for (res = res0; res; res = res->ai_next) {
        ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
                          sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
        if (ret) {
            continue;
        }

        fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
        if (fd < 0) {
            continue;
        }

    reconnect:
        ret = connect(fd, res->ai_addr, res->ai_addrlen);
        if (ret < 0) {
            if (errno == EINTR) {
                goto reconnect;
            }
            /*
             * Do not leak the socket, keep the errno of connect() for
             * the caller, and go on with the next candidate address.
             */
            saved_errno = errno;
            closesocket(fd);
            errno = saved_errno;
            continue;
        }

        dprintf("connected to %s:%s\n", addr, port);
        goto success;
    }
    fd = -1;
    error_report("failed connect to %s:%s\n", addr, port);
success:
    freeaddrinfo(res0);
    return fd;
}
645

    
646
/*
 * Transfer exactly `len' bytes between `sockfd' and the iovec array
 * `iov', starting `iov_offset' bytes into the array.  `write' selects
 * the direction (non-zero: send, zero: receive).
 *
 * The transfer is retried after EINTR/EAGAIN and continued after a
 * short transfer until everything is done.
 *
 * Returns 0 on success and 1 on failure.  A connection closed by the
 * peer before `len' bytes were transferred is a failure; errno is set
 * to EIO in that case so that callers printing strerror(errno) do not
 * report a stale value.
 */
static int do_readv_writev(int sockfd, struct iovec *iov, int len,
                           int iov_offset, int write)
{
    int ret;

    if (len < 0) {
        errno = EINVAL;
        error_report("invalid transfer length %d\n", len);
        return 1;
    }

    while (len > 0) {
        ret = do_send_recv(sockfd, iov, len, iov_offset, write);
        if (ret < 0) {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            if (write) {
                error_report("failed to send a req, %s\n", strerror(errno));
            } else {
                error_report("failed to recv a rsp, %s\n", strerror(errno));
            }
            return 1;
        }
        if (ret == 0) {
            /* no progress is possible any more: do not loop forever */
            error_report("connection is closed by the peer\n");
            errno = EIO;
            return 1;
        }

        iov_offset += ret;
        len -= ret;
    }

    return 0;
}
668

    
669
/*
 * Receive exactly `len' bytes from `sockfd' into `iov', starting
 * `iov_offset' bytes into the array.  Returns what do_readv_writev()
 * returns.
 */
static int do_readv(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 0;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
673

    
674
/*
 * Send exactly `len' bytes from `iov' to `sockfd', starting
 * `iov_offset' bytes into the array.  Returns what do_readv_writev()
 * returns.
 */
static int do_writev(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 1;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
678

    
679
/*
 * Transfer exactly `len' bytes between `sockfd' and the flat buffer
 * `buf' by wrapping the buffer in a one-element iovec.  `write' selects
 * the direction.  Returns what do_readv_writev() returns.
 */
static int do_read_write(int sockfd, void *buf, int len, int write)
{
    struct iovec single = {
        .iov_base = buf,
        .iov_len = len,
    };

    return do_readv_writev(sockfd, &single, len, 0, write);
}
688

    
689
/*
 * Receive exactly `len' bytes from `sockfd' into `buf'.  Returns what
 * do_read_write() returns.
 */
static int do_read(int sockfd, void *buf, int len)
{
    const int is_write = 0;

    return do_read_write(sockfd, buf, len, is_write);
}
693

    
694
/*
 * Send exactly `len' bytes from `buf' to `sockfd'.  Returns what
 * do_read_write() returns.
 */
static int do_write(int sockfd, void *buf, int len)
{
    const int is_write = 1;

    return do_read_write(sockfd, buf, len, is_write);
}
698

    
699
/*
 * Send the request header `hdr', followed by `*wlen' bytes of `data'
 * when `*wlen' is not zero.
 *
 * Returns 0 on success and -1 on failure.
 */
static int send_req(int sockfd, SheepdogReq *hdr, void *data,
                    unsigned int *wlen)
{
    struct iovec iov[2];
    unsigned int payload = *wlen;
    int failed;

    iov[0].iov_base = hdr;
    iov[0].iov_len = sizeof(*hdr);

    /* iov[1] is only looked at when there is something to send from it */
    if (payload) {
        iov[1].iov_base = data;
        iov[1].iov_len = payload;
    }

    failed = do_writev(sockfd, iov, sizeof(*hdr) + payload, 0);
    if (!failed) {
        return 0;
    }

    error_report("failed to send a req, %s\n", strerror(errno));
    return -1;
}
721

    
722
/*
 * Perform one synchronous request/response exchange on `sockfd'.
 *
 * `hdr' is sent together with `*wlen' bytes of `data' and is then
 * overwritten with the response header.  On entry `*rlen' is the number
 * of response bytes the caller can take in `data'; it is lowered to the
 * data_length of the response when that is smaller, and that many bytes
 * are read into `data'.
 *
 * Returns 0 on success and -1 on failure.
 */
static int do_req(int sockfd, SheepdogReq *hdr, void *data,
                  unsigned int *wlen, unsigned int *rlen)
{
    if (send_req(sockfd, hdr, data, wlen)) {
        return -1;
    }

    if (do_read(sockfd, hdr, sizeof(*hdr))) {
        error_report("failed to get a rsp, %s\n", strerror(errno));
        return -1;
    }

    /* never read more than the server announced */
    if (*rlen > hdr->data_length) {
        *rlen = hdr->data_length;
    }

    if (*rlen == 0) {
        return 0;
    }

    if (do_read(sockfd, data, *rlen)) {
        error_report("failed to get the data, %s\n", strerror(errno));
        return -1;
    }

    return 0;
}
756

    
757
static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
                           struct iovec *iov, int niov, int create,
                           enum AIOCBState aiocb_type);

/*
 * This function searches the outstanding requests for those that target
 * the object `oid' and sends them, skipping the request whose id is `id'.
 *
 * A request that cannot be sent is dropped and its AIOCB is marked as
 * failed with -EIO; when that was the last request of the AIOCB, the
 * AIOCB is completed right away.
 */
static void send_pending_req(BDRVSheepdogState *s, uint64_t oid, uint32_t id)
{
    AIOReq *aio_req, *next;
    SheepdogAIOCB *acb;
    int ret;

    QLIST_FOREACH_SAFE(aio_req, &s->outstanding_aio_head,
                       outstanding_aio_siblings, next) {
        if (id == aio_req->id) {
            continue;
        }
        if (aio_req->oid != oid) {
            continue;
        }

        acb = aio_req->aiocb;
        ret = add_aio_request(s, aio_req, acb->qiov->iov,
                              acb->qiov->niov, 0, acb->aiocb_type);
        if (ret < 0) {
            error_report("add_aio_request is failed\n");
            /* do not let the AIOCB complete as if everything went fine */
            acb->ret = -EIO;
            free_aio_req(s, aio_req);
            if (QLIST_EMPTY(&acb->aioreq_head)) {
                sd_finish_aiocb(acb);
            }
        }
    }
}
792

    
793
/*
794
 * Receive responses of the I/O requests.
795
 *
796
 * This function is registered as a fd handler, and called from the
797
 * main loop when s->fd is ready for reading responses.
798
 */
799
static void aio_read_response(void *opaque)
800
{
801
    SheepdogObjRsp rsp;
802
    BDRVSheepdogState *s = opaque;
803
    int fd = s->fd;
804
    int ret;
805
    AIOReq *aio_req = NULL;
806
    SheepdogAIOCB *acb;
807
    int rest;
808
    unsigned long idx;
809

    
810
    if (QLIST_EMPTY(&s->outstanding_aio_head)) {
811
        return;
812
    }
813

    
814
    /* read a header */
815
    ret = do_read(fd, &rsp, sizeof(rsp));
816
    if (ret) {
817
        error_report("failed to get the header, %s\n", strerror(errno));
818
        return;
819
    }
820

    
821
    /* find the right aio_req from the outstanding_aio list */
822
    QLIST_FOREACH(aio_req, &s->outstanding_aio_head, outstanding_aio_siblings) {
823
        if (aio_req->id == rsp.id) {
824
            break;
825
        }
826
    }
827
    if (!aio_req) {
828
        error_report("cannot find aio_req %x\n", rsp.id);
829
        return;
830
    }
831

    
832
    acb = aio_req->aiocb;
833

    
834
    switch (acb->aiocb_type) {
835
    case AIOCB_WRITE_UDATA:
836
        if (!is_data_obj(aio_req->oid)) {
837
            break;
838
        }
839
        idx = data_oid_to_idx(aio_req->oid);
840

    
841
        if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) {
842
            /*
843
             * If the object is newly created one, we need to update
844
             * the vdi object (metadata object).  min_dirty_data_idx
845
             * and max_dirty_data_idx are changed to include updated
846
             * index between them.
847
             */
848
            s->inode.data_vdi_id[idx] = s->inode.vdi_id;
849
            s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
850
            s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
851

    
852
            /*
853
             * Some requests may be blocked because simultaneous
854
             * create requests are not allowed, so we search the
855
             * pending requests here.
856
             */
857
            send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx), rsp.id);
858
        }
859
        break;
860
    case AIOCB_READ_UDATA:
861
        ret = do_readv(fd, acb->qiov->iov, rsp.data_length,
862
                       aio_req->iov_offset);
863
        if (ret) {
864
            error_report("failed to get the data, %s\n", strerror(errno));
865
            return;
866
        }
867
        break;
868
    }
869

    
870
    if (rsp.result != SD_RES_SUCCESS) {
871
        acb->ret = -EIO;
872
        error_report("%s\n", sd_strerror(rsp.result));
873
    }
874

    
875
    rest = free_aio_req(s, aio_req);
876
    if (!rest) {
877
        /*
878
         * We've finished all requests which belong to the AIOCB, so
879
         * we can call the callback now.
880
         */
881
        acb->aio_done_func(acb);
882
    }
883
}
884

    
885
static int aio_flush_request(void *opaque)
886
{
887
    BDRVSheepdogState *s = opaque;
888

    
889
    return !QLIST_EMPTY(&s->outstanding_aio_head);
890
}
891

    
892
/*
 * Set the TCP_CORK option of socket `fd' to `v'.  Where SOL_TCP or
 * TCP_CORK is not defined, this does nothing and reports success.
 * Returns the result of setsockopt() (0 on success).
 */
#if defined(SOL_TCP) && defined(TCP_CORK)

static int set_cork(int fd, int v)
{
    return setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v));
}

#else

static int set_cork(int fd, int v)
{
    return 0;
}

#endif
907

    
908
/*
 * Enable the TCP_NODELAY option on socket `fd'.  Returns the result of
 * setsockopt() (0 on success).
 */
static int set_nodelay(int fd)
{
    int enable = 1;

    return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&enable,
                      sizeof(enable));
}
916

    
917
/*
918
 * Return a socket discriptor to read/write objects.
919
 *
920
 * We cannot use this discriptor for other operations because
921
 * the block driver may be on waiting response from the server.
922
 */
923
static int get_sheep_fd(BDRVSheepdogState *s)
924
{
925
    int ret, fd;
926

    
927
    fd = connect_to_sdog(s->addr, s->port);
928
    if (fd < 0) {
929
        error_report("%s\n", strerror(errno));
930
        return -1;
931
    }
932

    
933
    socket_set_nonblock(fd);
934

    
935
    ret = set_nodelay(fd);
936
    if (ret) {
937
        error_report("%s\n", strerror(errno));
938
        closesocket(fd);
939
        return -1;
940
    }
941

    
942
    qemu_aio_set_fd_handler(fd, aio_read_response, NULL, aio_flush_request,
943
                            NULL, s);
944
    return fd;
945
}
946

    
947
/*
948
 * Parse a filename
949
 *
950
 * filename must be one of the following formats:
951
 *   1. [vdiname]
952
 *   2. [vdiname]:[snapid]
953
 *   3. [vdiname]:[tag]
954
 *   4. [hostname]:[port]:[vdiname]
955
 *   5. [hostname]:[port]:[vdiname]:[snapid]
956
 *   6. [hostname]:[port]:[vdiname]:[tag]
957
 *
958
 * You can boot from the snapshot images by specifying `snapid` or
959
 * `tag'.
960
 *
961
 * You can run VMs outside the Sheepdog cluster by specifying
962
 * `hostname' and `port' (experimental).
963
 */
964
static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
965
                         char *vdi, uint32_t *snapid, char *tag)
966
{
967
    char *p, *q;
968
    int nr_sep;
969

    
970
    p = q = qemu_strdup(filename);
971

    
972
    /* count the number of separators */
973
    nr_sep = 0;
974
    while (*p) {
975
        if (*p == ':') {
976
            nr_sep++;
977
        }
978
        p++;
979
    }
980
    p = q;
981

    
982
    /* use the first two tokens as hostname and port number. */
983
    if (nr_sep >= 2) {
984
        s->addr = p;
985
        p = strchr(p, ':');
986
        *p++ = '\0';
987

    
988
        s->port = p;
989
        p = strchr(p, ':');
990
        *p++ = '\0';
991
    } else {
992
        s->addr = NULL;
993
        s->port = 0;
994
    }
995

    
996
    strncpy(vdi, p, SD_MAX_VDI_LEN);
997

    
998
    p = strchr(vdi, ':');
999
    if (p) {
1000
        *p++ = '\0';
1001
        *snapid = strtoul(p, NULL, 10);
1002
        if (*snapid == 0) {
1003
            strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
1004
        }
1005
    } else {
1006
        *snapid = CURRENT_VDI_ID; /* search current vdi */
1007
    }
1008

    
1009
    if (s->addr == NULL) {
1010
        qemu_free(q);
1011
    }
1012

    
1013
    return 0;
1014
}
1015

    
1016
/*
 * Ask the server for the id of the vdi called `filename' with snapshot
 * id `snapid' and tag `tag', over a connection of its own.
 *
 * The request is SD_OP_GET_VDI_INFO when `for_snapshot' is non-zero and
 * SD_OP_LOCK_VDI otherwise.  On success the vdi id is stored in `*vid'.
 *
 * Returns 0 on success and -1 on failure.
 */
static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
                         char *tag, uint32_t *vid, int for_snapshot)
{
    SheepdogVdiReq hdr;
    /* the response header is read into the same memory as the request */
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    /* name and tag are sent as two fixed-size, zero-padded fields */
    char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
    unsigned int wlen = sizeof(buf);
    unsigned int rlen = 0;
    int fd;
    int ret = -1;

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        return -1;
    }

    memset(buf, 0, sizeof(buf));
    strncpy(buf, filename, SD_MAX_VDI_LEN);
    strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);

    memset(&hdr, 0, sizeof(hdr));
    hdr.proto_ver = SD_PROTO_VER;
    hdr.opcode = for_snapshot ? SD_OP_GET_VDI_INFO : SD_OP_LOCK_VDI;
    hdr.flags = SD_FLAG_CMD_WRITE;
    hdr.data_length = wlen;
    hdr.snapid = snapid;

    if (do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen)) {
        goto out;
    }

    if (rsp->result != SD_RES_SUCCESS) {
        error_report("cannot get vdi info, %s, %s %d %s\n",
                     sd_strerror(rsp->result), filename, snapid, tag);
        goto out;
    }

    *vid = rsp->vdi_id;
    ret = 0;
out:
    closesocket(fd);
    return ret;
}
1065

    
1066
/*
 * Send one object request (header plus optional payload) on s->fd.
 *
 * The opcode and flags are derived from aiocb_type and create:
 *   - AIOCB_READ_UDATA       -> SD_OP_READ_OBJ, no payload
 *   - otherwise, create != 0 -> SD_OP_CREATE_AND_WRITE_OBJ with payload
 *   - otherwise              -> SD_OP_WRITE_OBJ with payload
 * The remaining header fields are copied from aio_req and s->inode.
 *
 * niov is accepted for interface compatibility but is not used here;
 * do_writev() is given the byte count and aio_req->iov_offset instead.
 *
 * Returns 0 on success and -EIO when sending the header or the payload
 * fails.  The socket is uncorked again on every path, so a failed send
 * does not leave s->fd corked.
 */
static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
                           struct iovec *iov, int niov, int create,
                           enum AIOCBState aiocb_type)
{
    int nr_copies = s->inode.nr_copies;
    SheepdogObjReq hdr;
    unsigned int wlen;
    int ret;
    uint64_t oid = aio_req->oid;
    unsigned int datalen = aio_req->data_len;
    uint64_t offset = aio_req->offset;
    uint8_t flags = aio_req->flags;
    uint64_t old_oid = aio_req->base_oid;

    if (!nr_copies) {
        error_report("bug\n");
    }

    memset(&hdr, 0, sizeof(hdr));

    if (aiocb_type == AIOCB_READ_UDATA) {
        wlen = 0;
        hdr.opcode = SD_OP_READ_OBJ;
        hdr.flags = flags;
    } else if (create) {
        wlen = datalen;
        hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
        hdr.flags = SD_FLAG_CMD_WRITE | flags;
    } else {
        wlen = datalen;
        hdr.opcode = SD_OP_WRITE_OBJ;
        hdr.flags = SD_FLAG_CMD_WRITE | flags;
    }

    hdr.oid = oid;
    hdr.cow_oid = old_oid;
    hdr.copies = nr_copies;

    hdr.data_length = datalen;
    hdr.offset = offset;

    hdr.id = aio_req->id;

    /*
     * NOTE(review): set_cork() presumably toggles TCP_CORK so that header
     * and payload leave together -- confirm against its definition.
     */
    set_cork(s->fd, 1);

    /* send a header */
    ret = do_write(s->fd, &hdr, sizeof(hdr));
    if (ret) {
        error_report("failed to send a req, %s\n", strerror(errno));
        ret = -EIO;
        goto out;
    }

    if (wlen) {
        ret = do_writev(s->fd, iov, wlen, aio_req->iov_offset);
        if (ret) {
            error_report("failed to send a data, %s\n", strerror(errno));
            ret = -EIO;
            goto out;
        }
    }

    ret = 0;
out:
    /* Always undo the cork, including on the error paths above. */
    set_cork(s->fd, 0);
    return ret;
}
1130

    
1131
static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
1132
                             unsigned int datalen, uint64_t offset,
1133
                             int write, int create)
1134
{
1135
    SheepdogObjReq hdr;
1136
    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1137
    unsigned int wlen, rlen;
1138
    int ret;
1139

    
1140
    memset(&hdr, 0, sizeof(hdr));
1141

    
1142
    if (write) {
1143
        wlen = datalen;
1144
        rlen = 0;
1145
        hdr.flags = SD_FLAG_CMD_WRITE;
1146
        if (create) {
1147
            hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1148
        } else {
1149
            hdr.opcode = SD_OP_WRITE_OBJ;
1150
        }
1151
    } else {
1152
        wlen = 0;
1153
        rlen = datalen;
1154
        hdr.opcode = SD_OP_READ_OBJ;
1155
    }
1156
    hdr.oid = oid;
1157
    hdr.data_length = datalen;
1158
    hdr.offset = offset;
1159
    hdr.copies = copies;
1160

    
1161
    ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1162
    if (ret) {
1163
        error_report("failed to send a request to the sheep\n");
1164
        return -1;
1165
    }
1166

    
1167
    switch (rsp->result) {
1168
    case SD_RES_SUCCESS:
1169
        return 0;
1170
    default:
1171
        error_report("%s\n", sd_strerror(rsp->result));
1172
        return -1;
1173
    }
1174
}
1175

    
1176
/*
 * Read datalen bytes at offset of object oid into buf.
 * Thin wrapper around read_write_object() with write and create cleared;
 * the return value is passed through unchanged.
 */
static int read_object(int fd, char *buf, uint64_t oid, int copies,
                       unsigned int datalen, uint64_t offset)
{
    const int is_write = 0;
    const int do_create = 0;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, do_create);
}
1181

    
1182
/*
 * Write datalen bytes from buf at offset of object oid.
 * Thin wrapper around read_write_object() with write set; create is
 * forwarded as given.  The return value is passed through unchanged.
 */
static int write_object(int fd, char *buf, uint64_t oid, int copies,
                        unsigned int datalen, uint64_t offset, int create)
{
    const int is_write = 1;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, create);
}
1187

    
1188
/*
 * Open a Sheepdog VDI described by filename ("sheepdog:" prefix optional).
 *
 * Steps:
 *   1. parse_vdiname() splits the name into vdi, snapshot id and tag.
 *   2. get_sheep_fd() provides the connection stored in s->fd.
 *   3. find_vdi_name() resolves the VDI id (SD_OP_LOCK_VDI request).
 *   4. The inode object is read over a separate, short-lived connection
 *      and cached in s->inode.
 *
 * Returns 0 on success and -1 on failure.  On failure s->fd is closed and
 * its fd handler removed, but only if a descriptor was actually obtained.
 */
static int sd_open(BlockDriverState *bs, const char *filename, int flags)
{
    int ret, fd;
    uint32_t vid = 0;
    BDRVSheepdogState *s = bs->opaque;
    char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
    uint32_t snapid;
    char *buf = NULL;

    strstart(filename, "sheepdog:", (const char **)&filename);

    QLIST_INIT(&s->outstanding_aio_head);
    s->fd = -1;

    memset(vdi, 0, sizeof(vdi));
    memset(tag, 0, sizeof(tag));
    if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
        goto out;
    }
    s->fd = get_sheep_fd(s);
    if (s->fd < 0) {
        goto out;
    }

    ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
    if (ret) {
        goto out;
    }

    if (snapid) {
        dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
        s->is_snapshot = 1;
    }

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        error_report("failed to connect\n");
        goto out;
    }

    buf = qemu_malloc(SD_INODE_SIZE);
    ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0);

    closesocket(fd);

    if (ret) {
        goto out;
    }

    memcpy(&s->inode, buf, sizeof(s->inode));
    /* min > max marks the dirty index range as empty (see sd_write_done). */
    s->min_dirty_data_idx = UINT32_MAX;
    s->max_dirty_data_idx = 0;

    bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
    strncpy(s->name, vdi, sizeof(s->name));
    qemu_free(buf);
    return 0;
out:
    /* Only touch the descriptor if one was obtained; it starts out as -1. */
    if (s->fd >= 0) {
        qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
        closesocket(s->fd);
    }
    qemu_free(buf);
    return -1;
}
1253

    
1254
static int do_sd_create(char *filename, int64_t vdi_size,
1255
                        uint32_t base_vid, uint32_t *vdi_id, int snapshot,
1256
                        const char *addr, const char *port)
1257
{
1258
    SheepdogVdiReq hdr;
1259
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1260
    int fd, ret;
1261
    unsigned int wlen, rlen = 0;
1262
    char buf[SD_MAX_VDI_LEN];
1263

    
1264
    fd = connect_to_sdog(addr, port);
1265
    if (fd < 0) {
1266
        return -EIO;
1267
    }
1268

    
1269
    memset(buf, 0, sizeof(buf));
1270
    strncpy(buf, filename, SD_MAX_VDI_LEN);
1271

    
1272
    memset(&hdr, 0, sizeof(hdr));
1273
    hdr.opcode = SD_OP_NEW_VDI;
1274
    hdr.base_vdi_id = base_vid;
1275

    
1276
    wlen = SD_MAX_VDI_LEN;
1277

    
1278
    hdr.flags = SD_FLAG_CMD_WRITE;
1279
    hdr.snapid = snapshot;
1280

    
1281
    hdr.data_length = wlen;
1282
    hdr.vdi_size = vdi_size;
1283

    
1284
    ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1285

    
1286
    closesocket(fd);
1287

    
1288
    if (ret) {
1289
        return -EIO;
1290
    }
1291

    
1292
    if (rsp->result != SD_RES_SUCCESS) {
1293
        error_report("%s, %s\n", sd_strerror(rsp->result), filename);
1294
        return -EIO;
1295
    }
1296

    
1297
    if (vdi_id) {
1298
        *vdi_id = rsp->vdi_id;
1299
    }
1300

    
1301
    return 0;
1302
}
1303

    
1304
/*
 * Create a new VDI named by filename ("sheepdog:" prefix optional).
 *
 * Recognised options: BLOCK_OPT_SIZE (image size) and
 * BLOCK_OPT_BACKING_FILE.  A backing file must resolve to the "sheepdog"
 * protocol and must be a snapshot; its VDI id becomes the base of the new
 * image.
 *
 * Returns the result of do_sd_create(), -EINVAL for an oversized image or
 * an unsuitable backing file, and -EIO if the backing file cannot be opened.
 */
static int sd_create(const char *filename, QEMUOptionParameter *options)
{
    QEMUOptionParameter *opt;
    char *backing_file = NULL;
    int64_t vdi_size = 0;
    uint32_t vid = 0;

    strstart(filename, "sheepdog:", (const char **)&filename);

    for (opt = options; opt && opt->name; opt++) {
        if (strcmp(opt->name, BLOCK_OPT_SIZE) == 0) {
            vdi_size = opt->value.n;
        } else if (strcmp(opt->name, BLOCK_OPT_BACKING_FILE) == 0) {
            backing_file = opt->value.s;
        }
    }

    if (vdi_size > SD_MAX_VDI_SIZE) {
        error_report("too big image size\n");
        return -EINVAL;
    }

    if (backing_file) {
        BlockDriverState *base_bs;
        BDRVSheepdogState *base_state;
        BlockDriver *proto;

        /* Currently, only Sheepdog backing image is supported. */
        proto = bdrv_find_protocol(backing_file);
        if (proto == NULL || strcmp(proto->protocol_name, "sheepdog") != 0) {
            error_report("backing_file must be a sheepdog image\n");
            return -EINVAL;
        }

        if (bdrv_file_open(&base_bs, backing_file, 0) < 0) {
            return -EIO;
        }

        base_state = base_bs->opaque;

        if (!is_snapshot(&base_state->inode)) {
            error_report("cannot clone from a non snapshot vdi\n");
            bdrv_delete(base_bs);
            return -EINVAL;
        }

        /* Remember the base id before the backing state is torn down. */
        vid = base_state->inode.vdi_id;
        bdrv_delete(base_bs);
    }

    return do_sd_create((char *)filename, vdi_size, vid, NULL, 0, NULL, NULL);
}
1357

    
1358
/*
 * Close a Sheepdog image.
 *
 * Sends an SD_OP_RELEASE_VDI request carrying s->name over a temporary
 * connection; a result other than SD_RES_SUCCESS or SD_RES_VDI_NOT_LOCKED
 * is reported.  Then the fd handler for s->fd is removed, s->fd is closed
 * and s->addr is freed.
 *
 * The local cleanup runs even when the temporary connection cannot be
 * established, so a failed connect no longer leaks s->fd and s->addr.
 */
static void sd_close(BlockDriverState *bs)
{
    BDRVSheepdogState *s = bs->opaque;
    SheepdogVdiReq hdr;
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    unsigned int wlen, rlen = 0;
    int fd, ret;

    dprintf("%s\n", s->name);

    fd = connect_to_sdog(s->addr, s->port);
    if (fd >= 0) {
        memset(&hdr, 0, sizeof(hdr));

        hdr.opcode = SD_OP_RELEASE_VDI;
        /* The payload is the name including its terminating NUL. */
        wlen = strlen(s->name) + 1;
        hdr.data_length = wlen;
        hdr.flags = SD_FLAG_CMD_WRITE;

        ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);

        closesocket(fd);

        if (!ret && rsp->result != SD_RES_SUCCESS &&
            rsp->result != SD_RES_VDI_NOT_LOCKED) {
            error_report("%s, %s\n", sd_strerror(rsp->result), s->name);
        }
    }

    /* Release local resources whether or not the request was sent. */
    qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
    closesocket(s->fd);
    qemu_free(s->addr);
}
1393

    
1394
/* Report the image length: the vdi_size field of the cached inode. */
static int64_t sd_getlength(BlockDriverState *bs)
{
    BDRVSheepdogState *state = bs->opaque;
    int64_t length = state->inode.vdi_size;

    return length;
}
1400

    
1401
/*
 * Grow the image to offset bytes.
 *
 * Shrinking and sizes above SD_MAX_VDI_SIZE are rejected with -EINVAL.
 * Otherwise the cached inode gets the new vdi_size and its leading part
 * (everything before data_vdi_id) is written back over a temporary
 * connection.
 *
 * Returns 0 on success and -EIO if the connection or the write fails.
 * On a failed write the cached vdi_size is restored so the in-memory
 * inode does not claim a size that was never stored.
 */
static int sd_truncate(BlockDriverState *bs, int64_t offset)
{
    BDRVSheepdogState *s = bs->opaque;
    int ret, fd;
    unsigned int datalen;
    uint64_t old_size;

    if (offset < s->inode.vdi_size) {
        error_report("shrinking is not supported\n");
        return -EINVAL;
    } else if (offset > SD_MAX_VDI_SIZE) {
        error_report("too big image size\n");
        return -EINVAL;
    }

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        return -EIO;
    }

    /* we don't need to update entire object */
    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
    old_size = s->inode.vdi_size;
    s->inode.vdi_size = offset;
    ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
                       s->inode.nr_copies, datalen, 0, 0);
    /* fd is a socket: closesocket() is the portable way to close it. */
    closesocket(fd);

    if (ret < 0) {
        error_report("failed to update an inode.\n");
        s->inode.vdi_size = old_size;
        return -EIO;
    }

    return 0;
}
1434

    
1435
/*
1436
 * This function is called after writing data objects.  If we need to
1437
 * update metadata, this sends a write request to the vdi object.
1438
 * Otherwise, this calls the AIOCB callback.
1439
 */
1440
static void sd_write_done(SheepdogAIOCB *acb)
1441
{
1442
    int ret;
1443
    BDRVSheepdogState *s = acb->common.bs->opaque;
1444
    struct iovec iov;
1445
    AIOReq *aio_req;
1446
    uint32_t offset, data_len, mn, mx;
1447

    
1448
    mn = s->min_dirty_data_idx;
1449
    mx = s->max_dirty_data_idx;
1450
    if (mn <= mx) {
1451
        /* we need to update the vdi object. */
1452
        offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
1453
            mn * sizeof(s->inode.data_vdi_id[0]);
1454
        data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
1455

    
1456
        s->min_dirty_data_idx = UINT32_MAX;
1457
        s->max_dirty_data_idx = 0;
1458

    
1459
        iov.iov_base = &s->inode;
1460
        iov.iov_len = sizeof(s->inode);
1461
        aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
1462
                                data_len, offset, 0, 0, offset);
1463
        ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
1464
        if (ret) {
1465
            free_aio_req(s, aio_req);
1466
            acb->ret = -EIO;
1467
            goto out;
1468
        }
1469

    
1470
        acb->aio_done_func = sd_finish_aiocb;
1471
        acb->aiocb_type = AIOCB_WRITE_UDATA;
1472
        return;
1473
    }
1474
out:
1475
    sd_finish_aiocb(acb);
1476
}
1477

    
1478
/*
1479
 * Create a writable VDI from a snapshot
1480
 */
1481
static int sd_create_branch(BDRVSheepdogState *s)
1482
{
1483
    int ret, fd;
1484
    uint32_t vid;
1485
    char *buf;
1486

    
1487
    dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
1488

    
1489
    buf = qemu_malloc(SD_INODE_SIZE);
1490

    
1491
    ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
1492
                       s->addr, s->port);
1493
    if (ret) {
1494
        goto out;
1495
    }
1496

    
1497
    dprintf("%" PRIx32 " is created.\n", vid);
1498

    
1499
    fd = connect_to_sdog(s->addr, s->port);
1500
    if (fd < 0) {
1501
        error_report("failed to connect\n");
1502
        goto out;
1503
    }
1504

    
1505
    ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1506
                      SD_INODE_SIZE, 0);
1507

    
1508
    closesocket(fd);
1509

    
1510
    if (ret < 0) {
1511
        goto out;
1512
    }
1513

    
1514
    memcpy(&s->inode, buf, sizeof(s->inode));
1515

    
1516
    s->is_snapshot = 0;
1517
    ret = 0;
1518
    dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
1519

    
1520
out:
1521
    qemu_free(buf);
1522

    
1523
    return ret;
1524
}
1525

    
1526
/*
1527
 * Send I/O requests to the server.
1528
 *
1529
 * This function sends requests to the server, links the requests to
1530
 * the outstanding_list in BDRVSheepdogState, and exits without
1531
 * waiting the response.  The responses are received in the
1532
 * `aio_read_response' function which is called from the main loop as
1533
 * a fd handler.
1534
 */
1535
static void sd_readv_writev_bh_cb(void *p)
1536
{
1537
    SheepdogAIOCB *acb = p;
1538
    int ret = 0;
1539
    unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
1540
    unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
1541
    uint64_t oid;
1542
    uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
1543
    BDRVSheepdogState *s = acb->common.bs->opaque;
1544
    SheepdogInode *inode = &s->inode;
1545
    AIOReq *aio_req;
1546

    
1547
    qemu_bh_delete(acb->bh);
1548
    acb->bh = NULL;
1549

    
1550
    if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
1551
        /*
1552
         * In the case we open the snapshot VDI, Sheepdog creates the
1553
         * writable VDI when we do a write operation first.
1554
         */
1555
        ret = sd_create_branch(s);
1556
        if (ret) {
1557
            acb->ret = -EIO;
1558
            goto out;
1559
        }
1560
    }
1561

    
1562
    while (done != total) {
1563
        uint8_t flags = 0;
1564
        uint64_t old_oid = 0;
1565
        int create = 0;
1566

    
1567
        oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
1568

    
1569
        len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
1570

    
1571
        if (!inode->data_vdi_id[idx]) {
1572
            if (acb->aiocb_type == AIOCB_READ_UDATA) {
1573
                goto done;
1574
            }
1575

    
1576
            create = 1;
1577
        } else if (acb->aiocb_type == AIOCB_WRITE_UDATA
1578
                   && !is_data_obj_writeable(inode, idx)) {
1579
            /* Copy-On-Write */
1580
            create = 1;
1581
            old_oid = oid;
1582
            flags = SD_FLAG_CMD_COW;
1583
        }
1584

    
1585
        if (create) {
1586
            dprintf("update ino (%" PRIu32") %" PRIu64 " %" PRIu64
1587
                    " %" PRIu64 "\n", inode->vdi_id, oid,
1588
                    vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
1589
            oid = vid_to_data_oid(inode->vdi_id, idx);
1590
            dprintf("new oid %lx\n", oid);
1591
        }
1592

    
1593
        aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
1594

    
1595
        if (create) {
1596
            AIOReq *areq;
1597
            QLIST_FOREACH(areq, &s->outstanding_aio_head,
1598
                          outstanding_aio_siblings) {
1599
                if (areq == aio_req) {
1600
                    continue;
1601
                }
1602
                if (areq->oid == oid) {
1603
                    /*
1604
                     * Sheepdog cannot handle simultaneous create
1605
                     * requests to the same object.  So we cannot send
1606
                     * the request until the previous request
1607
                     * finishes.
1608
                     */
1609
                    aio_req->flags = 0;
1610
                    aio_req->base_oid = 0;
1611
                    goto done;
1612
                }
1613
            }
1614
        }
1615

    
1616
        ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
1617
                              create, acb->aiocb_type);
1618
        if (ret < 0) {
1619
            error_report("add_aio_request is failed\n");
1620
            free_aio_req(s, aio_req);
1621
            acb->ret = -EIO;
1622
            goto out;
1623
        }
1624
    done:
1625
        offset = 0;
1626
        idx++;
1627
        done += len;
1628
    }
1629
out:
1630
    if (QLIST_EMPTY(&acb->aioreq_head)) {
1631
        sd_finish_aiocb(acb);
1632
    }
1633
}
1634

    
1635
static BlockDriverAIOCB *sd_aio_writev(BlockDriverState *bs, int64_t sector_num,
1636
                                       QEMUIOVector *qiov, int nb_sectors,
1637
                                       BlockDriverCompletionFunc *cb,
1638
                                       void *opaque)
1639
{
1640
    SheepdogAIOCB *acb;
1641

    
1642
    if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
1643
        /* TODO: shouldn't block here */
1644
        if (sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE) < 0) {
1645
            return NULL;
1646
        }
1647
        bs->total_sectors = sector_num + nb_sectors;
1648
    }
1649

    
1650
    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
1651
    acb->aio_done_func = sd_write_done;
1652
    acb->aiocb_type = AIOCB_WRITE_UDATA;
1653

    
1654
    sd_schedule_bh(sd_readv_writev_bh_cb, acb);
1655
    return &acb->common;
1656
}
1657

    
1658
static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, int64_t sector_num,
1659
                                      QEMUIOVector *qiov, int nb_sectors,
1660
                                      BlockDriverCompletionFunc *cb,
1661
                                      void *opaque)
1662
{
1663
    SheepdogAIOCB *acb;
1664
    int i;
1665

    
1666
    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
1667
    acb->aiocb_type = AIOCB_READ_UDATA;
1668
    acb->aio_done_func = sd_finish_aiocb;
1669

    
1670
    /*
1671
     * TODO: we can do better; we don't need to initialize
1672
     * blindly.
1673
     */
1674
    for (i = 0; i < qiov->niov; i++) {
1675
        memset(qiov->iov[i].iov_base, 0, qiov->iov[i].iov_len);
1676
    }
1677

    
1678
    sd_schedule_bh(sd_readv_writev_bh_cb, acb);
1679
    return &acb->common;
1680
}
1681

    
1682
/*
 * Take a snapshot of the currently open VDI.
 *
 * The VM state size, VM clock and snapshot name from sn_info are stored
 * in the cached inode, whose leading part (everything before data_vdi_id)
 * is written back.  Then do_sd_create() is called with the current VDI as
 * base, and the leading part of the new VDI's inode is read and copied
 * over the cached inode.
 *
 * Returns 0 on success, -EINVAL when the open VDI is itself a snapshot,
 * and -EIO on any connection or request failure.  The temporary inode
 * buffer is released on every path, and the socket is only closed when it
 * was actually opened.
 */
static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
{
    BDRVSheepdogState *s = bs->opaque;
    int ret, fd;
    uint32_t new_vid;
    SheepdogInode *inode = NULL;
    unsigned int datalen;

    dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %d "
            "is_snapshot %d\n", sn_info->name, sn_info->id_str,
            s->name, sn_info->vm_state_size, s->is_snapshot);

    if (s->is_snapshot) {
        error_report("You can't create a snapshot of a snapshot VDI, "
                     "%s (%" PRIu32 ").\n", s->name, s->inode.vdi_id);

        return -EINVAL;
    }

    dprintf("%s %s\n", sn_info->name, sn_info->id_str);

    s->inode.vm_state_size = sn_info->vm_state_size;
    s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
    strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
    /* we don't need to update entire object */
    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);

    /* refresh inode. */
    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        /* Nothing to close or free yet. */
        return -EIO;
    }

    ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
                       s->inode.nr_copies, datalen, 0, 0);
    if (ret < 0) {
        error_report("failed to write snapshot's inode.\n");
        ret = -EIO;
        goto cleanup;
    }

    ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
                       s->addr, s->port);
    if (ret < 0) {
        error_report("failed to create inode for snapshot. %s\n",
                     strerror(errno));
        ret = -EIO;
        goto cleanup;
    }

    inode = (SheepdogInode *)qemu_malloc(datalen);

    ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
                      s->inode.nr_copies, datalen, 0);

    if (ret < 0) {
        error_report("failed to read new inode info. %s\n", strerror(errno));
        ret = -EIO;
        goto cleanup;
    }

    memcpy(&s->inode, inode, datalen);
    dprintf("s->inode: name %s snap_id %x oid %x\n",
            s->inode.name, s->inode.snap_id, s->inode.vdi_id);

cleanup:
    /* inode is NULL until allocated; sd_open relies on qemu_free(NULL) too. */
    qemu_free(inode);
    closesocket(fd);
    return ret;
}
1752

    
1753
/*
 * Switch the open image to the snapshot identified by snapshot_id.
 *
 * snapshot_id is parsed as a decimal snapshot number; if that yields 0 it
 * is used as a snapshot tag instead (the same convention the name parser
 * in this file applies to the part after ':').  The snapshot's inode is
 * looked up with find_vdi_name() (SD_OP_GET_VDI_INFO), read over a
 * temporary connection and installed as the cached inode.
 *
 * Returns 0 on success.  On failure the previous driver state is restored
 * and a negative errno is returned: -ENOENT if the snapshot cannot be
 * found, read, or has no VM state, -EIO if the connection fails.
 */
static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
{
    BDRVSheepdogState *s = bs->opaque;
    BDRVSheepdogState *old_s;
    char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
    char *buf = NULL;
    uint32_t vid;
    uint32_t snapid = 0;
    int ret = -ENOENT, fd;

    /* Keep a copy of the whole state so it can be rolled back on error. */
    old_s = qemu_malloc(sizeof(BDRVSheepdogState));

    memcpy(old_s, s, sizeof(BDRVSheepdogState));

    memset(vdi, 0, sizeof(vdi));
    strncpy(vdi, s->name, sizeof(vdi));

    memset(tag, 0, sizeof(tag));
    snapid = strtoul(snapshot_id, NULL, 10);
    if (!snapid) {
        /* Not a number: look the snapshot up by its tag, not the VDI name. */
        strncpy(tag, snapshot_id, sizeof(tag));
    }

    ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
    if (ret) {
        error_report("Failed to find_vdi_name\n");
        ret = -ENOENT;
        goto out;
    }

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        error_report("failed to connect\n");
        /* ret is 0 from find_vdi_name() here; report the failure. */
        ret = -EIO;
        goto out;
    }

    buf = qemu_malloc(SD_INODE_SIZE);
    ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
                      SD_INODE_SIZE, 0);

    closesocket(fd);

    if (ret) {
        ret = -ENOENT;
        goto out;
    }

    memcpy(&s->inode, buf, sizeof(s->inode));

    if (!s->inode.vm_state_size) {
        error_report("Invalid snapshot\n");
        ret = -ENOENT;
        goto out;
    }

    s->is_snapshot = 1;

    qemu_free(buf);
    qemu_free(old_s);

    return 0;
out:
    /* recover bdrv_sd_state */
    memcpy(s, old_s, sizeof(BDRVSheepdogState));
    qemu_free(buf);
    qemu_free(old_s);

    error_report("failed to open. recover old bdrv_sd_state.\n");

    return ret;
}
1824

    
1825
/*
 * Snapshot deletion entry point.  Not implemented: nothing is deleted
 * and 0 is always returned; both arguments are ignored.
 */
static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    /* FIXME: Delete specified snapshot id.  */
    const int status = 0;

    return status;
}
1830

    
1831
/* Bitmap helpers: a bitmap is an array of unsigned long words. */
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
#define BITS_PER_BYTE        8
#define BITS_TO_LONGS(nr)    DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
#define DECLARE_BITMAP(name,bits)               \
    unsigned long name[BITS_TO_LONGS(bits)]

#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long))

/*
 * Return 1 if bit nr is set in the bitmap starting at addr, 0 otherwise.
 * Bit nr lives in word nr / BITS_PER_LONG at position nr % BITS_PER_LONG,
 * counted from the least significant bit.
 */
static inline int test_bit(unsigned int nr, const unsigned long *addr)
{
    const unsigned long word = addr[nr / BITS_PER_LONG];
    const unsigned long mask = 1UL << (nr % BITS_PER_LONG);

    return (word & mask) ? 1 : 0;
}
1844

    
1845
/*
 * Build the list of snapshots of the open VDI.
 *
 * The in-use VDI bitmap is fetched with SD_OP_READ_VDIS.  Starting at the
 * id derived from the FNV-1a hash of s->name, consecutive in-use ids are
 * visited (wrapping at SD_NR_VDIS) until a free id is met; each inode
 * whose name equals s->name and which is a snapshot yields one table
 * entry.  At most 1024 entries are collected and at most SD_NR_VDIS ids
 * are visited, so the scan ends even if every bitmap bit is set.
 *
 * *psn_tab receives the table (NULL if it was never allocated).  The
 * return value is the number of entries filled in; failures are not
 * reported separately and simply end the scan with what was found so far.
 */
static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
{
    BDRVSheepdogState *s = bs->opaque;
    SheepdogReq req;
    int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
    QEMUSnapshotInfo *sn_tab = NULL;
    unsigned wlen, rlen;
    int found = 0;
    /* Static rather than automatic; not reentrant as a consequence. */
    static SheepdogInode inode;
    unsigned long *vdi_inuse;
    unsigned int start_nr;
    unsigned int scanned;
    uint64_t hval;
    uint32_t vid;

    vdi_inuse = qemu_malloc(max);

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        goto out;
    }

    rlen = max;
    wlen = 0;

    memset(&req, 0, sizeof(req));

    req.opcode = SD_OP_READ_VDIS;
    req.data_length = max;

    ret = do_req(fd, &req, vdi_inuse, &wlen, &rlen);

    closesocket(fd);
    if (ret) {
        goto out;
    }

    sn_tab = qemu_mallocz(nr * sizeof(*sn_tab));

    /* calculate a vdi id with hash function */
    hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
    start_nr = hval & (SD_NR_VDIS - 1);

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        error_report("failed to connect\n");
        goto out;
    }

    for (vid = start_nr, scanned = 0;
         found < nr && scanned < SD_NR_VDIS;
         vid = (vid + 1) % SD_NR_VDIS, scanned++) {
        if (!test_bit(vid, vdi_inuse)) {
            break;
        }

        /* we don't need to read entire object */
        ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
                          0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0);

        if (ret) {
            continue;
        }

        if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
            sn_tab[found].date_sec = inode.snap_ctime >> 32;
            sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
            sn_tab[found].vm_state_size = inode.vm_state_size;
            sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;

            snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
                     inode.snap_id);
            /*
             * sn_tab is zero-filled, so copying at most sizeof(name) - 1
             * bytes guarantees the name stays NUL-terminated.
             */
            strncpy(sn_tab[found].name, inode.tag,
                    MIN(sizeof(sn_tab[found].name) - 1, sizeof(inode.tag)));
            found++;
        }
    }

    closesocket(fd);
out:
    *psn_tab = sn_tab;

    qemu_free(vdi_inuse);

    return found;
}
1928

    
1929
/*
 * Transfer size bytes of VM state between data and the vmstate objects of
 * the open VDI, starting at byte position pos.
 *
 * load != 0 reads from the server into data; load == 0 writes data to the
 * server.  The transfer is split at SD_DATA_OBJ_SIZE boundaries: each
 * chunk ends at the end of its object, and the data pointer advances with
 * every chunk.  When writing, a chunk that starts at offset 0 of its
 * object is sent as a create request.
 *
 * Returns size on success and -EIO if the connection or any chunk fails.
 */
static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
                                int64_t pos, int size, int load)
{
    int fd, create;
    int ret;
    int remaining = size;
    unsigned int data_len;
    uint64_t vmstate_oid;
    uint32_t vdi_index;
    uint64_t offset;

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        /* No socket was opened, so there is nothing to close. */
        return -EIO;
    }

    while (remaining > 0) {
        vdi_index = pos / SD_DATA_OBJ_SIZE;
        offset = pos % SD_DATA_OBJ_SIZE;

        /* Stop at the end of the current object, not SD_DATA_OBJ_SIZE on. */
        data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);

        vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);

        create = (offset == 0);
        if (load) {
            ret = read_object(fd, (char *)data, vmstate_oid,
                              s->inode.nr_copies, data_len, offset);
        } else {
            ret = write_object(fd, (char *)data, vmstate_oid,
                               s->inode.nr_copies, data_len, offset, create);
        }

        if (ret < 0) {
            error_report("failed to %s vmstate %s\n",
                         load ? "load" : "save", strerror(errno));
            ret = -EIO;
            goto cleanup;
        }

        pos += data_len;
        data += data_len;
        remaining -= data_len;
    }

    /* Report the whole transfer, not just the final chunk. */
    ret = size;
cleanup:
    closesocket(fd);
    return ret;
}
1976

    
1977
/*
 * Save size bytes of VM state from data at position pos.
 * Forwards to do_load_save_vmstate() with load cleared; the const
 * qualifier is dropped because the helper shares one signature for both
 * directions.
 */
static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
                           int64_t pos, int size)
{
    BDRVSheepdogState *state = bs->opaque;
    uint8_t *payload = (uint8_t *)data;

    return do_load_save_vmstate(state, payload, pos, size, 0);
}
1984

    
1985
/*
 * Load size bytes of VM state into data from position pos.
 * Forwards to do_load_save_vmstate() with load set.
 */
static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
                           int64_t pos, int size)
{
    BDRVSheepdogState *state = bs->opaque;
    const int load = 1;

    return do_load_save_vmstate(state, data, pos, size, load);
}
1992

    
1993

    
1994
static QEMUOptionParameter sd_create_options[] = {
1995
    {
1996
        .name = BLOCK_OPT_SIZE,
1997
        .type = OPT_SIZE,
1998
        .help = "Virtual disk size"
1999
    },
2000
    {
2001
        .name = BLOCK_OPT_BACKING_FILE,
2002
        .type = OPT_STRING,
2003
        .help = "File name of a base image"
2004
    },
2005
    { NULL }
2006
};
2007

    
2008
BlockDriver bdrv_sheepdog = {
2009
    .format_name    = "sheepdog",
2010
    .protocol_name  = "sheepdog",
2011
    .instance_size  = sizeof(BDRVSheepdogState),
2012
    .bdrv_file_open = sd_open,
2013
    .bdrv_close     = sd_close,
2014
    .bdrv_create    = sd_create,
2015
    .bdrv_getlength = sd_getlength,
2016
    .bdrv_truncate  = sd_truncate,
2017

    
2018
    .bdrv_aio_readv     = sd_aio_readv,
2019
    .bdrv_aio_writev    = sd_aio_writev,
2020

    
2021
    .bdrv_snapshot_create   = sd_snapshot_create,
2022
    .bdrv_snapshot_goto     = sd_snapshot_goto,
2023
    .bdrv_snapshot_delete   = sd_snapshot_delete,
2024
    .bdrv_snapshot_list     = sd_snapshot_list,
2025

    
2026
    .bdrv_save_vmstate  = sd_save_vmstate,
2027
    .bdrv_load_vmstate  = sd_load_vmstate,
2028

    
2029
    .create_options = sd_create_options,
2030
};
2031

    
2032
static void bdrv_sheepdog_init(void)
2033
{
2034
    bdrv_register(&bdrv_sheepdog);
2035
}
2036
block_init(bdrv_sheepdog_init);