Statistics
| Branch: | Revision:

root / block / sheepdog.c @ 00aa0040

History | View | Annotate | Download (53 kB)

1
/*
2
 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3
 *
4
 * This program is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU General Public License version
6
 * 2 as published by the Free Software Foundation.
7
 *
8
 * You should have received a copy of the GNU General Public License
9
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
10
 */
11

    
12
#include "qemu-common.h"
13
#include "qemu-error.h"
14
#include "qemu_socket.h"
15
#include "block_int.h"
16
#include "bitops.h"
17

    
18
/* Version number placed in the proto_ver field of every request header. */
#define SD_PROTO_VER 0x01

/* Server used when the file name does not carry a host and port. */
#define SD_DEFAULT_ADDR "localhost"
#define SD_DEFAULT_PORT "7000"

/* Opcodes for requests on data/vdi objects. */
#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
#define SD_OP_READ_OBJ       0x02
#define SD_OP_WRITE_OBJ      0x03

/* Opcodes for requests on vdis. */
#define SD_OP_NEW_VDI        0x11
#define SD_OP_LOCK_VDI       0x12
#define SD_OP_RELEASE_VDI    0x13
#define SD_OP_GET_VDI_INFO   0x14
#define SD_OP_READ_VDIS      0x15

/* Bits for the flags field of a request header. */
#define SD_FLAG_CMD_WRITE    0x01
#define SD_FLAG_CMD_COW      0x02

/* Values of the result field of a response header. */
#define SD_RES_SUCCESS       0x00 /* Success */
#define SD_RES_UNKNOWN       0x01 /* Unknown error */
#define SD_RES_NO_OBJ        0x02 /* No object found */
#define SD_RES_EIO           0x03 /* I/O error */
#define SD_RES_VDI_EXIST     0x04 /* Vdi exists already */
#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
#define SD_RES_SYSTEM_ERROR  0x06 /* System error */
#define SD_RES_VDI_LOCKED    0x07 /* Vdi is locked */
#define SD_RES_NO_VDI        0x08 /* No vdi found */
#define SD_RES_NO_BASE_VDI   0x09 /* No base vdi found */
#define SD_RES_VDI_READ      0x0A /* Cannot read requested vdi */
#define SD_RES_VDI_WRITE     0x0B /* Cannot write requested vdi */
#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
#define SD_RES_BASE_VDI_WRITE   0x0D /* Cannot write base vdi */
#define SD_RES_NO_TAG        0x0E /* Requested tag is not found */
#define SD_RES_STARTUP       0x0F /* Sheepdog is on starting up */
#define SD_RES_VDI_NOT_LOCKED   0x10 /* Vdi is not locked */
#define SD_RES_SHUTDOWN      0x11 /* Sheepdog is shutting down */
#define SD_RES_NO_MEM        0x12 /* Cannot allocate memory */
#define SD_RES_FULL_VDI      0x13 /* we already have the maximum vdis */
#define SD_RES_VER_MISMATCH  0x14 /* Protocol version mismatch */
#define SD_RES_NO_SPACE      0x15 /* Server has no room for new objects */
#define SD_RES_WAIT_FOR_FORMAT  0x16 /* Waiting for a format operation */
#define SD_RES_WAIT_FOR_JOIN    0x17 /* Waiting for other nodes joining */
#define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */
61

    
62
/*
63
 * Object ID rules
64
 *
65
 *  0 - 19 (20 bits): data object space
66
 * 20 - 31 (12 bits): reserved data object space
67
 * 32 - 55 (24 bits): vdi object space
68
 * 56 - 59 ( 4 bits): reserved vdi object space
69
 * 60 - 63 ( 4 bits): object type identifier space
70
 */
71

    
72
/* Bit position of the vdi id inside a 64 bit object id. */
#define VDI_SPACE_SHIFT   32
/* Object type bits in the top of the object id. */
#define VDI_BIT (UINT64_C(1) << 63)
#define VMSTATE_BIT (UINT64_C(1) << 62)
/* Number of data objects addressable by one vdi (20 bit index). */
#define MAX_DATA_OBJS (UINT64_C(1) << 20)
#define MAX_CHILDREN 1024
#define SD_MAX_VDI_LEN 256
#define SD_MAX_VDI_TAG_LEN 256
#define SD_NR_VDIS   (1U << 24)
/* Each data object is 4 MiB, so a vdi can be at most 4 TiB. */
#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
#define SECTOR_SIZE 512

#define SD_INODE_SIZE (sizeof(SheepdogInode))
/* snapid value meaning "the current (non-snapshot) vdi". */
#define CURRENT_VDI_ID 0
86

    
87
/*
 * Generic request header.  The other request structures share the
 * leading fields (proto_ver .. data_length) and reinterpret the
 * trailing 32 bytes.
 */
typedef struct SheepdogReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;           /* length of the payload that follows */
    uint32_t opcode_specific[8];
} SheepdogReq;
96

    
97
/*
 * Generic response header.  Same size as SheepdogReq; `result' holds
 * one of the SD_RES_* codes.
 */
typedef struct SheepdogRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;
    uint32_t opcode_specific[7];
} SheepdogRsp;
107

    
108
/* Request header for object read/write/create operations. */
typedef struct SheepdogObjReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t oid;       /* target object */
    uint64_t cow_oid;   /* base object for copy-on-write requests */
    uint32_t copies;
    uint32_t rsvd;
    uint64_t offset;    /* byte offset inside the object */
} SheepdogObjReq;
121

    
122
/* Response header for object operations. */
typedef struct SheepdogObjRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;            /* matched against AIOReq.id on receipt */
    uint32_t data_length;
    uint32_t result;        /* SD_RES_* */
    uint32_t copies;
    uint32_t pad[6];
} SheepdogObjRsp;
133

    
134
/* Request header for vdi operations (create, lock, lookup, ...). */
typedef struct SheepdogVdiReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t vdi_size;
    uint32_t base_vdi_id;
    uint32_t copies;
    uint32_t snapid;
    uint32_t pad[3];
} SheepdogVdiReq;
147

    
148
/* Response header for vdi operations; vdi_id carries the looked-up id. */
typedef struct SheepdogVdiRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;        /* SD_RES_* */
    uint32_t rsvd;
    uint32_t vdi_id;
    uint32_t pad[5];
} SheepdogVdiRsp;
160

    
161
typedef struct SheepdogInode {
162
    char name[SD_MAX_VDI_LEN];
163
    char tag[SD_MAX_VDI_TAG_LEN];
164
    uint64_t ctime;
165
    uint64_t snap_ctime;
166
    uint64_t vm_clock_nsec;
167
    uint64_t vdi_size;
168
    uint64_t vm_state_size;
169
    uint16_t copy_policy;
170
    uint8_t nr_copies;
171
    uint8_t block_size_shift;
172
    uint32_t snap_id;
173
    uint32_t vdi_id;
174
    uint32_t parent_vdi_id;
175
    uint32_t child_vdi_id[MAX_CHILDREN];
176
    uint32_t data_vdi_id[MAX_DATA_OBJS];
177
} SheepdogInode;
178

    
179
/*
180
 * 64 bit FNV-1a non-zero initial basis
181
 */
182
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
183

    
184
/*
185
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
186
 */
187
/*
 * Fold `len' bytes starting at `buf' into the 64 bit FNV-1a hash
 * `hval' and return the updated hash.  Pass the FNV-1a offset basis as
 * `hval' for the first call; pass a previous result to continue
 * hashing.  A zero `len' returns `hval' unchanged.
 */
static inline uint64_t fnv_64a_buf(const void *buf, size_t len, uint64_t hval)
{
    const unsigned char *cur = buf;
    const unsigned char *end = cur + len;

    for (; cur < end; cur++) {
        /* xor in the next octet, then multiply by the 64 bit FNV prime */
        hval ^= (uint64_t)*cur;
        hval *= UINT64_C(0x100000001b3);
    }
    return hval;
}
198

    
199
/*
 * Return non-zero when data object `idx' is owned by this vdi itself.
 * The caller must keep idx within the bounds of data_vdi_id[].
 */
static inline int is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
{
    return inode->vdi_id == inode->data_vdi_id[idx];
}
203

    
204
static inline int is_data_obj(uint64_t oid)
205
{
206
    return !(VDI_BIT & oid);
207
}
208

    
209
static inline uint64_t data_oid_to_idx(uint64_t oid)
210
{
211
    return oid & (MAX_DATA_OBJS - 1);
212
}
213

    
214
static inline uint64_t vid_to_vdi_oid(uint32_t vid)
215
{
216
    return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
217
}
218

    
219
static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
220
{
221
    return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
222
}
223

    
224
static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
225
{
226
    return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
227
}
228

    
229
static inline int is_snapshot(struct SheepdogInode *inode)
230
{
231
    return !!inode->snap_ctime;
232
}
233

    
234
/*
 * Debug trace helper.  With DEBUG_SDOG defined it prints the calling
 * function and line followed by the formatted message; otherwise it
 * expands to nothing and its arguments are not evaluated.
 */
#undef dprintf
#ifdef DEBUG_SDOG
#define dprintf(fmt, args...)                                       \
    do {                                                            \
        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
    } while (0)
#else
#define dprintf(fmt, args...)
#endif
243

    
244
typedef struct SheepdogAIOCB SheepdogAIOCB;
245

    
246
typedef struct AIOReq {
247
    SheepdogAIOCB *aiocb;
248
    unsigned int iov_offset;
249

    
250
    uint64_t oid;
251
    uint64_t base_oid;
252
    uint64_t offset;
253
    unsigned int data_len;
254
    uint8_t flags;
255
    uint32_t id;
256

    
257
    QLIST_ENTRY(AIOReq) outstanding_aio_siblings;
258
    QLIST_ENTRY(AIOReq) aioreq_siblings;
259
} AIOReq;
260

    
261
/* Kind of user I/O an AIOCB carries. */
enum AIOCBState {
    AIOCB_WRITE_UDATA,
    AIOCB_READ_UDATA,
};
265

    
266
struct SheepdogAIOCB {
267
    BlockDriverAIOCB common;
268

    
269
    QEMUIOVector *qiov;
270

    
271
    int64_t sector_num;
272
    int nb_sectors;
273

    
274
    int ret;
275
    enum AIOCBState aiocb_type;
276

    
277
    QEMUBH *bh;
278
    void (*aio_done_func)(SheepdogAIOCB *);
279

    
280
    int canceled;
281

    
282
    QLIST_HEAD(aioreq_head, AIOReq) aioreq_head;
283
};
284

    
285
typedef struct BDRVSheepdogState {
286
    SheepdogInode inode;
287

    
288
    uint32_t min_dirty_data_idx;
289
    uint32_t max_dirty_data_idx;
290

    
291
    char name[SD_MAX_VDI_LEN];
292
    int is_snapshot;
293

    
294
    char *addr;
295
    char *port;
296
    int fd;
297

    
298
    uint32_t aioreq_seq_num;
299
    QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
300
} BDRVSheepdogState;
301

    
302
static const char * sd_strerror(int err)
303
{
304
    int i;
305

    
306
    static const struct {
307
        int err;
308
        const char *desc;
309
    } errors[] = {
310
        {SD_RES_SUCCESS, "Success"},
311
        {SD_RES_UNKNOWN, "Unknown error"},
312
        {SD_RES_NO_OBJ, "No object found"},
313
        {SD_RES_EIO, "I/O error"},
314
        {SD_RES_VDI_EXIST, "VDI exists already"},
315
        {SD_RES_INVALID_PARMS, "Invalid parameters"},
316
        {SD_RES_SYSTEM_ERROR, "System error"},
317
        {SD_RES_VDI_LOCKED, "VDI is already locked"},
318
        {SD_RES_NO_VDI, "No vdi found"},
319
        {SD_RES_NO_BASE_VDI, "No base VDI found"},
320
        {SD_RES_VDI_READ, "Failed read the requested VDI"},
321
        {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
322
        {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
323
        {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
324
        {SD_RES_NO_TAG, "Failed to find the requested tag"},
325
        {SD_RES_STARTUP, "The system is still booting"},
326
        {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
327
        {SD_RES_SHUTDOWN, "The system is shutting down"},
328
        {SD_RES_NO_MEM, "Out of memory on the server"},
329
        {SD_RES_FULL_VDI, "We already have the maximum vdis"},
330
        {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
331
        {SD_RES_NO_SPACE, "Server has no space for new objects"},
332
        {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
333
        {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
334
        {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
335
    };
336

    
337
    for (i = 0; i < ARRAY_SIZE(errors); ++i) {
338
        if (errors[i].err == err) {
339
            return errors[i].desc;
340
        }
341
    }
342

    
343
    return "Invalid error code";
344
}
345

    
346
/*
347
 * Sheepdog I/O handling:
348
 *
349
 * 1. In the sd_aio_readv/writev, read/write requests are added to the
350
 *    QEMU Bottom Halves.
351
 *
352
 * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O
353
 *    requests to the server and link the requests to the
354
 *    outstanding_list in the BDRVSheepdogState.  We exit the
355
 *    function without waiting for receiving the response.
356
 *
357
 * 3. We receive the response in aio_read_response, the fd handler to
358
 *    the sheepdog connection.  If metadata update is needed, we send
359
 *    the write request to the vdi object in sd_write_done, the write
360
 *    completion function.  The AIOCB callback is not called until all
361
 *    the requests belonging to the AIOCB are finished.
362
 */
363

    
364
/*
 * Allocate a request for `acb', fill it from the arguments, give it
 * the next sequence number of `s' and link it into both the device's
 * outstanding list and the AIOCB's request list.
 */
static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
                                    uint64_t oid, unsigned int data_len,
                                    uint64_t offset, uint8_t flags,
                                    uint64_t base_oid, unsigned int iov_offset)
{
    AIOReq *req = qemu_malloc(sizeof(*req));

    req->aiocb = acb;
    req->iov_offset = iov_offset;
    req->oid = oid;
    req->base_oid = base_oid;
    req->offset = offset;
    req->data_len = data_len;
    req->flags = flags;
    req->id = s->aioreq_seq_num++;

    QLIST_INSERT_HEAD(&s->outstanding_aio_head, req,
                      outstanding_aio_siblings);
    QLIST_INSERT_HEAD(&acb->aioreq_head, req, aioreq_siblings);

    return req;
}
387

    
388
/*
 * Unlink `aio_req' from both lists and free it.
 *
 * Returns non-zero while the owning AIOCB still has other requests
 * pending, zero when this was its last one.
 */
static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
{
    /* remember the owner before the request memory goes away */
    SheepdogAIOCB *owner = aio_req->aiocb;

    QLIST_REMOVE(aio_req, outstanding_aio_siblings);
    QLIST_REMOVE(aio_req, aioreq_siblings);
    qemu_free(aio_req);

    return !QLIST_EMPTY(&owner->aioreq_head);
}
397

    
398
/*
 * Complete an AIOCB: invoke the user's completion callback with
 * acb->ret unless the request was canceled, then release the AIOCB.
 */
static void sd_finish_aiocb(SheepdogAIOCB *acb)
{
    if (!acb->canceled) {
        acb->common.cb(acb->common.opaque, acb->ret);
    }
    qemu_aio_release(acb);
}
405

    
406
/*
 * Cancel hook of the AIO pool.
 *
 * Sheepdog cannot cancel the requests which are already sent to the
 * servers, so the request is completed with -EIO here and marked as
 * canceled; sd_finish_aiocb() then skips the callback.
 */
static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
{
    SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;

    acb->common.cb(acb->common.opaque, -EIO);
    acb->canceled = 1;
}
417

    
418
static AIOPool sd_aio_pool = {
419
    .aiocb_size = sizeof(SheepdogAIOCB),
420
    .cancel = sd_aio_cancel,
421
};
422

    
423
static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
424
                                   int64_t sector_num, int nb_sectors,
425
                                   BlockDriverCompletionFunc *cb, void *opaque)
426
{
427
    SheepdogAIOCB *acb;
428

    
429
    acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
430

    
431
    acb->qiov = qiov;
432

    
433
    acb->sector_num = sector_num;
434
    acb->nb_sectors = nb_sectors;
435

    
436
    acb->aio_done_func = NULL;
437
    acb->canceled = 0;
438
    acb->bh = NULL;
439
    acb->ret = 0;
440
    QLIST_INIT(&acb->aioreq_head);
441
    return acb;
442
}
443

    
444
/*
 * Create a bottom half running `cb' with `acb' as argument and
 * schedule it.
 *
 * Returns 0 on success, -EIO when the AIOCB already has a bottom half
 * attached.
 */
static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
{
    if (acb->bh) {
        error_report("bug: %d %d", acb->aiocb_type, acb->aiocb_type);
        return -EIO;
    }

    acb->bh = qemu_bh_new(cb, acb);
    qemu_bh_schedule(acb->bh);
    return 0;
}
455

    
456
#ifdef _WIN32

/* Minimal stand-in for the POSIX msghdr used by do_send_recv(). */
struct msghdr {
    struct iovec *msg_iov;
    size_t        msg_iovlen;
};

/*
 * sendmsg() replacement: gather all iovec elements into one temporary
 * buffer and pass it to send().  Returns what send() returns.
 */
static ssize_t sendmsg(int s, const struct msghdr *msg, int flags)
{
    size_t size = 0;
    size_t i;
    char *buf, *p;
    int ret;

    /* count the msg size */
    for (i = 0; i < msg->msg_iovlen; i++) {
        size += msg->msg_iov[i].iov_len;
    }
    buf = qemu_malloc(size);

    p = buf;
    for (i = 0; i < msg->msg_iovlen; i++) {
        memcpy(p, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len);
        p += msg->msg_iov[i].iov_len;
    }

    ret = send(s, buf, size, flags);

    qemu_free(buf);
    return ret;
}

/*
 * recvmsg() replacement: receive into one temporary buffer and scatter
 * the received bytes over the iovec elements.  Returns what
 * qemu_recv() returns.
 */
static ssize_t recvmsg(int s, struct msghdr *msg, int flags)
{
    size_t size = 0;
    size_t i, left, chunk;
    char *buf, *p;
    int ret;

    /* count the msg size */
    for (i = 0; i < msg->msg_iovlen; i++) {
        size += msg->msg_iov[i].iov_len;
    }
    buf = qemu_malloc(size);

    ret = qemu_recv(s, buf, size, flags);
    if (ret < 0) {
        goto out;
    }

    /*
     * Copy only what was actually received: on a short read the rest
     * of the temporary buffer is uninitialised and must not reach the
     * caller's buffers.
     */
    p = buf;
    left = ret;
    for (i = 0; i < msg->msg_iovlen && left > 0; i++) {
        chunk = msg->msg_iov[i].iov_len;
        if (chunk > left) {
            chunk = left;
        }
        memcpy(msg->msg_iov[i].iov_base, p, chunk);
        p += chunk;
        left -= chunk;
    }
out:
    qemu_free(buf);
    return ret;
}

#endif
515

    
516
/*
517
 * Send/recv data with iovec buffers
518
 *
519
 * This function send/recv data from/to the iovec buffer directly.
520
 * The first `offset' bytes in the iovec buffer are skipped and next
521
 * `len' bytes are used.
522
 *
523
 * For example,
524
 *
525
 *   do_send_recv(sockfd, iov, len, offset, 1);
526
 *
527
 * is equivalent to
528
 *
529
 *   char *buf = malloc(size);
530
 *   iov_to_buf(iov, iovcnt, buf, offset, size);
531
 *   send(sockfd, buf, size, 0);
532
 *   free(buf);
533
 */
534
/*
 * Perform one sendmsg()/recvmsg() on the byte range
 * [offset, offset + len) of the iovec array `iov'.
 *
 * The array is trimmed in place for the duration of the call and
 * restored afterwards.  Returns the result of the underlying call,
 * which may be a short count or negative on error.
 *
 * The caller must ensure the range lies inside the array and that
 * len is greater than zero: the walks below have no end-of-array
 * check.
 */
static int do_send_recv(int sockfd, struct iovec *iov, int len, int offset,
                        int write)
{
    struct msghdr msg;
    int ret, diff;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;

    /* from here on `len' is the end position of the range */
    len += offset;

    /* advance `iov' to the element that contains the end position */
    while (iov->iov_len < len) {
        len -= iov->iov_len;

        iov++;
        msg.msg_iovlen++;
    }

    /* cut the tail of the last element; `diff' is what has to be restored */
    diff = iov->iov_len - len;
    iov->iov_len -= diff;

    /* drop the leading elements that lie entirely before `offset' */
    while (msg.msg_iov->iov_len <= offset) {
        offset -= msg.msg_iov->iov_len;

        msg.msg_iov++;
        msg.msg_iovlen--;
    }

    /* skip the remaining part of `offset' inside the first element */
    msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base + offset;
    msg.msg_iov->iov_len -= offset;

    if (write) {
        ret = sendmsg(sockfd, &msg, 0);
    } else {
        ret = recvmsg(sockfd, &msg, 0);
    }

    /* undo the head and tail adjustments */
    msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base - offset;
    msg.msg_iov->iov_len += offset;

    iov->iov_len += diff;
    return ret;
}
578

    
579
/*
 * Open a TCP connection to the sheepdog server at addr:port.
 *
 * When `addr' is NULL both the address and the port fall back to
 * SD_DEFAULT_ADDR / SD_DEFAULT_PORT.  Every address returned by
 * getaddrinfo() is tried in turn until one connects.
 *
 * Returns the connected socket, or -1 on failure.
 */
static int connect_to_sdog(const char *addr, const char *port)
{
    char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
    int fd, ret;
    struct addrinfo hints, *res, *res0;

    if (!addr) {
        addr = SD_DEFAULT_ADDR;
        port = SD_DEFAULT_PORT;
    }

    memset(&hints, 0, sizeof(hints));
    hints.ai_socktype = SOCK_STREAM;

    ret = getaddrinfo(addr, port, &hints, &res0);
    if (ret) {
        /* getaddrinfo() reports errors via its return value, not errno */
        error_report("unable to get address info %s, %s",
                     addr, gai_strerror(ret));
        return -1;
    }

    for (res = res0; res; res = res->ai_next) {
        ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
                          sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
        if (ret) {
            continue;
        }

        fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
        if (fd < 0) {
            continue;
        }

        /* retry when interrupted by a signal */
        do {
            ret = connect(fd, res->ai_addr, res->ai_addrlen);
        } while (ret < 0 && errno == EINTR);

        if (ret < 0) {
            /* do not leak the socket; move on to the next address */
            closesocket(fd);
            continue;
        }

        dprintf("connected to %s:%s\n", addr, port);
        goto success;
    }
    fd = -1;
    error_report("failed connect to %s:%s", addr, port);
success:
    freeaddrinfo(res0);
    return fd;
}
630

    
631
/*
 * Transfer exactly `len' bytes between the socket and the iovec array,
 * starting `iov_offset' bytes into the array.  `write' selects the
 * direction (non-zero sends).
 *
 * Short transfers are continued; EINTR and EAGAIN are retried
 * immediately.  Returns 0 once everything was transferred and 1 on
 * failure.  A zero-byte transfer while data is still outstanding means
 * the connection cannot make progress (for a read: the peer closed
 * it) and is reported as a failure with errno set to EIO instead of
 * retrying forever.
 */
static int do_readv_writev(int sockfd, struct iovec *iov, int len,
                           int iov_offset, int write)
{
    int ret;

    /* nothing is transferred for a zero-length request */
    while (len) {
        ret = do_send_recv(sockfd, iov, len, iov_offset, write);
        if (ret < 0) {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("failed to recv a rsp, %s", strerror(errno));
            return 1;
        }
        if (ret == 0) {
            errno = EIO;
            error_report("failed to recv a rsp, %s", strerror(errno));
            return 1;
        }

        iov_offset += ret;
        len -= ret;
    }

    return 0;
}
653

    
654
/* Receive `len' bytes into `iov' starting at iov_offset; 0 on success. */
static int do_readv(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    return do_readv_writev(sockfd, iov, len, iov_offset, 0);
}
658

    
659
/* Send `len' bytes from `iov' starting at iov_offset; 0 on success. */
static int do_writev(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    return do_readv_writev(sockfd, iov, len, iov_offset, 1);
}
663

    
664
/*
 * Transfer `len' bytes between the socket and the flat buffer `buf' by
 * wrapping it in a single-element iovec.  Returns 0 on success.
 */
static int do_read_write(int sockfd, void *buf, int len, int write)
{
    struct iovec single;

    single.iov_base = buf;
    single.iov_len = len;

    return do_readv_writev(sockfd, &single, len, 0, write);
}
673

    
674
/* Receive exactly `len' bytes into `buf'; 0 on success. */
static int do_read(int sockfd, void *buf, int len)
{
    return do_read_write(sockfd, buf, len, 0);
}
678

    
679
/* Send exactly `len' bytes from `buf'; 0 on success. */
static int do_write(int sockfd, void *buf, int len)
{
    return do_read_write(sockfd, buf, len, 1);
}
683

    
684
/*
 * Send the request header followed by *wlen bytes of payload from
 * `data' (no payload when *wlen is 0).
 *
 * Returns 0 on success, -1 on failure.
 */
static int send_req(int sockfd, SheepdogReq *hdr, void *data,
                    unsigned int *wlen)
{
    int ret;
    struct iovec iov[2];

    /* keep the payload slot well-defined even when it stays unused */
    memset(iov, 0, sizeof(iov));

    iov[0].iov_base = hdr;
    iov[0].iov_len = sizeof(*hdr);

    if (*wlen) {
        iov[1].iov_base = data;
        iov[1].iov_len = *wlen;
    }

    ret = do_writev(sockfd, iov, sizeof(*hdr) + *wlen, 0);
    if (ret) {
        error_report("failed to send a req, %s", strerror(errno));
        ret = -1;
    }

    return ret;
}
706

    
707
/*
 * Perform one synchronous request/response exchange.
 *
 * The request in `hdr' is sent with *wlen bytes of payload from
 * `data'; the response header is then read back into `hdr'.  On
 * return *rlen is reduced to the response's data_length if that is
 * smaller, and that many bytes of response payload have been read
 * into `data'.
 *
 * Returns 0 on success, -1 on any send/receive failure.
 */
static int do_req(int sockfd, SheepdogReq *hdr, void *data,
                  unsigned int *wlen, unsigned int *rlen)
{
    if (send_req(sockfd, hdr, data, wlen)) {
        return -1;
    }

    if (do_read(sockfd, hdr, sizeof(*hdr))) {
        error_report("failed to get a rsp, %s", strerror(errno));
        return -1;
    }

    /* never read more payload than the server announced */
    if (*rlen > hdr->data_length) {
        *rlen = hdr->data_length;
    }

    if (*rlen && do_read(sockfd, data, *rlen)) {
        error_report("failed to get the data, %s", strerror(errno));
        return -1;
    }

    return 0;
}
741

    
742
/* Forward declaration; needed by send_pending_req(). */
static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
                           struct iovec *iov, int niov, int create,
                           enum AIOCBState aiocb_type);
745

    
746
/*
747
 * This function searches for pending requests to the object `oid', and
748
 * sends them.
749
 */
750
/*
 * Resend every outstanding request that targets object `oid', except
 * the one whose sequence number is `id'.
 *
 * A request that cannot be sent is dropped.  Its AIOCB is marked as
 * failed with -EIO so the error reaches the completion callback, and
 * is finished right away if that was its last request.
 */
static void send_pending_req(BDRVSheepdogState *s, uint64_t oid, uint32_t id)
{
    AIOReq *aio_req, *next;
    SheepdogAIOCB *acb;
    int ret;

    QLIST_FOREACH_SAFE(aio_req, &s->outstanding_aio_head,
                       outstanding_aio_siblings, next) {
        if (id == aio_req->id || aio_req->oid != oid) {
            continue;
        }

        acb = aio_req->aiocb;
        ret = add_aio_request(s, aio_req, acb->qiov->iov,
                              acb->qiov->niov, 0, acb->aiocb_type);
        if (ret < 0) {
            error_report("add_aio_request is failed");
            /* without this the AIOCB would complete with ret == 0 */
            acb->ret = -EIO;
            free_aio_req(s, aio_req);
            if (QLIST_EMPTY(&acb->aioreq_head)) {
                sd_finish_aiocb(acb);
            }
        }
    }
}
777

    
778
/*
779
 * Receive responses of the I/O requests.
780
 *
781
 * This function is registered as a fd handler, and called from the
782
 * main loop when s->fd is ready for reading responses.
783
 */
784
static void aio_read_response(void *opaque)
785
{
786
    SheepdogObjRsp rsp;
787
    BDRVSheepdogState *s = opaque;
788
    int fd = s->fd;
789
    int ret;
790
    AIOReq *aio_req = NULL;
791
    SheepdogAIOCB *acb;
792
    int rest;
793
    unsigned long idx;
794

    
795
    if (QLIST_EMPTY(&s->outstanding_aio_head)) {
796
        return;
797
    }
798

    
799
    /* read a header */
800
    ret = do_read(fd, &rsp, sizeof(rsp));
801
    if (ret) {
802
        error_report("failed to get the header, %s", strerror(errno));
803
        return;
804
    }
805

    
806
    /* find the right aio_req from the outstanding_aio list */
807
    QLIST_FOREACH(aio_req, &s->outstanding_aio_head, outstanding_aio_siblings) {
808
        if (aio_req->id == rsp.id) {
809
            break;
810
        }
811
    }
812
    if (!aio_req) {
813
        error_report("cannot find aio_req %x", rsp.id);
814
        return;
815
    }
816

    
817
    acb = aio_req->aiocb;
818

    
819
    switch (acb->aiocb_type) {
820
    case AIOCB_WRITE_UDATA:
821
        if (!is_data_obj(aio_req->oid)) {
822
            break;
823
        }
824
        idx = data_oid_to_idx(aio_req->oid);
825

    
826
        if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) {
827
            /*
828
             * If the object is newly created one, we need to update
829
             * the vdi object (metadata object).  min_dirty_data_idx
830
             * and max_dirty_data_idx are changed to include updated
831
             * index between them.
832
             */
833
            s->inode.data_vdi_id[idx] = s->inode.vdi_id;
834
            s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
835
            s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
836

    
837
            /*
838
             * Some requests may be blocked because simultaneous
839
             * create requests are not allowed, so we search the
840
             * pending requests here.
841
             */
842
            send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx), rsp.id);
843
        }
844
        break;
845
    case AIOCB_READ_UDATA:
846
        ret = do_readv(fd, acb->qiov->iov, rsp.data_length,
847
                       aio_req->iov_offset);
848
        if (ret) {
849
            error_report("failed to get the data, %s", strerror(errno));
850
            return;
851
        }
852
        break;
853
    }
854

    
855
    if (rsp.result != SD_RES_SUCCESS) {
856
        acb->ret = -EIO;
857
        error_report("%s", sd_strerror(rsp.result));
858
    }
859

    
860
    rest = free_aio_req(s, aio_req);
861
    if (!rest) {
862
        /*
863
         * We've finished all requests which belong to the AIOCB, so
864
         * we can call the callback now.
865
         */
866
        acb->aio_done_func(acb);
867
    }
868
}
869

    
870
static int aio_flush_request(void *opaque)
871
{
872
    BDRVSheepdogState *s = opaque;
873

    
874
    return !QLIST_EMPTY(&s->outstanding_aio_head);
875
}
876

    
877
/*
 * Set (v != 0) or clear (v == 0) TCP_CORK on `fd'.
 *
 * Where SOL_TCP or TCP_CORK is not available this is a no-op that
 * returns 0; otherwise the result of setsockopt() is returned.
 */
static int set_cork(int fd, int v)
{
#if !defined(SOL_TCP) || !defined(TCP_CORK)
    (void)fd;
    (void)v;
    return 0;
#else
    return setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v));
#endif
}
892

    
893
/* Enable TCP_NODELAY on `fd'; returns the result of setsockopt(). */
static int set_nodelay(int fd)
{
    int enable = 1;

    return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&enable,
                      sizeof(enable));
}
901

    
902
/*
903
 * Return a socket descriptor to read/write objects.
904
 *
905
 * We cannot use this descriptor for other operations because
906
 * the block driver may be waiting for a response from the server.
907
 */
908
/*
 * Connect to the server of `s', switch the socket to non-blocking
 * mode with TCP_NODELAY and register aio_read_response as its read
 * handler.
 *
 * Returns the socket, or -1 on failure (the socket is closed again if
 * it had already been opened).
 */
static int get_sheep_fd(BDRVSheepdogState *s)
{
    int fd = connect_to_sdog(s->addr, s->port);

    if (fd < 0) {
        error_report("%s", strerror(errno));
        return -1;
    }

    socket_set_nonblock(fd);

    if (set_nodelay(fd)) {
        error_report("%s", strerror(errno));
        closesocket(fd);
        return -1;
    }

    qemu_aio_set_fd_handler(fd, aio_read_response, NULL, aio_flush_request,
                            NULL, s);
    return fd;
}
931

    
932
/*
933
 * Parse a filename
934
 *
935
 * filename must be one of the following formats:
936
 *   1. [vdiname]
937
 *   2. [vdiname]:[snapid]
938
 *   3. [vdiname]:[tag]
939
 *   4. [hostname]:[port]:[vdiname]
940
 *   5. [hostname]:[port]:[vdiname]:[snapid]
941
 *   6. [hostname]:[port]:[vdiname]:[tag]
942
 *
943
 * You can boot from the snapshot images by specifying `snapid` or
944
 * `tag'.
945
 *
946
 * You can run VMs outside the Sheepdog cluster by specifying
947
 * `hostname' and `port' (experimental).
948
 */
949
/*
 * Split `filename' into server address, port, vdi name and snapshot
 * id or tag.
 *
 * With two or more ':' separators the first two tokens become s->addr
 * and s->port; both then point into a duplicate of `filename' that
 * stays allocated.  Otherwise s->addr and s->port are set to NULL.
 *
 * `vdi' must provide SD_MAX_VDI_LEN bytes and `tag' SD_MAX_VDI_TAG_LEN
 * bytes.  A longer vdi specification is truncated; `vdi' is always
 * NUL-terminated.  If a ':' follows the vdi name, the text after it is
 * parsed as a decimal snapshot id; when that yields 0 the text is
 * copied to `tag' instead.  `tag' is not written in any other case.
 *
 * Always returns 0.
 */
static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
                         char *vdi, uint32_t *snapid, char *tag)
{
    char *p, *q;
    int nr_sep;

    p = q = qemu_strdup(filename);

    /* count the number of separators */
    nr_sep = 0;
    while (*p) {
        if (*p == ':') {
            nr_sep++;
        }
        p++;
    }
    p = q;

    /* use the first two tokens as hostname and port number. */
    if (nr_sep >= 2) {
        s->addr = p;
        p = strchr(p, ':');
        *p++ = '\0';

        s->port = p;
        p = strchr(p, ':');
        *p++ = '\0';
    } else {
        s->addr = NULL;
        s->port = NULL;
    }

    /*
     * strncpy() does not terminate the destination when the source is
     * too long; terminate explicitly so strchr() below stays in bounds.
     */
    strncpy(vdi, p, SD_MAX_VDI_LEN);
    vdi[SD_MAX_VDI_LEN - 1] = '\0';

    p = strchr(vdi, ':');
    if (p) {
        *p++ = '\0';
        *snapid = strtoul(p, NULL, 10);
        if (*snapid == 0) {
            strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
        }
    } else {
        *snapid = CURRENT_VDI_ID; /* search current vdi */
    }

    /* the duplicate is only kept when s->addr/s->port point into it */
    if (s->addr == NULL) {
        qemu_free(q);
    }

    return 0;
}
1000

    
1001
/*
 * Ask the server for the id of the vdi named `filename' with the given
 * snapshot id / tag and store it in *vid.
 *
 * The request is SD_OP_GET_VDI_INFO when `for_snapshot' is non-zero
 * and SD_OP_LOCK_VDI otherwise.  A dedicated connection is opened for
 * the exchange and closed again before returning.
 *
 * Returns 0 on success, -1 on connection, transfer or server error.
 */
static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
                         char *tag, uint32_t *vid, int for_snapshot)
{
    int ret, fd;
    SheepdogVdiReq hdr;
    /* the response is read back into the request header's storage */
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    unsigned int wlen, rlen = 0;
    char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        return -1;
    }

    /* payload: fixed-size name field followed by fixed-size tag field */
    memset(buf, 0, sizeof(buf));
    strncpy(buf, filename, SD_MAX_VDI_LEN);
    strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);

    memset(&hdr, 0, sizeof(hdr));
    hdr.opcode = for_snapshot ? SD_OP_GET_VDI_INFO : SD_OP_LOCK_VDI;
    wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
    hdr.proto_ver = SD_PROTO_VER;
    hdr.data_length = wlen;
    hdr.snapid = snapid;
    hdr.flags = SD_FLAG_CMD_WRITE;

    ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
    if (ret) {
        ret = -1;
        goto out;
    }

    if (rsp->result != SD_RES_SUCCESS) {
        error_report("cannot get vdi info, %s, %s %d %s",
                     sd_strerror(rsp->result), filename, snapid, tag);
        ret = -1;
        goto out;
    }
    *vid = rsp->vdi_id;

    ret = 0;
out:
    closesocket(fd);
    return ret;
}
1050

    
1051
/*
 * Build an object request for `aio_req' and send it on s->fd.
 *
 * AIOCB_READ_UDATA produces a read request with no payload; otherwise a
 * write request is produced (create-and-write when `create' is set) and
 * `aio_req->data_len' bytes taken from `iov', starting at
 * `aio_req->iov_offset', follow the header.  `niov' is accepted for
 * interface compatibility but is not used here.
 *
 * Returns 0 on success and -EIO if the header or payload could not be
 * sent.  The set_cork(..., 1) issued before sending is always undone
 * with set_cork(..., 0), including on the error paths.
 */
static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
                           struct iovec *iov, int niov, int create,
                           enum AIOCBState aiocb_type)
{
    int nr_copies = s->inode.nr_copies;
    SheepdogObjReq hdr;
    unsigned int wlen;
    int ret;
    int rc = 0;
    uint64_t oid = aio_req->oid;
    unsigned int datalen = aio_req->data_len;
    uint64_t offset = aio_req->offset;
    uint8_t flags = aio_req->flags;
    uint64_t old_oid = aio_req->base_oid;

    if (!nr_copies) {
        error_report("bug");
    }

    memset(&hdr, 0, sizeof(hdr));

    if (aiocb_type == AIOCB_READ_UDATA) {
        wlen = 0;
        hdr.opcode = SD_OP_READ_OBJ;
        hdr.flags = flags;
    } else if (create) {
        wlen = datalen;
        hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
        hdr.flags = SD_FLAG_CMD_WRITE | flags;
    } else {
        wlen = datalen;
        hdr.opcode = SD_OP_WRITE_OBJ;
        hdr.flags = SD_FLAG_CMD_WRITE | flags;
    }

    hdr.oid = oid;
    hdr.cow_oid = old_oid;
    hdr.copies = nr_copies;

    hdr.data_length = datalen;
    hdr.offset = offset;

    hdr.id = aio_req->id;

    set_cork(s->fd, 1);

    /* send a header */
    ret = do_write(s->fd, &hdr, sizeof(hdr));
    if (ret) {
        /* report before uncorking so errno still belongs to the write */
        error_report("failed to send a req, %s", strerror(errno));
        rc = -EIO;
        goto out;
    }

    if (wlen) {
        ret = do_writev(s->fd, iov, wlen, aio_req->iov_offset);
        if (ret) {
            error_report("failed to send a data, %s", strerror(errno));
            rc = -EIO;
        }
    }

out:
    /* never leave the shared socket corked, even after a failed send */
    set_cork(s->fd, 0);

    return rc;
}
1115

    
1116
static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
1117
                             unsigned int datalen, uint64_t offset,
1118
                             int write, int create)
1119
{
1120
    SheepdogObjReq hdr;
1121
    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1122
    unsigned int wlen, rlen;
1123
    int ret;
1124

    
1125
    memset(&hdr, 0, sizeof(hdr));
1126

    
1127
    if (write) {
1128
        wlen = datalen;
1129
        rlen = 0;
1130
        hdr.flags = SD_FLAG_CMD_WRITE;
1131
        if (create) {
1132
            hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1133
        } else {
1134
            hdr.opcode = SD_OP_WRITE_OBJ;
1135
        }
1136
    } else {
1137
        wlen = 0;
1138
        rlen = datalen;
1139
        hdr.opcode = SD_OP_READ_OBJ;
1140
    }
1141
    hdr.oid = oid;
1142
    hdr.data_length = datalen;
1143
    hdr.offset = offset;
1144
    hdr.copies = copies;
1145

    
1146
    ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1147
    if (ret) {
1148
        error_report("failed to send a request to the sheep");
1149
        return -1;
1150
    }
1151

    
1152
    switch (rsp->result) {
1153
    case SD_RES_SUCCESS:
1154
        return 0;
1155
    default:
1156
        error_report("%s", sd_strerror(rsp->result));
1157
        return -1;
1158
    }
1159
}
1160

    
1161
/*
 * Read `datalen' bytes at `offset' of object `oid' into `buf'.
 * Thin wrapper around read_write_object(); returns its result.
 */
static int read_object(int fd, char *buf, uint64_t oid, int copies,
                       unsigned int datalen, uint64_t offset)
{
    const int is_write = 0;
    const int do_create = 0;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, do_create);
}
1166

    
1167
/*
 * Write `datalen' bytes from `buf' at `offset' of object `oid'; the
 * `create' flag is forwarded unchanged.  Thin wrapper around
 * read_write_object(); returns its result.
 */
static int write_object(int fd, char *buf, uint64_t oid, int copies,
                        unsigned int datalen, uint64_t offset, int create)
{
    const int is_write = 1;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, create);
}
1172

    
1173
/*
 * Open the vdi described by `filename' (optionally prefixed with
 * "sheepdog:").
 *
 * Steps: parse the name, obtain the long-lived socket s->fd, resolve the
 * vdi id with find_vdi_name(), then read the inode object over a
 * short-lived second connection and cache it in s->inode.
 *
 * Returns 0 on success and -1 on failure.  On failure s->fd, if it was
 * obtained, is closed again.
 */
static int sd_open(BlockDriverState *bs, const char *filename, int flags)
{
    int ret, fd;
    uint32_t vid = 0;
    BDRVSheepdogState *s = bs->opaque;
    char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
    uint32_t snapid;
    char *buf = NULL;

    /* skip the protocol prefix when present */
    strstart(filename, "sheepdog:", &filename);

    QLIST_INIT(&s->outstanding_aio_head);
    s->fd = -1;

    memset(vdi, 0, sizeof(vdi));
    memset(tag, 0, sizeof(tag));
    if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
        goto out;
    }
    s->fd = get_sheep_fd(s);
    if (s->fd < 0) {
        goto out;
    }

    ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
    if (ret) {
        goto out;
    }

    if (snapid) {
        dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
        s->is_snapshot = 1;
    }

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        error_report("failed to connect");
        goto out;
    }

    buf = qemu_malloc(SD_INODE_SIZE);
    ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0);

    closesocket(fd);

    if (ret) {
        goto out;
    }

    memcpy(&s->inode, buf, sizeof(s->inode));
    /* min > max means that no data_vdi_id entry is dirty */
    s->min_dirty_data_idx = UINT32_MAX;
    s->max_dirty_data_idx = 0;

    bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
    strncpy(s->name, vdi, sizeof(s->name));
    qemu_free(buf);
    return 0;
out:
    /*
     * Only touch the descriptor if it was actually obtained; the
     * handler (presumably installed by get_sheep_fd()) must be removed
     * before the socket is closed.
     */
    if (s->fd >= 0) {
        qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
        closesocket(s->fd);
    }
    qemu_free(buf);
    return -1;
}
1238

    
1239
static int do_sd_create(char *filename, int64_t vdi_size,
1240
                        uint32_t base_vid, uint32_t *vdi_id, int snapshot,
1241
                        const char *addr, const char *port)
1242
{
1243
    SheepdogVdiReq hdr;
1244
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1245
    int fd, ret;
1246
    unsigned int wlen, rlen = 0;
1247
    char buf[SD_MAX_VDI_LEN];
1248

    
1249
    fd = connect_to_sdog(addr, port);
1250
    if (fd < 0) {
1251
        return -EIO;
1252
    }
1253

    
1254
    memset(buf, 0, sizeof(buf));
1255
    strncpy(buf, filename, SD_MAX_VDI_LEN);
1256

    
1257
    memset(&hdr, 0, sizeof(hdr));
1258
    hdr.opcode = SD_OP_NEW_VDI;
1259
    hdr.base_vdi_id = base_vid;
1260

    
1261
    wlen = SD_MAX_VDI_LEN;
1262

    
1263
    hdr.flags = SD_FLAG_CMD_WRITE;
1264
    hdr.snapid = snapshot;
1265

    
1266
    hdr.data_length = wlen;
1267
    hdr.vdi_size = vdi_size;
1268

    
1269
    ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1270

    
1271
    closesocket(fd);
1272

    
1273
    if (ret) {
1274
        return -EIO;
1275
    }
1276

    
1277
    if (rsp->result != SD_RES_SUCCESS) {
1278
        error_report("%s, %s", sd_strerror(rsp->result), filename);
1279
        return -EIO;
1280
    }
1281

    
1282
    if (vdi_id) {
1283
        *vdi_id = rsp->vdi_id;
1284
    }
1285

    
1286
    return 0;
1287
}
1288

    
1289
static int sd_prealloc(const char *filename)
1290
{
1291
    BlockDriverState *bs = NULL;
1292
    uint32_t idx, max_idx;
1293
    int64_t vdi_size;
1294
    void *buf = qemu_mallocz(SD_DATA_OBJ_SIZE);
1295
    int ret;
1296

    
1297
    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
1298
    if (ret < 0) {
1299
        goto out;
1300
    }
1301

    
1302
    vdi_size = bdrv_getlength(bs);
1303
    if (vdi_size < 0) {
1304
        ret = vdi_size;
1305
        goto out;
1306
    }
1307
    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
1308

    
1309
    for (idx = 0; idx < max_idx; idx++) {
1310
        /*
1311
         * The created image can be a cloned image, so we need to read
1312
         * a data from the source image.
1313
         */
1314
        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
1315
        if (ret < 0) {
1316
            goto out;
1317
        }
1318
        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
1319
        if (ret < 0) {
1320
            goto out;
1321
        }
1322
    }
1323
out:
1324
    if (bs) {
1325
        bdrv_delete(bs);
1326
    }
1327
    qemu_free(buf);
1328

    
1329
    return ret;
1330
}
1331

    
1332
/*
 * Create a new vdi named by `filename' ("sheepdog:" prefix optional).
 *
 * Recognised options: BLOCK_OPT_SIZE, BLOCK_OPT_BACKING_FILE (must be a
 * sheepdog snapshot, used as clone base) and BLOCK_OPT_PREALLOC
 * ("off" or "full").
 *
 * Returns 0 on success, -EINVAL for bad arguments, -EIO if the backing
 * file cannot be opened, or whatever do_sd_create() / sd_prealloc()
 * report.
 */
static int sd_create(const char *filename, QEMUOptionParameter *options)
{
    int ret;
    uint32_t vid = 0, base_vid = 0;
    int64_t vdi_size = 0;
    char *backing_file = NULL;
    BDRVSheepdogState s;
    char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
    uint32_t snapid;
    int prealloc = 0;
    /* fall back to the whole string if the prefix is absent */
    const char *vdiname = filename;

    strstart(filename, "sheepdog:", &vdiname);

    memset(&s, 0, sizeof(s));
    memset(vdi, 0, sizeof(vdi));
    memset(tag, 0, sizeof(tag));
    if (parse_vdiname(&s, vdiname, vdi, &snapid, tag) < 0) {
        error_report("invalid filename");
        return -EINVAL;
    }

    /*
     * From here on s.addr may own memory handed out by parse_vdiname()
     * (sd_close() releases it the same way), so every exit goes through
     * the `out' label.
     */
    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
            vdi_size = options->value.n;
        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = options->value.s;
        } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
            if (!options->value.s || !strcmp(options->value.s, "off")) {
                prealloc = 0;
            } else if (!strcmp(options->value.s, "full")) {
                prealloc = 1;
            } else {
                error_report("Invalid preallocation mode: '%s'",
                             options->value.s);
                ret = -EINVAL;
                goto out;
            }
        }
        options++;
    }

    if (vdi_size > SD_MAX_VDI_SIZE) {
        error_report("too big image size");
        ret = -EINVAL;
        goto out;
    }

    if (backing_file) {
        BlockDriverState *bs;
        BDRVSheepdogState *base;
        BlockDriver *drv;

        /* Currently, only Sheepdog backing image is supported. */
        drv = bdrv_find_protocol(backing_file);
        if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
            error_report("backing_file must be a sheepdog image");
            ret = -EINVAL;
            goto out;
        }

        ret = bdrv_file_open(&bs, backing_file, 0);
        if (ret < 0) {
            ret = -EIO;
            goto out;
        }

        base = bs->opaque;

        if (!is_snapshot(&base->inode)) {
            error_report("cannot clone from a non snapshot vdi");
            bdrv_delete(bs);
            ret = -EINVAL;
            goto out;
        }

        base_vid = base->inode.vdi_id;
        bdrv_delete(bs);
    }

    ret = do_sd_create(vdi, vdi_size, base_vid, &vid, 0, s.addr, s.port);
    if (!prealloc || ret) {
        goto out;
    }

    ret = sd_prealloc(filename);
out:
    qemu_free(s.addr);
    return ret;
}
1413

    
1414
/*
 * Close the vdi: send SD_OP_RELEASE_VDI for s->name over a short-lived
 * connection, then tear down the long-lived socket s->fd and free
 * s->addr.
 *
 * A failure to reach the daemon for the release request no longer skips
 * the local teardown.  SD_RES_VDI_NOT_LOCKED is tolerated silently.
 */
static void sd_close(BlockDriverState *bs)
{
    BDRVSheepdogState *s = bs->opaque;
    SheepdogVdiReq hdr;
    /* the response is read back into the same storage as the request */
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    unsigned int wlen, rlen = 0;
    int fd, ret;

    dprintf("%s\n", s->name);

    fd = connect_to_sdog(s->addr, s->port);
    if (fd >= 0) {
        memset(&hdr, 0, sizeof(hdr));

        hdr.opcode = SD_OP_RELEASE_VDI;
        wlen = strlen(s->name) + 1;
        hdr.data_length = wlen;
        hdr.flags = SD_FLAG_CMD_WRITE;

        ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);

        closesocket(fd);

        if (!ret && rsp->result != SD_RES_SUCCESS &&
            rsp->result != SD_RES_VDI_NOT_LOCKED) {
            error_report("%s, %s", sd_strerror(rsp->result), s->name);
        }
    }

    /* local resources are released whether or not the release was sent */
    qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
    closesocket(s->fd);
    qemu_free(s->addr);
}
1449

    
1450
/* Return the vdi size recorded in the cached inode. */
static int64_t sd_getlength(BlockDriverState *bs)
{
    const BDRVSheepdogState *state = bs->opaque;
    int64_t length = state->inode.vdi_size;

    return length;
}
1456

    
1457
/*
 * Grow the vdi to `offset' bytes by rewriting the leading part of the
 * inode object (everything except the data_vdi_id table).
 *
 * Returns 0 on success, -EINVAL if `offset' would shrink the image or
 * exceeds SD_MAX_VDI_SIZE, and -EIO if the daemon cannot be reached or
 * the inode update fails.  On failure the cached vdi_size is left
 * unchanged.
 */
static int sd_truncate(BlockDriverState *bs, int64_t offset)
{
    BDRVSheepdogState *s = bs->opaque;
    int ret, fd;
    unsigned int datalen;
    uint64_t old_size;

    if (offset < s->inode.vdi_size) {
        error_report("shrinking is not supported");
        return -EINVAL;
    } else if (offset > SD_MAX_VDI_SIZE) {
        error_report("too big image size");
        return -EINVAL;
    }

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        return -EIO;
    }

    /* we don't need to update entire object */
    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
    old_size = s->inode.vdi_size;
    s->inode.vdi_size = offset;
    ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
                       s->inode.nr_copies, datalen, 0, 0);
    /* fd is a socket: close it the same way as everywhere else here */
    closesocket(fd);

    if (ret < 0) {
        /* keep the cached inode in step with what the daemon stores */
        s->inode.vdi_size = old_size;
        error_report("failed to update an inode.");
        return -EIO;
    }

    return 0;
}
1490

    
1491
/*
1492
 * This function is called after writing data objects.  If we need to
1493
 * update metadata, this sends a write request to the vdi object.
1494
 * Otherwise, this calls the AIOCB callback.
1495
 */
1496
static void sd_write_done(SheepdogAIOCB *acb)
1497
{
1498
    int ret;
1499
    BDRVSheepdogState *s = acb->common.bs->opaque;
1500
    struct iovec iov;
1501
    AIOReq *aio_req;
1502
    uint32_t offset, data_len, mn, mx;
1503

    
1504
    mn = s->min_dirty_data_idx;
1505
    mx = s->max_dirty_data_idx;
1506
    if (mn <= mx) {
1507
        /* we need to update the vdi object. */
1508
        offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
1509
            mn * sizeof(s->inode.data_vdi_id[0]);
1510
        data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
1511

    
1512
        s->min_dirty_data_idx = UINT32_MAX;
1513
        s->max_dirty_data_idx = 0;
1514

    
1515
        iov.iov_base = &s->inode;
1516
        iov.iov_len = sizeof(s->inode);
1517
        aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
1518
                                data_len, offset, 0, 0, offset);
1519
        ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
1520
        if (ret) {
1521
            free_aio_req(s, aio_req);
1522
            acb->ret = -EIO;
1523
            goto out;
1524
        }
1525

    
1526
        acb->aio_done_func = sd_finish_aiocb;
1527
        acb->aiocb_type = AIOCB_WRITE_UDATA;
1528
        return;
1529
    }
1530
out:
1531
    sd_finish_aiocb(acb);
1532
}
1533

    
1534
/*
1535
 * Create a writable VDI from a snapshot
1536
 */
1537
static int sd_create_branch(BDRVSheepdogState *s)
1538
{
1539
    int ret, fd;
1540
    uint32_t vid;
1541
    char *buf;
1542

    
1543
    dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
1544

    
1545
    buf = qemu_malloc(SD_INODE_SIZE);
1546

    
1547
    ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
1548
                       s->addr, s->port);
1549
    if (ret) {
1550
        goto out;
1551
    }
1552

    
1553
    dprintf("%" PRIx32 " is created.\n", vid);
1554

    
1555
    fd = connect_to_sdog(s->addr, s->port);
1556
    if (fd < 0) {
1557
        error_report("failed to connect");
1558
        goto out;
1559
    }
1560

    
1561
    ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1562
                      SD_INODE_SIZE, 0);
1563

    
1564
    closesocket(fd);
1565

    
1566
    if (ret < 0) {
1567
        goto out;
1568
    }
1569

    
1570
    memcpy(&s->inode, buf, sizeof(s->inode));
1571

    
1572
    s->is_snapshot = 0;
1573
    ret = 0;
1574
    dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
1575

    
1576
out:
1577
    qemu_free(buf);
1578

    
1579
    return ret;
1580
}
1581

    
1582
/*
1583
 * Send I/O requests to the server.
1584
 *
1585
 * This function sends requests to the server, links the requests to
1586
 * the outstanding_list in BDRVSheepdogState, and exits without
1587
 * waiting the response.  The responses are received in the
1588
 * `aio_read_response' function which is called from the main loop as
1589
 * a fd handler.
1590
 */
1591
static void sd_readv_writev_bh_cb(void *p)
1592
{
1593
    SheepdogAIOCB *acb = p;
1594
    int ret = 0;
1595
    unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
1596
    unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
1597
    uint64_t oid;
1598
    uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
1599
    BDRVSheepdogState *s = acb->common.bs->opaque;
1600
    SheepdogInode *inode = &s->inode;
1601
    AIOReq *aio_req;
1602

    
1603
    qemu_bh_delete(acb->bh);
1604
    acb->bh = NULL;
1605

    
1606
    if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
1607
        /*
1608
         * In the case we open the snapshot VDI, Sheepdog creates the
1609
         * writable VDI when we do a write operation first.
1610
         */
1611
        ret = sd_create_branch(s);
1612
        if (ret) {
1613
            acb->ret = -EIO;
1614
            goto out;
1615
        }
1616
    }
1617

    
1618
    while (done != total) {
1619
        uint8_t flags = 0;
1620
        uint64_t old_oid = 0;
1621
        int create = 0;
1622

    
1623
        oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
1624

    
1625
        len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
1626

    
1627
        if (!inode->data_vdi_id[idx]) {
1628
            if (acb->aiocb_type == AIOCB_READ_UDATA) {
1629
                goto done;
1630
            }
1631

    
1632
            create = 1;
1633
        } else if (acb->aiocb_type == AIOCB_WRITE_UDATA
1634
                   && !is_data_obj_writable(inode, idx)) {
1635
            /* Copy-On-Write */
1636
            create = 1;
1637
            old_oid = oid;
1638
            flags = SD_FLAG_CMD_COW;
1639
        }
1640

    
1641
        if (create) {
1642
            dprintf("update ino (%" PRIu32") %" PRIu64 " %" PRIu64
1643
                    " %" PRIu64 "\n", inode->vdi_id, oid,
1644
                    vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
1645
            oid = vid_to_data_oid(inode->vdi_id, idx);
1646
            dprintf("new oid %lx\n", oid);
1647
        }
1648

    
1649
        aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
1650

    
1651
        if (create) {
1652
            AIOReq *areq;
1653
            QLIST_FOREACH(areq, &s->outstanding_aio_head,
1654
                          outstanding_aio_siblings) {
1655
                if (areq == aio_req) {
1656
                    continue;
1657
                }
1658
                if (areq->oid == oid) {
1659
                    /*
1660
                     * Sheepdog cannot handle simultaneous create
1661
                     * requests to the same object.  So we cannot send
1662
                     * the request until the previous request
1663
                     * finishes.
1664
                     */
1665
                    aio_req->flags = 0;
1666
                    aio_req->base_oid = 0;
1667
                    goto done;
1668
                }
1669
            }
1670
        }
1671

    
1672
        ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
1673
                              create, acb->aiocb_type);
1674
        if (ret < 0) {
1675
            error_report("add_aio_request is failed");
1676
            free_aio_req(s, aio_req);
1677
            acb->ret = -EIO;
1678
            goto out;
1679
        }
1680
    done:
1681
        offset = 0;
1682
        idx++;
1683
        done += len;
1684
    }
1685
out:
1686
    if (QLIST_EMPTY(&acb->aioreq_head)) {
1687
        sd_finish_aiocb(acb);
1688
    }
1689
}
1690

    
1691
static BlockDriverAIOCB *sd_aio_writev(BlockDriverState *bs, int64_t sector_num,
1692
                                       QEMUIOVector *qiov, int nb_sectors,
1693
                                       BlockDriverCompletionFunc *cb,
1694
                                       void *opaque)
1695
{
1696
    SheepdogAIOCB *acb;
1697

    
1698
    if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
1699
        /* TODO: shouldn't block here */
1700
        if (sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE) < 0) {
1701
            return NULL;
1702
        }
1703
        bs->total_sectors = sector_num + nb_sectors;
1704
    }
1705

    
1706
    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
1707
    acb->aio_done_func = sd_write_done;
1708
    acb->aiocb_type = AIOCB_WRITE_UDATA;
1709

    
1710
    sd_schedule_bh(sd_readv_writev_bh_cb, acb);
1711
    return &acb->common;
1712
}
1713

    
1714
static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, int64_t sector_num,
1715
                                      QEMUIOVector *qiov, int nb_sectors,
1716
                                      BlockDriverCompletionFunc *cb,
1717
                                      void *opaque)
1718
{
1719
    SheepdogAIOCB *acb;
1720
    int i;
1721

    
1722
    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
1723
    acb->aiocb_type = AIOCB_READ_UDATA;
1724
    acb->aio_done_func = sd_finish_aiocb;
1725

    
1726
    /*
1727
     * TODO: we can do better; we don't need to initialize
1728
     * blindly.
1729
     */
1730
    for (i = 0; i < qiov->niov; i++) {
1731
        memset(qiov->iov[i].iov_base, 0, qiov->iov[i].iov_len);
1732
    }
1733

    
1734
    sd_schedule_bh(sd_readv_writev_bh_cb, acb);
1735
    return &acb->common;
1736
}
1737

    
1738
/*
 * Take a snapshot of the current vdi.
 *
 * The snapshot metadata from `sn_info' is written into the current
 * inode, a new vdi based on it is created with do_sd_create(), and the
 * head of the new inode (everything except the data_vdi_id table) is
 * read back into s->inode.
 *
 * Returns 0 on success, -EINVAL if the open vdi is itself a snapshot,
 * and -EIO on any communication failure.
 */
static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
{
    BDRVSheepdogState *s = bs->opaque;
    int ret, fd;
    uint32_t new_vid;
    SheepdogInode *inode = NULL;
    unsigned int datalen;

    dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %d "
            "is_snapshot %d\n", sn_info->name, sn_info->id_str,
            s->name, sn_info->vm_state_size, s->is_snapshot);

    if (s->is_snapshot) {
        error_report("You can't create a snapshot of a snapshot VDI, "
                     "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);

        return -EINVAL;
    }

    dprintf("%s %s\n", sn_info->name, sn_info->id_str);

    s->inode.vm_state_size = sn_info->vm_state_size;
    s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
    strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
    /* we don't need to update entire object */
    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);

    /* refresh inode. */
    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        /* no socket to close, so bypass the cleanup label */
        return -EIO;
    }

    ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
                       s->inode.nr_copies, datalen, 0, 0);
    if (ret < 0) {
        error_report("failed to write snapshot's inode.");
        ret = -EIO;
        goto cleanup;
    }

    ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
                       s->addr, s->port);
    if (ret < 0) {
        error_report("failed to create inode for snapshot. %s",
                     strerror(errno));
        ret = -EIO;
        goto cleanup;
    }

    inode = qemu_malloc(datalen);

    ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
                      s->inode.nr_copies, datalen, 0);

    if (ret < 0) {
        error_report("failed to read new inode info. %s", strerror(errno));
        ret = -EIO;
        goto cleanup;
    }

    memcpy(&s->inode, inode, datalen);
    dprintf("s->inode: name %s snap_id %x oid %x\n",
            s->inode.name, s->inode.snap_id, s->inode.vdi_id);

cleanup:
    /* the scratch inode is only a staging buffer; always release it */
    qemu_free(inode);
    closesocket(fd);
    return ret;
}
1808

    
1809
/*
 * Switch the open vdi to the snapshot identified by `snapshot_id'.
 *
 * `snapshot_id' is interpreted as a numeric snapshot id when it parses
 * to a non-zero number, and as a snapshot tag otherwise.  The snapshot's
 * inode is loaded into s->inode and s->is_snapshot is set.
 *
 * Returns 0 on success.  On failure the previous driver state is
 * restored and a negative errno value is returned (-ENOENT when the
 * snapshot cannot be found, read or is invalid, -EIO when the daemon
 * cannot be reached).
 */
static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
{
    BDRVSheepdogState *s = bs->opaque;
    BDRVSheepdogState *old_s;
    char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
    char *buf = NULL;
    uint32_t vid;
    uint32_t snapid = 0;
    int ret = -ENOENT, fd;

    /* keep a copy of the whole state so it can be rolled back */
    old_s = qemu_malloc(sizeof(BDRVSheepdogState));

    memcpy(old_s, s, sizeof(BDRVSheepdogState));

    memset(vdi, 0, sizeof(vdi));
    strncpy(vdi, s->name, sizeof(vdi));

    memset(tag, 0, sizeof(tag));
    snapid = strtoul(snapshot_id, NULL, 10);
    if (!snapid) {
        /* not a numeric id: look the snapshot up by its tag */
        strncpy(tag, snapshot_id, sizeof(tag));
    }

    ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
    if (ret) {
        error_report("Failed to find_vdi_name");
        ret = -ENOENT;
        goto out;
    }

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        error_report("failed to connect");
        /* ret is 0 from find_vdi_name(); do not report success */
        ret = -EIO;
        goto out;
    }

    buf = qemu_malloc(SD_INODE_SIZE);
    ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
                      SD_INODE_SIZE, 0);

    closesocket(fd);

    if (ret) {
        ret = -ENOENT;
        goto out;
    }

    memcpy(&s->inode, buf, sizeof(s->inode));

    if (!s->inode.vm_state_size) {
        error_report("Invalid snapshot");
        ret = -ENOENT;
        goto out;
    }

    s->is_snapshot = 1;

    qemu_free(buf);
    qemu_free(old_s);

    return 0;
out:
    /* recover bdrv_sd_state */
    memcpy(s, old_s, sizeof(BDRVSheepdogState));
    qemu_free(buf);
    qemu_free(old_s);

    error_report("failed to open. recover old bdrv_sd_state.");

    return ret;
}
1880

    
1881
/*
 * Snapshot deletion is not implemented: the arguments are ignored and
 * success (0) is reported unconditionally.
 */
static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    const int status = 0;

    /* FIXME: Delete specified snapshot id.  */
    return status;
}
1886

    
1887
/*
 * Build a table of the snapshots that belong to the open vdi.
 *
 * The in-use vdi bitmap is fetched with SD_OP_READ_VDIS, then vdi ids
 * are probed starting at the hash of s->name and stopping at the first
 * id that is not in use.  Each inode whose name equals s->name and for
 * which is_snapshot() holds contributes one entry (at most 1024).
 *
 * `*psn_tab' receives the table (NULL if it was never allocated) and
 * the number of entries found is returned.
 */
static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
{
    BDRVSheepdogState *s = bs->opaque;
    const int capacity = 1024;
    const int bitmap_bytes = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
    /* static so the inode is not placed on the stack */
    static SheepdogInode inode;
    QEMUSnapshotInfo *table = NULL;
    QEMUSnapshotInfo *entry;
    unsigned long *vdi_inuse;
    SheepdogReq req;
    unsigned wlen = 0, rlen = bitmap_bytes;
    unsigned int first_vid;
    uint64_t hash;
    uint32_t vid;
    int count = 0;
    int fd, err;

    vdi_inuse = qemu_malloc(bitmap_bytes);

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        goto out;
    }

    memset(&req, 0, sizeof(req));
    req.opcode = SD_OP_READ_VDIS;
    req.data_length = bitmap_bytes;

    err = do_req(fd, &req, vdi_inuse, &wlen, &rlen);
    closesocket(fd);
    if (err) {
        goto out;
    }

    table = qemu_mallocz(capacity * sizeof(*table));

    /* calculate a vdi id with hash function */
    hash = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
    first_vid = hash & (SD_NR_VDIS - 1);

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        error_report("failed to connect");
        goto out;
    }

    for (vid = first_vid; count < capacity; vid = (vid + 1) % SD_NR_VDIS) {
        if (!test_bit(vid, vdi_inuse)) {
            break;
        }

        /* we don't need to read entire object */
        err = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
                          0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0);
        if (err) {
            continue;
        }

        if (strcmp(inode.name, s->name) != 0 || !is_snapshot(&inode)) {
            continue;
        }

        entry = &table[count];
        /* snap_ctime: seconds in the high word, nanoseconds in the low */
        entry->date_sec = inode.snap_ctime >> 32;
        entry->date_nsec = inode.snap_ctime & 0xffffffff;
        entry->vm_state_size = inode.vm_state_size;
        entry->vm_clock_nsec = inode.vm_clock_nsec;

        snprintf(entry->id_str, sizeof(entry->id_str), "%u",
                 inode.snap_id);
        strncpy(entry->name, inode.tag,
                MIN(sizeof(entry->name), sizeof(inode.tag)));
        count++;
    }

    closesocket(fd);
out:
    *psn_tab = table;

    qemu_free(vdi_inuse);

    return count;
}
1970

    
1971
/*
 * Transfer `size' bytes of VM state between `data' and the vmstate
 * objects of the current vdi, starting at byte position `pos'.
 * `load' selects the direction: non-zero reads into `data', zero writes
 * from it.
 *
 * The range is split at vmstate-object boundaries; when saving, an
 * object is created whenever a chunk starts at offset 0 of that object.
 *
 * Returns the total number of bytes transferred, or -EIO on failure.
 */
static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
                                int64_t pos, int size, int load)
{
    int fd, create;
    int ret = 0;
    int remaining = size;
    int transferred = 0;
    unsigned int data_len;
    uint64_t vmstate_oid;
    uint32_t vdi_index;
    uint64_t offset;

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        /* no socket to close, so bypass the cleanup label */
        return -EIO;
    }

    while (remaining > 0) {
        vdi_index = pos / SD_DATA_OBJ_SIZE;
        offset = pos % SD_DATA_OBJ_SIZE;

        /* stay inside the current object even when pos is unaligned */
        data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);

        vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);

        create = (offset == 0);
        if (load) {
            ret = read_object(fd, (char *)data, vmstate_oid,
                              s->inode.nr_copies, data_len, offset);
        } else {
            ret = write_object(fd, (char *)data, vmstate_oid,
                               s->inode.nr_copies, data_len, offset, create);
        }

        if (ret < 0) {
            error_report("failed to %s vmstate %s", load ? "load" : "save",
                         strerror(errno));
            ret = -EIO;
            goto cleanup;
        }

        /* advance both the image position and the caller's buffer */
        pos += data_len;
        data += data_len;
        remaining -= data_len;
        transferred += data_len;
    }

    ret = transferred;
cleanup:
    closesocket(fd);
    return ret;
}
2018

    
2019
/*
 * Save `size' bytes of VM state from `data' at position `pos'.
 * Forwards to do_load_save_vmstate() with load == 0.
 */
static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
                           int64_t pos, int size)
{
    BDRVSheepdogState *state = bs->opaque;
    /* the shared helper takes a non-const buffer for both directions */
    uint8_t *src = (uint8_t *)data;

    return do_load_save_vmstate(state, src, pos, size, 0);
}
2026

    
2027
/*
 * Load `size' bytes of VM state into `data' from position `pos'.
 * Forwards to do_load_save_vmstate() with load == 1.
 */
static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
                           int64_t pos, int size)
{
    BDRVSheepdogState *state = bs->opaque;
    const int load = 1;

    return do_load_save_vmstate(state, data, pos, size, load);
}
2034

    
2035

    
2036
static QEMUOptionParameter sd_create_options[] = {
2037
    {
2038
        .name = BLOCK_OPT_SIZE,
2039
        .type = OPT_SIZE,
2040
        .help = "Virtual disk size"
2041
    },
2042
    {
2043
        .name = BLOCK_OPT_BACKING_FILE,
2044
        .type = OPT_STRING,
2045
        .help = "File name of a base image"
2046
    },
2047
    {
2048
        .name = BLOCK_OPT_PREALLOC,
2049
        .type = OPT_STRING,
2050
        .help = "Preallocation mode (allowed values: off, full)"
2051
    },
2052
    { NULL }
2053
};
2054

    
2055
BlockDriver bdrv_sheepdog = {
2056
    .format_name    = "sheepdog",
2057
    .protocol_name  = "sheepdog",
2058
    .instance_size  = sizeof(BDRVSheepdogState),
2059
    .bdrv_file_open = sd_open,
2060
    .bdrv_close     = sd_close,
2061
    .bdrv_create    = sd_create,
2062
    .bdrv_getlength = sd_getlength,
2063
    .bdrv_truncate  = sd_truncate,
2064

    
2065
    .bdrv_aio_readv     = sd_aio_readv,
2066
    .bdrv_aio_writev    = sd_aio_writev,
2067

    
2068
    .bdrv_snapshot_create   = sd_snapshot_create,
2069
    .bdrv_snapshot_goto     = sd_snapshot_goto,
2070
    .bdrv_snapshot_delete   = sd_snapshot_delete,
2071
    .bdrv_snapshot_list     = sd_snapshot_list,
2072

    
2073
    .bdrv_save_vmstate  = sd_save_vmstate,
2074
    .bdrv_load_vmstate  = sd_load_vmstate,
2075

    
2076
    .create_options = sd_create_options,
2077
};
2078

    
2079
static void bdrv_sheepdog_init(void)
2080
{
2081
    bdrv_register(&bdrv_sheepdog);
2082
}
2083
block_init(bdrv_sheepdog_init);