Revision 33b1db1c

b/Makefile.objs
14 14

  
15 15
block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
16 16
block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
17
block-nested-y += parallels.o nbd.o blkdebug.o
17
block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o
18 18
block-nested-$(CONFIG_WIN32) += raw-win32.o
19 19
block-nested-$(CONFIG_POSIX) += raw-posix.o
20 20
block-nested-$(CONFIG_CURL) += curl.o
b/block/sheepdog.c
1
/*
2
 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3
 *
4
 * This program is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU General Public License version
6
 * 2 as published by the Free Software Foundation.
7
 *
8
 * You should have received a copy of the GNU General Public License
9
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
10
 */
11
#ifdef _WIN32
12
#include <windows.h>
13
#include <winsock2.h>
14
#include <ws2tcpip.h>
15
#else
16
#include <netdb.h>
17
#include <netinet/tcp.h>
18

  
19
#define closesocket(s) close(s)
20
#endif
21

  
22
#include "qemu-common.h"
23
#include "qemu-error.h"
24
#include "qemu_socket.h"
25
#include "block_int.h"
26

  
27
#define SD_PROTO_VER 0x01
28

  
29
#define SD_DEFAULT_ADDR "localhost"
30
#define SD_DEFAULT_PORT "7000"
31

  
32
#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
33
#define SD_OP_READ_OBJ       0x02
34
#define SD_OP_WRITE_OBJ      0x03
35

  
36
#define SD_OP_NEW_VDI        0x11
37
#define SD_OP_LOCK_VDI       0x12
38
#define SD_OP_RELEASE_VDI    0x13
39
#define SD_OP_GET_VDI_INFO   0x14
40
#define SD_OP_READ_VDIS      0x15
41

  
42
#define SD_FLAG_CMD_WRITE    0x01
43
#define SD_FLAG_CMD_COW      0x02
44

  
45
#define SD_RES_SUCCESS       0x00 /* Success */
46
#define SD_RES_UNKNOWN       0x01 /* Unknown error */
47
#define SD_RES_NO_OBJ        0x02 /* No object found */
48
#define SD_RES_EIO           0x03 /* I/O error */
49
#define SD_RES_VDI_EXIST     0x04 /* Vdi exists already */
50
#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
51
#define SD_RES_SYSTEM_ERROR  0x06 /* System error */
52
#define SD_RES_VDI_LOCKED    0x07 /* Vdi is locked */
53
#define SD_RES_NO_VDI        0x08 /* No vdi found */
54
#define SD_RES_NO_BASE_VDI   0x09 /* No base vdi found */
55
#define SD_RES_VDI_READ      0x0A /* Cannot read requested vdi */
56
#define SD_RES_VDI_WRITE     0x0B /* Cannot write requested vdi */
57
#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
58
#define SD_RES_BASE_VDI_WRITE   0x0D /* Cannot write base vdi */
59
#define SD_RES_NO_TAG        0x0E /* Requested tag is not found */
60
#define SD_RES_STARTUP       0x0F /* Sheepdog is on starting up */
61
#define SD_RES_VDI_NOT_LOCKED   0x10 /* Vdi is not locked */
62
#define SD_RES_SHUTDOWN      0x11 /* Sheepdog is shutting down */
63
#define SD_RES_NO_MEM        0x12 /* Cannot allocate memory */
64
#define SD_RES_FULL_VDI      0x13 /* we already have the maximum vdis */
65
#define SD_RES_VER_MISMATCH  0x14 /* Protocol version mismatch */
66
#define SD_RES_NO_SPACE      0x15 /* Server has no room for new objects */
67
#define SD_RES_WAIT_FOR_FORMAT  0x16 /* Waiting for a format operation */
68
#define SD_RES_WAIT_FOR_JOIN    0x17 /* Waiting for other nodes joining */
69
#define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */
70

  
71
/*
72
 * Object ID rules
73
 *
74
 *  0 - 19 (20 bits): data object space
75
 * 20 - 31 (12 bits): reserved data object space
76
 * 32 - 55 (24 bits): vdi object space
77
 * 56 - 59 ( 4 bits): reserved vdi object space
78
 * 60 - 63 ( 4 bits): object type identifier space
79
 */
80

  
81
#define VDI_SPACE_SHIFT   32
82
#define VDI_BIT (UINT64_C(1) << 63)
83
#define VMSTATE_BIT (UINT64_C(1) << 62)
84
#define MAX_DATA_OBJS (UINT64_C(1) << 20)
85
#define MAX_CHILDREN 1024
86
#define SD_MAX_VDI_LEN 256
87
#define SD_MAX_VDI_TAG_LEN 256
88
#define SD_NR_VDIS   (1U << 24)
89
#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
90
#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
91
#define SECTOR_SIZE 512
92

  
93
#define SD_INODE_SIZE (sizeof(SheepdogInode))
94
#define CURRENT_VDI_ID 0
95

  
96
/*
 * Generic request header.
 *
 * SheepdogObjReq and SheepdogVdiReq start with the same six fields
 * (proto_ver .. data_length) and replace opcode_specific[] with their own
 * members; find_vdi_name() casts a SheepdogVdiReq to this type before
 * passing it to do_req().
 */
typedef struct SheepdogReq {
    uint8_t proto_ver;           /* find_vdi_name() sets SD_PROTO_VER */
    uint8_t opcode;              /* one of the SD_OP_* codes */
    uint16_t flags;              /* SD_FLAG_CMD_* bits */
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;        /* payload length in bytes */
    uint32_t opcode_specific[8]; /* overlaid by the typed request structs */
} SheepdogReq;
105

  
106
/*
 * Generic response header.
 *
 * Shares its first six fields with SheepdogReq; the first opcode-specific
 * word of the request is replaced by `result'.
 */
typedef struct SheepdogRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;             /* one of the SD_RES_* codes */
    uint32_t opcode_specific[7];
} SheepdogRsp;
116

  
117
/*
 * Request header for object reads and writes; filled in by
 * add_aio_request().
 */
typedef struct SheepdogObjReq {
    uint8_t proto_ver;
    uint8_t opcode;        /* SD_OP_READ_OBJ, SD_OP_WRITE_OBJ or
                              SD_OP_CREATE_AND_WRITE_OBJ */
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;           /* add_aio_request() copies AIOReq.id here */
    uint32_t data_length;  /* add_aio_request() stores AIOReq.data_len */
    uint64_t oid;          /* target object id */
    uint64_t cow_oid;      /* add_aio_request() stores AIOReq.base_oid */
    uint32_t copies;       /* add_aio_request() stores inode.nr_copies */
    uint32_t rsvd;
    uint64_t offset;       /* add_aio_request() stores AIOReq.offset */
} SheepdogObjReq;
130

  
131
/*
 * Response header for object requests; this is what aio_read_response()
 * reads from the socket.
 */
typedef struct SheepdogObjRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;           /* matched against AIOReq.id */
    uint32_t data_length;  /* for reads: number of payload bytes that follow */
    uint32_t result;       /* SD_RES_SUCCESS or another SD_RES_* code */
    uint32_t copies;
    uint32_t pad[6];
} SheepdogObjRsp;
142

  
143
/*
 * Request header for VDI operations (SD_OP_*_VDI*); see find_vdi_name()
 * for the lookup/lock use.
 */
typedef struct SheepdogVdiReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t vdi_size;
    uint32_t base_vdi_id;
    uint32_t copies;
    uint32_t snapid;       /* find_vdi_name() passes CURRENT_VDI_ID (0) or a
                              snapshot id parsed from the filename */
    uint32_t pad[3];
} SheepdogVdiReq;
156

  
157
/*
 * Response header for VDI operations.  find_vdi_name() reads it back in
 * place over the SheepdogVdiReq it sent.
 */
typedef struct SheepdogVdiRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;       /* SD_RES_* code */
    uint32_t rsvd;
    uint32_t vdi_id;       /* id of the VDI that was looked up */
    uint32_t pad[5];
} SheepdogVdiRsp;
169

  
170
/*
 * VDI metadata ("vdi object").  SD_INODE_SIZE is sizeof this struct, so
 * the field order and sizes must not change.
 */
typedef struct SheepdogInode {
    char name[SD_MAX_VDI_LEN];
    char tag[SD_MAX_VDI_TAG_LEN];
    uint64_t ctime;
    uint64_t snap_ctime;        /* non-zero means snapshot, see is_snapshot() */
    uint64_t vm_clock_nsec;
    uint64_t vdi_size;
    uint64_t vm_state_size;
    uint16_t copy_policy;
    uint8_t nr_copies;          /* sent as `copies' in object requests */
    uint8_t block_size_shift;
    uint32_t snap_id;
    uint32_t vdi_id;
    uint32_t parent_vdi_id;
    uint32_t child_vdi_id[MAX_CHILDREN];
    /*
     * Indexed by data object index.  An entry equal to vdi_id marks the
     * object as writeable by this VDI (see is_data_obj_writeable()).
     */
    uint32_t data_vdi_id[MAX_DATA_OBJS];
} SheepdogInode;
187

  
188
/*
189
 * 64 bit FNV-1a non-zero initial basis
190
 */
191
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
192

  
193
/*
194
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
195
 */
196
/*
 * Fold `len' bytes starting at `buf' into the 64 bit FNV-1a hash `hval'
 * and return the updated hash.  Pass FNV1A_64_INIT as `hval' to start a
 * new hash, or a previous return value to continue one.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bytes = buf;
    size_t i;

    for (i = 0; i < len; i++) {
        hval ^= (uint64_t)bytes[i];
        /*
         * Multiply by the 64 bit FNV prime:
         * 0x100000001b3 == 1 + 2^1 + 2^4 + 2^5 + 2^7 + 2^8 + 2^40
         */
        hval *= UINT64_C(0x100000001b3);
    }

    return hval;
}
207

  
208
/*
 * Return non-zero when data object `idx' is recorded in `inode' as
 * belonging to the inode's own vdi_id.
 */
static inline int is_data_obj_writeable(SheepdogInode *inode, unsigned int idx)
{
    uint32_t owner = inode->data_vdi_id[idx];

    return owner == inode->vdi_id;
}
212

  
213
static inline int is_data_obj(uint64_t oid)
214
{
215
    return !(VDI_BIT & oid);
216
}
217

  
218
static inline uint64_t data_oid_to_idx(uint64_t oid)
219
{
220
    return oid & (MAX_DATA_OBJS - 1);
221
}
222

  
223
static inline uint64_t vid_to_vdi_oid(uint32_t vid)
224
{
225
    return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
226
}
227

  
228
static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
229
{
230
    return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
231
}
232

  
233
static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
234
{
235
    return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
236
}
237

  
238
static inline int is_snapshot(struct SheepdogInode *inode)
239
{
240
    return !!inode->snap_ctime;
241
}
242

  
243
#undef dprintf
244
#ifdef DEBUG_SDOG
245
#define dprintf(fmt, args...)                                       \
246
    do {                                                            \
247
        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
248
    } while (0)
249
#else
250
#define dprintf(fmt, args...)
251
#endif
252

  
253
typedef struct SheepdogAIOCB SheepdogAIOCB;
254

  
255
/*
 * One request to a single object.  A SheepdogAIOCB owns a list of these
 * (aioreq_siblings) and every live request is also linked into
 * BDRVSheepdogState.outstanding_aio_head (outstanding_aio_siblings).
 */
typedef struct AIOReq {
    SheepdogAIOCB *aiocb;       /* owning AIOCB */
    unsigned int iov_offset;    /* byte offset into aiocb->qiov for this part */

    uint64_t oid;               /* target object id */
    uint64_t base_oid;          /* sent as cow_oid by add_aio_request() */
    uint64_t offset;            /* sent as the request's `offset' field */
    unsigned int data_len;      /* sent as the request's data_length */
    uint8_t flags;              /* OR-ed into the request flags */
    uint32_t id;                /* from aioreq_seq_num; matches responses */

    QLIST_ENTRY(AIOReq) outstanding_aio_siblings;
    QLIST_ENTRY(AIOReq) aioreq_siblings;
} AIOReq;
269

  
270
/* Kind of operation an AIOCB carries; selects the opcode and response handling. */
enum AIOCBState {
    AIOCB_WRITE_UDATA,  /* write: payload is sent with the request */
    AIOCB_READ_UDATA,   /* read: payload is read after the response header */
};
274

  
275
/*
 * Per-I/O control block.  `common' must stay the first member because
 * sd_aio_cancel() casts a BlockDriverAIOCB pointer to SheepdogAIOCB.
 */
struct SheepdogAIOCB {
    BlockDriverAIOCB common;

    QEMUIOVector *qiov;                     /* caller's scatter/gather buffer */

    int64_t sector_num;
    int nb_sectors;

    int ret;                                /* passed to common.cb on completion;
                                               0 or -EIO */
    enum AIOCBState aiocb_type;

    QEMUBH *bh;                             /* set by sd_schedule_bh() */
    void (*aio_done_func)(SheepdogAIOCB *); /* called by aio_read_response() when
                                               the last AIOReq has completed */

    int canceled;                           /* set by sd_aio_cancel(); suppresses
                                               the callback in sd_finish_aiocb() */

    QLIST_HEAD(aioreq_head, AIOReq) aioreq_head;
};
293

  
294
/* Driver state for one open Sheepdog VDI. */
typedef struct BDRVSheepdogState {
    SheepdogInode inode;            /* in-memory copy of the vdi object */

    /*
     * Range of data_vdi_id[] indexes changed by aio_read_response() when a
     * write response arrives for an object not yet owned by this VDI.
     */
    uint32_t min_dirty_data_idx;
    uint32_t max_dirty_data_idx;

    char name[SD_MAX_VDI_LEN];
    int is_snapshot;

    /*
     * Set by parse_vdiname(): either both NULL (connect_to_sdog() then uses
     * the defaults) or both pointing into one heap copy of the filename.
     */
    char *addr;
    char *port;
    int fd;                         /* socket used for object I/O */

    uint32_t aioreq_seq_num;        /* next AIOReq.id */
    QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
} BDRVSheepdogState;
310

  
311
static const char * sd_strerror(int err)
312
{
313
    int i;
314

  
315
    static const struct {
316
        int err;
317
        const char *desc;
318
    } errors[] = {
319
        {SD_RES_SUCCESS, "Success"},
320
        {SD_RES_UNKNOWN, "Unknown error"},
321
        {SD_RES_NO_OBJ, "No object found"},
322
        {SD_RES_EIO, "I/O error"},
323
        {SD_RES_VDI_EXIST, "VDI exists already"},
324
        {SD_RES_INVALID_PARMS, "Invalid parameters"},
325
        {SD_RES_SYSTEM_ERROR, "System error"},
326
        {SD_RES_VDI_LOCKED, "VDI is already locked"},
327
        {SD_RES_NO_VDI, "No vdi found"},
328
        {SD_RES_NO_BASE_VDI, "No base VDI found"},
329
        {SD_RES_VDI_READ, "Failed read the requested VDI"},
330
        {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
331
        {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
332
        {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
333
        {SD_RES_NO_TAG, "Failed to find the requested tag"},
334
        {SD_RES_STARTUP, "The system is still booting"},
335
        {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
336
        {SD_RES_SHUTDOWN, "The system is shutting down"},
337
        {SD_RES_NO_MEM, "Out of memory on the server"},
338
        {SD_RES_FULL_VDI, "We already have the maximum vdis"},
339
        {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
340
        {SD_RES_NO_SPACE, "Server has no space for new objects"},
341
        {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
342
        {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
343
        {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
344
    };
345

  
346
    for (i = 0; i < ARRAY_SIZE(errors); ++i) {
347
        if (errors[i].err == err) {
348
            return errors[i].desc;
349
        }
350
    }
351

  
352
    return "Invalid error code";
353
}
354

  
355
/*
356
 * Sheepdog I/O handling:
357
 *
358
 * 1. In the sd_aio_readv/writev, read/write requests are added to the
359
 *    QEMU Bottom Halves.
360
 *
361
 * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O
362
 *    requests to the server and link the requests to the
363
 *    outstanding_list in the BDRVSheepdogState.  We exit the
364
 *    function without waiting for the response.
365
 *
366
 * 3. We receive the response in aio_read_response, the fd handler to
367
 *    the sheepdog connection.  If metadata update is needed, we send
368
 *    the write request to the vdi object in sd_write_done, the write
369
 *    completion function.  The AIOCB callback is not called until all
370
 *    the requests belonging to the AIOCB are finished.
371
 */
372

  
373
/*
 * Allocate an AIOReq describing one object request of `acb', give it the
 * next sequence number of `s', and link it both into the state's
 * outstanding list and into the AIOCB's own request list.
 */
static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
                                    uint64_t oid, unsigned int data_len,
                                    uint64_t offset, uint8_t flags,
                                    uint64_t base_oid, unsigned int iov_offset)
{
    AIOReq *req = qemu_malloc(sizeof(*req));

    /* identity */
    req->aiocb = acb;
    req->id = s->aioreq_seq_num++;

    /* target object and range */
    req->oid = oid;
    req->base_oid = base_oid;
    req->offset = offset;
    req->data_len = data_len;
    req->flags = flags;

    /* position of this part inside the caller's iovec */
    req->iov_offset = iov_offset;

    QLIST_INSERT_HEAD(&s->outstanding_aio_head, req,
                      outstanding_aio_siblings);
    QLIST_INSERT_HEAD(&acb->aioreq_head, req, aioreq_siblings);

    return req;
}
396

  
397
/*
 * Unlink `aio_req' from both lists and free it.
 *
 * Returns non-zero while the owning AIOCB still has other requests, and
 * 0 once this was its last one.
 */
static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
{
    SheepdogAIOCB *owner = aio_req->aiocb;

    QLIST_REMOVE(aio_req, outstanding_aio_siblings);
    QLIST_REMOVE(aio_req, aioreq_siblings);
    qemu_free(aio_req);

    if (QLIST_EMPTY(&owner->aioreq_head)) {
        return 0;
    }
    return 1;
}
406

  
407
/*
 * Complete `acb': invoke the caller's completion callback with acb->ret,
 * unless the request was canceled, then release the AIOCB.
 */
static void sd_finish_aiocb(SheepdogAIOCB *acb)
{
    if (acb->canceled) {
        /* sd_aio_cancel() already invoked the callback */
        qemu_aio_release(acb);
        return;
    }

    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_aio_release(acb);
}
414

  
415
/*
 * AIOPool cancel hook.
 *
 * Requests already sent to the servers cannot be withdrawn, so the
 * caller's callback is invoked with -EIO right away and the AIOCB is
 * flagged so that sd_finish_aiocb() does not call it a second time.
 */
static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
{
    SheepdogAIOCB *sd_acb = (SheepdogAIOCB *)blockacb;
    BlockDriverCompletionFunc *complete = sd_acb->common.cb;
    void *cb_opaque = sd_acb->common.opaque;

    complete(cb_opaque, -EIO);
    sd_acb->canceled = 1;
}
426

  
427
/*
 * AIOCB pool for this driver: qemu_aio_get() in sd_aio_setup() hands out
 * SheepdogAIOCB-sized control blocks from it, and sd_aio_cancel() is the
 * cancel hook.
 */
static AIOPool sd_aio_pool = {
    .aiocb_size = sizeof(SheepdogAIOCB),
    .cancel = sd_aio_cancel,
};
431

  
432
static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
433
                                   int64_t sector_num, int nb_sectors,
434
                                   BlockDriverCompletionFunc *cb, void *opaque)
435
{
436
    SheepdogAIOCB *acb;
437

  
438
    acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
439

  
440
    acb->qiov = qiov;
441

  
442
    acb->sector_num = sector_num;
443
    acb->nb_sectors = nb_sectors;
444

  
445
    acb->aio_done_func = NULL;
446
    acb->canceled = 0;
447
    acb->bh = NULL;
448
    acb->ret = 0;
449
    QLIST_INIT(&acb->aioreq_head);
450
    return acb;
451
}
452

  
453
/*
 * Create a bottom half running `cb' with `acb' as its argument and
 * schedule it.
 *
 * Returns 0 on success, or -EIO if the AIOCB already has a bottom half
 * or a new one could not be created.
 */
static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
{
    QEMUBH *bh;

    if (acb->bh != NULL) {
        error_report("bug: %d %d\n", acb->aiocb_type, acb->aiocb_type);
        return -EIO;
    }

    bh = qemu_bh_new(cb, acb);
    acb->bh = bh;
    if (bh == NULL) {
        error_report("oom: %d %d\n", acb->aiocb_type, acb->aiocb_type);
        return -EIO;
    }

    qemu_bh_schedule(bh);
    return 0;
}
470

  
471
#ifdef _WIN32
472

  
473
/*
 * Win32 only: minimal stand-in for struct msghdr, carrying just the two
 * members that do_send_recv() and the sendmsg()/recvmsg() emulations
 * below use.
 */
struct msghdr {
    struct iovec *msg_iov;      /* first iovec element to transfer */
    size_t        msg_iovlen;   /* number of iovec elements */
};
477

  
478
static ssize_t sendmsg(int s, const struct msghdr *msg, int flags)
479
{
480
    size_t size = 0;
481
    char *buf, *p;
482
    int i, ret;
483

  
484
    /* count the msg size */
485
    for (i = 0; i < msg->msg_iovlen; i++) {
486
        size += msg->msg_iov[i].iov_len;
487
    }
488
    buf = qemu_malloc(size);
489

  
490
    p = buf;
491
    for (i = 0; i < msg->msg_iovlen; i++) {
492
        memcpy(p, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len);
493
        p += msg->msg_iov[i].iov_len;
494
    }
495

  
496
    ret = send(s, buf, size, flags);
497

  
498
    qemu_free(buf);
499
    return ret;
500
}
501

  
502
static ssize_t recvmsg(int s, struct msghdr *msg, int flags)
503
{
504
    size_t size = 0;
505
    char *buf, *p;
506
    int i, ret;
507

  
508
    /* count the msg size */
509
    for (i = 0; i < msg->msg_iovlen; i++) {
510
        size += msg->msg_iov[i].iov_len;
511
    }
512
    buf = qemu_malloc(size);
513

  
514
    ret = recv(s, buf, size, flags);
515
    if (ret < 0) {
516
        goto out;
517
    }
518

  
519
    p = buf;
520
    for (i = 0; i < msg->msg_iovlen; i++) {
521
        memcpy(msg->msg_iov[i].iov_base, p, msg->msg_iov[i].iov_len);
522
        p += msg->msg_iov[i].iov_len;
523
    }
524
out:
525
    qemu_free(buf);
526
    return ret;
527
}
528

  
529
#endif
530

  
531
/*
532
 * Send/recv data with iovec buffers
533
 *
534
 * This function send/recv data from/to the iovec buffer directly.
535
 * The first `offset' bytes in the iovec buffer are skipped and next
536
 * `len' bytes are used.
537
 *
538
 * For example,
539
 *
540
 *   do_send_recv(sockfd, iov, len, offset, 1);
541
 *
542
 * is equivalent to
543
 *
544
 *   char *buf = malloc(size);
545
 *   iov_to_buf(iov, iovcnt, buf, offset, size);
546
 *   send(sockfd, buf, size, 0);
547
 *   free(buf);
548
 */
549
/*
 * Perform ONE sendmsg()/recvmsg() call on the byte range
 * [offset, offset + len) of the iovec array `iov'.
 *
 * The iovec is temporarily patched in place (last element shortened,
 * first element advanced) and restored before returning, so it must be
 * writable and must cover at least offset + len bytes.
 *
 * Returns whatever sendmsg()/recvmsg() returned: the number of bytes
 * transferred (possibly fewer than `len') or a negative value on error.
 */
static int do_send_recv(int sockfd, struct iovec *iov, int len, int offset,
                        int write)
{
    struct msghdr msg;
    int ret, diff;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;

    /* from here on `len' is the END of the range, counted from iov[0] */
    len += offset;

    /* walk `iov' forward to the element that contains the end of the range */
    while (iov->iov_len < len) {
        len -= iov->iov_len;

        iov++;
        msg.msg_iovlen++;
    }

    /* shorten that last element so the message stops at the range end */
    diff = iov->iov_len - len;
    iov->iov_len -= diff;

    /* drop leading elements that lie entirely before `offset' */
    while (msg.msg_iov->iov_len <= offset) {
        offset -= msg.msg_iov->iov_len;

        msg.msg_iov++;
        msg.msg_iovlen--;
    }

    /* advance into the first remaining element by the leftover offset */
    msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base + offset;
    msg.msg_iov->iov_len -= offset;

    if (write) {
        ret = sendmsg(sockfd, &msg, 0);
    } else {
        ret = recvmsg(sockfd, &msg, 0);
    }

    /* undo the in-place patches so the caller's iovec is unchanged */
    msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base - offset;
    msg.msg_iov->iov_len += offset;

    iov->iov_len += diff;
    return ret;
}
593

  
594
/*
 * Open a TCP connection to a Sheepdog server.
 *
 * `addr' and `port' are passed to getaddrinfo().  If `addr' is NULL,
 * SD_DEFAULT_ADDR and SD_DEFAULT_PORT are used instead (and `port' is
 * ignored).
 *
 * Every address returned by the resolver is tried in turn until one
 * connects.  A socket whose connect() fails is closed before the next
 * address is tried, so no descriptor is leaked.
 *
 * Returns the connected socket, or -1 on failure (an error is reported).
 */
static int connect_to_sdog(const char *addr, const char *port)
{
    char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
    int fd, ret, saved_errno;
    struct addrinfo hints, *res, *res0;

    if (!addr) {
        addr = SD_DEFAULT_ADDR;
        port = SD_DEFAULT_PORT;
    }

    memset(&hints, 0, sizeof(hints));
    hints.ai_socktype = SOCK_STREAM;

    ret = getaddrinfo(addr, port, &hints, &res0);
    if (ret) {
        /* getaddrinfo() reports through its return value, not errno */
        error_report("unable to get address info %s, %s\n",
                     addr, gai_strerror(ret));
        return -1;
    }

    for (res = res0; res; res = res->ai_next) {
        ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
                          sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
        if (ret) {
            continue;
        }

        fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
        if (fd < 0) {
            continue;
        }

        /* retry connect() for as long as it is interrupted by a signal */
        do {
            ret = connect(fd, res->ai_addr, res->ai_addrlen);
        } while (ret < 0 && errno == EINTR);

        if (ret < 0) {
            /* keep connect()'s errno for callers that report it */
            saved_errno = errno;
            closesocket(fd);
            errno = saved_errno;
            continue;
        }

        dprintf("connected to %s:%s\n", addr, port);
        goto success;
    }
    fd = -1;
    error_report("failed connect to %s:%s\n", addr, port);
success:
    freeaddrinfo(res0);
    return fd;
}
645

  
646
/*
 * Transfer exactly `len' bytes between `sockfd' and the iovec `iov',
 * starting `iov_offset' bytes into the iovec.  `write' selects the
 * direction (non-zero: send, zero: receive).
 *
 * Short transfers are continued, and EINTR/EAGAIN are retried
 * immediately.
 *
 * Returns 0 on success and 1 on failure.  On failure errno describes the
 * error; if the peer closed the connection before all bytes were
 * transferred, errno is set to EIO.
 */
static int do_readv_writev(int sockfd, struct iovec *iov, int len,
                           int iov_offset, int write)
{
    int ret;
again:
    ret = do_send_recv(sockfd, iov, len, iov_offset, write);
    if (ret < 0) {
        if (errno == EINTR || errno == EAGAIN) {
            goto again;
        }
        if (write) {
            error_report("failed to send a req, %s\n", strerror(errno));
        } else {
            error_report("failed to recv a rsp, %s\n", strerror(errno));
        }
        return 1;
    }

    if (ret == 0 && len > 0) {
        /*
         * No progress although data is still outstanding: the peer has
         * closed the connection.  Retrying would loop forever.
         */
        errno = EIO;
        error_report("connection closed with %d bytes outstanding\n", len);
        return 1;
    }

    iov_offset += ret;
    len -= ret;
    if (len) {
        goto again;
    }

    return 0;
}
668

  
669
/*
 * Receive exactly `len' bytes from `sockfd' into `iov', starting
 * `iov_offset' bytes into the iovec.  Returns 0 on success, 1 on failure.
 */
static int do_readv(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 0;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
673

  
674
/*
 * Send exactly `len' bytes from `iov' to `sockfd', starting `iov_offset'
 * bytes into the iovec.  Returns 0 on success, 1 on failure.
 */
static int do_writev(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 1;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
678

  
679
static int do_read_write(int sockfd, void *buf, int len, int write)
680
{
681
    struct iovec iov;
682

  
683
    iov.iov_base = buf;
684
    iov.iov_len = len;
685

  
686
    return do_readv_writev(sockfd, &iov, len, 0, write);
687
}
688

  
689
/* Receive exactly `len' bytes into `buf'.  Returns 0 on success, 1 on failure. */
static int do_read(int sockfd, void *buf, int len)
{
    const int is_write = 0;

    return do_read_write(sockfd, buf, len, is_write);
}
693

  
694
/* Send exactly `len' bytes from `buf'.  Returns 0 on success, 1 on failure. */
static int do_write(int sockfd, void *buf, int len)
{
    const int is_write = 1;

    return do_read_write(sockfd, buf, len, is_write);
}
698

  
699
/*
 * Send the request header `hdr' followed by `*wlen' bytes of `data'
 * (if `*wlen' is non-zero) as one vectored write.
 *
 * Returns 0 on success and -1 on failure.
 */
static int send_req(int sockfd, SheepdogReq *hdr, void *data,
                    unsigned int *wlen)
{
    struct iovec parts[2];
    int failed;

    parts[0].iov_base = hdr;
    parts[0].iov_len = sizeof(*hdr);

    /* the payload element is only set up when there is a payload */
    if (*wlen != 0) {
        parts[1].iov_base = data;
        parts[1].iov_len = *wlen;
    }

    failed = do_writev(sockfd, parts, sizeof(*hdr) + *wlen, 0);
    if (!failed) {
        return 0;
    }

    error_report("failed to send a req, %s\n", strerror(errno));
    return -1;
}
721

  
722
/*
 * Synchronous request/response round trip.
 *
 * Sends `hdr' plus `*wlen' bytes of `data', then reads the response
 * header back INTO `hdr' (overwriting the request) and up to `*rlen'
 * bytes of response payload into `data'.  `*rlen' is lowered to the
 * response's data_length when that is smaller.
 *
 * Returns 0 on success and -1 on any send/receive failure.  The protocol
 * level result code inside the response is not checked here.
 */
static int do_req(int sockfd, SheepdogReq *hdr, void *data,
                  unsigned int *wlen, unsigned int *rlen)
{
    if (send_req(sockfd, hdr, data, wlen)) {
        return -1;
    }

    if (do_read(sockfd, hdr, sizeof(*hdr))) {
        error_report("failed to get a rsp, %s\n", strerror(errno));
        return -1;
    }

    /* never read more payload than the server says it is sending */
    if (hdr->data_length < *rlen) {
        *rlen = hdr->data_length;
    }

    if (*rlen && do_read(sockfd, data, *rlen)) {
        error_report("failed to get the data, %s\n", strerror(errno));
        return -1;
    }

    return 0;
}
756

  
757
static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
758
                           struct iovec *iov, int niov, int create,
759
                           enum AIOCBState aiocb_type);
760

  
761
/*
762
 * This function searches pending requests to the object `oid', and
763
 * sends them.
764
 */
765
/*
 * Send every outstanding request that targets object `oid', except the
 * request whose id is `id' (the one whose response is being handled).
 *
 * The requests are sent as plain writes/reads (create == 0).  If sending
 * one fails, the request is dropped and its AIOCB is marked failed with
 * -EIO, so that the caller's callback does not see a success status for
 * I/O that never reached the server.  If it was the AIOCB's last request
 * the AIOCB is completed immediately.
 */
static void send_pending_req(BDRVSheepdogState *s, uint64_t oid, uint32_t id)
{
    AIOReq *aio_req, *next;
    SheepdogAIOCB *acb;
    int ret;

    /* _SAFE variant: the current element may be freed inside the loop */
    QLIST_FOREACH_SAFE(aio_req, &s->outstanding_aio_head,
                       outstanding_aio_siblings, next) {
        if (id == aio_req->id) {
            continue;
        }
        if (aio_req->oid != oid) {
            continue;
        }

        acb = aio_req->aiocb;
        ret = add_aio_request(s, aio_req, acb->qiov->iov,
                              acb->qiov->niov, 0, acb->aiocb_type);
        if (ret < 0) {
            error_report("add_aio_request is failed\n");
            acb->ret = -EIO;
            /* free_aio_req() returns 0 once the AIOCB has no requests left */
            if (!free_aio_req(s, aio_req)) {
                sd_finish_aiocb(acb);
            }
        }
    }
}
792

  
793
/*
794
 * Receive responses of the I/O requests.
795
 *
796
 * This function is registered as a fd handler, and called from the
797
 * main loop when s->fd is ready for reading responses.
798
 */
799
/*
 * fd read handler for s->fd (registered in get_sheep_fd()): consume one
 * response from the socket and complete the AIOReq it belongs to.
 *
 * `opaque' is the BDRVSheepdogState.  Each invocation handles exactly one
 * response header, plus its payload for reads.
 */
static void aio_read_response(void *opaque)
{
    SheepdogObjRsp rsp;
    BDRVSheepdogState *s = opaque;
    int fd = s->fd;
    int ret;
    AIOReq *aio_req = NULL;
    SheepdogAIOCB *acb;
    int rest;
    unsigned long idx;

    /* nothing is waiting for a response, so nothing is read */
    if (QLIST_EMPTY(&s->outstanding_aio_head)) {
        return;
    }

    /* read a header */
    ret = do_read(fd, &rsp, sizeof(rsp));
    if (ret) {
        /*
         * NOTE(review): on this and the other early returns below the
         * outstanding requests are neither completed nor failed.
         */
        error_report("failed to get the header, %s\n", strerror(errno));
        return;
    }

    /* find the right aio_req from the outstanding_aio list */
    QLIST_FOREACH(aio_req, &s->outstanding_aio_head, outstanding_aio_siblings) {
        if (aio_req->id == rsp.id) {
            break;
        }
    }
    /* the loop leaves aio_req NULL when no request matched */
    if (!aio_req) {
        error_report("cannot find aio_req %x\n", rsp.id);
        return;
    }

    acb = aio_req->aiocb;

    switch (acb->aiocb_type) {
    case AIOCB_WRITE_UDATA:
        /* only writes to data objects can change the inode's object map */
        if (!is_data_obj(aio_req->oid)) {
            break;
        }
        idx = data_oid_to_idx(aio_req->oid);

        if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) {
            /*
             * If the object is newly created one, we need to update
             * the vdi object (metadata object).  min_dirty_data_idx
             * and max_dirty_data_idx are changed to include updated
             * index between them.
             *
             * NOTE(review): this update happens before rsp.result is
             * checked below.
             */
            s->inode.data_vdi_id[idx] = s->inode.vdi_id;
            s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
            s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);

            /*
             * Some requests may be blocked because simultaneous
             * create requests are not allowed, so we search the
             * pending requests here.
             */
            send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx), rsp.id);
        }
        break;
    case AIOCB_READ_UDATA:
        /* the payload follows the header; place it at this request's
         * offset inside the caller's iovec */
        ret = do_readv(fd, acb->qiov->iov, rsp.data_length,
                       aio_req->iov_offset);
        if (ret) {
            error_report("failed to get the data, %s\n", strerror(errno));
            return;
        }
        break;
    }

    /* a failure of any sub-request fails the whole AIOCB */
    if (rsp.result != SD_RES_SUCCESS) {
        acb->ret = -EIO;
        error_report("%s\n", sd_strerror(rsp.result));
    }

    /* free_aio_req() returns 0 when this was the AIOCB's last request */
    rest = free_aio_req(s, aio_req);
    if (!rest) {
        /*
         * We've finished all requests which belong to the AIOCB, so
         * we can call the callback now.
         */
        acb->aio_done_func(acb);
    }
}
884

  
885
static int aio_flush_request(void *opaque)
886
{
887
    BDRVSheepdogState *s = opaque;
888

  
889
    return !QLIST_EMPTY(&s->outstanding_aio_head);
890
}
891

  
892
#if defined(_WIN32) || !defined(TCP_CORK)

/*
 * TCP_CORK is not available on this platform, so corking is a no-op;
 * the header and payload are simply sent as they are written.
 * Always returns 0.
 */
static int set_cork(int fd, int v)
{
    return 0;
}

#else

/*
 * Enable (v != 0) or disable (v == 0) TCP_CORK on `fd'.
 *
 * IPPROTO_TCP is used as the option level instead of SOL_TCP because
 * SOL_TCP is not defined on every platform.
 *
 * Returns the setsockopt() result: 0 on success, -1 on failure.
 */
static int set_cork(int fd, int v)
{
    return setsockopt(fd, IPPROTO_TCP, TCP_CORK, &v, sizeof(v));
}

#endif
907

  
908
/*
 * Turn on TCP_NODELAY for `fd'.  Returns the setsockopt() result
 * (0 on success).
 */
static int set_nodelay(int fd)
{
    int enable = 1;

    return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
                      (char *)&enable, sizeof(enable));
}
916

  
917
/*
918
 * Return a socket descriptor to read/write objects.
919
 *
920
 * We cannot use this descriptor for other operations because
921
 * the block driver may be waiting for a response from the server.
922
 */
923
/*
 * Connect to the server named in `s', switch the socket to non-blocking
 * mode with TCP_NODELAY, and register aio_read_response() /
 * aio_flush_request() as its AIO handlers.
 *
 * Returns the socket on success and -1 on failure.
 */
static int get_sheep_fd(BDRVSheepdogState *s)
{
    int sock = connect_to_sdog(s->addr, s->port);

    if (sock < 0) {
        error_report("%s\n", strerror(errno));
        return -1;
    }

    socket_set_nonblock(sock);

    if (set_nodelay(sock) != 0) {
        error_report("%s\n", strerror(errno));
        closesocket(sock);
        return -1;
    }

    qemu_aio_set_fd_handler(sock, aio_read_response, NULL, aio_flush_request,
                            NULL, s);
    return sock;
}
946

  
947
/*
948
 * Parse a filename
949
 *
950
 * filename must be one of the following formats:
951
 *   1. [vdiname]
952
 *   2. [vdiname]:[snapid]
953
 *   3. [vdiname]:[tag]
954
 *   4. [hostname]:[port]:[vdiname]
955
 *   5. [hostname]:[port]:[vdiname]:[snapid]
956
 *   6. [hostname]:[port]:[vdiname]:[tag]
957
 *
958
 * You can boot from the snapshot images by specifying `snapid` or
959
 * `tag'.
960
 *
961
 * You can run VMs outside the Sheepdog cluster by specifying
962
 * `hostname' and `port' (experimental).
963
 */
964
/*
 * Split `filename' into server address, port, vdi name and snapshot
 * id/tag (see the list of accepted formats above).
 *
 * s        - addr/port are set to the parsed host and port, or both to
 *            NULL when the filename carries fewer than two ':'.  When
 *            set, both point into one heap copy of the filename that
 *            starts at s->addr.
 * vdi      - receives the vdi name; assumed to have room for
 *            SD_MAX_VDI_LEN bytes.  Always NUL-terminated; longer names
 *            are truncated.
 * snapid   - receives the numeric snapshot id, or CURRENT_VDI_ID when no
 *            snapshot part is given.
 * tag      - written only when a snapshot part is present and does not
 *            parse as a non-zero number; assumed to have room for
 *            SD_MAX_VDI_TAG_LEN bytes.  Always NUL-terminated when
 *            written.
 *
 * Always returns 0.
 */
static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
                         char *vdi, uint32_t *snapid, char *tag)
{
    char *p, *q;
    int nr_sep;

    p = q = qemu_strdup(filename);

    /* count the number of separators */
    nr_sep = 0;
    while (*p) {
        if (*p == ':') {
            nr_sep++;
        }
        p++;
    }
    p = q;

    /* use the first two tokens as hostname and port number. */
    if (nr_sep >= 2) {
        s->addr = p;
        p = strchr(p, ':');
        *p++ = '\0';

        s->port = p;
        p = strchr(p, ':');
        *p++ = '\0';
    } else {
        s->addr = NULL;
        s->port = NULL;
    }

    /*
     * strncpy() does not terminate the destination when the source fills
     * it completely, so terminate explicitly before searching it.
     */
    strncpy(vdi, p, SD_MAX_VDI_LEN);
    vdi[SD_MAX_VDI_LEN - 1] = '\0';

    p = strchr(vdi, ':');
    if (p) {
        *p++ = '\0';
        *snapid = strtoul(p, NULL, 10);
        if (*snapid == 0) {
            /* not a (non-zero) number: treat the token as a tag */
            strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
            tag[SD_MAX_VDI_TAG_LEN - 1] = '\0';
        }
    } else {
        *snapid = CURRENT_VDI_ID; /* search current vdi */
    }

    /* the copy is only kept alive when s->addr/s->port point into it */
    if (s->addr == NULL) {
        qemu_free(q);
    }

    return 0;
}
1015

  
1016
/*
 * Ask the server for the id of the VDI called `filename' with snapshot
 * id `snapid' / tag `tag', over a short-lived connection of its own.
 *
 * With `for_snapshot' set the request is SD_OP_GET_VDI_INFO, otherwise
 * SD_OP_LOCK_VDI.  On success the id is stored in `*vid'.
 *
 * Returns 0 on success and -1 on failure.
 */
static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
                         char *tag, uint32_t *vid, int for_snapshot)
{
    char payload[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
    SheepdogVdiReq hdr;
    /* the response is read back over the request header */
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
    unsigned int rlen = 0;
    int status = -1;
    int fd;

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        return -1;
    }

    /* payload: fixed-width name field followed by fixed-width tag field */
    memset(payload, 0, sizeof(payload));
    strncpy(payload, filename, SD_MAX_VDI_LEN);
    strncpy(payload + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);

    memset(&hdr, 0, sizeof(hdr));
    hdr.proto_ver = SD_PROTO_VER;
    hdr.opcode = for_snapshot ? SD_OP_GET_VDI_INFO : SD_OP_LOCK_VDI;
    hdr.flags = SD_FLAG_CMD_WRITE;
    hdr.data_length = wlen;
    hdr.snapid = snapid;

    if (do_req(fd, (SheepdogReq *)&hdr, payload, &wlen, &rlen)) {
        goto out;
    }

    if (rsp->result != SD_RES_SUCCESS) {
        error_report("cannot get vdi info, %s, %s %d %s\n",
                     sd_strerror(rsp->result), filename, snapid, tag);
        goto out;
    }

    *vid = rsp->vdi_id;
    status = 0;
out:
    closesocket(fd);
    return status;
}
1065

  
1066
/*
 * Send one object request described by aio_req on the socket s->fd.
 *
 * The opcode is chosen from aiocb_type and create:
 *   - AIOCB_READ_UDATA: SD_OP_READ_OBJ, no payload is sent;
 *   - create != 0:      SD_OP_CREATE_AND_WRITE_OBJ, payload follows;
 *   - otherwise:        SD_OP_WRITE_OBJ, payload follows.
 * For writes, aio_req->data_len bytes taken from iov starting at
 * aio_req->iov_offset are sent after the header.  niov is not used here.
 *
 * The socket is corked around the header and payload writes and is always
 * uncorked again before returning, including on the error paths.
 *
 * Returns 0 on success and -EIO if sending the header or the data fails.
 */
static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
                           struct iovec *iov, int niov, int create,
                           enum AIOCBState aiocb_type)
{
    int nr_copies = s->inode.nr_copies;
    SheepdogObjReq hdr;
    unsigned int wlen;
    int ret;
    uint64_t oid = aio_req->oid;
    unsigned int datalen = aio_req->data_len;
    uint64_t offset = aio_req->offset;
    uint8_t flags = aio_req->flags;
    uint64_t old_oid = aio_req->base_oid;

    if (!nr_copies) {
        error_report("bug\n");
    }

    memset(&hdr, 0, sizeof(hdr));

    if (aiocb_type == AIOCB_READ_UDATA) {
        wlen = 0;
        hdr.opcode = SD_OP_READ_OBJ;
        hdr.flags = flags;
    } else if (create) {
        wlen = datalen;
        hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
        hdr.flags = SD_FLAG_CMD_WRITE | flags;
    } else {
        wlen = datalen;
        hdr.opcode = SD_OP_WRITE_OBJ;
        hdr.flags = SD_FLAG_CMD_WRITE | flags;
    }

    hdr.oid = oid;
    hdr.cow_oid = old_oid;
    hdr.copies = s->inode.nr_copies;

    hdr.data_length = datalen;
    hdr.offset = offset;

    hdr.id = aio_req->id;

    set_cork(s->fd, 1);

    /* send a header */
    ret = do_write(s->fd, &hdr, sizeof(hdr));
    if (ret) {
        /* report before uncorking so errno still belongs to the write */
        error_report("failed to send a req, %s\n", strerror(errno));
        ret = -EIO;
        goto out;
    }

    if (wlen) {
        ret = do_writev(s->fd, iov, wlen, aio_req->iov_offset);
        if (ret) {
            error_report("failed to send a data, %s\n", strerror(errno));
            ret = -EIO;
            goto out;
        }
    }

    ret = 0;
out:
    /* never leave the shared socket corked, even when a send failed */
    set_cork(s->fd, 0);

    return ret;
}
1130

  
1131
static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
1132
                             unsigned int datalen, uint64_t offset,
1133
                             int write, int create)
1134
{
1135
    SheepdogObjReq hdr;
1136
    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1137
    unsigned int wlen, rlen;
1138
    int ret;
1139

  
1140
    memset(&hdr, 0, sizeof(hdr));
1141

  
1142
    if (write) {
1143
        wlen = datalen;
1144
        rlen = 0;
1145
        hdr.flags = SD_FLAG_CMD_WRITE;
1146
        if (create) {
1147
            hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1148
        } else {
1149
            hdr.opcode = SD_OP_WRITE_OBJ;
1150
        }
1151
    } else {
1152
        wlen = 0;
1153
        rlen = datalen;
1154
        hdr.opcode = SD_OP_READ_OBJ;
1155
    }
1156
    hdr.oid = oid;
1157
    hdr.data_length = datalen;
1158
    hdr.offset = offset;
1159
    hdr.copies = copies;
1160

  
1161
    ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1162
    if (ret) {
1163
        error_report("failed to send a request to the sheep\n");
1164
        return -1;
1165
    }
1166

  
1167
    switch (rsp->result) {
1168
    case SD_RES_SUCCESS:
1169
        return 0;
1170
    default:
1171
        error_report("%s\n", sd_strerror(rsp->result));
1172
        return -1;
1173
    }
1174
}
1175

  
1176
/*
 * Read datalen bytes of object oid, starting at offset, into buf.
 * Thin wrapper around read_write_object() with write and create cleared;
 * the return value is passed through unchanged.
 */
static int read_object(int fd, char *buf, uint64_t oid, int copies,
                       unsigned int datalen, uint64_t offset)
{
    const int is_write = 0;
    const int do_create = 0;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, do_create);
}
1181

  
1182
/*
 * Write datalen bytes from buf to object oid at offset.
 * Thin wrapper around read_write_object() with write set; create is
 * forwarded as given.  The return value is passed through unchanged.
 */
static int write_object(int fd, char *buf, uint64_t oid, int copies,
                        unsigned int datalen, uint64_t offset, int create)
{
    const int is_write = 1;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, create);
}
1187

  
1188
/*
 * Open the sheepdog image named by filename (an optional "sheepdog:"
 * prefix is stripped).
 *
 * Steps:
 *   1. parse the name into vdi / snapshot id / tag via parse_vdiname();
 *   2. obtain the long-lived socket s->fd via get_sheep_fd();
 *   3. resolve the vdi id with find_vdi_name() (for_snapshot == 0);
 *   4. read SD_INODE_SIZE bytes of the vdi object over a temporary
 *      connection and copy sizeof(s->inode) of them into s->inode.
 *
 * On success bs->total_sectors and s->name are filled in and 0 is
 * returned.  On any failure s->fd is closed if it was opened and -1 is
 * returned.
 *
 * NOTE(review): s->addr is not released on the failure path here; whether
 * that leaks depends on parse_vdiname() and the caller -- confirm.
 */
static int sd_open(BlockDriverState *bs, const char *filename, int flags)
{
    int ret, fd;
    uint32_t vid = 0;
    BDRVSheepdogState *s = bs->opaque;
    char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
    uint32_t snapid;
    char *buf = NULL;

    strstart(filename, "sheepdog:", (const char **)&filename);

    QLIST_INIT(&s->outstanding_aio_head);
    /* mark the socket as not yet opened for the error path below */
    s->fd = -1;

    memset(vdi, 0, sizeof(vdi));
    memset(tag, 0, sizeof(tag));
    if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
        goto out;
    }
    s->fd = get_sheep_fd(s);
    if (s->fd < 0) {
        goto out;
    }

    ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
    if (ret) {
        goto out;
    }

    if (snapid) {
        dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
        s->is_snapshot = 1;
    }

    /* the inode is fetched over a separate, short-lived connection */
    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        error_report("failed to connect\n");
        goto out;
    }

    buf = qemu_malloc(SD_INODE_SIZE);
    ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0);

    closesocket(fd);

    if (ret) {
        goto out;
    }

    memcpy(&s->inode, buf, sizeof(s->inode));
    /* min > max means "no dirty data_vdi_id entries" */
    s->min_dirty_data_idx = UINT32_MAX;
    s->max_dirty_data_idx = 0;

    bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
    strncpy(s->name, vdi, sizeof(s->name));
    qemu_free(buf);
    return 0;
out:
    /* only touch the fd handler and the socket if the socket exists */
    if (s->fd >= 0) {
        qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
        closesocket(s->fd);
    }
    qemu_free(buf);
    return -1;
}
1253

  
1254
static int do_sd_create(char *filename, int64_t vdi_size,
1255
                        uint32_t base_vid, uint32_t *vdi_id, int snapshot,
1256
                        const char *addr, const char *port)
1257
{
1258
    SheepdogVdiReq hdr;
1259
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1260
    int fd, ret;
1261
    unsigned int wlen, rlen = 0;
1262
    char buf[SD_MAX_VDI_LEN];
1263

  
1264
    fd = connect_to_sdog(addr, port);
1265
    if (fd < 0) {
1266
        return -EIO;
1267
    }
1268

  
1269
    memset(buf, 0, sizeof(buf));
1270
    strncpy(buf, filename, SD_MAX_VDI_LEN);
1271

  
1272
    memset(&hdr, 0, sizeof(hdr));
1273
    hdr.opcode = SD_OP_NEW_VDI;
1274
    hdr.base_vdi_id = base_vid;
1275

  
1276
    wlen = SD_MAX_VDI_LEN;
1277

  
1278
    hdr.flags = SD_FLAG_CMD_WRITE;
1279
    hdr.snapid = snapshot;
1280

  
1281
    hdr.data_length = wlen;
1282
    hdr.vdi_size = vdi_size;
1283

  
1284
    ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1285

  
1286
    closesocket(fd);
1287

  
1288
    if (ret) {
1289
        return -EIO;
1290
    }
1291

  
1292
    if (rsp->result != SD_RES_SUCCESS) {
1293
        error_report("%s, %s\n", sd_strerror(rsp->result), filename);
1294
        return -EIO;
1295
    }
1296

  
1297
    if (vdi_id) {
1298
        *vdi_id = rsp->vdi_id;
1299
    }
1300

  
1301
    return 0;
1302
}
1303

  
1304
/*
 * Create a new sheepdog vdi.
 *
 * Recognised options are BLOCK_OPT_SIZE (image size) and
 * BLOCK_OPT_BACKING_FILE.  A size above SD_MAX_VDI_SIZE is rejected with
 * -EINVAL.  A backing file must use the "sheepdog" protocol and must be a
 * snapshot; it is opened only long enough to read its vdi id, which is
 * then passed to do_sd_create() as the base vdi.
 *
 * Returns the result of do_sd_create(), -EINVAL for invalid arguments, or
 * -EIO when the backing file cannot be opened.
 */
static int sd_create(const char *filename, QEMUOptionParameter *options)
{
    QEMUOptionParameter *opt;
    char *backing_file = NULL;
    int64_t vdi_size = 0;
    uint32_t base_vid = 0;

    strstart(filename, "sheepdog:", (const char **)&filename);

    /* the option list ends at the first entry without a name */
    for (opt = options; opt && opt->name; opt++) {
        if (strcmp(opt->name, BLOCK_OPT_SIZE) == 0) {
            vdi_size = opt->value.n;
        } else if (strcmp(opt->name, BLOCK_OPT_BACKING_FILE) == 0) {
            backing_file = opt->value.s;
        }
    }

    if (vdi_size > SD_MAX_VDI_SIZE) {
        error_report("too big image size\n");
        return -EINVAL;
    }

    if (backing_file != NULL) {
        BlockDriverState *base_bs;
        BDRVSheepdogState *base;
        BlockDriver *drv;

        /* Currently, only Sheepdog backing image is supported. */
        drv = bdrv_find_protocol(backing_file);
        if (drv == NULL || strcmp(drv->protocol_name, "sheepdog") != 0) {
            error_report("backing_file must be a sheepdog image\n");
            return -EINVAL;
        }

        if (bdrv_file_open(&base_bs, backing_file, 0) < 0) {
            return -EIO;
        }

        base = base_bs->opaque;

        if (!is_snapshot(&base->inode)) {
            error_report("cannot clone from a non snapshot vdi\n");
            bdrv_delete(base_bs);
            return -EINVAL;
        }

        /* remember the base id before the backing image is released */
        base_vid = base->inode.vdi_id;
        bdrv_delete(base_bs);
    }

    return do_sd_create((char *)filename, vdi_size, base_vid, NULL, 0,
                        NULL, NULL);
}
1357

  
1358
/*
 * Close a sheepdog image.
 *
 * A temporary connection is used to send SD_OP_RELEASE_VDI with the vdi
 * name (including its terminating NUL) as payload.  A result other than
 * SD_RES_SUCCESS or SD_RES_VDI_NOT_LOCKED is reported but does not stop
 * the teardown.
 *
 * The local teardown -- removing the fd handler for s->fd, closing s->fd
 * and freeing s->addr -- is performed even when the temporary connection
 * cannot be established, so that a sheep that is unreachable at close
 * time does not leak the socket and the address string.
 */
static void sd_close(BlockDriverState *bs)
{
    BDRVSheepdogState *s = bs->opaque;
    SheepdogVdiReq hdr;
    /* the response is read through the same storage as the request */
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    unsigned int wlen, rlen = 0;
    int fd, ret;

    dprintf("%s\n", s->name);

    fd = connect_to_sdog(s->addr, s->port);
    if (fd >= 0) {
        memset(&hdr, 0, sizeof(hdr));

        hdr.opcode = SD_OP_RELEASE_VDI;
        wlen = strlen(s->name) + 1;
        hdr.data_length = wlen;
        hdr.flags = SD_FLAG_CMD_WRITE;

        ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);

        closesocket(fd);

        if (!ret && rsp->result != SD_RES_SUCCESS &&
            rsp->result != SD_RES_VDI_NOT_LOCKED) {
            error_report("%s, %s\n", sd_strerror(rsp->result), s->name);
        }
    }

    /* release local resources whether or not the vdi could be released */
    qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
    closesocket(s->fd);
    qemu_free(s->addr);
}
1393

  
1394
/*
 * Return the image length, taken from the vdi_size field of the inode
 * held in the driver state.
 */
static int64_t sd_getlength(BlockDriverState *bs)
{
    const BDRVSheepdogState *state = bs->opaque;
    int64_t length = state->inode.vdi_size;

    return length;
}
1400

  
1401
/*
 * Grow the image to offset bytes.
 *
 * Shrinking (offset below the current vdi_size) and sizes above
 * SD_MAX_VDI_SIZE are rejected with -EINVAL.  Otherwise vdi_size in the
 * in-memory inode is updated and the first
 * SD_INODE_SIZE - sizeof(data_vdi_id) bytes of the inode are written to
 * the vdi object over a temporary connection.
 *
 * Returns 0 on success, -EIO if the connection or the write fails.
 *
 * NOTE(review): vdi_size in memory stays at the new value when the write
 * fails -- confirm whether callers rely on that.
 */
static int sd_truncate(BlockDriverState *bs, int64_t offset)
{
    BDRVSheepdogState *s = bs->opaque;
    int ret, fd;
    unsigned int datalen;

    if (offset < s->inode.vdi_size) {
        error_report("shrinking is not supported\n");
        return -EINVAL;
    } else if (offset > SD_MAX_VDI_SIZE) {
        error_report("too big image size\n");
        return -EINVAL;
    }

    fd = connect_to_sdog(s->addr, s->port);
    if (fd < 0) {
        return -EIO;
    }

    /* we don't need to update entire object */
    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
    s->inode.vdi_size = offset;
    ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
                       s->inode.nr_copies, datalen, 0, 0);
    /* fd is a socket: use closesocket() so this also works on Windows */
    closesocket(fd);

    if (ret < 0) {
        error_report("failed to update an inode.\n");
        return -EIO;
    }

    return 0;
}
1434

  
1435
/*
1436
 * This function is called after writing data objects.  If we need to
1437
 * update metadata, this sends a write request to the vdi object.
1438
 * Otherwise, this calls the AIOCB callback.
1439
 */
1440
static void sd_write_done(SheepdogAIOCB *acb)
1441
{
1442
    int ret;
1443
    BDRVSheepdogState *s = acb->common.bs->opaque;
1444
    struct iovec iov;
1445
    AIOReq *aio_req;
1446
    uint32_t offset, data_len, mn, mx;
1447

  
1448
    mn = s->min_dirty_data_idx;
1449
    mx = s->max_dirty_data_idx;
1450
    if (mn <= mx) {
1451
        /* we need to update the vdi object. */
1452
        offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
1453
            mn * sizeof(s->inode.data_vdi_id[0]);
1454
        data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
1455

  
1456
        s->min_dirty_data_idx = UINT32_MAX;
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff