Statistics
| Branch: | Revision:

root / block / raw-posix.c @ c4d9d196

History | View | Annotate | Download (48.9 kB)

1
/*
2
 * Block driver for RAW files (posix)
3
 *
4
 * Copyright (c) 2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include "qemu-common.h"
25
#include "qemu/timer.h"
26
#include "qemu/log.h"
27
#include "block/block_int.h"
28
#include "qemu/module.h"
29
#include "trace.h"
30
#include "block/thread-pool.h"
31
#include "qemu/iov.h"
32
#include "raw-aio.h"
33

    
34
#if defined(__APPLE__) && (__MACH__)
35
#include <paths.h>
36
#include <sys/param.h>
37
#include <IOKit/IOKitLib.h>
38
#include <IOKit/IOBSD.h>
39
#include <IOKit/storage/IOMediaBSDClient.h>
40
#include <IOKit/storage/IOMedia.h>
41
#include <IOKit/storage/IOCDMedia.h>
42
//#include <IOKit/storage/IOCDTypes.h>
43
#include <CoreFoundation/CoreFoundation.h>
44
#endif
45

    
46
#ifdef __sun__
47
#define _POSIX_PTHREAD_SEMANTICS 1
48
#include <sys/dkio.h>
49
#endif
50
#ifdef __linux__
51
#include <sys/types.h>
52
#include <sys/stat.h>
53
#include <sys/ioctl.h>
54
#include <sys/param.h>
55
#include <linux/cdrom.h>
56
#include <linux/fd.h>
57
#include <linux/fs.h>
58
#endif
59
#ifdef CONFIG_FIEMAP
60
#include <linux/fiemap.h>
61
#endif
62
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
63
#include <linux/falloc.h>
64
#endif
65
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
66
#include <sys/disk.h>
67
#include <sys/cdio.h>
68
#endif
69

    
70
#ifdef __OpenBSD__
71
#include <sys/ioctl.h>
72
#include <sys/disklabel.h>
73
#include <sys/dkio.h>
74
#endif
75

    
76
#ifdef __NetBSD__
77
#include <sys/ioctl.h>
78
#include <sys/disklabel.h>
79
#include <sys/dkio.h>
80
#include <sys/disk.h>
81
#endif
82

    
83
#ifdef __DragonFly__
84
#include <sys/ioctl.h>
85
#include <sys/diskslice.h>
86
#endif
87

    
88
#ifdef CONFIG_XFS
89
#include <xfs/xfs.h>
90
#endif
91

    
92
//#define DEBUG_FLOPPY
93

    
94
//#define DEBUG_BLOCK
95
#if defined(DEBUG_BLOCK)
96
#define DEBUG_BLOCK_PRINT(formatCstr, ...) do { if (qemu_log_enabled()) \
97
    { qemu_log(formatCstr, ## __VA_ARGS__); qemu_log_flush(); } } while (0)
98
#else
99
#define DEBUG_BLOCK_PRINT(formatCstr, ...)
100
#endif
101

    
102
/* OS X does not have O_DSYNC */
103
#ifndef O_DSYNC
104
#ifdef O_SYNC
105
#define O_DSYNC O_SYNC
106
#elif defined(O_FSYNC)
107
#define O_DSYNC O_FSYNC
108
#endif
109
#endif
110

    
111
/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
112
#ifndef O_DIRECT
113
#define O_DIRECT O_DSYNC
114
#endif
115

    
116
#define FTYPE_FILE   0
117
#define FTYPE_CD     1
118
#define FTYPE_FD     2
119

    
120
/* if the FD is not accessed during that time (in ns), we try to
121
   reopen it to see if the disk has been changed */
122
#define FD_OPEN_TIMEOUT (1000000000)
123

    
124
#define MAX_BLOCKSIZE        4096
125

    
126
/* Driver-private state hung off BlockDriverState.opaque. */
typedef struct BDRVRawState {
    int fd;             /* descriptor returned by qemu_open(), -1 if none */
    int type;           /* one of the FTYPE_* constants */
    int open_flags;     /* flags the descriptor was opened with */
#if defined(__linux__)
    /* linux floppy specific */
    int64_t fd_open_time;
    int64_t fd_error_time;
    int fd_got_error;
    int fd_media_changed;
#endif
#ifdef CONFIG_LINUX_AIO
    int use_aio;        /* non-zero: aligned O_DIRECT I/O goes via laio */
    void *aio_ctx;      /* context obtained from laio_init() */
#endif
#ifdef CONFIG_XFS
    bool is_xfs : 1;    /* set when platform_test_xfs_fd() accepts fd */
#endif
    bool has_discard : 1; /* cleared once discard proves unsupported */
} BDRVRawState;
146

    
147
/* Scratch state built by raw_reopen_prepare() and consumed by commit/abort. */
typedef struct BDRVRawReopenState {
    int fd;             /* candidate descriptor, -1 while none is held */
    int open_flags;     /* flags the candidate descriptor should carry */
#ifdef CONFIG_LINUX_AIO
    int use_aio;        /* value to install into BDRVRawState on commit */
#endif
} BDRVRawReopenState;
154

    
155
static int fd_open(BlockDriverState *bs);
156
static int64_t raw_getlength(BlockDriverState *bs);
157

    
158
typedef struct RawPosixAIOData {
159
    BlockDriverState *bs;
160
    int aio_fildes;
161
    union {
162
        struct iovec *aio_iov;
163
        void *aio_ioctl_buf;
164
    };
165
    int aio_niov;
166
    uint64_t aio_nbytes;
167
#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
168
    off_t aio_offset;
169
    int aio_type;
170
} RawPosixAIOData;
171

    
172
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
173
static int cdrom_reopen(BlockDriverState *bs);
174
#endif
175

    
176
#if defined(__NetBSD__)
/*
 * Redirect a block-device path to its raw ("r"-prefixed) counterpart.
 *
 * If *filename names a block device, *filename is pointed at a static buffer
 * holding the same path with "r" inserted in front of the last component
 * (for example "/dev/sd0a" becomes "/dev/rsd0a").  Any other kind of file is
 * left untouched.  Because the result lives in a static buffer the function
 * is not reentrant and a later call overwrites an earlier result.
 *
 * Returns 0 on success or -errno if lstat() fails.
 */
static int raw_normalize_devicepath(const char **filename)
{
    static char namebuf[PATH_MAX];
    const char *dp, *fname;
    struct stat sb;

    fname = *filename;
    dp = strrchr(fname, '/');
    if (lstat(fname, &sb) < 0) {
        /*
         * Latch errno before reporting: strerror()/fprintf() are allowed
         * to modify it, which would corrupt the value we return.
         */
        int saved_errno = errno;

        fprintf(stderr, "%s: stat failed: %s\n",
            fname, strerror(saved_errno));
        return -saved_errno;
    }

    if (!S_ISBLK(sb.st_mode)) {
        return 0;
    }

    if (dp == NULL) {
        /* Bare name without a directory part. */
        snprintf(namebuf, PATH_MAX, "r%s", fname);
    } else {
        /* Keep the directory part, prefix only the last component. */
        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
            (int)(dp - fname), fname, dp + 1);
    }
    fprintf(stderr, "%s is a block device", fname);
    *filename = namebuf;
    fprintf(stderr, ", using %s\n", *filename);

    return 0;
}
#else
/*
 * No path normalisation is needed on this host: leave *filename alone and
 * report success.
 */
static int raw_normalize_devicepath(const char **filename)
{
    return 0;
}
#endif
213

    
214
static void raw_parse_flags(int bdrv_flags, int *open_flags)
215
{
216
    assert(open_flags != NULL);
217

    
218
    *open_flags |= O_BINARY;
219
    *open_flags &= ~O_ACCMODE;
220
    if (bdrv_flags & BDRV_O_RDWR) {
221
        *open_flags |= O_RDWR;
222
    } else {
223
        *open_flags |= O_RDONLY;
224
    }
225

    
226
    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
227
     * and O_DIRECT for no caching. */
228
    if ((bdrv_flags & BDRV_O_NOCACHE)) {
229
        *open_flags |= O_DIRECT;
230
    }
231
}
232

    
233
#ifdef CONFIG_LINUX_AIO
234
static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags)
235
{
236
    int ret = -1;
237
    assert(aio_ctx != NULL);
238
    assert(use_aio != NULL);
239
    /*
240
     * Currently Linux do AIO only for files opened with O_DIRECT
241
     * specified so check NOCACHE flag too
242
     */
243
    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
244
                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
245

    
246
        /* if non-NULL, laio_init() has already been run */
247
        if (*aio_ctx == NULL) {
248
            *aio_ctx = laio_init();
249
            if (!*aio_ctx) {
250
                goto error;
251
            }
252
        }
253
        *use_aio = 1;
254
    } else {
255
        *use_aio = 0;
256
    }
257

    
258
    ret = 0;
259

    
260
error:
261
    return ret;
262
}
263
#endif
264

    
265
/*
 * Shared open path for all raw-posix flavours.
 *
 * Normalises the path, derives the open(2) flags from bdrv_flags (starting
 * from the caller-supplied open_flags), opens the file and initialises the
 * driver state: s->fd, s->open_flags, optional Linux AIO state, the discard
 * capability flag and the XFS marker.
 *
 * Returns 0 on success or a negative errno value.  On any failure no
 * descriptor stays open and s->fd is -1.
 */
static int raw_open_common(BlockDriverState *bs, const char *filename,
                           int bdrv_flags, int open_flags)
{
    BDRVRawState *s = bs->opaque;
    int fd, ret;

    ret = raw_normalize_devicepath(&filename);
    if (ret != 0) {
        return ret;
    }

    s->open_flags = open_flags;
    raw_parse_flags(bdrv_flags, &s->open_flags);

    s->fd = -1;
    fd = qemu_open(filename, s->open_flags, 0644);
    if (fd < 0) {
        ret = -errno;
        /* Report a read-only filesystem as a plain permission problem. */
        if (ret == -EROFS) {
            ret = -EACCES;
        }
        return ret;
    }
    s->fd = fd;

#ifdef CONFIG_LINUX_AIO
    if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) {
        /*
         * Latch errno before qemu_close() can overwrite it, and do not
         * leave a closed descriptor behind in the driver state.
         */
        int saved_errno = errno;

        qemu_close(fd);
        s->fd = -1;
        /*
         * raw_set_aio() only promises a -1 return; if errno happens to be
         * 0 we must still report a failure rather than "success".
         */
        return saved_errno ? -saved_errno : -EIO;
    }
#endif

    s->has_discard = 1;
#ifdef CONFIG_XFS
    if (platform_test_xfs_fd(s->fd)) {
        s->is_xfs = 1;
    }
#endif

    return 0;
}
305

    
306
/*
 * Open entry point for regular files: tag the state as FTYPE_FILE and
 * delegate to raw_open_common() with no extra open(2) flags.
 */
static int raw_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVRawState *state = bs->opaque;
    const int extra_open_flags = 0;

    state->type = FTYPE_FILE;

    return raw_open_common(bs, filename, flags, extra_open_flags);
}
313

    
314
static int raw_reopen_prepare(BDRVReopenState *state,
315
                              BlockReopenQueue *queue, Error **errp)
316
{
317
    BDRVRawState *s;
318
    BDRVRawReopenState *raw_s;
319
    int ret = 0;
320

    
321
    assert(state != NULL);
322
    assert(state->bs != NULL);
323

    
324
    s = state->bs->opaque;
325

    
326
    state->opaque = g_malloc0(sizeof(BDRVRawReopenState));
327
    raw_s = state->opaque;
328

    
329
#ifdef CONFIG_LINUX_AIO
330
    raw_s->use_aio = s->use_aio;
331

    
332
    /* we can use s->aio_ctx instead of a copy, because the use_aio flag is
333
     * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio()
334
     * won't override aio_ctx if aio_ctx is non-NULL */
335
    if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) {
336
        return -1;
337
    }
338
#endif
339

    
340
    if (s->type == FTYPE_FD || s->type == FTYPE_CD) {
341
        raw_s->open_flags |= O_NONBLOCK;
342
    }
343

    
344
    raw_parse_flags(state->flags, &raw_s->open_flags);
345

    
346
    raw_s->fd = -1;
347

    
348
    int fcntl_flags = O_APPEND | O_NONBLOCK;
349
#ifdef O_NOATIME
350
    fcntl_flags |= O_NOATIME;
351
#endif
352

    
353
#ifdef O_ASYNC
354
    /* Not all operating systems have O_ASYNC, and those that don't
355
     * will not let us track the state into raw_s->open_flags (typically
356
     * you achieve the same effect with an ioctl, for example I_SETSIG
357
     * on Solaris). But we do not use O_ASYNC, so that's fine.
358
     */
359
    assert((s->open_flags & O_ASYNC) == 0);
360
#endif
361

    
362
    if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
363
        /* dup the original fd */
364
        /* TODO: use qemu fcntl wrapper */
365
#ifdef F_DUPFD_CLOEXEC
366
        raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0);
367
#else
368
        raw_s->fd = dup(s->fd);
369
        if (raw_s->fd != -1) {
370
            qemu_set_cloexec(raw_s->fd);
371
        }
372
#endif
373
        if (raw_s->fd >= 0) {
374
            ret = fcntl_setfl(raw_s->fd, raw_s->open_flags);
375
            if (ret) {
376
                qemu_close(raw_s->fd);
377
                raw_s->fd = -1;
378
            }
379
        }
380
    }
381

    
382
    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
383
    if (raw_s->fd == -1) {
384
        assert(!(raw_s->open_flags & O_CREAT));
385
        raw_s->fd = qemu_open(state->bs->filename, raw_s->open_flags);
386
        if (raw_s->fd == -1) {
387
            ret = -1;
388
        }
389
    }
390
    return ret;
391
}
392

    
393

    
394
/*
 * Commit phase of a reopen: retire the old descriptor, install the one
 * prepared by raw_reopen_prepare() together with its flags (and AIO mode),
 * then release the scratch state.
 */
static void raw_reopen_commit(BDRVReopenState *state)
{
    BDRVRawState *live = state->bs->opaque;
    BDRVRawReopenState *prepared = state->opaque;

    qemu_close(live->fd);

    live->fd = prepared->fd;
    live->open_flags = prepared->open_flags;
#ifdef CONFIG_LINUX_AIO
    live->use_aio = prepared->use_aio;
#endif

    g_free(state->opaque);
    state->opaque = NULL;
}
410

    
411

    
412
/*
 * Abort phase of a reopen: close the descriptor obtained during prepare, if
 * any, and free the scratch state.  A NULL state->opaque means prepare never
 * got far enough to allocate anything, so there is nothing to undo.
 */
static void raw_reopen_abort(BDRVReopenState *state)
{
    BDRVRawReopenState *prepared = state->opaque;

    if (prepared != NULL) {
        if (prepared->fd >= 0) {
            qemu_close(prepared->fd);
            prepared->fd = -1;
        }
        g_free(state->opaque);
        state->opaque = NULL;
    }
}
428

    
429

    
430
/* XXX: use host sector size if necessary with:
431
#ifdef DIOCGSECTORSIZE
432
        {
433
            unsigned int sectorsize = 512;
434
            if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) &&
435
                sectorsize > bufsize)
436
                bufsize = sectorsize;
437
        }
438
#endif
439
#ifdef CONFIG_COCOA
440
        uint32_t blockSize = 512;
441
        if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) {
442
            bufsize = blockSize;
443
        }
444
#endif
445
*/
446

    
447
/*
 * Worker-side handler for QEMU_AIO_IOCTL: issue the ioctl described by the
 * request.  Returns 0 on success or -errno when ioctl() reports -1.
 */
static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
{
    if (ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd,
              aiocb->aio_ioctl_buf) == -1) {
        return -errno;
    }
    return 0;
}
458

    
459
/*
 * Worker-side handler for QEMU_AIO_FLUSH: flush the request's descriptor
 * with qemu_fdatasync().  Returns 0 on success or -errno on failure.
 */
static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
{
    if (qemu_fdatasync(aiocb->aio_fildes) == -1) {
        return -errno;
    }
    return 0;
}
469

    
470
#ifdef CONFIG_PREADV

/* Cleared at run time once the host reports that preadv/pwritev is absent. */
static bool preadv_present = true;

/* Thin wrapper around the host preadv(); same return convention. */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return preadv(fd, iov, nr_iov, offset);
}

/* Thin wrapper around the host pwritev(); same return convention. */
static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return pwritev(fd, iov, nr_iov, offset);
}

#else

/* Built without preadv/pwritev: callers must linearise vectored requests. */
static bool preadv_present = false;

/*
 * Stub used when the host has no preadv().  Always returns -ENOSYS directly
 * (errno is not touched), which callers treat as "not supported".
 */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

/*
 * Stub used when the host has no pwritev().  Always returns -ENOSYS directly
 * (errno is not touched), which callers treat as "not supported".
 */
static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

#endif
503

    
504
/*
 * Perform a vectored read or write for the request in a single
 * preadv/pwritev call, restarting the call when it is interrupted (EINTR).
 *
 * Returns the value reported by the call (possibly short), or -errno when
 * the call returns -1.
 */
static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
{
    ssize_t len;

    for (;;) {
        if (aiocb->aio_type & QEMU_AIO_WRITE) {
            len = qemu_pwritev(aiocb->aio_fildes, aiocb->aio_iov,
                               aiocb->aio_niov, aiocb->aio_offset);
        } else {
            len = qemu_preadv(aiocb->aio_fildes, aiocb->aio_iov,
                              aiocb->aio_niov, aiocb->aio_offset);
        }
        if (len != -1 || errno != EINTR) {
            break;
        }
    }

    return (len == -1) ? -errno : len;
}
526

    
527
/*
528
 * Read/writes the data to/from a given linear buffer.
529
 *
530
 * Returns the number of bytes handles or -errno in case of an error. Short
531
 * reads are only returned if the end of the file is reached.
532
 */
533
static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
534
{
535
    ssize_t offset = 0;
536
    ssize_t len;
537

    
538
    while (offset < aiocb->aio_nbytes) {
539
        if (aiocb->aio_type & QEMU_AIO_WRITE) {
540
            len = pwrite(aiocb->aio_fildes,
541
                         (const char *)buf + offset,
542
                         aiocb->aio_nbytes - offset,
543
                         aiocb->aio_offset + offset);
544
        } else {
545
            len = pread(aiocb->aio_fildes,
546
                        buf + offset,
547
                        aiocb->aio_nbytes - offset,
548
                        aiocb->aio_offset + offset);
549
        }
550
        if (len == -1 && errno == EINTR) {
551
            continue;
552
        } else if (len == -1) {
553
            offset = -errno;
554
            break;
555
        } else if (len == 0) {
556
            break;
557
        }
558
        offset += len;
559
    }
560

    
561
    return offset;
562
}
563

    
564
/*
 * Execute a read or write request in the worker thread.
 *
 * Aligned requests with a single iovec go straight to pread/pwrite.
 * Aligned multi-iovec requests try preadv/pwritev first; if that reports
 * -ENOSYS or a short transfer, preadv_present is cleared and the request is
 * retried through a bounce buffer.  Misaligned requests always use the
 * bounce buffer: all segments are copied into (writes) or out of (reads) one
 * buffer obtained from qemu_blockalign().
 *
 * Returns the result of the underlying handler: a byte count or -errno.
 */
static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
{
    ssize_t nbytes;
    char *bounce, *cursor;
    int i;

    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
        /* One properly aligned buffer: plain pread/pwrite is enough. */
        if (aiocb->aio_niov == 1) {
            return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
        }

        /* Several aligned buffers: prefer the vectored system calls. */
        if (preadv_present) {
            nbytes = handle_aiocb_rw_vector(aiocb);
            if (nbytes == aiocb->aio_nbytes ||
                (nbytes < 0 && nbytes != -ENOSYS)) {
                return nbytes;
            }
            /*
             * -ENOSYS or a short transfer: there is no easy way to handle
             * the remainder with these interfaces, so stop using them and
             * redo the request through the bounce buffer below.
             */
            preadv_present = false;
        }
    }

    /* The hard way: linearise all segments in one aligned buffer. */
    bounce = qemu_blockalign(aiocb->bs, aiocb->aio_nbytes);
    cursor = bounce;

    if (aiocb->aio_type & QEMU_AIO_WRITE) {
        /* Gather the segments, then write them out in one go. */
        for (i = 0; i < aiocb->aio_niov; ++i) {
            const struct iovec *seg = &aiocb->aio_iov[i];

            memcpy(cursor, seg->iov_base, seg->iov_len);
            cursor += seg->iov_len;
        }
        nbytes = handle_aiocb_rw_linear(aiocb, bounce);
    } else {
        size_t remaining;

        /* Read into the bounce buffer, then scatter to the segments. */
        nbytes = handle_aiocb_rw_linear(aiocb, bounce);
        remaining = aiocb->aio_nbytes;
        for (i = 0; i < aiocb->aio_niov && remaining; ++i) {
            const struct iovec *seg = &aiocb->aio_iov[i];
            size_t chunk = remaining;

            if (chunk > seg->iov_len) {
                chunk = seg->iov_len;
            }
            memcpy(seg->iov_base, cursor, chunk);
            cursor    += chunk;
            remaining -= chunk;
        }
    }
    qemu_vfree(bounce);

    return nbytes;
}
634

    
635
#ifdef CONFIG_XFS
636
/*
 * Punch a hole into an XFS file with XFS_IOC_UNRESVSP64.
 *
 * @offset and @bytes describe the byte range, relative to the start of the
 * file.  Returns 0 on success or -errno if xfsctl() fails.
 */
static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
    struct xfs_flock64 fl;

    memset(&fl, 0, sizeof(fl));
    fl.l_whence = SEEK_SET;
    fl.l_start = offset;
    fl.l_len = bytes;

    if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
        /*
         * Latch errno first: with DEBUG_BLOCK enabled the debug print
         * calls into the logging code, which may overwrite it.
         */
        int saved_errno = errno;

        DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(saved_errno));
        return -saved_errno;
    }

    return 0;
}
652
#endif
653

    
654
/*
 * Worker-side handler for QEMU_AIO_DISCARD.
 *
 * Block devices (QEMU_AIO_BLKDEV) use the BLKDISCARD ioctl where available;
 * regular files use xfs_discard() on XFS or fallocate() hole punching where
 * available.  When the host reports the operation as unsupported (ENODEV,
 * ENOSYS, EOPNOTSUPP or ENOTTY), has_discard is cleared so later requests
 * return immediately, and the request is reported as successful.
 *
 * Returns 0 on success (including "unsupported") or -errno.
 */
static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
{
    BDRVRawState *s = aiocb->bs->opaque;
    int ret = -EOPNOTSUPP;

    if (!s->has_discard) {
        return 0;
    }

    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
#ifdef BLKDISCARD
        do {
            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };

            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
                return 0;
            }
        } while (errno == EINTR);

        ret = -errno;
#endif
    } else {
#ifdef CONFIG_XFS
        if (s->is_xfs) {
            return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
        }
#endif

#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
        do {
            if (fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                          aiocb->aio_offset, aiocb->aio_nbytes) == 0) {
                return 0;
            }
        } while (errno == EINTR);

        ret = -errno;
#endif
    }

    /* Anything other than "not supported" is a genuine error. */
    if (ret != -ENODEV && ret != -ENOSYS && ret != -EOPNOTSUPP &&
        ret != -ENOTTY) {
        return ret;
    }

    s->has_discard = 0;
    return 0;
}
700

    
701
static int aio_worker(void *arg)
702
{
703
    RawPosixAIOData *aiocb = arg;
704
    ssize_t ret = 0;
705

    
706
    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
707
    case QEMU_AIO_READ:
708
        ret = handle_aiocb_rw(aiocb);
709
        if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->bs->growable) {
710
            iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
711
                      0, aiocb->aio_nbytes - ret);
712

    
713
            ret = aiocb->aio_nbytes;
714
        }
715
        if (ret == aiocb->aio_nbytes) {
716
            ret = 0;
717
        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
718
            ret = -EINVAL;
719
        }
720
        break;
721
    case QEMU_AIO_WRITE:
722
        ret = handle_aiocb_rw(aiocb);
723
        if (ret == aiocb->aio_nbytes) {
724
            ret = 0;
725
        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
726
            ret = -EINVAL;
727
        }
728
        break;
729
    case QEMU_AIO_FLUSH:
730
        ret = handle_aiocb_flush(aiocb);
731
        break;
732
    case QEMU_AIO_IOCTL:
733
        ret = handle_aiocb_ioctl(aiocb);
734
        break;
735
    case QEMU_AIO_DISCARD:
736
        ret = handle_aiocb_discard(aiocb);
737
        break;
738
    default:
739
        fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
740
        ret = -EINVAL;
741
        break;
742
    }
743

    
744
    g_slice_free(RawPosixAIOData, aiocb);
745
    return ret;
746
}
747

    
748
static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
749
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
750
        BlockDriverCompletionFunc *cb, void *opaque, int type)
751
{
752
    RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
753
    ThreadPool *pool;
754

    
755
    acb->bs = bs;
756
    acb->aio_type = type;
757
    acb->aio_fildes = fd;
758

    
759
    if (qiov) {
760
        acb->aio_iov = qiov->iov;
761
        acb->aio_niov = qiov->niov;
762
    }
763
    acb->aio_nbytes = nb_sectors * 512;
764
    acb->aio_offset = sector_num * 512;
765

    
766
    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
767
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
768
    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
769
}
770

    
771
static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
772
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
773
        BlockDriverCompletionFunc *cb, void *opaque, int type)
774
{
775
    BDRVRawState *s = bs->opaque;
776

    
777
    if (fd_open(bs) < 0)
778
        return NULL;
779

    
780
    /*
781
     * If O_DIRECT is used the buffer needs to be aligned on a sector
782
     * boundary.  Check if this is the case or tell the low-level
783
     * driver that it needs to copy the buffer.
784
     */
785
    if ((bs->open_flags & BDRV_O_NOCACHE)) {
786
        if (!bdrv_qiov_is_aligned(bs, qiov)) {
787
            type |= QEMU_AIO_MISALIGNED;
788
#ifdef CONFIG_LINUX_AIO
789
        } else if (s->use_aio) {
790
            return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
791
                               nb_sectors, cb, opaque, type);
792
#endif
793
        }
794
    }
795

    
796
    return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors,
797
                       cb, opaque, type);
798
}
799

    
800
static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
801
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
802
        BlockDriverCompletionFunc *cb, void *opaque)
803
{
804
    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
805
                          cb, opaque, QEMU_AIO_READ);
806
}
807

    
808
static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
809
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
810
        BlockDriverCompletionFunc *cb, void *opaque)
811
{
812
    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
813
                          cb, opaque, QEMU_AIO_WRITE);
814
}
815

    
816
static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
817
        BlockDriverCompletionFunc *cb, void *opaque)
818
{
819
    BDRVRawState *s = bs->opaque;
820

    
821
    if (fd_open(bs) < 0)
822
        return NULL;
823

    
824
    return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
825
}
826

    
827
/* Close the device's descriptor, if one is open, and mark it closed (-1). */
static void raw_close(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;

    if (s->fd < 0) {
        return;
    }

    qemu_close(s->fd);
    s->fd = -1;
}
835

    
836
/*
 * Resize the image to @offset bytes.
 *
 * Regular files are resized with ftruncate().  Character and block devices
 * cannot be resized: the call succeeds as long as @offset does not exceed
 * the length reported by raw_getlength(), and fails with -EINVAL otherwise.
 * Any other file type yields -ENOTSUP.
 *
 * Returns 0 on success or a negative errno value.
 */
static int raw_truncate(BlockDriverState *bs, int64_t offset)
{
    BDRVRawState *s = bs->opaque;
    struct stat st;

    if (fstat(s->fd, &st) != 0) {
        return -errno;
    }

    if (S_ISREG(st.st_mode)) {
        return (ftruncate(s->fd, offset) < 0) ? -errno : 0;
    }

    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
        return (offset > raw_getlength(bs)) ? -EINVAL : 0;
    }

    return -ENOTSUP;
}
859

    
860
#ifdef __OpenBSD__
861
/*
 * Return the size of the image in bytes (OpenBSD).
 *
 * For character and block devices the size comes from the disklabel:
 * sector size times the size of the partition selected by the device
 * number.  For anything else the st_size reported by fstat() is used.
 *
 * Returns the length in bytes, or -errno if fstat() or the DIOCGDINFO
 * ioctl fails (a bare -1 would be indistinguishable from -EPERM for callers
 * that interpret negative results as errno values).
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int fd = s->fd;
    struct stat st;
    struct disklabel dl;

    if (fstat(fd, &st)) {
        return -errno;
    }

    if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
        return st.st_size;
    }

    if (ioctl(fd, DIOCGDINFO, &dl)) {
        return -errno;
    }
    return (uint64_t)dl.d_secsize *
        dl.d_partitions[DISKPART(st.st_rdev)].p_size;
}
879
#elif defined(__NetBSD__)
880
/*
 * Return the size of the image in bytes (NetBSD).
 *
 * For character and block devices the wedge information is tried first
 * (dkw_size counted in 512-byte units); if that ioctl fails the disklabel is
 * used instead: sector size times the size of the partition selected by the
 * device number.  For anything else the st_size reported by fstat() is used.
 *
 * Returns the length in bytes, or -errno if fstat() or the DIOCGDINFO
 * ioctl fails (a bare -1 would be indistinguishable from -EPERM for callers
 * that interpret negative results as errno values).
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int fd = s->fd;
    struct stat st;
    struct dkwedge_info dkw;
    struct disklabel dl;

    if (fstat(fd, &st)) {
        return -errno;
    }

    if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
        return st.st_size;
    }

    if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
        return dkw.dkw_size * 512;
    }

    /* Not a wedge: fall back to the classic disklabel. */
    if (ioctl(fd, DIOCGDINFO, &dl)) {
        return -errno;
    }
    return (uint64_t)dl.d_secsize *
        dl.d_partitions[DISKPART(st.st_rdev)].p_size;
}
904
#elif defined(__sun__)
905
/*
 * Return the size of the image in bytes (Solaris).
 *
 * The DKIOCGMEDIAINFO ioctl is tried first (block size times capacity); if
 * it fails the size is taken from lseek(SEEK_END).  Returns the negative
 * result of fd_open() if the descriptor cannot be made ready.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    struct dk_minfo minfo;
    int rc;

    rc = fd_open(bs);
    if (rc < 0) {
        return rc;
    }

    if (ioctl(s->fd, DKIOCGMEDIAINFO, &minfo) != -1) {
        return minfo.dki_lbsize * minfo.dki_capacity;
    }

    /*
     * There are reports that lseek on some devices fails, but
     * irc discussion said that contingency on contingency was overkill.
     */
    return lseek(s->fd, 0, SEEK_END);
}
930
#elif defined(CONFIG_BSD)
931
/*
 * Return the size of the image in bytes (generic BSD, including OS X).
 *
 * When fstat() succeeds and the S_IFCHR bit is set in the mode, the size is
 * queried with DIOCGMEDIASIZE or DIOCGPART where the host defines them.  If
 * that query is unavailable, fails, or (DIOCGPART) yields 0, the fallback is
 * LONG_LONG_MAX on OS X and lseek(SEEK_END) elsewhere.  On FreeBSD a CD
 * drive reporting no usable size is reopened once via cdrom_reopen() and
 * queried again.  All other files use lseek(SEEK_END).
 *
 * Returns the negative result of fd_open() if the descriptor cannot be made
 * ready.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int fd = s->fd;
    int64_t size;
    struct stat sb;
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
    int reopened = 0;
#endif
    int ret;

    ret = fd_open(bs);
    if (ret < 0) {
        return ret;
    }

#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
again:
#endif
    if (fstat(fd, &sb) == 0 && (S_IFCHR & sb.st_mode)) {
        /* Set when the ioctl-based query did not produce a usable size. */
        int need_fallback = 1;

#ifdef DIOCGMEDIASIZE
        need_fallback = ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size) != 0;
#elif defined(DIOCGPART)
        {
            struct partinfo pi;

            if (ioctl(fd, DIOCGPART, &pi) == 0) {
                size = pi.media_size;
            } else {
                size = 0;
            }
        }
        need_fallback = (size == 0);
#endif
        if (need_fallback) {
#if defined(__APPLE__) && defined(__MACH__)
            size = LONG_LONG_MAX;
#else
            size = lseek(fd, 0LL, SEEK_END);
#endif
        }
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
        if (s->type == FTYPE_CD) {
            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
            if (size == 2048LL * (unsigned)-1) {
                size = 0;
            }
            /* XXX no disc?  maybe we need to reopen... */
            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
                reopened = 1;
                goto again;
            }
        }
#endif
    } else {
        size = lseek(fd, 0, SEEK_END);
    }
    return size;
}
985
#else
986
/*
 * Return the size of the image in bytes (generic fallback).
 *
 * The size is whatever lseek(SEEK_END) reports on the descriptor.  Returns
 * the negative result of fd_open() if the descriptor cannot be made ready.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    const int rc = fd_open(bs);

    return (rc < 0) ? (int64_t)rc : (int64_t)lseek(s->fd, 0, SEEK_END);
}
998
#endif
999

    
1000
/*
 * Return the number of bytes actually allocated for the image, computed as
 * st_blocks * 512 from fstat(), or -errno if fstat() fails.
 */
static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    struct stat st;

    return (fstat(s->fd, &st) < 0) ? -errno : (int64_t)st.st_blocks * 512;
}
1010

    
1011
/*
 * Create (or truncate) a raw image file.
 *
 * The size is taken from the BLOCK_OPT_SIZE option and rounded down to a
 * whole number of sectors; without that option the file is created empty.
 * The file is opened write-only with mode 0644 and sized with ftruncate().
 *
 * Returns 0 on success or -errno from the failing open, ftruncate or close
 * (a close failure takes precedence over an ftruncate failure).
 */
static int raw_create(const char *filename, QEMUOptionParameter *options)
{
    int64_t sectors = 0;
    int status = 0;
    int fd;

    /* Read out options */
    for (; options && options->name; options++) {
        if (strcmp(options->name, BLOCK_OPT_SIZE) == 0) {
            sectors = options->value.n / BDRV_SECTOR_SIZE;
        }
    }

    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
                   0644);
    if (fd < 0) {
        return -errno;
    }

    if (ftruncate(fd, sectors * BDRV_SECTOR_SIZE) != 0) {
        status = -errno;
    }
    if (qemu_close(fd) != 0) {
        status = -errno;
    }
    return status;
}
1039

    
1040
/*
1041
 * Returns true iff the specified sector is present in the disk image. Drivers
1042
 * not implementing the functionality are assumed to not support backing files,
1043
 * hence all their sectors are reported as allocated.
1044
 *
1045
 * If 'sector_num' is beyond the end of the disk image the return value is 0
1046
 * and 'pnum' is set to 0.
1047
 *
1048
 * 'pnum' is set to the number of sectors (including and immediately following
1049
 * the specified sector) that are known to be in the same
1050
 * allocated/unallocated state.
1051
 *
1052
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1053
 * beyond the end of the disk image it will be clamped.
1054
 */
1055
static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
1056
                                            int64_t sector_num,
1057
                                            int nb_sectors, int *pnum)
1058
{
1059
    off_t start, data, hole;
1060
    int ret;
1061

    
1062
    ret = fd_open(bs);
1063
    if (ret < 0) {
1064
        return ret;
1065
    }
1066

    
1067
    start = sector_num * BDRV_SECTOR_SIZE;
1068

    
1069
#ifdef CONFIG_FIEMAP
1070

    
1071
    BDRVRawState *s = bs->opaque;
1072
    struct {
1073
        struct fiemap fm;
1074
        struct fiemap_extent fe;
1075
    } f;
1076

    
1077
    f.fm.fm_start = start;
1078
    f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
1079
    f.fm.fm_flags = 0;
1080
    f.fm.fm_extent_count = 1;
1081
    f.fm.fm_reserved = 0;
1082
    if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
1083
        /* Assume everything is allocated.  */
1084
        *pnum = nb_sectors;
1085
        return 1;
1086
    }
1087

    
1088
    if (f.fm.fm_mapped_extents == 0) {
1089
        /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length.
1090
         * f.fm.fm_start + f.fm.fm_length must be clamped to the file size!
1091
         */
1092
        off_t length = lseek(s->fd, 0, SEEK_END);
1093
        hole = f.fm.fm_start;
1094
        data = MIN(f.fm.fm_start + f.fm.fm_length, length);
1095
    } else {
1096
        data = f.fe.fe_logical;
1097
        hole = f.fe.fe_logical + f.fe.fe_length;
1098
    }
1099

    
1100
#elif defined SEEK_HOLE && defined SEEK_DATA
1101

    
1102
    BDRVRawState *s = bs->opaque;
1103

    
1104
    hole = lseek(s->fd, start, SEEK_HOLE);
1105
    if (hole == -1) {
1106
        /* -ENXIO indicates that sector_num was past the end of the file.
1107
         * There is a virtual hole there.  */
1108
        assert(errno != -ENXIO);
1109

    
1110
        /* Most likely EINVAL.  Assume everything is allocated.  */
1111
        *pnum = nb_sectors;
1112
        return 1;
1113
    }
1114

    
1115
    if (hole > start) {
1116
        data = start;
1117
    } else {
1118
        /* On a hole.  We need another syscall to find its end.  */
1119
        data = lseek(s->fd, start, SEEK_DATA);
1120
        if (data == -1) {
1121
            data = lseek(s->fd, 0, SEEK_END);
1122
        }
1123
    }
1124
#else
1125
    *pnum = nb_sectors;
1126
    return 1;
1127
#endif
1128

    
1129
    if (data <= start) {
1130
        /* On a data extent, compute sectors to the end of the extent.  */
1131
        *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
1132
        return 1;
1133
    } else {
1134
        /* On a hole, compute sectors to the beginning of the next extent.  */
1135
        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
1136
        return 0;
1137
    }
1138
}
1139

    
1140
/*
 * Submit a QEMU_AIO_DISCARD request for 'nb_sectors' sectors starting at
 * 'sector_num' on the descriptor held in the driver state.  The request is
 * handed to paio_submit() with no data buffer; its result is returned.
 */
static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors,
    BlockDriverCompletionFunc *cb, void *opaque)
{
    BDRVRawState *state = bs->opaque;

    return paio_submit(bs, state->fd, sector_num, NULL, nb_sectors, cb,
                       opaque, QEMU_AIO_DISCARD);
}
1149

    
1150
static QEMUOptionParameter raw_create_options[] = {
1151
    {
1152
        .name = BLOCK_OPT_SIZE,
1153
        .type = OPT_SIZE,
1154
        .help = "Virtual disk size"
1155
    },
1156
    { NULL }
1157
};
1158

    
1159
static BlockDriver bdrv_file = {
1160
    .format_name = "file",
1161
    .protocol_name = "file",
1162
    .instance_size = sizeof(BDRVRawState),
1163
    .bdrv_probe = NULL, /* no probe for protocols */
1164
    .bdrv_file_open = raw_open,
1165
    .bdrv_reopen_prepare = raw_reopen_prepare,
1166
    .bdrv_reopen_commit = raw_reopen_commit,
1167
    .bdrv_reopen_abort = raw_reopen_abort,
1168
    .bdrv_close = raw_close,
1169
    .bdrv_create = raw_create,
1170
    .bdrv_co_is_allocated = raw_co_is_allocated,
1171

    
1172
    .bdrv_aio_readv = raw_aio_readv,
1173
    .bdrv_aio_writev = raw_aio_writev,
1174
    .bdrv_aio_flush = raw_aio_flush,
1175
    .bdrv_aio_discard = raw_aio_discard,
1176

    
1177
    .bdrv_truncate = raw_truncate,
1178
    .bdrv_getlength = raw_getlength,
1179
    .bdrv_get_allocated_file_size
1180
                        = raw_get_allocated_file_size,
1181

    
1182
    .create_options = raw_create_options,
1183
};
1184

    
1185
/***********************************************/
1186
/* host device */
1187

    
1188
#if defined(__APPLE__) && defined(__MACH__)
1189
static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator );
1190
static kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize );
1191

    
1192
/*
 * Fill '*mediaIterator' with the IOKit services of class kIOCDMediaClass
 * whose kIOMediaEjectableKey property is true.
 *
 * Failures of the intermediate calls are only printed; the value returned
 * is the result of IOServiceGetMatchingServices().
 */
kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator )
{
    CFMutableDictionaryRef  matchDict;
    mach_port_t             ioPort;
    kern_return_t           status;

    status = IOMasterPort( MACH_PORT_NULL, &ioPort );
    if ( status != KERN_SUCCESS ) {
        printf( "IOMasterPort returned %d\n", status );
    }

    matchDict = IOServiceMatching( kIOCDMediaClass );
    if ( matchDict != NULL ) {
        CFDictionarySetValue( matchDict, CFSTR( kIOMediaEjectableKey ), kCFBooleanTrue );
    } else {
        printf( "IOServiceMatching returned a NULL dictionary.\n" );
    }

    status = IOServiceGetMatchingServices( ioPort, matchDict, mediaIterator );
    if ( status != KERN_SUCCESS ) {
        printf( "IOServiceGetMatchingServices returned %d\n", status );
    }

    return status;
}
1217

    
1218
/*
 * Build the raw device path (_PATH_DEV followed by "r" and the BSD name)
 * of the next medium in 'mediaIterator' into 'bsdPath', which holds
 * 'maxPathSize' bytes.
 *
 * Returns KERN_SUCCESS when the full path was written.  Otherwise returns
 * KERN_FAILURE; 'bsdPath' is then empty if the iterator yielded nothing, the
 * medium has no BSD name property, or the prefix did not fit.  If only the
 * BSD name could not be converted, 'bsdPath' still holds the bare prefix.
 */
kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize )
{
    io_object_t     nextMedia;
    kern_return_t   kernResult = KERN_FAILURE;

    /* Nothing can be stored, not even the terminator */
    if ( maxPathSize <= 0 ) {
        return kernResult;
    }

    *bsdPath = '\0';
    nextMedia = IOIteratorNext( mediaIterator );
    if ( nextMedia )
    {
        CFTypeRef   bsdPathAsCFString;
        bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
        if ( bsdPathAsCFString ) {
            /* Bounded write of the prefix: never exceeds maxPathSize */
            int devPathLength = snprintf( bsdPath, (size_t)maxPathSize, "%sr", _PATH_DEV );

            if ( devPathLength < 0 || devPathLength >= maxPathSize ) {
                /* Prefix was truncated; do not leave a partial path behind */
                *bsdPath = '\0';
            } else if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
                kernResult = KERN_SUCCESS;
            }
            CFRelease( bsdPathAsCFString );
        }
        IOObjectRelease( nextMedia );
    }

    return kernResult;
}
1243

    
1244
#endif
1245

    
1246
/*
 * Score how well this driver matches 'filename':
 *   50  - the name starts with "/dev/cdrom"
 *   100 - stat() succeeds and reports a character or block device
 *   0   - anything else
 */
static int hdev_probe_device(const char *filename)
{
    struct stat info;
    int is_device;

    /* allow a dedicated CD-ROM driver to match with a higher priority */
    if (strstart(filename, "/dev/cdrom", NULL)) {
        return 50;
    }

    if (stat(filename, &info) < 0) {
        return 0;
    }

    is_device = S_ISCHR(info.st_mode) || S_ISBLK(info.st_mode);
    return is_device ? 100 : 0;
}
1261

    
1262
/*
 * Check whether the device behind s->fd can be written.
 *
 * Returns 0 if it can (or if BLKROGET is unavailable or the descriptor is
 * not a block device), -EACCES if the device is flagged read-only, or
 * -errno if fstat()/ioctl() fails.
 */
static int check_hdev_writable(BDRVRawState *s)
{
#if defined(BLKROGET)
    /* Linux block devices can be configured "read-only" using blockdev(8).
     * This is independent of device node permissions and therefore open(2)
     * with O_RDWR succeeds.  Actual writes fail with EPERM.
     *
     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
     * check for read-only block devices so that Linux block devices behave
     * properly.
     */
    struct stat info;
    int ro_flag = 0;

    if (fstat(s->fd, &info) != 0) {
        return -errno;
    }

    /* Only block devices are queried for the read-only flag */
    if (S_ISBLK(info.st_mode)) {
        if (ioctl(s->fd, BLKROGET, &ro_flag) < 0) {
            return -errno;
        }
        if (ro_flag != 0) {
            return -EACCES;
        }
    }
#endif /* defined(BLKROGET) */
    return 0;
}
1294

    
1295
/*
 * Open a host device.
 *
 * On Mac OS X a filename starting with "/dev/cdrom" is replaced by the raw
 * BSD path of the first ejectable CD medium found.  On Linux, paths that
 * resolve to "/dev/sg*" mark the state as a SCSI generic device.  When
 * BDRV_O_RDWR is requested the device is additionally checked for being
 * writable and the open fails if it is not.
 *
 * Returns the result of raw_open_common() on success, or a negative value.
 */
static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVRawState *s = bs->opaque;
    int ret;
#if defined(__APPLE__) && defined(__MACH__)
    /* Must live at function scope: 'filename' may point into this buffer
     * until raw_open_common() has been called below. */
    char bsdPath[ MAXPATHLEN ] = "";
#endif

#if defined(__APPLE__) && defined(__MACH__)
    if (strstart(filename, "/dev/cdrom", NULL)) {
        io_iterator_t mediaIterator = 0;
        int fd;

        /* Only walk the iterator if the lookup actually produced one */
        if ( FindEjectableCDMedia( &mediaIterator ) == KERN_SUCCESS ) {
            /* Success is judged below by whether bsdPath was filled in */
            GetBSDPath( mediaIterator, bsdPath, sizeof( bsdPath ) );
        }

        /* Need room for the two-character partition suffix */
        if ( bsdPath[ 0 ] != '\0' && strlen(bsdPath) + 2 < sizeof( bsdPath ) ) {
            strcat(bsdPath,"s0");
            /* some CDs don't have a partition 0 */
            fd = qemu_open(bsdPath, O_RDONLY | O_BINARY | O_LARGEFILE);
            if (fd < 0) {
                bsdPath[strlen(bsdPath)-1] = '1';
            } else {
                qemu_close(fd);
            }
            filename = bsdPath;
        }

        if ( mediaIterator )
            IOObjectRelease( mediaIterator );
    }
#endif

    s->type = FTYPE_FILE;
#if defined(__linux__)
    {
        char resolved_path[ MAXPATHLEN ], *temp;

        temp = realpath(filename, resolved_path);
        if (temp && strstart(temp, "/dev/sg", NULL)) {
            bs->sg = 1;
        }
    }
#endif

    ret = raw_open_common(bs, filename, flags, 0);
    if (ret < 0) {
        return ret;
    }

    if (flags & BDRV_O_RDWR) {
        ret = check_hdev_writable(s);
        if (ret < 0) {
            raw_close(bs);
            return ret;
        }
    }

    return ret;
}
1354

    
1355
#if defined(__linux__)
1356
/* Note: we do not have a reliable method to detect if the floppy is
1357
   present. The current method is to try to open the floppy at every
1358
   I/O and to keep it opened during a few hundreds of ms. */
1359
static int fd_open(BlockDriverState *bs)
1360
{
1361
    BDRVRawState *s = bs->opaque;
1362
    int last_media_present;
1363

    
1364
    if (s->type != FTYPE_FD)
1365
        return 0;
1366
    last_media_present = (s->fd >= 0);
1367
    if (s->fd >= 0 &&
1368
        (get_clock() - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
1369
        qemu_close(s->fd);
1370
        s->fd = -1;
1371
#ifdef DEBUG_FLOPPY
1372
        printf("Floppy closed\n");
1373
#endif
1374
    }
1375
    if (s->fd < 0) {
1376
        if (s->fd_got_error &&
1377
            (get_clock() - s->fd_error_time) < FD_OPEN_TIMEOUT) {
1378
#ifdef DEBUG_FLOPPY
1379
            printf("No floppy (open delayed)\n");
1380
#endif
1381
            return -EIO;
1382
        }
1383
        s->fd = qemu_open(bs->filename, s->open_flags & ~O_NONBLOCK);
1384
        if (s->fd < 0) {
1385
            s->fd_error_time = get_clock();
1386
            s->fd_got_error = 1;
1387
            if (last_media_present)
1388
                s->fd_media_changed = 1;
1389
#ifdef DEBUG_FLOPPY
1390
            printf("No floppy\n");
1391
#endif
1392
            return -EIO;
1393
        }
1394
#ifdef DEBUG_FLOPPY
1395
        printf("Floppy opened\n");
1396
#endif
1397
    }
1398
    if (!last_media_present)
1399
        s->fd_media_changed = 1;
1400
    s->fd_open_time = get_clock();
1401
    s->fd_got_error = 0;
1402
    return 0;
1403
}
1404

    
1405
/*
 * Issue ioctl 'req' with argument 'buf' on the descriptor held in the
 * driver state and return ioctl()'s result unchanged.
 */
static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BDRVRawState *state = bs->opaque;
    int rc;

    rc = ioctl(state->fd, req, buf);
    return rc;
}
1411

    
1412
static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
1413
        unsigned long int req, void *buf,
1414
        BlockDriverCompletionFunc *cb, void *opaque)
1415
{
1416
    BDRVRawState *s = bs->opaque;
1417
    RawPosixAIOData *acb;
1418
    ThreadPool *pool;
1419

    
1420
    if (fd_open(bs) < 0)
1421
        return NULL;
1422

    
1423
    acb = g_slice_new(RawPosixAIOData);
1424
    acb->bs = bs;
1425
    acb->aio_type = QEMU_AIO_IOCTL;
1426
    acb->aio_fildes = s->fd;
1427
    acb->aio_offset = 0;
1428
    acb->aio_ioctl_buf = buf;
1429
    acb->aio_ioctl_cmd = req;
1430
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1431
    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
1432
}
1433

    
1434
#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
1435
/*
 * Sanity check used by the I/O operations: returns 0 when s->fd holds a
 * non-negative descriptor and -EIO otherwise.
 */
static int fd_open(BlockDriverState *bs)
{
    BDRVRawState *state = bs->opaque;

    return (state->fd >= 0) ? 0 : -EIO;
}
1444
#else /* !linux && !FreeBSD */
1445

    
1446
/* On hosts other than Linux and FreeBSD there is nothing to do: always 0. */
static int fd_open(BlockDriverState *bs)
{
    const int ok = 0;

    return ok;
}
1450

    
1451
#endif /* !linux && !FreeBSD */
1452

    
1453
/*
 * Submit a discard for 'nb_sectors' sectors starting at 'sector_num', with
 * the request type QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV.
 *
 * Returns NULL if fd_open() fails, otherwise the result of paio_submit().
 */
static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors,
    BlockDriverCompletionFunc *cb, void *opaque)
{
    BDRVRawState *state = bs->opaque;
    int rc = fd_open(bs);

    if (rc >= 0) {
        return paio_submit(bs, state->fd, sector_num, NULL, nb_sectors,
                           cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
    }
    return NULL;
}
1465

    
1466
/*
 * "Create" an image on an existing host device: nothing is written, the
 * function only verifies that 'filename' is a block or character device
 * large enough for the size requested through BLOCK_OPT_SIZE (rounded
 * down to whole sectors).
 *
 * Returns 0 on success, -ENODEV if the file is not a device, -ENOSPC if
 * the device is too small, or -errno if open/fstat/lseek fails.
 */
static int hdev_create(const char *filename, QEMUOptionParameter *options)
{
    int fd;
    int ret = 0;
    struct stat stat_buf;
    int64_t total_size = 0;
    off_t dev_size;

    /* Read out options; same option name as raw_create_options declares */
    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
            total_size = options->value.n / BDRV_SECTOR_SIZE;
        }
        options++;
    }

    fd = qemu_open(filename, O_WRONLY | O_BINARY);
    if (fd < 0) {
        return -errno;
    }

    if (fstat(fd, &stat_buf) < 0) {
        ret = -errno;
    } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) {
        ret = -ENODEV;
    } else {
        dev_size = lseek(fd, 0, SEEK_END);
        if (dev_size < 0) {
            /* Report the real seek error instead of "no space" */
            ret = -errno;
        } else if (dev_size < total_size * BDRV_SECTOR_SIZE) {
            ret = -ENOSPC;
        }
    }

    qemu_close(fd);
    return ret;
}
1495

    
1496
/* Host devices never report zero-initialised contents: always returns 0. */
static int hdev_has_zero_init(BlockDriverState *bs)
{
    const int zero_init = 0;

    return zero_init;
}
1500

    
1501
static BlockDriver bdrv_host_device = {
1502
    .format_name        = "host_device",
1503
    .protocol_name        = "host_device",
1504
    .instance_size      = sizeof(BDRVRawState),
1505
    .bdrv_probe_device  = hdev_probe_device,
1506
    .bdrv_file_open     = hdev_open,
1507
    .bdrv_close         = raw_close,
1508
    .bdrv_reopen_prepare = raw_reopen_prepare,
1509
    .bdrv_reopen_commit  = raw_reopen_commit,
1510
    .bdrv_reopen_abort   = raw_reopen_abort,
1511
    .bdrv_create        = hdev_create,
1512
    .create_options     = raw_create_options,
1513
    .bdrv_has_zero_init = hdev_has_zero_init,
1514

    
1515
    .bdrv_aio_readv        = raw_aio_readv,
1516
    .bdrv_aio_writev        = raw_aio_writev,
1517
    .bdrv_aio_flush        = raw_aio_flush,
1518
    .bdrv_aio_discard   = hdev_aio_discard,
1519

    
1520
    .bdrv_truncate      = raw_truncate,
1521
    .bdrv_getlength        = raw_getlength,
1522
    .bdrv_get_allocated_file_size
1523
                        = raw_get_allocated_file_size,
1524

    
1525
    /* generic scsi device */
1526
#ifdef __linux__
1527
    .bdrv_ioctl         = hdev_ioctl,
1528
    .bdrv_aio_ioctl     = hdev_aio_ioctl,
1529
#endif
1530
};
1531

    
1532
#ifdef __linux__
1533
/*
 * Open a host floppy drive.  The device is opened once through
 * raw_open_common() and then closed again; fd_open() reopens it on demand.
 * The media-changed flag starts out set.
 *
 * Returns 0 on success or the non-zero result of raw_open_common().
 */
static int floppy_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVRawState *state = bs->opaque;
    int rc;

    state->type = FTYPE_FD;

    /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */
    rc = raw_open_common(bs, filename, flags, O_NONBLOCK);
    if (rc != 0) {
        return rc;
    }

    /* close fd so that we can reopen it as needed */
    qemu_close(state->fd);
    state->fd = -1;
    state->fd_media_changed = 1;

    return 0;
}
1552

    
1553
static int floppy_probe_device(const char *filename)
1554
{
1555
    int fd, ret;
1556
    int prio = 0;
1557
    struct floppy_struct fdparam;
1558
    struct stat st;
1559

    
1560
    if (strstart(filename, "/dev/fd", NULL) &&
1561
        !strstart(filename, "/dev/fdset/", NULL)) {
1562
        prio = 50;
1563
    }
1564

    
1565
    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
1566
    if (fd < 0) {
1567
        goto out;
1568
    }
1569
    ret = fstat(fd, &st);
1570
    if (ret == -1 || !S_ISBLK(st.st_mode)) {
1571
        goto outc;
1572
    }
1573

    
1574
    /* Attempt to detect via a floppy specific ioctl */
1575
    ret = ioctl(fd, FDGETPRM, &fdparam);
1576
    if (ret >= 0)
1577
        prio = 100;
1578

    
1579
outc:
1580
    qemu_close(fd);
1581
out:
1582
    return prio;
1583
}
1584

    
1585

    
1586
/* Returns 1 if fd_open() succeeds for the floppy, 0 otherwise. */
static int floppy_is_inserted(BlockDriverState *bs)
{
    if (fd_open(bs) < 0) {
        return 0;
    }
    return 1;
}
1590

    
1591
/*
 * Return the media-changed flag recorded in the driver state and clear it.
 * fd_open() is called first so that the flag is refreshed.
 */
static int floppy_media_changed(BlockDriverState *bs)
{
    BDRVRawState *state = bs->opaque;
    int changed;

    /*
     * XXX: we do not have a true media changed indication.
     * It does not work if the floppy is changed without trying to read it.
     */
    fd_open(bs);

    changed = state->fd_media_changed;
    state->fd_media_changed = 0;
#ifdef DEBUG_FLOPPY
    printf("Floppy changed=%d\n", changed);
#endif
    return changed;
}
1608

    
1609
/*
 * Eject the floppy: close any descriptor held in the driver state, then
 * open the device non-blocking just long enough to issue FDEJECT.
 * 'eject_flag' is not consulted.  An ioctl failure is reported via perror().
 */
static void floppy_eject(BlockDriverState *bs, bool eject_flag)
{
    BDRVRawState *state = bs->opaque;
    int tmp_fd;

    if (state->fd >= 0) {
        qemu_close(state->fd);
        state->fd = -1;
    }

    tmp_fd = qemu_open(bs->filename, state->open_flags | O_NONBLOCK);
    if (tmp_fd < 0) {
        return;
    }

    if (ioctl(tmp_fd, FDEJECT, 0) < 0) {
        perror("FDEJECT");
    }
    qemu_close(tmp_fd);
}
1625

    
1626
static BlockDriver bdrv_host_floppy = {
1627
    .format_name        = "host_floppy",
1628
    .protocol_name      = "host_floppy",
1629
    .instance_size      = sizeof(BDRVRawState),
1630
    .bdrv_probe_device        = floppy_probe_device,
1631
    .bdrv_file_open     = floppy_open,
1632
    .bdrv_close         = raw_close,
1633
    .bdrv_reopen_prepare = raw_reopen_prepare,
1634
    .bdrv_reopen_commit  = raw_reopen_commit,
1635
    .bdrv_reopen_abort   = raw_reopen_abort,
1636
    .bdrv_create        = hdev_create,
1637
    .create_options     = raw_create_options,
1638
    .bdrv_has_zero_init = hdev_has_zero_init,
1639

    
1640
    .bdrv_aio_readv     = raw_aio_readv,
1641
    .bdrv_aio_writev    = raw_aio_writev,
1642
    .bdrv_aio_flush        = raw_aio_flush,
1643

    
1644
    .bdrv_truncate      = raw_truncate,
1645
    .bdrv_getlength        = raw_getlength,
1646
    .bdrv_get_allocated_file_size
1647
                        = raw_get_allocated_file_size,
1648

    
1649
    /* removable device support */
1650
    .bdrv_is_inserted   = floppy_is_inserted,
1651
    .bdrv_media_changed = floppy_media_changed,
1652
    .bdrv_eject         = floppy_eject,
1653
};
1654

    
1655
/*
 * Open a host CD-ROM drive: marks the state as FTYPE_CD and returns the
 * result of raw_open_common() with O_NONBLOCK added.
 */
static int cdrom_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVRawState *state = bs->opaque;
    int rc;

    state->type = FTYPE_CD;

    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
    rc = raw_open_common(bs, filename, flags, O_NONBLOCK);
    return rc;
}
1664

    
1665
static int cdrom_probe_device(const char *filename)
1666
{
1667
    int fd, ret;
1668
    int prio = 0;
1669
    struct stat st;
1670

    
1671
    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
1672
    if (fd < 0) {
1673
        goto out;
1674
    }
1675
    ret = fstat(fd, &st);
1676
    if (ret == -1 || !S_ISBLK(st.st_mode)) {
1677
        goto outc;
1678
    }
1679

    
1680
    /* Attempt to detect via a CDROM specific ioctl */
1681
    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
1682
    if (ret >= 0)
1683
        prio = 100;
1684

    
1685
outc:
1686
    qemu_close(fd);
1687
out:
1688
    return prio;
1689
}
1690

    
1691
/*
 * Returns 1 if the CDROM_DRIVE_STATUS ioctl reports CDS_DISC_OK,
 * 0 for any other result (including ioctl failure).
 */
static int cdrom_is_inserted(BlockDriverState *bs)
{
    BDRVRawState *state = bs->opaque;
    int drive_status = ioctl(state->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);

    return (drive_status == CDS_DISC_OK) ? 1 : 0;
}
1701

    
1702
/*
 * Open the tray (eject_flag true, CDROMEJECT) or close it (eject_flag
 * false, CDROMCLOSETRAY).  A failing ioctl is reported through perror()
 * under the name of the request that actually failed.
 */
static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
{
    BDRVRawState *s = bs->opaque;

    if (eject_flag) {
        if (ioctl(s->fd, CDROMEJECT, NULL) < 0) {
            perror("CDROMEJECT");
        }
    } else {
        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) {
            /* Was mislabelled "CDROMEJECT", hiding which request failed */
            perror("CDROMCLOSETRAY");
        }
    }
}
1714

    
1715
/*
 * Lock or unlock the drive door via CDROM_LOCKDOOR.  The ioctl result is
 * deliberately ignored.
 */
static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
{
    BDRVRawState *state = bs->opaque;
    int rc;

    rc = ioctl(state->fd, CDROM_LOCKDOOR, locked);
    /*
     * Note: an error can happen if the distribution automatically
     * mounts the CD-ROM
     */
    /* perror("CDROM_LOCKDOOR"); */
    (void) rc;
}
1727

    
1728
static BlockDriver bdrv_host_cdrom = {
1729
    .format_name        = "host_cdrom",
1730
    .protocol_name      = "host_cdrom",
1731
    .instance_size      = sizeof(BDRVRawState),
1732
    .bdrv_probe_device        = cdrom_probe_device,
1733
    .bdrv_file_open     = cdrom_open,
1734
    .bdrv_close         = raw_close,
1735
    .bdrv_reopen_prepare = raw_reopen_prepare,
1736
    .bdrv_reopen_commit  = raw_reopen_commit,
1737
    .bdrv_reopen_abort   = raw_reopen_abort,
1738
    .bdrv_create        = hdev_create,
1739
    .create_options     = raw_create_options,
1740
    .bdrv_has_zero_init = hdev_has_zero_init,
1741

    
1742
    .bdrv_aio_readv     = raw_aio_readv,
1743
    .bdrv_aio_writev    = raw_aio_writev,
1744
    .bdrv_aio_flush        = raw_aio_flush,
1745

    
1746
    .bdrv_truncate      = raw_truncate,
1747
    .bdrv_getlength     = raw_getlength,
1748
    .bdrv_get_allocated_file_size
1749
                        = raw_get_allocated_file_size,
1750

    
1751
    /* removable device support */
1752
    .bdrv_is_inserted   = cdrom_is_inserted,
1753
    .bdrv_eject         = cdrom_eject,
1754
    .bdrv_lock_medium   = cdrom_lock_medium,
1755

    
1756
    /* generic scsi device */
1757
    .bdrv_ioctl         = hdev_ioctl,
1758
    .bdrv_aio_ioctl     = hdev_aio_ioctl,
1759
};
1760
#endif /* __linux__ */
1761

    
1762
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
1763
/*
 * Open a host CD-ROM drive (FreeBSD): marks the state as FTYPE_CD, opens
 * the device through raw_open_common() and issues CDIOCALLOW.
 *
 * Returns 0 on success or the non-zero result of raw_open_common().
 */
static int cdrom_open(BlockDriverState *bs, const char *filename, int flags)
{
    BDRVRawState *state = bs->opaque;
    int rc;

    state->type = FTYPE_CD;

    rc = raw_open_common(bs, filename, flags, 0);
    if (rc != 0) {
        return rc;
    }

    /* make sure the door isn't locked at this time */
    ioctl(state->fd, CDIOCALLOW);
    return 0;
}
1778

    
1779
/*
 * Score how well the CD-ROM driver matches 'filename' (FreeBSD): 100 if
 * the name starts with "/dev/cd" or "/dev/acd", 0 otherwise.
 */
static int cdrom_probe_device(const char *filename)
{
    if (strstart(filename, "/dev/cd", NULL)) {
        return 100;
    }
    if (strstart(filename, "/dev/acd", NULL)) {
        return 100;
    }
    return 0;
}
1786

    
1787
/*
 * Close and reopen the CD-ROM device, then issue CDIOCALLOW.
 *
 * Returns 0 on success; on open failure s->fd is set to -1 and -EIO is
 * returned.
 */
static int cdrom_reopen(BlockDriverState *bs)
{
    BDRVRawState *state = bs->opaque;

    /*
     * Force reread of possibly changed/newly loaded disc,
     * FreeBSD seems to not notice sometimes...
     */
    if (state->fd >= 0) {
        qemu_close(state->fd);
    }

    state->fd = qemu_open(bs->filename, state->open_flags, 0644);
    if (state->fd < 0) {
        state->fd = -1;
        return -EIO;
    }

    /* make sure the door isn't locked at this time */
    ioctl(state->fd, CDIOCALLOW);
    return 0;
}
1809

    
1810
/* Returns 1 if raw_getlength() reports a length greater than zero, else 0. */
static int cdrom_is_inserted(BlockDriverState *bs)
{
    if (raw_getlength(bs) > 0) {
        return 1;
    }
    return 0;
}
1814

    
1815
/*
 * Open the tray (eject_flag true, CDIOCEJECT) or close it (eject_flag
 * false, CDIOCCLOSE) after issuing CDIOCALLOW, then reopen the device.
 * Does nothing when no descriptor is open.  A failing eject/close ioctl is
 * reported through perror().
 */
static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
{
    BDRVRawState *state = bs->opaque;

    if (state->fd >= 0) {
        (void) ioctl(state->fd, CDIOCALLOW);

        if (!eject_flag) {
            if (ioctl(state->fd, CDIOCCLOSE) < 0) {
                perror("CDIOCCLOSE");
            }
        } else {
            if (ioctl(state->fd, CDIOCEJECT) < 0) {
                perror("CDIOCEJECT");
            }
        }

        cdrom_reopen(bs);
    }
}
1834

    
1835
/*
 * Lock (CDIOCPREVENT) or unlock (CDIOCALLOW) the drive door.  Does nothing
 * when no descriptor is open; the ioctl result is deliberately ignored.
 */
static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
{
    BDRVRawState *state = bs->opaque;
    int rc;

    if (state->fd < 0) {
        return;
    }

    rc = ioctl(state->fd, (locked ? CDIOCPREVENT : CDIOCALLOW));
    /*
     * Note: an error can happen if the distribution automatically
     * mounts the CD-ROM
     */
    /* perror("CDROM_LOCKDOOR"); */
    (void) rc;
}
1849

    
1850
static BlockDriver bdrv_host_cdrom = {
1851
    .format_name        = "host_cdrom",
1852
    .protocol_name      = "host_cdrom",
1853
    .instance_size      = sizeof(BDRVRawState),
1854
    .bdrv_probe_device        = cdrom_probe_device,
1855
    .bdrv_file_open     = cdrom_open,
1856
    .bdrv_close         = raw_close,
1857
    .bdrv_reopen_prepare = raw_reopen_prepare,
1858
    .bdrv_reopen_commit  = raw_reopen_commit,
1859
    .bdrv_reopen_abort   = raw_reopen_abort,
1860
    .bdrv_create        = hdev_create,
1861
    .create_options     = raw_create_options,
1862
    .bdrv_has_zero_init = hdev_has_zero_init,
1863

    
1864
    .bdrv_aio_readv     = raw_aio_readv,
1865
    .bdrv_aio_writev    = raw_aio_writev,
1866
    .bdrv_aio_flush        = raw_aio_flush,
1867

    
1868
    .bdrv_truncate      = raw_truncate,
1869
    .bdrv_getlength     = raw_getlength,
1870
    .bdrv_get_allocated_file_size
1871
                        = raw_get_allocated_file_size,
1872

    
1873
    /* removable device support */
1874
    .bdrv_is_inserted   = cdrom_is_inserted,
1875
    .bdrv_eject         = cdrom_eject,
1876
    .bdrv_lock_medium   = cdrom_lock_medium,
1877
};
1878
#endif /* __FreeBSD__ */
1879

    
1880
#ifdef CONFIG_LINUX_AIO
1881
/**
1882
 * Return the file descriptor for Linux AIO
1883
 *
1884
 * This function is a layering violation and should be removed when it becomes
1885
 * possible to call the block layer outside the global mutex.  It allows the
1886
 * caller to hijack the file descriptor so I/O can be performed outside the
1887
 * block layer.
1888
 */
1889
int raw_get_aio_fd(BlockDriverState *bs)
1890
{
1891
    BDRVRawState *s;
1892

    
1893
    if (!bs->drv) {
1894
        return -ENOMEDIUM;
1895
    }
1896

    
1897
    if (bs->drv == bdrv_find_format("raw")) {
1898
        bs = bs->file;
1899
    }
1900

    
1901
    /* raw-posix has several protocols so just check for raw_aio_readv */
1902
    if (bs->drv->bdrv_aio_readv != raw_aio_readv) {
1903
        return -ENOTSUP;
1904
    }
1905

    
1906
    s = bs->opaque;
1907
    if (!s->use_aio) {
1908
        return -ENOTSUP;
1909
    }
1910
    return s->fd;
1911
}
1912
#endif /* CONFIG_LINUX_AIO */
1913

    
1914
static void bdrv_file_init(void)
1915
{
1916
    /*
1917
     * Register all the drivers.  Note that order is important, the driver
1918
     * registered last will get probed first.
1919
     */
1920
    bdrv_register(&bdrv_file);
1921
    bdrv_register(&bdrv_host_device);
1922
#ifdef __linux__
1923
    bdrv_register(&bdrv_host_floppy);
1924
    bdrv_register(&bdrv_host_cdrom);
1925
#endif
1926
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
1927
    bdrv_register(&bdrv_host_cdrom);
1928
#endif
1929
}
1930

    
1931
/* Hand bdrv_file_init to block_init(); presumably this arranges for the
 * drivers above to be registered at startup - confirm in qemu/module.h. */
block_init(bdrv_file_init);