Revision 298800ca

b/Makefile.objs
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
-block-nested-y += qed.o
+block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o

b/block/qed-cluster.c
/*
 * QEMU Enhanced Disk Format Cluster functions
 *
 * Copyright IBM, Corp. 2010
 *
 * Authors:
 *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qed.h"

/**
 * Count the number of contiguous data clusters
 *
 * @s:              QED state
 * @table:          L2 table
 * @index:          First cluster index
 * @n:              Maximum number of clusters
 * @offset:         Set to first cluster offset
 *
 * This function scans tables for contiguous allocated or free clusters.
 */
static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
                                                  QEDTable *table,
                                                  unsigned int index,
                                                  unsigned int n,
                                                  uint64_t *offset)
{
    unsigned int end = MIN(index + n, s->table_nelems);
    uint64_t last = table->offsets[index];
    unsigned int i;

    *offset = last;

    for (i = index + 1; i < end; i++) {
        if (last == 0) {
            /* Counting free clusters */
            if (table->offsets[i] != 0) {
                break;
            }
        } else {
            /* Counting allocated clusters */
            if (table->offsets[i] != last + s->header.cluster_size) {
                break;
            }
            last = table->offsets[i];
        }
    }
    return i - index;
}

typedef struct {
    BDRVQEDState *s;
    uint64_t pos;
    size_t len;

    QEDRequest *request;

    /* User callback */
    QEDFindClusterFunc *cb;
    void *opaque;
} QEDFindClusterCB;

static void qed_find_cluster_cb(void *opaque, int ret)
{
    QEDFindClusterCB *find_cluster_cb = opaque;
    BDRVQEDState *s = find_cluster_cb->s;
    QEDRequest *request = find_cluster_cb->request;
    uint64_t offset = 0;
    size_t len = 0;
    unsigned int index;
    unsigned int n;

    if (ret) {
        goto out;
    }

    index = qed_l2_index(s, find_cluster_cb->pos);
    n = qed_bytes_to_clusters(s,
                              qed_offset_into_cluster(s, find_cluster_cb->pos) +
                              find_cluster_cb->len);
    n = qed_count_contiguous_clusters(s, request->l2_table->table,
                                      index, n, &offset);

    ret = offset ? QED_CLUSTER_FOUND : QED_CLUSTER_L2;
    len = MIN(find_cluster_cb->len, n * s->header.cluster_size -
              qed_offset_into_cluster(s, find_cluster_cb->pos));

    if (offset && !qed_check_cluster_offset(s, offset)) {
        ret = -EINVAL;
    }

out:
    find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
    qemu_free(find_cluster_cb);
}

/**
 * Find the offset of a data cluster
 *
 * @s:          QED state
 * @request:    L2 cache entry
 * @pos:        Byte position in device
 * @len:        Number of bytes
 * @cb:         Completion function
 * @opaque:     User data for completion function
 *
 * This function translates a position in the block device to an offset in the
 * image file.  It invokes the cb completion callback to report back the
 * translated offset or unallocated range in the image file.
 *
 * If the L2 table exists, request->l2_table points to the L2 table cache entry
 * and the caller must free the reference when they are finished.  The cache
 * entry is exposed in this way to avoid callers having to read the L2 table
 * again later during request processing.  If request->l2_table is non-NULL it
 * will be unreferenced before taking on the new cache entry.
 */
void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
                      size_t len, QEDFindClusterFunc *cb, void *opaque)
{
    QEDFindClusterCB *find_cluster_cb;
    uint64_t l2_offset;

    /* Limit length to L2 boundary.  Requests are broken up at the L2 boundary
     * so that a request acts on one L2 table at a time.
     */
    len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos);

    l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)];
    if (!l2_offset) {
        cb(opaque, QED_CLUSTER_L1, 0, len);
        return;
    }
    if (!qed_check_table_offset(s, l2_offset)) {
        cb(opaque, -EINVAL, 0, 0);
        return;
    }

    find_cluster_cb = qemu_malloc(sizeof(*find_cluster_cb));
    find_cluster_cb->s = s;
    find_cluster_cb->pos = pos;
    find_cluster_cb->len = len;
    find_cluster_cb->cb = cb;
    find_cluster_cb->opaque = opaque;
    find_cluster_cb->request = request;

    qed_read_l2_table(s, request, l2_offset,
                      qed_find_cluster_cb, find_cluster_cb);
}
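
As a concrete illustration of the contiguity rule above (not part of this revision): assume a 64 KiB cluster size, which is a field of the image header rather than anything fixed by this patch. The stand-alone sketch below applies the same run-counting rule as qed_count_contiguous_clusters() to a made-up offsets array; qed_find_cluster_cb() then clamps the request length to that run, so a single lookup never spans a discontinuity.

    /* Stand-alone sketch of the run-counting rule in qed_count_contiguous_clusters().
     * The 64 KiB cluster size and the offsets[] values are made-up example data. */
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint64_t cluster_size = 0x10000;   /* assumed 64 KiB cluster size */
        const uint64_t offsets[] = { 0x10000, 0x20000, 0x30000, 0, 0x90000 };
        unsigned int i, n = sizeof(offsets) / sizeof(offsets[0]);
        uint64_t last = offsets[0];

        /* Allocated clusters are contiguous when each offset follows the previous
         * one by exactly cluster_size; the zero entry ends the run. */
        for (i = 1; i < n; i++) {
            if (offsets[i] != last + cluster_size) {
                break;
            }
            last = offsets[i];
        }
        /* Prints: 3 contiguous clusters starting at 0x10000 */
        printf("%u contiguous clusters starting at %#" PRIx64 "\n", i, offsets[0]);
        return 0;
    }
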
b/block/qed-gencb.c
/*
 * QEMU Enhanced Disk Format
 *
 * Copyright IBM, Corp. 2010
 *
 * Authors:
 *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qed.h"

void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque)
{
    GenericCB *gencb = qemu_malloc(len);
    gencb->cb = cb;
    gencb->opaque = opaque;
    return gencb;
}

void gencb_complete(void *opaque, int ret)
{
    GenericCB *gencb = opaque;
    BlockDriverCompletionFunc *cb = gencb->cb;
    void *user_opaque = gencb->opaque;

    qemu_free(gencb);
    cb(user_opaque, ret);
}
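
These two helpers let per-operation state piggyback on a user callback. The pattern, used by QEDReadTableCB and QEDWriteTableCB in qed-table.c further down, is to embed GenericCB as the first member of a larger struct. A minimal sketch with a hypothetical MyOpCB (assuming qed.h is included; MyOpCB, my_state and the function names are illustrative, not from this revision):

    typedef struct {
        GenericCB gencb;       /* must be the first member so the two pointers coincide */
        int my_state;          /* hypothetical per-operation state */
    } MyOpCB;

    static void my_op_done(void *opaque, int ret)
    {
        MyOpCB *op = opaque;
        /* ... consume op->my_state ... */
        gencb_complete(&op->gencb, ret);   /* frees op, then invokes the user callback */
    }

    static void my_op_start(BlockDriverCompletionFunc *cb, void *opaque)
    {
        MyOpCB *op = gencb_alloc(sizeof(*op), cb, opaque);
        op->my_state = 0;
        /* ... start async I/O whose completion eventually calls my_op_done(op, ret) ... */
    }
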
b/block/qed-l2-cache.c
/*
 * QEMU Enhanced Disk Format L2 Cache
 *
 * Copyright IBM, Corp. 2010
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

/*
 * L2 table cache usage is as follows:
 *
 * An open image has one L2 table cache that is used to avoid accessing the
 * image file for recently referenced L2 tables.
 *
 * Cluster offset lookup translates the logical offset within the block device
 * to a cluster offset within the image file.  This is done by indexing into
 * the L1 and L2 tables which store cluster offsets.  It is here where the L2
 * table cache serves up recently referenced L2 tables.
 *
 * If there is a cache miss, that L2 table is read from the image file and
 * committed to the cache.  Subsequent accesses to that L2 table will be served
 * from the cache until the table is evicted from the cache.
 *
 * L2 tables are also committed to the cache when new L2 tables are allocated
 * in the image file.  Since the L2 table cache is write-through, the new L2
 * table is first written out to the image file and then committed to the
 * cache.
 *
 * Multiple I/O requests may be using an L2 table cache entry at any given
 * time.  That means an entry may be in use across several requests and
 * reference counting is needed to free the entry at the correct time.  In
 * particular, an entry evicted from the cache will only be freed once all
 * references are dropped.
 *
 * An in-flight I/O request will hold a reference to an L2 table cache entry
 * for the period during which it needs to access the L2 table.  This includes
 * cluster offset lookup, L2 table allocation, and L2 table update when a new
 * data cluster has been allocated.
 *
 * An interesting case occurs when two requests need to access an L2 table that
 * is not in the cache.  Since the operation to read the table from the image
 * file takes some time to complete, both requests may see a cache miss and
 * start reading the L2 table from the image file.  The first to finish will
 * commit its L2 table into the cache.  When the second tries to commit, its
 * table will be deleted in favor of the existing cache entry.
 */

#include "trace.h"
#include "qed.h"

/* Each L2 table holds 2GB so this lets us fully cache a 100GB disk */
#define MAX_L2_CACHE_SIZE 50

/**
 * Initialize the L2 cache
 */
void qed_init_l2_cache(L2TableCache *l2_cache)
{
    QTAILQ_INIT(&l2_cache->entries);
    l2_cache->n_entries = 0;
}

/**
 * Free the L2 cache
 */
void qed_free_l2_cache(L2TableCache *l2_cache)
{
    CachedL2Table *entry, *next_entry;

    QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) {
        qemu_vfree(entry->table);
        qemu_free(entry);
    }
}

/**
 * Allocate an uninitialized entry from the cache
 *
 * The returned entry has a reference count of 1 and is owned by the caller.
 * The caller must allocate the actual table field for this entry and it must
 * be freeable using qemu_vfree().
 */
CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache)
{
    CachedL2Table *entry;

    entry = qemu_mallocz(sizeof(*entry));
    entry->ref++;

    trace_qed_alloc_l2_cache_entry(l2_cache, entry);

    return entry;
}

/**
 * Decrease an entry's reference count and free it when the count drops to
 * zero.
 */
void qed_unref_l2_cache_entry(CachedL2Table *entry)
{
    if (!entry) {
        return;
    }

    entry->ref--;
    trace_qed_unref_l2_cache_entry(entry, entry->ref);
    if (entry->ref == 0) {
        qemu_vfree(entry->table);
        qemu_free(entry);
    }
}

/**
 * Find an entry in the L2 cache.  This may return NULL and it's up to the
 * caller to satisfy the cache miss.
 *
 * For a cached entry, this function increases the reference count and returns
 * the entry.
 */
CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset)
{
    CachedL2Table *entry;

    QTAILQ_FOREACH(entry, &l2_cache->entries, node) {
        if (entry->offset == offset) {
            trace_qed_find_l2_cache_entry(l2_cache, entry, offset, entry->ref);
            entry->ref++;
            return entry;
        }
    }
    return NULL;
}

/**
 * Commit an L2 cache entry into the cache.  This is meant to be used as part
 * of the process to satisfy a cache miss.  A caller allocates an entry that is
 * not yet in the L2 cache and, once the entry is valid and present on disk,
 * commits it into the cache.
 *
 * Since the cache is write-through, it's important that this function is not
 * called until the entry is present on disk and the L1 has been updated to
 * point to the entry.
 *
 * N.B. This function steals a reference to the l2_table from the caller so the
 * caller must obtain a new reference by issuing a call to
 * qed_find_l2_cache_entry().
 */
void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table)
{
    CachedL2Table *entry;

    entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset);
    if (entry) {
        qed_unref_l2_cache_entry(entry);
        qed_unref_l2_cache_entry(l2_table);
        return;
    }

    if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) {
        entry = QTAILQ_FIRST(&l2_cache->entries);
        QTAILQ_REMOVE(&l2_cache->entries, entry, node);
        l2_cache->n_entries--;
        qed_unref_l2_cache_entry(entry);
    }

    l2_cache->n_entries++;
    QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node);
}
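
The lookup/commit flow described in the header comment above is implemented by qed_read_l2_table() and its completion callback in qed-table.c below. For orientation, here is a condensed sketch of just the reference handling, with the asynchronous read elided and error handling omitted (illustrative only, assuming qed.h is included; it is not part of this revision):

    static void l2_lookup_sketch(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
    {
        /* Cache hit: qed_find_l2_cache_entry() already took a reference for us */
        request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
        if (request->l2_table) {
            return;
        }

        /* Cache miss: build a fresh entry and populate it from the image file */
        request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
        request->l2_table->table = qed_alloc_table(s);
        /* ... asynchronously read the L2 table at 'offset' into it ... */

        /* Completion path: committing steals our reference.  If another request
         * committed the same table first, ours is dropped, so look it up again to
         * hold a reference to whichever copy actually lives in the cache. */
        request->l2_table->offset = offset;
        qed_commit_l2_cache_entry(&s->l2_cache, request->l2_table);
        request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
    }
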
b/block/qed-table.c
/*
 * QEMU Enhanced Disk Format Table I/O
 *
 * Copyright IBM, Corp. 2010
 *
 * Authors:
 *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "trace.h"
#include "qemu_socket.h" /* for EINPROGRESS on Windows */
#include "qed.h"

typedef struct {
    GenericCB gencb;
    BDRVQEDState *s;
    QEDTable *table;

    struct iovec iov;
    QEMUIOVector qiov;
} QEDReadTableCB;

static void qed_read_table_cb(void *opaque, int ret)
{
    QEDReadTableCB *read_table_cb = opaque;
    QEDTable *table = read_table_cb->table;
    int noffsets = read_table_cb->iov.iov_len / sizeof(uint64_t);
    int i;

    /* Handle I/O error */
    if (ret) {
        goto out;
    }

    /* Byteswap offsets */
    for (i = 0; i < noffsets; i++) {
        table->offsets[i] = le64_to_cpu(table->offsets[i]);
    }

out:
    /* Completion */
    trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret);
    gencb_complete(&read_table_cb->gencb, ret);
}

static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
                           BlockDriverCompletionFunc *cb, void *opaque)
{
    QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb),
                                                cb, opaque);
    QEMUIOVector *qiov = &read_table_cb->qiov;
    BlockDriverAIOCB *aiocb;

    trace_qed_read_table(s, offset, table);

    read_table_cb->s = s;
    read_table_cb->table = table;
    read_table_cb->iov.iov_base = table->offsets;
    read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size;

    qemu_iovec_init_external(qiov, &read_table_cb->iov, 1);
    aiocb = bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov,
                           read_table_cb->iov.iov_len / BDRV_SECTOR_SIZE,
                           qed_read_table_cb, read_table_cb);
    if (!aiocb) {
        qed_read_table_cb(read_table_cb, -EIO);
    }
}

typedef struct {
    GenericCB gencb;
    BDRVQEDState *s;
    QEDTable *orig_table;
    QEDTable *table;
    bool flush;             /* flush after write? */

    struct iovec iov;
    QEMUIOVector qiov;
} QEDWriteTableCB;

static void qed_write_table_cb(void *opaque, int ret)
{
    QEDWriteTableCB *write_table_cb = opaque;

    trace_qed_write_table_cb(write_table_cb->s,
                             write_table_cb->orig_table,
                             write_table_cb->flush,
                             ret);

    if (ret) {
        goto out;
    }

    if (write_table_cb->flush) {
        /* We still need to flush first */
        write_table_cb->flush = false;
        bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb,
                       write_table_cb);
        return;
    }

out:
    qemu_vfree(write_table_cb->table);
    gencb_complete(&write_table_cb->gencb, ret);
}

/**
 * Write out an updated part or all of a table
 *
 * @s:          QED state
 * @offset:     Offset of table in image file, in bytes
 * @table:      Table
 * @index:      Index of first element
 * @n:          Number of elements
 * @flush:      Whether or not to sync to disk
 * @cb:         Completion function
 * @opaque:     Argument for completion function
 */
static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
                            unsigned int index, unsigned int n, bool flush,
                            BlockDriverCompletionFunc *cb, void *opaque)
{
    QEDWriteTableCB *write_table_cb;
    BlockDriverAIOCB *aiocb;
    unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
    unsigned int start, end, i;
    size_t len_bytes;

    trace_qed_write_table(s, offset, table, index, n);

    /* Calculate indices of the first and one after last elements */
    start = index & ~sector_mask;
    end = (index + n + sector_mask) & ~sector_mask;

    len_bytes = (end - start) * sizeof(uint64_t);

    write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque);
    write_table_cb->s = s;
    write_table_cb->orig_table = table;
    write_table_cb->flush = flush;
    write_table_cb->table = qemu_blockalign(s->bs, len_bytes);
    write_table_cb->iov.iov_base = write_table_cb->table->offsets;
    write_table_cb->iov.iov_len = len_bytes;
    qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1);

    /* Byteswap table */
    for (i = start; i < end; i++) {
        uint64_t le_offset = cpu_to_le64(table->offsets[i]);
        write_table_cb->table->offsets[i - start] = le_offset;
    }

    /* Adjust for offset into table */
    offset += start * sizeof(uint64_t);

    aiocb = bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
                            &write_table_cb->qiov,
                            write_table_cb->iov.iov_len / BDRV_SECTOR_SIZE,
                            qed_write_table_cb, write_table_cb);
    if (!aiocb) {
        qed_write_table_cb(write_table_cb, -EIO);
    }
}

/**
 * Propagate return value from async callback
 */
static void qed_sync_cb(void *opaque, int ret)
{
    *(int *)opaque = ret;
}

int qed_read_l1_table_sync(BDRVQEDState *s)
{
    int ret = -EINPROGRESS;

    async_context_push();

    qed_read_table(s, s->header.l1_table_offset,
                   s->l1_table, qed_sync_cb, &ret);
    while (ret == -EINPROGRESS) {
        qemu_aio_wait();
    }

    async_context_pop();

    return ret;
}

void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
                        BlockDriverCompletionFunc *cb, void *opaque)
{
    BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
    qed_write_table(s, s->header.l1_table_offset,
                    s->l1_table, index, n, false, cb, opaque);
}

int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
                            unsigned int n)
{
    int ret = -EINPROGRESS;

    async_context_push();

    qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
    while (ret == -EINPROGRESS) {
        qemu_aio_wait();
    }

    async_context_pop();

    return ret;
}

typedef struct {
    GenericCB gencb;
    BDRVQEDState *s;
    uint64_t l2_offset;
    QEDRequest *request;
} QEDReadL2TableCB;

static void qed_read_l2_table_cb(void *opaque, int ret)
{
    QEDReadL2TableCB *read_l2_table_cb = opaque;
    QEDRequest *request = read_l2_table_cb->request;
    BDRVQEDState *s = read_l2_table_cb->s;
    CachedL2Table *l2_table = request->l2_table;

    if (ret) {
        /* can't trust loaded L2 table anymore */
        qed_unref_l2_cache_entry(l2_table);
        request->l2_table = NULL;
    } else {
        l2_table->offset = read_l2_table_cb->l2_offset;

        qed_commit_l2_cache_entry(&s->l2_cache, l2_table);

        /* This is guaranteed to succeed because we just committed the entry
         * to the cache.
         */
        request->l2_table = qed_find_l2_cache_entry(&s->l2_cache,
                                                    l2_table->offset);
        assert(request->l2_table != NULL);
    }

    gencb_complete(&read_l2_table_cb->gencb, ret);
}

void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
                       BlockDriverCompletionFunc *cb, void *opaque)
{
    QEDReadL2TableCB *read_l2_table_cb;

    qed_unref_l2_cache_entry(request->l2_table);

    /* Check for cached L2 entry */
    request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
    if (request->l2_table) {
        cb(opaque, 0);
        return;
    }

    request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
    request->l2_table->table = qed_alloc_table(s);

    read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque);
    read_l2_table_cb->s = s;
    read_l2_table_cb->l2_offset = offset;
    read_l2_table_cb->request = request;

    BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD);
    qed_read_table(s, offset, request->l2_table->table,
                   qed_read_l2_table_cb, read_l2_table_cb);
}

int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
{
    int ret = -EINPROGRESS;

    async_context_push();

    qed_read_l2_table(s, request, offset, qed_sync_cb, &ret);
    while (ret == -EINPROGRESS) {
        qemu_aio_wait();
    }

    async_context_pop();
    return ret;
}

void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
                        unsigned int index, unsigned int n, bool flush,
                        BlockDriverCompletionFunc *cb, void *opaque)
{
    BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE);
    qed_write_table(s, request->l2_table->offset,
                    request->l2_table->table, index, n, flush, cb, opaque);
}

int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
                            unsigned int index, unsigned int n, bool flush)
{
    int ret = -EINPROGRESS;

    async_context_push();

    qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
    while (ret == -EINPROGRESS) {
        qemu_aio_wait();
    }

    async_context_pop();
    return ret;
}
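
To make the sector-alignment arithmetic in qed_write_table() concrete: with the usual 512-byte BDRV_SECTOR_SIZE, sector_mask is 512 / 8 - 1 = 63 table entries. Updating, say, n = 3 entries starting at index = 100 (made-up example values) gives start = 100 & ~63 = 64 and end = (100 + 3 + 63) & ~63 = 128, so len_bytes = (128 - 64) * 8 = 512: exactly one sector's worth of entries is byteswapped into the bounce buffer and written, and the file offset is advanced by start * 8 = 512 bytes.
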
b/block/qed.c
     return 0;
 }
 
+QEDTable *qed_alloc_table(BDRVQEDState *s)
+{
+    /* Honor O_DIRECT memory alignment requirements */
+    return qemu_blockalign(s->bs,
+                           s->header.cluster_size * s->header.table_size);
+}
+
 static int bdrv_qed_open(BlockDriverState *bs, int flags)
 {
     BDRVQEDState *s = bs->opaque;
......
         bdrv_flush(bs->file);
     }
 
+    s->l1_table = qed_alloc_table(s);
+    qed_init_l2_cache(&s->l2_cache);
+
+    ret = qed_read_l1_table_sync(s);
+    if (ret) {
+        qed_free_l2_cache(&s->l2_cache);
+        qemu_vfree(s->l1_table);
+    }
     return ret;
 }
 
 static void bdrv_qed_close(BlockDriverState *bs)
 {
+    BDRVQEDState *s = bs->opaque;
+
+    qed_free_l2_cache(&s->l2_cache);
+    qemu_vfree(s->l1_table);
 }
 
 static int bdrv_qed_flush(BlockDriverState *bs)
......
                       backing_file, backing_fmt);
 }
 
+typedef struct {
+    int is_allocated;
+    int *pnum;
+} QEDIsAllocatedCB;
+
+static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
+{
+    QEDIsAllocatedCB *cb = opaque;
+    *cb->pnum = len / BDRV_SECTOR_SIZE;
+    cb->is_allocated = ret == QED_CLUSTER_FOUND;
+}
+
 static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
 {
-    return -ENOTSUP;
+    BDRVQEDState *s = bs->opaque;
+    uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
+    size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
+    QEDIsAllocatedCB cb = {
+        .is_allocated = -1,
+        .pnum = pnum,
+    };
+    QEDRequest request = { .l2_table = NULL };
+
+    async_context_push();
+
+    qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);
+
+    while (cb.is_allocated == -1) {
+        qemu_aio_wait();
+    }
+
+    async_context_pop();
+
+    qed_unref_l2_cache_entry(request.l2_table);
+
+    return cb.is_allocated;
 }
 
 static int bdrv_qed_make_empty(BlockDriverState *bs)
b/block/qed.h
 } QEDHeader;
 
 typedef struct {
+    uint64_t offsets[0];            /* in bytes */
+} QEDTable;
+
+/* The L2 cache is a simple write-through cache for L2 structures */
+typedef struct CachedL2Table {
+    QEDTable *table;
+    uint64_t offset;    /* offset=0 indicates an invalid entry */
+    QTAILQ_ENTRY(CachedL2Table) node;
+    int ref;
+} CachedL2Table;
+
+typedef struct {
+    QTAILQ_HEAD(, CachedL2Table) entries;
+    unsigned int n_entries;
+} L2TableCache;
+
+typedef struct QEDRequest {
+    CachedL2Table *l2_table;
+} QEDRequest;
+
+typedef struct {
     BlockDriverState *bs;           /* device */
     uint64_t file_size;             /* length of image file, in bytes */
 
     QEDHeader header;               /* always cpu-endian */
+    QEDTable *l1_table;
+    L2TableCache l2_cache;          /* l2 table cache */
     uint32_t table_nelems;
     uint32_t l1_shift;
     uint32_t l2_shift;
     uint32_t l2_mask;
 } BDRVQEDState;
 
+enum {
+    QED_CLUSTER_FOUND,         /* cluster found */
+    QED_CLUSTER_L2,            /* cluster missing in L2 */
+    QED_CLUSTER_L1,            /* cluster missing in L1 */
+};
+
+/**
+ * qed_find_cluster() completion callback
+ *
+ * @opaque:     User data for completion callback
+ * @ret:        QED_CLUSTER_FOUND   Success
+ *              QED_CLUSTER_L2      Data cluster unallocated in L2
+ *              QED_CLUSTER_L1      L2 unallocated in L1
+ *              -errno              POSIX error occurred
+ * @offset:     Data cluster offset
+ * @len:        Contiguous bytes starting from cluster offset
+ *
+ * This function is invoked when qed_find_cluster() completes.
+ *
+ * On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range
+ * in the image file.
+ *
+ * On failure ret is QED_CLUSTER_L2 or QED_CLUSTER_L1 for a missing L2 or L1
+ * table offset, respectively.  len is the number of contiguous unallocated
+ * bytes.
+ */
+typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len);
+
+/**
+ * Generic callback for chaining async callbacks
+ */
+typedef struct {
+    BlockDriverCompletionFunc *cb;
+    void *opaque;
+} GenericCB;
+
+void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque);
+void gencb_complete(void *opaque, int ret);
+
+/**
+ * L2 cache functions
+ */
+void qed_init_l2_cache(L2TableCache *l2_cache);
+void qed_free_l2_cache(L2TableCache *l2_cache);
+CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache);
+void qed_unref_l2_cache_entry(CachedL2Table *entry);
+CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset);
+void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table);
+
+/**
+ * Table I/O functions
+ */
+int qed_read_l1_table_sync(BDRVQEDState *s);
+void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
+                        BlockDriverCompletionFunc *cb, void *opaque);
+int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
+                            unsigned int n);
+int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
+                           uint64_t offset);
+void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
+                       BlockDriverCompletionFunc *cb, void *opaque);
+void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
+                        unsigned int index, unsigned int n, bool flush,
+                        BlockDriverCompletionFunc *cb, void *opaque);
+int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
+                            unsigned int index, unsigned int n, bool flush);
+
+/**
+ * Cluster functions
+ */
+void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
+                      size_t len, QEDFindClusterFunc *cb, void *opaque);
+
+/**
+ * Consistency check
+ */
+int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix);
+
+QEDTable *qed_alloc_table(BDRVQEDState *s);
+
 /**
  * Round down to the start of a cluster
  */
......
     return offset & ~(uint64_t)(s->header.cluster_size - 1);
 }
 
+static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset)
+{
+    return offset & (s->header.cluster_size - 1);
+}
+
+static inline unsigned int qed_bytes_to_clusters(BDRVQEDState *s, size_t bytes)
+{
+    return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) /
+           (s->header.cluster_size - 1);
+}
+
+static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos)
+{
+    return pos >> s->l1_shift;
+}
+
+static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos)
+{
+    return (pos >> s->l2_shift) & s->l2_mask;
+}
+
 /**
  * Test if a cluster offset is valid
  */
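
A worked example of the new inline helpers, assuming a 64 KiB cluster size (cluster_size is read from the image header; 64 KiB is only an assumed value here): for a byte position of 0x12345, qed_offset_into_cluster() masks with cluster_size - 1 and returns 0x2345, while qed_bytes_to_clusters() rounds 0x12345 up to the next cluster boundary (0x20000) before dividing, yielding 2 clusters, the same as ceil(0x12345 / 0x10000). qed_l1_index() and qed_l2_index() then select table slots using the image-dependent l1_shift, l2_shift and l2_mask values.
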
b/trace-events
 
 # vl.c
 disable vm_state_notify(int running, int reason) "running %d reason %d"
+
+# block/qed-l2-cache.c
+disable qed_alloc_l2_cache_entry(void *l2_cache, void *entry) "l2_cache %p entry %p"
+disable qed_unref_l2_cache_entry(void *entry, int ref) "entry %p ref %d"
+disable qed_find_l2_cache_entry(void *l2_cache, void *entry, uint64_t offset, int ref) "l2_cache %p entry %p offset %"PRIu64" ref %d"
+
+# block/qed-table.c
+disable qed_read_table(void *s, uint64_t offset, void *table) "s %p offset %"PRIu64" table %p"
+disable qed_read_table_cb(void *s, void *table, int ret) "s %p table %p ret %d"
+disable qed_write_table(void *s, uint64_t offset, void *table, unsigned int index, unsigned int n) "s %p offset %"PRIu64" table %p index %u n %u"
+disable qed_write_table_cb(void *s, void *table, int flush, int ret) "s %p table %p flush %d ret %d"
