Revision 298800ca
b/Makefile.objs | ||
---|---|---|
20 | 20 |
|
21 | 21 |
block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o |
22 | 22 |
block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o |
23 |
block-nested-y += qed.o |
|
23 |
block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
|
|
24 | 24 |
block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o |
25 | 25 |
block-nested-$(CONFIG_WIN32) += raw-win32.o |
26 | 26 |
block-nested-$(CONFIG_POSIX) += raw-posix.o |
b/block/qed-cluster.c | ||
---|---|---|
1 |
/* |
|
2 |
* QEMU Enhanced Disk Format Cluster functions |
|
3 |
* |
|
4 |
* Copyright IBM, Corp. 2010 |
|
5 |
* |
|
6 |
* Authors: |
|
7 |
* Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> |
|
8 |
* Anthony Liguori <aliguori@us.ibm.com> |
|
9 |
* |
|
10 |
* This work is licensed under the terms of the GNU LGPL, version 2 or later. |
|
11 |
* See the COPYING.LIB file in the top-level directory. |
|
12 |
* |
|
13 |
*/ |
|
14 |
|
|
15 |
#include "qed.h" |
|
16 |
|
|
17 |
/** |
|
18 |
* Count the number of contiguous data clusters |
|
19 |
* |
|
20 |
* @s: QED state |
|
21 |
* @table: L2 table |
|
22 |
* @index: First cluster index |
|
23 |
* @n: Maximum number of clusters |
|
24 |
* @offset: Set to first cluster offset |
|
25 |
* |
|
26 |
* This function scans tables for contiguous allocated or free clusters. |
|
27 |
*/ |
|
28 |
static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
                                                  QEDTable *table,
                                                  unsigned int index,
                                                  unsigned int n,
                                                  uint64_t *offset)
{
    /* Never scan past the end of the table */
    unsigned int end = MIN(index + n, s->table_nelems);
    uint64_t last = table->offsets[index];
    unsigned int i;

    /* Report the offset of the first cluster in the run */
    *offset = last;

    for (i = index + 1; i < end; i++) {
        if (last == 0) {
            /* Counting free clusters: the run ends at the first allocated
             * entry */
            if (table->offsets[i] != 0) {
                break;
            }
        } else {
            /* Counting allocated clusters: the run ends when the next entry
             * is not physically contiguous with the previous one */
            if (table->offsets[i] != last + s->header.cluster_size) {
                break;
            }
            last = table->offsets[i];
        }
    }
    return i - index;   /* length of the run; always >= 1 */
}
|
56 |
|
|
57 |
typedef struct {
    BDRVQEDState *s;        /* QED state this lookup belongs to */
    uint64_t pos;           /* byte position in the block device */
    size_t len;             /* request length, already clamped to L2 range */

    QEDRequest *request;    /* carries the L2 table cache reference */

    /* User callback */
    QEDFindClusterFunc *cb;
    void *opaque;
} QEDFindClusterCB;
|
68 |
|
|
69 |
/**
 * Completion for the L2 table read issued by qed_find_cluster()
 *
 * Translates the device position into a cluster offset using the L2 table
 * that request->l2_table now references, then invokes the user callback.
 * Frees the QEDFindClusterCB in all cases.
 */
static void qed_find_cluster_cb(void *opaque, int ret)
{
    QEDFindClusterCB *find_cluster_cb = opaque;
    BDRVQEDState *s = find_cluster_cb->s;
    QEDRequest *request = find_cluster_cb->request;
    uint64_t offset = 0;
    size_t len = 0;
    unsigned int index;
    unsigned int n;

    if (ret) {
        /* L2 table read failed; report the error with offset/len of 0 */
        goto out;
    }

    /* First cluster index within the L2 table and the maximum number of
     * clusters this request may span */
    index = qed_l2_index(s, find_cluster_cb->pos);
    n = qed_bytes_to_clusters(s,
                              qed_offset_into_cluster(s, find_cluster_cb->pos) +
                              find_cluster_cb->len);
    n = qed_count_contiguous_clusters(s, request->l2_table->table,
                                      index, n, &offset);

    /* offset == 0 means the run is unallocated in this L2 table */
    ret = offset ? QED_CLUSTER_FOUND : QED_CLUSTER_L2;
    len = MIN(find_cluster_cb->len, n * s->header.cluster_size -
              qed_offset_into_cluster(s, find_cluster_cb->pos));

    if (offset && !qed_check_cluster_offset(s, offset)) {
        /* Allocated offset lies outside the image file: table is corrupt */
        ret = -EINVAL;
    }

out:
    find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
    qemu_free(find_cluster_cb);
}
|
102 |
|
|
103 |
/** |
|
104 |
* Find the offset of a data cluster |
|
105 |
* |
|
106 |
* @s: QED state |
|
107 |
* @request: L2 cache entry |
|
108 |
* @pos: Byte position in device |
|
109 |
* @len: Number of bytes |
|
110 |
* @cb: Completion function |
|
111 |
* @opaque: User data for completion function |
|
112 |
* |
|
113 |
* This function translates a position in the block device to an offset in the |
|
114 |
* image file. It invokes the cb completion callback to report back the |
|
115 |
* translated offset or unallocated range in the image file. |
|
116 |
* |
|
117 |
* If the L2 table exists, request->l2_table points to the L2 table cache entry |
|
118 |
* and the caller must free the reference when they are finished. The cache |
|
119 |
* entry is exposed in this way to avoid callers having to read the L2 table |
|
120 |
* again later during request processing. If request->l2_table is non-NULL it |
|
121 |
* will be unreferenced before taking on the new cache entry. |
|
122 |
*/ |
|
123 |
void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
                      size_t len, QEDFindClusterFunc *cb, void *opaque)
{
    QEDFindClusterCB *find_cluster_cb;
    uint64_t l2_offset;

    /* Limit length to L2 boundary.  Requests are broken up at the L2 boundary
     * so that a request acts on one L2 table at a time.
     */
    len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos);

    l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)];
    if (!l2_offset) {
        /* No L2 table allocated for this range at all */
        cb(opaque, QED_CLUSTER_L1, 0, len);
        return;
    }
    if (!qed_check_table_offset(s, l2_offset)) {
        /* L1 entry points outside the image file: table is corrupt */
        cb(opaque, -EINVAL, 0, 0);
        return;
    }

    find_cluster_cb = qemu_malloc(sizeof(*find_cluster_cb));
    find_cluster_cb->s = s;
    find_cluster_cb->pos = pos;
    find_cluster_cb->len = len;
    find_cluster_cb->cb = cb;
    find_cluster_cb->opaque = opaque;
    find_cluster_cb->request = request;

    /* Load the L2 table (from cache if possible); translation continues in
     * qed_find_cluster_cb() */
    qed_read_l2_table(s, request, l2_offset,
                      qed_find_cluster_cb, find_cluster_cb);
}
b/block/qed-gencb.c | ||
---|---|---|
1 |
/* |
|
2 |
* QEMU Enhanced Disk Format |
|
3 |
* |
|
4 |
* Copyright IBM, Corp. 2010 |
|
5 |
* |
|
6 |
* Authors: |
|
7 |
* Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> |
|
8 |
* |
|
9 |
* This work is licensed under the terms of the GNU LGPL, version 2 or later. |
|
10 |
* See the COPYING.LIB file in the top-level directory. |
|
11 |
* |
|
12 |
*/ |
|
13 |
|
|
14 |
#include "qed.h" |
|
15 |
|
|
16 |
/**
 * Allocate a callback structure that begins with a GenericCB header
 *
 * @len: Size of the full structure to allocate
 * @cb: Completion function invoked later via gencb_complete()
 * @opaque: User data for @cb
 *
 * Returns the new structure; gencb_complete() frees it.
 */
void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque)
{
    GenericCB *header = qemu_malloc(len);

    header->opaque = opaque;
    header->cb = cb;
    return header;
}
|
23 |
|
|
24 |
void gencb_complete(void *opaque, int ret) |
|
25 |
{ |
|
26 |
GenericCB *gencb = opaque; |
|
27 |
BlockDriverCompletionFunc *cb = gencb->cb; |
|
28 |
void *user_opaque = gencb->opaque; |
|
29 |
|
|
30 |
qemu_free(gencb); |
|
31 |
cb(user_opaque, ret); |
|
32 |
} |
b/block/qed-l2-cache.c | ||
---|---|---|
1 |
/* |
|
2 |
* QEMU Enhanced Disk Format L2 Cache |
|
3 |
* |
|
4 |
* Copyright IBM, Corp. 2010 |
|
5 |
* |
|
6 |
* Authors: |
|
7 |
* Anthony Liguori <aliguori@us.ibm.com> |
|
8 |
* |
|
9 |
* This work is licensed under the terms of the GNU LGPL, version 2 or later. |
|
10 |
* See the COPYING.LIB file in the top-level directory. |
|
11 |
* |
|
12 |
*/ |
|
13 |
|
|
14 |
/* |
|
15 |
* L2 table cache usage is as follows: |
|
16 |
* |
|
17 |
* An open image has one L2 table cache that is used to avoid accessing the |
|
18 |
* image file for recently referenced L2 tables. |
|
19 |
* |
|
20 |
* Cluster offset lookup translates the logical offset within the block device |
|
21 |
* to a cluster offset within the image file. This is done by indexing into |
|
22 |
* the L1 and L2 tables which store cluster offsets. It is here where the L2 |
|
23 |
* table cache serves up recently referenced L2 tables. |
|
24 |
* |
|
25 |
* If there is a cache miss, that L2 table is read from the image file and |
|
26 |
* committed to the cache. Subsequent accesses to that L2 table will be served |
|
27 |
* from the cache until the table is evicted from the cache. |
|
28 |
* |
|
29 |
* L2 tables are also committed to the cache when new L2 tables are allocated |
|
30 |
* in the image file. Since the L2 table cache is write-through, the new L2 |
|
31 |
* table is first written out to the image file and then committed to the |
|
32 |
* cache. |
|
33 |
* |
|
34 |
* Multiple I/O requests may be using an L2 table cache entry at any given |
|
35 |
* time. That means an entry may be in use across several requests and |
|
36 |
* reference counting is needed to free the entry at the correct time. In |
|
37 |
* particular, an entry evicted from the cache will only be freed once all |
|
38 |
* references are dropped. |
|
39 |
* |
|
40 |
* An in-flight I/O request will hold a reference to a L2 table cache entry for |
|
41 |
* the period during which it needs to access the L2 table. This includes |
|
42 |
* cluster offset lookup, L2 table allocation, and L2 table update when a new |
|
43 |
* data cluster has been allocated. |
|
44 |
* |
|
45 |
* An interesting case occurs when two requests need to access an L2 table that |
|
46 |
* is not in the cache. Since the operation to read the table from the image |
|
47 |
* file takes some time to complete, both requests may see a cache miss and |
|
48 |
* start reading the L2 table from the image file. The first to finish will |
|
49 |
 * commit its L2 table into the cache.  When the second request then tries to
 * commit, its table will be deleted in favor of the existing cache entry.
|
51 |
*/ |
|
52 |
|
|
53 |
#include "trace.h" |
|
54 |
#include "qed.h" |
|
55 |
|
|
56 |
/* Each L2 holds 2GB so this lets us fully cache a 100GB disk */
|
57 |
#define MAX_L2_CACHE_SIZE 50 |
|
58 |
|
|
59 |
/** |
|
60 |
* Initialize the L2 cache |
|
61 |
*/ |
|
62 |
/**
 * Initialize the L2 cache
 *
 * Produces an empty cache; must be called before any other cache function.
 */
void qed_init_l2_cache(L2TableCache *l2_cache)
{
    l2_cache->n_entries = 0;
    QTAILQ_INIT(&l2_cache->entries);
}
|
67 |
|
|
68 |
/** |
|
69 |
* Free the L2 cache |
|
70 |
*/ |
|
71 |
/**
 * Free the L2 cache
 *
 * Releases every cached table and its entry structure unconditionally;
 * callers must ensure no in-flight requests still reference entries.
 */
void qed_free_l2_cache(L2TableCache *l2_cache)
{
    CachedL2Table *cached;
    CachedL2Table *tmp;

    QTAILQ_FOREACH_SAFE(cached, &l2_cache->entries, node, tmp) {
        qemu_vfree(cached->table);
        qemu_free(cached);
    }
}
|
80 |
|
|
81 |
/** |
|
82 |
* Allocate an uninitialized entry from the cache |
|
83 |
* |
|
84 |
* The returned entry has a reference count of 1 and is owned by the caller. |
|
85 |
* The caller must allocate the actual table field for this entry and it must |
|
86 |
* be freeable using qemu_vfree(). |
|
87 |
*/ |
|
88 |
CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache)
{
    CachedL2Table *new_entry = qemu_mallocz(sizeof(*new_entry));

    /* The caller owns the initial reference */
    new_entry->ref = 1;

    trace_qed_alloc_l2_cache_entry(l2_cache, new_entry);

    return new_entry;
}
|
99 |
|
|
100 |
/** |
|
101 |
* Decrease an entry's reference count and free if necessary when the reference |
|
102 |
* count drops to zero. |
|
103 |
*/ |
|
104 |
void qed_unref_l2_cache_entry(CachedL2Table *entry)
{
    /* Tolerate NULL so callers can drop references unconditionally */
    if (!entry) {
        return;
    }

    entry->ref--;
    trace_qed_unref_l2_cache_entry(entry, entry->ref);
    if (entry->ref == 0) {
        /* Last reference dropped: free the table and the entry itself */
        qemu_vfree(entry->table);
        qemu_free(entry);
    }
}
|
117 |
|
|
118 |
/** |
|
119 |
* Find an entry in the L2 cache. This may return NULL and it's up to the |
|
120 |
* caller to satisfy the cache miss. |
|
121 |
* |
|
122 |
* For a cached entry, this function increases the reference count and returns |
|
123 |
* the entry. |
|
124 |
*/ |
|
125 |
CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset)
{
    CachedL2Table *cached;

    /* Linear scan; the cache is bounded by MAX_L2_CACHE_SIZE entries */
    QTAILQ_FOREACH(cached, &l2_cache->entries, node) {
        if (cached->offset != offset) {
            continue;
        }
        trace_qed_find_l2_cache_entry(l2_cache, cached, offset, cached->ref);
        cached->ref++;
        return cached;
    }
    return NULL;
}
|
138 |
|
|
139 |
/** |
|
140 |
* Commit an L2 cache entry into the cache. This is meant to be used as part of |
|
141 |
* the process to satisfy a cache miss. A caller would allocate an entry which |
|
142 |
* is not actually in the L2 cache and then once the entry was valid and |
|
143 |
* present on disk, the entry can be committed into the cache. |
|
144 |
* |
|
145 |
* Since the cache is write-through, it's important that this function is not |
|
146 |
* called until the entry is present on disk and the L1 has been updated to |
|
147 |
* point to the entry. |
|
148 |
* |
|
149 |
* N.B. This function steals a reference to the l2_table from the caller so the |
|
150 |
* caller must obtain a new reference by issuing a call to |
|
151 |
* qed_find_l2_cache_entry(). |
|
152 |
*/ |
|
153 |
void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table)
{
    CachedL2Table *entry;

    /* If an entry for this offset is already cached, another request won the
     * race; keep the existing entry and drop ours. */
    entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset);
    if (entry) {
        qed_unref_l2_cache_entry(entry);      /* undo the lookup's ref */
        qed_unref_l2_cache_entry(l2_table);   /* the stolen caller reference */
        return;
    }

    /* Evict the oldest entry (head of the tail queue) when full */
    if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) {
        entry = QTAILQ_FIRST(&l2_cache->entries);
        QTAILQ_REMOVE(&l2_cache->entries, entry, node);
        l2_cache->n_entries--;
        qed_unref_l2_cache_entry(entry);
    }

    /* The caller's reference now belongs to the cache */
    l2_cache->n_entries++;
    QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node);
}
b/block/qed-table.c | ||
---|---|---|
1 |
/* |
|
2 |
* QEMU Enhanced Disk Format Table I/O |
|
3 |
* |
|
4 |
* Copyright IBM, Corp. 2010 |
|
5 |
* |
|
6 |
* Authors: |
|
7 |
* Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> |
|
8 |
* Anthony Liguori <aliguori@us.ibm.com> |
|
9 |
* |
|
10 |
* This work is licensed under the terms of the GNU LGPL, version 2 or later. |
|
11 |
* See the COPYING.LIB file in the top-level directory. |
|
12 |
* |
|
13 |
*/ |
|
14 |
|
|
15 |
#include "trace.h" |
|
16 |
#include "qemu_socket.h" /* for EINPROGRESS on Windows */ |
|
17 |
#include "qed.h" |
|
18 |
|
|
19 |
typedef struct {
    GenericCB gencb;    /* must be first: completed via gencb_complete() */
    BDRVQEDState *s;
    QEDTable *table;    /* destination table, byteswapped in place */

    struct iovec iov;
    QEMUIOVector qiov;
} QEDReadTableCB;
|
27 |
|
|
28 |
static void qed_read_table_cb(void *opaque, int ret)
{
    QEDReadTableCB *read_table_cb = opaque;
    QEDTable *table = read_table_cb->table;
    int noffsets = read_table_cb->iov.iov_len / sizeof(uint64_t);
    int i;

    /* Handle I/O error */
    if (ret) {
        goto out;
    }

    /* Byteswap offsets: little-endian on disk, cpu-endian in memory */
    for (i = 0; i < noffsets; i++) {
        table->offsets[i] = le64_to_cpu(table->offsets[i]);
    }

out:
    /* Completion */
    trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret);
    gencb_complete(&read_table_cb->gencb, ret);
}
|
50 |
|
|
51 |
static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, |
|
52 |
BlockDriverCompletionFunc *cb, void *opaque) |
|
53 |
{ |
|
54 |
QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb), |
|
55 |
cb, opaque); |
|
56 |
QEMUIOVector *qiov = &read_table_cb->qiov; |
|
57 |
BlockDriverAIOCB *aiocb; |
|
58 |
|
|
59 |
trace_qed_read_table(s, offset, table); |
|
60 |
|
|
61 |
read_table_cb->s = s; |
|
62 |
read_table_cb->table = table; |
|
63 |
read_table_cb->iov.iov_base = table->offsets, |
|
64 |
read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, |
|
65 |
|
|
66 |
qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); |
|
67 |
aiocb = bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, |
|
68 |
read_table_cb->iov.iov_len / BDRV_SECTOR_SIZE, |
|
69 |
qed_read_table_cb, read_table_cb); |
|
70 |
if (!aiocb) { |
|
71 |
qed_read_table_cb(read_table_cb, -EIO); |
|
72 |
} |
|
73 |
} |
|
74 |
|
|
75 |
typedef struct {
    GenericCB gencb;        /* must be first: completed via gencb_complete() */
    BDRVQEDState *s;
    QEDTable *orig_table;   /* caller's cpu-endian table (not owned) */
    QEDTable *table;        /* bounce buffer with the little-endian copy */
    bool flush; /* flush after write? */

    struct iovec iov;
    QEMUIOVector qiov;
} QEDWriteTableCB;
|
85 |
|
|
86 |
static void qed_write_table_cb(void *opaque, int ret) |
|
87 |
{ |
|
88 |
QEDWriteTableCB *write_table_cb = opaque; |
|
89 |
|
|
90 |
trace_qed_write_table_cb(write_table_cb->s, |
|
91 |
write_table_cb->orig_table, |
|
92 |
write_table_cb->flush, |
|
93 |
ret); |
|
94 |
|
|
95 |
if (ret) { |
|
96 |
goto out; |
|
97 |
} |
|
98 |
|
|
99 |
if (write_table_cb->flush) { |
|
100 |
/* We still need to flush first */ |
|
101 |
write_table_cb->flush = false; |
|
102 |
bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, |
|
103 |
write_table_cb); |
|
104 |
return; |
|
105 |
} |
|
106 |
|
|
107 |
out: |
|
108 |
qemu_vfree(write_table_cb->table); |
|
109 |
gencb_complete(&write_table_cb->gencb, ret); |
|
110 |
return; |
|
111 |
} |
|
112 |
|
|
113 |
/** |
|
114 |
* Write out an updated part or all of a table |
|
115 |
* |
|
116 |
* @s: QED state |
|
117 |
* @offset: Offset of table in image file, in bytes |
|
118 |
* @table: Table |
|
119 |
* @index: Index of first element |
|
120 |
* @n: Number of elements |
|
121 |
* @flush: Whether or not to sync to disk |
|
122 |
* @cb: Completion function |
|
123 |
* @opaque: Argument for completion function |
|
124 |
*/ |
|
125 |
static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
                            unsigned int index, unsigned int n, bool flush,
                            BlockDriverCompletionFunc *cb, void *opaque)
{
    QEDWriteTableCB *write_table_cb;
    BlockDriverAIOCB *aiocb;
    unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
    unsigned int start, end, i;
    size_t len_bytes;

    trace_qed_write_table(s, offset, table, index, n);

    /* Calculate indices of the first and one after last elements, widened
     * to whole sectors since I/O is done in sector units */
    start = index & ~sector_mask;
    end = (index + n + sector_mask) & ~sector_mask;

    len_bytes = (end - start) * sizeof(uint64_t);

    write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque);
    write_table_cb->s = s;
    write_table_cb->orig_table = table;
    write_table_cb->flush = flush;
    /* Bounce buffer holds the little-endian on-disk representation */
    write_table_cb->table = qemu_blockalign(s->bs, len_bytes);
    write_table_cb->iov.iov_base = write_table_cb->table->offsets;
    write_table_cb->iov.iov_len = len_bytes;
    qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1);

    /* Byteswap table */
    for (i = start; i < end; i++) {
        uint64_t le_offset = cpu_to_le64(table->offsets[i]);
        write_table_cb->table->offsets[i - start] = le_offset;
    }

    /* Adjust for offset into table */
    offset += start * sizeof(uint64_t);

    aiocb = bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
                            &write_table_cb->qiov,
                            write_table_cb->iov.iov_len / BDRV_SECTOR_SIZE,
                            qed_write_table_cb, write_table_cb);
    if (!aiocb) {
        /* Submission failed: complete immediately with an I/O error */
        qed_write_table_cb(write_table_cb, -EIO);
    }
}
|
169 |
|
|
170 |
/** |
|
171 |
* Propagate return value from async callback |
|
172 |
*/ |
|
173 |
static void qed_sync_cb(void *opaque, int ret)
{
    int *ret_slot = opaque;

    *ret_slot = ret;
}
|
177 |
|
|
178 |
int qed_read_l1_table_sync(BDRVQEDState *s)
{
    int ret = -EINPROGRESS;    /* sentinel overwritten by qed_sync_cb() */

    /* Nested async context so only our own completions run while waiting */
    async_context_push();

    qed_read_table(s, s->header.l1_table_offset,
                   s->l1_table, qed_sync_cb, &ret);
    while (ret == -EINPROGRESS) {
        qemu_aio_wait();
    }

    async_context_pop();

    return ret;
}
|
194 |
|
|
195 |
void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
                        BlockDriverCompletionFunc *cb, void *opaque)
{
    BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
    /* flush=false: no sync is forced after the L1 write */
    qed_write_table(s, s->header.l1_table_offset,
                    s->l1_table, index, n, false, cb, opaque);
}
|
202 |
|
|
203 |
int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
                            unsigned int n)
{
    int ret = -EINPROGRESS;    /* sentinel overwritten by qed_sync_cb() */

    /* Nested async context so only our own completions run while waiting */
    async_context_push();

    qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
    while (ret == -EINPROGRESS) {
        qemu_aio_wait();
    }

    async_context_pop();

    return ret;
}
|
219 |
|
|
220 |
typedef struct {
    GenericCB gencb;        /* must be first: completed via gencb_complete() */
    BDRVQEDState *s;
    uint64_t l2_offset;     /* image file offset of the L2 table */
    QEDRequest *request;    /* request whose l2_table field is being filled */
} QEDReadL2TableCB;
|
226 |
|
|
227 |
static void qed_read_l2_table_cb(void *opaque, int ret)
{
    QEDReadL2TableCB *read_l2_table_cb = opaque;
    QEDRequest *request = read_l2_table_cb->request;
    BDRVQEDState *s = read_l2_table_cb->s;
    CachedL2Table *l2_table = request->l2_table;

    if (ret) {
        /* can't trust loaded L2 table anymore */
        qed_unref_l2_cache_entry(l2_table);
        request->l2_table = NULL;
    } else {
        l2_table->offset = read_l2_table_cb->l2_offset;

        /* The commit steals our reference, so re-acquire one below */
        qed_commit_l2_cache_entry(&s->l2_cache, l2_table);

        /* This is guaranteed to succeed because we just committed the entry
         * to the cache.
         */
        request->l2_table = qed_find_l2_cache_entry(&s->l2_cache,
                                                    l2_table->offset);
        assert(request->l2_table != NULL);
    }

    gencb_complete(&read_l2_table_cb->gencb, ret);
}
|
253 |
|
|
254 |
void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
                       BlockDriverCompletionFunc *cb, void *opaque)
{
    QEDReadL2TableCB *read_l2_table_cb;

    /* Drop any L2 table reference the request currently holds */
    qed_unref_l2_cache_entry(request->l2_table);

    /* Check for cached L2 entry */
    request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
    if (request->l2_table) {
        cb(opaque, 0);
        return;
    }

    /* Cache miss: allocate an entry and read the table from the image file */
    request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
    request->l2_table->table = qed_alloc_table(s);

    read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque);
    read_l2_table_cb->s = s;
    read_l2_table_cb->l2_offset = offset;
    read_l2_table_cb->request = request;

    BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD);
    qed_read_table(s, offset, request->l2_table->table,
                   qed_read_l2_table_cb, read_l2_table_cb);
}
|
280 |
|
|
281 |
int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
{
    int ret = -EINPROGRESS;    /* sentinel overwritten by qed_sync_cb() */

    /* Nested async context so only our own completions run while waiting */
    async_context_push();

    qed_read_l2_table(s, request, offset, qed_sync_cb, &ret);
    while (ret == -EINPROGRESS) {
        qemu_aio_wait();
    }

    async_context_pop();
    return ret;
}
|
295 |
|
|
296 |
void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
                        unsigned int index, unsigned int n, bool flush,
                        BlockDriverCompletionFunc *cb, void *opaque)
{
    BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE);
    /* Write back the L2 table currently referenced by this request */
    qed_write_table(s, request->l2_table->offset,
                    request->l2_table->table, index, n, flush, cb, opaque);
}
|
304 |
|
|
305 |
int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
                            unsigned int index, unsigned int n, bool flush)
{
    int ret = -EINPROGRESS;    /* sentinel overwritten by qed_sync_cb() */

    /* Nested async context so only our own completions run while waiting */
    async_context_push();

    qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
    while (ret == -EINPROGRESS) {
        qemu_aio_wait();
    }

    async_context_pop();
    return ret;
}
b/block/qed.c | ||
---|---|---|
155 | 155 |
return 0; |
156 | 156 |
} |
157 | 157 |
|
158 |
QEDTable *qed_alloc_table(BDRVQEDState *s)
{
    /* Honor O_DIRECT memory alignment requirements */
    return qemu_blockalign(s->bs,
                           s->header.table_size * s->header.cluster_size);
}
|
164 |
|
|
158 | 165 |
static int bdrv_qed_open(BlockDriverState *bs, int flags) |
159 | 166 |
{ |
160 | 167 |
BDRVQEDState *s = bs->opaque; |
... | ... | |
244 | 251 |
bdrv_flush(bs->file); |
245 | 252 |
} |
246 | 253 |
|
254 |
s->l1_table = qed_alloc_table(s); |
|
255 |
qed_init_l2_cache(&s->l2_cache); |
|
256 |
|
|
257 |
ret = qed_read_l1_table_sync(s); |
|
258 |
if (ret) { |
|
259 |
qed_free_l2_cache(&s->l2_cache); |
|
260 |
qemu_vfree(s->l1_table); |
|
261 |
} |
|
247 | 262 |
return ret; |
248 | 263 |
} |
249 | 264 |
|
250 | 265 |
static void bdrv_qed_close(BlockDriverState *bs)
{
    BDRVQEDState *s = bs->opaque;

    /* Release the resources allocated in bdrv_qed_open() */
    qed_free_l2_cache(&s->l2_cache);
    qemu_vfree(s->l1_table);
}
253 | 272 |
|
254 | 273 |
static int bdrv_qed_flush(BlockDriverState *bs) |
... | ... | |
368 | 387 |
backing_file, backing_fmt); |
369 | 388 |
} |
370 | 389 |
|
390 |
typedef struct {
    int is_allocated;   /* initialized to -1; set to 0 or 1 on completion */
    int *pnum;          /* output: number of sectors in the reported range */
} QEDIsAllocatedCB;
|
394 |
|
|
395 |
static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
{
    QEDIsAllocatedCB *cb = opaque;
    /* Convert the contiguous byte length to a sector count */
    *cb->pnum = len / BDRV_SECTOR_SIZE;
    /* Doubles as the completion flag: no longer -1 once this runs */
    cb->is_allocated = ret == QED_CLUSTER_FOUND;
}
|
401 |
|
|
371 | 402 |
/**
 * Check whether sectors are allocated in the image file
 *
 * Synchronously drives qed_find_cluster() to completion.  *pnum is set to
 * the number of contiguous sectors sharing the same allocation status.
 * (The old `return -ENOTSUP;` stub left over from the diff is removed.)
 */
static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors, int *pnum)
{
    BDRVQEDState *s = bs->opaque;
    uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
    size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
    QEDIsAllocatedCB cb = {
        .is_allocated = -1,     /* -1 means the lookup has not completed */
        .pnum = pnum,
    };
    QEDRequest request = { .l2_table = NULL };

    async_context_push();

    qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);

    /* Wait until qed_is_allocated_cb() has run */
    while (cb.is_allocated == -1) {
        qemu_aio_wait();
    }

    async_context_pop();

    /* Drop the L2 table reference taken during the lookup */
    qed_unref_l2_cache_entry(request.l2_table);

    return cb.is_allocated;
}
376 | 428 |
|
377 | 429 |
static int bdrv_qed_make_empty(BlockDriverState *bs) |
b/block/qed.h | ||
---|---|---|
96 | 96 |
} QEDHeader; |
97 | 97 |
|
98 | 98 |
typedef struct {
    uint64_t offsets[0]; /* in bytes; buffer length is
                          * cluster_size * table_size (see qed_alloc_table) */
} QEDTable;
|
101 |
|
|
102 |
/* The L2 cache is a simple write-through cache for L2 structures */
typedef struct CachedL2Table {
    QEDTable *table;
    uint64_t offset;    /* offset=0 indicates an invalid entry */
    QTAILQ_ENTRY(CachedL2Table) node;
    int ref;            /* reference count; entry is freed when it hits 0 */
} CachedL2Table;
|
109 |
|
|
110 |
typedef struct {
    QTAILQ_HEAD(, CachedL2Table) entries;   /* insertion order; head evicted first */
    unsigned int n_entries;                 /* current number of cached tables */
} L2TableCache;
|
114 |
|
|
115 |
typedef struct QEDRequest {
    CachedL2Table *l2_table;    /* referenced L2 cache entry, or NULL */
} QEDRequest;
|
118 |
|
|
119 |
typedef struct {
    BlockDriverState *bs;   /* device */
    uint64_t file_size;     /* length of image file, in bytes */

    QEDHeader header;       /* always cpu-endian */
    QEDTable *l1_table;     /* in-memory L1 table */
    L2TableCache l2_cache;  /* l2 table cache */
    uint32_t table_nelems;  /* number of entries per table */
    uint32_t l1_shift;      /* bits covered by one L1 entry */
    uint32_t l2_shift;      /* bits covered by one L2 entry */
    uint32_t l2_mask;       /* mask for the L2 index within a table */
} BDRVQEDState;
108 | 131 |
|
132 |
/* Lookup results reported via QEDFindClusterFunc's ret argument */
enum {
    QED_CLUSTER_FOUND,      /* cluster found */
    QED_CLUSTER_L2,         /* cluster missing in L2 */
    QED_CLUSTER_L1,         /* cluster missing in L1 */
};
|
137 |
|
|
138 |
/** |
|
139 |
* qed_find_cluster() completion callback |
|
140 |
* |
|
141 |
* @opaque: User data for completion callback |
|
142 |
* @ret: QED_CLUSTER_FOUND Success |
|
143 |
* QED_CLUSTER_L2 Data cluster unallocated in L2 |
|
144 |
* QED_CLUSTER_L1 L2 unallocated in L1 |
|
145 |
* -errno POSIX error occurred |
|
146 |
* @offset: Data cluster offset |
|
147 |
* @len: Contiguous bytes starting from cluster offset |
|
148 |
* |
|
149 |
* This function is invoked when qed_find_cluster() completes. |
|
150 |
* |
|
151 |
* On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range |
|
152 |
* in the image file. |
|
153 |
* |
|
154 |
* On failure ret is QED_CLUSTER_L2 or QED_CLUSTER_L1 for missing L2 or L1 |
|
155 |
* table offset, respectively. len is number of contiguous unallocated bytes. |
|
156 |
*/ |
|
157 |
typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len); |
|
158 |
|
|
159 |
/** |
|
160 |
* Generic callback for chaining async callbacks |
|
161 |
*/ |
|
162 |
typedef struct {
    BlockDriverCompletionFunc *cb;  /* user completion function */
    void *opaque;                   /* user data for cb */
} GenericCB;
|
166 |
|
|
167 |
void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque); |
|
168 |
void gencb_complete(void *opaque, int ret); |
|
169 |
|
|
170 |
/** |
|
171 |
* L2 cache functions |
|
172 |
*/ |
|
173 |
void qed_init_l2_cache(L2TableCache *l2_cache); |
|
174 |
void qed_free_l2_cache(L2TableCache *l2_cache); |
|
175 |
CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache); |
|
176 |
void qed_unref_l2_cache_entry(CachedL2Table *entry); |
|
177 |
CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset); |
|
178 |
void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table); |
|
179 |
|
|
180 |
/** |
|
181 |
* Table I/O functions |
|
182 |
*/ |
|
183 |
int qed_read_l1_table_sync(BDRVQEDState *s); |
|
184 |
void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, |
|
185 |
BlockDriverCompletionFunc *cb, void *opaque); |
|
186 |
int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, |
|
187 |
unsigned int n); |
|
188 |
int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, |
|
189 |
uint64_t offset); |
|
190 |
void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, |
|
191 |
BlockDriverCompletionFunc *cb, void *opaque); |
|
192 |
void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, |
|
193 |
unsigned int index, unsigned int n, bool flush, |
|
194 |
BlockDriverCompletionFunc *cb, void *opaque); |
|
195 |
int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, |
|
196 |
unsigned int index, unsigned int n, bool flush); |
|
197 |
|
|
198 |
/** |
|
199 |
* Cluster functions |
|
200 |
*/ |
|
201 |
void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, |
|
202 |
size_t len, QEDFindClusterFunc *cb, void *opaque); |
|
203 |
|
|
204 |
/** |
|
205 |
* Consistency check |
|
206 |
*/ |
|
207 |
int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix); |
|
208 |
|
|
209 |
QEDTable *qed_alloc_table(BDRVQEDState *s); |
|
210 |
|
|
109 | 211 |
/** |
110 | 212 |
* Round down to the start of a cluster |
111 | 213 |
*/ |
... | ... | |
114 | 216 |
return offset & ~(uint64_t)(s->header.cluster_size - 1); |
115 | 217 |
} |
116 | 218 |
|
219 |
static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset)
{
    uint64_t cluster_mask = s->header.cluster_size - 1;

    return offset & cluster_mask;
}
|
223 |
|
|
224 |
/**
 * Convert a byte count to the number of clusters it spans, rounding up
 *
 * Divides by the full cluster size (not cluster_size - 1, which was an
 * off-by-one: an exact multiple of the cluster size must map to the exact
 * cluster count).
 */
static inline unsigned int qed_bytes_to_clusters(BDRVQEDState *s, size_t bytes)
{
    return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) /
           (s->header.cluster_size);
}
|
229 |
|
|
230 |
static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos)
{
    /* Each L1 entry covers 1 << l1_shift bytes of the device */
    return (unsigned int)(pos >> s->l1_shift);
}
|
234 |
|
|
235 |
static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos)
{
    uint64_t l2_entry = pos >> s->l2_shift;

    /* Mask off the L1 portion, leaving the index within one L2 table */
    return l2_entry & s->l2_mask;
}
|
239 |
|
|
117 | 240 |
/** |
118 | 241 |
* Test if a cluster offset is valid |
119 | 242 |
*/ |
b/trace-events | ||
---|---|---|
192 | 192 |
|
193 | 193 |
# vl.c |
194 | 194 |
disable vm_state_notify(int running, int reason) "running %d reason %d" |
195 |
|
|
196 |
# block/qed-l2-cache.c |
|
197 |
disable qed_alloc_l2_cache_entry(void *l2_cache, void *entry) "l2_cache %p entry %p" |
|
198 |
disable qed_unref_l2_cache_entry(void *entry, int ref) "entry %p ref %d" |
|
199 |
disable qed_find_l2_cache_entry(void *l2_cache, void *entry, uint64_t offset, int ref) "l2_cache %p entry %p offset %"PRIu64" ref %d" |
|
200 |
|
|
201 |
# block/qed-table.c |
|
202 |
disable qed_read_table(void *s, uint64_t offset, void *table) "s %p offset %"PRIu64" table %p" |
|
203 |
disable qed_read_table_cb(void *s, void *table, int ret) "s %p table %p ret %d" |
|
204 |
disable qed_write_table(void *s, uint64_t offset, void *table, unsigned int index, unsigned int n) "s %p offset %"PRIu64" table %p index %u n %u" |
|
205 |
disable qed_write_table_cb(void *s, void *table, int flush, int ret) "s %p table %p flush %d ret %d" |
Also available in: Unified diff