root / block / quorum.c @ 98a7a38f
History | View | Annotate | Download (19.4 kB)
1 |
/*
|
---|---|
2 |
* Quorum Block filter
|
3 |
*
|
4 |
* Copyright (C) 2012-2014 Nodalink, EURL.
|
5 |
*
|
6 |
* Author:
|
7 |
* Benoît Canet <benoit.canet@irqsave.net>
|
8 |
*
|
9 |
* Based on the design and code of blkverify.c (Copyright (C) 2010 IBM, Corp)
|
10 |
* and blkmirror.c (Copyright (C) 2011 Red Hat, Inc).
|
11 |
*
|
12 |
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
13 |
* See the COPYING file in the top-level directory.
|
14 |
*/
|
15 |
|
16 |
#include <gnutls/gnutls.h> |
17 |
#include <gnutls/crypto.h> |
18 |
#include "block/block_int.h" |
19 |
#include "qapi/qmp/qjson.h" |
20 |
|
21 |
#define HASH_LENGTH 32 |
22 |
|
23 |
/* This union holds a vote hash value.
 * Which member is meaningful depends on the compare callback installed in the
 * QuorumVotes that owns the value.
 */
typedef union QuorumVoteValue {
    char h[HASH_LENGTH];       /* SHA-256 hash */
    int64_t l;                 /* simpler 64 bits hash */
} QuorumVoteValue;
28 |
|
29 |
/* A vote item: remembers the index of one child that cast a given vote */
typedef struct QuorumVoteItem {
    int index;                          /* index of the voting child */
    QLIST_ENTRY(QuorumVoteItem) next;   /* link in QuorumVoteVersion.items */
} QuorumVoteItem;
34 |
|
35 |
/* this structure is a vote version. A version is the set of votes sharing the
 * same vote value.
 * The set of votes will be tracked with the items field and its cardinality is
 * vote_count.
 */
typedef struct QuorumVoteVersion {
    QuorumVoteValue value;               /* value shared by every vote here */
    int index;                           /* index of the child whose vote
                                          * created this version
                                          */
    int vote_count;                      /* number of entries in items */
    QLIST_HEAD(, QuorumVoteItem) items;  /* one item per voting child */
    QLIST_ENTRY(QuorumVoteVersion) next; /* link in QuorumVotes.vote_list */
} QuorumVoteVersion;
47 |
|
48 |
/* this structure holds a group of vote versions together */
typedef struct QuorumVotes {
    QLIST_HEAD(, QuorumVoteVersion) vote_list;
    /* returns true when the two vote values are considered identical */
    bool (*compare)(QuorumVoteValue *a, QuorumVoteValue *b);
} QuorumVotes;
53 |
|
54 |
/* the following structure holds the state of one quorum instance */
typedef struct BDRVQuorumState {
    BlockDriverState **bs; /* children BlockDriverStates */
    int num_children;      /* children count */
    int threshold;         /* if less than threshold children reads gave the
                            * same result a quorum error occurs.
                            */
    bool is_blkverify;     /* true if the driver is in blkverify mode
                            * Writes are mirrored on two children devices.
                            * On reads the two children devices' contents are
                            * compared and if a difference is spotted its
                            * location is printed and the code aborts.
                            * It is useful to debug other block drivers by
                            * comparing them with a reference one.
                            */
} BDRVQuorumState;
70 |
|
71 |
typedef struct QuorumAIOCB QuorumAIOCB; |
72 |
|
73 |
/* Quorum will create one instance of the following structure per operation it
 * performs on its children.
 * So for each read/write operation coming from the upper layer there will be
 * $children_count QuorumChildRequest.
 */
typedef struct QuorumChildRequest {
    BlockDriverAIOCB *aiocb;   /* handle of the request sent to the child */
    QEMUIOVector qiov;         /* per-child I/O vector (set up for reads) */
    uint8_t *buf;              /* per-child read buffer backing qiov, or NULL */
    int ret;                   /* completion code reported by the child */
    QuorumAIOCB *parent;       /* the quorum request this one belongs to */
} QuorumChildRequest;
85 |
|
86 |
/* Quorum will use the following structure to track progress of each read/write
 * operation received by the upper layer.
 * This structure holds pointers to the QuorumChildRequest structure instances
 * used to do operations on each child and track overall progress.
 */
struct QuorumAIOCB {
    BlockDriverAIOCB common;

    /* Request metadata */
    uint64_t sector_num;
    int nb_sectors;

    QEMUIOVector *qiov;         /* calling IOV */

    QuorumChildRequest *qcrs;   /* individual child requests */
    int count;                  /* number of completed AIOCB */
    int success_count;          /* number of successfully completed AIOCB */

    QuorumVotes votes;          /* versions collected while voting on a read */

    bool is_read;               /* true for reads, false for writes */
    int vote_ret;               /* error code produced by the vote, 0 if none;
                                 * handed to the caller's completion callback
                                 */
};
109 |
|
110 |
static void quorum_vote(QuorumAIOCB *acb); |
111 |
|
112 |
/* Cancel a quorum request: cancel the request issued to every child, then
 * release the child request array and the quorum AIOCB itself.
 *
 * NOTE(review): the per-child read buffers and iovecs allocated by
 * quorum_aio_readv() are not released on this path, and this code assumes
 * every qcrs[].aiocb holds a valid handle -- confirm both against the callers.
 */
static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
{
    QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
    BDRVQuorumState *s = acb->common.bs->opaque;
    int child = 0;

    /* cancel all callbacks */
    while (child < s->num_children) {
        bdrv_aio_cancel(acb->qcrs[child].aiocb);
        child++;
    }

    g_free(acb->qcrs);
    qemu_aio_release(acb);
}
126 |
|
127 |
/* AIOCB description passed to qemu_aio_get() when a QuorumAIOCB is created:
 * gives the allocation size and the cancellation hook.
 */
static AIOCBInfo quorum_aiocb_info = {
    .aiocb_size         = sizeof(QuorumAIOCB),
    .cancel             = quorum_aio_cancel,
};
131 |
|
132 |
/* Complete a quorum request: invoke the caller's completion callback with the
 * vote result (0 when no error was recorded), then free the per-child read
 * resources (reads only), the child request array and the AIOCB.
 */
static void quorum_aio_finalize(QuorumAIOCB *acb)
{
    BDRVQuorumState *s = acb->common.bs->opaque;
    int child;

    /* vote_ret stays 0 unless the vote recorded an error */
    acb->common.cb(acb->common.opaque, acb->vote_ret);

    if (acb->is_read) {
        /* only reads own a private buffer and iovec per child */
        for (child = 0; child < s->num_children; child++) {
            QuorumChildRequest *qcr = &acb->qcrs[child];

            qemu_vfree(qcr->buf);
            qemu_iovec_destroy(&qcr->qiov);
        }
    }

    g_free(acb->qcrs);
    qemu_aio_release(acb);
}
153 |
|
154 |
/* Vote comparator for hash values: true when both HASH_LENGTH-byte hashes
 * are identical.
 */
static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b)
{
    return memcmp(a->h, b->h, HASH_LENGTH) == 0;
}
158 |
|
159 |
/* Vote comparator for 64 bits values: true when both values are equal */
static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b)
{
    bool same = (a->l == b->l);

    return same;
}
163 |
|
164 |
static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
|
165 |
BlockDriverState *bs, |
166 |
QEMUIOVector *qiov, |
167 |
uint64_t sector_num, |
168 |
int nb_sectors,
|
169 |
BlockDriverCompletionFunc *cb, |
170 |
void *opaque)
|
171 |
{ |
172 |
QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque); |
173 |
int i;
|
174 |
|
175 |
acb->common.bs->opaque = s; |
176 |
acb->sector_num = sector_num; |
177 |
acb->nb_sectors = nb_sectors; |
178 |
acb->qiov = qiov; |
179 |
acb->qcrs = g_new0(QuorumChildRequest, s->num_children); |
180 |
acb->count = 0;
|
181 |
acb->success_count = 0;
|
182 |
acb->votes.compare = quorum_sha256_compare; |
183 |
QLIST_INIT(&acb->votes.vote_list); |
184 |
acb->is_read = false;
|
185 |
acb->vote_ret = 0;
|
186 |
|
187 |
for (i = 0; i < s->num_children; i++) { |
188 |
acb->qcrs[i].buf = NULL;
|
189 |
acb->qcrs[i].ret = 0;
|
190 |
acb->qcrs[i].parent = acb; |
191 |
} |
192 |
|
193 |
return acb;
|
194 |
} |
195 |
|
196 |
/* Emit a QEVENT_QUORUM_REPORT_BAD monitor event naming a child node together
 * with the return code and the sector range of the request.
 * node_name must not be NULL.
 */
static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret)
{
    QObject *event_data;

    assert(node_name);

    event_data = qobject_from_jsonf("{ 'ret': %d"
                                    ", 'node-name': %s"
                                    ", 'sector-num': %" PRId64
                                    ", 'sectors-count': %d }",
                                    ret, node_name,
                                    acb->sector_num, acb->nb_sectors);
    monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, event_data);
    qobject_decref(event_data);
}
208 |
|
209 |
/* Emit a QEVENT_QUORUM_FAILURE monitor event for the request.
 * The quorum is identified by its device name, or by its node name when the
 * device name is empty.
 */
static void quorum_report_failure(QuorumAIOCB *acb)
{
    BlockDriverState *bs = acb->common.bs;
    const char *reference;
    QObject *event_data;

    if (bs->device_name[0]) {
        reference = bs->device_name;
    } else {
        reference = bs->node_name;
    }

    event_data = qobject_from_jsonf("{ 'reference': %s"
                                    ", 'sector-num': %" PRId64
                                    ", 'sectors-count': %d }",
                                    reference,
                                    acb->sector_num, acb->nb_sectors);
    monitor_protocol_event(QEVENT_QUORUM_FAILURE, event_data);
    qobject_decref(event_data);
}
222 |
|
223 |
static int quorum_vote_error(QuorumAIOCB *acb); |
224 |
|
225 |
/* Check whether fewer than threshold children completed successfully.
 *
 * When that is the case the error code elected by quorum_vote_error() is
 * stored in acb->vote_ret, a failure event is emitted and true is returned.
 * Otherwise nothing is modified and false is returned.
 */
static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb)
{
    BDRVQuorumState *s = acb->common.bs->opaque;

    if (acb->success_count >= s->threshold) {
        return false;
    }

    acb->vote_ret = quorum_vote_error(acb);
    quorum_report_failure(acb);
    return true;
}
237 |
|
238 |
static void quorum_aio_cb(void *opaque, int ret) |
239 |
{ |
240 |
QuorumChildRequest *sacb = opaque; |
241 |
QuorumAIOCB *acb = sacb->parent; |
242 |
BDRVQuorumState *s = acb->common.bs->opaque; |
243 |
|
244 |
sacb->ret = ret; |
245 |
acb->count++; |
246 |
if (ret == 0) { |
247 |
acb->success_count++; |
248 |
} else {
|
249 |
quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret); |
250 |
} |
251 |
assert(acb->count <= s->num_children); |
252 |
assert(acb->success_count <= s->num_children); |
253 |
if (acb->count < s->num_children) {
|
254 |
return;
|
255 |
} |
256 |
|
257 |
/* Do the vote on read */
|
258 |
if (acb->is_read) {
|
259 |
quorum_vote(acb); |
260 |
} else {
|
261 |
quorum_has_too_much_io_failed(acb); |
262 |
} |
263 |
|
264 |
quorum_aio_finalize(acb); |
265 |
} |
266 |
|
267 |
static void quorum_report_bad_versions(BDRVQuorumState *s, |
268 |
QuorumAIOCB *acb, |
269 |
QuorumVoteValue *value) |
270 |
{ |
271 |
QuorumVoteVersion *version; |
272 |
QuorumVoteItem *item; |
273 |
|
274 |
QLIST_FOREACH(version, &acb->votes.vote_list, next) { |
275 |
if (acb->votes.compare(&version->value, value)) {
|
276 |
continue;
|
277 |
} |
278 |
QLIST_FOREACH(item, &version->items, next) { |
279 |
quorum_report_bad(acb, s->bs[item->index]->node_name, 0);
|
280 |
} |
281 |
} |
282 |
} |
283 |
|
284 |
/* Copy the contents of source into dest, element by element.
 * Both vectors must have the same number of elements, the same total size
 * and identical element lengths (asserted).
 */
static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
{
    int seg = 0;

    assert(dest->niov == source->niov);
    assert(dest->size == source->size);

    while (seg < source->niov) {
        assert(dest->iov[seg].iov_len == source->iov[seg].iov_len);
        memcpy(dest->iov[seg].iov_base,
               source->iov[seg].iov_base,
               source->iov[seg].iov_len);
        seg++;
    }
}
296 |
|
297 |
static void quorum_count_vote(QuorumVotes *votes, |
298 |
QuorumVoteValue *value, |
299 |
int index)
|
300 |
{ |
301 |
QuorumVoteVersion *v = NULL, *version = NULL; |
302 |
QuorumVoteItem *item; |
303 |
|
304 |
/* look if we have something with this hash */
|
305 |
QLIST_FOREACH(v, &votes->vote_list, next) { |
306 |
if (votes->compare(&v->value, value)) {
|
307 |
version = v; |
308 |
break;
|
309 |
} |
310 |
} |
311 |
|
312 |
/* It's a version not yet in the list add it */
|
313 |
if (!version) {
|
314 |
version = g_new0(QuorumVoteVersion, 1);
|
315 |
QLIST_INIT(&version->items); |
316 |
memcpy(&version->value, value, sizeof(version->value));
|
317 |
version->index = index; |
318 |
version->vote_count = 0;
|
319 |
QLIST_INSERT_HEAD(&votes->vote_list, version, next); |
320 |
} |
321 |
|
322 |
version->vote_count++; |
323 |
|
324 |
item = g_new0(QuorumVoteItem, 1);
|
325 |
item->index = index; |
326 |
QLIST_INSERT_HEAD(&version->items, item, next); |
327 |
} |
328 |
|
329 |
/* Free every version of the vote list together with all of its vote items,
 * leaving the list empty.
 */
static void quorum_free_vote_list(QuorumVotes *votes)
{
    QuorumVoteVersion *version, *version_tmp;
    QuorumVoteItem *voter, *voter_tmp;

    QLIST_FOREACH_SAFE(version, &votes->vote_list, next, version_tmp) {
        /* release the voters first, then the version that owned them */
        QLIST_FOREACH_SAFE(voter, &version->items, next, voter_tmp) {
            QLIST_REMOVE(voter, next);
            g_free(voter);
        }
        QLIST_REMOVE(version, next);
        g_free(version);
    }
}
343 |
|
344 |
/* Compute the SHA-256 digest of the data read from child i and store it in
 * hash.
 *
 * Returns the negative gnutls error code if initialisation or hashing failed,
 * otherwise the result of the last gnutls call made.
 */
static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash)
{
    QEMUIOVector *qiov = &acb->qcrs[i].qiov;
    gnutls_hash_hd_t dig;
    int seg = 0;
    int status;

    status = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256);
    if (status < 0) {
        return status;
    }

    /* feed every element of the vector, stopping at the first failure */
    while (seg < qiov->niov) {
        status = gnutls_hash(dig, qiov->iov[seg].iov_base,
                             qiov->iov[seg].iov_len);
        if (status < 0) {
            break;
        }
        seg++;
    }

    /* always release the handle; this also writes the digest into hash */
    gnutls_hash_deinit(dig, (void *) hash);
    return status;
}
366 |
|
367 |
/* Return the version with the highest vote count, or NULL when the list
 * holds no version with a positive count.  On a tie the version met first
 * while walking the list is kept.
 */
static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes)
{
    QuorumVoteVersion *best = NULL;
    QuorumVoteVersion *cursor;
    int best_count = 0;

    QLIST_FOREACH(cursor, &votes->vote_list, next) {
        if (cursor->vote_count <= best_count) {
            continue;
        }
        best = cursor;
        best_count = cursor->vote_count;
    }

    return best;
}
381 |
|
382 |
/* qemu_iovec_compare is handy for blkverify mode because it returns the first
|
383 |
* differing byte location. Yet it is handcoded to compare vectors one byte
|
384 |
* after another so it does not benefit from the libc SIMD optimizations.
|
385 |
* quorum_iovec_compare is written for speed and should be used in the non
|
386 |
* blkverify mode of quorum.
|
387 |
*/
|
388 |
static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) |
389 |
{ |
390 |
int i;
|
391 |
int result;
|
392 |
|
393 |
assert(a->niov == b->niov); |
394 |
for (i = 0; i < a->niov; i++) { |
395 |
assert(a->iov[i].iov_len == b->iov[i].iov_len); |
396 |
result = memcmp(a->iov[i].iov_base, |
397 |
b->iov[i].iov_base, |
398 |
a->iov[i].iov_len); |
399 |
if (result) {
|
400 |
return false; |
401 |
} |
402 |
} |
403 |
|
404 |
return true; |
405 |
} |
406 |
|
407 |
/* Print a fatal quorum error on stderr, prefixed with the sector range of the
 * request, then terminate the process with exit status 1.
 * fmt and the following arguments are printf-style.
 */
static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb,
                                          const char *fmt, ...)
{
    va_list args;

    fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ",
            acb->sector_num, acb->nb_sectors);

    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    va_end(args);

    fprintf(stderr, "\n");
    exit(1);
}
420 |
|
421 |
static bool quorum_compare(QuorumAIOCB *acb, |
422 |
QEMUIOVector *a, |
423 |
QEMUIOVector *b) |
424 |
{ |
425 |
BDRVQuorumState *s = acb->common.bs->opaque; |
426 |
ssize_t offset; |
427 |
|
428 |
/* This driver will replace blkverify in this particular case */
|
429 |
if (s->is_blkverify) {
|
430 |
offset = qemu_iovec_compare(a, b); |
431 |
if (offset != -1) { |
432 |
quorum_err(acb, "contents mismatch in sector %" PRId64,
|
433 |
acb->sector_num + |
434 |
(uint64_t)(offset / BDRV_SECTOR_SIZE)); |
435 |
} |
436 |
return true; |
437 |
} |
438 |
|
439 |
return quorum_iovec_compare(a, b);
|
440 |
} |
441 |
|
442 |
/* Do a vote to get the error code */
|
443 |
static int quorum_vote_error(QuorumAIOCB *acb) |
444 |
{ |
445 |
BDRVQuorumState *s = acb->common.bs->opaque; |
446 |
QuorumVoteVersion *winner = NULL;
|
447 |
QuorumVotes error_votes; |
448 |
QuorumVoteValue result_value; |
449 |
int i, ret = 0; |
450 |
bool error = false; |
451 |
|
452 |
QLIST_INIT(&error_votes.vote_list); |
453 |
error_votes.compare = quorum_64bits_compare; |
454 |
|
455 |
for (i = 0; i < s->num_children; i++) { |
456 |
ret = acb->qcrs[i].ret; |
457 |
if (ret) {
|
458 |
error = true;
|
459 |
result_value.l = ret; |
460 |
quorum_count_vote(&error_votes, &result_value, i); |
461 |
} |
462 |
} |
463 |
|
464 |
if (error) {
|
465 |
winner = quorum_get_vote_winner(&error_votes); |
466 |
ret = winner->value.l; |
467 |
} |
468 |
|
469 |
quorum_free_vote_list(&error_votes); |
470 |
|
471 |
return ret;
|
472 |
} |
473 |
|
474 |
/* Elect the data returned to the caller of a read.
 *
 * - If too many children failed, the error is already recorded: stop.
 * - If all successful reads are identical, copy the first one to the caller.
 * - Otherwise hash each successful read, elect the most represented version
 *   and copy it to the caller if it reaches the threshold (reporting the
 *   children holding another version); else record -EIO and report a failure.
 *
 * Errors are returned through acb->vote_ret.
 */
static void quorum_vote(QuorumAIOCB *acb)
{
    BDRVQuorumState *s = acb->common.bs->opaque;
    QuorumVoteVersion *winner;
    QuorumVoteValue hash;
    bool all_agree = true;
    int first, other, child, ret;

    if (quorum_has_too_much_io_failed(acb)) {
        return;
    }

    /* get the index of the first successful read */
    first = 0;
    while (first < s->num_children && acb->qcrs[first].ret) {
        first++;
    }

    assert(first < s->num_children);

    /* compare this read with all other successful reads stopping at quorum
     * failure
     */
    for (other = first + 1; other < s->num_children; other++) {
        if (acb->qcrs[other].ret) {
            continue;
        }
        all_agree = quorum_compare(acb, &acb->qcrs[first].qiov,
                                   &acb->qcrs[other].qiov);
        if (!all_agree) {
            break;
        }
    }

    /* Every successful read agrees */
    if (all_agree) {
        quorum_copy_qiov(acb->qiov, &acb->qcrs[first].qiov);
        return;
    }

    /* compute hashes for each successful read, also store indexes */
    for (child = 0; child < s->num_children; child++) {
        if (acb->qcrs[child].ret) {
            continue;
        }
        ret = quorum_compute_hash(acb, child, &hash);
        /* if ever the hash computation failed */
        if (ret < 0) {
            acb->vote_ret = ret;
            goto free_exit;
        }
        quorum_count_vote(&acb->votes, &hash, child);
    }

    /* vote to select the most represented version */
    winner = quorum_get_vote_winner(&acb->votes);

    if (winner->vote_count >= s->threshold) {
        /* we have a winner: copy it */
        quorum_copy_qiov(acb->qiov, &acb->qcrs[winner->index].qiov);

        /* some versions are bad print them */
        quorum_report_bad_versions(s, acb, &winner->value);
    } else {
        /* the winner count is smaller than threshold: the read fails */
        quorum_report_failure(acb);
        acb->vote_ret = -EIO;
    }

free_exit:
    /* free lists */
    quorum_free_vote_list(&acb->votes);
}
548 |
|
549 |
static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
|
550 |
int64_t sector_num, |
551 |
QEMUIOVector *qiov, |
552 |
int nb_sectors,
|
553 |
BlockDriverCompletionFunc *cb, |
554 |
void *opaque)
|
555 |
{ |
556 |
BDRVQuorumState *s = bs->opaque; |
557 |
QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, |
558 |
nb_sectors, cb, opaque); |
559 |
int i;
|
560 |
|
561 |
acb->is_read = true;
|
562 |
|
563 |
for (i = 0; i < s->num_children; i++) { |
564 |
acb->qcrs[i].buf = qemu_blockalign(s->bs[i], qiov->size); |
565 |
qemu_iovec_init(&acb->qcrs[i].qiov, qiov->niov); |
566 |
qemu_iovec_clone(&acb->qcrs[i].qiov, qiov, acb->qcrs[i].buf); |
567 |
} |
568 |
|
569 |
for (i = 0; i < s->num_children; i++) { |
570 |
bdrv_aio_readv(s->bs[i], sector_num, &acb->qcrs[i].qiov, nb_sectors, |
571 |
quorum_aio_cb, &acb->qcrs[i]); |
572 |
} |
573 |
|
574 |
return &acb->common;
|
575 |
} |
576 |
|
577 |
static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs,
|
578 |
int64_t sector_num, |
579 |
QEMUIOVector *qiov, |
580 |
int nb_sectors,
|
581 |
BlockDriverCompletionFunc *cb, |
582 |
void *opaque)
|
583 |
{ |
584 |
BDRVQuorumState *s = bs->opaque; |
585 |
QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors, |
586 |
cb, opaque); |
587 |
int i;
|
588 |
|
589 |
for (i = 0; i < s->num_children; i++) { |
590 |
acb->qcrs[i].aiocb = bdrv_aio_writev(s->bs[i], sector_num, qiov, |
591 |
nb_sectors, &quorum_aio_cb, |
592 |
&acb->qcrs[i]); |
593 |
} |
594 |
|
595 |
return &acb->common;
|
596 |
} |
597 |
|
598 |
/* Return the length shared by all children.
 *
 * A negative value returned by a child is passed through unchanged; -EIO is
 * returned when two children disagree on the length.
 */
static int64_t quorum_getlength(BlockDriverState *bs)
{
    BDRVQuorumState *s = bs->opaque;
    int64_t expected, length;
    int child;

    /* the first child provides the reference length */
    expected = bdrv_getlength(s->bs[0]);
    if (expected < 0) {
        return expected;
    }

    /* check that all files have the same length */
    for (child = 1; child < s->num_children; child++) {
        length = bdrv_getlength(s->bs[child]);
        if (length < 0) {
            return length;
        } else if (length != expected) {
            return -EIO;
        }
    }

    return expected;
}
621 |
|
622 |
/* Forward the cache invalidation to every child */
static void quorum_invalidate_cache(BlockDriverState *bs)
{
    BDRVQuorumState *s = bs->opaque;
    int child = 0;

    while (child < s->num_children) {
        bdrv_invalidate_cache(s->bs[child]);
        child++;
    }
}
631 |
|
632 |
/* Flush every child and return the most represented result.
 *
 * Each child's flush result (success included) counts as one vote; the value
 * with the highest vote count is returned.
 */
static coroutine_fn int quorum_co_flush(BlockDriverState *bs)
{
    BDRVQuorumState *s = bs->opaque;
    QuorumVotes flush_votes;
    QuorumVoteValue vote;
    QuorumVoteVersion *winner = NULL;
    int child;
    int result = 0;

    flush_votes.compare = quorum_64bits_compare;
    QLIST_INIT(&flush_votes.vote_list);

    for (child = 0; child < s->num_children; child++) {
        result = bdrv_co_flush(s->bs[child]);
        vote.l = result;
        quorum_count_vote(&flush_votes, &vote, child);
    }

    winner = quorum_get_vote_winner(&flush_votes);
    result = winner->value.l;

    quorum_free_vote_list(&flush_votes);

    return result;
}
657 |
|
658 |
/* Return true as soon as bdrv_recurse_is_first_non_filter() succeeds for
 * candidate on one of the children, false if it succeeds on none of them.
 */
static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs,
                                               BlockDriverState *candidate)
{
    BDRVQuorumState *s = bs->opaque;
    int child;

    for (child = 0; child < s->num_children; child++) {
        if (bdrv_recurse_is_first_non_filter(s->bs[child], candidate)) {
            return true;
        }
    }

    return false;
}
674 |
|
675 |
/* Block driver description of quorum: wires the callbacks implemented in
 * this file into the block layer.
 */
static BlockDriver bdrv_quorum = {
    .format_name        = "quorum",
    .protocol_name      = "quorum",

    .instance_size      = sizeof(BDRVQuorumState),

    .bdrv_co_flush_to_disk = quorum_co_flush,

    .bdrv_getlength     = quorum_getlength,

    .bdrv_aio_readv     = quorum_aio_readv,
    .bdrv_aio_writev    = quorum_aio_writev,
    .bdrv_invalidate_cache = quorum_invalidate_cache,

    .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter,
};
691 |
|
692 |
/* Register the quorum driver with the block layer */
static void bdrv_quorum_init(void)
{
    bdrv_register(&bdrv_quorum);
}
696 |
|
697 |
block_init(bdrv_quorum_init); |