Revision 47e6b251 posix-aio-compat.c
b/posix-aio-compat.c | ||
---|---|---|
28 | 28 |
#include "sysemu.h" |
29 | 29 |
#include "qemu-common.h" |
30 | 30 |
#include "trace.h" |
31 |
#include "thread-pool.h" |
|
31 | 32 |
#include "block_int.h" |
32 | 33 |
#include "iov.h" |
33 | 34 |
|
34 | 35 |
#include "block/raw-posix-aio.h" |
35 | 36 |
|
36 |
static void do_spawn_thread(void); |
|
37 |
|
|
38 | 37 |
struct qemu_paiocb { |
39 | 38 |
BlockDriverAIOCB common; |
40 | 39 |
int aio_fildes; |
... | ... | |
46 | 45 |
size_t aio_nbytes; |
47 | 46 |
#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ |
48 | 47 |
off_t aio_offset; |
49 |
|
|
50 |
QTAILQ_ENTRY(qemu_paiocb) node; |
|
51 | 48 |
int aio_type; |
52 |
ssize_t ret; |
|
53 |
int active; |
|
54 |
struct qemu_paiocb *next; |
|
55 | 49 |
}; |
56 | 50 |
|
57 |
typedef struct PosixAioState { |
|
58 |
int rfd, wfd; |
|
59 |
struct qemu_paiocb *first_aio; |
|
60 |
} PosixAioState; |
|
61 |
|
|
62 |
|
|
63 |
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; |
|
64 |
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER; |
|
65 |
static pthread_t thread_id; |
|
66 |
static pthread_attr_t attr; |
|
67 |
static int max_threads = 64; |
|
68 |
static int cur_threads = 0; |
|
69 |
static int idle_threads = 0; |
|
70 |
static int new_threads = 0; /* backlog of threads we need to create */ |
|
71 |
static int pending_threads = 0; /* threads created but not running yet */ |
|
72 |
static QEMUBH *new_thread_bh; |
|
73 |
static QTAILQ_HEAD(, qemu_paiocb) request_list; |
|
74 |
|
|
75 | 51 |
#ifdef CONFIG_PREADV |
76 | 52 |
static int preadv_present = 1; |
77 | 53 |
#else |
78 | 54 |
static int preadv_present = 0; |
79 | 55 |
#endif |
80 | 56 |
|
81 |
static void die2(int err, const char *what) |
|
82 |
{ |
|
83 |
fprintf(stderr, "%s failed: %s\n", what, strerror(err)); |
|
84 |
abort(); |
|
85 |
} |
|
86 |
|
|
87 |
static void die(const char *what) |
|
88 |
{ |
|
89 |
die2(errno, what); |
|
90 |
} |
|
91 |
|
|
92 |
static void mutex_lock(pthread_mutex_t *mutex) |
|
93 |
{ |
|
94 |
int ret = pthread_mutex_lock(mutex); |
|
95 |
if (ret) die2(ret, "pthread_mutex_lock"); |
|
96 |
} |
|
97 |
|
|
98 |
static void mutex_unlock(pthread_mutex_t *mutex) |
|
99 |
{ |
|
100 |
int ret = pthread_mutex_unlock(mutex); |
|
101 |
if (ret) die2(ret, "pthread_mutex_unlock"); |
|
102 |
} |
|
103 |
|
|
104 |
static int cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex, |
|
105 |
struct timespec *ts) |
|
106 |
{ |
|
107 |
int ret = pthread_cond_timedwait(cond, mutex, ts); |
|
108 |
if (ret && ret != ETIMEDOUT) die2(ret, "pthread_cond_timedwait"); |
|
109 |
return ret; |
|
110 |
} |
|
111 |
|
|
112 |
static void cond_signal(pthread_cond_t *cond) |
|
113 |
{ |
|
114 |
int ret = pthread_cond_signal(cond); |
|
115 |
if (ret) die2(ret, "pthread_cond_signal"); |
|
116 |
} |
|
117 |
|
|
118 |
static void thread_create(pthread_t *thread, pthread_attr_t *attr, |
|
119 |
void *(*start_routine)(void*), void *arg) |
|
120 |
{ |
|
121 |
int ret = pthread_create(thread, attr, start_routine, arg); |
|
122 |
if (ret) die2(ret, "pthread_create"); |
|
123 |
} |
|
124 |
|
|
125 | 57 |
static ssize_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb) |
126 | 58 |
{ |
127 | 59 |
int ret; |
... | ... | |
310 | 242 |
return nbytes; |
311 | 243 |
} |
312 | 244 |
|
313 |
static void posix_aio_notify_event(void); |
|
314 |
|
|
315 |
static void *aio_thread(void *unused) |
|
245 |
static int aio_worker(void *arg) |
|
316 | 246 |
{ |
317 |
mutex_lock(&lock); |
|
318 |
pending_threads--; |
|
319 |
mutex_unlock(&lock); |
|
320 |
do_spawn_thread(); |
|
321 |
|
|
322 |
while (1) { |
|
323 |
struct qemu_paiocb *aiocb; |
|
324 |
ssize_t ret = 0; |
|
325 |
qemu_timeval tv; |
|
326 |
struct timespec ts; |
|
327 |
|
|
328 |
qemu_gettimeofday(&tv); |
|
329 |
ts.tv_sec = tv.tv_sec + 10; |
|
330 |
ts.tv_nsec = 0; |
|
331 |
|
|
332 |
mutex_lock(&lock); |
|
333 |
|
|
334 |
while (QTAILQ_EMPTY(&request_list) && |
|
335 |
!(ret == ETIMEDOUT)) { |
|
336 |
idle_threads++; |
|
337 |
ret = cond_timedwait(&cond, &lock, &ts); |
|
338 |
idle_threads--; |
|
247 |
struct qemu_paiocb *aiocb = arg; |
|
248 |
ssize_t ret = 0; |
|
249 |
|
|
250 |
switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { |
|
251 |
case QEMU_AIO_READ: |
|
252 |
ret = handle_aiocb_rw(aiocb); |
|
253 |
if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->common.bs->growable) { |
|
254 |
/* A short read means that we have reached EOF. Pad the buffer |
|
255 |
* with zeros for bytes after EOF. */ |
|
256 |
iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, |
|
257 |
0, aiocb->aio_nbytes - ret); |
|
258 |
|
|
259 |
ret = aiocb->aio_nbytes; |
|
339 | 260 |
} |
340 |
|
|
341 |
if (QTAILQ_EMPTY(&request_list)) |
|
342 |
break; |
|
343 |
|
|
344 |
aiocb = QTAILQ_FIRST(&request_list); |
|
345 |
QTAILQ_REMOVE(&request_list, aiocb, node); |
|
346 |
aiocb->active = 1; |
|
347 |
mutex_unlock(&lock); |
|
348 |
|
|
349 |
switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { |
|
350 |
case QEMU_AIO_READ: |
|
351 |
ret = handle_aiocb_rw(aiocb); |
|
352 |
if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->common.bs->growable) { |
|
353 |
/* A short read means that we have reached EOF. Pad the buffer |
|
354 |
* with zeros for bytes after EOF. */ |
|
355 |
iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, |
|
356 |
0, aiocb->aio_nbytes - ret); |
|
357 |
|
|
358 |
ret = aiocb->aio_nbytes; |
|
359 |
} |
|
360 |
break; |
|
361 |
case QEMU_AIO_WRITE: |
|
362 |
ret = handle_aiocb_rw(aiocb); |
|
363 |
break; |
|
364 |
case QEMU_AIO_FLUSH: |
|
365 |
ret = handle_aiocb_flush(aiocb); |
|
366 |
break; |
|
367 |
case QEMU_AIO_IOCTL: |
|
368 |
ret = handle_aiocb_ioctl(aiocb); |
|
369 |
break; |
|
370 |
default: |
|
371 |
fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); |
|
261 |
if (ret == aiocb->aio_nbytes) { |
|
262 |
ret = 0; |
|
263 |
} else if (ret >= 0 && ret < aiocb->aio_nbytes) { |
|
372 | 264 |
ret = -EINVAL; |
373 |
break; |
|
374 | 265 |
} |
375 |
|
|
376 |
mutex_lock(&lock); |
|
377 |
aiocb->ret = ret; |
|
378 |
mutex_unlock(&lock); |
|
379 |
|
|
380 |
posix_aio_notify_event(); |
|
381 |
} |
|
382 |
|
|
383 |
cur_threads--; |
|
384 |
mutex_unlock(&lock); |
|
385 |
|
|
386 |
return NULL; |
|
387 |
} |
|
388 |
|
|
389 |
static void do_spawn_thread(void) |
|
390 |
{ |
|
391 |
sigset_t set, oldset; |
|
392 |
|
|
393 |
mutex_lock(&lock); |
|
394 |
if (!new_threads) { |
|
395 |
mutex_unlock(&lock); |
|
396 |
return; |
|
397 |
} |
|
398 |
|
|
399 |
new_threads--; |
|
400 |
pending_threads++; |
|
401 |
|
|
402 |
mutex_unlock(&lock); |
|
403 |
|
|
404 |
/* block all signals */ |
|
405 |
if (sigfillset(&set)) die("sigfillset"); |
|
406 |
if (sigprocmask(SIG_SETMASK, &set, &oldset)) die("sigprocmask"); |
|
407 |
|
|
408 |
thread_create(&thread_id, &attr, aio_thread, NULL); |
|
409 |
|
|
410 |
if (sigprocmask(SIG_SETMASK, &oldset, NULL)) die("sigprocmask restore"); |
|
411 |
} |
|
412 |
|
|
413 |
static void spawn_thread_bh_fn(void *opaque) |
|
414 |
{ |
|
415 |
do_spawn_thread(); |
|
416 |
} |
|
417 |
|
|
418 |
static void spawn_thread(void) |
|
419 |
{ |
|
420 |
cur_threads++; |
|
421 |
new_threads++; |
|
422 |
/* If there are threads being created, they will spawn new workers, so |
|
423 |
* we don't spend time creating many threads in a loop holding a mutex or |
|
424 |
* starving the current vcpu. |
|
425 |
* |
|
426 |
* If there are no idle threads, ask the main thread to create one, so we |
|
427 |
* inherit the correct affinity instead of the vcpu affinity. |
|
428 |
*/ |
|
429 |
if (!pending_threads) { |
|
430 |
qemu_bh_schedule(new_thread_bh); |
|
431 |
} |
|
432 |
} |
|
433 |
|
|
434 |
static void qemu_paio_submit(struct qemu_paiocb *aiocb) |
|
435 |
{ |
|
436 |
aiocb->ret = -EINPROGRESS; |
|
437 |
aiocb->active = 0; |
|
438 |
mutex_lock(&lock); |
|
439 |
if (idle_threads == 0 && cur_threads < max_threads) |
|
440 |
spawn_thread(); |
|
441 |
QTAILQ_INSERT_TAIL(&request_list, aiocb, node); |
|
442 |
mutex_unlock(&lock); |
|
443 |
cond_signal(&cond); |
|
444 |
} |
|
445 |
|
|
446 |
static ssize_t qemu_paio_return(struct qemu_paiocb *aiocb) |
|
447 |
{ |
|
448 |
ssize_t ret; |
|
449 |
|
|
450 |
mutex_lock(&lock); |
|
451 |
ret = aiocb->ret; |
|
452 |
mutex_unlock(&lock); |
|
453 |
|
|
454 |
return ret; |
|
455 |
} |
|
456 |
|
|
457 |
static int qemu_paio_error(struct qemu_paiocb *aiocb) |
|
458 |
{ |
|
459 |
ssize_t ret = qemu_paio_return(aiocb); |
|
460 |
|
|
461 |
if (ret < 0) |
|
462 |
ret = -ret; |
|
463 |
else |
|
464 |
ret = 0; |
|
465 |
|
|
466 |
return ret; |
|
467 |
} |
|
468 |
|
|
469 |
static void posix_aio_read(void *opaque) |
|
470 |
{ |
|
471 |
PosixAioState *s = opaque; |
|
472 |
struct qemu_paiocb *acb, **pacb; |
|
473 |
int ret; |
|
474 |
ssize_t len; |
|
475 |
|
|
476 |
/* read all bytes from signal pipe */ |
|
477 |
for (;;) { |
|
478 |
char bytes[16]; |
|
479 |
|
|
480 |
len = read(s->rfd, bytes, sizeof(bytes)); |
|
481 |
if (len == -1 && errno == EINTR) |
|
482 |
continue; /* try again */ |
|
483 |
if (len == sizeof(bytes)) |
|
484 |
continue; /* more to read */ |
|
485 | 266 |
break; |
486 |
} |
|
487 |
|
|
488 |
for(;;) { |
|
489 |
pacb = &s->first_aio; |
|
490 |
for(;;) { |
|
491 |
acb = *pacb; |
|
492 |
if (!acb) |
|
493 |
return; |
|
494 |
|
|
495 |
ret = qemu_paio_error(acb); |
|
496 |
if (ret == ECANCELED) { |
|
497 |
/* remove the request */ |
|
498 |
*pacb = acb->next; |
|
499 |
qemu_aio_release(acb); |
|
500 |
} else if (ret != EINPROGRESS) { |
|
501 |
/* end of aio */ |
|
502 |
if (ret == 0) { |
|
503 |
ret = qemu_paio_return(acb); |
|
504 |
if (ret == acb->aio_nbytes) |
|
505 |
ret = 0; |
|
506 |
else |
|
507 |
ret = -EINVAL; |
|
508 |
} else { |
|
509 |
ret = -ret; |
|
510 |
} |
|
511 |
|
|
512 |
trace_paio_complete(acb, acb->common.opaque, ret); |
|
513 |
|
|
514 |
/* remove the request */ |
|
515 |
*pacb = acb->next; |
|
516 |
/* call the callback */ |
|
517 |
acb->common.cb(acb->common.opaque, ret); |
|
518 |
qemu_aio_release(acb); |
|
519 |
break; |
|
520 |
} else { |
|
521 |
pacb = &acb->next; |
|
522 |
} |
|
523 |
} |
|
524 |
} |
|
525 |
} |
|
526 |
|
|
527 |
static int posix_aio_flush(void *opaque) |
|
528 |
{ |
|
529 |
PosixAioState *s = opaque; |
|
530 |
return !!s->first_aio; |
|
531 |
} |
|
532 |
|
|
533 |
static PosixAioState *posix_aio_state; |
|
534 |
|
|
535 |
static void posix_aio_notify_event(void) |
|
536 |
{ |
|
537 |
char byte = 0; |
|
538 |
ssize_t ret; |
|
539 |
|
|
540 |
ret = write(posix_aio_state->wfd, &byte, sizeof(byte)); |
|
541 |
if (ret < 0 && errno != EAGAIN) |
|
542 |
die("write()"); |
|
543 |
} |
|
544 |
|
|
545 |
static void paio_remove(struct qemu_paiocb *acb) |
|
546 |
{ |
|
547 |
struct qemu_paiocb **pacb; |
|
548 |
|
|
549 |
/* remove the callback from the queue */ |
|
550 |
pacb = &posix_aio_state->first_aio; |
|
551 |
for(;;) { |
|
552 |
if (*pacb == NULL) { |
|
553 |
fprintf(stderr, "paio_remove: aio request not found!\n"); |
|
554 |
break; |
|
555 |
} else if (*pacb == acb) { |
|
556 |
*pacb = acb->next; |
|
557 |
qemu_aio_release(acb); |
|
558 |
break; |
|
267 |
case QEMU_AIO_WRITE: |
|
268 |
ret = handle_aiocb_rw(aiocb); |
|
269 |
if (ret == aiocb->aio_nbytes) { |
|
270 |
ret = 0; |
|
271 |
} else if (ret >= 0 && ret < aiocb->aio_nbytes) { |
|
272 |
ret = -EINVAL; |
|
559 | 273 |
} |
560 |
pacb = &(*pacb)->next; |
|
561 |
} |
|
562 |
} |
|
563 |
|
|
564 |
static void paio_cancel(BlockDriverAIOCB *blockacb) |
|
565 |
{ |
|
566 |
struct qemu_paiocb *acb = (struct qemu_paiocb *)blockacb; |
|
567 |
int active = 0; |
|
568 |
|
|
569 |
trace_paio_cancel(acb, acb->common.opaque); |
|
570 |
|
|
571 |
mutex_lock(&lock); |
|
572 |
if (!acb->active) { |
|
573 |
QTAILQ_REMOVE(&request_list, acb, node); |
|
574 |
acb->ret = -ECANCELED; |
|
575 |
} else if (acb->ret == -EINPROGRESS) { |
|
576 |
active = 1; |
|
577 |
} |
|
578 |
mutex_unlock(&lock); |
|
579 |
|
|
580 |
if (active) { |
|
581 |
/* fail safe: if the aio could not be canceled, we wait for |
|
582 |
it */ |
|
583 |
while (qemu_paio_error(acb) == EINPROGRESS) |
|
584 |
; |
|
274 |
break; |
|
275 |
case QEMU_AIO_FLUSH: |
|
276 |
ret = handle_aiocb_flush(aiocb); |
|
277 |
break; |
|
278 |
case QEMU_AIO_IOCTL: |
|
279 |
ret = handle_aiocb_ioctl(aiocb); |
|
280 |
break; |
|
281 |
default: |
|
282 |
fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); |
|
283 |
ret = -EINVAL; |
|
284 |
break; |
|
585 | 285 |
} |
586 | 286 |
|
587 |
paio_remove(acb); |
|
287 |
qemu_aio_release(aiocb); |
|
288 |
return ret; |
|
588 | 289 |
} |
589 | 290 |
|
590 | 291 |
static AIOPool raw_aio_pool = { |
591 | 292 |
.aiocb_size = sizeof(struct qemu_paiocb), |
592 |
.cancel = paio_cancel, |
|
593 | 293 |
}; |
594 | 294 |
|
595 | 295 |
BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, |
... | ... | |
609 | 309 |
acb->aio_nbytes = nb_sectors * 512; |
610 | 310 |
acb->aio_offset = sector_num * 512; |
611 | 311 |
|
612 |
acb->next = posix_aio_state->first_aio; |
|
613 |
posix_aio_state->first_aio = acb; |
|
614 |
|
|
615 | 312 |
trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); |
616 |
qemu_paio_submit(acb); |
|
617 |
return &acb->common; |
|
313 |
return thread_pool_submit_aio(aio_worker, acb, cb, opaque); |
|
618 | 314 |
} |
619 | 315 |
|
620 | 316 |
BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, |
... | ... | |
630 | 326 |
acb->aio_ioctl_buf = buf; |
631 | 327 |
acb->aio_ioctl_cmd = req; |
632 | 328 |
|
633 |
acb->next = posix_aio_state->first_aio; |
|
634 |
posix_aio_state->first_aio = acb; |
|
635 |
|
|
636 |
qemu_paio_submit(acb); |
|
637 |
return &acb->common; |
|
638 |
} |
|
639 |
|
|
640 |
int paio_init(void) |
|
641 |
{ |
|
642 |
PosixAioState *s; |
|
643 |
int fds[2]; |
|
644 |
int ret; |
|
645 |
|
|
646 |
if (posix_aio_state) |
|
647 |
return 0; |
|
648 |
|
|
649 |
s = g_malloc(sizeof(PosixAioState)); |
|
650 |
|
|
651 |
s->first_aio = NULL; |
|
652 |
if (qemu_pipe(fds) == -1) { |
|
653 |
fprintf(stderr, "failed to create pipe\n"); |
|
654 |
g_free(s); |
|
655 |
return -1; |
|
656 |
} |
|
657 |
|
|
658 |
s->rfd = fds[0]; |
|
659 |
s->wfd = fds[1]; |
|
660 |
|
|
661 |
fcntl(s->rfd, F_SETFL, O_NONBLOCK); |
|
662 |
fcntl(s->wfd, F_SETFL, O_NONBLOCK); |
|
663 |
|
|
664 |
qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s); |
|
665 |
|
|
666 |
ret = pthread_attr_init(&attr); |
|
667 |
if (ret) |
|
668 |
die2(ret, "pthread_attr_init"); |
|
669 |
|
|
670 |
ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); |
|
671 |
if (ret) |
|
672 |
die2(ret, "pthread_attr_setdetachstate"); |
|
673 |
|
|
674 |
QTAILQ_INIT(&request_list); |
|
675 |
new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL); |
|
676 |
|
|
677 |
posix_aio_state = s; |
|
678 |
return 0; |
|
329 |
return thread_pool_submit_aio(aio_worker, acb, cb, opaque); |
|
679 | 330 |
} |
Also available in: Unified diff