Statistics
| Branch: | Revision:

root / posix-aio-compat.c @ d7585251

History | View | Annotate | Download (9.8 kB)

1 3c529d93 aliguori
/*
2 3c529d93 aliguori
 * QEMU posix-aio emulation
3 3c529d93 aliguori
 *
4 3c529d93 aliguori
 * Copyright IBM, Corp. 2008
5 3c529d93 aliguori
 *
6 3c529d93 aliguori
 * Authors:
7 3c529d93 aliguori
 *  Anthony Liguori   <aliguori@us.ibm.com>
8 3c529d93 aliguori
 *
9 3c529d93 aliguori
 * This work is licensed under the terms of the GNU GPL, version 2.  See
10 3c529d93 aliguori
 * the COPYING file in the top-level directory.
11 3c529d93 aliguori
 *
12 3c529d93 aliguori
 */
13 3c529d93 aliguori
14 221f715d aliguori
#include <sys/ioctl.h>
15 3c529d93 aliguori
#include <pthread.h>
16 3c529d93 aliguori
#include <unistd.h>
17 3c529d93 aliguori
#include <errno.h>
18 30525aff malc
#include <time.h>
19 8653c015 malc
#include <string.h>
20 8653c015 malc
#include <stdlib.h>
21 8653c015 malc
#include <stdio.h>
22 3c529d93 aliguori
#include "osdep.h"
23 f141eafe aliguori
#include "qemu-common.h"
24 3c529d93 aliguori
25 3c529d93 aliguori
#include "posix-aio-compat.h"
26 3c529d93 aliguori
27 3c529d93 aliguori
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
28 3c529d93 aliguori
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
29 3c529d93 aliguori
static pthread_t thread_id;
30 a8227a5a malc
static pthread_attr_t attr;
31 3c529d93 aliguori
static int max_threads = 64;
32 3c529d93 aliguori
static int cur_threads = 0;
33 3c529d93 aliguori
static int idle_threads = 0;
34 3c529d93 aliguori
static TAILQ_HEAD(, qemu_paiocb) request_list;
35 3c529d93 aliguori
36 ceb42de8 aliguori
#ifdef HAVE_PREADV
37 ceb42de8 aliguori
static int preadv_present = 1;
38 ceb42de8 aliguori
#else
39 ceb42de8 aliguori
static int preadv_present = 0;
40 ceb42de8 aliguori
#endif
41 ceb42de8 aliguori
42 8653c015 malc
/*
 * Print "<what> failed: <description of err>" to stderr, where err is an
 * errno-style error number, and then abort the process.
 */
static void die2(int err, const char *what)
{
    const char *reason = strerror(err);

    fprintf(stderr, "%s failed: %s\n", what, reason);
    abort();
}
47 8653c015 malc
48 8653c015 malc
/* Fatal-error helper for calls that report their failure through errno. */
static void die(const char *what)
{
    int code = errno;

    die2(code, what);
}
52 8653c015 malc
53 8653c015 malc
/* Acquire "mutex"; a non-zero pthread error code is treated as fatal. */
static void mutex_lock(pthread_mutex_t *mutex)
{
    int err = pthread_mutex_lock(mutex);

    if (err != 0) {
        die2(err, "pthread_mutex_lock");
    }
}
58 8653c015 malc
59 8653c015 malc
/* Release "mutex"; a non-zero pthread error code is treated as fatal. */
static void mutex_unlock(pthread_mutex_t *mutex)
{
    int err = pthread_mutex_unlock(mutex);

    if (err != 0) {
        die2(err, "pthread_mutex_unlock");
    }
}
64 8653c015 malc
65 8653c015 malc
/*
 * Wait on "cond" until it is signalled or the absolute deadline "ts" passes.
 *
 * Returns 0 on wake-up or ETIMEDOUT on timeout.  Every other pthread error
 * is reported through die2().
 */
static int cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
                           struct timespec *ts)
{
    int err = pthread_cond_timedwait(cond, mutex, ts);

    switch (err) {
    case 0:
    case ETIMEDOUT:
        break;
    default:
        die2(err, "pthread_cond_timedwait");
        break;
    }

    return err;
}
72 8653c015 malc
73 5d47e372 malc
/* Signal "cond"; a non-zero pthread error code is treated as fatal. */
static void cond_signal(pthread_cond_t *cond)
{
    int err = pthread_cond_signal(cond);

    if (err != 0) {
        die2(err, "pthread_cond_signal");
    }
}
78 8653c015 malc
79 8653c015 malc
/*
 * Start a new thread running start_routine(arg) with the given attributes,
 * storing its handle in *thread.  Failure to create the thread is fatal.
 */
static void thread_create(pthread_t *thread, pthread_attr_t *attr,
                          void *(*start_routine)(void*), void *arg)
{
    int err = pthread_create(thread, attr, start_routine, arg);

    if (err != 0) {
        die2(err, "pthread_create");
    }
}
85 8653c015 malc
86 f141eafe aliguori
static size_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb)
87 f141eafe aliguori
{
88 f141eafe aliguori
        int ret;
89 f141eafe aliguori
90 f141eafe aliguori
        ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
91 f141eafe aliguori
        if (ret == -1)
92 f141eafe aliguori
                return -errno;
93 f141eafe aliguori
        return ret;
94 f141eafe aliguori
}
95 f141eafe aliguori
96 ceb42de8 aliguori
/*
 * Thin wrappers around preadv()/pwritev().
 *
 * When the build does not provide them (HAVE_PREADV undefined) the wrappers
 * perform no I/O and return -ENOSYS directly, without touching errno.
 */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef HAVE_PREADV
    return preadv(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}

static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef HAVE_PREADV
    return pwritev(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}
125 ceb42de8 aliguori
126 f141eafe aliguori
/*
127 f141eafe aliguori
 * Check if we need to copy the data in the aiocb into a new
128 f141eafe aliguori
 * properly aligned buffer.
129 f141eafe aliguori
 */
130 f141eafe aliguori
static int aiocb_needs_copy(struct qemu_paiocb *aiocb)
131 f141eafe aliguori
{
132 f141eafe aliguori
    if (aiocb->aio_flags & QEMU_AIO_SECTOR_ALIGNED) {
133 f141eafe aliguori
        int i;
134 f141eafe aliguori
135 f141eafe aliguori
        for (i = 0; i < aiocb->aio_niov; i++)
136 f141eafe aliguori
            if ((uintptr_t) aiocb->aio_iov[i].iov_base % 512)
137 f141eafe aliguori
                return 1;
138 f141eafe aliguori
    }
139 f141eafe aliguori
140 f141eafe aliguori
    return 0;
141 f141eafe aliguori
}
142 f141eafe aliguori
143 ceb42de8 aliguori
static size_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
144 ceb42de8 aliguori
{
145 ceb42de8 aliguori
    size_t offset = 0;
146 ceb42de8 aliguori
    ssize_t len;
147 ceb42de8 aliguori
148 ceb42de8 aliguori
    do {
149 ceb42de8 aliguori
        if (aiocb->aio_type == QEMU_PAIO_WRITE)
150 ceb42de8 aliguori
            len = qemu_pwritev(aiocb->aio_fildes,
151 ceb42de8 aliguori
                               aiocb->aio_iov,
152 ceb42de8 aliguori
                               aiocb->aio_niov,
153 ceb42de8 aliguori
                               aiocb->aio_offset + offset);
154 ceb42de8 aliguori
         else
155 ceb42de8 aliguori
            len = qemu_preadv(aiocb->aio_fildes,
156 ceb42de8 aliguori
                              aiocb->aio_iov,
157 ceb42de8 aliguori
                              aiocb->aio_niov,
158 ceb42de8 aliguori
                              aiocb->aio_offset + offset);
159 ceb42de8 aliguori
    } while (len == -1 && errno == EINTR);
160 ceb42de8 aliguori
161 ceb42de8 aliguori
    if (len == -1)
162 ceb42de8 aliguori
        return -errno;
163 ceb42de8 aliguori
    return len;
164 ceb42de8 aliguori
}
165 ceb42de8 aliguori
166 f141eafe aliguori
static size_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
167 221f715d aliguori
{
168 221f715d aliguori
    size_t offset = 0;
169 f141eafe aliguori
    size_t len;
170 221f715d aliguori
171 221f715d aliguori
    while (offset < aiocb->aio_nbytes) {
172 f141eafe aliguori
         if (aiocb->aio_type == QEMU_PAIO_WRITE)
173 f141eafe aliguori
             len = pwrite(aiocb->aio_fildes,
174 f141eafe aliguori
                          (const char *)buf + offset,
175 f141eafe aliguori
                          aiocb->aio_nbytes - offset,
176 f141eafe aliguori
                          aiocb->aio_offset + offset);
177 f141eafe aliguori
         else
178 f141eafe aliguori
             len = pread(aiocb->aio_fildes,
179 f141eafe aliguori
                         buf + offset,
180 221f715d aliguori
                         aiocb->aio_nbytes - offset,
181 221f715d aliguori
                         aiocb->aio_offset + offset);
182 221f715d aliguori
183 f141eafe aliguori
         if (len == -1 && errno == EINTR)
184 f141eafe aliguori
             continue;
185 f141eafe aliguori
         else if (len == -1) {
186 f141eafe aliguori
             offset = -errno;
187 f141eafe aliguori
             break;
188 f141eafe aliguori
         } else if (len == 0)
189 f141eafe aliguori
             break;
190 f141eafe aliguori
191 f141eafe aliguori
         offset += len;
192 221f715d aliguori
    }
193 221f715d aliguori
194 221f715d aliguori
    return offset;
195 221f715d aliguori
}
196 221f715d aliguori
197 f141eafe aliguori
static size_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
198 221f715d aliguori
{
199 f141eafe aliguori
    size_t nbytes;
200 f141eafe aliguori
    char *buf;
201 f141eafe aliguori
202 ceb42de8 aliguori
    if (!aiocb_needs_copy(aiocb)) {
203 f141eafe aliguori
        /*
204 f141eafe aliguori
         * If there is just a single buffer, and it is properly aligned
205 f141eafe aliguori
         * we can just use plain pread/pwrite without any problems.
206 f141eafe aliguori
         */
207 ceb42de8 aliguori
        if (aiocb->aio_niov == 1)
208 ceb42de8 aliguori
             return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
209 ceb42de8 aliguori
210 ceb42de8 aliguori
        /*
211 ceb42de8 aliguori
         * We have more than one iovec, and all are properly aligned.
212 ceb42de8 aliguori
         *
213 ceb42de8 aliguori
         * Try preadv/pwritev first and fall back to linearizing the
214 ceb42de8 aliguori
         * buffer if it's not supported.
215 ceb42de8 aliguori
         */
216 ceb42de8 aliguori
        if (preadv_present) {
217 ceb42de8 aliguori
            nbytes = handle_aiocb_rw_vector(aiocb);
218 ceb42de8 aliguori
            if (nbytes == aiocb->aio_nbytes)
219 ceb42de8 aliguori
                return nbytes;
220 ceb42de8 aliguori
            if (nbytes < 0 && nbytes != -ENOSYS)
221 ceb42de8 aliguori
                return nbytes;
222 ceb42de8 aliguori
            preadv_present = 0;
223 ceb42de8 aliguori
        }
224 ceb42de8 aliguori
225 ceb42de8 aliguori
        /*
226 ceb42de8 aliguori
         * XXX(hch): short read/write.  no easy way to handle the reminder
227 ceb42de8 aliguori
         * using these interfaces.  For now retry using plain
228 ceb42de8 aliguori
         * pread/pwrite?
229 ceb42de8 aliguori
         */
230 f141eafe aliguori
    }
231 221f715d aliguori
232 f141eafe aliguori
    /*
233 f141eafe aliguori
     * Ok, we have to do it the hard way, copy all segments into
234 f141eafe aliguori
     * a single aligned buffer.
235 f141eafe aliguori
     */
236 f141eafe aliguori
    buf = qemu_memalign(512, aiocb->aio_nbytes);
237 f141eafe aliguori
    if (aiocb->aio_type == QEMU_PAIO_WRITE) {
238 f141eafe aliguori
        char *p = buf;
239 f141eafe aliguori
        int i;
240 f141eafe aliguori
241 f141eafe aliguori
        for (i = 0; i < aiocb->aio_niov; ++i) {
242 f141eafe aliguori
            memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
243 f141eafe aliguori
            p += aiocb->aio_iov[i].iov_len;
244 f141eafe aliguori
        }
245 f141eafe aliguori
    }
246 f141eafe aliguori
247 f141eafe aliguori
    nbytes = handle_aiocb_rw_linear(aiocb, buf);
248 f141eafe aliguori
    if (aiocb->aio_type != QEMU_PAIO_WRITE) {
249 f141eafe aliguori
        char *p = buf;
250 f141eafe aliguori
        size_t count = aiocb->aio_nbytes, copy;
251 f141eafe aliguori
        int i;
252 f141eafe aliguori
253 f141eafe aliguori
        for (i = 0; i < aiocb->aio_niov && count; ++i) {
254 f141eafe aliguori
            copy = count;
255 f141eafe aliguori
            if (copy > aiocb->aio_iov[i].iov_len)
256 f141eafe aliguori
                copy = aiocb->aio_iov[i].iov_len;
257 f141eafe aliguori
            memcpy(aiocb->aio_iov[i].iov_base, p, copy);
258 f141eafe aliguori
            p     += copy;
259 f141eafe aliguori
            count -= copy;
260 f141eafe aliguori
        }
261 f141eafe aliguori
    }
262 f141eafe aliguori
    qemu_vfree(buf);
263 f141eafe aliguori
264 f141eafe aliguori
    return nbytes;
265 221f715d aliguori
}
266 221f715d aliguori
267 3c529d93 aliguori
static void *aio_thread(void *unused)
268 3c529d93 aliguori
{
269 a8227a5a malc
    pid_t pid;
270 3c529d93 aliguori
    sigset_t set;
271 3c529d93 aliguori
272 a8227a5a malc
    pid = getpid();
273 a8227a5a malc
274 3c529d93 aliguori
    /* block all signals */
275 8653c015 malc
    if (sigfillset(&set)) die("sigfillset");
276 8653c015 malc
    if (sigprocmask(SIG_BLOCK, &set, NULL)) die("sigprocmask");
277 3c529d93 aliguori
278 3c529d93 aliguori
    while (1) {
279 3c529d93 aliguori
        struct qemu_paiocb *aiocb;
280 221f715d aliguori
        size_t ret = 0;
281 30525aff malc
        qemu_timeval tv;
282 30525aff malc
        struct timespec ts;
283 30525aff malc
284 30525aff malc
        qemu_gettimeofday(&tv);
285 30525aff malc
        ts.tv_sec = tv.tv_sec + 10;
286 30525aff malc
        ts.tv_nsec = 0;
287 3c529d93 aliguori
288 8653c015 malc
        mutex_lock(&lock);
289 3c529d93 aliguori
290 3c529d93 aliguori
        while (TAILQ_EMPTY(&request_list) &&
291 3c529d93 aliguori
               !(ret == ETIMEDOUT)) {
292 8653c015 malc
            ret = cond_timedwait(&cond, &lock, &ts);
293 3c529d93 aliguori
        }
294 3c529d93 aliguori
295 514f7a27 malc
        if (TAILQ_EMPTY(&request_list))
296 3c529d93 aliguori
            break;
297 3c529d93 aliguori
298 3c529d93 aliguori
        aiocb = TAILQ_FIRST(&request_list);
299 3c529d93 aliguori
        TAILQ_REMOVE(&request_list, aiocb, node);
300 3c529d93 aliguori
        aiocb->active = 1;
301 3c529d93 aliguori
        idle_threads--;
302 8653c015 malc
        mutex_unlock(&lock);
303 3c529d93 aliguori
304 221f715d aliguori
        switch (aiocb->aio_type) {
305 221f715d aliguori
        case QEMU_PAIO_READ:
306 221f715d aliguori
        case QEMU_PAIO_WRITE:
307 f141eafe aliguori
                ret = handle_aiocb_rw(aiocb);
308 221f715d aliguori
                break;
309 221f715d aliguori
        case QEMU_PAIO_IOCTL:
310 221f715d aliguori
                ret = handle_aiocb_ioctl(aiocb);
311 221f715d aliguori
                break;
312 221f715d aliguori
        default:
313 221f715d aliguori
                fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
314 221f715d aliguori
                ret = -EINVAL;
315 221f715d aliguori
                break;
316 221f715d aliguori
        }
317 3c529d93 aliguori
318 8653c015 malc
        mutex_lock(&lock);
319 221f715d aliguori
        aiocb->ret = ret;
320 3c529d93 aliguori
        idle_threads++;
321 8653c015 malc
        mutex_unlock(&lock);
322 3c529d93 aliguori
323 a8227a5a malc
        if (kill(pid, aiocb->ev_signo)) die("kill failed");
324 3c529d93 aliguori
    }
325 3c529d93 aliguori
326 3c529d93 aliguori
    idle_threads--;
327 3c529d93 aliguori
    cur_threads--;
328 8653c015 malc
    mutex_unlock(&lock);
329 3c529d93 aliguori
330 3c529d93 aliguori
    return NULL;
331 3c529d93 aliguori
}
332 3c529d93 aliguori
333 8653c015 malc
static void spawn_thread(void)
334 3c529d93 aliguori
{
335 3c529d93 aliguori
    cur_threads++;
336 3c529d93 aliguori
    idle_threads++;
337 8653c015 malc
    thread_create(&thread_id, &attr, aio_thread, NULL);
338 3c529d93 aliguori
}
339 3c529d93 aliguori
340 3c529d93 aliguori
int qemu_paio_init(struct qemu_paioinit *aioinit)
341 3c529d93 aliguori
{
342 a8227a5a malc
    int ret;
343 a8227a5a malc
344 a8227a5a malc
    ret = pthread_attr_init(&attr);
345 a8227a5a malc
    if (ret) die2(ret, "pthread_attr_init");
346 a8227a5a malc
347 a8227a5a malc
    ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
348 a8227a5a malc
    if (ret) die2(ret, "pthread_attr_setdetachstate");
349 a8227a5a malc
350 3c529d93 aliguori
    TAILQ_INIT(&request_list);
351 3c529d93 aliguori
352 3c529d93 aliguori
    return 0;
353 3c529d93 aliguori
}
354 3c529d93 aliguori
355 221f715d aliguori
static int qemu_paio_submit(struct qemu_paiocb *aiocb, int type)
356 3c529d93 aliguori
{
357 221f715d aliguori
    aiocb->aio_type = type;
358 3c529d93 aliguori
    aiocb->ret = -EINPROGRESS;
359 3c529d93 aliguori
    aiocb->active = 0;
360 8653c015 malc
    mutex_lock(&lock);
361 3c529d93 aliguori
    if (idle_threads == 0 && cur_threads < max_threads)
362 3c529d93 aliguori
        spawn_thread();
363 3c529d93 aliguori
    TAILQ_INSERT_TAIL(&request_list, aiocb, node);
364 8653c015 malc
    mutex_unlock(&lock);
365 5d47e372 malc
    cond_signal(&cond);
366 3c529d93 aliguori
367 3c529d93 aliguori
    return 0;
368 3c529d93 aliguori
}
369 3c529d93 aliguori
370 3c529d93 aliguori
int qemu_paio_read(struct qemu_paiocb *aiocb)
371 3c529d93 aliguori
{
372 221f715d aliguori
    return qemu_paio_submit(aiocb, QEMU_PAIO_READ);
373 3c529d93 aliguori
}
374 3c529d93 aliguori
375 3c529d93 aliguori
int qemu_paio_write(struct qemu_paiocb *aiocb)
376 3c529d93 aliguori
{
377 221f715d aliguori
    return qemu_paio_submit(aiocb, QEMU_PAIO_WRITE);
378 221f715d aliguori
}
379 221f715d aliguori
380 221f715d aliguori
int qemu_paio_ioctl(struct qemu_paiocb *aiocb)
381 221f715d aliguori
{
382 221f715d aliguori
    return qemu_paio_submit(aiocb, QEMU_PAIO_IOCTL);
383 3c529d93 aliguori
}
384 3c529d93 aliguori
385 3c529d93 aliguori
ssize_t qemu_paio_return(struct qemu_paiocb *aiocb)
386 3c529d93 aliguori
{
387 3c529d93 aliguori
    ssize_t ret;
388 3c529d93 aliguori
389 8653c015 malc
    mutex_lock(&lock);
390 3c529d93 aliguori
    ret = aiocb->ret;
391 8653c015 malc
    mutex_unlock(&lock);
392 3c529d93 aliguori
393 3c529d93 aliguori
    return ret;
394 3c529d93 aliguori
}
395 3c529d93 aliguori
396 3c529d93 aliguori
/*
 * Report the request's error state: the positive error number when its
 * result is negative, 0 otherwise.
 */
int qemu_paio_error(struct qemu_paiocb *aiocb)
{
    ssize_t ret = qemu_paio_return(aiocb);

    return (ret < 0) ? -ret : 0;
}
407 3c529d93 aliguori
408 3c529d93 aliguori
int qemu_paio_cancel(int fd, struct qemu_paiocb *aiocb)
409 3c529d93 aliguori
{
410 3c529d93 aliguori
    int ret;
411 3c529d93 aliguori
412 8653c015 malc
    mutex_lock(&lock);
413 3c529d93 aliguori
    if (!aiocb->active) {
414 3c529d93 aliguori
        TAILQ_REMOVE(&request_list, aiocb, node);
415 3c529d93 aliguori
        aiocb->ret = -ECANCELED;
416 3c529d93 aliguori
        ret = QEMU_PAIO_CANCELED;
417 3c529d93 aliguori
    } else if (aiocb->ret == -EINPROGRESS)
418 3c529d93 aliguori
        ret = QEMU_PAIO_NOTCANCELED;
419 3c529d93 aliguori
    else
420 3c529d93 aliguori
        ret = QEMU_PAIO_ALLDONE;
421 8653c015 malc
    mutex_unlock(&lock);
422 3c529d93 aliguori
423 3c529d93 aliguori
    return ret;
424 3c529d93 aliguori
}