/*
 * RDMA protocol and interfaces
 *
 * Copyright IBM, Corp. 2010-2013
 *
 * Authors:
 *  Michael R. Hines <mrhines@us.ibm.com>
 *  Jiuxing Liu <jl@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 *
 */
#include "qemu-common.h"
#include "migration/migration.h"
#include "migration/qemu-file.h"
#include "exec/cpu-common.h"
#include "qemu/main-loop.h"
#include "qemu/sockets.h"
#include "qemu/bitmap.h"
#include "block/coroutine.h"
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <string.h>
#include <rdma/rdma_cma.h>
//#define DEBUG_RDMA
//#define DEBUG_RDMA_VERBOSE
//#define DEBUG_RDMA_REALLY_VERBOSE

#ifdef DEBUG_RDMA
#define DPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#ifdef DEBUG_RDMA_VERBOSE
#define DDPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DDPRINTF(fmt, ...) \
    do { } while (0)
#endif

#ifdef DEBUG_RDMA_REALLY_VERBOSE
#define DDDPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DDDPRINTF(fmt, ...) \
    do { } while (0)
#endif

/*
 * Print an error on both the Monitor and the Log file.
 */
#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if (errp && (*(errp) == NULL)) { \
            error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)
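/*
 * Typical use (this exact pattern appears in qemu_rdma_resolve_host()
 * below):
 *
 *     ERROR(errp, "could not resolve address %s", rdma->host);
 */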

#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Do not merge data if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */

/*
 * This is only for non-live state being migrated.
 * Instead of RDMA_WRITE messages, we use RDMA_SEND
 * messages for that state, which requires a different
 * delivery design than main memory.
 */
#define RDMA_SEND_INCREMENT 32768

/*
 * Maximum size infiniband SEND message
 */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1
/*
 * Capabilities for negotiation.
 */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/*
 * Add the other flags above to this list of known capabilities
 * as they are introduced.
 */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
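/*
 * Illustrative example: a source that wants every chunk pinned up front
 * advertises { .version = RDMA_CONTROL_VERSION_CURRENT,
 * .flags = RDMA_CAPABILITY_PIN_ALL } during connection setup; the peer is
 * expected to compare those flags against known_capabilities and fail the
 * negotiation on any bit it does not recognize.
 */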

#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                fprintf(stderr, "RDMA is in an error state waiting for" \
                                " migration to abort!\n"); \
                rdma->error_reported = 1; \
            } \
            return rdma->error_state; \
        } \
    } while (0)
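/*
 * Note: the macro above expands to a statement, so it is meant to be
 * invoked as CHECK_ERROR_STATE(); at the top of functions that have an
 * "rdma" context in scope and return an int error code.
 */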

/*
 * A work request ID is 64-bits and we split up these bits
 * into 3 parts:
 *
 * bits 0-15 : type of control message, 2^16
 * bits 16-29: ram block index, 2^14
 * bits 30-63: ram block chunk number, 2^34
 *
 * The last two bit ranges are only used for RDMA writes,
 * in order to track their completion and potentially
 * also track unregistration status of the message.
 */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
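/*
 * Illustrative example: an RDMA write to chunk 5 of ram block 3 is tracked
 * with
 *
 *     wr_id = (5UL << RDMA_WRID_CHUNK_SHIFT) |
 *             (3UL << RDMA_WRID_BLOCK_SHIFT) |
 *             RDMA_WRID_RDMA_WRITE;
 *
 * qemu_rdma_make_wrid() below composes IDs this way, and qemu_rdma_poll()
 * decodes them again with the masks above.
 */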

/*
 * RDMA migration protocol:
 * 1. RDMA Writes (data messages, i.e. RAM)
 * 2. IB Send/Recv (control channel messages)
 */
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};

const char *wrid_desc[] = {
    [RDMA_WRID_NONE] = "NONE",
    [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
    [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
    [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
};

/*
 * Work request IDs for IB SEND messages only (not RDMA writes).
 * This is used by the migration protocol to transmit
 * control messages (such as device state and registration commands).
 *
 * We could use more WRs, but we have enough for now.
 */
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};

/*
 * SEND/RECV IB Control Messages.
 */
enum {
    RDMA_CONTROL_NONE = 0,
    RDMA_CONTROL_ERROR,
    RDMA_CONTROL_READY,               /* ready to receive */
    RDMA_CONTROL_QEMU_FILE,           /* QEMUFile-transmitted bytes */
    RDMA_CONTROL_RAM_BLOCKS_REQUEST,  /* RAMBlock synchronization */
    RDMA_CONTROL_RAM_BLOCKS_RESULT,   /* RAMBlock synchronization */
    RDMA_CONTROL_COMPRESS,            /* page contains repeat values */
    RDMA_CONTROL_REGISTER_REQUEST,    /* dynamic page registration */
    RDMA_CONTROL_REGISTER_RESULT,     /* key to use after registration */
    RDMA_CONTROL_REGISTER_FINISHED,   /* current iteration finished */
    RDMA_CONTROL_UNREGISTER_REQUEST,  /* dynamic UN-registration */
    RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
};

const char *control_desc[] = {
    [RDMA_CONTROL_NONE] = "NONE",
    [RDMA_CONTROL_ERROR] = "ERROR",
    [RDMA_CONTROL_READY] = "READY",
    [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
    [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
    [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
    [RDMA_CONTROL_COMPRESS] = "COMPRESS",
    [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
    [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
    [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
    [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
    [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
};

/*
 * Memory and MR structures used to represent an IB Send/Recv work request.
 * This is *not* used for RDMA writes, only IB Send/Recv.
 */
typedef struct {
    uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
    struct   ibv_mr *control_mr;               /* registration metadata */
    size_t   control_len;                      /* length of the message */
    uint8_t *control_curr;                     /* start of unconsumed bytes */
} RDMAWorkRequestData;

/*
 * Negotiate RDMA capabilities during connection-setup time.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
} RDMACapabilities;

static void caps_to_network(RDMACapabilities *cap)
{
    cap->version = htonl(cap->version);
    cap->flags = htonl(cap->flags);
}

static void network_to_caps(RDMACapabilities *cap)
{
    cap->version = ntohl(cap->version);
    cap->flags = ntohl(cap->flags);
}

/*
 * Representation of a RAMBlock from an RDMA perspective.
 * This is not transmitted, only local.
 * This and subsequent structures cannot be linked lists
 * because we're using a single IB message to transmit
 * the information. It's small anyway, so a list is overkill.
 */
typedef struct RDMALocalBlock {
    uint8_t  *local_host_addr;  /* local virtual address */
    uint64_t remote_host_addr;  /* remote virtual address */
    uint64_t offset;
    uint64_t length;
    struct   ibv_mr **pmr;      /* MRs for chunk-level registration */
    struct   ibv_mr *mr;        /* MR for non-chunk-level registration */
    uint32_t *remote_keys;      /* rkeys for chunk-level registration */
    uint32_t remote_rkey;       /* rkey for non-chunk-level registration */
    int      index;             /* which block are we */
    bool     is_ram_block;
    int      nb_chunks;
    unsigned long *transit_bitmap;
    unsigned long *unregister_bitmap;
} RDMALocalBlock;

/*
 * Also represents a RAMblock, but only on the dest.
 * This gets transmitted by the dest during connection-time
 * to the source VM and then is used to populate the
 * corresponding RDMALocalBlock with
 * the information needed to perform the actual RDMA.
 */
typedef struct QEMU_PACKED RDMARemoteBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;
} RDMARemoteBlock;
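/*
 * On the wire each RDMARemoteBlock is exactly 32 bytes (8 + 8 + 8 + 4 + 4);
 * the explicit padding field keeps the layout identical on both sides even
 * though the struct is QEMU_PACKED.
 */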

static uint64_t htonll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.lv[0] = htonl(v >> 32);
    u.lv[1] = htonl(v & 0xFFFFFFFFULL);
    return u.llv;
}

static uint64_t ntohll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.llv = v;
    return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
}
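/*
 * Example (illustrative): on a little-endian host,
 * htonll(0x0102030405060708ULL) == 0x0807060504030201ULL, so the bytes
 * 01 02 03 04 05 06 07 08 appear in that order on the wire; ntohll()
 * reverses the transformation.
 */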
static void remote_block_to_network(RDMARemoteBlock *rb)
{
    rb->remote_host_addr = htonll(rb->remote_host_addr);
    rb->offset = htonll(rb->offset);
    rb->length = htonll(rb->length);
    rb->remote_rkey = htonl(rb->remote_rkey);
}

static void network_to_remote_block(RDMARemoteBlock *rb)
{
    rb->remote_host_addr = ntohll(rb->remote_host_addr);
    rb->offset = ntohll(rb->offset);
    rb->length = ntohll(rb->length);
    rb->remote_rkey = ntohl(rb->remote_rkey);
}

/*
 * Virtual address of the above structures used for transmitting
 * the RAMBlock descriptions at connection-time.
 * This structure is *not* transmitted.
 */
typedef struct RDMALocalBlocks {
    int nb_blocks;
    bool     init;             /* main memory init complete */
    RDMALocalBlock *block;
} RDMALocalBlocks;

/*
 * Main data structure for RDMA state.
 * While there is only one copy of this structure being allocated right now,
 * this is the place where one would start if you wanted to consider
 * having more than one RDMA connection open at the same time.
 */
typedef struct RDMAContext {
    char *host;
    int port;

    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    /*
     * This is used by *_exchange_send() to figure out whether or not
     * the initial "READY" message has already been received or not.
     * This is because other functions may potentially poll() and detect
     * the READY message before send() does, in which case we need to
     * know if it completed.
     */
    int control_ready_expected;

    /* number of outstanding writes */
    int nb_sent;

    /* store info about current buffer so that we can
       merge it with future sends */
    uint64_t current_addr;
    uint64_t current_length;
    /* index of ram block the current buffer belongs to */
    int current_index;
    /* index of the chunk in the current ram block */
    int current_chunk;

    bool pin_all;

    /*
     * infiniband-specific variables for opening the device
     * and maintaining connection state and so forth.
     *
     * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
     * cm_id->verbs, cm_id->channel, and cm_id->qp.
     */
    struct rdma_cm_id *cm_id;               /* connection manager ID */
    struct rdma_cm_id *listen_id;
    bool connected;

    struct ibv_context *verbs;
    struct rdma_event_channel *channel;
    struct ibv_qp *qp;                      /* queue pair */
    struct ibv_comp_channel *comp_channel;  /* completion channel */
    struct ibv_pd *pd;                      /* protection domain */
    struct ibv_cq *cq;                      /* completion queue */

    /*
     * If a previous write failed (perhaps because of a failed
     * memory registration), then do not attempt any future work
     * and remember the error state.
     */
    int error_state;
    int error_reported;

    /*
     * Description of ram blocks used throughout the code.
     */
    RDMALocalBlocks local_ram_blocks;
    RDMARemoteBlock *block;

    /*
     * Migration on *destination* started.
     * Then use coroutine yield function.
     * Source runs in a thread, so we don't care.
     */
    int migration_started_on_destination;

    int total_registrations;
    int total_writes;

    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    GHashTable *blockmap;
} RDMAContext;

/*
 * Interface to the rest of the migration call stack.
 */
typedef struct QEMUFileRDMA {
    RDMAContext *rdma;
    size_t len;
    void *file;
} QEMUFileRDMA;

/*
 * Main structure for IB Send/Recv control messages.
 * This gets prepended at the beginning of every Send/Recv.
 */
typedef struct QEMU_PACKED {
    uint32_t len;     /* Total length of data portion */
    uint32_t type;    /* which control command to perform */
    uint32_t repeat;  /* number of commands in data portion of same type */
    uint32_t padding;
} RDMAControlHeader;

static void control_to_network(RDMAControlHeader *control)
{
    control->type = htonl(control->type);
    control->len = htonl(control->len);
    control->repeat = htonl(control->repeat);
}

static void network_to_control(RDMAControlHeader *control)
{
    control->type = ntohl(control->type);
    control->len = ntohl(control->len);
    control->repeat = ntohl(control->repeat);
}
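/*
 * Illustrative wire view: a control message carrying 16 bytes of data is
 * sent as this 16-byte header (four uint32_t fields) in network byte order,
 * with .len = 16, immediately followed by the 16 data bytes within the same
 * IB SEND (see qemu_rdma_post_send_control() below).
 */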
/*
 * Register a single Chunk.
 * Information sent by the source VM to inform the dest
 * to register a single chunk of memory before we can perform
 * the actual RDMA operation.
 */
typedef struct QEMU_PACKED {
    union QEMU_PACKED {
        uint64_t current_addr;  /* offset into the ramblock of the chunk */
        uint64_t chunk;         /* chunk to lookup if unregistering */
    } key;
    uint32_t current_index;     /* which ramblock the chunk belongs to */
    uint32_t padding;
    uint64_t chunks;            /* how many sequential chunks to register */
} RDMARegister;

static void register_to_network(RDMARegister *reg)
{
    reg->key.current_addr = htonll(reg->key.current_addr);
    reg->current_index = htonl(reg->current_index);
    reg->chunks = htonll(reg->chunks);
}

static void network_to_register(RDMARegister *reg)
{
    reg->key.current_addr = ntohll(reg->key.current_addr);
    reg->current_index = ntohl(reg->current_index);
    reg->chunks = ntohll(reg->chunks);
}

typedef struct QEMU_PACKED {
    uint32_t value;     /* if zero, we will madvise() */
    uint32_t block_idx; /* which ram block index */
    uint64_t offset;    /* where in the remote ramblock this chunk starts */
    uint64_t length;    /* length of the chunk */
} RDMACompress;
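/*
 * Illustrative use: a page whose bytes are all one repeated value is
 * announced with a COMPRESS message instead of being written over RDMA;
 * per the field comments above, a value of zero lets the destination
 * madvise() the region rather than fill it.
 */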

static void compress_to_network(RDMACompress *comp)
{
    comp->value = htonl(comp->value);
    comp->block_idx = htonl(comp->block_idx);
    comp->offset = htonll(comp->offset);
    comp->length = htonll(comp->length);
}

static void network_to_compress(RDMACompress *comp)
{
    comp->value = ntohl(comp->value);
    comp->block_idx = ntohl(comp->block_idx);
    comp->offset = ntohll(comp->offset);
    comp->length = ntohll(comp->length);
}

/*
 * The result of the dest's memory registration produces an "rkey"
 * which the source VM must reference in order to perform
 * the RDMA operation.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;

static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
}

static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
}

const char *print_wrid(int wrid);
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma));
static inline uint64_t ram_chunk_index(const uint8_t *start,
                                       const uint8_t *host)
{
    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
}

static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
                                       uint64_t i)
{
    return (uint8_t *) (((uintptr_t) rdma_ram_block->local_host_addr)
                                    + (i << RDMA_REG_CHUNK_SHIFT));
}

static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
                                     uint64_t i)
{
    uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
                         (1UL << RDMA_REG_CHUNK_SHIFT);

    if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
        result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
    }

    return result;
}
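/*
 * Worked example (illustrative): with RDMA_REG_CHUNK_SHIFT == 20, chunks
 * are 1 MB, so a page at byte offset 0x2A0000 within a block falls in
 * chunk 0x2A0000 >> 20 == 2, which covers
 * [local_host_addr + 2 MB, local_host_addr + 3 MB), clamped to the end of
 * the block by ram_chunk_end() above.
 */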

static int __qemu_rdma_add_block(RDMAContext *rdma, void *host_addr,
                                 ram_addr_t block_offset, uint64_t length)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
                                                (void *) block_offset);
    RDMALocalBlock *old = local->block;

    assert(block == NULL);

    local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1));

    if (local->nb_blocks) {
        int x;

        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_remove(rdma->blockmap, (void *)old[x].offset);
            g_hash_table_insert(rdma->blockmap, (void *)old[x].offset,
                                &local->block[x]);
        }
        memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
        g_free(old);
    }

    block = &local->block[local->nb_blocks];

    block->local_host_addr = host_addr;
    block->offset = block_offset;
    block->length = length;
    block->index = local->nb_blocks;
    block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
    block->transit_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
    block->unregister_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
    block->remote_keys = g_malloc0(block->nb_chunks * sizeof(uint32_t));

    block->is_ram_block = local->init ? false : true;

    g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);

    DDPRINTF("Added Block: %d, addr: %" PRIu64 ", offset: %" PRIu64
           " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",
            local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,
            block->length, (uint64_t) (block->local_host_addr + block->length),
            BITS_TO_LONGS(block->nb_chunks) *
                sizeof(unsigned long) * 8, block->nb_chunks);

    local->nb_blocks++;

    return 0;
}

/*
 * Memory regions need to be registered with the device and queue pairs set up
 * in advance before the migration starts. This tells us where the RAM blocks
 * are so that we can register them individually.
 */
static void qemu_rdma_init_one_block(void *host_addr,
    ram_addr_t block_offset, ram_addr_t length, void *opaque)
{
    __qemu_rdma_add_block(opaque, host_addr, block_offset, length);
}

/*
 * Identify the RAMBlocks and their quantity. They will be used as references
 * to identify chunk boundaries inside each RAMBlock and will also be
 * referenced during dynamic page registration.
 */
static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;

    assert(rdma->blockmap == NULL);
    rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
    memset(local, 0, sizeof *local);
    qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
    DPRINTF("Allocated %d local ram block structures\n", local->nb_blocks);
    rdma->block = (RDMARemoteBlock *) g_malloc0(sizeof(RDMARemoteBlock) *
                    rdma->local_ram_blocks.nb_blocks);
    local->init = true;
    return 0;
}

static int __qemu_rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
                                                (void *) block_offset);
    RDMALocalBlock *old = local->block;
    int x;

    assert(block);

    if (block->pmr) {
        int j;

        for (j = 0; j < block->nb_chunks; j++) {
            if (!block->pmr[j]) {
                continue;
            }
            ibv_dereg_mr(block->pmr[j]);
            rdma->total_registrations--;
        }
        g_free(block->pmr);
        block->pmr = NULL;
    }

    if (block->mr) {
        ibv_dereg_mr(block->mr);
        rdma->total_registrations--;
        block->mr = NULL;
    }

    g_free(block->transit_bitmap);
    block->transit_bitmap = NULL;

    g_free(block->unregister_bitmap);
    block->unregister_bitmap = NULL;

    g_free(block->remote_keys);
    block->remote_keys = NULL;

    for (x = 0; x < local->nb_blocks; x++) {
        g_hash_table_remove(rdma->blockmap, (void *)old[x].offset);
    }

    if (local->nb_blocks > 1) {

        local->block = g_malloc0(sizeof(RDMALocalBlock) *
                                    (local->nb_blocks - 1));

        if (block->index) {
            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
        }

        if (block->index < (local->nb_blocks - 1)) {
            memcpy(local->block + block->index, old + (block->index + 1),
                sizeof(RDMALocalBlock) *
                    (local->nb_blocks - (block->index + 1)));
        }
    } else {
        assert(block == local->block);
        local->block = NULL;
    }

    DDPRINTF("Deleted Block: %d, addr: %" PRIu64 ", offset: %" PRIu64
           " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",
            local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,
            block->length, (uint64_t) (block->local_host_addr + block->length),
            BITS_TO_LONGS(block->nb_chunks) *
                sizeof(unsigned long) * 8, block->nb_chunks);

    g_free(old);

    local->nb_blocks--;

    if (local->nb_blocks) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap, (void *)local->block[x].offset,
                                &local->block[x]);
        }
    }

    return 0;
}

/*
 * Put in the log file which RDMA device was opened and the details
 * associated with that device.
 */
static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
{
    struct ibv_port_attr port;

    if (ibv_query_port(verbs, 1, &port)) {
        fprintf(stderr, "FAILED TO QUERY PORT INFORMATION!\n");
        return;
    }

    printf("%s RDMA Device opened: kernel name %s "
           "uverbs device name %s, "
           "infiniband_verbs class device path %s, "
           "infiniband class device path %s, "
           "transport: (%d) %s\n",
                who,
                verbs->device->name,
                verbs->device->dev_name,
                verbs->device->dev_path,
                verbs->device->ibdev_path,
                port.link_layer,
                (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
                 ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
                    ? "Ethernet" : "Unknown"));
}

/*
 * Put in the log file the RDMA gid addressing information,
 * useful for folks who have trouble understanding the
 * RDMA device hierarchy in the kernel.
 */
static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
{
    char sgid[33];
    char dgid[33];
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
    DPRINTF("%s Source GID: %s, Dest GID: %s\n", who, sgid, dgid);
}

/*
 * As of now, IPv6 over RoCE / iWARP is not supported by linux.
 * We will try the next addrinfo struct, and fail if there are
 * no other valid addresses to bind against.
 *
 * If the user is listening on '[::]', then we will not have opened a device
 * yet and have no way of verifying if the device is RoCE or not.
 *
 * In this case, the source VM will throw an error for ALL types of
 * connections (both IPv4 and IPv6) if the destination machine does not have
 * a regular infiniband network available for use.
 *
 * The only way to guarantee that an error is thrown for broken kernels is
 * for the management software to choose a *specific* interface at bind time
 * and validate what type of hardware it is.
 *
 * Unfortunately, this puts the user in a fix:
 *
 * If the source VM connects with an IPv4 address without knowing that the
 * destination has bound to '[::]' the migration will unconditionally fail
 * unless the management software is explicitly listening on the IPv4
 * address while using a RoCE-based device.
 *
 * If the source VM connects with an IPv6 address, then we're OK because we can
 * throw an error on the source (and similarly on the destination).
 *
 * But in mixed environments, this will be broken for a while until it is fixed
 * inside linux.
 *
 * We do provide a *tiny* bit of help in this function: We can list all of the
 * devices in the system and check to see if all the devices are RoCE or
 * Infiniband.
 *
 * If we detect that we have a *pure* RoCE environment, then we can safely
 * throw an error even if the management software has specified '[::]' as the
 * bind address.
 *
 * However, if there are multiple heterogeneous devices, then we cannot make
 * this assumption and the user just has to be sure they know what they are
 * doing.
 *
 * Patches are being reviewed on linux-rdma.
 */
static int qemu_rdma_broken_ipv6_kernel(Error **errp, struct ibv_context *verbs)
{
    struct ibv_port_attr port_attr;

    /* This bug only exists in linux, to our knowledge. */
#ifdef CONFIG_LINUX

    /*
     * Verbs are only NULL if management has bound to '[::]'.
     *
     * Let's iterate through all the devices and see if there are any pure IB
     * devices (non-ethernet).
     *
     * If not, then we can safely proceed with the migration.
     * Otherwise, there are no guarantees until the bug is fixed in linux.
     */
    if (!verbs) {
        int num_devices, x;
        struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
        bool roce_found = false;
        bool ib_found = false;

        for (x = 0; x < num_devices; x++) {
            verbs = ibv_open_device(dev_list[x]);

            if (ibv_query_port(verbs, 1, &port_attr)) {
                ibv_close_device(verbs);
                ibv_free_device_list(dev_list);
                ERROR(errp, "Could not query initial IB port");
                return -EINVAL;
            }

            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
                ib_found = true;
            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
                roce_found = true;
            }

            ibv_close_device(verbs);

        }

        /* release the device list allocated by ibv_get_device_list() */
        ibv_free_device_list(dev_list);

        if (roce_found) {
            if (ib_found) {
                fprintf(stderr, "WARN: migrations may fail:"
                                " IPv6 over RoCE / iWARP in linux"
                                " is broken. But since you appear to have a"
                                " mixed RoCE / IB environment, be sure to only"
                                " migrate over the IB fabric until the kernel "
                                " fixes the bug.\n");
            } else {
                ERROR(errp, "You only have RoCE / iWARP devices in your systems"
                            " and your management software has specified '[::]'"
                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
                return -ENONET;
            }
        }

        return 0;
    }

    /*
     * If we have a verbs context, that means that some address other than
     * '[::]' was used by the management software for binding. In which case
     * we can actually warn the user about a potentially broken kernel.
     */

    /* IB ports start with 1, not 0 */
    if (ibv_query_port(verbs, 1, &port_attr)) {
        ERROR(errp, "Could not query initial IB port");
        return -EINVAL;
    }

    if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
        ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
                    "(but patches on linux-rdma in progress)");
        return -ENONET;
    }

#endif

    return 0;
}

/*
 * Figure out which RDMA device corresponds to the requested IP hostname
 * Also create the initial connection manager identifiers for opening
 * the connection.
 */
static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
{
    int ret;
    struct rdma_addrinfo *res;
    char port_str[16];
    struct rdma_cm_event *cm_event;
    char ip[40] = "unknown";
    struct rdma_addrinfo *e;

    if (rdma->host == NULL || !strcmp(rdma->host, "")) {
        ERROR(errp, "RDMA hostname has not been set");
        return -EINVAL;
    }

    /* create CM channel */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create CM channel");
        return -EINVAL;
    }

    /* create CM id */
    ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create channel id");
        goto err_resolve_create_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret < 0) {
        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
        goto err_resolve_get_addr;
    }

    for (e = res; e != NULL; e = e->ai_next) {
        inet_ntop(e->ai_family,
            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
        DPRINTF("Trying %s => %s\n", rdma->host, ip);

        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
                RDMA_RESOLVE_TIMEOUT_MS);
        if (!ret) {
            if (e->ai_family == AF_INET6) {
                ret = qemu_rdma_broken_ipv6_kernel(errp, rdma->cm_id->verbs);
                if (ret) {
                    continue;
                }
            }
            goto route;
        }
    }

    ERROR(errp, "could not resolve address %s", rdma->host);
    goto err_resolve_get_addr;

route:
    qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_addr_resolved");
        goto err_resolve_get_addr;
    }

    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
        ERROR(errp, "result not equal to event_addr_resolved %s",
                rdma_event_str(cm_event->event));
        perror("rdma_resolve_addr");
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);

    /* resolve route */
    ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
    if (ret) {
        ERROR(errp, "could not resolve rdma route");
        goto err_resolve_get_addr;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_route_resolved");
        goto err_resolve_get_addr;
    }
    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
        ERROR(errp, "result not equal to event_route_resolved: %s",
                rdma_event_str(cm_event->event));
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);
    rdma->verbs = rdma->cm_id->verbs;
    qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
    qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
    return 0;

err_resolve_get_addr:
    rdma_destroy_id(rdma->cm_id);
    rdma->cm_id = NULL;
err_resolve_create_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    return ret;
}

/*
 * Create protection domain and completion queues
 */
static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
{
    /* allocate pd */
    rdma->pd = ibv_alloc_pd(rdma->verbs);
    if (!rdma->pd) {
        fprintf(stderr, "failed to allocate protection domain\n");
        return -1;
    }

    /* create completion channel */
    rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->comp_channel) {
        fprintf(stderr, "failed to allocate completion channel\n");
        goto err_alloc_pd_cq;
    }

    /*
     * Completion queue can be filled by both read and write work requests,
     * so must reflect the sum of both possible queue sizes.
     */
    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
            NULL, rdma->comp_channel, 0);
    if (!rdma->cq) {
        fprintf(stderr, "failed to allocate completion queue\n");
        goto err_alloc_pd_cq;
    }

    return 0;

err_alloc_pd_cq:
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
    }
    if (rdma->comp_channel) {
        ibv_destroy_comp_channel(rdma->comp_channel);
    }
    rdma->pd = NULL;
    rdma->comp_channel = NULL;
    return -1;

}

/*
 * Create queue pairs.
 */
static int qemu_rdma_alloc_qp(RDMAContext *rdma)
{
    struct ibv_qp_init_attr attr = { 0 };
    int ret;

    attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
    attr.cap.max_recv_wr = 3;
    attr.cap.max_send_sge = 1;
    attr.cap.max_recv_sge = 1;
    attr.send_cq = rdma->cq;
    attr.recv_cq = rdma->cq;
    attr.qp_type = IBV_QPT_RC;

    ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
    if (ret) {
        return -1;
    }

    rdma->qp = rdma->cm_id->qp;
    return 0;
}

static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
{
    int i;
    RDMALocalBlocks *local = &rdma->local_ram_blocks;

    for (i = 0; i < local->nb_blocks; i++) {
        local->block[i].mr =
            ibv_reg_mr(rdma->pd,
                    local->block[i].local_host_addr,
                    local->block[i].length,
                    IBV_ACCESS_LOCAL_WRITE |
                    IBV_ACCESS_REMOTE_WRITE
                    );
        if (!local->block[i].mr) {
            perror("Failed to register local dest ram block!");
            break;
        }
        rdma->total_registrations++;
    }

    if (i >= local->nb_blocks) {
        return 0;
    }

    for (i--; i >= 0; i--) {
        ibv_dereg_mr(local->block[i].mr);
        rdma->total_registrations--;
    }

    return -1;

}
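/*
 * Note: this is the 'rdma-pin-all' path; every RAMBlock gets one big MR up
 * front.  When pin_all is not requested, registration instead happens one
 * chunk at a time via qemu_rdma_register_and_get_keys() below.
 */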

/*
 * Find the ram block that corresponds to the page requested to be
 * transmitted by QEMU.
 *
 * Once the block is found, also identify which 'chunk' within that
 * block that the page belongs to.
 *
 * This search cannot fail or the migration will fail.
 */
static int qemu_rdma_search_ram_block(RDMAContext *rdma,
                                      uint64_t block_offset,
                                      uint64_t offset,
                                      uint64_t length,
                                      uint64_t *block_index,
                                      uint64_t *chunk_index)
{
    uint64_t current_addr = block_offset + offset;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
                                                (void *) block_offset);
    assert(block);
    assert(current_addr >= block->offset);
    assert((current_addr + length) <= (block->offset + block->length));

    *block_index = block->index;
    *chunk_index = ram_chunk_index(block->local_host_addr,
                block->local_host_addr + (current_addr - block->offset));

    return 0;
}

/*
 * Register a chunk with IB. If the chunk was already registered
 * previously, then skip.
 *
 * Also return the keys associated with the registration needed
 * to perform the actual RDMA operation.
 */
static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
        RDMALocalBlock *block, uint8_t *host_addr,
        uint32_t *lkey, uint32_t *rkey, int chunk,
        uint8_t *chunk_start, uint8_t *chunk_end)
{
    if (block->mr) {
        if (lkey) {
            *lkey = block->mr->lkey;
        }
        if (rkey) {
            *rkey = block->mr->rkey;
        }
        return 0;
    }

    /* allocate memory to store chunk MRs */
    if (!block->pmr) {
        block->pmr = g_malloc0(block->nb_chunks * sizeof(struct ibv_mr *));
        if (!block->pmr) {
            return -1;
        }
    }

    /*
     * If 'rkey', then we're the destination, so grant access to the source.
     *
     * If 'lkey', then we're the source VM, so grant access only to ourselves.
     */
    if (!block->pmr[chunk]) {
        uint64_t len = chunk_end - chunk_start;

        DDPRINTF("Registering %" PRIu64 " bytes @ %p\n",
                 len, chunk_start);

        block->pmr[chunk] = ibv_reg_mr(rdma->pd,
                chunk_start, len,
                (rkey ? (IBV_ACCESS_LOCAL_WRITE |
                        IBV_ACCESS_REMOTE_WRITE) : 0));

        if (!block->pmr[chunk]) {
            perror("Failed to register chunk!");
            fprintf(stderr, "Chunk details: block: %d chunk index %d"
                            " start %" PRIu64 " end %" PRIu64 " host %" PRIu64
                            " local %" PRIu64 " registrations: %d\n",
                            block->index, chunk, (uint64_t) chunk_start,
                            (uint64_t) chunk_end, (uint64_t) host_addr,
                            (uint64_t) block->local_host_addr,
                            rdma->total_registrations);
            return -1;
        }
        rdma->total_registrations++;
    }

    if (lkey) {
        *lkey = block->pmr[chunk]->lkey;
    }
    if (rkey) {
        *rkey = block->pmr[chunk]->rkey;
    }
    return 0;
}

/*
 * Register (at connection time) the memory used for control
 * channel messages.
 */
static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
{
    rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
            rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
            IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
    if (rdma->wr_data[idx].control_mr) {
        rdma->total_registrations++;
        return 0;
    }
    fprintf(stderr, "qemu_rdma_reg_control failed!\n");
    return -1;
}

const char *print_wrid(int wrid)
{
    if (wrid >= RDMA_WRID_RECV_CONTROL) {
        return wrid_desc[RDMA_WRID_RECV_CONTROL];
    }
    return wrid_desc[wrid];
}

/*
 * RDMA requires memory registration (mlock/pinning), but this is not good for
 * overcommitment.
 *
 * In preparation for the future where LRU information or workload-specific
 * writable working set memory access behavior is available to QEMU
 * it would be nice to have in place the ability to UN-register/UN-pin
 * particular memory regions from the RDMA hardware when it is determined that
 * those regions of memory will likely not be accessed again in the near future.
 *
 * While we do not yet have such information right now, the following
 * compile-time option allows us to perform a non-optimized version of this
 * behavior.
 *
 * By uncommenting this option, you will cause *all* RDMA transfers to be
 * unregistered immediately after the transfer completes on both sides of the
 * connection. This has no effect in 'rdma-pin-all' mode, only regular mode.
 *
 * This will have a terrible impact on migration performance, so until future
 * workload information or LRU information is available, do not attempt to use
 * this feature except for basic testing.
 */
//#define RDMA_UNREGISTRATION_EXAMPLE

/*
 * Perform a non-optimized memory unregistration after every transfer
 * for demonstration purposes, only if pin-all is not requested.
 *
 * Potential optimizations:
 * 1. Start a new thread to run this function continuously
        - for bit clearing
        - and for receipt of unregister messages
 * 2. Use an LRU.
 * 3. Use workload hints.
 */
static int qemu_rdma_unregister_waiting(RDMAContext *rdma) |
1254 |
{ |
1255 |
while (rdma->unregistrations[rdma->unregister_current]) {
|
1256 |
int ret;
|
1257 |
uint64_t wr_id = rdma->unregistrations[rdma->unregister_current]; |
1258 |
uint64_t chunk = |
1259 |
(wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT; |
1260 |
uint64_t index = |
1261 |
(wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT; |
1262 |
RDMALocalBlock *block = |
1263 |
&(rdma->local_ram_blocks.block[index]); |
1264 |
RDMARegister reg = { .current_index = index }; |
1265 |
RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED, |
1266 |
}; |
1267 |
RDMAControlHeader head = { .len = sizeof(RDMARegister),
|
1268 |
.type = RDMA_CONTROL_UNREGISTER_REQUEST, |
1269 |
.repeat = 1,
|
1270 |
}; |
1271 |
|
1272 |
DDPRINTF("Processing unregister for chunk: %" PRIu64
|
1273 |
" at position %d\n", chunk, rdma->unregister_current);
|
1274 |
|
1275 |
rdma->unregistrations[rdma->unregister_current] = 0;
|
1276 |
rdma->unregister_current++; |
1277 |
|
1278 |
if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
|
1279 |
rdma->unregister_current = 0;
|
1280 |
} |
1281 |
|
1282 |
|
1283 |
/*
|
1284 |
* Unregistration is speculative (because migration is single-threaded
|
1285 |
* and we cannot break the protocol's inifinband message ordering).
|
1286 |
* Thus, if the memory is currently being used for transmission,
|
1287 |
* then abort the attempt to unregister and try again
|
1288 |
* later the next time a completion is received for this memory.
|
1289 |
*/
|
1290 |
clear_bit(chunk, block->unregister_bitmap); |
1291 |
|
1292 |
if (test_bit(chunk, block->transit_bitmap)) {
|
1293 |
DDPRINTF("Cannot unregister inflight chunk: %" PRIu64 "\n", chunk); |
1294 |
continue;
|
1295 |
} |
1296 |
|
1297 |
DDPRINTF("Sending unregister for chunk: %" PRIu64 "\n", chunk); |
1298 |
|
1299 |
ret = ibv_dereg_mr(block->pmr[chunk]); |
1300 |
block->pmr[chunk] = NULL;
|
1301 |
block->remote_keys[chunk] = 0;
|
1302 |
|
1303 |
if (ret != 0) { |
1304 |
perror("unregistration chunk failed");
|
1305 |
return -ret;
|
1306 |
} |
1307 |
rdma->total_registrations--; |
1308 |
|
1309 |
reg.key.chunk = chunk; |
1310 |
register_to_network(®); |
1311 |
ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®, |
1312 |
&resp, NULL, NULL); |
1313 |
if (ret < 0) { |
1314 |
return ret;
|
1315 |
} |
1316 |
|
1317 |
DDPRINTF("Unregister for chunk: %" PRIu64 " complete.\n", chunk); |
1318 |
} |
1319 |
|
1320 |
return 0; |
1321 |
} |

static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
                                    uint64_t chunk)
{
    uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;

    result |= (index << RDMA_WRID_BLOCK_SHIFT);
    result |= (chunk << RDMA_WRID_CHUNK_SHIFT);

    return result;
}

/*
 * Set bit for unregistration in the next iteration.
 * We cannot transmit right here, but will unpin later.
 */
static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
                                        uint64_t chunk, uint64_t wr_id)
{
    if (rdma->unregistrations[rdma->unregister_next] != 0) {
        fprintf(stderr, "rdma migration: queue is full!\n");
    } else {
        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);

        if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
            DDPRINTF("Appending unregister chunk %" PRIu64
                    " at position %d\n", chunk, rdma->unregister_next);

            rdma->unregistrations[rdma->unregister_next++] =
                    qemu_rdma_make_wrid(wr_id, index, chunk);

            if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
                rdma->unregister_next = 0;
            }
        } else {
            DDPRINTF("Unregister chunk %" PRIu64 " already in queue.\n",
                    chunk);
        }
    }
}

/*
 * Consult the connection manager to see if a work request
 * (of any kind) has completed.
 * Return the work request ID that completed.
 */
static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
                               uint32_t *byte_len)
{
    int ret;
    struct ibv_wc wc;
    uint64_t wr_id;

    ret = ibv_poll_cq(rdma->cq, 1, &wc);

    if (!ret) {
        *wr_id_out = RDMA_WRID_NONE;
        return 0;
    }

    if (ret < 0) {
        fprintf(stderr, "ibv_poll_cq return %d!\n", ret);
        return ret;
    }

    wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;

    if (wc.status != IBV_WC_SUCCESS) {
        fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
                        wc.status, ibv_wc_status_str(wc.status));
        fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);

        return -1;
    }

    if (rdma->control_ready_expected &&
        (wr_id >= RDMA_WRID_RECV_CONTROL)) {
        DDDPRINTF("completion %s #%" PRId64 " received (%" PRId64 ")"
                  " left %d\n", wrid_desc[RDMA_WRID_RECV_CONTROL],
                  wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
        rdma->control_ready_expected = 0;
    }

    if (wr_id == RDMA_WRID_RDMA_WRITE) {
        uint64_t chunk =
            (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);

        DDDPRINTF("completions %s (%" PRId64 ") left %d, "
                  "block %" PRIu64 ", chunk: %" PRIu64 " %p %p\n",
                  print_wrid(wr_id), wr_id, rdma->nb_sent, index, chunk,
                  block->local_host_addr, (void *)block->remote_host_addr);

        clear_bit(chunk, block->transit_bitmap);

        if (rdma->nb_sent > 0) {
            rdma->nb_sent--;
        }

        if (!rdma->pin_all) {
            /*
             * FYI: If one wanted to signal a specific chunk to be unregistered
             * using LRU or workload-specific information, this is the function
             * you would call to do so. That chunk would then get asynchronously
             * unregistered later.
             */
#ifdef RDMA_UNREGISTRATION_EXAMPLE
            qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
#endif
        }
    } else {
        DDDPRINTF("other completion %s (%" PRId64 ") received left %d\n",
            print_wrid(wr_id), wr_id, rdma->nb_sent);
    }

    *wr_id_out = wc.wr_id;
    if (byte_len) {
        *byte_len = wc.byte_len;
    }

    return 0;
}

/*
 * Block until the next work request has completed.
 *
 * First poll to see if a work request has already completed,
 * otherwise block.
 *
 * If we encounter completed work requests for IDs other than
 * the one we're interested in, then that's generally an error.
 *
 * The only exception is actual RDMA Write completions. These
 * completions only need to be recorded, but do not actually
 * need further processing.
 */
static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
                                    uint32_t *byte_len)
{
    int num_cq_events = 0, ret = 0;
    struct ibv_cq *cq;
    void *cq_ctx;
    uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;

    if (ibv_req_notify_cq(rdma->cq, 0)) {
        return -1;
    }
    /* poll cq first */
    while (wr_id != wrid_requested) {
        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
        if (ret < 0) {
            return ret;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
        if (wr_id != wrid_requested) {
            DDDPRINTF("A Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",
                print_wrid(wrid_requested),
                wrid_requested, print_wrid(wr_id), wr_id);
        }
    }

    if (wr_id == wrid_requested) {
        return 0;
    }

    while (1) {
        /*
         * Coroutine doesn't start until process_incoming_migration()
         * so don't yield unless we know we're running inside of a coroutine.
         */
        if (rdma->migration_started_on_destination) {
            yield_until_fd_readable(rdma->comp_channel->fd);
        }

        if (ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx)) {
            perror("ibv_get_cq_event");
            goto err_block_for_wrid;
        }

        num_cq_events++;

        if (ibv_req_notify_cq(cq, 0)) {
            goto err_block_for_wrid;
        }

        while (wr_id != wrid_requested) {
            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
            if (ret < 0) {
                goto err_block_for_wrid;
            }

            wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

            if (wr_id == RDMA_WRID_NONE) {
                break;
            }
            if (wr_id != wrid_requested) {
                DDDPRINTF("B Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",
                    print_wrid(wrid_requested), wrid_requested,
                    print_wrid(wr_id), wr_id);
            }
        }

        if (wr_id == wrid_requested) {
            goto success_block_for_wrid;
        }
    }

success_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }
    return 0;

err_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }
    return ret;
}

/*
 * Post a SEND message work request for the control channel
 * containing some data and block until the post completes.
 */
static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
                                       RDMAControlHeader *head)
{
    int ret = 0;
    RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
    struct ibv_send_wr *bad_wr;
    struct ibv_sge sge = {
                           .addr = (uint64_t)(wr->control),
                           .length = head->len + sizeof(RDMAControlHeader),
                           .lkey = wr->control_mr->lkey,
                         };
    struct ibv_send_wr send_wr = {
                                   .wr_id = RDMA_WRID_SEND_CONTROL,
                                   .opcode = IBV_WR_SEND,
                                   .send_flags = IBV_SEND_SIGNALED,
                                   .sg_list = &sge,
                                   .num_sge = 1,
                                };

    DDDPRINTF("CONTROL: sending %s..\n", control_desc[head->type]);

    /*
     * We don't actually need to do a memcpy() in here if we used
     * the "sge" properly, but since we're only sending control messages
     * (not RAM in a performance-critical path), then its OK for now.
     *
     * The copy makes the RDMAControlHeader simpler to manipulate
     * for the time being.
     */
    assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
    memcpy(wr->control, head, sizeof(RDMAControlHeader));
    control_to_network((void *) wr->control);

    if (buf) {
        memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
    }


    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret) {
        fprintf(stderr, "Failed to use post IB SEND for control!\n");
        return -1;
    }

    ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
    if (ret < 0) {
        fprintf(stderr, "rdma migration: send polling control error!\n");
    }

    return ret;
}

/*
 * Post a RECV work request in anticipation of some future receipt
 * of data on the control channel.
 */
static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
{
    struct ibv_recv_wr *bad_wr;
    struct ibv_sge sge = {
                            .addr = (uint64_t)(rdma->wr_data[idx].control),
                            .length = RDMA_CONTROL_MAX_BUFFER,
                            .lkey = rdma->wr_data[idx].control_mr->lkey,
                         };

    struct ibv_recv_wr recv_wr = {
                                    .wr_id = RDMA_WRID_RECV_CONTROL + idx,
                                    .sg_list = &sge,
                                    .num_sge = 1,
                                 };


    if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
        return -1;
    }

    return 0;
}

/*
 * Block and wait for a RECV control channel message to arrive.
 */
static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
                RDMAControlHeader *head, int expecting, int idx)
{
    uint32_t byte_len;
    int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
                                       &byte_len);

    if (ret < 0) {
        fprintf(stderr, "rdma migration: recv polling control error!\n");
        return ret;
    }

    network_to_control((void *) rdma->wr_data[idx].control);
    memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));

    DDDPRINTF("CONTROL: %s receiving...\n", control_desc[expecting]);

    if (expecting == RDMA_CONTROL_NONE) {
        DDDPRINTF("Surprise: got %s (%d)\n",
                  control_desc[head->type], head->type);
    } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
        fprintf(stderr, "Was expecting a %s (%d) control message"
                ", but got: %s (%d), length: %d\n",
                control_desc[expecting], expecting,
                control_desc[head->type], head->type, head->len);
        return -EIO;
    }
    if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
        fprintf(stderr, "too long length: %d\n", head->len);
        return -EINVAL;
    }
    if (sizeof(*head) + head->len != byte_len) {
        fprintf(stderr, "Malformed length: %d byte_len %d\n",
                head->len, byte_len);
        return -EINVAL;
    }

    return 0;
}
1678 |
|
1679 |
/*
 * When a RECV work request has completed, the work request's
 * buffer is pointed at the header.
 *
 * This advances the pointer past the header to the data portion
 * of the control message that was populated when the work
 * request finished.
 */
static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
                                  RDMAControlHeader *head)
{
    rdma->wr_data[idx].control_len = head->len;
    rdma->wr_data[idx].control_curr =
        rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
}

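/*
 * Illustrative sketch of the control-buffer layout the helpers above rely
 * on (nothing here is new protocol; it just restates what the code does):
 *
 *     wr_data[idx].control:
 *     +--------------------+----------------------------+
 *     | RDMAControlHeader  | payload (head->len bytes)  |
 *     +--------------------+----------------------------+
 *                          ^
 *                          wr_data[idx].control_curr after
 *                          qemu_rdma_move_header() runs
 */
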
/*
 * This is an 'atomic' high-level operation to deliver a single, unified
 * control-channel message.
 *
 * Additionally, if the user is expecting some kind of reply to this message,
 * they can request a 'resp' response message be filled in by posting an
 * additional work request on behalf of the user and waiting for an additional
 * completion.
 *
 * The extra (optional) response saves us from having to perform an
 * *additional* exchange of messages just to provide a response, by
 * instead piggy-backing on the acknowledgement.
 */
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma))
{
    int ret = 0;

    /*
     * Wait until the dest is ready before attempting to deliver the message
     * by waiting for a READY message.
     */
    if (rdma->control_ready_expected) {
        RDMAControlHeader ready;
        ret = qemu_rdma_exchange_get_response(rdma,
                                &ready, RDMA_CONTROL_READY, RDMA_WRID_READY);
        if (ret < 0) {
            return ret;
        }
    }

    /*
     * If the user is expecting a response, post a WR in anticipation of it.
     */
    if (resp) {
        ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
        if (ret) {
            fprintf(stderr, "rdma migration: error posting"
                    " extra control recv for anticipated result!");
            return ret;
        }
    }

    /*
     * Post a WR to replace the one we just consumed for the READY message.
     */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        fprintf(stderr, "rdma migration: error posting first control recv!");
        return ret;
    }

    /*
     * Deliver the control message that was requested.
     */
    ret = qemu_rdma_post_send_control(rdma, data, head);

    if (ret < 0) {
        fprintf(stderr, "Failed to send control buffer!\n");
        return ret;
    }

    /*
     * If we're expecting a response, block and wait for it.
     */
    if (resp) {
        if (callback) {
            DDPRINTF("Issuing callback before receiving response...\n");
            ret = callback(rdma);
            if (ret < 0) {
                return ret;
            }
        }

        DDPRINTF("Waiting for response %s\n", control_desc[resp->type]);
        ret = qemu_rdma_exchange_get_response(rdma, resp,
                                              resp->type, RDMA_WRID_DATA);

        if (ret < 0) {
            return ret;
        }

        qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
        if (resp_idx) {
            *resp_idx = RDMA_WRID_DATA;
        }
        DDPRINTF("Response %s received.\n", control_desc[resp->type]);
    }

    rdma->control_ready_expected = 1;

    return 0;
}

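/*
 * Caller-side sketch of qemu_rdma_exchange_send() with a piggy-backed
 * response (illustrative only; it mirrors the registration path found in
 * qemu_rdma_write_one() below, so every name is from this file):
 *
 *     RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
 *     RDMAControlHeader head = { .len = sizeof(RDMARegister),
 *                                .type = RDMA_CONTROL_REGISTER_REQUEST,
 *                                .repeat = 1 };
 *     int reg_result_idx;
 *
 *     ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
 *                                   &resp, &reg_result_idx, NULL);
 *
 * On success, wr_data[reg_result_idx].control_curr points at the response
 * payload (an RDMARegisterResult here); resp.type must be set before the
 * call because the response is validated against it.
 */
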
/*
 * This is an 'atomic' high-level operation to receive a single, unified
 * control-channel message.
 */
static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
                                int expecting)
{
    RDMAControlHeader ready = {
                                .len = 0,
                                .type = RDMA_CONTROL_READY,
                                .repeat = 1,
                              };
    int ret;

    /*
     * Inform the source that we're ready to receive a message.
     */
    ret = qemu_rdma_post_send_control(rdma, NULL, &ready);

    if (ret < 0) {
        fprintf(stderr, "Failed to send control buffer!\n");
        return ret;
    }

    /*
     * Block and wait for the message.
     */
    ret = qemu_rdma_exchange_get_response(rdma, head,
                                          expecting, RDMA_WRID_READY);

    if (ret < 0) {
        return ret;
    }

    qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);

    /*
     * Post a new RECV work request to replace the one we just consumed.
     */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        fprintf(stderr, "rdma migration: error posting second control recv!");
        return ret;
    }

    return 0;
}

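/*
 * The READY handshake implemented by the two functions above, as a
 * sequence sketch (source on the left, destination on the right):
 *
 *     qemu_rdma_exchange_send()          qemu_rdma_exchange_recv()
 *     -------------------------          -------------------------
 *     wait for READY           <-------  post SEND of READY
 *     post replacement RECV
 *     post SEND of message     ------->  wait for the message
 *                                        post replacement RECV
 *
 * Every consumed RECV work request is immediately re-posted, so each side
 * always has one RECV outstanding for the next control message.
 */
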
/*
 * Write an actual chunk of memory using RDMA.
 *
 * If we're using dynamic registration on the dest-side, we have to
 * send a registration command first.
 */
static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
                               int current_index, uint64_t current_addr,
                               uint64_t length)
{
    struct ibv_sge sge;
    struct ibv_send_wr send_wr = { 0 };
    struct ibv_send_wr *bad_wr;
    int reg_result_idx, ret, count = 0;
    uint64_t chunk, chunks;
    uint8_t *chunk_start, *chunk_end;
    RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
    RDMARegister reg;
    RDMARegisterResult *reg_result;
    RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
    RDMAControlHeader head = { .len = sizeof(RDMARegister),
                               .type = RDMA_CONTROL_REGISTER_REQUEST,
                               .repeat = 1,
                             };

retry:
    sge.addr = (uint64_t)(block->local_host_addr +
                            (current_addr - block->offset));
    sge.length = length;

    chunk = ram_chunk_index(block->local_host_addr, (uint8_t *) sge.addr);
    chunk_start = ram_chunk_start(block, chunk);

    if (block->is_ram_block) {
        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);

        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    } else {
        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);

        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    }

    DDPRINTF("Writing %" PRIu64 " chunks, (%" PRIu64 " MB)\n",
        chunks + 1, (chunks + 1) * (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);

    chunk_end = ram_chunk_end(block, chunk + chunks);

    if (!rdma->pin_all) {
#ifdef RDMA_UNREGISTRATION_EXAMPLE
        qemu_rdma_unregister_waiting(rdma);
#endif
    }

    while (test_bit(chunk, block->transit_bitmap)) {
        (void)count;
        DDPRINTF("(%d) Not clobbering: block: %d chunk %" PRIu64
                " current %" PRIu64 " len %" PRIu64 " %d %d\n",
                count++, current_index, chunk,
                sge.addr, length, rdma->nb_sent, block->nb_chunks);

        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);

        if (ret < 0) {
            fprintf(stderr, "Failed to wait for previous write to complete "
                    "block %d chunk %" PRIu64
                    " current %" PRIu64 " len %" PRIu64 " %d\n",
                    current_index, chunk, sge.addr, length, rdma->nb_sent);
            return ret;
        }
    }

    if (!rdma->pin_all || !block->is_ram_block) {
        if (!block->remote_keys[chunk]) {
            /*
             * This chunk has not yet been registered, so first check to see
             * if the entire chunk is zero. If so, tell the other side to
             * memset() + madvise() the entire chunk without RDMA.
             */

            if (can_use_buffer_find_nonzero_offset((void *)sge.addr, length)
                   && buffer_find_nonzero_offset((void *)sge.addr,
                                                    length) == length) {
                RDMACompress comp = {
                                        .offset = current_addr,
                                        .value = 0,
                                        .block_idx = current_index,
                                        .length = length,
                                    };

                head.len = sizeof(comp);
                head.type = RDMA_CONTROL_COMPRESS;

                DDPRINTF("Entire chunk is zero, sending compress: %"
                    PRIu64 " for %d "
                    "bytes, index: %d, offset: %" PRId64 "...\n",
                    chunk, sge.length, current_index, current_addr);

                compress_to_network(&comp);
                ret = qemu_rdma_exchange_send(rdma, &head,
                                (uint8_t *) &comp, NULL, NULL, NULL);

                if (ret < 0) {
                    return -EIO;
                }

                acct_update_position(f, sge.length, true);

                return 1;
            }

            /*
             * Otherwise, tell the other side to register.
             */
            reg.current_index = current_index;
            if (block->is_ram_block) {
                reg.key.current_addr = current_addr;
            } else {
                reg.key.chunk = chunk;
            }
            reg.chunks = chunks;

            DDPRINTF("Sending registration request chunk %" PRIu64 " for %d "
                    "bytes, index: %d, offset: %" PRId64 "...\n",
                    chunk, sge.length, current_index, current_addr);

            register_to_network(&reg);
            ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                    &resp, &reg_result_idx, NULL);
            if (ret < 0) {
                return ret;
            }

            /* try to overlap this single registration with the one we sent. */
            if (qemu_rdma_register_and_get_keys(rdma, block,
                                                (uint8_t *) sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                fprintf(stderr, "cannot get lkey!\n");
                return -EINVAL;
            }

            reg_result = (RDMARegisterResult *)
                    rdma->wr_data[reg_result_idx].control_curr;

            network_to_result(reg_result);

            DDPRINTF("Received registration result:"
                    " my key: %x their key %x, chunk %" PRIu64 "\n",
                    block->remote_keys[chunk], reg_result->rkey, chunk);

            block->remote_keys[chunk] = reg_result->rkey;
            block->remote_host_addr = reg_result->host_addr;
        } else {
            /* already registered before */
            if (qemu_rdma_register_and_get_keys(rdma, block,
                                                (uint8_t *)sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                fprintf(stderr, "cannot get lkey!\n");
                return -EINVAL;
            }
        }

        send_wr.wr.rdma.rkey = block->remote_keys[chunk];
    } else {
        send_wr.wr.rdma.rkey = block->remote_rkey;

        if (qemu_rdma_register_and_get_keys(rdma, block, (uint8_t *)sge.addr,
                                                     &sge.lkey, NULL, chunk,
                                                     chunk_start, chunk_end)) {
            fprintf(stderr, "cannot get lkey!\n");
            return -EINVAL;
        }
    }

    /*
     * Encode the ram block index and chunk within this wrid.
     * We will use this information at the time of completion
     * to figure out which bitmap to check against and then which
     * chunk in the bitmap to look for.
     */
    send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
                                        current_index, chunk);

    send_wr.opcode = IBV_WR_RDMA_WRITE;
    send_wr.send_flags = IBV_SEND_SIGNALED;
    send_wr.sg_list = &sge;
    send_wr.num_sge = 1;
    send_wr.wr.rdma.remote_addr = block->remote_host_addr +
                                (current_addr - block->offset);

    DDDPRINTF("Posting chunk: %" PRIu64 ", addr: %lx"
              " remote: %lx, bytes %" PRIu32 "\n",
              chunk, sge.addr, send_wr.wr.rdma.remote_addr,
              sge.length);

    /*
     * ibv_post_send() does not return negative error numbers,
     * per the specification they are positive - no idea why.
     */
    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret == ENOMEM) {
        DDPRINTF("send queue is full. wait a little....\n");
        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: failed to make "
                            "room in full send queue! %d\n", ret);
            return ret;
        }

        goto retry;

    } else if (ret > 0) {
        perror("rdma migration: post rdma write failed");
        return -ret;
    }

    set_bit(chunk, block->transit_bitmap);
    acct_update_position(f, sge.length, false);
    rdma->total_writes++;

    return 0;
}

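/*
 * Worked example of the chunk arithmetic above (illustrative numbers):
 * with RDMA_REG_CHUNK_SHIFT == 20, a chunk covers 1 MB. For a 2 MB write
 * starting on a chunk boundary,
 *
 *     chunks = (2 * 1024 * 1024) >> 20 = 2
 *
 * and because the length divides evenly, chunks is decremented to 1, so
 * the write spans chunk indices [chunk, chunk + 1] -- exactly the two
 * 1 MB chunks that ram_chunk_end(block, chunk + chunks) covers.
 */
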
/*
 * Push out any unwritten RDMA operations.
 *
 * We support sending out multiple chunks at the same time.
 * Not all of them need to get signaled in the completion queue.
 */
static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
{
    int ret;

    if (!rdma->current_length) {
        return 0;
    }

    ret = qemu_rdma_write_one(f, rdma,
            rdma->current_index, rdma->current_addr, rdma->current_length);

    if (ret < 0) {
        return ret;
    }

    if (ret == 0) {
        rdma->nb_sent++;
        DDDPRINTF("sent total: %d\n", rdma->nb_sent);
    }

    rdma->current_length = 0;
    rdma->current_addr = 0;

    return 0;
}

static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
                    uint64_t offset, uint64_t len)
{
    RDMALocalBlock *block;
    uint8_t *host_addr;
    uint8_t *chunk_end;

    if (rdma->current_index < 0) {
        return 0;
    }

    if (rdma->current_chunk < 0) {
        return 0;
    }

    block = &(rdma->local_ram_blocks.block[rdma->current_index]);
    host_addr = block->local_host_addr + (offset - block->offset);
    chunk_end = ram_chunk_end(block, rdma->current_chunk);

    if (rdma->current_length == 0) {
        return 0;
    }

    /*
     * Only merge into chunk sequentially.
     */
    if (offset != (rdma->current_addr + rdma->current_length)) {
        return 0;
    }

    if (offset < block->offset) {
        return 0;
    }

    if ((offset + len) > (block->offset + block->length)) {
        return 0;
    }

    if ((host_addr + len) > chunk_end) {
        return 0;
    }

    return 1;
}

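/*
 * Worked example of the merge test above (illustrative offsets): if the
 * unsent buffer is current_addr = 0x1000 with current_length = 0x1000,
 * an incoming write at offset 0x2000 merges, because it starts exactly
 * at current_addr + current_length and stays within the current block
 * and chunk. A write at 0x3000 leaves a gap and does not merge, and
 * neither does one that would cross ram_chunk_end(); both cases force a
 * flush and start a new buffer.
 */
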
/*
 * We're not actually writing here, but doing three things:
 *
 * 1. Identify the chunk the buffer belongs to.
 * 2. If the chunk is full or the buffer doesn't belong to the current
 *    chunk, then start a new chunk and flush() the old chunk.
 * 3. To keep the hardware busy, we also group chunks into batches
 *    and only require that a batch gets acknowledged in the completion
 *    queue instead of each individual chunk.
 */
static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
                           uint64_t block_offset, uint64_t offset,
                           uint64_t len)
{
    uint64_t current_addr = block_offset + offset;
    uint64_t index = rdma->current_index;
    uint64_t chunk = rdma->current_chunk;
    int ret;

    /* If we cannot merge it, we flush the current buffer first. */
    if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
        ret = qemu_rdma_write_flush(f, rdma);
        if (ret) {
            return ret;
        }
        rdma->current_length = 0;
        rdma->current_addr = current_addr;

        ret = qemu_rdma_search_ram_block(rdma, block_offset,
                                         offset, len, &index, &chunk);
        if (ret) {
            fprintf(stderr, "ram block search failed\n");
            return ret;
        }
        rdma->current_index = index;
        rdma->current_chunk = chunk;
    }

    /* merge it */
    rdma->current_length += len;

    /* flush it if buffer is too large */
    if (rdma->current_length >= RDMA_MERGE_MAX) {
        return qemu_rdma_write_flush(f, rdma);
    }

    return 0;
}

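/*
 * Putting the two functions above together, a sketch of the common case:
 * sequential 4 KB pages accumulate into rdma->current_length until a page
 * fails qemu_rdma_buffer_mergable() or the buffer reaches RDMA_MERGE_MAX,
 * at which point qemu_rdma_write_flush() posts a single RDMA write
 * covering the entire merged run rather than one write per page.
 */
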
static void qemu_rdma_cleanup(RDMAContext *rdma)
{
    struct rdma_cm_event *cm_event;
    int ret, idx;

    if (rdma->cm_id && rdma->connected) {
        if (rdma->error_state) {
            RDMAControlHeader head = { .len = 0,
                                       .type = RDMA_CONTROL_ERROR,
                                       .repeat = 1,
                                     };
            fprintf(stderr, "Early error. Sending error.\n");
            qemu_rdma_post_send_control(rdma, NULL, &head);
        }

        ret = rdma_disconnect(rdma->cm_id);
        if (!ret) {
            DDPRINTF("waiting for disconnect\n");
            ret = rdma_get_cm_event(rdma->channel, &cm_event);
            if (!ret) {
                rdma_ack_cm_event(cm_event);
            }
        }
        DDPRINTF("Disconnected.\n");
        rdma->connected = false;
    }

    g_free(rdma->block);
    rdma->block = NULL;

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        if (rdma->wr_data[idx].control_mr) {
            rdma->total_registrations--;
            ibv_dereg_mr(rdma->wr_data[idx].control_mr);
        }
        rdma->wr_data[idx].control_mr = NULL;
    }

    if (rdma->local_ram_blocks.block) {
        while (rdma->local_ram_blocks.nb_blocks) {
            __qemu_rdma_delete_block(rdma,
                    rdma->local_ram_blocks.block->offset);
        }
    }

    if (rdma->qp) {
        rdma_destroy_qp(rdma->cm_id);
        rdma->qp = NULL;
    }
    if (rdma->cq) {
        ibv_destroy_cq(rdma->cq);
        rdma->cq = NULL;
    }
    if (rdma->comp_channel) {
        ibv_destroy_comp_channel(rdma->comp_channel);
        rdma->comp_channel = NULL;
    }
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
        rdma->pd = NULL;
    }
    if (rdma->listen_id) {
        rdma_destroy_id(rdma->listen_id);
        rdma->listen_id = NULL;
    }
    if (rdma->cm_id) {
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
    }
    if (rdma->channel) {
        rdma_destroy_event_channel(rdma->channel);
        rdma->channel = NULL;
    }
    g_free(rdma->host);
    rdma->host = NULL;
}

static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool pin_all)
{
    int ret, idx;
    Error *local_err = NULL, **temp = &local_err;

    /*
     * Will be validated against destination's actual capabilities
     * after the connect() completes.
     */
    rdma->pin_all = pin_all;

    ret = qemu_rdma_resolve_host(rdma, temp);
    if (ret) {
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
                    " limits may be too low. Please check $ ulimit -a # and "
                    "search for 'ulimit -l' in the output");
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error allocating qp!");
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error initializing ram blocks!");
        goto err_rdma_source_init;
    }

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            ERROR(temp, "rdma migration: error registering %d control!",
                        idx);
            goto err_rdma_source_init;
        }
    }

    return 0;

err_rdma_source_init:
    error_propagate(errp, local_err);
    qemu_rdma_cleanup(rdma);
    return -1;
}

static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
{
    RDMACapabilities cap = {
                                .version = RDMA_CONTROL_VERSION_CURRENT,
                                .flags = 0,
                           };
    struct rdma_conn_param conn_param = { .initiator_depth = 2,
                                          .retry_count = 5,
                                          .private_data = &cap,
                                          .private_data_len = sizeof(cap),
                                        };
    struct rdma_cm_event *cm_event;
    int ret;

    /*
     * Only negotiate the capability with destination if the user
     * on the source first requested the capability.
     */
    if (rdma->pin_all) {
        DPRINTF("Server pin-all memory requested.\n");
        cap.flags |= RDMA_CAPABILITY_PIN_ALL;
    }

    caps_to_network(&cap);

    ret = rdma_connect(rdma->cm_id, &conn_param);
    if (ret) {
        perror("rdma_connect");
        ERROR(errp, "connecting to destination!");
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
        goto err_rdma_source_connect;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        perror("rdma_get_cm_event after rdma_connect");
        ERROR(errp, "connecting to destination!");
        rdma_ack_cm_event(cm_event);
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
        goto err_rdma_source_connect;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
        ERROR(errp, "connecting to destination!");
        rdma_ack_cm_event(cm_event);
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
        goto err_rdma_source_connect;
    }
    rdma->connected = true;

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
    network_to_caps(&cap);

    /*
     * Verify that the *requested* capabilities are supported by the
     * destination and disable them otherwise.
     */
    if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
        ERROR(errp, "Server cannot support pinning all memory. "
                    "Will register memory dynamically.");
        rdma->pin_all = false;
    }

    DPRINTF("Pin all memory: %s\n", rdma->pin_all ? "enabled" : "disabled");

    rdma_ack_cm_event(cm_event);

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        ERROR(errp, "posting second control recv!");
        goto err_rdma_source_connect;
    }

    rdma->control_ready_expected = 1;
    rdma->nb_sent = 0;
    return 0;

err_rdma_source_connect:
    qemu_rdma_cleanup(rdma);
    return -1;
}

static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
{
    int ret = -EINVAL, idx;
    struct rdma_cm_id *listen_id;
    char ip[40] = "unknown";
    struct rdma_addrinfo *res;
    char port_str[16];

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        rdma->wr_data[idx].control_len = 0;
        rdma->wr_data[idx].control_curr = NULL;
    }

    if (rdma->host == NULL) {
        ERROR(errp, "RDMA host is not set!");
        rdma->error_state = -EINVAL;
        return -1;
    }
    /* create CM channel */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create rdma event channel");
        rdma->error_state = -EINVAL;
        return -1;
    }

    /* create CM id */
    ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create cm_id!");
        goto err_dest_init_create_listen_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    if (rdma->host && strcmp("", rdma->host)) {
        struct rdma_addrinfo *e;

        ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
        if (ret < 0) {
            ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
            goto err_dest_init_bind_addr;
        }

        for (e = res; e != NULL; e = e->ai_next) {
            inet_ntop(e->ai_family,
                &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
            DPRINTF("Trying %s => %s\n", rdma->host, ip);
            ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
            if (!ret) {
                if (e->ai_family == AF_INET6) {
                    ret = qemu_rdma_broken_ipv6_kernel(errp, listen_id->verbs);
                    if (ret) {
                        continue;
                    }
                }

                goto listen;
            }
        }

        ERROR(errp, "Error: could not rdma_bind_addr!");
        goto err_dest_init_bind_addr;
    } else {
        ERROR(errp, "migration host and port not specified!");
        ret = -EINVAL;
        goto err_dest_init_bind_addr;
    }
listen:

    rdma->listen_id = listen_id;
    qemu_rdma_dump_gid("dest_init", listen_id);
    return 0;

err_dest_init_bind_addr:
    rdma_destroy_id(listen_id);
err_dest_init_create_listen_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    rdma->error_state = ret;
    return ret;
}

static void *qemu_rdma_data_init(const char *host_port, Error **errp)
{
    RDMAContext *rdma = NULL;
    InetSocketAddress *addr;

    if (host_port) {
        /* g_malloc0() already zeroes the structure. */
        rdma = g_malloc0(sizeof(RDMAContext));
        rdma->current_index = -1;
        rdma->current_chunk = -1;

        addr = inet_parse(host_port, NULL);
        if (addr != NULL) {
            rdma->port = atoi(addr->port);
            rdma->host = g_strdup(addr->host);
        } else {
            ERROR(errp, "bad RDMA migration address '%s'", host_port);
            g_free(rdma);
            return NULL;
        }
    }

    return rdma;
}

/*
 * QEMUFile interface to the control channel.
 * SEND messages for control only.
 * pc.ram is handled with regular RDMA messages.
 */
static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
                                int64_t pos, int size)
{
    QEMUFileRDMA *r = opaque;
    QEMUFile *f = r->file;
    RDMAContext *rdma = r->rdma;
    size_t remaining = size;
    uint8_t *data = (void *) buf;
    int ret;

    CHECK_ERROR_STATE();

    /*
     * Push out any writes that
     * we have queued up for pc.ram.
     */
    ret = qemu_rdma_write_flush(f, rdma);
    if (ret < 0) {
        rdma->error_state = ret;
        return ret;
    }

    while (remaining) {
        RDMAControlHeader head;

        r->len = MIN(remaining, RDMA_SEND_INCREMENT);
        remaining -= r->len;

        head.len = r->len;
        head.type = RDMA_CONTROL_QEMU_FILE;

        ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);

        if (ret < 0) {
            rdma->error_state = ret;
            return ret;
        }

        data += r->len;
    }

    return size;
}

static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
                             int size, int idx)
{
    size_t len = 0;

    if (rdma->wr_data[idx].control_len) {
        DDDPRINTF("RDMA %" PRId64 " of %d bytes already in buffer\n",
                  rdma->wr_data[idx].control_len, size);

        len = MIN(size, rdma->wr_data[idx].control_len);
        memcpy(buf, rdma->wr_data[idx].control_curr, len);
        rdma->wr_data[idx].control_curr += len;
        rdma->wr_data[idx].control_len -= len;
    }

    return len;
}

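/*
 * Illustrative example of qemu_rdma_fill() semantics: if a 10-byte
 * control payload has arrived and the caller repeatedly asks for 4
 * bytes, successive calls return 4, 4, and 2 bytes, advancing
 * control_curr each time. A further call returns 0, which is the cue
 * for qemu_rdma_get_buffer() below to block for the next SEND.
 */
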
/*
 * QEMUFile interface to the control channel.
 * RDMA links don't use bytestreams, so we have to
 * return bytes to QEMUFile opportunistically.
 */
static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf,
                                int64_t pos, int size)
{
    QEMUFileRDMA *r = opaque;
    RDMAContext *rdma = r->rdma;
    RDMAControlHeader head;
    int ret = 0;

    CHECK_ERROR_STATE();

    /*
     * First, we hold on to the last SEND message we
     * were given and dish out the bytes until we run
     * out of bytes.
     */
    r->len = qemu_rdma_fill(r->rdma, buf, size, 0);
    if (r->len) {
        return r->len;
    }

    /*
     * Once we run out, we block and wait for another
     * SEND message to arrive.
     */
    ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);

    if (ret < 0) {
        rdma->error_state = ret;
        return ret;
    }

    /*
     * SEND was received with new bytes, now try again.
     */
    return qemu_rdma_fill(r->rdma, buf, size, 0);
}

/*
 * Block until all the outstanding chunks have been delivered by the hardware.
 */
static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
{
    int ret;

    if (qemu_rdma_write_flush(f, rdma) < 0) {
        return -EIO;
    }

    while (rdma->nb_sent) {
        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: complete polling error!\n");
            return -EIO;
        }
    }

    qemu_rdma_unregister_waiting(rdma);

    return 0;
}

static int qemu_rdma_close(void *opaque)
{
    DPRINTF("Shutting down connection.\n");
    QEMUFileRDMA *r = opaque;
    if (r->rdma) {
        qemu_rdma_cleanup(r->rdma);
        g_free(r->rdma);
    }
    g_free(r);
    return 0;
}

/*
 * Parameters:
 *    @offset == 0 :
 *        This means that 'block_offset' is a full virtual address that does not
 *        belong to a RAMBlock of the virtual machine and instead
 *        represents a private malloc'd memory area that the caller wishes to
 *        transfer.
 *
 *    @offset != 0 :
 *        Offset is an offset to be added to block_offset and used
 *        to also lookup the corresponding RAMBlock.
 *
 *    @size > 0 :
 *        Initiate a transfer of this size.
 *
 *    @size == 0 :
 *        A 'hint' or 'advice' that means that we wish to speculatively
 *        and asynchronously unregister this memory. In this case, there is no
 *        guarantee that the unregister will actually happen, for example,
 *        if the memory is being actively transmitted. Additionally, the memory
 *        may be re-registered at any future time if a write within the same
 *        chunk was requested again, even if you attempted to unregister it
 *        here.
 *
 *    @size < 0 : TODO, not yet supported
 *        Unregister the memory NOW. This means that the caller does not
 *        expect there to be any future RDMA transfers and we just want to clean
 *        things up. This is used in case the upper layer owns the memory and
 *        cannot wait for qemu_fclose() to occur.
 *
 *    @bytes_sent : User-specified pointer to indicate how many bytes were
 *                  sent. Usually, this will not be more than a few bytes of
 *                  the protocol because most transfers are sent asynchronously.
 */
static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
                                  ram_addr_t block_offset, ram_addr_t offset,
                                  size_t size, int *bytes_sent)
{
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;
    int ret;

    CHECK_ERROR_STATE();

    qemu_fflush(f);

    if (size > 0) {
        /*
         * Add this page to the current 'chunk'. If the chunk
         * is full, or the page doesn't belong to the current chunk,
         * an actual RDMA write will occur and a new chunk will be formed.
         */
        ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: write error! %d\n", ret);
            goto err;
        }

        /*
         * We always return 1 byte because the RDMA
         * protocol is completely asynchronous. We do not yet know
         * whether an identified chunk is zero or not because we're
         * waiting for other pages to potentially be merged with
         * the current chunk. So, we have to call qemu_update_position()
         * later on when the actual write occurs.
         */
        if (bytes_sent) {
            *bytes_sent = 1;
        }
    } else {
        uint64_t index, chunk;

        /* TODO: Change QEMUFileOps prototype to be signed: size_t => long
        if (size < 0) {
            ret = qemu_rdma_drain_cq(f, rdma);
            if (ret < 0) {
                fprintf(stderr, "rdma: failed to synchronously drain"
                                " completion queue before unregistration.\n");
                goto err;
            }
        }
        */

        ret = qemu_rdma_search_ram_block(rdma, block_offset,
                                         offset, size, &index, &chunk);

        if (ret) {
            fprintf(stderr, "ram block search failed\n");
            goto err;
        }

        qemu_rdma_signal_unregister(rdma, index, chunk, 0);

        /*
         * TODO: Synchronous, guaranteed unregistration (should not occur during
         * fast-path). Otherwise, unregisters will process on the next call to
         * qemu_rdma_drain_cq()
        if (size < 0) {
            qemu_rdma_unregister_waiting(rdma);
        }
        */
    }

    /*
     * Drain the Completion Queue if possible, but do not block,
     * just poll.
     *
     * If nothing to poll, the end of the iteration will do this
     * again to make sure we don't overflow the request queue.
     */
    while (1) {
        uint64_t wr_id, wr_id_in;
        ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: polling error! %d\n", ret);
            goto err;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
    }

    return RAM_SAVE_CONTROL_DELAYED;
err:
    rdma->error_state = ret;
    return ret;
}

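/*
 * Caller-side sketch of the save_page hook (illustrative; the generic
 * migration code is the real caller, and TARGET_PAGE_SIZE here is just a
 * stand-in for whatever size that caller passes):
 *
 *     int bytes_sent = 0;
 *
 *     // size > 0: queue an asynchronous RDMA write of one page.
 *     qemu_rdma_save_page(f, rfile, block_offset, offset,
 *                         TARGET_PAGE_SIZE, &bytes_sent);
 *
 *     // size == 0: hint that this chunk may be unregistered later.
 *     qemu_rdma_save_page(f, rfile, block_offset, offset, 0, NULL);
 *
 * The RAM_SAVE_CONTROL_DELAYED return value tells the caller that byte
 * accounting happens later, when the merged chunk is actually written.
 */
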
static int qemu_rdma_accept(RDMAContext *rdma)
{
    RDMACapabilities cap;
    struct rdma_conn_param conn_param = {
                                            .responder_resources = 2,
                                            .private_data = &cap,
                                            .private_data_len = sizeof(cap),
                                         };
    struct rdma_cm_event *cm_event;
    struct ibv_context *verbs;
    int ret = -EINVAL;
    int idx;

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));

    network_to_caps(&cap);

    if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
        fprintf(stderr, "Unknown source RDMA version: %d, bailing...\n",
                cap.version);
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    /*
     * Respond with only the capabilities this version of QEMU knows about.
     */
    cap.flags &= known_capabilities;

    /*
     * Enable the ones that we do know about.
     * Add other checks here as new ones are introduced.
     */
    if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
        rdma->pin_all = true;
    }

    rdma->cm_id = cm_event->id;
    verbs = cm_event->id->verbs;

    rdma_ack_cm_event(cm_event);

    DPRINTF("Memory pin all: %s\n", rdma->pin_all ? "enabled" : "disabled");

    caps_to_network(&cap);

    DPRINTF("verbs context after listen: %p\n", verbs);

    if (!rdma->verbs) {
        rdma->verbs = verbs;
    } else if (rdma->verbs != verbs) {
        fprintf(stderr, "ibv context not matching %p, %p!\n",
                rdma->verbs, verbs);
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_id("dest_init", verbs);

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        fprintf(stderr, "rdma migration: error allocating qp!\n");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
        goto err_rdma_dest_wait;
    }

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            fprintf(stderr, "rdma: error registering %d control!\n", idx);
            goto err_rdma_dest_wait;
        }
    }

    qemu_set_fd_handler2(rdma->channel->fd, NULL, NULL, NULL, NULL);

    ret = rdma_accept(rdma->cm_id, &conn_param);
    if (ret) {
        fprintf(stderr, "rdma_accept returns %d!\n", ret);
        goto err_rdma_dest_wait;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        fprintf(stderr, "rdma_accept get_cm_event failed %d!\n", ret);
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        fprintf(stderr, "rdma_accept not event established!\n");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    rdma_ack_cm_event(cm_event);
    rdma->connected = true;

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        fprintf(stderr, "rdma migration: error posting second control recv!\n");
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_gid("dest_connect", rdma->cm_id);

    return 0;

err_rdma_dest_wait:
    rdma->error_state = ret;
    qemu_rdma_cleanup(rdma);
    return ret;
}

/*
 * During each iteration of the migration, we listen for instructions
 * from the source VM to perform dynamic page registrations before they
 * can perform RDMA operations.
 *
 * We respond with the 'rkey'.
 *
 * Keep doing this until the source tells us to stop.
 */
static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
                                         uint64_t flags)
{
    RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
                                   .type = RDMA_CONTROL_REGISTER_RESULT,
                                   .repeat = 0,
                                 };
    RDMAControlHeader unreg_resp = { .len = 0,
                                     .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                     .repeat = 0,
                                   };
    RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
                                 .repeat = 1 };
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMAControlHeader head;
    RDMARegister *reg, *registers;
    RDMACompress *comp;
    RDMARegisterResult *reg_result;
    static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
    RDMALocalBlock *block;
    void *host_addr;
    int ret = 0;
    int idx = 0;
    int count = 0;
    int i = 0;

    CHECK_ERROR_STATE();

    do {
        DDDPRINTF("Waiting for next request %" PRIu64 "...\n", flags);

        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);

        if (ret < 0) {
            break;
        }

        if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
            fprintf(stderr, "rdma: Too many requests in this message (%d). "
                            "Bailing.\n", head.repeat);
            ret = -EIO;
            break;
        }

        switch (head.type) {
        case RDMA_CONTROL_COMPRESS:
            comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
            network_to_compress(comp);

            DDPRINTF("Zapping zero chunk: %" PRId64
                    " bytes, index %d, offset %" PRId64 "\n",
                    comp->length, comp->block_idx, comp->offset);
            block = &(rdma->local_ram_blocks.block[comp->block_idx]);

            host_addr = block->local_host_addr +
                            (comp->offset - block->offset);

            ram_handle_compressed(host_addr, comp->value, comp->length);
            break;

        case RDMA_CONTROL_REGISTER_FINISHED:
            DDDPRINTF("Current registrations complete.\n");
            goto out;

        case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
            DPRINTF("Initial setup info requested.\n");

            if (rdma->pin_all) {
                ret = qemu_rdma_reg_whole_ram_blocks(rdma);
                if (ret) {
                    fprintf(stderr, "rdma migration: error dest "
                                    "registering ram blocks!\n");
                    goto out;
                }
            }

            /*
             * Dest uses this to prepare to transmit the RAMBlock descriptions
             * to the source VM after connection setup.
             * Both sides use the "remote" structure to communicate and update
             * their "local" descriptions with what was sent.
             */
            for (i = 0; i < local->nb_blocks; i++) {
                rdma->block[i].remote_host_addr =
                    (uint64_t)(local->block[i].local_host_addr);

                if (rdma->pin_all) {
                    rdma->block[i].remote_rkey = local->block[i].mr->rkey;
                }

                rdma->block[i].offset = local->block[i].offset;
                rdma->block[i].length = local->block[i].length;

                remote_block_to_network(&rdma->block[i]);
            }

            blocks.len = rdma->local_ram_blocks.nb_blocks
                                                * sizeof(RDMARemoteBlock);

            ret = qemu_rdma_post_send_control(rdma,
                                        (uint8_t *) rdma->block, &blocks);

            if (ret < 0) {
                fprintf(stderr, "rdma migration: error sending remote info!\n");
                goto out;
            }

            break;
        case RDMA_CONTROL_REGISTER_REQUEST:
            DDPRINTF("There are %d registration requests\n", head.repeat);

            reg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                uint64_t chunk;
                uint8_t *chunk_start, *chunk_end;

                reg = &registers[count];
                network_to_register(reg);

                reg_result = &results[count];

                DDPRINTF("Registration request (%d): index %d, current_addr %"
                         PRIu64 " chunks: %" PRIu64 "\n", count,
                         reg->current_index, reg->key.current_addr, reg->chunks);

                block = &(rdma->local_ram_blocks.block[reg->current_index]);
                if (block->is_ram_block) {
                    host_addr = (block->local_host_addr +
                                (reg->key.current_addr - block->offset));
                    chunk = ram_chunk_index(block->local_host_addr,
                                            (uint8_t *) host_addr);
                } else {
                    chunk = reg->key.chunk;
                    host_addr = block->local_host_addr +
                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
                }
                chunk_start = ram_chunk_start(block, chunk);
                chunk_end = ram_chunk_end(block, chunk + reg->chunks);
                if (qemu_rdma_register_and_get_keys(rdma, block,
                            (uint8_t *)host_addr, NULL, &reg_result->rkey,
                            chunk, chunk_start, chunk_end)) {
                    fprintf(stderr, "cannot get rkey!\n");
                    ret = -EINVAL;
                    goto out;
                }

                reg_result->host_addr = (uint64_t) block->local_host_addr;

                DDPRINTF("Registered rkey for this request: %x\n",
                                reg_result->rkey);

                result_to_network(reg_result);
            }

            ret = qemu_rdma_post_send_control(rdma,
                            (uint8_t *) results, &reg_resp);

            if (ret < 0) {
                fprintf(stderr, "Failed to send control buffer!\n");
                goto out;
            }
            break;
        case RDMA_CONTROL_UNREGISTER_REQUEST:
            DDPRINTF("There are %d unregistration requests\n", head.repeat);
            unreg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                reg = &registers[count];
                network_to_register(reg);

                DDPRINTF("Unregistration request (%d): "
                         " index %d, chunk %" PRIu64 "\n",
                         count, reg->current_index, reg->key.chunk);

                block = &(rdma->local_ram_blocks.block[reg->current_index]);

                ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
                block->pmr[reg->key.chunk] = NULL;

                if (ret != 0) {
                    perror("rdma unregistration chunk failed");
                    ret = -ret;
                    goto out;
                }

                rdma->total_registrations--;

                DDPRINTF("Unregistered chunk %" PRIu64 " successfully.\n",
                            reg->key.chunk);
            }

            ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);

            if (ret < 0) {
                fprintf(stderr, "Failed to send control buffer!\n");
                goto out;
            }
            break;
        case RDMA_CONTROL_REGISTER_RESULT:
            fprintf(stderr, "Invalid RESULT message at dest.\n");
            ret = -EIO;
            goto out;
        default:
            fprintf(stderr, "Unknown control message %s\n",
                            control_desc[head.type]);
            ret = -EIO;
            goto out;
        }
    } while (1);
out:
    if (ret < 0) {
        rdma->error_state = ret;
    }
    return ret;
}

static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
                                        uint64_t flags)
{
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;

    CHECK_ERROR_STATE();

    DDDPRINTF("start section: %" PRIu64 "\n", flags);
    qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
    qemu_fflush(f);

    return 0;
}

/*
 * Inform dest that dynamic registrations are done for now.
 * First, flush writes, if any.
 */
static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
                                       uint64_t flags)
{
    Error *local_err = NULL, **errp = &local_err;
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;
    RDMAControlHeader head = { .len = 0, .repeat = 1 };
    int ret = 0;

    CHECK_ERROR_STATE();

    qemu_fflush(f);
    ret = qemu_rdma_drain_cq(f, rdma);

    if (ret < 0) {
        goto err;
    }

    if (flags == RAM_CONTROL_SETUP) {
        RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
        RDMALocalBlocks *local = &rdma->local_ram_blocks;
        int reg_result_idx, i, j, nb_remote_blocks;

        head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
        DPRINTF("Sending registration setup for ram blocks...\n");

        /*
         * Make sure that we parallelize the pinning on both sides.
         * For very large guests, doing this serially takes a really
         * long time, so we have to 'interleave' the pinning locally
         * with the control messages by performing the pinning on this
         * side before we receive the control response from the other
         * side that the pinning has completed.
         */
        ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
                    &reg_result_idx, rdma->pin_all ?
                    qemu_rdma_reg_whole_ram_blocks : NULL);
        if (ret < 0) {
            ERROR(errp, "receiving remote info!");
            return ret;
        }

        nb_remote_blocks = resp.len / sizeof(RDMARemoteBlock);

        /*
         * The protocol uses two different sets of rkeys (mutually exclusive):
         * 1. One key to represent the virtual address of the entire ram block.
         *    (dynamic chunk registration disabled - pin everything with one rkey.)
         * 2. One to represent individual chunks within a ram block.
         *    (dynamic chunk registration enabled - pin individual chunks.)
         *
         * Once the capability is successfully negotiated, the destination transmits
         * the keys to use (or sends them later) including the virtual addresses
         * and then propagates the remote ram block descriptions to its local copy.
         */

        if (local->nb_blocks != nb_remote_blocks) {
            ERROR(errp, "ram blocks mismatch #1! "
                        "Your QEMU command line parameters are probably "
                        "not identical on both the source and destination.");
            return -EINVAL;
        }

        qemu_rdma_move_header(rdma, reg_result_idx, &resp);
        memcpy(rdma->block,
            rdma->wr_data[reg_result_idx].control_curr, resp.len);
        for (i = 0; i < nb_remote_blocks; i++) {
            network_to_remote_block(&rdma->block[i]);

            /* search local ram blocks */
            for (j = 0; j < local->nb_blocks; j++) {
                if (rdma->block[i].offset != local->block[j].offset) {
                    continue;
                }

                if (rdma->block[i].length != local->block[j].length) {
                    ERROR(errp, "ram blocks mismatch #2! "
                        "Your QEMU command line parameters are probably "
                        "not identical on both the source and destination.");
                    return -EINVAL;
                }
                local->block[j].remote_host_addr =
                        rdma->block[i].remote_host_addr;
                local->block[j].remote_rkey = rdma->block[i].remote_rkey;
                break;
            }

            if (j >= local->nb_blocks) {
                ERROR(errp, "ram blocks mismatch #3! "
                            "Your QEMU command line parameters are probably "
                            "not identical on both the source and destination.");
                return -EINVAL;
            }
        }
    }

    DDDPRINTF("Sending registration finish %" PRIu64 "...\n", flags);

    head.type = RDMA_CONTROL_REGISTER_FINISHED;
    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);

    if (ret < 0) {
        goto err;
    }

    return 0;
err:
    rdma->error_state = ret;
    return ret;
}

static int qemu_rdma_get_fd(void *opaque)
{
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;

    return rdma->comp_channel->fd;
}

const QEMUFileOps rdma_read_ops = {
    .get_buffer = qemu_rdma_get_buffer,
    .get_fd = qemu_rdma_get_fd,
    .close = qemu_rdma_close,
    .hook_ram_load = qemu_rdma_registration_handle,
};

const QEMUFileOps rdma_write_ops = {
    .put_buffer = qemu_rdma_put_buffer,
    .close = qemu_rdma_close,
    .before_ram_iterate = qemu_rdma_registration_start,
    .after_ram_iterate = qemu_rdma_registration_stop,
    .save_page = qemu_rdma_save_page,
};

static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
{
    QEMUFileRDMA *r;

    if (qemu_file_mode_is_not_valid(mode)) {
        return NULL;
    }

    r = g_malloc0(sizeof(QEMUFileRDMA));
    r->rdma = rdma;

    if (mode[0] == 'w') {
        r->file = qemu_fopen_ops(r, &rdma_write_ops);
    } else {
        r->file = qemu_fopen_ops(r, &rdma_read_ops);
    }

    return r->file;
}

static void rdma_accept_incoming_migration(void *opaque)
{
    RDMAContext *rdma = opaque;
    int ret;
    QEMUFile *f;
    Error *local_err = NULL, **errp = &local_err;

    DPRINTF("Accepting rdma connection...\n");
    ret = qemu_rdma_accept(rdma);

    if (ret) {
        ERROR(errp, "RDMA Migration initialization failed!");
        return;
    }

    DPRINTF("Accepted migration\n");

    f = qemu_fopen_rdma(rdma, "rb");
    if (f == NULL) {
        ERROR(errp, "could not qemu_fopen_rdma!");
        qemu_rdma_cleanup(rdma);
        return;
    }

    rdma->migration_started_on_destination = 1;
    process_incoming_migration(f);
}

void rdma_start_incoming_migration(const char *host_port, Error **errp)
{
    int ret;
    RDMAContext *rdma;
    Error *local_err = NULL;

    DPRINTF("Starting RDMA-based incoming migration\n");
    rdma = qemu_rdma_data_init(host_port, &local_err);

    if (rdma == NULL) {
        goto err;
    }

    ret = qemu_rdma_dest_init(rdma, &local_err);

    if (ret) {
        goto err;
    }

    DPRINTF("qemu_rdma_dest_init success\n");

    ret = rdma_listen(rdma->listen_id, 5);

    if (ret) {
        ERROR(errp, "listening on socket!");
        goto err;
    }

    DPRINTF("rdma_listen success\n");

    qemu_set_fd_handler2(rdma->channel->fd, NULL,
                         rdma_accept_incoming_migration, NULL,
                         (void *)(intptr_t) rdma);
    return;
err:
    error_propagate(errp, local_err);
    g_free(rdma);
}

void rdma_start_outgoing_migration(void *opaque,
                            const char *host_port, Error **errp)
{
    MigrationState *s = opaque;
    Error *local_err = NULL, **temp = &local_err;
    RDMAContext *rdma = qemu_rdma_data_init(host_port, &local_err);
    int ret = 0;

    if (rdma == NULL) {
        ERROR(temp, "Failed to initialize RDMA data structures!");
        goto err;
    }

    ret = qemu_rdma_source_init(rdma, &local_err,
        s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]);

    if (ret) {
        goto err;
    }

    DPRINTF("qemu_rdma_source_init success\n");
    ret = qemu_rdma_connect(rdma, &local_err);

    if (ret) {
        goto err;
    }

    DPRINTF("qemu_rdma_source_connect success\n");

    s->file = qemu_fopen_rdma(rdma, "wb");
    migrate_fd_connect(s);
    return;
err:
    error_propagate(errp, local_err);
    g_free(rdma);
    migrate_fd_error(s);
}