root / docs / migration.txt @ 81a97d9d
History | View | Annotate | Download (11.6 kB)
1 | f58ae59c | Juan Quintela | = Migration = |
---|---|---|---|
2 | f58ae59c | Juan Quintela | |
3 | f58ae59c | Juan Quintela | QEMU has code to load/save the state of the guest that it is running. |
4 | f58ae59c | Juan Quintela | These are two complementary operations. Saving the state just does |
5 | f58ae59c | Juan Quintela | that, saves the state for each device that the guest is running. |
6 | f58ae59c | Juan Quintela | Restoring a guest is just the opposite operation: we need to load the |
7 | f58ae59c | Juan Quintela | state of each device. |
8 | f58ae59c | Juan Quintela | |
9 | f58ae59c | Juan Quintela | For this to work, QEMU has to be launched with the same arguments |
10 | f58ae59c | Juan Quintela | both times. I.e. it can only restore the state in a guest that has |
11 | f58ae59c | Juan Quintela | the same devices as the one it was saved from (this last requirement can |
12 | f58ae59c | Juan Quintela | be relaxed a bit, but for now we can consider that the configuration has |
13 | f58ae59c | Juan Quintela | to be exactly the same). |
14 | f58ae59c | Juan Quintela | |
15 | f58ae59c | Juan Quintela | Once we are able to save/restore a guest, a new functionality is |
16 | f58ae59c | Juan Quintela | requested: migration. This means that QEMU is able to start on one |
17 | f58ae59c | Juan Quintela | machine and be "migrated" to another machine. I.e. it is moved to |
18 | f58ae59c | Juan Quintela | another machine. |
19 | f58ae59c | Juan Quintela | |
20 | f58ae59c | Juan Quintela | Next was the "live migration" functionality. This is important |
21 | f58ae59c | Juan Quintela | because some guests run with a lot of state (especially RAM), and it |
22 | f58ae59c | Juan Quintela | can take a while to move all state from one machine to another. Live |
23 | f58ae59c | Juan Quintela | migration allows the guest to continue running while the state is |
24 | f58ae59c | Juan Quintela | transferred. The guest only has to be stopped while the last part of |
25 | f58ae59c | Juan Quintela | the state is transferred. Typically the time that the guest is |
26 | f58ae59c | Juan Quintela | unresponsive during live migration is in the low hundreds of milliseconds |
27 | f58ae59c | Juan Quintela | (notice that this depends on a lot of things). |
28 | f58ae59c | Juan Quintela | |
29 | f58ae59c | Juan Quintela | === Types of migration === |
30 | f58ae59c | Juan Quintela | |
31 | f58ae59c | Juan Quintela | Now that we have talked about live migration, there are several ways |
32 | f58ae59c | Juan Quintela | to do migration: |
33 | f58ae59c | Juan Quintela | |
34 | f58ae59c | Juan Quintela | - tcp migration: do the migration using tcp sockets |
35 | f58ae59c | Juan Quintela | - unix migration: do the migration using unix sockets |
36 | f58ae59c | Juan Quintela | - exec migration: do the migration using the stdin/stdout through a process. |
37 | f58ae59c | Juan Quintela | - fd migration: do the migration using a file descriptor that is |
38 | f58ae59c | Juan Quintela | passed to QEMU. QEMU doesn't care how this file descriptor is opened. |
39 | f58ae59c | Juan Quintela | |
40 | f58ae59c | Juan Quintela | All four of these migration protocols use the same infrastructure to |
41 | f58ae59c | Juan Quintela | save/restore device state. This infrastructure is shared with the |
42 | f58ae59c | Juan Quintela | savevm/loadvm functionality. |
43 | f58ae59c | Juan Quintela | |
44 | f58ae59c | Juan Quintela | === State Live Migration === |
45 | f58ae59c | Juan Quintela | |
46 | f58ae59c | Juan Quintela | This is used for RAM and block devices. It is not yet ported to vmstate. |
47 | f58ae59c | Juan Quintela | <Fill more information here> |
48 | f58ae59c | Juan Quintela | |
49 | f58ae59c | Juan Quintela | === What is the common infrastructure === |
50 | f58ae59c | Juan Quintela | |
51 | f58ae59c | Juan Quintela | QEMU uses a QEMUFile abstraction to be able to do migration. Any type |
52 | f58ae59c | Juan Quintela | of migration that wants to use the QEMU infrastructure has to create a |
53 | f58ae59c | Juan Quintela | QEMUFile with: |
54 | f58ae59c | Juan Quintela | |
55 | f58ae59c | Juan Quintela | QEMUFile *qemu_fopen_ops(void *opaque, |
56 | f58ae59c | Juan Quintela | QEMUFilePutBufferFunc *put_buffer, |
57 | f58ae59c | Juan Quintela | QEMUFileGetBufferFunc *get_buffer, |
58 | f58ae59c | Juan Quintela | QEMUFileCloseFunc *close, |
59 | f58ae59c | Juan Quintela | QEMUFileRateLimit *rate_limit, |
60 | f58ae59c | Juan Quintela | QEMUFileSetRateLimit *set_rate_limit, |
61 | f58ae59c | Juan Quintela | QEMUFileGetRateLimit *get_rate_limit); |
62 | f58ae59c | Juan Quintela | |
63 | f58ae59c | Juan Quintela | The functions have the following functionality: |
64 | f58ae59c | Juan Quintela | |
65 | f58ae59c | Juan Quintela | This function writes a chunk of data to a file at the given position. |
66 | f58ae59c | Juan Quintela | The pos argument can be ignored if the file is only being used for |
67 | f58ae59c | Juan Quintela | streaming. The handler should try to write all of the data it can. |
68 | f58ae59c | Juan Quintela | |
69 | f58ae59c | Juan Quintela | typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf, |
70 | f58ae59c | Juan Quintela | int64_t pos, int size); |
71 | f58ae59c | Juan Quintela | |
72 | f58ae59c | Juan Quintela | Read a chunk of data from a file at the given position. The pos argument |
73 | f58ae59c | Juan Quintela | can be ignored if the file is only being used for streaming. The number of |
74 | f58ae59c | Juan Quintela | bytes actually read should be returned. |
75 | f58ae59c | Juan Quintela | |
76 | f58ae59c | Juan Quintela | typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf, |
77 | f58ae59c | Juan Quintela | int64_t pos, int size); |
78 | f58ae59c | Juan Quintela | |
79 | f58ae59c | Juan Quintela | Close a file and return an error code |
80 | f58ae59c | Juan Quintela | |
81 | f58ae59c | Juan Quintela | typedef int (QEMUFileCloseFunc)(void *opaque); |
82 | f58ae59c | Juan Quintela | |
83 | f58ae59c | Juan Quintela | Called to determine if the file has exceeded its bandwidth allocation. The |
84 | f58ae59c | Juan Quintela | bandwidth capping is a soft limit, not a hard limit. |
85 | f58ae59c | Juan Quintela | |
86 | f58ae59c | Juan Quintela | typedef int (QEMUFileRateLimit)(void *opaque); |
87 | f58ae59c | Juan Quintela | |
88 | f58ae59c | Juan Quintela | Called to change the current bandwidth allocation. This function must return |
89 | f58ae59c | Juan Quintela | the new actual bandwidth. It should be new_rate if everything goes OK, and |
90 | f58ae59c | Juan Quintela | the old rate otherwise |
91 | f58ae59c | Juan Quintela | |
92 | f58ae59c | Juan Quintela | typedef size_t (QEMUFileSetRateLimit)(void *opaque, size_t new_rate); |
93 | f58ae59c | Juan Quintela | typedef size_t (QEMUFileGetRateLimit)(void *opaque); |
94 | f58ae59c | Juan Quintela | |
95 | f58ae59c | Juan Quintela | You can use any internal state that you need using the opaque void * |
96 | f58ae59c | Juan Quintela | pointer that is passed to all functions. |
97 | f58ae59c | Juan Quintela | |
98 | f58ae59c | Juan Quintela | The rate limiting functions are used to limit the bandwidth used by |
99 | f58ae59c | Juan Quintela | QEMU migration. |
100 | f58ae59c | Juan Quintela | |
101 | f58ae59c | Juan Quintela | The important functions for us are put_buffer()/get_buffer(), which |
102 | f58ae59c | Juan Quintela | allow us to write/read a buffer to/from the QEMUFile. |
103 | f58ae59c | Juan Quintela | |
104 | f58ae59c | Juan Quintela | === How to save the state of one device === |
105 | f58ae59c | Juan Quintela | |
106 | f58ae59c | Juan Quintela | The state of a device is saved using intermediate buffers. There are |
107 | f58ae59c | Juan Quintela | some helper functions to assist this saving. |
108 | f58ae59c | Juan Quintela | |
109 | f58ae59c | Juan Quintela | There is a new concept that we have to explain here: device state |
110 | f58ae59c | Juan Quintela | version. When we migrate a device, we save/load the state as a series |
111 | f58ae59c | Juan Quintela | of fields. Sometimes, due to bugs or new functionality, we need to |
112 | f58ae59c | Juan Quintela | change the state to store more/different information. We use the |
113 | f58ae59c | Juan Quintela | version to identify each time that we make a change. Each version is |
114 | f58ae59c | Juan Quintela | associated with a series of fields saved. The save_state always saves |
115 | f58ae59c | Juan Quintela | the state as the newest version. But load_state is sometimes able to |
116 | f58ae59c | Juan Quintela | load state from an older version. |
117 | f58ae59c | Juan Quintela | |
118 | f58ae59c | Juan Quintela | === Legacy way === |
119 | f58ae59c | Juan Quintela | |
120 | f58ae59c | Juan Quintela | This way is going to disappear as soon as all current users are ported to VMSTATE. |
121 | f58ae59c | Juan Quintela | |
122 | f58ae59c | Juan Quintela | Each device has to register two functions, one to save the state and |
123 | f58ae59c | Juan Quintela | another to load the state back. |
124 | f58ae59c | Juan Quintela | |
125 | f58ae59c | Juan Quintela | int register_savevm(DeviceState *dev, |
126 | f58ae59c | Juan Quintela | const char *idstr, |
127 | f58ae59c | Juan Quintela | int instance_id, |
128 | f58ae59c | Juan Quintela | int version_id, |
129 | f58ae59c | Juan Quintela | SaveStateHandler *save_state, |
130 | f58ae59c | Juan Quintela | LoadStateHandler *load_state, |
131 | f58ae59c | Juan Quintela | void *opaque); |
132 | f58ae59c | Juan Quintela | |
133 | f58ae59c | Juan Quintela | typedef void SaveStateHandler(QEMUFile *f, void *opaque); |
134 | f58ae59c | Juan Quintela | typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id); |
135 | f58ae59c | Juan Quintela | |
136 | f58ae59c | Juan Quintela | The important functions for the device state format are the save_state |
137 | f58ae59c | Juan Quintela | and load_state. Notice that load_state receives a version_id |
138 | f58ae59c | Juan Quintela | parameter to know what state format it is receiving. save_state doesn't |
139 | f58ae59c | Juan Quintela | have a version_id parameter because it always uses the latest version. |
140 | f58ae59c | Juan Quintela | |
141 | f58ae59c | Juan Quintela | === VMState === |
142 | f58ae59c | Juan Quintela | |
143 | f58ae59c | Juan Quintela | The legacy way of saving/loading state of the device had the problem |
144 | f58ae59c | Juan Quintela | that we had to maintain two functions in sync. If we made a change |
145 | f58ae59c | Juan Quintela | in one of them and not in the other, we got a failed migration. |
146 | f58ae59c | Juan Quintela | |
147 | f58ae59c | Juan Quintela | VMState changed the way that state is saved/loaded. Instead of using |
148 | f58ae59c | Juan Quintela | a function to save the state and another to load it, it was changed to |
149 | f58ae59c | Juan Quintela | a declarative way of what the state consisted of. Now VMState is able |
150 | f58ae59c | Juan Quintela | to interpret that definition to be able to load/save the state. As |
151 | f58ae59c | Juan Quintela | the state is declared only once, it can't go out of sync in the |
152 | f58ae59c | Juan Quintela | save/load functions. |
153 | f58ae59c | Juan Quintela | |
154 | f58ae59c | Juan Quintela | An example (from hw/pckbd.c) |
155 | f58ae59c | Juan Quintela | |
156 | f58ae59c | Juan Quintela | static const VMStateDescription vmstate_kbd = { |
157 | f58ae59c | Juan Quintela | .name = "pckbd", |
158 | f58ae59c | Juan Quintela | .version_id = 3, |
159 | f58ae59c | Juan Quintela | .minimum_version_id = 3, |
160 | f58ae59c | Juan Quintela | .minimum_version_id_old = 3, |
161 | f58ae59c | Juan Quintela | .fields = (VMStateField []) { |
162 | f58ae59c | Juan Quintela | VMSTATE_UINT8(write_cmd, KBDState), |
163 | f58ae59c | Juan Quintela | VMSTATE_UINT8(status, KBDState), |
164 | f58ae59c | Juan Quintela | VMSTATE_UINT8(mode, KBDState), |
165 | f58ae59c | Juan Quintela | VMSTATE_UINT8(pending, KBDState), |
166 | f58ae59c | Juan Quintela | VMSTATE_END_OF_LIST() |
167 | f58ae59c | Juan Quintela | } |
168 | f58ae59c | Juan Quintela | }; |
169 | f58ae59c | Juan Quintela | |
170 | f58ae59c | Juan Quintela | We are declaring the state with name "pckbd". |
171 | f58ae59c | Juan Quintela | The version_id is 3, and the fields are 4 uint8_t in a KBDState structure. |
172 | f58ae59c | Juan Quintela | We registered this with: |
173 | f58ae59c | Juan Quintela | |
174 | f58ae59c | Juan Quintela | vmstate_register(NULL, 0, &vmstate_kbd, s); |
175 | f58ae59c | Juan Quintela | |
176 | f58ae59c | Juan Quintela | Note: talk about how vmstate <-> qdev interact, and what the instance id's mean. |
177 | f58ae59c | Juan Quintela | |
178 | f58ae59c | Juan Quintela | You can search for VMSTATE_* macros for lots of types used in QEMU in |
179 | f58ae59c | Juan Quintela | hw/hw.h. |
180 | f58ae59c | Juan Quintela | |
181 | f58ae59c | Juan Quintela | === More about versions === |
182 | f58ae59c | Juan Quintela | |
183 | f58ae59c | Juan Quintela | You can see that there are several version fields: |
184 | f58ae59c | Juan Quintela | |
185 | f58ae59c | Juan Quintela | - version_id: the maximum version_id supported by VMState for that device |
186 | f58ae59c | Juan Quintela | - minimum_version_id: the minimum version_id that VMState is able to understand |
187 | f58ae59c | Juan Quintela | for that device. |
188 | f58ae59c | Juan Quintela | - minimum_version_id_old: For devices that were not able to port to vmstate, we can |
189 | f58ae59c | Juan Quintela | assign a function that knows how to read this old state. |
190 | f58ae59c | Juan Quintela | |
191 | f58ae59c | Juan Quintela | So, VMState is able to read versions from minimum_version_id to |
192 | f58ae59c | Juan Quintela | version_id. And the function load_state_old() is able to load state |
193 | f58ae59c | Juan Quintela | from minimum_version_id_old to minimum_version_id. This function is |
194 | f58ae59c | Juan Quintela | deprecated and will be removed when no more users are left. |
195 | f58ae59c | Juan Quintela | |
196 | f58ae59c | Juan Quintela | === Massaging functions === |
197 | f58ae59c | Juan Quintela | |
198 | f58ae59c | Juan Quintela | Sometimes, it is not enough to be able to save the state directly |
199 | f58ae59c | Juan Quintela | from one structure; we need to fill the correct values there. One |
200 | f58ae59c | Juan Quintela | example is when we are using kvm. Before saving the cpu state, we |
201 | f58ae59c | Juan Quintela | need to ask kvm to copy to QEMU the state that it is using. And the |
202 | f58ae59c | Juan Quintela | opposite when we are loading the state, we need a way to tell kvm to |
203 | f58ae59c | Juan Quintela | load the state for the cpu that we have just loaded from the QEMUFile. |
204 | f58ae59c | Juan Quintela | |
205 | f58ae59c | Juan Quintela | The functions to do that are inside a vmstate definition, and are called: |
206 | f58ae59c | Juan Quintela | |
207 | f58ae59c | Juan Quintela | - int (*pre_load)(void *opaque); |
208 | f58ae59c | Juan Quintela | |
209 | f58ae59c | Juan Quintela | This function is called before we load the state of one device. |
210 | f58ae59c | Juan Quintela | |
211 | f58ae59c | Juan Quintela | - int (*post_load)(void *opaque, int version_id); |
212 | f58ae59c | Juan Quintela | |
213 | f58ae59c | Juan Quintela | This function is called after we load the state of one device. |
214 | f58ae59c | Juan Quintela | |
215 | f58ae59c | Juan Quintela | - void (*pre_save)(void *opaque); |
216 | f58ae59c | Juan Quintela | |
217 | f58ae59c | Juan Quintela | This function is called before we save the state of one device. |
218 | f58ae59c | Juan Quintela | |
219 | f58ae59c | Juan Quintela | Example: You can look at hpet.c, that uses the three functions to |
220 | f58ae59c | Juan Quintela | massage the state that is transferred. |
221 | f58ae59c | Juan Quintela | |
222 | f58ae59c | Juan Quintela | === Subsections === |
223 | f58ae59c | Juan Quintela | |
224 | f58ae59c | Juan Quintela | The use of version_id allows us to migrate from older versions |
225 | f58ae59c | Juan Quintela | to newer versions of a device. But not the other way around. This |
226 | f58ae59c | Juan Quintela | makes it very complicated to fix bugs in stable branches. If we need to |
227 | f58ae59c | Juan Quintela | add anything to the state to fix a bug, we have to disable migration |
228 | f58ae59c | Juan Quintela | to older versions that don't have that bug-fix (i.e. a new field). |
229 | f58ae59c | Juan Quintela | |
230 | f58ae59c | Juan Quintela | But sometimes that bug-fix is only needed in some cases, not always. For |
231 | f58ae59c | Juan Quintela | instance, if the device is in the middle of a DMA operation, it is |
232 | f58ae59c | Juan Quintela | using a specific functionality, .... |
233 | f58ae59c | Juan Quintela | |
234 | f58ae59c | Juan Quintela | It is impossible to create a way to make migration from any version to |
235 | f58ae59c | Juan Quintela | any other version work. But we can do better than only allowing |
236 | f58ae59c | Juan Quintela | migration from older versions to newer ones. For those fields that are |
237 | f58ae59c | Juan Quintela | only needed sometimes, we add the idea of subsections. A subsection |
238 | f58ae59c | Juan Quintela | is "like" a device vmstate, but with a particularity, it has a Boolean |
239 | f58ae59c | Juan Quintela | function that tells if those values need to be sent or not. If |
240 | f58ae59c | Juan Quintela | this function returns false, the subsection is not sent. |
241 | f58ae59c | Juan Quintela | |
242 | f58ae59c | Juan Quintela | On the receiving side, if we find a subsection for a device that we |
243 | f58ae59c | Juan Quintela | don't understand, we just fail the migration. If we understand all |
244 | f58ae59c | Juan Quintela | the subsections, then we load the state with success. |
245 | f58ae59c | Juan Quintela | |
246 | f58ae59c | Juan Quintela | One important note is that the post_load() function is called "after" |
247 | f58ae59c | Juan Quintela | loading all subsections, because a newer subsection could change the same |
248 | f58ae59c | Juan Quintela | value that it uses. |
249 | f58ae59c | Juan Quintela | |
250 | f58ae59c | Juan Quintela | Example: |
251 | f58ae59c | Juan Quintela | |
252 | f58ae59c | Juan Quintela | static bool ide_drive_pio_state_needed(void *opaque) |
253 | f58ae59c | Juan Quintela | { |
254 | f58ae59c | Juan Quintela | IDEState *s = opaque; |
255 | f58ae59c | Juan Quintela | |
256 | f58ae59c | Juan Quintela | return (s->status & DRQ_STAT) != 0; |
257 | f58ae59c | Juan Quintela | } |
258 | f58ae59c | Juan Quintela | |
259 | f58ae59c | Juan Quintela | const VMStateDescription vmstate_ide_drive_pio_state = { |
260 | f58ae59c | Juan Quintela | .name = "ide_drive/pio_state", |
261 | f58ae59c | Juan Quintela | .version_id = 1, |
262 | f58ae59c | Juan Quintela | .minimum_version_id = 1, |
263 | f58ae59c | Juan Quintela | .minimum_version_id_old = 1, |
264 | f58ae59c | Juan Quintela | .pre_save = ide_drive_pio_pre_save, |
265 | f58ae59c | Juan Quintela | .post_load = ide_drive_pio_post_load, |
266 | f58ae59c | Juan Quintela | .fields = (VMStateField []) { |
267 | f58ae59c | Juan Quintela | VMSTATE_INT32(req_nb_sectors, IDEState), |
268 | f58ae59c | Juan Quintela | VMSTATE_VARRAY_INT32(io_buffer, IDEState, io_buffer_total_len, 1, |
269 | f58ae59c | Juan Quintela | vmstate_info_uint8, uint8_t), |
270 | f58ae59c | Juan Quintela | VMSTATE_INT32(cur_io_buffer_offset, IDEState), |
271 | f58ae59c | Juan Quintela | VMSTATE_INT32(cur_io_buffer_len, IDEState), |
272 | f58ae59c | Juan Quintela | VMSTATE_UINT8(end_transfer_fn_idx, IDEState), |
273 | f58ae59c | Juan Quintela | VMSTATE_INT32(elementary_transfer_size, IDEState), |
274 | f58ae59c | Juan Quintela | VMSTATE_INT32(packet_transfer_size, IDEState), |
275 | f58ae59c | Juan Quintela | VMSTATE_END_OF_LIST() |
276 | f58ae59c | Juan Quintela | } |
277 | f58ae59c | Juan Quintela | }; |
278 | f58ae59c | Juan Quintela | |
279 | f58ae59c | Juan Quintela | const VMStateDescription vmstate_ide_drive = { |
280 | f58ae59c | Juan Quintela | .name = "ide_drive", |
281 | f58ae59c | Juan Quintela | .version_id = 3, |
282 | f58ae59c | Juan Quintela | .minimum_version_id = 0, |
283 | f58ae59c | Juan Quintela | .minimum_version_id_old = 0, |
284 | f58ae59c | Juan Quintela | .post_load = ide_drive_post_load, |
285 | f58ae59c | Juan Quintela | .fields = (VMStateField []) { |
286 | f58ae59c | Juan Quintela | .... several fields .... |
287 | f58ae59c | Juan Quintela | VMSTATE_END_OF_LIST() |
288 | f58ae59c | Juan Quintela | }, |
289 | f58ae59c | Juan Quintela | .subsections = (VMStateSubsection []) { |
290 | f58ae59c | Juan Quintela | { |
291 | f58ae59c | Juan Quintela | .vmsd = &vmstate_ide_drive_pio_state, |
292 | f58ae59c | Juan Quintela | .needed = ide_drive_pio_state_needed, |
293 | f58ae59c | Juan Quintela | }, { |
294 | f58ae59c | Juan Quintela | /* empty */ |
295 | f58ae59c | Juan Quintela | } |
296 | f58ae59c | Juan Quintela | } |
297 | f58ae59c | Juan Quintela | }; |
298 | f58ae59c | Juan Quintela | |
299 | f58ae59c | Juan Quintela | Here we have a subsection for the pio state. We only need to |
300 | f58ae59c | Juan Quintela | save/send this state when we are in the middle of a pio operation |
301 | f58ae59c | Juan Quintela | (that is what ide_drive_pio_state_needed() checks). If DRQ_STAT is |
302 | f58ae59c | Juan Quintela | not enabled, the values in those fields are garbage and don't need to |
303 | f58ae59c | Juan Quintela | be sent. |