3 | f58ae59c | Juan Quintela | QEMU has code to load/save the state of the guest that it is running. |

4 | dda5336e | Stefan Weil | These are two complementary operations. Saving the state just does |

5 | f58ae59c | Juan Quintela | that, saves the state for each device that the guest is running. |

6 | f58ae59c | Juan Quintela | Restoring a guest is just the opposite operation: we need to load the |

7 | f58ae59c | Juan Quintela | state of each device. |

8 | f58ae59c | Juan Quintela | |

9 | dda5336e | Stefan Weil | For this to work, QEMU has to be launched with the same arguments the |

10 | f58ae59c | Juan Quintela | two times. I.e. it can only restore the state in one guest that has |

11 | f58ae59c | Juan Quintela | the same devices as the one it was saved from (this last requirement can |

12 | dda5336e | Stefan Weil | be relaxed a bit, but for now we can consider that configuration has |

13 | f58ae59c | Juan Quintela | to be exactly the same). |

14 | f58ae59c | Juan Quintela | |

15 | f58ae59c | Juan Quintela | Once we are able to save/restore a guest, a new functionality is |

16 | f58ae59c | Juan Quintela | requested: migration. This means that QEMU is able to start in one |

17 | dda5336e | Stefan Weil | machine and be "migrated" to another machine. I.e. be moved to |

18 | dda5336e | Stefan Weil | another machine. |

19 | f58ae59c | Juan Quintela | |

20 | f58ae59c | Juan Quintela | Next was the "live migration" functionality. This is important |

21 | f58ae59c | Juan Quintela | because some guests run with a lot of state (especially RAM), and it |

22 | f58ae59c | Juan Quintela | can take a while to move all state from one machine to another. Live |

23 | f58ae59c | Juan Quintela | migration allows the guest to continue running while the state is |

24 | f58ae59c | Juan Quintela | transferred. Only while the last part of the state is transferred has |

25 | f58ae59c | Juan Quintela | the guest to be stopped. Typically the time that the guest is |

26 | f58ae59c | Juan Quintela | unresponsive during live migration is in the low hundreds of milliseconds |

27 | dda5336e | Stefan Weil | (notice that this depends on a lot of things). |

28 | f58ae59c | Juan Quintela | |

29 | f58ae59c | Juan Quintela | === Types of migration === |

30 | f58ae59c | Juan Quintela | |

31 | f58ae59c | Juan Quintela | Now that we have talked about live migration, there are several ways |

32 | f58ae59c | Juan Quintela | to do migration: |

33 | f58ae59c | Juan Quintela | |

34 | f58ae59c | Juan Quintela | - tcp migration: do the migration using tcp sockets |

35 | f58ae59c | Juan Quintela | - unix migration: do the migration using unix sockets |

36 | f58ae59c | Juan Quintela | - exec migration: do the migration using the stdin/stdout through a process. |

37 | f58ae59c | Juan Quintela | - fd migration: do the migration using a file descriptor that is |

38 | dda5336e | Stefan Weil | passed to QEMU. QEMU doesn't care how this file descriptor is opened. |

39 | f58ae59c | Juan Quintela | |

40 | dda5336e | Stefan Weil | All these four migration protocols use the same infrastructure to |

41 | f58ae59c | Juan Quintela | save/restore state devices. This infrastructure is shared with the |

42 | f58ae59c | Juan Quintela | savevm/loadvm functionality. |

43 | f58ae59c | Juan Quintela | |

44 | f58ae59c | Juan Quintela | === State Live Migration === |

45 | f58ae59c | Juan Quintela | |

46 | f58ae59c | Juan Quintela | This is used for RAM and block devices. It is not yet ported to vmstate. |

47 | f58ae59c | Juan Quintela | <Fill more information here> |

48 | f58ae59c | Juan Quintela | |

49 | f58ae59c | Juan Quintela | === What is the common infrastructure === |

50 | f58ae59c | Juan Quintela | |

51 | f58ae59c | Juan Quintela | QEMU uses a QEMUFile abstraction to be able to do migration. Any type |

52 | dda5336e | Stefan Weil | of migration that wants to use QEMU infrastructure has to create a |

53 | f58ae59c | Juan Quintela | QEMUFile with: |

54 | f58ae59c | Juan Quintela | |

55 | f58ae59c | Juan Quintela | QEMUFile *qemu_fopen_ops(void *opaque, |

56 | dda5336e | Stefan Weil | QEMUFilePutBufferFunc *put_buffer, |

57 | f58ae59c | Juan Quintela | QEMUFileGetBufferFunc *get_buffer, |

58 | f58ae59c | Juan Quintela | QEMUFileCloseFunc *close, |

59 | f58ae59c | Juan Quintela | QEMUFileRateLimit *rate_limit, |

60 | f58ae59c | Juan Quintela | QEMUFileSetRateLimit *set_rate_limit, |

61 | dda5336e | Stefan Weil | QEMUFileGetRateLimit *get_rate_limit); |

62 | f58ae59c | Juan Quintela | |

63 | f58ae59c | Juan Quintela | The functions have the following functionality: |

64 | f58ae59c | Juan Quintela | |

65 | f58ae59c | Juan Quintela | This function writes a chunk of data to a file at the given position. |

66 | dda5336e | Stefan Weil | The pos argument can be ignored if the file is only used for |

67 | f58ae59c | Juan Quintela | streaming. The handler should try to write all of the data it can. |

68 | f58ae59c | Juan Quintela | |

69 | f58ae59c | Juan Quintela | typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf, |

70 | f58ae59c | Juan Quintela | int64_t pos, int size); |

71 | f58ae59c | Juan Quintela | |

72 | f58ae59c | Juan Quintela | Read a chunk of data from a file at the given position. The pos argument |

73 | f58ae59c | Juan Quintela | can be ignored if the file is only used for streaming. The number of |

74 | f58ae59c | Juan Quintela | bytes actually read should be returned. |

75 | f58ae59c | Juan Quintela | |

76 | f58ae59c | Juan Quintela | typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf, |

77 | f58ae59c | Juan Quintela | int64_t pos, int size); |

78 | f58ae59c | Juan Quintela | |

79 | dda5336e | Stefan Weil | Close a file and return an error code. |

80 | f58ae59c | Juan Quintela | |

81 | f58ae59c | Juan Quintela | typedef int (QEMUFileCloseFunc)(void *opaque); |

82 | f58ae59c | Juan Quintela | |

83 | dda5336e | Stefan Weil | Called to determine if the file has exceeded its bandwidth allocation. The |

84 | f58ae59c | Juan Quintela | bandwidth capping is a soft limit, not a hard limit. |

85 | f58ae59c | Juan Quintela | |

86 | f58ae59c | Juan Quintela | typedef int (QEMUFileRateLimit)(void *opaque); |

87 | f58ae59c | Juan Quintela | |

88 | f58ae59c | Juan Quintela | Called to change the current bandwidth allocation. This function must return |

89 | f58ae59c | Juan Quintela | the new actual bandwidth. It should be new_rate if everything goes OK, and |

90 | dda5336e | Stefan Weil | the old rate otherwise. |

91 | f58ae59c | Juan Quintela | |

92 | f58ae59c | Juan Quintela | typedef size_t (QEMUFileSetRateLimit)(void *opaque, size_t new_rate); |

93 | f58ae59c | Juan Quintela | typedef size_t (QEMUFileGetRateLimit)(void *opaque); |

94 | f58ae59c | Juan Quintela | |

95 | f58ae59c | Juan Quintela | You can use any internal state that you need using the opaque void * |

96 | f58ae59c | Juan Quintela | pointer that is passed to all functions. |

97 | f58ae59c | Juan Quintela | |

98 | f58ae59c | Juan Quintela | The rate limiting functions are used to limit the bandwidth used by |

99 | f58ae59c | Juan Quintela | QEMU migration. |

100 | f58ae59c | Juan Quintela | |

101 | f58ae59c | Juan Quintela | The important functions for us are put_buffer()/get_buffer() that |

102 | f58ae59c | Juan Quintela | allow us to write/read a buffer into/from the QEMUFile. |

103 | f58ae59c | Juan Quintela | |

104 | f58ae59c | Juan Quintela | === How to save the state of one device === |

105 | f58ae59c | Juan Quintela | |

106 | f58ae59c | Juan Quintela | The state of a device is saved using intermediate buffers. There are |

107 | f58ae59c | Juan Quintela | some helper functions to assist this saving. |

108 | f58ae59c | Juan Quintela | |

109 | f58ae59c | Juan Quintela | There is a new concept that we have to explain here: device state |

110 | f58ae59c | Juan Quintela | version. When we migrate a device, we save/load the state as a series |

111 | f58ae59c | Juan Quintela | of fields. Sometimes, due to bugs or new functionality, we need to |

112 | f58ae59c | Juan Quintela | change the state to store more/different information. We use the |

113 | f58ae59c | Juan Quintela | version to identify each time that we do a change. Each version is |

114 | dda5336e | Stefan Weil | associated with a series of fields saved. The save_state always saves |

115 | dda5336e | Stefan Weil | the state as the newest version. But load_state sometimes is able to |

116 | f58ae59c | Juan Quintela | load state from an older version. |

117 | f58ae59c | Juan Quintela | |

118 | f58ae59c | Juan Quintela | === Legacy way === |

119 | f58ae59c | Juan Quintela | |

120 | f58ae59c | Juan Quintela | This way is going to disappear as soon as all current users are ported to VMSTATE. |

121 | f58ae59c | Juan Quintela | |

122 | f58ae59c | Juan Quintela | Each device has to register two functions, one to save the state and |

123 | f58ae59c | Juan Quintela | another to load the state back. |

124 | f58ae59c | Juan Quintela | |

125 | f58ae59c | Juan Quintela | int register_savevm(DeviceState *dev, |

126 | f58ae59c | Juan Quintela | const char *idstr, |

127 | f58ae59c | Juan Quintela | int instance_id, |

128 | f58ae59c | Juan Quintela | int version_id, |

129 | f58ae59c | Juan Quintela | SaveStateHandler *save_state, |

130 | f58ae59c | Juan Quintela | LoadStateHandler *load_state, |

131 | f58ae59c | Juan Quintela | void *opaque); |

132 | f58ae59c | Juan Quintela | |

133 | f58ae59c | Juan Quintela | typedef void SaveStateHandler(QEMUFile *f, void *opaque); |

134 | f58ae59c | Juan Quintela | typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id); |

135 | f58ae59c | Juan Quintela | |

136 | f58ae59c | Juan Quintela | The important functions for the device state format are the save_state |

137 | f58ae59c | Juan Quintela | and load_state. Notice that load_state receives a version_id |

138 | dda5336e | Stefan Weil | parameter to know what state format it is receiving. save_state doesn't |

139 | dda5336e | Stefan Weil | have a version_id parameter because it always uses the latest version. |

140 | f58ae59c | Juan Quintela | |

141 | f58ae59c | Juan Quintela | === VMState === |

142 | f58ae59c | Juan Quintela | |

143 | f58ae59c | Juan Quintela | The legacy way of saving/loading state of the device had the problem |

144 | dda5336e | Stefan Weil | that we have to maintain two functions in sync. If we did one change |

145 | dda5336e | Stefan Weil | in one of them and not in the other, we would get a failed migration. |

146 | f58ae59c | Juan Quintela | |

147 | f58ae59c | Juan Quintela | VMState changed the way that state is saved/loaded. Instead of using |

148 | f58ae59c | Juan Quintela | a function to save the state and another to load it, it was changed to |

149 | f58ae59c | Juan Quintela | a declarative way of what the state consisted of. Now VMState is able |

150 | f58ae59c | Juan Quintela | to interpret that definition to be able to load/save the state. As |

151 | f58ae59c | Juan Quintela | the state is declared only once, it can't go out of sync in the |

152 | f58ae59c | Juan Quintela | save/load functions. |

153 | f58ae59c | Juan Quintela | |

154 | f58ae59c | Juan Quintela | An example (from hw/pckbd.c) |

155 | f58ae59c | Juan Quintela | |

156 | f58ae59c | Juan Quintela | static const VMStateDescription vmstate_kbd = { |

157 | f58ae59c | Juan Quintela | .name = "pckbd", |

158 | f58ae59c | Juan Quintela | .version_id = 3, |

159 | f58ae59c | Juan Quintela | .minimum_version_id = 3, |

160 | f58ae59c | Juan Quintela | .minimum_version_id_old = 3, |

161 | f58ae59c | Juan Quintela | .fields = (VMStateField []) { |

162 | f58ae59c | Juan Quintela | VMSTATE_UINT8(write_cmd, KBDState), |

163 | f58ae59c | Juan Quintela | VMSTATE_UINT8(status, KBDState), |

164 | f58ae59c | Juan Quintela | VMSTATE_UINT8(mode, KBDState), |

165 | f58ae59c | Juan Quintela | VMSTATE_UINT8(pending, KBDState), |

166 | f58ae59c | Juan Quintela | VMSTATE_END_OF_LIST() |

167 | f58ae59c | Juan Quintela | } |

168 | f58ae59c | Juan Quintela | }; |

169 | f58ae59c | Juan Quintela | |

170 | f58ae59c | Juan Quintela | We are declaring the state with name "pckbd". |

171 | f58ae59c | Juan Quintela | The version_id is 3, and the fields are 4 uint8_t in a KBDState structure. |

172 | f58ae59c | Juan Quintela | We registered this with: |

173 | f58ae59c | Juan Quintela | |

174 | f58ae59c | Juan Quintela | vmstate_register(NULL, 0, &vmstate_kbd, s); |

175 | f58ae59c | Juan Quintela | |

176 | dda5336e | Stefan Weil | Note: talk about how vmstate <-> qdev interact, and what the instance ids mean. |

177 | f58ae59c | Juan Quintela | |

178 | f58ae59c | Juan Quintela | You can search for VMSTATE_* macros for lots of types used in QEMU in |

179 | f58ae59c | Juan Quintela | hw/hw.h. |

180 | f58ae59c | Juan Quintela | |

181 | f58ae59c | Juan Quintela | === More about versions === |

182 | f58ae59c | Juan Quintela | |

183 | f58ae59c | Juan Quintela | You can see that there are several version fields: |

184 | f58ae59c | Juan Quintela | |

185 | dda5336e | Stefan Weil | - version_id: the maximum version_id supported by VMState for that device. |

186 | f58ae59c | Juan Quintela | - minimum_version_id: the minimum version_id that VMState is able to understand |

187 | f58ae59c | Juan Quintela | for that device. |

188 | f58ae59c | Juan Quintela | - minimum_version_id_old: For devices that were not able to port to vmstate, we can |

189 | f58ae59c | Juan Quintela | assign a function that knows how to read this old state. |

190 | f58ae59c | Juan Quintela | |

191 | f58ae59c | Juan Quintela | So, VMState is able to read versions from minimum_version_id to |

192 | f58ae59c | Juan Quintela | version_id. And the function load_state_old() is able to load state |

193 | f58ae59c | Juan Quintela | from minimum_version_id_old to minimum_version_id. This function is |

194 | f58ae59c | Juan Quintela | deprecated and will be removed when no more users are left. |

195 | f58ae59c | Juan Quintela | |

196 | f58ae59c | Juan Quintela | === Massaging functions === |

197 | f58ae59c | Juan Quintela | |

198 | dda5336e | Stefan Weil | Sometimes, it is not enough to be able to save the state directly |

199 | f58ae59c | Juan Quintela | from one structure, we need to fill the correct values there. One |

200 | f58ae59c | Juan Quintela | example is when we are using kvm. Before saving the cpu state, we |

201 | f58ae59c | Juan Quintela | need to ask kvm to copy to QEMU the state that it is using. And the |

202 | f58ae59c | Juan Quintela | opposite when we are loading the state, we need a way to tell kvm to |

203 | f58ae59c | Juan Quintela | load the state for the cpu that we have just loaded from the QEMUFile. |

204 | f58ae59c | Juan Quintela | |

205 | f58ae59c | Juan Quintela | The functions to do that are inside a vmstate definition, and are called: |

206 | f58ae59c | Juan Quintela | |

207 | f58ae59c | Juan Quintela | - int (*pre_load)(void *opaque); |

208 | f58ae59c | Juan Quintela | |

209 | f58ae59c | Juan Quintela | This function is called before we load the state of one device. |

210 | f58ae59c | Juan Quintela | |

211 | f58ae59c | Juan Quintela | - int (*post_load)(void *opaque, int version_id); |

212 | f58ae59c | Juan Quintela | |

213 | f58ae59c | Juan Quintela | This function is called after we load the state of one device. |

214 | f58ae59c | Juan Quintela | |

215 | f58ae59c | Juan Quintela | - void (*pre_save)(void *opaque); |

216 | f58ae59c | Juan Quintela | |

217 | f58ae59c | Juan Quintela | This function is called before we save the state of one device. |

218 | f58ae59c | Juan Quintela | |

219 | f58ae59c | Juan Quintela | Example: You can look at hpet.c, which uses the three functions to |

220 | f58ae59c | Juan Quintela | massage the state that is transferred. |

221 | f58ae59c | Juan Quintela | |

222 | f58ae59c | Juan Quintela | === Subsections === |

223 | f58ae59c | Juan Quintela | |

224 | f58ae59c | Juan Quintela | The use of version_id allows us to migrate from older versions |

225 | f58ae59c | Juan Quintela | to newer versions of a device. But not the other way around. This |

226 | f58ae59c | Juan Quintela | makes it very complicated to fix bugs in stable branches. If we need to |

227 | f58ae59c | Juan Quintela | add anything to the state to fix a bug, we have to disable migration |

228 | f58ae59c | Juan Quintela | to older versions that don't have that bug-fix (i.e. a new field). |

229 | f58ae59c | Juan Quintela | |

230 | dda5336e | Stefan Weil | But sometimes, that bug-fix is only needed sometimes, not always. For |

231 | f58ae59c | Juan Quintela | instance, if the device is in the middle of a DMA operation, it is |

232 | f58ae59c | Juan Quintela | using a specific functionality, .... |

233 | f58ae59c | Juan Quintela | |

234 | f58ae59c | Juan Quintela | It is impossible to create a way to make migration from any version to |

235 | dda5336e | Stefan Weil | any other version to work. But we can do better than only allowing |

236 | f58ae59c | Juan Quintela | migration from older versions to newer ones. For those fields that are |

237 | dda5336e | Stefan Weil | only needed sometimes, we add the idea of subsections. A subsection |

238 | f58ae59c | Juan Quintela | is "like" a device vmstate, but with a particularity, it has a Boolean |

239 | f58ae59c | Juan Quintela | function that tells whether those values need to be sent or not. If |

240 | f58ae59c | Juan Quintela | this function returns false, the subsection is not sent. |

241 | f58ae59c | Juan Quintela | |

242 | f58ae59c | Juan Quintela | On the receiving side, if we find a subsection for a device that we |

243 | f58ae59c | Juan Quintela | don't understand, we just fail the migration. If we understand all |

244 | f58ae59c | Juan Quintela | the subsections, then we load the state with success. |

245 | f58ae59c | Juan Quintela | |

246 | f58ae59c | Juan Quintela | One important note is that the post_load() function is called "after" |

247 | f58ae59c | Juan Quintela | loading all subsections, because a newer subsection could change the same |

248 | f58ae59c | Juan Quintela | value that it uses. |

249 | f58ae59c | Juan Quintela | |

250 | f58ae59c | Juan Quintela | Example: |

251 | f58ae59c | Juan Quintela | |

252 | f58ae59c | Juan Quintela | static bool ide_drive_pio_state_needed(void *opaque) |

253 | f58ae59c | Juan Quintela | { |

254 | f58ae59c | Juan Quintela | IDEState *s = opaque; |

255 | f58ae59c | Juan Quintela | |

256 | f58ae59c | Juan Quintela | return (s->status & DRQ_STAT) != 0; |

257 | f58ae59c | Juan Quintela | } |

258 | f58ae59c | Juan Quintela | |

259 | f58ae59c | Juan Quintela | const VMStateDescription vmstate_ide_drive_pio_state = { |

260 | f58ae59c | Juan Quintela | .name = "ide_drive/pio_state", |

261 | f58ae59c | Juan Quintela | .version_id = 1, |

262 | f58ae59c | Juan Quintela | .minimum_version_id = 1, |

263 | f58ae59c | Juan Quintela | .minimum_version_id_old = 1, |

264 | f58ae59c | Juan Quintela | .pre_save = ide_drive_pio_pre_save, |

265 | f58ae59c | Juan Quintela | .post_load = ide_drive_pio_post_load, |

266 | f58ae59c | Juan Quintela | .fields = (VMStateField []) { |

267 | f58ae59c | Juan Quintela | VMSTATE_INT32(req_nb_sectors, IDEState), |

268 | f58ae59c | Juan Quintela | VMSTATE_VARRAY_INT32(io_buffer, IDEState, io_buffer_total_len, 1, |

269 | dda5336e | Stefan Weil | vmstate_info_uint8, uint8_t), |

270 | f58ae59c | Juan Quintela | VMSTATE_INT32(cur_io_buffer_offset, IDEState), |

271 | f58ae59c | Juan Quintela | VMSTATE_INT32(cur_io_buffer_len, IDEState), |

272 | f58ae59c | Juan Quintela | VMSTATE_UINT8(end_transfer_fn_idx, IDEState), |

273 | f58ae59c | Juan Quintela | VMSTATE_INT32(elementary_transfer_size, IDEState), |

274 | f58ae59c | Juan Quintela | VMSTATE_INT32(packet_transfer_size, IDEState), |

275 | f58ae59c | Juan Quintela | VMSTATE_END_OF_LIST() |

276 | f58ae59c | Juan Quintela | } |

277 | f58ae59c | Juan Quintela | }; |

278 | f58ae59c | Juan Quintela | |

279 | f58ae59c | Juan Quintela | const VMStateDescription vmstate_ide_drive = { |

280 | f58ae59c | Juan Quintela | .name = "ide_drive", |

281 | f58ae59c | Juan Quintela | .version_id = 3, |

282 | f58ae59c | Juan Quintela | .minimum_version_id = 0, |

283 | f58ae59c | Juan Quintela | .minimum_version_id_old = 0, |

284 | f58ae59c | Juan Quintela | .post_load = ide_drive_post_load, |

285 | f58ae59c | Juan Quintela | .fields = (VMStateField []) { |

286 | f58ae59c | Juan Quintela | .... several fields .... |

287 | f58ae59c | Juan Quintela | VMSTATE_END_OF_LIST() |

288 | f58ae59c | Juan Quintela | }, |

289 | f58ae59c | Juan Quintela | .subsections = (VMStateSubsection []) { |

290 | f58ae59c | Juan Quintela | { |

291 | f58ae59c | Juan Quintela | .vmsd = &vmstate_ide_drive_pio_state, |

292 | f58ae59c | Juan Quintela | .needed = ide_drive_pio_state_needed, |

293 | f58ae59c | Juan Quintela | }, { |

294 | f58ae59c | Juan Quintela | /* empty */ |

295 | f58ae59c | Juan Quintela | } |

296 | f58ae59c | Juan Quintela | } |

297 | f58ae59c | Juan Quintela | }; |

298 | f58ae59c | Juan Quintela | |

299 | f58ae59c | Juan Quintela | Here we have a subsection for the pio state. We only need to |

300 | f58ae59c | Juan Quintela | save/send this state when we are in the middle of a pio operation |

301 | f58ae59c | Juan Quintela | (that is what ide_drive_pio_state_needed() checks). If DRQ_STAT is |

302 | f58ae59c | Juan Quintela | not enabled, the values in those fields are garbage and don't need to |

303 | f58ae59c | Juan Quintela | be sent. |