24 |
24 |
This program and set of classes implement a watchdog to restart
|
25 |
25 |
virtual machines in a Ganeti cluster that have crashed or been killed
|
26 |
26 |
by a node reboot. Run from cron or similar.
|
27 |
|
"""
|
28 |
|
|
29 |
27 |
|
30 |
|
LOGFILE = '/var/log/ganeti/watcher.log'
|
31 |
|
MAXTRIES = 5
|
32 |
|
BAD_STATES = ['stopped']
|
33 |
|
HELPLESS_STATES = ['(node down)']
|
34 |
|
NOTICE = 'NOTICE'
|
35 |
|
ERROR = 'ERROR'
|
|
28 |
"""
|
36 |
29 |
|
37 |
30 |
import os
|
38 |
31 |
import sys
|
|
32 |
import re
|
39 |
33 |
import time
|
40 |
34 |
import fcntl
|
41 |
35 |
import errno
|
|
36 |
import simplejson
|
42 |
37 |
from optparse import OptionParser
|
43 |
38 |
|
44 |
|
|
45 |
39 |
from ganeti import utils
|
46 |
40 |
from ganeti import constants
|
47 |
41 |
from ganeti import ssconf
|
48 |
42 |
from ganeti import errors
|
49 |
43 |
|
50 |
44 |
|
|
45 |
MAXTRIES = 5
|
|
46 |
BAD_STATES = ['stopped']
|
|
47 |
HELPLESS_STATES = ['(node down)']
|
|
48 |
NOTICE = 'NOTICE'
|
|
49 |
ERROR = 'ERROR'
|
|
50 |
|
|
51 |
|
51 |
52 |
class Error(Exception):
|
52 |
53 |
"""Generic custom error class."""
|
53 |
54 |
|
... | ... | |
88 |
89 |
return res
|
89 |
90 |
|
90 |
91 |
|
91 |
|
class RestarterState(object):
|
|
92 |
class WatcherState(object):
|
92 |
93 |
"""Interface to a state file recording restart attempts.
|
93 |
94 |
|
94 |
|
Methods:
|
95 |
|
Open(): open, lock, read and parse the file.
|
96 |
|
Raises StandardError on lock contention.
|
97 |
|
|
98 |
|
NumberOfAttempts(name): returns the number of times in succession
|
99 |
|
a restart has been attempted of the named instance.
|
100 |
|
|
101 |
|
RecordAttempt(name, when): records one restart attempt of name at
|
102 |
|
time in when.
|
103 |
|
|
104 |
|
Remove(name): remove record given by name, if exists.
|
105 |
|
|
106 |
|
Save(name): saves all records to file, releases lock and closes file.
|
107 |
|
|
108 |
95 |
"""
|
109 |
96 |
def __init__(self):
|
|
97 |
"""Open, lock, read and parse the file.
|
|
98 |
|
|
99 |
Raises StandardError on lock contention.
|
|
100 |
|
|
101 |
"""
|
110 |
102 |
# The two-step dance below is necessary to allow both opening existing
|
111 |
103 |
# file read/write and creating if not existing. Vanilla open will truncate
|
112 |
104 |
# an existing file -or- allow creating if not existing.
|
... | ... | |
121 |
113 |
raise
|
122 |
114 |
|
123 |
115 |
self.statefile = f
|
124 |
|
self.inst_map = {}
|
125 |
116 |
|
126 |
|
for line in f:
|
127 |
|
name, when, count = line.rstrip().split(':')
|
|
117 |
try:
|
|
118 |
self.data = simplejson.load(self.statefile)
|
|
119 |
except Exception, msg:
|
|
120 |
# Ignore errors while loading the file and treat it as empty
|
|
121 |
self.data = {}
|
|
122 |
sys.stderr.write("Empty or invalid state file. "
|
|
123 |
"Using defaults. Error message: %s\n" % msg)
|
|
124 |
|
|
125 |
if "instance" not in self.data:
|
|
126 |
self.data["instance"] = {}
|
|
127 |
if "node" not in self.data:
|
|
128 |
self.data["node"] = {}
|
|
129 |
|
|
130 |
def __del__(self):
|
|
131 |
"""Called on destruction.
|
|
132 |
|
|
133 |
"""
|
|
134 |
if self.statefile:
|
|
135 |
self._Close()
|
|
136 |
|
|
137 |
def _Close(self):
|
|
138 |
"""Unlock configuration file and close it.
|
|
139 |
|
|
140 |
"""
|
|
141 |
assert self.statefile
|
|
142 |
|
|
143 |
fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)
|
|
144 |
|
|
145 |
self.statefile.close()
|
|
146 |
self.statefile = None
|
|
147 |
|
|
148 |
def GetNodeBootID(self, name):
|
|
149 |
"""Returns the last boot ID of a node or None.
|
128 |
150 |
|
129 |
|
when = int(when)
|
130 |
|
count = int(count)
|
|
151 |
"""
|
|
152 |
ndata = self.data["node"]
|
|
153 |
|
|
154 |
if name in ndata and "bootid" in ndata[name]:
|
|
155 |
return ndata[name]["bootid"]
|
|
156 |
return None
|
|
157 |
|
|
158 |
def SetNodeBootID(self, name, bootid):
|
|
159 |
"""Sets the boot ID of a node.
|
|
160 |
|
|
161 |
"""
|
|
162 |
assert bootid
|
131 |
163 |
|
132 |
|
self.inst_map[name] = (when, count)
|
|
164 |
ndata = self.data["node"]
|
133 |
165 |
|
134 |
|
def NumberOfAttempts(self, instance):
|
|
166 |
if name not in ndata:
|
|
167 |
ndata[name] = {}
|
|
168 |
|
|
169 |
ndata[name]["bootid"] = bootid
|
|
170 |
|
|
171 |
def NumberOfRestartAttempts(self, instance):
|
135 |
172 |
"""Returns number of previous restart attempts.
|
136 |
173 |
|
137 |
174 |
Args:
|
138 |
175 |
instance - the instance to look up.
|
139 |
176 |
|
140 |
177 |
"""
|
141 |
|
assert self.statefile
|
|
178 |
idata = self.data["instance"]
|
142 |
179 |
|
143 |
|
if instance.name in self.inst_map:
|
144 |
|
return self.inst_map[instance.name][1]
|
|
180 |
if instance.name in idata:
|
|
181 |
return idata[instance.name]["restart_count"]
|
145 |
182 |
|
146 |
183 |
return 0
|
147 |
184 |
|
148 |
|
def RecordAttempt(self, instance):
|
|
185 |
def RecordRestartAttempt(self, instance):
|
149 |
186 |
"""Record a restart attempt.
|
150 |
187 |
|
151 |
188 |
Args:
|
152 |
189 |
instance - the instance being restarted
|
153 |
190 |
|
154 |
191 |
"""
|
155 |
|
assert self.statefile
|
|
192 |
idata = self.data["instance"]
|
156 |
193 |
|
157 |
|
when = time.time()
|
|
194 |
if instance.name not in idata:
|
|
195 |
inst = idata[instance.name] = {}
|
|
196 |
else:
|
|
197 |
inst = idata[instance.name]
|
158 |
198 |
|
159 |
|
self.inst_map[instance.name] = (when, 1 + self.NumberOfAttempts(instance))
|
|
199 |
inst["restart_when"] = time.time()
|
|
200 |
inst["restart_count"] = idata.get("restart_count", 0) + 1
|
160 |
201 |
|
161 |
|
def Remove(self, instance):
|
|
202 |
def RemoveInstance(self, instance):
|
162 |
203 |
"""Update state to reflect that a machine is running, i.e. remove record.
|
163 |
204 |
|
164 |
205 |
Args:
|
... | ... | |
167 |
208 |
This method removes the record for a named instance.
|
168 |
209 |
|
169 |
210 |
"""
|
170 |
|
assert self.statefile
|
|
211 |
idata = self.data["instance"]
|
171 |
212 |
|
172 |
|
if instance.name in self.inst_map:
|
173 |
|
del self.inst_map[instance.name]
|
|
213 |
if instance.name in idata:
|
|
214 |
del idata[instance.name]
|
174 |
215 |
|
175 |
216 |
def Save(self):
|
176 |
|
"""Save records to file, then unlock and close file.
|
|
217 |
"""Save state to file, then unlock and close it.
|
177 |
218 |
|
178 |
219 |
"""
|
179 |
220 |
assert self.statefile
|
... | ... | |
181 |
222 |
self.statefile.seek(0)
|
182 |
223 |
self.statefile.truncate()
|
183 |
224 |
|
184 |
|
for name in self.inst_map:
|
185 |
|
print >> self.statefile, "%s:%d:%d" % ((name,) + self.inst_map[name])
|
|
225 |
simplejson.dump(self.data, self.statefile)
|
186 |
226 |
|
187 |
|
fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)
|
188 |
|
|
189 |
|
self.statefile.close()
|
190 |
|
self.statefile = None
|
|
227 |
self._Close()
|
191 |
228 |
|
192 |
229 |
|
193 |
230 |
class Instance(object):
|
... | ... | |
197 |
234 |
Restart(): issue a command to restart the represented machine.
|
198 |
235 |
|
199 |
236 |
"""
|
200 |
|
def __init__(self, name, state):
|
|
237 |
def __init__(self, name, state, autostart):
|
201 |
238 |
self.name = name
|
202 |
239 |
self.state = state
|
|
240 |
self.autostart = autostart
|
203 |
241 |
|
204 |
242 |
def Restart(self):
|
205 |
243 |
"""Encapsulates the start of an instance.
|
206 |
244 |
|
207 |
|
This is currently done using the command line interface and not
|
208 |
|
the Ganeti modules.
|
209 |
|
|
210 |
245 |
"""
|
211 |
246 |
DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])
|
212 |
247 |
|
|
248 |
def ActivateDisks(self):
|
|
249 |
"""Encapsulates the activation of all disks of an instance.
|
|
250 |
|
|
251 |
"""
|
|
252 |
DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15', self.name])
|
|
253 |
|
213 |
254 |
|
214 |
|
class InstanceList(object):
|
215 |
|
"""The set of Virtual Machine instances on a cluster.
|
|
255 |
def _RunListCmd(cmd):
|
|
256 |
"""Runs a command and parses its output into lists.
|
216 |
257 |
|
217 |
258 |
"""
|
218 |
|
cmd = ['gnt-instance', 'list', '--lock-retries=15',
|
219 |
|
'-o', 'name,admin_state,oper_state', '--no-headers', '--separator=:']
|
|
259 |
for line in DoCmd(cmd).stdout.splitlines():
|
|
260 |
yield line.split(':')
|
220 |
261 |
|
221 |
|
def __init__(self):
|
222 |
|
res = DoCmd(self.cmd)
|
223 |
262 |
|
224 |
|
lines = res.stdout.splitlines()
|
|
263 |
def GetInstanceList(with_secondaries=None):
|
|
264 |
"""Get a list of instances on this cluster.
|
|
265 |
|
|
266 |
"""
|
|
267 |
cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
|
|
268 |
'--separator=:']
|
|
269 |
|
|
270 |
fields = 'name,oper_state,admin_state'
|
225 |
271 |
|
226 |
|
self.instances = []
|
227 |
|
for line in lines:
|
228 |
|
fields = [fld.strip() for fld in line.split(':')]
|
|
272 |
if with_secondaries is not None:
|
|
273 |
fields += ',snodes'
|
229 |
274 |
|
230 |
|
if len(fields) != 3:
|
|
275 |
cmd.append('-o')
|
|
276 |
cmd.append(fields)
|
|
277 |
|
|
278 |
instances = []
|
|
279 |
for fields in _RunListCmd(cmd):
|
|
280 |
if with_secondaries is not None:
|
|
281 |
(name, status, autostart, snodes) = fields
|
|
282 |
|
|
283 |
if snodes == "-":
|
231 |
284 |
continue
|
232 |
|
if fields[1] == "no": #no autostart, we don't care about this instance
|
|
285 |
|
|
286 |
for node in with_secondaries:
|
|
287 |
if node in snodes.split(','):
|
|
288 |
break
|
|
289 |
else:
|
233 |
290 |
continue
|
234 |
|
name, status = fields[0], fields[2]
|
235 |
291 |
|
236 |
|
self.instances.append(Instance(name, status))
|
|
292 |
else:
|
|
293 |
(name, status, autostart) = fields
|
|
294 |
|
|
295 |
instances.append(Instance(name, status, autostart != "no"))
|
237 |
296 |
|
238 |
|
def __iter__(self):
|
239 |
|
return self.instances.__iter__()
|
|
297 |
return instances
|
|
298 |
|
|
299 |
|
|
300 |
def GetNodeBootIDs():
|
|
301 |
"""Get a dict mapping nodes to boot IDs.
|
|
302 |
|
|
303 |
"""
|
|
304 |
cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
|
|
305 |
'--separator=:', '-o', 'name,bootid']
|
|
306 |
|
|
307 |
ids = {}
|
|
308 |
for fields in _RunListCmd(cmd):
|
|
309 |
(name, bootid) = fields
|
|
310 |
ids[name] = bootid
|
|
311 |
|
|
312 |
return ids
|
240 |
313 |
|
241 |
314 |
|
242 |
315 |
class Message(object):
|
... | ... | |
252 |
325 |
return self.level + ' ' + time.ctime(self.when) + '\n' + Indent(self.msg)
|
253 |
326 |
|
254 |
327 |
|
255 |
|
class Restarter(object):
|
|
328 |
class Watcher(object):
|
256 |
329 |
"""Encapsulate the logic for restarting erronously halted virtual machines.
|
257 |
330 |
|
258 |
331 |
The calling program should periodically instantiate me and call Run().
|
... | ... | |
265 |
338 |
master = sstore.GetMasterNode()
|
266 |
339 |
if master != utils.HostInfo().name:
|
267 |
340 |
raise NotMasterError("This is not the master node")
|
268 |
|
self.instances = InstanceList()
|
|
341 |
self.instances = GetInstanceList()
|
|
342 |
self.bootids = GetNodeBootIDs()
|
269 |
343 |
self.messages = []
|
270 |
344 |
|
271 |
345 |
def Run(self):
|
272 |
|
"""Make a pass over the list of instances, restarting downed ones.
|
|
346 |
notepad = WatcherState()
|
|
347 |
self.CheckInstances(notepad)
|
|
348 |
self.CheckDisks(notepad)
|
|
349 |
notepad.Save()
|
|
350 |
|
|
351 |
def CheckDisks(self, notepad):
|
|
352 |
"""Check all nodes for restarted ones.
|
273 |
353 |
|
274 |
354 |
"""
|
275 |
|
notepad = RestarterState()
|
|
355 |
check_nodes = []
|
|
356 |
for name, id in self.bootids.iteritems():
|
|
357 |
old = notepad.GetNodeBootID(name)
|
|
358 |
if old != id:
|
|
359 |
# Node's boot ID has changed, proably through a reboot.
|
|
360 |
check_nodes.append(name)
|
|
361 |
|
|
362 |
if check_nodes:
|
|
363 |
# Activate disks for all instances with any of the checked nodes as a
|
|
364 |
# secondary node.
|
|
365 |
for instance in GetInstanceList(with_secondaries=check_nodes):
|
|
366 |
try:
|
|
367 |
self.messages.append(Message(NOTICE,
|
|
368 |
"Activating disks for %s." %
|
|
369 |
instance.name))
|
|
370 |
instance.ActivateDisks()
|
|
371 |
except Error, x:
|
|
372 |
self.messages.append(Message(ERROR, str(x)))
|
|
373 |
|
|
374 |
# Keep changed boot IDs
|
|
375 |
for name in check_nodes:
|
|
376 |
notepad.SetNodeBootID(name, self.bootids[name])
|
276 |
377 |
|
|
378 |
def CheckInstances(self, notepad):
|
|
379 |
"""Make a pass over the list of instances, restarting downed ones.
|
|
380 |
|
|
381 |
"""
|
277 |
382 |
for instance in self.instances:
|
|
383 |
# Don't care about manually stopped instances
|
|
384 |
if not instance.autostart:
|
|
385 |
continue
|
|
386 |
|
278 |
387 |
if instance.state in BAD_STATES:
|
279 |
|
n = notepad.NumberOfAttempts(instance)
|
|
388 |
n = notepad.NumberOfRestartAttempts(instance)
|
280 |
389 |
|
281 |
390 |
if n > MAXTRIES:
|
282 |
391 |
# stay quiet.
|
... | ... | |
284 |
393 |
elif n < MAXTRIES:
|
285 |
394 |
last = " (Attempt #%d)" % (n + 1)
|
286 |
395 |
else:
|
287 |
|
notepad.RecordAttempt(instance)
|
|
396 |
notepad.RecordRestartAttempt(instance)
|
288 |
397 |
self.messages.append(Message(ERROR, "Could not restart %s for %d"
|
289 |
398 |
" times, giving up..." %
|
290 |
399 |
(instance.name, MAXTRIES)))
|
... | ... | |
297 |
406 |
except Error, x:
|
298 |
407 |
self.messages.append(Message(ERROR, str(x)))
|
299 |
408 |
|
300 |
|
notepad.RecordAttempt(instance)
|
|
409 |
notepad.RecordRestartAttempt(instance)
|
301 |
410 |
elif instance.state in HELPLESS_STATES:
|
302 |
|
if notepad.NumberOfAttempts(instance):
|
303 |
|
notepad.Remove(instance)
|
|
411 |
if notepad.NumberOfRestartAttempts(instance):
|
|
412 |
notepad.RemoveInstance(instance)
|
304 |
413 |
else:
|
305 |
|
if notepad.NumberOfAttempts(instance):
|
306 |
|
notepad.Remove(instance)
|
|
414 |
if notepad.NumberOfRestartAttempts(instance):
|
|
415 |
notepad.RemoveInstance(instance)
|
307 |
416 |
msg = Message(NOTICE,
|
308 |
417 |
"Restart of %s succeeded." % instance.name)
|
309 |
418 |
self.messages.append(msg)
|
310 |
419 |
|
311 |
|
notepad.Save()
|
312 |
|
|
313 |
420 |
def WriteReport(self, logfile):
|
314 |
421 |
"""Log all messages to file.
|
315 |
422 |
|
... | ... | |
347 |
454 |
options, args = ParseOptions()
|
348 |
455 |
|
349 |
456 |
if not options.debug:
|
350 |
|
sys.stderr = sys.stdout = open(LOGFILE, 'a')
|
|
457 |
sys.stderr = sys.stdout = open(constants.LOG_WATCHER, 'a')
|
351 |
458 |
|
352 |
459 |
try:
|
353 |
|
restarter = Restarter()
|
354 |
|
restarter.Run()
|
355 |
|
restarter.WriteReport(sys.stdout)
|
|
460 |
watcher = Watcher()
|
|
461 |
watcher.Run()
|
|
462 |
watcher.WriteReport(sys.stdout)
|
356 |
463 |
except NotMasterError:
|
357 |
464 |
if options.debug:
|
358 |
465 |
sys.stderr.write("Not master, exiting.\n")
|
... | ... | |
363 |
470 |
except Error, err:
|
364 |
471 |
print err
|
365 |
472 |
|
|
473 |
|
366 |
474 |
if __name__ == '__main__':
|
367 |
475 |
main()
|