Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ f4bc1f2c

History | View | Annotate | Download (11.9 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 a8083063 Iustin Pop
# Copyright (C) 2006, 2007 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erronously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 5a3103e9 Michael Hanselmann
import re
33 a8083063 Iustin Pop
import time
34 a8083063 Iustin Pop
import fcntl
35 a8083063 Iustin Pop
import errno
36 5a3103e9 Michael Hanselmann
import simplejson
37 a8083063 Iustin Pop
from optparse import OptionParser
38 a8083063 Iustin Pop
39 a8083063 Iustin Pop
from ganeti import utils
40 a8083063 Iustin Pop
from ganeti import constants
41 38242904 Iustin Pop
from ganeti import ssconf
42 89e1fc26 Iustin Pop
from ganeti import errors
43 a8083063 Iustin Pop
44 a8083063 Iustin Pop
45 5a3103e9 Michael Hanselmann
# Number of automatic restart attempts made for an instance before the
# watcher gives up on it.
MAXTRIES = 5
# Operational states for which a restart is attempted.
BAD_STATES = ['stopped']
# Operational states for which no restart is attempted; stored restart
# bookkeeping for such instances is simply dropped.
HELPLESS_STATES = ['(node down)']
# Severity labels for report messages.
NOTICE = 'NOTICE'
ERROR = 'ERROR'
# Keys used inside the per-instance and per-node records of the state file.
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
53 5a3103e9 Michael Hanselmann
54 5a3103e9 Michael Hanselmann
55 a8083063 Iustin Pop
class Error(Exception):
  """Generic custom error class.

  Base class for the errors raised by this script.

  """
57 38242904 Iustin Pop
58 38242904 Iustin Pop
59 38242904 Iustin Pop
class NotMasterError(Error):
  """Exception raised when this host is not the master."""
61 a8083063 Iustin Pop
62 a8083063 Iustin Pop
63 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Indent a piece of text with a given prefix before each line.

  Args:
    s: The string to indent
    prefix: The string to prepend to each line.

  Returns:
    The text with the prefix in front of every line, terminated by exactly
    one newline.  Empty input yields the prefix followed by a newline.

  """
  # splitlines() returns an empty list for "", but one (empty) prefixed line
  # must still be produced in that case.
  lines = s.splitlines() or [""]
  return "".join(["%s%s\n" % (prefix, line) for line in lines])
72 a8083063 Iustin Pop
73 a8083063 Iustin Pop
74 a8083063 Iustin Pop
def DoCmd(cmd):
  """Run a command and fail loudly if it does not succeed.

  Args:
    cmd: the command to run, handed unchanged to utils.RunCmd.

  Returns:
    The result object produced by utils.RunCmd.

  Raises:
    Error: if the result is flagged as failed.  The message carries the
      command, the failure reason and the indented stdout and stderr.

  """
  result = utils.RunCmd(cmd)
  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % details)
93 a8083063 Iustin Pop
94 a8083063 Iustin Pop
95 5a3103e9 Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a JSON document with two top-level sections: "instance"
  (restart bookkeeping keyed by instance name) and "node" (last recorded
  boot ID keyed by node name).  The file is locked exclusively from
  construction until Save() or destruction.

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    An empty, unreadable or malformed file is treated as empty state; a
    note is written to stderr in that case.

    Raises StandardError on lock contention.

    """
    # Set first so that __del__ stays safe if opening or locking fails.
    self.statefile = None

    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
      # sys.exc_info() is used instead of binding the exception in the
      # except clause so the statement parses on old and new interpreters.
      err = sys.exc_info()[1]
      # Do not leak the open file when the lock cannot be taken.
      f.close()
      if err.errno == errno.EAGAIN:
        raise StandardError("State file already locked")
      raise

    self.statefile = f

    try:
      data = simplejson.load(f)
      if not isinstance(data, dict):
        raise ValueError("top-level value is not a dictionary")
    except Exception:
      # Ignore errors while loading the file and treat it as empty
      data = {}
      sys.stderr.write("Empty or invalid state file."
                       " Using defaults. Error message: %s\n" %
                       (sys.exc_info()[1],))

    # Both sections must exist and be dictionaries for the accessors below.
    for section in ("instance", "node"):
      if not isinstance(data.get(section), dict):
        data[section] = {}

    self.data = data

  def __del__(self):
    """Called on destruction.

    Releases the lock and closes the file if that has not happened yet.

    """
    # getattr: the attribute is missing if __init__ never ran to the point
    # of setting it.
    if getattr(self, "statefile", None):
      self._Close()

  def _Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    f = self.statefile
    self.statefile = None
    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_UN)
    finally:
      f.close()

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    Args:
      name - the node name to look up.

    """
    record = self.data["node"].get(name)
    if record is None:
      return None
    return record.get(KEY_BOOT_ID)

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    Args:
      name - the node name.
      bootid - the boot ID to remember; must not be empty.

    """
    assert bootid

    self.data["node"].setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns 0 for an instance without a record, or whose record carries no
    restart count.

    """
    record = self.data["instance"].get(instance.name)
    if record is None:
      return 0
    return record.get(KEY_RESTART_COUNT, 0)

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the attempt counter.

    Args:
      instance - the instance being restarted

    """
    record = self.data["instance"].setdefault(instance.name, {})
    record[KEY_RESTART_WHEN] = time.time()
    record[KEY_RESTART_COUNT] = record.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; a missing record
    is not an error.

    """
    self.data["instance"].pop(instance.name, None)

  def Save(self):
    """Save state to file, then unlock and close it.

    The file is unlocked and closed even if writing fails.

    """
    assert self.statefile

    try:
      self.statefile.seek(0)
      self.statefile.truncate()
      simplejson.dump(self.data, self.statefile)
      # Push the data out while the lock is still held: _Close() unlocks
      # before closing, so data left in the buffer would reach the file
      # only after the lock has been released.
      self.statefile.flush()
    finally:
      self._Close()
231 a8083063 Iustin Pop
232 a8083063 Iustin Pop
233 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Attributes:
    name: the instance name, passed to the gnt-instance commands.
    state: the state string reported for the instance.
    autostart: whether the watcher may restart the instance.

  Methods:
    Restart(): issue a command to restart the represented machine.
    ActivateDisks(): issue a command to activate the machine's disks.

  """
  def __init__(self, name, state, autostart):
    self.name = name
    self.state = state
    self.autostart = autostart

  def Restart(self):
    """Encapsulates the start of an instance.

    Raises Error (from DoCmd) if the command fails.

    """
    DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    Raises Error (from DoCmd) if the command fails.

    """
    DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15', self.name])
256 5a3103e9 Michael Hanselmann
257 a8083063 Iustin Pop
258 5a3103e9 Michael Hanselmann
def _RunListCmd(cmd):
  """Runs a command and parses its output into lists.

  This is a generator: for every line of the command's standard output it
  yields the list of fields obtained by splitting the line on ':'.  The
  command itself is only run once iteration starts.

  Args:
    cmd: the command to run, passed on to DoCmd.

  """
  output = DoCmd(cmd).stdout
  for line in output.splitlines():
    yield line.split(':')
264 a8083063 Iustin Pop
265 a8083063 Iustin Pop
266 5a3103e9 Michael Hanselmann
def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  Args:
    with_secondaries: optional list of node names.  When given, only
      instances having at least one of these nodes among their secondary
      nodes are returned; instances without secondary nodes are skipped.

  Returns:
    A list of Instance objects.  An instance is marked for autostart
    unless its admin_state field reads "no".

  """
  filter_by_node = with_secondaries is not None

  columns = 'name,oper_state,admin_state'
  if filter_by_node:
    columns += ',snodes'

  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', columns]

  instances = []
  for row in _RunListCmd(cmd):
    if filter_by_node:
      (name, status, autostart, snodes) = row

      # "-" is listed when the instance has no secondary nodes.
      if snodes == "-":
        continue

      secondaries = snodes.split(',')
      wanted = [node for node in with_secondaries if node in secondaries]
      if not wanted:
        continue
    else:
      (name, status, autostart) = row

    instances.append(Instance(name, status, autostart != "no"))

  return instances
301 5a3103e9 Michael Hanselmann
302 5a3103e9 Michael Hanselmann
303 5a3103e9 Michael Hanselmann
def GetNodeBootIDs():
  """Get a dict mapping nodes to boot IDs.

  Returns:
    A dictionary with node names as keys and the boot ID string listed by
    gnt-node for each of them as values.

  """
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', 'name,bootid']

  ids = {}
  for (name, bootid) in _RunListCmd(cmd):
    ids[name] = bootid

  return ids
316 a8083063 Iustin Pop
317 a8083063 Iustin Pop
318 a8083063 Iustin Pop
class Message(object):
  """Encapsulation of a notice or error message.

  Attributes:
    level: the severity label of the message.
    msg: the message text.
    when: creation time, as returned by time.time().

  """
  def __init__(self, level, msg):
    self.level = level
    self.msg = msg
    self.when = time.time()

  def __str__(self):
    """Format as a header line (level and timestamp) plus indented text."""
    header = self.level + ' ' + time.ctime(self.when)
    return header + '\n' + Indent(self.msg)
329 a8083063 Iustin Pop
330 a8083063 Iustin Pop
331 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Collect the instance list and node boot IDs.

    Raises NotMasterError if this host is not the master node.

    """
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    self.messages = []

  def Run(self):
    """Run all checks and store the resulting state.

    """
    notepad = WatcherState()
    try:
      self.CheckInstances(notepad)
      self.CheckDisks(notepad)
    finally:
      # Always write back what was recorded so far (and release the state
      # file), even if one of the checks raised.
      notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the recorded one, the disks
    of all instances using it as a secondary node are activated, and the
    new boot ID is recorded.

    Args:
      notepad: the WatcherState holding the recorded boot IDs.

    """
    rebooted = []
    for name, bootid in self.bootids.items():
      if not bootid:
        # No usable boot ID was reported for this node.  There is nothing to
        # compare, and SetNodeBootID() refuses empty IDs.
        continue
      if notepad.GetNodeBootID(name) != bootid:
        # Node's boot ID has changed, probably through a reboot.
        rebooted.append(name)

    if not rebooted:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for instance in GetInstanceList(with_secondaries=rebooted):
      self.messages.append(Message(NOTICE, ("Activating disks for %s." %
                                            instance.name)))
      try:
        instance.ActivateDisks()
      except Error:
        self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

    # Keep changed boot IDs
    for name in rebooted:
      notepad.SetNodeBootID(name, self.bootids[name])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Args:
      notepad: the WatcherState holding the restart bookkeeping.

    """
    for instance in self.instances:
      # Don't care about manually stopped instances
      if not instance.autostart:
        continue

      if instance.state in BAD_STATES:
        self._HandleDownInstance(instance, notepad)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      elif notepad.NumberOfRestartAttempts(instance):
        # The instance is up again after earlier restart attempts.
        notepad.RemoveInstance(instance)
        msg = Message(NOTICE, "Restart of %s succeeded." % instance.name)
        self.messages.append(msg)

  def _HandleDownInstance(self, instance, notepad):
    """Try to restart one down instance, honouring the MAXTRIES limit."""
    n = notepad.NumberOfRestartAttempts(instance)

    if n > MAXTRIES:
      # stay quiet.
      return

    if n == MAXTRIES:
      # Recording one more attempt pushes the counter past MAXTRIES, so the
      # giving-up message is issued only once.
      notepad.RecordRestartAttempt(instance)
      self.messages.append(Message(ERROR, "Could not restart %s for %d"
                                   " times, giving up..." %
                                   (instance.name, MAXTRIES)))
      return

    last = " (Attempt #%d)" % (n + 1)
    self.messages.append(Message(NOTICE, ("Restarting %s%s." %
                                          (instance.name, last))))
    try:
      instance.Restart()
    except Error:
      self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

    notepad.RecordRestartAttempt(instance)

  def WriteReport(self, logfile):
    """Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)

    """
    for msg in self.messages:
      logfile.write("%s\n" % str(msg))
429 a8083063 Iustin Pop
430 a8083063 Iustin Pop
431 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The only option is -d/--debug, which keeps messages on the terminal
  instead of redirecting them to the log file.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version)

  parser.add_option("-d", "--debug", dest="debug",
                    help="Don't redirect messages to the log file",
                    default=False, action="store_true")

  return parser.parse_args()
448 a8083063 Iustin Pop
449 a8083063 Iustin Pop
450 a8083063 Iustin Pop
def main():
  """Main function.

  Runs one watcher pass and writes the report.  Unless --debug is given,
  stdout and stderr are redirected to the watcher log file first.

  Exits successfully when there is no cluster configuration, with
  EXIT_NOTMASTER when this host is not the master and with
  EXIT_NODESETUP_ERROR when the hostname cannot be resolved.  Other errors
  raised as Error are printed.

  """
  options = ParseOptions()[0]

  if not options.debug:
    sys.stderr = sys.stdout = open(constants.LOG_WATCHER, 'a')

  try:
    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)
    watcher.Run()
    watcher.WriteReport(sys.stdout)
  except NotMasterError:
    if options.debug:
      sys.stderr.write("Not master, exiting.\n")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    err = sys.exc_info()[1]
    sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Error:
    sys.stdout.write("%s\n" % str(sys.exc_info()[1]))
476 a8083063 Iustin Pop
477 5a3103e9 Michael Hanselmann
478 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported.
if __name__ == '__main__':
  main()