Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 153d9724

History | View | Annotate | Download (11.9 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 a8083063 Iustin Pop
# Copyright (C) 2006, 2007 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 5a3103e9 Michael Hanselmann
import re
33 a8083063 Iustin Pop
import time
34 a8083063 Iustin Pop
import fcntl
35 a8083063 Iustin Pop
import errno
36 5a3103e9 Michael Hanselmann
import simplejson
37 a8083063 Iustin Pop
from optparse import OptionParser
38 a8083063 Iustin Pop
39 a8083063 Iustin Pop
from ganeti import utils
40 a8083063 Iustin Pop
from ganeti import constants
41 38242904 Iustin Pop
from ganeti import ssconf
42 89e1fc26 Iustin Pop
from ganeti import errors
43 a8083063 Iustin Pop
44 a8083063 Iustin Pop
45 5a3103e9 Michael Hanselmann
# Number of restart attempts after which the watcher gives up on an instance.
MAXTRIES = 5

# Instance states for which a restart is attempted.
BAD_STATES = ["stopped"]

# Instance states for which nothing is attempted; the restart record of such
# an instance is simply dropped.
HELPLESS_STATES = ["(node down)"]

# Severity labels used for the report messages.
NOTICE = "NOTICE"
ERROR = "ERROR"
50 5a3103e9 Michael Hanselmann
51 5a3103e9 Michael Hanselmann
52 a8083063 Iustin Pop
class Error(Exception):
  """Base class for the errors raised by this script.

  """
54 38242904 Iustin Pop
55 38242904 Iustin Pop
56 38242904 Iustin Pop
class NotMasterError(Error):
  """Raised when the watcher is started on a node that is not the master.

  """
58 a8083063 Iustin Pop
59 a8083063 Iustin Pop
60 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Prefix every line of a text and terminate it with a newline.

  Args:
    s: The string to indent
    prefix: The string put in front of each line.

  Returns:
    The indented text.  An empty input still yields one line holding
    only the prefix.

  """
  lines = s.splitlines()
  separator = '\n' + prefix
  return prefix + separator.join(lines) + "\n"
69 a8083063 Iustin Pop
70 a8083063 Iustin Pop
71 a8083063 Iustin Pop
def DoCmd(cmd):
  """Run a command through utils.RunCmd and insist on its success.

  Args:
    cmd: the command to run, handed unchanged to utils.RunCmd.

  Returns:
    The result object produced by utils.RunCmd.

  Raises Error with verbose commentary (command, failure reason and the
  indented stdout/stderr) when the result is flagged as failed.

  """
  result = utils.RunCmd(cmd)

  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % details)
90 a8083063 Iustin Pop
91 a8083063 Iustin Pop
92 5a3103e9 Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a JSON document with two top-level keys: "instance", which
  maps instance names to their restart bookkeeping ("restart_count" and
  "restart_when"), and "node", which maps node names to the last boot ID
  recorded for them.  The file is locked exclusively by the constructor
  and stays locked until Save() is called or the object is destroyed.

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    An unreadable or invalid file is reported on stderr and treated as
    empty.

    Raises StandardError on lock contention.

    """
    # Assigned first so that __del__ finds the attribute even when one of
    # the steps below raises.
    self.statefile = None

    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
      # sys.exc_info() rather than a named handler, so that the code parses
      # on old and new interpreters alike.
      err = sys.exc_info()[1]
      # The handle is not stored anywhere yet; close it instead of leaking it.
      f.close()
      if err.errno == errno.EAGAIN:
        raise StandardError("State file already locked")
      raise

    self.statefile = f

    try:
      self.data = simplejson.load(self.statefile)
    except Exception:
      msg = sys.exc_info()[1]
      # Ignore errors while loading the file and treat it as empty
      self.data = {}
      sys.stderr.write("Empty or invalid state file. "
          "Using defaults. Error message: %s\n" % msg)

    if not isinstance(self.data, dict):
      # Well-formed JSON, but not the mapping the accessors below rely on.
      self.data = {}
      sys.stderr.write("Unexpected state file content. Using defaults.\n")

    self.data.setdefault("instance", {})
    self.data.setdefault("node", {})

  def __del__(self):
    """Called on destruction; releases the file if it is still held.

    """
    # getattr: the attribute is missing if the object was never initialized.
    if getattr(self, "statefile", None):
      self._Close()

  def _Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)

    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    Args:
      name - the node name to look up.

    """
    return self.data["node"].get(name, {}).get("bootid")

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    Args:
      name - the node name
      bootid - the boot ID to remember; must not be empty

    """
    assert bootid

    self.data["node"].setdefault(name, {})["bootid"] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns 0 for an unknown instance and for a record that carries no
    counter.

    """
    record = self.data["instance"].get(instance.name)
    if record is None:
      return 0

    return record.get("restart_count", 0)

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the attempt counter of the
    instance, creating its record when needed.

    Args:
      instance - the instance being restarted

    """
    record = self.data["instance"].setdefault(instance.name, {})

    record["restart_when"] = time.time()
    # The counter lives in the per-instance record, not in the mapping of
    # all instances; reading it from the latter would pin the count at 1.
    record["restart_count"] = record.get("restart_count", 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; an unknown
    instance is ignored.

    """
    self.data["instance"].pop(instance.name, None)

  def Save(self):
    """Save state to file, then unlock and close it.

    """
    assert self.statefile

    # Serialize before truncating, so that a serialization failure leaves
    # the previous content of the file untouched.
    serialized = simplejson.dumps(self.data)

    self.statefile.seek(0)
    self.statefile.truncate()
    self.statefile.write(serialized)

    self._Close()
228 a8083063 Iustin Pop
229 a8083063 Iustin Pop
230 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Attributes:
    name: the instance name, used as argument of the gnt-instance commands
    state: the state value given at construction
    autostart: the autostart flag given at construction

  Methods:
    Restart(): issue a command to start the represented machine.
    ActivateDisks(): issue a command to activate the machine's disks.

  """
  def __init__(self, name, state, autostart):
    self.name, self.state, self.autostart = name, state, autostart

  def _RunAction(self, action):
    """Runs the given gnt-instance sub-command on this instance.

    """
    DoCmd(['gnt-instance', action, '--lock-retries=15', self.name])

  def Restart(self):
    """Encapsulates the start of an instance.

    """
    self._RunAction('startup')

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    """
    self._RunAction('activate-disks')
253 5a3103e9 Michael Hanselmann
254 a8083063 Iustin Pop
255 5a3103e9 Michael Hanselmann
def _RunListCmd(cmd):
  """Runs a command and yields each line of its output as a list of fields.

  The command is run through DoCmd once iteration starts; every line of
  its standard output is split at the ':' separator.

  """
  output = DoCmd(cmd).stdout
  for row in output.splitlines():
    yield row.split(':')
261 a8083063 Iustin Pop
262 a8083063 Iustin Pop
263 5a3103e9 Michael Hanselmann
def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  Args:
    with_secondaries: optional collection of node names.  When given, only
      instances listing at least one of these nodes among their secondary
      nodes are returned; instances whose secondary node field is "-" are
      skipped.

  Returns:
    A list of Instance objects.  Their autostart flag is set unless the
    admin_state field reads "no".

  """
  filtering = with_secondaries is not None

  columns = 'name,oper_state,admin_state'
  if filtering:
    columns += ',snodes'

  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', columns]

  result = []
  for row in _RunListCmd(cmd):
    if filtering:
      (name, status, autostart, snodes) = row

      if snodes == "-":
        continue

      secondaries = snodes.split(',')
      wanted = [node for node in with_secondaries if node in secondaries]
      if not wanted:
        continue
    else:
      (name, status, autostart) = row

    result.append(Instance(name, status, autostart != "no"))

  return result
298 5a3103e9 Michael Hanselmann
299 5a3103e9 Michael Hanselmann
300 5a3103e9 Michael Hanselmann
def GetNodeBootIDs():
  """Get a dict mapping nodes to boot IDs.

  The data is taken from the "name" and "bootid" fields printed by
  "gnt-node list".

  """
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', 'name,bootid']

  bootids = {}
  for (node_name, node_bootid) in _RunListCmd(cmd):
    bootids[node_name] = node_bootid

  return bootids
313 a8083063 Iustin Pop
314 a8083063 Iustin Pop
315 a8083063 Iustin Pop
class Message(object):
  """Encapsulation of a notice or error message.

  Attributes:
    level: the severity label of the message
    msg: the message text
    when: creation time, as returned by time.time()

  """
  def __init__(self, level, msg):
    self.level, self.msg = level, msg
    self.when = time.time()

  def __str__(self):
    """Formats the message as a header line followed by the indented text.

    """
    header = self.level + ' ' + time.ctime(self.when)
    return header + '\n' + Indent(self.msg)
326 a8083063 Iustin Pop
327 a8083063 Iustin Pop
328 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.  Messages produced along the way are
  collected and can be written out with WriteReport().

  """
  def __init__(self):
    """Collects the instance list and the node boot IDs.

    Raises NotMasterError if this host is not the master node.

    """
    master = ssconf.SimpleStore().GetMasterNode()
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")

    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    self.messages = []

  def _AddMessage(self, level, text):
    """Appends a new message to the report.

    """
    self.messages.append(Message(level, text))

  def Run(self):
    """Checks instances and disks, then saves the watcher state.

    """
    notepad = WatcherState()
    self.CheckInstances(notepad)
    self.CheckDisks(notepad)
    notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    Args:
      notepad - the WatcherState holding the boot IDs seen previously

    """
    # A boot ID differing from the recorded one probably means the node
    # has been rebooted.
    changed = [name for (name, bootid) in self.bootids.items()
               if notepad.GetNodeBootID(name) != bootid]

    if not changed:
      return

    # Activate disks for all instances with any of the changed nodes as a
    # secondary node.
    for instance in GetInstanceList(with_secondaries=changed):
      try:
        self._AddMessage(NOTICE, "Activating disks for %s." % instance.name)
        instance.ActivateDisks()
      except Error:
        self._AddMessage(ERROR, str(sys.exc_info()[1]))

    # Keep changed boot IDs
    for name in changed:
      notepad.SetNodeBootID(name, self.bootids[name])

  def _TryRestart(self, instance, notepad):
    """Restarts one stopped instance unless the attempts are used up.

    """
    attempts = notepad.NumberOfRestartAttempts(instance)

    if attempts > MAXTRIES:
      # stay quiet.
      return

    if not attempts < MAXTRIES:
      # Limit just reached: count once more so that this is reported
      # only once.
      notepad.RecordRestartAttempt(instance)
      self._AddMessage(ERROR, "Could not restart %s for %d"
                       " times, giving up..." %
                       (instance.name, MAXTRIES))
      return

    suffix = " (Attempt #%d)" % (attempts + 1)
    try:
      self._AddMessage(NOTICE, "Restarting %s%s." % (instance.name, suffix))
      instance.Restart()
    except Error:
      self._AddMessage(ERROR, str(sys.exc_info()[1]))

    notepad.RecordRestartAttempt(instance)

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Args:
      notepad - the WatcherState recording the restart attempts

    """
    for instance in self.instances:
      # Don't care about manually stopped instances
      if not instance.autostart:
        continue

      if instance.state in BAD_STATES:
        self._TryRestart(instance, notepad)
        continue

      if not notepad.NumberOfRestartAttempts(instance):
        continue

      # The instance has left the bad state; forget its restart record.
      notepad.RemoveInstance(instance)
      if instance.state not in HELPLESS_STATES:
        self._AddMessage(NOTICE, "Restart of %s succeeded." % instance.name)

  def WriteReport(self, logfile):
    """Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)

    """
    for msg in self.messages:
      logfile.write(str(msg) + "\n")
429 a8083063 Iustin Pop
430 a8083063 Iustin Pop
431 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The only option is -d/--debug, a flag defaulting to False.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option("-d", "--debug",
                    action="store_true", dest="debug", default=False,
                    help="Don't redirect messages to the log file")

  (options, args) = parser.parse_args()
  return (options, args)
448 a8083063 Iustin Pop
449 a8083063 Iustin Pop
450 a8083063 Iustin Pop
def main():
  """Main function.

  Unless the debug option is given, stdout and stderr are redirected to
  the watcher log file.  The process exits with EXIT_SUCCESS if there is
  no configuration, EXIT_NOTMASTER if this is not the master node and
  EXIT_NODESETUP_ERROR if a hostname cannot be resolved.  Any other Error
  is only printed.

  """
  options = ParseOptions()[0]

  if not options.debug:
    logfile = open(constants.LOG_WATCHER, 'a')
    sys.stderr = logfile
    sys.stdout = logfile

  try:
    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)

    watcher.Run()
    watcher.WriteReport(sys.stdout)
  except NotMasterError:
    if options.debug:
      sys.stderr.write("Not master, exiting.\n")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    hostname = sys.exc_info()[1].args[0]
    sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % hostname)
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Error:
    # NOTE(review): the process still ends with a zero status here --
    # confirm that this is intended.
    sys.stdout.write(str(sys.exc_info()[1]) + "\n")
476 a8083063 Iustin Pop
477 5a3103e9 Michael Hanselmann
478 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported.
if __name__ == "__main__":
  main()