Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ eee1fa2d

History | View | Annotate | Download (12.7 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 a8083063 Iustin Pop
# Copyright (C) 2006, 2007 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 5a3103e9 Michael Hanselmann
import re
33 a8083063 Iustin Pop
import time
34 a8083063 Iustin Pop
import fcntl
35 a8083063 Iustin Pop
import errno
36 5a3103e9 Michael Hanselmann
import simplejson
37 a8083063 Iustin Pop
from optparse import OptionParser
38 a8083063 Iustin Pop
39 a8083063 Iustin Pop
from ganeti import utils
40 a8083063 Iustin Pop
from ganeti import constants
41 38242904 Iustin Pop
from ganeti import ssconf
42 89e1fc26 Iustin Pop
from ganeti import errors
43 a8083063 Iustin Pop
44 a8083063 Iustin Pop
45 5a3103e9 Michael Hanselmann
# Number of restart attempts made for an instance before giving up
MAXTRIES = 5
# Operational states for which a restart is attempted
BAD_STATES = ['stopped']
# Operational states for which no restart is attempted; any recorded
# restart attempts are dropped
HELPLESS_STATES = ['(node down)']
# Message levels
NOTICE = 'NOTICE'
ERROR = 'ERROR'
# Keys used inside the per-instance and per-node records of the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
53 5a3103e9 Michael Hanselmann
54 5a3103e9 Michael Hanselmann
55 a8083063 Iustin Pop
class Error(Exception):
  """Generic custom error class.

  Base class for the errors raised by this script itself.

  """
57 38242904 Iustin Pop
58 38242904 Iustin Pop
59 38242904 Iustin Pop
class NotMasterError(Error):
  """Exception raised when this host is not the master.

  """
61 a8083063 Iustin Pop
62 a8083063 Iustin Pop
63 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Indent a piece of text with a given prefix before each line.

  Args:
    s: The string to indent
    prefix: The string to prepend to each line.

  Returns:
    The indented text, always terminated by exactly one newline. An empty
    input yields the prefix followed by a newline.

  """
  return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
72 a8083063 Iustin Pop
73 a8083063 Iustin Pop
74 a8083063 Iustin Pop
def DoCmd(cmd):
  """Run a shell command.

  Args:
    cmd: the command to run.

  Returns:
    The result object returned by utils.RunCmd.

  Raises Error with verbose commentary (failure reason, stdout and
  stderr of the command) on error.

  """
  res = utils.RunCmd(cmd)

  if res.failed:
    # Indent() terminates each part with a newline, hence no "\n" before
    # "stderr:" in the format string
    raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
                (repr(cmd),
                 Indent(res.fail_reason),
                 Indent(res.stdout),
                 Indent(res.stderr)))

  return res
93 a8083063 Iustin Pop
94 a8083063 Iustin Pop
95 5a3103e9 Michael Hanselmann
class WatcherState(object):
96 a8083063 Iustin Pop
  """Interface to a state file recording restart attempts.
97 a8083063 Iustin Pop
98 a8083063 Iustin Pop
  """
99 a8083063 Iustin Pop
  def __init__(self):
100 5a3103e9 Michael Hanselmann
    """Open, lock, read and parse the file.
101 5a3103e9 Michael Hanselmann
102 5a3103e9 Michael Hanselmann
    Raises StandardError on lock contention.
103 5a3103e9 Michael Hanselmann
104 5a3103e9 Michael Hanselmann
    """
105 a8083063 Iustin Pop
    # The two-step dance below is necessary to allow both opening existing
106 a8083063 Iustin Pop
    # file read/write and creating if not existing.  Vanilla open will truncate
107 a8083063 Iustin Pop
    # an existing file -or- allow creating if not existing.
108 a8083063 Iustin Pop
    f = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
109 a8083063 Iustin Pop
    f = os.fdopen(f, 'w+')
110 a8083063 Iustin Pop
111 a8083063 Iustin Pop
    try:
112 a8083063 Iustin Pop
      fcntl.flock(f.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
113 a8083063 Iustin Pop
    except IOError, x:
114 a8083063 Iustin Pop
      if x.errno == errno.EAGAIN:
115 3ecf6786 Iustin Pop
        raise StandardError("State file already locked")
116 a8083063 Iustin Pop
      raise
117 a8083063 Iustin Pop
118 a8083063 Iustin Pop
    self.statefile = f
119 a8083063 Iustin Pop
120 5a3103e9 Michael Hanselmann
    try:
121 5a3103e9 Michael Hanselmann
      self.data = simplejson.load(self.statefile)
122 5a3103e9 Michael Hanselmann
    except Exception, msg:
123 5a3103e9 Michael Hanselmann
      # Ignore errors while loading the file and treat it as empty
124 5a3103e9 Michael Hanselmann
      self.data = {}
125 f4bc1f2c Michael Hanselmann
      sys.stderr.write("Empty or invalid state file."
126 f4bc1f2c Michael Hanselmann
                       " Using defaults. Error message: %s\n" % msg)
127 5a3103e9 Michael Hanselmann
128 5a3103e9 Michael Hanselmann
    if "instance" not in self.data:
129 5a3103e9 Michael Hanselmann
      self.data["instance"] = {}
130 5a3103e9 Michael Hanselmann
    if "node" not in self.data:
131 5a3103e9 Michael Hanselmann
      self.data["node"] = {}
132 5a3103e9 Michael Hanselmann
133 5a3103e9 Michael Hanselmann
  def __del__(self):
134 5a3103e9 Michael Hanselmann
    """Called on destruction.
135 5a3103e9 Michael Hanselmann
136 5a3103e9 Michael Hanselmann
    """
137 5a3103e9 Michael Hanselmann
    if self.statefile:
138 5a3103e9 Michael Hanselmann
      self._Close()
139 5a3103e9 Michael Hanselmann
140 5a3103e9 Michael Hanselmann
  def _Close(self):
141 5a3103e9 Michael Hanselmann
    """Unlock configuration file and close it.
142 5a3103e9 Michael Hanselmann
143 5a3103e9 Michael Hanselmann
    """
144 5a3103e9 Michael Hanselmann
    assert self.statefile
145 5a3103e9 Michael Hanselmann
146 5a3103e9 Michael Hanselmann
    fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)
147 5a3103e9 Michael Hanselmann
148 5a3103e9 Michael Hanselmann
    self.statefile.close()
149 5a3103e9 Michael Hanselmann
    self.statefile = None
150 5a3103e9 Michael Hanselmann
151 5a3103e9 Michael Hanselmann
  def GetNodeBootID(self, name):
152 5a3103e9 Michael Hanselmann
    """Returns the last boot ID of a node or None.
153 a8083063 Iustin Pop
154 5a3103e9 Michael Hanselmann
    """
155 5a3103e9 Michael Hanselmann
    ndata = self.data["node"]
156 5a3103e9 Michael Hanselmann
157 7b195d9b Michael Hanselmann
    if name in ndata and KEY_BOOT_ID in ndata[name]:
158 7b195d9b Michael Hanselmann
      return ndata[name][KEY_BOOT_ID]
159 5a3103e9 Michael Hanselmann
    return None
160 5a3103e9 Michael Hanselmann
161 5a3103e9 Michael Hanselmann
  def SetNodeBootID(self, name, bootid):
162 5a3103e9 Michael Hanselmann
    """Sets the boot ID of a node.
163 5a3103e9 Michael Hanselmann
164 5a3103e9 Michael Hanselmann
    """
165 5a3103e9 Michael Hanselmann
    assert bootid
166 a8083063 Iustin Pop
167 5a3103e9 Michael Hanselmann
    ndata = self.data["node"]
168 a8083063 Iustin Pop
169 5a3103e9 Michael Hanselmann
    if name not in ndata:
170 5a3103e9 Michael Hanselmann
      ndata[name] = {}
171 5a3103e9 Michael Hanselmann
172 7b195d9b Michael Hanselmann
    ndata[name][KEY_BOOT_ID] = bootid
173 5a3103e9 Michael Hanselmann
174 5a3103e9 Michael Hanselmann
  def NumberOfRestartAttempts(self, instance):
175 a8083063 Iustin Pop
    """Returns number of previous restart attempts.
176 a8083063 Iustin Pop
177 a8083063 Iustin Pop
    Args:
178 a8083063 Iustin Pop
      instance - the instance to look up.
179 38242904 Iustin Pop
180 a8083063 Iustin Pop
    """
181 5a3103e9 Michael Hanselmann
    idata = self.data["instance"]
182 a8083063 Iustin Pop
183 5a3103e9 Michael Hanselmann
    if instance.name in idata:
184 7b195d9b Michael Hanselmann
      return idata[instance.name][KEY_RESTART_COUNT]
185 a8083063 Iustin Pop
186 a8083063 Iustin Pop
    return 0
187 a8083063 Iustin Pop
188 5a3103e9 Michael Hanselmann
  def RecordRestartAttempt(self, instance):
189 a8083063 Iustin Pop
    """Record a restart attempt.
190 a8083063 Iustin Pop
191 a8083063 Iustin Pop
    Args:
192 a8083063 Iustin Pop
      instance - the instance being restarted
193 38242904 Iustin Pop
194 a8083063 Iustin Pop
    """
195 5a3103e9 Michael Hanselmann
    idata = self.data["instance"]
196 a8083063 Iustin Pop
197 5a3103e9 Michael Hanselmann
    if instance.name not in idata:
198 5a3103e9 Michael Hanselmann
      inst = idata[instance.name] = {}
199 5a3103e9 Michael Hanselmann
    else:
200 5a3103e9 Michael Hanselmann
      inst = idata[instance.name]
201 a8083063 Iustin Pop
202 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_WHEN] = time.time()
203 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
204 a8083063 Iustin Pop
205 5a3103e9 Michael Hanselmann
  def RemoveInstance(self, instance):
206 38242904 Iustin Pop
    """Update state to reflect that a machine is running, i.e. remove record.
207 a8083063 Iustin Pop
208 a8083063 Iustin Pop
    Args:
209 a8083063 Iustin Pop
      instance - the instance to remove from books
210 a8083063 Iustin Pop
211 38242904 Iustin Pop
    This method removes the record for a named instance.
212 38242904 Iustin Pop
213 a8083063 Iustin Pop
    """
214 5a3103e9 Michael Hanselmann
    idata = self.data["instance"]
215 a8083063 Iustin Pop
216 5a3103e9 Michael Hanselmann
    if instance.name in idata:
217 5a3103e9 Michael Hanselmann
      del idata[instance.name]
218 a8083063 Iustin Pop
219 a8083063 Iustin Pop
  def Save(self):
220 5a3103e9 Michael Hanselmann
    """Save state to file, then unlock and close it.
221 38242904 Iustin Pop
222 a8083063 Iustin Pop
    """
223 a8083063 Iustin Pop
    assert self.statefile
224 a8083063 Iustin Pop
225 a8083063 Iustin Pop
    self.statefile.seek(0)
226 a8083063 Iustin Pop
    self.statefile.truncate()
227 a8083063 Iustin Pop
228 5a3103e9 Michael Hanselmann
    simplejson.dump(self.data, self.statefile)
229 a8083063 Iustin Pop
230 5a3103e9 Michael Hanselmann
    self._Close()
231 a8083063 Iustin Pop
232 a8083063 Iustin Pop
233 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Attributes:
    name: the instance name, as passed to the gnt-instance commands
    state: the state string the instance was created with
    autostart: whether the watcher should act on this instance

  Methods:
    Restart(): issue a command to restart the represented machine.
    ActivateDisks(): issue a command to activate the machine's disks.

  """
  def __init__(self, name, state, autostart):
    self.name = name
    self.state = state
    self.autostart = autostart

  def Restart(self):
    """Encapsulates the start of an instance.

    Raises Error if the command fails.

    """
    DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    Raises Error if the command fails.

    """
    DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15', self.name])
256 5a3103e9 Michael Hanselmann
257 a8083063 Iustin Pop
258 5a3103e9 Michael Hanselmann
def _RunListCmd(cmd):
  """Runs a command and parses its output into lists.

  Args:
    cmd: the command to run; its output is expected to use ':' as the
      field separator.

  Yields one list of fields per line of the command's standard output.
  Raises Error (from DoCmd) if the command fails.

  """
  for line in DoCmd(cmd).stdout.splitlines():
    yield line.split(':')
264 a8083063 Iustin Pop
265 a8083063 Iustin Pop
266 5a3103e9 Michael Hanselmann
def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  Args:
    with_secondaries: optional list of node names. When given (i.e. not
      None), only instances having at least one of these nodes among
      their secondary nodes are returned.

  Returns:
    A list of Instance objects. An instance's autostart flag is set
    unless its admin_state field reads "no".

  """
  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:']

  fields = 'name,oper_state,admin_state'

  if with_secondaries is not None:
    fields += ',snodes'

  cmd.append('-o')
  cmd.append(fields)

  instances = []
  for row in _RunListCmd(cmd):
    if with_secondaries is not None:
      (name, status, autostart, snodes) = row

      if snodes == "-":
        # presumably how an empty list of secondary nodes is printed;
        # such an instance can't match any of the requested nodes
        continue

      # Split once per instance instead of once per requested node
      snode_list = snodes.split(',')
      for node in with_secondaries:
        if node in snode_list:
          break
      else:
        # None of the requested nodes is a secondary of this instance
        continue

    else:
      (name, status, autostart) = row

    instances.append(Instance(name, status, autostart != "no"))

  return instances
301 5a3103e9 Michael Hanselmann
302 5a3103e9 Michael Hanselmann
303 5a3103e9 Michael Hanselmann
def GetNodeBootIDs():
  """Get a dict mapping nodes to boot IDs.

  Returns:
    A dictionary with node names as keys and the boot ID strings printed
    by "gnt-node list" as values.

  """
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', 'name,bootid']

  ids = {}
  for (name, bootid) in _RunListCmd(cmd):
    ids[name] = bootid

  return ids
316 a8083063 Iustin Pop
317 a8083063 Iustin Pop
318 a8083063 Iustin Pop
class Message(object):
  """Encapsulation of a notice or error message.

  Attributes:
    level: the message level (the file uses NOTICE and ERROR)
    msg: the message text
    when: the creation time, as returned by time.time()

  """
  def __init__(self, level, msg):
    self.level = level
    self.msg = msg
    self.when = time.time()

  def __str__(self):
    """Format as a "<level> <timestamp>" line followed by the indented text.

    """
    return self.level + ' ' + time.ctime(self.when) + '\n' + Indent(self.msg)
329 a8083063 Iustin Pop
330 a8083063 Iustin Pop
331 5a3103e9 Michael Hanselmann
class Watcher(object):
332 a8083063 Iustin Pop
  """Encapsulate the logic for restarting erronously halted virtual machines.
333 a8083063 Iustin Pop
334 a8083063 Iustin Pop
  The calling program should periodically instantiate me and call Run().
335 a8083063 Iustin Pop
  This will traverse the list of instances, and make up to MAXTRIES attempts
336 a8083063 Iustin Pop
  to restart machines that are down.
337 38242904 Iustin Pop
338 a8083063 Iustin Pop
  """
339 a8083063 Iustin Pop
  def __init__(self):
340 38242904 Iustin Pop
    sstore = ssconf.SimpleStore()
341 38242904 Iustin Pop
    master = sstore.GetMasterNode()
342 89e1fc26 Iustin Pop
    if master != utils.HostInfo().name:
343 3ecf6786 Iustin Pop
      raise NotMasterError("This is not the master node")
344 5a3103e9 Michael Hanselmann
    self.instances = GetInstanceList()
345 5a3103e9 Michael Hanselmann
    self.bootids = GetNodeBootIDs()
346 a8083063 Iustin Pop
    self.messages = []
347 eee1fa2d Iustin Pop
    self.started_instances = set()
348 a8083063 Iustin Pop
349 a8083063 Iustin Pop
  def Run(self):
350 5a3103e9 Michael Hanselmann
    notepad = WatcherState()
351 5a3103e9 Michael Hanselmann
    self.CheckInstances(notepad)
352 5a3103e9 Michael Hanselmann
    self.CheckDisks(notepad)
353 d2f311db Iustin Pop
    self.VerifyDisks()
354 5a3103e9 Michael Hanselmann
    notepad.Save()
355 5a3103e9 Michael Hanselmann
356 5a3103e9 Michael Hanselmann
  def CheckDisks(self, notepad):
357 5a3103e9 Michael Hanselmann
    """Check all nodes for restarted ones.
358 38242904 Iustin Pop
359 a8083063 Iustin Pop
    """
360 5a3103e9 Michael Hanselmann
    check_nodes = []
361 5a3103e9 Michael Hanselmann
    for name, id in self.bootids.iteritems():
362 5a3103e9 Michael Hanselmann
      old = notepad.GetNodeBootID(name)
363 5a3103e9 Michael Hanselmann
      if old != id:
364 5a3103e9 Michael Hanselmann
        # Node's boot ID has changed, proably through a reboot.
365 5a3103e9 Michael Hanselmann
        check_nodes.append(name)
366 5a3103e9 Michael Hanselmann
367 5a3103e9 Michael Hanselmann
    if check_nodes:
368 5a3103e9 Michael Hanselmann
      # Activate disks for all instances with any of the checked nodes as a
369 5a3103e9 Michael Hanselmann
      # secondary node.
370 5a3103e9 Michael Hanselmann
      for instance in GetInstanceList(with_secondaries=check_nodes):
371 0c0f834d Iustin Pop
        if not instance.autostart:
372 0c0f834d Iustin Pop
          self.messages.append(Message(NOTICE,
373 0c0f834d Iustin Pop
                                       ("Skipping disk activation for"
374 0c0f834d Iustin Pop
                                        " non-autostart instance '%s'." %
375 0c0f834d Iustin Pop
                                        instance.name)))
376 0c0f834d Iustin Pop
          continue
377 eee1fa2d Iustin Pop
        if instance.name in self.started_instances:
378 eee1fa2d Iustin Pop
          # we already tried to start the instance, which should have
379 eee1fa2d Iustin Pop
          # activated its drives (if they can be at all)
380 eee1fa2d Iustin Pop
          continue
381 5a3103e9 Michael Hanselmann
        try:
382 f4bc1f2c Michael Hanselmann
          self.messages.append(Message(NOTICE, ("Activating disks for %s." %
383 f4bc1f2c Michael Hanselmann
                                                instance.name)))
384 5a3103e9 Michael Hanselmann
          instance.ActivateDisks()
385 5a3103e9 Michael Hanselmann
        except Error, x:
386 5a3103e9 Michael Hanselmann
          self.messages.append(Message(ERROR, str(x)))
387 5a3103e9 Michael Hanselmann
388 5a3103e9 Michael Hanselmann
      # Keep changed boot IDs
389 5a3103e9 Michael Hanselmann
      for name in check_nodes:
390 5a3103e9 Michael Hanselmann
        notepad.SetNodeBootID(name, self.bootids[name])
391 a8083063 Iustin Pop
392 5a3103e9 Michael Hanselmann
  def CheckInstances(self, notepad):
393 5a3103e9 Michael Hanselmann
    """Make a pass over the list of instances, restarting downed ones.
394 5a3103e9 Michael Hanselmann
395 5a3103e9 Michael Hanselmann
    """
396 a8083063 Iustin Pop
    for instance in self.instances:
397 5a3103e9 Michael Hanselmann
      # Don't care about manually stopped instances
398 5a3103e9 Michael Hanselmann
      if not instance.autostart:
399 5a3103e9 Michael Hanselmann
        continue
400 5a3103e9 Michael Hanselmann
401 a8083063 Iustin Pop
      if instance.state in BAD_STATES:
402 5a3103e9 Michael Hanselmann
        n = notepad.NumberOfRestartAttempts(instance)
403 a8083063 Iustin Pop
404 a8083063 Iustin Pop
        if n > MAXTRIES:
405 a8083063 Iustin Pop
          # stay quiet.
406 a8083063 Iustin Pop
          continue
407 a8083063 Iustin Pop
        elif n < MAXTRIES:
408 a8083063 Iustin Pop
          last = " (Attempt #%d)" % (n + 1)
409 a8083063 Iustin Pop
        else:
410 5a3103e9 Michael Hanselmann
          notepad.RecordRestartAttempt(instance)
411 a8083063 Iustin Pop
          self.messages.append(Message(ERROR, "Could not restart %s for %d"
412 a8083063 Iustin Pop
                                       " times, giving up..." %
413 a8083063 Iustin Pop
                                       (instance.name, MAXTRIES)))
414 a8083063 Iustin Pop
          continue
415 a8083063 Iustin Pop
        try:
416 f4bc1f2c Michael Hanselmann
          self.messages.append(Message(NOTICE, ("Restarting %s%s." %
417 f4bc1f2c Michael Hanselmann
                                                (instance.name, last))))
418 a8083063 Iustin Pop
          instance.Restart()
419 eee1fa2d Iustin Pop
          self.started_instances.add(instance.name)
420 a8083063 Iustin Pop
        except Error, x:
421 a8083063 Iustin Pop
          self.messages.append(Message(ERROR, str(x)))
422 a8083063 Iustin Pop
423 5a3103e9 Michael Hanselmann
        notepad.RecordRestartAttempt(instance)
424 a8083063 Iustin Pop
      elif instance.state in HELPLESS_STATES:
425 5a3103e9 Michael Hanselmann
        if notepad.NumberOfRestartAttempts(instance):
426 5a3103e9 Michael Hanselmann
          notepad.RemoveInstance(instance)
427 a8083063 Iustin Pop
      else:
428 5a3103e9 Michael Hanselmann
        if notepad.NumberOfRestartAttempts(instance):
429 5a3103e9 Michael Hanselmann
          notepad.RemoveInstance(instance)
430 f4bc1f2c Michael Hanselmann
          msg = Message(NOTICE, "Restart of %s succeeded." % instance.name)
431 a8083063 Iustin Pop
          self.messages.append(msg)
432 a8083063 Iustin Pop
433 d2f311db Iustin Pop
  def VerifyDisks(self):
434 d2f311db Iustin Pop
    """Run gnt-cluster verify-disks.
435 d2f311db Iustin Pop
436 d2f311db Iustin Pop
    """
437 d2f311db Iustin Pop
    result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
438 d2f311db Iustin Pop
    if result.output:
439 d2f311db Iustin Pop
      self.messages.append(Message(NOTICE, result.output))
440 d2f311db Iustin Pop
441 a8083063 Iustin Pop
  def WriteReport(self, logfile):
442 38242904 Iustin Pop
    """Log all messages to file.
443 a8083063 Iustin Pop
444 a8083063 Iustin Pop
    Args:
445 a8083063 Iustin Pop
      logfile: file object open for writing (the log file)
446 38242904 Iustin Pop
447 a8083063 Iustin Pop
    """
448 a8083063 Iustin Pop
    for msg in self.messages:
449 a8083063 Iustin Pop
      print >> logfile, str(msg)
450 a8083063 Iustin Pop
451 a8083063 Iustin Pop
452 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version="%%prog (ganeti) %s" %
                        constants.RELEASE_VERSION)

  parser.add_option("-d", "--debug", dest="debug",
                    help="Don't redirect messages to the log file",
                    default=False, action="store_true")
  options, args = parser.parse_args()
  return options, args
469 a8083063 Iustin Pop
470 a8083063 Iustin Pop
471 a8083063 Iustin Pop
def main():
472 a8083063 Iustin Pop
  """Main function.
473 a8083063 Iustin Pop
474 a8083063 Iustin Pop
  """
475 a8083063 Iustin Pop
  options, args = ParseOptions()
476 a8083063 Iustin Pop
477 a8083063 Iustin Pop
  if not options.debug:
478 5a3103e9 Michael Hanselmann
    sys.stderr = sys.stdout = open(constants.LOG_WATCHER, 'a')
479 a8083063 Iustin Pop
480 a8083063 Iustin Pop
  try:
481 781b2b2b Michael Hanselmann
    try:
482 781b2b2b Michael Hanselmann
      watcher = Watcher()
483 781b2b2b Michael Hanselmann
    except errors.ConfigurationError:
484 781b2b2b Michael Hanselmann
      # Just exit if there's no configuration
485 781b2b2b Michael Hanselmann
      sys.exit(constants.EXIT_SUCCESS)
486 5a3103e9 Michael Hanselmann
    watcher.Run()
487 5a3103e9 Michael Hanselmann
    watcher.WriteReport(sys.stdout)
488 38242904 Iustin Pop
  except NotMasterError:
489 38242904 Iustin Pop
    if options.debug:
490 38242904 Iustin Pop
      sys.stderr.write("Not master, exiting.\n")
491 38242904 Iustin Pop
    sys.exit(constants.EXIT_NOTMASTER)
492 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
493 89e1fc26 Iustin Pop
    sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % err.args[0])
494 89e1fc26 Iustin Pop
    sys.exit(constants.EXIT_NODESETUP_ERROR)
495 a8083063 Iustin Pop
  except Error, err:
496 a8083063 Iustin Pop
    print err
497 a8083063 Iustin Pop
498 5a3103e9 Michael Hanselmann
499 a8083063 Iustin Pop
if __name__ == '__main__':
  main()