Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 9ae49f27

History | View | Annotate | Download (12.9 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 5a3103e9 Michael Hanselmann
import re
33 a8083063 Iustin Pop
import time
34 a8083063 Iustin Pop
import fcntl
35 a8083063 Iustin Pop
import errno
36 438b45d4 Michael Hanselmann
import logging
37 a8083063 Iustin Pop
from optparse import OptionParser
38 a8083063 Iustin Pop
39 a8083063 Iustin Pop
from ganeti import utils
40 a8083063 Iustin Pop
from ganeti import constants
41 67fe61c4 Michael Hanselmann
from ganeti import serializer
42 38242904 Iustin Pop
from ganeti import ssconf
43 89e1fc26 Iustin Pop
from ganeti import errors
44 a8083063 Iustin Pop
45 a8083063 Iustin Pop
46 5a3103e9 Michael Hanselmann
# Number of restart attempts made for a single instance before the watcher
# gives up on it (see Watcher.CheckInstances).
MAXTRIES = 5
# Instance states for which a restart is attempted.
BAD_STATES = ['stopped']
# Instance states in which no restart is attempted; an existing restart
# record for such an instance is simply dropped.
HELPLESS_STATES = ['(node down)']
# NOTE(review): NOTICE and ERROR are not referenced anywhere in the visible
# code; presumably leftover severity labels -- confirm before removing.
NOTICE = 'NOTICE'
ERROR = 'ERROR'
# Keys used inside the per-instance and per-node records of the state file.
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
54 5a3103e9 Michael Hanselmann
55 5a3103e9 Michael Hanselmann
56 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Exception raised when this host is not the master.

  Raised by Watcher.__init__ and turned into the EXIT_NOTMASTER exit code
  by main().

  """
58 a8083063 Iustin Pop
59 a8083063 Iustin Pop
60 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Prefix every line of a text and terminate it with a newline.

  Args:
    s: The text to indent; it is split with str.splitlines().
    prefix: The string put in front of each line.

  Returns:
    The indented text, always ending in exactly one newline.  An empty
    input yields the prefix followed by a newline.

  """
  # Joining with "newline + prefix" prefixes every line but the first,
  # which gets its prefix from the format string below.
  separator = '\n' + prefix
  indented = separator.join(s.splitlines())
  return "%s%s\n" % (prefix, indented)
69 a8083063 Iustin Pop
70 a8083063 Iustin Pop
71 a8083063 Iustin Pop
def DoCmd(cmd):
  """Execute a command and insist that it succeeds.

  Args:
    cmd: the command to run; handed unchanged to utils.RunCmd.

  Returns:
    The result object of utils.RunCmd when the command did not fail.

  Raises:
    errors.CommandError: if the command failed.  The message contains the
      command, the failure reason and the indented stdout and stderr.

  """
  result = utils.RunCmd(cmd)
  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise errors.CommandError("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
                            details)
91 a8083063 Iustin Pop
92 a8083063 Iustin Pop
93 fc428e32 Michael Hanselmann
def LockFile(fd):
94 fc428e32 Michael Hanselmann
  """Locks a file using POSIX locks.
95 fc428e32 Michael Hanselmann
96 fc428e32 Michael Hanselmann
  """
97 fc428e32 Michael Hanselmann
  try:
98 fc428e32 Michael Hanselmann
    fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
99 fc428e32 Michael Hanselmann
  except IOError, err:
100 fc428e32 Michael Hanselmann
    if err.errno == errno.EAGAIN:
101 7bca53e4 Michael Hanselmann
      raise errors.LockError("File already locked")
102 fc428e32 Michael Hanselmann
    raise
103 fc428e32 Michael Hanselmann
104 fc428e32 Michael Hanselmann
105 5a3103e9 Michael Hanselmann
class WatcherState(object):
106 a8083063 Iustin Pop
  """Interface to a state file recording restart attempts.
107 a8083063 Iustin Pop
108 a8083063 Iustin Pop
  """
109 a8083063 Iustin Pop
  def __init__(self):
110 5a3103e9 Michael Hanselmann
    """Open, lock, read and parse the file.
111 5a3103e9 Michael Hanselmann
112 7bca53e4 Michael Hanselmann
    Raises exception on lock contention.
113 5a3103e9 Michael Hanselmann
114 5a3103e9 Michael Hanselmann
    """
115 a8083063 Iustin Pop
    # The two-step dance below is necessary to allow both opening existing
116 a8083063 Iustin Pop
    # file read/write and creating if not existing.  Vanilla open will truncate
117 a8083063 Iustin Pop
    # an existing file -or- allow creating if not existing.
118 fc428e32 Michael Hanselmann
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
119 fc428e32 Michael Hanselmann
    self.statefile = os.fdopen(fd, 'w+')
120 a8083063 Iustin Pop
121 fc428e32 Michael Hanselmann
    LockFile(self.statefile.fileno())
122 a8083063 Iustin Pop
123 5a3103e9 Michael Hanselmann
    try:
124 b76f660d Michael Hanselmann
      self._data = serializer.Load(self.statefile.read())
125 5a3103e9 Michael Hanselmann
    except Exception, msg:
126 5a3103e9 Michael Hanselmann
      # Ignore errors while loading the file and treat it as empty
127 b76f660d Michael Hanselmann
      self._data = {}
128 438b45d4 Michael Hanselmann
      logging.warning(("Empty or invalid state file. Using defaults."
129 438b45d4 Michael Hanselmann
                       " Error message: %s"), msg)
130 5a3103e9 Michael Hanselmann
131 b76f660d Michael Hanselmann
    if "instance" not in self._data:
132 b76f660d Michael Hanselmann
      self._data["instance"] = {}
133 b76f660d Michael Hanselmann
    if "node" not in self._data:
134 b76f660d Michael Hanselmann
      self._data["node"] = {}
135 5a3103e9 Michael Hanselmann
136 2fb96d39 Michael Hanselmann
    self._orig_data = self._data.copy()
137 2fb96d39 Michael Hanselmann
138 fc428e32 Michael Hanselmann
  def Save(self):
139 fc428e32 Michael Hanselmann
    """Save state to file, then unlock and close it.
140 5a3103e9 Michael Hanselmann
141 5a3103e9 Michael Hanselmann
    """
142 fc428e32 Michael Hanselmann
    assert self.statefile
143 fc428e32 Michael Hanselmann
144 2fb96d39 Michael Hanselmann
    if self._orig_data == self._data:
145 2fb96d39 Michael Hanselmann
      logging.debug("Data didn't change, just touching status file")
146 2fb96d39 Michael Hanselmann
      os.utime(constants.WATCHER_STATEFILE, None)
147 2fb96d39 Michael Hanselmann
      return
148 2fb96d39 Michael Hanselmann
149 fc428e32 Michael Hanselmann
    # We need to make sure the file is locked before renaming it, otherwise
150 fc428e32 Michael Hanselmann
    # starting ganeti-watcher again at the same time will create a conflict.
151 fc428e32 Michael Hanselmann
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
152 b76f660d Michael Hanselmann
                         data=serializer.Dump(self._data),
153 fc428e32 Michael Hanselmann
                         prewrite=LockFile, close=False)
154 fc428e32 Michael Hanselmann
    self.statefile = os.fdopen(fd, 'w+')
155 5a3103e9 Michael Hanselmann
156 fc428e32 Michael Hanselmann
  def Close(self):
157 5a3103e9 Michael Hanselmann
    """Unlock configuration file and close it.
158 5a3103e9 Michael Hanselmann
159 5a3103e9 Michael Hanselmann
    """
160 5a3103e9 Michael Hanselmann
    assert self.statefile
161 5a3103e9 Michael Hanselmann
162 fc428e32 Michael Hanselmann
    # Files are automatically unlocked when closing them
163 5a3103e9 Michael Hanselmann
    self.statefile.close()
164 5a3103e9 Michael Hanselmann
    self.statefile = None
165 5a3103e9 Michael Hanselmann
166 5a3103e9 Michael Hanselmann
  def GetNodeBootID(self, name):
167 5a3103e9 Michael Hanselmann
    """Returns the last boot ID of a node or None.
168 a8083063 Iustin Pop
169 5a3103e9 Michael Hanselmann
    """
170 b76f660d Michael Hanselmann
    ndata = self._data["node"]
171 5a3103e9 Michael Hanselmann
172 7b195d9b Michael Hanselmann
    if name in ndata and KEY_BOOT_ID in ndata[name]:
173 7b195d9b Michael Hanselmann
      return ndata[name][KEY_BOOT_ID]
174 5a3103e9 Michael Hanselmann
    return None
175 5a3103e9 Michael Hanselmann
176 5a3103e9 Michael Hanselmann
  def SetNodeBootID(self, name, bootid):
177 5a3103e9 Michael Hanselmann
    """Sets the boot ID of a node.
178 5a3103e9 Michael Hanselmann
179 5a3103e9 Michael Hanselmann
    """
180 5a3103e9 Michael Hanselmann
    assert bootid
181 a8083063 Iustin Pop
182 b76f660d Michael Hanselmann
    ndata = self._data["node"]
183 a8083063 Iustin Pop
184 5a3103e9 Michael Hanselmann
    if name not in ndata:
185 5a3103e9 Michael Hanselmann
      ndata[name] = {}
186 5a3103e9 Michael Hanselmann
187 7b195d9b Michael Hanselmann
    ndata[name][KEY_BOOT_ID] = bootid
188 5a3103e9 Michael Hanselmann
189 5a3103e9 Michael Hanselmann
  def NumberOfRestartAttempts(self, instance):
190 a8083063 Iustin Pop
    """Returns number of previous restart attempts.
191 a8083063 Iustin Pop
192 a8083063 Iustin Pop
    Args:
193 a8083063 Iustin Pop
      instance - the instance to look up.
194 38242904 Iustin Pop
195 a8083063 Iustin Pop
    """
196 b76f660d Michael Hanselmann
    idata = self._data["instance"]
197 a8083063 Iustin Pop
198 5a3103e9 Michael Hanselmann
    if instance.name in idata:
199 7b195d9b Michael Hanselmann
      return idata[instance.name][KEY_RESTART_COUNT]
200 a8083063 Iustin Pop
201 a8083063 Iustin Pop
    return 0
202 a8083063 Iustin Pop
203 5a3103e9 Michael Hanselmann
  def RecordRestartAttempt(self, instance):
204 a8083063 Iustin Pop
    """Record a restart attempt.
205 a8083063 Iustin Pop
206 a8083063 Iustin Pop
    Args:
207 a8083063 Iustin Pop
      instance - the instance being restarted
208 38242904 Iustin Pop
209 a8083063 Iustin Pop
    """
210 b76f660d Michael Hanselmann
    idata = self._data["instance"]
211 a8083063 Iustin Pop
212 5a3103e9 Michael Hanselmann
    if instance.name not in idata:
213 5a3103e9 Michael Hanselmann
      inst = idata[instance.name] = {}
214 5a3103e9 Michael Hanselmann
    else:
215 5a3103e9 Michael Hanselmann
      inst = idata[instance.name]
216 a8083063 Iustin Pop
217 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_WHEN] = time.time()
218 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
219 a8083063 Iustin Pop
220 5a3103e9 Michael Hanselmann
  def RemoveInstance(self, instance):
221 38242904 Iustin Pop
    """Update state to reflect that a machine is running, i.e. remove record.
222 a8083063 Iustin Pop
223 a8083063 Iustin Pop
    Args:
224 a8083063 Iustin Pop
      instance - the instance to remove from books
225 a8083063 Iustin Pop
226 38242904 Iustin Pop
    This method removes the record for a named instance.
227 38242904 Iustin Pop
228 a8083063 Iustin Pop
    """
229 b76f660d Michael Hanselmann
    idata = self._data["instance"]
230 a8083063 Iustin Pop
231 5a3103e9 Michael Hanselmann
    if instance.name in idata:
232 5a3103e9 Michael Hanselmann
      del idata[instance.name]
233 a8083063 Iustin Pop
234 a8083063 Iustin Pop
235 a8083063 Iustin Pop
class Instance(object):
  """A virtual machine instance as seen by the watcher.

  Attributes:
    name: the instance name, used as argument for the gnt-instance calls
    state: the state string the instance was created with
    autostart: the autostart flag the instance was created with

  Methods:
    Restart(): issue a command to start the represented machine.
    ActivateDisks(): issue a command to activate the machine's disks.

  """
  def __init__(self, name, state, autostart):
    self.name, self.state, self.autostart = name, state, autostart

  def Restart(self):
    """Start the instance through "gnt-instance startup".

    """
    argv = ['gnt-instance', 'startup', '--lock-retries=15', self.name]
    DoCmd(argv)

  def ActivateDisks(self):
    """Activate all disks of the instance through "gnt-instance".

    """
    argv = ['gnt-instance', 'activate-disks', '--lock-retries=15', self.name]
    DoCmd(argv)
258 5a3103e9 Michael Hanselmann
259 a8083063 Iustin Pop
260 5a3103e9 Michael Hanselmann
def _RunListCmd(cmd):
  """Run a command and yield each output line split at colons.

  This is a generator: the command is only executed (through DoCmd) once
  the first item is requested.

  """
  output = DoCmd(cmd).stdout
  for row in output.splitlines():
    yield row.split(':')
266 a8083063 Iustin Pop
267 a8083063 Iustin Pop
268 5a3103e9 Michael Hanselmann
def GetInstanceList(with_secondaries=None):
  """Query "gnt-instance list" and build Instance objects from its output.

  Args:
    with_secondaries: None to return all instances; otherwise a collection
      of node names, in which case only instances having at least one of
      these nodes among their secondary nodes are returned.

  Returns:
    A list of Instance objects.  The autostart flag of an instance is
    False only if its admin_state column reads "no".

  """
  filter_by_secondary = with_secondaries is not None

  columns = ['name', 'oper_state', 'admin_state']
  if filter_by_secondary:
    columns.append('snodes')

  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', ','.join(columns)]

  result = []
  for row in _RunListCmd(cmd):
    if filter_by_secondary:
      (name, status, autostart, snodes) = row

      # "-" is printed for instances without secondary nodes
      if snodes == "-":
        continue

      secondaries = snodes.split(',')
      matched = False
      for node in with_secondaries:
        if node in secondaries:
          matched = True
          break
      if not matched:
        continue
    else:
      (name, status, autostart) = row

    result.append(Instance(name, status, autostart != "no"))

  return result
303 5a3103e9 Michael Hanselmann
304 5a3103e9 Michael Hanselmann
305 5a3103e9 Michael Hanselmann
def GetNodeBootIDs():
  """Query "gnt-node list" for the boot ID of every node.

  Returns:
    A dictionary mapping node name to the boot ID string printed for it.

  """
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', 'name,bootid']

  result = {}
  for (node_name, boot_id) in _RunListCmd(cmd):
    result[node_name] = boot_id

  return result
318 a8083063 Iustin Pop
319 a8083063 Iustin Pop
320 5a3103e9 Michael Hanselmann
class Watcher(object):
321 a8083063 Iustin Pop
  """Encapsulate the logic for restarting erronously halted virtual machines.
322 a8083063 Iustin Pop
323 a8083063 Iustin Pop
  The calling program should periodically instantiate me and call Run().
324 a8083063 Iustin Pop
  This will traverse the list of instances, and make up to MAXTRIES attempts
325 a8083063 Iustin Pop
  to restart machines that are down.
326 38242904 Iustin Pop
327 a8083063 Iustin Pop
  """
328 a8083063 Iustin Pop
  def __init__(self):
329 38242904 Iustin Pop
    sstore = ssconf.SimpleStore()
330 38242904 Iustin Pop
    master = sstore.GetMasterNode()
331 89e1fc26 Iustin Pop
    if master != utils.HostInfo().name:
332 3ecf6786 Iustin Pop
      raise NotMasterError("This is not the master node")
333 5a3103e9 Michael Hanselmann
    self.instances = GetInstanceList()
334 5a3103e9 Michael Hanselmann
    self.bootids = GetNodeBootIDs()
335 eee1fa2d Iustin Pop
    self.started_instances = set()
336 a8083063 Iustin Pop
337 a8083063 Iustin Pop
  def Run(self):
338 5a3103e9 Michael Hanselmann
    notepad = WatcherState()
339 78f3bd30 Michael Hanselmann
    try:
340 78f3bd30 Michael Hanselmann
      self.CheckInstances(notepad)
341 78f3bd30 Michael Hanselmann
      self.CheckDisks(notepad)
342 78f3bd30 Michael Hanselmann
      self.VerifyDisks()
343 78f3bd30 Michael Hanselmann
    finally:
344 78f3bd30 Michael Hanselmann
      notepad.Save()
345 5a3103e9 Michael Hanselmann
346 5a3103e9 Michael Hanselmann
  def CheckDisks(self, notepad):
347 5a3103e9 Michael Hanselmann
    """Check all nodes for restarted ones.
348 38242904 Iustin Pop
349 a8083063 Iustin Pop
    """
350 5a3103e9 Michael Hanselmann
    check_nodes = []
351 5a3103e9 Michael Hanselmann
    for name, id in self.bootids.iteritems():
352 5a3103e9 Michael Hanselmann
      old = notepad.GetNodeBootID(name)
353 5a3103e9 Michael Hanselmann
      if old != id:
354 5a3103e9 Michael Hanselmann
        # Node's boot ID has changed, proably through a reboot.
355 5a3103e9 Michael Hanselmann
        check_nodes.append(name)
356 5a3103e9 Michael Hanselmann
357 5a3103e9 Michael Hanselmann
    if check_nodes:
358 5a3103e9 Michael Hanselmann
      # Activate disks for all instances with any of the checked nodes as a
359 5a3103e9 Michael Hanselmann
      # secondary node.
360 5a3103e9 Michael Hanselmann
      for instance in GetInstanceList(with_secondaries=check_nodes):
361 0c0f834d Iustin Pop
        if not instance.autostart:
362 438b45d4 Michael Hanselmann
          logging.info(("Skipping disk activation for non-autostart"
363 438b45d4 Michael Hanselmann
                        " instance %s"), instance.name)
364 0c0f834d Iustin Pop
          continue
365 eee1fa2d Iustin Pop
        if instance.name in self.started_instances:
366 eee1fa2d Iustin Pop
          # we already tried to start the instance, which should have
367 eee1fa2d Iustin Pop
          # activated its drives (if they can be at all)
368 eee1fa2d Iustin Pop
          continue
369 5a3103e9 Michael Hanselmann
        try:
370 438b45d4 Michael Hanselmann
          logging.info("Activating disks for instance %s", instance.name)
371 5a3103e9 Michael Hanselmann
          instance.ActivateDisks()
372 438b45d4 Michael Hanselmann
        except Error, err:
373 438b45d4 Michael Hanselmann
          logging.error(str(err), exc_info=True)
374 5a3103e9 Michael Hanselmann
375 5a3103e9 Michael Hanselmann
      # Keep changed boot IDs
376 5a3103e9 Michael Hanselmann
      for name in check_nodes:
377 5a3103e9 Michael Hanselmann
        notepad.SetNodeBootID(name, self.bootids[name])
378 a8083063 Iustin Pop
379 5a3103e9 Michael Hanselmann
  def CheckInstances(self, notepad):
380 5a3103e9 Michael Hanselmann
    """Make a pass over the list of instances, restarting downed ones.
381 5a3103e9 Michael Hanselmann
382 5a3103e9 Michael Hanselmann
    """
383 a8083063 Iustin Pop
    for instance in self.instances:
384 5a3103e9 Michael Hanselmann
      # Don't care about manually stopped instances
385 5a3103e9 Michael Hanselmann
      if not instance.autostart:
386 5a3103e9 Michael Hanselmann
        continue
387 5a3103e9 Michael Hanselmann
388 a8083063 Iustin Pop
      if instance.state in BAD_STATES:
389 5a3103e9 Michael Hanselmann
        n = notepad.NumberOfRestartAttempts(instance)
390 a8083063 Iustin Pop
391 a8083063 Iustin Pop
        if n > MAXTRIES:
392 a8083063 Iustin Pop
          # stay quiet.
393 a8083063 Iustin Pop
          continue
394 a8083063 Iustin Pop
        elif n < MAXTRIES:
395 a8083063 Iustin Pop
          last = " (Attempt #%d)" % (n + 1)
396 a8083063 Iustin Pop
        else:
397 5a3103e9 Michael Hanselmann
          notepad.RecordRestartAttempt(instance)
398 438b45d4 Michael Hanselmann
          logging.error("Could not restart %s after %d attempts, giving up",
399 438b45d4 Michael Hanselmann
                        instance.name, MAXTRIES)
400 a8083063 Iustin Pop
          continue
401 a8083063 Iustin Pop
        try:
402 438b45d4 Michael Hanselmann
          logging.info("Restarting %s%s",
403 438b45d4 Michael Hanselmann
                        instance.name, last)
404 a8083063 Iustin Pop
          instance.Restart()
405 eee1fa2d Iustin Pop
          self.started_instances.add(instance.name)
406 438b45d4 Michael Hanselmann
        except Error, err:
407 438b45d4 Michael Hanselmann
          logging.error(str(err), exc_info=True)
408 a8083063 Iustin Pop
409 5a3103e9 Michael Hanselmann
        notepad.RecordRestartAttempt(instance)
410 a8083063 Iustin Pop
      elif instance.state in HELPLESS_STATES:
411 5a3103e9 Michael Hanselmann
        if notepad.NumberOfRestartAttempts(instance):
412 5a3103e9 Michael Hanselmann
          notepad.RemoveInstance(instance)
413 a8083063 Iustin Pop
      else:
414 5a3103e9 Michael Hanselmann
        if notepad.NumberOfRestartAttempts(instance):
415 5a3103e9 Michael Hanselmann
          notepad.RemoveInstance(instance)
416 438b45d4 Michael Hanselmann
          logging.info("Restart of %s succeeded", instance.name)
417 a8083063 Iustin Pop
418 d2f311db Iustin Pop
  def VerifyDisks(self):
419 d2f311db Iustin Pop
    """Run gnt-cluster verify-disks.
420 d2f311db Iustin Pop
421 d2f311db Iustin Pop
    """
422 d2f311db Iustin Pop
    result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
423 d2f311db Iustin Pop
    if result.output:
424 438b45d4 Michael Hanselmann
      logging.info(result.output)
425 a8083063 Iustin Pop
426 a8083063 Iustin Pop
427 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line of the watcher.

  The only option defined here is -d/--debug, a flag defaulting to False.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_string)

  parser.add_option("-d", "--debug", dest="debug",
                    action="store_true", default=False,
                    help="Write all messages to stderr")
  return parser.parse_args()
444 a8083063 Iustin Pop
445 a8083063 Iustin Pop
446 438b45d4 Michael Hanselmann
def SetupLogging(debug):
  """Attach a log file handler and a stderr handler to the root logger.

  Args:
    debug: if true, the stderr handler lets every message through;
      otherwise it only emits messages of level CRITICAL.  The log file
      handler always uses level INFO.

  """
  log_format = logging.Formatter("%(asctime)s: %(message)s")

  # Log file: INFO and above
  to_file = logging.FileHandler(constants.LOG_WATCHER)
  to_file.setFormatter(log_format)
  to_file.setLevel(logging.INFO)

  # stderr: everything in debug mode, next to nothing otherwise
  if debug:
    stderr_level = logging.NOTSET
  else:
    stderr_level = logging.CRITICAL
  to_stderr = logging.StreamHandler()
  to_stderr.setFormatter(log_format)
  to_stderr.setLevel(stderr_level)

  # The root logger passes everything on; filtering is left to the handlers
  root = logging.getLogger("")
  root.setLevel(logging.NOTSET)
  for handler in (to_file, to_stderr):
    root.addHandler(handler)
467 438b45d4 Michael Hanselmann
468 438b45d4 Michael Hanselmann
469 a8083063 Iustin Pop
def main():
470 a8083063 Iustin Pop
  """Main function.
471 a8083063 Iustin Pop
472 a8083063 Iustin Pop
  """
473 a8083063 Iustin Pop
  options, args = ParseOptions()
474 a8083063 Iustin Pop
475 438b45d4 Michael Hanselmann
  SetupLogging(options.debug)
476 a8083063 Iustin Pop
477 a8083063 Iustin Pop
  try:
478 781b2b2b Michael Hanselmann
    try:
479 781b2b2b Michael Hanselmann
      watcher = Watcher()
480 781b2b2b Michael Hanselmann
    except errors.ConfigurationError:
481 781b2b2b Michael Hanselmann
      # Just exit if there's no configuration
482 781b2b2b Michael Hanselmann
      sys.exit(constants.EXIT_SUCCESS)
483 5a3103e9 Michael Hanselmann
    watcher.Run()
484 1b052f42 Michael Hanselmann
  except SystemExit:
485 1b052f42 Michael Hanselmann
    raise
486 38242904 Iustin Pop
  except NotMasterError:
487 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
488 38242904 Iustin Pop
    sys.exit(constants.EXIT_NOTMASTER)
489 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
490 438b45d4 Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
491 89e1fc26 Iustin Pop
    sys.exit(constants.EXIT_NODESETUP_ERROR)
492 438b45d4 Michael Hanselmann
  except Exception, err:
493 438b45d4 Michael Hanselmann
    logging.error(str(err), exc_info=True)
494 438b45d4 Michael Hanselmann
    sys.exit(constants.EXIT_FAILURE)
495 a8083063 Iustin Pop
496 5a3103e9 Michael Hanselmann
497 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported.
if __name__ == '__main__':
  main()