Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 195c7f91

History | View | Annotate | Download (12.4 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erronously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 a8083063 Iustin Pop
import time
33 a8083063 Iustin Pop
import fcntl
34 a8083063 Iustin Pop
import errno
35 438b45d4 Michael Hanselmann
import logging
36 a8083063 Iustin Pop
from optparse import OptionParser
37 a8083063 Iustin Pop
38 a8083063 Iustin Pop
from ganeti import utils
39 a8083063 Iustin Pop
from ganeti import constants
40 67fe61c4 Michael Hanselmann
from ganeti import serializer
41 38242904 Iustin Pop
from ganeti import ssconf
42 89e1fc26 Iustin Pop
from ganeti import errors
43 3b316acb Iustin Pop
from ganeti import logger
44 a8083063 Iustin Pop
45 a8083063 Iustin Pop
46 5a3103e9 Michael Hanselmann
# Number of restart attempts made for an instance before giving up on it.
MAXTRIES = 5
# Operational states for which a restart is attempted.
BAD_STATES = ["stopped"]
# Operational states for which nothing can be done from here; existing
# restart records of such instances are dropped.
HELPLESS_STATES = ["(node down)"]
NOTICE = "NOTICE"
ERROR = "ERROR"
# Keys used in the per-instance and per-node records of the state file.
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
54 5a3103e9 Michael Hanselmann
55 5a3103e9 Michael Hanselmann
56 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher was started on a host that is not the master.

  """
  pass
58 a8083063 Iustin Pop
59 a8083063 Iustin Pop
60 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Put a prefix in front of every line of a piece of text.

  Args:
    s: The text to indent; it is split on line boundaries
    prefix: The string placed in front of each line

  Returns:
    The prefixed text, terminated by a single newline.  An empty input
    yields the prefix followed by a newline.

  """
  line_break = '\n' + prefix
  indented = line_break.join(s.splitlines())
  return "%s%s\n" % (prefix, indented)
69 a8083063 Iustin Pop
70 a8083063 Iustin Pop
71 a8083063 Iustin Pop
def DoCmd(cmd):
  """Execute a command through utils.RunCmd and insist on success.

  Args:
    cmd: the command to run.

  Returns:
    The result object handed back by utils.RunCmd.

  Raises CommandError, carrying the failure reason and the indented
  stdout/stderr of the command, when the result is flagged as failed.

  """
  result = utils.RunCmd(cmd)

  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise errors.CommandError("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
                            details)
91 a8083063 Iustin Pop
92 a8083063 Iustin Pop
93 fc428e32 Michael Hanselmann
def LockFile(fd):
  """Takes an exclusive, non-blocking flock() lock on a file descriptor.

  Args:
    fd: the file descriptor to lock

  Raises:
    errors.LockError: if the lock cannot be taken because the file is
      already locked
    IOError: for any other failure of the locking call

  """
  try:
    fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
  except IOError:
    # Fetching the exception this way keeps the handler valid on all
    # Python versions
    err = sys.exc_info()[1]
    # flock(2) documents EWOULDBLOCK for a contended non-blocking lock;
    # its value is not guaranteed to be the same as EAGAIN everywhere
    if err.errno in (errno.EAGAIN, errno.EWOULDBLOCK):
      raise errors.LockError("File already locked")
    raise
103 fc428e32 Michael Hanselmann
104 fc428e32 Michael Hanselmann
105 5a3103e9 Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The parsed state is a dictionary with an "instance" entry (restart
  bookkeeping keyed by instance name) and a "node" entry (last known boot
  ID keyed by node name).

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    Unreadable or malformed content is not fatal; the state then starts
    out empty.

    Raises exception on lock contention.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')

    LockFile(self.statefile.fileno())

    try:
      self._data = serializer.Load(self.statefile.read())
    except Exception:
      # Ignore errors while loading the file and treat it as empty
      msg = sys.exc_info()[1]
      self._data = {}
      logging.warning(("Empty or invalid state file. Using defaults."
                       " Error message: %s"), msg)

    if not isinstance(self._data, dict):
      # Parseable content of the wrong shape would break every accessor
      # below, so it is discarded just like unparseable content
      logging.warning("State file does not contain a dictionary."
                      " Using defaults.")
      self._data = {}

    self._data.setdefault("instance", {})
    self._data.setdefault("node", {})

    # Snapshot used by Save() to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Write the state back to the state file.

    If the serialized state equals what was loaded, only the timestamps
    of the file are refreshed.  Otherwise the file is rewritten through
    utils.WriteFile and the descriptor it returns is kept open in
    self.statefile; nothing is closed here, see Close().

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=LockFile, close=False)
    self.statefile = os.fdopen(fd, 'w+')

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    Args:
      name - the node name to look up

    """
    return self._data["node"].get(name, {}).get(KEY_BOOT_ID)

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    Args:
      name - the node name
      bootid - the boot ID to remember; must not be empty

    """
    assert bootid

    self._data["node"].setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns 0 for unknown instances and for records that carry no
    restart count, matching what RecordRestartAttempt assumes.

    """
    record = self._data["instance"].get(instance.name, {})
    return record.get(KEY_RESTART_COUNT, 0)

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the restart counter of the
    instance, creating its record if needed.

    Args:
      instance - the instance being restarted

    """
    record = self._data["instance"].setdefault(instance.name, {})

    record[KEY_RESTART_WHEN] = time.time()
    record[KEY_RESTART_COUNT] = record.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; unknown
    instances are silently ignored.

    """
    self._data["instance"].pop(instance.name, None)
234 a8083063 Iustin Pop
235 a8083063 Iustin Pop
236 a8083063 Iustin Pop
class Instance(object):
  """A virtual machine instance together with the commands acting on it.

  Attributes:
    name: the instance name
    state: the state string given to the constructor
    autostart: the autostart flag given to the constructor

  Methods:
    Restart(): issue a command to start the represented machine.
    ActivateDisks(): issue a command to activate the machine's disks.

  """
  def __init__(self, name, state, autostart):
    self.name, self.state, self.autostart = name, state, autostart

  def _RunInstanceCommand(self, subcommand):
    """Run a gnt-instance sub-command against this instance.

    """
    DoCmd(['gnt-instance', subcommand, '--lock-retries=15', self.name])

  def Restart(self):
    """Start the instance via gnt-instance startup.

    """
    self._RunInstanceCommand('startup')

  def ActivateDisks(self):
    """Activate all disks of the instance via gnt-instance activate-disks.

    """
    self._RunInstanceCommand('activate-disks')
259 5a3103e9 Michael Hanselmann
260 a8083063 Iustin Pop
261 5a3103e9 Michael Hanselmann
def _RunListCmd(cmd):
  """Run a command and yield each output line split into fields.

  Args:
    cmd: the command to run through DoCmd

  Yields one list of strings per line of standard output, obtained by
  splitting the line at every ':'.

  """
  output = DoCmd(cmd).stdout
  for row in output.splitlines():
    yield row.split(':')
267 a8083063 Iustin Pop
268 a8083063 Iustin Pop
269 5a3103e9 Michael Hanselmann
def GetInstanceList(with_secondaries=None):
  """Query gnt-instance for the instances of this cluster.

  Args:
    with_secondaries: optional collection of node names; when it is not
      None, only instances having at least one of these nodes among
      their secondary nodes are returned

  Returns:
    A list of Instance objects built from the name, oper_state and
    admin_state columns; autostart is set unless admin_state is "no".

  """
  filter_by_snodes = with_secondaries is not None

  columns = 'name,oper_state,admin_state'
  if filter_by_snodes:
    columns += ',snodes'

  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', columns]

  result = []
  for row in _RunListCmd(cmd):
    if filter_by_snodes:
      (name, status, autostart, snodes) = row

      # A dash means the instance has no secondary nodes at all
      if snodes == "-":
        continue

      wanted = False
      for node in with_secondaries:
        if node in snodes.split(','):
          wanted = True
          break
      if not wanted:
        continue
    else:
      (name, status, autostart) = row

    result.append(Instance(name, status, autostart != "no"))

  return result
304 5a3103e9 Michael Hanselmann
305 5a3103e9 Michael Hanselmann
306 5a3103e9 Michael Hanselmann
def GetNodeBootIDs():
  """Query gnt-node for the boot ID of every node.

  Returns:
    A dict mapping each node name to the boot ID string listed for it.

  """
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', 'name,bootid']

  return dict([(name, bootid) for (name, bootid) in _RunListCmd(cmd)])
319 a8083063 Iustin Pop
320 a8083063 Iustin Pop
321 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Check that this is the master node and collect the cluster state.

    Raises:
      NotMasterError: if the master node known to ssconf is not this host

    """
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    # Names of instances for which a start was issued during this run
    self.started_instances = set()

  def Run(self):
    """Run all checks, then save and release the state file.

    The state is saved even if one of the checks raises, and the state
    file is closed (and thereby unlocked) even if saving fails.

    """
    notepad = WatcherState()
    try:
      try:
        self.CheckInstances(notepad)
        self.CheckDisks(notepad)
        self.VerifyDisks()
      finally:
        notepad.Save()
    finally:
      notepad.Close()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the recorded one, the disks
    of the autostart instances using it as a secondary node are
    activated, and the new boot ID is recorded afterwards.

    Args:
      notepad - the WatcherState holding the recorded boot IDs

    """
    check_nodes = []
    for name, new_id in self.bootids.items():
      if not new_id:
        # An empty boot ID cannot be recorded (SetNodeBootID asserts on
        # it), and failing there would abort the remaining checks
        logging.debug("Node %s reported no boot ID, skipping", name)
        continue
      old = notepad.GetNodeBootID(name)
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for instance in GetInstanceList(with_secondaries=check_nodes):
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception:
          # One failing instance must not stop the others
          err = sys.exc_info()[1]
          logging.error(str(err), exc_info=True)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Args:
      notepad - the WatcherState holding the restart bookkeeping

    """
    for instance in self.instances:
      # Don't care about manually stopped instances
      if not instance.autostart:
        continue

      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # Already gave up on this instance in an earlier run; stay quiet.
          continue

        if n == MAXTRIES:
          # Recording once more pushes the counter past MAXTRIES, so the
          # message below is logged in this run only
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue

        last = " (Attempt #%d)" % (n + 1)
        try:
          logging.info("Restarting %s%s",
                        instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception:
          err = sys.exc_info()[1]
          logging.error(str(err), exc_info=True)

        # The attempt counts whether or not the start command succeeded
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  def VerifyDisks(self):
    """Run gnt-cluster verify-disks and log whatever it prints.

    """
    result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
    if result.output:
      logging.info(result.output)
426 a8083063 Iustin Pop
427 a8083063 Iustin Pop
428 a8083063 Iustin Pop
def ParseOptions():
  """Build the option parser and parse the command line.

  The only option defined here is -d/--debug, a boolean flag that is off
  by default.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_string)

  parser.add_option("-d", "--debug",
                    action="store_true", dest="debug", default=False,
                    help="Write all messages to stderr")
  return parser.parse_args()
445 a8083063 Iustin Pop
446 a8083063 Iustin Pop
447 a8083063 Iustin Pop
def main():
  """Entry point: set up logging, run the watcher, map errors to exit codes.

  Exits with EXIT_SUCCESS when there is no configuration, EXIT_NOTMASTER
  when not run on the master, EXIT_NODESETUP_ERROR when a hostname cannot
  be resolved and EXIT_FAILURE on any other error.

  """
  options, _args = ParseOptions()

  logger.SetupDaemon(constants.LOG_WATCHER, debug=options.debug)

  try:
    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)
    else:
      watcher.Run()
  except SystemExit:
    # Let the exit requested above pass through the generic handler below
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    failure = sys.exc_info()[1]
    logging.error("Cannot resolve hostname '%s', exiting.", failure.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Exception:
    failure = sys.exc_info()[1]
    logging.error(str(failure), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)
473 a8083063 Iustin Pop
474 5a3103e9 Michael Hanselmann
475 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported.
if __name__ == "__main__":
  main()