Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ c54784d9

History | View | Annotate | Download (12.1 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 a8083063 Iustin Pop
import time
33 a8083063 Iustin Pop
import fcntl
34 a8083063 Iustin Pop
import errno
35 438b45d4 Michael Hanselmann
import logging
36 a8083063 Iustin Pop
from optparse import OptionParser
37 a8083063 Iustin Pop
38 a8083063 Iustin Pop
from ganeti import utils
39 a8083063 Iustin Pop
from ganeti import constants
40 67fe61c4 Michael Hanselmann
from ganeti import serializer
41 38242904 Iustin Pop
from ganeti import ssconf
42 89e1fc26 Iustin Pop
from ganeti import errors
43 3b316acb Iustin Pop
from ganeti import logger
44 a8083063 Iustin Pop
45 a8083063 Iustin Pop
46 5a3103e9 Michael Hanselmann
# Number of restart attempts made for a downed instance before giving up.
MAXTRIES = 5
# Instance states that trigger a restart attempt in Watcher.CheckInstances.
BAD_STATES = ["stopped"]
# Instance states for which no restart is attempted; any restart record
# kept for such an instance is dropped instead.
HELPLESS_STATES = ["(node down)"]
# Message labels; not referenced by the code visible in this file.
NOTICE = "NOTICE"
ERROR = "ERROR"
# Keys of the per-instance and per-node records kept in the state file.
KEY_RESTART_COUNT = 'restart_count'
KEY_RESTART_WHEN = 'restart_when'
KEY_BOOT_ID = 'bootid'
54 5a3103e9 Michael Hanselmann
55 5a3103e9 Michael Hanselmann
56 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher was started on a node that is not the master.

  """
58 a8083063 Iustin Pop
59 a8083063 Iustin Pop
60 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Put a prefix in front of every line of a text block.

  Args:
    s: the text to indent; it is broken up with str.splitlines()
    prefix: the string placed before each line

  Returns:
    The prefixed lines joined by newlines and terminated by exactly one
    newline. Empty input yields the prefix followed by a newline.

  """
  separator = "\n" + prefix
  body = separator.join(s.splitlines())
  return prefix + body + "\n"
69 a8083063 Iustin Pop
70 a8083063 Iustin Pop
71 a8083063 Iustin Pop
def DoCmd(cmd):
  """Execute a command via utils.RunCmd and insist on success.

  Args:
    cmd: the command to run, handed unchanged to utils.RunCmd

  Returns:
    The result object of utils.RunCmd if it is not flagged as failed.

  Raises:
    errors.CommandError: when the result is flagged as failed; the message
      lists the command, the failure reason, stdout and stderr.

  """
  result = utils.RunCmd(cmd)
  if not result.failed:
    return result

  # Indent() terminates each section with a newline, which is why the
  # template has no explicit newline between the stdout text and "stderr:".
  template = "Command %s failed:\n%s\nstdout:\n%sstderr:\n%s"
  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise errors.CommandError(template % details)
91 a8083063 Iustin Pop
92 a8083063 Iustin Pop
93 5a3103e9 Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a dictionary with two sections. "instance" maps instance
  names to a record holding KEY_RESTART_COUNT and KEY_RESTART_WHEN, and
  "node" maps node names to a record holding KEY_BOOT_ID.

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    Unreadable, empty or malformed content is not fatal: the state then
    starts out empty and a warning is logged.

    Raises exception on lock contention.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')

    utils.LockFile(self.statefile.fileno())

    try:
      data = serializer.Load(self.statefile.read())
    except Exception as msg:
      # Ignore errors while loading the file and treat it as empty
      data = {}
      logging.warning(("Empty or invalid state file. Using defaults."
                       " Error message: %s"), msg)

    # A file that parses fine but does not hold a dictionary is as useless
    # as an unparseable one; the accessors below rely on dictionaries.
    if not isinstance(data, dict):
      logging.warning("State file does not contain a dictionary."
                      " Using defaults.")
      data = {}

    for section in ("instance", "node"):
      if not isinstance(data.get(section), dict):
        data[section] = {}

    self._data = data

    # Serialized snapshot used by Save() to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Write the state back to the file if it changed.

    If the serialized state equals the snapshot taken when loading, only
    the timestamps of the state file are refreshed.  Otherwise the file is
    rewritten and self.statefile is switched to the newly written file.
    The file is left open in both cases; use Close() to release it.

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    previous = self.statefile
    self.statefile = os.fdopen(fd, 'w+')
    # Release the superseded file explicitly instead of leaving it to the
    # garbage collector; the new file has already been locked by WriteFile.
    previous.close()

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    Args:
      name - the node name to look up

    """
    return self._data["node"].get(name, {}).get(KEY_BOOT_ID)

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    Args:
      name - the node name
      bootid - the boot ID to remember; must not be empty

    """
    assert bootid

    self._data["node"].setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns 0 for unknown instances and for records without a restart
    counter.

    """
    record = self._data["instance"].get(instance.name, {})
    return record.get(KEY_RESTART_COUNT, 0)

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the restart counter of the
    instance, creating its record if needed.

    Args:
      instance - the instance being restarted

    """
    inst = self._data["instance"].setdefault(instance.name, {})

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance.  Unknown
    instances are ignored.

    """
    self._data["instance"].pop(instance.name, None)
222 a8083063 Iustin Pop
223 a8083063 Iustin Pop
224 a8083063 Iustin Pop
class Instance(object):
  """A virtual machine instance known to the watcher.

  Attributes (stored exactly as passed to the constructor):
    name: the instance name handed to the gnt-instance commands
    state: the instance state
    autostart: whether the watcher should act on this instance

  Methods:
    Restart(): run the command starting the instance.
    ActivateDisks(): run the command activating the instance's disks.

  """
  def __init__(self, name, state, autostart):
    """Remember the name, state and autostart flag of the instance.

    """
    self.name, self.state, self.autostart = name, state, autostart

  def Restart(self):
    """Start the instance through 'gnt-instance startup'.

    """
    command = ['gnt-instance', 'startup', '--lock-retries=15', self.name]
    DoCmd(command)

  def ActivateDisks(self):
    """Activate all disks of the instance via 'gnt-instance activate-disks'.

    """
    command = ['gnt-instance', 'activate-disks', '--lock-retries=15',
               self.name]
    DoCmd(command)
247 5a3103e9 Michael Hanselmann
248 a8083063 Iustin Pop
249 5a3103e9 Michael Hanselmann
def _RunListCmd(cmd):
  """Run a command and yield each line of its stdout split on colons.

  This is a generator, so the command is only run once iteration starts.

  """
  output = DoCmd(cmd).stdout
  for row in output.splitlines():
    yield row.split(':')
255 a8083063 Iustin Pop
256 a8083063 Iustin Pop
257 5a3103e9 Michael Hanselmann
def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  Args:
    with_secondaries: None to return every instance; otherwise a collection
      of node names, and only instances having at least one of them among
      their secondary nodes are returned.

  Returns:
    A list of Instance objects. The autostart flag of each is True unless
    the admin_state column reads "no".

  """
  filter_by_secondary = with_secondaries is not None

  columns = 'name,oper_state,admin_state'
  if filter_by_secondary:
    columns += ',snodes'

  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', columns]

  instances = []
  for row in _RunListCmd(cmd):
    if filter_by_secondary:
      (name, status, autostart, snodes) = row

      # "-" in the snodes column means the instance has no secondary nodes
      if snodes == "-":
        continue

      secondaries = snodes.split(',')
      wanted = False
      for node in with_secondaries:
        if node in secondaries:
          wanted = True
          break
      if not wanted:
        continue
    else:
      (name, status, autostart) = row

    instances.append(Instance(name, status, autostart != "no"))

  return instances
292 5a3103e9 Michael Hanselmann
293 5a3103e9 Michael Hanselmann
294 5a3103e9 Michael Hanselmann
def GetNodeBootIDs():
  """Build a dictionary mapping each node name to its boot ID.

  The data comes from 'gnt-node list' restricted to the name and bootid
  columns.

  """
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', 'name,bootid']

  return dict((name, bootid) for (name, bootid) in _RunListCmd(cmd))
307 a8083063 Iustin Pop
308 a8083063 Iustin Pop
309 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Check that we run on the master and collect the cluster data.

    Raises NotMasterError if the master node recorded in the simple store
    differs from this host's name.

    """
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    # Names of instances successfully started during this run
    self.started_instances = set()

  def Run(self):
    """Run all checks once; the state is saved even if a check fails.

    """
    notepad = WatcherState()
    try:
      self.CheckInstances(notepad)
      self.CheckDisks(notepad)
      self.VerifyDisks()
    finally:
      notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the recorded one, the disks
    of autostart instances using that node as a secondary are activated,
    then the new boot ID is recorded.

    Args:
      notepad - the WatcherState holding the recorded boot IDs

    """
    check_nodes = []
    for name, new_id in self.bootids.items():
      if not new_id:
        # WatcherState.SetNodeBootID() asserts on empty boot IDs, so
        # recording this node would abort the whole run.
        logging.debug("Node %s reported no boot ID, skipping it", name)
        continue
      if notepad.GetNodeBootID(name) != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for instance in GetInstanceList(with_secondaries=check_nodes):
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception as err:
          # A failure for one instance must not stop the others
          logging.error(str(err), exc_info=True)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Args:
      notepad - the WatcherState holding the restart records

    """
    for instance in self.instances:
      # Don't care about manually stopped instances
      if not instance.autostart:
        continue

      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          # Exactly MAXTRIES attempts were made: bump the counter past
          # MAXTRIES so this message is logged only once.
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                        instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception as err:
          logging.error(str(err), exc_info=True)

        # The attempt counts whether or not the restart command succeeded
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  def VerifyDisks(self):
    """Run gnt-cluster verify-disks and log any output it produced.

    """
    result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
    if result.output:
      logging.info(result.output)
414 a8083063 Iustin Pop
415 a8083063 Iustin Pop
416 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The only option is -d/--debug, a flag that is off by default.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_string)

  parser.add_option("-d", "--debug", dest="debug",
                    action="store_true", default=False,
                    help="Write all messages to stderr")
  return parser.parse_args()
433 a8083063 Iustin Pop
434 a8083063 Iustin Pop
435 a8083063 Iustin Pop
def main():
  """Main function.

  Parses the options, sets up logging and runs the watcher once.  The
  process exits with constants.EXIT_SUCCESS if there is no configuration,
  constants.EXIT_NOTMASTER if this is not the master node,
  constants.EXIT_NODESETUP_ERROR if the hostname cannot be resolved and
  constants.EXIT_FAILURE on any other error.

  """
  # Positional arguments are not used
  options, _ = ParseOptions()

  logger.SetupLogging(constants.LOG_WATCHER, debug=options.debug)

  try:
    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)
    watcher.Run()
  except SystemExit:
    # Let the exit requested above pass through the handlers below
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError as err:
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Exception as err:
    logging.error(str(err), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)
461 a8083063 Iustin Pop
462 5a3103e9 Michael Hanselmann
463 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported
if __name__ == "__main__":
  main()