Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 3fc175f0

History | View | Annotate | Download (12.1 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 a8083063 Iustin Pop
import time
33 a8083063 Iustin Pop
import fcntl
34 a8083063 Iustin Pop
import errno
35 438b45d4 Michael Hanselmann
import logging
36 a8083063 Iustin Pop
from optparse import OptionParser
37 a8083063 Iustin Pop
38 a8083063 Iustin Pop
from ganeti import utils
39 a8083063 Iustin Pop
from ganeti import constants
40 67fe61c4 Michael Hanselmann
from ganeti import serializer
41 38242904 Iustin Pop
from ganeti import ssconf
42 89e1fc26 Iustin Pop
from ganeti import errors
43 e125c67c Michael Hanselmann
from ganeti import opcodes
44 3b316acb Iustin Pop
from ganeti import logger
45 e125c67c Michael Hanselmann
from ganeti import cli
46 a8083063 Iustin Pop
47 a8083063 Iustin Pop
48 5a3103e9 Michael Hanselmann
# Number of restart attempts made for an instance before the watcher gives up
MAXTRIES = 5
# Instance states for which a restart is attempted
BAD_STATES = ['stopped']
# Instance states for which nothing can be done; any restart record is dropped
HELPLESS_STATES = ['(node down)']
# Message level labels (not referenced elsewhere in this file -- presumably
# kept for compatibility; verify before removing)
NOTICE = 'NOTICE'
ERROR = 'ERROR'
# Keys used inside the per-instance and per-node records of the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object, assigned in main() from cli.GetClient()
client = None
60 e125c67c Michael Hanselmann
61 e125c67c Michael Hanselmann
62 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Exception raised when this host is not the master."""
64 a8083063 Iustin Pop
65 a8083063 Iustin Pop
66 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Indent a piece of text with a given prefix before each line.

  Args:
    s: The string to indent
    prefix: The string to prepend each line.

  Returns:
    The text with the prefix in front of every line, always terminated by
    exactly one newline.  An empty input yields the prefix followed by a
    newline.

  """
  lines = s.splitlines()
  return "%s%s\n" % (prefix, ('\n' + prefix).join(lines))
75 a8083063 Iustin Pop
76 a8083063 Iustin Pop
77 a8083063 Iustin Pop
def DoCmd(cmd):
  """Run a command through utils.RunCmd.

  Args:
    cmd: the command to run.

  Returns:
    The result object returned by utils.RunCmd when the command succeeded.

  Raises CommandError with verbose commentary on error.

  """
  res = utils.RunCmd(cmd)

  if res.failed:
    # Indent() terminates each section with a newline, hence no "\n" between
    # the stdout section and the "stderr:" label.
    msg = ("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
           (repr(cmd),
            Indent(res.fail_reason),
            Indent(res.stdout),
            Indent(res.stderr)))
    raise errors.CommandError(msg)

  return res
97 a8083063 Iustin Pop
98 a8083063 Iustin Pop
99 5a3103e9 Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a dictionary with two top-level keys, "instance" and "node".
  Each maps a name to a record: restart counter and timestamp for instances,
  last recorded boot ID for nodes.

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    Raises exception on lock contention.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')

    try:
      utils.LockFile(self.statefile.fileno())
    except:
      # Don't keep the file open if the lock could not be taken
      self.statefile.close()
      self.statefile = None
      raise

    try:
      self._data = serializer.Load(self.statefile.read())
    except Exception:
      # sys.exc_info() instead of binding the exception in the except clause
      # keeps this valid on every interpreter version.
      msg = sys.exc_info()[1]
      # Ignore errors while loading the file and treat it as empty
      self._data = {}
      logging.warning(("Empty or invalid state file. Using defaults."
                       " Error message: %s"), msg)

    self._data.setdefault("instance", {})
    self._data.setdefault("node", {})

    # Serialized snapshot used by Save() to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Write the state back to the state file.

    If the serialized state equals what was loaded, only the file's
    timestamps are updated.  Otherwise the new contents are written through
    utils.WriteFile (locking the new file before it replaces the old one)
    and the returned descriptor becomes the current state file.  The file is
    not closed here; use Close() for that.

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, 'w+')

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    """
    return self._data["node"].get(name, {}).get(KEY_BOOT_ID)

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    The boot ID must be a true value.

    """
    assert bootid

    self._data["node"].setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns 0 when there is no record for the instance or the record has no
    restart counter.

    """
    record = self._data["instance"].get(instance.name, {})
    # A record without a counter is treated like RecordRestartAttempt
    # treats it: as zero attempts.
    return record.get(KEY_RESTART_COUNT, 0)

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Args:
      instance - the instance being restarted

    Stores the current time and increments the restart counter.

    """
    inst = self._data["instance"].setdefault(instance.name, {})

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; it does nothing if
    there is no such record.

    """
    self._data["instance"].pop(instance.name, None)
228 a8083063 Iustin Pop
229 a8083063 Iustin Pop
230 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Attributes:
    name: the instance name
    state: the instance state, compared by the watcher against BAD_STATES
      and HELPLESS_STATES
    autostart: if false, the watcher leaves the instance alone

  Methods:
    Restart(): issue a command to restart the represented machine.
    ActivateDisks(): issue a command to activate the machine's disks.

  """
  def __init__(self, name, state, autostart):
    self.name = name
    self.state = state
    self.autostart = autostart

  def Restart(self):
    """Encapsulates the start of an instance.

    Submits an OpStartupInstance opcode through the global client.

    """
    op = opcodes.OpStartupInstance(instance_name=self.name,
                                   force=False,
                                   extra_args=None)
    cli.SubmitOpCode(op, cl=client)

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    Submits an OpActivateInstanceDisks opcode through the global client.

    """
    op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(op, cl=client)
257 a8083063 Iustin Pop
258 a8083063 Iustin Pop
259 5a3103e9 Michael Hanselmann
def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  Args:
    with_secondaries: optional list of node names.  When given (anything
      other than None), only instances having at least one of these nodes
      among their secondary nodes are returned.

  Returns:
    A list of Instance objects built from the name, oper_state and
    admin_state fields of each queried instance.

  """
  fields = ["name", "oper_state", "admin_state"]
  filter_by_node = with_secondaries is not None

  if filter_by_node:
    fields.append("snodes")
    wanted = set(with_secondaries)

  instances = []
  for row in client.QueryInstances([], fields):
    if filter_by_node:
      (name, status, autostart, snodes) = row

      # Skip instances without secondaries or with none of the wanted nodes
      if not snodes or not wanted.intersection(snodes):
        continue
    else:
      (name, status, autostart) = row

    instances.append(Instance(name, status, autostart))

  return instances
290 5a3103e9 Michael Hanselmann
291 5a3103e9 Michael Hanselmann
292 5a3103e9 Michael Hanselmann
def GetNodeBootIDs():
  """Get a dict mapping nodes to boot IDs.

  Queries the "name" and "bootid" fields of all nodes through the global
  client.

  """
  result = client.QueryNodes([], ["name", "bootid"])
  return dict((name, bootid) for (name, bootid) in result)
298 a8083063 Iustin Pop
299 a8083063 Iustin Pop
300 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Check that we run on the master and fetch instance and node data.

    Raises NotMasterError if this host is not the master node.

    """
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    # Names of instances successfully submitted for restart during this run
    self.started_instances = set()

  def Run(self):
    """Run all checks; the state is saved even if one of them fails.

    """
    notepad = WatcherState()
    try:
      self.CheckInstances(notepad)
      self.CheckDisks(notepad)
      self.VerifyDisks()
    finally:
      notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the recorded one, the disks of
    the autostart instances using it as secondary node are activated and the
    new boot ID is recorded.  Nodes not reporting a boot ID are skipped and
    keep their recorded boot ID.

    """
    check_nodes = []
    for name, new_id in self.bootids.items():
      if not new_id:
        # SetNodeBootID() refuses empty boot IDs; trying to record one would
        # abort the whole run.  Leave the recorded ID alone so that a later
        # change is still detected.
        logging.debug("Node %s did not report a boot ID, skipping", name)
        continue
      old = notepad.GetNodeBootID(name)
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for instance in GetInstanceList(with_secondaries=check_nodes):
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception:
          err = sys.exc_info()[1]
          logging.error(str(err), exc_info=True)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    """
    for instance in self.instances:
      # Don't care about manually stopped instances
      if not instance.autostart:
        continue

      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          # Push the counter past MAXTRIES so that the message below is
          # logged only once.
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                        instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception:
          err = sys.exc_info()[1]
          logging.error(str(err), exc_info=True)

        # The attempt is recorded whether or not it could be submitted
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  def VerifyDisks(self):
    """Run gnt-cluster verify-disks.

    Any output of the command is logged.

    """
    # TODO: What should we do here?
    result = DoCmd(['gnt-cluster', 'verify-disks'])
    if result.output:
      logging.info(result.output)
406 a8083063 Iustin Pop
407 a8083063 Iustin Pop
408 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version="%%prog (ganeti) %s" %
                        constants.RELEASE_VERSION)

  parser.add_option("-d", "--debug", dest="debug",
                    help="Write all messages to stderr",
                    default=False, action="store_true")
  return parser.parse_args()
425 a8083063 Iustin Pop
426 a8083063 Iustin Pop
427 a8083063 Iustin Pop
def main():
  """Main function.

  Sets up logging, creates the global client and runs the watcher once.
  Exits with EXIT_SUCCESS when there is no configuration, EXIT_NOTMASTER
  when this host is not the master, EXIT_NODESETUP_ERROR when the hostname
  cannot be resolved and EXIT_FAILURE on any other error.

  """
  global client

  options, args = ParseOptions()

  logger.SetupLogging(constants.LOG_WATCHER, debug=options.debug)

  try:
    client = cli.GetClient()

    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)

    watcher.Run()
  except SystemExit:
    # Let the sys.exit() above through instead of reporting it as a failure
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    err = sys.exc_info()[1]
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Exception:
    err = sys.exc_info()[1]
    logging.error(str(err), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)
458 a8083063 Iustin Pop
459 5a3103e9 Michael Hanselmann
460 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported
if __name__ == '__main__':
  main()