Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 9e47cad8

History | View | Annotate | Download (22.6 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 7260cfbe Iustin Pop
# pylint: disable-msg=C0103,W0142
31 7260cfbe Iustin Pop
32 7260cfbe Iustin Pop
# C0103: Invalid name ganeti-watcher
33 7260cfbe Iustin Pop
34 a8083063 Iustin Pop
import os
35 a8083063 Iustin Pop
import sys
36 a8083063 Iustin Pop
import time
37 438b45d4 Michael Hanselmann
import logging
38 a8083063 Iustin Pop
from optparse import OptionParser
39 a8083063 Iustin Pop
40 a8083063 Iustin Pop
from ganeti import utils
41 a8083063 Iustin Pop
from ganeti import constants
42 67fe61c4 Michael Hanselmann
from ganeti import serializer
43 89e1fc26 Iustin Pop
from ganeti import errors
44 e125c67c Michael Hanselmann
from ganeti import opcodes
45 e125c67c Michael Hanselmann
from ganeti import cli
46 7dfb83c2 Iustin Pop
from ganeti import luxi
47 50273051 Iustin Pop
from ganeti import ssconf
48 50273051 Iustin Pop
from ganeti import bdev
49 50273051 Iustin Pop
from ganeti import hypervisor
50 db147305 Tom Limoncelli
from ganeti import rapi
51 50273051 Iustin Pop
from ganeti.confd import client as confd_client
52 a8083063 Iustin Pop
53 db147305 Tom Limoncelli
import ganeti.rapi.client # pylint: disable-msg=W0611
54 db147305 Tom Limoncelli
55 a8083063 Iustin Pop
56 5a3103e9 Michael Hanselmann
# Number of restart attempts after which the watcher gives up on an instance
MAXTRIES = 5
# Instance states for which a restart is attempted
BAD_STATES = ["ERROR_down"]
# Instance states for which no restart is attempted; any recorded restart
# attempts are simply dropped
HELPLESS_STATES = ["ERROR_nodedown", "ERROR_nodeoffline"]
NOTICE = "NOTICE"
ERROR = "ERROR"
# Keys used inside the per-instance and per-node entries of the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object, unset (None) until it is assigned elsewhere
client = None
68 e125c67c Michael Hanselmann
69 e125c67c Michael Hanselmann
70 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the current host is not the cluster master.

  """
72 a8083063 Iustin Pop
73 a8083063 Iustin Pop
74 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher should pause.

  @rtype: boolean
  @return: truth value of what L{utils.ReadWatcherPauseFile} returns for
      the watcher pause file

  """
  pause_info = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  return bool(pause_info)
79 3753b2cb Michael Hanselmann
80 3753b2cb Michael Hanselmann
81 f1115454 Guido Trotter
def StartNodeDaemons():
  """Ensure the daemons expected on every node are running.

  """
  # The node daemon is started whether or not this is the master; confd is
  # started as well (on non candidates it will be in disabled mode).
  for daemon in (constants.NODED, constants.CONFD):
    utils.EnsureDaemon(daemon)
89 f1115454 Guido Trotter
90 f1115454 Guido Trotter
91 9e289e36 Guido Trotter
def RunWatcherHooks():
92 9e289e36 Guido Trotter
  """Run the watcher hooks.
93 9e289e36 Guido Trotter
94 9e289e36 Guido Trotter
  """
95 c4feafe8 Iustin Pop
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
96 c4feafe8 Iustin Pop
                             constants.HOOKS_NAME_WATCHER)
97 10e689d4 Iustin Pop
  if not os.path.isdir(hooks_dir):
98 10e689d4 Iustin Pop
    return
99 9e289e36 Guido Trotter
100 9e289e36 Guido Trotter
  try:
101 9e289e36 Guido Trotter
    results = utils.RunParts(hooks_dir)
102 9e289e36 Guido Trotter
  except Exception, msg: # pylint: disable-msg=W0703
103 9e289e36 Guido Trotter
    logging.critical("RunParts %s failed: %s", hooks_dir, msg)
104 9e289e36 Guido Trotter
105 9e289e36 Guido Trotter
  for (relname, status, runresult) in results:
106 9e289e36 Guido Trotter
    if status == constants.RUNPARTS_SKIP:
107 9e289e36 Guido Trotter
      logging.debug("Watcher hook %s: skipped", relname)
108 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_ERR:
109 9e289e36 Guido Trotter
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
110 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_RUN:
111 9e289e36 Guido Trotter
      if runresult.failed:
112 9e289e36 Guido Trotter
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
113 9e289e36 Guido Trotter
                        relname, runresult.exit_code, runresult.output)
114 9e289e36 Guido Trotter
      else:
115 9e289e36 Guido Trotter
        logging.debug("Watcher hook %s: success (output: %s)", relname,
116 9e289e36 Guido Trotter
                      runresult.output)
117 9e289e36 Guido Trotter
118 001b3825 Michael Hanselmann
119 50273051 Iustin Pop
class NodeMaintenance(object):
120 50273051 Iustin Pop
  """Talks to confd daemons and possible shutdown instances/drbd devices.
121 50273051 Iustin Pop
122 50273051 Iustin Pop
  """
123 50273051 Iustin Pop
  def __init__(self):
124 50273051 Iustin Pop
    self.store_cb = confd_client.StoreResultCallback()
125 50273051 Iustin Pop
    self.filter_cb = confd_client.ConfdFilterCallback(self.store_cb)
126 50273051 Iustin Pop
    self.confd_client = confd_client.GetConfdClient(self.filter_cb)
127 50273051 Iustin Pop
128 50273051 Iustin Pop
  @staticmethod
129 50273051 Iustin Pop
  def ShouldRun():
130 50273051 Iustin Pop
    """Checks whether node maintenance should run.
131 50273051 Iustin Pop
132 50273051 Iustin Pop
    """
133 50273051 Iustin Pop
    try:
134 50273051 Iustin Pop
      return ssconf.SimpleStore().GetMaintainNodeHealth()
135 50273051 Iustin Pop
    except errors.ConfigurationError, err:
136 50273051 Iustin Pop
      logging.error("Configuration error, not activating node maintenance: %s",
137 50273051 Iustin Pop
                    err)
138 50273051 Iustin Pop
      return False
139 50273051 Iustin Pop
140 50273051 Iustin Pop
  @staticmethod
141 50273051 Iustin Pop
  def GetRunningInstances():
142 50273051 Iustin Pop
    """Compute list of hypervisor/running instances.
143 50273051 Iustin Pop
144 50273051 Iustin Pop
    """
145 50273051 Iustin Pop
    hyp_list = ssconf.SimpleStore().GetHypervisorList()
146 50273051 Iustin Pop
    results = []
147 50273051 Iustin Pop
    for hv_name in hyp_list:
148 50273051 Iustin Pop
      try:
149 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
150 50273051 Iustin Pop
        ilist = hv.ListInstances()
151 50273051 Iustin Pop
        results.extend([(iname, hv_name) for iname in ilist])
152 50273051 Iustin Pop
      except: # pylint: disable-msg=W0702
153 50273051 Iustin Pop
        logging.error("Error while listing instances for hypervisor %s",
154 50273051 Iustin Pop
                      hv_name, exc_info=True)
155 50273051 Iustin Pop
    return results
156 50273051 Iustin Pop
157 50273051 Iustin Pop
  @staticmethod
158 50273051 Iustin Pop
  def GetUsedDRBDs():
159 50273051 Iustin Pop
    """Get list of used DRBD minors.
160 50273051 Iustin Pop
161 50273051 Iustin Pop
    """
162 50273051 Iustin Pop
    return bdev.DRBD8.GetUsedDevs().keys()
163 50273051 Iustin Pop
164 50273051 Iustin Pop
  @classmethod
165 50273051 Iustin Pop
  def DoMaintenance(cls, role):
166 50273051 Iustin Pop
    """Maintain the instance list.
167 50273051 Iustin Pop
168 50273051 Iustin Pop
    """
169 50273051 Iustin Pop
    if role == constants.CONFD_NODE_ROLE_OFFLINE:
170 50273051 Iustin Pop
      inst_running = cls.GetRunningInstances()
171 50273051 Iustin Pop
      cls.ShutdownInstances(inst_running)
172 50273051 Iustin Pop
      drbd_running = cls.GetUsedDRBDs()
173 50273051 Iustin Pop
      cls.ShutdownDRBD(drbd_running)
174 50273051 Iustin Pop
    else:
175 50273051 Iustin Pop
      logging.debug("Not doing anything for role %s", role)
176 50273051 Iustin Pop
177 50273051 Iustin Pop
  @staticmethod
178 50273051 Iustin Pop
  def ShutdownInstances(inst_running):
179 50273051 Iustin Pop
    """Shutdown running instances.
180 50273051 Iustin Pop
181 50273051 Iustin Pop
    """
182 50273051 Iustin Pop
    names_running = set([i[0] for i in inst_running])
183 50273051 Iustin Pop
    if names_running:
184 50273051 Iustin Pop
      logging.info("Following instances should not be running,"
185 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(names_running))
186 50273051 Iustin Pop
      # this dictionary will collapse duplicate instance names (only
187 50273051 Iustin Pop
      # xen pvm/vhm) into a single key, which is fine
188 50273051 Iustin Pop
      i2h = dict(inst_running)
189 50273051 Iustin Pop
      for name in names_running:
190 50273051 Iustin Pop
        hv_name = i2h[name]
191 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
192 50273051 Iustin Pop
        hv.StopInstance(None, force=True, name=name)
193 50273051 Iustin Pop
194 50273051 Iustin Pop
  @staticmethod
195 50273051 Iustin Pop
  def ShutdownDRBD(drbd_running):
196 50273051 Iustin Pop
    """Shutdown active DRBD devices.
197 50273051 Iustin Pop
198 50273051 Iustin Pop
    """
199 50273051 Iustin Pop
    if drbd_running:
200 50273051 Iustin Pop
      logging.info("Following DRBD minors should not be active,"
201 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(drbd_running))
202 50273051 Iustin Pop
      for minor in drbd_running:
203 50273051 Iustin Pop
        # pylint: disable-msg=W0212
204 50273051 Iustin Pop
        # using the private method as is, pending enhancements to the DRBD
205 50273051 Iustin Pop
        # interface
206 50273051 Iustin Pop
        bdev.DRBD8._ShutdownAll(minor)
207 50273051 Iustin Pop
208 50273051 Iustin Pop
  def Exec(self):
209 50273051 Iustin Pop
    """Check node status versus cluster desired state.
210 50273051 Iustin Pop
211 50273051 Iustin Pop
    """
212 50273051 Iustin Pop
    my_name = utils.HostInfo().name
213 50273051 Iustin Pop
    req = confd_client.ConfdClientRequest(type=
214 50273051 Iustin Pop
                                          constants.CONFD_REQ_NODE_ROLE_BYNAME,
215 50273051 Iustin Pop
                                          query=my_name)
216 ebacb943 Iustin Pop
    self.confd_client.SendRequest(req, async=False, coverage=-1)
217 50273051 Iustin Pop
    timed_out, _, _ = self.confd_client.WaitForReply(req.rsalt)
218 50273051 Iustin Pop
    if not timed_out:
219 50273051 Iustin Pop
      # should have a valid response
220 50273051 Iustin Pop
      status, result = self.store_cb.GetResponse(req.rsalt)
221 50273051 Iustin Pop
      assert status, "Missing result but received replies"
222 50273051 Iustin Pop
      if not self.filter_cb.consistent[req.rsalt]:
223 50273051 Iustin Pop
        logging.warning("Inconsistent replies, not doing anything")
224 50273051 Iustin Pop
        return
225 50273051 Iustin Pop
      self.DoMaintenance(result.server_reply.answer)
226 50273051 Iustin Pop
    else:
227 50273051 Iustin Pop
      logging.warning("Confd query timed out, cannot do maintenance actions")
228 50273051 Iustin Pop
229 50273051 Iustin Pop
230 5a3103e9 Michael Hanselmann
class WatcherState(object):
231 a8083063 Iustin Pop
  """Interface to a state file recording restart attempts.
232 a8083063 Iustin Pop
233 a8083063 Iustin Pop
  """
234 001b3825 Michael Hanselmann
  def __init__(self, statefile):
235 5a3103e9 Michael Hanselmann
    """Open, lock, read and parse the file.
236 5a3103e9 Michael Hanselmann
237 001b3825 Michael Hanselmann
    @type statefile: file
238 001b3825 Michael Hanselmann
    @param statefile: State file object
239 5a3103e9 Michael Hanselmann
240 5a3103e9 Michael Hanselmann
    """
241 001b3825 Michael Hanselmann
    self.statefile = statefile
242 a8083063 Iustin Pop
243 5a3103e9 Michael Hanselmann
    try:
244 2c404217 Iustin Pop
      state_data = self.statefile.read()
245 2c404217 Iustin Pop
      if not state_data:
246 2c404217 Iustin Pop
        self._data = {}
247 2c404217 Iustin Pop
      else:
248 2c404217 Iustin Pop
        self._data = serializer.Load(state_data)
249 7260cfbe Iustin Pop
    except Exception, msg: # pylint: disable-msg=W0703
250 5a3103e9 Michael Hanselmann
      # Ignore errors while loading the file and treat it as empty
251 b76f660d Michael Hanselmann
      self._data = {}
252 2c404217 Iustin Pop
      logging.warning(("Invalid state file. Using defaults."
253 438b45d4 Michael Hanselmann
                       " Error message: %s"), msg)
254 5a3103e9 Michael Hanselmann
255 b76f660d Michael Hanselmann
    if "instance" not in self._data:
256 b76f660d Michael Hanselmann
      self._data["instance"] = {}
257 b76f660d Michael Hanselmann
    if "node" not in self._data:
258 b76f660d Michael Hanselmann
      self._data["node"] = {}
259 5a3103e9 Michael Hanselmann
260 26517d45 Iustin Pop
    self._orig_data = serializer.Dump(self._data)
261 2fb96d39 Michael Hanselmann
262 fc428e32 Michael Hanselmann
  def Save(self):
263 fc428e32 Michael Hanselmann
    """Save state to file, then unlock and close it.
264 5a3103e9 Michael Hanselmann
265 5a3103e9 Michael Hanselmann
    """
266 fc428e32 Michael Hanselmann
    assert self.statefile
267 fc428e32 Michael Hanselmann
268 26517d45 Iustin Pop
    serialized_form = serializer.Dump(self._data)
269 26517d45 Iustin Pop
    if self._orig_data == serialized_form:
270 2fb96d39 Michael Hanselmann
      logging.debug("Data didn't change, just touching status file")
271 2fb96d39 Michael Hanselmann
      os.utime(constants.WATCHER_STATEFILE, None)
272 2fb96d39 Michael Hanselmann
      return
273 2fb96d39 Michael Hanselmann
274 fc428e32 Michael Hanselmann
    # We need to make sure the file is locked before renaming it, otherwise
275 fc428e32 Michael Hanselmann
    # starting ganeti-watcher again at the same time will create a conflict.
276 fc428e32 Michael Hanselmann
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
277 26517d45 Iustin Pop
                         data=serialized_form,
278 eb0f0ce0 Michael Hanselmann
                         prewrite=utils.LockFile, close=False)
279 fc428e32 Michael Hanselmann
    self.statefile = os.fdopen(fd, 'w+')
280 5a3103e9 Michael Hanselmann
281 fc428e32 Michael Hanselmann
  def Close(self):
282 5a3103e9 Michael Hanselmann
    """Unlock configuration file and close it.
283 5a3103e9 Michael Hanselmann
284 5a3103e9 Michael Hanselmann
    """
285 5a3103e9 Michael Hanselmann
    assert self.statefile
286 5a3103e9 Michael Hanselmann
287 fc428e32 Michael Hanselmann
    # Files are automatically unlocked when closing them
288 5a3103e9 Michael Hanselmann
    self.statefile.close()
289 5a3103e9 Michael Hanselmann
    self.statefile = None
290 5a3103e9 Michael Hanselmann
291 5a3103e9 Michael Hanselmann
  def GetNodeBootID(self, name):
292 5a3103e9 Michael Hanselmann
    """Returns the last boot ID of a node or None.
293 a8083063 Iustin Pop
294 5a3103e9 Michael Hanselmann
    """
295 b76f660d Michael Hanselmann
    ndata = self._data["node"]
296 5a3103e9 Michael Hanselmann
297 7b195d9b Michael Hanselmann
    if name in ndata and KEY_BOOT_ID in ndata[name]:
298 7b195d9b Michael Hanselmann
      return ndata[name][KEY_BOOT_ID]
299 5a3103e9 Michael Hanselmann
    return None
300 5a3103e9 Michael Hanselmann
301 5a3103e9 Michael Hanselmann
  def SetNodeBootID(self, name, bootid):
302 5a3103e9 Michael Hanselmann
    """Sets the boot ID of a node.
303 5a3103e9 Michael Hanselmann
304 5a3103e9 Michael Hanselmann
    """
305 5a3103e9 Michael Hanselmann
    assert bootid
306 a8083063 Iustin Pop
307 b76f660d Michael Hanselmann
    ndata = self._data["node"]
308 a8083063 Iustin Pop
309 5a3103e9 Michael Hanselmann
    if name not in ndata:
310 5a3103e9 Michael Hanselmann
      ndata[name] = {}
311 5a3103e9 Michael Hanselmann
312 7b195d9b Michael Hanselmann
    ndata[name][KEY_BOOT_ID] = bootid
313 5a3103e9 Michael Hanselmann
314 5a3103e9 Michael Hanselmann
  def NumberOfRestartAttempts(self, instance):
315 a8083063 Iustin Pop
    """Returns number of previous restart attempts.
316 a8083063 Iustin Pop
317 c41eea6e Iustin Pop
    @type instance: L{Instance}
318 c41eea6e Iustin Pop
    @param instance: the instance to look up
319 38242904 Iustin Pop
320 a8083063 Iustin Pop
    """
321 b76f660d Michael Hanselmann
    idata = self._data["instance"]
322 a8083063 Iustin Pop
323 5a3103e9 Michael Hanselmann
    if instance.name in idata:
324 7b195d9b Michael Hanselmann
      return idata[instance.name][KEY_RESTART_COUNT]
325 a8083063 Iustin Pop
326 a8083063 Iustin Pop
    return 0
327 a8083063 Iustin Pop
328 5a3103e9 Michael Hanselmann
  def RecordRestartAttempt(self, instance):
329 a8083063 Iustin Pop
    """Record a restart attempt.
330 a8083063 Iustin Pop
331 c41eea6e Iustin Pop
    @type instance: L{Instance}
332 c41eea6e Iustin Pop
    @param instance: the instance being restarted
333 38242904 Iustin Pop
334 a8083063 Iustin Pop
    """
335 b76f660d Michael Hanselmann
    idata = self._data["instance"]
336 a8083063 Iustin Pop
337 5a3103e9 Michael Hanselmann
    if instance.name not in idata:
338 5a3103e9 Michael Hanselmann
      inst = idata[instance.name] = {}
339 5a3103e9 Michael Hanselmann
    else:
340 5a3103e9 Michael Hanselmann
      inst = idata[instance.name]
341 a8083063 Iustin Pop
342 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_WHEN] = time.time()
343 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
344 a8083063 Iustin Pop
345 5a3103e9 Michael Hanselmann
  def RemoveInstance(self, instance):
346 c41eea6e Iustin Pop
    """Update state to reflect that a machine is running.
347 a8083063 Iustin Pop
348 c41eea6e Iustin Pop
    This method removes the record for a named instance (as we only
349 c41eea6e Iustin Pop
    track down instances).
350 a8083063 Iustin Pop
351 c41eea6e Iustin Pop
    @type instance: L{Instance}
352 c41eea6e Iustin Pop
    @param instance: the instance to remove from books
353 38242904 Iustin Pop
354 a8083063 Iustin Pop
    """
355 b76f660d Michael Hanselmann
    idata = self._data["instance"]
356 a8083063 Iustin Pop
357 5a3103e9 Michael Hanselmann
    if instance.name in idata:
358 5a3103e9 Michael Hanselmann
      del idata[instance.name]
359 a8083063 Iustin Pop
360 a8083063 Iustin Pop
361 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  """
  def __init__(self, name, state, autostart):
    """Store the instance attributes.

    @param name: instance name
    @param state: instance status
    @param autostart: whether the watcher may act on this instance

    """
    self.name, self.state, self.autostart = name, state, autostart

  def Restart(self):
    """Encapsulates the start of an instance.

    """
    startup_op = opcodes.OpStartupInstance(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    """
    activate_op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
383 a8083063 Iustin Pop
384 a8083063 Iustin Pop
385 6dfcc47b Iustin Pop
def GetClusterData():
  """Query instance and node data from the cluster.

  A single job holding an instance query and a node query is submitted,
  waited for and archived afterwards. As a side effect, one
  "name status" line per instance is written to the instance status file.

  @rtype: tuple
  @return: (instances, nodes, smap), where instances maps instance names
      to L{Instance} objects, nodes maps node names to (bootid, offline)
      tuples and smap maps node names to the names of the instances having
      that node among their "snodes"

  """
  inst_fields = ["name", "status", "admin_state", "snodes"]
  inst_query = opcodes.OpQueryInstances(output_fields=inst_fields, names=[],
                                        use_locking=True)
  node_fields = ["name", "bootid", "offline"]
  node_query = opcodes.OpQueryNodes(output_fields=node_fields, names=[],
                                    use_locking=True)

  job_id = client.SubmitJob([inst_query, node_query])

  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  inst_rows = all_results[0]
  smap = {}
  instances = {}

  # write the upfile
  up_lines = ["%s %s\n" % (row[0], row[1]) for row in inst_rows]
  utils.WriteFile(file_name=constants.INSTANCE_UPFILE, data="".join(up_lines))

  for (name, status, autostart, snodes) in inst_rows:
    # update the secondary node map
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart)

  nodes = dict([(name, (bootid, offline))
                for name, bootid, offline in all_results[1]])

  client.ArchiveJob(job_id)

  return instances, nodes, smap
428 a8083063 Iustin Pop
429 a8083063 Iustin Pop
430 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self, opts, notepad):
    """Verify we run on the master, archive old jobs and load cluster data.

    @param opts: options object; its job_age attribute is used for archiving
    @param notepad: state object used to track restart attempts and boot IDs
    @raise NotMasterError: if this host is not the master node

    """
    self.opts = opts
    self.notepad = notepad
    self.started_instances = set()
    master = client.QueryConfigValues(["master_node"])[0]
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    # first archive old jobs
    self.ArchiveJobs(opts.job_age)
    # and only then submit new ones
    self.instances, self.bootids, self.smap = GetClusterData()

  def Run(self):
    """Watcher run sequence.

    Instances are checked first, then disks of rebooted nodes, and finally
    a cluster-wide disk verification is run.

    """
    self.CheckInstances(self.notepad)
    self.CheckDisks(self.notepad)
    self.VerifyDisks()

  @staticmethod
  def ArchiveJobs(age):
    """Archive old jobs.

    @param age: age limit passed on to the client's auto-archive call

    """
    (arch_count, left_count) = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", arch_count, left_count)

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the recorded one, the disks
    of the autostart instances listed for that node in the secondary node
    map are activated, and the new boot ID is recorded.

    """
    check_nodes = []
    for name, (new_id, offline) in self.bootids.iteritems():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
        continue
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if not check_nodes:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for node in check_nodes:
      for instance_name in self.smap.get(node, []):
        instance = self.instances[instance_name]
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception: # pylint: disable-msg=W0703
          logging.exception("Error while activating disks for instance %s",
                            instance.name)

    # Keep changed boot IDs
    for name in check_nodes:
      notepad.SetNodeBootID(name, self.bootids[name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    """
    for instance in self.instances.values():
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        elif n > MAXTRIES:
          # stay quiet.
          continue
        else:
          # Attempts exhausted: bump the counter so that later runs take
          # the quiet branch above, and complain once
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue

        try:
          logging.info("Restarting %s%s",
                       instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception: # pylint: disable-msg=W0703
          logging.exception("Error while restarting instance %s",
                            instance.name)

        # The attempt is recorded whether or not the restart raised
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        # Nothing can be done for these states, just forget old attempts
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    Disks of the instances reported in the third element of the
    verification result are activated through a single job.

    """
    verify_op = opcodes.OpVerifyDisks()
    job_id = client.SubmitJob([verify_op])
    result = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(job_id)
    if not isinstance(result, (tuple, list)):
      logging.error("Can't get a valid result from verify-disks")
      return

    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return

    logging.debug("Will activate disks for instances %s",
                  utils.CommaJoin(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    activate_ops = [opcodes.OpActivateInstanceDisks(instance_name=name)
                    for name in offline_disk_instances]
    job_id = cli.SendJob(activate_ops, cl=client)

    try:
      cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
    except Exception: # pylint: disable-msg=W0703
      logging.exception("Error while activating disks")
575 a8083063 Iustin Pop
576 a8083063 Iustin Pop
577 001b3825 Michael Hanselmann
def OpenStateFile(path):
578 001b3825 Michael Hanselmann
  """Opens the state file and acquires a lock on it.
579 001b3825 Michael Hanselmann
580 001b3825 Michael Hanselmann
  @type path: string
581 001b3825 Michael Hanselmann
  @param path: Path to state file
582 001b3825 Michael Hanselmann
583 001b3825 Michael Hanselmann
  """
584 001b3825 Michael Hanselmann
  # The two-step dance below is necessary to allow both opening existing
585 001b3825 Michael Hanselmann
  # file read/write and creating if not existing. Vanilla open will truncate
586 001b3825 Michael Hanselmann
  # an existing file -or- allow creating if not existing.
587 001b3825 Michael Hanselmann
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)
588 001b3825 Michael Hanselmann
589 001b3825 Michael Hanselmann
  # Try to acquire lock on state file. If this fails, another watcher instance
590 001b3825 Michael Hanselmann
  # might already be running or another program is temporarily blocking the
591 001b3825 Michael Hanselmann
  # watcher from running.
592 001b3825 Michael Hanselmann
  try:
593 001b3825 Michael Hanselmann
    utils.LockFile(statefile_fd)
594 001b3825 Michael Hanselmann
  except errors.LockError, err:
595 001b3825 Michael Hanselmann
    logging.error("Can't acquire lock on state file %s: %s", path, err)
596 001b3825 Michael Hanselmann
    return None
597 001b3825 Michael Hanselmann
598 001b3825 Michael Hanselmann
  return os.fdopen(statefile_fd, "w+")
599 001b3825 Michael Hanselmann
600 001b3825 Michael Hanselmann
601 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
602 db147305 Tom Limoncelli
  """Connects to RAPI port and does a simple test.
603 db147305 Tom Limoncelli
604 db147305 Tom Limoncelli
  Connects to RAPI port of hostname and does a simple test. At this time, the
605 db147305 Tom Limoncelli
  test is GetVersion.
606 db147305 Tom Limoncelli
607 db147305 Tom Limoncelli
  @type hostname: string
608 db147305 Tom Limoncelli
  @param hostname: hostname of the node to connect to.
609 db147305 Tom Limoncelli
  @rtype: bool
610 db147305 Tom Limoncelli
  @return: Whether RAPI is working properly
611 db147305 Tom Limoncelli
612 db147305 Tom Limoncelli
  """
613 2a7c3583 Michael Hanselmann
  curl_config = rapi.client.GenericCurlConfig(cafile=constants.RAPI_CERT_FILE)
614 2a7c3583 Michael Hanselmann
  rapi_client = rapi.client.GanetiRapiClient(hostname,
615 2a7c3583 Michael Hanselmann
                                             curl_config_fn=curl_config)
616 db147305 Tom Limoncelli
  try:
617 db147305 Tom Limoncelli
    master_version = rapi_client.GetVersion()
618 db147305 Tom Limoncelli
  except rapi.client.CertificateError, err:
619 db147305 Tom Limoncelli
    logging.warning("RAPI Error: CertificateError (%s)", err)
620 db147305 Tom Limoncelli
    return False
621 db147305 Tom Limoncelli
  except rapi.client.GanetiApiError, err:
622 db147305 Tom Limoncelli
    logging.warning("RAPI Error: GanetiApiError (%s)", err)
623 db147305 Tom Limoncelli
    return False
624 db147305 Tom Limoncelli
  logging.debug("RAPI Result: master_version is %s", master_version)
625 db147305 Tom Limoncelli
  return master_version == constants.RAPI_VERSION
626 db147305 Tom Limoncelli
627 db147305 Tom Limoncelli
628 a8083063 Iustin Pop
def ParseOptions():
  """Builds the option parser and parses the command line.

  The job age option is passed through cli.ParseTimespec before the
  options are returned.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  job_age_help = ("Autoarchive jobs older than this age"
                  " (default 6 hours)")

  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)
  parser.add_option(cli.DEBUG_OPT)
  parser.add_option("-A", "--job-age", dest="job_age",
                    help=job_age_help, default=6*3600)

  (options, args) = parser.parse_args()
  # the default and any user-supplied value go through the same conversion
  options.job_age = cli.ParseTimespec(options.job_age)

  return (options, args)
646 a8083063 Iustin Pop
647 a8083063 Iustin Pop
648 2a7c3583 Michael Hanselmann
@rapi.client.UsesRapiClient
649 a8083063 Iustin Pop
def main():
650 a8083063 Iustin Pop
  """Main function.
651 a8083063 Iustin Pop
652 a8083063 Iustin Pop
  """
653 7260cfbe Iustin Pop
  global client # pylint: disable-msg=W0603
654 e125c67c Michael Hanselmann
655 f93427cd Iustin Pop
  options, args = ParseOptions()
656 f93427cd Iustin Pop
657 f93427cd Iustin Pop
  if args: # watcher doesn't take any arguments
658 f93427cd Iustin Pop
    print >> sys.stderr, ("Usage: %s [-f] " % sys.argv[0])
659 f93427cd Iustin Pop
    sys.exit(constants.EXIT_FAILURE)
660 a8083063 Iustin Pop
661 82d9caef Iustin Pop
  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
662 82d9caef Iustin Pop
                     stderr_logging=options.debug)
663 a8083063 Iustin Pop
664 3753b2cb Michael Hanselmann
  if ShouldPause():
665 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
666 3753b2cb Michael Hanselmann
    sys.exit(constants.EXIT_SUCCESS)
667 3753b2cb Michael Hanselmann
668 001b3825 Michael Hanselmann
  statefile = OpenStateFile(constants.WATCHER_STATEFILE)
669 001b3825 Michael Hanselmann
  if not statefile:
670 001b3825 Michael Hanselmann
    sys.exit(constants.EXIT_FAILURE)
671 001b3825 Michael Hanselmann
672 24edc6d4 Iustin Pop
  update_file = False
673 a8083063 Iustin Pop
  try:
674 f1115454 Guido Trotter
    StartNodeDaemons()
675 9e289e36 Guido Trotter
    RunWatcherHooks()
676 50273051 Iustin Pop
    # run node maintenance in all cases, even if master, so that old
677 50273051 Iustin Pop
    # masters can be properly cleaned up too
678 50273051 Iustin Pop
    if NodeMaintenance.ShouldRun():
679 50273051 Iustin Pop
      NodeMaintenance().Exec()
680 c4f0219c Iustin Pop
681 001b3825 Michael Hanselmann
    notepad = WatcherState(statefile)
682 781b2b2b Michael Hanselmann
    try:
683 2c404217 Iustin Pop
      try:
684 2c404217 Iustin Pop
        client = cli.GetClient()
685 2c404217 Iustin Pop
      except errors.OpPrereqError:
686 2c404217 Iustin Pop
        # this is, from cli.GetClient, a not-master case
687 7dfb83c2 Iustin Pop
        logging.debug("Not on master, exiting")
688 24edc6d4 Iustin Pop
        update_file = True
689 2c404217 Iustin Pop
        sys.exit(constants.EXIT_SUCCESS)
690 7dfb83c2 Iustin Pop
      except luxi.NoMasterError, err:
691 7dfb83c2 Iustin Pop
        logging.warning("Master seems to be down (%s), trying to restart",
692 7dfb83c2 Iustin Pop
                        str(err))
693 2826b361 Guido Trotter
        if not utils.EnsureDaemon(constants.MASTERD):
694 7dfb83c2 Iustin Pop
          logging.critical("Can't start the master, exiting")
695 7dfb83c2 Iustin Pop
          sys.exit(constants.EXIT_FAILURE)
696 7dfb83c2 Iustin Pop
        # else retry the connection
697 7dfb83c2 Iustin Pop
        client = cli.GetClient()
698 cc962d58 Iustin Pop
699 83052f9e Guido Trotter
      # we are on master now
700 2826b361 Guido Trotter
      utils.EnsureDaemon(constants.RAPI)
701 c4f0219c Iustin Pop
702 db147305 Tom Limoncelli
      # If RAPI isn't responding to queries, try one restart.
703 db147305 Tom Limoncelli
      logging.debug("Attempting to talk with RAPI.")
704 9769bb78 Manuel Franceschini
      if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
705 db147305 Tom Limoncelli
        logging.warning("Couldn't get answer from Ganeti RAPI daemon."
706 db147305 Tom Limoncelli
                        " Restarting Ganeti RAPI.")
707 db147305 Tom Limoncelli
        utils.StopDaemon(constants.RAPI)
708 db147305 Tom Limoncelli
        utils.EnsureDaemon(constants.RAPI)
709 db147305 Tom Limoncelli
        logging.debug("Second attempt to talk with RAPI")
710 9769bb78 Manuel Franceschini
        if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
711 db147305 Tom Limoncelli
          logging.fatal("RAPI is not responding. Please investigate.")
712 db147305 Tom Limoncelli
      logging.debug("Successfully talked to RAPI.")
713 db147305 Tom Limoncelli
714 cc962d58 Iustin Pop
      try:
715 cc962d58 Iustin Pop
        watcher = Watcher(options, notepad)
716 cc962d58 Iustin Pop
      except errors.ConfigurationError:
717 cc962d58 Iustin Pop
        # Just exit if there's no configuration
718 24edc6d4 Iustin Pop
        update_file = True
719 cc962d58 Iustin Pop
        sys.exit(constants.EXIT_SUCCESS)
720 e125c67c Michael Hanselmann
721 cc962d58 Iustin Pop
      watcher.Run()
722 24edc6d4 Iustin Pop
      update_file = True
723 24edc6d4 Iustin Pop
724 cc962d58 Iustin Pop
    finally:
725 7dfb83c2 Iustin Pop
      if update_file:
726 7dfb83c2 Iustin Pop
        notepad.Save()
727 7dfb83c2 Iustin Pop
      else:
728 7dfb83c2 Iustin Pop
        logging.debug("Not updating status file due to failure")
729 1b052f42 Michael Hanselmann
  except SystemExit:
730 1b052f42 Michael Hanselmann
    raise
731 38242904 Iustin Pop
  except NotMasterError:
732 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
733 38242904 Iustin Pop
    sys.exit(constants.EXIT_NOTMASTER)
734 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
735 438b45d4 Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
736 89e1fc26 Iustin Pop
    sys.exit(constants.EXIT_NODESETUP_ERROR)
737 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
738 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
739 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
740 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
741 438b45d4 Michael Hanselmann
  except Exception, err:
742 001b3825 Michael Hanselmann
    logging.exception(str(err))
743 438b45d4 Michael Hanselmann
    sys.exit(constants.EXIT_FAILURE)
744 a8083063 Iustin Pop
745 5a3103e9 Michael Hanselmann
746 a8083063 Iustin Pop
# Run the watcher only when executed as a script
if __name__ == "__main__":
  main()