Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ f5116c87

History | View | Annotate | Download (23.8 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 7260cfbe Iustin Pop
# pylint: disable-msg=C0103,W0142
31 7260cfbe Iustin Pop
32 7260cfbe Iustin Pop
# C0103: Invalid name ganeti-watcher
33 7260cfbe Iustin Pop
34 a8083063 Iustin Pop
import os
35 a8083063 Iustin Pop
import sys
36 a8083063 Iustin Pop
import time
37 438b45d4 Michael Hanselmann
import logging
38 a8083063 Iustin Pop
from optparse import OptionParser
39 a8083063 Iustin Pop
40 a8083063 Iustin Pop
from ganeti import utils
41 a8083063 Iustin Pop
from ganeti import constants
42 67fe61c4 Michael Hanselmann
from ganeti import serializer
43 89e1fc26 Iustin Pop
from ganeti import errors
44 e125c67c Michael Hanselmann
from ganeti import opcodes
45 e125c67c Michael Hanselmann
from ganeti import cli
46 7dfb83c2 Iustin Pop
from ganeti import luxi
47 50273051 Iustin Pop
from ganeti import ssconf
48 50273051 Iustin Pop
from ganeti import bdev
49 50273051 Iustin Pop
from ganeti import hypervisor
50 db147305 Tom Limoncelli
from ganeti import rapi
51 50273051 Iustin Pop
from ganeti.confd import client as confd_client
52 a744b676 Manuel Franceschini
from ganeti import netutils
53 a8083063 Iustin Pop
54 db147305 Tom Limoncelli
import ganeti.rapi.client # pylint: disable-msg=W0611
55 db147305 Tom Limoncelli
56 a8083063 Iustin Pop
57 5a3103e9 Michael Hanselmann
# Number of restart attempts made for a down instance before giving up
MAXTRIES = 5
# Delete any record that is older than 8 hours; this value is based on
# the fact that the current retry counter is 5, and watcher runs every
# 5 minutes, so it takes around half an hour to exceed the retry
# counter, so 8 hours (16*1/2h) seems like a reasonable reset time
RETRY_EXPIRATION = 8 * 60 * 60
# Instance states for which a restart is attempted
BAD_STATES = ["ERROR_down"]
# Instance states for which restart records are dropped without a restart
HELPLESS_STATES = ["ERROR_nodedown", "ERROR_nodeoffline"]
NOTICE = "NOTICE"
ERROR = "ERROR"
# Keys used inside the per-instance and per-node state records
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object
client = None
74 e125c67c Michael Hanselmann
75 e125c67c Michael Hanselmann
76 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Error signalling that the watcher runs on a non-master node.

  """
78 a8083063 Iustin Pop
79 a8083063 Iustin Pop
80 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher should pause.

  @rtype: boolean
  @return: the truth value of what L{utils.ReadWatcherPauseFile} returns
      for the watcher pause file

  """
  pause_data = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  return bool(pause_data)
85 3753b2cb Michael Hanselmann
86 3753b2cb Michael Hanselmann
87 f1115454 Guido Trotter
def StartNodeDaemons():
  """Ensure the daemons wanted on every node are running.

  """
  # The node daemon is started whether we are on the master or not; confd
  # is started as well, on non candidates it will be in disabled mode.
  for daemon_name in (constants.NODED, constants.CONFD):
    utils.EnsureDaemon(daemon_name)
95 f1115454 Guido Trotter
96 f1115454 Guido Trotter
97 9e289e36 Guido Trotter
def RunWatcherHooks():
98 9e289e36 Guido Trotter
  """Run the watcher hooks.
99 9e289e36 Guido Trotter
100 9e289e36 Guido Trotter
  """
101 c4feafe8 Iustin Pop
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
102 c4feafe8 Iustin Pop
                             constants.HOOKS_NAME_WATCHER)
103 10e689d4 Iustin Pop
  if not os.path.isdir(hooks_dir):
104 10e689d4 Iustin Pop
    return
105 9e289e36 Guido Trotter
106 9e289e36 Guido Trotter
  try:
107 9e289e36 Guido Trotter
    results = utils.RunParts(hooks_dir)
108 9e289e36 Guido Trotter
  except Exception, msg: # pylint: disable-msg=W0703
109 9e289e36 Guido Trotter
    logging.critical("RunParts %s failed: %s", hooks_dir, msg)
110 9e289e36 Guido Trotter
111 9e289e36 Guido Trotter
  for (relname, status, runresult) in results:
112 9e289e36 Guido Trotter
    if status == constants.RUNPARTS_SKIP:
113 9e289e36 Guido Trotter
      logging.debug("Watcher hook %s: skipped", relname)
114 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_ERR:
115 9e289e36 Guido Trotter
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
116 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_RUN:
117 9e289e36 Guido Trotter
      if runresult.failed:
118 9e289e36 Guido Trotter
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
119 9e289e36 Guido Trotter
                        relname, runresult.exit_code, runresult.output)
120 9e289e36 Guido Trotter
      else:
121 9e289e36 Guido Trotter
        logging.debug("Watcher hook %s: success (output: %s)", relname,
122 9e289e36 Guido Trotter
                      runresult.output)
123 9e289e36 Guido Trotter
124 001b3825 Michael Hanselmann
125 50273051 Iustin Pop
class NodeMaintenance(object):
126 50273051 Iustin Pop
  """Talks to confd daemons and possible shutdown instances/drbd devices.
127 50273051 Iustin Pop
128 50273051 Iustin Pop
  """
129 50273051 Iustin Pop
  def __init__(self):
130 50273051 Iustin Pop
    self.store_cb = confd_client.StoreResultCallback()
131 50273051 Iustin Pop
    self.filter_cb = confd_client.ConfdFilterCallback(self.store_cb)
132 50273051 Iustin Pop
    self.confd_client = confd_client.GetConfdClient(self.filter_cb)
133 50273051 Iustin Pop
134 50273051 Iustin Pop
  @staticmethod
135 50273051 Iustin Pop
  def ShouldRun():
136 50273051 Iustin Pop
    """Checks whether node maintenance should run.
137 50273051 Iustin Pop
138 50273051 Iustin Pop
    """
139 50273051 Iustin Pop
    try:
140 50273051 Iustin Pop
      return ssconf.SimpleStore().GetMaintainNodeHealth()
141 50273051 Iustin Pop
    except errors.ConfigurationError, err:
142 50273051 Iustin Pop
      logging.error("Configuration error, not activating node maintenance: %s",
143 50273051 Iustin Pop
                    err)
144 50273051 Iustin Pop
      return False
145 50273051 Iustin Pop
146 50273051 Iustin Pop
  @staticmethod
147 50273051 Iustin Pop
  def GetRunningInstances():
148 50273051 Iustin Pop
    """Compute list of hypervisor/running instances.
149 50273051 Iustin Pop
150 50273051 Iustin Pop
    """
151 50273051 Iustin Pop
    hyp_list = ssconf.SimpleStore().GetHypervisorList()
152 50273051 Iustin Pop
    results = []
153 50273051 Iustin Pop
    for hv_name in hyp_list:
154 50273051 Iustin Pop
      try:
155 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
156 50273051 Iustin Pop
        ilist = hv.ListInstances()
157 50273051 Iustin Pop
        results.extend([(iname, hv_name) for iname in ilist])
158 50273051 Iustin Pop
      except: # pylint: disable-msg=W0702
159 50273051 Iustin Pop
        logging.error("Error while listing instances for hypervisor %s",
160 50273051 Iustin Pop
                      hv_name, exc_info=True)
161 50273051 Iustin Pop
    return results
162 50273051 Iustin Pop
163 50273051 Iustin Pop
  @staticmethod
164 50273051 Iustin Pop
  def GetUsedDRBDs():
165 50273051 Iustin Pop
    """Get list of used DRBD minors.
166 50273051 Iustin Pop
167 50273051 Iustin Pop
    """
168 50273051 Iustin Pop
    return bdev.DRBD8.GetUsedDevs().keys()
169 50273051 Iustin Pop
170 50273051 Iustin Pop
  @classmethod
171 50273051 Iustin Pop
  def DoMaintenance(cls, role):
172 50273051 Iustin Pop
    """Maintain the instance list.
173 50273051 Iustin Pop
174 50273051 Iustin Pop
    """
175 50273051 Iustin Pop
    if role == constants.CONFD_NODE_ROLE_OFFLINE:
176 50273051 Iustin Pop
      inst_running = cls.GetRunningInstances()
177 50273051 Iustin Pop
      cls.ShutdownInstances(inst_running)
178 50273051 Iustin Pop
      drbd_running = cls.GetUsedDRBDs()
179 50273051 Iustin Pop
      cls.ShutdownDRBD(drbd_running)
180 50273051 Iustin Pop
    else:
181 50273051 Iustin Pop
      logging.debug("Not doing anything for role %s", role)
182 50273051 Iustin Pop
183 50273051 Iustin Pop
  @staticmethod
184 50273051 Iustin Pop
  def ShutdownInstances(inst_running):
185 50273051 Iustin Pop
    """Shutdown running instances.
186 50273051 Iustin Pop
187 50273051 Iustin Pop
    """
188 50273051 Iustin Pop
    names_running = set([i[0] for i in inst_running])
189 50273051 Iustin Pop
    if names_running:
190 50273051 Iustin Pop
      logging.info("Following instances should not be running,"
191 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(names_running))
192 50273051 Iustin Pop
      # this dictionary will collapse duplicate instance names (only
193 50273051 Iustin Pop
      # xen pvm/vhm) into a single key, which is fine
194 50273051 Iustin Pop
      i2h = dict(inst_running)
195 50273051 Iustin Pop
      for name in names_running:
196 50273051 Iustin Pop
        hv_name = i2h[name]
197 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
198 50273051 Iustin Pop
        hv.StopInstance(None, force=True, name=name)
199 50273051 Iustin Pop
200 50273051 Iustin Pop
  @staticmethod
201 50273051 Iustin Pop
  def ShutdownDRBD(drbd_running):
202 50273051 Iustin Pop
    """Shutdown active DRBD devices.
203 50273051 Iustin Pop
204 50273051 Iustin Pop
    """
205 50273051 Iustin Pop
    if drbd_running:
206 50273051 Iustin Pop
      logging.info("Following DRBD minors should not be active,"
207 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(drbd_running))
208 50273051 Iustin Pop
      for minor in drbd_running:
209 50273051 Iustin Pop
        # pylint: disable-msg=W0212
210 50273051 Iustin Pop
        # using the private method as is, pending enhancements to the DRBD
211 50273051 Iustin Pop
        # interface
212 50273051 Iustin Pop
        bdev.DRBD8._ShutdownAll(minor)
213 50273051 Iustin Pop
214 50273051 Iustin Pop
  def Exec(self):
215 50273051 Iustin Pop
    """Check node status versus cluster desired state.
216 50273051 Iustin Pop
217 50273051 Iustin Pop
    """
218 a744b676 Manuel Franceschini
    my_name = netutils.HostInfo().name
219 50273051 Iustin Pop
    req = confd_client.ConfdClientRequest(type=
220 50273051 Iustin Pop
                                          constants.CONFD_REQ_NODE_ROLE_BYNAME,
221 50273051 Iustin Pop
                                          query=my_name)
222 ebacb943 Iustin Pop
    self.confd_client.SendRequest(req, async=False, coverage=-1)
223 50273051 Iustin Pop
    timed_out, _, _ = self.confd_client.WaitForReply(req.rsalt)
224 50273051 Iustin Pop
    if not timed_out:
225 50273051 Iustin Pop
      # should have a valid response
226 50273051 Iustin Pop
      status, result = self.store_cb.GetResponse(req.rsalt)
227 50273051 Iustin Pop
      assert status, "Missing result but received replies"
228 50273051 Iustin Pop
      if not self.filter_cb.consistent[req.rsalt]:
229 50273051 Iustin Pop
        logging.warning("Inconsistent replies, not doing anything")
230 50273051 Iustin Pop
        return
231 50273051 Iustin Pop
      self.DoMaintenance(result.server_reply.answer)
232 50273051 Iustin Pop
    else:
233 50273051 Iustin Pop
      logging.warning("Confd query timed out, cannot do maintenance actions")
234 50273051 Iustin Pop
235 50273051 Iustin Pop
236 5a3103e9 Michael Hanselmann
class WatcherState(object):
237 a8083063 Iustin Pop
  """Interface to a state file recording restart attempts.
238 a8083063 Iustin Pop
239 a8083063 Iustin Pop
  """
240 001b3825 Michael Hanselmann
  def __init__(self, statefile):
241 5a3103e9 Michael Hanselmann
    """Open, lock, read and parse the file.
242 5a3103e9 Michael Hanselmann
243 001b3825 Michael Hanselmann
    @type statefile: file
244 001b3825 Michael Hanselmann
    @param statefile: State file object
245 5a3103e9 Michael Hanselmann
246 5a3103e9 Michael Hanselmann
    """
247 001b3825 Michael Hanselmann
    self.statefile = statefile
248 a8083063 Iustin Pop
249 5a3103e9 Michael Hanselmann
    try:
250 2c404217 Iustin Pop
      state_data = self.statefile.read()
251 2c404217 Iustin Pop
      if not state_data:
252 2c404217 Iustin Pop
        self._data = {}
253 2c404217 Iustin Pop
      else:
254 2c404217 Iustin Pop
        self._data = serializer.Load(state_data)
255 7260cfbe Iustin Pop
    except Exception, msg: # pylint: disable-msg=W0703
256 5a3103e9 Michael Hanselmann
      # Ignore errors while loading the file and treat it as empty
257 b76f660d Michael Hanselmann
      self._data = {}
258 2c404217 Iustin Pop
      logging.warning(("Invalid state file. Using defaults."
259 438b45d4 Michael Hanselmann
                       " Error message: %s"), msg)
260 5a3103e9 Michael Hanselmann
261 b76f660d Michael Hanselmann
    if "instance" not in self._data:
262 b76f660d Michael Hanselmann
      self._data["instance"] = {}
263 b76f660d Michael Hanselmann
    if "node" not in self._data:
264 b76f660d Michael Hanselmann
      self._data["node"] = {}
265 5a3103e9 Michael Hanselmann
266 26517d45 Iustin Pop
    self._orig_data = serializer.Dump(self._data)
267 2fb96d39 Michael Hanselmann
268 fc428e32 Michael Hanselmann
  def Save(self):
269 fc428e32 Michael Hanselmann
    """Save state to file, then unlock and close it.
270 5a3103e9 Michael Hanselmann
271 5a3103e9 Michael Hanselmann
    """
272 fc428e32 Michael Hanselmann
    assert self.statefile
273 fc428e32 Michael Hanselmann
274 26517d45 Iustin Pop
    serialized_form = serializer.Dump(self._data)
275 26517d45 Iustin Pop
    if self._orig_data == serialized_form:
276 2fb96d39 Michael Hanselmann
      logging.debug("Data didn't change, just touching status file")
277 2fb96d39 Michael Hanselmann
      os.utime(constants.WATCHER_STATEFILE, None)
278 2fb96d39 Michael Hanselmann
      return
279 2fb96d39 Michael Hanselmann
280 fc428e32 Michael Hanselmann
    # We need to make sure the file is locked before renaming it, otherwise
281 fc428e32 Michael Hanselmann
    # starting ganeti-watcher again at the same time will create a conflict.
282 fc428e32 Michael Hanselmann
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
283 26517d45 Iustin Pop
                         data=serialized_form,
284 eb0f0ce0 Michael Hanselmann
                         prewrite=utils.LockFile, close=False)
285 fc428e32 Michael Hanselmann
    self.statefile = os.fdopen(fd, 'w+')
286 5a3103e9 Michael Hanselmann
287 fc428e32 Michael Hanselmann
  def Close(self):
288 5a3103e9 Michael Hanselmann
    """Unlock configuration file and close it.
289 5a3103e9 Michael Hanselmann
290 5a3103e9 Michael Hanselmann
    """
291 5a3103e9 Michael Hanselmann
    assert self.statefile
292 5a3103e9 Michael Hanselmann
293 fc428e32 Michael Hanselmann
    # Files are automatically unlocked when closing them
294 5a3103e9 Michael Hanselmann
    self.statefile.close()
295 5a3103e9 Michael Hanselmann
    self.statefile = None
296 5a3103e9 Michael Hanselmann
297 5a3103e9 Michael Hanselmann
  def GetNodeBootID(self, name):
298 5a3103e9 Michael Hanselmann
    """Returns the last boot ID of a node or None.
299 a8083063 Iustin Pop
300 5a3103e9 Michael Hanselmann
    """
301 b76f660d Michael Hanselmann
    ndata = self._data["node"]
302 5a3103e9 Michael Hanselmann
303 7b195d9b Michael Hanselmann
    if name in ndata and KEY_BOOT_ID in ndata[name]:
304 7b195d9b Michael Hanselmann
      return ndata[name][KEY_BOOT_ID]
305 5a3103e9 Michael Hanselmann
    return None
306 5a3103e9 Michael Hanselmann
307 5a3103e9 Michael Hanselmann
  def SetNodeBootID(self, name, bootid):
308 5a3103e9 Michael Hanselmann
    """Sets the boot ID of a node.
309 5a3103e9 Michael Hanselmann
310 5a3103e9 Michael Hanselmann
    """
311 5a3103e9 Michael Hanselmann
    assert bootid
312 a8083063 Iustin Pop
313 b76f660d Michael Hanselmann
    ndata = self._data["node"]
314 a8083063 Iustin Pop
315 5a3103e9 Michael Hanselmann
    if name not in ndata:
316 5a3103e9 Michael Hanselmann
      ndata[name] = {}
317 5a3103e9 Michael Hanselmann
318 7b195d9b Michael Hanselmann
    ndata[name][KEY_BOOT_ID] = bootid
319 5a3103e9 Michael Hanselmann
320 5a3103e9 Michael Hanselmann
  def NumberOfRestartAttempts(self, instance):
321 a8083063 Iustin Pop
    """Returns number of previous restart attempts.
322 a8083063 Iustin Pop
323 c41eea6e Iustin Pop
    @type instance: L{Instance}
324 c41eea6e Iustin Pop
    @param instance: the instance to look up
325 38242904 Iustin Pop
326 a8083063 Iustin Pop
    """
327 b76f660d Michael Hanselmann
    idata = self._data["instance"]
328 a8083063 Iustin Pop
329 5a3103e9 Michael Hanselmann
    if instance.name in idata:
330 7b195d9b Michael Hanselmann
      return idata[instance.name][KEY_RESTART_COUNT]
331 a8083063 Iustin Pop
332 a8083063 Iustin Pop
    return 0
333 a8083063 Iustin Pop
334 f5116c87 Iustin Pop
  def MaintainInstanceList(self, instances):
335 f5116c87 Iustin Pop
    """Perform maintenance on the recorded instances.
336 f5116c87 Iustin Pop
337 f5116c87 Iustin Pop
    @type instances: list of string
338 f5116c87 Iustin Pop
    @param instances: the list of currently existing instances
339 f5116c87 Iustin Pop
340 f5116c87 Iustin Pop
    """
341 f5116c87 Iustin Pop
    idict = self._data["instance"]
342 f5116c87 Iustin Pop
    # First, delete obsolete instances
343 f5116c87 Iustin Pop
    obsolete_instances = set(idict).difference(instances)
344 f5116c87 Iustin Pop
    for inst in obsolete_instances:
345 f5116c87 Iustin Pop
      logging.debug("Forgetting obsolete instance %s", inst)
346 f5116c87 Iustin Pop
      del idict[inst]
347 f5116c87 Iustin Pop
348 f5116c87 Iustin Pop
    # Second, delete expired records
349 f5116c87 Iustin Pop
    earliest = time.time() - RETRY_EXPIRATION
350 f5116c87 Iustin Pop
    expired_instances = [i for i in idict
351 f5116c87 Iustin Pop
                         if idict[i][KEY_RESTART_WHEN] < earliest]
352 f5116c87 Iustin Pop
    for inst in expired_instances:
353 f5116c87 Iustin Pop
      logging.debug("Expiring record for instance %s", inst)
354 f5116c87 Iustin Pop
      del idict[inst]
355 f5116c87 Iustin Pop
356 5a3103e9 Michael Hanselmann
  def RecordRestartAttempt(self, instance):
357 a8083063 Iustin Pop
    """Record a restart attempt.
358 a8083063 Iustin Pop
359 c41eea6e Iustin Pop
    @type instance: L{Instance}
360 c41eea6e Iustin Pop
    @param instance: the instance being restarted
361 38242904 Iustin Pop
362 a8083063 Iustin Pop
    """
363 b76f660d Michael Hanselmann
    idata = self._data["instance"]
364 a8083063 Iustin Pop
365 5a3103e9 Michael Hanselmann
    if instance.name not in idata:
366 5a3103e9 Michael Hanselmann
      inst = idata[instance.name] = {}
367 5a3103e9 Michael Hanselmann
    else:
368 5a3103e9 Michael Hanselmann
      inst = idata[instance.name]
369 a8083063 Iustin Pop
370 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_WHEN] = time.time()
371 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
372 a8083063 Iustin Pop
373 5a3103e9 Michael Hanselmann
  def RemoveInstance(self, instance):
374 c41eea6e Iustin Pop
    """Update state to reflect that a machine is running.
375 a8083063 Iustin Pop
376 c41eea6e Iustin Pop
    This method removes the record for a named instance (as we only
377 c41eea6e Iustin Pop
    track down instances).
378 a8083063 Iustin Pop
379 c41eea6e Iustin Pop
    @type instance: L{Instance}
380 c41eea6e Iustin Pop
    @param instance: the instance to remove from books
381 38242904 Iustin Pop
382 a8083063 Iustin Pop
    """
383 b76f660d Michael Hanselmann
    idata = self._data["instance"]
384 a8083063 Iustin Pop
385 5a3103e9 Michael Hanselmann
    if instance.name in idata:
386 5a3103e9 Michael Hanselmann
      del idata[instance.name]
387 a8083063 Iustin Pop
388 a8083063 Iustin Pop
389 a8083063 Iustin Pop
class Instance(object):
  """A virtual machine instance, as seen by the watcher.

  """
  def __init__(self, name, state, autostart):
    """Remember the name, state and autostart flag of the instance.

    """
    self.name, self.state, self.autostart = name, state, autostart

  def Restart(self):
    """Submit the (non-forced) startup of this instance.

    """
    startup_op = opcodes.OpStartupInstance(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Submit the activation of all disks of this instance.

    """
    activate_op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
411 a8083063 Iustin Pop
412 a8083063 Iustin Pop
413 6dfcc47b Iustin Pop
def GetClusterData():
  """Query the cluster for its instances and nodes.

  Both queries are submitted as a single job, which is archived once its
  results have been processed. As a side effect the instance status file
  (one "name status" line per instance) is written.

  @rtype: tuple
  @return: (instances, nodes, smap), where instances maps instance names
      to L{Instance} objects, nodes maps node names to (bootid, offline)
      tuples and smap maps secondary node names to the list of names of
      the instances using them

  """
  instance_query = opcodes.OpQueryInstances(
    output_fields=["name", "status", "admin_state", "snodes"],
    names=[], use_locking=True)
  node_query = opcodes.OpQueryNodes(
    output_fields=["name", "bootid", "offline"],
    names=[], use_locking=True)

  job_id = client.SubmitJob([instance_query, node_query])

  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  instance_rows = all_results[0]

  # write the upfile
  up_lines = ["%s %s\n" % (row[0], row[1]) for row in instance_rows]
  utils.WriteFile(file_name=constants.INSTANCE_UPFILE,
                  data="".join(up_lines))

  smap = {}
  instances = {}
  for (name, status, autostart, snodes) in instance_rows:
    # update the secondary node map
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart)

  nodes = {}
  for (name, bootid, offline) in all_results[1]:
    nodes[name] = (bootid, offline)

  client.ArchiveJob(job_id)

  return instances, nodes, smap
456 a8083063 Iustin Pop
457 a8083063 Iustin Pop
458 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self, opts, notepad):
    """Check we run on the master, archive old jobs and load cluster data.

    @param opts: the options; C{job_age} is passed to L{ArchiveJobs}
    @param notepad: the state object, see L{WatcherState} for the methods
        used on it
    @raise NotMasterError: if this host is not the master node

    """
    self.notepad = notepad
    master_name = client.QueryConfigValues(["master_node"])[0]
    if master_name != netutils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    # first archive old jobs
    self.ArchiveJobs(opts.job_age)
    # and only then submit new ones
    (self.instances, self.bootids, self.smap) = GetClusterData()
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Watcher run sequence.

    Instances are checked first, then the disks of instances living on
    rebooted nodes, and finally the cluster-wide disk verification is run.

    """
    self.CheckInstances(self.notepad)
    self.CheckDisks(self.notepad)
    self.VerifyDisks()

  @staticmethod
  def ArchiveJobs(age):
    """Archive old jobs.

    @param age: passed as is to the client's C{AutoArchiveJobs}

    """
    (archived, remaining) = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", archived, remaining)

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the recorded one, the disks
    of the autostart instances having it as secondary node are activated
    (unless a restart of the instance was already attempted in this run);
    afterwards the new boot IDs are recorded.

    """
    rebooted = []
    for node_name, (boot_id, offline) in self.bootids.iteritems():
      known_id = notepad.GetNodeBootID(node_name)
      if boot_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        node_name)
        continue
      if known_id != boot_id:
        # Node's boot ID has changed, probably through a reboot.
        rebooted.append(node_name)

    if not rebooted:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for node_name in rebooted:
      for instance_name in self.smap.get(node_name, []):
        instance = self.instances[instance_name]
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception: # pylint: disable-msg=W0703
          logging.exception("Error while activating disks for instance %s",
                            instance.name)

    # Keep changed boot IDs
    for node_name in rebooted:
      notepad.SetNodeBootID(node_name, self.bootids[node_name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Restart attempts are recorded in the notepad; once MAXTRIES attempts
    have been made no further restart is tried. Records of instances
    which are no longer in a bad state are removed.

    """
    notepad.MaintainInstanceList(self.instances.keys())

    for instance in self.instances.values():
      if instance.state not in BAD_STATES:
        # Nothing to restart, just forget about earlier attempts; success
        # is only reported if the instance is not in a helpless state
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          if instance.state not in HELPLESS_STATES:
            logging.info("Restart of %s succeeded", instance.name)
        continue

      attempts = notepad.NumberOfRestartAttempts(instance)

      if attempts > MAXTRIES:
        logging.warning("Not restarting instance %s, retries exhausted",
                        instance.name)
        continue

      if not attempts < MAXTRIES:
        # Limit just reached: record once more, so that later runs take
        # the "retries exhausted" branch above
        notepad.RecordRestartAttempt(instance)
        logging.error("Could not restart %s after %d attempts, giving up",
                      instance.name, MAXTRIES)
        continue

      attempt_info = " (Attempt #%d)" % (attempts + 1)
      try:
        logging.info("Restarting %s%s",
                      instance.name, attempt_info)
        instance.Restart()
        self.started_instances.add(instance.name)
      except Exception: # pylint: disable-msg=W0703
        logging.exception("Error while restarting instance %s",
                          instance.name)

      # The attempt is recorded whether or not the restart worked
      notepad.RecordRestartAttempt(instance)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    The disks of all instances reported with offline disks are activated
    using a single job.

    """
    verify_job_id = client.SubmitJob([opcodes.OpVerifyDisks()])
    result = cli.PollJob(verify_job_id, cl=client,
                         feedback_fn=logging.debug)[0]
    client.ArchiveJob(verify_job_id)
    if not isinstance(result, (tuple, list)):
      logging.error("Can't get a valid result from verify-disks")
      return

    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return

    logging.debug("Will activate disks for instances %s",
                  utils.CommaJoin(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    activate_ops = [opcodes.OpActivateInstanceDisks(instance_name=name)
                    for name in offline_disk_instances]
    activate_job_id = cli.SendJob(activate_ops, cl=client)

    try:
      cli.PollJob(activate_job_id, cl=client, feedback_fn=logging.debug)
    except Exception: # pylint: disable-msg=W0703
      logging.exception("Error while activating disks")
606 a8083063 Iustin Pop
607 a8083063 Iustin Pop
608 001b3825 Michael Hanselmann
def OpenStateFile(path):
609 001b3825 Michael Hanselmann
  """Opens the state file and acquires a lock on it.
610 001b3825 Michael Hanselmann
611 001b3825 Michael Hanselmann
  @type path: string
612 001b3825 Michael Hanselmann
  @param path: Path to state file
613 001b3825 Michael Hanselmann
614 001b3825 Michael Hanselmann
  """
615 001b3825 Michael Hanselmann
  # The two-step dance below is necessary to allow both opening existing
616 001b3825 Michael Hanselmann
  # file read/write and creating if not existing. Vanilla open will truncate
617 001b3825 Michael Hanselmann
  # an existing file -or- allow creating if not existing.
618 001b3825 Michael Hanselmann
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)
619 001b3825 Michael Hanselmann
620 001b3825 Michael Hanselmann
  # Try to acquire lock on state file. If this fails, another watcher instance
621 001b3825 Michael Hanselmann
  # might already be running or another program is temporarily blocking the
622 001b3825 Michael Hanselmann
  # watcher from running.
623 001b3825 Michael Hanselmann
  try:
624 001b3825 Michael Hanselmann
    utils.LockFile(statefile_fd)
625 001b3825 Michael Hanselmann
  except errors.LockError, err:
626 001b3825 Michael Hanselmann
    logging.error("Can't acquire lock on state file %s: %s", path, err)
627 001b3825 Michael Hanselmann
    return None
628 001b3825 Michael Hanselmann
629 001b3825 Michael Hanselmann
  return os.fdopen(statefile_fd, "w+")
630 001b3825 Michael Hanselmann
631 001b3825 Michael Hanselmann
632 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
633 db147305 Tom Limoncelli
  """Connects to RAPI port and does a simple test.
634 db147305 Tom Limoncelli
635 db147305 Tom Limoncelli
  Connects to RAPI port of hostname and does a simple test. At this time, the
636 db147305 Tom Limoncelli
  test is GetVersion.
637 db147305 Tom Limoncelli
638 db147305 Tom Limoncelli
  @type hostname: string
639 db147305 Tom Limoncelli
  @param hostname: hostname of the node to connect to.
640 db147305 Tom Limoncelli
  @rtype: bool
641 db147305 Tom Limoncelli
  @return: Whether RAPI is working properly
642 db147305 Tom Limoncelli
643 db147305 Tom Limoncelli
  """
644 2a7c3583 Michael Hanselmann
  curl_config = rapi.client.GenericCurlConfig(cafile=constants.RAPI_CERT_FILE)
645 2a7c3583 Michael Hanselmann
  rapi_client = rapi.client.GanetiRapiClient(hostname,
646 2a7c3583 Michael Hanselmann
                                             curl_config_fn=curl_config)
647 db147305 Tom Limoncelli
  try:
648 db147305 Tom Limoncelli
    master_version = rapi_client.GetVersion()
649 db147305 Tom Limoncelli
  except rapi.client.CertificateError, err:
650 db147305 Tom Limoncelli
    logging.warning("RAPI Error: CertificateError (%s)", err)
651 db147305 Tom Limoncelli
    return False
652 db147305 Tom Limoncelli
  except rapi.client.GanetiApiError, err:
653 db147305 Tom Limoncelli
    logging.warning("RAPI Error: GanetiApiError (%s)", err)
654 db147305 Tom Limoncelli
    return False
655 db147305 Tom Limoncelli
  logging.debug("RAPI Result: master_version is %s", master_version)
656 db147305 Tom Limoncelli
  return master_version == constants.RAPI_VERSION
657 db147305 Tom Limoncelli
658 db147305 Tom Limoncelli
659 a8083063 Iustin Pop
def ParseOptions():
  """Builds the option parser and parses the command line.

  Besides the debug option, "-A"/"--job-age" is accepted. Its value (six
  hours, expressed in seconds, unless given on the command line) is passed
  through cli.ParseTimespec before being returned.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  job_age_help = ("Autoarchive jobs older than this age (default"
                  " 6 hours)")
  default_job_age = 6 * 3600

  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_string)
  parser.add_option(cli.DEBUG_OPT)
  parser.add_option("-A", "--job-age", dest="job_age", help=job_age_help,
                    default=default_job_age)

  (options, args) = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)

  return (options, args)
677 a8083063 Iustin Pop
678 a8083063 Iustin Pop
679 2a7c3583 Michael Hanselmann
@rapi.client.UsesRapiClient
680 a8083063 Iustin Pop
def main():
681 a8083063 Iustin Pop
  """Main function.
682 a8083063 Iustin Pop
683 a8083063 Iustin Pop
  """
684 7260cfbe Iustin Pop
  global client # pylint: disable-msg=W0603
685 e125c67c Michael Hanselmann
686 f93427cd Iustin Pop
  options, args = ParseOptions()
687 f93427cd Iustin Pop
688 f93427cd Iustin Pop
  if args: # watcher doesn't take any arguments
689 f93427cd Iustin Pop
    print >> sys.stderr, ("Usage: %s [-f] " % sys.argv[0])
690 f93427cd Iustin Pop
    sys.exit(constants.EXIT_FAILURE)
691 a8083063 Iustin Pop
692 82d9caef Iustin Pop
  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
693 82d9caef Iustin Pop
                     stderr_logging=options.debug)
694 a8083063 Iustin Pop
695 3753b2cb Michael Hanselmann
  if ShouldPause():
696 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
697 3753b2cb Michael Hanselmann
    sys.exit(constants.EXIT_SUCCESS)
698 3753b2cb Michael Hanselmann
699 001b3825 Michael Hanselmann
  statefile = OpenStateFile(constants.WATCHER_STATEFILE)
700 001b3825 Michael Hanselmann
  if not statefile:
701 001b3825 Michael Hanselmann
    sys.exit(constants.EXIT_FAILURE)
702 001b3825 Michael Hanselmann
703 24edc6d4 Iustin Pop
  update_file = False
704 a8083063 Iustin Pop
  try:
705 f1115454 Guido Trotter
    StartNodeDaemons()
706 9e289e36 Guido Trotter
    RunWatcherHooks()
707 50273051 Iustin Pop
    # run node maintenance in all cases, even if master, so that old
708 50273051 Iustin Pop
    # masters can be properly cleaned up too
709 50273051 Iustin Pop
    if NodeMaintenance.ShouldRun():
710 50273051 Iustin Pop
      NodeMaintenance().Exec()
711 c4f0219c Iustin Pop
712 001b3825 Michael Hanselmann
    notepad = WatcherState(statefile)
713 781b2b2b Michael Hanselmann
    try:
714 2c404217 Iustin Pop
      try:
715 2c404217 Iustin Pop
        client = cli.GetClient()
716 2c404217 Iustin Pop
      except errors.OpPrereqError:
717 2c404217 Iustin Pop
        # this is, from cli.GetClient, a not-master case
718 7dfb83c2 Iustin Pop
        logging.debug("Not on master, exiting")
719 24edc6d4 Iustin Pop
        update_file = True
720 2c404217 Iustin Pop
        sys.exit(constants.EXIT_SUCCESS)
721 7dfb83c2 Iustin Pop
      except luxi.NoMasterError, err:
722 7dfb83c2 Iustin Pop
        logging.warning("Master seems to be down (%s), trying to restart",
723 7dfb83c2 Iustin Pop
                        str(err))
724 2826b361 Guido Trotter
        if not utils.EnsureDaemon(constants.MASTERD):
725 7dfb83c2 Iustin Pop
          logging.critical("Can't start the master, exiting")
726 7dfb83c2 Iustin Pop
          sys.exit(constants.EXIT_FAILURE)
727 7dfb83c2 Iustin Pop
        # else retry the connection
728 7dfb83c2 Iustin Pop
        client = cli.GetClient()
729 cc962d58 Iustin Pop
730 83052f9e Guido Trotter
      # we are on master now
731 2826b361 Guido Trotter
      utils.EnsureDaemon(constants.RAPI)
732 c4f0219c Iustin Pop
733 db147305 Tom Limoncelli
      # If RAPI isn't responding to queries, try one restart.
734 db147305 Tom Limoncelli
      logging.debug("Attempting to talk with RAPI.")
735 9769bb78 Manuel Franceschini
      if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
736 db147305 Tom Limoncelli
        logging.warning("Couldn't get answer from Ganeti RAPI daemon."
737 db147305 Tom Limoncelli
                        " Restarting Ganeti RAPI.")
738 db147305 Tom Limoncelli
        utils.StopDaemon(constants.RAPI)
739 db147305 Tom Limoncelli
        utils.EnsureDaemon(constants.RAPI)
740 db147305 Tom Limoncelli
        logging.debug("Second attempt to talk with RAPI")
741 9769bb78 Manuel Franceschini
        if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
742 db147305 Tom Limoncelli
          logging.fatal("RAPI is not responding. Please investigate.")
743 db147305 Tom Limoncelli
      logging.debug("Successfully talked to RAPI.")
744 db147305 Tom Limoncelli
745 cc962d58 Iustin Pop
      try:
746 cc962d58 Iustin Pop
        watcher = Watcher(options, notepad)
747 cc962d58 Iustin Pop
      except errors.ConfigurationError:
748 cc962d58 Iustin Pop
        # Just exit if there's no configuration
749 24edc6d4 Iustin Pop
        update_file = True
750 cc962d58 Iustin Pop
        sys.exit(constants.EXIT_SUCCESS)
751 e125c67c Michael Hanselmann
752 cc962d58 Iustin Pop
      watcher.Run()
753 24edc6d4 Iustin Pop
      update_file = True
754 24edc6d4 Iustin Pop
755 cc962d58 Iustin Pop
    finally:
756 7dfb83c2 Iustin Pop
      if update_file:
757 7dfb83c2 Iustin Pop
        notepad.Save()
758 7dfb83c2 Iustin Pop
      else:
759 7dfb83c2 Iustin Pop
        logging.debug("Not updating status file due to failure")
760 1b052f42 Michael Hanselmann
  except SystemExit:
761 1b052f42 Michael Hanselmann
    raise
762 38242904 Iustin Pop
  except NotMasterError:
763 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
764 38242904 Iustin Pop
    sys.exit(constants.EXIT_NOTMASTER)
765 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
766 438b45d4 Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
767 89e1fc26 Iustin Pop
    sys.exit(constants.EXIT_NODESETUP_ERROR)
768 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
769 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
770 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
771 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
772 438b45d4 Michael Hanselmann
  except Exception, err:
773 001b3825 Michael Hanselmann
    logging.exception(str(err))
774 438b45d4 Michael Hanselmann
    sys.exit(constants.EXIT_FAILURE)
775 a8083063 Iustin Pop
776 5a3103e9 Michael Hanselmann
777 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported
if __name__ == "__main__":
  main()