Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / __init__.py @ d1e9c98d

History | View | Annotate | Download (24.9 kB)

1 9f4bb951 Michael Hanselmann
#
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 f2af0bec Iustin Pop
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop

24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop

28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 7260cfbe Iustin Pop
# pylint: disable-msg=C0103,W0142
31 7260cfbe Iustin Pop
32 7260cfbe Iustin Pop
# C0103: Invalid name ganeti-watcher
33 7260cfbe Iustin Pop
34 a8083063 Iustin Pop
import os
35 cfcc79c6 Michael Hanselmann
import os.path
36 a8083063 Iustin Pop
import sys
37 a8083063 Iustin Pop
import time
38 438b45d4 Michael Hanselmann
import logging
39 a8083063 Iustin Pop
from optparse import OptionParser
40 a8083063 Iustin Pop
41 a8083063 Iustin Pop
from ganeti import utils
42 a8083063 Iustin Pop
from ganeti import constants
43 83e5e26f René Nussbaumer
from ganeti import compat
44 67fe61c4 Michael Hanselmann
from ganeti import serializer
45 89e1fc26 Iustin Pop
from ganeti import errors
46 e125c67c Michael Hanselmann
from ganeti import opcodes
47 e125c67c Michael Hanselmann
from ganeti import cli
48 7dfb83c2 Iustin Pop
from ganeti import luxi
49 50273051 Iustin Pop
from ganeti import ssconf
50 50273051 Iustin Pop
from ganeti import bdev
51 50273051 Iustin Pop
from ganeti import hypervisor
52 db147305 Tom Limoncelli
from ganeti import rapi
53 50273051 Iustin Pop
from ganeti.confd import client as confd_client
54 a744b676 Manuel Franceschini
from ganeti import netutils
55 a8083063 Iustin Pop
56 db147305 Tom Limoncelli
import ganeti.rapi.client # pylint: disable-msg=W0611
57 db147305 Tom Limoncelli
58 a8083063 Iustin Pop
59 5a3103e9 Michael Hanselmann
# Number of restart attempts made for an instance before giving up on it
MAXTRIES = 5

# Restart records older than this many seconds are deleted. With a retry
# counter of 5 and the watcher running every 5 minutes it takes around half
# an hour to exceed the retry counter, so 8 hours (16 * 1/2h) seems like a
# reasonable reset time.
RETRY_EXPIRATION = 8 * 3600

# Instance states in which a restart is attempted
BAD_STATES = [constants.INSTST_ERRORDOWN]
# Instance states in which no restart is attempted and any existing restart
# record is dropped
HELPLESS_STATES = [constants.INSTST_NODEDOWN, constants.INSTST_NODEOFFLINE]

NOTICE = "NOTICE"
ERROR = "ERROR"

# Keys used inside the per-instance and per-node records of the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object
client = None
76 e125c67c Michael Hanselmann
77 e125c67c Michael Hanselmann
78 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Raised when the watcher finds that this host is not the master node.

  """
80 a8083063 Iustin Pop
81 a8083063 Iustin Pop
82 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher has been asked to pause.

  @rtype: bool
  @return: the truth value of what L{utils.ReadWatcherPauseFile} returns for
      the watcher pause file

  """
  pause_info = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  if pause_info:
    return True
  return False
87 3753b2cb Michael Hanselmann
88 3753b2cb Michael Hanselmann
89 f1115454 Guido Trotter
def StartNodeDaemons():
  """Make sure the daemons needed on every node are running.

  """
  # The node daemon is started whether or not this node is the master; confd
  # is started as well, on non-candidates it will be in disabled mode.
  for daemon_name in (constants.NODED, constants.CONFD):
    utils.EnsureDaemon(daemon_name)
97 f1115454 Guido Trotter
98 f1115454 Guido Trotter
99 9e289e36 Guido Trotter
def RunWatcherHooks():
100 9e289e36 Guido Trotter
  """Run the watcher hooks.
101 9e289e36 Guido Trotter

102 9e289e36 Guido Trotter
  """
103 c4feafe8 Iustin Pop
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
104 c4feafe8 Iustin Pop
                             constants.HOOKS_NAME_WATCHER)
105 10e689d4 Iustin Pop
  if not os.path.isdir(hooks_dir):
106 10e689d4 Iustin Pop
    return
107 9e289e36 Guido Trotter
108 9e289e36 Guido Trotter
  try:
109 9e289e36 Guido Trotter
    results = utils.RunParts(hooks_dir)
110 9e289e36 Guido Trotter
  except Exception, msg: # pylint: disable-msg=W0703
111 9e289e36 Guido Trotter
    logging.critical("RunParts %s failed: %s", hooks_dir, msg)
112 9e289e36 Guido Trotter
113 9e289e36 Guido Trotter
  for (relname, status, runresult) in results:
114 9e289e36 Guido Trotter
    if status == constants.RUNPARTS_SKIP:
115 9e289e36 Guido Trotter
      logging.debug("Watcher hook %s: skipped", relname)
116 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_ERR:
117 9e289e36 Guido Trotter
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
118 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_RUN:
119 9e289e36 Guido Trotter
      if runresult.failed:
120 9e289e36 Guido Trotter
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
121 9e289e36 Guido Trotter
                        relname, runresult.exit_code, runresult.output)
122 9e289e36 Guido Trotter
      else:
123 9e289e36 Guido Trotter
        logging.debug("Watcher hook %s: success (output: %s)", relname,
124 9e289e36 Guido Trotter
                      runresult.output)
125 9e289e36 Guido Trotter
126 001b3825 Michael Hanselmann
127 50273051 Iustin Pop
class NodeMaintenance(object):
128 50273051 Iustin Pop
  """Talks to confd daemons and possible shutdown instances/drbd devices.
129 50273051 Iustin Pop

130 50273051 Iustin Pop
  """
131 50273051 Iustin Pop
  def __init__(self):
132 50273051 Iustin Pop
    self.store_cb = confd_client.StoreResultCallback()
133 50273051 Iustin Pop
    self.filter_cb = confd_client.ConfdFilterCallback(self.store_cb)
134 50273051 Iustin Pop
    self.confd_client = confd_client.GetConfdClient(self.filter_cb)
135 50273051 Iustin Pop
136 50273051 Iustin Pop
  @staticmethod
137 50273051 Iustin Pop
  def ShouldRun():
138 50273051 Iustin Pop
    """Checks whether node maintenance should run.
139 50273051 Iustin Pop

140 50273051 Iustin Pop
    """
141 50273051 Iustin Pop
    try:
142 50273051 Iustin Pop
      return ssconf.SimpleStore().GetMaintainNodeHealth()
143 50273051 Iustin Pop
    except errors.ConfigurationError, err:
144 50273051 Iustin Pop
      logging.error("Configuration error, not activating node maintenance: %s",
145 50273051 Iustin Pop
                    err)
146 50273051 Iustin Pop
      return False
147 50273051 Iustin Pop
148 50273051 Iustin Pop
  @staticmethod
149 50273051 Iustin Pop
  def GetRunningInstances():
150 50273051 Iustin Pop
    """Compute list of hypervisor/running instances.
151 50273051 Iustin Pop

152 50273051 Iustin Pop
    """
153 50273051 Iustin Pop
    hyp_list = ssconf.SimpleStore().GetHypervisorList()
154 50273051 Iustin Pop
    results = []
155 50273051 Iustin Pop
    for hv_name in hyp_list:
156 50273051 Iustin Pop
      try:
157 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
158 50273051 Iustin Pop
        ilist = hv.ListInstances()
159 50273051 Iustin Pop
        results.extend([(iname, hv_name) for iname in ilist])
160 50273051 Iustin Pop
      except: # pylint: disable-msg=W0702
161 50273051 Iustin Pop
        logging.error("Error while listing instances for hypervisor %s",
162 50273051 Iustin Pop
                      hv_name, exc_info=True)
163 50273051 Iustin Pop
    return results
164 50273051 Iustin Pop
165 50273051 Iustin Pop
  @staticmethod
166 50273051 Iustin Pop
  def GetUsedDRBDs():
167 50273051 Iustin Pop
    """Get list of used DRBD minors.
168 50273051 Iustin Pop

169 50273051 Iustin Pop
    """
170 50273051 Iustin Pop
    return bdev.DRBD8.GetUsedDevs().keys()
171 50273051 Iustin Pop
172 50273051 Iustin Pop
  @classmethod
173 50273051 Iustin Pop
  def DoMaintenance(cls, role):
174 50273051 Iustin Pop
    """Maintain the instance list.
175 50273051 Iustin Pop

176 50273051 Iustin Pop
    """
177 50273051 Iustin Pop
    if role == constants.CONFD_NODE_ROLE_OFFLINE:
178 50273051 Iustin Pop
      inst_running = cls.GetRunningInstances()
179 50273051 Iustin Pop
      cls.ShutdownInstances(inst_running)
180 50273051 Iustin Pop
      drbd_running = cls.GetUsedDRBDs()
181 50273051 Iustin Pop
      cls.ShutdownDRBD(drbd_running)
182 50273051 Iustin Pop
    else:
183 50273051 Iustin Pop
      logging.debug("Not doing anything for role %s", role)
184 50273051 Iustin Pop
185 50273051 Iustin Pop
  @staticmethod
186 50273051 Iustin Pop
  def ShutdownInstances(inst_running):
187 50273051 Iustin Pop
    """Shutdown running instances.
188 50273051 Iustin Pop

189 50273051 Iustin Pop
    """
190 50273051 Iustin Pop
    names_running = set([i[0] for i in inst_running])
191 50273051 Iustin Pop
    if names_running:
192 50273051 Iustin Pop
      logging.info("Following instances should not be running,"
193 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(names_running))
194 50273051 Iustin Pop
      # this dictionary will collapse duplicate instance names (only
195 50273051 Iustin Pop
      # xen pvm/vhm) into a single key, which is fine
196 50273051 Iustin Pop
      i2h = dict(inst_running)
197 50273051 Iustin Pop
      for name in names_running:
198 50273051 Iustin Pop
        hv_name = i2h[name]
199 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
200 50273051 Iustin Pop
        hv.StopInstance(None, force=True, name=name)
201 50273051 Iustin Pop
202 50273051 Iustin Pop
  @staticmethod
203 50273051 Iustin Pop
  def ShutdownDRBD(drbd_running):
204 50273051 Iustin Pop
    """Shutdown active DRBD devices.
205 50273051 Iustin Pop

206 50273051 Iustin Pop
    """
207 50273051 Iustin Pop
    if drbd_running:
208 50273051 Iustin Pop
      logging.info("Following DRBD minors should not be active,"
209 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(drbd_running))
210 50273051 Iustin Pop
      for minor in drbd_running:
211 50273051 Iustin Pop
        # pylint: disable-msg=W0212
212 50273051 Iustin Pop
        # using the private method as is, pending enhancements to the DRBD
213 50273051 Iustin Pop
        # interface
214 50273051 Iustin Pop
        bdev.DRBD8._ShutdownAll(minor)
215 50273051 Iustin Pop
216 50273051 Iustin Pop
  def Exec(self):
217 50273051 Iustin Pop
    """Check node status versus cluster desired state.
218 50273051 Iustin Pop

219 50273051 Iustin Pop
    """
220 b705c7a6 Manuel Franceschini
    my_name = netutils.Hostname.GetSysName()
221 50273051 Iustin Pop
    req = confd_client.ConfdClientRequest(type=
222 50273051 Iustin Pop
                                          constants.CONFD_REQ_NODE_ROLE_BYNAME,
223 50273051 Iustin Pop
                                          query=my_name)
224 ebacb943 Iustin Pop
    self.confd_client.SendRequest(req, async=False, coverage=-1)
225 50273051 Iustin Pop
    timed_out, _, _ = self.confd_client.WaitForReply(req.rsalt)
226 50273051 Iustin Pop
    if not timed_out:
227 50273051 Iustin Pop
      # should have a valid response
228 50273051 Iustin Pop
      status, result = self.store_cb.GetResponse(req.rsalt)
229 50273051 Iustin Pop
      assert status, "Missing result but received replies"
230 50273051 Iustin Pop
      if not self.filter_cb.consistent[req.rsalt]:
231 50273051 Iustin Pop
        logging.warning("Inconsistent replies, not doing anything")
232 50273051 Iustin Pop
        return
233 50273051 Iustin Pop
      self.DoMaintenance(result.server_reply.answer)
234 50273051 Iustin Pop
    else:
235 50273051 Iustin Pop
      logging.warning("Confd query timed out, cannot do maintenance actions")
236 50273051 Iustin Pop
237 50273051 Iustin Pop
238 5a3103e9 Michael Hanselmann
class WatcherState(object):
239 a8083063 Iustin Pop
  """Interface to a state file recording restart attempts.
240 a8083063 Iustin Pop

241 a8083063 Iustin Pop
  """
242 001b3825 Michael Hanselmann
  def __init__(self, statefile):
243 5a3103e9 Michael Hanselmann
    """Open, lock, read and parse the file.
244 5a3103e9 Michael Hanselmann

245 001b3825 Michael Hanselmann
    @type statefile: file
246 001b3825 Michael Hanselmann
    @param statefile: State file object
247 5a3103e9 Michael Hanselmann

248 5a3103e9 Michael Hanselmann
    """
249 001b3825 Michael Hanselmann
    self.statefile = statefile
250 a8083063 Iustin Pop
251 5a3103e9 Michael Hanselmann
    try:
252 2c404217 Iustin Pop
      state_data = self.statefile.read()
253 2c404217 Iustin Pop
      if not state_data:
254 2c404217 Iustin Pop
        self._data = {}
255 2c404217 Iustin Pop
      else:
256 2c404217 Iustin Pop
        self._data = serializer.Load(state_data)
257 7260cfbe Iustin Pop
    except Exception, msg: # pylint: disable-msg=W0703
258 5a3103e9 Michael Hanselmann
      # Ignore errors while loading the file and treat it as empty
259 b76f660d Michael Hanselmann
      self._data = {}
260 2c404217 Iustin Pop
      logging.warning(("Invalid state file. Using defaults."
261 438b45d4 Michael Hanselmann
                       " Error message: %s"), msg)
262 5a3103e9 Michael Hanselmann
263 b76f660d Michael Hanselmann
    if "instance" not in self._data:
264 b76f660d Michael Hanselmann
      self._data["instance"] = {}
265 b76f660d Michael Hanselmann
    if "node" not in self._data:
266 b76f660d Michael Hanselmann
      self._data["node"] = {}
267 5a3103e9 Michael Hanselmann
268 26517d45 Iustin Pop
    self._orig_data = serializer.Dump(self._data)
269 2fb96d39 Michael Hanselmann
270 fc428e32 Michael Hanselmann
  def Save(self):
271 fc428e32 Michael Hanselmann
    """Save state to file, then unlock and close it.
272 5a3103e9 Michael Hanselmann

273 5a3103e9 Michael Hanselmann
    """
274 fc428e32 Michael Hanselmann
    assert self.statefile
275 fc428e32 Michael Hanselmann
276 26517d45 Iustin Pop
    serialized_form = serializer.Dump(self._data)
277 26517d45 Iustin Pop
    if self._orig_data == serialized_form:
278 2fb96d39 Michael Hanselmann
      logging.debug("Data didn't change, just touching status file")
279 2fb96d39 Michael Hanselmann
      os.utime(constants.WATCHER_STATEFILE, None)
280 2fb96d39 Michael Hanselmann
      return
281 2fb96d39 Michael Hanselmann
282 fc428e32 Michael Hanselmann
    # We need to make sure the file is locked before renaming it, otherwise
283 fc428e32 Michael Hanselmann
    # starting ganeti-watcher again at the same time will create a conflict.
284 fc428e32 Michael Hanselmann
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
285 26517d45 Iustin Pop
                         data=serialized_form,
286 eb0f0ce0 Michael Hanselmann
                         prewrite=utils.LockFile, close=False)
287 fc428e32 Michael Hanselmann
    self.statefile = os.fdopen(fd, 'w+')
288 5a3103e9 Michael Hanselmann
289 fc428e32 Michael Hanselmann
  def Close(self):
290 5a3103e9 Michael Hanselmann
    """Unlock configuration file and close it.
291 5a3103e9 Michael Hanselmann

292 5a3103e9 Michael Hanselmann
    """
293 5a3103e9 Michael Hanselmann
    assert self.statefile
294 5a3103e9 Michael Hanselmann
295 fc428e32 Michael Hanselmann
    # Files are automatically unlocked when closing them
296 5a3103e9 Michael Hanselmann
    self.statefile.close()
297 5a3103e9 Michael Hanselmann
    self.statefile = None
298 5a3103e9 Michael Hanselmann
299 5a3103e9 Michael Hanselmann
  def GetNodeBootID(self, name):
300 5a3103e9 Michael Hanselmann
    """Returns the last boot ID of a node or None.
301 a8083063 Iustin Pop

302 5a3103e9 Michael Hanselmann
    """
303 b76f660d Michael Hanselmann
    ndata = self._data["node"]
304 5a3103e9 Michael Hanselmann
305 7b195d9b Michael Hanselmann
    if name in ndata and KEY_BOOT_ID in ndata[name]:
306 7b195d9b Michael Hanselmann
      return ndata[name][KEY_BOOT_ID]
307 5a3103e9 Michael Hanselmann
    return None
308 5a3103e9 Michael Hanselmann
309 5a3103e9 Michael Hanselmann
  def SetNodeBootID(self, name, bootid):
310 5a3103e9 Michael Hanselmann
    """Sets the boot ID of a node.
311 5a3103e9 Michael Hanselmann

312 5a3103e9 Michael Hanselmann
    """
313 5a3103e9 Michael Hanselmann
    assert bootid
314 a8083063 Iustin Pop
315 b76f660d Michael Hanselmann
    ndata = self._data["node"]
316 a8083063 Iustin Pop
317 5a3103e9 Michael Hanselmann
    if name not in ndata:
318 5a3103e9 Michael Hanselmann
      ndata[name] = {}
319 5a3103e9 Michael Hanselmann
320 7b195d9b Michael Hanselmann
    ndata[name][KEY_BOOT_ID] = bootid
321 5a3103e9 Michael Hanselmann
322 5a3103e9 Michael Hanselmann
  def NumberOfRestartAttempts(self, instance):
323 a8083063 Iustin Pop
    """Returns number of previous restart attempts.
324 a8083063 Iustin Pop

325 c41eea6e Iustin Pop
    @type instance: L{Instance}
326 c41eea6e Iustin Pop
    @param instance: the instance to look up
327 38242904 Iustin Pop

328 a8083063 Iustin Pop
    """
329 b76f660d Michael Hanselmann
    idata = self._data["instance"]
330 a8083063 Iustin Pop
331 5a3103e9 Michael Hanselmann
    if instance.name in idata:
332 7b195d9b Michael Hanselmann
      return idata[instance.name][KEY_RESTART_COUNT]
333 a8083063 Iustin Pop
334 a8083063 Iustin Pop
    return 0
335 a8083063 Iustin Pop
336 f5116c87 Iustin Pop
  def MaintainInstanceList(self, instances):
337 f5116c87 Iustin Pop
    """Perform maintenance on the recorded instances.
338 f5116c87 Iustin Pop

339 f5116c87 Iustin Pop
    @type instances: list of string
340 f5116c87 Iustin Pop
    @param instances: the list of currently existing instances
341 f5116c87 Iustin Pop

342 f5116c87 Iustin Pop
    """
343 f5116c87 Iustin Pop
    idict = self._data["instance"]
344 f5116c87 Iustin Pop
    # First, delete obsolete instances
345 f5116c87 Iustin Pop
    obsolete_instances = set(idict).difference(instances)
346 f5116c87 Iustin Pop
    for inst in obsolete_instances:
347 f5116c87 Iustin Pop
      logging.debug("Forgetting obsolete instance %s", inst)
348 f5116c87 Iustin Pop
      del idict[inst]
349 f5116c87 Iustin Pop
350 f5116c87 Iustin Pop
    # Second, delete expired records
351 f5116c87 Iustin Pop
    earliest = time.time() - RETRY_EXPIRATION
352 f5116c87 Iustin Pop
    expired_instances = [i for i in idict
353 f5116c87 Iustin Pop
                         if idict[i][KEY_RESTART_WHEN] < earliest]
354 f5116c87 Iustin Pop
    for inst in expired_instances:
355 f5116c87 Iustin Pop
      logging.debug("Expiring record for instance %s", inst)
356 f5116c87 Iustin Pop
      del idict[inst]
357 f5116c87 Iustin Pop
358 5a3103e9 Michael Hanselmann
  def RecordRestartAttempt(self, instance):
359 a8083063 Iustin Pop
    """Record a restart attempt.
360 a8083063 Iustin Pop

361 c41eea6e Iustin Pop
    @type instance: L{Instance}
362 c41eea6e Iustin Pop
    @param instance: the instance being restarted
363 38242904 Iustin Pop

364 a8083063 Iustin Pop
    """
365 b76f660d Michael Hanselmann
    idata = self._data["instance"]
366 a8083063 Iustin Pop
367 5a3103e9 Michael Hanselmann
    if instance.name not in idata:
368 5a3103e9 Michael Hanselmann
      inst = idata[instance.name] = {}
369 5a3103e9 Michael Hanselmann
    else:
370 5a3103e9 Michael Hanselmann
      inst = idata[instance.name]
371 a8083063 Iustin Pop
372 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_WHEN] = time.time()
373 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
374 a8083063 Iustin Pop
375 5a3103e9 Michael Hanselmann
  def RemoveInstance(self, instance):
376 c41eea6e Iustin Pop
    """Update state to reflect that a machine is running.
377 a8083063 Iustin Pop

378 c41eea6e Iustin Pop
    This method removes the record for a named instance (as we only
379 c41eea6e Iustin Pop
    track down instances).
380 a8083063 Iustin Pop

381 c41eea6e Iustin Pop
    @type instance: L{Instance}
382 c41eea6e Iustin Pop
    @param instance: the instance to remove from books
383 38242904 Iustin Pop

384 a8083063 Iustin Pop
    """
385 b76f660d Michael Hanselmann
    idata = self._data["instance"]
386 a8083063 Iustin Pop
387 5a3103e9 Michael Hanselmann
    if instance.name in idata:
388 5a3103e9 Michael Hanselmann
      del idata[instance.name]
389 a8083063 Iustin Pop
390 a8083063 Iustin Pop
391 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  @ivar name: the instance name
  @ivar state: the instance status
  @ivar autostart: whether the watcher may act on the instance
  @ivar snodes: the secondary nodes of the instance

  """
  def __init__(self, name, state, autostart, snodes):
    (self.name, self.state) = (name, state)
    (self.autostart, self.snodes) = (autostart, snodes)

  def Restart(self):
    """Encapsulates the start of an instance.

    An instance startup opcode (not forced) is submitted through the global
    client.

    """
    startup_op = opcodes.OpInstanceStartup(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    A disk activation opcode is submitted through the global client.

    """
    activate_op = opcodes.OpInstanceActivateDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
414 a8083063 Iustin Pop
415 a8083063 Iustin Pop
416 6dfcc47b Iustin Pop
def GetClusterData():
  """Get the instances and nodes of this cluster.

  One job holding an instance query and a node query is submitted through
  the global client and polled for its results. The instance status file is
  rewritten with one "name status" line per instance, and the job is
  archived before returning.

  @rtype: tuple
  @return: (instances, nodes, smap), where instances maps instance names to
      L{Instance} objects, nodes maps node names to (bootid, offline) tuples
      and smap maps node names to the names of the instances having that
      node as a secondary

  """
  instance_query = opcodes.OpInstanceQuery(
    output_fields=["name", "status", "admin_state", "snodes"],
    names=[], use_locking=True)
  node_query = opcodes.OpNodeQuery(
    output_fields=["name", "bootid", "offline"],
    names=[], use_locking=True)

  job_id = client.SubmitJob([instance_query, node_query])

  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  instance_rows = all_results[0]

  # write the upfile
  status_lines = []
  for row in instance_rows:
    status_lines.append("%s %s\n" % (row[0], row[1]))
  utils.WriteFile(file_name=constants.INSTANCE_UPFILE,
                  data="".join(status_lines))

  smap = {}
  instances = {}
  for (name, status, autostart, snodes) in instance_rows:
    # update the secondary node map
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart, snodes)

  nodes = {}
  for (name, bootid, offline) in all_results[1]:
    nodes[name] = (bootid, offline)

  client.ArchiveJob(job_id)

  return (instances, nodes, smap)
459 a8083063 Iustin Pop
460 a8083063 Iustin Pop
461 5a3103e9 Michael Hanselmann
class Watcher(object):
462 55c85950 Iustin Pop
  """Encapsulate the logic for restarting erroneously halted virtual machines.
463 a8083063 Iustin Pop

464 a8083063 Iustin Pop
  The calling program should periodically instantiate me and call Run().
465 a8083063 Iustin Pop
  This will traverse the list of instances, and make up to MAXTRIES attempts
466 a8083063 Iustin Pop
  to restart machines that are down.
467 38242904 Iustin Pop

468 a8083063 Iustin Pop
  """
469 cc962d58 Iustin Pop
  def __init__(self, opts, notepad):
    """Verify we run on the master, archive old jobs and load cluster data.

    @param opts: the watcher options; C{job_age} is used for archiving jobs
    @param notepad: the watcher state object handed to the checks in L{Run}
    @raise NotMasterError: if the configured master node is not this host

    """
    self.notepad = notepad

    master_name = client.QueryConfigValues(["master_node"])[0]
    local_name = netutils.Hostname.GetSysName()
    if master_name != local_name:
      raise NotMasterError("This is not the master node")

    # first archive old jobs
    self.ArchiveJobs(opts.job_age)

    # and only then submit new ones
    (self.instances, self.bootids, self.smap) = GetClusterData()

    self.started_instances = set()
    self.opts = opts
480 a8083063 Iustin Pop
481 a8083063 Iustin Pop
  def Run(self):
    """Watcher run sequence.

    Instances are checked first, then the disks of instances on nodes whose
    boot ID changed, and finally the disks are verified.

    """
    state = self.notepad

    for check in (self.CheckInstances, self.CheckDisks):
      check(state)

    self.VerifyDisks()
489 5a3103e9 Michael Hanselmann
490 24edc6d4 Iustin Pop
  @staticmethod
  def ArchiveJobs(age):
    """Archive old jobs.

    @param age: handed unchanged to the client's C{AutoArchiveJobs}

    """
    counts = client.AutoArchiveJobs(age)
    (archived, remaining) = counts
    logging.debug("Archived %s jobs, left %s", archived, remaining)
497 f07521e5 Iustin Pop
498 5a3103e9 Michael Hanselmann
  def CheckDisks(self, notepad):
499 5a3103e9 Michael Hanselmann
    """Check all nodes for restarted ones.
500 38242904 Iustin Pop

501 a8083063 Iustin Pop
    """
502 5a3103e9 Michael Hanselmann
    check_nodes = []
503 cbfc4681 Iustin Pop
    for name, (new_id, offline) in self.bootids.iteritems():
504 5a3103e9 Michael Hanselmann
      old = notepad.GetNodeBootID(name)
505 37b77b18 Iustin Pop
      if new_id is None:
506 37b77b18 Iustin Pop
        # Bad node, not returning a boot id
507 cbfc4681 Iustin Pop
        if not offline:
508 cbfc4681 Iustin Pop
          logging.debug("Node %s missing boot id, skipping secondary checks",
509 cbfc4681 Iustin Pop
                        name)
510 37b77b18 Iustin Pop
        continue
511 26517d45 Iustin Pop
      if old != new_id:
512 5a3103e9 Michael Hanselmann
        # Node's boot ID has changed, proably through a reboot.
513 5a3103e9 Michael Hanselmann
        check_nodes.append(name)
514 5a3103e9 Michael Hanselmann
515 5a3103e9 Michael Hanselmann
    if check_nodes:
516 5a3103e9 Michael Hanselmann
      # Activate disks for all instances with any of the checked nodes as a
517 5a3103e9 Michael Hanselmann
      # secondary node.
518 6dfcc47b Iustin Pop
      for node in check_nodes:
519 6dfcc47b Iustin Pop
        if node not in self.smap:
520 eee1fa2d Iustin Pop
          continue
521 6dfcc47b Iustin Pop
        for instance_name in self.smap[node]:
522 6dfcc47b Iustin Pop
          instance = self.instances[instance_name]
523 6dfcc47b Iustin Pop
          if not instance.autostart:
524 6dfcc47b Iustin Pop
            logging.info(("Skipping disk activation for non-autostart"
525 6dfcc47b Iustin Pop
                          " instance %s"), instance.name)
526 6dfcc47b Iustin Pop
            continue
527 6dfcc47b Iustin Pop
          if instance.name in self.started_instances:
528 6dfcc47b Iustin Pop
            # we already tried to start the instance, which should have
529 6dfcc47b Iustin Pop
            # activated its drives (if they can be at all)
530 604c175c Iustin Pop
            logging.debug("Skipping disk activation for instance %s, as"
531 604c175c Iustin Pop
                          " it was already started", instance.name)
532 6dfcc47b Iustin Pop
            continue
533 6dfcc47b Iustin Pop
          try:
534 6dfcc47b Iustin Pop
            logging.info("Activating disks for instance %s", instance.name)
535 6dfcc47b Iustin Pop
            instance.ActivateDisks()
536 7260cfbe Iustin Pop
          except Exception: # pylint: disable-msg=W0703
537 6dfcc47b Iustin Pop
            logging.exception("Error while activating disks for instance %s",
538 6dfcc47b Iustin Pop
                              instance.name)
539 5a3103e9 Michael Hanselmann
540 5a3103e9 Michael Hanselmann
      # Keep changed boot IDs
541 5a3103e9 Michael Hanselmann
      for name in check_nodes:
542 3448aa22 Iustin Pop
        notepad.SetNodeBootID(name, self.bootids[name][0])
543 a8083063 Iustin Pop
544 5a3103e9 Michael Hanselmann
  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    The restart records are first purged through C{MaintainInstanceList}.
    An instance in one of L{BAD_STATES} is restarted as long as fewer than
    L{MAXTRIES} attempts are recorded for it; each attempt is recorded
    whether or not it raised an error. For instances in any other state an
    existing restart record is dropped.

    @param notepad: the watcher state holding the restart records

    """
    notepad.MaintainInstanceList(self.instances.keys())

    for inst in self.instances.values():
      if inst.state not in BAD_STATES:
        # Nothing to restart here, forget about earlier attempts (if any)
        if notepad.NumberOfRestartAttempts(inst):
          notepad.RemoveInstance(inst)
          if inst.state not in HELPLESS_STATES:
            logging.info("Restart of %s succeeded", inst.name)
        continue

      attempts = notepad.NumberOfRestartAttempts(inst)

      if attempts > MAXTRIES:
        logging.warning("Not restarting instance %s, retries exhausted",
                        inst.name)
        continue

      if attempts == MAXTRIES:
        # Recording once more pushes the counter past MAXTRIES, so the
        # following runs end up in the branch above
        notepad.RecordRestartAttempt(inst)
        logging.error("Could not restart %s after %d attempts, giving up",
                      inst.name, MAXTRIES)
        continue

      attempt_note = " (Attempt #%d)" % (attempts + 1)
      try:
        logging.info("Restarting %s%s", inst.name, attempt_note)
        inst.Restart()
        self.started_instances.add(inst.name)
      except Exception: # pylint: disable-msg=W0703
        logging.exception("Error while restarting instance %s",
                          inst.name)

      notepad.RecordRestartAttempt(inst)
581 a8083063 Iustin Pop
582 83e5e26f René Nussbaumer
  def _CheckForOfflineNodes(self, instance):
    """Checks whether the given instance has any secondary node offline.

    @param instance: The instance object
    @return: True if any of the secondary nodes is offline, False otherwise

    """
    # Each entry of self.bootids is unpacked as a (boot ID, offline) pair
    node_status = [self.bootids[node_name] for node_name in instance.snodes]

    return compat.any(is_offline for (_, is_offline) in node_status)
594 83e5e26f René Nussbaumer
595 83e5e26f René Nussbaumer
  def VerifyDisks(self):
    """Run gnt-cluster verify-disks.

    Submits a verify-disks job and waits for its result. For the instances
    reported as having offline disks, a single job activating their disks
    is submitted and waited for. Instances are skipped if they are in one
    of the C{HELPLESS_STATES}, have an offline secondary node, or are not
    part of the instance list held by this watcher.

    """
    op = opcodes.OpClusterVerifyDisks()
    job_id = client.SubmitJob([op])
    result = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(job_id)
    if not isinstance(result, (tuple, list)):
      logging.error("Can't get a valid result from verify-disks")
      return
    offline_disk_instances = result[1]
    if not offline_disk_instances:
      # nothing to do
      logging.debug("verify-disks reported no offline disks, nothing to do")
      return
    logging.debug("Will activate disks for instance(s) %s",
                  utils.CommaJoin(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = []
    for name in offline_disk_instances:
      try:
        instance = self.instances[name]
      except KeyError:
        # verify-disks can report an instance this watcher run does not
        # know about; skip it instead of aborting the whole run
        logging.info("Skip instance %s because it is not known to the"
                     " watcher", name)
        continue

      if (instance.state in HELPLESS_STATES or
          self._CheckForOfflineNodes(instance)):
        logging.info("Skip instance %s because it is in helpless state or has"
                     " one offline secondary", name)
        continue
      job.append(opcodes.OpInstanceActivateDisks(instance_name=name))

    if job:
      job_id = cli.SendJob(job, cl=client)

      try:
        cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
      except Exception: # pylint: disable-msg=W0703
        # Best effort only: a failed activation must not stop the watcher
        logging.exception("Error while activating disks")
632 a8083063 Iustin Pop
633 a8083063 Iustin Pop
634 001b3825 Michael Hanselmann
def OpenStateFile(path):
635 001b3825 Michael Hanselmann
  """Opens the state file and acquires a lock on it.
636 001b3825 Michael Hanselmann

637 001b3825 Michael Hanselmann
  @type path: string
638 001b3825 Michael Hanselmann
  @param path: Path to state file
639 001b3825 Michael Hanselmann

640 001b3825 Michael Hanselmann
  """
641 001b3825 Michael Hanselmann
  # The two-step dance below is necessary to allow both opening existing
642 001b3825 Michael Hanselmann
  # file read/write and creating if not existing. Vanilla open will truncate
643 001b3825 Michael Hanselmann
  # an existing file -or- allow creating if not existing.
644 001b3825 Michael Hanselmann
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)
645 001b3825 Michael Hanselmann
646 001b3825 Michael Hanselmann
  # Try to acquire lock on state file. If this fails, another watcher instance
647 001b3825 Michael Hanselmann
  # might already be running or another program is temporarily blocking the
648 001b3825 Michael Hanselmann
  # watcher from running.
649 001b3825 Michael Hanselmann
  try:
650 001b3825 Michael Hanselmann
    utils.LockFile(statefile_fd)
651 001b3825 Michael Hanselmann
  except errors.LockError, err:
652 001b3825 Michael Hanselmann
    logging.error("Can't acquire lock on state file %s: %s", path, err)
653 001b3825 Michael Hanselmann
    return None
654 001b3825 Michael Hanselmann
655 001b3825 Michael Hanselmann
  return os.fdopen(statefile_fd, "w+")
656 001b3825 Michael Hanselmann
657 001b3825 Michael Hanselmann
658 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
659 db147305 Tom Limoncelli
  """Connects to RAPI port and does a simple test.
660 db147305 Tom Limoncelli

661 db147305 Tom Limoncelli
  Connects to RAPI port of hostname and does a simple test. At this time, the
662 db147305 Tom Limoncelli
  test is GetVersion.
663 db147305 Tom Limoncelli

664 db147305 Tom Limoncelli
  @type hostname: string
665 db147305 Tom Limoncelli
  @param hostname: hostname of the node to connect to.
666 db147305 Tom Limoncelli
  @rtype: bool
667 db147305 Tom Limoncelli
  @return: Whether RAPI is working properly
668 db147305 Tom Limoncelli

669 db147305 Tom Limoncelli
  """
670 34f06005 Iustin Pop
  curl_config = rapi.client.GenericCurlConfig()
671 2a7c3583 Michael Hanselmann
  rapi_client = rapi.client.GanetiRapiClient(hostname,
672 2a7c3583 Michael Hanselmann
                                             curl_config_fn=curl_config)
673 db147305 Tom Limoncelli
  try:
674 db147305 Tom Limoncelli
    master_version = rapi_client.GetVersion()
675 db147305 Tom Limoncelli
  except rapi.client.CertificateError, err:
676 db147305 Tom Limoncelli
    logging.warning("RAPI Error: CertificateError (%s)", err)
677 db147305 Tom Limoncelli
    return False
678 db147305 Tom Limoncelli
  except rapi.client.GanetiApiError, err:
679 db147305 Tom Limoncelli
    logging.warning("RAPI Error: GanetiApiError (%s)", err)
680 db147305 Tom Limoncelli
    return False
681 db147305 Tom Limoncelli
  logging.debug("RAPI Result: master_version is %s", master_version)
682 db147305 Tom Limoncelli
  return master_version == constants.RAPI_VERSION
683 db147305 Tom Limoncelli
684 db147305 Tom Limoncelli
685 a8083063 Iustin Pop
def ParseOptions():
  """Builds the option parser and parses the command line.

  The job age option is run through C{cli.ParseTimespec} before the
  options are returned.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_string)

  parser.add_option(cli.DEBUG_OPT)
  parser.add_option("-A", "--job-age", dest="job_age", default=6*3600,
                    help="Autoarchive jobs older than this age (default"
                    " 6 hours)")
  parser.add_option("--ignore-pause", dest="ignore_pause",
                    action="store_true", default=False,
                    help="Ignore cluster pause setting")

  (options, args) = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)

  return (options, args)
705 a8083063 Iustin Pop
706 a8083063 Iustin Pop
707 2a7c3583 Michael Hanselmann
@rapi.client.UsesRapiClient
708 9f4bb951 Michael Hanselmann
def Main():
709 a8083063 Iustin Pop
  """Main function.
710 a8083063 Iustin Pop

711 a8083063 Iustin Pop
  """
712 7260cfbe Iustin Pop
  global client # pylint: disable-msg=W0603
713 e125c67c Michael Hanselmann
714 f93427cd Iustin Pop
  options, args = ParseOptions()
715 f93427cd Iustin Pop
716 f93427cd Iustin Pop
  if args: # watcher doesn't take any arguments
717 f93427cd Iustin Pop
    print >> sys.stderr, ("Usage: %s [-f] " % sys.argv[0])
718 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
719 a8083063 Iustin Pop
720 cfcc79c6 Michael Hanselmann
  utils.SetupLogging(constants.LOG_WATCHER, sys.argv[0],
721 cfcc79c6 Michael Hanselmann
                     debug=options.debug, stderr_logging=options.debug)
722 a8083063 Iustin Pop
723 46c8a6ab Iustin Pop
  if ShouldPause() and not options.ignore_pause:
724 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
725 9f4bb951 Michael Hanselmann
    return constants.EXIT_SUCCESS
726 3753b2cb Michael Hanselmann
727 001b3825 Michael Hanselmann
  statefile = OpenStateFile(constants.WATCHER_STATEFILE)
728 001b3825 Michael Hanselmann
  if not statefile:
729 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
730 001b3825 Michael Hanselmann
731 24edc6d4 Iustin Pop
  update_file = False
732 a8083063 Iustin Pop
  try:
733 f1115454 Guido Trotter
    StartNodeDaemons()
734 9e289e36 Guido Trotter
    RunWatcherHooks()
735 50273051 Iustin Pop
    # run node maintenance in all cases, even if master, so that old
736 50273051 Iustin Pop
    # masters can be properly cleaned up too
737 50273051 Iustin Pop
    if NodeMaintenance.ShouldRun():
738 50273051 Iustin Pop
      NodeMaintenance().Exec()
739 c4f0219c Iustin Pop
740 001b3825 Michael Hanselmann
    notepad = WatcherState(statefile)
741 781b2b2b Michael Hanselmann
    try:
742 2c404217 Iustin Pop
      try:
743 2c404217 Iustin Pop
        client = cli.GetClient()
744 2c404217 Iustin Pop
      except errors.OpPrereqError:
745 2c404217 Iustin Pop
        # this is, from cli.GetClient, a not-master case
746 7dfb83c2 Iustin Pop
        logging.debug("Not on master, exiting")
747 24edc6d4 Iustin Pop
        update_file = True
748 9f4bb951 Michael Hanselmann
        return constants.EXIT_SUCCESS
749 7dfb83c2 Iustin Pop
      except luxi.NoMasterError, err:
750 7dfb83c2 Iustin Pop
        logging.warning("Master seems to be down (%s), trying to restart",
751 7dfb83c2 Iustin Pop
                        str(err))
752 2826b361 Guido Trotter
        if not utils.EnsureDaemon(constants.MASTERD):
753 7dfb83c2 Iustin Pop
          logging.critical("Can't start the master, exiting")
754 9f4bb951 Michael Hanselmann
          return constants.EXIT_FAILURE
755 7dfb83c2 Iustin Pop
        # else retry the connection
756 7dfb83c2 Iustin Pop
        client = cli.GetClient()
757 cc962d58 Iustin Pop
758 83052f9e Guido Trotter
      # we are on master now
759 2826b361 Guido Trotter
      utils.EnsureDaemon(constants.RAPI)
760 c4f0219c Iustin Pop
761 db147305 Tom Limoncelli
      # If RAPI isn't responding to queries, try one restart.
762 db147305 Tom Limoncelli
      logging.debug("Attempting to talk with RAPI.")
763 9769bb78 Manuel Franceschini
      if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
764 db147305 Tom Limoncelli
        logging.warning("Couldn't get answer from Ganeti RAPI daemon."
765 db147305 Tom Limoncelli
                        " Restarting Ganeti RAPI.")
766 db147305 Tom Limoncelli
        utils.StopDaemon(constants.RAPI)
767 db147305 Tom Limoncelli
        utils.EnsureDaemon(constants.RAPI)
768 db147305 Tom Limoncelli
        logging.debug("Second attempt to talk with RAPI")
769 9769bb78 Manuel Franceschini
        if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
770 db147305 Tom Limoncelli
          logging.fatal("RAPI is not responding. Please investigate.")
771 db147305 Tom Limoncelli
      logging.debug("Successfully talked to RAPI.")
772 db147305 Tom Limoncelli
773 cc962d58 Iustin Pop
      try:
774 cc962d58 Iustin Pop
        watcher = Watcher(options, notepad)
775 cc962d58 Iustin Pop
      except errors.ConfigurationError:
776 cc962d58 Iustin Pop
        # Just exit if there's no configuration
777 24edc6d4 Iustin Pop
        update_file = True
778 9f4bb951 Michael Hanselmann
        return constants.EXIT_SUCCESS
779 e125c67c Michael Hanselmann
780 cc962d58 Iustin Pop
      watcher.Run()
781 24edc6d4 Iustin Pop
      update_file = True
782 24edc6d4 Iustin Pop
783 cc962d58 Iustin Pop
    finally:
784 7dfb83c2 Iustin Pop
      if update_file:
785 7dfb83c2 Iustin Pop
        notepad.Save()
786 7dfb83c2 Iustin Pop
      else:
787 7dfb83c2 Iustin Pop
        logging.debug("Not updating status file due to failure")
788 1b052f42 Michael Hanselmann
  except SystemExit:
789 1b052f42 Michael Hanselmann
    raise
790 38242904 Iustin Pop
  except NotMasterError:
791 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
792 9f4bb951 Michael Hanselmann
    return constants.EXIT_NOTMASTER
793 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
794 438b45d4 Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
795 9f4bb951 Michael Hanselmann
    return constants.EXIT_NODESETUP_ERROR
796 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
797 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
798 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
799 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
800 438b45d4 Michael Hanselmann
  except Exception, err:
801 001b3825 Michael Hanselmann
    logging.exception(str(err))
802 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
803 5a3103e9 Michael Hanselmann
804 9f4bb951 Michael Hanselmann
  return constants.EXIT_SUCCESS