Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / __init__.py @ a0aa6b49

History | View | Annotate | Download (24.6 kB)

1 9f4bb951 Michael Hanselmann
#
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 f2af0bec Iustin Pop
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop

24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop

28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 7260cfbe Iustin Pop
# pylint: disable-msg=C0103,W0142
31 7260cfbe Iustin Pop
32 7260cfbe Iustin Pop
# C0103: Invalid name ganeti-watcher
33 7260cfbe Iustin Pop
34 a8083063 Iustin Pop
import os
35 cfcc79c6 Michael Hanselmann
import os.path
36 a8083063 Iustin Pop
import sys
37 a8083063 Iustin Pop
import time
38 438b45d4 Michael Hanselmann
import logging
39 a8083063 Iustin Pop
from optparse import OptionParser
40 a8083063 Iustin Pop
41 a8083063 Iustin Pop
from ganeti import utils
42 a8083063 Iustin Pop
from ganeti import constants
43 83e5e26f René Nussbaumer
from ganeti import compat
44 67fe61c4 Michael Hanselmann
from ganeti import serializer
45 89e1fc26 Iustin Pop
from ganeti import errors
46 e125c67c Michael Hanselmann
from ganeti import opcodes
47 e125c67c Michael Hanselmann
from ganeti import cli
48 7dfb83c2 Iustin Pop
from ganeti import luxi
49 50273051 Iustin Pop
from ganeti import ssconf
50 50273051 Iustin Pop
from ganeti import bdev
51 50273051 Iustin Pop
from ganeti import hypervisor
52 db147305 Tom Limoncelli
from ganeti import rapi
53 50273051 Iustin Pop
from ganeti.confd import client as confd_client
54 a744b676 Manuel Franceschini
from ganeti import netutils
55 a8083063 Iustin Pop
56 db147305 Tom Limoncelli
import ganeti.rapi.client # pylint: disable-msg=W0611
57 db147305 Tom Limoncelli
58 a8083063 Iustin Pop
59 5a3103e9 Michael Hanselmann
# Maximum number of automatic restart attempts made for one instance
MAXTRIES = 5
# Restart records older than 8 hours are deleted. With a retry counter of 5
# and the watcher running every 5 minutes it takes around half an hour to
# exceed the retry counter, so 8 hours (16 * 1/2h) seems like a reasonable
# reset time.
RETRY_EXPIRATION = 8 * 3600
# Instance states for which a restart is attempted
BAD_STATES = ["ERROR_down"]
# Instance states for which no restart is attempted; any recorded restart
# attempts are simply forgotten
HELPLESS_STATES = ["ERROR_nodedown", "ERROR_nodeoffline"]
NOTICE = "NOTICE"
ERROR = "ERROR"
# Keys used inside the per-instance and per-node records of the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global LUXI client object
client = None
76 e125c67c Michael Hanselmann
77 e125c67c Michael Hanselmann
78 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Exception raised when this host is not the master.

  """
80 a8083063 Iustin Pop
81 a8083063 Iustin Pop
82 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher has been asked to pause.

  @rtype: bool
  @return: truth value of what L{utils.ReadWatcherPauseFile} returns for
      L{constants.WATCHER_PAUSEFILE}

  """
  pause_info = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  return bool(pause_info)
87 3753b2cb Michael Hanselmann
88 3753b2cb Michael Hanselmann
89 f1115454 Guido Trotter
def StartNodeDaemons():
  """Ensure the daemons expected on every node are running.

  """
  # The node daemon is started whether or not this node is the master.
  # confd is started as well; on non-candidates it will be in disabled mode.
  for daemon_name in (constants.NODED, constants.CONFD):
    utils.EnsureDaemon(daemon_name)
97 f1115454 Guido Trotter
98 f1115454 Guido Trotter
99 9e289e36 Guido Trotter
def RunWatcherHooks():
  """Run the watcher hooks.

  Runs everything in the watcher hooks directory through L{utils.RunParts}
  and logs the outcome of each hook. If the directory does not exist nothing
  is done. An exception raised by L{utils.RunParts} is logged and not
  propagated.

  """
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
                             constants.HOOKS_NAME_WATCHER)
  if not os.path.isdir(hooks_dir):
    return

  try:
    results = utils.RunParts(hooks_dir)
  except Exception: # pylint: disable-msg=W0703
    # logging.exception appends the exception information on its own, so the
    # message needs only the directory; a second "%s" without a matching
    # argument would make the log record fail to format
    logging.exception("RunParts %s failed", hooks_dir)
    return

  for (relname, status, runresult) in results:
    if status == constants.RUNPARTS_SKIP:
      logging.debug("Watcher hook %s: skipped", relname)
    elif status == constants.RUNPARTS_ERR:
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
    elif status == constants.RUNPARTS_RUN:
      if runresult.failed:
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
                        relname, runresult.exit_code, runresult.output)
      else:
        logging.debug("Watcher hook %s: success (output: %s)", relname,
                      runresult.output)
126 9e289e36 Guido Trotter
127 001b3825 Michael Hanselmann
128 50273051 Iustin Pop
class NodeMaintenance(object):
129 50273051 Iustin Pop
  """Talks to confd daemons and possible shutdown instances/drbd devices.
130 50273051 Iustin Pop

131 50273051 Iustin Pop
  """
132 50273051 Iustin Pop
  def __init__(self):
133 50273051 Iustin Pop
    self.store_cb = confd_client.StoreResultCallback()
134 50273051 Iustin Pop
    self.filter_cb = confd_client.ConfdFilterCallback(self.store_cb)
135 50273051 Iustin Pop
    self.confd_client = confd_client.GetConfdClient(self.filter_cb)
136 50273051 Iustin Pop
137 50273051 Iustin Pop
  @staticmethod
138 50273051 Iustin Pop
  def ShouldRun():
139 50273051 Iustin Pop
    """Checks whether node maintenance should run.
140 50273051 Iustin Pop

141 50273051 Iustin Pop
    """
142 50273051 Iustin Pop
    try:
143 50273051 Iustin Pop
      return ssconf.SimpleStore().GetMaintainNodeHealth()
144 50273051 Iustin Pop
    except errors.ConfigurationError, err:
145 50273051 Iustin Pop
      logging.error("Configuration error, not activating node maintenance: %s",
146 50273051 Iustin Pop
                    err)
147 50273051 Iustin Pop
      return False
148 50273051 Iustin Pop
149 50273051 Iustin Pop
  @staticmethod
150 50273051 Iustin Pop
  def GetRunningInstances():
151 50273051 Iustin Pop
    """Compute list of hypervisor/running instances.
152 50273051 Iustin Pop

153 50273051 Iustin Pop
    """
154 50273051 Iustin Pop
    hyp_list = ssconf.SimpleStore().GetHypervisorList()
155 50273051 Iustin Pop
    results = []
156 50273051 Iustin Pop
    for hv_name in hyp_list:
157 50273051 Iustin Pop
      try:
158 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
159 50273051 Iustin Pop
        ilist = hv.ListInstances()
160 50273051 Iustin Pop
        results.extend([(iname, hv_name) for iname in ilist])
161 50273051 Iustin Pop
      except: # pylint: disable-msg=W0702
162 50273051 Iustin Pop
        logging.error("Error while listing instances for hypervisor %s",
163 50273051 Iustin Pop
                      hv_name, exc_info=True)
164 50273051 Iustin Pop
    return results
165 50273051 Iustin Pop
166 50273051 Iustin Pop
  @staticmethod
167 50273051 Iustin Pop
  def GetUsedDRBDs():
168 50273051 Iustin Pop
    """Get list of used DRBD minors.
169 50273051 Iustin Pop

170 50273051 Iustin Pop
    """
171 50273051 Iustin Pop
    return bdev.DRBD8.GetUsedDevs().keys()
172 50273051 Iustin Pop
173 50273051 Iustin Pop
  @classmethod
174 50273051 Iustin Pop
  def DoMaintenance(cls, role):
175 50273051 Iustin Pop
    """Maintain the instance list.
176 50273051 Iustin Pop

177 50273051 Iustin Pop
    """
178 50273051 Iustin Pop
    if role == constants.CONFD_NODE_ROLE_OFFLINE:
179 50273051 Iustin Pop
      inst_running = cls.GetRunningInstances()
180 50273051 Iustin Pop
      cls.ShutdownInstances(inst_running)
181 50273051 Iustin Pop
      drbd_running = cls.GetUsedDRBDs()
182 50273051 Iustin Pop
      cls.ShutdownDRBD(drbd_running)
183 50273051 Iustin Pop
    else:
184 50273051 Iustin Pop
      logging.debug("Not doing anything for role %s", role)
185 50273051 Iustin Pop
186 50273051 Iustin Pop
  @staticmethod
187 50273051 Iustin Pop
  def ShutdownInstances(inst_running):
188 50273051 Iustin Pop
    """Shutdown running instances.
189 50273051 Iustin Pop

190 50273051 Iustin Pop
    """
191 50273051 Iustin Pop
    names_running = set([i[0] for i in inst_running])
192 50273051 Iustin Pop
    if names_running:
193 50273051 Iustin Pop
      logging.info("Following instances should not be running,"
194 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(names_running))
195 50273051 Iustin Pop
      # this dictionary will collapse duplicate instance names (only
196 50273051 Iustin Pop
      # xen pvm/vhm) into a single key, which is fine
197 50273051 Iustin Pop
      i2h = dict(inst_running)
198 50273051 Iustin Pop
      for name in names_running:
199 50273051 Iustin Pop
        hv_name = i2h[name]
200 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
201 50273051 Iustin Pop
        hv.StopInstance(None, force=True, name=name)
202 50273051 Iustin Pop
203 50273051 Iustin Pop
  @staticmethod
204 50273051 Iustin Pop
  def ShutdownDRBD(drbd_running):
205 50273051 Iustin Pop
    """Shutdown active DRBD devices.
206 50273051 Iustin Pop

207 50273051 Iustin Pop
    """
208 50273051 Iustin Pop
    if drbd_running:
209 50273051 Iustin Pop
      logging.info("Following DRBD minors should not be active,"
210 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(drbd_running))
211 50273051 Iustin Pop
      for minor in drbd_running:
212 50273051 Iustin Pop
        # pylint: disable-msg=W0212
213 50273051 Iustin Pop
        # using the private method as is, pending enhancements to the DRBD
214 50273051 Iustin Pop
        # interface
215 50273051 Iustin Pop
        bdev.DRBD8._ShutdownAll(minor)
216 50273051 Iustin Pop
217 50273051 Iustin Pop
  def Exec(self):
218 50273051 Iustin Pop
    """Check node status versus cluster desired state.
219 50273051 Iustin Pop

220 50273051 Iustin Pop
    """
221 b705c7a6 Manuel Franceschini
    my_name = netutils.Hostname.GetSysName()
222 50273051 Iustin Pop
    req = confd_client.ConfdClientRequest(type=
223 50273051 Iustin Pop
                                          constants.CONFD_REQ_NODE_ROLE_BYNAME,
224 50273051 Iustin Pop
                                          query=my_name)
225 ebacb943 Iustin Pop
    self.confd_client.SendRequest(req, async=False, coverage=-1)
226 50273051 Iustin Pop
    timed_out, _, _ = self.confd_client.WaitForReply(req.rsalt)
227 50273051 Iustin Pop
    if not timed_out:
228 50273051 Iustin Pop
      # should have a valid response
229 50273051 Iustin Pop
      status, result = self.store_cb.GetResponse(req.rsalt)
230 50273051 Iustin Pop
      assert status, "Missing result but received replies"
231 50273051 Iustin Pop
      if not self.filter_cb.consistent[req.rsalt]:
232 50273051 Iustin Pop
        logging.warning("Inconsistent replies, not doing anything")
233 50273051 Iustin Pop
        return
234 50273051 Iustin Pop
      self.DoMaintenance(result.server_reply.answer)
235 50273051 Iustin Pop
    else:
236 50273051 Iustin Pop
      logging.warning("Confd query timed out, cannot do maintenance actions")
237 50273051 Iustin Pop
238 50273051 Iustin Pop
239 5a3103e9 Michael Hanselmann
class WatcherState(object):
240 a8083063 Iustin Pop
  """Interface to a state file recording restart attempts.
241 a8083063 Iustin Pop

242 a8083063 Iustin Pop
  """
243 001b3825 Michael Hanselmann
  def __init__(self, statefile):
244 5a3103e9 Michael Hanselmann
    """Open, lock, read and parse the file.
245 5a3103e9 Michael Hanselmann

246 001b3825 Michael Hanselmann
    @type statefile: file
247 001b3825 Michael Hanselmann
    @param statefile: State file object
248 5a3103e9 Michael Hanselmann

249 5a3103e9 Michael Hanselmann
    """
250 001b3825 Michael Hanselmann
    self.statefile = statefile
251 a8083063 Iustin Pop
252 5a3103e9 Michael Hanselmann
    try:
253 2c404217 Iustin Pop
      state_data = self.statefile.read()
254 2c404217 Iustin Pop
      if not state_data:
255 2c404217 Iustin Pop
        self._data = {}
256 2c404217 Iustin Pop
      else:
257 2c404217 Iustin Pop
        self._data = serializer.Load(state_data)
258 7260cfbe Iustin Pop
    except Exception, msg: # pylint: disable-msg=W0703
259 5a3103e9 Michael Hanselmann
      # Ignore errors while loading the file and treat it as empty
260 b76f660d Michael Hanselmann
      self._data = {}
261 2c404217 Iustin Pop
      logging.warning(("Invalid state file. Using defaults."
262 438b45d4 Michael Hanselmann
                       " Error message: %s"), msg)
263 5a3103e9 Michael Hanselmann
264 b76f660d Michael Hanselmann
    if "instance" not in self._data:
265 b76f660d Michael Hanselmann
      self._data["instance"] = {}
266 b76f660d Michael Hanselmann
    if "node" not in self._data:
267 b76f660d Michael Hanselmann
      self._data["node"] = {}
268 5a3103e9 Michael Hanselmann
269 26517d45 Iustin Pop
    self._orig_data = serializer.Dump(self._data)
270 2fb96d39 Michael Hanselmann
271 fc428e32 Michael Hanselmann
  def Save(self):
272 fc428e32 Michael Hanselmann
    """Save state to file, then unlock and close it.
273 5a3103e9 Michael Hanselmann

274 5a3103e9 Michael Hanselmann
    """
275 fc428e32 Michael Hanselmann
    assert self.statefile
276 fc428e32 Michael Hanselmann
277 26517d45 Iustin Pop
    serialized_form = serializer.Dump(self._data)
278 26517d45 Iustin Pop
    if self._orig_data == serialized_form:
279 2fb96d39 Michael Hanselmann
      logging.debug("Data didn't change, just touching status file")
280 2fb96d39 Michael Hanselmann
      os.utime(constants.WATCHER_STATEFILE, None)
281 2fb96d39 Michael Hanselmann
      return
282 2fb96d39 Michael Hanselmann
283 fc428e32 Michael Hanselmann
    # We need to make sure the file is locked before renaming it, otherwise
284 fc428e32 Michael Hanselmann
    # starting ganeti-watcher again at the same time will create a conflict.
285 fc428e32 Michael Hanselmann
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
286 26517d45 Iustin Pop
                         data=serialized_form,
287 eb0f0ce0 Michael Hanselmann
                         prewrite=utils.LockFile, close=False)
288 fc428e32 Michael Hanselmann
    self.statefile = os.fdopen(fd, 'w+')
289 5a3103e9 Michael Hanselmann
290 fc428e32 Michael Hanselmann
  def Close(self):
291 5a3103e9 Michael Hanselmann
    """Unlock configuration file and close it.
292 5a3103e9 Michael Hanselmann

293 5a3103e9 Michael Hanselmann
    """
294 5a3103e9 Michael Hanselmann
    assert self.statefile
295 5a3103e9 Michael Hanselmann
296 fc428e32 Michael Hanselmann
    # Files are automatically unlocked when closing them
297 5a3103e9 Michael Hanselmann
    self.statefile.close()
298 5a3103e9 Michael Hanselmann
    self.statefile = None
299 5a3103e9 Michael Hanselmann
300 5a3103e9 Michael Hanselmann
  def GetNodeBootID(self, name):
301 5a3103e9 Michael Hanselmann
    """Returns the last boot ID of a node or None.
302 a8083063 Iustin Pop

303 5a3103e9 Michael Hanselmann
    """
304 b76f660d Michael Hanselmann
    ndata = self._data["node"]
305 5a3103e9 Michael Hanselmann
306 7b195d9b Michael Hanselmann
    if name in ndata and KEY_BOOT_ID in ndata[name]:
307 7b195d9b Michael Hanselmann
      return ndata[name][KEY_BOOT_ID]
308 5a3103e9 Michael Hanselmann
    return None
309 5a3103e9 Michael Hanselmann
310 5a3103e9 Michael Hanselmann
  def SetNodeBootID(self, name, bootid):
311 5a3103e9 Michael Hanselmann
    """Sets the boot ID of a node.
312 5a3103e9 Michael Hanselmann

313 5a3103e9 Michael Hanselmann
    """
314 5a3103e9 Michael Hanselmann
    assert bootid
315 a8083063 Iustin Pop
316 b76f660d Michael Hanselmann
    ndata = self._data["node"]
317 a8083063 Iustin Pop
318 5a3103e9 Michael Hanselmann
    if name not in ndata:
319 5a3103e9 Michael Hanselmann
      ndata[name] = {}
320 5a3103e9 Michael Hanselmann
321 7b195d9b Michael Hanselmann
    ndata[name][KEY_BOOT_ID] = bootid
322 5a3103e9 Michael Hanselmann
323 5a3103e9 Michael Hanselmann
  def NumberOfRestartAttempts(self, instance):
324 a8083063 Iustin Pop
    """Returns number of previous restart attempts.
325 a8083063 Iustin Pop

326 c41eea6e Iustin Pop
    @type instance: L{Instance}
327 c41eea6e Iustin Pop
    @param instance: the instance to look up
328 38242904 Iustin Pop

329 a8083063 Iustin Pop
    """
330 b76f660d Michael Hanselmann
    idata = self._data["instance"]
331 a8083063 Iustin Pop
332 5a3103e9 Michael Hanselmann
    if instance.name in idata:
333 7b195d9b Michael Hanselmann
      return idata[instance.name][KEY_RESTART_COUNT]
334 a8083063 Iustin Pop
335 a8083063 Iustin Pop
    return 0
336 a8083063 Iustin Pop
337 f5116c87 Iustin Pop
  def MaintainInstanceList(self, instances):
338 f5116c87 Iustin Pop
    """Perform maintenance on the recorded instances.
339 f5116c87 Iustin Pop

340 f5116c87 Iustin Pop
    @type instances: list of string
341 f5116c87 Iustin Pop
    @param instances: the list of currently existing instances
342 f5116c87 Iustin Pop

343 f5116c87 Iustin Pop
    """
344 f5116c87 Iustin Pop
    idict = self._data["instance"]
345 f5116c87 Iustin Pop
    # First, delete obsolete instances
346 f5116c87 Iustin Pop
    obsolete_instances = set(idict).difference(instances)
347 f5116c87 Iustin Pop
    for inst in obsolete_instances:
348 f5116c87 Iustin Pop
      logging.debug("Forgetting obsolete instance %s", inst)
349 f5116c87 Iustin Pop
      del idict[inst]
350 f5116c87 Iustin Pop
351 f5116c87 Iustin Pop
    # Second, delete expired records
352 f5116c87 Iustin Pop
    earliest = time.time() - RETRY_EXPIRATION
353 f5116c87 Iustin Pop
    expired_instances = [i for i in idict
354 f5116c87 Iustin Pop
                         if idict[i][KEY_RESTART_WHEN] < earliest]
355 f5116c87 Iustin Pop
    for inst in expired_instances:
356 f5116c87 Iustin Pop
      logging.debug("Expiring record for instance %s", inst)
357 f5116c87 Iustin Pop
      del idict[inst]
358 f5116c87 Iustin Pop
359 5a3103e9 Michael Hanselmann
  def RecordRestartAttempt(self, instance):
360 a8083063 Iustin Pop
    """Record a restart attempt.
361 a8083063 Iustin Pop

362 c41eea6e Iustin Pop
    @type instance: L{Instance}
363 c41eea6e Iustin Pop
    @param instance: the instance being restarted
364 38242904 Iustin Pop

365 a8083063 Iustin Pop
    """
366 b76f660d Michael Hanselmann
    idata = self._data["instance"]
367 a8083063 Iustin Pop
368 5a3103e9 Michael Hanselmann
    if instance.name not in idata:
369 5a3103e9 Michael Hanselmann
      inst = idata[instance.name] = {}
370 5a3103e9 Michael Hanselmann
    else:
371 5a3103e9 Michael Hanselmann
      inst = idata[instance.name]
372 a8083063 Iustin Pop
373 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_WHEN] = time.time()
374 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
375 a8083063 Iustin Pop
376 5a3103e9 Michael Hanselmann
  def RemoveInstance(self, instance):
377 c41eea6e Iustin Pop
    """Update state to reflect that a machine is running.
378 a8083063 Iustin Pop

379 c41eea6e Iustin Pop
    This method removes the record for a named instance (as we only
380 c41eea6e Iustin Pop
    track down instances).
381 a8083063 Iustin Pop

382 c41eea6e Iustin Pop
    @type instance: L{Instance}
383 c41eea6e Iustin Pop
    @param instance: the instance to remove from books
384 38242904 Iustin Pop

385 a8083063 Iustin Pop
    """
386 b76f660d Michael Hanselmann
    idata = self._data["instance"]
387 a8083063 Iustin Pop
388 5a3103e9 Michael Hanselmann
    if instance.name in idata:
389 5a3103e9 Michael Hanselmann
      del idata[instance.name]
390 a8083063 Iustin Pop
391 a8083063 Iustin Pop
392 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  """
  def __init__(self, name, state, autostart, snodes):
    """Store the instance properties.

    @param name: instance name
    @param state: instance status
    @param autostart: whether the instance is meant to be started
    @param snodes: secondary nodes of the instance

    """
    self.name = name
    self.state = state
    self.autostart = autostart
    self.snodes = snodes

  def Restart(self):
    """Encapsulates the start of an instance.

    Submits an instance startup opcode through the global LUXI client.

    """
    startup_op = opcodes.OpInstanceStartup(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    Submits a disk activation opcode through the global LUXI client.

    """
    activate_op = opcodes.OpInstanceActivateDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
415 a8083063 Iustin Pop
416 a8083063 Iustin Pop
417 6dfcc47b Iustin Pop
def GetClusterData():
  """Query the cluster for instance and node data.

  Both queries are submitted as one job through the global LUXI client. As
  side effects the instance status file (L{constants.INSTANCE_UPFILE}) is
  rewritten with one "name status" line per instance, and the job is
  archived once its results have been processed.

  @rtype: tuple
  @return: (instances, nodes, smap) where instances maps instance names to
      L{Instance} objects, nodes maps node names to (bootid, offline) tuples
      and smap maps secondary node names to lists of instance names

  """
  inst_fields = ["name", "status", "admin_state", "snodes"]
  inst_query = opcodes.OpInstanceQuery(output_fields=inst_fields, names=[],
                                       use_locking=True)
  node_fields = ["name", "bootid", "offline"]
  node_query = opcodes.OpNodeQuery(output_fields=node_fields, names=[],
                                   use_locking=True)

  job_id = client.SubmitJob([inst_query, node_query])

  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  inst_rows = all_results[0]

  # write the upfile
  up_lines = ["%s %s\n" % (row[0], row[1]) for row in inst_rows]
  utils.WriteFile(file_name=constants.INSTANCE_UPFILE, data="".join(up_lines))

  smap = {}
  instances = {}
  for row in inst_rows:
    # the "admin_state" field is what Instance calls autostart
    (name, status, autostart, snodes) = row

    # update the secondary node map
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart, snodes)

  nodes = {}
  for (name, bootid, offline) in all_results[1]:
    nodes[name] = (bootid, offline)

  client.ArchiveJob(job_id)

  return instances, nodes, smap
460 a8083063 Iustin Pop
461 a8083063 Iustin Pop
462 5a3103e9 Michael Hanselmann
class Watcher(object):
463 55c85950 Iustin Pop
  """Encapsulate the logic for restarting erroneously halted virtual machines.
464 a8083063 Iustin Pop

465 a8083063 Iustin Pop
  The calling program should periodically instantiate me and call Run().
466 a8083063 Iustin Pop
  This will traverse the list of instances, and make up to MAXTRIES attempts
467 a8083063 Iustin Pop
  to restart machines that are down.
468 38242904 Iustin Pop

469 a8083063 Iustin Pop
  """
470 cc962d58 Iustin Pop
  def __init__(self, opts, notepad):
    """Verify we run on the master, archive old jobs and load cluster data.

    @param opts: options object; its C{job_age} attribute is passed to
        L{ArchiveJobs}
    @param notepad: state object handed to the checks done by L{Run}
    @raise NotMasterError: if this host is not the configured master node

    """
    self.notepad = notepad
    master_name = client.QueryConfigValues(["master_node"])[0]
    if master_name != netutils.Hostname.GetSysName():
      raise NotMasterError("This is not the master node")
    # first archive old jobs
    self.ArchiveJobs(opts.job_age)
    # and only then submit new ones
    (self.instances, self.bootids, self.smap) = GetClusterData()
    self.started_instances = set()
    self.opts = opts
481 a8083063 Iustin Pop
482 a8083063 Iustin Pop
  def Run(self):
    """Watcher run sequence.

    Checks instances first, then disks, and finally verifies disks.

    """
    state = self.notepad
    self.CheckInstances(state)
    self.CheckDisks(state)
    self.VerifyDisks()
490 5a3103e9 Michael Hanselmann
491 24edc6d4 Iustin Pop
  @staticmethod
  def ArchiveJobs(age):
    """Archive old jobs.

    @param age: value passed on to the LUXI client's C{AutoArchiveJobs}

    """
    (archived, remaining) = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", archived, remaining)
498 f07521e5 Iustin Pop
499 5a3103e9 Michael Hanselmann
  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    A node counts as restarted when its current boot ID differs from the one
    recorded in the notepad. For every such node the disks of all instances
    using it as secondary are activated, unless the instance is not marked
    for autostart or was already started during this run. Afterwards the
    new boot IDs are recorded.

    @param notepad: state object providing C{GetNodeBootID} and
        C{SetNodeBootID}

    """
    rebooted = []
    for name, (new_id, offline) in self.bootids.iteritems():
      old_id = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
      elif old_id != new_id:
        # Node's boot ID has changed, probably through a reboot.
        rebooted.append(name)

    if not rebooted:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for node in rebooted:
      for instance_name in self.smap.get(node, []):
        instance = self.instances[instance_name]
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception: # pylint: disable-msg=W0703
          logging.exception("Error while activating disks for instance %s",
                            instance.name)

    # Keep changed boot IDs
    for name in rebooted:
      notepad.SetNodeBootID(name, self.bootids[name][0])
542 a8083063 Iustin Pop
543 5a3103e9 Michael Hanselmann
  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Instances in one of L{BAD_STATES} are restarted as long as fewer than
    L{MAXTRIES} attempts are recorded. Every attempt is recorded, whether or
    not submitting the restart succeeded. For instances in any other state
    the recorded attempts are forgotten.

    @param notepad: state object recording the restart attempts

    """
    notepad.MaintainInstanceList(self.instances.keys())

    for instance in self.instances.values():
      if instance.state not in BAD_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          # for helpless states the record is dropped silently
          if instance.state not in HELPLESS_STATES:
            logging.info("Restart of %s succeeded", instance.name)
        continue

      attempts = notepad.NumberOfRestartAttempts(instance)

      if attempts > MAXTRIES:
        logging.warning("Not restarting instance %s, retries exhausted",
                        instance.name)
        continue

      if attempts >= MAXTRIES:
        # Exactly MAXTRIES attempts so far: record one more, so that the next
        # run takes the "retries exhausted" branch above
        notepad.RecordRestartAttempt(instance)
        logging.error("Could not restart %s after %d attempts, giving up",
                      instance.name, MAXTRIES)
        continue

      attempt_note = " (Attempt #%d)" % (attempts + 1)
      try:
        logging.info("Restarting %s%s", instance.name, attempt_note)
        instance.Restart()
        self.started_instances.add(instance.name)
      except Exception: # pylint: disable-msg=W0703
        logging.exception("Error while restarting instance %s",
                          instance.name)

      notepad.RecordRestartAttempt(instance)
581 a8083063 Iustin Pop
582 83e5e26f René Nussbaumer
  def _CheckForOfflineNodes(self, instance):
    """Checks whether any secondary node of an instance is offline.

    The entries of C{self.bootids} for all of the instance's secondary nodes
    are looked up first; each entry is unpacked as a pair whose second item
    is the offline flag.

    @param instance: The instance object
    @return: True if any of the secondary nodes is offline, False otherwise

    """
    node_entries = [self.bootids[node_name] for node_name in instance.snodes]

    return compat.any(is_offline for (_, is_offline) in node_entries)
594 83e5e26f René Nussbaumer
595 83e5e26f René Nussbaumer
  def VerifyDisks(self):
    """Run gnt-cluster verify-disks and activate disks found offline.

    A verify-disks job is submitted, waited for and archived. The second
    item of its result is taken as the names of instances with offline
    disks. For those, a single job with one activate-disks opcode per
    instance is submitted and waited for. Instances are left out if they
    are unknown to the watcher, are in a helpless state or have an offline
    secondary node.

    """
    op = opcodes.OpClusterVerifyDisks()
    job_id = client.SubmitJob([op])
    result = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(job_id)
    if not isinstance(result, (tuple, list)):
      logging.error("Can't get a valid result from verify-disks")
      return
    offline_disk_instances = result[1]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  utils.CommaJoin(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = []
    for name in offline_disk_instances:
      try:
        instance = self.instances[name]
      except KeyError:
        # verify-disks can report an instance which is missing from the
        # watcher's instance map; skip it instead of aborting the whole run
        logging.info("Skip instance %s because it is not known to the"
                     " watcher", name)
        continue

      if (instance.state in HELPLESS_STATES or
          self._CheckForOfflineNodes(instance)):
        logging.info("Skip instance %s because it is in helpless state or has"
                     " one offline secondary", name)
        continue
      job.append(opcodes.OpInstanceActivateDisks(instance_name=name))

    if job:
      job_id = cli.SendJob(job, cl=client)

      # A failed activation is logged only; it must not stop the watcher
      try:
        cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
      except Exception: # pylint: disable-msg=W0703
        logging.exception("Error while activating disks")
631 a8083063 Iustin Pop
632 a8083063 Iustin Pop
633 001b3825 Michael Hanselmann
def OpenStateFile(path):
634 001b3825 Michael Hanselmann
  """Opens the state file and acquires a lock on it.
635 001b3825 Michael Hanselmann

636 001b3825 Michael Hanselmann
  @type path: string
637 001b3825 Michael Hanselmann
  @param path: Path to state file
638 001b3825 Michael Hanselmann

639 001b3825 Michael Hanselmann
  """
640 001b3825 Michael Hanselmann
  # The two-step dance below is necessary to allow both opening existing
641 001b3825 Michael Hanselmann
  # file read/write and creating if not existing. Vanilla open will truncate
642 001b3825 Michael Hanselmann
  # an existing file -or- allow creating if not existing.
643 001b3825 Michael Hanselmann
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)
644 001b3825 Michael Hanselmann
645 001b3825 Michael Hanselmann
  # Try to acquire lock on state file. If this fails, another watcher instance
646 001b3825 Michael Hanselmann
  # might already be running or another program is temporarily blocking the
647 001b3825 Michael Hanselmann
  # watcher from running.
648 001b3825 Michael Hanselmann
  try:
649 001b3825 Michael Hanselmann
    utils.LockFile(statefile_fd)
650 001b3825 Michael Hanselmann
  except errors.LockError, err:
651 001b3825 Michael Hanselmann
    logging.error("Can't acquire lock on state file %s: %s", path, err)
652 001b3825 Michael Hanselmann
    return None
653 001b3825 Michael Hanselmann
654 001b3825 Michael Hanselmann
  return os.fdopen(statefile_fd, "w+")
655 001b3825 Michael Hanselmann
656 001b3825 Michael Hanselmann
657 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
658 db147305 Tom Limoncelli
  """Connects to RAPI port and does a simple test.
659 db147305 Tom Limoncelli

660 db147305 Tom Limoncelli
  Connects to RAPI port of hostname and does a simple test. At this time, the
661 db147305 Tom Limoncelli
  test is GetVersion.
662 db147305 Tom Limoncelli

663 db147305 Tom Limoncelli
  @type hostname: string
664 db147305 Tom Limoncelli
  @param hostname: hostname of the node to connect to.
665 db147305 Tom Limoncelli
  @rtype: bool
666 db147305 Tom Limoncelli
  @return: Whether RAPI is working properly
667 db147305 Tom Limoncelli

668 db147305 Tom Limoncelli
  """
669 34f06005 Iustin Pop
  curl_config = rapi.client.GenericCurlConfig()
670 2a7c3583 Michael Hanselmann
  rapi_client = rapi.client.GanetiRapiClient(hostname,
671 2a7c3583 Michael Hanselmann
                                             curl_config_fn=curl_config)
672 db147305 Tom Limoncelli
  try:
673 db147305 Tom Limoncelli
    master_version = rapi_client.GetVersion()
674 db147305 Tom Limoncelli
  except rapi.client.CertificateError, err:
675 db147305 Tom Limoncelli
    logging.warning("RAPI Error: CertificateError (%s)", err)
676 db147305 Tom Limoncelli
    return False
677 db147305 Tom Limoncelli
  except rapi.client.GanetiApiError, err:
678 db147305 Tom Limoncelli
    logging.warning("RAPI Error: GanetiApiError (%s)", err)
679 db147305 Tom Limoncelli
    return False
680 db147305 Tom Limoncelli
  logging.debug("RAPI Result: master_version is %s", master_version)
681 db147305 Tom Limoncelli
  return master_version == constants.RAPI_VERSION
682 db147305 Tom Limoncelli
683 db147305 Tom Limoncelli
684 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  Besides the debug option, the job age (C{-A}/C{--job-age}, converted using
  C{cli.ParseTimespec}) and C{--ignore-pause} are accepted. Positional
  arguments are reported as an error through the parser.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  job_age_help = "Autoarchive jobs older than this age (default 6 hours)"

  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option(cli.DEBUG_OPT)
  parser.add_option("-A", "--job-age", dest="job_age", default=6 * 3600,
                    help=job_age_help)
  parser.add_option("--ignore-pause", dest="ignore_pause", default=False,
                    action="store_true", help="Ignore cluster pause setting")

  (options, args) = parser.parse_args()

  # The default as well as a user-supplied value go through the parser
  options.job_age = cli.ParseTimespec(options.job_age)

  if args:
    parser.error("No arguments expected")

  return (options, args)
708 a8083063 Iustin Pop
709 a8083063 Iustin Pop
710 2a7c3583 Michael Hanselmann
@rapi.client.UsesRapiClient
711 9f4bb951 Michael Hanselmann
def Main():
712 a8083063 Iustin Pop
  """Main function.
713 a8083063 Iustin Pop

714 a8083063 Iustin Pop
  """
715 7260cfbe Iustin Pop
  global client # pylint: disable-msg=W0603
716 e125c67c Michael Hanselmann
717 f0a80b01 Michael Hanselmann
  (options, _) = ParseOptions()
718 a8083063 Iustin Pop
719 cfcc79c6 Michael Hanselmann
  utils.SetupLogging(constants.LOG_WATCHER, sys.argv[0],
720 cfcc79c6 Michael Hanselmann
                     debug=options.debug, stderr_logging=options.debug)
721 a8083063 Iustin Pop
722 46c8a6ab Iustin Pop
  if ShouldPause() and not options.ignore_pause:
723 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
724 9f4bb951 Michael Hanselmann
    return constants.EXIT_SUCCESS
725 3753b2cb Michael Hanselmann
726 001b3825 Michael Hanselmann
  statefile = OpenStateFile(constants.WATCHER_STATEFILE)
727 001b3825 Michael Hanselmann
  if not statefile:
728 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
729 001b3825 Michael Hanselmann
730 24edc6d4 Iustin Pop
  update_file = False
731 a8083063 Iustin Pop
  try:
732 f1115454 Guido Trotter
    StartNodeDaemons()
733 9e289e36 Guido Trotter
    RunWatcherHooks()
734 50273051 Iustin Pop
    # run node maintenance in all cases, even if master, so that old
735 50273051 Iustin Pop
    # masters can be properly cleaned up too
736 50273051 Iustin Pop
    if NodeMaintenance.ShouldRun():
737 50273051 Iustin Pop
      NodeMaintenance().Exec()
738 c4f0219c Iustin Pop
739 001b3825 Michael Hanselmann
    notepad = WatcherState(statefile)
740 781b2b2b Michael Hanselmann
    try:
741 2c404217 Iustin Pop
      try:
742 2c404217 Iustin Pop
        client = cli.GetClient()
743 2c404217 Iustin Pop
      except errors.OpPrereqError:
744 2c404217 Iustin Pop
        # this is, from cli.GetClient, a not-master case
745 7dfb83c2 Iustin Pop
        logging.debug("Not on master, exiting")
746 24edc6d4 Iustin Pop
        update_file = True
747 9f4bb951 Michael Hanselmann
        return constants.EXIT_SUCCESS
748 7dfb83c2 Iustin Pop
      except luxi.NoMasterError, err:
749 7dfb83c2 Iustin Pop
        logging.warning("Master seems to be down (%s), trying to restart",
750 7dfb83c2 Iustin Pop
                        str(err))
751 2826b361 Guido Trotter
        if not utils.EnsureDaemon(constants.MASTERD):
752 7dfb83c2 Iustin Pop
          logging.critical("Can't start the master, exiting")
753 9f4bb951 Michael Hanselmann
          return constants.EXIT_FAILURE
754 7dfb83c2 Iustin Pop
        # else retry the connection
755 7dfb83c2 Iustin Pop
        client = cli.GetClient()
756 cc962d58 Iustin Pop
757 83052f9e Guido Trotter
      # we are on master now
758 2826b361 Guido Trotter
      utils.EnsureDaemon(constants.RAPI)
759 c4f0219c Iustin Pop
760 db147305 Tom Limoncelli
      # If RAPI isn't responding to queries, try one restart.
761 db147305 Tom Limoncelli
      logging.debug("Attempting to talk with RAPI.")
762 9769bb78 Manuel Franceschini
      if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
763 db147305 Tom Limoncelli
        logging.warning("Couldn't get answer from Ganeti RAPI daemon."
764 db147305 Tom Limoncelli
                        " Restarting Ganeti RAPI.")
765 db147305 Tom Limoncelli
        utils.StopDaemon(constants.RAPI)
766 db147305 Tom Limoncelli
        utils.EnsureDaemon(constants.RAPI)
767 db147305 Tom Limoncelli
        logging.debug("Second attempt to talk with RAPI")
768 9769bb78 Manuel Franceschini
        if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
769 db147305 Tom Limoncelli
          logging.fatal("RAPI is not responding. Please investigate.")
770 db147305 Tom Limoncelli
      logging.debug("Successfully talked to RAPI.")
771 db147305 Tom Limoncelli
772 cc962d58 Iustin Pop
      try:
773 cc962d58 Iustin Pop
        watcher = Watcher(options, notepad)
774 cc962d58 Iustin Pop
      except errors.ConfigurationError:
775 cc962d58 Iustin Pop
        # Just exit if there's no configuration
776 24edc6d4 Iustin Pop
        update_file = True
777 9f4bb951 Michael Hanselmann
        return constants.EXIT_SUCCESS
778 e125c67c Michael Hanselmann
779 cc962d58 Iustin Pop
      watcher.Run()
780 24edc6d4 Iustin Pop
      update_file = True
781 24edc6d4 Iustin Pop
782 cc962d58 Iustin Pop
    finally:
783 7dfb83c2 Iustin Pop
      if update_file:
784 7dfb83c2 Iustin Pop
        notepad.Save()
785 7dfb83c2 Iustin Pop
      else:
786 7dfb83c2 Iustin Pop
        logging.debug("Not updating status file due to failure")
787 1b052f42 Michael Hanselmann
  except SystemExit:
788 1b052f42 Michael Hanselmann
    raise
789 38242904 Iustin Pop
  except NotMasterError:
790 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
791 9f4bb951 Michael Hanselmann
    return constants.EXIT_NOTMASTER
792 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
793 438b45d4 Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
794 9f4bb951 Michael Hanselmann
    return constants.EXIT_NODESETUP_ERROR
795 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
796 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
797 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
798 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
799 438b45d4 Michael Hanselmann
  except Exception, err:
800 001b3825 Michael Hanselmann
    logging.exception(str(err))
801 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
802 5a3103e9 Michael Hanselmann
803 9f4bb951 Michael Hanselmann
  return constants.EXIT_SUCCESS