Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / __init__.py @ 4b163794

History | View | Annotate | Download (24.7 kB)

1 9f4bb951 Michael Hanselmann
#
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 f2af0bec Iustin Pop
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop

24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop

28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 7260cfbe Iustin Pop
# pylint: disable-msg=C0103,W0142
31 7260cfbe Iustin Pop
32 7260cfbe Iustin Pop
# C0103: Invalid name ganeti-watcher
33 7260cfbe Iustin Pop
34 a8083063 Iustin Pop
import os
35 a8083063 Iustin Pop
import sys
36 a8083063 Iustin Pop
import time
37 438b45d4 Michael Hanselmann
import logging
38 a8083063 Iustin Pop
from optparse import OptionParser
39 a8083063 Iustin Pop
40 a8083063 Iustin Pop
from ganeti import utils
41 a8083063 Iustin Pop
from ganeti import constants
42 83e5e26f René Nussbaumer
from ganeti import compat
43 67fe61c4 Michael Hanselmann
from ganeti import serializer
44 89e1fc26 Iustin Pop
from ganeti import errors
45 e125c67c Michael Hanselmann
from ganeti import opcodes
46 e125c67c Michael Hanselmann
from ganeti import cli
47 7dfb83c2 Iustin Pop
from ganeti import luxi
48 50273051 Iustin Pop
from ganeti import ssconf
49 50273051 Iustin Pop
from ganeti import bdev
50 50273051 Iustin Pop
from ganeti import hypervisor
51 db147305 Tom Limoncelli
from ganeti import rapi
52 50273051 Iustin Pop
from ganeti.confd import client as confd_client
53 a744b676 Manuel Franceschini
from ganeti import netutils
54 a8083063 Iustin Pop
55 db147305 Tom Limoncelli
import ganeti.rapi.client # pylint: disable-msg=W0611
56 db147305 Tom Limoncelli
57 a8083063 Iustin Pop
58 5a3103e9 Michael Hanselmann
# Number of restart attempts made for an instance before giving up
MAXTRIES = 5

# Restart records older than eight hours are dropped. With the retry
# counter at 5 and the watcher running every 5 minutes it takes about
# half an hour to exhaust the retries, so eight hours (16 * 1/2h) seems
# like a reasonable time after which to start over.
RETRY_EXPIRATION = 8 * 60 * 60

# Instance states for which a restart is attempted
BAD_STATES = ["ERROR_down"]
# Instance states for which only the restart bookkeeping is dropped
HELPLESS_STATES = ["ERROR_nodedown", "ERROR_nodeoffline"]

NOTICE = "NOTICE"
ERROR = "ERROR"

# Keys of the per-instance and per-node records kept in the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object
client = None
75 e125c67c Michael Hanselmann
76 e125c67c Michael Hanselmann
77 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Raised when the watcher runs on a node that is not the master.

  """
79 a8083063 Iustin Pop
80 a8083063 Iustin Pop
81 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher is currently paused.

  @rtype: bool
  @return: truth value of what L{utils.ReadWatcherPauseFile} returns for
      the watcher pause file

  """
  pause_info = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  return bool(pause_info)
86 3753b2cb Michael Hanselmann
87 3753b2cb Michael Hanselmann
88 f1115454 Guido Trotter
def StartNodeDaemons():
  """Ensure the daemons that belong on every node are running.

  """
  # The node daemon is wanted whether or not this is the master; confd is
  # started as well and will be in disabled mode on non-candidates.
  for daemon_name in (constants.NODED, constants.CONFD):
    utils.EnsureDaemon(daemon_name)
96 f1115454 Guido Trotter
97 f1115454 Guido Trotter
98 9e289e36 Guido Trotter
def RunWatcherHooks():
99 9e289e36 Guido Trotter
  """Run the watcher hooks.
100 9e289e36 Guido Trotter

101 9e289e36 Guido Trotter
  """
102 c4feafe8 Iustin Pop
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
103 c4feafe8 Iustin Pop
                             constants.HOOKS_NAME_WATCHER)
104 10e689d4 Iustin Pop
  if not os.path.isdir(hooks_dir):
105 10e689d4 Iustin Pop
    return
106 9e289e36 Guido Trotter
107 9e289e36 Guido Trotter
  try:
108 9e289e36 Guido Trotter
    results = utils.RunParts(hooks_dir)
109 9e289e36 Guido Trotter
  except Exception, msg: # pylint: disable-msg=W0703
110 9e289e36 Guido Trotter
    logging.critical("RunParts %s failed: %s", hooks_dir, msg)
111 9e289e36 Guido Trotter
112 9e289e36 Guido Trotter
  for (relname, status, runresult) in results:
113 9e289e36 Guido Trotter
    if status == constants.RUNPARTS_SKIP:
114 9e289e36 Guido Trotter
      logging.debug("Watcher hook %s: skipped", relname)
115 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_ERR:
116 9e289e36 Guido Trotter
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
117 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_RUN:
118 9e289e36 Guido Trotter
      if runresult.failed:
119 9e289e36 Guido Trotter
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
120 9e289e36 Guido Trotter
                        relname, runresult.exit_code, runresult.output)
121 9e289e36 Guido Trotter
      else:
122 9e289e36 Guido Trotter
        logging.debug("Watcher hook %s: success (output: %s)", relname,
123 9e289e36 Guido Trotter
                      runresult.output)
124 9e289e36 Guido Trotter
125 001b3825 Michael Hanselmann
126 50273051 Iustin Pop
class NodeMaintenance(object):
127 50273051 Iustin Pop
  """Talks to confd daemons and possible shutdown instances/drbd devices.
128 50273051 Iustin Pop

129 50273051 Iustin Pop
  """
130 50273051 Iustin Pop
  def __init__(self):
131 50273051 Iustin Pop
    self.store_cb = confd_client.StoreResultCallback()
132 50273051 Iustin Pop
    self.filter_cb = confd_client.ConfdFilterCallback(self.store_cb)
133 50273051 Iustin Pop
    self.confd_client = confd_client.GetConfdClient(self.filter_cb)
134 50273051 Iustin Pop
135 50273051 Iustin Pop
  @staticmethod
136 50273051 Iustin Pop
  def ShouldRun():
137 50273051 Iustin Pop
    """Checks whether node maintenance should run.
138 50273051 Iustin Pop

139 50273051 Iustin Pop
    """
140 50273051 Iustin Pop
    try:
141 50273051 Iustin Pop
      return ssconf.SimpleStore().GetMaintainNodeHealth()
142 50273051 Iustin Pop
    except errors.ConfigurationError, err:
143 50273051 Iustin Pop
      logging.error("Configuration error, not activating node maintenance: %s",
144 50273051 Iustin Pop
                    err)
145 50273051 Iustin Pop
      return False
146 50273051 Iustin Pop
147 50273051 Iustin Pop
  @staticmethod
148 50273051 Iustin Pop
  def GetRunningInstances():
149 50273051 Iustin Pop
    """Compute list of hypervisor/running instances.
150 50273051 Iustin Pop

151 50273051 Iustin Pop
    """
152 50273051 Iustin Pop
    hyp_list = ssconf.SimpleStore().GetHypervisorList()
153 50273051 Iustin Pop
    results = []
154 50273051 Iustin Pop
    for hv_name in hyp_list:
155 50273051 Iustin Pop
      try:
156 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
157 50273051 Iustin Pop
        ilist = hv.ListInstances()
158 50273051 Iustin Pop
        results.extend([(iname, hv_name) for iname in ilist])
159 50273051 Iustin Pop
      except: # pylint: disable-msg=W0702
160 50273051 Iustin Pop
        logging.error("Error while listing instances for hypervisor %s",
161 50273051 Iustin Pop
                      hv_name, exc_info=True)
162 50273051 Iustin Pop
    return results
163 50273051 Iustin Pop
164 50273051 Iustin Pop
  @staticmethod
165 50273051 Iustin Pop
  def GetUsedDRBDs():
166 50273051 Iustin Pop
    """Get list of used DRBD minors.
167 50273051 Iustin Pop

168 50273051 Iustin Pop
    """
169 50273051 Iustin Pop
    return bdev.DRBD8.GetUsedDevs().keys()
170 50273051 Iustin Pop
171 50273051 Iustin Pop
  @classmethod
172 50273051 Iustin Pop
  def DoMaintenance(cls, role):
173 50273051 Iustin Pop
    """Maintain the instance list.
174 50273051 Iustin Pop

175 50273051 Iustin Pop
    """
176 50273051 Iustin Pop
    if role == constants.CONFD_NODE_ROLE_OFFLINE:
177 50273051 Iustin Pop
      inst_running = cls.GetRunningInstances()
178 50273051 Iustin Pop
      cls.ShutdownInstances(inst_running)
179 50273051 Iustin Pop
      drbd_running = cls.GetUsedDRBDs()
180 50273051 Iustin Pop
      cls.ShutdownDRBD(drbd_running)
181 50273051 Iustin Pop
    else:
182 50273051 Iustin Pop
      logging.debug("Not doing anything for role %s", role)
183 50273051 Iustin Pop
184 50273051 Iustin Pop
  @staticmethod
185 50273051 Iustin Pop
  def ShutdownInstances(inst_running):
186 50273051 Iustin Pop
    """Shutdown running instances.
187 50273051 Iustin Pop

188 50273051 Iustin Pop
    """
189 50273051 Iustin Pop
    names_running = set([i[0] for i in inst_running])
190 50273051 Iustin Pop
    if names_running:
191 50273051 Iustin Pop
      logging.info("Following instances should not be running,"
192 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(names_running))
193 50273051 Iustin Pop
      # this dictionary will collapse duplicate instance names (only
194 50273051 Iustin Pop
      # xen pvm/vhm) into a single key, which is fine
195 50273051 Iustin Pop
      i2h = dict(inst_running)
196 50273051 Iustin Pop
      for name in names_running:
197 50273051 Iustin Pop
        hv_name = i2h[name]
198 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
199 50273051 Iustin Pop
        hv.StopInstance(None, force=True, name=name)
200 50273051 Iustin Pop
201 50273051 Iustin Pop
  @staticmethod
202 50273051 Iustin Pop
  def ShutdownDRBD(drbd_running):
203 50273051 Iustin Pop
    """Shutdown active DRBD devices.
204 50273051 Iustin Pop

205 50273051 Iustin Pop
    """
206 50273051 Iustin Pop
    if drbd_running:
207 50273051 Iustin Pop
      logging.info("Following DRBD minors should not be active,"
208 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(drbd_running))
209 50273051 Iustin Pop
      for minor in drbd_running:
210 50273051 Iustin Pop
        # pylint: disable-msg=W0212
211 50273051 Iustin Pop
        # using the private method as is, pending enhancements to the DRBD
212 50273051 Iustin Pop
        # interface
213 50273051 Iustin Pop
        bdev.DRBD8._ShutdownAll(minor)
214 50273051 Iustin Pop
215 50273051 Iustin Pop
  def Exec(self):
216 50273051 Iustin Pop
    """Check node status versus cluster desired state.
217 50273051 Iustin Pop

218 50273051 Iustin Pop
    """
219 b705c7a6 Manuel Franceschini
    my_name = netutils.Hostname.GetSysName()
220 50273051 Iustin Pop
    req = confd_client.ConfdClientRequest(type=
221 50273051 Iustin Pop
                                          constants.CONFD_REQ_NODE_ROLE_BYNAME,
222 50273051 Iustin Pop
                                          query=my_name)
223 ebacb943 Iustin Pop
    self.confd_client.SendRequest(req, async=False, coverage=-1)
224 50273051 Iustin Pop
    timed_out, _, _ = self.confd_client.WaitForReply(req.rsalt)
225 50273051 Iustin Pop
    if not timed_out:
226 50273051 Iustin Pop
      # should have a valid response
227 50273051 Iustin Pop
      status, result = self.store_cb.GetResponse(req.rsalt)
228 50273051 Iustin Pop
      assert status, "Missing result but received replies"
229 50273051 Iustin Pop
      if not self.filter_cb.consistent[req.rsalt]:
230 50273051 Iustin Pop
        logging.warning("Inconsistent replies, not doing anything")
231 50273051 Iustin Pop
        return
232 50273051 Iustin Pop
      self.DoMaintenance(result.server_reply.answer)
233 50273051 Iustin Pop
    else:
234 50273051 Iustin Pop
      logging.warning("Confd query timed out, cannot do maintenance actions")
235 50273051 Iustin Pop
236 50273051 Iustin Pop
237 5a3103e9 Michael Hanselmann
class WatcherState(object):
238 a8083063 Iustin Pop
  """Interface to a state file recording restart attempts.
239 a8083063 Iustin Pop

240 a8083063 Iustin Pop
  """
241 001b3825 Michael Hanselmann
  def __init__(self, statefile):
242 5a3103e9 Michael Hanselmann
    """Open, lock, read and parse the file.
243 5a3103e9 Michael Hanselmann

244 001b3825 Michael Hanselmann
    @type statefile: file
245 001b3825 Michael Hanselmann
    @param statefile: State file object
246 5a3103e9 Michael Hanselmann

247 5a3103e9 Michael Hanselmann
    """
248 001b3825 Michael Hanselmann
    self.statefile = statefile
249 a8083063 Iustin Pop
250 5a3103e9 Michael Hanselmann
    try:
251 2c404217 Iustin Pop
      state_data = self.statefile.read()
252 2c404217 Iustin Pop
      if not state_data:
253 2c404217 Iustin Pop
        self._data = {}
254 2c404217 Iustin Pop
      else:
255 2c404217 Iustin Pop
        self._data = serializer.Load(state_data)
256 7260cfbe Iustin Pop
    except Exception, msg: # pylint: disable-msg=W0703
257 5a3103e9 Michael Hanselmann
      # Ignore errors while loading the file and treat it as empty
258 b76f660d Michael Hanselmann
      self._data = {}
259 2c404217 Iustin Pop
      logging.warning(("Invalid state file. Using defaults."
260 438b45d4 Michael Hanselmann
                       " Error message: %s"), msg)
261 5a3103e9 Michael Hanselmann
262 b76f660d Michael Hanselmann
    if "instance" not in self._data:
263 b76f660d Michael Hanselmann
      self._data["instance"] = {}
264 b76f660d Michael Hanselmann
    if "node" not in self._data:
265 b76f660d Michael Hanselmann
      self._data["node"] = {}
266 5a3103e9 Michael Hanselmann
267 26517d45 Iustin Pop
    self._orig_data = serializer.Dump(self._data)
268 2fb96d39 Michael Hanselmann
269 fc428e32 Michael Hanselmann
  def Save(self):
270 fc428e32 Michael Hanselmann
    """Save state to file, then unlock and close it.
271 5a3103e9 Michael Hanselmann

272 5a3103e9 Michael Hanselmann
    """
273 fc428e32 Michael Hanselmann
    assert self.statefile
274 fc428e32 Michael Hanselmann
275 26517d45 Iustin Pop
    serialized_form = serializer.Dump(self._data)
276 26517d45 Iustin Pop
    if self._orig_data == serialized_form:
277 2fb96d39 Michael Hanselmann
      logging.debug("Data didn't change, just touching status file")
278 2fb96d39 Michael Hanselmann
      os.utime(constants.WATCHER_STATEFILE, None)
279 2fb96d39 Michael Hanselmann
      return
280 2fb96d39 Michael Hanselmann
281 fc428e32 Michael Hanselmann
    # We need to make sure the file is locked before renaming it, otherwise
282 fc428e32 Michael Hanselmann
    # starting ganeti-watcher again at the same time will create a conflict.
283 fc428e32 Michael Hanselmann
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
284 26517d45 Iustin Pop
                         data=serialized_form,
285 eb0f0ce0 Michael Hanselmann
                         prewrite=utils.LockFile, close=False)
286 fc428e32 Michael Hanselmann
    self.statefile = os.fdopen(fd, 'w+')
287 5a3103e9 Michael Hanselmann
288 fc428e32 Michael Hanselmann
  def Close(self):
289 5a3103e9 Michael Hanselmann
    """Unlock configuration file and close it.
290 5a3103e9 Michael Hanselmann

291 5a3103e9 Michael Hanselmann
    """
292 5a3103e9 Michael Hanselmann
    assert self.statefile
293 5a3103e9 Michael Hanselmann
294 fc428e32 Michael Hanselmann
    # Files are automatically unlocked when closing them
295 5a3103e9 Michael Hanselmann
    self.statefile.close()
296 5a3103e9 Michael Hanselmann
    self.statefile = None
297 5a3103e9 Michael Hanselmann
298 5a3103e9 Michael Hanselmann
  def GetNodeBootID(self, name):
299 5a3103e9 Michael Hanselmann
    """Returns the last boot ID of a node or None.
300 a8083063 Iustin Pop

301 5a3103e9 Michael Hanselmann
    """
302 b76f660d Michael Hanselmann
    ndata = self._data["node"]
303 5a3103e9 Michael Hanselmann
304 7b195d9b Michael Hanselmann
    if name in ndata and KEY_BOOT_ID in ndata[name]:
305 7b195d9b Michael Hanselmann
      return ndata[name][KEY_BOOT_ID]
306 5a3103e9 Michael Hanselmann
    return None
307 5a3103e9 Michael Hanselmann
308 5a3103e9 Michael Hanselmann
  def SetNodeBootID(self, name, bootid):
309 5a3103e9 Michael Hanselmann
    """Sets the boot ID of a node.
310 5a3103e9 Michael Hanselmann

311 5a3103e9 Michael Hanselmann
    """
312 5a3103e9 Michael Hanselmann
    assert bootid
313 a8083063 Iustin Pop
314 b76f660d Michael Hanselmann
    ndata = self._data["node"]
315 a8083063 Iustin Pop
316 5a3103e9 Michael Hanselmann
    if name not in ndata:
317 5a3103e9 Michael Hanselmann
      ndata[name] = {}
318 5a3103e9 Michael Hanselmann
319 7b195d9b Michael Hanselmann
    ndata[name][KEY_BOOT_ID] = bootid
320 5a3103e9 Michael Hanselmann
321 5a3103e9 Michael Hanselmann
  def NumberOfRestartAttempts(self, instance):
322 a8083063 Iustin Pop
    """Returns number of previous restart attempts.
323 a8083063 Iustin Pop

324 c41eea6e Iustin Pop
    @type instance: L{Instance}
325 c41eea6e Iustin Pop
    @param instance: the instance to look up
326 38242904 Iustin Pop

327 a8083063 Iustin Pop
    """
328 b76f660d Michael Hanselmann
    idata = self._data["instance"]
329 a8083063 Iustin Pop
330 5a3103e9 Michael Hanselmann
    if instance.name in idata:
331 7b195d9b Michael Hanselmann
      return idata[instance.name][KEY_RESTART_COUNT]
332 a8083063 Iustin Pop
333 a8083063 Iustin Pop
    return 0
334 a8083063 Iustin Pop
335 f5116c87 Iustin Pop
  def MaintainInstanceList(self, instances):
336 f5116c87 Iustin Pop
    """Perform maintenance on the recorded instances.
337 f5116c87 Iustin Pop

338 f5116c87 Iustin Pop
    @type instances: list of string
339 f5116c87 Iustin Pop
    @param instances: the list of currently existing instances
340 f5116c87 Iustin Pop

341 f5116c87 Iustin Pop
    """
342 f5116c87 Iustin Pop
    idict = self._data["instance"]
343 f5116c87 Iustin Pop
    # First, delete obsolete instances
344 f5116c87 Iustin Pop
    obsolete_instances = set(idict).difference(instances)
345 f5116c87 Iustin Pop
    for inst in obsolete_instances:
346 f5116c87 Iustin Pop
      logging.debug("Forgetting obsolete instance %s", inst)
347 f5116c87 Iustin Pop
      del idict[inst]
348 f5116c87 Iustin Pop
349 f5116c87 Iustin Pop
    # Second, delete expired records
350 f5116c87 Iustin Pop
    earliest = time.time() - RETRY_EXPIRATION
351 f5116c87 Iustin Pop
    expired_instances = [i for i in idict
352 f5116c87 Iustin Pop
                         if idict[i][KEY_RESTART_WHEN] < earliest]
353 f5116c87 Iustin Pop
    for inst in expired_instances:
354 f5116c87 Iustin Pop
      logging.debug("Expiring record for instance %s", inst)
355 f5116c87 Iustin Pop
      del idict[inst]
356 f5116c87 Iustin Pop
357 5a3103e9 Michael Hanselmann
  def RecordRestartAttempt(self, instance):
358 a8083063 Iustin Pop
    """Record a restart attempt.
359 a8083063 Iustin Pop

360 c41eea6e Iustin Pop
    @type instance: L{Instance}
361 c41eea6e Iustin Pop
    @param instance: the instance being restarted
362 38242904 Iustin Pop

363 a8083063 Iustin Pop
    """
364 b76f660d Michael Hanselmann
    idata = self._data["instance"]
365 a8083063 Iustin Pop
366 5a3103e9 Michael Hanselmann
    if instance.name not in idata:
367 5a3103e9 Michael Hanselmann
      inst = idata[instance.name] = {}
368 5a3103e9 Michael Hanselmann
    else:
369 5a3103e9 Michael Hanselmann
      inst = idata[instance.name]
370 a8083063 Iustin Pop
371 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_WHEN] = time.time()
372 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
373 a8083063 Iustin Pop
374 5a3103e9 Michael Hanselmann
  def RemoveInstance(self, instance):
375 c41eea6e Iustin Pop
    """Update state to reflect that a machine is running.
376 a8083063 Iustin Pop

377 c41eea6e Iustin Pop
    This method removes the record for a named instance (as we only
378 c41eea6e Iustin Pop
    track down instances).
379 a8083063 Iustin Pop

380 c41eea6e Iustin Pop
    @type instance: L{Instance}
381 c41eea6e Iustin Pop
    @param instance: the instance to remove from books
382 38242904 Iustin Pop

383 a8083063 Iustin Pop
    """
384 b76f660d Michael Hanselmann
    idata = self._data["instance"]
385 a8083063 Iustin Pop
386 5a3103e9 Michael Hanselmann
    if instance.name in idata:
387 5a3103e9 Michael Hanselmann
      del idata[instance.name]
388 a8083063 Iustin Pop
389 a8083063 Iustin Pop
390 a8083063 Iustin Pop
class Instance(object):
  """A virtual machine instance as seen by the watcher.

  """
  def __init__(self, name, state, autostart, snodes):
    """Store the queried properties of the instance.

    @param name: the instance name
    @param state: the instance status
    @param autostart: whether the instance is meant to be started
    @param snodes: the secondary nodes of the instance

    """
    self.snodes = snodes
    self.autostart = autostart
    self.state = state
    self.name = name

  def Restart(self):
    """Submit an instance startup opcode (non-forced) for this instance.

    """
    startup_op = opcodes.OpInstanceStartup(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Submit a disk activation opcode for this instance.

    """
    activate_op = opcodes.OpInstanceActivateDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
413 a8083063 Iustin Pop
414 a8083063 Iustin Pop
415 6dfcc47b Iustin Pop
def GetClusterData():
  """Query the cluster for its instances and nodes.

  Both queries are submitted as one job, which is archived once its
  results have been processed. As a side effect the instance status file
  (C{constants.INSTANCE_UPFILE}) is rewritten with one "name status" line
  per instance.

  @rtype: tuple
  @return: (instances, nodes, smap), where instances maps instance names
      to L{Instance} objects, nodes maps node names to (bootid, offline)
      tuples and smap maps secondary node names to the list of names of
      the instances using them

  """
  inst_query = opcodes.OpInstanceQuery(
    output_fields=["name", "status", "admin_state", "snodes"],
    names=[], use_locking=True)
  node_query = opcodes.OpNodeQuery(
    output_fields=["name", "bootid", "offline"],
    names=[], use_locking=True)

  job_id = client.SubmitJob([inst_query, node_query])
  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  inst_rows = all_results[0]

  # write the upfile
  up_lines = ["%s %s\n" % (row[0], row[1]) for row in inst_rows]
  utils.WriteFile(file_name=constants.INSTANCE_UPFILE,
                  data="".join(up_lines))

  smap = {}
  instances = {}
  for (name, status, autostart, snodes) in inst_rows:
    # update the secondary node map
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart, snodes)

  nodes = dict((name, (bootid, offline))
               for (name, bootid, offline) in all_results[1])

  client.ArchiveJob(job_id)

  return instances, nodes, smap
458 a8083063 Iustin Pop
459 a8083063 Iustin Pop
460 5a3103e9 Michael Hanselmann
class Watcher(object):
461 55c85950 Iustin Pop
  """Encapsulate the logic for restarting erroneously halted virtual machines.
462 a8083063 Iustin Pop

463 a8083063 Iustin Pop
  The calling program should periodically instantiate me and call Run().
464 a8083063 Iustin Pop
  This will traverse the list of instances, and make up to MAXTRIES attempts
465 a8083063 Iustin Pop
  to restart machines that are down.
466 38242904 Iustin Pop

467 a8083063 Iustin Pop
  """
468 cc962d58 Iustin Pop
  def __init__(self, opts, notepad):
    """Verify that this is the master node and load the cluster data.

    Old jobs are archived before the cluster is queried.

    @param opts: options object; its C{job_age} attribute is passed to
        L{ArchiveJobs}
    @param notepad: the state object handed to the checks made by L{Run}
    @raise NotMasterError: if this node is not the master node

    """
    self.opts = opts
    self.notepad = notepad
    self.started_instances = set()

    master = client.QueryConfigValues(["master_node"])[0]
    if master != netutils.Hostname.GetSysName():
      raise NotMasterError("This is not the master node")

    # first archive old jobs
    self.ArchiveJobs(opts.job_age)
    # and only then submit new ones
    (self.instances, self.bootids, self.smap) = GetClusterData()
479 a8083063 Iustin Pop
480 a8083063 Iustin Pop
  def Run(self):
    """Run the watcher checks in sequence.

    Instances are checked first, then the disks of instances on changed
    nodes, and finally the disk verification is run.

    """
    state = self.notepad
    for check in (self.CheckInstances, self.CheckDisks):
      check(state)
    self.VerifyDisks()
488 5a3103e9 Michael Hanselmann
489 24edc6d4 Iustin Pop
  @staticmethod
  def ArchiveJobs(age):
    """Archive old jobs and log how many were archived and left.

    @param age: handed to the client's C{AutoArchiveJobs} call

    """
    (archived, remaining) = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", archived, remaining)
496 f07521e5 Iustin Pop
497 5a3103e9 Michael Hanselmann
  def CheckDisks(self, notepad):
    """Activate disks of instances whose secondary node changed boot ID.

    A node is checked when its current boot ID differs from the one in
    the notepad. For each such node the disks of the instances using it
    as secondary are activated, except for non-autostart instances and
    for instances this run already tried to start. The new boot IDs are
    recorded afterwards.

    @param notepad: the state object holding the known boot IDs

    """
    check_nodes = []
    for name, (new_id, offline) in self.bootids.iteritems():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
      elif old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for node in check_nodes:
      for instance_name in self.smap.get(node, []):
        instance = self.instances[instance_name]
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
        elif instance.name not in self.started_instances:
          # instances we already tried to start are left alone, as that
          # should have activated their drives (if they can be at all)
          try:
            logging.info("Activating disks for instance %s", instance.name)
            instance.ActivateDisks()
          except Exception: # pylint: disable-msg=W0703
            logging.exception("Error while activating disks for instance %s",
                              instance.name)

    # Keep changed boot IDs
    for name in check_nodes:
      notepad.SetNodeBootID(name, self.bootids[name][0])
540 a8083063 Iustin Pop
541 5a3103e9 Michael Hanselmann
  def CheckInstances(self, notepad):
    """Go over all instances and restart the ones that are down.

    An instance in one of the BAD_STATES is restarted as long as fewer
    than MAXTRIES attempts are recorded for it; every attempt is noted in
    the notepad, whether it worked or not. Records of instances in any
    other state are removed.

    @param notepad: the state object tracking the restart attempts

    """
    notepad.MaintainInstanceList(self.instances.keys())

    for instance in self.instances.values():
      attempts = notepad.NumberOfRestartAttempts(instance)

      if instance.state not in BAD_STATES:
        # Nothing to restart, only clean up what was recorded earlier
        if attempts:
          notepad.RemoveInstance(instance)
          if instance.state not in HELPLESS_STATES:
            logging.info("Restart of %s succeeded", instance.name)
        continue

      if attempts > MAXTRIES:
        logging.warning("Not restarting instance %s, retries exhausted",
                        instance.name)
        continue

      if attempts == MAXTRIES:
        # Recording once more pushes the count over the limit, so the
        # error below is only reported in this one run
        notepad.RecordRestartAttempt(instance)
        logging.error("Could not restart %s after %d attempts, giving up",
                      instance.name, MAXTRIES)
        continue

      attempt_info = " (Attempt #%d)" % (attempts + 1)
      try:
        logging.info("Restarting %s%s",
                     instance.name, attempt_info)
        instance.Restart()
        self.started_instances.add(instance.name)
      except Exception: # pylint: disable-msg=W0703
        logging.exception("Error while restarting instance %s",
                          instance.name)

      notepad.RecordRestartAttempt(instance)
579 a8083063 Iustin Pop
580 83e5e26f René Nussbaumer
  def _CheckForOfflineNodes(self, instance):
    """Tells whether an instance has at least one offline secondary node.

    @param instance: The instance object
    @return: True if any of the secondary nodes is offline, False otherwise

    """
    # Look up every secondary node before evaluating anything, so an unknown
    # node name always raises regardless of the other nodes' status
    secondary_info = [self.bootids[snode] for snode in instance.snodes]

    return compat.any(is_offline for (_, is_offline) in secondary_info)
592 83e5e26f René Nussbaumer
593 83e5e26f René Nussbaumer
  def VerifyDisks(self):
    """Run gnt-cluster verify-disks.

    Submits a verify-disks job, waits for it and archives it. For every
    instance reported as having offline disks, a disk activation opcode is
    queued unless the instance is unknown to this watcher run, is in a
    helpless state or has an offline secondary node. All activations are
    sent as one single job.

    """
    op = opcodes.OpClusterVerifyDisks()
    job_id = client.SubmitJob([op])
    result = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(job_id)

    # The list of instances with offline disks is read from index 1, so the
    # result must be a sequence with at least two elements
    if not isinstance(result, (tuple, list)) or len(result) < 2:
      logging.error("Can't get a valid result from verify-disks")
      return

    offline_disk_instances = result[1]
    if not offline_disk_instances:
      # nothing to do
      return

    logging.debug("Will activate disks for instances %s",
                  utils.CommaJoin(offline_disk_instances))

    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = []
    for name in offline_disk_instances:
      try:
        instance = self.instances[name]
      except KeyError:
        # The instance list was gathered before verify-disks ran; an
        # instance missing from it must not abort the whole watcher run
        logging.info("Skip instance %s because it is not known to the"
                     " watcher", name)
        continue

      if (instance.state in HELPLESS_STATES or
          self._CheckForOfflineNodes(instance)):
        logging.info("Skip instance %s because it is in helpless state or has"
                     " one offline secondary", name)
        continue

      job.append(opcodes.OpInstanceActivateDisks(instance_name=name))

    if job:
      job_id = cli.SendJob(job, cl=client)

      try:
        cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
      except Exception: # pylint: disable-msg=W0703
        logging.exception("Error while activating disks")
629 a8083063 Iustin Pop
630 a8083063 Iustin Pop
631 001b3825 Michael Hanselmann
def OpenStateFile(path):
632 001b3825 Michael Hanselmann
  """Opens the state file and acquires a lock on it.
633 001b3825 Michael Hanselmann

634 001b3825 Michael Hanselmann
  @type path: string
635 001b3825 Michael Hanselmann
  @param path: Path to state file
636 001b3825 Michael Hanselmann

637 001b3825 Michael Hanselmann
  """
638 001b3825 Michael Hanselmann
  # The two-step dance below is necessary to allow both opening existing
639 001b3825 Michael Hanselmann
  # file read/write and creating if not existing. Vanilla open will truncate
640 001b3825 Michael Hanselmann
  # an existing file -or- allow creating if not existing.
641 001b3825 Michael Hanselmann
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)
642 001b3825 Michael Hanselmann
643 001b3825 Michael Hanselmann
  # Try to acquire lock on state file. If this fails, another watcher instance
644 001b3825 Michael Hanselmann
  # might already be running or another program is temporarily blocking the
645 001b3825 Michael Hanselmann
  # watcher from running.
646 001b3825 Michael Hanselmann
  try:
647 001b3825 Michael Hanselmann
    utils.LockFile(statefile_fd)
648 001b3825 Michael Hanselmann
  except errors.LockError, err:
649 001b3825 Michael Hanselmann
    logging.error("Can't acquire lock on state file %s: %s", path, err)
650 001b3825 Michael Hanselmann
    return None
651 001b3825 Michael Hanselmann
652 001b3825 Michael Hanselmann
  return os.fdopen(statefile_fd, "w+")
653 001b3825 Michael Hanselmann
654 001b3825 Michael Hanselmann
655 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
656 db147305 Tom Limoncelli
  """Connects to RAPI port and does a simple test.
657 db147305 Tom Limoncelli

658 db147305 Tom Limoncelli
  Connects to RAPI port of hostname and does a simple test. At this time, the
659 db147305 Tom Limoncelli
  test is GetVersion.
660 db147305 Tom Limoncelli

661 db147305 Tom Limoncelli
  @type hostname: string
662 db147305 Tom Limoncelli
  @param hostname: hostname of the node to connect to.
663 db147305 Tom Limoncelli
  @rtype: bool
664 db147305 Tom Limoncelli
  @return: Whether RAPI is working properly
665 db147305 Tom Limoncelli

666 db147305 Tom Limoncelli
  """
667 34f06005 Iustin Pop
  curl_config = rapi.client.GenericCurlConfig()
668 2a7c3583 Michael Hanselmann
  rapi_client = rapi.client.GanetiRapiClient(hostname,
669 2a7c3583 Michael Hanselmann
                                             curl_config_fn=curl_config)
670 db147305 Tom Limoncelli
  try:
671 db147305 Tom Limoncelli
    master_version = rapi_client.GetVersion()
672 db147305 Tom Limoncelli
  except rapi.client.CertificateError, err:
673 db147305 Tom Limoncelli
    logging.warning("RAPI Error: CertificateError (%s)", err)
674 db147305 Tom Limoncelli
    return False
675 db147305 Tom Limoncelli
  except rapi.client.GanetiApiError, err:
676 db147305 Tom Limoncelli
    logging.warning("RAPI Error: GanetiApiError (%s)", err)
677 db147305 Tom Limoncelli
    return False
678 db147305 Tom Limoncelli
  logging.debug("RAPI Result: master_version is %s", master_version)
679 db147305 Tom Limoncelli
  return master_version == constants.RAPI_VERSION
680 db147305 Tom Limoncelli
681 db147305 Tom Limoncelli
682 a8083063 Iustin Pop
def ParseOptions():
  """Builds the option parser and parses the command line.

  The job age option is converted through C{cli.ParseTimespec} before the
  options are returned.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  opt_parser = OptionParser(description="Ganeti cluster watcher",
                            usage="%prog [-d]",
                            version=version_text)

  opt_parser.add_option(cli.DEBUG_OPT)
  opt_parser.add_option("-A", "--job-age", dest="job_age", default=6*3600,
                        help=("Autoarchive jobs older than this age (default"
                              " 6 hours)"))
  opt_parser.add_option("--ignore-pause", dest="ignore_pause",
                        action="store_true", default=False,
                        help="Ignore cluster pause setting")

  (options, args) = opt_parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)

  return (options, args)
702 a8083063 Iustin Pop
703 a8083063 Iustin Pop
704 2a7c3583 Michael Hanselmann
@rapi.client.UsesRapiClient
705 9f4bb951 Michael Hanselmann
def Main():
706 a8083063 Iustin Pop
  """Main function.
707 a8083063 Iustin Pop

708 a8083063 Iustin Pop
  """
709 7260cfbe Iustin Pop
  global client # pylint: disable-msg=W0603
710 e125c67c Michael Hanselmann
711 f93427cd Iustin Pop
  options, args = ParseOptions()
712 f93427cd Iustin Pop
713 f93427cd Iustin Pop
  if args: # watcher doesn't take any arguments
714 f93427cd Iustin Pop
    print >> sys.stderr, ("Usage: %s [-f] " % sys.argv[0])
715 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
716 a8083063 Iustin Pop
717 82d9caef Iustin Pop
  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
718 82d9caef Iustin Pop
                     stderr_logging=options.debug)
719 a8083063 Iustin Pop
720 46c8a6ab Iustin Pop
  if ShouldPause() and not options.ignore_pause:
721 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
722 9f4bb951 Michael Hanselmann
    return constants.EXIT_SUCCESS
723 3753b2cb Michael Hanselmann
724 001b3825 Michael Hanselmann
  statefile = OpenStateFile(constants.WATCHER_STATEFILE)
725 001b3825 Michael Hanselmann
  if not statefile:
726 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
727 001b3825 Michael Hanselmann
728 24edc6d4 Iustin Pop
  update_file = False
729 a8083063 Iustin Pop
  try:
730 f1115454 Guido Trotter
    StartNodeDaemons()
731 9e289e36 Guido Trotter
    RunWatcherHooks()
732 50273051 Iustin Pop
    # run node maintenance in all cases, even if master, so that old
733 50273051 Iustin Pop
    # masters can be properly cleaned up too
734 50273051 Iustin Pop
    if NodeMaintenance.ShouldRun():
735 50273051 Iustin Pop
      NodeMaintenance().Exec()
736 c4f0219c Iustin Pop
737 001b3825 Michael Hanselmann
    notepad = WatcherState(statefile)
738 781b2b2b Michael Hanselmann
    try:
739 2c404217 Iustin Pop
      try:
740 2c404217 Iustin Pop
        client = cli.GetClient()
741 2c404217 Iustin Pop
      except errors.OpPrereqError:
742 2c404217 Iustin Pop
        # this is, from cli.GetClient, a not-master case
743 7dfb83c2 Iustin Pop
        logging.debug("Not on master, exiting")
744 24edc6d4 Iustin Pop
        update_file = True
745 9f4bb951 Michael Hanselmann
        return constants.EXIT_SUCCESS
746 7dfb83c2 Iustin Pop
      except luxi.NoMasterError, err:
747 7dfb83c2 Iustin Pop
        logging.warning("Master seems to be down (%s), trying to restart",
748 7dfb83c2 Iustin Pop
                        str(err))
749 2826b361 Guido Trotter
        if not utils.EnsureDaemon(constants.MASTERD):
750 7dfb83c2 Iustin Pop
          logging.critical("Can't start the master, exiting")
751 9f4bb951 Michael Hanselmann
          return constants.EXIT_FAILURE
752 7dfb83c2 Iustin Pop
        # else retry the connection
753 7dfb83c2 Iustin Pop
        client = cli.GetClient()
754 cc962d58 Iustin Pop
755 83052f9e Guido Trotter
      # we are on master now
756 2826b361 Guido Trotter
      utils.EnsureDaemon(constants.RAPI)
757 c4f0219c Iustin Pop
758 db147305 Tom Limoncelli
      # If RAPI isn't responding to queries, try one restart.
759 db147305 Tom Limoncelli
      logging.debug("Attempting to talk with RAPI.")
760 9769bb78 Manuel Franceschini
      if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
761 db147305 Tom Limoncelli
        logging.warning("Couldn't get answer from Ganeti RAPI daemon."
762 db147305 Tom Limoncelli
                        " Restarting Ganeti RAPI.")
763 db147305 Tom Limoncelli
        utils.StopDaemon(constants.RAPI)
764 db147305 Tom Limoncelli
        utils.EnsureDaemon(constants.RAPI)
765 db147305 Tom Limoncelli
        logging.debug("Second attempt to talk with RAPI")
766 9769bb78 Manuel Franceschini
        if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
767 db147305 Tom Limoncelli
          logging.fatal("RAPI is not responding. Please investigate.")
768 db147305 Tom Limoncelli
      logging.debug("Successfully talked to RAPI.")
769 db147305 Tom Limoncelli
770 cc962d58 Iustin Pop
      try:
771 cc962d58 Iustin Pop
        watcher = Watcher(options, notepad)
772 cc962d58 Iustin Pop
      except errors.ConfigurationError:
773 cc962d58 Iustin Pop
        # Just exit if there's no configuration
774 24edc6d4 Iustin Pop
        update_file = True
775 9f4bb951 Michael Hanselmann
        return constants.EXIT_SUCCESS
776 e125c67c Michael Hanselmann
777 cc962d58 Iustin Pop
      watcher.Run()
778 24edc6d4 Iustin Pop
      update_file = True
779 24edc6d4 Iustin Pop
780 cc962d58 Iustin Pop
    finally:
781 7dfb83c2 Iustin Pop
      if update_file:
782 7dfb83c2 Iustin Pop
        notepad.Save()
783 7dfb83c2 Iustin Pop
      else:
784 7dfb83c2 Iustin Pop
        logging.debug("Not updating status file due to failure")
785 1b052f42 Michael Hanselmann
  except SystemExit:
786 1b052f42 Michael Hanselmann
    raise
787 38242904 Iustin Pop
  except NotMasterError:
788 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
789 9f4bb951 Michael Hanselmann
    return constants.EXIT_NOTMASTER
790 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
791 438b45d4 Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
792 9f4bb951 Michael Hanselmann
    return constants.EXIT_NODESETUP_ERROR
793 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
794 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
795 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
796 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
797 438b45d4 Michael Hanselmann
  except Exception, err:
798 001b3825 Michael Hanselmann
    logging.exception(str(err))
799 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
800 5a3103e9 Michael Hanselmann
801 9f4bb951 Michael Hanselmann
  return constants.EXIT_SUCCESS