Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / __init__.py @ ae1a845c

History | View | Annotate | Download (25.3 kB)

1 9f4bb951 Michael Hanselmann
#
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 f2af0bec Iustin Pop
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop

24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop

28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 7260cfbe Iustin Pop
# pylint: disable-msg=C0103,W0142
31 7260cfbe Iustin Pop
32 7260cfbe Iustin Pop
# C0103: Invalid name ganeti-watcher
33 7260cfbe Iustin Pop
34 a8083063 Iustin Pop
import os
35 cfcc79c6 Michael Hanselmann
import os.path
36 a8083063 Iustin Pop
import sys
37 a8083063 Iustin Pop
import time
38 438b45d4 Michael Hanselmann
import logging
39 a8083063 Iustin Pop
from optparse import OptionParser
40 a8083063 Iustin Pop
41 a8083063 Iustin Pop
from ganeti import utils
42 a8083063 Iustin Pop
from ganeti import constants
43 83e5e26f René Nussbaumer
from ganeti import compat
44 67fe61c4 Michael Hanselmann
from ganeti import serializer
45 89e1fc26 Iustin Pop
from ganeti import errors
46 e125c67c Michael Hanselmann
from ganeti import opcodes
47 e125c67c Michael Hanselmann
from ganeti import cli
48 7dfb83c2 Iustin Pop
from ganeti import luxi
49 50273051 Iustin Pop
from ganeti import ssconf
50 50273051 Iustin Pop
from ganeti import bdev
51 50273051 Iustin Pop
from ganeti import hypervisor
52 db147305 Tom Limoncelli
from ganeti import rapi
53 50273051 Iustin Pop
from ganeti.confd import client as confd_client
54 a744b676 Manuel Franceschini
from ganeti import netutils
55 a8083063 Iustin Pop
56 db147305 Tom Limoncelli
import ganeti.rapi.client # pylint: disable-msg=W0611
57 db147305 Tom Limoncelli
58 a8083063 Iustin Pop
59 5a3103e9 Michael Hanselmann
# Number of restart attempts made for an instance before giving up
MAXTRIES = 5

# Restart records older than 8 hours are dropped. Rationale: with a retry
# counter of 5 and the watcher running every 5 minutes, the counter is
# exhausted after roughly half an hour, so 8 hours (16 * 1/2h) looks like a
# sensible time after which to start over.
RETRY_EXPIRATION = 8 * 3600

# Instance states in which a restart is attempted
BAD_STATES = [constants.INSTST_ERRORDOWN]
# Instance states in which nothing can be done from here
HELPLESS_STATES = [constants.INSTST_NODEDOWN, constants.INSTST_NODEOFFLINE]

NOTICE = "NOTICE"
ERROR = "ERROR"

# Keys used inside the records of the watcher state data
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# LUXI client object shared by the whole module
client = None
76 e125c67c Michael Hanselmann
77 e125c67c Michael Hanselmann
78 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Error signalling that the current host is not the master node.

  """
80 a8083063 Iustin Pop
81 a8083063 Iustin Pop
82 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher should pause.

  @rtype: bool
  @return: the truth value of whatever L{utils.ReadWatcherPauseFile}
      returns for the watcher pause file

  """
  pause_info = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  return bool(pause_info)
87 3753b2cb Michael Hanselmann
88 3753b2cb Michael Hanselmann
89 f1115454 Guido Trotter
def StartNodeDaemons():
  """Ensure the daemons needed on every node are running.

  """
  # The node daemon is wanted on every node, master or not. Confd is
  # started as well; on non-candidates it will be in disabled mode.
  for daemon_name in (constants.NODED, constants.CONFD):
    utils.EnsureDaemon(daemon_name)
97 f1115454 Guido Trotter
98 f1115454 Guido Trotter
99 9e289e36 Guido Trotter
def RunWatcherHooks():
  """Run the watcher hooks.

  Runs the contents of the watcher hooks directory through
  L{utils.RunParts} and logs the outcome of every hook. Nothing is done if
  the directory does not exist. A failure of L{utils.RunParts} itself is
  logged (with traceback) and otherwise ignored.

  """
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
                             constants.HOOKS_NAME_WATCHER)
  if not os.path.isdir(hooks_dir):
    return

  try:
    results = utils.RunParts(hooks_dir)
  except Exception: # pylint: disable-msg=W0703
    # The number of placeholders must match the number of arguments,
    # otherwise the logging call itself fails while formatting the message;
    # the error details are part of the traceback logging.exception adds
    logging.exception("RunParts %s failed", hooks_dir)
    return

  for (relname, status, runresult) in results:
    if status == constants.RUNPARTS_SKIP:
      logging.debug("Watcher hook %s: skipped", relname)
    elif status == constants.RUNPARTS_ERR:
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
    elif status == constants.RUNPARTS_RUN:
      if runresult.failed:
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
                        relname, runresult.exit_code, runresult.output)
      else:
        logging.debug("Watcher hook %s: success (output: %s)", relname,
                      runresult.output)
126 9e289e36 Guido Trotter
127 001b3825 Michael Hanselmann
128 50273051 Iustin Pop
class NodeMaintenance(object):
129 50273051 Iustin Pop
  """Talks to confd daemons and possible shutdown instances/drbd devices.
130 50273051 Iustin Pop

131 50273051 Iustin Pop
  """
132 50273051 Iustin Pop
  def __init__(self):
133 50273051 Iustin Pop
    self.store_cb = confd_client.StoreResultCallback()
134 50273051 Iustin Pop
    self.filter_cb = confd_client.ConfdFilterCallback(self.store_cb)
135 50273051 Iustin Pop
    self.confd_client = confd_client.GetConfdClient(self.filter_cb)
136 50273051 Iustin Pop
137 50273051 Iustin Pop
  @staticmethod
138 50273051 Iustin Pop
  def ShouldRun():
139 50273051 Iustin Pop
    """Checks whether node maintenance should run.
140 50273051 Iustin Pop

141 50273051 Iustin Pop
    """
142 50273051 Iustin Pop
    try:
143 50273051 Iustin Pop
      return ssconf.SimpleStore().GetMaintainNodeHealth()
144 50273051 Iustin Pop
    except errors.ConfigurationError, err:
145 50273051 Iustin Pop
      logging.error("Configuration error, not activating node maintenance: %s",
146 50273051 Iustin Pop
                    err)
147 50273051 Iustin Pop
      return False
148 50273051 Iustin Pop
149 50273051 Iustin Pop
  @staticmethod
150 50273051 Iustin Pop
  def GetRunningInstances():
151 50273051 Iustin Pop
    """Compute list of hypervisor/running instances.
152 50273051 Iustin Pop

153 50273051 Iustin Pop
    """
154 50273051 Iustin Pop
    hyp_list = ssconf.SimpleStore().GetHypervisorList()
155 50273051 Iustin Pop
    results = []
156 50273051 Iustin Pop
    for hv_name in hyp_list:
157 50273051 Iustin Pop
      try:
158 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
159 50273051 Iustin Pop
        ilist = hv.ListInstances()
160 50273051 Iustin Pop
        results.extend([(iname, hv_name) for iname in ilist])
161 50273051 Iustin Pop
      except: # pylint: disable-msg=W0702
162 50273051 Iustin Pop
        logging.error("Error while listing instances for hypervisor %s",
163 50273051 Iustin Pop
                      hv_name, exc_info=True)
164 50273051 Iustin Pop
    return results
165 50273051 Iustin Pop
166 50273051 Iustin Pop
  @staticmethod
167 50273051 Iustin Pop
  def GetUsedDRBDs():
168 50273051 Iustin Pop
    """Get list of used DRBD minors.
169 50273051 Iustin Pop

170 50273051 Iustin Pop
    """
171 50273051 Iustin Pop
    return bdev.DRBD8.GetUsedDevs().keys()
172 50273051 Iustin Pop
173 50273051 Iustin Pop
  @classmethod
174 50273051 Iustin Pop
  def DoMaintenance(cls, role):
175 50273051 Iustin Pop
    """Maintain the instance list.
176 50273051 Iustin Pop

177 50273051 Iustin Pop
    """
178 50273051 Iustin Pop
    if role == constants.CONFD_NODE_ROLE_OFFLINE:
179 50273051 Iustin Pop
      inst_running = cls.GetRunningInstances()
180 50273051 Iustin Pop
      cls.ShutdownInstances(inst_running)
181 50273051 Iustin Pop
      drbd_running = cls.GetUsedDRBDs()
182 50273051 Iustin Pop
      cls.ShutdownDRBD(drbd_running)
183 50273051 Iustin Pop
    else:
184 50273051 Iustin Pop
      logging.debug("Not doing anything for role %s", role)
185 50273051 Iustin Pop
186 50273051 Iustin Pop
  @staticmethod
187 50273051 Iustin Pop
  def ShutdownInstances(inst_running):
188 50273051 Iustin Pop
    """Shutdown running instances.
189 50273051 Iustin Pop

190 50273051 Iustin Pop
    """
191 50273051 Iustin Pop
    names_running = set([i[0] for i in inst_running])
192 50273051 Iustin Pop
    if names_running:
193 50273051 Iustin Pop
      logging.info("Following instances should not be running,"
194 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(names_running))
195 50273051 Iustin Pop
      # this dictionary will collapse duplicate instance names (only
196 50273051 Iustin Pop
      # xen pvm/vhm) into a single key, which is fine
197 50273051 Iustin Pop
      i2h = dict(inst_running)
198 50273051 Iustin Pop
      for name in names_running:
199 50273051 Iustin Pop
        hv_name = i2h[name]
200 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
201 50273051 Iustin Pop
        hv.StopInstance(None, force=True, name=name)
202 50273051 Iustin Pop
203 50273051 Iustin Pop
  @staticmethod
204 50273051 Iustin Pop
  def ShutdownDRBD(drbd_running):
205 50273051 Iustin Pop
    """Shutdown active DRBD devices.
206 50273051 Iustin Pop

207 50273051 Iustin Pop
    """
208 50273051 Iustin Pop
    if drbd_running:
209 50273051 Iustin Pop
      logging.info("Following DRBD minors should not be active,"
210 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(drbd_running))
211 50273051 Iustin Pop
      for minor in drbd_running:
212 50273051 Iustin Pop
        # pylint: disable-msg=W0212
213 50273051 Iustin Pop
        # using the private method as is, pending enhancements to the DRBD
214 50273051 Iustin Pop
        # interface
215 50273051 Iustin Pop
        bdev.DRBD8._ShutdownAll(minor)
216 50273051 Iustin Pop
217 50273051 Iustin Pop
  def Exec(self):
218 50273051 Iustin Pop
    """Check node status versus cluster desired state.
219 50273051 Iustin Pop

220 50273051 Iustin Pop
    """
221 b705c7a6 Manuel Franceschini
    my_name = netutils.Hostname.GetSysName()
222 50273051 Iustin Pop
    req = confd_client.ConfdClientRequest(type=
223 50273051 Iustin Pop
                                          constants.CONFD_REQ_NODE_ROLE_BYNAME,
224 50273051 Iustin Pop
                                          query=my_name)
225 ebacb943 Iustin Pop
    self.confd_client.SendRequest(req, async=False, coverage=-1)
226 50273051 Iustin Pop
    timed_out, _, _ = self.confd_client.WaitForReply(req.rsalt)
227 50273051 Iustin Pop
    if not timed_out:
228 50273051 Iustin Pop
      # should have a valid response
229 50273051 Iustin Pop
      status, result = self.store_cb.GetResponse(req.rsalt)
230 50273051 Iustin Pop
      assert status, "Missing result but received replies"
231 50273051 Iustin Pop
      if not self.filter_cb.consistent[req.rsalt]:
232 50273051 Iustin Pop
        logging.warning("Inconsistent replies, not doing anything")
233 50273051 Iustin Pop
        return
234 50273051 Iustin Pop
      self.DoMaintenance(result.server_reply.answer)
235 50273051 Iustin Pop
    else:
236 50273051 Iustin Pop
      logging.warning("Confd query timed out, cannot do maintenance actions")
237 50273051 Iustin Pop
238 50273051 Iustin Pop
239 5a3103e9 Michael Hanselmann
class WatcherState(object):
240 a8083063 Iustin Pop
  """Interface to a state file recording restart attempts.
241 a8083063 Iustin Pop

242 a8083063 Iustin Pop
  """
243 001b3825 Michael Hanselmann
  def __init__(self, statefile):
244 5a3103e9 Michael Hanselmann
    """Open, lock, read and parse the file.
245 5a3103e9 Michael Hanselmann

246 001b3825 Michael Hanselmann
    @type statefile: file
247 001b3825 Michael Hanselmann
    @param statefile: State file object
248 5a3103e9 Michael Hanselmann

249 5a3103e9 Michael Hanselmann
    """
250 001b3825 Michael Hanselmann
    self.statefile = statefile
251 a8083063 Iustin Pop
252 5a3103e9 Michael Hanselmann
    try:
253 2c404217 Iustin Pop
      state_data = self.statefile.read()
254 2c404217 Iustin Pop
      if not state_data:
255 2c404217 Iustin Pop
        self._data = {}
256 2c404217 Iustin Pop
      else:
257 2c404217 Iustin Pop
        self._data = serializer.Load(state_data)
258 7260cfbe Iustin Pop
    except Exception, msg: # pylint: disable-msg=W0703
259 5a3103e9 Michael Hanselmann
      # Ignore errors while loading the file and treat it as empty
260 b76f660d Michael Hanselmann
      self._data = {}
261 2c404217 Iustin Pop
      logging.warning(("Invalid state file. Using defaults."
262 438b45d4 Michael Hanselmann
                       " Error message: %s"), msg)
263 5a3103e9 Michael Hanselmann
264 b76f660d Michael Hanselmann
    if "instance" not in self._data:
265 b76f660d Michael Hanselmann
      self._data["instance"] = {}
266 b76f660d Michael Hanselmann
    if "node" not in self._data:
267 b76f660d Michael Hanselmann
      self._data["node"] = {}
268 5a3103e9 Michael Hanselmann
269 26517d45 Iustin Pop
    self._orig_data = serializer.Dump(self._data)
270 2fb96d39 Michael Hanselmann
271 fc428e32 Michael Hanselmann
  def Save(self):
272 fc428e32 Michael Hanselmann
    """Save state to file, then unlock and close it.
273 5a3103e9 Michael Hanselmann

274 5a3103e9 Michael Hanselmann
    """
275 fc428e32 Michael Hanselmann
    assert self.statefile
276 fc428e32 Michael Hanselmann
277 26517d45 Iustin Pop
    serialized_form = serializer.Dump(self._data)
278 26517d45 Iustin Pop
    if self._orig_data == serialized_form:
279 2fb96d39 Michael Hanselmann
      logging.debug("Data didn't change, just touching status file")
280 2fb96d39 Michael Hanselmann
      os.utime(constants.WATCHER_STATEFILE, None)
281 2fb96d39 Michael Hanselmann
      return
282 2fb96d39 Michael Hanselmann
283 fc428e32 Michael Hanselmann
    # We need to make sure the file is locked before renaming it, otherwise
284 fc428e32 Michael Hanselmann
    # starting ganeti-watcher again at the same time will create a conflict.
285 fc428e32 Michael Hanselmann
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
286 26517d45 Iustin Pop
                         data=serialized_form,
287 eb0f0ce0 Michael Hanselmann
                         prewrite=utils.LockFile, close=False)
288 fc428e32 Michael Hanselmann
    self.statefile = os.fdopen(fd, 'w+')
289 5a3103e9 Michael Hanselmann
290 fc428e32 Michael Hanselmann
  def Close(self):
291 5a3103e9 Michael Hanselmann
    """Unlock configuration file and close it.
292 5a3103e9 Michael Hanselmann

293 5a3103e9 Michael Hanselmann
    """
294 5a3103e9 Michael Hanselmann
    assert self.statefile
295 5a3103e9 Michael Hanselmann
296 fc428e32 Michael Hanselmann
    # Files are automatically unlocked when closing them
297 5a3103e9 Michael Hanselmann
    self.statefile.close()
298 5a3103e9 Michael Hanselmann
    self.statefile = None
299 5a3103e9 Michael Hanselmann
300 5a3103e9 Michael Hanselmann
  def GetNodeBootID(self, name):
301 5a3103e9 Michael Hanselmann
    """Returns the last boot ID of a node or None.
302 a8083063 Iustin Pop

303 5a3103e9 Michael Hanselmann
    """
304 b76f660d Michael Hanselmann
    ndata = self._data["node"]
305 5a3103e9 Michael Hanselmann
306 7b195d9b Michael Hanselmann
    if name in ndata and KEY_BOOT_ID in ndata[name]:
307 7b195d9b Michael Hanselmann
      return ndata[name][KEY_BOOT_ID]
308 5a3103e9 Michael Hanselmann
    return None
309 5a3103e9 Michael Hanselmann
310 5a3103e9 Michael Hanselmann
  def SetNodeBootID(self, name, bootid):
311 5a3103e9 Michael Hanselmann
    """Sets the boot ID of a node.
312 5a3103e9 Michael Hanselmann

313 5a3103e9 Michael Hanselmann
    """
314 5a3103e9 Michael Hanselmann
    assert bootid
315 a8083063 Iustin Pop
316 b76f660d Michael Hanselmann
    ndata = self._data["node"]
317 a8083063 Iustin Pop
318 5a3103e9 Michael Hanselmann
    if name not in ndata:
319 5a3103e9 Michael Hanselmann
      ndata[name] = {}
320 5a3103e9 Michael Hanselmann
321 7b195d9b Michael Hanselmann
    ndata[name][KEY_BOOT_ID] = bootid
322 5a3103e9 Michael Hanselmann
323 5a3103e9 Michael Hanselmann
  def NumberOfRestartAttempts(self, instance):
324 a8083063 Iustin Pop
    """Returns number of previous restart attempts.
325 a8083063 Iustin Pop

326 c41eea6e Iustin Pop
    @type instance: L{Instance}
327 c41eea6e Iustin Pop
    @param instance: the instance to look up
328 38242904 Iustin Pop

329 a8083063 Iustin Pop
    """
330 b76f660d Michael Hanselmann
    idata = self._data["instance"]
331 a8083063 Iustin Pop
332 5a3103e9 Michael Hanselmann
    if instance.name in idata:
333 7b195d9b Michael Hanselmann
      return idata[instance.name][KEY_RESTART_COUNT]
334 a8083063 Iustin Pop
335 a8083063 Iustin Pop
    return 0
336 a8083063 Iustin Pop
337 f5116c87 Iustin Pop
  def MaintainInstanceList(self, instances):
338 f5116c87 Iustin Pop
    """Perform maintenance on the recorded instances.
339 f5116c87 Iustin Pop

340 f5116c87 Iustin Pop
    @type instances: list of string
341 f5116c87 Iustin Pop
    @param instances: the list of currently existing instances
342 f5116c87 Iustin Pop

343 f5116c87 Iustin Pop
    """
344 f5116c87 Iustin Pop
    idict = self._data["instance"]
345 f5116c87 Iustin Pop
    # First, delete obsolete instances
346 f5116c87 Iustin Pop
    obsolete_instances = set(idict).difference(instances)
347 f5116c87 Iustin Pop
    for inst in obsolete_instances:
348 f5116c87 Iustin Pop
      logging.debug("Forgetting obsolete instance %s", inst)
349 f5116c87 Iustin Pop
      del idict[inst]
350 f5116c87 Iustin Pop
351 f5116c87 Iustin Pop
    # Second, delete expired records
352 f5116c87 Iustin Pop
    earliest = time.time() - RETRY_EXPIRATION
353 f5116c87 Iustin Pop
    expired_instances = [i for i in idict
354 f5116c87 Iustin Pop
                         if idict[i][KEY_RESTART_WHEN] < earliest]
355 f5116c87 Iustin Pop
    for inst in expired_instances:
356 f5116c87 Iustin Pop
      logging.debug("Expiring record for instance %s", inst)
357 f5116c87 Iustin Pop
      del idict[inst]
358 f5116c87 Iustin Pop
359 5a3103e9 Michael Hanselmann
  def RecordRestartAttempt(self, instance):
360 a8083063 Iustin Pop
    """Record a restart attempt.
361 a8083063 Iustin Pop

362 c41eea6e Iustin Pop
    @type instance: L{Instance}
363 c41eea6e Iustin Pop
    @param instance: the instance being restarted
364 38242904 Iustin Pop

365 a8083063 Iustin Pop
    """
366 b76f660d Michael Hanselmann
    idata = self._data["instance"]
367 a8083063 Iustin Pop
368 5a3103e9 Michael Hanselmann
    if instance.name not in idata:
369 5a3103e9 Michael Hanselmann
      inst = idata[instance.name] = {}
370 5a3103e9 Michael Hanselmann
    else:
371 5a3103e9 Michael Hanselmann
      inst = idata[instance.name]
372 a8083063 Iustin Pop
373 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_WHEN] = time.time()
374 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
375 a8083063 Iustin Pop
376 5a3103e9 Michael Hanselmann
  def RemoveInstance(self, instance):
377 c41eea6e Iustin Pop
    """Update state to reflect that a machine is running.
378 a8083063 Iustin Pop

379 c41eea6e Iustin Pop
    This method removes the record for a named instance (as we only
380 c41eea6e Iustin Pop
    track down instances).
381 a8083063 Iustin Pop

382 c41eea6e Iustin Pop
    @type instance: L{Instance}
383 c41eea6e Iustin Pop
    @param instance: the instance to remove from books
384 38242904 Iustin Pop

385 a8083063 Iustin Pop
    """
386 b76f660d Michael Hanselmann
    idata = self._data["instance"]
387 a8083063 Iustin Pop
388 5a3103e9 Michael Hanselmann
    if instance.name in idata:
389 5a3103e9 Michael Hanselmann
      del idata[instance.name]
390 a8083063 Iustin Pop
391 a8083063 Iustin Pop
392 a8083063 Iustin Pop
class Instance(object):
  """Representation of a virtual machine instance.

  """
  def __init__(self, name, state, autostart, snodes):
    """Store the instance data.

    @param name: the instance name
    @param state: the instance state
    @param autostart: whether the instance should be started automatically
    @param snodes: the secondary nodes of the instance

    """
    self.name = name
    self.state = state
    self.autostart = autostart
    self.snodes = snodes

  def Restart(self):
    """Submit an opcode starting the instance.

    """
    startup_op = opcodes.OpInstanceStartup(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Submit an opcode activating all disks of the instance.

    """
    activate_op = opcodes.OpInstanceActivateDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
415 a8083063 Iustin Pop
416 a8083063 Iustin Pop
417 6dfcc47b Iustin Pop
def GetClusterData():
  """Fetch instance and node data from the cluster.

  A single job containing an instance query and a node query is submitted
  and polled; the instance status file is written from the instance data
  and the job is archived at the end.

  @rtype: tuple
  @return: (instances, nodes, smap); instances maps the instance name to an
      L{Instance} object, nodes maps the node name to a (bootid, offline)
      tuple and smap maps a secondary node name to the names of the
      instances using it

  """
  instance_fields = ["name", "status", "admin_state", "snodes"]
  node_fields = ["name", "bootid", "offline"]
  instance_query = opcodes.OpInstanceQuery(output_fields=instance_fields,
                                           names=[], use_locking=True)
  node_query = opcodes.OpNodeQuery(output_fields=node_fields, names=[],
                                   use_locking=True)

  job_id = client.SubmitJob([instance_query, node_query])
  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  instance_rows = all_results[0]

  # The upfile gets one "name status" line per instance
  up_lines = []
  for row in instance_rows:
    up_lines.append("%s %s\n" % (row[0], row[1]))
  utils.WriteFile(file_name=constants.INSTANCE_UPFILE, data="".join(up_lines))

  instances = {}
  smap = {}
  for (name, status, autostart, snodes) in instance_rows:
    # register the instance with each of its secondary nodes
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart, snodes)

  nodes = {}
  for (name, bootid, offline) in all_results[1]:
    nodes[name] = (bootid, offline)

  client.ArchiveJob(job_id)

  return instances, nodes, smap
460 a8083063 Iustin Pop
461 a8083063 Iustin Pop
462 5a3103e9 Michael Hanselmann
class Watcher(object):
463 55c85950 Iustin Pop
  """Encapsulate the logic for restarting erroneously halted virtual machines.
464 a8083063 Iustin Pop

465 a8083063 Iustin Pop
  The calling program should periodically instantiate me and call Run().
466 a8083063 Iustin Pop
  This will traverse the list of instances, and make up to MAXTRIES attempts
467 a8083063 Iustin Pop
  to restart machines that are down.
468 38242904 Iustin Pop

469 a8083063 Iustin Pop
  """
470 cc962d58 Iustin Pop
  def __init__(self, opts, notepad):
    """Check for master role, archive old jobs and load cluster data.

    @param opts: the options object; its C{job_age} attribute is passed to
        L{ArchiveJobs}
    @param notepad: the state object handed to the checks in L{Run}
        (presumably a L{WatcherState}; verify against the caller)
    @raise NotMasterError: if this host is not the master node

    """
    self.notepad = notepad
    master_name = client.QueryConfigValues(["master_node"])[0]
    this_host = netutils.Hostname.GetSysName()
    if master_name != this_host:
      raise NotMasterError("This is not the master node")

    # Old jobs are archived before any new one is submitted
    self.ArchiveJobs(opts.job_age)
    (self.instances, self.bootids, self.smap) = GetClusterData()
    self.started_instances = set()
    self.opts = opts
481 a8083063 Iustin Pop
482 a8083063 Iustin Pop
  def Run(self):
    """Execute the watcher checks in order.

    Instances are checked first, then the disks, and finally the disks
    are verified.

    """
    self.CheckInstances(self.notepad)
    self.CheckDisks(self.notepad)
    self.VerifyDisks()
490 5a3103e9 Michael Hanselmann
491 24edc6d4 Iustin Pop
  @staticmethod
  def ArchiveJobs(age):
    """Auto-archive old jobs through the LUXI client.

    @param age: the age value passed on to C{AutoArchiveJobs}

    """
    (archived, remaining) = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", archived, remaining)
498 f07521e5 Iustin Pop
499 5a3103e9 Michael Hanselmann
  def CheckDisks(self, notepad):
    """Look for restarted nodes and activate the disks depending on them.

    A node counts as restarted when its current boot ID differs from the
    one recorded in the notepad. For such nodes, the disks of the instances
    having them as secondary node are activated, except for non-autostart
    instances and instances already started during this run. The new boot
    IDs are recorded afterwards.

    @param notepad: the state object providing C{GetNodeBootID} and
        C{SetNodeBootID}

    """
    rebooted = []
    for (name, (new_id, offline)) in self.bootids.iteritems():
      old_id = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
      elif old_id != new_id:
        # The boot ID of the node has changed, probably through a reboot
        rebooted.append(name)

    # Activate the disks of all instances having one of the rebooted nodes
    # as a secondary node
    for node in rebooted:
      for instance_name in self.smap.get(node, []):
        instance = self.instances[instance_name]
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
        elif instance.name in self.started_instances:
          # the instance start was already attempted, which should have
          # activated its drives (if they can be at all)
          logging.debug("Skipping disk activation for instance %s, as"
                        " it was already started", instance.name)
        else:
          try:
            logging.info("Activating disks for instance %s", instance.name)
            instance.ActivateDisks()
          except Exception: # pylint: disable-msg=W0703
            logging.exception("Error while activating disks for instance %s",
                              instance.name)

    # Remember the changed boot IDs
    for name in rebooted:
      notepad.SetNodeBootID(name, self.bootids[name][0])
544 a8083063 Iustin Pop
545 5a3103e9 Michael Hanselmann
  def CheckInstances(self, notepad):
    """Go through all instances and restart the ones which are down.

    Instances in one of the L{BAD_STATES} are restarted as long as fewer
    than L{MAXTRIES} attempts are recorded; every attempt is recorded,
    whether it succeeded or not. For instances in any other state an
    existing restart record is removed.

    @param notepad: the state object keeping track of restart attempts

    """
    notepad.MaintainInstanceList(self.instances.keys())

    for instance in self.instances.values():
      if instance.state in BAD_STATES:
        attempts = notepad.NumberOfRestartAttempts(instance)

        if attempts > MAXTRIES:
          logging.warning("Not restarting instance %s, retries exhausted",
                          instance.name)
          continue

        if attempts == MAXTRIES:
          # Recording once more pushes the counter above the limit, hence
          # the message below is logged only once
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue

        attempt_info = " (Attempt #%d)" % (attempts + 1)
        try:
          logging.info("Restarting %s%s", instance.name, attempt_info)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception: # pylint: disable-msg=W0703
          logging.exception("Error while restarting instance %s",
                            instance.name)

        notepad.RecordRestartAttempt(instance)
      elif notepad.NumberOfRestartAttempts(instance):
        # The instance is no longer in a bad state, drop its record; only
        # outside of the helpless states this means the restart worked
        notepad.RemoveInstance(instance)
        if instance.state not in HELPLESS_STATES:
          logging.info("Restart of %s succeeded", instance.name)
582 a8083063 Iustin Pop
583 83e5e26f René Nussbaumer
  def _CheckForOfflineNodes(self, instance):
    """Tells whether any secondary node of an instance is marked offline.

    @param instance: The instance object
    @return: True if at least one of the instance's secondary nodes has its
        offline flag set in the boot ID table, False otherwise

    """
    # Look up all secondaries first, so an unknown node name is always
    # reported, no matter what the other nodes look like
    node_info = [self.bootids[node] for node in instance.snodes]

    return compat.any(offline for (_, offline) in node_info)
595 83e5e26f René Nussbaumer
596 83e5e26f René Nussbaumer
  def VerifyDisks(self):
    """Run gnt-cluster verify-disks and re-activate offline disks.

    A verify-disks opcode is submitted first; the jobs it reports under
    C{constants.JOB_IDS_KEY} are then waited for and the instances they
    list are collected. For all of these instances a single job activating
    their disks is submitted, except for instances which are in a helpless
    state, have an offline secondary node or are not known to this watcher
    run.

    """
    verify_job_id = client.SubmitJob([opcodes.OpClusterVerifyDisks()])
    verify_result = cli.PollJob(verify_job_id, cl=client,
                                feedback_fn=logging.debug)[0]
    client.ArchiveJob(verify_job_id)

    # Keep track of submitted jobs
    jex = cli.JobExecutor(cl=client, feedback_fn=logging.debug)

    archive_jobs = set()
    for (status, job_id) in verify_result[constants.JOB_IDS_KEY]:
      jex.AddJobId(None, status, job_id)
      if status:
        # Only jobs flagged with a true status are archived later on
        archive_jobs.add(job_id)

    offline_disk_instances = set()

    for (status, job_result) in jex.GetResults():
      if not status:
        logging.error("Verify-disks job failed: %s", job_result)
        continue

      # Each job result is a single three-element tuple of which only the
      # middle element (the instances) is of interest here
      ((_, instances, _), ) = job_result

      offline_disk_instances.update(instances)

    for job_id in archive_jobs:
      client.ArchiveJob(job_id)

    if not offline_disk_instances:
      # nothing to do
      logging.debug("verify-disks reported no offline disks, nothing to do")
      return

    logging.debug("Will activate disks for instance(s) %s",
                  utils.CommaJoin(offline_disk_instances))

    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = []
    for name in offline_disk_instances:
      try:
        instance = self.instances[name]
      except KeyError:
        # verify-disks may report an instance this watcher run has no
        # record of; skip it instead of aborting the whole pass
        logging.info("Skip instance %s because it is not known to the"
                     " watcher", name)
        continue

      if (instance.state in HELPLESS_STATES or
          self._CheckForOfflineNodes(instance)):
        logging.info("Skip instance %s because it is in helpless state or has"
                     " one offline secondary", name)
        continue
      job.append(opcodes.OpInstanceActivateDisks(instance_name=name))

    if job:
      activate_job_id = cli.SendJob(job, cl=client)

      try:
        cli.PollJob(activate_job_id, cl=client, feedback_fn=logging.debug)
      except Exception: # pylint: disable-msg=W0703
        logging.exception("Error while activating disks")
654 a8083063 Iustin Pop
655 a8083063 Iustin Pop
656 001b3825 Michael Hanselmann
def OpenStateFile(path):
657 001b3825 Michael Hanselmann
  """Opens the state file and acquires a lock on it.
658 001b3825 Michael Hanselmann

659 001b3825 Michael Hanselmann
  @type path: string
660 001b3825 Michael Hanselmann
  @param path: Path to state file
661 001b3825 Michael Hanselmann

662 001b3825 Michael Hanselmann
  """
663 001b3825 Michael Hanselmann
  # The two-step dance below is necessary to allow both opening existing
664 001b3825 Michael Hanselmann
  # file read/write and creating if not existing. Vanilla open will truncate
665 001b3825 Michael Hanselmann
  # an existing file -or- allow creating if not existing.
666 001b3825 Michael Hanselmann
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)
667 001b3825 Michael Hanselmann
668 001b3825 Michael Hanselmann
  # Try to acquire lock on state file. If this fails, another watcher instance
669 001b3825 Michael Hanselmann
  # might already be running or another program is temporarily blocking the
670 001b3825 Michael Hanselmann
  # watcher from running.
671 001b3825 Michael Hanselmann
  try:
672 001b3825 Michael Hanselmann
    utils.LockFile(statefile_fd)
673 001b3825 Michael Hanselmann
  except errors.LockError, err:
674 001b3825 Michael Hanselmann
    logging.error("Can't acquire lock on state file %s: %s", path, err)
675 001b3825 Michael Hanselmann
    return None
676 001b3825 Michael Hanselmann
677 001b3825 Michael Hanselmann
  return os.fdopen(statefile_fd, "w+")
678 001b3825 Michael Hanselmann
679 001b3825 Michael Hanselmann
680 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
681 db147305 Tom Limoncelli
  """Connects to RAPI port and does a simple test.
682 db147305 Tom Limoncelli

683 db147305 Tom Limoncelli
  Connects to RAPI port of hostname and does a simple test. At this time, the
684 db147305 Tom Limoncelli
  test is GetVersion.
685 db147305 Tom Limoncelli

686 db147305 Tom Limoncelli
  @type hostname: string
687 db147305 Tom Limoncelli
  @param hostname: hostname of the node to connect to.
688 db147305 Tom Limoncelli
  @rtype: bool
689 db147305 Tom Limoncelli
  @return: Whether RAPI is working properly
690 db147305 Tom Limoncelli

691 db147305 Tom Limoncelli
  """
692 34f06005 Iustin Pop
  curl_config = rapi.client.GenericCurlConfig()
693 2a7c3583 Michael Hanselmann
  rapi_client = rapi.client.GanetiRapiClient(hostname,
694 2a7c3583 Michael Hanselmann
                                             curl_config_fn=curl_config)
695 db147305 Tom Limoncelli
  try:
696 db147305 Tom Limoncelli
    master_version = rapi_client.GetVersion()
697 db147305 Tom Limoncelli
  except rapi.client.CertificateError, err:
698 db147305 Tom Limoncelli
    logging.warning("RAPI Error: CertificateError (%s)", err)
699 db147305 Tom Limoncelli
    return False
700 db147305 Tom Limoncelli
  except rapi.client.GanetiApiError, err:
701 db147305 Tom Limoncelli
    logging.warning("RAPI Error: GanetiApiError (%s)", err)
702 db147305 Tom Limoncelli
    return False
703 db147305 Tom Limoncelli
  logging.debug("RAPI Result: master_version is %s", master_version)
704 db147305 Tom Limoncelli
  return master_version == constants.RAPI_VERSION
705 db147305 Tom Limoncelli
706 db147305 Tom Limoncelli
707 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The job age option is converted through C{cli.ParseTimespec} before the
  options are handed back; positional arguments are rejected.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option(cli.DEBUG_OPT)
  parser.add_option("-A", "--job-age", dest="job_age", default=6 * 3600,
                    help="Autoarchive jobs older than this age"
                         " (default 6 hours)")
  parser.add_option("--ignore-pause", dest="ignore_pause", default=False,
                    action="store_true", help="Ignore cluster pause setting")

  (options, args) = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)

  # The watcher takes options only
  if args:
    parser.error("No arguments expected")

  return (options, args)
731 a8083063 Iustin Pop
732 a8083063 Iustin Pop
733 2a7c3583 Michael Hanselmann
@rapi.client.UsesRapiClient
734 9f4bb951 Michael Hanselmann
def Main():
735 a8083063 Iustin Pop
  """Main function.
736 a8083063 Iustin Pop

737 a8083063 Iustin Pop
  """
738 7260cfbe Iustin Pop
  global client # pylint: disable-msg=W0603
739 e125c67c Michael Hanselmann
740 f0a80b01 Michael Hanselmann
  (options, _) = ParseOptions()
741 a8083063 Iustin Pop
742 cfcc79c6 Michael Hanselmann
  utils.SetupLogging(constants.LOG_WATCHER, sys.argv[0],
743 cfcc79c6 Michael Hanselmann
                     debug=options.debug, stderr_logging=options.debug)
744 a8083063 Iustin Pop
745 46c8a6ab Iustin Pop
  if ShouldPause() and not options.ignore_pause:
746 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
747 9f4bb951 Michael Hanselmann
    return constants.EXIT_SUCCESS
748 3753b2cb Michael Hanselmann
749 001b3825 Michael Hanselmann
  statefile = OpenStateFile(constants.WATCHER_STATEFILE)
750 001b3825 Michael Hanselmann
  if not statefile:
751 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
752 001b3825 Michael Hanselmann
753 24edc6d4 Iustin Pop
  update_file = False
754 a8083063 Iustin Pop
  try:
755 f1115454 Guido Trotter
    StartNodeDaemons()
756 9e289e36 Guido Trotter
    RunWatcherHooks()
757 50273051 Iustin Pop
    # run node maintenance in all cases, even if master, so that old
758 50273051 Iustin Pop
    # masters can be properly cleaned up too
759 50273051 Iustin Pop
    if NodeMaintenance.ShouldRun():
760 50273051 Iustin Pop
      NodeMaintenance().Exec()
761 c4f0219c Iustin Pop
762 001b3825 Michael Hanselmann
    notepad = WatcherState(statefile)
763 781b2b2b Michael Hanselmann
    try:
764 2c404217 Iustin Pop
      try:
765 2c404217 Iustin Pop
        client = cli.GetClient()
766 2c404217 Iustin Pop
      except errors.OpPrereqError:
767 2c404217 Iustin Pop
        # this is, from cli.GetClient, a not-master case
768 7dfb83c2 Iustin Pop
        logging.debug("Not on master, exiting")
769 24edc6d4 Iustin Pop
        update_file = True
770 9f4bb951 Michael Hanselmann
        return constants.EXIT_SUCCESS
771 7dfb83c2 Iustin Pop
      except luxi.NoMasterError, err:
772 7dfb83c2 Iustin Pop
        logging.warning("Master seems to be down (%s), trying to restart",
773 7dfb83c2 Iustin Pop
                        str(err))
774 2826b361 Guido Trotter
        if not utils.EnsureDaemon(constants.MASTERD):
775 7dfb83c2 Iustin Pop
          logging.critical("Can't start the master, exiting")
776 9f4bb951 Michael Hanselmann
          return constants.EXIT_FAILURE
777 7dfb83c2 Iustin Pop
        # else retry the connection
778 7dfb83c2 Iustin Pop
        client = cli.GetClient()
779 cc962d58 Iustin Pop
780 83052f9e Guido Trotter
      # we are on master now
781 2826b361 Guido Trotter
      utils.EnsureDaemon(constants.RAPI)
782 c4f0219c Iustin Pop
783 db147305 Tom Limoncelli
      # If RAPI isn't responding to queries, try one restart.
784 db147305 Tom Limoncelli
      logging.debug("Attempting to talk with RAPI.")
785 9769bb78 Manuel Franceschini
      if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
786 db147305 Tom Limoncelli
        logging.warning("Couldn't get answer from Ganeti RAPI daemon."
787 db147305 Tom Limoncelli
                        " Restarting Ganeti RAPI.")
788 db147305 Tom Limoncelli
        utils.StopDaemon(constants.RAPI)
789 db147305 Tom Limoncelli
        utils.EnsureDaemon(constants.RAPI)
790 db147305 Tom Limoncelli
        logging.debug("Second attempt to talk with RAPI")
791 9769bb78 Manuel Franceschini
        if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
792 db147305 Tom Limoncelli
          logging.fatal("RAPI is not responding. Please investigate.")
793 db147305 Tom Limoncelli
      logging.debug("Successfully talked to RAPI.")
794 db147305 Tom Limoncelli
795 cc962d58 Iustin Pop
      try:
796 cc962d58 Iustin Pop
        watcher = Watcher(options, notepad)
797 cc962d58 Iustin Pop
      except errors.ConfigurationError:
798 cc962d58 Iustin Pop
        # Just exit if there's no configuration
799 24edc6d4 Iustin Pop
        update_file = True
800 9f4bb951 Michael Hanselmann
        return constants.EXIT_SUCCESS
801 e125c67c Michael Hanselmann
802 cc962d58 Iustin Pop
      watcher.Run()
803 24edc6d4 Iustin Pop
      update_file = True
804 24edc6d4 Iustin Pop
805 cc962d58 Iustin Pop
    finally:
806 7dfb83c2 Iustin Pop
      if update_file:
807 7dfb83c2 Iustin Pop
        notepad.Save()
808 7dfb83c2 Iustin Pop
      else:
809 7dfb83c2 Iustin Pop
        logging.debug("Not updating status file due to failure")
810 1b052f42 Michael Hanselmann
  except SystemExit:
811 1b052f42 Michael Hanselmann
    raise
812 38242904 Iustin Pop
  except NotMasterError:
813 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
814 9f4bb951 Michael Hanselmann
    return constants.EXIT_NOTMASTER
815 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
816 438b45d4 Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
817 9f4bb951 Michael Hanselmann
    return constants.EXIT_NODESETUP_ERROR
818 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
819 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
820 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
821 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
822 438b45d4 Michael Hanselmann
  except Exception, err:
823 001b3825 Michael Hanselmann
    logging.exception(str(err))
824 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
825 5a3103e9 Michael Hanselmann
826 9f4bb951 Michael Hanselmann
  return constants.EXIT_SUCCESS