Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 1e915b86

History | View | Annotate | Download (20.9 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 7260cfbe Iustin Pop
# pylint: disable-msg=C0103,W0142
31 7260cfbe Iustin Pop
32 7260cfbe Iustin Pop
# C0103: Invalid name ganeti-watcher
33 7260cfbe Iustin Pop
34 a8083063 Iustin Pop
import os
35 a8083063 Iustin Pop
import sys
36 a8083063 Iustin Pop
import time
37 438b45d4 Michael Hanselmann
import logging
38 a8083063 Iustin Pop
from optparse import OptionParser
39 a8083063 Iustin Pop
40 a8083063 Iustin Pop
from ganeti import utils
41 a8083063 Iustin Pop
from ganeti import constants
42 67fe61c4 Michael Hanselmann
from ganeti import serializer
43 89e1fc26 Iustin Pop
from ganeti import errors
44 e125c67c Michael Hanselmann
from ganeti import opcodes
45 e125c67c Michael Hanselmann
from ganeti import cli
46 7dfb83c2 Iustin Pop
from ganeti import luxi
47 50273051 Iustin Pop
from ganeti import ssconf
48 50273051 Iustin Pop
from ganeti import bdev
49 50273051 Iustin Pop
from ganeti import hypervisor
50 50273051 Iustin Pop
from ganeti.confd import client as confd_client
51 a8083063 Iustin Pop
52 a8083063 Iustin Pop
53 5a3103e9 Michael Hanselmann
# Upper bound on the automatic restart attempts made for a single instance
MAXTRIES = 5
# Instance states for which the watcher attempts a restart
BAD_STATES = ["ERROR_down"]
# Instance states for which no restart is attempted; any recorded restart
# attempts for such instances are dropped from the state
HELPLESS_STATES = ["ERROR_nodedown", "ERROR_nodeoffline"]
NOTICE = "NOTICE"
ERROR = "ERROR"
# Keys used inside the per-instance and per-node entries of the state data
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object, used for all job submissions and queries below.
# NOTE(review): it is only initialised to None here; presumably it is
# assigned at startup outside this section -- confirm.
client = None
65 e125c67c Michael Hanselmann
66 e125c67c Michael Hanselmann
67 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher is not running on the master node.

  """
69 a8083063 Iustin Pop
70 a8083063 Iustin Pop
71 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher is currently paused.

  @rtype: boolean
  @return: truth value of what L{utils.ReadWatcherPauseFile} returns for
      the watcher pause file

  """
  pause_state = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  return bool(pause_state)
76 3753b2cb Michael Hanselmann
77 3753b2cb Michael Hanselmann
78 f1115454 Guido Trotter
def StartNodeDaemons():
  """Make sure the daemons needed on every node are running.

  """
  # The node daemon is wanted whether or not this node is the master; confd
  # is started as well, on non candidates it will be in disabled mode.
  for daemon_name in (constants.NODED, constants.CONFD):
    utils.EnsureDaemon(daemon_name)
86 f1115454 Guido Trotter
87 f1115454 Guido Trotter
88 9e289e36 Guido Trotter
def RunWatcherHooks():
89 9e289e36 Guido Trotter
  """Run the watcher hooks.
90 9e289e36 Guido Trotter
91 9e289e36 Guido Trotter
  """
92 c4feafe8 Iustin Pop
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
93 c4feafe8 Iustin Pop
                             constants.HOOKS_NAME_WATCHER)
94 10e689d4 Iustin Pop
  if not os.path.isdir(hooks_dir):
95 10e689d4 Iustin Pop
    return
96 9e289e36 Guido Trotter
97 9e289e36 Guido Trotter
  try:
98 9e289e36 Guido Trotter
    results = utils.RunParts(hooks_dir)
99 9e289e36 Guido Trotter
  except Exception, msg: # pylint: disable-msg=W0703
100 9e289e36 Guido Trotter
    logging.critical("RunParts %s failed: %s", hooks_dir, msg)
101 9e289e36 Guido Trotter
102 9e289e36 Guido Trotter
  for (relname, status, runresult) in results:
103 9e289e36 Guido Trotter
    if status == constants.RUNPARTS_SKIP:
104 9e289e36 Guido Trotter
      logging.debug("Watcher hook %s: skipped", relname)
105 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_ERR:
106 9e289e36 Guido Trotter
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
107 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_RUN:
108 9e289e36 Guido Trotter
      if runresult.failed:
109 9e289e36 Guido Trotter
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
110 9e289e36 Guido Trotter
                        relname, runresult.exit_code, runresult.output)
111 9e289e36 Guido Trotter
      else:
112 9e289e36 Guido Trotter
        logging.debug("Watcher hook %s: success (output: %s)", relname,
113 9e289e36 Guido Trotter
                      runresult.output)
114 9e289e36 Guido Trotter
115 001b3825 Michael Hanselmann
116 50273051 Iustin Pop
class NodeMaintenance(object):
117 50273051 Iustin Pop
  """Talks to confd daemons and possible shutdown instances/drbd devices.
118 50273051 Iustin Pop
119 50273051 Iustin Pop
  """
120 50273051 Iustin Pop
  def __init__(self):
121 50273051 Iustin Pop
    self.store_cb = confd_client.StoreResultCallback()
122 50273051 Iustin Pop
    self.filter_cb = confd_client.ConfdFilterCallback(self.store_cb)
123 50273051 Iustin Pop
    self.confd_client = confd_client.GetConfdClient(self.filter_cb)
124 50273051 Iustin Pop
125 50273051 Iustin Pop
  @staticmethod
126 50273051 Iustin Pop
  def ShouldRun():
127 50273051 Iustin Pop
    """Checks whether node maintenance should run.
128 50273051 Iustin Pop
129 50273051 Iustin Pop
    """
130 50273051 Iustin Pop
    try:
131 50273051 Iustin Pop
      return ssconf.SimpleStore().GetMaintainNodeHealth()
132 50273051 Iustin Pop
    except errors.ConfigurationError, err:
133 50273051 Iustin Pop
      logging.error("Configuration error, not activating node maintenance: %s",
134 50273051 Iustin Pop
                    err)
135 50273051 Iustin Pop
      return False
136 50273051 Iustin Pop
137 50273051 Iustin Pop
  @staticmethod
138 50273051 Iustin Pop
  def GetRunningInstances():
139 50273051 Iustin Pop
    """Compute list of hypervisor/running instances.
140 50273051 Iustin Pop
141 50273051 Iustin Pop
    """
142 50273051 Iustin Pop
    hyp_list = ssconf.SimpleStore().GetHypervisorList()
143 50273051 Iustin Pop
    results = []
144 50273051 Iustin Pop
    for hv_name in hyp_list:
145 50273051 Iustin Pop
      try:
146 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
147 50273051 Iustin Pop
        ilist = hv.ListInstances()
148 50273051 Iustin Pop
        results.extend([(iname, hv_name) for iname in ilist])
149 50273051 Iustin Pop
      except: # pylint: disable-msg=W0702
150 50273051 Iustin Pop
        logging.error("Error while listing instances for hypervisor %s",
151 50273051 Iustin Pop
                      hv_name, exc_info=True)
152 50273051 Iustin Pop
    return results
153 50273051 Iustin Pop
154 50273051 Iustin Pop
  @staticmethod
155 50273051 Iustin Pop
  def GetUsedDRBDs():
156 50273051 Iustin Pop
    """Get list of used DRBD minors.
157 50273051 Iustin Pop
158 50273051 Iustin Pop
    """
159 50273051 Iustin Pop
    return bdev.DRBD8.GetUsedDevs().keys()
160 50273051 Iustin Pop
161 50273051 Iustin Pop
  @classmethod
162 50273051 Iustin Pop
  def DoMaintenance(cls, role):
163 50273051 Iustin Pop
    """Maintain the instance list.
164 50273051 Iustin Pop
165 50273051 Iustin Pop
    """
166 50273051 Iustin Pop
    if role == constants.CONFD_NODE_ROLE_OFFLINE:
167 50273051 Iustin Pop
      inst_running = cls.GetRunningInstances()
168 50273051 Iustin Pop
      cls.ShutdownInstances(inst_running)
169 50273051 Iustin Pop
      drbd_running = cls.GetUsedDRBDs()
170 50273051 Iustin Pop
      cls.ShutdownDRBD(drbd_running)
171 50273051 Iustin Pop
    else:
172 50273051 Iustin Pop
      logging.debug("Not doing anything for role %s", role)
173 50273051 Iustin Pop
174 50273051 Iustin Pop
  @staticmethod
175 50273051 Iustin Pop
  def ShutdownInstances(inst_running):
176 50273051 Iustin Pop
    """Shutdown running instances.
177 50273051 Iustin Pop
178 50273051 Iustin Pop
    """
179 50273051 Iustin Pop
    names_running = set([i[0] for i in inst_running])
180 50273051 Iustin Pop
    if names_running:
181 50273051 Iustin Pop
      logging.info("Following instances should not be running,"
182 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(names_running))
183 50273051 Iustin Pop
      # this dictionary will collapse duplicate instance names (only
184 50273051 Iustin Pop
      # xen pvm/vhm) into a single key, which is fine
185 50273051 Iustin Pop
      i2h = dict(inst_running)
186 50273051 Iustin Pop
      for name in names_running:
187 50273051 Iustin Pop
        hv_name = i2h[name]
188 50273051 Iustin Pop
        hv = hypervisor.GetHypervisor(hv_name)
189 50273051 Iustin Pop
        hv.StopInstance(None, force=True, name=name)
190 50273051 Iustin Pop
191 50273051 Iustin Pop
  @staticmethod
192 50273051 Iustin Pop
  def ShutdownDRBD(drbd_running):
193 50273051 Iustin Pop
    """Shutdown active DRBD devices.
194 50273051 Iustin Pop
195 50273051 Iustin Pop
    """
196 50273051 Iustin Pop
    if drbd_running:
197 50273051 Iustin Pop
      logging.info("Following DRBD minors should not be active,"
198 50273051 Iustin Pop
                   " shutting them down: %s", utils.CommaJoin(drbd_running))
199 50273051 Iustin Pop
      for minor in drbd_running:
200 50273051 Iustin Pop
        # pylint: disable-msg=W0212
201 50273051 Iustin Pop
        # using the private method as is, pending enhancements to the DRBD
202 50273051 Iustin Pop
        # interface
203 50273051 Iustin Pop
        bdev.DRBD8._ShutdownAll(minor)
204 50273051 Iustin Pop
205 50273051 Iustin Pop
  def Exec(self):
206 50273051 Iustin Pop
    """Check node status versus cluster desired state.
207 50273051 Iustin Pop
208 50273051 Iustin Pop
    """
209 50273051 Iustin Pop
    my_name = utils.HostInfo().name
210 50273051 Iustin Pop
    req = confd_client.ConfdClientRequest(type=
211 50273051 Iustin Pop
                                          constants.CONFD_REQ_NODE_ROLE_BYNAME,
212 50273051 Iustin Pop
                                          query=my_name)
213 ebacb943 Iustin Pop
    self.confd_client.SendRequest(req, async=False, coverage=-1)
214 50273051 Iustin Pop
    timed_out, _, _ = self.confd_client.WaitForReply(req.rsalt)
215 50273051 Iustin Pop
    if not timed_out:
216 50273051 Iustin Pop
      # should have a valid response
217 50273051 Iustin Pop
      status, result = self.store_cb.GetResponse(req.rsalt)
218 50273051 Iustin Pop
      assert status, "Missing result but received replies"
219 50273051 Iustin Pop
      if not self.filter_cb.consistent[req.rsalt]:
220 50273051 Iustin Pop
        logging.warning("Inconsistent replies, not doing anything")
221 50273051 Iustin Pop
        return
222 50273051 Iustin Pop
      self.DoMaintenance(result.server_reply.answer)
223 50273051 Iustin Pop
    else:
224 50273051 Iustin Pop
      logging.warning("Confd query timed out, cannot do maintenance actions")
225 50273051 Iustin Pop
226 50273051 Iustin Pop
227 5a3103e9 Michael Hanselmann
class WatcherState(object):
228 a8083063 Iustin Pop
  """Interface to a state file recording restart attempts.
229 a8083063 Iustin Pop
230 a8083063 Iustin Pop
  """
231 001b3825 Michael Hanselmann
  def __init__(self, statefile):
232 5a3103e9 Michael Hanselmann
    """Open, lock, read and parse the file.
233 5a3103e9 Michael Hanselmann
234 001b3825 Michael Hanselmann
    @type statefile: file
235 001b3825 Michael Hanselmann
    @param statefile: State file object
236 5a3103e9 Michael Hanselmann
237 5a3103e9 Michael Hanselmann
    """
238 001b3825 Michael Hanselmann
    self.statefile = statefile
239 a8083063 Iustin Pop
240 5a3103e9 Michael Hanselmann
    try:
241 2c404217 Iustin Pop
      state_data = self.statefile.read()
242 2c404217 Iustin Pop
      if not state_data:
243 2c404217 Iustin Pop
        self._data = {}
244 2c404217 Iustin Pop
      else:
245 2c404217 Iustin Pop
        self._data = serializer.Load(state_data)
246 7260cfbe Iustin Pop
    except Exception, msg: # pylint: disable-msg=W0703
247 5a3103e9 Michael Hanselmann
      # Ignore errors while loading the file and treat it as empty
248 b76f660d Michael Hanselmann
      self._data = {}
249 2c404217 Iustin Pop
      logging.warning(("Invalid state file. Using defaults."
250 438b45d4 Michael Hanselmann
                       " Error message: %s"), msg)
251 5a3103e9 Michael Hanselmann
252 b76f660d Michael Hanselmann
    if "instance" not in self._data:
253 b76f660d Michael Hanselmann
      self._data["instance"] = {}
254 b76f660d Michael Hanselmann
    if "node" not in self._data:
255 b76f660d Michael Hanselmann
      self._data["node"] = {}
256 5a3103e9 Michael Hanselmann
257 26517d45 Iustin Pop
    self._orig_data = serializer.Dump(self._data)
258 2fb96d39 Michael Hanselmann
259 fc428e32 Michael Hanselmann
  def Save(self):
260 fc428e32 Michael Hanselmann
    """Save state to file, then unlock and close it.
261 5a3103e9 Michael Hanselmann
262 5a3103e9 Michael Hanselmann
    """
263 fc428e32 Michael Hanselmann
    assert self.statefile
264 fc428e32 Michael Hanselmann
265 26517d45 Iustin Pop
    serialized_form = serializer.Dump(self._data)
266 26517d45 Iustin Pop
    if self._orig_data == serialized_form:
267 2fb96d39 Michael Hanselmann
      logging.debug("Data didn't change, just touching status file")
268 2fb96d39 Michael Hanselmann
      os.utime(constants.WATCHER_STATEFILE, None)
269 2fb96d39 Michael Hanselmann
      return
270 2fb96d39 Michael Hanselmann
271 fc428e32 Michael Hanselmann
    # We need to make sure the file is locked before renaming it, otherwise
272 fc428e32 Michael Hanselmann
    # starting ganeti-watcher again at the same time will create a conflict.
273 fc428e32 Michael Hanselmann
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
274 26517d45 Iustin Pop
                         data=serialized_form,
275 eb0f0ce0 Michael Hanselmann
                         prewrite=utils.LockFile, close=False)
276 fc428e32 Michael Hanselmann
    self.statefile = os.fdopen(fd, 'w+')
277 5a3103e9 Michael Hanselmann
278 fc428e32 Michael Hanselmann
  def Close(self):
279 5a3103e9 Michael Hanselmann
    """Unlock configuration file and close it.
280 5a3103e9 Michael Hanselmann
281 5a3103e9 Michael Hanselmann
    """
282 5a3103e9 Michael Hanselmann
    assert self.statefile
283 5a3103e9 Michael Hanselmann
284 fc428e32 Michael Hanselmann
    # Files are automatically unlocked when closing them
285 5a3103e9 Michael Hanselmann
    self.statefile.close()
286 5a3103e9 Michael Hanselmann
    self.statefile = None
287 5a3103e9 Michael Hanselmann
288 5a3103e9 Michael Hanselmann
  def GetNodeBootID(self, name):
289 5a3103e9 Michael Hanselmann
    """Returns the last boot ID of a node or None.
290 a8083063 Iustin Pop
291 5a3103e9 Michael Hanselmann
    """
292 b76f660d Michael Hanselmann
    ndata = self._data["node"]
293 5a3103e9 Michael Hanselmann
294 7b195d9b Michael Hanselmann
    if name in ndata and KEY_BOOT_ID in ndata[name]:
295 7b195d9b Michael Hanselmann
      return ndata[name][KEY_BOOT_ID]
296 5a3103e9 Michael Hanselmann
    return None
297 5a3103e9 Michael Hanselmann
298 5a3103e9 Michael Hanselmann
  def SetNodeBootID(self, name, bootid):
299 5a3103e9 Michael Hanselmann
    """Sets the boot ID of a node.
300 5a3103e9 Michael Hanselmann
301 5a3103e9 Michael Hanselmann
    """
302 5a3103e9 Michael Hanselmann
    assert bootid
303 a8083063 Iustin Pop
304 b76f660d Michael Hanselmann
    ndata = self._data["node"]
305 a8083063 Iustin Pop
306 5a3103e9 Michael Hanselmann
    if name not in ndata:
307 5a3103e9 Michael Hanselmann
      ndata[name] = {}
308 5a3103e9 Michael Hanselmann
309 7b195d9b Michael Hanselmann
    ndata[name][KEY_BOOT_ID] = bootid
310 5a3103e9 Michael Hanselmann
311 5a3103e9 Michael Hanselmann
  def NumberOfRestartAttempts(self, instance):
312 a8083063 Iustin Pop
    """Returns number of previous restart attempts.
313 a8083063 Iustin Pop
314 c41eea6e Iustin Pop
    @type instance: L{Instance}
315 c41eea6e Iustin Pop
    @param instance: the instance to look up
316 38242904 Iustin Pop
317 a8083063 Iustin Pop
    """
318 b76f660d Michael Hanselmann
    idata = self._data["instance"]
319 a8083063 Iustin Pop
320 5a3103e9 Michael Hanselmann
    if instance.name in idata:
321 7b195d9b Michael Hanselmann
      return idata[instance.name][KEY_RESTART_COUNT]
322 a8083063 Iustin Pop
323 a8083063 Iustin Pop
    return 0
324 a8083063 Iustin Pop
325 5a3103e9 Michael Hanselmann
  def RecordRestartAttempt(self, instance):
326 a8083063 Iustin Pop
    """Record a restart attempt.
327 a8083063 Iustin Pop
328 c41eea6e Iustin Pop
    @type instance: L{Instance}
329 c41eea6e Iustin Pop
    @param instance: the instance being restarted
330 38242904 Iustin Pop
331 a8083063 Iustin Pop
    """
332 b76f660d Michael Hanselmann
    idata = self._data["instance"]
333 a8083063 Iustin Pop
334 5a3103e9 Michael Hanselmann
    if instance.name not in idata:
335 5a3103e9 Michael Hanselmann
      inst = idata[instance.name] = {}
336 5a3103e9 Michael Hanselmann
    else:
337 5a3103e9 Michael Hanselmann
      inst = idata[instance.name]
338 a8083063 Iustin Pop
339 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_WHEN] = time.time()
340 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
341 a8083063 Iustin Pop
342 5a3103e9 Michael Hanselmann
  def RemoveInstance(self, instance):
343 c41eea6e Iustin Pop
    """Update state to reflect that a machine is running.
344 a8083063 Iustin Pop
345 c41eea6e Iustin Pop
    This method removes the record for a named instance (as we only
346 c41eea6e Iustin Pop
    track down instances).
347 a8083063 Iustin Pop
348 c41eea6e Iustin Pop
    @type instance: L{Instance}
349 c41eea6e Iustin Pop
    @param instance: the instance to remove from books
350 38242904 Iustin Pop
351 a8083063 Iustin Pop
    """
352 b76f660d Michael Hanselmann
    idata = self._data["instance"]
353 a8083063 Iustin Pop
354 5a3103e9 Michael Hanselmann
    if instance.name in idata:
355 5a3103e9 Michael Hanselmann
      del idata[instance.name]
356 a8083063 Iustin Pop
357 a8083063 Iustin Pop
358 a8083063 Iustin Pop
class Instance(object):
  """A virtual machine instance as seen by the watcher.

  """
  def __init__(self, name, state, autostart):
    """Store the instance's name, state and autostart flag.

    """
    self.name = name
    self.state = state
    self.autostart = autostart

  def Restart(self):
    """Submit an opcode starting this instance (without force).

    """
    startup_op = opcodes.OpStartupInstance(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Submit an opcode activating all disks of this instance.

    """
    activate_op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
380 a8083063 Iustin Pop
381 a8083063 Iustin Pop
382 6dfcc47b Iustin Pop
def GetClusterData():
  """Query the cluster for its instances and nodes.

  Both queries are submitted as one job. The name and status of every
  instance are also written to the instance status file.

  @rtype: tuple
  @return: (instances, nodes, smap), where instances maps instance names to
      L{Instance} objects, nodes maps node names to (bootid, offline) tuples
      and smap maps secondary node names to lists of instance names

  """
  instance_fields = ["name", "status", "admin_state", "snodes"]
  node_fields = ["name", "bootid", "offline"]
  query_instances = opcodes.OpQueryInstances(output_fields=instance_fields,
                                             names=[], use_locking=True)
  query_nodes = opcodes.OpQueryNodes(output_fields=node_fields, names=[],
                                     use_locking=True)

  job_id = client.SubmitJob([query_instances, query_nodes])
  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  instance_rows = all_results[0]

  # write the upfile: one "name status" line per instance
  up_lines = ["%s %s\n" % (row[0], row[1]) for row in instance_rows]
  utils.WriteFile(file_name=constants.INSTANCE_UPFILE, data="".join(up_lines))

  instances = {}
  smap = {}
  for (name, status, autostart, snodes) in instance_rows:
    # update the secondary node map
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart)

  nodes = {}
  for (name, bootid, offline) in all_results[1]:
    nodes[name] = (bootid, offline)

  client.ArchiveJob(job_id)

  return instances, nodes, smap
425 a8083063 Iustin Pop
426 a8083063 Iustin Pop
427 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Restart logic for virtual machines that are erroneously down.

  The calling program should periodically instantiate this class and call
  Run(), which goes over the list of instances and makes up to MAXTRIES
  attempts to restart those that are down.

  """
  def __init__(self, opts, notepad):
    """Check that this is the master node and fetch the cluster data.

    @param opts: parsed options; the job_age attribute is used for
        archiving old jobs
    @param notepad: state object used to track restart attempts and node
        boot IDs
    @raise NotMasterError: if this node is not the master node

    """
    self.notepad = notepad
    master_name = client.QueryConfigValues(["master_node"])[0]
    if master_name != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    # old jobs are archived first...
    self.ArchiveJobs(opts.job_age)
    # ...and only then are new ones submitted
    (instances, bootids, smap) = GetClusterData()
    self.instances = instances
    self.bootids = bootids
    self.smap = smap
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Run the checks in order: instances, disks, verify-disks.

    """
    state = self.notepad
    self.CheckInstances(state)
    self.CheckDisks(state)
    self.VerifyDisks()

  @staticmethod
  def ArchiveJobs(age):
    """Auto-archive jobs according to the given age.

    @param age: passed unchanged to the client's AutoArchiveJobs

    """
    (archived, remaining) = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", archived, remaining)

  def CheckDisks(self, notepad):
    """Activate disks of instances whose secondary node has rebooted.

    A node counts as rebooted when its current boot ID differs from the one
    stored in the notepad; the new boot ID is stored afterwards.

    @param notepad: state object holding the last known node boot IDs

    """
    rebooted = []
    for name, (new_id, offline) in self.bootids.iteritems():
      old_id = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
      elif old_id != new_id:
        # Node's boot ID has changed, probably through a reboot.
        rebooted.append(name)

    if not rebooted:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for node in rebooted:
      for instance_name in self.smap.get(node, []):
        instance = self.instances[instance_name]
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
        elif instance.name not in self.started_instances:
          # instances we already tried to start are left alone, as the
          # start should have activated their drives (if they can be at all)
          try:
            logging.info("Activating disks for instance %s", instance.name)
            instance.ActivateDisks()
          except Exception: # pylint: disable-msg=W0703
            logging.exception("Error while activating disks for instance %s",
                              instance.name)

    # Keep changed boot IDs
    for node in rebooted:
      notepad.SetNodeBootID(node, self.bootids[node][0])

  def CheckInstances(self, notepad):
    """Go over all instances and try to restart the ones that are down.

    @param notepad: state object recording the restart attempts

    """
    for instance in self.instances.values():
      if instance.state not in BAD_STATES:
        # Nothing to restart; forget earlier attempts, if any
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          if instance.state not in HELPLESS_STATES:
            logging.info("Restart of %s succeeded", instance.name)
        continue

      attempts = notepad.NumberOfRestartAttempts(instance)

      if attempts > MAXTRIES:
        # stay quiet.
        continue

      if attempts >= MAXTRIES:
        # Limit just reached: record once more so that later runs stay
        # quiet, and report that we are giving up
        notepad.RecordRestartAttempt(instance)
        logging.error("Could not restart %s after %d attempts, giving up",
                      instance.name, MAXTRIES)
        continue

      attempt_label = " (Attempt #%d)" % (attempts + 1)
      try:
        logging.info("Restarting %s%s", instance.name, attempt_label)
        instance.Restart()
        self.started_instances.add(instance.name)
      except Exception: # pylint: disable-msg=W0703
        logging.exception("Error while restarting instance %s",
                          instance.name)

      # the attempt is counted whether or not it succeeded
      notepad.RecordRestartAttempt(instance)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks and activate the disks found offline.

    """
    verify_job = client.SubmitJob([opcodes.OpVerifyDisks()])
    result = cli.PollJob(verify_job, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(verify_job)
    if not isinstance(result, (tuple, list)):
      logging.error("Can't get a valid result from verify-disks")
      return

    # the third element of the result holds the instances to act on
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return

    logging.debug("Will activate disks for instances %s",
                  utils.CommaJoin(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    activate_ops = [opcodes.OpActivateInstanceDisks(instance_name=name)
                    for name in offline_disk_instances]
    activate_job = cli.SendJob(activate_ops, cl=client)

    try:
      cli.PollJob(activate_job, cl=client, feedback_fn=logging.debug)
    except Exception: # pylint: disable-msg=W0703
      logging.exception("Error while activating disks")
572 a8083063 Iustin Pop
573 a8083063 Iustin Pop
574 001b3825 Michael Hanselmann
def OpenStateFile(path):
575 001b3825 Michael Hanselmann
  """Opens the state file and acquires a lock on it.
576 001b3825 Michael Hanselmann
577 001b3825 Michael Hanselmann
  @type path: string
578 001b3825 Michael Hanselmann
  @param path: Path to state file
579 001b3825 Michael Hanselmann
580 001b3825 Michael Hanselmann
  """
581 001b3825 Michael Hanselmann
  # The two-step dance below is necessary to allow both opening existing
582 001b3825 Michael Hanselmann
  # file read/write and creating if not existing. Vanilla open will truncate
583 001b3825 Michael Hanselmann
  # an existing file -or- allow creating if not existing.
584 001b3825 Michael Hanselmann
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)
585 001b3825 Michael Hanselmann
586 001b3825 Michael Hanselmann
  # Try to acquire lock on state file. If this fails, another watcher instance
587 001b3825 Michael Hanselmann
  # might already be running or another program is temporarily blocking the
588 001b3825 Michael Hanselmann
  # watcher from running.
589 001b3825 Michael Hanselmann
  try:
590 001b3825 Michael Hanselmann
    utils.LockFile(statefile_fd)
591 001b3825 Michael Hanselmann
  except errors.LockError, err:
592 001b3825 Michael Hanselmann
    logging.error("Can't acquire lock on state file %s: %s", path, err)
593 001b3825 Michael Hanselmann
    return None
594 001b3825 Michael Hanselmann
595 001b3825 Michael Hanselmann
  return os.fdopen(statefile_fd, "w+")
596 001b3825 Michael Hanselmann
597 001b3825 Michael Hanselmann
598 a8083063 Iustin Pop
def ParseOptions():
  """Build the watcher's option parser and parse the command line.

  Besides the shared debug option, "-A/--job-age" is accepted; its value
  (or the six hour default) is run through cli.ParseTimespec before the
  options are handed back.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  option_parser = OptionParser(description="Ganeti cluster watcher",
                               usage="%prog [-d]",
                               version=version_text)

  option_parser.add_option(cli.DEBUG_OPT)
  option_parser.add_option("-A", "--job-age", dest="job_age",
                           default=6*3600,
                           help=("Autoarchive jobs older than this age"
                                 " (default 6 hours)"))

  parsed_options, leftover_args = option_parser.parse_args()
  parsed_options.job_age = cli.ParseTimespec(parsed_options.job_age)

  return parsed_options, leftover_args
616 a8083063 Iustin Pop
617 a8083063 Iustin Pop
618 a8083063 Iustin Pop
def main():
619 a8083063 Iustin Pop
  """Main function.
620 a8083063 Iustin Pop
621 a8083063 Iustin Pop
  """
622 7260cfbe Iustin Pop
  global client # pylint: disable-msg=W0603
623 e125c67c Michael Hanselmann
624 f93427cd Iustin Pop
  options, args = ParseOptions()
625 f93427cd Iustin Pop
626 f93427cd Iustin Pop
  if args: # watcher doesn't take any arguments
627 f93427cd Iustin Pop
    print >> sys.stderr, ("Usage: %s [-f] " % sys.argv[0])
628 f93427cd Iustin Pop
    sys.exit(constants.EXIT_FAILURE)
629 a8083063 Iustin Pop
630 82d9caef Iustin Pop
  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
631 82d9caef Iustin Pop
                     stderr_logging=options.debug)
632 a8083063 Iustin Pop
633 3753b2cb Michael Hanselmann
  if ShouldPause():
634 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
635 3753b2cb Michael Hanselmann
    sys.exit(constants.EXIT_SUCCESS)
636 3753b2cb Michael Hanselmann
637 001b3825 Michael Hanselmann
  statefile = OpenStateFile(constants.WATCHER_STATEFILE)
638 001b3825 Michael Hanselmann
  if not statefile:
639 001b3825 Michael Hanselmann
    sys.exit(constants.EXIT_FAILURE)
640 001b3825 Michael Hanselmann
641 24edc6d4 Iustin Pop
  update_file = False
642 a8083063 Iustin Pop
  try:
643 f1115454 Guido Trotter
    StartNodeDaemons()
644 9e289e36 Guido Trotter
    RunWatcherHooks()
645 50273051 Iustin Pop
    # run node maintenance in all cases, even if master, so that old
646 50273051 Iustin Pop
    # masters can be properly cleaned up too
647 50273051 Iustin Pop
    if NodeMaintenance.ShouldRun():
648 50273051 Iustin Pop
      NodeMaintenance().Exec()
649 c4f0219c Iustin Pop
650 001b3825 Michael Hanselmann
    notepad = WatcherState(statefile)
651 781b2b2b Michael Hanselmann
    try:
652 2c404217 Iustin Pop
      try:
653 2c404217 Iustin Pop
        client = cli.GetClient()
654 2c404217 Iustin Pop
      except errors.OpPrereqError:
655 2c404217 Iustin Pop
        # this is, from cli.GetClient, a not-master case
656 7dfb83c2 Iustin Pop
        logging.debug("Not on master, exiting")
657 24edc6d4 Iustin Pop
        update_file = True
658 2c404217 Iustin Pop
        sys.exit(constants.EXIT_SUCCESS)
659 7dfb83c2 Iustin Pop
      except luxi.NoMasterError, err:
660 7dfb83c2 Iustin Pop
        logging.warning("Master seems to be down (%s), trying to restart",
661 7dfb83c2 Iustin Pop
                        str(err))
662 2826b361 Guido Trotter
        if not utils.EnsureDaemon(constants.MASTERD):
663 7dfb83c2 Iustin Pop
          logging.critical("Can't start the master, exiting")
664 7dfb83c2 Iustin Pop
          sys.exit(constants.EXIT_FAILURE)
665 7dfb83c2 Iustin Pop
        # else retry the connection
666 7dfb83c2 Iustin Pop
        client = cli.GetClient()
667 cc962d58 Iustin Pop
668 83052f9e Guido Trotter
      # we are on master now
669 2826b361 Guido Trotter
      utils.EnsureDaemon(constants.RAPI)
670 c4f0219c Iustin Pop
671 cc962d58 Iustin Pop
      try:
672 cc962d58 Iustin Pop
        watcher = Watcher(options, notepad)
673 cc962d58 Iustin Pop
      except errors.ConfigurationError:
674 cc962d58 Iustin Pop
        # Just exit if there's no configuration
675 24edc6d4 Iustin Pop
        update_file = True
676 cc962d58 Iustin Pop
        sys.exit(constants.EXIT_SUCCESS)
677 e125c67c Michael Hanselmann
678 cc962d58 Iustin Pop
      watcher.Run()
679 24edc6d4 Iustin Pop
      update_file = True
680 24edc6d4 Iustin Pop
681 cc962d58 Iustin Pop
    finally:
682 7dfb83c2 Iustin Pop
      if update_file:
683 7dfb83c2 Iustin Pop
        notepad.Save()
684 7dfb83c2 Iustin Pop
      else:
685 7dfb83c2 Iustin Pop
        logging.debug("Not updating status file due to failure")
686 1b052f42 Michael Hanselmann
  except SystemExit:
687 1b052f42 Michael Hanselmann
    raise
688 38242904 Iustin Pop
  except NotMasterError:
689 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
690 38242904 Iustin Pop
    sys.exit(constants.EXIT_NOTMASTER)
691 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
692 438b45d4 Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
693 89e1fc26 Iustin Pop
    sys.exit(constants.EXIT_NODESETUP_ERROR)
694 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
695 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
696 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
697 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
698 438b45d4 Michael Hanselmann
  except Exception, err:
699 001b3825 Michael Hanselmann
    logging.exception(str(err))
700 438b45d4 Michael Hanselmann
    sys.exit(constants.EXIT_FAILURE)
701 a8083063 Iustin Pop
702 5a3103e9 Michael Hanselmann
703 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported
if __name__ == "__main__":
  main()