Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ c4feafe8

History | View | Annotate | Download (17 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 7260cfbe Iustin Pop
# pylint: disable-msg=C0103,W0142
31 7260cfbe Iustin Pop
32 7260cfbe Iustin Pop
# C0103: Invalid name ganeti-watcher
33 7260cfbe Iustin Pop
34 a8083063 Iustin Pop
import os
35 a8083063 Iustin Pop
import sys
36 a8083063 Iustin Pop
import time
37 438b45d4 Michael Hanselmann
import logging
38 a8083063 Iustin Pop
from optparse import OptionParser
39 a8083063 Iustin Pop
40 a8083063 Iustin Pop
from ganeti import utils
41 a8083063 Iustin Pop
from ganeti import constants
42 67fe61c4 Michael Hanselmann
from ganeti import serializer
43 89e1fc26 Iustin Pop
from ganeti import errors
44 e125c67c Michael Hanselmann
from ganeti import opcodes
45 e125c67c Michael Hanselmann
from ganeti import cli
46 7dfb83c2 Iustin Pop
from ganeti import luxi
47 a8083063 Iustin Pop
48 a8083063 Iustin Pop
49 5a3103e9 Michael Hanselmann
# Number of restart attempts made for a down instance before giving up
MAXTRIES = 5
# Instance states for which a restart is attempted
BAD_STATES = ["ERROR_down"]
# Instance states for which no restart is attempted; restart bookkeeping
# recorded for such instances is dropped
HELPLESS_STATES = ["ERROR_nodedown", "ERROR_nodeoffline"]
NOTICE = "NOTICE"
ERROR = "ERROR"
# Keys used inside the per-instance and per-node entries of the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Client object shared by the functions and classes below; assigned in main()
client = None
61 e125c67c Michael Hanselmann
62 e125c67c Michael Hanselmann
63 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher is not running on the master node."""
65 a8083063 Iustin Pop
66 a8083063 Iustin Pop
67 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher has been asked to pause.

  @rtype: boolean
  @return: the truth value of what reading the watcher pause file yields

  """
  pause_value = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  return bool(pause_value)
72 3753b2cb Michael Hanselmann
73 3753b2cb Michael Hanselmann
74 f1115454 Guido Trotter
def StartNodeDaemons():
  """Ensure the daemons needed on every node are running.

  """
  # The node daemon is wanted whether or not this node is the master; confd
  # is started as well and will be in disabled mode on non candidates.
  for daemon in (constants.NODED, constants.CONFD):
    utils.EnsureDaemon(daemon)
82 f1115454 Guido Trotter
83 f1115454 Guido Trotter
84 9e289e36 Guido Trotter
def RunWatcherHooks():
85 9e289e36 Guido Trotter
  """Run the watcher hooks.
86 9e289e36 Guido Trotter
87 9e289e36 Guido Trotter
  """
88 c4feafe8 Iustin Pop
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
89 c4feafe8 Iustin Pop
                             constants.HOOKS_NAME_WATCHER)
90 9e289e36 Guido Trotter
91 9e289e36 Guido Trotter
  try:
92 9e289e36 Guido Trotter
    results = utils.RunParts(hooks_dir)
93 9e289e36 Guido Trotter
  except Exception, msg: # pylint: disable-msg=W0703
94 9e289e36 Guido Trotter
    logging.critical("RunParts %s failed: %s", hooks_dir, msg)
95 9e289e36 Guido Trotter
96 9e289e36 Guido Trotter
  for (relname, status, runresult) in results:
97 9e289e36 Guido Trotter
    if status == constants.RUNPARTS_SKIP:
98 9e289e36 Guido Trotter
      logging.debug("Watcher hook %s: skipped", relname)
99 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_ERR:
100 9e289e36 Guido Trotter
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
101 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_RUN:
102 9e289e36 Guido Trotter
      if runresult.failed:
103 9e289e36 Guido Trotter
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
104 9e289e36 Guido Trotter
                        relname, runresult.exit_code, runresult.output)
105 9e289e36 Guido Trotter
      else:
106 9e289e36 Guido Trotter
        logging.debug("Watcher hook %s: success (output: %s)", relname,
107 9e289e36 Guido Trotter
                      runresult.output)
108 9e289e36 Guido Trotter
109 001b3825 Michael Hanselmann
110 5a3103e9 Michael Hanselmann
class WatcherState(object):
111 a8083063 Iustin Pop
  """Interface to a state file recording restart attempts.
112 a8083063 Iustin Pop
113 a8083063 Iustin Pop
  """
114 001b3825 Michael Hanselmann
  def __init__(self, statefile):
115 5a3103e9 Michael Hanselmann
    """Open, lock, read and parse the file.
116 5a3103e9 Michael Hanselmann
117 001b3825 Michael Hanselmann
    @type statefile: file
118 001b3825 Michael Hanselmann
    @param statefile: State file object
119 5a3103e9 Michael Hanselmann
120 5a3103e9 Michael Hanselmann
    """
121 001b3825 Michael Hanselmann
    self.statefile = statefile
122 a8083063 Iustin Pop
123 5a3103e9 Michael Hanselmann
    try:
124 2c404217 Iustin Pop
      state_data = self.statefile.read()
125 2c404217 Iustin Pop
      if not state_data:
126 2c404217 Iustin Pop
        self._data = {}
127 2c404217 Iustin Pop
      else:
128 2c404217 Iustin Pop
        self._data = serializer.Load(state_data)
129 7260cfbe Iustin Pop
    except Exception, msg: # pylint: disable-msg=W0703
130 5a3103e9 Michael Hanselmann
      # Ignore errors while loading the file and treat it as empty
131 b76f660d Michael Hanselmann
      self._data = {}
132 2c404217 Iustin Pop
      logging.warning(("Invalid state file. Using defaults."
133 438b45d4 Michael Hanselmann
                       " Error message: %s"), msg)
134 5a3103e9 Michael Hanselmann
135 b76f660d Michael Hanselmann
    if "instance" not in self._data:
136 b76f660d Michael Hanselmann
      self._data["instance"] = {}
137 b76f660d Michael Hanselmann
    if "node" not in self._data:
138 b76f660d Michael Hanselmann
      self._data["node"] = {}
139 5a3103e9 Michael Hanselmann
140 26517d45 Iustin Pop
    self._orig_data = serializer.Dump(self._data)
141 2fb96d39 Michael Hanselmann
142 fc428e32 Michael Hanselmann
  def Save(self):
143 fc428e32 Michael Hanselmann
    """Save state to file, then unlock and close it.
144 5a3103e9 Michael Hanselmann
145 5a3103e9 Michael Hanselmann
    """
146 fc428e32 Michael Hanselmann
    assert self.statefile
147 fc428e32 Michael Hanselmann
148 26517d45 Iustin Pop
    serialized_form = serializer.Dump(self._data)
149 26517d45 Iustin Pop
    if self._orig_data == serialized_form:
150 2fb96d39 Michael Hanselmann
      logging.debug("Data didn't change, just touching status file")
151 2fb96d39 Michael Hanselmann
      os.utime(constants.WATCHER_STATEFILE, None)
152 2fb96d39 Michael Hanselmann
      return
153 2fb96d39 Michael Hanselmann
154 fc428e32 Michael Hanselmann
    # We need to make sure the file is locked before renaming it, otherwise
155 fc428e32 Michael Hanselmann
    # starting ganeti-watcher again at the same time will create a conflict.
156 fc428e32 Michael Hanselmann
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
157 26517d45 Iustin Pop
                         data=serialized_form,
158 eb0f0ce0 Michael Hanselmann
                         prewrite=utils.LockFile, close=False)
159 fc428e32 Michael Hanselmann
    self.statefile = os.fdopen(fd, 'w+')
160 5a3103e9 Michael Hanselmann
161 fc428e32 Michael Hanselmann
  def Close(self):
162 5a3103e9 Michael Hanselmann
    """Unlock configuration file and close it.
163 5a3103e9 Michael Hanselmann
164 5a3103e9 Michael Hanselmann
    """
165 5a3103e9 Michael Hanselmann
    assert self.statefile
166 5a3103e9 Michael Hanselmann
167 fc428e32 Michael Hanselmann
    # Files are automatically unlocked when closing them
168 5a3103e9 Michael Hanselmann
    self.statefile.close()
169 5a3103e9 Michael Hanselmann
    self.statefile = None
170 5a3103e9 Michael Hanselmann
171 5a3103e9 Michael Hanselmann
  def GetNodeBootID(self, name):
172 5a3103e9 Michael Hanselmann
    """Returns the last boot ID of a node or None.
173 a8083063 Iustin Pop
174 5a3103e9 Michael Hanselmann
    """
175 b76f660d Michael Hanselmann
    ndata = self._data["node"]
176 5a3103e9 Michael Hanselmann
177 7b195d9b Michael Hanselmann
    if name in ndata and KEY_BOOT_ID in ndata[name]:
178 7b195d9b Michael Hanselmann
      return ndata[name][KEY_BOOT_ID]
179 5a3103e9 Michael Hanselmann
    return None
180 5a3103e9 Michael Hanselmann
181 5a3103e9 Michael Hanselmann
  def SetNodeBootID(self, name, bootid):
182 5a3103e9 Michael Hanselmann
    """Sets the boot ID of a node.
183 5a3103e9 Michael Hanselmann
184 5a3103e9 Michael Hanselmann
    """
185 5a3103e9 Michael Hanselmann
    assert bootid
186 a8083063 Iustin Pop
187 b76f660d Michael Hanselmann
    ndata = self._data["node"]
188 a8083063 Iustin Pop
189 5a3103e9 Michael Hanselmann
    if name not in ndata:
190 5a3103e9 Michael Hanselmann
      ndata[name] = {}
191 5a3103e9 Michael Hanselmann
192 7b195d9b Michael Hanselmann
    ndata[name][KEY_BOOT_ID] = bootid
193 5a3103e9 Michael Hanselmann
194 5a3103e9 Michael Hanselmann
  def NumberOfRestartAttempts(self, instance):
195 a8083063 Iustin Pop
    """Returns number of previous restart attempts.
196 a8083063 Iustin Pop
197 c41eea6e Iustin Pop
    @type instance: L{Instance}
198 c41eea6e Iustin Pop
    @param instance: the instance to look up
199 38242904 Iustin Pop
200 a8083063 Iustin Pop
    """
201 b76f660d Michael Hanselmann
    idata = self._data["instance"]
202 a8083063 Iustin Pop
203 5a3103e9 Michael Hanselmann
    if instance.name in idata:
204 7b195d9b Michael Hanselmann
      return idata[instance.name][KEY_RESTART_COUNT]
205 a8083063 Iustin Pop
206 a8083063 Iustin Pop
    return 0
207 a8083063 Iustin Pop
208 5a3103e9 Michael Hanselmann
  def RecordRestartAttempt(self, instance):
209 a8083063 Iustin Pop
    """Record a restart attempt.
210 a8083063 Iustin Pop
211 c41eea6e Iustin Pop
    @type instance: L{Instance}
212 c41eea6e Iustin Pop
    @param instance: the instance being restarted
213 38242904 Iustin Pop
214 a8083063 Iustin Pop
    """
215 b76f660d Michael Hanselmann
    idata = self._data["instance"]
216 a8083063 Iustin Pop
217 5a3103e9 Michael Hanselmann
    if instance.name not in idata:
218 5a3103e9 Michael Hanselmann
      inst = idata[instance.name] = {}
219 5a3103e9 Michael Hanselmann
    else:
220 5a3103e9 Michael Hanselmann
      inst = idata[instance.name]
221 a8083063 Iustin Pop
222 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_WHEN] = time.time()
223 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
224 a8083063 Iustin Pop
225 5a3103e9 Michael Hanselmann
  def RemoveInstance(self, instance):
226 c41eea6e Iustin Pop
    """Update state to reflect that a machine is running.
227 a8083063 Iustin Pop
228 c41eea6e Iustin Pop
    This method removes the record for a named instance (as we only
229 c41eea6e Iustin Pop
    track down instances).
230 a8083063 Iustin Pop
231 c41eea6e Iustin Pop
    @type instance: L{Instance}
232 c41eea6e Iustin Pop
    @param instance: the instance to remove from books
233 38242904 Iustin Pop
234 a8083063 Iustin Pop
    """
235 b76f660d Michael Hanselmann
    idata = self._data["instance"]
236 a8083063 Iustin Pop
237 5a3103e9 Michael Hanselmann
    if instance.name in idata:
238 5a3103e9 Michael Hanselmann
      del idata[instance.name]
239 a8083063 Iustin Pop
240 a8083063 Iustin Pop
241 a8083063 Iustin Pop
class Instance(object):
  """A virtual machine instance as seen by the watcher.

  """
  def __init__(self, name, state, autostart):
    """Store the name, state and autostart flag of the instance.

    """
    self.name = name
    self.state = state
    self.autostart = autostart

  def Restart(self):
    """Submit the opcode that starts this instance (without force).

    """
    startup_op = opcodes.OpStartupInstance(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Submit the opcode that activates all disks of this instance.

    """
    activate_op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
263 a8083063 Iustin Pop
264 a8083063 Iustin Pop
265 6dfcc47b Iustin Pop
def GetClusterData():
  """Query instance and node data from the cluster.

  Both queries are submitted as a single job, which is archived once its
  results have been processed. As a side effect the name and status of
  every instance are written to the instance status file.

  @rtype: tuple
  @return: (instances, nodes, smap), where instances maps instance names to
      L{Instance} objects, nodes maps node names to (bootid, offline) tuples
      and smap maps each secondary node name to the list of names of the
      instances using it

  """
  instance_fields = ["name", "status", "admin_state", "snodes"]
  instance_query = opcodes.OpQueryInstances(output_fields=instance_fields,
                                            names=[], use_locking=True)
  node_fields = ["name", "bootid", "offline"]
  node_query = opcodes.OpQueryNodes(output_fields=node_fields, names=[],
                                    use_locking=True)

  job_id = client.SubmitJob([instance_query, node_query])

  job_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  instance_rows = job_results[0]
  smap = {}
  instances = {}

  # write the upfile: one "name status" line per instance
  up_lines = ["%s %s\n" % (row[0], row[1]) for row in instance_rows]
  utils.WriteFile(file_name=constants.INSTANCE_UPFILE, data="".join(up_lines))

  for (name, status, autostart, snodes) in instance_rows:
    # update the secondary node map
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart)

  nodes = {}
  for (name, bootid, offline) in job_results[1]:
    nodes[name] = (bootid, offline)

  client.ArchiveJob(job_id)

  return instances, nodes, smap
308 a8083063 Iustin Pop
309 a8083063 Iustin Pop
310 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self, opts, notepad):
    """Verify we run on the master, archive old jobs and query the cluster.

    @param opts: the parsed command line options; C{job_age} is read
    @type notepad: L{WatcherState}
    @param notepad: the watcher state to consult and update
    @raise NotMasterError: if the configured master node is not this host

    """
    self.notepad = notepad
    master_name = client.QueryConfigValues(["master_node"])[0]
    if master_name != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    # Old jobs are archived first, and only then are new ones submitted
    self.ArchiveJobs(opts.job_age)
    (self.instances, self.bootids, self.smap) = GetClusterData()
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Watcher run sequence.

    Checks the instances first, then the disks of instances on rebooted
    nodes, and finally runs the disk verification.

    """
    state = self.notepad
    self.CheckInstances(state)
    self.CheckDisks(state)
    self.VerifyDisks()

  @staticmethod
  def ArchiveJobs(age):
    """Archive old jobs.

    @param age: passed to the client's job auto-archival as the age limit

    """
    (archived, remaining) = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", archived, remaining)

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    A node whose current boot ID differs from the recorded one is taken as
    restarted; disks are then activated for the autostart instances mapped
    to that node, and the new boot ID is recorded.

    """
    changed_nodes = []
    for name, (new_id, offline) in self.bootids.iteritems():
      old_id = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
        continue
      if old_id != new_id:
        # Node's boot ID has changed, probably through a reboot.
        changed_nodes.append(name)

    if not changed_nodes:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for node in changed_nodes:
      for instance_name in self.smap.get(node, []):
        instance = self.instances[instance_name]
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception: # pylint: disable-msg=W0703
          logging.exception("Error while activating disks for instance %s",
                            instance.name)

    # Keep changed boot IDs
    for name in changed_nodes:
      notepad.SetNodeBootID(name, self.bootids[name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    """
    for instance in self.instances.values():
      if instance.state not in BAD_STATES:
        # Nothing to restart: forget any recorded attempts, and report
        # success unless the instance is in a state we cannot help with
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          if instance.state not in HELPLESS_STATES:
            logging.info("Restart of %s succeeded", instance.name)
        continue

      attempts = notepad.NumberOfRestartAttempts(instance)

      if attempts > MAXTRIES:
        # stay quiet.
        continue

      if not attempts < MAXTRIES:
        # The limit was just reached: count once more, so that later runs
        # stay quiet, and report that we give up
        notepad.RecordRestartAttempt(instance)
        logging.error("Could not restart %s after %d attempts, giving up",
                      instance.name, MAXTRIES)
        continue

      last = " (Attempt #%d)" % (attempts + 1)
      try:
        logging.info("Restarting %s%s",
                     instance.name, last)
        instance.Restart()
        self.started_instances.add(instance.name)
      except Exception: # pylint: disable-msg=W0703
        logging.exception("Error while restarting instance %s",
                          instance.name)

      # The attempt is counted whether or not it raised an error
      notepad.RecordRestartAttempt(instance)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    The third element of the verification result is used as the list of
    instances whose disks need to be activated.

    """
    verify_job = client.SubmitJob([opcodes.OpVerifyDisks()])
    result = cli.PollJob(verify_job, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(verify_job)
    if not isinstance(result, (tuple, list)):
      logging.error("Can't get a valid result from verify-disks")
      return

    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return

    logging.debug("Will activate disks for instances %s",
                  utils.CommaJoin(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    activate_ops = [opcodes.OpActivateInstanceDisks(instance_name=name)
                    for name in offline_disk_instances]
    activate_job = cli.SendJob(activate_ops, cl=client)

    try:
      cli.PollJob(activate_job, cl=client, feedback_fn=logging.debug)
    except Exception: # pylint: disable-msg=W0703
      logging.exception("Error while activating disks")
455 a8083063 Iustin Pop
456 a8083063 Iustin Pop
457 001b3825 Michael Hanselmann
def OpenStateFile(path):
458 001b3825 Michael Hanselmann
  """Opens the state file and acquires a lock on it.
459 001b3825 Michael Hanselmann
460 001b3825 Michael Hanselmann
  @type path: string
461 001b3825 Michael Hanselmann
  @param path: Path to state file
462 001b3825 Michael Hanselmann
463 001b3825 Michael Hanselmann
  """
464 001b3825 Michael Hanselmann
  # The two-step dance below is necessary to allow both opening existing
465 001b3825 Michael Hanselmann
  # file read/write and creating if not existing. Vanilla open will truncate
466 001b3825 Michael Hanselmann
  # an existing file -or- allow creating if not existing.
467 001b3825 Michael Hanselmann
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)
468 001b3825 Michael Hanselmann
469 001b3825 Michael Hanselmann
  # Try to acquire lock on state file. If this fails, another watcher instance
470 001b3825 Michael Hanselmann
  # might already be running or another program is temporarily blocking the
471 001b3825 Michael Hanselmann
  # watcher from running.
472 001b3825 Michael Hanselmann
  try:
473 001b3825 Michael Hanselmann
    utils.LockFile(statefile_fd)
474 001b3825 Michael Hanselmann
  except errors.LockError, err:
475 001b3825 Michael Hanselmann
    logging.error("Can't acquire lock on state file %s: %s", path, err)
476 001b3825 Michael Hanselmann
    return None
477 001b3825 Michael Hanselmann
478 001b3825 Michael Hanselmann
  return os.fdopen(statefile_fd, "w+")
479 001b3825 Michael Hanselmann
480 001b3825 Michael Hanselmann
481 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The job age option is converted with cli.ParseTimespec before the options
  are returned.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option(cli.DEBUG_OPT)
  job_age_help = ("Autoarchive jobs older than this age"
                  " (default 6 hours)")
  parser.add_option("-A", "--job-age", dest="job_age",
                    help=job_age_help, default=6*3600)

  (options, args) = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)
  return (options, args)
499 a8083063 Iustin Pop
500 a8083063 Iustin Pop
501 a8083063 Iustin Pop
def main():
502 a8083063 Iustin Pop
  """Main function.
503 a8083063 Iustin Pop
504 a8083063 Iustin Pop
  """
505 7260cfbe Iustin Pop
  global client # pylint: disable-msg=W0603
506 e125c67c Michael Hanselmann
507 f93427cd Iustin Pop
  options, args = ParseOptions()
508 f93427cd Iustin Pop
509 f93427cd Iustin Pop
  if args: # watcher doesn't take any arguments
510 f93427cd Iustin Pop
    print >> sys.stderr, ("Usage: %s [-f] " % sys.argv[0])
511 f93427cd Iustin Pop
    sys.exit(constants.EXIT_FAILURE)
512 a8083063 Iustin Pop
513 82d9caef Iustin Pop
  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
514 82d9caef Iustin Pop
                     stderr_logging=options.debug)
515 a8083063 Iustin Pop
516 3753b2cb Michael Hanselmann
  if ShouldPause():
517 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
518 3753b2cb Michael Hanselmann
    sys.exit(constants.EXIT_SUCCESS)
519 3753b2cb Michael Hanselmann
520 001b3825 Michael Hanselmann
  statefile = OpenStateFile(constants.WATCHER_STATEFILE)
521 001b3825 Michael Hanselmann
  if not statefile:
522 001b3825 Michael Hanselmann
    sys.exit(constants.EXIT_FAILURE)
523 001b3825 Michael Hanselmann
524 24edc6d4 Iustin Pop
  update_file = False
525 a8083063 Iustin Pop
  try:
526 f1115454 Guido Trotter
    StartNodeDaemons()
527 9e289e36 Guido Trotter
    RunWatcherHooks()
528 c4f0219c Iustin Pop
529 001b3825 Michael Hanselmann
    notepad = WatcherState(statefile)
530 781b2b2b Michael Hanselmann
    try:
531 2c404217 Iustin Pop
      try:
532 2c404217 Iustin Pop
        client = cli.GetClient()
533 2c404217 Iustin Pop
      except errors.OpPrereqError:
534 2c404217 Iustin Pop
        # this is, from cli.GetClient, a not-master case
535 7dfb83c2 Iustin Pop
        logging.debug("Not on master, exiting")
536 24edc6d4 Iustin Pop
        update_file = True
537 2c404217 Iustin Pop
        sys.exit(constants.EXIT_SUCCESS)
538 7dfb83c2 Iustin Pop
      except luxi.NoMasterError, err:
539 7dfb83c2 Iustin Pop
        logging.warning("Master seems to be down (%s), trying to restart",
540 7dfb83c2 Iustin Pop
                        str(err))
541 2826b361 Guido Trotter
        if not utils.EnsureDaemon(constants.MASTERD):
542 7dfb83c2 Iustin Pop
          logging.critical("Can't start the master, exiting")
543 7dfb83c2 Iustin Pop
          sys.exit(constants.EXIT_FAILURE)
544 7dfb83c2 Iustin Pop
        # else retry the connection
545 7dfb83c2 Iustin Pop
        client = cli.GetClient()
546 cc962d58 Iustin Pop
547 83052f9e Guido Trotter
      # we are on master now
548 2826b361 Guido Trotter
      utils.EnsureDaemon(constants.RAPI)
549 c4f0219c Iustin Pop
550 cc962d58 Iustin Pop
      try:
551 cc962d58 Iustin Pop
        watcher = Watcher(options, notepad)
552 cc962d58 Iustin Pop
      except errors.ConfigurationError:
553 cc962d58 Iustin Pop
        # Just exit if there's no configuration
554 24edc6d4 Iustin Pop
        update_file = True
555 cc962d58 Iustin Pop
        sys.exit(constants.EXIT_SUCCESS)
556 e125c67c Michael Hanselmann
557 cc962d58 Iustin Pop
      watcher.Run()
558 24edc6d4 Iustin Pop
      update_file = True
559 24edc6d4 Iustin Pop
560 cc962d58 Iustin Pop
    finally:
561 7dfb83c2 Iustin Pop
      if update_file:
562 7dfb83c2 Iustin Pop
        notepad.Save()
563 7dfb83c2 Iustin Pop
      else:
564 7dfb83c2 Iustin Pop
        logging.debug("Not updating status file due to failure")
565 1b052f42 Michael Hanselmann
  except SystemExit:
566 1b052f42 Michael Hanselmann
    raise
567 38242904 Iustin Pop
  except NotMasterError:
568 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
569 38242904 Iustin Pop
    sys.exit(constants.EXIT_NOTMASTER)
570 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
571 438b45d4 Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
572 89e1fc26 Iustin Pop
    sys.exit(constants.EXIT_NODESETUP_ERROR)
573 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
574 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
575 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
576 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
577 438b45d4 Michael Hanselmann
  except Exception, err:
578 001b3825 Michael Hanselmann
    logging.exception(str(err))
579 438b45d4 Michael Hanselmann
    sys.exit(constants.EXIT_FAILURE)
580 a8083063 Iustin Pop
581 5a3103e9 Michael Hanselmann
582 a8083063 Iustin Pop
# Run the watcher only when executed as a script
if __name__ == "__main__":
  main()