Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ debac808

History | View | Annotate | Download (15.7 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 a8083063 Iustin Pop
import time
33 438b45d4 Michael Hanselmann
import logging
34 3753b2cb Michael Hanselmann
import errno
35 a8083063 Iustin Pop
from optparse import OptionParser
36 a8083063 Iustin Pop
37 a8083063 Iustin Pop
from ganeti import utils
38 a8083063 Iustin Pop
from ganeti import constants
39 67fe61c4 Michael Hanselmann
from ganeti import serializer
40 89e1fc26 Iustin Pop
from ganeti import errors
41 e125c67c Michael Hanselmann
from ganeti import opcodes
42 e125c67c Michael Hanselmann
from ganeti import cli
43 7dfb83c2 Iustin Pop
from ganeti import luxi
44 a8083063 Iustin Pop
45 a8083063 Iustin Pop
46 5a3103e9 Michael Hanselmann
# Maximum number of restart attempts made for a single instance
MAXTRIES = 5
# Instance states for which the watcher attempts a restart
BAD_STATES = ["ERROR_down"]
# Instance states for which the watcher only drops its restart bookkeeping
HELPLESS_STATES = ["ERROR_nodedown", "ERROR_nodeoffline"]
NOTICE = "NOTICE"
ERROR = "ERROR"
# Keys used inside the watcher state data
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object
client = None
58 e125c67c Michael Hanselmann
59 e125c67c Michael Hanselmann
60 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Raised when the watcher runs on a host that is not the master node."""
62 a8083063 Iustin Pop
63 a8083063 Iustin Pop
64 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Prefix every line of a text with a given string.

  The result always ends with a newline; an empty input yields just the
  prefix followed by a newline.

  @param s: the string to indent
  @param prefix: the string to prepend each line
  @return: the indented text

  """
  line_separator = '\n' + prefix
  body = line_separator.join(s.splitlines())
  return prefix + body + "\n"
72 a8083063 Iustin Pop
73 a8083063 Iustin Pop
74 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher has been asked to pause.

  @rtype: boolean
  @return: the truth value of what C{utils.ReadWatcherPauseFile} returns
      for C{constants.WATCHER_PAUSEFILE}

  """
  pause_value = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  return bool(pause_value)
79 3753b2cb Michael Hanselmann
80 3753b2cb Michael Hanselmann
81 7dfb83c2 Iustin Pop
def StartMaster():
  """Try to start the master daemon.

  @rtype: boolean
  @return: True if running C{ganeti-masterd} did not fail, False otherwise
      (the failure output is logged)

  """
  result = utils.RunCmd(['ganeti-masterd'])
  if result.failed:
    logging.error("Can't start the master daemon: output '%s'", result.output)
    return False
  return True
89 7dfb83c2 Iustin Pop
90 7dfb83c2 Iustin Pop
91 c4f0219c Iustin Pop
def EnsureDaemon(daemon):
  """Check for and start daemon if not alive.

  The daemon is considered dead when its pid file yields pid 0 or when the
  recorded pid is not alive; in that case the daemon is run and a failure
  to start it is logged (no exception is raised for it here).

  @param daemon: name of the daemon; used both to find its pid file and as
      the command to run

  """
  pidfile = utils.DaemonPidFileName(daemon)
  pid = utils.ReadPidFile(pidfile)
  if pid != 0 and utils.IsProcessAlive(pid):
    return

  # no file or dead pid
  logging.debug("Daemon '%s' not alive, trying to restart", daemon)
  result = utils.RunCmd([daemon])
  # Check the "failed" attribute, as StartMaster does; the truth value of
  # the result object itself is presumably always true, which would hide
  # failures -- NOTE(review): confirm against utils.RunResult
  if result.failed:
    logging.error("Can't start daemon '%s', failure %s, output: %s",
                  daemon, result.fail_reason, result.output)
103 c4f0219c Iustin Pop
104 c4f0219c Iustin Pop
105 5a3103e9 Michael Hanselmann
class WatcherState(object):
106 a8083063 Iustin Pop
  """Interface to a state file recording restart attempts.
107 a8083063 Iustin Pop
108 a8083063 Iustin Pop
  """
109 a8083063 Iustin Pop
  def __init__(self):
110 5a3103e9 Michael Hanselmann
    """Open, lock, read and parse the file.
111 5a3103e9 Michael Hanselmann
112 7bca53e4 Michael Hanselmann
    Raises exception on lock contention.
113 5a3103e9 Michael Hanselmann
114 5a3103e9 Michael Hanselmann
    """
115 a8083063 Iustin Pop
    # The two-step dance below is necessary to allow both opening existing
116 a8083063 Iustin Pop
    # file read/write and creating if not existing.  Vanilla open will truncate
117 a8083063 Iustin Pop
    # an existing file -or- allow creating if not existing.
118 fc428e32 Michael Hanselmann
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
119 fc428e32 Michael Hanselmann
    self.statefile = os.fdopen(fd, 'w+')
120 a8083063 Iustin Pop
121 eb0f0ce0 Michael Hanselmann
    utils.LockFile(self.statefile.fileno())
122 a8083063 Iustin Pop
123 5a3103e9 Michael Hanselmann
    try:
124 2c404217 Iustin Pop
      state_data = self.statefile.read()
125 2c404217 Iustin Pop
      if not state_data:
126 2c404217 Iustin Pop
        self._data = {}
127 2c404217 Iustin Pop
      else:
128 2c404217 Iustin Pop
        self._data = serializer.Load(state_data)
129 5a3103e9 Michael Hanselmann
    except Exception, msg:
130 5a3103e9 Michael Hanselmann
      # Ignore errors while loading the file and treat it as empty
131 b76f660d Michael Hanselmann
      self._data = {}
132 2c404217 Iustin Pop
      logging.warning(("Invalid state file. Using defaults."
133 438b45d4 Michael Hanselmann
                       " Error message: %s"), msg)
134 5a3103e9 Michael Hanselmann
135 b76f660d Michael Hanselmann
    if "instance" not in self._data:
136 b76f660d Michael Hanselmann
      self._data["instance"] = {}
137 b76f660d Michael Hanselmann
    if "node" not in self._data:
138 b76f660d Michael Hanselmann
      self._data["node"] = {}
139 5a3103e9 Michael Hanselmann
140 26517d45 Iustin Pop
    self._orig_data = serializer.Dump(self._data)
141 2fb96d39 Michael Hanselmann
142 fc428e32 Michael Hanselmann
  def Save(self):
143 fc428e32 Michael Hanselmann
    """Save state to file, then unlock and close it.
144 5a3103e9 Michael Hanselmann
145 5a3103e9 Michael Hanselmann
    """
146 fc428e32 Michael Hanselmann
    assert self.statefile
147 fc428e32 Michael Hanselmann
148 26517d45 Iustin Pop
    serialized_form = serializer.Dump(self._data)
149 26517d45 Iustin Pop
    if self._orig_data == serialized_form:
150 2fb96d39 Michael Hanselmann
      logging.debug("Data didn't change, just touching status file")
151 2fb96d39 Michael Hanselmann
      os.utime(constants.WATCHER_STATEFILE, None)
152 2fb96d39 Michael Hanselmann
      return
153 2fb96d39 Michael Hanselmann
154 fc428e32 Michael Hanselmann
    # We need to make sure the file is locked before renaming it, otherwise
155 fc428e32 Michael Hanselmann
    # starting ganeti-watcher again at the same time will create a conflict.
156 fc428e32 Michael Hanselmann
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
157 26517d45 Iustin Pop
                         data=serialized_form,
158 eb0f0ce0 Michael Hanselmann
                         prewrite=utils.LockFile, close=False)
159 fc428e32 Michael Hanselmann
    self.statefile = os.fdopen(fd, 'w+')
160 5a3103e9 Michael Hanselmann
161 fc428e32 Michael Hanselmann
  def Close(self):
162 5a3103e9 Michael Hanselmann
    """Unlock configuration file and close it.
163 5a3103e9 Michael Hanselmann
164 5a3103e9 Michael Hanselmann
    """
165 5a3103e9 Michael Hanselmann
    assert self.statefile
166 5a3103e9 Michael Hanselmann
167 fc428e32 Michael Hanselmann
    # Files are automatically unlocked when closing them
168 5a3103e9 Michael Hanselmann
    self.statefile.close()
169 5a3103e9 Michael Hanselmann
    self.statefile = None
170 5a3103e9 Michael Hanselmann
171 5a3103e9 Michael Hanselmann
  def GetNodeBootID(self, name):
172 5a3103e9 Michael Hanselmann
    """Returns the last boot ID of a node or None.
173 a8083063 Iustin Pop
174 5a3103e9 Michael Hanselmann
    """
175 b76f660d Michael Hanselmann
    ndata = self._data["node"]
176 5a3103e9 Michael Hanselmann
177 7b195d9b Michael Hanselmann
    if name in ndata and KEY_BOOT_ID in ndata[name]:
178 7b195d9b Michael Hanselmann
      return ndata[name][KEY_BOOT_ID]
179 5a3103e9 Michael Hanselmann
    return None
180 5a3103e9 Michael Hanselmann
181 5a3103e9 Michael Hanselmann
  def SetNodeBootID(self, name, bootid):
182 5a3103e9 Michael Hanselmann
    """Sets the boot ID of a node.
183 5a3103e9 Michael Hanselmann
184 5a3103e9 Michael Hanselmann
    """
185 5a3103e9 Michael Hanselmann
    assert bootid
186 a8083063 Iustin Pop
187 b76f660d Michael Hanselmann
    ndata = self._data["node"]
188 a8083063 Iustin Pop
189 5a3103e9 Michael Hanselmann
    if name not in ndata:
190 5a3103e9 Michael Hanselmann
      ndata[name] = {}
191 5a3103e9 Michael Hanselmann
192 7b195d9b Michael Hanselmann
    ndata[name][KEY_BOOT_ID] = bootid
193 5a3103e9 Michael Hanselmann
194 5a3103e9 Michael Hanselmann
  def NumberOfRestartAttempts(self, instance):
195 a8083063 Iustin Pop
    """Returns number of previous restart attempts.
196 a8083063 Iustin Pop
197 c41eea6e Iustin Pop
    @type instance: L{Instance}
198 c41eea6e Iustin Pop
    @param instance: the instance to look up
199 38242904 Iustin Pop
200 a8083063 Iustin Pop
    """
201 b76f660d Michael Hanselmann
    idata = self._data["instance"]
202 a8083063 Iustin Pop
203 5a3103e9 Michael Hanselmann
    if instance.name in idata:
204 7b195d9b Michael Hanselmann
      return idata[instance.name][KEY_RESTART_COUNT]
205 a8083063 Iustin Pop
206 a8083063 Iustin Pop
    return 0
207 a8083063 Iustin Pop
208 5a3103e9 Michael Hanselmann
  def RecordRestartAttempt(self, instance):
209 a8083063 Iustin Pop
    """Record a restart attempt.
210 a8083063 Iustin Pop
211 c41eea6e Iustin Pop
    @type instance: L{Instance}
212 c41eea6e Iustin Pop
    @param instance: the instance being restarted
213 38242904 Iustin Pop
214 a8083063 Iustin Pop
    """
215 b76f660d Michael Hanselmann
    idata = self._data["instance"]
216 a8083063 Iustin Pop
217 5a3103e9 Michael Hanselmann
    if instance.name not in idata:
218 5a3103e9 Michael Hanselmann
      inst = idata[instance.name] = {}
219 5a3103e9 Michael Hanselmann
    else:
220 5a3103e9 Michael Hanselmann
      inst = idata[instance.name]
221 a8083063 Iustin Pop
222 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_WHEN] = time.time()
223 7b195d9b Michael Hanselmann
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
224 a8083063 Iustin Pop
225 5a3103e9 Michael Hanselmann
  def RemoveInstance(self, instance):
226 c41eea6e Iustin Pop
    """Update state to reflect that a machine is running.
227 a8083063 Iustin Pop
228 c41eea6e Iustin Pop
    This method removes the record for a named instance (as we only
229 c41eea6e Iustin Pop
    track down instances).
230 a8083063 Iustin Pop
231 c41eea6e Iustin Pop
    @type instance: L{Instance}
232 c41eea6e Iustin Pop
    @param instance: the instance to remove from books
233 38242904 Iustin Pop
234 a8083063 Iustin Pop
    """
235 b76f660d Michael Hanselmann
    idata = self._data["instance"]
236 a8083063 Iustin Pop
237 5a3103e9 Michael Hanselmann
    if instance.name in idata:
238 5a3103e9 Michael Hanselmann
      del idata[instance.name]
239 a8083063 Iustin Pop
240 a8083063 Iustin Pop
241 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Holds the instance name, its status and its autostart flag, and submits
  the opcodes acting on the instance through the global client.

  """
  def __init__(self, name, state, autostart):
    """Store the instance attributes.

    @param name: the instance name
    @param state: the instance status
    @param autostart: the autostart flag of the instance

    """
    self.name = name
    self.state = state
    self.autostart = autostart

  def Restart(self):
    """Encapsulates the start of an instance.

    """
    startup_op = opcodes.OpStartupInstance(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    """
    activate_op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
263 a8083063 Iustin Pop
264 a8083063 Iustin Pop
265 6dfcc47b Iustin Pop
def GetClusterData():
  """Query the cluster for instance and node data.

  One job holding an instance query and a node query is submitted and
  polled. The name and status of every instance are written to
  C{constants.INSTANCE_UPFILE}, and the job is archived before returning.

  @rtype: tuple
  @return: (instances, nodes, smap) where instances maps instance names to
      L{Instance} objects, nodes maps node names to (bootid, offline)
      tuples and smap maps secondary node names to lists of instance names

  """
  instance_fields = ["name", "status", "admin_state", "snodes"]
  instance_query = opcodes.OpQueryInstances(output_fields=instance_fields,
                                            names=[], use_locking=True)
  node_fields = ["name", "bootid", "offline"]
  node_query = opcodes.OpQueryNodes(output_fields=node_fields, names=[],
                                    use_locking=True)

  job_id = client.SubmitJob([instance_query, node_query])

  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  instance_rows = all_results[0]

  # write the upfile
  up_lines = ["%s %s\n" % (row[0], row[1]) for row in instance_rows]
  utils.WriteFile(file_name=constants.INSTANCE_UPFILE, data="".join(up_lines))

  smap = {}
  instances = {}
  for (name, status, autostart, snodes) in instance_rows:
    # update the secondary node map
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart)

  nodes = {}
  for node_name, bootid, offline in all_results[1]:
    nodes[node_name] = (bootid, offline)

  client.ArchiveJob(job_id)

  return instances, nodes, smap
308 a8083063 Iustin Pop
309 a8083063 Iustin Pop
310 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self, opts, notepad):
    """Check we run on the master, archive old jobs and load cluster data.

    @param opts: the parsed command line options; C{opts.job_age} is handed
        to L{ArchiveJobs}
    @param notepad: the state object used to record restart attempts and
        node boot IDs
    @raise NotMasterError: if the configured master node is not this host

    """
    self.notepad = notepad
    master = client.QueryConfigValues(["master_node"])[0]
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    # first archive old jobs
    self.ArchiveJobs(opts.job_age)
    # and only then submit new ones
    self.instances, self.bootids, self.smap = GetClusterData()
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Watcher run sequence.

    Checks the instances first, then the disks of instances on rebooted
    nodes, and finally runs the cluster-wide disk verification.

    """
    notepad = self.notepad
    self.CheckInstances(notepad)
    self.CheckDisks(notepad)
    self.VerifyDisks()

  @staticmethod
  def ArchiveJobs(age):
    """Archive old jobs.

    @param age: passed to the client's C{AutoArchiveJobs} call

    """
    arch_count, left_count = client.AutoArchiveJobs(age)
    # Let the logging module do the formatting, and only when needed
    logging.debug("Archived %s jobs, left %s", arch_count, left_count)

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the one recorded in the
    notepad, the disks of the autostart instances having that node as
    secondary are activated (unless the instance was already started during
    this run). The new boot IDs are then recorded in the notepad.

    @param notepad: the state object holding the recorded boot IDs

    """
    check_nodes = []
    for name, (new_id, offline) in self.bootids.iteritems():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
        continue
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if not check_nodes:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for node in check_nodes:
      for instance_name in self.smap.get(node, []):
        instance = self.instances[instance_name]
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception:
          # A failure for one instance must not stop the others
          logging.exception("Error while activating disks for instance %s",
                            instance.name)

    # Keep changed boot IDs
    for name in check_nodes:
      notepad.SetNodeBootID(name, self.bootids[name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Instances in one of BAD_STATES are restarted while fewer than MAXTRIES
    attempts are recorded; every attempt is recorded whether or not it
    raised an error. For all other instances any recorded attempts are
    dropped from the notepad.

    @param notepad: the state object recording the restart attempts

    """
    for instance in self.instances.values():
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          # Exactly MAXTRIES attempts so far: bump the counter once more so
          # that later runs take the quiet branch above, and report once
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                        instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception:
          logging.exception("Error while restarting instance %s",
                            instance.name)

        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    Submits a verify-disks job and, if its result names instances in its
    third element, submits one further job activating the disks of all of
    them and waits for it.

    """
    op = opcodes.OpVerifyDisks()
    job_id = client.SubmitJob([op])
    result = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(job_id)
    if not isinstance(result, (tuple, list)):
      logging.error("Can't get a valid result from verify-disks")
      return
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  ", ".join(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = [opcodes.OpActivateInstanceDisks(instance_name=name)
           for name in offline_disk_instances]
    job_id = cli.SendJob(job, cl=client)

    cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
452 a8083063 Iustin Pop
453 a8083063 Iustin Pop
454 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The job age option is converted with C{cli.ParseTimespec} before the
  options are returned.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_string)

  parser.add_option(cli.DEBUG_OPT)
  parser.add_option("-A", "--job-age", dest="job_age",
                    help="Autoarchive jobs older than this age (default"
                    " 6 hours)", default=6*3600)
  (options, args) = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)
  return (options, args)
472 a8083063 Iustin Pop
473 a8083063 Iustin Pop
474 a8083063 Iustin Pop
def main():
475 a8083063 Iustin Pop
  """Main function.
476 a8083063 Iustin Pop
477 a8083063 Iustin Pop
  """
478 e125c67c Michael Hanselmann
  global client
479 e125c67c Michael Hanselmann
480 a8083063 Iustin Pop
  options, args = ParseOptions()
481 a8083063 Iustin Pop
482 82d9caef Iustin Pop
  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
483 82d9caef Iustin Pop
                     stderr_logging=options.debug)
484 a8083063 Iustin Pop
485 3753b2cb Michael Hanselmann
  if ShouldPause():
486 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
487 3753b2cb Michael Hanselmann
    sys.exit(constants.EXIT_SUCCESS)
488 3753b2cb Michael Hanselmann
489 24edc6d4 Iustin Pop
  update_file = False
490 a8083063 Iustin Pop
  try:
491 83052f9e Guido Trotter
    # on master or not, try to start the node dameon
492 83052f9e Guido Trotter
    EnsureDaemon(constants.NODED)
493 c4f0219c Iustin Pop
494 cc962d58 Iustin Pop
    notepad = WatcherState()
495 781b2b2b Michael Hanselmann
    try:
496 2c404217 Iustin Pop
      try:
497 2c404217 Iustin Pop
        client = cli.GetClient()
498 2c404217 Iustin Pop
      except errors.OpPrereqError:
499 2c404217 Iustin Pop
        # this is, from cli.GetClient, a not-master case
500 7dfb83c2 Iustin Pop
        logging.debug("Not on master, exiting")
501 24edc6d4 Iustin Pop
        update_file = True
502 2c404217 Iustin Pop
        sys.exit(constants.EXIT_SUCCESS)
503 7dfb83c2 Iustin Pop
      except luxi.NoMasterError, err:
504 7dfb83c2 Iustin Pop
        logging.warning("Master seems to be down (%s), trying to restart",
505 7dfb83c2 Iustin Pop
                        str(err))
506 7dfb83c2 Iustin Pop
        if not StartMaster():
507 7dfb83c2 Iustin Pop
          logging.critical("Can't start the master, exiting")
508 7dfb83c2 Iustin Pop
          sys.exit(constants.EXIT_FAILURE)
509 7dfb83c2 Iustin Pop
        # else retry the connection
510 7dfb83c2 Iustin Pop
        client = cli.GetClient()
511 cc962d58 Iustin Pop
512 83052f9e Guido Trotter
      # we are on master now
513 83052f9e Guido Trotter
      EnsureDaemon(constants.RAPI)
514 c4f0219c Iustin Pop
515 cc962d58 Iustin Pop
      try:
516 cc962d58 Iustin Pop
        watcher = Watcher(options, notepad)
517 cc962d58 Iustin Pop
      except errors.ConfigurationError:
518 cc962d58 Iustin Pop
        # Just exit if there's no configuration
519 24edc6d4 Iustin Pop
        update_file = True
520 cc962d58 Iustin Pop
        sys.exit(constants.EXIT_SUCCESS)
521 e125c67c Michael Hanselmann
522 cc962d58 Iustin Pop
      watcher.Run()
523 24edc6d4 Iustin Pop
      update_file = True
524 24edc6d4 Iustin Pop
525 cc962d58 Iustin Pop
    finally:
526 7dfb83c2 Iustin Pop
      if update_file:
527 7dfb83c2 Iustin Pop
        notepad.Save()
528 7dfb83c2 Iustin Pop
      else:
529 7dfb83c2 Iustin Pop
        logging.debug("Not updating status file due to failure")
530 1b052f42 Michael Hanselmann
  except SystemExit:
531 1b052f42 Michael Hanselmann
    raise
532 38242904 Iustin Pop
  except NotMasterError:
533 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
534 38242904 Iustin Pop
    sys.exit(constants.EXIT_NOTMASTER)
535 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
536 438b45d4 Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
537 89e1fc26 Iustin Pop
    sys.exit(constants.EXIT_NODESETUP_ERROR)
538 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
539 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
540 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
541 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
542 438b45d4 Michael Hanselmann
  except Exception, err:
543 438b45d4 Michael Hanselmann
    logging.error(str(err), exc_info=True)
544 438b45d4 Michael Hanselmann
    sys.exit(constants.EXIT_FAILURE)
545 a8083063 Iustin Pop
546 5a3103e9 Michael Hanselmann
547 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported
if __name__ == "__main__":
  main()