Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 2503680f

History | View | Annotate | Download (15.7 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erronously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 a8083063 Iustin Pop
import time
33 438b45d4 Michael Hanselmann
import logging
34 a8083063 Iustin Pop
from optparse import OptionParser
35 a8083063 Iustin Pop
36 a8083063 Iustin Pop
from ganeti import utils
37 a8083063 Iustin Pop
from ganeti import constants
38 67fe61c4 Michael Hanselmann
from ganeti import serializer
39 89e1fc26 Iustin Pop
from ganeti import errors
40 e125c67c Michael Hanselmann
from ganeti import opcodes
41 e125c67c Michael Hanselmann
from ganeti import cli
42 7dfb83c2 Iustin Pop
from ganeti import luxi
43 a8083063 Iustin Pop
44 a8083063 Iustin Pop
45 5a3103e9 Michael Hanselmann
# Number of restart attempts made for a down instance before giving up
MAXTRIES = 5
# Instance states for which a restart is attempted
BAD_STATES = ['ERROR_down']
# Instance states for which no restart is attempted; any recorded restart
# attempts for such instances are dropped
HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline']
NOTICE = 'NOTICE'
ERROR = 'ERROR'
# Keys used inside the per-instance and per-node records of the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object, assigned in main() once a connection is available
client = None
57 e125c67c Michael Hanselmann
58 e125c67c Michael Hanselmann
59 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Exception raised when this host is not the master."""
61 a8083063 Iustin Pop
62 a8083063 Iustin Pop
63 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Indent a piece of text with a given prefix before each line.

  @param s: the string to indent
  @param prefix: the string to prepend each line
  @return: the indented text, always terminated by exactly one newline;
      for an empty string this is just the prefix followed by a newline

  """
  return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
71 a8083063 Iustin Pop
72 a8083063 Iustin Pop
73 7dfb83c2 Iustin Pop
def StartMaster():
  """Try to start the master daemon.

  @return: True if the 'ganeti-masterd' command did not fail, False
      otherwise (the failure output is logged)

  """
  result = utils.RunCmd(['ganeti-masterd'])
  if result.failed:
    logging.error("Can't start the master daemon: output '%s'", result.output)
  return not result.failed
81 7dfb83c2 Iustin Pop
82 7dfb83c2 Iustin Pop
83 c4f0219c Iustin Pop
def EnsureDaemon(daemon):
  """Check for and start daemon if not alive.

  The daemon is considered dead if its pid file yields no pid (0) or if
  the recorded pid does not belong to a live process.

  @param daemon: the daemon name; it is used both to look up the pid file
      and as the command to execute

  """
  pidfile = utils.DaemonPidFileName(daemon)
  pid = utils.ReadPidFile(pidfile)
  if pid == 0 or not utils.IsProcessAlive(pid): # no file or dead pid
    logging.debug("Daemon '%s' not alive, trying to restart", daemon)
    result = utils.RunCmd([daemon])
    # Test the failure flag (as StartMaster does) rather than the
    # truthiness of the result object itself
    if result.failed:
      logging.error("Can't start daemon '%s', failure %s, output: %s",
                    daemon, result.fail_reason, result.output)
95 c4f0219c Iustin Pop
96 c4f0219c Iustin Pop
97 5a3103e9 Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a dictionary with two sub-dictionaries: "instance", keyed
  by instance name and holding the restart count and time, and "node",
  keyed by node name and holding the last seen boot ID.

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    Raises exception on lock contention.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')

    try:
      utils.LockFile(self.statefile.fileno())
    except Exception:
      # Don't leave the file open if the lock can't be acquired
      self.statefile.close()
      self.statefile = None
      raise

    try:
      state_data = self.statefile.read()
      if not state_data:
        self._data = {}
      else:
        self._data = serializer.Load(state_data)
        if not isinstance(self._data, dict):
          # Anything but a dictionary can't hold our sub-dictionaries
          raise ValueError("State data is not a dictionary")
    except Exception as msg:
      # Ignore errors while loading the file and treat it as empty
      self._data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    if "instance" not in self._data:
      self._data["instance"] = {}
    if "node" not in self._data:
      self._data["node"] = {}

    # Serialized snapshot, used by Save() to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Save state to file.

    If the data is unchanged since loading, only the modification time
    of the state file is updated. Otherwise the file is rewritten and
    the new, locked file is kept open in place of the old one.

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, 'w+')

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @param name: the node name
    @return: the recorded boot ID, or None if the node is unknown or has
        no boot ID recorded

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @param name: the node name
    @param bootid: the new boot ID; must not be empty

    """
    assert bootid

    ndata = self._data["node"]

    if name not in ndata:
      ndata[name] = {}

    ndata[name][KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    @type instance: L{Instance}
    @param instance: the instance to look up
    @return: the recorded restart count, or 0 if there is no record

    """
    idata = self._data["instance"]

    if instance.name in idata:
      # A record without a count is treated as "no attempts yet"
      return idata[instance.name].get(KEY_RESTART_COUNT, 0)

    return 0

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the restart count, creating
    the record for the instance if needed.

    @type instance: L{Instance}
    @param instance: the instance being restarted

    """
    inst = self._data["instance"].setdefault(instance.name, {})

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances).

    @type instance: L{Instance}
    @param instance: the instance to remove from books

    """
    idata = self._data["instance"]

    if instance.name in idata:
      del idata[instance.name]
231 a8083063 Iustin Pop
232 a8083063 Iustin Pop
233 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  """
  def __init__(self, name, state, autostart):
    """Store the instance attributes.

    @param name: the instance name
    @param state: the instance status, compared by the watcher against
        BAD_STATES and HELPLESS_STATES
    @param autostart: whether the watcher may activate the instance's
        disks after a secondary node reboot

    """
    self.name = name
    self.state = state
    self.autostart = autostart

  def Restart(self):
    """Encapsulates the start of an instance.

    Submits a non-forced startup opcode through the global client.

    """
    op = opcodes.OpStartupInstance(instance_name=self.name, force=False)
    cli.SubmitOpCode(op, cl=client)

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    """
    op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(op, cl=client)
255 a8083063 Iustin Pop
256 a8083063 Iustin Pop
257 6dfcc47b Iustin Pop
def GetClusterData():
  """Get instance and node data for this cluster.

  Both queries are submitted as a single job through the global client.
  As a side effect the instance status file (constants.INSTANCE_UPFILE)
  is rewritten with one "name status" line per instance, and the job is
  archived once its results have been processed.

  @return: a tuple of (instances, nodes, smap) where instances maps
      instance names to L{Instance} objects, nodes maps node names to
      (bootid, offline) tuples and smap maps secondary node names to the
      list of names of the instances using them

  """
  op1_fields = ["name", "status", "admin_state", "snodes"]
  op1 = opcodes.OpQueryInstances(output_fields=op1_fields, names=[],
                                 use_locking=True)
  op2_fields = ["name", "bootid", "offline"]
  op2 = opcodes.OpQueryNodes(output_fields=op2_fields, names=[],
                             use_locking=True)

  job_id = client.SubmitJob([op1, op2])

  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  result = all_results[0]
  smap = {}

  instances = {}

  # write the upfile
  up_data = "".join(["%s %s\n" % (fields[0], fields[1]) for fields in result])
  utils.WriteFile(file_name=constants.INSTANCE_UPFILE, data=up_data)

  # the "admin_state" field is what Instance calls autostart
  for (name, status, autostart, snodes) in result:
    # update the secondary node map
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart)

  nodes = dict([(name, (bootid, offline))
                for name, bootid, offline in all_results[1]])

  client.ArchiveJob(job_id)

  return instances, nodes, smap
300 a8083063 Iustin Pop
301 a8083063 Iustin Pop
302 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self, opts, notepad):
    """Verify we run on the master and load the cluster data.

    Old jobs are archived before the cluster data is queried.

    @param opts: the parsed command line options (job_age is used here)
    @param notepad: the state object recording restart attempts and
        node boot IDs
    @raise NotMasterError: if the configured master node is not this host

    """
    self.notepad = notepad
    master = client.QueryConfigValues(["master_node"])[0]
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    # first archive old jobs
    self.ArchiveJobs(opts.job_age)
    # and only then submit new ones
    self.instances, self.bootids, self.smap = GetClusterData()
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Watcher run sequence.

    Instances are checked first, so that CheckDisks can skip the ones a
    restart was already attempted for.

    """
    notepad = self.notepad
    self.CheckInstances(notepad)
    self.CheckDisks(notepad)
    self.VerifyDisks()

  @staticmethod
  def ArchiveJobs(age):
    """Archive old jobs.

    @param age: passed to the client's AutoArchiveJobs call

    """
    arch_count, left_count = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", arch_count, left_count)

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the recorded one, the disks
    of the instances having it as secondary node are activated, and the
    new boot ID is recorded afterwards.

    @param notepad: the state object holding the known boot IDs

    """
    check_nodes = []
    for name, (new_id, offline) in self.bootids.items():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
        continue
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for node in check_nodes:
        if node not in self.smap:
          continue
        for instance_name in self.smap[node]:
          instance = self.instances[instance_name]
          if not instance.autostart:
            logging.info(("Skipping disk activation for non-autostart"
                          " instance %s"), instance.name)
            continue
          if instance.name in self.started_instances:
            # we already tried to start the instance, which should have
            # activated its drives (if they can be at all)
            continue
          try:
            logging.info("Activating disks for instance %s", instance.name)
            instance.ActivateDisks()
          except Exception:
            logging.exception("Error while activating disks for instance %s",
                              instance.name)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    @param notepad: the state object recording the restart attempts

    """
    for instance in self.instances.values():
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          # Recording one more attempt pushes the count above MAXTRIES,
          # so the error below is logged only once
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                        instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception:
          logging.exception("Error while restarting instance %s",
                            instance.name)

        # The attempt is counted whether or not the restart succeeded
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    The third element of the verify-disks result is taken as the list of
    instances with offline disks; their disks are activated in one job.

    """
    op = opcodes.OpVerifyDisks()
    job_id = client.SubmitJob([op])
    result = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(job_id)
    if not isinstance(result, (tuple, list)):
      logging.error("Can't get a valid result from verify-disks")
      return
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  ", ".join(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = [opcodes.OpActivateInstanceDisks(instance_name=name)
           for name in offline_disk_instances]
    job_id = cli.SendJob(job, cl=client)

    cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
444 a8083063 Iustin Pop
445 a8083063 Iustin Pop
446 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The job age option is passed through cli.ParseTimespec before being
  returned, whether it came from the command line or from the default.

  @return: (options, args) as from OptionParser.parse_args()

  """
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version="%%prog (ganeti) %s" %
                        constants.RELEASE_VERSION)

  parser.add_option("-d", "--debug", dest="debug",
                    help="Write all messages to stderr",
                    default=False, action="store_true")
  parser.add_option("-A", "--job-age", dest="job_age",
                    help="Autoarchive jobs older than this age (default"
                    " 6 hours)", default=6*3600)
  options, args = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)
  return options, args
466 a8083063 Iustin Pop
467 a8083063 Iustin Pop
468 a8083063 Iustin Pop
def main():
  """Main function.

  Ensures the node daemon is running, opens the state file, connects to
  the master daemon (trying to start it if it seems down) and runs the
  watcher. The state file is saved only if the run completed or ended in
  one of the expected "nothing to do" exits.

  """
  global client

  options, _ = ParseOptions()

  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
                     stderr_logging=options.debug)

  # Whether the state file should be saved when leaving the inner block
  update_file = False
  try:
    # on master or not, try to start the node daemon (use _PID but is
    # the same as daemon name)
    EnsureDaemon(constants.NODED_PID)

    notepad = WatcherState()
    try:
      try:
        client = cli.GetClient()
      except errors.OpPrereqError:
        # this is, from cli.GetClient, a not-master case
        logging.debug("Not on master, exiting")
        update_file = True
        sys.exit(constants.EXIT_SUCCESS)
      except luxi.NoMasterError as err:
        logging.warning("Master seems to be down (%s), trying to restart",
                        str(err))
        if not StartMaster():
          logging.critical("Can't start the master, exiting")
          sys.exit(constants.EXIT_FAILURE)
        # else retry the connection
        client = cli.GetClient()

      # we are on master now (use _PID but is the same as daemon name)
      EnsureDaemon(constants.RAPI_PID)

      try:
        watcher = Watcher(options, notepad)
      except errors.ConfigurationError:
        # Just exit if there's no configuration
        update_file = True
        sys.exit(constants.EXIT_SUCCESS)

      watcher.Run()
      update_file = True

    finally:
      # Runs on sys.exit() as well, hence the flag set before exiting
      if update_file:
        notepad.Save()
      else:
        logging.debug("Not updating status file due to failure")
  except SystemExit:
    # Let the exit requests above through untouched
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError as err:
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except errors.JobQueueFull:
    logging.error("Job queue is full, can't query cluster state")
  except errors.JobQueueDrainError:
    logging.error("Job queue is drained, can't maintain cluster state")
  except Exception as err:
    logging.error(str(err), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)
536 a8083063 Iustin Pop
537 5a3103e9 Michael Hanselmann
538 a8083063 Iustin Pop
if __name__ == '__main__':
  main()