Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 90b54c26

History | View | Annotate | Download (14.4 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erronously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 a8083063 Iustin Pop
import time
33 438b45d4 Michael Hanselmann
import logging
34 a8083063 Iustin Pop
from optparse import OptionParser
35 a8083063 Iustin Pop
36 a8083063 Iustin Pop
from ganeti import utils
37 a8083063 Iustin Pop
from ganeti import constants
38 67fe61c4 Michael Hanselmann
from ganeti import serializer
39 89e1fc26 Iustin Pop
from ganeti import errors
40 e125c67c Michael Hanselmann
from ganeti import opcodes
41 e125c67c Michael Hanselmann
from ganeti import cli
42 7dfb83c2 Iustin Pop
from ganeti import luxi
43 a8083063 Iustin Pop
44 a8083063 Iustin Pop
45 5a3103e9 Michael Hanselmann
# Upper bound on the restart attempts made for a single downed instance
MAXTRIES = 5
# Instance states for which a restart is attempted
BAD_STATES = ["ERROR_down"]
# Instance states for which nothing can be done; only the restart record
# is dropped
HELPLESS_STATES = ["ERROR_nodedown", "ERROR_nodeoffline"]
NOTICE = "NOTICE"
ERROR = "ERROR"
# Keys used inside the per-instance and per-node state dictionaries
KEY_RESTART_COUNT = 'restart_count'
KEY_RESTART_WHEN = 'restart_when'
KEY_BOOT_ID = 'bootid'


# Client used to talk to the master daemon; assigned in main()
client = None
57 e125c67c Michael Hanselmann
58 e125c67c Michael Hanselmann
59 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher was started on a non-master node.

  Raised when the configured master node name differs from the name of
  the host the watcher is running on.

  """
61 a8083063 Iustin Pop
62 a8083063 Iustin Pop
63 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Put a prefix in front of every line of a text.

  @param s: the text whose lines should be prefixed
  @param prefix: the string inserted before each line
  @return: the prefixed text, with a newline appended at the end

  """
  line_separator = '\n' + prefix
  body = line_separator.join(s.splitlines())
  return prefix + body + "\n"
71 a8083063 Iustin Pop
72 a8083063 Iustin Pop
73 7dfb83c2 Iustin Pop
def StartMaster():
  """Attempt to launch the master daemon.

  Runs the C{ganeti-masterd} command and logs its output if it failed.

  @return: True if the command did not fail, False otherwise

  """
  outcome = utils.RunCmd(['ganeti-masterd'])
  if not outcome.failed:
    return True
  logging.error("Can't start the master daemon: output '%s'", outcome.output)
  return False
81 7dfb83c2 Iustin Pop
82 7dfb83c2 Iustin Pop
83 5a3103e9 Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The in-memory state is a dictionary with two sections: C{"instance"}
  (per-instance restart bookkeeping) and C{"node"} (the boot ID last
  recorded for each node).

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    A state file that cannot be read or parsed, or whose content is not a
    dictionary, is treated as empty and a warning is logged.

    Raises exception on lock contention.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')

    try:
      utils.LockFile(self.statefile.fileno())
    except Exception:
      # Don't leave the file open behind us if the lock can't be obtained
      self.statefile.close()
      self.statefile = None
      raise

    try:
      state_data = self.statefile.read()
      if state_data:
        loaded = serializer.Load(state_data)
        if not isinstance(loaded, dict):
          raise ValueError("top-level state is not a dictionary")
        self._data = loaded
      else:
        self._data = {}
    except Exception:
      # Ignore errors while loading the file and treat it as empty.  The
      # exception is fetched through sys.exc_info() so that the except clause
      # parses on every Python version.
      self._data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), sys.exc_info()[1])

    # Both sections must exist and be dictionaries, whatever was loaded
    for section in ("instance", "node"):
      if not isinstance(self._data.get(section), dict):
        self._data[section] = {}

    # Remembered so that Save() can detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Save state to file; the state file stays open and locked.

    If the serialized state is identical to what was loaded (or last saved),
    only the file's timestamps are updated.  Otherwise the file is rewritten
    and C{self.statefile} is pointed at the newly written file.

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    previous = self.statefile
    self.statefile = os.fdopen(fd, 'w+')
    # Release the old handle explicitly (presumably it refers to the file
    # that was just replaced) instead of leaving it to garbage collection
    previous.close()
    # A further Save() without changes must not rewrite the file again
    self._orig_data = serialized_form

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @param name: the node name to look up

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @param name: the node name
    @param bootid: the boot ID to record; must be a true value

    """
    assert bootid

    ndata = self._data["node"]

    if name not in ndata:
      ndata[name] = {}

    ndata[name][KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    @type instance: L{Instance}
    @param instance: the instance to look up
    @return: the recorded restart count, or 0 if the instance has no record
        or its record carries no count

    """
    idata = self._data["instance"]

    if instance.name in idata:
      # Mirror RecordRestartAttempt: a record without a count means zero
      return idata[instance.name].get(KEY_RESTART_COUNT, 0)

    return 0

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the restart counter of the
    instance, creating its record if needed.

    @type instance: L{Instance}
    @param instance: the instance being restarted

    """
    idata = self._data["instance"]

    inst = idata.setdefault(instance.name, {})

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances).

    @type instance: L{Instance}
    @param instance: the instance to remove from books

    """
    idata = self._data["instance"]

    if instance.name in idata:
      del idata[instance.name]
217 a8083063 Iustin Pop
218 a8083063 Iustin Pop
219 a8083063 Iustin Pop
class Instance(object):
  """A virtual machine instance as seen by the watcher.

  Holds the instance name, its reported state and its autostart flag, and
  offers the operations the watcher performs on it.

  """
  def __init__(self, name, state, autostart):
    """Store the instance attributes.

    @param name: the instance name
    @param state: the instance status as reported by the cluster
    @param autostart: whether the instance should be brought up automatically

    """
    self.autostart = autostart
    self.state = state
    self.name = name

  def Restart(self):
    """Submit an opcode starting the instance (without forcing).

    """
    cli.SubmitOpCode(opcodes.OpStartupInstance(instance_name=self.name,
                                               force=False),
                     cl=client)

  def ActivateDisks(self):
    """Submit an opcode activating all disks of the instance.

    """
    cli.SubmitOpCode(opcodes.OpActivateInstanceDisks(instance_name=self.name),
                     cl=client)
241 a8083063 Iustin Pop
242 a8083063 Iustin Pop
243 6dfcc47b Iustin Pop
def GetClusterData():
  """Fetch instance and node information from the cluster.

  One job holding an instance query and a node query is submitted, polled
  for its results and archived afterwards.

  @return: a tuple (instances, nodes, smap) where C{instances} maps instance
      names to L{Instance} objects, C{nodes} maps node names to
      (bootid, offline) tuples and C{smap} maps secondary node names to the
      list of instance names using them

  """
  instance_fields = ["name", "status", "admin_state", "snodes"]
  node_fields = ["name", "bootid", "offline"]
  query_instances = opcodes.OpQueryInstances(output_fields=instance_fields,
                                             names=[], use_locking=True)
  query_nodes = opcodes.OpQueryNodes(output_fields=node_fields, names=[],
                                     use_locking=True)

  job_id = client.SubmitJob([query_instances, query_nodes])

  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  smap = {}
  instances = {}
  for (name, status, autostart, snodes) in all_results[0]:
    # remember which instances use each node as a secondary
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart)

  nodes = {}
  for node_name, bootid, offline in all_results[1]:
    nodes[node_name] = (bootid, offline)

  client.ArchiveJob(job_id)

  return instances, nodes, smap
279 a8083063 Iustin Pop
280 a8083063 Iustin Pop
281 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self, opts, notepad):
    """Verify we run on the master node and fetch the cluster data.

    @param opts: the parsed command line options (C{job_age} is used by Run)
    @param notepad: the L{WatcherState} used for bookkeeping
    @raise NotMasterError: if the configured master node is not this host

    """
    self.notepad = notepad
    master = client.QueryConfigValues(["master_node"])[0]
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances, self.bootids, self.smap = GetClusterData()
    # Names of the instances a start was successfully submitted for in this run
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Watcher run sequence.

    Archives old jobs, restarts downed instances, activates disks of
    instances whose secondary node changed its boot ID and finally runs the
    cluster-wide disk verification.

    """
    notepad = self.notepad
    self.ArchiveJobs(self.opts.job_age)
    self.CheckInstances(notepad)
    self.CheckDisks(notepad)
    self.VerifyDisks()

  def ArchiveJobs(self, age):
    """Archive old jobs.

    @param age: the age threshold passed to the client's auto-archive call

    """
    arch_count, left_count = client.AutoArchiveJobs(age)
    # Let the logging module do the formatting, as everywhere else in the file
    logging.debug("Archived %s jobs, left %s", arch_count, left_count)

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    A node whose current boot ID differs from the recorded one is taken as
    rebooted: the disks of the autostart instances using it as a secondary
    node are activated and the new boot ID is recorded.

    @param notepad: the L{WatcherState} holding the known boot IDs

    """
    check_nodes = []
    for name, (new_id, offline) in self.bootids.items():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
        continue
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for node in check_nodes:
        if node not in self.smap:
          continue
        for instance_name in self.smap[node]:
          instance = self.instances[instance_name]
          if not instance.autostart:
            logging.info(("Skipping disk activation for non-autostart"
                          " instance %s"), instance.name)
            continue
          if instance.name in self.started_instances:
            # we already tried to start the instance, which should have
            # activated its drives (if they can be at all)
            continue
          try:
            logging.info("Activating disks for instance %s", instance.name)
            instance.ActivateDisks()
          except Exception:
            logging.exception("Error while activating disks for instance %s",
                              instance.name)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    @param notepad: the L{WatcherState} recording the restart attempts

    """
    for instance in self.instances.values():
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          # Limit reached: bump the counter past MAXTRIES so that the error
          # below is logged only once, then give up on this instance
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                       instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception:
          logging.exception("Error while restarting instance %s",
                            instance.name)

        # The attempt is counted whether or not the submission succeeded
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    Submits a disk verification job and, if it reports instances with
    offline disks, submits one further job activating the disks of all of
    them.

    """
    op = opcodes.OpVerifyDisks()
    job_id = client.SubmitJob([op])
    result = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(job_id)
    # The third element is read below, so a too-short result is invalid as well
    if not isinstance(result, (tuple, list)) or len(result) < 3:
      logging.error("Can't get a valid result from verify-disks")
      return
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  ", ".join(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = [opcodes.OpActivateInstanceDisks(instance_name=name)
           for name in offline_disk_instances]
    job_id = cli.SendJob(job, cl=client)

    cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
420 a8083063 Iustin Pop
421 a8083063 Iustin Pop
422 a8083063 Iustin Pop
def ParseOptions():
  """Build the option parser and parse the command line.

  The C{job_age} option is converted through C{cli.ParseTimespec} before
  the options are returned.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option("-d", "--debug",
                    action="store_true", default=False, dest="debug",
                    help="Write all messages to stderr")
  parser.add_option("-A", "--job-age",
                    default=6*3600, dest="job_age",
                    help="Autoarchive jobs older than this age (default 6 hours)")

  opts, positional = parser.parse_args()
  opts.job_age = cli.ParseTimespec(opts.job_age)
  return opts, positional
442 a8083063 Iustin Pop
443 a8083063 Iustin Pop
444 a8083063 Iustin Pop
def main():
  """Entry point of the watcher.

  Parses the options, sets up logging, opens the state file and connects to
  the master daemon (trying to start it once if it seems down), then runs
  the watcher.  The state file is saved on the way out unless the master
  daemon could not be started.  Failures are turned into exit codes.

  """
  global client

  options, _ = ParseOptions()

  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
                     stderr_logging=options.debug)

  save_state = True
  try:
    state = WatcherState()
    try:
      try:
        client = cli.GetClient()
      except errors.OpPrereqError:
        # this is, from cli.GetClient, a not-master case
        logging.debug("Not on master, exiting")
        sys.exit(constants.EXIT_SUCCESS)
      except luxi.NoMasterError:
        logging.warning("Master seems to be down (%s), trying to restart",
                        str(sys.exc_info()[1]))
        if StartMaster():
          # the daemon was started, retry the connection
          client = cli.GetClient()
        else:
          logging.critical("Can't start the master, exiting")
          save_state = False
          sys.exit(constants.EXIT_FAILURE)

      try:
        watcher = Watcher(options, state)
      except errors.ConfigurationError:
        # Just exit if there's no configuration
        sys.exit(constants.EXIT_SUCCESS)

      watcher.Run()
    finally:
      if not save_state:
        logging.debug("Not updating status file due to failure")
      else:
        state.Save()
  except SystemExit:
    # exit requests from the code above must pass through untouched
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    unresolved = sys.exc_info()[1].args[0]
    logging.error("Cannot resolve hostname '%s', exiting.", unresolved)
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Exception:
    logging.error(str(sys.exc_info()[1]), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)


if __name__ == "__main__":
  main()