Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 74aa2478

History | View | Annotate | Download (13 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 a8083063 Iustin Pop
import time
33 438b45d4 Michael Hanselmann
import logging
34 a8083063 Iustin Pop
from optparse import OptionParser
35 a8083063 Iustin Pop
36 a8083063 Iustin Pop
from ganeti import utils
37 a8083063 Iustin Pop
from ganeti import constants
38 67fe61c4 Michael Hanselmann
from ganeti import serializer
39 89e1fc26 Iustin Pop
from ganeti import errors
40 e125c67c Michael Hanselmann
from ganeti import opcodes
41 e125c67c Michael Hanselmann
from ganeti import cli
42 a8083063 Iustin Pop
43 a8083063 Iustin Pop
44 5a3103e9 Michael Hanselmann
# Number of restart attempts made for a down instance before giving up
MAXTRIES = 5
# Instance states for which a restart is attempted
BAD_STATES = ['ERROR_down']
# Instance states for which no restart is attempted; any restart record
# kept for such an instance is dropped
HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline']
# NOTE(review): NOTICE and ERROR are not referenced in the visible part of
# this file -- confirm whether they are still needed
NOTICE = 'NOTICE'
ERROR = 'ERROR'
# Keys of the per-instance and per-node records stored in the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object, set by main() before any query is made
client = None
56 e125c67c Michael Hanselmann
57 e125c67c Michael Hanselmann
58 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Exception raised when this host is not the master.

  It is raised while constructing a L{Watcher} and handled in L{main},
  which exits with C{constants.EXIT_NOTMASTER}.

  """
60 a8083063 Iustin Pop
61 a8083063 Iustin Pop
62 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Prefix every line of a text and terminate it with a newline.

  @param s: the string to indent
  @param prefix: the string to prepend each line
  @return: the indented text; an empty input yields the prefix followed
      by a newline

  """
  # Joining on newline-plus-prefix puts the prefix in front of every line
  # but the first one, which gets its prefix explicitly below.
  line_separator = '\n' + prefix
  indented_body = line_separator.join(s.splitlines())
  return prefix + indented_body + '\n'
70 a8083063 Iustin Pop
71 a8083063 Iustin Pop
72 5a3103e9 Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The parsed state is a dictionary with two sections: C{"instance"}, which
  maps instance names to their restart bookkeeping, and C{"node"}, which
  maps node names to the last boot ID seen for them.

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    An unreadable or unparseable file is treated as empty.

    Raises exception on lock contention.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')

    utils.LockFile(self.statefile.fileno())

    try:
      self._data = serializer.Load(self.statefile.read())
    except Exception:
      # Ignore errors while loading the file and treat it as empty.  The
      # exception is fetched via sys.exc_info() so that no version-specific
      # "except ..., name" syntax is needed.
      msg = sys.exc_info()[1]
      self._data = {}
      logging.warning(("Empty or invalid state file. Using defaults."
                       " Error message: %s"), msg)

    # Make sure both sections exist so the accessors can index them directly
    self._data.setdefault("instance", {})
    self._data.setdefault("node", {})

    # Serialized snapshot used by Save() to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Save state to file, keeping it open and locked.

    If the data did not change since it was loaded, only the modification
    time of the state file is updated.  Use L{Close} to release the lock.

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    previous_statefile = self.statefile
    self.statefile = os.fdopen(fd, 'w+')
    # The new file is locked by now, hence the handle of the replaced file
    # can be released explicitly instead of waiting for garbage collection.
    previous_statefile.close()

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @param name: the node name to look up

    """
    ndata = self._data["node"]

    if name not in ndata:
      return None
    return ndata[name].get(KEY_BOOT_ID)

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @param name: the node name
    @param bootid: the boot ID to remember; must not be empty

    """
    assert bootid

    ndata = self._data["node"]
    ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    @type instance: L{Instance}
    @param instance: the instance to look up
    @return: the recorded number of attempts, 0 if there is no record

    """
    idata = self._data["instance"]

    if instance.name in idata:
      # A record without a counter (e.g. from a damaged state file) is
      # treated like an instance which was never restarted
      return idata[instance.name].get(KEY_RESTART_COUNT, 0)

    return 0

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the attempt counter.

    @type instance: L{Instance}
    @param instance: the instance being restarted

    """
    idata = self._data["instance"]
    inst = idata.setdefault(instance.name, {})

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances).

    @type instance: L{Instance}
    @param instance: the instance to remove from books

    """
    idata = self._data["instance"]
    idata.pop(instance.name, None)
202 a8083063 Iustin Pop
203 a8083063 Iustin Pop
204 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Holds the instance name, its reported state and its autostart flag, and
  wraps the opcodes used to act on the instance through the global client.

  """
  def __init__(self, name, state, autostart):
    """Store the instance attributes.

    @param name: the instance name
    @param state: the instance status as reported by the cluster
    @param autostart: whether the instance should be handled automatically

    """
    self.autostart = autostart
    self.state = state
    self.name = name

  def Restart(self):
    """Encapsulates the start of an instance.

    """
    startup_op = opcodes.OpStartupInstance(instance_name=self.name,
                                           force=False,
                                           extra_args=None)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    """
    activate_op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
228 a8083063 Iustin Pop
229 a8083063 Iustin Pop
230 5a3103e9 Michael Hanselmann
def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  @param with_secondaries: if not None, a collection of node names; only
      instances having at least one of these nodes among their secondary
      nodes are returned
  @return: list of L{Instance} objects

  """
  filter_by_secondary = with_secondaries is not None

  query_fields = ["name", "status", "admin_state"]
  if filter_by_secondary:
    query_fields.append("snodes")

  instances = []
  for row in client.QueryInstances([], query_fields, True):
    if filter_by_secondary:
      (name, status, autostart, snodes) = row

      # Instances without secondary nodes can never match
      if not snodes:
        continue

      matching_nodes = [node for node in with_secondaries if node in snodes]
      if not matching_nodes:
        continue
    else:
      (name, status, autostart) = row

    instances.append(Instance(name, status, autostart))

  return instances
261 5a3103e9 Michael Hanselmann
262 5a3103e9 Michael Hanselmann
263 5a3103e9 Michael Hanselmann
def GetNodeBootIDs():
  """Get a dict mapping nodes to boot IDs.

  @return: dictionary mapping each node name to a C{(bootid, offline)}
      tuple

  """
  node_fields = ["name", "bootid", "offline"]

  boot_ids = {}
  for (name, bootid, offline) in client.QueryNodes([], node_fields, True):
    boot_ids[name] = (bootid, offline)
  return boot_ids
269 a8083063 Iustin Pop
270 a8083063 Iustin Pop
271 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self, opts):
    """Query the cluster data needed for one watcher run.

    @param opts: the parsed command line options; C{job_age} is used by
        L{Run}
    @raise NotMasterError: if this host is not the master node

    """
    master = client.QueryConfigValues(["master_node"])[0]
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    # Names of the instances for which a start was submitted in this run
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Run all checks and save the watcher state afterwards.

    The state is saved even if one of the checks raises an exception.

    """
    notepad = WatcherState()
    try:
      self.ArchiveJobs(self.opts.job_age)
      self.CheckInstances(notepad)
      self.CheckDisks(notepad)
      self.VerifyDisks()
    finally:
      notepad.Save()

  def ArchiveJobs(self, age):
    """Archive old jobs.

    @param age: the age limit handed to the client's auto-archive call

    """
    arch_count, left_count = client.AutoArchiveJobs(age)
    # Let the logging module do the formatting, so nothing is formatted
    # when debug messages are disabled
    logging.debug("Archived %s jobs, left %s", arch_count, left_count)

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the recorded one, the disks
    of the autostart instances using it as a secondary node are activated
    and the new boot ID is recorded.

    @type notepad: L{WatcherState}
    @param notepad: the watcher state holding the recorded boot IDs

    """
    check_nodes = []
    for name, (new_id, offline) in self.bootids.items():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
        continue
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for instance in GetInstanceList(with_secondaries=check_nodes):
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception:
          # One failing instance must not prevent handling the others
          logging.exception("Error while activating disks for instance %s",
                            instance.name)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    @type notepad: L{WatcherState}
    @param notepad: the watcher state recording the restart attempts

    """
    for instance in self.instances:
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          # Exactly MAXTRIES attempts were made: bump the counter once more
          # so that the following runs take the quiet branch above
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                        instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception:
          logging.exception("Error while restarting instance %s",
                            instance.name)

        # The attempt is counted whether or not the restart succeeded
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    The third element of the opcode result is taken as the list of
    instances with offline disks; their disks are activated in one job.

    """
    op = opcodes.OpVerifyDisks()
    result = cli.SubmitOpCode(op, cl=client)
    # The length is checked as well, since the third element is read below
    if not isinstance(result, (tuple, list)) or len(result) < 3:
      logging.error("Can't get a valid result from verify-disks")
      return
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  ", ".join(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = [opcodes.OpActivateInstanceDisks(instance_name=name)
           for name in offline_disk_instances]
    job_id = cli.SendJob(job, cl=client)

    cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
404 a8083063 Iustin Pop
405 a8083063 Iustin Pop
406 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The C{job_age} option is run through C{cli.ParseTimespec} before the
  options are returned.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_string)

  parser.add_option("-d", "--debug", dest="debug", action="store_true",
                    default=False, help="Write all messages to stderr")
  parser.add_option("-A", "--job-age", dest="job_age", default=6*3600,
                    help=("Autoarchive jobs older than this age"
                          " (default 6 hours)"))

  (options, args) = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)
  return (options, args)
426 a8083063 Iustin Pop
427 a8083063 Iustin Pop
428 a8083063 Iustin Pop
def main():
  """Main function.

  Sets up logging and the global client, then runs the watcher once.  The
  process exits with C{constants.EXIT_SUCCESS} if there is no cluster
  configuration, C{constants.EXIT_NOTMASTER} if this is not the master
  node, C{constants.EXIT_NODESETUP_ERROR} if the hostname cannot be
  resolved and C{constants.EXIT_FAILURE} on any other error.

  """
  global client

  # Positional arguments are not used by the watcher
  options = ParseOptions()[0]

  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
                     stderr_logging=options.debug)

  try:
    client = cli.GetClient()

    try:
      watcher = Watcher(options)
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)

    watcher.Run()
  except SystemExit:
    # Let the exit requests pass through the generic handler below
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    resolver_err = sys.exc_info()[1]
    logging.error("Cannot resolve hostname '%s', exiting.",
                  resolver_err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Exception:
    logging.error(str(sys.exc_info()[1]), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)
460 a8083063 Iustin Pop
461 5a3103e9 Michael Hanselmann
462 a8083063 Iustin Pop
if __name__ == '__main__':
  # Run the watcher only when executed as a script, not when imported
  main()