Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 82d9caef

History | View | Annotate | Download (12.4 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 a8083063 Iustin Pop
import time
33 438b45d4 Michael Hanselmann
import logging
34 a8083063 Iustin Pop
from optparse import OptionParser
35 a8083063 Iustin Pop
36 a8083063 Iustin Pop
from ganeti import utils
37 a8083063 Iustin Pop
from ganeti import constants
38 67fe61c4 Michael Hanselmann
from ganeti import serializer
39 89e1fc26 Iustin Pop
from ganeti import errors
40 e125c67c Michael Hanselmann
from ganeti import opcodes
41 e125c67c Michael Hanselmann
from ganeti import cli
42 a8083063 Iustin Pop
43 a8083063 Iustin Pop
44 5a3103e9 Michael Hanselmann
# Number of restart attempts made for a downed instance before giving up
MAXTRIES = 5
# Instance states for which the watcher attempts a restart
BAD_STATES = ["ERROR_down"]
# Instance states for which no restart is attempted; any restart record
# kept for such an instance is dropped
HELPLESS_STATES = ["ERROR_nodedown"]
# Severity labels (not referenced by the code visible in this file)
NOTICE = "NOTICE"
ERROR = "ERROR"
# Keys used inside the per-instance and per-node state file entries
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object, assigned in main() from cli.GetClient()
client = None
56 e125c67c Michael Hanselmann
57 e125c67c Michael Hanselmann
58 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher was started on a host that is not the master.

  Raised by Watcher.__init__ and turned into the EXIT_NOTMASTER exit code
  by main().

  """
60 a8083063 Iustin Pop
61 a8083063 Iustin Pop
62 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Put a prefix in front of every line of a text.

  Args:
    s: The text to indent
    prefix: The string placed before each line

  Returns:
    The prefixed lines joined by newlines and terminated by exactly one
    newline.  An empty text yields the prefix followed by a newline.

  """
  line_break = '\n' + prefix
  indented = line_break.join(s.splitlines())
  return prefix + indented + "\n"
71 a8083063 Iustin Pop
72 a8083063 Iustin Pop
73 5a3103e9 Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a dictionary with two sections: "instance", holding the
  restart bookkeeping per instance name, and "node", holding the last
  recorded boot ID per node name.  It is read from
  constants.WATCHER_STATEFILE on construction and written back by Save().

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    Contents that cannot be loaded, or that do not have the expected
    dictionary structure, are logged and replaced by an empty state.

    Raises exception on lock contention.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')

    utils.LockFile(self.statefile.fileno())

    try:
      data = serializer.Load(self.statefile.read())
    except Exception:
      # Ignore errors while loading the file and treat it as empty.  The
      # exception is fetched through sys.exc_info() so that this handler
      # parses on every Python version.
      data = {}
      logging.warning(("Empty or invalid state file. Using defaults."
                       " Error message: %s"), sys.exc_info()[1])

    if not isinstance(data, dict):
      # Well-formed serialized data of the wrong type (e.g. a list) must not
      # break the section handling below.
      logging.warning("State file does not contain a dictionary."
                      " Using defaults.")
      data = {}

    # Make sure both sections exist and are dictionaries, since all
    # accessors index into them directly.
    for section in ("instance", "node"):
      if not isinstance(data.get(section), dict):
        data[section] = {}

    self._data = data
    # Serialized snapshot used by Save() to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Save state to file.

    If the state is unchanged since it was loaded, only the timestamps of
    the state file are updated.  Otherwise the serialized state is written
    through utils.WriteFile and the descriptor it returns becomes the new
    state file handle.  The file is not closed here; see Close().

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, 'w+')

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    Args:
      name - the node name to look up

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    Args:
      name - the node name
      bootid - the boot ID to store; must not be empty

    """
    assert bootid

    self._data["node"].setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns:
      The recorded restart count, or 0 if the instance has no record or
      its record carries no count.

    """
    idata = self._data["instance"]

    if instance.name in idata:
      # Entries come from a file on disk, so tolerate a missing counter
      return idata[instance.name].get(KEY_RESTART_COUNT, 0)

    return 0

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the restart counter of the
    instance, creating its record if necessary.

    Args:
      instance - the instance being restarted

    """
    inst = self._data["instance"].setdefault(instance.name, {})

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance.  Nothing happens
    if the instance has no record.

    """
    self._data["instance"].pop(instance.name, None)
202 a8083063 Iustin Pop
203 a8083063 Iustin Pop
204 a8083063 Iustin Pop
class Instance(object):
  """A virtual machine instance as seen by the watcher.

  Methods:
    Restart(): submit the opcode starting the instance.
    ActivateDisks(): submit the opcode activating the instance's disks.

  """
  def __init__(self, name, state, autostart):
    """Store the instance name, its reported state and its autostart flag.

    """
    (self.name, self.state, self.autostart) = (name, state, autostart)

  def Restart(self):
    """Submit a non-forced startup opcode for this instance.

    """
    startup = opcodes.OpStartupInstance(instance_name=self.name,
                                        force=False,
                                        extra_args=None)
    cli.SubmitOpCode(startup, cl=client)

  def ActivateDisks(self):
    """Submit an opcode activating all disks of this instance.

    """
    activation = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(activation, cl=client)
231 a8083063 Iustin Pop
232 a8083063 Iustin Pop
233 5a3103e9 Michael Hanselmann
def GetInstanceList(with_secondaries=None):
  """Query the cluster for its instances.

  Args:
    with_secondaries: if not None, a list of node names; only instances
      having at least one of these nodes among their secondary nodes are
      returned.

  Returns:
    A list of Instance objects built from the "name", "status" and
    "admin_state" query fields (the latter is passed as the autostart flag).

  """
  wanted = ["name", "status", "admin_state"]
  filter_by_secondary = with_secondaries is not None

  if filter_by_secondary:
    wanted.append("snodes")

  found = []
  for row in client.QueryInstances([], wanted):
    if filter_by_secondary:
      (name, status, autostart, snodes) = row

      # Instances without secondary nodes can never match
      if not snodes:
        continue

      matching = [node for node in with_secondaries if node in snodes]
      if not matching:
        continue
    else:
      (name, status, autostart) = row

    found.append(Instance(name, status, autostart))

  return found
264 5a3103e9 Michael Hanselmann
265 5a3103e9 Michael Hanselmann
266 5a3103e9 Michael Hanselmann
def GetNodeBootIDs():
  """Query all nodes and return a dict mapping node name to boot ID.

  """
  boot_ids = {}
  for name, bootid in client.QueryNodes([], ["name", "bootid"]):
    boot_ids[name] = bootid
  return boot_ids
272 a8083063 Iustin Pop
273 a8083063 Iustin Pop
274 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Check that this host is the master and collect cluster data.

    Raises:
      NotMasterError: if the configured master node is not this host.

    """
    master = client.QueryConfigValues(["master_node"])[0]
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    # Names of instances whose restart was submitted without error during
    # this run; CheckDisks skips them.
    self.started_instances = set()

  def Run(self):
    """Run all checks, saving the watcher state even if one of them fails.

    """
    notepad = WatcherState()
    try:
      self.CheckInstances(notepad)
      self.CheckDisks(notepad)
      self.VerifyDisks()
    finally:
      notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the one stored in the state,
    the disks of all autostart instances using that node as a secondary are
    activated, and the new boot ID is stored.

    Args:
      notepad - the WatcherState holding the previously seen boot IDs

    """
    check_nodes = []
    for name, new_id in self.bootids.items():
      if new_id is None:
        # Bad node, not returning a boot id
        logging.debug("Node %s missing boot id, skipping secondary checks",
                      name)
        continue
      if notepad.GetNodeBootID(name) != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for instance in GetInstanceList(with_secondaries=check_nodes):
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception:
          # Best effort: a failure for one instance must not stop the others
          logging.exception("Error while activating disks for instance %s",
                            instance.name)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Args:
      notepad - the WatcherState recording restart attempts

    """
    for instance in self.instances:
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          # Recording one more attempt pushes the counter above MAXTRIES, so
          # the message below is logged only once.
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                        instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception:
          logging.exception("Error while restarting instance %s",
                            instance.name)

        # The attempt counts whether or not the restart could be submitted
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    The third element of the opcode result is taken as the list of names of
    instances with offline disks; their disks are activated through a
    single job.

    """
    op = opcodes.OpVerifyDisks()
    result = cli.SubmitOpCode(op, cl=client)
    # The result is indexed at position 2 below, so check its length too
    if not isinstance(result, (tuple, list)) or len(result) < 3:
      logging.error("Can't get a valid result from verify-disks")
      return
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  ", ".join(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = [opcodes.OpActivateInstanceDisks(instance_name=name)
           for name in offline_disk_instances]
    job_id = cli.SendJob(job, cl=client)

    cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
396 a8083063 Iustin Pop
397 a8083063 Iustin Pop
398 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The only option is -d/--debug, a flag defaulting to False.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_string)

  parser.add_option("-d", "--debug", dest="debug",
                    action="store_true", default=False,
                    help="Write all messages to stderr")
  return parser.parse_args()
415 a8083063 Iustin Pop
416 a8083063 Iustin Pop
417 a8083063 Iustin Pop
def main():
  """Main function.

  Parses the options, sets up logging, creates the global client and runs
  the watcher.  Exits with EXIT_SUCCESS if there is no configuration,
  EXIT_NOTMASTER if this host is not the master, EXIT_NODESETUP_ERROR if
  a hostname cannot be resolved and EXIT_FAILURE on any other error.

  """
  global client

  options, _ = ParseOptions()

  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
                     stderr_logging=options.debug)

  try:
    client = cli.GetClient()

    try:
      runner = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)

    runner.Run()
  except SystemExit:
    # Let the sys.exit() calls through untouched
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    failure = sys.exc_info()[1]
    logging.error("Cannot resolve hostname '%s', exiting.", failure.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Exception:
    failure = sys.exc_info()[1]
    logging.error(str(failure), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)
449 a8083063 Iustin Pop
450 5a3103e9 Michael Hanselmann
451 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported
if __name__ == "__main__":
  main()