Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 8b3fd458

History | View | Annotate | Download (12.4 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 a8083063 Iustin Pop
import time
33 438b45d4 Michael Hanselmann
import logging
34 a8083063 Iustin Pop
from optparse import OptionParser
35 a8083063 Iustin Pop
36 a8083063 Iustin Pop
from ganeti import utils
37 a8083063 Iustin Pop
from ganeti import constants
38 67fe61c4 Michael Hanselmann
from ganeti import serializer
39 89e1fc26 Iustin Pop
from ganeti import errors
40 e125c67c Michael Hanselmann
from ganeti import opcodes
41 3b316acb Iustin Pop
from ganeti import logger
42 e125c67c Michael Hanselmann
from ganeti import cli
43 a8083063 Iustin Pop
44 a8083063 Iustin Pop
45 5a3103e9 Michael Hanselmann
# Number of restart attempts made for a downed instance before giving up
MAXTRIES = 5

# Instance states for which the watcher attempts a restart
BAD_STATES = ["ERROR_down"]
# Instance states for which the watcher only drops its restart record
HELPLESS_STATES = ["ERROR_nodedown"]

NOTICE = "NOTICE"
ERROR = "ERROR"

# Keys of the per-instance and per-node records kept in the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object, set up by main()
client = None
57 e125c67c Michael Hanselmann
58 e125c67c Michael Hanselmann
59 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Raised when the watcher is run on a node that is not the master.

  """
61 a8083063 Iustin Pop
62 a8083063 Iustin Pop
63 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Prefix every line of a text and terminate the result with a newline.

  Args:
    s: The text to indent
    prefix: The string put in front of each line

  Returns:
    The indented text; for an empty text this is the prefix followed by
    a newline.

  """
  lines = s.splitlines()
  separator = '\n' + prefix
  return prefix + separator.join(lines) + "\n"
72 a8083063 Iustin Pop
73 a8083063 Iustin Pop
74 5a3103e9 Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a dictionary with two top-level keys: "instance", holding
  the restart bookkeeping of each instance, and "node", holding the last
  recorded boot ID of each node.

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    A file that is empty, cannot be parsed or does not contain a
    dictionary is treated as an empty state.

    Raises exception on lock contention.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')

    utils.LockFile(self.statefile.fileno())

    try:
      data = serializer.Load(self.statefile.read())
      if not isinstance(data, dict):
        # Well-formed data of the wrong shape (e.g. a list) would otherwise
        # break the key lookups below
        raise ValueError("State file does not contain a dictionary")
    except Exception:
      # Ignore errors while loading the file and treat it as empty.  The
      # exception is fetched through sys.exc_info() as this spelling is
      # accepted by every Python version.
      msg = sys.exc_info()[1]
      data = {}
      logging.warning(("Empty or invalid state file. Using defaults."
                       " Error message: %s"), msg)

    self._data = data

    if "instance" not in self._data:
      self._data["instance"] = {}
    if "node" not in self._data:
      self._data["node"] = {}

    # Serialized snapshot used by Save() to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Save state to file.

    If the state did not change since it was loaded, only the timestamps
    of the state file are updated.  Otherwise the file is rewritten via
    utils.WriteFile (with utils.LockFile as prewrite hook) and the returned
    descriptor is kept open in self.statefile; call Close() to release it.

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, 'w+')

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    Args:
      name - the name of the node to look up

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    Args:
      name - the name of the node
      bootid - the boot ID to record; must not be empty

    """
    assert bootid

    ndata = self._data["node"]

    if name not in ndata:
      ndata[name] = {}

    ndata[name][KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns:
      The recorded restart count, or 0 if there is no record for the
      instance or the record carries no count.

    """
    idata = self._data["instance"]

    if instance.name in idata:
      # A record without a count (e.g. from a damaged state file) counts
      # as zero attempts, the same default RecordRestartAttempt uses
      return idata[instance.name].get(KEY_RESTART_COUNT, 0)

    return 0

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the restart count of the
    instance, creating its record if needed.

    Args:
      instance - the instance being restarted

    """
    idata = self._data["instance"]

    if instance.name not in idata:
      inst = idata[instance.name] = {}
    else:
      inst = idata[instance.name]

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; it does nothing
    if there is no such record.

    """
    idata = self._data["instance"]

    if instance.name in idata:
      del idata[instance.name]
203 a8083063 Iustin Pop
204 a8083063 Iustin Pop
205 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Attributes:
    name: the instance name
    state: the instance status
    autostart: whether the instance is handled automatically

  Methods:
    Restart(): issue a command to restart the represented machine.
    ActivateDisks(): issue a command to activate the machine's disks.

  """
  def __init__(self, name, state, autostart):
    self.name, self.state, self.autostart = name, state, autostart

  def Restart(self):
    """Submit the opcode starting up this instance.

    """
    startup_op = opcodes.OpStartupInstance(instance_name=self.name,
                                           force=False,
                                           extra_args=None)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Submit the opcode activating all disks of this instance.

    """
    activate_op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
232 a8083063 Iustin Pop
233 a8083063 Iustin Pop
234 5a3103e9 Michael Hanselmann
def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  Args:
    with_secondaries: if not None, a collection of node names; only
      instances having at least one of these nodes among their secondary
      nodes are returned

  Returns:
    A list of Instance objects

  """
  filter_by_secondary = with_secondaries is not None

  query_fields = ["name", "status", "admin_state"]
  if filter_by_secondary:
    query_fields.append("snodes")

  instances = []
  for row in client.QueryInstances([], query_fields):
    if filter_by_secondary:
      (name, status, autostart, snodes) = row

      # Instances without secondary nodes can never match
      if not snodes:
        continue

      matching = [node for node in with_secondaries if node in snodes]
      if not matching:
        continue
    else:
      (name, status, autostart) = row

    instances.append(Instance(name, status, autostart))

  return instances
265 5a3103e9 Michael Hanselmann
266 5a3103e9 Michael Hanselmann
267 5a3103e9 Michael Hanselmann
def GetNodeBootIDs():
  """Query all nodes and return a dict mapping node name to boot ID.

  """
  boot_ids = {}
  for name, bootid in client.QueryNodes([], ["name", "bootid"]):
    boot_ids[name] = bootid
  return boot_ids
273 a8083063 Iustin Pop
274 a8083063 Iustin Pop
275 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Query the cluster data needed for one watcher run.

    Raises NotMasterError if the configured master node is not this host.

    """
    master = client.QueryConfigValues(["master_node"])[0]
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    # Names of the instances whose restart was submitted during this run
    self.started_instances = set()

  def Run(self):
    """Run all checks; the state is saved even if one of them fails.

    """
    notepad = WatcherState()
    try:
      self.CheckInstances(notepad)
      self.CheckDisks(notepad)
      self.VerifyDisks()
    finally:
      notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the one recorded in the
    state, the disks of the autostart instances having that node as a
    secondary are activated and the new boot ID is recorded.

    Args:
      notepad - the WatcherState holding the recorded boot IDs

    """
    check_nodes = []
    for name, new_id in self.bootids.items():
      if new_id is None:
        # Bad node, not returning a boot id
        logging.debug("Node %s missing boot id, skipping secondary checks",
                      name)
        continue
      if notepad.GetNodeBootID(name) != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for instance in GetInstanceList(with_secondaries=check_nodes):
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception:
          # A failure for one instance must not prevent handling the others
          logging.exception("Error while activating disks for instance %s",
                            instance.name)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Args:
      notepad - the WatcherState recording the restart attempts

    """
    for instance in self.instances:
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          # Record one more attempt so that the next runs take the quiet
          # branch above instead of logging this error again
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                        instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception:
          logging.exception("Error while restarting instance %s",
                            instance.name)

        # The attempt is counted whether or not the restart succeeded
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    The disks of all instances reported in the third element of the
    verify-disks result are activated through a single job.

    """
    op = opcodes.OpVerifyDisks()
    result = cli.SubmitOpCode(op, cl=client)
    # The result must be a sequence long enough to hold the element read
    # below
    if not isinstance(result, (tuple, list)) or len(result) < 3:
      logging.error("Can't get a valid result from verify-disks")
      return
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  ", ".join(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = [opcodes.OpActivateInstanceDisks(instance_name=name)
           for name in offline_disk_instances]
    job_id = cli.SendJob(job, cl=client)

    cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
397 a8083063 Iustin Pop
398 a8083063 Iustin Pop
399 a8083063 Iustin Pop
def ParseOptions():
  """Build the option parser and parse the command line.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_string)

  parser.add_option("-d", "--debug", dest="debug",
                    action="store_true", default=False,
                    help="Write all messages to stderr")
  return parser.parse_args()
416 a8083063 Iustin Pop
417 a8083063 Iustin Pop
418 a8083063 Iustin Pop
def main():
  """Set up logging and the client, then run the watcher once.

  Exits the process with one of the constants.EXIT_* codes on the
  handled error conditions.

  """
  global client

  opts, args = ParseOptions()

  debug = opts.debug
  logger.SetupLogging(constants.LOG_WATCHER, debug=debug,
                      stderr_logging=debug)

  try:
    client = cli.GetClient()

    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)

    watcher.Run()
  except SystemExit:
    # Let the exit requested above through instead of logging it below
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    # sys.exc_info() is accepted by every Python version, unlike binding
    # the exception in the except clause
    resolver_err = sys.exc_info()[1]
    logging.error("Cannot resolve hostname '%s', exiting.",
                  resolver_err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Exception:
    failure = sys.exc_info()[1]
    logging.error(str(failure), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)
450 a8083063 Iustin Pop
451 5a3103e9 Michael Hanselmann
452 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported
if __name__ == "__main__":
  main()