Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 2c404217

History | View | Annotate | Download (13.6 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 fc428e32 Michael Hanselmann
# Copyright (C) 2006, 2007, 2008 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 a8083063 Iustin Pop
import sys
32 a8083063 Iustin Pop
import time
33 438b45d4 Michael Hanselmann
import logging
34 a8083063 Iustin Pop
from optparse import OptionParser
35 a8083063 Iustin Pop
36 a8083063 Iustin Pop
from ganeti import utils
37 a8083063 Iustin Pop
from ganeti import constants
38 67fe61c4 Michael Hanselmann
from ganeti import serializer
39 89e1fc26 Iustin Pop
from ganeti import errors
40 e125c67c Michael Hanselmann
from ganeti import opcodes
41 e125c67c Michael Hanselmann
from ganeti import cli
42 a8083063 Iustin Pop
43 a8083063 Iustin Pop
44 5a3103e9 Michael Hanselmann
# Upper bound for the automatic restart attempts made per instance
MAXTRIES = 5

# Instance states: BAD_STATES trigger a restart attempt, for HELPLESS_STATES
# the restart bookkeeping is dropped without trying a restart
BAD_STATES = ["ERROR_down"]
HELPLESS_STATES = ["ERROR_nodedown", "ERROR_nodeoffline"]

NOTICE = "NOTICE"
ERROR = "ERROR"

# Keys used in the per-instance and per-node records of the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object, assigned by main()
client = None
56 e125c67c Michael Hanselmann
57 e125c67c Michael Hanselmann
58 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher was started on a node which is not the master.

  """
60 a8083063 Iustin Pop
61 a8083063 Iustin Pop
62 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Put a prefix in front of every line of a piece of text.

  @param s: the text whose lines should be prefixed
  @param prefix: the string to put in front of each line
  @return: the prefixed text, terminated by exactly one newline

  """
  lines = s.splitlines()
  # Joining with "newline + prefix" prefixes every line but the first one,
  # which gets its prefix explicitly
  separator = "\n" + prefix
  return prefix + separator.join(lines) + "\n"
70 a8083063 Iustin Pop
71 a8083063 Iustin Pop
72 5a3103e9 Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The parsed state is kept in C{self._data}, a dictionary with an
  C{"instance"} section (restart bookkeeping, keyed by instance name) and a
  C{"node"} section (last recorded boot ID, keyed by node name).

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    Empty, unparseable or structurally invalid content is replaced by an
    empty state.

    Raises exception on lock contention.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')

    utils.LockFile(self.statefile.fileno())

    try:
      state_data = self.statefile.read()
      if not state_data:
        self._data = {}
      else:
        self._data = serializer.Load(state_data)
        if not isinstance(self._data, dict):
          # Anything but a dictionary can't hold our sections; handle it
          # like any other unparseable content
          raise ValueError("State data is not a dictionary")
    except Exception:
      # Ignore errors while loading the file and treat it as empty.  The
      # exception is fetched through sys.exc_info() so that no
      # version-specific "except" syntax is needed.
      self._data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), sys.exc_info()[1])

    # Make sure both sections exist and are dictionaries, as all accessor
    # methods rely on that
    for section in ("instance", "node"):
      if not isinstance(self._data.get(section), dict):
        if section in self._data:
          logging.warning("Invalid '%s' section in state file, resetting it",
                          section)
        self._data[section] = {}

    # Serialized snapshot, used by Save() to detect modifications
    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Save state to file, keeping the state file locked and open.

    If the data didn't change since it was loaded, only the timestamps of
    the state file are updated.  Otherwise the data is written through
    L{utils.WriteFile} and C{self.statefile} is switched to the descriptor
    returned by it.  Use L{Close} to unlock and close the file.

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    old_statefile = self.statefile
    self.statefile = os.fdopen(fd, 'w+')
    # The new file is locked by now, hence the superseded file object can be
    # closed explicitly instead of waiting for it to be garbage-collected
    old_statefile.close()

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @param name: the node name
    @return: the recorded boot ID, or None if the node is unknown or has
        no boot ID recorded

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @param name: the node name
    @param bootid: the boot ID to record, must not be empty

    """
    assert bootid

    ndata = self._data["node"]
    ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    @type instance: L{Instance}
    @param instance: the instance to look up
    @return: the recorded number of attempts; 0 if the instance is unknown
        or its record carries no counter

    """
    idata = self._data["instance"]

    if instance.name in idata:
      # A record without counter is read as "no attempts", just like
      # RecordRestartAttempt does when incrementing
      return idata[instance.name].get(KEY_RESTART_COUNT, 0)

    return 0

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the attempt counter, creating
    the record for the instance if needed.

    @type instance: L{Instance}
    @param instance: the instance being restarted

    """
    idata = self._data["instance"]
    inst = idata.setdefault(instance.name, {})

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances).  Unknown instances are silently ignored.

    @type instance: L{Instance}
    @param instance: the instance to remove from books

    """
    idata = self._data["instance"]

    if instance.name in idata:
      del idata[instance.name]
206 a8083063 Iustin Pop
207 a8083063 Iustin Pop
208 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  @ivar name: the instance name
  @ivar state: the instance state, as passed to the constructor
  @ivar autostart: the autostart flag, as passed to the constructor

  """
  def __init__(self, name, state, autostart):
    """Store the instance attributes.

    """
    self.name, self.state, self.autostart = name, state, autostart

  def Restart(self):
    """Submit a (non-forced) startup opcode for this instance.

    """
    startup_op = opcodes.OpStartupInstance(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Submit a disk activation opcode for this instance.

    """
    activate_op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
230 a8083063 Iustin Pop
231 a8083063 Iustin Pop
232 6dfcc47b Iustin Pop
def GetClusterData():
  """Query the cluster for its instances and nodes.

  Both queries are submitted as a single job, which is archived after its
  results have been processed.

  @rtype: tuple
  @return: (instances, nodes, smap), where instances maps instance names
      to L{Instance} objects, nodes maps node names to (bootid, offline)
      tuples and smap maps secondary node names to the list of names of
      the instances using them

  """
  instance_fields = ["name", "status", "admin_state", "snodes"]
  node_fields = ["name", "bootid", "offline"]
  query_ops = [
    opcodes.OpQueryInstances(output_fields=instance_fields, names=[],
                             use_locking=True),
    opcodes.OpQueryNodes(output_fields=node_fields, names=[],
                         use_locking=True),
    ]

  job_id = client.SubmitJob(query_ops)
  job_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  # First opcode result: one row per instance
  instances = {}
  smap = {}
  for (name, status, autostart, snodes) in job_results[0]:
    # update the secondary node map
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart)

  # Second opcode result: one row per node
  nodes = {}
  for (name, bootid, offline) in job_results[1]:
    nodes[name] = (bootid, offline)

  client.ArchiveJob(job_id)

  return instances, nodes, smap
268 a8083063 Iustin Pop
269 a8083063 Iustin Pop
270 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self, opts, notepad):
    """Verify we run on the master node and fetch the cluster data.

    @param opts: the parsed command line options; L{Run} reads C{job_age}
    @type notepad: L{WatcherState}
    @param notepad: the watcher state to consult and update
    @raise NotMasterError: if the configured master node is not this host

    """
    self.notepad = notepad
    master = client.QueryConfigValues(["master_node"])[0]
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances, self.bootids, self.smap = GetClusterData()
    # Names of the instances successfully restarted during this run
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Watcher run sequence.

    Archives old jobs, restarts down instances, activates the disks of
    instances having rebooted secondary nodes and finally verifies disks.

    """
    notepad = self.notepad
    self.ArchiveJobs(self.opts.job_age)
    self.CheckInstances(notepad)
    self.CheckDisks(notepad)
    self.VerifyDisks()

  def ArchiveJobs(self, age):
    """Archive old jobs.

    @param age: the age limit, passed on to the client's AutoArchiveJobs

    """
    arch_count, left_count = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", arch_count, left_count)

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    A node is taken as restarted if its current boot ID differs from the
    one recorded in the notepad.  For such nodes the disks of all autostart
    instances using them as secondary node are activated, unless the
    instance was already started in this run; then the new boot IDs are
    recorded.

    @type notepad: L{WatcherState}
    @param notepad: the watcher state holding the recorded boot IDs

    """
    check_nodes = []
    for name, (new_id, offline) in self.bootids.items():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
        continue
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if not check_nodes:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for node in check_nodes:
      for instance_name in self.smap.get(node, []):
        instance = self.instances[instance_name]
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception:
          # Best effort: a failure for one instance must not prevent
          # handling the remaining ones
          logging.exception("Error while activating disks for instance %s",
                            instance.name)

    # Keep changed boot IDs
    for name in check_nodes:
      notepad.SetNodeBootID(name, self.bootids[name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    @type notepad: L{WatcherState}
    @param notepad: the watcher state holding the restart bookkeeping

    """
    for instance in self.instances.values():
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # We gave up on this instance in an earlier run; stay quiet.
          continue

        if n == MAXTRIES:
          # Record one more attempt, so that later runs take the quiet
          # branch above instead of logging this error again
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue

        try:
          logging.info("Restarting %s%s",
                       instance.name, " (Attempt #%d)" % (n + 1))
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception:
          logging.exception("Error while restarting instance %s",
                            instance.name)

        # The attempt is counted no matter whether it succeeded
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    Submits a verify-disks job and, if it reports instances with offline
    disks, a second job activating the disks of all these instances.

    """
    op = opcodes.OpVerifyDisks()
    job_id = client.SubmitJob([op])
    result = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(job_id)
    # The third element is read below, hence shorter results are invalid too
    if not isinstance(result, (tuple, list)) or len(result) < 3:
      logging.error("Can't get a valid result from verify-disks")
      return
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  ", ".join(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = [opcodes.OpActivateInstanceDisks(instance_name=name)
           for name in offline_disk_instances]
    job_id = cli.SendJob(job, cl=client)

    cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
409 a8083063 Iustin Pop
410 a8083063 Iustin Pop
411 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The value of the job age option is passed through L{cli.ParseTimespec}
  before the options are returned.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  opt_parser = OptionParser(description="Ganeti cluster watcher",
                            usage="%prog [-d]",
                            version=version_string)

  opt_parser.add_option("-d", "--debug", dest="debug",
                        action="store_true", default=False,
                        help="Write all messages to stderr")
  opt_parser.add_option("-A", "--job-age", dest="job_age",
                        default=6*3600,
                        help=("Autoarchive jobs older than this age (default"
                              " 6 hours)"))

  (options, args) = opt_parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)
  return (options, args)
431 a8083063 Iustin Pop
432 a8083063 Iustin Pop
433 a8083063 Iustin Pop
def main():
  """Main function.

  Sets up logging, loads the watcher state, obtains a client and runs the
  watcher.  Once the state has been loaded it is saved again on every
  exit path.  The process is terminated with:
    - C{constants.EXIT_SUCCESS} if L{cli.GetClient} raises
      C{errors.OpPrereqError} or creating the L{Watcher} raises
      C{errors.ConfigurationError}
    - C{constants.EXIT_NOTMASTER} on L{NotMasterError}
    - C{constants.EXIT_NODESETUP_ERROR} on C{errors.ResolverError}
    - C{constants.EXIT_FAILURE} on any other exception

  """
  global client

  (opts, _) = ParseOptions()

  utils.SetupLogging(constants.LOG_WATCHER, debug=opts.debug,
                     stderr_logging=opts.debug)

  try:
    state = WatcherState()
    try:
      try:
        client = cli.GetClient()
      except errors.OpPrereqError:
        # this is, from cli.GetClient, a not-master case
        sys.exit(constants.EXIT_SUCCESS)

      try:
        watcher = Watcher(opts, state)
      except errors.ConfigurationError:
        # Just exit if there's no configuration
        sys.exit(constants.EXIT_SUCCESS)

      watcher.Run()
    finally:
      # Runs for the sys.exit() calls above as well
      state.Save()
  except SystemExit:
    # Exit requests must not be turned into failures by the handlers below
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    failure = sys.exc_info()[1]
    logging.error("Cannot resolve hostname '%s', exiting.", failure.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Exception:
    failure = sys.exc_info()[1]
    logging.error(str(failure), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)


if __name__ == '__main__':
  main()