Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / __init__.py @ d7c42723

History | View | Annotate | Download (16.6 kB)

1 9f4bb951 Michael Hanselmann
#
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 f2af0bec Iustin Pop
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop

24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop

28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 cfcc79c6 Michael Hanselmann
import os.path
32 a8083063 Iustin Pop
import sys
33 a8083063 Iustin Pop
import time
34 438b45d4 Michael Hanselmann
import logging
35 a8083063 Iustin Pop
from optparse import OptionParser
36 a8083063 Iustin Pop
37 a8083063 Iustin Pop
from ganeti import utils
38 a8083063 Iustin Pop
from ganeti import constants
39 83e5e26f René Nussbaumer
from ganeti import compat
40 89e1fc26 Iustin Pop
from ganeti import errors
41 e125c67c Michael Hanselmann
from ganeti import opcodes
42 e125c67c Michael Hanselmann
from ganeti import cli
43 7dfb83c2 Iustin Pop
from ganeti import luxi
44 db147305 Tom Limoncelli
from ganeti import rapi
45 a744b676 Manuel Franceschini
from ganeti import netutils
46 a8083063 Iustin Pop
47 db147305 Tom Limoncelli
import ganeti.rapi.client # pylint: disable-msg=W0611
48 adf6301e Michael Hanselmann
49 adf6301e Michael Hanselmann
from ganeti.watcher import nodemaint
50 adf6301e Michael Hanselmann
from ganeti.watcher import state
51 db147305 Tom Limoncelli
52 a8083063 Iustin Pop
53 5a3103e9 Michael Hanselmann
# Number of restart attempts made for an instance before the watcher gives up
MAXTRIES = 5


# Global LUXI client object; stays None until Main() stores a client in it
client = None

# Instance states in which the watcher tries to restart the instance
BAD_STATES = frozenset([constants.INSTST_ERRORDOWN])

# Instance states the watcher cannot do anything about; recorded restart
# attempts of such instances are dropped and their disks are not activated
HELPLESS_STATES = frozenset([
  constants.INSTST_NODEDOWN,
  constants.INSTST_NODEOFFLINE,
  ])

# NOTE(review): these two labels are not referenced in the visible part of
# this module -- confirm whether anything still uses them
NOTICE = "NOTICE"
ERROR = "ERROR"
67 e125c67c Michael Hanselmann
68 e125c67c Michael Hanselmann
69 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher is not running on the master node.

  Raised while setting up L{Watcher} when the configured master node differs
  from the name of the local host.

  """
71 a8083063 Iustin Pop
72 a8083063 Iustin Pop
73 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher has been asked to pause.

  @rtype: bool
  @return: the truth value of whatever L{utils.ReadWatcherPauseFile} returns
      for the watcher pause file

  """
  pause_info = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  return bool(pause_info)
78 3753b2cb Michael Hanselmann
79 3753b2cb Michael Hanselmann
80 f1115454 Guido Trotter
def StartNodeDaemons():
  """Make sure the daemons needed on every node are running.

  """
  # The node daemon is wanted whether or not this is the master. confd is
  # started as well; on non candidates it will be in disabled mode.
  for daemon_name in (constants.NODED, constants.CONFD):
    utils.EnsureDaemon(daemon_name)
88 f1115454 Guido Trotter
89 f1115454 Guido Trotter
90 9e289e36 Guido Trotter
def RunWatcherHooks():
  """Run the watcher hooks.

  Hands the watcher hooks directory to L{utils.RunParts} and logs the result
  reported for every entry. A missing hooks directory or a failure of
  RunParts itself is not fatal; in both cases the function just returns.

  @raise errors.ProgrammerError: if RunParts reports a status which is none
      of the C{constants.RUNPARTS_*} values handled here

  """
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
                             constants.HOOKS_NAME_WATCHER)
  if not os.path.isdir(hooks_dir):
    return

  try:
    results = utils.RunParts(hooks_dir)
  except Exception as err: # pylint: disable-msg=W0703
    # Every placeholder needs an argument, otherwise the logging module
    # reports a formatting error instead of emitting this message
    logging.exception("RunParts %s failed: %s", hooks_dir, err)
    return

  for (relname, status, runresult) in results:
    if status == constants.RUNPARTS_SKIP:
      logging.debug("Watcher hook %s: skipped", relname)
    elif status == constants.RUNPARTS_ERR:
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
    elif status == constants.RUNPARTS_RUN:
      if runresult.failed:
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
                        relname, runresult.exit_code, runresult.output)
      else:
        logging.debug("Watcher hook %s: success (output: %s)", relname,
                      runresult.output)
    else:
      # Exceptions do not interpolate their arguments, hence format here
      raise errors.ProgrammerError("Unknown status %s returned by RunParts" %
                                   status)
120 9e289e36 Guido Trotter
121 001b3825 Michael Hanselmann
122 a8083063 Iustin Pop
class Instance(object):
  """Lightweight view of one virtual machine instance.

  Holds the values the watcher queried for the instance and knows how to
  submit the opcodes for starting it and for activating its disks.

  """
  def __init__(self, name, status, autostart, snodes):
    """Store the queried instance data.

    @param name: instance name
    @param status: instance status as returned by the instance query
    @param autostart: value deciding whether the watcher may activate the
        disks of this instance
    @param snodes: names of the secondary nodes of the instance

    """
    self.name, self.status = name, status
    self.autostart, self.snodes = autostart, snodes

  def Restart(self):
    """Submit the opcode starting this instance (without force).

    """
    startup_op = opcodes.OpInstanceStartup(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Submit the opcode activating all disks of this instance.

    """
    activate_op = opcodes.OpInstanceActivateDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
145 a8083063 Iustin Pop
146 a8083063 Iustin Pop
147 6dfcc47b Iustin Pop
def GetClusterData():
  """Query instances and nodes and build the watcher's lookup tables.

  Both queries are submitted as a single job using the global LUXI client.
  As a side effect the instance status file is rewritten with one
  "name status" line per instance, and the job is archived once its results
  have been processed.

  @rtype: tuple
  @return: (instances, nodes, smap), where C{instances} maps instance names
      to L{Instance} objects, C{nodes} maps node names to
      C{(bootid, offline)} tuples and C{smap} maps a node name to the names
      of the instances having that node as secondary

  """
  instance_fields = ["name", "status", "admin_state", "snodes"]
  node_fields = ["name", "bootid", "offline"]

  instance_query = opcodes.OpInstanceQuery(output_fields=instance_fields,
                                           names=[], use_locking=True)
  node_query = opcodes.OpNodeQuery(output_fields=node_fields, names=[],
                                   use_locking=True)

  job_id = client.SubmitJob([instance_query, node_query])
  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  instance_rows = all_results[0]

  # write the instance status file
  status_lines = ["%s %s\n" % (row[0], row[1]) for row in instance_rows]
  utils.WriteFile(file_name=constants.INSTANCE_STATUS_FILE,
                  data="".join(status_lines))

  instances = {}
  smap = {}
  # The "admin_state" field is what Instance keeps as its autostart flag
  for (name, status, autostart, snodes) in instance_rows:
    # update the secondary node map
    for snode in snodes:
      smap.setdefault(snode, []).append(name)

    instances[name] = Instance(name, status, autostart, snodes)

  nodes = dict((node_name, (bootid, offline))
               for (node_name, bootid, offline) in all_results[1])

  client.ArchiveJob(job_id)

  return (instances, nodes, smap)
190 a8083063 Iustin Pop
191 a8083063 Iustin Pop
192 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Restart logic for virtual machines that went down unexpectedly.

  Callers are expected to create a new object for every run and then invoke
  L{Run}. A run walks over all instances and tries to restart those found
  down; no more than MAXTRIES attempts are made per instance.

  """
  def __init__(self, opts, notepad):
    """Check for the master role, archive old jobs and load cluster data.

    @param opts: parsed command line options; C{job_age} is handed to
        L{ArchiveJobs}
    @param notepad: watcher state object in which the checks keep boot IDs
        and restart attempts
    @raise NotMasterError: if the configured master node is not this host

    """
    self.notepad = notepad

    master_node = client.QueryConfigValues(["master_node"])[0]
    this_host = netutils.Hostname.GetSysName()
    if master_node != this_host:
      raise NotMasterError("This is not the master node")

    # Old jobs are archived before any new job gets submitted
    self.ArchiveJobs(opts.job_age)
    (self.instances, self.bootids, self.smap) = GetClusterData()
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Perform all checks of one watcher run, in order.

    """
    self.CheckInstances(self.notepad)
    self.CheckDisks(self.notepad)
    self.VerifyDisks()

  @staticmethod
  def ArchiveJobs(age):
    """Ask the master to auto-archive jobs and log the outcome.

    @param age: age limit handed to the client's AutoArchiveJobs call

    """
    (archived, remaining) = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", archived, remaining)

  def CheckDisks(self, notepad):
    """Activate disks of instances whose secondary node changed boot ID.

    Nodes reporting no boot ID are skipped. For every other node whose boot
    ID differs from the one remembered in the notepad, the disks of the
    instances using it as secondary are activated; afterwards the new boot
    ID is remembered.

    @param notepad: watcher state object holding the known boot IDs

    """
    rebooted_nodes = []
    for (node_name, (boot_id, offline)) in self.bootids.items():
      known_id = notepad.GetNodeBootID(node_name)
      if boot_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        node_name)
      elif known_id != boot_id:
        # Node's boot ID has changed, probably through a reboot.
        rebooted_nodes.append(node_name)

    if not rebooted_nodes:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for node_name in rebooted_nodes:
      for instance_name in self.smap.get(node_name, []):
        instance = self.instances[instance_name]
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
        elif instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          logging.debug("Skipping disk activation for instance %s, as"
                        " it was already started", instance.name)
        else:
          try:
            logging.info("Activating disks for instance %s", instance.name)
            instance.ActivateDisks()
          except Exception: # pylint: disable-msg=W0703
            logging.exception("Error while activating disks for instance %s",
                              instance.name)

    # Keep changed boot IDs
    for node_name in rebooted_nodes:
      notepad.SetNodeBootID(node_name, self.bootids[node_name][0])

  def CheckInstances(self, notepad):
    """Walk over all instances and try to restart those which are down.

    Instances in one of the BAD_STATES are restarted as long as fewer than
    MAXTRIES attempts have been recorded. For all other instances any
    recorded attempts are removed from the notepad.

    @param notepad: watcher state object counting the restart attempts

    """
    notepad.MaintainInstanceList(self.instances.keys())

    for instance in self.instances.values():
      if instance.status not in BAD_STATES:
        if notepad.NumberOfRestartAttempts(instance.name):
          notepad.RemoveInstance(instance.name)
          # Only an instance outside the helpless states counts as restarted
          if instance.status not in HELPLESS_STATES:
            logging.info("Restart of %s succeeded", instance.name)
        continue

      attempts = notepad.NumberOfRestartAttempts(instance.name)

      if attempts > MAXTRIES:
        logging.warning("Not restarting instance %s, retries exhausted",
                        instance.name)
        continue

      if attempts >= MAXTRIES:
        # Limit just reached; one more recorded attempt makes later runs
        # take the quieter branch above
        notepad.RecordRestartAttempt(instance.name)
        logging.error("Could not restart %s after %d attempts, giving up",
                      instance.name, MAXTRIES)
        continue

      attempt_note = " (Attempt #%d)" % (attempts + 1)
      try:
        logging.info("Restarting %s%s", instance.name, attempt_note)
        instance.Restart()
        self.started_instances.add(instance.name)
      except Exception: # pylint: disable-msg=W0703
        logging.exception("Error while restarting instance %s",
                          instance.name)

      # The attempt is counted whether or not the restart went through
      notepad.RecordRestartAttempt(instance.name)

  def _CheckForOfflineNodes(self, instance):
    """Check whether an instance has a secondary node marked offline.

    @param instance: The instance object
    @return: True if any of the secondary nodes is offline, False otherwise

    """
    node_data = [self.bootids[node] for node in instance.snodes]
    return compat.any(offline for (_, offline) in node_data)

  def VerifyDisks(self):
    """Run gnt-cluster verify-disks and activate the disks it reports.

    A verify-disks opcode is submitted first; the jobs listed in its result
    are then waited for and, if they had been submitted successfully,
    archived. Disks are activated in a single job for all reported
    instances, except those in a helpless state or with an offline
    secondary node.

    """
    verify_job = client.SubmitJob([opcodes.OpClusterVerifyDisks()])
    verify_result = cli.PollJob(verify_job, cl=client,
                                feedback_fn=logging.debug)[0]
    client.ArchiveJob(verify_job)

    # Keep track of submitted jobs
    jex = cli.JobExecutor(cl=client, feedback_fn=logging.debug)

    jobs_to_archive = set()
    for (submitted, child_id) in verify_result[constants.JOB_IDS_KEY]:
      jex.AddJobId(None, submitted, child_id)
      if submitted:
        jobs_to_archive.add(child_id)

    offline_disk_instances = set()
    for (success, job_result) in jex.GetResults():
      if success:
        # NOTE(review): the middle element is taken to be the names of the
        # instances with offline disks -- confirm against the opcode result
        ((_, reported, _), ) = job_result
        offline_disk_instances.update(reported)
      else:
        logging.error("Verify-disks job failed: %s", job_result)

    for child_id in jobs_to_archive:
      client.ArchiveJob(child_id)

    if not offline_disk_instances:
      # nothing to do
      logging.debug("verify-disks reported no offline disks, nothing to do")
      return

    logging.debug("Will activate disks for instance(s) %s",
                  utils.CommaJoin(offline_disk_instances))

    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    activate_ops = []
    for name in offline_disk_instances:
      instance = self.instances[name]
      if (instance.status in HELPLESS_STATES or
          self._CheckForOfflineNodes(instance)):
        logging.info("Skip instance %s because it is in helpless state or has"
                     " one offline secondary", name)
      else:
        activate_ops.append(
          opcodes.OpInstanceActivateDisks(instance_name=name))

    if not activate_ops:
      return

    activate_job = cli.SendJob(activate_ops, cl=client)

    try:
      cli.PollJob(activate_job, cl=client, feedback_fn=logging.debug)
    except Exception: # pylint: disable-msg=W0703
      logging.exception("Error while activating disks")
384 a8083063 Iustin Pop
385 a8083063 Iustin Pop
386 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
  """Check whether the RAPI daemon on a host answers properly.

  A RAPI client is pointed at the given host and asked for the version,
  which is currently the only test performed. Certificate and API errors
  are logged and reported as "not responding".

  @type hostname: string
  @param hostname: hostname of the node to connect to.
  @rtype: bool
  @return: True if the reported version equals C{constants.RAPI_VERSION},
      False otherwise or if the request failed

  """
  rapi_client = \
    rapi.client.GanetiRapiClient(hostname,
                                 curl_config_fn=rapi.client.GenericCurlConfig())

  try:
    reported_version = rapi_client.GetVersion()
  except rapi.client.CertificateError as err:
    logging.warning("RAPI certificate error: %s", err)
    return False
  except rapi.client.GanetiApiError as err:
    logging.warning("RAPI error: %s", err)
    return False

  logging.debug("Reported RAPI version %s", reported_version)
  return reported_version == constants.RAPI_VERSION
412 db147305 Tom Limoncelli
413 db147305 Tom Limoncelli
414 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  Positional arguments are rejected through the parser's error handling.

  @return: (options, args) as from OptionParser.parse_args(), with
      C{options.job_age} already run through L{cli.ParseTimespec}

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option(cli.DEBUG_OPT)
  parser.add_option("-A", "--job-age", dest="job_age", default=6 * 3600,
                    help=("Autoarchive jobs older than this age (default"
                          " 6 hours)"))
  parser.add_option("--ignore-pause", dest="ignore_pause", default=False,
                    action="store_true", help="Ignore cluster pause setting")

  (options, args) = parser.parse_args()
  # The age is converted before the positional arguments are looked at
  options.job_age = cli.ParseTimespec(options.job_age)

  if args:
    parser.error("No arguments expected")

  return (options, args)
438 a8083063 Iustin Pop
439 a8083063 Iustin Pop
440 2a7c3583 Michael Hanselmann
@rapi.client.UsesRapiClient
def Main():
  """Main function.

  Parses the options, sets up logging and, unless the watcher is paused,
  makes sure the node daemons run, executes the watcher hooks and the node
  maintenance. Once a LUXI client could be obtained it additionally checks
  the RAPI daemon (restarting it once if it does not answer) and runs the
  L{Watcher}.

  The watcher state file is only saved if the run completed or stopped for a
  harmless reason (not on master, no configuration).

  @rtype: int
  @return: one of the C{constants.EXIT_*} values; note that a full or
      drained job queue is only logged and still yields
      C{constants.EXIT_SUCCESS}

  """
  global client # pylint: disable-msg=W0603

  (options, _) = ParseOptions()

  utils.SetupLogging(constants.LOG_WATCHER, sys.argv[0],
                     debug=options.debug, stderr_logging=options.debug)

  if ShouldPause() and not options.ignore_pause:
    logging.debug("Pause has been set, exiting")
    return constants.EXIT_SUCCESS

  statefile = \
    state.OpenStateFile(constants.WATCHER_STATEFILE)
  if not statefile:
    return constants.EXIT_FAILURE

  # Set to True on every path after which the state file should be saved
  update_file = False
  try:
    StartNodeDaemons()
    RunWatcherHooks()
    # run node maintenance in all cases, even if master, so that old
    # masters can be properly cleaned up too
    if nodemaint.NodeMaintenance.ShouldRun():
      nodemaint.NodeMaintenance().Exec()

    notepad = state.WatcherState(statefile)
    try:
      try:
        client = cli.GetClient()
      except errors.OpPrereqError:
        # this is, from cli.GetClient, a not-master case
        logging.debug("Not on master, exiting")
        update_file = True
        return constants.EXIT_SUCCESS
      except luxi.NoMasterError as err:
        logging.warning("Master seems to be down (%s), trying to restart",
                        str(err))
        if not utils.EnsureDaemon(constants.MASTERD):
          logging.critical("Can't start the master, exiting")
          return constants.EXIT_FAILURE
        # else retry the connection
        client = cli.GetClient()

      # we are on master now
      utils.EnsureDaemon(constants.RAPI)

      # If RAPI isn't responding to queries, try one restart.
      logging.debug("Attempting to talk with RAPI.")
      if IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
        logging.debug("Successfully talked to RAPI.")
      else:
        logging.warning("Couldn't get answer from Ganeti RAPI daemon."
                        " Restarting Ganeti RAPI.")
        utils.StopDaemon(constants.RAPI)
        utils.EnsureDaemon(constants.RAPI)
        logging.debug("Second attempt to talk with RAPI")
        # Success is only reported if RAPI really answered; a failure is
        # logged but does not stop the watcher run
        if IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
          logging.debug("Successfully talked to RAPI.")
        else:
          logging.fatal("RAPI is not responding. Please investigate.")

      try:
        watcher = Watcher(options, notepad)
      except errors.ConfigurationError:
        # Just exit if there's no configuration
        update_file = True
        return constants.EXIT_SUCCESS

      watcher.Run()
      update_file = True

    finally:
      if update_file:
        notepad.Save(constants.WATCHER_STATEFILE)
      else:
        logging.debug("Not updating status file due to failure")
  except SystemExit:
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    return constants.EXIT_NOTMASTER
  except errors.ResolverError as err:
    logging.error("Cannot resolve hostname '%s', exiting", err.args[0])
    return constants.EXIT_NODESETUP_ERROR
  except errors.JobQueueFull:
    # Logged only; execution continues to the final return below
    logging.error("Job queue is full, can't query cluster state")
  except errors.JobQueueDrainError:
    # Logged only; execution continues to the final return below
    logging.error("Job queue is drained, can't maintain cluster state")
  except Exception as err:
    logging.exception(str(err))
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS