Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / __init__.py @ adf6301e

History | View | Annotate | Download (16.4 kB)

1 9f4bb951 Michael Hanselmann
#
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 f2af0bec Iustin Pop
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop

24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop

28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 cfcc79c6 Michael Hanselmann
import os.path
32 a8083063 Iustin Pop
import sys
33 a8083063 Iustin Pop
import time
34 438b45d4 Michael Hanselmann
import logging
35 a8083063 Iustin Pop
from optparse import OptionParser
36 a8083063 Iustin Pop
37 a8083063 Iustin Pop
from ganeti import utils
38 a8083063 Iustin Pop
from ganeti import constants
39 83e5e26f René Nussbaumer
from ganeti import compat
40 89e1fc26 Iustin Pop
from ganeti import errors
41 e125c67c Michael Hanselmann
from ganeti import opcodes
42 e125c67c Michael Hanselmann
from ganeti import cli
43 7dfb83c2 Iustin Pop
from ganeti import luxi
44 db147305 Tom Limoncelli
from ganeti import rapi
45 a744b676 Manuel Franceschini
from ganeti import netutils
46 a8083063 Iustin Pop
47 db147305 Tom Limoncelli
import ganeti.rapi.client # pylint: disable-msg=W0611
48 adf6301e Michael Hanselmann
49 adf6301e Michael Hanselmann
from ganeti.watcher import nodemaint
50 adf6301e Michael Hanselmann
from ganeti.watcher import state
51 db147305 Tom Limoncelli
52 a8083063 Iustin Pop
53 5a3103e9 Michael Hanselmann
# Number of restart attempts after which Watcher.CheckInstances gives up on
# an instance
MAXTRIES = 5

# Instance states for which Watcher.CheckInstances attempts a restart
BAD_STATES = [constants.INSTST_ERRORDOWN]

# Instance states for which no restart is attempted and for which
# Watcher.VerifyDisks skips the disk activation
HELPLESS_STATES = [constants.INSTST_NODEDOWN, constants.INSTST_NODEOFFLINE]

NOTICE = 'NOTICE'
ERROR = 'ERROR'


# Global LUXI client object; it is assigned in Main() once the connection to
# the master daemon has been established
client = None
62 e125c67c Michael Hanselmann
63 e125c67c Michael Hanselmann
64 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher is not running on the cluster's master node.

  """
66 a8083063 Iustin Pop
67 a8083063 Iustin Pop
68 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher pause file requests a pause.

  @rtype: bool
  @return: truth value of what L{utils.ReadWatcherPauseFile} returns for the
      watcher pause file

  """
  pause_value = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  return bool(pause_value)
73 3753b2cb Michael Hanselmann
74 3753b2cb Michael Hanselmann
75 f1115454 Guido Trotter
def StartNodeDaemons():
  """Make sure the daemons expected on every node are running.

  """
  # The node daemon is started whether or not this is the master; confd is
  # started everywhere too (on non candidates it will be in disabled mode)
  for daemon_name in (constants.NODED, constants.CONFD):
    utils.EnsureDaemon(daemon_name)
83 f1115454 Guido Trotter
84 f1115454 Guido Trotter
85 9e289e36 Guido Trotter
def RunWatcherHooks():
  """Run the watcher hooks.

  Runs everything found in the watcher hooks directory through
  L{utils.RunParts} and logs the outcome of every hook. Nothing is done if
  the directory does not exist; a failure to run the hooks is logged and
  never propagated to the caller.

  """
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
                             constants.HOOKS_NAME_WATCHER)
  if not os.path.isdir(hooks_dir):
    return

  try:
    results = utils.RunParts(hooks_dir)
  except Exception: # pylint: disable-msg=W0703
    # logging.exception adds the traceback on its own, hence the message
    # needs exactly one placeholder; a second "%s" without a matching
    # argument makes the logging module fail to format the record
    logging.exception("RunParts %s failed", hooks_dir)
    return

  for (relname, status, runresult) in results:
    if status == constants.RUNPARTS_SKIP:
      logging.debug("Watcher hook %s: skipped", relname)
    elif status == constants.RUNPARTS_ERR:
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
    elif status == constants.RUNPARTS_RUN:
      if runresult.failed:
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
                        relname, runresult.exit_code, runresult.output)
      else:
        logging.debug("Watcher hook %s: success (output: %s)", relname,
                      runresult.output)
112 9e289e36 Guido Trotter
113 001b3825 Michael Hanselmann
114 a8083063 Iustin Pop
class Instance(object):
  """Record describing one virtual machine instance.

  Besides holding the queried attributes, it can submit the opcodes that
  start the instance or activate its disks through the global LUXI client.

  """
  def __init__(self, name, status, autostart, snodes):
    """Store the attributes of the instance.

    @param name: instance name, used when building opcodes
    @param status: instance status value
    @param autostart: stored as given; L{Watcher.CheckDisks} skips instances
        for which this is false
    @param snodes: names of the instance's secondary nodes

    """
    (self.name, self.status) = (name, status)
    (self.autostart, self.snodes) = (autostart, snodes)

  def Restart(self):
    """Submit an opcode starting the instance (without C{force}).

    """
    startup_op = opcodes.OpInstanceStartup(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=client)

  def ActivateDisks(self):
    """Submit an opcode activating all disks of the instance.

    """
    activate_op = opcodes.OpInstanceActivateDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=client)
137 a8083063 Iustin Pop
138 a8083063 Iustin Pop
139 6dfcc47b Iustin Pop
def GetClusterData():
  """Query the cluster for its instances and nodes.

  One job holding an instance query and a node query is submitted through
  the global LUXI client. As a side effect the instance status file is
  rewritten with one "name status" line per instance; the job is archived
  after its results have been processed.

  @rtype: tuple
  @return: (instances, nodes, smap), that is a dict mapping instance names
      to L{Instance} objects, a dict mapping node names to
      (bootid, offline) tuples and a dict mapping node names to the list of
      names of the instances having that node as a secondary

  """
  instance_fields = ["name", "status", "admin_state", "snodes"]
  instance_query = opcodes.OpInstanceQuery(output_fields=instance_fields,
                                           names=[], use_locking=True)
  node_fields = ["name", "bootid", "offline"]
  node_query = opcodes.OpNodeQuery(output_fields=node_fields, names=[],
                                   use_locking=True)

  job_id = client.SubmitJob([instance_query, node_query])

  job_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  instance_rows = job_results[0]
  smap = {}

  instances = {}

  # write the instance status file
  status_lines = ["%s %s\n" % (row[0], row[1]) for row in instance_rows]
  utils.WriteFile(file_name=constants.INSTANCE_STATUS_FILE,
                  data="".join(status_lines))

  for (name, status, autostart, snodes) in instance_rows:
    # update the secondary node map
    for node in snodes:
      smap.setdefault(node, []).append(name)

    instances[name] = Instance(name, status, autostart, snodes)

  nodes = {}
  for (node_name, bootid, offline) in job_results[1]:
    nodes[node_name] = (bootid, offline)

  client.ArchiveJob(job_id)

  return instances, nodes, smap
182 a8083063 Iustin Pop
183 a8083063 Iustin Pop
184 5a3103e9 Michael Hanselmann
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self, opts, notepad):
    """Check that we run on the master and load the cluster data.

    @param opts: parsed command line options; C{job_age} is used as the
        age for the job auto-archival
    @param notepad: watcher state object, handed to the checks in L{Run}
    @raise NotMasterError: if the configured master node is not this host

    """
    self.notepad = notepad
    master = client.QueryConfigValues(["master_node"])[0]
    if master != netutils.Hostname.GetSysName():
      raise NotMasterError("This is not the master node")
    # first archive old jobs
    self.ArchiveJobs(opts.job_age)
    # and only then submit new ones
    self.instances, self.bootids, self.smap = GetClusterData()
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Watcher run sequence.

    Instances are checked first, so that L{CheckDisks} can skip the
    instances which were restarted during this run.

    """
    notepad = self.notepad
    self.CheckInstances(notepad)
    self.CheckDisks(notepad)
    self.VerifyDisks()

  @staticmethod
  def ArchiveJobs(age):
    """Archive old jobs.

    @param age: age passed to the LUXI client's C{AutoArchiveJobs}

    """
    arch_count, left_count = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", arch_count, left_count)

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the one stored in the
    notepad, the disks of the instances having it as a secondary node are
    activated; afterwards the new boot ID is stored.

    @param notepad: watcher state object providing C{GetNodeBootID} and
        C{SetNodeBootID}

    """
    check_nodes = []
    for name, (new_id, offline) in self.bootids.iteritems():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
        continue
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for node in check_nodes:
        if node not in self.smap:
          continue
        for instance_name in self.smap[node]:
          instance = self.instances[instance_name]
          if not instance.autostart:
            logging.info(("Skipping disk activation for non-autostart"
                          " instance %s"), instance.name)
            continue
          if instance.name in self.started_instances:
            # we already tried to start the instance, which should have
            # activated its drives (if they can be at all)
            logging.debug("Skipping disk activation for instance %s, as"
                          " it was already started", instance.name)
            continue
          try:
            logging.info("Activating disks for instance %s", instance.name)
            instance.ActivateDisks()
          except Exception: # pylint: disable-msg=W0703
            logging.exception("Error while activating disks for instance %s",
                              instance.name)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Instances in one of the BAD_STATES are restarted as long as fewer than
    MAXTRIES attempts are recorded for them; for all other instances a
    recorded restart counter is removed.

    @param notepad: watcher state object keeping the restart counters

    """
    notepad.MaintainInstanceList(self.instances.keys())

    for instance in self.instances.values():
      if instance.status in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          logging.warning("Not restarting instance %s, retries exhausted",
                          instance.name)
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          # n == MAXTRIES: record one more attempt (presumably pushing the
          # counter past MAXTRIES, so that later runs only emit the
          # warning above) and give up
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s", instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception: # pylint: disable-msg=W0703
          logging.exception("Error while restarting instance %s",
                            instance.name)

        # the attempt is recorded whether or not the restart worked
        notepad.RecordRestartAttempt(instance)
      elif instance.status in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  def _CheckForOfflineNodes(self, instance):
    """Checks if given instances has any secondary in offline status.

    @param instance: The instance object
    @return: True if any of the secondary is offline, False otherwise

    """
    bootids = []
    for node in instance.snodes:
      bootids.append(self.bootids[node])

    return compat.any(offline for (_, offline) in bootids)

  def VerifyDisks(self):
    """Run gnt-cluster verify-disks.

    Submits a verify-disks opcode, waits for the jobs it reports, collects
    the instances reported as having offline disks and submits a single job
    activating the disks of those instances. Instances which are in a
    helpless state, have an offline secondary node or are not known to this
    watcher run are skipped.

    """
    job_id = client.SubmitJob([opcodes.OpClusterVerifyDisks()])
    result = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(job_id)

    # Keep track of submitted jobs
    jex = cli.JobExecutor(cl=client, feedback_fn=logging.debug)

    archive_jobs = set()
    for (status, sub_job_id) in result[constants.JOB_IDS_KEY]:
      jex.AddJobId(None, status, sub_job_id)
      if status:
        archive_jobs.add(sub_job_id)

    offline_disk_instances = set()

    for (status, job_result) in jex.GetResults():
      if not status:
        logging.error("Verify-disks job failed: %s", job_result)
        continue

      ((_, instances, _), ) = job_result

      offline_disk_instances.update(instances)

    for sub_job_id in archive_jobs:
      client.ArchiveJob(sub_job_id)

    if not offline_disk_instances:
      # nothing to do
      logging.debug("verify-disks reported no offline disks, nothing to do")
      return

    logging.debug("Will activate disks for instance(s) %s",
                  utils.CommaJoin(offline_disk_instances))

    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = []
    for name in offline_disk_instances:
      try:
        instance = self.instances[name]
      except KeyError:
        # The instance list was fetched when this object was created, i.e.
        # before verify-disks ran, so verify-disks can report an instance
        # which is not in it; skip it instead of aborting the whole run
        logging.info("Can't find instance %s, skipping disk activation",
                     name)
        continue

      if (instance.status in HELPLESS_STATES or
          self._CheckForOfflineNodes(instance)):
        logging.info("Skip instance %s because it is in helpless state or has"
                     " one offline secondary", name)
        continue
      job.append(opcodes.OpInstanceActivateDisks(instance_name=name))

    if job:
      job_id = cli.SendJob(job, cl=client)

      try:
        cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
      except Exception: # pylint: disable-msg=W0703
        logging.exception("Error while activating disks")
376 a8083063 Iustin Pop
377 a8083063 Iustin Pop
378 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
379 db147305 Tom Limoncelli
  """Connects to RAPI port and does a simple test.
380 db147305 Tom Limoncelli

381 db147305 Tom Limoncelli
  Connects to RAPI port of hostname and does a simple test. At this time, the
382 db147305 Tom Limoncelli
  test is GetVersion.
383 db147305 Tom Limoncelli

384 db147305 Tom Limoncelli
  @type hostname: string
385 db147305 Tom Limoncelli
  @param hostname: hostname of the node to connect to.
386 db147305 Tom Limoncelli
  @rtype: bool
387 db147305 Tom Limoncelli
  @return: Whether RAPI is working properly
388 db147305 Tom Limoncelli

389 db147305 Tom Limoncelli
  """
390 34f06005 Iustin Pop
  curl_config = rapi.client.GenericCurlConfig()
391 2a7c3583 Michael Hanselmann
  rapi_client = rapi.client.GanetiRapiClient(hostname,
392 2a7c3583 Michael Hanselmann
                                             curl_config_fn=curl_config)
393 db147305 Tom Limoncelli
  try:
394 db147305 Tom Limoncelli
    master_version = rapi_client.GetVersion()
395 db147305 Tom Limoncelli
  except rapi.client.CertificateError, err:
396 db147305 Tom Limoncelli
    logging.warning("RAPI Error: CertificateError (%s)", err)
397 db147305 Tom Limoncelli
    return False
398 db147305 Tom Limoncelli
  except rapi.client.GanetiApiError, err:
399 db147305 Tom Limoncelli
    logging.warning("RAPI Error: GanetiApiError (%s)", err)
400 db147305 Tom Limoncelli
    return False
401 db147305 Tom Limoncelli
  logging.debug("RAPI Result: master_version is %s", master_version)
402 db147305 Tom Limoncelli
  return master_version == constants.RAPI_VERSION
403 db147305 Tom Limoncelli
404 db147305 Tom Limoncelli
405 a8083063 Iustin Pop
def ParseOptions():
  """Parse and validate the watcher's command line.

  The value of the C{-A}/C{--job-age} option (or its default) is converted
  using L{cli.ParseTimespec}. Positional arguments are not accepted and are
  reported through the parser's C{error} method.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option(cli.DEBUG_OPT)
  parser.add_option("-A", "--job-age", dest="job_age", default=6 * 3600,
                    help="Autoarchive jobs older than this age (default"
                    " 6 hours)")
  parser.add_option("--ignore-pause", dest="ignore_pause", default=False,
                    action="store_true", help="Ignore cluster pause setting")

  (options, args) = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)

  if args:
    parser.error("No arguments expected")

  return (options, args)
429 a8083063 Iustin Pop
430 a8083063 Iustin Pop
431 2a7c3583 Michael Hanselmann
@rapi.client.UsesRapiClient
432 9f4bb951 Michael Hanselmann
def Main():
433 a8083063 Iustin Pop
  """Main function.
434 a8083063 Iustin Pop

435 a8083063 Iustin Pop
  """
436 7260cfbe Iustin Pop
  global client # pylint: disable-msg=W0603
437 e125c67c Michael Hanselmann
438 f0a80b01 Michael Hanselmann
  (options, _) = ParseOptions()
439 a8083063 Iustin Pop
440 cfcc79c6 Michael Hanselmann
  utils.SetupLogging(constants.LOG_WATCHER, sys.argv[0],
441 cfcc79c6 Michael Hanselmann
                     debug=options.debug, stderr_logging=options.debug)
442 a8083063 Iustin Pop
443 46c8a6ab Iustin Pop
  if ShouldPause() and not options.ignore_pause:
444 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
445 9f4bb951 Michael Hanselmann
    return constants.EXIT_SUCCESS
446 3753b2cb Michael Hanselmann
447 adf6301e Michael Hanselmann
  statefile = \
448 adf6301e Michael Hanselmann
    state.OpenStateFile(constants.WATCHER_STATEFILE)
449 001b3825 Michael Hanselmann
  if not statefile:
450 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
451 001b3825 Michael Hanselmann
452 24edc6d4 Iustin Pop
  update_file = False
453 a8083063 Iustin Pop
  try:
454 f1115454 Guido Trotter
    StartNodeDaemons()
455 9e289e36 Guido Trotter
    RunWatcherHooks()
456 50273051 Iustin Pop
    # run node maintenance in all cases, even if master, so that old
457 50273051 Iustin Pop
    # masters can be properly cleaned up too
458 adf6301e Michael Hanselmann
    if nodemaint.NodeMaintenance.ShouldRun():
459 adf6301e Michael Hanselmann
      nodemaint.NodeMaintenance().Exec()
460 c4f0219c Iustin Pop
461 adf6301e Michael Hanselmann
    notepad = state.WatcherState(statefile)
462 781b2b2b Michael Hanselmann
    try:
463 2c404217 Iustin Pop
      try:
464 2c404217 Iustin Pop
        client = cli.GetClient()
465 2c404217 Iustin Pop
      except errors.OpPrereqError:
466 2c404217 Iustin Pop
        # this is, from cli.GetClient, a not-master case
467 7dfb83c2 Iustin Pop
        logging.debug("Not on master, exiting")
468 24edc6d4 Iustin Pop
        update_file = True
469 9f4bb951 Michael Hanselmann
        return constants.EXIT_SUCCESS
470 7dfb83c2 Iustin Pop
      except luxi.NoMasterError, err:
471 7dfb83c2 Iustin Pop
        logging.warning("Master seems to be down (%s), trying to restart",
472 7dfb83c2 Iustin Pop
                        str(err))
473 2826b361 Guido Trotter
        if not utils.EnsureDaemon(constants.MASTERD):
474 7dfb83c2 Iustin Pop
          logging.critical("Can't start the master, exiting")
475 9f4bb951 Michael Hanselmann
          return constants.EXIT_FAILURE
476 7dfb83c2 Iustin Pop
        # else retry the connection
477 7dfb83c2 Iustin Pop
        client = cli.GetClient()
478 cc962d58 Iustin Pop
479 83052f9e Guido Trotter
      # we are on master now
480 2826b361 Guido Trotter
      utils.EnsureDaemon(constants.RAPI)
481 c4f0219c Iustin Pop
482 db147305 Tom Limoncelli
      # If RAPI isn't responding to queries, try one restart.
483 db147305 Tom Limoncelli
      logging.debug("Attempting to talk with RAPI.")
484 9769bb78 Manuel Franceschini
      if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
485 db147305 Tom Limoncelli
        logging.warning("Couldn't get answer from Ganeti RAPI daemon."
486 db147305 Tom Limoncelli
                        " Restarting Ganeti RAPI.")
487 db147305 Tom Limoncelli
        utils.StopDaemon(constants.RAPI)
488 db147305 Tom Limoncelli
        utils.EnsureDaemon(constants.RAPI)
489 db147305 Tom Limoncelli
        logging.debug("Second attempt to talk with RAPI")
490 9769bb78 Manuel Franceschini
        if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
491 db147305 Tom Limoncelli
          logging.fatal("RAPI is not responding. Please investigate.")
492 db147305 Tom Limoncelli
      logging.debug("Successfully talked to RAPI.")
493 db147305 Tom Limoncelli
494 cc962d58 Iustin Pop
      try:
495 cc962d58 Iustin Pop
        watcher = Watcher(options, notepad)
496 cc962d58 Iustin Pop
      except errors.ConfigurationError:
497 cc962d58 Iustin Pop
        # Just exit if there's no configuration
498 24edc6d4 Iustin Pop
        update_file = True
499 9f4bb951 Michael Hanselmann
        return constants.EXIT_SUCCESS
500 e125c67c Michael Hanselmann
501 cc962d58 Iustin Pop
      watcher.Run()
502 24edc6d4 Iustin Pop
      update_file = True
503 24edc6d4 Iustin Pop
504 cc962d58 Iustin Pop
    finally:
505 7dfb83c2 Iustin Pop
      if update_file:
506 7dfb83c2 Iustin Pop
        notepad.Save()
507 7dfb83c2 Iustin Pop
      else:
508 7dfb83c2 Iustin Pop
        logging.debug("Not updating status file due to failure")
509 1b052f42 Michael Hanselmann
  except SystemExit:
510 1b052f42 Michael Hanselmann
    raise
511 38242904 Iustin Pop
  except NotMasterError:
512 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
513 9f4bb951 Michael Hanselmann
    return constants.EXIT_NOTMASTER
514 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
515 438b45d4 Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
516 9f4bb951 Michael Hanselmann
    return constants.EXIT_NODESETUP_ERROR
517 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
518 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
519 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
520 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
521 438b45d4 Michael Hanselmann
  except Exception, err:
522 001b3825 Michael Hanselmann
    logging.exception(str(err))
523 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
524 5a3103e9 Michael Hanselmann
525 9f4bb951 Michael Hanselmann
  return constants.EXIT_SUCCESS