Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / __init__.py @ bc57fa8d

History | View | Annotate | Download (25.1 kB)

1 9f4bb951 Michael Hanselmann
#
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 9598b71f Michele Tartara
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop

24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop

28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 cfcc79c6 Michael Hanselmann
import os.path
32 a8083063 Iustin Pop
import sys
33 a8083063 Iustin Pop
import time
34 438b45d4 Michael Hanselmann
import logging
35 16e0b9c9 Michael Hanselmann
import operator
36 9bb69bb5 Michael Hanselmann
import errno
37 a8083063 Iustin Pop
from optparse import OptionParser
38 a8083063 Iustin Pop
39 a8083063 Iustin Pop
from ganeti import utils
40 a8083063 Iustin Pop
from ganeti import constants
41 83e5e26f René Nussbaumer
from ganeti import compat
42 89e1fc26 Iustin Pop
from ganeti import errors
43 e125c67c Michael Hanselmann
from ganeti import opcodes
44 e125c67c Michael Hanselmann
from ganeti import cli
45 7dfb83c2 Iustin Pop
from ganeti import luxi
46 db147305 Tom Limoncelli
from ganeti import rapi
47 a744b676 Manuel Franceschini
from ganeti import netutils
48 16e0b9c9 Michael Hanselmann
from ganeti import qlang
49 16e0b9c9 Michael Hanselmann
from ganeti import objects
50 16e0b9c9 Michael Hanselmann
from ganeti import ssconf
51 16e0b9c9 Michael Hanselmann
from ganeti import ht
52 57fe4a5b Michael Hanselmann
from ganeti import pathutils
53 a8083063 Iustin Pop
54 b459a848 Andrea Spadaccini
import ganeti.rapi.client # pylint: disable=W0611
55 fc3f75dd Iustin Pop
from ganeti.rapi.client import UsesRapiClient
56 adf6301e Michael Hanselmann
57 adf6301e Michael Hanselmann
from ganeti.watcher import nodemaint
58 adf6301e Michael Hanselmann
from ganeti.watcher import state
59 db147305 Tom Limoncelli
60 a8083063 Iustin Pop
61 5a3103e9 Michael Hanselmann
#: Limit against which the recorded number of restart/cleanup attempts of an
#: instance is compared before giving up on it
MAXTRIES = 5

#: Instance states for which the watcher attempts a restart
BAD_STATES = compat.UniqueFrozenset([constants.INSTST_ERRORDOWN])

#: Instance states in which disk activation is skipped and for which no
#: restart success is reported
HELPLESS_STATES = compat.UniqueFrozenset([
  constants.INSTST_NODEDOWN,
  constants.INSTST_NODEOFFLINE,
  ])

NOTICE = "NOTICE"
ERROR = "ERROR"

#: Number of seconds to wait between starting child processes for node groups
CHILD_PROCESS_DELAY = 1.0

#: How many seconds to wait for instance status file lock
INSTANCE_STATUS_LOCK_TIMEOUT = 10.0
77 9bb69bb5 Michael Hanselmann
78 e125c67c Michael Hanselmann
79 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher is not running on the master node.

  """
81 a8083063 Iustin Pop
82 a8083063 Iustin Pop
83 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher should pause.

  @rtype: bool
  @return: Truth value of what L{utils.ReadWatcherPauseFile} returns for the
    watcher pause file

  """
  pause_value = utils.ReadWatcherPauseFile(pathutils.WATCHER_PAUSEFILE)
  return bool(pause_value)
88 3753b2cb Michael Hanselmann
89 3753b2cb Michael Hanselmann
90 f1115454 Guido Trotter
def StartNodeDaemons():
  """Make sure the daemons expected on every node are running.

  The node daemon is always ensured; confd and mond are only ensured when
  enabled through L{constants.ENABLE_CONFD} and L{constants.ENABLE_MOND}.

  """
  # on master or not, the node daemon is always wanted
  wanted = [constants.NODED]

  # confd is started as well; on non-candidates it will be in disabled mode
  if constants.ENABLE_CONFD:
    wanted.append(constants.CONFD)

  # mond is started as well: all nodes need monitoring
  if constants.ENABLE_MOND:
    wanted.append(constants.MOND)

  for daemon_name in wanted:
    utils.EnsureDaemon(daemon_name)
102 c300dbe4 Michele Tartara
103 f1115454 Guido Trotter
104 9e289e36 Guido Trotter
def RunWatcherHooks():
105 9e289e36 Guido Trotter
  """Run the watcher hooks.
106 9e289e36 Guido Trotter

107 9e289e36 Guido Trotter
  """
108 57fe4a5b Michael Hanselmann
  hooks_dir = utils.PathJoin(pathutils.HOOKS_BASE_DIR,
109 c4feafe8 Iustin Pop
                             constants.HOOKS_NAME_WATCHER)
110 10e689d4 Iustin Pop
  if not os.path.isdir(hooks_dir):
111 10e689d4 Iustin Pop
    return
112 9e289e36 Guido Trotter
113 9e289e36 Guido Trotter
  try:
114 9e289e36 Guido Trotter
    results = utils.RunParts(hooks_dir)
115 17385bd2 Andrea Spadaccini
  except Exception, err: # pylint: disable=W0703
116 17385bd2 Andrea Spadaccini
    logging.exception("RunParts %s failed: %s", hooks_dir, err)
117 a0aa6b49 Michael Hanselmann
    return
118 9e289e36 Guido Trotter
119 9e289e36 Guido Trotter
  for (relname, status, runresult) in results:
120 9e289e36 Guido Trotter
    if status == constants.RUNPARTS_SKIP:
121 9e289e36 Guido Trotter
      logging.debug("Watcher hook %s: skipped", relname)
122 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_ERR:
123 9e289e36 Guido Trotter
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
124 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_RUN:
125 9e289e36 Guido Trotter
      if runresult.failed:
126 9e289e36 Guido Trotter
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
127 9e289e36 Guido Trotter
                        relname, runresult.exit_code, runresult.output)
128 9e289e36 Guido Trotter
      else:
129 9e289e36 Guido Trotter
        logging.debug("Watcher hook %s: success (output: %s)", relname,
130 9e289e36 Guido Trotter
                      runresult.output)
131 013ce4ae Michael Hanselmann
    else:
132 013ce4ae Michael Hanselmann
      raise errors.ProgrammerError("Unknown status %s returned by RunParts",
133 013ce4ae Michael Hanselmann
                                   status)
134 9e289e36 Guido Trotter
135 001b3825 Michael Hanselmann
136 a8083063 Iustin Pop
class Instance(object):
  """Holds the watcher's view of a virtual machine instance.

  """
  def __init__(self, name, status, disks_active, snodes):
    """Stores the given instance properties.

    @param name: Instance name
    @param status: Instance status
    @param disks_active: Whether the instance's disks are active
    @param snodes: Names of the instance's secondary nodes

    """
    (self.name, self.status) = (name, status)
    (self.disks_active, self.snodes) = (disks_active, snodes)

  def Restart(self, cl):
    """Submits a (non-forced) startup opcode for this instance.

    @param cl: Client handed to L{cli.SubmitOpCode}

    """
    startup_op = opcodes.OpInstanceStartup(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=cl)

  def ActivateDisks(self, cl):
    """Submits an opcode activating all disks of this instance.

    @param cl: Client handed to L{cli.SubmitOpCode}

    """
    activate_op = opcodes.OpInstanceActivateDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=cl)
159 a8083063 Iustin Pop
160 a8083063 Iustin Pop
161 16e0b9c9 Michael Hanselmann
class Node(object):
  """Data container representing cluster node.

  Declared as a new-style class, like L{Instance}.

  """
  def __init__(self, name, bootid, offline, secondaries):
    """Initializes this class.

    @param name: Node name
    @param bootid: Boot ID reported for the node; a false value is treated
      by the disk check as "no boot ID available"
    @param offline: Whether the node is offline
    @param secondaries: Names of instances having this node as secondary

    """
    self.name = name
    self.bootid = bootid
    self.offline = offline
    self.secondaries = secondaries
173 5a3103e9 Michael Hanselmann
174 78f44650 Iustin Pop
175 e52e0ddc Jose A. Lopes
def _CleanupInstance(cl, notepad, inst):
  """Shuts down an instance that was stopped by the user.

  Nothing is attempted once more than L{MAXTRIES} cleanup attempts are
  recorded for the instance. A failed attempt is recorded in the notepad; a
  successful one removes the instance's notepad entry if attempts had been
  recorded.

  @param cl: Client handed to L{cli.SubmitOpCode}
  @param notepad: Watcher state used for tracking cleanup attempts
  @param inst: Instance object, only its C{name} is used

  """
  attempts = notepad.NumberOfCleanupAttempts(inst.name)

  if attempts > MAXTRIES:
    logging.warning("Not cleaning up instance '%s', retries exhausted",
                    inst.name)
    return

  logging.info("Instance '%s' was shutdown by the user, cleaning up instance",
               inst.name)
  shutdown_op = opcodes.OpInstanceShutdown(instance_name=inst.name)

  try:
    cli.SubmitOpCode(shutdown_op, cl=cl)
    # Cleanup worked, forget about earlier failed attempts
    if notepad.NumberOfCleanupAttempts(inst.name):
      notepad.RemoveInstance(inst.name)
  except Exception: # pylint: disable=W0703
    logging.exception("Error while cleaning up instance '%s'", inst.name)
    notepad.RecordCleanupAttempt(inst.name)
194 e52e0ddc Jose A. Lopes
195 e52e0ddc Jose A. Lopes
196 16e0b9c9 Michael Hanselmann
def _CheckInstances(cl, notepad, instances):
  """Walks over all instances and restarts those found in a bad state.

  User-down instances are handed to L{_CleanupInstance}. Instances in one of
  L{BAD_STATES} are restarted unless the recorded number of attempts has
  reached L{MAXTRIES}. For any other instance, previously recorded restart
  attempts are dropped from the notepad.

  @param cl: Client used for submitting opcodes
  @param notepad: Watcher state used for tracking restart attempts
  @type instances: dict
  @param instances: L{Instance} objects indexed by name
  @rtype: set
  @return: Names of the instances whose restart was submitted successfully

  """
  notepad.MaintainInstanceList(instances.keys())

  restarted = set()

  for inst in instances.values():
    if inst.status == constants.INSTST_USERDOWN:
      _CleanupInstance(cl, notepad, inst)
      continue

    if inst.status not in BAD_STATES:
      # Nothing to restart; forget about earlier attempts, if any
      if notepad.NumberOfRestartAttempts(inst.name):
        notepad.RemoveInstance(inst.name)
        if inst.status not in HELPLESS_STATES:
          logging.info("Restart of instance '%s' succeeded", inst.name)
      continue

    attempts = notepad.NumberOfRestartAttempts(inst.name)

    if attempts > MAXTRIES:
      logging.warning("Not restarting instance '%s', retries exhausted",
                      inst.name)
      continue

    if attempts == MAXTRIES:
      # Record one more attempt so the next run takes the branch above
      notepad.RecordRestartAttempt(inst.name)
      logging.error("Could not restart instance '%s' after %s attempts,"
                    " giving up", inst.name, MAXTRIES)
      continue

    try:
      logging.info("Restarting instance '%s' (attempt #%s)",
                   inst.name, attempts + 1)
      inst.Restart(cl)
    except Exception: # pylint: disable=W0703
      logging.exception("Error while restarting instance '%s'", inst.name)
    else:
      restarted.add(inst.name)

    # The attempt is recorded whether or not the restart went through
    notepad.RecordRestartAttempt(inst.name)

  return restarted
239 a8083063 Iustin Pop
240 a8083063 Iustin Pop
241 16e0b9c9 Michael Hanselmann
def _CheckDisks(cl, notepad, nodes, instances, started):
  """Activates disks of instances whose secondary node changed its boot ID.

  A node is considered when it reports a boot ID that differs from the one
  remembered in the notepad. After the affected instances were handled, the
  new boot IDs are stored in the notepad.

  @param cl: Client handed to L{Instance.ActivateDisks}
  @param notepad: Watcher state holding the known node boot IDs
  @type nodes: dict
  @param nodes: L{Node} objects; only the values are used
  @type instances: dict
  @param instances: L{Instance} objects indexed by name
  @param started: Names of instances already restarted in this run; their
    disks are not activated again

  """
  changed_nodes = []

  for node in nodes.values():
    known_bootid = notepad.GetNodeBootID(node.name)

    if node.bootid:
      if known_bootid != node.bootid:
        # Node's boot ID has changed, probably through a reboot
        changed_nodes.append(node)
    elif not node.offline:
      # Bad node, not returning a boot id
      logging.debug("Node '%s' missing boot ID, skipping secondary checks",
                    node.name)

  if not changed_nodes:
    return

  # Activate disks for all instances with any of the checked nodes as a
  # secondary node.
  for node in changed_nodes:
    for instance_name in node.secondaries:
      try:
        inst = instances[instance_name]
      except KeyError:
        logging.info("Can't find instance '%s', maybe it was ignored",
                     instance_name)
        continue

      if not inst.disks_active:
        logging.info("Skipping disk activation for instance with not"
                     " activated disks '%s'", inst.name)
      elif inst.name in started:
        # we already tried to start the instance, which should have
        # activated its drives (if they can be at all)
        logging.debug("Skipping disk activation for instance '%s' as"
                      " it was already started", inst.name)
      else:
        try:
          logging.info("Activating disks for instance '%s'", inst.name)
          inst.ActivateDisks(cl)
        except Exception: # pylint: disable=W0703
          logging.exception("Error while activating disks for instance '%s'",
                            inst.name)

  # Keep changed boot IDs
  for node in changed_nodes:
    notepad.SetNodeBootID(node.name, node.bootid)
294 a8083063 Iustin Pop
295 83e5e26f René Nussbaumer
296 16e0b9c9 Michael Hanselmann
def _CheckForOfflineNodes(nodes, instance):
297 16e0b9c9 Michael Hanselmann
  """Checks if given instances has any secondary in offline status.
298 ae1a845c Michael Hanselmann

299 16e0b9c9 Michael Hanselmann
  @param instance: The instance object
300 16e0b9c9 Michael Hanselmann
  @return: True if any of the secondary is offline, False otherwise
301 ae1a845c Michael Hanselmann

302 16e0b9c9 Michael Hanselmann
  """
303 16e0b9c9 Michael Hanselmann
  return compat.any(nodes[node_name].offline for node_name in instance.snodes)
304 ae1a845c Michael Hanselmann
305 ae1a845c Michael Hanselmann
306 16e0b9c9 Michael Hanselmann
def _VerifyDisks(cl, uuid, nodes, instances):
  """Run a per-group "gnt-cluster verify-disks".

  Submits a disk verification job for the group and, for the instances it
  reports as having offline disks, one further job activating their disks.
  Instances unknown to the caller, in a helpless state or with offline
  secondary nodes are left alone.

  @param cl: Client used for submitting and polling jobs
  @param uuid: Node group identifier passed as C{group_name} to the opcode
  @param nodes: Node objects indexed by name
  @type instances: dict
  @param instances: L{Instance} objects indexed by name

  """
  verify_job_id = cl.SubmitJob([opcodes.OpGroupVerifyDisks(group_name=uuid)])
  (verify_result, ) = cli.PollJob(verify_job_id, cl=cl,
                                  feedback_fn=logging.debug)
  (_, offline_disk_instances, _) = verify_result
  cl.ArchiveJob(verify_job_id)

  if not offline_disk_instances:
    # nothing to do
    logging.debug("Verify-disks reported no offline disks, nothing to do")
    return

  logging.debug("Will activate disks for instance(s) %s",
                utils.CommaJoin(offline_disk_instances))

  # We submit only one job, and wait for it. Not optimal, but this puts less
  # load on the job queue.
  activate_ops = []
  for name in offline_disk_instances:
    try:
      inst = instances[name]
    except KeyError:
      logging.info("Can't find instance '%s', maybe it was ignored", name)
      continue

    if inst.status in HELPLESS_STATES or _CheckForOfflineNodes(nodes, inst):
      logging.info("Skipping instance '%s' because it is in a helpless state"
                   " or has offline secondaries", name)
      continue

    activate_ops.append(opcodes.OpInstanceActivateDisks(instance_name=name))

  if not activate_ops:
    return

  activate_job_id = cli.SendJob(activate_ops, cl=cl)

  try:
    cli.PollJob(activate_job_id, cl=cl, feedback_fn=logging.debug)
  except Exception: # pylint: disable=W0703
    logging.exception("Error while activating disks")
347 a8083063 Iustin Pop
348 a8083063 Iustin Pop
349 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
350 db147305 Tom Limoncelli
  """Connects to RAPI port and does a simple test.
351 db147305 Tom Limoncelli

352 db147305 Tom Limoncelli
  Connects to RAPI port of hostname and does a simple test. At this time, the
353 db147305 Tom Limoncelli
  test is GetVersion.
354 db147305 Tom Limoncelli

355 db147305 Tom Limoncelli
  @type hostname: string
356 db147305 Tom Limoncelli
  @param hostname: hostname of the node to connect to.
357 db147305 Tom Limoncelli
  @rtype: bool
358 db147305 Tom Limoncelli
  @return: Whether RAPI is working properly
359 db147305 Tom Limoncelli

360 db147305 Tom Limoncelli
  """
361 34f06005 Iustin Pop
  curl_config = rapi.client.GenericCurlConfig()
362 2a7c3583 Michael Hanselmann
  rapi_client = rapi.client.GanetiRapiClient(hostname,
363 2a7c3583 Michael Hanselmann
                                             curl_config_fn=curl_config)
364 db147305 Tom Limoncelli
  try:
365 db147305 Tom Limoncelli
    master_version = rapi_client.GetVersion()
366 db147305 Tom Limoncelli
  except rapi.client.CertificateError, err:
367 d7c42723 Michael Hanselmann
    logging.warning("RAPI certificate error: %s", err)
368 db147305 Tom Limoncelli
    return False
369 db147305 Tom Limoncelli
  except rapi.client.GanetiApiError, err:
370 d7c42723 Michael Hanselmann
    logging.warning("RAPI error: %s", err)
371 db147305 Tom Limoncelli
    return False
372 d7c42723 Michael Hanselmann
  else:
373 d7c42723 Michael Hanselmann
    logging.debug("Reported RAPI version %s", master_version)
374 d7c42723 Michael Hanselmann
    return master_version == constants.RAPI_VERSION
375 db147305 Tom Limoncelli
376 db147305 Tom Limoncelli
377 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The job age option is converted using L{cli.ParseTimespec}. Positional
  arguments are not accepted and lead to a parser error.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  opt_parser = OptionParser(description="Ganeti cluster watcher",
                            usage="%prog [-d]",
                            version=version_text)

  opt_parser.add_option(cli.DEBUG_OPT)
  opt_parser.add_option(cli.NODEGROUP_OPT)
  opt_parser.add_option("-A", "--job-age", dest="job_age", default=6 * 3600,
                        help="Autoarchive jobs older than this age (default"
                             " 6 hours)")
  opt_parser.add_option("--ignore-pause", dest="ignore_pause", default=False,
                        action="store_true",
                        help="Ignore cluster pause setting")
  opt_parser.add_option("--wait-children", dest="wait_children",
                        action="store_true", help="Wait for child processes")
  opt_parser.add_option("--no-wait-children", dest="wait_children",
                        action="store_false",
                        help="Don't wait for child processes")
  # See optparse documentation for why default values are not set by options
  opt_parser.set_defaults(wait_children=True)

  (options, args) = opt_parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)

  if args:
    opt_parser.error("No arguments expected")

  return (options, args)
409 a8083063 Iustin Pop
410 a8083063 Iustin Pop
411 9bb69bb5 Michael Hanselmann
def _WriteInstanceStatus(filename, data):
  """Writes the per-group instance status file.

  Each entry becomes one line of the form "name status"; the entries are
  written in sorted order.

  @type filename: string
  @param filename: Path to instance status file
  @type data: list of tuple; (instance name as string, status as string)
  @param data: Instance name and status

  """
  logging.debug("Updating instance status file '%s' with %s instances",
                filename, len(data))

  lines = ["%s %s\n" % entry for entry in sorted(data)]

  utils.WriteFile(filename, data="".join(lines))
428 9bb69bb5 Michael Hanselmann
429 9bb69bb5 Michael Hanselmann
430 9bb69bb5 Michael Hanselmann
def _UpdateInstanceStatus(filename, instances):
  """Writes an instance status file from L{Instance} objects.

  Name and status of every instance are extracted and handed to
  L{_WriteInstanceStatus}.

  @type filename: string
  @param filename: Path to status file
  @type instances: list of L{Instance}

  """
  status_pairs = []
  for inst in instances:
    status_pairs.append((inst.name, inst.status))

  _WriteInstanceStatus(filename, status_pairs)
440 9bb69bb5 Michael Hanselmann
441 9bb69bb5 Michael Hanselmann
442 9bb69bb5 Michael Hanselmann
def _ReadInstanceStatus(filename):
443 9bb69bb5 Michael Hanselmann
  """Reads an instance status file.
444 9bb69bb5 Michael Hanselmann

445 9bb69bb5 Michael Hanselmann
  @type filename: string
446 9bb69bb5 Michael Hanselmann
  @param filename: Path to status file
447 9bb69bb5 Michael Hanselmann
  @rtype: tuple; (None or number, list of lists containing instance name and
448 9bb69bb5 Michael Hanselmann
    status)
449 9bb69bb5 Michael Hanselmann
  @return: File's mtime and instance status contained in the file; mtime is
450 9bb69bb5 Michael Hanselmann
    C{None} if file can't be read
451 9bb69bb5 Michael Hanselmann

452 9bb69bb5 Michael Hanselmann
  """
453 9bb69bb5 Michael Hanselmann
  logging.debug("Reading per-group instance status from '%s'", filename)
454 9bb69bb5 Michael Hanselmann
455 2635bb04 Michael Hanselmann
  statcb = utils.FileStatHelper()
456 9bb69bb5 Michael Hanselmann
  try:
457 9bb69bb5 Michael Hanselmann
    content = utils.ReadFile(filename, preread=statcb)
458 9bb69bb5 Michael Hanselmann
  except EnvironmentError, err:
459 9bb69bb5 Michael Hanselmann
    if err.errno == errno.ENOENT:
460 9bb69bb5 Michael Hanselmann
      logging.error("Can't read '%s', does not exist (yet)", filename)
461 9bb69bb5 Michael Hanselmann
    else:
462 9bb69bb5 Michael Hanselmann
      logging.exception("Unable to read '%s', ignoring", filename)
463 9bb69bb5 Michael Hanselmann
    return (None, None)
464 9bb69bb5 Michael Hanselmann
  else:
465 6f9e71bb Michael Hanselmann
    return (statcb.st.st_mtime, [line.split(None, 1)
466 9bb69bb5 Michael Hanselmann
                                 for line in content.splitlines()])
467 9bb69bb5 Michael Hanselmann
468 9bb69bb5 Michael Hanselmann
469 9bb69bb5 Michael Hanselmann
def _MergeInstanceStatus(filename, pergroup_filename, groups):
470 9bb69bb5 Michael Hanselmann
  """Merges all per-group instance status files into a global one.
471 9bb69bb5 Michael Hanselmann

472 9bb69bb5 Michael Hanselmann
  @type filename: string
473 9bb69bb5 Michael Hanselmann
  @param filename: Path to global instance status file
474 9bb69bb5 Michael Hanselmann
  @type pergroup_filename: string
475 9bb69bb5 Michael Hanselmann
  @param pergroup_filename: Path to per-group status files, must contain "%s"
476 9bb69bb5 Michael Hanselmann
    to be replaced with group UUID
477 9bb69bb5 Michael Hanselmann
  @type groups: sequence
478 9bb69bb5 Michael Hanselmann
  @param groups: UUIDs of known groups
479 9bb69bb5 Michael Hanselmann

480 9bb69bb5 Michael Hanselmann
  """
481 9bb69bb5 Michael Hanselmann
  # Lock global status file in exclusive mode
482 9bb69bb5 Michael Hanselmann
  lock = utils.FileLock.Open(filename)
483 9bb69bb5 Michael Hanselmann
  try:
484 9bb69bb5 Michael Hanselmann
    lock.Exclusive(blocking=True, timeout=INSTANCE_STATUS_LOCK_TIMEOUT)
485 9bb69bb5 Michael Hanselmann
  except errors.LockError, err:
486 9bb69bb5 Michael Hanselmann
    # All per-group processes will lock and update the file. None of them
487 9bb69bb5 Michael Hanselmann
    # should take longer than 10 seconds (the value of
488 9bb69bb5 Michael Hanselmann
    # INSTANCE_STATUS_LOCK_TIMEOUT).
489 9bb69bb5 Michael Hanselmann
    logging.error("Can't acquire lock on instance status file '%s', not"
490 9bb69bb5 Michael Hanselmann
                  " updating: %s", filename, err)
491 9bb69bb5 Michael Hanselmann
    return
492 9bb69bb5 Michael Hanselmann
493 9bb69bb5 Michael Hanselmann
  logging.debug("Acquired exclusive lock on '%s'", filename)
494 9bb69bb5 Michael Hanselmann
495 9bb69bb5 Michael Hanselmann
  data = {}
496 9bb69bb5 Michael Hanselmann
497 9bb69bb5 Michael Hanselmann
  # Load instance status from all groups
498 9bb69bb5 Michael Hanselmann
  for group_uuid in groups:
499 9bb69bb5 Michael Hanselmann
    (mtime, instdata) = _ReadInstanceStatus(pergroup_filename % group_uuid)
500 9bb69bb5 Michael Hanselmann
501 9bb69bb5 Michael Hanselmann
    if mtime is not None:
502 9bb69bb5 Michael Hanselmann
      for (instance_name, status) in instdata:
503 9bb69bb5 Michael Hanselmann
        data.setdefault(instance_name, []).append((mtime, status))
504 9bb69bb5 Michael Hanselmann
505 9bb69bb5 Michael Hanselmann
  # Select last update based on file mtime
506 9bb69bb5 Michael Hanselmann
  inststatus = [(instance_name, sorted(status, reverse=True)[0][1])
507 9bb69bb5 Michael Hanselmann
                for (instance_name, status) in data.items()]
508 9bb69bb5 Michael Hanselmann
509 9bb69bb5 Michael Hanselmann
  # Write the global status file. Don't touch file after it's been
510 9bb69bb5 Michael Hanselmann
  # updated--there is no lock anymore.
511 9bb69bb5 Michael Hanselmann
  _WriteInstanceStatus(filename, inststatus)
512 8f07dc0d Michael Hanselmann
513 8f07dc0d Michael Hanselmann
514 16e0b9c9 Michael Hanselmann
def GetLuxiClient(try_restart):
515 16e0b9c9 Michael Hanselmann
  """Tries to connect to the master daemon.
516 16e0b9c9 Michael Hanselmann

517 16e0b9c9 Michael Hanselmann
  @type try_restart: bool
518 16e0b9c9 Michael Hanselmann
  @param try_restart: Whether to attempt to restart the master daemon
519 16e0b9c9 Michael Hanselmann

520 16e0b9c9 Michael Hanselmann
  """
521 16e0b9c9 Michael Hanselmann
  try:
522 16e0b9c9 Michael Hanselmann
    return cli.GetClient()
523 16e0b9c9 Michael Hanselmann
  except errors.OpPrereqError, err:
524 16e0b9c9 Michael Hanselmann
    # this is, from cli.GetClient, a not-master case
525 16e0b9c9 Michael Hanselmann
    raise NotMasterError("Not on master node (%s)" % err)
526 16e0b9c9 Michael Hanselmann
527 16e0b9c9 Michael Hanselmann
  except luxi.NoMasterError, err:
528 16e0b9c9 Michael Hanselmann
    if not try_restart:
529 16e0b9c9 Michael Hanselmann
      raise
530 16e0b9c9 Michael Hanselmann
531 16e0b9c9 Michael Hanselmann
    logging.warning("Master daemon seems to be down (%s), trying to restart",
532 16e0b9c9 Michael Hanselmann
                    err)
533 16e0b9c9 Michael Hanselmann
534 16e0b9c9 Michael Hanselmann
    if not utils.EnsureDaemon(constants.MASTERD):
535 16e0b9c9 Michael Hanselmann
      raise errors.GenericError("Can't start the master daemon")
536 16e0b9c9 Michael Hanselmann
537 16e0b9c9 Michael Hanselmann
    # Retry the connection
538 16e0b9c9 Michael Hanselmann
    return cli.GetClient()
539 16e0b9c9 Michael Hanselmann
540 16e0b9c9 Michael Hanselmann
541 16e0b9c9 Michael Hanselmann
def _StartGroupChildren(cl, wait):
  """Starts a new instance of the watcher for every node group.

  Every child is started with this process' own command line (C{sys.argv})
  plus the node group option followed by the group's UUID. Between two
  spawns the function sleeps for C{CHILD_PROCESS_DELAY} seconds. A child
  that fails to start is logged and skipped; the remaining groups are
  still processed.

  @param cl: client object; only its C{QueryGroups} method is used
  @type wait: bool
  @param wait: Whether to wait for every started child to exit before
      returning

  """
  # The global watcher must not have been given a node group itself
  assert not compat.any(arg.startswith(cli.NODEGROUP_OPT_NAME)
                        for arg in sys.argv)

  groups = cl.QueryGroups([], ["name", "uuid"], False)

  children = []

  for (idx, (name, uuid)) in enumerate(groups):
    args = sys.argv + [cli.NODEGROUP_OPT_NAME, uuid]

    if idx > 0:
      # Let's not kill the system
      time.sleep(CHILD_PROCESS_DELAY)

    logging.debug("Spawning child for group '%s' (%s), arguments %s",
                  name, uuid, args)

    try:
      # TODO: Should utils.StartDaemon be used instead?
      pid = os.spawnv(os.P_NOWAIT, args[0], args)
    except Exception: # pylint: disable=W0703
      logging.exception("Failed to start child for group '%s' (%s)",
                        name, uuid)
    else:
      logging.debug("Started with PID %s", pid)
      children.append(pid)

  if wait:
    for pid in children:
      logging.debug("Waiting for child PID %s", pid)
      try:
        status = utils.RetryOnSignal(os.waitpid, pid, 0)
      except EnvironmentError as err:
        # Only used for the log message below
        status = str(err)

      logging.debug("Child PID %s exited with status %s", pid, status)
581 16e0b9c9 Michael Hanselmann
582 16e0b9c9 Michael Hanselmann
583 16e0b9c9 Michael Hanselmann
def _ArchiveJobs(cl, age):
  """Archives old jobs.

  @param cl: client object providing C{AutoArchiveJobs}
  @param age: passed unchanged to C{AutoArchiveJobs}

  """
  counts = cl.AutoArchiveJobs(age)
  (archived, remaining) = counts
  logging.debug("Archived %s jobs, left %s", archived, remaining)
589 16e0b9c9 Michael Hanselmann
590 16e0b9c9 Michael Hanselmann
591 16e0b9c9 Michael Hanselmann
def _CheckMaster(cl):
  """Ensures current host is master node.

  @param cl: client object providing C{QueryConfigValues}
  @raise NotMasterError: if the configured C{master_node} value differs
      from the name returned by L{netutils.Hostname.GetSysName}

  """
  values = cl.QueryConfigValues(["master_node"])
  # Exactly one value is expected for the single requested field
  (configured_master, ) = values
  local_name = netutils.Hostname.GetSysName()

  if configured_master != local_name:
    raise NotMasterError("This is not the master node")
598 16e0b9c9 Michael Hanselmann
599 16e0b9c9 Michael Hanselmann
600 fc3f75dd Iustin Pop
@UsesRapiClient
def _GlobalWatcher(opts):
  """Main function for global watcher.

  Starts the node daemons, runs the watcher hooks and, if due, node
  maintenance. On the master node it additionally makes sure the remote
  API daemon is running (restarting it once if it does not answer),
  archives old jobs and finally spawns child processes for every node
  group.

  @param opts: parsed command line options; C{job_age} and
      C{wait_children} are read
  @return: L{constants.EXIT_SUCCESS}, also when not running on the master
      node

  """
  StartNodeDaemons()
  RunWatcherHooks()

  # Run node maintenance in all cases, even if master, so that old masters can
  # be properly cleaned up
  if nodemaint.NodeMaintenance.ShouldRun(): # pylint: disable=E0602
    nodemaint.NodeMaintenance().Exec() # pylint: disable=E0602

  try:
    client = GetLuxiClient(True)
  except NotMasterError:
    # Don't proceed on non-master nodes
    return constants.EXIT_SUCCESS

  # we are on master now
  utils.EnsureDaemon(constants.RAPI)

  # If RAPI isn't responding to queries, try one restart
  logging.debug("Attempting to talk to remote API on %s",
                constants.IP4_ADDRESS_LOCALHOST)
  rapi_ok = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)
  if not rapi_ok:
    logging.warning("Couldn't get answer from remote API, restaring daemon")
    utils.StopDaemon(constants.RAPI)
    utils.EnsureDaemon(constants.RAPI)
    logging.debug("Second attempt to talk to remote API")
    rapi_ok = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)
    if not rapi_ok:
      # logging.fatal only emits a log record; the watcher carries on with
      # the remaining work even though RAPI is unavailable
      logging.fatal("RAPI is not responding")

  # Only claim success if one of the probes actually succeeded
  if rapi_ok:
    logging.debug("Successfully talked to remote API")

  _CheckMaster(client)
  _ArchiveJobs(client, opts.job_age)

  # Spawn child processes for all node groups
  _StartGroupChildren(client, opts.wait_children)

  return constants.EXIT_SUCCESS
643 16e0b9c9 Michael Hanselmann
644 16e0b9c9 Michael Hanselmann
645 16e0b9c9 Michael Hanselmann
def _GetGroupData(cl, uuid):
  """Retrieves instances and nodes per node group.

  A single job with two query opcodes is submitted: one for the instances
  whose primary node is in the group and one for the nodes of the group.
  The job is archived once its result has been fetched. Instances having
  secondary nodes outside the primary node's group ("split" instances) are
  logged and left out of the result.

  @param cl: client object used to submit, poll and archive the job
  @param uuid: node group UUID used in both query filters
  @rtype: tuple
  @return: C{(nodes, instances)}, two dictionaries mapping names to
      L{Node} and L{Instance} objects respectively

  """
  # All primary instances in the group
  instance_query = opcodes.OpQuery(
    what=constants.QR_INSTANCE,
    fields=["name", "status", "disks_active", "snodes",
            "pnode.group.uuid", "snodes.group.uuid"],
    qfilter=[qlang.OP_EQUAL, "pnode.group.uuid", uuid],
    use_locking=True)

  # All nodes in the group
  node_query = opcodes.OpQuery(
    what=constants.QR_NODE,
    fields=["name", "bootid", "offline"],
    qfilter=[qlang.OP_EQUAL, "group.uuid", uuid],
    use_locking=True)

  job_id = cl.SubmitJob([instance_query, node_query])
  responses = [objects.QueryResponse.FromDict(raw_response)
               for raw_response in cli.PollJob(job_id, cl=cl,
                                               feedback_fn=logging.debug)]
  cl.ArchiveJob(job_id)

  results_data = [response.data for response in responses]

  # Every field in every row must be a two-element sequence
  assert compat.all(map(ht.TListOf(ht.TListOf(ht.TIsLength(2))),
                        results_data))

  # Keep only the second element of every field, dropping the result status
  (raw_instances, raw_nodes) = [[[compat.snd(field) for field in row]
                                 for row in rows]
                                for rows in results_data]

  secondaries = {}
  instances = []

  # Load all instances
  for (name, status, disks_active, snodes, pnode_group_uuid,
       snodes_group_uuid) in raw_instances:
    if snodes and set(snodes_group_uuid) != set([pnode_group_uuid]):
      logging.error("Ignoring split instance '%s', primary group %s, secondary"
                    " groups %s", name, pnode_group_uuid,
                    utils.CommaJoin(snodes_group_uuid))
      continue

    instances.append(Instance(name, status, disks_active, snodes))

    # Remember which instances use each node as secondary
    for node in snodes:
      if node not in secondaries:
        secondaries[node] = set()
      secondaries[node].add(name)

  # Load all nodes
  nodes = []
  for (name, bootid, offline) in raw_nodes:
    nodes.append(Node(name, bootid, offline, secondaries.get(name, set())))

  nodes_by_name = dict([(node.name, node) for node in nodes])
  instances_by_name = dict([(inst.name, inst) for inst in instances])

  return (nodes_by_name, instances_by_name)
701 16e0b9c9 Michael Hanselmann
702 16e0b9c9 Michael Hanselmann
703 9bb69bb5 Michael Hanselmann
def _LoadKnownGroups():
  """Returns a list of all node groups known by L{ssconf}.

  For every non-blank line of the ssconf node group list only the first
  whitespace-separated field is kept; it must look like a UUID.

  @rtype: list
  @return: group UUIDs in the order they appear in ssconf
  @raise errors.GenericError: if any extracted value doesn't match
      L{utils.UUID_RE}

  """
  group_lines = ssconf.SimpleStore().GetNodegroupList()

  uuids = []
  for line in group_lines:
    if line.strip():
      uuids.append(line.split(None, 1)[0])

  for candidate in uuids:
    if not utils.UUID_RE.match(candidate):
      raise errors.GenericError("Ssconf contains invalid group UUID")

  return uuids
716 16e0b9c9 Michael Hanselmann
717 16e0b9c9 Michael Hanselmann
718 16e0b9c9 Michael Hanselmann
def _GroupWatcher(opts):
  """Main function for per-group watcher process.

  Validates the node group UUID given on the command line, opens the
  group's state file, refreshes the per-group and global instance status
  files and then checks instances and disks of the group. The watcher
  state is only saved if all of these steps succeeded.

  @param opts: parsed command line options; C{nodegroup} is read
  @return: L{constants.EXIT_FAILURE} if the state file couldn't be opened,
      L{constants.EXIT_SUCCESS} otherwise
  @raise errors.GenericError: if the node group parameter is not a UUID or
      is not known by ssconf

  """
  group_uuid = opts.nodegroup.lower()

  if not utils.UUID_RE.match(group_uuid):
    raise errors.GenericError("Node group parameter (%s) must be given a UUID,"
                              " got '%s'" %
                              (cli.NODEGROUP_OPT_NAME, group_uuid))

  logging.info("Watcher for node group '%s'", group_uuid)

  known_groups = _LoadKnownGroups()

  # Check if node group is known
  if group_uuid not in known_groups:
    raise errors.GenericError("Node group '%s' is not known by ssconf" %
                              group_uuid)

  # Group UUID has been verified and should not contain any dangerous
  # characters
  state_path = pathutils.WATCHER_GROUP_STATE_FILE % group_uuid
  inst_status_path = pathutils.WATCHER_GROUP_INSTANCE_STATUS_FILE % group_uuid

  logging.debug("Using state file %s", state_path)

  # Open the state file of this node group
  statefile = state.OpenStateFile(state_path) # pylint: disable=E0602
  if not statefile:
    return constants.EXIT_FAILURE

  notepad = state.WatcherState(statefile) # pylint: disable=E0602
  try:
    # Connect to master daemon; no restart attempt from a group watcher
    client = GetLuxiClient(False)

    _CheckMaster(client)

    (nodes, instances) = _GetGroupData(client, group_uuid)

    # Update per-group instance status file
    _UpdateInstanceStatus(inst_status_path, instances.values())

    # Rebuild the global status file from the per-group files
    _MergeInstanceStatus(pathutils.INSTANCE_STATUS_FILE,
                         pathutils.WATCHER_GROUP_INSTANCE_STATUS_FILE,
                         known_groups)

    started = _CheckInstances(client, notepad, instances)
    _CheckDisks(client, notepad, nodes, instances, started)
    _VerifyDisks(client, group_uuid, nodes, instances)
  except Exception as err:
    # The state is deliberately not saved; the error is handled by the caller
    logging.info("Not updating status file due to failure: %s", err)
    raise
  else:
    # Save changes for next run
    notepad.Save(state_path)

  return constants.EXIT_SUCCESS
777 16e0b9c9 Michael Hanselmann
778 16e0b9c9 Michael Hanselmann
779 9f4bb951 Michael Hanselmann
def Main():
  """Main function.

  Parses the command line, sets up logging, honours the pause flag, takes
  the watcher lock in shared mode and then runs either the global watcher
  or, if a node group was given, the per-group watcher.

  @return: one of the C{constants.EXIT_*} values; note that a paused
      watcher, a lock that can't be acquired and a full or drained job
      queue all result in L{constants.EXIT_SUCCESS}

  """
  (options, _) = ParseOptions()

  utils.SetupLogging(pathutils.LOG_WATCHER, sys.argv[0],
                     debug=options.debug, stderr_logging=options.debug)

  if ShouldPause() and not options.ignore_pause:
    logging.debug("Pause has been set, exiting")
    return constants.EXIT_SUCCESS

  # Try to acquire global watcher lock in shared mode
  lock = utils.FileLock.Open(pathutils.WATCHER_LOCK_FILE)
  try:
    lock.Shared(blocking=False)
  except (EnvironmentError, errors.LockError) as err:
    logging.error("Can't acquire lock on %s: %s",
                  pathutils.WATCHER_LOCK_FILE, err)
    return constants.EXIT_SUCCESS

  if options.nodegroup is None:
    fn = _GlobalWatcher
  else:
    # Per-nodegroup watcher
    fn = _GroupWatcher

  try:
    return fn(options)
  except (SystemExit, KeyboardInterrupt):
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    return constants.EXIT_NOTMASTER
  except errors.ResolverError as err:
    logging.error("Cannot resolve hostname '%s', exiting", err.args[0])
    return constants.EXIT_NODESETUP_ERROR
  except errors.JobQueueFull:
    # Logged only; falls through to the EXIT_SUCCESS return below
    logging.error("Job queue is full, can't query cluster state")
  except errors.JobQueueDrainError:
    # Logged only; falls through to the EXIT_SUCCESS return below
    logging.error("Job queue is drained, can't maintain cluster state")
  except Exception as err:
    logging.exception(str(err))
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS