Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / __init__.py @ 0ba177e2

History | View | Annotate | Download (24.3 kB)

1 9f4bb951 Michael Hanselmann
#
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 f2af0bec Iustin Pop
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop

24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop

28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 cfcc79c6 Michael Hanselmann
import os.path
32 a8083063 Iustin Pop
import sys
33 a8083063 Iustin Pop
import time
34 438b45d4 Michael Hanselmann
import logging
35 16e0b9c9 Michael Hanselmann
import operator
36 9bb69bb5 Michael Hanselmann
import errno
37 a8083063 Iustin Pop
from optparse import OptionParser
38 a8083063 Iustin Pop
39 a8083063 Iustin Pop
from ganeti import utils
40 a8083063 Iustin Pop
from ganeti import constants
41 83e5e26f René Nussbaumer
from ganeti import compat
42 89e1fc26 Iustin Pop
from ganeti import errors
43 e125c67c Michael Hanselmann
from ganeti import opcodes
44 e125c67c Michael Hanselmann
from ganeti import cli
45 7dfb83c2 Iustin Pop
from ganeti import luxi
46 db147305 Tom Limoncelli
from ganeti import rapi
47 a744b676 Manuel Franceschini
from ganeti import netutils
48 16e0b9c9 Michael Hanselmann
from ganeti import qlang
49 16e0b9c9 Michael Hanselmann
from ganeti import objects
50 16e0b9c9 Michael Hanselmann
from ganeti import ssconf
51 16e0b9c9 Michael Hanselmann
from ganeti import ht
52 a8083063 Iustin Pop
53 b459a848 Andrea Spadaccini
import ganeti.rapi.client # pylint: disable=W0611
54 adf6301e Michael Hanselmann
55 adf6301e Michael Hanselmann
from ganeti.watcher import nodemaint
56 adf6301e Michael Hanselmann
from ganeti.watcher import state
57 db147305 Tom Limoncelli
58 a8083063 Iustin Pop
59 5a3103e9 Michael Hanselmann
#: Number of recorded restart attempts at which the watcher gives up on an
#: instance
MAXTRIES = 5

#: Instance states in which a restart is attempted
BAD_STATES = frozenset([constants.INSTST_ERRORDOWN])

#: Instance states for which disk activation after verify-disks is skipped
#: and no "restart succeeded" message is logged
HELPLESS_STATES = frozenset([
  constants.INSTST_NODEDOWN,
  constants.INSTST_NODEOFFLINE,
  ])

NOTICE = "NOTICE"
ERROR = "ERROR"

#: Number of seconds to wait between starting child processes for node groups
CHILD_PROCESS_DELAY = 1.0

#: How many seconds to wait for instance status file lock
INSTANCE_STATUS_LOCK_TIMEOUT = 10.0
75 9bb69bb5 Michael Hanselmann
76 e125c67c Michael Hanselmann
77 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher is not running on the master node.

  """
79 a8083063 Iustin Pop
80 a8083063 Iustin Pop
81 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher should pause.

  @rtype: bool
  @return: Truth value of what L{utils.ReadWatcherPauseFile} returns for
    the watcher pause file

  """
  pause_value = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)

  if pause_value:
    return True

  return False
86 3753b2cb Michael Hanselmann
87 3753b2cb Michael Hanselmann
88 f1115454 Guido Trotter
def StartNodeDaemons():
  """Ensure the daemons expected on every node are running.

  The node daemon is always handled; the confd daemon only if
  C{constants.ENABLE_CONFD} is set.

  """
  # The node daemon is wanted regardless of whether this is the master
  daemons = [constants.NODED]

  # confd is started as well; on non-candidates it will be in disabled mode
  if constants.ENABLE_CONFD:
    daemons.append(constants.CONFD)

  for daemon_name in daemons:
    utils.EnsureDaemon(daemon_name)
97 f1115454 Guido Trotter
98 f1115454 Guido Trotter
99 9e289e36 Guido Trotter
def RunWatcherHooks():
  """Run the watcher hooks.

  Runs everything in the watcher hooks directory through L{utils.RunParts}
  and logs the outcome of every hook. If the hooks directory does not exist,
  or if C{RunParts} itself raises, the function returns without running or
  reporting any hook.

  @raise errors.ProgrammerError: if C{RunParts} reports a status that is
    neither skip, error nor run

  """
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
                             constants.HOOKS_NAME_WATCHER)
  if not os.path.isdir(hooks_dir):
    return

  try:
    results = utils.RunParts(hooks_dir)
  except Exception: # pylint: disable=W0703
    # The exception is fetched through sys.exc_info() so that the handler
    # syntax is valid on every Python version
    logging.exception("RunParts %s failed: %s", hooks_dir, sys.exc_info()[1])
    return

  for (relname, status, runresult) in results:
    if status == constants.RUNPARTS_SKIP:
      logging.debug("Watcher hook %s: skipped", relname)
    elif status == constants.RUNPARTS_ERR:
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
    elif status == constants.RUNPARTS_RUN:
      if runresult.failed:
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
                        relname, runresult.exit_code, runresult.output)
      else:
        logging.debug("Watcher hook %s: success (output: %s)", relname,
                      runresult.output)
    else:
      # Exceptions do not interpolate their arguments the way logging calls
      # do, hence the message has to be formatted explicitly
      raise errors.ProgrammerError("Unknown status %s returned by RunParts" %
                                   (status, ))
129 9e289e36 Guido Trotter
130 001b3825 Michael Hanselmann
131 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  """
  def __init__(self, name, status, autostart, snodes):
    """Stores the given instance properties.

    @param name: Instance name
    @param status: Instance status, compared against the watcher's state sets
    @param autostart: Whether disks may be activated automatically for this
      instance
    @param snodes: Names of the instance's secondary nodes

    """
    self.snodes = snodes
    self.autostart = autostart
    self.status = status
    self.name = name

  def Restart(self, cl):
    """Submits a non-forced startup opcode for this instance.

    @param cl: Client handed to L{cli.SubmitOpCode}

    """
    startup_op = opcodes.OpInstanceStartup(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=cl)

  def ActivateDisks(self, cl):
    """Submits an opcode activating all disks of this instance.

    @param cl: Client handed to L{cli.SubmitOpCode}

    """
    activate_op = opcodes.OpInstanceActivateDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=cl)
154 a8083063 Iustin Pop
155 a8083063 Iustin Pop
156 16e0b9c9 Michael Hanselmann
class Node:
  """Plain data holder describing one cluster node.

  """
  def __init__(self, name, bootid, offline, secondaries):
    """Stores the given node properties.

    @param name: Node name
    @param bootid: Boot ID reported for the node; a false value is treated
      as "no boot ID available"
    @param offline: Whether the node is offline
    @param secondaries: Names of instances having this node as secondary

    """
    self.secondaries = secondaries
    self.offline = offline
    self.bootid = bootid
    self.name = name
168 5a3103e9 Michael Hanselmann
169 78f44650 Iustin Pop
170 16e0b9c9 Michael Hanselmann
def _CheckInstances(cl, notepad, instances):
  """Walk over all instances and try to restart the ones that are down.

  Restart attempts are counted in the notepad. Once L{MAXTRIES} attempts
  have been recorded one final attempt is recorded and an error logged;
  after that the instance is only reported with a warning. Instances which
  are not in one of the L{BAD_STATES} have their recorded attempts removed.

  @param cl: Client handed to L{Instance.Restart}
  @param notepad: Watcher state object keeping the restart attempts
  @type instances: dict
  @param instances: L{Instance} objects by instance name
  @rtype: set
  @return: Names of the instances whose restart was submitted without error

  """
  notepad.MaintainInstanceList(instances.keys())

  started = set()

  for inst in instances.values():
    if inst.status not in BAD_STATES:
      # Nothing to restart; forget about attempts recorded earlier
      if notepad.NumberOfRestartAttempts(inst.name):
        notepad.RemoveInstance(inst.name)
        if inst.status not in HELPLESS_STATES:
          logging.info("Restart of instance '%s' succeeded", inst.name)
      continue

    attempts = notepad.NumberOfRestartAttempts(inst.name)

    if attempts > MAXTRIES:
      logging.warning("Not restarting instance '%s', retries exhausted",
                      inst.name)
      continue

    if attempts == MAXTRIES:
      # Record once more so that the next run takes the branch above
      notepad.RecordRestartAttempt(inst.name)
      logging.error("Could not restart instance '%s' after %s attempts,"
                    " giving up", inst.name, MAXTRIES)
      continue

    try:
      logging.info("Restarting instance '%s' (attempt #%s)",
                   inst.name, attempts + 1)
      inst.Restart(cl)
    except Exception: # pylint: disable=W0703
      logging.exception("Error while restarting instance '%s'", inst.name)
    else:
      started.add(inst.name)

    # The attempt counts whether or not the restart could be submitted
    notepad.RecordRestartAttempt(inst.name)

  return started
211 a8083063 Iustin Pop
212 a8083063 Iustin Pop
213 16e0b9c9 Michael Hanselmann
def _CheckDisks(cl, notepad, nodes, instances, started):
  """Activate disks of instances whose secondary node changed its boot ID.

  Nodes whose reported boot ID differs from the one kept in the notepad are
  collected. For every instance listed as secondary on such a node the disks
  are activated, unless the instance is unknown, is not marked for autostart
  or was already started during this run. Finally the new boot IDs are
  stored in the notepad.

  @param cl: Client handed to L{Instance.ActivateDisks}
  @param notepad: Watcher state object keeping the node boot IDs
  @type nodes: dict
  @param nodes: L{Node} objects; only the values are used
  @type instances: dict
  @param instances: L{Instance} objects by instance name
  @param started: Names of instances already started during this run

  """
  changed_nodes = []

  for node in nodes.values():
    known_bootid = notepad.GetNodeBootID(node.name)

    if node.bootid:
      if known_bootid != node.bootid:
        # Node's boot ID has changed, probably through a reboot
        changed_nodes.append(node)
    elif not node.offline:
      # Bad node, not returning a boot id
      logging.debug("Node '%s' missing boot ID, skipping secondary checks",
                    node.name)

  # Activate disks for all instances with any of the changed nodes as a
  # secondary node.
  for node in changed_nodes:
    for instance_name in node.secondaries:
      try:
        inst = instances[instance_name]
      except KeyError:
        logging.info("Can't find instance '%s', maybe it was ignored",
                     instance_name)
        continue

      if not inst.autostart:
        logging.info("Skipping disk activation for non-autostart"
                     " instance '%s'", inst.name)
      elif inst.name in started:
        # we already tried to start the instance, which should have
        # activated its drives (if they can be at all)
        logging.debug("Skipping disk activation for instance '%s' as"
                      " it was already started", inst.name)
      else:
        try:
          logging.info("Activating disks for instance '%s'", inst.name)
          inst.ActivateDisks(cl)
        except Exception: # pylint: disable=W0703
          logging.exception("Error while activating disks for instance '%s'",
                            inst.name)

  # Keep changed boot IDs
  for node in changed_nodes:
    notepad.SetNodeBootID(node.name, node.bootid)
266 a8083063 Iustin Pop
267 83e5e26f René Nussbaumer
268 16e0b9c9 Michael Hanselmann
def _CheckForOfflineNodes(nodes, instance):
  """Checks whether the given instance has any offline secondary node.

  @param nodes: Node objects by node name; must contain every secondary
    node of the instance
  @param instance: The instance object
  @return: True if any of the secondary nodes is offline, False otherwise

  """
  for node_name in instance.snodes:
    if nodes[node_name].offline:
      return True

  return False
276 ae1a845c Michael Hanselmann
277 ae1a845c Michael Hanselmann
278 16e0b9c9 Michael Hanselmann
def _VerifyDisks(cl, uuid, nodes, instances):
  """Run a per-group "gnt-cluster verify-disks".

  Submits a disk verification job for the group, waits for it and archives
  it. The second element of the job's result is taken as the names of the
  instances with offline disks; for those a single job activating the disks
  is submitted and waited for. Unknown instances, instances in one of the
  L{HELPLESS_STATES} and instances with offline secondary nodes are left
  out.

  @param cl: Client used for submitting, polling and archiving jobs
  @param uuid: Passed as group name to the verification opcode
  @param nodes: Node objects by node name
  @param instances: L{Instance} objects by instance name

  """
  verify_job_id = cl.SubmitJob([opcodes.OpGroupVerifyDisks(group_name=uuid)])
  verify_result = cli.PollJob(verify_job_id, cl=cl,
                              feedback_fn=logging.debug)
  ((_, offline_disk_instances, _), ) = verify_result
  cl.ArchiveJob(verify_job_id)

  if not offline_disk_instances:
    # nothing to do
    logging.debug("Verify-disks reported no offline disks, nothing to do")
    return

  logging.debug("Will activate disks for instance(s) %s",
                utils.CommaJoin(offline_disk_instances))

  # We submit only one job, and wait for it. Not optimal, but this puts less
  # load on the job queue.
  activate_ops = []

  for name in offline_disk_instances:
    try:
      inst = instances[name]
    except KeyError:
      logging.info("Can't find instance '%s', maybe it was ignored", name)
      continue

    helpless = inst.status in HELPLESS_STATES
    if helpless or _CheckForOfflineNodes(nodes, inst):
      logging.info("Skipping instance '%s' because it is in a helpless state or"
                   " has offline secondaries", name)
      continue

    activate_ops.append(opcodes.OpInstanceActivateDisks(instance_name=name))

  if not activate_ops:
    return

  activate_job_id = cli.SendJob(activate_ops, cl=cl)

  try:
    cli.PollJob(activate_job_id, cl=cl, feedback_fn=logging.debug)
  except Exception: # pylint: disable=W0703
    logging.exception("Error while activating disks")
319 a8083063 Iustin Pop
320 a8083063 Iustin Pop
321 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
  """Connects to RAPI port and does a simple test.

  The test consists of asking the RAPI server on the given host for its
  version and comparing the answer with C{constants.RAPI_VERSION}.
  Certificate and API errors are logged as warnings and count as failure.

  @type hostname: string
  @param hostname: hostname of the node to connect to.
  @rtype: bool
  @return: Whether RAPI is working properly

  """
  rapi_client = \
    rapi.client.GanetiRapiClient(hostname,
                                 curl_config_fn=rapi.client.GenericCurlConfig())

  try:
    master_version = rapi_client.GetVersion()
  except rapi.client.CertificateError:
    # sys.exc_info()[1] is the exception being handled
    logging.warning("RAPI certificate error: %s", sys.exc_info()[1])
    return False
  except rapi.client.GanetiApiError:
    logging.warning("RAPI error: %s", sys.exc_info()[1])
    return False

  logging.debug("Reported RAPI version %s", master_version)
  return master_version == constants.RAPI_VERSION
347 db147305 Tom Limoncelli
348 db147305 Tom Limoncelli
349 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The job age option is converted through L{cli.ParseTimespec} before the
  options are returned. Positional arguments are rejected via the parser's
  C{error} method.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option(cli.DEBUG_OPT)
  parser.add_option(cli.NODEGROUP_OPT)
  parser.add_option("-A", "--job-age", dest="job_age", default=6 * 3600,
                    help="Autoarchive jobs older than this age (default"
                         " 6 hours)")
  parser.add_option("--ignore-pause", dest="ignore_pause", default=False,
                    action="store_true", help="Ignore cluster pause setting")
  parser.add_option("--wait-children", dest="wait_children",
                    action="store_true", help="Wait for child processes")
  parser.add_option("--no-wait-children", dest="wait_children",
                    action="store_false", help="Don't wait for child processes")
  # Both options above write to the same destination, so the default is set
  # on the parser; see optparse documentation for why default values are not
  # set by options
  parser.set_defaults(wait_children=True)

  (options, args) = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)

  if args:
    parser.error("No arguments expected")

  return (options, args)
380 a8083063 Iustin Pop
381 a8083063 Iustin Pop
382 9bb69bb5 Michael Hanselmann
def _WriteInstanceStatus(filename, data):
  """Writes the per-group instance status file.

  One line of the form "name status" is written per entry; the entries are
  sorted.

  @type filename: string
  @param filename: Path to instance status file
  @type data: list of tuple; (instance name as string, status as string)
  @param data: Instance name and status

  """
  logging.debug("Updating instance status file '%s' with %s instances",
                filename, len(data))

  lines = ["%s %s\n" % entry for entry in sorted(data)]

  utils.WriteFile(filename, data="".join(lines))
399 9bb69bb5 Michael Hanselmann
400 9bb69bb5 Michael Hanselmann
401 9bb69bb5 Michael Hanselmann
def _UpdateInstanceStatus(filename, instances):
  """Writes an instance status file from L{Instance} objects.

  Name and status of every instance are collected and handed to
  L{_WriteInstanceStatus}.

  @type filename: string
  @param filename: Path to status file
  @type instances: list of L{Instance}

  """
  status_data = []

  for inst in instances:
    status_data.append((inst.name, inst.status))

  _WriteInstanceStatus(filename, status_data)
411 9bb69bb5 Michael Hanselmann
412 9bb69bb5 Michael Hanselmann
413 9bb69bb5 Michael Hanselmann
class _StatCb:
  """Callable remembering the C{fstat} result of a file handle.

  """
  def __init__(self):
    """Starts out without any stat result.

    """
    # Result of the most recent os.fstat call; None until called
    self.st = None

  def __call__(self, fh):
    """Stores the C{fstat} result of the given file handle.

    @param fh: Object providing a C{fileno()} method

    """
    descriptor = fh.fileno()
    self.st = os.fstat(descriptor)
428 8f07dc0d Michael Hanselmann
429 9bb69bb5 Michael Hanselmann
430 9bb69bb5 Michael Hanselmann
def _ReadInstanceStatus(filename):
  """Reads an instance status file.

  Every line of the file is split at the first run of whitespace into
  instance name and status.

  @type filename: string
  @param filename: Path to status file
  @rtype: tuple; (None or number, list of lists containing instance name and
    status)
  @return: File's mtime and instance status contained in the file; both
    values are C{None} if file can't be read

  """
  logging.debug("Reading per-group instance status from '%s'", filename)

  # Captures the fstat result of the file handle used for reading
  statcb = _StatCb()

  try:
    content = utils.ReadFile(filename, preread=statcb)
  except EnvironmentError:
    # sys.exc_info()[1] is the exception being handled
    err = sys.exc_info()[1]
    if err.errno != errno.ENOENT:
      logging.exception("Unable to read '%s', ignoring", filename)
    else:
      logging.error("Can't read '%s', does not exist (yet)", filename)
    return (None, None)

  mtime = statcb.st.st_mtime

  entries = []
  for line in content.splitlines():
    entries.append(line.split(None, 1))

  return (mtime, entries)
455 9bb69bb5 Michael Hanselmann
456 9bb69bb5 Michael Hanselmann
457 9bb69bb5 Michael Hanselmann
def _MergeInstanceStatus(filename, pergroup_filename, groups):
  """Merges all per-group instance status files into a global one.

  If an instance shows up in more than one per-group file, the status from
  the file with the most recent mtime is used. Nothing is written if the
  lock on the global file can't be acquired in time.

  @type filename: string
  @param filename: Path to global instance status file
  @type pergroup_filename: string
  @param pergroup_filename: Path to per-group status files, must contain "%s"
    to be replaced with group UUID
  @type groups: sequence
  @param groups: UUIDs of known groups

  """
  # Lock global status file in exclusive mode
  lock = utils.FileLock.Open(filename)
  try:
    lock.Exclusive(blocking=True, timeout=INSTANCE_STATUS_LOCK_TIMEOUT)
  except errors.LockError:
    # All per-group processes will lock and update the file. None of them
    # should take longer than 10 seconds (the value of
    # INSTANCE_STATUS_LOCK_TIMEOUT).
    logging.error("Can't acquire lock on instance status file '%s', not"
                  " updating: %s", filename, sys.exc_info()[1])
    return

  logging.debug("Acquired exclusive lock on '%s'", filename)

  # Instance name -> list of (mtime, status), one entry per file listing it
  seen = {}

  # Load instance status from all groups
  for group_uuid in groups:
    (mtime, instdata) = _ReadInstanceStatus(pergroup_filename % group_uuid)

    if mtime is None:
      # File could not be read
      continue

    for (instance_name, status) in instdata:
      seen.setdefault(instance_name, []).append((mtime, status))

  # Select last update based on file mtime
  inststatus = []
  for (instance_name, candidates) in seen.items():
    (_, newest_status) = max(candidates)
    inststatus.append((instance_name, newest_status))

  # Write the global status file. Don't touch file after it's been
  # updated--there is no lock anymore.
  _WriteInstanceStatus(filename, inststatus)
500 8f07dc0d Michael Hanselmann
501 8f07dc0d Michael Hanselmann
502 16e0b9c9 Michael Hanselmann
def GetLuxiClient(try_restart):
  """Tries to connect to the master daemon.

  @type try_restart: bool
  @param try_restart: Whether to attempt to restart the master daemon
  @return: The client as returned by L{cli.GetClient}
  @raise NotMasterError: if L{cli.GetClient} fails with
    L{errors.OpPrereqError}
  @raise errors.GenericError: if the master daemon had to be restarted and
    that failed

  """
  try:
    return cli.GetClient()
  except errors.OpPrereqError:
    # this is, from cli.GetClient, a not-master case
    raise NotMasterError("Not on master node (%s)" % sys.exc_info()[1])
  except luxi.NoMasterError:
    if not try_restart:
      raise

    logging.warning("Master daemon seems to be down (%s), trying to restart",
                    sys.exc_info()[1])

    daemon_running = utils.EnsureDaemon(constants.MASTERD)
    if not daemon_running:
      raise errors.GenericError("Can't start the master daemon")

    # Retry the connection
    return cli.GetClient()
527 16e0b9c9 Michael Hanselmann
528 16e0b9c9 Michael Hanselmann
529 16e0b9c9 Michael Hanselmann
def _StartGroupChildren(cl, wait):
530 16e0b9c9 Michael Hanselmann
  """Starts a new instance of the watcher for every node group.
531 16e0b9c9 Michael Hanselmann

532 16e0b9c9 Michael Hanselmann
  """
533 16e0b9c9 Michael Hanselmann
  assert not compat.any(arg.startswith(cli.NODEGROUP_OPT_NAME)
534 16e0b9c9 Michael Hanselmann
                        for arg in sys.argv)
535 16e0b9c9 Michael Hanselmann
536 16e0b9c9 Michael Hanselmann
  result = cl.QueryGroups([], ["name", "uuid"], False)
537 16e0b9c9 Michael Hanselmann
538 16e0b9c9 Michael Hanselmann
  children = []
539 16e0b9c9 Michael Hanselmann
540 16e0b9c9 Michael Hanselmann
  for (idx, (name, uuid)) in enumerate(result):
541 16e0b9c9 Michael Hanselmann
    args = sys.argv + [cli.NODEGROUP_OPT_NAME, uuid]
542 16e0b9c9 Michael Hanselmann
543 16e0b9c9 Michael Hanselmann
    if idx > 0:
544 16e0b9c9 Michael Hanselmann
      # Let's not kill the system
545 16e0b9c9 Michael Hanselmann
      time.sleep(CHILD_PROCESS_DELAY)
546 16e0b9c9 Michael Hanselmann
547 16e0b9c9 Michael Hanselmann
    logging.debug("Spawning child for group '%s' (%s), arguments %s",
548 16e0b9c9 Michael Hanselmann
                  name, uuid, args)
549 16e0b9c9 Michael Hanselmann
550 16e0b9c9 Michael Hanselmann
    try:
551 16e0b9c9 Michael Hanselmann
      # TODO: Should utils.StartDaemon be used instead?
552 16e0b9c9 Michael Hanselmann
      pid = os.spawnv(os.P_NOWAIT, args[0], args)
553 b459a848 Andrea Spadaccini
    except Exception: # pylint: disable=W0703
554 16e0b9c9 Michael Hanselmann
      logging.exception("Failed to start child for group '%s' (%s)",
555 16e0b9c9 Michael Hanselmann
                        name, uuid)
556 16e0b9c9 Michael Hanselmann
    else:
557 16e0b9c9 Michael Hanselmann
      logging.debug("Started with PID %s", pid)
558 16e0b9c9 Michael Hanselmann
      children.append(pid)
559 16e0b9c9 Michael Hanselmann
560 16e0b9c9 Michael Hanselmann
  if wait:
561 16e0b9c9 Michael Hanselmann
    for pid in children:
562 16e0b9c9 Michael Hanselmann
      logging.debug("Waiting for child PID %s", pid)
563 16e0b9c9 Michael Hanselmann
      try:
564 16e0b9c9 Michael Hanselmann
        result = utils.RetryOnSignal(os.waitpid, pid, 0)
565 16e0b9c9 Michael Hanselmann
      except EnvironmentError, err:
566 16e0b9c9 Michael Hanselmann
        result = str(err)
567 16e0b9c9 Michael Hanselmann
568 16e0b9c9 Michael Hanselmann
      logging.debug("Child PID %s exited with status %s", pid, result)
569 16e0b9c9 Michael Hanselmann
570 16e0b9c9 Michael Hanselmann
571 16e0b9c9 Michael Hanselmann
def _ArchiveJobs(cl, age):
572 16e0b9c9 Michael Hanselmann
  """Archives old jobs.
573 16e0b9c9 Michael Hanselmann

574 16e0b9c9 Michael Hanselmann
  """
575 16e0b9c9 Michael Hanselmann
  (arch_count, left_count) = cl.AutoArchiveJobs(age)
576 16e0b9c9 Michael Hanselmann
  logging.debug("Archived %s jobs, left %s", arch_count, left_count)
577 16e0b9c9 Michael Hanselmann
578 16e0b9c9 Michael Hanselmann
579 16e0b9c9 Michael Hanselmann
def _CheckMaster(cl):
  """Verifies that this host is the master node of the cluster.

  @param cl: Client used to look up the configured master node
  @raise NotMasterError: If the configured master node differs from the
    local system name

  """
  (master_name, ) = cl.QueryConfigValues(["master_node"])

  if master_name == netutils.Hostname.GetSysName():
    return

  raise NotMasterError("This is not the master node")
586 16e0b9c9 Michael Hanselmann
587 16e0b9c9 Michael Hanselmann
588 2a7c3583 Michael Hanselmann
@rapi.client.UsesRapiClient
def _GlobalWatcher(opts):
  """Main function for global watcher.

  Starts the node daemons, runs the watcher hooks and node maintenance and,
  when running on the master node, makes sure the remote API daemon is
  running, archives old jobs and finally spawns a child process for every
  node group.

  @param opts: Parsed command line options; C{job_age} and C{wait_children}
    are used
  @return: L{constants.EXIT_SUCCESS}, also when not running on the master
    node
  @raise NotMasterError: If L{_CheckMaster} finds that this host is not the
    master node

  """
  StartNodeDaemons()
  RunWatcherHooks()

  # Run node maintenance in all cases, even if master, so that old masters can
  # be properly cleaned up
  if nodemaint.NodeMaintenance.ShouldRun(): # pylint: disable=E0602
    nodemaint.NodeMaintenance().Exec() # pylint: disable=E0602

  try:
    client = GetLuxiClient(True)
  except NotMasterError:
    # Don't proceed on non-master nodes
    return constants.EXIT_SUCCESS

  # we are on master now
  utils.EnsureDaemon(constants.RAPI)

  # If RAPI isn't responding to queries, try one restart
  logging.debug("Attempting to talk to remote API on %s",
                constants.IP4_ADDRESS_LOCALHOST)
  rapi_ok = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)
  if not rapi_ok:
    logging.warning("Couldn't get answer from remote API, restarting daemon")
    utils.StopDaemon(constants.RAPI)
    utils.EnsureDaemon(constants.RAPI)
    logging.debug("Second attempt to talk to remote API")
    rapi_ok = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)

  # Report success only if one of the attempts actually got an answer; a
  # non-responding RAPI is logged but doesn't stop the remaining work
  if rapi_ok:
    logging.debug("Successfully talked to remote API")
  else:
    logging.fatal("RAPI is not responding")

  _CheckMaster(client)
  _ArchiveJobs(client, opts.job_age)

  # Spawn child processes for all node groups
  _StartGroupChildren(client, opts.wait_children)

  return constants.EXIT_SUCCESS
631 16e0b9c9 Michael Hanselmann
632 16e0b9c9 Michael Hanselmann
633 16e0b9c9 Michael Hanselmann
def _GetGroupData(cl, uuid):
  """Queries the nodes and primary instances of a node group.

  Both queries are submitted as one job, which is archived once its
  results have been fetched. Instances whose secondary nodes are not all in
  the group of their primary node are logged and left out.

  @param cl: Luxi client
  @param uuid: UUID of the node group
  @return: Tuple of two dictionaries, the first mapping node names to
    L{Node} objects and the second instance names to L{Instance} objects

  """
  # All instances having their primary node in the group
  instance_query = \
    opcodes.OpQuery(what=constants.QR_INSTANCE,
                    fields=["name", "status", "admin_state", "snodes",
                            "pnode.group.uuid", "snodes.group.uuid"],
                    qfilter=[qlang.OP_EQUAL, "pnode.group.uuid", uuid],
                    use_locking=True)

  # All nodes in the group
  node_query = \
    opcodes.OpQuery(what=constants.QR_NODE,
                    fields=["name", "bootid", "offline"],
                    qfilter=[qlang.OP_EQUAL, "group.uuid", uuid],
                    use_locking=True)

  job_id = cl.SubmitJob([instance_query, node_query])
  responses = [objects.QueryResponse.FromDict(item)
               for item in cli.PollJob(job_id, cl=cl,
                                       feedback_fn=logging.debug)]
  cl.ArchiveJob(job_id)

  payloads = [response.data for response in responses]

  # Every field of every row has to be a sequence of two values
  assert compat.all(map(ht.TListOf(ht.TListOf(ht.TIsLength(2))), payloads))

  # Keep only the second value of each field, dropping the result status
  (raw_instances, raw_nodes) = [[[compat.snd(field) for field in row]
                                 for row in rows]
                                for rows in payloads]

  secondaries = {}
  instances_by_name = {}

  # Load all instances
  for (inst_name, status, autostart, snodes, pnode_group_uuid,
       snodes_group_uuid) in raw_instances:
    if snodes and set([pnode_group_uuid]) != set(snodes_group_uuid):
      logging.error("Ignoring split instance '%s', primary group %s, secondary"
                    " groups %s", inst_name, pnode_group_uuid,
                    utils.CommaJoin(snodes_group_uuid))
      continue

    inst = Instance(inst_name, status, autostart, snodes)
    instances_by_name[inst.name] = inst

    # Remember which instances use a node as secondary
    for snode_name in snodes:
      secondaries.setdefault(snode_name, set()).add(inst_name)

  # Load all nodes
  nodes_by_name = {}
  for (node_name, bootid, offline) in raw_nodes:
    node = Node(node_name, bootid, offline,
                secondaries.get(node_name, set()))
    nodes_by_name[node.name] = node

  return (nodes_by_name, instances_by_name)
689 16e0b9c9 Michael Hanselmann
690 16e0b9c9 Michael Hanselmann
691 9bb69bb5 Michael Hanselmann
def _LoadKnownGroups():
  """Returns a list of all node groups known by L{ssconf}.

  Only the first whitespace-separated word of every non-blank line in the
  node group list is used.

  @return: List of node group UUIDs
  @raise errors.GenericError: If one of the entries doesn't match
    L{utils.UUID_RE}

  """
  lines = ssconf.SimpleStore().GetNodegroupList()

  uuids = [line.split(None, 1)[0]
           for line in lines
           if line.strip()]

  for group_uuid in uuids:
    if not utils.UUID_RE.match(group_uuid):
      raise errors.GenericError("Ssconf contains invalid group UUID")

  return uuids
704 16e0b9c9 Michael Hanselmann
705 16e0b9c9 Michael Hanselmann
706 16e0b9c9 Michael Hanselmann
def _GroupWatcher(opts):
707 16e0b9c9 Michael Hanselmann
  """Main function for per-group watcher process.
708 16e0b9c9 Michael Hanselmann

709 16e0b9c9 Michael Hanselmann
  """
710 16e0b9c9 Michael Hanselmann
  group_uuid = opts.nodegroup.lower()
711 16e0b9c9 Michael Hanselmann
712 16e0b9c9 Michael Hanselmann
  if not utils.UUID_RE.match(group_uuid):
713 16e0b9c9 Michael Hanselmann
    raise errors.GenericError("Node group parameter (%s) must be given a UUID,"
714 16e0b9c9 Michael Hanselmann
                              " got '%s'" %
715 16e0b9c9 Michael Hanselmann
                              (cli.NODEGROUP_OPT_NAME, group_uuid))
716 16e0b9c9 Michael Hanselmann
717 16e0b9c9 Michael Hanselmann
  logging.info("Watcher for node group '%s'", group_uuid)
718 16e0b9c9 Michael Hanselmann
719 9bb69bb5 Michael Hanselmann
  known_groups = _LoadKnownGroups()
720 9bb69bb5 Michael Hanselmann
721 16e0b9c9 Michael Hanselmann
  # Check if node group is known
722 9bb69bb5 Michael Hanselmann
  if group_uuid not in known_groups:
723 16e0b9c9 Michael Hanselmann
    raise errors.GenericError("Node group '%s' is not known by ssconf" %
724 16e0b9c9 Michael Hanselmann
                              group_uuid)
725 16e0b9c9 Michael Hanselmann
726 9bb69bb5 Michael Hanselmann
  # Group UUID has been verified and should not contain any dangerous characters
727 16e0b9c9 Michael Hanselmann
  state_path = constants.WATCHER_GROUP_STATE_FILE % group_uuid
728 9bb69bb5 Michael Hanselmann
  inst_status_path = constants.WATCHER_GROUP_INSTANCE_STATUS_FILE % group_uuid
729 16e0b9c9 Michael Hanselmann
730 16e0b9c9 Michael Hanselmann
  logging.debug("Using state file %s", state_path)
731 16e0b9c9 Michael Hanselmann
732 16e0b9c9 Michael Hanselmann
  # Global watcher
733 b459a848 Andrea Spadaccini
  statefile = state.OpenStateFile(state_path) # pylint: disable=E0602
734 16e0b9c9 Michael Hanselmann
  if not statefile:
735 16e0b9c9 Michael Hanselmann
    return constants.EXIT_FAILURE
736 16e0b9c9 Michael Hanselmann
737 b459a848 Andrea Spadaccini
  notepad = state.WatcherState(statefile) # pylint: disable=E0602
738 16e0b9c9 Michael Hanselmann
  try:
739 16e0b9c9 Michael Hanselmann
    # Connect to master daemon
740 16e0b9c9 Michael Hanselmann
    client = GetLuxiClient(False)
741 16e0b9c9 Michael Hanselmann
742 16e0b9c9 Michael Hanselmann
    _CheckMaster(client)
743 16e0b9c9 Michael Hanselmann
744 16e0b9c9 Michael Hanselmann
    (nodes, instances) = _GetGroupData(client, group_uuid)
745 16e0b9c9 Michael Hanselmann
746 9bb69bb5 Michael Hanselmann
    # Update per-group instance status file
747 9bb69bb5 Michael Hanselmann
    _UpdateInstanceStatus(inst_status_path, instances.values())
748 9bb69bb5 Michael Hanselmann
749 9bb69bb5 Michael Hanselmann
    _MergeInstanceStatus(constants.INSTANCE_STATUS_FILE,
750 9bb69bb5 Michael Hanselmann
                         constants.WATCHER_GROUP_INSTANCE_STATUS_FILE,
751 9bb69bb5 Michael Hanselmann
                         known_groups)
752 9bb69bb5 Michael Hanselmann
753 16e0b9c9 Michael Hanselmann
    started = _CheckInstances(client, notepad, instances)
754 16e0b9c9 Michael Hanselmann
    _CheckDisks(client, notepad, nodes, instances, started)
755 16e0b9c9 Michael Hanselmann
    _VerifyDisks(client, group_uuid, nodes, instances)
756 16e0b9c9 Michael Hanselmann
  except Exception, err:
757 16e0b9c9 Michael Hanselmann
    logging.info("Not updating status file due to failure: %s", err)
758 16e0b9c9 Michael Hanselmann
    raise
759 16e0b9c9 Michael Hanselmann
  else:
760 16e0b9c9 Michael Hanselmann
    # Save changes for next run
761 16e0b9c9 Michael Hanselmann
    notepad.Save(state_path)
762 16e0b9c9 Michael Hanselmann
763 16e0b9c9 Michael Hanselmann
  return constants.EXIT_SUCCESS
764 16e0b9c9 Michael Hanselmann
765 16e0b9c9 Michael Hanselmann
766 9f4bb951 Michael Hanselmann
def Main():
767 a8083063 Iustin Pop
  """Main function.
768 a8083063 Iustin Pop

769 a8083063 Iustin Pop
  """
770 f0a80b01 Michael Hanselmann
  (options, _) = ParseOptions()
771 a8083063 Iustin Pop
772 cfcc79c6 Michael Hanselmann
  utils.SetupLogging(constants.LOG_WATCHER, sys.argv[0],
773 cfcc79c6 Michael Hanselmann
                     debug=options.debug, stderr_logging=options.debug)
774 a8083063 Iustin Pop
775 46c8a6ab Iustin Pop
  if ShouldPause() and not options.ignore_pause:
776 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
777 9f4bb951 Michael Hanselmann
    return constants.EXIT_SUCCESS
778 3753b2cb Michael Hanselmann
779 16e0b9c9 Michael Hanselmann
  # Try to acquire global watcher lock in shared mode
780 16e0b9c9 Michael Hanselmann
  lock = utils.FileLock.Open(constants.WATCHER_LOCK_FILE)
781 a8083063 Iustin Pop
  try:
782 16e0b9c9 Michael Hanselmann
    lock.Shared(blocking=False)
783 16e0b9c9 Michael Hanselmann
  except (EnvironmentError, errors.LockError), err:
784 16e0b9c9 Michael Hanselmann
    logging.error("Can't acquire lock on %s: %s",
785 16e0b9c9 Michael Hanselmann
                  constants.WATCHER_LOCK_FILE, err)
786 16e0b9c9 Michael Hanselmann
    return constants.EXIT_SUCCESS
787 db147305 Tom Limoncelli
788 16e0b9c9 Michael Hanselmann
  if options.nodegroup is None:
789 16e0b9c9 Michael Hanselmann
    fn = _GlobalWatcher
790 16e0b9c9 Michael Hanselmann
  else:
791 16e0b9c9 Michael Hanselmann
    # Per-nodegroup watcher
792 16e0b9c9 Michael Hanselmann
    fn = _GroupWatcher
793 16e0b9c9 Michael Hanselmann
794 16e0b9c9 Michael Hanselmann
  try:
795 16e0b9c9 Michael Hanselmann
    return fn(options)
796 16e0b9c9 Michael Hanselmann
  except (SystemExit, KeyboardInterrupt):
797 1b052f42 Michael Hanselmann
    raise
798 38242904 Iustin Pop
  except NotMasterError:
799 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
800 9f4bb951 Michael Hanselmann
    return constants.EXIT_NOTMASTER
801 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
802 013ce4ae Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting", err.args[0])
803 9f4bb951 Michael Hanselmann
    return constants.EXIT_NODESETUP_ERROR
804 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
805 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
806 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
807 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
808 438b45d4 Michael Hanselmann
  except Exception, err:
809 001b3825 Michael Hanselmann
    logging.exception(str(err))
810 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
811 5a3103e9 Michael Hanselmann
812 9f4bb951 Michael Hanselmann
  return constants.EXIT_SUCCESS