Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / __init__.py @ f7f03738

History | View | Annotate | Download (24.2 kB)

1 9f4bb951 Michael Hanselmann
#
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 40b068e5 Iustin Pop
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop

24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop

28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 cfcc79c6 Michael Hanselmann
import os.path
32 a8083063 Iustin Pop
import sys
33 a8083063 Iustin Pop
import time
34 438b45d4 Michael Hanselmann
import logging
35 16e0b9c9 Michael Hanselmann
import operator
36 9bb69bb5 Michael Hanselmann
import errno
37 a8083063 Iustin Pop
from optparse import OptionParser
38 a8083063 Iustin Pop
39 a8083063 Iustin Pop
from ganeti import utils
40 a8083063 Iustin Pop
from ganeti import constants
41 83e5e26f René Nussbaumer
from ganeti import compat
42 89e1fc26 Iustin Pop
from ganeti import errors
43 e125c67c Michael Hanselmann
from ganeti import opcodes
44 e125c67c Michael Hanselmann
from ganeti import cli
45 7dfb83c2 Iustin Pop
from ganeti import luxi
46 db147305 Tom Limoncelli
from ganeti import rapi
47 a744b676 Manuel Franceschini
from ganeti import netutils
48 16e0b9c9 Michael Hanselmann
from ganeti import qlang
49 16e0b9c9 Michael Hanselmann
from ganeti import objects
50 16e0b9c9 Michael Hanselmann
from ganeti import ssconf
51 16e0b9c9 Michael Hanselmann
from ganeti import ht
52 57fe4a5b Michael Hanselmann
from ganeti import pathutils
53 a8083063 Iustin Pop
54 b459a848 Andrea Spadaccini
import ganeti.rapi.client # pylint: disable=W0611
55 fc3f75dd Iustin Pop
from ganeti.rapi.client import UsesRapiClient
56 adf6301e Michael Hanselmann
57 adf6301e Michael Hanselmann
from ganeti.watcher import nodemaint
58 adf6301e Michael Hanselmann
from ganeti.watcher import state
59 db147305 Tom Limoncelli
60 a8083063 Iustin Pop
61 5a3103e9 Michael Hanselmann
#: Restart attempt count at which the watcher gives up on an instance
MAXTRIES = 5

#: Instance states for which a restart is attempted
BAD_STATES = compat.UniqueFrozenset([constants.INSTST_ERRORDOWN])

#: Instance states for which disk activation is skipped and no restart
#: success is reported
HELPLESS_STATES = compat.UniqueFrozenset([
  constants.INSTST_NODEDOWN,
  constants.INSTST_NODEOFFLINE,
  ])

NOTICE = "NOTICE"
ERROR = "ERROR"

#: Number of seconds to wait between starting child processes for node groups
CHILD_PROCESS_DELAY = 1.0

#: How many seconds to wait for instance status file lock
INSTANCE_STATUS_LOCK_TIMEOUT = 10.0
77 9bb69bb5 Michael Hanselmann
78 e125c67c Michael Hanselmann
79 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher is not running on the master node.

  """
81 a8083063 Iustin Pop
82 a8083063 Iustin Pop
83 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher is currently paused.

  @rtype: bool
  @return: Truth value of what L{utils.ReadWatcherPauseFile} returns for the
    watcher pause file

  """
  pause_info = utils.ReadWatcherPauseFile(pathutils.WATCHER_PAUSEFILE)
  return bool(pause_info)
88 3753b2cb Michael Hanselmann
89 3753b2cb Michael Hanselmann
90 f1115454 Guido Trotter
def StartNodeDaemons():
  """Make sure the daemons needed on every node are running.

  """
  # The node daemon is wanted whether or not this node is the master
  wanted = [constants.NODED]

  # confd is started as well; on non candidates it will be in disabled mode
  if constants.ENABLE_CONFD:
    wanted.append(constants.CONFD)

  for daemon_name in wanted:
    utils.EnsureDaemon(daemon_name)
99 f1115454 Guido Trotter
100 f1115454 Guido Trotter
101 9e289e36 Guido Trotter
def RunWatcherHooks():
  """Run the watcher hooks.

  Hands the watcher hooks directory to L{utils.RunParts} and logs the outcome
  reported for each hook. Nothing is done if the directory doesn't exist, and
  a failure of RunParts itself is only logged.

  @raise errors.ProgrammerError: If RunParts reports a status other than
    skipped, error or run

  """
  hooks_dir = utils.PathJoin(pathutils.HOOKS_BASE_DIR,
                             constants.HOOKS_NAME_WATCHER)
  if not os.path.isdir(hooks_dir):
    return

  try:
    results = utils.RunParts(hooks_dir)
  except Exception: # pylint: disable=W0703
    # Hooks are best effort. The exception object is looked up through
    # sys.exc_info() so no version-specific "except" binding is needed.
    logging.exception("RunParts %s failed: %s", hooks_dir, sys.exc_info()[1])
    return

  for (relname, status, runresult) in results:
    if status == constants.RUNPARTS_SKIP:
      logging.debug("Watcher hook %s: skipped", relname)
    elif status == constants.RUNPARTS_ERR:
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
    elif status == constants.RUNPARTS_RUN:
      if runresult.failed:
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
                        relname, runresult.exit_code, runresult.output)
      else:
        logging.debug("Watcher hook %s: success (output: %s)", relname,
                      runresult.output)
    else:
      # Unlike the logging calls above, exception arguments are not run
      # through string formatting, so the message is interpolated here
      raise errors.ProgrammerError("Unknown status %s returned by RunParts" %
                                   (status, ))
131 9e289e36 Guido Trotter
132 001b3825 Michael Hanselmann
133 a8083063 Iustin Pop
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Keeps the instance data used by the watcher and submits opcodes acting on
  the instance.

  """
  def __init__(self, name, status, autostart, snodes):
    """Stores the instance properties.

    @param name: Instance name
    @param status: Instance status
    @param autostart: Whether the instance is an autostart instance
    @param snodes: Names of the instance's secondary nodes

    """
    self.name = name
    self.status = status
    self.autostart = autostart
    self.snodes = snodes

  def Restart(self, cl):
    """Submits a non-forced startup opcode for this instance.

    @param cl: Client handed on to L{cli.SubmitOpCode}

    """
    startup_op = opcodes.OpInstanceStartup(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=cl)

  def ActivateDisks(self, cl):
    """Submits a disk activation opcode for this instance.

    @param cl: Client handed on to L{cli.SubmitOpCode}

    """
    activate_op = opcodes.OpInstanceActivateDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=cl)
156 a8083063 Iustin Pop
157 a8083063 Iustin Pop
158 16e0b9c9 Michael Hanselmann
class Node(object):
  """Data container representing cluster node.

  Derives from C{object} like L{Instance}, so both containers are new-style
  classes.

  """
  def __init__(self, name, bootid, offline, secondaries):
    """Initializes this class.

    @param name: Node name
    @param bootid: Boot ID reported for the node; a false value means none
      was returned
    @param offline: Whether the node is offline
    @param secondaries: Names of instances having this node as a secondary

    """
    self.name = name
    self.bootid = bootid
    self.offline = offline
    self.secondaries = secondaries
170 5a3103e9 Michael Hanselmann
171 78f44650 Iustin Pop
172 16e0b9c9 Michael Hanselmann
def _CheckInstances(cl, notepad, instances):
  """Make a pass over the list of instances, restarting downed ones.

  @param cl: Client used for submitting the startup opcodes
  @param notepad: State object tracking restart attempts per instance
  @type instances: dict
  @param instances: L{Instance} objects, indexed by instance name
  @rtype: set
  @return: Names of instances whose restart was submitted without error

  """
  notepad.MaintainInstanceList(instances.keys())

  restarted = set()

  for inst in instances.values():
    if inst.status not in BAD_STATES:
      # Nothing to restart; drop any attempts recorded on earlier runs
      if notepad.NumberOfRestartAttempts(inst.name):
        notepad.RemoveInstance(inst.name)
        if inst.status not in HELPLESS_STATES:
          logging.info("Restart of instance '%s' succeeded", inst.name)
      continue

    attempts = notepad.NumberOfRestartAttempts(inst.name)

    if attempts > MAXTRIES:
      logging.warning("Not restarting instance '%s', retries exhausted",
                      inst.name)
      continue

    if attempts == MAXTRIES:
      # NOTE(review): presumably recording one more attempt moves the counter
      # past MAXTRIES, so later runs only emit the warning above
      notepad.RecordRestartAttempt(inst.name)
      logging.error("Could not restart instance '%s' after %s attempts,"
                    " giving up", inst.name, MAXTRIES)
      continue

    try:
      logging.info("Restarting instance '%s' (attempt #%s)",
                   inst.name, attempts + 1)
      inst.Restart(cl)
    except Exception: # pylint: disable=W0703
      logging.exception("Error while restarting instance '%s'", inst.name)
    else:
      restarted.add(inst.name)

    # The attempt is recorded whether or not the restart went through
    notepad.RecordRestartAttempt(inst.name)

  return restarted
213 a8083063 Iustin Pop
214 a8083063 Iustin Pop
215 16e0b9c9 Michael Hanselmann
def _CheckDisks(cl, notepad, nodes, instances, started):
  """Check all nodes for restarted ones.

  For every node whose boot ID differs from the one kept in the notepad, the
  disks of its secondary instances are activated and the new boot ID is
  stored afterwards.

  @param cl: Client used for submitting the disk activation opcodes
  @param notepad: State object holding the last known boot IDs
  @type nodes: dict
  @param nodes: L{Node} objects
  @type instances: dict
  @param instances: L{Instance} objects, indexed by instance name
  @param started: Names of instances a start was already attempted for

  """
  rebooted = []

  for node in nodes.values():
    previous_bootid = notepad.GetNodeBootID(node.name)

    if not node.bootid:
      # Bad node, not returning a boot id
      if not node.offline:
        logging.debug("Node '%s' missing boot ID, skipping secondary checks",
                      node.name)
    elif previous_bootid != node.bootid:
      # Node's boot ID has changed, probably through a reboot
      rebooted.append(node)

  if not rebooted:
    return

  # Activate disks for all instances with any of the checked nodes as a
  # secondary node.
  for node in rebooted:
    for instance_name in node.secondaries:
      if instance_name not in instances:
        logging.info("Can't find instance '%s', maybe it was ignored",
                     instance_name)
        continue

      inst = instances[instance_name]

      if not inst.autostart:
        logging.info("Skipping disk activation for non-autostart"
                     " instance '%s'", inst.name)
        continue

      if inst.name in started:
        # we already tried to start the instance, which should have
        # activated its drives (if they can be at all)
        logging.debug("Skipping disk activation for instance '%s' as"
                      " it was already started", inst.name)
        continue

      try:
        logging.info("Activating disks for instance '%s'", inst.name)
        inst.ActivateDisks(cl)
      except Exception: # pylint: disable=W0703
        logging.exception("Error while activating disks for instance '%s'",
                          inst.name)

  # Keep changed boot IDs
  for node in rebooted:
    notepad.SetNodeBootID(node.name, node.bootid)
268 a8083063 Iustin Pop
269 83e5e26f René Nussbaumer
270 16e0b9c9 Michael Hanselmann
def _CheckForOfflineNodes(nodes, instance):
271 16e0b9c9 Michael Hanselmann
  """Checks if given instances has any secondary in offline status.
272 ae1a845c Michael Hanselmann

273 16e0b9c9 Michael Hanselmann
  @param instance: The instance object
274 16e0b9c9 Michael Hanselmann
  @return: True if any of the secondary is offline, False otherwise
275 ae1a845c Michael Hanselmann

276 16e0b9c9 Michael Hanselmann
  """
277 16e0b9c9 Michael Hanselmann
  return compat.any(nodes[node_name].offline for node_name in instance.snodes)
278 ae1a845c Michael Hanselmann
279 ae1a845c Michael Hanselmann
280 16e0b9c9 Michael Hanselmann
def _VerifyDisks(cl, uuid, nodes, instances):
  """Run a per-group "gnt-cluster verify-disks".

  Submits a disk verification job for the group and, for the instances it
  reports, one further job activating their disks.

  @param cl: Client used for submitting and polling jobs
  @param uuid: Node group identifier, passed as the opcode's group name
  @type nodes: dict
  @param nodes: Node objects, indexed by node name
  @type instances: dict
  @param instances: Instance objects, indexed by instance name

  """
  verify_job = cl.SubmitJob([opcodes.OpGroupVerifyDisks(group_name=uuid)])
  verify_result = cli.PollJob(verify_job, cl=cl, feedback_fn=logging.debug)
  # Only the middle element of the single opcode result is of interest
  ((_, offline_disk_instances, _), ) = verify_result
  cl.ArchiveJob(verify_job)

  if not offline_disk_instances:
    # nothing to do
    logging.debug("Verify-disks reported no offline disks, nothing to do")
    return

  logging.debug("Will activate disks for instance(s) %s",
                utils.CommaJoin(offline_disk_instances))

  # We submit only one job, and wait for it. Not optimal, but this puts less
  # load on the job queue.
  activate_ops = []

  for name in offline_disk_instances:
    if name not in instances:
      logging.info("Can't find instance '%s', maybe it was ignored", name)
      continue

    inst = instances[name]

    if inst.status in HELPLESS_STATES or _CheckForOfflineNodes(nodes, inst):
      logging.info("Skipping instance '%s' because it is in a helpless state"
                   " or has offline secondaries", name)
      continue

    activate_ops.append(opcodes.OpInstanceActivateDisks(instance_name=name))

  if not activate_ops:
    return

  job_id = cli.SendJob(activate_ops, cl=cl)

  try:
    cli.PollJob(job_id, cl=cl, feedback_fn=logging.debug)
  except Exception: # pylint: disable=W0703
    logging.exception("Error while activating disks")
321 a8083063 Iustin Pop
322 a8083063 Iustin Pop
323 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
  """Connects to RAPI port and does a simple test.

  Connects to RAPI port of hostname and does a simple test. At this time, the
  test is GetVersion.

  @type hostname: string
  @param hostname: hostname of the node to connect to.
  @rtype: bool
  @return: Whether RAPI is working properly, i.e. the reported version equals
    L{constants.RAPI_VERSION}

  """
  curl_config = rapi.client.GenericCurlConfig()
  rapi_client = rapi.client.GanetiRapiClient(hostname,
                                             curl_config_fn=curl_config)

  # The caught exception is fetched through sys.exc_info() so that no
  # version-specific "except" binding syntax is needed
  try:
    master_version = rapi_client.GetVersion()
  except rapi.client.CertificateError:
    logging.warning("RAPI certificate error: %s", sys.exc_info()[1])
    return False
  except rapi.client.GanetiApiError:
    logging.warning("RAPI error: %s", sys.exc_info()[1])
    return False

  logging.debug("Reported RAPI version %s", master_version)
  return master_version == constants.RAPI_VERSION
349 db147305 Tom Limoncelli
350 db147305 Tom Limoncelli
351 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  @return: (options, args) as from OptionParser.parse_args(); the job age
    option has been passed through L{cli.ParseTimespec}

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]", version=version_text)

  for predefined_opt in (cli.DEBUG_OPT, cli.NODEGROUP_OPT):
    parser.add_option(predefined_opt)

  parser.add_option("-A", "--job-age", dest="job_age", default=6 * 3600,
                    help="Autoarchive jobs older than this age (default"
                         " 6 hours)")
  parser.add_option("--ignore-pause", dest="ignore_pause", default=False,
                    action="store_true", help="Ignore cluster pause setting")

  # Both switches write to the same destination. See optparse documentation
  # for why default values are not set by options.
  parser.add_option("--wait-children", dest="wait_children",
                    action="store_true", help="Wait for child processes")
  parser.add_option("--no-wait-children", dest="wait_children",
                    action="store_false",
                    help="Don't wait for child processes")
  parser.set_defaults(wait_children=True)

  (options, args) = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)

  if args:
    parser.error("No arguments expected")

  return (options, args)
383 a8083063 Iustin Pop
384 a8083063 Iustin Pop
385 9bb69bb5 Michael Hanselmann
def _WriteInstanceStatus(filename, data):
  """Writes the per-group instance status file.

  One line of the form "name status" is written per entry; the entries are
  sorted.

  @type filename: string
  @param filename: Path to instance status file
  @type data: list of tuple; (instance name as string, status as string)
  @param data: Instance name and status

  """
  logging.debug("Updating instance status file '%s' with %s instances",
                filename, len(data))

  lines = ["%s %s\n" % entry for entry in sorted(data)]

  utils.WriteFile(filename, data="".join(lines))
402 9bb69bb5 Michael Hanselmann
403 9bb69bb5 Michael Hanselmann
404 9bb69bb5 Michael Hanselmann
def _UpdateInstanceStatus(filename, instances):
  """Writes an instance status file from L{Instance} objects.

  @type filename: string
  @param filename: Path to status file
  @type instances: list of L{Instance}
  @param instances: Instances whose name and status are to be written

  """
  status_pairs = []

  for inst in instances:
    status_pairs.append((inst.name, inst.status))

  _WriteInstanceStatus(filename, status_pairs)
414 9bb69bb5 Michael Hanselmann
415 9bb69bb5 Michael Hanselmann
416 9bb69bb5 Michael Hanselmann
def _ReadInstanceStatus(filename):
  """Reads an instance status file.

  @type filename: string
  @param filename: Path to status file
  @rtype: tuple; (None or number, list of lists containing instance name and
    status)
  @return: File's mtime and instance status contained in the file; both
    values are C{None} if file can't be read

  """
  logging.debug("Reading per-group instance status from '%s'", filename)

  statcb = utils.FileStatHelper()
  try:
    content = utils.ReadFile(filename, preread=statcb)
  except EnvironmentError:
    # Fetched through sys.exc_info() so that no version-specific "except"
    # binding syntax is needed
    read_err = sys.exc_info()[1]
    if read_err.errno == errno.ENOENT:
      logging.error("Can't read '%s', does not exist (yet)", filename)
    else:
      logging.exception("Unable to read '%s', ignoring", filename)
    return (None, None)

  # Each line is split at its first run of whitespace
  entries = [line.split(None, 1) for line in content.splitlines()]

  return (statcb.st.st_mtime, entries)
441 9bb69bb5 Michael Hanselmann
442 9bb69bb5 Michael Hanselmann
443 9bb69bb5 Michael Hanselmann
def _MergeInstanceStatus(filename, pergroup_filename, groups):
  """Merges all per-group instance status files into a global one.

  If an instance shows up in several per-group files, the status from the
  file with the newest mtime is used.

  @type filename: string
  @param filename: Path to global instance status file
  @type pergroup_filename: string
  @param pergroup_filename: Path to per-group status files, must contain "%s"
    to be replaced with group UUID
  @type groups: sequence
  @param groups: UUIDs of known groups

  """
  # Lock global status file in exclusive mode
  lock = utils.FileLock.Open(filename)
  try:
    lock.Exclusive(blocking=True, timeout=INSTANCE_STATUS_LOCK_TIMEOUT)
  except errors.LockError:
    # All per-group processes will lock and update the file. None of them
    # should take longer than 10 seconds (the value of
    # INSTANCE_STATUS_LOCK_TIMEOUT).
    logging.error("Can't acquire lock on instance status file '%s', not"
                  " updating: %s", filename, sys.exc_info()[1])
    return

  logging.debug("Acquired exclusive lock on '%s'", filename)

  # Load instance status from all groups, keeping every (mtime, status) pair
  # seen for an instance
  candidates = {}

  for group_uuid in groups:
    (mtime, instdata) = _ReadInstanceStatus(pergroup_filename % group_uuid)

    if mtime is None:
      # File couldn't be read
      continue

    for (instance_name, status) in instdata:
      candidates.setdefault(instance_name, []).append((mtime, status))

  # Select last update based on file mtime; the greatest (mtime, status) pair
  # is the one a descending sort would put first
  inststatus = []
  for (instance_name, seen) in candidates.items():
    inststatus.append((instance_name, max(seen)[1]))

  # Write the global status file. Don't touch file after it's been
  # updated--there is no lock anymore.
  _WriteInstanceStatus(filename, inststatus)
486 8f07dc0d Michael Hanselmann
487 8f07dc0d Michael Hanselmann
488 16e0b9c9 Michael Hanselmann
def GetLuxiClient(try_restart):
  """Tries to connect to the master daemon.

  @type try_restart: bool
  @param try_restart: Whether to attempt to restart the master daemon
  @return: The client returned by L{cli.GetClient}
  @raise NotMasterError: If L{cli.GetClient} fails with a prerequisite error
  @raise errors.GenericError: If the master daemon can't be started

  """
  try:
    return cli.GetClient()
  except errors.OpPrereqError:
    # this is, from cli.GetClient, a not-master case
    raise NotMasterError("Not on master node (%s)" % sys.exc_info()[1])
  except luxi.NoMasterError:
    if not try_restart:
      raise

    # Fetched through sys.exc_info() so that no version-specific "except"
    # binding syntax is needed
    master_err = sys.exc_info()[1]

  logging.warning("Master daemon seems to be down (%s), trying to restart",
                  master_err)

  if not utils.EnsureDaemon(constants.MASTERD):
    raise errors.GenericError("Can't start the master daemon")

  # Retry the connection
  return cli.GetClient()
513 16e0b9c9 Michael Hanselmann
514 16e0b9c9 Michael Hanselmann
515 16e0b9c9 Michael Hanselmann
def _StartGroupChildren(cl, wait):
516 16e0b9c9 Michael Hanselmann
  """Starts a new instance of the watcher for every node group.
517 16e0b9c9 Michael Hanselmann

518 16e0b9c9 Michael Hanselmann
  """
519 16e0b9c9 Michael Hanselmann
  assert not compat.any(arg.startswith(cli.NODEGROUP_OPT_NAME)
520 16e0b9c9 Michael Hanselmann
                        for arg in sys.argv)
521 16e0b9c9 Michael Hanselmann
522 16e0b9c9 Michael Hanselmann
  result = cl.QueryGroups([], ["name", "uuid"], False)
523 16e0b9c9 Michael Hanselmann
524 16e0b9c9 Michael Hanselmann
  children = []
525 16e0b9c9 Michael Hanselmann
526 16e0b9c9 Michael Hanselmann
  for (idx, (name, uuid)) in enumerate(result):
527 16e0b9c9 Michael Hanselmann
    args = sys.argv + [cli.NODEGROUP_OPT_NAME, uuid]
528 16e0b9c9 Michael Hanselmann
529 16e0b9c9 Michael Hanselmann
    if idx > 0:
530 16e0b9c9 Michael Hanselmann
      # Let's not kill the system
531 16e0b9c9 Michael Hanselmann
      time.sleep(CHILD_PROCESS_DELAY)
532 16e0b9c9 Michael Hanselmann
533 16e0b9c9 Michael Hanselmann
    logging.debug("Spawning child for group '%s' (%s), arguments %s",
534 16e0b9c9 Michael Hanselmann
                  name, uuid, args)
535 16e0b9c9 Michael Hanselmann
536 16e0b9c9 Michael Hanselmann
    try:
537 16e0b9c9 Michael Hanselmann
      # TODO: Should utils.StartDaemon be used instead?
538 16e0b9c9 Michael Hanselmann
      pid = os.spawnv(os.P_NOWAIT, args[0], args)
539 b459a848 Andrea Spadaccini
    except Exception: # pylint: disable=W0703
540 16e0b9c9 Michael Hanselmann
      logging.exception("Failed to start child for group '%s' (%s)",
541 16e0b9c9 Michael Hanselmann
                        name, uuid)
542 16e0b9c9 Michael Hanselmann
    else:
543 16e0b9c9 Michael Hanselmann
      logging.debug("Started with PID %s", pid)
544 16e0b9c9 Michael Hanselmann
      children.append(pid)
545 16e0b9c9 Michael Hanselmann
546 16e0b9c9 Michael Hanselmann
  if wait:
547 16e0b9c9 Michael Hanselmann
    for pid in children:
548 16e0b9c9 Michael Hanselmann
      logging.debug("Waiting for child PID %s", pid)
549 16e0b9c9 Michael Hanselmann
      try:
550 16e0b9c9 Michael Hanselmann
        result = utils.RetryOnSignal(os.waitpid, pid, 0)
551 16e0b9c9 Michael Hanselmann
      except EnvironmentError, err:
552 16e0b9c9 Michael Hanselmann
        result = str(err)
553 16e0b9c9 Michael Hanselmann
554 16e0b9c9 Michael Hanselmann
      logging.debug("Child PID %s exited with status %s", pid, result)
555 16e0b9c9 Michael Hanselmann
556 16e0b9c9 Michael Hanselmann
557 16e0b9c9 Michael Hanselmann
def _ArchiveJobs(cl, age):
  """Archives old jobs through the given client and logs the outcome.

  @param cl: Client providing C{AutoArchiveJobs}
  @param age: Age limit, passed unchanged to C{AutoArchiveJobs}

  """
  counts = cl.AutoArchiveJobs(age)
  (archived, remaining) = counts
  logging.debug("Archived %s jobs, left %s", archived, remaining)
563 16e0b9c9 Michael Hanselmann
564 16e0b9c9 Michael Hanselmann
565 16e0b9c9 Michael Hanselmann
def _CheckMaster(cl):
  """Verifies that the current host is the master node.

  @param cl: Client used to look up the C{master_node} configuration value
  @raise NotMasterError: If the configured master node differs from the local
    system name

  """
  # Exactly one value is expected for the single requested field
  [master_name] = cl.QueryConfigValues(["master_node"])

  if master_name == netutils.Hostname.GetSysName():
    return

  raise NotMasterError("This is not the master node")
572 16e0b9c9 Michael Hanselmann
573 16e0b9c9 Michael Hanselmann
574 fc3f75dd Iustin Pop
@UsesRapiClient
def _GlobalWatcher(opts):
  """Main function for global watcher.

  Calls L{StartNodeDaemons} and L{RunWatcherHooks} and runs node maintenance
  if required. If a client connection can be obtained (i.e. on the master
  node), the remote API daemon is checked and restarted once if it does not
  answer, old jobs are archived and, at the end, child processes are spawned
  for every node group.

  @param opts: Parsed command line options; C{job_age} and C{wait_children}
    are read
  @return: L{constants.EXIT_SUCCESS}, also when not running on the master
    node

  """
  StartNodeDaemons()
  RunWatcherHooks()

  # Run node maintenance in all cases, even if master, so that old masters can
  # be properly cleaned up
  if nodemaint.NodeMaintenance.ShouldRun(): # pylint: disable=E0602
    nodemaint.NodeMaintenance().Exec() # pylint: disable=E0602

  try:
    client = GetLuxiClient(True)
  except NotMasterError:
    # Don't proceed on non-master nodes
    return constants.EXIT_SUCCESS

  # we are on master now
  utils.EnsureDaemon(constants.RAPI)

  # If RAPI isn't responding to queries, try one restart
  logging.debug("Attempting to talk to remote API on %s",
                constants.IP4_ADDRESS_LOCALHOST)
  rapi_ok = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)
  if not rapi_ok:
    logging.warning("Couldn't get answer from remote API, restaring daemon")
    utils.StopDaemon(constants.RAPI)
    utils.EnsureDaemon(constants.RAPI)
    logging.debug("Second attempt to talk to remote API")
    rapi_ok = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)

  # Only report success if one of the probes actually got an answer. An
  # unresponsive RAPI is logged, but the remaining master tasks still run.
  if rapi_ok:
    logging.debug("Successfully talked to remote API")
  else:
    logging.fatal("RAPI is not responding")

  _CheckMaster(client)
  _ArchiveJobs(client, opts.job_age)

  # Spawn child processes for all node groups
  _StartGroupChildren(client, opts.wait_children)

  return constants.EXIT_SUCCESS
617 16e0b9c9 Michael Hanselmann
618 16e0b9c9 Michael Hanselmann
619 16e0b9c9 Michael Hanselmann
def _GetGroupData(cl, uuid):
  """Retrieves instances and nodes per node group.

  One job containing two queries is submitted: the first selects instances
  whose primary node group matches C{uuid}, the second selects the nodes of
  that group. The job is archived once its results have been fetched.
  Instances with secondary nodes outside their primary node's group are
  logged and left out of the result.

  @param cl: Client used to submit, poll and archive the query job
  @param uuid: Node group UUID used in both query filters
  @rtype: tuple
  @return: Two dictionaries; the first maps node names to L{Node} objects,
    the second maps instance names to L{Instance} objects

  """
  # All instances whose primary node is in the group
  instance_query = \
    opcodes.OpQuery(what=constants.QR_INSTANCE,
                    fields=["name", "status", "admin_state", "snodes",
                            "pnode.group.uuid", "snodes.group.uuid"],
                    qfilter=[qlang.OP_EQUAL, "pnode.group.uuid", uuid],
                    use_locking=True)

  # All nodes in the group
  node_query = \
    opcodes.OpQuery(what=constants.QR_NODE,
                    fields=["name", "bootid", "offline"],
                    qfilter=[qlang.OP_EQUAL, "group.uuid", uuid],
                    use_locking=True)

  job_id = cl.SubmitJob([instance_query, node_query])
  responses = [objects.QueryResponse.FromDict(response)
               for response in cli.PollJob(job_id, cl=cl,
                                           feedback_fn=logging.debug)]
  cl.ArchiveJob(job_id)

  results_data = [response.data for response in responses]

  # Ensure results are tuples with two values
  assert compat.all(map(ht.TListOf(ht.TListOf(ht.TIsLength(2))), results_data))

  # Keep only the second item of every field, ignoring the result status
  (raw_instances, raw_nodes) = [[[compat.snd(field) for field in row]
                                 for row in rows]
                                for rows in results_data]

  secondaries = {}
  instances = []

  # Load all instances
  for row in raw_instances:
    (inst_name, status, autostart, snodes, pnode_group_uuid,
     snodes_group_uuid) = row

    if snodes and set([pnode_group_uuid]) != set(snodes_group_uuid):
      logging.error("Ignoring split instance '%s', primary group %s, secondary"
                    " groups %s", inst_name, pnode_group_uuid,
                    utils.CommaJoin(snodes_group_uuid))
      continue

    instances.append(Instance(inst_name, status, autostart, snodes))

    # Remember for every secondary node which instances it hosts
    for snode in snodes:
      secondaries.setdefault(snode, set()).add(inst_name)

  # Load all nodes
  nodes = []
  for (node_name, bootid, offline) in raw_nodes:
    nodes.append(Node(node_name, bootid, offline,
                      secondaries.get(node_name, set())))

  nodes_by_name = dict((node.name, node) for node in nodes)
  instances_by_name = dict((inst.name, inst) for inst in instances)

  return (nodes_by_name, instances_by_name)
675 16e0b9c9 Michael Hanselmann
676 16e0b9c9 Michael Hanselmann
677 9bb69bb5 Michael Hanselmann
def _LoadKnownGroups():
  """Returns a list of all node groups known by L{ssconf}.

  Only the first whitespace-separated field of every non-blank line in the
  node group list is kept.

  @rtype: list
  @return: First field of every non-blank node group line
  @raise errors.GenericError: If any returned entry does not match
    L{utils.UUID_RE}

  """
  lines = ssconf.SimpleStore().GetNodegroupList()

  known = []
  for line in lines:
    if line.strip():
      known.append(line.split(None, 1)[0])

  for entry in known:
    if not utils.UUID_RE.match(entry):
      raise errors.GenericError("Ssconf contains invalid group UUID")

  return known
690 16e0b9c9 Michael Hanselmann
691 16e0b9c9 Michael Hanselmann
692 16e0b9c9 Michael Hanselmann
def _GroupWatcher(opts):
693 16e0b9c9 Michael Hanselmann
  """Main function for per-group watcher process.
694 16e0b9c9 Michael Hanselmann

695 16e0b9c9 Michael Hanselmann
  """
696 16e0b9c9 Michael Hanselmann
  group_uuid = opts.nodegroup.lower()
697 16e0b9c9 Michael Hanselmann
698 16e0b9c9 Michael Hanselmann
  if not utils.UUID_RE.match(group_uuid):
699 16e0b9c9 Michael Hanselmann
    raise errors.GenericError("Node group parameter (%s) must be given a UUID,"
700 16e0b9c9 Michael Hanselmann
                              " got '%s'" %
701 16e0b9c9 Michael Hanselmann
                              (cli.NODEGROUP_OPT_NAME, group_uuid))
702 16e0b9c9 Michael Hanselmann
703 16e0b9c9 Michael Hanselmann
  logging.info("Watcher for node group '%s'", group_uuid)
704 16e0b9c9 Michael Hanselmann
705 9bb69bb5 Michael Hanselmann
  known_groups = _LoadKnownGroups()
706 9bb69bb5 Michael Hanselmann
707 16e0b9c9 Michael Hanselmann
  # Check if node group is known
708 9bb69bb5 Michael Hanselmann
  if group_uuid not in known_groups:
709 16e0b9c9 Michael Hanselmann
    raise errors.GenericError("Node group '%s' is not known by ssconf" %
710 16e0b9c9 Michael Hanselmann
                              group_uuid)
711 16e0b9c9 Michael Hanselmann
712 40b068e5 Iustin Pop
  # Group UUID has been verified and should not contain any dangerous
713 40b068e5 Iustin Pop
  # characters
714 57fe4a5b Michael Hanselmann
  state_path = pathutils.WATCHER_GROUP_STATE_FILE % group_uuid
715 57fe4a5b Michael Hanselmann
  inst_status_path = pathutils.WATCHER_GROUP_INSTANCE_STATUS_FILE % group_uuid
716 16e0b9c9 Michael Hanselmann
717 16e0b9c9 Michael Hanselmann
  logging.debug("Using state file %s", state_path)
718 16e0b9c9 Michael Hanselmann
719 16e0b9c9 Michael Hanselmann
  # Global watcher
720 b459a848 Andrea Spadaccini
  statefile = state.OpenStateFile(state_path) # pylint: disable=E0602
721 16e0b9c9 Michael Hanselmann
  if not statefile:
722 16e0b9c9 Michael Hanselmann
    return constants.EXIT_FAILURE
723 16e0b9c9 Michael Hanselmann
724 b459a848 Andrea Spadaccini
  notepad = state.WatcherState(statefile) # pylint: disable=E0602
725 16e0b9c9 Michael Hanselmann
  try:
726 16e0b9c9 Michael Hanselmann
    # Connect to master daemon
727 16e0b9c9 Michael Hanselmann
    client = GetLuxiClient(False)
728 16e0b9c9 Michael Hanselmann
729 16e0b9c9 Michael Hanselmann
    _CheckMaster(client)
730 16e0b9c9 Michael Hanselmann
731 16e0b9c9 Michael Hanselmann
    (nodes, instances) = _GetGroupData(client, group_uuid)
732 16e0b9c9 Michael Hanselmann
733 9bb69bb5 Michael Hanselmann
    # Update per-group instance status file
734 9bb69bb5 Michael Hanselmann
    _UpdateInstanceStatus(inst_status_path, instances.values())
735 9bb69bb5 Michael Hanselmann
736 57fe4a5b Michael Hanselmann
    _MergeInstanceStatus(pathutils.INSTANCE_STATUS_FILE,
737 57fe4a5b Michael Hanselmann
                         pathutils.WATCHER_GROUP_INSTANCE_STATUS_FILE,
738 9bb69bb5 Michael Hanselmann
                         known_groups)
739 9bb69bb5 Michael Hanselmann
740 16e0b9c9 Michael Hanselmann
    started = _CheckInstances(client, notepad, instances)
741 16e0b9c9 Michael Hanselmann
    _CheckDisks(client, notepad, nodes, instances, started)
742 16e0b9c9 Michael Hanselmann
    _VerifyDisks(client, group_uuid, nodes, instances)
743 16e0b9c9 Michael Hanselmann
  except Exception, err:
744 16e0b9c9 Michael Hanselmann
    logging.info("Not updating status file due to failure: %s", err)
745 16e0b9c9 Michael Hanselmann
    raise
746 16e0b9c9 Michael Hanselmann
  else:
747 16e0b9c9 Michael Hanselmann
    # Save changes for next run
748 16e0b9c9 Michael Hanselmann
    notepad.Save(state_path)
749 16e0b9c9 Michael Hanselmann
750 16e0b9c9 Michael Hanselmann
  return constants.EXIT_SUCCESS
751 16e0b9c9 Michael Hanselmann
752 16e0b9c9 Michael Hanselmann
753 9f4bb951 Michael Hanselmann
def Main():
754 a8083063 Iustin Pop
  """Main function.
755 a8083063 Iustin Pop

756 a8083063 Iustin Pop
  """
757 f0a80b01 Michael Hanselmann
  (options, _) = ParseOptions()
758 a8083063 Iustin Pop
759 57fe4a5b Michael Hanselmann
  utils.SetupLogging(pathutils.LOG_WATCHER, sys.argv[0],
760 cfcc79c6 Michael Hanselmann
                     debug=options.debug, stderr_logging=options.debug)
761 a8083063 Iustin Pop
762 46c8a6ab Iustin Pop
  if ShouldPause() and not options.ignore_pause:
763 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
764 9f4bb951 Michael Hanselmann
    return constants.EXIT_SUCCESS
765 3753b2cb Michael Hanselmann
766 16e0b9c9 Michael Hanselmann
  # Try to acquire global watcher lock in shared mode
767 57fe4a5b Michael Hanselmann
  lock = utils.FileLock.Open(pathutils.WATCHER_LOCK_FILE)
768 a8083063 Iustin Pop
  try:
769 16e0b9c9 Michael Hanselmann
    lock.Shared(blocking=False)
770 16e0b9c9 Michael Hanselmann
  except (EnvironmentError, errors.LockError), err:
771 16e0b9c9 Michael Hanselmann
    logging.error("Can't acquire lock on %s: %s",
772 57fe4a5b Michael Hanselmann
                  pathutils.WATCHER_LOCK_FILE, err)
773 16e0b9c9 Michael Hanselmann
    return constants.EXIT_SUCCESS
774 db147305 Tom Limoncelli
775 16e0b9c9 Michael Hanselmann
  if options.nodegroup is None:
776 16e0b9c9 Michael Hanselmann
    fn = _GlobalWatcher
777 16e0b9c9 Michael Hanselmann
  else:
778 16e0b9c9 Michael Hanselmann
    # Per-nodegroup watcher
779 16e0b9c9 Michael Hanselmann
    fn = _GroupWatcher
780 16e0b9c9 Michael Hanselmann
781 16e0b9c9 Michael Hanselmann
  try:
782 16e0b9c9 Michael Hanselmann
    return fn(options)
783 16e0b9c9 Michael Hanselmann
  except (SystemExit, KeyboardInterrupt):
784 1b052f42 Michael Hanselmann
    raise
785 38242904 Iustin Pop
  except NotMasterError:
786 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
787 9f4bb951 Michael Hanselmann
    return constants.EXIT_NOTMASTER
788 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
789 013ce4ae Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting", err.args[0])
790 9f4bb951 Michael Hanselmann
    return constants.EXIT_NODESETUP_ERROR
791 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
792 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
793 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
794 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
795 438b45d4 Michael Hanselmann
  except Exception, err:
796 001b3825 Michael Hanselmann
    logging.exception(str(err))
797 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
798 5a3103e9 Michael Hanselmann
799 9f4bb951 Michael Hanselmann
  return constants.EXIT_SUCCESS