Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / __init__.py @ b459a848

History | View | Annotate | Download (24.3 kB)

1 9f4bb951 Michael Hanselmann
#
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 f2af0bec Iustin Pop
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop

24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop

28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 cfcc79c6 Michael Hanselmann
import os.path
32 a8083063 Iustin Pop
import sys
33 a8083063 Iustin Pop
import time
34 438b45d4 Michael Hanselmann
import logging
35 16e0b9c9 Michael Hanselmann
import operator
36 9bb69bb5 Michael Hanselmann
import errno
37 a8083063 Iustin Pop
from optparse import OptionParser
38 a8083063 Iustin Pop
39 a8083063 Iustin Pop
from ganeti import utils
40 a8083063 Iustin Pop
from ganeti import constants
41 83e5e26f René Nussbaumer
from ganeti import compat
42 89e1fc26 Iustin Pop
from ganeti import errors
43 e125c67c Michael Hanselmann
from ganeti import opcodes
44 e125c67c Michael Hanselmann
from ganeti import cli
45 7dfb83c2 Iustin Pop
from ganeti import luxi
46 db147305 Tom Limoncelli
from ganeti import rapi
47 a744b676 Manuel Franceschini
from ganeti import netutils
48 16e0b9c9 Michael Hanselmann
from ganeti import qlang
49 16e0b9c9 Michael Hanselmann
from ganeti import objects
50 16e0b9c9 Michael Hanselmann
from ganeti import ssconf
51 16e0b9c9 Michael Hanselmann
from ganeti import ht
52 a8083063 Iustin Pop
53 b459a848 Andrea Spadaccini
import ganeti.rapi.client # pylint: disable=W0611
54 adf6301e Michael Hanselmann
55 adf6301e Michael Hanselmann
from ganeti.watcher import nodemaint
56 adf6301e Michael Hanselmann
from ganeti.watcher import state
57 db147305 Tom Limoncelli
58 a8083063 Iustin Pop
59 5a3103e9 Michael Hanselmann
#: Number of restart attempts after which the watcher gives up on an instance
MAXTRIES = 5

#: Instance states in which the watcher tries to restart the instance
BAD_STATES = frozenset((
  constants.INSTST_ERRORDOWN,
  ))

#: Instance states in which disk activation is skipped and no successful
#: restart is reported
HELPLESS_STATES = frozenset((
  constants.INSTST_NODEDOWN,
  constants.INSTST_NODEOFFLINE,
  ))

NOTICE = "NOTICE"
ERROR = "ERROR"

#: Number of seconds to wait between starting child processes for node groups
CHILD_PROCESS_DELAY = 1.0

#: How many seconds to wait for instance status file lock
INSTANCE_STATUS_LOCK_TIMEOUT = 10.0
75 9bb69bb5 Michael Hanselmann
76 e125c67c Michael Hanselmann
77 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher is not running on the master node.

  """
79 a8083063 Iustin Pop
80 a8083063 Iustin Pop
81 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher is currently paused.

  @rtype: bool
  @return: Truth value of what L{utils.ReadWatcherPauseFile} reports for the
    watcher pause file

  """
  pause_state = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  return bool(pause_state)
86 3753b2cb Michael Hanselmann
87 3753b2cb Michael Hanselmann
88 f1115454 Guido Trotter
def StartNodeDaemons():
  """Make sure the daemons expected on every node are running.

  """
  # The node daemon is started whether or not this is the master; confd is
  # started as well and runs in disabled mode on non-candidates.
  for daemon_name in (constants.NODED, constants.CONFD):
    utils.EnsureDaemon(daemon_name)
96 f1115454 Guido Trotter
97 f1115454 Guido Trotter
98 9e289e36 Guido Trotter
def RunWatcherHooks():
  """Run the watcher hooks and log the outcome of each of them.

  Nothing is done if the watcher hooks directory does not exist. A failure
  of L{utils.RunParts} itself is logged and otherwise ignored.

  @raise errors.ProgrammerError: If RunParts reports an unknown status

  """
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
                             constants.HOOKS_NAME_WATCHER)
  if not os.path.isdir(hooks_dir):
    return

  try:
    results = utils.RunParts(hooks_dir)
  except Exception: # pylint: disable=W0703
    # logging.exception appends the traceback (including the exception
    # text), so only the directory is interpolated here. The message used to
    # carry a second "%s" without a matching argument, which made formatting
    # of the log record fail.
    logging.exception("RunParts %s failed", hooks_dir)
    return

  for (relname, status, runresult) in results:
    if status == constants.RUNPARTS_SKIP:
      logging.debug("Watcher hook %s: skipped", relname)
    elif status == constants.RUNPARTS_ERR:
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
    elif status == constants.RUNPARTS_RUN:
      if runresult.failed:
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
                        relname, runresult.exit_code, runresult.output)
      else:
        logging.debug("Watcher hook %s: success (output: %s)", relname,
                      runresult.output)
    else:
      # Exceptions do not interpolate their arguments, hence the message is
      # formatted explicitly
      raise errors.ProgrammerError("Unknown status %s returned by RunParts" %
                                   (status, ))
128 9e289e36 Guido Trotter
129 001b3825 Michael Hanselmann
130 a8083063 Iustin Pop
class Instance(object):
  """Virtual machine instance as seen by the watcher.

  """
  def __init__(self, name, status, autostart, snodes):
    """Initializes this class.

    @param name: Instance name
    @param status: Instance status
    @param autostart: Whether the watcher may act on the instance by itself
    @param snodes: Names of the instance's secondary nodes

    """
    self.snodes = snodes
    self.autostart = autostart
    self.status = status
    self.name = name

  def Restart(self, cl):
    """Submits the opcode starting this instance.

    @param cl: Client passed on to L{cli.SubmitOpCode}

    """
    startup_op = opcodes.OpInstanceStartup(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=cl)

  def ActivateDisks(self, cl):
    """Submits the opcode activating all disks of this instance.

    @param cl: Client passed on to L{cli.SubmitOpCode}

    """
    activate_op = opcodes.OpInstanceActivateDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=cl)
153 a8083063 Iustin Pop
154 a8083063 Iustin Pop
155 16e0b9c9 Michael Hanselmann
class Node:
  """Plain container for the data the watcher keeps about a cluster node.

  """
  def __init__(self, name, bootid, offline, secondaries):
    """Initializes this class.

    @param name: Node name
    @param bootid: Boot ID reported by the node; a false value means none
      was reported
    @param offline: Whether the node is marked offline
    @param secondaries: Names of instances having this node as a secondary

    """
    self.secondaries = secondaries
    self.offline = offline
    self.bootid = bootid
    self.name = name
167 5a3103e9 Michael Hanselmann
168 78f44650 Iustin Pop
169 16e0b9c9 Michael Hanselmann
def _CheckInstances(cl, notepad, instances):
  """Walk over all instances and restart those found in a bad state.

  Restart attempts are counted in the notepad; once L{MAXTRIES} attempts
  have been made for an instance, it is no longer restarted.

  @param cl: Client used for submitting the restart
  @param notepad: Watcher state keeping the restart counters
  @type instances: dict
  @param instances: L{Instance} objects indexed by name
  @rtype: set
  @return: Names of instances whose restart did not raise an error

  """
  notepad.MaintainInstanceList(instances.keys())

  restarted = set()

  for instance in instances.values():
    name = instance.name

    if instance.status not in BAD_STATES:
      # Nothing to restart; drop any record of earlier attempts
      if notepad.NumberOfRestartAttempts(name):
        notepad.RemoveInstance(name)
        if instance.status not in HELPLESS_STATES:
          logging.info("Restart of instance '%s' succeeded", name)
      continue

    attempts = notepad.NumberOfRestartAttempts(name)

    if attempts > MAXTRIES:
      logging.warning("Not restarting instance '%s', retries exhausted",
                      name)
      continue

    if attempts == MAXTRIES:
      # Bump the counter once more so that the "giving up" error is logged
      # only once and the warning above is used from now on
      notepad.RecordRestartAttempt(name)
      logging.error("Could not restart instance '%s' after %s attempts,"
                    " giving up", name, MAXTRIES)
      continue

    try:
      logging.info("Restarting instance '%s' (attempt #%s)",
                   name, attempts + 1)
      instance.Restart(cl)
    except Exception: # pylint: disable=W0703
      logging.exception("Error while restarting instance '%s'", name)
    else:
      restarted.add(name)

    notepad.RecordRestartAttempt(name)

  return restarted
210 a8083063 Iustin Pop
211 a8083063 Iustin Pop
212 16e0b9c9 Michael Hanselmann
def _CheckDisks(cl, notepad, nodes, instances, started):
213 16e0b9c9 Michael Hanselmann
  """Check all nodes for restarted ones.
214 38242904 Iustin Pop

215 a8083063 Iustin Pop
  """
216 16e0b9c9 Michael Hanselmann
  check_nodes = []
217 16e0b9c9 Michael Hanselmann
218 16e0b9c9 Michael Hanselmann
  for node in nodes.values():
219 16e0b9c9 Michael Hanselmann
    old = notepad.GetNodeBootID(node.name)
220 16e0b9c9 Michael Hanselmann
    if not node.bootid:
221 16e0b9c9 Michael Hanselmann
      # Bad node, not returning a boot id
222 16e0b9c9 Michael Hanselmann
      if not node.offline:
223 16e0b9c9 Michael Hanselmann
        logging.debug("Node '%s' missing boot ID, skipping secondary checks",
224 16e0b9c9 Michael Hanselmann
                      node.name)
225 16e0b9c9 Michael Hanselmann
      continue
226 16e0b9c9 Michael Hanselmann
227 16e0b9c9 Michael Hanselmann
    if old != node.bootid:
228 16e0b9c9 Michael Hanselmann
      # Node's boot ID has changed, probably through a reboot
229 16e0b9c9 Michael Hanselmann
      check_nodes.append(node)
230 16e0b9c9 Michael Hanselmann
231 16e0b9c9 Michael Hanselmann
  if check_nodes:
232 16e0b9c9 Michael Hanselmann
    # Activate disks for all instances with any of the checked nodes as a
233 16e0b9c9 Michael Hanselmann
    # secondary node.
234 16e0b9c9 Michael Hanselmann
    for node in check_nodes:
235 16e0b9c9 Michael Hanselmann
      for instance_name in node.secondaries:
236 16e0b9c9 Michael Hanselmann
        try:
237 16e0b9c9 Michael Hanselmann
          inst = instances[instance_name]
238 16e0b9c9 Michael Hanselmann
        except KeyError:
239 16e0b9c9 Michael Hanselmann
          logging.info("Can't find instance '%s', maybe it was ignored",
240 16e0b9c9 Michael Hanselmann
                       instance_name)
241 eee1fa2d Iustin Pop
          continue
242 a8083063 Iustin Pop
243 16e0b9c9 Michael Hanselmann
        if not inst.autostart:
244 16e0b9c9 Michael Hanselmann
          logging.info("Skipping disk activation for non-autostart"
245 16e0b9c9 Michael Hanselmann
                       " instance '%s'", inst.name)
246 a8083063 Iustin Pop
          continue
247 16e0b9c9 Michael Hanselmann
248 16e0b9c9 Michael Hanselmann
        if inst.name in started:
249 16e0b9c9 Michael Hanselmann
          # we already tried to start the instance, which should have
250 16e0b9c9 Michael Hanselmann
          # activated its drives (if they can be at all)
251 16e0b9c9 Michael Hanselmann
          logging.debug("Skipping disk activation for instance '%s' as"
252 16e0b9c9 Michael Hanselmann
                        " it was already started", inst.name)
253 a8083063 Iustin Pop
          continue
254 16e0b9c9 Michael Hanselmann
255 a8083063 Iustin Pop
        try:
256 16e0b9c9 Michael Hanselmann
          logging.info("Activating disks for instance '%s'", inst.name)
257 16e0b9c9 Michael Hanselmann
          inst.ActivateDisks(cl)
258 b459a848 Andrea Spadaccini
        except Exception: # pylint: disable=W0703
259 16e0b9c9 Michael Hanselmann
          logging.exception("Error while activating disks for instance '%s'",
260 16e0b9c9 Michael Hanselmann
                            inst.name)
261 a8083063 Iustin Pop
262 16e0b9c9 Michael Hanselmann
    # Keep changed boot IDs
263 16e0b9c9 Michael Hanselmann
    for node in check_nodes:
264 16e0b9c9 Michael Hanselmann
      notepad.SetNodeBootID(node.name, node.bootid)
265 a8083063 Iustin Pop
266 83e5e26f René Nussbaumer
267 16e0b9c9 Michael Hanselmann
def _CheckForOfflineNodes(nodes, instance):
  """Checks whether the given instance has any offline secondary node.

  @type nodes: dict
  @param nodes: Node objects indexed by node name
  @param instance: The instance object
  @return: True if any of the secondary nodes is offline, False otherwise

  """
  offline_flags = (nodes[name].offline for name in instance.snodes)
  return compat.any(offline_flags)
275 ae1a845c Michael Hanselmann
276 ae1a845c Michael Hanselmann
277 16e0b9c9 Michael Hanselmann
def _VerifyDisks(cl, uuid, nodes, instances):
  """Run a per-group "gnt-cluster verify-disks".

  Disks of the instances reported as having offline disks are activated
  using a single job. Unknown instances, instances in a helpless state and
  instances with offline secondary nodes are left alone.

  @param cl: Client used for submitting and polling jobs
  @param uuid: Node group passed as C{group_name} to the verify opcode
  @type nodes: dict
  @param nodes: Node objects indexed by name
  @type instances: dict
  @param instances: Instance objects indexed by name

  """
  verify_op = opcodes.OpGroupVerifyDisks(group_name=uuid)
  job_id = cl.SubmitJob([verify_op])
  (verify_result, ) = cli.PollJob(job_id, cl=cl, feedback_fn=logging.debug)
  (_, offline_disk_instances, _) = verify_result
  cl.ArchiveJob(job_id)

  if not offline_disk_instances:
    # nothing to do
    logging.debug("Verify-disks reported no offline disks, nothing to do")
    return

  logging.debug("Will activate disks for instance(s) %s",
                utils.CommaJoin(offline_disk_instances))

  # We submit only one job, and wait for it. Not optimal, but this puts less
  # load on the job queue.
  activate_ops = []
  for name in offline_disk_instances:
    try:
      inst = instances[name]
    except KeyError:
      logging.info("Can't find instance '%s', maybe it was ignored", name)
      continue

    if inst.status in HELPLESS_STATES or _CheckForOfflineNodes(nodes, inst):
      logging.info("Skipping instance '%s' because it is in a helpless state or"
                   " has offline secondaries", name)
      continue

    activate_ops.append(opcodes.OpInstanceActivateDisks(instance_name=name))

  if not activate_ops:
    return

  job_id = cli.SendJob(activate_ops, cl=cl)

  try:
    cli.PollJob(job_id, cl=cl, feedback_fn=logging.debug)
  except Exception: # pylint: disable=W0703
    logging.exception("Error while activating disks")
318 a8083063 Iustin Pop
319 a8083063 Iustin Pop
320 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
321 db147305 Tom Limoncelli
  """Connects to RAPI port and does a simple test.
322 db147305 Tom Limoncelli

323 db147305 Tom Limoncelli
  Connects to RAPI port of hostname and does a simple test. At this time, the
324 db147305 Tom Limoncelli
  test is GetVersion.
325 db147305 Tom Limoncelli

326 db147305 Tom Limoncelli
  @type hostname: string
327 db147305 Tom Limoncelli
  @param hostname: hostname of the node to connect to.
328 db147305 Tom Limoncelli
  @rtype: bool
329 db147305 Tom Limoncelli
  @return: Whether RAPI is working properly
330 db147305 Tom Limoncelli

331 db147305 Tom Limoncelli
  """
332 34f06005 Iustin Pop
  curl_config = rapi.client.GenericCurlConfig()
333 2a7c3583 Michael Hanselmann
  rapi_client = rapi.client.GanetiRapiClient(hostname,
334 2a7c3583 Michael Hanselmann
                                             curl_config_fn=curl_config)
335 db147305 Tom Limoncelli
  try:
336 db147305 Tom Limoncelli
    master_version = rapi_client.GetVersion()
337 db147305 Tom Limoncelli
  except rapi.client.CertificateError, err:
338 d7c42723 Michael Hanselmann
    logging.warning("RAPI certificate error: %s", err)
339 db147305 Tom Limoncelli
    return False
340 db147305 Tom Limoncelli
  except rapi.client.GanetiApiError, err:
341 d7c42723 Michael Hanselmann
    logging.warning("RAPI error: %s", err)
342 db147305 Tom Limoncelli
    return False
343 d7c42723 Michael Hanselmann
  else:
344 d7c42723 Michael Hanselmann
    logging.debug("Reported RAPI version %s", master_version)
345 d7c42723 Michael Hanselmann
    return master_version == constants.RAPI_VERSION
346 db147305 Tom Limoncelli
347 db147305 Tom Limoncelli
348 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The job age option is converted using L{cli.ParseTimespec}. Positional
  arguments are not accepted and reported through the parser's error
  handling.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  for shared_opt in (cli.DEBUG_OPT, cli.NODEGROUP_OPT):
    parser.add_option(shared_opt)

  parser.add_option("-A", "--job-age", dest="job_age", default=6 * 3600,
                    help="Autoarchive jobs older than this age (default"
                         " 6 hours)")
  parser.add_option("--ignore-pause", dest="ignore_pause", default=False,
                    action="store_true", help="Ignore cluster pause setting")

  # Both switches write to the same destination; see optparse documentation
  # for why the default value is not set by the options themselves
  parser.add_option("--wait-children", dest="wait_children",
                    action="store_true", help="Wait for child processes")
  parser.add_option("--no-wait-children", dest="wait_children",
                    action="store_false", help="Don't wait for child processes")
  parser.set_defaults(wait_children=True)

  (options, args) = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)

  if args:
    parser.error("No arguments expected")

  return (options, args)
379 a8083063 Iustin Pop
380 a8083063 Iustin Pop
381 9bb69bb5 Michael Hanselmann
def _WriteInstanceStatus(filename, data):
  """Writes the per-group instance status file.

  One line of the form "<name> <status>" is written per entry; the entries
  are sorted.

  @type filename: string
  @param filename: Path to instance status file
  @type data: list of tuple; (instance name as string, status as string)
  @param data: Instance name and status

  """
  logging.debug("Updating instance status file '%s' with %s instances",
                filename, len(data))

  lines = ["%s %s\n" % entry for entry in sorted(data)]
  utils.WriteFile(filename, data="".join(lines))
398 9bb69bb5 Michael Hanselmann
399 9bb69bb5 Michael Hanselmann
400 9bb69bb5 Michael Hanselmann
def _UpdateInstanceStatus(filename, instances):
  """Writes an instance status file from L{Instance} objects.

  @type filename: string
  @param filename: Path to status file
  @type instances: list of L{Instance}
  @param instances: Instances whose name and status are written

  """
  status_pairs = [(inst.name, inst.status) for inst in instances]
  _WriteInstanceStatus(filename, status_pairs)
410 9bb69bb5 Michael Hanselmann
411 9bb69bb5 Michael Hanselmann
412 9bb69bb5 Michael Hanselmann
class _StatCb:
413 9bb69bb5 Michael Hanselmann
  """Helper to store file handle's C{fstat}.
414 9bb69bb5 Michael Hanselmann

415 9bb69bb5 Michael Hanselmann
  """
416 9bb69bb5 Michael Hanselmann
  def __init__(self):
417 9bb69bb5 Michael Hanselmann
    """Initializes this class.
418 9bb69bb5 Michael Hanselmann

419 9bb69bb5 Michael Hanselmann
    """
420 9bb69bb5 Michael Hanselmann
    self.st = None
421 9bb69bb5 Michael Hanselmann
422 9bb69bb5 Michael Hanselmann
  def __call__(self, fh):
423 9bb69bb5 Michael Hanselmann
    """Calls C{fstat} on file handle.
424 8f07dc0d Michael Hanselmann

425 9bb69bb5 Michael Hanselmann
    """
426 9bb69bb5 Michael Hanselmann
    self.st = os.fstat(fh.fileno())
427 8f07dc0d Michael Hanselmann
428 9bb69bb5 Michael Hanselmann
429 9bb69bb5 Michael Hanselmann
def _ReadInstanceStatus(filename):
430 9bb69bb5 Michael Hanselmann
  """Reads an instance status file.
431 9bb69bb5 Michael Hanselmann

432 9bb69bb5 Michael Hanselmann
  @type filename: string
433 9bb69bb5 Michael Hanselmann
  @param filename: Path to status file
434 9bb69bb5 Michael Hanselmann
  @rtype: tuple; (None or number, list of lists containing instance name and
435 9bb69bb5 Michael Hanselmann
    status)
436 9bb69bb5 Michael Hanselmann
  @return: File's mtime and instance status contained in the file; mtime is
437 9bb69bb5 Michael Hanselmann
    C{None} if file can't be read
438 9bb69bb5 Michael Hanselmann

439 9bb69bb5 Michael Hanselmann
  """
440 9bb69bb5 Michael Hanselmann
  logging.debug("Reading per-group instance status from '%s'", filename)
441 9bb69bb5 Michael Hanselmann
442 9bb69bb5 Michael Hanselmann
  statcb = _StatCb()
443 9bb69bb5 Michael Hanselmann
  try:
444 9bb69bb5 Michael Hanselmann
    content = utils.ReadFile(filename, preread=statcb)
445 9bb69bb5 Michael Hanselmann
  except EnvironmentError, err:
446 9bb69bb5 Michael Hanselmann
    if err.errno == errno.ENOENT:
447 9bb69bb5 Michael Hanselmann
      logging.error("Can't read '%s', does not exist (yet)", filename)
448 9bb69bb5 Michael Hanselmann
    else:
449 9bb69bb5 Michael Hanselmann
      logging.exception("Unable to read '%s', ignoring", filename)
450 9bb69bb5 Michael Hanselmann
    return (None, None)
451 9bb69bb5 Michael Hanselmann
  else:
452 6f9e71bb Michael Hanselmann
    return (statcb.st.st_mtime, [line.split(None, 1)
453 9bb69bb5 Michael Hanselmann
                                 for line in content.splitlines()])
454 9bb69bb5 Michael Hanselmann
455 9bb69bb5 Michael Hanselmann
456 9bb69bb5 Michael Hanselmann
def _MergeInstanceStatus(filename, pergroup_filename, groups):
457 9bb69bb5 Michael Hanselmann
  """Merges all per-group instance status files into a global one.
458 9bb69bb5 Michael Hanselmann

459 9bb69bb5 Michael Hanselmann
  @type filename: string
460 9bb69bb5 Michael Hanselmann
  @param filename: Path to global instance status file
461 9bb69bb5 Michael Hanselmann
  @type pergroup_filename: string
462 9bb69bb5 Michael Hanselmann
  @param pergroup_filename: Path to per-group status files, must contain "%s"
463 9bb69bb5 Michael Hanselmann
    to be replaced with group UUID
464 9bb69bb5 Michael Hanselmann
  @type groups: sequence
465 9bb69bb5 Michael Hanselmann
  @param groups: UUIDs of known groups
466 9bb69bb5 Michael Hanselmann

467 9bb69bb5 Michael Hanselmann
  """
468 9bb69bb5 Michael Hanselmann
  # Lock global status file in exclusive mode
469 9bb69bb5 Michael Hanselmann
  lock = utils.FileLock.Open(filename)
470 9bb69bb5 Michael Hanselmann
  try:
471 9bb69bb5 Michael Hanselmann
    lock.Exclusive(blocking=True, timeout=INSTANCE_STATUS_LOCK_TIMEOUT)
472 9bb69bb5 Michael Hanselmann
  except errors.LockError, err:
473 9bb69bb5 Michael Hanselmann
    # All per-group processes will lock and update the file. None of them
474 9bb69bb5 Michael Hanselmann
    # should take longer than 10 seconds (the value of
475 9bb69bb5 Michael Hanselmann
    # INSTANCE_STATUS_LOCK_TIMEOUT).
476 9bb69bb5 Michael Hanselmann
    logging.error("Can't acquire lock on instance status file '%s', not"
477 9bb69bb5 Michael Hanselmann
                  " updating: %s", filename, err)
478 9bb69bb5 Michael Hanselmann
    return
479 9bb69bb5 Michael Hanselmann
480 9bb69bb5 Michael Hanselmann
  logging.debug("Acquired exclusive lock on '%s'", filename)
481 9bb69bb5 Michael Hanselmann
482 9bb69bb5 Michael Hanselmann
  data = {}
483 9bb69bb5 Michael Hanselmann
484 9bb69bb5 Michael Hanselmann
  # Load instance status from all groups
485 9bb69bb5 Michael Hanselmann
  for group_uuid in groups:
486 9bb69bb5 Michael Hanselmann
    (mtime, instdata) = _ReadInstanceStatus(pergroup_filename % group_uuid)
487 9bb69bb5 Michael Hanselmann
488 9bb69bb5 Michael Hanselmann
    if mtime is not None:
489 9bb69bb5 Michael Hanselmann
      for (instance_name, status) in instdata:
490 9bb69bb5 Michael Hanselmann
        data.setdefault(instance_name, []).append((mtime, status))
491 9bb69bb5 Michael Hanselmann
492 9bb69bb5 Michael Hanselmann
  # Select last update based on file mtime
493 9bb69bb5 Michael Hanselmann
  inststatus = [(instance_name, sorted(status, reverse=True)[0][1])
494 9bb69bb5 Michael Hanselmann
                for (instance_name, status) in data.items()]
495 9bb69bb5 Michael Hanselmann
496 9bb69bb5 Michael Hanselmann
  # Write the global status file. Don't touch file after it's been
497 9bb69bb5 Michael Hanselmann
  # updated--there is no lock anymore.
498 9bb69bb5 Michael Hanselmann
  _WriteInstanceStatus(filename, inststatus)
499 8f07dc0d Michael Hanselmann
500 8f07dc0d Michael Hanselmann
501 16e0b9c9 Michael Hanselmann
def GetLuxiClient(try_restart):
502 16e0b9c9 Michael Hanselmann
  """Tries to connect to the master daemon.
503 16e0b9c9 Michael Hanselmann

504 16e0b9c9 Michael Hanselmann
  @type try_restart: bool
505 16e0b9c9 Michael Hanselmann
  @param try_restart: Whether to attempt to restart the master daemon
506 16e0b9c9 Michael Hanselmann

507 16e0b9c9 Michael Hanselmann
  """
508 16e0b9c9 Michael Hanselmann
  try:
509 16e0b9c9 Michael Hanselmann
    return cli.GetClient()
510 16e0b9c9 Michael Hanselmann
  except errors.OpPrereqError, err:
511 16e0b9c9 Michael Hanselmann
    # this is, from cli.GetClient, a not-master case
512 16e0b9c9 Michael Hanselmann
    raise NotMasterError("Not on master node (%s)" % err)
513 16e0b9c9 Michael Hanselmann
514 16e0b9c9 Michael Hanselmann
  except luxi.NoMasterError, err:
515 16e0b9c9 Michael Hanselmann
    if not try_restart:
516 16e0b9c9 Michael Hanselmann
      raise
517 16e0b9c9 Michael Hanselmann
518 16e0b9c9 Michael Hanselmann
    logging.warning("Master daemon seems to be down (%s), trying to restart",
519 16e0b9c9 Michael Hanselmann
                    err)
520 16e0b9c9 Michael Hanselmann
521 16e0b9c9 Michael Hanselmann
    if not utils.EnsureDaemon(constants.MASTERD):
522 16e0b9c9 Michael Hanselmann
      raise errors.GenericError("Can't start the master daemon")
523 16e0b9c9 Michael Hanselmann
524 16e0b9c9 Michael Hanselmann
    # Retry the connection
525 16e0b9c9 Michael Hanselmann
    return cli.GetClient()
526 16e0b9c9 Michael Hanselmann
527 16e0b9c9 Michael Hanselmann
528 16e0b9c9 Michael Hanselmann
def _StartGroupChildren(cl, wait):
529 16e0b9c9 Michael Hanselmann
  """Starts a new instance of the watcher for every node group.
530 16e0b9c9 Michael Hanselmann

531 16e0b9c9 Michael Hanselmann
  """
532 16e0b9c9 Michael Hanselmann
  assert not compat.any(arg.startswith(cli.NODEGROUP_OPT_NAME)
533 16e0b9c9 Michael Hanselmann
                        for arg in sys.argv)
534 16e0b9c9 Michael Hanselmann
535 16e0b9c9 Michael Hanselmann
  result = cl.QueryGroups([], ["name", "uuid"], False)
536 16e0b9c9 Michael Hanselmann
537 16e0b9c9 Michael Hanselmann
  children = []
538 16e0b9c9 Michael Hanselmann
539 16e0b9c9 Michael Hanselmann
  for (idx, (name, uuid)) in enumerate(result):
540 16e0b9c9 Michael Hanselmann
    args = sys.argv + [cli.NODEGROUP_OPT_NAME, uuid]
541 16e0b9c9 Michael Hanselmann
542 16e0b9c9 Michael Hanselmann
    if idx > 0:
543 16e0b9c9 Michael Hanselmann
      # Let's not kill the system
544 16e0b9c9 Michael Hanselmann
      time.sleep(CHILD_PROCESS_DELAY)
545 16e0b9c9 Michael Hanselmann
546 16e0b9c9 Michael Hanselmann
    logging.debug("Spawning child for group '%s' (%s), arguments %s",
547 16e0b9c9 Michael Hanselmann
                  name, uuid, args)
548 16e0b9c9 Michael Hanselmann
549 16e0b9c9 Michael Hanselmann
    try:
550 16e0b9c9 Michael Hanselmann
      # TODO: Should utils.StartDaemon be used instead?
551 16e0b9c9 Michael Hanselmann
      pid = os.spawnv(os.P_NOWAIT, args[0], args)
552 b459a848 Andrea Spadaccini
    except Exception: # pylint: disable=W0703
553 16e0b9c9 Michael Hanselmann
      logging.exception("Failed to start child for group '%s' (%s)",
554 16e0b9c9 Michael Hanselmann
                        name, uuid)
555 16e0b9c9 Michael Hanselmann
    else:
556 16e0b9c9 Michael Hanselmann
      logging.debug("Started with PID %s", pid)
557 16e0b9c9 Michael Hanselmann
      children.append(pid)
558 16e0b9c9 Michael Hanselmann
559 16e0b9c9 Michael Hanselmann
  if wait:
560 16e0b9c9 Michael Hanselmann
    for pid in children:
561 16e0b9c9 Michael Hanselmann
      logging.debug("Waiting for child PID %s", pid)
562 16e0b9c9 Michael Hanselmann
      try:
563 16e0b9c9 Michael Hanselmann
        result = utils.RetryOnSignal(os.waitpid, pid, 0)
564 16e0b9c9 Michael Hanselmann
      except EnvironmentError, err:
565 16e0b9c9 Michael Hanselmann
        result = str(err)
566 16e0b9c9 Michael Hanselmann
567 16e0b9c9 Michael Hanselmann
      logging.debug("Child PID %s exited with status %s", pid, result)
568 16e0b9c9 Michael Hanselmann
569 16e0b9c9 Michael Hanselmann
570 16e0b9c9 Michael Hanselmann
def _ArchiveJobs(cl, age):
571 16e0b9c9 Michael Hanselmann
  """Archives old jobs.
572 16e0b9c9 Michael Hanselmann

573 16e0b9c9 Michael Hanselmann
  """
574 16e0b9c9 Michael Hanselmann
  (arch_count, left_count) = cl.AutoArchiveJobs(age)
575 16e0b9c9 Michael Hanselmann
  logging.debug("Archived %s jobs, left %s", arch_count, left_count)
576 16e0b9c9 Michael Hanselmann
577 16e0b9c9 Michael Hanselmann
578 16e0b9c9 Michael Hanselmann
def _CheckMaster(cl):
  """Verifies that the current host is the master node.

  @param cl: Client object used to query the C{master_node} config value
  @raise NotMasterError: If the configured master node differs from the
    name returned by L{netutils.Hostname.GetSysName}

  """
  # Exactly one value is expected for the single queried field
  [master_name] = cl.QueryConfigValues(["master_node"])

  if master_name == netutils.Hostname.GetSysName():
    return

  raise NotMasterError("This is not the master node")
585 16e0b9c9 Michael Hanselmann
586 16e0b9c9 Michael Hanselmann
587 2a7c3583 Michael Hanselmann
@rapi.client.UsesRapiClient
def _GlobalWatcher(opts):
  """Main function for global watcher.

  Calls L{StartNodeDaemons} and L{RunWatcherHooks}, runs node maintenance
  if it should run and, if a connection to the master daemon can be made,
  checks the remote API, verifies this is the master node, archives jobs
  and finally spawns child processes for every node group.

  @param opts: Parsed command line options; C{job_age} and
    C{wait_children} are read
  @return: L{constants.EXIT_SUCCESS}, also when L{GetLuxiClient} reports
    that this is not the master node
  @raise NotMasterError: If L{_CheckMaster} finds that this host is not
    the configured master node

  """
  StartNodeDaemons()
  RunWatcherHooks()

  # Run node maintenance in all cases, even if master, so that old masters can
  # be properly cleaned up
  if nodemaint.NodeMaintenance.ShouldRun(): # pylint: disable=E0602
    nodemaint.NodeMaintenance().Exec() # pylint: disable=E0602

  try:
    client = GetLuxiClient(True)
  except NotMasterError:
    # Don't proceed on non-master nodes
    return constants.EXIT_SUCCESS

  # we are on master now
  utils.EnsureDaemon(constants.RAPI)

  # If RAPI isn't responding to queries, try one restart
  logging.debug("Attempting to talk to remote API on %s",
                constants.IP4_ADDRESS_LOCALHOST)
  rapi_responding = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)
  if not rapi_responding:
    logging.warning("Couldn't get answer from remote API, restarting daemon")
    utils.StopDaemon(constants.RAPI)
    utils.EnsureDaemon(constants.RAPI)
    logging.debug("Second attempt to talk to remote API")
    rapi_responding = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)

  # Only claim success if one of the attempts actually got an answer; a
  # failing remote API is logged but doesn't stop the remaining work
  if rapi_responding:
    logging.debug("Successfully talked to remote API")
  else:
    logging.fatal("RAPI is not responding")

  _CheckMaster(client)
  _ArchiveJobs(client, opts.job_age)

  # Spawn child processes for all node groups
  _StartGroupChildren(client, opts.wait_children)

  return constants.EXIT_SUCCESS
630 16e0b9c9 Michael Hanselmann
631 16e0b9c9 Michael Hanselmann
632 16e0b9c9 Michael Hanselmann
def _GetGroupData(cl, uuid):
  """Retrieves the instances and nodes of a node group.

  One job with two queries is submitted: the instances whose primary node
  is in the group and the nodes of the group. Instances with secondary
  nodes outside the primary node's group are logged and left out.

  @param cl: Client object used to submit and archive the job
  @param uuid: UUID of the node group
  @rtype: tuple
  @return: Tuple of two dictionaries; the first maps node names to
    L{Node} objects, the second instance names to L{Instance} objects

  """
  # All instances with their primary node in the group
  instance_query = \
    opcodes.OpQuery(what=constants.QR_INSTANCE,
                    fields=["name", "status", "admin_state", "snodes",
                            "pnode.group.uuid", "snodes.group.uuid"],
                    filter=[qlang.OP_EQUAL, "pnode.group.uuid", uuid],
                    use_locking=True)

  # All nodes in the group
  node_query = \
    opcodes.OpQuery(what=constants.QR_NODE,
                    fields=["name", "bootid", "offline"],
                    filter=[qlang.OP_EQUAL, "group.uuid", uuid],
                    use_locking=True)

  job_id = cl.SubmitJob([instance_query, node_query])
  responses = [objects.QueryResponse.FromDict(raw)
               for raw in cli.PollJob(job_id, cl=cl,
                                      feedback_fn=logging.debug)]
  cl.ArchiveJob(job_id)

  results_data = [response.data for response in responses]

  # Ensure results are tuples with two values
  assert compat.all(map(ht.TListOf(ht.TListOf(ht.TIsLength(2))), results_data))

  # Keep only the second item of every field (the value), dropping the
  # result status
  value_rows = []
  for rows in results_data:
    value_rows.append([[compat.snd(field) for field in row]
                       for row in rows])

  (raw_instances, raw_nodes) = value_rows

  node_secondaries = {}
  group_instances = []

  # Load all instances
  for (name, status, autostart, snodes, pnode_group_uuid,
       snodes_group_uuid) in raw_instances:
    if snodes and set([pnode_group_uuid]) != set(snodes_group_uuid):
      logging.error("Ignoring split instance '%s', primary group %s, secondary"
                    " groups %s", name, pnode_group_uuid,
                    utils.CommaJoin(snodes_group_uuid))
      continue

    group_instances.append(Instance(name, status, autostart, snodes))

    # Remember for every secondary node which instances use it
    for node in snodes:
      node_secondaries.setdefault(node, set()).add(name)

  # Load all nodes
  group_nodes = []
  for (name, bootid, offline) in raw_nodes:
    group_nodes.append(Node(name, bootid, offline,
                            node_secondaries.get(name, set())))

  nodes_by_name = dict((node.name, node) for node in group_nodes)
  instances_by_name = dict((inst.name, inst) for inst in group_instances)

  return (nodes_by_name, instances_by_name)
688 16e0b9c9 Michael Hanselmann
689 16e0b9c9 Michael Hanselmann
690 9bb69bb5 Michael Hanselmann
def _LoadKnownGroups():
  """Returns a list of all node groups known by L{ssconf}.

  Only the first whitespace-separated field of every non-blank line of
  the node group list is used.

  @rtype: list
  @return: First field of every non-blank line
  @raise errors.GenericError: If one of the values doesn't match
    L{utils.UUID_RE}

  """
  uuids = []

  for line in ssconf.SimpleStore().GetNodegroupList():
    # Skip lines consisting of whitespace only
    if line.strip():
      uuids.append(line.split(None, 1)[0])

  for value in uuids:
    if not utils.UUID_RE.match(value):
      raise errors.GenericError("Ssconf contains invalid group UUID")

  return uuids
703 16e0b9c9 Michael Hanselmann
704 16e0b9c9 Michael Hanselmann
705 16e0b9c9 Michael Hanselmann
def _GroupWatcher(opts):
706 16e0b9c9 Michael Hanselmann
  """Main function for per-group watcher process.
707 16e0b9c9 Michael Hanselmann

708 16e0b9c9 Michael Hanselmann
  """
709 16e0b9c9 Michael Hanselmann
  group_uuid = opts.nodegroup.lower()
710 16e0b9c9 Michael Hanselmann
711 16e0b9c9 Michael Hanselmann
  if not utils.UUID_RE.match(group_uuid):
712 16e0b9c9 Michael Hanselmann
    raise errors.GenericError("Node group parameter (%s) must be given a UUID,"
713 16e0b9c9 Michael Hanselmann
                              " got '%s'" %
714 16e0b9c9 Michael Hanselmann
                              (cli.NODEGROUP_OPT_NAME, group_uuid))
715 16e0b9c9 Michael Hanselmann
716 16e0b9c9 Michael Hanselmann
  logging.info("Watcher for node group '%s'", group_uuid)
717 16e0b9c9 Michael Hanselmann
718 9bb69bb5 Michael Hanselmann
  known_groups = _LoadKnownGroups()
719 9bb69bb5 Michael Hanselmann
720 16e0b9c9 Michael Hanselmann
  # Check if node group is known
721 9bb69bb5 Michael Hanselmann
  if group_uuid not in known_groups:
722 16e0b9c9 Michael Hanselmann
    raise errors.GenericError("Node group '%s' is not known by ssconf" %
723 16e0b9c9 Michael Hanselmann
                              group_uuid)
724 16e0b9c9 Michael Hanselmann
725 9bb69bb5 Michael Hanselmann
  # Group UUID has been verified and should not contain any dangerous characters
726 16e0b9c9 Michael Hanselmann
  state_path = constants.WATCHER_GROUP_STATE_FILE % group_uuid
727 9bb69bb5 Michael Hanselmann
  inst_status_path = constants.WATCHER_GROUP_INSTANCE_STATUS_FILE % group_uuid
728 16e0b9c9 Michael Hanselmann
729 16e0b9c9 Michael Hanselmann
  logging.debug("Using state file %s", state_path)
730 16e0b9c9 Michael Hanselmann
731 16e0b9c9 Michael Hanselmann
  # Global watcher
732 b459a848 Andrea Spadaccini
  statefile = state.OpenStateFile(state_path) # pylint: disable=E0602
733 16e0b9c9 Michael Hanselmann
  if not statefile:
734 16e0b9c9 Michael Hanselmann
    return constants.EXIT_FAILURE
735 16e0b9c9 Michael Hanselmann
736 b459a848 Andrea Spadaccini
  notepad = state.WatcherState(statefile) # pylint: disable=E0602
737 16e0b9c9 Michael Hanselmann
  try:
738 16e0b9c9 Michael Hanselmann
    # Connect to master daemon
739 16e0b9c9 Michael Hanselmann
    client = GetLuxiClient(False)
740 16e0b9c9 Michael Hanselmann
741 16e0b9c9 Michael Hanselmann
    _CheckMaster(client)
742 16e0b9c9 Michael Hanselmann
743 16e0b9c9 Michael Hanselmann
    (nodes, instances) = _GetGroupData(client, group_uuid)
744 16e0b9c9 Michael Hanselmann
745 9bb69bb5 Michael Hanselmann
    # Update per-group instance status file
746 9bb69bb5 Michael Hanselmann
    _UpdateInstanceStatus(inst_status_path, instances.values())
747 9bb69bb5 Michael Hanselmann
748 9bb69bb5 Michael Hanselmann
    _MergeInstanceStatus(constants.INSTANCE_STATUS_FILE,
749 9bb69bb5 Michael Hanselmann
                         constants.WATCHER_GROUP_INSTANCE_STATUS_FILE,
750 9bb69bb5 Michael Hanselmann
                         known_groups)
751 9bb69bb5 Michael Hanselmann
752 16e0b9c9 Michael Hanselmann
    started = _CheckInstances(client, notepad, instances)
753 16e0b9c9 Michael Hanselmann
    _CheckDisks(client, notepad, nodes, instances, started)
754 16e0b9c9 Michael Hanselmann
    _VerifyDisks(client, group_uuid, nodes, instances)
755 16e0b9c9 Michael Hanselmann
  except Exception, err:
756 16e0b9c9 Michael Hanselmann
    logging.info("Not updating status file due to failure: %s", err)
757 16e0b9c9 Michael Hanselmann
    raise
758 16e0b9c9 Michael Hanselmann
  else:
759 16e0b9c9 Michael Hanselmann
    # Save changes for next run
760 16e0b9c9 Michael Hanselmann
    notepad.Save(state_path)
761 16e0b9c9 Michael Hanselmann
762 16e0b9c9 Michael Hanselmann
  return constants.EXIT_SUCCESS
763 16e0b9c9 Michael Hanselmann
764 16e0b9c9 Michael Hanselmann
765 9f4bb951 Michael Hanselmann
def Main():
766 a8083063 Iustin Pop
  """Main function.
767 a8083063 Iustin Pop

768 a8083063 Iustin Pop
  """
769 f0a80b01 Michael Hanselmann
  (options, _) = ParseOptions()
770 a8083063 Iustin Pop
771 cfcc79c6 Michael Hanselmann
  utils.SetupLogging(constants.LOG_WATCHER, sys.argv[0],
772 cfcc79c6 Michael Hanselmann
                     debug=options.debug, stderr_logging=options.debug)
773 a8083063 Iustin Pop
774 46c8a6ab Iustin Pop
  if ShouldPause() and not options.ignore_pause:
775 3753b2cb Michael Hanselmann
    logging.debug("Pause has been set, exiting")
776 9f4bb951 Michael Hanselmann
    return constants.EXIT_SUCCESS
777 3753b2cb Michael Hanselmann
778 16e0b9c9 Michael Hanselmann
  # Try to acquire global watcher lock in shared mode
779 16e0b9c9 Michael Hanselmann
  lock = utils.FileLock.Open(constants.WATCHER_LOCK_FILE)
780 a8083063 Iustin Pop
  try:
781 16e0b9c9 Michael Hanselmann
    lock.Shared(blocking=False)
782 16e0b9c9 Michael Hanselmann
  except (EnvironmentError, errors.LockError), err:
783 16e0b9c9 Michael Hanselmann
    logging.error("Can't acquire lock on %s: %s",
784 16e0b9c9 Michael Hanselmann
                  constants.WATCHER_LOCK_FILE, err)
785 16e0b9c9 Michael Hanselmann
    return constants.EXIT_SUCCESS
786 db147305 Tom Limoncelli
787 16e0b9c9 Michael Hanselmann
  if options.nodegroup is None:
788 16e0b9c9 Michael Hanselmann
    fn = _GlobalWatcher
789 16e0b9c9 Michael Hanselmann
  else:
790 16e0b9c9 Michael Hanselmann
    # Per-nodegroup watcher
791 16e0b9c9 Michael Hanselmann
    fn = _GroupWatcher
792 16e0b9c9 Michael Hanselmann
793 16e0b9c9 Michael Hanselmann
  try:
794 16e0b9c9 Michael Hanselmann
    return fn(options)
795 16e0b9c9 Michael Hanselmann
  except (SystemExit, KeyboardInterrupt):
796 1b052f42 Michael Hanselmann
    raise
797 38242904 Iustin Pop
  except NotMasterError:
798 438b45d4 Michael Hanselmann
    logging.debug("Not master, exiting")
799 9f4bb951 Michael Hanselmann
    return constants.EXIT_NOTMASTER
800 89e1fc26 Iustin Pop
  except errors.ResolverError, err:
801 013ce4ae Michael Hanselmann
    logging.error("Cannot resolve hostname '%s', exiting", err.args[0])
802 9f4bb951 Michael Hanselmann
    return constants.EXIT_NODESETUP_ERROR
803 24edc6d4 Iustin Pop
  except errors.JobQueueFull:
804 24edc6d4 Iustin Pop
    logging.error("Job queue is full, can't query cluster state")
805 24edc6d4 Iustin Pop
  except errors.JobQueueDrainError:
806 24edc6d4 Iustin Pop
    logging.error("Job queue is drained, can't maintain cluster state")
807 438b45d4 Michael Hanselmann
  except Exception, err:
808 001b3825 Michael Hanselmann
    logging.exception(str(err))
809 9f4bb951 Michael Hanselmann
    return constants.EXIT_FAILURE
810 5a3103e9 Michael Hanselmann
811 9f4bb951 Michael Hanselmann
  return constants.EXIT_SUCCESS