Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / __init__.py @ 3360026f

History | View | Annotate | Download (25.6 kB)

1 9f4bb951 Michael Hanselmann
#
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 9598b71f Michele Tartara
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 55c85950 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop

24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop

28 5a3103e9 Michael Hanselmann
"""
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
import os
31 cfcc79c6 Michael Hanselmann
import os.path
32 a8083063 Iustin Pop
import sys
33 a8083063 Iustin Pop
import time
34 438b45d4 Michael Hanselmann
import logging
35 16e0b9c9 Michael Hanselmann
import operator
36 9bb69bb5 Michael Hanselmann
import errno
37 a8083063 Iustin Pop
from optparse import OptionParser
38 a8083063 Iustin Pop
39 a8083063 Iustin Pop
from ganeti import utils
40 a8083063 Iustin Pop
from ganeti import constants
41 83e5e26f René Nussbaumer
from ganeti import compat
42 89e1fc26 Iustin Pop
from ganeti import errors
43 e125c67c Michael Hanselmann
from ganeti import opcodes
44 e125c67c Michael Hanselmann
from ganeti import cli
45 9ba38706 Petr Pudlak
import ganeti.rpc.node as rpc
46 9ba38706 Petr Pudlak
import ganeti.rpc.errors as rpcerr
47 db147305 Tom Limoncelli
from ganeti import rapi
48 a744b676 Manuel Franceschini
from ganeti import netutils
49 16e0b9c9 Michael Hanselmann
from ganeti import qlang
50 16e0b9c9 Michael Hanselmann
from ganeti import objects
51 16e0b9c9 Michael Hanselmann
from ganeti import ssconf
52 16e0b9c9 Michael Hanselmann
from ganeti import ht
53 57fe4a5b Michael Hanselmann
from ganeti import pathutils
54 a8083063 Iustin Pop
55 b459a848 Andrea Spadaccini
import ganeti.rapi.client # pylint: disable=W0611
56 fc3f75dd Iustin Pop
from ganeti.rapi.client import UsesRapiClient
57 adf6301e Michael Hanselmann
58 adf6301e Michael Hanselmann
from ganeti.watcher import nodemaint
59 adf6301e Michael Hanselmann
from ganeti.watcher import state
60 db147305 Tom Limoncelli
61 a8083063 Iustin Pop
62 5a3103e9 Michael Hanselmann
#: Attempt threshold compared against the notepad's restart/cleanup counters
MAXTRIES = 5

#: Instance states for which a restart is attempted
BAD_STATES = compat.UniqueFrozenset([constants.INSTST_ERRORDOWN])

#: Instance states in which the watcher does not act on the instance
HELPLESS_STATES = compat.UniqueFrozenset([
  constants.INSTST_NODEDOWN,
  constants.INSTST_NODEOFFLINE,
  ])

# NOTE(review): presumably message severity tags; not used in the part of the
# file reviewed here -- confirm against callers
NOTICE = "NOTICE"
ERROR = "ERROR"

#: Number of seconds to wait between starting child processes for node groups
CHILD_PROCESS_DELAY = 1.0

#: How many seconds to wait for instance status file lock
INSTANCE_STATUS_LOCK_TIMEOUT = 10.0
78 9bb69bb5 Michael Hanselmann
79 e125c67c Michael Hanselmann
80 7bca53e4 Michael Hanselmann
class NotMasterError(errors.GenericError):
  """Signals that the watcher is not running on the master node.

  Raised by L{GetLuxiClient} when connecting to the luxi daemon fails with
  a prerequisite error.

  """
82 a8083063 Iustin Pop
83 a8083063 Iustin Pop
84 3753b2cb Michael Hanselmann
def ShouldPause():
  """Tell whether the watcher has been asked to pause.

  @rtype: bool
  @return: Truth value of whatever reading the watcher pause file yields

  """
  pause_value = utils.ReadWatcherPauseFile(pathutils.WATCHER_PAUSEFILE)
  return bool(pause_value)
89 3753b2cb Michael Hanselmann
90 3753b2cb Michael Hanselmann
91 f1115454 Guido Trotter
def StartNodeDaemons():
  """Ensure the daemons expected on every node are running.

  The node daemon is always ensured; confd and mond are only ensured when
  they are enabled in the build-time constants.

  """
  # on master or not, the node daemon is always wanted
  wanted = [constants.NODED]

  # confd runs in disabled mode on non-candidates, so it is safe everywhere
  if constants.ENABLE_CONFD:
    wanted.append(constants.CONFD)

  # all nodes need monitoring
  if constants.ENABLE_MOND:
    wanted.append(constants.MOND)

  for daemon_name in wanted:
    utils.EnsureDaemon(daemon_name)
103 c300dbe4 Michele Tartara
104 f1115454 Guido Trotter
105 9e289e36 Guido Trotter
def RunWatcherHooks():
106 9e289e36 Guido Trotter
  """Run the watcher hooks.
107 9e289e36 Guido Trotter

108 9e289e36 Guido Trotter
  """
109 57fe4a5b Michael Hanselmann
  hooks_dir = utils.PathJoin(pathutils.HOOKS_BASE_DIR,
110 c4feafe8 Iustin Pop
                             constants.HOOKS_NAME_WATCHER)
111 10e689d4 Iustin Pop
  if not os.path.isdir(hooks_dir):
112 10e689d4 Iustin Pop
    return
113 9e289e36 Guido Trotter
114 9e289e36 Guido Trotter
  try:
115 9e289e36 Guido Trotter
    results = utils.RunParts(hooks_dir)
116 17385bd2 Andrea Spadaccini
  except Exception, err: # pylint: disable=W0703
117 17385bd2 Andrea Spadaccini
    logging.exception("RunParts %s failed: %s", hooks_dir, err)
118 a0aa6b49 Michael Hanselmann
    return
119 9e289e36 Guido Trotter
120 9e289e36 Guido Trotter
  for (relname, status, runresult) in results:
121 9e289e36 Guido Trotter
    if status == constants.RUNPARTS_SKIP:
122 9e289e36 Guido Trotter
      logging.debug("Watcher hook %s: skipped", relname)
123 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_ERR:
124 9e289e36 Guido Trotter
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
125 9e289e36 Guido Trotter
    elif status == constants.RUNPARTS_RUN:
126 9e289e36 Guido Trotter
      if runresult.failed:
127 9e289e36 Guido Trotter
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
128 9e289e36 Guido Trotter
                        relname, runresult.exit_code, runresult.output)
129 9e289e36 Guido Trotter
      else:
130 9e289e36 Guido Trotter
        logging.debug("Watcher hook %s: success (output: %s)", relname,
131 9e289e36 Guido Trotter
                      runresult.output)
132 013ce4ae Michael Hanselmann
    else:
133 013ce4ae Michael Hanselmann
      raise errors.ProgrammerError("Unknown status %s returned by RunParts",
134 013ce4ae Michael Hanselmann
                                   status)
135 9e289e36 Guido Trotter
136 001b3825 Michael Hanselmann
137 a8083063 Iustin Pop
class Instance(object):
  """Lightweight view of a virtual machine instance.

  @ivar name: Instance name
  @ivar status: Instance status, compared against C{constants.INSTST_*}
  @ivar disks_active: Whether the instance's disks are flagged as active
  @ivar snodes: Names of the instance's secondary nodes

  """
  def __init__(self, name, status, disks_active, snodes):
    """Stores the given values on the object.

    """
    self.name = name
    self.status = status
    self.disks_active = disks_active
    self.snodes = snodes

  def Restart(self, cl):
    """Submits a non-forced startup opcode for this instance.

    @param cl: Client passed through to L{cli.SubmitOpCode}

    """
    startup_op = opcodes.OpInstanceStartup(instance_name=self.name,
                                           force=False)
    cli.SubmitOpCode(startup_op, cl=cl)

  def ActivateDisks(self, cl):
    """Submits an opcode activating all disks of this instance.

    @param cl: Client passed through to L{cli.SubmitOpCode}

    """
    activate_op = opcodes.OpInstanceActivateDisks(instance_name=self.name)
    cli.SubmitOpCode(activate_op, cl=cl)
160 a8083063 Iustin Pop
161 a8083063 Iustin Pop
162 16e0b9c9 Michael Hanselmann
class Node(object):
  """Data container representing cluster node.

  Declared as a new-style class, like L{Instance}.

  @ivar name: Node name
  @ivar bootid: Boot ID reported for the node; a false value means that no
    boot ID is available
  @ivar offline: Whether the node is marked offline
  @ivar secondaries: Names of instances having this node as a secondary

  """
  def __init__(self, name, bootid, offline, secondaries):
    """Initializes this class.

    """
    self.name = name
    self.bootid = bootid
    self.offline = offline
    self.secondaries = secondaries
174 5a3103e9 Michael Hanselmann
175 78f44650 Iustin Pop
176 bc69c426 Jose A. Lopes
def _CleanupInstance(cl, notepad, inst, locks):
  """Cleans up an instance that was shut down by the user.

  Submits a shutdown opcode for the instance unless the instance is locked
  or the recorded number of cleanup attempts exceeds L{MAXTRIES}. A failed
  attempt is recorded in the notepad; after a successful one any recorded
  attempts are dropped.

  @param cl: Client passed through to L{cli.SubmitOpCode}
  @param notepad: Watcher state keeping track of cleanup attempts
  @type inst: L{Instance}
  @param inst: Instance to clean up
  @param locks: Container of names of currently locked instances

  """
  name = inst.name
  attempts = notepad.NumberOfCleanupAttempts(name)

  if name in locks:
    logging.info("Not cleaning up instance '%s', instance is locked", name)
    return

  if attempts > MAXTRIES:
    logging.warning("Not cleaning up instance '%s', retries exhausted", name)
    return

  logging.info("Instance '%s' was shutdown by the user, cleaning up instance",
               name)
  shutdown_op = opcodes.OpInstanceShutdown(instance_name=name)

  try:
    cli.SubmitOpCode(shutdown_op, cl=cl)
    # Success: forget about earlier failed attempts, if any
    if notepad.NumberOfCleanupAttempts(name):
      notepad.RemoveInstance(name)
  except Exception: # pylint: disable=W0703
    logging.exception("Error while cleaning up instance '%s'", name)
    notepad.RecordCleanupAttempt(name)
200 e52e0ddc Jose A. Lopes
201 e52e0ddc Jose A. Lopes
202 bc69c426 Jose A. Lopes
def _CheckInstances(cl, notepad, instances, locks):
  """Walks over all instances and restarts the ones that are down.

  Instances shut down by the user are handed to L{_CleanupInstance}.
  Instances in one of L{BAD_STATES} are restarted as long as fewer than
  L{MAXTRIES} attempts are recorded. For all other instances, recorded
  restart attempts are cleared.

  @param cl: Client used for submitting opcodes
  @param notepad: Watcher state keeping track of restart attempts
  @type instances: dict
  @param instances: L{Instance} objects indexed by instance name
  @param locks: Container of names of currently locked instances
  @rtype: set
  @return: Names of the instances whose restart was submitted successfully

  """
  notepad.MaintainInstanceList(instances.keys())

  restarted = set()

  for inst in instances.values():
    if inst.status == constants.INSTST_USERDOWN:
      _CleanupInstance(cl, notepad, inst, locks)
      continue

    if inst.status not in BAD_STATES:
      # Nothing to restart; clear bookkeeping left from earlier attempts
      if notepad.NumberOfRestartAttempts(inst.name):
        notepad.RemoveInstance(inst.name)
        if inst.status not in HELPLESS_STATES:
          logging.info("Restart of instance '%s' succeeded", inst.name)
      continue

    attempts = notepad.NumberOfRestartAttempts(inst.name)

    if attempts > MAXTRIES:
      logging.warning("Not restarting instance '%s', retries exhausted",
                      inst.name)
      continue

    if attempts == MAXTRIES:
      # Record one more attempt so that the next run takes the branch above
      notepad.RecordRestartAttempt(inst.name)
      logging.error("Could not restart instance '%s' after %s attempts,"
                    " giving up", inst.name, MAXTRIES)
      continue

    try:
      logging.info("Restarting instance '%s' (attempt #%s)",
                   inst.name, attempts + 1)
      inst.Restart(cl)
    except Exception: # pylint: disable=W0703
      logging.exception("Error while restarting instance '%s'", inst.name)
    else:
      restarted.add(inst.name)

    # Counted regardless of whether the restart worked
    notepad.RecordRestartAttempt(inst.name)

  return restarted
245 a8083063 Iustin Pop
246 a8083063 Iustin Pop
247 16e0b9c9 Michael Hanselmann
def _CheckDisks(cl, notepad, nodes, instances, started):
  """Activates disks of instances whose secondary node changed its boot ID.

  A node is looked at when its current boot ID differs from the one stored
  in the notepad. For every such node the disks of all instances having it
  as secondary are activated, except for instances whose disks are not
  flagged active and instances contained in C{started}. Afterwards the new
  boot IDs are stored in the notepad.

  @param cl: Client used for submitting opcodes
  @param notepad: Watcher state holding the known boot IDs
  @type nodes: dict
  @param nodes: L{Node} objects indexed by name
  @type instances: dict
  @param instances: L{Instance} objects indexed by name
  @param started: Names of instances that were just restarted

  """
  rebooted = []

  for node in nodes.values():
    known_bootid = notepad.GetNodeBootID(node.name)

    if node.bootid:
      if known_bootid != node.bootid:
        # Node's boot ID has changed, probably through a reboot
        rebooted.append(node)
    elif not node.offline:
      # Bad node, not returning a boot id
      logging.debug("Node '%s' missing boot ID, skipping secondary checks",
                    node.name)

  # Activate disks for all instances with any of the checked nodes as a
  # secondary node.
  for node in rebooted:
    for instance_name in node.secondaries:
      if instance_name not in instances:
        logging.info("Can't find instance '%s', maybe it was ignored",
                     instance_name)
        continue

      inst = instances[instance_name]

      if not inst.disks_active:
        logging.info("Skipping disk activation for instance with not"
                     " activated disks '%s'", inst.name)
        continue

      if inst.name in started:
        # we already tried to start the instance, which should have
        # activated its drives (if they can be at all)
        logging.debug("Skipping disk activation for instance '%s' as"
                      " it was already started", inst.name)
        continue

      try:
        logging.info("Activating disks for instance '%s'", inst.name)
        inst.ActivateDisks(cl)
      except Exception: # pylint: disable=W0703
        logging.exception("Error while activating disks for instance '%s'",
                          inst.name)

  # Keep changed boot IDs
  for node in rebooted:
    notepad.SetNodeBootID(node.name, node.bootid)
300 a8083063 Iustin Pop
301 83e5e26f René Nussbaumer
302 16e0b9c9 Michael Hanselmann
def _CheckForOfflineNodes(nodes, instance):
303 16e0b9c9 Michael Hanselmann
  """Checks if given instances has any secondary in offline status.
304 ae1a845c Michael Hanselmann

305 16e0b9c9 Michael Hanselmann
  @param instance: The instance object
306 16e0b9c9 Michael Hanselmann
  @return: True if any of the secondary is offline, False otherwise
307 ae1a845c Michael Hanselmann

308 16e0b9c9 Michael Hanselmann
  """
309 16e0b9c9 Michael Hanselmann
  return compat.any(nodes[node_name].offline for node_name in instance.snodes)
310 ae1a845c Michael Hanselmann
311 ae1a845c Michael Hanselmann
312 16e0b9c9 Michael Hanselmann
def _VerifyDisks(cl, uuid, nodes, instances):
  """Run a per-group "gnt-cluster verify-disks".

  Submits a disk verification job for the group and, for the instances it
  reports as having offline disks, submits a single job activating their
  disks. Unknown instances, instances in one of L{HELPLESS_STATES} and
  instances with offline secondaries are left alone.

  @param cl: Client used for submitting and polling jobs
  @param uuid: Value passed as C{group_name} to the verification opcode
  @type nodes: dict
  @param nodes: Node objects indexed by name
  @type instances: dict
  @param instances: L{Instance} objects indexed by name

  """
  verify_job_id = cl.SubmitJob([opcodes.OpGroupVerifyDisks(group_name=uuid)])
  (verify_result, ) = cli.PollJob(verify_job_id, cl=cl,
                                  feedback_fn=logging.debug)
  (_, offline_disk_instances, _) = verify_result
  cl.ArchiveJob(verify_job_id)

  if not offline_disk_instances:
    # nothing to do
    logging.debug("Verify-disks reported no offline disks, nothing to do")
    return

  logging.debug("Will activate disks for instance(s) %s",
                utils.CommaJoin(offline_disk_instances))

  # We submit only one job, and wait for it. Not optimal, but this puts less
  # load on the job queue.
  activate_ops = []
  for name in offline_disk_instances:
    if name not in instances:
      logging.info("Can't find instance '%s', maybe it was ignored", name)
      continue

    inst = instances[name]

    if inst.status in HELPLESS_STATES or _CheckForOfflineNodes(nodes, inst):
      logging.info("Skipping instance '%s' because it is in a helpless state"
                   " or has offline secondaries", name)
      continue

    activate_ops.append(opcodes.OpInstanceActivateDisks(instance_name=name))

  if not activate_ops:
    return

  activate_job_id = cli.SendJob(activate_ops, cl=cl)

  try:
    cli.PollJob(activate_job_id, cl=cl, feedback_fn=logging.debug)
  except Exception: # pylint: disable=W0703
    logging.exception("Error while activating disks")
353 a8083063 Iustin Pop
354 a8083063 Iustin Pop
355 db147305 Tom Limoncelli
def IsRapiResponding(hostname):
356 db147305 Tom Limoncelli
  """Connects to RAPI port and does a simple test.
357 db147305 Tom Limoncelli

358 db147305 Tom Limoncelli
  Connects to RAPI port of hostname and does a simple test. At this time, the
359 db147305 Tom Limoncelli
  test is GetVersion.
360 db147305 Tom Limoncelli

361 6177890b Michele Tartara
  If RAPI responds with error code "401 Unauthorized", the test is successful,
362 6177890b Michele Tartara
  because the aim of this function is to assess whether RAPI is responding, not
363 6177890b Michele Tartara
  if it is accessible.
364 6177890b Michele Tartara

365 db147305 Tom Limoncelli
  @type hostname: string
366 db147305 Tom Limoncelli
  @param hostname: hostname of the node to connect to.
367 db147305 Tom Limoncelli
  @rtype: bool
368 db147305 Tom Limoncelli
  @return: Whether RAPI is working properly
369 db147305 Tom Limoncelli

370 db147305 Tom Limoncelli
  """
371 34f06005 Iustin Pop
  curl_config = rapi.client.GenericCurlConfig()
372 2a7c3583 Michael Hanselmann
  rapi_client = rapi.client.GanetiRapiClient(hostname,
373 2a7c3583 Michael Hanselmann
                                             curl_config_fn=curl_config)
374 db147305 Tom Limoncelli
  try:
375 db147305 Tom Limoncelli
    master_version = rapi_client.GetVersion()
376 db147305 Tom Limoncelli
  except rapi.client.CertificateError, err:
377 d7c42723 Michael Hanselmann
    logging.warning("RAPI certificate error: %s", err)
378 db147305 Tom Limoncelli
    return False
379 db147305 Tom Limoncelli
  except rapi.client.GanetiApiError, err:
380 6177890b Michele Tartara
    if err.code == 401:
381 6177890b Michele Tartara
      # Unauthorized, but RAPI is alive and responding
382 6177890b Michele Tartara
      return True
383 6177890b Michele Tartara
    else:
384 6177890b Michele Tartara
      logging.warning("RAPI error: %s", err)
385 6177890b Michele Tartara
      return False
386 d7c42723 Michael Hanselmann
  else:
387 d7c42723 Michael Hanselmann
    logging.debug("Reported RAPI version %s", master_version)
388 d7c42723 Michael Hanselmann
    return master_version == constants.RAPI_VERSION
389 db147305 Tom Limoncelli
390 db147305 Tom Limoncelli
391 a8083063 Iustin Pop
def ParseOptions():
  """Parse the command line options.

  The job age is converted using L{cli.ParseTimespec}. Positional
  arguments are rejected through the parser's error reporting.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  optparser = OptionParser(description="Ganeti cluster watcher",
                           usage="%prog [-d]", version=version_text)

  optparser.add_option(cli.DEBUG_OPT)
  optparser.add_option(cli.NODEGROUP_OPT)

  job_age_help = "Autoarchive jobs older than this age (default 6 hours)"
  optparser.add_option("-A", "--job-age", dest="job_age",
                       default=6 * 3600, help=job_age_help)

  optparser.add_option("--ignore-pause", dest="ignore_pause",
                       action="store_true", default=False,
                       help="Ignore cluster pause setting")

  # Both switches share one destination; see optparse documentation for why
  # the default value is not set by the options themselves
  optparser.add_option("--wait-children", dest="wait_children",
                       action="store_true", help="Wait for child processes")
  optparser.add_option("--no-wait-children", dest="wait_children",
                       action="store_false",
                       help="Don't wait for child processes")
  optparser.set_defaults(wait_children=True)

  (options, args) = optparser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)

  if args:
    optparser.error("No arguments expected")

  return (options, args)
423 a8083063 Iustin Pop
424 a8083063 Iustin Pop
425 9bb69bb5 Michael Hanselmann
def _WriteInstanceStatus(filename, data):
  """Writes the per-group instance status file.

  One line of the form "<name> <status>" is written per entry; the entries
  are sorted before writing.

  @type filename: string
  @param filename: Path to instance status file
  @type data: list of tuple; (instance name as string, status as string)
  @param data: Instance name and status

  """
  logging.debug("Updating instance status file '%s' with %s instances",
                filename, len(data))

  contents = "".join("%s %s\n" % entry for entry in sorted(data))

  utils.WriteFile(filename, data=contents)
442 9bb69bb5 Michael Hanselmann
443 9bb69bb5 Michael Hanselmann
444 9bb69bb5 Michael Hanselmann
def _UpdateInstanceStatus(filename, instances):
  """Writes an instance status file from L{Instance} objects.

  @type filename: string
  @param filename: Path to status file
  @type instances: list of L{Instance}
  @param instances: Instances whose name and status are written

  """
  name_status_pairs = [(inst.name, inst.status) for inst in instances]
  _WriteInstanceStatus(filename, name_status_pairs)
454 9bb69bb5 Michael Hanselmann
455 9bb69bb5 Michael Hanselmann
456 9bb69bb5 Michael Hanselmann
def _ReadInstanceStatus(filename):
457 9bb69bb5 Michael Hanselmann
  """Reads an instance status file.
458 9bb69bb5 Michael Hanselmann

459 9bb69bb5 Michael Hanselmann
  @type filename: string
460 9bb69bb5 Michael Hanselmann
  @param filename: Path to status file
461 9bb69bb5 Michael Hanselmann
  @rtype: tuple; (None or number, list of lists containing instance name and
462 9bb69bb5 Michael Hanselmann
    status)
463 9bb69bb5 Michael Hanselmann
  @return: File's mtime and instance status contained in the file; mtime is
464 9bb69bb5 Michael Hanselmann
    C{None} if file can't be read
465 9bb69bb5 Michael Hanselmann

466 9bb69bb5 Michael Hanselmann
  """
467 9bb69bb5 Michael Hanselmann
  logging.debug("Reading per-group instance status from '%s'", filename)
468 9bb69bb5 Michael Hanselmann
469 2635bb04 Michael Hanselmann
  statcb = utils.FileStatHelper()
470 9bb69bb5 Michael Hanselmann
  try:
471 9bb69bb5 Michael Hanselmann
    content = utils.ReadFile(filename, preread=statcb)
472 9bb69bb5 Michael Hanselmann
  except EnvironmentError, err:
473 9bb69bb5 Michael Hanselmann
    if err.errno == errno.ENOENT:
474 9bb69bb5 Michael Hanselmann
      logging.error("Can't read '%s', does not exist (yet)", filename)
475 9bb69bb5 Michael Hanselmann
    else:
476 9bb69bb5 Michael Hanselmann
      logging.exception("Unable to read '%s', ignoring", filename)
477 9bb69bb5 Michael Hanselmann
    return (None, None)
478 9bb69bb5 Michael Hanselmann
  else:
479 6f9e71bb Michael Hanselmann
    return (statcb.st.st_mtime, [line.split(None, 1)
480 9bb69bb5 Michael Hanselmann
                                 for line in content.splitlines()])
481 9bb69bb5 Michael Hanselmann
482 9bb69bb5 Michael Hanselmann
483 9bb69bb5 Michael Hanselmann
def _MergeInstanceStatus(filename, pergroup_filename, groups):
484 9bb69bb5 Michael Hanselmann
  """Merges all per-group instance status files into a global one.
485 9bb69bb5 Michael Hanselmann

486 9bb69bb5 Michael Hanselmann
  @type filename: string
487 9bb69bb5 Michael Hanselmann
  @param filename: Path to global instance status file
488 9bb69bb5 Michael Hanselmann
  @type pergroup_filename: string
489 9bb69bb5 Michael Hanselmann
  @param pergroup_filename: Path to per-group status files, must contain "%s"
490 9bb69bb5 Michael Hanselmann
    to be replaced with group UUID
491 9bb69bb5 Michael Hanselmann
  @type groups: sequence
492 9bb69bb5 Michael Hanselmann
  @param groups: UUIDs of known groups
493 9bb69bb5 Michael Hanselmann

494 9bb69bb5 Michael Hanselmann
  """
495 9bb69bb5 Michael Hanselmann
  # Lock global status file in exclusive mode
496 9bb69bb5 Michael Hanselmann
  lock = utils.FileLock.Open(filename)
497 9bb69bb5 Michael Hanselmann
  try:
498 9bb69bb5 Michael Hanselmann
    lock.Exclusive(blocking=True, timeout=INSTANCE_STATUS_LOCK_TIMEOUT)
499 9bb69bb5 Michael Hanselmann
  except errors.LockError, err:
500 9bb69bb5 Michael Hanselmann
    # All per-group processes will lock and update the file. None of them
501 9bb69bb5 Michael Hanselmann
    # should take longer than 10 seconds (the value of
502 9bb69bb5 Michael Hanselmann
    # INSTANCE_STATUS_LOCK_TIMEOUT).
503 9bb69bb5 Michael Hanselmann
    logging.error("Can't acquire lock on instance status file '%s', not"
504 9bb69bb5 Michael Hanselmann
                  " updating: %s", filename, err)
505 9bb69bb5 Michael Hanselmann
    return
506 9bb69bb5 Michael Hanselmann
507 9bb69bb5 Michael Hanselmann
  logging.debug("Acquired exclusive lock on '%s'", filename)
508 9bb69bb5 Michael Hanselmann
509 9bb69bb5 Michael Hanselmann
  data = {}
510 9bb69bb5 Michael Hanselmann
511 9bb69bb5 Michael Hanselmann
  # Load instance status from all groups
512 9bb69bb5 Michael Hanselmann
  for group_uuid in groups:
513 9bb69bb5 Michael Hanselmann
    (mtime, instdata) = _ReadInstanceStatus(pergroup_filename % group_uuid)
514 9bb69bb5 Michael Hanselmann
515 9bb69bb5 Michael Hanselmann
    if mtime is not None:
516 9bb69bb5 Michael Hanselmann
      for (instance_name, status) in instdata:
517 9bb69bb5 Michael Hanselmann
        data.setdefault(instance_name, []).append((mtime, status))
518 9bb69bb5 Michael Hanselmann
519 9bb69bb5 Michael Hanselmann
  # Select last update based on file mtime
520 9bb69bb5 Michael Hanselmann
  inststatus = [(instance_name, sorted(status, reverse=True)[0][1])
521 9bb69bb5 Michael Hanselmann
                for (instance_name, status) in data.items()]
522 9bb69bb5 Michael Hanselmann
523 9bb69bb5 Michael Hanselmann
  # Write the global status file. Don't touch file after it's been
524 9bb69bb5 Michael Hanselmann
  # updated--there is no lock anymore.
525 9bb69bb5 Michael Hanselmann
  _WriteInstanceStatus(filename, inststatus)
526 8f07dc0d Michael Hanselmann
527 8f07dc0d Michael Hanselmann
528 39bdcf76 Klaus Aehlig
def GetLuxiClient(try_restart):
  """Obtains a client connection to the luxi daemon.

  If the first connection attempt fails because no master daemon is
  reachable and C{try_restart} is set, the luxi daemon is started via
  L{utils.EnsureDaemon} and the connection is attempted once more.

  @type try_restart: bool
  @param try_restart: Whether to attempt to restart the master daemon
  @return: the client object returned by L{cli.GetClient}
  @raise NotMasterError: if L{cli.GetClient} fails with
    L{errors.OpPrereqError}
  @raise errors.GenericError: if the daemon could not be started

  """
  try:
    return cli.GetClient()
  except errors.OpPrereqError as err:
    # cli.GetClient reports "not the master" through this exception
    raise NotMasterError("Not on master node (%s)" % err)
  except rpcerr.NoMasterError as err:
    if try_restart:
      logging.warning("Luxi daemon seems to be down (%s), trying to restart",
                      err)

      if utils.EnsureDaemon(constants.LUXID):
        # Daemon is (now) running, give the connection a second chance
        return cli.GetClient()

      raise errors.GenericError("Can't start the master daemon")

    # Caller did not ask for a restart, hand the original error back
    raise
553 16e0b9c9 Michael Hanselmann
554 16e0b9c9 Michael Hanselmann
555 16e0b9c9 Michael Hanselmann
def _StartGroupChildren(cl, wait):
  """Spawns one per-group watcher process for every node group.

  Each child is started with the current command line plus the node group
  option and the group's UUID. A failure to start one child is logged and
  does not prevent the remaining children from being started.

  @param cl: client object providing C{QueryGroups}
  @type wait: bool
  @param wait: Whether to wait for all started children to exit

  """
  # The global watcher must not have been given a node group itself
  assert not compat.any(arg.startswith(cli.NODEGROUP_OPT_NAME)
                        for arg in sys.argv)

  groups = cl.QueryGroups([], ["name", "uuid"], False)

  started_pids = []
  need_delay = False

  for (group_name, group_uuid) in groups:
    child_args = sys.argv + [cli.NODEGROUP_OPT_NAME, group_uuid]

    if need_delay:
      # Let's not kill the system: space out all but the first child
      time.sleep(CHILD_PROCESS_DELAY)
    need_delay = True

    logging.debug("Spawning child for group '%s' (%s), arguments %s",
                  group_name, group_uuid, child_args)

    try:
      # TODO: Should utils.StartDaemon be used instead?
      child_pid = os.spawnv(os.P_NOWAIT, child_args[0], child_args)
    except Exception: # pylint: disable=W0703
      logging.exception("Failed to start child for group '%s' (%s)",
                        group_name, group_uuid)
    else:
      logging.debug("Started with PID %s", child_pid)
      started_pids.append(child_pid)

  if not wait:
    return

  for child_pid in started_pids:
    logging.debug("Waiting for child PID %s", child_pid)
    try:
      exit_info = utils.RetryOnSignal(os.waitpid, child_pid, 0)
    except EnvironmentError as err:
      # Report the error text in place of the wait status
      exit_info = str(err)

    logging.debug("Child PID %s exited with status %s", child_pid, exit_info)
595 16e0b9c9 Michael Hanselmann
596 16e0b9c9 Michael Hanselmann
597 16e0b9c9 Michael Hanselmann
def _ArchiveJobs(cl, age):
  """Asks the master daemon to archive old jobs and logs the outcome.

  @param cl: client object providing C{AutoArchiveJobs}
  @param age: age value passed unchanged to C{AutoArchiveJobs}

  """
  outcome = cl.AutoArchiveJobs(age)
  # The call yields exactly two counters: archived and remaining jobs
  (archived, remaining) = outcome
  logging.debug("Archived %s jobs, left %s", archived, remaining)
603 16e0b9c9 Michael Hanselmann
604 16e0b9c9 Michael Hanselmann
605 16e0b9c9 Michael Hanselmann
def _CheckMaster(cl):
  """Verifies that the local host is the configured master node.

  @param cl: client object providing C{QueryConfigValues}
  @raise NotMasterError: if the configured master node name differs from
    the local system name

  """
  (configured_master, ) = cl.QueryConfigValues(["master_node"])
  local_name = netutils.Hostname.GetSysName()

  if configured_master == local_name:
    return

  raise NotMasterError("This is not the master node")
612 16e0b9c9 Michael Hanselmann
613 16e0b9c9 Michael Hanselmann
614 fc3f75dd Iustin Pop
@UsesRapiClient
def _GlobalWatcher(opts):
  """Main function for global watcher.

  Starts the node daemons, runs the watcher hooks and node maintenance and,
  if this node turns out to be the master, makes sure the remote API daemon
  is running and answering, archives old jobs and finally spawns one child
  process per node group.

  @param opts: parsed command line options; C{job_age} and
    C{wait_children} are used
  @return: L{constants.EXIT_SUCCESS}, also when not running on the master
  @raise NotMasterError: if L{_CheckMaster} finds another master configured

  """
  StartNodeDaemons()
  RunWatcherHooks()

  # Run node maintenance in all cases, even if master, so that old masters can
  # be properly cleaned up
  if nodemaint.NodeMaintenance.ShouldRun(): # pylint: disable=E0602
    nodemaint.NodeMaintenance().Exec() # pylint: disable=E0602

  try:
    client = GetLuxiClient(True)
  except NotMasterError:
    # Don't proceed on non-master nodes
    return constants.EXIT_SUCCESS

  # we are on master now
  utils.EnsureDaemon(constants.RAPI)

  # If RAPI isn't responding to queries, try one restart
  logging.debug("Attempting to talk to remote API on %s",
                constants.IP4_ADDRESS_LOCALHOST)
  rapi_ok = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)
  if not rapi_ok:
    logging.warning("Couldn't get answer from remote API, restarting daemon")
    utils.StopDaemon(constants.RAPI)
    utils.EnsureDaemon(constants.RAPI)
    logging.debug("Second attempt to talk to remote API")
    rapi_ok = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)

  # Only claim success if one of the attempts actually got an answer; an
  # unresponsive RAPI is reported but does not stop the remaining work
  if rapi_ok:
    logging.debug("Successfully talked to remote API")
  else:
    logging.fatal("RAPI is not responding")

  _CheckMaster(client)
  _ArchiveJobs(client, opts.job_age)

  # Spawn child processes for all node groups
  _StartGroupChildren(client, opts.wait_children)

  return constants.EXIT_SUCCESS
657 16e0b9c9 Michael Hanselmann
658 16e0b9c9 Michael Hanselmann
659 20fb929a Helga Velroyen
def _GetGroupData(qcl, uuid):
  """Retrieves instances and nodes per node group.

  Three queries are issued through C{qcl}: one for locks, one for the
  instances whose primary node is in the group and one for the nodes of
  the group.

  @param qcl: client object providing C{Query}
  @param uuid: UUID of the node group to collect data for
  @rtype: tuple
  @return: tuple of (dict of L{Node} objects by name, dict of L{Instance}
    objects by name, set of names of instances for which the lock query
    reported a true mode value)

  """
  lock_result = qcl.Query(constants.QR_LOCK, ["name", "mode"], None)

  inst_prefix = "instance/"
  strip_len = len(inst_prefix)

  # Lock names look like "instance/<name>"; keep only the instance name of
  # those locks whose mode field is set
  locked_instances = set(lock_name[strip_len:]
                         for ([_, lock_name], [_, lock_mode])
                         in lock_result.data
                         if lock_name.startswith(inst_prefix) and lock_mode)

  queries = [
      (constants.QR_INSTANCE,
       ["name", "status", "disks_active", "snodes",
        "pnode.group.uuid", "snodes.group.uuid"],
       [qlang.OP_EQUAL, "pnode.group.uuid", uuid]),
      (constants.QR_NODE,
       ["name", "bootid", "offline"],
       [qlang.OP_EQUAL, "group.uuid", uuid]),
      ]

  responses = [qcl.Query(what, fields, qfilter)
               for (what, fields, qfilter) in queries]
  results_data = [response.data for response in responses]

  # Ensure results are tuples with two values
  assert compat.all(map(ht.TListOf(ht.TListOf(ht.TIsLength(2))), results_data))

  # Every field comes as a (status, value) pair; keep only the value
  (instance_rows, node_rows) = results_data
  raw_instances = [[compat.snd(cell) for cell in row]
                   for row in instance_rows]
  raw_nodes = [[compat.snd(cell) for cell in row]
               for row in node_rows]

  secondaries = {}
  instances_by_name = {}

  # Load all instances
  for (name, status, disks_active, snodes, pnode_group_uuid,
       snodes_group_uuid) in raw_instances:
    if snodes and set([pnode_group_uuid]) != set(snodes_group_uuid):
      logging.error("Ignoring split instance '%s', primary group %s, secondary"
                    " groups %s", name, pnode_group_uuid,
                    utils.CommaJoin(snodes_group_uuid))
      continue

    inst = Instance(name, status, disks_active, snodes)
    instances_by_name[inst.name] = inst

    # Remember for each secondary node which instances live on it
    for node in snodes:
      secondaries.setdefault(node, set()).add(name)

  # Load all nodes
  nodes_by_name = {}
  for (name, bootid, offline) in raw_nodes:
    node = Node(name, bootid, offline, secondaries.get(name, set()))
    nodes_by_name[node.name] = node

  return (nodes_by_name, instances_by_name, locked_instances)
721 16e0b9c9 Michael Hanselmann
722 16e0b9c9 Michael Hanselmann
723 9bb69bb5 Michael Hanselmann
def _LoadKnownGroups():
  """Returns a list of all node groups known by L{ssconf}.

  @rtype: list
  @return: the first whitespace-separated field of every non-blank line of
    the ssconf node group list
  @raise errors.GenericError: if any of these fields is not matched by
    L{utils.UUID_RE}

  """
  group_uuids = []

  for line in ssconf.SimpleStore().GetNodegroupList():
    if line.strip():
      # Anything after the first field is not needed here
      group_uuids.append(line.split(None, 1)[0])

  for candidate in group_uuids:
    if not utils.UUID_RE.match(candidate):
      raise errors.GenericError("Ssconf contains invalid group UUID")

  return group_uuids
736 16e0b9c9 Michael Hanselmann
737 16e0b9c9 Michael Hanselmann
738 16e0b9c9 Michael Hanselmann
def _GroupWatcher(opts):
  """Main function for per-group watcher process.

  Validates the node group given on the command line, refreshes the
  per-group and merged instance status files and then checks instances and
  disks of the group. The watcher state is only saved if all of that
  succeeded.

  @param opts: parsed command line options; C{nodegroup} is used
  @return: L{constants.EXIT_SUCCESS}, or L{constants.EXIT_FAILURE} if the
    state file could not be opened
  @raise errors.GenericError: if the node group is not a UUID or is not
    known by ssconf

  """
  group_uuid = opts.nodegroup.lower()

  if not utils.UUID_RE.match(group_uuid):
    raise errors.GenericError("Node group parameter (%s) must be given a UUID,"
                              " got '%s'" %
                              (cli.NODEGROUP_OPT_NAME, group_uuid))

  logging.info("Watcher for node group '%s'", group_uuid)

  known_groups = _LoadKnownGroups()

  # Check if node group is known
  if group_uuid not in known_groups:
    raise errors.GenericError("Node group '%s' is not known by ssconf" %
                              group_uuid)

  # Group UUID has been verified and should not contain any dangerous
  # characters, hence it can be used to build file names
  state_path = pathutils.WATCHER_GROUP_STATE_FILE % group_uuid
  inst_status_path = pathutils.WATCHER_GROUP_INSTANCE_STATUS_FILE % group_uuid

  logging.debug("Using state file %s", state_path)

  # State file of this node group's watcher
  statefile = state.OpenStateFile(state_path) # pylint: disable=E0602
  if not statefile:
    return constants.EXIT_FAILURE

  notepad = state.WatcherState(statefile) # pylint: disable=E0602
  try:
    # Connect to master daemon, without trying to restart it
    client = GetLuxiClient(False)

    _CheckMaster(client)

    (nodes, instances, locks) = _GetGroupData(client, group_uuid)

    # Update per-group instance status file
    _UpdateInstanceStatus(inst_status_path, instances.values())

    # Combine the status files of all known groups into the global one
    _MergeInstanceStatus(pathutils.INSTANCE_STATUS_FILE,
                         pathutils.WATCHER_GROUP_INSTANCE_STATUS_FILE,
                         known_groups)

    started = _CheckInstances(client, notepad, instances, locks)
    _CheckDisks(client, notepad, nodes, instances, started)
    _VerifyDisks(client, group_uuid, nodes, instances)
  except Exception as err:
    logging.info("Not updating status file due to failure: %s", err)
    raise
  else:
    # Save changes for next run
    notepad.Save(state_path)

  return constants.EXIT_SUCCESS
797 16e0b9c9 Michael Hanselmann
798 16e0b9c9 Michael Hanselmann
799 9f4bb951 Michael Hanselmann
def Main():
  """Main function.

  Sets up logging, honours the watcher pause, takes the global watcher lock
  in shared mode and then runs either the global or the per-group watcher,
  translating the exceptions they raise into exit codes.

  @return: one of the C{constants.EXIT_*} exit codes

  """
  (options, _) = ParseOptions()

  utils.SetupLogging(pathutils.LOG_WATCHER, sys.argv[0],
                     debug=options.debug, stderr_logging=options.debug)

  if ShouldPause() and not options.ignore_pause:
    logging.debug("Pause has been set, exiting")
    return constants.EXIT_SUCCESS

  # Try to acquire global watcher lock in shared mode
  lock = utils.FileLock.Open(pathutils.WATCHER_LOCK_FILE)
  try:
    lock.Shared(blocking=False)
  except (EnvironmentError, errors.LockError) as err:
    logging.error("Can't acquire lock on %s: %s",
                  pathutils.WATCHER_LOCK_FILE, err)
    return constants.EXIT_SUCCESS

  # Without a node group this is the global watcher, otherwise the
  # per-nodegroup watcher
  fn = _GlobalWatcher if options.nodegroup is None else _GroupWatcher

  try:
    return fn(options)
  except (SystemExit, KeyboardInterrupt):
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    return constants.EXIT_NOTMASTER
  except errors.ResolverError as err:
    logging.error("Cannot resolve hostname '%s', exiting", err.args[0])
    return constants.EXIT_NODESETUP_ERROR
  except errors.JobQueueFull:
    # Logged as an error, but the exit code stays successful
    logging.error("Job queue is full, can't query cluster state")
    return constants.EXIT_SUCCESS
  except errors.JobQueueDrainError:
    # Same as above: report only
    logging.error("Job queue is drained, can't maintain cluster state")
    return constants.EXIT_SUCCESS
  except Exception as err:
    logging.exception(str(err))
    return constants.EXIT_FAILURE