root / daemons / ganeti-watcher @ 2a7c3583

#!/usr/bin/python
#

# Copyright (C) 2006, 2007, 2008 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Tool to restart erroneously downed virtual machines.

This program and set of classes implement a watchdog to restart
virtual machines in a Ganeti cluster that have crashed or been killed
by a node reboot.  Run from cron or similar.

"""

# pylint: disable-msg=C0103,W0142

# C0103: Invalid name ganeti-watcher

import os
import sys
import time
import logging
from optparse import OptionParser

from ganeti import utils
from ganeti import constants
from ganeti import serializer
from ganeti import errors
from ganeti import opcodes
from ganeti import cli
from ganeti import luxi
from ganeti import ssconf
from ganeti import bdev
from ganeti import hypervisor
from ganeti import rapi
from ganeti.confd import client as confd_client

import ganeti.rapi.client # pylint: disable-msg=W0611


MAXTRIES = 5
BAD_STATES = ['ERROR_down']
HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline']
NOTICE = 'NOTICE'
ERROR = 'ERROR'
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object
client = None


class NotMasterError(errors.GenericError):
  """Exception raised when this host is not the master."""


def ShouldPause():
  """Check whether we should pause.

  """
  return bool(utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE))


def StartNodeDaemons():
  """Start all the daemons that should be running on all nodes.

  """
  # on master or not, try to start the node daemon
  utils.EnsureDaemon(constants.NODED)
  # start confd as well. On non candidates it will be in disabled mode.
  utils.EnsureDaemon(constants.CONFD)


def RunWatcherHooks():
  """Run the watcher hooks.

  """
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
                             constants.HOOKS_NAME_WATCHER)
  if not os.path.isdir(hooks_dir):
    return

  try:
    results = utils.RunParts(hooks_dir)
  except Exception, msg: # pylint: disable-msg=W0703
    logging.critical("RunParts %s failed: %s", hooks_dir, msg)
    # without results there is nothing to report on below
    return

  for (relname, status, runresult) in results:
    if status == constants.RUNPARTS_SKIP:
      logging.debug("Watcher hook %s: skipped", relname)
    elif status == constants.RUNPARTS_ERR:
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
    elif status == constants.RUNPARTS_RUN:
      if runresult.failed:
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
                        relname, runresult.exit_code, runresult.output)
      else:
        logging.debug("Watcher hook %s: success (output: %s)", relname,
                      runresult.output)


class NodeMaintenance(object):
  """Talks to confd daemons and possibly shuts down instances/DRBD devices.

  """
  def __init__(self):
    self.store_cb = confd_client.StoreResultCallback()
    self.filter_cb = confd_client.ConfdFilterCallback(self.store_cb)
    self.confd_client = confd_client.GetConfdClient(self.filter_cb)

  @staticmethod
  def ShouldRun():
    """Checks whether node maintenance should run.

    """
    try:
      return ssconf.SimpleStore().GetMaintainNodeHealth()
    except errors.ConfigurationError, err:
      logging.error("Configuration error, not activating node maintenance: %s",
                    err)
      return False

  @staticmethod
  def GetRunningInstances():
    """Compute list of hypervisor/running instances.

    """
    hyp_list = ssconf.SimpleStore().GetHypervisorList()
    results = []
    for hv_name in hyp_list:
      try:
        hv = hypervisor.GetHypervisor(hv_name)
        ilist = hv.ListInstances()
        results.extend([(iname, hv_name) for iname in ilist])
      except: # pylint: disable-msg=W0702
        logging.error("Error while listing instances for hypervisor %s",
                      hv_name, exc_info=True)
    return results

  @staticmethod
  def GetUsedDRBDs():
    """Get list of used DRBD minors.

    """
    return bdev.DRBD8.GetUsedDevs().keys()

  @classmethod
  def DoMaintenance(cls, role):
    """Maintain the instance list.

    """
    if role == constants.CONFD_NODE_ROLE_OFFLINE:
      inst_running = cls.GetRunningInstances()
      cls.ShutdownInstances(inst_running)
      drbd_running = cls.GetUsedDRBDs()
      cls.ShutdownDRBD(drbd_running)
    else:
      logging.debug("Not doing anything for role %s", role)

  @staticmethod
  def ShutdownInstances(inst_running):
    """Shutdown running instances.

    """
    names_running = set([i[0] for i in inst_running])
    if names_running:
      logging.info("Following instances should not be running,"
                   " shutting them down: %s", utils.CommaJoin(names_running))
      # this dictionary will collapse duplicate instance names (only
      # xen pvm/hvm) into a single key, which is fine
      i2h = dict(inst_running)
      for name in names_running:
        hv_name = i2h[name]
        hv = hypervisor.GetHypervisor(hv_name)
        hv.StopInstance(None, force=True, name=name)

  @staticmethod
  def ShutdownDRBD(drbd_running):
    """Shutdown active DRBD devices.

    """
    if drbd_running:
      logging.info("Following DRBD minors should not be active,"
                   " shutting them down: %s", utils.CommaJoin(drbd_running))
      for minor in drbd_running:
        # pylint: disable-msg=W0212
        # using the private method as is, pending enhancements to the DRBD
        # interface
        bdev.DRBD8._ShutdownAll(minor)

  def Exec(self):
    """Check node status versus cluster desired state.

    """
    my_name = utils.HostInfo().name
    req = confd_client.ConfdClientRequest(type=
                                          constants.CONFD_REQ_NODE_ROLE_BYNAME,
                                          query=my_name)
    self.confd_client.SendRequest(req, async=False, coverage=-1)
    timed_out, _, _ = self.confd_client.WaitForReply(req.rsalt)
    if not timed_out:
      # should have a valid response
      status, result = self.store_cb.GetResponse(req.rsalt)
      assert status, "Missing result but received replies"
      if not self.filter_cb.consistent[req.rsalt]:
        logging.warning("Inconsistent replies, not doing anything")
        return
      self.DoMaintenance(result.server_reply.answer)
    else:
      logging.warning("Confd query timed out, cannot do maintenance actions")


class WatcherState(object):
  """Interface to a state file recording restart attempts.

  """
  def __init__(self, statefile):
    """Open, lock, read and parse the file.

    @type statefile: file
    @param statefile: State file object

    """
    self.statefile = statefile

    try:
      state_data = self.statefile.read()
      if not state_data:
        self._data = {}
      else:
        self._data = serializer.Load(state_data)
    except Exception, msg: # pylint: disable-msg=W0703
      # Ignore errors while loading the file and treat it as empty
      self._data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    if "instance" not in self._data:
      self._data["instance"] = {}
    if "node" not in self._data:
      self._data["node"] = {}

    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Save state to file, then unlock and close it.

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, 'w+')

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    """
    assert bootid

    ndata = self._data["node"]

    if name not in ndata:
      ndata[name] = {}

    ndata[name][KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    @type instance: L{Instance}
    @param instance: the instance to look up

    """
    idata = self._data["instance"]

    if instance.name in idata:
      return idata[instance.name][KEY_RESTART_COUNT]

    return 0

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    @type instance: L{Instance}
    @param instance: the instance being restarted

    """
    idata = self._data["instance"]

    if instance.name not in idata:
      inst = idata[instance.name] = {}
    else:
      inst = idata[instance.name]

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances).

    @type instance: L{Instance}
    @param instance: the instance to remove from books

    """
    idata = self._data["instance"]

    if instance.name in idata:
      del idata[instance.name]


class Instance(object):
  """Abstraction for a Virtual Machine instance.

  """
  def __init__(self, name, state, autostart):
    self.name = name
    self.state = state
    self.autostart = autostart

  def Restart(self):
    """Encapsulates the start of an instance.

    """
    op = opcodes.OpStartupInstance(instance_name=self.name, force=False)
    cli.SubmitOpCode(op, cl=client)

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    """
    op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(op, cl=client)


def GetClusterData():
  """Get a list of instances on this cluster.

  """
  op1_fields = ["name", "status", "admin_state", "snodes"]
  op1 = opcodes.OpQueryInstances(output_fields=op1_fields, names=[],
                                 use_locking=True)
  op2_fields = ["name", "bootid", "offline"]
  op2 = opcodes.OpQueryNodes(output_fields=op2_fields, names=[],
                             use_locking=True)

  job_id = client.SubmitJob([op1, op2])

  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  result = all_results[0]
  smap = {}

  instances = {}

  # write the upfile
  up_data = "".join(["%s %s\n" % (fields[0], fields[1]) for fields in result])
  utils.WriteFile(file_name=constants.INSTANCE_UPFILE, data=up_data)

  for fields in result:
    (name, status, autostart, snodes) = fields

    # update the secondary node map
    for node in snodes:
      if node not in smap:
        smap[node] = []
      smap[node].append(name)

    instances[name] = Instance(name, status, autostart)

  nodes = dict([(name, (bootid, offline))
                for name, bootid, offline in all_results[1]])

  client.ArchiveJob(job_id)

  return instances, nodes, smap


class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self, opts, notepad):
    self.notepad = notepad
    master = client.QueryConfigValues(["master_node"])[0]
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    # first archive old jobs
    self.ArchiveJobs(opts.job_age)
    # and only then submit new ones
    self.instances, self.bootids, self.smap = GetClusterData()
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Watcher run sequence.

    """
    notepad = self.notepad
    self.CheckInstances(notepad)
    self.CheckDisks(notepad)
    self.VerifyDisks()

  @staticmethod
  def ArchiveJobs(age):
    """Archive old jobs.

    """
    arch_count, left_count = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", arch_count, left_count)

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    """
    check_nodes = []
    for name, (new_id, offline) in self.bootids.iteritems():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
        continue
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for node in check_nodes:
        if node not in self.smap:
          continue
        for instance_name in self.smap[node]:
          instance = self.instances[instance_name]
          if not instance.autostart:
            logging.info(("Skipping disk activation for non-autostart"
                          " instance %s"), instance.name)
            continue
          if instance.name in self.started_instances:
            # we already tried to start the instance, which should have
            # activated its drives (if they can be at all)
            continue
          try:
            logging.info("Activating disks for instance %s", instance.name)
            instance.ActivateDisks()
          except Exception: # pylint: disable-msg=W0703
            logging.exception("Error while activating disks for instance %s",
                              instance.name)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    """
    for instance in self.instances.values():
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                       instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception: # pylint: disable-msg=W0703
          logging.exception("Error while restarting instance %s",
                            instance.name)

        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    """
    op = opcodes.OpVerifyDisks()
    job_id = client.SubmitJob([op])
    result = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(job_id)
    if not isinstance(result, (tuple, list)):
      logging.error("Can't get a valid result from verify-disks")
      return
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  utils.CommaJoin(offline_disk_instances))
    # we submit only one job and wait for it; not optimal, but it spams
    # the job queue less
    job = [opcodes.OpActivateInstanceDisks(instance_name=name)
           for name in offline_disk_instances]
    job_id = cli.SendJob(job, cl=client)

    try:
      cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
    except Exception: # pylint: disable-msg=W0703
      logging.exception("Error while activating disks")


def OpenStateFile(path):
  """Opens the state file and acquires a lock on it.

  @type path: string
  @param path: Path to state file

  """
  # The two-step dance below is necessary to allow both opening existing
  # file read/write and creating if not existing. Vanilla open will truncate
  # an existing file -or- allow creating if not existing.
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

  # Try to acquire lock on state file. If this fails, another watcher instance
  # might already be running or another program is temporarily blocking the
  # watcher from running.
  try:
    utils.LockFile(statefile_fd)
  except errors.LockError, err:
    logging.error("Can't acquire lock on state file %s: %s", path, err)
    return None

  return os.fdopen(statefile_fd, "w+")


def IsRapiResponding(hostname):
  """Connects to RAPI port and does a simple test.

  Connects to RAPI port of hostname and does a simple test. At this time, the
  test is GetVersion.

  @type hostname: string
  @param hostname: hostname of the node to connect to.
  @rtype: bool
  @return: Whether RAPI is working properly

  """
  curl_config = rapi.client.GenericCurlConfig(cafile=constants.RAPI_CERT_FILE)
  rapi_client = rapi.client.GanetiRapiClient(hostname,
                                             curl_config_fn=curl_config)
  try:
    master_version = rapi_client.GetVersion()
  except rapi.client.CertificateError, err:
    logging.warning("RAPI Error: CertificateError (%s)", err)
    return False
  except rapi.client.GanetiApiError, err:
    logging.warning("RAPI Error: GanetiApiError (%s)", err)
    return False
  logging.debug("RAPI Result: master_version is %s", master_version)
  return master_version == constants.RAPI_VERSION


def ParseOptions():
  """Parse the command line options.

  @return: (options, args) as from OptionParser.parse_args()

  """
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version="%%prog (ganeti) %s" %
                        constants.RELEASE_VERSION)

  parser.add_option(cli.DEBUG_OPT)
  parser.add_option("-A", "--job-age", dest="job_age",
                    help="Autoarchive jobs older than this age (default"
                    " 6 hours)", default=6*3600)
  options, args = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)
  return options, args


@rapi.client.UsesRapiClient
def main():
  """Main function.

  """
  global client # pylint: disable-msg=W0603

  options, args = ParseOptions()

  if args: # watcher doesn't take any arguments
    print >> sys.stderr, ("Usage: %s [-d]" % sys.argv[0])
    sys.exit(constants.EXIT_FAILURE)

  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
                     stderr_logging=options.debug)

  if ShouldPause():
    logging.debug("Pause has been set, exiting")
    sys.exit(constants.EXIT_SUCCESS)

  statefile = OpenStateFile(constants.WATCHER_STATEFILE)
  if not statefile:
    sys.exit(constants.EXIT_FAILURE)

  update_file = False
  try:
    StartNodeDaemons()
    RunWatcherHooks()
    # run node maintenance in all cases, even if master, so that old
    # masters can be properly cleaned up too
    if NodeMaintenance.ShouldRun():
      NodeMaintenance().Exec()

    notepad = WatcherState(statefile)
    try:
      try:
        client = cli.GetClient()
      except errors.OpPrereqError:
        # this is, from cli.GetClient, a not-master case
        logging.debug("Not on master, exiting")
        update_file = True
        sys.exit(constants.EXIT_SUCCESS)
      except luxi.NoMasterError, err:
        logging.warning("Master seems to be down (%s), trying to restart",
                        str(err))
        if not utils.EnsureDaemon(constants.MASTERD):
          logging.critical("Can't start the master, exiting")
          sys.exit(constants.EXIT_FAILURE)
        # else retry the connection
        client = cli.GetClient()

      # we are on master now
      utils.EnsureDaemon(constants.RAPI)

      # If RAPI isn't responding to queries, try one restart.
      logging.debug("Attempting to talk with RAPI.")
      if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
        logging.warning("Couldn't get answer from Ganeti RAPI daemon."
                        " Restarting Ganeti RAPI.")
        utils.StopDaemon(constants.RAPI)
        utils.EnsureDaemon(constants.RAPI)
        logging.debug("Second attempt to talk with RAPI")
        if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
          logging.fatal("RAPI is not responding. Please investigate.")
      logging.debug("Successfully talked to RAPI.")

      try:
        watcher = Watcher(options, notepad)
      except errors.ConfigurationError:
        # Just exit if there's no configuration
        update_file = True
        sys.exit(constants.EXIT_SUCCESS)

      watcher.Run()
      update_file = True

    finally:
      if update_file:
        notepad.Save()
      else:
        logging.debug("Not updating status file due to failure")
  except SystemExit:
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError, err:
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except errors.JobQueueFull:
    logging.error("Job queue is full, can't query cluster state")
  except errors.JobQueueDrainError:
    logging.error("Job queue is drained, can't maintain cluster state")
  except Exception, err:
    logging.exception(str(err))
    sys.exit(constants.EXIT_FAILURE)


if __name__ == '__main__':
  main()