#!/usr/bin/python
#

# Copyright (C) 2006, 2007, 2008 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Tool to restart erroneously downed virtual machines.

This program and set of classes implement a watchdog to restart
virtual machines in a Ganeti cluster that have crashed or been killed
by a node reboot.  Run from cron or similar.

"""

# pylint: disable-msg=C0103,W0142

# C0103: Invalid name ganeti-watcher

import os
import sys
import time
import logging
from optparse import OptionParser

from ganeti import utils
from ganeti import constants
from ganeti import serializer
from ganeti import errors
from ganeti import opcodes
from ganeti import cli
from ganeti import luxi
from ganeti import ssconf
from ganeti import bdev
from ganeti import hypervisor
from ganeti import rapi
from ganeti.confd import client as confd_client
from ganeti import netutils

import ganeti.rapi.client # pylint: disable-msg=W0611


MAXTRIES = 5
BAD_STATES = ['ERROR_down']
HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline']
NOTICE = 'NOTICE'
ERROR = 'ERROR'
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
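
# Illustrative layout of the watcher state file, as serialized by the
# "serializer" module (node/instance names below are made-up examples):
#   {"node": {"node1": {"bootid": "..."}},
#    "instance": {"inst1": {"restart_count": 2, "restart_when": 1234567890.0}}}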


# Global client object
client = None


class NotMasterError(errors.GenericError):
  """Exception raised when this host is not the master."""


def ShouldPause():
  """Check whether we should pause.

  """
  return bool(utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE))


def StartNodeDaemons():
  """Start all the daemons that should be running on all nodes.

  """
  # on master or not, try to start the node daemon
  utils.EnsureDaemon(constants.NODED)
  # start confd as well. On non candidates it will be in disabled mode.
  utils.EnsureDaemon(constants.CONFD)


def RunWatcherHooks():
  """Run the watcher hooks.

  """
  hooks_dir = utils.PathJoin(constants.HOOKS_BASE_DIR,
                             constants.HOOKS_NAME_WATCHER)
  if not os.path.isdir(hooks_dir):
    return

  try:
    results = utils.RunParts(hooks_dir)
  except Exception, msg: # pylint: disable-msg=W0703
    logging.critical("RunParts %s failed: %s", hooks_dir, msg)
    # without results there is nothing to report on, so bail out here
    return

  for (relname, status, runresult) in results:
    if status == constants.RUNPARTS_SKIP:
      logging.debug("Watcher hook %s: skipped", relname)
    elif status == constants.RUNPARTS_ERR:
      logging.warning("Watcher hook %s: error (%s)", relname, runresult)
    elif status == constants.RUNPARTS_RUN:
      if runresult.failed:
        logging.warning("Watcher hook %s: failed (exit: %d) (output: %s)",
                        relname, runresult.exit_code, runresult.output)
      else:
        logging.debug("Watcher hook %s: success (output: %s)", relname,
                      runresult.output)


class NodeMaintenance(object):
  """Talks to confd daemons and possibly shuts down instances/DRBD devices.

  """
  def __init__(self):
    self.store_cb = confd_client.StoreResultCallback()
    self.filter_cb = confd_client.ConfdFilterCallback(self.store_cb)
    self.confd_client = confd_client.GetConfdClient(self.filter_cb)

  @staticmethod
  def ShouldRun():
    """Checks whether node maintenance should run.

    """
    try:
      return ssconf.SimpleStore().GetMaintainNodeHealth()
    except errors.ConfigurationError, err:
      logging.error("Configuration error, not activating node maintenance: %s",
                    err)
      return False

  @staticmethod
  def GetRunningInstances():
    """Compute list of hypervisor/running instances.

    """
    hyp_list = ssconf.SimpleStore().GetHypervisorList()
    results = []
    for hv_name in hyp_list:
      try:
        hv = hypervisor.GetHypervisor(hv_name)
        ilist = hv.ListInstances()
        results.extend([(iname, hv_name) for iname in ilist])
      except: # pylint: disable-msg=W0702
        logging.error("Error while listing instances for hypervisor %s",
                      hv_name, exc_info=True)
    return results

  @staticmethod
  def GetUsedDRBDs():
    """Get list of used DRBD minors.

    """
    return bdev.DRBD8.GetUsedDevs().keys()

  @classmethod
  def DoMaintenance(cls, role):
    """Maintain the instance list.

    """
    if role == constants.CONFD_NODE_ROLE_OFFLINE:
      inst_running = cls.GetRunningInstances()
      cls.ShutdownInstances(inst_running)
      drbd_running = cls.GetUsedDRBDs()
      cls.ShutdownDRBD(drbd_running)
    else:
      logging.debug("Not doing anything for role %s", role)

  @staticmethod
  def ShutdownInstances(inst_running):
    """Shut down running instances.

    """
    names_running = set([i[0] for i in inst_running])
    if names_running:
      logging.info("Following instances should not be running,"
                   " shutting them down: %s", utils.CommaJoin(names_running))
      # this dictionary will collapse duplicate instance names (only
      # xen pvm/hvm) into a single key, which is fine
      i2h = dict(inst_running)
      for name in names_running:
        hv_name = i2h[name]
        hv = hypervisor.GetHypervisor(hv_name)
        hv.StopInstance(None, force=True, name=name)

  @staticmethod
  def ShutdownDRBD(drbd_running):
    """Shut down active DRBD devices.

    """
    if drbd_running:
      logging.info("Following DRBD minors should not be active,"
                   " shutting them down: %s", utils.CommaJoin(drbd_running))
      for minor in drbd_running:
        # pylint: disable-msg=W0212
        # using the private method as is, pending enhancements to the DRBD
        # interface
        bdev.DRBD8._ShutdownAll(minor)

  def Exec(self):
    """Check node status versus cluster desired state.

    """
    my_name = netutils.HostInfo().name
    req = confd_client.ConfdClientRequest(type=
                                          constants.CONFD_REQ_NODE_ROLE_BYNAME,
                                          query=my_name)
    self.confd_client.SendRequest(req, async=False, coverage=-1)
    timed_out, _, _ = self.confd_client.WaitForReply(req.rsalt)
    if not timed_out:
      # should have a valid response
      status, result = self.store_cb.GetResponse(req.rsalt)
      assert status, "Missing result but received replies"
      if not self.filter_cb.consistent[req.rsalt]:
        logging.warning("Inconsistent replies, not doing anything")
        return
      self.DoMaintenance(result.server_reply.answer)
    else:
      logging.warning("Confd query timed out, cannot do maintenance actions")


class WatcherState(object):
  """Interface to a state file recording restart attempts.

  """
  def __init__(self, statefile):
    """Open, lock, read and parse the file.

    @type statefile: file
    @param statefile: State file object

    """
    self.statefile = statefile

    try:
      state_data = self.statefile.read()
      if not state_data:
        self._data = {}
      else:
        self._data = serializer.Load(state_data)
    except Exception, msg: # pylint: disable-msg=W0703
      # Ignore errors while loading the file and treat it as empty
      self._data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    if "instance" not in self._data:
      self._data["instance"] = {}
    if "node" not in self._data:
      self._data["node"] = {}

    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Save state to file, then unlock and close it.

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, 'w+')

  def Close(self):
    """Unlock the state file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    """
    assert bootid

    ndata = self._data["node"]

    if name not in ndata:
      ndata[name] = {}

    ndata[name][KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    @type instance: L{Instance}
    @param instance: the instance to look up

    """
    idata = self._data["instance"]

    if instance.name in idata:
      return idata[instance.name][KEY_RESTART_COUNT]

    return 0

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    @type instance: L{Instance}
    @param instance: the instance being restarted

    """
    idata = self._data["instance"]

    if instance.name not in idata:
      inst = idata[instance.name] = {}
    else:
      inst = idata[instance.name]

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track instances that are down).

    @type instance: L{Instance}
    @param instance: the instance to remove from books

    """
    idata = self._data["instance"]

    if instance.name in idata:
      del idata[instance.name]


class Instance(object):
  """Abstraction for a Virtual Machine instance.

  """
  def __init__(self, name, state, autostart):
    self.name = name
    self.state = state
    self.autostart = autostart

  def Restart(self):
    """Encapsulates the start of an instance.

    """
    op = opcodes.OpStartupInstance(instance_name=self.name, force=False)
    cli.SubmitOpCode(op, cl=client)

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    """
    op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(op, cl=client)


def GetClusterData():
  """Get a list of instances on this cluster.

  """
  op1_fields = ["name", "status", "admin_state", "snodes"]
  op1 = opcodes.OpQueryInstances(output_fields=op1_fields, names=[],
                                 use_locking=True)
  op2_fields = ["name", "bootid", "offline"]
  op2 = opcodes.OpQueryNodes(output_fields=op2_fields, names=[],
                             use_locking=True)

  job_id = client.SubmitJob([op1, op2])

  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)

  logging.debug("Got data from cluster, writing instance status file")

  result = all_results[0]
  smap = {}

  instances = {}

  # write the upfile
  up_data = "".join(["%s %s\n" % (fields[0], fields[1]) for fields in result])
  utils.WriteFile(file_name=constants.INSTANCE_UPFILE, data=up_data)

  for fields in result:
    (name, status, autostart, snodes) = fields

    # update the secondary node map
    for node in snodes:
      if node not in smap:
        smap[node] = []
      smap[node].append(name)

    instances[name] = Instance(name, status, autostart)

  nodes = dict([(name, (bootid, offline))
                for name, bootid, offline in all_results[1]])

  client.ArchiveJob(job_id)

  return instances, nodes, smap


class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self, opts, notepad):
    self.notepad = notepad
    master = client.QueryConfigValues(["master_node"])[0]
    if master != netutils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    # first archive old jobs
    self.ArchiveJobs(opts.job_age)
    # and only then submit new ones
    self.instances, self.bootids, self.smap = GetClusterData()
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Watcher run sequence.

    """
    notepad = self.notepad
    self.CheckInstances(notepad)
    self.CheckDisks(notepad)
    self.VerifyDisks()

  @staticmethod
  def ArchiveJobs(age):
    """Archive old jobs.

    """
    arch_count, left_count = client.AutoArchiveJobs(age)
    logging.debug("Archived %s jobs, left %s", arch_count, left_count)

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    """
    check_nodes = []
    for name, (new_id, offline) in self.bootids.iteritems():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
        continue
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for node in check_nodes:
        if node not in self.smap:
          continue
        for instance_name in self.smap[node]:
          instance = self.instances[instance_name]
          if not instance.autostart:
            logging.info(("Skipping disk activation for non-autostart"
                          " instance %s"), instance.name)
            continue
          if instance.name in self.started_instances:
            # we already tried to start the instance, which should have
            # activated its drives (if they can be activated at all)
            continue
          try:
            logging.info("Activating disks for instance %s", instance.name)
            instance.ActivateDisks()
          except Exception: # pylint: disable-msg=W0703
            logging.exception("Error while activating disks for instance %s",
                              instance.name)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    """
    for instance in self.instances.values():
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)
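
        # Restart policy: up to MAXTRIES real restart attempts are made;
        # on the following pass a final "giving up" error is logged, and
        # later passes stay silent until the instance leaves the bad state
        # and its record is removed.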
        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                       instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception: # pylint: disable-msg=W0703
          logging.exception("Error while restarting instance %s",
                            instance.name)

        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    """
    op = opcodes.OpVerifyDisks()
    job_id = client.SubmitJob([op])
    result = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)[0]
    client.ArchiveJob(job_id)
    if not isinstance(result, (tuple, list)):
      logging.error("Can't get a valid result from verify-disks")
      return
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  utils.CommaJoin(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but it spams
    # the job queue less
    job = [opcodes.OpActivateInstanceDisks(instance_name=name)
           for name in offline_disk_instances]
    job_id = cli.SendJob(job, cl=client)

    try:
      cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
    except Exception: # pylint: disable-msg=W0703
      logging.exception("Error while activating disks")


def OpenStateFile(path):
  """Opens the state file and acquires a lock on it.

  @type path: string
  @param path: Path to state file

  """
  # The two-step dance below is necessary to allow both opening existing
  # file read/write and creating if not existing. Vanilla open will truncate
  # an existing file -or- allow creating if not existing.
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

  # Try to acquire lock on state file. If this fails, another watcher instance
  # might already be running or another program is temporarily blocking the
  # watcher from running.
  try:
    utils.LockFile(statefile_fd)
  except errors.LockError, err:
    logging.error("Can't acquire lock on state file %s: %s", path, err)
    return None

  return os.fdopen(statefile_fd, "w+")


def IsRapiResponding(hostname):
  """Connects to RAPI port and does a simple test.

  Connects to RAPI port of hostname and does a simple test. At this time, the
  test is GetVersion.

  @type hostname: string
  @param hostname: hostname of the node to connect to.
  @rtype: bool
  @return: Whether RAPI is working properly

  """
  curl_config = rapi.client.GenericCurlConfig(cafile=constants.RAPI_CERT_FILE)
  rapi_client = rapi.client.GanetiRapiClient(hostname,
                                             curl_config_fn=curl_config)
  try:
    master_version = rapi_client.GetVersion()
  except rapi.client.CertificateError, err:
    logging.warning("RAPI Error: CertificateError (%s)", err)
    return False
  except rapi.client.GanetiApiError, err:
    logging.warning("RAPI Error: GanetiApiError (%s)", err)
    return False
  logging.debug("RAPI Result: master_version is %s", master_version)
  return master_version == constants.RAPI_VERSION


def ParseOptions():
  """Parse the command line options.

  @return: (options, args) as from OptionParser.parse_args()

  """
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version="%%prog (ganeti) %s" %
                        constants.RELEASE_VERSION)

  parser.add_option(cli.DEBUG_OPT)
  parser.add_option("-A", "--job-age", dest="job_age",
                    help="Autoarchive jobs older than this age (default"
                    " 6 hours)", default=6*3600)
  options, args = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)
  return options, args
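
# Illustrative invocations (not an exhaustive reference):
#   ganeti-watcher          run once, as from cron
#   ganeti-watcher -d       also log debug output to stderr
#   ganeti-watcher -A 1d    auto-archive jobs older than one day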


@rapi.client.UsesRapiClient
def main():
  """Main function.

  """
  global client # pylint: disable-msg=W0603

  options, args = ParseOptions()

  if args: # watcher doesn't take any arguments
    print >> sys.stderr, ("Usage: %s [-d]" % sys.argv[0])
    sys.exit(constants.EXIT_FAILURE)

  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
                     stderr_logging=options.debug)

  if ShouldPause():
    logging.debug("Pause has been set, exiting")
    sys.exit(constants.EXIT_SUCCESS)

  statefile = OpenStateFile(constants.WATCHER_STATEFILE)
  if not statefile:
    sys.exit(constants.EXIT_FAILURE)

  update_file = False
  try:
    StartNodeDaemons()
    RunWatcherHooks()
    # run node maintenance in all cases, even if master, so that old
    # masters can be properly cleaned up too
    if NodeMaintenance.ShouldRun():
      NodeMaintenance().Exec()

    notepad = WatcherState(statefile)
    try:
      try:
        client = cli.GetClient()
      except errors.OpPrereqError:
        # this is, from cli.GetClient, a not-master case
        logging.debug("Not on master, exiting")
        update_file = True
        sys.exit(constants.EXIT_SUCCESS)
      except luxi.NoMasterError, err:
        logging.warning("Master seems to be down (%s), trying to restart",
                        str(err))
        if not utils.EnsureDaemon(constants.MASTERD):
          logging.critical("Can't start the master, exiting")
          sys.exit(constants.EXIT_FAILURE)
        # else retry the connection
        client = cli.GetClient()

      # we are on master now
      utils.EnsureDaemon(constants.RAPI)

      # If RAPI isn't responding to queries, try one restart.
      logging.debug("Attempting to talk with RAPI.")
      if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
        logging.warning("Couldn't get answer from Ganeti RAPI daemon."
                        " Restarting Ganeti RAPI.")
        utils.StopDaemon(constants.RAPI)
        utils.EnsureDaemon(constants.RAPI)
        logging.debug("Second attempt to talk with RAPI")
        if not IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST):
          logging.fatal("RAPI is not responding. Please investigate.")
      logging.debug("Successfully talked to RAPI.")

      try:
        watcher = Watcher(options, notepad)
      except errors.ConfigurationError:
        # Just exit if there's no configuration
        update_file = True
        sys.exit(constants.EXIT_SUCCESS)

      watcher.Run()
      update_file = True

    finally:
      if update_file:
        notepad.Save()
      else:
        logging.debug("Not updating status file due to failure")
  except SystemExit:
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError, err:
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except errors.JobQueueFull:
    logging.error("Job queue is full, can't query cluster state")
  except errors.JobQueueDrainError:
    logging.error("Job queue is drained, can't maintain cluster state")
  except Exception, err:
    logging.exception(str(err))
    sys.exit(constants.EXIT_FAILURE)


if __name__ == '__main__':
  main()