"""
+# pylint: disable-msg=C0103,W0142
+
+# C0103: Invalid name ganeti-watcher
+
import os
import sys
import time
import logging
+import errno
from optparse import OptionParser
from ganeti import utils
return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
def ShouldPause():
  """Tell whether the watcher should pause its work.

  @rtype: boolean
  @return: True if a watcher pause file is present

  """
  pause_value = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
  return bool(pause_value)
+
+
def EnsureDaemon(name):
  """Make sure the given daemon is running; start it if it is not.

  @type name: string
  @param name: daemon name as understood by the daemon utility
  @rtype: boolean
  @return: True if the daemon is (now) running, False otherwise

  """
  result = utils.RunCmd([constants.DAEMON_UTIL, "check-and-start", name])
  if not result.failed:
    return True
  # the start attempt failed; log the reason and report failure
  logging.error("Can't start daemon '%s', failure %s, output: %s",
                name, result.fail_reason, result.output)
  return False
class WatcherState(object):
self._data = {}
else:
self._data = serializer.Load(state_data)
- except Exception, msg:
+ except Exception, msg: # pylint: disable-msg=W0703
# Ignore errors while loading the file and treat it as empty
self._data = {}
logging.warning(("Invalid state file. Using defaults."
all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
+ logging.debug("Got data from cluster, writing instance status file")
+
result = all_results[0]
smap = {}
instances = {}
+
+ # write the upfile
+ up_data = "".join(["%s %s\n" % (fields[0], fields[1]) for fields in result])
+ utils.WriteFile(file_name=constants.INSTANCE_UPFILE, data=up_data)
+
for fields in result:
(name, status, autostart, snodes) = fields
master = client.QueryConfigValues(["master_node"])[0]
if master != utils.HostInfo().name:
raise NotMasterError("This is not the master node")
+ # first archive old jobs
+ self.ArchiveJobs(opts.job_age)
+ # and only then submit new ones
self.instances, self.bootids, self.smap = GetClusterData()
self.started_instances = set()
self.opts = opts
"""
notepad = self.notepad
- self.ArchiveJobs(self.opts.job_age)
self.CheckInstances(notepad)
self.CheckDisks(notepad)
self.VerifyDisks()
- def ArchiveJobs(self, age):
+ @staticmethod
+ def ArchiveJobs(age):
"""Archive old jobs.
"""
arch_count, left_count = client.AutoArchiveJobs(age)
- logging.debug("Archived %s jobs, left %s" % (arch_count, left_count))
+ logging.debug("Archived %s jobs, left %s", arch_count, left_count)
def CheckDisks(self, notepad):
"""Check all nodes for restarted ones.
try:
logging.info("Activating disks for instance %s", instance.name)
instance.ActivateDisks()
- except Exception:
+ except Exception: # pylint: disable-msg=W0703
logging.exception("Error while activating disks for instance %s",
instance.name)
instance.name, last)
instance.Restart()
self.started_instances.add(instance.name)
- except Exception:
+ except Exception: # pylint: disable-msg=W0703
logging.exception("Error while restarting instance %s",
instance.name)
# nothing to do
return
logging.debug("Will activate disks for instances %s",
- ", ".join(offline_disk_instances))
+ utils.CommaJoin(offline_disk_instances))
# we submit only one job, and wait for it. not optimal, but spams
# less the job queue
job = [opcodes.OpActivateInstanceDisks(instance_name=name)
version="%%prog (ganeti) %s" %
constants.RELEASE_VERSION)
- parser.add_option("-d", "--debug", dest="debug",
- help="Write all messages to stderr",
- default=False, action="store_true")
+ parser.add_option(cli.DEBUG_OPT)
parser.add_option("-A", "--job-age", dest="job_age",
help="Autoarchive jobs older than this age (default"
" 6 hours)", default=6*3600)
"""Main function.
"""
- global client
+ global client # pylint: disable-msg=W0603
options, args = ParseOptions()
+ if args: # watcher doesn't take any arguments
+ print >> sys.stderr, ("Usage: %s [-f] " % sys.argv[0])
+ sys.exit(constants.EXIT_FAILURE)
+
utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
stderr_logging=options.debug)
- update_file = True
+ if ShouldPause():
+ logging.debug("Pause has been set, exiting")
+ sys.exit(constants.EXIT_SUCCESS)
+
+ update_file = False
try:
+    # on master or not, try to start the node daemon
+ EnsureDaemon(constants.NODED)
+
notepad = WatcherState()
try:
try:
except errors.OpPrereqError:
# this is, from cli.GetClient, a not-master case
logging.debug("Not on master, exiting")
+ update_file = True
sys.exit(constants.EXIT_SUCCESS)
except luxi.NoMasterError, err:
logging.warning("Master seems to be down (%s), trying to restart",
str(err))
- if not StartMaster():
+ if not EnsureDaemon(constants.MASTERD):
logging.critical("Can't start the master, exiting")
- update_file = False
sys.exit(constants.EXIT_FAILURE)
# else retry the connection
client = cli.GetClient()
+ # we are on master now
+ EnsureDaemon(constants.RAPI)
+
try:
watcher = Watcher(options, notepad)
except errors.ConfigurationError:
# Just exit if there's no configuration
+ update_file = True
sys.exit(constants.EXIT_SUCCESS)
watcher.Run()
+ update_file = True
+
finally:
if update_file:
notepad.Save()
except errors.ResolverError, err:
logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
sys.exit(constants.EXIT_NODESETUP_ERROR)
+ except errors.JobQueueFull:
+ logging.error("Job queue is full, can't query cluster state")
+ except errors.JobQueueDrainError:
+ logging.error("Job queue is drained, can't maintain cluster state")
except Exception, err:
logging.error(str(err), exc_info=True)
sys.exit(constants.EXIT_FAILURE)