Cluster: add nicparams, and update them on upgrade

[ganeti-local] / daemons / ganeti-watcher
diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher

index 94c6b48..42a2eaf 100755 (executable)
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -30,24 +30,21 @@ by a node reboot.  Run from cron or similar.
  import os
  import sys
  import time
  import os
  import sys
  import time
-import fcntl
-import errno
  import logging
  from optparse import OptionParser
  
  from ganeti import utils
  from ganeti import constants
  from ganeti import serializer
  import logging
  from optparse import OptionParser
  
  from ganeti import utils
  from ganeti import constants
  from ganeti import serializer
-from ganeti import ssconf
  from ganeti import errors
  from ganeti import opcodes
  from ganeti import errors
  from ganeti import opcodes
-from ganeti import logger
  from ganeti import cli
  from ganeti import cli
+from ganeti import luxi
  
  
  MAXTRIES = 5
  
  
  MAXTRIES = 5
-BAD_STATES = ['stopped']
-HELPLESS_STATES = ['(node down)']
+BAD_STATES = ['ERROR_down']
+HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline']
  NOTICE = 'NOTICE'
  ERROR = 'ERROR'
  KEY_RESTART_COUNT = "restart_count"
  NOTICE = 'NOTICE'
  ERROR = 'ERROR'
  KEY_RESTART_COUNT = "restart_count"
@@ -66,14 +63,23 @@ class NotMasterError(errors.GenericError):
  def Indent(s, prefix='| '):
    """Indent a piece of text with a given prefix before each line.
  
  def Indent(s, prefix='| '):
    """Indent a piece of text with a given prefix before each line.
  
-  Args:
-    s: The string to indent
-    prefix: The string to prepend each line.
+  @param s: the string to indent
+  @param prefix: the string to prepend each line
  
    """
    return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
  
  
  
    """
    return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
  
  
+def StartMaster():
+  """Try to start the master daemon.
+
+  """
+  result = utils.RunCmd(['ganeti-masterd'])
+  if result.failed:
+    logging.error("Can't start the master daemon: output '%s'", result.output)
+  return not result.failed
+
+
  class WatcherState(object):
    """Interface to a state file recording restart attempts.
  
  class WatcherState(object):
    """Interface to a state file recording restart attempts.
  
@@ -93,11 +99,15 @@ class WatcherState(object):
      utils.LockFile(self.statefile.fileno())
  
      try:
      utils.LockFile(self.statefile.fileno())
  
      try:
-      self._data = serializer.Load(self.statefile.read())
+      state_data = self.statefile.read()
+      if not state_data:
+        self._data = {}
+      else:
+        self._data = serializer.Load(state_data)
      except Exception, msg:
        # Ignore errors while loading the file and treat it as empty
        self._data = {}
      except Exception, msg:
        # Ignore errors while loading the file and treat it as empty
        self._data = {}
-      logging.warning(("Empty or invalid state file. Using defaults."
+      logging.warning(("Invalid state file. Using defaults."
                         " Error message: %s"), msg)
  
      if "instance" not in self._data:
                         " Error message: %s"), msg)
  
      if "instance" not in self._data:
@@ -162,8 +172,8 @@ class WatcherState(object):
    def NumberOfRestartAttempts(self, instance):
      """Returns number of previous restart attempts.
  
    def NumberOfRestartAttempts(self, instance):
      """Returns number of previous restart attempts.
  
-    Args:
-      instance - the instance to look up.
+    @type instance: L{Instance}
+    @param instance: the instance to look up
  
      """
      idata = self._data["instance"]
  
      """
      idata = self._data["instance"]
@@ -176,8 +186,8 @@ class WatcherState(object):
    def RecordRestartAttempt(self, instance):
      """Record a restart attempt.
  
    def RecordRestartAttempt(self, instance):
      """Record a restart attempt.
  
-    Args:
-      instance - the instance being restarted
+    @type instance: L{Instance}
+    @param instance: the instance being restarted
  
      """
      idata = self._data["instance"]
  
      """
      idata = self._data["instance"]
@@ -191,12 +201,13 @@ class WatcherState(object):
      inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
  
    def RemoveInstance(self, instance):
      inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
  
    def RemoveInstance(self, instance):
-    """Update state to reflect that a machine is running, i.e. remove record.
+    """Update state to reflect that a machine is running.
  
  
-    Args:
-      instance - the instance to remove from books
+    This method removes the record for a named instance (as we only
+    track down instances).
  
  
-    This method removes the record for a named instance.
+    @type instance: L{Instance}
+    @param instance: the instance to remove from books
  
      """
      idata = self._data["instance"]
  
      """
      idata = self._data["instance"]
@@ -208,9 +219,6 @@ class WatcherState(object):
  class Instance(object):
    """Abstraction for a Virtual Machine instance.
  
  class Instance(object):
    """Abstraction for a Virtual Machine instance.
  
-  Methods:
-    Restart(): issue a command to restart the represented machine.
-
    """
    def __init__(self, name, state, autostart):
      self.name = name
    """
    def __init__(self, name, state, autostart):
      self.name = name
@@ -221,9 +229,7 @@ class Instance(object):
      """Encapsulates the start of an instance.
  
      """
      """Encapsulates the start of an instance.
  
      """
-    op = opcodes.OpStartupInstance(instance_name=self.name,
-                                   force=False,
-                                   extra_args=None)
+    op = opcodes.OpStartupInstance(instance_name=self.name, force=False)
      cli.SubmitOpCode(op, cl=client)
  
    def ActivateDisks(self):
      cli.SubmitOpCode(op, cl=client)
  
    def ActivateDisks(self):
@@ -234,45 +240,42 @@ class Instance(object):
      cli.SubmitOpCode(op, cl=client)
  
  
      cli.SubmitOpCode(op, cl=client)
  
  
-def GetInstanceList(with_secondaries=None):
+def GetClusterData():
    """Get instance, node and secondary-node data for this cluster.
  
    """
    """Get instance, node and secondary-node data for this cluster.
  
    """
-  fields = ["name", "oper_state", "admin_state"]
-
-  if with_secondaries is not None:
-    fields.append("snodes")
+  op1_fields = ["name", "status", "admin_state", "snodes"]
+  op1 = opcodes.OpQueryInstances(output_fields=op1_fields, names=[],
+                                 use_locking=True)
+  op2_fields = ["name", "bootid", "offline"]
+  op2 = opcodes.OpQueryNodes(output_fields=op2_fields, names=[],
+                             use_locking=True)
  
  
-  result = client.QueryInstances([], fields)
+  job_id = client.SubmitJob([op1, op2])
  
  
-  instances = []
-  for fields in result:
-    if with_secondaries is not None:
-      (name, status, autostart, snodes) = fields
+  all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
  
  
-      if not snodes:
-        continue
+  result = all_results[0]
+  smap = {}
  
  
-      for node in with_secondaries:
-        if node in snodes:
-          break
-      else:
-        continue
-
-    else:
-      (name, status, autostart) = fields
+  instances = {}
+  for fields in result:
+    (name, status, autostart, snodes) = fields
  
  
-    instances.append(Instance(name, status, autostart))
+    # update the secondary node map
+    for node in snodes:
+      if node not in smap:
+        smap[node] = []
+      smap[node].append(name)
  
  
-  return instances
+    instances[name] = Instance(name, status, autostart)
  
  
+  nodes =  dict([(name, (bootid, offline))
+                 for name, bootid, offline in all_results[1]])
  
  
-def GetNodeBootIDs():
-  """Get a dict mapping nodes to boot IDs.
+  client.ArchiveJob(job_id)
  
  
-  """
-  result = client.QueryNodes([], ["name", "bootid"])
-  return dict([(name, bootid) for name, bootid in result])
+  return instances, nodes, smap
  
  
  class Watcher(object):
  
  
  class Watcher(object):
@@ -283,31 +286,45 @@ class Watcher(object):
    to restart machines that are down.
  
    """
    to restart machines that are down.
  
    """
-  def __init__(self):
-    sstore = ssconf.SimpleStore()
-    master = sstore.GetMasterNode()
+  def __init__(self, opts, notepad):
+    self.notepad = notepad
+    master = client.QueryConfigValues(["master_node"])[0]
      if master != utils.HostInfo().name:
        raise NotMasterError("This is not the master node")
      if master != utils.HostInfo().name:
        raise NotMasterError("This is not the master node")
-    self.instances = GetInstanceList()
-    self.bootids = GetNodeBootIDs()
+    self.instances, self.bootids, self.smap = GetClusterData()
      self.started_instances = set()
      self.started_instances = set()
+    self.opts = opts
  
    def Run(self):
  
    def Run(self):
-    notepad = WatcherState()
-    try:
-      self.CheckInstances(notepad)
-      self.CheckDisks(notepad)
-      self.VerifyDisks()
-    finally:
-      notepad.Save()
+    """Watcher run sequence.
+
+    """
+    notepad = self.notepad
+    self.ArchiveJobs(self.opts.job_age)
+    self.CheckInstances(notepad)
+    self.CheckDisks(notepad)
+    self.VerifyDisks()
+
+  def ArchiveJobs(self, age):
+    """Archive old jobs.
+
+    """
+    arch_count, left_count = client.AutoArchiveJobs(age)
+    logging.debug("Archived %s jobs, left %s" % (arch_count, left_count))
  
    def CheckDisks(self, notepad):
      """Check all nodes for restarted ones.
  
      """
      check_nodes = []
  
    def CheckDisks(self, notepad):
      """Check all nodes for restarted ones.
  
      """
      check_nodes = []
-    for name, new_id in self.bootids.iteritems():
+    for name, (new_id, offline) in self.bootids.iteritems():
        old = notepad.GetNodeBootID(name)
        old = notepad.GetNodeBootID(name)
+      if new_id is None:
+        # Bad node, not returning a boot id
+        if not offline:
+          logging.debug("Node %s missing boot id, skipping secondary checks",
+                        name)
+        continue
        if old != new_id:
          # Node's boot ID has changed, probably through a reboot.
          check_nodes.append(name)
        if old != new_id:
          # Node's boot ID has changed, probably through a reboot.
          check_nodes.append(name)
@@ -315,34 +332,35 @@ class Watcher(object):
      if check_nodes:
        # Activate disks for all instances with any of the checked nodes as a
        # secondary node.
      if check_nodes:
        # Activate disks for all instances with any of the checked nodes as a
        # secondary node.
-      for instance in GetInstanceList(with_secondaries=check_nodes):
-        if not instance.autostart:
-          logging.info(("Skipping disk activation for non-autostart"
-                        " instance %s"), instance.name)
-          continue
-        if instance.name in self.started_instances:
-          # we already tried to start the instance, which should have
-          # activated its drives (if they can be at all)
+      for node in check_nodes:
+        if node not in self.smap:
            continue
            continue
-        try:
-          logging.info("Activating disks for instance %s", instance.name)
-          instance.ActivateDisks()
-        except Exception, err:
-          logging.error(str(err), exc_info=True)
+        for instance_name in self.smap[node]:
+          instance = self.instances[instance_name]
+          if not instance.autostart:
+            logging.info(("Skipping disk activation for non-autostart"
+                          " instance %s"), instance.name)
+            continue
+          if instance.name in self.started_instances:
+            # we already tried to start the instance, which should have
+            # activated its drives (if they can be at all)
+            continue
+          try:
+            logging.info("Activating disks for instance %s", instance.name)
+            instance.ActivateDisks()
+          except Exception:
+            logging.exception("Error while activating disks for instance %s",
+                              instance.name)
  
        # Keep changed boot IDs
        for name in check_nodes:
  
        # Keep changed boot IDs
        for name in check_nodes:
-        notepad.SetNodeBootID(name, self.bootids[name])
+        notepad.SetNodeBootID(name, self.bootids[name][0])
  
    def CheckInstances(self, notepad):
      """Make a pass over the list of instances, restarting downed ones.
  
      """
  
    def CheckInstances(self, notepad):
      """Make a pass over the list of instances, restarting downed ones.
  
      """
-    for instance in self.instances:
-      # Don't care about manually stopped instances
-      if not instance.autostart:
-        continue
-
+    for instance in self.instances.values():
        if instance.state in BAD_STATES:
          n = notepad.NumberOfRestartAttempts(instance)
  
        if instance.state in BAD_STATES:
          n = notepad.NumberOfRestartAttempts(instance)
  
@@ -361,8 +379,9 @@ class Watcher(object):
                          instance.name, last)
            instance.Restart()
            self.started_instances.add(instance.name)
                          instance.name, last)
            instance.Restart()
            self.started_instances.add(instance.name)
-        except Exception, err:
-          logging.error(str(err), exc_info=True)
+        except Exception:
+          logging.exception("Error while restarting instance %s",
+                            instance.name)
  
          notepad.RecordRestartAttempt(instance)
        elif instance.state in HELPLESS_STATES:
  
          notepad.RecordRestartAttempt(instance)
        elif instance.state in HELPLESS_STATES:
@@ -373,12 +392,15 @@ class Watcher(object):
            notepad.RemoveInstance(instance)
            logging.info("Restart of %s succeeded", instance.name)
  
            notepad.RemoveInstance(instance)
            logging.info("Restart of %s succeeded", instance.name)
  
-  def VerifyDisks(self):
+  @staticmethod
+  def VerifyDisks():
      """Run gnt-cluster verify-disks.
  
      """
      op = opcodes.OpVerifyDisks()
      """Run gnt-cluster verify-disks.
  
      """
      op = opcodes.OpVerifyDisks()
-    result = cli.SubmitOpCode(op, cl=client)
+    job_id = client.SubmitJob([op])
+    result = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)[0]
+    client.ArchiveJob(job_id)
      if not isinstance(result, (tuple, list)):
        logging.error("Can't get a valid result from verify-disks")
        return
      if not isinstance(result, (tuple, list)):
        logging.error("Can't get a valid result from verify-disks")
        return
@@ -400,8 +422,7 @@ class Watcher(object):
  def ParseOptions():
    """Parse the command line options.
  
  def ParseOptions():
    """Parse the command line options.
  
-  Returns:
-    (options, args) as from OptionParser.parse_args()
+  @return: (options, args) as from OptionParser.parse_args()
  
    """
    parser = OptionParser(description="Ganeti cluster watcher",
  
    """
    parser = OptionParser(description="Ganeti cluster watcher",
@@ -412,7 +433,11 @@ def ParseOptions():
    parser.add_option("-d", "--debug", dest="debug",
                      help="Write all messages to stderr",
                      default=False, action="store_true")
    parser.add_option("-d", "--debug", dest="debug",
                      help="Write all messages to stderr",
                      default=False, action="store_true")
+  parser.add_option("-A", "--job-age", dest="job_age",
+                    help="Autoarchive jobs older than this age (default"
+                    " 6 hours)", default=6*3600)
    options, args = parser.parse_args()
    options, args = parser.parse_args()
+  options.job_age = cli.ParseTimespec(options.job_age)
    return options, args
  
  
    return options, args
  
  
@@ -424,19 +449,41 @@ def main():
  
    options, args = ParseOptions()
  
  
    options, args = ParseOptions()
  
-  logger.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
-                      stderr_logging=options.debug)
+  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
+                     stderr_logging=options.debug)
  
  
+  update_file = True
    try:
    try:
-    client = cli.GetClient()
-
+    notepad = WatcherState()
      try:
      try:
-      watcher = Watcher()
-    except errors.ConfigurationError:
-      # Just exit if there's no configuration
-      sys.exit(constants.EXIT_SUCCESS)
-
-    watcher.Run()
+      try:
+        client = cli.GetClient()
+      except errors.OpPrereqError:
+        # this is, from cli.GetClient, a not-master case
+        logging.debug("Not on master, exiting")
+        sys.exit(constants.EXIT_SUCCESS)
+      except luxi.NoMasterError, err:
+        logging.warning("Master seems to be down (%s), trying to restart",
+                        str(err))
+        if not StartMaster():
+          logging.critical("Can't start the master, exiting")
+          update_file = False
+          sys.exit(constants.EXIT_FAILURE)
+        # else retry the connection
+        client = cli.GetClient()
+
+      try:
+        watcher = Watcher(options, notepad)
+      except errors.ConfigurationError:
+        # Just exit if there's no configuration
+        sys.exit(constants.EXIT_SUCCESS)
+
+      watcher.Run()
+    finally:
+      if update_file:
+        notepad.Save()
+      else:
+        logging.debug("Not updating status file due to failure")
    except SystemExit:
      raise
    except NotMasterError:
    except SystemExit:
      raise
    except NotMasterError: