Add a new cluster parameter maintain_node_health

[ganeti-local] / lib / cli.py
diff --git a/lib/cli.py b/lib/cli.py

index cbf6561..a647f73 100644 (file)
--- a/lib/cli.py
+++ b/lib/cli.py
@@ -36,6 +36,7 @@ from ganeti import opcodes
  from ganeti import luxi
  from ganeti import ssconf
  from ganeti import rpc
+from ganeti import ssh
  
  from optparse import (OptionParser, TitledHelpFormatter,
                        Option, OptionValueError)
@@ -45,6 +46,7 @@ __all__ = [
    # Command line options
    "ALLOCATABLE_OPT",
    "ALL_OPT",
+  "AUTO_PROMOTE_OPT",
    "AUTO_REPLACE_OPT",
    "BACKEND_OPT",
    "CLEANUP_OPT",
@@ -56,6 +58,7 @@ __all__ = [
    "DISK_OPT",
    "DISK_TEMPLATE_OPT",
    "DRAINED_OPT",
+  "EARLY_RELEASE_OPT",
    "ENABLED_HV_OPT",
    "ERROR_CODES_OPT",
    "FIELDS_OPT",
@@ -73,15 +76,20 @@ __all__ = [
    "IGNORE_SECONDARIES_OPT",
    "IGNORE_SIZE_OPT",
    "MAC_PREFIX_OPT",
+  "MAINTAIN_NODE_HEALTH_OPT",
    "MASTER_NETDEV_OPT",
    "MC_OPT",
    "NET_OPT",
+  "NEW_CLUSTER_CERT_OPT",
+  "NEW_CONFD_HMAC_KEY_OPT",
+  "NEW_RAPI_CERT_OPT",
    "NEW_SECONDARY_OPT",
    "NIC_PARAMS_OPT",
    "NODE_LIST_OPT",
    "NODE_PLACEMENT_OPT",
    "NOHDR_OPT",
    "NOIPCHECK_OPT",
+  "NO_INSTALL_OPT",
    "NONAMECHECK_OPT",
    "NOLVM_STORAGE_OPT",
    "NOMODIFY_ETCHOSTS_OPT",
@@ -99,6 +107,7 @@ __all__ = [
    "OFFLINE_OPT",
    "OS_OPT",
    "OS_SIZE_OPT",
+  "RAPI_CERT_OPT",
    "READD_OPT",
    "REBOOT_TYPE_OPT",
    "SECONDARY_IP_OPT",
@@ -115,6 +124,7 @@ __all__ = [
    "TAG_SRC_OPT",
    "TIMEOUT_OPT",
    "USEUNITS_OPT",
+  "USE_REPL_NET_OPT",
    "VERBOSE_OPT",
    "VG_NAME_OPT",
    "YES_DOIT_OPT",
@@ -126,6 +136,7 @@ __all__ = [
    "JobExecutor",
    "JobSubmittedException",
    "ParseTimespec",
+  "RunWhileClusterStopped",
    "SubmitOpCode",
    "SubmitOrSend",
    "UsesRPC",
@@ -145,6 +156,7 @@ __all__ = [
    "ARGS_NONE",
    "ARGS_ONE_INSTANCE",
    "ARGS_ONE_NODE",
+  "ARGS_ONE_OS",
    "ArgChoice",
    "ArgCommand",
    "ArgFile",
@@ -152,6 +164,7 @@ __all__ = [
    "ArgInstance",
    "ArgJobId",
    "ArgNode",
+  "ArgOs",
    "ArgSuggest",
    "ArgUnknown",
    "OPT_COMPL_INST_ADD_NODES",
@@ -245,11 +258,18 @@ class ArgHost(_Argument):
    """
  
  
+class ArgOs(_Argument):
+  """OS argument.
+
+  """
+
+
  ARGS_NONE = []
  ARGS_MANY_INSTANCES = [ArgInstance()]
  ARGS_MANY_NODES = [ArgNode()]
  ARGS_ONE_INSTANCE = [ArgInstance(min=1, max=1)]
  ARGS_ONE_NODE = [ArgNode(min=1, max=1)]
+ARGS_ONE_OS = [ArgOs(min=1, max=1)]
  
  
  def _ExtractTagsObject(opts, args):
@@ -438,6 +458,21 @@ def check_key_val(option, opt, value):  # pylint: disable-msg=W0613
    return _SplitKeyVal(opt, value)
  
  
+def check_bool(option, opt, value): # pylint: disable-msg=W0613
+  """Custom parser for yes/no options.
+
+  This will store the parsed value as either True or False.
+
+  """
+  value = value.lower()
+  if value == constants.VALUE_FALSE or value == "no":
+    return False
+  elif value == constants.VALUE_TRUE or value == "yes":
+    return True
+  else:
+    raise errors.ParameterError("Invalid boolean value '%s'" % value)
+
+
  # completion_suggestion is normally a list. Using numeric values not evaluating
  # to False for dynamic completion.
  (OPT_COMPL_MANY_NODES,
@@ -468,23 +503,23 @@ class CliOption(Option):
      "identkeyval",
      "keyval",
      "unit",
+    "bool",
      )
    TYPE_CHECKER = Option.TYPE_CHECKER.copy()
    TYPE_CHECKER["identkeyval"] = check_ident_key_val
    TYPE_CHECKER["keyval"] = check_key_val
    TYPE_CHECKER["unit"] = check_unit
+  TYPE_CHECKER["bool"] = check_bool
  
  
  # optparse.py sets make_option, so we do it for our own option class, too
  cli_option = CliOption
  
  
-_YESNO = ("yes", "no")
  _YORNO = "yes|no"
  
-DEBUG_OPT = cli_option("-d", "--debug", default=False,
-                       action="store_true",
-                       help="Turn debugging on")
+DEBUG_OPT = cli_option("-d", "--debug", default=0, action="count",
+                       help="Increase debugging level")
  
  NOHDR_OPT = cli_option("--no-headers", default=False,
                         action="store_true", dest="no_headers",
@@ -575,6 +610,11 @@ FORCE_VARIANT_OPT = cli_option("--force-variant", dest="force_variant",
                                 action="store_true", default=False,
                                 help="Force an unknown variant")
  
+NO_INSTALL_OPT = cli_option("--no-install", dest="no_install",
+                            action="store_true", default=False,
+                            help="Do not install the OS (will"
+                            " enable no-start)")
+
  BACKEND_OPT = cli_option("-B", "--backend-parameters", dest="beparams",
                           type="keyval", default={},
                           help="Backend parameters")
@@ -699,6 +739,11 @@ ON_SECONDARY_OPT = cli_option("-s", "--on-secondary", dest="on_secondary",
                                help="Replace the disk(s) on the secondary"
                                " node (only for the drbd template)")
  
+AUTO_PROMOTE_OPT = cli_option("--auto-promote", dest="auto_promote",
+                              default=False, action="store_true",
+                              help="Lock all nodes and auto-promote as needed"
+                              " to MC status")
+
  AUTO_REPLACE_OPT = cli_option("-a", "--auto", dest="auto",
                                default=False, action="store_true",
                                help="Automatically replace faulty disks"
@@ -731,19 +776,19 @@ NOSSH_KEYCHECK_OPT = cli_option("--no-ssh-key-check", dest="ssh_key_check",
  
  
  MC_OPT = cli_option("-C", "--master-candidate", dest="master_candidate",
-                    choices=_YESNO, default=None, metavar=_YORNO,
+                    type="bool", default=None, metavar=_YORNO,
                      help="Set the master_candidate flag on the node")
  
  OFFLINE_OPT = cli_option("-O", "--offline", dest="offline", metavar=_YORNO,
-                         choices=_YESNO, default=None,
+                         type="bool", default=None,
                           help="Set the offline flag on the node")
  
  DRAINED_OPT = cli_option("-D", "--drained", dest="drained", metavar=_YORNO,
-                         choices=_YESNO, default=None,
+                         type="bool", default=None,
                           help="Set the drained flag on the node")
  
  ALLOCATABLE_OPT = cli_option("--allocatable", dest="allocatable",
-                             choices=_YESNO, default=None, metavar=_YORNO,
+                             type="bool", default=None, metavar=_YORNO,
                               help="Set the allocatable flag on a volume")
  
  NOLVM_STORAGE_OPT = cli_option("--no-lvm-storage", dest="lvm_storage",
@@ -837,6 +882,45 @@ SHUTDOWN_TIMEOUT_OPT = cli_option("--shutdown-timeout",
                           default=constants.DEFAULT_SHUTDOWN_TIMEOUT,
                           help="Maximum time to wait for instance shutdown")
  
+EARLY_RELEASE_OPT = cli_option("--early-release",
+                               dest="early_release", default=False,
+                               action="store_true",
+                               help="Release the locks on the secondary"
+                               " node(s) early")
+
+NEW_CLUSTER_CERT_OPT = cli_option("--new-cluster-certificate",
+                                  dest="new_cluster_cert",
+                                  default=False, action="store_true",
+                                  help="Generate a new cluster certificate")
+
+RAPI_CERT_OPT = cli_option("--rapi-certificate", dest="rapi_cert",
+                           default=None,
+                           help="File containing new RAPI certificate")
+
+NEW_RAPI_CERT_OPT = cli_option("--new-rapi-certificate", dest="new_rapi_cert",
+                               default=None, action="store_true",
+                               help=("Generate a new self-signed RAPI"
+                                     " certificate"))
+
+NEW_CONFD_HMAC_KEY_OPT = cli_option("--new-confd-hmac-key",
+                                    dest="new_confd_hmac_key",
+                                    default=False, action="store_true",
+                                    help=("Create a new HMAC key for %s" %
+                                          constants.CONFD))
+
+USE_REPL_NET_OPT = cli_option("--use-replication-network",
+                              dest="use_replication_network",
+                              help="Whether to use the replication network"
+                              " for talking to the nodes",
+                              action="store_true", default=False)
+
+MAINTAIN_NODE_HEALTH_OPT = \
+    cli_option("--maintain-node-health", dest="maintain_node_health",
+               metavar=_YORNO, default=None, type="bool",
+               help="Configure the cluster to automatically maintain node"
+               " health, by shutting down unknown instances, shutting down"
+               " unknown DRBD devices, etc.")
+
  
  def _ParseArgs(argv, commands, aliases):
    """Parser for the command line arguments.
@@ -1126,12 +1210,28 @@ def PollJob(job_id, cl=None, feedback_fn=None):
    prev_job_info = None
    prev_logmsg_serial = None
  
+  status = None
+
+  notified_queued = False
+  notified_waitlock = False
+
    while True:
-    result = cl.WaitForJobChange(job_id, ["status"], prev_job_info,
-                                 prev_logmsg_serial)
+    result = cl.WaitForJobChangeOnce(job_id, ["status"], prev_job_info,
+                                     prev_logmsg_serial)
      if not result:
        # job not found, go away!
        raise errors.JobLost("Job with id %s lost" % job_id)
+    elif result == constants.JOB_NOTCHANGED:
+      if status is not None and not callable(feedback_fn):
+        if status == constants.JOB_STATUS_QUEUED and not notified_queued:
+          ToStderr("Job %s is waiting in queue", job_id)
+          notified_queued = True
+        elif status == constants.JOB_STATUS_WAITLOCK and not notified_waitlock:
+          ToStderr("Job %s is trying to acquire all necessary locks", job_id)
+          notified_waitlock = True
+
+      # Wait again
+      continue
  
      # Split result, a tuple of (field values, log entries)
      (job_info, log_entries) = result
@@ -1182,7 +1282,7 @@ def PollJob(job_id, cl=None, feedback_fn=None):
      raise errors.OpExecError(result)
  
  
-def SubmitOpCode(op, cl=None, feedback_fn=None):
+def SubmitOpCode(op, cl=None, feedback_fn=None, opts=None):
    """Legacy function to submit an opcode.
  
    This is just a simple wrapper over the construction of the processor
@@ -1193,6 +1293,8 @@ def SubmitOpCode(op, cl=None, feedback_fn=None):
    if cl is None:
      cl = GetClient()
  
+  SetGenericOpcodeOpts([op], opts)
+
    job_id = SendJob([op], cl)
  
    op_results = PollJob(job_id, cl=cl, feedback_fn=feedback_fn)
@@ -1208,16 +1310,35 @@ def SubmitOrSend(op, opts, cl=None, feedback_fn=None):
    whether to just send the job and print its identifier. It is used in
    order to simplify the implementation of the '--submit' option.
  
-  It will also add the dry-run parameter from the options passed, if true.
+  It will also process the opcodes if we're sending the via SendJob
+  (otherwise SubmitOpCode does it).
  
    """
-  if opts and opts.dry_run:
-    op.dry_run = opts.dry_run
    if opts and opts.submit_only:
-    job_id = SendJob([op], cl=cl)
+    job = [op]
+    SetGenericOpcodeOpts(job, opts)
+    job_id = SendJob(job, cl=cl)
      raise JobSubmittedException(job_id)
    else:
-    return SubmitOpCode(op, cl=cl, feedback_fn=feedback_fn)
+    return SubmitOpCode(op, cl=cl, feedback_fn=feedback_fn, opts=opts)
+
+
+def SetGenericOpcodeOpts(opcode_list, options):
+  """Processor for generic options.
+
+  This function updates the given opcodes based on generic command
+  line options (like debug, dry-run, etc.).
+
+  @param opcode_list: list of opcodes
+  @param options: command line options or None
+  @return: None (in-place modification)
+
+  """
+  if not options:
+    return
+  for op in opcode_list:
+    op.dry_run = options.dry_run
+    op.debug_level = options.debug
  
  
  def GetClient():
@@ -1442,13 +1563,23 @@ def GenericInstanceCreate(mode, opts, args):
        if not isinstance(ddict, dict):
          msg = "Invalid disk/%d value: expected dict, got %s" % (didx, ddict)
          raise errors.OpPrereqError(msg)
-      elif "size" not in ddict:
-        raise errors.OpPrereqError("Missing size for disk %d" % didx)
-      try:
-        ddict["size"] = utils.ParseUnit(ddict["size"])
-      except ValueError, err:
-        raise errors.OpPrereqError("Invalid disk size for disk %d: %s" %
-                                   (didx, err))
+      elif "size" in ddict:
+        if "adopt" in ddict:
+          raise errors.OpPrereqError("Only one of 'size' and 'adopt' allowed"
+                                     " (disk %d)" % didx)
+        try:
+          ddict["size"] = utils.ParseUnit(ddict["size"])
+        except ValueError, err:
+          raise errors.OpPrereqError("Invalid disk size for disk %d: %s" %
+                                     (didx, err))
+      elif "adopt" in ddict:
+        if mode == constants.INSTANCE_IMPORT:
+          raise errors.OpPrereqError("Disk adoption not allowed for instance"
+                                     " import")
+        ddict["size"] = 0
+      else:
+        raise errors.OpPrereqError("Missing size or adoption source for"
+                                   " disk %d" % didx)
        disks[didx] = ddict
  
    utils.ForceDictType(opts.beparams, constants.BES_PARAMETER_TYPES)
@@ -1459,11 +1590,13 @@ def GenericInstanceCreate(mode, opts, args):
      os_type = opts.os
      src_node = None
      src_path = None
+    no_install = opts.no_install
    elif mode == constants.INSTANCE_IMPORT:
      start = False
      os_type = None
      src_node = opts.src_node
      src_path = opts.src_dir
+    no_install = None
    else:
      raise errors.ProgrammerError("Invalid creation mode %s" % mode)
  
@@ -1485,12 +1618,136 @@ def GenericInstanceCreate(mode, opts, args):
                                  start=start,
                                  os_type=os_type,
                                  src_node=src_node,
-                                src_path=src_path)
+                                src_path=src_path,
+                                no_install=no_install)
  
    SubmitOrSend(op, opts)
    return 0
  
  
+class _RunWhileClusterStoppedHelper:
+  """Helper class for L{RunWhileClusterStopped} to simplify state management
+
+  """
+  def __init__(self, feedback_fn, cluster_name, master_node, online_nodes):
+    """Initializes this class.
+
+    @type feedback_fn: callable
+    @param feedback_fn: Feedback function
+    @type cluster_name: string
+    @param cluster_name: Cluster name
+    @type master_node: string
+    @param master_node Master node name
+    @type online_nodes: list
+    @param online_nodes: List of names of online nodes
+
+    """
+    self.feedback_fn = feedback_fn
+    self.cluster_name = cluster_name
+    self.master_node = master_node
+    self.online_nodes = online_nodes
+
+    self.ssh = ssh.SshRunner(self.cluster_name)
+
+    self.nonmaster_nodes = [name for name in online_nodes
+                            if name != master_node]
+
+    assert self.master_node not in self.nonmaster_nodes
+
+  def _RunCmd(self, node_name, cmd):
+    """Runs a command on the local or a remote machine.
+
+    @type node_name: string
+    @param node_name: Machine name
+    @type cmd: list
+    @param cmd: Command
+
+    """
+    if node_name is None or node_name == self.master_node:
+      # No need to use SSH
+      result = utils.RunCmd(cmd)
+    else:
+      result = self.ssh.Run(node_name, "root", utils.ShellQuoteArgs(cmd))
+
+    if result.failed:
+      errmsg = ["Failed to run command %s" % result.cmd]
+      if node_name:
+        errmsg.append("on node %s" % node_name)
+      errmsg.append(": exitcode %s and error %s" %
+                    (result.exit_code, result.output))
+      raise errors.OpExecError(" ".join(errmsg))
+
+  def Call(self, fn, *args):
+    """Call function while all daemons are stopped.
+
+    @type fn: callable
+    @param fn: Function to be called
+
+    """
+    # Pause watcher by acquiring an exclusive lock on watcher state file
+    self.feedback_fn("Blocking watcher")
+    watcher_block = utils.FileLock.Open(constants.WATCHER_STATEFILE)
+    try:
+      # TODO: Currently, this just blocks. There's no timeout.
+      # TODO: Should it be a shared lock?
+      watcher_block.Exclusive(blocking=True)
+
+      # Stop master daemons, so that no new jobs can come in and all running
+      # ones are finished
+      self.feedback_fn("Stopping master daemons")
+      self._RunCmd(None, [constants.DAEMON_UTIL, "stop-master"])
+      try:
+        # Stop daemons on all nodes
+        for node_name in self.online_nodes:
+          self.feedback_fn("Stopping daemons on %s" % node_name)
+          self._RunCmd(node_name, [constants.DAEMON_UTIL, "stop-all"])
+
+        # All daemons are shut down now
+        try:
+          return fn(self, *args)
+        except Exception, err:
+          _, errmsg = FormatError(err)
+          logging.exception("Caught exception")
+          self.feedback_fn(errmsg)
+          raise
+      finally:
+        # Start cluster again, master node last
+        for node_name in self.nonmaster_nodes + [self.master_node]:
+          self.feedback_fn("Starting daemons on %s" % node_name)
+          self._RunCmd(node_name, [constants.DAEMON_UTIL, "start-all"])
+    finally:
+      # Resume watcher
+      watcher_block.Close()
+
+
+def RunWhileClusterStopped(feedback_fn, fn, *args):
+  """Calls a function while all cluster daemons are stopped.
+
+  @type feedback_fn: callable
+  @param feedback_fn: Feedback function
+  @type fn: callable
+  @param fn: Function to be called when daemons are stopped
+
+  """
+  feedback_fn("Gathering cluster information")
+
+  # This ensures we're running on the master daemon
+  cl = GetClient()
+
+  (cluster_name, master_node) = \
+    cl.QueryConfigValues(["cluster_name", "master_node"])
+
+  online_nodes = GetOnlineNodes([], cl=cl)
+
+  # Don't keep a reference to the client. The master daemon will go away.
+  del cl
+
+  assert master_node in online_nodes
+
+  return _RunWhileClusterStoppedHelper(feedback_fn, cluster_name, master_node,
+                                       online_nodes).Call(fn, *args)
+
+
  def GenerateTable(headers, fields, separator, data,
                    numfields=None, unitfields=None,
                    units=None):
@@ -1561,7 +1818,7 @@ def GenerateTable(headers, fields, separator, data,
        if unitfields.Matches(fields[idx]):
          try:
            val = int(val)
-        except ValueError:
+        except (TypeError, ValueError):
            pass
          else:
            val = row[idx] = utils.FormatUnit(val, units)
@@ -1575,14 +1832,17 @@ def GenerateTable(headers, fields, separator, data,
      for idx, name in enumerate(fields):
        hdr = headers[name]
        if separator is None:
-        if idx == len(fields) - 1 and not numfields.Matches(name):
-          mlens[idx] = 0
-        else:
-          mlens[idx] = max(mlens[idx], len(hdr))
+        mlens[idx] = max(mlens[idx], len(hdr))
          args.append(mlens[idx])
        args.append(hdr)
      result.append(format % tuple(args))
  
+  if separator is None:
+    assert len(mlens) == len(fields)
+
+    if fields and not numfields.Matches(fields[-1]):
+      mlens[-1] = 0
+
    for line in data:
      args = []
      if line is None:
@@ -1639,7 +1899,7 @@ def ParseTimespec(value):
    if value[-1] not in suffix_map:
      try:
        value = int(value)
-    except ValueError:
+    except (TypeError, ValueError):
        raise errors.OpPrereqError("Invalid time specification '%s'" % value)
    else:
      multiplier = suffix_map[value[-1]]
@@ -1649,12 +1909,13 @@ def ParseTimespec(value):
                                   " suffix passed)")
      try:
        value = int(value) * multiplier
-    except ValueError:
+    except (TypeError, ValueError):
        raise errors.OpPrereqError("Invalid time specification '%s'" % value)
    return value
  
  
-def GetOnlineNodes(nodes, cl=None, nowarn=False):
+def GetOnlineNodes(nodes, cl=None, nowarn=False, secondary_ips=False,
+                   filter_master=False):
    """Returns the names of online nodes.
  
    This function will also log a warning on stderr with the names of
@@ -1667,17 +1928,36 @@ def GetOnlineNodes(nodes, cl=None, nowarn=False):
    @param nowarn: by default, this function will output a note with the
        offline nodes that are skipped; if this parameter is True the
        note is not displayed
+  @type secondary_ips: boolean
+  @param secondary_ips: if True, return the secondary IPs instead of the
+      names, useful for doing network traffic over the replication interface
+      (if any)
+  @type filter_master: boolean
+  @param filter_master: if True, do not return the master node in the list
+      (useful in coordination with secondary_ips where we cannot check our
+      node name against the list)
  
    """
    if cl is None:
      cl = GetClient()
  
-  result = cl.QueryNodes(names=nodes, fields=["name", "offline"],
+  if secondary_ips:
+    name_idx = 2
+  else:
+    name_idx = 0
+
+  if filter_master:
+    master_node = cl.QueryConfigValues(["master_node"])[0]
+    filter_fn = lambda x: x != master_node
+  else:
+    filter_fn = lambda _: True
+
+  result = cl.QueryNodes(names=nodes, fields=["name", "offline", "sip"],
                           use_locking=False)
    offline = [row[0] for row in result if row[1]]
    if offline and not nowarn:
      ToStderr("Note: skipping offline node(s): %s" % utils.CommaJoin(offline))
-  return [row[0] for row in result if not row[1]]
+  return [row[name_idx] for row in result if not row[1] and filter_fn(row[0])]
  
  
  def _ToStream(stream, txt, *args):
@@ -1729,13 +2009,15 @@ class JobExecutor(object):
    GetResults() calls.
  
    """
-  def __init__(self, cl=None, verbose=True):
+  def __init__(self, cl=None, verbose=True, opts=None, feedback_fn=None):
      self.queue = []
      if cl is None:
        cl = GetClient()
      self.cl = cl
      self.verbose = verbose
      self.jobs = []
+    self.opts = opts
+    self.feedback_fn = feedback_fn
  
    def QueueJob(self, name, *ops):
      """Record a job for later submit.
@@ -1743,6 +2025,7 @@ class JobExecutor(object):
      @type name: string
      @param name: a description of the job, will be used in WaitJobSet
      """
+    SetGenericOpcodeOpts(ops, self.opts)
      self.queue.append((name, ops))
  
    def SubmitPending(self):
@@ -1750,8 +2033,31 @@ class JobExecutor(object):
  
      """
      results = self.cl.SubmitManyJobs([row[1] for row in self.queue])
-    for ((status, data), (name, _)) in zip(results, self.queue):
-      self.jobs.append((status, data, name))
+    for (idx, ((status, data), (name, _))) in enumerate(zip(results,
+                                                            self.queue)):
+      self.jobs.append((idx, status, data, name))
+
+  def _ChooseJob(self):
+    """Choose a non-waiting/queued job to poll next.
+
+    """
+    assert self.jobs, "_ChooseJob called with empty job list"
+
+    result = self.cl.QueryJobs([i[2] for i in self.jobs], ["status"])
+    assert result
+
+    for job_data, status in zip(self.jobs, result):
+      if status[0] in (constants.JOB_STATUS_QUEUED,
+                    constants.JOB_STATUS_WAITLOCK,
+                    constants.JOB_STATUS_CANCELING):
+        # job is still waiting
+        continue
+      # good candidate found
+      self.jobs.remove(job_data)
+      return job_data
+
+    # no job found
+    return self.jobs.pop(0)
  
    def GetResults(self):
      """Wait for and return the results of all jobs.
@@ -1766,18 +2072,21 @@ class JobExecutor(object):
        self.SubmitPending()
      results = []
      if self.verbose:
-      ok_jobs = [row[1] for row in self.jobs if row[0]]
+      ok_jobs = [row[2] for row in self.jobs if row[1]]
        if ok_jobs:
          ToStdout("Submitted jobs %s", utils.CommaJoin(ok_jobs))
-    for submit_status, jid, name in self.jobs:
-      if not submit_status:
-        ToStderr("Failed to submit job for %s: %s", name, jid)
-        results.append((False, jid))
-        continue
-      if self.verbose:
-        ToStdout("Waiting for job %s for %s...", jid, name)
+
+    # first, remove any non-submitted jobs
+    self.jobs, failures = utils.partition(self.jobs, lambda x: x[1])
+    for idx, _, jid, name in failures:
+      ToStderr("Failed to submit job for %s: %s", name, jid)
+      results.append((idx, False, jid))
+
+    while self.jobs:
+      (idx, _, jid, name) = self._ChooseJob()
+      ToStdout("Waiting for job %s for %s...", jid, name)
        try:
-        job_result = PollJob(jid, cl=self.cl)
+        job_result = PollJob(jid, cl=self.cl, feedback_fn=self.feedback_fn)
          success = True
        except (errors.GenericError, luxi.ProtocolError), err:
          _, job_result = FormatError(err)
@@ -1785,7 +2094,12 @@ class JobExecutor(object):
          # the error message will always be shown, verbose or not
          ToStderr("Job %s for %s has failed: %s", jid, name, job_result)
  
-      results.append((success, job_result))
+      results.append((idx, success, job_result))
+
+    # sort based on the index, then drop it
+    results.sort()
+    results = [i[1:] for i in results]
+
      return results
  
    def WaitOrShow(self, wait):
@@ -1800,7 +2114,7 @@ class JobExecutor(object):
      else:
        if not self.jobs:
          self.SubmitPending()
-      for status, result, name in self.jobs:
+      for _, status, result, name in self.jobs:
          if status:
            ToStdout("%s: %s", result, name)
          else: