Add default instance shutdown timeout constant
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index 4c8584c..0023cac 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -670,22 +670,33 @@ def _BuildInstanceHookEnvByObject(lu, instance, override=None):
   return _BuildInstanceHookEnv(**args)
 
 
-def _AdjustCandidatePool(lu):
+def _AdjustCandidatePool(lu, exceptions):
   """Adjust the candidate pool after node operations.
 
   """
-  mod_list = lu.cfg.MaintainCandidatePool()
+  mod_list = lu.cfg.MaintainCandidatePool(exceptions)
   if mod_list:
     lu.LogInfo("Promoted nodes to master candidate role: %s",
                ", ".join(node.name for node in mod_list))
     for name in mod_list:
       lu.context.ReaddNode(name)
-  mc_now, mc_max = lu.cfg.GetMasterCandidateStats()
+  mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
   if mc_now > mc_max:
     lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
                (mc_now, mc_max))
 
 
+def _DecideSelfPromotion(lu, exceptions=None):
+  """Decide whether I should promote myself as a master candidate.
+
+  """
+  cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
+  mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
+  # the new node will increase mc_max with one, so:
+  mc_should = min(mc_should + 1, cp_size)
+  return mc_now < mc_should
+
+
 def _CheckNicsBridgesExist(lu, target_nics, target_node,
                                profile=constants.PP_DEFAULT):
   """Check that the brigdes needed by a list of nics exist.
@@ -711,6 +722,26 @@ def _CheckInstanceBridgesExist(lu, instance, node=None):
   _CheckNicsBridgesExist(lu, instance.nics, node)
 
 
+def _CheckOSVariant(os, name):
+  """Check whether an OS name conforms to the os variants specification.
+
+  @type os: L{objects.OS}
+  @param os: OS object to check
+  @type name: string
+  @param name: OS name passed by the user, to check for validity
+
+  """
+  if not os.supported_variants:
+    return
+  try:
+    variant = name.split("+", 1)[1]
+  except IndexError:
+    raise errors.OpPrereqError("OS name must include a variant")
+
+  if variant not in os.supported_variants:
+    raise errors.OpPrereqError("Unsupported OS variant")
+
+
 def _GetNodeInstancesInner(cfg, fn):
   return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]
 
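
_CheckOSVariant becomes the single place where "os+variant" names are validated against what the OS definition advertises. A standalone sketch of the same parsing, using a hypothetical FakeOS class instead of objects.OS and ValueError instead of OpPrereqError:

```python
class FakeOS(object):          # stand-in for objects.OS
    def __init__(self, supported_variants):
        self.supported_variants = supported_variants

def check_os_variant(os_obj, name):
    if not os_obj.supported_variants:
        return                 # variant-less OS definitions accept plain names
    try:
        variant = name.split("+", 1)[1]
    except IndexError:
        raise ValueError("OS name must include a variant")
    if variant not in os_obj.supported_variants:
        raise ValueError("Unsupported OS variant")

debootstrap = FakeOS(["lenny", "squeeze"])
check_os_variant(debootstrap, "debootstrap+lenny")   # passes
# check_os_variant(debootstrap, "debootstrap")       # would raise: variant required
```
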
@@ -1594,7 +1625,6 @@ class LURepairDiskSizes(NoHooksLU):
         if full_name is None:
           raise errors.OpPrereqError("Instance '%s' not known" % name)
         self.wanted_names.append(full_name)
-      self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
       self.needed_locks = {
         locking.LEVEL_NODE: [],
         locking.LEVEL_INSTANCE: self.wanted_names,
@@ -1624,6 +1654,29 @@ class LURepairDiskSizes(NoHooksLU):
     self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
                              in self.wanted_names]
 
+  def _EnsureChildSizes(self, disk):
+    """Ensure children of the disk have the needed disk size.
+
+    This is valid mainly for DRBD8 and fixes an issue where the
+    children have smaller disk size.
+
+    @param disk: an L{ganeti.objects.Disk} object
+
+    """
+    if disk.dev_type == constants.LD_DRBD8:
+      assert disk.children, "Empty children for DRBD8?"
+      fchild = disk.children[0]
+      mismatch = fchild.size < disk.size
+      if mismatch:
+        self.LogInfo("Child disk has size %d, parent %d, fixing",
+                     fchild.size, disk.size)
+        fchild.size = disk.size
+
+      # and we recurse on this child only, not on the metadev
+      return self._EnsureChildSizes(fchild) or mismatch
+    else:
+      return False
+
   def Exec(self, feedback_fn):
     """Verify the size of cluster disks.
 
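
_EnsureChildSizes only walks the data child of a DRBD8 disk, growing it to the parent's size and reporting whether anything changed. Roughly, with a hypothetical FakeDisk in place of objects.Disk and string device types in place of the LD_* constants:

```python
class FakeDisk(object):        # stand-in for objects.Disk
    def __init__(self, dev_type, size, children=()):
        self.dev_type = dev_type
        self.size = size
        self.children = list(children)

def ensure_child_sizes(disk):
    """Return True if any child had to be grown to match its parent."""
    if disk.dev_type != "drbd8":
        return False
    data_child = disk.children[0]          # children[1] is the metadev, skipped
    mismatch = data_child.size < disk.size
    if mismatch:
        data_child.size = disk.size
    return ensure_child_sizes(data_child) or mismatch

drbd = FakeDisk("drbd8", 10240, [FakeDisk("lvm", 10112), FakeDisk("lvm", 128)])
print(ensure_child_sizes(drbd))    # True: the data LV was grown
print(drbd.children[0].size)       # 10240
```
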
@@ -1640,8 +1693,11 @@ class LURepairDiskSizes(NoHooksLU):
 
     changed = []
     for node, dskl in per_node_disks.items():
-      result = self.rpc.call_blockdev_getsizes(node, [v[2] for v in dskl])
-      if result.RemoteFailMsg():
+      newl = [v[2].Copy() for v in dskl]
+      for dsk in newl:
+        self.cfg.SetDiskID(dsk, node)
+      result = self.rpc.call_blockdev_getsizes(node, newl)
+      if result.fail_msg:
         self.LogWarning("Failure in blockdev_getsizes call to node"
                         " %s, ignoring", node)
         continue
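
The getsizes RPC now receives Copy()-ed disk objects with SetDiskID() applied per node, instead of the raw configuration objects. A self-contained sketch of that copy-before-annotate pattern, with FakeDisk/FakeConfig as stand-ins for objects.Disk and the config writer:

```python
import copy

class FakeDisk(object):
    def __init__(self, logical_id):
        self.logical_id = logical_id
        self.physical_id = None
    def Copy(self):
        return copy.deepcopy(self)

class FakeConfig(object):
    def SetDiskID(self, disk, node):
        # not the real semantics, just "fill in a node-specific ID"
        disk.physical_id = (node,) + disk.logical_id

cfg = FakeConfig()
config_disk = FakeDisk(("xenvg", "disk0"))
rpc_disks = [d.Copy() for d in [config_disk]]
for d in rpc_disks:
    cfg.SetDiskID(d, "node1.example.com")

print(rpc_disks[0].physical_id)   # ('node1.example.com', 'xenvg', 'disk0')
print(config_disk.physical_id)    # None - the cached config object is untouched
```
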
@@ -1666,6 +1722,9 @@ class LURepairDiskSizes(NoHooksLU):
           disk.size = size
           self.cfg.Update(instance)
           changed.append((instance.name, idx, size))
+        if self._EnsureChildSizes(disk):
+          self.cfg.Update(instance)
+          changed.append((instance.name, idx, disk.size))
     return changed
 
 
@@ -1876,8 +1935,7 @@ class LUSetClusterParams(LogicalUnit):
       invalid_hvs = set(self.hv_list) - constants.HYPER_TYPES
       if invalid_hvs:
         raise errors.OpPrereqError("Enabled hypervisors contains invalid"
-                                   " entries: %s" %
-                                   utils.CommaJoin(invalid_hvs))
+                                   " entries: %s" % " ,".join(invalid_hvs))
     else:
       self.hv_list = cluster.enabled_hypervisors
 
@@ -1918,7 +1976,7 @@ class LUSetClusterParams(LogicalUnit):
     if self.op.candidate_pool_size is not None:
       self.cluster.candidate_pool_size = self.op.candidate_pool_size
       # we need to update the pool size here, otherwise the save will fail
-      _AdjustCandidatePool(self)
+      _AdjustCandidatePool(self, [])
 
     self.cfg.Update(self.cluster)
 
@@ -2106,7 +2164,9 @@ class LUDiagnoseOS(NoHooksLU):
   _OP_REQP = ["output_fields", "names"]
   REQ_BGL = False
   _FIELDS_STATIC = utils.FieldSet()
-  _FIELDS_DYNAMIC = utils.FieldSet("name", "valid", "node_status")
+  _FIELDS_DYNAMIC = utils.FieldSet("name", "valid", "node_status", "variants")
+  # Fields that need calculation of global os validity
+  _FIELDS_NEEDVALID = frozenset(["valid", "variants"])
 
   def ExpandNames(self):
     if self.op.names:
@@ -2154,14 +2214,14 @@ class LUDiagnoseOS(NoHooksLU):
     for node_name, nr in rlist.items():
       if nr.fail_msg or not nr.payload:
         continue
-      for name, path, status, diagnose in nr.payload:
+      for name, path, status, diagnose, variants in nr.payload:
         if name not in all_os:
           # build a list of nodes for this os containing empty lists
           # for each node in node_list
           all_os[name] = {}
           for nname in good_nodes:
             all_os[name][nname] = []
-        all_os[name][node_name].append((path, status, diagnose))
+        all_os[name][node_name].append((path, status, diagnose, variants))
     return all_os
 
   def Exec(self, feedback_fn):
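
With the variants field, each node's payload entry for an OS gains a fourth element. The structure _DiagnoseByOS returns looks roughly like this (all names and values below are made up):

```python
all_os = {
    "debootstrap": {
        "node1.example.com": [("/srv/ganeti/os", True, "", ["lenny", "squeeze"])],
        "node2.example.com": [("/srv/ganeti/os", True, "", ["squeeze"])],
    },
}
# i.e. os name -> node name -> list of (path, status, diagnose, variants)
```
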
@@ -2172,18 +2232,38 @@ class LUDiagnoseOS(NoHooksLU):
     node_data = self.rpc.call_os_diagnose(valid_nodes)
     pol = self._DiagnoseByOS(valid_nodes, node_data)
     output = []
+    calc_valid = self._FIELDS_NEEDVALID.intersection(self.op.output_fields)
+    calc_variants = "variants" in self.op.output_fields
+
     for os_name, os_data in pol.items():
       row = []
+      if calc_valid:
+        valid = True
+        variants = None
+        for osl in os_data.values():
+          valid = valid and osl and osl[0][1]
+          if not valid:
+            variants = None
+            break
+          if calc_variants:
+            node_variants = osl[0][3]
+            if variants is None:
+              variants = node_variants
+            else:
+              variants = [v for v in variants if v in node_variants]
+
       for field in self.op.output_fields:
         if field == "name":
           val = os_name
         elif field == "valid":
-          val = utils.all([osl and osl[0][1] for osl in os_data.values()])
+          val = valid
         elif field == "node_status":
           # this is just a copy of the dict
           val = {}
           for node_name, nos_list in os_data.items():
             val[node_name] = nos_list
+        elif field == "variants":
+          val =  variants
         else:
           raise errors.ParameterError(field)
         row.append(val)
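
For the variants output field, Exec intersects the variant lists advertised by every node, so only variants installable cluster-wide are reported. Reduced to a standalone helper (the node payloads are invented):

```python
def common_variants(per_node_variants):
    variants = None
    for node_variants in per_node_variants:
        if variants is None:
            variants = list(node_variants)
        else:
            variants = [v for v in variants if v in node_variants]
    return variants or []

print(common_variants([["lenny", "squeeze"], ["squeeze", "sid"]]))  # ['squeeze']
```
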
@@ -2254,6 +2334,8 @@ class LURemoveNode(LogicalUnit):
     logging.info("Stopping the node daemon and removing configs from node %s",
                  node.name)
 
+    # Promote nodes to master candidate as needed
+    _AdjustCandidatePool(self, exceptions=[node.name])
     self.context.RemoveNode(node.name)
 
     # Run post hooks on the node before it's removed
@@ -2269,9 +2351,6 @@ class LURemoveNode(LogicalUnit):
       self.LogWarning("Errors encountered on the remote node while leaving"
                       " the cluster: %s", msg)
 
-    # Promote nodes to master candidate as needed
-    _AdjustCandidatePool(self)
-
 
 class LUQueryNodes(NoHooksLU):
   """Logical unit for querying nodes.
@@ -2279,6 +2358,10 @@ class LUQueryNodes(NoHooksLU):
   """
   _OP_REQP = ["output_fields", "names", "use_locking"]
   REQ_BGL = False
+
+  _SIMPLE_FIELDS = ["name", "serial_no", "ctime", "mtime", "uuid",
+                    "master_candidate", "offline", "drained"]
+
   _FIELDS_DYNAMIC = utils.FieldSet(
     "dtotal", "dfree",
     "mtotal", "mnode", "mfree",
@@ -2286,16 +2369,12 @@ class LUQueryNodes(NoHooksLU):
     "ctotal", "cnodes", "csockets",
     )
 
-  _FIELDS_STATIC = utils.FieldSet(
-    "name", "pinst_cnt", "sinst_cnt",
+  _FIELDS_STATIC = utils.FieldSet(*[
+    "pinst_cnt", "sinst_cnt",
     "pinst_list", "sinst_list",
     "pip", "sip", "tags",
-    "serial_no", "ctime", "mtime",
-    "master_candidate",
     "master",
-    "offline",
-    "drained",
-    "role",
+    "role"] + _SIMPLE_FIELDS
     )
 
   def ExpandNames(self):
@@ -2396,8 +2475,8 @@ class LUQueryNodes(NoHooksLU):
     for node in nodelist:
       node_output = []
       for field in self.op.output_fields:
-        if field == "name":
-          val = node.name
+        if field in self._SIMPLE_FIELDS:
+          val = getattr(node, field)
         elif field == "pinst_list":
           val = list(node_to_primary[node.name])
         elif field == "sinst_list":
@@ -2412,20 +2491,8 @@ class LUQueryNodes(NoHooksLU):
           val = node.secondary_ip
         elif field == "tags":
           val = list(node.GetTags())
-        elif field == "serial_no":
-          val = node.serial_no
-        elif field == "ctime":
-          val = node.ctime
-        elif field == "mtime":
-          val = node.mtime
-        elif field == "master_candidate":
-          val = node.master_candidate
         elif field == "master":
           val = node.name == master_node
-        elif field == "offline":
-          val = node.offline
-        elif field == "drained":
-          val = node.drained
         elif self._FIELDS_DYNAMIC.Matches(field):
           val = live_data[node.name].get(field, None)
         elif field == "role":
@@ -2778,15 +2845,12 @@ class LUAddNode(LogicalUnit):
         raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
                                    " based ping to noded port")
 
-    cp_size = self.cfg.GetClusterInfo().candidate_pool_size
     if self.op.readd:
       exceptions = [node]
     else:
       exceptions = []
-    mc_now, mc_max = self.cfg.GetMasterCandidateStats(exceptions)
-    # the new node will increase mc_max with one, so:
-    mc_max = min(mc_max + 1, cp_size)
-    self.master_candidate = mc_now < mc_max
+
+    self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
 
     if self.op.readd:
       self.new_node = self.cfg.GetNodeInfo(node)
@@ -2873,7 +2937,8 @@ class LUAddNode(LogicalUnit):
       nl_payload = result[verifier].payload[constants.NV_NODELIST]
       if nl_payload:
         for failed in nl_payload:
-          feedback_fn("ssh/hostname verification failed %s -> %s" %
+          feedback_fn("ssh/hostname verification failed"
+                      " (checking from %s): %s" %
                       (verifier, nl_payload[failed]))
         raise errors.OpExecError("ssh/hostname verification failed.")
 
@@ -2885,7 +2950,7 @@ class LUAddNode(LogicalUnit):
       # and make sure the new node will not have old files around
       if not new_node.master_candidate:
         result = self.rpc.call_node_demote_from_mc(new_node.name)
-        msg = result.RemoteFailMsg()
+        msg = result.fail_msg
         if msg:
           self.LogWarning("Node failed to demote itself from master"
                           " candidate status: %s" % msg)
@@ -2953,14 +3018,22 @@ class LUSetNodeParams(LogicalUnit):
         raise errors.OpPrereqError("The master role can be changed"
                                    " only via masterfailover")
 
-    if ((self.op.master_candidate == False or self.op.offline == True or
-         self.op.drained == True) and node.master_candidate):
+    # Boolean value that tells us whether we're offlining or draining the node
+    offline_or_drain = self.op.offline == True or self.op.drained == True
+    deoffline_or_drain = self.op.offline == False or self.op.drained == False
+
+    if (node.master_candidate and
+        (self.op.master_candidate == False or offline_or_drain)):
       cp_size = self.cfg.GetClusterInfo().candidate_pool_size
-      num_candidates, _ = self.cfg.GetMasterCandidateStats()
-      if num_candidates <= cp_size:
+      mc_now, mc_should, mc_max = self.cfg.GetMasterCandidateStats()
+      if mc_now <= cp_size:
         msg = ("Not enough master candidates (desired"
-               " %d, new value will be %d)" % (cp_size, num_candidates-1))
-        if self.op.force:
+               " %d, new value will be %d)" % (cp_size, mc_now-1))
+        # Only allow forcing the operation if it's an offline/drain operation,
+        # and we could not possibly promote more nodes.
+        # FIXME: this can still lead to issues if in any way another node which
+        # could be promoted appears in the meantime.
+        if self.op.force and offline_or_drain and mc_should == mc_max:
           self.LogWarning(msg)
         else:
           raise errors.OpPrereqError(msg)
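
The force path is now deliberately narrow: demoting one of the last candidates is only forceable when the node is being offlined/drained and no other node could possibly be promoted in its place (mc_should == mc_max). The guard, sketched with plain values:

```python
# mc_should/mc_max as returned by GetMasterCandidateStats(); illustrative only
def may_force_demotion(force, offline_or_drain, mc_should, mc_max):
    return force and offline_or_drain and mc_should == mc_max

print(may_force_demotion(True, True, 3, 3))   # True: nothing left to promote
print(may_force_demotion(True, True, 3, 5))   # False: another node could take over
print(may_force_demotion(True, False, 3, 3))  # False: plain demotion is refused
```
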
@@ -2971,6 +3044,13 @@ class LUSetNodeParams(LogicalUnit):
       raise errors.OpPrereqError("Node '%s' is offline or drained, can't set"
                                  " to master_candidate" % node.name)
 
+    # If we're being deofflined/drained, we'll MC ourself if needed
+    if (deoffline_or_drain and not offline_or_drain and not
+        self.op.master_candidate == True):
+      self.op.master_candidate = _DecideSelfPromotion(self)
+      if self.op.master_candidate:
+        self.LogInfo("Autopromoting node to master candidate")
+
     return
 
   def Exec(self, feedback_fn):
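
The two helper booleans treat the opcode's three-valued fields (True/False/None, where None means "leave unchanged") independently, and auto-promotion only triggers when the request is purely a de-offline/de-drain. Evaluated for a few example settings:

```python
def classify(op_offline, op_drained):
    # mirrors the explicit "== True" / "== False" tests in CheckPrereq
    offline_or_drain = op_offline == True or op_drained == True
    deoffline_or_drain = op_offline == False or op_drained == False
    return offline_or_drain, deoffline_or_drain

print(classify(True, None))    # (True, False): taking the node out of service
print(classify(False, None))   # (False, True): bringing it back, may self-promote
print(classify(True, False))   # (True, True): mixed request, no auto-promotion
```
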
@@ -3013,7 +3093,7 @@ class LUSetNodeParams(LogicalUnit):
           changed_mc = True
           result.append(("master_candidate", "auto-demotion due to drain"))
           rrc = self.rpc.call_node_demote_from_mc(node.name)
-          msg = rrc.RemoteFailMsg()
+          msg = rrc.fail_msg
           if msg:
             self.LogWarning("Node failed to demote itself: %s" % msg)
         if node.offline:
@@ -3114,6 +3194,7 @@ class LUQueryClusterInfo(NoHooksLU):
       "file_storage_dir": cluster.file_storage_dir,
       "ctime": cluster.ctime,
       "mtime": cluster.mtime,
+      "uuid": cluster.uuid,
       "tags": list(cluster.GetTags()),
       }
 
@@ -3687,6 +3768,7 @@ class LUReinstallInstance(LogicalUnit):
                                   instance.primary_node))
 
     self.op.os_type = getattr(self.op, "os_type", None)
+    self.op.force_variant = getattr(self.op, "force_variant", False)
     if self.op.os_type is not None:
       # OS verification
       pnode = self.cfg.GetNodeInfo(
@@ -3697,6 +3779,8 @@ class LUReinstallInstance(LogicalUnit):
       result = self.rpc.call_os_get(pnode.name, self.op.os_type)
       result.Raise("OS '%s' not in supported OS list for primary node %s" %
                    (self.op.os_type, pnode.name), prereq=True)
+      if not self.op.force_variant:
+        _CheckOSVariant(result.payload, self.op.os_type)
 
     self.instance = instance
 
@@ -3983,6 +4067,8 @@ class LUQueryInstances(NoHooksLU):
   """
   _OP_REQP = ["output_fields", "names", "use_locking"]
   REQ_BGL = False
+  _SIMPLE_FIELDS = ["name", "os", "network_port", "hypervisor",
+                    "serial_no", "ctime", "mtime", "uuid"]
   _FIELDS_STATIC = utils.FieldSet(*["name", "os", "pnode", "snodes",
                                     "admin_state",
                                     "disk_template", "ip", "mac", "bridge",
@@ -3995,9 +4081,8 @@ class LUQueryInstances(NoHooksLU):
                                     r"(nic)\.(bridge)/([0-9]+)",
                                     r"(nic)\.(macs|ips|modes|links|bridges)",
                                     r"(disk|nic)\.(count)",
-                                    "serial_no", "hypervisor", "hvparams",
-                                    "ctime", "mtime",
-                                    ] +
+                                    "hvparams",
+                                    ] + _SIMPLE_FIELDS +
                                   ["hv/%s" % name
                                    for name in constants.HVS_PARAMETERS] +
                                   ["be/%s" % name
@@ -4077,7 +4162,7 @@ class LUQueryInstances(NoHooksLU):
         if result.offline:
           # offline nodes will be in both lists
           off_nodes.append(name)
-        if result.RemoteFailMsg():
+        if result.fail_msg:
           bad_nodes.append(name)
         else:
           if result.payload:
@@ -4100,10 +4185,8 @@ class LUQueryInstances(NoHooksLU):
                                  nic.nicparams) for nic in instance.nics]
       for field in self.op.output_fields:
         st_match = self._FIELDS_STATIC.Matches(field)
-        if field == "name":
-          val = instance.name
-        elif field == "os":
-          val = instance.os
+        if field in self._SIMPLE_FIELDS:
+          val = getattr(instance, field)
         elif field == "pnode":
           val = instance.primary_node
         elif field == "snodes":
@@ -4180,16 +4263,6 @@ class LUQueryInstances(NoHooksLU):
           val = _ComputeDiskSize(instance.disk_template, disk_sizes)
         elif field == "tags":
           val = list(instance.GetTags())
-        elif field == "serial_no":
-          val = instance.serial_no
-        elif field == "ctime":
-          val = instance.ctime
-        elif field == "mtime":
-          val = instance.mtime
-        elif field == "network_port":
-          val = instance.network_port
-        elif field == "hypervisor":
-          val = instance.hypervisor
         elif field == "hvparams":
           val = i_hv
         elif (field.startswith(HVPREFIX) and
@@ -5207,7 +5280,7 @@ def _CreateDisks(lu, instance, to_skip=None, target_node=None):
     result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
 
     result.Raise("Failed to create directory '%s' on"
-                 " node %s: %s" % (file_storage_dir, pnode))
+                 " node %s" % (file_storage_dir, pnode))
 
   # Note: this needs to be kept in sync with adding of disks in
   # LUSetInstanceParams
@@ -5258,10 +5331,10 @@ def _RemoveDisks(lu, instance, target_node=None):
 
   if instance.disk_template == constants.DT_FILE:
     file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
-    if target_node is node:
-      tgt = instance.primary_node
+    if target_node:
+      tgt = target_node
     else:
-      tgt = instance.target_node
+      tgt = instance.primary_node
     result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
     if result.fail_msg:
       lu.LogWarning("Could not remove directory '%s' on node %s: %s",
@@ -5530,9 +5603,15 @@ class LUCreateInstance(LogicalUnit):
           self.op.src_path = src_path = \
             os.path.join(constants.EXPORT_DIR, src_path)
 
+      # On import force_variant must be True, because if we forced it at
+      # initial install, our only chance when importing it back is that it
+      # works again!
+      self.op.force_variant = True
+
     else: # INSTANCE_CREATE
       if getattr(self.op, "os_type", None) is None:
         raise errors.OpPrereqError("No guest OS specified")
+      self.op.force_variant = getattr(self.op, "force_variant", False)
 
   def _RunAllocator(self):
     """Run the allocator based on input opcode.
@@ -5762,6 +5841,8 @@ class LUCreateInstance(LogicalUnit):
     result = self.rpc.call_os_get(pnode.name, self.op.os_type)
     result.Raise("OS '%s' not in supported os list for primary node %s" %
                  (self.op.os_type, pnode.name), prereq=True)
+    if not self.op.force_variant:
+      _CheckOSVariant(result.payload, self.op.os_type)
 
     _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
 
@@ -7030,6 +7111,7 @@ class LUQueryInstanceData(NoHooksLU):
         "serial_no": instance.serial_no,
         "mtime": instance.mtime,
         "ctime": instance.ctime,
+        "uuid": instance.uuid,
         }
 
       result[instance.name] = idict