X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/eddabe9983f954635e1da7d25909859be31e2929..e82ac01cfd4fa14a8e5e03a0c77045c21537f4e0:/lib/cmdlib.py

diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index 90f016c..caae061 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -832,7 +832,8 @@ class LUVerifyCluster(LogicalUnit):
       for prinode, instances in nodeinfo['sinst-by-pnode'].iteritems():
         needed_mem = 0
         for instance in instances:
-          needed_mem += instance_cfg[instance].memory
+          if instance_cfg[instance].auto_balance:
+            needed_mem += instance_cfg[instance].memory
         if nodeinfo['mfree'] < needed_mem:
           feedback_fn("  - ERROR: not enough memory on node %s to accomodate"
                       " failovers should node %s fail" % (node, prinode))
@@ -879,6 +880,7 @@ class LUVerifyCluster(LogicalUnit):
     nodeinfo = [self.cfg.GetNodeInfo(nname) for nname in nodelist]
     instancelist = utils.NiceSort(self.cfg.GetInstanceList())
     i_non_redundant = [] # Non redundant instances
+    i_non_a_balanced = [] # Non auto-balanced instances
     node_volume = {}
     node_instance = {}
     node_info = {}
@@ -1001,6 +1003,9 @@ class LUVerifyCluster(LogicalUnit):
         feedback_fn("  - WARNING: multiple secondaries for instance %s"
                     % instance)
 
+      if not inst_config.auto_balance:
+        i_non_a_balanced.append(instance)
+
       for snode in inst_config.secondary_nodes:
         if snode in node_info:
           node_info[snode]['sinst'].append(instance)
@@ -1032,6 +1037,10 @@ class LUVerifyCluster(LogicalUnit):
       feedback_fn("  - NOTICE: %d non-redundant instance(s) found."
                   % len(i_non_redundant))
 
+    if i_non_a_balanced:
+      feedback_fn("  - NOTICE: %d non-auto-balanced instance(s) found."
+                  % len(i_non_a_balanced))
+
     return int(bad)
 
   def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
@@ -1181,8 +1190,7 @@ class LURenameCluster(LogicalUnit):
       raise errors.OpPrereqError("Neither the name nor the IP address of the"
                                  " cluster has changed")
    if new_ip != old_ip:
-      result = utils.RunCmd(["fping", "-q", new_ip])
-      if not result.failed:
+      if utils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
         raise errors.OpPrereqError("The given cluster IP address (%s) is"
                                    " reachable on the network. Aborting."
                                    % new_ip)
@@ -1491,7 +1499,7 @@ class LUQueryNodes(NoHooksLU):
       "dtotal", "dfree",
       "mtotal", "mnode", "mfree",
       "bootid",
-      "ctotal",
+      "ctotal", "cnodes", "csockets",
       ])
 
     _CheckOutputFields(static=["name", "pinst_cnt", "sinst_cnt",
@@ -1517,14 +1525,17 @@ class LUQueryNodes(NoHooksLU):
       for name in nodenames:
         nodeinfo = node_data.get(name, None)
         if nodeinfo:
+          fn = utils.TryConvert
           live_data[name] = {
-            "mtotal": utils.TryConvert(int, nodeinfo['memory_total']),
-            "mnode": utils.TryConvert(int, nodeinfo['memory_dom0']),
-            "mfree": utils.TryConvert(int, nodeinfo['memory_free']),
-            "dtotal": utils.TryConvert(int, nodeinfo['vg_size']),
-            "dfree": utils.TryConvert(int, nodeinfo['vg_free']),
-            "ctotal": utils.TryConvert(int, nodeinfo['cpu_total']),
-            "bootid": nodeinfo['bootid'],
+            "mtotal": fn(int, nodeinfo.get('memory_total', None)),
+            "mnode": fn(int, nodeinfo.get('memory_dom0', None)),
+            "mfree": fn(int, nodeinfo.get('memory_free', None)),
+            "dtotal": fn(int, nodeinfo.get('vg_size', None)),
+            "dfree": fn(int, nodeinfo.get('vg_free', None)),
+            "ctotal": fn(int, nodeinfo.get('cpu_total', None)),
+            "bootid": nodeinfo.get('bootid', None),
+            "cnodes": fn(int, nodeinfo.get('cpu_nodes', None)),
+            "csockets": fn(int, nodeinfo.get('cpu_sockets', None)),
             }
         else:
           live_data[name] = {}
@@ -2261,7 +2272,8 @@ def _CheckNodeFreeMemory(cfg, node, reason, requested):
 
   """
   nodeinfo = rpc.call_node_info([node], cfg.GetVGName())
-  if not nodeinfo or not isinstance(nodeinfo, dict):
+  if not (nodeinfo and isinstance(nodeinfo, dict) and
+          node in nodeinfo and isinstance(nodeinfo[node], dict)):
     raise errors.OpPrereqError("Could not contact node %s for resource"
                              " information" % (node,))
 
@@ -2589,9 +2601,7 @@ class LURenameInstance(LogicalUnit):
                                  new_name)
 
     if not getattr(self.op, "ignore_ip", False):
-      command = ["fping", "-q", name_info.ip]
-      result = utils.RunCmd(command)
-      if not result.failed:
+      if utils.TcpPing(name_info.ip, constants.DEFAULT_NODED_PORT):
         raise errors.OpPrereqError("IP %s of instance %s already in use"
                                    % (name_info.ip, new_name))
 
@@ -2612,8 +2622,8 @@ class LURenameInstance(LogicalUnit):
     try:
       if not rpc.call_instance_run_rename(inst.primary_node, inst, old_name,
                                           "sda", "sdb"):
-        msg = ("Could run OS rename script for instance %s on node %s (but the"
-               " instance has been renamed in Ganeti)" %
+        msg = ("Could not run OS rename script for instance %s on node %s"
+               " (but the instance has been renamed in Ganeti)" %
                (inst.name, inst.primary_node))
         logger.Error(msg)
     finally:
@@ -2695,7 +2705,12 @@ class LUQueryInstances(NoHooksLU):
     _CheckOutputFields(static=["name", "os", "pnode", "snodes",
                                "admin_state", "admin_ram",
                                "disk_template", "ip", "mac", "bridge",
-                               "sda_size", "sdb_size", "vcpus", "tags"],
+                               "sda_size", "sdb_size", "vcpus", "tags",
+                               "auto_balance",
+                               "network_port", "kernel_path", "initrd_path",
+                               "hvm_boot_order", "hvm_acpi", "hvm_pae",
+                               "hvm_cdrom_image_path", "hvm_nic_type",
+                               "hvm_disk_type", "vnc_bind_address"],
                        dynamic=self.dynamic_fields,
                        selected=self.op.output_fields)
 
@@ -2790,6 +2805,19 @@ class LUQueryInstances(NoHooksLU):
           val = instance.vcpus
         elif field == "tags":
           val = list(instance.GetTags())
+        elif field == "auto_balance":
+          val = instance.auto_balance
+        elif field in ("network_port", "kernel_path", "initrd_path",
+                       "hvm_boot_order", "hvm_acpi", "hvm_pae",
+                       "hvm_cdrom_image_path", "hvm_nic_type",
+                       "hvm_disk_type", "vnc_bind_address"):
+          val = getattr(instance, field, None)
+          if val is None:
+            if field in ("hvm_nic_type", "hvm_disk_type",
+                         "kernel_path", "initrd_path"):
+              val = "default"
+            else:
+              val = "-"
         else:
           raise errors.ParameterError(field)
         iout.append(val)
@@ -2923,7 +2951,7 @@ class LUMigrateInstance(LogicalUnit):
   """
   HPATH = "instance-migrate"
   HTYPE = constants.HTYPE_INSTANCE
-  _OP_REQP = ["instance_name", "live"]
+  _OP_REQP = ["instance_name", "live", "cleanup"]
 
   def BuildHooksEnv(self):
     """Build hooks env.
@@ -2958,7 +2986,7 @@ class LUMigrateInstance(LogicalUnit):
     target_node = secondary_nodes[0]
 
     # check memory requirements on the secondary node
-    _CheckNodeFreeMemory(self.cfg, target_node, "failing over instance %s" %
+    _CheckNodeFreeMemory(self.cfg, target_node, "migrating instance %s" %
                          instance.name, instance.memory)
 
     # check bridge existance
@@ -2968,13 +2996,15 @@ class LUMigrateInstance(LogicalUnit):
                                  " exist on destination node '%s'" %
                                  (brlist, target_node))
 
-    migratable = rpc.call_instance_migratable(instance.primary_node, instance)
-    if not migratable:
-      raise errors.OpPrereqError("Can't contact node '%s'" %
-                                 instance.primary_node)
-    if not migratable[0]:
-      raise errors.OpPrereqError("Can't migrate: %s - please use failover" %
-                                 migratable[1])
+    if not self.op.cleanup:
+      migratable = rpc.call_instance_migratable(instance.primary_node,
+                                                instance)
+      if not migratable:
+        raise errors.OpPrereqError("Can't contact node '%s'" %
+                                   instance.primary_node)
+      if not migratable[0]:
+        raise errors.OpPrereqError("Can't migrate: %s - please use failover" %
+                                   migratable[1])
 
     self.instance = instance
 
@@ -3067,6 +3097,69 @@ class LUMigrateInstance(LogicalUnit):
       raise errors.OpExecError("Cannot identify disks node %s,"
                                " error %s" % (node, result[node][1]))
 
+  def _ExecCleanup(self):
+    """Try to cleanup after a failed migration.
+
+    The cleanup is done by:
+      - check that the instance is running only on one node
+        (and update the config if needed)
+      - change disks on its secondary node to secondary
+      - wait until disks are fully synchronized
+      - disconnect from the network
+      - change disks into single-master mode
+      - wait again until disks are fully synchronized
+
+    """
+    instance = self.instance
+    target_node = self.target_node
+    source_node = self.source_node
+
+    # check running on only one node
+    self.feedback_fn("* checking where the instance actually runs"
+                     " (if this hangs, the hypervisor might be in"
+                     " a bad state)")
+    ins_l = rpc.call_instance_list(self.all_nodes)
+    for node in self.all_nodes:
+      if not type(ins_l[node]) is list:
+        raise errors.OpExecError("Can't contact node '%s'" % node)
+
+    runningon_source = instance.name in ins_l[source_node]
+    runningon_target = instance.name in ins_l[target_node]
+
+    if runningon_source and runningon_target:
+      raise errors.OpExecError("Instance seems to be running on two nodes,"
+                               " or the hypervisor is confused. You will have"
+                               " to ensure manually that it runs only on one"
+                               " and restart this operation.")
+
+    if not (runningon_source or runningon_target):
+      raise errors.OpExecError("Instance does not seem to be running at all."
+ " In this case, it's safer to repair by" + " running 'gnt-instance stop' to ensure disk" + " shutdown, and then restarting it.") + + if runningon_target: + # the migration has actually succeeded, we need to update the config + self.feedback_fn("* instance running on secondary node (%s)," + " updating config" % target_node) + instance.primary_node = target_node + self.cfg.Update(instance) + demoted_node = source_node + else: + self.feedback_fn("* instance confirmed to be running on its" + " primary node (%s)" % source_node) + demoted_node = target_node + + self._IdentifyDisks() + + self._EnsureSecondary(demoted_node) + self._WaitUntilSync() + self._GoStandalone() + self._GoReconnect(False) + self._WaitUntilSync() + + self.feedback_fn("* done") + def _ExecMigration(self): """Migrate an instance. @@ -3098,12 +3191,12 @@ class LUMigrateInstance(LogicalUnit): self._WaitUntilSync() self.feedback_fn("* migrating instance to %s" % target_node) + time.sleep(10) result = rpc.call_instance_migrate(source_node, instance, self.nodes_ip[target_node], self.op.live) if not result or not result[0]: logger.Error("Instance migration failed, trying to revert disk status") - try: self._EnsureSecondary(target_node) self._GoStandalone() @@ -3115,6 +3208,7 @@ class LUMigrateInstance(LogicalUnit): raise errors.OpExecError("Could not migrate instance %s: %s" % (instance.name, result[1])) + time.sleep(10) instance.primary_node = target_node # distribute new instance config to the other nodes @@ -3141,7 +3235,10 @@ class LUMigrateInstance(LogicalUnit): self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip, self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip, } - return self._ExecMigration() + if self.op.cleanup: + return self._ExecCleanup() + else: + return self._ExecMigration() def _CreateBlockDevOnPrimary(cfg, node, instance, device, info): @@ -3415,7 +3512,7 @@ class LUCreateInstance(LogicalUnit): HTYPE = constants.HTYPE_INSTANCE _OP_REQP = ["instance_name", "mem_size", "disk_size", "disk_template", "swap_size", "mode", "start", "vcpus", - "wait_for_sync", "ip_check", "mac"] + "wait_for_sync", "ip_check", "mac", "auto_balance"] def _RunAllocator(self): """Run the allocator based on input opcode. 
@@ -3446,7 +3543,8 @@ class LUCreateInstance(LogicalUnit):
     if len(ial.nodes) != ial.required_nodes:
       raise errors.OpPrereqError("iallocator '%s' returned invalid number"
                                  " of nodes (%s), required %s" %
-                                 (len(ial.nodes), ial.required_nodes))
+                                 (self.op.iallocator, len(ial.nodes),
+                                  ial.required_nodes))
     self.op.pnode = ial.nodes[0]
     logger.ToStdout("Selected nodes for the instance: %s" %
                     (", ".join(ial.nodes),))
@@ -3494,7 +3592,7 @@ class LUCreateInstance(LogicalUnit):
     # set optional parameters to none if they don't exist
     for attr in ["kernel_path", "initrd_path", "hvm_boot_order", "pnode",
                  "iallocator", "hvm_acpi", "hvm_pae", "hvm_cdrom_image_path",
-                 "vnc_bind_address"]:
+                 "hvm_nic_type", "hvm_disk_type", "vnc_bind_address"]:
       if not hasattr(self.op, attr):
         setattr(self.op, attr, None)
 
@@ -3695,6 +3793,15 @@ class LUCreateInstance(LogicalUnit):
                                    " like a valid IP address" %
                                    self.op.vnc_bind_address)
 
+    # Xen HVM device type checks
+    if self.sstore.GetHypervisorType() == constants.HT_XEN_HVM31:
+      if self.op.hvm_nic_type not in constants.HT_HVM_VALID_NIC_TYPES:
+        raise errors.OpPrereqError("Invalid NIC type %s specified for Xen HVM"
+                                   " hypervisor" % self.op.hvm_nic_type)
+      if self.op.hvm_disk_type not in constants.HT_HVM_VALID_DISK_TYPES:
+        raise errors.OpPrereqError("Invalid disk type %s specified for Xen HVM"
+                                   " hypervisor" % self.op.hvm_disk_type)
+
     if self.op.start:
       self.instance_status = 'up'
     else:
@@ -3746,6 +3853,9 @@ class LUCreateInstance(LogicalUnit):
                             hvm_pae=self.op.hvm_pae,
                             hvm_cdrom_image_path=self.op.hvm_cdrom_image_path,
                             vnc_bind_address=self.op.vnc_bind_address,
+                            hvm_nic_type=self.op.hvm_nic_type,
+                            hvm_disk_type=self.op.hvm_disk_type,
+                            auto_balance=bool(self.op.auto_balance),
                             )
 
     feedback_fn("* creating instance disks...")
@@ -4140,7 +4250,7 @@ class LUReplaceDisks(LogicalUnit):
      if self.op.remote_node is not None:
        raise errors.OpPrereqError("Give either the iallocator or the new"
                                   " secondary, not both")
-      self.op.remote_node = self._RunAllocator()
+      self._RunAllocator()
 
     remote_node = self.op.remote_node
     if remote_node is not None:
@@ -4760,7 +4870,7 @@ class LUQueryInstanceData(NoHooksLU):
   """Query runtime instance data.
 
   """
-  _OP_REQP = ["instances"]
+  _OP_REQP = ["instances", "static"]
 
   def CheckPrereq(self):
     """Check prerequisites.
@@ -4788,8 +4898,13 @@ class LUQueryInstanceData(NoHooksLU):
     """Compute block device status.
""" - self.cfg.SetDiskID(dev, instance.primary_node) - dev_pstatus = rpc.call_blockdev_find(instance.primary_node, dev) + static = self.op.static + if not static: + self.cfg.SetDiskID(dev, instance.primary_node) + dev_pstatus = rpc.call_blockdev_find(instance.primary_node, dev) + else: + dev_pstatus = None + if dev.dev_type in constants.LDS_DRBD: # we change the snode then (otherwise we use the one passed in) if dev.logical_id[0] == instance.primary_node: @@ -4797,7 +4912,7 @@ class LUQueryInstanceData(NoHooksLU): else: snode = dev.logical_id[0] - if snode: + if snode and not static: self.cfg.SetDiskID(dev, snode) dev_sstatus = rpc.call_blockdev_find(snode, dev) else: @@ -4825,12 +4940,15 @@ class LUQueryInstanceData(NoHooksLU): """Gather and return data""" result = {} for instance in self.wanted_instances: - remote_info = rpc.call_instance_info(instance.primary_node, - instance.name) - if remote_info and "state" in remote_info: - remote_state = "up" + if not self.op.static: + remote_info = rpc.call_instance_info(instance.primary_node, + instance.name) + if remote_info and "state" in remote_info: + remote_state = "up" + else: + remote_state = "down" else: - remote_state = "down" + remote_state = None if instance.status == "down": config_state = "down" else: @@ -4850,6 +4968,7 @@ class LUQueryInstanceData(NoHooksLU): "nics": [(nic.mac, nic.ip, nic.bridge) for nic in instance.nics], "disks": disks, "vcpus": instance.vcpus, + "auto_balance": instance.auto_balance, } htkind = self.sstore.GetHypervisorType() @@ -4862,9 +4981,28 @@ class LUQueryInstanceData(NoHooksLU): idict["hvm_acpi"] = instance.hvm_acpi idict["hvm_pae"] = instance.hvm_pae idict["hvm_cdrom_image_path"] = instance.hvm_cdrom_image_path + idict["hvm_nic_type"] = instance.hvm_nic_type + idict["hvm_disk_type"] = instance.hvm_disk_type if htkind in constants.HTS_REQ_PORT: - idict["vnc_bind_address"] = instance.vnc_bind_address + if instance.vnc_bind_address is None: + vnc_bind_address = constants.VNC_DEFAULT_BIND_ADDRESS + else: + vnc_bind_address = instance.vnc_bind_address + if instance.network_port is None: + vnc_console_port = None + elif vnc_bind_address == constants.BIND_ADDRESS_GLOBAL: + vnc_console_port = "%s:%s" % (instance.primary_node, + instance.network_port) + elif vnc_bind_address == constants.LOCALHOST_IP_ADDRESS: + vnc_console_port = "%s:%s on node %s" % (vnc_bind_address, + instance.network_port, + instance.primary_node) + else: + vnc_console_port = "%s:%s" % (instance.vnc_bind_address, + instance.network_port) + idict["vnc_console_port"] = vnc_console_port + idict["vnc_bind_address"] = vnc_bind_address idict["network_port"] = instance.network_port result[instance.name] = idict @@ -4926,12 +5064,19 @@ class LUSetInstanceParms(LogicalUnit): self.hvm_boot_order = getattr(self.op, "hvm_boot_order", None) self.hvm_acpi = getattr(self.op, "hvm_acpi", None) self.hvm_pae = getattr(self.op, "hvm_pae", None) + self.hvm_nic_type = getattr(self.op, "hvm_nic_type", None) + self.hvm_disk_type = getattr(self.op, "hvm_disk_type", None) self.hvm_cdrom_image_path = getattr(self.op, "hvm_cdrom_image_path", None) self.vnc_bind_address = getattr(self.op, "vnc_bind_address", None) - all_parms = [self.mem, self.vcpus, self.ip, self.bridge, self.mac, - self.kernel_path, self.initrd_path, self.hvm_boot_order, - self.hvm_acpi, self.hvm_pae, self.hvm_cdrom_image_path, - self.vnc_bind_address] + self.force = getattr(self.op, "force", None) + self.auto_balance = getattr(self.op, "auto_balance", None) + all_parms = [ + self.mem, self.vcpus, 
self.ip, self.bridge, self.mac, + self.kernel_path, self.initrd_path, self.hvm_boot_order, + self.hvm_acpi, self.hvm_pae, self.hvm_cdrom_image_path, + self.vnc_bind_address, self.hvm_nic_type, self.hvm_disk_type, + self.auto_balance, + ] if all_parms.count(None) == len(all_parms): raise errors.OpPrereqError("No changes submitted") if self.mem is not None: @@ -5012,6 +5157,22 @@ class LUSetInstanceParms(LogicalUnit): " like a valid IP address" % self.op.vnc_bind_address) + # Xen HVM device type checks + if self.sstore.GetHypervisorType() == constants.HT_XEN_HVM31: + if self.op.hvm_nic_type is not None: + if self.op.hvm_nic_type not in constants.HT_HVM_VALID_NIC_TYPES: + raise errors.OpPrereqError("Invalid NIC type %s specified for Xen" + " HVM hypervisor" % self.op.hvm_nic_type) + if self.op.hvm_disk_type is not None: + if self.op.hvm_disk_type not in constants.HT_HVM_VALID_DISK_TYPES: + raise errors.OpPrereqError("Invalid disk type %s specified for Xen" + " HVM hypervisor" % self.op.hvm_disk_type) + + # auto balance setting + if self.auto_balance is not None: + # convert the value to a proper bool value, if it's not + self.auto_balance = bool(self.auto_balance) + instance = self.cfg.GetInstanceInfo( self.cfg.ExpandInstanceName(self.op.instance_name)) if instance is None: @@ -5019,6 +5180,39 @@ class LUSetInstanceParms(LogicalUnit): self.op.instance_name) self.op.instance_name = instance.name self.instance = instance + self.warn = [] + if self.mem is not None and not self.force: + pnode = self.instance.primary_node + nodelist = [pnode] + if instance.auto_balance: + nodelist.extend(instance.secondary_nodes) + instance_info = rpc.call_instance_info(pnode, instance.name) + nodeinfo = rpc.call_node_info(nodelist, self.cfg.GetVGName()) + + if pnode not in nodeinfo or not isinstance(nodeinfo[pnode], dict): + # Assume the primary node is unreachable and go ahead + self.warn.append("Can't get info from primary node %s" % pnode) + else: + if instance_info: + current_mem = instance_info['memory'] + else: + # Assume instance not running + # (there is a slight race condition here, but it's not very probable, + # and we have no other way to check) + current_mem = 0 + miss_mem = self.mem - current_mem - nodeinfo[pnode]['memory_free'] + if miss_mem > 0: + raise errors.OpPrereqError("This change will prevent the instance" + " from starting, due to %d MB of memory" + " missing on its primary node" % miss_mem) + + if instance.auto_balance: + for node in instance.secondary_nodes: + if node not in nodeinfo or not isinstance(nodeinfo[node], dict): + self.warn.append("Can't get info from secondary node %s" % node) + elif self.mem > nodeinfo[node]['memory_free']: + self.warn.append("Not enough memory to failover instance to" + " secondary node %s" % node) return def Exec(self, feedback_fn): @@ -5026,6 +5220,11 @@ class LUSetInstanceParms(LogicalUnit): All parameters take effect only at the next restart of the instance. """ + # Process here the warnings from CheckPrereq, as we don't have a + # feedback_fn there. 
+    for warn in self.warn:
+      feedback_fn("WARNING: %s" % warn)
+
     result = []
     instance = self.instance
     if self.mem:
@@ -5061,6 +5260,12 @@ class LUSetInstanceParms(LogicalUnit):
     if self.hvm_pae is not None:
       instance.hvm_pae = self.hvm_pae
       result.append(("hvm_pae", self.hvm_pae))
+    if self.hvm_nic_type is not None:
+      instance.hvm_nic_type = self.hvm_nic_type
+      result.append(("hvm_nic_type", self.hvm_nic_type))
+    if self.hvm_disk_type is not None:
+      instance.hvm_disk_type = self.hvm_disk_type
+      result.append(("hvm_disk_type", self.hvm_disk_type))
     if self.hvm_cdrom_image_path:
       if self.hvm_cdrom_image_path == constants.VALUE_NONE:
         instance.hvm_cdrom_image_path = None
@@ -5070,6 +5275,9 @@ class LUSetInstanceParms(LogicalUnit):
     if self.vnc_bind_address:
       instance.vnc_bind_address = self.vnc_bind_address
       result.append(("vnc_bind_address", self.vnc_bind_address))
+    if self.auto_balance is not None:
+      instance.auto_balance = self.auto_balance
+      result.append(("auto_balance", self.auto_balance))
 
     self.cfg.AddInstance(instance)
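
Illustration (not part of the patch above): the LUVerifyCluster hunk at the top changes the N+1 memory check so that only instances flagged auto_balance are counted against the memory a node must keep free to absorb failovers. Below is a minimal, standalone Python sketch of that rule; the class and function names are made up for the example and are not the Ganeti API.

# Sketch of the auto_balance-aware N+1 memory check (illustrative only).
class FakeInstance(object):
  def __init__(self, memory, auto_balance):
    self.memory = memory              # MB of RAM configured for the instance
    self.auto_balance = auto_balance  # counted for N+1 checks only if True

def n_plus_one_problems(node_info, instance_cfg):
  """Return (node, prinode) pairs whose failover would not fit in memory."""
  problems = []
  for node, nodeinfo in node_info.items():
    for prinode, instances in nodeinfo['sinst-by-pnode'].items():
      needed_mem = 0
      for name in instances:
        if instance_cfg[name].auto_balance:
          needed_mem += instance_cfg[name].memory
      if nodeinfo['mfree'] < needed_mem:
        problems.append((node, prinode))
  return problems

# Example: node2 holds the secondaries of two instances whose primary is
# node1; only the auto-balanced one (512 MB) must fit in node2's 1024 MB free.
instance_cfg = {"web": FakeInstance(512, True),
                "batch": FakeInstance(2048, False)}
node_info = {"node2": {"mfree": 1024,
                       "sinst-by-pnode": {"node1": ["web", "batch"]}}}
assert n_plus_one_problems(node_info, instance_cfg) == []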