X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/53e4e875d5d4fff1ce0332a36471c84bce3498ab..0f1a06e3e8e61259841848a7186f9c22ade6135d:/lib/cmdlib.py

diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index 6318d45..4cfb38f 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -43,6 +43,7 @@
 from ganeti import objects
 from ganeti import opcodes
 from ganeti import ssconf
 
+
 class LogicalUnit(object):
   """Logical Unit base class.
@@ -73,6 +74,8 @@ class LogicalUnit(object):
     self.op = op
     self.cfg = cfg
     self.sstore = sstore
+    self.__ssh = None
+
     for attr_name in self._OP_REQP:
       attr_val = getattr(op, attr_name, None)
       if attr_val is None:
@@ -88,6 +91,16 @@ class LogicalUnit(object):
         raise errors.OpPrereqError("Commands must be run on the master"
                                    " node %s" % master)
 
+  def __GetSSH(self):
+    """Returns the SshRunner object
+
+    """
+    if not self.__ssh:
+      self.__ssh = ssh.SshRunner(self.sstore)
+    return self.__ssh
+
+  ssh = property(fget=__GetSSH)
+
   def CheckPrereq(self):
     """Check prerequisites for this LU.
 
@@ -303,85 +316,6 @@ def _BuildInstanceHookEnvByObject(instance, override=None):
   return _BuildInstanceHookEnv(**args)
 
 
-def _UpdateKnownHosts(fullnode, ip, pubkey):
-  """Ensure a node has a correct known_hosts entry.
-
-  Args:
-    fullnode - Fully qualified domain name of host. (str)
-    ip - IPv4 address of host (str)
-    pubkey - the public key of the cluster
-
-  """
-  if os.path.exists(constants.SSH_KNOWN_HOSTS_FILE):
-    f = open(constants.SSH_KNOWN_HOSTS_FILE, 'r+')
-  else:
-    f = open(constants.SSH_KNOWN_HOSTS_FILE, 'w+')
-
-  inthere = False
-
-  save_lines = []
-  add_lines = []
-  removed = False
-
-  for rawline in f:
-    logger.Debug('read %s' % (repr(rawline),))
-
-    parts = rawline.rstrip('\r\n').split()
-
-    # Ignore unwanted lines
-    if len(parts) >= 3 and not rawline.lstrip()[0] == '#':
-      fields = parts[0].split(',')
-      key = parts[2]
-
-      haveall = True
-      havesome = False
-      for spec in [ ip, fullnode ]:
-        if spec not in fields:
-          haveall = False
-        if spec in fields:
-          havesome = True
-
-      logger.Debug("key, pubkey = %s." % (repr((key, pubkey)),))
-      if haveall and key == pubkey:
-        inthere = True
-        save_lines.append(rawline)
-        logger.Debug("Keeping known_hosts '%s'." % (repr(rawline),))
-        continue
-
-      if havesome and (not haveall or key != pubkey):
-        removed = True
-        logger.Debug("Discarding known_hosts '%s'." % (repr(rawline),))
-        continue
-
-    save_lines.append(rawline)
-
-  if not inthere:
-    add_lines.append('%s,%s ssh-rsa %s\n' % (fullnode, ip, pubkey))
-    logger.Debug("Adding known_hosts '%s'." % (repr(add_lines[-1]),))
-
-  if removed:
-    save_lines = save_lines + add_lines
-
-    # Write a new file and replace old.
-    fd, tmpname = tempfile.mkstemp('.tmp', 'known_hosts.',
-                                   constants.DATA_DIR)
-    newfile = os.fdopen(fd, 'w')
-    try:
-      newfile.write(''.join(save_lines))
-    finally:
-      newfile.close()
-    logger.Debug("Wrote new known_hosts.")
-    os.rename(tmpname, constants.SSH_KNOWN_HOSTS_FILE)
-
-  elif add_lines:
-    # Simply appending a new line will do the trick.
-    f.seek(0, 2)
-    for add in add_lines:
-      f.write(add)
-
-  f.close()
-
-
 def _HasValidVG(vglist, vgname):
   """Checks if the volume group list is valid.
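Note: the hand-rolled known_hosts editing removed above is superseded by ssh.WriteKnownHostsFile, which LUInitCluster calls further down in this patch. Instead of patching the file entry by entry, the file is regenerated from the authoritative cluster data. A minimal sketch of that idea (regenerate_known_hosts is a hypothetical stand-in for the real helper in lib/ssh.py, which takes the config and ssconf objects):

    import os
    import tempfile

    def regenerate_known_hosts(cluster_name, host_key_rsa, path):
      # One authoritative entry for the cluster alias; writing to a
      # temporary file and renaming keeps the update atomic, just as
      # the removed _UpdateKnownHosts did.
      fd, tmpname = tempfile.mkstemp(dir=os.path.dirname(path))
      try:
        os.write(fd, "%s ssh-rsa %s\n" % (cluster_name, host_key_rsa))
      finally:
        os.close(fd)
      os.rename(tmpname, path)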
@@ -479,8 +413,8 @@ class LUInitCluster(LogicalUnit):
   """
   HPATH = "cluster-init"
   HTYPE = constants.HTYPE_CLUSTER
-  _OP_REQP = ["cluster_name", "hypervisor_type", "vg_name", "mac_prefix",
-              "def_bridge", "master_netdev"]
+  _OP_REQP = ["cluster_name", "hypervisor_type", "mac_prefix",
+              "def_bridge", "master_netdev", "file_storage_dir"]
   REQ_CLUSTER = False
 
   def BuildHooksEnv(self):
@@ -510,35 +444,60 @@ class LUInitCluster(LogicalUnit):
 
     if hostname.ip.startswith("127."):
       raise errors.OpPrereqError("This host's IP resolves to the private"
-                                 " range (%s). Please fix DNS or /etc/hosts." %
-                                 (hostname.ip,))
+                                 " range (%s). Please fix DNS or %s." %
+                                 (hostname.ip, constants.ETC_HOSTS))
 
-    self.clustername = clustername = utils.HostInfo(self.op.cluster_name)
-
-    if not utils.TcpPing(constants.LOCALHOST_IP_ADDRESS, hostname.ip,
-                         constants.DEFAULT_NODED_PORT):
+    if not utils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT,
+                         source=constants.LOCALHOST_IP_ADDRESS):
       raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                                  " to %s,\nbut this ip address does not"
                                  " belong to this host."
                                  " Aborting." % hostname.ip)
 
+    self.clustername = clustername = utils.HostInfo(self.op.cluster_name)
+
+    if utils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT,
+                     timeout=5):
+      raise errors.OpPrereqError("Cluster IP already active. Aborting.")
+
     secondary_ip = getattr(self.op, "secondary_ip", None)
     if secondary_ip and not utils.IsValidIP(secondary_ip):
       raise errors.OpPrereqError("Invalid secondary ip given")
     if (secondary_ip and
         secondary_ip != hostname.ip and
-        (not utils.TcpPing(constants.LOCALHOST_IP_ADDRESS, secondary_ip,
-                           constants.DEFAULT_NODED_PORT))):
+        (not utils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
+                           source=constants.LOCALHOST_IP_ADDRESS))):
       raise errors.OpPrereqError("You gave %s as secondary IP,"
                                  " but it does not belong to this host." %
                                  secondary_ip)
     self.secondary_ip = secondary_ip
 
-    # checks presence of the volume group given
-    vgstatus = _HasValidVG(utils.ListVolumeGroups(), self.op.vg_name)
+    if not hasattr(self.op, "vg_name"):
+      self.op.vg_name = None
+    # if vg_name is not None, check that the volume group is valid
+    if self.op.vg_name:
+      vgstatus = _HasValidVG(utils.ListVolumeGroups(), self.op.vg_name)
+      if vgstatus:
+        raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
+                                   " you are not using lvm" % vgstatus)
+
+    self.op.file_storage_dir = os.path.normpath(self.op.file_storage_dir)
+
+    if not os.path.isabs(self.op.file_storage_dir):
+      raise errors.OpPrereqError("The file storage directory you have"
+                                 " specified is not an absolute path.")
+
+    if not os.path.exists(self.op.file_storage_dir):
+      try:
+        os.makedirs(self.op.file_storage_dir, 0750)
+      except OSError, err:
+        raise errors.OpPrereqError("Cannot create file storage directory"
+                                   " '%s': %s" %
+                                   (self.op.file_storage_dir, err))
 
-    if vgstatus:
-      raise errors.OpPrereqError("Error: %s" % vgstatus)
+    if not os.path.isdir(self.op.file_storage_dir):
+      raise errors.OpPrereqError("The file storage directory '%s' is not"
+                                 " a directory." % self.op.file_storage_dir)
 
     if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$",
                     self.op.mac_prefix):
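Note: the utils.TcpPing calls reworked above (and in later hunks) now take the probe target first, with the bind address passed as the source keyword and an optional timeout. The semantics approximate the following standalone sketch (tcp_ping and its defaults are illustrative, not the real utils.TcpPing):

    import socket

    def tcp_ping(target, port, timeout=10, source=None):
      # "Ping" here means completing a TCP handshake to target:port,
      # optionally binding the probing socket to a source address --
      # which is why an unreachable noded port counts as "no ping".
      sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
      sock.settimeout(timeout)
      try:
        if source is not None:
          sock.bind((source, 0))
        sock.connect((target, port))
        return True
      except socket.error:
        return False
      finally:
        sock.close()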
@@ -574,6 +533,7 @@ class LUInitCluster(LogicalUnit):
     ss.SetKey(ss.SS_MASTER_IP, clustername.ip)
     ss.SetKey(ss.SS_MASTER_NETDEV, self.op.master_netdev)
     ss.SetKey(ss.SS_CLUSTER_NAME, clustername.name)
+    ss.SetKey(ss.SS_FILE_STORAGE_DIR, self.op.file_storage_dir)
 
     # set up the inter-node password and certificate
     _InitGanetiServerSetup(ss)
@@ -590,9 +550,6 @@ class LUInitCluster(LogicalUnit):
       sshkey = sshline.split(" ")[1]
 
     _AddHostToEtcHosts(hostname.name)
-
-    _UpdateKnownHosts(hostname.name, hostname.ip, sshkey)
-
     _InitSSHSetup(hostname.name)
 
     # init of cluster config file
@@ -601,6 +558,8 @@ class LUInitCluster(LogicalUnit):
                     sshkey, self.op.mac_prefix,
                     self.op.vg_name, self.op.def_bridge)
 
+    ssh.WriteKnownHostsFile(cfgw, ss, constants.SSH_KNOWN_HOSTS_FILE)
+
 
 class LUDestroyCluster(NoHooksLU):
   """Logical unit for destroying the cluster.
@@ -632,6 +591,8 @@ class LUDestroyCluster(NoHooksLU):
 
     """
    master = self.sstore.GetMasterNode()
+    if not rpc.call_node_stop_master(master):
+      raise errors.OpExecError("Could not disable the master role")
     priv_key, pub_key, _ = ssh.GetUserFiles(constants.GANETI_RUNAS)
     utils.CreateBackup(priv_key)
     utils.CreateBackup(pub_key)
@@ -947,7 +908,7 @@ class LUVerifyDisks(NoHooksLU):
       inst = nv_dict.pop((node, lv_name), None)
       if (not lv_online and inst is not None
           and inst.name not in res_instances):
-          res_instances.append(inst.name)
+        res_instances.append(inst.name)
 
     # any leftover items in nv_dict are missing LVs, let's arrange the
     # data better
@@ -972,7 +933,7 @@ class LURenameCluster(LogicalUnit):
 
     """
     env = {
-      "OP_TARGET": self.op.sstore.GetClusterName(),
+      "OP_TARGET": self.sstore.GetClusterName(),
       "NEW_NAME": self.op.name,
       }
     mn = self.sstore.GetMasterNode()
@@ -1038,6 +999,79 @@ class LURenameCluster(LogicalUnit):
                       " please restart manually.")
 
 
+def _RecursiveCheckIfLVMBased(disk):
+  """Check if the given disk or its children are lvm-based.
+
+  Args:
+    disk: ganeti.objects.Disk object
+
+  Returns:
+    boolean indicating whether a LD_LV dev_type was found or not
+
+  """
+  if disk.children:
+    for chdisk in disk.children:
+      if _RecursiveCheckIfLVMBased(chdisk):
+        return True
+  return disk.dev_type == constants.LD_LV
+
+
+class LUSetClusterParams(LogicalUnit):
+  """Change the parameters of the cluster.
+
+  """
+  HPATH = "cluster-modify"
+  HTYPE = constants.HTYPE_CLUSTER
+  _OP_REQP = []
+
+  def BuildHooksEnv(self):
+    """Build hooks env.
+
+    """
+    env = {
+      "OP_TARGET": self.sstore.GetClusterName(),
+      "NEW_VG_NAME": self.op.vg_name,
+      }
+    mn = self.sstore.GetMasterNode()
+    return env, [mn], [mn]
+
+  def CheckPrereq(self):
+    """Check prerequisites.
+
+    This checks whether the given params don't conflict and
+    if the given volume group is valid.
+
+    """
+    if not self.op.vg_name:
+      instances = [self.cfg.GetInstanceInfo(name)
+                   for name in self.cfg.GetInstanceList()]
+      for inst in instances:
+        for disk in inst.disks:
+          if _RecursiveCheckIfLVMBased(disk):
+            raise errors.OpPrereqError("Cannot disable lvm storage while"
+                                       " lvm-based instances exist")
+
+    # if vg_name is not None, check the given volume group on all nodes
+    if self.op.vg_name:
+      node_list = self.cfg.GetNodeList()
+      vglist = rpc.call_vg_list(node_list)
+      for node in node_list:
+        vgstatus = _HasValidVG(vglist[node], self.op.vg_name)
+        if vgstatus:
+          raise errors.OpPrereqError("Error on node '%s': %s" %
+                                     (node, vgstatus))
+
+  def Exec(self, feedback_fn):
+    """Change the parameters of the cluster.
+ + """ + if self.op.vg_name != self.cfg.GetVGName(): + self.cfg.SetVGName(self.op.vg_name) + else: + feedback_fn("Cluster LVM configuration already in desired" + " state, not changing") + + def _WaitForSync(cfgw, instance, proc, oneshot=False, unlock=False): """Sleep and poll for an instance's disk to sync. @@ -1120,7 +1154,7 @@ def _CheckDiskConsistency(cfgw, dev, node, on_primary, ldisk=False): if on_primary or dev.AssembleOnSecondary(): rstats = rpc.call_blockdev_find(node, dev) if not rstats: - logger.ToStderr("Can't get any data from node %s" % node) + logger.ToStderr("Node %s: Disk degraded, not found or node down" % node) result = False else: result = result and (not rstats[idx]) @@ -1222,7 +1256,7 @@ class LURemoveNode(LogicalUnit): rpc.call_node_leave_cluster(node.name) - ssh.SSHCall(node.name, 'root', "%s stop" % constants.NODE_INITD_SCRIPT) + self.ssh.Run(node.name, 'root', "%s stop" % constants.NODE_INITD_SCRIPT) logger.Info("Removing node %s from config" % node.name) @@ -1475,16 +1509,13 @@ class LUAddNode(LogicalUnit): " new node doesn't have one") # checks reachablity - if not utils.TcpPing(utils.HostInfo().name, - primary_ip, - constants.DEFAULT_NODED_PORT): + if not utils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT): raise errors.OpPrereqError("Node not reachable by ping") if not newbie_singlehomed: # check reachability from my secondary ip to newbie's secondary ip - if not utils.TcpPing(myself.secondary_ip, - secondary_ip, - constants.DEFAULT_NODED_PORT): + if not utils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT, + source=myself.secondary_ip): raise errors.OpPrereqError("Node secondary ip not reachable by TCP" " based ping to noded port") @@ -1535,7 +1566,7 @@ class LUAddNode(LogicalUnit): constants.SSL_CERT_FILE, gntpem, constants.NODE_INITD_SCRIPT)) - result = ssh.SSHCall(node, 'root', mycommand, batch=False, ask_key=True) + result = self.ssh.Run(node, 'root', mycommand, batch=False, ask_key=True) if result.failed: raise errors.OpExecError("Remote command on node %s, error: %s," " output: %s" % @@ -1580,9 +1611,6 @@ class LUAddNode(LogicalUnit): # Add node to our /etc/hosts, and add key to known_hosts _AddHostToEtcHosts(new_node.name) - _UpdateKnownHosts(new_node.name, new_node.primary_ip, - self.cfg.GetHostKey()) - if new_node.secondary_ip != new_node.primary_ip: if not rpc.call_node_tcp_ping(new_node.name, constants.LOCALHOST_IP_ADDRESS, @@ -1593,7 +1621,7 @@ class LUAddNode(LogicalUnit): " you gave (%s). Please fix and re-run this" " command." % new_node.secondary_ip) - success, msg = ssh.VerifyNodeHostname(node) + success, msg = self.ssh.VerifyNodeHostname(node) if not success: raise errors.OpExecError("Node '%s' claims it has a different hostname" " than the one the resolver gives: %s." 
@@ -1608,7 +1636,7 @@ class LUAddNode(LogicalUnit):
       dist_nodes.remove(myself.name)
 
     logger.Debug("Copying hosts and known_hosts to all nodes")
-    for fname in ("/etc/hosts", constants.SSH_KNOWN_HOSTS_FILE):
+    for fname in (constants.ETC_HOSTS, constants.SSH_KNOWN_HOSTS_FILE):
       result = rpc.call_upload_file(dist_nodes, fname)
       for to_node in dist_nodes:
         if not result[to_node]:
@@ -1619,7 +1647,7 @@ class LUAddNode(LogicalUnit):
     if self.sstore.GetHypervisorType() == constants.HT_XEN_HVM31:
       to_copy.append(constants.VNC_PASSWORD_FILE)
     for fname in to_copy:
-      if not ssh.CopyFileToNode(node, fname):
+      if not self.ssh.CopyFileToNode(node, fname):
         logger.Error("could not copy file %s to node %s" % (fname, node))
 
     logger.Info("adding node %s to cluster.conf" % node)
@@ -1763,7 +1791,7 @@ class LUClusterCopyFile(NoHooksLU):
     for node in self.nodes:
       if node == myname:
         continue
-      if not ssh.CopyFileToNode(node, filename):
+      if not self.ssh.CopyFileToNode(node, filename):
         logger.Error("Copy of file %s to node %s failed" %
                      (filename, node))
 
@@ -1804,9 +1832,15 @@ class LURunClusterCommand(NoHooksLU):
     """Run a command on some nodes.
 
     """
+    # put the master at the end of the nodes list
+    master_node = self.sstore.GetMasterNode()
+    if master_node in self.nodes:
+      self.nodes.remove(master_node)
+      self.nodes.append(master_node)
+
     data = []
     for node in self.nodes:
-      result = ssh.SSHCall(node, "root", self.op.command)
+      result = self.ssh.Run(node, "root", self.op.command)
       data.append((node, result.output, result.exit_code))
 
     return data
@@ -1860,23 +1894,41 @@ def _AssembleInstanceDisks(instance, cfg, ignore_secondaries=False):
   """
   device_info = []
   disks_ok = True
+  iname = instance.name
+  # With the two-pass mechanism we try to reduce the window of
+  # opportunity for the race condition of switching DRBD to primary
+  # before handshaking has occurred, but we do not eliminate it.
+
+  # The proper fix would be to wait (with some limits) until the
+  # connection has been made and drbd transitions from WFConnection
+  # into any other network-connected state (Connected, SyncTarget,
+  # SyncSource, etc.)
+
+  # 1st pass, assemble on all nodes in secondary mode
   for inst_disk in instance.disks:
-    master_result = None
     for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
       cfg.SetDiskID(node_disk, node)
-      is_primary = node == instance.primary_node
-      result = rpc.call_blockdev_assemble(node, node_disk,
-                                          instance.name, is_primary)
+      result = rpc.call_blockdev_assemble(node, node_disk, iname, False)
       if not result:
         logger.Error("could not prepare block device %s on node %s"
-                     " (is_primary=%s)" %
-                     (inst_disk.iv_name, node, is_primary))
-        if is_primary or not ignore_secondaries:
+                     " (is_primary=False, pass=1)" % (inst_disk.iv_name, node))
+        if not ignore_secondaries:
           disks_ok = False
-      if is_primary:
-        master_result = result
-    device_info.append((instance.primary_node, inst_disk.iv_name,
-                        master_result))
+
+  # FIXME: race condition on drbd migration to primary
+
+  # 2nd pass, do only the primary node
+  for inst_disk in instance.disks:
+    for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
+      if node != instance.primary_node:
+        continue
+      cfg.SetDiskID(node_disk, node)
+      result = rpc.call_blockdev_assemble(node, node_disk, iname, True)
+      if not result:
+        logger.Error("could not prepare block device %s on node %s"
+                     " (is_primary=True, pass=2)" % (inst_disk.iv_name, node))
+        disks_ok = False
+    device_info.append((instance.primary_node, inst_disk.iv_name, result))
 
   # leave the disks configured for the primary node
   # this is a workaround that would be fixed better by
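Note: the assembly logic added above reduces to the following shape (a condensed sketch: assemble stands in for rpc.call_blockdev_assemble, and disk_nodes for the inst_disk.ComputeNodeTree iteration):

    def assemble_instance_disks(instance, disk_nodes, assemble):
      # Pass 1: bring every disk up on all nodes in secondary (non-primary)
      # mode, giving the DRBD peers time to start their handshake.
      for disk in instance.disks:
        for node in disk_nodes(disk):
          assemble(node, disk, False)
      # Pass 2: only then promote the disks on the primary node; by now the
      # peers have usually connected, which shrinks -- but, as the comments
      # above say, does not close -- the promote-before-handshake window.
      for disk in instance.disks:
        assemble(instance.primary_node, disk, True)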
@@ -2041,6 +2093,8 @@ class LUStartupInstance(LogicalUnit):
     force = self.op.force
     extra_args = getattr(self.op, "extra_args", "")
 
+    self.cfg.MarkInstanceUp(instance.name)
+
     node_current = instance.primary_node
 
     _StartInstanceDisks(self.cfg, instance, force)
@@ -2049,8 +2103,6 @@ class LUStartupInstance(LogicalUnit):
       _ShutdownInstanceDisks(instance, self.cfg)
       raise errors.OpExecError("Could not start instance")
 
-    self.cfg.MarkInstanceUp(instance.name)
-
 
 class LURebootInstance(LogicalUnit):
   """Reboot an instance.
@@ -2166,10 +2218,10 @@ class LUShutdownInstance(LogicalUnit):
     """
     instance = self.instance
     node_current = instance.primary_node
+    self.cfg.MarkInstanceDown(instance.name)
     if not rpc.call_instance_shutdown(node_current, instance):
       logger.Error("could not shutdown instance")
 
-    self.cfg.MarkInstanceDown(instance.name)
     _ShutdownInstanceDisks(instance, self.cfg)
 
 
@@ -2297,6 +2349,11 @@ class LURenameInstance(LogicalUnit):
     name_info = utils.HostInfo(self.op.new_name)
 
     self.op.new_name = new_name = name_info.name
+    instance_list = self.cfg.GetInstanceList()
+    if new_name in instance_list:
+      raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
+                                 new_name)
+
     if not getattr(self.op, "ignore_ip", False):
       command = ["fping", "-q", name_info.ip]
       result = utils.RunCmd(command)
@@ -2400,7 +2457,7 @@ class LUQueryInstances(NoHooksLU):
     This checks that the fields required are valid output fields.
""" - self.dynamic_fields = frozenset(["oper_state", "oper_ram"]) + self.dynamic_fields = frozenset(["oper_state", "oper_ram", "status"]) _CheckOutputFields(static=["name", "os", "pnode", "snodes", "admin_state", "admin_ram", "disk_template", "ip", "mac", "bridge", @@ -2457,6 +2514,21 @@ class LUQueryInstances(NoHooksLU): val = None else: val = bool(live_data.get(instance.name)) + elif field == "status": + if instance.primary_node in bad_nodes: + val = "ERROR_nodedown" + else: + running = bool(live_data.get(instance.name)) + if running: + if instance.status != "down": + val = "running" + else: + val = "ERROR_up" + else: + if instance.status != "down": + val = "ERROR_down" + else: + val = "ADMIN_down" elif field == "admin_ram": val = instance.memory elif field == "oper_ram": @@ -2562,7 +2634,7 @@ class LUFailoverInstance(LogicalUnit): for dev in instance.disks: # for remote_raid1, these are md over drbd if not _CheckDiskConsistency(self.cfg, dev, target_node, False): - if not self.op.ignore_consistency: + if instance.status == "up" and not self.op.ignore_consistency: raise errors.OpExecError("Disk %s is degraded on target node," " aborting failover." % dev.iv_name) @@ -2587,21 +2659,23 @@ class LUFailoverInstance(LogicalUnit): # distribute new instance config to the other nodes self.cfg.AddInstance(instance) - feedback_fn("* activating the instance's disks on target node") - logger.Info("Starting instance %s on node %s" % - (instance.name, target_node)) + # Only start the instance if it's marked as up + if instance.status == "up": + feedback_fn("* activating the instance's disks on target node") + logger.Info("Starting instance %s on node %s" % + (instance.name, target_node)) - disks_ok, dummy = _AssembleInstanceDisks(instance, self.cfg, - ignore_secondaries=True) - if not disks_ok: - _ShutdownInstanceDisks(instance, self.cfg) - raise errors.OpExecError("Can't activate the instance's disks") + disks_ok, dummy = _AssembleInstanceDisks(instance, self.cfg, + ignore_secondaries=True) + if not disks_ok: + _ShutdownInstanceDisks(instance, self.cfg) + raise errors.OpExecError("Can't activate the instance's disks") - feedback_fn("* starting the instance on the target node") - if not rpc.call_instance_start(target_node, instance, None): - _ShutdownInstanceDisks(instance, self.cfg) - raise errors.OpExecError("Could not start instance %s on node %s." % - (instance.name, target_node)) + feedback_fn("* starting the instance on the target node") + if not rpc.call_instance_start(target_node, instance, None): + _ShutdownInstanceDisks(instance, self.cfg) + raise errors.OpExecError("Could not start instance %s on node %s." % + (instance.name, target_node)) def _CreateBlockDevOnPrimary(cfg, node, instance, device, info): @@ -2699,18 +2773,20 @@ def _GenerateDRBD8Branch(cfg, primary, secondary, size, names, iv_name): iv_name=iv_name) return drbd_dev + def _GenerateDiskTemplate(cfg, template_name, instance_name, primary_node, - secondary_nodes, disk_sz, swap_sz): + secondary_nodes, disk_sz, swap_sz, + file_storage_dir, file_driver): """Generate the entire disk layout for a given template type. 
""" #TODO: compute space requirements vgname = cfg.GetVGName() - if template_name == "diskless": + if template_name == constants.DT_DISKLESS: disks = [] - elif template_name == "plain": + elif template_name == constants.DT_PLAIN: if len(secondary_nodes) != 0: raise errors.ProgrammerError("Wrong template configuration") @@ -2722,43 +2798,6 @@ def _GenerateDiskTemplate(cfg, template_name, logical_id=(vgname, names[1]), iv_name = "sdb") disks = [sda_dev, sdb_dev] - elif template_name == "local_raid1": - if len(secondary_nodes) != 0: - raise errors.ProgrammerError("Wrong template configuration") - - - names = _GenerateUniqueNames(cfg, [".sda_m1", ".sda_m2", - ".sdb_m1", ".sdb_m2"]) - sda_dev_m1 = objects.Disk(dev_type=constants.LD_LV, size=disk_sz, - logical_id=(vgname, names[0])) - sda_dev_m2 = objects.Disk(dev_type=constants.LD_LV, size=disk_sz, - logical_id=(vgname, names[1])) - md_sda_dev = objects.Disk(dev_type=constants.LD_MD_R1, iv_name = "sda", - size=disk_sz, - children = [sda_dev_m1, sda_dev_m2]) - sdb_dev_m1 = objects.Disk(dev_type=constants.LD_LV, size=swap_sz, - logical_id=(vgname, names[2])) - sdb_dev_m2 = objects.Disk(dev_type=constants.LD_LV, size=swap_sz, - logical_id=(vgname, names[3])) - md_sdb_dev = objects.Disk(dev_type=constants.LD_MD_R1, iv_name = "sdb", - size=swap_sz, - children = [sdb_dev_m1, sdb_dev_m2]) - disks = [md_sda_dev, md_sdb_dev] - elif template_name == constants.DT_REMOTE_RAID1: - if len(secondary_nodes) != 1: - raise errors.ProgrammerError("Wrong template configuration") - remote_node = secondary_nodes[0] - names = _GenerateUniqueNames(cfg, [".sda_data", ".sda_meta", - ".sdb_data", ".sdb_meta"]) - drbd_sda_dev = _GenerateMDDRBDBranch(cfg, primary_node, remote_node, - disk_sz, names[0:2]) - md_sda_dev = objects.Disk(dev_type=constants.LD_MD_R1, iv_name="sda", - children = [drbd_sda_dev], size=disk_sz) - drbd_sdb_dev = _GenerateMDDRBDBranch(cfg, primary_node, remote_node, - swap_sz, names[2:4]) - md_sdb_dev = objects.Disk(dev_type=constants.LD_MD_R1, iv_name="sdb", - children = [drbd_sdb_dev], size=swap_sz) - disks = [md_sda_dev, md_sdb_dev] elif template_name == constants.DT_DRBD8: if len(secondary_nodes) != 1: raise errors.ProgrammerError("Wrong template configuration") @@ -2770,6 +2809,17 @@ def _GenerateDiskTemplate(cfg, template_name, drbd_sdb_dev = _GenerateDRBD8Branch(cfg, primary_node, remote_node, swap_sz, names[2:4], "sdb") disks = [drbd_sda_dev, drbd_sdb_dev] + elif template_name == constants.DT_FILE: + if len(secondary_nodes) != 0: + raise errors.ProgrammerError("Wrong template configuration") + + file_sda_dev = objects.Disk(dev_type=constants.LD_FILE, size=disk_sz, + iv_name="sda", logical_id=(file_driver, + "%s/sda" % file_storage_dir)) + file_sdb_dev = objects.Disk(dev_type=constants.LD_FILE, size=swap_sz, + iv_name="sdb", logical_id=(file_driver, + "%s/sdb" % file_storage_dir)) + disks = [file_sda_dev, file_sdb_dev] else: raise errors.ProgrammerError("Invalid disk template '%s'" % template_name) return disks @@ -2796,6 +2846,19 @@ def _CreateDisks(cfg, instance): """ info = _GetInstanceInfoText(instance) + if instance.disk_template == constants.DT_FILE: + file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1]) + result = rpc.call_file_storage_dir_create(instance.primary_node, + file_storage_dir) + + if not result: + logger.Error("Could not connect to node '%s'" % inst.primary_node) + return False + + if not result[0]: + logger.Error("failed to create directory '%s'" % file_storage_dir) + return False + for device in 
     logger.Info("creating volume %s for instance %s" %
                 (device.iv_name, instance.name))
@@ -2841,6 +2904,14 @@ def _RemoveDisks(instance, cfg):
                      " continuing anyway" % (device.iv_name, node))
         result = False
 
+  if instance.disk_template == constants.DT_FILE:
+    file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
+    if not rpc.call_file_storage_dir_remove(instance.primary_node,
+                                            file_storage_dir):
+      logger.Error("could not remove directory '%s'" % file_storage_dir)
+      result = False
+
   return result
 
 
@@ -2878,7 +2949,7 @@ class LUCreateInstance(LogicalUnit):
       os_type=self.op.os_type,
       memory=self.op.mem_size,
       vcpus=self.op.vcpus,
-      nics=[(self.inst_ip, self.op.bridge)],
+      nics=[(self.inst_ip, self.op.bridge, self.op.mac)],
     ))
 
     nl = ([self.sstore.GetMasterNode(), self.op.pnode] +
@@ -2899,6 +2970,11 @@ class LUCreateInstance(LogicalUnit):
       raise errors.OpPrereqError("Invalid instance creation mode '%s'" %
                                  self.op.mode)
 
+    if (not self.cfg.GetVGName() and
+        self.op.disk_template not in constants.DTS_NOT_LVM):
+      raise errors.OpPrereqError("Cluster does not support lvm-based"
+                                 " instances")
+
     if self.op.mode == constants.INSTANCE_IMPORT:
       src_node = getattr(self.op, "src_node", None)
       src_path = getattr(self.op, "src_path", None)
@@ -2951,6 +3027,15 @@ class LUCreateInstance(LogicalUnit):
     if self.op.disk_template not in constants.DISK_TEMPLATES:
       raise errors.OpPrereqError("Invalid disk template name")
 
+    if (self.op.file_driver and
+        not self.op.file_driver in constants.FILE_DRIVER):
+      raise errors.OpPrereqError("Invalid file driver name '%s'" %
+                                 self.op.file_driver)
+
+    if self.op.file_storage_dir and os.path.isabs(self.op.file_storage_dir):
+      raise errors.OpPrereqError("File storage directory not a relative"
+                                 " path")
+
     if self.op.disk_template in constants.DTS_NET_MIRROR:
       if getattr(self.op, "snode", None) is None:
         raise errors.OpPrereqError("The networked disk templates need"
@@ -2969,10 +3054,9 @@ class LUCreateInstance(LogicalUnit):
     req_size_dict = {
       constants.DT_DISKLESS: None,
       constants.DT_PLAIN: self.op.disk_size + self.op.swap_size,
-      constants.DT_LOCAL_RAID1: (self.op.disk_size + self.op.swap_size) * 2,
       # 256 MB are added for drbd metadata, 128MB for each drbd device
-      constants.DT_REMOTE_RAID1: self.op.disk_size + self.op.swap_size + 256,
       constants.DT_DRBD8: self.op.disk_size + self.op.swap_size + 256,
+      constants.DT_FILE: None,
     }
 
     if self.op.disk_template not in req_size_dict:
@@ -3034,8 +3118,7 @@ class LUCreateInstance(LogicalUnit):
                                  " adding an instance in start mode")
 
     if self.op.ip_check:
-      if utils.TcpPing(utils.HostInfo().name, hostname1.ip,
-                       constants.DEFAULT_NODED_PORT):
+      if utils.TcpPing(hostname1.ip, constants.DEFAULT_NODED_PORT):
         raise errors.OpPrereqError("IP %s of instance %s already in use" %
                                    (hostname1.ip, instance_name))
 
@@ -3060,8 +3143,8 @@ class LUCreateInstance(LogicalUnit):
     # boot order verification
     if self.op.hvm_boot_order is not None:
       if len(self.op.hvm_boot_order.strip("acdn")) != 0:
-        raise errors.OpPrereqError("invalid boot order specified,"
-                             " must be one or more of [acdn]")
+        raise errors.OpPrereqError("invalid boot order specified,"
+                                   " must be one or more of [acdn]")
 
     if self.op.start:
       self.instance_status = 'up'
@@ -3090,11 +3173,19 @@ class LUCreateInstance(LogicalUnit):
     else:
       network_port = None
 
+    # build the full file storage dir path
+    file_storage_dir = os.path.normpath(os.path.join(
+                                        self.sstore.GetFileStorageDir(),
+                                        self.op.file_storage_dir, instance))
+
+
     disks = _GenerateDiskTemplate(self.cfg,
                                   self.op.disk_template,
                                   instance, pnode_name,
                                   self.secondaries,
                                   self.op.disk_size,
-                                  self.op.swap_size)
+                                  self.op.swap_size,
+                                  file_storage_dir,
+                                  self.op.file_driver)
 
     iobj = objects.Instance(name=instance, os=self.op.os_type,
                             primary_node=pnode_name,
@@ -3207,214 +3298,9 @@ class LUConnectConsole(NoHooksLU):
     hyper = hypervisor.GetHypervisor()
     console_cmd = hyper.GetShellCommandForConsole(instance)
 
-    # build ssh cmdline
-    argv = ["ssh", "-q", "-t"]
-    argv.extend(ssh.KNOWN_HOSTS_OPTS)
-    argv.extend(ssh.BATCH_MODE_OPTS)
-    argv.append(node)
-    argv.append(console_cmd)
-    return "ssh", argv
-
-
-class LUAddMDDRBDComponent(LogicalUnit):
-  """Adda new mirror member to an instance's disk.
-
-  """
-  HPATH = "mirror-add"
-  HTYPE = constants.HTYPE_INSTANCE
-  _OP_REQP = ["instance_name", "remote_node", "disk_name"]
-
-  def BuildHooksEnv(self):
-    """Build hooks env.
-
-    This runs on the master, the primary and all the secondaries.
-
-    """
-    env = {
-      "NEW_SECONDARY": self.op.remote_node,
-      "DISK_NAME": self.op.disk_name,
-      }
-    env.update(_BuildInstanceHookEnvByObject(self.instance))
-    nl = [self.sstore.GetMasterNode(), self.instance.primary_node,
-          self.op.remote_node,] + list(self.instance.secondary_nodes)
-    return env, nl, nl
-
-  def CheckPrereq(self):
-    """Check prerequisites.
-
-    This checks that the instance is in the cluster.
-
-    """
-    instance = self.cfg.GetInstanceInfo(
-      self.cfg.ExpandInstanceName(self.op.instance_name))
-    if instance is None:
-      raise errors.OpPrereqError("Instance '%s' not known" %
-                                 self.op.instance_name)
-    self.instance = instance
-
-    remote_node = self.cfg.ExpandNodeName(self.op.remote_node)
-    if remote_node is None:
-      raise errors.OpPrereqError("Node '%s' not known" % self.op.remote_node)
-    self.remote_node = remote_node
-
-    if remote_node == instance.primary_node:
-      raise errors.OpPrereqError("The specified node is the primary node of"
-                                 " the instance.")
-
-    if instance.disk_template != constants.DT_REMOTE_RAID1:
-      raise errors.OpPrereqError("Instance's disk layout is not"
-                                 " remote_raid1.")
-    for disk in instance.disks:
-      if disk.iv_name == self.op.disk_name:
-        break
-    else:
-      raise errors.OpPrereqError("Can't find this device ('%s') in the"
-                                 " instance." % self.op.disk_name)
-    if len(disk.children) > 1:
-      raise errors.OpPrereqError("The device already has two slave devices."
- " This would create a 3-disk raid1 which we" - " don't allow.") - self.disk = disk - - def Exec(self, feedback_fn): - """Add the mirror component - - """ - disk = self.disk - instance = self.instance - - remote_node = self.remote_node - lv_names = [".%s_%s" % (disk.iv_name, suf) for suf in ["data", "meta"]] - names = _GenerateUniqueNames(self.cfg, lv_names) - new_drbd = _GenerateMDDRBDBranch(self.cfg, instance.primary_node, - remote_node, disk.size, names) - - logger.Info("adding new mirror component on secondary") - #HARDCODE - if not _CreateBlockDevOnSecondary(self.cfg, remote_node, instance, - new_drbd, False, - _GetInstanceInfoText(instance)): - raise errors.OpExecError("Failed to create new component on secondary" - " node %s" % remote_node) - - logger.Info("adding new mirror component on primary") - #HARDCODE - if not _CreateBlockDevOnPrimary(self.cfg, instance.primary_node, - instance, new_drbd, - _GetInstanceInfoText(instance)): - # remove secondary dev - self.cfg.SetDiskID(new_drbd, remote_node) - rpc.call_blockdev_remove(remote_node, new_drbd) - raise errors.OpExecError("Failed to create volume on primary") - - # the device exists now - # call the primary node to add the mirror to md - logger.Info("adding new mirror component to md") - if not rpc.call_blockdev_addchildren(instance.primary_node, - disk, [new_drbd]): - logger.Error("Can't add mirror compoment to md!") - self.cfg.SetDiskID(new_drbd, remote_node) - if not rpc.call_blockdev_remove(remote_node, new_drbd): - logger.Error("Can't rollback on secondary") - self.cfg.SetDiskID(new_drbd, instance.primary_node) - if not rpc.call_blockdev_remove(instance.primary_node, new_drbd): - logger.Error("Can't rollback on primary") - raise errors.OpExecError("Can't add mirror component to md array") - - disk.children.append(new_drbd) - - self.cfg.AddInstance(instance) - - _WaitForSync(self.cfg, instance, self.proc) - - return 0 - - -class LURemoveMDDRBDComponent(LogicalUnit): - """Remove a component from a remote_raid1 disk. - - """ - HPATH = "mirror-remove" - HTYPE = constants.HTYPE_INSTANCE - _OP_REQP = ["instance_name", "disk_name", "disk_id"] - - def BuildHooksEnv(self): - """Build hooks env. - - This runs on the master, the primary and all the secondaries. - - """ - env = { - "DISK_NAME": self.op.disk_name, - "DISK_ID": self.op.disk_id, - "OLD_SECONDARY": self.old_secondary, - } - env.update(_BuildInstanceHookEnvByObject(self.instance)) - nl = [self.sstore.GetMasterNode(), - self.instance.primary_node] + list(self.instance.secondary_nodes) - return env, nl, nl - - def CheckPrereq(self): - """Check prerequisites. - This checks that the instance is in the cluster. - - """ - instance = self.cfg.GetInstanceInfo( - self.cfg.ExpandInstanceName(self.op.instance_name)) - if instance is None: - raise errors.OpPrereqError("Instance '%s' not known" % - self.op.instance_name) - self.instance = instance - - if instance.disk_template != constants.DT_REMOTE_RAID1: - raise errors.OpPrereqError("Instance's disk layout is not" - " remote_raid1.") - for disk in instance.disks: - if disk.iv_name == self.op.disk_name: - break - else: - raise errors.OpPrereqError("Can't find this device ('%s') in the" - " instance." 
-    for child in disk.children:
-      if (child.dev_type == constants.LD_DRBD7 and
-          child.logical_id[2] == self.op.disk_id):
-        break
-    else:
-      raise errors.OpPrereqError("Can't find the device with this port.")
-
-    if len(disk.children) < 2:
-      raise errors.OpPrereqError("Cannot remove the last component from"
-                                 " a mirror.")
-    self.disk = disk
-    self.child = child
-    if self.child.logical_id[0] == instance.primary_node:
-      oid = 1
-    else:
-      oid = 0
-    self.old_secondary = self.child.logical_id[oid]
-
-  def Exec(self, feedback_fn):
-    """Remove the mirror component
-
-    """
-    instance = self.instance
-    disk = self.disk
-    child = self.child
-    logger.Info("remove mirror component")
-    self.cfg.SetDiskID(disk, instance.primary_node)
-    if not rpc.call_blockdev_removechildren(instance.primary_node,
-                                            disk, [child]):
-      raise errors.OpExecError("Can't remove child from mirror.")
-
-    for node in child.logical_id[:2]:
-      self.cfg.SetDiskID(child, node)
-      if not rpc.call_blockdev_remove(node, child):
-        logger.Error("Warning: failed to remove device from node %s,"
-                     " continuing operation." % node)
-
-    disk.children.remove(child)
-
-    self.cfg.AddInstance(instance)
+    # build ssh cmdline
+    return self.ssh.BuildCmd(node, "root", console_cmd, batch=True, tty=True)
 
 
 class LUReplaceDisks(LogicalUnit):
@@ -3723,7 +3609,7 @@ class LUReplaceDisks(LogicalUnit):
     # ok, we created the new LVs, so now we know we have the needed
     # storage; as such, we proceed on the target node to rename
     # old_lv to _old, and new_lv to old_lv; note that we rename LVs
-    # using the assumption than logical_id == physical_id (which in
+    # using the assumption that logical_id == physical_id (which in
     # turn is the unique_id on that node)
 
     # FIXME(iustin): use a better name for the replaced LVs
@@ -3995,7 +3881,7 @@ class LUQueryInstanceData(NoHooksLU):
         instance = self.cfg.GetInstanceInfo(self.cfg.ExpandInstanceName(name))
         if instance is None:
           raise errors.OpPrereqError("No such instance name '%s'" % name)
-      self.wanted_instances.append(instance)
+        self.wanted_instances.append(instance)
     else:
       self.wanted_instances = [self.cfg.GetInstanceInfo(name)
                                for name in self.cfg.GetInstanceList()]
@@ -4079,7 +3965,7 @@ class LUQueryInstanceData(NoHooksLU):
 
     return result
 
 
-class LUSetInstanceParms(LogicalUnit):
+class LUSetInstanceParams(LogicalUnit):
   """Modifies an instances's parameters.
""" @@ -4098,7 +3984,7 @@ class LUSetInstanceParms(LogicalUnit): args['memory'] = self.mem if self.vcpus: args['vcpus'] = self.vcpus - if self.do_ip or self.do_bridge: + if self.do_ip or self.do_bridge or self.mac: if self.do_ip: ip = self.ip else: @@ -4107,7 +3993,11 @@ class LUSetInstanceParms(LogicalUnit): bridge = self.bridge else: bridge = self.instance.nics[0].bridge - args['nics'] = [(ip, bridge)] + if self.mac: + mac = self.mac + else: + mac = self.instance.nics[0].mac + args['nics'] = [(ip, bridge, mac)] env = _BuildInstanceHookEnvByObject(self.instance, override=args) nl = [self.sstore.GetMasterNode(), self.instance.primary_node] + list(self.instance.secondary_nodes) @@ -4127,9 +4017,9 @@ class LUSetInstanceParms(LogicalUnit): self.kernel_path = getattr(self.op, "kernel_path", None) self.initrd_path = getattr(self.op, "initrd_path", None) self.hvm_boot_order = getattr(self.op, "hvm_boot_order", None) - all_parms = [self.mem, self.vcpus, self.ip, self.bridge, self.mac, - self.kernel_path, self.initrd_path, self.hvm_boot_order] - if all_parms.count(None) == len(all_parms): + all_params = [self.mem, self.vcpus, self.ip, self.bridge, self.mac, + self.kernel_path, self.initrd_path, self.hvm_boot_order] + if all_params.count(None) == len(all_params): raise errors.OpPrereqError("No changes submitted") if self.mem is not None: try: @@ -4312,10 +4202,11 @@ class LUExportInstance(LogicalUnit): instance = self.instance dst_node = self.dst_node src_node = instance.primary_node - # shutdown the instance, unless requested not to do so if self.op.shutdown: - op = opcodes.OpShutdownInstance(instance_name=instance.name) - self.proc.ChainOpCode(op) + # shutdown the instance, but not the disks + if not rpc.call_instance_shutdown(src_node, instance): + raise errors.OpExecError("Could not shutdown instance %s on node %s" % + (instance.name, source_node)) vgname = self.cfg.GetVGName() @@ -4338,10 +4229,10 @@ class LUExportInstance(LogicalUnit): snap_disks.append(new_dev) finally: - if self.op.shutdown: - op = opcodes.OpStartupInstance(instance_name=instance.name, - force=False) - self.proc.ChainOpCode(op) + if self.op.shutdown and instance.status == "up": + if not rpc.call_instance_start(src_node, instance, None): + _ShutdownInstanceDisks(instance, self.cfg) + raise errors.OpExecError("Could not start instance") # TODO: check for size @@ -4524,3 +4415,39 @@ class LUDelTags(TagsLU): raise errors.OpRetryError("There has been a modification to the" " config file and the operation has been" " aborted. Please retry.") + +class LUTestDelay(NoHooksLU): + """Sleep for a specified amount of time. + + This LU sleeps on the master and/or nodes for a specified amoutn of + time. + + """ + _OP_REQP = ["duration", "on_master", "on_nodes"] + + def CheckPrereq(self): + """Check prerequisites. + + This checks that we have a good list of nodes and/or the duration + is valid. + + """ + + if self.op.on_nodes: + self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes) + + def Exec(self, feedback_fn): + """Do the actual sleep. + + """ + if self.op.on_master: + if not utils.TestDelay(self.op.duration): + raise errors.OpExecError("Error during master delay test") + if self.op.on_nodes: + result = rpc.call_test_delay(self.op.on_nodes, self.op.duration) + if not result: + raise errors.OpExecError("Complete failure from rpc call") + for node, node_result in result.items(): + if not node_result: + raise errors.OpExecError("Failure during rpc call to node %s," + " result: %s" % (node, node_result))