"""
HPATH = "cluster-init"
HTYPE = constants.HTYPE_CLUSTER
- _OP_REQP = ["cluster_name", "hypervisor_type", "vg_name", "mac_prefix",
+ _OP_REQP = ["cluster_name", "hypervisor_type", "mac_prefix",
"def_bridge", "master_netdev", "file_storage_dir"]
REQ_CLUSTER = False
secondary_ip)
self.secondary_ip = secondary_ip
- # checks presence of the volume group given
- vgstatus = _HasValidVG(utils.ListVolumeGroups(), self.op.vg_name)
-
- if vgstatus:
- raise errors.OpPrereqError("Error: %s" % vgstatus)
+ if not hasattr(self.op, "vg_name"):
+ self.op.vg_name = None
+ # if vg_name is not None, check that the volume group is valid
+ if self.op.vg_name:
+ vgstatus = _HasValidVG(utils.ListVolumeGroups(), self.op.vg_name)
+ if vgstatus:
+ raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
+ " you are not using lvm" % vgstatus)
self.op.file_storage_dir = os.path.normpath(self.op.file_storage_dir)
"""Verifies the cluster status.
"""
- _OP_REQP = []
+ _OP_REQP = ["skip_checks"]
def _VerifyNode(self, node, file_list, local_cksum, vglist, node_result,
remote_version, feedback_fn):
# compares ganeti version
local_version = constants.PROTOCOL_VERSION
if not remote_version:
- feedback_fn(" - ERROR: connection to %s failed" % (node))
+ feedback_fn(" - ERROR: connection to %s failed" % (node))
return True
if local_version != remote_version:
feedback_fn(" - ERROR: hypervisor verify failure: '%s'" % hyp_result)
return bad
- def _VerifyInstance(self, instance, node_vol_is, node_instance, feedback_fn):
+ def _VerifyInstance(self, instance, instanceconfig, node_vol_is,
+ node_instance, feedback_fn):
"""Verify an instance.
This function checks to see if the required block devices are
"""
bad = False
- instancelist = self.cfg.GetInstanceList()
- if not instance in instancelist:
- feedback_fn(" - ERROR: instance %s not in instance list %s" %
- (instance, instancelist))
- bad = True
-
- instanceconfig = self.cfg.GetInstanceInfo(instance)
node_current = instanceconfig.primary_node
node_vol_should = {}
bad = True
if not instanceconfig.status == 'down':
- if not instance in node_instance[node_current]:
+ if (node_current not in node_instance or
+ not instance in node_instance[node_current]):
feedback_fn(" - ERROR: instance %s not running on node %s" %
(instance, node_current))
bad = True
bad = True
return bad
+ def _VerifyNPlusOneMemory(self, node_info, instance_cfg, feedback_fn):
+ """Verify N+1 Memory Resilience.
+
+ Check that if one single node dies we can still start all the instances it
+ was primary for.
+
+ """
+ bad = False
+
+ for node, nodeinfo in node_info.iteritems():
+ # This code checks that every node which is now listed as secondary has
+ # enough memory to host all the instances it would have to take over,
+ # should a single other node in the cluster fail.
+ # FIXME: not ready for failover to an arbitrary node
+ # FIXME: does not support file-backed instances
+ # WARNING: we currently take into account down instances as well as up
+ # ones, considering that even if they're down someone might want to start
+ # them in the event of a node failure.
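+ # For illustration (hypothetical names): a nodeinfo['sinst-by-pnode'] of
+ # {'node1.example.com': ['inst1', 'inst2']} means this node is secondary
+ # for inst1 and inst2, both of which have node1.example.com as primary.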
+ for prinode, instances in nodeinfo['sinst-by-pnode'].iteritems():
+ needed_mem = 0
+ for instance in instances:
+ needed_mem += instance_cfg[instance].memory
+ if nodeinfo['mfree'] < needed_mem:
+ feedback_fn(" - ERROR: not enough memory on node %s to accomodate"
+ " failovers should node %s fail" % (node, prinode))
+ bad = True
+ return bad
+
def CheckPrereq(self):
"""Check prerequisites.
- This has no prerequisites.
+ Transform the list of checks we're going to skip into a set and check that
+ all its members are valid.
"""
- pass
+ self.skip_set = frozenset(self.op.skip_checks)
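+ # skip_set is consulted later in Exec, e.g. to decide whether to run the
+ # N+1 memory check (constants.VERIFY_NPLUSONE_MEM)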
+ if not constants.VERIFY_OPTIONAL_CHECKS.issuperset(self.skip_set):
+ raise errors.OpPrereqError("Invalid checks to be skipped specified")
def Exec(self, feedback_fn):
"""Verify integrity of cluster, performing various test on nodes.
vg_name = self.cfg.GetVGName()
nodelist = utils.NiceSort(self.cfg.GetNodeList())
instancelist = utils.NiceSort(self.cfg.GetInstanceList())
+ i_non_redundant = [] # Non redundant instances
node_volume = {}
node_instance = {}
+ node_info = {}
+ instance_cfg = {}
# FIXME: verify OS list
# do local checksums
}
all_nvinfo = rpc.call_node_verify(nodelist, node_verify_param)
all_rversion = rpc.call_version(nodelist)
+ all_ninfo = rpc.call_node_info(nodelist, self.cfg.GetVGName())
for node in nodelist:
feedback_fn("* Verifying node %s" % node)
node_instance[node] = nodeinstance
+ # node_info
+ nodeinfo = all_ninfo[node]
+ if not isinstance(nodeinfo, dict):
+ feedback_fn(" - ERROR: connection to %s failed" % (node,))
+ bad = True
+ continue
+
+ try:
+ node_info[node] = {
+ "mfree": int(nodeinfo['memory_free']),
+ "dfree": int(nodeinfo['vg_free']),
+ "pinst": [],
+ "sinst": [],
+ # dictionary holding all instances this node is secondary for,
+ # grouped by their primary node. Each key is a cluster node, and each
+ # value is a list of instances which have the key as primary and the
+ # current node as secondary. This is handy to calculate N+1 memory
+ # availability if you can only failover from a primary to its
+ # secondary.
+ "sinst-by-pnode": {},
+ }
+ except ValueError:
+ feedback_fn(" - ERROR: invalid value returned from node %s" % (node,))
+ bad = True
+ continue
+
node_vol_should = {}
for instance in instancelist:
feedback_fn("* Verifying instance %s" % instance)
- result = self._VerifyInstance(instance, node_volume, node_instance,
- feedback_fn)
- bad = bad or result
-
inst_config = self.cfg.GetInstanceInfo(instance)
+ result = self._VerifyInstance(instance, inst_config, node_volume,
+ node_instance, feedback_fn)
+ bad = bad or result
inst_config.MapLVsByNode(node_vol_should)
+ instance_cfg[instance] = inst_config
+
+ pnode = inst_config.primary_node
+ if pnode in node_info:
+ node_info[pnode]['pinst'].append(instance)
+ else:
+ feedback_fn(" - ERROR: instance %s, connection to primary node"
+ " %s failed" % (instance, pnode))
+ bad = True
+
+ # If the instance is non-redundant we cannot survive losing its primary
+ # node, so we are not N+1 compliant. On the other hand we have no disk
+ # templates with more than one secondary so that situation is not well
+ # supported either.
+ # FIXME: does not support file-backed instances
+ if len(inst_config.secondary_nodes) == 0:
+ i_non_redundant.append(instance)
+ elif len(inst_config.secondary_nodes) > 1:
+ feedback_fn(" - WARNING: multiple secondaries for instance %s"
+ % instance)
+
+ for snode in inst_config.secondary_nodes:
+ if snode in node_info:
+ node_info[snode]['sinst'].append(instance)
+ if pnode not in node_info[snode]['sinst-by-pnode']:
+ node_info[snode]['sinst-by-pnode'][pnode] = []
+ node_info[snode]['sinst-by-pnode'][pnode].append(instance)
+ else:
+ feedback_fn(" - ERROR: instance %s, connection to secondary node"
+ " %s failed" % (instance, snode))
+
feedback_fn("* Verifying orphan volumes")
result = self._VerifyOrphanVolumes(node_vol_should, node_volume,
feedback_fn)
feedback_fn)
bad = bad or result
+ if constants.VERIFY_NPLUSONE_MEM not in self.skip_set:
+ feedback_fn("* Verifying N+1 Memory redundancy")
+ result = self._VerifyNPlusOneMemory(node_info, instance_cfg, feedback_fn)
+ bad = bad or result
+
+ feedback_fn("* Other Notes")
+ if i_non_redundant:
+ feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
+ % len(i_non_redundant))
+
return int(bad)
" please restart manually.")
+def _RecursiveCheckIfLVMBased(disk):
+ """Check if the given disk or its children are lvm-based.
+
+ Args:
+ disk: ganeti.objects.Disk object
+
+ Returns:
+ boolean indicating whether a LD_LV dev_type was found or not
+
+
+ """
+ if disk.children:
+ for chdisk in disk.children:
+ if _RecursiveCheckIfLVMBased(chdisk):
+ return True
+ return disk.dev_type == constants.LD_LV
+
+
+class LUSetClusterParams(LogicalUnit):
+ """Change the parameters of the cluster.
+
+ """
+ HPATH = "cluster-modify"
+ HTYPE = constants.HTYPE_CLUSTER
+ _OP_REQP = []
+
+ def BuildHooksEnv(self):
+ """Build hooks env.
+
+ """
+ env = {
+ "OP_TARGET": self.sstore.GetClusterName(),
+ "NEW_VG_NAME": self.op.vg_name,
+ }
+ mn = self.sstore.GetMasterNode()
+ return env, [mn], [mn]
+
+ def CheckPrereq(self):
+ """Check prerequisites.
+
+ This checks whether the given params don't conflict and
+ if the given volume group is valid.
+
+ """
+ if not self.op.vg_name:
+ instances = [self.cfg.GetInstanceInfo(name)
+ for name in self.cfg.GetInstanceList()]
+ for inst in instances:
+ for disk in inst.disks:
+ if _RecursiveCheckIfLVMBased(disk):
+ raise errors.OpPrereqError("Cannot disable lvm storage while"
+ " lvm-based instances exist")
+
+ # if vg_name is not None, check the given volume group on all nodes
+ if self.op.vg_name:
+ node_list = self.cfg.GetNodeList()
+ vglist = rpc.call_vg_list(node_list)
+ for node in node_list:
+ vgstatus = _HasValidVG(vglist[node], self.op.vg_name)
+ if vgstatus:
+ raise errors.OpPrereqError("Error on node '%s': %s" %
+ (node, vgstatus))
+
+ def Exec(self, feedback_fn):
+ """Change the parameters of the cluster.
+
+ """
+ if self.op.vg_name != self.cfg.GetVGName():
+ self.cfg.SetVGName(self.op.vg_name)
+ else:
+ feedback_fn("Cluster LVM configuration already in desired"
+ " state, not changing")
+
+
def _WaitForSync(cfgw, instance, proc, oneshot=False, unlock=False):
"""Sleep and poll for an instance's disk to sync.
if on_primary or dev.AssembleOnSecondary():
rstats = rpc.call_blockdev_find(node, dev)
if not rstats:
- logger.ToStderr("Can't get any data from node %s" % node)
+ logger.ToStderr("Node %s: Disk degraded, not found or node down" % node)
result = False
else:
result = result and (not rstats[idx])
"""Logical unit for OS diagnose/query.
"""
- _OP_REQP = []
+ _OP_REQP = ["output_fields", "names"]
def CheckPrereq(self):
"""Check prerequisites.
This always succeeds, since this is a pure query LU.
"""
- return
+ if self.op.names:
+ raise errors.OpPrereqError("Selective OS query not supported")
+
+ self.dynamic_fields = frozenset(["name", "valid", "node_status"])
+ _CheckOutputFields(static=[],
+ dynamic=self.dynamic_fields,
+ selected=self.op.output_fields)
+
+ @staticmethod
+ def _DiagnoseByOS(node_list, rlist):
+ """Remaps a per-node return list into an a per-os per-node dictionary
+
+ Args:
+ node_list: a list with the names of all nodes
+ rlist: a map with node names as keys and lists of OS objects as values
+
+ Returns:
+ map: a map with osnames as keys and as value another map, with
+ node names as keys and lists of OS objects as values
+ e.g. {"debian-etch": {"node1": [<object>,...],
+ "node2": [<object>,]}
+ }
+
+ """
+ all_os = {}
+ for node_name, nr in rlist.iteritems():
+ if not nr:
+ continue
+ for os in nr:
+ if os.name not in all_os:
+ # build a list of nodes for this os containing empty lists
+ # for each node in node_list
+ all_os[os.name] = {}
+ for nname in node_list:
+ all_os[os.name][nname] = []
+ all_os[os.name][node_name].append(os)
+ return all_os
def Exec(self, feedback_fn):
"""Compute the list of OSes.
node_data = rpc.call_os_diagnose(node_list)
if node_data == False:
raise errors.OpExecError("Can't gather the list of OSes")
- return node_data
+ pol = self._DiagnoseByOS(node_list, node_data)
+ output = []
+ for os_name, os_data in pol.iteritems():
+ row = []
+ for field in self.op.output_fields:
+ if field == "name":
+ val = os_name
+ elif field == "valid":
+ val = utils.all([osl and osl[0] for osl in os_data.values()])
+ elif field == "node_status":
+ val = {}
+ for node_name, nos_list in os_data.iteritems():
+ val[node_name] = [(v.status, v.path) for v in nos_list]
+ else:
+ raise errors.ParameterError(field)
+ row.append(val)
+ output.append(row)
+
+ return output
class LURemoveNode(LogicalUnit):
"""Run a command on some nodes.
"""
+ # put the master at the end of the nodes list
+ master_node = self.sstore.GetMasterNode()
+ if master_node in self.nodes:
+ self.nodes.remove(master_node)
+ self.nodes.append(master_node)
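+ # (so that a command which, for example, powers off or reboots nodes only
+ # reaches the master once all other nodes have been handled)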
+
data = []
for node in self.nodes:
result = self.ssh.Run(node, "root", self.op.command)
inst = self.instance
old_name = inst.name
+ if inst.disk_template == constants.DT_FILE:
+ old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
+
self.cfg.RenameInstance(inst.name, self.op.new_name)
# re-read the instance from the configuration after rename
inst = self.cfg.GetInstanceInfo(self.op.new_name)
+ if inst.disk_template == constants.DT_FILE:
+ new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
+ result = rpc.call_file_storage_dir_rename(inst.primary_node,
+ old_file_storage_dir,
+ new_file_storage_dir)
+
+ if not result:
+ raise errors.OpExecError("Could not connect to node '%s' to rename"
+ " directory '%s' to '%s' (but the instance"
+ " has been renamed in Ganeti)" % (
+ inst.primary_node, old_file_storage_dir,
+ new_file_storage_dir))
+
+ if not result[0]:
+ raise errors.OpExecError("Could not rename directory '%s' to '%s'"
+ " (but the instance has been renamed in"
+ " Ganeti)" % (old_file_storage_dir,
+ new_file_storage_dir))
+
_StartInstanceDisks(self.cfg, inst, None)
try:
if not rpc.call_instance_run_rename(inst.primary_node, inst, old_name,
for dev in instance.disks:
# for remote_raid1, these are md over drbd
if not _CheckDiskConsistency(self.cfg, dev, target_node, False):
- if not self.op.ignore_consistency:
+ if instance.status == "up" and not self.op.ignore_consistency:
raise errors.OpExecError("Disk %s is degraded on target node,"
" aborting failover." % dev.iv_name)
# distribute new instance config to the other nodes
self.cfg.AddInstance(instance)
- feedback_fn("* activating the instance's disks on target node")
- logger.Info("Starting instance %s on node %s" %
- (instance.name, target_node))
+ # Only start the instance if it's marked as up
+ if instance.status == "up":
+ feedback_fn("* activating the instance's disks on target node")
+ logger.Info("Starting instance %s on node %s" %
+ (instance.name, target_node))
- disks_ok, dummy = _AssembleInstanceDisks(instance, self.cfg,
- ignore_secondaries=True)
- if not disks_ok:
- _ShutdownInstanceDisks(instance, self.cfg)
- raise errors.OpExecError("Can't activate the instance's disks")
+ disks_ok, dummy = _AssembleInstanceDisks(instance, self.cfg,
+ ignore_secondaries=True)
+ if not disks_ok:
+ _ShutdownInstanceDisks(instance, self.cfg)
+ raise errors.OpExecError("Can't activate the instance's disks")
- feedback_fn("* starting the instance on the target node")
- if not rpc.call_instance_start(target_node, instance, None):
- _ShutdownInstanceDisks(instance, self.cfg)
- raise errors.OpExecError("Could not start instance %s on node %s." %
- (instance.name, target_node))
+ feedback_fn("* starting the instance on the target node")
+ if not rpc.call_instance_start(target_node, instance, None):
+ _ShutdownInstanceDisks(instance, self.cfg)
+ raise errors.OpExecError("Could not start instance %s on node %s." %
+ (instance.name, target_node))
def _CreateBlockDevOnPrimary(cfg, node, instance, device, info):
def _GenerateDiskTemplate(cfg, template_name,
instance_name, primary_node,
- secondary_nodes, disk_sz, swap_sz):
+ secondary_nodes, disk_sz, swap_sz,
+ file_storage_dir, file_driver):
"""Generate the entire disk layout for a given template type.
"""
drbd_sdb_dev = _GenerateDRBD8Branch(cfg, primary_node, remote_node,
swap_sz, names[2:4], "sdb")
disks = [drbd_sda_dev, drbd_sdb_dev]
+ elif template_name == constants.DT_FILE:
+ if len(secondary_nodes) != 0:
+ raise errors.ProgrammerError("Wrong template configuration")
+
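+ # for file-based disks logical_id is a (file_driver, path) tuple; the
+ # driver has been validated against constants.FILE_DRIVER by the caller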
+ file_sda_dev = objects.Disk(dev_type=constants.LD_FILE, size=disk_sz,
+ iv_name="sda", logical_id=(file_driver,
+ "%s/sda" % file_storage_dir))
+ file_sdb_dev = objects.Disk(dev_type=constants.LD_FILE, size=swap_sz,
+ iv_name="sdb", logical_id=(file_driver,
+ "%s/sdb" % file_storage_dir))
+ disks = [file_sda_dev, file_sdb_dev]
else:
raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
return disks
"""
info = _GetInstanceInfoText(instance)
+ if instance.disk_template == constants.DT_FILE:
+ file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
+ result = rpc.call_file_storage_dir_create(instance.primary_node,
+ file_storage_dir)
+
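+ # a false result means the node could not be contacted; otherwise
+ # result[0] tells whether the directory was actually created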
+ if not result:
+ logger.Error("Could not connect to node '%s'" % inst.primary_node)
+ return False
+
+ if not result[0]:
+ logger.Error("failed to create directory '%s'" % file_storage_dir)
+ return False
+
for device in instance.disks:
logger.Info("creating volume %s for instance %s" %
- (device.iv_name, instance.name))
+ (device.iv_name, instance.name))
#HARDCODE
for secondary_node in instance.secondary_nodes:
if not _CreateBlockDevOnSecondary(cfg, secondary_node, instance,
logger.Error("failed to create volume %s on primary!" %
device.iv_name)
return False
+
return True
" continuing anyway" %
(device.iv_name, node))
result = False
+
+ if instance.disk_template == constants.DT_FILE:
+ file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
+ if not rpc.call_file_storage_dir_remove(instance.primary_node,
+ file_storage_dir):
+ logger.Error("could not remove directory '%s'" % file_storage_dir)
+ result = False
+
return result
if self.op.disk_template not in constants.DISK_TEMPLATES:
raise errors.OpPrereqError("Invalid disk template name")
+ if (self.op.file_driver and
+ not self.op.file_driver in constants.FILE_DRIVER):
+ raise errors.OpPrereqError("Invalid file driver name '%s'" %
+ self.op.file_driver)
+
+ if self.op.file_storage_dir and os.path.isabs(self.op.file_storage_dir):
+ raise errors.OpPrereqError("File storage directory not a relative"
+ " path")
+
if self.op.disk_template in constants.DTS_NET_MIRROR:
if getattr(self.op, "snode", None) is None:
raise errors.OpPrereqError("The networked disk templates need"
constants.DT_PLAIN: self.op.disk_size + self.op.swap_size,
# 256 MB are added for drbd metadata, 128MB for each drbd device
constants.DT_DRBD8: self.op.disk_size + self.op.swap_size + 256,
+ constants.DT_FILE: None,
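+ # (None: file-backed instances need no space in the volume group)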
}
if self.op.disk_template not in req_size_dict:
else:
network_port = None
+ # this is needed because os.path.join does not accept None arguments
+ if self.op.file_storage_dir is None:
+ string_file_storage_dir = ""
+ else:
+ string_file_storage_dir = self.op.file_storage_dir
+
+ # build the full file storage dir path
+ file_storage_dir = os.path.normpath(os.path.join(
+ self.sstore.GetFileStorageDir(),
+ string_file_storage_dir, instance))
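+ # e.g. <cluster file storage dir>/<optional subdirectory>/<instance name>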
+
disks = _GenerateDiskTemplate(self.cfg,
self.op.disk_template,
instance, pnode_name,
self.secondaries, self.op.disk_size,
- self.op.swap_size)
+ self.op.swap_size,
+ file_storage_dir,
+ self.op.file_driver)
iobj = objects.Instance(name=instance, os=self.op.os_type,
primary_node=pnode_name,
return result
-class LUSetInstanceParms(LogicalUnit):
+class LUSetInstanceParams(LogicalUnit):
"""Modifies an instances's parameters.
"""
self.kernel_path = getattr(self.op, "kernel_path", None)
self.initrd_path = getattr(self.op, "initrd_path", None)
self.hvm_boot_order = getattr(self.op, "hvm_boot_order", None)
- all_parms = [self.mem, self.vcpus, self.ip, self.bridge, self.mac,
- self.kernel_path, self.initrd_path, self.hvm_boot_order]
- if all_parms.count(None) == len(all_parms):
+ all_params = [self.mem, self.vcpus, self.ip, self.bridge, self.mac,
+ self.kernel_path, self.initrd_path, self.hvm_boot_order]
+ if all_params.count(None) == len(all_params):
raise errors.OpPrereqError("No changes submitted")
if self.mem is not None:
try:
instance = self.instance
dst_node = self.dst_node
src_node = instance.primary_node
- # shutdown the instance, unless requested not to do so
if self.op.shutdown:
- op = opcodes.OpShutdownInstance(instance_name=instance.name)
- self.proc.ChainOpCode(op)
+ # shutdown the instance, but not the disks
+ if not rpc.call_instance_shutdown(src_node, instance):
+ raise errors.OpExecError("Could not shutdown instance %s on node %s" %
+ (instance.name, src_node))
vgname = self.cfg.GetVGName()
snap_disks.append(new_dev)
finally:
- if self.op.shutdown:
- op = opcodes.OpStartupInstance(instance_name=instance.name,
- force=False)
- self.proc.ChainOpCode(op)
+ if self.op.shutdown and instance.status == "up":
+ if not rpc.call_instance_start(src_node, instance, None):
+ _ShutdownInstanceDisks(instance, self.cfg)
+ raise errors.OpExecError("Could not start instance")
# TODO: check for size
for dev in snap_disks:
- if not rpc.call_snapshot_export(src_node, dev, dst_node.name,
- instance):
- logger.Error("could not export block device %s from node"
- " %s to node %s" %
- (dev.logical_id[1], src_node, dst_node.name))
+ if not rpc.call_snapshot_export(src_node, dev, dst_node.name, instance):
+ logger.Error("could not export block device %s from node %s to node %s"
+ % (dev.logical_id[1], src_node, dst_node.name))
if not rpc.call_blockdev_remove(src_node, dev):
- logger.Error("could not remove snapshot block device %s from"
- " node %s" % (dev.logical_id[1], src_node))
+ logger.Error("could not remove snapshot block device %s from node %s" %
+ (dev.logical_id[1], src_node))
if not rpc.call_finalize_export(dst_node.name, instance, snap_disks):
logger.Error("could not finalize export for instance %s on node %s" %