X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/ae48ac3256ef0cf5a0bf210cbf0343358b8f602d..ba55d062da8dfb89a37afc2f13f2e689d0094829:/lib/backend.py

diff --git a/lib/backend.py b/lib/backend.py
index 830607e..08b65aa 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -46,6 +46,35 @@ from ganeti import objects
 from ganeti import ssconf
 
 
+class RPCFail(Exception):
+  """Class denoting RPC failure.
+
+  Its argument is the error message.
+
+  """
+
+def _Fail(msg, *args, **kwargs):
+  """Log an error and the raise an RPCFail exception.
+
+  This exception is then handled specially in the ganeti daemon and
+  turned into a 'failed' return type. As such, this function is a
+  useful shortcut for logging the error and returning it to the master
+  daemon.
+
+  @type msg: string
+  @param msg: the text of the exception
+  @raise RPCFail
+
+  """
+  if args:
+    msg = msg % args
+  if "exc" in kwargs and kwargs["exc"]:
+    logging.exception(msg)
+  else:
+    logging.error(msg)
+  raise RPCFail(msg)
+
+
 def _GetConfig():
   """Simple wrapper to return a SimpleStore.
 
@@ -260,9 +289,7 @@ def AddNode(dsa, dsapub, rsa, rsapub, sshkey, sshpub):
     priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS,
                                                     mkdir=True)
   except errors.OpExecError, err:
-    msg = "Error while processing user ssh files"
-    logging.exception(msg)
-    return (False, "%s: %s" % (msg, err))
+    _Fail("Error while processing user ssh files: %s", err, exc=True)
 
   for name, content in [(priv_key, sshkey), (pub_key, sshpub)]:
     utils.WriteFile(name, data=content, mode=0600)
@@ -660,19 +687,26 @@ def GetAllInstancesInfo(hypervisor_list):
           'state': state,
           'time': times,
           }
-        if name in output and output[name] != value:
-          raise errors.HypervisorError("Instance %s running duplicate"
-                                       " with different parameters" % name)
+        if name in output:
+          # we only check static parameters, like memory and vcpus,
+          # and not state and time which can change between the
+          # invocations of the different hypervisors
+          for key in 'memory', 'vcpus':
+            if value[key] != output[name][key]:
+              raise errors.HypervisorError("Instance %s is running twice"
+                                           " with different parameters" % name)
         output[name] = value
 
   return output
 
 
-def InstanceOsAdd(instance):
+def InstanceOsAdd(instance, reinstall):
   """Add an OS to an instance.
 
   @type instance: L{objects.Instance}
   @param instance: Instance whose OS is to be installed
+  @type reinstall: boolean
+  @param reinstall: whether this is an instance reinstall
   @rtype: boolean
   @return: the success of the operation
 
@@ -688,6 +722,8 @@ def InstanceOsAdd(instance):
               (os_name, os_dir, os_err))
 
   create_env = OSEnvironment(instance)
+  if reinstall:
+    create_env['INSTANCE_REINSTALL'] = "1"
 
   logfile = "%s/add-%s-%s-%d.log" % (constants.LOG_OS_DIR, instance.os,
                                      instance.name, int(time.time()))
@@ -856,7 +892,7 @@ def _GatherAndLinkBlockDevs(instance):
   return block_devices
 
 
-def StartInstance(instance, extra_args):
+def StartInstance(instance):
   """Start an instance.
 
   @type instance: L{objects.Instance}
@@ -873,14 +909,12 @@ def StartInstance(instance, extra_args):
   try:
     block_devices = _GatherAndLinkBlockDevs(instance)
     hyper = hypervisor.GetHypervisor(instance.hypervisor)
-    hyper.StartInstance(instance, block_devices, extra_args)
+    hyper.StartInstance(instance, block_devices)
   except errors.BlockDeviceError, err:
-    logging.exception("Failed to start instance")
-    return (False, "Block device error: %s" % str(err))
+    _Fail("Block device error: %s", err, exc=True)
   except errors.HypervisorError, err:
-    logging.exception("Failed to start instance")
     _RemoveBlockDevLinks(instance.name, instance.disks)
-    return (False, "Hypervisor error: %s" % str(err))
+    _Fail("Hypervisor error: %s", err, exc=True)
 
   return (True, "Instance started successfully")
 
@@ -906,9 +940,7 @@ def InstanceShutdown(instance):
   try:
     hyper.StopInstance(instance)
   except errors.HypervisorError, err:
-    msg = "Failed to stop instance %s: %s" % (instance.name, err)
-    logging.error(msg)
-    return (False, msg)
+    _Fail("Failed to stop instance %s: %s", instance.name, err)
 
   # test every 10secs for 2min
 
@@ -925,23 +957,18 @@ def InstanceShutdown(instance):
     try:
       hyper.StopInstance(instance, force=True)
     except errors.HypervisorError, err:
-      msg = "Failed to force stop instance %s: %s" % (instance.name, err)
-      logging.error(msg)
-      return (False, msg)
+      _Fail("Failed to force stop instance %s: %s", instance.name, err)
 
     time.sleep(1)
     if instance.name in GetInstanceList([hv_name]):
-      msg = ("Could not shutdown instance %s even by destroy" %
-             instance.name)
-      logging.error(msg)
-      return (False, msg)
+      _Fail("Could not shutdown instance %s even by destroy", instance.name)
 
   _RemoveBlockDevLinks(instance.name, instance.disks)
 
   return (True, "Instance has been shutdown successfully")
 
 
-def InstanceReboot(instance, reboot_type, extra_args):
+def InstanceReboot(instance, reboot_type):
   """Reboot an instance.
 
   @type instance: L{objects.Instance}
@@ -963,30 +990,24 @@ def InstanceReboot(instance, reboot_type, extra_args):
   running_instances = GetInstanceList([instance.hypervisor])
 
   if instance.name not in running_instances:
-    msg = "Cannot reboot instance %s that is not running" % instance.name
-    logging.error(msg)
-    return (False, msg)
+    _Fail("Cannot reboot instance %s that is not running", instance.name)
 
   hyper = hypervisor.GetHypervisor(instance.hypervisor)
   if reboot_type == constants.INSTANCE_REBOOT_SOFT:
     try:
       hyper.RebootInstance(instance)
     except errors.HypervisorError, err:
-      msg = "Failed to soft reboot instance %s: %s" % (instance.name, err)
-      logging.error(msg)
-      return (False, msg)
+      _Fail("Failed to soft reboot instance %s: %s", instance.name, err)
   elif reboot_type == constants.INSTANCE_REBOOT_HARD:
     try:
       stop_result = InstanceShutdown(instance)
       if not stop_result[0]:
         return stop_result
-      return StartInstance(instance, extra_args)
+      return StartInstance(instance)
     except errors.HypervisorError, err:
-      msg = "Failed to hard reboot instance %s: %s" % (instance.name, err)
-      logging.error(msg)
-      return (False, msg)
+      _Fail("Failed to hard reboot instance %s: %s", instance.name, err)
   else:
-    return (False, "Invalid reboot_type received: %s" % (reboot_type,))
+    _Fail("Invalid reboot_type received: %s", reboot_type)
 
   return (True, "Reboot successful")
 
@@ -1002,9 +1023,7 @@ def MigrationInfo(instance):
   try:
     info = hyper.MigrationInfo(instance)
   except errors.HypervisorError, err:
-    msg = "Failed to fetch migration information"
-    logging.exception(msg)
-    return (False, '%s: %s' % (msg, err))
+    _Fail("Failed to fetch migration information: %s", err, exc=True)
   return (True, info)
 
 
@@ -1023,9 +1042,7 @@ def AcceptInstance(instance, info, target):
   try:
     hyper.AcceptInstance(instance, info, target)
   except errors.HypervisorError, err:
-    msg = "Failed to accept instance"
-    logging.exception(msg)
-    return (False, '%s: %s' % (msg, err))
+    _Fail("Failed to accept instance: %s", err, exc=True)
   return (True, "Accept successfull")
 
 
@@ -1044,9 +1061,7 @@ def FinalizeMigration(instance, info, success):
   try:
     hyper.FinalizeMigration(instance, info, success)
   except errors.HypervisorError, err:
-    msg = "Failed to finalize migration"
-    logging.exception(msg)
-    return (False, '%s: %s' % (msg, err))
+    _Fail("Failed to finalize migration: %s", err, exc=True)
   return (True, "Migration Finalized")
 
 
@@ -1071,9 +1086,7 @@ def MigrateInstance(instance, target, live):
   try:
     hyper.MigrateInstance(instance.name, target, live)
   except errors.HypervisorError, err:
-    msg = "Failed to migrate instance"
-    logging.exception(msg)
-    return (False, "%s: %s" % (msg, err))
+    _Fail("Failed to migrate instance: %s", err, exc=True)
   return (True, "Migration successfull")
 
 
@@ -1104,42 +1117,32 @@ def BlockdevCreate(disk, size, owner, on_primary, info):
       try:
         crdev = _RecursiveAssembleBD(child, owner, on_primary)
       except errors.BlockDeviceError, err:
-        errmsg = "Can't assemble device %s: %s" % (child, err)
-        logging.error(errmsg)
-        return False, errmsg
+        _Fail("Can't assemble device %s: %s", child, err)
       if on_primary or disk.AssembleOnSecondary():
         # we need the children open in case the device itself has to
         # be assembled
         try:
           crdev.Open()
         except errors.BlockDeviceError, err:
-          errmsg = "Can't make child '%s' read-write: %s" % (child, err)
-          logging.error(errmsg)
-          return False, errmsg
+          _Fail("Can't make child '%s' read-write: %s", child, err)
       clist.append(crdev)
 
   try:
     device = bdev.Create(disk.dev_type, disk.physical_id, clist, size)
   except errors.BlockDeviceError, err:
-    return False, "Can't create block device: %s" % str(err)
+    _Fail("Can't create block device: %s", err)
 
   if on_primary or disk.AssembleOnSecondary():
     try:
       device.Assemble()
     except errors.BlockDeviceError, err:
-      errmsg = ("Can't assemble device after creation, very"
-                " unusual event: %s" % str(err))
-      logging.error(errmsg)
-      return False, errmsg
+      _Fail("Can't assemble device after creation, unusual event: %s", err)
     device.SetSyncSpeed(constants.SYNC_SPEED)
     if on_primary or disk.OpenOnSecondary():
       try:
         device.Open(force=True)
       except errors.BlockDeviceError, err:
-        errmsg = ("Can't make device r/w after creation, very"
-                  " unusual event: %s" % str(err))
-        logging.error(errmsg)
-        return False, errmsg
+        _Fail("Can't make device r/w after creation, unusual event: %s", err)
     DevCacheManager.UpdateCache(device.dev_path, owner,
                                 on_primary, disk.iv_name)
 
@@ -1317,15 +1320,12 @@ def BlockdevAddchildren(parent_cdev, new_cdevs):
   """
   parent_bdev = _RecursiveFindBD(parent_cdev)
   if parent_bdev is None:
-    logging.error("Can't find parent device")
-    return False
+    _Fail("Can't find parent device '%s' in add children", parent_cdev)
   new_bdevs = [_RecursiveFindBD(disk) for disk in new_cdevs]
   if new_bdevs.count(None) > 0:
-    logging.error("Can't find new device(s) to add: %s:%s",
-                  new_bdevs, new_cdevs)
-    return False
+    _Fail("Can't find new device(s) to add: %s:%s", new_bdevs, new_cdevs)
   parent_bdev.AddChildren(new_bdevs)
-  return True
+  return (True, None)
 
 
 def BlockdevRemovechildren(parent_cdev, new_cdevs):
@@ -1341,23 +1341,20 @@ def BlockdevRemovechildren(parent_cdev, new_cdevs):
   """
   parent_bdev = _RecursiveFindBD(parent_cdev)
   if parent_bdev is None:
-    logging.error("Can't find parent in remove children: %s", parent_cdev)
-    return False
+    _Fail("Can't find parent device '%s' in remove children", parent_cdev)
   devs = []
   for disk in new_cdevs:
     rpath = disk.StaticDevPath()
     if rpath is None:
       bd = _RecursiveFindBD(disk)
       if bd is None:
-        logging.error("Can't find dynamic device %s while removing children",
-                      disk)
-        return False
+        _Fail("Can't find device %s while removing children", disk)
       else:
         devs.append(bd.dev_path)
     else:
       devs.append(rpath)
   parent_bdev.RemoveChildren(devs)
-  return True
+  return (True, None)
 
 
 def BlockdevGetmirrorstatus(disks):
@@ -1377,9 +1374,9 @@ def BlockdevGetmirrorstatus(disks):
   for dsk in disks:
     rbd = _RecursiveFindBD(dsk)
     if rbd is None:
-      raise errors.BlockDeviceError("Can't find device %s" % str(dsk))
+      _Fail("Can't find device %s", dsk)
     stats.append(rbd.CombinedSyncStatus())
-  return stats
+  return True, stats
 
 
 def _RecursiveFindBD(disk):
@@ -1418,7 +1415,7 @@ def BlockdevFind(disk):
   try:
     rbd = _RecursiveFindBD(disk)
   except errors.BlockDeviceError, err:
-    return (False, str(err))
+    _Fail("Failed to find device: %s", err, exc=True)
   if rbd is None:
     return (True, None)
   return (True, (rbd.dev_path, rbd.major, rbd.minor) + rbd.GetSyncStatus())
@@ -1450,27 +1447,30 @@ def UploadFile(file_name, data, mode, uid, gid, atime, mtime):
 
   """
   if not os.path.isabs(file_name):
-    logging.error("Filename passed to UploadFile is not absolute: '%s'",
-                  file_name)
-    return False
+    _Fail("Filename passed to UploadFile is not absolute: '%s'", file_name)
 
-  allowed_files = [
+  allowed_files = set([
     constants.CLUSTER_CONF_FILE,
     constants.ETC_HOSTS,
     constants.SSH_KNOWN_HOSTS_FILE,
     constants.VNC_PASSWORD_FILE,
-    ]
+    constants.RAPI_CERT_FILE,
+    constants.RAPI_USERS_FILE,
+    ])
+
+  for hv_name in constants.HYPER_TYPES:
+    hv_class = hypervisor.GetHypervisor(hv_name)
+    allowed_files.update(hv_class.GetAncillaryFiles())
 
   if file_name not in allowed_files:
-    logging.error("Filename passed to UploadFile not in allowed"
-                 " upload targets: '%s'", file_name)
-    return False
+    _Fail("Filename passed to UploadFile not in allowed upload targets: '%s'",
+          file_name)
 
   raw_data = _Decompress(data)
 
   utils.WriteFile(file_name, data=raw_data, mode=mode, uid=uid, gid=gid,
                   atime=atime, mtime=mtime)
-  return True
+  return (True, "success")
 
 
 def WriteSsconfFiles(values):
@@ -1680,7 +1680,11 @@ def OSEnvironment(instance, debug=0):
     result['NIC_%d_MAC' % idx] = nic.mac
     if nic.ip:
       result['NIC_%d_IP' % idx] = nic.ip
-    result['NIC_%d_BRIDGE' % idx] = nic.bridge
+    result['NIC_%d_MODE' % idx] = nic.nicparams[constants.NIC_MODE]
+    if nic.nicparams[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
+      result['NIC_%d_BRIDGE' % idx] = nic.nicparams[constants.NIC_LINK]
+    if nic.nicparams[constants.NIC_LINK]:
+      result['NIC_%d_LINK' % idx] = nic.nicparams[constants.NIC_LINK]
     if constants.HV_NIC_TYPE in instance.hvparams:
       result['NIC_%d_FRONTEND_TYPE' % idx] = \
         instance.hvparams[constants.HV_NIC_TYPE]
@@ -1708,7 +1712,7 @@ def BlockdevGrow(disk, amount):
   try:
     r_dev.Grow(amount)
   except errors.BlockDeviceError, err:
-    return False, str(err)
+    _Fail("Failed to grow block device: %s", err, exc=True)
 
   return True, None
 
@@ -1739,13 +1743,12 @@ def BlockdevSnapshot(disk):
     r_dev = _RecursiveFindBD(disk)
     if r_dev is not None:
       # let's stay on the safe side and ask for the full size, for now
-      return r_dev.Snapshot(disk.size)
+      return True, r_dev.Snapshot(disk.size)
     else:
-      return None
+      _Fail("Cannot find block device %s", disk)
   else:
-    raise errors.ProgrammerError("Cannot snapshot non-lvm block device"
-                                 " '%s' of type '%s'" %
-                                 (disk.unique_id, disk.dev_type))
+    _Fail("Cannot snapshot non-lvm block device '%s' of type '%s'",
+          disk.unique_id, disk.dev_type)
 
 
 def ExportSnapshot(disk, dest_node, instance, cluster_name, idx):
@@ -1777,8 +1780,8 @@ def ExportSnapshot(disk, dest_node, instance, cluster_name, idx):
     os.mkdir(constants.LOG_OS_DIR, 0750)
   real_disk = _RecursiveFindBD(disk)
   if real_disk is None:
-    raise errors.BlockDeviceError("Block device '%s' is not set up" %
-                                  str(disk))
+    _Fail("Block device '%s' is not set up", disk)
+
   real_disk.Open()
 
   export_env['EXPORT_DEVICE'] = real_disk.dev_path
@@ -1807,11 +1810,10 @@ def ExportSnapshot(disk, dest_node, instance, cluster_name, idx):
   result = utils.RunCmd(command, env=export_env)
 
   if result.failed:
-    logging.error("os snapshot export command '%s' returned error: %s"
-                  " output: %s", command, result.fail_reason, result.output)
-    return False
+    _Fail("OS snapshot export command '%s' returned error: %s"
+          " output: %s", command, result.fail_reason, result.output)
 
-  return True
+  return (True, None)
 
 
 def FinalizeExport(instance, snap_disks):
@@ -1997,10 +1999,12 @@ def BlockdevRename(devlist):
   @return: True if all renames succeeded, False otherwise
 
   """
+  msgs = []
   result = True
   for disk, unique_id in devlist:
     dev = _RecursiveFindBD(disk)
     if dev is None:
+      msgs.append("Can't find device %s in rename" % str(disk))
       result = False
       continue
     try:
@@ -2015,9 +2019,11 @@ def BlockdevRename(devlist):
         # cache? for now, we only lose lvm data when we rename, which
         # is less critical than DRBD or MD
     except errors.BlockDeviceError, err:
+      msgs.append("Can't rename device '%s' to '%s': %s" %
+                  (dev, unique_id, err))
       logging.exception("Can't rename device '%s' to '%s'", dev, unique_id)
       result = False
-  return result
+  return (result, "; ".join(msgs))
 
 
 def _TransformFileStorageDir(file_storage_dir):
@@ -2247,7 +2253,7 @@ def BlockdevClose(instance_name, disks):
   for cf in disks:
     rd = _RecursiveFindBD(cf)
     if rd is None:
-      return (False, "Can't find device %s" % cf)
+      _Fail("Can't find device %s", cf)
     bdevs.append(rd)
 
   msg = []
@@ -2338,8 +2344,8 @@ def DrbdDisconnectNet(nodes_ip, disks):
     try:
       rd.DisconnectNet()
     except errors.BlockDeviceError, err:
-      logging.exception("Failed to go into standalone mode")
-      return (False, "Can't change network configuration: %s" % str(err))
+      _Fail("Can't change network configuration to standalone mode: %s",
+            err, exc=True)
   return (True, "All disks are now disconnected")
 
 
@@ -2356,14 +2362,14 @@ def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster):
       try:
         _SymlinkBlockDev(instance_name, rd.dev_path, idx)
       except EnvironmentError, err:
-        return (False, "Can't create symlink: %s" % str(err))
+        _Fail("Can't create symlink: %s", err)
   # reconnect disks, switch to new master configuration and if
   # needed primary mode
   for rd in bdevs:
     try:
       rd.AttachNet(multimaster)
     except errors.BlockDeviceError, err:
-      return (False, "Can't change network configuration: %s" % str(err))
+      _Fail("Can't change network configuration: %s", err)
   # wait until the disks are connected; we need to retry the re-attach
   # if the device becomes standalone, as this might happen if the one
   # node disconnects and reconnects in a different mode before the
@@ -2385,7 +2391,7 @@ def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster):
         try:
           rd.ReAttachNet(multimaster)
         except errors.BlockDeviceError, err:
-          return (False, "Can't change network configuration: %s" % str(err))
+          _Fail("Can't change network configuration: %s", err)
     if all_connected:
       break
     time.sleep(sleep_time)
@@ -2398,7 +2404,7 @@ def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster):
       try:
         rd.Open()
       except errors.BlockDeviceError, err:
-        return (False, "Can't change to primary mode: %s" % str(err))
+        _Fail("Can't change to primary mode: %s", err)
   if multimaster:
     msg = "multi-master and primary"
   else:
@@ -2428,6 +2434,25 @@ def DrbdWaitSync(nodes_ip, disks):
   return (not failure, (alldone, min_resync))
 
 
+def PowercycleNode(hypervisor_type):
+  """Hard-powercycle the node.
+
+  Because we need to return first, and schedule the powercycle in the
+  background, we won't be able to report failures nicely.
+
+  """
+  hyper = hypervisor.GetHypervisor(hypervisor_type)
+  try:
+    pid = os.fork()
+  except OSError, err:
+    # if we can't fork, we'll pretend that we're in the child process
+    pid = 0
+  if pid > 0:
+    return (True, "Reboot scheduled in 5 seconds")
+  time.sleep(5)
+  hyper.PowercycleNode()
+
+
 class HooksRunner(object):
   """Hook runner.