Revision 53c776b5

b/lib/backend.py
       - msg is a string with details in case of failure

   """
-  hyper = hypervisor.GetHypervisor(instance.hypervisor_name)
+  hyper = hypervisor.GetHypervisor(instance.hypervisor)

   try:
     hyper.MigrateInstance(instance.name, target, live)
   except errors.HypervisorError, err:
-    msg = "Failed to migrate instance: %s" % str(err)
-    logging.error(msg)
-    return (False, msg)
+    msg = "Failed to migrate instance"
+    logging.exception(msg)
+    return (False, "%s: %s" % (msg, err))
   return (True, "Migration successful")

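The reworked error path logs the full traceback via logging.exception and
hands the failure reason back to the caller. A minimal sketch (not part of
the patch) of how a caller might consume the (status, msg) pair:

  # Sketch only: consuming the tuple that backend.MigrateInstance returns
  status, msg = MigrateInstance(instance, target, live)
  if not status:
    # msg carries the generic prefix plus the hypervisor error text
    logging.error("migration RPC failed: %s", msg)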
......
     return status, bdevs

   if multimaster:
-    for cf, rd in zip(disks, bdevs):
+    for idx, rd in enumerate(bdevs):
       try:
-        _SymlinkBlockDev(instance_name, rd.dev_path, cf.iv_name)
+        _SymlinkBlockDev(instance_name, rd.dev_path, idx)
       except EnvironmentError, err:
         return (False, "Can't create symlink: %s" % str(err))
   # reconnect disks, switch to new master configuration and if
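The symlinks are now keyed on each disk's positional index rather than its
configured iv_name, so the loop needs only the attached block devices. A
hypothetical illustration of the resulting naming (the actual path layout
lives in _SymlinkBlockDev, which this hunk does not show):

  # Hypothetical illustration only; real layout is in _SymlinkBlockDev.
  links = [("%s:disk%d" % (instance_name, idx), rd.dev_path)
           for idx, rd in enumerate(bdevs)]
  # e.g. [("inst1:disk0", "/dev/drbd0"), ("inst1:disk1", "/dev/drbd1")]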
b/lib/cmdlib.py
                                  (instance.name, target_node))


+class LUMigrateInstance(LogicalUnit):
+  """Migrate an instance.
+
+  This is migration without shutting down the instance, compared to
+  failover, which is done with a shutdown.
+
+  """
+  HPATH = "instance-migrate"
+  HTYPE = constants.HTYPE_INSTANCE
+  _OP_REQP = ["instance_name", "live", "cleanup"]
+
+  REQ_BGL = False
+
+  def ExpandNames(self):
+    self._ExpandAndLockInstance()
+    self.needed_locks[locking.LEVEL_NODE] = []
+    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
+
+  def DeclareLocks(self, level):
+    if level == locking.LEVEL_NODE:
+      self._LockInstancesNodes()
+
+  def BuildHooksEnv(self):
+    """Build hooks env.
+
+    This runs on master, primary and secondary nodes of the instance.
+
+    """
+    env = _BuildInstanceHookEnvByObject(self, self.instance)
+    nl = [self.cfg.GetMasterNode()] + list(self.instance.secondary_nodes)
+    return env, nl, nl
+
+  def CheckPrereq(self):
+    """Check prerequisites.
+
+    This checks that the instance is in the cluster.
+
+    """
+    instance = self.cfg.GetInstanceInfo(
+      self.cfg.ExpandInstanceName(self.op.instance_name))
+    if instance is None:
+      raise errors.OpPrereqError("Instance '%s' not known" %
+                                 self.op.instance_name)
+
+    if instance.disk_template != constants.DT_DRBD8:
+      raise errors.OpPrereqError("Instance's disk layout is not"
+                                 " drbd8, cannot migrate.")
+
+    secondary_nodes = instance.secondary_nodes
+    if not secondary_nodes:
+      raise errors.ProgrammerError("no secondary node but using "
+                                   "drbd8 disk template")
+
+    i_be = self.cfg.GetClusterInfo().FillBE(instance)
+
+    target_node = secondary_nodes[0]
+    # check memory requirements on the secondary node
+    _CheckNodeFreeMemory(self, target_node, "migrating instance %s" %
+                         instance.name, i_be[constants.BE_MEMORY],
+                         instance.hypervisor)
+
+    # check bridge existence
+    brlist = [nic.bridge for nic in instance.nics]
+    result = self.rpc.call_bridges_exist(target_node, brlist)
+    if result.failed or not result.data:
+      raise errors.OpPrereqError("One or more target bridges %s do not"
+                                 " exist on destination node '%s'" %
+                                 (brlist, target_node))
+
+    if not self.op.cleanup:
+      result = self.rpc.call_instance_migratable(instance.primary_node,
+                                                 instance)
+      msg = result.RemoteFailMsg()
+      if msg:
+        raise errors.OpPrereqError("Can't migrate: %s - please use failover" %
+                                   msg)
+
+    self.instance = instance
+
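CheckPrereq and most of the methods below share one pattern for the
new-style RPC results: extract the failure text with RemoteFailMsg() and
raise if it is non-empty. A hedged sketch of that pattern as a standalone
helper (not part of the patch; the name is made up):

  def _CheckRpcResult(result, node, action):
    # Hypothetical helper: fold the recurring RemoteFailMsg() check
    # into one place.
    msg = result.RemoteFailMsg()
    if msg:
      raise errors.OpExecError("Cannot %s on node %s: %s" %
                               (action, node, msg))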
+  def _WaitUntilSync(self):
+    """Poll with custom rpc for disk sync.
+
+    This uses our own step-based rpc call.
+
+    """
+    self.feedback_fn("* wait until resync is done")
+    all_done = False
+    while not all_done:
+      all_done = True
+      result = self.rpc.call_drbd_wait_sync(self.all_nodes,
+                                            self.nodes_ip,
+                                            self.instance.disks)
+      min_percent = 100
+      for node, nres in result.items():
+        msg = nres.RemoteFailMsg()
+        if msg:
+          raise errors.OpExecError("Cannot resync disks on node %s: %s" %
+                                   (node, msg))
+        node_done, node_percent = nres.data[1]
+        all_done = all_done and node_done
+        if node_percent is not None:
+          min_percent = min(min_percent, node_percent)
+      if not all_done:
+        if min_percent < 100:
+          self.feedback_fn("   - progress: %.1f%%" % min_percent)
+        time.sleep(2)
+
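The loop reduces the per-node (done, percent) reports to a single pair:
the wait ends only when every node reports done, and the progress shown is
the minimum across nodes. A self-contained sketch of that reduction (names
illustrative):

  def _AggregateSyncReports(reports):
    # reports: list of (node_done, node_percent) pairs, one per node
    all_done = True
    min_percent = 100
    for node_done, node_percent in reports:
      all_done = all_done and node_done
      if node_percent is not None:
        min_percent = min(min_percent, node_percent)
    return all_done, min_percent

  # _AggregateSyncReports([(True, 100), (False, 87.5)]) == (False, 87.5)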
+  def _EnsureSecondary(self, node):
+    """Demote a node to secondary.
+
+    """
+    self.feedback_fn("* switching node %s to secondary mode" % node)
+
+    for dev in self.instance.disks:
+      self.cfg.SetDiskID(dev, node)
+
+    result = self.rpc.call_blockdev_close(node, self.instance.name,
+                                          self.instance.disks)
+    msg = result.RemoteFailMsg()
+    if msg:
+      raise errors.OpExecError("Cannot change disk to secondary on node %s,"
+                               " error %s" % (node, msg))
+
+  def _GoStandalone(self):
+    """Disconnect from the network.
+
+    """
+    self.feedback_fn("* changing into standalone mode")
+    result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
+                                               self.instance.disks)
+    for node, nres in result.items():
+      msg = nres.RemoteFailMsg()
+      if msg:
+        raise errors.OpExecError("Cannot disconnect disks on node %s,"
+                                 " error %s" % (node, msg))
+
+  def _GoReconnect(self, multimaster):
+    """Reconnect to the network.
+
+    """
+    if multimaster:
+      msg = "dual-master"
+    else:
+      msg = "single-master"
+    self.feedback_fn("* changing disks into %s mode" % msg)
+    result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
+                                           self.instance.disks,
+                                           self.instance.name, multimaster)
+    for node, nres in result.items():
+      msg = nres.RemoteFailMsg()
+      if msg:
+        raise errors.OpExecError("Cannot change disks config on node %s,"
+                                 " error: %s" % (node, msg))
+
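Together these three helpers cover all the DRBD reconfiguration steps; the
migration and cleanup paths compose them in a fixed order. A condensed
restatement (not an independent implementation) of the sequence that
_ExecMigration below drives for a successful live migration:

  def _MigrationDiskSequence(lu, source_node, target_node):
    # Condensed from _ExecMigration below, given an LUMigrateInstance 'lu'
    lu._EnsureSecondary(target_node)  # target starts out as secondary
    lu._GoStandalone()                # both sides drop the network link
    lu._GoReconnect(True)             # reconnect in dual-master mode
    lu._WaitUntilSync()               # fully synced before migrating
    # ... the instance is migrated here (call_instance_migrate) ...
    lu._EnsureSecondary(source_node)  # demote the old primary
    lu._WaitUntilSync()
    lu._GoStandalone()
    lu._GoReconnect(False)            # back to single-master
    lu._WaitUntilSync()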
+  def _ExecCleanup(self):
+    """Try to clean up after a failed migration.
+
+    The cleanup is done by:
+      - check that the instance is running only on one node
+        (and update the config if needed)
+      - change disks on its secondary node to secondary
+      - wait until disks are fully synchronized
+      - disconnect from the network
+      - change disks into single-master mode
+      - wait again until disks are fully synchronized
+
+    """
+    instance = self.instance
+    target_node = self.target_node
+    source_node = self.source_node
+
+    # check running on only one node
+    self.feedback_fn("* checking where the instance actually runs"
+                     " (if this hangs, the hypervisor might be in"
+                     " a bad state)")
+    ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
+    for node, result in ins_l.items():
+      result.Raise()
+      if not isinstance(result.data, list):
+        raise errors.OpExecError("Can't contact node '%s'" % node)
+
+    runningon_source = instance.name in ins_l[source_node].data
+    runningon_target = instance.name in ins_l[target_node].data
+
+    if runningon_source and runningon_target:
+      raise errors.OpExecError("Instance seems to be running on two nodes,"
+                               " or the hypervisor is confused. You will have"
+                               " to ensure manually that it runs only on one"
+                               " and restart this operation.")
+
+    if not (runningon_source or runningon_target):
+      raise errors.OpExecError("Instance does not seem to be running at all."
+                               " In this case, it's safer to repair by"
+                               " running 'gnt-instance stop' to ensure disk"
+                               " shutdown, and then restarting it.")
+
+    if runningon_target:
+      # the migration has actually succeeded, we need to update the config
+      self.feedback_fn("* instance running on secondary node (%s),"
+                       " updating config" % target_node)
+      instance.primary_node = target_node
+      self.cfg.Update(instance)
+      demoted_node = source_node
+    else:
+      self.feedback_fn("* instance confirmed to be running on its"
+                       " primary node (%s)" % source_node)
+      demoted_node = target_node
+
+    self._EnsureSecondary(demoted_node)
+    try:
+      self._WaitUntilSync()
+    except errors.OpExecError:
+      # we ignore errors here, since if the device is standalone, it
+      # won't be able to sync
+      pass
+    self._GoStandalone()
+    self._GoReconnect(False)
+    self._WaitUntilSync()
+
+    self.feedback_fn("* done")
+
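The first half of _ExecCleanup boils down to a four-case decision on where
the instance actually runs; only the two unambiguous cases proceed. An
illustrative restatement (not in the patch):

  def _CleanupDecision(runningon_source, runningon_target):
    # Which node to demote, per _ExecCleanup's four cases
    if runningon_source and runningon_target:
      raise errors.OpExecError("running on both nodes, repair manually")
    if not (runningon_source or runningon_target):
      raise errors.OpExecError("not running at all, stop/start it instead")
    if runningon_target:
      return "source"  # migration had succeeded: demote the old primary
    return "target"    # still on the old primary: demote the target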
+  def _ExecMigration(self):
+    """Migrate an instance.
+
+    The migration is done by:
+      - change the disks into dual-master mode
+      - wait until disks are fully synchronized again
+      - migrate the instance
+      - change disks on the new secondary node (the old primary) to secondary
+      - wait until disks are fully synchronized
+      - change disks into single-master mode
+
+    """
+    instance = self.instance
+    target_node = self.target_node
+    source_node = self.source_node
+
+    self.feedback_fn("* checking disk consistency between source and target")
+    for dev in instance.disks:
+      if not _CheckDiskConsistency(self, dev, target_node, False):
+        raise errors.OpExecError("Disk %s is degraded or not fully"
+                                 " synchronized on target node,"
+                                 " aborting migration." % dev.iv_name)
+
+    self._EnsureSecondary(target_node)
+    self._GoStandalone()
+    self._GoReconnect(True)
+    self._WaitUntilSync()
+
+    self.feedback_fn("* migrating instance to %s" % target_node)
+    time.sleep(10)
+    result = self.rpc.call_instance_migrate(source_node, instance,
+                                            self.nodes_ip[target_node],
+                                            self.op.live)
+    msg = result.RemoteFailMsg()
+    if msg:
+      logging.error("Instance migration failed, trying to revert"
+                    " disk status: %s", msg)
+      try:
+        self._EnsureSecondary(target_node)
+        self._GoStandalone()
+        self._GoReconnect(False)
+        self._WaitUntilSync()
+      except errors.OpExecError, err:
+        self.LogWarning("Migration failed and I can't reconnect the"
+                        " drives: error '%s'\n"
+                        "Please look and recover the instance status" %
+                        str(err))
+
+      raise errors.OpExecError("Could not migrate instance %s: %s" %
+                               (instance.name, msg))
+    time.sleep(10)
+
+    instance.primary_node = target_node
+    # distribute new instance config to the other nodes
+    self.cfg.Update(instance)
+
+    self._EnsureSecondary(source_node)
+    self._WaitUntilSync()
+    self._GoStandalone()
+    self._GoReconnect(False)
+    self._WaitUntilSync()
+
+    self.feedback_fn("* done")
+
+  def Exec(self, feedback_fn):
+    """Perform the migration.
+
+    """
+    self.feedback_fn = feedback_fn
+
+    self.source_node = self.instance.primary_node
+    self.target_node = self.instance.secondary_nodes[0]
+    self.all_nodes = [self.source_node, self.target_node]
+    self.nodes_ip = {
+      self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip,
+      self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip,
+      }
+    if self.op.cleanup:
+      return self._ExecCleanup()
+    else:
+      return self._ExecMigration()
+
+
 def _CreateBlockDevOnPrimary(lu, node, instance, device, info):
   """Create a tree of block devices on the primary node.

b/lib/hypervisor/hv_xen.py
     raise NotImplementedError

   @staticmethod
-  def _RemoveConfigFile(instance):
+  def _RemoveConfigFile(instance_name):
     """Remove the xen configuration file.

     """
-    utils.RemoveFile("/etc/xen/%s" % instance.name)
+    utils.RemoveFile("/etc/xen/%s" % instance_name)

   @staticmethod
   def _GetXMList(include_node):
......
     """Stop an instance.

     """
-    self._RemoveConfigFile(instance)
+    self._RemoveConfigFile(instance.name)
     if force:
       command = ["xm", "destroy", instance.name]
     else:
......
     if result.failed:
       raise errors.HypervisorError("Failed to migrate instance %s: %s" %
                                    (instance, result.output))
+    # remove old xen file after migration succeeded
+    try:
+      self._RemoveConfigFile(instance)
+    except EnvironmentError, err:
+      logger.Error("Failure while removing instance config file: %s" %
+                   str(err))


 class XenPvmHypervisor(XenHypervisor):
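The signature change makes both call sites consistent: StopInstance holds
an objects.Instance and now passes its name, while in MigrateInstance the
'instance' argument is already a plain name string:

  # The two call sites after this change, as shown in the hunks above:
  self._RemoveConfigFile(instance.name)  # StopInstance: instance object
  self._RemoveConfigFile(instance)       # MigrateInstance: already a name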
b/lib/mcpu.py
     opcodes.OpDeactivateInstanceDisks: cmdlib.LUDeactivateInstanceDisks,
     opcodes.OpReplaceDisks: cmdlib.LUReplaceDisks,
     opcodes.OpFailoverInstance: cmdlib.LUFailoverInstance,
+    opcodes.OpMigrateInstance: cmdlib.LUMigrateInstance,
     opcodes.OpConnectConsole: cmdlib.LUConnectConsole,
     opcodes.OpQueryInstances: cmdlib.LUQueryInstances,
     opcodes.OpQueryInstanceData: cmdlib.LUQueryInstanceData,
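The one-line addition registers the new opcode in the processor's
opcode-to-LU table; dispatch is then a plain dictionary lookup. Roughly (a
sketch, assuming the table above is consulted like this; the constructor
arguments are illustrative):

  lu_class = dispatch_table[op.__class__]  # -> cmdlib.LUMigrateInstance
  lu = lu_class(processor, op, context)    # then built and executed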
b/lib/opcodes.py
   __slots__ = ["instance_name", "ignore_consistency"]


+class OpMigrateInstance(OpCode):
+  """Migrate an instance.
+
+  This migrates (without shutting down) an instance to its secondary
+  node.
+
+  @var instance_name: the name of the instance
+
+  """
+  OP_ID = "OP_INSTANCE_MIGRATE"
+  __slots__ = ["instance_name", "live", "cleanup"]
+
+
 class OpConnectConsole(OpCode):
   """Connect to an instance's console."""
   OP_ID = "OP_INSTANCE_CONSOLE"
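Clients build the opcode with keyword arguments matching __slots__; this
is exactly what gnt-instance does below (the instance name here is
hypothetical):

  op = opcodes.OpMigrateInstance(instance_name="instance1.example.com",
                                 live=True, cleanup=False)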
b/scripts/gnt-instance
   return 0


+def MigrateInstance(opts, args):
+  """Migrate an instance.
+
+  The migration is done without shutdown.
+
+  Args:
+    opts - class with options as members
+    args - list with a single element, the instance name
+  Opts used:
+    force - whether to migrate without asking questions.
+
+  """
+  instance_name = args[0]
+  force = opts.force
+
+  if not force:
+    if opts.cleanup:
+      usertext = ("Instance %s will be recovered from a failed migration."
+                  " Note that the migration procedure (including cleanup)" %
+                  (instance_name,))
+    else:
+      usertext = ("Instance %s will be migrated. Note that migration" %
+                  (instance_name,))
+    usertext += (" is **experimental** in this version."
+                 " This might impact the instance if anything goes wrong."
+                 " Continue?")
+    if not AskUser(usertext):
+      return 1
+
+  op = opcodes.OpMigrateInstance(instance_name=instance_name, live=opts.live,
+                                 cleanup=opts.cleanup)
+  SubmitOpCode(op)
+  return 0
+
+
 def ConnectToInstanceConsole(opts, args):
   """Connect to the console of an instance.

......
                "[-f] <instance>",
                "Stops the instance and starts it on the backup node, using"
                " the remote mirror (only for instances of type drbd)"),
+  'migrate': (MigrateInstance, ARGS_ONE,
+              [DEBUG_OPT, FORCE_OPT,
+               make_option("--non-live", dest="live",
+                           default=True, action="store_false",
+                           help="Do a non-live migration (this usually means"
+                           " freeze the instance, save the state,"
+                           " transfer and only then resume running on the"
+                           " secondary node)"),
+               make_option("--cleanup", dest="cleanup",
+                           default=False, action="store_true",
+                           help="Instead of performing the migration, try to"
+                           " recover from a failed migration. This is safe"
+                           " to run even if the instance is healthy, but it"
+                           " will create extra replication traffic and"
+                           " briefly disrupt the replication (like during the"
+                           " migration)"),
+               ],
+              "[-f] <instance>",
+              "Migrate instance to its secondary node"
+              " (only for instances of type drbd)"),
   'info': (ShowInstanceConfig, ARGS_ANY,
            [DEBUG_OPT,
             make_option("-s", "--static", dest="static",