Statistics
| Branch: | Tag: | Revision:

root / lib / cmdlib.py @ 3e91897b

History | View | Annotate | Download (168.1 kB)

1
#
2
#
3

    
4
# Copyright (C) 2006, 2007, 2008 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Module implementing the master-side code."""
23

    
24
# pylint: disable-msg=W0613,W0201
25

    
26
import os
27
import os.path
28
import sha
29
import time
30
import tempfile
31
import re
32
import platform
33

    
34
from ganeti import rpc
35
from ganeti import ssh
36
from ganeti import logger
37
from ganeti import utils
38
from ganeti import errors
39
from ganeti import hypervisor
40
from ganeti import config
41
from ganeti import constants
42
from ganeti import objects
43
from ganeti import opcodes
44
from ganeti import ssconf
45
from ganeti import serializer
46

    
47

    
48
class LogicalUnit(object):
  """Common base class for all Logical Units.

  A subclass is expected to:
    - implement CheckPrereq, which must also canonicalize every opcode
      field (filling in None where needed)
    - implement Exec
    - implement BuildHooksEnv
    - override HPATH and HTYPE
    - optionally override the run requirements (REQ_MASTER); note that
      every command requires root permissions regardless

  """
  HPATH = None
  HTYPE = None
  _OP_REQP = []
  REQ_MASTER = True

  def __init__(self, processor, op, cfg, sstore):
    """Constructor for LogicalUnit.

    Validates that the opcode carries every attribute named in _OP_REQP,
    that the cluster exists, and (if REQ_MASTER) that we are running on
    the master node.  Derived classes override this to add their own
    opcode validity checks.

    """
    self.proc = processor
    self.op = op
    self.cfg = cfg
    self.sstore = sstore
    # SshRunner is created lazily on first use, see the `ssh` property
    self.__ssh = None

    # every attribute listed in _OP_REQP must be present on the opcode
    missing = [attr_name for attr_name in self._OP_REQP
               if getattr(op, attr_name, None) is None]
    if missing:
      raise errors.OpPrereqError("Required parameter '%s' missing" %
                                 missing[0])

    if not cfg.IsCluster():
      raise errors.OpPrereqError("Cluster not initialized yet,"
                                 " use 'gnt-cluster init' first.")

    if self.REQ_MASTER:
      master = sstore.GetMasterNode()
      if utils.HostInfo().name != master:
        raise errors.OpPrereqError("Commands must be run on the master"
                                   " node %s" % master)

  def __GetSSH(self):
    """Return the (lazily created) SshRunner for this LU.

    """
    if not self.__ssh:
      self.__ssh = ssh.SshRunner(self.sstore)
    return self.__ssh

  ssh = property(fget=__GetSSH)

  def CheckPrereq(self):
    """Check the prerequisites for this LU.

    Must verify, idempotently and without changing cluster or system
    state, that this LU can be executed; internode communication is
    allowed.  Raises errors.OpPrereqError when something is not
    fulfilled; the return value is ignored.

    Must also canonicalize all opcode parameters (e.g. expand short node
    names) so that hooks, logging, etc. work correctly afterwards.

    """
    raise NotImplementedError

  def Exec(self, feedback_fn):
    """Execute the LU.

    Implements the actual work.  Expected failures that the code deals
    with should be signalled by raising errors.OpExecError.

    """
    raise NotImplementedError

  def BuildHooksEnv(self):
    """Build the hooks environment for this LU.

    Must return a three-element tuple: the dict of environment variables
    for the LU-specific hook, the list of node names on which to run the
    pre-execution hook, and the list of node names for the post-execution
    hook.

    Keys must not carry the 'GANETI_' prefix (the hooks runner adds it,
    along with additional keys).  Use an empty dict -- never None -- for
    "no environment", and empty lists -- never None -- for "no nodes".

    Not called at all when the LU's HPATH is None.

    """
    raise NotImplementedError

  def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
    """Notify the LU about the results of its hooks.

    Called after every hooks phase.  The default implementation simply
    hands back the previous result unchanged; LUs that want to react to
    local cluster hook-scripts can override it.

    Args:
      phase: the hooks phase that has just been run
      hooks_results: the results of the multi-node hooks rpc call
      feedback_fn: function to send feedback back to the caller
      lu_result: the previous result this LU had, or None in the PRE phase.

    """
    return lu_result
172

    
173

    
174
class NoHooksLU(LogicalUnit):
  """Base class for Logical Units that run no hooks.

  Deriving from this class instead of LogicalUnit directly saves each
  hook-less LU from resetting HPATH/HTYPE itself, avoiding duplicated
  code.

  """
  # HPATH None disables the hooks machinery entirely for these LUs
  HPATH = None
  HTYPE = None
183

    
184

    
185
def _GetWantedNodes(lu, nodes):
  """Return the checked and expanded list of node names.

  Args:
    lu: the LogicalUnit on whose behalf we expand (provides the config)
    nodes: List of nodes (strings) or None for all

  Raises:
    errors.OpPrereqError: if `nodes` is not a list, or if any listed
      name does not resolve to a known node.

  """
  if not isinstance(nodes, list):
    raise errors.OpPrereqError("Invalid argument type 'nodes'")

  # an empty (or None-like) list means "every node in the cluster"
  if not nodes:
    return utils.NiceSort(lu.cfg.GetNodeList())

  wanted = []
  for name in nodes:
    expanded_name = lu.cfg.ExpandNodeName(name)
    if expanded_name is None:
      raise errors.OpPrereqError("No such node name '%s'" % name)
    wanted.append(expanded_name)

  return utils.NiceSort(wanted)
207

    
208

    
209
def _GetWantedInstances(lu, instances):
  """Return the checked and expanded list of instance names.

  Args:
    lu: the LogicalUnit on whose behalf we expand (provides the config)
    instances: List of instances (strings) or None for all

  Raises:
    errors.OpPrereqError: if `instances` is not a list, or if any listed
      name does not resolve to a known instance.

  """
  if not isinstance(instances, list):
    raise errors.OpPrereqError("Invalid argument type 'instances'")

  # an empty (or None-like) list means "every instance in the cluster"
  if not instances:
    return utils.NiceSort(lu.cfg.GetInstanceList())

  wanted = []
  for name in instances:
    expanded_name = lu.cfg.ExpandInstanceName(name)
    if expanded_name is None:
      raise errors.OpPrereqError("No such instance name '%s'" % name)
    wanted.append(expanded_name)

  return utils.NiceSort(wanted)
231

    
232

    
233
def _CheckOutputFields(static, dynamic, selected):
234
  """Checks whether all selected fields are valid.
235

236
  Args:
237
    static: Static fields
238
    dynamic: Dynamic fields
239

240
  """
241
  static_fields = frozenset(static)
242
  dynamic_fields = frozenset(dynamic)
243

    
244
  all_fields = static_fields | dynamic_fields
245

    
246
  if not all_fields.issuperset(selected):
247
    raise errors.OpPrereqError("Unknown output fields selected: %s"
248
                               % ",".join(frozenset(selected).
249
                                          difference(all_fields)))
250

    
251

    
252
def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
253
                          memory, vcpus, nics):
254
  """Builds instance related env variables for hooks from single variables.
255

256
  Args:
257
    secondary_nodes: List of secondary nodes as strings
258
  """
259
  env = {
260
    "OP_TARGET": name,
261
    "INSTANCE_NAME": name,
262
    "INSTANCE_PRIMARY": primary_node,
263
    "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
264
    "INSTANCE_OS_TYPE": os_type,
265
    "INSTANCE_STATUS": status,
266
    "INSTANCE_MEMORY": memory,
267
    "INSTANCE_VCPUS": vcpus,
268
  }
269

    
270
  if nics:
271
    nic_count = len(nics)
272
    for idx, (ip, bridge, mac) in enumerate(nics):
273
      if ip is None:
274
        ip = ""
275
      env["INSTANCE_NIC%d_IP" % idx] = ip
276
      env["INSTANCE_NIC%d_BRIDGE" % idx] = bridge
277
      env["INSTANCE_NIC%d_HWADDR" % idx] = mac
278
  else:
279
    nic_count = 0
280

    
281
  env["INSTANCE_NIC_COUNT"] = nic_count
282

    
283
  return env
284

    
285

    
286
def _BuildInstanceHookEnvByObject(instance, override=None):
  """Builds instance related env variables for hooks from an object.

  Args:
    instance: objects.Instance object of instance
    override: dict of values to override

  Returns:
    dict of hook environment variables, see _BuildInstanceHookEnv

  """
  args = {
    'name': instance.name,
    'primary_node': instance.primary_node,
    'secondary_nodes': instance.secondary_nodes,
    'os_type': instance.os,
    # BUGFIX: this used to pass instance.os, which made the hooks export
    # the OS name as INSTANCE_STATUS instead of the actual run status
    'status': instance.status,
    'memory': instance.memory,
    'vcpus': instance.vcpus,
    'nics': [(nic.ip, nic.bridge, nic.mac) for nic in instance.nics],
  }
  if override:
    args.update(override)
  return _BuildInstanceHookEnv(**args)
306

    
307

    
308
def _CheckInstanceBridgesExist(instance):
  """Check that the bridges needed by an instance exist.

  Args:
    instance: objects.Instance whose NIC bridges are verified on its
      primary node.

  Raises:
    errors.OpPrereqError: if the primary node reports that one or more
      of the bridges is missing (or the rpc call itself fails, since a
      false rpc result takes the same path).

  """
  # check bridges existence via a single rpc call to the primary node
  brlist = [nic.bridge for nic in instance.nics]
  if not rpc.call_bridges_exist(instance.primary_node, brlist):
    raise errors.OpPrereqError("one or more target bridges %s does not"
                               " exist on destination node '%s'" %
                               (brlist, instance.primary_node))
318

    
319

    
320
class LUDestroyCluster(NoHooksLU):
  """Logical unit for destroying the cluster.

  """
  _OP_REQP = []

  def CheckPrereq(self):
    """Check prerequisites.

    Refuses to run unless the master is the only remaining node and no
    instances are defined any more.

    Any errors are signalled by raising errors.OpPrereqError.

    """
    master_name = self.sstore.GetMasterNode()

    remaining_nodes = self.cfg.GetNodeList()
    if len(remaining_nodes) != 1 or remaining_nodes[0] != master_name:
      raise errors.OpPrereqError("There are still %d node(s) in"
                                 " this cluster." % (len(remaining_nodes) - 1))

    remaining_instances = self.cfg.GetInstanceList()
    if remaining_instances:
      raise errors.OpPrereqError("There are still %d instance(s) in"
                                 " this cluster." % len(remaining_instances))

  def Exec(self, feedback_fn):
    """Destroys the cluster.

    Stops the master role, backs up the ganeti ssh keys and finally
    tells the master node to leave the (now empty) cluster.

    """
    master_name = self.sstore.GetMasterNode()
    if not rpc.call_node_stop_master(master_name):
      raise errors.OpExecError("Could not disable the master role")

    # keep a copy of the ssh key pair before the node wipes its setup
    priv_key, pub_key, _ = ssh.GetUserFiles(constants.GANETI_RUNAS)
    for key_file in (priv_key, pub_key):
      utils.CreateBackup(key_file)

    rpc.call_node_leave_cluster(master_name)
356

    
357

    
358
class LUVerifyCluster(LogicalUnit):
  """Verifies the cluster status.

  Runs a series of per-node, per-instance and cluster-wide checks and
  reports every problem through the feedback function; Exec returns a
  non-zero int when anything failed.

  """
  HPATH = "cluster-verify"
  HTYPE = constants.HTYPE_CLUSTER
  _OP_REQP = ["skip_checks"]

  def _VerifyNode(self, node, file_list, local_cksum, vglist, node_result,
                  remote_version, feedback_fn):
    """Run multiple tests against a node.

    Test list:
      - compares ganeti version
      - checks vg existence and size > 20G
      - checks config file checksum
      - checks ssh to other nodes

    Args:
      node: name of the node to check
      file_list: required list of files
      local_cksum: dictionary of local files and their checksums
      vglist: the node's volume group data (false value means the check
        could not be performed)
      node_result: dict returned by the node-verify rpc for this node
      remote_version: protocol version reported by the node (false value
        means the node could not be contacted)
      feedback_fn: function used to report each problem found

    Returns:
      True if any check failed ("bad"), False otherwise.

    """
    # compares ganeti version
    local_version = constants.PROTOCOL_VERSION
    if not remote_version:
      # no version data at all means the node is unreachable; the other
      # checks would be meaningless, so bail out immediately
      feedback_fn("  - ERROR: connection to %s failed" % (node))
      return True

    if local_version != remote_version:
      feedback_fn("  - ERROR: sw version mismatch: master %s, node(%s) %s" %
                      (local_version, node, remote_version))
      return True

    # checks vg existence and size > 20G

    bad = False
    if not vglist:
      feedback_fn("  - ERROR: unable to check volume groups on node %s." %
                      (node,))
      bad = True
    else:
      # CheckVolumeGroupSize returns an error string, or a false value
      # when the volume group is fine
      vgstatus = utils.CheckVolumeGroupSize(vglist, self.cfg.GetVGName(),
                                            constants.MIN_VG_SIZE)
      if vgstatus:
        feedback_fn("  - ERROR: %s on node %s" % (vgstatus, node))
        bad = True

    # checks config file checksum
    # checks ssh to any

    if 'filelist' not in node_result:
      bad = True
      feedback_fn("  - ERROR: node hasn't returned file checksum data")
    else:
      remote_cksum = node_result['filelist']
      for file_name in file_list:
        if file_name not in remote_cksum:
          bad = True
          feedback_fn("  - ERROR: file '%s' missing" % file_name)
        elif remote_cksum[file_name] != local_cksum[file_name]:
          bad = True
          feedback_fn("  - ERROR: file '%s' has wrong checksum" % file_name)

    if 'nodelist' not in node_result:
      bad = True
      feedback_fn("  - ERROR: node hasn't returned node ssh connectivity data")
    else:
      if node_result['nodelist']:
        bad = True
        # NOTE(review): the loop variable shadows the `node` argument from
        # here on; harmless today since the argument is not read again below
        for node in node_result['nodelist']:
          feedback_fn("  - ERROR: ssh communication with node '%s': %s" %
                          (node, node_result['nodelist'][node]))
    if 'node-net-test' not in node_result:
      bad = True
      feedback_fn("  - ERROR: node hasn't returned node tcp connectivity data")
    else:
      if node_result['node-net-test']:
        bad = True
        nlist = utils.NiceSort(node_result['node-net-test'].keys())
        for node in nlist:
          feedback_fn("  - ERROR: tcp communication with node '%s': %s" %
                          (node, node_result['node-net-test'][node]))

    # any value under the 'hypervisor' key is an error message
    hyp_result = node_result.get('hypervisor', None)
    if hyp_result is not None:
      feedback_fn("  - ERROR: hypervisor verify failure: '%s'" % hyp_result)
    return bad

  def _VerifyInstance(self, instance, instanceconfig, node_vol_is,
                      node_instance, feedback_fn):
    """Verify an instance.

    This function checks to see if the required block devices are
    available on the instance's node, and that the instance is running
    exactly where it should be.

    Args:
      instance: the instance name
      instanceconfig: the instance's configuration object
      node_vol_is: dict of node -> volumes actually found on that node
      node_instance: dict of node -> instances actually running there
      feedback_fn: function used to report each problem found

    Returns:
      True if any check failed, False otherwise.

    """
    bad = False

    node_current = instanceconfig.primary_node

    # volumes the instance should have, grouped by node
    node_vol_should = {}
    instanceconfig.MapLVsByNode(node_vol_should)

    for node in node_vol_should:
      for volume in node_vol_should[node]:
        if node not in node_vol_is or volume not in node_vol_is[node]:
          feedback_fn("  - ERROR: volume %s missing on node %s" %
                          (volume, node))
          bad = True

    # unless configured down, the instance must be live on its primary
    if not instanceconfig.status == 'down':
      if (node_current not in node_instance or
          not instance in node_instance[node_current]):
        feedback_fn("  - ERROR: instance %s not running on node %s" %
                        (instance, node_current))
        bad = True

    # ... and must not be running anywhere else
    for node in node_instance:
      if (not node == node_current):
        if instance in node_instance[node]:
          feedback_fn("  - ERROR: instance %s should not run on node %s" %
                          (instance, node))
          bad = True

    return bad

  def _VerifyOrphanVolumes(self, node_vol_should, node_vol_is, feedback_fn):
    """Verify if there are any unknown volumes in the cluster.

    The .os, .swap and backup volumes are ignored. All other volumes are
    reported as unknown.

    Returns:
      True if any orphan volume was found, False otherwise.

    """
    bad = False

    for node in node_vol_is:
      for volume in node_vol_is[node]:
        if node not in node_vol_should or volume not in node_vol_should[node]:
          feedback_fn("  - ERROR: volume %s on node %s should not exist" %
                      (volume, node))
          bad = True
    return bad

  def _VerifyOrphanInstances(self, instancelist, node_instance, feedback_fn):
    """Verify the list of running instances.

    This checks what instances are running but unknown to the cluster.

    Returns:
      True if any unknown instance was found, False otherwise.

    """
    bad = False
    for node in node_instance:
      for runninginstance in node_instance[node]:
        if runninginstance not in instancelist:
          feedback_fn("  - ERROR: instance %s on node %s should not exist" %
                          (runninginstance, node))
          bad = True
    return bad

  def _VerifyNPlusOneMemory(self, node_info, instance_cfg, feedback_fn):
    """Verify N+1 Memory Resilience.

    Check that if one single node dies we can still start all the instances it
    was primary for.

    Returns:
      True if some node cannot absorb the failover load, False otherwise.

    """
    bad = False

    for node, nodeinfo in node_info.iteritems():
      # This code checks that every node which is now listed as secondary has
      # enough memory to host all instances it is supposed to should a single
      # other node in the cluster fail.
      # FIXME: not ready for failover to an arbitrary node
      # FIXME: does not support file-backed instances
      # WARNING: we currently take into account down instances as well as up
      # ones, considering that even if they're down someone might want to start
      # them even in the event of a node failure.
      for prinode, instances in nodeinfo['sinst-by-pnode'].iteritems():
        needed_mem = 0
        for instance in instances:
          needed_mem += instance_cfg[instance].memory
        if nodeinfo['mfree'] < needed_mem:
          feedback_fn("  - ERROR: not enough memory on node %s to accomodate"
                      " failovers should node %s fail" % (node, prinode))
          bad = True
    return bad

  def CheckPrereq(self):
    """Check prerequisites.

    Transform the list of checks we're going to skip into a set and check that
    all its members are valid.

    """
    self.skip_set = frozenset(self.op.skip_checks)
    if not constants.VERIFY_OPTIONAL_CHECKS.issuperset(self.skip_set):
      raise errors.OpPrereqError("Invalid checks to be skipped specified")

  def BuildHooksEnv(self):
    """Build hooks env.

    Cluster-Verify hooks just run in the post phase and their failure makes
    the output be logged in the verify output and the verification to fail.

    """
    all_nodes = self.cfg.GetNodeList()
    # TODO: populate the environment with useful information for verify hooks
    env = {}
    return env, [], all_nodes

  def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various test on nodes.

    Returns:
      int(bad): 0 when everything verified fine, 1 otherwise.

    """
    bad = False
    feedback_fn("* Verifying global settings")
    for msg in self.cfg.VerifyConfig():
      feedback_fn("  - ERROR: %s" % msg)

    vg_name = self.cfg.GetVGName()
    nodelist = utils.NiceSort(self.cfg.GetNodeList())
    nodeinfo = [self.cfg.GetNodeInfo(nname) for nname in nodelist]
    instancelist = utils.NiceSort(self.cfg.GetInstanceList())
    i_non_redundant = [] # Non redundant instances
    node_volume = {}
    node_instance = {}
    node_info = {}
    instance_cfg = {}

    # FIXME: verify OS list
    # do local checksums
    file_names = list(self.sstore.GetFileList())
    file_names.append(constants.SSL_CERT_FILE)
    file_names.append(constants.CLUSTER_CONF_FILE)
    local_checksums = utils.FingerprintFiles(file_names)

    # gather everything from the nodes in a few bulk rpc calls up front
    feedback_fn("* Gathering data (%d nodes)" % len(nodelist))
    all_volumeinfo = rpc.call_volume_list(nodelist, vg_name)
    all_instanceinfo = rpc.call_instance_list(nodelist)
    all_vglist = rpc.call_vg_list(nodelist)
    node_verify_param = {
      'filelist': file_names,
      'nodelist': nodelist,
      'hypervisor': None,
      'node-net-test': [(node.name, node.primary_ip, node.secondary_ip)
                        for node in nodeinfo]
      }
    all_nvinfo = rpc.call_node_verify(nodelist, node_verify_param)
    all_rversion = rpc.call_version(nodelist)
    all_ninfo = rpc.call_node_info(nodelist, self.cfg.GetVGName())

    for node in nodelist:
      feedback_fn("* Verifying node %s" % node)
      result = self._VerifyNode(node, file_names, local_checksums,
                                all_vglist[node], all_nvinfo[node],
                                all_rversion[node], feedback_fn)
      bad = bad or result

      # node_volume
      volumeinfo = all_volumeinfo[node]

      if isinstance(volumeinfo, basestring):
        # a string result is an LVM error message from the node
        feedback_fn("  - ERROR: LVM problem on node %s: %s" %
                    (node, volumeinfo[-400:].encode('string_escape')))
        bad = True
        node_volume[node] = {}
      elif not isinstance(volumeinfo, dict):
        feedback_fn("  - ERROR: connection to %s failed" % (node,))
        bad = True
        continue
      else:
        node_volume[node] = volumeinfo

      # node_instance
      nodeinstance = all_instanceinfo[node]
      if type(nodeinstance) != list:
        feedback_fn("  - ERROR: connection to %s failed" % (node,))
        bad = True
        continue

      node_instance[node] = nodeinstance

      # node_info
      # NOTE(review): this rebinds `nodeinfo`, shadowing the list of node
      # objects built before the loop; the list is not used again, so this
      # is harmless today but fragile
      nodeinfo = all_ninfo[node]
      if not isinstance(nodeinfo, dict):
        feedback_fn("  - ERROR: connection to %s failed" % (node,))
        bad = True
        continue

      try:
        node_info[node] = {
          "mfree": int(nodeinfo['memory_free']),
          "dfree": int(nodeinfo['vg_free']),
          "pinst": [],
          "sinst": [],
          # dictionary holding all instances this node is secondary for,
          # grouped by their primary node. Each key is a cluster node, and each
          # value is a list of instances which have the key as primary and the
          # current node as secondary.  this is handy to calculate N+1 memory
          # availability if you can only failover from a primary to its
          # secondary.
          "sinst-by-pnode": {},
        }
      except ValueError:
        feedback_fn("  - ERROR: invalid value returned from node %s" % (node,))
        bad = True
        continue

    node_vol_should = {}

    for instance in instancelist:
      feedback_fn("* Verifying instance %s" % instance)
      inst_config = self.cfg.GetInstanceInfo(instance)
      result =  self._VerifyInstance(instance, inst_config, node_volume,
                                     node_instance, feedback_fn)
      bad = bad or result

      inst_config.MapLVsByNode(node_vol_should)

      instance_cfg[instance] = inst_config

      pnode = inst_config.primary_node
      if pnode in node_info:
        node_info[pnode]['pinst'].append(instance)
      else:
        feedback_fn("  - ERROR: instance %s, connection to primary node"
                    " %s failed" % (instance, pnode))
        bad = True

      # If the instance is non-redundant we cannot survive losing its primary
      # node, so we are not N+1 compliant. On the other hand we have no disk
      # templates with more than one secondary so that situation is not well
      # supported either.
      # FIXME: does not support file-backed instances
      if len(inst_config.secondary_nodes) == 0:
        i_non_redundant.append(instance)
      elif len(inst_config.secondary_nodes) > 1:
        feedback_fn("  - WARNING: multiple secondaries for instance %s"
                    % instance)

      for snode in inst_config.secondary_nodes:
        if snode in node_info:
          node_info[snode]['sinst'].append(instance)
          if pnode not in node_info[snode]['sinst-by-pnode']:
            node_info[snode]['sinst-by-pnode'][pnode] = []
          node_info[snode]['sinst-by-pnode'][pnode].append(instance)
        else:
          feedback_fn("  - ERROR: instance %s, connection to secondary node"
                      " %s failed" % (instance, snode))

    feedback_fn("* Verifying orphan volumes")
    result = self._VerifyOrphanVolumes(node_vol_should, node_volume,
                                       feedback_fn)
    bad = bad or result

    feedback_fn("* Verifying remaining instances")
    result = self._VerifyOrphanInstances(instancelist, node_instance,
                                         feedback_fn)
    bad = bad or result

    if constants.VERIFY_NPLUSONE_MEM not in self.skip_set:
      feedback_fn("* Verifying N+1 Memory redundancy")
      result = self._VerifyNPlusOneMemory(node_info, instance_cfg, feedback_fn)
      bad = bad or result

    feedback_fn("* Other Notes")
    if i_non_redundant:
      feedback_fn("  - NOTICE: %d non-redundant instance(s) found."
                  % len(i_non_redundant))

    return int(bad)

  def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
    """Analyze the post-hooks' result, handle it, and send some
    nicely-formatted feedback back to the user.

    Args:
      phase: the hooks phase that has just been run
      hooks_results: the results of the multi-node hooks rpc call
      feedback_fn: function to send feedback back to the caller
      lu_result: previous Exec result

    """
    # We only really run POST phase hooks, and are only interested in their results
    # NOTE(review): the return below is inside this if, so the PRE phase
    # implicitly returns None rather than lu_result -- confirm intended
    if phase == constants.HOOKS_PHASE_POST:
      # Used to change hooks' output to proper indentation
      indent_re = re.compile('^', re.M)
      feedback_fn("* Hooks Results")
      if not hooks_results:
        feedback_fn("  - ERROR: general communication failure")
        lu_result = 1
      else:
        for node_name in hooks_results:
          show_node_header = True
          res = hooks_results[node_name]
          if res is False or not isinstance(res, list):
            feedback_fn("    Communication failure")
            lu_result = 1
            continue
          for script, hkr, output in res:
            if hkr == constants.HKR_FAIL:
              # The node header is only shown once, if there are
              # failing hooks on that node
              if show_node_header:
                feedback_fn("  Node %s:" % node_name)
                show_node_header = False
              feedback_fn("    ERROR: Script %s failed, output:" % script)
              output = indent_re.sub('      ', output)
              feedback_fn("%s" % output)
              lu_result = 1

      return lu_result
771

    
772

    
773
class LUVerifyDisks(NoHooksLU):
  """Verifies the cluster disks status.

  """
  _OP_REQP = []

  def CheckPrereq(self):
    """Check prerequisites.

    This has no prerequisites.

    """
    pass

  def Exec(self, feedback_fn):
    """Verify integrity of cluster disks.

    Returns:
      a 4-tuple (nodes, n_lvm, instances, missing) where:
        nodes: list of nodes that were unreachable or returned bad data
        n_lvm: dict of node -> LVM error string for nodes with LV problems
        instances: list of instance names that have offline (inactive) LVs
        missing: dict of instance name -> list of (node, lv) pairs that
          should exist but were not reported by any node

    """
    # `result` aliases the 4-tuple so the named parts can be filled in
    result = res_nodes, res_nlvm, res_instances, res_missing = [], {}, [], {}

    vg_name = self.cfg.GetVGName()
    nodes = utils.NiceSort(self.cfg.GetNodeList())
    instances = [self.cfg.GetInstanceInfo(name)
                 for name in self.cfg.GetInstanceList()]

    # map each (node, volume) pair to its owning instance; only running,
    # network-mirrored instances are relevant for this check
    nv_dict = {}
    for inst in instances:
      inst_lvs = {}
      if (inst.status != "up" or
          inst.disk_template not in constants.DTS_NET_MIRROR):
        continue
      inst.MapLVsByNode(inst_lvs)
      # transform { iname: {node: [vol,],},} to {(node, vol): iname}
      for node, vol_list in inst_lvs.iteritems():
        for vol in vol_list:
          nv_dict[(node, vol)] = inst

    if not nv_dict:
      # nothing to verify
      return result

    node_lvs = rpc.call_volume_list(nodes, vg_name)

    for node in nodes:
      # node_volume
      lvs = node_lvs[node]

      if isinstance(lvs, basestring):
        # a string result is an LVM error message from the node
        logger.Info("error enumerating LVs on node %s: %s" % (node, lvs))
        res_nlvm[node] = lvs
        # BUGFIX: previously this branch fell through to the iteration
        # below, calling .iteritems() on the error string and crashing
        continue
      elif not isinstance(lvs, dict):
        logger.Info("connection to node %s failed or invalid data returned" %
                    (node,))
        res_nodes.append(node)
        continue

      for lv_name, (_, lv_inactive, lv_online) in lvs.iteritems():
        inst = nv_dict.pop((node, lv_name), None)
        if (not lv_online and inst is not None
            and inst.name not in res_instances):
          res_instances.append(inst.name)

    # any leftover items in nv_dict are missing LVs, let's arrange the
    # data better
    for key, inst in nv_dict.iteritems():
      if inst.name not in res_missing:
        res_missing[inst.name] = []
      res_missing[inst.name].append(key)

    return result
843

    
844

    
845
class LURenameCluster(LogicalUnit):
  """Rename the cluster.

  Updates the cluster name (and, if the new name resolves differently,
  the master IP) in ssconf and redistributes the changed ssconf files
  to all nodes.

  """
  HPATH = "cluster-rename"
  HTYPE = constants.HTYPE_CLUSTER
  _OP_REQP = ["name"]

  def BuildHooksEnv(self):
    """Build hooks env.

    Hooks run only on the master node, both pre and post.

    """
    env = {
      "OP_TARGET": self.sstore.GetClusterName(),
      "NEW_NAME": self.op.name,
      }
    mn = self.sstore.GetMasterNode()
    return env, [mn], [mn]

  def CheckPrereq(self):
    """Verify that the passed name is a valid one.

    Resolves the new name, rejects a no-op rename (same name and IP),
    and refuses a new IP that is already reachable on the network.

    """
    hostname = utils.HostInfo(self.op.name)

    new_name = hostname.name
    self.ip = new_ip = hostname.ip
    old_name = self.sstore.GetClusterName()
    old_ip = self.sstore.GetMasterIP()
    if new_name == old_name and new_ip == old_ip:
      raise errors.OpPrereqError("Neither the name nor the IP address of the"
                                 " cluster has changed")
    if new_ip != old_ip:
      # a host answering on the new IP means the address is already in use
      result = utils.RunCmd(["fping", "-q", new_ip])
      if not result.failed:
        raise errors.OpPrereqError("The given cluster IP address (%s) is"
                                   " reachable on the network. Aborting." %
                                   new_ip)

    self.op.name = new_name

  def Exec(self, feedback_fn):
    """Rename the cluster.

    The master IP is stopped while ssconf is being rewritten, and
    restarted afterwards even if the distribution partially fails.

    """
    clustername = self.op.name
    ip = self.ip
    ss = self.sstore

    # shutdown the master IP
    master = ss.GetMasterNode()
    if not rpc.call_node_stop_master(master):
      raise errors.OpExecError("Could not disable the master role")

    try:
      # modify the sstore
      ss.SetKey(ss.SS_MASTER_IP, ip)
      ss.SetKey(ss.SS_CLUSTER_NAME, clustername)

      # Distribute updated ss config to all nodes
      myself = self.cfg.GetNodeInfo(master)
      dist_nodes = self.cfg.GetNodeList()
      if myself.name in dist_nodes:
        dist_nodes.remove(myself.name)

      logger.Debug("Copying updated ssconf data to all nodes")
      for keyname in [ss.SS_CLUSTER_NAME, ss.SS_MASTER_IP]:
        fname = ss.KeyToFilename(keyname)
        result = rpc.call_upload_file(dist_nodes, fname)
        for to_node in dist_nodes:
          if not result[to_node]:
            # best-effort distribution: log the failure and continue
            logger.Error("copy of file %s to node %s failed" %
                         (fname, to_node))
    finally:
      # always try to restore the master role, whatever happened above
      if not rpc.call_node_start_master(master):
        logger.Error("Could not re-enable the master role on the master,"
                     " please restart manually.")
def _RecursiveCheckIfLVMBased(disk):
  """Check if the given disk or its children are lvm-based.

  Args:
    disk: ganeti.objects.Disk object

  Returns:
    boolean indicating whether a LD_LV dev_type was found or not

  """
  if disk.children:
    for chdisk in disk.children:
      # short-circuit as soon as any descendant is LVM-based
      if _RecursiveCheckIfLVMBased(chdisk):
        return True
  return disk.dev_type == constants.LD_LV
class LUSetClusterParams(LogicalUnit):
  """Change the parameters of the cluster.

  Currently only the volume group name can be changed; passing an empty
  vg_name disables LVM storage entirely.

  """
  HPATH = "cluster-modify"
  HTYPE = constants.HTYPE_CLUSTER
  _OP_REQP = []

  def BuildHooksEnv(self):
    """Build hooks env.

    Hooks run only on the master node, both pre and post.

    """
    env = {
      "OP_TARGET": self.sstore.GetClusterName(),
      "NEW_VG_NAME": self.op.vg_name,
      }
    mn = self.sstore.GetMasterNode()
    return env, [mn], [mn]

  def CheckPrereq(self):
    """Check prerequisites.

    This checks whether the given params don't conflict and
    if the given volume group is valid.

    """
    if not self.op.vg_name:
      # disabling LVM is only allowed if no instance uses LVM disks
      instances = [self.cfg.GetInstanceInfo(name)
                   for name in self.cfg.GetInstanceList()]
      for inst in instances:
        for disk in inst.disks:
          if _RecursiveCheckIfLVMBased(disk):
            raise errors.OpPrereqError("Cannot disable lvm storage while"
                                       " lvm-based instances exist")

    # if vg_name not None, checks given volume group on all nodes
    if self.op.vg_name:
      node_list = self.cfg.GetNodeList()
      vglist = rpc.call_vg_list(node_list)
      for node in node_list:
        vgstatus = utils.CheckVolumeGroupSize(vglist[node], self.op.vg_name,
                                              constants.MIN_VG_SIZE)
        if vgstatus:
          raise errors.OpPrereqError("Error on node '%s': %s" %
                                     (node, vgstatus))

  def Exec(self, feedback_fn):
    """Change the parameters of the cluster.

    """
    if self.op.vg_name != self.cfg.GetVGName():
      self.cfg.SetVGName(self.op.vg_name)
    else:
      feedback_fn("Cluster LVM configuration already in desired"
                  " state, not changing")
def _WaitForSync(cfgw, instance, proc, oneshot=False, unlock=False):
  """Sleep and poll for an instance's disk to sync.

  Polls the primary node's mirror status for all of the instance's
  disks until they report done (or once, if oneshot is set).

  Args:
    cfgw: config writer, used to set the disk IDs for the primary node
    instance: the instance whose disks are polled
    proc: processor object, used for LogInfo/LogWarning feedback
    oneshot: if True, poll once and return without waiting for sync
    unlock: historical flag for releasing the command lock while
      sleeping; currently a no-op (the lock calls are commented out)

  Returns:
    True if no disk is degraded, False otherwise

  """
  if not instance.disks:
    # nothing to wait for
    return True

  if not oneshot:
    proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)

  node = instance.primary_node

  for dev in instance.disks:
    cfgw.SetDiskID(dev, node)

  retries = 0
  while True:
    max_time = 0
    done = True
    cumul_degraded = False
    rstats = rpc.call_blockdev_getmirrorstatus(node, instance.disks)
    if not rstats:
      proc.LogWarning("Can't get any data from node %s" % node)
      retries += 1
      if retries >= 10:
        # give up after ten consecutive failed polls
        raise errors.RemoteError("Can't contact node %s for mirror data,"
                                 " aborting." % node)
      time.sleep(6)
      continue
    retries = 0
    for i in range(len(rstats)):
      mstat = rstats[i]
      if mstat is None:
        proc.LogWarning("Can't compute data for node %s/%s" %
                        (node, instance.disks[i].iv_name))
        continue
      # we ignore the ldisk parameter
      perc_done, est_time, is_degraded, _ = mstat
      cumul_degraded = cumul_degraded or (is_degraded and perc_done is None)
      if perc_done is not None:
        # a percentage means the device is still syncing
        done = False
        if est_time is not None:
          rem_time = "%d estimated seconds remaining" % est_time
          max_time = est_time
        else:
          rem_time = "no time estimate"
        proc.LogInfo("- device %s: %5.2f%% done, %s" %
                     (instance.disks[i].iv_name, perc_done, rem_time))
    if done or oneshot:
      break

    if unlock:
      #utils.Unlock('cmd')
      pass
    try:
      # sleep up to the estimated remaining time, capped at one minute
      time.sleep(min(60, max_time))
    finally:
      if unlock:
        #utils.Lock('cmd')
        pass

  if done:
    proc.LogInfo("Instance %s's disks are in sync." % instance.name)
  return not cumul_degraded
def _CheckDiskConsistency(cfgw, dev, node, on_primary, ldisk=False):
  """Check that mirrors are not degraded.

  The ldisk parameter, if True, will change the test from the
  is_degraded attribute (which represents overall non-ok status for
  the device(s)) to the ldisk (representing the local storage status).

  Returns:
    True if the device (and recursively its children) is consistent,
    False otherwise

  """
  cfgw.SetDiskID(dev, node)
  if ldisk:
    # index of the ldisk field in the blockdev_find result tuple
    idx = 6
  else:
    # index of the is_degraded field
    idx = 5

  result = True
  if on_primary or dev.AssembleOnSecondary():
    rstats = rpc.call_blockdev_find(node, dev)
    if not rstats:
      logger.ToStderr("Node %s: Disk degraded, not found or node down" % node)
      result = False
    else:
      result = result and (not rstats[idx])
  if dev.children:
    for child in dev.children:
      # NOTE(review): ldisk is not propagated to children here, so child
      # devices are always checked via is_degraded — confirm intentional
      result = result and _CheckDiskConsistency(cfgw, child, node, on_primary)

  return result
class LUDiagnoseOS(NoHooksLU):
  """Logical unit for OS diagnose/query.

  """
  _OP_REQP = ["output_fields", "names"]

  def CheckPrereq(self):
    """Check prerequisites.

    This always succeeds, since this is a pure query LU.

    """
    if self.op.names:
      raise errors.OpPrereqError("Selective OS query not supported")

    self.dynamic_fields = frozenset(["name", "valid", "node_status"])
    _CheckOutputFields(static=[],
                       dynamic=self.dynamic_fields,
                       selected=self.op.output_fields)

  @staticmethod
  def _DiagnoseByOS(node_list, rlist):
    """Remaps a per-node return list into an a per-os per-node dictionary

      Args:
        node_list: a list with the names of all nodes
        rlist: a map with node names as keys and OS objects as values

      Returns:
        map: a map with osnames as keys and as value another map, with
             nodes as
             keys and list of OS objects as values
             e.g. {"debian-etch": {"node1": [<object>,...],
                                   "node2": [<object>,]}
                  }

    """
    all_os = {}
    for node_name, nr in rlist.iteritems():
      if not nr:
        # node returned nothing (failure or no OSes); skip it
        continue
      for os_obj in nr:
        if os_obj.name not in all_os:
          # build a list of nodes for this os containing empty lists
          # for each node in node_list
          all_os[os_obj.name] = {}
          for nname in node_list:
            all_os[os_obj.name][nname] = []
        all_os[os_obj.name][node_name].append(os_obj)
    return all_os

  def Exec(self, feedback_fn):
    """Compute the list of OSes.

    """
    node_list = self.cfg.GetNodeList()
    node_data = rpc.call_os_diagnose(node_list)
    if node_data == False:
      raise errors.OpExecError("Can't gather the list of OSes")
    pol = self._DiagnoseByOS(node_list, node_data)
    output = []
    for os_name, os_data in pol.iteritems():
      row = []
      for field in self.op.output_fields:
        if field == "name":
          val = os_name
        elif field == "valid":
          # an OS is valid only if every node has at least one valid copy
          val = utils.all([osl and osl[0] for osl in os_data.values()])
        elif field == "node_status":
          val = {}
          for node_name, nos_list in os_data.iteritems():
            val[node_name] = [(v.status, v.path) for v in nos_list]
        else:
          raise errors.ParameterError(field)
        row.append(val)
      output.append(row)

    return output
class LURemoveNode(LogicalUnit):
  """Logical unit for removing a node.

  """
  HPATH = "node-remove"
  HTYPE = constants.HTYPE_NODE
  _OP_REQP = ["node_name"]

  def BuildHooksEnv(self):
    """Build hooks env.

    This doesn't run on the target node in the pre phase as a failed
    node would not allows itself to run.

    """
    env = {
      "OP_TARGET": self.op.node_name,
      "NODE_NAME": self.op.node_name,
      }
    all_nodes = self.cfg.GetNodeList()
    all_nodes.remove(self.op.node_name)
    return env, all_nodes, all_nodes

  def CheckPrereq(self):
    """Check prerequisites.

    This checks:
     - the node exists in the configuration
     - it does not have primary or secondary instances
     - it's not the master

    Any errors are signalled by raising errors.OpPrereqError.

    """
    node = self.cfg.GetNodeInfo(self.cfg.ExpandNodeName(self.op.node_name))
    if node is None:
      # use the call form of raise, consistent with the rest of the module
      # (the old "raise Exc, (args)" comma form raised with a tuple arg)
      raise errors.OpPrereqError("Node '%s' is unknown." % self.op.node_name)

    instance_list = self.cfg.GetInstanceList()

    masternode = self.sstore.GetMasterNode()
    if node.name == masternode:
      raise errors.OpPrereqError("Node is the master node,"
                                 " you need to failover first.")

    for instance_name in instance_list:
      instance = self.cfg.GetInstanceInfo(instance_name)
      if node.name == instance.primary_node:
        raise errors.OpPrereqError("Instance %s still running on the node,"
                                   " please remove first." % instance_name)
      if node.name in instance.secondary_nodes:
        raise errors.OpPrereqError("Instance %s has node as a secondary,"
                                   " please remove first." % instance_name)
    self.op.node_name = node.name
    self.node = node

  def Exec(self, feedback_fn):
    """Removes the node from the cluster.

    Stops the node daemon, removes the node from the configuration and
    cleans up /etc/hosts.

    """
    node = self.node
    logger.Info("stopping the node daemon and removing configs from node %s" %
                node.name)

    rpc.call_node_leave_cluster(node.name)

    self.ssh.Run(node.name, 'root', "%s stop" % constants.NODE_INITD_SCRIPT)

    logger.Info("Removing node %s from config" % node.name)

    self.cfg.RemoveNode(node.name)

    utils.RemoveHostFromEtcHosts(node.name)
class LUQueryNodes(NoHooksLU):
  """Logical unit for querying nodes.

  """
  _OP_REQP = ["output_fields", "names"]

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the fields required are valid output fields.

    """
    # fields that require a live RPC call to the nodes
    self.dynamic_fields = frozenset([
      "dtotal", "dfree",
      "mtotal", "mnode", "mfree",
      "bootid",
      "ctotal",
      ])

    _CheckOutputFields(static=["name", "pinst_cnt", "sinst_cnt",
                               "pinst_list", "sinst_list",
                               "pip", "sip"],
                       dynamic=self.dynamic_fields,
                       selected=self.op.output_fields)

    self.wanted = _GetWantedNodes(self, self.op.names)

  def Exec(self, feedback_fn):
    """Computes the list of nodes and their attributes.

    Returns a list of rows, one per node, each row holding the values
    of self.op.output_fields in order.

    """
    nodenames = self.wanted
    nodelist = [self.cfg.GetNodeInfo(name) for name in nodenames]

    # begin data gathering

    if self.dynamic_fields.intersection(self.op.output_fields):
      # only do the (expensive) node RPC if live fields were requested
      live_data = {}
      node_data = rpc.call_node_info(nodenames, self.cfg.GetVGName())
      for name in nodenames:
        nodeinfo = node_data.get(name, None)
        if nodeinfo:
          live_data[name] = {
            "mtotal": utils.TryConvert(int, nodeinfo['memory_total']),
            "mnode": utils.TryConvert(int, nodeinfo['memory_dom0']),
            "mfree": utils.TryConvert(int, nodeinfo['memory_free']),
            "dtotal": utils.TryConvert(int, nodeinfo['vg_size']),
            "dfree": utils.TryConvert(int, nodeinfo['vg_free']),
            "ctotal": utils.TryConvert(int, nodeinfo['cpu_total']),
            "bootid": nodeinfo['bootid'],
            }
        else:
          live_data[name] = {}
    else:
      live_data = dict.fromkeys(nodenames, {})

    node_to_primary = dict([(name, set()) for name in nodenames])
    node_to_secondary = dict([(name, set()) for name in nodenames])

    inst_fields = frozenset(("pinst_cnt", "pinst_list",
                             "sinst_cnt", "sinst_list"))
    if inst_fields & frozenset(self.op.output_fields):
      # instance-relation fields requested: build node->instances maps
      instancelist = self.cfg.GetInstanceList()

      for instance_name in instancelist:
        inst = self.cfg.GetInstanceInfo(instance_name)
        if inst.primary_node in node_to_primary:
          node_to_primary[inst.primary_node].add(inst.name)
        for secnode in inst.secondary_nodes:
          if secnode in node_to_secondary:
            node_to_secondary[secnode].add(inst.name)

    # end data gathering

    output = []
    for node in nodelist:
      node_output = []
      for field in self.op.output_fields:
        if field == "name":
          val = node.name
        elif field == "pinst_list":
          val = list(node_to_primary[node.name])
        elif field == "sinst_list":
          val = list(node_to_secondary[node.name])
        elif field == "pinst_cnt":
          val = len(node_to_primary[node.name])
        elif field == "sinst_cnt":
          val = len(node_to_secondary[node.name])
        elif field == "pip":
          val = node.primary_ip
        elif field == "sip":
          val = node.secondary_ip
        elif field in self.dynamic_fields:
          val = live_data[node.name].get(field, None)
        else:
          raise errors.ParameterError(field)
        node_output.append(val)
      output.append(node_output)

    return output
class LUQueryNodeVolumes(NoHooksLU):
  """Logical unit for getting volumes on node(s).

  """
  _OP_REQP = ["nodes", "output_fields"]

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the fields required are valid output fields.

    """
    self.nodes = _GetWantedNodes(self, self.op.nodes)

    _CheckOutputFields(static=["node"],
                       dynamic=["phys", "vg", "name", "size", "instance"],
                       selected=self.op.output_fields)

  def Exec(self, feedback_fn):
    """Computes the list of nodes and their attributes.

    Returns one row per (node, volume) pair, with all values rendered
    as strings; nodes that failed to answer are silently skipped.

    """
    nodenames = self.nodes
    volumes = rpc.call_node_volumes(nodenames)

    ilist = [self.cfg.GetInstanceInfo(iname) for iname
             in self.cfg.GetInstanceList()]

    # per-instance map of node -> list of LV names, for volume ownership
    lv_by_node = dict([(inst, inst.MapLVsByNode()) for inst in ilist])

    output = []
    for node in nodenames:
      if node not in volumes or not volumes[node]:
        continue

      node_vols = volumes[node][:]
      node_vols.sort(key=lambda vol: vol['dev'])

      for vol in node_vols:
        node_output = []
        for field in self.op.output_fields:
          if field == "node":
            val = node
          elif field == "phys":
            val = vol['dev']
          elif field == "vg":
            val = vol['vg']
          elif field == "name":
            val = vol['name']
          elif field == "size":
            val = int(float(vol['size']))
          elif field == "instance":
            # find which instance (if any) owns this LV on this node
            for inst in ilist:
              if node not in lv_by_node[inst]:
                continue
              if vol['name'] in lv_by_node[inst][node]:
                val = inst.name
                break
            else:
              val = '-'
          else:
            raise errors.ParameterError(field)
          node_output.append(str(val))

        output.append(node_output)

    return output
class LUAddNode(LogicalUnit):
  """Logical unit for adding node to the cluster.

  """
  HPATH = "node-add"
  HTYPE = constants.HTYPE_NODE
  _OP_REQP = ["node_name"]

  def BuildHooksEnv(self):
    """Build hooks env.

    This will run on all nodes before, and on all nodes + the new node after.

    """
    env = {
      "OP_TARGET": self.op.node_name,
      "NODE_NAME": self.op.node_name,
      "NODE_PIP": self.op.primary_ip,
      "NODE_SIP": self.op.secondary_ip,
      }
    nodes_0 = self.cfg.GetNodeList()
    nodes_1 = nodes_0 + [self.op.node_name, ]
    return env, nodes_0, nodes_1

  def CheckPrereq(self):
    """Check prerequisites.

    This checks:
     - the new node is not already in the config
     - it is resolvable
     - its parameters (single/dual homed) matches the cluster

    Any errors are signalled by raising errors.OpPrereqError.

    """
    node_name = self.op.node_name
    cfg = self.cfg

    dns_data = utils.HostInfo(node_name)

    node = dns_data.name
    primary_ip = self.op.primary_ip = dns_data.ip
    secondary_ip = getattr(self.op, "secondary_ip", None)
    if secondary_ip is None:
      # single-homed node: secondary defaults to the primary address
      secondary_ip = primary_ip
    if not utils.IsValidIP(secondary_ip):
      raise errors.OpPrereqError("Invalid secondary IP given")
    self.op.secondary_ip = secondary_ip

    node_list = cfg.GetNodeList()
    if not self.op.readd and node in node_list:
      raise errors.OpPrereqError("Node %s is already in the configuration" %
                                 node)
    elif self.op.readd and node not in node_list:
      raise errors.OpPrereqError("Node %s is not in the configuration" % node)

    for existing_node_name in node_list:
      existing_node = cfg.GetNodeInfo(existing_node_name)

      if self.op.readd and node == existing_node_name:
        # a readded node must keep its previous addresses
        if (existing_node.primary_ip != primary_ip or
            existing_node.secondary_ip != secondary_ip):
          raise errors.OpPrereqError("Readded node doesn't have the same IP"
                                     " address configuration as before")
        continue

      if (existing_node.primary_ip == primary_ip or
          existing_node.secondary_ip == primary_ip or
          existing_node.primary_ip == secondary_ip or
          existing_node.secondary_ip == secondary_ip):
        raise errors.OpPrereqError("New node ip address(es) conflict with"
                                   " existing node %s" % existing_node.name)

    # check that the type of the node (single versus dual homed) is the
    # same as for the master
    myself = cfg.GetNodeInfo(self.sstore.GetMasterNode())
    master_singlehomed = myself.secondary_ip == myself.primary_ip
    newbie_singlehomed = secondary_ip == primary_ip
    if master_singlehomed != newbie_singlehomed:
      if master_singlehomed:
        raise errors.OpPrereqError("The master has no private ip but the"
                                   " new node has one")
      else:
        raise errors.OpPrereqError("The master has a private ip but the"
                                   " new node doesn't have one")

    # checks reachablity
    if not utils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
      raise errors.OpPrereqError("Node not reachable by ping")

    if not newbie_singlehomed:
      # check reachability from my secondary ip to newbie's secondary ip
      if not utils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
                           source=myself.secondary_ip):
        raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
                                   " based ping to noded port")

    self.new_node = objects.Node(name=node,
                                 primary_ip=primary_ip,
                                 secondary_ip=secondary_ip)

    if self.sstore.GetHypervisorType() == constants.HT_XEN_HVM31:
      # HVM clusters need the shared VNC password file on every node
      if not os.path.exists(constants.VNC_PASSWORD_FILE):
        raise errors.OpPrereqError("Cluster VNC password file %s missing" %
                                   constants.VNC_PASSWORD_FILE)

  def Exec(self, feedback_fn):
    """Adds the new node to the cluster.

    Pushes the node daemon password and SSL certificate over ssh,
    restarts the node daemon, verifies connectivity and hostname,
    distributes ssh keys and cluster files, and finally registers the
    node in the configuration (unless this is a readd).

    """
    new_node = self.new_node
    node = new_node.name

    # set up inter-node password and certificate and restarts the node daemon
    gntpass = self.sstore.GetNodeDaemonPassword()
    if not re.match('^[a-zA-Z0-9.]{1,64}$', gntpass):
      raise errors.OpExecError("ganeti password corruption detected")
    f = open(constants.SSL_CERT_FILE)
    try:
      gntpem = f.read(8192)
    finally:
      f.close()
    # in the base64 pem encoding, neither '!' nor '.' are valid chars,
    # so we use this to detect an invalid certificate; as long as the
    # cert doesn't contain this, the here-document will be correctly
    # parsed by the shell sequence below
    if re.search('^!EOF\.', gntpem, re.MULTILINE):
      raise errors.OpExecError("invalid PEM encoding in the SSL certificate")
    if not gntpem.endswith("\n"):
      raise errors.OpExecError("PEM must end with newline")
    logger.Info("copy cluster pass to %s and starting the node daemon" % node)

    # and then connect with ssh to set password and start ganeti-noded
    # note that all the below variables are sanitized at this point,
    # either by being constants or by the checks above
    ss = self.sstore
    mycommand = ("umask 077 && "
                 "echo '%s' > '%s' && "
                 "cat > '%s' << '!EOF.' && \n"
                 "%s!EOF.\n%s restart" %
                 (gntpass, ss.KeyToFilename(ss.SS_NODED_PASS),
                  constants.SSL_CERT_FILE, gntpem,
                  constants.NODE_INITD_SCRIPT))

    result = self.ssh.Run(node, 'root', mycommand, batch=False, ask_key=True)
    if result.failed:
      raise errors.OpExecError("Remote command on node %s, error: %s,"
                               " output: %s" %
                               (node, result.fail_reason, result.output))

    # check connectivity
    time.sleep(4)

    result = rpc.call_version([node])[node]
    if result:
      if constants.PROTOCOL_VERSION == result:
        logger.Info("communication to node %s fine, sw version %s match" %
                    (node, result))
      else:
        raise errors.OpExecError("Version mismatch master version %s,"
                                 " node version %s" %
                                 (constants.PROTOCOL_VERSION, result))
    else:
      raise errors.OpExecError("Cannot get version from the new node")

    # setup ssh on node
    logger.Info("copy ssh key to node %s" % node)
    priv_key, pub_key, _ = ssh.GetUserFiles(constants.GANETI_RUNAS)
    keyarray = []
    keyfiles = [constants.SSH_HOST_DSA_PRIV, constants.SSH_HOST_DSA_PUB,
                constants.SSH_HOST_RSA_PRIV, constants.SSH_HOST_RSA_PUB,
                priv_key, pub_key]

    for i in keyfiles:
      f = open(i, 'r')
      try:
        keyarray.append(f.read())
      finally:
        f.close()

    result = rpc.call_node_add(node, keyarray[0], keyarray[1], keyarray[2],
                               keyarray[3], keyarray[4], keyarray[5])

    if not result:
      raise errors.OpExecError("Cannot transfer ssh keys to the new node")

    # Add node to our /etc/hosts, and add key to known_hosts
    utils.AddHostToEtcHosts(new_node.name)

    if new_node.secondary_ip != new_node.primary_ip:
      # ask the new node itself to verify it can reach its secondary ip
      if not rpc.call_node_tcp_ping(new_node.name,
                                    constants.LOCALHOST_IP_ADDRESS,
                                    new_node.secondary_ip,
                                    constants.DEFAULT_NODED_PORT,
                                    10, False):
        raise errors.OpExecError("Node claims it doesn't have the secondary ip"
                                 " you gave (%s). Please fix and re-run this"
                                 " command." % new_node.secondary_ip)

    success, msg = self.ssh.VerifyNodeHostname(node)
    if not success:
      raise errors.OpExecError("Node '%s' claims it has a different hostname"
                               " than the one the resolver gives: %s."
                               " Please fix and re-run this command." %
                               (node, msg))

    # Distribute updated /etc/hosts and known_hosts to all nodes,
    # including the node just added
    myself = self.cfg.GetNodeInfo(self.sstore.GetMasterNode())
    dist_nodes = self.cfg.GetNodeList()
    if not self.op.readd:
      dist_nodes.append(node)
    if myself.name in dist_nodes:
      dist_nodes.remove(myself.name)

    logger.Debug("Copying hosts and known_hosts to all nodes")
    for fname in (constants.ETC_HOSTS, constants.SSH_KNOWN_HOSTS_FILE):
      result = rpc.call_upload_file(dist_nodes, fname)
      for to_node in dist_nodes:
        if not result[to_node]:
          # best-effort distribution: log the failure and continue
          logger.Error("copy of file %s to node %s failed" %
                       (fname, to_node))

    to_copy = ss.GetFileList()
    if self.sstore.GetHypervisorType() == constants.HT_XEN_HVM31:
      to_copy.append(constants.VNC_PASSWORD_FILE)
    for fname in to_copy:
      if not self.ssh.CopyFileToNode(node, fname):
        logger.Error("could not copy file %s to node %s" % (fname, node))

    if not self.op.readd:
      logger.Info("adding node %s to cluster.conf" % node)
      self.cfg.AddNode(new_node)
class LUMasterFailover(LogicalUnit):
  """Failover the master node to the current node.

  This is a special LU in that it must run on a non-master node.

  """
  HPATH = "master-failover"
  HTYPE = constants.HTYPE_CLUSTER
  REQ_MASTER = False
  _OP_REQP = []

  def BuildHooksEnv(self):
    """Build hooks env.

    This will run on the new master only in the pre phase, and on all
    the nodes in the post phase.

    """
    env = {
      "OP_TARGET": self.new_master,
      "NEW_MASTER": self.new_master,
      "OLD_MASTER": self.old_master,
      }
    return env, [self.new_master], self.cfg.GetNodeList()

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that we are not already the master.

    """
    self.new_master = utils.HostInfo().name
    self.old_master = self.sstore.GetMasterNode()

    if self.old_master == self.new_master:
      # FIX: message read "This commands" (grammar) in the original
      raise errors.OpPrereqError("This command must be run on the node"
                                 " where you want the new master to be."
                                 " %s is already the master" %
                                 self.old_master)

  def Exec(self, feedback_fn):
    """Failover the master node.

    This command, when run on a non-master node, will cause the current
    master to cease being master, and the non-master to become new
    master.

    """
    #TODO: do not rely on gethostname returning the FQDN
    logger.Info("setting master to %s, old master: %s" %
                (self.new_master, self.old_master))

    # Demoting the old master is best-effort: a failure is only logged,
    # since the new master can still take over.
    # FIX: message read "could disable" (missing "not") in the original
    if not rpc.call_node_stop_master(self.old_master):
      logger.Error("could not disable the master role on the old master"
                   " %s, please disable manually" % self.old_master)

    # Record the new master in the simple store and push the updated
    # file to every node so they all agree on who the master is.
    ss = self.sstore
    ss.SetKey(ss.SS_MASTER_NODE, self.new_master)
    if not rpc.call_upload_file(self.cfg.GetNodeList(),
                                ss.KeyToFilename(ss.SS_MASTER_NODE)):
      logger.Error("could not distribute the new simple store master file"
                   " to the other nodes, please check.")

    if not rpc.call_node_start_master(self.new_master):
      logger.Error("could not start the master role on the new master"
                   " %s, please check" % self.new_master)
      feedback_fn("Error in activating the master IP on the new master,"
                  " please fix manually.")
class LUQueryClusterInfo(NoHooksLU):
  """Query cluster configuration.

  """
  _OP_REQP = []
  REQ_MASTER = False

  def CheckPrereq(self):
    """No prerequisites are needed for this LU.

    """
    pass

  def Exec(self, feedback_fn):
    """Return the cluster configuration as a dict of static values.

    """
    return {
      "name": self.sstore.GetClusterName(),
      "software_version": constants.RELEASE_VERSION,
      "protocol_version": constants.PROTOCOL_VERSION,
      "config_version": constants.CONFIG_VERSION,
      "os_api_version": constants.OS_API_VERSION,
      "export_version": constants.EXPORT_VERSION,
      "master": self.sstore.GetMasterNode(),
      "architecture": (platform.architecture()[0], platform.machine()),
      "hypervisor_type": self.sstore.GetHypervisorType(),
      }
class LUClusterCopyFile(NoHooksLU):
  """Copy file to cluster.

  """
  _OP_REQP = ["nodes", "filename"]

  def CheckPrereq(self):
    """Check prerequisites.

    It should check that the named file exists and that the given list
    of nodes is valid.

    """
    if not os.path.exists(self.op.filename):
      raise errors.OpPrereqError("No such filename '%s'" % self.op.filename)

    self.nodes = _GetWantedNodes(self, self.op.nodes)

  def Exec(self, feedback_fn):
    """Copy a file from master to some nodes.

    Args:
      opts - class with options as members
      args - list containing a single element, the file name
    Opts used:
      nodes - list containing the name of target nodes; if empty, all nodes

    """
    fname = self.op.filename
    local_name = utils.HostInfo().name

    for target in self.nodes:
      # the file is already in place on the local node
      if target == local_name:
        continue
      if not self.ssh.CopyFileToNode(target, fname):
        logger.Error("Copy of file %s to node %s failed" % (fname, target))
class LUDumpClusterConfig(NoHooksLU):
  """Return a text-representation of the cluster-config.

  """
  _OP_REQP = []

  def CheckPrereq(self):
    """No prerequisites for this LU.

    """
    pass

  def Exec(self, feedback_fn):
    """Return a dump of the cluster configuration.

    """
    return self.cfg.DumpConfig()
class LURunClusterCommand(NoHooksLU):
  """Run a command on some nodes.

  """
  _OP_REQP = ["command", "nodes"]

  def CheckPrereq(self):
    """Check prerequisites.

    It checks that the given list of nodes is valid.

    """
    self.nodes = _GetWantedNodes(self, self.op.nodes)

  def Exec(self, feedback_fn):
    """Run a command on some nodes.

    """
    # move the master to the end of the node list, so a command which
    # affects the master does not cut off the remaining nodes
    master = self.sstore.GetMasterNode()
    if master in self.nodes:
      self.nodes.remove(master)
      self.nodes.append(master)

    output = []
    for node_name in self.nodes:
      cmd_result = self.ssh.Run(node_name, "root", self.op.command)
      output.append((node_name, cmd_result.output, cmd_result.exit_code))

    return output
class LUActivateInstanceDisks(NoHooksLU):
  """Bring up an instance's disks.

  """
  _OP_REQP = ["instance_name"]

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster.

    """
    full_name = self.cfg.ExpandInstanceName(self.op.instance_name)
    instance = self.cfg.GetInstanceInfo(full_name)
    if instance is None:
      raise errors.OpPrereqError("Instance '%s' not known" %
                                 self.op.instance_name)
    self.instance = instance

  def Exec(self, feedback_fn):
    """Activate the disks.

    """
    disks_ok, disks_info = _AssembleInstanceDisks(self.instance, self.cfg)
    if not disks_ok:
      raise errors.OpExecError("Cannot activate block devices")

    # the (node, iv_name, result) mapping of the assembled devices
    return disks_info
def _AssembleInstanceDisks(instance, cfg, ignore_secondaries=False):
  """Prepare the block devices for an instance.

  This sets up the block devices on all nodes.

  Args:
    instance: a ganeti.objects.Instance object
    ignore_secondaries: if true, errors on secondary nodes won't result
                        in an error return from the function

  Returns:
    false if the operation failed
    list of (host, instance_visible_name, node_visible_name) if the operation
         suceeded with the mapping from node devices to instance devices
  """
  device_info = []
  disks_ok = True
  iname = instance.name

  # With the two passes mechanism we try to reduce the window of
  # opportunity for the race condition of switching DRBD to primary
  # before handshaking occured, but we do not eliminate it

  # The proper fix would be to wait (with some limits) until the
  # connection has been made and drbd transitions from WFConnection
  # into any other network-connected state (Connected, SyncTarget,
  # SyncSource, etc.)

  # 1st pass, assemble on all nodes in secondary mode
  for inst_disk in instance.disks:
    for node_name, dev in inst_disk.ComputeNodeTree(instance.primary_node):
      cfg.SetDiskID(dev, node_name)
      result = rpc.call_blockdev_assemble(node_name, dev, iname, False)
      if not result:
        logger.Error("could not prepare block device %s on node %s"
                     " (is_primary=False, pass=1)" %
                     (inst_disk.iv_name, node_name))
        if not ignore_secondaries:
          disks_ok = False

  # FIXME: race condition on drbd migration to primary

  # 2nd pass, do only the primary node
  for inst_disk in instance.disks:
    for node_name, dev in inst_disk.ComputeNodeTree(instance.primary_node):
      if node_name != instance.primary_node:
        continue
      cfg.SetDiskID(dev, node_name)
      result = rpc.call_blockdev_assemble(node_name, dev, iname, True)
      if not result:
        logger.Error("could not prepare block device %s on node %s"
                     " (is_primary=True, pass=2)" %
                     (inst_disk.iv_name, node_name))
        disks_ok = False
    # NOTE: 'result' is intentionally the value left over from the last
    # assemble call of the inner loop (the primary-node one)
    device_info.append((instance.primary_node, inst_disk.iv_name, result))

  # leave the disks configured for the primary node
  # this is a workaround that would be fixed better by
  # improving the logical/physical id handling
  for disk in instance.disks:
    cfg.SetDiskID(disk, instance.primary_node)

  return disks_ok, device_info
def _StartInstanceDisks(cfg, instance, force):
  """Start the disks of an instance.

  Raises OpExecError if the disks cannot be assembled; in that case any
  partially-assembled devices are shut down again first.

  """
  disks_ok, _ = _AssembleInstanceDisks(instance, cfg,
                                       ignore_secondaries=force)
  if not disks_ok:
    _ShutdownInstanceDisks(instance, cfg)
    # only hint at --force when the caller passed an explicit False
    if force is not None and not force:
      logger.Error("If the message above refers to a secondary node,"
                   " you can retry the operation using '--force'.")
    raise errors.OpExecError("Disk consistency error")
class LUDeactivateInstanceDisks(NoHooksLU):
  """Shutdown an instance's disks.

  """
  _OP_REQP = ["instance_name"]

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster.

    """
    instance = self.cfg.GetInstanceInfo(
      self.cfg.ExpandInstanceName(self.op.instance_name))
    if instance is None:
      raise errors.OpPrereqError("Instance '%s' not known" %
                                 self.op.instance_name)
    self.instance = instance

  def Exec(self, feedback_fn):
    """Deactivate the disks

    """
    instance = self.instance
    ins_l = rpc.call_instance_list([instance.primary_node])
    ins_l = ins_l[instance.primary_node]
    # a non-list answer means the RPC to the node failed
    # FIX: use isinstance instead of the non-idiomatic
    # "not type(x) is list" comparison
    if not isinstance(ins_l, list):
      raise errors.OpExecError("Can't contact node '%s'" %
                               instance.primary_node)

    # refuse to touch the disks while the instance is running on them
    if self.instance.name in ins_l:
      raise errors.OpExecError("Instance is running, can't shutdown"
                               " block devices.")

    _ShutdownInstanceDisks(instance, self.cfg)
def _ShutdownInstanceDisks(instance, cfg, ignore_primary=False):
  """Shutdown block devices of an instance.

  This does the shutdown on all nodes of the instance.

  If ignore_primary is true, failures on the primary node do not affect
  the return value; failures on any other node (or on the primary when
  ignore_primary is false) make the function return False.

  """
  result = True
  for disk in instance.disks:
    for node_name, top_disk in disk.ComputeNodeTree(instance.primary_node):
      cfg.SetDiskID(top_disk, node_name)
      if not rpc.call_blockdev_shutdown(node_name, top_disk):
        logger.Error("could not shutdown block device %s on node %s" %
                     (disk.iv_name, node_name))
        if not ignore_primary or node_name != instance.primary_node:
          result = False
  return result
def _CheckNodeFreeMemory(cfg, node, reason, requested):
  """Checks if a node has enough free memory.

  This function check if a given node has the needed amount of free
  memory. In case the node has less memory or we cannot get the
  information from the node, this function raise an OpPrereqError
  exception.

  Args:
    - cfg: a ConfigWriter instance
    - node: the node name
    - reason: string to use in the error message
    - requested: the amount of memory in MiB

  """
  nodeinfo = rpc.call_node_info([node], cfg.GetVGName())
  # FIX: also guard against the node key missing from the answer, which
  # previously caused a raw KeyError instead of an OpPrereqError
  if not nodeinfo or not isinstance(nodeinfo, dict) or node not in nodeinfo:
    raise errors.OpPrereqError("Could not contact node %s for resource"
                             " information" % (node,))

  free_mem = nodeinfo[node].get('memory_free')
  if not isinstance(free_mem, int):
    raise errors.OpPrereqError("Can't compute free memory on node %s, result"
                             " was '%s'" % (node, free_mem))
  if requested > free_mem:
    raise errors.OpPrereqError("Not enough memory on node %s for %s:"
                             " needed %s MiB, available %s MiB" %
                             (node, reason, requested, free_mem))
class LUStartupInstance(LogicalUnit):
  """Starts an instance.

  """
  HPATH = "instance-start"
  HTYPE = constants.HTYPE_INSTANCE
  _OP_REQP = ["instance_name", "force"]

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    env = {
      "FORCE": self.op.force,
      }
    env.update(_BuildInstanceHookEnvByObject(self.instance))
    nl = ([self.sstore.GetMasterNode(), self.instance.primary_node] +
          list(self.instance.secondary_nodes))
    return env, nl, nl

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster.

    """
    instance = self.cfg.GetInstanceInfo(
      self.cfg.ExpandInstanceName(self.op.instance_name))
    if instance is None:
      raise errors.OpPrereqError("Instance '%s' not known" %
                                 self.op.instance_name)

    # check bridges existance
    _CheckInstanceBridgesExist(instance)

    # make sure the target node can actually host the instance
    _CheckNodeFreeMemory(self.cfg, instance.primary_node,
                         "starting instance %s" % instance.name,
                         instance.memory)

    self.instance = instance
    self.op.instance_name = instance.name

  def Exec(self, feedback_fn):
    """Start the instance.

    """
    inst = self.instance
    force = self.op.force
    extra_args = getattr(self.op, "extra_args", "")

    # record the desired state in the configuration first
    self.cfg.MarkInstanceUp(inst.name)

    pnode = inst.primary_node

    _StartInstanceDisks(self.cfg, inst, force)

    if not rpc.call_instance_start(pnode, inst, extra_args):
      # roll back the disk activation on failure
      _ShutdownInstanceDisks(inst, self.cfg)
      raise errors.OpExecError("Could not start instance")
class LURebootInstance(LogicalUnit):
  """Reboot an instance.

  """
  HPATH = "instance-reboot"
  HTYPE = constants.HTYPE_INSTANCE
  _OP_REQP = ["instance_name", "ignore_secondaries", "reboot_type"]

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    env = {
      "IGNORE_SECONDARIES": self.op.ignore_secondaries,
      }
    env.update(_BuildInstanceHookEnvByObject(self.instance))
    nl = ([self.sstore.GetMasterNode(), self.instance.primary_node] +
          list(self.instance.secondary_nodes))
    return env, nl, nl

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster.

    """
    instance = self.cfg.GetInstanceInfo(
      self.cfg.ExpandInstanceName(self.op.instance_name))
    if instance is None:
      raise errors.OpPrereqError("Instance '%s' not known" %
                                 self.op.instance_name)

    # check bridges existance
    _CheckInstanceBridgesExist(instance)

    self.instance = instance
    self.op.instance_name = instance.name

  def Exec(self, feedback_fn):
    """Reboot the instance.

    """
    inst = self.instance
    ignore_secondaries = self.op.ignore_secondaries
    reboot_type = self.op.reboot_type
    extra_args = getattr(self.op, "extra_args", "")

    pnode = inst.primary_node

    soft_or_hard = (constants.INSTANCE_REBOOT_SOFT,
                    constants.INSTANCE_REBOOT_HARD)

    if reboot_type not in soft_or_hard + (constants.INSTANCE_REBOOT_FULL,):
      raise errors.ParameterError("reboot type not in [%s, %s, %s]" %
                                  (constants.INSTANCE_REBOOT_SOFT,
                                   constants.INSTANCE_REBOOT_HARD,
                                   constants.INSTANCE_REBOOT_FULL))

    if reboot_type in soft_or_hard:
      # soft/hard reboots are handled by the node itself
      if not rpc.call_instance_reboot(pnode, inst,
                                      reboot_type, extra_args):
        raise errors.OpExecError("Could not reboot instance")
    else:
      # a full reboot: stop the instance and its disks, then start
      # everything again from scratch
      if not rpc.call_instance_shutdown(pnode, inst):
        raise errors.OpExecError("could not shutdown instance for full reboot")
      _ShutdownInstanceDisks(inst, self.cfg)
      _StartInstanceDisks(self.cfg, inst, ignore_secondaries)
      if not rpc.call_instance_start(pnode, inst, extra_args):
        _ShutdownInstanceDisks(inst, self.cfg)
        raise errors.OpExecError("Could not start instance for full reboot")

    self.cfg.MarkInstanceUp(inst.name)
class LUShutdownInstance(LogicalUnit):
  """Shutdown an instance.

  """
  HPATH = "instance-stop"
  HTYPE = constants.HTYPE_INSTANCE
  _OP_REQP = ["instance_name"]

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    env = _BuildInstanceHookEnvByObject(self.instance)
    nl = ([self.sstore.GetMasterNode(), self.instance.primary_node] +
          list(self.instance.secondary_nodes))
    return env, nl, nl

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster.

    """
    instance = self.cfg.GetInstanceInfo(
      self.cfg.ExpandInstanceName(self.op.instance_name))
    if instance is None:
      raise errors.OpPrereqError("Instance '%s' not known" %
                                 self.op.instance_name)
    self.instance = instance

  def Exec(self, feedback_fn):
    """Shutdown the instance.

    """
    inst = self.instance
    pnode = inst.primary_node
    # update the configuration first, then do the actual shutdown;
    # a failed RPC is only logged, not fatal
    self.cfg.MarkInstanceDown(inst.name)
    if not rpc.call_instance_shutdown(pnode, inst):
      logger.Error("could not shutdown instance")

    _ShutdownInstanceDisks(inst, self.cfg)
class LUReinstallInstance(LogicalUnit):
  """Reinstall an instance.

  """
  HPATH = "instance-reinstall"
  HTYPE = constants.HTYPE_INSTANCE
  _OP_REQP = ["instance_name"]

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    env = _BuildInstanceHookEnvByObject(self.instance)
    nl = ([self.sstore.GetMasterNode(), self.instance.primary_node] +
          list(self.instance.secondary_nodes))
    return env, nl, nl

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster and is not running.

    """
    instance = self.cfg.GetInstanceInfo(
      self.cfg.ExpandInstanceName(self.op.instance_name))
    if instance is None:
      raise errors.OpPrereqError("Instance '%s' not known" %
                                 self.op.instance_name)
    if instance.disk_template == constants.DT_DISKLESS:
      raise errors.OpPrereqError("Instance '%s' has no disks" %
                                 self.op.instance_name)
    if instance.status != "down":
      raise errors.OpPrereqError("Instance '%s' is marked to be up" %
                                 self.op.instance_name)
    remote_info = rpc.call_instance_info(instance.primary_node, instance.name)
    if remote_info:
      raise errors.OpPrereqError("Instance '%s' is running on the node %s" %
                                 (self.op.instance_name,
                                  instance.primary_node))

    self.op.os_type = getattr(self.op, "os_type", None)
    if self.op.os_type is not None:
      # OS verification
      pnode = self.cfg.GetNodeInfo(
        self.cfg.ExpandNodeName(instance.primary_node))
      if pnode is None:
        # FIX: the original formatted this message with self.op.pnode,
        # an attribute the reinstall opcode does not have, which would
        # raise AttributeError instead of the intended error
        raise errors.OpPrereqError("Primary node '%s' is unknown" %
                                   instance.primary_node)
      os_obj = rpc.call_os_get(pnode.name, self.op.os_type)
      if not os_obj:
        raise errors.OpPrereqError("OS '%s' not in supported OS list for"
                                   " primary node" % self.op.os_type)

    self.instance = instance

  def Exec(self, feedback_fn):
    """Reinstall the instance.

    """
    inst = self.instance

    if self.op.os_type is not None:
      feedback_fn("Changing OS to '%s'..." % self.op.os_type)
      inst.os = self.op.os_type
      self.cfg.AddInstance(inst)

    _StartInstanceDisks(self.cfg, inst, None)
    try:
      feedback_fn("Running the instance OS create scripts...")
      if not rpc.call_instance_os_add(inst.primary_node, inst, "sda", "sdb"):
        raise errors.OpExecError("Could not install OS for instance %s"
                                 " on node %s" %
                                 (inst.name, inst.primary_node))
    finally:
      # always release the disks, even if the OS install failed
      _ShutdownInstanceDisks(inst, self.cfg)
class LURenameInstance(LogicalUnit):
  """Rename an instance.

  """
  HPATH = "instance-rename"
  HTYPE = constants.HTYPE_INSTANCE
  _OP_REQP = ["instance_name", "new_name"]

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    env = _BuildInstanceHookEnvByObject(self.instance)
    env["INSTANCE_NEW_NAME"] = self.op.new_name
    nl = ([self.sstore.GetMasterNode(), self.instance.primary_node] +
          list(self.instance.secondary_nodes))
    return env, nl, nl

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster and is not running.

    """
    instance = self.cfg.GetInstanceInfo(
      self.cfg.ExpandInstanceName(self.op.instance_name))
    if instance is None:
      raise errors.OpPrereqError("Instance '%s' not known" %
                                 self.op.instance_name)
    if instance.status != "down":
      raise errors.OpPrereqError("Instance '%s' is marked to be up" %
                                 self.op.instance_name)
    remote_info = rpc.call_instance_info(instance.primary_node, instance.name)
    if remote_info:
      raise errors.OpPrereqError("Instance '%s' is running on the node %s" %
                                 (self.op.instance_name,
                                  instance.primary_node))
    self.instance = instance

    # new name verification
    name_info = utils.HostInfo(self.op.new_name)

    self.op.new_name = new_name = name_info.name
    instance_list = self.cfg.GetInstanceList()
    if new_name in instance_list:
      raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
                                 new_name)

    # fping exits successfully when the address answers, i.e. the IP is
    # already in use by some other host
    if not getattr(self.op, "ignore_ip", False):
      command = ["fping", "-q", name_info.ip]
      result = utils.RunCmd(command)
      if not result.failed:
        raise errors.OpPrereqError("IP %s of instance %s already in use" %
                                   (name_info.ip, new_name))

  def Exec(self, feedback_fn):
    """Rename the instance.

    """
    # FIX: docstring previously said "Reinstall the instance."
    inst = self.instance
    old_name = inst.name

    # for file-based disks the storage directory embeds the instance
    # name, so it has to be renamed along with the instance
    if inst.disk_template == constants.DT_FILE:
      old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])

    self.cfg.RenameInstance(inst.name, self.op.new_name)

    # re-read the instance from the configuration after rename
    inst = self.cfg.GetInstanceInfo(self.op.new_name)

    if inst.disk_template == constants.DT_FILE:
      new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
      result = rpc.call_file_storage_dir_rename(inst.primary_node,
                                                old_file_storage_dir,
                                                new_file_storage_dir)

      if not result:
        raise errors.OpExecError("Could not connect to node '%s' to rename"
                                 " directory '%s' to '%s' (but the instance"
                                 " has been renamed in Ganeti)" % (
                                 inst.primary_node, old_file_storage_dir,
                                 new_file_storage_dir))

      if not result[0]:
        raise errors.OpExecError("Could not rename directory '%s' to '%s'"
                                 " (but the instance has been renamed in"
                                 " Ganeti)" % (old_file_storage_dir,
                                               new_file_storage_dir))

    _StartInstanceDisks(self.cfg, inst, None)
    try:
      # FIX: message previously read "Could run OS rename script"
      # (missing "not")
      if not rpc.call_instance_run_rename(inst.primary_node, inst, old_name,
                                          "sda", "sdb"):
        msg = ("Could not run OS rename script for instance %s on node %s"
               " (but the instance has been renamed in Ganeti)" %
               (inst.name, inst.primary_node))
        logger.Error(msg)
    finally:
      _ShutdownInstanceDisks(inst, self.cfg)
class LURemoveInstance(LogicalUnit):
  """Remove an instance.

  """
  HPATH = "instance-remove"
  HTYPE = constants.HTYPE_INSTANCE
  _OP_REQP = ["instance_name", "ignore_failures"]

  def BuildHooksEnv(self):
    """Build hooks env.

    The hooks for this LU run only on the master node.

    """
    env = _BuildInstanceHookEnvByObject(self.instance)
    nl = [self.sstore.GetMasterNode()]
    return env, nl, nl

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster.

    """
    instance = self.cfg.GetInstanceInfo(
      self.cfg.ExpandInstanceName(self.op.instance_name))
    if instance is None:
      raise errors.OpPrereqError("Instance '%s' not known" %
                                 self.op.instance_name)
    self.instance = instance

  def Exec(self, feedback_fn):
    """Remove the instance.

    """
    inst = self.instance
    ignore_failures = self.op.ignore_failures
    logger.Info("shutting down instance %s on node %s" %
                (inst.name, inst.primary_node))

    if not rpc.call_instance_shutdown(inst.primary_node, inst):
      if ignore_failures:
        feedback_fn("Warning: can't shutdown instance")
      else:
        raise errors.OpExecError("Could not shutdown instance %s on node %s" %
                                 (inst.name, inst.primary_node))

    logger.Info("removing block devices for instance %s" % inst.name)

    if not _RemoveDisks(inst, self.cfg):
      if ignore_failures:
        feedback_fn("Warning: can't remove instance's disks")
      else:
        raise errors.OpExecError("Can't remove instance's disks")

    logger.Info("removing instance %s out of cluster config" % inst.name)

    self.cfg.RemoveInstance(inst.name)
class LUQueryInstances(NoHooksLU):
  """Logical unit for querying instances.

  """
  _OP_REQP = ["output_fields", "names"]

  def CheckPrereq(self):
    """Check prerequisites.

    This verifies that the requested output fields are valid and
    expands the list of wanted instance names.

    """
    self.dynamic_fields = frozenset(["oper_state", "oper_ram", "status"])
    _CheckOutputFields(static=["name", "os", "pnode", "snodes",
                               "admin_state", "admin_ram",
                               "disk_template", "ip", "mac", "bridge",
                               "sda_size", "sdb_size", "vcpus"],
                       dynamic=self.dynamic_fields,
                       selected=self.op.output_fields)

    self.wanted = _GetWantedInstances(self, self.op.names)

  def Exec(self, feedback_fn):
    """Computes the list of instances and their attributes.

    """
    inst_names = self.wanted
    inst_list = [self.cfg.GetInstanceInfo(iname) for iname in inst_names]

    # live (runtime) data is only gathered if a dynamic field was
    # requested; otherwise every instance gets an empty live record
    pnode_set = frozenset([inst.primary_node for inst in inst_list])

    bad_nodes = []
    if self.dynamic_fields.intersection(self.op.output_fields):
      live_data = {}
      node_data = rpc.call_all_instances_info(pnode_set)
      for nname in pnode_set:
        nresult = node_data[nname]
        if nresult:
          live_data.update(nresult)
        elif nresult == False:
          # the RPC call to this node failed outright
          bad_nodes.append(nname)
        # else: node answered but no instance is alive on it
    else:
      live_data = dict([(iname, {}) for iname in inst_names])

    # build one output row per instance, columns in the order of the
    # requested fields
    output = []
    for inst in inst_list:
      row = []
      for field in self.op.output_fields:
        if field == "name":
          val = inst.name
        elif field == "os":
          val = inst.os
        elif field == "pnode":
          val = inst.primary_node
        elif field == "snodes":
          val = list(inst.secondary_nodes)
        elif field == "admin_state":
          val = (inst.status != "down")
        elif field == "oper_state":
          if inst.primary_node in bad_nodes:
            val = None
          else:
            val = bool(live_data.get(inst.name))
        elif field == "status":
          if inst.primary_node in bad_nodes:
            val = "ERROR_nodedown"
          elif bool(live_data.get(inst.name)):
            # instance is actually running
            if inst.status != "down":
              val = "running"
            else:
              val = "ERROR_up"
          else:
            # instance is not running
            if inst.status != "down":
              val = "ERROR_down"
            else:
              val = "ADMIN_down"
        elif field == "admin_ram":
          val = inst.memory
        elif field == "oper_ram":
          if inst.primary_node in bad_nodes:
            val = None
          elif inst.name in live_data:
            val = live_data[inst.name].get("memory", "?")
          else:
            val = "-"
        elif field == "disk_template":
          val = inst.disk_template
        elif field == "ip":
          val = inst.nics[0].ip
        elif field == "bridge":
          val = inst.nics[0].bridge
        elif field == "mac":
          val = inst.nics[0].mac
        elif field in ("sda_size", "sdb_size"):
          disk = inst.FindDisk(field[:3])
          if disk is None:
            val = None
          else:
            val = disk.size
        elif field == "vcpus":
          val = inst.vcpus
        else:
          raise errors.ParameterError(field)
        row.append(val)
      output.append(row)

    return output
2584

    
2585

    
2586
class LUFailoverInstance(LogicalUnit):
  """Failover an instance.

  """
  HPATH = "instance-failover"
  HTYPE = constants.HTYPE_INSTANCE
  _OP_REQP = ["instance_name", "ignore_consistency"]

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    env = {"IGNORE_CONSISTENCY": self.op.ignore_consistency}
    env.update(_BuildInstanceHookEnvByObject(self.instance))
    nodes = [self.sstore.GetMasterNode()] + list(self.instance.secondary_nodes)
    return env, nodes, nodes

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance exists, uses a network-mirrored disk
    template, and that the secondary node can accommodate it (memory
    and bridges).

    """
    inst = self.cfg.GetInstanceInfo(
      self.cfg.ExpandInstanceName(self.op.instance_name))
    if inst is None:
      raise errors.OpPrereqError("Instance '%s' not known" %
                                 self.op.instance_name)

    # failover only makes sense for network-mirrored disk templates
    if inst.disk_template not in constants.DTS_NET_MIRROR:
      raise errors.OpPrereqError("Instance's disk layout is not"
                                 " network mirrored, cannot failover.")

    snodes = inst.secondary_nodes
    if not snodes:
      raise errors.ProgrammerError("no secondary node but using "
                                   "a mirrored disk template")

    target_node = snodes[0]
    # check memory requirements on the secondary node
    _CheckNodeFreeMemory(self.cfg, target_node, "failing over instance %s" %
                         inst.name, inst.memory)

    # check bridge existance
    brlist = [nic.bridge for nic in inst.nics]
    if not rpc.call_bridges_exist(target_node, brlist):
      raise errors.OpPrereqError("One or more target bridges %s does not"
                                 " exist on destination node '%s'" %
                                 (brlist, target_node))

    self.instance = inst

  def Exec(self, feedback_fn):
    """Failover an instance.

    The failover is done by shutting it down on its present node and
    starting it on the secondary.

    """
    inst = self.instance

    source_node = inst.primary_node
    target_node = inst.secondary_nodes[0]

    feedback_fn("* checking disk consistency between source and target")
    for dev in inst.disks:
      # for drbd, these are drbd over lvm
      if not _CheckDiskConsistency(self.cfg, dev, target_node, False):
        if inst.status == "up" and not self.op.ignore_consistency:
          raise errors.OpExecError("Disk %s is degraded on target node,"
                                   " aborting failover." % dev.iv_name)

    feedback_fn("* shutting down instance on source node")
    logger.Info("Shutting down instance %s on node %s" %
                (inst.name, source_node))

    if not rpc.call_instance_shutdown(source_node, inst):
      if not self.op.ignore_consistency:
        raise errors.OpExecError("Could not shutdown instance %s on node %s" %
                                 (inst.name, source_node))
      logger.Error("Could not shutdown instance %s on node %s. Proceeding"
                   " anyway. Please make sure node %s is down" %
                   (inst.name, source_node, source_node))

    feedback_fn("* deactivating the instance's disks on source node")
    if not _ShutdownInstanceDisks(inst, self.cfg, ignore_primary=True):
      raise errors.OpExecError("Can't shut down the instance's disks.")

    inst.primary_node = target_node
    # distribute new instance config to the other nodes
    self.cfg.AddInstance(inst)

    # Only start the instance if it's marked as up
    if inst.status == "up":
      feedback_fn("* activating the instance's disks on target node")
      logger.Info("Starting instance %s on node %s" %
                  (inst.name, target_node))

      disks_ok, _ = _AssembleInstanceDisks(inst, self.cfg,
                                           ignore_secondaries=True)
      if not disks_ok:
        _ShutdownInstanceDisks(inst, self.cfg)
        raise errors.OpExecError("Can't activate the instance's disks")

      feedback_fn("* starting the instance on the target node")
      if not rpc.call_instance_start(target_node, inst, None):
        _ShutdownInstanceDisks(inst, self.cfg)
        raise errors.OpExecError("Could not start instance %s on node %s." %
                                 (inst.name, target_node))
2700

    
2701

    
2702
def _CreateBlockDevOnPrimary(cfg, node, instance, device, info):
  """Recursively create a block device tree on the primary node.

  Every device of the tree is created unconditionally; the function
  returns False as soon as the creation of a child or of the device
  itself fails.

  """
  for child in (device.children or []):
    if not _CreateBlockDevOnPrimary(cfg, node, instance, child, info):
      return False

  cfg.SetDiskID(device, node)
  new_id = rpc.call_blockdev_create(node, device, device.size,
                                    instance.name, True, info)
  if not new_id:
    return False
  # remember the physical id the node assigned, if not already known
  if device.physical_id is None:
    device.physical_id = new_id
  return True
2721

    
2722

    
2723
def _CreateBlockDevOnSecondary(cfg, node, instance, device, force, info):
  """Create a tree of block devices on a secondary node.

  A device is physically created only if its type requires presence on
  the secondary node, or if an ancestor did (the 'force' flag is then
  inherited by the whole subtree); otherwise the function just
  recurses into the children with the same 'force' value.

  """
  force = force or device.CreateOnSecondary()
  for child in (device.children or []):
    if not _CreateBlockDevOnSecondary(cfg, node, instance,
                                      child, force, info):
      return False

  if not force:
    # nothing to create at this level
    return True
  cfg.SetDiskID(device, node)
  new_id = rpc.call_blockdev_create(node, device, device.size,
                                    instance.name, False, info)
  if not new_id:
    return False
  # remember the physical id the node assigned, if not already known
  if device.physical_id is None:
    device.physical_id = new_id
  return True
2750

    
2751

    
2752
def _GenerateUniqueNames(cfg, exts):
2753
  """Generate a suitable LV name.
2754

2755
  This will generate a logical volume name for the given instance.
2756

2757
  """
2758
  results = []
2759
  for val in exts:
2760
    new_id = cfg.GenerateUniqueID()
2761
    results.append("%s%s" % (new_id, val))
2762
  return results
2763

    
2764

    
2765
def _GenerateMDDRBDBranch(cfg, primary, secondary, size, names):
  """Generate a drbd (v7) device complete with its two LV children.

  Allocates a DRBD port from the configuration and builds a data LV of
  the requested size plus a fixed 128 MB metadata LV underneath the
  DRBD device.

  """
  drbd_port = cfg.AllocatePort()
  vg = cfg.GetVGName()
  data_dev = objects.Disk(dev_type=constants.LD_LV, size=size,
                          logical_id=(vg, names[0]))
  meta_dev = objects.Disk(dev_type=constants.LD_LV, size=128,
                          logical_id=(vg, names[1]))
  return objects.Disk(dev_type=constants.LD_DRBD7, size=size,
                      logical_id=(primary, secondary, drbd_port),
                      children=[data_dev, meta_dev])
2779

    
2780

    
2781
def _GenerateDRBD8Branch(cfg, primary, secondary, size, names, iv_name):
  """Generate a drbd8 device complete with its two LV children.

  Allocates a DRBD port from the configuration and builds a data LV of
  the requested size plus a fixed 128 MB metadata LV underneath the
  DRBD8 device.

  """
  drbd_port = cfg.AllocatePort()
  vg = cfg.GetVGName()
  data_dev = objects.Disk(dev_type=constants.LD_LV, size=size,
                          logical_id=(vg, names[0]))
  meta_dev = objects.Disk(dev_type=constants.LD_LV, size=128,
                          logical_id=(vg, names[1]))
  return objects.Disk(dev_type=constants.LD_DRBD8, size=size,
                      logical_id=(primary, secondary, drbd_port),
                      children=[data_dev, meta_dev],
                      iv_name=iv_name)
2796

    
2797

    
2798
def _GenerateDiskTemplate(cfg, template_name,
                          instance_name, primary_node,
                          secondary_nodes, disk_sz, swap_sz,
                          file_storage_dir, file_driver):
  """Generate the entire disk layout for a given template type.

  Returns the list of Disk objects (sda/sdb pair, or empty for the
  diskless template) describing the instance's storage.

  """
  #TODO: compute space requirements

  vgname = cfg.GetVGName()

  if template_name == constants.DT_DISKLESS:
    return []

  if template_name == constants.DT_PLAIN:
    if secondary_nodes:
      raise errors.ProgrammerError("Wrong template configuration")

    names = _GenerateUniqueNames(cfg, [".sda", ".sdb"])
    sda = objects.Disk(dev_type=constants.LD_LV, size=disk_sz,
                       logical_id=(vgname, names[0]),
                       iv_name="sda")
    sdb = objects.Disk(dev_type=constants.LD_LV, size=swap_sz,
                       logical_id=(vgname, names[1]),
                       iv_name="sdb")
    return [sda, sdb]

  if template_name == constants.DT_DRBD8:
    if len(secondary_nodes) != 1:
      raise errors.ProgrammerError("Wrong template configuration")
    remote_node = secondary_nodes[0]
    names = _GenerateUniqueNames(cfg, [".sda_data", ".sda_meta",
                                       ".sdb_data", ".sdb_meta"])
    sda = _GenerateDRBD8Branch(cfg, primary_node, remote_node,
                               disk_sz, names[0:2], "sda")
    sdb = _GenerateDRBD8Branch(cfg, primary_node, remote_node,
                               swap_sz, names[2:4], "sdb")
    return [sda, sdb]

  if template_name == constants.DT_FILE:
    if secondary_nodes:
      raise errors.ProgrammerError("Wrong template configuration")

    sda = objects.Disk(dev_type=constants.LD_FILE, size=disk_sz,
                       iv_name="sda",
                       logical_id=(file_driver,
                                   "%s/sda" % file_storage_dir))
    sdb = objects.Disk(dev_type=constants.LD_FILE, size=swap_sz,
                       iv_name="sdb",
                       logical_id=(file_driver,
                                   "%s/sdb" % file_storage_dir))
    return [sda, sdb]

  raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
2847

    
2848

    
2849
def _GetInstanceInfoText(instance):
2850
  """Compute that text that should be added to the disk's metadata.
2851

2852
  """
2853
  return "originstname+%s" % instance.name
2854

    
2855

    
2856
def _CreateDisks(cfg, instance):
  """Create all disks for an instance.

  This abstracts away some work from AddInstance.

  Args:
    instance: the instance object

  Returns:
    True or False showing the success of the creation process

  """
  info = _GetInstanceInfoText(instance)

  # for file-based instances, the storage directory has to exist first
  if instance.disk_template == constants.DT_FILE:
    file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
    result = rpc.call_file_storage_dir_create(instance.primary_node,
                                              file_storage_dir)

    if not result:
      logger.Error("Could not connect to node '%s'" % instance.primary_node)
      return False

    if not result[0]:
      logger.Error("failed to create directory '%s'" % file_storage_dir)
      return False

  for disk in instance.disks:
    logger.Info("creating volume %s for instance %s" %
                (disk.iv_name, instance.name))
    #HARDCODE
    for snode in instance.secondary_nodes:
      if not _CreateBlockDevOnSecondary(cfg, snode, instance,
                                        disk, False, info):
        logger.Error("failed to create volume %s (%s) on secondary node %s!" %
                     (disk.iv_name, disk, snode))
        return False
    #HARDCODE
    if not _CreateBlockDevOnPrimary(cfg, instance.primary_node,
                                    instance, disk, info):
      logger.Error("failed to create volume %s on primary!" %
                   disk.iv_name)
      return False

  return True
2901

    
2902

    
2903
def _RemoveDisks(instance, cfg):
  """Remove all disks for an instance.

  This abstracts away some work from `AddInstance()` and
  `RemoveInstance()`. Note that in case some of the devices couldn't
  be removed, the removal will continue with the other ones (compare
  with `_CreateDisks()`).

  Args:
    instance: the instance object

  Returns:
    True or False showing the success of the removal proces

  """
  logger.Info("removing block devices for instance %s" % instance.name)

  all_ok = True
  for disk in instance.disks:
    for node, node_disk in disk.ComputeNodeTree(instance.primary_node):
      cfg.SetDiskID(node_disk, node)
      if not rpc.call_blockdev_remove(node, node_disk):
        # best-effort: log the failure and keep removing the rest
        logger.Error("could not remove block device %s on node %s,"
                     " continuing anyway" %
                     (disk.iv_name, node))
        all_ok = False

  # file-based instances also have a storage directory to clean up
  if instance.disk_template == constants.DT_FILE:
    file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
    if not rpc.call_file_storage_dir_remove(instance.primary_node,
                                            file_storage_dir):
      logger.Error("could not remove directory '%s'" % file_storage_dir)
      all_ok = False

  return all_ok
2938

    
2939

    
2940
def _ComputeDiskSize(disk_template, disk_size, swap_size):
  """Compute disk size requirements in the volume group

  This is currently hard-coded for the two-drive layout.  A value of
  None means the template does not consume space in the volume group.

  """
  # Required free disk space as a function of disk and swap space
  requirements = {
    constants.DT_DISKLESS: None,
    constants.DT_PLAIN: disk_size + swap_size,
    # 256 MB are added for drbd metadata, 128MB for each drbd device
    constants.DT_DRBD8: disk_size + swap_size + 256,
    constants.DT_FILE: None,
  }

  try:
    return requirements[disk_template]
  except KeyError:
    raise errors.ProgrammerError("Disk template '%s' size requirement"
                                 " is unknown" % disk_template)
2960

    
2961

    
2962
class LUCreateInstance(LogicalUnit):
2963
  """Create an instance.
2964

2965
  """
2966
  HPATH = "instance-add"
2967
  HTYPE = constants.HTYPE_INSTANCE
2968
  _OP_REQP = ["instance_name", "mem_size", "disk_size",
2969
              "disk_template", "swap_size", "mode", "start", "vcpus",
2970
              "wait_for_sync", "ip_check", "mac"]
2971

    
2972
  def _RunAllocator(self):
    """Run the instance allocator based on the input opcode.

    Builds the allocation request from the opcode parameters, runs the
    configured iallocator plugin, and on success fills in
    self.op.pnode (and self.op.snode when two nodes are required) from
    the returned node list.

    Raises:
      errors.OpPrereqError: if the allocator fails or returns an
        unexpected number of nodes.

    """
    disks = [{"size": self.op.disk_size, "mode": "w"},
             {"size": self.op.swap_size, "mode": "w"}]
    nics = [{"mac": self.op.mac, "ip": getattr(self.op, "ip", None),
             "bridge": self.op.bridge}]
    ial = IAllocator(self.cfg, self.sstore,
                     mode=constants.IALLOCATOR_MODE_ALLOC,
                     name=self.op.instance_name,
                     disk_template=self.op.disk_template,
                     tags=[],
                     os=self.op.os_type,
                     vcpus=self.op.vcpus,
                     mem_size=self.op.mem_size,
                     disks=disks,
                     nics=nics,
                     )

    ial.Run(self.op.iallocator)

    if not ial.success:
      raise errors.OpPrereqError("Can't compute nodes using"
                                 " iallocator '%s': %s" % (self.op.iallocator,
                                                           ial.info))
    if len(ial.nodes) != ial.required_nodes:
      # FIX: the format string has three placeholders but the argument
      # tuple was previously missing the allocator name, so reaching
      # this branch raised TypeError instead of the intended error
      raise errors.OpPrereqError("iallocator '%s' returned invalid number"
                                 " of nodes (%s), required %s" %
                                 (self.op.iallocator, len(ial.nodes),
                                  ial.required_nodes))
    self.op.pnode = ial.nodes[0]
    logger.ToStdout("Selected nodes for the instance: %s" %
                    (", ".join(ial.nodes),))
    logger.Info("Selected nodes for instance %s via iallocator %s: %s" %
                (self.op.instance_name, self.op.iallocator, ial.nodes))
    if ial.required_nodes == 2:
      self.op.snode = ial.nodes[1]
3009

    
3010
  def BuildHooksEnv(self):
    """Build the environment for the instance-creation hooks.

    These run on the master, the primary and the secondary nodes of
    the new instance.

    """
    env = {"INSTANCE_DISK_TEMPLATE": self.op.disk_template,
           "INSTANCE_DISK_SIZE": self.op.disk_size,
           "INSTANCE_SWAP_SIZE": self.op.swap_size,
           "INSTANCE_ADD_MODE": self.op.mode}
    # import mode carries extra information about the source of the data
    if self.op.mode == constants.INSTANCE_IMPORT:
      env["INSTANCE_SRC_NODE"] = self.op.src_node
      env["INSTANCE_SRC_PATH"] = self.op.src_path
      env["INSTANCE_SRC_IMAGE"] = self.src_image

    env.update(_BuildInstanceHookEnv(name=self.op.instance_name,
                                     primary_node=self.op.pnode,
                                     secondary_nodes=self.secondaries,
                                     status=self.instance_status,
                                     os_type=self.op.os_type,
                                     memory=self.op.mem_size,
                                     vcpus=self.op.vcpus,
                                     nics=[(self.inst_ip, self.op.bridge,
                                            self.op.mac)]))

    run_nodes = ([self.sstore.GetMasterNode(), self.op.pnode] +
                 self.secondaries)
    return env, run_nodes, run_nodes
3040

    
3041

    
3042
  def CheckPrereq(self):
3043
    """Check prerequisites.
3044

3045
    """
3046
    # set optional parameters to none if they don't exist
3047
    for attr in ["kernel_path", "initrd_path", "hvm_boot_order", "pnode",
3048
                 "iallocator", "hvm_acpi", "hvm_pae", "hvm_cdrom_image_path",
3049
                 "vnc_bind_address"]:
3050
      if not hasattr(self.op, attr):
3051
        setattr(self.op, attr, None)
3052

    
3053
    if self.op.mode not in (constants.INSTANCE_CREATE,
3054
                            constants.INSTANCE_IMPORT):
3055
      raise errors.OpPrereqError("Invalid instance creation mode '%s'" %
3056
                                 self.op.mode)
3057

    
3058
    if (not self.cfg.GetVGName() and
3059
        self.op.disk_template not in constants.DTS_NOT_LVM):
3060
      raise errors.OpPrereqError("Cluster does not support lvm-based"
3061
                                 " instances")
3062

    
3063
    if self.op.mode == constants.INSTANCE_IMPORT:
3064
      src_node = getattr(self.op, "src_node", None)
3065
      src_path = getattr(self.op, "src_path", None)
3066
      if src_node is None or src_path is None:
3067
        raise errors.OpPrereqError("Importing an instance requires source"
3068
                                   " node and path options")
3069
      src_node_full = self.cfg.ExpandNodeName(src_node)
3070
      if src_node_full is None:
3071
        raise errors.OpPrereqError("Unknown source node '%s'" % src_node)
3072
      self.op.src_node = src_node = src_node_full
3073

    
3074
      if not os.path.isabs(src_path):
3075
        raise errors.OpPrereqError("The source path must be absolute")
3076

    
3077
      export_info = rpc.call_export_info(src_node, src_path)
3078

    
3079
      if not export_info:
3080
        raise errors.OpPrereqError("No export found in dir %s" % src_path)
3081

    
3082
      if not export_info.has_section(constants.INISECT_EXP):
3083
        raise errors.ProgrammerError("Corrupted export config")
3084

    
3085
      ei_version = export_info.get(constants.INISECT_EXP, 'version')
3086
      if (int(ei_version) != constants.EXPORT_VERSION):
3087
        raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
3088
                                   (ei_version, constants.EXPORT_VERSION))
3089

    
3090
      if int(export_info.get(constants.INISECT_INS, 'disk_count')) > 1:
3091
        raise errors.OpPrereqError("Can't import instance with more than"
3092
                                   " one data disk")
3093

    
3094
      # FIXME: are the old os-es, disk sizes, etc. useful?
3095
      self.op.os_type = export_info.get(constants.INISECT_EXP, 'os')
3096
      diskimage = os.path.join(src_path, export_info.get(constants.INISECT_INS,
3097
                                                         'disk0_dump'))
3098
      self.src_image = diskimage
3099
    else: # INSTANCE_CREATE
3100
      if getattr(self.op, "os_type", None) is None:
3101
        raise errors.OpPrereqError("No guest OS specified")
3102

    
3103
    #### instance parameters check
3104

    
3105
    # disk template and mirror node verification
3106
    if self.op.disk_template not in constants.DISK_TEMPLATES:
3107
      raise errors.OpPrereqError("Invalid disk template name")
3108

    
3109
    # instance name verification
3110
    hostname1 = utils.HostInfo(self.op.instance_name)
3111

    
3112
    self.op.instance_name = instance_name = hostname1.name
3113
    instance_list = self.cfg.GetInstanceList()
3114
    if instance_name in instance_list:
3115
      raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
3116
                                 instance_name)
3117

    
3118
    # ip validity checks
3119
    ip = getattr(self.op, "ip", None)
3120
    if ip is None or ip.lower() == "none":
3121
      inst_ip = None
3122
    elif ip.lower() == "auto":
3123
      inst_ip = hostname1.ip
3124
    else:
3125
      if not utils.IsValidIP(ip):
3126
        raise errors.OpPrereqError("given IP address '%s' doesn't look"
3127
                                   " like a valid IP" % ip)
3128
      inst_ip = ip
3129
    self.inst_ip = self.op.ip = inst_ip
3130

    
3131
    if self.op.start and not self.op.ip_check:
3132
      raise errors.OpPrereqError("Cannot ignore IP address conflicts when"
3133
                                 " adding an instance in start mode")
3134

    
3135
    if self.op.ip_check:
3136
      if utils.TcpPing(hostname1.ip, constants.DEFAULT_NODED_PORT):
3137
        raise errors.OpPrereqError("IP %s of instance %s already in use" %
3138
                                   (hostname1.ip, instance_name))
3139

    
3140
    # MAC address verification
3141
    if self.op.mac != "auto":
3142
      if not utils.IsValidMac(self.op.mac.lower()):
3143
        raise errors.OpPrereqError("invalid MAC address specified: %s" %
3144
                                   self.op.mac)
3145

    
3146
    # bridge verification
3147
    bridge = getattr(self.op, "bridge", None)
3148
    if bridge is None:
3149
      self.op.bridge = self.cfg.GetDefBridge()
3150
    else:
3151
      self.op.bridge = bridge
3152

    
3153
    # boot order verification
3154
    if self.op.hvm_boot_order is not None:
3155
      if len(self.op.hvm_boot_order.strip("acdn")) != 0:
3156
        raise errors.OpPrereqError("invalid boot order specified,"
3157
                                   " must be one or more of [acdn]")
3158
    # file storage checks
3159
    if (self.op.file_driver and
3160
        not self.op.file_driver in constants.FILE_DRIVER):
3161
      raise errors.OpPrereqError("Invalid file driver name '%s'" %
3162
                                 self.op.file_driver)
3163

    
3164
    if self.op.file_storage_dir and os.path.isabs(self.op.file_storage_dir):
3165
      raise errors.OpPrereqError("File storage directory not a relative"
3166
                                 " path")
3167
    #### allocator run
3168

    
3169
    if [self.op.iallocator, self.op.pnode].count(None) != 1:
3170
      raise errors.OpPrereqError("One and only one of iallocator and primary"
3171
                                 " node must be given")
3172

    
3173
    if self.op.iallocator is not None:
3174
      self._RunAllocator()
3175

    
3176
    #### node related checks
3177

    
3178
    # check primary node
3179
    pnode = self.cfg.GetNodeInfo(self.cfg.ExpandNodeName(self.op.pnode))
3180
    if pnode is None:
3181
      raise errors.OpPrereqError("Primary node '%s' is unknown" %
3182
                                 self.op.pnode)
3183
    self.op.pnode = pnode.name
3184
    self.pnode = pnode
3185
    self.secondaries = []
3186

    
3187
    # mirror node verification
3188
    if self.op.disk_template in constants.DTS_NET_MIRROR:
3189
      if getattr(self.op, "snode", None) is None:
3190
        raise errors.OpPrereqError("The networked disk templates need"
3191
                                   " a mirror node")
3192

    
3193
      snode_name = self.cfg.ExpandNodeName(self.op.snode)
3194
      if snode_name is None:
3195
        raise errors.OpPrereqError("Unknown secondary node '%s'" %
3196
                                   self.op.snode)
3197
      elif snode_name == pnode.name:
3198
        raise errors.OpPrereqError("The secondary node cannot be"
3199
                                   " the primary node.")
3200
      self.secondaries.append(snode_name)
3201

    
3202
    req_size = _ComputeDiskSize(self.op.disk_template,
3203
                                self.op.disk_size, self.op.swap_size)
3204

    
3205
    # Check lv size requirements
3206
    if req_size is not None:
3207
      nodenames = [pnode.name] + self.secondaries
3208
      nodeinfo = rpc.call_node_info(nodenames, self.cfg.GetVGName())
3209
      for node in nodenames:
3210
        info = nodeinfo.get(node, None)
3211
        if not info:
3212
          raise errors.OpPrereqError("Cannot get current information"
3213
                                     " from node '%s'" % node)
3214
        vg_free = info.get('vg_free', None)
3215
        if not isinstance(vg_free, int):
3216
          raise errors.OpPrereqError("Can't compute free disk space on"
3217
                                     " node %s" % node)
3218
        if req_size > info['vg_free']:
3219
          raise errors.OpPrereqError("Not enough disk space on target node %s."
3220
                                     " %d MB available, %d MB required" %
3221
                                     (node, info['vg_free'], req_size))
3222

    
3223
    # os verification
3224
    os_obj = rpc.call_os_get(pnode.name, self.op.os_type)
3225
    if not os_obj:
3226
      raise errors.OpPrereqError("OS '%s' not in supported os list for"
3227
                                 " primary node"  % self.op.os_type)
3228

    
3229
    if self.op.kernel_path == constants.VALUE_NONE:
3230
      raise errors.OpPrereqError("Can't set instance kernel to none")
3231

    
3232

    
3233
    # bridge check on primary node
3234
    if not rpc.call_bridges_exist(self.pnode.name, [self.op.bridge]):
3235
      raise errors.OpPrereqError("target bridge '%s' does not exist on"
3236
                                 " destination node '%s'" %
3237
                                 (self.op.bridge, pnode.name))
3238

    
3239
    # memory check on primary node
3240
    if self.op.start:
3241
      _CheckNodeFreeMemory(self.cfg, self.pnode.name,
3242
                           "creating instance %s" % self.op.instance_name,
3243
                           self.op.mem_size)
3244

    
3245
    # hvm_cdrom_image_path verification
3246
    if self.op.hvm_cdrom_image_path is not None:
3247
      if not os.path.isabs(self.op.hvm_cdrom_image_path):
3248
        raise errors.OpPrereqError("The path to the HVM CDROM image must"
3249
                                   " be an absolute path or None, not %s" %
3250
                                   self.op.hvm_cdrom_image_path)
3251
      if not os.path.isfile(self.op.hvm_cdrom_image_path):
3252
        raise errors.OpPrereqError("The HVM CDROM image must either be a"
3253
                                   " regular file or a symlink pointing to"
3254
                                   " an existing regular file, not %s" %
3255
                                   self.op.hvm_cdrom_image_path)
3256

    
3257
    # vnc_bind_address verification
3258
    if self.op.vnc_bind_address is not None:
3259
      if not utils.IsValidIP(self.op.vnc_bind_address):
3260
        raise errors.OpPrereqError("given VNC bind address '%s' doesn't look"
3261
                                   " like a valid IP address" %
3262
                                   self.op.vnc_bind_address)
3263

    
3264
    if self.op.start:
3265
      self.instance_status = 'up'
3266
    else:
3267
      self.instance_status = 'down'
3268

    
3269
  def Exec(self, feedback_fn):
    """Create and add the instance to the cluster.

    The workflow is: generate NIC/MAC and network port, build the disk
    objects, create the physical disks, register the instance in the
    cluster configuration, wait for the disks to sync, run the OS
    create/import scripts and finally (optionally) start the instance.

    feedback_fn is a callable used to report progress messages back to
    the caller.

    """
    instance = self.op.instance_name
    pnode_name = self.pnode.name

    # MAC address: either generated by the cluster or given explicitly
    if self.op.mac == "auto":
      mac_address = self.cfg.GenerateMAC()
    else:
      mac_address = self.op.mac

    nic = objects.NIC(bridge=self.op.bridge, mac=mac_address)
    if self.inst_ip is not None:
      nic.ip = self.inst_ip

    # some hypervisor types need a network port allocated (e.g. for VNC)
    ht_kind = self.sstore.GetHypervisorType()
    if ht_kind in constants.HTS_REQ_PORT:
      network_port = self.cfg.AllocatePort()
    else:
      network_port = None

    if self.op.vnc_bind_address is None:
      self.op.vnc_bind_address = constants.VNC_DEFAULT_BIND_ADDRESS

    # this is needed because os.path.join does not accept None arguments
    if self.op.file_storage_dir is None:
      string_file_storage_dir = ""
    else:
      string_file_storage_dir = self.op.file_storage_dir

    # build the full file storage dir path
    file_storage_dir = os.path.normpath(os.path.join(
                                        self.sstore.GetFileStorageDir(),
                                        string_file_storage_dir, instance))


    disks = _GenerateDiskTemplate(self.cfg,
                                  self.op.disk_template,
                                  instance, pnode_name,
                                  self.secondaries, self.op.disk_size,
                                  self.op.swap_size,
                                  file_storage_dir,
                                  self.op.file_driver)

    iobj = objects.Instance(name=instance, os=self.op.os_type,
                            primary_node=pnode_name,
                            memory=self.op.mem_size,
                            vcpus=self.op.vcpus,
                            nics=[nic], disks=disks,
                            disk_template=self.op.disk_template,
                            status=self.instance_status,
                            network_port=network_port,
                            kernel_path=self.op.kernel_path,
                            initrd_path=self.op.initrd_path,
                            hvm_boot_order=self.op.hvm_boot_order,
                            hvm_acpi=self.op.hvm_acpi,
                            hvm_pae=self.op.hvm_pae,
                            hvm_cdrom_image_path=self.op.hvm_cdrom_image_path,
                            vnc_bind_address=self.op.vnc_bind_address,
                            )

    # create the physical disks first; on failure, roll back what was
    # created before registering anything in the configuration
    feedback_fn("* creating instance disks...")
    if not _CreateDisks(self.cfg, iobj):
      _RemoveDisks(iobj, self.cfg)
      raise errors.OpExecError("Device creation failed, reverting...")

    feedback_fn("adding instance %s to cluster config" % instance)

    self.cfg.AddInstance(iobj)

    if self.op.wait_for_sync:
      disk_abort = not _WaitForSync(self.cfg, iobj, self.proc)
    elif iobj.disk_template in constants.DTS_NET_MIRROR:
      # make sure the disks are not degraded (still sync-ing is ok)
      time.sleep(15)
      feedback_fn("* checking mirrors status")
      disk_abort = not _WaitForSync(self.cfg, iobj, self.proc, oneshot=True)
    else:
      disk_abort = False

    # degraded disks at this point mean the mirror never became healthy:
    # undo both the disks and the config entry added above
    if disk_abort:
      _RemoveDisks(iobj, self.cfg)
      self.cfg.RemoveInstance(iobj.name)
      raise errors.OpExecError("There are some degraded disks for"
                               " this instance")

    feedback_fn("creating os for instance %s on node %s" %
                (instance, pnode_name))

    if iobj.disk_template != constants.DT_DISKLESS:
      # NOTE(review): "sda"/"sdb" appear to be the fixed data/swap device
      # names expected by the OS scripts -- confirm against the RPC layer
      if self.op.mode == constants.INSTANCE_CREATE:
        feedback_fn("* running the instance OS create scripts...")
        if not rpc.call_instance_os_add(pnode_name, iobj, "sda", "sdb"):
          raise errors.OpExecError("could not add os for instance %s"
                                   " on node %s" %
                                   (instance, pnode_name))

      elif self.op.mode == constants.INSTANCE_IMPORT:
        feedback_fn("* running the instance OS import scripts...")
        src_node = self.op.src_node
        src_image = self.src_image
        if not rpc.call_instance_os_import(pnode_name, iobj, "sda", "sdb",
                                                src_node, src_image):
          raise errors.OpExecError("Could not import os for instance"
                                   " %s on node %s" %
                                   (instance, pnode_name))
      else:
        # also checked in the prereq part
        raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
                                     % self.op.mode)

    if self.op.start:
      logger.Info("starting instance %s on node %s" % (instance, pnode_name))
      feedback_fn("* starting instance...")
      if not rpc.call_instance_start(pnode_name, iobj, None):
        raise errors.OpExecError("Could not start instance")
3386

    
3387

    
3388
class LUConnectConsole(NoHooksLU):
  """Connect to an instance's console.

  This is somewhat special in that it returns the command line that
  you need to run on the master node in order to connect to the
  console.

  """
  _OP_REQP = ["instance_name"]

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster.

    """
    full_name = self.cfg.ExpandInstanceName(self.op.instance_name)
    found = self.cfg.GetInstanceInfo(full_name)
    if found is None:
      raise errors.OpPrereqError("Instance '%s' not known" %
                                 self.op.instance_name)
    self.instance = found

  def Exec(self, feedback_fn):
    """Connect to the console of an instance

    """
    inst = self.instance
    primary = inst.primary_node

    # make sure the instance is actually running on its primary node
    running = rpc.call_instance_list([primary])[primary]
    if running is False:
      raise errors.OpExecError("Can't connect to node %s." % primary)

    if inst.name not in running:
      raise errors.OpExecError("Instance %s is not running." % inst.name)

    logger.Debug("connecting to console of %s on %s" % (inst.name, primary))

    # ask the hypervisor for the node-local console command...
    console_cmd = hypervisor.GetHypervisor().GetShellCommandForConsole(inst)

    # ...and wrap it in an ssh invocation towards the primary node
    return self.ssh.BuildCmd(primary, "root", console_cmd, batch=True,
                             tty=True)
3432

    
3433

    
3434
class LUReplaceDisks(LogicalUnit):
  """Replace the disks of an instance.

  """
  HPATH = "mirrors-replace"
  HTYPE = constants.HTYPE_INSTANCE
  _OP_REQP = ["instance_name", "mode", "disks"]

  def _RunAllocator(self):
    """Compute a new secondary node using an IAllocator.

    Returns the name of the selected node; the same value is also
    stored in self.op.remote_node.

    """
    ial = IAllocator(self.cfg, self.sstore,
                     mode=constants.IALLOCATOR_MODE_RELOC,
                     name=self.op.instance_name,
                     relocate_from=[self.sec_node])

    ial.Run(self.op.iallocator)

    if not ial.success:
      raise errors.OpPrereqError("Can't compute nodes using"
                                 " iallocator '%s': %s" % (self.op.iallocator,
                                                           ial.info))
    if len(ial.nodes) != ial.required_nodes:
      # FIX: the format string has three placeholders but previously only
      # two arguments were supplied, turning this branch into a TypeError
      # ("not enough arguments for format string") instead of the
      # intended OpPrereqError
      raise errors.OpPrereqError("iallocator '%s' returned invalid number"
                                 " of nodes (%s), required %s" %
                                 (self.op.iallocator, len(ial.nodes),
                                  ial.required_nodes))
    self.op.remote_node = ial.nodes[0]
    logger.ToStdout("Selected new secondary for the instance: %s" %
                    self.op.remote_node)
    # FIX: CheckPrereq assigns our return value back to
    # self.op.remote_node; without an explicit return the allocator's
    # choice was clobbered with None
    return self.op.remote_node

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on the master, the primary and all the secondaries.

    """
    env = {
      "MODE": self.op.mode,
      "NEW_SECONDARY": self.op.remote_node,
      "OLD_SECONDARY": self.instance.secondary_nodes[0],
      }
    env.update(_BuildInstanceHookEnvByObject(self.instance))
    nl = [
      self.sstore.GetMasterNode(),
      self.instance.primary_node,
      ]
    if self.op.remote_node is not None:
      nl.append(self.op.remote_node)
    return env, nl, nl

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster, that its disk
    layout is network mirrored, and computes the target/other/new nodes
    for the requested replacement mode.

    """
    if not hasattr(self.op, "remote_node"):
      self.op.remote_node = None

    instance = self.cfg.GetInstanceInfo(
      self.cfg.ExpandInstanceName(self.op.instance_name))
    if instance is None:
      raise errors.OpPrereqError("Instance '%s' not known" %
                                 self.op.instance_name)
    self.instance = instance
    self.op.instance_name = instance.name

    if instance.disk_template not in constants.DTS_NET_MIRROR:
      raise errors.OpPrereqError("Instance's disk layout is not"
                                 " network mirrored.")

    if len(instance.secondary_nodes) != 1:
      raise errors.OpPrereqError("The instance has a strange layout,"
                                 " expected one secondary but found %d" %
                                 len(instance.secondary_nodes))

    self.sec_node = instance.secondary_nodes[0]

    ia_name = getattr(self.op, "iallocator", None)
    if ia_name is not None:
      if self.op.remote_node is not None:
        raise errors.OpPrereqError("Give either the iallocator or the new"
                                   " secondary, not both")
      self.op.remote_node = self._RunAllocator()

    remote_node = self.op.remote_node
    if remote_node is not None:
      remote_node = self.cfg.ExpandNodeName(remote_node)
      if remote_node is None:
        raise errors.OpPrereqError("Node '%s' not known" %
                                   self.op.remote_node)
      self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
    else:
      self.remote_node_info = None
    if remote_node == instance.primary_node:
      raise errors.OpPrereqError("The specified node is the primary node of"
                                 " the instance.")
    elif remote_node == self.sec_node:
      if self.op.mode == constants.REPLACE_DISK_SEC:
        # this is for DRBD8, where we can't execute the same mode of
        # replacement as for drbd7 (no different port allocated)
        raise errors.OpPrereqError("Same secondary given, cannot execute"
                                   " replacement")
    if instance.disk_template == constants.DT_DRBD8:
      if (self.op.mode == constants.REPLACE_DISK_ALL and
          remote_node is not None):
        # switch to replace secondary mode
        self.op.mode = constants.REPLACE_DISK_SEC

      if self.op.mode == constants.REPLACE_DISK_ALL:
        raise errors.OpPrereqError("Template 'drbd' only allows primary or"
                                   " secondary disk replacement, not"
                                   " both at once")
      elif self.op.mode == constants.REPLACE_DISK_PRI:
        if remote_node is not None:
          raise errors.OpPrereqError("Template 'drbd' does not allow changing"
                                     " the secondary while doing a primary"
                                     " node disk replacement")
        self.tgt_node = instance.primary_node
        self.oth_node = instance.secondary_nodes[0]
      elif self.op.mode == constants.REPLACE_DISK_SEC:
        self.new_node = remote_node # this can be None, in which case
                                    # we don't change the secondary
        self.tgt_node = instance.secondary_nodes[0]
        self.oth_node = instance.primary_node
      else:
        raise errors.ProgrammerError("Unhandled disk replace mode")

    for name in self.op.disks:
      if instance.FindDisk(name) is None:
        raise errors.OpPrereqError("Disk '%s' not found for instance '%s'" %
                                   (name, instance.name))
    self.op.remote_node = remote_node

  def _ExecD8DiskOnly(self, feedback_fn):
    """Replace a disk on the primary or secondary for dbrd8.

    The algorithm for replace is quite complicated:
      - for each disk to be replaced:
        - create new LVs on the target node with unique names
        - detach old LVs from the drbd device
        - rename old LVs to name_replaced.<time_t>
        - rename new LVs to old LVs
        - attach the new LVs (with the old names now) to the drbd device
      - wait for sync across all devices
      - for each modified disk:
        - remove old LVs (which have the name name_replaces.<time_t>)

    Failures are not very well handled.

    """
    steps_total = 6
    warning, info = (self.proc.LogWarning, self.proc.LogInfo)
    instance = self.instance
    iv_names = {}
    vgname = self.cfg.GetVGName()
    # start of work
    cfg = self.cfg
    tgt_node = self.tgt_node
    oth_node = self.oth_node

    # Step: check device activation
    self.proc.LogStep(1, steps_total, "check device existence")
    info("checking volume groups")
    my_vg = cfg.GetVGName()
    results = rpc.call_vg_list([oth_node, tgt_node])
    if not results:
      raise errors.OpExecError("Can't list volume groups on the nodes")
    for node in oth_node, tgt_node:
      res = results.get(node, False)
      if not res or my_vg not in res:
        raise errors.OpExecError("Volume group '%s' not found on %s" %
                                 (my_vg, node))
    for dev in instance.disks:
      if not dev.iv_name in self.op.disks:
        continue
      for node in tgt_node, oth_node:
        info("checking %s on %s" % (dev.iv_name, node))
        cfg.SetDiskID(dev, node)
        if not rpc.call_blockdev_find(node, dev):
          raise errors.OpExecError("Can't find device %s on node %s" %
                                   (dev.iv_name, node))

    # Step: check other node consistency
    self.proc.LogStep(2, steps_total, "check peer consistency")
    for dev in instance.disks:
      if not dev.iv_name in self.op.disks:
        continue
      info("checking %s consistency on %s" % (dev.iv_name, oth_node))
      if not _CheckDiskConsistency(self.cfg, dev, oth_node,
                                   oth_node==instance.primary_node):
        raise errors.OpExecError("Peer node (%s) has degraded storage, unsafe"
                                 " to replace disks on this node (%s)" %
                                 (oth_node, tgt_node))

    # Step: create new storage
    self.proc.LogStep(3, steps_total, "allocate new storage")
    for dev in instance.disks:
      if not dev.iv_name in self.op.disks:
        continue
      size = dev.size
      cfg.SetDiskID(dev, tgt_node)
      lv_names = [".%s_%s" % (dev.iv_name, suf) for suf in ["data", "meta"]]
      names = _GenerateUniqueNames(cfg, lv_names)
      lv_data = objects.Disk(dev_type=constants.LD_LV, size=size,
                             logical_id=(vgname, names[0]))
      lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
                             logical_id=(vgname, names[1]))
      new_lvs = [lv_data, lv_meta]
      old_lvs = dev.children
      iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
      info("creating new local storage on %s for %s" %
           (tgt_node, dev.iv_name))
      # since we *always* want to create this LV, we use the
      # _Create...OnPrimary (which forces the creation), even if we
      # are talking about the secondary node
      for new_lv in new_lvs:
        if not _CreateBlockDevOnPrimary(cfg, tgt_node, instance, new_lv,
                                        _GetInstanceInfoText(instance)):
          raise errors.OpExecError("Failed to create new LV named '%s' on"
                                   " node '%s'" %
                                   (new_lv.logical_id[1], tgt_node))

    # Step: for each lv, detach+rename*2+attach
    self.proc.LogStep(4, steps_total, "change drbd configuration")
    for dev, old_lvs, new_lvs in iv_names.itervalues():
      info("detaching %s drbd from local storage" % dev.iv_name)
      if not rpc.call_blockdev_removechildren(tgt_node, dev, old_lvs):
        raise errors.OpExecError("Can't detach drbd from local storage on node"
                                 " %s for device %s" % (tgt_node, dev.iv_name))
      #dev.children = []
      #cfg.Update(instance)

      # ok, we created the new LVs, so now we know we have the needed
      # storage; as such, we proceed on the target node to rename
      # old_lv to _old, and new_lv to old_lv; note that we rename LVs
      # using the assumption that logical_id == physical_id (which in
      # turn is the unique_id on that node)

      # FIXME(iustin): use a better name for the replaced LVs
      temp_suffix = int(time.time())
      ren_fn = lambda d, suff: (d.physical_id[0],
                                d.physical_id[1] + "_replaced-%s" % suff)
      # build the rename list based on what LVs exist on the node
      rlist = []
      for to_ren in old_lvs:
        find_res = rpc.call_blockdev_find(tgt_node, to_ren)
        if find_res is not None: # device exists
          rlist.append((to_ren, ren_fn(to_ren, temp_suffix)))

      info("renaming the old LVs on the target node")
      if not rpc.call_blockdev_rename(tgt_node, rlist):
        raise errors.OpExecError("Can't rename old LVs on node %s" % tgt_node)
      # now we rename the new LVs to the old LVs
      info("renaming the new LVs on the target node")
      rlist = [(new, old.physical_id) for old, new in zip(old_lvs, new_lvs)]
      if not rpc.call_blockdev_rename(tgt_node, rlist):
        raise errors.OpExecError("Can't rename new LVs on node %s" % tgt_node)

      for old, new in zip(old_lvs, new_lvs):
        new.logical_id = old.logical_id
        cfg.SetDiskID(new, tgt_node)

      for disk in old_lvs:
        disk.logical_id = ren_fn(disk, temp_suffix)
        cfg.SetDiskID(disk, tgt_node)

      # now that the new lvs have the old name, we can add them to the device
      info("adding new mirror component on %s" % tgt_node)
      if not rpc.call_blockdev_addchildren(tgt_node, dev, new_lvs):
        for new_lv in new_lvs:
          if not rpc.call_blockdev_remove(tgt_node, new_lv):
            # FIXME: the message contains an unfilled %s placeholder; it
            # is printed verbatim by LogWarning (no crash), but the
            # device name is not reported
            warning("Can't rollback device %s", hint="manually cleanup unused"
                    " logical volumes")
        raise errors.OpExecError("Can't add local storage to drbd")

      dev.children = new_lvs
      cfg.Update(instance)

    # Step: wait for sync

    # this can fail as the old devices are degraded and _WaitForSync
    # does a combined result over all disks, so we don't check its
    # return value
    self.proc.LogStep(5, steps_total, "sync devices")
    _WaitForSync(cfg, instance, self.proc, unlock=True)

    # so check manually all the devices
    for name, (dev, old_lvs, new_lvs) in iv_names.iteritems():
      cfg.SetDiskID(dev, instance.primary_node)
      is_degr = rpc.call_blockdev_find(instance.primary_node, dev)[5]
      if is_degr:
        raise errors.OpExecError("DRBD device %s is degraded!" % name)

    # Step: remove old storage
    self.proc.LogStep(6, steps_total, "removing old storage")
    for name, (dev, old_lvs, new_lvs) in iv_names.iteritems():
      info("remove logical volumes for %s" % name)
      for lv in old_lvs:
        cfg.SetDiskID(lv, tgt_node)
        if not rpc.call_blockdev_remove(tgt_node, lv):
          warning("Can't remove old LV", hint="manually remove unused LVs")
          continue

  def _ExecD8Secondary(self, feedback_fn):
    """Replace the secondary node for drbd8.

    The algorithm for replace is quite complicated:
      - for all disks of the instance:
        - create new LVs on the new node with same names
        - shutdown the drbd device on the old secondary
        - disconnect the drbd network on the primary
        - create the drbd device on the new secondary
        - network attach the drbd on the primary, using an artifice:
          the drbd code for Attach() will connect to the network if it
          finds a device which is connected to the good local disks but
          not network enabled
      - wait for sync across all devices
      - remove all disks from the old secondary

    Failures are not very well handled.

    """
    steps_total = 6
    warning, info = (self.proc.LogWarning, self.proc.LogInfo)
    instance = self.instance
    iv_names = {}
    vgname = self.cfg.GetVGName()
    # start of work
    cfg = self.cfg
    old_node = self.tgt_node
    new_node = self.new_node
    pri_node = instance.primary_node

    # Step: check device activation
    self.proc.LogStep(1, steps_total, "check device existence")
    info("checking volume groups")
    my_vg = cfg.GetVGName()
    results = rpc.call_vg_list([pri_node, new_node])
    if not results:
      raise errors.OpExecError("Can't list volume groups on the nodes")
    for node in pri_node, new_node:
      res = results.get(node, False)
      if not res or my_vg not in res:
        raise errors.OpExecError("Volume group '%s' not found on %s" %
                                 (my_vg, node))
    for dev in instance.disks:
      if not dev.iv_name in self.op.disks:
        continue
      info("checking %s on %s" % (dev.iv_name, pri_node))
      cfg.SetDiskID(dev, pri_node)
      if not rpc.call_blockdev_find(pri_node, dev):
        raise errors.OpExecError("Can't find device %s on node %s" %
                                 (dev.iv_name, pri_node))

    # Step: check other node consistency
    self.proc.LogStep(2, steps_total, "check peer consistency")
    for dev in instance.disks:
      if not dev.iv_name in self.op.disks:
        continue
      info("checking %s consistency on %s" % (dev.iv_name, pri_node))
      if not _CheckDiskConsistency(self.cfg, dev, pri_node, True, ldisk=True):
        raise errors.OpExecError("Primary node (%s) has degraded storage,"
                                 " unsafe to replace the secondary" %
                                 pri_node)

    # Step: create new storage
    self.proc.LogStep(3, steps_total, "allocate new storage")
    for dev in instance.disks:
      info("adding new local storage on %s for %s" % (new_node, dev.iv_name))
      # since we *always* want to create this LV, we use the
      # _Create...OnPrimary (which forces the creation), even if we
      # are talking about the secondary node
      for new_lv in dev.children:
        if not _CreateBlockDevOnPrimary(cfg, new_node, instance, new_lv,
                                        _GetInstanceInfoText(instance)):
          raise errors.OpExecError("Failed to create new LV named '%s' on"
                                   " node '%s'" %
                                   (new_lv.logical_id[1], new_node))

      iv_names[dev.iv_name] = (dev, dev.children)

    self.proc.LogStep(4, steps_total, "changing drbd configuration")
    for dev in instance.disks:
      info("activating a new drbd on %s for %s" % (new_node, dev.iv_name))
      # create new devices on new_node
      new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
                              logical_id=(pri_node, new_node,
                                          dev.logical_id[2]),
                              children=dev.children)
      if not _CreateBlockDevOnSecondary(cfg, new_node, instance,
                                        new_drbd, False,
                                      _GetInstanceInfoText(instance)):
        raise errors.OpExecError("Failed to create new DRBD on"
                                 " node '%s'" % new_node)

    for dev in instance.disks:
      # we have new devices, shutdown the drbd on the old secondary
      info("shutting down drbd for %s on old node" % dev.iv_name)
      cfg.SetDiskID(dev, old_node)
      if not rpc.call_blockdev_shutdown(old_node, dev):
        warning("Failed to shutdown drbd for %s on old node" % dev.iv_name,
                hint="Please cleanup this device manually as soon as possible")

    info("detaching primary drbds from the network (=> standalone)")
    done = 0
    for dev in instance.disks:
      cfg.SetDiskID(dev, pri_node)
      # set the physical (unique in bdev terms) id to None, meaning
      # detach from network
      dev.physical_id = (None,) * len(dev.physical_id)
      # and 'find' the device, which will 'fix' it to match the
      # standalone state
      if rpc.call_blockdev_find(pri_node, dev):
        done += 1
      else:
        warning("Failed to detach drbd %s from network, unusual case" %
                dev.iv_name)

    if not done:
      # no detaches succeeded (very unlikely)
      raise errors.OpExecError("Can't detach at least one DRBD from old node")

    # if we managed to detach at least one, we update all the disks of
    # the instance to point to the new secondary
    info("updating instance configuration")
    for dev in instance.disks:
      dev.logical_id = (pri_node, new_node) + dev.logical_id[2:]
      cfg.SetDiskID(dev, pri_node)
    cfg.Update(instance)

    # and now perform the drbd attach
    info("attaching primary drbds to new secondary (standalone => connected)")
    for dev in instance.disks:
      info("attaching primary drbd for %s to new secondary node" % dev.iv_name)
      # since the attach is smart, it's enough to 'find' the device,
      # it will automatically activate the network, if the physical_id
      # is correct
      cfg.SetDiskID(dev, pri_node)
      if not rpc.call_blockdev_find(pri_node, dev):
        warning("can't attach drbd %s to new secondary!" % dev.iv_name,
                "please do a gnt-instance info to see the status of disks")

    # this can fail as the old devices are degraded and _WaitForSync
    # does a combined result over all disks, so we don't check its
    # return value
    self.proc.LogStep(5, steps_total, "sync devices")
    _WaitForSync(cfg, instance, self.proc, unlock=True)

    # so check manually all the devices
    for name, (dev, old_lvs) in iv_names.iteritems():
      cfg.SetDiskID(dev, pri_node)
      is_degr = rpc.call_blockdev_find(pri_node, dev)[5]
      if is_degr:
        raise errors.OpExecError("DRBD device %s is degraded!" % name)

    self.proc.LogStep(6, steps_total, "removing old storage")
    for name, (dev, old_lvs) in iv_names.iteritems():
      info("remove logical volumes for %s" % name)
      for lv in old_lvs:
        cfg.SetDiskID(lv, old_node)
        if not rpc.call_blockdev_remove(old_node, lv):
          warning("Can't remove LV on old secondary",
                  hint="Cleanup stale volumes by hand")

  def Exec(self, feedback_fn):
    """Execute disk replacement.

    This dispatches the disk replacement to the appropriate handler.

    """
    instance = self.instance
    if instance.disk_template == constants.DT_DRBD8:
      if self.op.remote_node is None:
        fn = self._ExecD8DiskOnly
      else:
        fn = self._ExecD8Secondary
    else:
      raise errors.ProgrammerError("Unhandled disk replacement case")
    return fn(feedback_fn)
3918

    
3919

    
3920
class LUQueryInstanceData(NoHooksLU):
3921
  """Query runtime instance data.
3922

3923
  """
3924
  _OP_REQP = ["instances"]
3925

    
3926
  def CheckPrereq(self):
3927
    """Check prerequisites.
3928

3929
    This only checks the optional instance list against the existing names.
3930

3931
    """
3932
    if not isinstance(self.op.instances, list):
3933
      raise errors.OpPrereqError("Invalid argument type 'instances'")
3934
    if self.op.instances:
3935
      self.wanted_instances = []
3936
      names = self.op.instances
3937
      for name in names:
3938
        instance = self.cfg.GetInstanceInfo(self.cfg.ExpandInstanceName(name))
3939
        if instance is None:
3940
          raise errors.OpPrereqError("No such instance name '%s'" % name)
3941
        self.wanted_instances.append(instance)
3942
    else:
3943
      self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
3944
                               in self.cfg.GetInstanceList()]
3945
    return
3946

    
3947

    
3948
  def _ComputeDiskStatus(self, instance, snode, dev):
    """Compute block device status.

    Queries the device on the instance's primary node and, when a
    secondary node applies, on that node too, then recursively gathers
    the same information for all child devices.

    """
    pnode = instance.primary_node
    self.cfg.SetDiskID(dev, pnode)
    dev_pstatus = rpc.call_blockdev_find(pnode, dev)
    if dev.dev_type in constants.LDS_DRBD:
      # for DRBD devices the secondary is encoded in the logical_id pair,
      # so override the snode passed in with the peer of the primary
      if dev.logical_id[0] == pnode:
        snode = dev.logical_id[1]
      else:
        snode = dev.logical_id[0]

    dev_sstatus = None
    if snode:
      self.cfg.SetDiskID(dev, snode)
      dev_sstatus = rpc.call_blockdev_find(snode, dev)

    dev_children = []
    if dev.children:
      # recurse with the (possibly updated) secondary node
      dev_children = [self._ComputeDiskStatus(instance, snode, child)
                      for child in dev.children]

    return {
      "iv_name": dev.iv_name,
      "dev_type": dev.dev_type,
      "logical_id": dev.logical_id,
      "physical_id": dev.physical_id,
      "pstatus": dev_pstatus,
      "sstatus": dev_sstatus,
      "children": dev_children,
      }
3984

    
3985
  def Exec(self, feedback_fn):
3986
    """Gather and return data"""
3987
    result = {}
3988
    for instance in self.wanted_instances:
3989
      remote_info = rpc.call_instance_info(instance.primary_node,
3990
                                                instance.name)