Statistics
| Branch: | Tag: | Revision:

root / lib / cmdlib.py @ 95f84636

History | View | Annotate | Download (341.1 kB)

1
#
2
#
3

    
4
# Copyright (C) 2006, 2007, 2008 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Module implementing the master-side code."""
23

    
24
# pylint: disable-msg=W0201
25

    
26
# W0201 since most LU attributes are defined in CheckPrereq or similar
27
# functions
28

    
29
import os
30
import os.path
31
import time
32
import re
33
import platform
34
import logging
35
import copy
36
import OpenSSL
37

    
38
from ganeti import ssh
39
from ganeti import utils
40
from ganeti import errors
41
from ganeti import hypervisor
42
from ganeti import locking
43
from ganeti import constants
44
from ganeti import objects
45
from ganeti import serializer
46
from ganeti import ssconf
47
from ganeti import uidpool
48
from ganeti import compat
49

    
50

    
51
class LogicalUnit(object):
  """Base class for all logical units (LUs).

  Every concrete subclass is expected to:
    - provide ExpandNames
    - provide CheckPrereq (not needed when tasklets are used)
    - provide Exec (not needed when tasklets are used)
    - provide BuildHooksEnv
    - override HPATH and HTYPE
    - optionally override the run requirements:
        REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively

  All commands require root permissions.

  @ivar dry_run_result: value (if any) handed back to the caller when the
      opcode runs in dry-run mode (signalled by the opcode's dry_run
      parameter)

  """
  HPATH = None
  HTYPE = None
  _OP_REQP = []
  REQ_BGL = True

  def __init__(self, processor, op, context, rpc):
    """Set up the common LU state and validate the opcode.

    Every attribute named in _OP_REQP must be present on the opcode with
    a value other than None; afterwards CheckArguments is invoked. Derived
    classes override this (or CheckArguments) to check op validity.

    """
    self.proc = processor
    self.op = op
    self.cfg = context.cfg
    self.context = context
    self.rpc = rpc

    # Dicts through which the LU tells mcpu what it needs locked
    self.needed_locks = None
    self.acquired_locks = {}
    self.share_locks = dict((level, 0) for level in locking.LEVELS)
    self.add_locks = {}
    self.remove_locks = {}
    # Filled in by LUs that want _LockInstancesNodes to compute node locks
    self.recalculate_locks = {}
    self.__ssh = None

    # Logging shortcuts borrowed from the processor
    self.LogWarning = processor.LogWarning # pylint: disable-msg=C0103
    self.LogInfo = processor.LogInfo # pylint: disable-msg=C0103
    self.LogStep = processor.LogStep # pylint: disable-msg=C0103

    # Dry-run support
    self.dry_run_result = None

    # Generic debug attribute: a missing or non-integer value becomes 0
    if not isinstance(getattr(self.op, "debug_level", None), int):
      self.op.debug_level = 0

    # Tasklets
    self.tasklets = None

    for required in self._OP_REQP:
      if getattr(op, required, None) is None:
        raise errors.OpPrereqError("Required parameter '%s' missing" %
                                   required, errors.ECODE_INVAL)

    self.CheckArguments()

  def __GetSSH(self):
    """Return the SshRunner object, creating it on first use.

    """
    runner = self.__ssh
    if not runner:
      runner = ssh.SshRunner(self.cfg.GetClusterName())
      self.__ssh = runner
    return runner

  ssh = property(fget=__GetSSH)

  def CheckArguments(self):
    """Check the syntactic validity of the opcode arguments.

    This hook is meant for simple syntactic checks of the opcode
    parameters that need no cluster-related information. The same could
    be done in ExpandNames and/or CheckPrereq, but keeping it separate is
    better because:

      - ExpandNames stays a purely lock-related function
      - CheckPrereq only runs after the locks have been acquired (and
        possibly waited for)

    The method may modify self.op so that later methods no longer have to
    worry about missing parameters. The default implementation does
    nothing.

    """

  def ExpandNames(self):
    """Expand names for this LU.

    Called before the opcode starts executing; it should bring all opcode
    parameters to their canonical form (e.g. a short node name must be
    fully expanded once this method has completed successfully), so that
    locking, hooks, logging, etc. can work correctly.

    LUs implementing this method must also fill in self.needed_locks, a
    dict mapping lock levels to the list of lock names needed at that
    level. Rules:

      - use an empty dict if no lock is needed at all
      - omit a level if no lock is needed at that level
      - never put anything for the BGL level
      - use locking.ALL_SET as the value to request all locks at a level

    To share locks (instead of acquiring them exclusively) at some level,
    set a true value (usually 1) for that level in self.share_locks. Locks
    are not shared by default.

    This function may also define a list of tasklets, which are then
    executed in order instead of the usual LU-level CheckPrereq and Exec
    functions, if those are not defined by the LU.

    Examples::

      # Acquire all nodes and one instance
      self.needed_locks = {
        locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: ['instance1.example.tld'],
      }
      # Acquire just two nodes
      self.needed_locks = {
        locking.LEVEL_NODE: ['node1.example.tld', 'node2.example.tld'],
      }
      # Acquire no locks
      self.needed_locks = {} # No, you can't leave it to the default value None

    """
    # Implementing this method is mandatory only for concurrent LUs, so
    # that old LUs don't all have to be changed at the same time.
    if not self.REQ_BGL:
      raise NotImplementedError
    self.needed_locks = {} # Exclusive LUs don't need locks.

  def DeclareLocks(self, level):
    """Declare the LU's locking needs for one level.

    Most LUs can declare their locking needs at ExpandNames time, but
    sometimes some locks can only be computed after earlier ones have been
    acquired. This function is called just before the locks of a given
    level are acquired, but after those at lower levels have been, and
    allows such computations. It may modify self.needed_locks; by default
    it does nothing.

    It is only called if something is already set in self.needed_locks
    for the level.

    @param level: Locking level which is going to be locked
    @type level: member of ganeti.locking.LEVELS

    """

  def CheckPrereq(self):
    """Check prerequisites for this LU.

    The method should verify that everything needed to execute this LU is
    in place. Internode communication is allowed, but the check should be
    idempotent - no cluster or system changes are allowed.

    errors.OpPrereqError should be raised if a prerequisite is not
    fulfilled. The return value is ignored.

    The method should also bring all opcode parameters to their canonical
    form if ExpandNames has not done so already.

    Without an override, the prerequisites of every tasklet are checked in
    order; an LU with no tasklets must override this method.

    """
    tasklets = self.tasklets
    if tasklets is None:
      raise NotImplementedError

    for (pos, tasklet) in enumerate(tasklets):
      logging.debug("Checking prerequisites for tasklet %s/%s",
                    pos + 1, len(tasklets))
      tasklet.CheckPrereq()

  def Exec(self, feedback_fn):
    """Execute the LU.

    This is where the actual work is done. errors.OpExecError should be
    raised for failures that are somewhat dealt with in code, or expected.

    Without an override, every tasklet is executed in order; an LU with no
    tasklets must override this method.

    """
    tasklets = self.tasklets
    if tasklets is None:
      raise NotImplementedError

    for (pos, tasklet) in enumerate(tasklets):
      logging.debug("Executing tasklet %s/%s", pos + 1, len(tasklets))
      tasklet.Exec(feedback_fn)

  def BuildHooksEnv(self):
    """Build the hooks environment for this LU.

    The method should return a three-element tuple made of: a dict with
    the environment to be used when running the specific hook for this LU,
    a list of node names on which the hook should run before the
    execution, and a list of node names on which the hook should run after
    the execution.

    The dict keys must not carry the 'GANETI_' prefix, as the hooks runner
    takes care of that. The hooks runner also adds further keys. An LU
    that defines no environment should return an empty dict (and not
    None).

    "No nodes" should be returned as an empty list (and not None).

    If the HPATH of an LU class is None, this function will not be called.

    """
    raise NotImplementedError

  def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
    """Notify the LU about the results of its hooks.

    Called each time a hooks phase has been executed, so that the LU can
    alter its result based on what the hooks returned. The default
    implementation does nothing and hands the previous result back
    unchanged; any LU can override it if it wants to use the local cluster
    hook-scripts somehow.

    @param phase: one of L{constants.HOOKS_PHASE_POST} or
        L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
    @param hook_results: the results of the multi-node hooks rpc call
    @param feedback_fn: function used to send feedback back to the caller
    @param lu_result: the previous Exec result this LU had, or None
        in the PRE phase
    @return: the new Exec result, based on the previous result
        and hook results

    """
    # The API must be kept, hence the unused-argument and
    # could-be-a-function warnings are silenced
    # pylint: disable-msg=W0613,R0201
    return lu_result

  def _ExpandAndLockInstance(self):
    """Helper function to expand and lock an instance.

    Many LUs working on an instance receive its name in
    self.op.instance_name and must expand it, then declare the expanded
    name for locking. This helper does both and stores the expanded name
    back in self.op.instance_name. needed_locks is initialized to a dict
    if that has not happened before.

    """
    if self.needed_locks is not None:
      assert locking.LEVEL_INSTANCE not in self.needed_locks, \
        "_ExpandAndLockInstance called with instance-level locks set"
    else:
      self.needed_locks = {}

    full_name = _ExpandInstanceName(self.cfg, self.op.instance_name)
    self.op.instance_name = full_name
    self.needed_locks[locking.LEVEL_INSTANCE] = full_name

  def _LockInstancesNodes(self, primary_only=False):
    """Helper function to declare instances' nodes for locking.

    To be called after one or more instances have been locked, in order to
    lock their nodes. It fills self.needed_locks[locking.LEVEL_NODE] with
    the primary nodes (and, unless primary_only is set, the secondary
    nodes) of the instances listed in
    self.acquired_locks[locking.LEVEL_INSTANCE].

    It should be called from DeclareLocks, and for safety only works if
    self.recalculate_locks[locking.LEVEL_NODE] is set.

    In the future it may grow parameters to just lock some instance's
    nodes, or to just lock primaries or secondary nodes, if needed.

    It should be called in DeclareLocks in a way similar to::

      if level == locking.LEVEL_NODE:
        self._LockInstancesNodes()

    @type primary_only: boolean
    @param primary_only: only lock primary nodes of locked instances

    """
    assert locking.LEVEL_NODE in self.recalculate_locks, \
      "_LockInstancesNodes helper function called with no nodes to recalculate"

    # TODO: check if we're really been called with the instance locks held

    # For now self.needed_locks[locking.LEVEL_NODE] is replaced or extended,
    # but in the future different behaviors might be wanted depending on the
    # value of self.recalculate_locks[locking.LEVEL_NODE]
    node_names = []
    for inst_name in self.acquired_locks[locking.LEVEL_INSTANCE]:
      inst = self.context.cfg.GetInstanceInfo(inst_name)
      node_names.append(inst.primary_node)
      if primary_only:
        continue
      node_names.extend(inst.secondary_nodes)

    mode = self.recalculate_locks[locking.LEVEL_NODE]
    if mode == constants.LOCKS_REPLACE:
      self.needed_locks[locking.LEVEL_NODE] = node_names
    elif mode == constants.LOCKS_APPEND:
      self.needed_locks[locking.LEVEL_NODE].extend(node_names)

    del self.recalculate_locks[locking.LEVEL_NODE]
358

    
359

    
360
class NoHooksLU(LogicalUnit): # pylint: disable-msg=W0223
  """Simple LU which runs no hooks.

  This LU is intended as a parent for other LogicalUnits which will
  run no hooks, in order to reduce duplicate code.

  """
  HPATH = None
  HTYPE = None

  def BuildHooksEnv(self):
    """Empty BuildHooksEnv for NoHooksLu.

    This just raises an error, since an LU without hooks must never be
    asked for a hooks environment.

    @raise AssertionError: always

    """
    # Raised explicitly instead of using "assert False": assert statements
    # are removed when Python runs with -O, which would turn this method
    # into a silent "return None".
    raise AssertionError("BuildHooksEnv called for NoHooksLUs")
377

    
378

    
379
class Tasklet:
  """Base class for tasklets.

  A tasklet is a subcomponent of an LU. An LU may consist entirely of
  tasklets, or mix legacy code with tasklets. All locking has to be done
  by the LU; tasklets know nothing about locks.

  Subclasses must:
    - implement CheckPrereq
    - implement Exec

  """
  def __init__(self, lu):
    """Remember the owning LU and copy its cfg and rpc attributes.

    """
    self.lu = lu

    # Shortcuts to the LU's configuration and RPC objects
    self.cfg, self.rpc = lu.cfg, lu.rpc

  def CheckPrereq(self):
    """Check prerequisites for this tasklet.

    The method should verify that everything needed to execute this
    tasklet is in place. Internode communication is allowed, but the check
    should be idempotent - no cluster or system changes are allowed.

    errors.OpPrereqError should be raised if a prerequisite is not
    fulfilled. The return value is ignored.

    The method should also bring all parameters to their canonical form if
    that has not been done before.

    """
    raise NotImplementedError

  def Exec(self, feedback_fn):
    """Execute the tasklet.

    This is where the actual work is done. errors.OpExecError should be
    raised for failures that are somewhat dealt with in code, or expected.

    """
    raise NotImplementedError
423

    
424

    
425
def _GetWantedNodes(lu, nodes):
  """Expand a list of node names and return it sorted.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nodes: list
  @param nodes: non-empty list of node names to expand
  @rtype: list
  @return: the expanded node names, ordered by L{utils.NiceSort}
  @raise errors.OpPrereqError: if the nodes parameter is not a list
  @raise errors.ProgrammerError: if the nodes parameter is an empty list

  """
  if not isinstance(nodes, list):
    raise errors.OpPrereqError("Invalid argument type 'nodes'",
                               errors.ECODE_INVAL)

  if len(nodes) == 0:
    raise errors.ProgrammerError("_GetWantedNodes should only be called with a"
      " non-empty list of nodes whose name is to be expanded.")

  expanded = []
  for short_name in nodes:
    expanded.append(_ExpandNodeName(lu.cfg, short_name))
  return utils.NiceSort(expanded)
447

    
448

    
449
def _GetWantedInstances(lu, instances):
  """Expand a list of instance names, or return all instance names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instances: list
  @param instances: list of instance names; an empty list selects all
      instances known to the configuration
  @rtype: list
  @return: the expanded names in the order they were given, or, for an
      empty input list, all instance names ordered by L{utils.NiceSort}
  @raise errors.OpPrereqError: if the instances parameter is not a list
  @raise errors.OpPrereqError: if any of the passed instances is not found

  """
  if not isinstance(instances, list):
    raise errors.OpPrereqError("Invalid argument type 'instances'",
                               errors.ECODE_INVAL)

  if not instances:
    # Nothing selected explicitly: fall back to every configured instance
    return utils.NiceSort(lu.cfg.GetInstanceList())

  return [_ExpandInstanceName(lu.cfg, short_name) for short_name in instances]
471

    
472

    
473
def _CheckOutputFields(static, dynamic, selected):
  """Verify that every selected field is a known static or dynamic field.

  @type static: L{utils.FieldSet}
  @param static: static fields set
  @type dynamic: L{utils.FieldSet}
  @param dynamic: dynamic fields set
  @param selected: the field names to check
  @raise errors.OpPrereqError: if any selected field matches neither set

  """
  known = utils.FieldSet()
  for fieldset in (static, dynamic):
    known.Extend(fieldset)

  unknown = known.NonMatching(selected)
  if not unknown:
    return

  raise errors.OpPrereqError("Unknown output fields selected: %s"
                             % ",".join(unknown), errors.ECODE_INVAL)
490

    
491

    
492
def _CheckBooleanOpField(op, name):
  """Validate a boolean opcode parameter.

  The parameter must be either a boolean or None. On success the
  attribute is (re)set on the opcode, so it always exists afterwards; a
  missing attribute ends up as None.

  @param op: the opcode object to check and update
  @param name: name of the attribute to validate
  @raise errors.OpPrereqError: if the value is neither None nor a boolean

  """
  current = getattr(op, name, None)
  if current is not None and not isinstance(current, bool):
    raise errors.OpPrereqError("Invalid boolean parameter '%s' (%s)" %
                               (name, str(current)), errors.ECODE_INVAL)
  setattr(op, name, current)
504

    
505

    
506
def _CheckGlobalHvParams(params):
  """Reject hypervisor parameters that are global ones.

  This makes sure that instances don't get customised versions of global
  parameters.

  @param params: the hypervisor parameter names to check (any iterable
      accepted by constants.HVC_GLOBALS.intersection)
  @raise errors.OpPrereqError: if any name is in constants.HVC_GLOBALS

  """
  offending = constants.HVC_GLOBALS.intersection(params)
  if not offending:
    return

  raise errors.OpPrereqError("The following hypervisor parameters are global"
                             " and cannot be customized at instance level,"
                             " please modify them at cluster level: %s" %
                             utils.CommaJoin(offending), errors.ECODE_INVAL)
519

    
520

    
521
def _CheckNodeOnline(lu, node):
  """Make sure the given node is not marked offline.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is offline

  """
  node_info = lu.cfg.GetNodeInfo(node)
  if node_info.offline:
    raise errors.OpPrereqError("Can't use offline node %s" % node,
                               errors.ECODE_INVAL)
532

    
533

    
534
def _CheckNodeNotDrained(lu, node):
  """Make sure the given node is not marked drained.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is drained

  """
  node_info = lu.cfg.GetNodeInfo(node)
  if node_info.drained:
    raise errors.OpPrereqError("Can't use drained node %s" % node,
                               errors.ECODE_INVAL)
545

    
546

    
547
def _CheckNodeHasOS(lu, node, os_name, force_variant):
  """Make sure a node supports the given OS.

  The node is queried through the os_get RPC call; unless force_variant
  is set, the OS name is also checked against the variants of the
  returned OS object.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @param os_name: the OS to query about
  @param force_variant: whether to ignore variant errors
  @raise errors.OpPrereqError: if the node is not supporting the OS

  """
  rpc_result = lu.rpc.call_os_get(node, os_name)
  failure_msg = ("OS '%s' not in supported OS list for node %s" %
                 (os_name, node))
  rpc_result.Raise(failure_msg, prereq=True, ecode=errors.ECODE_INVAL)

  if force_variant:
    return
  _CheckOSVariant(rpc_result.payload, os_name)
563

    
564

    
565
def _RequireFileStorage():
  """Refuse to continue unless file storage is enabled.

  @raise errors.OpPrereqError: when file storage is disabled

  """
  if constants.ENABLE_FILE_STORAGE:
    return

  raise errors.OpPrereqError("File storage disabled at configure time",
                             errors.ECODE_INVAL)
574

    
575

    
576
def _CheckDiskTemplate(template):
  """Make sure the given disk template is valid and usable.

  @param template: the disk template name to check
  @raise errors.OpPrereqError: if the name is not in
      constants.DISK_TEMPLATES, or if it is the file template while file
      storage is disabled

  """
  if template not in constants.DISK_TEMPLATES:
    valid_names = utils.CommaJoin(constants.DISK_TEMPLATES)
    raise errors.OpPrereqError("Invalid disk template name '%s', valid"
                               " templates are: %s" % (template, valid_names),
                               errors.ECODE_INVAL)

  # The file template additionally needs file storage to be enabled
  if template == constants.DT_FILE:
    _RequireFileStorage()
586

    
587

    
588
def _CheckStorageType(storage_type):
  """Make sure the given storage type is valid and usable.

  @param storage_type: the storage type to check
  @raise errors.OpPrereqError: if the type is not in
      constants.VALID_STORAGE_TYPES, or if it is the file type while file
      storage is disabled

  """
  if storage_type not in constants.VALID_STORAGE_TYPES:
    err_msg = "Unknown storage type: %s" % storage_type
    raise errors.OpPrereqError(err_msg, errors.ECODE_INVAL)

  # The file storage type additionally needs file storage to be enabled
  if storage_type == constants.ST_FILE:
    _RequireFileStorage()
597

    
598

    
599

    
600
def _CheckInstanceDown(lu, instance, reason):
  """Make sure an instance is neither marked up nor actually running.

  @param lu: the LU on behalf of which we make the check
  @param instance: the instance object to check
  @param reason: text appended to the error message
  @raise errors.OpPrereqError: if the instance is marked up, if its
      primary node reports it in its instance list, or if that list cannot
      be retrieved

  """
  inst_name = instance.name
  if instance.admin_up:
    raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
                               (inst_name, reason), errors.ECODE_STATE)

  # The configuration says "down"; confirm it with the primary node
  primary = instance.primary_node
  all_results = lu.rpc.call_instance_list([primary], [instance.hypervisor])
  node_result = all_results[primary]
  node_result.Raise("Can't contact node %s for instance information" % primary,
                    prereq=True, ecode=errors.ECODE_ENVIRON)

  if inst_name in node_result.payload:
    raise errors.OpPrereqError("Instance %s is running, %s" %
                               (inst_name, reason), errors.ECODE_STATE)
614

    
615

    
616
def _ExpandItemName(fn, name, kind):
  """Resolve an item name to its full form.

  @param fn: the function to use for expansion; it is called with the
      name and a None result is treated as "not found"
  @param name: requested item name
  @param kind: text description ('Node' or 'Instance')
  @return: the resolved (full) name
  @raise errors.OpPrereqError: if the item is not found

  """
  resolved = fn(name)
  if resolved is not None:
    return resolved

  raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
                             errors.ECODE_NOENT)
631

    
632

    
633
def _ExpandNodeName(cfg, name):
  """Expand a node name via L{_ExpandItemName} and cfg.ExpandNodeName."""
  expander = cfg.ExpandNodeName
  return _ExpandItemName(expander, name, "Node")
636

    
637

    
638
def _ExpandInstanceName(cfg, name):
  """Expand an instance name via L{_ExpandItemName} and cfg.ExpandInstanceName."""
  expander = cfg.ExpandInstanceName
  return _ExpandItemName(expander, name, "Instance")
641

    
642

    
643
def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
                          memory, vcpus, nics, disk_template, disks,
                          bep, hvp, hypervisor_name):
  """Build the instance-related hook environment from individual values.

  @type name: string
  @param name: the name of the instance
  @type primary_node: string
  @param primary_node: the name of the instance's primary node
  @type secondary_nodes: list
  @param secondary_nodes: list of secondary nodes as strings
  @type os_type: string
  @param os_type: the name of the instance's OS
  @type status: boolean
  @param status: the should_run status of the instance
  @type memory: string
  @param memory: the memory size of the instance
  @type vcpus: string
  @param vcpus: the count of VCPUs the instance has
  @type nics: list
  @param nics: list of tuples (ip, mac, mode, link) representing
      the NICs the instance has
  @type disk_template: string
  @param disk_template: the disk template of the instance
  @type disks: list
  @param disks: the list of (size, mode) pairs
  @type bep: dict
  @param bep: the backend parameters for the instance
  @type hvp: dict
  @param hvp: the hypervisor parameters for the instance
  @type hypervisor_name: string
  @param hypervisor_name: the hypervisor for the instance
  @rtype: dict
  @return: the hook environment for this instance

  """
  str_status = "down"
  if status:
    str_status = "up"

  env = {
    "OP_TARGET": name,
    "INSTANCE_NAME": name,
    "INSTANCE_PRIMARY": primary_node,
    "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
    "INSTANCE_OS_TYPE": os_type,
    "INSTANCE_STATUS": str_status,
    "INSTANCE_MEMORY": memory,
    "INSTANCE_VCPUS": vcpus,
    "INSTANCE_DISK_TEMPLATE": disk_template,
    "INSTANCE_HYPERVISOR": hypervisor_name,
  }

  # One group of variables per NIC; an empty or None list gives a count of 0
  nic_count = 0
  if nics:
    nic_count = len(nics)
    for nic_idx, (nic_ip, nic_mac, nic_mode, nic_link) in enumerate(nics):
      if nic_ip is None:
        nic_ip = ""
      env["INSTANCE_NIC%d_IP" % nic_idx] = nic_ip
      env["INSTANCE_NIC%d_MAC" % nic_idx] = nic_mac
      env["INSTANCE_NIC%d_MODE" % nic_idx] = nic_mode
      env["INSTANCE_NIC%d_LINK" % nic_idx] = nic_link
      if nic_mode == constants.NIC_MODE_BRIDGED:
        # For bridged NICs the link is also exported as the bridge
        env["INSTANCE_NIC%d_BRIDGE" % nic_idx] = nic_link
  env["INSTANCE_NIC_COUNT"] = nic_count

  # One group of variables per disk, handled like the NICs above
  disk_count = 0
  if disks:
    disk_count = len(disks)
    for disk_idx, (disk_size, disk_mode) in enumerate(disks):
      env["INSTANCE_DISK%d_SIZE" % disk_idx] = disk_size
      env["INSTANCE_DISK%d_MODE" % disk_idx] = disk_mode
  env["INSTANCE_DISK_COUNT"] = disk_count

  # Backend and hypervisor parameters, each under its own prefix
  for (prefix, params) in (("BE", bep), ("HV", hvp)):
    for (param_name, param_value) in params.items():
      env["INSTANCE_%s_%s" % (prefix, param_name)] = param_value

  return env
729

    
730

    
731
def _NICListToTuple(lu, nics):
  """Convert NIC objects into (ip, mac, mode, link) tuples.

  The resulting list is suitable to be passed to _BuildInstanceHookEnv or
  used as a return value in LUQueryInstanceData. Mode and link are taken
  from the NIC's own parameters filled with the cluster's default NIC
  parameters.

  @type lu:  L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nics: list of L{objects.NIC}
  @param nics: list of nics to convert to hooks tuples
  @rtype: list
  @return: one (ip, mac, mode, link) tuple per NIC, in input order

  """
  cluster_defaults = lu.cfg.GetClusterInfo().nicparams[constants.PP_DEFAULT]

  nic_tuples = []
  for nic in nics:
    nic_ip, nic_mac = nic.ip, nic.mac
    effective = objects.FillDict(cluster_defaults, nic.nicparams)
    nic_tuples.append((nic_ip, nic_mac,
                       effective[constants.NIC_MODE],
                       effective[constants.NIC_LINK]))
  return nic_tuples
753

    
754

    
755
def _BuildInstanceHookEnvByObject(lu, instance, override=None):
  """Build the instance-related hook environment from an instance object.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instance: L{objects.Instance}
  @param instance: the instance for which we should build the
      environment
  @type override: dict
  @param override: dictionary with key/values that will override
      our values; the keys are L{_BuildInstanceHookEnv} argument names
  @rtype: dict
  @return: the hook environment dictionary

  """
  cluster = lu.cfg.GetClusterInfo()
  bep = cluster.FillBE(instance)
  hvp = cluster.FillHV(instance)

  # Keyword arguments for _BuildInstanceHookEnv, derived from the object
  hook_args = dict(
    name=instance.name,
    primary_node=instance.primary_node,
    secondary_nodes=instance.secondary_nodes,
    os_type=instance.os,
    status=instance.admin_up,
    memory=bep[constants.BE_MEMORY],
    vcpus=bep[constants.BE_VCPUS],
    nics=_NICListToTuple(lu, instance.nics),
    disk_template=instance.disk_template,
    disks=[(disk.size, disk.mode) for disk in instance.disks],
    bep=bep,
    hvp=hvp,
    hypervisor_name=instance.hypervisor,
  )
  if override:
    hook_args.update(override)

  return _BuildInstanceHookEnv(**hook_args) # pylint: disable-msg=W0142
791

    
792

    
793
def _AdjustCandidatePool(lu, exceptions):
  """Adjust the candidate pool after node operations.

  Asks the configuration to maintain the candidate pool, logs and
  re-adds (via lu.context.ReaddNode) every node it reports as modified,
  and logs a note if there are more candidates than desired.

  @param lu: the LU on behalf of which we make the adjustment
  @param exceptions: passed unchanged to cfg.MaintainCandidatePool and
      cfg.GetMasterCandidateStats

  """
  promoted = lu.cfg.MaintainCandidatePool(exceptions)
  if promoted:
    lu.LogInfo("Promoted nodes to master candidate role: %s",
               utils.CommaJoin(node.name for node in promoted))
    for node in promoted:
      lu.context.ReaddNode(node)

  (mc_now, mc_max, _) = lu.cfg.GetMasterCandidateStats(exceptions)
  if mc_now > mc_max:
    lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
               (mc_now, mc_max))
807

    
808

    
809
def _DecideSelfPromotion(lu, exceptions=None):
  """Decide whether I should promote myself as a master candidate.

  @param lu: the LU on behalf of which we decide
  @param exceptions: passed unchanged to cfg.GetMasterCandidateStats
  @rtype: boolean
  @return: True if the current number of candidates is below the desired
      number once the new node is accounted for

  """
  pool_size = lu.cfg.GetClusterInfo().candidate_pool_size
  (current, desired, _) = lu.cfg.GetMasterCandidateStats(exceptions)
  # The new node raises the desired count by one, capped at the pool size
  target = min(desired + 1, pool_size)
  return current < target
818

    
819

    
820
def _CheckNicsBridgesExist(lu, target_nics, target_node,
                               profile=constants.PP_DEFAULT):
  """Check that the bridges needed by a list of nics exist.

  The links of all NICs whose filled parameters select bridged mode are
  collected and, if there are any, verified on the target node through the
  bridges_exist RPC call.

  @param lu: the LU on behalf of which we make the check
  @param target_nics: the NIC objects to check
  @param target_node: the node on which the bridges must exist
  @param profile: key of the cluster nicparams used to fill the NIC
      parameters
  @raise errors.OpPrereqError: if the RPC call reports a failure

  """
  cluster_defaults = lu.cfg.GetClusterInfo().nicparams[profile]

  bridges = []
  for nic in target_nics:
    filled = objects.FillDict(cluster_defaults, nic.nicparams)
    if filled[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
      bridges.append(filled[constants.NIC_LINK])

  if not bridges:
    return

  rpc_result = lu.rpc.call_bridges_exist(target_node, bridges)
  rpc_result.Raise("Error checking bridges on destination node '%s'" %
                   target_node, prereq=True, ecode=errors.ECODE_ENVIRON)
834

    
835

    
836
def _CheckInstanceBridgesExist(lu, instance, node=None):
  """Check that the bridges needed by an instance exist.

  @param lu: the LU on behalf of which we make the check
  @param instance: the instance whose NICs are checked
  @param node: the node to check on; defaults to the instance's primary
      node when None

  """
  target_node = node
  if target_node is None:
    target_node = instance.primary_node
  _CheckNicsBridgesExist(lu, instance.nics, target_node)
843

    
844

    
845
def _CheckOSVariant(os_obj, name):
  """Validate a user-supplied OS name against the OS's supported variants.

  Nothing is checked when the OS declares no supported variants.
  Otherwise the name must have the form C{os+variant} and the variant
  must be one of the supported ones.

  @type os_obj: L{objects.OS}
  @param os_obj: OS object to check
  @type name: string
  @param name: OS name passed by the user, to check for validity
  @raise errors.OpPrereqError: if the variant is missing or unsupported

  """
  supported = os_obj.supported_variants
  if not supported:
    return

  pieces = name.split("+", 1)
  if len(pieces) < 2:
    # no "+" in the name, hence no variant was given
    raise errors.OpPrereqError("OS name must include a variant",
                               errors.ECODE_INVAL)

  if pieces[1] not in supported:
    raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
864

    
865

    
866
def _GetNodeInstancesInner(cfg, fn):
  """Return the configured instances for which a predicate holds.

  @param cfg: configuration object providing C{GetAllInstancesInfo}
  @param fn: callable taking an instance object; instances for which it
      returns a true value are kept
  @rtype: list
  @return: the matching instance objects

  """
  selected = []
  for inst in cfg.GetAllInstancesInfo().values():
    if fn(inst):
      selected.append(inst)
  return selected
868

    
869

    
870
def _GetNodeInstances(cfg, node_name):
  """Returns a list of all primary and secondary instances on a node.

  @param cfg: configuration object, passed to L{_GetNodeInstancesInner}
  @param node_name: the node to look for in each instance's C{all_nodes}

  """
  def _UsesNode(inst):
    return node_name in inst.all_nodes

  return _GetNodeInstancesInner(cfg, _UsesNode)
876

    
877

    
878
def _GetNodePrimaryInstances(cfg, node_name):
  """Returns primary instances on a node.

  @param cfg: configuration object, passed to L{_GetNodeInstancesInner}
  @param node_name: the node compared against each instance's
      C{primary_node}

  """
  def _IsPrimaryHere(inst):
    return node_name == inst.primary_node

  return _GetNodeInstancesInner(cfg, _IsPrimaryHere)
884

    
885

    
886
def _GetNodeSecondaryInstances(cfg, node_name):
  """Returns secondary instances on a node.

  @param cfg: configuration object, passed to L{_GetNodeInstancesInner}
  @param node_name: the node to look for in each instance's
      C{secondary_nodes}

  """
  def _IsSecondaryHere(inst):
    return node_name in inst.secondary_nodes

  return _GetNodeInstancesInner(cfg, _IsSecondaryHere)
892

    
893

    
894
def _GetStorageTypeArgs(cfg, storage_type):
  """Returns the arguments for a storage type.

  @param cfg: configuration object; only queried for file storage
  @param storage_type: one of the storage type constants
  @rtype: list
  @return: an empty list, except for file storage where it is a
      one-element list holding the list of storage directories

  """
  if storage_type != constants.ST_FILE:
    return []

  # Special case: storage.FileStorage wants a list of storage directories
  storage_dirs = [cfg.GetFileStorageDir()]
  return [storage_dirs]
904

    
905

    
906
def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
  """Return the indices of an instance's disks that are faulty on a node.

  @param cfg: configuration object, used to set the disk IDs for the node
  @param rpc: RPC runner providing C{call_blockdev_getmirrorstatus}
  @param instance: the instance whose disks are examined
  @param node_name: the node to query
  @param prereq: passed on to the RPC result's C{Raise} method
  @rtype: list
  @return: indices (into the returned status list) of the disks whose
      local disk status is C{constants.LDS_FAULTY}

  """
  disks = instance.disks
  for disk in disks:
    cfg.SetDiskID(disk, node_name)

  status = rpc.call_blockdev_getmirrorstatus(node_name, disks)
  status.Raise("Failed to get disk status from node %s" % node_name,
               prereq=prereq, ecode=errors.ECODE_ENVIRON)

  # entries evaluating to false carry no status and are skipped
  return [idx for (idx, bdev_status) in enumerate(status.payload)
          if bdev_status and
             bdev_status.ldisk_status == constants.LDS_FAULTY]
921

    
922

    
923
def _FormatTimestamp(secs):
  """Formats a Unix timestamp with the local timezone.

  @type secs: number
  @param secs: seconds since the epoch
  @rtype: string
  @return: the timestamp formatted as date, time and timezone name, all
      expressed in local time

  """
  # The broken-down time must be the local one: "%Z" prints the local
  # timezone name, so feeding it UTC fields (time.gmtime) would label a
  # UTC wall-clock time with the local zone.
  return time.strftime("%F %T %Z", time.localtime(secs))
928

    
929

    
930
class LUPostInitCluster(LogicalUnit):
  """Logical unit for running hooks after cluster initialization.

  The unit itself performs no work; it only supplies the hooks
  environment.

  """
  HPATH = "cluster-init"
  HTYPE = constants.HTYPE_CLUSTER
  _OP_REQP = []

  def BuildHooksEnv(self):
    """Build hooks env.

    @return: tuple of the environment dictionary, an empty node list and
        a list holding only the master node (presumably the nodes for
        the pre and post hook phases, respectively)

    """
    cluster_name = self.cfg.GetClusterName()
    master_node = self.cfg.GetMasterNode()
    return {"OP_TARGET": cluster_name}, [], [master_node]

  def CheckPrereq(self):
    """No prerequisites to check.

    @return: always True

    """
    return True

  def Exec(self, feedback_fn):
    """Nothing to do.

    @param feedback_fn: unused
    @return: always True

    """
    return True
957

    
958

    
959
class LUDestroyCluster(LogicalUnit):
  """Logical unit for destroying the cluster.

  """
  HPATH = "cluster-destroy"
  HTYPE = constants.HTYPE_CLUSTER
  _OP_REQP = []

  def BuildHooksEnv(self):
    """Build hooks env.

    @return: tuple of the environment dictionary and two empty node
        lists; the post hooks are run by hand from L{Exec}

    """
    hooks_env = {"OP_TARGET": self.cfg.GetClusterName()}
    return hooks_env, [], []

  def CheckPrereq(self):
    """Check prerequisites.

    This checks whether the cluster is empty, i.e. the master is the
    only node left and no instances are configured.

    Any errors are signaled by raising errors.OpPrereqError.

    """
    master = self.cfg.GetMasterNode()

    nodes = self.cfg.GetNodeList()
    only_master_left = len(nodes) == 1 and nodes[0] == master
    if not only_master_left:
      raise errors.OpPrereqError("There are still %d node(s) in"
                                 " this cluster." % (len(nodes) - 1),
                                 errors.ECODE_INVAL)

    instances = self.cfg.GetInstanceList()
    if instances:
      raise errors.OpPrereqError("There are still %d instance(s) in"
                                 " this cluster." % len(instances),
                                 errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Destroys the cluster.

    Runs the post hooks on the master (failures only produce a warning),
    stops the master role and, if the cluster manages the SSH setup,
    backs up the SSH key files of the Ganeti user.

    @param feedback_fn: unused
    @return: the master node as returned by the configuration

    """
    master = self.cfg.GetMasterNode()
    manage_ssh = self.cfg.GetClusterInfo().modify_ssh_setup

    # Run post hooks on master node before it's removed
    hooks_master = self.proc.hmclass(self.rpc.call_hooks_runner, self)
    try:
      hooks_master.RunPhase(constants.HOOKS_PHASE_POST, [master])
    except:
      # pylint: disable-msg=W0702
      # hook failures must not prevent the destruction of the cluster
      self.LogWarning("Errors occurred running hooks on %s" % master)

    stop_result = self.rpc.call_node_stop_master(master, False)
    stop_result.Raise("Could not disable the master role")

    if manage_ssh:
      (priv_key, pub_key, _) = ssh.GetUserFiles(constants.GANETI_RUNAS)
      for key_file in (priv_key, pub_key):
        utils.CreateBackup(key_file)

    return master
1019

    
1020

    
1021
def _VerifyCertificateInner(filename, expired, not_before, not_after, now,
                            warn_days=constants.SSL_CERT_EXPIRATION_WARN,
                            error_days=constants.SSL_CERT_EXPIRATION_ERROR):
  """Classifies the validity state of a certificate for LUVerifyCluster.

  @param filename: certificate file name, used only in the messages
  @param expired: whether the certificate has already expired
  @param not_before: start of the validity period as a timestamp, or None
  @param not_after: end of the validity period as a timestamp, or None
  @param now: the current time as a timestamp
  @param warn_days: number of remaining days at or below which a warning
      is returned
  @param error_days: number of remaining days at or below which an error
      is returned
  @rtype: tuple
  @return: (error type, message), or (None, None) if there is nothing
      to report

  """
  have_start = not_before is not None
  have_end = not_after is not None

  if expired:
    text = "Certificate %s is expired" % filename

    # append whatever part of the validity period is known
    if have_start and have_end:
      text += (" (valid from %s to %s)" %
               (_FormatTimestamp(not_before),
                _FormatTimestamp(not_after)))
    elif have_start:
      text += " (valid from %s)" % _FormatTimestamp(not_before)
    elif have_end:
      text += " (valid until %s)" % _FormatTimestamp(not_after)

    return (LUVerifyCluster.ETYPE_ERROR, text)

  if have_start and not_before > now:
    return (LUVerifyCluster.ETYPE_WARNING,
            "Certificate %s not yet valid (valid from %s)" %
            (filename, _FormatTimestamp(not_before)))

  if have_end:
    seconds_per_day = 24 * 3600
    remaining_days = int((not_after - now) / seconds_per_day)

    text = ("Certificate %s expires in %d days" % (filename, remaining_days))

    # the error threshold is checked first, as it takes precedence
    if remaining_days <= error_days:
      return (LUVerifyCluster.ETYPE_ERROR, text)

    if remaining_days <= warn_days:
      return (LUVerifyCluster.ETYPE_WARNING, text)

  return (None, None)
1058

    
1059

    
1060
def _VerifyCertificate(filename):
1061
  """Verifies a certificate for LUVerifyCluster.
1062

1063
  @type filename: string
1064
  @param filename: Path to PEM file
1065

1066
  """
1067
  try:
1068
    cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
1069
                                           utils.ReadFile(filename))
1070
  except Exception, err: # pylint: disable-msg=W0703
1071
    return (LUVerifyCluster.ETYPE_ERROR,
1072
            "Failed to load X509 certificate %s: %s" % (filename, err))
1073

    
1074
  # Depending on the pyOpenSSL version, this can just return (None, None)
1075
  (not_before, not_after) = utils.GetX509CertValidity(cert)
1076

    
1077
  return _VerifyCertificateInner(filename, cert.has_expired(),
1078
                                 not_before, not_after, time.time())
1079

    
1080

    
1081
class LUVerifyCluster(LogicalUnit):
1082
  """Verifies the cluster status.
1083

1084
  """
1085
  HPATH = "cluster-verify"
  HTYPE = constants.HTYPE_CLUSTER
  _OP_REQP = ["skip_checks", "verbose", "error_codes", "debug_simulate_errors"]
  REQ_BGL = False

  # item types the error codes below refer to
  TCLUSTER = "cluster"
  TNODE = "node"
  TINSTANCE = "instance"

  # error codes, as (item type, error text) pairs; see _Error for the way
  # they are formatted
  ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
  ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT")
  EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
  EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
  EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
  EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
  EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE")
  ENODEDRBD = (TNODE, "ENODEDRBD")
  ENODEFILECHECK = (TNODE, "ENODEFILECHECK")
  ENODEHOOKS = (TNODE, "ENODEHOOKS")
  ENODEHV = (TNODE, "ENODEHV")
  ENODELVM = (TNODE, "ENODELVM")
  ENODEN1 = (TNODE, "ENODEN1")
  ENODENET = (TNODE, "ENODENET")
  ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE")
  ENODEORPHANLV = (TNODE, "ENODEORPHANLV")
  ENODERPC = (TNODE, "ENODERPC")
  ENODESSH = (TNODE, "ENODESSH")
  ENODEVERSION = (TNODE, "ENODEVERSION")
  ENODESETUP = (TNODE, "ENODESETUP")
  ENODETIME = (TNODE, "ENODETIME")

  # name of the keyword argument selecting the severity, and its values
  ETYPE_FIELD = "code"
  ETYPE_ERROR = "ERROR"
  ETYPE_WARNING = "WARNING"
1120

    
1121
  class NodeImage(object):
    """The logical (config) and physical (runtime) state of a node.

    @ivar volumes: a structure as returned from
        L{ganeti.backend.GetVolumeList} (runtime)
    @ivar instances: a list of running instances (runtime)
    @ivar pinst: list of configured primary instances (config)
    @ivar sinst: list of configured secondary instances (config)
    @ivar sbp: dictionary of {peer node: list of instances} for all peers
        of this node (config)
    @ivar mfree: free memory, as reported by hypervisor (runtime)
    @ivar dfree: free disk, as reported by the node (runtime)
    @ivar offline: the offline status (config)
    @type rpc_fail: boolean
    @ivar rpc_fail: whether the RPC verify call failed (overall, not
        whether the individual keys were correct) (runtime)
    @type lvm_fail: boolean
    @ivar lvm_fail: whether the RPC call didn't return valid LVM data
    @type hyp_fail: boolean
    @ivar hyp_fail: whether the RPC call didn't return the instance list
    @type ghost: boolean
    @ivar ghost: whether this node is unknown to the configuration

    """
    def __init__(self, offline=False):
      # configuration-derived data
      self.offline = offline
      self.ghost = False
      self.pinst = []
      self.sinst = []
      self.sbp = {}
      # runtime data, filled in from the node's answers
      self.volumes = {}
      self.instances = []
      self.mfree = 0
      self.dfree = 0
      # failure markers for the various parts of the verify RPC
      self.rpc_fail = False
      self.lvm_fail = False
      self.hyp_fail = False
1158

    
1159
  def ExpandNames(self):
    """Declare the locks needed by the verification.

    All node and all instance locks are requested, and the share flag
    is set to 1 for every locking level.

    """
    wanted = {}
    wanted[locking.LEVEL_NODE] = locking.ALL_SET
    wanted[locking.LEVEL_INSTANCE] = locking.ALL_SET
    self.needed_locks = wanted
    self.share_locks = dict.fromkeys(locking.LEVELS, 1)
1165

    
1166
  def _Error(self, ecode, item, msg, *args, **kwargs):
    """Format an error message and send it to the feedback function.

    Based on the opcode's error_codes parameter, either format a
    parseable, colon-separated error line, or a simpler error string.

    This must be called only from Exec and functions called from Exec,
    as it relies on C{self._feedback_fn} having been set.

    @param ecode: an (item type, error text) pair, i.e. one of the
        class-level error constants
    @param item: name of the affected item, or a false value for none
    @param msg: the message, %-formatted with C{args} if any are given
    @keyword code: the severity, defaulting to C{ETYPE_ERROR}

    """
    severity = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
    (itype, etxt) = ecode

    # first complete the message itself
    if args:
      msg = msg % args

    # then build the whole line
    if self.op.error_codes:
      line = "%s:%s:%s:%s:%s" % (severity, etxt, itype, item, msg)
    elif item:
      line = "%s: %s%s: %s" % (severity, itype, " " + item, msg)
    else:
      line = "%s: %s%s: %s" % (severity, itype, "", msg)

    # and finally report it
    self._feedback_fn("  - %s" % line)
1191

    
1192
  def _ErrorIf(self, cond, *args, **kwargs):
    """Log an error message if the passed condition is True.

    The message is also logged, regardless of the condition, when the
    opcode requests simulated errors. Unless the severity given via the
    C{code} keyword is something other than C{ETYPE_ERROR}, a triggered
    condition also marks the whole operation as failed in C{self.bad}.

    @param cond: the condition to evaluate
    @param args: passed on to L{_Error}
    @param kwargs: passed on to L{_Error}

    """
    triggered = bool(cond) or self.op.debug_simulate_errors
    if triggered:
      self._Error(*args, **kwargs)

    severity = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
    if severity != self.ETYPE_ERROR:
      # do not mark the operation as failed for WARN cases only
      return
    self.bad = self.bad or triggered
1202

    
1203
  def _VerifyNode(self, ninfo, nresult):
    """Run the basic sanity tests against a node's verify results.

    Test list:

      - the result must be a non-empty dictionary
      - the protocol version must match the master's
      - the release version should match the master's (warning only)
      - the hypervisor verification must not report failures
      - the node setup check must not report problems

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the results from the node
    @rtype: boolean
    @return: whether overall this call was successful (and we can expect
         reasonable values in the response)

    """
    node = ninfo.name
    report = self._ErrorIf

    # main result, nresult should be a non-empty dict
    no_data = not nresult or not isinstance(nresult, dict)
    report(no_data, self.ENODERPC, node,
           "unable to verify node: no data returned")
    if no_data:
      return False

    # the version must be a (protocol, release) pair
    master_protocol = constants.PROTOCOL_VERSION
    node_version = nresult.get("version", None)
    malformed = not (node_version and
                     isinstance(node_version, (list, tuple)) and
                     len(node_version) == 2)
    report(malformed, self.ENODERPC, node,
           "connection to node returned invalid data")
    if malformed:
      return False

    (node_protocol, node_release) = (node_version[0], node_version[1])

    incompatible = master_protocol != node_protocol
    report(incompatible, self.ENODEVERSION, node,
           "incompatible protocol versions: master %s,"
           " node %s", master_protocol, node_protocol)
    if incompatible:
      return False

    # node seems compatible, we can actually try to look into its results

    # full package version; a mismatch is only worth a warning
    report(constants.RELEASE_VERSION != node_release,
           self.ENODEVERSION, node,
           "software version mismatch: master %s, node %s",
           constants.RELEASE_VERSION, node_release,
           code=self.ETYPE_WARNING)

    # any value other than None for a hypervisor denotes a failure
    hv_results = nresult.get(constants.NV_HYPERVISOR, None)
    if isinstance(hv_results, dict):
      for (hv_name, hv_result) in hv_results.items():
        report(hv_result is not None, self.ENODEHV, node,
               "hypervisor %s verify failure: '%s'", hv_name, hv_result)

    # a non-empty list holds the node setup problems found
    setup_errors = nresult.get(constants.NV_NODESETUP,
                               ["Missing NODESETUP results"])
    report(setup_errors, self.ENODESETUP, node, "node setup error: %s",
           "; ".join(setup_errors))

    return True
1272

    
1273
  def _VerifyNodeTime(self, ninfo, nresult,
                      nvinfo_starttime, nvinfo_endtime):
    """Check the node time.

    The time reported by the node must lie within the interval spanned
    by the RPC call, widened on both sides by
    C{constants.NODE_MAX_CLOCK_SKEW}.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nvinfo_starttime: the start time of the RPC call
    @param nvinfo_endtime: the end time of the RPC call

    """
    node = ninfo.name
    report = self._ErrorIf

    reported = nresult.get(constants.NV_TIME, None)
    try:
      node_time = utils.MergeTime(reported)
    except (ValueError, TypeError):
      report(True, self.ENODETIME, node, "Node returned invalid time")
      return

    max_skew = constants.NODE_MAX_CLOCK_SKEW
    drift = None
    if node_time < (nvinfo_starttime - max_skew):
      # the node's clock is behind
      drift = "%.01fs" % abs(nvinfo_starttime - node_time)
    elif node_time > (nvinfo_endtime + max_skew):
      # the node's clock is ahead
      drift = "%.01fs" % abs(node_time - nvinfo_endtime)

    report(drift is not None, self.ENODETIME, node,
           "Node time diverges by at least %s from master node time",
           drift)
1304

    
1305
  def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
    """Check the node's LVM data.

    Verifies the volume group list (through
    C{utils.CheckVolumeGroupSize}) and that no physical volume name
    contains a colon. Nothing is checked if no volume group is
    configured.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param vg_name: the configured VG name

    """
    if vg_name is None:
      return

    node = ninfo.name
    report = self._ErrorIf

    # checks vg existence and minimum size
    vglist = nresult.get(constants.NV_VGLIST, None)
    vg_missing = not vglist
    report(vg_missing, self.ENODELVM, node, "unable to check volume groups")
    if not vg_missing:
      # a true status is the description of the problem found
      vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
                                            constants.MIN_VG_SIZE)
      report(vgstatus, self.ENODELVM, node, vgstatus)

    # check pv names
    pvlist = nresult.get(constants.NV_PVLIST, None)
    pv_missing = pvlist is None
    report(pv_missing, self.ENODELVM, node, "Can't get PV list from node")
    if pv_missing:
      return

    # ':' must not be present in PV names, since it's a special
    # character for lvcreate (denotes the range of PEs to use on the PV)
    for (_, pvname, owner_vg) in pvlist:
      report(":" in pvname, self.ENODELVM, node,
             "Invalid character ':' in PV '%s' of VG '%s'",
             pvname, owner_vg)
1341

    
1342
  def _VerifyNodeNetwork(self, ninfo, nresult):
    """Check the node's connectivity results.

    Reports missing ssh or tcp connectivity data, and one error for
    every entry present in either of them.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node

    """
    node = ninfo.name
    report = self._ErrorIf

    no_ssh_data = constants.NV_NODELIST not in nresult
    report(no_ssh_data, self.ENODESSH, node,
           "node hasn't returned node ssh connectivity data")
    if not no_ssh_data:
      ssh_failures = nresult[constants.NV_NODELIST]
      if ssh_failures:
        for (peer, peer_msg) in ssh_failures.items():
          report(True, self.ENODESSH, node,
                 "ssh communication with node '%s': %s", peer, peer_msg)

    no_tcp_data = constants.NV_NODENETTEST not in nresult
    report(no_tcp_data, self.ENODENET, node,
           "node hasn't returned node tcp connectivity data")
    if not no_tcp_data:
      tcp_failures = nresult[constants.NV_NODENETTEST]
      if tcp_failures:
        # report in a stable, human-friendly order
        for peer in utils.NiceSort(tcp_failures.keys()):
          report(True, self.ENODENET, node,
                 "tcp communication with node '%s': %s",
                 peer, tcp_failures[peer])
1372

    
1373
  def _VerifyInstance(self, instance, instanceconfig, node_image):
    """Verify an instance.

    This function checks that the logical volumes the instance should
    have are present on their nodes, that an instance marked as up is
    running on its primary node, and that it is not running on any other
    node.

    @param instance: the instance name
    @param instanceconfig: the instance's configuration object
    @param node_image: dictionary of node name to L{NodeImage}

    """
    report = self._ErrorIf
    primary = instanceconfig.primary_node

    expected_lvs = {}
    instanceconfig.MapLVsByNode(expected_lvs)

    for (node, volumes) in expected_lvs.items():
      n_img = node_image[node]
      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
        # ignore missing volumes on offline or broken nodes
        continue
      for volume in volumes:
        report(volume not in n_img.volumes, self.EINSTANCEMISSINGDISK,
               instance, "volume %s missing on node %s", volume, node)

    if instanceconfig.admin_up:
      pri_img = node_image[primary]
      # an offline primary cannot tell us anything about the instance
      not_running = not (instance in pri_img.instances or pri_img.offline)
      report(not_running, self.EINSTANCEDOWN, instance,
             "instance not running on its primary node %s",
             primary)

    for (node, n_img) in node_image.items():
      if node == primary:
        continue
      report(instance in n_img.instances, self.EINSTANCEWRONGNODE,
             instance, "instance should not run on node %s", node)
1408

    
1409
  def _VerifyOrphanVolumes(self, node_vol_should, node_image):
    """Verify if there are any unknown volumes in the cluster.

    Every volume found on a healthy node which is not listed for that
    node in C{node_vol_should} is reported as unknown.

    @param node_vol_should: dictionary of node name to the volumes
        expected on that node
    @param node_image: dictionary of node name to L{NodeImage}

    """
    for (node, n_img) in node_image.items():
      healthy = not (n_img.offline or n_img.rpc_fail or n_img.lvm_fail)
      if not healthy:
        # skip non-healthy nodes
        continue
      node_known = node in node_vol_should
      for volume in n_img.volumes:
        orphan = not node_known or volume not in node_vol_should[node]
        self._ErrorIf(orphan, self.ENODEORPHANLV, node,
                      "volume %s is unknown", volume)
1425

    
1426
  def _VerifyOrphanInstances(self, instancelist, node_image):
    """Verify the list of running instances.

    This checks what instances are running but unknown to the cluster,
    i.e. reported by a node but absent from C{instancelist}.

    @param instancelist: the names of the configured instances
    @param node_image: dictionary of node name to L{NodeImage}

    """
    for (node, n_img) in node_image.items():
      for running in n_img.instances:
        unknown = running not in instancelist
        self._ErrorIf(unknown, self.ENODEORPHANINSTANCE, node,
                      "instance %s on node %s should not exist",
                      running, node)
1437

    
1438
  def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
    """Verify N+1 Memory Resilience.

    Check that if one single node dies we can still start all the
    instances it was primary for.

    @param node_image: dictionary of node name to L{NodeImage}
    @param instance_cfg: dictionary of instance name to instance
        configuration object

    """
    # the cluster object does not depend on the node or instance at
    # hand, so fetch it only once
    cluster = self.cfg.GetClusterInfo()

    for node, n_img in node_image.items():
      # This code checks that every node which is now listed as
      # secondary has enough memory to host all instances it is
      # supposed to should a single other node in the cluster fail.
      # FIXME: not ready for failover to an arbitrary node
      # FIXME: does not support file-backed instances
      # WARNING: we currently take into account down instances as well
      # as up ones, considering that even if they're down someone
      # might want to start them even in the event of a node failure.
      for prinode, instances in n_img.sbp.items():
        needed_mem = 0
        for instance in instances:
          bep = cluster.FillBE(instance_cfg[instance])
          # only auto-balanced instances count towards the needed memory
          if bep[constants.BE_AUTO_BALANCE]:
            needed_mem += bep[constants.BE_MEMORY]
        test = n_img.mfree < needed_mem
        self._ErrorIf(test, self.ENODEN1, node,
                      "not enough memory to accommodate"
                      " failovers should peer node %s fail", prinode)
1464

    
1465
  def _VerifyNodeFiles(self, ninfo, nresult, file_list, local_cksum,
                       master_files):
    """Verifies the checksums of the files a node reported.

    Files listed in C{master_files} are only required on master
    candidates, and must not exist on other nodes; all other files are
    required everywhere and must match the local checksum.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param file_list: required list of files
    @param local_cksum: dictionary of local files and their checksums
    @param master_files: list of files that only masters should have

    """
    node = ninfo.name
    report = self._ErrorIf

    remote_cksum = nresult.get(constants.NV_FILELIST, None)
    no_data = not isinstance(remote_cksum, dict)
    report(no_data, self.ENODEFILECHECK, node,
           "node hasn't returned file checksum data")
    if no_data:
      return

    is_candidate = ninfo.master_candidate

    for file_name in file_list:
      required = (file_name not in master_files) or is_candidate

      missing = file_name not in remote_cksum
      if missing:
        outdated = False
        matching = False
      else:
        matching = remote_cksum[file_name] == local_cksum[file_name]
        outdated = not matching

      # all four calls are always made, in this order, so that simulated
      # errors show every message
      report(missing and required, self.ENODEFILECHECK, node,
             "file '%s' missing", file_name)
      report(outdated and required, self.ENODEFILECHECK, node,
             "file '%s' has wrong checksum", file_name)
      # not candidate and this is not a must-have file
      report(outdated and not required, self.ENODEFILECHECK, node,
             "file '%s' should not exist on non master"
             " candidates (and the file is outdated)", file_name)
      # all good, except non-master/non-must have combination
      report(matching and not required, self.ENODEFILECHECK, node,
             "file '%s' should not exist"
             " on non master candidates", file_name)
1508

    
1509
  def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_map):
    """Verifies the node DRBD status.

    The minors the configuration allocates on the node are compared with
    the minors the node reports as being in use.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param instanceinfo: the dict of instances
    @param drbd_map: the DRBD map as returned by
        L{ganeti.config.ConfigWriter.ComputeDRBDMap}

    """
    node = ninfo.name
    report = self._ErrorIf

    # compute the DRBD minors: minor -> (instance name, must be active)
    expected = {}
    for (minor, iname) in drbd_map[node].items():
      is_ghost = iname not in instanceinfo
      report(is_ghost, self.ECLUSTERCFG, None,
             "ghost instance '%s' in temporary DRBD map", iname)
      if is_ghost:
        # ghost instance should not be running, but otherwise we
        # don't give double warnings (both ghost instance and
        # unallocated minor in use)
        expected[minor] = (iname, False)
      else:
        inst = instanceinfo[iname]
        expected[minor] = (inst.name, inst.admin_up)

    # and now check them
    in_use = nresult.get(constants.NV_DRBDLIST, [])
    unparseable = not isinstance(in_use, (tuple, list))
    report(unparseable, self.ENODEDRBD, node,
           "cannot parse drbd status file: %s", str(in_use))
    if unparseable:
      # we cannot check drbd status
      return

    for (minor, (iname, must_exist)) in expected.items():
      inactive = minor not in in_use and must_exist
      report(inactive, self.ENODEDRBD, node,
             "drbd minor %d of instance %s is not active", minor, iname)

    for minor in in_use:
      report(minor not in expected, self.ENODEDRBD, node,
             "unallocated drbd minor %d is in use", minor)
1555

    
1556
  def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
    """Verifies and updates the node volume data.

    This function will update a L{NodeImage}'s internal structures
    with data from the remote call. The image's C{lvm_fail} flag is
    cleared only if a dictionary of volumes was received; it is left set
    in all other cases, including when no volume group is configured.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nimg: the node image object
    @param vg_name: the configured VG name

    """
    node = ninfo.name
    report = self._ErrorIf

    # assume failure until proven otherwise
    nimg.lvm_fail = True
    lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")

    if vg_name is None:
      return

    if isinstance(lvdata, basestring):
      # a string is an error message coming from the node
      report(True, self.ENODELVM, node, "LVM problem on node: %s",
             utils.SafeEncode(lvdata))
      return

    if not isinstance(lvdata, dict):
      report(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
      return

    nimg.volumes = lvdata
    nimg.lvm_fail = False
1584

    
1585
  def _UpdateNodeInstances(self, ninfo, nresult, nimg):
    """Verifies and updates the node instance list.

    If the listing was successful, then updates this node's instance
    list. Otherwise, it sets the node image's C{hyp_fail} flag.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nimg: the node image object

    """
    idata = nresult.get(constants.NV_INSTANCELIST, None)
    is_list = isinstance(idata, list)
    self._ErrorIf(not is_list, self.ENODEHV, ninfo.name,
                  "rpc call to node failed (instancelist): %s",
                  utils.SafeEncode(str(idata)))
    if is_list:
      nimg.instances = idata
    else:
      nimg.hyp_fail = True
1606

    
1607
  def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
    """Verifies and stores the node's free memory and free disk values.

    The free memory is taken from the hypervisor information, the free
    disk from the volume group list entry of the configured volume
    group (if any). Values which are missing or not convertible to
    integers are reported and leave the node image untouched.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nimg: the node image object
    @param vg_name: the configured VG name

    """
    node = ninfo.name
    report = self._ErrorIf

    # try to read free memory (from the hypervisor)
    hv_info = nresult.get(constants.NV_HVINFO, None)
    have_mem = isinstance(hv_info, dict) and "memory_free" in hv_info
    report(not have_mem, self.ENODEHV, node,
           "rpc call to node failed (hvinfo)")
    if have_mem:
      try:
        nimg.mfree = int(hv_info["memory_free"])
      except (ValueError, TypeError):
        report(True, self.ENODERPC, node,
               "node returned invalid nodeinfo, check hypervisor")

    # FIXME: devise a free space model for file based instances as well
    if vg_name is None:
      return

    have_vg = (constants.NV_VGLIST in nresult and
               vg_name in nresult[constants.NV_VGLIST])
    report(not have_vg, self.ENODELVM, node,
           "node didn't return data for the volume group '%s'"
           " - it is either missing or broken", vg_name)
    if have_vg:
      try:
        nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
      except (ValueError, TypeError):
        report(True, self.ENODERPC, node,
               "node returned invalid LVM info, check LVM status")
1644

    
1645
  def CheckPrereq(self):
    """Check prerequisites.

    Transform the list of checks we're going to skip into a set (stored
    as C{self.skip_set}) and check that all its members are valid, i.e.
    contained in C{constants.VERIFY_OPTIONAL_CHECKS}.

    @raise errors.OpPrereqError: if an unknown check was given

    """
    to_skip = frozenset(self.op.skip_checks)
    self.skip_set = to_skip

    all_valid = constants.VERIFY_OPTIONAL_CHECKS.issuperset(to_skip)
    if all_valid:
      return

    raise errors.OpPrereqError("Invalid checks to be skipped specified",
                               errors.ECODE_INVAL)
1656

    
1657
  def BuildHooksEnv(self):
    """Build hooks env.

    The environment carries the cluster tags and, per node, the node tags.
    No nodes are listed for the pre phase; all nodes are listed for the
    post phase.

    """
    post_nodes = self.cfg.GetNodeList()
    cluster_tags = self.cfg.GetClusterInfo().GetTags()
    env = {"CLUSTER_TAGS": " ".join(cluster_tags)}
    for ninfo in self.cfg.GetAllNodesInfo().values():
      key = "NODE_TAGS_%s" % ninfo.name
      env[key] = " ".join(ninfo.GetTags())

    return env, [], post_nodes
1672

    
1673
  def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various test on nodes.

    The configuration and the cluster certificates are checked first. Then
    all nodes are queried with one node_verify RPC and the replies are
    checked node by node, followed by the per-instance checks, the orphan
    volume/instance checks and (unless skipped) the N+1 memory check.

    @param feedback_fn: function used to send feedback back to the caller
    @rtype: boolean
    @return: the negation of C{self.bad} at the end of the run

    """
    self.bad = False
    self._feedback_fn = feedback_fn
    report_if = self._ErrorIf
    verbose = self.op.verbose

    feedback_fn("* Verifying global settings")
    for cfg_msg in self.cfg.VerifyConfig():
      report_if(True, self.ECLUSTERCFG, None, cfg_msg)

    # Check the cluster certificates
    for cert_path in constants.ALL_CERT_FILES:
      (cert_code, cert_msg) = _VerifyCertificate(cert_path)
      report_if(cert_code, self.ECLUSTERCERT, None, cert_msg, code=cert_code)

    vg_name = self.cfg.GetVGName()
    hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
    cluster = self.cfg.GetClusterInfo()
    nodelist = utils.NiceSort(self.cfg.GetNodeList())
    nodeinfo = [self.cfg.GetNodeInfo(name) for name in nodelist]
    instancelist = utils.NiceSort(self.cfg.GetInstanceList())
    instanceinfo = {}
    for iname in instancelist:
      instanceinfo[iname] = self.cfg.GetInstanceInfo(iname)

    non_redundant = []      # instances without any secondary node
    non_auto_balanced = []  # instances with auto-balance turned off
    offline_count = 0       # nodes skipped because they are offline
    drained_count = 0       # nodes classified as drained
    node_vol_should = {}

    # FIXME: verify OS list
    # do local checksums
    master_files = [constants.CLUSTER_CONF_FILE]

    file_names = ssconf.SimpleStore().GetFileList()
    file_names.extend(constants.ALL_CERT_FILES)
    file_names.extend(master_files)
    if cluster.modify_etc_hosts:
      file_names.append(constants.ETC_HOSTS)

    local_checksums = utils.FingerprintFiles(file_names)

    feedback_fn("* Gathering data (%d nodes)" % len(nodelist))
    online_nodes = [onode for onode in nodeinfo if not onode.offline]
    node_verify_param = {
      constants.NV_FILELIST: file_names,
      constants.NV_NODELIST: [onode.name for onode in online_nodes],
      constants.NV_HYPERVISOR: hypervisors,
      constants.NV_NODENETTEST: [(onode.name, onode.primary_ip,
                                  onode.secondary_ip)
                                 for onode in online_nodes],
      constants.NV_INSTANCELIST: hypervisors,
      constants.NV_VERSION: None,
      constants.NV_HVINFO: self.cfg.GetHypervisorType(),
      constants.NV_NODESETUP: None,
      constants.NV_TIME: None,
      }

    if vg_name is not None:
      node_verify_param.update({
        constants.NV_VGLIST: None,
        constants.NV_LVLIST: vg_name,
        constants.NV_PVLIST: [vg_name],
        constants.NV_DRBDLIST: None,
        })

    # Build our expected cluster state
    node_image = {}
    for cfg_node in nodeinfo:
      node_image[cfg_node.name] = self.NodeImage(offline=cfg_node.offline)

    for instance in instancelist:
      inst_config = instanceinfo[instance]

      # nodes referenced by an instance but unknown to the configuration
      # get a placeholder image flagged as "ghost"
      for nname in inst_config.all_nodes:
        if nname in node_image:
          continue
        ghost_img = self.NodeImage()
        ghost_img.ghost = True
        node_image[nname] = ghost_img

      inst_config.MapLVsByNode(node_vol_should)

      pnode = inst_config.primary_node
      node_image[pnode].pinst.append(instance)

      for snode in inst_config.secondary_nodes:
        sec_img = node_image[snode]
        sec_img.sinst.append(instance)
        sec_img.sbp.setdefault(pnode, []).append(instance)

    # At this point, we have the in-memory data structures complete,
    # except for the runtime information, which we'll gather next

    # Due to the way our RPC system works, exact response times cannot be
    # guaranteed (e.g. a broken node could run into a timeout). By keeping the
    # time before and after executing the request, we can at least have a time
    # window.
    nvinfo_starttime = time.time()
    all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param,
                                           self.cfg.GetClusterName())
    nvinfo_endtime = time.time()

    master_node = self.cfg.GetMasterNode()
    all_drbd_map = self.cfg.ComputeDRBDMap()

    feedback_fn("* Verifying node status")
    for node_i in nodeinfo:
      node = node_i.name
      nimg = node_image[node]

      if node_i.offline:
        if verbose:
          feedback_fn("* Skipping offline node %s" % (node,))
        offline_count += 1
        continue

      if node == master_node:
        ntype = "master"
      elif node_i.master_candidate:
        ntype = "master candidate"
      elif node_i.drained:
        ntype = "drained"
        drained_count += 1
      else:
        ntype = "regular"
      if verbose:
        feedback_fn("* Verifying node %s (%s)" % (node, ntype))

      reply = all_nvinfo[node]
      rpc_err = reply.fail_msg
      report_if(rpc_err, self.ENODERPC, node, "while contacting node: %s",
                rpc_err)
      if rpc_err:
        nimg.rpc_fail = True
        continue

      nresult = reply.payload

      # checks which only report problems
      nimg.call_ok = self._VerifyNode(node_i, nresult)
      self._VerifyNodeNetwork(node_i, nresult)
      self._VerifyNodeLVM(node_i, nresult, vg_name)
      self._VerifyNodeFiles(node_i, nresult, file_names, local_checksums,
                            master_files)
      self._VerifyNodeDrbd(node_i, nresult, instanceinfo, all_drbd_map)
      self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)

      # checks which also fill in the node image
      self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
      self._UpdateNodeInstances(node_i, nresult, nimg)
      self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)

    feedback_fn("* Verifying instance status")
    for instance in instancelist:
      if verbose:
        feedback_fn("* Verifying instance %s" % instance)
      inst_config = instanceinfo[instance]
      self._VerifyInstance(instance, inst_config, node_image)
      offline_hosts = []

      pnode = inst_config.primary_node
      pnode_img = node_image[pnode]
      report_if(pnode_img.rpc_fail and not pnode_img.offline,
                self.ENODERPC, pnode, "instance %s, connection to"
                " primary node failed", instance)

      if pnode_img.offline:
        offline_hosts.append(pnode)

      # If the instance is non-redundant we cannot survive losing its primary
      # node, so we are not N+1 compliant. On the other hand we have no disk
      # templates with more than one secondary so that situation is not well
      # supported either.
      # FIXME: does not support file-backed instances
      secondaries = inst_config.secondary_nodes
      if not secondaries:
        non_redundant.append(instance)
      report_if(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT,
                instance, "instance has multiple secondary nodes: %s",
                utils.CommaJoin(inst_config.secondary_nodes),
                code=self.ETYPE_WARNING)

      be_full = cluster.FillBE(inst_config)
      if not be_full[constants.BE_AUTO_BALANCE]:
        non_auto_balanced.append(instance)

      for snode in inst_config.secondary_nodes:
        s_img = node_image[snode]
        report_if(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode,
                  "instance %s, connection to secondary node failed",
                  instance)

        if s_img.offline:
          offline_hosts.append(snode)

      # warn that the instance lives on offline nodes
      report_if(offline_hosts, self.EINSTANCEBADNODE, instance,
                "instance lives on offline node(s) %s",
                utils.CommaJoin(offline_hosts))
      # ... or ghost nodes
      for node in inst_config.all_nodes:
        report_if(node_image[node].ghost, self.EINSTANCEBADNODE, instance,
                  "instance lives on ghost node %s", node)

    feedback_fn("* Verifying orphan volumes")
    self._VerifyOrphanVolumes(node_vol_should, node_image)

    feedback_fn("* Verifying orphan instances")
    self._VerifyOrphanInstances(instancelist, node_image)

    if constants.VERIFY_NPLUSONE_MEM not in self.skip_set:
      feedback_fn("* Verifying N+1 Memory redundancy")
      self._VerifyNPlusOneMemory(node_image, instanceinfo)

    feedback_fn("* Other Notes")
    notices = [
      (len(non_redundant),
       "  - NOTICE: %d non-redundant instance(s) found."),
      (len(non_auto_balanced),
       "  - NOTICE: %d non-auto-balanced instance(s) found."),
      (offline_count, "  - NOTICE: %d offline node(s) found."),
      (drained_count, "  - NOTICE: %d drained node(s) found."),
      ]
    for (count, template) in notices:
      if count:
        feedback_fn(template % count)

    return not self.bad
1897

    
1898
  def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
    """Analyze the post-hooks' result

    This method analyses the hook result, handles it, and sends some
    nicely-formatted feedback back to the user.

    The Exec method of this LU returns a boolean which is true on success,
    so any hook problem found here turns the result into False. Offline
    nodes are skipped and do not influence the result.

    @param phase: one of L{constants.HOOKS_PHASE_POST} or
        L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
    @param hooks_results: the results of the multi-node hooks rpc call
    @param feedback_fn: function used send feedback back to the caller
    @param lu_result: previous Exec result
    @return: the new Exec result, based on the previous result
        and hook results; for any phase other than the post phase the
        previous result is returned unchanged

    """
    # We only really run POST phase hooks, and are only interested in
    # their results
    if phase != constants.HOOKS_PHASE_POST:
      return lu_result

    # Used to change hooks' output to proper indentation
    indent_re = re.compile('^', re.M)
    feedback_fn("* Hooks Results")
    assert hooks_results, "invalid result from hooks"

    for node_name in hooks_results:
      res = hooks_results[node_name]
      msg = res.fail_msg
      comm_failure = msg and not res.offline
      self._ErrorIf(comm_failure, self.ENODEHOOKS, node_name,
                    "Communication failure in hooks execution: %s", msg)
      if comm_failure:
        # _ErrorIf only updates self.bad, so the result handed back to the
        # caller has to be marked as failed explicitly
        lu_result = False
      if res.offline or msg:
        # No need to investigate payload if node is offline or gave an error.
        continue

      for script, hkr, output in res.payload:
        script_failed = hkr == constants.HKR_FAIL
        self._ErrorIf(script_failed, self.ENODEHOOKS, node_name,
                      "Script %s failed, output:", script)
        if script_failed:
          output = indent_re.sub('      ', output)
          feedback_fn("%s" % output)
          lu_result = False

    return lu_result
1943

    
1944

    
1945
class LUVerifyDisks(NoHooksLU):
  """Verifies the cluster disks status.

  """
  _OP_REQP = []
  REQ_BGL = False

  def ExpandNames(self):
    """Request all node and instance locks, in shared mode.

    """
    self.needed_locks = {
      locking.LEVEL_NODE: locking.ALL_SET,
      locking.LEVEL_INSTANCE: locking.ALL_SET,
    }
    self.share_locks = dict.fromkeys(locking.LEVELS, 1)

  def CheckPrereq(self):
    """Check prerequisites.

    This has no prerequisites.

    """

  def Exec(self, feedback_fn):
    """Verify integrity of cluster disks.

    Only instances which are administratively up and use a network
    mirrored disk template are considered.

    @rtype: tuple of three items
    @return: a tuple of (dict of node-to-node_error, list of instances
        which need activate-disks, dict of instance: (node, volume) for
        missing volumes)

    """
    res_nodes = {}
    res_instances = []
    res_missing = {}
    result = (res_nodes, res_instances, res_missing)

    vg_name = self.cfg.GetVGName()
    nodes = utils.NiceSort(self.cfg.GetNodeList())
    instances = [self.cfg.GetInstanceInfo(iname)
                 for iname in self.cfg.GetInstanceList()]

    # expected volumes, as a {(node, vol): instance} map
    expected = {}
    for inst in instances:
      if inst.admin_up and inst.disk_template in constants.DTS_NET_MIRROR:
        lvs_by_node = {}
        inst.MapLVsByNode(lvs_by_node)
        for (node, vol_list) in lvs_by_node.items():
          for vol in vol_list:
            expected[(node, vol)] = inst

    if not expected:
      return result

    node_lvs = self.rpc.call_lv_list(nodes, vg_name)

    for node in nodes:
      node_res = node_lvs[node]
      if node_res.offline:
        continue
      err = node_res.fail_msg
      if err:
        logging.warning("Error enumerating LVs on node %s: %s", node, err)
        res_nodes[node] = err
        continue

      # every volume the node reports is removed from the expected map;
      # an offline volume marks its instance as needing activate-disks
      for (lv_name, lv_data) in node_res.payload.items():
        (_, _, lv_online) = lv_data
        owner = expected.pop((node, lv_name), None)
        if owner is None or lv_online:
          continue
        if owner.name not in res_instances:
          res_instances.append(owner.name)

    # any leftover items in the expected map are missing LVs, let's arrange
    # the data better
    for (key, owner) in expected.items():
      res_missing.setdefault(owner.name, []).append(key)

    return result
2026

    
2027

    
2028
class LURepairDiskSizes(NoHooksLU):
  """Verifies the cluster disks sizes.

  """
  _OP_REQP = ["instances"]
  REQ_BGL = False

  def ExpandNames(self):
    """Compute the locks needed for the requested instances.

    With an explicit instance list only those instances are locked and
    the node locks are recomputed later (see L{DeclareLocks}); with an
    empty list all instances and all nodes are locked. All locks are
    shared.

    """
    if not isinstance(self.op.instances, list):
      raise errors.OpPrereqError("Invalid argument type 'instances'",
                                 errors.ECODE_INVAL)

    if self.op.instances:
      self.wanted_names = []
      for name in self.op.instances:
        full_name = _ExpandInstanceName(self.cfg, name)
        self.wanted_names.append(full_name)
      self.needed_locks = {
        locking.LEVEL_NODE: [],
        locking.LEVEL_INSTANCE: self.wanted_names,
        }
      self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
    else:
      self.wanted_names = None
      self.needed_locks = {
        locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: locking.ALL_SET,
        }
    self.share_locks = dict.fromkeys(locking.LEVELS, 1)

  def DeclareLocks(self, level):
    """Lock the primary nodes of the explicitly requested instances.

    """
    if level == locking.LEVEL_NODE and self.wanted_names is not None:
      self._LockInstancesNodes(primary_only=True)

  def CheckPrereq(self):
    """Check prerequisites.

    This only checks the optional instance list against the existing names.

    """
    if self.wanted_names is None:
      self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]

    self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
                             in self.wanted_names]

  def _EnsureChildSizes(self, disk):
    """Ensure children of the disk have the needed disk size.

    This is valid mainly for DRBD8 and fixes an issue where the
    children have smaller disk size.

    @param disk: an L{ganeti.objects.Disk} object
    @rtype: boolean
    @return: True if the size of any (recursively visited) child was
        changed, False otherwise

    """
    if disk.dev_type == constants.LD_DRBD8:
      assert disk.children, "Empty children for DRBD8?"
      fchild = disk.children[0]
      mismatch = fchild.size < disk.size
      if mismatch:
        self.LogInfo("Child disk has size %d, parent %d, fixing",
                     fchild.size, disk.size)
        fchild.size = disk.size

      # and we recurse on this child only, not on the metadev
      return self._EnsureChildSizes(fchild) or mismatch
    else:
      return False

  def Exec(self, feedback_fn):
    """Verify the size of cluster disks.

    The disks are grouped by primary node, each node is asked for the
    actual sizes, and the configuration is updated wherever the recorded
    size differs.

    @param feedback_fn: function used to send feedback back to the caller
    @rtype: list
    @return: list of (instance name, disk index, new size) tuples, one per
        correction made

    """
    # TODO: check child disks too
    # TODO: check differences in size between primary/secondary nodes
    per_node_disks = {}
    for instance in self.wanted_instances:
      node_disks = per_node_disks.setdefault(instance.primary_node, [])
      for idx, disk in enumerate(instance.disks):
        node_disks.append((instance, idx, disk))

    changed = []
    for node, dskl in per_node_disks.items():
      # work on copies, as SetDiskID changes the disk objects
      newl = [v[2].Copy() for v in dskl]
      for dsk in newl:
        self.cfg.SetDiskID(dsk, node)
      result = self.rpc.call_blockdev_getsizes(node, newl)
      if result.fail_msg:
        self.LogWarning("Failure in blockdev_getsizes call to node"
                        " %s, ignoring", node)
        continue
      # the sizes are read from the payload, like all other RPC results
      # consumed in this module
      sizes = result.payload
      if not isinstance(sizes, (list, tuple)) or len(sizes) != len(dskl):
        self.LogWarning("Invalid result from node %s, ignoring node results",
                        node)
        continue
      for ((instance, idx, disk), size) in zip(dskl, sizes):
        if size is None:
          self.LogWarning("Disk %d of instance %s did not return size"
                          " information, ignoring", idx, instance.name)
          continue
        if not isinstance(size, (int, long)):
          self.LogWarning("Disk %d of instance %s did not return valid"
                          " size information, ignoring", idx, instance.name)
          continue
        # shift by 20 bits, i.e. divide by 2**20, before comparing with the
        # recorded size
        size = size >> 20
        if size != disk.size:
          self.LogInfo("Disk %d of instance %s has mismatched size,"
                       " correcting: recorded %d, actual %d", idx,
                       instance.name, disk.size, size)
          disk.size = size
          self.cfg.Update(instance, feedback_fn)
          changed.append((instance.name, idx, size))
        if self._EnsureChildSizes(disk):
          self.cfg.Update(instance, feedback_fn)
          changed.append((instance.name, idx, disk.size))
    return changed
2146

    
2147

    
2148
class LURenameCluster(LogicalUnit):
  """Rename the cluster.

  """
  HPATH = "cluster-rename"
  HTYPE = constants.HTYPE_CLUSTER
  _OP_REQP = ["name"]

  def BuildHooksEnv(self):
    """Build hooks env.

    The pre phase runs on the master only, the post phase on all nodes.

    """
    env = {}
    env["OP_TARGET"] = self.cfg.GetClusterName()
    env["NEW_NAME"] = self.op.name
    master = self.cfg.GetMasterNode()
    return env, [master], self.cfg.GetNodeList()

  def CheckPrereq(self):
    """Verify that the passed name is a valid one.

    The name is resolved, and the opcode is refused if neither name nor IP
    would change or if the new IP already answers on the network. On
    success the resolved name replaces C{self.op.name} and the resolved IP
    is kept in C{self.ip}.

    """
    resolved = utils.GetHostInfo(self.op.name)

    new_name = resolved.name
    new_ip = resolved.ip
    self.ip = new_ip
    old_name = self.cfg.GetClusterName()
    old_ip = self.cfg.GetMasterIP()

    ip_changed = new_ip != old_ip
    if new_name == old_name and not ip_changed:
      raise errors.OpPrereqError("Neither the name nor the IP address of the"
                                 " cluster has changed",
                                 errors.ECODE_INVAL)
    if ip_changed and utils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
      raise errors.OpPrereqError("The given cluster IP address (%s) is"
                                 " reachable on the network. Aborting." %
                                 new_ip, errors.ECODE_NOTUNIQUE)

    self.op.name = new_name

  def Exec(self, feedback_fn):
    """Rename the cluster.

    The master role is stopped, the configuration and the known hosts file
    are updated and pushed to the other nodes, and the master role is
    started again even if one of these steps failed.

    """
    new_name = self.op.name
    new_ip = self.ip

    # shutdown the master IP
    master = self.cfg.GetMasterNode()
    stop_result = self.rpc.call_node_stop_master(master, False)
    stop_result.Raise("Could not disable the master role")

    try:
      cluster = self.cfg.GetClusterInfo()
      cluster.cluster_name = new_name
      cluster.master_ip = new_ip
      self.cfg.Update(cluster, feedback_fn)

      # update the known hosts file
      ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
      targets = self.cfg.GetNodeList()
      if master in targets:
        targets.remove(master)
      upload = self.rpc.call_upload_file(targets,
                                         constants.SSH_KNOWN_HOSTS_FILE)
      for (to_node, to_result) in upload.items():
        err = to_result.fail_msg
        if not err:
          continue
        self.proc.LogWarning("Copy of file %s to node %s failed: %s" %
                             (constants.SSH_KNOWN_HOSTS_FILE, to_node, err))

    finally:
      start_result = self.rpc.call_node_start_master(master, False, False)
      start_err = start_result.fail_msg
      if start_err:
        self.LogWarning("Could not re-enable the master role on"
                        " the master, please restart manually: %s", start_err)
2230

    
2231

    
2232
def _RecursiveCheckIfLVMBased(disk):
  """Tell whether a disk, or anything below it, is a logical volume.

  @type disk: L{objects.Disk}
  @param disk: the disk to check
  @rtype: boolean
  @return: True if the disk itself or one of its (recursive) children has
      the LD_LV dev_type, False otherwise

  """
  # children are looked at first, depth-first; a missing/empty children
  # list simply yields no iterations
  for child in disk.children or ():
    if _RecursiveCheckIfLVMBased(child):
      return True
  return disk.dev_type == constants.LD_LV
2246

    
2247

    
2248
class LUSetClusterParams(LogicalUnit):
2249
  """Change the parameters of the cluster.
2250

2251
  """
2252
  HPATH = "cluster-modify"
2253
  HTYPE = constants.HTYPE_CLUSTER
2254
  _OP_REQP = []
2255
  REQ_BGL = False
2256

    
2257
  def CheckArguments(self):
2258
    """Check parameters
2259

2260
    """
2261
    for attr in ["candidate_pool_size",
2262
                 "uid_pool", "add_uids", "remove_uids"]:
2263
      if not hasattr(self.op, attr):
2264
        setattr(self.op, attr, None)
2265

    
2266
    if self.op.candidate_pool_size is not None:
2267
      try:
2268
        self.op.candidate_pool_size = int(self.op.candidate_pool_size)
2269
      except (ValueError, TypeError), err:
2270
        raise errors.OpPrereqError("Invalid candidate_pool_size value: %s" %
2271
                                   str(err), errors.ECODE_INVAL)
2272
      if self.op.candidate_pool_size < 1:
2273
        raise errors.OpPrereqError("At least one master candidate needed",
2274
                                   errors.ECODE_INVAL)
2275

    
2276
    _CheckBooleanOpField(self.op, "maintain_node_health")
2277

    
2278
    if self.op.uid_pool:
2279
      uidpool.CheckUidPool(self.op.uid_pool)
2280

    
2281
    if self.op.add_uids:
2282
      uidpool.CheckUidPool(self.op.add_uids)
2283

    
2284
    if self.op.remove_uids:
2285
      uidpool.CheckUidPool(self.op.remove_uids)
2286

    
2287
  def ExpandNames(self):
2288
    # FIXME: in the future maybe other cluster params won't require checking on
2289
    # all nodes to be modified.
2290
    self.needed_locks = {
2291
      locking.LEVEL_NODE: locking.ALL_SET,
2292
    }
2293
    self.share_locks[locking.LEVEL_NODE] = 1
2294

    
2295
  def BuildHooksEnv(self):
2296
    """Build hooks env.
2297

2298
    """
2299
    env = {
2300
      "OP_TARGET": self.cfg.GetClusterName(),
2301
      "NEW_VG_NAME": self.op.vg_name,
2302
      }
2303
    mn = self.cfg.GetMasterNode()
2304
    return env, [mn], [mn]
2305

    
2306
  def CheckPrereq(self):
2307
    """Check prerequisites.
2308

2309
    This checks whether the given params don't conflict and
2310
    if the given volume group is valid.
2311

2312
    """
2313
    if self.op.vg_name is not None and not self.op.vg_name:
2314
      instances = self.cfg.GetAllInstancesInfo().values()
2315
      for inst in instances:
2316
        for disk in inst.disks:
2317
          if _RecursiveCheckIfLVMBased(disk):
2318
            raise errors.OpPrereqError("Cannot disable lvm storage while"
2319
                                       " lvm-based instances exist",
2320
                                       errors.ECODE_INVAL)
2321

    
2322
    node_list = self.acquired_locks[locking.LEVEL_NODE]
2323

    
2324
    # if vg_name not None, checks given volume group on all nodes
2325
    if self.op.vg_name:
2326
      vglist = self.rpc.call_vg_list(node_list)
2327
      for node in node_list:
2328
        msg = vglist[node].fail_msg
2329
        if msg:
2330
          # ignoring down node
2331
          self.LogWarning("Error while gathering data on node %s"
2332
                          " (ignoring node): %s", node, msg)
2333
          continue
2334
        vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
2335
                                              self.op.vg_name,
2336
                                              constants.MIN_VG_SIZE)
2337
        if vgstatus:
2338
          raise errors.OpPrereqError("Error on node '%s': %s" %
2339
                                     (node, vgstatus), errors.ECODE_ENVIRON)
2340

    
2341
    self.cluster = cluster = self.cfg.GetClusterInfo()
2342
    # validate params changes
2343
    if self.op.beparams:
2344
      utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
2345
      self.new_beparams = objects.FillDict(
2346
        cluster.beparams[constants.PP_DEFAULT], self.op.beparams)
2347

    
2348
    if self.op.nicparams:
2349
      utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
2350
      self.new_nicparams = objects.FillDict(
2351
        cluster.nicparams[constants.PP_DEFAULT], self.op.nicparams)
2352
      objects.NIC.CheckParameterSyntax(self.new_nicparams)
2353
      nic_errors = []
2354

    
2355
      # check all instances for consistency
2356
      for instance in self.cfg.GetAllInstancesInfo().values():
2357
        for nic_idx, nic in enumerate(instance.nics):
2358
          params_copy = copy.deepcopy(nic.nicparams)
2359
          params_filled = objects.FillDict(self.new_nicparams, params_copy)
2360

    
2361
          # check parameter syntax
2362
          try:
2363
            objects.NIC.CheckParameterSyntax(params_filled)
2364
          except errors.ConfigurationError, err:
2365
            nic_errors.append("Instance %s, nic/%d: %s" %
2366
                              (instance.name, nic_idx, err))
2367

    
2368
          # if we're moving instances to routed, check that they have an ip
2369
          target_mode = params_filled[constants.NIC_MODE]
2370
          if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
2371
            nic_errors.append("Instance %s, nic/%d: routed nick with no ip" %
2372
                              (instance.name, nic_idx))
2373
      if nic_errors:
2374
        raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
2375
                                   "\n".join(nic_errors))
2376

    
2377
    # hypervisor list/parameters
2378
    self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
2379
    if self.op.hvparams:
2380
      if not isinstance(self.op.hvparams, dict):
2381
        raise errors.OpPrereqError("Invalid 'hvparams' parameter on input",
2382
                                   errors.ECODE_INVAL)
2383
      for hv_name, hv_dict in self.op.hvparams.items():
2384
        if hv_name not in self.new_hvparams:
2385
          self.new_hvparams[hv_name] = hv_dict
2386
        else:
2387
          self.new_hvparams[hv_name].update(hv_dict)
2388

    
2389
    # os hypervisor parameters
2390
    self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
2391
    if self.op.os_hvp:
2392
      if not isinstance(self.op.os_hvp, dict):
2393
        raise errors.OpPrereqError("Invalid 'os_hvp' parameter on input",
2394
                                   errors.ECODE_INVAL)
2395
      for os_name, hvs in self.op.os_hvp.items():
2396
        if not isinstance(hvs, dict):
2397
          raise errors.OpPrereqError(("Invalid 'os_hvp' parameter on"
2398
                                      " input"), errors.ECODE_INVAL)
2399
        if os_name not in self.new_os_hvp:
2400
          self.new_os_hvp[os_name] = hvs
2401
        else:
2402
          for hv_name, hv_dict in hvs.items():
2403
            if hv_name not in self.new_os_hvp[os_name]:
2404
              self.new_os_hvp[os_name][hv_name] = hv_dict
2405
            else:
2406
              self.new_os_hvp[os_name][hv_name].update(hv_dict)
2407

    
2408
    # changes to the hypervisor list
2409
    if self.op.enabled_hypervisors is not None:
2410
      self.hv_list = self.op.enabled_hypervisors
2411
      if not self.hv_list:
2412
        raise errors.OpPrereqError("Enabled hypervisors list must contain at"
2413
                                   " least one member",
2414
                                   errors.ECODE_INVAL)
2415
      invalid_hvs = set(self.hv_list) - constants.HYPER_TYPES
2416
      if invalid_hvs:
2417
        raise errors.OpPrereqError("Enabled hypervisors contains invalid"
2418
                                   " entries: %s" %
2419
                                   utils.CommaJoin(invalid_hvs),
2420
                                   errors.ECODE_INVAL)
2421
      for hv in self.hv_list:
2422
        # if the hypervisor doesn't already exist in the cluster
2423
        # hvparams, we initialize it to empty, and then (in both
2424
        # cases) we make sure to fill the defaults, as we might not
2425
        # have a complete defaults list if the hypervisor wasn't
2426
        # enabled before
2427
        if hv not in new_hvp:
2428
          new_hvp[hv] = {}
2429
        new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
2430
        utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
2431
    else:
2432
      self.hv_list = cluster.enabled_hypervisors
2433

    
2434
    if self.op.hvparams or self.op.enabled_hypervisors is not None:
2435
      # either the enabled list has changed, or the parameters have, validate
2436
      for hv_name, hv_params in self.new_hvparams.items():
2437
        if ((self.op.hvparams and hv_name in self.op.hvparams) or
2438
            (self.op.enabled_hypervisors and
2439
             hv_name in self.op.enabled_hypervisors)):
2440
          # either this is a new hypervisor, or its parameters have changed
2441
          hv_class = hypervisor.GetHypervisor(hv_name)
2442
          utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2443
          hv_class.CheckParameterSyntax(hv_params)
2444
          _CheckHVParams(self, node_list, hv_name, hv_params)
2445

    
2446
    if self.op.os_hvp:
2447
      # no need to check any newly-enabled hypervisors, since the
2448
      # defaults have already been checked in the above code-block
2449
      for os_name, os_hvp in self.new_os_hvp.items():
2450
        for hv_name, hv_params in os_hvp.items():
2451
          utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2452
          # we need to fill in the new os_hvp on top of the actual hv_p
2453
          cluster_defaults = self.new_hvparams.get(hv_name, {})
2454
          new_osp = objects.FillDict(cluster_defaults, hv_params)
2455
          hv_class = hypervisor.GetHypervisor(hv_name)
2456
          hv_class.CheckParameterSyntax(new_osp)
2457
          _CheckHVParams(self, node_list, hv_name, new_osp)
2458

    
2459

    
2460
  def Exec(self, feedback_fn):
2461
    """Change the parameters of the cluster.
2462

2463
    """
2464
    if self.op.vg_name is not None:
2465
      new_volume = self.op.vg_name
2466
      if not new_volume:
2467
        new_volume = None
2468
      if new_volume != self.cfg.GetVGName():
2469
        self.cfg.SetVGName(new_volume)
2470
      else:
2471
        feedback_fn("Cluster LVM configuration already in desired"
2472
                    " state, not changing")
2473
    if self.op.hvparams:
2474
      self.cluster.hvparams = self.new_hvparams
2475
    if self.op.os_hvp:
2476
      self.cluster.os_hvp = self.new_os_hvp
2477
    if self.op.enabled_hypervisors is not None:
2478
      self.cluster.hvparams = self.new_hvparams
2479
      self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
2480
    if self.op.beparams:
2481
      self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
2482
    if self.op.nicparams:
2483
      self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
2484

    
2485
    if self.op.candidate_pool_size is not None:
2486
      self.cluster.candidate_pool_size = self.op.candidate_pool_size
2487
      # we need to update the pool size here, otherwise the save will fail
2488
      _AdjustCandidatePool(self, [])
2489

    
2490
    if self.op.maintain_node_health is not None:
2491
      self.cluster.maintain_node_health = self.op.maintain_node_health
2492

    
2493
    if self.op.add_uids is not None:
2494
      uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
2495

    
2496
    if self.op.remove_uids is not None:
2497
      uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
2498

    
2499
    if self.op.uid_pool is not None:
2500
      self.cluster.uid_pool = self.op.uid_pool
2501

    
2502
    self.cfg.Update(self.cluster, feedback_fn)
2503

    
2504

    
2505
def _RedistributeAncillaryFiles(lu, additional_nodes=None):
  """Copy the non-config cluster files to the other nodes.

  The configuration and ssconf files are distributed by ConfigWriter;
  this function uploads the remaining files that belong to the cluster
  configuration: a fixed set of files plus whatever each enabled
  hypervisor reports through GetAncillaryFiles. Files that do not exist
  locally are skipped, and a failed copy only results in a warning.

  @param lu: calling logical unit
  @param additional_nodes: list of nodes not in the config to distribute to

  """
  cfg = lu.cfg

  # 1. Target nodes: all online nodes plus the extra ones, minus the master
  master = cfg.GetNodeInfo(cfg.GetMasterNode())
  targets = cfg.GetOnlineNodeList()
  if additional_nodes is not None:
    targets.extend(additional_nodes)
  if master.name in targets:
    targets.remove(master.name)

  # 2. Files to send: the fixed set plus the per-hypervisor ones
  to_copy = set([constants.ETC_HOSTS,
                 constants.SSH_KNOWN_HOSTS_FILE,
                 constants.RAPI_CERT_FILE,
                 constants.RAPI_USERS_FILE,
                 constants.CONFD_HMAC_KEY,
                ])

  for hv_name in cfg.GetClusterInfo().enabled_hypervisors:
    to_copy.update(hypervisor.GetHypervisor(hv_name).GetAncillaryFiles())

  # 3. Upload every file that exists locally
  for fname in to_copy:
    if not os.path.exists(fname):
      continue
    upload = lu.rpc.call_upload_file(targets, fname)
    for to_node, to_result in upload.items():
      err = to_result.fail_msg
      if err:
        lu.proc.LogWarning("Copy of file %s to node %s failed: %s" %
                           (fname, to_node, err))
2547

    
2548

    
2549
class LURedistributeConfig(NoHooksLU):
  """Logical unit forcing a redistribution of the cluster configuration.

  It takes no parameters: it re-saves the cluster object and then
  re-sends the ancillary files.

  """
  _OP_REQP = []
  REQ_BGL = False

  def ExpandNames(self):
    """Request all node locks, in shared mode.

    """
    self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
    self.share_locks[locking.LEVEL_NODE] = 1

  def CheckPrereq(self):
    """Check prerequisites.

    Nothing needs to be verified for this LU.

    """

  def Exec(self, feedback_fn):
    """Redistribute the configuration.

    The cluster object is passed unchanged to cfg.Update, after which the
    ancillary files are redistributed.

    """
    cluster_info = self.cfg.GetClusterInfo()
    self.cfg.Update(cluster_info, feedback_fn)
    _RedistributeAncillaryFiles(self)
2575

    
2576

    
2577
def _WaitForSync(lu, instance, oneshot=False):
  """Sleep and poll for an instance's disks to sync.

  The mirror status of all disks is requested from the instance's
  primary node in a loop, until no disk reports a sync percentage any
  more (or after the first successful poll, if oneshot is set).

  @param lu: the calling logical unit; its cfg, rpc and proc attributes
      and its LogWarning method are used
  @param instance: the instance whose disks are polled
  @param oneshot: if True, do not wait for the sync to finish; only the
      short retries done while a degraded state is seen still apply
  @rtype: boolean
  @return: True if the instance has no disks or if no disk was degraded
      (degraded without a sync in progress) at the last poll, False
      otherwise
  @raise errors.RemoteError: if the status RPC fails ten times in a row

  """
  if not instance.disks:
    return True

  if not oneshot:
    lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)

  node = instance.primary_node

  for dev in instance.disks:
    lu.cfg.SetDiskID(dev, node)

  # TODO: Convert to utils.Retry

  retries = 0
  degr_retries = 10 # in seconds, as we sleep 1 second each time
  while True:
    max_time = 0
    done = True
    cumul_degraded = False
    rstats = lu.rpc.call_blockdev_getmirrorstatus(node, instance.disks)
    msg = rstats.fail_msg
    if msg:
      lu.LogWarning("Can't get any data from node %s: %s", node, msg)
      retries += 1
      if retries >= 10:
        raise errors.RemoteError("Can't contact node %s for mirror data,"
                                 " aborting." % node)
      time.sleep(6)
      continue
    rstats = rstats.payload
    # a successful call resets the count of consecutive failures
    retries = 0
    for i, mstat in enumerate(rstats):
      if mstat is None:
        lu.LogWarning("Can't compute data for node %s/%s",
                           node, instance.disks[i].iv_name)
        continue

      # only a device that is degraded while *not* syncing counts as
      # degraded here
      cumul_degraded = (cumul_degraded or
                        (mstat.is_degraded and mstat.sync_percent is None))
      if mstat.sync_percent is not None:
        done = False
        if mstat.estimated_time is not None:
          rem_time = "%d estimated seconds remaining" % mstat.estimated_time
          # keep the largest estimate over all devices; simply storing
          # the current one would let the last device in the list decide
          # how long we sleep
          max_time = max(max_time, mstat.estimated_time)
        else:
          rem_time = "no time estimate"
        lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
                        (instance.disks[i].iv_name, mstat.sync_percent,
                         rem_time))

    # if we're done but degraded, let's do a few small retries, to
    # make sure we see a stable and not transient situation; therefore
    # we force restart of the loop
    if (done or oneshot) and cumul_degraded and degr_retries > 0:
      logging.info("Degraded disks found, %d retries left", degr_retries)
      degr_retries -= 1
      time.sleep(1)
      continue

    if done or oneshot:
      break

    # NOTE: when no device provides a time estimate, max_time is still 0
    # and this does not actually pause between polls
    time.sleep(min(60, max_time))

  if done:
    lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
  return not cumul_degraded
2648

    
2649

    
2650
def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
  """Verify that a disk and its children are not degraded on a node.

  The device is looked up on the node only if on_primary is set or the
  device reports that it is assembled on secondaries. With ldisk set,
  the test on the device itself compares its ldisk_status (the local
  storage status) against constants.LDS_OKAY instead of using the
  is_degraded attribute (the overall non-ok status). Children are
  checked recursively, always with the default (is_degraded) test.

  @return: a true value if all checks passed, a false value otherwise

  """
  lu.cfg.SetDiskID(dev, node)

  consistent = True

  if on_primary or dev.AssembleOnSecondary():
    found = lu.rpc.call_blockdev_find(node, dev)
    err = found.fail_msg
    if err:
      lu.LogWarning("Can't find disk on node %s: %s", node, err)
      consistent = False
    elif not found.payload:
      lu.LogWarning("Can't find disk on node %s", node)
      consistent = False
    elif ldisk:
      consistent = found.payload.ldisk_status == constants.LDS_OKAY
    else:
      consistent = not found.payload.is_degraded

  # once a failure was seen, the remaining children are not queried
  for child in dev.children or ():
    consistent = consistent and _CheckDiskConsistency(lu, child, node,
                                                      on_primary)

  return consistent
2682

    
2683

    
2684
class LUDiagnoseOS(NoHooksLU):
  """Logical unit for OS diagnose/query.

  The OS definitions are requested from all online nodes and reported
  per OS name.

  """
  _OP_REQP = ["output_fields", "names"]
  REQ_BGL = False
  _FIELDS_STATIC = utils.FieldSet()
  _FIELDS_DYNAMIC = utils.FieldSet("name", "valid", "node_status", "variants")
  # Fields that need calculation of global os validity
  _FIELDS_NEEDVALID = frozenset(["valid", "variants"])

  def ExpandNames(self):
    """Validate the arguments; no locks are requested.

    @raise errors.OpPrereqError: if specific OS names were passed, as
        selective queries are not supported

    """
    if self.op.names:
      raise errors.OpPrereqError("Selective OS query not supported",
                                 errors.ECODE_INVAL)

    _CheckOutputFields(static=self._FIELDS_STATIC,
                       dynamic=self._FIELDS_DYNAMIC,
                       selected=self.op.output_fields)

    # Lock all nodes, in shared mode
    # Temporary removal of locks, should be reverted later
    # TODO: reintroduce locks when they are lighter-weight
    self.needed_locks = {}
    #self.share_locks[locking.LEVEL_NODE] = 1
    #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET

  def CheckPrereq(self):
    """Check prerequisites.

    There is nothing to check for this LU.

    """

  @staticmethod
  def _DiagnoseByOS(rlist):
    """Remaps a per-node return list into a per-os per-node dictionary

    @param rlist: a map with node names as keys and the RPC results of
        the OS diagnose call as values

    @rtype: dict
    @return: a dictionary with osnames as keys and as value another map, with
        nodes as keys and lists of (path, status, diagnose, variants)
        tuples as values, eg::

          {"debian-etch": {"node1": [(/usr/lib/..., True, "", [...]),
                                     (/srv/..., False, "invalid api", [...])],
                           "node2": [(/srv/..., True, "", [...])]}
          }

    """
    all_os = {}
    # we build here the list of nodes that didn't fail the RPC (at RPC
    # level), so that nodes with a non-responding node daemon don't
    # make all OSes invalid
    good_nodes = [node_name for node_name in rlist
                  if not rlist[node_name].fail_msg]
    for node_name, nr in rlist.items():
      if nr.fail_msg or not nr.payload:
        continue
      for name, path, status, diagnose, variants in nr.payload:
        if name not in all_os:
          # every good node gets an (initially empty) list for this os,
          # so that nodes lacking the os show up with an empty entry
          all_os[name] = {}
          for nname in good_nodes:
            all_os[name][nname] = []
        all_os[name][node_name].append((path, status, diagnose, variants))
    return all_os

  def Exec(self, feedback_fn):
    """Compute the list of OSes.

    @rtype: list
    @return: one row per OS name, each row holding the values of the
        requested output fields, in the requested order
    @raise errors.ParameterError: for an unknown output field

    """
    valid_nodes = list(self.cfg.GetOnlineNodeList())
    node_data = self.rpc.call_os_diagnose(valid_nodes)
    pol = self._DiagnoseByOS(node_data)
    output = []
    calc_valid = self._FIELDS_NEEDVALID.intersection(self.op.output_fields)
    calc_variants = "variants" in self.op.output_fields

    for os_name, os_data in pol.items():
      row = []
      if calc_valid:
        valid = True
        variants = None
        for osl in os_data.values():
          # an OS is valid only if every node has at least one entry for
          # it and the first entry has a true status; the result is
          # coerced to a boolean so that e.g. an empty per-node list is
          # reported as False and not as "[]"
          valid = bool(valid and osl and osl[0][1])
          if not valid:
            variants = None
            break
          if calc_variants:
            # keep only the variants present on all nodes seen so far
            node_variants = osl[0][3]
            if variants is None:
              variants = node_variants
            else:
              variants = [v for v in variants if v in node_variants]

      for field in self.op.output_fields:
        if field == "name":
          val = os_name
        elif field == "valid":
          val = valid
        elif field == "node_status":
          # this is just a copy of the dict
          val = {}
          for node_name, nos_list in os_data.items():
            val[node_name] = nos_list
        elif field == "variants":
          val = variants
        else:
          raise errors.ParameterError(field)
        row.append(val)
      output.append(row)

    return output
2797

    
2798

    
2799
class LURemoveNode(LogicalUnit):
  """Logical unit that removes a node from the cluster.

  """
  HPATH = "node-remove"
  HTYPE = constants.HTYPE_NODE
  _OP_REQP = ["node_name"]

  def BuildHooksEnv(self):
    """Build the hooks environment and node lists.

    The node being removed is left out of both returned node lists; the
    pre phase must not run on it, as a failed node would then be
    impossible to remove.

    @return: tuple of (environment, pre-hook nodes, post-hook nodes)

    """
    target = self.op.node_name
    env = {
      "OP_TARGET": target,
      "NODE_NAME": target,
      }
    remaining = self.cfg.GetNodeList()
    try:
      remaining.remove(target)
    except ValueError:
      logging.warning("Node %s which is about to be removed not found"
                      " in the all nodes list", target)
    return env, remaining, remaining

  def CheckPrereq(self):
    """Check prerequisites.

    The following is verified:
     - the node is known to the configuration
     - the node is not the master
     - no instance uses the node, as primary or secondary

    Any errors are signaled by raising errors.OpPrereqError.

    """
    cfg = self.cfg
    self.op.node_name = _ExpandNodeName(cfg, self.op.node_name)
    node = cfg.GetNodeInfo(self.op.node_name)
    assert node is not None

    all_instances = cfg.GetInstanceList()

    if node.name == cfg.GetMasterNode():
      raise errors.OpPrereqError("Node is the master node,"
                                 " you need to failover first.",
                                 errors.ECODE_INVAL)

    for iname in all_instances:
      if node.name in cfg.GetInstanceInfo(iname).all_nodes:
        raise errors.OpPrereqError("Instance %s is still running on the node,"
                                   " please remove first." % iname,
                                   errors.ECODE_INVAL)

    self.op.node_name = node.name
    self.node = node

  def Exec(self, feedback_fn):
    """Removes the node from the cluster.

    In order: the candidate pool is adjusted, the node is dropped from
    the context, the post hooks are run on the node, the node is told to
    leave the cluster and, if the cluster manages /etc/hosts, the entry
    is removed and the ancillary files are redistributed. Hook and RPC
    failures only produce warnings.

    """
    node = self.node
    node_name = node.name
    logging.info("Stopping the node daemon and removing configs from node %s",
                 node_name)

    modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup

    # Promote nodes to master candidate as needed
    _AdjustCandidatePool(self, exceptions=[node_name])
    self.context.RemoveNode(node_name)

    # Run post hooks on the node before it's removed
    hooks_master = self.proc.hmclass(self.rpc.call_hooks_runner, self)
    try:
      hooks_master.RunPhase(constants.HOOKS_PHASE_POST, [node_name])
    except:
      # pylint: disable-msg=W0702
      self.LogWarning("Errors occurred running hooks on %s" % node_name)

    leave_result = self.rpc.call_node_leave_cluster(node_name,
                                                    modify_ssh_setup)
    leave_err = leave_result.fail_msg
    if leave_err:
      self.LogWarning("Errors encountered on the remote node while leaving"
                      " the cluster: %s", leave_err)

    # Remove node from our /etc/hosts
    if self.cfg.GetClusterInfo().modify_etc_hosts:
      # FIXME: this should be done via an rpc call to node daemon
      utils.RemoveHostFromEtcHosts(node_name)
      _RedistributeAncillaryFiles(self)
2891

    
2892

    
2893
class LUQueryNodes(NoHooksLU):
  """Logical unit answering node queries.

  Fields in _FIELDS_STATIC are answered from the configuration; the
  ones in _FIELDS_DYNAMIC come from a node info RPC.

  """
  # pylint: disable-msg=W0142
  _OP_REQP = ["output_fields", "names", "use_locking"]
  REQ_BGL = False

  # fields read directly as attributes of the node object
  _SIMPLE_FIELDS = ["name", "serial_no", "ctime", "mtime", "uuid",
                    "master_candidate", "offline", "drained"]

  _FIELDS_DYNAMIC = utils.FieldSet("dtotal", "dfree",
                                   "mtotal", "mnode", "mfree",
                                   "bootid",
                                   "ctotal", "cnodes", "csockets")

  _FIELDS_STATIC = utils.FieldSet(*(["pinst_cnt", "sinst_cnt",
                                     "pinst_list", "sinst_list",
                                     "pip", "sip", "tags",
                                     "master",
                                     "role"] + _SIMPLE_FIELDS))

  def ExpandNames(self):
    """Validate the output fields and compute the locking needs.

    Node locks (shared) are requested only when a non-static field was
    selected and the opcode asks for locking.

    """
    selected = self.op.output_fields
    _CheckOutputFields(static=self._FIELDS_STATIC,
                       dynamic=self._FIELDS_DYNAMIC,
                       selected=selected)

    self.needed_locks = {}
    self.share_locks[locking.LEVEL_NODE] = 1

    if not self.op.names:
      self.wanted = locking.ALL_SET
    else:
      self.wanted = _GetWantedNodes(self, self.op.names)

    self.do_node_query = self._FIELDS_STATIC.NonMatching(selected)
    self.do_locking = self.do_node_query and self.op.use_locking
    if self.do_locking:
      # if we don't request only static fields, we need to lock the nodes
      self.needed_locks[locking.LEVEL_NODE] = self.wanted

  def CheckPrereq(self):
    """Check prerequisites.

    Nothing is done here: a non-empty node list was already validated by
    _GetWantedNodes, and an empty one needs no validation.

    """
    pass

  def Exec(self, feedback_fn):
    """Computes the list of nodes and their attributes.

    @rtype: list
    @return: one row per node (in NiceSort order of the names), each row
        holding the requested field values in the requested order
    @raise errors.OpExecError: if, without locking, a requested node is
        no longer in the configuration
    @raise errors.ParameterError: for an unknown output field

    """
    cfg = self.cfg
    selected = self.op.output_fields

    all_info = cfg.GetAllNodesInfo()
    if self.do_locking:
      nodenames = self.acquired_locks[locking.LEVEL_NODE]
    elif self.wanted == locking.ALL_SET:
      nodenames = all_info.keys()
    else:
      nodenames = self.wanted
      missing = set(nodenames).difference(all_info.keys())
      if missing:
        raise errors.OpExecError(
          "Some nodes were removed before retrieving their data: %s" % missing)

    nodenames = utils.NiceSort(nodenames)
    nodelist = [all_info[name] for name in nodenames]

    # begin data gathering

    if not self.do_node_query:
      live_data = dict.fromkeys(nodenames, {})
    else:
      # output field name -> key in the node info payload, for the
      # values passed through an int conversion
      int_keys = (
        ("mtotal", 'memory_total'),
        ("mnode", 'memory_dom0'),
        ("mfree", 'memory_free'),
        ("dtotal", 'vg_size'),
        ("dfree", 'vg_free'),
        ("ctotal", 'cpu_total'),
        ("cnodes", 'cpu_nodes'),
        ("csockets", 'cpu_sockets'),
        )
      live_data = {}
      node_data = self.rpc.call_node_info(nodenames, cfg.GetVGName(),
                                          cfg.GetHypervisorType())
      for name in nodenames:
        reply = node_data[name]
        if reply.fail_msg or not reply.payload:
          # failed or empty answer: no live values for this node
          live_data[name] = {}
          continue
        payload = reply.payload
        convert = utils.TryConvert
        entry = dict((fld, convert(int, payload.get(key, None)))
                     for (fld, key) in int_keys)
        entry["bootid"] = payload.get('bootid', None)
        live_data[name] = entry

    node_to_primary = dict([(name, set()) for name in nodenames])
    node_to_secondary = dict([(name, set()) for name in nodenames])

    inst_fields = frozenset(("pinst_cnt", "pinst_list",
                             "sinst_cnt", "sinst_list"))
    if inst_fields & frozenset(selected):
      # the instance maps are only filled in when actually requested
      for inst in cfg.GetAllInstancesInfo().values():
        if inst.primary_node in node_to_primary:
          node_to_primary[inst.primary_node].add(inst.name)
        for secnode in inst.secondary_nodes:
          if secnode in node_to_secondary:
            node_to_secondary[secnode].add(inst.name)

    master_node = cfg.GetMasterNode()

    # end data gathering

    output = []
    for node in nodelist:
      name = node.name
      row = []
      for field in selected:
        if field in self._SIMPLE_FIELDS:
          val = getattr(node, field)
        elif field == "pinst_list":
          val = list(node_to_primary[name])
        elif field == "sinst_list":
          val = list(node_to_secondary[name])
        elif field == "pinst_cnt":
          val = len(node_to_primary[name])
        elif field == "sinst_cnt":
          val = len(node_to_secondary[name])
        elif field == "pip":
          val = node.primary_ip
        elif field == "sip":
          val = node.secondary_ip
        elif field == "tags":
          val = list(node.GetTags())
        elif field == "master":
          val = name == master_node
        elif self._FIELDS_DYNAMIC.Matches(field):
          val = live_data[name].get(field, None)
        elif field == "role":
          # first matching state wins: master, candidate, drained,
          # offline, and regular otherwise
          if name == master_node:
            val = "M"
          elif node.master_candidate:
            val = "C"
          elif node.drained:
            val = "D"
          elif node.offline:
            val = "O"
          else:
            val = "R"
        else:
          raise errors.ParameterError(field)
        row.append(val)
      output.append(row)

    return output
3052

    
3053

    
3054
class LUQueryNodeVolumes(NoHooksLU):
  """Logical unit listing the volumes present on node(s).

  """
  _OP_REQP = ["nodes", "output_fields"]
  REQ_BGL = False
  _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
  _FIELDS_STATIC = utils.FieldSet("node")

  def ExpandNames(self):
    """Validate the output fields and request shared node locks.

    All nodes are locked when no node names were given.

    """
    _CheckOutputFields(static=self._FIELDS_STATIC,
                       dynamic=self._FIELDS_DYNAMIC,
                       selected=self.op.output_fields)

    self.needed_locks = {}
    self.share_locks[locking.LEVEL_NODE] = 1
    if self.op.nodes:
      self.needed_locks[locking.LEVEL_NODE] = \
        _GetWantedNodes(self, self.op.nodes)
    else:
      self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET

  def CheckPrereq(self):
    """Check prerequisites.

    This only records the locked nodes as the nodes to query.

    """
    self.nodes = self.acquired_locks[locking.LEVEL_NODE]

  def Exec(self, feedback_fn):
    """Computes the list of volumes and their attributes.

    Offline nodes are skipped silently, nodes whose RPC failed are
    skipped with a warning.

    @rtype: list
    @return: one row per volume, each row holding the requested field
        values (converted to strings) in the requested order
    @raise errors.ParameterError: for an unknown output field

    """
    nodenames = self.nodes
    volumes = self.rpc.call_node_volumes(nodenames)

    ilist = [self.cfg.GetInstanceInfo(iname) for iname
             in self.cfg.GetInstanceList()]

    # instance object -> its logical volumes, grouped by node
    lv_by_node = dict([(inst, inst.MapLVsByNode()) for inst in ilist])

    output = []
    for node in nodenames:
      nresult = volumes[node]
      if nresult.offline:
        continue
      err = nresult.fail_msg
      if err:
        self.LogWarning("Can't compute volume data on node %s: %s", node, err)
        continue

      for vol in sorted(nresult.payload, key=lambda v: v['dev']):
        row = []
        for field in self.op.output_fields:
          if field == "node":
            val = node
          elif field == "phys":
            val = vol['dev']
          elif field == "vg":
            val = vol['vg']
          elif field == "name":
            val = vol['name']
          elif field == "size":
            val = int(float(vol['size']))
          elif field == "instance":
            # name of the first instance having this volume on this
            # node, or a dash if there is none
            val = '-'
            for inst in ilist:
              inst_lvs = lv_by_node[inst]
              if node in inst_lvs and vol['name'] in inst_lvs[node]:
                val = inst.name
                break
          else:
            raise errors.ParameterError(field)
          row.append(str(val))

        output.append(row)

    return output
3138

    
3139

    
3140
class LUQueryNodeStorage(NoHooksLU):
  """Logical unit listing the storage units present on node(s).

  """
  _OP_REQP = ["nodes", "storage_type", "output_fields"]
  REQ_BGL = False
  _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)

  def CheckArguments(self):
    """Validate the storage type and the requested output fields.

    """
    _CheckStorageType(self.op.storage_type)

    valid_dynamic = utils.FieldSet(*constants.VALID_STORAGE_FIELDS)
    _CheckOutputFields(static=self._FIELDS_STATIC,
                       dynamic=valid_dynamic,
                       selected=self.op.output_fields)

  def ExpandNames(self):
    """Request shared node locks.

    All nodes are locked when no node names were given.

    """
    self.needed_locks = {}
    self.share_locks[locking.LEVEL_NODE] = 1

    if not self.op.nodes:
      self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
    else:
      self.needed_locks[locking.LEVEL_NODE] = \
        _GetWantedNodes(self, self.op.nodes)

  def CheckPrereq(self):
    """Check prerequisites.

    This defaults the optional "name" opcode attribute to None and
    records the locked nodes as the nodes to query.

    """
    self.op.name = getattr(self.op, "name", None)

    self.nodes = self.acquired_locks[locking.LEVEL_NODE]

  def Exec(self, feedback_fn):
    """Computes the list of storage units and their attributes.

    Offline nodes are skipped silently, nodes whose RPC failed are
    skipped with a warning.

    @rtype: list
    @return: one row per storage unit, ordered by node and then by unit
        name (both via NiceSort), each row holding the requested field
        values in the requested order
    @raise errors.ParameterError: for a field which cannot be resolved

    """
    requested = self.op.output_fields

    # Always get name to sort by
    if constants.SF_NAME not in requested:
      fields = [constants.SF_NAME] + requested
    else:
      fields = requested[:]

    # Never ask for node or type as it's only known to the LU
    lu_only = [constants.SF_NODE, constants.SF_TYPE]
    fields = [fname for fname in fields if fname not in lu_only]

    # position of each field within the rows returned by the nodes
    field_idx = dict((fname, idx) for (idx, fname) in enumerate(fields))
    name_idx = field_idx[constants.SF_NAME]

    st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
    data = self.rpc.call_storage_list(self.nodes,
                                      self.op.storage_type, st_args,
                                      self.op.name, fields)

    result = []

    for node in utils.NiceSort(self.nodes):
      nresult = data[node]
      if nresult.offline:
        continue

      err = nresult.fail_msg
      if err:
        self.LogWarning("Can't get storage data from node %s: %s", node, err)
        continue

      # index the rows by unit name, for sorting
      rows = dict((row[name_idx], row) for row in nresult.payload)

      for unit_name in utils.NiceSort(rows.keys()):
        row = rows[unit_name]
        out = []

        for field in requested:
          if field == constants.SF_NODE:
            out.append(node)
          elif field == constants.SF_TYPE:
            out.append(self.op.storage_type)
          elif field in field_idx:
            out.append(row[field_idx[field]])
          else:
            raise errors.ParameterError(field)

        result.append(out)

    return result
3232

    
3233

    
3234
class LUModifyNodeStorage(NoHooksLU):
  """Logical unit for modifying a storage volume on a node.

  """
  _OP_REQP = ["node_name", "storage_type", "name", "changes"]
  REQ_BGL = False

  def CheckArguments(self):
    """Expand the node name and validate the storage type.

    """
    # the expanded name must be stored back in the opcode, as both the
    # locking declaration and the RPC call read self.op.node_name
    self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)

    _CheckStorageType(self.op.storage_type)

  def ExpandNames(self):
    """Request the lock of the target node.

    """
    self.needed_locks = {
      locking.LEVEL_NODE: self.op.node_name,
      }

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the storage type supports modifications and that
    only modifiable fields are being changed.

    @raise errors.OpPrereqError: if the storage type has no modifiable
        fields or if a non-modifiable field is among the changes

    """
    storage_type = self.op.storage_type

    try:
      modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
    except KeyError:
      raise errors.OpPrereqError("Storage units of type '%s' can not be"
                                 " modified" % storage_type,
                                 errors.ECODE_INVAL)

    diff = set(self.op.changes.keys()) - modifiable
    if diff:
      raise errors.OpPrereqError("The following fields can not be modified for"
                                 " storage units of type '%s': %r" %
                                 (storage_type, list(diff)),
                                 errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Modifies the storage unit on the node.

    The changes are sent to the node via the storage_modify RPC; a
    failure of the call is raised through the result's Raise method.

    """
    st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
    result = self.rpc.call_storage_modify(self.op.node_name,
                                          self.op.storage_type, st_args,
                                          self.op.name, self.op.changes)
    result.Raise("Failed to modify storage unit '%s' on %s" %
                 (self.op.name, self.op.node_name))
3281

    
3282

    
3283
class LUAddNode(LogicalUnit):
  """Logical unit for adding node to the cluster.

  """
  HPATH = "node-add"
  HTYPE = constants.HTYPE_NODE
  _OP_REQP = ["node_name"]

  def CheckArguments(self):
    """Normalize the name of the node to be added.

    """
    # validate/normalize the node name
    self.op.node_name = utils.HostInfo.NormalizeName(self.op.node_name)

  def BuildHooksEnv(self):
    """Build hooks env.

    This will run on all nodes before, and on all nodes + the new node after.

    """
    env = {
      "OP_TARGET": self.op.node_name,
      "NODE_NAME": self.op.node_name,
      "NODE_PIP": self.op.primary_ip,
      "NODE_SIP": self.op.secondary_ip,
      }
    nodes_0 = self.cfg.GetNodeList()
    nodes_1 = nodes_0 + [self.op.node_name, ]
    return env, nodes_0, nodes_1

  def CheckPrereq(self):
    """Check prerequisites.

    This checks:
     - the new node is not already in the config
     - it is resolvable
     - its parameters (single/dual homed) matches the cluster

    Any errors are signaled by raising errors.OpPrereqError.

    On success it sets self.changed_primary_ip, self.master_candidate
    and self.new_node for use in Exec.

    """
    node_name = self.op.node_name
    cfg = self.cfg

    dns_data = utils.GetHostInfo(node_name)

    node = dns_data.name
    primary_ip = self.op.primary_ip = dns_data.ip
    # a missing secondary IP means the node is single-homed
    secondary_ip = getattr(self.op, "secondary_ip", None)
    if secondary_ip is None:
      secondary_ip = primary_ip
    if not utils.IsValidIP(secondary_ip):
      raise errors.OpPrereqError("Invalid secondary IP given",
                                 errors.ECODE_INVAL)
    self.op.secondary_ip = secondary_ip

    node_list = cfg.GetNodeList()
    if not self.op.readd and node in node_list:
      raise errors.OpPrereqError("Node %s is already in the configuration" %
                                 node, errors.ECODE_EXISTS)
    elif self.op.readd and node not in node_list:
      raise errors.OpPrereqError("Node %s is not in the configuration" % node,
                                 errors.ECODE_NOENT)

    self.changed_primary_ip = False

    for existing_node_name in node_list:
      existing_node = cfg.GetNodeInfo(existing_node_name)

      if self.op.readd and node == existing_node_name:
        # on re-add the secondary IP must be unchanged; a changed primary
        # IP is only recorded here and applied in Exec
        if existing_node.secondary_ip != secondary_ip:
          raise errors.OpPrereqError("Readded node doesn't have the same IP"
                                     " address configuration as before",
                                     errors.ECODE_INVAL)
        if existing_node.primary_ip != primary_ip:
          self.changed_primary_ip = True

        continue

      if (existing_node.primary_ip == primary_ip or
          existing_node.secondary_ip == primary_ip or
          existing_node.primary_ip == secondary_ip or
          existing_node.secondary_ip == secondary_ip):
        raise errors.OpPrereqError("New node ip address(es) conflict with"
                                   " existing node %s" % existing_node.name,
                                   errors.ECODE_NOTUNIQUE)

    # check that the type of the node (single versus dual homed) is the
    # same as for the master
    myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
    master_singlehomed = myself.secondary_ip == myself.primary_ip
    newbie_singlehomed = secondary_ip == primary_ip
    if master_singlehomed != newbie_singlehomed:
      if master_singlehomed:
        raise errors.OpPrereqError("The master has no private ip but the"
                                   " new node has one",
                                   errors.ECODE_INVAL)
      else:
        raise errors.OpPrereqError("The master has a private ip but the"
                                   " new node doesn't have one",
                                   errors.ECODE_INVAL)

    # checks reachability
    if not utils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
      raise errors.OpPrereqError("Node not reachable by ping",
                                 errors.ECODE_ENVIRON)

    if not newbie_singlehomed:
      # check reachability from my secondary ip to newbie's secondary ip
      if not utils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
                           source=myself.secondary_ip):
        raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
                                   " based ping to noded port",
                                   errors.ECODE_ENVIRON)

    # on re-add the node itself is passed as an exception to the
    # self-promotion decision
    if self.op.readd:
      exceptions = [node]
    else:
      exceptions = []

    self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)

    if self.op.readd:
      self.new_node = self.cfg.GetNodeInfo(node)
      assert self.new_node is not None, "Can't retrieve locked node %s" % node
    else:
      self.new_node = objects.Node(name=node,
                                   primary_ip=primary_ip,
                                   secondary_ip=secondary_ip,
                                   master_candidate=self.master_candidate,
                                   offline=False, drained=False)

  def Exec(self, feedback_fn):
    """Adds the new node to the cluster.

    @raise errors.OpExecError: on protocol version mismatch, if the node
        does not have the given secondary IP, or if the ssh/hostname
        verification from the master fails

    """
    new_node = self.new_node
    node = new_node.name

    # for re-adds, reset the offline/drained/master-candidate flags;
    # we need to reset here, otherwise offline would prevent RPC calls
    # later in the procedure; this also means that if the re-add
    # fails, we are left with a non-offlined, broken node
    if self.op.readd:
      new_node.drained = new_node.offline = False # pylint: disable-msg=W0201
      self.LogInfo("Readding a node, the offline/drained flags were reset")
      # if we demote the node, we do cleanup later in the procedure
      new_node.master_candidate = self.master_candidate
      if self.changed_primary_ip:
        new_node.primary_ip = self.op.primary_ip

    # notify the user about any possible mc promotion
    if new_node.master_candidate:
      self.LogInfo("Node will be a master candidate")

    # check connectivity
    result = self.rpc.call_version([node])[node]
    result.Raise("Can't get version information from node %s" % node)
    if constants.PROTOCOL_VERSION == result.payload:
      logging.info("Communication to node %s fine, sw version %s match",
                   node, result.payload)
    else:
      raise errors.OpExecError("Version mismatch master version %s,"
                               " node version %s" %
                               (constants.PROTOCOL_VERSION, result.payload))

    # setup ssh on node
    if self.cfg.GetClusterInfo().modify_ssh_setup:
      logging.info("Copy ssh key to node %s", node)
      priv_key, pub_key, _ = ssh.GetUserFiles(constants.GANETI_RUNAS)
      # the order of the files is the order of the call_node_add arguments
      keyfiles = [constants.SSH_HOST_DSA_PRIV, constants.SSH_HOST_DSA_PUB,
                  constants.SSH_HOST_RSA_PRIV, constants.SSH_HOST_RSA_PUB,
                  priv_key, pub_key]
      keyarray = [utils.ReadFile(path) for path in keyfiles]

      result = self.rpc.call_node_add(node, keyarray[0], keyarray[1],
                                      keyarray[2], keyarray[3], keyarray[4],
                                      keyarray[5])
      result.Raise("Cannot transfer ssh keys to the new node")

    # Add node to our /etc/hosts
    if self.cfg.GetClusterInfo().modify_etc_hosts:
      # FIXME: this should be done via an rpc call to node daemon
      utils.AddHostToEtcHosts(new_node.name)

    if new_node.secondary_ip != new_node.primary_ip:
      result = self.rpc.call_node_has_ip_address(new_node.name,
                                                 new_node.secondary_ip)
      result.Raise("Failure checking secondary ip on node %s" % new_node.name,
                   prereq=True, ecode=errors.ECODE_ENVIRON)
      if not result.payload:
        raise errors.OpExecError("Node claims it doesn't have the secondary ip"
                                 " you gave (%s). Please fix and re-run this"
                                 " command." % new_node.secondary_ip)

    # verify from the master that the new node passes the node list check
    node_verify_list = [self.cfg.GetMasterNode()]
    node_verify_param = {
      constants.NV_NODELIST: [node],
      # TODO: do a node-net-test as well?
    }

    result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
                                       self.cfg.GetClusterName())
    for verifier in node_verify_list:
      result[verifier].Raise("Cannot communicate with node %s" % verifier)
      nl_payload = result[verifier].payload[constants.NV_NODELIST]
      if nl_payload:
        # report every failure before aborting
        for failed in nl_payload:
          feedback_fn("ssh/hostname verification failed"
                      " (checking from %s): %s" %
                      (verifier, nl_payload[failed]))
        raise errors.OpExecError("ssh/hostname verification failed.")

    if self.op.readd:
      _RedistributeAncillaryFiles(self)
      self.context.ReaddNode(new_node)
      # make sure we redistribute the config
      self.cfg.Update(new_node, feedback_fn)
      # and make sure the new node will not have old files around
      if not new_node.master_candidate:
        result = self.rpc.call_node_demote_from_mc(new_node.name)
        msg = result.fail_msg
        if msg:
          self.LogWarning("Node failed to demote itself from master"
                          " candidate status: %s" % msg)
    else:
      _RedistributeAncillaryFiles(self, additional_nodes=[node])
      self.context.AddNode(new_node, self.proc.GetECId())
3512

    
3513

    
3514
class LUSetNodeParams(LogicalUnit):
  """Modifies the parameters of a node.

  """
  HPATH = "node-modify"
  HTYPE = constants.HTYPE_NODE
  _OP_REQP = ["node_name"]
  REQ_BGL = False

  def CheckArguments(self):
    """Validate the requested flag changes and derive the locking mode.

    The offline, master_candidate and drained opcode fields are
    tri-state: None means "leave unchanged", True/False request a change.

    @raise errors.OpPrereqError: if no modification was passed, or if
        more than one of the flags is being set to True

    """
    self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
    _CheckBooleanOpField(self.op, 'master_candidate')
    _CheckBooleanOpField(self.op, 'offline')
    _CheckBooleanOpField(self.op, 'drained')
    _CheckBooleanOpField(self.op, 'auto_promote')
    all_mods = [self.op.offline, self.op.master_candidate, self.op.drained]
    if all_mods.count(None) == len(all_mods):
      raise errors.OpPrereqError("Please pass at least one modification",
                                 errors.ECODE_INVAL)
    if all_mods.count(True) > 1:
      raise errors.OpPrereqError("Can't set the node into more than one"
                                 " state at the same time",
                                 errors.ECODE_INVAL)

    # The explicit comparisons against True/False below are intentional:
    # None ("not specified") must match neither of them.

    # Boolean value that tells us whether we're offlining or draining the node
    self.offline_or_drain = (self.op.offline == True or
                             self.op.drained == True)
    self.deoffline_or_drain = (self.op.offline == False or
                               self.op.drained == False)
    self.might_demote = (self.op.master_candidate == False or
                         self.offline_or_drain)

    # all nodes are locked only when a demotion may happen and the user
    # allowed auto-promotion
    self.lock_all = self.op.auto_promote and self.might_demote

  def ExpandNames(self):
    """Declare either all node locks or only the target node's lock.

    """
    if self.lock_all:
      self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
    else:
      self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on the master node and on the node being modified.

    """
    env = {
      "OP_TARGET": self.op.node_name,
      "MASTER_CANDIDATE": str(self.op.master_candidate),
      "OFFLINE": str(self.op.offline),
      "DRAINED": str(self.op.drained),
      }
    nl = [self.cfg.GetMasterNode(),
          self.op.node_name]
    return env, nl, nl

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the target is not the master node, that a demotion
    without auto-promotion leaves enough master candidates, and that an
    offline or drained node is not being made a master candidate. When
    the node is being de-offlined or un-drained it may also decide to
    promote the node itself.

    @raise errors.OpPrereqError: if any of the checks fails

    """
    node = self.node = self.cfg.GetNodeInfo(self.op.node_name)

    if (self.op.master_candidate is not None or
        self.op.drained is not None or
        self.op.offline is not None):
      # we can't change the master's node flags
      if self.op.node_name == self.cfg.GetMasterNode():
        raise errors.OpPrereqError("The master role can be changed"
                                   " only via masterfailover",
                                   errors.ECODE_INVAL)

    if node.master_candidate and self.might_demote and not self.lock_all:
      assert not self.op.auto_promote, "auto-promote set but lock_all not"
      # check if after removing the current node, we're missing master
      # candidates
      (mc_remaining, mc_should, _) = \
          self.cfg.GetMasterCandidateStats(exceptions=[node.name])
      if mc_remaining < mc_should:
        raise errors.OpPrereqError("Not enough master candidates, please"
                                   " pass auto_promote to allow promotion",
                                   errors.ECODE_INVAL)

    # promotion is refused while the node stays offline or drained, i.e.
    # unless the same request explicitly clears the respective flag
    if (self.op.master_candidate == True and
        ((node.offline and not self.op.offline == False) or
         (node.drained and not self.op.drained == False))):
      raise errors.OpPrereqError("Node '%s' is offline or drained, can't set"
                                 " to master_candidate" % node.name,
                                 errors.ECODE_INVAL)

    # If we're being deofflined/drained, we'll MC ourself if needed
    if (self.deoffline_or_drain and not self.offline_or_drain and not
        self.op.master_candidate == True and not node.master_candidate):
      self.op.master_candidate = _DecideSelfPromotion(self)
      if self.op.master_candidate:
        self.LogInfo("Autopromoting node to master candidate")

  def Exec(self, feedback_fn):
    """Modifies a node.

    @return: a list of (parameter name, description of the change) tuples

    """
    node = self.node

    result = []
    changed_mc = False

    if self.op.offline is not None:
      node.offline = self.op.offline
      result.append(("offline", str(self.op.offline)))
      if self.op.offline == True:
        # offlining also drops the master candidate and drained flags
        if node.master_candidate:
          node.master_candidate = False
          changed_mc = True
          result.append(("master_candidate", "auto-demotion due to offline"))
        if node.drained:
          node.drained = False
          result.append(("drained", "clear drained status due to offline"))

    if self.op.master_candidate is not None:
      node.master_candidate = self.op.master_candidate
      changed_mc = True
      result.append(("master_candidate", str(self.op.master_candidate)))
      if self.op.master_candidate == False:
        # a failed demotion RPC only produces a warning
        rrc = self.rpc.call_node_demote_from_mc(node.name)
        msg = rrc.fail_msg
        if msg:
          self.LogWarning("Node failed to demote itself: %s" % msg)

    if self.op.drained is not None:
      node.drained = self.op.drained
      result.append(("drained", str(self.op.drained)))
      if self.op.drained == True:
        # draining also drops the master candidate and offline flags
        if node.master_candidate:
          node.master_candidate = False
          changed_mc = True
          result.append(("master_candidate", "auto-demotion due to drain"))
          rrc = self.rpc.call_node_demote_from_mc(node.name)
          msg = rrc.fail_msg
          if msg:
            self.LogWarning("Node failed to demote itself: %s" % msg)
        if node.offline:
          node.offline = False
          result.append(("offline", "clear offline status due to drain"))

    # we locked all nodes, we adjust the CP before updating this node
    if self.lock_all:
      _AdjustCandidatePool(self, [node.name])

    # this will trigger configuration file update, if needed
    self.cfg.Update(node, feedback_fn)

    # this will trigger job queue propagation or cleanup
    if changed_mc:
      self.context.ReaddNode(node)

    return result
3675

    
3676

    
3677
class LUPowercycleNode(NoHooksLU):
  """Powercycles a node.

  """
  _OP_REQP = ["node_name", "force"]
  REQ_BGL = False

  def CheckArguments(self):
    """Expand the node name and refuse the master node unless forced.

    @raise errors.OpPrereqError: if the target is the master node and
        the force parameter is not set

    """
    self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
    if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
      raise errors.OpPrereqError("The node is the master and the force"
                                 " parameter was not set",
                                 errors.ECODE_INVAL)

  def ExpandNames(self):
    """Locking for PowercycleNode.

    This is a last-resort option and shouldn't block on other
    jobs. Therefore, we grab no locks.

    """
    self.needed_locks = {}

  def CheckPrereq(self):
    """Check prerequisites.

    This LU has no prereqs.

    """
    pass

  def Exec(self, feedback_fn):
    """Reboots a node.

    @return: the payload of the powercycle RPC call

    """
    result = self.rpc.call_node_powercycle(self.op.node_name,
                                           self.cfg.GetHypervisorType())
    result.Raise("Failed to schedule the reboot")
    return result.payload
3716

    
3717

    
3718
class LUQueryClusterInfo(NoHooksLU):
  """Query cluster configuration.

  """
  _OP_REQP = []
  REQ_BGL = False

  def ExpandNames(self):
    """This LU needs no locks.

    """
    self.needed_locks = {}

  def CheckPrereq(self):
    """No prerequisites needed for this LU.

    """
    pass

  def Exec(self, feedback_fn):
    """Return cluster config.

    @rtype: dict
    @return: version constants, the local architecture and the cluster
        level settings; hypervisor related entries are restricted to the
        enabled hypervisors

    """
    cluster = self.cfg.GetClusterInfo()
    os_hvp = {}

    # Filter just for enabled hypervisors
    for os_name, hv_dict in cluster.os_hvp.items():
      os_hvp[os_name] = {}
      for hv_name, hv_params in hv_dict.items():
        if hv_name in cluster.enabled_hypervisors:
          os_hvp[os_name][hv_name] = hv_params

    result = {
      "software_version": constants.RELEASE_VERSION,
      "protocol_version": constants.PROTOCOL_VERSION,
      "config_version": constants.CONFIG_VERSION,
      "os_api_version": max(constants.OS_API_VERSIONS),
      "export_version": constants.EXPORT_VERSION,
      "architecture": (platform.architecture()[0], platform.machine()),
      "name": cluster.cluster_name,
      "master": cluster.master_node,
      # the first enabled hypervisor is reported as the default one
      "default_hypervisor": cluster.enabled_hypervisors[0],
      "enabled_hypervisors": cluster.enabled_hypervisors,
      "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
                        for hypervisor_name in cluster.enabled_hypervisors]),
      "os_hvp": os_hvp,
      "beparams": cluster.beparams,
      "nicparams": cluster.nicparams,
      "candidate_pool_size": cluster.candidate_pool_size,
      "master_netdev": cluster.master_netdev,
      "volume_group_name": cluster.volume_group_name,
      "file_storage_dir": cluster.file_storage_dir,
      "maintain_node_health": cluster.maintain_node_health,
      "ctime": cluster.ctime,
      "mtime": cluster.mtime,
      "uuid": cluster.uuid,
      "tags": list(cluster.GetTags()),
      "uid_pool": cluster.uid_pool,
      }

    return result
3777

    
3778

    
3779
class LUQueryConfigValues(NoHooksLU):
  """Return configuration values.

  """
  _OP_REQP = []
  REQ_BGL = False
  _FIELDS_DYNAMIC = utils.FieldSet()
  _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
                                  "watcher_pause")

  def ExpandNames(self):
    """Declare no locks and validate the selected output fields.

    """
    self.needed_locks = {}

    _CheckOutputFields(static=self._FIELDS_STATIC,
                       dynamic=self._FIELDS_DYNAMIC,
                       selected=self.op.output_fields)

  def CheckPrereq(self):
    """No prerequisites.

    """
    pass

  def Exec(self, feedback_fn):
    """Compute the values of the requested configuration fields.

    @rtype: list
    @return: one value per entry of the output_fields opcode parameter,
        in the same order
    @raise errors.ParameterError: for a field name not handled here

    """
    values = []
    for field in self.op.output_fields:
      if field == "cluster_name":
        entry = self.cfg.GetClusterName()
      elif field == "master_node":
        entry = self.cfg.GetMasterNode()
      elif field == "drain_flag":
        # the flag is the existence of the job queue drain file
        entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
      elif field == "watcher_pause":
        entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
      else:
        raise errors.ParameterError(field)
      values.append(entry)
    return values
3820

    
3821

    
3822
class LUActivateInstanceDisks(NoHooksLU):
  """Bring up an instance's disks.

  """
  _OP_REQP = ["instance_name"]
  REQ_BGL = False

  def ExpandNames(self):
    """Lock the instance; its node locks are computed in DeclareLocks.

    """
    self._ExpandAndLockInstance()
    self.needed_locks[locking.LEVEL_NODE] = []
    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE

  def DeclareLocks(self, level):
    """Fill in the node locks when the node level is reached.

    """
    if level == locking.LEVEL_NODE:
      self._LockInstancesNodes()

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster, runs the online
    check on its primary node and defaults the optional ignore_size
    opcode parameter to False.

    """
    self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
    assert self.instance is not None, \
      "Cannot retrieve locked instance %s" % self.op.instance_name
    _CheckNodeOnline(self, self.instance.primary_node)
    if not hasattr(self.op, "ignore_size"):
      self.op.ignore_size = False

  def Exec(self, feedback_fn):
    """Activate the disks.

    @return: the device information list computed by
        L{_AssembleInstanceDisks}
    @raise errors.OpExecError: if the disks could not be assembled

    """
    disks_ok, disks_info = \
              _AssembleInstanceDisks(self, self.instance,
                                     ignore_size=self.op.ignore_size)
    if not disks_ok:
      raise errors.OpExecError("Cannot activate block devices")

    return disks_info
3862

    
3863

    
3864
def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False,
                           ignore_size=False):
  """Prepare the block devices for an instance.

  This sets up the block devices on all nodes.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instance: L{objects.Instance}
  @param instance: the instance for whose disks we assemble
  @type ignore_secondaries: boolean
  @param ignore_secondaries: if true, errors on secondary nodes
      won't result in an error return from the function
  @type ignore_size: boolean
  @param ignore_size: if true, the current known size of the disk
      will not be used during the disk activation, useful for cases
      when the size is wrong
  @rtype: tuple
  @return: a tuple of (disks_ok, device_info); disks_ok is False if any
      assembly failed (first-pass failures are not counted when
      ignore_secondaries is true), device_info is a list of
      (host, instance_visible_name, node_visible_name) tuples
      with the mapping from node devices to instance devices; the
      node_visible_name is None for disks whose assembly on the
      primary node failed

  """
  device_info = []
  disks_ok = True
  iname = instance.name
  # With the two passes mechanism we try to reduce the window of
  # opportunity for the race condition of switching DRBD to primary
  # before handshaking occurred, but we do not eliminate it

  # The proper fix would be to wait (with some limits) until the
  # connection has been made and drbd transitions from WFConnection
  # into any other network-connected state (Connected, SyncTarget,
  # SyncSource, etc.)

  # 1st pass, assemble on all nodes in secondary mode
  for inst_disk in instance.disks:
    for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
      if ignore_size:
        # work on a copy so the configured disk object keeps its size
        node_disk = node_disk.Copy()
        node_disk.UnsetSize()
      lu.cfg.SetDiskID(node_disk, node)
      result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False)
      msg = result.fail_msg
      if msg:
        lu.proc.LogWarning("Could not prepare block device %s on node %s"
                           " (is_primary=False, pass=1): %s",
                           inst_disk.iv_name, node, msg)
        if not ignore_secondaries:
          disks_ok = False

  # FIXME: race condition on drbd migration to primary

  # 2nd pass, do only the primary node
  for inst_disk in instance.disks:
    dev_path = None

    for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
      if node != instance.primary_node:
        continue
      if ignore_size:
        node_disk = node_disk.Copy()
        node_disk.UnsetSize()
      lu.cfg.SetDiskID(node_disk, node)
      result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True)
      msg = result.fail_msg
      if msg:
        # failures on the primary are always fatal
        lu.proc.LogWarning("Could not prepare block device %s on node %s"
                           " (is_primary=True, pass=2): %s",
                           inst_disk.iv_name, node, msg)
        disks_ok = False
      else:
        dev_path = result.payload

    device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))

  # leave the disks configured for the primary node
  # this is a workaround that would be fixed better by
  # improving the logical/physical id handling
  for disk in instance.disks:
    lu.cfg.SetDiskID(disk, instance.primary_node)

  return disks_ok, device_info
3946

    
3947

    
3948
def _StartInstanceDisks(lu, instance, force):
  """Start the disks of an instance.

  If the disks cannot be assembled they are shut down again and an
  error is raised.

  @param lu: the logical unit on whose behalf we execute
  @param instance: the instance whose disks we start
  @param force: passed as ignore_secondaries to
      L{_AssembleInstanceDisks}; when it is false but not None, a hint
      about retrying with '--force' is logged on failure
  @raise errors.OpExecError: if the disks could not be assembled

  """
  disks_ok, _ = _AssembleInstanceDisks(lu, instance,
                                       ignore_secondaries=force)
  if not disks_ok:
    _ShutdownInstanceDisks(lu, instance)
    # None suppresses the hint, only an explicit false value shows it
    if force is not None and not force:
      lu.proc.LogWarning("", hint="If the message above refers to a"
                         " secondary node,"
                         " you can retry the operation using '--force'.")
    raise errors.OpExecError("Disk consistency error")
3961

    
3962

    
3963
class LUDeactivateInstanceDisks(NoHooksLU):
  """Shutdown an instance's disks.

  """
  _OP_REQP = ["instance_name"]
  REQ_BGL = False

  def ExpandNames(self):
    """Lock the instance; its node locks are computed in DeclareLocks.

    """
    self._ExpandAndLockInstance()
    self.needed_locks[locking.LEVEL_NODE] = []
    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE

  def DeclareLocks(self, level):
    """Fill in the node locks when the node level is reached.

    """
    if level == locking.LEVEL_NODE:
      self._LockInstancesNodes()

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster.

    """
    self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
    assert self.instance is not None, \
      "Cannot retrieve locked instance %s" % self.op.instance_name

  def Exec(self, feedback_fn):
    """Deactivate the disks.

    The work is delegated to L{_SafeShutdownInstanceDisks}.

    """
    instance = self.instance
    _SafeShutdownInstanceDisks(self, instance)
3995

    
3996

    
3997
def _SafeShutdownInstanceDisks(lu, instance):
  """Shutdown block devices of an instance.

  This function checks if an instance is running, before calling
  _ShutdownInstanceDisks.

  @param lu: the logical unit on whose behalf we execute
  @param instance: the instance whose disks we shut down

  """
  _CheckInstanceDown(lu, instance, "cannot shutdown disks")
  _ShutdownInstanceDisks(lu, instance)
4006

    
4007

    
4008
def _ShutdownInstanceDisks(lu, instance, ignore_primary=False):
  """Shutdown block devices of an instance.

  This does the shutdown on all nodes of the instance.

  If ignore_primary is true, errors on the primary node are
  ignored; errors on any other node always count as a failure.

  @param lu: the logical unit on whose behalf we execute
  @param instance: the instance whose disks we shut down
  @type ignore_primary: boolean
  @param ignore_primary: whether shutdown errors on the primary node
      should leave the result unaffected
  @rtype: boolean
  @return: True if no (non-ignored) shutdown error occurred, False
      otherwise

  """
  all_result = True
  for disk in instance.disks:
    for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
      lu.cfg.SetDiskID(top_disk, node)
      result = lu.rpc.call_blockdev_shutdown(node, top_disk)
      msg = result.fail_msg
      if msg:
        # every failure is logged, even those not affecting the result
        lu.LogWarning("Could not shutdown block device %s on node %s: %s",
                      disk.iv_name, node, msg)
        if not ignore_primary or node != instance.primary_node:
          all_result = False
  return all_result
4029

    
4030

    
4031
def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
4032
  """Checks if a node has enough free memory.
4033

4034
  This function check if a given node has the needed amount of free
4035
  memory. In case the node has less memory or we cannot get the
4036
  information from the node, this function raise an OpPrereqError
4037
  exception.
4038

4039
  @type lu: C{LogicalUnit}
4040
  @param lu: a logical unit from which we get configuration data
4041