Statistics
| Branch: | Tag: | Revision:

root / lib / client / gnt_cluster.py @ 5959f75d

History | View | Annotate | Download (50.9 kB)

1
#
2
#
3

    
4
# Copyright (C) 2006, 2007, 2010, 2011 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21
"""Cluster related commands"""
22

    
23
# pylint: disable=W0401,W0613,W0614,C0103
24
# W0401: Wildcard import ganeti.cli
25
# W0613: Unused argument, since all functions follow the same API
26
# W0614: Unused import %s from wildcard import (since we need cli)
27
# C0103: Invalid name gnt-cluster
28

    
29
import os.path
30
import time
31
import OpenSSL
32
import itertools
33

    
34
from ganeti.cli import *
35
from ganeti import opcodes
36
from ganeti import constants
37
from ganeti import errors
38
from ganeti import utils
39
from ganeti import bootstrap
40
from ganeti import ssh
41
from ganeti import objects
42
from ganeti import uidpool
43
from ganeti import compat
44
from ganeti import netutils
45

    
46

    
47
# Boolean flag "--on"; stored as "on" in the parsed options
ON_OPT = cli_option("--on", dest="on", action="store_true", default=False,
                    help="Recover from an EPO")

# Boolean flag "--groups"; stored as "groups" in the parsed options
GROUPS_OPT = cli_option("--groups", dest="groups", action="store_true",
                        default=False,
                        help="Arguments are node groups instead of nodes")

# EPO timing values, all in seconds
_EPO_PING_INTERVAL = 30 # 30 seconds between pings
_EPO_PING_TIMEOUT = 1 # 1 second
_EPO_REACHABLE_TIMEOUT = 60 * 15 # 15 minutes
58

    
59

    
60
@UsesRPC
def InitCluster(opts, args):
  """Initialize the cluster.

  Checks the command line options for conflicts, completes the parameter
  dictionaries with the defaults found in the C{constants} module, calls
  C{bootstrap.InitCluster} and then submits an C{OpClusterPostInit}
  opcode.

  @param opts: the command line options selected by the user; the
      candidate pool size, MAC prefix and wipe-disks values are replaced
      in place by their defaults when they are C{None}
  @type args: list
  @param args: should contain only one element, the desired
      cluster name
  @rtype: int
  @return: the desired exit code (1 if one of the option checks fails,
      0 once the post-init opcode has been submitted)

  """
  # a volume group name cannot be combined with disabled LVM storage
  if not opts.lvm_storage and opts.vg_name:
    ToStderr("Options --no-lvm-storage and --vg-name conflict.")
    return 1

  # LVM enabled but no name given: fall back to the default volume group
  vg_name = opts.vg_name
  if opts.lvm_storage and not opts.vg_name:
    vg_name = constants.DEFAULT_VG

  # same pair of checks for DRBD and its usermode helper
  if not opts.drbd_storage and opts.drbd_helper:
    ToStderr("Options --no-drbd-storage and --drbd-usermode-helper conflict.")
    return 1

  drbd_helper = opts.drbd_helper
  if opts.drbd_storage and not opts.drbd_helper:
    drbd_helper = constants.DEFAULT_DRBD_HELPER

  master_netdev = opts.master_netdev
  if master_netdev is None:
    master_netdev = constants.DEFAULT_BRIDGE

  # the hypervisor list is handled as a comma-separated string and split
  # into a list here
  hvlist = opts.enabled_hypervisors
  if hvlist is None:
    hvlist = constants.DEFAULT_ENABLED_HYPERVISOR
  hvlist = hvlist.split(",")

  # work on a copy of hvparams, it gets per-hypervisor entries added below
  hvparams = dict(opts.hvparams)
  beparams = opts.beparams
  nicparams = opts.nicparams

  # copied for the same reason: per-template entries are added below
  diskparams = dict(opts.diskparams)

  # check the disk template types here, as we cannot rely on the type check done
  # by the opcode parameter types
  diskparams_keys = set(diskparams.keys())
  if not (diskparams_keys <= constants.DISK_TEMPLATES):
    unknown = utils.NiceSort(diskparams_keys - constants.DISK_TEMPLATES)
    ToStderr("Disk templates unknown: %s" % utils.CommaJoin(unknown))
    return 1

  # prepare beparams dict
  beparams = objects.FillDict(constants.BEC_DEFAULTS, beparams)
  utils.ForceDictType(beparams, constants.BES_PARAMETER_COMPAT)

  # prepare nicparams dict
  nicparams = objects.FillDict(constants.NICC_DEFAULTS, nicparams)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  # prepare ndparams dict; the type check is only applied when the user
  # supplied values
  if opts.ndparams is None:
    ndparams = dict(constants.NDC_DEFAULTS)
  else:
    ndparams = objects.FillDict(constants.NDC_DEFAULTS, opts.ndparams)
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)

  # prepare hvparams dict: every known hypervisor gets an entry, filled
  # from its defaults
  for hv in constants.HYPER_TYPES:
    if hv not in hvparams:
      hvparams[hv] = {}
    hvparams[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], hvparams[hv])
    utils.ForceDictType(hvparams[hv], constants.HVS_PARAMETER_TYPES)

  # prepare diskparams dict: every known disk template gets an entry,
  # filled from its defaults
  for templ in constants.DISK_TEMPLATES:
    if templ not in diskparams:
      diskparams[templ] = {}
    diskparams[templ] = objects.FillDict(constants.DISK_DT_DEFAULTS[templ],
                                         diskparams[templ])
    utils.ForceDictType(diskparams[templ], constants.DISK_DT_TYPES)

  # prepare ipolicy dict
  ipolicy_raw = \
    objects.CreateIPolicyFromOpts(ispecs_mem_size=opts.ispecs_mem_size,
                                  ispecs_cpu_count=opts.ispecs_cpu_count,
                                  ispecs_disk_count=opts.ispecs_disk_count,
                                  ispecs_disk_size=opts.ispecs_disk_size,
                                  ispecs_nic_count=opts.ispecs_nic_count)
  ipolicy = objects.FillDictOfDicts(constants.IPOLICY_DEFAULTS, ipolicy_raw)
  for value in ipolicy.values():
    utils.ForceDictType(value, constants.ISPECS_PARAMETER_TYPES)

  # the next defaults are written back into opts, which is read again in
  # the bootstrap.InitCluster call below
  if opts.candidate_pool_size is None:
    opts.candidate_pool_size = constants.MASTER_POOL_SIZE_DEFAULT

  if opts.mac_prefix is None:
    opts.mac_prefix = constants.DEFAULT_MAC_PREFIX

  uid_pool = opts.uid_pool
  if uid_pool is not None:
    uid_pool = uidpool.ParseUidPool(uid_pool)

  if opts.prealloc_wipe_disks is None:
    opts.prealloc_wipe_disks = False

  external_ip_setup_script = opts.use_external_mip_script
  if external_ip_setup_script is None:
    external_ip_setup_script = False

  # the IP version must be convertible to an integer
  try:
    primary_ip_version = int(opts.primary_ip_version)
  except (ValueError, TypeError), err:
    ToStderr("Invalid primary ip version value: %s" % str(err))
    return 1

  # the netmask is optional; when given it must be convertible to an integer
  master_netmask = opts.master_netmask
  try:
    if master_netmask is not None:
      master_netmask = int(master_netmask)
  except (ValueError, TypeError), err:
    ToStderr("Invalid master netmask value: %s" % str(err))
    return 1

  bootstrap.InitCluster(cluster_name=args[0],
                        secondary_ip=opts.secondary_ip,
                        vg_name=vg_name,
                        mac_prefix=opts.mac_prefix,
                        master_netmask=master_netmask,
                        master_netdev=master_netdev,
                        file_storage_dir=opts.file_storage_dir,
                        shared_file_storage_dir=opts.shared_file_storage_dir,
                        enabled_hypervisors=hvlist,
                        hvparams=hvparams,
                        beparams=beparams,
                        nicparams=nicparams,
                        ndparams=ndparams,
                        diskparams=diskparams,
                        ipolicy=ipolicy,
                        candidate_pool_size=opts.candidate_pool_size,
                        modify_etc_hosts=opts.modify_etc_hosts,
                        modify_ssh_setup=opts.modify_ssh_setup,
                        maintain_node_health=opts.maintain_node_health,
                        drbd_helper=drbd_helper,
                        uid_pool=uid_pool,
                        default_iallocator=opts.default_iallocator,
                        primary_ip_version=primary_ip_version,
                        prealloc_wipe_disks=opts.prealloc_wipe_disks,
                        use_external_mip_script=external_ip_setup_script,
                        )
  # submitted only after bootstrap.InitCluster has returned
  op = opcodes.OpClusterPostInit()
  SubmitOpCode(op, opts=opts)
  return 0
212

    
213

    
214
@UsesRPC
def DestroyCluster(opts, args):
  """Destroy the cluster.

  Nothing is done unless the user passed the confirmation option; in
  that case the destroy opcode is submitted and its result is handed to
  C{bootstrap.FinalizeClusterDestroy}.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code (1 if the confirmation option is
      missing, 0 otherwise)

  """
  if not opts.yes_do_it:
    ToStderr("Destroying a cluster is irreversible. If you really want to"
             " destroy this cluster, supply the --yes-do-it option.")
    return 1

  op = opcodes.OpClusterDestroy()
  master = SubmitOpCode(op, opts=opts)
  # if we reached this, the opcode didn't fail; we can proceed to
  # shutdown all the daemons
  bootstrap.FinalizeClusterDestroy(master)
  return 0
236

    
237

    
238
def RenameCluster(opts, args):
  """Give the cluster a new name.

  Unless the force option is set, the user has to confirm the rename
  first.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the new cluster name
  @rtype: int
  @return: the desired exit code (1 if the user declines, 0 otherwise)

  """
  client = GetClient()

  # exactly one value is requested, hence exactly one is unpacked
  [old_name] = client.QueryConfigValues(["cluster_name"])

  target_name = args[0]

  confirmed = opts.force
  if not confirmed:
    question = ("This will rename the cluster from '%s' to '%s'. If you are"
                " connected over the network to the cluster name, the"
                " operation is very dangerous as the IP address will be"
                " removed from the node and the change may not go through."
                " Continue?") % (old_name, target_name)
    confirmed = AskUser(question)

  if not confirmed:
    return 1

  rename_op = opcodes.OpClusterRename(name=target_name)
  final_name = SubmitOpCode(rename_op, opts=opts, cl=client)

  # the opcode result is printed as the new name, if there is one
  if final_name:
    ToStdout("Cluster renamed from '%s' to '%s'", old_name, final_name)

  return 0
269

    
270

    
271
def ActivateMasterIp(opts, args):
  """Activates the master IP.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: not used by this command
  @rtype: int
  @return: the desired exit code

  """
  op = opcodes.OpClusterActivateMasterIp()
  # hand the options to the submission call, as the other commands in
  # this module do
  SubmitOpCode(op, opts=opts)
  return 0
278

    
279

    
280
def DeactivateMasterIp(opts, args):
  """Deactivates the master IP.

  Unless the confirm option is set, the user is asked before the
  opcode is submitted.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: not used by this command
  @rtype: int
  @return: the desired exit code (1 if the user declines, 0 otherwise)

  """
  if not opts.confirm:
    usertext = ("This will disable the master IP. All the open connections to"
                " the master IP will be closed. To reach the master you will"
                " need to use its node IP."
                " Continue?")
    if not AskUser(usertext):
      return 1

  op = opcodes.OpClusterDeactivateMasterIp()
  # hand the options to the submission call, as the other commands in
  # this module do
  SubmitOpCode(op, opts=opts)
  return 0
295

    
296

    
297
def RedistributeConfig(opts, args):
  """Request a push of the cluster configuration.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: empty list
  @rtype: int
  @return: the desired exit code

  """
  SubmitOrSend(opcodes.OpClusterRedistConf(), opts)
  return 0
310

    
311

    
312
def ShowClusterVersion(opts, args):
  """Print the version values reported by the cluster.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  info = GetClient().QueryClusterInfo()

  # (output format, key in the cluster info), in display order
  version_lines = [
    ("Software version: %s", "software_version"),
    ("Internode protocol: %s", "protocol_version"),
    ("Configuration format: %s", "config_version"),
    ("OS api version: %s", "os_api_version"),
    ("Export interface: %s", "export_version"),
    ]

  for (fmt, key) in version_lines:
    ToStdout(fmt, info[key])

  return 0
330

    
331

    
332
def ShowClusterMaster(opts, args):
  """Print the value returned by C{bootstrap.GetMaster}.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  ToStdout(bootstrap.GetMaster())
  return 0
345

    
346

    
347
def _PrintGroupedParams(paramsdict, level=1, roman=False):
  """Print grouped parameters (be, nic, disk) by group.

  Entries are printed sorted by key; a value that is itself a dict is
  printed as a nested group, one level deeper.

  @type paramsdict: dict of dicts
  @param paramsdict: {group: {param: value, ...}, ...}
  @type level: int
  @param level: Level of indentation
  @type roman: bool
  @param roman: whether integer values are passed through
      C{compat.TryToRoman} before being printed

  """
  prefix = level * "  "

  for (name, value) in sorted(paramsdict.items()):
    if isinstance(value, dict):
      # nested group: print its header, then recurse
      ToStdout("%s- %s:", prefix, name)
      _PrintGroupedParams(value, level=level + 1, roman=roman)
      continue

    if roman and isinstance(value, int):
      value = compat.TryToRoman(value)

    ToStdout("%s  %s: %s", prefix, name, value)
365

    
366

    
367
def ShowClusterConfig(opts, args):
  """Print the cluster information.

  The data comes from a single C{QueryClusterInfo} call and is written
  to the standard output section by section.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  info = GetClient().QueryClusterInfo()

  for (fmt, key) in [("Cluster name: %s", "name"),
                     ("Cluster UUID: %s", "uuid")]:
    ToStdout(fmt, info[key])

  for (fmt, key) in [("Creation time: %s", "ctime"),
                     ("Modification time: %s", "mtime")]:
    ToStdout(fmt, utils.FormatTime(info[key]))

  ToStdout("Master node: %s", info["master"])

  arch = info["architecture"]
  ToStdout("Architecture (this node): %s (%s)", arch[0], arch[1])

  tags = "(none)"
  if info["tags"]:
    tags = utils.CommaJoin(utils.NiceSort(info["tags"]))

  ToStdout("Tags: %s", tags)

  ToStdout("Default hypervisor: %s", info["default_hypervisor"])
  ToStdout("Enabled hypervisors: %s",
           utils.CommaJoin(info["enabled_hypervisors"]))

  # grouped sections printed without roman conversion
  for (title, key) in [("Hypervisor parameters:", "hvparams"),
                       ("OS-specific hypervisor parameters:", "os_hvp"),
                       ("OS parameters:", "osparams")]:
    ToStdout(title)
    _PrintGroupedParams(info[key])

  for (fmt, key) in [("Hidden OSes: %s", "hidden_os"),
                     ("Blacklisted OSes: %s", "blacklisted_os")]:
    ToStdout(fmt, utils.CommaJoin(info[key]))

  ToStdout("Cluster parameters:")
  pool_size = compat.TryToRoman(info["candidate_pool_size"],
                                convert=opts.roman_integers)
  ToStdout("  - candidate pool size: %s", pool_size)

  for (fmt, key) in [
    ("  - master netdev: %s", "master_netdev"),
    ("  - master netmask: %s", "master_netmask"),
    ("  - use external master IP address setup script: %s",
     "use_external_mip_script"),
    ("  - lvm volume group: %s", "volume_group_name"),
    ]:
    ToStdout(fmt, info[key])

  reserved_lvs = "(none)"
  if info["reserved_lvs"]:
    reserved_lvs = utils.CommaJoin(info["reserved_lvs"])
  ToStdout("  - lvm reserved volumes: %s", reserved_lvs)

  for (fmt, key) in [
    ("  - drbd usermode helper: %s", "drbd_usermode_helper"),
    ("  - file storage path: %s", "file_storage_dir"),
    ("  - shared file storage path: %s", "shared_file_storage_dir"),
    ("  - maintenance of node health: %s", "maintain_node_health"),
    ]:
    ToStdout(fmt, info[key])

  uid_pool_text = uidpool.FormatUidPool(info["uid_pool"],
                                        roman=opts.roman_integers)
  ToStdout("  - uid pool: %s", uid_pool_text)

  ToStdout("  - default instance allocator: %s", info["default_iallocator"])
  ToStdout("  - primary ip version: %d", info["primary_ip_version"])
  ToStdout("  - preallocation wipe disks: %s", info["prealloc_wipe_disks"])
  ToStdout("  - OS search path: %s", utils.CommaJoin(constants.OS_SEARCH_PATH))

  # grouped sections that honour the roman integers option
  for (title, key) in [("Default node parameters:", "ndparams"),
                       ("Default instance parameters:", "beparams"),
                       ("Default nic parameters:", "nicparams")]:
    ToStdout(title)
    _PrintGroupedParams(info[key], roman=opts.roman_integers)

  ToStdout("Instance policy - limits for instances:")
  for name in constants.IPOLICY_PARAMETERS:
    ToStdout("  - %s", name)
    _PrintGroupedParams(info["ipolicy"][name], roman=opts.roman_integers)

  return 0
457

    
458

    
459
def ClusterCopyFile(opts, args):
  """Copy a file from master to some nodes.

  A failed copy is reported on the standard error output; the remaining
  nodes are still processed.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the path of
      the file to be copied
  @rtype: int
  @return: the desired exit code
  @raise errors.OpPrereqError: if the file does not exist

  """
  path = args[0]
  if not os.path.exists(path):
    raise errors.OpPrereqError("No such filename '%s'" % path,
                               errors.ECODE_INVAL)

  client = GetClient()

  name_of_cluster = client.QueryConfigValues(["cluster_name"])[0]

  # the master is filtered out of the list of copy targets
  targets = GetOnlineNodes(nodes=opts.nodes, cl=client, filter_master=True,
                           secondary_ips=opts.use_replication_network,
                           nodegroup=opts.nodegroup)

  runner = ssh.SshRunner(cluster_name=name_of_cluster)
  for target in targets:
    if runner.CopyFileToNode(target, path):
      continue
    ToStderr("Copy of file %s to node %s failed", path, target)

  return 0
489

    
490

    
491
def RunClusterCommand(opts, args):
  """Run a command on some nodes.

  The command is run as root on one node after the other, the master
  node last, and the output and exit code of each run are printed.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain the command to be run and its arguments
  @rtype: int
  @return: the desired exit code

  """
  client = GetClient()

  # the arguments are glued together into one command string
  cmdline = " ".join(args)

  targets = GetOnlineNodes(nodes=opts.nodes, cl=client,
                           nodegroup=opts.nodegroup)

  (name_of_cluster, master) = \
    client.QueryConfigValues(["cluster_name", "master_node"])

  runner = ssh.SshRunner(cluster_name=name_of_cluster)

  # Make sure master node is at list end
  try:
    targets.remove(master)
  except ValueError:
    # the master is not among the selected nodes
    pass
  else:
    targets.append(master)

  for target in targets:
    outcome = runner.Run(target, "root", cmdline)
    ToStdout("------------------------------------------------")
    ToStdout("node: %s", target)
    ToStdout("%s", outcome.output)
    ToStdout("return code = %s", outcome.exit_code)

  return 0
525

    
526

    
527
def VerifyCluster(opts, args):
  """Verify integrity of cluster, performing various test on nodes.

  The verify opcode returns a list of job IDs; these jobs are waited
  for and the exit code is derived from their outcome.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code (C{constants.EXIT_SUCCESS} only if
      every job succeeded and returned a single true result)

  """
  skip_checks = []

  if opts.skip_nplusone_mem:
    skip_checks.append(constants.VERIFY_NPLUSONE_MEM)

  cl = GetClient()

  op = opcodes.OpClusterVerify(verbose=opts.verbose,
                               error_codes=opts.error_codes,
                               debug_simulate_errors=opts.simulate_errors,
                               skip_checks=skip_checks,
                               ignore_errors=opts.ignore_errors,
                               group_name=opts.nodegroup)
  result = SubmitOpCode(op, cl=cl, opts=opts)

  # Keep track of submitted jobs
  jex = JobExecutor(cl=cl, opts=opts)

  for (status, job_id) in result[constants.JOB_IDS_KEY]:
    jex.AddJobId(None, status, job_id)

  # Count the failures with a plain loop; unlike unpacking the result of
  # zip(*...), this also works when the list of results is empty
  bad_jobs = 0
  bad_results = 0
  for (job_success, op_results) in jex.GetResults():
    if not job_success:
      bad_jobs += 1
    # a good job carries exactly one opcode result and that result is true
    if not (len(op_results) == 1 and op_results[0]):
      bad_results += 1

  if bad_jobs == 0 and bad_results == 0:
    rcode = constants.EXIT_SUCCESS
  else:
    rcode = constants.EXIT_FAILURE
    if bad_jobs > 0:
      ToStdout("%s job(s) failed while verifying the cluster.", bad_jobs)

  return rcode
578

    
579

    
580
def VerifyDisks(opts, args):
581
  """Verify integrity of cluster disks.
582

583
  @param opts: the command line options selected by the user
584
  @type args: list
585
  @param args: should be an empty list
586
  @rtype: int
587
  @return: the desired exit code
588

589
  """
590
  cl = GetClient()
591

    
592
  op = opcodes.OpClusterVerifyDisks()
593

    
594
  result = SubmitOpCode(op, cl=cl, opts=opts)
595

    
596
  # Keep track of submitted jobs
597
  jex = JobExecutor(cl=cl, opts=opts)
598

    
599
  for (status, job_id) in result[constants.JOB_IDS_KEY]:
600
    jex.AddJobId(None, status, job_id)
601

    
602
  retcode = constants.EXIT_SUCCESS
603

    
604
  for (status, result) in jex.GetResults():
605
    if not status:
606
      ToStdout("Job failed: %s", result)
607
      continue
608

    
609
    ((bad_nodes, instances, missing), ) = result
610

    
611
    for node, text in bad_nodes.items():
612
      ToStdout("Error gathering data on node %s: %s",
613
               node, utils.SafeEncode(text[-400:]))
614
      retcode = constants.EXIT_FAILURE
615
      ToStdout("You need to fix these nodes first before fixing instances")
616

    
617
    for iname in instances:
618
      if iname in missing:
619
        continue
620
      op = opcodes.OpInstanceActivateDisks(instance_name=iname)
621
      try:
622
        ToStdout("Activating disks for instance '%s'", iname)
623
        SubmitOpCode(op, opts=opts, cl=cl)
624
      except errors.GenericError, err:
625
        nret, msg = FormatError(err)
626
        retcode |= nret
627
        ToStderr("Error activating disks for instance %s: %s", iname, msg)
628

    
629
    if missing:
630
      for iname, ival in missing.iteritems():
631
        all_missing = compat.all(x[0] in bad_nodes for x in ival)
632
        if all_missing:
633
          ToStdout("Instance %s cannot be verified as it lives on"
634
                   " broken nodes", iname)
635
        else:
636
          ToStdout("Instance %s has missing logical volumes:", iname)
637
          ival.sort()
638
          for node, vol in ival:
639
            if node in bad_nodes:
640
              ToStdout("\tbroken node %s /dev/%s", node, vol)
641
            else:
642
              ToStdout("\t%s /dev/%s", node, vol)
643

    
644
      ToStdout("You need to replace or recreate disks for all the above"
645
               " instances if this message persists after fixing broken nodes.")
646
      retcode = constants.EXIT_FAILURE
647

    
648
  return retcode
649

    
650

    
651
def RepairDiskSizes(opts, args):
  """Verify sizes of cluster disks.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: optional list of instances to restrict check to
  @rtype: int
  @return: the desired exit code

  """
  op = opcodes.OpClusterRepairDiskSizes(instances=args)
  SubmitOpCode(op, opts=opts)
  # return the documented integer instead of falling off the end
  return 0
663

    
664

    
665
@UsesRPC
def MasterFailover(opts, args):
  """Failover the master node.

  This command, when run on a non-master node, will cause the current
  master to cease being master, and the non-master to become new
  master.

  When the no-voting option is set, the user has to confirm first.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code (1 if the user declines, otherwise the
      value returned by C{bootstrap.MasterFailover})

  """
  warning = ("This will perform the failover even if most other nodes"
             " are down, or if this node is outdated. This is dangerous"
             " as it can lead to a non-consistent cluster. Check the"
             " gnt-cluster(8) man page before proceeding. Continue?")

  # the question is only asked when voting is to be skipped
  if opts.no_voting and not AskUser(warning):
    return 1

  return bootstrap.MasterFailover(no_voting=opts.no_voting)
689

    
690

    
691
def MasterPing(opts, args):
  """Checks if the master is alive.

  The check consists of obtaining a client and running a cluster info
  query; any exception raised on the way counts as "not alive".

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code (0 if the query worked, 1 otherwise)

  """
  try:
    GetClient().QueryClusterInfo()
  except Exception: # pylint: disable=W0703
    return 1
  else:
    return 0
707

    
708

    
709
def SearchTags(opts, args):
  """Searches the tags on all the cluster.

  Every match is printed as a "path tag" line, in sorted order.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the tag pattern
  @rtype: int
  @return: the desired exit code (1 if nothing matched, 0 otherwise)

  """
  op = opcodes.OpTagsSearch(pattern=args[0])
  result = SubmitOpCode(op, opts=opts)
  if not result:
    return 1

  for path, tag in sorted(result):
    ToStdout("%s %s", path, tag)

  # return the documented integer on the success path as well
  return 0
727

    
728

    
729
def _ReadAndVerifyCert(cert_filename, verify_private_key=False):
730
  """Reads and verifies an X509 certificate.
731

732
  @type cert_filename: string
733
  @param cert_filename: the path of the file containing the certificate to
734
                        verify encoded in PEM format
735
  @type verify_private_key: bool
736
  @param verify_private_key: whether to verify the private key in addition to
737
                             the public certificate
738
  @rtype: string
739
  @return: a string containing the PEM-encoded certificate.
740

741
  """
742
  try:
743
    pem = utils.ReadFile(cert_filename)
744
  except IOError, err:
745
    raise errors.X509CertError(cert_filename,
746
                               "Unable to read certificate: %s" % str(err))
747

    
748
  try:
749
    OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM, pem)
750
  except Exception, err:
751
    raise errors.X509CertError(cert_filename,
752
                               "Unable to load certificate: %s" % str(err))
753

    
754
  if verify_private_key:
755
    try:
756
      OpenSSL.crypto.load_privatekey(OpenSSL.crypto.FILETYPE_PEM, pem)
757
    except Exception, err:
758
      raise errors.X509CertError(cert_filename,
759
                                 "Unable to load private key: %s" % str(err))
760

    
761
  return pem
762

    
763

    
764
def _RenewCrypto(new_cluster_cert, new_rapi_cert, #pylint: disable=R0911
                 rapi_cert_filename, new_spice_cert, spice_cert_filename,
                 spice_cacert_filename, new_confd_hmac_key, new_cds,
                 cds_filename, force):
  """Renews cluster certificates, keys and secrets.

  The option combination is checked and all user-supplied files are
  read before the user is asked for confirmation; only then are the new
  files generated and copied to the non-master nodes.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type rapi_cert_filename: string
  @param rapi_cert_filename: Path to file containing new RAPI certificate
  @type new_spice_cert: bool
  @param new_spice_cert: Whether to generate a new SPICE certificate
  @type spice_cert_filename: string
  @param spice_cert_filename: Path to file containing new SPICE certificate
  @type spice_cacert_filename: string
  @param spice_cacert_filename: Path to file containing the certificate of the
                                CA that signed the SPICE certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type new_cds: bool
  @param new_cds: Whether to generate a new cluster domain secret
  @type cds_filename: string
  @param cds_filename: Path to file containing new cluster domain secret
  @type force: bool
  @param force: Whether to skip asking the user for confirmation
  @rtype: int
  @return: the desired exit code (1 if the options conflict, a file
      cannot be loaded or the user declines; 0 otherwise)

  """
  # generating a new item and loading it from a file exclude each other
  if new_rapi_cert and rapi_cert_filename:
    ToStderr("Only one of the --new-rapi-certificate and --rapi-certificate"
             " options can be specified at the same time.")
    return 1

  if new_cds and cds_filename:
    ToStderr("Only one of the --new-cluster-domain-secret and"
             " --cluster-domain-secret options can be specified at"
             " the same time.")
    return 1

  if new_spice_cert and (spice_cert_filename or spice_cacert_filename):
    ToStderr("When using --new-spice-certificate, the --spice-certificate"
             " and --spice-ca-certificate must not be used.")
    return 1

  # true if exactly one of the two SPICE file names was given
  if bool(spice_cacert_filename) ^ bool(spice_cert_filename):
    ToStderr("Both --spice-certificate and --spice-ca-certificate must be"
             " specified.")
    return 1

  rapi_cert_pem, spice_cert_pem, spice_cacert_pem = (None, None, None)
  try:
    # the RAPI and SPICE certificate files must also contain a loadable
    # private key (second argument); the SPICE CA certificate need not
    if rapi_cert_filename:
      rapi_cert_pem = _ReadAndVerifyCert(rapi_cert_filename, True)
    if spice_cert_filename:
      spice_cert_pem = _ReadAndVerifyCert(spice_cert_filename, True)
      spice_cacert_pem = _ReadAndVerifyCert(spice_cacert_filename)
  except errors.X509CertError, err:
    # the exception carries the file name and the message, in this order
    ToStderr("Unable to load X509 certificate from %s: %s", err[0], err[1])
    return 1

  if cds_filename:
    try:
      cds = utils.ReadFile(cds_filename)
    except Exception, err: # pylint: disable=W0703
      ToStderr("Can't load new cluster domain secret from %s: %s" %
               (cds_filename, str(err)))
      return 1
  else:
    cds = None

  if not force:
    usertext = ("This requires all daemons on all nodes to be restarted and"
                " may take some time. Continue?")
    if not AskUser(usertext):
      return 1

  def _RenewCryptoInner(ctx):
    # Callback doing the actual work; it uses ctx.feedback_fn,
    # ctx.nonmaster_nodes and ctx.ssh
    ctx.feedback_fn("Updating certificates and keys")
    bootstrap.GenerateClusterCrypto(new_cluster_cert,
                                    new_rapi_cert,
                                    new_spice_cert,
                                    new_confd_hmac_key,
                                    new_cds,
                                    rapi_cert_pem=rapi_cert_pem,
                                    spice_cert_pem=spice_cert_pem,
                                    spice_cacert_pem=spice_cacert_pem,
                                    cds=cds)

    # only the files that were regenerated or replaced are copied
    files_to_copy = []

    if new_cluster_cert:
      files_to_copy.append(constants.NODED_CERT_FILE)

    if new_rapi_cert or rapi_cert_pem:
      files_to_copy.append(constants.RAPI_CERT_FILE)

    # the SPICE certificate and its CA certificate are copied together
    if new_spice_cert or spice_cert_pem:
      files_to_copy.append(constants.SPICE_CERT_FILE)
      files_to_copy.append(constants.SPICE_CACERT_FILE)

    if new_confd_hmac_key:
      files_to_copy.append(constants.CONFD_HMAC_KEY)

    if new_cds or cds:
      files_to_copy.append(constants.CLUSTER_DOMAIN_SECRET_FILE)

    if files_to_copy:
      for node_name in ctx.nonmaster_nodes:
        ctx.feedback_fn("Copying %s to %s" %
                        (", ".join(files_to_copy), node_name))
        for file_name in files_to_copy:
          # NOTE(review): the result of the copy is not checked here
          ctx.ssh.CopyFileToNode(node_name, file_name)

  # presumably runs the callback while the cluster daemons are stopped
  # (going by the helper name and the confirmation text) - TODO confirm
  RunWhileClusterStopped(ToStdout, _RenewCryptoInner)

  ToStdout("All requested certificates and keys have been replaced."
           " Running \"gnt-cluster verify\" now is recommended.")

  return 0
884

    
885

    
886
def RenewCrypto(opts, args):
  """Renews cluster certificates, keys and secrets.

  Command-line entry point: the relevant options are read from C{opts} and
  handed positionally to L{_RenewCrypto}.

  @param opts: the command line options selected by the user
  @param args: not used by this function
  @return: the value returned by L{_RenewCrypto}

  """
  # The tuple order is the positional order expected by _RenewCrypto
  crypto_args = (
    opts.new_cluster_cert,
    opts.new_rapi_cert,
    opts.rapi_cert,
    opts.new_spice_cert,
    opts.spice_cert,
    opts.spice_cacert,
    opts.new_confd_hmac_key,
    opts.new_cluster_domain_secret,
    opts.cluster_domain_secret,
    opts.force,
    )

  return _RenewCrypto(*crypto_args)
900

    
901

    
902
def SetClusterParams(opts, args):
  """Modify the cluster.

  Validates and normalises the command line options, builds an
  L{opcodes.OpClusterSetParams} opcode out of them and submits it.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code (1 if no parameter was given or if
      conflicting/invalid options were passed, 0 after the opcode has been
      submitted)

  """
  # Refuse to run when not a single modification has been requested
  if not (not opts.lvm_storage or opts.vg_name or
          not opts.drbd_storage or opts.drbd_helper or
          opts.enabled_hypervisors or opts.hvparams or
          opts.beparams or opts.nicparams or
          opts.ndparams or opts.diskparams or
          opts.candidate_pool_size is not None or
          opts.uid_pool is not None or
          opts.maintain_node_health is not None or
          opts.add_uids is not None or
          opts.remove_uids is not None or
          opts.default_iallocator is not None or
          opts.reserved_lvs is not None or
          opts.master_netdev is not None or
          opts.master_netmask is not None or
          opts.use_external_mip_script is not None or
          opts.prealloc_wipe_disks is not None or
          opts.hv_state or
          opts.disk_state or
          opts.ispecs_mem_size is not None or
          opts.ispecs_cpu_count is not None or
          opts.ispecs_disk_count is not None or
          opts.ispecs_disk_size is not None or
          opts.ispecs_nic_count is not None):
    ToStderr("Please give at least one of the parameters.")
    return 1

  vg_name = opts.vg_name
  if not opts.lvm_storage and opts.vg_name:
    ToStderr("Options --no-lvm-storage and --vg-name conflict.")
    return 1

  # Disabling LVM storage is expressed as an empty volume group name
  if not opts.lvm_storage:
    vg_name = ""

  drbd_helper = opts.drbd_helper
  if not opts.drbd_storage and opts.drbd_helper:
    ToStderr("Options --no-drbd-storage and --drbd-usermode-helper conflict.")
    return 1

  # Disabling DRBD storage is expressed as an empty helper
  if not opts.drbd_storage:
    drbd_helper = ""

  hvlist = opts.enabled_hypervisors
  if hvlist is not None:
    hvlist = hvlist.split(",")

  # a list of (name, dict) we can pass directly to dict() (or [])
  hvparams = dict(opts.hvparams)
  for hv_params in hvparams.values():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)

  diskparams = dict(opts.diskparams)

  # The disk parameters (not the hypervisor ones) must be checked against the
  # disk template parameter types
  for dt_params in diskparams.values():
    utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)

  beparams = opts.beparams
  utils.ForceDictType(beparams, constants.BES_PARAMETER_COMPAT)

  nicparams = opts.nicparams
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  ndparams = opts.ndparams
  if ndparams is not None:
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)

  ipolicy = \
    objects.CreateIPolicyFromOpts(ispecs_mem_size=opts.ispecs_mem_size,
                                  ispecs_cpu_count=opts.ispecs_cpu_count,
                                  ispecs_disk_count=opts.ispecs_disk_count,
                                  ispecs_disk_size=opts.ispecs_disk_size,
                                  ispecs_nic_count=opts.ispecs_nic_count)
  for value in ipolicy.values():
    utils.ForceDictType(value, constants.ISPECS_PARAMETER_TYPES)

  mnh = opts.maintain_node_health

  uid_pool = opts.uid_pool
  if uid_pool is not None:
    uid_pool = uidpool.ParseUidPool(uid_pool)

  add_uids = opts.add_uids
  if add_uids is not None:
    add_uids = uidpool.ParseUidPool(add_uids)

  remove_uids = opts.remove_uids
  if remove_uids is not None:
    remove_uids = uidpool.ParseUidPool(remove_uids)

  # An empty string means "no reserved LVs", anything else is a
  # comma-separated list
  if opts.reserved_lvs is not None:
    if opts.reserved_lvs == "":
      opts.reserved_lvs = []
    else:
      opts.reserved_lvs = utils.UnescapeAndSplit(opts.reserved_lvs, sep=",")

  if opts.master_netmask is not None:
    try:
      opts.master_netmask = int(opts.master_netmask)
    except ValueError:
      ToStderr("The --master-netmask option expects an int parameter.")
      return 1

  ext_ip_script = opts.use_external_mip_script

  if opts.disk_state:
    disk_state = utils.FlatToDict(opts.disk_state)
  else:
    disk_state = {}

  hv_state = dict(opts.hv_state)

  op = opcodes.OpClusterSetParams(vg_name=vg_name,
                                  drbd_helper=drbd_helper,
                                  enabled_hypervisors=hvlist,
                                  hvparams=hvparams,
                                  os_hvp=None,
                                  beparams=beparams,
                                  nicparams=nicparams,
                                  ndparams=ndparams,
                                  diskparams=diskparams,
                                  ipolicy=ipolicy,
                                  candidate_pool_size=opts.candidate_pool_size,
                                  maintain_node_health=mnh,
                                  uid_pool=uid_pool,
                                  add_uids=add_uids,
                                  remove_uids=remove_uids,
                                  default_iallocator=opts.default_iallocator,
                                  prealloc_wipe_disks=opts.prealloc_wipe_disks,
                                  master_netdev=opts.master_netdev,
                                  master_netmask=opts.master_netmask,
                                  reserved_lvs=opts.reserved_lvs,
                                  use_external_mip_script=ext_ip_script,
                                  hv_state=hv_state,
                                  disk_state=disk_state,
                                  )
  SubmitOpCode(op, opts=opts)
  return 0
1049

    
1050

    
1051
def QueueOps(opts, args):
  """Queue operations.

  Sets, clears or displays the drain flag of the job queue, depending on the
  subcommand.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the subcommand
      (C{drain}, C{undrain} or C{info})
  @rtype: int
  @return: the desired exit code
  @raise errors.OpPrereqError: if the subcommand is not known

  """
  command = args[0]
  client = GetClient()

  if command == "info":
    result = client.QueryConfigValues(["drain_flag"])
    val = "unset"
    if result[0]:
      val = "set"
    ToStdout("The drain flag is %s" % val)
  elif command in ("drain", "undrain"):
    # The flag is set for "drain" and cleared for "undrain"
    client.SetQueueDrainFlag(command == "drain")
  else:
    raise errors.OpPrereqError("Command '%s' is not valid." % command,
                               errors.ECODE_INVAL)

  return 0
1078

    
1079

    
1080
def _ShowWatcherPause(until):
  """Prints whether the watcher is paused.

  @param until: timestamp until which the watcher is paused, or C{None};
      a timestamp lying in the past is reported as "not paused"

  """
  if until is not None and not until < time.time():
    ToStdout("The watcher is paused until %s.", time.ctime(until))
    return

  ToStdout("The watcher is not paused.")
1085

    
1086

    
1087
def WatcherOps(opts, args):
  """Watcher operations.

  Pauses the watcher, resumes it or shows its pause state, depending on the
  subcommand.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: the subcommand (C{pause}, C{continue} or C{info}); C{pause}
      needs the pause duration as second element
  @rtype: int
  @return: the desired exit code
  @raise errors.OpPrereqError: if the subcommand is not known or the pause
      duration is missing

  """
  command = args[0]
  client = GetClient()

  if command == "info":
    result = client.QueryConfigValues(["watcher_pause"])
    _ShowWatcherPause(result[0])

  elif command == "continue":
    # Passing None removes the pause
    client.SetWatcherPause(None)
    ToStdout("The watcher is no longer paused.")

  elif command == "pause":
    if len(args) < 2:
      raise errors.OpPrereqError("Missing pause duration", errors.ECODE_INVAL)

    # The pause is given as a duration, the client wants an end timestamp
    pause_end = time.time() + ParseTimespec(args[1])
    _ShowWatcherPause(client.SetWatcherPause(pause_end))

  else:
    raise errors.OpPrereqError("Command '%s' is not valid." % command,
                               errors.ECODE_INVAL)

  return 0
1120

    
1121

    
1122
def _OobPower(opts, node_list, power):
  """Puts the node in the list to desired power state.

  Submits a single out-of-band command opcode for all nodes and reports every
  node whose result status is not normal.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on
  @param power: True if they should be powered on, False otherwise
  @return: The success of the operation (none failed)

  """
  if not power:
    command = constants.OOB_POWER_OFF
  else:
    command = constants.OOB_POWER_ON

  oob_op = opcodes.OpOobCommand(node_names=node_list,
                                command=command,
                                ignore_status=True,
                                timeout=opts.oob_timeout,
                                power_delay=opts.power_delay)

  all_ok = True
  # Each entry is a pair of (status, value) pairs: one for the node name and
  # one for the command result
  for ((_, node_name), (data_status, _)) in SubmitOpCode(oob_op, opts=opts):
    if data_status == constants.RS_NORMAL:
      continue

    assert data_status != constants.RS_UNAVAIL
    all_ok = False
    ToStderr("There was a problem changing power for %s, please investigate",
             node_name)

  return all_ok
1157

    
1158

    
1159
def _InstanceStart(opts, inst_list, start):
  """Puts the instances in the list to desired state.

  One job per instance is queued and all results are collected before
  reporting.

  @param opts: The command line options selected by the user
  @param inst_list: The list of instances to operate on
  @param start: True if they should be started, False for shutdown
  @return: The success of the operation (none failed)

  """
  if not start:
    # Shutdowns additionally carry the timeout given on the command line
    opcls = compat.partial(opcodes.OpInstanceShutdown,
                           timeout=opts.shutdown_timeout)
    (text_submit, text_success, text_failed) = \
      ("shutdown", "stopped", "stopping")
  else:
    opcls = opcodes.OpInstanceStartup
    (text_submit, text_success, text_failed) = \
      ("startup", "started", "starting")

  jex = JobExecutor(opts=opts)

  for inst in inst_list:
    ToStdout("Submit %s of instance %s", text_submit, inst)
    jex.QueueJob(inst, opcls(instance_name=inst))

  results = jex.GetResults()

  bad_cnt = 0
  for (success, _) in results:
    if not success:
      bad_cnt += 1

  if bad_cnt != 0:
    ToStderr("There were errors while %s instances:\n"
             "%d error(s) out of %d instance(s)", text_failed, bad_cnt,
             len(results))
    return False

  ToStdout("All instances have been %s successfully", text_success)
  return True
1195

    
1196

    
1197
class _RunWhenNodesReachableHelper:
  """Helper class to make shared internal state sharing easier.

  Instances are callable, and L{Wait} is the matching wait function;
  L{_RunWhenNodesReachable} passes both to L{utils.Retry}.

  @ivar success: Indicates if all action_cb calls were successful
  @ivar down: Set of nodes not yet found to be reachable
  @ivar up: Set of nodes found to be reachable

  """
  def __init__(self, node_list, action_cb, node2ip, port, feedback_fn,
               _ping_fn=netutils.TcpPing, _sleep_fn=time.sleep):
    """Init the object.

    @param node_list: The list of nodes to be reachable
    @param action_cb: Callback called when a new host is reachable
    @type node2ip: dict
    @param node2ip: Node to ip mapping
    @param port: The port to use for the TCP ping
    @param feedback_fn: The function used for feedback
    @param _ping_fn: Function to check reachabilty (for unittest use only)
    @param _sleep_fn: Function to sleep (for unittest use only)

    """
    # Every node starts out as unreachable
    self.down = set(node_list)
    self.up = set()
    self.success = True

    self.node2ip = node2ip
    self.port = port
    self.action_cb = action_cb
    self.feedback_fn = feedback_fn

    self._ping_fn = _ping_fn
    self._sleep_fn = _sleep_fn

  def __call__(self):
    """When called we run action_cb.

    The callback receives the set of reachable nodes; a false return value
    is remembered in C{self.success}.

    @raises utils.RetryAgain: When there are still down nodes
    @return: C{self.success} once no node is down anymore

    """
    if not self.action_cb(self.up):
      self.success = False

    if not self.down:
      return self.success

    raise utils.RetryAgain()

  def Wait(self, secs):
    """Checks if a host is up or waits remaining seconds.

    @param secs: The secs remaining

    """
    started = time.time()

    for node in self.down:
      reachable = self._ping_fn(self.node2ip[node], self.port,
                                timeout=_EPO_PING_TIMEOUT,
                                live_port_needed=True)
      if not reachable:
        continue

      self.feedback_fn("Node %s became available" % node)
      self.up.add(node)
      self.down.difference_update(self.up)
      # If we have a node available there is the possibility to run the
      # action callback successfully, therefore we don't wait and return
      return

    # No node answered: sleep for whatever is left of the requested time
    self._sleep_fn(max(0.0, started + secs - time.time()))
1259

    
1260

    
1261
def _RunWhenNodesReachable(node_list, action_cb, interval):
  """Run action_cb when nodes become reachable.

  The node names are resolved using the address family matching the cluster's
  primary IP version and the nodes are then pinged on the node daemon port
  until all of them are reachable or L{_EPO_REACHABLE_TIMEOUT} is exceeded.

  @param node_list: The list of nodes to be reachable
  @param action_cb: Callback called when a new host is reachable
  @param interval: The earliest time to retry
  @return: The result of the retry loop, or False if the timeout was exceeded

  """
  client = GetClient()
  cluster_info = client.QueryClusterInfo()
  if cluster_info["primary_ip_version"] == constants.IP4_VERSION:
    family = netutils.IPAddress.family
  else:
    family = netutils.IP6Address.family

  node2ip = dict((node, netutils.GetHostname(node, family=family).ip)
                 for node in node_list)

  port = netutils.GetDaemonPort(constants.NODED)
  helper = _RunWhenNodesReachableHelper(node_list, action_cb, node2ip, port,
                                        ToStdout)

  try:
    return utils.Retry(helper, interval, _EPO_REACHABLE_TIMEOUT,
                       wait_fn=helper.Wait)
  except utils.RetryTimeout:
    # One bullet per line; the separator needs its own newline, otherwise all
    # names after the first end up on the same line
    ToStderr("Time exceeded while waiting for nodes to become reachable"
             " again:\n  - %s", "\n  - ".join(helper.down))
    return False
1290

    
1291

    
1292
def _MaybeInstanceStartup(opts, inst_map, nodes_online,
                          _instance_start_fn=_InstanceStart):
  """Start the instances conditional based on node_states.

  Instances selected for startup are removed from C{inst_map}, so they are
  not started again by a later call.

  @param opts: The command line options selected by the user
  @param inst_map: A dict of inst -> nodes mapping
  @param nodes_online: A list of nodes online
  @param _instance_start_fn: Callback to start instances (unittest use only)
  @return: Success of the operation on all instances

  """
  # An instance is ready once none of its nodes is missing from nodes_online
  ready = [inst for (inst, nodes) in inst_map.items()
           if not (nodes - nodes_online)]

  for inst in ready:
    del inst_map[inst]

  if not ready:
    return True

  return _instance_start_fn(opts, ready, True)
1316

    
1317

    
1318
def _EpoOn(opts, full_node_list, node_list, inst_map):
  """Does the actual power on.

  Powers on the nodes supporting out-of-band management, then waits for all
  nodes to become reachable, starting instances as soon as all of their nodes
  are back.

  @param opts: The command line options selected by the user
  @param full_node_list: All nodes to operate on (includes nodes not supporting
                         OOB)
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  # The nodes have to be powered *on* here, hence power=True; a failure is
  # only reported, as nodes might still come up (or be started manually)
  if node_list and not _OobPower(opts, node_list, True):
    ToStderr("Not all nodes seem to get back up, investigate and start"
             " manually if needed")

  # Wait for the nodes to be back up; the callback works on a copy of the
  # instance map, as it removes the instances it has started
  action_cb = compat.partial(_MaybeInstanceStartup, opts, dict(inst_map))

  ToStdout("Waiting until all nodes are available again")
  if not _RunWhenNodesReachable(full_node_list, action_cb, _EPO_PING_INTERVAL):
    ToStderr("Please investigate and start stopped instances manually")
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS
1342

    
1343

    
1344
def _EpoOff(opts, node_list, inst_map):
  """Does the actual power off.

  All instances are shut down first; the nodes are only powered off if that
  succeeded.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  if not _InstanceStart(opts, inst_map.keys(), False):
    ToStderr("Please investigate and stop instances manually before continuing")
    return constants.EXIT_FAILURE

  # Without nodes to power off there is nothing left to do
  if not node_list or _OobPower(opts, node_list, False):
    return constants.EXIT_SUCCESS

  return constants.EXIT_FAILURE
1364

    
1365

    
1366
def Epo(opts, args):
  """EPO operations.

  Queries the selected nodes, works out which of them need their power state
  changed and which instances are affected, asks for confirmation (unless
  forced) and then runs the power-on or power-off procedure.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: the nodes (or, with --groups, the node groups) to operate on;
      must be empty when --all is used
  @rtype: int
  @return: the desired exit code

  """
  if opts.groups and opts.show_all:
    ToStderr("Only one of --groups or --all are allowed")
    return constants.EXIT_FAILURE
  elif args and opts.show_all:
    ToStderr("Arguments in combination with --all are not allowed")
    return constants.EXIT_FAILURE

  client = GetClient()

  if opts.groups:
    # NOTE(review): this relies on the rows returned by QueryGroups chaining
    # into something QueryNodes accepts as names - confirm the result shape
    node_query_list = itertools.chain(*client.QueryGroups(names=args,
                                                          fields=["node_list"],
                                                          use_locking=False))
  else:
    node_query_list = args

  result = client.QueryNodes(names=node_query_list,
                             fields=["name", "master", "pinst_list",
                                     "sinst_list", "powered", "offline"],
                             use_locking=False)

  # The normalized node names are taken from the query result instead of
  # being written back into node_query_list: the latter is not indexable when
  # --groups is used and is empty when --all is used
  all_nodes = []
  node_list = []
  inst_map = {}
  for (node, master, pinsts, sinsts, powered, offline) in result:
    all_nodes.append(node)

    if not offline:
      for inst in (pinsts + sinsts):
        if inst in inst_map:
          if not master:
            inst_map[inst].add(node)
        elif master:
          # The master is never waited for, hence it is not recorded
          inst_map[inst] = set()
        else:
          inst_map[inst] = set([node])

    if master and opts.on:
      # We ignore the master for turning on the machines, in fact we are
      # already operating on the master at this point :)
      continue
    elif master and not opts.show_all:
      ToStderr("%s is the master node, please do a master-failover to another"
               " node not affected by the EPO or use --all if you intend to"
               " shutdown the whole cluster", node)
      return constants.EXIT_FAILURE
    elif powered is None:
      ToStdout("Node %s does not support out-of-band handling, it can not be"
               " handled in a fully automated manner", node)
    elif powered == opts.on:
      ToStdout("Node %s is already in desired power state, skipping", node)
    elif not offline or (offline and powered):
      node_list.append(node)

  if not opts.force and not ConfirmOperation(all_nodes, "nodes", "epo"):
    return constants.EXIT_FAILURE

  if opts.on:
    return _EpoOn(opts, all_nodes, node_list, inst_map)
  else:
    return _EpoOff(opts, node_list, inst_map)
1437

    
1438
#: options appended to the option lists of the "init" and "modify" commands
INSTANCE_POLICY_OPTS = [
  SPECS_CPU_COUNT_OPT, SPECS_DISK_COUNT_OPT, SPECS_DISK_SIZE_OPT,
  SPECS_MEM_SIZE_OPT, SPECS_NIC_COUNT_OPT,
  ]
1445

    
1446
#: dictionary of sub-commands; each value is a tuple of (function, argument
#: definitions, options, usage text, description)
commands = {
  "init": (
    InitCluster,
    [ArgHost(min=1, max=1)],
    [
      BACKEND_OPT, CP_SIZE_OPT, ENABLED_HV_OPT, GLOBAL_FILEDIR_OPT,
      HVLIST_OPT, MAC_PREFIX_OPT, MASTER_NETDEV_OPT, MASTER_NETMASK_OPT,
      NIC_PARAMS_OPT, NOLVM_STORAGE_OPT, NOMODIFY_ETCHOSTS_OPT,
      NOMODIFY_SSH_SETUP_OPT, SECONDARY_IP_OPT, VG_NAME_OPT,
      MAINTAIN_NODE_HEALTH_OPT, UIDPOOL_OPT, DRBD_HELPER_OPT,
      NODRBD_STORAGE_OPT, DEFAULT_IALLOCATOR_OPT, PRIMARY_IP_VERSION_OPT,
      PREALLOC_WIPE_DISKS_OPT, NODE_PARAMS_OPT, GLOBAL_SHARED_FILEDIR_OPT,
      USE_EXTERNAL_MIP_SCRIPT, DISK_PARAMS_OPT,
    ] + INSTANCE_POLICY_OPTS,
    "[opts...] <cluster_name>",
    "Initialises a new cluster configuration",
    ),
  "destroy": (
    DestroyCluster,
    ARGS_NONE,
    [YES_DOIT_OPT],
    "",
    "Destroy cluster",
    ),
  "rename": (
    RenameCluster,
    [ArgHost(min=1, max=1)],
    [FORCE_OPT, DRY_RUN_OPT],
    "<new_name>",
    "Renames the cluster",
    ),
  "redist-conf": (
    RedistributeConfig,
    ARGS_NONE,
    [SUBMIT_OPT, DRY_RUN_OPT, PRIORITY_OPT],
    "",
    ("Forces a push of the configuration file and ssconf files"
     " to the nodes in the cluster"),
    ),
  "verify": (
    VerifyCluster,
    ARGS_NONE,
    [
      VERBOSE_OPT, DEBUG_SIMERR_OPT, ERROR_CODES_OPT, NONPLUS1_OPT,
      DRY_RUN_OPT, PRIORITY_OPT, NODEGROUP_OPT, IGNORE_ERRORS_OPT,
    ],
    "",
    "Does a check on the cluster configuration",
    ),
  "verify-disks": (
    VerifyDisks,
    ARGS_NONE,
    [PRIORITY_OPT],
    "",
    "Does a check on the cluster disk status",
    ),
  "repair-disk-sizes": (
    RepairDiskSizes,
    ARGS_MANY_INSTANCES,
    [DRY_RUN_OPT, PRIORITY_OPT],
    "[instance...]",
    "Updates mismatches in recorded disk sizes",
    ),
  "master-failover": (
    MasterFailover,
    ARGS_NONE,
    [NOVOTING_OPT],
    "",
    "Makes the current node the master",
    ),
  "master-ping": (
    MasterPing,
    ARGS_NONE,
    [],
    "",
    "Checks if the master is alive",
    ),
  "version": (
    ShowClusterVersion,
    ARGS_NONE,
    [],
    "",
    "Shows the cluster version",
    ),
  "getmaster": (
    ShowClusterMaster,
    ARGS_NONE,
    [],
    "",
    "Shows the cluster master",
    ),
  "copyfile": (
    ClusterCopyFile,
    [ArgFile(min=1, max=1)],
    [NODE_LIST_OPT, USE_REPL_NET_OPT, NODEGROUP_OPT],
    "[-n node...] <filename>",
    "Copies a file to all (or only some) nodes",
    ),
  "command": (
    RunClusterCommand,
    [ArgCommand(min=1)],
    [NODE_LIST_OPT, NODEGROUP_OPT],
    "[-n node...] <command>",
    "Runs a command on all (or only some) nodes",
    ),
  "info": (
    ShowClusterConfig,
    ARGS_NONE,
    [ROMAN_OPT],
    "[--roman]",
    "Show cluster configuration",
    ),
  "list-tags": (
    ListTags,
    ARGS_NONE,
    [],
    "",
    "List the tags of the cluster",
    ),
  "add-tags": (
    AddTags,
    [ArgUnknown()],
    [TAG_SRC_OPT, PRIORITY_OPT],
    "tag...",
    "Add tags to the cluster",
    ),
  "remove-tags": (
    RemoveTags,
    [ArgUnknown()],
    [TAG_SRC_OPT, PRIORITY_OPT],
    "tag...",
    "Remove tags from the cluster",
    ),
  "search-tags": (
    SearchTags,
    [ArgUnknown(min=1, max=1)],
    [PRIORITY_OPT],
    "",
    ("Searches the tags on all objects on"
     " the cluster for a given pattern (regex)"),
    ),
  "queue": (
    QueueOps,
    [ArgChoice(min=1, max=1, choices=["drain", "undrain", "info"])],
    [],
    "drain|undrain|info",
    "Change queue properties",
    ),
  "watcher": (
    WatcherOps,
    [
      ArgChoice(min=1, max=1, choices=["pause", "continue", "info"]),
      ArgSuggest(min=0, max=1, choices=["30m", "1h", "4h"]),
    ],
    [],
    "{pause <timespec>|continue|info}",
    "Change watcher properties",
    ),
  "modify": (
    SetClusterParams,
    ARGS_NONE,
    [
      BACKEND_OPT, CP_SIZE_OPT, ENABLED_HV_OPT, HVLIST_OPT,
      MASTER_NETDEV_OPT, MASTER_NETMASK_OPT, NIC_PARAMS_OPT,
      NOLVM_STORAGE_OPT, VG_NAME_OPT, MAINTAIN_NODE_HEALTH_OPT, UIDPOOL_OPT,
      ADD_UIDS_OPT, REMOVE_UIDS_OPT, DRBD_HELPER_OPT, NODRBD_STORAGE_OPT,
      DEFAULT_IALLOCATOR_OPT, RESERVED_LVS_OPT, DRY_RUN_OPT, PRIORITY_OPT,
      PREALLOC_WIPE_DISKS_OPT, NODE_PARAMS_OPT, USE_EXTERNAL_MIP_SCRIPT,
      DISK_PARAMS_OPT, HV_STATE_OPT, DISK_STATE_OPT,
    ] + INSTANCE_POLICY_OPTS,
    "[opts...]",
    "Alters the parameters of the cluster",
    ),
  "renew-crypto": (
    RenewCrypto,
    ARGS_NONE,
    [
      NEW_CLUSTER_CERT_OPT, NEW_RAPI_CERT_OPT, RAPI_CERT_OPT,
      NEW_CONFD_HMAC_KEY_OPT, FORCE_OPT,
      NEW_CLUSTER_DOMAIN_SECRET_OPT, CLUSTER_DOMAIN_SECRET_OPT,
      NEW_SPICE_CERT_OPT, SPICE_CERT_OPT, SPICE_CACERT_OPT,
    ],
    "[opts...]",
    "Renews cluster certificates, keys and secrets",
    ),
  "epo": (
    Epo,
    [ArgUnknown()],
    [
      FORCE_OPT, ON_OPT, GROUPS_OPT, ALL_OPT, OOB_TIMEOUT_OPT,
      SHUTDOWN_TIMEOUT_OPT, POWER_DELAY_OPT,
    ],
    "[opts...] [args]",
    "Performs an emergency power-off on given args",
    ),
  "activate-master-ip": (
    ActivateMasterIp,
    ARGS_NONE,
    [],
    "",
    "Activates the master IP",
    ),
  "deactivate-master-ip": (
    DeactivateMasterIp,
    ARGS_NONE,
    [CONFIRM_OPT],
    "",
    "Deactivates the master IP",
    ),
  }
1558

    
1559

    
1560
#: maps alternative command names to the name used as key in L{commands}
aliases = {"masterfailover": "master-failover"}
1564

    
1565

    
1566
def Main():
  """Runs the command line handling for this module's commands.

  @return: the value returned by L{GenericMain}

  """
  # Tag-related commands of this tool operate on cluster tags
  overrides = {"tag_type": constants.TAG_CLUSTER}
  return GenericMain(commands, override=overrides, aliases=aliases)