Revision c51ffffe lib/client/gnt_cluster.py

b/lib/client/gnt_cluster.py
43 43
from ganeti import uidpool
44 44
from ganeti import compat
45 45
from ganeti import netutils
46
from ganeti import ssconf
46 47
from ganeti import pathutils
48
from ganeti import qlang
47 49

  
48 50

  
49 51
ON_OPT = cli_option("--on", default=False,
......
64 66
                                " is drained",
65 67
                                default=False, action="store_true")
66 68

  
69
# Options of "gnt-cluster upgrade"; the command insists that precisely one
# of --to and --resume is given.
TO_OPT = cli_option(
  "--to", default=None, type="string",
  help="The Ganeti version to upgrade to")

RESUME_OPT = cli_option(
  "--resume", default=False, action="store_true",
  help="Resume any pending Ganeti upgrades")
74

  
67 75
_EPO_PING_INTERVAL = 30 # 30 seconds between pings
68 76
_EPO_PING_TIMEOUT = 1 # 1 second
69 77
_EPO_REACHABLE_TIMEOUT = 15 * 60 # 15 minutes
......
1617 1625
  ToStdout(_GetCreateCommand(result))
1618 1626

  
1619 1627

  
1628
def _RunCommandAndReport(cmd):
  """Execute a command, printing its details to stderr only on failure.

  @param cmd: the command to execute
  @type cmd: list
  @rtype: bool
  @return: True if the command succeeded, False if the execution failed.

  """
  outcome = utils.RunCmd(cmd)
  if not outcome.failed:
    return True

  # Only a failing command is worth reporting; successes stay silent.
  ToStderr("Command %s failed: %s; Output %s" %
           (cmd, outcome.fail_reason, outcome.output))
  return False
1643

  
1644

  
1645
def _VerifyCommand(cmd):
  """Run a command over ssh on every online node and collect the failures.

  The node list, master name and cluster name are read through ssconf and
  the command is run via ssh rather than through a luxi client, as this
  function is intended to run during upgrades and therefore has to keep
  working while all Ganeti daemons are down.

  @param cmd: the command to execute
  @type cmd: list
  @rtype: list
  @return: the list of node names that are online where
      the command failed.

  """
  quoted = utils.text.ShellQuoteArgs([str(arg) for arg in cmd])

  online = ssconf.SimpleStore().GetOnlineNodeList()
  master = ssconf.SimpleStore().GetMasterNode()
  runner = ssh.SshRunner(cluster_name=ssconf.SimpleStore().GetClusterName())

  # The master node, if it is online, is handled after all other nodes.
  if master in online:
    online.remove(master)
    online.append(master)

  return [node for node in online
          if runner.Run(node, constants.SSH_LOGIN_USER,
                        quoted).exit_code != 0]
1679

  
1680

  
1681
def _VerifyVersionInstalled(versionstring):
  """Check that a ganeti version is installed on all online nodes.

  A version counts as installed on a node if the directory named after it
  exists below pathutils.PKGLIBDIR there. Nothing is printed if this holds
  everywhere; otherwise the offending nodes are reported on stderr.

  @param versionstring: the version to check for
  @type versionstring: string
  @rtype: bool
  @return: True, if the version is installed on all online nodes

  """
  version_dir = os.path.join(pathutils.PKGLIBDIR, versionstring)
  missing = _VerifyCommand(["test", "-d", version_dir])
  if not missing:
    return True

  ToStderr("Ganeti version %s not installed on nodes %s"
           % (versionstring, ", ".join(missing)))
  return False
1701

  
1702

  
1703
def _GetRunning():
  """Count the jobs that are currently in running state.

  @rtype: int
  @return: the number of jobs still running

  """
  client = GetClient()
  running_only = qlang.MakeSimpleFilter(
    "status", frozenset([constants.JOB_STATUS_RUNNING]))
  # No fields are requested; only the number of matching rows matters.
  response = client.Query(constants.QR_JOB, [], running_only)
  return len(response.data)
1714

  
1715

  
1716
def _SetGanetiVersion(versionstring):
  """Set the active version of ganeti to the given versionstring

  On all online nodes, the "ganeti/lib" and "ganeti/share" entries below
  pathutils.SYSCONFDIR are replaced by symbolic links to the directories of
  the given version below pathutils.PKGLIBDIR and pathutils.SHAREDIR,
  respectively.

  @type versionstring: string
  @rtype: list
  @return: the list of nodes where the version change failed, without
      duplicates and sorted by name, so that messages built from it are
      stable between runs

  """
  links = [
    (os.path.join(pathutils.SYSCONFDIR, "ganeti/lib"),
     os.path.join(pathutils.PKGLIBDIR, versionstring)),
    (os.path.join(pathutils.SYSCONFDIR, "ganeti/share"),
     os.path.join(pathutils.SHAREDIR, versionstring)),
    ]

  failed = set()
  for (link, target) in links:
    # The old link is removed first; presumably so that "ln" does not
    # follow an existing symlink to a directory and create the new link
    # inside the old version's directory -- TODO confirm.
    failed.update(_VerifyCommand(["rm", "-f", link]))
    failed.update(_VerifyCommand(["ln", "-s", "-f", target, link]))

  return sorted(failed)
1736

  
1737

  
1738
def _ExecuteCommands(fns):
1739
  """Execute a list of functions, in reverse order.
1740

  
1741
  @type fns: list of functions.
1742
  @param fns: the functions to be executed.
1743

  
1744
  """
1745
  for fn in reversed(fns):
1746
    fn()
1747

  
1748

  
1749
# pylint: disable=R0911
def UpgradeGanetiCommand(opts, args):
  """Upgrade a cluster to a new ganeti version.

  The steps are: validate the requested version, drain the job queue and
  wait for running jobs, stop the daemons, back up the configuration,
  switch the version links on all nodes, up- or downgrade the
  configuration, and finally restart daemons, redistribute the
  configuration, undrain the queue and verify the cluster.

  Until the configuration has been changed, a failing step undoes the
  earlier steps via the collected rollback functions and aborts. After
  that, most failures are only reported as warnings and reflected in the
  exit code.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code; 0 if every step succeeded, 1 otherwise

  """
  if ((not opts.resume and opts.to is None)
      or (opts.resume and opts.to is not None)):
    ToStderr("Precisely one of the options --to and --resume"
             " has to be given")
    return 1

  if opts.resume:
    # TODO: implement
    ToStderr("The --resume mode is not yet implemented")
    return 1

  # Undo actions for the steps carried out so far; _ExecuteCommands runs
  # them last-added first.
  rollback = []

  versionstring = opts.to
  version = utils.version.ParseVersion(versionstring)
  if version is None:
    ToStderr("Could not parse version string %s" % versionstring)
    return 1

  msg = utils.version.UpgradeRange(version)
  if msg is not None:
    ToStderr("Cannot upgrade to %s: %s" % (versionstring, msg))
    return 1

  downgrade = utils.version.ShouldCfgdowngrade(version)

  if not _VerifyVersionInstalled(versionstring):
    return 1

  # TODO: write intent-to-upgrade file

  ToStdout("Draining queue")
  client = GetClient()
  client.SetQueueDrainFlag(True)

  rollback.append(lambda: GetClient().SetQueueDrainFlag(False))

  # A non-zero result is treated as "jobs are still running"; presumably
  # SimpleRetry hands back the last value of _GetRunning once the timeout
  # is reached -- TODO confirm against utils.SimpleRetry.
  if utils.SimpleRetry(0, _GetRunning,
                       constants.UPGRADE_QUEUE_POLL_INTERVAL,
                       constants.UPGRADE_QUEUE_DRAIN_TIMEOUT):
    ToStderr("Failed to completely empty the queue.")
    _ExecuteCommands(rollback)
    return 1

  ToStdout("Stopping daemons on master node.")
  if not _RunCommandAndReport([pathutils.DAEMON_UTIL, "stop-all"]):
    _ExecuteCommands(rollback)
    return 1

  # Check again, now that the master daemons are stopped.
  if not _VerifyVersionInstalled(versionstring):
    # Restart the local daemons before rolling back and report a failing
    # restart instead of ignoring it; the rollback is attempted either way.
    _RunCommandAndReport([pathutils.DAEMON_UTIL, "start-all"])
    _ExecuteCommands(rollback)
    return 1

  ToStdout("Stopping daemons everywhere.")
  # Registered before the stop is attempted, so that nodes where stopping
  # partially succeeded get their daemons started again on rollback.
  rollback.append(lambda: _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"]))
  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "stop-all"])
  if badnodes:
    ToStderr("Failed to stop daemons on %s." % (", ".join(badnodes),))
    _ExecuteCommands(rollback)
    return 1

  backuptar = os.path.join(pathutils.LOCALSTATEDIR,
                           "lib/ganeti%d.tar" % time.time())
  ToStdout("Backing up configuration as %s" % backuptar)
  if not _RunCommandAndReport(["tar", "cf", backuptar,
                               pathutils.DATA_DIR]):
    _ExecuteCommands(rollback)
    return 1

  if downgrade:
    ToStdout("Downgrading configuration")
    if not _RunCommandAndReport([pathutils.CFGUPGRADE, "--downgrade", "-f"]):
      _ExecuteCommands(rollback)
      return 1

  # Configuration change is the point of no return. From then onwards, it is
  # safer to push through the up/downgrade than to try to roll it back.

  returnvalue = 0

  ToStdout("Switching to version %s on all nodes" % versionstring)
  rollback.append(lambda: _SetGanetiVersion(constants.DIR_VERSION))
  badnodes = _SetGanetiVersion(versionstring)
  if badnodes:
    ToStderr("Failed to switch to Ganeti version %s on nodes %s"
             % (versionstring, ", ".join(badnodes)))
    if not downgrade:
      _ExecuteCommands(rollback)
      return 1
    # On a downgrade the configuration has already been changed, so the
    # procedure is pushed through; the failure still has to show up in the
    # exit code.
    returnvalue = 1

  # Now that we have changed to the new version of Ganeti we should
  # not communicate over luxi any more, as luxi might have changed in
  # incompatible ways. Therefore, manually call the corresponding ganeti
  # commands using their canonical (version independent) path.

  if not downgrade:
    ToStdout("Upgrading configuration")
    if not _RunCommandAndReport([pathutils.CFGUPGRADE, "-f"]):
      _ExecuteCommands(rollback)
      return 1

  # From here on, failures are reported as warnings only and the remaining
  # steps are still carried out.

  ToStdout("Starting daemons everywhere.")
  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
  if badnodes:
    ToStderr("Warning: failed to start daemons on %s." % (", ".join(badnodes),))
    returnvalue = 1

  ToStdout("Ensuring directories everywhere.")
  badnodes = _VerifyCommand([pathutils.ENSURE_DIRS])
  if badnodes:
    ToStderr("Warning: failed to ensure directories on %s." %
             (", ".join(badnodes),))
    returnvalue = 1

  ToStdout("Redistributing the configuration.")
  if not _RunCommandAndReport(["gnt-cluster", "redist-conf", "--yes-do-it"]):
    returnvalue = 1

  ToStdout("Restarting daemons everywhere.")
  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "stop-all"])
  badnodes.extend(_VerifyCommand([pathutils.DAEMON_UTIL, "start-all"]))
  if badnodes:
    # A node may have failed both the stop and the start; list it once.
    ToStderr("Warning: failed to start daemons on %s." %
             (", ".join(list(set(badnodes))),))
    returnvalue = 1

  ToStdout("Undraining the queue.")
  if not _RunCommandAndReport(["gnt-cluster", "queue", "undrain"]):
    returnvalue = 1

  # TODO: write intent-to-upgrade file

  ToStdout("Verifying cluster.")
  if not _RunCommandAndReport(["gnt-cluster", "verify"]):
    returnvalue = 1

  return returnvalue
1898

  
1899

  
1620 1900
commands = {
1621 1901
  "init": (
1622 1902
    InitCluster, [ArgHost(min=1, max=1)],
......
1735 2015
  "show-ispecs-cmd": (
1736 2016
    ShowCreateCommand, ARGS_NONE, [], "",
1737 2017
    "Show the command line to re-create the cluster"),
2018
  "upgrade": (
2019
    UpgradeGanetiCommand, ARGS_NONE, [TO_OPT, RESUME_OPT], "",
2020
    "Upgrade (or downgrade) to a new Ganeti version"),
1738 2021
  }
1739 2022

  
1740 2023

  

Also available in: Unified diff