Revision c51ffffe lib/client/gnt_cluster.py
b/lib/client/gnt_cluster.py | ||
---|---|---|
43 | 43 |
from ganeti import uidpool |
44 | 44 |
from ganeti import compat |
45 | 45 |
from ganeti import netutils |
46 |
from ganeti import ssconf |
|
46 | 47 |
from ganeti import pathutils |
48 |
from ganeti import qlang |
|
47 | 49 |
|
48 | 50 |
|
49 | 51 |
ON_OPT = cli_option("--on", default=False, |
... | ... | |
64 | 66 |
" is drained", |
65 | 67 |
default=False, action="store_true") |
66 | 68 |
|
69 |
# Option naming the Ganeti version an upgrade should switch to
TO_OPT = cli_option(
  "--to", type="string", default=None,
  help="The Ganeti version to upgrade to")
|
71 |
|
|
72 |
# Flag asking for a pending upgrade to be resumed
RESUME_OPT = cli_option(
  "--resume", action="store_true", default=False,
  help="Resume any pending Ganeti upgrades")
|
74 |
|
|
67 | 75 |
_EPO_PING_INTERVAL = 30 # 30 seconds between pings |
68 | 76 |
_EPO_PING_TIMEOUT = 1 # 1 second |
69 | 77 |
_EPO_REACHABLE_TIMEOUT = 15 * 60 # 15 minutes |
... | ... | |
1617 | 1625 |
ToStdout(_GetCreateCommand(result)) |
1618 | 1626 |
|
1619 | 1627 |
|
1628 |
def _RunCommandAndReport(cmd):
  """Execute a command, reporting on stderr only when it fails.

  @param cmd: the command to execute
  @type cmd: list
  @rtype: bool
  @return: True if the command succeeded, False if it failed

  """
  outcome = utils.RunCmd(cmd)
  if not outcome.failed:
    # Successful runs stay silent
    return True

  details = (cmd, outcome.fail_reason, outcome.output)
  ToStderr("Command %s failed: %s; Output %s" % details)
  return False
|
1643 |
|
|
1644 |
|
|
1645 |
def _VerifyCommand(cmd):
  """Run a command on every online node and collect the nodes where it fails.

  The node list, the master name and the cluster name are read through
  C{ssconf.SimpleStore} and the command is run through C{ssh.SshRunner}.
  As this function is intended to run during upgrades, this keeps it
  working even if all Ganeti daemons are down.

  @param cmd: the command to execute
  @type cmd: list
  @rtype: list
  @return: the names of the online nodes on which the command exited
      with a non-zero exit code

  """
  shell_command = utils.text.ShellQuoteArgs([str(part) for part in cmd])

  targets = ssconf.SimpleStore().GetOnlineNodeList()
  master = ssconf.SimpleStore().GetMasterNode()
  cluster = ssconf.SimpleStore().GetClusterName()

  # The master node, if it is online, has to be handled last
  try:
    targets.remove(master)
  except ValueError:
    pass
  else:
    targets.append(master)

  runner = ssh.SshRunner(cluster_name=cluster)
  return [node for node in targets
          if runner.Run(node, constants.SSH_LOGIN_USER,
                        shell_command).exit_code != 0]
|
1679 |
|
|
1680 |
|
|
1681 |
def _VerifyVersionInstalled(versionstring):
  """Check that a Ganeti version is installed on all online nodes.

  The check tests, on every online node, for the existence of the
  version-named directory below C{pathutils.PKGLIBDIR}. Nothing is printed
  if it exists everywhere; otherwise the offending nodes are reported on
  stderr.

  @param versionstring: the version to check for
  @type versionstring: string
  @rtype: bool
  @return: True, if the version is installed on all online nodes

  """
  version_dir = os.path.join(pathutils.PKGLIBDIR, versionstring)
  missing = _VerifyCommand(["test", "-d", version_dir])
  if not missing:
    return True

  ToStderr("Ganeti version %s not installed on nodes %s"
           % (versionstring, ", ".join(missing)))
  return False
|
1701 |
|
|
1702 |
|
|
1703 |
def _GetRunning():
  """Count the jobs that are currently running.

  @rtype: int
  @return: the number of jobs still running

  """
  client = GetClient()
  running_only = qlang.MakeSimpleFilter(
    "status", frozenset([constants.JOB_STATUS_RUNNING]))
  # No fields are requested; only the number of matching rows matters
  response = client.Query(constants.QR_JOB, [], running_only)
  return len(response.data)
|
1714 |
|
|
1715 |
|
|
1716 |
def _SetGanetiVersion(versionstring):
  """Make the given Ganeti version the active one on all online nodes.

  On every online node the C{ganeti/lib} and C{ganeti/share} entries below
  C{pathutils.SYSCONFDIR} are removed and re-created as symbolic links to
  the directories named after the requested version below
  C{pathutils.PKGLIBDIR} and C{pathutils.SHAREDIR}, respectively.

  @type versionstring: string
  @param versionstring: the version to activate
  @rtype: list
  @return: the list of nodes where the version change failed

  """
  lib_link = os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")
  share_link = os.path.join(pathutils.SYSCONFDIR, "ganeti/share")

  # Executed strictly in this order: each link is removed before it is
  # created again.
  steps = [
    ["rm", "-f", lib_link],
    ["ln", "-s", "-f", os.path.join(pathutils.PKGLIBDIR, versionstring),
     lib_link],
    ["rm", "-f", share_link],
    ["ln", "-s", "-f", os.path.join(pathutils.SHAREDIR, versionstring),
     share_link],
    ]

  failures = [node for step in steps for node in _VerifyCommand(step)]
  # A node may fail more than one step; report it only once
  return list(set(failures))
|
1736 |
|
|
1737 |
|
|
1738 |
def _ExecuteCommands(fns): |
|
1739 |
"""Execute a list of functions, in reverse order. |
|
1740 |
|
|
1741 |
@type fns: list of functions. |
|
1742 |
@param fns: the functions to be executed. |
|
1743 |
|
|
1744 |
""" |
|
1745 |
for fn in reversed(fns): |
|
1746 |
fn() |
|
1747 |
|
|
1748 |
|
|
1749 |
# pylint: disable=R0911 |
|
1750 |
def UpgradeGanetiCommand(opts, args):
  """Upgrade a cluster to a new ganeti version.

  The steps, in order: validate the options and the target version, check
  that the target version is installed on all online nodes, drain the job
  queue and wait for running jobs to finish, stop the daemons (on the
  master first, then everywhere), back up the data directory, change the
  configuration and the active version, and finally bring the cluster back
  up (start daemons, ensure directories, redistribute the configuration,
  restart daemons, undrain the queue, verify the cluster).

  Failures up to and including the configuration upgrade undo the steps
  taken so far and abort with exit code 1. The one exception is a failed
  version switch during a downgrade, which is pushed through. Failures of
  the later steps are reported as warnings and reflected in the exit code.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  if ((not opts.resume and opts.to is None)
      or (opts.resume and opts.to is not None)):
    ToStderr("Precisely one of the options --to and --resume"
             " has to be given")
    return 1

  if opts.resume:
    # TODO: implement
    ToStderr("The --resume mode is not yet implemented")
    return 1

  # Undo actions for the steps taken so far; on failure they are executed
  # in reverse order.
  rollback = []

  versionstring = opts.to
  version = utils.version.ParseVersion(versionstring)
  if version is None:
    ToStderr("Could not parse version string %s" % versionstring)
    return 1

  msg = utils.version.UpgradeRange(version)
  if msg is not None:
    ToStderr("Cannot upgrade to %s: %s" % (versionstring, msg))
    return 1

  downgrade = utils.version.ShouldCfgdowngrade(version)

  if not _VerifyVersionInstalled(versionstring):
    return 1

  # TODO: write intent-to-upgrade file

  ToStdout("Draining queue")
  client = GetClient()
  client.SetQueueDrainFlag(True)

  rollback.append(lambda: GetClient().SetQueueDrainFlag(False))

  # A non-zero result means jobs were still running when polling gave up
  if utils.SimpleRetry(0, _GetRunning,
                       constants.UPGRADE_QUEUE_POLL_INTERVAL,
                       constants.UPGRADE_QUEUE_DRAIN_TIMEOUT):
    ToStderr("Failed to completely empty the queue.")
    _ExecuteCommands(rollback)
    return 1

  ToStdout("Stopping daemons on master node.")
  if not _RunCommandAndReport([pathutils.DAEMON_UTIL, "stop-all"]):
    _ExecuteCommands(rollback)
    return 1

  # Check again, now that the queue is drained and the master is stopped
  if not _VerifyVersionInstalled(versionstring):
    # The master daemons have to be running again before the rollback can
    # undrain the queue; report a failed restart instead of ignoring it.
    _RunCommandAndReport([pathutils.DAEMON_UTIL, "start-all"])
    _ExecuteCommands(rollback)
    return 1

  ToStdout("Stopping daemons everywhere.")
  rollback.append(lambda: _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"]))
  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "stop-all"])
  if badnodes:
    ToStderr("Failed to stop daemons on %s." % (", ".join(badnodes),))
    _ExecuteCommands(rollback)
    return 1

  backuptar = os.path.join(pathutils.LOCALSTATEDIR,
                           "lib/ganeti%d.tar" % time.time())
  ToStdout("Backing up configuration as %s" % backuptar)
  if not _RunCommandAndReport(["tar", "cf", backuptar,
                               pathutils.DATA_DIR]):
    _ExecuteCommands(rollback)
    return 1

  if downgrade:
    ToStdout("Downgrading configuration")
    if not _RunCommandAndReport([pathutils.CFGUPGRADE, "--downgrade", "-f"]):
      _ExecuteCommands(rollback)
      return 1

  # Configuration change is the point of no return. From then onwards, it is
  # safer to push through the up/downgrade than to try to roll it back.

  returnvalue = 0

  ToStdout("Switching to version %s on all nodes" % versionstring)
  rollback.append(lambda: _SetGanetiVersion(constants.DIR_VERSION))
  badnodes = _SetGanetiVersion(versionstring)
  if badnodes:
    ToStderr("Failed to switch to Ganeti version %s on nodes %s"
             % (versionstring, ", ".join(badnodes)))
    if not downgrade:
      _ExecuteCommands(rollback)
      return 1
    # The configuration has already been downgraded, so the downgrade is
    # pushed through; the failure must still show up in the exit code.
    returnvalue = 1

  # Now that we have changed to the new version of Ganeti we should
  # not communicate over luxi any more, as luxi might have changed in
  # incompatible ways. Therefore, manually call the corresponding ganeti
  # commands using their canonical (version independent) path.

  if not downgrade:
    ToStdout("Upgrading configuration")
    if not _RunCommandAndReport([pathutils.CFGUPGRADE, "-f"]):
      _ExecuteCommands(rollback)
      return 1

  ToStdout("Starting daemons everywhere.")
  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
  if badnodes:
    ToStderr("Warning: failed to start daemons on %s." %
             (", ".join(badnodes),))
    returnvalue = 1

  ToStdout("Ensuring directories everywhere.")
  badnodes = _VerifyCommand([pathutils.ENSURE_DIRS])
  if badnodes:
    ToStderr("Warning: failed to ensure directories on %s." %
             (", ".join(badnodes),))
    returnvalue = 1

  ToStdout("Redistributing the configuration.")
  if not _RunCommandAndReport(["gnt-cluster", "redist-conf", "--yes-do-it"]):
    returnvalue = 1

  ToStdout("Restarting daemons everywhere.")
  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "stop-all"])
  badnodes.extend(_VerifyCommand([pathutils.DAEMON_UTIL, "start-all"]))
  if badnodes:
    # A node may have failed both the stop and the start; name it only once
    ToStderr("Warning: failed to start daemons on %s." %
             (", ".join(list(set(badnodes))),))
    returnvalue = 1

  ToStdout("Undraining the queue.")
  if not _RunCommandAndReport(["gnt-cluster", "queue", "undrain"]):
    returnvalue = 1

  # TODO: write intent-to-upgrade file
  # NOTE(review): at the end of the upgrade this presumably means clearing
  # the intent file rather than writing it -- confirm.

  ToStdout("Verifying cluster.")
  if not _RunCommandAndReport(["gnt-cluster", "verify"]):
    returnvalue = 1

  return returnvalue
|
1898 |
|
|
1899 |
|
|
1620 | 1900 |
commands = { |
1621 | 1901 |
"init": ( |
1622 | 1902 |
InitCluster, [ArgHost(min=1, max=1)], |
... | ... | |
1735 | 2015 |
"show-ispecs-cmd": ( |
1736 | 2016 |
ShowCreateCommand, ARGS_NONE, [], "", |
1737 | 2017 |
"Show the command line to re-create the cluster"), |
2018 |
"upgrade": ( |
|
2019 |
UpgradeGanetiCommand, ARGS_NONE, [TO_OPT, RESUME_OPT], "", |
|
2020 |
"Upgrade (or downgrade) to a new Ganeti version"), |
|
1738 | 2021 |
} |
1739 | 2022 |
|
1740 | 2023 |
|
Also available in: Unified diff