Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / state.py @ 3bd0f3d8

History | View | Annotate | Download (6.1 kB)

1 adf6301e Michael Hanselmann
#
2 adf6301e Michael Hanselmann
#
3 adf6301e Michael Hanselmann
4 adf6301e Michael Hanselmann
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5 adf6301e Michael Hanselmann
#
6 adf6301e Michael Hanselmann
# This program is free software; you can redistribute it and/or modify
7 adf6301e Michael Hanselmann
# it under the terms of the GNU General Public License as published by
8 adf6301e Michael Hanselmann
# the Free Software Foundation; either version 2 of the License, or
9 adf6301e Michael Hanselmann
# (at your option) any later version.
10 adf6301e Michael Hanselmann
#
11 adf6301e Michael Hanselmann
# This program is distributed in the hope that it will be useful, but
12 adf6301e Michael Hanselmann
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 adf6301e Michael Hanselmann
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 adf6301e Michael Hanselmann
# General Public License for more details.
15 adf6301e Michael Hanselmann
#
16 adf6301e Michael Hanselmann
# You should have received a copy of the GNU General Public License
17 adf6301e Michael Hanselmann
# along with this program; if not, write to the Free Software
18 adf6301e Michael Hanselmann
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 adf6301e Michael Hanselmann
# 02110-1301, USA.
20 adf6301e Michael Hanselmann
21 adf6301e Michael Hanselmann
22 adf6301e Michael Hanselmann
"""Module keeping state for Ganeti watcher.
23 adf6301e Michael Hanselmann

24 adf6301e Michael Hanselmann
"""
25 adf6301e Michael Hanselmann
26 adf6301e Michael Hanselmann
import os
27 adf6301e Michael Hanselmann
import time
28 adf6301e Michael Hanselmann
import logging
29 adf6301e Michael Hanselmann
30 adf6301e Michael Hanselmann
from ganeti import utils
31 adf6301e Michael Hanselmann
from ganeti import serializer
32 adf6301e Michael Hanselmann
from ganeti import errors
33 adf6301e Michael Hanselmann
34 adf6301e Michael Hanselmann
35 adf6301e Michael Hanselmann
# Delete any record that is older than 8 hours; this value is based on
# the fact that the current retry counter is 5, and watcher runs every
# 5 minutes, so it takes around half an hour to exceed the retry
# counter, so 8 hours (16*1/2h) seems like a reasonable reset time.
# The value is expressed in seconds.
RETRY_EXPIRATION = 8 * 3600

# Keys used inside the per-instance and per-node records of the state data
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
44 adf6301e Michael Hanselmann
45 adf6301e Michael Hanselmann
46 adf6301e Michael Hanselmann
def OpenStateFile(path):
  """Opens the state file and acquires a lock on it.

  @type path: string
  @param path: Path to state file
  @rtype: file or None
  @return: State file object opened in "w+" mode on the locked descriptor,
      or None if the lock could not be acquired

  """
  # The two-step dance below is necessary to allow both opening existing
  # file read/write and creating if not existing. Vanilla open will truncate
  # an existing file -or- allow creating if not existing.
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

  # Try to acquire lock on state file. If this fails, another watcher instance
  # might already be running or another program is temporarily blocking the
  # watcher from running.
  try:
    utils.LockFile(statefile_fd)
  except errors.LockError as err:
    logging.error("Can't acquire lock on state file %s: %s", path, err)
    # The descriptor is of no use without the lock; close it instead of
    # leaking it to the caller's process
    os.close(statefile_fd)
    return None

  return os.fdopen(statefile_fd, "w+")
68 adf6301e Michael Hanselmann
69 adf6301e Michael Hanselmann
70 adf6301e Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is kept in memory as a dictionary with two top-level entries,
  "instance" and "node", each mapping a name to a record (dictionary).

  """
  def __init__(self, statefile):
    """Read and parse the already opened state file.

    Any error while reading or parsing is logged and the state is treated
    as empty.

    @type statefile: file
    @param statefile: State file object

    """
    self.statefile = statefile

    try:
      state_data = self.statefile.read()
      if state_data:
        data = serializer.Load(state_data)
        # The code below indexes the state by string keys, so anything but
        # a dictionary is as unusable as a file that fails to parse
        if not isinstance(data, dict):
          raise ValueError("State data is not a dictionary")
      else:
        data = {}
    except Exception as msg: # pylint: disable=W0703
      # Ignore errors while loading the file and treat it as empty
      data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    self._data = data
    self._data.setdefault("instance", {})
    self._data.setdefault("node", {})

    # Serialized snapshot used by Save to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self, filename):
    """Save state to file if it changed, otherwise touch the file.

    If the serialized state is identical to what was loaded, only the
    timestamps of C{filename} are updated. Otherwise the state is written
    through L{utils.WriteFile} and the returned descriptor replaces
    C{self.statefile}; the file is left open (see L{Close}).

    @type filename: string
    @param filename: Path to state file

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(filename, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(filename,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, "w+")

  def Close(self):
    """Unlock state file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @type name: string
    @param name: the name of the node to look up

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @type name: string
    @param name: the name of the node
    @param bootid: the boot ID to record; must evaluate to true

    """
    assert bootid

    ndata = self._data["node"]

    ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance_name):
    """Returns number of previous restart attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @return: the recorded restart count, or 0 if there is no record or the
        record carries no count

    """
    idata = self._data["instance"]

    if instance_name in idata:
      # A record without a counter (e.g. from a damaged state file) counts
      # as "no attempts" instead of raising KeyError
      return idata[instance_name].get(KEY_RESTART_COUNT, 0)

    return 0

  def MaintainInstanceList(self, instances):
    """Perform maintenance on the recorded instances.

    Records of instances not listed in C{instances} are dropped, and so are
    records whose last restart attempt is older than L{RETRY_EXPIRATION}
    seconds.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    idict = self._data["instance"]

    # First, delete obsolete instances
    obsolete_instances = set(idict).difference(instances)
    for inst in obsolete_instances:
      logging.debug("Forgetting obsolete instance %s", inst)
      idict.pop(inst, None)

    # Second, delete expired records; a record without a timestamp cannot
    # be aged, so it is treated as expired rather than raising KeyError
    earliest = time.time() - RETRY_EXPIRATION
    expired_instances = [i for i in idict
                         if idict[i].get(KEY_RESTART_WHEN, 0) < earliest]
    for inst in expired_instances:
      logging.debug("Expiring record for instance %s", inst)
      idict.pop(inst, None)

  def RecordRestartAttempt(self, instance_name):
    """Record a restart attempt.

    Stores the current time as the time of the attempt and increments the
    instance's restart counter, creating the record if necessary.

    @type instance_name: string
    @param instance_name: the name of the instance being restarted

    """
    idata = self._data["instance"]

    inst = idata.setdefault(instance_name, {})
    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance_name):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances). Removing an unknown instance is a no-op.

    @type instance_name: string
    @param instance_name: the name of the instance to remove from books

    """
    idata = self._data["instance"]

    idata.pop(instance_name, None)