Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / state.py @ 31d3b918

History | View | Annotate | Download (7.4 kB)

1 adf6301e Michael Hanselmann
#
2 adf6301e Michael Hanselmann
#
3 adf6301e Michael Hanselmann
4 adf6301e Michael Hanselmann
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5 adf6301e Michael Hanselmann
#
6 adf6301e Michael Hanselmann
# This program is free software; you can redistribute it and/or modify
7 adf6301e Michael Hanselmann
# it under the terms of the GNU General Public License as published by
8 adf6301e Michael Hanselmann
# the Free Software Foundation; either version 2 of the License, or
9 adf6301e Michael Hanselmann
# (at your option) any later version.
10 adf6301e Michael Hanselmann
#
11 adf6301e Michael Hanselmann
# This program is distributed in the hope that it will be useful, but
12 adf6301e Michael Hanselmann
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 adf6301e Michael Hanselmann
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 adf6301e Michael Hanselmann
# General Public License for more details.
15 adf6301e Michael Hanselmann
#
16 adf6301e Michael Hanselmann
# You should have received a copy of the GNU General Public License
17 adf6301e Michael Hanselmann
# along with this program; if not, write to the Free Software
18 adf6301e Michael Hanselmann
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 adf6301e Michael Hanselmann
# 02110-1301, USA.
20 adf6301e Michael Hanselmann
21 adf6301e Michael Hanselmann
22 adf6301e Michael Hanselmann
"""Module keeping state for Ganeti watcher.
23 adf6301e Michael Hanselmann

24 adf6301e Michael Hanselmann
"""
25 adf6301e Michael Hanselmann
26 adf6301e Michael Hanselmann
import os
27 adf6301e Michael Hanselmann
import time
28 adf6301e Michael Hanselmann
import logging
29 adf6301e Michael Hanselmann
30 adf6301e Michael Hanselmann
from ganeti import utils
31 adf6301e Michael Hanselmann
from ganeti import serializer
32 adf6301e Michael Hanselmann
from ganeti import errors
33 adf6301e Michael Hanselmann
34 adf6301e Michael Hanselmann
35 adf6301e Michael Hanselmann
# Delete any record that is older than 8 hours; this value is based on
36 adf6301e Michael Hanselmann
# the fact that the current retry counter is 5, and watcher runs every
37 adf6301e Michael Hanselmann
# 5 minutes, so it takes around half an hour to exceed the retry
38 adf6301e Michael Hanselmann
# counter, so 8 hours (16*1/2h) seems like a reasonable reset time
39 adf6301e Michael Hanselmann
# Expressed in seconds: the value is subtracted from time.time() when
# looking for expired records
RETRY_EXPIRATION = 8 * 3600
40 adf6301e Michael Hanselmann
# Keys stored in the per-instance records ("instance" section of the state)
41 e52e0ddc Jose A. Lopes
KEY_CLEANUP_COUNT = "cleanup_count"
42 e52e0ddc Jose A. Lopes
KEY_CLEANUP_WHEN = "cleanup_when"
43 adf6301e Michael Hanselmann
KEY_RESTART_COUNT = "restart_count"
44 adf6301e Michael Hanselmann
KEY_RESTART_WHEN = "restart_when"
# Key stored in the per-node records ("node" section of the state)
45 adf6301e Michael Hanselmann
KEY_BOOT_ID = "bootid"
46 adf6301e Michael Hanselmann
47 adf6301e Michael Hanselmann
48 adf6301e Michael Hanselmann
def OpenStateFile(path):
  """Opens the state file and acquires a lock on it.

  The file is created if it does not exist yet; an existing file is opened
  read/write without being truncated.

  @type path: string
  @param path: Path to state file
  @rtype: file or None
  @return: File object wrapping the locked descriptor, or None if the lock
    could not be acquired (the failure is logged)

  """
  # The two-step dance below is necessary to allow both opening existing
  # file read/write and creating if not existing. Vanilla open will truncate
  # an existing file -or- allow creating if not existing.
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

  # Try to acquire lock on state file. If this fails, another watcher instance
  # might already be running or another program is temporarily blocking the
  # watcher from running.
  try:
    utils.LockFile(statefile_fd)
  except errors.LockError as err:
    logging.error("Can't acquire lock on state file %s: %s", path, err)
    # Nobody else knows about the descriptor, so release it here instead of
    # leaking it to the caller's process
    os.close(statefile_fd)
    return None

  return os.fdopen(statefile_fd, "w+")
70 adf6301e Michael Hanselmann
71 adf6301e Michael Hanselmann
72 adf6301e Michael Hanselmann
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a dictionary with two sections, C{"instance"} and C{"node"},
  each mapping a name to a dictionary of recorded values. Instance records
  hold restart and/or cleanup counters with the time of the last attempt;
  node records hold the last seen boot ID.

  """
  def __init__(self, statefile):
    """Read and parse an already opened state file.

    Content which can't be read or parsed is not fatal: a warning is logged
    and an empty state is used instead.

    @type statefile: file
    @param statefile: State file object

    """
    self.statefile = statefile

    try:
      state_data = self.statefile.read()
      if not state_data:
        self._data = {}
      else:
        self._data = serializer.Load(state_data)
        if not isinstance(self._data, dict):
          # Only a dictionary can hold the sections set up below; anything
          # else is handled like a corrupt file
          raise TypeError("State data is not a dictionary")
    except Exception as msg: # pylint: disable=W0703
      # Ignore errors while loading the file and treat it as empty
      self._data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    if "instance" not in self._data:
      self._data["instance"] = {}
    if "node" not in self._data:
      self._data["node"] = {}

    # Serialized snapshot used by Save to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self, filename):
    """Save state to file and keep holding a locked file object.

    If the state is unchanged since it was loaded, only the timestamps of
    C{filename} are updated. Otherwise the serialized state is written with
    L{utils.WriteFile} and the descriptor it returns replaces the current
    state file object. Use L{Close} to release it.

    @type filename: string
    @param filename: Path to state file

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(filename, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(filename,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    new_statefile = os.fdopen(fd, "w+")

    # The replacement is in place, hence the previous file object can be
    # closed explicitly instead of relying on garbage collection
    old_statefile = self.statefile
    self.statefile = new_statefile
    old_statefile.close()

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @type name: string
    @param name: the name of the node to look up

    """
    return self._data["node"].get(name, {}).get(KEY_BOOT_ID)

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @type name: string
    @param name: the name of the node
    @param bootid: the boot ID to record; must not be empty

    """
    assert bootid

    ndata = self._data["node"]

    ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance_name):
    """Returns number of previous restart attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @rtype: int
    @return: the recorded counter, or 0 if there is no record or the record
      only tracks cleanup attempts

    """
    idata = self._data["instance"]

    return idata.get(instance_name, {}).get(KEY_RESTART_COUNT, 0)

  def NumberOfCleanupAttempts(self, instance_name):
    """Returns number of previous cleanup attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @rtype: int
    @return: the recorded counter, or 0 if there is no record or the record
      only tracks restart attempts

    """
    idata = self._data["instance"]

    return idata.get(instance_name, {}).get(KEY_CLEANUP_COUNT, 0)

  def MaintainInstanceList(self, instances):
    """Perform maintenance on the recorded instances.

    Records of instances not listed in C{instances} are dropped, and so are
    records whose last attempt is older than C{RETRY_EXPIRATION}.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    idict = self._data["instance"]

    # First, delete obsolete instances
    obsolete_instances = set(idict).difference(instances)
    for inst in obsolete_instances:
      logging.debug("Forgetting obsolete instance %s", inst)
      idict.pop(inst, None)

    # Second, delete expired records. A record may have been created by a
    # cleanup attempt only, in which case there's no restart timestamp and
    # the cleanup one is used; records without any timestamp can never be
    # refreshed and are treated as expired.
    earliest = time.time() - RETRY_EXPIRATION
    expired_instances = [
      name for (name, record) in idict.items()
      if record.get(KEY_RESTART_WHEN,
                    record.get(KEY_CLEANUP_WHEN, 0)) < earliest
      ]
    for inst in expired_instances:
      logging.debug("Expiring record for instance %s", inst)
      idict.pop(inst, None)

  @staticmethod
  def _RecordAttempt(instances, instance_name, key_when, key_count):
    """Record an event.

    The record for the instance is created if necessary; other keys already
    present in it are left untouched.

    @type instances: dict
    @param instances: contains instance data indexed by instance_name

    @type instance_name: string
    @param instance_name: name of the instance involved in the event

    @type key_when: string
    @param key_when: dict key for the information for when the event occurred

    @type key_count: string
    @param key_count: dict key for the information for how many times
                      the event occurred

    """
    instance = instances.setdefault(instance_name, {})
    instance[key_when] = time.time()
    instance[key_count] = instance.get(key_count, 0) + 1

  def RecordRestartAttempt(self, instance_name):
    """Record a restart attempt.

    @type instance_name: string
    @param instance_name: the name of the instance being restarted

    """
    self._RecordAttempt(self._data["instance"], instance_name,
                        KEY_RESTART_WHEN, KEY_RESTART_COUNT)

  def RecordCleanupAttempt(self, instance_name):
    """Record a cleanup attempt.

    @type instance_name: string
    @param instance_name: the name of the instance being cleaned up

    """
    self._RecordAttempt(self._data["instance"], instance_name,
                        KEY_CLEANUP_WHEN, KEY_CLEANUP_COUNT)

  def RemoveInstance(self, instance_name):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances). Unknown names are ignored.

    @type instance_name: string
    @param instance_name: the name of the instance to remove from books

    """
    idata = self._data["instance"]

    idata.pop(instance_name, None)