# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Module keeping state for Ganeti watcher.

"""
import logging
import os
import time

from ganeti import utils
from ganeti import serializer
from ganeti import errors
# Delete any record that is older than 8 hours. This value is based on the
# fact that the current retry counter is 5 and the watcher runs every
# 5 minutes, so it takes around half an hour to exceed the retry counter;
# 8 hours (16 * 1/2h) therefore seems like a reasonable reset time.
RETRY_EXPIRATION = 8 * 3600

# Keys used inside the per-instance and per-node records of the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
def OpenStateFile(path):
  """Opens the state file and acquires a lock on it.

  The file is created if it does not exist yet.

  @type path: string
  @param path: Path to state file
  @rtype: file or None
  @return: the locked state file object, or None if the lock could not be
      acquired

  """
  # The two-step dance below is necessary to allow both opening existing
  # file read/write and creating if not existing. Vanilla open will truncate
  # an existing file -or- allow creating if not existing.
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

  # Try to acquire lock on state file. If this fails, another watcher instance
  # might already be running or another program is temporarily blocking the
  # watcher from running.
  try:
    utils.LockFile(statefile_fd)
  except errors.LockError as err:
    logging.error("Can't acquire lock on state file %s: %s", path, err)
    # Do not leak the descriptor when giving up on the lock
    os.close(statefile_fd)
    # NOTE(review): the statement after the log call was lost from the
    # reviewed listing; returning None is the presumed original behaviour,
    # confirm against callers.
    return None

  return os.fdopen(statefile_fd, "w+")
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a dictionary with two top-level entries: C{"instance"},
  mapping instance names to their restart records, and C{"node"}, mapping
  node names to their last known boot ID.

  """
  def __init__(self, statefile):
    """Read and parse the state file.

    Unreadable or malformed content is not fatal: a warning is logged and
    an empty state is used instead.

    @type statefile: file
    @param statefile: State file object

    """
    self.statefile = statefile

    try:
      state_data = self.statefile.read()
      if state_data:
        loaded = serializer.Load(state_data)
      else:
        # NOTE(review): branch reconstructed; a freshly created state file
        # is empty, which is not treated as an error here.
        loaded = {}
    except Exception as msg: # pylint: disable=W0703
      # Ignore errors while loading the file and treat it as empty
      loaded = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    if not isinstance(loaded, dict):
      # Well-formed serialized data of the wrong shape (e.g. a list) would
      # otherwise break the item assignments below
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"),
                      "top-level value is not a dictionary")
      loaded = {}

    self._data = loaded
    self._data.setdefault("instance", {})
    self._data.setdefault("node", {})

    # Snapshot used by Save() to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self, filename):
    """Save state to file.

    If nothing changed since the state was loaded, only the timestamps of
    the file are updated. Otherwise the file is rewritten and the newly
    written (and locked) file becomes the current state file.

    @type filename: string
    @param filename: Path to state file

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(filename, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(filename,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, "w+")

  # NOTE(review): the "def" line of this method was lost from the reviewed
  # listing; the name "Close" is reconstructed, confirm against callers.
  def Close(self):
    """Unlock state file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @type name: string
    @param name: the name of the node to look up
    @return: the recorded boot ID, or None if nothing is recorded

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]

    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @type name: string
    @param name: the name of the node
    @param bootid: the boot ID to record for the node

    """
    ndata = self._data["node"]

    ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance_name):
    """Returns number of previous restart attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @rtype: int
    @return: the recorded number of attempts, 0 if there is no record

    """
    idata = self._data["instance"]

    record = idata.get(instance_name)
    if record is None:
      return 0

    # A record without a counter is read as "no attempts", which is how
    # RecordRestartAttempt treats it as well
    return record.get(KEY_RESTART_COUNT, 0)

  def MaintainInstanceList(self, instances):
    """Perform maintenance on the recorded instances.

    Records of instances which no longer exist and records older than
    L{RETRY_EXPIRATION} seconds are removed.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    idict = self._data["instance"]

    # First, delete obsolete instances
    obsolete_instances = set(idict).difference(instances)
    for inst in obsolete_instances:
      logging.debug("Forgetting obsolete instance %s", inst)
      idict.pop(inst, None)

    # Second, delete expired records; a record without a timestamp could
    # never expire otherwise, so it is considered expired
    earliest = time.time() - RETRY_EXPIRATION
    expired_instances = [i for i in idict
                         if idict[i].get(KEY_RESTART_WHEN, 0) < earliest]
    for inst in expired_instances:
      logging.debug("Expiring record for instance %s", inst)
      idict.pop(inst, None)

  def RecordRestartAttempt(self, instance_name):
    """Record a restart attempt.

    Stores the current time and increments the attempt counter of the
    instance, creating the record if necessary.

    @type instance_name: string
    @param instance_name: the name of the instance being restarted

    """
    idata = self._data["instance"]

    inst = idata.setdefault(instance_name, {})
    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance_name):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances).

    @type instance_name: string
    @param instance_name: the name of the instance to remove from books

    """
    idata = self._data["instance"]

    idata.pop(instance_name, None)