root / lib / watcher / state.py @ 31d3b918
History | View | Annotate | Download (7.4 kB)
1 | adf6301e | Michael Hanselmann | #
|
---|---|---|---|
2 | adf6301e | Michael Hanselmann | #
|
3 | adf6301e | Michael Hanselmann | |
4 | adf6301e | Michael Hanselmann | # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
|
5 | adf6301e | Michael Hanselmann | #
|
6 | adf6301e | Michael Hanselmann | # This program is free software; you can redistribute it and/or modify
|
7 | adf6301e | Michael Hanselmann | # it under the terms of the GNU General Public License as published by
|
8 | adf6301e | Michael Hanselmann | # the Free Software Foundation; either version 2 of the License, or
|
9 | adf6301e | Michael Hanselmann | # (at your option) any later version.
|
10 | adf6301e | Michael Hanselmann | #
|
11 | adf6301e | Michael Hanselmann | # This program is distributed in the hope that it will be useful, but
|
12 | adf6301e | Michael Hanselmann | # WITHOUT ANY WARRANTY; without even the implied warranty of
|
13 | adf6301e | Michael Hanselmann | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14 | adf6301e | Michael Hanselmann | # General Public License for more details.
|
15 | adf6301e | Michael Hanselmann | #
|
16 | adf6301e | Michael Hanselmann | # You should have received a copy of the GNU General Public License
|
17 | adf6301e | Michael Hanselmann | # along with this program; if not, write to the Free Software
|
18 | adf6301e | Michael Hanselmann | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
19 | adf6301e | Michael Hanselmann | # 02110-1301, USA.
|
20 | adf6301e | Michael Hanselmann | |
21 | adf6301e | Michael Hanselmann | |
22 | adf6301e | Michael Hanselmann | """Module keeping state for Ganeti watcher.
|
23 | adf6301e | Michael Hanselmann |
|
24 | adf6301e | Michael Hanselmann | """
|
25 | adf6301e | Michael Hanselmann | |
26 | adf6301e | Michael Hanselmann | import os |
27 | adf6301e | Michael Hanselmann | import time |
28 | adf6301e | Michael Hanselmann | import logging |
29 | adf6301e | Michael Hanselmann | |
30 | adf6301e | Michael Hanselmann | from ganeti import utils |
31 | adf6301e | Michael Hanselmann | from ganeti import serializer |
32 | adf6301e | Michael Hanselmann | from ganeti import errors |
33 | adf6301e | Michael Hanselmann | |
34 | adf6301e | Michael Hanselmann | |
# Maximum age, in seconds, of a per-instance record before it is deleted.
# Rationale: the retry counter is currently 5 and the watcher runs every
# 5 minutes, so exceeding the retry counter takes roughly half an hour;
# 8 hours (sixteen such half-hour windows) seems like a reasonable reset time.
RETRY_EXPIRATION = 8 * 60 * 60

# Keys used inside the per-instance and per-node dictionaries of the state
# file.
KEY_CLEANUP_COUNT = "cleanup_count"
KEY_CLEANUP_WHEN = "cleanup_when"
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
|
46 | adf6301e | Michael Hanselmann | |
47 | adf6301e | Michael Hanselmann | |
def OpenStateFile(path):
  """Opens the state file and acquires a lock on it.

  @type path: string
  @param path: Path to state file
  @rtype: file or None
  @return: File object wrapping the locked state file, or None if the lock
    could not be acquired (the failure is logged)

  """
  # The two-step dance below is necessary to allow both opening existing
  # file read/write and creating if not existing. Vanilla open will truncate
  # an existing file -or- allow creating if not existing.
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

  # Try to acquire lock on state file. If this fails, another watcher instance
  # might already be running or another program is temporarily blocking the
  # watcher from running.
  try:
    utils.LockFile(statefile_fd)
  except errors.LockError as err:
    logging.error("Can't acquire lock on state file %s: %s", path, err)
    # We are giving up on this descriptor; close it so it is not leaked
    os.close(statefile_fd)
    return None

  return os.fdopen(statefile_fd, "w+")
70 | adf6301e | Michael Hanselmann | |
71 | adf6301e | Michael Hanselmann | |
class WatcherState(object):
  """Interface to a state file recording restart and cleanup attempts.

  The state is a dictionary with two top-level sections: C{"instance"}
  (per-instance attempt counters and timestamps) and C{"node"} (per-node
  boot IDs).

  """
  def __init__(self, statefile):
    """Read and parse the already opened state file.

    An empty, unreadable or malformed file is treated as empty state; the
    problem is logged as a warning.

    @type statefile: file
    @param statefile: State file object

    """
    self.statefile = statefile

    try:
      state_data = self.statefile.read()
      if state_data:
        loaded = serializer.Load(state_data)
        # Everything below indexes the state by key, so anything but a
        # dictionary is handled like any other invalid file content
        if not isinstance(loaded, dict):
          raise ValueError("Top-level state is not a dictionary (got %s)" %
                           type(loaded).__name__)
        self._data = loaded
      else:
        self._data = {}
    except Exception as msg: # pylint: disable=W0703
      # Ignore errors while loading the file and treat it as empty
      self._data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    self._data.setdefault("instance", {})
    self._data.setdefault("node", {})

    # Remembered so Save can tell whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self, filename):
    """Save state to file.

    If the serialized state is unchanged since loading, only the file's
    timestamps are updated. Otherwise the data is written through
    L{utils.WriteFile} and the resulting descriptor becomes the new state
    file object; it is left open until L{Close} is called.

    @type filename: string
    @param filename: Path of the state file to write or touch

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(filename, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(filename,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, "w+")

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @type name: string
    @param name: the name of the node to look up

    """
    return self._data["node"].get(name, {}).get(KEY_BOOT_ID)

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @type name: string
    @param name: the name of the node
    @param bootid: the boot ID to record; must be a true value

    """
    assert bootid

    ndata = self._data["node"]

    ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance_name):
    """Returns number of previous restart attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @rtype: int
    @return: the recorded restart count, or 0 if none was recorded

    """
    # A record may exist with only cleanup information (see
    # RecordCleanupAttempt), hence the default on the inner lookup too
    record = self._data["instance"].get(instance_name, {})
    return record.get(KEY_RESTART_COUNT, 0)

  def NumberOfCleanupAttempts(self, instance_name):
    """Returns number of previous cleanup attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @rtype: int
    @return: the recorded cleanup count, or 0 if none was recorded

    """
    # A record may exist with only restart information (see
    # RecordRestartAttempt), hence the default on the inner lookup too
    record = self._data["instance"].get(instance_name, {})
    return record.get(KEY_CLEANUP_COUNT, 0)

  def MaintainInstanceList(self, instances):
    """Perform maintenance on the recorded instances.

    Drops records of instances which no longer exist and records older than
    L{RETRY_EXPIRATION} seconds.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    idict = self._data["instance"]

    # First, delete obsolete instances
    obsolete_instances = set(idict).difference(instances)
    for inst in obsolete_instances:
      logging.debug("Forgetting obsolete instance %s", inst)
      idict.pop(inst, None)

    # Second, delete expired records. The age is taken from the restart
    # timestamp; records holding only cleanup information fall back to the
    # cleanup timestamp, and records without any timestamp count as expired.
    earliest = time.time() - RETRY_EXPIRATION
    expired_instances = []
    for (inst, record) in idict.items():
      when = record.get(KEY_RESTART_WHEN, record.get(KEY_CLEANUP_WHEN, 0))
      if when < earliest:
        expired_instances.append(inst)

    for inst in expired_instances:
      logging.debug("Expiring record for instance %s", inst)
      idict.pop(inst, None)

  @staticmethod
  def _RecordAttempt(instances, instance_name, key_when, key_count):
    """Record an event.

    Stores the current time under C{key_when} and increments the counter
    under C{key_count}, creating the instance record if necessary.

    @type instances: dict
    @param instances: contains instance data indexed by instance_name

    @type instance_name: string
    @param instance_name: name of the instance involved in the event

    @type key_when: string
    @param key_when: dict key for the information for when the event occurred

    @type key_count: string
    @param key_count: dict key for the information for how many times
                      the event occurred

    """
    instance = instances.setdefault(instance_name, {})
    instance[key_when] = time.time()
    instance[key_count] = instance.get(key_count, 0) + 1

  def RecordRestartAttempt(self, instance_name):
    """Record a restart attempt.

    @type instance_name: string
    @param instance_name: the name of the instance being restarted

    """
    self._RecordAttempt(self._data["instance"], instance_name,
                        KEY_RESTART_WHEN, KEY_RESTART_COUNT)

  def RecordCleanupAttempt(self, instance_name):
    """Record a cleanup attempt.

    @type instance_name: string
    @param instance_name: the name of the instance being cleaned up

    """
    self._RecordAttempt(self._data["instance"], instance_name,
                        KEY_CLEANUP_WHEN, KEY_CLEANUP_COUNT)

  def RemoveInstance(self, instance_name):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances).

    @type instance_name: string
    @param instance_name: the name of the instance to remove from books

    """
    idata = self._data["instance"]

    idata.pop(instance_name, None)