root / lib / watcher / state.py @ 40b068e5
History | View | Annotate | Download (6.1 kB)
1 | adf6301e | Michael Hanselmann | #
|
---|---|---|---|
2 | adf6301e | Michael Hanselmann | #
|
3 | adf6301e | Michael Hanselmann | |
4 | adf6301e | Michael Hanselmann | # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
|
5 | adf6301e | Michael Hanselmann | #
|
6 | adf6301e | Michael Hanselmann | # This program is free software; you can redistribute it and/or modify
|
7 | adf6301e | Michael Hanselmann | # it under the terms of the GNU General Public License as published by
|
8 | adf6301e | Michael Hanselmann | # the Free Software Foundation; either version 2 of the License, or
|
9 | adf6301e | Michael Hanselmann | # (at your option) any later version.
|
10 | adf6301e | Michael Hanselmann | #
|
11 | adf6301e | Michael Hanselmann | # This program is distributed in the hope that it will be useful, but
|
12 | adf6301e | Michael Hanselmann | # WITHOUT ANY WARRANTY; without even the implied warranty of
|
13 | adf6301e | Michael Hanselmann | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14 | adf6301e | Michael Hanselmann | # General Public License for more details.
|
15 | adf6301e | Michael Hanselmann | #
|
16 | adf6301e | Michael Hanselmann | # You should have received a copy of the GNU General Public License
|
17 | adf6301e | Michael Hanselmann | # along with this program; if not, write to the Free Software
|
18 | adf6301e | Michael Hanselmann | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
19 | adf6301e | Michael Hanselmann | # 02110-1301, USA.
|
20 | adf6301e | Michael Hanselmann | |
21 | adf6301e | Michael Hanselmann | |
22 | adf6301e | Michael Hanselmann | """Module keeping state for Ganeti watcher.
|
23 | adf6301e | Michael Hanselmann |
|
24 | adf6301e | Michael Hanselmann | """
|
25 | adf6301e | Michael Hanselmann | |
26 | adf6301e | Michael Hanselmann | import os |
27 | adf6301e | Michael Hanselmann | import time |
28 | adf6301e | Michael Hanselmann | import logging |
29 | adf6301e | Michael Hanselmann | |
30 | adf6301e | Michael Hanselmann | from ganeti import utils |
31 | adf6301e | Michael Hanselmann | from ganeti import serializer |
32 | adf6301e | Michael Hanselmann | from ganeti import errors |
33 | adf6301e | Michael Hanselmann | |
34 | adf6301e | Michael Hanselmann | |
# Restart records older than this many seconds are deleted. The value is
# derived from the current retry counter of 5 and the watcher running every
# 5 minutes: it takes around half an hour to exceed the retry counter, so
# 8 hours (16 * 1/2h) seems like a reasonable reset time.
RETRY_EXPIRATION = 8 * 60 * 60

# Keys used inside the per-instance records of the state data
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"

# Key used inside the per-node records of the state data
KEY_BOOT_ID = "bootid"
44 | adf6301e | Michael Hanselmann | |
45 | adf6301e | Michael Hanselmann | |
def OpenStateFile(path):
  """Opens the state file and acquires a lock on it.

  @type path: string
  @param path: Path to state file
  @rtype: file or None
  @return: File object wrapping the locked descriptor, or None if the lock
      could not be acquired (the error is logged)

  """
  # The two-step dance below is necessary to allow both opening existing
  # file read/write and creating if not existing. Vanilla open will truncate
  # an existing file -or- allow creating if not existing.
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

  # Try to acquire lock on state file. If this fails, another watcher instance
  # might already be running or another program is temporarily blocking the
  # watcher from running.
  try:
    utils.LockFile(statefile_fd)
  except errors.LockError as err:
    logging.error("Can't acquire lock on state file %s: %s", path, err)
    # Nobody else knows about this descriptor; close it so it doesn't leak
    os.close(statefile_fd)
    return None

  return os.fdopen(statefile_fd, "w+")
68 | adf6301e | Michael Hanselmann | |
69 | adf6301e | Michael Hanselmann | |
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is held in memory as a dictionary with two sections,
  "instance" and "node", each mapping a name to a dictionary of values
  (restart count and time for instances, boot ID for nodes).

  """
  def __init__(self, statefile):
    """Read and parse the given state file.

    Errors while reading or parsing are logged and the state is treated
    as empty.

    @type statefile: file
    @param statefile: State file object

    """
    self.statefile = statefile

    try:
      state_data = self.statefile.read()
      if state_data:
        loaded = serializer.Load(state_data)
        # Anything but a dictionary can't be used by the code below
        if not isinstance(loaded, dict):
          raise TypeError("State data is not a dictionary")
      else:
        loaded = {}
    except Exception as msg: # pylint: disable=W0703
      # Ignore errors while loading the file and treat it as empty
      loaded = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    self._data = loaded

    for section in ("instance", "node"):
      if section in self._data and not isinstance(self._data[section], dict):
        # A section of the wrong type would break every accessor later on
        logging.warning("Invalid '%s' section in state file. Using defaults.",
                        section)
        del self._data[section]
      self._data.setdefault(section, {})

    # Serialized snapshot, used by Save to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self, filename):
    """Save state to file.

    If the serialized state equals the snapshot taken when loading, only
    C{os.utime} is called on the file. Otherwise the data is written using
    C{utils.WriteFile} and C{self.statefile} is replaced with a file object
    for the returned descriptor. The file is not closed here; use L{Close}
    for that.

    @type filename: string
    @param filename: Path of the state file to write or touch

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(filename, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(filename,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    # The previously held file object is not closed explicitly, only replaced
    self.statefile = os.fdopen(fd, "w+")

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @type name: string
    @param name: the name of the node to look up

    """
    return self._data["node"].get(name, {}).get(KEY_BOOT_ID)

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @type name: string
    @param name: the name of the node
    @param bootid: the boot ID to record; must evaluate to true

    """
    assert bootid

    self._data["node"].setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance_name):
    """Returns number of previous restart attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @return: the recorded restart count, or 0 if there is no record or the
        record carries no count

    """
    record = self._data["instance"].get(instance_name)
    if not record:
      return 0

    # Records read from disk might lack the counter; don't fail on those
    return record.get(KEY_RESTART_COUNT, 0)

  def MaintainInstanceList(self, instances):
    """Perform maintenance on the recorded instances.

    Removes records of instances which are not in the given list as well
    as records whose last restart is older than L{RETRY_EXPIRATION}
    seconds. Records without a restart time are treated as expired.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    idict = self._data["instance"]

    # First, delete obsolete instances
    for inst in set(idict).difference(instances):
      logging.debug("Forgetting obsolete instance %s", inst)
      idict.pop(inst, None)

    # Second, delete expired records; the names are collected first as the
    # dictionary must not change while iterating over it
    earliest = time.time() - RETRY_EXPIRATION
    expired_instances = [name for name in idict
                         if idict[name].get(KEY_RESTART_WHEN, 0) < earliest]
    for inst in expired_instances:
      logging.debug("Expiring record for instance %s", inst)
      idict.pop(inst, None)

  def RecordRestartAttempt(self, instance_name):
    """Record a restart attempt.

    Stores the current time as the time of the last restart and increments
    the restart counter, creating the record if necessary.

    @type instance_name: string
    @param instance_name: the name of the instance being restarted

    """
    inst = self._data["instance"].setdefault(instance_name, {})
    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance_name):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances). Removing an unknown instance is not an error.

    @type instance_name: string
    @param instance_name: the name of the instance to remove from books

    """
    self._data["instance"].pop(instance_name, None)