Revision 50273051 — diff of daemons/ganeti-watcher
(columns below are the old and new line numbers from the revision-control diff view)
44 | 44 |
from ganeti import opcodes |
45 | 45 |
from ganeti import cli |
46 | 46 |
from ganeti import luxi |
47 |
from ganeti import ssconf |
|
48 |
from ganeti import bdev |
|
49 |
from ganeti import hypervisor |
|
50 |
from ganeti.confd import client as confd_client |
|
47 | 51 |
|
48 | 52 |
|
49 | 53 |
MAXTRIES = 5 |
... | ... | |
109 | 113 |
runresult.output) |
110 | 114 |
|
111 | 115 |
|
116 |
class NodeMaintenance(object):
  """Talks to confd daemons and possible shutdown instances/drbd devices.

  Queries the local node's desired role via confd and, when the cluster
  says this node should be offline, forcibly stops any running instances
  and shuts down active DRBD minors (see L{DoMaintenance}).

  """
  def __init__(self):
    # Result-storing callback wrapped in a consistency filter; the filter
    # feeds the raw confd client and is consulted again in Exec() to
    # detect inconsistent replies.
    self.store_cb = confd_client.StoreResultCallback()
    self.filter_cb = confd_client.ConfdFilterCallback(self.store_cb)
    self.confd_client = confd_client.GetConfdClient(self.filter_cb)

  @staticmethod
  def ShouldRun():
    """Checks whether node maintenance should run.

    @rtype: boolean
    @return: the cluster's "maintain node health" ssconf flag; False if the
        ssconf data cannot be read (fail-safe: a configuration error must
        not trigger maintenance actions)

    """
    try:
      return ssconf.SimpleStore().GetMaintainNodeHealth()
    except errors.ConfigurationError, err:
      logging.error("Configuration error, not activating node maintenance: %s",
                    err)
      return False

  @staticmethod
  def GetRunningInstances():
    """Compute list of hypervisor/running instances.

    @rtype: list of tuples
    @return: (instance name, hypervisor name) pairs for every instance
        currently reported running by any enabled hypervisor; hypervisors
        that fail to answer are logged and skipped (best effort)

    """
    hyp_list = ssconf.SimpleStore().GetHypervisorList()
    results = []
    for hv_name in hyp_list:
      try:
        hv = hypervisor.GetHypervisor(hv_name)
        ilist = hv.ListInstances()
        results.extend([(iname, hv_name) for iname in ilist])
      except: # pylint: disable-msg=W0702
        # deliberate catch-all: a single broken hypervisor must not
        # prevent listing the others
        logging.error("Error while listing instances for hypervisor %s",
                      hv_name, exc_info=True)
    return results

  @staticmethod
  def GetUsedDRBDs():
    """Get list of used DRBD minors.

    @rtype: list
    @return: the minor numbers currently in use by DRBD on this node

    """
    return bdev.DRBD8.GetUsedDevs().keys()

  @classmethod
  def DoMaintenance(cls, role):
    """Maintain the instance list.

    If the node's cluster-assigned role is "offline", stop all running
    instances and shut down all active DRBD minors; for any other role,
    do nothing.

    @param role: this node's role as answered by confd
        (compared against constants.CONFD_NODE_ROLE_OFFLINE)

    """
    if role == constants.CONFD_NODE_ROLE_OFFLINE:
      inst_running = cls.GetRunningInstances()
      cls.ShutdownInstances(inst_running)
      drbd_running = cls.GetUsedDRBDs()
      cls.ShutdownDRBD(drbd_running)
    else:
      logging.debug("Not doing anything for role %s", role)

  @staticmethod
  def ShutdownInstances(inst_running):
    """Shutdown running instances.

    @param inst_running: list of (instance name, hypervisor name) tuples,
        as returned by L{GetRunningInstances}

    """
    names_running = set([i[0] for i in inst_running])
    if names_running:
      logging.info("Following instances should not be running,"
                   " shutting them down: %s", utils.CommaJoin(names_running))
      # this dictionary will collapse duplicate instance names (only
      # xen pvm/vhm) into a single key, which is fine
      i2h = dict(inst_running)
      for name in names_running:
        hv_name = i2h[name]
        hv = hypervisor.GetHypervisor(hv_name)
        # force=True: unconditional stop, since the cluster says this
        # node must not be running instances at all
        hv.StopInstance(None, force=True, name=name)

  @staticmethod
  def ShutdownDRBD(drbd_running):
    """Shutdown active DRBD devices.

    @param drbd_running: list of DRBD minor numbers to deactivate,
        as returned by L{GetUsedDRBDs}

    """
    if drbd_running:
      logging.info("Following DRBD minors should not be active,"
                   " shutting them down: %s", utils.CommaJoin(drbd_running))
      for minor in drbd_running:
        # pylint: disable-msg=W0212
        # using the private method as is, pending enhancements to the DRBD
        # interface
        bdev.DRBD8._ShutdownAll(minor)

  def Exec(self):
    """Check node status versus cluster desired state.

    Sends a synchronous "node role by name" query for this host to confd,
    and on a consistent, non-timed-out answer delegates to
    L{DoMaintenance}; on timeout or inconsistent replies it only logs a
    warning and takes no action.

    """
    my_name = utils.HostInfo().name
    req = confd_client.ConfdClientRequest(type=
                                          constants.CONFD_REQ_NODE_ROLE_BYNAME,
                                          query=my_name)
    # async=False: block until the request has been sent
    # (NOTE: 'async' as a kwarg name is Python 2 only)
    self.confd_client.SendRequest(req, async=False)
    timed_out, _, _ = self.confd_client.WaitForReply(req.rsalt)
    if not timed_out:
      # should have a valid response
      status, result = self.store_cb.GetResponse(req.rsalt)
      assert status, "Missing result but received replies"
      if not self.filter_cb.consistent[req.rsalt]:
        # confd daemons disagreed about the answer; acting on it could
        # wrongly shut down instances, so bail out
        logging.warning("Inconsistent replies, not doing anything")
        return
      self.DoMaintenance(result.server_reply.answer)
    else:
      logging.warning("Confd query timed out, cannot do maintenance actions")
|
112 | 227 |
class WatcherState(object): |
113 | 228 |
"""Interface to a state file recording restart attempts. |
114 | 229 |
|
... | ... | |
527 | 642 |
try: |
528 | 643 |
StartNodeDaemons() |
529 | 644 |
RunWatcherHooks() |
645 |
# run node maintenance in all cases, even if master, so that old |
|
646 |
# masters can be properly cleaned up too |
|
647 |
if NodeMaintenance.ShouldRun(): |
|
648 |
NodeMaintenance().Exec() |
|
530 | 649 |
|
531 | 650 |
notepad = WatcherState(statefile) |
532 | 651 |
try: |
Also available in: Unified diff