Revision 9ca87fb3 lib/watcher/__init__.py

b/lib/watcher/__init__.py
27 27

  
28 28
"""
29 29

  
30
# pylint: disable-msg=C0103,W0142
31

  
32
# C0103: Invalid name ganeti-watcher
33

  
34 30
import os
35 31
import os.path
36 32
import sys
......
46 42
from ganeti import opcodes
47 43
from ganeti import cli
48 44
from ganeti import luxi
49
from ganeti import ssconf
50
from ganeti import bdev
51
from ganeti import hypervisor
52 45
from ganeti import rapi
53
from ganeti.confd import client as confd_client
54 46
from ganeti import netutils
55 47

  
56 48
import ganeti.rapi.client # pylint: disable-msg=W0611
49
import ganeti.watcher.nodemaint # pylint: disable-msg=W0611
57 50

  
58 51

  
59 52
MAXTRIES = 5
......
125 118
                      runresult.output)
126 119

  
127 120

  
128
class NodeMaintenance(object):
129
  """Talks to confd daemons and possible shutdown instances/drbd devices.
130

  
131
  """
132
  def __init__(self):
133
    self.store_cb = confd_client.StoreResultCallback()
134
    self.filter_cb = confd_client.ConfdFilterCallback(self.store_cb)
135
    self.confd_client = confd_client.GetConfdClient(self.filter_cb)
136

  
137
  @staticmethod
138
  def ShouldRun():
139
    """Checks whether node maintenance should run.
140

  
141
    """
142
    try:
143
      return ssconf.SimpleStore().GetMaintainNodeHealth()
144
    except errors.ConfigurationError, err:
145
      logging.error("Configuration error, not activating node maintenance: %s",
146
                    err)
147
      return False
148

  
149
  @staticmethod
150
  def GetRunningInstances():
151
    """Compute list of hypervisor/running instances.
152

  
153
    """
154
    hyp_list = ssconf.SimpleStore().GetHypervisorList()
155
    results = []
156
    for hv_name in hyp_list:
157
      try:
158
        hv = hypervisor.GetHypervisor(hv_name)
159
        ilist = hv.ListInstances()
160
        results.extend([(iname, hv_name) for iname in ilist])
161
      except: # pylint: disable-msg=W0702
162
        logging.error("Error while listing instances for hypervisor %s",
163
                      hv_name, exc_info=True)
164
    return results
165

  
166
  @staticmethod
167
  def GetUsedDRBDs():
168
    """Get list of used DRBD minors.
169

  
170
    """
171
    return bdev.DRBD8.GetUsedDevs().keys()
172

  
173
  @classmethod
174
  def DoMaintenance(cls, role):
175
    """Maintain the instance list.
176

  
177
    """
178
    if role == constants.CONFD_NODE_ROLE_OFFLINE:
179
      inst_running = cls.GetRunningInstances()
180
      cls.ShutdownInstances(inst_running)
181
      drbd_running = cls.GetUsedDRBDs()
182
      cls.ShutdownDRBD(drbd_running)
183
    else:
184
      logging.debug("Not doing anything for role %s", role)
185

  
186
  @staticmethod
187
  def ShutdownInstances(inst_running):
188
    """Shutdown running instances.
189

  
190
    """
191
    names_running = set([i[0] for i in inst_running])
192
    if names_running:
193
      logging.info("Following instances should not be running,"
194
                   " shutting them down: %s", utils.CommaJoin(names_running))
195
      # this dictionary will collapse duplicate instance names (only
196
      # xen pvm/vhm) into a single key, which is fine
197
      i2h = dict(inst_running)
198
      for name in names_running:
199
        hv_name = i2h[name]
200
        hv = hypervisor.GetHypervisor(hv_name)
201
        hv.StopInstance(None, force=True, name=name)
202

  
203
  @staticmethod
204
  def ShutdownDRBD(drbd_running):
205
    """Shutdown active DRBD devices.
206

  
207
    """
208
    if drbd_running:
209
      logging.info("Following DRBD minors should not be active,"
210
                   " shutting them down: %s", utils.CommaJoin(drbd_running))
211
      for minor in drbd_running:
212
        # pylint: disable-msg=W0212
213
        # using the private method as is, pending enhancements to the DRBD
214
        # interface
215
        bdev.DRBD8._ShutdownAll(minor)
216

  
217
  def Exec(self):
218
    """Check node status versus cluster desired state.
219

  
220
    """
221
    my_name = netutils.Hostname.GetSysName()
222
    req = confd_client.ConfdClientRequest(type=
223
                                          constants.CONFD_REQ_NODE_ROLE_BYNAME,
224
                                          query=my_name)
225
    self.confd_client.SendRequest(req, async=False, coverage=-1)
226
    timed_out, _, _ = self.confd_client.WaitForReply(req.rsalt)
227
    if not timed_out:
228
      # should have a valid response
229
      status, result = self.store_cb.GetResponse(req.rsalt)
230
      assert status, "Missing result but received replies"
231
      if not self.filter_cb.consistent[req.rsalt]:
232
        logging.warning("Inconsistent replies, not doing anything")
233
        return
234
      self.DoMaintenance(result.server_reply.answer)
235
    else:
236
      logging.warning("Confd query timed out, cannot do maintenance actions")
237

  
238

  
239 121
class WatcherState(object):
240 122
  """Interface to a state file recording restart attempts.
241 123

  
......
756 638
    RunWatcherHooks()
757 639
    # run node maintenance in all cases, even if master, so that old
758 640
    # masters can be properly cleaned up too
759
    if NodeMaintenance.ShouldRun():
760
      NodeMaintenance().Exec()
641
    if nodemaint.NodeMaintenance.ShouldRun(): # pylint: disable-msg=E0602
642
      nodemaint.NodeMaintenance().Exec() # pylint: disable-msg=E0602
761 643

  
762 644
    notepad = WatcherState(statefile)
763 645
    try:

Also available in: Unified diff