Revision 50273051 daemons/ganeti-watcher

b/daemons/ganeti-watcher
44 44
from ganeti import opcodes
45 45
from ganeti import cli
46 46
from ganeti import luxi
47
from ganeti import ssconf
48
from ganeti import bdev
49
from ganeti import hypervisor
50
from ganeti.confd import client as confd_client
47 51

  
48 52

  
49 53
# NOTE(review): presumably the maximum number of restart attempts the watcher
# makes; its uses are outside this excerpt -- TODO confirm.
MAXTRIES = 5
......
109 113
                      runresult.output)
110 114

  
111 115

  
116
class NodeMaintenance(object):
117
  """Talks to confd daemons and possible shutdown instances/drbd devices.
118

  
119
  """
120
  def __init__(self):
121
    self.store_cb = confd_client.StoreResultCallback()
122
    self.filter_cb = confd_client.ConfdFilterCallback(self.store_cb)
123
    self.confd_client = confd_client.GetConfdClient(self.filter_cb)
124

  
125
  @staticmethod
126
  def ShouldRun():
127
    """Checks whether node maintenance should run.
128

  
129
    """
130
    try:
131
      return ssconf.SimpleStore().GetMaintainNodeHealth()
132
    except errors.ConfigurationError, err:
133
      logging.error("Configuration error, not activating node maintenance: %s",
134
                    err)
135
      return False
136

  
137
  @staticmethod
138
  def GetRunningInstances():
139
    """Compute list of hypervisor/running instances.
140

  
141
    """
142
    hyp_list = ssconf.SimpleStore().GetHypervisorList()
143
    results = []
144
    for hv_name in hyp_list:
145
      try:
146
        hv = hypervisor.GetHypervisor(hv_name)
147
        ilist = hv.ListInstances()
148
        results.extend([(iname, hv_name) for iname in ilist])
149
      except: # pylint: disable-msg=W0702
150
        logging.error("Error while listing instances for hypervisor %s",
151
                      hv_name, exc_info=True)
152
    return results
153

  
154
  @staticmethod
155
  def GetUsedDRBDs():
156
    """Get list of used DRBD minors.
157

  
158
    """
159
    return bdev.DRBD8.GetUsedDevs().keys()
160

  
161
  @classmethod
162
  def DoMaintenance(cls, role):
163
    """Maintain the instance list.
164

  
165
    """
166
    if role == constants.CONFD_NODE_ROLE_OFFLINE:
167
      inst_running = cls.GetRunningInstances()
168
      cls.ShutdownInstances(inst_running)
169
      drbd_running = cls.GetUsedDRBDs()
170
      cls.ShutdownDRBD(drbd_running)
171
    else:
172
      logging.debug("Not doing anything for role %s", role)
173

  
174
  @staticmethod
175
  def ShutdownInstances(inst_running):
176
    """Shutdown running instances.
177

  
178
    """
179
    names_running = set([i[0] for i in inst_running])
180
    if names_running:
181
      logging.info("Following instances should not be running,"
182
                   " shutting them down: %s", utils.CommaJoin(names_running))
183
      # this dictionary will collapse duplicate instance names (only
184
      # xen pvm/vhm) into a single key, which is fine
185
      i2h = dict(inst_running)
186
      for name in names_running:
187
        hv_name = i2h[name]
188
        hv = hypervisor.GetHypervisor(hv_name)
189
        hv.StopInstance(None, force=True, name=name)
190

  
191
  @staticmethod
192
  def ShutdownDRBD(drbd_running):
193
    """Shutdown active DRBD devices.
194

  
195
    """
196
    if drbd_running:
197
      logging.info("Following DRBD minors should not be active,"
198
                   " shutting them down: %s", utils.CommaJoin(drbd_running))
199
      for minor in drbd_running:
200
        # pylint: disable-msg=W0212
201
        # using the private method as is, pending enhancements to the DRBD
202
        # interface
203
        bdev.DRBD8._ShutdownAll(minor)
204

  
205
  def Exec(self):
206
    """Check node status versus cluster desired state.
207

  
208
    """
209
    my_name = utils.HostInfo().name
210
    req = confd_client.ConfdClientRequest(type=
211
                                          constants.CONFD_REQ_NODE_ROLE_BYNAME,
212
                                          query=my_name)
213
    self.confd_client.SendRequest(req, async=False)
214
    timed_out, _, _ = self.confd_client.WaitForReply(req.rsalt)
215
    if not timed_out:
216
      # should have a valid response
217
      status, result = self.store_cb.GetResponse(req.rsalt)
218
      assert status, "Missing result but received replies"
219
      if not self.filter_cb.consistent[req.rsalt]:
220
        logging.warning("Inconsistent replies, not doing anything")
221
        return
222
      self.DoMaintenance(result.server_reply.answer)
223
    else:
224
      logging.warning("Confd query timed out, cannot do maintenance actions")
225

  
226

  
112 227
class WatcherState(object):
113 228
  """Interface to a state file recording restart attempts.
114 229

  
......
527 642
  try:
528 643
    StartNodeDaemons()
529 644
    RunWatcherHooks()
645
    # run node maintenance in all cases, even if master, so that old
646
    # masters can be properly cleaned up too
647
    if NodeMaintenance.ShouldRun():
648
      NodeMaintenance().Exec()
530 649

  
531 650
    notepad = WatcherState(statefile)
532 651
    try:

Also available in: Unified diff