root / snf-cyclades-app / synnefo / logic / management / commands / reconcile-servers.py @ e77a29ab
History | View | Annotate | Download (13.1 kB)
1 |
# Copyright 2011-2012 GRNET S.A. All rights reserved.
|
---|---|
2 |
#
|
3 |
# Redistribution and use in source and binary forms, with or without
|
4 |
# modification, are permitted provided that the following conditions
|
5 |
# are met:
|
6 |
#
|
7 |
# 1. Redistributions of source code must retain the above copyright
|
8 |
# notice, this list of conditions and the following disclaimer.
|
9 |
#
|
10 |
# 2. Redistributions in binary form must reproduce the above copyright
|
11 |
# notice, this list of conditions and the following disclaimer in the
|
12 |
# documentation and/or other materials provided with the distribution.
|
13 |
#
|
14 |
# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
15 |
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
16 |
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
17 |
# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
18 |
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
19 |
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
20 |
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
21 |
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
22 |
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
23 |
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
24 |
# SUCH DAMAGE.
|
25 |
#
|
26 |
# The views and conclusions contained in the software and documentation are
|
27 |
# those of the authors and should not be interpreted as representing official
|
28 |
# policies, either expressed or implied, of GRNET S.A.
|
29 |
#
|
30 |
"""Reconciliation management command
|
31 |
|
32 |
Management command to reconcile the contents of the Synnefo DB with
|
33 |
the state of the Ganeti backend. See docstring on top of
|
34 |
logic/reconciliation.py for a description of reconciliation rules.
|
35 |
|
36 |
"""
|
37 |
import sys |
38 |
import datetime |
39 |
|
40 |
from optparse import make_option |
41 |
|
42 |
from django.core.management.base import BaseCommand, CommandError |
43 |
|
44 |
from synnefo.db.models import (Backend, VirtualMachine, Network, |
45 |
pooled_rapi_client) |
46 |
from synnefo.logic import reconciliation, utils |
47 |
from synnefo.logic import backend as backend_mod |
48 |
from synnefo.util.mac2eui64 import mac2eui64 |
49 |
from synnefo.management.common import get_backend |
50 |
|
51 |
|
52 |
class Command(BaseCommand):
    """Detect and optionally fix mismatches between the Synnefo DB and Ganeti.

    Every ``--detect-*`` flag enables one check; the matching ``--fix-*``
    flag applies the corresponding repair. When no ``--detect-*`` flag is
    given, all checks are enabled. Section headers and status lines are
    written to stderr, the per-server details to stdout.
    """
    can_import_settings = True

    help = 'Reconcile contents of Synnefo DB with state of Ganeti backend'
    # The management command runs inside an SQL transaction
    output_transaction = True
    option_list = BaseCommand.option_list + (
        make_option('--detect-stale', action='store_true',
                    dest='detect_stale', default=False,
                    help='Detect stale VM entries in DB'),
        make_option('--detect-orphans', action='store_true',
                    dest='detect_orphans', default=False,
                    help='Detect orphan instances in Ganeti'),
        make_option('--detect-unsynced', action='store_true',
                    dest='detect_unsynced', default=False,
                    help='Detect unsynced operstate between ' +
                         'DB and Ganeti'),
        make_option('--detect-build-errors', action='store_true',
                    dest='detect_build_errors', default=False,
                    help='Detect instances with build error'),
        make_option('--detect-unsynced-nics', action='store_true',
                    dest='detect_unsynced_nics', default=False,
                    help='Detect unsynced nics between DB and Ganeti'),
        make_option('--detect-all', action='store_true',
                    dest='detect_all', default=False,
                    help='Enable all --detect-* arguments'),
        make_option('--fix-stale', action='store_true',
                    dest='fix_stale', default=False,
                    help='Fix (remove) stale DB entries in DB'),
        make_option('--fix-orphans', action='store_true',
                    dest='fix_orphans', default=False,
                    help='Fix (remove) orphan Ganeti VMs'),
        make_option('--fix-unsynced', action='store_true',
                    dest='fix_unsynced', default=False,
                    help='Fix server operstate in DB, set ' +
                         'from Ganeti'),
        make_option('--fix-build-errors', action='store_true',
                    dest='fix_build_errors', default=False,
                    help='Fix (remove) instances with build errors'),
        make_option('--fix-unsynced-nics', action='store_true',
                    dest='fix_unsynced_nics', default=False,
                    help='Fix unsynced nics between DB and Ganeti'),
        make_option('--fix-all', action='store_true',
                    dest='fix_all', default=False,
                    help='Enable all --fix-* arguments'),
        make_option('--backend-id', default=None, dest='backend-id',
                    help='Reconcilie VMs only for this backend'),
    )

    @staticmethod
    def _out(text):
        """Write `text` followed by a newline to stdout."""
        sys.stdout.write(text + "\n")

    @staticmethod
    def _err(text):
        """Write `text` followed by a newline to stderr."""
        sys.stderr.write(text + "\n")

    @staticmethod
    def _simulate_op(vm, opcode, status):
        """Feed a simulated Ganeti job notification for `vm` to the backend."""
        backend_mod.process_op_status(
            vm=vm, etime=datetime.datetime.now(), jobid=0,
            opcode=opcode, status=status,
            logmsg='Reconciliation: simulated Ganeti event')

    def _process_args(self, options):
        """Normalize the detect_*/fix_* entries of `options` in place.

        * If no detect_* entry is set, ``detect_all`` is switched on.
        * ``detect_all`` / ``fix_all`` switch on every detect_* / fix_* entry.
        * Every enabled fix_* entry needs its detect_* counterpart.

        Raises:
            CommandError: a fix_* entry is set while the matching detect_*
                entry is not.
        """
        keys_detect = [k for k in options.keys() if k.startswith('detect_')]
        keys_fix = [k for k in options.keys() if k.startswith('fix_')]

        if not any(options[k] for k in keys_detect):
            options['detect_all'] = True

        if options['detect_all']:
            for kd in keys_detect:
                options[kd] = True
        if options['fix_all']:
            for kf in keys_fix:
                options[kf] = True

        for kf in keys_fix:
            kd = kf.replace('fix_', 'detect_', 1)
            if options[kf] and not options[kd]:
                # Report the flags as the user types them (hyphenated),
                # not the internal dest names (underscored).
                raise CommandError("Cannot use --%s without corresponding "
                                   "--%s argument" % (kf.replace('_', '-'),
                                                      kd.replace('_', '-')))

    def _print_nics(self, nics):
        """Print one indented line per NIC, or 'None' when there are none."""
        pad = ' ' * 18
        if not nics:
            self._out(pad + 'None')
            return
        for index, info in nics.items():
            self._out(pad + 'nic/' + str(index) +
                      ': MAC: %s, IP: %s, Network: %s' %
                      (info['mac'], info['ipv4'], info['network']))

    def _fix_stale(self, stale):
        """Mark stale DB servers as removed via a simulated Ganeti event."""
        self._err("Simulating successful Ganeti removal for %d "
                  "servers in the DB:" % len(stale))
        for vm in VirtualMachine.objects.filter(pk__in=stale):
            self._simulate_op(vm, 'OP_INSTANCE_REMOVE', 'success')
        self._err(" ...done")

    def _fix_orphans(self, orphans):
        """Ask Ganeti to delete every orphan instance that has a DB row."""
        self._err("Issuing OP_INSTANCE_REMOVE for %d Ganeti instances:" %
                  len(orphans))
        for vm_id in orphans:
            # NOTE(review): the DB row is only used to obtain a RAPI client.
            # If orphans by definition have no DB row, this lookup always
            # fails and nothing is ever deleted -- confirm against
            # reconciliation.orphan_instances_in_ganeti.
            try:
                vm = VirtualMachine.objects.get(pk=vm_id)
            except VirtualMachine.DoesNotExist:
                self._err("No entry for VM %d in DB !!" % vm_id)
            else:
                with pooled_rapi_client(vm) as client:
                    client.DeleteInstance(utils.id_to_instance_name(vm_id))
        self._err(" ...done")

    def _fix_unsynced(self, unsynced):
        """Bring the DB operstate in line with what Ganeti reports."""
        self._err("Setting the state of %d out-of-sync VMs:" % len(unsynced))
        for vm_id, db_state, ganeti_up in unsynced:
            vm = VirtualMachine.objects.get(pk=vm_id)
            if ganeti_up:
                opcode = "OP_INSTANCE_REBOOT"
            else:
                opcode = "OP_INSTANCE_SHUTDOWN"
            self._simulate_op(vm, opcode, 'success')
        self._err(" ...done")

    def _fix_build_errors(self, build_errors):
        """Record a failed OP_INSTANCE_CREATE for servers with build errors."""
        self._err("Setting the state of %d build-errors VMs:" %
                  len(build_errors))
        for vm_id in build_errors:
            vm = VirtualMachine.objects.get(pk=vm_id)
            self._simulate_op(vm, "OP_INSTANCE_CREATE", 'error')
        self._err(" ...done")

    def _fix_unsynced_nics(self, unsynced_nics):
        """Overwrite the DB NICs of each listed server with Ganeti's NICs."""
        self._err("Setting the nics of %d out-of-sync VMs:" %
                  len(unsynced_nics))
        for vm_id, nics in unsynced_nics.items():
            vm = VirtualMachine.objects.get(pk=vm_id)
            ganeti_nics = nics[1]  # Ganeti nics
            if ganeti_nics == {}:  # No nics
                # all() must be called; the bare attribute is a bound method
                # and has no delete().
                vm.nics.all().delete()
                continue

            # Dict to list, ordered by nic index
            final_nics = []
            for index in sorted(ganeti_nics):
                nic = ganeti_nics[index]
                # Filter out network-less NICs before any lookup: there is
                # no network to resolve for them.
                if not nic['network']:
                    self._out('Network of nic %d of vm %s is None. '
                              'Can not reconcile' % (index, vm.backend_vm_id))
                    continue
                net_id = utils.id_from_network_name(nic['network'])
                subnet6 = Network.objects.get(id=net_id).subnet6
                # Produce ipv6
                if subnet6:
                    nic['ipv6'] = mac2eui64(nic['mac'], subnet6)
                else:
                    nic['ipv6'] = None
                # Rename ipv4 to ip
                nic['ip'] = nic['ipv4']
                final_nics.append(nic)

            backend_mod.process_net_status(vm=vm,
                                           etime=datetime.datetime.now(),
                                           nics=final_nics)
        self._err(" ...done")

    def handle(self, **options):
        """Run the enabled detection checks, then the enabled fixes.

        Uses only the backend given by ``--backend-id`` when set, otherwise
        every backend with ``offline=False``.

        Raises:
            CommandError: propagated from `_process_args` when a --fix-*
                flag lacks its --detect-* counterpart.
        """
        verbosity = int(options['verbosity'])
        self._process_args(options)
        backend_id = options['backend-id']
        if backend_id:
            backends = [get_backend(backend_id)]
        else:
            backends = Backend.objects.filter(offline=False)

        D = reconciliation.get_servers_from_db(backends)
        G, GNics = reconciliation.get_instances_from_ganeti(backends)

        DBNics = reconciliation.get_nics_from_db(backends)

        # Empty defaults, so the fix phase is well defined even for a check
        # that did not run.
        stale = orphans = unsynced = build_errors = ()
        unsynced_nics = {}

        #
        # Detect problems
        #
        if options['detect_stale']:
            stale = reconciliation.stale_servers_in_db(D, G)
            if len(stale) > 0:
                self._err("Found the following stale server IDs: ")
                self._out(" " + "\n ".join([str(x) for x in stale]))
            elif verbosity == 2:
                self._err("Found no stale server IDs in DB.")

        if options['detect_orphans']:
            orphans = reconciliation.orphan_instances_in_ganeti(D, G)
            if len(orphans) > 0:
                self._err("Found orphan Ganeti instances with IDs: ")
                self._out(" " + "\n ".join([str(x) for x in orphans]))
            elif verbosity == 2:
                self._err("Found no orphan Ganeti instances.")

        if options['detect_unsynced']:
            unsynced = reconciliation.unsynced_operstate(D, G)
            if len(unsynced) > 0:
                self._err("The operstate of the following server"
                          " IDs is out-of-sync:")
                self._out(" " + "\n ".join(
                    ["%d is %s in DB, %s in Ganeti" %
                     (x[0], x[1], ('UP' if x[2] else 'DOWN'))
                     for x in unsynced]))
            elif verbosity == 2:
                self._err("The operstate of all servers is in sync.")

        if options['detect_build_errors']:
            build_errors = reconciliation.instances_with_build_errors(D, G)
            if len(build_errors) > 0:
                self._err("The os for the following server IDs was not build"
                          " successfully:")
                self._out(" " + "\n ".join(["%d" % x for x in build_errors]))
            elif verbosity == 2:
                self._err("Found no instances with build errors.")

        if options['detect_unsynced_nics']:
            unsynced_nics = reconciliation.unsynced_nics(DBNics, GNics)
            if len(unsynced_nics) > 0:
                self._err("The NICs of the servers with the following IDs are"
                          " unsynced:")
                for vm_id, nics in unsynced_nics.items():
                    self._out(' ' * 2 + '%6d:' % vm_id)
                    self._out(' ' * 8 + '%8s:' % 'DB')
                    self._print_nics(nics[0])
                    self._out(' ' * 8 + '%8s:' % 'Ganeti')
                    self._print_nics(nics[1])
            elif verbosity == 2:
                self._err("All instance nics are synced.")

        #
        # Then fix them
        #
        if options['fix_stale'] and len(stale) > 0:
            self._fix_stale(stale)

        if options['fix_orphans'] and len(orphans) > 0:
            self._fix_orphans(orphans)

        if options['fix_unsynced'] and len(unsynced) > 0:
            self._fix_unsynced(unsynced)

        if options['fix_build_errors'] and len(build_errors) > 0:
            self._fix_build_errors(build_errors)

        if options['fix_unsynced_nics'] and len(unsynced_nics) > 0:
            self._fix_unsynced_nics(unsynced_nics)