Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / state.py @ 653bc0f1

History | View | Annotate | Download (7.4 kB)

1
#
2
#
3

    
4
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Module keeping state for Ganeti watcher.
23

24
"""
25

    
26
import os
27
import time
28
import logging
29

    
30
from ganeti import utils
31
from ganeti import serializer
32
from ganeti import errors
33

    
34

    
35
# Lifetime, in seconds, of a recorded attempt. The retry counter is
# currently 5 and the watcher runs every 5 minutes, so exceeding the
# counter takes roughly half an hour; resetting after 8 hours (sixteen
# such half-hour periods) seems like a reasonable choice.
RETRY_EXPIRATION = 8 * 60 * 60

# Keys used inside the per-instance records of the state data
KEY_CLEANUP_COUNT = "cleanup_count"
KEY_CLEANUP_WHEN = "cleanup_when"
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"

# Key used inside the per-node records of the state data
KEY_BOOT_ID = "bootid"
46

    
47

    
48
def OpenStateFile(path):
  """Opens the state file and acquires a lock on it.

  @type path: string
  @param path: Path to state file
  @rtype: file or None
  @return: File object wrapping the locked state file, or None if the
      lock could not be acquired

  """
  # The two-step dance below (os.open followed by os.fdopen) is necessary to
  # allow both opening an existing file read/write and creating it if it does
  # not exist. Vanilla open will truncate an existing file -or- allow
  # creating if not existing.
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

  # Try to acquire lock on state file. If this fails, another watcher instance
  # might already be running or another program is temporarily blocking the
  # watcher from running.
  try:
    utils.LockFile(statefile_fd)
  except errors.LockError as err:
    logging.error("Can't acquire lock on state file %s: %s", path, err)
    # Nobody else knows about the descriptor, so it must be released here;
    # otherwise it would stay open for the rest of the process lifetime
    os.close(statefile_fd)
    return None

  return os.fdopen(statefile_fd, "w+")
70

    
71

    
72
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a dictionary with two sections: C{"instance"}, holding one
  record per instance with attempt counters and timestamps, and C{"node"},
  holding one record per node with its last seen boot ID.

  """
  def __init__(self, statefile):
    """Read and parse the contents of an already opened state file.

    Contents which can't be read or parsed, or which don't describe a
    dictionary, are logged and treated like an empty file.

    @type statefile: file
    @param statefile: State file object

    """
    self.statefile = statefile

    try:
      state_data = self.statefile.read()
      if not state_data:
        self._data = {}
      else:
        self._data = serializer.Load(state_data)
        if not isinstance(self._data, dict):
          # Syntactically valid data of the wrong shape (e.g. a list) would
          # otherwise make the section setup below fail
          raise TypeError("State data is of type '%s', expected a"
                          " dictionary" % type(self._data).__name__)
    except Exception as msg: # pylint: disable=W0703
      # Ignore errors while loading the file and treat it as empty
      self._data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    if "instance" not in self._data:
      self._data["instance"] = {}
    if "node" not in self._data:
      self._data["node"] = {}

    # Serialized snapshot used by Save to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self, filename):
    """Save state to file if it changed since it was loaded.

    If the data is unchanged, only the timestamps of C{filename} are
    updated. Otherwise the serialized state is written to C{filename} and
    the file object kept by this instance is replaced by one wrapping the
    newly written file. The file is not closed here; use L{Close} for that.

    @type filename: string
    @param filename: Path to the state file

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(filename, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(filename,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, "w+")

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @type name: string
    @param name: the name of the node to look up

    """
    return self._data["node"].get(name, {}).get(KEY_BOOT_ID)

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @type name: string
    @param name: the name of the node
    @param bootid: the boot ID to record; must not be empty

    """
    assert bootid

    self._data["node"].setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance_name):
    """Returns number of previous restart attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @rtype: int
    @return: the recorded number of restart attempts, 0 if none

    """
    # A record may exist without a restart counter if only cleanup attempts
    # were recorded for the instance so far
    record = self._data["instance"].get(instance_name, {})
    return record.get(KEY_RESTART_COUNT, 0)

  def NumberOfCleanupAttempts(self, instance_name):
    """Returns number of previous cleanup attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @rtype: int
    @return: the recorded number of cleanup attempts, 0 if none

    """
    # A record may exist without a cleanup counter if only restart attempts
    # were recorded for the instance so far
    record = self._data["instance"].get(instance_name, {})
    return record.get(KEY_CLEANUP_COUNT, 0)

  def MaintainInstanceList(self, instances):
    """Perform maintenance on the recorded instances.

    Records of instances which are not in C{instances} and records whose
    last attempt is older than L{RETRY_EXPIRATION} seconds are dropped.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    idict = self._data["instance"]

    # First, delete obsolete instances
    obsolete_instances = set(idict).difference(instances)
    for inst in obsolete_instances:
      logging.debug("Forgetting obsolete instance %s", inst)
      idict.pop(inst, None)

    # Second, delete expired records
    earliest = time.time() - RETRY_EXPIRATION
    expired_instances = []
    for inst in idict:
      record = idict[inst]
      # The restart timestamp decides, as before. Records created by cleanup
      # attempts only don't have one, so their cleanup timestamp is used;
      # records without any timestamp are considered expired.
      last_attempt = record.get(KEY_RESTART_WHEN,
                                record.get(KEY_CLEANUP_WHEN, 0))
      if last_attempt < earliest:
        expired_instances.append(inst)

    # Removal is done separately to not modify the dictionary while iterating
    for inst in expired_instances:
      logging.debug("Expiring record for instance %s", inst)
      idict.pop(inst, None)

  @staticmethod
  def _RecordAttempt(instances, instance_name, key_when, key_count):
    """Record an event.

    The record for the instance is created if necessary, its timestamp is
    set to the current time and its counter is incremented by one.

    @type instances: dict
    @param instances: contains instance data indexed by instance_name

    @type instance_name: string
    @param instance_name: name of the instance involved in the event

    @type key_when: string
    @param key_when: dict key for the information for when the event occurred

    @type key_count: string
    @param key_count: dict key for the information for how many times
                      the event occurred

    """
    instance = instances.setdefault(instance_name, {})
    instance[key_when] = time.time()
    instance[key_count] = instance.get(key_count, 0) + 1

  def RecordRestartAttempt(self, instance_name):
    """Record a restart attempt.

    @type instance_name: string
    @param instance_name: the name of the instance being restarted

    """
    self._RecordAttempt(self._data["instance"], instance_name,
                        KEY_RESTART_WHEN, KEY_RESTART_COUNT)

  def RecordCleanupAttempt(self, instance_name):
    """Record a cleanup attempt.

    @type instance_name: string
    @param instance_name: the name of the instance being cleaned up

    """
    self._RecordAttempt(self._data["instance"], instance_name,
                        KEY_CLEANUP_WHEN, KEY_CLEANUP_COUNT)

  def RemoveInstance(self, instance_name):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track instances which are down). Unknown names are ignored.

    @type instance_name: string
    @param instance_name: the name of the instance to remove from books

    """
    self._data["instance"].pop(instance_name, None)