Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / state.py @ adf6301e

History | View | Annotate | Download (6.2 kB)

1
#
2
#
3

    
4
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Module keeping state for Ganeti watcher.
23

24
"""
25

    
26
import os
27
import time
28
import logging
29

    
30
from ganeti import utils
31
from ganeti import constants
32
from ganeti import serializer
33
from ganeti import errors
34

    
35

    
36
# Restart records older than this many seconds are dropped. With a retry
# limit of 5 and the watcher running every 5 minutes, the limit is exceeded
# after roughly half an hour; 8 hours (16 times that) seems like a
# reasonable time after which to start over.
RETRY_EXPIRATION = 8 * 60 * 60

# Keys used inside the per-node records of the state data
KEY_BOOT_ID = "bootid"

# Keys used inside the per-instance records of the state data
KEY_RESTART_WHEN = "restart_when"
KEY_RESTART_COUNT = "restart_count"
45

    
46

    
47
def OpenStateFile(path):
  """Opens the state file and acquires a lock on it.

  The file is created if it does not exist yet.

  @type path: string
  @param path: Path to state file
  @rtype: file or None
  @return: State file object opened in "w+" mode, or None if the lock could
      not be acquired (the failure is logged)

  """
  # The two-step dance below is necessary to allow both opening existing
  # file read/write and creating if not existing. Vanilla open will truncate
  # an existing file -or- allow creating if not existing.
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

  # Try to acquire lock on state file. If this fails, another watcher instance
  # might already be running or another program is temporarily blocking the
  # watcher from running.
  try:
    utils.LockFile(statefile_fd)
  except errors.LockError as err:
    logging.error("Can't acquire lock on state file %s: %s", path, err)
    # Nobody else knows about this descriptor, so it has to be released here;
    # otherwise it would stay open for the rest of the process lifetime.
    os.close(statefile_fd)
    return None

  return os.fdopen(statefile_fd, "w+")
69

    
70

    
71
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a dictionary with two sections: "instance" maps instance
  names to their restart bookkeeping (restart count and time of the last
  attempt), "node" maps node names to the last recorded boot ID.

  """
  def __init__(self, statefile):
    """Read and parse an already opened state file.

    An empty file yields empty state. A file that cannot be read or parsed,
    or that does not contain a dictionary, is treated as empty as well and a
    warning is logged.

    @type statefile: file
    @param statefile: State file object

    """
    self.statefile = statefile

    try:
      state_data = self.statefile.read()
      if state_data:
        loaded = serializer.Load(state_data)
        # A syntactically valid file can still hold something other than a
        # dictionary; reject it here so it is handled like any other
        # corruption instead of failing in a later lookup.
        if not isinstance(loaded, dict):
          raise TypeError("Expected a dictionary, got %s" %
                          type(loaded).__name__)
        self._data = loaded
      else:
        self._data = {}
    except Exception as msg: # pylint: disable-msg=W0703
      # Ignore errors while loading the file and treat it as empty
      self._data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    # Make sure both sections exist and are usable as dictionaries
    for section in ("instance", "node"):
      if not isinstance(self._data.get(section), dict):
        self._data[section] = {}

    # Snapshot used by Save() to detect whether anything has changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self, filename=None):
    """Write the state to disk if it changed since it was loaded.

    If the serialized state equals what was loaded, only the timestamps of
    the file are updated. Otherwise the state is written via
    L{utils.WriteFile} and the file object returned for it replaces the one
    held so far, which is then closed. Use L{Close} to release the state file
    once done.

    @type filename: string or None
    @param filename: Path to write to; defaults to
        L{constants.WATCHER_STATEFILE}

    """
    assert self.statefile

    if filename is None:
      filename = constants.WATCHER_STATEFILE

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(filename, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(filename,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)

    previous = self.statefile
    self.statefile = os.fdopen(fd, 'w+')
    # Only let go of the old handle once the new one is in place, and do so
    # explicitly instead of relying on garbage collection.
    previous.close()

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @type name: string
    @param name: Node name
    @return: The recorded boot ID, or None if nothing is recorded for the node

    """
    record = self._data["node"].get(name)
    if record is None:
      return None

    return record.get(KEY_BOOT_ID)

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @type name: string
    @param name: Node name
    @param bootid: Boot ID to record; must not be empty

    """
    assert bootid

    self._data["node"].setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    @type instance: L{Instance}
    @param instance: the instance to look up
    @rtype: int
    @return: Recorded number of attempts, 0 if there is no record

    """
    record = self._data["instance"].get(instance.name)
    if record is None:
      return 0

    # A record without a counter (e.g. from a damaged state file) is counted
    # the same way RecordRestartAttempt does: as zero attempts so far.
    return record.get(KEY_RESTART_COUNT, 0)

  def MaintainInstanceList(self, instances):
    """Perform maintenance on the recorded instances.

    Records of instances not contained in C{instances} are removed, as are
    records whose last restart attempt is more than L{RETRY_EXPIRATION}
    seconds in the past.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    idict = self._data["instance"]

    # First, delete obsolete instances
    for inst in set(idict).difference(instances):
      logging.debug("Forgetting obsolete instance %s", inst)
      del idict[inst]

    # Second, delete expired records. A record without a timestamp could
    # never expire, hence it is treated as expired right away.
    earliest = time.time() - RETRY_EXPIRATION
    expired_instances = [name for (name, record) in idict.items()
                         if record.get(KEY_RESTART_WHEN, 0) < earliest]
    for inst in expired_instances:
      logging.debug("Expiring record for instance %s", inst)
      del idict[inst]

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    The time of the attempt is set to the current time and the attempt
    counter of the instance is incremented, starting at 1 for a new record.

    @type instance: L{Instance}
    @param instance: the instance being restarted

    """
    inst = self._data["instance"].setdefault(instance.name, {})

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances). Removing an instance without a record is not an
    error.

    @type instance: L{Instance}
    @param instance: the instance to remove from books

    """
    self._data["instance"].pop(instance.name, None)