Statistics
| Branch: | Tag: | Revision:

root / lib / watcher / state.py @ 40b068e5

History | View | Annotate | Download (6.1 kB)

1
#
2
#
3

    
4
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Module keeping state for Ganeti watcher.
23

24
"""
25

    
26
import os
27
import time
28
import logging
29

    
30
from ganeti import utils
31
from ganeti import serializer
32
from ganeti import errors
33

    
34

    
35
# Delete any record that is older than 8 hours; this value is based on
36
# the fact that the current retry counter is 5, and watcher runs every
37
# 5 minutes, so it takes around half an hour to exceed the retry
38
# counter, so 8 hours (16*1/2h) seems like a reasonable reset time
39
RETRY_EXPIRATION = 8 * 3600  # seconds; compared against time.time() values
40

    
41
# Key in a per-instance record: number of restart attempts recorded so far
KEY_RESTART_COUNT = "restart_count"
42
# Key in a per-instance record: time.time() value of the last recorded attempt
KEY_RESTART_WHEN = "restart_when"
43
# Key in a per-node record: the boot ID last stored for that node
KEY_BOOT_ID = "bootid"
44

    
45

    
46
def OpenStateFile(path):
  """Opens the state file and acquires a lock on it.

  The file is created if it doesn't exist yet.

  @type path: string
  @param path: Path to state file
  @rtype: file or None
  @return: File object (mode "w+") wrapping the locked state file, or None
    if the lock could not be acquired; the failure is logged

  """
  # The two-step dance below is necessary to allow both opening existing
  # file read/write and creating if not existing. Vanilla open will truncate
  # an existing file -or- allow creating if not existing.
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

  # Try to acquire lock on state file. If this fails, another watcher instance
  # might already be running or another program is temporarily blocking the
  # watcher from running.
  try:
    utils.LockFile(statefile_fd)
  except errors.LockError as err:
    logging.error("Can't acquire lock on state file %s: %s", path, err)
    # We're giving up on the file, so don't leak the descriptor
    os.close(statefile_fd)
    return None

  return os.fdopen(statefile_fd, "w+")
68

    
69

    
70
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is kept as a dictionary with two sections: "instance", holding
  restart bookkeeping per instance name, and "node", holding the boot ID per
  node name.

  """
  def __init__(self, statefile):
    """Read and parse the (already opened) state file.

    An empty file, a file that can't be read or parsed, and a file whose
    content is not a dictionary are all treated as an empty state.

    @type statefile: file
    @param statefile: State file object

    """
    self.statefile = statefile

    try:
      state_data = self.statefile.read()
      if state_data:
        loaded = serializer.Load(state_data)
      else:
        loaded = {}
      # Well-formed content of the wrong type (e.g. a list) can't be used as
      # state either; route it through the same fallback as a parse error
      if not isinstance(loaded, dict):
        raise TypeError("expected a dictionary, got %s" %
                        type(loaded).__name__)
      self._data = loaded
    except Exception as msg: # pylint: disable=W0703
      # Ignore errors while loading the file and treat it as empty
      self._data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    if "instance" not in self._data:
      self._data["instance"] = {}
    if "node" not in self._data:
      self._data["node"] = {}

    # Serialized snapshot used by Save() to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self, filename):
    """Save state to file.

    If the state didn't change since it was loaded (or last saved), only the
    timestamps of the file are updated. Otherwise the serialized state is
    written through C{utils.WriteFile} and the descriptor it returns replaces
    the current state file object. The file is not closed here; use L{Close}
    for that.

    @type filename: string
    @param filename: Path of the state file

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(filename, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(filename,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, "w+")

    # What was just written is the new baseline for change detection
    self._orig_data = serialized_form

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @type name: string
    @param name: the name of the node to look up

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @type name: string
    @param name: the name of the node
    @param bootid: the boot ID to store; must not be empty

    """
    assert bootid

    ndata = self._data["node"]

    ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance_name):
    """Returns number of previous restart attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @rtype: int
    @return: the recorded number of attempts; 0 if there is no record for
      the instance or the record carries no counter

    """
    idata = self._data["instance"]

    if instance_name in idata:
      # A record without counter is read as "no attempts", the same default
      # RecordRestartAttempt uses when incrementing
      return idata[instance_name].get(KEY_RESTART_COUNT, 0)

    return 0

  def MaintainInstanceList(self, instances):
    """Perform maintenance on the recorded instances.

    Records of instances that no longer exist are dropped, and so are
    records whose last restart attempt is older than RETRY_EXPIRATION.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    idict = self._data["instance"]

    # First, delete obsolete instances
    obsolete_instances = set(idict).difference(instances)
    for inst in obsolete_instances:
      logging.debug("Forgetting obsolete instance %s", inst)
      idict.pop(inst, None)

    # Second, delete expired records; a record without timestamp can never
    # be aged properly, so it's expired right away instead of failing
    earliest = time.time() - RETRY_EXPIRATION
    expired_instances = [name for (name, record) in idict.items()
                         if record.get(KEY_RESTART_WHEN, 0) < earliest]
    for inst in expired_instances:
      logging.debug("Expiring record for instance %s", inst)
      idict.pop(inst, None)

  def RecordRestartAttempt(self, instance_name):
    """Record a restart attempt.

    Stores the current time and increments the attempt counter, creating
    the record for the instance if necessary.

    @type instance_name: string
    @param instance_name: the name of the instance being restarted

    """
    idata = self._data["instance"]

    inst = idata.setdefault(instance_name, {})
    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance_name):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances). Unknown names are silently ignored.

    @type instance_name: string
    @param instance_name: the name of the instance to remove from books

    """
    idata = self._data["instance"]

    idata.pop(instance_name, None)