1 /*
2 * Copyright (c) 1998-2002 The Jgroup Team.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16 *
17 */
18
19 package jgroup.arm.recovery;
20
21 import java.util.HashMap;
22 import java.util.Map;
23 import java.util.Set;
24
25 import jgroup.relacs.config.Host;
26 import jgroup.relacs.config.HostSet;
27 import jgroup.relacs.config.Host.ReplicaState;
28
29 import org.apache.log4j.Logger;
30
31
32
33 /**
34 * If the current number of replicas falls below the minimal redundancy
35 * specified for this replica, and the number of available hosts in
36 * this partition is greater than the minimal redundancy then create
37 * new replicas to compensate for the lost redundancy.
38 *
39 * @author Hein Meling
40 * @since Jgroup 1.2
41 */
42 public class KeepMinimalInPartition
43 extends AbstractRecoveryStrategy
44 {
45
46 ////////////////////////////////////////////////////////////////////////////////////////////
47 // Logger
48 ////////////////////////////////////////////////////////////////////////////////////////////
49
50 /** Obtain logger for this class */
51 private static final Logger log = Logger.getLogger(KeepMinimalInPartition.class);
52
53
54 ////////////////////////////////////////////////////////////////////////////////////////////
55 // Constants
56 ////////////////////////////////////////////////////////////////////////////////////////////
57
58 private static final int RETRY_LIMIT = 3;
59
60
61 ////////////////////////////////////////////////////////////////////////////////////////////
62 // Fields
63 ////////////////////////////////////////////////////////////////////////////////////////////
64
65 /** Map of number of retries for a given host */
66 private final Map<Host, Integer> hostRetries = new HashMap<Host, Integer>();
67
68 /** The host on which a replacement replica has been installed (pending join) */
69 private Host recoveredHost;
70
71 /** The set of hosts in the current view */
72 private HostSet viewHosts;
73
74
75 ////////////////////////////////////////////////////////////////////////////////////////////
76 // RecoveryStrategy interface methods (those not implemented in AbstractRecoveryStrategy)
77 ////////////////////////////////////////////////////////////////////////////////////////////
78
79 /**
80 * FIXME UPDATE DOCUMENTATION:
81 * The <code>KeepMinimalInPartition</code> implementation simply
82 * assume that the servers do not maintain any state or has some
83 * other means of recovering their state, and tries to relocate
84 * the replicas to a new set of hosts.
85 *
86 * @param hosts
87 * The set of hosts on which the application was running,
88 * but has now failed.
89 *
90 * @return
91 * True if all replicas replacement repliacs has been created successfully,
92 * false otherwise.
93 */
94 public boolean handleFailure(HostSet hosts)
95 {
96 viewHosts = app.getViewHosts();
97 if (recoveredHost != null && !viewHosts.containsHost(recoveredHost)) {
98 /*
99 * The last recovered host is not yet in the current view; check the
100 * status of the replica on that host.
101 */
102 if (!checkReplicaStatus(recoveredHost)) {
103 if (log.isDebugEnabled())
104 log.debug("We are pending join from replica on " + recoveredHost);
105 // Reschedule the ViewMonitor
106 return true;
107 }
108 recoveredHost = null;
109 }
110 /*
111 * We need to clone the assigned host set since the underlying
112 * host set may be modified by the relocation below, which may
113 * cause a concurrent modification exception.
114 */
115 HostSet assignedHosts = (HostSet) hosts.clone();
116 if (log.isDebugEnabled())
117 log.debug("assignedHosts: " + assignedHosts);
118 if (log.isDebugEnabled())
119 log.debug("handleFailure(): " + app);
120 boolean groupFailed = prepareRecovery();
121 boolean recovered = true;
122 if (groupFailed && assignedHosts.getAvailHosts().isEmpty()) {
123 for (Host host : assignedHosts) {
124 recovered &= relocateReplica(host);
125 }
126 } else {
127 recovered = doRecover(assignedHosts);
128 }
129 return recovered;
130 }
131
132
133 private boolean doRecover(HostSet assignedHosts)
134 {
135 /*
136 * The assignedHosts set may have been updated in a previous run,
137 * and if so should contain the new host on which the replica is being
138 * installed (in case of recovery).
139 */
140 HostSet hostsToCheck = new HostSet();
141 HostSet recoverHosts = new HostSet();
142 hostsToCheck.addHosts(assignedHosts);
143 hostsToCheck.removeHosts(viewHosts);
144 if (log.isDebugEnabled()) {
145 log.debug(" viewHosts: " + viewHosts);
146 log.debug("assignedHosts: " + assignedHosts);
147 log.debug(" hostsToCheck: " + hostsToCheck);
148 log.debug(" recoverHosts: " + recoverHosts);
149 }
150
151 for (Host host : hostsToCheck) {
152 if (!host.isAvailable()) {
153 recoverHosts.addHost(host);
154 } else {
155 if (checkReplicaStatus(host)) {
156 // The replica on the host has not installed a view; recovery needed
157 recoverHosts.addHost(host);
158 }
159 }
160 }
161
162 if (log.isDebugEnabled())
163 log.debug(" recoverHosts: " + recoverHosts);
164 boolean recovered = true;
165 if (!recoverHosts.isEmpty()) {
166 /*
167 * If more than one replica is missing, relocate multiple replicas
168 * in this iteration of the recovery algorithm. Otherwise, we only
169 * relocate one replica for each iteration of the recovery algorithm.
170 * The recovery algorithm is triggered by the ViewMonitor timeouts.
171 * The missing variable is initialized by the prepareRecovery() method,
172 * and updated by the relocateReplica() method.
173 */
174 while (missing > 0 && !recoverHosts.isEmpty()) {
175 // We only attempt to recover from one host at a time
176 Host failedHost = recoverHosts.removeFirst();
177 // This call will actually modify the 'assignedHosts' set
178 recovered &= relocateReplica(failedHost);
179 }
180 }
181 return recovered;
182 }
183
184
185 /**
186 * Check the status of the replica associated with the outer <code>GroupData</code>
187 * object on the given host.
188 *
189 * If this method returns true it means that a recovery action is needed for
190 * the given host. If false is returned it means that recovery may be needed,
191 * but additional waits could regain the required redundancy levels.
192 *
193 * @param host The host on which to check the replica status.
194 * @return True is returned if the replica was created and later failed
195 * or is not responding by installing a view after several checks.
196 * False is returned if replica has been created on the given host,
197 * but has not yet installed a view reflecting this.
198 */
199 private boolean checkReplicaStatus(Host host)
200 {
201 Set replicas = host.queryReplicas();
202 /* The host has not yet joined the group; check if the replica has been created. */
203 if (replicas.contains(app.getClassData())) {
204 if (host.isJoining(app)) {
205 log.debug(app + " is pending to join the group on " + host);
206 } else {
207 ReplicaState state = host.getState(app);
208 log.debug(app + " state: " + state + " on " + host);
209 }
210 /* The replica has been created, but not installed a view. */
211 if (retriesExhausted(host)) {
212 log.warn("Removing replica on " + host + "; it has not installed a view in due time");
213 // host.removeReplica(app);
214 /* This replica has been removed from this host; try another host */
215 // return true;
216 //FIXME find better solutions
217 return false; // ignore this possibility
218 } else {
219 /* The replica has been created, but not installed a view. */
220 log.info(app + " has been created on " + host + " (but not yet installed a view)");
221 /* Just reschedule the view monitor */
222 return false;
223 }
224 } else {
225 /* The replica does not seem to have been created (or has crashed). */
226 if (retriesExhausted(host)) {
227 log.warn(app + ": Replica creation failed on " + host);
228 /* Try another host */
229 return true;
230 } else {
231 /* The replica was not created (or has crashed), but the host is still available. */
232 boolean created = restartReplica(host);
233 if (created) {
234 log.info(app + " has been restarted on " + host);
235 /* Just reschedule the view monitor */
236 return false;
237 } else {
238 log.warn(app + " failed to restart on " + host);
239 /* Try another host */
240 return true;
241 }
242 }
243 }
244 }
245
246
247 /**
248 * Returns true if the given host has exhausted its retry count; False is
249 * returned otherwise and the retry count is incremented.
250 *
251 * @param host the host whose retry count to query
252 */
253 private boolean retriesExhausted(Host host)
254 {
255 Integer retries = hostRetries.get(host);
256 if (retries == null)
257 retries = 0;
258 if (retries < RETRY_LIMIT) {
259 hostRetries.put(host, ++retries);
260 if (log.isDebugEnabled())
261 log.debug("Checking " + app + " on " + host + "; retry: " + retries);
262 return false;
263 } else {
264 resetRetries(host);
265 log.warn(app + ": Exhausted the retries to check on " + host);
266 return true;
267 }
268 }
269
270 private void resetRetries(Host host)
271 {
272 // Reset the retry count for future use of this host
273 hostRetries.put(host, 0);
274 }
275
276
277 /* (non-Javadoc)
278 * @see jgroup.core.arm.RecoveryStrategy#restartReplica(jgroup.relacs.config.Host)
279 */
280 public boolean restartReplica(Host host)
281 {
282 if (log.isDebugEnabled())
283 log.debug("restartReplica(): " + app);
284 /*
285 * Create a single new replica on the same host, if there still
286 * are missing replicas to satisfy the requirements of this
287 * recovery strategy.
288 */
289 boolean recovered = false;
290 if (missing > 0) {
291 recovered = host.createReplica(app);
292 if (recovered) {
293 missing--;
294 if (log.isDebugEnabled())
295 log.debug("Restarted replica " + app.getClassName() + " on " + host);
296 }
297 return recovered;
298 } else {
299 /*
300 * No missing replicas remains, recovery is consider to be successful
301 */
302 return true;
303 }
304 }
305
306
307 /* (non-Javadoc)
308 * @see jgroup.core.arm.RecoveryStrategy#relocateReplica(jgroup.relacs.config.Host)
309 */
310 public boolean relocateReplica(Host host)
311 {
312 if (log.isDebugEnabled())
313 log.debug("relocateReplica: " + app.getClassName() + " away from " + host);
314
315 /*
316 * Create a replacement replica for this application on the
317 * newly assigned host, if there are still missing replicas.
318 */
319 boolean recovered = false;
320 if (missing > 0) {
321 try {
322 recoveredHost = distScheme.reassignReplica(app, host);
323 recovered = recoveredHost.createReplica(app);
324 if (recovered) {
325 missing--;
326 if (log.isDebugEnabled())
327 log.debug("Reassigned replica to new host: " + recoveredHost);
328 }
329 } catch (Exception e) {
330 // Ignore; recovered is false
331 }
332 return recovered;
333 } else {
334 /*
335 * No missing replicas remains, recovery is consider to be successful
336 */
337 return true;
338 }
339 }
340
341 } // END KeepMinimalInPartition