/*
 * Copyright (c) 1998-2002 The Jgroup Team.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

package jgroup.arm.recovery;

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import jgroup.relacs.config.Host;
import jgroup.relacs.config.HostSet;
import jgroup.relacs.config.Host.ReplicaState;

import org.apache.log4j.Logger;



/**
 *  If the current number of replicas falls below the minimal redundancy
 *  specified for this replica, and the number of available hosts in
 *  this partition is greater than the minimal redundancy then create
 *  new replicas to compensate for the lost redundancy.
 *
 *  @author Hein Meling
 *  @since Jgroup 1.2
 */
public class KeepMinimalInPartition
  extends AbstractRecoveryStrategy
{

  ////////////////////////////////////////////////////////////////////////////////////////////
  // Logger
  ////////////////////////////////////////////////////////////////////////////////////////////

  /** Obtain logger for this class */
  private static final Logger log = Logger.getLogger(KeepMinimalInPartition.class);


  ////////////////////////////////////////////////////////////////////////////////////////////
  // Constants
  ////////////////////////////////////////////////////////////////////////////////////////////

  /** Number of status checks allowed per host before its retries are considered exhausted */
  private static final int RETRY_LIMIT = 3;


  ////////////////////////////////////////////////////////////////////////////////////////////
  // Fields
  ////////////////////////////////////////////////////////////////////////////////////////////

  /** Map of number of retries for a given host */
  private final Map<Host, Integer> hostRetries = new HashMap<Host, Integer>();

  /** The host on which a replacement replica has been installed (pending join) */
  private Host recoveredHost;

  /** The set of hosts in the current view */
  private HostSet viewHosts;


  ////////////////////////////////////////////////////////////////////////////////////////////
  // RecoveryStrategy interface methods (those not implemented in AbstractRecoveryStrategy)
  ////////////////////////////////////////////////////////////////////////////////////////////

  /**
   *  Handles the failure of replicas on the given hosts.
   *
   *  The <code>KeepMinimalInPartition</code> implementation simply
   *  assumes that the servers do not maintain any state, or have some
   *  other means of recovering their state, and tries to relocate
   *  the replicas to a new set of hosts.
   *
   *  The method proceeds as follows:
   *  <ol>
   *  <li>The current view hosts are refreshed from the application.</li>
   *  <li>If a replacement replica was installed in a previous run and its
   *      host is not yet in the view, the status of that replica is checked.
   *      If no recovery action is needed yet, the method returns immediately
   *      to wait for the pending join.</li>
   *  <li>Otherwise <code>prepareRecovery()</code> is invoked; if it reports
   *      true and none of the assigned hosts are available, every replica is
   *      relocated. In all other cases only the hosts that are missing from
   *      the view are examined and recovered as needed.</li>
   *  </ol>
   *
   *  @param hosts
   *    The set of hosts on which the application was running,
   *    but has now failed.
   *
   *  @return
   *    True if all replacement replicas have been created successfully
   *    (or if we are still waiting for a pending replica to join),
   *    false otherwise.
   */
  public boolean handleFailure(HostSet hosts)
  {
    viewHosts = app.getViewHosts();
    if (recoveredHost != null && !viewHosts.containsHost(recoveredHost)) {
      /*
       * The last recovered host is not yet in the current view; check the
       * status of the replica on that host.
       */
      if (!checkReplicaStatus(recoveredHost)) {
        if (log.isDebugEnabled())
          log.debug("We are pending join from replica on " + recoveredHost);
        // Reschedule the ViewMonitor
        return true;
      }
      recoveredHost = null;
    }
    /*
     * We need to clone the assigned host set since the underlying
     * host set may be modified by the relocation below, which may
     * cause a concurrent modification exception.
     */
    HostSet assignedHosts = (HostSet) hosts.clone();
    if (log.isDebugEnabled()) {
      log.debug("assignedHosts: " + assignedHosts);
      log.debug("handleFailure(): " + app);
    }
    boolean groupFailed = prepareRecovery();
    boolean recovered = true;
    if (groupFailed && assignedHosts.getAvailHosts().isEmpty()) {
      /*
       * None of the assigned hosts are available; relocate all replicas.
       * Note the non-short-circuit '&=': every host is attempted even
       * if an earlier relocation failed.
       */
      for (Host host : assignedHosts) {
        recovered &= relocateReplica(host);
      }
    } else {
      recovered = doRecover(assignedHosts);
    }
    return recovered;
  }


  /**
   *  Examines the assigned hosts that are not part of the current view and
   *  relocates the replicas of those that are unavailable, or for which
   *  {@link #checkReplicaStatus(Host)} reports that recovery is needed.
   *
   *  @param assignedHosts
   *    The (cloned) set of hosts assigned to the application.
   *  @return
   *    True if every attempted relocation succeeded (or none was needed),
   *    false otherwise.
   */
  private boolean doRecover(HostSet assignedHosts)
  {
    /*
     * The assignedHosts set may have been updated in a previous run,
     * and if so should contain the new host on which the replica is being
     * installed (in case of recovery).
     */
    HostSet hostsToCheck = new HostSet();
    HostSet recoverHosts = new HostSet();
    hostsToCheck.addHosts(assignedHosts);
    hostsToCheck.removeHosts(viewHosts);
    if (log.isDebugEnabled()) {
      log.debug("    viewHosts: " + viewHosts);
      log.debug("assignedHosts: " + assignedHosts);
      log.debug(" hostsToCheck: " + hostsToCheck);
      log.debug(" recoverHosts: " + recoverHosts);
    }

    for (Host host : hostsToCheck) {
      /*
       * An unavailable host always needs recovery. For an available host
       * we only recover if the replica status check says so, e.g. because
       * the replica on the host has not installed a view.
       */
      if (!host.isAvailable() || checkReplicaStatus(host)) {
        recoverHosts.addHost(host);
      }
    }

    if (log.isDebugEnabled())
      log.debug(" recoverHosts: " + recoverHosts);
    boolean recovered = true;
    /*
     * If more than one replica is missing, relocate multiple replicas
     * in this iteration of the recovery algorithm. Otherwise, we only
     * relocate one replica for each iteration of the recovery algorithm.
     * The recovery algorithm is triggered by the ViewMonitor timeouts.
     * The missing variable is initialized by the prepareRecovery() method,
     * and updated by the relocateReplica() method.
     */
    while (missing > 0 && !recoverHosts.isEmpty()) {
      // We only attempt to recover from one host at a time
      Host failedHost = recoverHosts.removeFirst();
      // This call will actually modify the 'assignedHosts' set
      recovered &= relocateReplica(failedHost);
    }
    return recovered;
  }


  /**
   *  Check the status of the replica associated with the outer <code>GroupData</code>
   *  object on the given host.
   *
   *  If this method returns true it means that a recovery action is needed for
   *  the given host. If false is returned it means that recovery may be needed,
   *  but additional waits could regain the required redundancy levels.
   *
   *  @param host The host on which to check the replica status.
   *  @return True is returned if the replica could not be found on the host
   *    after the retries were exhausted, or if an attempt to restart it failed.
   *    False is returned if the replica has been created (or restarted) on
   *    the given host, but has not yet installed a view reflecting this.
   */
  private boolean checkReplicaStatus(Host host)
  {
    Set<?> replicas = host.queryReplicas();
    /* The host has not yet joined the group; check if the replica has been created. */
    if (replicas.contains(app.getClassData())) {
      if (host.isJoining(app)) {
        log.debug(app + " is pending to join the group on " + host);
      } else {
        ReplicaState state = host.getState(app);
        log.debug(app + " state: " + state + " on " + host);
      }
      /* The replica has been created, but not installed a view. */
      if (retriesExhausted(host)) {
        /*
         * NOTE: the actual removal is currently disabled (see the
         * commented-out code below), so despite the log message the
         * replica is left in place and we keep waiting.
         */
        log.warn("Removing replica on " + host + "; it has not installed a view in due time");
//      host.removeReplica(app);
        /* This replica has been removed from this host; try another host */
//      return true;
        //FIXME find better solutions
        return false; // ignore this possibility
      } else {
        /* The replica has been created, but not installed a view. */
        log.info(app + " has been created on " + host + " (but not yet installed a view)");
        /* Just reschedule the view monitor */
        return false;
      }
    } else {
      /* The replica does not seem to have been created (or has crashed). */
      if (retriesExhausted(host)) {
        log.warn(app + ": Replica creation failed on " + host);
        /* Try another host */
        return true;
      } else {
        /* The replica was not created (or has crashed), but the host is still available. */
        boolean created = restartReplica(host);
        if (created) {
          log.info(app + " has been restarted on " + host);
          /* Just reschedule the view monitor */
          return false;
        } else {
          log.warn(app + " failed to restart on " + host);
          /* Try another host */
          return true;
        }
      }
    }
  }


  /**
   *  Returns true if the given host has exhausted its retry count; False is
   *  returned otherwise and the retry count is incremented. When the retries
   *  are exhausted, the count for the host is reset to zero so that the host
   *  can be used again later.
   *
   *  @param host the host whose retry count to query
   *  @return true if {@link #RETRY_LIMIT} checks have already been made for
   *    the host; false otherwise.
   */
  private boolean retriesExhausted(Host host)
  {
    Integer retries = hostRetries.get(host);
    if (retries == null)
      retries = 0;
    if (retries < RETRY_LIMIT) {
      hostRetries.put(host, ++retries);
      if (log.isDebugEnabled())
        log.debug("Checking " + app + " on " + host + "; retry: " + retries);
      return false;
    } else {
      resetRetries(host);
      log.warn(app + ": Exhausted the retries to check on " + host);
      return true;
    }
  }

  /**
   *  Resets the retry count of the given host to zero.
   *
   *  @param host the host whose retry count to reset
   */
  private void resetRetries(Host host)
  {
    // Reset the retry count for future use of this host
    hostRetries.put(host, 0);
  }


  /**
   *  Creates a single new replica on the given host, provided that there
   *  still are missing replicas; the count of missing replicas is
   *  decremented when the replica is created.
   *
   *  @param host the host on which to restart the replica
   *  @return true if the replica was created, or if no replicas are missing;
   *    false if the replica could not be created.
   *
   *  @see jgroup.core.arm.RecoveryStrategy#restartReplica(jgroup.relacs.config.Host)
   */
  public boolean restartReplica(Host host)
  {
    if (log.isDebugEnabled())
      log.debug("restartReplica(): " + app);
    /*
     * Create a single new replica on the same host, if there still
     * are missing replicas to satisfy the requirements of this
     * recovery strategy.
     */
    boolean recovered = false;
    if (missing > 0) {
      recovered = host.createReplica(app);
      if (recovered) {
        missing--;
        if (log.isDebugEnabled())
          log.debug("Restarted replica " + app.getClassName() + " on " + host);
      }
      return recovered;
    } else {
      /*
       * No missing replicas remain; recovery is considered to be successful
       */
      return true;
    }
  }


  /**
   *  Creates a replacement replica on a host newly assigned by the
   *  distribution scheme, provided that there still are missing replicas;
   *  the count of missing replicas is decremented when the replica is
   *  created. The newly assigned host is remembered as the host with a
   *  pending join, also when the replica creation on it fails.
   *
   *  Failures during reassignment or creation are logged and reported
   *  through the return value; no exception is propagated to the caller.
   *
   *  @param host the host from which the replica should be relocated
   *  @return true if the replacement replica was created, or if no replicas
   *    are missing; false if the relocation failed.
   *
   *  @see jgroup.core.arm.RecoveryStrategy#relocateReplica(jgroup.relacs.config.Host)
   */
  public boolean relocateReplica(Host host)
  {
    if (log.isDebugEnabled())
      log.debug("relocateReplica: " + app.getClassName() + " away from " + host);

    if (missing <= 0) {
      /*
       * No missing replicas remain; recovery is considered to be successful
       */
      return true;
    }

    /*
     * Create a replacement replica for this application on the
     * newly assigned host, since there are still missing replicas.
     */
    boolean recovered = false;
    try {
      recoveredHost = distScheme.reassignReplica(app, host);
      if (recoveredHost == null) {
        /*
         * Defensive check: whether reassignReplica() can return null is
         * not visible from this class. Previously this case would surface
         * as a silently swallowed NullPointerException.
         */
        log.warn(app + ": No replacement host was assigned for the replica on " + host);
        return false;
      }
      recovered = recoveredHost.createReplica(app);
      if (recovered) {
        missing--;
        if (log.isDebugEnabled())
          log.debug("Reassigned replica to new host: " + recoveredHost);
      }
    } catch (Exception e) {
      /*
       * Relocation is best-effort: the failure is reported to the caller
       * through the return value, but we log the cause so that it is not
       * lost.
       */
      log.warn(app + ": Failed to relocate replica away from " + host, e);
    }
    return recovered;
  }

} // END KeepMinimalInPartition