View Javadoc

1   /*
2    * Copyright (c) 1998-2002 The Jgroup Team.
3    *
4    * This program is free software; you can redistribute it and/or modify
5    * it under the terms of the GNU Lesser General Public License version 2 as
6    * published by the Free Software Foundation.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Lesser General Public License for more details.
12   *
13   * You should have received a copy of the GNU Lesser General Public License
14   * along with this program; if not, write to the Free Software
15   * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16   *
17   */
18   
19  package jgroup.arm.recovery;
20  
21  import java.util.HashMap;
22  import java.util.Map;
23  import java.util.Set;
24  
25  import jgroup.relacs.config.Host;
26  import jgroup.relacs.config.HostSet;
27  import jgroup.relacs.config.Host.ReplicaState;
28  
29  import org.apache.log4j.Logger;
30  
31  
32  
33  /**
34   *  If the current number of replicas falls below the minimal redundancy
35   *  specified for this replica, and the number of available hosts in
36   *  this partition is greater than the minimal redundancy then create
37   *  new replicas to compensate for the lost redundancy.
38   *
39   *  @author Hein Meling
40   *  @since Jgroup 1.2
41   */
42  public class KeepMinimalInPartition
43    extends AbstractRecoveryStrategy
44  {
45  
46    ////////////////////////////////////////////////////////////////////////////////////////////
47    // Logger
48    ////////////////////////////////////////////////////////////////////////////////////////////
49  
50    /** Obtain logger for this class */
51    private static final Logger log = Logger.getLogger(KeepMinimalInPartition.class);
52  
53  
54    ////////////////////////////////////////////////////////////////////////////////////////////
55    // Constants
56    ////////////////////////////////////////////////////////////////////////////////////////////
57  
58    private static final int RETRY_LIMIT = 3;
59  
60  
61    ////////////////////////////////////////////////////////////////////////////////////////////
62    // Fields
63    ////////////////////////////////////////////////////////////////////////////////////////////
64  
65    /** Map of number of retries for a given host */
66    private final Map<Host, Integer> hostRetries = new HashMap<Host, Integer>();
67  
68    /** The host on which a replacement replica has been installed (pending join) */
69    private Host recoveredHost;
70  
71    /** The set of hosts in the current view */
72    private HostSet viewHosts;
73  
74  
75    ////////////////////////////////////////////////////////////////////////////////////////////
76    // RecoveryStrategy interface methods (those not implemented in AbstractRecoveryStrategy)
77    ////////////////////////////////////////////////////////////////////////////////////////////
78  
79    /**
80     * FIXME UPDATE DOCUMENTATION:
81     *  The <code>KeepMinimalInPartition</code> implementation simply
82     *  assume that the servers do not maintain any state or has some
83     *  other means of recovering their state, and tries to relocate
84     *  the replicas to a new set of hosts.
85     *
86     *  @param hosts
87     *    The set of hosts on which the application was running,
88     *    but has now failed.
89     *
90     *  @return 
91     *    True if all replicas replacement repliacs has been created successfully,
92     *    false otherwise.
93     */
94    public boolean handleFailure(HostSet hosts)
95    {
96      viewHosts = app.getViewHosts();
97      if (recoveredHost != null && !viewHosts.containsHost(recoveredHost)) {
98        /*
99         * The last recovered host is not yet in the current view; check the
100        * status of the replica on that host.
101        */
102       if (!checkReplicaStatus(recoveredHost)) {
103         if (log.isDebugEnabled())
104           log.debug("We are pending join from replica on " + recoveredHost);
105         // Reschedule the ViewMonitor
106         return true;
107       }
108       recoveredHost = null;
109     }
110     /*
111      * We need to clone the assigned host set since the underlying
112      * host set may be modified by the relocation below, which may
113      * cause a concurrent modification exception.
114      */
115     HostSet assignedHosts = (HostSet) hosts.clone();
116     if (log.isDebugEnabled())
117       log.debug("assignedHosts: " + assignedHosts);
118     if (log.isDebugEnabled())
119       log.debug("handleFailure(): " + app);
120     boolean groupFailed = prepareRecovery();
121     boolean recovered = true;
122     if (groupFailed && assignedHosts.getAvailHosts().isEmpty()) {
123       for (Host host : assignedHosts) {
124         recovered &= relocateReplica(host);
125       }
126     } else {
127       recovered = doRecover(assignedHosts);
128     }
129     return recovered;
130   }
131 
132 
133   private boolean doRecover(HostSet assignedHosts)
134   {
135     /*
136      * The assignedHosts set may have been updated in a previous run,
137      * and if so should contain the new host on which the replica is being
138      * installed (in case of recovery).
139      */
140     HostSet hostsToCheck = new HostSet();
141     HostSet recoverHosts = new HostSet();
142     hostsToCheck.addHosts(assignedHosts);
143     hostsToCheck.removeHosts(viewHosts);
144     if (log.isDebugEnabled()) {
145       log.debug("    viewHosts: " + viewHosts);
146       log.debug("assignedHosts: " + assignedHosts);
147       log.debug(" hostsToCheck: " + hostsToCheck);
148       log.debug(" recoverHosts: " + recoverHosts);
149     }
150 
151     for (Host host : hostsToCheck) {
152       if (!host.isAvailable()) {
153         recoverHosts.addHost(host);
154       } else {
155         if (checkReplicaStatus(host)) {
156           // The replica on the host has not installed a view; recovery needed
157           recoverHosts.addHost(host);
158         }
159       }
160     }
161 
162     if (log.isDebugEnabled())
163       log.debug(" recoverHosts: " + recoverHosts);
164     boolean recovered = true;
165     if (!recoverHosts.isEmpty()) {
166       /*
167        * If more than one replica is missing, relocate multiple replicas
168        * in this iteration of the recovery algorithm.  Otherwise, we only
169        * relocate one replica for each iteration of the recovery algorithm.
170        * The recovery algorithm is triggered by the ViewMonitor timeouts.
171        * The missing variable is initialized by the prepareRecovery() method,
172        * and updated by the relocateReplica() method.
173        */
174       while (missing > 0 && !recoverHosts.isEmpty()) {
175         // We only attempt to recover from one host at a time
176         Host failedHost = recoverHosts.removeFirst();
177         // This call will actually modify the 'assignedHosts' set
178         recovered &= relocateReplica(failedHost);
179       }
180     }
181     return recovered;
182   }
183 
184 
185   /**
186    * Check the status of the replica associated with the outer <code>GroupData</code>
187    * object on the given host.
188    * 
189    * If this method returns true it means that a recovery action is needed for
190    * the given host.  If false is returned it means that recovery may be needed,
191    * but additional waits could regain the required redundancy levels.
192    * 
193    * @param host The host on which to check the replica status.
194    * @return True is returned if the replica was created and later failed
195    *   or is not responding by installing a view after several checks.
196    *   False is returned if replica has been created on the given host,
197    *   but has not yet installed a view reflecting this.
198    */
199   private boolean checkReplicaStatus(Host host)
200   {
201     Set replicas = host.queryReplicas();
202     /* The host has not yet joined the group; check if the replica has been created. */
203     if (replicas.contains(app.getClassData())) {
204       if (host.isJoining(app)) {
205         log.debug(app + " is pending to join the group on " + host);
206       } else {
207         ReplicaState state = host.getState(app);
208         log.debug(app + " state: " + state + " on " + host);
209       }
210       /* The replica has been created, but not installed a view. */
211       if (retriesExhausted(host)) {
212         log.warn("Removing replica on " + host + "; it has not installed a view in due time");
213 //        host.removeReplica(app);
214         /* This replica has been removed from this host; try another host */
215 //        return true;
216         //FIXME find better solutions
217         return false; // ignore this possibility
218       } else {
219         /* The replica has been created, but not installed a view. */
220         log.info(app + " has been created on " + host + " (but not yet installed a view)");
221         /* Just reschedule the view monitor */
222         return false;
223       }
224     } else {
225       /* The replica does not seem to have been created (or has crashed). */
226       if (retriesExhausted(host)) {
227         log.warn(app + ": Replica creation failed on " + host);
228         /* Try another host */
229         return true;
230       } else {
231         /* The replica was not created (or has crashed), but the host is still available. */
232         boolean created = restartReplica(host);
233         if (created) {
234           log.info(app + " has been restarted on " + host);
235           /* Just reschedule the view monitor */
236           return false;
237         } else {
238           log.warn(app + " failed to restart on " + host);
239           /* Try another host */
240           return true;
241         }
242       }
243     }
244   }
245 
246 
247   /**
248    * Returns true if the given host has exhausted its retry count; False is
249    * returned otherwise and the retry count is incremented.
250    * 
251    * @param host the host whose retry count to query
252    */
253   private boolean retriesExhausted(Host host)
254   {
255     Integer retries = hostRetries.get(host);
256     if (retries == null)
257       retries = 0;
258     if (retries < RETRY_LIMIT) {
259       hostRetries.put(host, ++retries);
260       if (log.isDebugEnabled())
261         log.debug("Checking " + app + " on " + host + "; retry: " + retries);
262       return false;
263     } else {
264       resetRetries(host);
265       log.warn(app + ": Exhausted the retries to check on " + host);
266       return true;
267     }
268   }
269 
270   private void resetRetries(Host host)
271   {
272     // Reset the retry count for future use of this host
273     hostRetries.put(host, 0);
274   }
275 
276 
277   /* (non-Javadoc)
278    * @see jgroup.core.arm.RecoveryStrategy#restartReplica(jgroup.relacs.config.Host)
279    */
280   public boolean restartReplica(Host host)
281   {
282     if (log.isDebugEnabled())
283       log.debug("restartReplica(): " + app);
284     /*
285      * Create a single new replica on the same host, if there still
286      * are missing replicas to satisfy the requirements of this 
287      * recovery strategy.
288      */
289     boolean recovered = false;
290     if (missing > 0) {
291       recovered = host.createReplica(app);
292       if (recovered) {
293         missing--;
294         if (log.isDebugEnabled())
295           log.debug("Restarted replica " + app.getClassName() + " on " + host);
296       }
297       return recovered;
298     } else {
299       /*
300        * No missing replicas remains, recovery is consider to be successful
301        */
302       return true;
303     }
304   }
305 
306 
307   /* (non-Javadoc)
308    * @see jgroup.core.arm.RecoveryStrategy#relocateReplica(jgroup.relacs.config.Host)
309    */
310   public boolean relocateReplica(Host host)
311   {
312     if (log.isDebugEnabled())
313       log.debug("relocateReplica: " + app.getClassName() + " away from " + host);
314 
315     /*
316      * Create a replacement replica for this application on the
317      * newly assigned host, if there are still missing replicas.
318      */
319     boolean recovered = false;
320     if (missing > 0) {
321       try {
322         recoveredHost = distScheme.reassignReplica(app, host);
323         recovered = recoveredHost.createReplica(app);
324         if (recovered) {
325           missing--;
326           if (log.isDebugEnabled())
327             log.debug("Reassigned replica to new host: " + recoveredHost);
328         }
329       } catch (Exception e) {
330         // Ignore; recovered is false
331       }
332       return recovered;
333     } else {
334       /*
335        * No missing replicas remains, recovery is consider to be successful
336        */
337       return true;
338     }
339   }
340 
341 } // END KeepMinimalInPartition