View Javadoc

1   /*
2    * Copyright (c) 1998-2002 The Jgroup Team.
3    *
4    * This program is free software; you can redistribute it and/or modify
5    * it under the terms of the GNU Lesser General Public License version 2 as
6    * published by the Free Software Foundation.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * GNU Lesser General Public License for more details.
12   *
13   * You should have received a copy of the GNU Lesser General Public License
14   * along with this program; if not, write to the Free Software
15   * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16   *
17   */
18  
19  package jgroup.relacs.daemon;
20  
21  import java.io.IOException;
22  import java.rmi.RemoteException;
23  import java.util.Iterator;
24  
25  import jgroup.core.ConfigManager;
26  import jgroup.core.JgroupException;
27  import jgroup.core.MemberId;
28  import jgroup.core.registry.BootstrapRegistry;
29  import jgroup.relacs.config.DistributedSystemConfig;
30  import jgroup.relacs.config.Domain;
31  import jgroup.relacs.config.Host;
32  import jgroup.relacs.config.HostSet;
33  import jgroup.relacs.config.TransportConfig;
34  import jgroup.relacs.events.Event;
35  import jgroup.relacs.gm.RemoteDispatcher;
36  import jgroup.util.MsgFactory;
37  import jgroup.util.ThreadMonitor;
38  
39  import org.apache.log4j.LogManager;
40  import org.apache.log4j.Logger;
41  import org.apache.log4j.MDC;
42  
43  
44  /**
45   *  The <code>DaemonInteraction</code> class facilitate a common
46   *  interaction point between the daemon and group managers.  Its main
47   *  purpose is to provide common error handling.
48   *
49   *  @author Hein Meling
50   *  @since Jgroup 1.2
51   */
52  public class DaemonInteraction
53  {
54  
55    ////////////////////////////////////////////////////////////////////////////////////////////
56    // Logger
57    ////////////////////////////////////////////////////////////////////////////////////////////
58  
59    /** Obtain logger for this class */
60    private static final Logger log = Logger.getLogger(DaemonInteraction.class);
61  
62  
63    ////////////////////////////////////////////////////////////////////////////////////////////
64    // Static fields
65    ////////////////////////////////////////////////////////////////////////////////////////////
66  
67    /** 
68     * The remote daemon object reference (same host, so also local, but
69     * going through RMI).  
70     */
71    private static DaemonService daemonService;
72  
73    /** Transport configuration information */
74    private static TransportConfig tconf;
75  
76  
77    ////////////////////////////////////////////////////////////////////////////////////////////
78    // Static methods
79    ////////////////////////////////////////////////////////////////////////////////////////////
80  
81    /**
82     *  Initialize the Jgroup daemon, according to the specified number of
83     *  daemons that should exist in this domain.
84     *
85     *  @throws JgroupException
86     */
87    public static void initDaemon()
88      throws JgroupException
89    {
90      /* Obtains configuration information */
91      tconf = (TransportConfig) ConfigManager.getConfig(TransportConfig.class);
92  
93      /* Obtain distributed system configuration information */
94      DistributedSystemConfig dsc = ConfigManager.getDistributedSystem();
95  
96      /* Message factory initialization */
97      MsgFactory.initMsgFactory(tconf.getPayload(), dsc.size(), dsc.numOfDomains());
98  
99      Domain localDomain = DistributedSystemConfig.getLocalDomain();
100     if (localDomain.allDaemons()) {
101       /* All the hosts in this domain should have a Jgroup daemon. */
102       init();
103 
104     } else if (localDomain.hasNoDaemons()) {
105       /*
106        * The local domain should not have any daemons; we need to query
107        * the remote domains for any Jgroup daemons.
108        */
109        throw new UnsupportedOperationException("The local domain must have at least one Jgroup daemon");
110 
111     } else {
112       /* Check if the local JVM already has a daemon */
113       if (checkLocalDaemon() < 0) {
114         try {
115           /* Try to obtain a daemon running in another JVM on the localhost */
116           daemonService = Daemon.lookupLocalDaemon();
117           if (log.isDebugEnabled()) {
118             log.debug("Daemon found in bootstrap registry");
119           }
120         } catch (JgroupException e) {
121           /* There is no daemon running on the local host. */
122           checkRemoteDaemons(localDomain);
123         }
124       }
125     }
126   }
127 
128 
129   /**
130    *  Returns the number of members known to the local daemon.  If a negative
131    *  value is returned, the local daemon references is either <code>null</code>
132    *  or no longer usable.
133    */
134   private static int checkLocalDaemon()
135   {
136     if (daemonService == null)
137       daemonService = Daemon.quickLookupLocalDaemon();
138     if (daemonService != null) {
139       try {
140         /* Check if the currently available daemon reference is valid. */
141         return daemonService.members();
142       } catch (RemoteException e) {
143         /* The locally known daemon reference is not valid. */
144       }
145     }
146     return -1;
147   }
148 
149 
150   /**
151    *  Check all hosts in the provided domain for available daemons.
152    *  Once we have determined the hosts that are running daemons, we
153    *  check if there is enough daemons in this domain, according to the
154    *  specified number of daemons that should reside in this domain.
155    *  If there is enough, we attach ourselves to the daemon with the
156    *  least associated members.  If there is not enough daemons, we
157    *  attempt to create one locally.
158    *
159    *  @param domain
160    *    The domain in which to look for Jgroup daemons.
161    *  @throws JgroupException
162    *    Raised if a local Jgroup daemon could not be created.
163    */
164   private static void checkRemoteDaemons(Domain domain)
165     throws JgroupException
166   {
167     HostSet hosts = domain.getHostSet();
168     int jdaemons = domain.numOfDaemons(), minMembers = Integer.MAX_VALUE;
169     for (Iterator iter = hosts.iterator(); jdaemons > 0 && iter.hasNext();) {
170       Host host = (Host) iter.next();
171       String dRegName = DaemonService.DAEMON_NAME + host.getPort();
172       try {
173         DaemonService jdService = (DaemonService) host.lookup(dRegName);
174 
175         /* Number of members associated with this particular daemon */
176         int members = jdService.members();
177         /* If we get here, we have found another daemon */
178         jdaemons--;
179         /*
180          * Until we know if we should create a local daemon, we assume
181          * that the daemon recently found to be hosting the least amount
182          * of members will be selected as our daemon.
183          */
184         if (members < minMembers) {
185           daemonService = jdService;
186           minMembers = members;
187         }
188         if (log.isDebugEnabled()) {
189           log.debug("Found daemon on " + host);
190           log.debug("Members: " + members);
191         }
192 
193       } catch (Exception e) {
194         /*
195          * This will catch NotBoundException, AccessException
196          * and RemoteException.
197          */
198       }
199     }
200     if (jdaemons > 0) {
201       if (log.isDebugEnabled()) {
202         log.debug("remaining jdaemons to create: " + jdaemons);
203         log.debug("domain.jdaemons: " + domain.numOfDaemons());
204       }
205       /*
206        * Since there is not enough daemons in this domain yet,
207        * we create one locally.
208        */
209       init();
210 
211     } else {
212       /*
213        * Since there is already enough Jgroup daemons in this domain
214        * we simply get the one with the least members assigned to it.
215        */
216       if (log.isDebugEnabled()) {
217         log.debug("Using: " + daemonService);
218       }
219 
220     }
221   }
222 
223 
224   /**
225    *  Create or attach to a Jgroup daemon on the local host.
226    */
227   private static void init()
228     throws JgroupException
229   {
230     try {
231       BootstrapRegistry.refreshRegistryStub();
232     } catch (RemoteException e) {
233       throw new JgroupException("Could not initialize the bootstrap registry");
234     }
235 
236     /* Check if the local JVM already has a daemon */
237     if (checkLocalDaemon() < 0) {
238       try {
239         /* Try to create a daemon in the local JVM. */
240         daemonService = Daemon.createDaemon(tconf);
241         if (log.isDebugEnabled())
242           log.debug("Daemon created");
243         Daemon.bindLocalDaemon(daemonService);
244       } catch (IOException ioe) {
245         if (log.isDebugEnabled())
246           log.warn("Failed to create local Daemon", ioe);
247         /*
248          * Try to obtain an RMI-based daemon running in another JVM,
249          * since we failed to create a daemon in the local JVM.
250          */
251         daemonService = Daemon.lookupLocalDaemon();
252         if (log.isDebugEnabled())
253           log.debug("Daemon found in bootstrap registry");
254       }
255     }
256   }
257 
258 
259   /**
260    *  Returns a <code>MemberId</code> for a new member.  The new
261    *  identifier is obtained by concatenating the endpoint of this
262    *  daemon, the incarnation number used to distinguish different
263    *  incarnation of the same endpoint, and a sequence number used to
264    *  distinguish different members running on the same host (different
265    *  JVMs).
266    *
267    *  @exception JgroupException
268    *    Raised if we could not obtain a member identifier due to
269    *    communication problems with the daemon.
270    */
271   public static MemberId getMemberId()
272     throws JgroupException
273   {
274     try {
275       return daemonService.getMemberId(DistributedSystemConfig.getLocalHost());
276     } catch (RemoteException e) {
277       throw new JgroupException("Unable to obtain my member identifier", e);
278     }
279   }
280 
281 
282   /**
283    *  Adds an event generated by a member to the daemon's event queue.
284    * 
285    *  @return
286    *    True if the event was successfully added; false is returned
287    *    if the daemon has crashed, and not recovered.
288    */
289   public static boolean addEvent(Event event)
290   {
291     try {
292       if (log.isDebugEnabled()) {
293         MDC.put("group", "[Group: " + event.getGid() + "]");
294         log.debug("Member->Daemon: " + event);
295       }
296       daemonService.addEvent(event);
297       if (log.isDebugEnabled())
298         log.debug("addEvent returned successfully (M->D)");
299       return true;
300     } catch (RemoteException e) {
301       /*
302        * If the system property 'jgroup.daemon.recover.locally' is set to
303        * true, we try to recover from a crashed daemon by performing the
304        * initDaemon() method.  Otherwise, we simply perform a system halt()
305        * to ensure that this JVM shuts itself down immediately.  This is
306        * useful to avoid that a member generates "noise" while it is in an
307        * unstable state, and may not recover successfully anyway.  This
308        * allows remaining members of the group to quickly detect this
309        * member's failure, and inform the ARM framework of the crash whom
310        * will take the opportune countermeasures.
311        */
312       log.warn("Member failed to send event to Daemon", e);
313       return handleDaemonFailure();
314     }
315   }
316 
317 
318   /**
319    *  Adds an event generated by the daemon to given (local) member's
320    *  event queue.
321    *
322    *  @param member
323    *    The recipient member (Dispatcher) of this event.
324    *  @param event
325    *    The event to be added to the given member's event queue.
326    *  @return
327    *    True if communication is ok; false otherwise.
328    */
329   public static boolean addEvent(MemberData member, Event event)
330   {
331     try {
332       if (log.isDebugEnabled()) {
333         log.debug("Daemon->Member(" + member + "): " + event);
334       }
335       RemoteDispatcher dispatcher = member.getRemoteDispatcher();
336       synchronized (dispatcher) {
337         dispatcher.addEvent(event);
338       }
339       if (log.isDebugEnabled())
340         log.debug("addEvent returned successfully (D->M)");
341       return true;
342     } catch (RemoteException e) {
343       /*
344        * Return false to notify the daemon that communication
345        * with the member has failed.  If the member is local
346        * (i.e., resides in the same virtual machine), this can
347        * never happend.  Otherwise, the daemon will take the
348        * opportune actions to install a new view.  Exception
349        * is printed in the debug log to enable to check for
350        * remote exceptions not related to failed communication.
351        */
352       log.warn("Daemon failed to send event to " + member, e);
353       return false;
354     } 
355   }
356 
357 
358   /**
359    *  Handle daemon-member interaction failures. <p>
360    *
361    *  If the system property 'jgroup.daemon.recover.locally' is true,
362    *  and there is a confirmed daemon crash, local recovery is performed.
363    *  Otherwise, this member will commit suicide.
364    *
365    *  @return
366    *    True is returned if a new daemon was successfully initialized.
367    */
368   public static boolean handleDaemonFailure()
369   {
370     if (DaemonInteraction.suspectDaemon()) {
371       /*
372        * The daemon was correctly suspected.  Check if we should
373        * try to recover locally, or leave it to ARM.  Default is
374        * to not try to recover the daemon locally.
375        */
376       if (Boolean.getBoolean("jgroup.daemon.recover.locally")) {
377         /*
378          * Try to recover a Jgroup daemon on the localhost.
379          */
380         if (log.isDebugEnabled())
381           log.debug("Trying to recover daemon locally");
382         try {
383           initDaemon();
384           return true;
385         } catch (JgroupException e) {
386           log.warn("Local daemon recovery failed.", e);
387           return false;
388         }
389       } else {
390         /*
391          * By default, we do not try to recover the daemon locally;
392          * instead we ensure to crash this JVM so that ARM can handle
393          * this crash.
394          */
395         log.error("Crashing this member dispatcher since the daemon crashed");
396         // Ensure that log files are closed
397         LogManager.shutdown();
398         Runtime.getRuntime().halt(13);
399         return false;
400       }
401     } else {
402       /*
403        * The daemon is still alive, but was suspected;
404        * appears not to be pinging me anymore.
405        */
406       log.error("DUMPING THREADS");
407       for (int i = 0; i < 10; i++) {
408         ThreadMonitor.dumpAllThreads();
409         try {
410           Thread.sleep(200);
411         } catch (InterruptedException e) { }
412       }
413       log.error("COMMITTING SUICIDE");
414       Runtime.getRuntime().halt(13);
415       return false;
416     }
417   }
418 
419 
420   /**
421    *  Method used to indicate that the daemon has not responded within
422    *  some predefined timeout.  Here we try to recover by installing
423    *  a new daemon locally.
424    *
425    *  @return true if the daemon is not responding, false otherwise.
426    */
427   public static boolean suspectDaemon()
428   {
429     assert daemonService != null : "DaemonService should not be null";
430     try {
431       int members = daemonService.members();
432       if (log.isDebugEnabled())
433         log.debug("Daemon is alive with members: "+ members);
434       return false;
435     } catch (RemoteException e1) {
436       if (log.isDebugEnabled()) {
437         log.debug("Daemon correctly suspected", e1);
438       }
439       return true;
440     }
441   }
442 
443 } // END DaemonInteraction