1 /*
2 * Copyright (c) 1998-2002 The Jgroup Team.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16 *
17 */
18
19 package jgroup.relacs.daemon;
20
21 import java.io.IOException;
22 import java.rmi.RemoteException;
23 import java.util.Iterator;
24
25 import jgroup.core.ConfigManager;
26 import jgroup.core.JgroupException;
27 import jgroup.core.MemberId;
28 import jgroup.core.registry.BootstrapRegistry;
29 import jgroup.relacs.config.DistributedSystemConfig;
30 import jgroup.relacs.config.Domain;
31 import jgroup.relacs.config.Host;
32 import jgroup.relacs.config.HostSet;
33 import jgroup.relacs.config.TransportConfig;
34 import jgroup.relacs.events.Event;
35 import jgroup.relacs.gm.RemoteDispatcher;
36 import jgroup.util.MsgFactory;
37 import jgroup.util.ThreadMonitor;
38
39 import org.apache.log4j.LogManager;
40 import org.apache.log4j.Logger;
41 import org.apache.log4j.MDC;
42
43
44 /**
45 * The <code>DaemonInteraction</code> class facilitate a common
46 * interaction point between the daemon and group managers. Its main
47 * purpose is to provide common error handling.
48 *
49 * @author Hein Meling
50 * @since Jgroup 1.2
51 */
52 public class DaemonInteraction
53 {
54
55 ////////////////////////////////////////////////////////////////////////////////////////////
56 // Logger
57 ////////////////////////////////////////////////////////////////////////////////////////////
58
59 /** Obtain logger for this class */
60 private static final Logger log = Logger.getLogger(DaemonInteraction.class);
61
62
63 ////////////////////////////////////////////////////////////////////////////////////////////
64 // Static fields
65 ////////////////////////////////////////////////////////////////////////////////////////////
66
67 /**
68 * The remote daemon object reference (same host, so also local, but
69 * going through RMI).
70 */
71 private static DaemonService daemonService;
72
73 /** Transport configuration information */
74 private static TransportConfig tconf;
75
76
77 ////////////////////////////////////////////////////////////////////////////////////////////
78 // Static methods
79 ////////////////////////////////////////////////////////////////////////////////////////////
80
81 /**
82 * Initialize the Jgroup daemon, according to the specified number of
83 * daemons that should exist in this domain.
84 *
85 * @throws JgroupException
86 */
87 public static void initDaemon()
88 throws JgroupException
89 {
90 /* Obtains configuration information */
91 tconf = (TransportConfig) ConfigManager.getConfig(TransportConfig.class);
92
93 /* Obtain distributed system configuration information */
94 DistributedSystemConfig dsc = ConfigManager.getDistributedSystem();
95
96 /* Message factory initialization */
97 MsgFactory.initMsgFactory(tconf.getPayload(), dsc.size(), dsc.numOfDomains());
98
99 Domain localDomain = DistributedSystemConfig.getLocalDomain();
100 if (localDomain.allDaemons()) {
101 /* All the hosts in this domain should have a Jgroup daemon. */
102 init();
103
104 } else if (localDomain.hasNoDaemons()) {
105 /*
106 * The local domain should not have any daemons; we need to query
107 * the remote domains for any Jgroup daemons.
108 */
109 throw new UnsupportedOperationException("The local domain must have at least one Jgroup daemon");
110
111 } else {
112 /* Check if the local JVM already has a daemon */
113 if (checkLocalDaemon() < 0) {
114 try {
115 /* Try to obtain a daemon running in another JVM on the localhost */
116 daemonService = Daemon.lookupLocalDaemon();
117 if (log.isDebugEnabled()) {
118 log.debug("Daemon found in bootstrap registry");
119 }
120 } catch (JgroupException e) {
121 /* There is no daemon running on the local host. */
122 checkRemoteDaemons(localDomain);
123 }
124 }
125 }
126 }
127
128
129 /**
130 * Returns the number of members known to the local daemon. If a negative
131 * value is returned, the local daemon references is either <code>null</code>
132 * or no longer usable.
133 */
134 private static int checkLocalDaemon()
135 {
136 if (daemonService == null)
137 daemonService = Daemon.quickLookupLocalDaemon();
138 if (daemonService != null) {
139 try {
140 /* Check if the currently available daemon reference is valid. */
141 return daemonService.members();
142 } catch (RemoteException e) {
143 /* The locally known daemon reference is not valid. */
144 }
145 }
146 return -1;
147 }
148
149
150 /**
151 * Check all hosts in the provided domain for available daemons.
152 * Once we have determined the hosts that are running daemons, we
153 * check if there is enough daemons in this domain, according to the
154 * specified number of daemons that should reside in this domain.
155 * If there is enough, we attach ourselves to the daemon with the
156 * least associated members. If there is not enough daemons, we
157 * attempt to create one locally.
158 *
159 * @param domain
160 * The domain in which to look for Jgroup daemons.
161 * @throws JgroupException
162 * Raised if a local Jgroup daemon could not be created.
163 */
164 private static void checkRemoteDaemons(Domain domain)
165 throws JgroupException
166 {
167 HostSet hosts = domain.getHostSet();
168 int jdaemons = domain.numOfDaemons(), minMembers = Integer.MAX_VALUE;
169 for (Iterator iter = hosts.iterator(); jdaemons > 0 && iter.hasNext();) {
170 Host host = (Host) iter.next();
171 String dRegName = DaemonService.DAEMON_NAME + host.getPort();
172 try {
173 DaemonService jdService = (DaemonService) host.lookup(dRegName);
174
175 /* Number of members associated with this particular daemon */
176 int members = jdService.members();
177 /* If we get here, we have found another daemon */
178 jdaemons--;
179 /*
180 * Until we know if we should create a local daemon, we assume
181 * that the daemon recently found to be hosting the least amount
182 * of members will be selected as our daemon.
183 */
184 if (members < minMembers) {
185 daemonService = jdService;
186 minMembers = members;
187 }
188 if (log.isDebugEnabled()) {
189 log.debug("Found daemon on " + host);
190 log.debug("Members: " + members);
191 }
192
193 } catch (Exception e) {
194 /*
195 * This will catch NotBoundException, AccessException
196 * and RemoteException.
197 */
198 }
199 }
200 if (jdaemons > 0) {
201 if (log.isDebugEnabled()) {
202 log.debug("remaining jdaemons to create: " + jdaemons);
203 log.debug("domain.jdaemons: " + domain.numOfDaemons());
204 }
205 /*
206 * Since there is not enough daemons in this domain yet,
207 * we create one locally.
208 */
209 init();
210
211 } else {
212 /*
213 * Since there is already enough Jgroup daemons in this domain
214 * we simply get the one with the least members assigned to it.
215 */
216 if (log.isDebugEnabled()) {
217 log.debug("Using: " + daemonService);
218 }
219
220 }
221 }
222
223
224 /**
225 * Create or attach to a Jgroup daemon on the local host.
226 */
227 private static void init()
228 throws JgroupException
229 {
230 try {
231 BootstrapRegistry.refreshRegistryStub();
232 } catch (RemoteException e) {
233 throw new JgroupException("Could not initialize the bootstrap registry");
234 }
235
236 /* Check if the local JVM already has a daemon */
237 if (checkLocalDaemon() < 0) {
238 try {
239 /* Try to create a daemon in the local JVM. */
240 daemonService = Daemon.createDaemon(tconf);
241 if (log.isDebugEnabled())
242 log.debug("Daemon created");
243 Daemon.bindLocalDaemon(daemonService);
244 } catch (IOException ioe) {
245 if (log.isDebugEnabled())
246 log.warn("Failed to create local Daemon", ioe);
247 /*
248 * Try to obtain an RMI-based daemon running in another JVM,
249 * since we failed to create a daemon in the local JVM.
250 */
251 daemonService = Daemon.lookupLocalDaemon();
252 if (log.isDebugEnabled())
253 log.debug("Daemon found in bootstrap registry");
254 }
255 }
256 }
257
258
259 /**
260 * Returns a <code>MemberId</code> for a new member. The new
261 * identifier is obtained by concatenating the endpoint of this
262 * daemon, the incarnation number used to distinguish different
263 * incarnation of the same endpoint, and a sequence number used to
264 * distinguish different members running on the same host (different
265 * JVMs).
266 *
267 * @exception JgroupException
268 * Raised if we could not obtain a member identifier due to
269 * communication problems with the daemon.
270 */
271 public static MemberId getMemberId()
272 throws JgroupException
273 {
274 try {
275 return daemonService.getMemberId(DistributedSystemConfig.getLocalHost());
276 } catch (RemoteException e) {
277 throw new JgroupException("Unable to obtain my member identifier", e);
278 }
279 }
280
281
282 /**
283 * Adds an event generated by a member to the daemon's event queue.
284 *
285 * @return
286 * True if the event was successfully added; false is returned
287 * if the daemon has crashed, and not recovered.
288 */
289 public static boolean addEvent(Event event)
290 {
291 try {
292 if (log.isDebugEnabled()) {
293 MDC.put("group", "[Group: " + event.getGid() + "]");
294 log.debug("Member->Daemon: " + event);
295 }
296 daemonService.addEvent(event);
297 if (log.isDebugEnabled())
298 log.debug("addEvent returned successfully (M->D)");
299 return true;
300 } catch (RemoteException e) {
301 /*
302 * If the system property 'jgroup.daemon.recover.locally' is set to
303 * true, we try to recover from a crashed daemon by performing the
304 * initDaemon() method. Otherwise, we simply perform a system halt()
305 * to ensure that this JVM shuts itself down immediately. This is
306 * useful to avoid that a member generates "noise" while it is in an
307 * unstable state, and may not recover successfully anyway. This
308 * allows remaining members of the group to quickly detect this
309 * member's failure, and inform the ARM framework of the crash whom
310 * will take the opportune countermeasures.
311 */
312 log.warn("Member failed to send event to Daemon", e);
313 return handleDaemonFailure();
314 }
315 }
316
317
318 /**
319 * Adds an event generated by the daemon to given (local) member's
320 * event queue.
321 *
322 * @param member
323 * The recipient member (Dispatcher) of this event.
324 * @param event
325 * The event to be added to the given member's event queue.
326 * @return
327 * True if communication is ok; false otherwise.
328 */
329 public static boolean addEvent(MemberData member, Event event)
330 {
331 try {
332 if (log.isDebugEnabled()) {
333 log.debug("Daemon->Member(" + member + "): " + event);
334 }
335 RemoteDispatcher dispatcher = member.getRemoteDispatcher();
336 synchronized (dispatcher) {
337 dispatcher.addEvent(event);
338 }
339 if (log.isDebugEnabled())
340 log.debug("addEvent returned successfully (D->M)");
341 return true;
342 } catch (RemoteException e) {
343 /*
344 * Return false to notify the daemon that communication
345 * with the member has failed. If the member is local
346 * (i.e., resides in the same virtual machine), this can
347 * never happend. Otherwise, the daemon will take the
348 * opportune actions to install a new view. Exception
349 * is printed in the debug log to enable to check for
350 * remote exceptions not related to failed communication.
351 */
352 log.warn("Daemon failed to send event to " + member, e);
353 return false;
354 }
355 }
356
357
358 /**
359 * Handle daemon-member interaction failures. <p>
360 *
361 * If the system property 'jgroup.daemon.recover.locally' is true,
362 * and there is a confirmed daemon crash, local recovery is performed.
363 * Otherwise, this member will commit suicide.
364 *
365 * @return
366 * True is returned if a new daemon was successfully initialized.
367 */
368 public static boolean handleDaemonFailure()
369 {
370 if (DaemonInteraction.suspectDaemon()) {
371 /*
372 * The daemon was correctly suspected. Check if we should
373 * try to recover locally, or leave it to ARM. Default is
374 * to not try to recover the daemon locally.
375 */
376 if (Boolean.getBoolean("jgroup.daemon.recover.locally")) {
377 /*
378 * Try to recover a Jgroup daemon on the localhost.
379 */
380 if (log.isDebugEnabled())
381 log.debug("Trying to recover daemon locally");
382 try {
383 initDaemon();
384 return true;
385 } catch (JgroupException e) {
386 log.warn("Local daemon recovery failed.", e);
387 return false;
388 }
389 } else {
390 /*
391 * By default, we do not try to recover the daemon locally;
392 * instead we ensure to crash this JVM so that ARM can handle
393 * this crash.
394 */
395 log.error("Crashing this member dispatcher since the daemon crashed");
396 // Ensure that log files are closed
397 LogManager.shutdown();
398 Runtime.getRuntime().halt(13);
399 return false;
400 }
401 } else {
402 /*
403 * The daemon is still alive, but was suspected;
404 * appears not to be pinging me anymore.
405 */
406 log.error("DUMPING THREADS");
407 for (int i = 0; i < 10; i++) {
408 ThreadMonitor.dumpAllThreads();
409 try {
410 Thread.sleep(200);
411 } catch (InterruptedException e) { }
412 }
413 log.error("COMMITTING SUICIDE");
414 Runtime.getRuntime().halt(13);
415 return false;
416 }
417 }
418
419
420 /**
421 * Method used to indicate that the daemon has not responded within
422 * some predefined timeout. Here we try to recover by installing
423 * a new daemon locally.
424 *
425 * @return true if the daemon is not responding, false otherwise.
426 */
427 public static boolean suspectDaemon()
428 {
429 assert daemonService != null : "DaemonService should not be null";
430 try {
431 int members = daemonService.members();
432 if (log.isDebugEnabled())
433 log.debug("Daemon is alive with members: "+ members);
434 return false;
435 } catch (RemoteException e1) {
436 if (log.isDebugEnabled()) {
437 log.debug("Daemon correctly suspected", e1);
438 }
439 return true;
440 }
441 }
442
443 } // END DaemonInteraction