[Date Prev][Date Next] [Chronological] [Thread] [Top]

Re: No replication after power failure



On Wed, 2007-10-03 at 16:08 +0200, Pierangelo Masarati wrote:
> Stelios Grigoriadis wrote:
> > I am not sure this would be considered a bug, but it is a problem for
> > us. If the master goes down, the replicas have no way of detecting it.
> > When the master is going back up again, all replica servers have to be
> > restarted. Is there a way to avoid this?
> > 
> > Using the KEEPALIVE option (socket or TCP) is not really an option since
> > the default timeout is 2 hours which is too long.
> > 
> > Another would be to have some kind of timeout in the epoll and check if
> > the master is responding, but that timeout is used for the runqueue?
> > 
> > Have you come across this? I was surprised to see that no one has had
> > any issues with it. Am I missing something?
> 
> This was recently discussed (ITS#5133), and the only alternative to 
> SO_KEEPALIVE would be to have some background thread poll the producer 
> on the syncrepl descriptors on a regular basis performing some no-op 
> (like searching the rootDSE requesting 1.1).  Aaron Richton noted that 
> support for SO_KEEPALIVE was added in OpenLDAP 2.3.28.
> 
> p.
> 
> 
> 
> Ing. Pierangelo Masarati
> OpenLDAP Core Team
> 
> SysNet s.r.l.
> via Dossi, 8 - 27100 Pavia - ITALIA
> http://www.sys-net.it
> ---------------------------------------
> Office:  +39 02 23998309
> Mobile:  +39 333 4963172
> Email:   pierangelo.masarati@sys-net.it
> ---------------------------------------
> 
> 

I have solved the problem by inserting a periodic check in the runqueue
(called do_mastercheck). The interval is determined by a slapd.conf
parameter (mastercheckint) in the syncrepl section. The parameter is
optional. If it's not specified, the check is not inserted in the
runqueue. I have tested the code and it seems to work.

The do_mastercheck function just does a dummy search against the master.
I'm supplying a patch (only syncrepl.c is affected) so you can hopefully
improve and incorporate the solution in the code.

/Stelios
--- servers/slapd/syncrepl.c	2007-10-05 15:17:32.000000000 +0200
+++ syncrepl.c	2007-10-05 15:17:38.000000000 +0200
@@ -78,6 +78,7 @@
 	int					si_manageDSAit;
 	int					si_slimit;
 	int					si_tlimit;
+        int                                     si_mastercheck_int;
 	int					si_refreshDelete;
 	int					si_refreshPresent;
 	int					si_syncdata;
@@ -1017,6 +1018,35 @@
 }
 
 static void *
+do_mastercheck(
+	void	*ctx,
+	void	*arg )
+{
+	/* Runqueue task: probe the provider with a no-op rootDSE search, then re-arm. */
+	struct re_s	*rtask = arg;
+	syncinfo_t	*si = ( syncinfo_t * ) rtask->arg;
+	char		*attrs[] = { LDAP_NO_ATTRS, NULL };	/* "1.1": request no attributes */
+	LDAPMessage	*res = NULL;	/* ldap_search_ext_s() stores a message pointer here */
+	int		rc;
+
+	if ( si->si_ld ) {
+		rc = ldap_search_ext_s( si->si_ld, "", LDAP_SCOPE_BASE,
+			"(objectClass=*)", attrs, 0, NULL, NULL, NULL, 0, &res );
+		if ( rc != LDAP_SUCCESS )
+			Debug( LDAP_DEBUG_ANY, "do_mastercheck: search failed (%d)\n", rc, 0, 0 );
+		ldap_msgfree( res );	/* a result may be returned even on error */
+	}
+	ldap_pvt_thread_mutex_lock( &slapd_rq.rq_mutex );
+	if ( ldap_pvt_runqueue_isrunning( &slapd_rq, rtask ) )
+		ldap_pvt_runqueue_stoptask( &slapd_rq, rtask );
+	/* re-arm with the check period (mastercheckint * 60), not the syncrepl interval */
+	rtask->interval.tv_sec = si->si_mastercheck_int * 60;
+	ldap_pvt_runqueue_resched( &slapd_rq, rtask, 0 );
+	ldap_pvt_thread_mutex_unlock( &slapd_rq.rq_mutex );
+	return NULL;
+}
+
+static void *
 do_syncrepl(
 	void	*ctx,
 	void	*arg )
@@ -2772,6 +2802,7 @@
 #define OLDAUTHCSTR		"bindprincipal"
 #define EXATTRSSTR		"exattrs"
 #define MANAGEDSAITSTR		"manageDSAit"
+#define MASTERCHECKINTSTR  "mastercheckint"
 
 /* FIXME: unused */
 #define LASTMODSTR		"lastmod"
@@ -3201,6 +3232,17 @@
 				Debug( LDAP_DEBUG_ANY, "%s: %s.\n", c->log, c->msg, 0 );
 				return 1;
 			}
+		} else if ( !strncasecmp( c->argv[ i ], MASTERCHECKINTSTR "=",
+					STRLENOF( MASTERCHECKINTSTR "=" ) ) )
+		{
+			val = c->argv[ i ] + STRLENOF( MASTERCHECKINTSTR "=" );
+                        if ( lutil_atoi( &si->si_mastercheck_int, val ) != 0 || si->si_mastercheck_int < 0 ) {
+                            snprintf( c->msg, sizeof( c->msg ),
+                                    "invalid master check interval value \"%s\".\n",
+                                    val );
+                            Debug( LDAP_DEBUG_ANY, "%s: %s.\n", c->log, c->msg, 0 );
+                            return 1;
+                        }
 		} else if ( !strncasecmp( c->argv[ i ], SYNCDATASTR "=",
 					STRLENOF( SYNCDATASTR "=" ) ) )
 		{
@@ -3276,6 +3318,7 @@
 	si->si_tlimit = 0;
 	si->si_slimit = 0;
 	si->si_conn_setup = 0;
+        si->si_mastercheck_int = 0;
 
 	si->si_presentlist = NULL;
 	LDAP_LIST_INIT( &si->si_nonpresentlist );
@@ -3304,6 +3347,7 @@
 			SLAP_DBFLAGS(c->be) |= SLAP_DBFLAG_NO_SCHEMA_CHECK;
 		}
 		c->be->be_syncinfo = si;
+                
 		return 0;
 	}
 }
@@ -3438,6 +3482,22 @@
 	ber_dupbv( bv, &bc );
 }
 
+/* Schedule the periodic provider check for this database, if configured. */
+static int add_mastercheck( ConfigArgs *c ) {
+	syncinfo_t *si = c->be->be_syncinfo;
+
+	/* mastercheckint is optional; 0 (the default) means no check is scheduled */
+	if ( si->si_mastercheck_int == 0 )
+		return 0;
+
+	/* mastercheckint is multiplied by 60 to get the runqueue interval.
+	 * NOTE(review): the insert result is not an int status code (confirm
+	 * in ldap_rq.h), so it is no longer compared against 0 here. */
+	ldap_pvt_runqueue_insert( &slapd_rq, (time_t) si->si_mastercheck_int * 60,
+		do_mastercheck, si, "do_mastercheck", c->be->be_suffix[0].bv_val );
+	return 0;
+}
+
 int
 syncrepl_config( ConfigArgs *c )
 {
@@ -3473,5 +3533,7 @@
 	} else if ( add_syncrepl( c ) ) {
 		return(1);
 	}
+        
+        add_mastercheck(c);
 	return config_sync_shadow( c );
 }