[Date Prev][Date Next] [Chronological] [Thread] [Top]

Re: No replication after power failure



Pierangelo Masarati wrote:
> Stelios Grigoriadis wrote:
>> I am not sure this would be considered a bug, but it is a problem for
>> us. If the master goes down, the replicas have no way of detecting it.
>> When the master is going back up again, all replica servers have to be
>> restarted. Is there a way to avoid this?
>>
>> Using the KEEPALIVE option (socket or TCP) is not really an option since
>> the default timeout is 2 hours which is too long.
>>
>> Another would be to have some kind of timeout in the epoll and check if
>> the master is responding, but that timeout is used for the runqueue?
>>
>> Have you come across this? I was surprised to see that no one has had
>> any issues with it. Am I missing something?
>
> This was recently discussed (ITS#5133), and the only alternative to
> SO_KEEPALIVE would be to have some background thread poll the producer
> on the syncrepl descriptors on a regular basis performing some no-op
> (like searching the rootDSE requesting 1.1).  Aaron Richton noted that
> support for SO_KEEPALIVE was added in OpenLDAP 2.3.28.
>
> p.
>
>
>
> Ing. Pierangelo Masarati
> OpenLDAP Core Team
>
> SysNet s.r.l.
> via Dossi, 8 - 27100 Pavia - ITALIA
> http://www.sys-net.it
> ---------------------------------------
> Office:  +39 02 23998309
> Mobile:  +39 333 4963172
> Email:   pierangelo.masarati@sys-net.it
> ---------------------------------------
>
>
I have solved the problem by inserting a periodic check (called
do_mastercheck) into the runqueue. The period is determined by the
slapd.conf parameter mastercheckint in the syncrepl section. It is
specified in minutes and is optional; if it is not specified, the check
is not scheduled. I have tested it and it seems to work. I'm supplying a
patch (only syncrepl.c is affected) so you can review my solution and
hopefully incorporate it into the code (or, better yet, improve and
submit it).

/Stelios


--- servers/slapd/syncrepl.c	2007-10-05 15:17:32.000000000 +0200
+++ syncrepl.c	2007-10-05 15:17:38.000000000 +0200
@@ -78,6 +78,7 @@
 	int					si_manageDSAit;
 	int					si_slimit;
 	int					si_tlimit;
+        int                                     si_mastercheck_int;
 	int					si_refreshDelete;
 	int					si_refreshPresent;
 	int					si_syncdata;
@@ -1017,6 +1018,35 @@
 }
 
 static void *
+do_mastercheck(
+	void	*ctx,
+	void	*arg )
+{
+	/* Runqueue task: probe the provider with a no-op rootDSE search. */
+	struct re_s *rtask = arg;
+	syncinfo_t *si = ( syncinfo_t * ) rtask->arg;
+	char *search_attrs[] = { LDAP_NO_ATTRS, NULL };	/* "1.1": no attributes */
+	LDAPMessage *res = NULL;	/* must be LDAPMessage *, not int */
+
+	if ( si->si_ld ) {
+		/* Only the attempt matters; the outcome is not acted upon here. */
+		(void) ldap_search_ext_s( si->si_ld, "", LDAP_SCOPE_BASE,
+			"(objectClass=*)", search_attrs, 0, NULL, NULL, NULL, 0, &res );
+		ldap_msgfree( res );	/* release the result instead of leaking it */
+	}
+
+	ldap_pvt_thread_mutex_lock( &slapd_rq.rq_mutex );
+	if ( ldap_pvt_runqueue_isrunning( &slapd_rq, rtask )) {
+		ldap_pvt_runqueue_stoptask( &slapd_rq, rtask );
+	}
+	/* Re-arm with the mastercheck period (minutes), not the syncrepl one. */
+	rtask->interval.tv_sec = si->si_mastercheck_int * 60;
+	ldap_pvt_runqueue_resched( &slapd_rq, rtask, 0 );
+	ldap_pvt_thread_mutex_unlock( &slapd_rq.rq_mutex );
+	return NULL;	/* declared void *, so return a value explicitly */
+}
+
+static void *
 do_syncrepl(
 	void	*ctx,
 	void	*arg )
@@ -2772,6 +2802,7 @@
 #define OLDAUTHCSTR		"bindprincipal"
 #define EXATTRSSTR		"exattrs"
 #define MANAGEDSAITSTR		"manageDSAit"
+#define MASTERCHECKINTSTR  "mastercheckint"
 
 /* FIXME: unused */
 #define LASTMODSTR		"lastmod"
@@ -3201,6 +3232,17 @@
 				Debug( LDAP_DEBUG_ANY, "%s: %s.\n", c->log, c->msg, 0 );
 				return 1;
 			}
+		} else if ( !strncasecmp( c->argv[ i ], MASTERCHECKINTSTR "=",
+					STRLENOF( MASTERCHECKINTSTR "=" ) ) )
+		{
+			val = c->argv[ i ] + STRLENOF( MASTERCHECKINTSTR "=" );
+                        if ( lutil_atoi( &si->si_mastercheck_int, val ) != 0 || si->si_mastercheck_int < 0 ) {
+                            snprintf( c->msg, sizeof( c->msg ),
+                                    "invalid master check interval value \"%s\".\n",
+                                    val );
+                            Debug( LDAP_DEBUG_ANY, "%s: %s.\n", c->log, c->msg, 0 );
+                            return 1;
+                        }
 		} else if ( !strncasecmp( c->argv[ i ], SYNCDATASTR "=",
 					STRLENOF( SYNCDATASTR "=" ) ) )
 		{
@@ -3276,6 +3318,7 @@
 	si->si_tlimit = 0;
 	si->si_slimit = 0;
 	si->si_conn_setup = 0;
+        si->si_mastercheck_int = 0;
 
 	si->si_presentlist = NULL;
 	LDAP_LIST_INIT( &si->si_nonpresentlist );
@@ -3304,6 +3347,7 @@
 			SLAP_DBFLAGS(c->be) |= SLAP_DBFLAG_NO_SCHEMA_CHECK;
 		}
 		c->be->be_syncinfo = si;
+                
 		return 0;
 	}
 }
@@ -3438,6 +3482,22 @@
 	ber_dupbv( bv, &bc );
 }
 
+/* Queue the periodic provider probe if mastercheckint was configured. */
+static int add_mastercheck( ConfigArgs *c ) {
+	int rc;
+	syncinfo_t *si = c->be->be_syncinfo;
+	/* Nothing to do without a syncinfo or with probing disabled (0). */
+	if ( si == NULL || si->si_mastercheck_int == 0 )
+		return 0;
+	/* si_mastercheck_int is configured in minutes; scale it by 60 here. */
+	/* NOTE(review): confirm ldap_pvt_runqueue_insert() really returns int. */
+	rc = ldap_pvt_runqueue_insert( &slapd_rq, si->si_mastercheck_int * 60,
+			do_mastercheck, si, "do_mastercheck", c->be->be_suffix[0].bv_val );
+	if ( rc < 0 )
+		Debug( LDAP_DEBUG_ANY, "failed to add mastercheck task\n", 0, 0, 0 );
+	return rc;
+}
+
 int
 syncrepl_config( ConfigArgs *c )
 {
@@ -3473,5 +3533,7 @@
 	} else if ( add_syncrepl( c ) ) {
 		return(1);
 	}
+        
+        add_mastercheck(c);
 	return config_sync_shadow( c );
 }