Issue 8696 - deadlock with 3-way delta-MMR and syncprov-checkpoint
Summary: deadlock with 3-way delta-MMR and syncprov-checkpoint
Status: VERIFIED FIXED
Alias: None
Product: OpenLDAP
Classification: Unclassified
Component: slapd (show other issues)
Version: 2.4.45
Hardware: All All
Importance: --- normal
Target Milestone: ---
Assignee: OpenLDAP project
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2017-07-20 04:53 UTC by Ryan Tandy
Modified: 2017-12-11 21:03 UTC (History)
0 users

See Also:


Attachments

Note: You need to log in before you can comment on or make changes to this issue.
Description Ryan Tandy 2017-07-20 04:53:36 UTC
Full_Name: Ryan Tandy
Version: 2.4.45
OS: Debian
URL: 
Submission from: (NULL) (24.68.41.160)
Submitted by: ryan


This is rather similar to ITS#8429 (the deadlock is at the same location), but
not similar enough for me to be sure it's the same issue.

cat > slapd.conf << EOF

include /path/to/core.schema
include /path/to/cosine.schema

serverid 1 ldap://:9001
serverid 2 ldap://:9002
serverid 3 ldap://:9003

database mdb
directory db
maxsize 104857600
envflags writemap
index objectClass,cn,entryCSN,entryUUID,uid eq

suffix dc=example,dc=com
rootdn cn=root,dc=example,dc=com
rootpw secret
access to * by * read
sizelimit unlimited

syncrepl rid=1 provider="ldap://:9001" searchbase="dc=example,dc=com"
  type=refreshAndPersist retry="10 +"
  bindmethod=simple binddn="cn=root,dc=example,dc=com" credentials="secret"
  syncdata=accesslog logbase="cn=accesslog"
  logfilter="(&(objectClass=auditWriteObject)(reqResult=0))"

syncrepl rid=2 provider="ldap://:9002" searchbase="dc=example,dc=com"
  type=refreshAndPersist retry="10 +"
  bindmethod=simple binddn="cn=root,dc=example,dc=com" credentials="secret"
  syncdata=accesslog logbase="cn=accesslog"
  logfilter="(&(objectClass=auditWriteObject)(reqResult=0))"

syncrepl rid=3 provider="ldap://:9003" searchbase="dc=example,dc=com"
  type=refreshAndPersist retry="10 +"
  bindmethod=simple binddn="cn=root,dc=example,dc=com" credentials="secret"
  syncdata=accesslog logbase="cn=accesslog"
  logfilter="(&(objectClass=auditWriteObject)(reqResult=0))"

mirrormode on

overlay syncprov
syncprov-checkpoint 10 1
syncprov-reloadhint TRUE

overlay accesslog
logdb cn=accesslog
logops writes
logsuccess true
logpurge 07+00:00 01+00:00

database mdb
directory accesslog
maxsize 104857600
envflags writemap
index entryCSN,objectClass,reqEnd,reqResult,reqStart eq

suffix cn=accesslog
access to * by * read
sizelimit unlimited

overlay syncprov
syncprov-nopresent TRUE
syncprov-reloadhint TRUE

EOF

cat > data.ldif << EOF

dn: dc=example,dc=com
objectClass: domain

dn: uid=u0,dc=example,dc=com
objectclass: account

dn: cn=g0,dc=example,dc=com
objectClass: groupOfNames
member:

EOF

Start up all three slapds and get them synced and settled. I also executed no-op
modifications on each node to ensure every server had CSNs from all the others.

cat > groupmod.ldif << EOF

dn: cn=g0,dc=example,dc=com
add: member
member: uid=u0,dc=example,dc=com

dn: cn=g0,dc=example,dc=com
delete: member
member: uid=u0,dc=example,dc=com

EOF

Execute the above modification on one node and watch the other two. After a few
repetitions, I reliably get one or both nodes hanging.

If I disable syncprov-checkpoint, I cannot reproduce the hang.

Backtrace from a hung node:

Thread 6 (Thread 0x7f77093d0700 (LWP 28817)):
#0  pthread_cond_wait@@GLIBC_2.3.2 () at
../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1  0x0000560c5a9af3b7 in ldap_pvt_thread_cond_wait (cond=0x560c5b9696c0,
mutex=0x560c5b969698) at thr_posix.c:277
#2  0x0000560c5a9add6e in ldap_int_thread_pool_wrapper (xpool=0x560c5b969690) at
tpool.c:683
#3  0x00007f7718a4a494 in start_thread (arg=0x7f77093d0700) at
pthread_create.c:333
#4  0x00007f771878ca8f in clone () at
../sysdeps/unix/sysv/linux/x86_64/clone.S:97

Thread 5 (Thread 0x7f7709bd1700 (LWP 28816)):
#0  pthread_cond_wait@@GLIBC_2.3.2 () at
../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1  0x0000560c5a9af3b7 in ldap_pvt_thread_cond_wait (cond=0x560c5b9696c0,
mutex=0x560c5b969698) at thr_posix.c:277
#2  0x0000560c5a9add6e in ldap_int_thread_pool_wrapper (xpool=0x560c5b969690) at
tpool.c:683
#3  0x00007f7718a4a494 in start_thread (arg=0x7f7709bd1700) at
pthread_create.c:333
#4  0x00007f771878ca8f in clone () at
../sysdeps/unix/sysv/linux/x86_64/clone.S:97

Thread 4 (Thread 0x7f770a3d2700 (LWP 28815)):
#0  pthread_cond_wait@@GLIBC_2.3.2 () at
../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1  0x0000560c5a9af3b7 in ldap_pvt_thread_cond_wait (cond=0x560c5b993fc8,
mutex=0x560c5b993fa0) at thr_posix.c:277
#2  0x0000560c5a9ac942 in ldap_pvt_thread_rmutex_lock (rmutex=0x560c5b993f68,
owner=140149249615616) at rmutex.c:129
#3  0x0000560c5a98bd4c in accesslog_op_mod (op=0x7f770a3d14e0,
rs=0x7f770a3d1120) at accesslog.c:1994
#4  0x0000560c5a941763 in overlay_op_walk (op=0x7f770a3d14e0, rs=0x7f770a3d1120,
which=op_modify, oi=0x560c5b992a00,
    on=0x560c5b993d20) at backover.c:661
#5  0x0000560c5a941a50 in over_op_func (op=0x7f770a3d14e0, rs=0x7f770a3d1120,
which=op_modify) at backover.c:730
#6  0x0000560c5a941b84 in over_op_modify (op=0x7f770a3d14e0, rs=0x7f770a3d1120)
at backover.c:769
#7  0x0000560c5a92ef07 in syncrepl_message_to_op (si=0x560c5b992580,
op=0x7f770a3d14e0, msg=0x7f76f4103bd0)
    at syncrepl.c:2417
#8  0x0000560c5a929f7e in do_syncrep2 (op=0x7f770a3d14e0, si=0x560c5b992580) at
syncrepl.c:1014
#9  0x0000560c5a92c160 in do_syncrepl (ctx=0x7f770a3d1c10, arg=0x560c5b992980)
at syncrepl.c:1565
#10 0x0000560c5a8b11cd in connection_read_thread (ctx=0x7f770a3d1c10, argv=0xc)
at connection.c:1296
#11 0x0000560c5a9ade15 in ldap_int_thread_pool_wrapper (xpool=0x560c5b969690) at
tpool.c:696
#12 0x00007f7718a4a494 in start_thread (arg=0x7f770a3d2700) at
pthread_create.c:333
#13 0x00007f771878ca8f in clone () at
../sysdeps/unix/sysv/linux/x86_64/clone.S:97

Thread 3 (Thread 0x7f770abd3700 (LWP 28814)):
#0  pthread_cond_wait@@GLIBC_2.3.2 () at
../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1  0x0000560c5a9af3b7 in ldap_pvt_thread_cond_wait (cond=0x560c5b9696c0,
mutex=0x560c5b969698) at thr_posix.c:277
#2  0x0000560c5a9add6e in ldap_int_thread_pool_wrapper (xpool=0x560c5b969690) at
tpool.c:683
#3  0x00007f7718a4a494 in start_thread (arg=0x7f770abd3700) at
pthread_create.c:333
#4  0x00007f771878ca8f in clone () at
../sysdeps/unix/sysv/linux/x86_64/clone.S:97

Thread 2 (Thread 0x7f770b3d4700 (LWP 28813)):
#0  0x00007f771878d083 in epoll_wait () at
../sysdeps/unix/syscall-template.S:84
#1  0x0000560c5a8ac6b3 in slapd_daemon_task (ptr=0x560c5bd176e0) at
daemon.c:2539
#2  0x00007f7718a4a494 in start_thread (arg=0x7f770b3d4700) at
pthread_create.c:333
#3  0x00007f771878ca8f in clone () at
../sysdeps/unix/sysv/linux/x86_64/clone.S:97

Thread 1 (Thread 0x7f7719293400 (LWP 28812)):
#0  0x00007f7718a4b6cd in pthread_join (threadid=140149266401024,
thread_return=0x0) at pthread_join.c:90
#1  0x0000560c5a9af2f8 in ldap_pvt_thread_join (thread=140149266401024,
thread_return=0x0) at thr_posix.c:197
#2  0x0000560c5a8ad99c in slapd_daemon () at daemon.c:2932
#3  0x0000560c5a88c105 in main (argc=8, argv=0x7ffd119da7b8) at main.c:1017
Comment 1 Quanah Gibson-Mount 2017-12-11 21:02:45 UTC
changed notes
Comment 2 OpenLDAP project 2017-12-11 21:03:15 UTC
Duplicate of ITS#8752
Comment 3 Quanah Gibson-Mount 2017-12-11 21:03:15 UTC
changed notes
changed state Open to Closed