
Re: (ITS#7842) mdb readers/writer exclusion protocol, as implemented, is racy.



On 05/16/2014 08:09 PM, Howard Chu wrote:
> rsbx@acm.org wrote:
>> Full_Name: Raymond S Brand
>> Version: mdb.master
>> OS: linux
>> URL: ftp://ftp.openldap.org/incoming/
>> Submission from: (NULL) (107.145.137.13)
>>
>>
>> [I would have uploaded this to ftp.openldap.org but there was no space.]
>>
>> As implemented, the mdb readers/writer exclusion protocol has a race
>> condition that could result in a writer reclaiming and overwriting
>> pages still in use by a reader.
>
> Yes, you're basically correct. But this is unlikely in practice because
> the reader thread has no blocking calls in its codepath, while the
> writer must acquire locks etc. Generally the only way for two write txns
> to complete while a reader thread is stalled is if you explicitly send a
> SIGSTOP to the reader thread while it's in its critical section.

Or just unlucky process scheduling.

>
>>
>> Pseudocode of the snapshot locking protocols of the reader and writer
>> transactions. For the purposes of this analysis, the labeled sections
>> can be assumed to execute atomically.
>>
>> READER
>> ======
>>
>> *R1*    t0 = meta[0].txid;  t1 = meta[1].txid
>>          if (t1 > t0)
>>                  t0 = t1
>>
>> *R2*    readers[me].txid = t0
>>
>> *R3*    snapshot_root = meta[t0&1].snapshot
>>
>> *R4*    /* lookup data */
>>
>> *R5*    readers[me].txid = -1           // Release snapshot
>>
>> WRITER
>> ======
>>
>> *W1*    lock(writer, EXCLUSIVE)
>>          curr_txid = meta[0].txid
>>          if (meta[1].txid > curr_txid)
>>                  curr_txid = meta[1].txid
>>
>> *W2*    oldest = curr_txid
>>          for (i=0; i<reader_slots; i++)
>>                  t = readers[i].txid
>>                  if (t != -1 && t < oldest)
>>                          oldest = t
>>
>> *W3*    reclaim_old_pages(oldest)
>>
>>          /*
>>          ** Commit new transaction
>>          */
>>
>> *W4*    unlock(writer)
>>
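For concreteness, here is roughly how those steps map onto shared state in
C; the names (env_t, reader_slot, INVALID_TXNID) are mine rather than
LMDB's, and the sketch only shows the ordering of the loads and stores that
matters for the race, not the real data structures.

#include <pthread.h>
#include <stdint.h>

#define NREADERS      126
#define INVALID_TXNID ((uint64_t)-1)

typedef struct { uint64_t txid; void *snapshot_root; } meta_t;   /* double-buffered */
typedef struct { volatile uint64_t txid; } reader_slot;

typedef struct {
        meta_t          meta[2];             /* R1/W1 read both; a commit rewrites one */
        reader_slot     readers[NREADERS];   /* R2 publishes here; W2 scans it */
        pthread_mutex_t writer;              /* W1/W4 */
} env_t;

/* READER, steps R1..R5 of the pseudocode above */
void *reader_snapshot(env_t *env, int me)
{
        uint64_t t0 = env->meta[0].txid, t1 = env->meta[1].txid;  /* R1 */
        if (t1 > t0)
                t0 = t1;
        env->readers[me].txid = t0;                               /* R2 */
        void *root = env->meta[t0 & 1].snapshot_root;             /* R3 */
        /* R4: lookups against 'root' happen here; the race is that a writer
         * may have reclaimed these pages before the store in R2 was seen. */
        return root;
}

void reader_release(env_t *env, int me)
{
        env->readers[me].txid = INVALID_TXNID;                    /* R5 */
}

/* WRITER, steps W1..W4 */
void writer_commit(env_t *env, void *new_root)
{
        uint64_t curr, oldest, t;
        int i;

        pthread_mutex_lock(&env->writer);                         /* W1 */
        curr = env->meta[0].txid;
        if (env->meta[1].txid > curr)
                curr = env->meta[1].txid;

        oldest = curr;                                            /* W2 */
        for (i = 0; i < NREADERS; i++) {
                t = env->readers[i].txid;
                if (t != INVALID_TXNID && t < oldest)
                        oldest = t;
        }

        /* W3: reclaim_old_pages(oldest), then install the new snapshot in
         * the meta slot of the opposite parity and bump the txid. */
        env->meta[(curr + 1) & 1].snapshot_root = new_root;
        env->meta[(curr + 1) & 1].txid = curr + 1;

        pthread_mutex_unlock(&env->writer);                       /* W4 */
}
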
>> ---------------------------------------------------------------------------
>>
>>
>> Adversarial scheduling analysis:
>>
>> The following timeline demonstrates that a writer can reclaim pages in
>> use by a reader.
>>
>> T0  T1  T2  T3  T4     T5  T6  T7  T8  T9  T10
>> R1                             R2  R3      R4
>>     W1  W2  W3  W4  *  W1  W2          W3
>>
>> T0      Reader has found the latest txid, Xn.
>>
>> T1->T4  Reader is not scheduled to run and a write transaction
>>          commits, Xn+1.
>>
>> T5      Reader is still not scheduled and another write transaction
>>          starts.
>>
>> T6      Writer finds that the oldest referenced transaction is the last
>>          committed transaction, Xn+1.
>>
>> T7      Reader records Xn in the reader table.
>>
>> T8      Reader gets the snapshot root page for transaction Xn.
>>
>> T9      Writer, believing that Xn+1 is the oldest referenced transaction,
>>          reclaims the pages of transaction Xn.
>>
>> T10     Reader attempts to navigate pages of a transaction, Xn, that
>>          has been reclaimed and "Bad Things Happen".
>>
>> In fact, bad things will happen whenever an even number of write
>> transactions commit between W4 and W1, the window represented by the
>> '*' in the timeline above.
>>
>> The fix is to search for the highest txid in R1 and again between R2
>> and R3. Optionally, this can be followed by recording the txid of the
>> snapshot root found in R3 in the reader's reader-table slot, which may
>> increase the number of reclaimable transactions.
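In terms of the hypothetical C sketch earlier in this message, the repaired
reader registration could look something like the following; the re-check
loop (together with the barrier it relies on, see the next point) is the
substance, not the exact shape of the code.

/* Fixed READER registration: after publishing the slot in R2, re-read the
 * meta txids; if a newer transaction committed meanwhile, publish the newer
 * txid and check again.  Only then is it safe to pick the snapshot in R3. */
void *reader_snapshot_fixed(env_t *env, int me)
{
        uint64_t t0, t1;

        t0 = env->meta[0].txid;                                   /* R1 */
        t1 = env->meta[1].txid;
        if (t1 > t0)
                t0 = t1;

        for (;;) {
                env->readers[me].txid = t0;                       /* R2 */
                /* a full memory barrier is needed here; see below */
                t1 = env->meta[0].txid;
                if (env->meta[1].txid > t1)
                        t1 = env->meta[1].txid;
                if (t1 == t0)
                        break;    /* the published slot is no older than any
                                   * txid a writer could reclaim against */
                t0 = t1;          /* a commit slipped in between; retry */
        }

        return env->meta[t0 & 1].snapshot_root;                   /* R3 */
}
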
>>
>> The lack of compiler and memory barriers in the implementation of the
>> locking protocol is also of concern.
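For illustration, with C11 atomics the publication in R2 and the re-read of
the meta txids would be separated by a sequentially consistent fence (this
is a store-then-load ordering problem, so release/acquire alone does not
suffice); the names below again follow the hypothetical sketch above, and
code of this vintage would more likely use a compiler builtin such as GCC's
__sync_synchronize().

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t reader_txid;     /* stands in for readers[me].txid */
static _Atomic uint64_t meta_txid[2];    /* stands in for meta[0..1].txid  */

/* Publish the reader's txid, force the store to become visible, then
 * re-read the meta txids.  Without the fence the compiler or the CPU may
 * hoist the loads above the store, reintroducing the race. */
static uint64_t reader_publish(uint64_t t0)
{
        uint64_t a, b;

        atomic_store_explicit(&reader_txid, t0, memory_order_relaxed);   /* R2 */
        atomic_thread_fence(memory_order_seq_cst);
        a = atomic_load_explicit(&meta_txid[0], memory_order_relaxed);
        b = atomic_load_explicit(&meta_txid[1], memory_order_relaxed);
        return b > a ? b : a;             /* caller retries if this exceeds t0 */
}
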
>>
>> Beyond the above, the code in mdb_txn_renew0() after the
>>          "/* Copy the DB info and flags */"
>> comment appears to have a number of data races.
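To make the kind of race meant here concrete: a committing writer can
rewrite meta->mm_dbs and meta->mm_last_pg while a read-only transaction is
copying them, so the copy can be torn. One generic way to guard such a
copy, shown purely for illustration (it is not what the patch below does),
is a seqlock-style re-check of the meta txnid; the types are stand-ins, not
LMDB's.

#include <string.h>
#include <stdint.h>

typedef struct { uint64_t md_flags, md_root; } db_info;      /* stand-in for MDB_db   */
typedef struct {
        volatile uint64_t mm_txnid;
        volatile uint64_t mm_last_pg;
        db_info           mm_dbs[2];
} meta_page;                                                  /* stand-in for MDB_meta */

/* Retry the copy until no commit overlapped it.  (A real seqlock would also
 * need memory barriers around the copy; they are omitted here for brevity.) */
static void copy_db_info(const meta_page *meta, db_info out[2], uint64_t *next_pgno)
{
        uint64_t before, after;

        do {
                before = meta->mm_txnid;
                memcpy(out, meta->mm_dbs, sizeof meta->mm_dbs);
                *next_pgno = meta->mm_last_pg + 1;
                after = meta->mm_txnid;
        } while (before != after);        /* changed => a writer committed mid-copy */
}
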
>>
>> ---
>>   libraries/liblmdb/mdb.c | 24 +++++++++++++++---------
>>   1 file changed, 15 insertions(+), 9 deletions(-)
>>
>> diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c
>> index e0f551e..908417c 100644
>> --- a/libraries/liblmdb/mdb.c
>> +++ b/libraries/liblmdb/mdb.c
>> @@ -533,11 +533,11 @@ typedef struct MDB_rxbody {
>>           *      started from so we can avoid overwriting any data used in that
>>           *      particular version.
>>           */
>> -       txnid_t         mrb_txnid;
>> +       volatile txnid_t mrb_txnid;
>>          /** The process ID of the process owning this reader txn. */
>> -       MDB_PID_T       mrb_pid;
>> +       volatile MDB_PID_T mrb_pid;
>>          /** The thread ID of the thread owning this txn. */
>> -       pthread_t       mrb_tid;
>> +       volatile pthread_t mrb_tid;
>>   } MDB_rxbody;
>>
>>          /** The actual reader record, with cacheline padding. */
>> @@ -585,12 +585,12 @@ typedef struct MDB_txbody {
>>                   *      This is recorded here only for convenience; the value can always
>>                   *      be determined by reading the main database meta pages.
>>                   */
>> -       txnid_t         mtb_txnid;
>> +       volatile txnid_t mtb_txnid;
>>                  /** The number of slots that have been used in the reader table.
>>                   *      This always records the maximum count, it is not decremented
>>                   *      when readers release their slots.
>>                   */
>> -       unsigned        mtb_numreaders;
>> +       volatile unsigned mtb_numreaders;
>>   } MDB_txbody;
>>
>>          /** The actual reader table definition. */
>> @@ -854,7 +854,7 @@ typedef struct MDB_meta {
>>          /** Any persistent environment flags. @ref mdb_env */
>>   #define        mm_flags        mm_dbs[0].md_flags
>>          pgno_t          mm_last_pg;                     /**< last used page in file */
>> -       txnid_t         mm_txnid;                       /**< txnid that committed this page */
>> +       volatile txnid_t mm_txnid;              /**< txnid that committed this page */
>>   } MDB_meta;
>>
>>          /** Buffer for a stack-allocated meta page.
>> @@ -2303,10 +2303,12 @@ mdb_txn_renew0(MDB_txn *txn)
>>
>>          if (txn->mt_flags & MDB_TXN_RDONLY) {
>>                  if (!ti) {
>> +                       /* No readers table; app responsible for locking */
>>                          meta = env->me_metas[ mdb_env_pick_meta(env) ];
>>                          txn->mt_txnid = meta->mm_txnid;
>>                          txn->mt_u.reader = NULL;
>>                  } else {
>> +                       /* Has readers table */
>>                          MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
>>                                  pthread_getspecific(env->me_txkey);
>>                          if (r) {
>> @@ -2347,8 +2349,12 @@ mdb_txn_renew0(MDB_txn *txn)
>>                                          return rc;
>>                                  }
>>                          }
>> -                       txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
>>                          txn->mt_u.reader = r;
>> +                       r->mr_txnid = ti->mti_txnid;
>> +
>> +                       /* Really need a memory barrier here */
>> +
>> +                       txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
>>                          meta = env->me_metas[txn->mt_txnid & 1];
>>                  }
>>          } else {
>> @@ -2376,10 +2382,10 @@ mdb_txn_renew0(MDB_txn *txn)
>>          }
>>
>>          /* Copy the DB info and flags */
>> -       memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
>> +       memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));  /* Racy */
>>
>>          /* Moved to here to avoid a data race in read TXNs */
>> -       txn->mt_next_pgno = meta->mm_last_pg+1;
>> +       txn->mt_next_pgno = meta->mm_last_pg+1;                 /* Racy */
>>
>>          for (i=2; i<txn->mt_numdbs; i++) {
>>                  x = env->me_dbflags[i];
>>
>
>