[Date Prev][Date Next] [Chronological] [Thread] [Top]

Re: (ITS#7455) MDB database grows without bound



This is a multi-part message in MIME format.
--------------040901020606020603090609
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit

Howard Chu wrote:
> quanah@OpenLDAP.org wrote:
>> Full_Name: Quanah Gibson-Mount
>> Version: 2.4.33
>> OS: Linux 2.6
>> URL: ftp://ftp.openldap.org/incoming/
>> Submission from: (NULL) (74.196.25.250)
>>
>>
>> I have a very small DB (about 25MB from a fresh slapadd).  However, the data.mdb
>> file grows by about 50MB a day.  I.e., the database size on disk *doubles* every
>> day.  It is now up to 571MB in size after
>>
>>
>> Here is the DB after a fresh slapadd:
>> zimbra@zre-ldap002:~/data/ldap/mdb/db$ du -c -h data.mdb
>> 25M     data.mdb
>>
>> Here is the DB on the production server:
>> [zimbra@ldap01-zcs db]$ du -c -h data.mdb
>> 571M    data.mdb
>
> Based on the mdb_stat output you pasted, this is simply a case of overflow
> pages not reusing freelist pages. The significant info here is the freelist
> info and the number of overflow pages used in the id2e database.

Here's the patch we're currently testing for this issue.
It appears to work, but it may not be aggressive enough in reclaiming
space; we may want to increase the number of retries.

-- 
   -- Howard Chu
   CTO, Symas Corp.           http://www.symas.com
   Director, Highland Sun     http://highlandsun.com/hyc/
   Chief Architect, OpenLDAP  http://www.openldap.org/project/

--------------040901020606020603090609
Content-Type: text/plain; charset=UTF-8;
 name="diff.txt"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="diff.txt"

diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c
index 251ab6a..117b402 100644
--- a/libraries/libmdb/mdb.c
+++ b/libraries/libmdb/mdb.c
@@ -1242,6 +1242,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
 	MDB_page *np;
 	pgno_t pgno = P_INVALID;
 	MDB_ID2 mid;
+	txnid_t oldest = 0, last;
 	int rc;
 
 	*mp = NULL;
@@ -1254,12 +1255,11 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
 		if (!txn->mt_env->me_pghead &&
 			txn->mt_dbs[FREE_DBI].md_root != P_INVALID) {
 			/* See if there's anything in the free DB */
-			int j;
 			MDB_reader *r;
 			MDB_cursor m2;
 			MDB_node *leaf;
 			MDB_val data;
-			txnid_t *kptr, last;
+			txnid_t *kptr;
 
 			mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
 			if (!txn->mt_env->me_pgfirst) {
@@ -1282,15 +1282,21 @@ again:
 				last = *(txnid_t *)key.mv_data;
 			}
 
-			/* Unusable if referred by a meta page or reader... */
-			j = 1;
-			if (last < txn->mt_txnid-1) {
-				j = txn->mt_env->me_txns->mti_numreaders;
-				r = txn->mt_env->me_txns->mti_readers + j;
-				for (j = -j; j && (last<r[j].mr_txnid || !r[j].mr_pid); j++) ;
+			{
+				unsigned int i, nr;
+				txnid_t mr;
+				oldest = txn->mt_txnid - 1;
+				nr = txn->mt_env->me_txns->mti_numreaders;
+				r = txn->mt_env->me_txns->mti_readers;
+				for (i=0; i<nr; i++) {
+					if (!r[i].mr_pid) continue;
+					mr = r[i].mr_txnid;
+					if (mr < oldest)
+						oldest = mr;
+				}
 			}
 
-			if (!j) {
+			if (oldest > last) {
 				/* It's usable, grab it.
 				 */
 				MDB_oldpages *mop;
@@ -1331,29 +1337,108 @@ none:
 		if (txn->mt_env->me_pghead) {
 			MDB_oldpages *mop = txn->mt_env->me_pghead;
 			if (num > 1) {
-				/* FIXME: For now, always use fresh pages. We
-				 * really ought to search the free list for a
-				 * contiguous range.
-				 */
-				;
+				MDB_cursor m2;
+				int retry = 2, readit = 0, n2 = num-1;
+				unsigned int i, j, k;
+
+				/* If current list is too short, must fetch more and coalesce */
+				if (mop->mo_pages[0] < (unsigned)num)
+					readit = 1;
+
+				mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
+				do {
+					if (readit) {
+						MDB_val key, data;
+						MDB_oldpages *mop2;
+						pgno_t *idl;
+						int exact;
+
+						last = mop->mo_txnid + 1;
+
+						/* We haven't hit the readers list yet? */
+						if (!oldest) {
+							MDB_reader *r;
+							unsigned int nr;
+							txnid_t mr;
+
+							oldest = txn->mt_txnid - 1;
+							nr = txn->mt_env->me_txns->mti_numreaders;
+							r = txn->mt_env->me_txns->mti_readers;
+							for (i=0; i<nr; i++) {
+								if (!r[i].mr_pid) continue;
+								mr = r[i].mr_txnid;
+								if (mr < oldest)
+									oldest = mr;
+							}
+						}
+
+						/* There's nothing we can use on the freelist */
+						if (oldest - last < 1)
+							break;
+
+						exact = 0;
+						key.mv_data = &last;
+						key.mv_size = sizeof(last);
+						rc = mdb_cursor_set(&m2, &key, &data, MDB_SET, &exact);
+						if (rc)
+							return rc;
+						idl = (MDB_ID *) data.mv_data;
+						mop2 = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - 2*sizeof(pgno_t) + MDB_IDL_SIZEOF(mop->mo_pages));
+						if (!mop2)
+							return ENOMEM;
+						/* merge in sorted order */
+						i = idl[0]; j = mop->mo_pages[0]; mop2->mo_pages[0] = k = i+j;
+						mop->mo_pages[0] = P_INVALID;
+						while (i>0  || j>0) {
+							if (i && idl[i] < mop->mo_pages[j])
+								mop2->mo_pages[k--] = idl[i--];
+							else
+								mop2->mo_pages[k--] = mop->mo_pages[j--];
+						}
+						txn->mt_env->me_pglast = last;
+						mop2->mo_txnid = last;
+						mop2->mo_next = mop->mo_next;
+						txn->mt_env->me_pghead = mop2;
+						free(mop);
+						mop = mop2;
+						/* Keep trying to read until we have enough */
+						if (mop->mo_pages[0] < (unsigned)num) {
+							continue;
+						}
+					}
+
+					/* current list has enough pages, but are they contiguous? */
+					for (i=mop->mo_pages[0]; i>=(unsigned)num; i--) {
+						if (mop->mo_pages[i-n2] == mop->mo_pages[i] + n2) {
+							pgno = mop->mo_pages[i];
+							i -= n2;
+							/* move any stragglers down */
+							for (j=i+num; j<=mop->mo_pages[0]; j++)
+								mop->mo_pages[i++] = mop->mo_pages[j];
+							mop->mo_pages[0] -= num;
+							break;
+						}
+					}
+
+					/* Stop if we succeeded, or no more retries */
+					if (!retry || pgno != P_INVALID)
+						break;
+					readit = 1;
+					retry--;
+
+				} while (1);
 			} else {
 				/* peel pages off tail, so we only have to truncate the list */
 				pgno = MDB_IDL_LAST(mop->mo_pages);
-				if (MDB_IDL_IS_RANGE(mop->mo_pages)) {
-					mop->mo_pages[2]++;
-					if (mop->mo_pages[2] > mop->mo_pages[1])
-						mop->mo_pages[0] = 0;
+				mop->mo_pages[0]--;
+			}
+			if (MDB_IDL_IS_ZERO(mop->mo_pages)) {
+				txn->mt_env->me_pghead = mop->mo_next;
+				if (mc->mc_dbi == FREE_DBI) {
+					mop->mo_next = txn->mt_env->me_pgfree;
+					txn->mt_env->me_pgfree = mop;
 				} else {
-					mop->mo_pages[0]--;
-				}
-				if (MDB_IDL_IS_ZERO(mop->mo_pages)) {
-					txn->mt_env->me_pghead = mop->mo_next;
-					if (mc->mc_dbi == FREE_DBI) {
-						mop->mo_next = txn->mt_env->me_pgfree;
-						txn->mt_env->me_pgfree = mop;
-					} else {
-						free(mop);
-					}
+					free(mop);
 				}
 			}
 		}

--------------040901020606020603090609--