[Date Prev][Date Next]
[Chronological]
[Thread]
[Top]
(ITS#7925) mdb_from_db Berkeley DB importer
Full_Name: Jonathan Graehl
Version: commit 8d346721a60684aeaa7b1e3b2111c972393bfad3
OS: linux
URL:
Submission from: (NULL) (75.85.99.117)
mdb_from_db Berkeley DB->LMDB import utility
See also https://github.com/openldap/openldap/pull/1
>From 32f6c10570bf7ede64cdb734775e97ea2afe1011 Mon Sep 17 00:00:00 2001
From: graehl <graehl@gmail.com>
Date: Sun, 24 Aug 2014 16:02:41 -0700
Subject: [PATCH] mdb_from_db Berkeley DB->LMDB import
---
libraries/liblmdb/Makefile | 3 +-
libraries/liblmdb/mdb_from_db.1 | 104 ++++++++
libraries/liblmdb/mdb_from_db.c | 548 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 654 insertions(+), 1 deletion(-)
create mode 100644 libraries/liblmdb/mdb_from_db.1
create mode 100644 libraries/liblmdb/mdb_from_db.c
diff --git a/libraries/liblmdb/Makefile b/libraries/liblmdb/Makefile
index 25c1095..196ed08 100644
--- a/libraries/liblmdb/Makefile
+++ b/libraries/liblmdb/Makefile
@@ -29,7 +29,7 @@ prefix = /usr/local
IHDRS = lmdb.h
ILIBS = liblmdb.a liblmdb.so
-IPROGS = mdb_stat mdb_copy mdb_dump mdb_load
+IPROGS = mdb_stat mdb_copy mdb_dump mdb_load mdb_from_db
IDOCS = mdb_stat.1 mdb_copy.1 mdb_dump.1 mdb_load.1
PROGS = $(IPROGS) mtest mtest2 mtest3 mtest4 mtest5
all: $(ILIBS) $(PROGS)
@@ -58,6 +58,7 @@ mdb_stat: mdb_stat.o liblmdb.a
mdb_copy: mdb_copy.o liblmdb.a
mdb_dump: mdb_dump.o liblmdb.a
mdb_load: mdb_load.o liblmdb.a
+mdb_from_db: mdb_from_db.o liblmdb.a
mtest: mtest.o liblmdb.a
mtest2: mtest2.o liblmdb.a
mtest3: mtest3.o liblmdb.a
diff --git a/libraries/liblmdb/mdb_from_db.1 b/libraries/liblmdb/mdb_from_db.1
new file mode 100644
index 0000000..dbd6797
--- /dev/null
+++ b/libraries/liblmdb/mdb_from_db.1
@@ -0,0 +1,104 @@
+.TH MDB_FROM_DB 1 "2014/06/20" "LMDB 0.9.14"
+.\" Copyright 2014 Howard Chu, Symas Corp. All Rights Reserved.
+.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
+.SH NAME
+mdb_from_db \- LMDB environment import tool for Berkeley DB databases
+.SH SYNOPSIS
+.B mdb_from_db
+.BR \ berkeley.db
+.BR \ envpath
+[\c
+.BR \-V ]
+[\c
+.BR \-n ]
+[\c
+.BI \-s \ subdb\fR]
+[\c
+.BI \-b \ batchsize\fR]
+[\c
+.BI \-h \ berkeley-db-homedir\fR]
+[\c
+.BR \-N ]
+[\c
+.BR \-T ]
+.SH DESCRIPTION
+The
+.B mdb_from_db
+utility reads from a Berkeley DB environment
+.BR berkeley.db
+and imports all its subdatabases, or just the specified
+.BR subdb ,
+into the
+LMDB environment
+.BR envpath .
+
+Additionally,
+.B mdb_from_db
+may write in the
+.B -T
+plain text format understood by
+.BR mdb_load (1)
+which can only understand a single subdatabase at a time.
+
+.SH OPTIONS
+.TP
+.BR \-V
+Write the library version number to the standard output, and exit.
+.TP
+.BR \-n
+Import into an LMDB database which does not use subdirectories.
+.TP
+.BR \-s \ subdb
+Import only the named subdatabase. If no subdatabase is specified, all
+subdatabases are imported; a Berkeley DB file without subdatabases is
+imported into the main database.
+.TP
+.BR \-N
+Don't overwrite existing records when importing into an already existing
+database; just skip them.
+.TP
+.BR \-b \ sz
+Commit LMDB records
+.B sz
+at a time.
+.TP
+.BR \-h \ db_homedir
+Treat input db path as relative to this homedir (see the Berkeley DB docs).
+Default is '.'
+.TP
+.BR \-B
+Perform a nonblocking Berkeley DB open.
+.TP
+.BR \-T
+Write the key/data pairs as simple text to standard output (the
+per-subdatabase messages on stderr allow segmenting the output into
+separate files for each subdatabase). The output is paired lines of
+text, where the first line of the pair is the key item, and the second
+line of the pair is its corresponding data item. If more than one
+database is read then refer to the counts reported on stderr.
+
+A simple escape mechanism, where newline and backslash (\\) characters
+are special, is applied to the text output. Newline characters are
+interpreted as record separators. Backslash characters in the text
+will be interpreted in one of two ways: If the backslash character
+precedes another backslash character, the pair will be interpreted as
+a literal backslash. If the backslash character precedes any other
+character, the two characters following the backslash will be
+interpreted as a hexadecimal specification of a single character; for
+example, \\0a is a newline character in the ASCII character set.
+
+For this reason, any backslash or newline characters that naturally
+occur in the data are escaped in the output to avoid misinterpretation by
+.BR mdb_load (1).
+
+.SH DIAGNOSTICS
+Exit status is zero if no errors occur.
+Errors result in a non-zero exit status and
+a diagnostic message being written to standard error.
+
+Information about each subdatabase processed, and the total number of
+records is also written to standard error.
+
+.SH "SEE ALSO"
+.BR mdb_load (1)
+.BR mdb_dump (1)
+.BR db_dump (1)
+
+.SH AUTHOR
+Jonathan Graehl <graehl@gmail.com>
diff --git a/libraries/liblmdb/mdb_from_db.c b/libraries/liblmdb/mdb_from_db.c
new file mode 100644
index 0000000..ee81db0
--- /dev/null
+++ b/libraries/liblmdb/mdb_from_db.c
@@ -0,0 +1,548 @@
+/* mdb_from_db.c - translate Berkeley DB to memory-mapped database(s) */
+/*
+ * Copyright 2014 Jonathan Graehl, 2011-2014 Howard Chu, Symas Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted only as authorized by the OpenLDAP
+ * Public License.
+ *
+ * A copy of this license is available in the file LICENSE in the
+ * top-level directory of the distribution or, alternatively, at
+ * <http://www.OpenLDAP.org/license.html>.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <ctype.h>
+#include <unistd.h>
+#include "lmdb.h"
+#include <db.h>
+#include <stdbool.h>
+#include <sys/stat.h>
+
+/* Initial size, in bytes, of the LMDB data scratch buffer (dbuf). */
+static int datasize = 64*1024;
+/* Number of LMDB puts per write transaction; overridden by -b. */
+static int batchsize = 100;
+
+/* Name of the sub-database being translated (-s), or NULL for none/all. */
+static char *subname = NULL;
+
+/* Program name (argv[0]) used as a prefix in diagnostics. */
+static char *prog;
+
+/* Scratch key/data buffers allocated in main() when an LMDB output is given. */
+static MDB_val kbuf, dbuf;
+
+/* printf length modifier for size_t, which differs on Windows. */
+#ifdef _WIN32
+#define Z "I"
+#else
+#define Z "z"
+#endif
+
+
+/* Usage text printed after "usage: <prog> " by usage(). */
+char *usagestr =
+    "path.input.berkeley.db [path.output.mdb|T-txt] [-P dbpasswd] [-V] [-l] "
+    "[-n] [-s subdbname] [-N] [-B] [-T] [-v] [-h homedirpath] [-b write-batchsize] "
+    "[-f redirect_stdout.txt]\n"
+    " (-T prints to stdout key/val in mdb_load format)\n"
+    " (-l: only list (to stdout) database names, -N: don't overwrite existing "
+    "keys; -B nonblocking db open; -n: create single mdb file instead of dir)\n"
+    ;
+
+/**
+   fail() and shutdown(): everything that might need to be cleaned up on
+   error->exit
+   (conceptually some of these are locals, but having them global lets us call
+   fail() from anywhere)
+*/
+/* LMDB side: output environment, current write transaction, current database. */
+static MDB_env *env;
+static MDB_txn *txn;
+static MDB_dbi dbi;
+
+/* Berkeley DB side: database being read and the cursor iterating over it. */
+static DB *dbp;
+static DBC *dbcp;
+/* Master database and its cursor, used when the input holds several sub-databases. */
+static DB *parent_dbp;
+static DBC *bdb_subdbcursor;
+/* Heap copy of the current sub-database name (owned here, freed by shutdown()). */
+static char *subdbname;
+/* Berkeley DB transaction passed to DB->open(); never assigned in this file. */
+static DB_TXN *dbtxn;
+
+/**
+   Close the current Berkeley DB data cursor (dbcp) and database handle (dbp)
+   if they are open, and reset both to NULL so the call is safe to repeat.
+*/
+void bdb_close(void) {
+    if (dbcp)
+        dbcp->close(dbcp);
+    dbcp = NULL;
+    if (dbp)
+        dbp->close(dbp, 0);
+    dbp = NULL;
+}
+/**
+   Release every resource tracked by the file-level handles, in dependency
+   order: sub-database cursor, current database, master database, then the
+   LMDB transaction, database handle and environment, and finally the
+   sub-database name. Each handle is reset afterwards, so calling this more
+   than once is harmless.
+*/
+void shutdown(void) {
+    if (bdb_subdbcursor)
+        bdb_subdbcursor->close(bdb_subdbcursor);
+    bdb_subdbcursor = NULL;
+    bdb_close();
+    if (parent_dbp)
+        parent_dbp->close(parent_dbp, 0);
+    parent_dbp = NULL;
+    /* An uncommitted write transaction is discarded, not committed. */
+    if (txn)
+        mdb_txn_abort(txn);
+    txn = NULL;
+    if (env && dbi)
+        mdb_dbi_close(env, dbi);
+    dbi = 0;
+    if (env)
+        mdb_env_close(env);
+    env = NULL;
+    free(subdbname);
+    subdbname = NULL;
+}
+
+/**
+   Clean up via shutdown() and terminate the process with EXIT_FAILURE.
+   Does not return.
+*/
+void fail(void) {
+    shutdown();
+    exit(EXIT_FAILURE);
+}
+
+/**
+   Print the usage line (program name followed by usagestr) to stderr and
+   exit through fail(); does not return.
+*/
+static void usage(void)
+{
+    fprintf(stderr, "usage: %s %s", prog, usagestr);
+    fail();
+}
+
+/**
+ BDB env
+*/
+/* Berkeley DB home directory (-h); passed to DB_ENV->open(). */
+static char *dbhome;
+/* Berkeley DB environment handle created by bdb_init_dbenv(). */
+static DB_ENV *dbenv;
+/* Cache size in bytes; also the initial size of the bulk read buffer. */
+static u_int32_t dbcache = 1024*1024;
+/* Encryption password (-P); overwritten with NULs once handed to set_encrypt. */
+static char *dbpasswd;
+/* Set by -B: open the environment with DB_NOLOCKING and DB_NOPANIC. */
+static bool dbnonblocking;
+/**
+   Overwrite every character of the NUL-terminated string str with fill.
+   The length is determined by scanning the original contents, so filling
+   with '\0' still wipes the whole string.
+*/
+void strfill(char *str, char fill) {
+    char *p;
+    for (p = str; *p != '\0'; ++p)
+        *p = fill;
+}
+/**
+   Report a Berkeley DB failure and exit through fail(); does not return.
+
+   fn: name of the failing call or file, printed verbatim.
+   rc: Berkeley DB error code.
+
+   fn is passed as a "%s" argument, never as the format itself, because
+   callers hand in user-supplied file names.
+*/
+void bdb_err(const char *fn, int rc) {
+    fprintf(stderr, "%s: ", prog);
+    if (dbenv)
+        dbenv->err(dbenv, rc, "%s", fn);
+    else
+        fprintf(stderr, "%s: %s\n", fn, db_strerror(rc));
+    fail();
+}
+
+/* Key and data DBTs shared by all Berkeley DB cursor reads. */
+DBT dbkey, dbdata;
+
+/**
+   Create and open the private Berkeley DB environment used for reading, and
+   allocate the user-memory buffer (dbdata) that cursor reads fill.
+   Any failure is reported and ends the process.
+*/
+void bdb_init_dbenv(void) {
+    int rc;
+    if ((rc = db_env_create(&dbenv, 0)) != 0)
+        bdb_err("db_env_create", rc);
+    dbenv->set_errfile(dbenv, stderr);
+    dbenv->set_errpfx(dbenv, prog);
+    if (dbpasswd != NULL) {
+        rc = dbenv->set_encrypt(dbenv, dbpasswd, DB_ENCRYPT_AES);
+        /* Wipe the password from argv whether or not set_encrypt succeeded. */
+        strfill(dbpasswd, '\0');
+        if (rc)
+            bdb_err("dbenv::set_encrypt", rc);
+    }
+    if (dbnonblocking) {
+        if ((rc = dbenv->set_flags(dbenv, DB_NOLOCKING, 1)))
+            bdb_err("DB_NOLOCKING", rc);
+        if ((rc = dbenv->set_flags(dbenv, DB_NOPANIC, 1)))
+            bdb_err("DB_NOPANIC", rc);
+    }
+    if ((rc = dbenv->set_cachesize(dbenv, 0, dbcache, 1)))
+        bdb_err("dbenv::set_cachesize", rc);
+    if ((rc = dbenv->open(dbenv, dbhome,
+                          DB_CREATE | DB_INIT_MPOOL | DB_PRIVATE | DB_USE_ENVIRON, 0)))
+        bdb_err("dbenv::open", rc);
+    /* Reads go into our own buffer, initially as large as the cache. */
+    dbdata.flags = DB_DBT_USERMEM;
+    dbdata.ulen = dbcache;
+    dbdata.data = malloc(dbdata.ulen);
+    if (!dbdata.data) {
+        fprintf(stderr, "%s: out of memory allocating %lu byte read buffer\n",
+                prog, (unsigned long)dbcache);
+        fail();
+    }
+}
+
+/**
+ BDB db.
+*/
+/* Path of the Berkeley DB input file (first positional argument). */
+static char *dbfilename;
+/* Key and data of the record most recently extracted from a bulk chunk. */
+static DBT keyret, dataret;
+/* True when the current database is DB_RECNO or DB_QUEUE (record-number keys). */
+static bool bdb_is_recno;
+/* Storage for the record-number key of the current record. */
+static db_recno_t bdb_recno;
+/* Storage for the key when reading a DB_HEAP database. */
+static DB_HEAP_RID bdb_heaprid;
+/* Flags passed to DBC->get() by bdb_read_chunk(). */
+static int bdb_get_flags;
+/* Iteration state for the DB_MULTIPLE_* macros over the current chunk. */
+static void *pointer_get;
+
+/**
+   Open database dbname inside the input file dbfilename read-only, replacing
+   whatever database was open before. The handle is stored in dbp.
+
+   dbname: sub-database name, or NULL for the file's unnamed/master database.
+
+   On failure the error is reported and the process exits.
+*/
+void bdb_open(char *dbname) {
+    int rc;
+    bdb_close();
+    if ((rc = db_create(&dbp, dbenv, 0)))
+        bdb_err("db_create", rc);
+    /* DB_RDWRMASTER is only requested while no master database is held yet. */
+    if ((rc = dbp->open(dbp, dbtxn, dbfilename, dbname, DB_UNKNOWN,
+                        (parent_dbp ? 0 : DB_RDWRMASTER) | DB_RDONLY, 0))) {
+        /* dbname may be NULL; never pass NULL to %s. */
+        fprintf(stderr, "db open %s : %s\n", dbfilename,
+                dbname ? dbname : "(unnamed)");
+        bdb_err(dbfilename, rc);
+    }
+}
+
+/**
+   Prepare to iterate over the database open in dbp: choose the DBC->get()
+   flags for its access method, point the key DBT at the matching fixed-size
+   key storage, and open the read cursor dbcp.
+
+   The key pointers and sizes are assigned with ordinary statements. Writing
+   the assignment inside sizeof() would never execute it, because sizeof does
+   not evaluate its operand.
+
+   NOTE(review): for DB_HEAP the get flags omit DB_MULTIPLE_KEY, while
+   bdb_read_chunk()/bdb_next_record_in_chunk() still decode the buffer with
+   the DB_MULTIPLE_* macros - confirm heap databases are handled correctly.
+*/
+void bdb_start_chunks(void) {
+    int rc;
+    bdb_get_flags = DB_NEXT | DB_MULTIPLE_KEY;
+    bdb_is_recno = (dbp->type == DB_RECNO || dbp->type == DB_QUEUE);
+    if (bdb_is_recno) {
+        keyret.data = &bdb_recno;
+        keyret.size = sizeof bdb_recno;
+    } else if (dbp->type == DB_HEAP) {
+        bdb_get_flags = DB_NEXT;
+        dbkey.flags = DB_DBT_USERMEM;
+        dbkey.data = &bdb_heaprid;
+        dbkey.size = dbkey.ulen = sizeof bdb_heaprid;
+    }
+    if ((rc = dbp->cursor(dbp, NULL, &dbcp, 0)))
+        bdb_err("cursor", rc);
+}
+
+/**
+   Round req up to the next multiple of granule (req itself if it already is
+   one). granule must be non-zero.
+*/
+unsigned align(unsigned req, unsigned granule) {
+    unsigned chunks = (req + granule - 1) / granule;
+    return chunks * granule;
+}
+
+/**
+   Read the next chunk of records through the cursor dbcp into dbdata and
+   reset the chunk iterator (pointer_get).
+
+   If Berkeley DB reports DB_BUFFER_SMALL, the buffer is grown to the size it
+   asked for, rounded up to a multiple of 4096, and the read is retried once.
+
+   \return true if there's another chunk of records, false at end of data.
+   Any other error is reported and ends the process.
+*/
+bool bdb_read_chunk(void) {
+    int rc = dbcp->get(dbcp, &dbkey, &dbdata, bdb_get_flags);
+    if (rc == DB_NOTFOUND)
+        return false;
+    if (rc == DB_BUFFER_SMALL) {
+        u_int32_t need = align(dbdata.size, 4096);
+        /* Keep the old pointer until realloc is known to have succeeded. */
+        void *grown = realloc(dbdata.data, need);
+        if (!grown) {
+            fprintf(stderr, "%s: out of memory growing read buffer to %lu bytes\n",
+                    prog, (unsigned long)need);
+            fail();
+        }
+        dbdata.data = grown;
+        dbdata.ulen = dbdata.size = need;
+        rc = dbcp->get(dbcp, &dbkey, &dbdata, bdb_get_flags);
+    }
+    if (rc)
+        bdb_err("get chunk", rc);
+    DB_MULTIPLE_INIT(pointer_get, &dbdata);
+    return true;
+}
+
+/**
+   Advance to the next record of the chunk loaded by bdb_read_chunk().
+   For record-number databases the key goes to bdb_recno; otherwise key and
+   data are both taken from the chunk.
+
+   \return true if there was another record; sets keyret and dataret.
+*/
+bool bdb_next_record_in_chunk() {
+    if (!bdb_is_recno) {
+        DB_MULTIPLE_KEY_NEXT(pointer_get, &dbdata,
+                             keyret.data, keyret.size,
+                             dataret.data, dataret.size);
+    } else {
+        DB_MULTIPLE_RECNO_NEXT(pointer_get, &dbdata,
+                               bdb_recno, dataret.data, dataret.size);
+    }
+    /* A NULL data pointer after the macro means no record was produced. */
+    return dataret.data != NULL;
+}
+
+/* Hex digit for each nibble value 0..15. */
+static const char hexc_[] = "0123456789ABCDEF";
+
+/**
+   Return the upper-case hexadecimal digit for the low four bits of i.
+   Higher bits are ignored, so the table is never indexed out of range.
+*/
+char hexc(unsigned char i) {
+    return hexc_[i & 0xf];
+}
+
+/**
+   Write one byte to stdout in the text dump format: printable ASCII other
+   than backslash is written as-is; everything else is written as a backslash
+   followed by two hex digits.
+*/
+void putchar_T(unsigned char c) {
+    const bool needs_escape = c < 32 || c >= 127 || c == '\\';
+    if (needs_escape) {
+        putchar('\\');
+        putchar(hexc(c >> 4));
+        putchar(hexc(c & 0xf));
+        return;
+    }
+    putchar(c);
+}
+
+/**
+   Write len bytes starting at data to stdout, escaping each byte with
+   putchar_T().
+
+   TODO: could fwrite chunks of no-escape-needed bytes, or probably faster,
+   encode in memory then write once
+*/
+void print_T(char *data, unsigned len) {
+    const char *next = data;
+    unsigned remaining = len;
+    while (remaining-- > 0)
+        putchar_T(*next++);
+}
+
+/**
+   Write the current record (keyret, dataret) to stdout as a pair of text
+   lines: the first line is the key item and the second line is its
+   corresponding data item.
+
+   Each item is passed through print_T(), which writes any byte outside
+   printable ASCII, and the backslash itself, as a backslash followed by two
+   hexadecimal digits (for example, a newline becomes \0A). A newline in the
+   output is therefore always a record separator, never part of an item.
+*/
+void print_record_T() {
+    DBT *items[2] = { &keyret, &dataret };
+    int n;
+    for (n = 0; n < 2; ++n) {
+        print_T(items[n]->data, items[n]->size);
+        putchar('\n');
+    }
+}
+
+
+/**
+   Open the sub-database whose name is held (without a NUL terminator) in key.
+
+   The name is copied into the heap string subdbname, replacing and freeing
+   the copy left by any previous call, and the database is opened with
+   bdb_open().
+
+   \return subdbname; it stays owned by this file and is valid until the
+   next call or shutdown().
+*/
+char *bdb_open_subdb(DBT key) {
+    free(subdbname);
+    subdbname = malloc((size_t)key.size + 1);
+    if (!subdbname) {
+        fprintf(stderr, "%s: out of memory copying sub-database name\n", prog);
+        fail();
+    }
+    memcpy(subdbname, key.data, key.size);
+    subdbname[key.size] = '\0';
+    bdb_open(subdbname);
+    return subdbname;
+}
+
+/**
+   \return true if path names a directory. If path cannot be stat()ed, the
+   error is reported against the path itself and the process exits.
+*/
+bool isdir(char *path) {
+    struct stat s;
+    if (stat(path, &s)) {
+        perror(path);
+        fail();
+    }
+    return S_ISDIR(s.st_mode);
+}
+
+/**
+   Make sure path exists as a directory, creating it with mode 0755 if
+   necessary. An already existing path is accepted only if it is a directory.
+   Any failure is reported and ends the process.
+*/
+void mkdir_if_needed(char *path) {
+    if (mkdir(path, 0755) && errno != EEXIST) {
+        perror(path);
+        fail();
+    }
+    if (!isdir(path)) {
+        fprintf(stderr, "%s is not a directory and can't mkdir it. "
+                "try with -n for no-subdir (to store as a file)\n", path);
+        fail();
+    }
+}
+
+/**
+   Translate a Berkeley DB file into an LMDB environment and/or an
+   mdb_load-style text dump.
+
+   Positional arguments: the Berkeley DB input file, then optionally the LMDB
+   output path. Without an output path nothing is written to LMDB (useful
+   with -T or -l).
+
+   \return EXIT_SUCCESS, or EXIT_FAILURE if an LMDB call failed. Berkeley DB
+   and usage errors exit directly through fail().
+*/
+int main(int argc, char *argv[])
+{
+    /* rc starts at 0 so the text/list-only paths, which never assign it,
+       still return a defined status. */
+    int i, rc = 0;
+    MDB_cursor *mc = NULL;
+    int envflags = 0, putflags = 0;
+    int textflag = false;
+    bool havemultiple;
+
+    prog = argv[0];
+
+    if (argc < 2) {
+        usage();
+    }
+
+    /* -n: use NOSUBDIR flag on env_open
+     * -S do not use NOSUBDIR
+     * -s subDB: translate just named subDB (default: all)
+     * -N: use NOOVERWRITE on puts
+     * -V: print version and exit
+     * -T: print -s database in format suitable for mdb_load -T (then output not required)
+     * -l: only list database names on stdout
+     * -b N: batch size=N (default 100)
+     * -f stdout_file: write stdout here instead
+     * -P passwd: Berkeley DB encryption password
+     *
+     * db_dump-like options:
+     * -h dir: ('home' dir for relative db filenames default .)
+     * -B: nonblocking db open
+     *
+     * NOTE(review): -v is accepted by getopt but has no case below, so it
+     * currently ends in usage(); confirm the intended behaviour.
+     */
+    bool subdir = true;
+    bool nodup = true;
+    bool listdbs = false;
+
+    /* "f:" and "B" must be listed here or their cases below are unreachable. */
+    while ((i = getopt(argc, argv, "P:h:s:b:f:lnvVTNSB")) != EOF) {
+        switch(i) {
+        case 'b':
+            i = sscanf(optarg, "%d", &batchsize);
+            if (i != 1) {
+                fprintf(stderr, "ERROR: -b '%s' was not int\n", optarg);
+                usage();
+            }
+            break;
+        case 'f':
+            if (freopen(optarg, "w", stdout) == NULL) {
+                fprintf(stderr, "%s: %s: reopen: %s\n",
+                        prog, optarg, strerror(errno));
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 'V':
+            printf("%s\n", MDB_VERSION_STRING);
+            printf("%s\n", db_version(NULL, NULL, NULL));
+            exit(0);
+            break;
+        case 'S':
+            subdir = true;
+            break;
+        case 'n':
+            subdir = false;
+            break;
+        case 's':
+            /* argv storage outlives main(); no copy (and no leak) needed. */
+            subname = optarg;
+            break;
+        case 'N':
+            nodup = true;
+            putflags = MDB_NOOVERWRITE|MDB_NODUPDATA;
+            break;
+        case 'B':
+            dbnonblocking = true;
+            break;
+        case 'T':
+            textflag = true;
+            break;
+        case 'h':
+            dbhome = optarg;
+            break;
+        case 'P':
+            /**
+               we XXX password immediately on init, to hide from top etc. but
+               would be better to get from stdin (XXX earlier would still be
+               insecure)
+            */
+            dbpasswd = optarg;
+            break;
+        case 'l':
+            listdbs = true;
+            break;
+        case '?':
+        default:
+            usage();
+        }
+    }
+
+    if (!subdir)
+        envflags |= MDB_NOSUBDIR;
+    /* Exactly two remaining arguments means an LMDB output path was given. */
+    bool haveout = optind == argc - 2;
+    if (optind >= argc)
+        usage();
+    dbfilename = argv[optind++];
+    char *mdboutpath = haveout ? argv[optind++] : NULL;
+    if (mdboutpath) {
+        if (subdir)
+            mkdir_if_needed(mdboutpath);
+    }
+    if (listdbs) {
+        if (textflag)
+            fprintf(stderr, "disabling -T (print key/val lines) because -l (list dbs) was specified\n");
+        textflag = false;
+    }
+
+    /**
+       args parsed.
+
+       init BDB:
+    */
+    bdb_init_dbenv();
+    bdb_open(subname);
+
+    /**
+       init MDB:
+    */
+    /* MDB_OK checks the rc assigned on the preceding line and, on failure,
+       reports the named call and jumps to the common cleanup. */
+#undef MDB_OK
+#define MDB_OK(call) \
+    if (rc) { \
+        fprintf(stderr, #call " failed - error %d %s\n", rc, mdb_strerror(rc)); \
+        goto shutdown; \
+    } else {}
+
+    if (mdboutpath) {
+        rc = mdb_env_create(&env);
+        MDB_OK(mdb_env_create);
+
+        rc = mdb_env_set_maxdbs(env, 2);
+        MDB_OK(mdb_env_set_maxdbs);
+
+        rc = mdb_env_open(env, mdboutpath, envflags, 0664);
+        MDB_OK(mdb_env_open);
+
+        /* NOTE(review): kbuf and dbuf are allocated here but not used
+           anywhere else in this file. */
+        kbuf.mv_size = mdb_env_get_maxkeysize(env) * 2 + 2;
+        kbuf.mv_data = malloc(kbuf.mv_size);
+
+        dbuf.mv_size = datasize;
+        dbuf.mv_data = malloc(dbuf.mv_size);
+    }
+
+    /* Without -s, a file holding several databases is walked through its
+       master database, one sub-database per loop iteration below. */
+    havemultiple = !subname && dbp->get_multiple(dbp);
+    if (havemultiple) {
+        parent_dbp = dbp;
+        dbp = 0;
+        if ((rc = parent_dbp->cursor(parent_dbp, NULL, &bdb_subdbcursor, 0)))
+            bdb_err("cursor(sub-dbs)", rc);
+    }
+
+    unsigned long long wnrecords, wnrecordsall = 0;
+    unsigned long long nrecords, nrecordsall = 0;
+    unsigned ndbs = 0;
+    bool const reading = textflag || mdboutpath;
+    for (;;) {
+        MDB_val key, data;
+        int batch = 0;
+        if (havemultiple) {
+            if ((rc = bdb_subdbcursor->get(bdb_subdbcursor, &dbkey, &dbdata,
+                                           DB_NEXT | DB_IGNORE_LEASE))) {
+                if (rc != DB_NOTFOUND)
+                    bdb_err("get-next-subdb", rc);
+                else
+                    rc = 0;
+                break;
+            }
+            subname = bdb_open_subdb(dbkey);
+        }
+
+        ++ndbs;
+        nrecords = 0;
+        wnrecords = 0;
+        if (subname) {
+            if (listdbs)
+                printf("%s\n", subname);
+            if (reading)
+                fprintf(stderr, "reading DB %s ... ", subname);
+        } else {
+            listdbs = false;
+            fprintf(stderr, "reading unnamed DB ... ");
+        }
+
+        if (mdboutpath) {
+            rc = mdb_txn_begin(env, NULL, 0, &txn);
+            MDB_OK(mdb_txn_begin);
+            rc = mdb_open(txn, subname, MDB_CREATE, &dbi);
+            MDB_OK(mdb_open);
+            rc = mdb_cursor_open(txn, dbi, &mc);
+            MDB_OK(mdb_cursor_open);
+        }
+
+        if (reading) {
+            bdb_start_chunks();
+            while (bdb_read_chunk()) {
+                while (bdb_next_record_in_chunk()) {
+                    ++nrecords;
+                    if (textflag)
+                        print_record_T();
+                    if (mdboutpath) {
+                        key.mv_data = keyret.data;
+                        key.mv_size = keyret.size;
+                        data.mv_data = dataret.data;
+                        data.mv_size = dataret.size;
+                        rc = mdb_cursor_put(mc, &key, &data, putflags);
+                        /* Existing keys are skipped, not treated as errors. */
+                        if (rc == MDB_KEYEXIST && nodup)
+                            continue;
+                        MDB_OK(mdb_cursor_put);
+                        ++wnrecords;
+                        if (++batch == batchsize) {
+                            batch = 0;
+                            rc = mdb_txn_commit(txn);
+                            /* The handle is gone after commit; clear it so
+                               shutdown() cannot abort a freed transaction. */
+                            txn = NULL;
+                            MDB_OK(mdb_txn_commit);
+                            rc = mdb_txn_begin(env, NULL, 0, &txn);
+                            MDB_OK(mdb_txn_begin);
+                            rc = mdb_cursor_open(txn, dbi, &mc);
+                            MDB_OK(mdb_cursor_open);
+                        }
+                    }
+                }
+            }
+            if (mdboutpath) {
+                rc = mdb_txn_commit(txn);
+                txn = 0;
+                MDB_OK(mdb_txn_commit);
+                mdb_dbi_close(env, dbi);
+                dbi = 0;
+            }
+            nrecordsall += nrecords;
+            wnrecordsall += wnrecords;
+            fprintf(stderr, "%llu records (stored %llu).\n", nrecords, wnrecords);
+        }
+        if (!havemultiple)
+            break;
+    }
+    fprintf(stderr, "found %u Berkeley DB(s) in input file %s - read %llu records",
+            ndbs, dbfilename, nrecordsall);
+    if (mdboutpath)
+        fprintf(stderr, " (stored %llu to MDB %s).\n", wnrecordsall, mdboutpath);
+    fprintf(stderr, "\n");
+shutdown:
+    shutdown();
+    return rc ? EXIT_FAILURE : EXIT_SUCCESS;
+}
--
1.8.3.GIT