
MDB microbenchmark



Was reading thru Google's leveldb stuff and found their benchmark page

http://leveldb.googlecode.com/svn/trunk/doc/benchmark.html

I adapted their sqlite test driver for MDB, attached.

On my laptop I get:
violino:/home/software/leveldb> ./db_bench_mdb
MDB:    version MDB 0.9.0: ("September 1, 2011")
Date:           Mon Jul  2 07:17:09 2012
CPU:            4 * Intel(R) Core(TM)2 Extreme CPU Q9300  @ 2.53GHz
CPUCache:       6144 KB
Keys:       16 bytes each
Values:     100 bytes each (50 bytes after compression)
Entries:    1000000
RawSize:    110.6 MB (estimated)
FileSize:   62.9 MB (estimated)
------------------------------------------------
fillseq      :       9.740 micros/op;   11.4 MB/s
fillseqsync  :       8.182 micros/op;   13.5 MB/s (10000 ops)
fillseqbatch :       0.502 micros/op;  220.5 MB/s
fillrandom   :      11.558 micros/op;    9.6 MB/s
fillrandint  :       9.593 micros/op;   10.3 MB/s
fillrandibatch :       6.288 micros/op;   15.8 MB/s
fillrandsync :       8.399 micros/op;   13.2 MB/s (10000 ops)
fillrandbatch :       7.206 micros/op;   15.4 MB/s
overwrite    :      14.253 micros/op;    7.8 MB/s
overwritebatch :       9.075 micros/op;   12.2 MB/s
readrandom   :       0.261 micros/op;
readseq      :       0.079 micros/op; 1392.5 MB/s
readreverse  :       0.085 micros/op; 1301.9 MB/s
fillrand100K :     106.695 micros/op;  894.0 MB/s (1000 ops)
fillseq100K  :      93.626 micros/op; 1018.8 MB/s (1000 ops)
readseq100K  :       0.095 micros/op; 1005185.9 MB/s
readrand100K :       0.368 micros/op;

Compared to leveldb:
violino:/home/software/leveldb> ./db_bench
LevelDB:    version 1.5
Date:       Mon Jul  2 07:18:35 2012
CPU:        4 * Intel(R) Core(TM)2 Extreme CPU Q9300  @ 2.53GHz
CPUCache:   6144 KB
Keys:       16 bytes each
Values:     100 bytes each (50 bytes after compression)
Entries:    1000000
RawSize:    110.6 MB (estimated)
FileSize:   62.9 MB (estimated)
WARNING: Snappy compression is not enabled
------------------------------------------------
fillseq      :       1.752 micros/op;   63.1 MB/s
fillsync     :      13.877 micros/op;    8.0 MB/s (1000 ops)
fillrandom   :       2.836 micros/op;   39.0 MB/s
overwrite    :       3.723 micros/op;   29.7 MB/s
readrandom   :       5.390 micros/op; (1000000 of 1000000 found)
readrandom   :       4.811 micros/op; (1000000 of 1000000 found)
readseq      :       0.228 micros/op;  485.1 MB/s
readreverse  :       0.520 micros/op;  212.9 MB/s
compact      :  439250.000 micros/op;
readrandom   :       3.269 micros/op; (1000000 of 1000000 found)
readseq      :       0.197 micros/op;  560.4 MB/s
readreverse  :       0.438 micros/op;  252.5 MB/s
fill100K     :     504.147 micros/op;  189.2 MB/s (1000 ops)
crc32c       :       4.134 micros/op;  944.9 MB/s (4K per op)
snappycomp   :    6863.000 micros/op; (snappy failure)
snappyuncomp :    8145.000 micros/op; (snappy failure)
acquireload  :       0.439 micros/op; (each op is 1000 loads)

Interestingly enough, MDB wins on the synchronous and large-value (100K) write tests, and it clearly wins on all of the read tests. MDB databases don't require compaction, so that's another win. MDB doesn't do compression, so those tests are disabled.

I haven't duplicated all of the test scenarios described on the web page yet; you can do that yourself with the attached code. It's pretty clear that nothing else even begins to approach MDB's read speed.

MDB sequential write speed is dominated by the memcpy's required for copy-on-write page updates. There's not much that can be done to eliminate that, besides batching writes. For random writes the memcmp's on the key comparisons become more of an issue. The fillrandi* tests use an integer key instead of a string-based key, to show the difference due to key comparison overhead.
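
For reference, here's roughly what the integer-key path looks like (a
minimal sketch against the MDB C API as used in the attached driver;
"env", "value", and "value_size" are assumed to be set up elsewhere,
and error checks are omitted):

  /* Open the database with native integer keys. MDB then compares
   * keys as unsigned ints instead of memcmp'ing 16-byte strings. */
  MDB_txn *txn;
  MDB_dbi dbi;
  MDB_val mkey, mval;
  int ikey = 12345;             /* the key is the int itself */

  mdb_txn_begin(env, NULL, 0, &txn);
  mdb_open(txn, NULL, MDB_INTEGERKEY, &dbi);
  mkey.mv_data = &ikey;
  mkey.mv_size = sizeof(ikey);
  mval.mv_data = value;
  mval.mv_size = value_size;
  mdb_put(txn, dbi, &mkey, &mval, 0);
  mdb_txn_commit(txn);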

For the synchronous writes, MDB is also faster, because it doesn't need to synchronously write a transaction logfile.
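
In the driver, that difference is just an environment flag (sketch;
"sync_mode" and "path" are placeholders):

  /* With MDB_NOSYNC, commits skip the fsync. Without it, every
   * mdb_txn_commit() syncs the data file directly - there is no
   * separate transaction log to flush first. */
  int env_opt = sync_mode ? 0 : MDB_NOSYNC;
  mdb_env_create(&env);
  mdb_env_set_mapsize(env, 1048576*256);
  mdb_env_open(env, path, env_opt, 0664);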

--
  -- Howard Chu
  CTO, Symas Corp.           http://www.symas.com
  Director, Highland Sun     http://highlandsun.com/hyc/
  Chief Architect, OpenLDAP  http://www.openldap.org/project/

// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <ctype.h>
#include <assert.h>
#include <mdb.h>
#include "util/histogram.h"
#include "util/random.h"
#include "util/testutil.h"

// Comma-separated list of operations to run in the specified order
//   Actual benchmarks:
//
//   fillseq       -- write N values in sequential key order in async mode
//   fillrandom    -- write N values in random key order in async mode
//   fillrandint   -- write N values in random binary key order in async mode
//   overwrite     -- overwrite N values in random key order in async mode
//   overwritebatch -- batch overwrite N values in random key order in async mode
//   fillseqsync   -- write N/100 values in sequential key order in sync mode
//   fillseqbatch  -- batch write N values in sequential key order in async mode
//   fillrandsync  -- write N/100 values in random key order in sync mode
//   fillrandbatch -- batch write N values in random key order in async mode
//   fillrandibatch -- batch write N values in random binary key order in async mode
//   fillrand100K  -- write N/1000 100K values in random order in async mode
//   fillseq100K   -- write N/1000 100K values in seq order in async mode
//   readseq       -- read N times sequentially
//   readreverse   -- read N times in reverse order
//   readseq100K   -- read N/1000 100K values in sequential order in async mode
//   readrand100K  -- read N/1000 100K values in random order in async mode
//   readrandom    -- read N times in random order
static const char* FLAGS_benchmarks =
    "fillseq,"
    "fillseqsync,"
    "fillseqbatch,"
    "fillrandom,"
    "fillrandint,"
    "fillrandibatch,"
    "fillrandsync,"
    "fillrandbatch,"
    "overwrite,"
    "overwritebatch,"
    "readrandom,"
    "readseq,"
    "readreverse,"
    "fillrand100K,"
    "fillseq100K,"
    "readseq100K,"
    "readrand100K,"
    ;

// Number of key/values to place in database
static int FLAGS_num = 1000000;

// Number of read operations to do.  If negative, do FLAGS_num reads.
static int FLAGS_reads = -1;

// Size of each value
static int FLAGS_value_size = 100;

// Arrange to generate values that shrink to this fraction of
// their original size after compression
static double FLAGS_compression_ratio = 0.5;

// Print histogram of operation timings
static bool FLAGS_histogram = false;

// Page size. Default 1 KB (not currently used by this driver)
static int FLAGS_page_size = 1024;

// If true, do not destroy the existing database.  If you set this
// flag and also specify a benchmark that wants a fresh database, that
// benchmark will fail.
static bool FLAGS_use_existing_db = false;

// If true, we allow batch writes to occur
static bool FLAGS_transaction = true;

// Use the db with the following name.
static const char* FLAGS_db = NULL;

inline
static void DBSynchronize(MDB_env *env)
{
  int rc;
  // Synchronize will flush writes to disk
  rc = mdb_env_sync(env, 1);
  if (rc) {
    fprintf(stderr, "synchronize error: %s\n", mdb_strerror(rc));
  }
}

namespace leveldb {

// Helper for quickly generating random data.
namespace {
class RandomGenerator {
 private:
  std::string data_;
  int pos_;

 public:
  RandomGenerator() {
    // We use a limited amount of data over and over again and ensure
    // that it is larger than the compression window (32KB), and also
    // large enough to serve all typical value sizes we want to write.
    Random rnd(301);
    std::string piece;
    while (data_.size() < 1048576) {
      // Add a short fragment that is as compressible as specified
      // by FLAGS_compression_ratio.
      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
      data_.append(piece);
    }
    pos_ = 0;
  }

  Slice Generate(int len) {
    if (pos_ + len > data_.size()) {
      pos_ = 0;
      assert(len < data_.size());
    }
    pos_ += len;
    return Slice(data_.data() + pos_ - len, len);
  }
};

static Slice TrimSpace(Slice s) {
  int start = 0;
  while (start < s.size() && isspace(s[start])) {
    start++;
  }
  int limit = s.size();
  while (limit > start && isspace(s[limit-1])) {
    limit--;
  }
  return Slice(s.data() + start, limit - start);
}

}  // namespace

class Benchmark {
 private:
  MDB_env *db_;
  MDB_dbi dbi_;
  int db_num_;
  int num_;
  int reads_;
  double start_;
  double last_op_finish_;
  int64_t bytes_;
  std::string message_;
  Histogram hist_;
  RandomGenerator gen_;
  Random rand_;

  // State kept for progress messages
  int done_;
  int next_report_;     // When to report next

  void PrintHeader() {
    const int kKeySize = 16;
    PrintEnvironment();
    fprintf(stdout, "Keys:       %d bytes each\n", kKeySize);
    fprintf(stdout, "Values:     %d bytes each (%d bytes after compression)\n",
            FLAGS_value_size,
            static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
    fprintf(stdout, "Entries:    %d\n", num_);
    fprintf(stdout, "RawSize:    %.1f MB (estimated)\n",
            ((static_cast<int64_t>(kKeySize + FLAGS_value_size) * num_)
             / 1048576.0));
    fprintf(stdout, "FileSize:   %.1f MB (estimated)\n",
            (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_)
             / 1048576.0));
    PrintWarnings();
    fprintf(stdout, "------------------------------------------------\n");
  }

  void PrintWarnings() {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
    fprintf(stdout,
            "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
            );
#endif
#ifndef NDEBUG
    fprintf(stdout,
            "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
  }

  void PrintEnvironment() {
    fprintf(stderr, "MDB:    version %s\n", MDB_VERSION_STRING);

#if defined(__linux)
    time_t now = time(NULL);
    fprintf(stderr, "Date:           %s", ctime(&now));  // ctime() adds newline

    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
    if (cpuinfo != NULL) {
      char line[1000];
      int num_cpus = 0;
      std::string cpu_type;
      std::string cache_size;
      while (fgets(line, sizeof(line), cpuinfo) != NULL) {
        const char* sep = strchr(line, ':');
        if (sep == NULL) {
          continue;
        }
        Slice key = TrimSpace(Slice(line, sep - 1 - line));
        Slice val = TrimSpace(Slice(sep + 1));
        if (key == "model name") {
          ++num_cpus;
          cpu_type = val.ToString();
        } else if (key == "cache size") {
          cache_size = val.ToString();
        }
      }
      fclose(cpuinfo);
      fprintf(stderr, "CPU:            %d * %s\n", num_cpus, cpu_type.c_str());
      fprintf(stderr, "CPUCache:       %s\n", cache_size.c_str());
    }
#endif
  }

  void Start() {
    start_ = Env::Default()->NowMicros() * 1e-6;
    bytes_ = 0;
    message_.clear();
    last_op_finish_ = start_;
    hist_.Clear();
    done_ = 0;
    next_report_ = 100;
  }

  void FinishedSingleOp() {
    if (FLAGS_histogram) {
      double now = Env::Default()->NowMicros() * 1e-6;
      double micros = (now - last_op_finish_) * 1e6;
      hist_.Add(micros);
      if (micros > 20000) {
        fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
        fflush(stderr);
      }
      last_op_finish_ = now;
    }

    done_++;
    if (done_ >= next_report_) {
      if      (next_report_ < 1000)   next_report_ += 100;
      else if (next_report_ < 5000)   next_report_ += 500;
      else if (next_report_ < 10000)  next_report_ += 1000;
      else if (next_report_ < 50000)  next_report_ += 5000;
      else if (next_report_ < 100000) next_report_ += 10000;
      else if (next_report_ < 500000) next_report_ += 50000;
      else                            next_report_ += 100000;
      fprintf(stderr, "... finished %d ops%30s\r", done_, "");
      fflush(stderr);
    }
  }

  void Stop(const Slice& name) {
    double finish = Env::Default()->NowMicros() * 1e-6;

    // Pretend at least one op was done in case we are running a benchmark
    // that does not call FinishedSingleOp().
    if (done_ < 1) done_ = 1;

    if (bytes_ > 0) {
      char rate[100];
      snprintf(rate, sizeof(rate), "%6.1f MB/s",
               (bytes_ / 1048576.0) / (finish - start_));
      if (!message_.empty()) {
        message_  = std::string(rate) + " " + message_;
      } else {
        message_ = rate;
      }
    }

    fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
            name.ToString().c_str(),
            (finish - start_) * 1e6 / done_,
            (message_.empty() ? "" : " "),
            message_.c_str());
    if (FLAGS_histogram) {
      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
    }
    fflush(stdout);
  }

 public:
  enum Order {
    SEQUENTIAL,
    RANDOM
  };
  enum DBState {
    FRESH,
    EXISTING
  };
  enum DBFlags {
    NONE = 0,
    SYNC,
    INT
  };

  Benchmark()
  : db_(NULL),
    db_num_(0),
    num_(FLAGS_num),
    reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
    bytes_(0),
    rand_(301) {
    std::vector<std::string> files;
    std::string test_dir;
    Env::Default()->GetTestDirectory(&test_dir);
    Env::Default()->GetChildren(test_dir.c_str(), &files);
    if (!FLAGS_use_existing_db) {
      for (int i = 0; i < files.size(); i++) {
        if (Slice(files[i]).starts_with("dbbench_mdb")) {
          std::string file_name(test_dir);
          file_name += "/";
          file_name += files[i];
          Env::Default()->DeleteFile(file_name.c_str());
        }
      }
    }
  }

  ~Benchmark() {
    mdb_env_close(db_);
  }

  void Run() {
    PrintHeader();
    Open(NONE);

    const char* benchmarks = FLAGS_benchmarks;
    while (benchmarks != NULL) {
      const char* sep = strchr(benchmarks, ',');
      Slice name;
      if (sep == NULL) {
        name = benchmarks;
        benchmarks = NULL;
      } else {
        name = Slice(benchmarks, sep - benchmarks);
        benchmarks = sep + 1;
      }

      Start();

      bool known = true;
      DBFlags flags = NONE;
      if (name == Slice("fillseq")) {
        Write(flags, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
        DBSynchronize(db_);
      } else if (name == Slice("fillseqbatch")) {
        Write(flags, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000);
        DBSynchronize(db_);
      } else if (name == Slice("fillrandom")) {
        Write(flags, RANDOM, FRESH, num_, FLAGS_value_size, 1);
        DBSynchronize(db_);
      } else if (name == Slice("fillrandbatch")) {
        Write(flags, RANDOM, FRESH, num_, FLAGS_value_size, 1000);
        DBSynchronize(db_);
      } else if (name == Slice("fillrandint")) {
	  	flags = INT;
        Write(flags, RANDOM, FRESH, num_, FLAGS_value_size, 1);
        DBSynchronize(db_);
      } else if (name == Slice("fillrandibatch")) {
	  	flags = INT;
        Write(flags, RANDOM, FRESH, num_, FLAGS_value_size, 1000);
        DBSynchronize(db_);
      } else if (name == Slice("overwrite")) {
        Write(flags, RANDOM, EXISTING, num_, FLAGS_value_size, 1);
        DBSynchronize(db_);
      } else if (name == Slice("overwritebatch")) {
        Write(flags, RANDOM, EXISTING, num_, FLAGS_value_size, 1000);
        DBSynchronize(db_);
      } else if (name == Slice("fillrandsync")) {
        flags = SYNC;
        Write(flags, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1);
        DBSynchronize(db_);
      } else if (name == Slice("fillseqsync")) {
        flags = SYNC;
        Write(flags, SEQUENTIAL, FRESH, num_ / 100, FLAGS_value_size, 1);
        DBSynchronize(db_);
      } else if (name == Slice("fillrand100K")) {
        Write(flags, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1);
        DBSynchronize(db_);
      } else if (name == Slice("fillseq100K")) {
        Write(flags, SEQUENTIAL, FRESH, num_ / 1000, 100 * 1000, 1);
        DBSynchronize(db_);
      } else if (name == Slice("readseq")) {
        ReadSequential();
      } else if (name == Slice("readreverse")) {
        ReadReverse();
      } else if (name == Slice("readrandom")) {
        ReadRandom();
      } else if (name == Slice("readrand100K")) {
        int n = reads_;
        reads_ /= 1000;
        ReadRandom();
        reads_ = n;
      } else if (name == Slice("readseq100K")) {
        int n = reads_;
        reads_ /= 1000;
        ReadSequential();
        reads_ = n;
      } else {
        known = false;
        if (name != Slice()) {  // No error message for empty name
          fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
        }
      }
      if (known) {
        Stop(name);
      }
    }
  }

 private:
  void Open(DBFlags flags) {
    assert(db_ == NULL);
    int rc;
    MDB_txn *txn;

    char file_name[100], cmd[200];
    db_num_++;
    std::string test_dir;
    Env::Default()->GetTestDirectory(&test_dir);
    snprintf(file_name, sizeof(file_name),
             "%s/dbbench_mdb-%d.kct",
             test_dir.c_str(),
             db_num_);

    snprintf(cmd, sizeof(cmd), "mkdir -p %s", file_name);
    system(cmd);

    // Without MDB_NOSYNC, every commit syncs the data file; the
    // sync tests rely on that.
    int env_opt = 0;
    if (flags != SYNC)
      env_opt = MDB_NOSYNC;

    // Create tuning options and open the database
    rc = mdb_env_create(&db_);
    rc = mdb_env_set_mapsize(db_, 1048576*256);
    rc = mdb_env_open(db_, file_name, env_opt, 0664);
    if (rc) {
      fprintf(stderr, "open error: %s\n", mdb_strerror(rc));
    }
    rc = mdb_txn_begin(db_, NULL, 0, &txn);
    rc = mdb_open(txn, NULL, flags == INT ? MDB_INTEGERKEY : 0, &dbi_);
    rc = mdb_txn_commit(txn);
  }

  void Write(DBFlags flags, Order order, DBState state,
             int num_entries, int value_size, int entries_per_batch) {
    // Create new database if state == FRESH
    if (state == FRESH) {
      if (FLAGS_use_existing_db) {
        message_ = "skipping (--use_existing_db is true)";
        return;
      }
      if (db_) {
        const char *path;
        char cmd[200];
        mdb_env_get_path(db_, &path);
        snprintf(cmd, sizeof(cmd), "rm -rf %s", path);
        mdb_env_close(db_);
        system(cmd);
        db_ = NULL;
      }
      Open(flags);
      Start();  // Do not count time taken to destroy/open
    }

    if (num_entries != num_) {
      char msg[100];
      snprintf(msg, sizeof(msg), "(%d ops)", num_entries);
      message_ = msg;
    }

    MDB_val mkey, mval;
    MDB_txn *txn;
    char key[100];
    int ikey;
    if (flags == INT) {
      mkey.mv_data = &ikey;
      mkey.mv_size = sizeof(ikey);
    } else {
      mkey.mv_data = key;
    }
    mval.mv_size = value_size;
    // Write to database, one txn per batch of entries_per_batch puts
    for (int i = 0; i < num_entries; i += entries_per_batch) {
      mdb_txn_begin(db_, NULL, 0, &txn);

      for (int j = 0; j < entries_per_batch; j++) {
        const int k = (order == SEQUENTIAL) ? i+j : (rand_.Next() % num_entries);
        int rc, flag = 0;
        if (flags == INT)
          ikey = k;
        else
          mkey.mv_size = snprintf(key, sizeof(key), "%016d", k);
        bytes_ += value_size + mkey.mv_size;
        if (order == SEQUENTIAL)
          flag = MDB_APPEND;  // keys arrive in order; skip the page search
        mval.mv_data = (void *)gen_.Generate(value_size).data();
        rc = mdb_put(txn, dbi_, &mkey, &mval, flag);
        if (rc) {
          fprintf(stderr, "set error: %s\n", mdb_strerror(rc));
        }
        FinishedSingleOp();
      }
      mdb_txn_commit(txn);
    }
  }

  void ReadReverse() {
    MDB_txn *txn;
    MDB_cursor *cursor;
    MDB_val key, data;

    mdb_txn_begin(db_, NULL, MDB_RDONLY, &txn);
    mdb_cursor_open(txn, dbi_, &cursor);
    // MDB_PREV on a fresh cursor starts from the last key
    while (mdb_cursor_get(cursor, &key, &data, MDB_PREV) == 0) {
      bytes_ += key.mv_size + data.mv_size;
      FinishedSingleOp();
    }
    mdb_cursor_close(cursor);
    mdb_txn_abort(txn);
  }

  void ReadSequential() {
    MDB_txn *txn;
    MDB_cursor *cursor;
    MDB_val key, data;

    mdb_txn_begin(db_, NULL, MDB_RDONLY, &txn);
    mdb_cursor_open(txn, dbi_, &cursor);
    // MDB_NEXT on a fresh cursor starts from the first key
    while (mdb_cursor_get(cursor, &key, &data, MDB_NEXT) == 0) {
      bytes_ += key.mv_size + data.mv_size;
      FinishedSingleOp();
    }
    mdb_cursor_close(cursor);
    mdb_txn_abort(txn);
  }

  void ReadRandom() {
    MDB_txn *txn;
    MDB_cursor *cursor;
    MDB_val key, data;
    char ckey[100];
    key.mv_data = ckey;
    mdb_txn_begin(db_, NULL, MDB_RDONLY, &txn);
    mdb_cursor_open(txn, dbi_, &cursor);
    for (int i = 0; i < reads_; i++) {
      const int k = rand_.Next() % reads_;
      key.mv_size = snprintf(ckey, sizeof(ckey), "%016d", k);
      // MDB_SET positions the cursor at the exact key
      mdb_cursor_get(cursor, &key, &data, MDB_SET);
      FinishedSingleOp();
    }
    mdb_cursor_close(cursor);
    mdb_txn_abort(txn);
  }
};

}  // namespace leveldb

int main(int argc, char** argv) {
  std::string default_db_path;
  for (int i = 1; i < argc; i++) {
    double d;
    int n;
    char junk;
    if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
      FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
    } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
      FLAGS_compression_ratio = d;
    } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
               (n == 0 || n == 1)) {
      FLAGS_histogram = n;
    } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
      FLAGS_num = n;
    } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
      FLAGS_reads = n;
    } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
      FLAGS_value_size = n;
    } else if (strncmp(argv[i], "--db=", 5) == 0) {
      FLAGS_db = argv[i] + 5;
    } else {
      fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
      exit(1);
    }
  }

  // Choose a location for the test database if none given with --db=<path>
  if (FLAGS_db == NULL) {
    leveldb::Env::Default()->GetTestDirectory(&default_db_path);
    default_db_path += "/dbbench";
    FLAGS_db = default_db_path.c_str();
  }

  leveldb::Benchmark benchmark;
  benchmark.Run();
  return 0;
}