From 2639ae9b1755f49e4eb4b211ce63d8879b0edd4a Mon Sep 17 00:00:00 2001 From: Ben Gras Date: Wed, 14 Jul 2010 17:46:18 +0000 Subject: [PATCH] libc: add db code from netbsd --- lib/libc/Makefile | 1 + lib/libc/db/Makefile.inc | 11 + lib/libc/db/README | 41 ++ lib/libc/db/btree/Makefile.inc | 8 + lib/libc/db/btree/bt_close.c | 184 ++++++ lib/libc/db/btree/bt_conv.c | 216 +++++++ lib/libc/db/btree/bt_debug.c | 326 +++++++++++ lib/libc/db/btree/bt_delete.c | 646 +++++++++++++++++++++ lib/libc/db/btree/bt_get.c | 108 ++++ lib/libc/db/btree/bt_open.c | 484 ++++++++++++++++ lib/libc/db/btree/bt_overflow.c | 239 ++++++++ lib/libc/db/btree/bt_page.c | 103 ++++ lib/libc/db/btree/bt_put.c | 323 +++++++++++ lib/libc/db/btree/bt_search.c | 209 +++++++ lib/libc/db/btree/bt_seq.c | 446 ++++++++++++++ lib/libc/db/btree/bt_split.c | 831 ++++++++++++++++++++++++++ lib/libc/db/btree/bt_utils.c | 258 +++++++++ lib/libc/db/btree/btree.h | 433 ++++++++++++++ lib/libc/db/btree/extern.h | 72 +++ lib/libc/db/changelog | 105 ++++ lib/libc/db/db/Makefile.inc | 6 + lib/libc/db/db/db.c | 114 ++++ lib/libc/db/db2netbsd | 31 + lib/libc/db/hash/Makefile.inc | 7 + lib/libc/db/hash/README | 69 +++ lib/libc/db/hash/extern.h | 63 ++ lib/libc/db/hash/hash.c | 999 ++++++++++++++++++++++++++++++++ lib/libc/db/hash/hash.h | 295 ++++++++++ lib/libc/db/hash/hash_bigkey.c | 683 ++++++++++++++++++++++ lib/libc/db/hash/hash_buf.c | 346 +++++++++++ lib/libc/db/hash/hash_func.c | 210 +++++++ lib/libc/db/hash/hash_log2.c | 64 ++ lib/libc/db/hash/hash_page.c | 989 +++++++++++++++++++++++++++++++ lib/libc/db/hash/ndbm.c | 119 ++++ lib/libc/db/hash/ndbmdatum.c | 162 ++++++ lib/libc/db/hash/page.h | 90 +++ lib/libc/db/man/Makefile.inc | 15 + lib/libc/db/man/btree.3 | 255 ++++++++ lib/libc/db/man/dbm_clearerr.3 | 306 ++++++++++ lib/libc/db/man/dbopen.3 | 534 +++++++++++++++++ lib/libc/db/man/hash.3 | 167 ++++++ lib/libc/db/man/mpool.3 | 226 ++++++++ lib/libc/db/man/recno.3 | 214 +++++++ lib/libc/db/mpool/Makefile.inc | 6 + lib/libc/db/mpool/README | 8 + lib/libc/db/mpool/mpool.c | 466 +++++++++++++++ lib/libc/db/recno/Makefile.inc | 7 + lib/libc/db/recno/extern.h | 52 ++ lib/libc/db/recno/rec_close.c | 190 ++++++ lib/libc/db/recno/rec_delete.c | 201 +++++++ lib/libc/db/recno/rec_get.c | 306 ++++++++++ lib/libc/db/recno/rec_open.c | 250 ++++++++ lib/libc/db/recno/rec_put.c | 282 +++++++++ lib/libc/db/recno/rec_search.c | 130 +++++ lib/libc/db/recno/rec_seq.c | 135 +++++ lib/libc/db/recno/rec_utils.c | 122 ++++ lib/libc/db/recno/recno.h | 37 ++ 57 files changed, 13200 insertions(+) create mode 100644 lib/libc/db/Makefile.inc create mode 100644 lib/libc/db/README create mode 100644 lib/libc/db/btree/Makefile.inc create mode 100644 lib/libc/db/btree/bt_close.c create mode 100644 lib/libc/db/btree/bt_conv.c create mode 100644 lib/libc/db/btree/bt_debug.c create mode 100644 lib/libc/db/btree/bt_delete.c create mode 100644 lib/libc/db/btree/bt_get.c create mode 100644 lib/libc/db/btree/bt_open.c create mode 100644 lib/libc/db/btree/bt_overflow.c create mode 100644 lib/libc/db/btree/bt_page.c create mode 100644 lib/libc/db/btree/bt_put.c create mode 100644 lib/libc/db/btree/bt_search.c create mode 100644 lib/libc/db/btree/bt_seq.c create mode 100644 lib/libc/db/btree/bt_split.c create mode 100644 lib/libc/db/btree/bt_utils.c create mode 100644 lib/libc/db/btree/btree.h create mode 100644 lib/libc/db/btree/extern.h create mode 100644 lib/libc/db/changelog create mode 100644 lib/libc/db/db/Makefile.inc create mode 100644 lib/libc/db/db/db.c create mode 100755 lib/libc/db/db2netbsd create mode 100644 lib/libc/db/hash/Makefile.inc create mode 100644 lib/libc/db/hash/README create mode 100644 lib/libc/db/hash/extern.h create mode 100644 lib/libc/db/hash/hash.c create mode 100644 lib/libc/db/hash/hash.h create mode 100644 lib/libc/db/hash/hash_bigkey.c create mode 100644 lib/libc/db/hash/hash_buf.c create mode 100644 lib/libc/db/hash/hash_func.c create mode 100644 lib/libc/db/hash/hash_log2.c create mode 100644 lib/libc/db/hash/hash_page.c create mode 100644 lib/libc/db/hash/ndbm.c create mode 100644 lib/libc/db/hash/ndbmdatum.c create mode 100644 lib/libc/db/hash/page.h create mode 100644 lib/libc/db/man/Makefile.inc create mode 100644 lib/libc/db/man/btree.3 create mode 100644 lib/libc/db/man/dbm_clearerr.3 create mode 100644 lib/libc/db/man/dbopen.3 create mode 100644 lib/libc/db/man/hash.3 create mode 100644 lib/libc/db/man/mpool.3 create mode 100644 lib/libc/db/man/recno.3 create mode 100644 lib/libc/db/mpool/Makefile.inc create mode 100644 lib/libc/db/mpool/README create mode 100644 lib/libc/db/mpool/mpool.c create mode 100644 lib/libc/db/recno/Makefile.inc create mode 100644 lib/libc/db/recno/extern.h create mode 100644 lib/libc/db/recno/rec_close.c create mode 100644 lib/libc/db/recno/rec_delete.c create mode 100644 lib/libc/db/recno/rec_get.c create mode 100644 lib/libc/db/recno/rec_open.c create mode 100644 lib/libc/db/recno/rec_put.c create mode 100644 lib/libc/db/recno/rec_search.c create mode 100644 lib/libc/db/recno/rec_seq.c create mode 100644 lib/libc/db/recno/rec_utils.c create mode 100644 lib/libc/db/recno/recno.h diff --git a/lib/libc/Makefile b/lib/libc/Makefile index b83662dda..b5b33b807 100644 --- a/lib/libc/Makefile +++ b/lib/libc/Makefile @@ -6,6 +6,7 @@ LIB= c .include "${.CURDIR}/ansi/Makefile.inc" .include "${.CURDIR}/asyn/Makefile.inc" +.include "${.CURDIR}/db/Makefile.inc" .include "${.CURDIR}/ip/Makefile.inc" .include "${.CURDIR}/math/Makefile.inc" .include "${.CURDIR}/other/Makefile.inc" diff --git a/lib/libc/db/Makefile.inc b/lib/libc/db/Makefile.inc new file mode 100644 index 000000000..224719d65 --- /dev/null +++ b/lib/libc/db/Makefile.inc @@ -0,0 +1,11 @@ +# $NetBSD: Makefile.inc,v 1.6 1997/10/22 23:14:11 lukem Exp $ +# @(#)Makefile.inc 8.2 (Berkeley) 2/21/94 +# +CPPFLAGS+=-D__DBINTERFACE_PRIVATE + +.include "${.CURDIR}/db/btree/Makefile.inc" +.include "${.CURDIR}/db/db/Makefile.inc" +.include "${.CURDIR}/db/hash/Makefile.inc" +.include "${.CURDIR}/db/man/Makefile.inc" +.include "${.CURDIR}/db/mpool/Makefile.inc" +.include "${.CURDIR}/db/recno/Makefile.inc" diff --git a/lib/libc/db/README b/lib/libc/db/README new file mode 100644 index 000000000..d06cdf14f --- /dev/null +++ b/lib/libc/db/README @@ -0,0 +1,41 @@ +# $NetBSD: README,v 1.3 1996/05/03 21:17:07 cgd Exp $ +# @(#)README 8.27 (Berkeley) 9/1/94 + +This is version 1.85 of the Berkeley DB code. + +For information on compiling and installing this software, see the file +PORT/README. + +Newer versions of this software will periodically be made available by +anonymous ftp from ftp.cs.berkeley.edu. An archive in compressed format +is in ucb/4bsd/db.tar.Z, or in gzip format in ucb/4bsd/db.tar.gz. If +you'd like to receive announcements of future releases of this software, +send email to the contact address below. + +Email questions may be addressed to Keith Bostic at bostic@cs.berkeley.edu. + +============================================ +Distribution contents: + +Makefile.inc Ignore this, it's the 4.4BSD subsystem Makefile. +PORT The per OS/architecture directories to use to build + libdb.a, if you're not running 4.4BSD. See the file + PORT/README for more information. +README This file. +btree The B+tree routines. +changelog List of changes, per version. +db The dbopen(3) interface routine. +docs Various USENIX papers, and the formatted manual pages. +hash The extended linear hashing routines. +man The unformatted manual pages. +mpool The memory pool routines. +recno The fixed/variable length record routines. +test Test package. + +============================================ +Debugging: + +If you're running a memory checker (e.g. Purify) on DB, make sure that +you recompile it with "-DPURIFY" in the CFLAGS, first. By default, +allocated pages are not initialized by the DB code, and they will show +up as reads of uninitialized memory in the buffer write routines. diff --git a/lib/libc/db/btree/Makefile.inc b/lib/libc/db/btree/Makefile.inc new file mode 100644 index 000000000..46fbb029e --- /dev/null +++ b/lib/libc/db/btree/Makefile.inc @@ -0,0 +1,8 @@ +# $NetBSD: Makefile.inc,v 1.6 1996/05/03 21:50:36 cgd Exp $ +# @(#)Makefile.inc 8.2 (Berkeley) 7/14/94 + +.PATH: ${.CURDIR}/db/btree + +SRCS+= bt_close.c bt_conv.c bt_debug.c bt_delete.c bt_get.c bt_open.c \ + bt_overflow.c bt_page.c bt_put.c bt_search.c bt_seq.c bt_split.c \ + bt_utils.c diff --git a/lib/libc/db/btree/bt_close.c b/lib/libc/db/btree/bt_close.c new file mode 100644 index 000000000..62f3cc758 --- /dev/null +++ b/lib/libc/db/btree/bt_close.c @@ -0,0 +1,184 @@ +/* $NetBSD: bt_close.c,v 1.14 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_close.c,v 1.14 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif + +#include +#include +#include +#include +#include +#include + +#include +#include "btree.h" + +static int bt_meta(BTREE *); + +/* + * BT_CLOSE -- Close a btree. + * + * Parameters: + * dbp: pointer to access method + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__bt_close(DB *dbp) +{ + BTREE *t; + int fd; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* Sync the tree. */ + if (__bt_sync(dbp, 0) == RET_ERROR) + return (RET_ERROR); + + /* Close the memory pool. */ + if (mpool_close(t->bt_mp) == RET_ERROR) + return (RET_ERROR); + + /* Free random memory. */ + if (t->bt_cursor.key.data != NULL) { + free(t->bt_cursor.key.data); + t->bt_cursor.key.size = 0; + t->bt_cursor.key.data = NULL; + } + if (t->bt_rkey.data) { + free(t->bt_rkey.data); + t->bt_rkey.size = 0; + t->bt_rkey.data = NULL; + } + if (t->bt_rdata.data) { + free(t->bt_rdata.data); + t->bt_rdata.size = 0; + t->bt_rdata.data = NULL; + } + + fd = t->bt_fd; + free(t); + free(dbp); + return (close(fd) ? RET_ERROR : RET_SUCCESS); +} + +/* + * BT_SYNC -- sync the btree to disk. + * + * Parameters: + * dbp: pointer to access method + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +int +__bt_sync(const DB *dbp, u_int flags) +{ + BTREE *t; + int status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* Sync doesn't currently take any flags. */ + if (flags != 0) { + errno = EINVAL; + return (RET_ERROR); + } + + if (F_ISSET(t, B_INMEM | B_RDONLY) || !F_ISSET(t, B_MODIFIED)) + return (RET_SUCCESS); + + if (F_ISSET(t, B_METADIRTY) && bt_meta(t) == RET_ERROR) + return (RET_ERROR); + + if ((status = mpool_sync(t->bt_mp)) == RET_SUCCESS) + F_CLR(t, B_MODIFIED); + + return (status); +} + +/* + * BT_META -- write the tree meta data to disk. + * + * Parameters: + * t: tree + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +static int +bt_meta(BTREE *t) +{ + BTMETA m; + void *p; + + if ((p = mpool_get(t->bt_mp, P_META, 0)) == NULL) + return (RET_ERROR); + + /* Fill in metadata. */ + m.magic = BTREEMAGIC; + m.version = BTREEVERSION; + m.psize = t->bt_psize; + m.free = t->bt_free; + m.nrecs = t->bt_nrecs; + m.flags = F_ISSET(t, SAVEMETA); + + memmove(p, &m, sizeof(BTMETA)); + mpool_put(t->bt_mp, p, MPOOL_DIRTY); + return (RET_SUCCESS); +} diff --git a/lib/libc/db/btree/bt_conv.c b/lib/libc/db/btree/bt_conv.c new file mode 100644 index 000000000..d887be366 --- /dev/null +++ b/lib/libc/db/btree/bt_conv.c @@ -0,0 +1,216 @@ +/* $NetBSD: bt_conv.c,v 1.14 2008/09/10 17:52:35 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_conv.c,v 1.14 2008/09/10 17:52:35 joerg Exp $"); +#endif + +#include +#include + +#include +#include "btree.h" + +static void mswap(PAGE *); + +/* + * __BT_BPGIN, __BT_BPGOUT -- + * Convert host-specific number layout to/from the host-independent + * format stored on disk. + * + * Parameters: + * t: tree + * pg: page number + * h: page to convert + */ +void +__bt_pgin(void *t, pgno_t pg, void *pp) +{ + PAGE *h; + indx_t i, top; + uint8_t flags; + char *p; + + if (!F_ISSET(((BTREE *)t), B_NEEDSWAP)) + return; + if (pg == P_META) { + mswap(pp); + return; + } + + h = pp; + M_32_SWAP(h->pgno); + M_32_SWAP(h->prevpg); + M_32_SWAP(h->nextpg); + M_32_SWAP(h->flags); + M_16_SWAP(h->lower); + M_16_SWAP(h->upper); + + top = NEXTINDEX(h); + if ((h->flags & P_TYPE) == P_BINTERNAL) + for (i = 0; i < top; i++) { + M_16_SWAP(h->linp[i]); + p = (char *)(void *)GETBINTERNAL(h, i); + P_32_SWAP(p); + p += sizeof(uint32_t); + P_32_SWAP(p); + p += sizeof(pgno_t); + if (*(uint8_t *)p & P_BIGKEY) { + p += sizeof(uint8_t); + P_32_SWAP(p); + p += sizeof(pgno_t); + P_32_SWAP(p); + } + } + else if ((h->flags & P_TYPE) == P_BLEAF) + for (i = 0; i < top; i++) { + M_16_SWAP(h->linp[i]); + p = (char *)(void *)GETBLEAF(h, i); + P_32_SWAP(p); + p += sizeof(uint32_t); + P_32_SWAP(p); + p += sizeof(uint32_t); + flags = *(uint8_t *)p; + if (flags & (P_BIGKEY | P_BIGDATA)) { + p += sizeof(uint8_t); + if (flags & P_BIGKEY) { + P_32_SWAP(p); + p += sizeof(pgno_t); + P_32_SWAP(p); + } + if (flags & P_BIGDATA) { + p += sizeof(uint32_t); + P_32_SWAP(p); + p += sizeof(pgno_t); + P_32_SWAP(p); + } + } + } +} + +void +__bt_pgout(void *t, pgno_t pg, void *pp) +{ + PAGE *h; + indx_t i, top; + uint8_t flags; + char *p; + + if (!F_ISSET(((BTREE *)t), B_NEEDSWAP)) + return; + if (pg == P_META) { + mswap(pp); + return; + } + + h = pp; + top = NEXTINDEX(h); + if ((h->flags & P_TYPE) == P_BINTERNAL) + for (i = 0; i < top; i++) { + p = (char *)(void *)GETBINTERNAL(h, i); + P_32_SWAP(p); + p += sizeof(uint32_t); + P_32_SWAP(p); + p += sizeof(pgno_t); + if (*(uint8_t *)p & P_BIGKEY) { + p += sizeof(uint8_t); + P_32_SWAP(p); + p += sizeof(pgno_t); + P_32_SWAP(p); + } + M_16_SWAP(h->linp[i]); + } + else if ((h->flags & P_TYPE) == P_BLEAF) + for (i = 0; i < top; i++) { + p = (char *)(void *)GETBLEAF(h, i); + P_32_SWAP(p); + p += sizeof(uint32_t); + P_32_SWAP(p); + p += sizeof(uint32_t); + flags = *(uint8_t *)p; + if (flags & (P_BIGKEY | P_BIGDATA)) { + p += sizeof(uint8_t); + if (flags & P_BIGKEY) { + P_32_SWAP(p); + p += sizeof(pgno_t); + P_32_SWAP(p); + } + if (flags & P_BIGDATA) { + p += sizeof(uint32_t); + P_32_SWAP(p); + p += sizeof(pgno_t); + P_32_SWAP(p); + } + } + M_16_SWAP(h->linp[i]); + } + + M_32_SWAP(h->pgno); + M_32_SWAP(h->prevpg); + M_32_SWAP(h->nextpg); + M_32_SWAP(h->flags); + M_16_SWAP(h->lower); + M_16_SWAP(h->upper); +} + +/* + * MSWAP -- Actually swap the bytes on the meta page. + * + * Parameters: + * p: page to convert + */ +static void +mswap(PAGE *pg) +{ + char *p; + + p = (char *)(void *)pg; + P_32_SWAP(p); /* magic */ + p += sizeof(uint32_t); + P_32_SWAP(p); /* version */ + p += sizeof(uint32_t); + P_32_SWAP(p); /* psize */ + p += sizeof(uint32_t); + P_32_SWAP(p); /* free */ + p += sizeof(uint32_t); + P_32_SWAP(p); /* nrecs */ + p += sizeof(uint32_t); + P_32_SWAP(p); /* flags */ + p += sizeof(uint32_t); +} diff --git a/lib/libc/db/btree/bt_debug.c b/lib/libc/db/btree/bt_debug.c new file mode 100644 index 000000000..2986605d9 --- /dev/null +++ b/lib/libc/db/btree/bt_debug.c @@ -0,0 +1,326 @@ +/* $NetBSD: bt_debug.c,v 1.15 2008/09/10 17:52:35 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_debug.c,v 1.15 2008/09/10 17:52:35 joerg Exp $"); +#endif + +#include +#include +#include +#include + +#include +#include "btree.h" + +#ifdef DEBUG +/* + * BT_DUMP -- Dump the tree + * + * Parameters: + * dbp: pointer to the DB + */ +void +__bt_dump(DB *dbp) +{ + BTREE *t; + PAGE *h; + pgno_t i; + const char *sep; + + t = dbp->internal; + (void)fprintf(stderr, "%s: pgsz %d", + F_ISSET(t, B_INMEM) ? "memory" : "disk", t->bt_psize); + if (F_ISSET(t, R_RECNO)) + (void)fprintf(stderr, " keys %lu", (unsigned long) t->bt_nrecs); +#undef X +#define X(flag, name) \ + if (F_ISSET(t, flag)) { \ + (void)fprintf(stderr, "%s%s", sep, name); \ + sep = ", "; \ + } + if (t->flags != 0) { + sep = " flags ("; + X(R_FIXLEN, "FIXLEN"); + X(B_INMEM, "INMEM"); + X(B_NODUPS, "NODUPS"); + X(B_RDONLY, "RDONLY"); + X(R_RECNO, "RECNO"); + X(B_METADIRTY,"METADIRTY"); + (void)fprintf(stderr, ")\n"); + } +#undef X + + for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) { + __bt_dpage(h); + (void)mpool_put(t->bt_mp, h, 0); + } +} + +/* + * BT_DMPAGE -- Dump the meta page + * + * Parameters: + * h: pointer to the PAGE + */ +void +__bt_dmpage(PAGE *h) +{ + BTMETA *m; + const char *sep; + + m = (BTMETA *)(void *)h; + (void)fprintf(stderr, "magic %lx\n", (unsigned long) m->magic); + (void)fprintf(stderr, "version %lu\n", (unsigned long) m->version); + (void)fprintf(stderr, "psize %lu\n", (unsigned long) m->psize); + (void)fprintf(stderr, "free %lu\n", (unsigned long) m->free); + (void)fprintf(stderr, "nrecs %lu\n", (unsigned long) m->nrecs); + (void)fprintf(stderr, "flags %lu", (unsigned long) m->flags); +#undef X +#define X(flag, name) \ + if (m->flags & flag) { \ + (void)fprintf(stderr, "%s%s", sep, name); \ + sep = ", "; \ + } + if (m->flags) { + sep = " ("; + X(B_NODUPS, "NODUPS"); + X(R_RECNO, "RECNO"); + (void)fprintf(stderr, ")"); + } +} + +/* + * BT_DNPAGE -- Dump the page + * + * Parameters: + * n: page number to dump. + */ +void +__bt_dnpage(DB *dbp, pgno_t pgno) +{ + BTREE *t; + PAGE *h; + + t = dbp->internal; + if ((h = mpool_get(t->bt_mp, pgno, 0)) != NULL) { + __bt_dpage(h); + (void)mpool_put(t->bt_mp, h, 0); + } +} + +/* + * BT_DPAGE -- Dump the page + * + * Parameters: + * h: pointer to the PAGE + */ +void +__bt_dpage(PAGE *h) +{ + BINTERNAL *bi; + BLEAF *bl; + RINTERNAL *ri; + RLEAF *rl; + indx_t cur, top; + const char *sep; + + (void)fprintf(stderr, " page %d: (", h->pgno); +#undef X +#define X(flag, name) \ + if (h->flags & flag) { \ + (void)fprintf(stderr, "%s%s", sep, name); \ + sep = ", "; \ + } + sep = ""; + X(P_BINTERNAL, "BINTERNAL") /* types */ + X(P_BLEAF, "BLEAF") + X(P_RINTERNAL, "RINTERNAL") /* types */ + X(P_RLEAF, "RLEAF") + X(P_OVERFLOW, "OVERFLOW") + X(P_PRESERVE, "PRESERVE"); + (void)fprintf(stderr, ")\n"); +#undef X + + (void)fprintf(stderr, "\tprev %2d next %2d", h->prevpg, h->nextpg); + if (h->flags & P_OVERFLOW) + return; + + top = NEXTINDEX(h); + (void)fprintf(stderr, " lower %3d upper %3d nextind %d\n", + h->lower, h->upper, top); + for (cur = 0; cur < top; cur++) { + (void)fprintf(stderr, "\t[%03d] %4d ", cur, h->linp[cur]); + switch (h->flags & P_TYPE) { + case P_BINTERNAL: + bi = GETBINTERNAL(h, cur); + (void)fprintf(stderr, + "size %03d pgno %03d", bi->ksize, bi->pgno); + if (bi->flags & P_BIGKEY) + (void)fprintf(stderr, " (indirect)"); + else if (bi->ksize) + (void)fprintf(stderr, + " {%.*s}", (int)bi->ksize, bi->bytes); + break; + case P_RINTERNAL: + ri = GETRINTERNAL(h, cur); + (void)fprintf(stderr, "entries %03d pgno %03d", + ri->nrecs, ri->pgno); + break; + case P_BLEAF: + bl = GETBLEAF(h, cur); + if (bl->flags & P_BIGKEY) + (void)fprintf(stderr, + "big key page %lu size %u/", + (unsigned long) *(pgno_t *)(void *)bl->bytes, + *(uint32_t *)(void *)(bl->bytes + sizeof(pgno_t))); + else if (bl->ksize) + (void)fprintf(stderr, "%s/", bl->bytes); + if (bl->flags & P_BIGDATA) + (void)fprintf(stderr, + "big data page %lu size %u", + (unsigned long) *(pgno_t *)(void *)(bl->bytes + bl->ksize), + *(uint32_t *)(void *)(bl->bytes + bl->ksize + + sizeof(pgno_t))); + else if (bl->dsize) + (void)fprintf(stderr, "%.*s", + (int)bl->dsize, bl->bytes + bl->ksize); + break; + case P_RLEAF: + rl = GETRLEAF(h, cur); + if (rl->flags & P_BIGDATA) + (void)fprintf(stderr, + "big data page %lu size %u", + (unsigned long) *(pgno_t *)(void *)rl->bytes, + *(uint32_t *)(void *)(rl->bytes + sizeof(pgno_t))); + else if (rl->dsize) + (void)fprintf(stderr, + "%.*s", (int)rl->dsize, rl->bytes); + break; + } + (void)fprintf(stderr, "\n"); + } +} +#endif + +#ifdef STATISTICS +/* + * BT_STAT -- Gather/print the tree statistics + * + * Parameters: + * dbp: pointer to the DB + */ +void +__bt_stat(DB *dbp) +{ + extern unsigned long bt_cache_hit, bt_cache_miss, bt_pfxsaved, bt_rootsplit; + extern unsigned long bt_sortsplit, bt_split; + BTREE *t; + PAGE *h; + pgno_t i, pcont, pinternal, pleaf; + unsigned long ifree, lfree, nkeys; + int levels; + + t = dbp->internal; + pcont = pinternal = pleaf = 0; + nkeys = ifree = lfree = 0; + for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) { + switch (h->flags & P_TYPE) { + case P_BINTERNAL: + case P_RINTERNAL: + ++pinternal; + ifree += h->upper - h->lower; + break; + case P_BLEAF: + case P_RLEAF: + ++pleaf; + lfree += h->upper - h->lower; + nkeys += NEXTINDEX(h); + break; + case P_OVERFLOW: + ++pcont; + break; + } + (void)mpool_put(t->bt_mp, h, 0); + } + + /* Count the levels of the tree. */ + for (i = P_ROOT, levels = 0 ;; ++levels) { + h = mpool_get(t->bt_mp, i, 0); + if (h->flags & (P_BLEAF|P_RLEAF)) { + if (levels == 0) + levels = 1; + (void)mpool_put(t->bt_mp, h, 0); + break; + } + i = F_ISSET(t, R_RECNO) ? + GETRINTERNAL(h, 0)->pgno : + GETBINTERNAL(h, 0)->pgno; + (void)mpool_put(t->bt_mp, h, 0); + } + + (void)fprintf(stderr, "%d level%s with %ld keys", + levels, levels == 1 ? "" : "s", nkeys); + if (F_ISSET(t, R_RECNO)) + (void)fprintf(stderr, " (%ld header count)", (long)t->bt_nrecs); + (void)fprintf(stderr, + "\n%lu pages (leaf %ld, internal %ld, overflow %ld)\n", + (long)pinternal + pleaf + pcont, (long)pleaf, (long)pinternal, + (long)pcont); + (void)fprintf(stderr, "%ld cache hits, %ld cache misses\n", + bt_cache_hit, bt_cache_miss); + (void)fprintf(stderr, "%ld splits (%ld root splits, %ld sort splits)\n", + bt_split, bt_rootsplit, bt_sortsplit); + pleaf *= t->bt_psize - BTDATAOFF; + if (pleaf) + (void)fprintf(stderr, + "%.0f%% leaf fill (%ld bytes used, %ld bytes free)\n", + ((double)(pleaf - lfree) / pleaf) * 100, + pleaf - lfree, lfree); + pinternal *= t->bt_psize - BTDATAOFF; + if (pinternal) + (void)fprintf(stderr, + "%.0f%% internal fill (%ld bytes used, %ld bytes free\n", + ((double)(pinternal - ifree) / pinternal) * 100, + pinternal - ifree, ifree); + if (bt_pfxsaved) + (void)fprintf(stderr, "prefix checking removed %lu bytes.\n", + bt_pfxsaved); +} +#endif diff --git a/lib/libc/db/btree/bt_delete.c b/lib/libc/db/btree/bt_delete.c new file mode 100644 index 000000000..ca930b3a9 --- /dev/null +++ b/lib/libc/db/btree/bt_delete.c @@ -0,0 +1,646 @@ +/* $NetBSD: bt_delete.c,v 1.17 2009/01/29 02:02:36 lukem Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_delete.c,v 1.17 2009/01/29 02:02:36 lukem Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include +#include + +#include +#include "btree.h" + +static int __bt_bdelete(BTREE *, const DBT *); +static int __bt_curdel(BTREE *, const DBT *, PAGE *, u_int); +static int __bt_pdelete(BTREE *, PAGE *); +static int __bt_relink(BTREE *, PAGE *); +static int __bt_stkacq(BTREE *, PAGE **, CURSOR *); + +/* + * __bt_delete + * Delete the item(s) referenced by a key. + * + * Return RET_SPECIAL if the key is not found. + */ +int +__bt_delete(const DB *dbp, const DBT *key, u_int flags) +{ + BTREE *t; + CURSOR *c; + PAGE *h; + int status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* Check for change to a read-only tree. */ + if (F_ISSET(t, B_RDONLY)) { + errno = EPERM; + return (RET_ERROR); + } + + switch (flags) { + case 0: + status = __bt_bdelete(t, key); + break; + case R_CURSOR: + /* + * If flags is R_CURSOR, delete the cursor. Must already + * have started a scan and not have already deleted it. + */ + c = &t->bt_cursor; + if (F_ISSET(c, CURS_INIT)) { + if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE)) + return (RET_SPECIAL); + if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) + return (RET_ERROR); + + /* + * If the page is about to be emptied, we'll need to + * delete it, which means we have to acquire a stack. + */ + if (NEXTINDEX(h) == 1) + if (__bt_stkacq(t, &h, &t->bt_cursor)) + return (RET_ERROR); + + status = __bt_dleaf(t, NULL, h, (u_int)c->pg.index); + + if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) { + if (__bt_pdelete(t, h)) + return (RET_ERROR); + } else + mpool_put(t->bt_mp, h, + (u_int)(status == RET_SUCCESS ? + MPOOL_DIRTY : 0)); + break; + } + /* FALLTHROUGH */ + default: + errno = EINVAL; + return (RET_ERROR); + } + if (status == RET_SUCCESS) + F_SET(t, B_MODIFIED); + return (status); +} + +/* + * __bt_stkacq -- + * Acquire a stack so we can delete a cursor entry. + * + * Parameters: + * t: tree + * hp: pointer to current, pinned PAGE pointer + * c: pointer to the cursor + * + * Returns: + * 0 on success, 1 on failure + */ +static int +__bt_stkacq(BTREE *t, PAGE **hp, CURSOR *c) +{ + BINTERNAL *bi; + EPG *e; + EPGNO *parent; + PAGE *h; + indx_t idx = 0; /* Pacify gcc */ + pgno_t pgno; + recno_t nextpg, prevpg; + int exact, level; + + /* + * Find the first occurrence of the key in the tree. Toss the + * currently locked page so we don't hit an already-locked page. + */ + h = *hp; + mpool_put(t->bt_mp, h, 0); + if ((e = __bt_search(t, &c->key, &exact)) == NULL) + return (1); + h = e->page; + + /* See if we got it in one shot. */ + if (h->pgno == c->pg.pgno) + goto ret; + + /* + * Move right, looking for the page. At each move we have to move + * up the stack until we don't have to move to the next page. If + * we have to change pages at an internal level, we have to fix the + * stack back up. + */ + while (h->pgno != c->pg.pgno) { + if ((nextpg = h->nextpg) == P_INVALID) + break; + mpool_put(t->bt_mp, h, 0); + + /* Move up the stack. */ + for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { + /* Get the parent page. */ + if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) + return (1); + + /* Move to the next index. */ + if (parent->index != NEXTINDEX(h) - 1) { + idx = parent->index + 1; + BT_PUSH(t, h->pgno, idx); + break; + } + mpool_put(t->bt_mp, h, 0); + } + + /* Restore the stack. */ + while (level--) { + /* Push the next level down onto the stack. */ + bi = GETBINTERNAL(h, idx); + pgno = bi->pgno; + BT_PUSH(t, pgno, 0); + + /* Lose the currently pinned page. */ + mpool_put(t->bt_mp, h, 0); + + /* Get the next level down. */ + if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) + return (1); + idx = 0; + } + mpool_put(t->bt_mp, h, 0); + if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL) + return (1); + } + + if (h->pgno == c->pg.pgno) + goto ret; + + /* Reacquire the original stack. */ + mpool_put(t->bt_mp, h, 0); + if ((e = __bt_search(t, &c->key, &exact)) == NULL) + return (1); + h = e->page; + + /* + * Move left, looking for the page. At each move we have to move + * up the stack until we don't have to change pages to move to the + * next page. If we have to change pages at an internal level, we + * have to fix the stack back up. + */ + while (h->pgno != c->pg.pgno) { + if ((prevpg = h->prevpg) == P_INVALID) + break; + mpool_put(t->bt_mp, h, 0); + + /* Move up the stack. */ + for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { + /* Get the parent page. */ + if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) + return (1); + + /* Move to the next index. */ + if (parent->index != 0) { + idx = parent->index - 1; + BT_PUSH(t, h->pgno, idx); + break; + } + mpool_put(t->bt_mp, h, 0); + } + + /* Restore the stack. */ + while (level--) { + /* Push the next level down onto the stack. */ + bi = GETBINTERNAL(h, idx); + pgno = bi->pgno; + + /* Lose the currently pinned page. */ + mpool_put(t->bt_mp, h, 0); + + /* Get the next level down. */ + if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) + return (1); + + idx = NEXTINDEX(h) - 1; + BT_PUSH(t, pgno, idx); + } + mpool_put(t->bt_mp, h, 0); + if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL) + return (1); + } + + +ret: mpool_put(t->bt_mp, h, 0); + return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL); +} + +/* + * __bt_bdelete -- + * Delete all key/data pairs matching the specified key. + * + * Parameters: + * t: tree + * key: key to delete + * + * Returns: + * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. + */ +static int +__bt_bdelete(BTREE *t, const DBT *key) +{ + EPG *e; + PAGE *h; + int deleted, exact, redo; + + deleted = 0; + + /* Find any matching record; __bt_search pins the page. */ +loop: if ((e = __bt_search(t, key, &exact)) == NULL) + return (deleted ? RET_SUCCESS : RET_ERROR); + if (!exact) { + mpool_put(t->bt_mp, e->page, 0); + return (deleted ? RET_SUCCESS : RET_SPECIAL); + } + + /* + * Delete forward, then delete backward, from the found key. If + * there are duplicates and we reach either side of the page, do + * the key search again, so that we get them all. + */ + redo = 0; + h = e->page; + do { + if (__bt_dleaf(t, key, h, (u_int)e->index)) { + mpool_put(t->bt_mp, h, 0); + return (RET_ERROR); + } + if (F_ISSET(t, B_NODUPS)) { + if (NEXTINDEX(h) == 0) { + if (__bt_pdelete(t, h)) + return (RET_ERROR); + } else + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + return (RET_SUCCESS); + } + deleted = 1; + } while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0); + + /* Check for right-hand edge of the page. */ + if (e->index == NEXTINDEX(h)) + redo = 1; + + /* Delete from the key to the beginning of the page. */ + while (e->index-- > 0) { + if (__bt_cmp(t, key, e) != 0) + break; + if (__bt_dleaf(t, key, h, (u_int)e->index) == RET_ERROR) { + mpool_put(t->bt_mp, h, 0); + return (RET_ERROR); + } + if (e->index == 0) + redo = 1; + } + + /* Check for an empty page. */ + if (NEXTINDEX(h) == 0) { + if (__bt_pdelete(t, h)) + return (RET_ERROR); + goto loop; + } + + /* Put the page. */ + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + + if (redo) + goto loop; + return (RET_SUCCESS); +} + +/* + * __bt_pdelete -- + * Delete a single page from the tree. + * + * Parameters: + * t: tree + * h: leaf page + * + * Returns: + * RET_SUCCESS, RET_ERROR. + * + * Side-effects: + * mpool_put's the page + */ +static int +__bt_pdelete(BTREE *t, PAGE *h) +{ + BINTERNAL *bi; + PAGE *pg; + EPGNO *parent; + indx_t cnt, idx, *ip, offset; + uint32_t nksize; + char *from; + + /* + * Walk the parent page stack -- a LIFO stack of the pages that were + * traversed when we searched for the page where the delete occurred. + * Each stack entry is a page number and a page index offset. The + * offset is for the page traversed on the search. We've just deleted + * a page, so we have to delete the key from the parent page. + * + * If the delete from the parent page makes it empty, this process may + * continue all the way up the tree. We stop if we reach the root page + * (which is never deleted, it's just not worth the effort) or if the + * delete does not empty the page. + */ + while ((parent = BT_POP(t)) != NULL) { + /* Get the parent page. */ + if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) + return (RET_ERROR); + + idx = parent->index; + bi = GETBINTERNAL(pg, idx); + + /* Free any overflow pages. */ + if (bi->flags & P_BIGKEY && + __ovfl_delete(t, bi->bytes) == RET_ERROR) { + mpool_put(t->bt_mp, pg, 0); + return (RET_ERROR); + } + + /* + * Free the parent if it has only the one key and it's not the + * root page. If it's the rootpage, turn it back into an empty + * leaf page. + */ + if (NEXTINDEX(pg) == 1) { + if (pg->pgno == P_ROOT) { + pg->lower = BTDATAOFF; + pg->upper = t->bt_psize; + pg->flags = P_BLEAF; + } else { + if (__bt_relink(t, pg) || __bt_free(t, pg)) + return (RET_ERROR); + continue; + } + } else { + /* Pack remaining key items at the end of the page. */ + nksize = NBINTERNAL(bi->ksize); + from = (char *)(void *)pg + pg->upper; + memmove(from + nksize, from, + (size_t)((char *)(void *)bi - from)); + pg->upper += nksize; + + /* Adjust indices' offsets, shift the indices down. */ + offset = pg->linp[idx]; + for (cnt = idx, ip = &pg->linp[0]; cnt--; ++ip) + if (ip[0] < offset) + ip[0] += nksize; + for (cnt = NEXTINDEX(pg) - idx; --cnt; ++ip) + ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1]; + pg->lower -= sizeof(indx_t); + } + + mpool_put(t->bt_mp, pg, MPOOL_DIRTY); + break; + } + + /* Free the leaf page, as long as it wasn't the root. */ + if (h->pgno == P_ROOT) { + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + return (RET_SUCCESS); + } + return (__bt_relink(t, h) || __bt_free(t, h)); +} + +/* + * __bt_dleaf -- + * Delete a single record from a leaf page. + * + * Parameters: + * t: tree + * key: referenced key + * h: page + * idx: index on page to delete + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +int +__bt_dleaf(BTREE *t, const DBT *key, PAGE *h, u_int idx) +{ + BLEAF *bl; + indx_t cnt, *ip, offset; + uint32_t nbytes; + void *to; + char *from; + + /* If this record is referenced by the cursor, delete the cursor. */ + if (F_ISSET(&t->bt_cursor, CURS_INIT) && + !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && + t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == idx && + __bt_curdel(t, key, h, idx)) + return (RET_ERROR); + + /* If the entry uses overflow pages, make them available for reuse. */ + to = bl = GETBLEAF(h, idx); + if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR) + return (RET_ERROR); + if (bl->flags & P_BIGDATA && + __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR) + return (RET_ERROR); + + /* Pack the remaining key/data items at the end of the page. */ + nbytes = NBLEAF(bl); + from = (char *)(void *)h + h->upper; + memmove(from + nbytes, from, (size_t)((char *)(void *)to - from)); + h->upper += nbytes; + + /* Adjust the indices' offsets, shift the indices down. */ + offset = h->linp[idx]; + for (cnt = idx, ip = &h->linp[0]; cnt--; ++ip) + if (ip[0] < offset) + ip[0] += nbytes; + for (cnt = NEXTINDEX(h) - idx; --cnt; ++ip) + ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1]; + h->lower -= sizeof(indx_t); + + /* If the cursor is on this page, adjust it as necessary. */ + if (F_ISSET(&t->bt_cursor, CURS_INIT) && + !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && + t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > idx) + --t->bt_cursor.pg.index; + + return (RET_SUCCESS); +} + +/* + * __bt_curdel -- + * Delete the cursor. + * + * Parameters: + * t: tree + * key: referenced key (or NULL) + * h: page + * idx: index on page to delete + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +static int +__bt_curdel(BTREE *t, const DBT *key, PAGE *h, u_int idx) +{ + CURSOR *c; + EPG e; + PAGE *pg; + int curcopy, status; + + /* + * If there are duplicates, move forward or backward to one. + * Otherwise, copy the key into the cursor area. + */ + c = &t->bt_cursor; + F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE); + + curcopy = 0; + if (!F_ISSET(t, B_NODUPS)) { + /* + * We're going to have to do comparisons. If we weren't + * provided a copy of the key, i.e. the user is deleting + * the current cursor position, get one. + */ + if (key == NULL) { + e.page = h; + e.index = idx; + if ((status = __bt_ret(t, &e, + &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS) + return (status); + curcopy = 1; + key = &c->key; + } + /* Check previous key, if not at the beginning of the page. */ + if (idx > 0) { + e.page = h; + e.index = idx - 1; + if (__bt_cmp(t, key, &e) == 0) { + F_SET(c, CURS_BEFORE); + goto dup2; + } + } + /* Check next key, if not at the end of the page. */ + if (idx < (unsigned)(NEXTINDEX(h) - 1)) { + e.page = h; + e.index = idx + 1; + if (__bt_cmp(t, key, &e) == 0) { + F_SET(c, CURS_AFTER); + goto dup2; + } + } + /* Check previous key if at the beginning of the page. */ + if (idx == 0 && h->prevpg != P_INVALID) { + if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) + return (RET_ERROR); + e.page = pg; + e.index = NEXTINDEX(pg) - 1; + if (__bt_cmp(t, key, &e) == 0) { + F_SET(c, CURS_BEFORE); + goto dup1; + } + mpool_put(t->bt_mp, pg, 0); + } + /* Check next key if at the end of the page. */ + if (idx == (unsigned)(NEXTINDEX(h) - 1) && h->nextpg != P_INVALID) { + if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) + return (RET_ERROR); + e.page = pg; + e.index = 0; + if (__bt_cmp(t, key, &e) == 0) { + F_SET(c, CURS_AFTER); +dup1: mpool_put(t->bt_mp, pg, 0); +dup2: c->pg.pgno = e.page->pgno; + c->pg.index = e.index; + return (RET_SUCCESS); + } + mpool_put(t->bt_mp, pg, 0); + } + } + e.page = h; + e.index = idx; + if (curcopy || (status = + __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) { + F_SET(c, CURS_ACQUIRE); + return (RET_SUCCESS); + } + return (status); +} + +/* + * __bt_relink -- + * Link around a deleted page. + * + * Parameters: + * t: tree + * h: page to be deleted + */ +static int +__bt_relink(BTREE *t, PAGE *h) +{ + PAGE *pg; + + if (h->nextpg != P_INVALID) { + if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) + return (RET_ERROR); + pg->prevpg = h->prevpg; + mpool_put(t->bt_mp, pg, MPOOL_DIRTY); + } + if (h->prevpg != P_INVALID) { + if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) + return (RET_ERROR); + pg->nextpg = h->nextpg; + mpool_put(t->bt_mp, pg, MPOOL_DIRTY); + } + return (0); +} diff --git a/lib/libc/db/btree/bt_get.c b/lib/libc/db/btree/bt_get.c new file mode 100644 index 000000000..a512010b5 --- /dev/null +++ b/lib/libc/db/btree/bt_get.c @@ -0,0 +1,108 @@ +/* $NetBSD: bt_get.c,v 1.13 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_get.c,v 1.13 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include +#include + +#include +#include "btree.h" + +/* + * __BT_GET -- Get a record from the btree. + * + * Parameters: + * dbp: pointer to access method + * key: key to find + * data: data to return + * flag: currently unused + * + * Returns: + * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. + */ +int +__bt_get(const DB *dbp, const DBT *key, DBT *data, u_int flags) +{ + BTREE *t; + EPG *e; + int exact, status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* Get currently doesn't take any flags. */ + if (flags) { + errno = EINVAL; + return (RET_ERROR); + } + + if ((e = __bt_search(t, key, &exact)) == NULL) + return (RET_ERROR); + if (!exact) { + mpool_put(t->bt_mp, e->page, 0); + return (RET_SPECIAL); + } + + status = __bt_ret(t, e, NULL, NULL, data, &t->bt_rdata, 0); + + /* + * If the user is doing concurrent access, we copied the + * key/data, toss the page. + */ + if (F_ISSET(t, B_DB_LOCK)) + mpool_put(t->bt_mp, e->page, 0); + else + t->bt_pinned = e->page; + return (status); +} diff --git a/lib/libc/db/btree/bt_open.c b/lib/libc/db/btree/bt_open.c new file mode 100644 index 000000000..679668e90 --- /dev/null +++ b/lib/libc/db/btree/bt_open.c @@ -0,0 +1,484 @@ +/* $NetBSD: bt_open.c,v 1.24 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_open.c,v 1.24 2008/09/11 12:58:00 joerg Exp $"); +#endif + +/* + * Implementation of btree access method for 4.4BSD. + * + * The design here was originally based on that of the btree access method + * used in the Postgres database system at UC Berkeley. This implementation + * is wholly independent of the Postgres code. + */ + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef __minix +#include +#else +#define _PATH_TMP "/tmp/" +#endif + +#include +#include "btree.h" + +#ifdef DEBUG +#undef MINPSIZE +#define MINPSIZE 128 +#endif + +#ifndef LITTLE_ENDIAN +# define LITTLE_ENDIAN 1234 +#endif + +#ifndef BIG_ENDIAN +# define BIG_ENDIAN 4321 +#endif + +#ifndef BYTE_ORDER +#define BYTE_ORDER LITTLE_ENDIAN +#endif + +static int byteorder(void); +static int nroot(BTREE *); +static int tmp(void); + +/* + * __BT_OPEN -- Open a btree. + * + * Creates and fills a DB struct, and calls the routine that actually + * opens the btree. + * + * Parameters: + * fname: filename (NULL for in-memory trees) + * flags: open flag bits + * mode: open permission bits + * b: BTREEINFO pointer + * + * Returns: + * NULL on failure, pointer to DB on success. + * + */ +DB * +__bt_open(const char *fname, int flags, mode_t mode, const BTREEINFO *openinfo, + int dflags) +{ + struct stat sb; + BTMETA m; + BTREE *t; + BTREEINFO b; + DB *dbp; + pgno_t ncache; + ssize_t nr; + size_t temp; + int machine_lorder; + + t = NULL; + + /* + * Intention is to make sure all of the user's selections are okay + * here and then use them without checking. Can't be complete, since + * we don't know the right page size, lorder or flags until the backing + * file is opened. Also, the file's page size can cause the cachesize + * to change. + */ + machine_lorder = byteorder(); + if (openinfo) { + b = *openinfo; + + /* Flags: R_DUP. */ + if (b.flags & ~(R_DUP)) + goto einval; + + /* + * Page size must be indx_t aligned and >= MINPSIZE. Default + * page size is set farther on, based on the underlying file + * transfer size. + */ + if (b.psize && + (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 || + b.psize & (sizeof(indx_t) - 1))) + goto einval; + + /* Minimum number of keys per page; absolute minimum is 2. */ + if (b.minkeypage) { + if (b.minkeypage < 2) + goto einval; + } else + b.minkeypage = DEFMINKEYPAGE; + + /* If no comparison, use default comparison and prefix. */ + if (b.compare == NULL) { + b.compare = __bt_defcmp; + if (b.prefix == NULL) + b.prefix = __bt_defpfx; + } + + if (b.lorder == 0) + b.lorder = machine_lorder; + } else { + b.compare = __bt_defcmp; + b.cachesize = 0; + b.flags = 0; + b.lorder = machine_lorder; + b.minkeypage = DEFMINKEYPAGE; + b.prefix = __bt_defpfx; + b.psize = 0; + } + + /* Check for the ubiquitous PDP-11. */ + if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN) + goto einval; + + /* Allocate and initialize DB and BTREE structures. */ + if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL) + goto err; + memset(t, 0, sizeof(BTREE)); + t->bt_fd = -1; /* Don't close unopened fd on error. */ + t->bt_lorder = b.lorder; + t->bt_order = NOT; + t->bt_cmp = b.compare; + t->bt_pfx = b.prefix; + t->bt_rfd = -1; + + if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL) + goto err; + memset(t->bt_dbp, 0, sizeof(DB)); + if (t->bt_lorder != machine_lorder) + F_SET(t, B_NEEDSWAP); + + dbp->type = DB_BTREE; + dbp->internal = t; + dbp->close = __bt_close; + dbp->del = __bt_delete; + dbp->fd = __bt_fd; + dbp->get = __bt_get; + dbp->put = __bt_put; + dbp->seq = __bt_seq; + dbp->sync = __bt_sync; + + /* + * If no file name was supplied, this is an in-memory btree and we + * open a backing temporary file. Otherwise, it's a disk-based tree. + */ + if (fname) { + switch (flags & O_ACCMODE) { + case O_RDONLY: + F_SET(t, B_RDONLY); + break; + case O_RDWR: + break; + case O_WRONLY: + default: + goto einval; + } + + if ((t->bt_fd = open(fname, flags, mode)) == -1) + goto err; + if (fcntl(t->bt_fd, F_SETFD, FD_CLOEXEC) == -1) + goto err; + } else { + if ((flags & O_ACCMODE) != O_RDWR) + goto einval; + if ((t->bt_fd = tmp()) == -1) + goto err; + F_SET(t, B_INMEM); + } + + if (fcntl(t->bt_fd, F_SETFD, FD_CLOEXEC) == -1) + goto err; + + if (fstat(t->bt_fd, &sb)) + goto err; + if (sb.st_size) { + if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0) + goto err; + if (nr != sizeof(BTMETA)) + goto eftype; + + /* + * Read in the meta-data. This can change the notion of what + * the lorder, page size and flags are, and, when the page size + * changes, the cachesize value can change too. If the user + * specified the wrong byte order for an existing database, we + * don't bother to return an error, we just clear the NEEDSWAP + * bit. + */ + if (m.magic == BTREEMAGIC) + F_CLR(t, B_NEEDSWAP); + else { + F_SET(t, B_NEEDSWAP); + M_32_SWAP(m.magic); + M_32_SWAP(m.version); + M_32_SWAP(m.psize); + M_32_SWAP(m.free); + M_32_SWAP(m.nrecs); + M_32_SWAP(m.flags); + } + if (m.magic != BTREEMAGIC || m.version != BTREEVERSION) + goto eftype; + if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 || + m.psize & (sizeof(indx_t) - 1)) + goto eftype; + if (m.flags & ~SAVEMETA) + goto eftype; + b.psize = m.psize; + F_SET(t, m.flags); + t->bt_free = m.free; + t->bt_nrecs = m.nrecs; + } else { + /* + * Set the page size to the best value for I/O to this file. + * Don't overflow the page offset type. + */ + if (b.psize == 0) { +#ifndef __minix + b.psize = sb.st_blksize; +#else + b.psize = 4096; +#endif + if (b.psize < MINPSIZE) + b.psize = MINPSIZE; + if (b.psize > MAX_PAGE_OFFSET + 1) + b.psize = MAX_PAGE_OFFSET + 1; + } + + /* Set flag if duplicates permitted. */ + if (!(b.flags & R_DUP)) + F_SET(t, B_NODUPS); + + t->bt_free = P_INVALID; + t->bt_nrecs = 0; + F_SET(t, B_METADIRTY); + } + + t->bt_psize = b.psize; + + /* Set the cache size; must be a multiple of the page size. */ + if (b.cachesize && b.cachesize & (b.psize - 1)) + b.cachesize += (~b.cachesize & (b.psize - 1)) + 1; + if (b.cachesize < b.psize * MINCACHE) + b.cachesize = b.psize * MINCACHE; + + /* Calculate number of pages to cache. */ + ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize; + + /* + * The btree data structure requires that at least two keys can fit on + * a page, but other than that there's no fixed requirement. The user + * specified a minimum number per page, and we translated that into the + * number of bytes a key/data pair can use before being placed on an + * overflow page. This calculation includes the page header, the size + * of the index referencing the leaf item and the size of the leaf item + * structure. Also, don't let the user specify a minkeypage such that + * a key/data pair won't fit even if both key and data are on overflow + * pages. + */ + temp = (t->bt_psize - BTDATAOFF) / b.minkeypage - + (sizeof(indx_t) + NBLEAFDBT(0, 0)); + _DBFIT(temp, indx_t); + t->bt_ovflsize = (indx_t)temp; + if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t)) + t->bt_ovflsize = + NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t); + + /* Initialize the buffer pool. */ + if ((t->bt_mp = + mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL) + goto err; + if (!F_ISSET(t, B_INMEM)) + mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t); + + /* Create a root page if new tree. */ + if (nroot(t) == RET_ERROR) + goto err; + + /* Global flags. */ + if (dflags & DB_LOCK) + F_SET(t, B_DB_LOCK); + if (dflags & DB_SHMEM) + F_SET(t, B_DB_SHMEM); + if (dflags & DB_TXN) + F_SET(t, B_DB_TXN); + + return (dbp); + +einval: errno = EINVAL; + goto err; + +eftype: errno = EFTYPE; + goto err; + +err: if (t) { + if (t->bt_dbp) + free(t->bt_dbp); + if (t->bt_fd != -1) + (void)close(t->bt_fd); + free(t); + } + return (NULL); +} + +/* + * NROOT -- Create the root of a new tree. + * + * Parameters: + * t: tree + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +static int +nroot(BTREE *t) +{ + PAGE *meta, *root; + pgno_t npg; + + if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) { + mpool_put(t->bt_mp, meta, 0); + return (RET_SUCCESS); + } + if (errno != EINVAL) /* It's OK to not exist. */ + return (RET_ERROR); + errno = 0; + + if ((meta = mpool_new(t->bt_mp, &npg)) == NULL) + return (RET_ERROR); + + if ((root = mpool_new(t->bt_mp, &npg)) == NULL) + return (RET_ERROR); + + if (npg != P_ROOT) + return (RET_ERROR); + root->pgno = npg; + root->prevpg = root->nextpg = P_INVALID; + root->lower = BTDATAOFF; + root->upper = t->bt_psize; + root->flags = P_BLEAF; + memset(meta, 0, t->bt_psize); + mpool_put(t->bt_mp, meta, MPOOL_DIRTY); + mpool_put(t->bt_mp, root, MPOOL_DIRTY); + return (RET_SUCCESS); +} + +static int +tmp(void) +{ + sigset_t set, oset; + size_t len; + int fd; + char *envtmp; + char path[PATH_MAX]; + +#ifndef __minix + if (issetugid()) + envtmp = NULL; + else + envtmp = getenv("TMPDIR"); +#else + envtmp = getenv("TMPDIR"); +#endif + + len = snprintf(path, + sizeof(path), "%s/bt.XXXXXX", envtmp ? envtmp : _PATH_TMP); + if (len >= sizeof(path)) + return -1; + + (void)sigfillset(&set); + (void)sigprocmask(SIG_BLOCK, &set, &oset); + if ((fd = mkstemp(path)) != -1) { + (void)unlink(path); + (void)fcntl(fd, F_SETFD, FD_CLOEXEC); + } + (void)sigprocmask(SIG_SETMASK, &oset, NULL); + return(fd); +} + +static int +byteorder(void) +{ + uint32_t x; + uint8_t *p; + + x = 0x01020304; + p = (uint8_t *)(void *)&x; + switch (*p) { + case 1: + return (BIG_ENDIAN); + case 4: + return (LITTLE_ENDIAN); + default: + return (0); + } +} + +int +__bt_fd(const DB *dbp) +{ + BTREE *t; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* In-memory database can't have a file descriptor. */ + if (F_ISSET(t, B_INMEM)) { + errno = ENOENT; + return (-1); + } + return (t->bt_fd); +} diff --git a/lib/libc/db/btree/bt_overflow.c b/lib/libc/db/btree/bt_overflow.c new file mode 100644 index 000000000..1d77be9ca --- /dev/null +++ b/lib/libc/db/btree/bt_overflow.c @@ -0,0 +1,239 @@ +/* $NetBSD: bt_overflow.c,v 1.16 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_overflow.c,v 1.16 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include +#include + +#include +#include "btree.h" + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +/* + * Big key/data code. + * + * Big key and data entries are stored on linked lists of pages. The initial + * reference is byte string stored with the key or data and is the page number + * and size. The actual record is stored in a chain of pages linked by the + * nextpg field of the PAGE header. + * + * The first page of the chain has a special property. If the record is used + * by an internal page, it cannot be deleted and the P_PRESERVE bit will be set + * in the header. + * + * XXX + * A single DBT is written to each chain, so a lot of space on the last page + * is wasted. This is a fairly major bug for some data sets. + */ + +/* + * __OVFL_GET -- Get an overflow key/data item. + * + * Parameters: + * t: tree + * p: pointer to { pgno_t, uint32_t } + * buf: storage address + * bufsz: storage size + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__ovfl_get(BTREE *t, void *p, size_t *ssz, void **buf, size_t *bufsz) +{ + PAGE *h; + pgno_t pg; + uint32_t sz, nb, plen; + size_t temp; + + memmove(&pg, p, sizeof(pgno_t)); + memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(uint32_t)); + *ssz = sz; + +#ifdef DEBUG + if (pg == P_INVALID || sz == 0) + abort(); +#endif + /* Make the buffer bigger as necessary. */ + if (*bufsz < sz) { + *buf = (char *)(*buf == NULL ? malloc(sz) : realloc(*buf, sz)); + if (*buf == NULL) + return (RET_ERROR); + *bufsz = sz; + } + + /* + * Step through the linked list of pages, copying the data on each one + * into the buffer. Never copy more than the data's length. + */ + temp = t->bt_psize - BTDATAOFF; + _DBFIT(temp, uint32_t); + plen = (uint32_t)temp; + for (p = *buf;; p = (char *)p + nb, pg = h->nextpg) { + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + + nb = MIN(sz, plen); + memmove(p, (char *)(void *)h + BTDATAOFF, nb); + mpool_put(t->bt_mp, h, 0); + + if ((sz -= nb) == 0) + break; + } + return (RET_SUCCESS); +} + +/* + * __OVFL_PUT -- Store an overflow key/data item. + * + * Parameters: + * t: tree + * data: DBT to store + * pgno: storage page number + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__ovfl_put(BTREE *t, const DBT *dbt, pgno_t *pg) +{ + PAGE *h, *last; + void *p; + pgno_t npg; + uint32_t sz, nb, plen; + size_t temp; + + /* + * Allocate pages and copy the key/data record into them. Store the + * number of the first page in the chain. + */ + temp = t->bt_psize - BTDATAOFF; + _DBFIT(temp, uint32_t); + plen = (uint32_t)temp; + last = NULL; + p = dbt->data; + temp = dbt->size; + _DBFIT(temp, uint32_t); + sz = temp; + for (;; p = (char *)p + plen, last = h) { + if ((h = __bt_new(t, &npg)) == NULL) + return (RET_ERROR); + + h->pgno = npg; + h->nextpg = h->prevpg = P_INVALID; + h->flags = P_OVERFLOW; + h->lower = h->upper = 0; + + nb = MIN(sz, plen); + (void)memmove((char *)(void *)h + BTDATAOFF, p, (size_t)nb); + + if (last) { + last->nextpg = h->pgno; + mpool_put(t->bt_mp, last, MPOOL_DIRTY); + } else + *pg = h->pgno; + + if ((sz -= nb) == 0) { + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + break; + } + } + return (RET_SUCCESS); +} + +/* + * __OVFL_DELETE -- Delete an overflow chain. + * + * Parameters: + * t: tree + * p: pointer to { pgno_t, uint32_t } + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__ovfl_delete(BTREE *t, void *p) +{ + PAGE *h; + pgno_t pg; + uint32_t sz, plen; + size_t temp; + + (void)memmove(&pg, p, sizeof(pgno_t)); + (void)memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(uint32_t)); + +#ifdef DEBUG + if (pg == P_INVALID || sz == 0) + abort(); +#endif + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + + /* Don't delete chains used by internal pages. */ + if (h->flags & P_PRESERVE) { + mpool_put(t->bt_mp, h, 0); + return (RET_SUCCESS); + } + + /* Step through the chain, calling the free routine for each page. */ + temp = t->bt_psize - BTDATAOFF; + _DBFIT(temp, uint32_t); + plen = (uint32_t)temp; + for (;; sz -= plen) { + pg = h->nextpg; + __bt_free(t, h); + if (sz <= plen) + break; + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + } + return (RET_SUCCESS); +} diff --git a/lib/libc/db/btree/bt_page.c b/lib/libc/db/btree/bt_page.c new file mode 100644 index 000000000..afa05ab10 --- /dev/null +++ b/lib/libc/db/btree/bt_page.c @@ -0,0 +1,103 @@ +/* $NetBSD: bt_page.c,v 1.13 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_page.c,v 1.13 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include + +#include +#include "btree.h" + +/* + * __bt_free -- + * Put a page on the freelist. + * + * Parameters: + * t: tree + * h: page to free + * + * Returns: + * RET_ERROR, RET_SUCCESS + * + * Side-effect: + * mpool_put's the page. + */ +int +__bt_free(BTREE *t, PAGE *h) +{ + /* Insert the page at the head of the free list. */ + h->prevpg = P_INVALID; + h->nextpg = t->bt_free; + t->bt_free = h->pgno; + F_SET(t, B_METADIRTY); + + /* Make sure the page gets written back. */ + return (mpool_put(t->bt_mp, h, MPOOL_DIRTY)); +} + +/* + * __bt_new -- + * Get a new page, preferably from the freelist. + * + * Parameters: + * t: tree + * npg: storage for page number. + * + * Returns: + * Pointer to a page, NULL on error. + */ +PAGE * +__bt_new(BTREE *t, pgno_t *npg) +{ + PAGE *h; + + if (t->bt_free != P_INVALID && + (h = mpool_get(t->bt_mp, t->bt_free, 0)) != NULL) { + *npg = t->bt_free; + t->bt_free = h->nextpg; + F_SET(t, B_METADIRTY); + return (h); + } + return (mpool_new(t->bt_mp, npg)); +} diff --git a/lib/libc/db/btree/bt_put.c b/lib/libc/db/btree/bt_put.c new file mode 100644 index 000000000..ace935d3b --- /dev/null +++ b/lib/libc/db/btree/bt_put.c @@ -0,0 +1,323 @@ +/* $NetBSD: bt_put.c,v 1.19 2009/02/12 06:40:14 lukem Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_put.c,v 1.19 2009/02/12 06:40:14 lukem Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include +#include +#include + +#include +#include "btree.h" + +static EPG *bt_fast(BTREE *, const DBT *, const DBT *, int *); + +/* + * __BT_PUT -- Add a btree item to the tree. + * + * Parameters: + * dbp: pointer to access method + * key: key + * data: data + * flag: R_NOOVERWRITE + * + * Returns: + * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key is already in the + * tree and R_NOOVERWRITE specified. + */ +int +__bt_put(const DB *dbp, DBT *key, const DBT *data, u_int flags) +{ + BTREE *t; + DBT tkey, tdata; + EPG *e = NULL; /* pacify gcc */ + PAGE *h; + indx_t idx, nxtindex; + pgno_t pg; + uint32_t nbytes, temp; + int dflags, exact, status; + char *dest, db[NOVFLSIZE], kb[NOVFLSIZE]; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* Check for change to a read-only tree. */ + if (F_ISSET(t, B_RDONLY)) { + errno = EPERM; + return (RET_ERROR); + } + + switch (flags) { + case 0: + case R_NOOVERWRITE: + break; + case R_CURSOR: + /* + * If flags is R_CURSOR, put the cursor. Must already + * have started a scan and not have already deleted it. + */ + if (F_ISSET(&t->bt_cursor, CURS_INIT) && + !F_ISSET(&t->bt_cursor, + CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE)) + break; + /* FALLTHROUGH */ + default: + errno = EINVAL; + return (RET_ERROR); + } + + /* + * If the key/data pair won't fit on a page, store it on overflow + * pages. Only put the key on the overflow page if the pair are + * still too big after moving the data to an overflow page. + * + * XXX + * If the insert fails later on, the overflow pages aren't recovered. + */ + dflags = 0; + if (key->size + data->size > t->bt_ovflsize) { + if (key->size > t->bt_ovflsize) { +storekey: if (__ovfl_put(t, key, &pg) == RET_ERROR) + return (RET_ERROR); + tkey.data = kb; + tkey.size = NOVFLSIZE; + memmove(kb, &pg, sizeof(pgno_t)); + memmove(kb + sizeof(pgno_t), + &key->size, sizeof(uint32_t)); + dflags |= P_BIGKEY; + key = &tkey; + } + if (key->size + data->size > t->bt_ovflsize) { + if (__ovfl_put(t, data, &pg) == RET_ERROR) + return (RET_ERROR); + tdata.data = db; + tdata.size = NOVFLSIZE; + memmove(db, &pg, sizeof(pgno_t)); + _DBFIT(data->size, uint32_t); + temp = (uint32_t)data->size; + (void)memmove(db + sizeof(pgno_t), + &temp, sizeof(uint32_t)); + dflags |= P_BIGDATA; + data = &tdata; + } + if (key->size + data->size > t->bt_ovflsize) + goto storekey; + } + + /* Replace the cursor. */ + if (flags == R_CURSOR) { + if ((h = mpool_get(t->bt_mp, t->bt_cursor.pg.pgno, 0)) == NULL) + return (RET_ERROR); + idx = t->bt_cursor.pg.index; + goto delete; + } + + /* + * Find the key to delete, or, the location at which to insert. + * Bt_fast and __bt_search both pin the returned page. + */ + if (t->bt_order == NOT || (e = bt_fast(t, key, data, &exact)) == NULL) + if ((e = __bt_search(t, key, &exact)) == NULL) + return (RET_ERROR); + h = e->page; + idx = e->index; + + /* + * Add the key/data pair to the tree. If an identical key is already + * in the tree, and R_NOOVERWRITE is set, an error is returned. If + * R_NOOVERWRITE is not set, the key is either added (if duplicates are + * permitted) or an error is returned. + */ + switch (flags) { + case R_NOOVERWRITE: + if (!exact) + break; + mpool_put(t->bt_mp, h, 0); + return (RET_SPECIAL); + default: + if (!exact || !F_ISSET(t, B_NODUPS)) + break; + /* + * !!! + * Note, the delete may empty the page, so we need to put a + * new entry into the page immediately. + */ +delete: if (__bt_dleaf(t, key, h, (u_int)idx) == RET_ERROR) { + mpool_put(t->bt_mp, h, 0); + return (RET_ERROR); + } + break; + } + + /* + * If not enough room, or the user has put a ceiling on the number of + * keys permitted in the page, split the page. The split code will + * insert the key and data and unpin the current page. If inserting + * into the offset array, shift the pointers up. + */ + nbytes = NBLEAFDBT(key->size, data->size); + if ((uint32_t)h->upper - (uint32_t)h->lower < nbytes + sizeof(indx_t)) { + if ((status = __bt_split(t, h, key, + data, dflags, nbytes, (u_int)idx)) != RET_SUCCESS) + return (status); + goto success; + } + + if (idx < (nxtindex = NEXTINDEX(h))) + memmove(h->linp + idx + 1, h->linp + idx, + (nxtindex - idx) * sizeof(indx_t)); + h->lower += sizeof(indx_t); + + h->linp[idx] = h->upper -= nbytes; + dest = (char *)(void *)h + h->upper; + WR_BLEAF(dest, key, data, dflags); + + /* If the cursor is on this page, adjust it as necessary. */ + if (F_ISSET(&t->bt_cursor, CURS_INIT) && + !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && + t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index >= idx) + ++t->bt_cursor.pg.index; + + if (t->bt_order == NOT) { + if (h->nextpg == P_INVALID) { + if (idx == NEXTINDEX(h) - 1) { + t->bt_order = FORWARD; + t->bt_last.index = idx; + t->bt_last.pgno = h->pgno; + } + } else if (h->prevpg == P_INVALID) { + if (idx == 0) { + t->bt_order = BACK; + t->bt_last.index = 0; + t->bt_last.pgno = h->pgno; + } + } + } + + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + +success: + if (flags == R_SETCURSOR) + __bt_setcur(t, e->page->pgno, (u_int)e->index); + + F_SET(t, B_MODIFIED); + return (RET_SUCCESS); +} + +#ifdef STATISTICS +unsigned long bt_cache_hit, bt_cache_miss; +#endif + +/* + * BT_FAST -- Do a quick check for sorted data. + * + * Parameters: + * t: tree + * key: key to insert + * + * Returns: + * EPG for new record or NULL if not found. + */ +static EPG * +bt_fast(BTREE *t, const DBT *key, const DBT *data, int *exactp) +{ + PAGE *h; + uint32_t nbytes; + int cmp; + + if ((h = mpool_get(t->bt_mp, t->bt_last.pgno, 0)) == NULL) { + t->bt_order = NOT; + return (NULL); + } + t->bt_cur.page = h; + t->bt_cur.index = t->bt_last.index; + + /* + * If won't fit in this page or have too many keys in this page, + * have to search to get split stack. + */ + nbytes = NBLEAFDBT(key->size, data->size); + if ((uint32_t)h->upper - (uint32_t)h->lower < nbytes + sizeof(indx_t)) + goto miss; + + if (t->bt_order == FORWARD) { + if (t->bt_cur.page->nextpg != P_INVALID) + goto miss; + if (t->bt_cur.index != NEXTINDEX(h) - 1) + goto miss; + if ((cmp = __bt_cmp(t, key, &t->bt_cur)) < 0) + goto miss; + t->bt_last.index = cmp ? ++t->bt_cur.index : t->bt_cur.index; + } else { + if (t->bt_cur.page->prevpg != P_INVALID) + goto miss; + if (t->bt_cur.index != 0) + goto miss; + if ((cmp = __bt_cmp(t, key, &t->bt_cur)) > 0) + goto miss; + t->bt_last.index = 0; + } + *exactp = cmp == 0; +#ifdef STATISTICS + ++bt_cache_hit; +#endif + return (&t->bt_cur); + +miss: +#ifdef STATISTICS + ++bt_cache_miss; +#endif + t->bt_order = NOT; + mpool_put(t->bt_mp, h, 0); + return (NULL); +} diff --git a/lib/libc/db/btree/bt_search.c b/lib/libc/db/btree/bt_search.c new file mode 100644 index 000000000..db17049c5 --- /dev/null +++ b/lib/libc/db/btree/bt_search.c @@ -0,0 +1,209 @@ +/* $NetBSD: bt_search.c,v 1.17 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_search.c,v 1.17 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include + +#include +#include "btree.h" + +static int __bt_snext(BTREE *, PAGE *, const DBT *, int *); +static int __bt_sprev(BTREE *, PAGE *, const DBT *, int *); + +/* + * __bt_search -- + * Search a btree for a key. + * + * Parameters: + * t: tree to search + * key: key to find + * exactp: pointer to exact match flag + * + * Returns: + * The EPG for matching record, if any, or the EPG for the location + * of the key, if it were inserted into the tree, is entered into + * the bt_cur field of the tree. A pointer to the field is returned. + */ +EPG * +__bt_search(BTREE *t, const DBT *key, int *exactp) +{ + PAGE *h; + indx_t base, idx, lim; + pgno_t pg; + int cmp; + + BT_CLR(t); + for (pg = P_ROOT;;) { + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (NULL); + + /* Do a binary search on the current page. */ + t->bt_cur.page = h; + for (base = 0, lim = NEXTINDEX(h); lim; lim >>= 1) { + t->bt_cur.index = idx = base + ((uint32_t)lim >> 1); + if ((cmp = __bt_cmp(t, key, &t->bt_cur)) == 0) { + if (h->flags & P_BLEAF) { + *exactp = 1; + return (&t->bt_cur); + } + goto next; + } + if (cmp > 0) { + base = idx + 1; + --lim; + } + } + + /* + * If it's a leaf page, we're almost done. If no duplicates + * are allowed, or we have an exact match, we're done. Else, + * it's possible that there were matching keys on this page, + * which later deleted, and we're on a page with no matches + * while there are matches on other pages. If at the start or + * end of a page, check the adjacent page. + */ + if (h->flags & P_BLEAF) { + if (!F_ISSET(t, B_NODUPS)) { + if (base == 0 && + h->prevpg != P_INVALID && + __bt_sprev(t, h, key, exactp)) + return (&t->bt_cur); + if (base == NEXTINDEX(h) && + h->nextpg != P_INVALID && + __bt_snext(t, h, key, exactp)) + return (&t->bt_cur); + } + *exactp = 0; + t->bt_cur.index = base; + return (&t->bt_cur); + } + + /* + * No match found. Base is the smallest index greater than + * key and may be zero or a last + 1 index. If it's non-zero, + * decrement by one, and record the internal page which should + * be a parent page for the key. If a split later occurs, the + * inserted page will be to the right of the saved page. + */ + idx = base ? base - 1 : base; + +next: BT_PUSH(t, h->pgno, idx); + pg = GETBINTERNAL(h, idx)->pgno; + mpool_put(t->bt_mp, h, 0); + } +} + +/* + * __bt_snext -- + * Check for an exact match after the key. + * + * Parameters: + * t: tree + * h: current page + * key: key + * exactp: pointer to exact match flag + * + * Returns: + * If an exact match found. + */ +static int +__bt_snext(BTREE *t, PAGE *h, const DBT *key, int *exactp) +{ + EPG e; + + /* + * Get the next page. The key is either an exact + * match, or not as good as the one we already have. + */ + if ((e.page = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) + return (0); + e.index = 0; + if (__bt_cmp(t, key, &e) == 0) { + mpool_put(t->bt_mp, h, 0); + t->bt_cur = e; + *exactp = 1; + return (1); + } + mpool_put(t->bt_mp, e.page, 0); + return (0); +} + +/* + * __bt_sprev -- + * Check for an exact match before the key. + * + * Parameters: + * t: tree + * h: current page + * key: key + * exactp: pointer to exact match flag + * + * Returns: + * If an exact match found. + */ +static int +__bt_sprev(BTREE *t, PAGE *h, const DBT *key, int *exactp) +{ + EPG e; + + /* + * Get the previous page. The key is either an exact + * match, or not as good as the one we already have. + */ + if ((e.page = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) + return (0); + e.index = NEXTINDEX(e.page) - 1; + if (__bt_cmp(t, key, &e) == 0) { + mpool_put(t->bt_mp, h, 0); + t->bt_cur = e; + *exactp = 1; + return (1); + } + mpool_put(t->bt_mp, e.page, 0); + return (0); +} diff --git a/lib/libc/db/btree/bt_seq.c b/lib/libc/db/btree/bt_seq.c new file mode 100644 index 000000000..f1cfac3c4 --- /dev/null +++ b/lib/libc/db/btree/bt_seq.c @@ -0,0 +1,446 @@ +/* $NetBSD: bt_seq.c,v 1.17 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_seq.c,v 1.17 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include +#include +#include + +#include +#include "btree.h" + +static int __bt_first(BTREE *, const DBT *, EPG *, int *); +static int __bt_seqadv(BTREE *, EPG *, int); +static int __bt_seqset(BTREE *, EPG *, DBT *, int); + +/* + * Sequential scan support. + * + * The tree can be scanned sequentially, starting from either end of the + * tree or from any specific key. A scan request before any scanning is + * done is initialized as starting from the least node. + */ + +/* + * __bt_seq -- + * Btree sequential scan interface. + * + * Parameters: + * dbp: pointer to access method + * key: key for positioning and return value + * data: data return value + * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV. + * + * Returns: + * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. + */ +int +__bt_seq(const DB *dbp, DBT *key, DBT *data, u_int flags) +{ + BTREE *t; + EPG e; + int status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* + * If scan unitialized as yet, or starting at a specific record, set + * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin + * the page the cursor references if they're successful. + */ + switch (flags) { + case R_NEXT: + case R_PREV: + if (F_ISSET(&t->bt_cursor, CURS_INIT)) { + status = __bt_seqadv(t, &e, (int)flags); + break; + } + /* FALLTHROUGH */ + case R_FIRST: + case R_LAST: + case R_CURSOR: + status = __bt_seqset(t, &e, key, (int)flags); + break; + default: + errno = EINVAL; + return (RET_ERROR); + } + + if (status == RET_SUCCESS) { + __bt_setcur(t, e.page->pgno, (u_int)e.index); + + status = + __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0); + + /* + * If the user is doing concurrent access, we copied the + * key/data, toss the page. + */ + if (F_ISSET(t, B_DB_LOCK)) + mpool_put(t->bt_mp, e.page, 0); + else + t->bt_pinned = e.page; + } + return (status); +} + +/* + * __bt_seqset -- + * Set the sequential scan to a specific key. + * + * Parameters: + * t: tree + * ep: storage for returned key + * key: key for initial scan position + * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV + * + * Side effects: + * Pins the page the cursor references. + * + * Returns: + * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. + */ +static int +__bt_seqset(BTREE *t, EPG *ep, DBT *key, int flags) +{ + PAGE *h; + pgno_t pg; + int exact; + + /* + * Find the first, last or specific key in the tree and point the + * cursor at it. The cursor may not be moved until a new key has + * been found. + */ + switch (flags) { + case R_CURSOR: /* Keyed scan. */ + /* + * Find the first instance of the key or the smallest key + * which is greater than or equal to the specified key. + */ + if (key->data == NULL || key->size == 0) { + errno = EINVAL; + return (RET_ERROR); + } + return (__bt_first(t, key, ep, &exact)); + case R_FIRST: /* First record. */ + case R_NEXT: + /* Walk down the left-hand side of the tree. */ + for (pg = P_ROOT;;) { + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + + /* Check for an empty tree. */ + if (NEXTINDEX(h) == 0) { + mpool_put(t->bt_mp, h, 0); + return (RET_SPECIAL); + } + + if (h->flags & (P_BLEAF | P_RLEAF)) + break; + pg = GETBINTERNAL(h, 0)->pgno; + mpool_put(t->bt_mp, h, 0); + } + ep->page = h; + ep->index = 0; + break; + case R_LAST: /* Last record. */ + case R_PREV: + /* Walk down the right-hand side of the tree. */ + for (pg = P_ROOT;;) { + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + + /* Check for an empty tree. */ + if (NEXTINDEX(h) == 0) { + mpool_put(t->bt_mp, h, 0); + return (RET_SPECIAL); + } + + if (h->flags & (P_BLEAF | P_RLEAF)) + break; + pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; + mpool_put(t->bt_mp, h, 0); + } + + ep->page = h; + ep->index = NEXTINDEX(h) - 1; + break; + } + return (RET_SUCCESS); +} + +/* + * __bt_seqadvance -- + * Advance the sequential scan. + * + * Parameters: + * t: tree + * flags: R_NEXT, R_PREV + * + * Side effects: + * Pins the page the new key/data record is on. + * + * Returns: + * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. + */ +static int +__bt_seqadv(BTREE *t, EPG *ep, int flags) +{ + CURSOR *c; + PAGE *h; + indx_t idx = 0; /* pacify gcc */ + pgno_t pg; + int exact; + + /* + * There are a couple of states that we can be in. The cursor has + * been initialized by the time we get here, but that's all we know. + */ + c = &t->bt_cursor; + + /* + * The cursor was deleted where there weren't any duplicate records, + * so the key was saved. Find out where that key would go in the + * current tree. It doesn't matter if the returned key is an exact + * match or not -- if it's an exact match, the record was added after + * the delete so we can just return it. If not, as long as there's + * a record there, return it. + */ + if (F_ISSET(c, CURS_ACQUIRE)) + return (__bt_first(t, &c->key, ep, &exact)); + + /* Get the page referenced by the cursor. */ + if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) + return (RET_ERROR); + + /* + * Find the next/previous record in the tree and point the cursor at + * it. The cursor may not be moved until a new key has been found. + */ + switch (flags) { + case R_NEXT: /* Next record. */ + /* + * The cursor was deleted in duplicate records, and moved + * forward to a record that has yet to be returned. Clear + * that flag, and return the record. + */ + if (F_ISSET(c, CURS_AFTER)) + goto usecurrent; + idx = c->pg.index; + if (++idx == NEXTINDEX(h)) { + pg = h->nextpg; + mpool_put(t->bt_mp, h, 0); + if (pg == P_INVALID) + return (RET_SPECIAL); + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + idx = 0; + } + break; + case R_PREV: /* Previous record. */ + /* + * The cursor was deleted in duplicate records, and moved + * backward to a record that has yet to be returned. Clear + * that flag, and return the record. + */ + if (F_ISSET(c, CURS_BEFORE)) { +usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE); + ep->page = h; + ep->index = c->pg.index; + return (RET_SUCCESS); + } + idx = c->pg.index; + if (idx == 0) { + pg = h->prevpg; + mpool_put(t->bt_mp, h, 0); + if (pg == P_INVALID) + return (RET_SPECIAL); + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + idx = NEXTINDEX(h) - 1; + } else + --idx; + break; + } + + ep->page = h; + ep->index = idx; + return (RET_SUCCESS); +} + +/* + * __bt_first -- + * Find the first entry. + * + * Parameters: + * t: the tree + * key: the key + * erval: return EPG + * exactp: pointer to exact match flag + * + * Returns: + * The first entry in the tree greater than or equal to key, + * or RET_SPECIAL if no such key exists. + */ +static int +__bt_first(BTREE *t, const DBT *key, EPG *erval, int *exactp) +{ + PAGE *h; + EPG *ep, save; + pgno_t pg; + + /* + * Find any matching record; __bt_search pins the page. + * + * If it's an exact match and duplicates are possible, walk backwards + * in the tree until we find the first one. Otherwise, make sure it's + * a valid key (__bt_search may return an index just past the end of a + * page) and return it. + */ + if ((ep = __bt_search(t, key, exactp)) == NULL) + return (0); + if (*exactp) { + if (F_ISSET(t, B_NODUPS)) { + *erval = *ep; + return (RET_SUCCESS); + } + + /* + * Walk backwards, as long as the entry matches and there are + * keys left in the tree. Save a copy of each match in case + * we go too far. + */ + save = *ep; + h = ep->page; + do { + if (save.page->pgno != ep->page->pgno) { + mpool_put(t->bt_mp, save.page, 0); + save = *ep; + } else + save.index = ep->index; + + /* + * Don't unpin the page the last (or original) match + * was on, but make sure it's unpinned if an error + * occurs. + */ + if (ep->index == 0) { + if (h->prevpg == P_INVALID) + break; + if (h->pgno != save.page->pgno) + mpool_put(t->bt_mp, h, 0); + if ((h = mpool_get(t->bt_mp, + h->prevpg, 0)) == NULL) + return (RET_ERROR); + ep->page = h; + ep->index = NEXTINDEX(h); + } + --ep->index; + } while (__bt_cmp(t, key, ep) == 0); + + /* + * Reach here with the last page that was looked at pinned, + * which may or may not be the same as the last (or original) + * match page. If it's not useful, release it. + */ + if (h->pgno != save.page->pgno) + mpool_put(t->bt_mp, h, 0); + + *erval = save; + return (RET_SUCCESS); + } + + /* If at the end of a page, find the next entry. */ + if (ep->index == NEXTINDEX(ep->page)) { + h = ep->page; + pg = h->nextpg; + mpool_put(t->bt_mp, h, 0); + if (pg == P_INVALID) + return (RET_SPECIAL); + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + ep->index = 0; + ep->page = h; + } + *erval = *ep; + return (RET_SUCCESS); +} + +/* + * __bt_setcur -- + * Set the cursor to an entry in the tree. + * + * Parameters: + * t: the tree + * pgno: page number + * idx: page index + */ +void +__bt_setcur(BTREE *t, pgno_t pgno, u_int idx) +{ + /* Lose any already deleted key. */ + if (t->bt_cursor.key.data != NULL) { + free(t->bt_cursor.key.data); + t->bt_cursor.key.size = 0; + t->bt_cursor.key.data = NULL; + } + F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); + + /* Update the cursor. */ + t->bt_cursor.pg.pgno = pgno; + t->bt_cursor.pg.index = idx; + F_SET(&t->bt_cursor, CURS_INIT); +} diff --git a/lib/libc/db/btree/bt_split.c b/lib/libc/db/btree/bt_split.c new file mode 100644 index 000000000..07c59b6a3 --- /dev/null +++ b/lib/libc/db/btree/bt_split.c @@ -0,0 +1,831 @@ +/* $NetBSD: bt_split.c,v 1.19 2009/04/22 18:44:06 christos Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_split.c,v 1.19 2009/04/22 18:44:06 christos Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include +#include +#include + +#include +#include "btree.h" + +static int bt_broot(BTREE *, PAGE *, PAGE *, PAGE *); +static PAGE *bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t); +static int bt_preserve(BTREE *, pgno_t); +static PAGE *bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t); +static PAGE *bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t); +static int bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *); +static recno_t rec_total(PAGE *); + +#ifdef STATISTICS +unsigned long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved; +#endif + +/* + * __BT_SPLIT -- Split the tree. + * + * Parameters: + * t: tree + * sp: page to split + * key: key to insert + * data: data to insert + * flags: BIGKEY/BIGDATA flags + * ilen: insert length + * skip: index to leave open + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags, + size_t ilen, uint32_t argskip) +{ + BINTERNAL *bi = NULL; /* pacify gcc */ + BLEAF *bl = NULL, *tbl; /* pacify gcc */ + DBT a, b; + EPGNO *parent; + PAGE *h, *l, *r, *lchild, *rchild; + indx_t nxtindex; + uint16_t skip; + uint32_t n, nbytes, nksize = 0; /* pacify gcc */ + int parentsplit; + char *dest; + + /* + * Split the page into two pages, l and r. The split routines return + * a pointer to the page into which the key should be inserted and with + * skip set to the offset which should be used. Additionally, l and r + * are pinned. + */ + skip = argskip; + h = sp->pgno == P_ROOT ? + bt_root(t, sp, &l, &r, &skip, ilen) : + bt_page(t, sp, &l, &r, &skip, ilen); + if (h == NULL) + return (RET_ERROR); + + /* + * Insert the new key/data pair into the leaf page. (Key inserts + * always cause a leaf page to split first.) + */ + _DBFIT(ilen, indx_t); + h->upper -= (indx_t)ilen; + h->linp[skip] = h->upper; + dest = (char *)(void *)h + h->upper; + if (F_ISSET(t, R_RECNO)) + WR_RLEAF(dest, data, flags); + else + WR_BLEAF(dest, key, data, flags); + + /* If the root page was split, make it look right. */ + if (sp->pgno == P_ROOT && + (F_ISSET(t, R_RECNO) ? + bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) + goto err2; + + /* + * Now we walk the parent page stack -- a LIFO stack of the pages that + * were traversed when we searched for the page that split. Each stack + * entry is a page number and a page index offset. The offset is for + * the page traversed on the search. We've just split a page, so we + * have to insert a new key into the parent page. + * + * If the insert into the parent page causes it to split, may have to + * continue splitting all the way up the tree. We stop if the root + * splits or the page inserted into didn't have to split to hold the + * new key. Some algorithms replace the key for the old page as well + * as the new page. We don't, as there's no reason to believe that the + * first key on the old page is any better than the key we have, and, + * in the case of a key being placed at index 0 causing the split, the + * key is unavailable. + * + * There are a maximum of 5 pages pinned at any time. We keep the left + * and right pages pinned while working on the parent. The 5 are the + * two children, left parent and right parent (when the parent splits) + * and the root page or the overflow key page when calling bt_preserve. + * This code must make sure that all pins are released other than the + * root page or overflow page which is unlocked elsewhere. + */ + while ((parent = BT_POP(t)) != NULL) { + lchild = l; + rchild = r; + + /* Get the parent page. */ + if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) + goto err2; + + /* + * The new key goes ONE AFTER the index, because the split + * was to the right. + */ + skip = parent->index + 1; + + /* + * Calculate the space needed on the parent page. + * + * Prefix trees: space hack when inserting into BINTERNAL + * pages. Retain only what's needed to distinguish between + * the new entry and the LAST entry on the page to its left. + * If the keys compare equal, retain the entire key. Note, + * we don't touch overflow keys, and the entire key must be + * retained for the next-to-left most key on the leftmost + * page of each level, or the search will fail. Applicable + * ONLY to internal pages that have leaf pages as children. + * Further reduction of the key between pairs of internal + * pages loses too much information. + */ + switch (rchild->flags & P_TYPE) { + case P_BINTERNAL: + bi = GETBINTERNAL(rchild, 0); + nbytes = NBINTERNAL(bi->ksize); + break; + case P_BLEAF: + bl = GETBLEAF(rchild, 0); + nbytes = NBINTERNAL(bl->ksize); + if (t->bt_pfx && !(bl->flags & P_BIGKEY) && + (h->prevpg != P_INVALID || skip > 1)) { + size_t temp; + tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1); + a.size = tbl->ksize; + a.data = tbl->bytes; + b.size = bl->ksize; + b.data = bl->bytes; + temp = t->bt_pfx(&a, &b); + _DBFIT(temp, uint32_t); + nksize = (uint32_t)temp; + n = NBINTERNAL(nksize); + if (n < nbytes) { +#ifdef STATISTICS + bt_pfxsaved += nbytes - n; +#endif + nbytes = n; + } else + nksize = 0; + } else + nksize = 0; + break; + case P_RINTERNAL: + case P_RLEAF: + nbytes = NRINTERNAL; + break; + default: + abort(); + } + + /* Split the parent page if necessary or shift the indices. */ + if ((uint32_t)h->upper - (uint32_t)h->lower < nbytes + sizeof(indx_t)) { + sp = h; + h = h->pgno == P_ROOT ? + bt_root(t, h, &l, &r, &skip, nbytes) : + bt_page(t, h, &l, &r, &skip, nbytes); + if (h == NULL) + goto err1; + parentsplit = 1; + } else { + if (skip < (nxtindex = NEXTINDEX(h))) + memmove(h->linp + skip + 1, h->linp + skip, + (nxtindex - skip) * sizeof(indx_t)); + h->lower += sizeof(indx_t); + parentsplit = 0; + } + + /* Insert the key into the parent page. */ + switch (rchild->flags & P_TYPE) { + case P_BINTERNAL: + h->linp[skip] = h->upper -= nbytes; + dest = (char *)(void *)h + h->linp[skip]; + memmove(dest, bi, nbytes); + ((BINTERNAL *)(void *)dest)->pgno = rchild->pgno; + break; + case P_BLEAF: + h->linp[skip] = h->upper -= nbytes; + dest = (char *)(void *)h + h->linp[skip]; + WR_BINTERNAL(dest, nksize ? nksize : bl->ksize, + rchild->pgno, bl->flags & P_BIGKEY); + memmove(dest, bl->bytes, nksize ? nksize : bl->ksize); + if (bl->flags & P_BIGKEY && + bt_preserve(t, *(pgno_t *)(void *)bl->bytes) == + RET_ERROR) + goto err1; + break; + case P_RINTERNAL: + /* + * Update the left page count. If split + * added at index 0, fix the correct page. + */ + if (skip > 0) + dest = (char *)(void *)h + h->linp[skip - 1]; + else + dest = (char *)(void *)l + l->linp[NEXTINDEX(l) - 1]; + ((RINTERNAL *)(void *)dest)->nrecs = rec_total(lchild); + ((RINTERNAL *)(void *)dest)->pgno = lchild->pgno; + + /* Update the right page count. */ + h->linp[skip] = h->upper -= nbytes; + dest = (char *)(void *)h + h->linp[skip]; + ((RINTERNAL *)(void *)dest)->nrecs = rec_total(rchild); + ((RINTERNAL *)(void *)dest)->pgno = rchild->pgno; + break; + case P_RLEAF: + /* + * Update the left page count. If split + * added at index 0, fix the correct page. + */ + if (skip > 0) + dest = (char *)(void *)h + h->linp[skip - 1]; + else + dest = (char *)(void *)l + l->linp[NEXTINDEX(l) - 1]; + ((RINTERNAL *)(void *)dest)->nrecs = NEXTINDEX(lchild); + ((RINTERNAL *)(void *)dest)->pgno = lchild->pgno; + + /* Update the right page count. */ + h->linp[skip] = h->upper -= nbytes; + dest = (char *)(void *)h + h->linp[skip]; + ((RINTERNAL *)(void *)dest)->nrecs = NEXTINDEX(rchild); + ((RINTERNAL *)(void *)dest)->pgno = rchild->pgno; + break; + default: + abort(); + } + + /* Unpin the held pages. */ + if (!parentsplit) { + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + break; + } + + /* If the root page was split, make it look right. */ + if (sp->pgno == P_ROOT && + (F_ISSET(t, R_RECNO) ? + bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) + goto err1; + + mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); + mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); + } + + /* Unpin the held pages. */ + mpool_put(t->bt_mp, l, MPOOL_DIRTY); + mpool_put(t->bt_mp, r, MPOOL_DIRTY); + + /* Clear any pages left on the stack. */ + return (RET_SUCCESS); + + /* + * If something fails in the above loop we were already walking back + * up the tree and the tree is now inconsistent. Nothing much we can + * do about it but release any memory we're holding. + */ +err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); + mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); + +err2: mpool_put(t->bt_mp, l, 0); + mpool_put(t->bt_mp, r, 0); + __dbpanic(t->bt_dbp); + return (RET_ERROR); +} + +/* + * BT_PAGE -- Split a non-root page of a btree. + * + * Parameters: + * t: tree + * h: root page + * lp: pointer to left page pointer + * rp: pointer to right page pointer + * skip: pointer to index to leave open + * ilen: insert length + * + * Returns: + * Pointer to page in which to insert or NULL on error. + */ +static PAGE * +bt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen) +{ + PAGE *l, *r, *tp; + pgno_t npg; + +#ifdef STATISTICS + ++bt_split; +#endif + /* Put the new right page for the split into place. */ + if ((r = __bt_new(t, &npg)) == NULL) + return (NULL); + r->pgno = npg; + r->lower = BTDATAOFF; + r->upper = t->bt_psize; + r->nextpg = h->nextpg; + r->prevpg = h->pgno; + r->flags = h->flags & P_TYPE; + + /* + * If we're splitting the last page on a level because we're appending + * a key to it (skip is NEXTINDEX()), it's likely that the data is + * sorted. Adding an empty page on the side of the level is less work + * and can push the fill factor much higher than normal. If we're + * wrong it's no big deal, we'll just do the split the right way next + * time. It may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, but it's not. + * Don't even try. + */ + if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) { +#ifdef STATISTICS + ++bt_sortsplit; +#endif + h->nextpg = r->pgno; + r->lower = BTDATAOFF + sizeof(indx_t); + *skip = 0; + *lp = h; + *rp = r; + return (r); + } + + /* Put the new left page for the split into place. */ + if ((l = calloc(1, t->bt_psize)) == NULL) { + mpool_put(t->bt_mp, r, 0); + return (NULL); + } +#ifdef PURIFY + memset(l, 0xff, t->bt_psize); +#endif + l->pgno = h->pgno; + l->nextpg = r->pgno; + l->prevpg = h->prevpg; + l->lower = BTDATAOFF; + l->upper = t->bt_psize; + l->flags = h->flags & P_TYPE; + + /* Fix up the previous pointer of the page after the split page. */ + if (h->nextpg != P_INVALID) { + if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) { + free(l); + /* XXX mpool_free(t->bt_mp, r->pgno); */ + return (NULL); + } + tp->prevpg = r->pgno; + mpool_put(t->bt_mp, tp, MPOOL_DIRTY); + } + + /* + * Split right. The key/data pairs aren't sorted in the btree page so + * it's simpler to copy the data from the split page onto two new pages + * instead of copying half the data to the right page and compacting + * the left page in place. Since the left page can't change, we have + * to swap the original and the allocated left page after the split. + */ + tp = bt_psplit(t, h, l, r, skip, ilen); + + /* Move the new left page onto the old left page. */ + memmove(h, l, t->bt_psize); + if (tp == l) + tp = h; + free(l); + + *lp = h; + *rp = r; + return (tp); +} + +/* + * BT_ROOT -- Split the root page of a btree. + * + * Parameters: + * t: tree + * h: root page + * lp: pointer to left page pointer + * rp: pointer to right page pointer + * skip: pointer to index to leave open + * ilen: insert length + * + * Returns: + * Pointer to page in which to insert or NULL on error. + */ +static PAGE * +bt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen) +{ + PAGE *l, *r, *tp; + pgno_t lnpg, rnpg; + +#ifdef STATISTICS + ++bt_split; + ++bt_rootsplit; +#endif + /* Put the new left and right pages for the split into place. */ + if ((l = __bt_new(t, &lnpg)) == NULL || + (r = __bt_new(t, &rnpg)) == NULL) + return (NULL); + l->pgno = lnpg; + r->pgno = rnpg; + l->nextpg = r->pgno; + r->prevpg = l->pgno; + l->prevpg = r->nextpg = P_INVALID; + l->lower = r->lower = BTDATAOFF; + l->upper = r->upper = t->bt_psize; + l->flags = r->flags = h->flags & P_TYPE; + + /* Split the root page. */ + tp = bt_psplit(t, h, l, r, skip, ilen); + + *lp = l; + *rp = r; + return (tp); +} + +/* + * BT_RROOT -- Fix up the recno root page after it has been split. + * + * Parameters: + * t: tree + * h: root page + * l: left page + * r: right page + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +static int +bt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r) +{ + char *dest; + uint32_t sz; + size_t temp; + + temp = t->bt_psize - NRINTERNAL; + _DBFIT(temp, uint32_t); + sz = (uint32_t)temp; + + /* Insert the left and right keys, set the header information. */ + _DBFIT(sz, indx_t); + h->linp[0] = h->upper = (indx_t)sz; + dest = (char *)(void *)h + h->upper; + WR_RINTERNAL(dest, + l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno); + + h->linp[1] = h->upper -= NRINTERNAL; + dest = (char *)(void *)h + h->upper; + WR_RINTERNAL(dest, + r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno); + + h->lower = BTDATAOFF + 2 * sizeof(indx_t); + + /* Unpin the root page, set to recno internal page. */ + h->flags &= ~P_TYPE; + h->flags |= P_RINTERNAL; + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + + return (RET_SUCCESS); +} + +/* + * BT_BROOT -- Fix up the btree root page after it has been split. + * + * Parameters: + * t: tree + * h: root page + * l: left page + * r: right page + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +static int +bt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r) +{ + BINTERNAL *bi = NULL; /* pacify gcc */ + BLEAF *bl; + uint32_t nbytes; + char *dest; + + /* + * If the root page was a leaf page, change it into an internal page. + * We copy the key we split on (but not the key's data, in the case of + * a leaf page) to the new root page. + * + * The btree comparison code guarantees that the left-most key on any + * level of the tree is never used, so it doesn't need to be filled in. + */ + nbytes = NBINTERNAL(0); + h->linp[0] = h->upper = t->bt_psize - nbytes; + dest = (char *)(void *)h + h->upper; + WR_BINTERNAL(dest, 0, l->pgno, 0); + + switch (h->flags & P_TYPE) { + case P_BLEAF: + bl = GETBLEAF(r, 0); + nbytes = NBINTERNAL(bl->ksize); + h->linp[1] = h->upper -= nbytes; + dest = (char *)(void *)h + h->upper; + WR_BINTERNAL(dest, bl->ksize, r->pgno, 0); + memmove(dest, bl->bytes, bl->ksize); + + /* + * If the key is on an overflow page, mark the overflow chain + * so it isn't deleted when the leaf copy of the key is deleted. + */ + if (bl->flags & P_BIGKEY && + bt_preserve(t, *(pgno_t *)(void *)bl->bytes) == RET_ERROR) + return (RET_ERROR); + break; + case P_BINTERNAL: + bi = GETBINTERNAL(r, 0); + nbytes = NBINTERNAL(bi->ksize); + h->linp[1] = h->upper -= nbytes; + dest = (char *)(void *)h + h->upper; + memmove(dest, bi, nbytes); + ((BINTERNAL *)(void *)dest)->pgno = r->pgno; + break; + default: + abort(); + } + + /* There are two keys on the page. */ + h->lower = BTDATAOFF + 2 * sizeof(indx_t); + + /* Unpin the root page, set to btree internal page. */ + h->flags &= ~P_TYPE; + h->flags |= P_BINTERNAL; + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + + return (RET_SUCCESS); +} + +/* + * BT_PSPLIT -- Do the real work of splitting the page. + * + * Parameters: + * t: tree + * h: page to be split + * l: page to put lower half of data + * r: page to put upper half of data + * pskip: pointer to index to leave open + * ilen: insert length + * + * Returns: + * Pointer to page in which to insert. + */ +static PAGE * +bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen) +{ + BINTERNAL *bi; + BLEAF *bl; + CURSOR *c; + RLEAF *rl; + PAGE *rval; + void *src = NULL; /* pacify gcc */ + indx_t full, half, nxt, off, skip, top, used; + uint32_t nbytes; + size_t temp; + int bigkeycnt, isbigkey; + + /* + * Split the data to the left and right pages. Leave the skip index + * open. Additionally, make some effort not to split on an overflow + * key. This makes internal page processing faster and can save + * space as overflow keys used by internal pages are never deleted. + */ + bigkeycnt = 0; + skip = *pskip; + temp = t->bt_psize - BTDATAOFF; + _DBFIT(temp, indx_t); + full = (indx_t)temp; + half = full / 2; + used = 0; + for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) { + if (skip == off) { + _DBFIT(ilen, uint32_t); + nbytes = (uint32_t)ilen; + isbigkey = 0; /* XXX: not really known. */ + } else + switch (h->flags & P_TYPE) { + case P_BINTERNAL: + src = bi = GETBINTERNAL(h, nxt); + nbytes = NBINTERNAL(bi->ksize); + isbigkey = bi->flags & P_BIGKEY; + break; + case P_BLEAF: + src = bl = GETBLEAF(h, nxt); + nbytes = NBLEAF(bl); + isbigkey = bl->flags & P_BIGKEY; + break; + case P_RINTERNAL: + src = GETRINTERNAL(h, nxt); + nbytes = NRINTERNAL; + isbigkey = 0; + break; + case P_RLEAF: + src = rl = GETRLEAF(h, nxt); + nbytes = NRLEAF(rl); + isbigkey = 0; + break; + default: + abort(); + } + + /* + * If the key/data pairs are substantial fractions of the max + * possible size for the page, it's possible to get situations + * where we decide to try and copy too much onto the left page. + * Make sure that doesn't happen. + */ + if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) || + nxt == top - 1) { + --off; + break; + } + + /* Copy the key/data pair, if not the skipped index. */ + if (skip != off) { + ++nxt; + + l->linp[off] = l->upper -= nbytes; + memmove((char *)(void *)l + l->upper, src, nbytes); + } + + temp = nbytes + sizeof(indx_t); + _DBFIT(temp, indx_t); + used += (indx_t)temp; + if (used >= half) { + if (!isbigkey || bigkeycnt == 3) + break; + else + ++bigkeycnt; + } + } + + /* + * Off is the last offset that's valid for the left page. + * Nxt is the first offset to be placed on the right page. + */ + temp = (off + 1) * sizeof(indx_t); + _DBFIT(temp, indx_t); + l->lower += (indx_t)temp; + + /* + * If splitting the page that the cursor was on, the cursor has to be + * adjusted to point to the same record as before the split. If the + * cursor is at or past the skipped slot, the cursor is incremented by + * one. If the cursor is on the right page, it is decremented by the + * number of records split to the left page. + */ + c = &t->bt_cursor; + if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) { + if (c->pg.index >= skip) + ++c->pg.index; + if (c->pg.index < nxt) /* Left page. */ + c->pg.pgno = l->pgno; + else { /* Right page. */ + c->pg.pgno = r->pgno; + c->pg.index -= nxt; + } + } + + /* + * If the skipped index was on the left page, just return that page. + * Otherwise, adjust the skip index to reflect the new position on + * the right page. + */ + if (skip <= off) { + skip = MAX_PAGE_OFFSET; + rval = l; + } else { + rval = r; + *pskip -= nxt; + } + + for (off = 0; nxt < top; ++off) { + if (skip == nxt) { + ++off; + skip = MAX_PAGE_OFFSET; + } + switch (h->flags & P_TYPE) { + case P_BINTERNAL: + src = bi = GETBINTERNAL(h, nxt); + nbytes = NBINTERNAL(bi->ksize); + break; + case P_BLEAF: + src = bl = GETBLEAF(h, nxt); + nbytes = NBLEAF(bl); + break; + case P_RINTERNAL: + src = GETRINTERNAL(h, nxt); + nbytes = NRINTERNAL; + break; + case P_RLEAF: + src = rl = GETRLEAF(h, nxt); + nbytes = NRLEAF(rl); + break; + default: + abort(); + } + ++nxt; + r->linp[off] = r->upper -= nbytes; + memmove((char *)(void *)r + r->upper, src, nbytes); + } + temp = off * sizeof(indx_t); + _DBFIT(temp, indx_t); + r->lower += (indx_t)temp; + + /* If the key is being appended to the page, adjust the index. */ + if (skip == top) + r->lower += sizeof(indx_t); + + return (rval); +} + +/* + * BT_PRESERVE -- Mark a chain of pages as used by an internal node. + * + * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the + * record that references them gets deleted. Chains pointed to by internal + * pages never get deleted. This routine marks a chain as pointed to by an + * internal page. + * + * Parameters: + * t: tree + * pg: page number of first page in the chain. + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +static int +bt_preserve(BTREE *t, pgno_t pg) +{ + PAGE *h; + + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + h->flags |= P_PRESERVE; + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + return (RET_SUCCESS); +} + +/* + * REC_TOTAL -- Return the number of recno entries below a page. + * + * Parameters: + * h: page + * + * Returns: + * The number of recno entries below a page. + * + * XXX + * These values could be set by the bt_psplit routine. The problem is that the + * entry has to be popped off of the stack etc. or the values have to be passed + * all the way back to bt_split/bt_rroot and it's not very clean. + */ +static recno_t +rec_total(PAGE *h) +{ + recno_t recs; + indx_t nxt, top; + + for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt) + recs += GETRINTERNAL(h, nxt)->nrecs; + return (recs); +} diff --git a/lib/libc/db/btree/bt_utils.c b/lib/libc/db/btree/bt_utils.c new file mode 100644 index 000000000..59503c34e --- /dev/null +++ b/lib/libc/db/btree/bt_utils.c @@ -0,0 +1,258 @@ +/* $NetBSD: bt_utils.c,v 1.13 2008/09/10 17:52:35 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: bt_utils.c,v 1.13 2008/09/10 17:52:35 joerg Exp $"); +#endif + +#include + +#include +#include +#include +#include + +#include +#include "btree.h" + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +/* + * __bt_ret -- + * Build return key/data pair. + * + * Parameters: + * t: tree + * e: key/data pair to be returned + * key: user's key structure (NULL if not to be filled in) + * rkey: memory area to hold key + * data: user's data structure (NULL if not to be filled in) + * rdata: memory area to hold data + * copy: always copy the key/data item + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +int +__bt_ret(BTREE *t, EPG *e, DBT *key, DBT *rkey, DBT *data, DBT *rdata, int copy) +{ + BLEAF *bl; + void *p; + + bl = GETBLEAF(e->page, e->index); + + /* + * We must copy big keys/data to make them contigous. Otherwise, + * leave the page pinned and don't copy unless the user specified + * concurrent access. + */ + if (key == NULL) + goto dataonly; + + if (bl->flags & P_BIGKEY) { + if (__ovfl_get(t, bl->bytes, + &key->size, &rkey->data, &rkey->size)) + return (RET_ERROR); + key->data = rkey->data; + } else if (copy || F_ISSET(t, B_DB_LOCK)) { + if (bl->ksize > rkey->size) { + p = (void *)(rkey->data == NULL ? + malloc(bl->ksize) : realloc(rkey->data, bl->ksize)); + if (p == NULL) + return (RET_ERROR); + rkey->data = p; + rkey->size = bl->ksize; + } + memmove(rkey->data, bl->bytes, bl->ksize); + key->size = bl->ksize; + key->data = rkey->data; + } else { + key->size = bl->ksize; + key->data = bl->bytes; + } + +dataonly: + if (data == NULL) + return (RET_SUCCESS); + + if (bl->flags & P_BIGDATA) { + if (__ovfl_get(t, bl->bytes + bl->ksize, + &data->size, &rdata->data, &rdata->size)) + return (RET_ERROR); + data->data = rdata->data; + } else if (copy || F_ISSET(t, B_DB_LOCK)) { + /* Use +1 in case the first record retrieved is 0 length. */ + if (bl->dsize + 1 > rdata->size) { + p = (void *)(rdata->data == NULL ? + malloc(bl->dsize + 1) : + realloc(rdata->data, bl->dsize + 1)); + if (p == NULL) + return (RET_ERROR); + rdata->data = p; + rdata->size = bl->dsize + 1; + } + memmove(rdata->data, bl->bytes + bl->ksize, bl->dsize); + data->size = bl->dsize; + data->data = rdata->data; + } else { + data->size = bl->dsize; + data->data = bl->bytes + bl->ksize; + } + + return (RET_SUCCESS); +} + +/* + * __BT_CMP -- Compare a key to a given record. + * + * Parameters: + * t: tree + * k1: DBT pointer of first arg to comparison + * e: pointer to EPG for comparison + * + * Returns: + * < 0 if k1 is < record + * = 0 if k1 is = record + * > 0 if k1 is > record + */ +int +__bt_cmp(BTREE *t, const DBT *k1, EPG *e) +{ + BINTERNAL *bi; + BLEAF *bl; + DBT k2; + PAGE *h; + void *bigkey; + + /* + * The left-most key on internal pages, at any level of the tree, is + * guaranteed by the following code to be less than any user key. + * This saves us from having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything we've yet seen. + */ + h = e->page; + if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & P_BLEAF)) + return (1); + + bigkey = NULL; + if (h->flags & P_BLEAF) { + bl = GETBLEAF(h, e->index); + if (bl->flags & P_BIGKEY) + bigkey = bl->bytes; + else { + k2.data = bl->bytes; + k2.size = bl->ksize; + } + } else { + bi = GETBINTERNAL(h, e->index); + if (bi->flags & P_BIGKEY) + bigkey = bi->bytes; + else { + k2.data = bi->bytes; + k2.size = bi->ksize; + } + } + + if (bigkey) { + if (__ovfl_get(t, bigkey, + &k2.size, &t->bt_rdata.data, &t->bt_rdata.size)) + return (RET_ERROR); + k2.data = t->bt_rdata.data; + } + return ((*t->bt_cmp)(k1, &k2)); +} + +/* + * __BT_DEFCMP -- Default comparison routine. + * + * Parameters: + * a: DBT #1 + * b: DBT #2 + * + * Returns: + * < 0 if a is < b + * = 0 if a is = b + * > 0 if a is > b + */ +int +__bt_defcmp(const DBT *a, const DBT *b) +{ + size_t len; + uint8_t *p1, *p2; + + /* + * XXX + * If a size_t doesn't fit in an int, this routine can lose. + * What we need is a integral type which is guaranteed to be + * larger than a size_t, and there is no such thing. + */ + len = MIN(a->size, b->size); + for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2) + if (*p1 != *p2) + return ((int)*p1 - (int)*p2); + return ((int)a->size - (int)b->size); +} + +/* + * __BT_DEFPFX -- Default prefix routine. + * + * Parameters: + * a: DBT #1 + * b: DBT #2 + * + * Returns: + * Number of bytes needed to distinguish b from a. + */ +size_t +__bt_defpfx(const DBT *a, const DBT *b) +{ + uint8_t *p1, *p2; + size_t cnt, len; + + cnt = 1; + len = MIN(a->size, b->size); + for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt) + if (*p1 != *p2) + return (cnt); + + /* a->size must be <= b->size, or they wouldn't be in this order. */ + return (a->size < b->size ? a->size + 1 : a->size); +} diff --git a/lib/libc/db/btree/btree.h b/lib/libc/db/btree/btree.h new file mode 100644 index 000000000..b28f082ff --- /dev/null +++ b/lib/libc/db/btree/btree.h @@ -0,0 +1,433 @@ +/* $NetBSD: btree.h,v 1.16 2008/08/26 21:18:38 joerg Exp $ */ + +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)btree.h 8.11 (Berkeley) 8/17/94 + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +/* Macros to set/clear/test flags. */ +#define F_SET(p, f) (p)->flags |= (f) +#define F_CLR(p, f) (p)->flags &= ~(f) +#define F_ISSET(p, f) ((p)->flags & (f)) + +#include + +#define DEFMINKEYPAGE (2) /* Minimum keys per page */ +#define MINCACHE (5) /* Minimum cached pages */ +#define MINPSIZE (512) /* Minimum page size */ + +/* + * Page 0 of a btree file contains a copy of the meta-data. This page is also + * used as an out-of-band page, i.e. page pointers that point to nowhere point + * to page 0. Page 1 is the root of the btree. + */ +#define P_INVALID 0 /* Invalid tree page number. */ +#define P_META 0 /* Tree metadata page number. */ +#define P_ROOT 1 /* Tree root page number. */ + +/* + * There are five page layouts in the btree: btree internal pages (BINTERNAL), + * btree leaf pages (BLEAF), recno internal pages (RINTERNAL), recno leaf pages + * (RLEAF) and overflow pages. All five page types have a page header (PAGE). + * This implementation requires that values within structures NOT be padded. + * (ANSI C permits random padding.) If your compiler pads randomly you'll have + * to do some work to get this package to run. + */ +typedef struct _page { + pgno_t pgno; /* this page's page number */ + pgno_t prevpg; /* left sibling */ + pgno_t nextpg; /* right sibling */ + +#define P_BINTERNAL 0x01 /* btree internal page */ +#define P_BLEAF 0x02 /* leaf page */ +#define P_OVERFLOW 0x04 /* overflow page */ +#define P_RINTERNAL 0x08 /* recno internal page */ +#define P_RLEAF 0x10 /* leaf page */ +#define P_TYPE 0x1f /* type mask */ +#define P_PRESERVE 0x20 /* never delete this chain of pages */ + uint32_t flags; + + indx_t lower; /* lower bound of free space on page */ + indx_t upper; /* upper bound of free space on page */ + indx_t linp[1]; /* indx_t-aligned VAR. LENGTH DATA */ +} PAGE; + +/* First and next index. */ +#define BTDATAOFF \ + (sizeof(pgno_t) + sizeof(pgno_t) + sizeof(pgno_t) + \ + sizeof(uint32_t) + sizeof(indx_t) + sizeof(indx_t)) + +#define _NEXTINDEX(p) (((p)->lower - BTDATAOFF) / sizeof(indx_t)) +#ifdef _DIAGNOSTIC +static __inline indx_t +NEXTINDEX(const PAGE *p) { + size_t x = _NEXTINDEX(p); + _DBFIT(x, indx_t); + return (indx_t)x; +} +#else +#define NEXTINDEX(p) (indx_t)_NEXTINDEX(p) +#endif + +/* + * For pages other than overflow pages, there is an array of offsets into the + * rest of the page immediately following the page header. Each offset is to + * an item which is unique to the type of page. The h_lower offset is just + * past the last filled-in index. The h_upper offset is the first item on the + * page. Offsets are from the beginning of the page. + * + * If an item is too big to store on a single page, a flag is set and the item + * is a { page, size } pair such that the page is the first page of an overflow + * chain with size bytes of item. Overflow pages are simply bytes without any + * external structure. + * + * The page number and size fields in the items are pgno_t-aligned so they can + * be manipulated without copying. (This presumes that 32 bit items can be + * manipulated on this system.) + */ +#define BTLALIGN(n) (((n) + sizeof(pgno_t) - 1) & ~(sizeof(pgno_t) - 1)) +#define NOVFLSIZE (sizeof(pgno_t) + sizeof(uint32_t)) + +/* + * For the btree internal pages, the item is a key. BINTERNALs are {key, pgno} + * pairs, such that the key compares less than or equal to all of the records + * on that page. For a tree without duplicate keys, an internal page with two + * consecutive keys, a and b, will have all records greater than or equal to a + * and less than b stored on the page associated with a. Duplicate keys are + * somewhat special and can cause duplicate internal and leaf page records and + * some minor modifications of the above rule. + */ +typedef struct _binternal { + uint32_t ksize; /* key size */ + pgno_t pgno; /* page number stored on */ +#define P_BIGDATA 0x01 /* overflow data */ +#define P_BIGKEY 0x02 /* overflow key */ + uint8_t flags; + char bytes[1]; /* data */ +} BINTERNAL; + +/* Get the page's BINTERNAL structure at index indx. */ +#define GETBINTERNAL(pg, indx) \ + ((BINTERNAL *)(void *)((char *)(void *)(pg) + (pg)->linp[indx])) + +/* Get the number of bytes in the entry. */ +#define _NBINTERNAL(len) \ + BTLALIGN(sizeof(uint32_t) + sizeof(pgno_t) + sizeof(uint8_t) + (len)) +#ifdef _DIAGNOSTIC +static __inline uint32_t +NBINTERNAL(uint32_t len) { + size_t x = _NBINTERNAL(len); + _DBFIT(x, uint32_t); + return (uint32_t)x; +} +#else +#define NBINTERNAL(len) (uint32_t)_NBINTERNAL(len) +#endif + +/* Copy a BINTERNAL entry to the page. */ +#define WR_BINTERNAL(p, size, pgno, flags) do { \ + _DBFIT(size, uint32_t); \ + *(uint32_t *)(void *)p = (uint32_t)size; \ + p += sizeof(uint32_t); \ + *(pgno_t *)(void *)p = pgno; \ + p += sizeof(pgno_t); \ + *(uint8_t *)(void *)p = flags; \ + p += sizeof(uint8_t); \ +} while (/*CONSTCOND*/0) + +/* + * For the recno internal pages, the item is a page number with the number of + * keys found on that page and below. + */ +typedef struct _rinternal { + recno_t nrecs; /* number of records */ + pgno_t pgno; /* page number stored below */ +} RINTERNAL; + +/* Get the page's RINTERNAL structure at index indx. */ +#define GETRINTERNAL(pg, indx) \ + ((RINTERNAL *)(void *)((char *)(void *)(pg) + (pg)->linp[indx])) + +/* Get the number of bytes in the entry. */ +#define NRINTERNAL \ + BTLALIGN(sizeof(recno_t) + sizeof(pgno_t)) + +/* Copy a RINTERAL entry to the page. */ +#define WR_RINTERNAL(p, nrecs, pgno) do { \ + *(recno_t *)(void *)p = nrecs; \ + p += sizeof(recno_t); \ + *(pgno_t *)(void *)p = pgno; \ +} while (/*CONSTCOND*/0) + +/* For the btree leaf pages, the item is a key and data pair. */ +typedef struct _bleaf { + uint32_t ksize; /* size of key */ + uint32_t dsize; /* size of data */ + uint8_t flags; /* P_BIGDATA, P_BIGKEY */ + char bytes[1]; /* data */ +} BLEAF; + +/* Get the page's BLEAF structure at index indx. */ +#define GETBLEAF(pg, indx) \ + ((BLEAF *)(void *)((char *)(void *)(pg) + (pg)->linp[indx])) + + +/* Get the number of bytes in the user's key/data pair. */ +#define _NBLEAFDBT(ksize, dsize) \ + BTLALIGN(sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint8_t) + \ + (ksize) + (dsize)) +#ifdef _DIAGNOSTIC +static __inline uint32_t +NBLEAFDBT(size_t k, size_t d) { + size_t x = _NBLEAFDBT(k, d); + _DBFIT(x, uint32_t); + return (uint32_t)x; +} +#else +#define NBLEAFDBT(p, q) (uint32_t)_NBLEAFDBT(p, q) +#endif + +/* Get the number of bytes in the entry. */ +#define NBLEAF(p) NBLEAFDBT((p)->ksize, (p)->dsize) + +/* Copy a BLEAF entry to the page. */ +#define WR_BLEAF(p, key, data, flags) do { \ + _DBFIT(key->size, uint32_t); \ + *(uint32_t *)(void *)p = (uint32_t)key->size; \ + p += sizeof(uint32_t); \ + _DBFIT(data->size, uint32_t); \ + *(uint32_t *)(void *)p = (uint32_t)data->size; \ + p += sizeof(uint32_t); \ + *(uint8_t *)(void *)p = flags; \ + p += sizeof(uint8_t); \ + (void)memmove(p, key->data, key->size); \ + p += key->size; \ + (void)memmove(p, data->data, data->size); \ +} while (/*CONSTCOND*/0) + +/* For the recno leaf pages, the item is a data entry. */ +typedef struct _rleaf { + uint32_t dsize; /* size of data */ + uint8_t flags; /* P_BIGDATA */ + char bytes[1]; +} RLEAF; + +/* Get the page's RLEAF structure at index indx. */ +#define GETRLEAF(pg, indx) \ + ((RLEAF *)(void *)((char *)(void *)(pg) + (pg)->linp[indx])) + +#define _NRLEAFDBT(dsize) \ + BTLALIGN(sizeof(uint32_t) + sizeof(uint8_t) + (dsize)) + +#ifdef _DIAGNOSTIC +static __inline uint32_t +NRLEAFDBT(size_t d) { + size_t x = _NRLEAFDBT(d); + _DBFIT(x, uint32_t); + return (uint32_t)x; +} +#else +#define NRLEAFDBT(d) (uint32_t)_NRLEAFDBT(d) +#endif + +/* Get the number of bytes in the entry. */ +#define NRLEAF(p) NRLEAFDBT((p)->dsize) + +/* Get the number of bytes from the user's data. */ + +/* Copy a RLEAF entry to the page. */ +#define WR_RLEAF(p, data, flags) do { \ + _DBFIT(data->size, uint32_t); \ + *(uint32_t *)(void *)p = (uint32_t)data->size; \ + p += sizeof(uint32_t); \ + *(uint8_t *)(void *)p = flags; \ + p += sizeof(uint8_t); \ + memmove(p, data->data, data->size); \ +} while (/*CONSTCOND*/0) + +/* + * A record in the tree is either a pointer to a page and an index in the page + * or a page number and an index. These structures are used as a cursor, stack + * entry and search returns as well as to pass records to other routines. + * + * One comment about searches. Internal page searches must find the largest + * record less than key in the tree so that descents work. Leaf page searches + * must find the smallest record greater than key so that the returned index + * is the record's correct position for insertion. + */ +typedef struct _epgno { + pgno_t pgno; /* the page number */ + indx_t index; /* the index on the page */ +} EPGNO; + +typedef struct _epg { + PAGE *page; /* the (pinned) page */ + indx_t index; /* the index on the page */ +} EPG; + +/* + * About cursors. The cursor (and the page that contained the key/data pair + * that it referenced) can be deleted, which makes things a bit tricky. If + * there are no duplicates of the cursor key in the tree (i.e. B_NODUPS is set + * or there simply aren't any duplicates of the key) we copy the key that it + * referenced when it's deleted, and reacquire a new cursor key if the cursor + * is used again. If there are duplicates keys, we move to the next/previous + * key, and set a flag so that we know what happened. NOTE: if duplicate (to + * the cursor) keys are added to the tree during this process, it is undefined + * if they will be returned or not in a cursor scan. + * + * The flags determine the possible states of the cursor: + * + * CURS_INIT The cursor references *something*. + * CURS_ACQUIRE The cursor was deleted, and a key has been saved so that + * we can reacquire the right position in the tree. + * CURS_AFTER, CURS_BEFORE + * The cursor was deleted, and now references a key/data pair + * that has not yet been returned, either before or after the + * deleted key/data pair. + * XXX + * This structure is broken out so that we can eventually offer multiple + * cursors as part of the DB interface. + */ +typedef struct _cursor { + EPGNO pg; /* B: Saved tree reference. */ + DBT key; /* B: Saved key, or key.data == NULL. */ + recno_t rcursor; /* R: recno cursor (1-based) */ + +#define CURS_ACQUIRE 0x01 /* B: Cursor needs to be reacquired. */ +#define CURS_AFTER 0x02 /* B: Unreturned cursor after key. */ +#define CURS_BEFORE 0x04 /* B: Unreturned cursor before key. */ +#define CURS_INIT 0x08 /* RB: Cursor initialized. */ + uint8_t flags; +} CURSOR; + +/* + * The metadata of the tree. The nrecs field is used only by the RECNO code. + * This is because the btree doesn't really need it and it requires that every + * put or delete call modify the metadata. + */ +typedef struct _btmeta { + uint32_t magic; /* magic number */ + uint32_t version; /* version */ + uint32_t psize; /* page size */ + uint32_t free; /* page number of first free page */ + uint32_t nrecs; /* R: number of records */ + +#define SAVEMETA (B_NODUPS | R_RECNO) + uint32_t flags; /* bt_flags & SAVEMETA */ +} BTMETA; + +/* The in-memory btree/recno data structure. */ +typedef struct _btree { + MPOOL *bt_mp; /* memory pool cookie */ + + DB *bt_dbp; /* pointer to enclosing DB */ + + EPG bt_cur; /* current (pinned) page */ + PAGE *bt_pinned; /* page pinned across calls */ + + CURSOR bt_cursor; /* cursor */ + +#define BT_PUSH(t, p, i) { \ + t->bt_sp->pgno = p; \ + t->bt_sp->index = i; \ + ++t->bt_sp; \ +} +#define BT_POP(t) (t->bt_sp == t->bt_stack ? NULL : --t->bt_sp) +#define BT_CLR(t) (t->bt_sp = t->bt_stack) + EPGNO bt_stack[50]; /* stack of parent pages */ + EPGNO *bt_sp; /* current stack pointer */ + + DBT bt_rkey; /* returned key */ + DBT bt_rdata; /* returned data */ + + int bt_fd; /* tree file descriptor */ + + pgno_t bt_free; /* next free page */ + uint32_t bt_psize; /* page size */ + indx_t bt_ovflsize; /* cut-off for key/data overflow */ + int bt_lorder; /* byte order */ + /* sorted order */ + enum { NOT, BACK, FORWARD } bt_order; + EPGNO bt_last; /* last insert */ + + /* B: key comparison function */ + int (*bt_cmp)(const DBT *, const DBT *); + /* B: prefix comparison function */ + size_t (*bt_pfx)(const DBT *, const DBT *); + /* R: recno input function */ + int (*bt_irec)(struct _btree *, recno_t); + + FILE *bt_rfp; /* R: record FILE pointer */ + int bt_rfd; /* R: record file descriptor */ + + caddr_t bt_cmap; /* R: current point in mapped space */ + caddr_t bt_smap; /* R: start of mapped space */ + caddr_t bt_emap; /* R: end of mapped space */ + size_t bt_msize; /* R: size of mapped region. */ + + recno_t bt_nrecs; /* R: number of records */ + size_t bt_reclen; /* R: fixed record length */ + uint8_t bt_bval; /* R: delimiting byte/pad character */ + +/* + * NB: + * B_NODUPS and R_RECNO are stored on disk, and may not be changed. + */ +#define B_INMEM 0x00001 /* in-memory tree */ +#define B_METADIRTY 0x00002 /* need to write metadata */ +#define B_MODIFIED 0x00004 /* tree modified */ +#define B_NEEDSWAP 0x00008 /* if byte order requires swapping */ +#define B_RDONLY 0x00010 /* read-only tree */ + +#define B_NODUPS 0x00020 /* no duplicate keys permitted */ +#define R_RECNO 0x00080 /* record oriented tree */ + +#define R_CLOSEFP 0x00040 /* opened a file pointer */ +#define R_EOF 0x00100 /* end of input file reached. */ +#define R_FIXLEN 0x00200 /* fixed length records */ +#define R_MEMMAPPED 0x00400 /* memory mapped file. */ +#define R_INMEM 0x00800 /* in-memory file */ +#define R_MODIFIED 0x01000 /* modified file */ +#define R_RDONLY 0x02000 /* read-only file */ + +#define B_DB_LOCK 0x04000 /* DB_LOCK specified. */ +#define B_DB_SHMEM 0x08000 /* DB_SHMEM specified. */ +#define B_DB_TXN 0x10000 /* DB_TXN specified. */ + uint32_t flags; +} BTREE; + +#include "extern.h" diff --git a/lib/libc/db/btree/extern.h b/lib/libc/db/btree/extern.h new file mode 100644 index 000000000..665f852a5 --- /dev/null +++ b/lib/libc/db/btree/extern.h @@ -0,0 +1,72 @@ +/* $NetBSD: extern.h,v 1.12 2008/09/26 11:41:06 tsutsui Exp $ */ + +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)extern.h 8.10 (Berkeley) 7/20/94 + */ + +#ifndef _BTREE_EXTERN_H_ +#define _BTREE_EXTERN_H_ +int __bt_close(DB *); +int __bt_cmp(BTREE *, const DBT *, EPG *); +int __bt_crsrdel(BTREE *, EPGNO *); +int __bt_defcmp(const DBT *, const DBT *); +size_t __bt_defpfx(const DBT *, const DBT *); +int __bt_delete(const DB *, const DBT *, unsigned int); +int __bt_dleaf(BTREE *, const DBT *, PAGE *, unsigned int); +int __bt_fd(const DB *); +int __bt_free(BTREE *, PAGE *); +int __bt_get(const DB *, const DBT *, DBT *, unsigned int); +PAGE *__bt_new(BTREE *, pgno_t *); +void __bt_pgin(void *, pgno_t, void *); +void __bt_pgout(void *, pgno_t, void *); +int __bt_push(BTREE *, pgno_t, int); +int __bt_put(const DB *dbp, DBT *, const DBT *, unsigned int); +int __bt_ret(BTREE *, EPG *, DBT *, DBT *, DBT *, DBT *, int); +EPG *__bt_search(BTREE *, const DBT *, int *); +int __bt_seq(const DB *, DBT *, DBT *, unsigned int); +void __bt_setcur(BTREE *, pgno_t, unsigned int); +int __bt_split(BTREE *, PAGE *, + const DBT *, const DBT *, int, size_t, uint32_t); +int __bt_sync(const DB *, unsigned int); + +int __ovfl_delete(BTREE *, void *); +int __ovfl_get(BTREE *, void *, size_t *, void **, size_t *); +int __ovfl_put(BTREE *, const DBT *, pgno_t *); + +#ifdef DEBUG +void __bt_dmpage(PAGE *); +void __bt_dnpage(DB *, pgno_t); +void __bt_dpage(PAGE *); +void __bt_dump(DB *); +#endif +#ifdef STATISTICS +void __bt_stat(DB *); +#endif +#endif /* _BTREE_EXTERN_H_ */ diff --git a/lib/libc/db/changelog b/lib/libc/db/changelog new file mode 100644 index 000000000..f820e56b5 --- /dev/null +++ b/lib/libc/db/changelog @@ -0,0 +1,105 @@ +# $NetBSD: changelog,v 1.2 1996/05/03 21:20:56 cgd Exp $ + +1.84 -> 1.85 + recno: #ifdef out use of mmap, it's not portable enough. + +1.83 -> 1.84 Thu Aug 18 15:46:07 EDT 1994 + recno: Rework fixed-length records so that closing and reopening + the file now works. Pad short records on input. Never do + signed comparison in recno input reading functions. + +1.82 -> 1.83 Tue Jul 26 15:33:44 EDT 1994 + btree: Rework cursor deletion code yet again; bugs with + deleting empty pages that only contained the cursor + record. + +1.81 -> 1.82 Sat Jul 16 11:01:50 EDT 1994 + btree: Fix bugs introduced by new cursor/deletion code. + Replace return kbuf/dbuf with real DBT's. + +1.80 -> 1.81 + btree: Fix bugs introduced by new cursor/deletion code. + all: Add #defines for Purify. + +1.79 -> 1.80 Wed Jul 13 22:41:54 EDT 1994 + btree Change deletion to coalesce empty pages. This is a major + change, cursors and duplicate pages all had to be reworked. + Return to a fixed stack. + recno: Affected by cursor changes. New cursor structures should + permit multiple cursors in the future. + +1.78 -> 1.79 Mon Jun 20 17:36:47 EDT 1994 + all: Minor cleanups of 1.78 for porting reasons; only + major change was inlining check of NULL pointer + so that __fix_realloc goes away. + +1.77 -> 1.78 Thu Jun 16 19:06:43 EDT 1994 + all: Move "standard" size typedef's into db.h. + +1.76 -> 1.77 Thu Jun 16 16:48:38 EDT 1994 + hash: Delete __init_ routine, has special meaning to OSF 2.0. + +1.74 -> 1.76 + all: Finish up the port to the Alpha. + +1.73 -> 1.74 + recno: Don't put the record if rec_search fails, in rec_rdelete. + Create fixed-length intermediate records past "end" of DB + correctly. + Realloc bug when reading in fixed records. + all: First cut at port to Alpha (64-bit architecture) using + 4.4BSD basic integral types typedef's. + Cast allocation pointers to shut up old compilers. + Rework PORT directory into OS/machine directories. + +1.72 -> 1.73 + btree: If enough duplicate records were inserted and then deleted + that internal pages had references to empty pages of the + duplicate keys, the search function ended up on the wrong + page. + +1.7 -> 1.72 12 Oct 1993 + hash: Support NET/2 hash formats. + +1.7 -> 1.71 16 Sep 1993 + btree/recno: + Fix bug in internal search routines that caused + return of invalid pointers. + +1.6 -> 1.7 07 Sep 1993 + hash: Fixed big key overflow bugs. + test: Portability hacks, rewrite test script, Makefile. + btree/recno: + Stop copying non-overflow key/data pairs. + PORT: Break PORT directory up into per architecture/OS + subdirectories. + +1.5 -> 1.6 06 Jun 1993 + hash: In PAIRFITS, the first comparison should look at (P)[2]. + The hash_realloc function was walking off the end of memory. + The overflow page number was wrong when bumping splitpoint. + +1.4 -> 1.5 23 May 1993 + hash: Set hash default fill factor dynamically. + recno: Fixed bug in sorted page splits. + Add page size parameter support. + Allow recno to specify the name of the underlying btree; + used for vi recovery. + btree/recno: + Support 64K pages. + btree/hash/recno: + Provide access to an underlying file descriptor. + Change sync routines to take a flag argument, recno + uses this to sync out the underlying btree. + +1.3 -> 1.4 10 May 1993 + recno: Delete the R_CURSORLOG flag from the recno interface. + Zero-length record fix for non-mmap reads. + Try and make SIZE_T_MAX test in open portable. + +1.2 -> 1.3 01 May 1993 + btree: Ignore user byte-order setting when reading already + existing database. Fixes to byte-order conversions. + +1.1 -> 1.2 15 Apr 1993 + No bug fixes, only compatibility hacks. diff --git a/lib/libc/db/db/Makefile.inc b/lib/libc/db/db/Makefile.inc new file mode 100644 index 000000000..9755e8f20 --- /dev/null +++ b/lib/libc/db/db/Makefile.inc @@ -0,0 +1,6 @@ +# $NetBSD: Makefile.inc,v 1.4 1995/02/27 13:21:22 cgd Exp $ +# @(#)Makefile.inc 8.1 (Berkeley) 6/4/93 + +.PATH: ${.CURDIR}/db/db + +SRCS+= db.c diff --git a/lib/libc/db/db/db.c b/lib/libc/db/db/db.c new file mode 100644 index 000000000..9a30db32a --- /dev/null +++ b/lib/libc/db/db/db.c @@ -0,0 +1,114 @@ +/* $NetBSD: db.c,v 1.16 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: db.c,v 1.16 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include +#include + +#include +static int __dberr(void); + +#ifdef __weak_alias +__weak_alias(dbopen,_dbopen) +#endif + +DB * +dbopen(const char *fname, int flags, mode_t mode, DBTYPE type, + const void *openinfo) +{ + +#ifndef O_EXLOCK +#define O_EXLOCK 0 +#endif + +#ifndef O_SHLOCK +#define O_SHLOCK 0 +#endif + +#define DB_FLAGS (DB_LOCK | DB_SHMEM | DB_TXN) +#define USE_OPEN_FLAGS \ + (O_CREAT | O_EXCL | O_EXLOCK | O_NONBLOCK | O_RDONLY | \ + O_RDWR | O_SHLOCK | O_TRUNC) + + if ((flags & ~(USE_OPEN_FLAGS | DB_FLAGS)) == 0) + switch (type) { + case DB_BTREE: + return (__bt_open(fname, flags & USE_OPEN_FLAGS, + mode, openinfo, (int)(flags & DB_FLAGS))); + case DB_HASH: + return (__hash_open(fname, flags & USE_OPEN_FLAGS, + mode, openinfo, (int)(flags & DB_FLAGS))); + case DB_RECNO: + return (__rec_open(fname, flags & USE_OPEN_FLAGS, + mode, openinfo, (int)(flags & DB_FLAGS))); + } + errno = EINVAL; + return (NULL); +} + +static int +__dberr(void) +{ + return (RET_ERROR); +} + +/* + * __DBPANIC -- Stop. + * + * Parameters: + * dbp: pointer to the DB structure. + */ +void +__dbpanic(DB *dbp) +{ + /* The only thing that can succeed is a close. */ + dbp->del = (int (*)(const struct __db *, const DBT*, u_int))__dberr; + dbp->fd = (int (*)(const struct __db *))__dberr; + dbp->get = (int (*)(const struct __db *, const DBT*, DBT *, u_int))__dberr; + dbp->put = (int (*)(const struct __db *, DBT *, const DBT *, u_int))__dberr; + dbp->seq = (int (*)(const struct __db *, DBT *, DBT *, u_int))__dberr; + dbp->sync = (int (*)(const struct __db *, u_int))__dberr; +} diff --git a/lib/libc/db/db2netbsd b/lib/libc/db/db2netbsd new file mode 100755 index 000000000..d5abedaa9 --- /dev/null +++ b/lib/libc/db/db2netbsd @@ -0,0 +1,31 @@ +#!/bin/sh +# $NetBSD: db2netbsd,v 1.2 1999/02/16 18:01:37 kleink Exp $ + +# This version transforms a Berkeley DB distribution into something +# which can be 'cvs import'ed into the NetBSD source repository. +# It is to be run in the untarred Berkeley DB distribution directory +# (e.g. the "db.1.85" directory created by tar xvf), and sets up +# the destination tree in place. + +version=`basename $PWD | sed -e 's/db\.//'` +releasetag=`basename $PWD | sed -e 's/\./-/g'` + +CLEANFILES="PORT docs hash/search.h test/btree.tests test/hash.tests" + +# clean up pieces that we never import +/bin/rm -rf $CLEANFILES +find . -type l -o -name tags | xargs /bin/rm -f + +# The include files are already in place + +# Put the regression tests in the right place +mkdir -p regress/lib/libc +mv test regress/lib/libc/db + +# Put the libc pieces in the right place. +mkdir -p lib/libc/db +mv Makefile.inc README btree changelog db hash man mpool recno lib/libc/db + +echo "import with:" +echo "cvs import -m \"Import of Berkeley DB version $version\" \ +src CSRG $releasetag" diff --git a/lib/libc/db/hash/Makefile.inc b/lib/libc/db/hash/Makefile.inc new file mode 100644 index 000000000..d65b3106d --- /dev/null +++ b/lib/libc/db/hash/Makefile.inc @@ -0,0 +1,7 @@ +# $NetBSD: Makefile.inc,v 1.9 2005/09/13 01:44:09 christos Exp $ +# @(#)Makefile.inc 8.1 (Berkeley) 6/4/93 + +.PATH: ${.CURDIR}/db/hash + +SRCS+= hash.c hash_bigkey.c hash_buf.c hash_func.c hash_log2.c \ + hash_page.c ndbmdatum.c ndbm.c diff --git a/lib/libc/db/hash/README b/lib/libc/db/hash/README new file mode 100644 index 000000000..02e16556b --- /dev/null +++ b/lib/libc/db/hash/README @@ -0,0 +1,69 @@ +# $NetBSD: README,v 1.5 1999/02/16 17:59:18 kleink Exp $ +# @(#)README 8.1 (Berkeley) 6/4/93 + +This package implements a superset of the hsearch and dbm/ndbm libraries. + +Test Programs: + All test programs which need key/data pairs expect them entered + with key and data on separate lines + + tcreat3.c + Takes + bucketsize (bsize), + fill factor (ffactor), and + initial number of elements (nelem). + Creates a hash table named hashtest containing the + keys/data pairs entered from standard in. + thash4.c + Takes + bucketsize (bsize), + fill factor (ffactor), + initial number of elements (nelem) + bytes of cache (ncached), and + file from which to read data (fname) + Creates a table from the key/data pairs on standard in and + then does a read of each key/data in fname + tdel.c + Takes + bucketsize (bsize), and + fill factor (ffactor). + file from which to read data (fname) + Reads each key/data pair from fname and deletes the + key from the hash table hashtest + tseq.c + Reads the key/data pairs in the file hashtest and writes them + to standard out. + tread2.c + Takes + butes of cache (ncached). + Reads key/data pairs from standard in and looks them up + in the file hashtest. + tverify.c + Reads key/data pairs from standard in, looks them up + in the file hashtest, and verifies that the data is + correct. + +NOTES: + +The man page ../man/db.3 explains the interface to the hashing system. +The file hash.ps is a postscript copy of a paper explaining +the history, implementation, and performance of the hash package. + +"bugs" or idiosyncracies + +If you have a lot of overflows, it is possible to run out of overflow +pages. Currently, this will cause a message to be printed on stderr. +Eventually, this will be indicated by a return error code. + +If you are using the ndbm interface and exit without flushing or closing the +file, you may lose updates since the package buffers all writes. Also, +the db interface only creates a single database file. To avoid overwriting +the user's original file, the suffix ".db" is appended to the file name +passed to dbm_open. Additionally, if your code "knows" about the historic +.dir and .pag files, it will break. + +There is a fundamental difference between this package and the old hsearch. +Hsearch requires the user to maintain the keys and data in the application's +allocated memory while hash takes care of all storage management. The down +side is that the byte strings passed in the ENTRY structure must be null +terminated (both the keys and the data). diff --git a/lib/libc/db/hash/extern.h b/lib/libc/db/hash/extern.h new file mode 100644 index 000000000..455bf51d8 --- /dev/null +++ b/lib/libc/db/hash/extern.h @@ -0,0 +1,63 @@ +/* $NetBSD: extern.h,v 1.9 2008/08/26 21:18:38 joerg Exp $ */ + +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)extern.h 8.4 (Berkeley) 6/16/94 + */ + +BUFHEAD *__add_ovflpage(HTAB *, BUFHEAD *); +int __addel(HTAB *, BUFHEAD *, const DBT *, const DBT *); +int __big_delete(HTAB *, BUFHEAD *); +int __big_insert(HTAB *, BUFHEAD *, const DBT *, const DBT *); +int __big_keydata(HTAB *, BUFHEAD *, DBT *, DBT *, int); +int __big_return(HTAB *, BUFHEAD *, int, DBT *, int); +int __big_split(HTAB *, BUFHEAD *, BUFHEAD *, BUFHEAD *, + int, uint32_t, SPLIT_RETURN *); +int __buf_free(HTAB *, int, int); +void __buf_init(HTAB *, u_int); +uint32_t __call_hash(HTAB *, char *, int); +int __delpair(HTAB *, BUFHEAD *, int); +int __expand_table(HTAB *); +int __find_bigpair(HTAB *, BUFHEAD *, int, char *, int); +uint16_t __find_last_page(HTAB *, BUFHEAD **); +void __free_ovflpage(HTAB *, BUFHEAD *); +BUFHEAD *__get_buf(HTAB *, uint32_t, BUFHEAD *, int); +int __get_page(HTAB *, char *, uint32_t, int, int, int); +int __ibitmap(HTAB *, int, int, int); +uint32_t __log2(uint32_t); +int __put_page(HTAB *, char *, uint32_t, int, int); +void __reclaim_buf(HTAB *, BUFHEAD *); +int __split_page(HTAB *, uint32_t, uint32_t); + +/* Default hash routine. */ +extern uint32_t (*__default_hash)(const void *, size_t); + +#ifdef HASH_STATISTICS +extern int hash_accesses, hash_collisions, hash_expansions, hash_overflows; +#endif diff --git a/lib/libc/db/hash/hash.c b/lib/libc/db/hash/hash.c new file mode 100644 index 000000000..3338885ea --- /dev/null +++ b/lib/libc/db/hash/hash.c @@ -0,0 +1,999 @@ +/* $NetBSD: hash.c,v 1.31 2009/02/12 06:35:54 lukem Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: hash.c,v 1.31 2009/02/12 06:35:54 lukem Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "hash.h" +#include "page.h" +#include "extern.h" + +#ifndef LITTLE_ENDIAN +# define LITTLE_ENDIAN 1234 +#endif + +#ifndef BIG_ENDIAN +# define BIG_ENDIAN 4321 +#endif + +#ifndef BYTE_ORDER +#define BYTE_ORDER LITTLE_ENDIAN +#endif + +#ifndef _DIAGASSERT +#define _DIAGASSERT +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +static int alloc_segs(HTAB *, int); +static int flush_meta(HTAB *); +static int hash_access(HTAB *, ACTION, DBT *, DBT *); +static int hash_close(DB *); +static int hash_delete(const DB *, const DBT *, unsigned int); +static int hash_fd(const DB *); +static int hash_get(const DB *, const DBT *, DBT *, unsigned int); +static int hash_put(const DB *, DBT *, const DBT *, unsigned int); +static void *hash_realloc(SEGMENT **, size_t, size_t); +static int hash_seq(const DB *, DBT *, DBT *, unsigned int); +static int hash_sync(const DB *, unsigned int); +static int hdestroy(HTAB *); +static HTAB *init_hash(HTAB *, const char *, const HASHINFO *); +static int init_htab(HTAB *, size_t); +#if BYTE_ORDER == LITTLE_ENDIAN +static void swap_header(HTAB *); +static void swap_header_copy(HASHHDR *, HASHHDR *); +#endif + +/* Fast arithmetic, relying on powers of 2, */ +#define MOD(x, y) ((x) & ((y) - 1)) + +#define RETURN_ERROR(ERR, LOC) { save_errno = ERR; goto LOC; } + +/* Return values */ +#define SUCCESS (0) +#define ERROR (-1) +#define ABNORMAL (1) + +#ifdef HASH_STATISTICS +int hash_accesses, hash_collisions, hash_expansions, hash_overflows; +#endif + +/************************** INTERFACE ROUTINES ***************************/ +/* OPEN/CLOSE */ + +/* ARGSUSED */ +DB * +__hash_open(const char *file, int flags, mode_t mode, const HASHINFO *info, + int dflags) +{ + HTAB *hashp; + struct stat statbuf; + DB *dbp; + int bpages, new_table, nsegs, save_errno; + ssize_t hdrsize; + + if ((flags & O_ACCMODE) == O_WRONLY) { + errno = EINVAL; + return (NULL); + } + + if (!(hashp = calloc(1, sizeof(HTAB)))) + return (NULL); + hashp->fp = -1; + + /* + * Even if user wants write only, we need to be able to read + * the actual file, so we need to open it read/write. But, the + * field in the hashp structure needs to be accurate so that + * we can check accesses. + */ + hashp->flags = flags; + + new_table = 0; + if (!file || (flags & O_TRUNC) || + (stat(file, &statbuf) && (errno == ENOENT))) { + if (errno == ENOENT) + errno = 0; /* Just in case someone looks at errno */ + new_table = 1; + } + if (file) { + if ((hashp->fp = open(file, flags, mode)) == -1) + RETURN_ERROR(errno, error0); + if (fcntl(hashp->fp, F_SETFD, FD_CLOEXEC) == -1) + RETURN_ERROR(errno, error1); + if (fstat(hashp->fp, &statbuf) == -1) + RETURN_ERROR(errno, error1); + new_table |= statbuf.st_size == 0; + } + if (new_table) { + if (!(hashp = init_hash(hashp, file, info))) + RETURN_ERROR(errno, error1); + } else { + /* Table already exists */ + if (info && info->hash) + hashp->hash = info->hash; + else + hashp->hash = __default_hash; + + hdrsize = read(hashp->fp, &hashp->hdr, sizeof(HASHHDR)); +#if BYTE_ORDER == LITTLE_ENDIAN + swap_header(hashp); +#endif + if (hdrsize == -1) + RETURN_ERROR(errno, error1); + if (hdrsize != sizeof(HASHHDR)) + RETURN_ERROR(EFTYPE, error1); + /* Verify file type, versions and hash function */ + if (hashp->MAGIC != HASHMAGIC) + RETURN_ERROR(EFTYPE, error1); +#define OLDHASHVERSION 1 + if (hashp->VERSION != HASHVERSION && + hashp->VERSION != OLDHASHVERSION) + RETURN_ERROR(EFTYPE, error1); + if (hashp->hash(CHARKEY, sizeof(CHARKEY)) != + (uint32_t)hashp->H_CHARKEY) + RETURN_ERROR(EFTYPE, error1); + /* + * Figure out how many segments we need. Max_Bucket is the + * maximum bucket number, so the number of buckets is + * max_bucket + 1. + */ + nsegs = (hashp->MAX_BUCKET + 1 + hashp->SGSIZE - 1) / + hashp->SGSIZE; + hashp->nsegs = 0; + if (alloc_segs(hashp, nsegs)) + /* + * If alloc_segs fails, table will have been destroyed + * and errno will have been set. + */ + return (NULL); + /* Read in bitmaps */ + bpages = (hashp->SPARES[hashp->OVFL_POINT] + + (unsigned int)(hashp->BSIZE << BYTE_SHIFT) - 1) >> + (hashp->BSHIFT + BYTE_SHIFT); + + hashp->nmaps = bpages; + (void)memset(&hashp->mapp[0], 0, bpages * sizeof(uint32_t *)); + } + + /* Initialize Buffer Manager */ + if (info && info->cachesize) + __buf_init(hashp, info->cachesize); + else + __buf_init(hashp, DEF_BUFSIZE); + + hashp->new_file = new_table; + hashp->save_file = file && (hashp->flags & O_RDWR); + hashp->cbucket = -1; + if (!(dbp = malloc(sizeof(DB)))) { + save_errno = errno; + hdestroy(hashp); + errno = save_errno; + return (NULL); + } + dbp->internal = hashp; + dbp->close = hash_close; + dbp->del = hash_delete; + dbp->fd = hash_fd; + dbp->get = hash_get; + dbp->put = hash_put; + dbp->seq = hash_seq; + dbp->sync = hash_sync; + dbp->type = DB_HASH; + +#ifdef DEBUG + (void)fprintf(stderr, +"%s\n%s%p\n%s%d\n%s%d\n%s%d\n%s%d\n%s%d\n%s%d\n%s%d\n%s%d\n%s%d\n%s%x\n%s%x\n%s%d\n%s%d\n", + "init_htab:", + "TABLE POINTER ", hashp, + "BUCKET SIZE ", hashp->BSIZE, + "BUCKET SHIFT ", hashp->BSHIFT, + "DIRECTORY SIZE ", hashp->DSIZE, + "SEGMENT SIZE ", hashp->SGSIZE, + "SEGMENT SHIFT ", hashp->SSHIFT, + "FILL FACTOR ", hashp->FFACTOR, + "MAX BUCKET ", hashp->MAX_BUCKET, + "OVFL POINT ", hashp->OVFL_POINT, + "LAST FREED ", hashp->LAST_FREED, + "HIGH MASK ", hashp->HIGH_MASK, + "LOW MASK ", hashp->LOW_MASK, + "NSEGS ", hashp->nsegs, + "NKEYS ", hashp->NKEYS); +#endif +#ifdef HASH_STATISTICS + hash_overflows = hash_accesses = hash_collisions = hash_expansions = 0; +#endif + return (dbp); + +error1: + if (hashp != NULL) + (void)close(hashp->fp); + +error0: + free(hashp); + errno = save_errno; + return (NULL); +} + +static int +hash_close(DB *dbp) +{ + HTAB *hashp; + int retval; + + if (!dbp) + return (ERROR); + + hashp = dbp->internal; + retval = hdestroy(hashp); + free(dbp); + return (retval); +} + +static int +hash_fd(const DB *dbp) +{ + HTAB *hashp; + + if (!dbp) + return (ERROR); + + hashp = dbp->internal; + if (hashp->fp == -1) { + errno = ENOENT; + return (-1); + } + return (hashp->fp); +} + +/************************** LOCAL CREATION ROUTINES **********************/ +static HTAB * +init_hash(HTAB *hashp, const char *file, const HASHINFO *info) +{ + struct stat statbuf; + int nelem; + + nelem = 1; + hashp->NKEYS = 0; + hashp->LORDER = BYTE_ORDER; + hashp->BSIZE = DEF_BUCKET_SIZE; + hashp->BSHIFT = DEF_BUCKET_SHIFT; + hashp->SGSIZE = DEF_SEGSIZE; + hashp->SSHIFT = DEF_SEGSIZE_SHIFT; + hashp->DSIZE = DEF_DIRSIZE; + hashp->FFACTOR = DEF_FFACTOR; + hashp->hash = __default_hash; + memset(hashp->SPARES, 0, sizeof(hashp->SPARES)); + memset(hashp->BITMAPS, 0, sizeof (hashp->BITMAPS)); + + /* Fix bucket size to be optimal for file system */ + if (file != NULL) { + if (stat(file, &statbuf)) + return (NULL); +#ifndef __minix + hashp->BSIZE = MIN(statbuf.st_blksize, MAX_BSIZE); +#else + hashp->BSIZE = MIN(4096, MAX_BSIZE); +#endif + hashp->BSHIFT = __log2((uint32_t)hashp->BSIZE); + } + + if (info) { + if (info->bsize) { + /* Round pagesize up to power of 2 */ + hashp->BSHIFT = __log2(info->bsize); + hashp->BSIZE = 1 << hashp->BSHIFT; + if (hashp->BSIZE > MAX_BSIZE) { + errno = EINVAL; + return (NULL); + } + } + if (info->ffactor) + hashp->FFACTOR = info->ffactor; + if (info->hash) + hashp->hash = info->hash; + if (info->nelem) + nelem = info->nelem; + if (info->lorder) { + if (info->lorder != BIG_ENDIAN && + info->lorder != LITTLE_ENDIAN) { + errno = EINVAL; + return (NULL); + } + hashp->LORDER = info->lorder; + } + } + /* init_htab should destroy the table and set errno if it fails */ + if (init_htab(hashp, (size_t)nelem)) + return (NULL); + else + return (hashp); +} +/* + * This calls alloc_segs which may run out of memory. Alloc_segs will destroy + * the table and set errno, so we just pass the error information along. + * + * Returns 0 on No Error + */ +static int +init_htab(HTAB *hashp, size_t nelem) +{ + int nbuckets; + uint32_t nsegs; + int l2; + + /* + * Divide number of elements by the fill factor and determine a + * desired number of buckets. Allocate space for the next greater + * power of two number of buckets. + */ + nelem = (nelem - 1) / hashp->FFACTOR + 1; + + _DBFIT(nelem, uint32_t); + l2 = __log2(MAX((uint32_t)nelem, 2)); + nbuckets = 1 << l2; + + hashp->SPARES[l2] = l2 + 1; + hashp->SPARES[l2 + 1] = l2 + 1; + hashp->OVFL_POINT = l2; + hashp->LAST_FREED = 2; + + /* First bitmap page is at: splitpoint l2 page offset 1 */ + if (__ibitmap(hashp, (int)OADDR_OF(l2, 1), l2 + 1, 0)) + return (-1); + + hashp->MAX_BUCKET = hashp->LOW_MASK = nbuckets - 1; + hashp->HIGH_MASK = (nbuckets << 1) - 1; + /* LINTED constant in conditional context */ + hashp->HDRPAGES = ((MAX(sizeof(HASHHDR), MINHDRSIZE) - 1) >> + hashp->BSHIFT) + 1; + + nsegs = (nbuckets - 1) / hashp->SGSIZE + 1; + nsegs = 1 << __log2(nsegs); + + if (nsegs > (uint32_t)hashp->DSIZE) + hashp->DSIZE = nsegs; + return (alloc_segs(hashp, (int)nsegs)); +} + +/********************** DESTROY/CLOSE ROUTINES ************************/ + +/* + * Flushes any changes to the file if necessary and destroys the hashp + * structure, freeing all allocated space. + */ +static int +hdestroy(HTAB *hashp) +{ + int i, save_errno; + + save_errno = 0; + +#ifdef HASH_STATISTICS + (void)fprintf(stderr, "hdestroy: accesses %d collisions %d\n", + hash_accesses, hash_collisions); + (void)fprintf(stderr, "hdestroy: expansions %d\n", + hash_expansions); + (void)fprintf(stderr, "hdestroy: overflows %d\n", + hash_overflows); + (void)fprintf(stderr, "keys %d maxp %d segmentcount %d\n", + hashp->NKEYS, hashp->MAX_BUCKET, hashp->nsegs); + + for (i = 0; i < NCACHED; i++) + (void)fprintf(stderr, + "spares[%d] = %d\n", i, hashp->SPARES[i]); +#endif + /* + * Call on buffer manager to free buffers, and if required, + * write them to disk. + */ + if (__buf_free(hashp, 1, hashp->save_file)) + save_errno = errno; + if (hashp->dir) { + free(*hashp->dir); /* Free initial segments */ + /* Free extra segments */ + while (hashp->exsegs--) + free(hashp->dir[--hashp->nsegs]); + free(hashp->dir); + } + if (flush_meta(hashp) && !save_errno) + save_errno = errno; + /* Free Bigmaps */ + for (i = 0; i < hashp->nmaps; i++) + if (hashp->mapp[i]) + free(hashp->mapp[i]); + + if (hashp->fp != -1) + (void)close(hashp->fp); + + free(hashp); + + if (save_errno) { + errno = save_errno; + return (ERROR); + } + return (SUCCESS); +} +/* + * Write modified pages to disk + * + * Returns: + * 0 == OK + * -1 ERROR + */ +static int +hash_sync(const DB *dbp, unsigned int flags) +{ + HTAB *hashp; + + if (flags != 0) { + errno = EINVAL; + return (ERROR); + } + + if (!dbp) + return (ERROR); + + hashp = dbp->internal; + if (!hashp->save_file) + return (0); + if (__buf_free(hashp, 0, 1) || flush_meta(hashp)) + return (ERROR); + hashp->new_file = 0; + return (0); +} + +/* + * Returns: + * 0 == OK + * -1 indicates that errno should be set + */ +static int +flush_meta(HTAB *hashp) +{ + HASHHDR *whdrp; +#if BYTE_ORDER == LITTLE_ENDIAN + HASHHDR whdr; +#endif + int fp, i; + ssize_t wsize; + + if (!hashp->save_file) + return (0); + hashp->MAGIC = HASHMAGIC; + hashp->VERSION = HASHVERSION; + hashp->H_CHARKEY = hashp->hash(CHARKEY, sizeof(CHARKEY)); + + fp = hashp->fp; + whdrp = &hashp->hdr; +#if BYTE_ORDER == LITTLE_ENDIAN + whdrp = &whdr; + swap_header_copy(&hashp->hdr, whdrp); +#endif + if ((wsize = pwrite(fp, whdrp, sizeof(HASHHDR), (off_t)0)) == -1) + return (-1); + else + if (wsize != sizeof(HASHHDR)) { + errno = EFTYPE; + hashp->err = errno; + return (-1); + } + for (i = 0; i < NCACHED; i++) + if (hashp->mapp[i]) + if (__put_page(hashp, (char *)(void *)hashp->mapp[i], + (u_int)hashp->BITMAPS[i], 0, 1)) + return (-1); + return (0); +} + +/*******************************SEARCH ROUTINES *****************************/ +/* + * All the access routines return + * + * Returns: + * 0 on SUCCESS + * 1 to indicate an external ERROR (i.e. key not found, etc) + * -1 to indicate an internal ERROR (i.e. out of memory, etc) + */ +static int +hash_get(const DB *dbp, const DBT *key, DBT *data, unsigned int flag) +{ + HTAB *hashp; + + hashp = dbp->internal; + if (flag) { + hashp->err = errno = EINVAL; + return (ERROR); + } + return (hash_access(hashp, HASH_GET, __UNCONST(key), data)); +} + +static int +hash_put(const DB *dbp, DBT *key, const DBT *data, unsigned int flag) +{ + HTAB *hashp; + + hashp = dbp->internal; + if (flag && flag != R_NOOVERWRITE) { + hashp->err = errno = EINVAL; + return (ERROR); + } + if ((hashp->flags & O_ACCMODE) == O_RDONLY) { + hashp->err = errno = EPERM; + return (ERROR); + } + /* LINTED const castaway */ + return (hash_access(hashp, flag == R_NOOVERWRITE ? + HASH_PUTNEW : HASH_PUT, __UNCONST(key), __UNCONST(data))); +} + +static int +hash_delete(const DB *dbp, const DBT *key, unsigned int flag) +{ + HTAB *hashp; + + hashp = dbp->internal; + if (flag && flag != R_CURSOR) { + hashp->err = errno = EINVAL; + return (ERROR); + } + if ((hashp->flags & O_ACCMODE) == O_RDONLY) { + hashp->err = errno = EPERM; + return (ERROR); + } + return hash_access(hashp, HASH_DELETE, __UNCONST(key), NULL); +} + +/* + * Assume that hashp has been set in wrapper routine. + */ +static int +hash_access(HTAB *hashp, ACTION action, DBT *key, DBT *val) +{ + BUFHEAD *rbufp; + BUFHEAD *bufp, *save_bufp; + uint16_t *bp; + int n, ndx, off; + size_t size; + char *kp; + uint16_t pageno; + +#ifdef HASH_STATISTICS + hash_accesses++; +#endif + + off = hashp->BSIZE; + size = key->size; + kp = (char *)key->data; + rbufp = __get_buf(hashp, __call_hash(hashp, kp, (int)size), NULL, 0); + if (!rbufp) + return (ERROR); + save_bufp = rbufp; + + /* Pin the bucket chain */ + rbufp->flags |= BUF_PIN; + for (bp = (uint16_t *)(void *)rbufp->page, n = *bp++, ndx = 1; ndx < n;) + if (bp[1] >= REAL_KEY) { + /* Real key/data pair */ + if (size == (size_t)(off - *bp) && + memcmp(kp, rbufp->page + *bp, size) == 0) + goto found; + off = bp[1]; +#ifdef HASH_STATISTICS + hash_collisions++; +#endif + bp += 2; + ndx += 2; + } else if (bp[1] == OVFLPAGE) { + rbufp = __get_buf(hashp, (uint32_t)*bp, rbufp, 0); + if (!rbufp) { + save_bufp->flags &= ~BUF_PIN; + return (ERROR); + } + /* FOR LOOP INIT */ + bp = (uint16_t *)(void *)rbufp->page; + n = *bp++; + ndx = 1; + off = hashp->BSIZE; + } else if (bp[1] < REAL_KEY) { + if ((ndx = + __find_bigpair(hashp, rbufp, ndx, kp, (int)size)) > 0) + goto found; + if (ndx == -2) { + bufp = rbufp; + if (!(pageno = + __find_last_page(hashp, &bufp))) { + ndx = 0; + rbufp = bufp; + break; /* FOR */ + } + rbufp = __get_buf(hashp, (uint32_t)pageno, + bufp, 0); + if (!rbufp) { + save_bufp->flags &= ~BUF_PIN; + return (ERROR); + } + /* FOR LOOP INIT */ + bp = (uint16_t *)(void *)rbufp->page; + n = *bp++; + ndx = 1; + off = hashp->BSIZE; + } else { + save_bufp->flags &= ~BUF_PIN; + return (ERROR); + } + } + + /* Not found */ + switch (action) { + case HASH_PUT: + case HASH_PUTNEW: + if (__addel(hashp, rbufp, key, val)) { + save_bufp->flags &= ~BUF_PIN; + return (ERROR); + } else { + save_bufp->flags &= ~BUF_PIN; + return (SUCCESS); + } + case HASH_GET: + case HASH_DELETE: + default: + save_bufp->flags &= ~BUF_PIN; + return (ABNORMAL); + } + +found: + switch (action) { + case HASH_PUTNEW: + save_bufp->flags &= ~BUF_PIN; + return (ABNORMAL); + case HASH_GET: + bp = (uint16_t *)(void *)rbufp->page; + if (bp[ndx + 1] < REAL_KEY) { + if (__big_return(hashp, rbufp, ndx, val, 0)) + return (ERROR); + } else { + val->data = (uint8_t *)rbufp->page + (int)bp[ndx + 1]; + val->size = bp[ndx] - bp[ndx + 1]; + } + break; + case HASH_PUT: + if ((__delpair(hashp, rbufp, ndx)) || + (__addel(hashp, rbufp, key, val))) { + save_bufp->flags &= ~BUF_PIN; + return (ERROR); + } + break; + case HASH_DELETE: + if (__delpair(hashp, rbufp, ndx)) + return (ERROR); + break; + default: + abort(); + } + save_bufp->flags &= ~BUF_PIN; + return (SUCCESS); +} + +static int +hash_seq(const DB *dbp, DBT *key, DBT *data, unsigned int flag) +{ + uint32_t bucket; + BUFHEAD *bufp = NULL; /* XXX: gcc */ + HTAB *hashp; + uint16_t *bp, ndx; + + hashp = dbp->internal; + if (flag && flag != R_FIRST && flag != R_NEXT) { + hashp->err = errno = EINVAL; + return (ERROR); + } +#ifdef HASH_STATISTICS + hash_accesses++; +#endif + if ((hashp->cbucket < 0) || (flag == R_FIRST)) { + hashp->cbucket = 0; + hashp->cndx = 1; + hashp->cpage = NULL; + } + + for (bp = NULL; !bp || !bp[0]; ) { + if (!(bufp = hashp->cpage)) { + for (bucket = hashp->cbucket; + bucket <= (uint32_t)hashp->MAX_BUCKET; + bucket++, hashp->cndx = 1) { + bufp = __get_buf(hashp, bucket, NULL, 0); + if (!bufp) + return (ERROR); + hashp->cpage = bufp; + bp = (uint16_t *)(void *)bufp->page; + if (bp[0]) + break; + } + hashp->cbucket = bucket; + if (hashp->cbucket > hashp->MAX_BUCKET) { + hashp->cbucket = -1; + return (ABNORMAL); + } + } else + bp = (uint16_t *)(void *)hashp->cpage->page; + + _DIAGASSERT(bp != NULL); + _DIAGASSERT(bufp != NULL); + while (bp[hashp->cndx + 1] == OVFLPAGE) { + bufp = hashp->cpage = + __get_buf(hashp, (uint32_t)bp[hashp->cndx], bufp, + 0); + if (!bufp) + return (ERROR); + bp = (uint16_t *)(void *)(bufp->page); + hashp->cndx = 1; + } + if (!bp[0]) { + hashp->cpage = NULL; + ++hashp->cbucket; + } + } + ndx = hashp->cndx; + if (bp[ndx + 1] < REAL_KEY) { + if (__big_keydata(hashp, bufp, key, data, 1)) + return (ERROR); + } else { + if (hashp->cpage == NULL) + return (ERROR); + key->data = (uint8_t *)hashp->cpage->page + bp[ndx]; + key->size = (ndx > 1 ? bp[ndx - 1] : hashp->BSIZE) - bp[ndx]; + data->data = (uint8_t *)hashp->cpage->page + bp[ndx + 1]; + data->size = bp[ndx] - bp[ndx + 1]; + ndx += 2; + if (ndx > bp[0]) { + hashp->cpage = NULL; + hashp->cbucket++; + hashp->cndx = 1; + } else + hashp->cndx = ndx; + } + return (SUCCESS); +} + +/********************************* UTILITIES ************************/ + +/* + * Returns: + * 0 ==> OK + * -1 ==> Error + */ +int +__expand_table(HTAB *hashp) +{ + uint32_t old_bucket, new_bucket; + int new_segnum, spare_ndx; + size_t dirsize; + +#ifdef HASH_STATISTICS + hash_expansions++; +#endif + new_bucket = ++hashp->MAX_BUCKET; + old_bucket = (hashp->MAX_BUCKET & hashp->LOW_MASK); + + new_segnum = new_bucket >> hashp->SSHIFT; + + /* Check if we need a new segment */ + if (new_segnum >= hashp->nsegs) { + /* Check if we need to expand directory */ + if (new_segnum >= hashp->DSIZE) { + /* Reallocate directory */ + dirsize = hashp->DSIZE * sizeof(SEGMENT *); + if (!hash_realloc(&hashp->dir, dirsize, dirsize << 1)) + return (-1); + hashp->DSIZE = dirsize << 1; + } + if ((hashp->dir[new_segnum] = + calloc((size_t)hashp->SGSIZE, sizeof(SEGMENT))) == NULL) + return (-1); + hashp->exsegs++; + hashp->nsegs++; + } + /* + * If the split point is increasing (MAX_BUCKET's log base 2 + * * increases), we need to copy the current contents of the spare + * split bucket to the next bucket. + */ + spare_ndx = __log2((uint32_t)(hashp->MAX_BUCKET + 1)); + if (spare_ndx > hashp->OVFL_POINT) { + hashp->SPARES[spare_ndx] = hashp->SPARES[hashp->OVFL_POINT]; + hashp->OVFL_POINT = spare_ndx; + } + + if (new_bucket > (uint32_t)hashp->HIGH_MASK) { + /* Starting a new doubling */ + hashp->LOW_MASK = hashp->HIGH_MASK; + hashp->HIGH_MASK = new_bucket | hashp->LOW_MASK; + } + /* Relocate records to the new bucket */ + return (__split_page(hashp, old_bucket, new_bucket)); +} + +/* + * If realloc guarantees that the pointer is not destroyed if the realloc + * fails, then this routine can go away. + */ +static void * +hash_realloc(SEGMENT **p_ptr, size_t oldsize, size_t newsize) +{ + void *p; + + if ((p = malloc(newsize)) != NULL) { + memmove(p, *p_ptr, oldsize); + memset((char *)p + oldsize, 0, newsize - oldsize); + free(*p_ptr); + *p_ptr = p; + } + return (p); +} + +uint32_t +__call_hash(HTAB *hashp, char *k, int len) +{ + int n, bucket; + + n = hashp->hash(k, (size_t)len); + bucket = n & hashp->HIGH_MASK; + if (bucket > hashp->MAX_BUCKET) + bucket = bucket & hashp->LOW_MASK; + return (bucket); +} + +/* + * Allocate segment table. On error, destroy the table and set errno. + * + * Returns 0 on success + */ +static int +alloc_segs(HTAB *hashp, int nsegs) +{ + int i; + SEGMENT store; + + int save_errno; + + hashp->dir = calloc((size_t)hashp->DSIZE, sizeof(SEGMENT *)); + if (hashp->dir == NULL) { + save_errno = errno; + (void)hdestroy(hashp); + errno = save_errno; + return (-1); + } + hashp->nsegs = nsegs; + if (nsegs == 0) + return 0; + /* Allocate segments */ + store = calloc((size_t)(nsegs << hashp->SSHIFT), sizeof(SEGMENT)); + if (store == NULL) { + save_errno = errno; + (void)hdestroy(hashp); + errno = save_errno; + return (-1); + } + for (i = 0; i < nsegs; i++) + hashp->dir[i] = &store[i << hashp->SSHIFT]; + return (0); +} + +#if BYTE_ORDER == LITTLE_ENDIAN +/* + * Hashp->hdr needs to be byteswapped. + */ +static void +swap_header_copy(HASHHDR *srcp, HASHHDR *destp) +{ + size_t i; + + P_32_COPY(srcp->magic, destp->magic); + P_32_COPY(srcp->version, destp->version); + P_32_COPY(srcp->lorder, destp->lorder); + P_32_COPY(srcp->bsize, destp->bsize); + P_32_COPY(srcp->bshift, destp->bshift); + P_32_COPY(srcp->dsize, destp->dsize); + P_32_COPY(srcp->ssize, destp->ssize); + P_32_COPY(srcp->sshift, destp->sshift); + P_32_COPY(srcp->ovfl_point, destp->ovfl_point); + P_32_COPY(srcp->last_freed, destp->last_freed); + P_32_COPY(srcp->max_bucket, destp->max_bucket); + P_32_COPY(srcp->high_mask, destp->high_mask); + P_32_COPY(srcp->low_mask, destp->low_mask); + P_32_COPY(srcp->ffactor, destp->ffactor); + P_32_COPY(srcp->nkeys, destp->nkeys); + P_32_COPY(srcp->hdrpages, destp->hdrpages); + P_32_COPY(srcp->h_charkey, destp->h_charkey); + for (i = 0; i < NCACHED; i++) { + P_32_COPY(srcp->spares[i], destp->spares[i]); + P_16_COPY(srcp->bitmaps[i], destp->bitmaps[i]); + } +} + +static void +swap_header(HTAB *hashp) +{ + HASHHDR *hdrp; + size_t i; + + hdrp = &hashp->hdr; + + M_32_SWAP(hdrp->magic); + M_32_SWAP(hdrp->version); + M_32_SWAP(hdrp->lorder); + M_32_SWAP(hdrp->bsize); + M_32_SWAP(hdrp->bshift); + M_32_SWAP(hdrp->dsize); + M_32_SWAP(hdrp->ssize); + M_32_SWAP(hdrp->sshift); + M_32_SWAP(hdrp->ovfl_point); + M_32_SWAP(hdrp->last_freed); + M_32_SWAP(hdrp->max_bucket); + M_32_SWAP(hdrp->high_mask); + M_32_SWAP(hdrp->low_mask); + M_32_SWAP(hdrp->ffactor); + M_32_SWAP(hdrp->nkeys); + M_32_SWAP(hdrp->hdrpages); + M_32_SWAP(hdrp->h_charkey); + for (i = 0; i < NCACHED; i++) { + M_32_SWAP(hdrp->spares[i]); + M_16_SWAP(hdrp->bitmaps[i]); + } +} +#endif diff --git a/lib/libc/db/hash/hash.h b/lib/libc/db/hash/hash.h new file mode 100644 index 000000000..f3c87a86e --- /dev/null +++ b/lib/libc/db/hash/hash.h @@ -0,0 +1,295 @@ +/* $NetBSD: hash.h,v 1.15 2008/08/26 21:18:38 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)hash.h 8.3 (Berkeley) 5/31/94 + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +/* Operations */ +typedef enum { + HASH_GET, HASH_PUT, HASH_PUTNEW, HASH_DELETE, HASH_FIRST, HASH_NEXT +} ACTION; + +/* Buffer Management structures */ +typedef struct _bufhead BUFHEAD; + +struct _bufhead { + BUFHEAD *prev; /* LRU links */ + BUFHEAD *next; /* LRU links */ + BUFHEAD *ovfl; /* Overflow page buffer header */ + uint32_t addr; /* Address of this page */ + char *page; /* Actual page data */ + char flags; +#define BUF_MOD 0x0001 +#define BUF_DISK 0x0002 +#define BUF_BUCKET 0x0004 +#define BUF_PIN 0x0008 +}; + +#define IS_BUCKET(X) ((X) & BUF_BUCKET) + +typedef BUFHEAD **SEGMENT; + +/* Hash Table Information */ +typedef struct hashhdr { /* Disk resident portion */ + int32_t magic; /* Magic NO for hash tables */ + int32_t version; /* Version ID */ + uint32_t lorder; /* Byte Order */ + int32_t bsize; /* Bucket/Page Size */ + int32_t bshift; /* Bucket shift */ + int32_t dsize; /* Directory Size */ + int32_t ssize; /* Segment Size */ + int32_t sshift; /* Segment shift */ + int32_t ovfl_point; /* Where overflow pages are being + * allocated */ + int32_t last_freed; /* Last overflow page freed */ + int32_t max_bucket; /* ID of Maximum bucket in use */ + int32_t high_mask; /* Mask to modulo into entire table */ + int32_t low_mask; /* Mask to modulo into lower half of + * table */ + int32_t ffactor; /* Fill factor */ + int32_t nkeys; /* Number of keys in hash table */ + int32_t hdrpages; /* Size of table header */ + int32_t h_charkey; /* value of hash(CHARKEY) */ +#define NCACHED 32 /* number of bit maps and spare + * points */ + int32_t spares[NCACHED];/* spare pages for overflow */ + uint16_t bitmaps[NCACHED]; /* address of overflow page + * bitmaps */ +} HASHHDR; + +typedef struct htab { /* Memory resident data structure */ + HASHHDR hdr; /* Header */ + int nsegs; /* Number of allocated segments */ + int exsegs; /* Number of extra allocated + * segments */ + uint32_t (*hash)(const void *, size_t); /* Hash function */ + int flags; /* Flag values */ + int fp; /* File pointer */ + char *tmp_buf; /* Temporary Buffer for BIG data */ + char *tmp_key; /* Temporary Buffer for BIG keys */ + BUFHEAD *cpage; /* Current page */ + int cbucket; /* Current bucket */ + int cndx; /* Index of next item on cpage */ + int err; /* Error Number -- for DBM + * compatibility */ + int new_file; /* Indicates if fd is backing store + * or no */ + int save_file; /* Indicates whether we need to flush + * file at + * exit */ + uint32_t *mapp[NCACHED]; /* Pointers to page maps */ + int nmaps; /* Initial number of bitmaps */ + int nbufs; /* Number of buffers left to + * allocate */ + BUFHEAD bufhead; /* Header of buffer lru list */ + SEGMENT *dir; /* Hash Bucket directory */ +} HTAB; + +/* + * Constants + */ +#define MAX_BSIZE 65536 /* 2^16 */ +#define MIN_BUFFERS 6 +#define MINHDRSIZE 512 +#define DEF_BUFSIZE 65536 /* 64 K */ +#define DEF_BUCKET_SIZE 4096 +#define DEF_BUCKET_SHIFT 12 /* log2(BUCKET) */ +#define DEF_SEGSIZE 256 +#define DEF_SEGSIZE_SHIFT 8 /* log2(SEGSIZE) */ +#define DEF_DIRSIZE 256 +#define DEF_FFACTOR 65536 +#define MIN_FFACTOR 4 +#define SPLTMAX 8 +#define CHARKEY "%$sniglet^&" +#define NUMKEY 1038583 +#define BYTE_SHIFT 3 +#define INT_TO_BYTE 2 +#define INT_BYTE_SHIFT 5 +#define ALL_SET ((uint32_t)0xFFFFFFFF) +#define ALL_CLEAR 0 + +#define PTROF(X) ((BUFHEAD *)(void *)((u_long)(X)&~0x3)) +#define ISMOD(X) ((uint32_t)(u_long)(X)&0x1) +#define DOMOD(X) ((X) = (char *)(void *)((u_long)(X)|0x1)) +#define ISDISK(X) ((uint32_t)(u_long)(X)&0x2) +#define DODISK(X) ((X) = (char *)(void *)((u_long)(X)|0x2)) + +#define BITS_PER_MAP 32 + +/* Given the address of the beginning of a big map, clear/set the nth bit */ +#define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP))) +#define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP))) +#define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP))) + +/* Overflow management */ +/* + * Overflow page numbers are allocated per split point. At each doubling of + * the table, we can allocate extra pages. So, an overflow page number has + * the top 5 bits indicate which split point and the lower 11 bits indicate + * which page at that split point is indicated (pages within split points are + * numberered starting with 1). + */ + +#define SPLITSHIFT 11 +#define SPLITMASK 0x7FF +#define SPLITNUM(N) (((uint32_t)(N)) >> SPLITSHIFT) +#define OPAGENUM(N) ((N) & SPLITMASK) +#define OADDR_OF(S,O) ((uint32_t)((uint32_t)(S) << SPLITSHIFT) + (O)) + +#define BUCKET_TO_PAGE(B) \ + (B) + hashp->HDRPAGES + \ + ((B) ? hashp->SPARES[__log2((uint32_t)((B)+1))-1] : 0) +#define OADDR_TO_PAGE(B) \ + BUCKET_TO_PAGE ( (1 << SPLITNUM((B))) -1 ) + OPAGENUM((B)); + +/* + * page.h contains a detailed description of the page format. + * + * Normally, keys and data are accessed from offset tables in the top of + * each page which point to the beginning of the key and data. There are + * four flag values which may be stored in these offset tables which indicate + * the following: + * + * + * OVFLPAGE Rather than a key data pair, this pair contains + * the address of an overflow page. The format of + * the pair is: + * OVERFLOW_PAGE_NUMBER OVFLPAGE + * + * PARTIAL_KEY This must be the first key/data pair on a page + * and implies that page contains only a partial key. + * That is, the key is too big to fit on a single page + * so it starts on this page and continues on the next. + * The format of the page is: + * KEY_OFF PARTIAL_KEY OVFL_PAGENO OVFLPAGE + * + * KEY_OFF -- offset of the beginning of the key + * PARTIAL_KEY -- 1 + * OVFL_PAGENO - page number of the next overflow page + * OVFLPAGE -- 0 + * + * FULL_KEY This must be the first key/data pair on the page. It + * is used in two cases. + * + * Case 1: + * There is a complete key on the page but no data + * (because it wouldn't fit). The next page contains + * the data. + * + * Page format it: + * KEY_OFF FULL_KEY OVFL_PAGENO OVFL_PAGE + * + * KEY_OFF -- offset of the beginning of the key + * FULL_KEY -- 2 + * OVFL_PAGENO - page number of the next overflow page + * OVFLPAGE -- 0 + * + * Case 2: + * This page contains no key, but part of a large + * data field, which is continued on the next page. + * + * Page format it: + * DATA_OFF FULL_KEY OVFL_PAGENO OVFL_PAGE + * + * KEY_OFF -- offset of the beginning of the data on + * this page + * FULL_KEY -- 2 + * OVFL_PAGENO - page number of the next overflow page + * OVFLPAGE -- 0 + * + * FULL_KEY_DATA + * This must be the first key/data pair on the page. + * There are two cases: + * + * Case 1: + * This page contains a key and the beginning of the + * data field, but the data field is continued on the + * next page. + * + * Page format is: + * KEY_OFF FULL_KEY_DATA OVFL_PAGENO DATA_OFF + * + * KEY_OFF -- offset of the beginning of the key + * FULL_KEY_DATA -- 3 + * OVFL_PAGENO - page number of the next overflow page + * DATA_OFF -- offset of the beginning of the data + * + * Case 2: + * This page contains the last page of a big data pair. + * There is no key, only the tail end of the data + * on this page. + * + * Page format is: + * DATA_OFF FULL_KEY_DATA + * + * DATA_OFF -- offset of the beginning of the data on + * this page + * FULL_KEY_DATA -- 3 + * OVFL_PAGENO - page number of the next overflow page + * OVFLPAGE -- 0 + * + * OVFL_PAGENO and OVFLPAGE are optional (they are + * not present if there is no next page). + */ + +#define OVFLPAGE 0 +#define PARTIAL_KEY 1 +#define FULL_KEY 2 +#define FULL_KEY_DATA 3 +#define REAL_KEY 4 + +/* Short hands for accessing structure */ +#define BSIZE hdr.bsize +#define BSHIFT hdr.bshift +#define DSIZE hdr.dsize +#define SGSIZE hdr.ssize +#define SSHIFT hdr.sshift +#define LORDER hdr.lorder +#define OVFL_POINT hdr.ovfl_point +#define LAST_FREED hdr.last_freed +#define MAX_BUCKET hdr.max_bucket +#define FFACTOR hdr.ffactor +#define HIGH_MASK hdr.high_mask +#define LOW_MASK hdr.low_mask +#define NKEYS hdr.nkeys +#define HDRPAGES hdr.hdrpages +#define SPARES hdr.spares +#define BITMAPS hdr.bitmaps +#define VERSION hdr.version +#define MAGIC hdr.magic +#define NEXT_FREE hdr.next_free +#define H_CHARKEY hdr.h_charkey diff --git a/lib/libc/db/hash/hash_bigkey.c b/lib/libc/db/hash/hash_bigkey.c new file mode 100644 index 000000000..be31e3db2 --- /dev/null +++ b/lib/libc/db/hash/hash_bigkey.c @@ -0,0 +1,683 @@ +/* $NetBSD: hash_bigkey.c,v 1.23 2009/02/12 06:33:13 lukem Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: hash_bigkey.c,v 1.23 2009/02/12 06:33:13 lukem Exp $"); +#endif + +/* + * PACKAGE: hash + * DESCRIPTION: + * Big key/data handling for the hashing package. + * + * ROUTINES: + * External + * __big_keydata + * __big_split + * __big_insert + * __big_return + * __big_delete + * __find_last_page + * Internal + * collect_key + * collect_data + */ + +#include + +#include +#include +#include +#include +#include + +#include +#include "hash.h" +#include "page.h" +#include "extern.h" + +#ifndef _DIAGASSERT +#define _DIAGASSERT +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +static int collect_key(HTAB *, BUFHEAD *, int, DBT *, int); +static int collect_data(HTAB *, BUFHEAD *, int, int); + +/* + * Big_insert + * + * You need to do an insert and the key/data pair is too big + * + * Returns: + * 0 ==> OK + *-1 ==> ERROR + */ +int +__big_insert(HTAB *hashp, BUFHEAD *bufp, const DBT *key, const DBT *val) +{ + uint16_t *p, n; + size_t key_size, val_size; + uint16_t space, move_bytes, off; + char *cp, *key_data, *val_data; + size_t temp; + + cp = bufp->page; /* Character pointer of p. */ + p = (uint16_t *)(void *)cp; + + key_data = (char *)key->data; + _DBFIT(key->size, int); + key_size = key->size; + val_data = (char *)val->data; + _DBFIT(val->size, int); + val_size = val->size; + + /* First move the Key */ + + temp = FREESPACE(p) - BIGOVERHEAD; + _DBFIT(temp, uint16_t); + space = (uint16_t)temp; + while (key_size) { + move_bytes = MIN(space, key_size); + off = OFFSET(p) - move_bytes; + memmove(cp + off, key_data, (size_t)move_bytes); + key_size -= move_bytes; + key_data += move_bytes; + n = p[0]; + p[++n] = off; + p[0] = ++n; + temp = off - PAGE_META(n); + _DBFIT(temp, uint16_t); + FREESPACE(p) = (uint16_t)temp; + OFFSET(p) = off; + p[n] = PARTIAL_KEY; + bufp = __add_ovflpage(hashp, bufp); + if (!bufp) + return (-1); + n = p[0]; + if (!key_size) { + space = FREESPACE(p); + if (space) { + move_bytes = MIN(space, val_size); + /* + * If the data would fit exactly in the + * remaining space, we must overflow it to the + * next page; otherwise the invariant that the + * data must end on a page with FREESPACE + * non-zero would fail. + */ + if (space == val_size && val_size == val->size) + goto toolarge; + off = OFFSET(p) - move_bytes; + memmove(cp + off, val_data, (size_t)move_bytes); + val_data += move_bytes; + val_size -= move_bytes; + p[n] = off; + p[n - 2] = FULL_KEY_DATA; + FREESPACE(p) = FREESPACE(p) - move_bytes; + OFFSET(p) = off; + } else { + toolarge: + p[n - 2] = FULL_KEY; + } + } + p = (uint16_t *)(void *)bufp->page; + cp = bufp->page; + bufp->flags |= BUF_MOD; + temp = FREESPACE(p) - BIGOVERHEAD; + _DBFIT(temp, uint16_t); + space = (uint16_t)temp; + } + + /* Now move the data */ + temp = FREESPACE(p) - BIGOVERHEAD; + _DBFIT(temp, uint16_t); + space = (uint16_t)temp; + while (val_size) { + move_bytes = MIN(space, val_size); + /* + * Here's the hack to make sure that if the data ends on the + * same page as the key ends, FREESPACE is at least one. + */ + if (space == val_size && val_size == val->size) + move_bytes--; + off = OFFSET(p) - move_bytes; + memmove(cp + off, val_data, (size_t)move_bytes); + val_size -= move_bytes; + val_data += move_bytes; + n = p[0]; + p[++n] = off; + p[0] = ++n; + temp = off - PAGE_META(n); + _DBFIT(temp, uint16_t); + FREESPACE(p) = (uint16_t)temp; + OFFSET(p) = off; + if (val_size) { + p[n] = FULL_KEY; + bufp = __add_ovflpage(hashp, bufp); + if (!bufp) + return (-1); + cp = bufp->page; + p = (uint16_t *)(void *)cp; + } else + p[n] = FULL_KEY_DATA; + bufp->flags |= BUF_MOD; + temp = FREESPACE(p) - BIGOVERHEAD; + _DBFIT(temp, uint16_t); + space = (uint16_t)temp; + } + return (0); +} + +/* + * Called when bufp's page contains a partial key (index should be 1) + * + * All pages in the big key/data pair except bufp are freed. We cannot + * free bufp because the page pointing to it is lost and we can't get rid + * of its pointer. + * + * Returns: + * 0 => OK + *-1 => ERROR + */ +int +__big_delete(HTAB *hashp, BUFHEAD *bufp) +{ + BUFHEAD *last_bfp, *rbufp; + uint16_t *bp, pageno; + int key_done, n; + size_t temp; + + rbufp = bufp; + last_bfp = NULL; + bp = (uint16_t *)(void *)bufp->page; + pageno = 0; + key_done = 0; + + while (!key_done || (bp[2] != FULL_KEY_DATA)) { + if (bp[2] == FULL_KEY || bp[2] == FULL_KEY_DATA) + key_done = 1; + + /* + * If there is freespace left on a FULL_KEY_DATA page, then + * the data is short and fits entirely on this page, and this + * is the last page. + */ + if (bp[2] == FULL_KEY_DATA && FREESPACE(bp)) + break; + pageno = bp[bp[0] - 1]; + rbufp->flags |= BUF_MOD; + rbufp = __get_buf(hashp, (uint32_t)pageno, rbufp, 0); + if (last_bfp) + __free_ovflpage(hashp, last_bfp); + last_bfp = rbufp; + if (!rbufp) + return (-1); /* Error. */ + bp = (uint16_t *)(void *)rbufp->page; + } + + /* + * If we get here then rbufp points to the last page of the big + * key/data pair. Bufp points to the first one -- it should now be + * empty pointing to the next page after this pair. Can't free it + * because we don't have the page pointing to it. + */ + + /* This is information from the last page of the pair. */ + n = bp[0]; + pageno = bp[n - 1]; + + /* Now, bp is the first page of the pair. */ + bp = (uint16_t *)(void *)bufp->page; + if (n > 2) { + /* There is an overflow page. */ + bp[1] = pageno; + bp[2] = OVFLPAGE; + bufp->ovfl = rbufp->ovfl; + } else + /* This is the last page. */ + bufp->ovfl = NULL; + n -= 2; + bp[0] = n; + temp = hashp->BSIZE - PAGE_META(n); + _DBFIT(temp, uint16_t); + FREESPACE(bp) = (uint16_t)temp; + OFFSET(bp) = hashp->BSIZE; + + bufp->flags |= BUF_MOD; + if (rbufp) + __free_ovflpage(hashp, rbufp); + if (last_bfp && last_bfp != rbufp) + __free_ovflpage(hashp, last_bfp); + + hashp->NKEYS--; + return (0); +} +/* + * Returns: + * 0 = key not found + * -1 = get next overflow page + * -2 means key not found and this is big key/data + * -3 error + */ +int +__find_bigpair(HTAB *hashp, BUFHEAD *bufp, int ndx, char *key, int size) +{ + uint16_t *bp; + char *p; + int ksize; + uint16_t bytes; + char *kkey; + + bp = (uint16_t *)(void *)bufp->page; + p = bufp->page; + ksize = size; + kkey = key; + + for (bytes = hashp->BSIZE - bp[ndx]; + bytes <= size && bp[ndx + 1] == PARTIAL_KEY; + bytes = hashp->BSIZE - bp[ndx]) { + if (memcmp(p + bp[ndx], kkey, (size_t)bytes)) + return (-2); + kkey += bytes; + ksize -= bytes; + bufp = __get_buf(hashp, (uint32_t)bp[ndx + 2], bufp, 0); + if (!bufp) + return (-3); + p = bufp->page; + bp = (uint16_t *)(void *)p; + ndx = 1; + } + + if (bytes != ksize || memcmp(p + bp[ndx], kkey, (size_t)bytes)) { +#ifdef HASH_STATISTICS + ++hash_collisions; +#endif + return (-2); + } else + return (ndx); +} + +/* + * Given the buffer pointer of the first overflow page of a big pair, + * find the end of the big pair + * + * This will set bpp to the buffer header of the last page of the big pair. + * It will return the pageno of the overflow page following the last page + * of the pair; 0 if there isn't any (i.e. big pair is the last key in the + * bucket) + */ +uint16_t +__find_last_page(HTAB *hashp, BUFHEAD **bpp) +{ + BUFHEAD *bufp; + uint16_t *bp, pageno; + int n; + + bufp = *bpp; + bp = (uint16_t *)(void *)bufp->page; + for (;;) { + n = bp[0]; + + /* + * This is the last page if: the tag is FULL_KEY_DATA and + * either only 2 entries OVFLPAGE marker is explicit there + * is freespace on the page. + */ + if (bp[2] == FULL_KEY_DATA && + ((n == 2) || (bp[n] == OVFLPAGE) || (FREESPACE(bp)))) + break; + + pageno = bp[n - 1]; + bufp = __get_buf(hashp, (uint32_t)pageno, bufp, 0); + if (!bufp) + return (0); /* Need to indicate an error! */ + bp = (uint16_t *)(void *)bufp->page; + } + + *bpp = bufp; + if (bp[0] > 2) + return (bp[3]); + else + return (0); +} + +/* + * Return the data for the key/data pair that begins on this page at this + * index (index should always be 1). + */ +int +__big_return(HTAB *hashp, BUFHEAD *bufp, int ndx, DBT *val, int set_current) +{ + BUFHEAD *save_p; + uint16_t *bp, len, off, save_addr; + char *tp; + + bp = (uint16_t *)(void *)bufp->page; + while (bp[ndx + 1] == PARTIAL_KEY) { + bufp = __get_buf(hashp, (uint32_t)bp[bp[0] - 1], bufp, 0); + if (!bufp) + return (-1); + bp = (uint16_t *)(void *)bufp->page; + ndx = 1; + } + + if (bp[ndx + 1] == FULL_KEY) { + bufp = __get_buf(hashp, (uint32_t)bp[bp[0] - 1], bufp, 0); + if (!bufp) + return (-1); + bp = (uint16_t *)(void *)bufp->page; + save_p = bufp; + save_addr = save_p->addr; + off = bp[1]; + len = 0; + } else + if (!FREESPACE(bp)) { + /* + * This is a hack. We can't distinguish between + * FULL_KEY_DATA that contains complete data or + * incomplete data, so we require that if the data + * is complete, there is at least 1 byte of free + * space left. + */ + off = bp[bp[0]]; + len = bp[1] - off; + save_p = bufp; + save_addr = bufp->addr; + bufp = __get_buf(hashp, (uint32_t)bp[bp[0] - 1], bufp, + 0); + if (!bufp) + return (-1); + bp = (uint16_t *)(void *)bufp->page; + } else { + /* The data is all on one page. */ + tp = (char *)(void *)bp; + off = bp[bp[0]]; + val->data = (uint8_t *)tp + off; + val->size = bp[1] - off; + if (set_current) { + if (bp[0] == 2) { /* No more buckets in + * chain */ + hashp->cpage = NULL; + hashp->cbucket++; + hashp->cndx = 1; + } else { + hashp->cpage = __get_buf(hashp, + (uint32_t)bp[bp[0] - 1], bufp, 0); + if (!hashp->cpage) + return (-1); + hashp->cndx = 1; + if (!((uint16_t *)(void *) + hashp->cpage->page)[0]) { + hashp->cbucket++; + hashp->cpage = NULL; + } + } + } + return (0); + } + + val->size = collect_data(hashp, bufp, (int)len, set_current); + if (val->size == (size_t)-1) + return (-1); + if (save_p->addr != save_addr) { + /* We are pretty short on buffers. */ + errno = EINVAL; /* OUT OF BUFFERS */ + return (-1); + } + memmove(hashp->tmp_buf, (save_p->page) + off, (size_t)len); + val->data = (uint8_t *)hashp->tmp_buf; + return (0); +} +/* + * Count how big the total datasize is by recursing through the pages. Then + * allocate a buffer and copy the data as you recurse up. + */ +static int +collect_data(HTAB *hashp, BUFHEAD *bufp, int len, int set) +{ + uint16_t *bp; + char *p; + BUFHEAD *xbp; + uint16_t save_addr; + int mylen, totlen; + + p = bufp->page; + bp = (uint16_t *)(void *)p; + mylen = hashp->BSIZE - bp[1]; + save_addr = bufp->addr; + + if (bp[2] == FULL_KEY_DATA) { /* End of Data */ + totlen = len + mylen; + if (hashp->tmp_buf) + free(hashp->tmp_buf); + if ((hashp->tmp_buf = calloc(1, (size_t)totlen)) == NULL) + return (-1); + if (set) { + hashp->cndx = 1; + if (bp[0] == 2) { /* No more buckets in chain */ + hashp->cpage = NULL; + hashp->cbucket++; + } else { + hashp->cpage = + __get_buf(hashp, (uint32_t)bp[bp[0] - 1], + bufp, 0); + if (!hashp->cpage) + return (-1); + else if (!((uint16_t *)(void *)hashp->cpage->page)[0]) { + hashp->cbucket++; + hashp->cpage = NULL; + } + } + } + } else { + xbp = __get_buf(hashp, (uint32_t)bp[bp[0] - 1], bufp, 0); + if (!xbp || ((totlen = + collect_data(hashp, xbp, len + mylen, set)) < 1)) + return (-1); + } + if (bufp->addr != save_addr) { + errno = EINVAL; /* Out of buffers. */ + return (-1); + } + memmove(&hashp->tmp_buf[len], (bufp->page) + bp[1], (size_t)mylen); + return (totlen); +} + +/* + * Fill in the key and data for this big pair. + */ +int +__big_keydata(HTAB *hashp, BUFHEAD *bufp, DBT *key, DBT *val, int set) +{ + key->size = collect_key(hashp, bufp, 0, val, set); + if (key->size == (size_t)-1) + return (-1); + key->data = (uint8_t *)hashp->tmp_key; + return (0); +} + +/* + * Count how big the total key size is by recursing through the pages. Then + * collect the data, allocate a buffer and copy the key as you recurse up. + */ +static int +collect_key(HTAB *hashp, BUFHEAD *bufp, int len, DBT *val, int set) +{ + BUFHEAD *xbp; + char *p; + int mylen, totlen; + uint16_t *bp, save_addr; + + p = bufp->page; + bp = (uint16_t *)(void *)p; + mylen = hashp->BSIZE - bp[1]; + + save_addr = bufp->addr; + totlen = len + mylen; + if (bp[2] == FULL_KEY || bp[2] == FULL_KEY_DATA) { /* End of Key. */ + if (hashp->tmp_key != NULL) + free(hashp->tmp_key); + if ((hashp->tmp_key = calloc(1, (size_t)totlen)) == NULL) + return (-1); + if (__big_return(hashp, bufp, 1, val, set)) + return (-1); + } else { + xbp = __get_buf(hashp, (uint32_t)bp[bp[0] - 1], bufp, 0); + if (!xbp || ((totlen = + collect_key(hashp, xbp, totlen, val, set)) < 1)) + return (-1); + } + if (bufp->addr != save_addr) { + errno = EINVAL; /* MIS -- OUT OF BUFFERS */ + return (-1); + } + memmove(&hashp->tmp_key[len], (bufp->page) + bp[1], (size_t)mylen); + return (totlen); +} + +/* + * Returns: + * 0 => OK + * -1 => error + */ +int +__big_split( + HTAB *hashp, + BUFHEAD *op, /* Pointer to where to put keys that go in old bucket */ + BUFHEAD *np, /* Pointer to new bucket page */ + /* Pointer to first page containing the big key/data */ + BUFHEAD *big_keyp, + int addr, /* Address of big_keyp */ + uint32_t obucket,/* Old Bucket */ + SPLIT_RETURN *ret +) +{ + BUFHEAD *tmpp; + uint16_t *tp; + BUFHEAD *bp; + DBT key, val; + uint32_t change; + uint16_t free_space, n, off; + size_t temp; + + bp = big_keyp; + + /* Now figure out where the big key/data goes */ + if (__big_keydata(hashp, big_keyp, &key, &val, 0)) + return (-1); + change = (__call_hash(hashp, key.data, (int)key.size) != obucket); + + if ((ret->next_addr = __find_last_page(hashp, &big_keyp)) != 0) { + if (!(ret->nextp = + __get_buf(hashp, (uint32_t)ret->next_addr, big_keyp, 0))) + return (-1); + } else + ret->nextp = NULL; + + /* Now make one of np/op point to the big key/data pair */ + _DIAGASSERT(np->ovfl == NULL); + if (change) + tmpp = np; + else + tmpp = op; + + tmpp->flags |= BUF_MOD; +#ifdef DEBUG1 + (void)fprintf(stderr, + "BIG_SPLIT: %d->ovfl was %d is now %d\n", tmpp->addr, + (tmpp->ovfl ? tmpp->ovfl->addr : 0), (bp ? bp->addr : 0)); +#endif + tmpp->ovfl = bp; /* one of op/np point to big_keyp */ + tp = (uint16_t *)(void *)tmpp->page; + _DIAGASSERT(FREESPACE(tp) >= OVFLSIZE); + n = tp[0]; + off = OFFSET(tp); + free_space = FREESPACE(tp); + tp[++n] = (uint16_t)addr; + tp[++n] = OVFLPAGE; + tp[0] = n; + OFFSET(tp) = off; + temp = free_space - OVFLSIZE; + _DBFIT(temp, uint16_t); + FREESPACE(tp) = (uint16_t)temp; + + /* + * Finally, set the new and old return values. BIG_KEYP contains a + * pointer to the last page of the big key_data pair. Make sure that + * big_keyp has no following page (2 elements) or create an empty + * following page. + */ + + ret->newp = np; + ret->oldp = op; + + tp = (uint16_t *)(void *)big_keyp->page; + big_keyp->flags |= BUF_MOD; + if (tp[0] > 2) { + /* + * There may be either one or two offsets on this page. If + * there is one, then the overflow page is linked on normally + * and tp[4] is OVFLPAGE. If there are two, tp[4] contains + * the second offset and needs to get stuffed in after the + * next overflow page is added. + */ + n = tp[4]; + free_space = FREESPACE(tp); + off = OFFSET(tp); + tp[0] -= 2; + temp = free_space + OVFLSIZE; + _DBFIT(temp, uint16_t); + FREESPACE(tp) = (uint16_t)temp; + OFFSET(tp) = off; + tmpp = __add_ovflpage(hashp, big_keyp); + if (!tmpp) + return (-1); + tp[4] = n; + } else + tmpp = big_keyp; + + if (change) + ret->newp = tmpp; + else + ret->oldp = tmpp; + return (0); +} diff --git a/lib/libc/db/hash/hash_buf.c b/lib/libc/db/hash/hash_buf.c new file mode 100644 index 000000000..98a868164 --- /dev/null +++ b/lib/libc/db/hash/hash_buf.c @@ -0,0 +1,346 @@ +/* $NetBSD: hash_buf.c,v 1.18 2009/04/23 22:09:23 christos Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: hash_buf.c,v 1.18 2009/04/23 22:09:23 christos Exp $"); +#endif + +/* + * PACKAGE: hash + * + * DESCRIPTION: + * Contains buffer management + * + * ROUTINES: + * External + * __buf_init + * __get_buf + * __buf_free + * __reclaim_buf + * Internal + * newbuf + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include "hash.h" +#include "page.h" +#include "extern.h" + +static BUFHEAD *newbuf(HTAB *, uint32_t, BUFHEAD *); + +/* Unlink B from its place in the lru */ +#define BUF_REMOVE(B) { \ + (B)->prev->next = (B)->next; \ + (B)->next->prev = (B)->prev; \ +} + +/* Insert B after P */ +#define BUF_INSERT(B, P) { \ + (B)->next = (P)->next; \ + (B)->prev = (P); \ + (P)->next = (B); \ + (B)->next->prev = (B); \ +} + +#define MRU hashp->bufhead.next +#define LRU hashp->bufhead.prev + +#define MRU_INSERT(B) BUF_INSERT((B), &hashp->bufhead) +#define LRU_INSERT(B) BUF_INSERT((B), LRU) + +#ifndef _DIAGASSERT +#define _DIAGASSERT assert +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +/* + * We are looking for a buffer with address "addr". If prev_bp is NULL, then + * address is a bucket index. If prev_bp is not NULL, then it points to the + * page previous to an overflow page that we are trying to find. + * + * CAVEAT: The buffer header accessed via prev_bp's ovfl field may no longer + * be valid. Therefore, you must always verify that its address matches the + * address you are seeking. + */ +BUFHEAD * +__get_buf( + HTAB *hashp, + uint32_t addr, + BUFHEAD *prev_bp, + int newpage /* If prev_bp set, indicates a new overflow page. */ +) +{ + BUFHEAD *bp; + uint32_t is_disk_mask; + int is_disk, segment_ndx = 0; /* pacify gcc */ + SEGMENT segp = NULL; /* pacify gcc */ + + is_disk = 0; + is_disk_mask = 0; + if (prev_bp) { + bp = prev_bp->ovfl; + if (!bp || (bp->addr != addr)) + bp = NULL; + if (!newpage) + is_disk = BUF_DISK; + } else { + /* Grab buffer out of directory */ + segment_ndx = addr & (hashp->SGSIZE - 1); + + /* valid segment ensured by __call_hash() */ + segp = hashp->dir[addr >> hashp->SSHIFT]; + _DIAGASSERT(segp != NULL); + bp = PTROF(segp[segment_ndx]); + is_disk_mask = ISDISK(segp[segment_ndx]); + is_disk = is_disk_mask || !hashp->new_file; + } + + if (!bp) { + bp = newbuf(hashp, addr, prev_bp); + if (!bp || + __get_page(hashp, bp->page, addr, !prev_bp, is_disk, 0)) + return (NULL); + if (!prev_bp) + segp[segment_ndx] = + (BUFHEAD *)(void *)((u_long)bp | is_disk_mask); + } else { + BUF_REMOVE(bp); + MRU_INSERT(bp); + } + return (bp); +} + +/* + * We need a buffer for this page. Either allocate one, or evict a resident + * one (if we have as many buffers as we're allowed) and put this one in. + * + * If newbuf finds an error (returning NULL), it also sets errno. + */ +static BUFHEAD * +newbuf(HTAB *hashp, uint32_t addr, BUFHEAD *prev_bp) +{ + BUFHEAD *bp; /* The buffer we're going to use */ + BUFHEAD *xbp; /* Temp pointer */ + BUFHEAD *next_xbp; + SEGMENT segp; + int segment_ndx; + uint16_t oaddr, *shortp; + + oaddr = 0; + bp = LRU; + /* + * If LRU buffer is pinned, the buffer pool is too small. We need to + * allocate more buffers. + */ + if (hashp->nbufs || (bp->flags & BUF_PIN)) { + /* Allocate a new one */ + if ((bp = calloc(1, sizeof(BUFHEAD))) == NULL) + return (NULL); + if ((bp->page = calloc(1, (size_t)hashp->BSIZE)) == NULL) { + free(bp); + return (NULL); + } + if (hashp->nbufs) + hashp->nbufs--; + } else { + /* Kick someone out */ + BUF_REMOVE(bp); + /* + * If this is an overflow page with addr 0, it's already been + * flushed back in an overflow chain and initialized. + */ + if ((bp->addr != 0) || (bp->flags & BUF_BUCKET)) { + /* + * Set oaddr before __put_page so that you get it + * before bytes are swapped. + */ + shortp = (uint16_t *)(void *)bp->page; + if (shortp[0]) + oaddr = shortp[shortp[0] - 1]; + if ((bp->flags & BUF_MOD) && __put_page(hashp, bp->page, + bp->addr, (int)IS_BUCKET(bp->flags), 0)) + return (NULL); + /* + * Update the pointer to this page (i.e. invalidate it). + * + * If this is a new file (i.e. we created it at open + * time), make sure that we mark pages which have been + * written to disk so we retrieve them from disk later, + * rather than allocating new pages. + */ + if (IS_BUCKET(bp->flags)) { + segment_ndx = bp->addr & (hashp->SGSIZE - 1); + segp = hashp->dir[bp->addr >> hashp->SSHIFT]; + _DIAGASSERT(segp != NULL); + + if (hashp->new_file && + ((bp->flags & BUF_MOD) || + ISDISK(segp[segment_ndx]))) + segp[segment_ndx] = (BUFHEAD *)BUF_DISK; + else + segp[segment_ndx] = NULL; + } + /* + * Since overflow pages can only be access by means of + * their bucket, free overflow pages associated with + * this bucket. + */ + for (xbp = bp; xbp->ovfl;) { + next_xbp = xbp->ovfl; + xbp->ovfl = 0; + xbp = next_xbp; + + /* Check that ovfl pointer is up date. */ + if (IS_BUCKET(xbp->flags) || + (oaddr != xbp->addr)) + break; + + shortp = (uint16_t *)(void *)xbp->page; + if (shortp[0]) + /* set before __put_page */ + oaddr = shortp[shortp[0] - 1]; + if ((xbp->flags & BUF_MOD) && __put_page(hashp, + xbp->page, xbp->addr, 0, 0)) + return (NULL); + xbp->addr = 0; + xbp->flags = 0; + BUF_REMOVE(xbp); + LRU_INSERT(xbp); + } + } + } + + /* Now assign this buffer */ + bp->addr = addr; +#ifdef DEBUG1 + (void)fprintf(stderr, "NEWBUF1: %d->ovfl was %d is now %d\n", + bp->addr, (bp->ovfl ? bp->ovfl->addr : 0), 0); +#endif + bp->ovfl = NULL; + if (prev_bp) { + /* + * If prev_bp is set, this is an overflow page, hook it in to + * the buffer overflow links. + */ +#ifdef DEBUG1 + (void)fprintf(stderr, "NEWBUF2: %d->ovfl was %d is now %d\n", + prev_bp->addr, (prev_bp->ovfl ? prev_bp->ovfl->addr : 0), + (bp ? bp->addr : 0)); +#endif + prev_bp->ovfl = bp; + bp->flags = 0; + } else + bp->flags = BUF_BUCKET; + MRU_INSERT(bp); + return (bp); +} + +void +__buf_init(HTAB *hashp, u_int nbytes) +{ + BUFHEAD *bfp; + int npages; + + bfp = &(hashp->bufhead); + npages = (unsigned int)(nbytes + hashp->BSIZE - 1) >> hashp->BSHIFT; + npages = MAX(npages, MIN_BUFFERS); + + hashp->nbufs = npages; + bfp->next = bfp; + bfp->prev = bfp; + /* + * This space is calloc'd so these are already null. + * + * bfp->ovfl = NULL; + * bfp->flags = 0; + * bfp->page = NULL; + * bfp->addr = 0; + */ +} + +int +__buf_free(HTAB *hashp, int do_free, int to_disk) +{ + BUFHEAD *bp; + + /* Need to make sure that buffer manager has been initialized */ + if (!LRU) + return (0); + for (bp = LRU; bp != &hashp->bufhead;) { + /* Check that the buffer is valid */ + if (bp->addr || IS_BUCKET(bp->flags)) { + if (to_disk && (bp->flags & BUF_MOD) && + __put_page(hashp, bp->page, + bp->addr, IS_BUCKET(bp->flags), 0)) + return (-1); + } + /* Check if we are freeing stuff */ + if (do_free) { + if (bp->page) { + (void)memset(bp->page, 0, (size_t)hashp->BSIZE); + free(bp->page); + } + BUF_REMOVE(bp); + free(bp); + bp = LRU; + } else + bp = bp->prev; + } + return (0); +} + +void +__reclaim_buf(HTAB *hashp, BUFHEAD *bp) +{ + bp->ovfl = 0; + bp->addr = 0; + bp->flags = 0; + BUF_REMOVE(bp); + LRU_INSERT(bp); +} diff --git a/lib/libc/db/hash/hash_func.c b/lib/libc/db/hash/hash_func.c new file mode 100644 index 000000000..bad63e87e --- /dev/null +++ b/lib/libc/db/hash/hash_func.c @@ -0,0 +1,210 @@ +/* $NetBSD: hash_func.c,v 1.13 2008/09/10 17:52:35 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: hash_func.c,v 1.13 2008/09/10 17:52:35 joerg Exp $"); +#endif + +#include + +#include +#include "hash.h" +#include "page.h" +#include "extern.h" + +#if 0 +static uint32_t hash1(const void *, size_t) __attribute__((__unused__)); +static uint32_t hash2(const void *, size_t) __attribute__((__unused__)); +static uint32_t hash3(const void *, size_t) __attribute__((__unused__)); +#endif +static uint32_t hash4(const void *, size_t) __attribute__((__unused__)); + +/* Global default hash function */ +uint32_t (*__default_hash)(const void *, size_t) = hash4; +#if 0 +/* + * HASH FUNCTIONS + * + * Assume that we've already split the bucket to which this key hashes, + * calculate that bucket, and check that in fact we did already split it. + * + * This came from ejb's hsearch. + */ + +#define PRIME1 37 +#define PRIME2 1048583 + +static uint32_t +hash1(const void *keyarg, size_t len) +{ + const uint8_t *key; + uint32_t h; + + /* Convert string to integer */ + for (key = keyarg, h = 0; len--;) + h = h * PRIME1 ^ (*key++ - ' '); + h %= PRIME2; + return (h); +} + +/* + * Phong's linear congruential hash + */ +#define dcharhash(h, c) ((h) = 0x63c63cd9*(h) + 0x9c39c33d + (c)) + +static uint32_t +hash2(const void *keyarg, size_t len) +{ + const uint8_t *e, *key; + uint32_t h; + uint8_t c; + + key = keyarg; + e = key + len; + for (h = 0; key != e;) { + c = *key++; + if (!c && key > e) + break; + dcharhash(h, c); + } + return (h); +} + +/* + * This is INCREDIBLY ugly, but fast. We break the string up into 8 byte + * units. On the first time through the loop we get the "leftover bytes" + * (strlen % 8). On every other iteration, we perform 8 HASHC's so we handle + * all 8 bytes. Essentially, this saves us 7 cmp & branch instructions. If + * this routine is heavily used enough, it's worth the ugly coding. + * + * OZ's original sdbm hash + */ +static uint32_t +hash3(const void *keyarg, size_t len) +{ + const uint8_t *key; + size_t loop; + uint32_t h; + +#define HASHC h = *key++ + 65599 * h + + h = 0; + key = keyarg; + if (len > 0) { + loop = (len + 8 - 1) >> 3; + + switch (len & (8 - 1)) { + case 0: + do { + HASHC; + /* FALLTHROUGH */ + case 7: + HASHC; + /* FALLTHROUGH */ + case 6: + HASHC; + /* FALLTHROUGH */ + case 5: + HASHC; + /* FALLTHROUGH */ + case 4: + HASHC; + /* FALLTHROUGH */ + case 3: + HASHC; + /* FALLTHROUGH */ + case 2: + HASHC; + /* FALLTHROUGH */ + case 1: + HASHC; + } while (--loop); + } + } + return (h); +} +#endif + +/* Hash function from Chris Torek. */ +static uint32_t +hash4(const void *keyarg, size_t len) +{ + const uint8_t *key; + size_t loop; + uint32_t h; + +#define HASH4a h = (h << 5) - h + *key++; +#define HASH4b h = (h << 5) + h + *key++; +#define HASH4 HASH4b + + h = 0; + key = keyarg; + if (len > 0) { + loop = (len + 8 - 1) >> 3; + + switch (len & (8 - 1)) { + case 0: + do { + HASH4; + /* FALLTHROUGH */ + case 7: + HASH4; + /* FALLTHROUGH */ + case 6: + HASH4; + /* FALLTHROUGH */ + case 5: + HASH4; + /* FALLTHROUGH */ + case 4: + HASH4; + /* FALLTHROUGH */ + case 3: + HASH4; + /* FALLTHROUGH */ + case 2: + HASH4; + /* FALLTHROUGH */ + case 1: + HASH4; + } while (--loop); + } + } + return (h); +} diff --git a/lib/libc/db/hash/hash_log2.c b/lib/libc/db/hash/hash_log2.c new file mode 100644 index 000000000..688e39b0e --- /dev/null +++ b/lib/libc/db/hash/hash_log2.c @@ -0,0 +1,64 @@ +/* $NetBSD: hash_log2.c,v 1.13 2008/09/11 12:33:55 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: hash_log2.c,v 1.13 2008/09/11 12:33:55 joerg Exp $"); +#endif + +#include + +#include +#include "hash.h" +#include "page.h" +#include "extern.h" + +uint32_t +__log2(uint32_t num) +{ + uint32_t i, limit; + + if (num == 0) + return 0; + --num; + + limit = 0; + for (i = 0; limit < num; limit = limit * 2 + 1, i++) + continue; + return (i); +} diff --git a/lib/libc/db/hash/hash_page.c b/lib/libc/db/hash/hash_page.c new file mode 100644 index 000000000..b4cf1215f --- /dev/null +++ b/lib/libc/db/hash/hash_page.c @@ -0,0 +1,989 @@ +/* $NetBSD: hash_page.c,v 1.23 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: hash_page.c,v 1.23 2008/09/11 12:58:00 joerg Exp $"); +#endif + +/* + * PACKAGE: hashing + * + * DESCRIPTION: + * Page manipulation for hashing package. + * + * ROUTINES: + * + * External + * __get_page + * __add_ovflpage + * Internal + * overflow_page + * open_temp + */ + +#ifndef __minix +#include "namespace.h" +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include +#ifndef __minix +#include +#else +#define _PATH_TMP "/tmp/" +#endif +#include + +#include +#include "hash.h" +#include "page.h" +#include "extern.h" + +#ifndef LITTLE_ENDIAN +# define LITTLE_ENDIAN 1234 +#endif + +#ifndef BIG_ENDIAN +# define BIG_ENDIAN 4321 +#endif + +#ifndef BYTE_ORDER +#define BYTE_ORDER LITTLE_ENDIAN +#endif + +#ifndef _DIAGASSERT +#define _DIAGASSERT assert +#endif + +static uint32_t *fetch_bitmap(HTAB *, int); +static uint32_t first_free(uint32_t); +static int open_temp(HTAB *); +static uint16_t overflow_page(HTAB *); +static void putpair(char *, const DBT *, const DBT *); +static void squeeze_key(uint16_t *, const DBT *, const DBT *); +static int ugly_split(HTAB *, uint32_t, BUFHEAD *, BUFHEAD *, int, int); + +#define PAGE_INIT(P) { \ + ((uint16_t *)(void *)(P))[0] = 0; \ + temp = 3 * sizeof(uint16_t); \ + _DIAGASSERT(hashp->BSIZE >= temp); \ + ((uint16_t *)(void *)(P))[1] = (uint16_t)(hashp->BSIZE - temp); \ + ((uint16_t *)(void *)(P))[2] = hashp->BSIZE; \ +} + +/* + * This is called AFTER we have verified that there is room on the page for + * the pair (PAIRFITS has returned true) so we go right ahead and start moving + * stuff on. + */ +static void +putpair(char *p, const DBT *key, const DBT *val) +{ + uint16_t *bp, n, off; + size_t temp; + + bp = (uint16_t *)(void *)p; + + /* Enter the key first. */ + n = bp[0]; + + temp = OFFSET(bp); + _DIAGASSERT(temp >= key->size); + off = (uint16_t)(temp - key->size); + memmove(p + off, key->data, key->size); + bp[++n] = off; + + /* Now the data. */ + _DIAGASSERT(off >= val->size); + off -= (uint16_t)val->size; + memmove(p + off, val->data, val->size); + bp[++n] = off; + + /* Adjust page info. */ + bp[0] = n; + temp = (n + 3) * sizeof(uint16_t); + _DIAGASSERT(off >= temp); + bp[n + 1] = (uint16_t)(off - temp); + bp[n + 2] = off; +} + +/* + * Returns: + * 0 OK + * -1 error + */ +int +__delpair(HTAB *hashp, BUFHEAD *bufp, int ndx) +{ + uint16_t *bp, newoff; + int n; + uint16_t pairlen; + size_t temp; + + bp = (uint16_t *)(void *)bufp->page; + n = bp[0]; + + if (bp[ndx + 1] < REAL_KEY) + return (__big_delete(hashp, bufp)); + if (ndx != 1) + newoff = bp[ndx - 1]; + else + newoff = hashp->BSIZE; + pairlen = newoff - bp[ndx + 1]; + + if (ndx != (n - 1)) { + /* Hard Case -- need to shuffle keys */ + int i; + char *src = bufp->page + (int)OFFSET(bp); + char *dst = src + (int)pairlen; + memmove(dst, src, (size_t)(bp[ndx + 1] - OFFSET(bp))); + + /* Now adjust the pointers */ + for (i = ndx + 2; i <= n; i += 2) { + if (bp[i + 1] == OVFLPAGE) { + bp[i - 2] = bp[i]; + bp[i - 1] = bp[i + 1]; + } else { + bp[i - 2] = bp[i] + pairlen; + bp[i - 1] = bp[i + 1] + pairlen; + } + } + } + /* Finally adjust the page data */ + bp[n] = OFFSET(bp) + pairlen; + temp = bp[n + 1] + pairlen + 2 * sizeof(uint16_t); + _DIAGASSERT(temp <= 0xffff); + bp[n - 1] = (uint16_t)temp; + bp[0] = n - 2; + hashp->NKEYS--; + + bufp->flags |= BUF_MOD; + return (0); +} +/* + * Returns: + * 0 ==> OK + * -1 ==> Error + */ +int +__split_page(HTAB *hashp, uint32_t obucket, uint32_t nbucket) +{ + BUFHEAD *new_bufp, *old_bufp; + uint16_t *ino; + char *np; + DBT key, val; + int n, ndx, retval; + uint16_t copyto, diff, off, moved; + char *op; + size_t temp; + + copyto = (uint16_t)hashp->BSIZE; + off = (uint16_t)hashp->BSIZE; + old_bufp = __get_buf(hashp, obucket, NULL, 0); + if (old_bufp == NULL) + return (-1); + new_bufp = __get_buf(hashp, nbucket, NULL, 0); + if (new_bufp == NULL) + return (-1); + + old_bufp->flags |= (BUF_MOD | BUF_PIN); + new_bufp->flags |= (BUF_MOD | BUF_PIN); + + ino = (uint16_t *)(void *)(op = old_bufp->page); + np = new_bufp->page; + + moved = 0; + + for (n = 1, ndx = 1; n < ino[0]; n += 2) { + if (ino[n + 1] < REAL_KEY) { + retval = ugly_split(hashp, obucket, old_bufp, new_bufp, + (int)copyto, (int)moved); + old_bufp->flags &= ~BUF_PIN; + new_bufp->flags &= ~BUF_PIN; + return (retval); + + } + key.data = (uint8_t *)op + ino[n]; + key.size = off - ino[n]; + + if (__call_hash(hashp, key.data, (int)key.size) == obucket) { + /* Don't switch page */ + diff = copyto - off; + if (diff) { + copyto = ino[n + 1] + diff; + memmove(op + copyto, op + ino[n + 1], + (size_t)(off - ino[n + 1])); + ino[ndx] = copyto + ino[n] - ino[n + 1]; + ino[ndx + 1] = copyto; + } else + copyto = ino[n + 1]; + ndx += 2; + } else { + /* Switch page */ + val.data = (uint8_t *)op + ino[n + 1]; + val.size = ino[n] - ino[n + 1]; + putpair(np, &key, &val); + moved += 2; + } + + off = ino[n + 1]; + } + + /* Now clean up the page */ + ino[0] -= moved; + temp = sizeof(uint16_t) * (ino[0] + 3); + _DIAGASSERT(copyto >= temp); + FREESPACE(ino) = (uint16_t)(copyto - temp); + OFFSET(ino) = copyto; + +#ifdef DEBUG3 + (void)fprintf(stderr, "split %d/%d\n", + ((uint16_t *)np)[0] / 2, + ((uint16_t *)op)[0] / 2); +#endif + /* unpin both pages */ + old_bufp->flags &= ~BUF_PIN; + new_bufp->flags &= ~BUF_PIN; + return (0); +} + +/* + * Called when we encounter an overflow or big key/data page during split + * handling. This is special cased since we have to begin checking whether + * the key/data pairs fit on their respective pages and because we may need + * overflow pages for both the old and new pages. + * + * The first page might be a page with regular key/data pairs in which case + * we have a regular overflow condition and just need to go on to the next + * page or it might be a big key/data pair in which case we need to fix the + * big key/data pair. + * + * Returns: + * 0 ==> success + * -1 ==> failure + */ +static int +ugly_split( + HTAB *hashp, + uint32_t obucket, /* Same as __split_page. */ + BUFHEAD *old_bufp, + BUFHEAD *new_bufp, + int copyto, /* First byte on page which contains key/data values. */ + int moved /* Number of pairs moved to new page. */ +) +{ + BUFHEAD *bufp; /* Buffer header for ino */ + uint16_t *ino; /* Page keys come off of */ + uint16_t *np; /* New page */ + uint16_t *op; /* Page keys go on to if they aren't moving */ + size_t temp; + + BUFHEAD *last_bfp; /* Last buf header OVFL needing to be freed */ + DBT key, val; + SPLIT_RETURN ret; + uint16_t n, off, ov_addr, scopyto; + char *cino; /* Character value of ino */ + + bufp = old_bufp; + ino = (uint16_t *)(void *)old_bufp->page; + np = (uint16_t *)(void *)new_bufp->page; + op = (uint16_t *)(void *)old_bufp->page; + last_bfp = NULL; + scopyto = (uint16_t)copyto; /* ANSI */ + + n = ino[0] - 1; + while (n < ino[0]) { + if (ino[2] < REAL_KEY && ino[2] != OVFLPAGE) { + if (__big_split(hashp, old_bufp, + new_bufp, bufp, (int)bufp->addr, obucket, &ret)) + return (-1); + old_bufp = ret.oldp; + if (!old_bufp) + return (-1); + op = (uint16_t *)(void *)old_bufp->page; + new_bufp = ret.newp; + if (!new_bufp) + return (-1); + np = (uint16_t *)(void *)new_bufp->page; + bufp = ret.nextp; + if (!bufp) + return (0); + cino = (char *)bufp->page; + ino = (uint16_t *)(void *)cino; + last_bfp = ret.nextp; + } else if (ino[n + 1] == OVFLPAGE) { + ov_addr = ino[n]; + /* + * Fix up the old page -- the extra 2 are the fields + * which contained the overflow information. + */ + ino[0] -= (moved + 2); + temp = sizeof(uint16_t) * (ino[0] + 3); + _DIAGASSERT(scopyto >= temp); + FREESPACE(ino) = (uint16_t)(scopyto - temp); + OFFSET(ino) = scopyto; + + bufp = __get_buf(hashp, (uint32_t)ov_addr, bufp, 0); + if (!bufp) + return (-1); + + ino = (uint16_t *)(void *)bufp->page; + n = 1; + scopyto = hashp->BSIZE; + moved = 0; + + if (last_bfp) + __free_ovflpage(hashp, last_bfp); + last_bfp = bufp; + } + /* Move regular sized pairs of there are any */ + off = hashp->BSIZE; + for (n = 1; (n < ino[0]) && (ino[n + 1] >= REAL_KEY); n += 2) { + cino = (char *)(void *)ino; + key.data = (uint8_t *)cino + ino[n]; + key.size = off - ino[n]; + val.data = (uint8_t *)cino + ino[n + 1]; + val.size = ino[n] - ino[n + 1]; + off = ino[n + 1]; + + if (__call_hash(hashp, key.data, (int)key.size) == obucket) { + /* Keep on old page */ + if (PAIRFITS(op, (&key), (&val))) + putpair((char *)(void *)op, &key, &val); + else { + old_bufp = + __add_ovflpage(hashp, old_bufp); + if (!old_bufp) + return (-1); + op = (uint16_t *)(void *)old_bufp->page; + putpair((char *)(void *)op, &key, &val); + } + old_bufp->flags |= BUF_MOD; + } else { + /* Move to new page */ + if (PAIRFITS(np, (&key), (&val))) + putpair((char *)(void *)np, &key, &val); + else { + new_bufp = + __add_ovflpage(hashp, new_bufp); + if (!new_bufp) + return (-1); + np = (uint16_t *)(void *)new_bufp->page; + putpair((char *)(void *)np, &key, &val); + } + new_bufp->flags |= BUF_MOD; + } + } + } + if (last_bfp) + __free_ovflpage(hashp, last_bfp); + return (0); +} + +/* + * Add the given pair to the page + * + * Returns: + * 0 ==> OK + * 1 ==> failure + */ +int +__addel(HTAB *hashp, BUFHEAD *bufp, const DBT *key, const DBT *val) +{ + uint16_t *bp, *sop; + int do_expand; + + bp = (uint16_t *)(void *)bufp->page; + do_expand = 0; + while (bp[0] && (bp[2] < REAL_KEY || bp[bp[0]] < REAL_KEY)) + /* Exception case */ + if (bp[2] == FULL_KEY_DATA && bp[0] == 2) + /* This is the last page of a big key/data pair + and we need to add another page */ + break; + else if (bp[2] < REAL_KEY && bp[bp[0]] != OVFLPAGE) { + bufp = __get_buf(hashp, (uint32_t)bp[bp[0] - 1], bufp, + 0); + if (!bufp) + return (-1); + bp = (uint16_t *)(void *)bufp->page; + } else if (bp[bp[0]] != OVFLPAGE) { + /* Short key/data pairs, no more pages */ + break; + } else { + /* Try to squeeze key on this page */ + if (bp[2] >= REAL_KEY && + FREESPACE(bp) >= PAIRSIZE(key, val)) { + squeeze_key(bp, key, val); + goto stats; + } else { + bufp = __get_buf(hashp, + (uint32_t)bp[bp[0] - 1], bufp, 0); + if (!bufp) + return (-1); + bp = (uint16_t *)(void *)bufp->page; + } + } + + if (PAIRFITS(bp, key, val)) + putpair(bufp->page, key, val); + else { + do_expand = 1; + bufp = __add_ovflpage(hashp, bufp); + if (!bufp) + return (-1); + sop = (uint16_t *)(void *)bufp->page; + + if (PAIRFITS(sop, key, val)) + putpair((char *)(void *)sop, key, val); + else + if (__big_insert(hashp, bufp, key, val)) + return (-1); + } +stats: + bufp->flags |= BUF_MOD; + /* + * If the average number of keys per bucket exceeds the fill factor, + * expand the table. + */ + hashp->NKEYS++; + if (do_expand || + (hashp->NKEYS / (hashp->MAX_BUCKET + 1) > hashp->FFACTOR)) + return (__expand_table(hashp)); + return (0); +} + +/* + * + * Returns: + * pointer on success + * NULL on error + */ +BUFHEAD * +__add_ovflpage(HTAB *hashp, BUFHEAD *bufp) +{ + uint16_t *sp; + uint16_t ndx, ovfl_num; + size_t temp; +#ifdef DEBUG1 + int tmp1, tmp2; +#endif + sp = (uint16_t *)(void *)bufp->page; + + /* Check if we are dynamically determining the fill factor */ + if (hashp->FFACTOR == DEF_FFACTOR) { + hashp->FFACTOR = (uint32_t)sp[0] >> 1; + if (hashp->FFACTOR < MIN_FFACTOR) + hashp->FFACTOR = MIN_FFACTOR; + } + bufp->flags |= BUF_MOD; + ovfl_num = overflow_page(hashp); +#ifdef DEBUG1 + tmp1 = bufp->addr; + tmp2 = bufp->ovfl ? bufp->ovfl->addr : 0; +#endif + if (!ovfl_num || !(bufp->ovfl = __get_buf(hashp, (uint32_t)ovfl_num, + bufp, 1))) + return (NULL); + bufp->ovfl->flags |= BUF_MOD; +#ifdef DEBUG1 + (void)fprintf(stderr, "ADDOVFLPAGE: %d->ovfl was %d is now %d\n", + tmp1, tmp2, bufp->ovfl->addr); +#endif + ndx = sp[0]; + /* + * Since a pair is allocated on a page only if there's room to add + * an overflow page, we know that the OVFL information will fit on + * the page. + */ + sp[ndx + 4] = OFFSET(sp); + temp = FREESPACE(sp); + _DIAGASSERT(temp >= OVFLSIZE); + sp[ndx + 3] = (uint16_t)(temp - OVFLSIZE); + sp[ndx + 1] = ovfl_num; + sp[ndx + 2] = OVFLPAGE; + sp[0] = ndx + 2; +#ifdef HASH_STATISTICS + hash_overflows++; +#endif + return (bufp->ovfl); +} + +/* + * Returns: + * 0 indicates SUCCESS + * -1 indicates FAILURE + */ +int +__get_page(HTAB *hashp, char *p, uint32_t bucket, int is_bucket, int is_disk, + int is_bitmap) +{ + int fd, page, size; + ssize_t rsize; + uint16_t *bp; + size_t temp; + + fd = hashp->fp; + size = hashp->BSIZE; + + if ((fd == -1) || !is_disk) { + PAGE_INIT(p); + return (0); + } + if (is_bucket) + page = BUCKET_TO_PAGE(bucket); + else + page = OADDR_TO_PAGE(bucket); + if ((rsize = pread(fd, p, (size_t)size, (off_t)page << hashp->BSHIFT)) == -1) + return (-1); + bp = (uint16_t *)(void *)p; + if (!rsize) + bp[0] = 0; /* We hit the EOF, so initialize a new page */ + else + if (rsize != size) { + errno = EFTYPE; + return (-1); + } + if (!is_bitmap && !bp[0]) { + PAGE_INIT(p); + } else + if (hashp->LORDER != BYTE_ORDER) { + int i, max; + + if (is_bitmap) { + max = (uint32_t)hashp->BSIZE >> 2; /* divide by 4 */ + for (i = 0; i < max; i++) + M_32_SWAP(((int *)(void *)p)[i]); + } else { + M_16_SWAP(bp[0]); + max = bp[0] + 2; + for (i = 1; i <= max; i++) + M_16_SWAP(bp[i]); + } + } + return (0); +} + +/* + * Write page p to disk + * + * Returns: + * 0 ==> OK + * -1 ==>failure + */ +int +__put_page(HTAB *hashp, char *p, uint32_t bucket, int is_bucket, int is_bitmap) +{ + int fd, page, size; + ssize_t wsize; + + size = hashp->BSIZE; + if ((hashp->fp == -1) && open_temp(hashp)) + return (-1); + fd = hashp->fp; + + if (hashp->LORDER != BYTE_ORDER) { + int i; + int max; + + if (is_bitmap) { + max = (uint32_t)hashp->BSIZE >> 2; /* divide by 4 */ + for (i = 0; i < max; i++) + M_32_SWAP(((int *)(void *)p)[i]); + } else { + max = ((uint16_t *)(void *)p)[0] + 2; + for (i = 0; i <= max; i++) + M_16_SWAP(((uint16_t *)(void *)p)[i]); + } + } + if (is_bucket) + page = BUCKET_TO_PAGE(bucket); + else + page = OADDR_TO_PAGE(bucket); + if ((wsize = pwrite(fd, p, (size_t)size, (off_t)page << hashp->BSHIFT)) == -1) + /* Errno is set */ + return (-1); + if (wsize != size) { + errno = EFTYPE; + return (-1); + } + return (0); +} + +#define BYTE_MASK ((1 << INT_BYTE_SHIFT) -1) +/* + * Initialize a new bitmap page. Bitmap pages are left in memory + * once they are read in. + */ +int +__ibitmap(HTAB *hashp, int pnum, int nbits, int ndx) +{ + uint32_t *ip; + int clearbytes, clearints; + + if ((ip = malloc((size_t)hashp->BSIZE)) == NULL) + return (1); + hashp->nmaps++; + clearints = ((uint32_t)(nbits - 1) >> INT_BYTE_SHIFT) + 1; + clearbytes = clearints << INT_TO_BYTE; + (void)memset(ip, 0, (size_t)clearbytes); + (void)memset(((char *)(void *)ip) + clearbytes, 0xFF, + (size_t)(hashp->BSIZE - clearbytes)); + ip[clearints - 1] = ALL_SET << (nbits & BYTE_MASK); + SETBIT(ip, 0); + hashp->BITMAPS[ndx] = (uint16_t)pnum; + hashp->mapp[ndx] = ip; + return (0); +} + +static uint32_t +first_free(uint32_t map) +{ + uint32_t i, mask; + + mask = 0x1; + for (i = 0; i < BITS_PER_MAP; i++) { + if (!(mask & map)) + return (i); + mask = mask << 1; + } + return (i); +} + +static uint16_t +overflow_page(HTAB *hashp) +{ + uint32_t *freep = NULL; + int max_free, offset, splitnum; + uint16_t addr; + int bit, first_page, free_bit, free_page, i, in_use_bits, j; +#ifdef DEBUG2 + int tmp1, tmp2; +#endif + splitnum = hashp->OVFL_POINT; + max_free = hashp->SPARES[splitnum]; + + free_page = (uint32_t)(max_free - 1) >> (hashp->BSHIFT + BYTE_SHIFT); + free_bit = (max_free - 1) & ((hashp->BSIZE << BYTE_SHIFT) - 1); + + /* Look through all the free maps to find the first free block */ + first_page = (uint32_t)hashp->LAST_FREED >>(hashp->BSHIFT + BYTE_SHIFT); + for ( i = first_page; i <= free_page; i++ ) { + if (!(freep = (uint32_t *)hashp->mapp[i]) && + !(freep = fetch_bitmap(hashp, i))) + return (0); + if (i == free_page) + in_use_bits = free_bit; + else + in_use_bits = (hashp->BSIZE << BYTE_SHIFT) - 1; + + if (i == first_page) { + bit = hashp->LAST_FREED & + ((hashp->BSIZE << BYTE_SHIFT) - 1); + j = bit / BITS_PER_MAP; + bit = bit & ~(BITS_PER_MAP - 1); + } else { + bit = 0; + j = 0; + } + for (; bit <= in_use_bits; j++, bit += BITS_PER_MAP) + if (freep[j] != ALL_SET) + goto found; + } + + /* No Free Page Found */ + hashp->LAST_FREED = hashp->SPARES[splitnum]; + hashp->SPARES[splitnum]++; + offset = hashp->SPARES[splitnum] - + (splitnum ? hashp->SPARES[splitnum - 1] : 0); + +#define OVMSG "HASH: Out of overflow pages. Increase page size\n" + if (offset > SPLITMASK) { + if (++splitnum >= NCACHED) { + (void)write(STDERR_FILENO, OVMSG, sizeof(OVMSG) - 1); + errno = EFBIG; + return (0); + } + hashp->OVFL_POINT = splitnum; + hashp->SPARES[splitnum] = hashp->SPARES[splitnum-1]; + hashp->SPARES[splitnum-1]--; + offset = 1; + } + + /* Check if we need to allocate a new bitmap page */ + if (free_bit == (hashp->BSIZE << BYTE_SHIFT) - 1) { + free_page++; + if (free_page >= NCACHED) { + (void)write(STDERR_FILENO, OVMSG, sizeof(OVMSG) - 1); + errno = EFBIG; + return (0); + } + /* + * This is tricky. The 1 indicates that you want the new page + * allocated with 1 clear bit. Actually, you are going to + * allocate 2 pages from this map. The first is going to be + * the map page, the second is the overflow page we were + * looking for. The init_bitmap routine automatically, sets + * the first bit of itself to indicate that the bitmap itself + * is in use. We would explicitly set the second bit, but + * don't have to if we tell init_bitmap not to leave it clear + * in the first place. + */ + if (__ibitmap(hashp, + (int)OADDR_OF(splitnum, offset), 1, free_page)) + return (0); + hashp->SPARES[splitnum]++; +#ifdef DEBUG2 + free_bit = 2; +#endif + offset++; + if (offset > SPLITMASK) { + if (++splitnum >= NCACHED) { + (void)write(STDERR_FILENO, OVMSG, + sizeof(OVMSG) - 1); + errno = EFBIG; + return (0); + } + hashp->OVFL_POINT = splitnum; + hashp->SPARES[splitnum] = hashp->SPARES[splitnum-1]; + hashp->SPARES[splitnum-1]--; + offset = 0; + } + } else { + /* + * Free_bit addresses the last used bit. Bump it to address + * the first available bit. + */ + free_bit++; + SETBIT(freep, free_bit); + } + + /* Calculate address of the new overflow page */ + addr = OADDR_OF(splitnum, offset); +#ifdef DEBUG2 + (void)fprintf(stderr, "OVERFLOW_PAGE: ADDR: %d BIT: %d PAGE %d\n", + addr, free_bit, free_page); +#endif + return (addr); + +found: + bit = bit + first_free(freep[j]); + SETBIT(freep, bit); +#ifdef DEBUG2 + tmp1 = bit; + tmp2 = i; +#endif + /* + * Bits are addressed starting with 0, but overflow pages are addressed + * beginning at 1. Bit is a bit addressnumber, so we need to increment + * it to convert it to a page number. + */ + bit = 1 + bit + (i * (hashp->BSIZE << BYTE_SHIFT)); + if (bit >= hashp->LAST_FREED) + hashp->LAST_FREED = bit - 1; + + /* Calculate the split number for this page */ + for (i = 0; (i < splitnum) && (bit > hashp->SPARES[i]); i++); + offset = (i ? bit - hashp->SPARES[i - 1] : bit); + if (offset >= SPLITMASK) { + (void)write(STDERR_FILENO, OVMSG, sizeof(OVMSG) - 1); + errno = EFBIG; + return (0); /* Out of overflow pages */ + } + addr = OADDR_OF(i, offset); +#ifdef DEBUG2 + (void)fprintf(stderr, "OVERFLOW_PAGE: ADDR: %d BIT: %d PAGE %d\n", + addr, tmp1, tmp2); +#endif + + /* Allocate and return the overflow page */ + return (addr); +} + +/* + * Mark this overflow page as free. + */ +void +__free_ovflpage(HTAB *hashp, BUFHEAD *obufp) +{ + uint16_t addr; + uint32_t *freep; + int bit_address, free_page, free_bit; + uint16_t ndx; + + addr = obufp->addr; +#ifdef DEBUG1 + (void)fprintf(stderr, "Freeing %d\n", addr); +#endif + ndx = (((uint32_t)addr) >> SPLITSHIFT); + bit_address = + (ndx ? hashp->SPARES[ndx - 1] : 0) + (addr & SPLITMASK) - 1; + if (bit_address < hashp->LAST_FREED) + hashp->LAST_FREED = bit_address; + free_page = ((uint32_t)bit_address >> (hashp->BSHIFT + BYTE_SHIFT)); + free_bit = bit_address & ((hashp->BSIZE << BYTE_SHIFT) - 1); + + if (!(freep = hashp->mapp[free_page])) + freep = fetch_bitmap(hashp, free_page); + /* + * This had better never happen. It means we tried to read a bitmap + * that has already had overflow pages allocated off it, and we + * failed to read it from the file. + */ + _DIAGASSERT(freep != NULL); + CLRBIT(freep, free_bit); +#ifdef DEBUG2 + (void)fprintf(stderr, "FREE_OVFLPAGE: ADDR: %d BIT: %d PAGE %d\n", + obufp->addr, free_bit, free_page); +#endif + __reclaim_buf(hashp, obufp); +} + +/* + * Returns: + * 0 success + * -1 failure + */ +static int +open_temp(HTAB *hashp) +{ + sigset_t set, oset; + char *envtmp; + char namestr[PATH_MAX]; + +#ifndef __minix + if (issetugid()) + envtmp = NULL; + else + envtmp = getenv("TMPDIR"); +#else + envtmp = getenv("TMPDIR"); +#endif + if (-1 == snprintf(namestr, sizeof(namestr), "%s/_hashXXXXXX", + envtmp ? envtmp : _PATH_TMP)) + return -1; + + /* Block signals; make sure file goes away at process exit. */ + (void)sigfillset(&set); + (void)sigprocmask(SIG_BLOCK, &set, &oset); + if ((hashp->fp = mkstemp(namestr)) != -1) { + (void)unlink(namestr); + (void)fcntl(hashp->fp, F_SETFD, FD_CLOEXEC); + } + (void)sigprocmask(SIG_SETMASK, &oset, (sigset_t *)NULL); + return (hashp->fp != -1 ? 0 : -1); +} + +/* + * We have to know that the key will fit, but the last entry on the page is + * an overflow pair, so we need to shift things. + */ +static void +squeeze_key(uint16_t *sp, const DBT *key, const DBT *val) +{ + char *p; + uint16_t free_space, n, off, pageno; + size_t temp; + + p = (char *)(void *)sp; + n = sp[0]; + free_space = FREESPACE(sp); + off = OFFSET(sp); + + pageno = sp[n - 1]; + _DIAGASSERT(off >= key->size); + off -= (uint16_t)key->size; + sp[n - 1] = off; + memmove(p + off, key->data, key->size); + _DIAGASSERT(off >= val->size); + off -= (uint16_t)val->size; + sp[n] = off; + memmove(p + off, val->data, val->size); + sp[0] = n + 2; + sp[n + 1] = pageno; + sp[n + 2] = OVFLPAGE; + temp = PAIRSIZE(key, val); + _DIAGASSERT(free_space >= temp); + FREESPACE(sp) = (uint16_t)(free_space - temp); + OFFSET(sp) = off; +} + +static uint32_t * +fetch_bitmap(HTAB *hashp, int ndx) +{ + if (ndx >= hashp->nmaps) + return (NULL); + if ((hashp->mapp[ndx] = malloc((size_t)hashp->BSIZE)) == NULL) + return (NULL); + if (__get_page(hashp, + (char *)(void *)hashp->mapp[ndx], (uint32_t)hashp->BITMAPS[ndx], 0, 1, 1)) { + free(hashp->mapp[ndx]); + return (NULL); + } + return (hashp->mapp[ndx]); +} + +#ifdef DEBUG4 +void print_chain(HTAB *, uint32_t); +void +print_chain(HTAB *hashp, uint32_t addr) +{ + BUFHEAD *bufp; + uint16_t *bp, oaddr; + + (void)fprintf(stderr, "%d ", addr); + bufp = __get_buf(hashp, addr, NULL, 0); + bp = (uint16_t *)bufp->page; + while (bp[0] && ((bp[bp[0]] == OVFLPAGE) || + ((bp[0] > 2) && bp[2] < REAL_KEY))) { + oaddr = bp[bp[0] - 1]; + (void)fprintf(stderr, "%d ", (int)oaddr); + bufp = __get_buf(hashp, (uint32_t)oaddr, bufp, 0); + bp = (uint16_t *)bufp->page; + } + (void)fprintf(stderr, "\n"); +} +#endif diff --git a/lib/libc/db/hash/ndbm.c b/lib/libc/db/hash/ndbm.c new file mode 100644 index 000000000..b18789814 --- /dev/null +++ b/lib/libc/db/hash/ndbm.c @@ -0,0 +1,119 @@ +/* $NetBSD: ndbm.c,v 1.23 2008/09/11 12:58:00 joerg Exp $ */ +/* from: NetBSD: ndbm.c,v 1.18 2004/04/27 20:03:45 kleink Exp */ + +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: ndbm.c,v 1.23 2008/09/11 12:58:00 joerg Exp $"); +#endif + +/* + * This package provides a dbm compatible interface to the new hashing + * package described in db(3). + */ +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include + +#include +#include "hash.h" + +/* + * Returns: + * *DBM on success + * NULL on failure + */ +DBM * +dbm_open(const char *file, int flags, mode_t mode) +{ + HASHINFO info; + char path[MAXPATHLEN]; + + info.bsize = 4096; + info.ffactor = 40; + info.nelem = 1; + info.cachesize = 0; + info.hash = NULL; + info.lorder = 0; + (void)strncpy(path, file, sizeof(path) - 1); + (void)strncat(path, DBM_SUFFIX, sizeof(path) - strlen(path) - 1); + if ((flags & O_ACCMODE) == O_WRONLY) { + flags &= ~O_WRONLY; + flags |= O_RDWR; + } + return ((DBM *)__hash_open(path, flags, mode, &info, 0)); +} + +void +dbm_close(DBM *db) +{ + (void)(db->close)(db); +} + +int +dbm_error(DBM *db) +{ + HTAB *hp; + + hp = db->internal; + return (hp->err); +} + +int +dbm_clearerr(DBM *db) +{ + HTAB *hp; + + hp = db->internal; + hp->err = 0; + return (0); +} + +int +dbm_dirfno(DBM *db) +{ + HTAB *hp; + + hp = db->internal; + return hp->fp; +} diff --git a/lib/libc/db/hash/ndbmdatum.c b/lib/libc/db/hash/ndbmdatum.c new file mode 100644 index 000000000..2d65fafd0 --- /dev/null +++ b/lib/libc/db/hash/ndbmdatum.c @@ -0,0 +1,162 @@ +/* $NetBSD: ndbmdatum.c,v 1.4 2008/09/11 12:58:00 joerg Exp $ */ +/* from: NetBSD: ndbm.c,v 1.18 2004/04/27 20:03:45 kleink Exp */ + +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: ndbmdatum.c,v 1.4 2008/09/11 12:58:00 joerg Exp $"); +#endif + +/* + * This package provides a dbm compatible interface to the new hashing + * package described in db(3). + */ +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include + +#include +#include "hash.h" + +/* + * Returns: + * DATUM on success + * NULL on failure + */ +datum +dbm_fetch(DBM *db, datum key) +{ + datum retdata; + int status; + DBT dbtkey, dbtretdata; + + dbtkey.data = key.dptr; + dbtkey.size = key.dsize; + status = (db->get)(db, &dbtkey, &dbtretdata, 0); + if (status) { + dbtretdata.data = NULL; + dbtretdata.size = 0; + } + retdata.dptr = dbtretdata.data; + retdata.dsize = dbtretdata.size; + return (retdata); +} + +/* + * Returns: + * DATUM on success + * NULL on failure + */ +datum +dbm_firstkey(DBM *db) +{ + int status; + datum retkey; + DBT dbtretkey, dbtretdata; + + status = (db->seq)(db, &dbtretkey, &dbtretdata, R_FIRST); + if (status) + dbtretkey.data = NULL; + retkey.dptr = dbtretkey.data; + retkey.dsize = dbtretkey.size; + return (retkey); +} + +/* + * Returns: + * DATUM on success + * NULL on failure + */ +datum +dbm_nextkey(DBM *db) +{ + int status; + datum retkey; + DBT dbtretkey, dbtretdata; + + status = (db->seq)(db, &dbtretkey, &dbtretdata, R_NEXT); + if (status) + dbtretkey.data = NULL; + retkey.dptr = dbtretkey.data; + retkey.dsize = dbtretkey.size; + return (retkey); +} + +/* + * Returns: + * 0 on success + * <0 failure + */ +int +dbm_delete(DBM *db, datum key) +{ + int status; + DBT dbtkey; + + dbtkey.data = key.dptr; + dbtkey.size = key.dsize; + status = (db->del)(db, &dbtkey, 0); + if (status) + return (-1); + else + return (0); +} + +/* + * Returns: + * 0 on success + * <0 failure + * 1 if DBM_INSERT and entry exists + */ +int +dbm_store(DBM *db, datum key, datum data, int flags) +{ + DBT dbtkey, dbtdata; + + dbtkey.data = key.dptr; + dbtkey.size = key.dsize; + dbtdata.data = data.dptr; + dbtdata.size = data.dsize; + return ((db->put)(db, &dbtkey, &dbtdata, + (u_int)((flags == DBM_INSERT) ? R_NOOVERWRITE : 0))); +} diff --git a/lib/libc/db/hash/page.h b/lib/libc/db/hash/page.h new file mode 100644 index 000000000..40dc77d02 --- /dev/null +++ b/lib/libc/db/hash/page.h @@ -0,0 +1,90 @@ +/* $NetBSD: page.h,v 1.8 2008/08/26 21:18:38 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)page.h 8.2 (Berkeley) 5/31/94 + */ + +/* + * Definitions for hashing page file format. + */ + +/* + * routines dealing with a data page + * + * page format: + * +------------------------------+ + * p | n | keyoff | datoff | keyoff | + * +------------+--------+--------+ + * | datoff | free | ptr | --> | + * +--------+---------------------+ + * | F R E E A R E A | + * +--------------+---------------+ + * | <---- - - - | data | + * +--------+-----+----+----------+ + * | key | data | key | + * +--------+----------+----------+ + * + * Pointer to the free space is always: p[p[0] + 2] + * Amount of free space on the page is: p[p[0] + 1] + */ + +/* + * How many bytes required for this pair? + * 2 shorts in the table at the top of the page + room for the + * key and room for the data + * + * We prohibit entering a pair on a page unless there is also room to append + * an overflow page. The reason for this it that you can get in a situation + * where a single key/data pair fits on a page, but you can't append an + * overflow page and later you'd have to split the key/data and handle like + * a big pair. + * You might as well do this up front. + */ + +#define PAIRSIZE(K,D) (2*sizeof(uint16_t) + (K)->size + (D)->size) +#define BIGOVERHEAD (4*sizeof(uint16_t)) +#define KEYSIZE(K) (4*sizeof(uint16_t) + (K)->size); +#define OVFLSIZE (2*sizeof(uint16_t)) +#define FREESPACE(P) ((P)[(P)[0]+1]) +#define OFFSET(P) ((P)[(P)[0]+2]) +#define PAIRFITS(P,K,D) \ + (((P)[2] >= REAL_KEY) && \ + (PAIRSIZE((K),(D)) + OVFLSIZE) <= FREESPACE((P))) +#define PAGE_META(N) (((N)+3) * sizeof(uint16_t)) + +typedef struct { + BUFHEAD *newp; + BUFHEAD *oldp; + BUFHEAD *nextp; + uint16_t next_addr; +} SPLIT_RETURN; diff --git a/lib/libc/db/man/Makefile.inc b/lib/libc/db/man/Makefile.inc new file mode 100644 index 000000000..f45c41630 --- /dev/null +++ b/lib/libc/db/man/Makefile.inc @@ -0,0 +1,15 @@ +# $NetBSD: Makefile.inc,v 1.10 2004/04/30 21:13:23 kleink Exp $ +# @(#)Makefile.inc 8.1 (Berkeley) 6/4/93 + +.PATH: ${.CURDIR}/db/man + +MAN+= btree.3 dbm_clearerr.3 dbopen.3 hash.3 recno.3 mpool.3 +MLINKS+= dbm_clearerr.3 dbm_close.3 dbm_clearerr.3 dbm_delete.3 +MLINKS+= dbm_clearerr.3 dbm_dirfno.3 dbm_clearerr.3 dbm_error.3 +MLINKS+= dbm_clearerr.3 dbm_fetch.3 dbm_clearerr.3 dbm_firstkey.3 +MLINKS+= dbm_clearerr.3 dbm_nextkey.3 dbm_clearerr.3 dbm_open.3 +MLINKS+= dbm_clearerr.3 dbm_store.3 dbm_clearerr.3 ndbm.3 +MLINKS+= dbopen.3 db.3 +MLINKS+= mpool.3 mpool_open.3 mpool.3 mpool_filter.3 mpool.3 mpool_new.3 +MLINKS+= mpool.3 mpool_get.3 mpool.3 mpool_put.3 mpool.3 mpool_sync.3 +MLINKS+= mpool.3 mpool_close.3 diff --git a/lib/libc/db/man/btree.3 b/lib/libc/db/man/btree.3 new file mode 100644 index 000000000..193638696 --- /dev/null +++ b/lib/libc/db/man/btree.3 @@ -0,0 +1,255 @@ +.\" $NetBSD: btree.3,v 1.12 2010/03/22 19:30:53 joerg Exp $ +.\" +.\" Copyright (c) 1990, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)btree.3 8.4 (Berkeley) 8/18/94 +.\" +.Dd April 17, 2003 +.Dt BTREE 3 +.Os +.Sh NAME +.Nm btree +.Nd btree database access method +.Sh SYNOPSIS +.In sys/types.h +.In db.h +.Sh DESCRIPTION +The routine +.Fn dbopen +is the library interface to database files. +One of the supported file formats is btree files. +The general description of the database access methods is in +.Xr dbopen 3 , +this manual page describes only the btree specific information. +.Pp +The btree data structure is a sorted, balanced tree structure storing +associated key/data pairs. +.Pp +The btree access method specific data structure provided to +.Fn dbopen +is defined in the +.In db.h +include file as follows: +.Bd -literal +typedef struct { + u_long flags; + u_int cachesize; + int maxkeypage; + int minkeypage; + u_int psize; + int (*compare)(const DBT *key1, const DBT *key2); + size_t (*prefix)(const DBT *key1, const DBT *key2); + int lorder; +} BTREEINFO; +.Ed +.Pp +The elements of this structure are as follows: +.Bl -tag -width maxkeypagex +.It Fa flags +The flag value is specified by or'ing any of the following values: +.Bl -tag -width R_DUP -offset indent +.It Dv R_DUP +Permit duplicate keys in the tree, i.e. permit insertion if the key to +be inserted already exists in the tree. +The default behavior, as described in +.Xr dbopen 3 , +is to overwrite a matching key when inserting a new key or to fail if +the +.Dv R_NOOVERWRITE +flag is specified. +The +.Dv R_DUP +flag is overridden by the +.Dv R_NOOVERWRITE +flag, and if the +.Dv R_NOOVERWRITE +flag is specified, attempts to insert duplicate keys into the tree +will fail. +.Pp +If the database contains duplicate keys, the order of retrieval of +key/data pairs is undefined if the +.Em get +routine is used, however, +.Em seq +routine calls with the +.Dv R_CURSOR +flag set will always return the logical +.Dq first +of any group of duplicate keys. +.El +.It Fa cachesize +A suggested maximum size (in bytes) of the memory cache. +This value is +.Em only +advisory, and the access method will allocate more memory rather than +fail. +Since every search examines the root page of the tree, caching the +most recently used pages substantially improves access time. +In addition, physical writes are delayed as long as possible, so a +moderate cache can reduce the number of I/O operations significantly. +Obviously, using a cache increases (but only increases) the likelihood +of corruption or lost data if the system crashes while a tree is being +modified. +If +.Fa cachesize +is 0 (no size is specified) a default cache is used. +.It Fa maxkeypage +The maximum number of keys which will be stored on any single page. +Not currently implemented. +.\" The maximum number of keys which will be stored on any single page. +.\" Because of the way the btree data structure works, +.\" .Fa maxkeypage +.\" must always be greater than or equal to 2. +.\" If +.\" .Fa maxkeypage +.\" is 0 (no maximum number of keys is specified) the page fill factor is +.\" made as large as possible (which is almost invariably what is wanted). +.It Fa minkeypage +The minimum number of keys which will be stored on any single page. +This value is used to determine which keys will be stored on overflow +pages, i.e., if a key or data item is longer than the pagesize divided +by the +.Fa minkeypage +value, it will be stored on overflow pages instead of in the page +itself. +If +.Fa minkeypage +is 0 (no minimum number of keys is specified) a value of 2 is used. +.It Fa psize +Page size is the size (in bytes) of the pages used for nodes in the +tree. +The minimum page size is 512 bytes and the maximum page size is 64K. +If +.Fa psize +is 0 (no page size is specified) a page size is chosen based on the +underlying file system I/O block size. +.It Fa compare +Compare is the key comparison function. +It must return an integer less than, equal to, or greater than zero if +the first key argument is considered to be respectively less than, +equal to, or greater than the second key argument. +The same comparison function must be used on a given tree every time +it is opened. +If +.Fa compare +is +.Dv NULL +(no comparison function is specified), the keys are compared +lexically, with shorter keys considered less than longer keys. +.It Fa prefix +Prefix is the prefix comparison function. +If specified, this routine must return the number of bytes of the +second key argument which are necessary to determine that it is +greater than the first key argument. +If the keys are equal, the key length should be returned. +Note, the usefulness of this routine is very data dependent, but, in +some data sets can produce significantly reduced tree sizes and search +times. +If +.Fa prefix +is +.Dv NULL +(no prefix function is specified), +.Em and +no comparison function is specified, a default lexical comparison +routine is used. +If +.Fa prefix +is +.Dv NULL +and a comparison routine is specified, no prefix comparison is done. +.It Fa lorder +The byte order for integers in the stored database metadata. +The number should represent the order as an integer; for example, +big endian order would be the number 4,321. +If +.Fa lorder +is 0 (no order is specified) the current host order is used. +.El +.Pp +If the file already exists (and the +.Dv O_TRUNC +flag is not specified), the values specified for the parameters flags, +lorder and psize are ignored in favor of the values used when the tree +was created. +.Pp +Forward sequential scans of a tree are from the least key to the +greatest. +.Pp +Space freed up by deleting key/data pairs from the tree is never +reclaimed, although it is normally made available for reuse. +This means that the btree storage structure is grow-only. +The only solutions are to avoid excessive deletions, or to create a +fresh tree periodically from a scan of an existing one. +.Pp +Searches, insertions, and deletions in a btree will all complete in +O lg base N where base is the average fill factor. +Often, inserting ordered data into btrees results in a low fill +factor. +This implementation has been modified to make ordered insertion the +best case, resulting in a much better than normal page fill factor. +.Sh ERRORS +The +.Nm +access method routines may fail and set +.Va errno +for any of the errors specified for the library routine +.Xr dbopen 3 . +.Sh SEE ALSO +.Xr dbopen 3 , +.Xr hash 3 , +.Xr mpool 3 , +.Xr recno 3 +.Pp +.Rs +.%T "The Ubiquitous B-tree" +.%A "Douglas Comer" +.%J "ACM Comput. Surv." +.%V 2 +.%N 11 +.%D June 1979 +.%P 121-138 +.Re +.Rs +.%T "Prefix B-trees" +.%A "Bayer" +.%A "Unterauer" +.%J "ACM Transactions on Database Systems" +.%V Vol. 2 +.%N 1 +.%D March 1977 +.%P 11-26 +.Re +.Rs +.%B "The Art of Computer Programming Vol. 3: Sorting and Searching" +.%A "D.E. Knuth" +.%D 1968 +.%P 471-480 +.Re +.Sh BUGS +Only big and little endian byte order is supported. diff --git a/lib/libc/db/man/dbm_clearerr.3 b/lib/libc/db/man/dbm_clearerr.3 new file mode 100644 index 000000000..d9f06721c --- /dev/null +++ b/lib/libc/db/man/dbm_clearerr.3 @@ -0,0 +1,306 @@ +.\" $NetBSD: dbm_clearerr.3,v 1.5 2010/05/05 06:55:57 jruoho Exp $ +.\" +.\" Copyright (c) 2004 The NetBSD Foundation, Inc. +.\" All rights reserved. +.\" +.\" This code is derived from software contributed to The NetBSD Foundation +.\" by Klaus Klein. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS +.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS +.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGE. +.\" +.Dd May 5, 2010 +.Dt DBM_CLEARERR 3 +.Os +.Sh NAME +.Nm dbm_clearerr , +.Nm dbm_close , +.Nm dbm_delete , +.Nm dbm_dirfno , +.Nm dbm_error , +.Nm dbm_fetch , +.Nm dbm_firstkey , +.Nm dbm_nextkey , +.Nm dbm_open , +.Nm dbm_store , +.Nm ndbm +.Nd database functions +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In ndbm.h +.Ft int +.Fn dbm_clearerr "DBM *db" +.Ft void +.Fn dbm_close "DBM *db" +.Ft int +.Fn dbm_delete "DBM *db" "datum key" +.Ft int +.Fn dbm_dirfno "DBM *db" +.Ft int +.Fn dbm_error "DBM *db" +.Ft datum +.Fn dbm_fetch "DBM *db" "datum key" +.Ft datum +.Fn dbm_firstkey "DBM *db" +.Ft datum +.Fn dbm_nextkey "DBM *db" +.Ft DBM * +.Fn dbm_open "const char *file" "int open_flags" "mode_t file_mode" +.Ft int +.Fn dbm_store "DBM *db" "datum key" "datum content" "int store_mode" +.Sh DESCRIPTION +The +.Nm ndbm +facility provides access to hash database files. +.Pp +Two data types are fundamental to the +.Nm ndbm +facility. +.Fa DBM +serves as a handle to a database. +It is an opaque type. +.Pp +The other data type is +.Fa datum , +which is a structure type which includes the following members: +.Bd -literal -offset indent +void * dptr +size_t dsize +.Ed +.Pp +A +.Fa datum +is thus given by +.Fa dptr +pointing at an object of +.Fa dsize +bytes in length. +.Pp +The +.Fn dbm_open +function opens a database. +The +.Fa file +argument is the pathname which the actual database file pathname +is based on. +This implementation uses a single file with the suffix +.Pa .db +appended to +.Fa file . +The +.Fa open_flags +argument has the same meaning as the +.Fa flags +argument to +.Xr open 2 +except that when opening a database for write-only access the file +is opened for read/write access, and the +.Dv O_APPEND +flag must not be specified. +The +.Fa file_mode +argument has the same meaning as the +.Fa mode +argument to +.Xr open 2 . +.Pp +For the following functions, the +.Fa db +argument is a handle previously returned by a call to +.Fn dbm_open . +.Pp +The +.Fn dbm_close +function closes a database. +.Pp +The +.Fn dbm_fetch +function retrieves a record from the database. +The +.Fa key +argument is a +.Fa datum +that identifies the record to be fetched. +.Pp +The +.Fn dbm_store +function stores a record into the database. +The +.Fa key +argument is a +.Fa datum +that identifies the record to be stored. +The +.Fa content +argument is a +.Fa datum +that specifies the value of the record to be stored. +The +.Fa store_mode +argument specifies the behavior of +.Fn dbm_store +if a record matching +.Fa key +is already present in the database, +.Fa db . +.Fa store_mode +must be one of the following: +.Bl -tag -width DBM_REPLACEXX -offset indent +.It Dv DBM_INSERT +If a record matching +.Fa key +is already present, it is left unchanged. +.It Dv DBM_REPLACE +If a record matching +.Fa key +is already present, its value is replaced by +.Fa content . +.El +.Pp +If no record matching +.Fa key +is present, a new record is inserted regardless of +.Fa store_mode . +.Pp +The +.Fn dbm_delete +function deletes a record from the database. +The +.Fa key +argument is a +.Fa datum +that identifies the record to be deleted. +.Pp +The +.Fn dbm_firstkey +function returns the first key in the database. +.Pp +The +.Fn dbm_nextkey +function returns the next key in the database. +In order to be meaningful, it must be preceded by a call to +.Fn dbm_firstkey . +.Pp +The +.Fn dbm_error +function returns the error indicator of the database. +.Pp +The +.Fn dbm_clearerr +function clears the error indicator of the database. +.Pp +The +.Fn dbm_dirfno +function returns the file descriptor of the underlying database file. +.Sh IMPLEMENTATION NOTES +The +.Nm ndbm +facility is implemented on top of the +.Xr hash 3 +access method of the +.Xr db 3 +database facility. +.Sh RETURN VALUES +The +.Fn dbm_open +function returns a pointer to a +.Fa DBM +when successful; otherwise a null pointer is returned. +.Pp +The +.Fn dbm_close +function returns no value. +.Pp +The +.Fn dbm_fetch +function returns a content +.Fa datum ; +if no record matching +.Fa key +was found or if an error occured, its +.Fa dptr +member is a null pointer. +.Pp +The +.Fn dbm_store +function returns 0 when then record was successfully inserted; +it returns 1 when called with +.Fa store_mode +being +.Dv DBM_INSERT +and a record matching +.Fa key +is already present; +otherwise a negative value is returned. +.Pp +The +.Fn dbm_delete +function returns 0 when the record was successfully deleted; +otherwise a negative value is returned. +.Pp +The +.Fn dbm_firstkey +and +.Fn dbm_nextkey +functions return a key +.Fa datum . +When the end of the database is reached or if an error occured, its +.Fa dptr +member is a null pointer. +.Pp +The +.Fn dbm_error +function returns 0 if the error indicator is clear; +if the error indicator is set a non-zero value is returned. +.Pp +The +.Fn dbm_clearerr +function always returns 0. +.Pp +The +.Fn dbm_dirfno +function returns the file descriptor of the underlying database file. +.Sh ERRORS +No errors are defined. +.Sh SEE ALSO +.Xr open 2 , +.Xr db 3 , +.Xr hash 3 +.Sh STANDARDS +The +.Fn dbm_clearerr , +.Fn dbm_close , +.Fn dbm_delete , +.Fn dbm_error , +.Fn dbm_fetch , +.Fn dbm_firstkey , +.Fn dbm_nextkey , +.Fn dbm_open , +and +.Fn dbm_store +functions conform to +.St -xpg4.2 +and +.St -susv2 . +The +.Fn dbm_dirfno +function is an extension. diff --git a/lib/libc/db/man/dbopen.3 b/lib/libc/db/man/dbopen.3 new file mode 100644 index 000000000..87b62f221 --- /dev/null +++ b/lib/libc/db/man/dbopen.3 @@ -0,0 +1,534 @@ +.\" $NetBSD: dbopen.3,v 1.18 2010/03/22 19:30:53 joerg Exp $ +.\" +.\" Copyright (c) 1990, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)dbopen.3 8.5 (Berkeley) 1/2/94 +.\" +.Dd April 17, 2003 +.Dt DBOPEN 3 +.Os +.Sh NAME +.Nm dbopen , +.Nm db +.Nd database access methods +.Sh SYNOPSIS +.In sys/types.h +.In limits.h +.In db.h +.In fcntl.h +.Ft DB * +.Fn dbopen "const char *file" "int flags" "mode_t mode" \ +"DBTYPE type" "const void *openinfo" +.Sh DESCRIPTION +.Nm +is the library interface to database files. +The supported file formats are btree, hashed, and UNIX file oriented. +The btree format is a representation of a sorted, balanced tree +structure. +The hashed format is an extensible, dynamic hashing scheme. +The flat-file format is a byte stream file with fixed or variable +length records. +The formats and file format specific information are described in +detail in their respective manual pages +.Xr btree 3 , +.Xr hash 3 , +and +.Xr recno 3 . +.Pp +.Nm +opens +.Fa file +for reading and/or writing. +Files never intended to be preserved on disk may be created by setting +the file parameter to +.Dv NULL . +.Pp +The +.Fa flags +and +.Fa mode +arguments are as specified to the +.Xr open 2 +routine, however, only the +.Dv O_CREAT , +.Dv O_EXCL , +.Dv O_EXLOCK , +.Dv O_NONBLOCK , +.Dv O_RDONLY , +.Dv O_RDWR , +.Dv O_SHLOCK , +and +.Dv O_TRUNC +flags are meaningful. +(Note, opening a database file +.Dv O_WRONLY +is not possible.) +.\"Three additional options may be specified by or'ing +.\"them into the +.\".Fa flags +.\"argument. +.\".Pp +.\".Dv DB_LOCK +.\"Do the necessary locking in the database to support concurrent access. +.\"If concurrent access isn't needed or the database is read-only this +.\"flag should not be set, as it tends to have an associated performance +.\"penalty. +.\".Pp +.\".Dv DB_SHMEM +.\"Place the underlying memory pool used by the database in shared +.\"memory. +.\"Necessary for concurrent access. +.\".Pp +.\".Dv DB_TXN +.\"Support transactions in the database. +.\"The +.\".Dv DB_LOCK +.\"and +.\".Dv DB_SHMEM +.\"flags must be set as well. +.Pp +The +.Fa type +argument is of type +.Vt DBTYPE +(as defined in the +.In db.h +include file) and may be set to +.Dv DB_BTREE , +.Dv DB_HASH , +or +.Dv DB_RECNO . +.Pp +The +.Fa openinfo +argument is a pointer to an access method specific structure described +in the access method's manual page. +If +.Fa openinfo +is +.Dv NULL , +each access method will use defaults appropriate for the system and +the access method. +.Pp +.Nm +returns a pointer to a DB structure on success and +.Dv NULL +on error. +The DB structure is defined in the +.In db.h +include file, and contains at least the following fields: +.Bd -literal +typedef struct { + DBTYPE type; + int (*close)(const DB *db); + int (*del)(const DB *db, const DBT *key, u_int flags); + int (*fd)(const DB *db); + int (*get)(const DB *db, DBT *key, DBT *data, u_int flags); + int (*put)(const DB *db, DBT *key, const DBT *data, + u_int flags); + int (*sync)(const DB *db, u_int flags); + int (*seq)(const DB *db, DBT *key, DBT *data, u_int flags); +} DB; +.Ed +.Pp +These elements describe a database type and a set of functions +performing various actions. +These functions take a pointer to a structure as returned by +.Nm , +and sometimes one or more pointers to key/data structures and a flag +value. +.Bl -tag -width closex +.It Fa type +The type of the underlying access method (and file format). +.It Fa close +A pointer to a routine to flush any cached information to disk, free +any allocated resources, and close the underlying file(s). +Since key/data pairs may be cached in memory, failing to sync the file +with a +.Fa close +or +.Fa sync +function may result in inconsistent or lost information. +.Fa close +routines return \-1 on error (setting +.Va errno ) +and 0 on success. +.It Fa del +A pointer to a routine to remove key/data pairs from the database. +.Pp +The parameter +.Fa flag +may be set to the following value: +.Bl -tag -width R_CURSORX +.It Dv R_CURSOR +Delete the record referenced by the cursor. +The cursor must have previously been initialized. +.El +.Pp +.Fa delete +routines return \-1 on error (setting +.Va errno ) , +0 on success, and 1 if the specified +.Fa key +was not in the file. +.It Fa fd +A pointer to a routine which returns a file descriptor representative +of the underlying database. +A file descriptor referencing the same file will be returned to all +processes which call +.Nm +with the same +.Fa file +name. +This file descriptor may be safely used as an argument to the +.Xr fcntl 2 +and +.Xr flock 2 +locking functions. +The file descriptor is not necessarily associated with any of the +underlying files used by the access method. +No file descriptor is available for in memory databases. +.Fa fd +routines return \-1 on error (setting +.Va errno ) , +and the file descriptor on success. +.It Fa get +A pointer to a routine which is the interface for keyed retrieval from +the database. +The address and length of the data associated with the specified +.Fa key +are returned in the structure referenced by +.Fa data . +.Fa get +routines return \-1 on error (setting +.Va errno ) , +0 on success, and 1 if the +.Fa key +was not in the file. +.It Fa put +A pointer to a routine to store key/data pairs in the database. +.Pp +The parameter +.Fa flag +may be set to one of the following values: +.Bl -tag -width R_NOOVERWRITEX +.It Dv R_CURSOR +Replace the key/data pair referenced by the cursor. +The cursor must have previously been initialized. +.It Dv R_IAFTER +Append the data immediately after the data referenced by +.Fa key , +creating a new key/data pair. +The record number of the appended key/data pair is returned in the +.Fa key +structure. +(Applicable only to the +.Dv DB_RECNO +access method.) +.It Dv R_IBEFORE +Insert the data immediately before the data referenced by +.Fa key , +creating a new key/data pair. +The record number of the inserted key/data pair is returned in the +.Fa key +structure. +(Applicable only to the +.Dv DB_RECNO +access method.) +.It Dv R_NOOVERWRITE +Enter the new key/data pair only if the key does not previously +exist. +.It Dv R_SETCURSOR +Store the key/data pair, setting or initializing the position of the +cursor to reference it. +(Applicable only to the +.Dv DB_BTREE +and +.Dv DB_RECNO +access methods.) +.El +.Pp +.Dv R_SETCURSOR +is available only for the +.Dv DB_BTREE +and +.Dv DB_RECNO +access methods because it implies that the keys have an inherent order +which does not change. +.Pp +.Dv R_IAFTER +and +.Dv R_IBEFORE +are available only for the +.Dv DB_RECNO +access method because they each imply that the access method is able +to create new keys. +This is only true if the keys are ordered and independent, record +numbers for example. +.Pp +The default behavior of the +.Fa put +routines is to enter the new key/data pair, replacing any previously +existing key. +.Pp +.Fa put +routines return \-1 on error (setting +.Va errno ) , +0 on success, and 1 if the +.Dv R_NOOVERWRITE +.Fa flag +was set and the key already exists in the file. +.It Fa seq +A pointer to a routine which is the interface for sequential +retrieval from the database. +The address and length of the key are returned in the structure +referenced by +.Fa key , +and the address and length of the data are returned in the +structure referenced by +.Fa data . +.Pp +Sequential key/data pair retrieval may begin at any time, and the +position of the +.Dq cursor +is not affected by calls to the +.Fa del , +.Fa get , +.Fa put , +or +.Fa sync +routines. +Modifications to the database during a sequential scan will be +reflected in the scan, i.e., records inserted behind the cursor will +not be returned while records inserted in front of the cursor will be +returned. +.Pp +The flag value +.Em must +be set to one of the following values: +.Bl -tag -width R_CURSORX +.It Dv R_CURSOR +The data associated with the specified key is returned. +This differs from the +.Fa get +routines in that it sets or initializes the cursor to the location of +the key as well. +(Note, for the +.Dv DB_BTREE +access method, the returned key is not necessarily an exact match for +the specified key. +The returned key is the smallest key greater than or equal to the +specified key, permitting partial key matches and range searches.) +.It Dv R_FIRST +The first key/data pair of the database is returned, and the cursor +is set or initialized to reference it. +.It Dv R_LAST +The last key/data pair of the database is returned, and the cursor +is set or initialized to reference it. +(Applicable only to the +.Dv DB_BTREE +and +.Dv DB_RECNO +access methods.) +.It Dv R_NEXT +Retrieve the key/data pair immediately after the cursor. +If the cursor is not yet set, this is the same as the +.Dv R_FIRST +flag. +.It Dv R_PREV +Retrieve the key/data pair immediately before the cursor. +If the cursor is not yet set, this is the same as the +.Dv R_LAST +flag. +(Applicable only to the +.Dv DB_BTREE +and +.Dv DB_RECNO +access methods.) +.El +.Pp +.Dv R_LAST +and +.Dv R_PREV +are available only for the +.Dv DB_BTREE +and +.Dv DB_RECNO +access methods because they each imply that the keys have an inherent +order which does not change. +.Pp +.Fa seq +routines return \-1 on error (setting +.Va errno ) , +0 on success and 1 if there are no key/data pairs less than or greater +than the specified or current key. +If the +.Dv DB_RECNO +access method is being used, and if the database file is a character +special file and no complete key/data pairs are currently available, +the +.Fa seq +routines return 2. +.It Fa sync +A pointer to a routine to flush any cached information to disk. +If the database is in memory only, the +.Fa sync +routine has no effect and will always succeed. +.Pp +The flag value may be set to the following value: +.Bl -tag -width ".Dv R_RECNOSYNC" +.It Dv R_RECNOSYNC +If the +.Dv DB_RECNO +access method is being used, this flag causes the sync routine to +apply to the btree file which underlies the recno file, not the recno +file itself. +(See the +.Fa bfname +field of the +.Xr recno 3 +manual page for more information.) +.El +.Pp +.Fa sync +routines return \-1 on error (setting +.Va errno ) +and 0 on success. +.El +.Ss KEY/DATA PAIRS +Access to all file types is based on key/data pairs. +Both keys and data are represented by the following data structure: +.Bd -literal +typedef struct { + void *data; + size_t size; +} DBT; +.Ed +.Pp +The elements of the DBT structure are defined as follows: +.Bl -tag -width datax +.It Fa data +A pointer to a byte string. +.It Fa size +The length of the byte string. +.El +.Pp +Key and data byte strings may reference strings of essentially +unlimited length although any two of them must fit into available +memory at the same time. +It should be noted that the access methods provide no guarantees about +byte string alignment. +.Sh ERRORS +The +.Nm +routine may fail and set +.Va errno +for any of the errors specified for the library routines +.Xr open 2 +and +.Xr malloc 3 +or the following: +.Bl -tag -width Er +.It Er EFTYPE +A file is incorrectly formatted. +.It Er EINVAL +A parameter has been specified (hash function, pad byte, etc.) that is +incompatible with the current file specification or which is not +meaningful for the function (for example, use of the cursor without +prior initialization) or there is a mismatch between the version +number of file and the software. +.It Er EFBIG +The key could not be inserted due to limitations in the DB file format +(e.g., a hash database was out of overflow pages). +.El +.Pp +The +.Fa close +routines may fail and set +.Va errno +for any of the errors specified for the library routines +.Xr close 2 , +.Xr read 2 , +.Xr write 2 , +.Xr free 3 , +or +.Xr fsync 2 . +.Pp +The +.Fa del , +.Fa get , +.Fa put , +and +.Fa seq +routines may fail and set +.Va errno +for any of the errors specified for the library routines +.Xr read 2 , +.Xr write 2 , +.Xr free 3 , +or +.Xr malloc 3 . +.Pp +The +.Fa fd +routines will fail and set +.Va errno +to +.Er ENOENT +for in memory databases. +.Pp +The +.Fa sync +routines may fail and set +.Va errno +for any of the errors specified for the library routine +.Xr fsync 2 . +.Sh SEE ALSO +.Xr btree 3 , +.Xr hash 3 , +.Xr mpool 3 , +.Xr recno 3 +.Pp +.Rs +.%T "LIBTP: Portable, Modular Transactions for UNIX" +.%A Margo Seltzer +.%A Michael Olson +.%J USENIX proceedings +.%D Winter 1992 +.Re +.Sh BUGS +The typedef DBT is a mnemonic for +.Dq data base thang , +and was used because no one could think of a reasonable name that +wasn't already used. +.Pp +The file descriptor interface is a kludge and will be deleted in a +future version of the interface. +.Pp +None of the access methods provide any form of concurrent access, +locking, or transactions. diff --git a/lib/libc/db/man/hash.3 b/lib/libc/db/man/hash.3 new file mode 100644 index 000000000..c735a5124 --- /dev/null +++ b/lib/libc/db/man/hash.3 @@ -0,0 +1,167 @@ +.\" $NetBSD: hash.3,v 1.13 2010/03/22 19:30:53 joerg Exp $ +.\" +.\" Copyright (c) 1990, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)hash.3 8.6 (Berkeley) 8/18/94 +.\" +.Dd September 5, 2008 +.Dt HASH 3 +.Os +.Sh NAME +.Nm hash +.Nd hash database access method +.Sh SYNOPSIS +.In sys/types.h +.In db.h +.Sh DESCRIPTION +The routine +.Fn dbopen +is the library interface to database files. +One of the supported file formats is hash files. +The general description of the database access methods is in +.Xr dbopen 3 , +this manual page describes only the hash specific information. +.Pp +The hash data structure is an extensible, dynamic hashing scheme. +.Pp +The access method specific data structure provided to +.Fn dbopen +is defined in the +.In db.h +include file as follows: +.Bd -literal +typedef struct { + u_int bsize; + u_int ffactor; + u_int nelem; + u_int cachesize; + uint32_t (*hash)(const void *, size_t); + int lorder; +} HASHINFO; +.Ed +.Pp +The elements of this structure are as follows: +.Bl -tag -width cachesizex +.It Fa bsize +.Fa bsize +defines the hash table bucket size, and defaults to 4096 for in-memory tables. +If +.Fa bsize +is 0 (no bucket size is specified) a bucket size is chosen based on the +underlying file system I/O block size. +It may be preferable to increase the page size for disk-resident +tables and tables with large data items. +.It Fa ffactor +.Fa ffactor +indicates a desired density within the hash table. +It is an approximation of the number of keys allowed to accumulate in +any one bucket, determining when the hash table grows or shrinks. +The default value is 8. +.It Fa nelem +.Fa nelem +is an estimate of the final size of the hash table. +If not set or set too low, hash tables will expand gracefully as keys +are entered, although a slight performance degradation may be +noticed. +The default value is 1. +.It Fa cachesize +A suggested maximum size, in bytes, of the memory cache. +This value is +.Em only +advisory, and the access method will allocate more memory rather +than fail. +.It Fa hash +.Fa hash +is a user defined hash function. +Since no hash function performs equally well on all possible data, the +user may find that the built-in hash function does poorly on a +particular data set. +User specified hash functions must take two arguments (a pointer to a +byte string and a length) and return a 32-bit quantity to be used as +the hash value. +.It Fa lorder +The byte order for integers in the stored database metadata. +The number should represent the order as an integer; for example, +big endian order would be the number 4,321. +If +.Fa lorder +is 0 (no order is specified) the current host order is used. +If the file already exists, the specified value is ignored and the +value specified when the tree was created is used. +.El +.Pp +If the file already exists (and the +.Dv O_TRUNC +flag is not specified), the values specified for the parameters +.Fa bsize , +.Fa ffactor , +.Fa lorder , +and +.Fa nelem +are ignored and the values specified when the tree was created are +used. +.Pp +If a hash function is specified, +.Fn hash_open +will attempt to determine if the hash function specified is the same +as the one with which the database was created, and will fail if it is +not. +.\".Pp +.\"Backward compatible interfaces to the routines described in +.\".Xr dbm 3 , +.\"and +.\".Xr ndbm 3 +.\"are provided, however these interfaces are not compatible with +.\"previous file formats. +.Sh ERRORS +The +.Nm +access method routines may fail and set +.Va errno +for any of the errors specified for the library routine +.Xr dbopen 3 . +.Sh SEE ALSO +.Xr btree 3 , +.Xr dbopen 3 , +.Xr mpool 3 , +.Xr recno 3 +.Pp +.Rs +.%T "Dynamic Hash Tables" +.%A Per-Ake Larson +.%J Communications of the ACM +.%D April 1988 +.Re +.Rs +.%T "A New Hash Package for UNIX" +.%A Margo Seltzer +.%J USENIX Proceedings +.%D Winter 1991 +.Re +.Sh BUGS +Only big and little endian byte order is supported. diff --git a/lib/libc/db/man/mpool.3 b/lib/libc/db/man/mpool.3 new file mode 100644 index 000000000..fa76eb547 --- /dev/null +++ b/lib/libc/db/man/mpool.3 @@ -0,0 +1,226 @@ +.\" $NetBSD: mpool.3,v 1.9 2003/08/07 16:42:43 agc Exp $ +.\" +.\" Copyright (c) 1990, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)mpool.3 8.1 (Berkeley) 6/4/93 +.\" +.Dd April 17, 2003 +.Dt MPOOL 3 +.Os +.Sh NAME +.Nm mpool , +.Nm mpool_open , +.Nm mpool_filter , +.Nm mpool_new , +.Nm mpool_get , +.Nm mpool_put , +.Nm mpool_sync , +.Nm mpool_close +.Nd shared memory buffer pool +.Sh SYNOPSIS +.In db.h +.In mpool.h +.Ft MPOOL * +.Fn mpool_open "DBT *key" "int fd" "pgno_t pagesize" "pgno_t maxcache" +.Ft void +.Fn mpool_filter "MPOOL *mp" "void (*pgin)(void *, pgno_t, void *)" \ +"void (*pgout)(void *, pgno_t, void *)" "void *pgcookie" +.Ft void * +.Fn mpool_new "MPOOL *mp" "pgno_t *pgnoaddr" +.Ft void * +.Fn mpool_get "MPOOL *mp" "pgno_t pgno" "u_int flags" +.Ft int +.Fn mpool_put "MPOOL *mp" "void *pgaddr" "u_int flags" +.Ft int +.Fn mpool_sync "MPOOL *mp" +.Ft int +.Fn mpool_close "MPOOL *mp" +.Sh DESCRIPTION +.Nm +is the library interface intended to provide page oriented buffer +management of files. +The buffers may be shared between processes. +.Pp +The function +.Nm mpool_open +initializes a memory pool. +The +.Fa key +argument is the byte string used to negotiate between multiple +processes wishing to share buffers. +If the file buffers are mapped in shared memory, all processes using +the same key will share the buffers. +If +.Fa key +is +.Dv NULL , +the buffers are mapped into private memory. +The +.Fa fd +argument is a file descriptor for the underlying file, which must be +seekable. +If +.Fa key +is +.No non- Ns Dv NULL +and matches a file already being mapped, the +.Fa fd +argument is ignored. +.Pp +The +.Fa pagesize +argument is the size, in bytes, of the pages into which the file is +broken up. +The +.Fa maxcache +argument is the maximum number of pages from the underlying file to +cache at any one time. +This value is not relative to the number of processes which share a +file's buffers, but will be the largest value specified by any of the +processes sharing the file. +.Pp +The +.Nm mpool_filter +function is intended to make transparent input and output processing +of the pages possible. +If the +.Fa pgin +function is specified, it is called each time a buffer is read into +the memory pool from the backing file. +If the +.Fa pgout +function is specified, it is called each time a buffer is written into +the backing file. +Both functions are are called with the +.Fa pgcookie +pointer, the page number and a pointer to the page to being read or +written. +.Pp +The function +.Nm mpool_new +takes an MPOOL pointer and an address as arguments. +If a new page can be allocated, a pointer to the page is returned and +the page number is stored into the +.Fa pgnoaddr +address. +Otherwise, +.Dv NULL +is returned and errno is set. +.Pp +The function +.Nm mpool_get +takes a MPOOL pointer and a page number as arguments. +If the page exists, a pointer to the page is returned. +Otherwise, +.Dv NULL +is returned and errno is set. +The flags parameter is not currently used. +.Pp +The function +.Nm mpool_put +unpins the page referenced by +.Fa pgaddr . +.Fa pgaddr +must be an address previously returned by +.Nm mpool_get +or +.Nm mpool_new . +The flag value is specified by or'ing any of the following values: +.Bl -tag -width MPOOL_DIRTYX -offset indent +.It Dv MPOOL_DIRTY +The page has been modified and needs to be written to the backing +file. +.El +.Pp +.Nm mpool_put +returns 0 on success and \-1 if an error occurs. +.Pp +The function +.Nm mpool_sync +writes all modified pages associated with the MPOOL pointer to the +backing file. +.Nm mpool_sync +returns 0 on success and \-1 if an error occurs. +.Pp +The +.Nm mpool_close +function frees up any allocated memory associated with the memory pool +cookie. +Modified pages are +.Em not +written to the backing file. +.Nm mpool_close +returns 0 on success and \-1 if an error occurs. +.Sh ERRORS +The +.Nm mpool_open +function may fail and set +.Va errno +for any of the errors specified for the library routine +.Xr malloc 3 . +.Pp +The +.Nm mpool_get +function may fail and set +.Va errno +for the following: +.Bl -tag -width Er -offset indent +.It Er EINVAL +The requested record doesn't exist. +.El +.Pp +The +.Nm mpool_new +and +.Nm mpool_get +functions may fail and set +.Va errno +for any of the errors specified for the library routines +.Xr read 2 , +.Xr write 2 , +and +.Xr malloc 3 . +.Pp +The +.Nm mpool_sync +function may fail and set +.Va errno +for any of the errors specified for the library routine +.Xr write 2 . +.Pp +The +.Nm mpool_close +function may fail and set +.Va errno +for any of the errors specified for the library routine +.Xr free 3 . +.Sh SEE ALSO +.Xr btree 3 , +.Xr dbopen 3 , +.Xr hash 3 , +.Xr recno 3 diff --git a/lib/libc/db/man/recno.3 b/lib/libc/db/man/recno.3 new file mode 100644 index 000000000..a38871fe7 --- /dev/null +++ b/lib/libc/db/man/recno.3 @@ -0,0 +1,214 @@ +.\" $NetBSD: recno.3,v 1.11 2010/03/22 19:30:53 joerg Exp $ +.\" +.\" Copyright (c) 1990, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)recno.3 8.5 (Berkeley) 8/18/94 +.\" +.Dd April 17, 2003 +.Dt RECNO 3 +.Os +.Sh NAME +.Nm recno +.Nd record number database access method +.Sh SYNOPSIS +.In sys/types.h +.In db.h +.Sh DESCRIPTION +The routine +.Fn dbopen +is the library interface to database files. +One of the supported file formats is record number files. +The general description of the database access methods is in +.Xr dbopen 3 , +this manual page describes only the recno specific information. +.Pp +The record number data structure is either variable or fixed-length +records stored in a flat-file format, accessed by the logical record +number. +The existence of record number five implies the existence of records +one through four, and the deletion of record number one causes +record number five to be renumbered to record number four, as well +as the cursor, if positioned after record number one, to shift down +one record. +.Pp +The recno access method specific data structure provided to +.Fn dbopen +is defined in the +.In db.h +include file as follows: +.Bd -literal +typedef struct { + u_long flags; + u_int cachesize; + u_int psize; + int lorder; + size_t reclen; + uint8_t bval; + char *bfname; +} RECNOINFO; +.Ed +.Pp +The elements of this structure are defined as follows: +.Bl -tag -width cachesizex +.It Fa flags +The flag value is specified by or'ing any of the following values: +.Bl -tag -width R_FIXEDLENX -offset indent +.It Dv R_FIXEDLEN +The records are fixed-length, not byte delimited. +The structure element +.Fa reclen +specifies the length of the record, and the structure element +.Fa bval +is used as the pad character. +Any records, inserted into the database, that are less than +.Fa reclen +bytes long are automatically padded. +.It Dv R_NOKEY +In the interface specified by +.Fn dbopen , +the sequential record retrieval fills in both the caller's key and +data structures. +If the +.Dv R_NOKEY +flag is specified, the cursor routines are not required to fill in the +key structure. +This permits applications to retrieve records at the end of files +without reading all of the intervening records. +.It Dv R_SNAPSHOT +This flag requires that a snapshot of the file be taken when +.Fn dbopen +is called, instead of permitting any unmodified records to be read +from the original file. +.El +.It Fa cachesize +A suggested maximum size, in bytes, of the memory cache. +This value is +.Em only +advisory, and the access method will allocate more memory rather than +fail. +If +.Fa cachesize +is 0 (no size is specified) a default cache is used. +.It Fa psize +The recno access method stores the in-memory copies of its records +in a btree. +This value is the size (in bytes) of the pages used for nodes in that +tree. +If +.Fa psize +is 0 (no page size is specified) a page size is chosen based on the +underlying file system I/O block size. +See +.Xr btree 3 +for more information. +.It Fa lorder +The byte order for integers in the stored database metadata. +The number should represent the order as an integer; for example, +big endian order would be the number 4,321. +If +.Fa lorder +is 0 (no order is specified) the current host order is used. +.It Fa reclen +The length of a fixed-length record. +.It Fa bval +The delimiting byte to be used to mark the end of a record for +variable-length records, and the pad character for fixed-length +records. +If no value is specified, newlines +.Pq Dq \en +are used to mark the end of variable-length records and fixed-length +records are padded with spaces. +.It Fa bfname +The recno access method stores the in-memory copies of its records +in a btree. +If bfname is +.No non- Ns Dv NULL , +it specifies the name of the btree file, as if specified as the file +name for a +.Fn dbopen +of a btree file. +.El +.Pp +The data part of the key/data pair used by the recno access method +is the same as other access methods. +The key is different. +The +.Fa data +field of the key should be a pointer to a memory location of type +recno_t, as defined in the +.In db.h +include file. +This type is normally the largest unsigned integral type available to +the implementation. +The +.Fa size +field of the key should be the size of that type. +.Pp +Because there can be no meta-data associated with the underlying +recno access method files, any changes made to the default values +(e.g., fixed record length or byte separator value) must be explicitly +specified each time the file is opened. +.Pp +In the interface specified by +.Fn dbopen , +using the +.Fa put +interface to create a new record will cause the creation of multiple, +empty records if the record number is more than one greater than the +largest record currently in the database. +.Sh ERRORS +The +.Nm +access method routines may fail and set +.Va errno +for any of the errors specified for the library routine +.Xr dbopen 3 +or the following: +.Bl -tag -width Er +.It Er EINVAL +An attempt was made to add a record to a fixed-length database that +was too large to fit. +.El +.Sh SEE ALSO +.Xr btree 3 , +.Xr dbopen 3 , +.Xr hash 3 , +.Xr mpool 3 +.Pp +.Rs +.%T "Document Processing in a Relational Database System" +.%A Michael Stonebraker +.%A Heidi Stettner +.%A Joseph Kalash +.%A Antonin Guttman +.%A Nadene Lynn +.%J Memorandum No. UCB/ERL M82/32 +.%D May 1982 +.Re +.Sh BUGS +Only big and little endian byte order is supported. diff --git a/lib/libc/db/mpool/Makefile.inc b/lib/libc/db/mpool/Makefile.inc new file mode 100644 index 000000000..c9320fb0a --- /dev/null +++ b/lib/libc/db/mpool/Makefile.inc @@ -0,0 +1,6 @@ +# $NetBSD: Makefile.inc,v 1.4 1995/02/27 13:23:53 cgd Exp $ +# @(#)Makefile.inc 8.1 (Berkeley) 6/4/93 + +.PATH: ${.CURDIR}/db/mpool + +SRCS+= mpool.c diff --git a/lib/libc/db/mpool/README b/lib/libc/db/mpool/README new file mode 100644 index 000000000..13002b30f --- /dev/null +++ b/lib/libc/db/mpool/README @@ -0,0 +1,8 @@ +# $NetBSD: README,v 1.2 1995/02/27 13:24:00 cgd Exp $ +# @(#)README 8.1 (Berkeley) 6/4/93 + +These are the current memory pool routines. +They aren't ready for prime time, yet, and +the interface is expected to change. + +--keith diff --git a/lib/libc/db/mpool/mpool.c b/lib/libc/db/mpool/mpool.c new file mode 100644 index 000000000..c03c7679a --- /dev/null +++ b/lib/libc/db/mpool/mpool.c @@ -0,0 +1,466 @@ +/* $NetBSD: mpool.c,v 1.19 2009/04/22 18:44:06 christos Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: mpool.c,v 1.19 2009/04/22 18:44:06 christos Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include +#include + +#include +#include +#include +#include +#include + +#include + +#define __MPOOLINTERFACE_PRIVATE +#include + +#ifdef __weak_alias +__weak_alias(mpool_close,_mpool_close) +__weak_alias(mpool_filter,_mpool_filter) +__weak_alias(mpool_get,_mpool_get) +__weak_alias(mpool_new,_mpool_new) +__weak_alias(mpool_open,_mpool_open) +__weak_alias(mpool_put,_mpool_put) +__weak_alias(mpool_sync,_mpool_sync) +#endif + +static BKT *mpool_bkt(MPOOL *); +static BKT *mpool_look(MPOOL *, pgno_t); +static int mpool_write(MPOOL *, BKT *); + +/* + * mpool_open -- + * Initialize a memory pool. + */ +/*ARGSUSED*/ +MPOOL * +mpool_open(void *key, int fd, pgno_t pagesize, pgno_t maxcache) +{ + struct stat sb; + MPOOL *mp; + int entry; + + /* + * Get information about the file. + * + * XXX + * We don't currently handle pipes, although we should. + */ + if (fstat(fd, &sb)) + return (NULL); + if (!S_ISREG(sb.st_mode)) { + errno = ESPIPE; + return (NULL); + } + + /* Allocate and initialize the MPOOL cookie. */ + if ((mp = (MPOOL *)calloc(1, sizeof(MPOOL))) == NULL) + return (NULL); + CIRCLEQ_INIT(&mp->lqh); + for (entry = 0; entry < HASHSIZE; ++entry) + CIRCLEQ_INIT(&mp->hqh[entry]); + mp->maxcache = maxcache; + mp->npages = (pgno_t)(sb.st_size / pagesize); + mp->pagesize = pagesize; + mp->fd = fd; + return (mp); +} + +/* + * mpool_filter -- + * Initialize input/output filters. + */ +void +mpool_filter(MPOOL *mp, void (*pgin)(void *, pgno_t, void *), + void (*pgout)(void *, pgno_t, void *), void *pgcookie) +{ + mp->pgin = pgin; + mp->pgout = pgout; + mp->pgcookie = pgcookie; +} + +/* + * mpool_new -- + * Get a new page of memory. + */ +void * +mpool_new( MPOOL *mp, pgno_t *pgnoaddr) +{ + struct _hqh *head; + BKT *bp; + + if (mp->npages == MAX_PAGE_NUMBER) { + (void)fprintf(stderr, "mpool_new: page allocation overflow.\n"); + abort(); + } +#ifdef STATISTICS + ++mp->pagenew; +#endif + /* + * Get a BKT from the cache. Assign a new page number, attach + * it to the head of the hash chain, the tail of the lru chain, + * and return. + */ + if ((bp = mpool_bkt(mp)) == NULL) + return (NULL); + *pgnoaddr = bp->pgno = mp->npages++; + bp->flags = MPOOL_PINNED; + + head = &mp->hqh[HASHKEY(bp->pgno)]; + CIRCLEQ_INSERT_HEAD(head, bp, hq); + CIRCLEQ_INSERT_TAIL(&mp->lqh, bp, q); + return (bp->page); +} + +/* + * mpool_get + * Get a page. + */ +/*ARGSUSED*/ +void * +mpool_get(MPOOL *mp, pgno_t pgno, u_int flags) +{ + struct _hqh *head; + BKT *bp; + off_t off; + ssize_t nr; + + /* Check for attempt to retrieve a non-existent page. */ + if (pgno >= mp->npages) { + errno = EINVAL; + return (NULL); + } + +#ifdef STATISTICS + ++mp->pageget; +#endif + + /* Check for a page that is cached. */ + if ((bp = mpool_look(mp, pgno)) != NULL) { +#ifdef DEBUG + if (bp->flags & MPOOL_PINNED) { + (void)fprintf(stderr, + "mpool_get: page %d already pinned\n", bp->pgno); + abort(); + } +#endif + /* + * Move the page to the head of the hash chain and the tail + * of the lru chain. + */ + head = &mp->hqh[HASHKEY(bp->pgno)]; + CIRCLEQ_REMOVE(head, bp, hq); + CIRCLEQ_INSERT_HEAD(head, bp, hq); + CIRCLEQ_REMOVE(&mp->lqh, bp, q); + CIRCLEQ_INSERT_TAIL(&mp->lqh, bp, q); + + /* Return a pinned page. */ + bp->flags |= MPOOL_PINNED; + return (bp->page); + } + + /* Get a page from the cache. */ + if ((bp = mpool_bkt(mp)) == NULL) + return (NULL); + + /* Read in the contents. */ +#ifdef STATISTICS + ++mp->pageread; +#endif + off = mp->pagesize * pgno; + if ((nr = pread(mp->fd, bp->page, (size_t)mp->pagesize, off)) != (int)mp->pagesize) { + if (nr >= 0) + errno = EFTYPE; + return (NULL); + } + + /* Set the page number, pin the page. */ + bp->pgno = pgno; + bp->flags = MPOOL_PINNED; + + /* + * Add the page to the head of the hash chain and the tail + * of the lru chain. + */ + head = &mp->hqh[HASHKEY(bp->pgno)]; + CIRCLEQ_INSERT_HEAD(head, bp, hq); + CIRCLEQ_INSERT_TAIL(&mp->lqh, bp, q); + + /* Run through the user's filter. */ + if (mp->pgin != NULL) + (mp->pgin)(mp->pgcookie, bp->pgno, bp->page); + + return (bp->page); +} + +/* + * mpool_put + * Return a page. + */ +/*ARGSUSED*/ +int +mpool_put(MPOOL *mp, void *page, u_int flags) +{ + BKT *bp; + +#ifdef STATISTICS + ++mp->pageput; +#endif + bp = (BKT *)(void *)((char *)page - sizeof(BKT)); +#ifdef DEBUG + if (!(bp->flags & MPOOL_PINNED)) { + (void)fprintf(stderr, + "mpool_put: page %d not pinned\n", bp->pgno); + abort(); + } +#endif + bp->flags &= ~MPOOL_PINNED; + bp->flags |= flags & MPOOL_DIRTY; + return (RET_SUCCESS); +} + +/* + * mpool_close + * Close the buffer pool. + */ +int +mpool_close(MPOOL *mp) +{ + BKT *bp; + + /* Free up any space allocated to the lru pages. */ + while ((bp = mp->lqh.cqh_first) != (void *)&mp->lqh) { + CIRCLEQ_REMOVE(&mp->lqh, mp->lqh.cqh_first, q); + free(bp); + } + + /* Free the MPOOL cookie. */ + free(mp); + return (RET_SUCCESS); +} + +/* + * mpool_sync + * Sync the pool to disk. + */ +int +mpool_sync(MPOOL *mp) +{ + BKT *bp; + + /* Walk the lru chain, flushing any dirty pages to disk. */ + for (bp = mp->lqh.cqh_first; + bp != (void *)&mp->lqh; bp = bp->q.cqe_next) + if (bp->flags & MPOOL_DIRTY && + mpool_write(mp, bp) == RET_ERROR) + return (RET_ERROR); + + /* Sync the file descriptor. */ + return (fsync(mp->fd) ? RET_ERROR : RET_SUCCESS); +} + +/* + * mpool_bkt + * Get a page from the cache (or create one). + */ +static BKT * +mpool_bkt(MPOOL *mp) +{ + struct _hqh *head; + BKT *bp; + + /* If under the max cached, always create a new page. */ + if (mp->curcache < mp->maxcache) + goto new; + + /* + * If the cache is max'd out, walk the lru list for a buffer we + * can flush. If we find one, write it (if necessary) and take it + * off any lists. If we don't find anything we grow the cache anyway. + * The cache never shrinks. + */ + for (bp = mp->lqh.cqh_first; + bp != (void *)&mp->lqh; bp = bp->q.cqe_next) + if (!(bp->flags & MPOOL_PINNED)) { + /* Flush if dirty. */ + if (bp->flags & MPOOL_DIRTY && + mpool_write(mp, bp) == RET_ERROR) + return (NULL); +#ifdef STATISTICS + ++mp->pageflush; +#endif + /* Remove from the hash and lru queues. */ + head = &mp->hqh[HASHKEY(bp->pgno)]; + CIRCLEQ_REMOVE(head, bp, hq); + CIRCLEQ_REMOVE(&mp->lqh, bp, q); +#ifdef DEBUG + { + void *spage = bp->page; + (void)memset(bp, 0xff, + (size_t)(sizeof(BKT) + mp->pagesize)); + bp->page = spage; + } +#endif + return (bp); + } + +new: if ((bp = calloc(1, (size_t)(sizeof(BKT) + mp->pagesize))) == NULL) + return (NULL); +#ifdef STATISTICS + ++mp->pagealloc; +#endif +#if defined(DEBUG) || defined(PURIFY) + (void)memset(bp, 0xff, (size_t)(sizeof(BKT) + mp->pagesize)); +#endif + bp->page = (char *)(void *)bp + sizeof(BKT); + ++mp->curcache; + return (bp); +} + +/* + * mpool_write + * Write a page to disk. + */ +static int +mpool_write(MPOOL *mp, BKT *bp) +{ + off_t off; + +#ifdef STATISTICS + ++mp->pagewrite; +#endif + + /* Run through the user's filter. */ + if (mp->pgout) + (mp->pgout)(mp->pgcookie, bp->pgno, bp->page); + + off = mp->pagesize * bp->pgno; + if (pwrite(mp->fd, bp->page, (size_t)mp->pagesize, off) != (int)mp->pagesize) + return (RET_ERROR); + + /* + * Re-run through the input filter since this page may soon be + * accessed via the cache, and whatever the user's output filter + * did may screw things up if we don't let the input filter + * restore the in-core copy. + */ + if (mp->pgin) + (mp->pgin)(mp->pgcookie, bp->pgno, bp->page); + + bp->flags &= ~MPOOL_DIRTY; + return (RET_SUCCESS); +} + +/* + * mpool_look + * Lookup a page in the cache. + */ +static BKT * +mpool_look(MPOOL *mp, pgno_t pgno) +{ + struct _hqh *head; + BKT *bp; + + head = &mp->hqh[HASHKEY(pgno)]; + for (bp = head->cqh_first; bp != (void *)head; bp = bp->hq.cqe_next) + if (bp->pgno == pgno) { +#ifdef STATISTICS + ++mp->cachehit; +#endif + return (bp); + } +#ifdef STATISTICS + ++mp->cachemiss; +#endif + return (NULL); +} + +#ifdef STATISTICS +/* + * mpool_stat + * Print out cache statistics. + */ +void +mpool_stat(mp) + MPOOL *mp; +{ + BKT *bp; + int cnt; + const char *sep; + + (void)fprintf(stderr, "%lu pages in the file\n", (u_long)mp->npages); + (void)fprintf(stderr, + "page size %lu, cacheing %lu pages of %lu page max cache\n", + (u_long)mp->pagesize, (u_long)mp->curcache, (u_long)mp->maxcache); + (void)fprintf(stderr, "%lu page puts, %lu page gets, %lu page new\n", + mp->pageput, mp->pageget, mp->pagenew); + (void)fprintf(stderr, "%lu page allocs, %lu page flushes\n", + mp->pagealloc, mp->pageflush); + if (mp->cachehit + mp->cachemiss) + (void)fprintf(stderr, + "%.0f%% cache hit rate (%lu hits, %lu misses)\n", + ((double)mp->cachehit / (mp->cachehit + mp->cachemiss)) + * 100, mp->cachehit, mp->cachemiss); + (void)fprintf(stderr, "%lu page reads, %lu page writes\n", + mp->pageread, mp->pagewrite); + + sep = ""; + cnt = 0; + for (bp = mp->lqh.cqh_first; + bp != (void *)&mp->lqh; bp = bp->q.cqe_next) { + (void)fprintf(stderr, "%s%d", sep, bp->pgno); + if (bp->flags & MPOOL_DIRTY) + (void)fprintf(stderr, "d"); + if (bp->flags & MPOOL_PINNED) + (void)fprintf(stderr, "P"); + if (++cnt == 10) { + sep = "\n"; + cnt = 0; + } else + sep = ", "; + + } + (void)fprintf(stderr, "\n"); +} +#endif diff --git a/lib/libc/db/recno/Makefile.inc b/lib/libc/db/recno/Makefile.inc new file mode 100644 index 000000000..e7f78f4ce --- /dev/null +++ b/lib/libc/db/recno/Makefile.inc @@ -0,0 +1,7 @@ +# $NetBSD: Makefile.inc,v 1.5 1996/05/03 21:38:43 cgd Exp $ +# @(#)Makefile.inc 8.1 (Berkeley) 6/4/93 + +.PATH: ${.CURDIR}/db/recno + +SRCS+= rec_close.c rec_delete.c rec_get.c rec_open.c rec_put.c rec_search.c \ + rec_seq.c rec_utils.c diff --git a/lib/libc/db/recno/extern.h b/lib/libc/db/recno/extern.h new file mode 100644 index 000000000..f73c83de3 --- /dev/null +++ b/lib/libc/db/recno/extern.h @@ -0,0 +1,52 @@ +/* $NetBSD: extern.h,v 1.8 2008/08/26 21:18:38 joerg Exp $ */ + +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)extern.h 8.3 (Berkeley) 6/4/94 + */ + +#include "../btree/extern.h" + +int __rec_close(DB *); +int __rec_delete(const DB *, const DBT *, u_int); +int __rec_dleaf(BTREE *, PAGE *, uint32_t); +int __rec_fd(const DB *); +int __rec_fmap(BTREE *, recno_t); +int __rec_fout(BTREE *); +int __rec_fpipe(BTREE *, recno_t); +int __rec_get(const DB *, const DBT *, DBT *, u_int); +int __rec_iput(BTREE *, recno_t, const DBT *, u_int); +int __rec_put(const DB *dbp, DBT *, const DBT *, u_int); +int __rec_ret(BTREE *, EPG *, recno_t, DBT *, DBT *); +EPG *__rec_search(BTREE *, recno_t, enum SRCHOP); +int __rec_seq(const DB *, DBT *, DBT *, u_int); +int __rec_sync(const DB *, u_int); +int __rec_vmap(BTREE *, recno_t); +int __rec_vout(BTREE *); +int __rec_vpipe(BTREE *, recno_t); diff --git a/lib/libc/db/recno/rec_close.c b/lib/libc/db/recno/rec_close.c new file mode 100644 index 000000000..22ebf3aff --- /dev/null +++ b/lib/libc/db/recno/rec_close.c @@ -0,0 +1,190 @@ +/* $NetBSD: rec_close.c,v 1.15 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: rec_close.c,v 1.15 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include "recno.h" + +/* + * __REC_CLOSE -- Close a recno tree. + * + * Parameters: + * dbp: pointer to access method + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__rec_close(DB *dbp) +{ + BTREE *t; + int status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + if (__rec_sync(dbp, 0) == RET_ERROR) + return (RET_ERROR); + + /* Committed to closing. */ + status = RET_SUCCESS; + if (F_ISSET(t, R_MEMMAPPED) && munmap(t->bt_smap, t->bt_msize)) + status = RET_ERROR; + + if (!F_ISSET(t, R_INMEM)) { + if (F_ISSET(t, R_CLOSEFP)) { + if (fclose(t->bt_rfp)) + status = RET_ERROR; + } else { + if (close(t->bt_rfd)) + status = RET_ERROR; + } + } + + if (__bt_close(dbp) == RET_ERROR) + status = RET_ERROR; + + return (status); +} + +/* + * __REC_SYNC -- sync the recno tree to disk. + * + * Parameters: + * dbp: pointer to access method + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +int +__rec_sync(const DB *dbp, u_int flags) +{ + struct iovec iov[2]; + BTREE *t; + DBT data, key; + off_t off; + recno_t scursor, trec; + int status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + if (flags == R_RECNOSYNC) + return (__bt_sync(dbp, 0)); + + if (F_ISSET(t, R_RDONLY | R_INMEM) || !F_ISSET(t, R_MODIFIED)) + return (RET_SUCCESS); + + /* Read any remaining records into the tree. */ + if (!F_ISSET(t, R_EOF) && t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR) + return (RET_ERROR); + + /* Rewind the file descriptor. */ + if (lseek(t->bt_rfd, (off_t)0, SEEK_SET) != 0) + return (RET_ERROR); + + /* Save the cursor. */ + scursor = t->bt_cursor.rcursor; + + key.size = sizeof(recno_t); + key.data = &trec; + + if (F_ISSET(t, R_FIXLEN)) { + /* + * We assume that fixed length records are all fixed length. + * Any that aren't are either EINVAL'd or corrected by the + * record put code. + */ + status = (dbp->seq)(dbp, &key, &data, R_FIRST); + while (status == RET_SUCCESS) { + if (write(t->bt_rfd, data.data, data.size) != + (ssize_t) data.size) + return (RET_ERROR); + status = (dbp->seq)(dbp, &key, &data, R_NEXT); + } + } else { + iov[1].iov_base = &t->bt_bval; + iov[1].iov_len = 1; + + status = (dbp->seq)(dbp, &key, &data, R_FIRST); + while (status == RET_SUCCESS) { + iov[0].iov_base = data.data; + iov[0].iov_len = data.size; + if (writev(t->bt_rfd, iov, 2) != + (ssize_t) (data.size + 1)) + return (RET_ERROR); + status = (dbp->seq)(dbp, &key, &data, R_NEXT); + } + } + + /* Restore the cursor. */ + t->bt_cursor.rcursor = scursor; + + if (status == RET_ERROR) + return (RET_ERROR); + if ((off = lseek(t->bt_rfd, (off_t)0, SEEK_CUR)) == -1) + return (RET_ERROR); + if (ftruncate(t->bt_rfd, off)) + return (RET_ERROR); + F_CLR(t, R_MODIFIED); + return (RET_SUCCESS); +} diff --git a/lib/libc/db/recno/rec_delete.c b/lib/libc/db/recno/rec_delete.c new file mode 100644 index 000000000..84ef9c187 --- /dev/null +++ b/lib/libc/db/recno/rec_delete.c @@ -0,0 +1,201 @@ +/* $NetBSD: rec_delete.c,v 1.17 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: rec_delete.c,v 1.17 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include +#include + +#include +#include "recno.h" + +static int rec_rdelete(BTREE *, recno_t); + +/* + * __REC_DELETE -- Delete the item(s) referenced by a key. + * + * Parameters: + * dbp: pointer to access method + * key: key to delete + * flags: R_CURSOR if deleting what the cursor references + * + * Returns: + * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. + */ +int +__rec_delete(const DB *dbp, const DBT *key, u_int flags) +{ + BTREE *t; + recno_t nrec; + int status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + switch(flags) { + case 0: + if ((nrec = *(recno_t *)key->data) == 0) + goto einval; + if (nrec > t->bt_nrecs) + return (RET_SPECIAL); + --nrec; + status = rec_rdelete(t, nrec); + break; + case R_CURSOR: + if (!F_ISSET(&t->bt_cursor, CURS_INIT)) + goto einval; + if (t->bt_nrecs == 0) + return (RET_SPECIAL); + status = rec_rdelete(t, t->bt_cursor.rcursor - 1); + if (status == RET_SUCCESS) + --t->bt_cursor.rcursor; + break; + default: +einval: errno = EINVAL; + return (RET_ERROR); + } + + if (status == RET_SUCCESS) + F_SET(t, B_MODIFIED | R_MODIFIED); + return (status); +} + +/* + * REC_RDELETE -- Delete the data matching the specified key. + * + * Parameters: + * tree: tree + * nrec: record to delete + * + * Returns: + * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. + */ +static int +rec_rdelete(BTREE *t, recno_t nrec) +{ + EPG *e; + PAGE *h; + int status; + + /* Find the record; __rec_search pins the page. */ + if ((e = __rec_search(t, nrec, SDELETE)) == NULL) + return (RET_ERROR); + + /* Delete the record. */ + h = e->page; + status = __rec_dleaf(t, h, (uint32_t)e->index); + if (status != RET_SUCCESS) { + mpool_put(t->bt_mp, h, 0); + return (status); + } + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + return (RET_SUCCESS); +} + +/* + * __REC_DLEAF -- Delete a single record from a recno leaf page. + * + * Parameters: + * t: tree + * index: index on current page to delete + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +int +__rec_dleaf(BTREE *t, PAGE *h, uint32_t idx) +{ + RLEAF *rl; + indx_t *ip, cnt, offset; + uint32_t nbytes; + char *from; + void *to; + size_t temp; + + /* + * Delete a record from a recno leaf page. Internal records are never + * deleted from internal pages, regardless of the records that caused + * them to be added being deleted. Pages made empty by deletion are + * not reclaimed. They are, however, made available for reuse. + * + * Pack the remaining entries at the end of the page, shift the indices + * down, overwriting the deleted record and its index. If the record + * uses overflow pages, make them available for reuse. + */ + to = rl = GETRLEAF(h, idx); + if (rl->flags & P_BIGDATA && __ovfl_delete(t, rl->bytes) == RET_ERROR) + return (RET_ERROR); + nbytes = NRLEAF(rl); + + /* + * Compress the key/data pairs. Compress and adjust the [BR]LEAF + * offsets. Reset the headers. + */ + from = (char *)(void *)h + h->upper; + memmove(from + nbytes, from, (size_t)((char *)to - from)); + h->upper += nbytes; + + offset = h->linp[idx]; + temp = &h->linp[idx] - (ip = &h->linp[0]); + _DBFIT(temp, uint16_t); + for (cnt = (uint16_t)temp; cnt--; ++ip) + if (ip[0] < offset) + ip[0] += nbytes; + temp = &h->linp[NEXTINDEX(h)] - ip; + _DBFIT(temp, uint16_t); + for (cnt = (uint16_t)temp; --cnt; ++ip) + ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1]; + h->lower -= sizeof(indx_t); + --t->bt_nrecs; + return (RET_SUCCESS); +} diff --git a/lib/libc/db/recno/rec_get.c b/lib/libc/db/recno/rec_get.c new file mode 100644 index 000000000..eacf14b10 --- /dev/null +++ b/lib/libc/db/recno/rec_get.c @@ -0,0 +1,306 @@ +/* $NetBSD: rec_get.c,v 1.16 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: rec_get.c,v 1.16 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "recno.h" + +/* + * __REC_GET -- Get a record from the btree. + * + * Parameters: + * dbp: pointer to access method + * key: key to find + * data: data to return + * flag: currently unused + * + * Returns: + * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. + */ +int +__rec_get(const DB *dbp, const DBT *key, DBT *data, u_int flags) +{ + BTREE *t; + EPG *e; + recno_t nrec; + int status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* Get currently doesn't take any flags, and keys of 0 are illegal. */ + if (flags || (nrec = *(recno_t *)key->data) == 0) { + errno = EINVAL; + return (RET_ERROR); + } + + /* + * If we haven't seen this record yet, try to find it in the + * original file. + */ + if (nrec > t->bt_nrecs) { + if (F_ISSET(t, R_EOF | R_INMEM)) + return (RET_SPECIAL); + if ((status = t->bt_irec(t, nrec)) != RET_SUCCESS) + return (status); + } + + --nrec; + if ((e = __rec_search(t, nrec, SEARCH)) == NULL) + return (RET_ERROR); + + status = __rec_ret(t, e, 0, NULL, data); + if (F_ISSET(t, B_DB_LOCK)) + mpool_put(t->bt_mp, e->page, 0); + else + t->bt_pinned = e->page; + return (status); +} + +/* + * __REC_FPIPE -- Get fixed length records from a pipe. + * + * Parameters: + * t: tree + * cnt: records to read + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__rec_fpipe(BTREE *t, recno_t top) +{ + DBT data; + recno_t nrec; + size_t len; + int ch; + uint8_t *p; + + if (t->bt_rdata.size < t->bt_reclen) { + t->bt_rdata.data = t->bt_rdata.data == NULL ? + malloc(t->bt_reclen) : + realloc(t->bt_rdata.data, t->bt_reclen); + if (t->bt_rdata.data == NULL) + return (RET_ERROR); + t->bt_rdata.size = t->bt_reclen; + } + data.data = t->bt_rdata.data; + data.size = t->bt_reclen; + + for (nrec = t->bt_nrecs; nrec < top;) { + len = t->bt_reclen; + for (p = t->bt_rdata.data;; *p++ = ch) + if ((ch = getc(t->bt_rfp)) == EOF || !--len) { + if (ch != EOF) + *p = ch; + if (len != 0) + memset(p, t->bt_bval, len); + if (__rec_iput(t, + nrec, &data, 0) != RET_SUCCESS) + return (RET_ERROR); + ++nrec; + break; + } + if (ch == EOF) + break; + } + if (nrec < top) { + F_SET(t, R_EOF); + return (RET_SPECIAL); + } + return (RET_SUCCESS); +} + +/* + * __REC_VPIPE -- Get variable length records from a pipe. + * + * Parameters: + * t: tree + * cnt: records to read + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__rec_vpipe(BTREE *t, recno_t top) +{ + DBT data; + recno_t nrec; + ptrdiff_t len; + size_t sz; + int bval, ch; + uint8_t *p; + + bval = t->bt_bval; + for (nrec = t->bt_nrecs; nrec < top; ++nrec) { + for (p = t->bt_rdata.data, + sz = t->bt_rdata.size;; *p++ = ch, --sz) { + if ((ch = getc(t->bt_rfp)) == EOF || ch == bval) { + data.data = t->bt_rdata.data; + data.size = p - (uint8_t *)t->bt_rdata.data; + if (ch == EOF && data.size == 0) + break; + if (__rec_iput(t, nrec, &data, 0) + != RET_SUCCESS) + return (RET_ERROR); + break; + } + if (sz == 0) { + len = p - (uint8_t *)t->bt_rdata.data; + t->bt_rdata.size += (sz = 256); + t->bt_rdata.data = t->bt_rdata.data == NULL ? + malloc(t->bt_rdata.size) : + realloc(t->bt_rdata.data, t->bt_rdata.size); + if (t->bt_rdata.data == NULL) + return (RET_ERROR); + p = (uint8_t *)t->bt_rdata.data + len; + } + } + if (ch == EOF) + break; + } + if (nrec < top) { + F_SET(t, R_EOF); + return (RET_SPECIAL); + } + return (RET_SUCCESS); +} + +/* + * __REC_FMAP -- Get fixed length records from a file. + * + * Parameters: + * t: tree + * cnt: records to read + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__rec_fmap(BTREE *t, recno_t top) +{ + DBT data; + recno_t nrec; + uint8_t *sp, *ep, *p; + size_t len; + + if (t->bt_rdata.size < t->bt_reclen) { + t->bt_rdata.data = t->bt_rdata.data == NULL ? + malloc(t->bt_reclen) : + realloc(t->bt_rdata.data, t->bt_reclen); + if (t->bt_rdata.data == NULL) + return (RET_ERROR); + t->bt_rdata.size = t->bt_reclen; + } + data.data = t->bt_rdata.data; + data.size = t->bt_reclen; + + sp = (uint8_t *)t->bt_cmap; + ep = (uint8_t *)t->bt_emap; + for (nrec = t->bt_nrecs; nrec < top; ++nrec) { + if (sp >= ep) { + F_SET(t, R_EOF); + return (RET_SPECIAL); + } + len = t->bt_reclen; + for (p = t->bt_rdata.data; + sp < ep && len > 0; *p++ = *sp++, --len); + if (len != 0) + memset(p, t->bt_bval, len); + if (__rec_iput(t, nrec, &data, 0) != RET_SUCCESS) + return (RET_ERROR); + } + t->bt_cmap = (caddr_t)sp; + return (RET_SUCCESS); +} + +/* + * __REC_VMAP -- Get variable length records from a file. + * + * Parameters: + * t: tree + * cnt: records to read + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__rec_vmap(BTREE *t, recno_t top) +{ + DBT data; + uint8_t *sp, *ep; + recno_t nrec; + int bval; + + sp = (uint8_t *)t->bt_cmap; + ep = (uint8_t *)t->bt_emap; + bval = t->bt_bval; + + for (nrec = t->bt_nrecs; nrec < top; ++nrec) { + if (sp >= ep) { + F_SET(t, R_EOF); + return (RET_SPECIAL); + } + for (data.data = sp; sp < ep && *sp != bval; ++sp); + data.size = sp - (uint8_t *)data.data; + if (__rec_iput(t, nrec, &data, 0) != RET_SUCCESS) + return (RET_ERROR); + ++sp; + } + t->bt_cmap = (caddr_t)sp; + return (RET_SUCCESS); +} diff --git a/lib/libc/db/recno/rec_open.c b/lib/libc/db/recno/rec_open.c new file mode 100644 index 000000000..fee429ce3 --- /dev/null +++ b/lib/libc/db/recno/rec_open.c @@ -0,0 +1,250 @@ +/* $NetBSD: rec_open.c,v 1.17 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: rec_open.c,v 1.17 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "recno.h" + +DB * +__rec_open(const char *fname, int flags, mode_t mode, const RECNOINFO *openinfo, + int dflags) +{ + BTREE *t; + BTREEINFO btopeninfo; + DB *dbp; + PAGE *h; + struct stat sb; + int rfd = -1; /* pacify gcc */ + int sverrno; + + dbp = NULL; + /* Open the user's file -- if this fails, we're done. */ + if (fname != NULL) { + if ((rfd = open(fname, flags, mode)) == -1) + return (NULL); + if (fcntl(rfd, F_SETFD, FD_CLOEXEC) == -1) + goto err; + } + + /* Create a btree in memory (backed by disk). */ + if (openinfo) { + if (openinfo->flags & ~(R_FIXEDLEN | R_NOKEY | R_SNAPSHOT)) + goto einval; + btopeninfo.flags = 0; + btopeninfo.cachesize = openinfo->cachesize; + btopeninfo.maxkeypage = 0; + btopeninfo.minkeypage = 0; + btopeninfo.psize = openinfo->psize; + btopeninfo.compare = NULL; + btopeninfo.prefix = NULL; + btopeninfo.lorder = openinfo->lorder; + dbp = __bt_open(openinfo->bfname, + O_RDWR, S_IRUSR | S_IWUSR, &btopeninfo, dflags); + } else + dbp = __bt_open(NULL, O_RDWR, S_IRUSR | S_IWUSR, NULL, dflags); + if (dbp == NULL) + goto err; + + /* + * Some fields in the tree structure are recno specific. Fill them + * in and make the btree structure look like a recno structure. We + * don't change the bt_ovflsize value, it's close enough and slightly + * bigger. + */ + t = dbp->internal; + if (openinfo) { + if (openinfo->flags & R_FIXEDLEN) { + F_SET(t, R_FIXLEN); + t->bt_reclen = openinfo->reclen; + if (t->bt_reclen == 0) + goto einval; + } + t->bt_bval = openinfo->bval; + } else + t->bt_bval = '\n'; + + F_SET(t, R_RECNO); + if (fname == NULL) + F_SET(t, R_EOF | R_INMEM); + else + t->bt_rfd = rfd; + + if (fname != NULL) { + /* + * In 4.4BSD, stat(2) returns true for ISSOCK on pipes. + * Unfortunately, that's not portable, so we use lseek + * and check the errno values. + */ + errno = 0; + if (lseek(rfd, (off_t)0, SEEK_CUR) == -1 && errno == ESPIPE) { + switch (flags & O_ACCMODE) { + case O_RDONLY: + F_SET(t, R_RDONLY); + break; + default: + goto einval; + } +slow: if ((t->bt_rfp = fdopen(rfd, "r")) == NULL) + goto err; + F_SET(t, R_CLOSEFP); + t->bt_irec = + F_ISSET(t, R_FIXLEN) ? __rec_fpipe : __rec_vpipe; + } else { + switch (flags & O_ACCMODE) { + case O_RDONLY: + F_SET(t, R_RDONLY); + break; + case O_RDWR: + break; + default: + goto einval; + } + + if (fstat(rfd, &sb)) + goto err; + /* + * Kluge -- we'd like to test to see if the file is too + * big to mmap. Since, we don't know what size or type + * off_t's or size_t's are, what the largest unsigned + * integral type is, or what random insanity the local + * C compiler will perpetrate, doing the comparison in + * a portable way is flatly impossible. Hope that mmap + * fails if the file is too large. + */ + if (sb.st_size == 0) + F_SET(t, R_EOF); + else { +#ifdef MMAP_NOT_AVAILABLE + /* + * XXX + * Mmap doesn't work correctly on many current + * systems. In particular, it can fail subtly, + * with cache coherency problems. Don't use it + * for now. + */ + t->bt_msize = sb.st_size; + if ((t->bt_smap = mmap(NULL, t->bt_msize, + PROT_READ, MAP_FILE | MAP_PRIVATE, rfd, + (off_t)0)) == (caddr_t)-1) + goto slow; + t->bt_cmap = t->bt_smap; + t->bt_emap = t->bt_smap + sb.st_size; + t->bt_irec = F_ISSET(t, R_FIXLEN) ? + __rec_fmap : __rec_vmap; + F_SET(t, R_MEMMAPPED); +#else + goto slow; +#endif + } + } + } + + /* Use the recno routines. */ + dbp->close = __rec_close; + dbp->del = __rec_delete; + dbp->fd = __rec_fd; + dbp->get = __rec_get; + dbp->put = __rec_put; + dbp->seq = __rec_seq; + dbp->sync = __rec_sync; + + /* If the root page was created, reset the flags. */ + if ((h = mpool_get(t->bt_mp, P_ROOT, 0)) == NULL) + goto err; + if ((h->flags & P_TYPE) == P_BLEAF) { + F_CLR(h, P_TYPE); + F_SET(h, P_RLEAF); + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + } else + mpool_put(t->bt_mp, h, 0); + + if (openinfo && openinfo->flags & R_SNAPSHOT && + !F_ISSET(t, R_EOF | R_INMEM) && + t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR) + goto err; + return (dbp); + +einval: errno = EINVAL; +err: sverrno = errno; + if (dbp != NULL) + (void)__bt_close(dbp); + if (fname != NULL) + (void)close(rfd); + errno = sverrno; + return (NULL); +} + +int +__rec_fd(const DB *dbp) +{ + BTREE *t; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* In-memory database can't have a file descriptor. */ + if (F_ISSET(t, R_INMEM)) { + errno = ENOENT; + return (-1); + } + return (t->bt_rfd); +} diff --git a/lib/libc/db/recno/rec_put.c b/lib/libc/db/recno/rec_put.c new file mode 100644 index 000000000..f28791ccd --- /dev/null +++ b/lib/libc/db/recno/rec_put.c @@ -0,0 +1,282 @@ +/* $NetBSD: rec_put.c,v 1.17 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: rec_put.c,v 1.17 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include +#include +#include + +#include +#include "recno.h" + +/* + * __REC_PUT -- Add a recno item to the tree. + * + * Parameters: + * dbp: pointer to access method + * key: key + * data: data + * flag: R_CURSOR, R_IAFTER, R_IBEFORE, R_NOOVERWRITE + * + * Returns: + * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key is + * already in the tree and R_NOOVERWRITE specified. + */ +int +__rec_put(const DB *dbp, DBT *key, const DBT *data, u_int flags) +{ + BTREE *t; + DBT fdata, tdata; + recno_t nrec; + int status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + /* + * If using fixed-length records, and the record is long, return + * EINVAL. If it's short, pad it out. Use the record data return + * memory, it's only short-term. + */ + if (F_ISSET(t, R_FIXLEN) && data->size != t->bt_reclen) { + if (data->size > t->bt_reclen) + goto einval; + + if (t->bt_rdata.size < t->bt_reclen) { + t->bt_rdata.data = t->bt_rdata.data == NULL ? + malloc(t->bt_reclen) : + realloc(t->bt_rdata.data, t->bt_reclen); + if (t->bt_rdata.data == NULL) + return (RET_ERROR); + t->bt_rdata.size = t->bt_reclen; + } + memmove(t->bt_rdata.data, data->data, data->size); + memset((char *)t->bt_rdata.data + data->size, + t->bt_bval, t->bt_reclen - data->size); + fdata.data = t->bt_rdata.data; + fdata.size = t->bt_reclen; + } else { + fdata.data = data->data; + fdata.size = data->size; + } + + switch (flags) { + case R_CURSOR: + if (!F_ISSET(&t->bt_cursor, CURS_INIT)) + goto einval; + nrec = t->bt_cursor.rcursor; + break; + case R_SETCURSOR: + if ((nrec = *(recno_t *)key->data) == 0) + goto einval; + break; + case R_IAFTER: + if ((nrec = *(recno_t *)key->data) == 0) { + nrec = 1; + flags = R_IBEFORE; + } + break; + case 0: + case R_IBEFORE: + if ((nrec = *(recno_t *)key->data) == 0) + goto einval; + break; + case R_NOOVERWRITE: + if ((nrec = *(recno_t *)key->data) == 0) + goto einval; + if (nrec <= t->bt_nrecs) + return (RET_SPECIAL); + break; + default: +einval: errno = EINVAL; + return (RET_ERROR); + } + + /* + * Make sure that records up to and including the put record are + * already in the database. If skipping records, create empty ones. + */ + if (nrec > t->bt_nrecs) { + if (!F_ISSET(t, R_EOF | R_INMEM) && + t->bt_irec(t, nrec) == RET_ERROR) + return (RET_ERROR); + if (nrec > t->bt_nrecs + 1) { + if (F_ISSET(t, R_FIXLEN)) { + if ((tdata.data = + (void *)malloc(t->bt_reclen)) == NULL) + return (RET_ERROR); + tdata.size = t->bt_reclen; + memset(tdata.data, t->bt_bval, tdata.size); + } else { + tdata.data = NULL; + tdata.size = 0; + } + while (nrec > t->bt_nrecs + 1) + if (__rec_iput(t, + t->bt_nrecs, &tdata, 0) != RET_SUCCESS) + return (RET_ERROR); + if (F_ISSET(t, R_FIXLEN)) + free(tdata.data); + } + } + + if ((status = __rec_iput(t, nrec - 1, &fdata, flags)) != RET_SUCCESS) + return (status); + + if (flags == R_SETCURSOR) + t->bt_cursor.rcursor = nrec; + + F_SET(t, R_MODIFIED); + return (__rec_ret(t, NULL, nrec, key, NULL)); +} + +/* + * __REC_IPUT -- Add a recno item to the tree. + * + * Parameters: + * t: tree + * nrec: record number + * data: data + * + * Returns: + * RET_ERROR, RET_SUCCESS + */ +int +__rec_iput(BTREE *t, recno_t nrec, const DBT *data, u_int flags) +{ + DBT tdata; + EPG *e; + PAGE *h; + indx_t idx, nxtindex; + pgno_t pg; + uint32_t nbytes; + int dflags, status; + char *dest, db[NOVFLSIZE]; + + /* + * If the data won't fit on a page, store it on indirect pages. + * + * XXX + * If the insert fails later on, these pages aren't recovered. + */ + if (data->size > t->bt_ovflsize) { + if (__ovfl_put(t, data, &pg) == RET_ERROR) + return (RET_ERROR); + tdata.data = db; + tdata.size = NOVFLSIZE; + *(pgno_t *)(void *)db = pg; + _DBFIT(data->size, uint32_t); + *(uint32_t *)(void *)(db + sizeof(pgno_t)) = + (uint32_t)data->size; + dflags = P_BIGDATA; + data = &tdata; + } else + dflags = 0; + + /* __rec_search pins the returned page. */ + if ((e = __rec_search(t, nrec, + nrec > t->bt_nrecs || flags == R_IAFTER || flags == R_IBEFORE ? + SINSERT : SEARCH)) == NULL) + return (RET_ERROR); + + h = e->page; + idx = e->index; + + /* + * Add the specified key/data pair to the tree. The R_IAFTER and + * R_IBEFORE flags insert the key after/before the specified key. + * + * Pages are split as required. + */ + switch (flags) { + case R_IAFTER: + ++idx; + break; + case R_IBEFORE: + break; + default: + if (nrec < t->bt_nrecs && + __rec_dleaf(t, h, (uint32_t)idx) == RET_ERROR) { + mpool_put(t->bt_mp, h, 0); + return (RET_ERROR); + } + break; + } + + /* + * If not enough room, split the page. The split code will insert + * the key and data and unpin the current page. If inserting into + * the offset array, shift the pointers up. + */ + nbytes = NRLEAFDBT(data->size); + if ((uint32_t) (h->upper - h->lower) < nbytes + sizeof(indx_t)) { + status = __bt_split(t, h, NULL, data, dflags, nbytes, + (uint32_t)idx); + if (status == RET_SUCCESS) + ++t->bt_nrecs; + return (status); + } + + if (idx < (nxtindex = NEXTINDEX(h))) + memmove(h->linp + idx + 1, h->linp + idx, + (nxtindex - idx) * sizeof(indx_t)); + h->lower += sizeof(indx_t); + + h->linp[idx] = h->upper -= nbytes; + dest = (char *)(void *)h + h->upper; + WR_RLEAF(dest, data, dflags); + + ++t->bt_nrecs; + F_SET(t, B_MODIFIED); + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + + return (RET_SUCCESS); +} diff --git a/lib/libc/db/recno/rec_search.c b/lib/libc/db/recno/rec_search.c new file mode 100644 index 000000000..08b580934 --- /dev/null +++ b/lib/libc/db/recno/rec_search.c @@ -0,0 +1,130 @@ +/* $NetBSD: rec_search.c,v 1.14 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: rec_search.c,v 1.14 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include + +#include +#include "recno.h" + +/* + * __REC_SEARCH -- Search a btree for a key. + * + * Parameters: + * t: tree to search + * recno: key to find + * op: search operation + * + * Returns: + * EPG for matching record, if any, or the EPG for the location of the + * key, if it were inserted into the tree. + * + * Returns: + * The EPG for matching record, if any, or the EPG for the location + * of the key, if it were inserted into the tree, is entered into + * the bt_cur field of the tree. A pointer to the field is returned. + */ +EPG * +__rec_search(BTREE *t, recno_t recno, enum SRCHOP op) +{ + indx_t idx; + PAGE *h; + EPGNO *parent; + RINTERNAL *r; + pgno_t pg; + indx_t top; + recno_t total; + int sverrno; + + BT_CLR(t); + for (pg = P_ROOT, total = 0;;) { + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + goto err; + if (h->flags & P_RLEAF) { + t->bt_cur.page = h; + t->bt_cur.index = recno - total; + return (&t->bt_cur); + } + for (idx = 0, top = NEXTINDEX(h);;) { + r = GETRINTERNAL(h, idx); + if (++idx == top || total + r->nrecs > recno) + break; + total += r->nrecs; + } + + BT_PUSH(t, pg, idx - 1); + + pg = r->pgno; + switch (op) { + case SDELETE: + --GETRINTERNAL(h, (idx - 1))->nrecs; + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + break; + case SINSERT: + ++GETRINTERNAL(h, (idx - 1))->nrecs; + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + break; + case SEARCH: + mpool_put(t->bt_mp, h, 0); + break; + } + + } + /* Try and recover the tree. */ +err: sverrno = errno; + if (op != SEARCH) + while ((parent = BT_POP(t)) != NULL) { + if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) + break; + if (op == SINSERT) + --GETRINTERNAL(h, parent->index)->nrecs; + else + ++GETRINTERNAL(h, parent->index)->nrecs; + mpool_put(t->bt_mp, h, MPOOL_DIRTY); + } + errno = sverrno; + return (NULL); +} diff --git a/lib/libc/db/recno/rec_seq.c b/lib/libc/db/recno/rec_seq.c new file mode 100644 index 000000000..a3bceaf9c --- /dev/null +++ b/lib/libc/db/recno/rec_seq.c @@ -0,0 +1,135 @@ +/* $NetBSD: rec_seq.c,v 1.14 2008/09/11 12:58:00 joerg Exp $ */ + +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: rec_seq.c,v 1.14 2008/09/11 12:58:00 joerg Exp $"); +#endif + +#ifndef __minix +#include "namespace.h" +#endif +#include + +#include +#include +#include +#include +#include + +#include +#include "recno.h" + +/* + * __REC_SEQ -- Recno sequential scan interface. + * + * Parameters: + * dbp: pointer to access method + * key: key for positioning and return value + * data: data return value + * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV. + * + * Returns: + * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. + */ +int +__rec_seq(const DB *dbp, DBT *key, DBT *data, u_int flags) +{ + BTREE *t; + EPG *e; + recno_t nrec; + int status; + + t = dbp->internal; + + /* Toss any page pinned across calls. */ + if (t->bt_pinned != NULL) { + mpool_put(t->bt_mp, t->bt_pinned, 0); + t->bt_pinned = NULL; + } + + switch(flags) { + case R_CURSOR: + if ((nrec = *(recno_t *)key->data) == 0) + goto einval; + break; + case R_NEXT: + if (F_ISSET(&t->bt_cursor, CURS_INIT)) { + nrec = t->bt_cursor.rcursor + 1; + break; + } + /* FALLTHROUGH */ + case R_FIRST: + nrec = 1; + break; + case R_PREV: + if (F_ISSET(&t->bt_cursor, CURS_INIT)) { + if ((nrec = t->bt_cursor.rcursor - 1) == 0) + return (RET_SPECIAL); + break; + } + /* FALLTHROUGH */ + case R_LAST: + if (!F_ISSET(t, R_EOF | R_INMEM) && + t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR) + return (RET_ERROR); + nrec = t->bt_nrecs; + break; + default: +einval: errno = EINVAL; + return (RET_ERROR); + } + + if (t->bt_nrecs == 0 || nrec > t->bt_nrecs) { + if (!F_ISSET(t, R_EOF | R_INMEM) && + (status = t->bt_irec(t, nrec)) != RET_SUCCESS) + return (status); + if (t->bt_nrecs == 0 || nrec > t->bt_nrecs) + return (RET_SPECIAL); + } + + if ((e = __rec_search(t, nrec - 1, SEARCH)) == NULL) + return (RET_ERROR); + + F_SET(&t->bt_cursor, CURS_INIT); + t->bt_cursor.rcursor = nrec; + + status = __rec_ret(t, e, nrec, key, data); + if (F_ISSET(t, B_DB_LOCK)) + mpool_put(t->bt_mp, e->page, 0); + else + t->bt_pinned = e->page; + return (status); +} diff --git a/lib/libc/db/recno/rec_utils.c b/lib/libc/db/recno/rec_utils.c new file mode 100644 index 000000000..73498738a --- /dev/null +++ b/lib/libc/db/recno/rec_utils.c @@ -0,0 +1,122 @@ +/* $NetBSD: rec_utils.c,v 1.12 2008/09/10 17:52:36 joerg Exp $ */ + +/*- + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +#ifndef __minix +__RCSID("$NetBSD: rec_utils.c,v 1.12 2008/09/10 17:52:36 joerg Exp $"); +#endif + +#include + +#include +#include +#include +#include + +#include +#include "recno.h" + +/* + * __rec_ret -- + * Build return data. + * + * Parameters: + * t: tree + * e: key/data pair to be returned + * nrec: record number + * key: user's key structure + * data: user's data structure + * + * Returns: + * RET_SUCCESS, RET_ERROR. + */ +int +__rec_ret(BTREE *t, EPG *e, recno_t nrec, DBT *key, DBT *data) +{ + RLEAF *rl; + void *p; + + if (key == NULL) + goto dataonly; + + /* We have to copy the key, it's not on the page. */ + if (sizeof(recno_t) > t->bt_rkey.size) { + p = (void *)(t->bt_rkey.data == NULL ? + malloc(sizeof(recno_t)) : + realloc(t->bt_rkey.data, sizeof(recno_t))); + if (p == NULL) + return (RET_ERROR); + t->bt_rkey.data = p; + t->bt_rkey.size = sizeof(recno_t); + } + memmove(t->bt_rkey.data, &nrec, sizeof(recno_t)); + key->size = sizeof(recno_t); + key->data = t->bt_rkey.data; + +dataonly: + if (data == NULL) + return (RET_SUCCESS); + + /* + * We must copy big keys/data to make them contigous. Otherwise, + * leave the page pinned and don't copy unless the user specified + * concurrent access. + */ + rl = GETRLEAF(e->page, e->index); + if (rl->flags & P_BIGDATA) { + if (__ovfl_get(t, rl->bytes, + &data->size, &t->bt_rdata.data, &t->bt_rdata.size)) + return (RET_ERROR); + data->data = t->bt_rdata.data; + } else if (F_ISSET(t, B_DB_LOCK)) { + /* Use +1 in case the first record retrieved is 0 length. */ + if (rl->dsize + 1 > t->bt_rdata.size) { + p = (void *)(t->bt_rdata.data == NULL ? + malloc(rl->dsize + 1) : + realloc(t->bt_rdata.data, rl->dsize + 1)); + if (p == NULL) + return (RET_ERROR); + t->bt_rdata.data = p; + t->bt_rdata.size = rl->dsize + 1; + } + memmove(t->bt_rdata.data, rl->bytes, rl->dsize); + data->size = rl->dsize; + data->data = t->bt_rdata.data; + } else { + data->size = rl->dsize; + data->data = rl->bytes; + } + return (RET_SUCCESS); +} diff --git a/lib/libc/db/recno/recno.h b/lib/libc/db/recno/recno.h new file mode 100644 index 000000000..7ac0d176a --- /dev/null +++ b/lib/libc/db/recno/recno.h @@ -0,0 +1,37 @@ +/* $NetBSD: recno.h,v 1.6 2003/08/07 16:42:44 agc Exp $ */ + +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)recno.h 8.1 (Berkeley) 6/4/93 + */ + +enum SRCHOP { SDELETE, SINSERT, SEARCH}; /* Rec_search operation. */ + +#include "../btree/btree.h" +#include "extern.h" -- 2.44.0