includes:
$(MAKE) -C include includes
$(MAKE) -C lib includes NBSD_LIBC=yes
+ $(MAKE) -C sys includes
MKHEADERSS=/usr/pkg/gcc*/libexec/gcc/*/*/install-tools/mkheaders
gnu-includes: includes
$(MAKE) -C bin all
$(MAKE) -C sbin all
$(MAKE) -C usr.bin all
+ $(MAKE) -C external all
$(MAKE) -C libexec all
$(MAKE) -C usr.sbin all
$(MAKE) -C bin dependall
$(MAKE) -C sbin dependall
$(MAKE) -C usr.bin dependall
+ $(MAKE) -C external dependall
$(MAKE) -C libexec dependall
$(MAKE) -C usr.sbin dependall
$(MAKE) -C kernel dependall
$(MAKE) -C bin all
$(MAKE) -C sbin all
$(MAKE) -C usr.bin all
+ $(MAKE) -C external all
$(MAKE) -C libexec all
$(MAKE) -C usr.sbin all
$(MAKE) -C tools all
$(MAKE) -C bin install
$(MAKE) -C sbin install
$(MAKE) -C usr.bin install
+ $(MAKE) -C external install
$(MAKE) -C usr.sbin install
$(MAKE) -C servers install
$(MAKE) -C share install
$(MAKE) -C bin clean
$(MAKE) -C sbin clean
$(MAKE) -C usr.bin clean
+ $(MAKE) -C external clean
$(MAKE) -C libexec clean
$(MAKE) -C usr.sbin clean
$(MAKE) -C share clean
$(MAKE) -C bin cleandepend
$(MAKE) -C sbin cleandepend
$(MAKE) -C usr.bin cleandepend
+ $(MAKE) -C external cleandepend
$(MAKE) -C libexec cleandepend
$(MAKE) -C usr.sbin cleandepend
$(MAKE) -C tools cleandepend
+++ /dev/null
-/* $NetBSD: disklabel.h,v 1.16 2011/08/30 12:39:55 bouyer Exp $ */
-
-/*
- * Copyright (c) 1994 Christopher G. Demetriou
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by Christopher G. Demetriou.
- * 4. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _I386_DISKLABEL_H_
-#define _I386_DISKLABEL_H_
-
-#define LABELUSESMBR 1 /* use MBR partitionning */
-#define LABELSECTOR 1 /* sector containing label */
-#define LABELOFFSET 0 /* offset of label in sector */
-#define MAXPARTITIONS 16 /* number of partitions */
-#define OLDMAXPARTITIONS 8 /* number of partitions before 1.6 */
-#define RAW_PART 3 /* raw partition: XX?d (XXX) */
-
-/*
- * We use the highest bit of the minor number for the partition number.
- * This maintains backward compatibility with device nodes created before
- * MAXPARTITIONS was increased.
- */
-#define __I386_MAXDISKS ((1 << 20) / MAXPARTITIONS)
-#define DISKUNIT(dev) ((minor(dev) / OLDMAXPARTITIONS) % __I386_MAXDISKS)
-#define DISKPART(dev) ((minor(dev) % OLDMAXPARTITIONS) + \
- ((minor(dev) / (__I386_MAXDISKS * OLDMAXPARTITIONS)) * OLDMAXPARTITIONS))
-#define DISKMINOR(unit, part) \
- (((unit) * OLDMAXPARTITIONS) + ((part) % OLDMAXPARTITIONS) + \
- ((part) / OLDMAXPARTITIONS) * (__I386_MAXDISKS * OLDMAXPARTITIONS))
-
-/* Pull in MBR partition definitions. */
-#if HAVE_NBTOOL_CONFIG_H
-#include <nbinclude/sys/bootblock.h>
-#else
-#include <sys/bootblock.h>
-#endif /* HAVE_NBTOOL_CONFIG_H */
-
-#ifndef __ASSEMBLER__
-#if HAVE_NBTOOL_CONFIG_H
-#include <nbinclude/sys/dkbad.h>
-#else
-#include <sys/dkbad.h>
-#endif /* HAVE_NBTOOL_CONFIG_H */
-struct cpu_disklabel {
-#define __HAVE_DISKLABEL_DKBAD
- struct dkbad bad;
-};
-#endif
-
-#endif /* _I386_DISKLABEL_H_ */
--- /dev/null
+SUBDIR=bsd
+
+.include <bsd.subdir.mk>
--- /dev/null
+.include <bsd.own.mk>
+SUBDIR=mdocml
+.include <bsd.subdir.mk>
# $NetBSD: Makefile.inc,v 1.12 2010/07/25 19:16:18 joerg Exp $
.include <bsd.own.mk>
-.include "../Makefile.inc"
VERSION!= cd ${.PARSEDIR}/dist && ${MAKE} -V VERSION
ttyent.h tzfile.h ucontext.h ulimit.h unistd.h util.h utime.h utmp.h \
utmpx.h uuid.h varargs.h vis.h wchar.h wctype.h wordexp.h
-INCS += ufs/chfs/chfs.h ufs/chfs/chfs_args.h ufs/chfs/chfs_inode.h \
- ufs/chfs/chfs_pool.h ufs/chfs/debug.h ufs/chfs/ebh.h \
- ufs/chfs/ebh_media.h ufs/chfs/ebh_misc.h ufs/chfs/media.h \
- ufs/ext2fs/ext2fs.h ufs/ext2fs/ext2fs_dinode.h \
- ufs/ext2fs/ext2fs_dir.h ufs/ext2fs/ext2fs_extern.h \
- ufs/ffs/ffs_extern.h ufs/ffs/fs.h ufs/lfs/lfs.h \
- ufs/lfs/lfs_extern.h ufs/mfs/mfs_extern.h ufs/mfs/mfsnode.h \
- ufs/ufs/dinode.h ufs/ufs/dir.h ufs/ufs/dirhash.h \
- ufs/ufs/extattr.h ufs/ufs/inode.h ufs/ufs/quota.h \
- ufs/ufs/quota1.h ufs/ufs/quota2.h ufs/ufs/ufs_bswap.h \
- ufs/ufs/ufs_extern.h ufs/ufs/ufs_quota.h ufs/ufs/ufs_wapbl.h \
- ufs/ufs/ufsmount.h \
-
.else
INCS= a.out.h aio.h ar.h assert.h atomic.h \
bitstring.h bm.h cdbr.h cdbw.h complex.h cpio.h ctype.h \
int_mwgwtypes.h int_types.h limits.h \
math.h mcontext.h npx.h param.h profile.h \
setjmp.h signal.h stdarg.h types.h \
- vmparam.h wchar_limits.h
+ vmparam.h wchar_limits.h \
+ archtypes.h bios.h cmos.h cpu.h diskparm.h fpu.h int86.h \
+ interrupt.h memory.h multiboot.h partition.h \
+ pci.h pci_amd.h pci_intel.h pci_sis.h pci_via.h \
+ ports.h stackframe.h vm.h elf.h elf_machdep.h mutex.h \
+ disklabel.h
-.include "${MINIXSRCDIR}/common/include/arch/i386/Makefile.inc"
.include <bsd.kinc.mk>
-/* $NetBSD: disklabel.h,v 1.15 2009/11/23 13:40:10 pooka Exp $ */
+/* $NetBSD: disklabel.h,v 1.16 2011/08/30 12:39:55 bouyer Exp $ */
/*
* Copyright (c) 1994 Christopher G. Demetriou
#ifndef _I386_DISKLABEL_H_
#define _I386_DISKLABEL_H_
+#define LABELUSESMBR 1 /* use MBR partitionning */
#define LABELSECTOR 1 /* sector containing label */
#define LABELOFFSET 0 /* offset of label in sector */
#define MAXPARTITIONS 16 /* number of partitions */
SUBDIR+= arch/i386/stand/bootxx
SUBDIR+= arch/i386/stand/boot
SUBDIR+= arch/i386/stand/cdboot
+SUBDIR+= ufs
.include <bsd.subdir.mk>
--- /dev/null
+# $NetBSD: Makefile,v 1.2 2002/11/26 23:30:35 lukem Exp $
+
+SUBDIR= ffs lfs mfs ufs ext2fs
+
+INCSDIR= /usr/include/ufs
+
+.include <bsd.kinc.mk>
--- /dev/null
+/* $NetBSD: chfs_build.c,v 1.2 2011/11/24 21:22:39 agc Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+//#include </root/xipffs/netbsd.chfs/chfs.h>
+
+
+void
+chfs_calc_trigger_levels(struct chfs_mount *chmp)
+{
+ uint32_t size;
+
+ chmp->chm_resv_blocks_deletion = 2;
+
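+	/* Space estimate used for the write reserve: 2% of the flash size plus
+	 * ~100 bytes per physical eraseblock, rounded up to whole eraseblocks
+	 * when divided by eb_size below. */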
+ size = chmp->chm_ebh->flash_size / 50; //2% of flash size
+ size += chmp->chm_ebh->peb_nr * 100;
+ size += chmp->chm_ebh->eb_size - 1;
+
+ chmp->chm_resv_blocks_write =
+ chmp->chm_resv_blocks_deletion + (size / chmp->chm_ebh->eb_size);
+ chmp->chm_resv_blocks_gctrigger = chmp->chm_resv_blocks_write + 1;
+ chmp->chm_resv_blocks_gcmerge = chmp->chm_resv_blocks_deletion + 1;
+ chmp->chm_vdirty_blocks_gctrigger = chmp->chm_resv_blocks_gctrigger * 10;
+
+ chmp->chm_nospc_dirty =
+ chmp->chm_ebh->eb_size + (chmp->chm_ebh->flash_size / 100);
+}
+
+
+/**
+ * chfs_build_set_vnodecache_nlink - set pvno and nlink in vnodecaches
+ * @chmp: CHFS main descriptor structure
+ * @vc: vnode cache
+ * This function traverses @vc's directory entries and sets the pvno and nlink
+ * attributes of the vnodes that the dirents' vno fields point to.
+ */
+void
+chfs_build_set_vnodecache_nlink(struct chfs_mount *chmp,
+ struct chfs_vnode_cache *vc)
+{
+ struct chfs_dirent *fd;
+ //dbg("set nlink\n");
+
+// for (fd = vc->scan_dirents; fd; fd = fd->next) {
+ TAILQ_FOREACH(fd, &vc->scan_dirents, fds) {
+ struct chfs_vnode_cache *child_vc;
+
+ if (!fd->vno)
+ continue;
+
+ mutex_enter(&chmp->chm_lock_vnocache);
+ child_vc = chfs_vnode_cache_get(chmp, fd->vno);
+ mutex_exit(&chmp->chm_lock_vnocache);
+ if (!child_vc) {
+ chfs_mark_node_obsolete(chmp, fd->nref);
+ continue;
+ }
+ if (fd->type == VDIR) {
+ if (child_vc->nlink < 1)
+ child_vc->nlink = 1;
+
+ if (child_vc->pvno) {
+ chfs_err("found a hard link: child dir: %s"
+ ", (vno: %llu) of dir vno: %llu\n",
+ fd->name, (unsigned long long)fd->vno,
+ (unsigned long long)vc->vno);
+ } else {
+ //dbg("child_vc->pvno =
+ // vc->vno; pvno = %d\n", child_vc->pvno);
+ child_vc->pvno = vc->vno;
+ }
+ }
+ child_vc->nlink++;
+ //dbg("child_vc->nlink++;\n");
+ //child_vc->nlink++;
+ vc->nlink++;
+ }
+}
+
+/**
+ * chfs_build_remove_unlinked_vnode - remove an unlinked vnode
+ */
+/* static */
+void
+chfs_build_remove_unlinked_vnode(struct chfs_mount *chmp,
+ struct chfs_vnode_cache *vc,
+// struct chfs_dirent **unlinked)
+ struct chfs_dirent_list *unlinked)
+{
+ struct chfs_node_ref *nref;
+ struct chfs_dirent *fd, *tmpfd;
+
+ dbg("START\n");
+ dbg("vno: %llu\n", (unsigned long long)vc->vno);
+
+ nref = vc->dnode;
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ // The vnode cache is at the end of the data node's chain
+ while (nref != (struct chfs_node_ref *)vc) {
+ struct chfs_node_ref *next = nref->nref_next;
+ dbg("mark dnode\n");
+ chfs_mark_node_obsolete(chmp, nref);
+ nref = next;
+ }
+ nref = vc->dirents;
+ // The vnode cache is at the end of the dirent node's chain
+ while (nref != (struct chfs_node_ref *)vc) {
+ struct chfs_node_ref *next = nref->nref_next;
+ dbg("mark dirent\n");
+ chfs_mark_node_obsolete(chmp, nref);
+ nref = next;
+ }
+ if (!TAILQ_EMPTY(&vc->scan_dirents)) {
+ TAILQ_FOREACH_SAFE(fd, &vc->scan_dirents, fds, tmpfd) {
+// while (vc->scan_dirents) {
+ struct chfs_vnode_cache *child_vc;
+// fd = vc->scan_dirents;
+ dbg("dirent dump:\n");
+ dbg(" ->vno: %llu\n", (unsigned long long)fd->vno);
+ dbg(" ->version: %llu\n", (unsigned long long)fd->version);
+ dbg(" ->nhash: 0x%x\n", fd->nhash);
+ dbg(" ->nsize: %d\n", fd->nsize);
+ dbg(" ->name: %s\n", fd->name);
+ dbg(" ->type: %d\n", fd->type);
+// vc->scan_dirents = fd->next;
+ TAILQ_REMOVE(&vc->scan_dirents, fd, fds);
+
+ if (!fd->vno) {
+ chfs_free_dirent(fd);
+ continue;
+ }
+ mutex_enter(&chmp->chm_lock_vnocache);
+ child_vc = chfs_vnode_cache_get(chmp, fd->vno);
+ mutex_exit(&chmp->chm_lock_vnocache);
+ if (!child_vc) {
+ chfs_free_dirent(fd);
+ continue;
+ }
+ /**
+			 * Decrease nlink in the child. If it drops to 0, add the
+			 * dirent to the unlinked list, otherwise just free it.
+ */
+ child_vc->nlink--;
+
+ if (!child_vc->nlink) {
+ //dbg("nlink is 0\n");
+// fd->next = *unlinked;
+// *unlinked = fd;
+ // XXX HEAD or TAIL?
+ // original code did HEAD, but we could add
+ // it to the TAIL easily with TAILQ.
+ TAILQ_INSERT_TAIL(unlinked, fd, fds);
+ } else {
+ chfs_free_dirent(fd);
+ }
+ }
+ } else {
+ dbg("there are no scan dirents\n");
+ }
+
+ nref = vc->v;
+ while ((struct chfs_vnode_cache *)nref != vc) {
+ if (!CHFS_REF_OBSOLETE(nref))
+ chfs_mark_node_obsolete(chmp, nref);
+ nref = nref->nref_next;
+ }
+
+ mutex_enter(&chmp->chm_lock_vnocache);
+ if (vc->vno != CHFS_ROOTINO)
+ chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_UNCHECKED);
+ mutex_exit(&chmp->chm_lock_vnocache);
+ dbg("END\n");
+}
+
+/**
+ * chfs_build_filesystem - build in-memory representation of filesystem
+ * @chmp: super block information
+ *
+ * Step 1:
+ * This function scans through the eraseblocks mapped in EBH.
+ * During the scan it builds up the map of vnodes and directory entries and
+ * puts them into the vnode cache.
+ * Step 2:
+ * Scans the directory tree and sets the nlink in the vnode caches.
+ * Step 3:
+ * Scans the vnode caches with nlink = 0 and removes the unlinked vnodes.
+ */
+int
+chfs_build_filesystem(struct chfs_mount *chmp)
+{
+ int i,err = 0;
+ struct chfs_vnode_cache *vc;
+ struct chfs_dirent *fd, *tmpfd;
+// struct chfs_dirent *unlinked = NULL;
+ struct chfs_node_ref **nref;
+ struct chfs_dirent_list unlinked;
+ struct chfs_vnode_cache *notregvc;
+
+ TAILQ_INIT(&unlinked);
+
+ mutex_enter(&chmp->chm_lock_mountfields);
+
+ /**
+ * Step 1
+ */
+ chmp->chm_flags |= CHFS_MP_FLAG_SCANNING;
+ for (i = 0; i < chmp->chm_ebh->peb_nr; i++) {
+ //dbg("processing block: %d\n", i);
+ chmp->chm_blocks[i].lnr = i;
+ chmp->chm_blocks[i].free_size = chmp->chm_ebh->eb_size;
+		//If the LEB is unmapped, add it to the free list and skip it.
+ if (chmp->chm_ebh->lmap[i] < 0) {
+ //dbg("block %d is unmapped\n", i);
+ TAILQ_INSERT_TAIL(&chmp->chm_free_queue,
+ &chmp->chm_blocks[i], queue);
+ chmp->chm_nr_free_blocks++;
+ continue;
+ }
+
+ err = chfs_scan_eraseblock(chmp, &chmp->chm_blocks[i]);
+ switch (err) {
+ case CHFS_BLK_STATE_FREE:
+ chmp->chm_nr_free_blocks++;
+ TAILQ_INSERT_TAIL(&chmp->chm_free_queue,
+ &chmp->chm_blocks[i], queue);
+ break;
+ case CHFS_BLK_STATE_CLEAN:
+ TAILQ_INSERT_TAIL(&chmp->chm_clean_queue,
+ &chmp->chm_blocks[i], queue);
+ break;
+ case CHFS_BLK_STATE_PARTDIRTY:
+ //dbg("free size: %d\n", chmp->chm_blocks[i].free_size);
+ if (chmp->chm_blocks[i].free_size > chmp->chm_wbuf_pagesize &&
+ (!chmp->chm_nextblock ||
+ chmp->chm_blocks[i].free_size >
+ chmp->chm_nextblock->free_size)) {
+ /* convert the old nextblock's free size to
+ * dirty and put it on a list */
+ if (chmp->chm_nextblock) {
+ err = chfs_close_eraseblock(chmp,
+ chmp->chm_nextblock);
+ if (err)
+ return err;
+ }
+ chmp->chm_nextblock = &chmp->chm_blocks[i];
+ } else {
+ /* convert the scanned block's free size to
+ * dirty and put it on a list */
+ err = chfs_close_eraseblock(chmp,
+ &chmp->chm_blocks[i]);
+ if (err)
+ return err;
+ }
+ break;
+ case CHFS_BLK_STATE_ALLDIRTY:
+ /*
+ * The block has a valid EBH header, but it doesn't
+ * contain any valid data.
+ */
+ TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue,
+ &chmp->chm_blocks[i], queue);
+ chmp->chm_nr_erasable_blocks++;
+ break;
+ default:
+ /* It was an error, unknown state */
+ break;
+ }
+
+ }
+ chmp->chm_flags &= ~CHFS_MP_FLAG_SCANNING;
+
+
+ //TODO need bad block check (and bad block handling in EBH too!!)
+	/* For now EBH only checks whether a block is bad during its scan
+	 * operation. Checks are also needed at erase + write + read...
+ */
+
+ /**
+ * Step 2
+ */
+ chmp->chm_flags |= CHFS_MP_FLAG_BUILDING;
+ for (i = 0; i < VNODECACHE_SIZE; i++) {
+ vc = chmp->chm_vnocache_hash[i];
+ while (vc) {
+ dbg("vc->vno: %llu\n", (unsigned long long)vc->vno);
+ if (!TAILQ_EMPTY(&vc->scan_dirents))
+ chfs_build_set_vnodecache_nlink(chmp, vc);
+ vc = vc->next;
+ }
+ }
+
+ /**
+ * Step 3
+ * Scan for vnodes with 0 nlink.
+ */
+ for (i = 0; i < VNODECACHE_SIZE; i++) {
+ vc = chmp->chm_vnocache_hash[i];
+ while (vc) {
+ if (vc->nlink) {
+ vc = vc->next;
+ continue;
+ }
+
+ //dbg("remove unlinked start i: %d\n", i);
+ chfs_build_remove_unlinked_vnode(chmp,
+ vc, &unlinked);
+ //dbg("remove unlinked end\n");
+ vc = vc->next;
+ }
+ }
+ /* Remove the newly unlinked vnodes. They are on the unlinked list */
+ TAILQ_FOREACH_SAFE(fd, &unlinked, fds, tmpfd) {
+// while (unlinked) {
+// fd = unlinked;
+// unlinked = fd->next;
+ TAILQ_REMOVE(&unlinked, fd, fds);
+ mutex_enter(&chmp->chm_lock_vnocache);
+ vc = chfs_vnode_cache_get(chmp, fd->vno);
+ mutex_exit(&chmp->chm_lock_vnocache);
+ if (vc) {
+ chfs_build_remove_unlinked_vnode(chmp,
+ vc, &unlinked);
+ }
+ chfs_free_dirent(fd);
+ }
+
+ chmp->chm_flags &= ~CHFS_MP_FLAG_BUILDING;
+
+ /* Free all dirents */
+ for (i = 0; i < VNODECACHE_SIZE; i++) {
+ vc = chmp->chm_vnocache_hash[i];
+ while (vc) {
+ TAILQ_FOREACH_SAFE(fd, &vc->scan_dirents, fds, tmpfd) {
+// while (vc->scan_dirents) {
+// fd = vc->scan_dirents;
+// vc->scan_dirents = fd->next;
+ TAILQ_REMOVE(&vc->scan_dirents, fd, fds);
+ if (fd->vno == 0) {
+ //for (nref = &vc->dirents;
+ // *nref != fd->nref;
+ // nref = &((*nref)->next));
+
+ nref = &fd->nref;
+ *nref = fd->nref->nref_next;
+ //fd->nref->nref_next = NULL;
+ } else if (fd->type == VDIR) {
+ //set state every non-VREG file's vc
+ mutex_enter(&chmp->chm_lock_vnocache);
+ notregvc =
+ chfs_vnode_cache_get(chmp,
+ fd->vno);
+ chfs_vnode_cache_set_state(chmp,
+ notregvc, VNO_STATE_PRESENT);
+ mutex_exit(&chmp->chm_lock_vnocache);
+ }
+ chfs_free_dirent(fd);
+ }
+// vc->scan_dirents = NULL;
+ KASSERT(TAILQ_EMPTY(&vc->scan_dirents));
+ vc = vc->next;
+ }
+ }
+
+ //Set up chmp->chm_wbuf_ofs for the first write
+ if (chmp->chm_nextblock) {
+ dbg("free_size: %d\n", chmp->chm_nextblock->free_size);
+ chmp->chm_wbuf_ofs = chmp->chm_ebh->eb_size -
+ chmp->chm_nextblock->free_size;
+ } else {
+ chmp->chm_wbuf_ofs = 0xffffffff;
+ }
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ return 0;
+}
+
--- /dev/null
+/* $NetBSD: chfs_erase.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (c) 2010 David Tengeri <dtengeri@inf.u-szeged.hu>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * chfs_erase.c
+ *
+ * Copyright (C) 2010 David Tengeri <dtengeri@inf.u-szeged.hu>,
+ * ...
+ * University of Szeged, Hungary
+ */
+
+#include "chfs.h"
+
+
+/**
+ * chfs_remap_leb - unmap and then map a leb
+ * @chmp: chfs mount structure
+ *
+ * This function gets an eraseblock from the erasable queue, unmaps it through
+ * EBH and maps another eraseblock to the same LNR.
+ * EBH will find a free eraseblock if there is one, or will erase a dirty one
+ * if no free block is available.
+ *
+ * Returns zero in case of success, an error code otherwise.
+ *
+ * Needs more brainstorming here.
+ */
+int
+chfs_remap_leb(struct chfs_mount *chmp)
+{
+ int err;
+ struct chfs_eraseblock *cheb;
+ dbg("chfs_remap_leb\n");
+ uint32_t dirty, unchecked, used, free, wasted;
+
+ //dbg("chmp->chm_nr_erasable_blocks: %d\n", chmp->chm_nr_erasable_blocks);
+ //dbg("ltree: %p ecl: %p\n", &chmp->chm_ebh->ltree_lock, &chmp->chm_lock_sizes);
+ KASSERT(!rw_write_held(&chmp->chm_lock_wbuf));
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+
+ if (!chmp->chm_nr_erasable_blocks) {
+ //TODO
+		/* We don't have any erasable blocks; check whether there are
+		 * blocks on the erasable_pending_wbuf_queue, flush their data
+		 * and then we can remap them.
+		 * If there aren't any blocks on that list either, we need to GC?
+ */
+ if (!TAILQ_EMPTY(&chmp->chm_erasable_pending_wbuf_queue)) {
+ cheb = TAILQ_FIRST(&chmp->chm_erasable_pending_wbuf_queue);
+ TAILQ_REMOVE(&chmp->chm_erasable_pending_wbuf_queue, cheb, queue);
+ if (chmp->chm_wbuf_len) {
+ mutex_exit(&chmp->chm_lock_sizes);
+ chfs_flush_pending_wbuf(chmp);
+ mutex_enter(&chmp->chm_lock_sizes);
+ }
+ TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue, cheb, queue);
+ chmp->chm_nr_erasable_blocks++;
+ } else {
+ /* We can't delete any block. */
+ //FIXME should we return ENOSPC?
+ return ENOSPC;
+ }
+ }
+ cheb = TAILQ_FIRST(&chmp->chm_erase_pending_queue);
+ TAILQ_REMOVE(&chmp->chm_erase_pending_queue, cheb, queue);
+ chmp->chm_nr_erasable_blocks--;
+
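+	// Remember the old block's accounting so the size counters can be
+	// reset to a completely free block once the LEB has been remapped.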
+ dirty = cheb->dirty_size;
+ unchecked = cheb->unchecked_size;
+ used = cheb->used_size;
+ free = cheb->free_size;
+ wasted = cheb->wasted_size;
+
+ // Free allocated node references for this eraseblock
+ chfs_free_node_refs(cheb);
+
+ err = chfs_unmap_leb(chmp, cheb->lnr);
+ if (err)
+ return err;
+
+ err = chfs_map_leb(chmp, cheb->lnr);
+ if (err)
+ return err;
+ // Reset state to default and change chmp sizes too
+ chfs_change_size_dirty(chmp, cheb, -dirty);
+ chfs_change_size_unchecked(chmp, cheb, -unchecked);
+ chfs_change_size_used(chmp, cheb, -used);
+ chfs_change_size_free(chmp, cheb, chmp->chm_ebh->eb_size - free);
+ chfs_change_size_wasted(chmp, cheb, -wasted);
+
+ KASSERT(cheb->dirty_size == 0);
+ KASSERT(cheb->unchecked_size == 0);
+ KASSERT(cheb->used_size == 0);
+ KASSERT(cheb->free_size == chmp->chm_ebh->eb_size);
+ KASSERT(cheb->wasted_size == 0);
+
+ cheb->first_node = NULL;
+ cheb->last_node = NULL;
+ //put it to free_queue
+ TAILQ_INSERT_TAIL(&chmp->chm_free_queue, cheb, queue);
+ chmp->chm_nr_free_blocks++;
+	dbg("remapped (free: %d, erasable: %d)\n", chmp->chm_nr_free_blocks, chmp->chm_nr_erasable_blocks);
+ KASSERT(!TAILQ_EMPTY(&chmp->chm_free_queue));
+
+ return 0;
+}
--- /dev/null
+/* $NetBSD: chfs_gc.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (c) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (c) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+
+void chfs_gc_release_inode(struct chfs_mount *,
+ struct chfs_inode *);
+struct chfs_inode *chfs_gc_fetch_inode(struct chfs_mount *,
+ ino_t, uint32_t);
+int chfs_check(struct chfs_mount *, struct chfs_vnode_cache *);
+void chfs_clear_inode(struct chfs_mount *, struct chfs_inode *);
+
+
+struct chfs_eraseblock *find_gc_block(struct chfs_mount *);
+int chfs_gcollect_pristine(struct chfs_mount *,
+ struct chfs_eraseblock *,
+ struct chfs_vnode_cache *, struct chfs_node_ref *);
+int chfs_gcollect_live(struct chfs_mount *,
+ struct chfs_eraseblock *, struct chfs_node_ref *,
+ struct chfs_inode *);
+int chfs_gcollect_vnode(struct chfs_mount *, struct chfs_inode *);
+int chfs_gcollect_dirent(struct chfs_mount *,
+ struct chfs_eraseblock *, struct chfs_inode *,
+ struct chfs_dirent *);
+int chfs_gcollect_deletion_dirent(struct chfs_mount *,
+ struct chfs_eraseblock *, struct chfs_inode *,
+ struct chfs_dirent *);
+int chfs_gcollect_dnode(struct chfs_mount *,
+ struct chfs_eraseblock *, struct chfs_inode *,
+ struct chfs_full_dnode *, uint32_t, uint32_t);
+
+/* must be called with chm_lock_mountfields held */
+void
+chfs_gc_trigger(struct chfs_mount *chmp)
+{
+ struct garbage_collector_thread *gc = &chmp->chm_gc_thread;
+
+ //mutex_enter(&chmp->chm_lock_sizes);
+ if (gc->gcth_running &&
+ chfs_gc_thread_should_wake(chmp)) {
+ cv_signal(&gc->gcth_wakeup);
+ }
+ //mutex_exit(&chmp->chm_lock_sizes);
+}
+
+
+void
+chfs_gc_thread(void *data)
+{
+ struct chfs_mount *chmp = data;
+ struct garbage_collector_thread *gc = &chmp->chm_gc_thread;
+
+ dbg_gc("[GC THREAD] thread started\n");
+
+ mutex_enter(&chmp->chm_lock_mountfields);
+ while (gc->gcth_running) {
+ /* we must call chfs_gc_thread_should_wake with chm_lock_mountfields
+		 * held, which is a bit awkward here, but we can't really
+		 * do it any other way with the current design...
+ */
+ if (chfs_gc_thread_should_wake(chmp)) {
+// mutex_exit(&chmp->chm_lock_mountfields);
+ if (chfs_gcollect_pass(chmp) == ENOSPC) {
+ dbg_gc("No space for garbage collection\n");
+ panic("No space for garbage collection\n");
+ /* XXX why break here? i have added a panic
+ * here to see if it gets triggered -ahoka
+ */
+ break;
+ }
+ /* XXX gcollect_pass drops the mutex */
+ mutex_enter(&chmp->chm_lock_mountfields);
+ }
+
+ cv_timedwait_sig(&gc->gcth_wakeup,
+ &chmp->chm_lock_mountfields, mstohz(100));
+ }
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ dbg_gc("[GC THREAD] thread stopped\n");
+ kthread_exit(0);
+}
+
+void
+chfs_gc_thread_start(struct chfs_mount *chmp)
+{
+ struct garbage_collector_thread *gc = &chmp->chm_gc_thread;
+
+ cv_init(&gc->gcth_wakeup, "chfsgccv");
+
+ gc->gcth_running = true;
+ kthread_create(PRI_NONE, /*KTHREAD_MPSAFE |*/ KTHREAD_MUSTJOIN,
+ NULL, chfs_gc_thread, chmp, &gc->gcth_thread,
+ "chfsgcth");
+}
+
+void
+chfs_gc_thread_stop(struct chfs_mount *chmp)
+{
+ struct garbage_collector_thread *gc = &chmp->chm_gc_thread;
+
+ /* check if it is actually running. if not, do nothing */
+ if (gc->gcth_running) {
+ gc->gcth_running = false;
+ } else {
+ return;
+ }
+ cv_signal(&gc->gcth_wakeup);
+ dbg_gc("[GC THREAD] stop signal sent\n");
+
+ kthread_join(gc->gcth_thread);
+#ifdef BROKEN_KTH_JOIN
+ kpause("chfsthjoin", false, mstohz(1000), NULL);
+#endif
+
+ cv_destroy(&gc->gcth_wakeup);
+}
+
+/* must be called with chm_lock_mountfields held */
+int
+chfs_gc_thread_should_wake(struct chfs_mount *chmp)
+{
+ int nr_very_dirty = 0;
+ struct chfs_eraseblock *cheb;
+ uint32_t dirty;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) {
+ dbg_gc("erase_pending\n");
+ return 1;
+ }
+
+ if (chmp->chm_unchecked_size) {
+ dbg_gc("unchecked\n");
+ return 1;
+ }
+
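+	/* Dirty space that is not already sitting in fully erasable blocks. */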
+ dirty = chmp->chm_dirty_size - chmp->chm_nr_erasable_blocks *
+ chmp->chm_ebh->eb_size;
+
+ if (chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks <
+ chmp->chm_resv_blocks_gctrigger && (dirty > chmp->chm_nospc_dirty)) {
+ dbg_gc("free: %d + erasable: %d < resv: %d\n",
+ chmp->chm_nr_free_blocks, chmp->chm_nr_erasable_blocks,
+ chmp->chm_resv_blocks_gctrigger);
+ dbg_gc("dirty: %d > nospc_dirty: %d\n",
+ dirty, chmp->chm_nospc_dirty);
+
+ return 1;
+ }
+
+ TAILQ_FOREACH(cheb, &chmp->chm_very_dirty_queue, queue) {
+ nr_very_dirty++;
+ if (nr_very_dirty == chmp->chm_vdirty_blocks_gctrigger) {
+ dbg_gc("nr_very_dirty\n");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+void
+chfs_gc_release_inode(struct chfs_mount *chmp,
+ struct chfs_inode *ip)
+{
+ dbg_gc("release inode\n");
+ //mutex_exit(&ip->inode_lock);
+ //vput(ITOV(ip));
+}
+
+struct chfs_inode *
+chfs_gc_fetch_inode(struct chfs_mount *chmp, ino_t vno,
+ uint32_t unlinked)
+{
+ struct vnode *vp = NULL;
+ struct chfs_vnode_cache *vc;
+ struct chfs_inode *ip;
+ dbg_gc("fetch inode %llu\n", (unsigned long long)vno);
+
+ if (unlinked) {
+ dbg_gc("unlinked\n");
+ vp = chfs_vnode_lookup(chmp, vno);
+ if (!vp) {
+ mutex_enter(&chmp->chm_lock_vnocache);
+ vc = chfs_vnode_cache_get(chmp, vno);
+ if (!vc) {
+ mutex_exit(&chmp->chm_lock_vnocache);
+ return NULL;
+ }
+ if (vc->state != VNO_STATE_CHECKEDABSENT) {
+ //sleep_on_spinunlock(&chmp->chm_lock_vnocache);
+ mutex_exit(&chmp->chm_lock_vnocache);
+ /* XXX why do we need the delay here?! */
+// kpause("chvncabs", true, mstohz(50), NULL);
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ cv_timedwait_sig(
+ &chmp->chm_gc_thread.gcth_wakeup,
+ &chmp->chm_lock_mountfields, mstohz(50));
+
+// KASSERT(!mutex_owned(&chmp->chm_lock_vnocache));
+ } else {
+ mutex_exit(&chmp->chm_lock_vnocache);
+ }
+ return NULL;
+ }
+ } else {
+ dbg_gc("vnode lookup\n");
+ vp = chfs_vnode_lookup(chmp, vno);
+ //VFS_VGET(chmp->chm_fsmp, vno, &vp);
+ }
+ dbg_gc("vp to ip\n");
+ ip = VTOI(vp);
+ KASSERT(ip);
+ //mutex_enter(&ip->inode_lock);
+
+ return ip;
+}
+
+extern rb_tree_ops_t frag_rbtree_ops;
+
+int
+chfs_check(struct chfs_mount *chmp, struct chfs_vnode_cache *chvc)
+{
+ struct chfs_inode *ip;
+ struct vnode *vp;
+ int ret;
+
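+	/* Build a throwaway in-memory inode (with a bare vnode) only so the
+	 * node chains behind this vnode cache can be read and verified; it is
+	 * released again before returning. */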
+ ip = pool_get(&chfs_inode_pool, PR_WAITOK);
+ if (!ip) {
+ return ENOMEM;
+ }
+
+ vp = kmem_zalloc(sizeof(struct vnode), KM_SLEEP);
+
+ ip->chvc = chvc;
+ ip->vp = vp;
+
+ vp->v_data = ip;
+
+ rb_tree_init(&ip->fragtree, &frag_rbtree_ops);
+ TAILQ_INIT(&ip->dents);
+
+ ret = chfs_read_inode_internal(chmp, ip);
+ if (!ret) {
+ chfs_clear_inode(chmp, ip);
+ }
+
+ pool_put(&chfs_inode_pool, ip);
+
+ return ret;
+}
+
+void
+chfs_clear_inode(struct chfs_mount *chmp, struct chfs_inode *ip)
+{
+ struct chfs_dirent *fd, *tmpfd;
+ struct chfs_vnode_cache *chvc;
+
+
+ /* XXX not sure if this is the correct locking */
+// mutex_enter(&chmp->chm_lock_vnocache);
+ chvc = ip->chvc;
+	/* shouldn't this be: */
+ //bool deleted = (chvc && !(chvc->pvno || chvc->nlink));
+ int deleted = (chvc && !(chvc->pvno | chvc->nlink));
+
+ if (chvc && chvc->state != VNO_STATE_CHECKING) {
+// chfs_vnode_cache_state_set(chmp, chvc, VNO_STATE_CLEARING);
+ chvc->state = VNO_STATE_CLEARING;
+ }
+
+ if (chvc->v && ((struct chfs_vnode_cache *)chvc->v != chvc)) {
+ if (deleted)
+ chfs_mark_node_obsolete(chmp, chvc->v);
+ //chfs_free_refblock(chvc->v);
+ }
+// mutex_enter(&chmp->chm_lock_vnocache);
+
+ chfs_kill_fragtree(&ip->fragtree);
+/*
+ fd = TAILQ_FIRST(&ip->dents);
+ while (fd) {
+ TAILQ_REMOVE(&ip->dents, fd, fds);
+ chfs_free_dirent(fd);
+ fd = TAILQ_FIRST(&ip->dents);
+ }
+*/
+
+ TAILQ_FOREACH_SAFE(fd, &ip->dents, fds, tmpfd) {
+ chfs_free_dirent(fd);
+ }
+
+ if (chvc && chvc->state == VNO_STATE_CHECKING) {
+ chfs_vnode_cache_set_state(chmp,
+ chvc, VNO_STATE_CHECKEDABSENT);
+ if ((struct chfs_vnode_cache *)chvc->v == chvc &&
+ (struct chfs_vnode_cache *)chvc->dirents == chvc &&
+ (struct chfs_vnode_cache *)chvc->dnode == chvc)
+ chfs_vnode_cache_remove(chmp, chvc);
+ }
+
+}
+
+struct chfs_eraseblock *
+find_gc_block(struct chfs_mount *chmp)
+{
+ struct chfs_eraseblock *ret;
+ struct chfs_eraseblock_queue *nextqueue;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ struct timespec now;
+ vfs_timestamp(&now);
+
+ int n = now.tv_nsec % 128;
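+	/* Pseudo-random queue selection from the timestamp: n in [0,128)
+	 * prefers erase_pending (n < 50), then very_dirty (n < 110), then
+	 * dirty (n < 126), otherwise clean; empty queues fall through to
+	 * the later cases below. */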
+
+ //dbg_gc("n = %d\n", n);
+again:
+/* if (!TAILQ_EMPTY(&chmp->chm_bad_used_queue) && chmp->chm_nr_free_blocks > chmp->chm_nr_resv_blocks_gcbad) {
+ dbg_gc("Picking block from bad_used_queue to GC next\n");
+ nextqueue = &chmp->chm_bad_used_queue;
+ } else */if (n<50 && !TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) {
+ dbg_gc("Picking block from erase_pending_queue to GC next\n");
+ nextqueue = &chmp->chm_erase_pending_queue;
+ } else if (n<110 && !TAILQ_EMPTY(&chmp->chm_very_dirty_queue) ) {
+ dbg_gc("Picking block from very_dirty_queue to GC next\n");
+ nextqueue = &chmp->chm_very_dirty_queue;
+ } else if (n<126 && !TAILQ_EMPTY(&chmp->chm_dirty_queue) ) {
+ dbg_gc("Picking block from dirty_queue to GC next\n");
+ nextqueue = &chmp->chm_dirty_queue;
+ } else if (!TAILQ_EMPTY(&chmp->chm_clean_queue)) {
+ dbg_gc("Picking block from clean_queue to GC next\n");
+ nextqueue = &chmp->chm_clean_queue;
+ } else if (!TAILQ_EMPTY(&chmp->chm_dirty_queue)) {
+ dbg_gc("Picking block from dirty_queue to GC next"
+ " (clean_queue was empty)\n");
+ nextqueue = &chmp->chm_dirty_queue;
+ } else if (!TAILQ_EMPTY(&chmp->chm_very_dirty_queue)) {
+ dbg_gc("Picking block from very_dirty_queue to GC next"
+ " (clean_queue and dirty_queue were empty)\n");
+ nextqueue = &chmp->chm_very_dirty_queue;
+ } else if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) {
+ dbg_gc("Picking block from erase_pending_queue to GC next"
+ " (clean_queue and {very_,}dirty_queue were empty)\n");
+ nextqueue = &chmp->chm_erase_pending_queue;
+ } else if (!TAILQ_EMPTY(&chmp->chm_erasable_pending_wbuf_queue)) {
+ dbg_gc("Synching wbuf in order to reuse "
+		    "erasable_pending_wbuf_queue blocks\n");
+ rw_enter(&chmp->chm_lock_wbuf, RW_WRITER);
+ chfs_flush_pending_wbuf(chmp);
+ rw_exit(&chmp->chm_lock_wbuf);
+ goto again;
+ } else {
+ dbg_gc("CHFS: no clean, dirty _or_ erasable"
+ " blocks to GC from! Where are they all?\n");
+ return NULL;
+ }
+
+ ret = TAILQ_FIRST(nextqueue);
+ if (chmp->chm_nextblock) {
+ dbg_gc("nextblock num: %u - gcblock num: %u\n",
+ chmp->chm_nextblock->lnr, ret->lnr);
+ if (ret == chmp->chm_nextblock)
+ goto again;
+ //KASSERT(ret != chmp->chm_nextblock);
+ //dbg_gc("first node lnr: %u ofs: %u\n", ret->first_node->lnr, ret->first_node->offset);
+ //dbg_gc("last node lnr: %u ofs: %u\n", ret->last_node->lnr, ret->last_node->offset);
+ }
+ TAILQ_REMOVE(nextqueue, ret, queue);
+ chmp->chm_gcblock = ret;
+ ret->gc_node = ret->first_node;
+
+ if (!ret->gc_node) {
+ dbg_gc("Oops! ret->gc_node at LEB: %u is NULL\n", ret->lnr);
+ panic("CHFS BUG - one LEB's gc_node is NULL\n");
+ }
+
+ /* TODO wasted size? */
+ return ret;
+}
+
+
+int
+chfs_gcollect_pass(struct chfs_mount *chmp)
+{
+ struct chfs_vnode_cache *vc;
+ struct chfs_eraseblock *eb;
+ struct chfs_node_ref *nref;
+ uint32_t gcblock_dirty;
+ struct chfs_inode *ip;
+ ino_t vno, pvno;
+ uint32_t nlink;
+ int ret = 0;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+// mutex_enter(&chmp->chm_lock_mountfields);
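+	/* First phase: while unchecked space remains, pick the next unchecked
+	 * vnode and run chfs_check() on it instead of collecting a block. */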
+ for (;;) {
+ mutex_enter(&chmp->chm_lock_sizes);
+
+ dbg_gc("unchecked size == %u\n", chmp->chm_unchecked_size);
+ if (!chmp->chm_unchecked_size)
+ break;
+
+ if (chmp->chm_checked_vno > chmp->chm_max_vno) {
+ mutex_exit(&chmp->chm_lock_sizes);
+ mutex_exit(&chmp->chm_lock_mountfields);
+ dbg_gc("checked_vno (#%llu) > max_vno (#%llu)\n",
+ (unsigned long long)chmp->chm_checked_vno,
+ (unsigned long long)chmp->chm_max_vno);
+ return ENOSPC;
+ }
+
+ mutex_exit(&chmp->chm_lock_sizes);
+
+ mutex_enter(&chmp->chm_lock_vnocache);
+ dbg_gc("checking vno #%llu\n",
+ (unsigned long long)chmp->chm_checked_vno);
+ dbg_gc("get vnode cache\n");
+ vc = chfs_vnode_cache_get(chmp, chmp->chm_checked_vno++);
+
+ if (!vc) {
+ dbg_gc("!vc\n");
+ mutex_exit(&chmp->chm_lock_vnocache);
+ continue;
+ }
+
+ if ((vc->pvno | vc->nlink) == 0) {
+ dbg_gc("(pvno | nlink) == 0\n");
+ mutex_exit(&chmp->chm_lock_vnocache);
+ continue;
+ }
+
+ dbg_gc("switch\n");
+ switch (vc->state) {
+ case VNO_STATE_CHECKEDABSENT:
+ case VNO_STATE_PRESENT:
+ mutex_exit(&chmp->chm_lock_vnocache);
+ continue;
+
+ case VNO_STATE_GC:
+ case VNO_STATE_CHECKING:
+ mutex_exit(&chmp->chm_lock_vnocache);
+ mutex_exit(&chmp->chm_lock_mountfields);
+ dbg_gc("VNO_STATE GC or CHECKING\n");
+ panic("CHFS BUG - vc state gc or checking\n");
+
+ case VNO_STATE_READING:
+ chmp->chm_checked_vno--;
+ mutex_exit(&chmp->chm_lock_vnocache);
+ /* XXX why do we need the delay here?! */
+ kpause("chvncrea", true, mstohz(50), NULL);
+
+// sleep_on_spinunlock(&chmp->chm_lock_vnocache);
+// KASSERT(!mutex_owned(&chmp->chm_lock_vnocache));
+ mutex_exit(&chmp->chm_lock_mountfields);
+ return 0;
+
+ default:
+ mutex_exit(&chmp->chm_lock_vnocache);
+ mutex_exit(&chmp->chm_lock_mountfields);
+ dbg_gc("default\n");
+			panic("CHFS BUG - vc state is other than what we"
+			    " checked\n");
+
+ case VNO_STATE_UNCHECKED:
+ ;
+ }
+
+ chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_CHECKING);
+
+ /* XXX check if this is too heavy to call under
+ * chm_lock_vnocache
+ */
+ ret = chfs_check(chmp, vc);
+ dbg_gc("set state\n");
+ chfs_vnode_cache_set_state(chmp,
+ vc, VNO_STATE_CHECKEDABSENT);
+
+ mutex_exit(&chmp->chm_lock_vnocache);
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ return ret;
+ }
+
+
+ eb = chmp->chm_gcblock;
+
+ if (!eb) {
+ eb = find_gc_block(chmp);
+ }
+
+ if (!eb) {
+ dbg_gc("!eb\n");
+ if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) {
+ mutex_exit(&chmp->chm_lock_sizes);
+ mutex_exit(&chmp->chm_lock_mountfields);
+ return EAGAIN;
+ }
+ mutex_exit(&chmp->chm_lock_sizes);
+ mutex_exit(&chmp->chm_lock_mountfields);
+ return EIO;
+ }
+
+ if (!eb->used_size) {
+ dbg_gc("!eb->used_size\n");
+ goto eraseit;
+ }
+
+ nref = eb->gc_node;
+ //dbg_gc("gc use: %u\n", chmp->chm_nextblock->lnr);
+ //dbg_gc("nref: %u %u\n", nref->nref_lnr, nref->nref_offset);
+ gcblock_dirty = eb->dirty_size;
+
+ while(CHFS_REF_OBSOLETE(nref)) {
+ //dbg_gc("obsoleted nref lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset);
+#ifdef DBG_MSG_GC
+ if (nref == chmp->chm_blocks[nref->nref_lnr].last_node) {
+ dbg_gc("THIS NODE IS THE LAST NODE OF ITS EB\n");
+ }
+#endif
+ nref = node_next(nref);
+ if (!nref) {
+ //dbg_gc("!nref\n");
+ eb->gc_node = nref;
+ mutex_exit(&chmp->chm_lock_sizes);
+ mutex_exit(&chmp->chm_lock_mountfields);
+ panic("CHFS BUG - nref is NULL)\n");
+ }
+ }
+ eb->gc_node = nref;
+ //dbg_gc("nref the chosen one lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset);
+ KASSERT(nref->nref_lnr == chmp->chm_gcblock->lnr);
+
+ if (!nref->nref_next) {
+ //dbg_gc("!nref->nref_next\n");
+ mutex_exit(&chmp->chm_lock_sizes);
+ if (CHFS_REF_FLAGS(nref) == CHFS_PRISTINE_NODE_MASK) {
+ chfs_gcollect_pristine(chmp, eb, NULL, nref);
+ } else {
+ chfs_mark_node_obsolete(chmp, nref);
+ }
+ goto lock_size;
+ }
+
+ dbg_gc("nref lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset);
+ vc = chfs_nref_to_vc(nref);
+
+ mutex_exit(&chmp->chm_lock_sizes);
+
+ //dbg_gc("enter vnocache lock on #%llu\n", vc->vno);
+ mutex_enter(&chmp->chm_lock_vnocache);
+
+ dbg_gc("switch\n");
+ switch(vc->state) {
+ case VNO_STATE_CHECKEDABSENT:
+ if (CHFS_REF_FLAGS(nref) == CHFS_PRISTINE_NODE_MASK) {
+ chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_GC);
+ }
+ break;
+
+ case VNO_STATE_PRESENT:
+ break;
+
+ case VNO_STATE_UNCHECKED:
+ case VNO_STATE_CHECKING:
+ case VNO_STATE_GC:
+ mutex_exit(&chmp->chm_lock_vnocache);
+ mutex_exit(&chmp->chm_lock_mountfields);
+ panic("CHFS BUG - vc state unchecked,"
+ " checking or gc (vno #%llu, num #%d)\n",
+ (unsigned long long)vc->vno, vc->state);
+
+ case VNO_STATE_READING:
+ mutex_exit(&chmp->chm_lock_vnocache);
+ /* XXX why do we need the delay here?! */
+ kpause("chvncrea", true, mstohz(50), NULL);
+
+// sleep_on_spinunlock(&chmp->chm_lock_vnocache);
+// KASSERT(!mutex_owned(&chmp->chm_lock_vnocache));
+ mutex_exit(&chmp->chm_lock_mountfields);
+ return 0;
+ }
+
+ if (vc->state == VNO_STATE_GC) {
+ dbg_gc("vc->state == VNO_STATE_GC\n");
+ mutex_exit(&chmp->chm_lock_vnocache);
+ ret = chfs_gcollect_pristine(chmp, eb, NULL, nref);
+
+// chfs_vnode_cache_state_set(chmp,
+// vc, VNO_STATE_CHECKEDABSENT);
+ /* XXX locking? */
+ vc->state = VNO_STATE_CHECKEDABSENT;
+ //TODO wake_up(&chmp->chm_vnocache_wq);
+ if (ret != EBADF)
+ goto test_gcnode;
+ mutex_enter(&chmp->chm_lock_vnocache);
+ }
+
+ vno = vc->vno;
+ pvno = vc->pvno;
+ nlink = vc->nlink;
+ mutex_exit(&chmp->chm_lock_vnocache);
+
+ ip = chfs_gc_fetch_inode(chmp, vno, !(pvno | nlink));
+
+ if (!ip) {
+ dbg_gc("!ip\n");
+ ret = 0;
+ goto lock_size;
+ }
+
+ chfs_gcollect_live(chmp, eb, nref, ip);
+
+ chfs_gc_release_inode(chmp, ip);
+
+test_gcnode:
+ if (eb->dirty_size == gcblock_dirty &&
+ !CHFS_REF_OBSOLETE(eb->gc_node)) {
+ dbg_gc("ERROR collecting node at %u failed.\n",
+ CHFS_GET_OFS(eb->gc_node->nref_offset));
+
+ ret = ENOSPC;
+ }
+
+lock_size:
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ mutex_enter(&chmp->chm_lock_sizes);
+eraseit:
+ dbg_gc("eraseit\n");
+
+ if (chmp->chm_gcblock) {
+ dbg_gc("eb used size = %u\n", chmp->chm_gcblock->used_size);
+ dbg_gc("eb free size = %u\n", chmp->chm_gcblock->free_size);
+ dbg_gc("eb dirty size = %u\n", chmp->chm_gcblock->dirty_size);
+ dbg_gc("eb unchecked size = %u\n",
+ chmp->chm_gcblock->unchecked_size);
+ dbg_gc("eb wasted size = %u\n", chmp->chm_gcblock->wasted_size);
+
+ KASSERT(chmp->chm_gcblock->used_size + chmp->chm_gcblock->free_size +
+ chmp->chm_gcblock->dirty_size +
+ chmp->chm_gcblock->unchecked_size +
+ chmp->chm_gcblock->wasted_size == chmp->chm_ebh->eb_size);
+
+ }
+
+ if (chmp->chm_gcblock && chmp->chm_gcblock->dirty_size +
+ chmp->chm_gcblock->wasted_size == chmp->chm_ebh->eb_size) {
+ dbg_gc("Block at leb #%u completely obsoleted by GC, "
+ "Moving to erase_pending_queue\n", chmp->chm_gcblock->lnr);
+ TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue,
+ chmp->chm_gcblock, queue);
+ chmp->chm_gcblock = NULL;
+ chmp->chm_nr_erasable_blocks++;
+ if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) {
+ ret = chfs_remap_leb(chmp);
+ }
+ }
+
+ mutex_exit(&chmp->chm_lock_sizes);
+ mutex_exit(&chmp->chm_lock_mountfields);
+ dbg_gc("return\n");
+ return ret;
+}
+
+
+int
+chfs_gcollect_pristine(struct chfs_mount *chmp, struct chfs_eraseblock *cheb,
+ struct chfs_vnode_cache *chvc, struct chfs_node_ref *nref)
+{
+ struct chfs_node_ref *newnref;
+ struct chfs_flash_node_hdr *nhdr;
+ struct chfs_flash_vnode *fvnode;
+ struct chfs_flash_dirent_node *fdirent;
+ struct chfs_flash_data_node *fdata;
+ int ret, retries = 0;
+ uint32_t ofs, crc;
+ size_t totlen = chfs_nref_len(chmp, cheb, nref);
+ char *data;
+ struct iovec vec;
+ size_t retlen;
+
+ dbg_gc("gcollect_pristine\n");
+
+ data = kmem_alloc(totlen, KM_SLEEP);
+ if (!data)
+ return ENOMEM;
+
+ ofs = CHFS_GET_OFS(nref->nref_offset);
+
+ ret = chfs_read_leb(chmp, nref->nref_lnr, data, ofs, totlen, &retlen);
+ if (ret) {
+ dbg_gc("reading error\n");
+ return ret;
+ }
+ if (retlen != totlen) {
+ dbg_gc("read size error\n");
+ return EIO;
+ }
+ nhdr = (struct chfs_flash_node_hdr *)data;
+ /* check the header */
+ if (le16toh(nhdr->magic) != CHFS_FS_MAGIC_BITMASK) {
+ dbg_gc("node header magic number error\n");
+ return EBADF;
+ }
+ crc = crc32(0, (uint8_t *)nhdr, CHFS_NODE_HDR_SIZE - 4);
+ if (crc != le32toh(nhdr->hdr_crc)) {
+ dbg_gc("node header crc error\n");
+ return EBADF;
+ }
+
+ switch(le16toh(nhdr->type)) {
+ case CHFS_NODETYPE_VNODE:
+ fvnode = (struct chfs_flash_vnode *)data;
+ crc = crc32(0, (uint8_t *)fvnode, sizeof(struct chfs_flash_vnode) - 4);
+ if (crc != le32toh(fvnode->node_crc)) {
+ dbg_gc("vnode crc error\n");
+ return EBADF;
+ }
+ break;
+ case CHFS_NODETYPE_DIRENT:
+ fdirent = (struct chfs_flash_dirent_node *)data;
+ crc = crc32(0, (uint8_t *)fdirent, sizeof(struct chfs_flash_dirent_node) - 4);
+ if (crc != le32toh(fdirent->node_crc)) {
+ dbg_gc("dirent crc error\n");
+ return EBADF;
+ }
+ crc = crc32(0, fdirent->name, fdirent->nsize);
+ if (crc != le32toh(fdirent->name_crc)) {
+ dbg_gc("dirent name crc error\n");
+ return EBADF;
+ }
+ break;
+ case CHFS_NODETYPE_DATA:
+ fdata = (struct chfs_flash_data_node *)data;
+ crc = crc32(0, (uint8_t *)fdata, sizeof(struct chfs_flash_data_node) - 4);
+ if (crc != le32toh(fdata->node_crc)) {
+ dbg_gc("data node crc error\n");
+ return EBADF;
+ }
+ break;
+ default:
+ if (chvc) {
+ dbg_gc("unknown node have vnode cache\n");
+ return EBADF;
+ }
+ }
+ /* CRC's OK, write node to its new place */
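+	/* A failed write marks the space just taken as dirty and retries once
+	 * at a new location; a second failure returns EIO. */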
+retry:
+ ret = chfs_reserve_space_gc(chmp, totlen);
+ if (ret)
+ return ret;
+
+ newnref = chfs_alloc_node_ref(chmp->chm_nextblock);
+ if (!newnref)
+ return ENOMEM;
+
+ ofs = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size;
+ newnref->nref_offset = ofs;
+
+ vec.iov_base = (void *)data;
+ vec.iov_len = totlen;
+ mutex_enter(&chmp->chm_lock_sizes);
+ ret = chfs_write_wbuf(chmp, &vec, 1, ofs, &retlen);
+
+ if (ret || retlen != totlen) {
+ chfs_err("error while writing out to the media\n");
+ chfs_err("err: %d | size: %zu | retlen : %zu\n",
+ ret, totlen, retlen);
+
+ chfs_change_size_dirty(chmp, chmp->chm_nextblock, totlen);
+ if (retries) {
+ mutex_exit(&chmp->chm_lock_sizes);
+ return EIO;
+ }
+
+ retries++;
+ mutex_exit(&chmp->chm_lock_sizes);
+ goto retry;
+ }
+
+ mutex_exit(&chmp->chm_lock_sizes);
+ //TODO should we set free_size?
+ chfs_mark_node_obsolete(chmp, nref);
+ chfs_add_vnode_ref_to_vc(chmp, chvc, newnref);
+ return 0;
+}
+
+
+int
+chfs_gcollect_live(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, struct chfs_node_ref *nref,
+ struct chfs_inode *ip)
+{
+ struct chfs_node_frag *frag;
+ struct chfs_full_dnode *fn = NULL;
+ int start = 0, end = 0, nrfrags = 0;
+ struct chfs_dirent *fd = NULL;
+ int ret = 0;
+ bool is_dirent;
+
+ dbg_gc("gcollect_live\n");
+
+ if (chmp->chm_gcblock != cheb) {
+ dbg_gc("GC block is no longer gcblock. Restart.\n");
+ goto upnout;
+ }
+
+ if (CHFS_REF_OBSOLETE(nref)) {
+ dbg_gc("node to be GC'd was obsoleted in the meantime.\n");
+ goto upnout;
+ }
+
+ /* It's a vnode? */
+ if (ip->chvc->v == nref) {
+ chfs_gcollect_vnode(chmp, ip);
+ goto upnout;
+ }
+
+ /* find fn */
+ dbg_gc("find full dnode\n");
+ for(frag = frag_first(&ip->fragtree);
+ frag; frag = frag_next(&ip->fragtree, frag)) {
+ if (frag->node && frag->node->nref == nref) {
+ fn = frag->node;
+ end = frag->ofs + frag->size;
+ if (!nrfrags++)
+ start = frag->ofs;
+ if (nrfrags == frag->node->frags)
+ break;
+ }
+ }
+
+	/* It's a pristine node, or dnode (or hole? XXX do we have hole nodes?) */
+ if (fn) {
+ if (CHFS_REF_FLAGS(nref) == CHFS_PRISTINE_NODE_MASK) {
+ ret = chfs_gcollect_pristine(chmp,
+ cheb, ip->chvc, nref);
+ if (!ret) {
+ frag->node->nref = ip->chvc->v;
+ }
+ if (ret != EBADF)
+ goto upnout;
+ }
+ //ret = chfs_gcollect_hole(chmp, cheb, ip, fn, start, end);
+ ret = chfs_gcollect_dnode(chmp, cheb, ip, fn, start, end);
+ goto upnout;
+ }
+
+
+ /* It's a dirent? */
+ dbg_gc("find full dirent\n");
+ is_dirent = false;
+ TAILQ_FOREACH(fd, &ip->dents, fds) {
+ if (fd->nref == nref) {
+ is_dirent = true;
+ break;
+ }
+ }
+
+ if (is_dirent && fd->vno) {
+ ret = chfs_gcollect_dirent(chmp, cheb, ip, fd);
+ } else if (is_dirent) {
+ ret = chfs_gcollect_deletion_dirent(chmp, cheb, ip, fd);
+ } else {
+ dbg_gc("Nref at leb #%u offset 0x%08x wasn't in node list"
+ " for ino #%llu\n",
+ nref->nref_lnr, CHFS_GET_OFS(nref->nref_offset),
+ (unsigned long long)ip->ino);
+ if (CHFS_REF_OBSOLETE(nref)) {
+ dbg_gc("But it's obsolete so we don't mind"
+ " too much.\n");
+ }
+ }
+
+upnout:
+ return ret;
+}
+
+int
+chfs_gcollect_vnode(struct chfs_mount *chmp, struct chfs_inode *ip)
+{
+ int ret;
+ dbg_gc("gcollect_vnode\n");
+
+ ret = chfs_write_flash_vnode(chmp, ip, ALLOC_GC);
+
+ return ret;
+}
+
+int
+chfs_gcollect_dirent(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, struct chfs_inode *parent,
+ struct chfs_dirent *fd)
+{
+ struct vnode *vnode = NULL;
+ struct chfs_inode *ip;
+ struct chfs_node_ref *prev;
+ dbg_gc("gcollect_dirent\n");
+
+ vnode = chfs_vnode_lookup(chmp, fd->vno);
+
+ /* XXX maybe KASSERT or panic on this? */
+ if (vnode == NULL) {
+ return ENOENT;
+ }
+
+ ip = VTOI(vnode);
+
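+	/* Unlink the old dirent node ref from the parent's dirent chain before
+	 * the dirent is written out again at a new location. */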
+ prev = parent->chvc->dirents;
+ if (prev == fd->nref) {
+ parent->chvc->dirents = prev->nref_next;
+ dbg_gc("fd nref removed from dirents list\n");
+ prev = NULL;
+ }
+ while (prev) {
+ if (prev->nref_next == fd->nref) {
+ prev->nref_next = fd->nref->nref_next;
+ dbg_gc("fd nref removed from dirents list\n");
+ break;
+ }
+ prev = prev->nref_next;
+ }
+
+ prev = fd->nref;
+ chfs_mark_node_obsolete(chmp, fd->nref);
+ return chfs_write_flash_dirent(chmp,
+ parent, ip, fd, fd->vno, ALLOC_GC);
+}
+
+/* Check dirents that are marked as deleted. */
+int
+chfs_gcollect_deletion_dirent(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, struct chfs_inode *parent,
+ struct chfs_dirent *fd)
+{
+ struct chfs_flash_dirent_node chfdn;
+ struct chfs_node_ref *nref;
+ size_t retlen, name_len, nref_len;
+ uint32_t name_crc;
+
+ int ret;
+
+ struct vnode *vnode = NULL;
+
+ dbg_gc("gcollect_deletion_dirent\n");
+
+ name_len = strlen(fd->name);
+ name_crc = crc32(0, fd->name, name_len);
+
+ nref_len = chfs_nref_len(chmp, cheb, fd->nref);
+
+ vnode = chfs_vnode_lookup(chmp, fd->vno);
+
+ //dbg_gc("ip from vnode\n");
+ //VFS_VGET(chmp->chm_fsmp, fd->vno, &vnode);
+ //ip = VTOI(vnode);
+ //vput(vnode);
+
+ //dbg_gc("mutex enter erase_completion_lock\n");
+
+// dbg_gc("alloc chfdn\n");
+// chfdn = kmem_alloc(nref_len, KM_SLEEP);
+// if (!chfdn)
+// return ENOMEM;
+
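+	/* Scan the parent's dirent chain for an obsolete, non-deletion dirent
+	 * on flash that still carries this name. If one exists, the deletion
+	 * dirent is still needed and is written out again; otherwise it is
+	 * simply dropped from the parent below. */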
+ for (nref = parent->chvc->dirents;
+ nref != (void*)parent->chvc;
+ nref = nref->nref_next) {
+
+ if (!CHFS_REF_OBSOLETE(nref))
+ continue;
+
+ /* if node refs have different length, skip */
+ if (chfs_nref_len(chmp, NULL, nref) != nref_len)
+ continue;
+
+ if (CHFS_GET_OFS(nref->nref_offset) ==
+ CHFS_GET_OFS(fd->nref->nref_offset)) {
+ continue;
+ }
+
+ ret = chfs_read_leb(chmp,
+ nref->nref_lnr, (void*)&chfdn, CHFS_GET_OFS(nref->nref_offset),
+ nref_len, &retlen);
+
+ if (ret) {
+ dbg_gc("Read error: %d\n", ret);
+ continue;
+ }
+
+ if (retlen != nref_len) {
+ dbg_gc("Error reading node:"
+			    " read: %zu instead of: %zu\n", retlen, nref_len);
+ continue;
+ }
+
+ /* if node type doesn't match, skip */
+ if (le16toh(chfdn.type) != CHFS_NODETYPE_DIRENT)
+ continue;
+
+ /* if crc doesn't match, skip */
+ if (le32toh(chfdn.name_crc) != name_crc)
+ continue;
+
+		/* if the name length differs, or this is another deletion
+		 * dirent, skip
+ */
+ if (chfdn.nsize != name_len || !le64toh(chfdn.vno))
+ continue;
+
+ /* check actual name */
+ if (memcmp(chfdn.name, fd->name, name_len))
+ continue;
+
+// kmem_free(chfdn, nref_len);
+
+ chfs_mark_node_obsolete(chmp, fd->nref);
+ return chfs_write_flash_dirent(chmp,
+ parent, NULL, fd, fd->vno, ALLOC_GC);
+ }
+
+// kmem_free(chfdn, nref_len);
+
+ TAILQ_REMOVE(&parent->dents, fd, fds);
+ chfs_free_dirent(fd);
+ return 0;
+}
+
+int
+chfs_gcollect_dnode(struct chfs_mount *chmp,
+ struct chfs_eraseblock *orig_cheb, struct chfs_inode *ip,
+ struct chfs_full_dnode *fn, uint32_t orig_start, uint32_t orig_end)
+{
+ struct chfs_node_ref *nref, *prev;
+ struct chfs_full_dnode *newfn;
+ struct chfs_flash_data_node *fdnode;
+ int ret = 0, retries = 0;
+ uint32_t totlen;
+ char *data = NULL;
+ struct iovec vec;
+ size_t retlen;
+ dbg_gc("gcollect_dnode\n");
+
+ //uint32_t used_size;
+
+/* TODO GC merging frags, should we use it?
+
+ uint32_t start, end;
+
+ start = orig_start;
+ end = orig_end;
+
+ if (chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks > chmp->chm_resv_blocks_gcmerge) {
+ struct chfs_node_frag *frag;
+ uint32_t min, max;
+
+ min = start & (PAGE_CACHE_SIZE-1);
+ max = min + PAGE_CACHE_SIZE;
+
+ frag = (struct chfs_node_frag *)rb_tree_find_node_leq(&ip->i_chfs_ext.fragtree, &start);
+ KASSERT(frag->ofs == start);
+
+ while ((frag = frag_prev(&ip->i_chfs_ext.fragtree, frag)) && frag->ofs >= min) {
+ if (frag->ofs > min) {
+ start = frag->ofs;
+ continue;
+ }
+
+ if (!frag->node || !frag->node->nref) {
+ break;
+ } else {
+ struct chfs_node_ref *nref = frag->node->nref;
+ struct chfs_eraseblock *cheb;
+
+ cheb = &chmp->chm_blocks[nref->nref_lnr];
+
+ if (cheb == chmp->chm_gcblock)
+ start = frag->ofs;
+
+ //TODO is this a clean block?
+
+ start = frag->ofs;
+ break;
+ }
+ }
+
+ end--;
+ frag = (struct chfs_node_frag *)rb_tree_find_node_leq(&ip->i_chfs_ext.fragtree, &(end));
+
+ while ((frag = frag_next(&ip->i_chfs_ext.fragtree, frag)) && (frag->ofs + frag->size <= max)) {
+ if (frag->ofs + frag->size < max) {
+ end = frag->ofs + frag->size;
+ continue;
+ }
+
+ if (!frag->node || !frag->node->nref) {
+ break;
+ } else {
+ struct chfs_node_ref *nref = frag->node->nref;
+ struct chfs_eraseblock *cheb;
+
+ cheb = &chmp->chm_blocks[nref->nref_lnr];
+
+ if (cheb == chmp->chm_gcblock)
+ end = frag->ofs + frag->size;
+
+ //TODO is this a clean block?
+
+ end = frag->ofs + frag->size;
+ break;
+ }
+ }
+
+ KASSERT(end <=
+ frag_last(&ip->i_chfs_ext.fragtree)->ofs +
+ frag_last(&ip->i_chfs_ext.fragtree)->size);
+ KASSERT(end >= orig_end);
+ KASSERT(start <= orig_start);
+ }
+*/
+ KASSERT(orig_cheb->lnr == fn->nref->nref_lnr);
+ totlen = chfs_nref_len(chmp, orig_cheb, fn->nref);
+ data = kmem_alloc(totlen, KM_SLEEP);
+
+ ret = chfs_read_leb(chmp, fn->nref->nref_lnr, data, fn->nref->nref_offset,
+ totlen, &retlen);
+
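+	/* Re-stamp the copied node: a newer version supersedes the original,
+	 * and the node CRC (covering everything but the trailing CRC field)
+	 * must be recomputed. */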
+ fdnode = (struct chfs_flash_data_node *)data;
+ fdnode->version = htole64(++ip->chvc->highest_version);
+ fdnode->node_crc = htole32(crc32(0, (uint8_t *)fdnode,
+ sizeof(*fdnode) - 4));
+
+ vec.iov_base = (void *)data;
+ vec.iov_len = totlen;
+
+retry:
+ ret = chfs_reserve_space_gc(chmp, totlen);
+ if (ret)
+ goto out;
+
+ nref = chfs_alloc_node_ref(chmp->chm_nextblock);
+ if (!nref) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ mutex_enter(&chmp->chm_lock_sizes);
+
+ nref->nref_offset = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size;
+ KASSERT(nref->nref_offset % 4 == 0);
+ chfs_change_size_free(chmp, chmp->chm_nextblock, -totlen);
+
+ ret = chfs_write_wbuf(chmp, &vec, 1, nref->nref_offset, &retlen);
+ if (ret || retlen != totlen) {
+ chfs_err("error while writing out to the media\n");
+ chfs_err("err: %d | size: %d | retlen : %zu\n",
+ ret, totlen, retlen);
+ chfs_change_size_dirty(chmp, chmp->chm_nextblock, totlen);
+ if (retries) {
+ ret = EIO;
+ mutex_exit(&chmp->chm_lock_sizes);
+ goto out;
+ }
+
+ retries++;
+ mutex_exit(&chmp->chm_lock_sizes);
+ goto retry;
+ }
+
+ dbg_gc("new nref lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset);
+
+ chfs_change_size_used(chmp, &chmp->chm_blocks[nref->nref_lnr], totlen);
+ mutex_exit(&chmp->chm_lock_sizes);
+ KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size);
+
+ newfn = chfs_alloc_full_dnode();
+ newfn->nref = nref;
+ newfn->ofs = fn->ofs;
+ newfn->size = fn->size;
+ newfn->frags = fn->frags;
+
+ //TODO should we remove fd from dnode list?
+
+ prev = ip->chvc->dnode;
+ if (prev == fn->nref) {
+ ip->chvc->dnode = prev->nref_next;
+ prev = NULL;
+ }
+ while (prev) {
+ if (prev->nref_next == fn->nref) {
+ prev->nref_next = fn->nref->nref_next;
+ break;
+ }
+ prev = prev->nref_next;
+ }
+
+ chfs_add_full_dnode_to_inode(chmp, ip, newfn);
+ chfs_add_node_to_list(chmp,
+ ip->chvc, newfn->nref, &ip->chvc->dnode);
+
+out:
+ kmem_free(data, totlen);
+ return ret;
+}
--- /dev/null
+/* $NetBSD: chfs_ihash.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+/*
+ * Structures associated with inode caching.
+ */
+static LIST_HEAD(ihashhead, chfs_inode) *chfs_ihashtbl;
+static u_long chfs_ihash; /* size of hash table - 1 */
+#define INOHASH(device, inum) (((device) + (inum)) & chfs_ihash)
+
+kmutex_t chfs_ihash_lock;
+kmutex_t chfs_hashlock;
+
+/*
+ * Initialize inode hash table.
+ */
+void
+chfs_ihashinit(void)
+{
+ dbg("initing\n");
+
+ mutex_init(&chfs_hashlock, MUTEX_DEFAULT, IPL_NONE);
+ mutex_init(&chfs_ihash_lock, MUTEX_DEFAULT, IPL_NONE);
+ chfs_ihashtbl = hashinit(desiredvnodes,
+ HASH_LIST, true, &chfs_ihash);
+}
+
+/*
+ * Reinitialize inode hash table.
+ */
+
+void
+chfs_ihashreinit(void)
+{
+ struct chfs_inode *ip;
+ struct ihashhead *oldhash, *hash;
+ u_long oldmask, mask, val;
+ int i;
+
+ dbg("reiniting\n");
+
+ hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
+ mutex_enter(&chfs_ihash_lock);
+ oldhash = chfs_ihashtbl;
+ oldmask = chfs_ihash;
+ chfs_ihashtbl = hash;
+ chfs_ihash = mask;
+ for (i = 0; i <= oldmask; i++) {
+ while ((ip = LIST_FIRST(&oldhash[i])) != NULL) {
+ LIST_REMOVE(ip, hash_entry);
+ val = INOHASH(ip->dev, ip->ino);
+ LIST_INSERT_HEAD(&hash[val], ip, hash_entry);
+ }
+ }
+ mutex_exit(&chfs_ihash_lock);
+ hashdone(oldhash, HASH_LIST, oldmask);
+}
+
+/*
+ * Free inode hash table.
+ */
+void
+chfs_ihashdone(void)
+{
+ dbg("destroying\n");
+
+ hashdone(chfs_ihashtbl, HASH_LIST, chfs_ihash);
+ mutex_destroy(&chfs_hashlock);
+ mutex_destroy(&chfs_ihash_lock);
+}
+
+/*
+ * Use the device/inum pair to find the incore inode, and return a pointer
+ * to it. If it is in core, return it, even if it is locked.
+ */
+struct vnode *
+chfs_ihashlookup(dev_t dev, ino_t inum)
+{
+ struct chfs_inode *ip;
+ struct ihashhead *ipp;
+
+ dbg("dev: %ju, inum: %ju\n", (uintmax_t )dev, (uintmax_t )inum);
+
+ KASSERT(mutex_owned(&chfs_ihash_lock));
+
+ ipp = &chfs_ihashtbl[INOHASH(dev, inum)];
+ LIST_FOREACH(ip, ipp, hash_entry) {
+ if (inum == ip->ino && dev == ip->dev) {
+ break;
+ }
+ }
+
+ if (ip) {
+ return (ITOV(ip));
+ }
+
+ return (NULLVP);
+}
+
+/*
+ * Use the device/inum pair to find the incore inode, and return a pointer
+ * to it. If it is in core, but locked, wait for it.
+ */
+struct vnode *
+chfs_ihashget(dev_t dev, ino_t inum, int flags)
+{
+ struct ihashhead *ipp;
+ struct chfs_inode *ip;
+ struct vnode *vp;
+
+ dbg("search for ino\n");
+
+loop:
+ mutex_enter(&chfs_ihash_lock);
+ ipp = &chfs_ihashtbl[INOHASH(dev, inum)];
+ dbg("ipp: %p, chfs_ihashtbl: %p, ihash: %lu\n",
+ ipp, chfs_ihashtbl, chfs_ihash);
+ LIST_FOREACH(ip, ipp, hash_entry) {
+ dbg("ip: %p\n", ip);
+ if (inum == ip->ino && dev == ip->dev) {
+// printf("chfs_ihashget: found inode: %p\n", ip);
+ vp = ITOV(ip);
+ KASSERT(vp != NULL);
+ //dbg("found\n");
+ if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) {
+ //dbg("wait for #%llu\n", ip->ino);
+ mutex_exit(&chfs_ihash_lock);
+ goto loop;
+ }
+ /*
+ if (VOP_ISLOCKED(vp))
+ dbg("locked\n");
+ else
+ dbg("isn't locked\n");
+ */
+ if (flags == 0) {
+ //dbg("no flags\n");
+ mutex_exit(&chfs_ihash_lock);
+ } else {
+ //dbg("vget\n");
+ mutex_enter(vp->v_interlock);
+ mutex_exit(&chfs_ihash_lock);
+ if (vget(vp, flags)) {
+ goto loop;
+ }
+ //dbg("got it\n");
+ }
+ //dbg("return\n");
+ return (vp);
+ }
+ }
+ //dbg("not found\n");
+ mutex_exit(&chfs_ihash_lock);
+ return (NULL);
+}
+
+/*
+ * Insert the inode into the hash table, and return it locked.
+ */
+void
+chfs_ihashins(struct chfs_inode *ip)
+{
+ struct ihashhead *ipp;
+
+ dbg("ip: %p\n", ip);
+
+ KASSERT(mutex_owned(&chfs_hashlock));
+
+ /* lock the inode, then put it on the appropriate hash list */
+ VOP_LOCK(ITOV(ip), LK_EXCLUSIVE);
+
+ mutex_enter(&chfs_ihash_lock);
+ ipp = &chfs_ihashtbl[INOHASH(ip->dev, ip->ino)];
+ LIST_INSERT_HEAD(ipp, ip, hash_entry);
+ mutex_exit(&chfs_ihash_lock);
+}
+
+/*
+ * Remove the inode from the hash table.
+ */
+void
+chfs_ihashrem(struct chfs_inode *ip)
+{
+ dbg("ip: %p\n", ip);
+
+ mutex_enter(&chfs_ihash_lock);
+ LIST_REMOVE(ip, hash_entry);
+ mutex_exit(&chfs_ihash_lock);
+}
+
--- /dev/null
+/* $NetBSD: chfs_malloc.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+#include <sys/pool.h>
+
+pool_cache_t chfs_vnode_cache;
+pool_cache_t chfs_nrefs_cache;
+pool_cache_t chfs_flash_vnode_cache;
+pool_cache_t chfs_flash_dirent_cache;
+pool_cache_t chfs_flash_dnode_cache;
+pool_cache_t chfs_node_frag_cache;
+pool_cache_t chfs_tmp_dnode_cache;
+pool_cache_t chfs_tmp_dnode_info_cache;
+
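+/*
+ * chfs_alloc_pool_caches - create the pool caches used by chfs
+ *
+ * Returns zero on success; on failure every cache created so far is
+ * destroyed and ENOMEM is returned.
+ */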
+int
+chfs_alloc_pool_caches()
+{
+ chfs_vnode_cache = pool_cache_init(
+ sizeof(struct chfs_vnode_cache),
+ 0, 0, 0, "chfs_vnode_cache", NULL, IPL_NONE, NULL, NULL,
+ NULL);
+ if (!chfs_vnode_cache)
+ goto err_vnode;
+
+ chfs_nrefs_cache = pool_cache_init(
+ (REFS_BLOCK_LEN + 1) * sizeof(struct chfs_node_ref), 0, 0,
+ 0, "chfs_nrefs_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+ if (!chfs_nrefs_cache)
+ goto err_nrefs;
+
+ chfs_flash_vnode_cache = pool_cache_init(
+ sizeof(struct chfs_flash_vnode), 0, 0, 0,
+ "chfs_flash_vnode_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+ if (!chfs_flash_vnode_cache)
+ goto err_flash_vnode;
+
+ chfs_flash_dirent_cache = pool_cache_init(
+ sizeof(struct chfs_flash_dirent_node), 0, 0, 0,
+ "chfs_flash_dirent_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+ if (!chfs_flash_dirent_cache)
+ goto err_flash_dirent;
+
+ chfs_flash_dnode_cache = pool_cache_init(
+ sizeof(struct chfs_flash_data_node), 0, 0, 0,
+ "chfs_flash_dnode_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+ if (!chfs_flash_dnode_cache)
+ goto err_flash_dnode;
+
+ chfs_node_frag_cache = pool_cache_init(
+ sizeof(struct chfs_node_frag), 0, 0, 0,
+ "chfs_node_frag_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+ if (!chfs_node_frag_cache)
+ goto err_node_frag;
+
+ chfs_tmp_dnode_cache = pool_cache_init(
+ sizeof(struct chfs_tmp_dnode), 0, 0, 0,
+ "chfs_tmp_dnode_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+ if (!chfs_tmp_dnode_cache)
+ goto err_tmp_dnode;
+
+ chfs_tmp_dnode_info_cache = pool_cache_init(
+ sizeof(struct chfs_tmp_dnode_info), 0, 0, 0,
+ "chfs_tmp_dnode_info_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+ if (!chfs_tmp_dnode_info_cache)
+ goto err_tmp_dnode_info;
+
+ return 0;
+
+err_tmp_dnode_info:
+ pool_cache_destroy(chfs_tmp_dnode_cache);
+err_tmp_dnode:
+ pool_cache_destroy(chfs_node_frag_cache);
+err_node_frag:
+ pool_cache_destroy(chfs_flash_dnode_cache);
+err_flash_dnode:
+ pool_cache_destroy(chfs_flash_dirent_cache);
+err_flash_dirent:
+ pool_cache_destroy(chfs_flash_vnode_cache);
+err_flash_vnode:
+ pool_cache_destroy(chfs_nrefs_cache);
+err_nrefs:
+ pool_cache_destroy(chfs_vnode_cache);
+err_vnode:
+
+ return ENOMEM;
+}
+
+void
+chfs_destroy_pool_caches()
+{
+ if (chfs_vnode_cache)
+ pool_cache_destroy(chfs_vnode_cache);
+
+ if (chfs_nrefs_cache)
+ pool_cache_destroy(chfs_nrefs_cache);
+
+ if (chfs_flash_vnode_cache)
+ pool_cache_destroy(chfs_flash_vnode_cache);
+
+ if (chfs_flash_dirent_cache)
+ pool_cache_destroy(chfs_flash_dirent_cache);
+
+ if (chfs_flash_dnode_cache)
+ pool_cache_destroy(chfs_flash_dnode_cache);
+
+ if (chfs_node_frag_cache)
+ pool_cache_destroy(chfs_node_frag_cache);
+
+ if (chfs_tmp_dnode_cache)
+ pool_cache_destroy(chfs_tmp_dnode_cache);
+
+ if (chfs_tmp_dnode_info_cache)
+ pool_cache_destroy(chfs_tmp_dnode_info_cache);
+}
+
+struct chfs_vnode_cache *
+chfs_vnode_cache_alloc(ino_t vno)
+{
+ struct chfs_vnode_cache* vc;
+ vc = pool_cache_get(chfs_vnode_cache, PR_WAITOK);
+
+ memset(vc, 0, sizeof(*vc));
+ vc->vno = vno;
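+	/* the node ref chains start out empty: each head points back to the
+	 * vnode cache itself, which acts as the chain terminator */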
+ vc->v = (void *)vc;
+ vc->dirents = (void *)vc;
+ vc->dnode = (void *)vc;
+ TAILQ_INIT(&vc->scan_dirents);
+ vc->highest_version = 0;
+
+ return vc;
+}
+
+void
+chfs_vnode_cache_free(struct chfs_vnode_cache *vc)
+{
+ //kmem_free(vc->vno_version, sizeof(uint64_t));
+ pool_cache_put(chfs_vnode_cache, vc);
+}
+
+/**
+ * chfs_alloc_refblock - allocating a refblock
+ *
+ * Returns a pointer to the first element of the block.
+ *
+ * We do not allocate just one node ref; instead we allocate REFS_BLOCK_LEN
+ * node refs at once, and the last element holds a pointer to the next block.
+ * We do this because we need a chain of nodes ordered by their physical
+ * address.
+ */
+struct chfs_node_ref*
+chfs_alloc_refblock(void)
+{
+ int i;
+ struct chfs_node_ref *nref;
+ nref = pool_cache_get(chfs_nrefs_cache, PR_WAITOK);
+
+ for (i = 0; i < REFS_BLOCK_LEN; i++) {
+ nref[i].nref_lnr = REF_EMPTY_NODE;
+ nref[i].nref_next = NULL;
+ }
+ i = REFS_BLOCK_LEN;
+ nref[i].nref_lnr = REF_LINK_TO_NEXT;
+ nref[i].nref_next = NULL;
+
+ return nref;
+}
+
+/**
+ * chfs_free_refblock - freeing a refblock
+ */
+void
+chfs_free_refblock(struct chfs_node_ref *nref)
+{
+ pool_cache_put(chfs_nrefs_cache, nref);
+}
+
+/**
+ * chfs_alloc_node_ref - allocating a node ref from a refblock
+ * @cheb: eraseblock information structure
+ *
+ * Allocates a node ref from a refblock; if there isn't any free element in the
+ * block, a new block is allocated and linked to the current one.
+ */
+struct chfs_node_ref*
+chfs_alloc_node_ref(struct chfs_eraseblock *cheb)
+{
+ struct chfs_node_ref *nref, *new, *old;
+ old = cheb->last_node;
+ nref = cheb->last_node;
+
+ if (!nref) {
+		// No nref has been allocated for this block yet
+ nref = chfs_alloc_refblock();
+
+ cheb->first_node = nref;
+ cheb->last_node = nref;
+ nref->nref_lnr = cheb->lnr;
+ KASSERT(cheb->lnr == nref->nref_lnr);
+
+ return nref;
+ }
+
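+	/* step to the next slot of the refblock; if it is the link slot,
+	 * allocate a new refblock and chain it to the current one */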
+ nref++;
+ if (nref->nref_lnr == REF_LINK_TO_NEXT) {
+ new = chfs_alloc_refblock();
+ nref->nref_next = new;
+ nref = new;
+ }
+
+ cheb->last_node = nref;
+ nref->nref_lnr = cheb->lnr;
+
+ KASSERT(old->nref_lnr == nref->nref_lnr &&
+ nref->nref_lnr == cheb->lnr);
+
+ return nref;
+}
+
+/**
+ * chfs_free_node_refs - freeing an eraseblock's node refs
+ * @cheb: eraseblock information structure
+ */
+void
+chfs_free_node_refs(struct chfs_eraseblock *cheb)
+{
+ struct chfs_node_ref *nref, *block;
+
+ block = nref = cheb->first_node;
+
+ while (nref) {
+ if (nref->nref_lnr == REF_LINK_TO_NEXT) {
+ nref = nref->nref_next;
+ chfs_free_refblock(block);
+ block = nref;
+ continue;
+ }
+ nref++;
+ }
+}
+
+struct chfs_dirent*
+chfs_alloc_dirent(int namesize)
+{
+ struct chfs_dirent *ret;
+ size_t size = sizeof(struct chfs_dirent) + namesize;
+
+ ret = kmem_alloc(size, KM_SLEEP);
+ //ret->alloc_size = size;
+
+ return ret;
+}
+
+void
+chfs_free_dirent(struct chfs_dirent *dirent)
+{
+ //size_t size = dirent->alloc_size;
+ size_t size = sizeof(struct chfs_dirent) + dirent->nsize + 1;
+
+ kmem_free(dirent, size);
+}
+
+struct chfs_full_dnode*
+chfs_alloc_full_dnode()
+{
+ struct chfs_full_dnode *ret;
+ ret = kmem_alloc(sizeof(struct chfs_full_dnode), KM_SLEEP);
+ return ret;
+}
+
+void
+chfs_free_full_dnode(struct chfs_full_dnode *fd)
+{
+ kmem_free(fd,(sizeof(struct chfs_full_dnode)));
+}
+
+struct chfs_flash_vnode*
+chfs_alloc_flash_vnode()
+{
+ struct chfs_flash_vnode *ret;
+ ret = pool_cache_get(chfs_flash_vnode_cache, 0);
+ return ret;
+}
+
+void
+chfs_free_flash_vnode(struct chfs_flash_vnode *fvnode)
+{
+ pool_cache_put(chfs_flash_vnode_cache, fvnode);
+}
+
+struct chfs_flash_dirent_node*
+chfs_alloc_flash_dirent()
+{
+ struct chfs_flash_dirent_node *ret;
+ ret = pool_cache_get(chfs_flash_dirent_cache, 0);
+ return ret;
+}
+
+void
+chfs_free_flash_dirent(struct chfs_flash_dirent_node *fdnode)
+{
+ pool_cache_put(chfs_flash_dirent_cache, fdnode);
+}
+
+struct chfs_flash_data_node*
+chfs_alloc_flash_dnode()
+{
+ struct chfs_flash_data_node *ret;
+ ret = pool_cache_get(chfs_flash_dnode_cache, 0);
+ return ret;
+}
+
+void
+chfs_free_flash_dnode(struct chfs_flash_data_node *fdnode)
+{
+ pool_cache_put(chfs_flash_dnode_cache, fdnode);
+}
+
+
+struct chfs_node_frag*
+chfs_alloc_node_frag()
+{
+ struct chfs_node_frag *ret;
+ ret = pool_cache_get(chfs_node_frag_cache, 0);
+ return ret;
+
+}
+
+void
+chfs_free_node_frag(struct chfs_node_frag *frag)
+{
+ pool_cache_put(chfs_node_frag_cache, frag);
+}
+
+struct chfs_tmp_dnode *
+chfs_alloc_tmp_dnode()
+{
+ struct chfs_tmp_dnode *ret;
+ ret = pool_cache_get(chfs_tmp_dnode_cache, 0);
+ ret->next = NULL;
+ return ret;
+}
+
+void
+chfs_free_tmp_dnode(struct chfs_tmp_dnode *td)
+{
+ pool_cache_put(chfs_tmp_dnode_cache, td);
+}
+
+struct chfs_tmp_dnode_info *
+chfs_alloc_tmp_dnode_info()
+{
+ struct chfs_tmp_dnode_info *ret;
+ ret = pool_cache_get(chfs_tmp_dnode_info_cache, 0);
+ ret->tmpnode = NULL;
+ return ret;
+}
+
+void
+chfs_free_tmp_dnode_info(struct chfs_tmp_dnode_info *di)
+{
+ pool_cache_put(chfs_tmp_dnode_info_cache, di);
+}
+
--- /dev/null
+/* $NetBSD: chfs_nodeops.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (C) 2010 David Tengeri <dtengeri@inf.u-szeged.hu>
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+
+/**
+ * chfs_update_eb_dirty - updates dirty and free space, first and
+ * last node references
+ * @chmp: CHFS mount structure
+ * @cheb: eraseblock to update
+ * @size: amount to increase the dirty space by
+ * Returns zero on success, 1 on failure.
+ */
+int
+chfs_update_eb_dirty(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, uint32_t size)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ KASSERT(!mutex_owned(&chmp->chm_lock_sizes));
+
+ if (!size)
+ return 0;
+
+ if (size > cheb->free_size) {
+		chfs_err("free_size (%d) is less than dirty space (%d) "
+ "on block (%d)\n", cheb->free_size, size, cheb->lnr);
+ return 1;
+ }
+ mutex_enter(&chmp->chm_lock_sizes);
+ //dbg("BEFORE: free_size: %d\n", cheb->free_size);
+ chfs_change_size_free(chmp, cheb, -size);
+ chfs_change_size_dirty(chmp, cheb, size);
+ //dbg(" AFTER: free_size: %d\n", cheb->free_size);
+ mutex_exit(&chmp->chm_lock_sizes);
+ return 0;
+}
+
+/**
+ * chfs_add_node_to_list - adds a data node ref to vnode cache's dnode list
+ * @chmp: CHFS mount structure
+ * @vc: vnode cache the node belongs to
+ * @new: node ref to insert
+ * @list: head of the list
+ * This function inserts a data node ref into the vnode cache's node list.
+ * The list is sorted by the data node's lnr and offset.
+ */
+void
+chfs_add_node_to_list(struct chfs_mount *chmp,
+ struct chfs_vnode_cache *vc,
+ struct chfs_node_ref *new, struct chfs_node_ref **list)
+{
+ struct chfs_node_ref *nextref = *list;
+ struct chfs_node_ref *prevref = NULL;
+
+ while (nextref && nextref != (struct chfs_node_ref *)vc &&
+ (nextref->nref_lnr <= new->nref_lnr)) {
+ if (nextref->nref_lnr == new->nref_lnr) {
+ while (nextref && nextref !=
+ (struct chfs_node_ref *)vc &&
+ (CHFS_GET_OFS(nextref->nref_offset) <
+ CHFS_GET_OFS(new->nref_offset))) {
+ prevref = nextref;
+ nextref = nextref->nref_next;
+ }
+ break;
+ }
+ prevref = nextref;
+ nextref = nextref->nref_next;
+ }
+
+ if (nextref && nextref != (struct chfs_node_ref *)vc &&
+ nextref->nref_lnr == new->nref_lnr &&
+ CHFS_GET_OFS(nextref->nref_offset) ==
+ CHFS_GET_OFS(new->nref_offset)) {
+ new->nref_next = nextref->nref_next;
+ } else {
+ new->nref_next = nextref;
+ }
+
+ if (prevref) {
+ prevref->nref_next = new;
+ } else {
+ *list = new;
+ }
+}
+
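+/*
+ * chfs_add_fd_to_inode - add a dirent to the inode's dirent list
+ *
+ * The list is kept sorted by name hash.  If a dirent with the same name is
+ * already present, the one with the higher version is kept and the other is
+ * marked obsolete and freed.
+ */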
+void
+chfs_add_fd_to_inode(struct chfs_mount *chmp,
+ struct chfs_inode *parent, struct chfs_dirent *new)
+{
+// struct chfs_dirent **prev = &parent->dents;
+ struct chfs_dirent *fd, *tmpfd;
+
+ if (new->version > parent->chvc->highest_version) {
+ parent->chvc->highest_version = new->version;
+ }
+
+ //mutex_enter(&parent->inode_lock);
+ TAILQ_FOREACH_SAFE(fd, &parent->dents, fds, tmpfd) {
+ if (fd->nhash > new->nhash) {
+ /* insert new before fd */
+ TAILQ_INSERT_BEFORE(fd, new, fds);
+ return;
+ } else if (fd->nhash == new->nhash &&
+ !strcmp(fd->name, new->name)) {
+ if (new->version > fd->version) {
+// new->next = fd->next;
+ /* replace fd with new */
+ TAILQ_INSERT_BEFORE(fd, new, fds);
+ TAILQ_REMOVE(&parent->dents, fd, fds);
+ if (fd->nref) {
+ chfs_mark_node_obsolete(chmp,
+ fd->nref);
+ }
+ chfs_free_dirent(fd);
+// *prev = new;//XXX
+ } else {
+ chfs_mark_node_obsolete(chmp, new->nref);
+ chfs_free_dirent(new);
+ }
+ return;
+ }
+ }
+	/* if we couldn't fit it elsewhere, let's add it to the end */
+ /* FIXME insert tail or insert head? */
+ TAILQ_INSERT_HEAD(&parent->dents, new, fds);
+ //mutex_exit(&parent->inode_lock);
+#if 0
+ while ((*prev) && (*prev)->nhash <= new->nhash) {
+ if ((*prev)->nhash == new->nhash &&
+ !strcmp((*prev)->name, new->name)) {
+ if (new->version > (*prev)->version) {
+ new->next = (*prev)->next;
+ if ((*prev)->nref) {
+ chfs_mark_node_obsolete(chmp,
+ (*prev)->nref);
+ }
+ chfs_free_dirent(*prev);
+ *prev = new;
+ } else {
+ chfs_mark_node_obsolete(chmp, new->nref);
+ chfs_free_dirent(new);
+ }
+ return;
+ }
+ prev = &((*prev)->next);
+ }
+
+ new->next = *prev;
+ *prev = new;
+#endif
+}
+
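+/*
+ * chfs_add_vnode_ref_to_vc - add a vnode information node ref to the cache
+ *
+ * The new ref becomes the head of the chain; a previously valid ref, if any,
+ * is marked obsolete.
+ */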
+void
+chfs_add_vnode_ref_to_vc(struct chfs_mount *chmp,
+ struct chfs_vnode_cache *vc, struct chfs_node_ref *new)
+{
+ if ((struct chfs_vnode_cache*)(vc->v) != vc) {
+ chfs_mark_node_obsolete(chmp, vc->v);
+ new->nref_next = vc->v->nref_next;
+ } else {
+ new->nref_next = vc->v;
+ }
+ vc->v = new;
+}
+
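+/*
+ * chfs_nref_next - step to the next node ref in physical order
+ *
+ * Walks the refblock chain: the link slot at the end of a refblock leads to
+ * the next block, and an empty slot marks the end of the chain.
+ */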
+struct chfs_node_ref *
+chfs_nref_next(struct chfs_node_ref *nref)
+{
+// dbg("check nref: %u - %u\n", nref->nref_lnr, nref->nref_offset);
+ nref++;
+// dbg("next nref: %u - %u\n", nref->nref_lnr, nref->nref_offset);
+ if (nref->nref_lnr == REF_LINK_TO_NEXT) {
+ //End of chain
+ if (!nref->nref_next)
+ return NULL;
+
+ nref = nref->nref_next;
+ }
+ //end of chain
+ if (nref->nref_lnr == REF_EMPTY_NODE)
+ return NULL;
+
+ return nref;
+}
+
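+/*
+ * chfs_nref_len - calculate the on-flash length of a node
+ *
+ * The length is the distance to the next node ref in the eraseblock or, for
+ * the last node, the distance to the start of the block's free space.
+ */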
+int
+chfs_nref_len(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, struct chfs_node_ref *nref)
+{
+ struct chfs_node_ref *next;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ if (!cheb)
+ cheb = &chmp->chm_blocks[nref->nref_lnr];
+
+ next = chfs_nref_next(nref);
+
+ if (!next) {
+ //dbg("next null\n");
+ return chmp->chm_ebh->eb_size - cheb->free_size -
+ CHFS_GET_OFS(nref->nref_offset);
+ }
+ //dbg("size: %d\n", CHFS_GET_OFS(next->nref_offset) - CHFS_GET_OFS(nref->nref_offset));
+ return CHFS_GET_OFS(next->nref_offset) -
+ CHFS_GET_OFS(nref->nref_offset);
+}
+
+/**
+ * chfs_mark_node_obsolete - marks a node obsolete
+ */
+void
+chfs_mark_node_obsolete(struct chfs_mount *chmp,
+ struct chfs_node_ref *nref)
+{
+ int len;
+ struct chfs_eraseblock *cheb;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ KASSERT(!CHFS_REF_OBSOLETE(nref));
+
+ KASSERT(nref->nref_lnr <= chmp->chm_ebh->peb_nr);
+ cheb = &chmp->chm_blocks[nref->nref_lnr];
+
+#ifdef DIAGNOSTIC
+ if (cheb->used_size + cheb->free_size + cheb->dirty_size +
+ cheb->unchecked_size + cheb->wasted_size != chmp->chm_ebh->eb_size) {
+ dbg("eraseblock leak detected!\nused: %u\nfree: %u\n"
+ "dirty: %u\nunchecked: %u\nwasted: %u\ntotal: %u\nshould be: %zu\n",
+ cheb->used_size, cheb->free_size, cheb->dirty_size,
+ cheb->unchecked_size, cheb->wasted_size, cheb->used_size + cheb->free_size +
+ cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size,
+ chmp->chm_ebh->eb_size);
+ }
+#endif
+
+ len = chfs_nref_len(chmp, cheb, nref);
+ //dbg("len: %u\n", len);
+ //dbg("1. used: %u\n", cheb->used_size);
+
+ mutex_enter(&chmp->chm_lock_sizes);
+
+ if (CHFS_REF_FLAGS(nref) == CHFS_UNCHECKED_NODE_MASK) {
+ //dbg("UNCHECKED mark an unchecked node\n");
+ chfs_change_size_unchecked(chmp, cheb, -len);
+ //dbg("unchecked: %u\n", chmp->chm_unchecked_size);
+ } else {
+ chfs_change_size_used(chmp, cheb, -len);
+
+ //dbg("2. used: %u\n", cheb->used_size);
+ KASSERT(cheb->used_size <= chmp->chm_ebh->eb_size);
+ }
+ chfs_change_size_dirty(chmp, cheb, len);
+
+#ifdef DIAGNOSTIC
+ if (cheb->used_size + cheb->free_size + cheb->dirty_size +
+ cheb->unchecked_size + cheb->wasted_size != chmp->chm_ebh->eb_size) {
+ panic("eraseblock leak detected!\nused: %u\nfree: %u\n"
+ "dirty: %u\nunchecked: %u\nwasted: %u\ntotal: %u\nshould be: %zu\n",
+ cheb->used_size, cheb->free_size, cheb->dirty_size,
+ cheb->unchecked_size, cheb->wasted_size, cheb->used_size + cheb->free_size +
+ cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size,
+ chmp->chm_ebh->eb_size);
+ }
+#endif
+ nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) |
+ CHFS_OBSOLETE_NODE_MASK;
+
+ if (chmp->chm_flags & CHFS_MP_FLAG_SCANNING) {
+ /*Scan is in progress, do nothing now*/
+ mutex_exit(&chmp->chm_lock_sizes);
+ return;
+ }
+
+ if (cheb == chmp->chm_nextblock) {
+ dbg("Not moving nextblock to dirty/erase_pending list\n");
+ } else if (!cheb->used_size && !cheb->unchecked_size) {
+ if (cheb == chmp->chm_gcblock) {
+ dbg("gcblock is completely dirtied\n");
+ chmp->chm_gcblock = NULL;
+ } else {
+ //remove from a tailq, but we don't know which tailq contains this cheb
+ //so we remove it from the dirty list now
+ //TAILQ_REMOVE(&chmp->chm_dirty_queue, cheb, queue);
+ int removed = 0;
+ struct chfs_eraseblock *eb, *tmpeb;
+ //XXX ugly code
+ TAILQ_FOREACH_SAFE(eb, &chmp->chm_free_queue, queue, tmpeb) {
+ if (eb == cheb) {
+ TAILQ_REMOVE(&chmp->chm_free_queue, cheb, queue);
+ removed = 1;
+ break;
+ }
+ }
+ if (removed == 0) {
+ TAILQ_FOREACH_SAFE(eb, &chmp->chm_dirty_queue, queue, tmpeb) {
+ if (eb == cheb) {
+ TAILQ_REMOVE(&chmp->chm_dirty_queue, cheb, queue);
+ removed = 1;
+ break;
+ }
+ }
+ }
+ if (removed == 0) {
+ TAILQ_FOREACH_SAFE(eb, &chmp->chm_very_dirty_queue, queue, tmpeb) {
+ if (eb == cheb) {
+ TAILQ_REMOVE(&chmp->chm_very_dirty_queue, cheb, queue);
+ removed = 1;
+ break;
+ }
+ }
+ }
+ if (removed == 0) {
+ TAILQ_FOREACH_SAFE(eb, &chmp->chm_clean_queue, queue, tmpeb) {
+ if (eb == cheb) {
+ TAILQ_REMOVE(&chmp->chm_clean_queue, cheb, queue);
+ removed = 1;
+ break;
+ }
+ }
+ }
+ }
+ if (chmp->chm_wbuf_len) {
+ dbg("Adding block to erasable pending wbuf queue\n");
+ TAILQ_INSERT_TAIL(&chmp->chm_erasable_pending_wbuf_queue,
+ cheb, queue);
+ } else {
+ TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue,
+ cheb, queue);
+ chmp->chm_nr_erasable_blocks++;
+ }
+ chfs_remap_leb(chmp);
+ } else if (cheb == chmp->chm_gcblock) {
+ dbg("Not moving gcblock to dirty list\n");
+ } else if (cheb->dirty_size > MAX_DIRTY_TO_CLEAN &&
+ cheb->dirty_size - len <= MAX_DIRTY_TO_CLEAN) {
+ dbg("Freshly dirtied, remove it from clean queue and "
+ "add it to dirty\n");
+ TAILQ_REMOVE(&chmp->chm_clean_queue, cheb, queue);
+ TAILQ_INSERT_TAIL(&chmp->chm_dirty_queue, cheb, queue);
+ } else if (VERY_DIRTY(chmp, cheb->dirty_size) &&
+ !VERY_DIRTY(chmp, cheb->dirty_size - len)) {
+ dbg("Becomes now very dirty, remove it from dirty "
+ "queue and add it to very dirty\n");
+ TAILQ_REMOVE(&chmp->chm_dirty_queue, cheb, queue);
+ TAILQ_INSERT_TAIL(&chmp->chm_very_dirty_queue, cheb, queue);
+ } else {
+ dbg("Leave cheb where it is\n");
+ }
+ mutex_exit(&chmp->chm_lock_sizes);
+ return;
+}
+
+/**
+ * chfs_close_eraseblock - close an eraseblock
+ * @chmp: chfs mount structure
+ * @cheb: eraseblock information
+ *
+ * This function closes the physical chain of the nodes on the eraseblock,
+ * converts its free size to dirty and adds it to the clean, dirty or very dirty list.
+ */
+int
+chfs_close_eraseblock(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb)
+{
+ uint32_t offset;
+ struct chfs_node_ref *nref;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ offset = chmp->chm_ebh->eb_size - cheb->free_size;
+
+ // Close the chain
+ nref = chfs_alloc_node_ref(cheb);
+ if (!nref)
+ return ENOMEM;
+
+ nref->nref_next = NULL;
+ nref->nref_offset = offset;
+
+ // Mark space as dirty
+ chfs_update_eb_dirty(chmp, cheb, cheb->free_size);
+
+ if (cheb->dirty_size < MAX_DIRTY_TO_CLEAN) {
+ TAILQ_INSERT_TAIL(&chmp->chm_clean_queue, cheb, queue);
+ } else if (VERY_DIRTY(chmp, cheb->dirty_size)) {
+ TAILQ_INSERT_TAIL(&chmp->chm_very_dirty_queue, cheb, queue);
+ } else {
+ TAILQ_INSERT_TAIL(&chmp->chm_dirty_queue, cheb, queue);
+ }
+ return 0;
+}
+
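+/*
+ * chfs_reserve_space_normal - reserve space for a regular write
+ *
+ * Runs the garbage collector while the number of free and erasable blocks is
+ * below the write reserve, then reserves the requested space.
+ */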
+int
+chfs_reserve_space_normal(struct chfs_mount *chmp, uint32_t size, int prio)
+{
+ int ret;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ mutex_enter(&chmp->chm_lock_sizes);
+ while (chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks < chmp->chm_resv_blocks_write) {
+ dbg("free: %d, erasable: %d, resv: %d\n", chmp->chm_nr_free_blocks, chmp->chm_nr_erasable_blocks, chmp->chm_resv_blocks_write);
+ uint32_t avail, dirty;
+ if (prio == ALLOC_DELETION && chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks >= chmp->chm_resv_blocks_deletion)
+ break;
+
+ dirty = chmp->chm_dirty_size - chmp->chm_nr_erasable_blocks * chmp->chm_ebh->eb_size + chmp->chm_unchecked_size;
+ if (dirty < chmp->chm_nospc_dirty) {
+ dbg("dirty: %u < nospc_dirty: %u\n", dirty, chmp->chm_nospc_dirty);
+ ret = ENOSPC;
+ mutex_exit(&chmp->chm_lock_sizes);
+ goto out;
+ }
+
+ avail = chmp->chm_free_size - (chmp->chm_resv_blocks_write * chmp->chm_ebh->eb_size);
+ if (size > avail) {
+ dbg("size: %u > avail: %u\n", size, avail);
+ ret = ENOSPC;
+ mutex_exit(&chmp->chm_lock_sizes);
+ goto out;
+ }
+
+ mutex_exit(&chmp->chm_lock_sizes);
+ ret = chfs_gcollect_pass(chmp);
+ /* gcollect_pass exits chm_lock_mountfields */
+ mutex_enter(&chmp->chm_lock_mountfields);
+ mutex_enter(&chmp->chm_lock_sizes);
+
+ if (chmp->chm_nr_erasable_blocks ||
+ !TAILQ_EMPTY(&chmp->chm_erasable_pending_wbuf_queue) ||
+ ret == EAGAIN) {
+ ret = chfs_remap_leb(chmp);
+ }
+
+ if (ret) {
+ mutex_exit(&chmp->chm_lock_sizes);
+ goto out;
+ }
+ }
+
+ mutex_exit(&chmp->chm_lock_sizes);
+ ret = chfs_reserve_space(chmp, size);
+out:
+ return ret;
+}
+
+
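+/*
+ * chfs_reserve_space_gc - reserve space for a node written by the garbage
+ * collector
+ */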
+int
+chfs_reserve_space_gc(struct chfs_mount *chmp, uint32_t size)
+{
+ int ret;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ mutex_enter(&chmp->chm_lock_sizes);
+ chfs_remap_leb(chmp);
+
+ if (size > chmp->chm_free_size) {
+ dbg("size: %u\n", size);
+ mutex_exit(&chmp->chm_lock_sizes);
+ return ENOSPC;
+ }
+
+ mutex_exit(&chmp->chm_lock_sizes);
+ ret = chfs_reserve_space(chmp, size);
+ return ret;
+}
+
+/**
+ * chfs_reserve_space - finds a block whose free size is >= the requested size
+ * @chmp: chfs mount point
+ * @size: requested size
+ * Returns zero on success, an error code on failure.
+ */
+int
+chfs_reserve_space(struct chfs_mount *chmp, uint32_t size)
+{
+ //TODO define minimum reserved blocks, which is needed for writing
+ //TODO check we have enough free blocks to write
+ //TODO if no: need erase and GC
+
+ int err;
+ struct chfs_eraseblock *cheb;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ KASSERT(!mutex_owned(&chmp->chm_lock_sizes));
+
+ cheb = chmp->chm_nextblock;
+ //if (cheb)
+ //dbg("cheb->free_size %u\n", cheb->free_size);
+ if (cheb && size > cheb->free_size) {
+ dbg("size: %u > free_size: %u\n", size, cheb->free_size);
+ /*
+ * There isn't enough space on this eraseblock, we mark this as
+ * dirty and close the physical chain of the node refs.
+ */
+ //Write out pending data if any
+ if (chmp->chm_wbuf_len) {
+ chfs_flush_pending_wbuf(chmp);
+ //FIXME need goto restart here?
+ }
+
+ while (chmp->chm_wbuf_ofs < chmp->chm_ebh->eb_size) {
+ dbg("wbuf ofs: %zu - eb_size: %zu\n",
+ chmp->chm_wbuf_ofs, chmp->chm_ebh->eb_size);
+ chfs_flush_pending_wbuf(chmp);
+ }
+
+ if (!(chmp->chm_wbuf_ofs % chmp->chm_ebh->eb_size) && !chmp->chm_wbuf_len)
+ chmp->chm_wbuf_ofs = 0xffffffff;
+
+ err = chfs_close_eraseblock(chmp, cheb);
+ if (err)
+ return err;
+
+ cheb = NULL;
+ }
+ if (!cheb) {
+ //get a block for nextblock
+ if (TAILQ_EMPTY(&chmp->chm_free_queue)) {
+ // If this succeeds there will be a block on free_queue
+ dbg("cheb remap (free: %d)\n", chmp->chm_nr_free_blocks);
+ err = chfs_remap_leb(chmp);
+ if (err)
+ return err;
+ }
+ cheb = TAILQ_FIRST(&chmp->chm_free_queue);
+ TAILQ_REMOVE(&chmp->chm_free_queue, cheb, queue);
+ chmp->chm_nextblock = cheb;
+ chmp->chm_nr_free_blocks--;
+ }
+
+ return 0;
+}
+
--- /dev/null
+/* $NetBSD: chfs_pool.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Pool allocator and convenience routines for chfs.
+ */
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/pool.h>
+#include <sys/atomic.h>
+
+#include <uvm/uvm.h>
+
+#include "chfs.h"
+//#include </root/xipffs/netbsd.chfs/chfs.h>
+
+/* --------------------------------------------------------------------- */
+
+void * chfs_pool_page_alloc(struct pool *, int);
+void chfs_pool_page_free(struct pool *, void *);
+
+extern void* pool_page_alloc_nointr(struct pool *, int);
+extern void pool_page_free_nointr(struct pool *, void *);
+
+/* --------------------------------------------------------------------- */
+
+struct pool_allocator chfs_pool_allocator = {
+ .pa_alloc = chfs_pool_page_alloc,
+ .pa_free = chfs_pool_page_free,
+};
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_pool_init(struct chfs_pool *chpp, size_t size, const char *what,
+ struct chfs_mount *chmp)
+{
+ int cnt;
+
+ cnt = snprintf(chpp->chp_name, sizeof(chpp->chp_name),
+ "%s_chfs_%p", what, chmp);
+ KASSERT(cnt < sizeof(chpp->chp_name));
+
+ pool_init(&chpp->chp_pool, size, 0, 0, 0, chpp->chp_name,
+ &chfs_pool_allocator, IPL_NONE);
+ chpp->chp_mount = chmp;
+}
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_pool_destroy(struct chfs_pool *chpp)
+{
+ pool_destroy((struct pool *)chpp);
+}
+
+/* --------------------------------------------------------------------- */
+
+void *
+chfs_pool_page_alloc(struct pool *pp, int flags)
+{
+ struct chfs_pool *chpp;
+ struct chfs_mount *chmp;
+ unsigned int pages;
+ void *page;
+ dbg("CHFS: pool_page_alloc()\n");
+
+ chpp = (struct chfs_pool *)pp;
+ chmp = chpp->chp_mount;
+
+ pages = atomic_inc_uint_nv(&chmp->chm_pages_used);
+ if (pages >= CHFS_PAGES_MAX(chmp)) {
+ atomic_dec_uint(&chmp->chm_pages_used);
+ return NULL;
+ }
+ page = pool_page_alloc_nointr(pp, flags | PR_WAITOK);
+ if (page == NULL) {
+ atomic_dec_uint(&chmp->chm_pages_used);
+ }
+
+ return page;
+}
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_pool_page_free(struct pool *pp, void *v)
+{
+ struct chfs_pool *chpp;
+ struct chfs_mount *chmp;
+ dbg("CHFS: pool_page_free()\n");
+
+ chpp = (struct chfs_pool *)pp;
+ chmp = chpp->chp_mount;
+
+ atomic_dec_uint(&chmp->chm_pages_used);
+ pool_page_free_nointr(pp, v);
+}
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_str_pool_init(struct chfs_str_pool *chsp, struct chfs_mount *chmp)
+{
+ dbg("CHFS: str_pool_init()\n");
+
+ chfs_pool_init(&chsp->chsp_pool_16, 16, "str", chmp);
+ chfs_pool_init(&chsp->chsp_pool_32, 32, "str", chmp);
+ chfs_pool_init(&chsp->chsp_pool_64, 64, "str", chmp);
+ chfs_pool_init(&chsp->chsp_pool_128, 128, "str", chmp);
+ chfs_pool_init(&chsp->chsp_pool_256, 256, "str", chmp);
+ chfs_pool_init(&chsp->chsp_pool_512, 512, "str", chmp);
+ chfs_pool_init(&chsp->chsp_pool_1024, 1024, "str", chmp);
+}
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_str_pool_destroy(struct chfs_str_pool *chsp)
+{
+ dbg("CHFS: str_pool_destroy()\n");
+
+ chfs_pool_destroy(&chsp->chsp_pool_16);
+ chfs_pool_destroy(&chsp->chsp_pool_32);
+ chfs_pool_destroy(&chsp->chsp_pool_64);
+ chfs_pool_destroy(&chsp->chsp_pool_128);
+ chfs_pool_destroy(&chsp->chsp_pool_256);
+ chfs_pool_destroy(&chsp->chsp_pool_512);
+ chfs_pool_destroy(&chsp->chsp_pool_1024);
+}
+
+/* --------------------------------------------------------------------- */
+
+char *
+chfs_str_pool_get(struct chfs_str_pool *chsp, size_t len, int flags)
+{
+ struct chfs_pool *p;
+ dbg("CHFS: str_pool_get()\n");
+
+ KASSERT(len <= 1024);
+
+ if (len <= 16) p = &chsp->chsp_pool_16;
+ else if (len <= 32) p = &chsp->chsp_pool_32;
+ else if (len <= 64) p = &chsp->chsp_pool_64;
+ else if (len <= 128) p = &chsp->chsp_pool_128;
+ else if (len <= 256) p = &chsp->chsp_pool_256;
+ else if (len <= 512) p = &chsp->chsp_pool_512;
+ else if (len <= 1024) p = &chsp->chsp_pool_1024;
+ else {
+ KASSERT(0);
+ p = NULL; /* Silence compiler warnings */
+ }
+
+ return (char *)CHFS_POOL_GET(p, flags);
+}
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_str_pool_put(struct chfs_str_pool *chsp, char *str, size_t len)
+{
+ struct chfs_pool *p;
+ dbg("CHFS: str_pool_put()\n");
+
+ KASSERT(len <= 1024);
+
+ if (len <= 16) p = &chsp->chsp_pool_16;
+ else if (len <= 32) p = &chsp->chsp_pool_32;
+ else if (len <= 64) p = &chsp->chsp_pool_64;
+ else if (len <= 128) p = &chsp->chsp_pool_128;
+ else if (len <= 256) p = &chsp->chsp_pool_256;
+ else if (len <= 512) p = &chsp->chsp_pool_512;
+ else if (len <= 1024) p = &chsp->chsp_pool_1024;
+ else {
+ KASSERT(0);
+ p = NULL; /* Silence compiler warnings */
+ }
+
+ CHFS_POOL_PUT(p, str);
+}
--- /dev/null
+/* $NetBSD: chfs_readinode.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (C) 2010 David Tengeri <dtengeri@inf.u-szeged.hu>
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * chfs_readinode.c
+ *
+ * Created on: 2010.05.31.
+ * Author: dtengeri
+ */
+
+#include <sys/buf.h>
+
+#include "chfs.h"
+
+/* tmp node operations */
+int chfs_check_td_data(struct chfs_mount *,
+ struct chfs_tmp_dnode *);
+int chfs_check_td_node(struct chfs_mount *,
+ struct chfs_tmp_dnode *);
+struct chfs_node_ref *chfs_first_valid_data_ref(struct chfs_node_ref *);
+int chfs_add_tmp_dnode_to_tree(struct chfs_mount *,
+ struct chfs_readinode_info *,
+ struct chfs_tmp_dnode *);
+void chfs_add_tmp_dnode_to_tdi(struct chfs_tmp_dnode_info *,
+ struct chfs_tmp_dnode *);
+void chfs_remove_tmp_dnode_from_tdi(struct chfs_tmp_dnode_info *,
+ struct chfs_tmp_dnode *);
+static void chfs_kill_td(struct chfs_mount *,
+ struct chfs_tmp_dnode *);
+static void chfs_kill_tdi(struct chfs_mount *,
+ struct chfs_tmp_dnode_info *);
+/* frag node operations */
+struct chfs_node_frag *new_fragment(struct chfs_full_dnode *,
+ uint32_t,
+ uint32_t);
+int no_overlapping_node(struct rb_tree *, struct chfs_node_frag *,
+ struct chfs_node_frag *, uint32_t);
+int chfs_add_frag_to_fragtree(struct chfs_mount *,
+ struct rb_tree *,
+ struct chfs_node_frag *);
+void chfs_obsolete_node_frag(struct chfs_mount *,
+ struct chfs_node_frag *);
+/* general node operations */
+int chfs_get_data_nodes(struct chfs_mount *,
+ struct chfs_inode *,
+ struct chfs_readinode_info *);
+int chfs_build_fragtree(struct chfs_mount *,
+ struct chfs_inode *,
+ struct chfs_readinode_info *);
+
+
+
+/*
+ * --------------------------
+ * tmp node rbtree operations
+ * --------------------------
+ */
+static signed int
+tmp_node_compare_nodes(void *ctx, const void *n1, const void *n2)
+{
+ const struct chfs_tmp_dnode_info *tdi1 = n1;
+ const struct chfs_tmp_dnode_info *tdi2 = n2;
+
+ return (tdi1->tmpnode->node->ofs - tdi2->tmpnode->node->ofs);
+}
+
+static signed int
+tmp_node_compare_key(void *ctx, const void *n, const void *key)
+{
+ const struct chfs_tmp_dnode_info *tdi = n;
+ uint64_t ofs = *(const uint64_t *)key;
+
+ return (tdi->tmpnode->node->ofs - ofs);
+}
+
+const rb_tree_ops_t tmp_node_rbtree_ops = {
+ .rbto_compare_nodes = tmp_node_compare_nodes,
+ .rbto_compare_key = tmp_node_compare_key,
+ .rbto_node_offset = offsetof(struct chfs_tmp_dnode_info, rb_node),
+ .rbto_context = NULL
+};
+
+
+/*
+ * ---------------------------
+ * frag node rbtree operations
+ * ---------------------------
+ */
+static signed int
+frag_compare_nodes(void *ctx, const void *n1, const void *n2)
+{
+ const struct chfs_node_frag *frag1 = n1;
+ const struct chfs_node_frag *frag2 = n2;
+
+ return (frag1->ofs - frag2->ofs);
+}
+
+static signed int
+frag_compare_key(void *ctx, const void *n, const void *key)
+{
+ const struct chfs_node_frag *frag = n;
+ uint64_t ofs = *(const uint64_t *)key;
+
+ return (frag->ofs - ofs);
+}
+
+const rb_tree_ops_t frag_rbtree_ops = {
+ .rbto_compare_nodes = frag_compare_nodes,
+ .rbto_compare_key = frag_compare_key,
+ .rbto_node_offset = offsetof(struct chfs_node_frag, rb_node),
+ .rbto_context = NULL
+};
+
+
+/*
+ * -------------------
+ * tmp node operations
+ * -------------------
+ */
+/*
+ * Check the data CRC of the node.
+ *
+ * Returns: 0 - if everything OK;
+ * 1 - if CRC is incorrect;
+ * 	    2 - if the node could not be read;
+ * 	    error code if an error occurred.
+ */
+int
+chfs_check_td_data(struct chfs_mount *chmp,
+ struct chfs_tmp_dnode *td)
+{
+ int err;
+ size_t retlen, len, totlen;
+ uint32_t crc;
+ uint64_t ofs;
+ char *buf;
+ struct chfs_node_ref *nref = td->node->nref;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ KASSERT(!mutex_owned(&chmp->chm_lock_sizes));
+
+ ofs = CHFS_GET_OFS(nref->nref_offset) + sizeof(struct chfs_flash_data_node);
+ len = td->node->size;
+ if (!len)
+ return 0;
+
+ buf = kmem_alloc(len, KM_SLEEP);
+ if (!buf) {
+		dbg("allocation error\n");
+ return 2;
+ }
+ err = chfs_read_leb(chmp, nref->nref_lnr, buf, ofs, len, &retlen);
+ if (err) {
+		dbg("error while reading: %d\n", err);
+ err = 2;
+ goto out;
+ }
+
+ if (len != retlen) {
+ dbg("len:%zu, retlen:%zu\n", len, retlen);
+ err = 2;
+ goto out;
+ }
+ crc = crc32(0, (uint8_t *)buf, len);
+
+ if (crc != td->data_crc) {
+ dbg("crc failed, calculated: 0x%x, orig: 0x%x\n", crc, td->data_crc);
+ kmem_free(buf, len);
+ return 1;
+ }
+
+ nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) | CHFS_NORMAL_NODE_MASK;
+ totlen = CHFS_PAD(sizeof(struct chfs_flash_data_node) + len);
+
+ mutex_enter(&chmp->chm_lock_sizes);
+ chfs_change_size_unchecked(chmp, &chmp->chm_blocks[nref->nref_lnr], -totlen);
+ chfs_change_size_used(chmp, &chmp->chm_blocks[nref->nref_lnr], totlen);
+ mutex_exit(&chmp->chm_lock_sizes);
+ KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size);
+
+ err = 0;
+out:
+ kmem_free(buf, len);
+ return err;
+}
+
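+/*
+ * Check an unchecked node's data CRC and mark the node obsolete if the
+ * check fails.
+ */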
+int
+chfs_check_td_node(struct chfs_mount *chmp, struct chfs_tmp_dnode *td)
+{
+ int ret;
+
+ if (CHFS_REF_FLAGS(td->node->nref) != CHFS_UNCHECKED_NODE_MASK)
+ return 0;
+
+ ret = chfs_check_td_data(chmp, td);
+ if (ret == 1) {
+ chfs_mark_node_obsolete(chmp, td->node->nref);
+ }
+ return ret;
+}
+
+
+struct chfs_node_ref *
+chfs_first_valid_data_ref(struct chfs_node_ref *nref)
+{
+ while (nref) {
+ if (!CHFS_REF_OBSOLETE(nref)) {
+#ifdef DGB_MSG_GC
+ if (nref->nref_lnr == REF_EMPTY_NODE) {
+ dbg("FIRST VALID IS EMPTY!\n");
+ }
+#endif
+ return nref;
+ }
+
+ if (nref->nref_next) {
+ nref = nref->nref_next;
+ } else
+ break;
+ }
+ return NULL;
+}
+
+void
+chfs_add_tmp_dnode_to_tdi(struct chfs_tmp_dnode_info *tdi,
+ struct chfs_tmp_dnode *td)
+{
+ if (!tdi->tmpnode) {
+ tdi->tmpnode = td;
+ } else {
+ struct chfs_tmp_dnode *tmp = tdi->tmpnode;
+ while (tmp->next) {
+ tmp = tmp->next;
+ }
+ tmp->next = td;
+ }
+}
+
+void
+chfs_remove_tmp_dnode_from_tdi(struct chfs_tmp_dnode_info *tdi,
+ struct chfs_tmp_dnode *td)
+{
+ if (tdi->tmpnode == td) {
+ tdi->tmpnode = tdi->tmpnode->next;
+ } else {
+ struct chfs_tmp_dnode *tmp = tdi->tmpnode->next;
+ while (tmp->next && tmp->next != td) {
+ tmp = tmp->next;
+ }
+ if (tmp->next) {
+ tmp->next = td->next;
+ }
+ }
+}
+
+static void
+chfs_kill_td(struct chfs_mount *chmp,
+ struct chfs_tmp_dnode *td)
+{
+ /* check if we need to mark as obsolete, to avoid double mark */
+ if (!CHFS_REF_OBSOLETE(td->node->nref)) {
+ chfs_mark_node_obsolete(chmp, td->node->nref);
+ }
+
+ chfs_free_tmp_dnode(td);
+}
+
+static void
+chfs_kill_tdi(struct chfs_mount *chmp,
+ struct chfs_tmp_dnode_info *tdi)
+{
+ struct chfs_tmp_dnode *next, *tmp = tdi->tmpnode;
+
+ while (tmp) {
+ next = tmp->next;
+ chfs_kill_td(chmp, tmp);
+ tmp = next;
+ }
+
+ chfs_free_tmp_dnode_info(tdi);
+}
+
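+/*
+ * chfs_add_tmp_dnode_to_tree - add a tmp dnode to the read-inode rbtree
+ *
+ * Overlapping nodes are compared: the one with the lower version (or with a
+ * failed CRC check) is killed, and the overlapped flags of the remaining
+ * nodes are updated.
+ */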
+int
+chfs_add_tmp_dnode_to_tree(struct chfs_mount *chmp,
+ struct chfs_readinode_info *rii,
+ struct chfs_tmp_dnode *newtd)
+{
+ uint64_t end_ofs = newtd->node->ofs + newtd->node->size;
+ struct chfs_tmp_dnode_info *this;
+ struct rb_node *node, *prev_node;
+ struct chfs_tmp_dnode_info *newtdi;
+
+ node = rb_tree_find_node(&rii->tdi_root, &newtd->node->ofs);
+ if (node) {
+ this = (struct chfs_tmp_dnode_info *)node;
+ while (this->tmpnode->overlapped) {
+ prev_node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_LEFT);
+ if (!prev_node) {
+ this->tmpnode->overlapped = 0;
+ break;
+ }
+ node = prev_node;
+ this = (struct chfs_tmp_dnode_info *)node;
+ }
+ }
+ while (node) {
+ this = (struct chfs_tmp_dnode_info *)node;
+ if (this->tmpnode->node->ofs > end_ofs)
+ break;
+
+ struct chfs_tmp_dnode *tmp_td = this->tmpnode;
+ while (tmp_td) {
+ if (tmp_td->version == newtd->version) {
+ if (!chfs_check_td_node(chmp, tmp_td)) {
+ dbg("calling kill td 0\n");
+ chfs_kill_td(chmp, newtd);
+ return 0;
+ } else {
+ chfs_remove_tmp_dnode_from_tdi(this, tmp_td);
+ chfs_kill_td(chmp, tmp_td);
+ chfs_add_tmp_dnode_to_tdi(this, newtd);
+ return 0;
+ }
+ }
+ if (tmp_td->version < newtd->version &&
+ tmp_td->node->ofs >= newtd->node->ofs &&
+ tmp_td->node->ofs + tmp_td->node->size <= end_ofs) {
+ /* New node entirely overlaps 'this' */
+ if (chfs_check_td_node(chmp, newtd)) {
+ dbg("calling kill td 2\n");
+ chfs_kill_td(chmp, newtd);
+ return 0;
+ }
+ /* ... and is good. Kill 'this' and any subsequent nodes which are also overlapped */
+ while (tmp_td && tmp_td->node->ofs + tmp_td->node->size <= end_ofs) {
+ struct rb_node *next = rb_tree_iterate(&rii->tdi_root, this, RB_DIR_RIGHT);
+ struct chfs_tmp_dnode_info *next_tdi = (struct chfs_tmp_dnode_info *)next;
+ struct chfs_tmp_dnode *next_td = NULL;
+ if (tmp_td->next) {
+ next_td = tmp_td->next;
+ } else if (next_tdi) {
+ next_td = next_tdi->tmpnode;
+ }
+ if (tmp_td->version < newtd->version) {
+ chfs_remove_tmp_dnode_from_tdi(this, tmp_td);
+ chfs_kill_td(chmp, tmp_td);
+ if (!this->tmpnode) {
+ rb_tree_remove_node(&rii->tdi_root, this);
+ chfs_kill_tdi(chmp, this);
+ this = next_tdi;
+ }
+ }
+ tmp_td = next_td;
+ }
+ continue;
+ }
+ if (tmp_td->version > newtd->version &&
+ tmp_td->node->ofs <= newtd->node->ofs &&
+ tmp_td->node->ofs + tmp_td->node->size >= end_ofs) {
+ /* New node entirely overlapped by 'this' */
+ if (!chfs_check_td_node(chmp, tmp_td)) {
+ dbg("this version: %llu\n",
+ (unsigned long long)tmp_td->version);
+ dbg("this ofs: %llu, size: %u\n",
+ (unsigned long long)tmp_td->node->ofs,
+ tmp_td->node->size);
+ dbg("calling kill td 4\n");
+ chfs_kill_td(chmp, newtd);
+ return 0;
+ }
+ /* ... but 'this' was bad. Replace it... */
+ chfs_remove_tmp_dnode_from_tdi(this, tmp_td);
+ chfs_kill_td(chmp, tmp_td);
+ if (!this->tmpnode) {
+ rb_tree_remove_node(&rii->tdi_root, this);
+ chfs_kill_tdi(chmp, this);
+ }
+ dbg("calling kill td 5\n");
+ chfs_kill_td(chmp, newtd);
+ break;
+ }
+ tmp_td = tmp_td->next;
+ }
+ node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_RIGHT);
+ }
+
+ newtdi = chfs_alloc_tmp_dnode_info();
+ chfs_add_tmp_dnode_to_tdi(newtdi, newtd);
+ /* We neither completely obsoleted nor were completely
+ obsoleted by an earlier node. Insert into the tree */
+ struct chfs_tmp_dnode_info *tmp_tdi = rb_tree_insert_node(&rii->tdi_root, newtdi);
+ if (tmp_tdi != newtdi) {
+ chfs_add_tmp_dnode_to_tdi(tmp_tdi, newtd);
+ newtdi->tmpnode = NULL;
+ chfs_kill_tdi(chmp, newtdi);
+ }
+
+ /* If there's anything behind that overlaps us, note it */
+ node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_LEFT);
+ if (node) {
+ while (1) {
+ this = (struct chfs_tmp_dnode_info *)node;
+ if (this->tmpnode->node->ofs + this->tmpnode->node->size > newtd->node->ofs) {
+ newtd->overlapped = 1;
+ }
+ if (!this->tmpnode->overlapped)
+ break;
+
+ prev_node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_LEFT);
+ if (!prev_node) {
+ this->tmpnode->overlapped = 0;
+ break;
+ }
+ node = prev_node;
+ }
+ }
+
+ /* If the new node overlaps anything ahead, note it */
+ node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_RIGHT);
+ this = (struct chfs_tmp_dnode_info *)node;
+ while (this && this->tmpnode->node->ofs < end_ofs) {
+ this->tmpnode->overlapped = 1;
+ node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_RIGHT);
+ this = (struct chfs_tmp_dnode_info *)node;
+ }
+ return 0;
+}
+
+
+/*
+ * --------------------
+ * frag node operations
+ * --------------------
+ */
+struct chfs_node_frag *
+new_fragment(struct chfs_full_dnode *fdn, uint32_t ofs, uint32_t size)
+{
+ struct chfs_node_frag *newfrag;
+ newfrag = chfs_alloc_node_frag();
+ if (newfrag) {
+ newfrag->ofs = ofs;
+ newfrag->size = size;
+ newfrag->node = fdn;
+ } else {
+ chfs_err("cannot allocate a chfs_node_frag object\n");
+ }
+ return newfrag;
+}
+
+int
+no_overlapping_node(struct rb_tree *fragtree,
+ struct chfs_node_frag *newfrag,
+ struct chfs_node_frag *this, uint32_t lastend)
+{
+ if (lastend < newfrag->node->ofs) {
+ struct chfs_node_frag *holefrag;
+
+ holefrag = new_fragment(NULL, lastend, newfrag->node->ofs - lastend);
+ if (!holefrag) {
+ chfs_free_node_frag(newfrag);
+ return ENOMEM;
+ }
+
+ rb_tree_insert_node(fragtree, holefrag);
+ this = holefrag;
+ }
+
+ rb_tree_insert_node(fragtree, newfrag);
+
+ return 0;
+}
+
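+/*
+ * chfs_add_frag_to_fragtree - insert a fragment into a fragtree
+ *
+ * Existing fragments overlapped by the new one are split, trimmed or
+ * obsoleted as needed.
+ */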
+int
+chfs_add_frag_to_fragtree(struct chfs_mount *chmp,
+ struct rb_tree *fragtree,
+ struct chfs_node_frag *newfrag)
+{
+ struct chfs_node_frag *this;
+ uint32_t lastend;
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ this = (struct chfs_node_frag *)rb_tree_find_node_leq(fragtree, &newfrag->ofs);
+
+ if (this) {
+ lastend = this->ofs + this->size;
+ } else {
+ lastend = 0;
+ }
+
+ if (lastend <= newfrag->ofs) {
+ //dbg("no overlapping node\n");
+ if (lastend && (lastend - 1) >> PAGE_SHIFT == newfrag->ofs >> PAGE_SHIFT) {
+ if (this->node)
+ CHFS_MARK_REF_NORMAL(this->node->nref);
+ CHFS_MARK_REF_NORMAL(newfrag->node->nref);
+ }
+ return no_overlapping_node(fragtree, newfrag, this, lastend);
+ }
+
+ if (newfrag->ofs > this->ofs) {
+
+ CHFS_MARK_REF_NORMAL(newfrag->node->nref);
+ if (this->node)
+ CHFS_MARK_REF_NORMAL(this->node->nref);
+
+ if (this->ofs + this->size > newfrag->ofs + newfrag->size) {
+ /* newfrag is inside of this */
+ //dbg("newfrag is inside of this\n");
+ struct chfs_node_frag *newfrag2;
+
+ newfrag2 = new_fragment(this->node, newfrag->ofs + newfrag->size,
+ this->ofs + this->size - newfrag->ofs - newfrag->size);
+ if (!newfrag2)
+ return ENOMEM;
+ if (this->node)
+ this->node->frags++;
+
+ this->size = newfrag->ofs - this->ofs;
+
+ rb_tree_insert_node(fragtree, newfrag);
+ rb_tree_insert_node(fragtree, newfrag2);
+
+ return 0;
+ }
+ /* newfrag is bottom of this */
+ //dbg("newfrag is bottom of this\n");
+ this->size = newfrag->ofs - this->ofs;
+ rb_tree_insert_node(fragtree, newfrag);
+ } else {
+ /* newfrag start at same point */
+ //dbg("newfrag start at same point\n");
+ //TODO replace instead of remove and insert
+ rb_tree_remove_node(fragtree, this);
+ rb_tree_insert_node(fragtree, newfrag);
+
+ if (newfrag->ofs + newfrag->size >= this->ofs+this->size) {
+ chfs_obsolete_node_frag(chmp, this);
+ } else {
+ this->ofs += newfrag->size;
+ this->size -= newfrag->size;
+
+ rb_tree_insert_node(fragtree, this);
+ return 0;
+ }
+ }
+ /* OK, now we have newfrag added in the correct place in the tree, but
+ frag_next(newfrag) may be a fragment which is overlapped by it
+ */
+ while ((this = frag_next(fragtree, newfrag)) && newfrag->ofs + newfrag->size >= this->ofs + this->size) {
+ rb_tree_remove_node(fragtree, this);
+ chfs_obsolete_node_frag(chmp, this);
+ }
+
+ if (!this || newfrag->ofs + newfrag->size == this->ofs)
+ return 0;
+
+ this->size = (this->ofs + this->size) - (newfrag->ofs + newfrag->size);
+ this->ofs = newfrag->ofs + newfrag->size;
+
+ if (this->node)
+ CHFS_MARK_REF_NORMAL(this->node->nref);
+ CHFS_MARK_REF_NORMAL(newfrag->node->nref);
+
+ return 0;
+}
+
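+/* chfs_kill_fragtree - remove and free every fragment of a fragtree */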
+void
+chfs_kill_fragtree(struct rb_tree *fragtree)
+{
+ struct chfs_node_frag *this, *next;
+ //dbg("start\n");
+
+ this = (struct chfs_node_frag *)RB_TREE_MIN(fragtree);
+ while (this) {
+ //for (this = (struct chfs_node_frag *)RB_TREE_MIN(&fragtree); this != NULL; this = (struct chfs_node_frag *)rb_tree_iterate(&fragtree, &this->rb_node, RB_DIR_RIGHT)) {
+ next = frag_next(fragtree, this);
+ rb_tree_remove_node(fragtree, this);
+ chfs_free_node_frag(this);
+ //dbg("one frag killed\n");
+ this = next;
+ }
+ //dbg("end\n");
+}
+
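+/*
+ * chfs_truncate_fragtree - truncate a fragtree to the given size
+ *
+ * Trims the fragment spanning the new size, removes the fragments behind it
+ * and returns the new size (or the end of the last remaining fragment if
+ * that is smaller).
+ */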
+uint32_t
+chfs_truncate_fragtree(struct chfs_mount *chmp,
+ struct rb_tree *fragtree, uint32_t size)
+{
+ struct chfs_node_frag *frag;
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ dbg("truncate to size: %u\n", size);
+
+ frag = (struct chfs_node_frag *)rb_tree_find_node_leq(fragtree, &size);
+
+ /* Find the last frag before size and set its new size. */
+ if (frag && frag->ofs != size) {
+ if (frag->ofs + frag->size > size) {
+ frag->size = size - frag->ofs;
+ }
+ frag = frag_next(fragtree, frag);
+ }
+
+ /* Delete frags after new size. */
+ while (frag && frag->ofs >= size) {
+ struct chfs_node_frag *next = frag_next(fragtree, frag);
+
+ rb_tree_remove_node(fragtree, frag);
+ chfs_obsolete_node_frag(chmp, frag);
+ frag = next;
+ }
+
+ if (size == 0) {
+ return 0;
+ }
+
+ frag = frag_last(fragtree);
+
+ if (!frag) {
+ return 0;
+ }
+
+ if (frag->ofs + frag->size < size) {
+ return frag->ofs + frag->size;
+ }
+
+	/* FIXME Should we check the position of the last node? (PAGE_CACHE size, etc.) */
+ if (frag->node && (frag->ofs & (PAGE_SIZE - 1)) == 0) {
+ frag->node->nref->nref_offset = CHFS_GET_OFS(frag->node->nref->nref_offset) | CHFS_PRISTINE_NODE_MASK;
+ }
+
+ return size;
+}
+
+void
+chfs_obsolete_node_frag(struct chfs_mount *chmp,
+ struct chfs_node_frag *this)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ if (this->node) {
+ this->node->frags--;
+ if (!this->node->frags) {
+ struct chfs_vnode_cache *vc = chfs_nref_to_vc(this->node->nref);
+ chfs_mark_node_obsolete(chmp, this->node->nref);
+
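+			/* Unlink the obsoleted node's nref from the vnode cache's data node list. */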
+ if (vc->dnode == this->node->nref) {
+ vc->dnode = this->node->nref->nref_next;
+ } else {
+ struct chfs_node_ref *tmp = vc->dnode;
+ while (tmp->nref_next != (struct chfs_node_ref*) vc
+ && tmp->nref_next != this->node->nref) {
+ tmp = tmp->nref_next;
+ }
+ if (tmp->nref_next == this->node->nref) {
+ tmp->nref_next = this->node->nref->nref_next;
+ }
+ // FIXME should we free here the this->node->nref?
+ }
+
+ chfs_free_full_dnode(this->node);
+ } else {
+ CHFS_MARK_REF_NORMAL(this->node->nref);
+ }
+ }
+ chfs_free_node_frag(this);
+}
+
+int
+chfs_add_full_dnode_to_inode(struct chfs_mount *chmp,
+ struct chfs_inode *ip,
+ struct chfs_full_dnode *fd)
+{
+ int ret;
+ struct chfs_node_frag *newfrag;
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ if (unlikely(!fd->size))
+ return 0;
+
+ newfrag = new_fragment(fd, fd->ofs, fd->size);
+ if (unlikely(!newfrag))
+ return ENOMEM;
+
+ newfrag->node->frags = 1;
+
+ ret = chfs_add_frag_to_fragtree(chmp, &ip->fragtree, newfrag);
+ if (ret)
+ return ret;
+
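+	/*
+	 * Fragments that do not start or end on a page boundary cannot be
+	 * treated as pristine nodes, so mark the affected refs normal.
+	 */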
+ if (newfrag->ofs & (PAGE_SIZE - 1)) {
+ struct chfs_node_frag *prev = frag_prev(&ip->fragtree, newfrag);
+
+ CHFS_MARK_REF_NORMAL(fd->nref);
+ if (prev->node)
+ CHFS_MARK_REF_NORMAL(prev->node->nref);
+ }
+
+ if ((newfrag->ofs+newfrag->size) & (PAGE_SIZE - 1)) {
+ struct chfs_node_frag *next = frag_next(&ip->fragtree, newfrag);
+
+ if (next) {
+ CHFS_MARK_REF_NORMAL(fd->nref);
+ if (next->node)
+ CHFS_MARK_REF_NORMAL(next->node->nref);
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ * -----------------------
+ * general node operations
+ * -----------------------
+ */
+/* get tmp nodes of an inode */
+int
+chfs_get_data_nodes(struct chfs_mount *chmp,
+ struct chfs_inode *ip,
+ struct chfs_readinode_info *rii)
+{
+ uint32_t crc;
+ int err;
+ size_t len, retlen;
+ struct chfs_node_ref *nref;
+ struct chfs_flash_data_node *dnode;
+ struct chfs_tmp_dnode *td;
+ char* buf;
+
+	len = sizeof(struct chfs_flash_data_node);
+	buf = kmem_alloc(len, KM_SLEEP);
+
+ nref = chfs_first_valid_data_ref(ip->chvc->dnode);
+
+ rii->highest_version = ip->chvc->highest_version;
+
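+	/*
+	 * Walk the vnode's data node list; verify each node's header and
+	 * collect the valid ones into the temporary dnode tree.
+	 */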
+ while(nref && (struct chfs_vnode_cache *)nref != ip->chvc) {
+ err = chfs_read_leb(chmp, nref->nref_lnr, buf, CHFS_GET_OFS(nref->nref_offset), len, &retlen);
+ if (err || len != retlen)
+ goto out;
+ dnode = (struct chfs_flash_data_node*)buf;
+
+ //check header crc
+ crc = crc32(0, (uint8_t *)dnode, CHFS_NODE_HDR_SIZE - 4);
+ if (crc != le32toh(dnode->hdr_crc)) {
+ chfs_err("CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->hdr_crc));
+ goto cont;
+ }
+ //check header magic bitmask
+ if (le16toh(dnode->magic) != CHFS_FS_MAGIC_BITMASK) {
+ chfs_err("Wrong magic bitmask.\n");
+ goto cont;
+ }
+ //check node crc
+ crc = crc32(0, (uint8_t *)dnode, sizeof(*dnode) - 4);
+ if (crc != le32toh(dnode->node_crc)) {
+ chfs_err("Node CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->node_crc));
+ goto cont;
+ }
+ td = chfs_alloc_tmp_dnode();
+ if (!td) {
+ chfs_err("Can't allocate tmp dnode info.\n");
+ err = ENOMEM;
+ goto out;
+ }
+ /* We don't check data crc here, just add nodes to tmp frag tree, because
+ * we don't want to check nodes which have been overlapped by a new node
+ * with a higher version number.
+ */
+ td->node = chfs_alloc_full_dnode();
+ if (!td->node) {
+ chfs_err("Can't allocate full dnode info.\n");
+ err = ENOMEM;
+ goto out_tmp_dnode;
+ }
+ td->version = le64toh(dnode->version);
+ td->node->ofs = le64toh(dnode->offset);
+ td->data_crc = le32toh(dnode->data_crc);
+ td->node->nref = nref;
+ td->node->size = le32toh(dnode->data_length);
+ td->overlapped = 0;
+
+ if (td->version > rii->highest_version) {
+ rii->highest_version = td->version;
+ }
+
+ err = chfs_add_tmp_dnode_to_tree(chmp, rii, td);
+ if (err)
+ goto out_full_dnode;
+
+cont:
+ nref = chfs_first_valid_data_ref(nref->nref_next);
+ }
+
+	ip->chvc->highest_version = rii->highest_version;
+	kmem_free(buf, len);
+	return 0;
+
+/* Exit points */
+out_full_dnode:
+ chfs_free_full_dnode(td->node);
+out_tmp_dnode:
+ chfs_free_tmp_dnode(td);
+out:
+ kmem_free(buf, len);
+ return err;
+}
+
+
+/* Build final normal fragtree from tdi tree. */
+int
+chfs_build_fragtree(struct chfs_mount *chmp, struct chfs_inode *ip,
+ struct chfs_readinode_info *rii)
+{
+ struct chfs_tmp_dnode_info *pen, *last, *this;
+ struct rb_tree ver_tree; /* version tree */
+ uint64_t high_ver = 0;
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ rb_tree_init(&ver_tree, &tmp_node_rbtree_ops);
+
+ if (rii->mdata_tn) {
+ high_ver = rii->mdata_tn->tmpnode->version;
+ rii->latest_ref = rii->mdata_tn->tmpnode->node->nref;
+ }
+
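+	/*
+	 * Walk the temporary dnode tree from its last node backwards.  Runs of
+	 * overlapping nodes are collected into ver_tree, then checked and
+	 * merged into the inode's fragtree one by one.
+	 */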
+ pen = (struct chfs_tmp_dnode_info *)RB_TREE_MAX(&rii->tdi_root);
+
+ while((last = pen)) {
+ pen = (struct chfs_tmp_dnode_info *)rb_tree_iterate(&rii->tdi_root, last, RB_DIR_LEFT);
+
+ rb_tree_remove_node(&rii->tdi_root, last);
+ rb_tree_insert_node(&ver_tree, last);
+
+ if (last->tmpnode->overlapped) {
+ if (pen)
+ continue;
+
+ last->tmpnode->overlapped = 0;
+ }
+
+ this = (struct chfs_tmp_dnode_info *)RB_TREE_MAX(&ver_tree);
+
+ while (this) {
+ struct chfs_tmp_dnode_info *vers_next;
+ int ret;
+
+ vers_next = (struct chfs_tmp_dnode_info *)rb_tree_iterate(&ver_tree, this, RB_DIR_LEFT);
+ rb_tree_remove_node(&ver_tree, this);
+
+ struct chfs_tmp_dnode *tmp_td = this->tmpnode;
+ while (tmp_td) {
+ struct chfs_tmp_dnode *next_td = tmp_td->next;
+
+ if (chfs_check_td_node(chmp, tmp_td)) {
+ if (next_td) {
+ chfs_remove_tmp_dnode_from_tdi(this, tmp_td);
+ } else {
+ break;
+ }
+ } else {
+ if (tmp_td->version > high_ver) {
+ high_ver = tmp_td->version;
+ dbg("highver: %llu\n", (unsigned long long)high_ver);
+ rii->latest_ref = tmp_td->node->nref;
+ }
+
+ ret = chfs_add_full_dnode_to_inode(chmp, ip, tmp_td->node);
+ if (ret) {
+ while (1) {
+ vers_next = (struct chfs_tmp_dnode_info *)rb_tree_iterate(&ver_tree, this, RB_DIR_LEFT);
+ while (tmp_td) {
+ next_td = tmp_td->next;
+ if (chfs_check_td_node(chmp, tmp_td) > 1) {
+ chfs_mark_node_obsolete(chmp,
+ tmp_td->node->nref);
+ }
+ chfs_free_full_dnode(tmp_td->node);
+ chfs_remove_tmp_dnode_from_tdi(this, tmp_td);
+ chfs_free_tmp_dnode(tmp_td);
+ tmp_td = next_td;
+ }
+ chfs_free_tmp_dnode_info(this);
+ this = vers_next;
+ if (!this)
+ break;
+ rb_tree_remove_node(&ver_tree, vers_next);
+ }
+ return ret;
+ }
+
+ chfs_remove_tmp_dnode_from_tdi(this, tmp_td);
+ chfs_free_tmp_dnode(tmp_td);
+ }
+ tmp_td = next_td;
+ }
+ chfs_kill_tdi(chmp, this);
+ this = vers_next;
+ }
+ }
+
+ return 0;
+}
+
+int chfs_read_inode(struct chfs_mount *chmp, struct chfs_inode *ip)
+{
+ struct chfs_vnode_cache *vc = ip->chvc;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+retry:
+ /* XXX locking */
+ //mutex_enter(&chmp->chm_lock_vnocache);
+ switch (vc->state) {
+ case VNO_STATE_UNCHECKED:
+ case VNO_STATE_CHECKEDABSENT:
+// chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_READING);
+ vc->state = VNO_STATE_READING;
+ break;
+ case VNO_STATE_CHECKING:
+ case VNO_STATE_GC:
+ //sleep_on_spinunlock(&chmp->chm_lock_vnocache);
+ //KASSERT(!mutex_owned(&chmp->chm_lock_vnocache));
+ goto retry;
+ break;
+ case VNO_STATE_PRESENT:
+ case VNO_STATE_READING:
+ chfs_err("Reading inode #%llu in state %d!\n",
+ (unsigned long long)vc->vno, vc->state);
+ chfs_err("wants to read a nonexistent ino %llu\n",
+ (unsigned long long)vc->vno);
+ return ENOENT;
+ default:
+ panic("BUG() Bad vno cache state.");
+ }
+ //mutex_exit(&chmp->chm_lock_vnocache);
+
+ return chfs_read_inode_internal(chmp, ip);
+}
+
+/*
+ * Read inode frags.
+ * Firstly get tmp nodes,
+ * secondly build fragtree from those.
+ */
+int
+chfs_read_inode_internal(struct chfs_mount *chmp, struct chfs_inode *ip)
+{
+ int err;
+ size_t len, retlen;
+ char* buf;
+ struct chfs_readinode_info rii;
+ struct chfs_flash_vnode *fvnode;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ len = sizeof(*fvnode);
+
+ memset(&rii, 0, sizeof(rii));
+
+ rb_tree_init(&rii.tdi_root, &tmp_node_rbtree_ops);
+
+ /* build up a temp node frag tree */
+ err = chfs_get_data_nodes(chmp, ip, &rii);
+ if (err) {
+ if (ip->chvc->state == VNO_STATE_READING)
+ ip->chvc->state = VNO_STATE_CHECKEDABSENT;
+ /* FIXME Should we kill fragtree or something here? */
+ return err;
+ }
+
+ rb_tree_init(&ip->fragtree, &frag_rbtree_ops);
+ /*
+ * build fragtree from temp nodes
+ */
+ err = chfs_build_fragtree(chmp, ip, &rii);
+ if (err) {
+ if (ip->chvc->state == VNO_STATE_READING)
+ ip->chvc->state = VNO_STATE_CHECKEDABSENT;
+ /* FIXME Should we kill fragtree or something here? */
+ return err;
+ }
+
+ if (!rii.latest_ref) {
+ return 0;
+ }
+
+ buf = kmem_alloc(len, KM_SLEEP);
+ if (!buf)
+ return ENOMEM;
+
+ /*
+ * set inode size from chvc->v
+ */
+ err = chfs_read_leb(chmp, ip->chvc->v->nref_lnr, buf, CHFS_GET_OFS(ip->chvc->v->nref_offset), len, &retlen);
+ if (err || retlen != len) {
+ kmem_free(buf, len);
+ return err?err:EIO;
+ }
+
+ fvnode = (struct chfs_flash_vnode*)buf;
+
+ dbg("set size from v: %u\n", fvnode->dn_size);
+ chfs_set_vnode_size(ITOV(ip), fvnode->dn_size);
+ uint32_t retsize = chfs_truncate_fragtree(chmp, &ip->fragtree, fvnode->dn_size);
+ if (retsize != fvnode->dn_size) {
+ dbg("Truncating failed. It is %u instead of %u\n", retsize, fvnode->dn_size);
+ }
+
+ kmem_free(buf, len);
+
+ if (ip->chvc->state == VNO_STATE_READING) {
+ ip->chvc->state = VNO_STATE_PRESENT;
+ }
+
+ return 0;
+}
+
+int
+chfs_read_data(struct chfs_mount* chmp, struct vnode *vp,
+ struct buf *bp)
+{
+ off_t ofs;
+ struct chfs_node_frag *frag;
+ char * buf;
+ int err = 0;
+ size_t size, retlen;
+ uint32_t crc;
+ struct chfs_inode *ip = VTOI(vp);
+ struct chfs_flash_data_node *dnode;
+ struct chfs_node_ref *nref;
+
+ memset(bp->b_data, 0, bp->b_bcount);
+
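+	/* Look up the fragment that covers the start of the requested block. */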
+ ofs = bp->b_blkno * PAGE_SIZE;
+ frag = (struct chfs_node_frag *)rb_tree_find_node_leq(&ip->fragtree, &ofs);
+
+ if (!frag || frag->ofs > ofs || frag->ofs + frag->size <= ofs) {
+ dbg("not found in frag tree\n");
+ return 0;
+ }
+
+ if (!frag->node) {
+ dbg("no node in frag\n");
+ return 0;
+ }
+
+ nref = frag->node->nref;
+
+ size = sizeof(*dnode) + frag->size;
+
+ buf = kmem_alloc(size, KM_SLEEP);
+
+ dbg("reading from lnr: %u, offset: %u, size: %zu\n", nref->nref_lnr, CHFS_GET_OFS(nref->nref_offset), size);
+ err = chfs_read_leb(chmp, nref->nref_lnr, buf, CHFS_GET_OFS(nref->nref_offset), size, &retlen);
+ if (err) {
+ chfs_err("error after reading: %d\n", err);
+ goto out;
+ }
+ if (retlen != size) {
+ chfs_err("retlen: %zu != size: %zu\n", retlen, size);
+ err = EIO;
+ goto out;
+ }
+
+ dnode = (struct chfs_flash_data_node *)buf;
+ crc = crc32(0, (uint8_t *)dnode, CHFS_NODE_HDR_SIZE - 4);
+ if (crc != le32toh(dnode->hdr_crc)) {
+ chfs_err("CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->hdr_crc));
+ err = EIO;
+ goto out;
+ }
+ //check header magic bitmask
+ if (le16toh(dnode->magic) != CHFS_FS_MAGIC_BITMASK) {
+ chfs_err("Wrong magic bitmask.\n");
+ err = EIO;
+ goto out;
+ }
+ //check node crc
+ crc = crc32(0, (uint8_t *)dnode, sizeof(*dnode) - 4);
+ if (crc != le32toh(dnode->node_crc)) {
+ chfs_err("Node CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->node_crc));
+ err = EIO;
+ goto out;
+ }
+	crc = crc32(0, (uint8_t *)dnode->data, le32toh(dnode->data_length));
+ if (crc != le32toh(dnode->data_crc)) {
+ chfs_err("Data CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->data_crc));
+ err = EIO;
+ goto out;
+ }
+
+	memcpy(bp->b_data, dnode->data, le32toh(dnode->data_length));
+ bp->b_resid = 0;
+
+out:
+ kmem_free(buf, size);
+ return err;
+}
--- /dev/null
+/* $NetBSD: chfs_scan.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (c) 2010 David Tengeri <dtengeri@inf.u-szeged.hu>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * chfs_scan.c
+ *
+ * Created on: 2009.11.05.
+ * Author: dtengeri
+ */
+
+#include "chfs.h"
+
+/**
+ * chfs_scan_make_vnode_cache - makes a new vnode cache during scan
+ * @chmp: CHFS main descriptor structure
+ * @vno: vnode identifier
+ * This function returns a vnode cache belonging to @vno.
+ */
+struct chfs_vnode_cache *
+chfs_scan_make_vnode_cache(struct chfs_mount *chmp, ino_t vno)
+{
+ struct chfs_vnode_cache *vc;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_vnocache));
+
+ vc = chfs_vnode_cache_get(chmp, vno);
+ if (vc) {
+ return vc;
+ }
+
+ if (vno > chmp->chm_max_vno) {
+ chmp->chm_max_vno = vno;
+ }
+
+ vc = chfs_vnode_cache_alloc(vno);
+
+ //mutex_enter(&chmp->chm_lock_vnocache);
+
+ chfs_vnode_cache_add(chmp, vc);
+
+ //mutex_exit(&chmp->chm_lock_vnocache);
+
+ if (vno == CHFS_ROOTINO) {
+ vc->nlink = 2;
+ vc->pvno = CHFS_ROOTINO;
+ chfs_vnode_cache_set_state(chmp,
+ vc, VNO_STATE_CHECKEDABSENT);
+ }
+
+ return vc;
+}
+
+/**
+ * chfs_scan_check_node_hdr - checks node magic and crc
+ * @nhdr: node header to check
+ * Returns 0 if everything is OK, error code otherwise.
+ */
+int
+chfs_scan_check_node_hdr(struct chfs_flash_node_hdr *nhdr)
+{
+ uint16_t magic;
+ uint32_t crc, hdr_crc;
+
+ magic = le16toh(nhdr->magic);
+
+ if (magic != CHFS_FS_MAGIC_BITMASK) {
+ dbg("bad magic\n");
+ return CHFS_NODE_BADMAGIC;
+ }
+
+ hdr_crc = le32toh(nhdr->hdr_crc);
+ crc = crc32(0, (uint8_t *)nhdr, CHFS_NODE_HDR_SIZE - 4);
+
+ if (crc != hdr_crc) {
+ dbg("bad crc\n");
+ return CHFS_NODE_BADCRC;
+ }
+
+ return CHFS_NODE_OK;
+}
+
+/**
+ * chfs_scan_check_vnode - check vnode crc and add to vnode cache
+ * @chmp: CHFS main descriptor structure
+ * @cheb: eraseblock information
+ * @buf: vnode to check
+ * @ofs: offset in eraseblock where vnode starts
+ */
+int
+chfs_scan_check_vnode(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, void *buf, off_t ofs)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ struct chfs_vnode_cache *vc;
+ struct chfs_flash_vnode *vnode = buf;
+ struct chfs_node_ref *nref;
+ int err;
+ uint32_t crc;
+ ino_t vno;
+
+ crc = crc32(0, (uint8_t *)vnode,
+ sizeof(struct chfs_flash_vnode) - 4);
+
+ if (crc != le32toh(vnode->node_crc)) {
+ err = chfs_update_eb_dirty(chmp,
+ cheb, le32toh(vnode->length));
+ if (err) {
+ return err;
+ }
+
+ return CHFS_NODE_BADCRC;
+ }
+
+ vno = le64toh(vnode->vno);
+
+ mutex_enter(&chmp->chm_lock_vnocache);
+ vc = chfs_vnode_cache_get(chmp, vno);
+ if (!vc) {
+ vc = chfs_scan_make_vnode_cache(chmp, vno);
+ if (!vc) {
+ mutex_exit(&chmp->chm_lock_vnocache);
+ return ENOMEM;
+ }
+ }
+ mutex_exit(&chmp->chm_lock_vnocache);
+
+ nref = chfs_alloc_node_ref(cheb);
+
+ nref->nref_offset = ofs;
+
+ KASSERT(nref->nref_lnr == cheb->lnr);
+
+ /* Check version of vnode. */
+ if ((struct chfs_vnode_cache *)vc->v != vc) {
+ if (le64toh(vnode->version) > *vc->vno_version) {
+ //err = chfs_update_eb_dirty(chmp, &chmp->chm_blocks[vc->v->lnr],
+ // sizeof(struct chfs_flash_vnode));
+ *vc->vno_version = le64toh(vnode->version);
+ chfs_add_vnode_ref_to_vc(chmp, vc, nref);
+ } else {
+ err = chfs_update_eb_dirty(chmp, cheb,
+ sizeof(struct chfs_flash_vnode));
+ return CHFS_NODE_OK;
+ }
+ } else {
+ vc->vno_version = kmem_alloc(sizeof(uint64_t), KM_SLEEP);
+ if (!vc->vno_version)
+ return ENOMEM;
+ *vc->vno_version = le64toh(vnode->version);
+ chfs_add_vnode_ref_to_vc(chmp, vc, nref);
+ }
+
+ mutex_enter(&chmp->chm_lock_sizes);
+ //dbg("B:lnr: %d |free_size: %d node's size: %d\n", cheb->lnr, cheb->free_size, le32toh(vnode->length));
+ chfs_change_size_free(chmp, cheb, -le32toh(vnode->length));
+ chfs_change_size_used(chmp, cheb, le32toh(vnode->length));
+ mutex_exit(&chmp->chm_lock_sizes);
+
+ KASSERT(cheb->used_size <= chmp->chm_ebh->eb_size);
+
+ KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size);
+
+ //dbg(" A: free_size: %d\n", cheb->free_size);
+
+ /*dbg("vnode dump:\n");
+ dbg(" ->magic: 0x%x\n", le16toh(vnode->magic));
+ dbg(" ->type: %d\n", le16toh(vnode->type));
+ dbg(" ->length: %d\n", le32toh(vnode->length));
+ dbg(" ->hdr_crc: 0x%x\n", le32toh(vnode->hdr_crc));
+ dbg(" ->vno: %d\n", le64toh(vnode->vno));
+ dbg(" ->version: %ld\n", le64toh(vnode->version));
+ dbg(" ->uid: %d\n", le16toh(vnode->uid));
+ dbg(" ->gid: %d\n", le16toh(vnode->gid));
+ dbg(" ->mode: %d\n", le32toh(vnode->mode));
+ dbg(" ->dn_size: %d\n", le32toh(vnode->dn_size));
+ dbg(" ->atime: %d\n", le32toh(vnode->atime));
+ dbg(" ->mtime: %d\n", le32toh(vnode->mtime));
+ dbg(" ->ctime: %d\n", le32toh(vnode->ctime));
+ dbg(" ->dsize: %d\n", le32toh(vnode->dsize));
+ dbg(" ->node_crc: 0x%x\n", le32toh(vnode->node_crc));*/
+
+ return CHFS_NODE_OK;
+}
+
+int
+chfs_scan_mark_dirent_obsolete(struct chfs_mount *chmp,
+ struct chfs_vnode_cache *vc, struct chfs_dirent *fd)
+{
+ //int size;
+ struct chfs_eraseblock *cheb;
+ struct chfs_node_ref *prev, *nref;
+
+ nref = fd->nref;
+ cheb = &chmp->chm_blocks[fd->nref->nref_lnr];
+
+ /* Remove dirent's node ref from vnode cache */
+ prev = vc->dirents;
+ if (prev && prev == nref) {
+ vc->dirents = prev->nref_next;
+ } else if (prev && prev != (void *)vc) {
+ while (prev->nref_next && prev->nref_next !=
+ (void *)vc && prev->nref_next != nref) {
+ prev = prev->nref_next;
+ }
+
+ if (prev->nref_next == nref) {
+ prev->nref_next = nref->nref_next;
+ }
+ }
+ /*dbg("XXX - start\n");
+ //nref = vc->dirents;
+ struct chfs_dirent *tmp;
+ tmp = vc->scan_dirents;
+ while (tmp) {
+ dbg(" ->tmp->name: %s\n", tmp->name);
+ dbg(" ->tmp->version: %ld\n", tmp->version);
+ dbg(" ->tmp->vno: %d\n", tmp->vno);
+ tmp = tmp->next;
+ }
+ dbg("XXX - end\n");*/
+ //size = CHFS_PAD(sizeof(struct chfs_flash_dirent_node) + fd->nsize);
+
+ KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size +
+ cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size);
+
+ return 0;
+}
+
+void
+chfs_add_fd_to_list(struct chfs_mount *chmp,
+ struct chfs_dirent *new, struct chfs_vnode_cache *pvc)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ int size;
+ struct chfs_eraseblock *cheb, *oldcheb;
+// struct chfs_dirent **prev;
+ struct chfs_dirent *fd, *tmpfd;
+
+ dbg("adding fd to list: %s\n", new->name);
+
+ if ((new->version > pvc->highest_version))
+ pvc->highest_version = new->version;
+
+ size = CHFS_PAD(sizeof(struct chfs_flash_dirent_node) +
+ new->nsize);
+ cheb = &chmp->chm_blocks[new->nref->nref_lnr];
+
+ mutex_enter(&chmp->chm_lock_sizes);
+ TAILQ_FOREACH_SAFE(fd, &pvc->scan_dirents, fds, tmpfd) {
+ if (fd->nhash > new->nhash) {
+ /* insert new before fd */
+ TAILQ_INSERT_BEFORE(fd, new, fds);
+ goto out;
+ } else if (fd->nhash == new->nhash &&
+ !strcmp(fd->name, new->name)) {
+ if (new->version > fd->version) {
+// new->next = fd->next;
+ /* replace fd with new */
+ TAILQ_INSERT_BEFORE(fd, new, fds);
+ chfs_change_size_free(chmp, cheb, -size);
+ chfs_change_size_used(chmp, cheb, size);
+
+ TAILQ_REMOVE(&pvc->scan_dirents, fd, fds);
+ if (fd->nref) {
+ size = CHFS_PAD(sizeof(struct chfs_flash_dirent_node) + fd->nsize);
+ chfs_scan_mark_dirent_obsolete(chmp, pvc, fd);
+ oldcheb = &chmp->chm_blocks[fd->nref->nref_lnr];
+ chfs_change_size_used(chmp, oldcheb, -size);
+ chfs_change_size_dirty(chmp, oldcheb, size);
+ }
+ chfs_free_dirent(fd);
+// *prev = new;//XXX
+ } else {
+ chfs_scan_mark_dirent_obsolete(chmp, pvc, new);
+ chfs_change_size_free(chmp, cheb, -size);
+ chfs_change_size_dirty(chmp, cheb, size);
+ chfs_free_dirent(new);
+ }
+ /*dbg("START\n");
+ fd = pvc->scan_dirents;
+ while (fd) {
+ dbg("dirent dump:\n");
+ dbg(" ->vno: %d\n", fd->vno);
+ dbg(" ->version: %ld\n", fd->version);
+ dbg(" ->nhash: 0x%x\n", fd->nhash);
+ dbg(" ->nsize: %d\n", fd->nsize);
+ dbg(" ->name: %s\n", fd->name);
+ dbg(" ->type: %d\n", fd->type);
+ fd = fd->next;
+ }
+ dbg("END\n");*/
+ mutex_exit(&chmp->chm_lock_sizes);
+ return;
+ }
+ }
+	/* if we couldn't fit it elsewhere, add it to the end */
+ TAILQ_INSERT_TAIL(&pvc->scan_dirents, new, fds);
+
+out:
+ //dbg("B:lnr: %d |free_size: %d size: %d\n", cheb->lnr, cheb->free_size, size);
+ chfs_change_size_free(chmp, cheb, -size);
+ chfs_change_size_used(chmp, cheb, size);
+ mutex_exit(&chmp->chm_lock_sizes);
+
+ KASSERT(cheb->used_size <= chmp->chm_ebh->eb_size);
+ //dbg(" A: free_size: %d\n", cheb->free_size);
+
+ KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size);
+
+
+// fd = pvc->scan_dirents;
+ /*dbg("START\n");
+ while (fd) {
+ dbg("dirent dump:\n");
+ dbg(" ->vno: %d\n", fd->vno);
+ dbg(" ->version: %ld\n", fd->version);
+ dbg(" ->nhash: 0x%x\n", fd->nhash);
+ dbg(" ->nsize: %d\n", fd->nsize);
+ dbg(" ->name: %s\n", fd->name);
+ dbg(" ->type: %d\n", fd->type);
+ fd = fd->next;
+ }
+ dbg("END\n");*/
+}
+/**
+ * chfs_scan_check_dirent_node - check dirent crc and add it to the parent's vnode cache
+ * @chmp: CHFS main descriptor structure
+ * @cheb: eraseblock information
+ * @buf: directory entry to check
+ * @ofs: offset in eraseblock where dirent starts
+ */
+int
+chfs_scan_check_dirent_node(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, void *buf, off_t ofs)
+{
+ int err, namelen;
+ uint32_t crc;
+ struct chfs_dirent *fd;
+ struct chfs_vnode_cache *vc;
+ struct chfs_flash_dirent_node *dirent = buf;
+
+ //struct chfs_node_ref *tmp;
+
+ crc = crc32(0, (uint8_t *)dirent, sizeof(*dirent) - 4);
+ if (crc != le32toh(dirent->node_crc)) {
+ err = chfs_update_eb_dirty(chmp, cheb, le32toh(dirent->length));
+ if (err)
+ return err;
+ return CHFS_NODE_BADCRC;
+ }
+ namelen = dirent->nsize;
+
+ fd = chfs_alloc_dirent(namelen + 1);
+ if (!fd)
+ return ENOMEM;
+
+	fd->nref = chfs_alloc_node_ref(cheb);
+	if (!fd->nref) {
+		chfs_free_dirent(fd);
+		return ENOMEM;
+	}
+
+ KASSERT(fd->nref->nref_lnr == cheb->lnr);
+
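+	/* Copy the name out of the flash node and verify its own CRC. */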
+ memcpy(&fd->name, dirent->name, namelen);
+ fd->nsize = namelen;
+ fd->name[namelen] = 0;
+ crc = crc32(0, fd->name, dirent->nsize);
+ if (crc != le32toh(dirent->name_crc)) {
+ chfs_err("Directory entry's name has bad crc: read: 0x%x, "
+ "calculated: 0x%x\n", le32toh(dirent->name_crc), crc);
+ chfs_free_dirent(fd);
+ err = chfs_update_eb_dirty(chmp, cheb, le32toh(dirent->length));
+ if (err)
+ return err;
+ return CHFS_NODE_BADNAMECRC;
+ }
+
+ /* Check vnode_cache of parent node */
+ mutex_enter(&chmp->chm_lock_vnocache);
+ vc = chfs_scan_make_vnode_cache(chmp, le64toh(dirent->pvno));
+ mutex_exit(&chmp->chm_lock_vnocache);
+ if (!vc) {
+ chfs_free_dirent(fd);
+ return ENOMEM;
+ }
+
+ fd->nref->nref_offset = ofs;
+
+ dbg("add dirent to #%llu\n", (unsigned long long)vc->vno);
+ chfs_add_node_to_list(chmp, vc, fd->nref, &vc->dirents);
+ /*tmp = vc->dirents;
+ dbg("START|vno: %d dirents dump\n", vc->vno);
+ while (tmp) {
+ dbg(" ->nref->nref_lnr: %d\n", tmp->lnr);
+ dbg(" ->nref->nref_offset: %d\n", tmp->offset);
+ tmp = tmp->next;
+ }
+ dbg(" END|vno: %d dirents dump\n", vc->vno);*/
+
+// fd->next = NULL;
+ fd->vno = le64toh(dirent->vno);
+ fd->version = le64toh(dirent->version);
+ fd->nhash = hash32_buf(fd->name, namelen, HASH32_BUF_INIT);
+ fd->type = dirent->dtype;
+
+ /*dbg("dirent dump:\n");
+ dbg(" ->vno: %d\n", fd->vno);
+ dbg(" ->version: %ld\n", fd->version);
+ dbg(" ->nhash: 0x%x\n", fd->nhash);
+ dbg(" ->nsize: %d\n", fd->nsize);
+ dbg(" ->name: %s\n", fd->name);
+ dbg(" ->type: %d\n", fd->type);*/
+
+ chfs_add_fd_to_list(chmp, fd, vc);
+
+ /*struct chfs_node_ref *tmp;
+ tmp = vc->dirents;
+ dbg("START|vno: %d dirents dump\n", vc->vno);
+ while (tmp) {
+ dbg(" ->nref->nref_lnr: %d\n", tmp->lnr);
+ dbg(" ->nref->nref_offset: %d\n", tmp->offset);
+ tmp = tmp->next;
+ }
+ dbg(" END|vno: %d dirents dump\n", vc->vno);*/
+
+ /*dbg("dirent dump:\n");
+ dbg(" ->magic: 0x%x\n", le16toh(dirent->magic));
+ dbg(" ->type: %d\n", le16toh(dirent->type));
+ dbg(" ->length: %d\n", le32toh(dirent->length));
+ dbg(" ->hdr_crc: 0x%x\n", le32toh(dirent->hdr_crc));
+ dbg(" ->vno: %d\n", le64toh(dirent->vno));
+ dbg(" ->pvno: %d\n", le64toh(dirent->pvno));
+ dbg(" ->version: %ld\n", le64toh(dirent->version));
+ dbg(" ->mctime: %d\n", le32toh(dirent->mctime));
+ dbg(" ->nsize: %d\n", dirent->nsize);
+ dbg(" ->dtype: %d\n", dirent->dtype);
+ dbg(" ->name_crc: 0x%x\n", le32toh(dirent->name_crc));
+ dbg(" ->node_crc: 0x%x\n", le32toh(dirent->node_crc));
+ dbg(" ->name: %s\n", dirent->name);*/
+
+ return CHFS_NODE_OK;
+}
+
+/**
+ * chfs_scan_check_data_node - check data node crc and add it to the vnode cache
+ * @chmp: CHFS main descriptor structure
+ * @cheb: eraseblock information
+ * @buf: data node to check
+ * @ofs: offset in eraseblock where data node starts
+ */
+int
+chfs_scan_check_data_node(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, void *buf, off_t ofs)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ int err;
+ uint32_t crc, vno;
+ struct chfs_node_ref *nref;
+ struct chfs_vnode_cache *vc;
+ struct chfs_flash_data_node *dnode = buf;
+
+ crc = crc32(0, (uint8_t *)dnode, sizeof(struct chfs_flash_data_node) - 4);
+ if (crc != le32toh(dnode->node_crc)) {
+ err = chfs_update_eb_dirty(chmp, cheb, le32toh(dnode->length));
+ if (err)
+ return err;
+ return CHFS_NODE_BADCRC;
+ }
+ /**
+ * Don't check data nodes crc and version here, it will be done in
+ * the background GC thread.
+ */
+ nref = chfs_alloc_node_ref(cheb);
+ if (!nref)
+ return ENOMEM;
+
+ nref->nref_offset = ofs | CHFS_UNCHECKED_NODE_MASK;
+
+ KASSERT(nref->nref_lnr == cheb->lnr);
+
+ vno = le64toh(dnode->vno);
+ mutex_enter(&chmp->chm_lock_vnocache);
+ vc = chfs_vnode_cache_get(chmp, vno);
+ if (!vc) {
+ vc = chfs_scan_make_vnode_cache(chmp, vno);
+		if (!vc) {
+			mutex_exit(&chmp->chm_lock_vnocache);
+			return ENOMEM;
+		}
+ }
+ mutex_exit(&chmp->chm_lock_vnocache);
+ chfs_add_node_to_list(chmp, vc, nref, &vc->dnode);
+
+ dbg("chmpfree: %u, chebfree: %u, dnode: %u\n", chmp->chm_free_size, cheb->free_size, dnode->length);
+
+ mutex_enter(&chmp->chm_lock_sizes);
+	chfs_change_size_free(chmp, cheb, -le32toh(dnode->length));
+	chfs_change_size_unchecked(chmp, cheb, le32toh(dnode->length));
+ mutex_exit(&chmp->chm_lock_sizes);
+ return CHFS_NODE_OK;
+}
+
+/**
+ * chfs_scan_classify_cheb - determine eraseblock's state
+ * @chmp: CHFS main descriptor structure
+ * @cheb: eraseblock to classify
+ */
+int
+chfs_scan_classify_cheb(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb)
+{
+ if (cheb->free_size == chmp->chm_ebh->eb_size)
+ return CHFS_BLK_STATE_FREE;
+ else if (cheb->dirty_size < MAX_DIRTY_TO_CLEAN)
+ return CHFS_BLK_STATE_CLEAN;
+ else if (cheb->used_size || cheb->unchecked_size)
+ return CHFS_BLK_STATE_PARTDIRTY;
+ else
+ return CHFS_BLK_STATE_ALLDIRTY;
+}
+
+
+/**
+ * chfs_scan_eraseblock - scans an eraseblock looking for nodes
+ * @chmp: CHFS main descriptor structure
+ * @cheb: eraseblock to scan
+ *
+ * This function scans a whole eraseblock, checks the nodes on it and adds them
+ * to the vnode cache.
+ * Returns the eraseblock state on success, or an error code on failure.
+ */
+int
+chfs_scan_eraseblock(struct chfs_mount *chmp,
+    struct chfs_eraseblock *cheb)
+{
+ int err;
+ size_t len, retlen;
+ off_t ofs = 0;
+ int lnr = cheb->lnr;
+ u_char *buf;
+ struct chfs_flash_node_hdr *nhdr;
+ int read_free = 0;
+ struct chfs_node_ref *nref;
+
+
+ dbg("scanning eraseblock content: %d free_size: %d\n", cheb->lnr, cheb->free_size);
+ dbg("scanned physical block: %d\n", chmp->chm_ebh->lmap[lnr]);
+ buf = kmem_alloc(CHFS_MAX_NODE_SIZE, KM_SLEEP);
+
+ while((ofs + CHFS_NODE_HDR_SIZE) < chmp->chm_ebh->eb_size) {
+ memset(buf, 0 , CHFS_MAX_NODE_SIZE);
+ err = chfs_read_leb(chmp,
+ lnr, buf, ofs, CHFS_NODE_HDR_SIZE, &retlen);
+ if (err) {
+ return err;
+ }
+
+ if (retlen != CHFS_NODE_HDR_SIZE) {
+ chfs_err("Error reading node header: "
+ "read: %zu instead of: %zu\n",
+ CHFS_NODE_HDR_SIZE, retlen);
+ return EIO;
+ }
+
+		/* First check whether the buffer we read is filled with 0xff; if so, the
+		 * rest of the block may be free.  Increase read_free and stop scanning
+		 * the block once it reaches MAX_READ_FREE. */
+ if (check_pattern(buf, 0xff, 0, CHFS_NODE_HDR_SIZE)) {
+ read_free += CHFS_NODE_HDR_SIZE;
+ if (read_free >= MAX_READ_FREE(chmp)) {
+ dbg("rest of the block is free. Size: %d\n", cheb->free_size);
+ return chfs_scan_classify_cheb(chmp, cheb);
+ }
+ ofs += CHFS_NODE_HDR_SIZE;
+ continue;
+ } else {
+ chfs_update_eb_dirty(chmp, cheb, read_free);
+ read_free = 0;
+ }
+
+ nhdr = (struct chfs_flash_node_hdr *)buf;
+
+ err = chfs_scan_check_node_hdr(nhdr);
+ if (err) {
+ dbg("node hdr error\n");
+ err = chfs_update_eb_dirty(chmp, cheb, 4);
+ if (err) {
+ return err;
+ }
+
+ ofs += 4;
+ continue;
+ }
+ ofs += CHFS_NODE_HDR_SIZE;
+ if (ofs > chmp->chm_ebh->eb_size) {
+ chfs_err("Second part of node is on the next eraseblock.\n");
+ return EIO;
+ }
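+		/* Read the rest of the node and check it according to its type. */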
+ switch (le16toh(nhdr->type)) {
+ case CHFS_NODETYPE_VNODE:
+ /* Read up the node */
+ //dbg("nodetype vnode\n");
+ len = le32toh(nhdr->length) - CHFS_NODE_HDR_SIZE;
+ err = chfs_read_leb(chmp,
+ lnr, buf + CHFS_NODE_HDR_SIZE,
+ ofs, len, &retlen);
+ if (err) {
+ return err;
+ }
+
+ if (retlen != len) {
+ chfs_err("Error reading vnode: read: %zu instead of: %zu\n",
+ len, retlen);
+ return EIO;
+ }
+ KASSERT(lnr == cheb->lnr);
+ err = chfs_scan_check_vnode(chmp,
+ cheb, buf, ofs - CHFS_NODE_HDR_SIZE);
+ if (err) {
+ return err;
+ }
+
+ //dbg("XXX5end\n");
+ break;
+ case CHFS_NODETYPE_DIRENT:
+ /* Read up the node */
+ //dbg("nodetype dirent\n");
+ len = le32toh(nhdr->length) - CHFS_NODE_HDR_SIZE;
+
+ err = chfs_read_leb(chmp,
+ lnr, buf + CHFS_NODE_HDR_SIZE,
+ ofs, len, &retlen);
+ if (err) {
+ return err;
+ }
+
+ if (retlen != len) {
+ chfs_err("Error reading dirent node: read: %zu "
+ "instead of: %zu\n", len, retlen);
+ return EIO;
+ }
+
+ KASSERT(lnr == cheb->lnr);
+
+ err = chfs_scan_check_dirent_node(chmp,
+ cheb, buf, ofs - CHFS_NODE_HDR_SIZE);
+ if (err) {
+ return err;
+ }
+
+ //dbg("XXX6end\n");
+ break;
+ case CHFS_NODETYPE_DATA:
+ //dbg("nodetype data\n");
+ len = sizeof(struct chfs_flash_data_node) -
+ CHFS_NODE_HDR_SIZE;
+ err = chfs_read_leb(chmp,
+ lnr, buf + CHFS_NODE_HDR_SIZE,
+ ofs, len, &retlen);
+ if (err) {
+ return err;
+ }
+
+ if (retlen != len) {
+ chfs_err("Error reading data node: read: %zu "
+ "instead of: %zu\n", len, retlen);
+ return EIO;
+ }
+ KASSERT(lnr == cheb->lnr);
+ err = chfs_scan_check_data_node(chmp,
+ cheb, buf, ofs - CHFS_NODE_HDR_SIZE);
+ if (err)
+ return err;
+
+ //dbg("XXX7end\n");
+ break;
+ case CHFS_NODETYPE_PADDING:
+ //dbg("nodetype padding\n");
+ //dbg("padding len: %d\n", le32toh(nhdr->length));
+ //dbg("BEF: cheb->free_size: %d\n", cheb->free_size);
+ nref = chfs_alloc_node_ref(cheb);
+ nref->nref_offset = ofs - CHFS_NODE_HDR_SIZE;
+ nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) |
+ CHFS_OBSOLETE_NODE_MASK;
+
+ err = chfs_update_eb_dirty(chmp, cheb,
+ le32toh(nhdr->length));
+ //dbg("AFT: cheb->free_size: %d\n", cheb->free_size);
+ if (err)
+ return err;
+
+ //dbg("XXX8end\n");
+ break;
+ default:
+ //dbg("nodetype ? (default)\n");
+ /* Unknown node type, update dirty and skip */
+ err = chfs_update_eb_dirty(chmp, cheb,
+ le32toh(nhdr->length));
+ if (err)
+ return err;
+
+ //dbg("XXX9end\n");
+ break;
+ }
+ ofs += le32toh(nhdr->length) - CHFS_NODE_HDR_SIZE;
+ }
+
+ KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size +
+ cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size);
+
+ //dbg("XXX10\n");
+ return chfs_scan_classify_cheb(chmp, cheb);
+}
--- /dev/null
+/* $NetBSD: chfs_subr.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Efficient memory file system supporting functions.
+ */
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/dirent.h>
+#include <sys/event.h>
+#include <sys/kmem.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <sys/systm.h>
+#include <sys/swap.h>
+#include <sys/vnode.h>
+#include <sys/kauth.h>
+#include <sys/proc.h>
+#include <sys/atomic.h>
+
+#include <uvm/uvm.h>
+
+#include <miscfs/specfs/specdev.h>
+#include "chfs.h"
+//#include <fs/chfs/chfs_vnops.h>
+//#include </root/xipffs/netbsd.chfs/chfs.h>
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Returns information about the number of available memory pages,
+ * including physical and virtual ones.
+ *
+ * If 'total' is true, the value returned is the total amount of memory
+ * pages configured for the system (either in use or free).
+ * If it is FALSE, the value returned is the amount of free memory pages.
+ *
+ * Remember to remove DUMMYFS_PAGES_RESERVED from the returned value to avoid
+ * excessive memory usage.
+ *
+ */
+size_t
+chfs_mem_info(bool total)
+{
+ size_t size;
+
+ size = 0;
+ size += uvmexp.swpgavail;
+ if (!total) {
+ size -= uvmexp.swpgonly;
+ }
+ size += uvmexp.free;
+ size += uvmexp.filepages;
+ if (size > uvmexp.wired) {
+ size -= uvmexp.wired;
+ } else {
+ size = 0;
+ }
+
+ return size;
+}
+
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Looks for a directory entry in the directory represented by node.
+ * 'cnp' describes the name of the entry to look for. Note that the .
+ * and .. components are not allowed as they do not physically exist
+ * within directories.
+ *
+ * Returns a pointer to the entry when found, otherwise NULL.
+ */
+struct chfs_dirent *
+chfs_dir_lookup(struct chfs_inode *ip, struct componentname *cnp)
+{
+ bool found;
+ struct chfs_dirent *fd;
+ dbg("dir_lookup()\n");
+
+ KASSERT(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.'));
+ KASSERT(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' &&
+ cnp->cn_nameptr[1] == '.')));
+ //CHFS_VALIDATE_DIR(node);
+
+ //node->chn_status |= CHFS_NODE_ACCESSED;
+
+ found = false;
+// fd = ip->dents;
+// while(fd) {
+ TAILQ_FOREACH(fd, &ip->dents, fds) {
+ KASSERT(cnp->cn_namelen < 0xffff);
+ if (fd->vno == 0)
+ continue;
+ /*dbg("dirent dump:\n");
+ dbg(" ->vno: %d\n", fd->vno);
+ dbg(" ->version: %ld\n", fd->version);
+ dbg(" ->nhash: 0x%x\n", fd->nhash);
+ dbg(" ->nsize: %d\n", fd->nsize);
+ dbg(" ->name: %s\n", fd->name);
+ dbg(" ->type: %d\n", fd->type);*/
+ if (fd->nsize == (uint16_t)cnp->cn_namelen &&
+ memcmp(fd->name, cnp->cn_nameptr, fd->nsize) == 0) {
+ found = true;
+ break;
+ }
+// fd = fd->next;
+ }
+
+ return found ? fd : NULL;
+}
+
+/* --------------------------------------------------------------------- */
+
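+/*
+ * Build a struct dirent from the given parameters and copy it out to uio.
+ * Returns -1 if the entry does not fit into the space remaining in uio.
+ */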
+int
+chfs_filldir(struct uio* uio, ino_t ino, const char *name,
+ int namelen, enum vtype type)
+{
+ struct dirent dent;
+ int error;
+
+ memset(&dent, 0, sizeof(dent));
+
+ dent.d_fileno = ino;
+ switch (type) {
+ case VBLK:
+ dent.d_type = DT_BLK;
+ break;
+
+ case VCHR:
+ dent.d_type = DT_CHR;
+ break;
+
+ case VDIR:
+ dent.d_type = DT_DIR;
+ break;
+
+ case VFIFO:
+ dent.d_type = DT_FIFO;
+ break;
+
+ case VLNK:
+ dent.d_type = DT_LNK;
+ break;
+
+ case VREG:
+ dent.d_type = DT_REG;
+ break;
+
+ case VSOCK:
+ dent.d_type = DT_SOCK;
+ break;
+
+ default:
+ KASSERT(0);
+ }
+ dent.d_namlen = namelen;
+ (void)memcpy(dent.d_name, name, dent.d_namlen);
+ dent.d_reclen = _DIRENT_SIZE(&dent);
+
+ if (dent.d_reclen > uio->uio_resid) {
+ error = -1;
+ } else {
+ error = uiomove(&dent, dent.d_reclen, uio);
+ }
+
+ return error;
+}
+
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Change size of the given vnode.
+ * Caller should execute chfs_update on vp after a successful execution.
+ * The vnode must be locked on entry and remain locked on exit.
+ */
+int
+chfs_chsize(struct vnode *vp, u_quad_t size, kauth_cred_t cred)
+{
+ struct chfs_mount *chmp;
+ struct chfs_inode *ip;
+ struct buf *bp;
+ int blknum, append;
+ int error = 0;
+ char *buf = NULL;
+ struct chfs_full_dnode *fd;
+
+ ip = VTOI(vp);
+ chmp = ip->chmp;
+
+ dbg("chfs_chsize\n");
+
+ switch (vp->v_type) {
+ case VDIR:
+ return EISDIR;
+ case VLNK:
+ case VREG:
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return EROFS;
+ break;
+ case VBLK:
+ case VCHR:
+ case VFIFO:
+ return 0;
+ default:
+ return EOPNOTSUPP; /* XXX why not ENODEV? */
+ }
+
+ vflushbuf(vp, 0);
+
+ mutex_enter(&chmp->chm_lock_mountfields);
+ chfs_flush_pending_wbuf(chmp);
+
+ /* handle truncate to zero as a special case */
+ if (size == 0) {
+ dbg("truncate to zero");
+ chfs_truncate_fragtree(ip->chmp,
+ &ip->fragtree, size);
+ chfs_set_vnode_size(vp, size);
+
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ return 0;
+ }
+
+
+ /* allocate zeros for the new data */
+ buf = kmem_zalloc(size, KM_SLEEP);
+ bp = getiobuf(vp, true);
+
+ if (ip->size != 0) {
+ /* read the whole data */
+ bp->b_blkno = 0;
+ bp->b_bufsize = bp->b_resid = bp->b_bcount = ip->size;
+ bp->b_data = kmem_alloc(ip->size, KM_SLEEP);
+
+ error = chfs_read_data(chmp, vp, bp);
+ if (error) {
+ mutex_exit(&chmp->chm_lock_mountfields);
+ putiobuf(bp);
+
+ return error;
+ }
+
+ /* create the new data */
+ dbg("create new data vap%llu ip%llu\n",
+ (unsigned long long)size, (unsigned long long)ip->size);
+ append = size - ip->size;
+ if (append > 0) {
+ memcpy(buf, bp->b_data, ip->size);
+ } else {
+ memcpy(buf, bp->b_data, size);
+ chfs_truncate_fragtree(ip->chmp,
+ &ip->fragtree, size);
+ }
+
+ kmem_free(bp->b_data, ip->size);
+
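+		/*
+		 * The last fragment will be rewritten below, so obsolete its
+		 * node on flash and work out which block the new data starts at.
+		 */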
+ struct chfs_node_frag *lastfrag = frag_last(&ip->fragtree);
+ fd = lastfrag->node;
+ chfs_mark_node_obsolete(chmp, fd->nref);
+
+ blknum = lastfrag->ofs / PAGE_SIZE;
+ lastfrag->size = append > PAGE_SIZE ? PAGE_SIZE : size % PAGE_SIZE;
+ } else {
+ fd = chfs_alloc_full_dnode();
+ blknum = 0;
+ }
+
+ chfs_set_vnode_size(vp, size);
+
+ // write the new data
+ for (bp->b_blkno = blknum; bp->b_blkno * PAGE_SIZE < size; bp->b_blkno++) {
+ uint64_t writesize = MIN(size - bp->b_blkno * PAGE_SIZE, PAGE_SIZE);
+
+ bp->b_bufsize = bp->b_resid = bp->b_bcount = writesize;
+ bp->b_data = kmem_alloc(writesize, KM_SLEEP);
+
+ memcpy(bp->b_data, buf + (bp->b_blkno * PAGE_SIZE), writesize);
+
+ if (bp->b_blkno != blknum) {
+ fd = chfs_alloc_full_dnode();
+ }
+
+ error = chfs_write_flash_dnode(chmp, vp, bp, fd);
+ if (error) {
+ mutex_exit(&chmp->chm_lock_mountfields);
+ kmem_free(bp->b_data, writesize);
+ putiobuf(bp);
+
+ return error;
+ }
+ if (bp->b_blkno != blknum) {
+ chfs_add_full_dnode_to_inode(chmp, ip, fd);
+ }
+ kmem_free(bp->b_data, writesize);
+ }
+
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ kmem_free(buf, size);
+ putiobuf(bp);
+
+ return 0;
+}
+#if 0
+ int error;
+ struct chfs_node *node;
+
+ KASSERT(VOP_ISLOCKED(vp));
+
+ node = VP_TO_CHFS_NODE(vp);
+
+ // Decide whether this is a valid operation based on the file type.
+ error = 0;
+ switch (vp->v_type) {
+ case VDIR:
+ return EISDIR;
+
+ case VREG:
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return EROFS;
+ break;
+
+ case VBLK:
+ case VCHR:
+ case VFIFO:
+ // Allow modifications of special files even if in the file
+ // system is mounted read-only (we are not modifying the
+ // files themselves, but the objects they represent).
+ return 0;
+
+ default:
+ return ENODEV;
+ }
+
+ // Immutable or append-only files cannot be modified, either.
+ if (node->chn_flags & (IMMUTABLE | APPEND))
+ return EPERM;
+
+ error = chfs_truncate(vp, size);
+ // chfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents
+ // for us, as will update dn_status; no need to do that here.
+
+ KASSERT(VOP_ISLOCKED(vp));
+
+ return error;
+#endif
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Change flags of the given vnode.
+ * Caller should execute chfs_update on vp after a successful execution.
+ * The vnode must be locked on entry and remain locked on exit.
+ */
+int
+chfs_chflags(struct vnode *vp, int flags, kauth_cred_t cred)
+{
+ struct chfs_mount *chmp;
+ struct chfs_inode *ip;
+ int error = 0;
+
+ ip = VTOI(vp);
+ chmp = ip->chmp;
+
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return EROFS;
+
+ if (kauth_cred_geteuid(cred) != ip->uid &&
+ (error = kauth_authorize_generic(cred,
+ KAUTH_GENERIC_ISSUSER, NULL)))
+ return error;
+
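+	/*
+	 * The superuser may change the system flags; other users may only
+	 * modify the user-settable flags.
+	 */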
+ if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
+ NULL) == 0) {
+ if ((ip->flags & (SF_IMMUTABLE | SF_APPEND)) &&
+ kauth_authorize_system(curlwp->l_cred,
+ KAUTH_SYSTEM_CHSYSFLAGS, 0, NULL, NULL, NULL))
+ return EPERM;
+
+ if ((flags & SF_SNAPSHOT) !=
+ (ip->flags & SF_SNAPSHOT))
+ return EPERM;
+
+ ip->flags = flags;
+ } else {
+ if ((ip->flags & (SF_IMMUTABLE | SF_APPEND)) ||
+ (flags & UF_SETTABLE) != flags)
+ return EPERM;
+
+ if ((ip->flags & SF_SETTABLE) !=
+ (flags & SF_SETTABLE))
+ return EPERM;
+
+ ip->flags &= SF_SETTABLE;
+ ip->flags |= (flags & UF_SETTABLE);
+ }
+ ip->iflag |= IN_CHANGE;
+ error = chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+ if (error)
+ return error;
+
+ if (flags & (IMMUTABLE | APPEND))
+ return 0;
+
+ return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_itimes(struct chfs_inode *ip, const struct timespec *acc,
+ const struct timespec *mod, const struct timespec *cre)
+{
+ //dbg("itimes\n");
+ struct timespec now;
+
+ if (!(ip->iflag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) {
+ return;
+ }
+
+ vfs_timestamp(&now);
+ if (ip->iflag & IN_ACCESS) {
+ if (acc == NULL)
+ acc = &now;
+ ip->atime = acc->tv_sec;
+ }
+ if (ip->iflag & (IN_UPDATE | IN_MODIFY)) {
+ if (mod == NULL)
+ mod = &now;
+ ip->mtime = mod->tv_sec;
+ //ip->i_modrev++;
+ }
+ if (ip->iflag & (IN_CHANGE | IN_MODIFY)) {
+ if (cre == NULL)
+ cre = &now;
+ ip->ctime = cre->tv_sec;
+ }
+ if (ip->iflag & (IN_ACCESS | IN_MODIFY))
+ ip->iflag |= IN_ACCESSED;
+ if (ip->iflag & (IN_UPDATE | IN_CHANGE))
+ ip->iflag |= IN_MODIFIED;
+ ip->iflag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY);
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_update(struct vnode *vp, const struct timespec *acc,
+ const struct timespec *mod, int flags)
+{
+
+ struct chfs_inode *ip;
+
+ /* XXX ufs_reclaim calls this function unlocked! */
+// KASSERT(VOP_ISLOCKED(vp));
+
+#if 0
+ if (flags & UPDATE_CLOSE)
+ ; /* XXX Need to do anything special? */
+#endif
+
+ ip = VTOI(vp);
+ chfs_itimes(ip, acc, mod, NULL);
+
+// KASSERT(VOP_ISLOCKED(vp));
+ return (0);
+}
+
+/* --------------------------------------------------------------------- */
+/*
+ int
+ chfs_truncate(struct vnode *vp, off_t length)
+ {
+ bool extended;
+ int error;
+ struct chfs_node *node;
+ printf("CHFS: truncate()\n");
+
+ node = VP_TO_CHFS_NODE(vp);
+ extended = length > node->chn_size;
+
+ if (length < 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (node->chn_size == length) {
+ error = 0;
+ goto out;
+ }
+
+ error = chfs_reg_resize(vp, length);
+ if (error == 0)
+ node->chn_status |= CHFS_NODE_CHANGED | CHFS_NODE_MODIFIED;
+
+ out:
+ chfs_update(vp, NULL, NULL, 0);
+
+ return error;
+ }*/
+
+
--- /dev/null
+/* $NetBSD: chfs_vfsops.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/module.h>
+#include <sys/namei.h>
+#include <sys/malloc.h>
+#include <sys/fcntl.h>
+#include <sys/conf.h>
+#include <sys/buf.h>
+//XXX needed just for debugging
+#include <sys/fstrans.h>
+#include <sys/sleepq.h>
+#include <sys/lockdebug.h>
+#include <sys/ktrace.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_pager.h>
+#include <ufs/ufs/dir.h>
+//#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/genfs/genfs_node.h>
+#include <miscfs/specfs/specdev.h>
+//#include </root/xipffs/netbsd.chfs/chfs.h>
+//#include </root/xipffs/netbsd.chfs/chfs_args.h>
+#include "chfs.h"
+#include "chfs_args.h"
+
+MODULE(MODULE_CLASS_VFS, chfs, "flash");
+
+/* --------------------------------------------------------------------- */
+/* functions */
+
+static int chfs_mount(struct mount *, const char *, void *, size_t *);
+static int chfs_unmount(struct mount *, int);
+static int chfs_root(struct mount *, struct vnode **);
+static int chfs_vget(struct mount *, ino_t, struct vnode **);
+static int chfs_fhtovp(struct mount *, struct fid *, struct vnode **);
+static int chfs_vptofh(struct vnode *, struct fid *, size_t *);
+static int chfs_start(struct mount *, int);
+static int chfs_statvfs(struct mount *, struct statvfs *);
+static int chfs_sync(struct mount *, int, kauth_cred_t);
+static void chfs_init(void);
+static void chfs_reinit(void);
+static void chfs_done(void);
+static int chfs_snapshot(struct mount *, struct vnode *,
+ struct timespec *);
+
+/* --------------------------------------------------------------------- */
+/* structures */
+
+int
+chfs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags,
+ kauth_cred_t cred)
+{
+ return (0);
+}
+
+const struct genfs_ops chfs_genfsops = {
+ .gop_size = genfs_size,
+ .gop_alloc = chfs_gop_alloc,
+ .gop_write = genfs_gop_write,
+ .gop_markupdate = ufs_gop_markupdate,
+};
+
+/*
+static const struct ufs_ops chfs_ufsops = {
+ .uo_itimes = chfs_itimes,
+ .uo_update = chfs_update,
+};
+*/
+
+struct pool chfs_inode_pool;
+
+/* for looking up the major for flash */
+extern const struct cdevsw flash_cdevsw;
+
+/* --------------------------------------------------------------------- */
+
+static int
+chfs_mount(struct mount *mp,
+ const char *path, void *data, size_t *data_len)
+{
+ struct lwp *l = curlwp;
+ struct nameidata nd;
+ struct pathbuf *pb;
+ struct vnode *devvp = NULL;
+ struct ufs_args *args = data;
+ struct ufsmount *ump = NULL;
+ struct chfs_mount *chmp;
+ int err = 0;
+ int xflags;
+
+ dbg("mount()\n");
+
+ if (*data_len < sizeof *args)
+ return EINVAL;
+
+ if (mp->mnt_flag & MNT_GETARGS) {
+ ump = VFSTOUFS(mp);
+ if (ump == NULL)
+ return EIO;
+ memset(args, 0, sizeof *args);
+ args->fspec = NULL;
+ *data_len = sizeof *args;
+ return 0;
+ }
+
+ if (mp->mnt_flag & MNT_UPDATE) {
+ /* XXX: There is no support yet to update file system
+ * settings. Should be added. */
+
+ return ENODEV;
+ }
+
+ if (args->fspec != NULL) {
+ err = pathbuf_copyin(args->fspec, &pb);
+ if (err) {
+ return err;
+ }
+ /*
+ * Look up the name and verify that it's sane.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW, pb);
+ if ((err = namei(&nd)) != 0 )
+ return (err);
+ devvp = nd.ni_vp;
+
+ /*
+ * Be sure this is a valid block device
+ */
+ if (devvp->v_type != VBLK)
+ err = ENOTBLK;
+ else if (bdevsw_lookup(devvp->v_rdev) == NULL)
+ err = ENXIO;
+ }
+
+ if (err) {
+ vrele(devvp);
+ return (err);
+ }
+
+ if (mp->mnt_flag & MNT_RDONLY)
+ xflags = FREAD;
+ else
+ xflags = FREAD|FWRITE;
+
+ err = VOP_OPEN(devvp, xflags, FSCRED);
+ if (err)
+ goto fail;
+
+
+ err = chfs_mountfs(devvp, mp);
+ if (err) {
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ (void)VOP_CLOSE(devvp, xflags, NOCRED);
+ VOP_UNLOCK(devvp);
+ goto fail;
+ }
+ ump = VFSTOUFS(mp);
+ chmp = ump->um_chfs;
+
+ vfs_getnewfsid(mp);
+ chmp->chm_fsmp = mp;
+
+ return set_statvfs_info(path,
+ UIO_USERSPACE, args->fspec,
+ UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
+
+fail:
+ vrele(devvp);
+ return (err);
+}
+
+
+int
+chfs_mountfs(struct vnode *devvp, struct mount *mp)
+{
+ struct lwp *l = curlwp;
+ struct proc *p;
+ kauth_cred_t cred;
+ devmajor_t flash_major;
+ dev_t dev;
+ struct ufsmount* ump = NULL;
+ struct chfs_mount* chmp;
+ struct vnode *vp;
+ int err = 0;
+
+ dbg("mountfs()\n");
+
+ dev = devvp->v_rdev;
+ p = l ? l->l_proc : NULL;
+ cred = l ? l->l_cred : NOCRED;
+
+ /* Flush out any old buffers remaining from a previous use. */
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ err = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0);
+ VOP_UNLOCK(devvp);
+ if (err)
+ return (err);
+
+ flash_major = cdevsw_lookup_major(&flash_cdevsw);
+
+ if (devvp->v_type != VBLK)
+ err = ENOTBLK;
+ else if (bdevsw_lookup(dev) == NULL)
+ err = ENXIO;
+ else if (major(dev) != flash_major) {
+ dbg("major(dev): %d, flash_major: %d\n",
+ major(dev), flash_major);
+ err = ENODEV;
+ }
+ if (err) {
+ vrele(devvp);
+ return (err);
+ }
+
+ ump = malloc(sizeof(*ump), M_UFSMNT, M_WAITOK);
+ memset(ump, 0, sizeof(*ump));
+ ump->um_fstype = UFS1;
+ //ump->um_ops = &chfs_ufsops;
+ ump->um_chfs = malloc(sizeof(struct chfs_mount),
+ M_UFSMNT, M_WAITOK);
+ memset(ump->um_chfs, 0, sizeof(struct chfs_mount));
+
+ mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE);
+
+ /* Get superblock and set flash device number */
+ chmp = ump->um_chfs;
+ if (!chmp)
+ return ENOMEM;
+
+ chmp->chm_ebh = kmem_alloc(sizeof(struct chfs_ebh), KM_SLEEP);
+
+ dbg("[]opening flash: %u\n", (unsigned int)devvp->v_rdev);
+ err = ebh_open(chmp->chm_ebh, devvp->v_rdev);
+ if (err) {
+ dbg("error while opening flash\n");
+ kmem_free(chmp->chm_ebh, sizeof(struct chfs_ebh));
+ free(chmp, M_UFSMNT);
+ return err;
+ }
+
+ //TODO check flash sizes
+
+ chmp->chm_gbl_version = 0;
+ chmp->chm_vnocache_hash = chfs_vnocache_hash_init();
+
+ chmp->chm_blocks = kmem_zalloc(chmp->chm_ebh->peb_nr *
+ sizeof(struct chfs_eraseblock), KM_SLEEP);
+
+ if (!chmp->chm_blocks) {
+		ebh_close(chmp->chm_ebh);
+		kmem_free(chmp->chm_ebh, sizeof(struct chfs_ebh));
+ free(chmp, M_UFSMNT);
+ return ENOMEM;
+ }
+
+ mutex_init(&chmp->chm_lock_mountfields, MUTEX_DEFAULT, IPL_NONE);
+ mutex_init(&chmp->chm_lock_sizes, MUTEX_DEFAULT, IPL_NONE);
+ mutex_init(&chmp->chm_lock_vnocache, MUTEX_DEFAULT, IPL_NONE);
+
+ //XXX
+ chmp->chm_fs_bmask = -4096;
+ chmp->chm_fs_bsize = 4096;
+ chmp->chm_fs_qbmask = 4095;
+ chmp->chm_fs_bshift = 12;
+ chmp->chm_fs_fmask = -2048;
+ chmp->chm_fs_qfmask = 2047;
+
+ chmp->chm_wbuf_pagesize = chmp->chm_ebh->flash_if->page_size;
+ dbg("wbuf size: %zu\n", chmp->chm_wbuf_pagesize);
+ chmp->chm_wbuf = kmem_alloc(chmp->chm_wbuf_pagesize, KM_SLEEP);
+ rw_init(&chmp->chm_lock_wbuf);
+
+ //init queues
+ TAILQ_INIT(&chmp->chm_free_queue);
+ TAILQ_INIT(&chmp->chm_clean_queue);
+ TAILQ_INIT(&chmp->chm_dirty_queue);
+ TAILQ_INIT(&chmp->chm_very_dirty_queue);
+ TAILQ_INIT(&chmp->chm_erasable_pending_wbuf_queue);
+ TAILQ_INIT(&chmp->chm_erase_pending_queue);
+
+ chfs_calc_trigger_levels(chmp);
+
+ chmp->chm_nr_free_blocks = 0;
+ chmp->chm_nr_erasable_blocks = 0;
+ chmp->chm_max_vno = 2;
+ chmp->chm_checked_vno = 2;
+ chmp->chm_unchecked_size = 0;
+ chmp->chm_used_size = 0;
+ chmp->chm_dirty_size = 0;
+ chmp->chm_wasted_size = 0;
+ chmp->chm_free_size = chmp->chm_ebh->eb_size * chmp->chm_ebh->peb_nr;
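+	/* Scan the eraseblocks and rebuild the in-core filesystem state. */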
+ err = chfs_build_filesystem(chmp);
+
+ if (err) {
+ chfs_vnocache_hash_destroy(chmp->chm_vnocache_hash);
+		kmem_free(chmp->chm_blocks, chmp->chm_ebh->peb_nr *
+		    sizeof(struct chfs_eraseblock));
+		ebh_close(chmp->chm_ebh);
+		kmem_free(chmp->chm_ebh, sizeof(struct chfs_ebh));
+ free(chmp, M_UFSMNT);
+ return EIO;
+ }
+
+ mp->mnt_data = ump;
+ mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev;
+ mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_CHFS);
+ mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
+ mp->mnt_stat.f_namemax = MAXNAMLEN;
+ mp->mnt_flag |= MNT_LOCAL;
+ mp->mnt_fs_bshift = PAGE_SHIFT;
+ mp->mnt_dev_bshift = DEV_BSHIFT;
+ mp->mnt_iflag |= IMNT_MPSAFE;
+ ump->um_flags = 0;
+ ump->um_mountp = mp;
+ ump->um_dev = dev;
+ ump->um_devvp = devvp;
+ ump->um_maxfilesize = 1048512 * 1024;
+ /*TODO fill these fields
+ ump->um_nindir =
+ ump->um_lognindir =
+ ump->um_bptrtodb =
+ ump->um_seqinc =
+ ump->um_maxsymlinklen =
+ ump->um_dirblksiz =
+ ump->um_maxfilesize =
+ */
+
+ /*
+ * Allocate the root vnode.
+ */
+ err = VFS_VGET(mp, CHFS_ROOTINO, &vp);
+ if (err) {
+ dbg("error: %d while allocating root node\n", err);
+ return err;
+ }
+ vput(vp);
+
+ chfs_gc_thread_start(chmp);
+ mutex_enter(&chmp->chm_lock_mountfields);
+ chfs_gc_trigger(chmp);
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ devvp->v_specmountpoint = mp;
+ return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+/* ARGSUSED2 */
+static int
+chfs_unmount(struct mount *mp, int mntflags)
+{
+ int flags = 0, i = 0;
+ struct ufsmount *ump;
+ struct chfs_mount *chmp;
+// struct chfs_vnode_cache *vc, *next;
+
+ if (mntflags & MNT_FORCE)
+ flags |= FORCECLOSE;
+
+ dbg("[START]\n");
+
+ ump = VFSTOUFS(mp);
+ chmp = ump->um_chfs;
+
+ chfs_gc_thread_stop(chmp);
+
+ (void)vflush(mp, NULLVP, flags);
+
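+	/* Write out any data still pending in the write buffer. */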
+ if (chmp->chm_wbuf_len) {
+ mutex_enter(&chmp->chm_lock_mountfields);
+ chfs_flush_pending_wbuf(chmp);
+ mutex_exit(&chmp->chm_lock_mountfields);
+ }
+
+ for (i = 0; i < chmp->chm_ebh->peb_nr; i++) {
+ chfs_free_node_refs(&chmp->chm_blocks[i]);
+ }
+
+ chfs_vnocache_hash_destroy(chmp->chm_vnocache_hash);
+
+ ebh_close(chmp->chm_ebh);
+
+ rw_destroy(&chmp->chm_lock_wbuf);
+ mutex_destroy(&chmp->chm_lock_vnocache);
+ mutex_destroy(&chmp->chm_lock_sizes);
+ mutex_destroy(&chmp->chm_lock_mountfields);
+
+ if (ump->um_devvp->v_type != VBAD) {
+ ump->um_devvp->v_specmountpoint = NULL;
+ }
+ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+ (void)VOP_CLOSE(ump->um_devvp, FREAD|FWRITE, NOCRED);
+ vput(ump->um_devvp);
+
+ mutex_destroy(&ump->um_lock);
+
+ //free(ump->um_chfs, M_UFSMNT);
+ free(ump, M_UFSMNT);
+ mp->mnt_data = NULL;
+ mp->mnt_flag &= ~MNT_LOCAL;
+ dbg("[END]\n");
+ return (0);
+}
+
+/* --------------------------------------------------------------------- */
+
+static int
+chfs_root(struct mount *mp, struct vnode **vpp)
+{
+ struct vnode *vp;
+ int error;
+
+ if ((error = VFS_VGET(mp, (ino_t)ROOTINO, &vp)) != 0)
+ return error;
+ *vpp = vp;
+ return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+extern rb_tree_ops_t frag_rbtree_ops;
+
+static int
+chfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
+{
+ struct chfs_mount *chmp;
+ struct chfs_inode *ip;
+ struct ufsmount *ump;
+ struct vnode *vp;
+ dev_t dev;
+ int error;
+ struct chfs_vnode_cache* chvc = NULL;
+ struct chfs_node_ref* nref = NULL;
+ struct buf *bp;
+
+ dbg("vget() | ino: %llu\n", (unsigned long long)ino);
+
+ ump = VFSTOUFS(mp);
+ dev = ump->um_dev;
+retry:
+ if (!vpp) {
+ vpp = kmem_alloc(sizeof(struct vnode*), KM_SLEEP);
+ }
+
+ if ((*vpp = chfs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) {
+ return 0;
+ }
+
+ /* Allocate a new vnode/inode. */
+ if ((error = getnewvnode(VT_CHFS,
+ mp, chfs_vnodeop_p, NULL, &vp)) != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+ ip = pool_get(&chfs_inode_pool, PR_WAITOK);
+
+ mutex_enter(&chfs_hashlock);
+ if ((*vpp = chfs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) {
+ mutex_exit(&chfs_hashlock);
+ ungetnewvnode(vp);
+ pool_put(&chfs_inode_pool, ip);
+ goto retry;
+ }
+
+ vp->v_vflag |= VV_LOCKSWORK;
+
+ memset(ip, 0, sizeof(*ip));
+ vp->v_data = ip;
+ ip->vp = vp;
+ ip->ump = ump;
+ ip->chmp = chmp = ump->um_chfs;
+ ip->dev = dev;
+ ip->ino = ino;
+ vp->v_mount = mp;
+ genfs_node_init(vp, &chfs_genfsops);
+
+ rb_tree_init(&ip->fragtree, &frag_rbtree_ops);
+ //mutex_init(&ip->inode_lock, MUTEX_DEFAULT, IPL_NONE);
+
+ chfs_ihashins(ip);
+ mutex_exit(&chfs_hashlock);
+
+ // set root inode
+ if (ino == CHFS_ROOTINO) {
+ dbg("SETROOT\n");
+ vp->v_vflag |= VV_ROOT;
+ vp->v_type = VDIR;
+ ip->mode = IFMT | IEXEC | IWRITE | IREAD;
+ ip->iflag |= (IN_ACCESS | IN_CHANGE | IN_UPDATE);
+ chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+// ip->dents = NULL; XXXTAILQ
+ TAILQ_INIT(&ip->dents);
+ chfs_set_vnode_size(vp, 512);
+ }
+
+ // set vnode cache
+ mutex_enter(&chmp->chm_lock_vnocache);
+ chvc = chfs_vnode_cache_get(chmp, ino);
+ mutex_exit(&chmp->chm_lock_vnocache);
+ if (!chvc) {
+ dbg("!chvc\n");
+ /* XXX: we can't allocate under a lock; refactor this! */
+ chvc = chfs_vnode_cache_alloc(ino);
+ mutex_enter(&chmp->chm_lock_vnocache);
+ if (ino == CHFS_ROOTINO) {
+ chvc->nlink = 2;
+ chvc->pvno = CHFS_ROOTINO;
+ chfs_vnode_cache_set_state(chmp,
+ chvc, VNO_STATE_CHECKEDABSENT);
+ }
+ chfs_vnode_cache_add(chmp, chvc);
+ mutex_exit(&chmp->chm_lock_vnocache);
+
+ ip->chvc = chvc;
+ TAILQ_INIT(&ip->dents);
+ } else {
+ dbg("chvc\n");
+ ip->chvc = chvc;
+ // if we have a vnode cache, the node is already on flash, so read it
+ if (ino == CHFS_ROOTINO) {
+ chvc->pvno = CHFS_ROOTINO;
+ TAILQ_INIT(&chvc->scan_dirents);
+ } else {
+ chfs_readvnode(mp, ino, &vp);
+ }
+
+ mutex_enter(&chmp->chm_lock_mountfields);
+ // init type specific things
+ switch (vp->v_type) {
+ case VDIR:
+ nref = chvc->dirents;
+ while (nref &&
+ (struct chfs_vnode_cache *)nref != chvc) {
+ chfs_readdirent(mp, nref, ip);
+ nref = nref->nref_next;
+ }
+ chfs_set_vnode_size(vp, 512);
+ break;
+ case VREG:
+ case VSOCK:
+ //build the fragtree of the vnode
+ dbg("read_inode_internal | ino: %llu\n",
+ (unsigned long long)ip->ino);
+ error = chfs_read_inode(chmp, ip);
+ if (error) {
+ vput(vp);
+ *vpp = NULL;
+ mutex_exit(&chmp->chm_lock_mountfields);
+ return (error);
+ }
+ break;
+ case VLNK:
+ //build the fragtree of the vnode
+ dbg("read_inode_internal | ino: %llu\n",
+ (unsigned long long)ip->ino);
+ error = chfs_read_inode_internal(chmp, ip);
+ if (error) {
+ vput(vp);
+ *vpp = NULL;
+ mutex_exit(&chmp->chm_lock_mountfields);
+ return (error);
+ }
+
+ dbg("size: %llu\n", (unsigned long long)ip->size);
+ bp = getiobuf(vp, true);
+ bp->b_blkno = 0;
+ bp->b_bufsize = bp->b_resid =
+ bp->b_bcount = ip->size;
+ bp->b_data = kmem_alloc(ip->size, KM_SLEEP);
+ chfs_read_data(chmp, vp, bp);
+ if (!ip->target)
+ ip->target = kmem_alloc(ip->size,
+ KM_SLEEP);
+ memcpy(ip->target, bp->b_data, ip->size);
+ kmem_free(bp->b_data, ip->size);
+ putiobuf(bp);
+
+ break;
+ case VCHR:
+ case VBLK:
+ case VFIFO:
+ //build the fragtree of the vnode
+ dbg("read_inode_internal | ino: %llu\n",
+ (unsigned long long)ip->ino);
+ error = chfs_read_inode_internal(chmp, ip);
+ if (error) {
+ vput(vp);
+ *vpp = NULL;
+ mutex_exit(&chmp->chm_lock_mountfields);
+ return (error);
+ }
+
+ bp = getiobuf(vp, true);
+ bp->b_blkno = 0;
+ bp->b_bufsize = bp->b_resid =
+ bp->b_bcount = sizeof(dev_t);
+ bp->b_data = kmem_alloc(sizeof(dev_t), KM_SLEEP);
+ chfs_read_data(chmp, vp, bp);
+ memcpy(&ip->rdev,
+ bp->b_data, sizeof(dev_t));
+ kmem_free(bp->b_data, sizeof(dev_t));
+ putiobuf(bp);
+ if (vp->v_type == VFIFO)
+ vp->v_op = chfs_fifoop_p;
+ else {
+ vp->v_op = chfs_specop_p;
+ spec_node_init(vp, ip->rdev);
+ }
+
+ break;
+ case VNON:
+ case VBAD:
+ break;
+ }
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ }
+
+ /* finish inode initialization */
+ ip->devvp = ump->um_devvp;
+ vref(ip->devvp);
+
+ uvm_vnp_setsize(vp, ip->size);
+ *vpp = vp;
+
+ return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+static int
+chfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
+{
+ return ENODEV;
+}
+
+/* --------------------------------------------------------------------- */
+
+static int
+chfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
+{
+ return ENODEV;
+}
+
+/* --------------------------------------------------------------------- */
+
+static int
+chfs_start(struct mount *mp, int flags)
+{
+ return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+/* ARGSUSED2 */
+static int
+chfs_statvfs(struct mount *mp, struct statvfs *sbp)
+{
+ struct chfs_mount *chmp;
+ struct ufsmount *ump;
+ dbg("statvfs\n");
+
+ ump = VFSTOUFS(mp);
+ chmp = ump->um_chfs;
+
+ sbp->f_flag = mp->mnt_flag;
+ sbp->f_bsize = chmp->chm_ebh->eb_size;
+ sbp->f_frsize = chmp->chm_ebh->eb_size;
+ sbp->f_iosize = chmp->chm_ebh->eb_size;
+
+ sbp->f_blocks = chmp->chm_ebh->peb_nr;
+ sbp->f_files = 0;
+ sbp->f_bavail = chmp->chm_nr_free_blocks - chmp->chm_resv_blocks_write;
+#if 0
+ printf("chmp->chm_nr_free_blocks: %jd\n",
+ (intmax_t )chmp->chm_nr_free_blocks);
+ printf("chmp->chm_resv_blocks_write: %jd\n",
+ (intmax_t) chmp->chm_resv_blocks_write);
+ printf("chmp->chm_ebh->peb_nr: %jd\n",
+ (intmax_t) chmp->chm_ebh->peb_nr);
+#endif
+
+ sbp->f_bfree = chmp->chm_nr_free_blocks;
+ sbp->f_bresvd = chmp->chm_resv_blocks_write;
+
+ /* FFS specific */
+ sbp->f_ffree = 0;
+ sbp->f_favail = 0;
+ sbp->f_fresvd = 0;
+
+ copy_statvfs_info(sbp, mp);
+
+ return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+/* ARGSUSED0 */
+static int
+chfs_sync(struct mount *mp, int waitfor,
+ kauth_cred_t uc)
+{
+ return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+static void
+chfs_init(void)
+{
+ chfs_alloc_pool_caches();
+ chfs_ihashinit();
+ pool_init(&chfs_inode_pool, sizeof(struct chfs_inode), 0, 0, 0,
+ "chfsinopl", &pool_allocator_nointr, IPL_NONE);
+ ufs_init();
+}
+
+/* --------------------------------------------------------------------- */
+
+static void
+chfs_reinit(void)
+{
+ chfs_ihashreinit();
+ ufs_reinit();
+}
+
+/* --------------------------------------------------------------------- */
+
+static void
+chfs_done(void)
+{
+ ufs_done();
+ chfs_ihashdone();
+ pool_destroy(&chfs_inode_pool);
+ chfs_destroy_pool_caches();
+}
+
+/* --------------------------------------------------------------------- */
+
+static int
+chfs_snapshot(struct mount *mp, struct vnode *vp,
+ struct timespec *ctime)
+{
+ return ENODEV;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * chfs vfs operations.
+ */
+
+extern const struct vnodeopv_desc chfs_fifoop_opv_desc;
+extern const struct vnodeopv_desc chfs_specop_opv_desc;
+extern const struct vnodeopv_desc chfs_vnodeop_opv_desc;
+
+const struct vnodeopv_desc * const chfs_vnodeopv_descs[] = {
+ &chfs_fifoop_opv_desc,
+ &chfs_specop_opv_desc,
+ &chfs_vnodeop_opv_desc,
+ NULL,
+};
+
+struct vfsops chfs_vfsops = {
+ MOUNT_CHFS, /* vfs_name */
+ sizeof (struct chfs_args),
+ chfs_mount, /* vfs_mount */
+ chfs_start, /* vfs_start */
+ chfs_unmount, /* vfs_unmount */
+ chfs_root, /* vfs_root */
+ ufs_quotactl, /* vfs_quotactl */
+ chfs_statvfs, /* vfs_statvfs */
+ chfs_sync, /* vfs_sync */
+ chfs_vget, /* vfs_vget */
+ chfs_fhtovp, /* vfs_fhtovp */
+ chfs_vptofh, /* vfs_vptofh */
+ chfs_init, /* vfs_init */
+ chfs_reinit, /* vfs_reinit */
+ chfs_done, /* vfs_done */
+ NULL, /* vfs_mountroot */
+ chfs_snapshot, /* vfs_snapshot */
+ vfs_stdextattrctl, /* vfs_extattrctl */
+ (void *)eopnotsupp, /* vfs_suspendctl */
+ genfs_renamelock_enter,
+ genfs_renamelock_exit,
+ (void *)eopnotsupp,
+ chfs_vnodeopv_descs,
+ 0, /* vfs_refcount */
+ { NULL, NULL },
+};
+
+static int
+chfs_modcmd(modcmd_t cmd, void *arg)
+{
+ switch (cmd) {
+ case MODULE_CMD_INIT:
+ return vfs_attach(&chfs_vfsops);
+ case MODULE_CMD_FINI:
+ return vfs_detach(&chfs_vfsops);
+ default:
+ return ENOTTY;
+ }
+}
--- /dev/null
+/* $NetBSD: chfs_vnode.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+#include "chfs_inode.h"
+#include <sys/malloc.h>
+#include <sys/kauth.h>
+#include <sys/namei.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+
+struct vnode *
+chfs_vnode_lookup(struct chfs_mount *chmp, ino_t vno)
+{
+ struct vnode *vp;
+ struct chfs_inode *ip;
+
+ TAILQ_FOREACH(vp, &chmp->chm_fsmp->mnt_vnodelist, v_mntvnodes) {
+ ip = VTOI(vp);
+ if (ip && ip->ino == vno)
+ return vp;
+ }
+ return NULL;
+}
+
+int
+chfs_readvnode(struct mount* mp, ino_t ino, struct vnode** vpp)
+{
+ struct ufsmount* ump = VFSTOUFS(mp);
+ struct chfs_mount *chmp = ump->um_chfs;
+ struct chfs_vnode_cache *chvc;
+ struct chfs_flash_vnode *chfvn;
+ struct chfs_inode *ip;
+ int err;
+ char* buf;
+ size_t retlen, len;
+ struct vnode* vp = NULL;
+ dbg("readvnode | ino: %llu\n", (unsigned long long)ino);
+
+ len = sizeof(struct chfs_flash_vnode);
+
+ KASSERT(vpp != NULL);
+
+ if (vpp != NULL) {
+ vp = *vpp;
+ }
+
+ ip = VTOI(vp);
+ chvc = ip->chvc;
+
+ if (chvc && ino != CHFS_ROOTINO) {
+ /* debug... */
+ printf("readvnode; offset: %" PRIu32 ", lnr: %d\n",
+ CHFS_GET_OFS(chvc->v->nref_offset), chvc->v->nref_lnr);
+
+ KASSERT((void *)chvc != (void *)chvc->v);
+
+ buf = kmem_alloc(len, KM_SLEEP);
+ err = chfs_read_leb(chmp, chvc->v->nref_lnr, buf,
+ CHFS_GET_OFS(chvc->v->nref_offset), len, &retlen);
+ if (err)
+ return err;
+ if (retlen != len) {
+ chfs_err("Error reading vnode: read: %zu insted of: %zu\n",
+ len, retlen);
+ return EIO;
+ }
+ chfvn = (struct chfs_flash_vnode*)buf;
+ chfs_set_vnode_size(vp, chfvn->dn_size);
+ ip->mode = chfvn->mode;
+ vp->v_type = IFTOVT(ip->mode);
+ ip->version = chfvn->version;
+ //ip->chvc->highest_version = ip->version;
+ ip->uid = chfvn->uid;
+ ip->gid = chfvn->gid;
+ ip->atime = chfvn->atime;
+ ip->mtime = chfvn->mtime;
+ ip->ctime = chfvn->ctime;
+ kmem_free(buf, len);
+ }
+
+
+ *vpp = vp;
+ return 0;
+}
+
+int
+chfs_readdirent(struct mount *mp, struct chfs_node_ref *chnr, struct chfs_inode *pdir)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct chfs_mount *chmp = ump->um_chfs;
+ struct chfs_flash_dirent_node chfdn;
+ struct chfs_dirent *fd;//, *pdents;
+ size_t len = sizeof(struct chfs_flash_dirent_node);
+// struct chfs_vnode_cache* parent;
+ size_t retlen;
+ int err = 0;
+
+// parent = chfs_get_vnode_cache(chmp, pdir->ino);
+
+ //read flash_dirent_node
+ err = chfs_read_leb(chmp, chnr->nref_lnr, (char *)&chfdn,
+ CHFS_GET_OFS(chnr->nref_offset), len, &retlen);
+ if (err) {
+ return err;
+ }
+ if (retlen != len) {
+ chfs_err("Error reading vnode: read: %zu insted of: %zu\n",
+ retlen, len);
+ return EIO;
+ }
+
+ //set fields of dirent
+ fd = chfs_alloc_dirent(chfdn.nsize + 1);
+ fd->version = chfdn.version;
+ fd->vno = chfdn.vno;
+ fd->type = chfdn.dtype;
+ fd->nsize = chfdn.nsize;
+// fd->next = NULL;
+
+ err = chfs_read_leb(chmp, chnr->nref_lnr, fd->name,
+ CHFS_GET_OFS(chnr->nref_offset) + len, chfdn.nsize, &retlen);
+ if (err) {
+ return err;
+ }
+
+ if (retlen != chfdn.nsize) {
+ chfs_err("Error reading vnode: read: %zu insted of: %zu\n",
+ len, retlen);
+ return EIO;
+ }
+
+ fd->name[fd->nsize] = 0;
+ fd->nref = chnr;
+
+ chfs_add_fd_to_inode(chmp, pdir, fd);
+/*
+ pdents = pdir->i_chfs_ext.dents;
+ if (!pdents)
+ pdir->i_chfs_ext.dents = fd;
+ else {
+ while (pdents->next != NULL) {
+ pdents = pdents->next;
+ }
+ pdents->next = fd;
+ }
+*/
+ return 0;
+}
+
+/*
+ * Allocate a new inode.
+ */
+int
+chfs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
+ struct componentname *cnp, int type)
+{
+ struct chfs_inode *ip, *pdir;
+ struct vnode *vp;
+ struct ufsmount* ump = VFSTOUFS(dvp->v_mount);
+ struct chfs_mount* chmp = ump->um_chfs;
+ struct chfs_vnode_cache* chvc;
+ int error, ismember = 0;
+ ino_t vno;
+ struct chfs_dirent *nfd;//, *fd;
+
+ dbg("makeinode\n");
+ pdir = VTOI(dvp);
+
+ *vpp = NULL;
+
+ vno = ++(chmp->chm_max_vno);
+
+ error = VFS_VGET(dvp->v_mount, vno, &vp);
+ if (error)
+ return (error);
+
+ mutex_enter(&chmp->chm_lock_vnocache);
+ chvc = chfs_vnode_cache_get(chmp, vno);
+ mutex_exit(&chmp->chm_lock_vnocache);
+
+ chvc->pvno = pdir->ino;
+ chvc->vno_version = kmem_alloc(sizeof(uint64_t), KM_SLEEP);
+ *(chvc->vno_version) = 1;
+ if (type != VDIR)
+ chvc->nlink = 1;
+ else
+ chvc->nlink = 2;
+// chfs_vnode_cache_set_state(chmp, chvc, VNO_STATE_CHECKEDABSENT);
+ chvc->state = VNO_STATE_CHECKEDABSENT;
+
+ ip = VTOI(vp);
+ ip->ino = vno;
+
+ if (type == VDIR)
+ chfs_set_vnode_size(vp, 512);
+ else
+ chfs_set_vnode_size(vp, 0);
+
+ ip->uid = kauth_cred_geteuid(cnp->cn_cred);
+ ip->gid = kauth_cred_getegid(cnp->cn_cred);
+ ip->version = 1;
+ ip->iflag |= (IN_ACCESS | IN_CHANGE | IN_UPDATE);
+
+ ip->chvc = chvc;
+ //ip->chvc->highest_version = 1;
+ ip->target = NULL;
+
+ ip->mode = mode;
+ vp->v_type = type; /* Rest init'd in getnewvnode(). */
+ if ((ip->mode & ISGID) && (kauth_cred_ismember_gid(cnp->cn_cred,
+ ip->gid, &ismember) != 0 || !ismember) &&
+ kauth_authorize_generic(cnp->cn_cred, KAUTH_GENERIC_ISSUSER, NULL))
+ ip->mode &= ~ISGID;
+
+ chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+
+ mutex_enter(&chmp->chm_lock_mountfields);
+
+ //write inode to flash
+ error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL);
+ if (error) {
+ mutex_exit(&chmp->chm_lock_mountfields);
+ vput(vp);
+ vput(dvp);
+ return error;
+ }
+ //update parent directory and write it to the flash
+ pdir->iflag |= (IN_ACCESS | IN_CHANGE | IN_MODIFY | IN_UPDATE);
+ chfs_update(dvp, NULL, NULL, UPDATE_WAIT);
+
+ error = chfs_write_flash_vnode(chmp, pdir, ALLOC_NORMAL);
+ if (error) {
+ mutex_exit(&chmp->chm_lock_mountfields);
+ vput(vp);
+ vput(dvp);
+ return error;
+ }
+ vput(dvp);
+
+ //set up node's full dirent
+ nfd = chfs_alloc_dirent(cnp->cn_namelen + 1);
+ nfd->vno = ip->ino;
+ nfd->version = (++pdir->chvc->highest_version);
+ nfd->type = type;
+// nfd->next = NULL;
+ nfd->nsize = cnp->cn_namelen;
+ memcpy(&(nfd->name), cnp->cn_nameptr, cnp->cn_namelen);
+ nfd->name[nfd->nsize] = 0;
+ nfd->nhash = hash32_buf(nfd->name, cnp->cn_namelen, HASH32_BUF_INIT);
+
+ // write out direntry
+ error = chfs_write_flash_dirent(chmp, pdir, ip, nfd, ip->ino, ALLOC_NORMAL);
+ if (error) {
+ mutex_exit(&chmp->chm_lock_mountfields);
+ vput(vp);
+ return error;
+ }
+
+ //TODO set parent's dir times
+
+ chfs_add_fd_to_inode(chmp, pdir, nfd);
+/*
+ fd = pdir->i_chfs_ext.dents;
+ if (!fd)
+ pdir->i_chfs_ext.dents = nfd;
+ else {
+ while (fd->next != NULL) {
+ fd = fd->next;
+ }
+ fd->next = nfd;
+ }
+*/
+ //pdir->i_nlink++;
+ pdir->chvc->nlink++;
+
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ *vpp = vp;
+ return (0);
+}
+
+void
+chfs_set_vnode_size(struct vnode *vp, size_t size)
+{
+ struct chfs_inode *ip;
+
+ KASSERT(vp != NULL);
+
+ ip = VTOI(vp);
+ KASSERT(ip != NULL);
+
+ ip->size = size;
+ vp->v_size = vp->v_writesize = size;
+ return;
+}
+
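+/*
+ * The chfs_change_size_* helpers below adjust one kind of space counter
+ * (free, dirty, unchecked, used or wasted) by the same signed amount on
+ * both the whole filesystem (chmp) and a single eraseblock (cheb), so the
+ * two stay consistent.  Callers must hold chm_lock_sizes, as the KASSERTs
+ * enforce.
+ */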
+void
+chfs_change_size_free(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, int change)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+ KASSERT((int)(chmp->chm_free_size + change) >= 0);
+ KASSERT((int)(cheb->free_size + change) >= 0);
+ KASSERT((int)(cheb->free_size + change) <= chmp->chm_ebh->eb_size);
+ chmp->chm_free_size += change;
+ cheb->free_size += change;
+ return;
+}
+
+void
+chfs_change_size_dirty(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, int change)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+ KASSERT((int)(chmp->chm_dirty_size + change) >= 0);
+ KASSERT((int)(cheb->dirty_size + change) >= 0);
+ KASSERT((int)(cheb->dirty_size + change) <= chmp->chm_ebh->eb_size);
+ chmp->chm_dirty_size += change;
+ cheb->dirty_size += change;
+ return;
+}
+
+void
+chfs_change_size_unchecked(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, int change)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+ KASSERT((int)(chmp->chm_unchecked_size + change) >= 0);
+ KASSERT((int)(cheb->unchecked_size + change) >= 0);
+ KASSERT((int)(cheb->unchecked_size + change) <= chmp->chm_ebh->eb_size);
+ chmp->chm_unchecked_size += change;
+ cheb->unchecked_size += change;
+ return;
+}
+
+void
+chfs_change_size_used(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, int change)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+ KASSERT((int)(chmp->chm_used_size + change) >= 0);
+ KASSERT((int)(cheb->used_size + change) >= 0);
+ KASSERT((int)(cheb->used_size + change) <= chmp->chm_ebh->eb_size);
+ chmp->chm_used_size += change;
+ cheb->used_size += change;
+ return;
+}
+
+void
+chfs_change_size_wasted(struct chfs_mount *chmp,
+ struct chfs_eraseblock *cheb, int change)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+ KASSERT((int)(chmp->chm_wasted_size + change) >= 0);
+ KASSERT((int)(cheb->wasted_size + change) >= 0);
+ KASSERT((int)(cheb->wasted_size + change) <= chmp->chm_ebh->eb_size);
+ chmp->chm_wasted_size += change;
+ cheb->wasted_size += change;
+ return;
+}
+
--- /dev/null
+/* $NetBSD: chfs_vnode_cache.c,v 1.1 2011/11/24 15:51:32 ahoka Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+#include <sys/pool.h>
+
+struct chfs_vnode_cache **
+chfs_vnocache_hash_init(void)
+{
+ return kmem_zalloc(VNODECACHE_SIZE *
+ sizeof(struct chfs_vnode_cache *), KM_SLEEP);
+}
+
+/**
+ * chfs_vnode_cache_set_state - set the state of a vnode_cache
+ * @chmp: fs super block info
+ * @vc: vnode_cache
+ * @state: new state
+ */
+void
+chfs_vnode_cache_set_state(struct chfs_mount *chmp,
+ struct chfs_vnode_cache* vc, int state)
+{
+ /* XXX do we really need locking here? */
+ KASSERT(mutex_owned(&chmp->chm_lock_vnocache));
+ vc->state = state;
+}
+
+/**
+ * chfs_vnode_cache_get - get a vnode_cache from the vnocache_hash
+ * @chmp: fs super block info
+ * @vno: vnode number to look up
+ * Returns the vnode_cache, or NULL if it is not in the hash.
+ */
+struct chfs_vnode_cache *
+chfs_vnode_cache_get(struct chfs_mount *chmp, ino_t vno)
+{
+ struct chfs_vnode_cache* ret;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_vnocache));
+
+ ret = chmp->chm_vnocache_hash[vno % VNODECACHE_SIZE];
+
+ if (ret == NULL) {
+ return NULL;
+ }
+
+ while (ret && ret->vno < vno) {
+ ret = ret->next;
+ }
+
+ if (ret && ret->vno != vno) {
+ ret = NULL;
+ }
+
+ return ret;
+}
+
+/**
+ * chfs_add_vnode_cache - add a vnode_cache to the vnocache_hash
+ * @chmp: fs super block info
+ * @new: new vnode_cache
+ */
+void
+chfs_vnode_cache_add(struct chfs_mount *chmp,
+ struct chfs_vnode_cache* new)
+{
+ struct chfs_vnode_cache** prev;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_vnocache));
+
+ if (!new->vno) {
+ new->vno = ++chmp->chm_max_vno;
+ }
+
+ prev = &chmp->chm_vnocache_hash[new->vno % VNODECACHE_SIZE];
+
+ while ((*prev) && (*prev)->vno < new->vno) {
+ prev = &((*prev)->next);
+ }
+ new->next = *prev;
+ *prev = new;
+}
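+
+/*
+ * Note: each bucket of the vnocache_hash is a singly linked list kept
+ * sorted by ascending vnode number; chfs_vnode_cache_get() and
+ * chfs_vnode_cache_remove() rely on this ordering to stop their list
+ * walks early.
+ */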
+
+/**
+ * chfs_vnode_cache_remove - remove a vnode_cache from the vnocache_hash
+ * @chmp: fs super block info
+ * @old: old vnode_cache
+ */
+void
+chfs_vnode_cache_remove(struct chfs_mount *chmp,
+ struct chfs_vnode_cache* old)
+{
+ struct chfs_vnode_cache** prev;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_vnocache));
+
+ prev = &chmp->chm_vnocache_hash[old->vno % VNODECACHE_SIZE];
+ while ((*prev) && (*prev)->vno < old->vno) {
+ prev = &(*prev)->next;
+ }
+
+ if ((*prev) == old) {
+ *prev = old->next;
+ }
+
+ if (old->state != VNO_STATE_READING &&
+ old->state != VNO_STATE_CLEARING) {
+ chfs_vnode_cache_free(old);
+ }
+}
+
+/**
+ * chfs_vnocache_hash_destroy - free every entry of the vnocache_hash
+ * @hash: the hash table whose entries are freed
+ */
+void
+chfs_vnocache_hash_destroy(struct chfs_vnode_cache **hash)
+{
+ struct chfs_vnode_cache *this, *next;
+ int i;
+
+ for (i = 0; i < VNODECACHE_SIZE; i++) {
+ this = hash[i];
+ while (this) {
+ next = this->next;
+ chfs_vnode_cache_free(this);
+ this = next;
+ }
+ hash[i] = NULL;
+ }
+}
+
+
--- /dev/null
+/* $NetBSD: chfs_vnops.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <miscfs/specfs/specdev.h>
+#include <miscfs/fifofs/fifo.h>
+#include <miscfs/genfs/genfs.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <uvm/uvm.h>
+#include <sys/namei.h>
+#include <sys/stat.h>
+#include <sys/fcntl.h>
+#include <sys/buf.h>
+#include <sys/fstrans.h>
+#include <sys/vnode.h>
+
+#include "chfs.h"
+
+#define READ_S "chfs_read"
+
+int
+chfs_lookup(void *v)
+{
+ struct vnode *dvp = ((struct vop_lookup_args *) v)->a_dvp;
+ struct vnode **vpp = ((struct vop_lookup_args *) v)->a_vpp;
+ struct componentname *cnp = ((struct vop_lookup_args *) v)->a_cnp;
+
+ int error;
+ struct chfs_inode* ip;
+ struct ufsmount* ump;
+ struct chfs_mount* chmp;
+ struct chfs_vnode_cache* chvc;
+ struct chfs_dirent* fd;
+
+ dbg("lookup(): %s\n", cnp->cn_nameptr);
+
+ KASSERT(VOP_ISLOCKED(dvp));
+
+ *vpp = NULL;
+
+ // Check accessibility of requested node as a first step.
+ error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred);
+ if (error != 0) {
+ goto out;
+ }
+
+ // If requesting the last path component on a read-only file system
+ // with a write operation, deny it.
+ if ((cnp->cn_flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY)
+ && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EROFS;
+ goto out;
+ }
+
+ // Avoid doing a linear scan of the directory if the requested
+ // directory/name couple is already in the cache.
+ error = cache_lookup(dvp, vpp, cnp);
+ if (error >= 0) {
+ goto out;
+ }
+
+ ip = VTOI(dvp);
+ ump = VFSTOUFS(dvp->v_mount);
+ chmp = ump->um_chfs;
+ if (ip->ino == 0) {
+ ip->ino = ++chmp->chm_max_vno;
+ }
+ mutex_enter(&chmp->chm_lock_vnocache);
+ chvc = chfs_vnode_cache_get(chmp, ip->ino);
+ mutex_exit(&chmp->chm_lock_vnocache);
+
+ // We cannot be requesting the parent directory of the root node.
+ KASSERT(IMPLIES(dvp->v_type == VDIR && chvc->pvno == chvc->vno,
+ !(cnp->cn_flags & ISDOTDOT)));
+
+ if (cnp->cn_flags & ISDOTDOT) {
+ VOP_UNLOCK(dvp);
+ error = VFS_VGET(dvp->v_mount, ip->chvc->pvno, vpp);
+ vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+ } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
+ vref(dvp);
+ *vpp = dvp;
+ error = 0;
+ } else {
+ fd = chfs_dir_lookup(ip, cnp);
+
+ if (fd == NULL) {
+ dbg("fd null\n");
+ // The entry was not found in the directory.
+ // This is OK if we are creating or renaming an
+ // entry and are working on the last component of
+ // the path name.
+ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == CREATE
+ || cnp->cn_nameiop == RENAME)) {
+ error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred);
+ if (error) {
+ dbg("after the entry was not found in dir\n");
+ goto out;
+ }
+
+ dbg("return EJUSTRETURN\n");
+ error = EJUSTRETURN;
+ } else {
+ error = ENOENT;
+ }
+ } else {
+ // If we are not at the last path component and
+ // found a non-directory or non-link entry (which
+ // may itself be pointing to a directory), raise
+ // an error.
+ if ((fd->type != VDIR && fd->type != VLNK) && !(cnp->cn_flags
+ & ISLASTCN)) {
+ error = ENOTDIR;
+ goto out;
+ }
+
+ dbg("vno@allocating new vnode: %llu\n",
+ (unsigned long long)fd->vno);
+ error = VFS_VGET(dvp->v_mount, fd->vno, vpp);
+ }
+ }
+ // Store the result of this lookup in the cache. Avoid this if the
+ // request was for creation, as it does not improve timings in
+ // empirical tests.
+ if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE
+ && (cnp->cn_flags & ISDOTDOT) == 0)
+ cache_enter(dvp, *vpp, cnp);
+
+out:
+ // If there were no errors, *vpp cannot be null and it must be
+ // locked.
+ KASSERT(IFF(error == 0, *vpp != NULL && VOP_ISLOCKED(*vpp)));
+
+ // dvp must always be locked.
+ KASSERT(VOP_ISLOCKED(dvp));
+
+ return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_create(void *v)
+{
+ struct vop_create_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap = v;
+ int error, mode;
+ dbg("create()\n");
+
+ mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode);
+
+ if ((mode & IFMT) == 0) {
+ if (ap->a_vap->va_type == VREG)
+ mode |= IFREG;
+ if (ap->a_vap->va_type == VSOCK)
+ mode |= IFSOCK;
+ }
+
+ error = chfs_makeinode(mode, ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap->va_type);
+
+ if (error) {
+ dbg("error: %d\n", error);
+ return error;
+ }
+
+ VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+ return 0;
+}
+/* --------------------------------------------------------------------- */
+
+int
+chfs_mknod(void *v)
+{
+ struct vnode *dvp = ((struct vop_mknod_args *) v)->a_dvp;
+ struct vnode **vpp = ((struct vop_mknod_args *) v)->a_vpp;
+ struct componentname *cnp = ((struct vop_mknod_args *) v)->a_cnp;
+ struct vattr *vap = ((struct vop_mknod_args *) v)->a_vap;
+ int mode, err = 0;
+ struct chfs_inode *ip;
+ struct vnode *vp;
+
+ struct ufsmount *ump;
+ struct chfs_mount *chmp;
+ ino_t ino;
+
+ struct chfs_full_dnode *fd;
+ struct buf *bp;
+ int len;
+ dbg("mknod()\n");
+
+ ump = VFSTOUFS(dvp->v_mount);
+ chmp = ump->um_chfs;
+
+ if (vap->va_type != VBLK && vap->va_type != VCHR && vap->va_type != VFIFO)
+ return EINVAL;
+
+ vp = *vpp;
+
+ mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+ if ((mode & IFMT) == 0) {
+ switch (vap->va_type) {
+ case VBLK:
+ mode |= IFBLK;
+ break;
+ case VCHR:
+ mode |= IFCHR;
+ break;
+ case VFIFO:
+ mode |= IFIFO;
+ break;
+ default:
+ break;
+ }
+ }
+
+ err = chfs_makeinode(mode, dvp, &vp, cnp, vap->va_type);
+
+ ip = VTOI(vp);
+ ino = ip->ino;
+ if (vap->va_rdev != VNOVAL)
+ ip->rdev = vap->va_rdev;
+
+ if (vap->va_type == VFIFO)
+ vp->v_op = chfs_fifoop_p;
+ else {
+ vp->v_op = chfs_specop_p;
+ spec_node_init(vp, ip->rdev);
+ }
+
+ if (err)
+ return err;
+
+ len = sizeof(dev_t);
+ chfs_set_vnode_size(vp, len);
+ bp = getiobuf(vp, true);
+ bp->b_bufsize = bp->b_resid = len;
+ bp->b_data = kmem_alloc(len, KM_SLEEP);
+ memcpy(bp->b_data, &ip->rdev, len);
+ bp->b_blkno = 0;
+
+ fd = chfs_alloc_full_dnode();
+
+ mutex_enter(&chmp->chm_lock_mountfields);
+
+ err = chfs_write_flash_dnode(chmp, vp, bp, fd);
+ if (err) {
+ mutex_exit(&chmp->chm_lock_mountfields);
+ kmem_free(bp->b_data, len);
+ return err;
+ }
+
+ err = chfs_add_full_dnode_to_inode(chmp, ip, fd);
+ if (err) {
+ mutex_exit(&chmp->chm_lock_mountfields);
+ kmem_free(bp->b_data, len);
+ return err;
+ }
+
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ *vpp = vp;
+ kmem_free(bp->b_data, len);
+ putiobuf(bp);
+
+ return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_open(void *v)
+{
+ struct vnode *vp = ((struct vop_open_args *) v)->a_vp;
+ int mode = ((struct vop_open_args *) v)->a_mode;
+ dbg("open()\n");
+
+ int error;
+ struct chfs_inode *ip;
+
+ KASSERT(VOP_ISLOCKED(vp));
+
+ ip = VTOI(vp);
+
+ KASSERT(vp->v_size == ip->size);
+ if (ip->chvc->nlink < 1) {
+ error = ENOENT;
+ goto out;
+ }
+
+ // If the file is marked append-only, deny write requests.
+ if (ip->flags & APPEND && (mode & (FWRITE | O_APPEND)) == FWRITE)
+ error = EPERM;
+ else
+ error = 0;
+
+out:
+ KASSERT(VOP_ISLOCKED(vp));
+ return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_close(void *v)
+{
+ struct vnode *vp = ((struct vop_close_args *) v)->a_vp;
+ dbg("close()\n");
+
+ struct chfs_inode *ip;
+
+ KASSERT(VOP_ISLOCKED(vp));
+
+ ip = VTOI(vp);
+
+ if (ip->chvc->nlink > 0) {
+ //ip->chvc->nlink = 0;
+ chfs_update(vp, NULL, NULL, UPDATE_CLOSE);
+ }
+
+ return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_access(void *v)
+{
+ struct vnode *vp = ((struct vop_access_args *) v)->a_vp;
+ int mode = ((struct vop_access_args *) v)->a_mode;
+ kauth_cred_t cred = ((struct vop_access_args *) v)->a_cred;
+
+ dbg("access()\n");
+ struct chfs_inode *ip = VTOI(vp);
+
+ if (mode & VWRITE) {
+ switch (vp->v_type) {
+ case VLNK:
+ case VDIR:
+ case VREG:
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (EROFS);
+ break;
+ case VBLK:
+ case VCHR:
+ case VSOCK:
+ case VFIFO:
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (mode & VWRITE && ip->flags & IMMUTABLE)
+ return (EPERM);
+
+ return genfs_can_access(vp->v_type, ip->mode & ALLPERMS,
+ ip->uid, ip->gid, mode, cred);
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_getattr(void *v)
+{
+ struct vnode *vp = ((struct vop_getattr_args *) v)->a_vp;
+ struct vattr *vap = ((struct vop_getattr_args *) v)->a_vap;
+
+ struct chfs_inode *ip = VTOI(vp);
+ dbg("getattr()\n");
+
+ KASSERT(vp->v_size == ip->size);
+
+ vattr_null(vap);
+ CHFS_ITIMES(ip, NULL, NULL, NULL);
+
+ vap->va_type = vp->v_type;
+ vap->va_mode = ip->mode & ALLPERMS;
+ vap->va_nlink = ip->chvc->nlink;
+ vap->va_uid = ip->uid;
+ vap->va_gid = ip->gid;
+ vap->va_fsid = ip->dev;
+ vap->va_fileid = ip->ino;
+ vap->va_size = ip->size;
+ vap->va_blocksize = PAGE_SIZE;
+ vap->va_atime.tv_sec = ip->atime;
+ vap->va_atime.tv_nsec = 0;
+ vap->va_mtime.tv_sec = ip->mtime;
+ vap->va_mtime.tv_nsec = 0;
+ vap->va_ctime.tv_sec = ip->ctime;
+ vap->va_ctime.tv_nsec = 0;
+ vap->va_gen = ip->version;
+ vap->va_flags = ip->flags;
+ vap->va_rdev = ip->rdev;
+ vap->va_bytes = round_page(ip->size);
+ vap->va_filerev = VNOVAL;
+ vap->va_vaflags = 0;
+ vap->va_spare = VNOVAL;
+
+ return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+/* Note: modelled after the corresponding tmpfs function */
+
+int
+chfs_setattr(void *v)
+{
+ struct vnode *vp = ((struct vop_setattr_args *) v)->a_vp;
+ struct vattr *vap = ((struct vop_setattr_args *) v)->a_vap;
+ kauth_cred_t cred = ((struct vop_setattr_args *) v)->a_cred;
+
+ struct chfs_inode *ip;
+ struct ufsmount *ump = VFSTOUFS(vp->v_mount);
+ struct chfs_mount *chmp = ump->um_chfs;
+ int error = 0;
+
+ dbg("setattr()\n");
+
+ KASSERT(VOP_ISLOCKED(vp));
+ ip = VTOI(vp);
+
+ /* Abort if any unsettable attribute is given. */
+ if (vap->va_type != VNON || vap->va_nlink != VNOVAL ||
+ vap->va_fsid != VNOVAL || vap->va_fileid != VNOVAL ||
+ vap->va_blocksize != VNOVAL /*|| GOODTIME(&vap->va_ctime)*/ ||
+ vap->va_gen != VNOVAL || vap->va_rdev != VNOVAL ||
+ vap->va_bytes != VNOVAL) {
+ return EINVAL;
+ }
+
+ if (error == 0 && (vap->va_flags != VNOVAL))
+ error = chfs_chflags(vp, vap->va_flags, cred);
+
+ if (error == 0 && (vap->va_size != VNOVAL))
+ error = chfs_chsize(vp, vap->va_size, cred);
+
+ if (error == 0 && (vap->va_uid != VNOVAL || vap->va_gid != VNOVAL))
+ error = chfs_chown(vp, vap->va_uid, vap->va_gid, cred);
+
+ if (error == 0 && (vap->va_mode != VNOVAL))
+ error = chfs_chmod(vp, vap->va_mode, cred);
+
+#if 0
+ /* why do we need that? */
+ if (ip->flags & (IMMUTABLE | APPEND))
+ return EPERM;
+#endif
+
+ if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
+ error = genfs_can_chtimes(vp, vap->va_vaflags, ip->uid, cred);
+ if (error)
+ return error;
+ if (vap->va_atime.tv_sec != VNOVAL)
+ ip->iflag |= IN_ACCESS;
+ if (vap->va_mtime.tv_sec != VNOVAL)
+ ip->iflag |= IN_CHANGE | IN_UPDATE;
+ error = chfs_update(vp,
+ &vap->va_atime, &vap->va_mtime, UPDATE_WAIT);
+ if (error)
+ return error;
+ }
+
+ mutex_enter(&chmp->chm_lock_mountfields);
+ error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL);
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ return error;
+}
+
+int
+chfs_chmod(struct vnode *vp, int mode, kauth_cred_t cred)
+{
+ struct chfs_inode *ip = VTOI(vp);
+ int error;
+ dbg("chmod\n");
+
+ error = genfs_can_chmod(vp, cred, ip->uid, ip->gid, mode);
+ if (error)
+ return error;
+ ip->mode &= ~ALLPERMS;
+ ip->mode |= (mode & ALLPERMS);
+ ip->iflag |= IN_CHANGE;
+
+ error = chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+ if (error)
+ return error;
+
+ return 0;
+}
+
+int
+chfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred)
+{
+ struct chfs_inode *ip = VTOI(vp);
+ int error;
+ dbg("chown\n");
+
+ if (uid == (uid_t)VNOVAL)
+ uid = ip->uid;
+ if (gid == (gid_t)VNOVAL)
+ gid = ip->gid;
+
+ error = genfs_can_chown(vp, cred, ip->uid, ip->gid, uid, gid);
+ if (error)
+ return error;
+
+ ip->gid = gid;
+ ip->uid = uid;
+ ip->iflag |= IN_CHANGE;
+
+ error = chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+ if (error)
+ return error;
+
+ return 0;
+}
+
+
+/* --------------------------------------------------------------------- */
+/* calculates ((off_t)blk * chmp->chm_fs_bsize) */
+#define lblktosize(chmp, blk) \
+ (((off_t)(blk)) << (chmp)->chm_fs_bshift)
+
+/* calculates (loc % chmp->chm_fs_bsize) */
+#define blkoff(chmp, loc) \
+ ((loc) & (chmp)->chm_fs_qbmask)
+
+/* calculates (loc / chmp->chm_fs_bsize) */
+#define lblkno(chmp, loc) \
+ ((loc) >> (chmp)->chm_fs_bshift)
+
+/* calculates roundup(size, chmp->chm_fs_fsize) */
+#define fragroundup(chmp, size) \
+ (((size) + (chmp)->chm_fs_qfmask) & (chmp)->chm_fs_fmask)
+
+#define blksize(chmp, ip, lbn) \
+ (((lbn) >= NDADDR || (ip)->size >= lblktosize(chmp, (lbn) + 1)) \
+ ? (chmp)->chm_fs_bsize \
+ : (fragroundup(chmp, blkoff(chmp, (ip)->size))))
+
+/* calculates roundup(size, chmp->chm_fs_bsize) */
+#define blkroundup(chmp, size) \
+ (((size) + (chmp)->chm_fs_qbmask) & (chmp)->chm_fs_bmask)
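+
+/*
+ * Worked example (illustrative): with the 4096-byte block geometry set
+ * up at mount time (bshift = 12, qbmask = 4095, bmask = ~4095),
+ *   lblkno(chmp, 10000)     == 10000 >> 12            == 2
+ *   blkoff(chmp, 10000)     == 10000 & 4095           == 1808
+ *   blkroundup(chmp, 10000) == (10000 + 4095) & ~4095 == 12288
+ */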
+
+int
+chfs_read(void *v)
+{
+ struct vop_read_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct chfs_inode *ip;
+ struct uio *uio;
+ struct ufsmount *ump;
+ struct buf *bp;
+ struct chfs_mount *chmp;
+ daddr_t lbn, nextlbn;
+ off_t bytesinfile;
+ long size, xfersize, blkoffset;
+ int error, ioflag;
+ vsize_t bytelen;
+ bool usepc = false;
+
+ dbg("chfs_read\n");
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ ump = ip->ump;
+ uio = ap->a_uio;
+ ioflag = ap->a_ioflag;
+ error = 0;
+
+ dbg("ip->size:%llu\n", (unsigned long long)ip->size);
+
+#ifdef DIAGNOSTIC
+ if (uio->uio_rw != UIO_READ)
+ panic("%s: mode", READ_S);
+
+ if (vp->v_type == VLNK) {
+ if (ip->size < ump->um_maxsymlinklen)
+ panic("%s: short symlink", READ_S);
+ } else if (vp->v_type != VREG && vp->v_type != VDIR)
+ panic("%s: type %d", READ_S, vp->v_type);
+#endif
+ chmp = ip->chmp;
+ if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize)
+ return (EFBIG);
+ if (uio->uio_resid == 0)
+ return (0);
+
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+
+ if (uio->uio_offset >= ip->size)
+ goto out;
+
+ usepc = vp->v_type == VREG;
+ bytelen = 0;
+ if (usepc) {
+ const int advice = IO_ADV_DECODE(ap->a_ioflag);
+
+ while (uio->uio_resid > 0) {
+ if (ioflag & IO_DIRECT) {
+ genfs_directio(vp, uio, ioflag);
+ }
+ bytelen = MIN(ip->size - uio->uio_offset,
+ uio->uio_resid);
+ if (bytelen == 0)
+ break;
+ error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
+ UBC_READ | UBC_PARTIALOK |
+ (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0));
+ if (error)
+ break;
+
+ }
+ goto out;
+ }
+
+
+ dbg("start reading\n");
+ for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
+ bytesinfile = ip->size - uio->uio_offset;
+ if (bytesinfile <= 0)
+ break;
+ lbn = lblkno(chmp, uio->uio_offset);
+ nextlbn = lbn + 1;
+ size = blksize(chmp, ip, lbn);
+ blkoffset = blkoff(chmp, uio->uio_offset);
+ xfersize = MIN(MIN(chmp->chm_fs_bsize - blkoffset, uio->uio_resid),
+ bytesinfile);
+
+ if (lblktosize(chmp, nextlbn) >= ip->size) {
+ error = bread(vp, lbn, size, NOCRED, 0, &bp);
+ dbg("after bread\n");
+ } else {
+ int nextsize = blksize(chmp, ip, nextlbn);
+ dbg("size: %ld\n", size);
+ error = breadn(vp, lbn,
+ size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
+ dbg("after breadN\n");
+ }
+ if (error)
+ break;
+
+ /*
+ * We should only get non-zero b_resid when an I/O error
+ * has occurred, which should cause us to break above.
+ * However, if the short read did not cause an error,
+ * then we want to ensure that we do not uiomove bad
+ * or uninitialized data.
+ */
+ size -= bp->b_resid;
+ if (size < xfersize) {
+ if (size == 0)
+ break;
+ xfersize = size;
+ }
+ dbg("uiomove\n");
+ error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+ if (error)
+ break;
+ brelse(bp, 0);
+ }
+ if (bp != NULL)
+ brelse(bp, 0);
+
+out:
+ if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
+ ip->iflag |= IN_ACCESS;
+ if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) {
+ //error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error) {
+ fstrans_done(vp->v_mount);
+ return error;
+ }
+ error = chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+ //UFS_WAPBL_END(vp->v_mount);
+ }
+ }
+
+ dbg("[END]\n");
+ fstrans_done(vp->v_mount);
+ return (error);
+}
+
+
+/* --------------------------------------------------------------------- */
+
+/*from ffs write*/
+int
+chfs_write(void *v)
+{
+ struct vop_write_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct uio *uio;
+ struct chfs_inode *ip;
+ struct chfs_mount *chmp;
+ struct lwp *l;
+ kauth_cred_t cred;
+ off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
+ int blkoffset, error, flags, ioflag, resid;
+ int aflag;
+ int extended=0;
+ vsize_t bytelen;
+ bool async;
+ struct ufsmount *ump;
+
+
+ cred = ap->a_cred;
+ ioflag = ap->a_ioflag;
+ uio = ap->a_uio;
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ //dbg("file size (vp): %llu\n", (unsigned long long)vp->v_size);
+ //dbg("file size (ip): %llu\n", (unsigned long long)ip->i_size);
+ ump = ip->ump;
+
+ //dbg("uio->resid: %d\n", uio->uio_resid);
+ dbg("write\n");
+
+ KASSERT(vp->v_size == ip->size);
+
+ switch (vp->v_type) {
+ case VREG:
+ if (ioflag & IO_APPEND)
+ uio->uio_offset = ip->size;
+ if ((ip->flags & APPEND) && uio->uio_offset != ip->size)
+ return (EPERM);
+ /* FALLTHROUGH */
+ case VLNK:
+ break;
+ case VDIR:
+ if ((ioflag & IO_SYNC) == 0)
+ panic("chfs_write: nonsync dir write");
+ break;
+ default:
+ panic("chfs_write: type");
+ }
+
+ chmp = ip->chmp;
+ if (uio->uio_offset < 0 ||
+ (u_int64_t)uio->uio_offset +
+ uio->uio_resid > ump->um_maxfilesize) {
+ dbg("uio->uio_offset = %lld | uio->uio_offset + "
+ "uio->uio_resid (%llu) > ump->um_maxfilesize (%lld)\n",
+ (long long)uio->uio_offset,
+ (uint64_t)uio->uio_offset + uio->uio_resid,
+ (long long)ump->um_maxfilesize);
+ return (EFBIG);
+ }
+ /*
+ * Maybe this should be above the vnode op call, but so long as
+ * file servers have no limits, I don't think it matters.
+ */
+ l = curlwp;
+ if (vp->v_type == VREG && l &&
+ uio->uio_offset + uio->uio_resid >
+ l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
+ mutex_enter(proc_lock);
+ psignal(l->l_proc, SIGXFSZ);
+ mutex_exit(proc_lock);
+ return (EFBIG);
+ }
+ if (uio->uio_resid == 0)
+ return (0);
+
+ //mutex_enter(&ip->inode_lock);
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+
+ flags = ioflag & IO_SYNC ? B_SYNC : 0;
+ async = vp->v_mount->mnt_flag & MNT_ASYNC;
+ origoff = uio->uio_offset;
+ resid = uio->uio_resid;
+ osize = ip->size;
+ error = 0;
+
+
+ /*if ((ioflag & IO_JOURNALLOCKED) == 0) {
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error) {
+ fstrans_done(vp->v_mount);
+ return error;
+ }
+ }*/
+
+ preallocoff = round_page(blkroundup(chmp,
+ MAX(osize, uio->uio_offset)));
+ aflag = ioflag & IO_SYNC ? B_SYNC : 0;
+ nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
+ endallocoff = nsize - blkoff(chmp, nsize);
+
+ /*
+ * if we're increasing the file size, deal with expanding
+ * the fragment if there is one.
+ */
+
+ if (nsize > osize && lblkno(chmp, osize) < NDADDR &&
+ lblkno(chmp, osize) != lblkno(chmp, nsize) &&
+ blkroundup(chmp, osize) != osize) {
+ off_t eob;
+
+ eob = blkroundup(chmp, osize);
+ uvm_vnp_setwritesize(vp, eob);
+ error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag);
+ if (error)
+ goto out;
+ if (flags & B_SYNC) {
+ mutex_enter(vp->v_interlock);
+ VOP_PUTPAGES(vp,
+ trunc_page(osize & chmp->chm_fs_bmask),
+ round_page(eob),
+ PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
+ }
+ }
+
+ while (uio->uio_resid > 0) {
+ int ubc_flags = UBC_WRITE;
+ bool overwrite; /* true if we're overwriting a whole block */
+ off_t newoff;
+
+ if (ioflag & IO_DIRECT) {
+ genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED);
+ }
+
+ oldoff = uio->uio_offset;
+ blkoffset = blkoff(chmp, uio->uio_offset);
+ bytelen = MIN(chmp->chm_fs_bsize - blkoffset, uio->uio_resid);
+ if (bytelen == 0) {
+ break;
+ }
+
+ /*
+ * if we're filling in a hole, allocate the blocks now and
+ * initialize the pages first. if we're extending the file,
+ * we can safely allocate blocks without initializing pages
+ * since the new blocks will be inaccessible until the write
+ * is complete.
+ */
+ overwrite = uio->uio_offset >= preallocoff &&
+ uio->uio_offset < endallocoff;
+ if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 &&
+ blkoff(chmp, uio->uio_offset) == 0 &&
+ (uio->uio_offset & PAGE_MASK) == 0) {
+ vsize_t len;
+
+ len = trunc_page(bytelen);
+ len -= blkoff(chmp, len);
+ if (len > 0) {
+ overwrite = true;
+ bytelen = len;
+ }
+ }
+
+ newoff = oldoff + bytelen;
+ if (vp->v_size < newoff) {
+ uvm_vnp_setwritesize(vp, newoff);
+ }
+
+ if (!overwrite) {
+ error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
+ cred, aflag);
+ if (error)
+ break;
+ } else {
+ genfs_node_wrlock(vp);
+ error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
+ aflag, cred);
+ genfs_node_unlock(vp);
+ if (error)
+ break;
+ ubc_flags |= UBC_FAULTBUSY;
+ }
+
+ /*
+ * copy the data.
+ */
+
+ ubc_flags |= UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0;
+ error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
+ IO_ADV_DECODE(ioflag), ubc_flags);
+
+ /*
+ * update UVM's notion of the size now that we've
+ * copied the data into the vnode's pages.
+ *
+ * we should update the size even when uiomove failed.
+ */
+
+ if (vp->v_size < newoff) {
+ uvm_vnp_setsize(vp, newoff);
+ extended = 1;
+ }
+
+ if (error)
+ break;
+
+ /*
+ * flush what we just wrote if necessary.
+ * XXXUBC simplistic async flushing.
+ */
+
+ if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
+ (uio->uio_offset >> 16) << 16,
+ PGO_CLEANIT | PGO_JOURNALLOCKED);
+ if (error)
+ break;
+ }
+ }
+out:
+ if (error == 0 && ioflag & IO_SYNC) {
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp,
+ trunc_page(origoff & chmp->chm_fs_bmask),
+ round_page(blkroundup(chmp, uio->uio_offset)),
+ PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
+ }
+ ip->iflag |= IN_CHANGE | IN_UPDATE;
+ if (resid > uio->uio_resid && ap->a_cred &&
+ kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
+ ip->mode &= ~(ISUID | ISGID);
+ }
+ if (resid > uio->uio_resid)
+ VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
+ if (error) {
+ (void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred);
+ uio->uio_offset -= resid - uio->uio_resid;
+ uio->uio_resid = resid;
+ } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
+ error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
+
+ /* XXX hack: ip->size and vp->v_size can differ at this point, so force them back into sync */
+ chfs_set_vnode_size(vp, vp->v_size);
+
+
+ //dbg("end file size (vp): %llu\n", (unsigned long long)vp->v_size);
+ //dbg("end file size (ip): %llu\n", (unsigned long long)ip->i_size);
+ KASSERT(vp->v_size == ip->size);
+ fstrans_done(vp->v_mount);
+
+ mutex_enter(&chmp->chm_lock_mountfields);
+ error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL);
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ //mutex_exit(&ip->inode_lock);
+ //dbg("end\n");
+ return (error);
+}
+
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_fsync(void *v)
+{
+ //dbg("fsync\n");
+ struct vop_fsync_args /* {
+ struct vnode *a_vp;
+ kauth_cred_t a_cred;
+ int a_flags;
+ off_t offlo;
+ off_t offhi;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ int wait;
+
+ if (ap->a_flags & FSYNC_CACHE) {
+ return ENODEV;
+ }
+ wait = (ap->a_flags & FSYNC_WAIT) != 0;
+ vflushbuf(vp, wait);
+ //struct chfs_inode *ip = VTOI(vp);
+ //chfs_set_vnode_size(vp, ip->write_size);
+
+ return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_remove(void *v)
+{
+ struct vnode *dvp = ((struct vop_remove_args *) v)->a_dvp;
+ struct vnode *vp = ((struct vop_remove_args *) v)->a_vp;
+ struct componentname *cnp = (((struct vop_remove_args *) v)->a_cnp);
+ dbg("remove\n");
+
+ KASSERT(VOP_ISLOCKED(dvp));
+ KASSERT(VOP_ISLOCKED(vp));
+
+ struct chfs_inode *ip = VTOI(vp);
+ struct chfs_inode *parent = VTOI(dvp);
+ int error = 0;
+
+ KASSERT(ip->chvc->vno != ip->chvc->pvno);
+
+ error = chfs_do_unlink(ip,
+ parent, cnp->cn_nameptr, cnp->cn_namelen);
+
+ vput(dvp);
+ vput(vp);
+
+ return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_link(void *v)
+{
+ struct vnode *dvp = ((struct vop_link_args *) v)->a_dvp;
+ struct vnode *vp = ((struct vop_link_args *) v)->a_vp;
+ struct componentname *cnp = ((struct vop_link_args *) v)->a_cnp;
+
+ struct chfs_inode *ip, *parent;
+ int error = 0;
+
+ if (vp->v_type == VDIR) {
+ VOP_ABORTOP(dvp, cnp);
+ error = EISDIR;
+ goto out;
+ }
+ if (dvp->v_mount != vp->v_mount) {
+ VOP_ABORTOP(dvp, cnp);
+ error = EXDEV;
+ goto out;
+ }
+ if (dvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE))) {
+ VOP_ABORTOP(dvp, cnp);
+ goto out;
+ }
+
+ parent = VTOI(dvp);
+ ip = VTOI(vp);
+
+ error = chfs_do_link(ip,
+ parent, cnp->cn_nameptr, cnp->cn_namelen, vp->v_type);
+
+ if (dvp != vp)
+ VOP_UNLOCK(vp);
+out:
+ vput(dvp);
+ return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_rename(void *v)
+{
+ struct vnode *fdvp = ((struct vop_rename_args *) v)->a_fdvp;
+ struct vnode *fvp = ((struct vop_rename_args *) v)->a_fvp;
+ struct componentname *fcnp = ((struct vop_rename_args *) v)->a_fcnp;
+ struct vnode *tdvp = ((struct vop_rename_args *) v)->a_tdvp;
+ struct vnode *tvp = ((struct vop_rename_args *) v)->a_tvp;
+ struct componentname *tcnp = ((struct vop_rename_args *) v)->a_tcnp;
+
+ struct chfs_inode *oldparent, *old;
+ struct chfs_inode *newparent;
+ struct chfs_dirent *fd;//, *oldfd;
+ struct chfs_inode *ip;
+ int error = 0;
+ dbg("rename\n");
+
+ KASSERT(VOP_ISLOCKED(tdvp));
+ KASSERT(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
+
+ oldparent = VTOI(fdvp);
+ old = VTOI(fvp);
+ newparent = VTOI(tdvp);
+ if (tvp) {
+ dbg("tvp not null\n");
+ ip = VTOI(tvp);
+ if (tvp->v_type == VDIR) {
+ //TODO: lock
+// fd = ip->dents;
+// while (fd) {
+ TAILQ_FOREACH(fd, &ip->dents, fds) {
+ if (fd->vno) {
+ //TODO: unlock
+ error = ENOTEMPTY;
+ goto out_unlocked;
+ }
+// fd = fd->next;
+ }
+ //TODO: unlock
+ }
+ error = chfs_do_unlink(ip,
+ newparent, tcnp->cn_nameptr, tcnp->cn_namelen);
+ vput(tvp);
+ }
+ VFS_VGET(tdvp->v_mount, old->ino, &tvp);
+ ip = VTOI(tvp);
+
+// for (oldfd = oldparent->dents;
+// oldfd->vno != old->ino;
+// oldfd = oldfd->next);
+
+ error = chfs_do_link(ip,
+ newparent, tcnp->cn_nameptr, tcnp->cn_namelen, tvp->v_type);
+ error = chfs_do_unlink(old,
+ oldparent, fcnp->cn_nameptr, fcnp->cn_namelen);
+
+//out:
+// if (fchnode != tchnode)
+// VOP_UNLOCK(fdvp, 0);
+
+out_unlocked:
+ // Release target nodes.
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ if (tvp != NULL)
+ vput(tvp);
+
+ // Release source nodes.
+ vrele(fdvp);
+ vrele(fvp);
+
+ return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_mkdir(void *v)
+{
+ struct vnode *dvp = ((struct vop_mkdir_args *) v)->a_dvp;
+ struct vnode **vpp = ((struct vop_mkdir_args *)v)->a_vpp;
+ struct componentname *cnp = ((struct vop_mkdir_args *) v)->a_cnp;
+ struct vattr *vap = ((struct vop_mkdir_args *) v)->a_vap;
+ dbg("mkdir()\n");
+
+ int mode;
+
+ mode = vap->va_mode & ACCESSPERMS;
+ if ((mode & IFMT) == 0) {
+ mode |= IFDIR;
+ }
+
+ KASSERT(vap->va_type == VDIR);
+
+ return chfs_makeinode(mode, dvp, vpp, cnp, VDIR);
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_rmdir(void *v)
+{
+ struct vnode *dvp = ((struct vop_rmdir_args *) v)->a_dvp;
+ struct vnode *vp = ((struct vop_rmdir_args *) v)->a_vp;
+ struct componentname *cnp = ((struct vop_rmdir_args *) v)->a_cnp;
+ dbg("rmdir()\n");
+
+ KASSERT(VOP_ISLOCKED(dvp));
+ KASSERT(VOP_ISLOCKED(vp));
+
+ struct chfs_inode *ip = VTOI(vp);
+ struct chfs_inode *parent = VTOI(dvp);
+ struct chfs_dirent *fd;
+ int error = 0;
+
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+
+ KASSERT(ip->chvc->vno != ip->chvc->pvno);
+
+// for (fd = ip->dents; fd; fd = fd->next) {
+ TAILQ_FOREACH(fd, &ip->dents, fds) {
+ if (fd->vno) {
+ error = ENOTEMPTY;
+ goto out;
+ }
+ }
+
+ error = chfs_do_unlink(ip,
+ parent, cnp->cn_nameptr, cnp->cn_namelen);
+
+out:
+ vput(dvp);
+ vput(vp);
+
+ return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_symlink(void *v)
+{
+ struct vnode *dvp = ((struct vop_symlink_args *) v)->a_dvp;
+ struct vnode **vpp = ((struct vop_symlink_args *) v)->a_vpp;
+ struct componentname *cnp = ((struct vop_symlink_args *) v)->a_cnp;
+ struct vattr *vap = ((struct vop_symlink_args *) v)->a_vap;
+ char *target = ((struct vop_symlink_args *) v)->a_target;
+
+ struct ufsmount *ump;
+ struct chfs_mount *chmp;
+ struct vnode *vp;
+ struct chfs_inode *ip;
+ int len, err;
+ struct chfs_full_dnode *fd;
+ struct buf *bp;
+ dbg("symlink()\n");
+
+ ump = VFSTOUFS(dvp->v_mount);
+ chmp = ump->um_chfs;
+
+ err = chfs_makeinode(IFLNK | vap->va_mode, dvp, vpp, cnp, VLNK);
+ if (err)
+ return (err);
+ VN_KNOTE(dvp, NOTE_WRITE);
+ vp = *vpp;
+ len = strlen(target);
+ ip = VTOI(vp);
+ /* TODO max symlink len instead of "100" */
+ if (len < 100) {
+ ip->target = kmem_alloc(len, KM_SLEEP);
+ memcpy(ip->target, target, len);
+ chfs_set_vnode_size(vp, len);
+ ip->iflag |= IN_CHANGE | IN_UPDATE;
+
+ bp = getiobuf(vp, true);
+ bp->b_bufsize = bp->b_resid = len;
+ bp->b_data = kmem_alloc(len, KM_SLEEP);
+ memcpy(bp->b_data, target, len);
+ bp->b_blkno = 0;
+
+ fd = chfs_alloc_full_dnode();
+
+ mutex_enter(&chmp->chm_lock_mountfields);
+
+ err = chfs_write_flash_dnode(chmp, vp, bp, fd);
+ if (err) {
+ mutex_exit(&chmp->chm_lock_mountfields);
+ goto out;
+ }
+
+ err = chfs_add_full_dnode_to_inode(chmp, ip, fd);
+ if (err) {
+ mutex_exit(&chmp->chm_lock_mountfields);
+ goto out;
+ }
+
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ kmem_free(bp->b_data, len);
+ putiobuf(bp);
+
+ uvm_vnp_setsize(vp, len);
+ } else {
+ err = vn_rdwr(UIO_WRITE, vp, target, len, (off_t)0,
+ UIO_SYSSPACE, IO_NODELOCKED, cnp->cn_cred,
+ (size_t *)0, NULL);
+ }
+
+out:
+ if (err)
+ vput(vp);
+
+ return (err);
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_readdir(void *v)
+{
+ struct vnode *vp = ((struct vop_readdir_args *) v)->a_vp;
+ struct uio *uio = ((struct vop_readdir_args *) v)->a_uio;
+ int *eofflag = ((struct vop_readdir_args *) v)->a_eofflag;
+
+ int error = 0;
+ off_t skip, offset;
+ struct chfs_inode *ip;
+ struct chfs_dirent *fd;
+
+ struct ufsmount *ump;
+ struct chfs_mount *chmp;
+ struct chfs_vnode_cache *chvc;
+
+ KASSERT(VOP_ISLOCKED(vp));
+
+ /* This operation only makes sense on directory nodes. */
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+
+ ip = VTOI(vp);
+
+ /* uiomove in chfs_filldir automatically increments the
+ * uio_offset by an arbitrary size, so we discard any change
+ * to uio_offset and set it to our own value on return
+ */
+ offset = uio->uio_offset;
+
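+	/*
+	 * Directory offsets are synthetic cookies: "." and ".." are
+	 * emitted at CHFS_OFFSET_DOT and CHFS_OFFSET_DOTDOT, regular
+	 * entries start at CHFS_OFFSET_FIRST and CHFS_OFFSET_EOF marks
+	 * the end of the directory.
+	 */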
+ if (offset == CHFS_OFFSET_DOT) {
+ error = chfs_filldir(uio, ip->ino, ".", 1, VDIR);
+ if (error == -1) {
+ error = 0;
+ goto outok;
+ } else if (error != 0)
+ goto outok;
+
+ offset = CHFS_OFFSET_DOTDOT;
+ }
+
+ if (offset == CHFS_OFFSET_DOTDOT) {
+ ump = VFSTOUFS(vp->v_mount);
+ chmp = ump->um_chfs;
+ mutex_enter(&chmp->chm_lock_vnocache);
+ chvc = chfs_vnode_cache_get(chmp, ip->ino);
+ mutex_exit(&chmp->chm_lock_vnocache);
+
+ error = chfs_filldir(uio, chvc->pvno, "..", 2, VDIR);
+ if (error == -1) {
+ error = 0;
+ goto outok;
+ } else if (error != 0) {
+ goto outok;
+ }
+
+ if (TAILQ_EMPTY(&ip->dents)) {
+ offset = CHFS_OFFSET_EOF;
+ } else {
+ offset = CHFS_OFFSET_FIRST;
+ }
+ }
+
+ if (offset != CHFS_OFFSET_EOF) {
+ skip = offset - CHFS_OFFSET_FIRST;
+
+ TAILQ_FOREACH(fd, &ip->dents, fds) {
+ /* seek to offset by skipping items */
+ /* XXX race conditions by changed dirent? */
+ if (skip > 0) {
+ skip--;
+ continue;
+ }
+
+ if (fd->vno != 0) {
+ error = chfs_filldir(uio, fd->vno,
+ fd->name, fd->nsize, fd->type);
+ if (error == -1) {
+ error = 0;
+ goto outok;
+ } else if (error != 0) {
+ dbg("err %d\n", error);
+ goto outok;
+ }
+ }
+ offset++;
+ }
+ }
+ offset = CHFS_OFFSET_EOF;
+
+outok:
+ uio->uio_offset = offset;
+
+ if (eofflag != NULL) {
+ *eofflag = (error == 0 &&
+ uio->uio_offset == CHFS_OFFSET_EOF);
+ }
+
+out:
+ KASSERT(VOP_ISLOCKED(vp));
+
+ return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_readlink(void *v)
+{
+
+ struct vnode *vp = ((struct vop_readlink_args *) v)->a_vp;
+ struct uio *uio = ((struct vop_readlink_args *) v)->a_uio;
+ kauth_cred_t cred = ((struct vop_readlink_args *) v)->a_cred;
+
+ struct chfs_inode *ip = VTOI(vp);
+
+ dbg("readlink()\n");
+
+ /* TODO max symlink len instead of "100" */
+ if (ip->size < 100) {
+ uiomove(ip->target, ip->size, uio);
+ return (0);
+ }
+
+ return (VOP_READ(vp, uio, 0, cred));
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_inactive(void *v)
+{
+ struct vnode *vp = ((struct vop_inactive_args *) v)->a_vp;
+ struct chfs_inode *ip = VTOI(vp);
+ struct chfs_vnode_cache *chvc;
+ dbg("inactive | vno: %llu\n", (unsigned long long)ip->ino);
+
+ KASSERT(VOP_ISLOCKED(vp));
+
+ if (ip->ino) {
+ chvc = ip->chvc;
+ if (chvc->nlink)
+ *((struct vop_inactive_args *) v)->a_recycle = 0;
+ } else {
+ *((struct vop_inactive_args *) v)->a_recycle = 1;
+ }
+
+ VOP_UNLOCK(vp);
+
+ return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_reclaim(void *v)
+{
+ struct vop_reclaim_args *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct chfs_inode *ip = VTOI(vp);
+ struct chfs_mount *chmp = ip->chmp;
+ struct chfs_dirent *fd;
+
+ //dbg("reclaim() | ino: %llu\n", (unsigned long long)ip->ino);
+ //mutex_enter(&ip->inode_lock);
+
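+	/*
+	 * Mark the vnode cache entry as checked/absent, flush any pending
+	 * inode update, then tear down the in-memory state: fragment
+	 * tree, cached dirents, name cache and hash entry.
+	 */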
+ mutex_enter(&chmp->chm_lock_vnocache);
+ chfs_vnode_cache_set_state(chmp,
+ ip->chvc, VNO_STATE_CHECKEDABSENT);
+ mutex_exit(&chmp->chm_lock_vnocache);
+
+ chfs_update(vp, NULL, NULL, UPDATE_CLOSE);
+
+ if (vp->v_type == VREG || vp->v_type == VLNK || vp->v_type == VCHR ||
+ vp->v_type == VBLK || vp->v_type == VFIFO || vp->v_type == VSOCK)
+ chfs_kill_fragtree(&ip->fragtree);
+
+ fd = TAILQ_FIRST(&ip->dents);
+ while(fd) {
+ TAILQ_REMOVE(&ip->dents, fd, fds);
+ chfs_free_dirent(fd);
+ fd = TAILQ_FIRST(&ip->dents);
+ }
+ //mutex_exit(&ip->inode_lock);
+ //mutex_destroy(&ip->inode_lock);
+
+ cache_purge(vp);
+ if (ip->devvp) {
+ vrele(ip->devvp);
+ ip->devvp = 0;
+ }
+ chfs_ihashrem(ip);
+
+ genfs_node_destroy(vp);
+ pool_put(&chfs_inode_pool, vp->v_data);
+ vp->v_data = NULL;
+ return (0);
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_advlock(void *v)
+{
+ //struct vnode *vp = ((struct vop_advlock_args *) v)->a_vp;
+ dbg("advlock()\n");
+ /*
+ struct chfs_node *node;
+
+ node = VP_TO_CHFS_NODE(vp);
+
+ return lf_advlock(v, &node->chn_lockf, node->chn_size);
+ */
+ return 0;
+}
+
+/* --------------------------------------------------------------------- */
+int
+chfs_strategy(void *v)
+{
+ struct vop_strategy_args /* {
+ const struct vnodeop_desc *a_desc;
+ struct vnode *a_vp;
+ struct buf *a_bp;
+ } */ *ap = v;
+ struct chfs_full_dnode *fd;
+ struct buf *bp = ap->a_bp;
+ struct vnode *vp = ap->a_vp;
+ struct chfs_inode *ip = VTOI(vp);
+ struct chfs_mount *chmp = ip->chmp;
+ int read = (bp->b_flags & B_READ) ? 1 : 0;
+ int err = 0;
+
+/* dbg("bp dump:\n");
+ dbg(" ->b_bcount: %d\n", bp->b_bcount);
+ dbg(" ->b_resid: %d\n", bp->b_resid);
+ dbg(" ->b_blkno: %llu\n", (unsigned long long)bp->b_blkno);
+ dbg(" ->b_error: %d\n", bp->b_error);*/
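+	/*
+	 * Reads are served by chfs_read_data(); writes allocate a full
+	 * dnode, push it to the flash through the write buffer and then
+	 * attach it to the inode.
+	 */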
+ if (read) {
+ err = chfs_read_data(chmp, vp, bp);
+ } else {
+ fd = chfs_alloc_full_dnode();
+
+ mutex_enter(&chmp->chm_lock_mountfields);
+
+ err = chfs_write_flash_dnode(chmp, vp, bp, fd);
+ if (err) {
+ mutex_exit(&chmp->chm_lock_mountfields);
+ goto out;
+ }
+
+ err = chfs_add_full_dnode_to_inode(chmp, ip, fd);
+ /*if (err) {
+ mutex_exit(&chmp->chm_lock_mountfields);
+ goto out;
+ }*/
+
+ mutex_exit(&chmp->chm_lock_mountfields);
+ }
+out:
+ biodone(bp);
+ //dbg("end\n");
+ return err;
+}
+
+int
+chfs_bmap(void *v)
+{
+ struct vop_bmap_args /* {
+ struct vnode *a_vp;
+ daddr_t a_bn;
+ struct vnode **a_vpp;
+ daddr_t *a_bnp;
+ int *a_runp;
+ int *a_runb;
+ } */ *ap = v;
+ if (ap->a_vpp != NULL)
+ *ap->a_vpp = ap->a_vp;
+ if (ap->a_bnp != NULL)
+ *ap->a_bnp = ap->a_bn;
+ if (ap->a_runp != NULL)
+ *ap->a_runp = 0;
+ return (0);
+}
+
+/*
+ * vnode operations vector used for files stored in a chfs file system.
+ */
+int
+(**chfs_vnodeop_p)(void *);
+const struct vnodeopv_entry_desc chfs_vnodeop_entries[] =
+ {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, chfs_lookup },
+ { &vop_create_desc, chfs_create },
+ { &vop_mknod_desc, chfs_mknod },
+ { &vop_open_desc, chfs_open },
+ { &vop_close_desc, chfs_close },
+ { &vop_access_desc, chfs_access },
+ { &vop_getattr_desc, chfs_getattr },
+ { &vop_setattr_desc, chfs_setattr },
+ { &vop_read_desc, chfs_read },
+ { &vop_write_desc, chfs_write },
+ { &vop_ioctl_desc, genfs_enoioctl },
+ { &vop_fcntl_desc, genfs_fcntl },
+ { &vop_poll_desc, genfs_poll },
+ { &vop_kqfilter_desc, genfs_kqfilter },
+ { &vop_revoke_desc, genfs_revoke },
+ { &vop_mmap_desc, genfs_mmap },
+ { &vop_fsync_desc, chfs_fsync },
+ { &vop_seek_desc, genfs_seek },
+ { &vop_remove_desc, chfs_remove },
+ { &vop_link_desc, chfs_link },
+ { &vop_rename_desc, chfs_rename },
+ { &vop_mkdir_desc, chfs_mkdir },
+ { &vop_rmdir_desc, chfs_rmdir },
+ { &vop_symlink_desc, chfs_symlink },
+ { &vop_readdir_desc, chfs_readdir },
+ { &vop_readlink_desc, chfs_readlink },
+ { &vop_abortop_desc, genfs_abortop },
+ { &vop_inactive_desc, chfs_inactive },
+ { &vop_reclaim_desc, chfs_reclaim },
+ { &vop_lock_desc, genfs_lock },
+ { &vop_unlock_desc, genfs_unlock },
+ { &vop_bmap_desc, chfs_bmap },
+ { &vop_strategy_desc, chfs_strategy },
+ { &vop_print_desc, ufs_print },
+ { &vop_pathconf_desc, ufs_pathconf },
+ { &vop_islocked_desc, genfs_islocked },
+ { &vop_advlock_desc, chfs_advlock },
+ { &vop_bwrite_desc, vn_bwrite },
+ { &vop_getpages_desc, genfs_getpages },
+ { &vop_putpages_desc, genfs_putpages },
+ { NULL, NULL } };
+
+const struct vnodeopv_desc chfs_vnodeop_opv_desc =
+ { &chfs_vnodeop_p, chfs_vnodeop_entries };
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * vnode operations vector used for special devices stored in a chfs
+ * file system.
+ */
+int
+(**chfs_specop_p)(void *);
+const struct vnodeopv_entry_desc chfs_specop_entries[] =
+ {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, spec_lookup },
+ { &vop_create_desc, spec_create },
+ { &vop_mknod_desc, spec_mknod },
+ { &vop_open_desc, spec_open },
+ { &vop_close_desc, ufsspec_close },
+ { &vop_access_desc, chfs_access },
+ { &vop_getattr_desc, chfs_getattr },
+ { &vop_setattr_desc, chfs_setattr },
+ { &vop_read_desc, chfs_read },
+ { &vop_write_desc, chfs_write },
+ { &vop_ioctl_desc, spec_ioctl },
+ { &vop_fcntl_desc, genfs_fcntl },
+ { &vop_poll_desc, spec_poll },
+ { &vop_kqfilter_desc, spec_kqfilter },
+ { &vop_revoke_desc, spec_revoke },
+ { &vop_mmap_desc, spec_mmap },
+ { &vop_fsync_desc, spec_fsync },
+ { &vop_seek_desc, spec_seek },
+ { &vop_remove_desc, spec_remove },
+ { &vop_link_desc, spec_link },
+ { &vop_rename_desc, spec_rename },
+ { &vop_mkdir_desc, spec_mkdir },
+ { &vop_rmdir_desc, spec_rmdir },
+ { &vop_symlink_desc, spec_symlink },
+ { &vop_readdir_desc, spec_readdir },
+ { &vop_readlink_desc, spec_readlink },
+ { &vop_abortop_desc, spec_abortop },
+ { &vop_inactive_desc, chfs_inactive },
+ { &vop_reclaim_desc, chfs_reclaim },
+ { &vop_lock_desc, genfs_lock },
+ { &vop_unlock_desc, genfs_unlock },
+ { &vop_bmap_desc, spec_bmap },
+ { &vop_strategy_desc, spec_strategy },
+ { &vop_print_desc, ufs_print },
+ { &vop_pathconf_desc, spec_pathconf },
+ { &vop_islocked_desc, genfs_islocked },
+ { &vop_advlock_desc, spec_advlock },
+ { &vop_bwrite_desc, vn_bwrite },
+ { &vop_getpages_desc, spec_getpages },
+ { &vop_putpages_desc, spec_putpages },
+ { NULL, NULL } };
+
+const struct vnodeopv_desc chfs_specop_opv_desc =
+ { &chfs_specop_p, chfs_specop_entries };
+
+/* --------------------------------------------------------------------- */
+/*
+ * vnode operations vector used for fifos stored in a chfs file system.
+ */
+int
+(**chfs_fifoop_p)(void *);
+const struct vnodeopv_entry_desc chfs_fifoop_entries[] =
+ {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, vn_fifo_bypass },
+ { &vop_create_desc, vn_fifo_bypass },
+ { &vop_mknod_desc, vn_fifo_bypass },
+ { &vop_open_desc, vn_fifo_bypass },
+ { &vop_close_desc, ufsfifo_close },
+ { &vop_access_desc, chfs_access },
+ { &vop_getattr_desc, chfs_getattr },
+ { &vop_setattr_desc, chfs_setattr },
+ { &vop_read_desc, ufsfifo_read },
+ { &vop_write_desc, ufsfifo_write },
+ { &vop_ioctl_desc, vn_fifo_bypass },
+ { &vop_fcntl_desc, genfs_fcntl },
+ { &vop_poll_desc, vn_fifo_bypass },
+ { &vop_kqfilter_desc, vn_fifo_bypass },
+ { &vop_revoke_desc, vn_fifo_bypass },
+ { &vop_mmap_desc, vn_fifo_bypass },
+ { &vop_fsync_desc, vn_fifo_bypass },
+ { &vop_seek_desc, vn_fifo_bypass },
+ { &vop_remove_desc, vn_fifo_bypass },
+ { &vop_link_desc, vn_fifo_bypass },
+ { &vop_rename_desc, vn_fifo_bypass },
+ { &vop_mkdir_desc, vn_fifo_bypass },
+ { &vop_rmdir_desc, vn_fifo_bypass },
+ { &vop_symlink_desc, vn_fifo_bypass },
+ { &vop_readdir_desc, vn_fifo_bypass },
+ { &vop_readlink_desc, vn_fifo_bypass },
+ { &vop_abortop_desc, vn_fifo_bypass },
+ { &vop_inactive_desc, chfs_inactive },
+ { &vop_reclaim_desc, chfs_reclaim },
+ { &vop_lock_desc, genfs_lock },
+ { &vop_unlock_desc, genfs_unlock },
+ { &vop_bmap_desc, vn_fifo_bypass },
+ { &vop_strategy_desc, vn_fifo_bypass },
+ { &vop_print_desc, ufs_print },
+ { &vop_pathconf_desc, vn_fifo_bypass },
+ { &vop_islocked_desc, genfs_islocked },
+ { &vop_advlock_desc, vn_fifo_bypass },
+ { &vop_bwrite_desc, genfs_nullop },
+ { &vop_getpages_desc, genfs_badop },
+ { &vop_putpages_desc, vn_fifo_bypass },
+ { NULL, NULL } };
+
+const struct vnodeopv_desc chfs_fifoop_opv_desc =
+ { &chfs_fifoop_p, chfs_fifoop_entries };
--- /dev/null
+/* $NetBSD: chfs_wbuf.c,v 1.2 2011/11/24 20:50:33 agc Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <dev/flash/flash.h>
+#include <sys/uio.h>
+#include "chfs.h"
+//#include </root/xipffs/netbsd.chfs/chfs.h>
+
+#define DBG_WBUF 1
+
+#define PAD(x) (((x)+3)&~3)
+
+#define EB_ADDRESS(x) ( ((unsigned long)(x) / chmp->chm_ebh->eb_size) * chmp->chm_ebh->eb_size )
+
+#define PAGE_DIV(x) ( ((unsigned long)(x) / (unsigned long)(chmp->chm_wbuf_pagesize)) * (unsigned long)(chmp->chm_wbuf_pagesize) )
+#define PAGE_MOD(x) ( (unsigned long)(x) % (unsigned long)(chmp->chm_wbuf_pagesize) )
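+/*
+ * EB_ADDRESS() rounds an offset down to the start of its eraseblock;
+ * PAGE_DIV() rounds it down to a write buffer page boundary and
+ * PAGE_MOD() gives the remainder within that page.
+ */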
+
+/*
+// test functions
+int wbuf_test(void);
+void wbuf_test_erase_flash(struct chfs_mount*);
+void wbuf_test_callback(struct erase_instruction*);
+*/
+
+#define NOPAD 0
+#define SETPAD 1
+
+
+/**
+ * chfs_flush_wbuf - write wbuf to the flash
+ * @chmp: super block info
+ * @pad: padding (NOPAD / SETPAD)
+ * Returns zero in case of success.
+ */
+static int
+chfs_flush_wbuf(struct chfs_mount *chmp, int pad)
+{
+ int ret=0;
+ size_t retlen = 0;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+ KASSERT(rw_write_held(&chmp->chm_lock_wbuf));
+
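+	/*
+	 * When padding is requested, the unused tail of the write buffer
+	 * page is filled with a padding node; the node's nref is marked
+	 * obsolete so the space is accounted as wasted rather than free.
+	 */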
+ if (pad) {
+ chmp->chm_wbuf_len = PAD(chmp->chm_wbuf_len);
+ memset(chmp->chm_wbuf + chmp->chm_wbuf_len, 0, chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len);
+
+ struct chfs_flash_padding_node* padnode = (void*)(chmp->chm_wbuf + chmp->chm_wbuf_len);
+ padnode->magic = htole16(CHFS_FS_MAGIC_BITMASK);
+ padnode->type = htole16(CHFS_NODETYPE_PADDING);
+ padnode->length = htole32(chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len);
+ padnode->hdr_crc = htole32(crc32(0, (uint8_t *)padnode, sizeof(*padnode)-4));
+
+ struct chfs_node_ref *nref;
+ nref = chfs_alloc_node_ref(chmp->chm_nextblock);
+ nref->nref_offset = chmp->chm_wbuf_ofs + chmp->chm_wbuf_len;
+ nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) |
+ CHFS_OBSOLETE_NODE_MASK;
+ chmp->chm_wbuf_len = chmp->chm_wbuf_pagesize;
+
+ chfs_change_size_free(chmp, chmp->chm_nextblock, -padnode->length);
+ chfs_change_size_wasted(chmp, chmp->chm_nextblock, padnode->length);
+ }
+
+ ret = chfs_write_leb(chmp, chmp->chm_nextblock->lnr, chmp->chm_wbuf, chmp->chm_wbuf_ofs, chmp->chm_wbuf_len, &retlen);
+ if(ret) {
+ return ret;
+ }
+
+ memset(chmp->chm_wbuf,0xff,chmp->chm_wbuf_pagesize);
+ chmp->chm_wbuf_ofs += chmp->chm_wbuf_pagesize;
+ chmp->chm_wbuf_len = 0;
+ return 0;
+}
+
+
+/**
+ * chfs_fill_wbuf - write to wbuf
+ * @chmp: super block info
+ * @buf: buffer
+ * @len: buffer length
+ * Return the len of the buf what we didn't write to the wbuf.
+ */
+static size_t
+chfs_fill_wbuf(struct chfs_mount *chmp, const u_char *buf, size_t len)
+{
+ if (len && !chmp->chm_wbuf_len && (len >= chmp->chm_wbuf_pagesize)) {
+ return 0;
+ }
+ if (len > (chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len)) {
+ len = chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len;
+ }
+ memcpy(chmp->chm_wbuf + chmp->chm_wbuf_len, buf, len);
+
+ chmp->chm_wbuf_len += (int) len;
+ return len;
+}
+
+/**
+ * chfs_write_wbuf - write to wbuf and then the flash
+ * @chmp: super block info
+ * @invecs: io vectors
+ * @count: num of vectors
+ * @to: offset of target
+ * @retlen: number of bytes written
+ * Returns zero in case of success.
+ */
+int
+chfs_write_wbuf(struct chfs_mount* chmp, const struct iovec *invecs, long count,
+ off_t to, size_t *retlen)
+{
+ int invec, ret = 0;
+ size_t wbuf_retlen, donelen = 0;
+ int outvec_to = to;
+
+ int lnr = chmp->chm_nextblock->lnr;
+
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+ KASSERT(!rw_write_held(&chmp->chm_lock_wbuf));
+
+ rw_enter(&chmp->chm_lock_wbuf, RW_WRITER);
+
+ //dbg("1. wbuf ofs: %zu, len: %zu\n", chmp->chm_wbuf_ofs, chmp->chm_wbuf_len);
+
+ if (chmp->chm_wbuf_ofs == 0xffffffff) {
+ chmp->chm_wbuf_ofs = PAGE_DIV(to);
+ chmp->chm_wbuf_len = PAGE_MOD(to);
+ memset(chmp->chm_wbuf, 0xff, chmp->chm_wbuf_pagesize);
+ }
+
+ //dbg("2. wbuf ofs: %zu, len: %zu\n", chmp->chm_wbuf_ofs, chmp->chm_wbuf_len);
+
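+	/*
+	 * If the target offset lies in a different eraseblock than the
+	 * currently buffered data, flush the wbuf with padding and
+	 * restart it at the new page-aligned position.
+	 */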
+ if (EB_ADDRESS(to) != EB_ADDRESS(chmp->chm_wbuf_ofs)) {
+ if (chmp->chm_wbuf_len) {
+ ret = chfs_flush_wbuf(chmp, SETPAD);
+ if (ret)
+ goto outerr;
+ }
+ chmp->chm_wbuf_ofs = PAGE_DIV(to);
+ chmp->chm_wbuf_len = PAGE_MOD(to);
+ }
+
+ //dbg("3. wbuf ofs: %zu, len: %zu\n", chmp->chm_wbuf_ofs, chmp->chm_wbuf_len);
+
+ if (to != PAD(chmp->chm_wbuf_ofs + chmp->chm_wbuf_len)) {
+ dbg("to: %llu != %zu\n", (unsigned long long)to,
+ PAD(chmp->chm_wbuf_ofs + chmp->chm_wbuf_len));
+ dbg("Non-contiguous write\n");
+ panic("BUG\n");
+ }
+
+ /* adjust alignment offset */
+ if (chmp->chm_wbuf_len != PAGE_MOD(to)) {
+ chmp->chm_wbuf_len = PAGE_MOD(to);
+		/* take care of alignment to the next page */
+ if (!chmp->chm_wbuf_len) {
+ chmp->chm_wbuf_len += chmp->chm_wbuf_pagesize;
+ ret = chfs_flush_wbuf(chmp, NOPAD);
+ if (ret)
+ goto outerr;
+ }
+ }
+
+ for (invec = 0; invec < count; invec++) {
+ int vlen = invecs[invec].iov_len;
+ u_char* v = invecs[invec].iov_base;
+
+ //dbg("invec:%d len:%d\n", invec, vlen);
+
+ wbuf_retlen = chfs_fill_wbuf(chmp, v, vlen);
+ if (chmp->chm_wbuf_len == chmp->chm_wbuf_pagesize) {
+ ret = chfs_flush_wbuf(chmp, NOPAD);
+ if (ret) {
+ goto outerr;
+ }
+ }
+ vlen -= wbuf_retlen;
+ outvec_to += wbuf_retlen;
+ v += wbuf_retlen;
+ donelen += wbuf_retlen;
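+		/* Whole wbuf pages are written directly to the flash,
+		 * bypassing the write buffer. */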
+ if (vlen >= chmp->chm_wbuf_pagesize) {
+ ret = chfs_write_leb(chmp, lnr, v, outvec_to, PAGE_DIV(vlen), &wbuf_retlen);
+ //dbg("fd->write: %zu\n", wbuf_retlen);
+ vlen -= wbuf_retlen;
+ outvec_to += wbuf_retlen;
+ chmp->chm_wbuf_ofs = outvec_to;
+ v += wbuf_retlen;
+ donelen += wbuf_retlen;
+ }
+ wbuf_retlen = chfs_fill_wbuf(chmp, v, vlen);
+ if (chmp->chm_wbuf_len == chmp->chm_wbuf_pagesize) {
+ ret = chfs_flush_wbuf(chmp, NOPAD);
+ if (ret)
+ goto outerr;
+ }
+
+ // if we write the last vector, we flush with padding
+ /*if (invec == count-1) {
+ ret = chfs_flush_wbuf(chmp, SETPAD);
+ if (ret)
+ goto outerr;
+ }*/
+ outvec_to += wbuf_retlen;
+ donelen += wbuf_retlen;
+ }
+ *retlen = donelen;
+ rw_exit(&chmp->chm_lock_wbuf);
+ return ret;
+
+outerr:
+ *retlen = 0;
+ return ret;
+}
+
+int chfs_flush_pending_wbuf(struct chfs_mount *chmp)
+{
+ //dbg("flush pending wbuf\n");
+ int err;
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+ mutex_enter(&chmp->chm_lock_sizes);
+ rw_enter(&chmp->chm_lock_wbuf, RW_WRITER);
+ err = chfs_flush_wbuf(chmp, SETPAD);
+ rw_exit(&chmp->chm_lock_wbuf);
+ mutex_exit(&chmp->chm_lock_sizes);
+ return err;
+}
--- /dev/null
+/* $NetBSD: chfs_write.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (C) 2010 David Tengeri <dtengeri@inf.u-szeged.hu>
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * chfs_write.c
+ *
+ * Created on: 2010.02.17.
+ * Author: dtengeri
+ */
+
+#include <sys/param.h>
+#include <sys/buf.h>
+
+#include "chfs.h"
+
+int
+chfs_write_flash_vnode(struct chfs_mount *chmp,
+ struct chfs_inode *ip, int prio)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ struct chfs_flash_vnode *fvnode;
+ struct chfs_vnode_cache* chvc;
+ struct chfs_node_ref *nref;
+ struct iovec vec;
+ size_t size, retlen;
+ int err = 0, retries = 0;
+
+ if (ip->ino == CHFS_ROOTINO)
+ return 0;
+
+ fvnode = chfs_alloc_flash_vnode();
+ if (!fvnode)
+ return ENOMEM;
+
+ chvc = ip->chvc;
+
+ /* setting up flash_vnode members */
+ size = sizeof(*fvnode);
+ //dbg("size: %zu | PADDED: %zu\n", size, CHFS_PAD(size));
+ fvnode->magic = htole16(CHFS_FS_MAGIC_BITMASK);
+ fvnode->type = htole16(CHFS_NODETYPE_VNODE);
+ fvnode->length = htole32(CHFS_PAD(size));
+ fvnode->hdr_crc = htole32(crc32(0, (uint8_t *)fvnode,
+ CHFS_NODE_HDR_SIZE - 4));
+ fvnode->vno = htole64(ip->ino);
+ fvnode->version = htole64(++ip->chvc->highest_version);
+ fvnode->mode = htole32(ip->mode);
+ fvnode->dn_size = htole32(ip->size);
+ fvnode->atime = htole32(ip->atime);
+ fvnode->ctime = htole32(ip->ctime);
+ fvnode->mtime = htole32(ip->mtime);
+ fvnode->gid = htole32(ip->gid);
+ fvnode->uid = htole32(ip->uid);
+ fvnode->node_crc = htole32(crc32(0, (uint8_t *)fvnode, size - 4));
+
+ /* write out flash_vnode */
+retry:
+ if (prio == ALLOC_GC) {
+ /* the GC calls this function */
+ err = chfs_reserve_space_gc(chmp, CHFS_PAD(size));
+ if (err)
+ goto out;
+ } else {
+ chfs_gc_trigger(chmp);
+ if (prio == ALLOC_NORMAL)
+ err = chfs_reserve_space_normal(chmp,
+ CHFS_PAD(size), ALLOC_NORMAL);
+ else
+ err = chfs_reserve_space_normal(chmp,
+ CHFS_PAD(size), ALLOC_DELETION);
+ if (err)
+ goto out;
+ }
+
+ nref = chfs_alloc_node_ref(chmp->chm_nextblock);
+ if (!nref) {
+ err = ENOMEM;
+ goto out;
+ }
+
+ mutex_enter(&chmp->chm_lock_sizes);
+
+ nref->nref_offset = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size;
+ chfs_change_size_free(chmp, chmp->chm_nextblock, -CHFS_PAD(size));
+ vec.iov_base = fvnode;
+ vec.iov_len = CHFS_PAD(size);
+ err = chfs_write_wbuf(chmp, &vec, 1, nref->nref_offset, &retlen);
+ if (err || retlen != CHFS_PAD(size)) {
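+		/*
+		 * The write failed: account the reserved space as dirty and
+		 * retry once with freshly reserved space before giving up
+		 * with EIO.
+		 */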
+ chfs_err("error while writing out flash vnode to the media\n");
+ chfs_err("err: %d | size: %zu | retlen : %zu\n",
+ err, CHFS_PAD(size), retlen);
+ chfs_change_size_dirty(chmp,
+ chmp->chm_nextblock, CHFS_PAD(size));
+ if (retries) {
+ err = EIO;
+ mutex_exit(&chmp->chm_lock_sizes);
+ goto out;
+ }
+
+ retries++;
+ mutex_exit(&chmp->chm_lock_sizes);
+ goto retry;
+ }
+ //Everything went well
+ chfs_change_size_used(chmp,
+ &chmp->chm_blocks[nref->nref_lnr], CHFS_PAD(size));
+ mutex_exit(&chmp->chm_lock_sizes);
+
+ chfs_add_vnode_ref_to_vc(chmp, chvc, nref);
+ KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size);
+out:
+ chfs_free_flash_vnode(fvnode);
+ return err;
+}
+
+int
+chfs_write_flash_dirent(struct chfs_mount *chmp, struct chfs_inode *pdir,
+ struct chfs_inode *ip, struct chfs_dirent *fd,
+ ino_t ino, int prio)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ struct chfs_flash_dirent_node *fdirent;
+ struct chfs_node_ref *nref;
+ struct iovec vec[2];
+ size_t size, retlen;
+ int err = 0, retries = 0;
+ uint8_t *name;
+ size_t namelen;
+
+ KASSERT(fd->vno != CHFS_ROOTINO);
+
+ fdirent = chfs_alloc_flash_dirent();
+ if (!fdirent)
+ return ENOMEM;
+
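+	/*
+	 * The on-flash dirent is padded to CHFS_PAD(size); use a zeroed
+	 * name buffer of the padded length so the record's tail is
+	 * written as zeroes.
+	 */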
+ size = sizeof(*fdirent) + fd->nsize;
+ namelen = CHFS_PAD(size) - sizeof(*fdirent);
+
+ name = kmem_zalloc(namelen, KM_SLEEP);
+ memcpy(name, fd->name, fd->nsize);
+ //dbg("namelen: %zu | nsize: %hhu\n", namelen, fd->nsize);
+
+
+ //dbg("size: %zu | PADDED: %zu\n", size, CHFS_PAD(size));
+ fdirent->magic = htole16(CHFS_FS_MAGIC_BITMASK);
+ fdirent->type = htole16(CHFS_NODETYPE_DIRENT);
+ fdirent->length = htole32(CHFS_PAD(size));
+ fdirent->hdr_crc = htole32(crc32(0, (uint8_t *)fdirent,
+ CHFS_NODE_HDR_SIZE - 4));
+ fdirent->vno = htole64(ino);
+ fdirent->pvno = htole64(pdir->ino);
+ fdirent->version = htole64(++pdir->chvc->highest_version);
+ fdirent->mctime = ip?ip->ctime:0;
+ fdirent->nsize = fd->nsize;
+ fdirent->dtype = fd->type;
+ fdirent->name_crc = crc32(0, (uint8_t *)&(fd->name), fd->nsize);
+ fdirent->node_crc = crc32(0, (uint8_t *)fdirent, sizeof(*fdirent) - 4);
+
+ vec[0].iov_base = fdirent;
+ vec[0].iov_len = sizeof(*fdirent);
+ vec[1].iov_base = name;
+ vec[1].iov_len = namelen;
+
+retry:
+ if (prio == ALLOC_GC) {
+ /* the GC calls this function */
+ err = chfs_reserve_space_gc(chmp, CHFS_PAD(size));
+ if (err)
+ goto out;
+ } else {
+ chfs_gc_trigger(chmp);
+ if (prio == ALLOC_NORMAL)
+ err = chfs_reserve_space_normal(chmp,
+ CHFS_PAD(size), ALLOC_NORMAL);
+ else
+ err = chfs_reserve_space_normal(chmp,
+ CHFS_PAD(size), ALLOC_DELETION);
+ if (err)
+ goto out;
+ }
+
+ nref = chfs_alloc_node_ref(chmp->chm_nextblock);
+ if (!nref) {
+ err = ENOMEM;
+ goto out;
+ }
+
+ mutex_enter(&chmp->chm_lock_sizes);
+
+ nref->nref_offset = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size;
+ chfs_change_size_free(chmp, chmp->chm_nextblock, -CHFS_PAD(size));
+
+ err = chfs_write_wbuf(chmp, vec, 2, nref->nref_offset, &retlen);
+ if (err || retlen != CHFS_PAD(size)) {
+ chfs_err("error while writing out flash dirent node to the media\n");
+ chfs_err("err: %d | size: %zu | retlen : %zu\n",
+ err, CHFS_PAD(size), retlen);
+ chfs_change_size_dirty(chmp,
+ chmp->chm_nextblock, CHFS_PAD(size));
+ if (retries) {
+ err = EIO;
+ mutex_exit(&chmp->chm_lock_sizes);
+ goto out;
+ }
+
+ retries++;
+ mutex_exit(&chmp->chm_lock_sizes);
+ goto retry;
+ }
+
+
+ // Everything went well
+ chfs_change_size_used(chmp,
+ &chmp->chm_blocks[nref->nref_lnr], CHFS_PAD(size));
+ mutex_exit(&chmp->chm_lock_sizes);
+ KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size);
+ fd->nref = nref;
+ if (prio != ALLOC_DELETION) {
+ chfs_add_node_to_list(chmp,
+ pdir->chvc, nref, &pdir->chvc->dirents);
+ }
+out:
+ chfs_free_flash_dirent(fdirent);
+ return err;
+}
+
+/**
+ * chfs_write_flash_dnode - write out a data node to flash
+ * @chmp: chfs mount structure
+ * @vp: vnode the data belongs to
+ * @bp: buffer containing the data
+ * @fd: full data node that will describe the written data
+ */
+int
+chfs_write_flash_dnode(struct chfs_mount *chmp, struct vnode *vp,
+ struct buf *bp, struct chfs_full_dnode *fd)
+{
+ KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+ int err = 0, retries = 0;
+ size_t size, retlen;
+ off_t ofs;
+ struct chfs_flash_data_node *dnode;
+ struct chfs_node_ref *nref;
+ struct chfs_inode *ip = VTOI(vp);
+ struct iovec vec[2];
+ uint32_t len;
+ void *tmpbuf = NULL;
+
+ KASSERT(ip->ino != CHFS_ROOTINO);
+
+ dnode = chfs_alloc_flash_dnode();
+ if (!dnode)
+ return ENOMEM;
+
+ /* initialize flash data node */
+ ofs = bp->b_blkno * PAGE_SIZE;
+ //dbg("vp->v_size: %ju, bp->b_blkno: %ju, bp-b_data: %p,"
+ // " bp->b_resid: %ju\n",
+ // (uintmax_t )vp->v_size, (uintmax_t )bp->b_blkno,
+ // bp->b_data, (uintmax_t )bp->b_resid);
+ //dbg("[XXX]vp->v_size - ofs: %llu\n", (vp->v_size - ofs));
+ len = MIN((vp->v_size - ofs), bp->b_resid);
+ size = sizeof(*dnode) + len;
+
+ dnode->magic = htole16(CHFS_FS_MAGIC_BITMASK);
+ dnode->type = htole16(CHFS_NODETYPE_DATA);
+ dnode->length = htole32(CHFS_PAD(size));
+ dnode->hdr_crc = htole32(crc32(0, (uint8_t *)dnode,
+ CHFS_NODE_HDR_SIZE - 4));
+ dnode->vno = htole64(ip->ino);
+ dnode->version = htole64(++ip->chvc->highest_version);
+ dnode->offset = htole64(ofs);
+ dnode->data_length = htole32(len);
+ dnode->data_crc = htole32(crc32(0, (uint8_t *)bp->b_data, len));
+ dnode->node_crc = htole32(crc32(0, (uint8_t *)dnode,
+ sizeof(*dnode) - 4));
+
+ dbg("dnode @%llu %ub v%llu\n", (unsigned long long)dnode->offset,
+ dnode->data_length, (unsigned long long)dnode->version);
+
+ if (CHFS_PAD(size) - sizeof(*dnode)) {
+ tmpbuf = kmem_zalloc(CHFS_PAD(size)
+ - sizeof(*dnode), KM_SLEEP);
+ memcpy(tmpbuf, bp->b_data, len);
+ }
+
+ /* creating iovecs for wbuf */
+ vec[0].iov_base = dnode;
+ vec[0].iov_len = sizeof(*dnode);
+ vec[1].iov_base = tmpbuf;
+ vec[1].iov_len = CHFS_PAD(size) - sizeof(*dnode);
+
+ fd->frags = 0;
+ fd->ofs = ofs;
+ fd->size = len;
+
+retry:
+
+	/* Reserve space for the data node. This selects the next
+	 * eraseblock to which we will write.
+	 */
+
+ chfs_gc_trigger(chmp);
+ err = chfs_reserve_space_normal(chmp,
+ CHFS_PAD(size), ALLOC_NORMAL);
+ if (err)
+ goto out;
+
+ nref = chfs_alloc_node_ref(chmp->chm_nextblock);
+ if (!nref) {
+ err = ENOMEM;
+ goto out;
+ }
+
+ nref->nref_offset =
+ chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size;
+
+ KASSERT(nref->nref_offset < chmp->chm_ebh->eb_size);
+
+ mutex_enter(&chmp->chm_lock_sizes);
+
+ chfs_change_size_free(chmp,
+ chmp->chm_nextblock, -CHFS_PAD(size));
+
+ //dbg("vno: %llu nref lnr: %u offset: %u\n",
+ // dnode->vno, nref->nref_lnr, nref->nref_offset);
+
+ err = chfs_write_wbuf(chmp, vec, 2, nref->nref_offset, &retlen);
+ if (err || retlen != CHFS_PAD(size)) {
+ chfs_err("error while writing out flash data node to the media\n");
+ chfs_err("err: %d | size: %zu | retlen : %zu\n",
+ err, size, retlen);
+ chfs_change_size_dirty(chmp,
+ chmp->chm_nextblock, CHFS_PAD(size));
+ if (retries) {
+ err = EIO;
+ mutex_exit(&chmp->chm_lock_sizes);
+ goto out;
+ }
+
+ retries++;
+ mutex_exit(&chmp->chm_lock_sizes);
+ goto retry;
+ }
+ /* Everything went well */
+ ip->write_size += fd->size;
+ chfs_change_size_used(chmp,
+ &chmp->chm_blocks[nref->nref_lnr], CHFS_PAD(size));
+ mutex_exit(&chmp->chm_lock_sizes);
+
+ KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size);
+ fd->nref = nref;
+ chfs_add_node_to_list(chmp, ip->chvc, nref, &ip->chvc->dnode);
+out:
+ chfs_free_flash_dnode(dnode);
+ if (CHFS_PAD(size) - sizeof(*dnode)) {
+ kmem_free(tmpbuf, CHFS_PAD(size) - sizeof(*dnode));
+ }
+
+ return err;
+}
+
+/**
+ * chfs_do_link - create a new link to a node
+ * @ip: node to link
+ * @parent: parent directory of the new link
+ * @name: name of the new link
+ * @namelen: length of name
+ * @type: vnode type of the linked node
+ * This function writes the dirent of the new link to the media.
+ */
+int
+chfs_do_link(struct chfs_inode *ip, struct chfs_inode *parent, const char *name, int namelen, enum vtype type)
+{
+ int error = 0;
+ struct vnode *vp = ITOV(ip);
+ struct ufsmount *ump = VFSTOUFS(vp->v_mount);
+ struct chfs_mount *chmp = ump->um_chfs;
+ struct chfs_dirent *newfd = NULL;
+// struct chfs_dirent *fd = NULL;
+
+ //dbg("link vno: %llu\n", ip->ino);
+
+ newfd = chfs_alloc_dirent(namelen + 1);
+
+ newfd->vno = ip->ino;
+ newfd->type = type;
+ newfd->nsize = namelen;
+ memcpy(newfd->name, name, namelen);
+ newfd->name[newfd->nsize] = 0;
+// newfd->next = NULL;
+
+ ip->chvc->nlink++;
+ parent->chvc->nlink++;
+ ip->iflag |= IN_CHANGE;
+ chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+
+ mutex_enter(&chmp->chm_lock_mountfields);
+
+ error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL);
+ if (error)
+ return error;
+
+ error = chfs_write_flash_dirent(chmp,
+ parent, ip, newfd, ip->ino, ALLOC_NORMAL);
+ /* TODO: what should we do if error isn't zero? */
+
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ /* add fd to the fd list */
+ TAILQ_INSERT_TAIL(&parent->dents, newfd, fds);
+#if 0
+ fd = parent->dents;
+ if (!fd) {
+ parent->dents = newfd;
+ } else {
+ while (fd->next)
+ fd = fd->next;
+ fd->next = newfd;
+ }
+#endif
+
+ return error;
+}
+
+
+/**
+ * chfs_do_unlink - delete a node
+ * @ip: node to delete
+ * @parent: parent of the node
+ * @name: name of the node
+ * @namelen: length of name
+ * This function drops the node's link count and writes out a deletion dirent (vno == 0) to the media.
+ */
+int
+chfs_do_unlink(struct chfs_inode *ip,
+ struct chfs_inode *parent, const char *name, int namelen)
+{
+ struct chfs_dirent *fd, *tmpfd;
+ int error = 0;
+ struct vnode *vp = ITOV(ip);
+ struct ufsmount *ump = VFSTOUFS(vp->v_mount);
+ struct chfs_mount *chmp = ump->um_chfs;
+ struct chfs_node_ref *nref;
+
+ //dbg("unlink vno: %llu\n", ip->ino);
+
+ vflushbuf(vp, 0);
+
+ mutex_enter(&chmp->chm_lock_mountfields);
+
+ /* remove the full direntry from the parent dents list */
+ TAILQ_FOREACH_SAFE(fd, &parent->dents, fds, tmpfd) {
+ if (fd->vno == ip->ino &&
+ fd->nsize == namelen &&
+ !memcmp(fd->name, name, fd->nsize)) {
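+			/*
+			 * A directory that is otherwise empty (nlink == 2)
+			 * is dropped to zero links at once; any other node
+			 * loses a single link.
+			 */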
+ if (fd->type == VDIR && ip->chvc->nlink == 2)
+ ip->chvc->nlink = 0;
+ else
+ ip->chvc->nlink--;
+
+ fd->type = VNON;
+
+ TAILQ_REMOVE(&parent->dents, fd, fds);
+
+ /* remove nref from dirents list */
+ nref = parent->chvc->dirents;
+ if (nref == fd->nref) {
+ nref->nref_next = fd->nref->nref_next;
+ } else {
+ while (nref->nref_next && nref->nref_next != fd->nref)
+ nref = nref->nref_next;
+ if (nref->nref_next)
+ nref->nref_next = fd->nref->nref_next;
+ }
+
+ //dbg("FD->NREF vno: %llu, lnr: %u, ofs: %u\n",
+ // fd->vno, fd->nref->nref_lnr, fd->nref->nref_offset);
+ chfs_mark_node_obsolete(chmp, fd->nref);
+
+ error = chfs_write_flash_dirent(chmp,
+ parent, ip, fd, 0, ALLOC_DELETION);
+
+ //dbg("FD->NREF vno: %llu, lnr: %u, ofs: %u\n",
+ // fd->vno, fd->nref->nref_lnr, fd->nref->nref_offset);
+ chfs_mark_node_obsolete(chmp, fd->nref);
+
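+			/*
+			 * Mark every data node reference of the victim
+			 * obsolete; the list is terminated by the vnode
+			 * cache acting as its own sentinel.
+			 */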
+ nref = ip->chvc->dnode;
+ while (nref != (struct chfs_node_ref *)ip->chvc) {
+ //dbg("DATA NREF\n");
+ chfs_mark_node_obsolete(chmp, nref);
+ nref = nref->nref_next;
+ }
+ ip->chvc->dnode = (struct chfs_node_ref *)ip->chvc;
+
+ nref = ip->chvc->v;
+ while (nref != (struct chfs_node_ref *)ip->chvc) {
+ //dbg("V NREF\n");
+ chfs_mark_node_obsolete(chmp, nref);
+ nref = nref->nref_next;
+ }
+ ip->chvc->v = ip->chvc->v->nref_next;
+
+ parent->chvc->nlink--;
+ //TODO: if error
+ }
+ }
+ mutex_exit(&chmp->chm_lock_mountfields);
+
+ return error;
+}
--- /dev/null
+/* $NetBSD: debug.c,v 1.1 2011/11/24 15:51:32 ahoka Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * XipFFS -- Xip Flash File System
+ *
+ * Copyright (C) 2009 Ferenc Havasi <havasi@inf.u-szeged.hu>,
+ * Zoltan Sogor <weth@inf.u-szeged.hu>,
+ * ...
+ * University of Szeged, Hungary
+ *
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#include "chfs.h"
+//#include </root/xipffs/netbsd.chfs/chfs.h>
+
--- /dev/null
+/* $NetBSD: ebh.c,v 1.2 2011/11/25 11:15:24 ahoka Exp $ */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ * University of Szeged, Hungary
+ * Copyright (C) 2009 Ferenc Havasi <havasi@inf.u-szeged.hu>
+ * Copyright (C) 2009 Zoltan Sogor <weth@inf.u-szeged.hu>
+ * Copyright (C) 2009 David Tengeri <dtengeri@inf.u-szeged.hu>
+ * Copyright (C) 2009 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "ebh.h"
+
+/*****************************************************************************/
+/* Flash specific operations */
+/*****************************************************************************/
+int nor_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr);
+int nand_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr);
+int nor_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset);
+int nand_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset);
+int nor_read_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr);
+int nand_read_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr);
+int nor_write_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr);
+int nand_write_eb_hdr(struct chfs_ebh *ebh, int pebnr,struct chfs_eb_hdr *ebhdr);
+int nor_check_eb_hdr(struct chfs_ebh *ebh, void *buf);
+int nand_check_eb_hdr(struct chfs_ebh *ebh, void *buf);
+int nor_mark_eb_hdr_dirty_flash(struct chfs_ebh *ebh, int pebnr, int lid);
+int nor_invalidate_eb_hdr(struct chfs_ebh *ebh, int pebnr);
+int mark_eb_hdr_free(struct chfs_ebh *ebh, int pebnr, int ec);
+
+int ltree_entry_cmp(struct chfs_ltree_entry *le1, struct chfs_ltree_entry *le2);
+int peb_in_use_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2);
+int peb_free_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2);
+int add_peb_to_erase_queue(struct chfs_ebh *ebh, int pebnr, int ec,struct peb_queue *queue);
+struct chfs_peb * find_peb_in_use(struct chfs_ebh *ebh, int pebnr);
+int add_peb_to_free(struct chfs_ebh *ebh, int pebnr, int ec);
+int add_peb_to_in_use(struct chfs_ebh *ebh, int pebnr, int ec);
+void erase_callback(struct flash_erase_instruction *ei);
+int free_peb(struct chfs_ebh *ebh);
+int release_peb(struct chfs_ebh *ebh, int pebnr);
+void erase_thread(void *data);
+static void erase_thread_start(struct chfs_ebh *ebh);
+static void erase_thread_stop(struct chfs_ebh *ebh);
+int scan_leb_used_cmp(struct chfs_scan_leb *sleb1, struct chfs_scan_leb *sleb2);
+int nor_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si,struct chfs_eb_hdr *ebhdr, int pebnr, int leb_status);
+int nor_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si,
+ int pebnr, struct chfs_eb_hdr *ebhdr);
+int nand_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si,struct chfs_eb_hdr *ebhdr, int pebnr);
+int nand_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si,
+ int pebnr, struct chfs_eb_hdr *ebhdr);
+struct chfs_scan_info *chfs_scan(struct chfs_ebh *ebh);
+void scan_info_destroy(struct chfs_scan_info *si);
+int scan_media(struct chfs_ebh *ebh);
+int get_peb(struct chfs_ebh *ebh);
+/**
+ * nor_create_eb_hdr - creates an eraseblock header for NOR flash
+ * @ebhdr: ebhdr to set
+ * @lnr: LEB number
+ */
+int
+nor_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr)
+{
+ ebhdr->u.nor_hdr.lid = htole32(lnr);
+ return 0;
+}
+
+/**
+ * nand_create_eb_hdr - creates an eraseblock header for NAND flash
+ * @ebhdr: ebhdr to set
+ * @lnr: LEB number
+ */
+int
+nand_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr)
+{
+ ebhdr->u.nand_hdr.lid = htole32(lnr);
+ return 0;
+}
+
+/**
+ * nor_calc_data_offs - calculates data offset on NOR flash
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ * @offset: offset within the eraseblock
+ */
+int
+nor_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset)
+{
+ return pebnr * ebh->flash_if->erasesize + offset +
+ CHFS_EB_EC_HDR_SIZE + CHFS_EB_HDR_NOR_SIZE;
+}
+
+/**
+ * nand_calc_data_offs - calculates data offset on NAND flash
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ * @offset: offset within the eraseblock
+ */
+int
+nand_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset)
+{
+ return pebnr * ebh->flash_if->erasesize + offset +
+ 2 * ebh->flash_if->page_size;
+}
+
+/**
+ * nor_read_eb_hdr - read eraseblock header from NOR flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ * @ebhdr: where to store the data
+ *
+ * Reads the eraseblock header from media.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nor_read_eb_hdr(struct chfs_ebh *ebh,
+ int pebnr, struct chfs_eb_hdr *ebhdr)
+{
+ int ret;
+ size_t retlen;
+ off_t ofs = pebnr * ebh->flash_if->erasesize;
+
+ KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr);
+
+ ret = flash_read(ebh->flash_dev,
+ ofs, CHFS_EB_EC_HDR_SIZE,
+ &retlen, (unsigned char *) &ebhdr->ec_hdr);
+
+ if (ret || retlen != CHFS_EB_EC_HDR_SIZE)
+ return ret;
+
+ ofs += CHFS_EB_EC_HDR_SIZE;
+ ret = flash_read(ebh->flash_dev,
+ ofs, CHFS_EB_HDR_NOR_SIZE,
+ &retlen, (unsigned char *) &ebhdr->u.nor_hdr);
+
+ if (ret || retlen != CHFS_EB_HDR_NOR_SIZE)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * nand_read_eb_hdr - read eraseblock header from NAND flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ * @ebhdr: where to store the data
+ *
+ * Reads the eraseblock header from media. It is stored in the first two pages.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nand_read_eb_hdr(struct chfs_ebh *ebh, int pebnr,
+ struct chfs_eb_hdr *ebhdr)
+{
+ int ret;
+ size_t retlen;
+ off_t ofs;
+
+ KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr);
+
+ /* Read erase counter header from the first page. */
+ ofs = pebnr * ebh->flash_if->erasesize;
+ ret = flash_read(ebh->flash_dev,
+ ofs, CHFS_EB_EC_HDR_SIZE, &retlen,
+ (unsigned char *) &ebhdr->ec_hdr);
+ if (ret || retlen != CHFS_EB_EC_HDR_SIZE)
+ return ret;
+
+ /* Read NAND eraseblock header from the second page */
+ ofs += ebh->flash_if->page_size;
+ ret = flash_read(ebh->flash_dev,
+ ofs, CHFS_EB_HDR_NAND_SIZE, &retlen,
+ (unsigned char *) &ebhdr->u.nand_hdr);
+ if (ret || retlen != CHFS_EB_HDR_NAND_SIZE)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * nor_write_eb_hdr - write eraseblock header to NOR flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number to write to
+ * @ebhdr: eraseblock header to write
+ *
+ * Writes the eraseblock header to media.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nor_write_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr)
+{
+ int ret, crc;
+ size_t retlen;
+
+ off_t ofs = pebnr * ebh->flash_if->erasesize + CHFS_EB_EC_HDR_SIZE;
+
+ ebhdr->u.nor_hdr.lid = ebhdr->u.nor_hdr.lid
+ | htole32(CHFS_LID_NOT_DIRTY_BIT);
+
+ crc = crc32(0, (uint8_t *)&ebhdr->u.nor_hdr + 4,
+ CHFS_EB_HDR_NOR_SIZE - 4);
+	ebhdr->u.nor_hdr.crc = htole32(crc);
+
+ KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr);
+
+ ret = flash_write(ebh->flash_dev,
+ ofs, CHFS_EB_HDR_NOR_SIZE, &retlen,
+ (unsigned char *) &ebhdr->u.nor_hdr);
+
+ if (ret || retlen != CHFS_EB_HDR_NOR_SIZE)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * nand_write_eb_hdr - write eraseblock header to NAND flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number to write to
+ * @ebhdr: eraseblock header to write
+ *
+ * Writes the eraseblock header to media.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nand_write_eb_hdr(struct chfs_ebh *ebh, int pebnr,
+ struct chfs_eb_hdr *ebhdr)
+{
+ int ret, crc;
+ size_t retlen;
+ flash_off_t ofs;
+
+ KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr);
+
+ ofs = pebnr * ebh->flash_if->erasesize +
+ ebh->flash_if->page_size;
+
+ ebhdr->u.nand_hdr.serial = htole64(++(*ebh->max_serial));
+
+ crc = crc32(0, (uint8_t *)&ebhdr->u.nand_hdr + 4,
+ CHFS_EB_HDR_NAND_SIZE - 4);
+ ebhdr->u.nand_hdr.crc = htole32(crc);
+
+ ret = flash_write(ebh->flash_dev, ofs,
+ CHFS_EB_HDR_NAND_SIZE, &retlen,
+ (unsigned char *) &ebhdr->u.nand_hdr);
+
+ if (ret || retlen != CHFS_EB_HDR_NAND_SIZE)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * nor_check_eb_hdr - check eraseblock header read from NOR flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @buf: eraseblock header to check
+ *
+ * Returns eraseblock header status.
+ */
+int
+nor_check_eb_hdr(struct chfs_ebh *ebh, void *buf)
+{
+ uint32_t magic, crc, hdr_crc;
+ struct chfs_eb_hdr *ebhdr = buf;
+ le32 lid_save;
+
+	// check whether there is a header
+ if (check_pattern((void *) &ebhdr->ec_hdr,
+ 0xFF, 0, CHFS_EB_EC_HDR_SIZE)) {
+ dbg_ebh("no header found\n");
+ return EBHDR_LEB_NO_HDR;
+ }
+
+ // check magic
+ magic = le32toh(ebhdr->ec_hdr.magic);
+ if (magic != CHFS_MAGIC_BITMASK) {
+ dbg_ebh("bad magic bitmask(exp: %x found %x)\n",
+ CHFS_MAGIC_BITMASK, magic);
+ return EBHDR_LEB_BADMAGIC;
+ }
+
+ // check CRC_EC
+ hdr_crc = le32toh(ebhdr->ec_hdr.crc_ec);
+ crc = crc32(0, (uint8_t *) &ebhdr->ec_hdr + 8, 4);
+ if (hdr_crc != crc) {
+ dbg_ebh("bad crc_ec found\n");
+ return EBHDR_LEB_BADCRC;
+ }
+
+	/* check if the PEB is free: magic, crc_ec and erase_cnt are good and
+	 * everything else is 0xFF
+ */
+ if (check_pattern((void *) &ebhdr->u.nor_hdr, 0xFF, 0,
+ CHFS_EB_HDR_NOR_SIZE)) {
+ dbg_ebh("free peb found\n");
+ return EBHDR_LEB_FREE;
+ }
+
+ // check invalidated (CRC == LID == 0)
+ if (ebhdr->u.nor_hdr.crc == 0 && ebhdr->u.nor_hdr.lid == 0) {
+ dbg_ebh("invalidated ebhdr found\n");
+ return EBHDR_LEB_INVALIDATED;
+ }
+
+ // check CRC
+ hdr_crc = le32toh(ebhdr->u.nor_hdr.crc);
+ lid_save = ebhdr->u.nor_hdr.lid;
+
+ // mark lid as not dirty for crc calc
+ ebhdr->u.nor_hdr.lid = ebhdr->u.nor_hdr.lid | htole32(
+ CHFS_LID_NOT_DIRTY_BIT);
+ crc = crc32(0, (uint8_t *) &ebhdr->u.nor_hdr + 4,
+ CHFS_EB_HDR_NOR_SIZE - 4);
+ // restore the original lid value in ebh
+ ebhdr->u.nor_hdr.lid = lid_save;
+
+ if (crc != hdr_crc) {
+ dbg_ebh("bad crc found\n");
+ return EBHDR_LEB_BADCRC;
+ }
+
+ // check dirty
+ if (!(le32toh(lid_save) & CHFS_LID_NOT_DIRTY_BIT)) {
+ dbg_ebh("dirty ebhdr found\n");
+ return EBHDR_LEB_DIRTY;
+ }
+
+ return EBHDR_LEB_OK;
+}
+
+/**
+ * nand_check_eb_hdr - check eraseblock header read from NAND flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @buf: eraseblock header to check
+ *
+ * Returns eraseblock header status.
+ */
+int
+nand_check_eb_hdr(struct chfs_ebh *ebh, void *buf)
+{
+ uint32_t magic, crc, hdr_crc;
+ struct chfs_eb_hdr *ebhdr = buf;
+
+	// check whether there is a header
+ if (check_pattern((void *) &ebhdr->ec_hdr,
+ 0xFF, 0, CHFS_EB_EC_HDR_SIZE)) {
+ dbg_ebh("no header found\n");
+ return EBHDR_LEB_NO_HDR;
+ }
+
+ // check magic
+ magic = le32toh(ebhdr->ec_hdr.magic);
+ if (magic != CHFS_MAGIC_BITMASK) {
+ dbg_ebh("bad magic bitmask(exp: %x found %x)\n",
+ CHFS_MAGIC_BITMASK, magic);
+ return EBHDR_LEB_BADMAGIC;
+ }
+
+ // check CRC_EC
+ hdr_crc = le32toh(ebhdr->ec_hdr.crc_ec);
+ crc = crc32(0, (uint8_t *) &ebhdr->ec_hdr + 8, 4);
+ if (hdr_crc != crc) {
+ dbg_ebh("bad crc_ec found\n");
+ return EBHDR_LEB_BADCRC;
+ }
+
+	/* check if the PEB is free: magic, crc_ec and erase_cnt are good and
+	 * everything else is 0xFF
+ */
+ if (check_pattern((void *) &ebhdr->u.nand_hdr, 0xFF, 0,
+ CHFS_EB_HDR_NAND_SIZE)) {
+ dbg_ebh("free peb found\n");
+ return EBHDR_LEB_FREE;
+ }
+
+ // check CRC
+ hdr_crc = le32toh(ebhdr->u.nand_hdr.crc);
+
+ crc = crc32(0, (uint8_t *) &ebhdr->u.nand_hdr + 4,
+ CHFS_EB_HDR_NAND_SIZE - 4);
+
+ if (crc != hdr_crc) {
+ dbg_ebh("bad crc found\n");
+ return EBHDR_LEB_BADCRC;
+ }
+
+ return EBHDR_LEB_OK;
+}
+
+/**
+ * nor_mark_eb_hdr_dirty_flash - mark eraseblock header dirty on NOR flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ * @lid: leb id (its bit number 31 will be set to 0)
+ *
+ * It pulls the CHFS_LID_NOT_DIRTY_BIT to zero on flash.
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nor_mark_eb_hdr_dirty_flash(struct chfs_ebh *ebh, int pebnr, int lid)
+{
+ int ret;
+ size_t retlen;
+ off_t ofs;
+
+ /* mark leb id dirty */
+ lid = htole32(lid & CHFS_LID_DIRTY_BIT_MASK);
+
+ /* calculate position */
+ ofs = pebnr * ebh->flash_if->erasesize + CHFS_EB_EC_HDR_SIZE
+ + CHFS_GET_MEMBER_POS(struct chfs_nor_eb_hdr , lid);
+
+ ret = flash_write(ebh->flash_dev, ofs, sizeof(lid), &retlen,
+ (unsigned char *) &lid);
+ if (ret || retlen != sizeof(lid)) {
+ chfs_err("can't mark peb dirty");
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * nor_invalidate_eb_hdr - invalidate eraseblock header on NOR flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ *
+ * Sets the crc and lid fields to zero.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nor_invalidate_eb_hdr(struct chfs_ebh *ebh, int pebnr)
+{
+ int ret;
+ size_t retlen;
+ off_t ofs;
+ char zero_buf[CHFS_INVALIDATE_SIZE];
+
+ /* fill with zero */
+ memset(zero_buf, 0x0, CHFS_INVALIDATE_SIZE);
+
+ /* calculate position (!!! lid is directly behind crc !!!) */
+ ofs = pebnr * ebh->flash_if->erasesize + CHFS_EB_EC_HDR_SIZE
+ + CHFS_GET_MEMBER_POS(struct chfs_nor_eb_hdr, crc);
+
+ ret = flash_write(ebh->flash_dev,
+ ofs, CHFS_INVALIDATE_SIZE, &retlen,
+ (unsigned char *) &zero_buf);
+ if (ret || retlen != CHFS_INVALIDATE_SIZE) {
+ chfs_err("can't invalidate peb");
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * mark_eb_hdr_free - mark an eraseblock header free on NOR or NAND flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ * @ec: erase counter of PEB
+ *
+ * Write out the magic and erase counter to the physical eraseblock.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+mark_eb_hdr_free(struct chfs_ebh *ebh, int pebnr, int ec)
+{
+ int ret, crc;
+ size_t retlen;
+ off_t ofs;
+ struct chfs_eb_hdr *ebhdr;
+ ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP);
+
+ ebhdr->ec_hdr.magic = htole32(CHFS_MAGIC_BITMASK);
+ ebhdr->ec_hdr.erase_cnt = htole32(ec);
+ crc = crc32(0, (uint8_t *) &ebhdr->ec_hdr + 8, 4);
+ ebhdr->ec_hdr.crc_ec = htole32(crc);
+
+ ofs = pebnr * ebh->flash_if->erasesize;
+
+ KASSERT(sizeof(ebhdr->ec_hdr) == CHFS_EB_EC_HDR_SIZE);
+
+ ret = flash_write(ebh->flash_dev,
+ ofs, CHFS_EB_EC_HDR_SIZE, &retlen,
+ (unsigned char *) &ebhdr->ec_hdr);
+
+ if (ret || retlen != CHFS_EB_EC_HDR_SIZE) {
+ chfs_err("can't mark peb as free: %d\n", pebnr);
+ kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+ return ret;
+ }
+
+ kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+ return 0;
+}
+
+/*****************************************************************************/
+/* End of Flash specific operations */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Lock Tree */
+/*****************************************************************************/
+
+int
+ltree_entry_cmp(struct chfs_ltree_entry *le1,
+ struct chfs_ltree_entry *le2)
+{
+ return (le1->lnr - le2->lnr);
+}
+
+/* Generate functions for Lock tree's red-black tree */
+RB_PROTOTYPE( ltree_rbtree, chfs_ltree_entry, rb, ltree_entry_cmp);
+RB_GENERATE( ltree_rbtree, chfs_ltree_entry, rb, ltree_entry_cmp);
+
+
+/**
+ * ltree_lookup - looks up a logical eraseblock in the lock tree
+ * @ebh: chfs eraseblock handler
+ * @lnr: identifier of the logical eraseblock
+ *
+ * This function returns a pointer to the wanted &struct chfs_ltree_entry
+ * if the logical eraseblock is in the lock tree (i.e. it is locked), or
+ * NULL otherwise.
+ * @ebh->ltree_lock has to be locked!
+ */
+static struct chfs_ltree_entry *
+ltree_lookup(struct chfs_ebh *ebh, int lnr)
+{
+ struct chfs_ltree_entry le, *result;
+ le.lnr = lnr;
+ result = RB_FIND(ltree_rbtree, &ebh->ltree, &le);
+ return result;
+}
+
+/**
+ * ltree_add_entry - add an entry to the lock tree
+ * @ebh: chfs eraseblock handler
+ * @lnr: identifier of the logical eraseblock
+ *
+ * This function adds a new logical eraseblock entry identified with @lnr to the
+ * lock tree. If the entry is already in the tree, it increases the user
+ * counter.
+ * Returns NULL if it cannot allocate memory for the lock tree entry, or a pointer
+ * to the inserted entry otherwise.
+ */
+static struct chfs_ltree_entry *
+ltree_add_entry(struct chfs_ebh *ebh, int lnr)
+{
+ struct chfs_ltree_entry *le, *result;
+
+ le = kmem_alloc(sizeof(struct chfs_ltree_entry), KM_SLEEP);
+
+ le->lnr = lnr;
+ le->users = 1;
+ rw_init(&le->mutex);
+
+ //dbg_ebh("enter ltree lock\n");
+ mutex_enter(&ebh->ltree_lock);
+ //dbg_ebh("insert\n");
+ result = RB_INSERT(ltree_rbtree, &ebh->ltree, le);
+ //dbg_ebh("inserted\n");
+ if (result) {
+ //The entry is already in the tree
+ result->users++;
+ kmem_free(le, sizeof(struct chfs_ltree_entry));
+ }
+ else {
+ result = le;
+ }
+ mutex_exit(&ebh->ltree_lock);
+
+ return result;
+}
+
+/**
+ * leb_read_lock - lock a logical eraseblock for read
+ * @ebh: chfs eraseblock handler
+ * @lnr: identifier of the logical eraseblock
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+static int
+leb_read_lock(struct chfs_ebh *ebh, int lnr)
+{
+ struct chfs_ltree_entry *le;
+
+ le = ltree_add_entry(ebh, lnr);
+ if (!le)
+ return ENOMEM;
+
+ rw_enter(&le->mutex, RW_READER);
+ return 0;
+}
+
+/**
+ * leb_read_unlock - unlock a logical eraseblock from read
+ * @ebh: chfs eraseblock handler
+ * @lnr: identifier of the logical eraseblock
+ *
+ * This function unlocks a logical eraseblock from read and deletes it from
+ * the lock tree if there are no more users of it.
+ */
+static void
+leb_read_unlock(struct chfs_ebh *ebh, int lnr)
+{
+ struct chfs_ltree_entry *le;
+
+ mutex_enter(&ebh->ltree_lock);
+ //dbg_ebh("LOCK: ebh->ltree_lock spin locked in leb_read_unlock()\n");
+ le = ltree_lookup(ebh, lnr);
+ if (!le)
+ goto out;
+
+ le->users -= 1;
+ KASSERT(le->users >= 0);
+ rw_exit(&le->mutex);
+ if (le->users == 0) {
+ le = RB_REMOVE(ltree_rbtree, &ebh->ltree, le);
+ if (le) {
+ KASSERT(!rw_lock_held(&le->mutex));
+ rw_destroy(&le->mutex);
+
+ kmem_free(le, sizeof(struct chfs_ltree_entry));
+ }
+ }
+
+out:
+ mutex_exit(&ebh->ltree_lock);
+ //dbg_ebh("UNLOCK: ebh->ltree_lock spin unlocked in leb_read_unlock()\n");
+}
+
+/**
+ * leb_write_lock - lock a logical eraseblock for write
+ * @ebh: chfs eraseblock handler
+ * @lnr: identifier of the logical eraseblock
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+static int
+leb_write_lock(struct chfs_ebh *ebh, int lnr)
+{
+ struct chfs_ltree_entry *le;
+
+ le = ltree_add_entry(ebh, lnr);
+ if (!le)
+ return ENOMEM;
+
+ rw_enter(&le->mutex, RW_WRITER);
+ return 0;
+}
+
+/**
+ * leb_write_unlock - unlock a logical eraseblock from write
+ * @ebh: chfs eraseblock handler
+ * @lnr: identifier of the logical eraseblock
+ *
+ * This function unlocks a logical eraseblock from write and deletes it from the
+ * lock tree if there are no more users of it.
+ */
+static void
+leb_write_unlock(struct chfs_ebh *ebh, int lnr)
+{
+ struct chfs_ltree_entry *le;
+
+ mutex_enter(&ebh->ltree_lock);
+ //dbg_ebh("LOCK: ebh->ltree_lock spin locked in leb_write_unlock()\n");
+ le = ltree_lookup(ebh, lnr);
+ if (!le)
+ goto out;
+
+ le->users -= 1;
+ KASSERT(le->users >= 0);
+ rw_exit(&le->mutex);
+ if (le->users == 0) {
+ RB_REMOVE(ltree_rbtree, &ebh->ltree, le);
+
+ KASSERT(!rw_lock_held(&le->mutex));
+ rw_destroy(&le->mutex);
+
+ kmem_free(le, sizeof(struct chfs_ltree_entry));
+ }
+
+out:
+ mutex_exit(&ebh->ltree_lock);
+ //dbg_ebh("UNLOCK: ebh->ltree_lock spin unlocked in leb_write_unlock()\n");
+}
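+
+/*
+ * Usage sketch (hypothetical caller): the lock helpers above are meant to be
+ * used in matched pairs around every LEB access, e.g. a reader of logical
+ * eraseblock lnr would do roughly
+ *
+ * if (leb_read_lock(ebh, lnr) == 0) {
+ * ... read from the PEB mapped to lnr ...
+ * leb_read_unlock(ebh, lnr);
+ * }
+ *
+ * ebh_read_leb() and ebh_write_leb() below follow this pattern.
+ */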
+
+/*****************************************************************************/
+/* End of Lock Tree */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Erase related operations */
+/*****************************************************************************/
+
+/**
+ * If the first argument is smaller than the second, the function
+ * returns a value smaller than zero. If they are equal, it returns zero.
+ * Otherwise, it returns a value greater than zero.
+ */
+int
+peb_in_use_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2)
+{
+ return (peb1->pebnr - peb2->pebnr);
+}
+
+int
+peb_free_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2)
+{
+ int comp;
+
+ comp = peb1->erase_cnt - peb2->erase_cnt;
+ if (0 == comp)
+ comp = peb1->pebnr - peb2->pebnr;
+
+ return comp;
+}
+
+/* Generate functions for in use PEB's red-black tree */
+RB_PROTOTYPE(peb_in_use_rbtree, chfs_peb, u.rb, peb_in_use_cmp);
+RB_GENERATE(peb_in_use_rbtree, chfs_peb, u.rb, peb_in_use_cmp);
+RB_PROTOTYPE(peb_free_rbtree, chfs_peb, u.rb, peb_free_cmp);
+RB_GENERATE(peb_free_rbtree, chfs_peb, u.rb, peb_free_cmp);
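+
+/*
+ * Worked example (hypothetical PEBs): peb_free_cmp() orders the free tree
+ * primarily by erase counter and breaks ties by PEB number, so the free
+ * blocks {pebnr 7, ec 3}, {pebnr 2, ec 3} and {pebnr 1, ec 5} are kept in
+ * the order 2, 7, 1. RB_MIN(), as used by get_peb() below, therefore always
+ * hands out the least worn block first.
+ */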
+
+/**
+ * add_peb_to_erase_queue: adds a PEB to to_erase/fully_erased queue
+ * @ebh - chfs eraseblock handler
+ * @pebnr - physical eraseblock's number
+ * @ec - erase counter of PEB
+ * @queue: the queue to add to
+ *
+ * This function adds a PEB to the erase queue specified by @queue.
+ * The @ebh->erase_lock must be locked before using this.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+add_peb_to_erase_queue(struct chfs_ebh *ebh, int pebnr, int ec,
+ struct peb_queue *queue)
+{
+ struct chfs_peb *peb;
+
+ peb = kmem_alloc(sizeof(struct chfs_peb), KM_SLEEP);
+
+ peb->erase_cnt = ec;
+ peb->pebnr = pebnr;
+
+ TAILQ_INSERT_TAIL(queue, peb, u.queue);
+
+ return 0;
+
+}
+//TODO
+/**
+ * find_peb_in_use - looks up a PEB in the RB-tree of used blocks
+ * @ebh - chfs eraseblock handler
+ * @pebnr - physical eraseblock's number
+ *
+ * This function returns a pointer to the PEB if it is found in the tree,
+ * or NULL otherwise.
+ * The @ebh->erase_lock must be locked before using this.
+ */
+struct chfs_peb *
+find_peb_in_use(struct chfs_ebh *ebh, int pebnr)
+{
+ struct chfs_peb peb, *result;
+ peb.pebnr = pebnr;
+ result = RB_FIND(peb_in_use_rbtree, &ebh->in_use, &peb);
+ return result;
+}
+
+/**
+ * add_peb_to_free - adds a PEB to the RB-tree of free PEBs
+ * @ebh - chfs eraseblock handler
+ * @pebnr - physical eraseblock's number
+ * @ec - erase counter of PEB
+ *
+ *
+ * This function adds a physical eraseblock to the RB-tree of free PEBs
+ * stored in the @ebh. The key is the erase counter and pebnr.
+ * The @ebh->erase_lock must be locked before using this.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+add_peb_to_free(struct chfs_ebh *ebh, int pebnr, int ec)
+{
+ struct chfs_peb *peb, *result;
+
+ peb = kmem_alloc(sizeof(struct chfs_peb), KM_SLEEP);
+
+ peb->erase_cnt = ec;
+ peb->pebnr = pebnr;
+ result = RB_INSERT(peb_free_rbtree, &ebh->free, peb);
+ if (result)
+ return 1;
+
+ return 0;
+}
+
+/**
+ * add_peb_to_in_use - adds a PEB to the RB-tree of used PEBs
+ * @ebh - chfs eraseblock handler
+ * @pebnr - physical eraseblock's number
+ * @ec - erase counter of PEB
+ *
+ *
+ * This function adds a physical eraseblock to the RB-tree of used PEBs
+ * stored in the @ebh. The key is pebnr.
+ * The @ebh->erase_lock must be locked before using this.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+add_peb_to_in_use(struct chfs_ebh *ebh, int pebnr, int ec)
+{
+ struct chfs_peb *peb, *result;
+
+ peb = kmem_alloc(sizeof(struct chfs_peb), KM_SLEEP);
+
+ peb->erase_cnt = ec;
+ peb->pebnr = pebnr;
+ result = RB_INSERT(peb_in_use_rbtree, &ebh->in_use, peb);
+ if (result)
+ return 1;
+
+ return 0;
+}
+
+/**
+ * erase_callback - callback function for flash erase
+ * @ei: erase information
+ */
+void
+erase_callback(struct flash_erase_instruction *ei)
+{
+ int err;
+ struct chfs_erase_info_priv *priv = (void *) ei->ei_priv;
+ //dbg_ebh("ERASE_CALLBACK() CALLED\n");
+ struct chfs_ebh *ebh = priv->ebh;
+ struct chfs_peb *peb = priv->peb;
+
+ peb->erase_cnt += 1;
+
+ if (ei->ei_state == FLASH_ERASE_DONE) {
+
+ /* Write out erase counter */
+ err = ebh->ops->mark_eb_hdr_free(ebh,
+ peb->pebnr, peb->erase_cnt);
+ if (err) {
+ /* cannot mark PEB as free, so erase it again */
+ chfs_err(
+ "cannot mark eraseblock as free, PEB: %d\n",
+ peb->pebnr);
+ mutex_enter(&ebh->erase_lock);
+ /*dbg_ebh("LOCK: ebh->erase_lock spin locked in erase_callback() "
+ "after mark ebhdr free\n");*/
+ add_peb_to_erase_queue(ebh, peb->pebnr, peb->erase_cnt,
+ &ebh->to_erase);
+ mutex_exit(&ebh->erase_lock);
+ /*dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_callback() "
+ "after mark ebhdr free\n");*/
+ kmem_free(peb, sizeof(struct chfs_peb));
+ return;
+ }
+
+ mutex_enter(&ebh->erase_lock);
+ /*dbg_ebh("LOCK: ebh->erase_lock spin locked in erase_callback()\n");*/
+ err = add_peb_to_free(ebh, peb->pebnr, peb->erase_cnt);
+ mutex_exit(&ebh->erase_lock);
+ /*dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_callback()\n");*/
+ kmem_free(peb, sizeof(struct chfs_peb));
+ } else {
+ /*
+ * Erase is finished, but there was a problem,
+ * so erase PEB again
+ */
+ chfs_err("erase failed, state is: 0x%x\n", ei->ei_state);
+ add_peb_to_erase_queue(ebh, peb->pebnr, peb->erase_cnt, &ebh->to_erase);
+ kmem_free(peb, sizeof(struct chfs_peb));
+ }
+}
+
+/**
+ * free_peb: free a PEB
+ * @ebh: chfs eraseblock handler
+ *
+ * This function erases the first physical eraseblock from one of the erase
+ * lists and adds it to the RB-tree of free PEBs.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+free_peb(struct chfs_ebh *ebh)
+{
+ int err, retries = 0;
+ off_t ofs;
+ struct chfs_peb *peb = NULL;
+ struct flash_erase_instruction *ei;
+
+ KASSERT(mutex_owned(&ebh->erase_lock));
+
+ if (!TAILQ_EMPTY(&ebh->fully_erased)) {
+ //dbg_ebh("[FREE PEB] got a fully erased block\n");
+ peb = TAILQ_FIRST(&ebh->fully_erased);
+ TAILQ_REMOVE(&ebh->fully_erased, peb, u.queue);
+ err = ebh->ops->mark_eb_hdr_free(ebh,
+ peb->pebnr, peb->erase_cnt);
+ if (err) {
+ goto out_free;
+ }
+ err = add_peb_to_free(ebh, peb->pebnr, peb->erase_cnt);
+ goto out_free;
+ }
+ /* Erase PEB */
+ //dbg_ebh("[FREE PEB] eraseing a block\n");
+ peb = TAILQ_FIRST(&ebh->to_erase);
+ TAILQ_REMOVE(&ebh->to_erase, peb, u.queue);
+ mutex_exit(&ebh->erase_lock);
+ //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in free_peb()\n");
+ ofs = peb->pebnr * ebh->flash_if->erasesize;
+
+ /* XXX where do we free this? */
+ ei = kmem_alloc(sizeof(struct flash_erase_instruction)
+ + sizeof(struct chfs_erase_info_priv), KM_SLEEP);
+retry:
+ memset(ei, 0, sizeof(*ei));
+
+// ei->ei_if = ebh->flash_if;
+ ei->ei_addr = ofs;
+ ei->ei_len = ebh->flash_if->erasesize;
+ ei->ei_callback = erase_callback;
+ ei->ei_priv = (unsigned long) (&ei[1]);
+
+ ((struct chfs_erase_info_priv *) ei->ei_priv)->ebh = ebh;
+ ((struct chfs_erase_info_priv *) ei->ei_priv)->peb = peb;
+
+ err = flash_erase(ebh->flash_dev, ei);
+ dbg_ebh("erased peb: %d\n", peb->pebnr);
+
+ /* einval would mean we did something wrong */
+ KASSERT(err != EINVAL);
+
+ if (err) {
+ dbg_ebh("errno: %d, ei->ei_state: %d\n", err, ei->ei_state);
+ if (CHFS_MAX_GET_PEB_RETRIES < ++retries &&
+ ei->ei_state == FLASH_ERASE_FAILED) {
+ /* The block went bad, mark it */
+ dbg_ebh("ebh markbad! 0x%jx\n", (uintmax_t )ofs);
+ err = flash_block_markbad(ebh->flash_dev, ofs);
+ if (!err) {
+ ebh->peb_nr--;
+ }
+
+ goto out;
+ }
+ chfs_err("can not erase PEB: %d, try again\n", peb->pebnr);
+ goto retry;
+ }
+
+out:
+ /* lock the erase_lock, because it was locked
+ * when the function was called */
+ mutex_enter(&ebh->erase_lock);
+ return err;
+
+out_free:
+ kmem_free(peb, sizeof(struct chfs_peb));
+ return err;
+}
+
+/**
+ * release_peb - schedule an erase for the PEB
+ * @ebh: chfs eraseblock handler
+ * @pebnr: physical eraseblock number
+ *
+ * This function gets the PEB identified by @pebnr from the in_use RB-tree of
+ * @ebh, removes it and schedules an erase for it.
+ *
+ * Returns zero on success, error code in case of fail.
+ */
+int
+release_peb(struct chfs_ebh *ebh, int pebnr)
+{
+ int err = 0;
+ struct chfs_peb *peb;
+
+ mutex_enter(&ebh->erase_lock);
+
+ //dbg_ebh("LOCK: ebh->erase_lock spin locked in release_peb()\n");
+ peb = find_peb_in_use(ebh, pebnr);
+ if (!peb) {
+ chfs_err("LEB is mapped, but is not in the 'in_use' "
+ "tree of ebh\n");
+ goto out_unlock;
+ }
+ err = add_peb_to_erase_queue(ebh, peb->pebnr, peb->erase_cnt,
+ &ebh->to_erase);
+
+ if (err)
+ goto out_unlock;
+
+ RB_REMOVE(peb_in_use_rbtree, &ebh->in_use, peb);
+out_unlock:
+ mutex_exit(&ebh->erase_lock);
+ //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in release_peb()"
+ // " at out_unlock\n");
+ return err;
+}
+
+/**
+ * erase_thread - background thread for erasing PEBs
+ * @data: pointer to the eraseblock handler
+ */
+/*void
+ erase_thread(void *data)
+ {
+ struct chfs_ebh *ebh = data;
+
+ dbg_ebh("erase thread started\n");
+ while (ebh->bg_erase.eth_running) {
+ int err;
+
+ mutex_enter(&ebh->erase_lock);
+ dbg_ebh("LOCK: ebh->erase_lock spin locked in erase_thread()\n");
+ if (TAILQ_EMPTY(&ebh->to_erase) && TAILQ_EMPTY(&ebh->fully_erased)) {
+ dbg_ebh("thread has nothing to do\n");
+ mutex_exit(&ebh->erase_lock);
+ mutex_enter(&ebh->bg_erase.eth_thread_mtx);
+ cv_timedwait_sig(&ebh->bg_erase.eth_wakeup,
+ &ebh->bg_erase.eth_thread_mtx, mstohz(100));
+ mutex_exit(&ebh->bg_erase.eth_thread_mtx);
+
+ dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_thread()\n");
+ continue;
+ }
+ mutex_exit(&ebh->erase_lock);
+ dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_thread()\n");
+
+ err = free_peb(ebh);
+ if (err)
+ chfs_err("freeing PEB failed in the background thread: %d\n", err);
+
+ }
+ dbg_ebh("erase thread stopped\n");
+ kthread_exit(0);
+ }*/
+
+/**
+ * erase_thread - background thread for erasing PEBs
+ * @data: pointer to the eraseblock handler
+ */
+void
+erase_thread(void *data) {
+ dbg_ebh("[EBH THREAD] erase thread started\n");
+
+ struct chfs_ebh *ebh = data;
+ int err;
+
+ mutex_enter(&ebh->erase_lock);
+ while (ebh->bg_erase.eth_running) {
+ if (TAILQ_EMPTY(&ebh->to_erase) &&
+ TAILQ_EMPTY(&ebh->fully_erased)) {
+ cv_timedwait_sig(&ebh->bg_erase.eth_wakeup,
+ &ebh->erase_lock, mstohz(100));
+ } else {
+ /* XXX exiting this mutex is a bit odd here as
+ * free_peb instantly reenters it...
+ */
+ err = free_peb(ebh);
+ mutex_exit(&ebh->erase_lock);
+ if (err) {
+ chfs_err("freeing PEB failed in the"
+ " background thread: %d\n", err);
+ }
+ mutex_enter(&ebh->erase_lock);
+ }
+ }
+ mutex_exit(&ebh->erase_lock);
+
+ dbg_ebh("[EBH THREAD] erase thread stopped\n");
+ kthread_exit(0);
+}
+
+/**
+ * erase_thread_start - init and start erase thread
+ * @ebh: eraseblock handler
+ */
+static void
+erase_thread_start(struct chfs_ebh *ebh)
+{
+ cv_init(&ebh->bg_erase.eth_wakeup, "ebheracv");
+
+ ebh->bg_erase.eth_running = true;
+ kthread_create(PRI_NONE, KTHREAD_MPSAFE | KTHREAD_MUSTJOIN, NULL,
+ erase_thread, ebh, &ebh->bg_erase.eth_thread, "ebherase");
+}
+
+/**
+ * erase_thread_stop - stop background erase thread
+ * @ebh: eraseblock handler
+ */
+static void
+erase_thread_stop(struct chfs_ebh *ebh)
+{
+ ebh->bg_erase.eth_running = false;
+ cv_signal(&ebh->bg_erase.eth_wakeup);
+ dbg_ebh("[EBH THREAD STOP] signaled\n");
+
+ kthread_join(ebh->bg_erase.eth_thread);
+#ifdef BROKEN_KTH_JOIN
+ kpause("chfsebhjointh", false, mstohz(1000), NULL);
+#endif
+
+ cv_destroy(&ebh->bg_erase.eth_wakeup);
+}
+
+/*****************************************************************************/
+/* End of Erase related operations */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Scan related operations */
+/*****************************************************************************/
+int
+scan_leb_used_cmp(struct chfs_scan_leb *sleb1, struct chfs_scan_leb *sleb2)
+{
+ return (sleb1->lnr - sleb2->lnr);
+}
+
+RB_PROTOTYPE(scan_leb_used_rbtree, chfs_scan_leb, u.rb, scan_leb_used_cmp);
+RB_GENERATE(scan_leb_used_rbtree, chfs_scan_leb, u.rb, scan_leb_used_cmp);
+
+/**
+ * scan_add_to_queue - adds a physical eraseblock to one of the
+ * eraseblock queues
+ * @si: chfs scanning information
+ * @pebnr: physical eraseblock number
+ * @erase_cnt: erase counter of the physical eraseblock
+ * @queue: the queue to add to
+ *
+ * This function adds a physical eraseblock to one of the lists in the scanning
+ * information.
+ * Returns zero in case of success, negative error code in case of fail.
+ */
+static int
+scan_add_to_queue(struct chfs_scan_info *si, int pebnr, int erase_cnt,
+ struct scan_leb_queue *queue)
+{
+ struct chfs_scan_leb *sleb;
+
+ sleb = kmem_alloc(sizeof(struct chfs_scan_leb), KM_SLEEP);
+
+ sleb->pebnr = pebnr;
+ sleb->erase_cnt = erase_cnt;
+ TAILQ_INSERT_TAIL(queue, sleb, u.queue);
+ return 0;
+}
+
+/*
+ * nor_scan_add_to_used - add a physical eraseblock to the
+ * used tree of scan info
+ * @ebh: chfs eraseblock handler
+ * @si: chfs scanning information
+ * @ebhdr: eraseblock header
+ * @pebnr: physical eraseblock number
+ * @leb_status: the status of the PEB's eraseblock header
+ *
+ * This function adds a PEB to the used tree of the scanning information.
+ * It handles the case where more than one physical eraseblock references
+ * the same logical eraseblock.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nor_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si,
+ struct chfs_eb_hdr *ebhdr, int pebnr, int leb_status)
+{
+ int err, lnr, ec;
+ struct chfs_scan_leb *sleb, *old;
+
+ lnr = CHFS_GET_LID(ebhdr->u.nor_hdr.lid);
+ ec = le32toh(ebhdr->ec_hdr.erase_cnt);
+
+ sleb = kmem_alloc(sizeof(struct chfs_scan_leb), KM_SLEEP);
+
+ sleb->erase_cnt = ec;
+ sleb->lnr = lnr;
+ sleb->pebnr = pebnr;
+ sleb->info = leb_status;
+
+ old = RB_INSERT(scan_leb_used_rbtree, &si->used, sleb);
+ if (old) {
+ kmem_free(sleb, sizeof(struct chfs_scan_leb));
+ /* There is already an eraseblock in the used tree */
+ /* If the new one is bad */
+ if (EBHDR_LEB_DIRTY == leb_status &&
+ EBHDR_LEB_OK == old->info) {
+ return scan_add_to_queue(si, pebnr, ec, &si->erase);
+ } else {
+ err = scan_add_to_queue(si, old->pebnr,
+ old->erase_cnt, &si->erase);
+ if (err) {
+ return err;
+ }
+
+ old->erase_cnt = ec;
+ old->lnr = lnr;
+ old->pebnr = pebnr;
+ old->info = leb_status;
+ return 0;
+ }
+ }
+ return 0;
+}
+
+/**
+ * nor_process_eb - read the headers from NOR flash, check them and add them
+ * to the scanning information
+ * @ebh: chfs eraseblock handler
+ * @si: chfs scanning information
+ * @pebnr: physical eraseblock number
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nor_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si,
+ int pebnr, struct chfs_eb_hdr *ebhdr)
+{
+ int err, erase_cnt, leb_status;
+
+ err = ebh->ops->read_eb_hdr(ebh, pebnr, ebhdr);
+ if (err)
+ return err;
+
+ erase_cnt = le32toh(ebhdr->ec_hdr.erase_cnt);
+ dbg_ebh("erase_cnt: %d\n", erase_cnt);
+ leb_status = ebh->ops->check_eb_hdr(ebh, ebhdr);
+ if (EBHDR_LEB_BADMAGIC == leb_status ||
+ EBHDR_LEB_BADCRC == leb_status) {
+ err = scan_add_to_queue(si, pebnr, erase_cnt, &si->corrupted);
+ return err;
+ }
+ else if (EBHDR_LEB_FREE == leb_status) {
+ err = scan_add_to_queue(si, pebnr, erase_cnt, &si->free);
+ goto count_mean;
+ }
+ else if (EBHDR_LEB_NO_HDR == leb_status) {
+ err = scan_add_to_queue(si, pebnr, erase_cnt, &si->erased);
+ return err;
+ }
+ else if (EBHDR_LEB_INVALIDATED == leb_status) {
+ err = scan_add_to_queue(si, pebnr, erase_cnt, &si->erase);
+ return err;
+ }
+
+ err = nor_scan_add_to_used(ebh, si, ebhdr, pebnr, leb_status);
+ if (err)
+ return err;
+
+
+count_mean:
+ si->sum_of_ec += erase_cnt;
+ si->num_of_eb++;
+
+ return err;
+}
+
+/*
+ * nand_scan_add_to_used - add a physical eraseblock to the
+ * used tree of scan info
+ * @ebh: chfs eraseblock handler
+ * @si: chfs scanning information
+ * @ebhdr: eraseblock header
+ * @pebnr: physical eraseblock number
+ * @leb_status: the status of the PEB's eraseblock header
+ *
+ * This function adds a PEB to the used tree of the scanning information.
+ * It handles the case where more than one physical eraseblock references
+ * the same logical eraseblock.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nand_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si,
+ struct chfs_eb_hdr *ebhdr, int pebnr)
+{
+ int err, lnr, ec;
+ struct chfs_scan_leb *sleb, *old;
+ uint64_t serial = le64toh(ebhdr->u.nand_hdr.serial);
+
+ lnr = CHFS_GET_LID(ebhdr->u.nor_hdr.lid);
+ ec = le32toh(ebhdr->ec_hdr.erase_cnt);
+
+ sleb = kmem_alloc(sizeof(struct chfs_scan_leb), KM_SLEEP);
+
+ sleb->erase_cnt = ec;
+ sleb->lnr = lnr;
+ sleb->pebnr = pebnr;
+ sleb->info = serial;
+
+ old = RB_INSERT(scan_leb_used_rbtree, &si->used, sleb);
+ if (old) {
+ kmem_free(sleb, sizeof(struct chfs_scan_leb));
+ /* There is already an eraseblock in the used tree */
+ /* If the new one is bad */
+ if (serial < old->info)
+ return scan_add_to_queue(si, pebnr, ec, &si->erase);
+ else {
+ err = scan_add_to_queue(si,
+ old->pebnr, old->erase_cnt, &si->erase);
+ if (err)
+ return err;
+
+ old->erase_cnt = ec;
+ old->lnr = lnr;
+ old->pebnr = pebnr;
+ old->info = serial;
+ return 0;
+ }
+ }
+ return 0;
+}
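+
+/*
+ * Worked example (hypothetical serials): if PEB 4 already sits in the used
+ * tree for LEB 2 with serial 17 and PEB 9 is then scanned with serial 42 for
+ * the same LEB, PEB 4 is queued for erase and the used tree entry is updated
+ * to point at PEB 9, since the larger serial is treated as the more recent
+ * copy.
+ */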
+
+/**
+ * nand_process_eb - read the headers from NAND flash, check them and add them
+ * to the scanning information
+ * @ebh: chfs eraseblock handler
+ * @si: chfs scanning information
+ * @pebnr: physical eraseblock number
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nand_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si,
+ int pebnr, struct chfs_eb_hdr *ebhdr)
+{
+ int err, erase_cnt, leb_status;
+ uint64_t max_serial;
+ /* isbad() is defined on some ancient platforms, heh */
+ bool is_bad;
+
+ /* Check block is bad */
+ err = flash_block_isbad(ebh->flash_dev,
+ pebnr * ebh->flash_if->erasesize, &is_bad);
+ if (err) {
+ chfs_err("checking block is bad failed\n");
+ return err;
+ }
+ if (is_bad) {
+ si->bad_peb_cnt++;
+ return 0;
+ }
+
+ err = ebh->ops->read_eb_hdr(ebh, pebnr, ebhdr);
+ if (err)
+ return err;
+
+ erase_cnt = le32toh(ebhdr->ec_hdr.erase_cnt);
+ leb_status = ebh->ops->check_eb_hdr(ebh, ebhdr);
+ if (EBHDR_LEB_BADMAGIC == leb_status ||
+ EBHDR_LEB_BADCRC == leb_status) {
+ err = scan_add_to_queue(si, pebnr, erase_cnt, &si->corrupted);
+ return err;
+ }
+ else if (EBHDR_LEB_FREE == leb_status) {
+ err = scan_add_to_queue(si, pebnr, erase_cnt, &si->free);
+ goto count_mean;
+ }
+ else if (EBHDR_LEB_NO_HDR == leb_status) {
+ err = scan_add_to_queue(si, pebnr, erase_cnt, &si->erased);
+ return err;
+ }
+
+ err = nand_scan_add_to_used(ebh, si, ebhdr, pebnr);
+ if (err)
+ return err;
+
+ max_serial = le64toh(ebhdr->u.nand_hdr.serial);
+ if (max_serial > *ebh->max_serial) {
+ *ebh->max_serial = max_serial;
+ }
+
+count_mean:
+ si->sum_of_ec += erase_cnt;
+ si->num_of_eb++;
+
+ return err;
+}
+
+/**
+ * chfs_scan - scans the media and returns information about it
+ * @ebh: chfs eraseblock handler
+ *
+ * This function scans through the media and returns information about it,
+ * or NULL if it fails.
+ */
+struct chfs_scan_info *
+chfs_scan(struct chfs_ebh *ebh)
+{
+ struct chfs_scan_info *si;
+ struct chfs_eb_hdr *ebhdr;
+ int pebnr, err;
+
+ si = kmem_alloc(sizeof(*si), KM_SLEEP);
+
+ TAILQ_INIT(&si->corrupted);
+ TAILQ_INIT(&si->free);
+ TAILQ_INIT(&si->erase);
+ TAILQ_INIT(&si->erased);
+ RB_INIT(&si->used);
+ si->bad_peb_cnt = 0;
+ si->num_of_eb = 0;
+ si->sum_of_ec = 0;
+
+ ebhdr = kmem_alloc(sizeof(*ebhdr), KM_SLEEP);
+
+ for (pebnr = 0; pebnr < ebh->peb_nr; pebnr++) {
+ dbg_ebh("processing PEB %d\n", pebnr);
+ err = ebh->ops->process_eb(ebh, si, pebnr, ebhdr);
+ if (err < 0)
+ goto out_ebhdr;
+ }
+ kmem_free(ebhdr, sizeof(*ebhdr));
+ dbg_ebh("[CHFS_SCAN] scanning information collected\n");
+ return si;
+
+out_ebhdr:
+ kmem_free(ebhdr, sizeof(*ebhdr));
+ kmem_free(si, sizeof(*si));
+ return NULL;
+}
+
+/**
+ * scan_info_destroy - frees all lists and trees in the scanning information
+ * @si: the scanning information
+ */
+void
+scan_info_destroy(struct chfs_scan_info *si)
+{
+ EBH_QUEUE_DESTROY(&si->corrupted,
+ struct chfs_scan_leb, u.queue);
+
+ EBH_QUEUE_DESTROY(&si->erase,
+ struct chfs_scan_leb, u.queue);
+
+ EBH_QUEUE_DESTROY(&si->erased,
+ struct chfs_scan_leb, u.queue);
+
+ EBH_QUEUE_DESTROY(&si->free,
+ struct chfs_scan_leb, u.queue);
+
+ EBH_TREE_DESTROY(scan_leb_used_rbtree,
+ &si->used, struct chfs_scan_leb);
+
+ kmem_free(si, sizeof(*si));
+ dbg_ebh("[SCAN_INFO_DESTROY] scanning information destroyed\n");
+}
+
+/**
+ * scan_media - scan media
+ *
+ * @ebh - chfs eraseblock handler
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+
+int
+scan_media(struct chfs_ebh *ebh)
+{
+ int err, i, avg_ec;
+ struct chfs_scan_info *si;
+ struct chfs_scan_leb *sleb;
+
+ si = chfs_scan(ebh);
+ /*
+ * Process the scan info, manage the eraseblock lists
+ */
+ mutex_init(&ebh->ltree_lock, MUTEX_DEFAULT, IPL_NONE);
+ mutex_init(&ebh->erase_lock, MUTEX_DEFAULT, IPL_NONE);
+ RB_INIT(&ebh->ltree);
+ RB_INIT(&ebh->free);
+ RB_INIT(&ebh->in_use);
+ TAILQ_INIT(&ebh->to_erase);
+ TAILQ_INIT(&ebh->fully_erased);
+ mutex_init(&ebh->alc_mutex, MUTEX_DEFAULT, IPL_NONE);
+
+ ebh->peb_nr -= si->bad_peb_cnt;
+
+ /*
+ * Create background thread for erasing
+ */
+ erase_thread_start(ebh);
+
+ ebh->lmap = kmem_alloc(ebh->peb_nr * sizeof(int), KM_SLEEP);
+
+ for (i = 0; i < ebh->peb_nr; i++) {
+ ebh->lmap[i] = EBH_LEB_UNMAPPED;
+ }
+
+ if (si->num_of_eb == 0) {
+ /* The flash contains no data. */
+ avg_ec = 0;
+ }
+ else {
+ avg_ec = (int) (si->sum_of_ec / si->num_of_eb);
+ }
+ dbg_ebh("num_of_eb: %d\n", si->num_of_eb);
+
+ mutex_enter(&ebh->erase_lock);
+
+ RB_FOREACH(sleb, scan_leb_used_rbtree, &si->used) {
+ ebh->lmap[sleb->lnr] = sleb->pebnr;
+ err = add_peb_to_in_use(ebh, sleb->pebnr, sleb->erase_cnt);
+ if (err)
+ goto out_free;
+ }
+
+ TAILQ_FOREACH(sleb, &si->erased, u.queue) {
+ err = add_peb_to_erase_queue(ebh, sleb->pebnr, avg_ec,
+ &ebh->fully_erased);
+ if (err)
+ goto out_free;
+ }
+
+ TAILQ_FOREACH(sleb, &si->erase, u.queue) {
+ err = add_peb_to_erase_queue(ebh, sleb->pebnr, avg_ec,
+ &ebh->to_erase);
+ if (err)
+ goto out_free;
+ }
+
+ TAILQ_FOREACH(sleb, &si->free, u.queue) {
+ err = add_peb_to_free(ebh, sleb->pebnr, sleb->erase_cnt);
+ if (err)
+ goto out_free;
+ }
+
+ TAILQ_FOREACH(sleb, &si->corrupted, u.queue) {
+ err = add_peb_to_erase_queue(ebh, sleb->pebnr, avg_ec,
+ &ebh->to_erase);
+ if (err)
+ goto out_free;
+ }
+ mutex_exit(&ebh->erase_lock);
+ scan_info_destroy(si);
+ return 0;
+
+out_free:
+ mutex_exit(&ebh->erase_lock);
+ kmem_free(ebh->lmap, ebh->peb_nr * sizeof(int));
+ scan_info_destroy(si);
+ dbg_ebh("[SCAN_MEDIA] returning with error: %d\n", err);
+ return err;
+}
+
+/*****************************************************************************/
+/* End of Scan related operations */
+/*****************************************************************************/
+
+/**
+ * ebh_open - opens the flash device and initializes the eraseblock handler
+ * @ebh: eraseblock handler
+ * @dev: device number of the flash device to use
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+ebh_open(struct chfs_ebh *ebh, dev_t dev)
+{
+ int err;
+
+ ebh->flash_dev = flash_get_device(dev);
+ if (!ebh->flash_dev) {
+ aprint_error("ebh_open: can't get flash device\n");
+ return ENODEV;
+ }
+
+ ebh->flash_if = flash_get_interface(dev);
+ if (!ebh->flash_if) {
+ aprint_error("ebh_open: can't get flash interface\n");
+ return ENODEV;
+ }
+
+ ebh->flash_size = flash_get_size(dev);
+ ebh->peb_nr = ebh->flash_size / ebh->flash_if->erasesize;
+// ebh->peb_nr = ebh->flash_if->size / ebh->flash_if->erasesize;
+ /* Set up flash operations based on flash type */
+ ebh->ops = kmem_alloc(sizeof(struct chfs_ebh_ops), KM_SLEEP);
+
+ switch (ebh->flash_if->type) {
+ case FLASH_TYPE_NOR:
+ ebh->eb_size = ebh->flash_if->erasesize -
+ CHFS_EB_EC_HDR_SIZE - CHFS_EB_HDR_NOR_SIZE;
+
+ ebh->ops->read_eb_hdr = nor_read_eb_hdr;
+ ebh->ops->write_eb_hdr = nor_write_eb_hdr;
+ ebh->ops->check_eb_hdr = nor_check_eb_hdr;
+ ebh->ops->mark_eb_hdr_dirty_flash =
+ nor_mark_eb_hdr_dirty_flash;
+ ebh->ops->invalidate_eb_hdr = nor_invalidate_eb_hdr;
+ ebh->ops->mark_eb_hdr_free = mark_eb_hdr_free;
+
+ ebh->ops->process_eb = nor_process_eb;
+
+ ebh->ops->create_eb_hdr = nor_create_eb_hdr;
+ ebh->ops->calc_data_offs = nor_calc_data_offs;
+
+ ebh->max_serial = NULL;
+ break;
+ case FLASH_TYPE_NAND:
+ ebh->eb_size = ebh->flash_if->erasesize -
+ 2 * ebh->flash_if->page_size;
+
+ ebh->ops->read_eb_hdr = nand_read_eb_hdr;
+ ebh->ops->write_eb_hdr = nand_write_eb_hdr;
+ ebh->ops->check_eb_hdr = nand_check_eb_hdr;
+ ebh->ops->mark_eb_hdr_free = mark_eb_hdr_free;
+ ebh->ops->mark_eb_hdr_dirty_flash = NULL;
+ ebh->ops->invalidate_eb_hdr = NULL;
+
+ ebh->ops->process_eb = nand_process_eb;
+
+ ebh->ops->create_eb_hdr = nand_create_eb_hdr;
+ ebh->ops->calc_data_offs = nand_calc_data_offs;
+
+ ebh->max_serial = kmem_alloc(sizeof(uint64_t), KM_SLEEP);
+
+ *ebh->max_serial = 0;
+ break;
+ default:
+ return 1;
+ }
+ printf("opening ebh: eb_size: %zu\n", ebh->eb_size);
+ err = scan_media(ebh);
+ if (err) {
+ dbg_ebh("Scan failed.");
+ kmem_free(ebh->ops, sizeof(struct chfs_ebh_ops));
+ kmem_free(ebh, sizeof(struct chfs_ebh));
+ return err;
+ }
+ return 0;
+}
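+
+/*
+ * Worked example (hypothetical geometry): for a NAND chip with 128 KiB
+ * eraseblocks and 2 KiB pages, the calculation above gives
+ * eb_size = 131072 - 2 * 2048 = 126976 bytes of usable data per eraseblock;
+ * on NOR flash only the erase counter header and the NOR eraseblock header
+ * are subtracted from the erase size.
+ */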
+
+/**
+ * ebh_close - close ebh
+ * @ebh: eraseblock handler
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+ebh_close(struct chfs_ebh *ebh)
+{
+ erase_thread_stop(ebh);
+
+ EBH_TREE_DESTROY(peb_free_rbtree, &ebh->free, struct chfs_peb);
+ EBH_TREE_DESTROY(peb_in_use_rbtree, &ebh->in_use, struct chfs_peb);
+
+ EBH_QUEUE_DESTROY(&ebh->fully_erased, struct chfs_peb, u.queue);
+ EBH_QUEUE_DESTROY(&ebh->to_erase, struct chfs_peb, u.queue);
+
+ /* XXX HACK, see ebh.h */
+ EBH_TREE_DESTROY_MUTEX(ltree_rbtree, &ebh->ltree,
+ struct chfs_ltree_entry);
+
+ KASSERT(!mutex_owned(&ebh->ltree_lock));
+ KASSERT(!mutex_owned(&ebh->alc_mutex));
+ KASSERT(!mutex_owned(&ebh->erase_lock));
+
+ mutex_destroy(&ebh->ltree_lock);
+ mutex_destroy(&ebh->alc_mutex);
+ mutex_destroy(&ebh->erase_lock);
+
+ kmem_free(ebh->ops, sizeof(struct chfs_ebh_ops));
+ kmem_free(ebh, sizeof(struct chfs_ebh));
+
+ return 0;
+}
+
+/**
+ * ebh_read_leb - read data from leb
+ * @ebh: eraseblock handler
+ * @lnr: logical eraseblock number
+ * @buf: buffer to read to
+ * @offset: offset from where to read
+ * @len: bytes number to read
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+ebh_read_leb(struct chfs_ebh *ebh, int lnr, char *buf, uint32_t offset,
+ size_t len, size_t *retlen)
+{
+ int err, pebnr;
+ off_t data_offset;
+
+ KASSERT(offset + len <= ebh->eb_size);
+
+ err = leb_read_lock(ebh, lnr);
+ if (err)
+ return err;
+ pebnr = ebh->lmap[lnr];
+ /* If the LEB is not mapped, the buffer is filled with 0xFF */
+ if (EBH_LEB_UNMAPPED == pebnr) {
+ leb_read_unlock(ebh, lnr);
+ memset(buf, 0xFF, len);
+ return 0;
+ }
+
+ /* Read data */
+ data_offset = ebh->ops->calc_data_offs(ebh, pebnr, offset);
+ err = flash_read(ebh->flash_dev, data_offset, len, retlen,
+ (unsigned char *) buf);
+ if (err)
+ goto out_free;
+
+ KASSERT(len == *retlen);
+
+ leb_read_unlock(ebh, lnr);
+ return err;
+
+out_free:
+ leb_read_unlock(ebh, lnr);
+ return err;
+}
+
+/**
+ * get_peb: get a free physical eraseblock
+ * @ebh - chfs eraseblock handler
+ *
+ * This function gets a free eraseblock from the ebh->free RB-tree.
+ * The first entry will be returned and deleted from the tree.
+ * The entries are sorted by erase counter, so the PEB with the smallest
+ * erase counter will be returned.
+ * If something goes bad a negative value will be returned.
+ */
+int
+get_peb(struct chfs_ebh *ebh)
+{
+ int err, pebnr;
+ struct chfs_peb *peb;
+
+retry:
+ mutex_enter(&ebh->erase_lock);
+ //dbg_ebh("LOCK: ebh->erase_lock spin locked in get_peb()\n");
+ if (RB_EMPTY(&ebh->free)) {
+ /*There is no more free PEBs in the tree*/
+ if (TAILQ_EMPTY(&ebh->to_erase) &&
+ TAILQ_EMPTY(&ebh->fully_erased)) {
+ mutex_exit(&ebh->erase_lock);
+ //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in get_peb()\n");
+ return ENOSPC;
+ }
+ err = free_peb(ebh);
+
+ mutex_exit(&ebh->erase_lock);
+ //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in get_peb()\n");
+
+ if (err)
+ return err;
+ goto retry;
+ }
+ peb = RB_MIN(peb_free_rbtree, &ebh->free);
+ pebnr = peb->pebnr;
+ RB_REMOVE(peb_free_rbtree, &ebh->free, peb);
+ err = add_peb_to_in_use(ebh, peb->pebnr, peb->erase_cnt);
+ if (err)
+ pebnr = err;
+
+ kmem_free(peb, sizeof(struct chfs_peb));
+
+ mutex_exit(&ebh->erase_lock);
+ //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in get_peb()\n");
+
+ return pebnr;
+}
+
+/**
+ * ebh_write_leb - write data to leb
+ * @ebh: eraseblock handler
+ * @lnr: logical eraseblock number
+ * @buf: data to write
+ * @offset: offset where to write
+ * @len: bytes number to write
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+ebh_write_leb(struct chfs_ebh *ebh, int lnr, char *buf, uint32_t offset,
+ size_t len, size_t *retlen)
+{
+ int err, pebnr, retries = 0;
+ off_t data_offset;
+ struct chfs_eb_hdr *ebhdr;
+
+ dbg("offset: %d | len: %zu | (offset+len): %zu "
+ " | ebsize: %zu\n", offset, len, (offset+len), ebh->eb_size);
+
+ KASSERT(offset + len <= ebh->eb_size);
+
+ err = leb_write_lock(ebh, lnr);
+ if (err)
+ return err;
+
+ pebnr = ebh->lmap[lnr];
+ /* If the LEB is mapped write out data */
+ if (pebnr != EBH_LEB_UNMAPPED) {
+ data_offset = ebh->ops->calc_data_offs(ebh, pebnr, offset);
+ err = flash_write(ebh->flash_dev, data_offset, len, retlen,
+ (unsigned char *) buf);
+
+ if (err) {
+ chfs_err("error %d while writing %zu bytes to PEB "
+ "%d:%ju, written %zu bytes\n",
+ err, len, pebnr, (uintmax_t )offset, *retlen);
+ } else {
+ KASSERT(len == *retlen);
+ }
+
+ leb_write_unlock(ebh, lnr);
+ return err;
+ }
+
+ /*
+ * If the LEB is unmapped, get a free PEB and write the
+ * eraseblock header first
+ */
+ ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP);
+
+ /* Setting up eraseblock header properties */
+ ebh->ops->create_eb_hdr(ebhdr, lnr);
+
+retry:
+ /* Getting a physical eraseblock from the wear leveling system */
+ pebnr = get_peb(ebh);
+ if (pebnr < 0) {
+ leb_write_unlock(ebh, lnr);
+ kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+ return pebnr;
+ }
+
+ /* Write the eraseblock header to the media */
+ err = ebh->ops->write_eb_hdr(ebh, pebnr, ebhdr);
+ if (err) {
+ chfs_warn(
+ "error writing eraseblock header: LEB %d , PEB %d\n",
+ lnr, pebnr);
+ goto write_error;
+ }
+
+ /* Write out data */
+ if (len) {
+ data_offset = ebh->ops->calc_data_offs(ebh, pebnr, offset);
+ err = flash_write(ebh->flash_dev,
+ data_offset, len, retlen, (unsigned char *) buf);
+ if (err) {
+ chfs_err("error %d while writing %zu bytes to PEB "
+ " %d:%ju, written %zu bytes\n",
+ err, len, pebnr, (uintmax_t )offset, *retlen);
+ goto write_error;
+ }
+ }
+
+ ebh->lmap[lnr] = pebnr;
+ leb_write_unlock(ebh, lnr);
+ kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+
+ return 0;
+
+write_error:
+ err = release_peb(ebh, pebnr);
+ // max retries (NOW: 2)
+ if (err || CHFS_MAX_GET_PEB_RETRIES < ++retries) {
+ leb_write_unlock(ebh, lnr);
+ kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+ return err;
+ }
+ goto retry;
+}
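+
+/*
+ * Usage sketch (hypothetical caller, error checks omitted): storing and
+ * re-reading a buffer through the LEB interface looks roughly like
+ *
+ * size_t retlen;
+ *
+ * ebh_write_leb(ebh, lnr, buf, 0, len, &retlen);
+ * ebh_read_leb(ebh, lnr, buf, 0, len, &retlen);
+ *
+ * Writing to an unmapped LEB maps it to a fresh PEB first, as implemented
+ * above.
+ */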
+
+/**
+ * ebh_erase_leb - erase a leb
+ * @ebh: eraseblock handler
+ * @lnr: leb number
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+ebh_erase_leb(struct chfs_ebh *ebh, int lnr)
+{
+ int err, pebnr;
+
+ leb_write_lock(ebh, lnr);
+
+ pebnr = ebh->lmap[lnr];
+ if (pebnr < 0) {
+ leb_write_unlock(ebh, lnr);
+ return EBH_LEB_UNMAPPED;
+ }
+ err = release_peb(ebh, pebnr);
+ if (err)
+ goto out_unlock;
+
+ ebh->lmap[lnr] = EBH_LEB_UNMAPPED;
+ cv_signal(&ebh->bg_erase.eth_wakeup);
+out_unlock:
+ leb_write_unlock(ebh, lnr);
+ return err;
+}
+
+/**
+ * ebh_map_leb - maps a PEB to LEB
+ * @ebh: eraseblock handler
+ * @lnr: leb number
+ *
+ * Returns zero on success, error code in case of fail
+ */
+int
+ebh_map_leb(struct chfs_ebh *ebh, int lnr)
+{
+ int err, pebnr, retries = 0;
+ struct chfs_eb_hdr *ebhdr;
+
+ ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP);
+
+ err = leb_write_lock(ebh, lnr);
+ if (err) {
+ kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+ return err;
+ }
+
+retry:
+ pebnr = get_peb(ebh);
+ if (pebnr < 0) {
+ err = pebnr;
+ goto out_unlock;
+ }
+
+ ebh->ops->create_eb_hdr(ebhdr, lnr);
+
+ err = ebh->ops->write_eb_hdr(ebh, pebnr, ebhdr);
+ if (err) {
+ chfs_warn(
+ "error writing eraseblock header: LEB %d , PEB %d\n",
+ lnr, pebnr);
+ goto write_error;
+ }
+
+ ebh->lmap[lnr] = pebnr;
+
+out_unlock:
+ leb_write_unlock(ebh, lnr);
+ kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+ return err;
+
+write_error:
+ err = release_peb(ebh, pebnr);
+ // max retries (NOW: 2)
+ if (err || CHFS_MAX_GET_PEB_RETRIES < ++retries) {
+ leb_write_unlock(ebh, lnr);
+ kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+ return err;
+ }
+ goto retry;
+}
+
+/**
+ * ebh_unmap_leb - unmaps a logical eraseblock and schedules its PEB for erase
+ * @ebh: eraseblock handler
+ * @lnr: leb number
+ *
+ * Returns zero on success, error code in case of fail.
+ */
+int
+ebh_unmap_leb(struct chfs_ebh *ebh, int lnr)
+{
+ int err;
+
+ if (ebh_is_mapped(ebh, lnr) < 0)
+ /* The eraseblock is already unmapped */
+ return 0;
+
+ err = ebh_erase_leb(ebh, lnr);
+
+ return err;
+}
+
+/**
+ * ebh_is_mapped - check if a PEB is mapped to @lnr
+ * @ebh: eraseblock handler
+ * @lnr: leb number
+ *
+ * Returns the mapped PEB's number if the logical eraseblock is mapped,
+ * or a negative value otherwise.
+ */
+int
+ebh_is_mapped(struct chfs_ebh *ebh, int lnr)
+{
+ int err, result;
+ err = leb_read_lock(ebh, lnr);
+ if (err)
+ return err;
+
+ result = ebh->lmap[lnr];
+ leb_read_unlock(ebh, lnr);
+
+ return result;
+}
+
+/**
+ * ebh_change_leb - write the LEB to another PEB
+ * @ebh: eraseblock handler
+ * @lnr: leb number
+ * @buf: data to write
+ * @len: length of data
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+ebh_change_leb(struct chfs_ebh *ebh, int lnr, char *buf, size_t len,
+ size_t *retlen)
+{
+ int err, pebnr, pebnr_old, retries = 0;
+ off_t data_offset;
+
+ struct chfs_peb *peb = NULL;
+ struct chfs_eb_hdr *ebhdr;
+
+ if (ebh_is_mapped(ebh, lnr) < 0)
+ return EBH_LEB_UNMAPPED;
+
+ if (len == 0) {
+ err = ebh_unmap_leb(ebh, lnr);
+ if (err)
+ return err;
+ return ebh_map_leb(ebh, lnr);
+ }
+
+ ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP);
+
+ pebnr_old = ebh->lmap[lnr];
+
+ mutex_enter(&ebh->alc_mutex);
+ err = leb_write_lock(ebh, lnr);
+ if (err)
+ goto out_mutex;
+
+ if (ebh->ops->mark_eb_hdr_dirty_flash) {
+ err = ebh->ops->mark_eb_hdr_dirty_flash(ebh, pebnr_old, lnr);
+ if (err)
+ goto out_unlock;
+ }
+
+ /* Setting up eraseblock header properties */
+ ebh->ops->create_eb_hdr(ebhdr, lnr);
+
+retry:
+ /* Getting a physical eraseblock from the wear leveling system */
+ pebnr = get_peb(ebh);
+ if (pebnr < 0) {
+ leb_write_unlock(ebh, lnr);
+ mutex_exit(&ebh->alc_mutex);
+ kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+ return pebnr;
+ }
+
+ err = ebh->ops->write_eb_hdr(ebh, pebnr, ebhdr);
+ if (err) {
+ chfs_warn(
+ "error writing eraseblock header: LEB %d , PEB %d",
+ lnr, pebnr);
+ goto write_error;
+ }
+
+ /* Write out data */
+ data_offset = ebh->ops->calc_data_offs(ebh, pebnr, 0);
+ err = flash_write(ebh->flash_dev, data_offset, len, retlen,
+ (unsigned char *) buf);
+ if (err) {
+ chfs_err("error %d while writing %zu bytes to PEB %d:%ju,"
+ " written %zu bytes",
+ err, len, pebnr, (uintmax_t)data_offset, *retlen);
+ goto write_error;
+ }
+
+ ebh->lmap[lnr] = pebnr;
+
+ if (ebh->ops->invalidate_eb_hdr) {
+ err = ebh->ops->invalidate_eb_hdr(ebh, pebnr_old);
+ if (err)
+ goto out_unlock;
+ }
+ peb = find_peb_in_use(ebh, pebnr_old);
+ err = release_peb(ebh, peb->pebnr);
+
+out_unlock:
+ leb_write_unlock(ebh, lnr);
+
+out_mutex:
+ mutex_exit(&ebh->alc_mutex);
+ kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+ kmem_free(peb, sizeof(struct chfs_peb));
+ return err;
+
+write_error:
+ err = release_peb(ebh, pebnr);
+ //max retries (NOW: 2)
+ if (err || CHFS_MAX_GET_PEB_RETRIES < ++retries) {
+ leb_write_unlock(ebh, lnr);
+ mutex_exit(&ebh->alc_mutex);
+ kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+ return err;
+ }
+ goto retry;
+}
+
--- /dev/null
+# $NetBSD: Makefile,v 1.1 1998/06/12 23:23:11 cgd Exp $
+
+INCSDIR= /usr/include/ufs/ext2fs
+
+INCS= ext2fs.h ext2fs_dinode.h ext2fs_dir.h ext2fs_extern.h
+
+.include <bsd.kinc.mk>
--- /dev/null
+/* $NetBSD: ext2fs_alloc.c,v 1.42 2011/03/06 04:46:26 rmind Exp $ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_alloc.c 8.11 (Berkeley) 10/27/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @(#)ffs_alloc.c 8.11 (Berkeley) 10/27/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_alloc.c,v 1.42 2011/03/06 04:46:26 rmind Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kernel.h>
+#include <sys/syslog.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufsmount.h>
+
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+u_long ext2gennumber;
+
+static daddr_t ext2fs_alloccg(struct inode *, int, daddr_t, int);
+static u_long ext2fs_dirpref(struct m_ext2fs *);
+static void ext2fs_fserr(struct m_ext2fs *, u_int, const char *);
+static u_long ext2fs_hashalloc(struct inode *, int, long, int,
+ daddr_t (*)(struct inode *, int, daddr_t, int));
+static daddr_t ext2fs_nodealloccg(struct inode *, int, daddr_t, int);
+static daddr_t ext2fs_mapsearch(struct m_ext2fs *, char *, daddr_t);
+
+/*
+ * Allocate a block in the file system.
+ *
+ * A preference may be optionally specified. If a preference is given
+ * the following hierarchy is used to allocate a block:
+ * 1) allocate the requested block.
+ * 2) allocate a rotationally optimal block in the same cylinder.
+ * 3) allocate a block in the same cylinder group.
+ * 4) quadratically rehash into other cylinder groups, until an
+ * available block is located.
+ * If no block preference is given the following hierarchy is used
+ * to allocate a block:
+ * 1) allocate a block in the cylinder group that contains the
+ * inode for the file.
+ * 2) quadratically rehash into other cylinder groups, until an
+ * available block is located.
+ */
+int
+ext2fs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref,
+ kauth_cred_t cred, daddr_t *bnp)
+{
+ struct m_ext2fs *fs;
+ daddr_t bno;
+ int cg;
+
+ *bnp = 0;
+ fs = ip->i_e2fs;
+#ifdef DIAGNOSTIC
+ if (cred == NOCRED)
+ panic("ext2fs_alloc: missing credential");
+#endif /* DIAGNOSTIC */
+ if (fs->e2fs.e2fs_fbcount == 0)
+ goto nospace;
+ if (kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
+ NULL, NULL) != 0 &&
+ freespace(fs) <= 0)
+ goto nospace;
+ if (bpref >= fs->e2fs.e2fs_bcount)
+ bpref = 0;
+ if (bpref == 0)
+ cg = ino_to_cg(fs, ip->i_number);
+ else
+ cg = dtog(fs, bpref);
+ bno = (daddr_t)ext2fs_hashalloc(ip, cg, bpref, fs->e2fs_bsize,
+ ext2fs_alloccg);
+ if (bno > 0) {
+ ip->i_e2fs_nblock += btodb(fs->e2fs_bsize);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ *bnp = bno;
+ return (0);
+ }
+nospace:
+ ext2fs_fserr(fs, kauth_cred_geteuid(cred), "file system full");
+ uprintf("\n%s: write failed, file system is full\n", fs->e2fs_fsmnt);
+ return (ENOSPC);
+}
+
+/*
+ * Allocate an inode in the file system.
+ *
+ * If allocating a directory, use ext2fs_dirpref to select the inode.
+ * If allocating in a directory, the following hierarchy is followed:
+ * 1) allocate the preferred inode.
+ * 2) allocate an inode in the same cylinder group.
+ * 3) quadratically rehash into other cylinder groups, until an
+ * available inode is located.
+ * If no inode preference is given the following hierarchy is used
+ * to allocate an inode:
+ * 1) allocate an inode in cylinder group 0.
+ * 2) quadratically rehash into other cylinder groups, until an
+ * available inode is located.
+ */
+int
+ext2fs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred,
+ struct vnode **vpp)
+{
+ struct inode *pip;
+ struct m_ext2fs *fs;
+ struct inode *ip;
+ ino_t ino, ipref;
+ int cg, error;
+
+ *vpp = NULL;
+ pip = VTOI(pvp);
+ fs = pip->i_e2fs;
+ if (fs->e2fs.e2fs_ficount == 0)
+ goto noinodes;
+
+ if ((mode & IFMT) == IFDIR)
+ cg = ext2fs_dirpref(fs);
+ else
+ cg = ino_to_cg(fs, pip->i_number);
+ ipref = cg * fs->e2fs.e2fs_ipg + 1;
+ ino = (ino_t)ext2fs_hashalloc(pip, cg, (long)ipref, mode, ext2fs_nodealloccg);
+ if (ino == 0)
+ goto noinodes;
+ error = VFS_VGET(pvp->v_mount, ino, vpp);
+ if (error) {
+ ext2fs_vfree(pvp, ino, mode);
+ return (error);
+ }
+ ip = VTOI(*vpp);
+ if (ip->i_e2fs_mode && ip->i_e2fs_nlink != 0) {
+ printf("mode = 0%o, nlinks %d, inum = %llu, fs = %s\n",
+ ip->i_e2fs_mode, ip->i_e2fs_nlink,
+ (unsigned long long)ip->i_number, fs->e2fs_fsmnt);
+ panic("ext2fs_valloc: dup alloc");
+ }
+
+ memset(ip->i_din.e2fs_din, 0, sizeof(struct ext2fs_dinode));
+
+ /*
+ * Set up a new generation number for this inode.
+ */
+ if (++ext2gennumber < time_second)
+ ext2gennumber = time_second;
+ ip->i_e2fs_gen = ext2gennumber;
+ return (0);
+noinodes:
+ ext2fs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes");
+ uprintf("\n%s: create/symlink failed, no inodes free\n", fs->e2fs_fsmnt);
+ return (ENOSPC);
+}
+
+/*
+ * Find a cylinder to place a directory.
+ *
+ * The policy implemented by this algorithm is to select from
+ * among those cylinder groups with at least the average number of
+ * free inodes, the one with the most free blocks.
+ */
+static u_long
+ext2fs_dirpref(struct m_ext2fs *fs)
+{
+ int cg, maxspace, mincg, avgifree;
+
+ avgifree = fs->e2fs.e2fs_ficount / fs->e2fs_ncg;
+ maxspace = 0;
+ mincg = -1;
+ for (cg = 0; cg < fs->e2fs_ncg; cg++)
+ if ( fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree) {
+ if (mincg == -1 || fs->e2fs_gd[cg].ext2bgd_nbfree > maxspace) {
+ mincg = cg;
+ maxspace = fs->e2fs_gd[cg].ext2bgd_nbfree;
+ }
+ }
+ return mincg;
+}
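+
+/*
+ * Worked example (hypothetical group counts): with three cylinder groups
+ * whose free inode counts are 10, 50 and 40 (average 33) and whose free
+ * block counts are 200, 100 and 300, only the last two groups pass the free
+ * inode test, and the third group wins because it has the most free blocks.
+ */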
+
+/*
+ * Select the desired position for the next block in a file. The file is
+ * logically divided into sections. The first section is composed of the
+ * direct blocks. Each additional section contains fs_maxbpg blocks.
+ *
+ * If no blocks have been allocated in the first section, the policy is to
+ * request a block in the same cylinder group as the inode that describes
+ * the file. Otherwise, the policy is to try to allocate the blocks
+ * contiguously. The two fields of the ext2 inode extension (see
+ * ufs/ufs/inode.h) help with this.
+ */
+daddr_t
+ext2fs_blkpref(struct inode *ip, daddr_t lbn, int indx,
+ int32_t *bap /* XXX ondisk32 */)
+{
+ struct m_ext2fs *fs;
+ int cg, i;
+
+ fs = ip->i_e2fs;
+ /*
+ * if we are doing contiguous lbn allocation, try to alloc blocks
+ * contiguously on disk
+ */
+
+ if ( ip->i_e2fs_last_blk && lbn == ip->i_e2fs_last_lblk + 1) {
+ return ip->i_e2fs_last_blk + 1;
+ }
+
+ /*
+ * bap, if provided, gives us a list of blocks to which we want to
+ * stay close
+ */
+
+ if (bap) {
+ for (i = indx; i >= 0 ; i--) {
+ if (bap[i]) {
+ return fs2h32(bap[i]) + 1;
+ }
+ }
+ }
+
+ /* fall back to the first block of the cylinder containing the inode */
+
+ cg = ino_to_cg(fs, ip->i_number);
+ return fs->e2fs.e2fs_bpg * cg + fs->e2fs.e2fs_first_dblock + 1;
+}
+
+/*
+ * Implement the cylinder overflow algorithm.
+ *
+ * The policy implemented by this algorithm is:
+ * 1) allocate the block in its requested cylinder group.
+ * 2) quadratically rehash on the cylinder group number.
+ * 3) brute force search for a free block.
+ */
+static u_long
+ext2fs_hashalloc(struct inode *ip, int cg, long pref, int size,
+ daddr_t (*allocator)(struct inode *, int, daddr_t, int))
+{
+ struct m_ext2fs *fs;
+ long result;
+ int i, icg = cg;
+
+ fs = ip->i_e2fs;
+ /*
+ * 1: preferred cylinder group
+ */
+ result = (*allocator)(ip, cg, pref, size);
+ if (result)
+ return (result);
+ /*
+ * 2: quadratic rehash
+ */
+ for (i = 1; i < fs->e2fs_ncg; i *= 2) {
+ cg += i;
+ if (cg >= fs->e2fs_ncg)
+ cg -= fs->e2fs_ncg;
+ result = (*allocator)(ip, cg, 0, size);
+ if (result)
+ return (result);
+ }
+ /*
+ * 3: brute force search
+ * Note that we start at i == 2, since 0 was checked initially,
+ * and 1 is always checked in the quadratic rehash.
+ */
+ cg = (icg + 2) % fs->e2fs_ncg;
+ for (i = 2; i < fs->e2fs_ncg; i++) {
+ result = (*allocator)(ip, cg, 0, size);
+ if (result)
+ return (result);
+ cg++;
+ if (cg == fs->e2fs_ncg)
+ cg = 0;
+ }
+ return (0);
+}
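+
+/*
+ * Worked example (hypothetical values): with e2fs_ncg = 16 and a preferred
+ * cylinder group of 5, the quadratic rehash above probes groups 5, 6, 8, 12
+ * and 4 (offsets 1, 3, 7 and 15 from the preferred group), and the brute
+ * force pass then walks the remaining groups sequentially starting at group
+ * (5 + 2) % 16 = 7.
+ */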
+
+/*
+ * Determine whether a block can be allocated.
+ *
+ * Check to see if a block of the appropriate size is available,
+ * and if it is, allocate it.
+ */
+
+static daddr_t
+ext2fs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
+{
+ struct m_ext2fs *fs;
+ char *bbp;
+ struct buf *bp;
+ /* XXX ondisk32 */
+ int error, bno, start, end, loc;
+
+ fs = ip->i_e2fs;
+ if (fs->e2fs_gd[cg].ext2bgd_nbfree == 0)
+ return (0);
+ error = bread(ip->i_devvp, fsbtodb(fs,
+ fs->e2fs_gd[cg].ext2bgd_b_bitmap),
+ (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (0);
+ }
+ bbp = (char *)bp->b_data;
+
+ if (dtog(fs, bpref) != cg)
+ bpref = 0;
+ if (bpref != 0) {
+ bpref = dtogd(fs, bpref);
+ /*
+ * if the requested block is available, use it
+ */
+ if (isclr(bbp, bpref)) {
+ bno = bpref;
+ goto gotit;
+ }
+ }
+ /*
+ * no blocks in the requested cylinder, so take next
+ * available one in this cylinder group.
+ * first try to get 8 contiguous blocks, then fall back to a single
+ * block.
+ */
+ if (bpref)
+ start = dtogd(fs, bpref) / NBBY;
+ else
+ start = 0;
+ end = howmany(fs->e2fs.e2fs_fpg, NBBY) - start;
+ for (loc = start; loc < end; loc++) {
+ if (bbp[loc] == 0) {
+ bno = loc * NBBY;
+ goto gotit;
+ }
+ }
+ for (loc = 0; loc < start; loc++) {
+ if (bbp[loc] == 0) {
+ bno = loc * NBBY;
+ goto gotit;
+ }
+ }
+
+ bno = ext2fs_mapsearch(fs, bbp, bpref);
+ if (bno < 0)
+ return (0);
+gotit:
+#ifdef DIAGNOSTIC
+ if (isset(bbp, (daddr_t)bno)) {
+ printf("ext2fs_alloccgblk: cg=%d bno=%d fs=%s\n",
+ cg, bno, fs->e2fs_fsmnt);
+ panic("ext2fs_alloccg: dup alloc");
+ }
+#endif
+ setbit(bbp, (daddr_t)bno);
+ fs->e2fs.e2fs_fbcount--;
+ fs->e2fs_gd[cg].ext2bgd_nbfree--;
+ fs->e2fs_fmod = 1;
+ bdwrite(bp);
+ return (cg * fs->e2fs.e2fs_fpg + fs->e2fs.e2fs_first_dblock + bno);
+}
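+
+/*
+ * Worked example (hypothetical bitmap byte): the byte scan above looks for a
+ * bitmap byte that is completely zero, i.e. eight adjacent free blocks. If
+ * bbp[5] == 0 the allocation starts at bno = 5 * NBBY = 40, leaving blocks
+ * 41..47 free right behind it for later contiguous allocations.
+ */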
+
+/*
+ * Determine whether an inode can be allocated.
+ *
+ * Check to see if an inode is available, and if it is,
+ * allocate it using the following policy:
+ * 1) allocate the requested inode.
+ * 2) allocate the next available inode after the requested
+ * inode in the specified cylinder group.
+ */
+static daddr_t
+ext2fs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
+{
+ struct m_ext2fs *fs;
+ char *ibp;
+ struct buf *bp;
+ int error, start, len, loc, map, i;
+
+ ipref--; /* to avoid a lot of (ipref -1) */
+ if (ipref == -1)
+ ipref = 0;
+ fs = ip->i_e2fs;
+ if (fs->e2fs_gd[cg].ext2bgd_nifree == 0)
+ return (0);
+ error = bread(ip->i_devvp, fsbtodb(fs,
+ fs->e2fs_gd[cg].ext2bgd_i_bitmap),
+ (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (0);
+ }
+ ibp = (char *)bp->b_data;
+ if (ipref) {
+ ipref %= fs->e2fs.e2fs_ipg;
+ if (isclr(ibp, ipref))
+ goto gotit;
+ }
+ start = ipref / NBBY;
+ len = howmany(fs->e2fs.e2fs_ipg - ipref, NBBY);
+ loc = skpc(0xff, len, &ibp[start]);
+ if (loc == 0) {
+ len = start + 1;
+ start = 0;
+ loc = skpc(0xff, len, &ibp[0]);
+ if (loc == 0) {
+ printf("cg = %d, ipref = %lld, fs = %s\n",
+ cg, (long long)ipref, fs->e2fs_fsmnt);
+ panic("ext2fs_nodealloccg: map corrupted");
+ /* NOTREACHED */
+ }
+ }
+ i = start + len - loc;
+ map = ibp[i] ^ 0xff;
+ if (map == 0) {
+ printf("fs = %s\n", fs->e2fs_fsmnt);
+ panic("ext2fs_nodealloccg: block not in map");
+ }
+ ipref = i * NBBY + ffs(map) - 1;
+gotit:
+ setbit(ibp, ipref);
+ fs->e2fs.e2fs_ficount--;
+ fs->e2fs_gd[cg].ext2bgd_nifree--;
+ fs->e2fs_fmod = 1;
+ if ((mode & IFMT) == IFDIR) {
+ fs->e2fs_gd[cg].ext2bgd_ndirs++;
+ }
+ bdwrite(bp);
+ return (cg * fs->e2fs.e2fs_ipg + ipref +1);
+}
+
+/*
+ * Free a block.
+ *
+ * The specified block is placed back in the
+ * free map.
+ */
+void
+ext2fs_blkfree(struct inode *ip, daddr_t bno)
+{
+ struct m_ext2fs *fs;
+ char *bbp;
+ struct buf *bp;
+ int error, cg;
+
+ fs = ip->i_e2fs;
+ cg = dtog(fs, bno);
+ if ((u_int)bno >= fs->e2fs.e2fs_bcount) {
+ printf("bad block %lld, ino %llu\n", (long long)bno,
+ (unsigned long long)ip->i_number);
+ ext2fs_fserr(fs, ip->i_uid, "bad block");
+ return;
+ }
+ error = bread(ip->i_devvp,
+ fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_b_bitmap),
+ (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return;
+ }
+ bbp = (char *)bp->b_data;
+ bno = dtogd(fs, bno);
+ if (isclr(bbp, bno)) {
+ printf("dev = 0x%llx, block = %lld, fs = %s\n",
+ (unsigned long long)ip->i_dev, (long long)bno,
+ fs->e2fs_fsmnt);
+ panic("blkfree: freeing free block");
+ }
+ clrbit(bbp, bno);
+ fs->e2fs.e2fs_fbcount++;
+ fs->e2fs_gd[cg].ext2bgd_nbfree++;
+
+ fs->e2fs_fmod = 1;
+ bdwrite(bp);
+}
+
+/*
+ * Free an inode.
+ *
+ * The specified inode is placed back in the free map.
+ */
+int
+ext2fs_vfree(struct vnode *pvp, ino_t ino, int mode)
+{
+ struct m_ext2fs *fs;
+ char *ibp;
+ struct inode *pip;
+ struct buf *bp;
+ int error, cg;
+
+ pip = VTOI(pvp);
+ fs = pip->i_e2fs;
+ if ((u_int)ino > fs->e2fs.e2fs_icount || (u_int)ino < EXT2_FIRSTINO)
+ panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s",
+ (unsigned long long)pip->i_dev, (unsigned long long)ino,
+ fs->e2fs_fsmnt);
+ cg = ino_to_cg(fs, ino);
+ error = bread(pip->i_devvp,
+ fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_i_bitmap),
+ (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (0);
+ }
+ ibp = (char *)bp->b_data;
+ ino = (ino - 1) % fs->e2fs.e2fs_ipg;
+ if (isclr(ibp, ino)) {
+ printf("dev = 0x%llx, ino = %llu, fs = %s\n",
+ (unsigned long long)pip->i_dev,
+ (unsigned long long)ino, fs->e2fs_fsmnt);
+ if (fs->e2fs_ronly == 0)
+ panic("ifree: freeing free inode");
+ }
+ clrbit(ibp, ino);
+ fs->e2fs.e2fs_ficount++;
+ fs->e2fs_gd[cg].ext2bgd_nifree++;
+ if ((mode & IFMT) == IFDIR) {
+ fs->e2fs_gd[cg].ext2bgd_ndirs--;
+ }
+ fs->e2fs_fmod = 1;
+ bdwrite(bp);
+ return (0);
+}
+
+/*
+ * Find a block in the specified cylinder group.
+ *
+ * It is a panic if a request is made to find a block if none are
+ * available.
+ */
+
+static daddr_t
+ext2fs_mapsearch(struct m_ext2fs *fs, char *bbp, daddr_t bpref)
+{
+ int start, len, loc, i, map;
+
+ /*
+ * find the fragment by searching through the free block
+ * map for an appropriate bit pattern
+ */
+ if (bpref)
+ start = dtogd(fs, bpref) / NBBY;
+ else
+ start = 0;
+ len = howmany(fs->e2fs.e2fs_fpg, NBBY) - start;
+ loc = skpc(0xff, len, &bbp[start]);
+ if (loc == 0) {
+ len = start + 1;
+ start = 0;
+ loc = skpc(0xff, len, &bbp[start]);
+ if (loc == 0) {
+ printf("start = %d, len = %d, fs = %s\n",
+ start, len, fs->e2fs_fsmnt);
+ panic("ext2fs_alloccg: map corrupted");
+ /* NOTREACHED */
+ }
+ }
+ i = start + len - loc;
+ map = bbp[i] ^ 0xff;
+ if (map == 0) {
+ printf("fs = %s\n", fs->e2fs_fsmnt);
+ panic("ext2fs_mapsearch: block not in map");
+ }
+ return i * NBBY + ffs(map) - 1;
+}
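+
+/*
+ * Worked example (hypothetical bitmap contents): skpc() skips bytes that are
+ * entirely 0xff (no free block) and ffs() locates the first zero bit in the
+ * first byte that has one. If the scan stops at byte i == 12 with
+ * bbp[12] == 0xdf (bit 5 clear), then map == 0x20, ffs(map) == 6 and the
+ * function returns 12 * NBBY + 6 - 1 == 101.
+ */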
+
+/*
+ * Fserr prints the name of a file system with an error diagnostic.
+ *
+ * The form of the error message is:
+ * fs: error message
+ */
+static void
+ext2fs_fserr(struct m_ext2fs *fs, u_int uid, const char *cp)
+{
+
+ log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->e2fs_fsmnt, cp);
+}
--- /dev/null
+/* $NetBSD: ext2fs_balloc.c,v 1.34 2009/10/19 18:41:17 bouyer Exp $ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_balloc.c 8.4 (Berkeley) 9/23/93
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @(#)ffs_balloc.c 8.4 (Berkeley) 9/23/93
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_balloc.c,v 1.34 2009/10/19 18:41:17 bouyer Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_uvmhist.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/file.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kauth.h>
+
+#include <uvm/uvm.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+/*
+ * Balloc defines the structure of file system storage
+ * by allocating the physical blocks on a device given
+ * the inode and the logical block number in a file.
+ */
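+/*
+ * Illustrative note, assuming a 4 KiB block size (so NINDIR(fs) == 1024)
+ * and NDADDR == 12 direct pointers: a write to logical block 5 allocates
+ * a block and stores its address directly in i_e2fs_blocks[5], while a
+ * write to logical block 500 must first make sure the single indirect
+ * block exists (allocating and zeroing it if necessary) and then store
+ * the new data block's address in that indirect block, which is what the
+ * code below does level by level.
+ */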
+int
+ext2fs_balloc(struct inode *ip, daddr_t bn, int size,
+ kauth_cred_t cred, struct buf **bpp, int flags)
+{
+ struct m_ext2fs *fs;
+ daddr_t nb;
+ struct buf *bp, *nbp;
+ struct vnode *vp = ITOV(ip);
+ struct indir indirs[NIADDR + 2];
+ daddr_t newb, lbn, pref;
+ int32_t *bap; /* XXX ondisk32 */
+ int num, i, error;
+ u_int deallocated;
+ daddr_t *blkp, *allocblk, allociblk[NIADDR + 1];
+ int32_t *allocib; /* XXX ondisk32 */
+ int unwindidx = -1;
+ UVMHIST_FUNC("ext2fs_balloc"); UVMHIST_CALLED(ubchist);
+
+ UVMHIST_LOG(ubchist, "bn 0x%x", bn,0,0,0);
+
+ if (bpp != NULL) {
+ *bpp = NULL;
+ }
+ if (bn < 0)
+ return (EFBIG);
+ fs = ip->i_e2fs;
+ lbn = bn;
+
+ /*
+ * The first NDADDR blocks are direct blocks
+ */
+ if (bn < NDADDR) {
+ /* XXX ondisk32 */
+ nb = fs2h32(ip->i_e2fs_blocks[bn]);
+ if (nb != 0) {
+
+ /*
+ * the block is already allocated, just read it.
+ */
+
+ if (bpp != NULL) {
+ error = bread(vp, bn, fs->e2fs_bsize, NOCRED,
+ B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+ *bpp = bp;
+ }
+ return (0);
+ }
+
+ /*
+ * allocate a new direct block.
+ */
+
+ error = ext2fs_alloc(ip, bn,
+ ext2fs_blkpref(ip, bn, bn, &ip->i_e2fs_blocks[0]),
+ cred, &newb);
+ if (error)
+ return (error);
+ ip->i_e2fs_last_lblk = lbn;
+ ip->i_e2fs_last_blk = newb;
+ /* XXX ondisk32 */
+ ip->i_e2fs_blocks[bn] = h2fs32((int32_t)newb);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (bpp != NULL) {
+ bp = getblk(vp, bn, fs->e2fs_bsize, 0, 0);
+ bp->b_blkno = fsbtodb(fs, newb);
+ if (flags & B_CLRBUF)
+ clrbuf(bp);
+ *bpp = bp;
+ }
+ return (0);
+ }
+ /*
+ * Determine the number of levels of indirection.
+ */
+ pref = 0;
+ if ((error = ufs_getlbns(vp, bn, indirs, &num)) != 0)
+ return(error);
+#ifdef DIAGNOSTIC
+ if (num < 1)
+ panic ("ext2fs_balloc: ufs_getlbns returned indirect block\n");
+#endif
+ /*
+ * Fetch the first indirect block allocating if necessary.
+ */
+ --num;
+ /* XXX ondisk32 */
+ nb = fs2h32(ip->i_e2fs_blocks[NDADDR + indirs[0].in_off]);
+ allocib = NULL;
+ allocblk = allociblk;
+ if (nb == 0) {
+ pref = ext2fs_blkpref(ip, lbn, 0, (int32_t *)0);
+ error = ext2fs_alloc(ip, lbn, pref, cred, &newb);
+ if (error)
+ return (error);
+ nb = newb;
+ *allocblk++ = nb;
+ ip->i_e2fs_last_blk = newb;
+ bp = getblk(vp, indirs[1].in_lbn, fs->e2fs_bsize, 0, 0);
+ bp->b_blkno = fsbtodb(fs, newb);
+ clrbuf(bp);
+ /*
+ * Write synchronously so that indirect blocks
+ * never point at garbage.
+ */
+ if ((error = bwrite(bp)) != 0)
+ goto fail;
+ unwindidx = 0;
+ allocib = &ip->i_e2fs_blocks[NDADDR + indirs[0].in_off];
+ /* XXX ondisk32 */
+ *allocib = h2fs32((int32_t)newb);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ }
+ /*
+ * Fetch through the indirect blocks, allocating as necessary.
+ */
+ for (i = 1;;) {
+ error = bread(vp,
+ indirs[i].in_lbn, (int)fs->e2fs_bsize, NOCRED, 0, &bp);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ bap = (int32_t *)bp->b_data; /* XXX ondisk32 */
+ nb = fs2h32(bap[indirs[i].in_off]);
+ if (i == num)
+ break;
+ i++;
+ if (nb != 0) {
+ brelse(bp, 0);
+ continue;
+ }
+ pref = ext2fs_blkpref(ip, lbn, 0, (int32_t *)0);
+ error = ext2fs_alloc(ip, lbn, pref, cred, &newb);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ nb = newb;
+ *allocblk++ = nb;
+ ip->i_e2fs_last_blk = newb;
+ nbp = getblk(vp, indirs[i].in_lbn, fs->e2fs_bsize, 0, 0);
+ nbp->b_blkno = fsbtodb(fs, nb);
+ clrbuf(nbp);
+ /*
+ * Write synchronously so that indirect blocks
+ * never point at garbage.
+ */
+ if ((error = bwrite(nbp)) != 0) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ if (unwindidx < 0)
+ unwindidx = i - 1;
+ /* XXX ondisk32 */
+ bap[indirs[i - 1].in_off] = h2fs32((int32_t)nb);
+ /*
+ * If required, write synchronously, otherwise use
+ * delayed write.
+ */
+ if (flags & B_SYNC) {
+ bwrite(bp);
+ } else {
+ bdwrite(bp);
+ }
+ }
+ /*
+ * Get the data block, allocating if necessary.
+ */
+ if (nb == 0) {
+ pref = ext2fs_blkpref(ip, lbn, indirs[num].in_off, &bap[0]);
+ error = ext2fs_alloc(ip, lbn, pref, cred, &newb);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ nb = newb;
+ *allocblk++ = nb;
+ ip->i_e2fs_last_lblk = lbn;
+ ip->i_e2fs_last_blk = newb;
+ /* XXX ondisk32 */
+ bap[indirs[num].in_off] = h2fs32((int32_t)nb);
+ /*
+ * If required, write synchronously, otherwise use
+ * delayed write.
+ */
+ if (flags & B_SYNC) {
+ bwrite(bp);
+ } else {
+ bdwrite(bp);
+ }
+ if (bpp != NULL) {
+ nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0);
+ nbp->b_blkno = fsbtodb(fs, nb);
+ if (flags & B_CLRBUF)
+ clrbuf(nbp);
+ *bpp = nbp;
+ }
+ return (0);
+ }
+ brelse(bp, 0);
+ if (bpp != NULL) {
+ if (flags & B_CLRBUF) {
+ error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED,
+ B_MODIFY, &nbp);
+ if (error) {
+ brelse(nbp, 0);
+ goto fail;
+ }
+ } else {
+ nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0);
+ nbp->b_blkno = fsbtodb(fs, nb);
+ }
+ *bpp = nbp;
+ }
+ return (0);
+fail:
+ /*
+ * If we have failed part way through block allocation, we
+ * have to deallocate any indirect blocks that we have allocated.
+ */
+ for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
+ ext2fs_blkfree(ip, *blkp);
+ deallocated += fs->e2fs_bsize;
+ }
+ if (unwindidx >= 0) {
+ if (unwindidx == 0) {
+ *allocib = 0;
+ } else {
+ int r;
+
+ r = bread(vp, indirs[unwindidx].in_lbn,
+ (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp);
+ if (r) {
+ panic("Could not unwind indirect block, error %d", r);
+ brelse(bp, 0);
+ } else {
+ bap = (int32_t *)bp->b_data; /* XXX ondisk32 */
+ bap[indirs[unwindidx].in_off] = 0;
+ if (flags & B_SYNC)
+ bwrite(bp);
+ else
+ bdwrite(bp);
+ }
+ }
+ for (i = unwindidx + 1; i <= num; i++) {
+ bp = getblk(vp, indirs[i].in_lbn, (int)fs->e2fs_bsize,
+ 0, 0);
+ brelse(bp, BC_INVAL);
+ }
+ }
+ if (deallocated) {
+ ip->i_e2fs_nblock -= btodb(deallocated);
+ ip->i_e2fs_flags |= IN_CHANGE | IN_UPDATE;
+ }
+ return error;
+}
+
+int
+ext2fs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags,
+ kauth_cred_t cred)
+{
+ struct inode *ip = VTOI(vp);
+ struct m_ext2fs *fs = ip->i_e2fs;
+ int error, delta, bshift, bsize;
+ UVMHIST_FUNC("ext2fs_gop_alloc"); UVMHIST_CALLED(ubchist);
+
+ bshift = fs->e2fs_bshift;
+ bsize = 1 << bshift;
+
+ delta = off & (bsize - 1);
+ off -= delta;
+ len += delta;
+
+ while (len > 0) {
+ bsize = min(bsize, len);
+ UVMHIST_LOG(ubchist, "off 0x%x len 0x%x bsize 0x%x",
+ off, len, bsize, 0);
+
+ error = ext2fs_balloc(ip, lblkno(fs, off), bsize, cred,
+ NULL, flags);
+ if (error) {
+ UVMHIST_LOG(ubchist, "error %d", error, 0,0,0);
+ return error;
+ }
+
+ /*
+ * increase file size now, ext2fs_balloc() requires that
+ * EOF be up-to-date before each call.
+ */
+
+ if (ext2fs_size(ip) < off + bsize) {
+ UVMHIST_LOG(ubchist, "old 0x%lx%8lx new 0x%lx%8lx",
+ /* Note that arguments are always cast to u_long. */
+ ext2fs_size(ip) >> 32,
+ ext2fs_size(ip) & 0xffffffff,
+ (off + bsize) >> 32,
+ (off + bsize) & 0xffffffff);
+ error = ext2fs_setsize(ip, off + bsize);
+ if (error) {
+ UVMHIST_LOG(ubchist, "error %d", error, 0,0,0);
+ return error;
+ }
+ }
+
+ off += bsize;
+ len -= bsize;
+ }
+ return 0;
+}
--- /dev/null
+/* $NetBSD: ext2fs_bmap.c,v 1.25 2009/10/19 18:41:17 bouyer Exp $ */
+
+/*
+ * Copyright (c) 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_bmap.c,v 1.25 2009/10/19 18:41:17 bouyer Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/resourcevar.h>
+#include <sys/trace.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+static int ext2fs_bmaparray(struct vnode *, daddr_t, daddr_t *,
+ struct indir *, int *, int *);
+
+#define is_sequential(ump, a, b) ((b) == (a) + ump->um_seqinc)
+
+/*
+ * Bmap converts the logical block number of a file to its physical block
+ * number on the disk. The conversion is done by using the logical block
+ * number to index into the array of block pointers described by the dinode.
+ */
+int
+ext2fs_bmap(void *v)
+{
+ struct vop_bmap_args /* {
+ struct vnode *a_vp;
+ daddr_t a_bn;
+ struct vnode **a_vpp;
+ daddr_t *a_bnp;
+ int *a_runp;
+ } */ *ap = v;
+ /*
+ * Check for underlying vnode requests and ensure that logical
+ * to physical mapping is requested.
+ */
+ if (ap->a_vpp != NULL)
+ *ap->a_vpp = VTOI(ap->a_vp)->i_devvp;
+ if (ap->a_bnp == NULL)
+ return (0);
+
+ return (ext2fs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL,
+ ap->a_runp));
+}
+
+/*
+ * Indirect blocks are now on the vnode for the file. They are given negative
+ * logical block numbers. Indirect blocks are addressed by the negative
+ * address of the first data block to which they point. Double indirect blocks
+ * are addressed by one less than the address of the first indirect block to
+ * which they point. Triple indirect blocks are addressed by one less than
+ * the address of the first double indirect block to which they point.
+ *
+ * ext2fs_bmaparray does the bmap conversion, and if requested returns the
+ * array of logical blocks which must be traversed to get to a block.
+ * Each entry contains the offset into that block that gets you to the
+ * next block and the disk address of the block (if it is assigned).
+ */
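+/*
+ * Worked example, assuming a 4 KiB block size (NINDIR(fs) == 1024) and
+ * NDADDR == 12: the single indirect block covers data blocks 12..1035
+ * and is itself addressed as logical block -12; the double indirect
+ * block is addressed as -1037, one less than -1036 (the address of the
+ * first indirect block it points to); the triple indirect block follows
+ * the same rule one level further down.
+ */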
+
+int
+ext2fs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap,
+ int *nump, int *runp)
+{
+ struct inode *ip;
+ struct buf *bp, *cbp;
+ struct ufsmount *ump;
+ struct mount *mp;
+ struct indir a[NIADDR+1], *xap;
+ daddr_t daddr;
+ daddr_t metalbn;
+ int error, maxrun = 0, num;
+
+ ip = VTOI(vp);
+ mp = vp->v_mount;
+ ump = ip->i_ump;
+#ifdef DIAGNOSTIC
+ if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL))
+ panic("ext2fs_bmaparray: invalid arguments");
+#endif
+
+ if (runp) {
+ /*
+ * XXX
+ * If MAXBSIZE is the largest transfer the disks can handle,
+ * we probably want maxrun to be 1 block less so that we
+ * don't create a block larger than the device can handle.
+ */
+ *runp = 0;
+ maxrun = MAXBSIZE / mp->mnt_stat.f_iosize - 1;
+ }
+
+ if (bn >= 0 && bn < NDADDR) {
+ /* XXX ondisk32 */
+ *bnp = blkptrtodb(ump, fs2h32(ip->i_e2fs_blocks[bn]));
+ if (*bnp == 0)
+ *bnp = -1;
+ else if (runp)
+ /* XXX ondisk32 */
+ for (++bn; bn < NDADDR && *runp < maxrun &&
+ is_sequential(ump, (daddr_t)fs2h32(ip->i_e2fs_blocks[bn - 1]),
+ (daddr_t)fs2h32(ip->i_e2fs_blocks[bn]));
+ ++bn, ++*runp);
+ return (0);
+ }
+
+ xap = ap == NULL ? a : ap;
+ if (!nump)
+ nump = &num;
+ if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0)
+ return (error);
+
+ num = *nump;
+
+ /* Get disk address out of indirect block array */
+ /* XXX ondisk32 */
+ daddr = fs2h32(ip->i_e2fs_blocks[NDADDR + xap->in_off]);
+
+#ifdef DIAGNOSTIC
+ if (num > NIADDR + 1 || num < 1) {
+ printf("ext2fs_bmaparray: num=%d\n", num);
+ panic("ext2fs_bmaparray: num");
+ }
+#endif
+ for (bp = NULL, ++xap; --num; ++xap) {
+ /*
+ * Exit the loop if there is no disk address assigned yet and
+ * the indirect block isn't in the cache, or if we were
+ * looking for an indirect block and we've found it.
+ */
+
+ metalbn = xap->in_lbn;
+ if (metalbn == bn)
+ break;
+ if (daddr == 0) {
+ mutex_enter(&bufcache_lock);
+ cbp = incore(vp, metalbn);
+ mutex_exit(&bufcache_lock);
+ if (cbp == NULL)
+ break;
+ }
+ /*
+ * If we get here, we've either got the block in the cache
+ * or we have a disk address for it, go fetch it.
+ */
+ if (bp)
+ brelse(bp, 0);
+
+ xap->in_exists = 1;
+ bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0);
+ if (bp == NULL) {
+
+ /*
+ * getblk() above returns NULL only iff we are
+ * pagedaemon. See the implementation of getblk
+ * for detail.
+ */
+
+ return (ENOMEM);
+ }
+ if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
+ trace(TR_BREADHIT, pack(vp, size), metalbn);
+ }
+#ifdef DIAGNOSTIC
+ else if (!daddr)
+ panic("ext2fs_bmaparray: indirect block not in cache");
+#endif
+ else {
+ trace(TR_BREADMISS, pack(vp, size), metalbn);
+ bp->b_blkno = blkptrtodb(ump, daddr);
+ bp->b_flags |= B_READ;
+ VOP_STRATEGY(vp, bp);
+ curlwp->l_ru.ru_inblock++; /* XXX */
+ if ((error = biowait(bp)) != 0) {
+ brelse(bp, 0);
+ return (error);
+ }
+ }
+
+ /* XXX ondisk32 */
+ daddr = fs2h32(((int32_t *)bp->b_data)[xap->in_off]);
+ if (num == 1 && daddr && runp)
+ /* XXX ondisk32 */
+ for (bn = xap->in_off + 1;
+ bn < MNINDIR(ump) && *runp < maxrun &&
+ is_sequential(ump, ((int32_t *)bp->b_data)[bn - 1],
+ ((int32_t *)bp->b_data)[bn]);
+ ++bn, ++*runp);
+ }
+ if (bp)
+ brelse(bp, 0);
+
+ daddr = blkptrtodb(ump, daddr);
+ *bnp = daddr == 0 ? -1 : daddr;
+ return (0);
+}
--- /dev/null
+/* $NetBSD: ext2fs_bswap.c,v 1.16 2009/10/19 18:41:17 bouyer Exp $ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_bswap.c,v 1.16 2009/10/19 18:41:17 bouyer Exp $");
+
+#include <sys/types.h>
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_dinode.h>
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+/* These functions are only needed if the native byte order is not little endian */
+#if BYTE_ORDER == BIG_ENDIAN
+void
+e2fs_sb_bswap(struct ext2fs *old, struct ext2fs *new)
+{
+
+ /* preserve unused fields */
+ memcpy(new, old, sizeof(struct ext2fs));
+ new->e2fs_icount = bswap32(old->e2fs_icount);
+ new->e2fs_bcount = bswap32(old->e2fs_bcount);
+ new->e2fs_rbcount = bswap32(old->e2fs_rbcount);
+ new->e2fs_fbcount = bswap32(old->e2fs_fbcount);
+ new->e2fs_ficount = bswap32(old->e2fs_ficount);
+ new->e2fs_first_dblock = bswap32(old->e2fs_first_dblock);
+ new->e2fs_log_bsize = bswap32(old->e2fs_log_bsize);
+ new->e2fs_fsize = bswap32(old->e2fs_fsize);
+ new->e2fs_bpg = bswap32(old->e2fs_bpg);
+ new->e2fs_fpg = bswap32(old->e2fs_fpg);
+ new->e2fs_ipg = bswap32(old->e2fs_ipg);
+ new->e2fs_mtime = bswap32(old->e2fs_mtime);
+ new->e2fs_wtime = bswap32(old->e2fs_wtime);
+ new->e2fs_mnt_count = bswap16(old->e2fs_mnt_count);
+ new->e2fs_max_mnt_count = bswap16(old->e2fs_max_mnt_count);
+ new->e2fs_magic = bswap16(old->e2fs_magic);
+ new->e2fs_state = bswap16(old->e2fs_state);
+ new->e2fs_beh = bswap16(old->e2fs_beh);
+ new->e2fs_minrev = bswap16(old->e2fs_minrev);
+ new->e2fs_lastfsck = bswap32(old->e2fs_lastfsck);
+ new->e2fs_fsckintv = bswap32(old->e2fs_fsckintv);
+ new->e2fs_creator = bswap32(old->e2fs_creator);
+ new->e2fs_rev = bswap32(old->e2fs_rev);
+ new->e2fs_ruid = bswap16(old->e2fs_ruid);
+ new->e2fs_rgid = bswap16(old->e2fs_rgid);
+ new->e2fs_first_ino = bswap32(old->e2fs_first_ino);
+ new->e2fs_inode_size = bswap16(old->e2fs_inode_size);
+ new->e2fs_block_group_nr = bswap16(old->e2fs_block_group_nr);
+ new->e2fs_features_compat = bswap32(old->e2fs_features_compat);
+ new->e2fs_features_incompat = bswap32(old->e2fs_features_incompat);
+ new->e2fs_features_rocompat = bswap32(old->e2fs_features_rocompat);
+ new->e2fs_algo = bswap32(old->e2fs_algo);
+ new->e2fs_reserved_ngdb = bswap16(old->e2fs_reserved_ngdb);
+}
+
+void e2fs_cg_bswap(struct ext2_gd *old, struct ext2_gd *new, int size)
+{
+ int i;
+
+ for (i = 0; i < (size / (int)sizeof(struct ext2_gd)); i++) {
+ new[i].ext2bgd_b_bitmap = bswap32(old[i].ext2bgd_b_bitmap);
+ new[i].ext2bgd_i_bitmap = bswap32(old[i].ext2bgd_i_bitmap);
+ new[i].ext2bgd_i_tables = bswap32(old[i].ext2bgd_i_tables);
+ new[i].ext2bgd_nbfree = bswap16(old[i].ext2bgd_nbfree);
+ new[i].ext2bgd_nifree = bswap16(old[i].ext2bgd_nifree);
+ new[i].ext2bgd_ndirs = bswap16(old[i].ext2bgd_ndirs);
+ }
+}
+
+void e2fs_i_bswap(struct ext2fs_dinode *old, struct ext2fs_dinode *new)
+{
+
+ new->e2di_mode = bswap16(old->e2di_mode);
+ new->e2di_uid = bswap16(old->e2di_uid);
+ new->e2di_gid = bswap16(old->e2di_gid);
+ new->e2di_nlink = bswap16(old->e2di_nlink);
+ new->e2di_size = bswap32(old->e2di_size);
+ new->e2di_atime = bswap32(old->e2di_atime);
+ new->e2di_ctime = bswap32(old->e2di_ctime);
+ new->e2di_mtime = bswap32(old->e2di_mtime);
+ new->e2di_dtime = bswap32(old->e2di_dtime);
+ new->e2di_nblock = bswap32(old->e2di_nblock);
+ new->e2di_flags = bswap32(old->e2di_flags);
+ new->e2di_gen = bswap32(old->e2di_gen);
+ new->e2di_facl = bswap32(old->e2di_facl);
+ new->e2di_dacl = bswap32(old->e2di_dacl);
+ new->e2di_faddr = bswap32(old->e2di_faddr);
+ new->e2di_uid_high = bswap16(old->e2di_uid_high);
+ new->e2di_gid_high = bswap16(old->e2di_gid_high);
+ memcpy(&new->e2di_blocks[0], &old->e2di_blocks[0],
+ (NDADDR + NIADDR) * sizeof(uint32_t));
+}
+#endif
--- /dev/null
+/* $NetBSD: ext2fs_inode.c,v 1.74 2011/06/16 09:21:03 hannken Exp $ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_inode.c 8.8 (Berkeley) 10/19/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @(#)ffs_inode.c 8.8 (Berkeley) 10/19/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_inode.c,v 1.74 2011/06/16 09:21:03 hannken Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/file.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/trace.h>
+#include <sys/resourcevar.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+extern int prtactive;
+
+static int ext2fs_indirtrunc(struct inode *, daddr_t, daddr_t,
+ daddr_t, int, long *);
+
+/*
+ * Get the size of an inode.
+ */
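+/*
+ * Illustrative note: for regular files ext2 keeps the high 32 bits of the
+ * size in the i_e2fs_dacl field, so (for example) a 6 GiB file stores 0x1
+ * in i_e2fs_dacl and 0x80000000 in i_e2fs_size; ext2fs_setsize() below is
+ * the writer side of the same split and marks the filesystem with the
+ * LARGEFILE read-only-compatible feature when it is needed.
+ */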
+uint64_t
+ext2fs_size(struct inode *ip)
+{
+ uint64_t size = ip->i_e2fs_size;
+
+ if ((ip->i_e2fs_mode & IFMT) == IFREG)
+ size |= (uint64_t)ip->i_e2fs_dacl << 32;
+ return size;
+}
+
+int
+ext2fs_setsize(struct inode *ip, uint64_t size)
+{
+ if ((ip->i_e2fs_mode & IFMT) == IFREG ||
+ ip->i_e2fs_mode == 0) {
+ ip->i_e2fs_dacl = size >> 32;
+ if (size >= 0x80000000U) {
+ struct m_ext2fs *fs = ip->i_e2fs;
+
+ if (fs->e2fs.e2fs_rev <= E2FS_REV0) {
+ /* Linux automagically upgrades to REV1 here! */
+ return EFBIG;
+ }
+ if (!(fs->e2fs.e2fs_features_rocompat
+ & EXT2F_ROCOMPAT_LARGEFILE)) {
+ fs->e2fs.e2fs_features_rocompat |=
+ EXT2F_ROCOMPAT_LARGEFILE;
+ fs->e2fs_fmod = 1;
+ }
+ }
+ } else if (size >= 0x80000000U)
+ return EFBIG;
+
+ ip->i_e2fs_size = size;
+
+ return 0;
+}
+
+/*
+ * Last reference to an inode. If necessary, write or delete it.
+ */
+int
+ext2fs_inactive(void *v)
+{
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ bool *a_recycle;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ int error = 0;
+
+ if (prtactive && vp->v_usecount != 0)
+ vprint("ext2fs_inactive: pushing active", vp);
+ /* Get rid of inodes related to stale file handles. */
+ if (ip->i_e2fs_mode == 0 || ip->i_e2fs_dtime != 0)
+ goto out;
+
+ error = 0;
+ if (ip->i_e2fs_nlink == 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+ /* Defer final inode free and update to reclaim.*/
+ if (ext2fs_size(ip) != 0) {
+ error = ext2fs_truncate(vp, (off_t)0, 0, NOCRED);
+ }
+ ip->i_e2fs_dtime = time_second;
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ ip->i_omode = 1;
+ }
+ if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) {
+ ext2fs_update(vp, NULL, NULL, 0);
+ }
+out:
+ /*
+ * If we are done with the inode, reclaim it
+ * so that it can be reused immediately.
+ */
+ *ap->a_recycle = (ip->i_e2fs_dtime != 0);
+ VOP_UNLOCK(vp);
+ return (error);
+}
+
+
+/*
+ * Update the access, modified, and inode change times as specified by the
+ * IACCESS, IUPDATE, and ICHANGE flags respectively. The IMODIFIED flag is
+ * used to specify that the inode needs to be updated but that the times have
+ * already been set. The access and modified times are taken from the second
+ * and third parameters; the inode change time is always taken from the current
+ * time. If UPDATE_WAIT or UPDATE_DIROP is set, then wait for the disk
+ * write of the inode to complete.
+ */
+int
+ext2fs_update(struct vnode *vp, const struct timespec *acc,
+ const struct timespec *mod, int updflags)
+{
+ struct m_ext2fs *fs;
+ struct buf *bp;
+ struct inode *ip;
+ int error;
+ void *cp;
+ int flags;
+
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (0);
+ ip = VTOI(vp);
+ EXT2FS_ITIMES(ip, acc, mod, NULL);
+ if (updflags & UPDATE_CLOSE)
+ flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED);
+ else
+ flags = ip->i_flag & IN_MODIFIED;
+ if (flags == 0)
+ return (0);
+ fs = ip->i_e2fs;
+
+ error = bread(ip->i_devvp,
+ fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
+ (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+ ip->i_flag &= ~(IN_MODIFIED | IN_ACCESSED);
+ cp = (char *)bp->b_data +
+ (ino_to_fsbo(fs, ip->i_number) * EXT2_DINODE_SIZE(fs));
+ e2fs_isave(ip->i_din.e2fs_din, (struct ext2fs_dinode *)cp);
+ if ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) != 0 &&
+ (flags & IN_MODIFIED) != 0 &&
+ (vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
+ return (bwrite(bp));
+ else {
+ bdwrite(bp);
+ return (0);
+ }
+}
+
+#define SINGLE 0 /* index of single indirect block */
+#define DOUBLE 1 /* index of double indirect block */
+#define TRIPLE 2 /* index of triple indirect block */
+/*
+ * Truncate the inode oip to at most length size, freeing the
+ * disk blocks.
+ */
+int
+ext2fs_truncate(struct vnode *ovp, off_t length, int ioflag,
+ kauth_cred_t cred)
+{
+ daddr_t lastblock;
+ struct inode *oip = VTOI(ovp);
+ daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR];
+ /* XXX ondisk32 */
+ int32_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
+ struct m_ext2fs *fs;
+ int offset, size, level;
+ long count, blocksreleased = 0;
+ int i, nblocks;
+ int error, allerror = 0;
+ off_t osize;
+ int sync;
+ struct ufsmount *ump = oip->i_ump;
+
+ if (ovp->v_type == VCHR || ovp->v_type == VBLK ||
+ ovp->v_type == VFIFO || ovp->v_type == VSOCK) {
+ return 0;
+ }
+
+ if (length < 0)
+ return (EINVAL);
+
+ if (ovp->v_type == VLNK &&
+ (ext2fs_size(oip) < ump->um_maxsymlinklen ||
+ (ump->um_maxsymlinklen == 0 && oip->i_e2fs_nblock == 0))) {
+ KDASSERT(length == 0);
+ memset((char *)&oip->i_din.e2fs_din->e2di_shortlink, 0,
+ (u_int)ext2fs_size(oip));
+ (void)ext2fs_setsize(oip, 0);
+ oip->i_flag |= IN_CHANGE | IN_UPDATE;
+ return (ext2fs_update(ovp, NULL, NULL, 0));
+ }
+ if (ext2fs_size(oip) == length) {
+ /* still do a uvm_vnp_setsize() as writesize may be larger */
+ uvm_vnp_setsize(ovp, length);
+ oip->i_flag |= IN_CHANGE | IN_UPDATE;
+ return (ext2fs_update(ovp, NULL, NULL, 0));
+ }
+ fs = oip->i_e2fs;
+ if (length > ump->um_maxfilesize)
+ return (EFBIG);
+
+ osize = ext2fs_size(oip);
+
+ /*
+ * Lengthen the size of the file. We must ensure that the
+ * last byte of the file is allocated. Since the smallest
+ * value of osize is 0, length will be at least 1.
+ */
+ if (osize < length) {
+ uvm_vnp_setwritesize(ovp, length);
+ error = ufs_balloc_range(ovp, length - 1, 1, cred,
+ ioflag & IO_SYNC ? B_SYNC : 0);
+ if (error) {
+ (void) ext2fs_truncate(ovp, osize, ioflag & IO_SYNC,
+ cred);
+ return (error);
+ }
+ uvm_vnp_setsize(ovp, length);
+ oip->i_flag |= IN_CHANGE | IN_UPDATE;
+ KASSERT(error || ovp->v_size == ext2fs_size(oip));
+ return (ext2fs_update(ovp, NULL, NULL, 0));
+ }
+ /*
+ * Shorten the size of the file. If the file is not being
+ * truncated to a block boundary, the contents of the
+ * partial block following the end of the file must be
+ * zeroed in case it ever becomes accessible again because
+ * of subsequent file growth.
+ */
+ offset = blkoff(fs, length);
+ if (offset != 0) {
+ size = fs->e2fs_bsize;
+
+ /* XXXUBC we should handle more than just VREG */
+ ubc_zerorange(&ovp->v_uobj, length, size - offset,
+ UBC_UNMAP_FLAG(ovp));
+ }
+ (void)ext2fs_setsize(oip, length);
+ uvm_vnp_setsize(ovp, length);
+ /*
+ * Calculate index into inode's block list of
+ * last direct and indirect blocks (if any)
+ * which we want to keep. Lastblock is -1 when
+ * the file is truncated to 0.
+ */
+ lastblock = lblkno(fs, length + fs->e2fs_bsize - 1) - 1;
+ lastiblock[SINGLE] = lastblock - NDADDR;
+ lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
+ lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
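+ /*
+ * For example, truncating to length 0 with a 4 KiB block size gives
+ * lastblock == -1 and every lastiblock[] entry negative, so all
+ * direct and indirect blocks are candidates for release below;
+ * truncating to 64 KiB keeps lastblock == 15, so direct blocks 0..11
+ * and the first four entries of the single indirect block survive.
+ */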
+ nblocks = btodb(fs->e2fs_bsize);
+ /*
+ * Update file and block pointers on disk before we start freeing
+ * blocks. If we crash before free'ing blocks below, the blocks
+ * will be returned to the free list. lastiblock values are also
+ * normalized to -1 for calls to ext2fs_indirtrunc below.
+ */
+ memcpy((void *)oldblks, (void *)&oip->i_e2fs_blocks[0], sizeof oldblks);
+ sync = 0;
+ for (level = TRIPLE; level >= SINGLE; level--) {
+ if (lastiblock[level] < 0 && oldblks[NDADDR + level] != 0) {
+ sync = 1;
+ oip->i_e2fs_blocks[NDADDR + level] = 0;
+ lastiblock[level] = -1;
+ }
+ }
+ for (i = 0; i < NDADDR; i++) {
+ if (i > lastblock && oldblks[i] != 0) {
+ sync = 1;
+ oip->i_e2fs_blocks[i] = 0;
+ }
+ }
+ oip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (sync) {
+ error = ext2fs_update(ovp, NULL, NULL, UPDATE_WAIT);
+ if (error && !allerror)
+ allerror = error;
+ }
+
+ /*
+ * Having written the new inode to disk, save its new configuration
+ * and put back the old block pointers long enough to process them.
+ * Note that we save the new block configuration so we can check it
+ * when we are done.
+ */
+ memcpy((void *)newblks, (void *)&oip->i_e2fs_blocks[0], sizeof newblks);
+ memcpy((void *)&oip->i_e2fs_blocks[0], (void *)oldblks, sizeof oldblks);
+
+ (void)ext2fs_setsize(oip, osize);
+ error = vtruncbuf(ovp, lastblock + 1, 0, 0);
+ if (error && !allerror)
+ allerror = error;
+
+ /*
+ * Indirect blocks first.
+ */
+ indir_lbn[SINGLE] = -NDADDR;
+ indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) -1;
+ indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1;
+ for (level = TRIPLE; level >= SINGLE; level--) {
+ /* XXX ondisk32 */
+ bn = fs2h32(oip->i_e2fs_blocks[NDADDR + level]);
+ if (bn != 0) {
+ error = ext2fs_indirtrunc(oip, indir_lbn[level],
+ fsbtodb(fs, bn), lastiblock[level], level, &count);
+ if (error)
+ allerror = error;
+ blocksreleased += count;
+ if (lastiblock[level] < 0) {
+ oip->i_e2fs_blocks[NDADDR + level] = 0;
+ ext2fs_blkfree(oip, bn);
+ blocksreleased += nblocks;
+ }
+ }
+ if (lastiblock[level] >= 0)
+ goto done;
+ }
+
+ /*
+ * All whole direct blocks or frags.
+ */
+ for (i = NDADDR - 1; i > lastblock; i--) {
+ /* XXX ondisk32 */
+ bn = fs2h32(oip->i_e2fs_blocks[i]);
+ if (bn == 0)
+ continue;
+ oip->i_e2fs_blocks[i] = 0;
+ ext2fs_blkfree(oip, bn);
+ blocksreleased += btodb(fs->e2fs_bsize);
+ }
+
+done:
+#ifdef DIAGNOSTIC
+ for (level = SINGLE; level <= TRIPLE; level++)
+ if (newblks[NDADDR + level] !=
+ oip->i_e2fs_blocks[NDADDR + level])
+ panic("ext2fs_truncate1");
+ for (i = 0; i < NDADDR; i++)
+ if (newblks[i] != oip->i_e2fs_blocks[i])
+ panic("ext2fs_truncate2");
+ if (length == 0 &&
+ (!LIST_EMPTY(&ovp->v_cleanblkhd) ||
+ !LIST_EMPTY(&ovp->v_dirtyblkhd)))
+ panic("ext2fs_truncate3");
+#endif /* DIAGNOSTIC */
+ /*
+ * Put back the real size.
+ */
+ (void)ext2fs_setsize(oip, length);
+ oip->i_e2fs_nblock -= blocksreleased;
+ oip->i_flag |= IN_CHANGE;
+ KASSERT(ovp->v_type != VREG || ovp->v_size == ext2fs_size(oip));
+ return (allerror);
+}
+
+/*
+ * Release blocks associated with the inode ip and stored in the indirect
+ * block bn. Blocks are free'd in LIFO order up to (but not including)
+ * lastbn. If level is greater than SINGLE, the block is an indirect block
+ * and recursive calls to indirtrunc must be used to cleanse other indirect
+ * blocks.
+ *
+ * NB: triple indirect blocks are untested.
+ */
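+/*
+ * Illustrative note: when a file is truncated to zero, ext2fs_truncate()
+ * calls this with lastbn == -1 for each top-level indirect block; for the
+ * double indirect block the level > SINGLE branch below then recurses
+ * with lastbn == -1 on every single indirect block it references, so data
+ * blocks are released before the indirect blocks that pointed at them.
+ */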
+static int
+ext2fs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn,
+ int level, long *countp)
+{
+ int i;
+ struct buf *bp;
+ struct m_ext2fs *fs = ip->i_e2fs;
+ int32_t *bap; /* XXX ondisk32 */
+ struct vnode *vp;
+ daddr_t nb, nlbn, last;
+ int32_t *copy = NULL; /* XXX ondisk32 */
+ long blkcount, factor;
+ int nblocks, blocksreleased = 0;
+ int error = 0, allerror = 0;
+
+ /*
+ * Calculate index in current block of last
+ * block to be kept. -1 indicates the entire
+ * block so we need not calculate the index.
+ */
+ factor = 1;
+ for (i = SINGLE; i < level; i++)
+ factor *= NINDIR(fs);
+ last = lastbn;
+ if (lastbn > 0)
+ last /= factor;
+ nblocks = btodb(fs->e2fs_bsize);
+ /*
+ * Get buffer of block pointers, zero those entries corresponding
+ * to blocks to be freed, and update the on-disk copy first. Since
+ * double (triple) indirect blocks are processed before single (double)
+ * indirect blocks, calls to bmap on these blocks will fail. However,
+ * we already have the on-disk address, so we have to set the b_blkno
+ * field explicitly instead of letting bread do everything for us.
+ */
+ vp = ITOV(ip);
+ bp = getblk(vp, lbn, (int)fs->e2fs_bsize, 0, 0);
+ if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
+ /* Braces must be here in case trace evaluates to nothing. */
+ trace(TR_BREADHIT, pack(vp, fs->e2fs_bsize), lbn);
+ } else {
+ trace(TR_BREADMISS, pack(vp, fs->e2fs_bsize), lbn);
+ curlwp->l_ru.ru_inblock++; /* pay for read */
+ bp->b_flags |= B_READ;
+ if (bp->b_bcount > bp->b_bufsize)
+ panic("ext2fs_indirtrunc: bad buffer size");
+ bp->b_blkno = dbn;
+ VOP_STRATEGY(vp, bp);
+ error = biowait(bp);
+ }
+ if (error) {
+ brelse(bp, 0);
+ *countp = 0;
+ return (error);
+ }
+
+ bap = (int32_t *)bp->b_data; /* XXX ondisk32 */
+ if (lastbn >= 0) {
+ /* XXX ondisk32 */
+ copy = malloc(fs->e2fs_bsize, M_TEMP, M_WAITOK);
+ memcpy((void *)copy, (void *)bap, (u_int)fs->e2fs_bsize);
+ memset((void *)&bap[last + 1], 0,
+ (u_int)(NINDIR(fs) - (last + 1)) * sizeof (uint32_t));
+ error = bwrite(bp);
+ if (error)
+ allerror = error;
+ bap = copy;
+ }
+
+ /*
+ * Recursively free totally unused blocks.
+ */
+ for (i = NINDIR(fs) - 1,
+ nlbn = lbn + 1 - i * factor; i > last;
+ i--, nlbn += factor) {
+ /* XXX ondisk32 */
+ nb = fs2h32(bap[i]);
+ if (nb == 0)
+ continue;
+ if (level > SINGLE) {
+ error = ext2fs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
+ (daddr_t)-1, level - 1,
+ &blkcount);
+ if (error)
+ allerror = error;
+ blocksreleased += blkcount;
+ }
+ ext2fs_blkfree(ip, nb);
+ blocksreleased += nblocks;
+ }
+
+ /*
+ * Recursively free last partial block.
+ */
+ if (level > SINGLE && lastbn >= 0) {
+ last = lastbn % factor;
+ /* XXX ondisk32 */
+ nb = fs2h32(bap[i]);
+ if (nb != 0) {
+ error = ext2fs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
+ last, level - 1, &blkcount);
+ if (error)
+ allerror = error;
+ blocksreleased += blkcount;
+ }
+ }
+
+ if (copy != NULL) {
+ free(copy, M_TEMP);
+ } else {
+ brelse(bp, BC_INVAL);
+ }
+
+ *countp = blocksreleased;
+ return (allerror);
+}
--- /dev/null
+/* $NetBSD: ext2fs_lookup.c,v 1.66 2011/07/12 16:59:48 dholland Exp $ */
+
+/*
+ * Modified for NetBSD 1.2E
+ * May 1997, Manuel Bouyer
+ * Laboratoire d'informatique de Paris VI
+ */
+/*
+ * modified for Lites 1.1
+ *
+ * Aug 1995, Godmar Back (gback@cs.utah.edu)
+ * University of Utah, Department of Computer Science
+ */
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_lookup.c 8.6 (Berkeley) 4/1/94
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_lookup.c,v 1.66 2011/07/12 16:59:48 dholland Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/kauth.h>
+#include <sys/proc.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ext2fs/ext2fs_extern.h>
+#include <ufs/ext2fs/ext2fs_dir.h>
+#include <ufs/ext2fs/ext2fs.h>
+
+extern int dirchk;
+
+static void ext2fs_dirconv2ffs(struct ext2fs_direct *e2dir,
+ struct dirent *ffsdir);
+static int ext2fs_dirbadentry(struct vnode *dp,
+ struct ext2fs_direct *de,
+ int entryoffsetinblock);
+
+/*
+ * The problem tackled below is the fact that FFS includes the
+ * terminating zero on disk while EXT2FS doesn't; this implies that we
+ * need to introduce some padding.
+ * For instance, a filename "sbin" normally has a reclen of 12
+ * in EXT2, but 16 in FFS.
+ * This reminds me of that Pepsi commercial: 'Kid saved a lousy nine cents...'
+ * If it wasn't for that, the complete ufs code for directories would
+ * have worked w/o changes (except for the difference in DIRBLKSIZ)
+ */
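+/*
+ * The arithmetic behind the "sbin" example above: both formats use an
+ * 8-byte entry header followed by the name padded to a 4-byte boundary;
+ * ext2 stores the 4-byte name as-is, padded to 4 bytes (reclen 12), while
+ * the FFS on-disk entry stores the name with its terminating NUL, 5 bytes
+ * padded to 8 (reclen 16).
+ */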
+static void
+ext2fs_dirconv2ffs(struct ext2fs_direct *e2dir, struct dirent *ffsdir)
+{
+ memset(ffsdir, 0, sizeof(struct dirent));
+ ffsdir->d_fileno = fs2h32(e2dir->e2d_ino);
+ ffsdir->d_namlen = e2dir->e2d_namlen;
+
+ ffsdir->d_type = DT_UNKNOWN; /* don't know more here */
+#ifdef DIAGNOSTIC
+#if MAXNAMLEN < E2FS_MAXNAMLEN
+ /*
+ * we should handle this more gracefully !
+ */
+ if (e2dir->e2d_namlen > MAXNAMLEN)
+ panic("ext2fs: e2dir->e2d_namlen");
+#endif
+#endif
+ strncpy(ffsdir->d_name, e2dir->e2d_name, ffsdir->d_namlen);
+
+ /* Godmar thinks: since e2dir->e2d_reclen can be big and means
+ nothing anyway, we compute our own reclen according to what
+ we think is right
+ */
+ ffsdir->d_reclen = _DIRENT_SIZE(ffsdir);
+}
+
+/*
+ * Vnode op for reading directories.
+ *
+ * Convert the on-disk entries to <sys/dirent.h> entries.
+ * the problem is that the conversion will blow up some entries by four bytes,
+ * so it can't be done in place. This is too bad. Right now the conversion is
+ * done entry by entry; the converted entry is sent via uiomove.
+ *
+ * XXX allocate a buffer, convert as many entries as possible, then send
+ * the whole buffer to uiomove
+ */
+int
+ext2fs_readdir(void *v)
+{
+ struct vop_readdir_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ kauth_cred_t a_cred;
+ int **a_eofflag;
+ off_t **a_cookies;
+ int ncookies;
+ } */ *ap = v;
+ struct uio *uio = ap->a_uio;
+ int error;
+ size_t e2fs_count, readcnt;
+ struct vnode *vp = ap->a_vp;
+ struct m_ext2fs *fs = VTOI(vp)->i_e2fs;
+
+ struct ext2fs_direct *dp;
+ struct dirent *dstd;
+ struct uio auio;
+ struct iovec aiov;
+ void *dirbuf;
+ off_t off = uio->uio_offset;
+ off_t *cookies = NULL;
+ int nc = 0, ncookies = 0;
+ int e2d_reclen;
+
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+
+ e2fs_count = uio->uio_resid;
+ /* Make sure we don't return partial entries. */
+ e2fs_count -= (uio->uio_offset + e2fs_count) & (fs->e2fs_bsize -1);
+ if (e2fs_count <= 0)
+ return (EINVAL);
+
+ auio = *uio;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ aiov.iov_len = e2fs_count;
+ auio.uio_resid = e2fs_count;
+ UIO_SETUP_SYSSPACE(&auio);
+ dirbuf = malloc(e2fs_count, M_TEMP, M_WAITOK);
+ dstd = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK | M_ZERO);
+ if (ap->a_ncookies) {
+ nc = e2fs_count / _DIRENT_MINSIZE((struct dirent *)0);
+ ncookies = nc;
+ cookies = malloc(sizeof (off_t) * ncookies, M_TEMP, M_WAITOK);
+ *ap->a_cookies = cookies;
+ }
+ memset(dirbuf, 0, e2fs_count);
+ aiov.iov_base = dirbuf;
+
+ error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred);
+ if (error == 0) {
+ readcnt = e2fs_count - auio.uio_resid;
+ for (dp = (struct ext2fs_direct *)dirbuf;
+ (char *)dp < (char *)dirbuf + readcnt; ) {
+ e2d_reclen = fs2h16(dp->e2d_reclen);
+ if (e2d_reclen == 0) {
+ error = EIO;
+ break;
+ }
+ ext2fs_dirconv2ffs(dp, dstd);
+ if(dstd->d_reclen > uio->uio_resid) {
+ break;
+ }
+ error = uiomove(dstd, dstd->d_reclen, uio);
+ if (error != 0) {
+ break;
+ }
+ off = off + e2d_reclen;
+ if (cookies != NULL) {
+ *cookies++ = off;
+ if (--ncookies <= 0){
+ break; /* out of cookies */
+ }
+ }
+ /* advance dp */
+ dp = (struct ext2fs_direct *) ((char *)dp + e2d_reclen);
+ }
+ /* we need to correct uio_offset */
+ uio->uio_offset = off;
+ }
+ free(dirbuf, M_TEMP);
+ free(dstd, M_TEMP);
+ *ap->a_eofflag = ext2fs_size(VTOI(ap->a_vp)) <= uio->uio_offset;
+ if (ap->a_ncookies) {
+ if (error) {
+ free(*ap->a_cookies, M_TEMP);
+ *ap->a_ncookies = 0;
+ *ap->a_cookies = NULL;
+ } else
+ *ap->a_ncookies = nc - ncookies;
+ }
+ return (error);
+}
+
+/*
+ * Convert a component of a pathname into a pointer to a locked inode.
+ * This is a very central and rather complicated routine.
+ * If the file system is not maintained in a strict tree hierarchy,
+ * this can result in a deadlock situation (see comments in code below).
+ *
+ * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending
+ * on whether the name is to be looked up, created, renamed, or deleted.
+ * When CREATE, RENAME, or DELETE is specified, information usable in
+ * creating, renaming, or deleting a directory entry may be calculated.
+ * If flag has LOCKPARENT or'ed into it and the target of the pathname
+ * exists, lookup returns both the target and its parent directory locked.
+ * When creating or renaming and LOCKPARENT is specified, the target may
+ * not be ".". When deleting and LOCKPARENT is specified, the target may
+ * be ".", but the caller must check to ensure it does a vrele and vput
+ * instead of two vputs.
+ *
+ * Overall outline of ext2fs_lookup:
+ *
+ * check accessibility of directory
+ * look for name in cache, if found, then if at end of path
+ * and deleting or creating, drop it, else return name
+ * search for name in directory, to found or notfound
+ * notfound:
+ * if creating, return locked directory, leaving info on available slots
+ * else return error
+ * found:
+ * if at end of path and deleting, return information to allow delete
+ * if at end of path and rewriting (RENAME and LOCKPARENT), lock target
+ * inode and return info to allow rewrite
+ * if not at end, add name to cache; if at end and neither creating
+ * nor deleting, add name to cache
+ */
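+/*
+ * For example (hypothetical path), creating "/usr/tmp/foo" when "foo"
+ * does not yet exist takes the notfound: path below: a free or
+ * compactable slot is recorded in results->ulr_offset/ulr_count and
+ * EJUSTRETURN is returned so the caller can do a direnter() later.
+ */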
+int
+ext2fs_lookup(void *v)
+{
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap = v;
+ struct vnode *vdp = ap->a_dvp; /* vnode for directory being searched */
+ struct inode *dp = VTOI(vdp); /* inode for directory being searched */
+ struct buf *bp; /* a buffer of directory entries */
+ struct ext2fs_direct *ep; /* the current directory entry */
+ int entryoffsetinblock; /* offset of ep in bp's buffer */
+ enum {NONE, COMPACT, FOUND} slotstatus;
+ doff_t slotoffset; /* offset of area with free space */
+ int slotsize; /* size of area at slotoffset */
+ int slotfreespace; /* amount of space free in slot */
+ int slotneeded; /* size of the entry we're seeking */
+ int numdirpasses; /* strategy for directory search */
+ doff_t endsearch; /* offset to end directory search */
+ doff_t prevoff; /* prev entry dp->i_offset */
+ struct vnode *pdp; /* saved dp during symlink work */
+ struct vnode *tdp; /* returned by VFS_VGET */
+ doff_t enduseful; /* pointer past last used dir slot */
+ u_long bmask; /* block offset mask */
+ int namlen, error;
+ struct vnode **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ kauth_cred_t cred = cnp->cn_cred;
+ int flags;
+ int nameiop = cnp->cn_nameiop;
+ struct ufsmount *ump = dp->i_ump;
+ int dirblksiz = ump->um_dirblksiz;
+ ino_t foundino;
+ struct ufs_lookup_results *results;
+
+ flags = cnp->cn_flags;
+
+ bp = NULL;
+ slotoffset = -1;
+ *vpp = NULL;
+
+ /*
+ * Produce the auxiliary lookup results into i_crap. Increment
+ * its serial number so elsewhere we can tell if we're using
+ * stale results. This should not be done this way. XXX.
+ */
+ results = &dp->i_crap;
+ dp->i_crapcounter++;
+
+ /*
+ * Check accessibility of directory.
+ */
+ if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0)
+ return (error);
+
+ if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
+ return (EROFS);
+
+ /*
+ * We now have a segment name to search for, and a directory to search.
+ *
+ * Before tediously performing a linear scan of the directory,
+ * check the name cache to see if the directory/name pair
+ * we are looking for is known already.
+ */
+ if ((error = cache_lookup(vdp, vpp, cnp)) >= 0)
+ return (error);
+
+ /*
+ * Suppress search for slots unless creating
+ * file and at end of pathname, in which case
+ * we watch for a place to put the new file in
+ * case it doesn't already exist.
+ */
+ slotstatus = FOUND;
+ slotfreespace = slotsize = slotneeded = 0;
+ if ((nameiop == CREATE || nameiop == RENAME) &&
+ (flags & ISLASTCN)) {
+ slotstatus = NONE;
+ slotneeded = EXT2FS_DIRSIZ(cnp->cn_namelen);
+ }
+
+ /*
+ * If there is cached information on a previous search of
+ * this directory, pick up where we last left off.
+ * We cache only lookups as these are the most common
+ * and have the greatest payoff. Caching CREATE has little
+ * benefit as it usually must search the entire directory
+ * to determine that the entry does not exist. Caching the
+ * location of the last DELETE or RENAME has not reduced
+ * profiling time and hence has been removed in the interest
+ * of simplicity.
+ */
+ bmask = vdp->v_mount->mnt_stat.f_iosize - 1;
+ if (nameiop != LOOKUP || results->ulr_diroff == 0 ||
+ results->ulr_diroff >= ext2fs_size(dp)) {
+ entryoffsetinblock = 0;
+ results->ulr_offset = 0;
+ numdirpasses = 1;
+ } else {
+ results->ulr_offset = results->ulr_diroff;
+ if ((entryoffsetinblock = results->ulr_offset & bmask) &&
+ (error = ext2fs_blkatoff(vdp, (off_t)results->ulr_offset, NULL, &bp)))
+ return (error);
+ numdirpasses = 2;
+ nchstats.ncs_2passes++;
+ }
+ prevoff = results->ulr_offset;
+ endsearch = roundup(ext2fs_size(dp), dirblksiz);
+ enduseful = 0;
+
+searchloop:
+ while (results->ulr_offset < endsearch) {
+ if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD)
+ preempt();
+ /*
+ * If necessary, get the next directory block.
+ */
+ if ((results->ulr_offset & bmask) == 0) {
+ if (bp != NULL)
+ brelse(bp, 0);
+ error = ext2fs_blkatoff(vdp, (off_t)results->ulr_offset, NULL,
+ &bp);
+ if (error != 0)
+ return (error);
+ entryoffsetinblock = 0;
+ }
+ /*
+ * If still looking for a slot, and at a dirblksize
+ * boundary, have to start looking for free space again.
+ */
+ if (slotstatus == NONE &&
+ (entryoffsetinblock & (dirblksiz - 1)) == 0) {
+ slotoffset = -1;
+ slotfreespace = 0;
+ }
+ /*
+ * Get pointer to next entry.
+ * Full validation checks are slow, so we only check
+ * enough to ensure forward progress through the
+ * directory. Complete checks can be run by patching
+ * "dirchk" to be true.
+ */
+ KASSERT(bp != NULL);
+ ep = (struct ext2fs_direct *)
+ ((char *)bp->b_data + entryoffsetinblock);
+ if (ep->e2d_reclen == 0 ||
+ (dirchk &&
+ ext2fs_dirbadentry(vdp, ep, entryoffsetinblock))) {
+ int i;
+
+ ufs_dirbad(dp, results->ulr_offset, "mangled entry");
+ i = dirblksiz - (entryoffsetinblock & (dirblksiz - 1));
+ results->ulr_offset += i;
+ entryoffsetinblock += i;
+ continue;
+ }
+
+ /*
+ * If an appropriate sized slot has not yet been found,
+ * check to see if one is available. Also accumulate space
+ * in the current block so that we can determine if
+ * compaction is viable.
+ */
+ if (slotstatus != FOUND) {
+ int size = fs2h16(ep->e2d_reclen);
+
+ if (ep->e2d_ino != 0)
+ size -= EXT2FS_DIRSIZ(ep->e2d_namlen);
+ if (size > 0) {
+ if (size >= slotneeded) {
+ slotstatus = FOUND;
+ slotoffset = results->ulr_offset;
+ slotsize = fs2h16(ep->e2d_reclen);
+ } else if (slotstatus == NONE) {
+ slotfreespace += size;
+ if (slotoffset == -1)
+ slotoffset = results->ulr_offset;
+ if (slotfreespace >= slotneeded) {
+ slotstatus = COMPACT;
+ slotsize = results->ulr_offset +
+ fs2h16(ep->e2d_reclen) -
+ slotoffset;
+ }
+ }
+ }
+ }
+
+ /*
+ * Check for a name match.
+ */
+ if (ep->e2d_ino) {
+ namlen = ep->e2d_namlen;
+ if (namlen == cnp->cn_namelen &&
+ !memcmp(cnp->cn_nameptr, ep->e2d_name,
+ (unsigned)namlen)) {
+ /*
+ * Save directory entry's inode number and
+ * reclen in ndp->ni_ufs area, and release
+ * directory buffer.
+ */
+ foundino = fs2h32(ep->e2d_ino);
+ results->ulr_reclen = fs2h16(ep->e2d_reclen);
+ goto found;
+ }
+ }
+ prevoff = results->ulr_offset;
+ results->ulr_offset += fs2h16(ep->e2d_reclen);
+ entryoffsetinblock += fs2h16(ep->e2d_reclen);
+ if (ep->e2d_ino)
+ enduseful = results->ulr_offset;
+ }
+/* notfound: */
+ /*
+ * If we started in the middle of the directory and failed
+ * to find our target, we must check the beginning as well.
+ */
+ if (numdirpasses == 2) {
+ numdirpasses--;
+ results->ulr_offset = 0;
+ endsearch = results->ulr_diroff;
+ goto searchloop;
+ }
+ if (bp != NULL)
+ brelse(bp, 0);
+ /*
+ * If creating, and at end of pathname and current
+ * directory has not been removed, then can consider
+ * allowing file to be created.
+ */
+ if ((nameiop == CREATE || nameiop == RENAME) &&
+ (flags & ISLASTCN) && dp->i_e2fs_nlink != 0) {
+ /*
+ * Access for write is interpreted as allowing
+ * creation of files in the directory.
+ */
+ error = VOP_ACCESS(vdp, VWRITE, cred);
+ if (error)
+ return (error);
+ /*
+ * Return an indication of where the new directory
+ * entry should be put. If we didn't find a slot,
+ * then set results->ulr_count to 0 indicating
+ * that the new slot belongs at the end of the
+ * directory. If we found a slot, then the new entry
+ * can be put in the range from results->ulr_offset to
+ * results->ulr_offset + results->ulr_count.
+ */
+ if (slotstatus == NONE) {
+ results->ulr_offset = roundup(ext2fs_size(dp), dirblksiz);
+ results->ulr_count = 0;
+ enduseful = results->ulr_offset;
+ } else {
+ results->ulr_offset = slotoffset;
+ results->ulr_count = slotsize;
+ if (enduseful < slotoffset + slotsize)
+ enduseful = slotoffset + slotsize;
+ }
+ results->ulr_endoff = roundup(enduseful, dirblksiz);
+#if 0
+ dp->i_flag |= IN_CHANGE | IN_UPDATE;
+#endif
+ /*
+ * We return with the directory locked, so that
+ * the parameters we set up above will still be
+ * valid if we actually decide to do a direnter().
+ * We return ni_vp == NULL to indicate that the entry
+ * does not currently exist; we leave a pointer to
+ * the (locked) directory inode in ndp->ni_dvp.
+ *
+ * NB - if the directory is unlocked, then this
+ * information cannot be used.
+ */
+ return (EJUSTRETURN);
+ }
+ /*
+ * Insert name into cache (as non-existent) if appropriate.
+ */
+ if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
+ cache_enter(vdp, *vpp, cnp);
+ return (ENOENT);
+
+found:
+ if (numdirpasses == 2)
+ nchstats.ncs_pass2++;
+ /*
+ * Check that directory length properly reflects presence
+ * of this entry.
+ */
+ if (results->ulr_offset + EXT2FS_DIRSIZ(ep->e2d_namlen) > ext2fs_size(dp)) {
+ ufs_dirbad(dp, results->ulr_offset, "i_size too small");
+ error = ext2fs_setsize(dp,
+ results->ulr_offset + EXT2FS_DIRSIZ(ep->e2d_namlen));
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+ dp->i_flag |= IN_CHANGE | IN_UPDATE;
+ uvm_vnp_setsize(vdp, ext2fs_size(dp));
+ }
+ brelse(bp, 0);
+
+ /*
+ * Found component in pathname.
+ * If the final component of path name, save information
+ * in the cache as to where the entry was found.
+ */
+ if ((flags & ISLASTCN) && nameiop == LOOKUP)
+ results->ulr_diroff = results->ulr_offset &~ (dirblksiz - 1);
+
+ /*
+ * If deleting, and at end of pathname, return
+ * parameters which can be used to remove file.
+ * Lock the inode, being careful with ".".
+ */
+ if (nameiop == DELETE && (flags & ISLASTCN)) {
+ /*
+ * Write access to directory required to delete files.
+ */
+ if ((error = VOP_ACCESS(vdp, VWRITE, cred)) != 0)
+ return (error);
+ /*
+ * Return pointer to current entry in results->ulr_offset,
+ * and distance past previous entry (if there
+ * is a previous entry in this block) in results->ulr_count.
+ * Save directory inode pointer in ndp->ni_dvp for dirremove().
+ */
+ if ((results->ulr_offset & (dirblksiz - 1)) == 0)
+ results->ulr_count = 0;
+ else
+ results->ulr_count = results->ulr_offset - prevoff;
+ if (dp->i_number == foundino) {
+ vref(vdp);
+ *vpp = vdp;
+ return (0);
+ }
+ if (flags & ISDOTDOT)
+ VOP_UNLOCK(vdp); /* race to get the inode */
+ error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+ if (flags & ISDOTDOT)
+ vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY);
+ if (error)
+ return (error);
+ /*
+ * If directory is "sticky", then user must own
+ * the directory, or the file in it, else she
+ * may not delete it (unless she's root). This
+ * implements append-only directories.
+ */
+ if ((dp->i_e2fs_mode & ISVTX) &&
+ kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL) &&
+ kauth_cred_geteuid(cred) != dp->i_uid &&
+ VTOI(tdp)->i_uid != kauth_cred_geteuid(cred)) {
+ vput(tdp);
+ return (EPERM);
+ }
+ *vpp = tdp;
+ return (0);
+ }
+
+ /*
+ * If rewriting (RENAME), return the inode and the
+ * information required to rewrite the present directory
+ * Must get inode of directory entry to verify it's a
+ * regular file, or empty directory.
+ */
+ if (nameiop == RENAME && (flags & ISLASTCN)) {
+ error = VOP_ACCESS(vdp, VWRITE, cred);
+ if (error)
+ return (error);
+ /*
+ * Careful about locking second inode.
+ * This can only occur if the target is ".".
+ */
+ if (dp->i_number == foundino)
+ return (EISDIR);
+ if (flags & ISDOTDOT)
+ VOP_UNLOCK(vdp); /* race to get the inode */
+ error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+ if (flags & ISDOTDOT)
+ vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY);
+ if (error)
+ return (error);
+ *vpp = tdp;
+ return (0);
+ }
+
+ /*
+ * Step through the translation in the name. We do not `vput' the
+ * directory because we may need it again if a symbolic link
+ * is relative to the current directory. Instead we save it
+ * unlocked as "pdp". We must get the target inode before unlocking
+ * the directory to insure that the inode will not be removed
+ * before we get it. We prevent deadlock by always fetching
+ * inodes from the root, moving down the directory tree. Thus
+ * when following backward pointers ".." we must unlock the
+ * parent directory before getting the requested directory.
+ * There is a potential race condition here if both the current
+ * and parent directories are removed before the VFS_VGET for the
+ * inode associated with ".." returns. We hope that this occurs
+ * infrequently since we cannot avoid this race condition without
+ * implementing a sophisticated deadlock detection algorithm.
+ * Note also that this simple deadlock detection scheme will not
+ * work if the file system has any hard links other than ".."
+ * that point backwards in the directory structure.
+ */
+ pdp = vdp;
+ if (flags & ISDOTDOT) {
+ VOP_UNLOCK(pdp); /* race to get the inode */
+ error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+ vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY);
+ if (error) {
+ return (error);
+ }
+ *vpp = tdp;
+ } else if (dp->i_number == foundino) {
+ vref(vdp); /* we want ourself, ie "." */
+ *vpp = vdp;
+ } else {
+ error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+ if (error)
+ return (error);
+ *vpp = tdp;
+ }
+
+ /*
+ * Insert name into cache if appropriate.
+ */
+ if (cnp->cn_flags & MAKEENTRY)
+ cache_enter(vdp, *vpp, cnp);
+ return (0);
+}
+
+/*
+ * Do consistency checking on a directory entry:
+ * record length must be multiple of 4
+ * entry must fit in rest of its dirblksize block
+ * record must be large enough to contain entry
+ * name is not longer than EXT2FS_MAXNAMLEN
+ * name must be as long as advertised, and null terminated
+ */
+/*
+ * changed so that it conforms to ext2fs_check_dir_entry
+ */
+static int
+ext2fs_dirbadentry(struct vnode *dp, struct ext2fs_direct *de,
+ int entryoffsetinblock)
+{
+ struct ufsmount *ump = VFSTOUFS(dp->v_mount);
+ int dirblksiz = ump->um_dirblksiz;
+
+ const char *error_msg = NULL;
+ int reclen = fs2h16(de->e2d_reclen);
+ int namlen = de->e2d_namlen;
+
+ if (reclen < EXT2FS_DIRSIZ(1)) /* e2d_namlen = 1 */
+ error_msg = "rec_len is smaller than minimal";
+ else if (reclen % 4 != 0)
+ error_msg = "rec_len % 4 != 0";
+ else if (namlen > EXT2FS_MAXNAMLEN)
+ error_msg = "namlen > EXT2FS_MAXNAMLEN";
+ else if (reclen < EXT2FS_DIRSIZ(namlen))
+ error_msg = "reclen is too small for name_len";
+ else if (entryoffsetinblock + reclen > dirblksiz)
+ error_msg = "directory entry across blocks";
+ else if (fs2h32(de->e2d_ino) >
+ VTOI(dp)->i_e2fs->e2fs.e2fs_icount)
+ error_msg = "inode out of bounds";
+
+ if (error_msg != NULL) {
+ printf( "bad directory entry: %s\n"
+ "offset=%d, inode=%lu, rec_len=%d, name_len=%d \n",
+ error_msg, entryoffsetinblock,
+ (unsigned long) fs2h32(de->e2d_ino),
+ reclen, namlen);
+ panic("ext2fs_dirbadentry");
+ }
+ return error_msg == NULL ? 0 : 1;
+}
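+
+/*
+ * Editor's note (illustrative, not part of the original source): as a
+ * worked example of the checks above, a record with e2d_reclen == 8 is
+ * rejected because it is smaller than EXT2FS_DIRSIZ(1) == 12, the
+ * smallest record that can hold even a one-character name, and a record
+ * with e2d_reclen == 14 is rejected because 14 % 4 != 0.
+ */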
+
+/*
+ * Write a directory entry after a call to namei, using the parameters
+ * that it left in nameidata. The argument ip is the inode which the new
+ * directory entry will refer to. Dvp is a pointer to the directory to
+ * be written, which was left locked by namei. Remaining parameters
+ * (ulr_offset, ulr_count) indicate how the space for the new
+ * entry is to be obtained.
+ */
+int
+ext2fs_direnter(struct inode *ip, struct vnode *dvp,
+ const struct ufs_lookup_results *ulr,
+ struct componentname *cnp)
+{
+ struct ext2fs_direct *ep, *nep;
+ struct inode *dp;
+ struct buf *bp;
+ struct ext2fs_direct newdir;
+ struct iovec aiov;
+ struct uio auio;
+ u_int dsize;
+ int error, loc, newentrysize, spacefree;
+ char *dirbuf;
+ struct ufsmount *ump = VFSTOUFS(dvp->v_mount);
+ int dirblksiz = ump->um_dirblksiz;
+
+ dp = VTOI(dvp);
+
+ newdir.e2d_ino = h2fs32(ip->i_number);
+ newdir.e2d_namlen = cnp->cn_namelen;
+ if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 &&
+ (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) {
+ newdir.e2d_type = inot2ext2dt(IFTODT(ip->i_e2fs_mode));
+ } else {
+ newdir.e2d_type = 0;
+ }
+ memcpy(newdir.e2d_name, cnp->cn_nameptr, (unsigned)cnp->cn_namelen + 1);
+ newentrysize = EXT2FS_DIRSIZ(cnp->cn_namelen);
+ if (ulr->ulr_count == 0) {
+ /*
+ * If ulr_count is 0, then namei could find no
+ * space in the directory. Here, ulr_offset will
+ * be on a directory block boundary and we will write the
+ * new entry into a fresh block.
+ */
+ if (ulr->ulr_offset & (dirblksiz - 1))
+ panic("ext2fs_direnter: newblk");
+ auio.uio_offset = ulr->ulr_offset;
+ newdir.e2d_reclen = h2fs16(dirblksiz);
+ auio.uio_resid = newentrysize;
+ aiov.iov_len = newentrysize;
+ aiov.iov_base = (void *)&newdir;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_WRITE;
+ UIO_SETUP_SYSSPACE(&auio);
+ error = VOP_WRITE(dvp, &auio, IO_SYNC, cnp->cn_cred);
+ if (dirblksiz > dvp->v_mount->mnt_stat.f_bsize)
+ /* XXX should grow with balloc() */
+ panic("ext2fs_direnter: frag size");
+ else if (!error) {
+ error = ext2fs_setsize(dp,
+ roundup(ext2fs_size(dp), dirblksiz));
+ if (error)
+ return (error);
+ dp->i_flag |= IN_CHANGE;
+ uvm_vnp_setsize(dvp, ext2fs_size(dp));
+ }
+ return (error);
+ }
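+
+ /*
+ * Editor's note (illustrative, not part of the original source): in the
+ * fresh-block case above, the new entry advertises the whole directory
+ * block as its record; e.g. with a 1024-byte block, an entry for "foo"
+ * uses 8 + 4 = 12 bytes of data but carries e2d_reclen == 1024, so the
+ * remaining space can later be carved off for further entries.
+ */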
+
+ /*
+ * If ulr_count is non-zero, then namei found space
+ * for the new entry in the range ulr_offset to
+ * ulr_offset + ulr_count in the directory.
+ * To use this space, we may have to compact the entries located
+ * there, by copying them together towards the beginning of the
+ * block, leaving the free space in one usable chunk at the end.
+ */
+
+ /*
+ * Get the block containing the space for the new directory entry.
+ */
+ if ((error = ext2fs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp)) != 0)
+ return (error);
+ /*
+ * Find space for the new entry. In the simple case, the entry at
+ * offset base will have the space. If it does not, then namei
+ * arranged that compacting the region ulr_offset to
+ * ulr_offset + ulr_count would yield the
+ * space.
+ */
+ ep = (struct ext2fs_direct *)dirbuf;
+ dsize = EXT2FS_DIRSIZ(ep->e2d_namlen);
+ spacefree = fs2h16(ep->e2d_reclen) - dsize;
+ for (loc = fs2h16(ep->e2d_reclen); loc < ulr->ulr_count; ) {
+ nep = (struct ext2fs_direct *)(dirbuf + loc);
+ if (ep->e2d_ino) {
+ /* trim the existing slot */
+ ep->e2d_reclen = h2fs16(dsize);
+ ep = (struct ext2fs_direct *)((char *)ep + dsize);
+ } else {
+ /* overwrite; nothing there; header is ours */
+ spacefree += dsize;
+ }
+ dsize = EXT2FS_DIRSIZ(nep->e2d_namlen);
+ spacefree += fs2h16(nep->e2d_reclen) - dsize;
+ loc += fs2h16(nep->e2d_reclen);
+ memcpy((void *)ep, (void *)nep, dsize);
+ }
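+
+ /*
+ * Editor's note (illustrative, not part of the original source): as a
+ * worked example of the compaction above, given a live 12-byte entry,
+ * a deleted 20-byte record and another live entry within the range,
+ * the loop trims the first entry to its true size, counts the dead
+ * record's bytes toward "spacefree", and copies the second live entry
+ * down next to the first, leaving one contiguous free chunk at the end
+ * for the new entry.
+ */
+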
+ /*
+ * Update the pointer fields in the previous entry (if any),
+ * copy in the new entry, and write out the block.
+ */
+ if (ep->e2d_ino == 0) {
+#ifdef DIAGNOSTIC
+ if (spacefree + dsize < newentrysize)
+ panic("ext2fs_direnter: compact1");
+#endif
+ newdir.e2d_reclen = h2fs16(spacefree + dsize);
+ } else {
+#ifdef DIAGNOSTIC
+ if (spacefree < newentrysize) {
+ printf("ext2fs_direnter: compact2 %u %u",
+ (u_int)spacefree, (u_int)newentrysize);
+ panic("ext2fs_direnter: compact2");
+ }
+#endif
+ newdir.e2d_reclen = h2fs16(spacefree);
+ ep->e2d_reclen = h2fs16(dsize);
+ ep = (struct ext2fs_direct *)((char *)ep + dsize);
+ }
+ memcpy((void *)ep, (void *)&newdir, (u_int)newentrysize);
+ error = VOP_BWRITE(bp->b_vp, bp);
+ dp->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (!error && ulr->ulr_endoff && ulr->ulr_endoff < ext2fs_size(dp))
+ error = ext2fs_truncate(dvp, (off_t)ulr->ulr_endoff, IO_SYNC,
+ cnp->cn_cred);
+ return (error);
+}
+
+/*
+ * Remove a directory entry after a call to namei, using
+ * the auxiliary results it provided. The entry
+ * ulr_offset contains the offset into the directory of the
+ * entry to be eliminated. The ulr_count field contains the
+ * size of the previous record in the directory. If this
+ * is 0, the first entry is being deleted, so we need only
+ * zero the inode number to mark the entry as free. If the
+ * entry is not the first in the directory, we must reclaim
+ * the space of the now empty record by adding the record size
+ * to the size of the previous entry.
+ */
+int
+ext2fs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr,
+ struct componentname *cnp)
+{
+ struct inode *dp;
+ struct ext2fs_direct *ep;
+ struct buf *bp;
+ int error;
+
+ dp = VTOI(dvp);
+
+ if (ulr->ulr_count == 0) {
+ /*
+ * First entry in block: set d_ino to zero.
+ */
+ error = ext2fs_blkatoff(dvp, (off_t)ulr->ulr_offset,
+ (void *)&ep, &bp);
+ if (error != 0)
+ return (error);
+ ep->e2d_ino = 0;
+ error = VOP_BWRITE(bp->b_vp, bp);
+ dp->i_flag |= IN_CHANGE | IN_UPDATE;
+ return (error);
+ }
+ /*
+ * Collapse new free space into previous entry.
+ */
+ error = ext2fs_blkatoff(dvp, (off_t)(ulr->ulr_offset - ulr->ulr_count),
+ (void *)&ep, &bp);
+ if (error != 0)
+ return (error);
+ ep->e2d_reclen = h2fs16(fs2h16(ep->e2d_reclen) + ulr->ulr_reclen);
+ error = VOP_BWRITE(bp->b_vp, bp);
+ dp->i_flag |= IN_CHANGE | IN_UPDATE;
+ return (error);
+}
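+
+/*
+ * Editor's note (illustrative, not part of the original source): for
+ * example, when entry B immediately follows entry A in a block and B is
+ * removed, A's e2d_reclen grows by B's record length so A now spans both
+ * records; when the first entry of a block is removed, only its e2d_ino
+ * is cleared and the record itself remains as reusable space.
+ */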
+
+/*
+ * Rewrite an existing directory entry to point at the inode
+ * supplied. The parameters describing the directory entry are
+ * set up by a call to namei.
+ */
+int
+ext2fs_dirrewrite(struct inode *dp, const struct ufs_lookup_results *ulr,
+ struct inode *ip, struct componentname *cnp)
+{
+ struct buf *bp;
+ struct ext2fs_direct *ep;
+ struct vnode *vdp = ITOV(dp);
+ int error;
+
+ error = ext2fs_blkatoff(vdp, (off_t)ulr->ulr_offset, (void *)&ep, &bp);
+ if (error != 0)
+ return (error);
+ ep->e2d_ino = h2fs32(ip->i_number);
+ if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 &&
+ (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) {
+ ep->e2d_type = inot2ext2dt(IFTODT(ip->i_e2fs_mode));
+ } else {
+ ep->e2d_type = 0;
+ }
+ error = VOP_BWRITE(bp->b_vp, bp);
+ dp->i_flag |= IN_CHANGE | IN_UPDATE;
+ return (error);
+}
+
+/*
+ * Check if a directory is empty or not.
+ * Inode supplied must be locked.
+ *
+ * Using a struct dirtemplate here is not precisely
+ * what we want, but better than using a struct ext2fs_direct.
+ *
+ * NB: does not handle corrupted directories.
+ */
+int
+ext2fs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred)
+{
+ off_t off;
+ struct ext2fs_dirtemplate dbuf;
+ struct ext2fs_direct *dp = (struct ext2fs_direct *)&dbuf;
+ int error, namlen;
+ size_t count;
+
+#define MINDIRSIZ (sizeof (struct ext2fs_dirtemplate) / 2)
+
+ for (off = 0; off < ext2fs_size(ip); off += fs2h16(dp->e2d_reclen)) {
+ error = vn_rdwr(UIO_READ, ITOV(ip), (void *)dp, MINDIRSIZ, off,
+ UIO_SYSSPACE, IO_NODELOCKED, cred, &count, NULL);
+ /*
+ * Since we read MINDIRSIZ, residual must
+ * be 0 unless we're at end of file.
+ */
+ if (error || count != 0)
+ return (0);
+ /* avoid infinite loops */
+ if (dp->e2d_reclen == 0)
+ return (0);
+ /* skip empty entries */
+ if (dp->e2d_ino == 0)
+ continue;
+ /* accept only "." and ".." */
+ namlen = dp->e2d_namlen;
+ if (namlen > 2)
+ return (0);
+ if (dp->e2d_name[0] != '.')
+ return (0);
+ /*
+ * At this point namlen must be 1 or 2.
+ * 1 implies ".", 2 implies ".." if second
+ * char is also "."
+ */
+ if (namlen == 1)
+ continue;
+ if (dp->e2d_name[1] == '.' && fs2h32(dp->e2d_ino) == parentino)
+ continue;
+ return (0);
+ }
+ return (1);
+}
+
+/*
+ * Check if source directory is in the path of the target directory.
+ * Target is supplied locked, source is unlocked.
+ * The target is always vput before returning.
+ */
+int
+ext2fs_checkpath(struct inode *source, struct inode *target,
+ kauth_cred_t cred)
+{
+ struct vnode *vp;
+ int error, rootino, namlen;
+ struct ext2fs_dirtemplate dirbuf;
+ uint32_t ino;
+
+ vp = ITOV(target);
+ if (target->i_number == source->i_number) {
+ error = EEXIST;
+ goto out;
+ }
+ rootino = ROOTINO;
+ error = 0;
+ if (target->i_number == rootino)
+ goto out;
+
+ for (;;) {
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ break;
+ }
+ error = vn_rdwr(UIO_READ, vp, (void *)&dirbuf,
+ sizeof (struct ext2fs_dirtemplate), (off_t)0,
+ UIO_SYSSPACE, IO_NODELOCKED, cred, (size_t *)0,
+ NULL);
+ if (error != 0)
+ break;
+ namlen = dirbuf.dotdot_namlen;
+ if (namlen != 2 ||
+ dirbuf.dotdot_name[0] != '.' ||
+ dirbuf.dotdot_name[1] != '.') {
+ error = ENOTDIR;
+ break;
+ }
+ ino = fs2h32(dirbuf.dotdot_ino);
+ if (ino == source->i_number) {
+ error = EINVAL;
+ break;
+ }
+ if (ino == rootino)
+ break;
+ vput(vp);
+ error = VFS_VGET(vp->v_mount, ino, &vp);
+ if (error != 0) {
+ vp = NULL;
+ break;
+ }
+ }
+
+out:
+ if (error == ENOTDIR) {
+ printf("checkpath: .. not a directory\n");
+ panic("checkpath");
+ }
+ if (vp != NULL)
+ vput(vp);
+ return (error);
+}
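+
+/*
+ * Editor's note (illustrative, not part of the original source): the loop
+ * above walks ".." links upward from the target; e.g. a rename that would
+ * move /a into /a/b/c finds /a while walking up from /a/b/c and fails with
+ * EINVAL, preventing a directory from being moved into its own subtree.
+ */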
--- /dev/null
+/* $NetBSD: ext2fs_readwrite.c,v 1.58 2011/11/18 21:18:51 christos Exp $ */
+
+/*-
+ * Copyright (c) 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_readwrite.c 8.8 (Berkeley) 8/4/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*-
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @(#)ufs_readwrite.c 8.8 (Berkeley) 8/4/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_readwrite.c,v 1.58 2011/11/18 21:18:51 christos Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/signalvar.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+
+#define doclusterread 0 /* XXX underway */
+#define doclusterwrite 0
+
+/*
+ * Vnode op for reading.
+ */
+/* ARGSUSED */
+int
+ext2fs_read(void *v)
+{
+ struct vop_read_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct inode *ip;
+ struct uio *uio;
+ struct m_ext2fs *fs;
+ struct buf *bp;
+ struct ufsmount *ump;
+ vsize_t bytelen;
+ daddr_t lbn, nextlbn;
+ off_t bytesinfile;
+ long size, xfersize, blkoffset;
+ int error;
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ ump = ip->i_ump;
+ uio = ap->a_uio;
+ error = 0;
+
+#ifdef DIAGNOSTIC
+ if (uio->uio_rw != UIO_READ)
+ panic("%s: mode", "ext2fs_read");
+
+ if (vp->v_type == VLNK) {
+ if (ext2fs_size(ip) < ump->um_maxsymlinklen ||
+ (ump->um_maxsymlinklen == 0 && ip->i_e2fs_nblock == 0))
+ panic("%s: short symlink", "ext2fs_read");
+ } else if (vp->v_type != VREG && vp->v_type != VDIR)
+ panic("%s: type %d", "ext2fs_read", vp->v_type);
+#endif
+ fs = ip->i_e2fs;
+ if ((uint64_t)uio->uio_offset > ump->um_maxfilesize)
+ return (EFBIG);
+ if (uio->uio_resid == 0)
+ return (0);
+ if (uio->uio_offset >= ext2fs_size(ip))
+ goto out;
+
+ if (vp->v_type == VREG) {
+ const int advice = IO_ADV_DECODE(ap->a_ioflag);
+
+ while (uio->uio_resid > 0) {
+ bytelen = MIN(ext2fs_size(ip) - uio->uio_offset,
+ uio->uio_resid);
+ if (bytelen == 0)
+ break;
+
+ error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
+ UBC_READ | UBC_PARTIALOK | UBC_UNMAP_FLAG(vp));
+ if (error)
+ break;
+ }
+ goto out;
+ }
+
+ for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
+ bytesinfile = ext2fs_size(ip) - uio->uio_offset;
+ if (bytesinfile <= 0)
+ break;
+ lbn = lblkno(fs, uio->uio_offset);
+ nextlbn = lbn + 1;
+ size = fs->e2fs_bsize;
+ blkoffset = blkoff(fs, uio->uio_offset);
+ xfersize = fs->e2fs_bsize - blkoffset;
+ if (uio->uio_resid < xfersize)
+ xfersize = uio->uio_resid;
+ if (bytesinfile < xfersize)
+ xfersize = bytesinfile;
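+
+ /*
+ * Editor's note (illustrative, not part of the original source): with
+ * 4096-byte blocks, a read at offset 6000 maps to lbn 1 with
+ * blkoffset 1904, so at most 4096 - 1904 = 2192 bytes are transferred
+ * from this block before the loop advances to the next one.
+ */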
+
+ if (lblktosize(fs, nextlbn) >= ext2fs_size(ip))
+ error = bread(vp, lbn, size, NOCRED, 0, &bp);
+ else {
+ int nextsize = fs->e2fs_bsize;
+ error = breadn(vp, lbn,
+ size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
+ }
+ if (error)
+ break;
+
+ /*
+ * We should only get non-zero b_resid when an I/O error
+ * has occurred, which should cause us to break above.
+ * However, if the short read did not cause an error,
+ * then we want to ensure that we do not uiomove bad
+ * or uninitialized data.
+ */
+ size -= bp->b_resid;
+ if (size < xfersize) {
+ if (size == 0)
+ break;
+ xfersize = size;
+ }
+ error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+ if (error)
+ break;
+ brelse(bp, 0);
+ }
+ if (bp != NULL)
+ brelse(bp, 0);
+
+out:
+ if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
+ ip->i_flag |= IN_ACCESS;
+ if ((ap->a_ioflag & IO_SYNC) == IO_SYNC)
+ error = ext2fs_update(vp, NULL, NULL, UPDATE_WAIT);
+ }
+ return (error);
+}
+
+/*
+ * Vnode op for writing.
+ */
+int
+ext2fs_write(void *v)
+{
+ struct vop_write_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct uio *uio;
+ struct inode *ip;
+ struct m_ext2fs *fs;
+ struct buf *bp;
+ struct ufsmount *ump;
+ daddr_t lbn;
+ off_t osize;
+ int blkoffset, error, flags, ioflag, resid, xfersize;
+ vsize_t bytelen;
+ off_t oldoff = 0; /* XXX */
+ bool async;
+ int extended = 0;
+ int advice;
+
+ ioflag = ap->a_ioflag;
+ advice = IO_ADV_DECODE(ioflag);
+ uio = ap->a_uio;
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ ump = ip->i_ump;
+ error = 0;
+
+#ifdef DIAGNOSTIC
+ if (uio->uio_rw != UIO_WRITE)
+ panic("%s: mode", "ext2fs_write");
+#endif
+
+ switch (vp->v_type) {
+ case VREG:
+ if (ioflag & IO_APPEND)
+ uio->uio_offset = ext2fs_size(ip);
+ if ((ip->i_e2fs_flags & EXT2_APPEND) &&
+ uio->uio_offset != ext2fs_size(ip))
+ return (EPERM);
+ /* FALLTHROUGH */
+ case VLNK:
+ break;
+ case VDIR:
+ if ((ioflag & IO_SYNC) == 0)
+ panic("%s: nonsync dir write", "ext2fs_write");
+ break;
+ default:
+ panic("%s: type", "ext2fs_write");
+ }
+
+ fs = ip->i_e2fs;
+ if (uio->uio_offset < 0 ||
+ (uint64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize)
+ return (EFBIG);
+ if (uio->uio_resid == 0)
+ return (0);
+
+ async = vp->v_mount->mnt_flag & MNT_ASYNC;
+ resid = uio->uio_resid;
+ osize = ext2fs_size(ip);
+
+ if (vp->v_type == VREG) {
+ while (uio->uio_resid > 0) {
+ oldoff = uio->uio_offset;
+ blkoffset = blkoff(fs, uio->uio_offset);
+ bytelen = MIN(fs->e2fs_bsize - blkoffset,
+ uio->uio_resid);
+
+ if (vp->v_size < oldoff + bytelen) {
+ uvm_vnp_setwritesize(vp, oldoff + bytelen);
+ }
+ error = ufs_balloc_range(vp, uio->uio_offset,
+ bytelen, ap->a_cred, 0);
+ if (error)
+ break;
+ error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
+ UBC_WRITE | UBC_UNMAP_FLAG(vp));
+ if (error)
+ break;
+
+ /*
+ * update UVM's notion of the size now that we've
+ * copied the data into the vnode's pages.
+ */
+
+ if (vp->v_size < uio->uio_offset) {
+ uvm_vnp_setsize(vp, uio->uio_offset);
+ extended = 1;
+ }
+
+ /*
+ * flush what we just wrote if necessary.
+ * XXXUBC simplistic async flushing.
+ */
+
+ if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
+ (uio->uio_offset >> 16) << 16, PGO_CLEANIT);
+ }
+ }
+ if (error == 0 && ioflag & IO_SYNC) {
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp, trunc_page(oldoff),
+ round_page(blkroundup(fs, uio->uio_offset)),
+ PGO_CLEANIT | PGO_SYNCIO);
+ }
+
+ goto out;
+ }
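+
+ /*
+ * Editor's note (illustrative, not part of the original source): the
+ * flush inside the loop above fires whenever the write crosses a
+ * 64 KiB boundary (oldoff >> 16 changes), so unless the mount is
+ * async, dirty pages are pushed out in roughly 64 KiB batches as the
+ * write advances rather than all at once later.
+ */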
+
+ flags = ioflag & IO_SYNC ? B_SYNC : 0;
+ for (error = 0; uio->uio_resid > 0;) {
+ lbn = lblkno(fs, uio->uio_offset);
+ blkoffset = blkoff(fs, uio->uio_offset);
+ xfersize = MIN(fs->e2fs_bsize - blkoffset, uio->uio_resid);
+ if (xfersize < fs->e2fs_bsize)
+ flags |= B_CLRBUF;
+ else
+ flags &= ~B_CLRBUF;
+ error = ext2fs_balloc(ip,
+ lbn, blkoffset + xfersize, ap->a_cred, &bp, flags);
+ if (error)
+ break;
+ if (ext2fs_size(ip) < uio->uio_offset + xfersize) {
+ error = ext2fs_setsize(ip, uio->uio_offset + xfersize);
+ if (error)
+ break;
+ }
+ error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+
+ /*
+ * update UVM's notion of the size now that we've
+ * copied the data into the vnode's pages.
+ */
+
+ if (vp->v_size < uio->uio_offset) {
+ uvm_vnp_setsize(vp, uio->uio_offset);
+ extended = 1;
+ }
+
+ if (ioflag & IO_SYNC)
+ (void)bwrite(bp);
+ else if (xfersize + blkoffset == fs->e2fs_bsize)
+ bawrite(bp);
+ else
+ bdwrite(bp);
+ if (error || xfersize == 0)
+ break;
+ }
+
+ /*
+ * If we successfully wrote any data, and we are not the superuser
+ * we clear the setuid and setgid bits as a precaution against
+ * tampering.
+ */
+
+out:
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (vp->v_mount->mnt_flag & MNT_RELATIME)
+ ip->i_flag |= IN_ACCESS;
+ if (resid > uio->uio_resid && ap->a_cred &&
+ kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL))
+ ip->i_e2fs_mode &= ~(ISUID | ISGID);
+ if (resid > uio->uio_resid)
+ VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
+ if (error) {
+ (void) ext2fs_truncate(vp, osize, ioflag & IO_SYNC, ap->a_cred);
+ uio->uio_offset -= resid - uio->uio_resid;
+ uio->uio_resid = resid;
+ } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
+ error = ext2fs_update(vp, NULL, NULL, UPDATE_WAIT);
+ KASSERT(vp->v_size == ext2fs_size(ip));
+ return (error);
+}
--- /dev/null
+/* $NetBSD: ext2fs_subr.c,v 1.27 2009/10/19 18:41:17 bouyer Exp $ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_subr.c 8.2 (Berkeley) 9/21/93
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @(#)ffs_subr.c 8.2 (Berkeley) 9/21/93
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_subr.c,v 1.27 2009/10/19 18:41:17 bouyer Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/inttypes.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+/*
+ * Return buffer with the contents of block "offset" from the beginning of
+ * directory "ip". If "res" is non-zero, fill it in with a pointer to the
+ * remaining space in the directory.
+ */
+int
+ext2fs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp)
+{
+ struct inode *ip;
+ struct m_ext2fs *fs;
+ struct buf *bp;
+ daddr_t lbn;
+ int error;
+
+ ip = VTOI(vp);
+ fs = ip->i_e2fs;
+ lbn = lblkno(fs, offset);
+
+ *bpp = NULL;
+ if ((error = bread(vp, lbn, fs->e2fs_bsize, NOCRED, 0, &bp)) != 0) {
+ brelse(bp, 0);
+ return (error);
+ }
+ if (res)
+ *res = (char *)bp->b_data + blkoff(fs, offset);
+ *bpp = bp;
+ return (0);
+}
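+
+/*
+ * Editor's note (illustrative, not part of the original source): for a
+ * file system with 4096-byte blocks, ext2fs_blkatoff(vp, 5000, &res, &bp)
+ * reads logical block 1 and points "res" 904 bytes into the buffer
+ * (5000 - 4096), which is how callers access directory entries that do
+ * not start on a block boundary.
+ */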
+
+void
+ext2fs_itimes(struct inode *ip, const struct timespec *acc,
+ const struct timespec *mod, const struct timespec *cre)
+{
+ struct timespec now;
+
+ if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) {
+ return;
+ }
+
+ vfs_timestamp(&now);
+ if (ip->i_flag & IN_ACCESS) {
+ if (acc == NULL)
+ acc = &now;
+ ip->i_e2fs_atime = acc->tv_sec;
+ }
+ if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) {
+ if (mod == NULL)
+ mod = &now;
+ ip->i_e2fs_mtime = mod->tv_sec;
+ ip->i_modrev++;
+ }
+ if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) {
+ if (cre == NULL)
+ cre = &now;
+ ip->i_e2fs_ctime = cre->tv_sec;
+ }
+ if (ip->i_flag & (IN_ACCESS | IN_MODIFY))
+ ip->i_flag |= IN_ACCESSED;
+ if (ip->i_flag & (IN_UPDATE | IN_CHANGE))
+ ip->i_flag |= IN_MODIFIED;
+ ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY);
+}
--- /dev/null
+/* $NetBSD: ext2fs_vfsops.c,v 1.162 2011/11/14 18:35:14 hannken Exp $ */
+
+/*
+ * Copyright (c) 1989, 1991, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_vfsops.c 8.14 (Berkeley) 11/28/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @(#)ffs_vfsops.c 8.14 (Berkeley) 11/28/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_vfsops.c,v 1.162 2011/11/14 18:35:14 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_compat_netbsd.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/socket.h>
+#include <sys/mount.h>
+#include <sys/buf.h>
+#include <sys/device.h>
+#include <sys/mbuf.h>
+#include <sys/file.h>
+#include <sys/disklabel.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/lock.h>
+#include <sys/conf.h>
+#include <sys/kauth.h>
+#include <sys/module.h>
+
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_dir.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+MODULE(MODULE_CLASS_VFS, ext2fs, "ffs");
+
+int ext2fs_sbupdate(struct ufsmount *, int);
+static int ext2fs_checksb(struct ext2fs *, int);
+
+static struct sysctllog *ext2fs_sysctl_log;
+
+extern const struct vnodeopv_desc ext2fs_vnodeop_opv_desc;
+extern const struct vnodeopv_desc ext2fs_specop_opv_desc;
+extern const struct vnodeopv_desc ext2fs_fifoop_opv_desc;
+
+const struct vnodeopv_desc * const ext2fs_vnodeopv_descs[] = {
+ &ext2fs_vnodeop_opv_desc,
+ &ext2fs_specop_opv_desc,
+ &ext2fs_fifoop_opv_desc,
+ NULL,
+};
+
+struct vfsops ext2fs_vfsops = {
+ MOUNT_EXT2FS,
+ sizeof (struct ufs_args),
+ ext2fs_mount,
+ ufs_start,
+ ext2fs_unmount,
+ ufs_root,
+ ufs_quotactl,
+ ext2fs_statvfs,
+ ext2fs_sync,
+ ext2fs_vget,
+ ext2fs_fhtovp,
+ ext2fs_vptofh,
+ ext2fs_init,
+ ext2fs_reinit,
+ ext2fs_done,
+ ext2fs_mountroot,
+ (int (*)(struct mount *, struct vnode *, struct timespec *)) eopnotsupp,
+ vfs_stdextattrctl,
+ (void *)eopnotsupp, /* vfs_suspendctl */
+ genfs_renamelock_enter,
+ genfs_renamelock_exit,
+ (void *)eopnotsupp,
+ ext2fs_vnodeopv_descs,
+ 0,
+ { NULL, NULL },
+};
+
+static const struct genfs_ops ext2fs_genfsops = {
+ .gop_size = genfs_size,
+ .gop_alloc = ext2fs_gop_alloc,
+ .gop_write = genfs_gop_write,
+ .gop_markupdate = ufs_gop_markupdate,
+};
+
+static const struct ufs_ops ext2fs_ufsops = {
+ .uo_itimes = ext2fs_itimes,
+ .uo_update = ext2fs_update,
+ .uo_vfree = ext2fs_vfree,
+ .uo_unmark_vnode = (void (*)(vnode_t *))nullop,
+};
+
+/* Fill in the inode uid/gid from ext2 halves. */
+void
+ext2fs_set_inode_guid(struct inode *ip)
+{
+
+ ip->i_gid = ip->i_e2fs_gid;
+ ip->i_uid = ip->i_e2fs_uid;
+ if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) {
+ ip->i_gid |= ip->i_e2fs_gid_high << 16;
+ ip->i_uid |= ip->i_e2fs_uid_high << 16;
+ }
+}
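+
+/*
+ * Editor's note (illustrative, not part of the original source): on a
+ * revision > 0 file system the 32-bit uid is assembled from two 16-bit
+ * halves, so e2fs_uid == 1000 with e2fs_uid_high == 1 yields
+ * (1 << 16) | 1000 == 66536.
+ */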
+
+static int
+ext2fs_modcmd(modcmd_t cmd, void *arg)
+{
+ int error;
+
+ switch (cmd) {
+ case MODULE_CMD_INIT:
+ error = vfs_attach(&ext2fs_vfsops);
+ if (error != 0)
+ break;
+ sysctl_createv(&ext2fs_sysctl_log, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "vfs", NULL,
+ NULL, 0, NULL, 0,
+ CTL_VFS, CTL_EOL);
+ sysctl_createv(&ext2fs_sysctl_log, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "ext2fs",
+ SYSCTL_DESCR("Linux EXT2FS file system"),
+ NULL, 0, NULL, 0,
+ CTL_VFS, 17, CTL_EOL);
+ /*
+ * XXX the "17" above could be dynamic, thereby eliminating
+ * one more instance of the "number to vfs" mapping problem,
+ * but "17" is the order as taken from sys/mount.h
+ */
+ break;
+ case MODULE_CMD_FINI:
+ error = vfs_detach(&ext2fs_vfsops);
+ if (error != 0)
+ break;
+ sysctl_teardown(&ext2fs_sysctl_log);
+ break;
+ default:
+ error = ENOTTY;
+ break;
+ }
+
+ return (error);
+}
+
+/*
+ * XXX Same structure as FFS inodes? Should we share a common pool?
+ */
+struct pool ext2fs_inode_pool;
+struct pool ext2fs_dinode_pool;
+
+extern u_long ext2gennumber;
+
+void
+ext2fs_init(void)
+{
+
+ pool_init(&ext2fs_inode_pool, sizeof(struct inode), 0, 0, 0,
+ "ext2fsinopl", &pool_allocator_nointr, IPL_NONE);
+ pool_init(&ext2fs_dinode_pool, sizeof(struct ext2fs_dinode), 0, 0, 0,
+ "ext2dinopl", &pool_allocator_nointr, IPL_NONE);
+ ufs_init();
+}
+
+void
+ext2fs_reinit(void)
+{
+ ufs_reinit();
+}
+
+void
+ext2fs_done(void)
+{
+
+ ufs_done();
+ pool_destroy(&ext2fs_inode_pool);
+ pool_destroy(&ext2fs_dinode_pool);
+}
+
+/*
+ * Called by main() when ext2fs is going to be mounted as root.
+ *
+ * Name is updated by mount(8) after booting.
+ */
+#define ROOTNAME "root_device"
+
+int
+ext2fs_mountroot(void)
+{
+ extern struct vnode *rootvp;
+ struct m_ext2fs *fs;
+ struct mount *mp;
+ struct ufsmount *ump;
+ int error;
+
+ if (device_class(root_device) != DV_DISK)
+ return (ENODEV);
+
+ if ((error = vfs_rootmountalloc(MOUNT_EXT2FS, "root_device", &mp))) {
+ vrele(rootvp);
+ return (error);
+ }
+
+ if ((error = ext2fs_mountfs(rootvp, mp)) != 0) {
+ vfs_unbusy(mp, false, NULL);
+ vfs_destroy(mp);
+ return (error);
+ }
+ mutex_enter(&mountlist_lock);
+ CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mutex_exit(&mountlist_lock);
+ ump = VFSTOUFS(mp);
+ fs = ump->um_e2fs;
+ memset(fs->e2fs_fsmnt, 0, sizeof(fs->e2fs_fsmnt));
+ (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs_fsmnt,
+ sizeof(fs->e2fs_fsmnt) - 1, 0);
+ if (fs->e2fs.e2fs_rev > E2FS_REV0) {
+ memset(fs->e2fs.e2fs_fsmnt, 0, sizeof(fs->e2fs.e2fs_fsmnt));
+ (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs.e2fs_fsmnt,
+ sizeof(fs->e2fs.e2fs_fsmnt) - 1, 0);
+ }
+ (void)ext2fs_statvfs(mp, &mp->mnt_stat);
+ vfs_unbusy(mp, false, NULL);
+ setrootfstime((time_t)fs->e2fs.e2fs_wtime);
+ return (0);
+}
+
+/*
+ * VFS Operations.
+ *
+ * mount system call
+ */
+int
+ext2fs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
+{
+ struct lwp *l = curlwp;
+ struct vnode *devvp;
+ struct ufs_args *args = data;
+ struct ufsmount *ump = NULL;
+ struct m_ext2fs *fs;
+ size_t size;
+ int error = 0, flags, update;
+ mode_t accessmode;
+
+ if (*data_len < sizeof *args)
+ return EINVAL;
+
+ if (mp->mnt_flag & MNT_GETARGS) {
+ ump = VFSTOUFS(mp);
+ if (ump == NULL)
+ return EIO;
+ memset(args, 0, sizeof *args);
+ args->fspec = NULL;
+ *data_len = sizeof *args;
+ return 0;
+ }
+
+ update = mp->mnt_flag & MNT_UPDATE;
+
+ /* Check arguments */
+ if (args->fspec != NULL) {
+ /*
+ * Look up the name and verify that it's sane.
+ */
+ error = namei_simple_user(args->fspec,
+ NSM_FOLLOW_NOEMULROOT, &devvp);
+ if (error != 0)
+ return (error);
+
+ if (!update) {
+ /*
+ * Be sure this is a valid block device
+ */
+ if (devvp->v_type != VBLK)
+ error = ENOTBLK;
+ else if (bdevsw_lookup(devvp->v_rdev) == NULL)
+ error = ENXIO;
+ } else {
+ /*
+ * Be sure we're still naming the same device
+ * used for our initial mount
+ */
+ ump = VFSTOUFS(mp);
+ if (devvp != ump->um_devvp) {
+ if (devvp->v_rdev != ump->um_devvp->v_rdev)
+ error = EINVAL;
+ else {
+ vrele(devvp);
+ devvp = ump->um_devvp;
+ vref(devvp);
+ }
+ }
+ }
+ } else {
+ if (!update) {
+ /* New mounts must have a filename for the device */
+ return (EINVAL);
+ } else {
+ ump = VFSTOUFS(mp);
+ devvp = ump->um_devvp;
+ vref(devvp);
+ }
+ }
+
+ /*
+ * If mount by non-root, then verify that user has necessary
+ * permissions on the device.
+ *
+ * Permission to update a mount is checked higher, so here we presume
+ * updating the mount is okay (for example, as far as securelevel goes)
+ * which leaves us with the normal check.
+ */
+ if (error == 0) {
+ accessmode = VREAD;
+ if (update ?
+ (mp->mnt_iflag & IMNT_WANTRDWR) != 0 :
+ (mp->mnt_flag & MNT_RDONLY) == 0)
+ accessmode |= VWRITE;
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = genfs_can_mount(devvp, accessmode, l->l_cred);
+ VOP_UNLOCK(devvp);
+ }
+
+ if (error) {
+ vrele(devvp);
+ return (error);
+ }
+
+ if (!update) {
+ int xflags;
+
+ if (mp->mnt_flag & MNT_RDONLY)
+ xflags = FREAD;
+ else
+ xflags = FREAD|FWRITE;
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_OPEN(devvp, xflags, FSCRED);
+ VOP_UNLOCK(devvp);
+ if (error)
+ goto fail;
+ error = ext2fs_mountfs(devvp, mp);
+ if (error) {
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ (void)VOP_CLOSE(devvp, xflags, NOCRED);
+ VOP_UNLOCK(devvp);
+ goto fail;
+ }
+
+ ump = VFSTOUFS(mp);
+ fs = ump->um_e2fs;
+ } else {
+ /*
+ * Update the mount.
+ */
+
+ /*
+ * The initial mount got a reference on this
+ * device, so drop the one obtained via
+ * namei(), above.
+ */
+ vrele(devvp);
+
+ ump = VFSTOUFS(mp);
+ fs = ump->um_e2fs;
+ if (fs->e2fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
+ /*
+ * Changing from r/w to r/o
+ */
+ flags = WRITECLOSE;
+ if (mp->mnt_flag & MNT_FORCE)
+ flags |= FORCECLOSE;
+ error = ext2fs_flushfiles(mp, flags);
+ if (error == 0 &&
+ ext2fs_cgupdate(ump, MNT_WAIT) == 0 &&
+ (fs->e2fs.e2fs_state & E2FS_ERRORS) == 0) {
+ fs->e2fs.e2fs_state = E2FS_ISCLEAN;
+ (void) ext2fs_sbupdate(ump, MNT_WAIT);
+ }
+ if (error)
+ return (error);
+ fs->e2fs_ronly = 1;
+ }
+
+ if (mp->mnt_flag & MNT_RELOAD) {
+ error = ext2fs_reload(mp, l->l_cred, l);
+ if (error)
+ return (error);
+ }
+
+ if (fs->e2fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) {
+ /*
+ * Changing from read-only to read/write
+ */
+ fs->e2fs_ronly = 0;
+ if (fs->e2fs.e2fs_state == E2FS_ISCLEAN)
+ fs->e2fs.e2fs_state = 0;
+ else
+ fs->e2fs.e2fs_state = E2FS_ERRORS;
+ fs->e2fs_fmod = 1;
+ }
+ if (args->fspec == NULL)
+ return 0;
+ }
+
+ error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
+ UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
+ (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs_fsmnt,
+ sizeof(fs->e2fs_fsmnt) - 1, &size);
+ memset(fs->e2fs_fsmnt + size, 0, sizeof(fs->e2fs_fsmnt) - size);
+ if (fs->e2fs.e2fs_rev > E2FS_REV0) {
+ (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs.e2fs_fsmnt,
+ sizeof(fs->e2fs.e2fs_fsmnt) - 1, &size);
+ memset(fs->e2fs.e2fs_fsmnt + size, 0,
+ sizeof(fs->e2fs.e2fs_fsmnt) - size);
+ }
+ if (fs->e2fs_fmod != 0) { /* XXX */
+ fs->e2fs_fmod = 0;
+ if (fs->e2fs.e2fs_state == 0)
+ fs->e2fs.e2fs_wtime = time_second;
+ else
+ printf("%s: file system not clean; please fsck(8)\n",
+ mp->mnt_stat.f_mntfromname);
+ (void) ext2fs_cgupdate(ump, MNT_WAIT);
+ }
+ return (error);
+
+fail:
+ vrele(devvp);
+ return (error);
+}
+
+/*
+ * Reload all incore data for a filesystem (used after running fsck on
+ * the root filesystem and finding things to fix). The filesystem must
+ * be mounted read-only.
+ *
+ * Things to do to update the mount:
+ * 1) invalidate all cached meta-data.
+ * 2) re-read superblock from disk.
+ * 3) re-read summary information from disk.
+ * 4) invalidate all inactive vnodes.
+ * 5) invalidate all cached file data.
+ * 6) re-read inode data for all active vnodes.
+ */
+int
+ext2fs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l)
+{
+ struct vnode *vp, *mvp, *devvp;
+ struct inode *ip;
+ struct buf *bp;
+ struct m_ext2fs *fs;
+ struct ext2fs *newfs;
+ int i, error;
+ void *cp;
+ struct ufsmount *ump;
+
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ return (EINVAL);
+
+ ump = VFSTOUFS(mp);
+ /*
+ * Step 1: invalidate all cached meta-data.
+ */
+ devvp = ump->um_devvp;
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = vinvalbuf(devvp, 0, cred, l, 0, 0);
+ VOP_UNLOCK(devvp);
+ if (error)
+ panic("ext2fs_reload: dirty1");
+ /*
+ * Step 2: re-read superblock from disk.
+ */
+ error = bread(devvp, SBLOCK, SBSIZE, NOCRED, 0, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+ newfs = (struct ext2fs *)bp->b_data;
+ error = ext2fs_checksb(newfs, (mp->mnt_flag & MNT_RDONLY) != 0);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+
+ fs = ump->um_e2fs;
+ /*
+ * copy in new superblock, and compute in-memory values
+ */
+ e2fs_sbload(newfs, &fs->e2fs);
+ fs->e2fs_ncg =
+ howmany(fs->e2fs.e2fs_bcount - fs->e2fs.e2fs_first_dblock,
+ fs->e2fs.e2fs_bpg);
+ fs->e2fs_fsbtodb = fs->e2fs.e2fs_log_bsize + LOG_MINBSIZE - DEV_BSHIFT;
+ fs->e2fs_bsize = MINBSIZE << fs->e2fs.e2fs_log_bsize;
+ fs->e2fs_bshift = LOG_MINBSIZE + fs->e2fs.e2fs_log_bsize;
+ fs->e2fs_qbmask = fs->e2fs_bsize - 1;
+ fs->e2fs_bmask = ~fs->e2fs_qbmask;
+ fs->e2fs_ngdb =
+ howmany(fs->e2fs_ncg, fs->e2fs_bsize / sizeof(struct ext2_gd));
+ fs->e2fs_ipb = fs->e2fs_bsize / EXT2_DINODE_SIZE(fs);
+ fs->e2fs_itpg = fs->e2fs.e2fs_ipg / fs->e2fs_ipb;
+ brelse(bp, 0);
+
+ /*
+ * Step 3: re-read summary information from disk.
+ */
+
+ for (i = 0; i < fs->e2fs_ngdb; i++) {
+ error = bread(devvp,
+ fsbtodb(fs, fs->e2fs.e2fs_first_dblock +
+ 1 /* superblock */ + i),
+ fs->e2fs_bsize, NOCRED, 0, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+ e2fs_cgload((struct ext2_gd *)bp->b_data,
+ &fs->e2fs_gd[i * fs->e2fs_bsize / sizeof(struct ext2_gd)],
+ fs->e2fs_bsize);
+ brelse(bp, 0);
+ }
+
+ /* Allocate a marker vnode. */
+ mvp = vnalloc(mp);
+ /*
+ * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
+ * and vclean() can be called indirectly
+ */
+ mutex_enter(&mntvnode_lock);
+loop:
+ for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
+ vmark(mvp, vp);
+ if (vp->v_mount != mp || vismarker(vp))
+ continue;
+ /*
+ * Step 4: invalidate all inactive vnodes.
+ */
+ if (vrecycle(vp, &mntvnode_lock, l)) {
+ mutex_enter(&mntvnode_lock);
+ (void)vunmark(mvp);
+ goto loop;
+ }
+ /*
+ * Step 5: invalidate all cached file data.
+ */
+ mutex_enter(vp->v_interlock);
+ mutex_exit(&mntvnode_lock);
+ if (vget(vp, LK_EXCLUSIVE)) {
+ mutex_enter(&mntvnode_lock);
+ (void)vunmark(mvp);
+ goto loop;
+ }
+ if (vinvalbuf(vp, 0, cred, l, 0, 0))
+ panic("ext2fs_reload: dirty2");
+ /*
+ * Step 6: re-read inode data for all active vnodes.
+ */
+ ip = VTOI(vp);
+ error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
+ (int)fs->e2fs_bsize, NOCRED, 0, &bp);
+ if (error) {
+ vput(vp);
+ mutex_enter(&mntvnode_lock);
+ (void)vunmark(mvp);
+ break;
+ }
+ cp = (char *)bp->b_data +
+ (ino_to_fsbo(fs, ip->i_number) * EXT2_DINODE_SIZE(fs));
+ e2fs_iload((struct ext2fs_dinode *)cp, ip->i_din.e2fs_din);
+ ext2fs_set_inode_guid(ip);
+ brelse(bp, 0);
+ vput(vp);
+ mutex_enter(&mntvnode_lock);
+ }
+ mutex_exit(&mntvnode_lock);
+ vnfree(mvp);
+ return (error);
+}
+
+/*
+ * Common code for mount and mountroot
+ */
+int
+ext2fs_mountfs(struct vnode *devvp, struct mount *mp)
+{
+ struct lwp *l = curlwp;
+ struct ufsmount *ump;
+ struct buf *bp;
+ struct ext2fs *fs;
+ struct m_ext2fs *m_fs;
+ dev_t dev;
+ int error, i, ronly;
+ kauth_cred_t cred;
+ struct proc *p;
+
+ dev = devvp->v_rdev;
+ p = l ? l->l_proc : NULL;
+ cred = l ? l->l_cred : NOCRED;
+
+ /* Flush out any old buffers remaining from a previous use. */
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0);
+ VOP_UNLOCK(devvp);
+ if (error)
+ return (error);
+
+ ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+
+ bp = NULL;
+ ump = NULL;
+
+#ifdef DEBUG_EXT2
+ printf("ext2 sb size: %zu\n", sizeof(struct ext2fs));
+#endif
+ error = bread(devvp, SBLOCK, SBSIZE, cred, 0, &bp);
+ if (error)
+ goto out;
+ fs = (struct ext2fs *)bp->b_data;
+ error = ext2fs_checksb(fs, ronly);
+ if (error)
+ goto out;
+ ump = malloc(sizeof(*ump), M_UFSMNT, M_WAITOK);
+ memset(ump, 0, sizeof(*ump));
+ ump->um_fstype = UFS1;
+ ump->um_ops = &ext2fs_ufsops;
+ ump->um_e2fs = malloc(sizeof(struct m_ext2fs), M_UFSMNT, M_WAITOK);
+ memset(ump->um_e2fs, 0, sizeof(struct m_ext2fs));
+ e2fs_sbload((struct ext2fs *)bp->b_data, &ump->um_e2fs->e2fs);
+ brelse(bp, 0);
+ bp = NULL;
+ m_fs = ump->um_e2fs;
+ m_fs->e2fs_ronly = ronly;
+
+#ifdef DEBUG_EXT2
+ printf("ext2 ino size %zu\n", EXT2_DINODE_SIZE(m_fs));
+#endif
+ if (ronly == 0) {
+ if (m_fs->e2fs.e2fs_state == E2FS_ISCLEAN)
+ m_fs->e2fs.e2fs_state = 0;
+ else
+ m_fs->e2fs.e2fs_state = E2FS_ERRORS;
+ m_fs->e2fs_fmod = 1;
+ }
+
+ /* compute dynamic sb infos */
+ m_fs->e2fs_ncg =
+ howmany(m_fs->e2fs.e2fs_bcount - m_fs->e2fs.e2fs_first_dblock,
+ m_fs->e2fs.e2fs_bpg);
+ m_fs->e2fs_fsbtodb = m_fs->e2fs.e2fs_log_bsize + LOG_MINBSIZE - DEV_BSHIFT;
+ m_fs->e2fs_bsize = MINBSIZE << m_fs->e2fs.e2fs_log_bsize;
+ m_fs->e2fs_bshift = LOG_MINBSIZE + m_fs->e2fs.e2fs_log_bsize;
+ m_fs->e2fs_qbmask = m_fs->e2fs_bsize - 1;
+ m_fs->e2fs_bmask = ~m_fs->e2fs_qbmask;
+ m_fs->e2fs_ngdb =
+ howmany(m_fs->e2fs_ncg, m_fs->e2fs_bsize / sizeof(struct ext2_gd));
+ m_fs->e2fs_ipb = m_fs->e2fs_bsize / EXT2_DINODE_SIZE(m_fs);
+ m_fs->e2fs_itpg = m_fs->e2fs.e2fs_ipg / m_fs->e2fs_ipb;
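+
+ /*
+ * Editor's note (illustrative, not part of the original source): taking
+ * MINBSIZE as 1024, LOG_MINBSIZE as 10 and DEV_BSHIFT as 9, a superblock
+ * with e2fs_log_bsize == 2 yields e2fs_bsize = 1024 << 2 = 4096,
+ * e2fs_bshift = 10 + 2 = 12 and e2fs_fsbtodb = 2 + 10 - 9 = 3, i.e.
+ * eight 512-byte device sectors per file-system block.
+ */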
+
+ m_fs->e2fs_gd = malloc(m_fs->e2fs_ngdb * m_fs->e2fs_bsize,
+ M_UFSMNT, M_WAITOK);
+ for (i = 0; i < m_fs->e2fs_ngdb; i++) {
+ error = bread(devvp,
+ fsbtodb(m_fs, m_fs->e2fs.e2fs_first_dblock +
+ 1 /* superblock */ + i),
+ m_fs->e2fs_bsize, NOCRED, 0, &bp);
+ if (error) {
+ free(m_fs->e2fs_gd, M_UFSMNT);
+ goto out;
+ }
+ e2fs_cgload((struct ext2_gd *)bp->b_data,
+ &m_fs->e2fs_gd[
+ i * m_fs->e2fs_bsize / sizeof(struct ext2_gd)],
+ m_fs->e2fs_bsize);
+ brelse(bp, 0);
+ bp = NULL;
+ }
+
+ mp->mnt_data = ump;
+ mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev;
+ mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_EXT2FS);
+ mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
+ mp->mnt_stat.f_namemax = EXT2FS_MAXNAMLEN;
+ mp->mnt_flag |= MNT_LOCAL;
+ mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */
+ mp->mnt_fs_bshift = m_fs->e2fs_bshift;
+ mp->mnt_iflag |= IMNT_DTYPE;
+ ump->um_flags = 0;
+ ump->um_mountp = mp;
+ ump->um_dev = dev;
+ ump->um_devvp = devvp;
+ ump->um_nindir = NINDIR(m_fs);
+ ump->um_lognindir = ffs(NINDIR(m_fs)) - 1;
+ ump->um_bptrtodb = m_fs->e2fs_fsbtodb;
+ ump->um_seqinc = 1; /* no frags */
+ ump->um_maxsymlinklen = EXT2_MAXSYMLINKLEN;
+ ump->um_dirblksiz = m_fs->e2fs_bsize;
+ ump->um_maxfilesize = ((uint64_t)0x80000000 * m_fs->e2fs_bsize - 1);
+ devvp->v_specmountpoint = mp;
+ return (0);
+
+out:
+ KASSERT(bp != NULL);
+ brelse(bp, 0);
+ if (ump) {
+ free(ump->um_e2fs, M_UFSMNT);
+ free(ump, M_UFSMNT);
+ mp->mnt_data = NULL;
+ }
+ return (error);
+}
+
+/*
+ * unmount system call
+ */
+int
+ext2fs_unmount(struct mount *mp, int mntflags)
+{
+ struct ufsmount *ump;
+ struct m_ext2fs *fs;
+ int error, flags;
+
+ flags = 0;
+ if (mntflags & MNT_FORCE)
+ flags |= FORCECLOSE;
+ if ((error = ext2fs_flushfiles(mp, flags)) != 0)
+ return (error);
+ ump = VFSTOUFS(mp);
+ fs = ump->um_e2fs;
+ if (fs->e2fs_ronly == 0 &&
+ ext2fs_cgupdate(ump, MNT_WAIT) == 0 &&
+ (fs->e2fs.e2fs_state & E2FS_ERRORS) == 0) {
+ fs->e2fs.e2fs_state = E2FS_ISCLEAN;
+ (void) ext2fs_sbupdate(ump, MNT_WAIT);
+ }
+ if (ump->um_devvp->v_type != VBAD)
+ ump->um_devvp->v_specmountpoint = NULL;
+ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_CLOSE(ump->um_devvp, fs->e2fs_ronly ? FREAD : FREAD|FWRITE,
+ NOCRED);
+ vput(ump->um_devvp);
+ free(fs->e2fs_gd, M_UFSMNT);
+ free(fs, M_UFSMNT);
+ free(ump, M_UFSMNT);
+ mp->mnt_data = NULL;
+ mp->mnt_flag &= ~MNT_LOCAL;
+ return (error);
+}
+
+/*
+ * Flush out all the files in a filesystem.
+ */
+int
+ext2fs_flushfiles(struct mount *mp, int flags)
+{
+ extern int doforce;
+ int error;
+
+ if (!doforce)
+ flags &= ~FORCECLOSE;
+ error = vflush(mp, NULLVP, flags);
+ return (error);
+}
+
+/*
+ * Get file system statistics.
+ */
+int
+ext2fs_statvfs(struct mount *mp, struct statvfs *sbp)
+{
+ struct ufsmount *ump;
+ struct m_ext2fs *fs;
+ uint32_t overhead, overhead_per_group, ngdb;
+ int i, ngroups;
+
+ ump = VFSTOUFS(mp);
+ fs = ump->um_e2fs;
+ if (fs->e2fs.e2fs_magic != E2FS_MAGIC)
+ panic("ext2fs_statvfs");
+
+ /*
+ * Compute the overhead (FS structures)
+ */
+ overhead_per_group =
+ 1 /* block bitmap */ +
+ 1 /* inode bitmap */ +
+ fs->e2fs_itpg;
+ overhead = fs->e2fs.e2fs_first_dblock +
+ fs->e2fs_ncg * overhead_per_group;
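+ /*
+ * With the sparse_super feature only some groups (0, 1 and powers
+ * of 3, 5 and 7, as reported by cg_has_sb()) carry a superblock
+ * backup and group descriptors; otherwise every group does.
+ */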
+ if (fs->e2fs.e2fs_rev > E2FS_REV0 &&
+ fs->e2fs.e2fs_features_rocompat & EXT2F_ROCOMPAT_SPARSESUPER) {
+ for (i = 0, ngroups = 0; i < fs->e2fs_ncg; i++) {
+ if (cg_has_sb(i))
+ ngroups++;
+ }
+ } else {
+ ngroups = fs->e2fs_ncg;
+ }
+ ngdb = fs->e2fs_ngdb;
+ if (fs->e2fs.e2fs_rev > E2FS_REV0 &&
+ fs->e2fs.e2fs_features_compat & EXT2F_COMPAT_RESIZE)
+ ngdb += fs->e2fs.e2fs_reserved_ngdb;
+ overhead += ngroups * (1 /* superblock */ + ngdb);
+
+ sbp->f_bsize = fs->e2fs_bsize;
+ sbp->f_frsize = MINBSIZE << fs->e2fs.e2fs_fsize;
+ sbp->f_iosize = fs->e2fs_bsize;
+ sbp->f_blocks = fs->e2fs.e2fs_bcount - overhead;
+ sbp->f_bfree = fs->e2fs.e2fs_fbcount;
+ sbp->f_bresvd = fs->e2fs.e2fs_rbcount;
+ if (sbp->f_bfree > sbp->f_bresvd)
+ sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd;
+ else
+ sbp->f_bavail = 0;
+ sbp->f_files = fs->e2fs.e2fs_icount;
+ sbp->f_ffree = fs->e2fs.e2fs_ficount;
+ sbp->f_favail = fs->e2fs.e2fs_ficount;
+ sbp->f_fresvd = 0;
+ copy_statvfs_info(sbp, mp);
+ return (0);
+}
+
+/*
+ * Go through the disk queues to initiate sandbagged IO;
+ * go through the inodes to write those that have been modified;
+ * initiate the writing of the super block if it has been modified.
+ *
+ * Note: we are always called with the filesystem marked `MPBUSY'.
+ */
+int
+ext2fs_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
+{
+ struct vnode *vp, *mvp;
+ struct inode *ip;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct m_ext2fs *fs;
+ int error, allerror = 0;
+
+ fs = ump->um_e2fs;
+ if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) { /* XXX */
+ printf("fs = %s\n", fs->e2fs_fsmnt);
+ panic("update: rofs mod");
+ }
+
+ /* Allocate a marker vnode. */
+ mvp = vnalloc(mp);
+
+ /*
+ * Write back each (modified) inode.
+ */
+ mutex_enter(&mntvnode_lock);
+loop:
+ /*
+ * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
+ * and vclean() can be called indirectly
+ */
+ for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
+ vmark(mvp, vp);
+ if (vp->v_mount != mp || vismarker(vp))
+ continue;
+ mutex_enter(vp->v_interlock);
+ ip = VTOI(vp);
+ if (ip == NULL || (vp->v_iflag & (VI_XLOCK|VI_CLEAN)) != 0 ||
+ vp->v_type == VNON ||
+ ((ip->i_flag &
+ (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 &&
+ LIST_EMPTY(&vp->v_dirtyblkhd) &&
+ UVM_OBJ_IS_CLEAN(&vp->v_uobj)))
+ {
+ mutex_exit(vp->v_interlock);
+ continue;
+ }
+ mutex_exit(&mntvnode_lock);
+ error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error) {
+ mutex_enter(&mntvnode_lock);
+ if (error == ENOENT) {
+ (void)vunmark(mvp);
+ goto loop;
+ }
+ continue;
+ }
+ if (vp->v_type == VREG && waitfor == MNT_LAZY)
+ error = ext2fs_update(vp, NULL, NULL, 0);
+ else
+ error = VOP_FSYNC(vp, cred,
+ waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0);
+ if (error)
+ allerror = error;
+ vput(vp);
+ mutex_enter(&mntvnode_lock);
+ }
+ mutex_exit(&mntvnode_lock);
+ vnfree(mvp);
+ /*
+ * Force stale file system control information to be flushed.
+ */
+ if (waitfor != MNT_LAZY) {
+ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+ if ((error = VOP_FSYNC(ump->um_devvp, cred,
+ waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0)) != 0)
+ allerror = error;
+ VOP_UNLOCK(ump->um_devvp);
+ }
+ /*
+ * Write back modified superblock.
+ */
+ if (fs->e2fs_fmod != 0) {
+ fs->e2fs_fmod = 0;
+ fs->e2fs.e2fs_wtime = time_second;
+ if ((error = ext2fs_cgupdate(ump, waitfor)))
+ allerror = error;
+ }
+ return (allerror);
+}
+
+/*
+ * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it
+ * in from disk. If it is in core, wait for the lock bit to clear, then
+ * return the inode locked. Detection and handling of mount points must be
+ * done by the calling routine.
+ */
+int
+ext2fs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
+{
+ struct m_ext2fs *fs;
+ struct inode *ip;
+ struct ufsmount *ump;
+ struct buf *bp;
+ struct vnode *vp;
+ dev_t dev;
+ int error;
+ void *cp;
+
+ ump = VFSTOUFS(mp);
+ dev = ump->um_dev;
+retry:
+ if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL)
+ return (0);
+
+ /* Allocate a new vnode/inode. */
+ error = getnewvnode(VT_EXT2FS, mp, ext2fs_vnodeop_p, NULL, &vp);
+ if (error) {
+ *vpp = NULL;
+ return (error);
+ }
+ ip = pool_get(&ext2fs_inode_pool, PR_WAITOK);
+
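+ /*
+ * Check the hash again under ufs_hashlock: another thread may have
+ * loaded this inode while we were allocating. If so, discard our
+ * fresh vnode and inode and retry the lookup.
+ */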
+ mutex_enter(&ufs_hashlock);
+ if ((*vpp = ufs_ihashget(dev, ino, 0)) != NULL) {
+ mutex_exit(&ufs_hashlock);
+ ungetnewvnode(vp);
+ pool_put(&ext2fs_inode_pool, ip);
+ goto retry;
+ }
+
+ vp->v_vflag |= VV_LOCKSWORK;
+
+ memset(ip, 0, sizeof(struct inode));
+ vp->v_data = ip;
+ ip->i_vnode = vp;
+ ip->i_ump = ump;
+ ip->i_e2fs = fs = ump->um_e2fs;
+ ip->i_dev = dev;
+ ip->i_number = ino;
+ ip->i_e2fs_last_lblk = 0;
+ ip->i_e2fs_last_blk = 0;
+ genfs_node_init(vp, &ext2fs_genfsops);
+
+ /*
+ * Put it onto its hash chain and lock it so that other requests for
+ * this inode will block if they arrive while we are sleeping waiting
+ * for old data structures to be purged or for the contents of the
+ * disk portion of this inode to be read.
+ */
+
+ ufs_ihashins(ip);
+ mutex_exit(&ufs_hashlock);
+
+ /* Read in the disk contents for the inode, copy into the inode. */
+ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
+ (int)fs->e2fs_bsize, NOCRED, 0, &bp);
+ if (error) {
+
+ /*
+ * The inode does not contain anything useful, so it would
+ * be misleading to leave it on its hash chain. With mode
+ * still zero, it will be unlinked and returned to the free
+ * list by vput().
+ */
+
+ vput(vp);
+ brelse(bp, 0);
+ *vpp = NULL;
+ return (error);
+ }
+ cp = (char *)bp->b_data + (ino_to_fsbo(fs, ino) * EXT2_DINODE_SIZE(fs));
+ ip->i_din.e2fs_din = pool_get(&ext2fs_dinode_pool, PR_WAITOK);
+ e2fs_iload((struct ext2fs_dinode *)cp, ip->i_din.e2fs_din);
+ ext2fs_set_inode_guid(ip);
+ brelse(bp, 0);
+
+ /* If the inode was deleted, reset all fields */
+ if (ip->i_e2fs_dtime != 0) {
+ ip->i_e2fs_mode = ip->i_e2fs_nblock = 0;
+ (void)ext2fs_setsize(ip, 0);
+ memset(ip->i_e2fs_blocks, 0, sizeof(ip->i_e2fs_blocks));
+ }
+
+ /*
+ * Initialize the vnode from the inode, check for aliases.
+ */
+
+ error = ext2fs_vinit(mp, ext2fs_specop_p, ext2fs_fifoop_p, &vp);
+ if (error) {
+ vput(vp);
+ *vpp = NULL;
+ return (error);
+ }
+ /*
+ * Finish inode initialization now that aliasing has been resolved.
+ */
+
+ ip->i_devvp = ump->um_devvp;
+ vref(ip->i_devvp);
+
+ /*
+ * Set up a generation number for this inode if it does not
+ * already have one. This should only happen on old filesystems.
+ */
+
+ if (ip->i_e2fs_gen == 0) {
+ if (++ext2gennumber < (u_long)time_second)
+ ext2gennumber = time_second;
+ ip->i_e2fs_gen = ext2gennumber;
+ if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
+ ip->i_flag |= IN_MODIFIED;
+ }
+ uvm_vnp_setsize(vp, ext2fs_size(ip));
+ *vpp = vp;
+ return (0);
+}
+
+/*
+ * File handle to vnode
+ *
+ * Have to be really careful about stale file handles:
+ * - check that the inode number is valid
+ * - call ext2fs_vget() to get the locked inode
+ * - check for an unallocated inode (i_mode == 0)
+ */
+int
+ext2fs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
+{
+ struct inode *ip;
+ struct vnode *nvp;
+ int error;
+ struct ufid ufh;
+ struct m_ext2fs *fs;
+
+ if (fhp->fid_len != sizeof(struct ufid))
+ return EINVAL;
+
+ memcpy(&ufh, fhp, sizeof(struct ufid));
+ fs = VFSTOUFS(mp)->um_e2fs;
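+ /*
+ * A valid inode number is either the root inode or anything from
+ * EXT2_FIRSTINO up to the total number of inodes on the volume.
+ */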
+ if ((ufh.ufid_ino < EXT2_FIRSTINO && ufh.ufid_ino != EXT2_ROOTINO) ||
+ ufh.ufid_ino >= fs->e2fs_ncg * fs->e2fs.e2fs_ipg)
+ return (ESTALE);
+
+ if ((error = VFS_VGET(mp, ufh.ufid_ino, &nvp)) != 0) {
+ *vpp = NULLVP;
+ return (error);
+ }
+ ip = VTOI(nvp);
+ if (ip->i_e2fs_mode == 0 || ip->i_e2fs_dtime != 0 ||
+ ip->i_e2fs_gen != ufh.ufid_gen) {
+ vput(nvp);
+ *vpp = NULLVP;
+ return (ESTALE);
+ }
+ *vpp = nvp;
+ return (0);
+}
+
+/*
+ * Vnode pointer to File handle
+ */
+/* ARGSUSED */
+int
+ext2fs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
+{
+ struct inode *ip;
+ struct ufid ufh;
+
+ if (*fh_size < sizeof(struct ufid)) {
+ *fh_size = sizeof(struct ufid);
+ return E2BIG;
+ }
+ *fh_size = sizeof(struct ufid);
+
+ ip = VTOI(vp);
+ memset(&ufh, 0, sizeof(ufh));
+ ufh.ufid_len = sizeof(struct ufid);
+ ufh.ufid_ino = ip->i_number;
+ ufh.ufid_gen = ip->i_e2fs_gen;
+ memcpy(fhp, &ufh, sizeof(ufh));
+ return (0);
+}
+
+/*
+ * Write a superblock and associated information back to disk.
+ */
+int
+ext2fs_sbupdate(struct ufsmount *mp, int waitfor)
+{
+ struct m_ext2fs *fs = mp->um_e2fs;
+ struct buf *bp;
+ int error = 0;
+
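+ /*
+ * The superblock lives at a fixed device offset (SBLOCK/SBSIZE),
+ * independent of the filesystem block size; copy the in-core
+ * superblock into a buffer and write it back.
+ */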
+ bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0);
+ e2fs_sbsave(&fs->e2fs, (struct ext2fs*)bp->b_data);
+ if (waitfor == MNT_WAIT)
+ error = bwrite(bp);
+ else
+ bawrite(bp);
+ return (error);
+}
+
+int
+ext2fs_cgupdate(struct ufsmount *mp, int waitfor)
+{
+ struct m_ext2fs *fs = mp->um_e2fs;
+ struct buf *bp;
+ int i, error = 0, allerror = 0;
+
+ allerror = ext2fs_sbupdate(mp, waitfor);
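+ /*
+ * Write back each block of the group descriptor table; on disk the
+ * table immediately follows the superblock.
+ */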
+ for (i = 0; i < fs->e2fs_ngdb; i++) {
+ bp = getblk(mp->um_devvp, fsbtodb(fs,
+ fs->e2fs.e2fs_first_dblock +
+ 1 /* superblock */ + i), fs->e2fs_bsize, 0, 0);
+ e2fs_cgsave(&fs->e2fs_gd[
+ i * fs->e2fs_bsize / sizeof(struct ext2_gd)],
+ (struct ext2_gd *)bp->b_data, fs->e2fs_bsize);
+ if (waitfor == MNT_WAIT)
+ error = bwrite(bp);
+ else
+ bawrite(bp);
+ }
+
+ if (!allerror && error)
+ allerror = error;
+ return (allerror);
+}
+
+static int
+ext2fs_checksb(struct ext2fs *fs, int ronly)
+{
+
+ if (fs2h16(fs->e2fs_magic) != E2FS_MAGIC) {
+ return (EINVAL); /* XXX needs translation */
+ }
+ if (fs2h32(fs->e2fs_rev) > E2FS_REV1) {
+#ifdef DIAGNOSTIC
+ printf("Ext2 fs: unsupported revision number: %x\n",
+ fs2h32(fs->e2fs_rev));
+#endif
+ return (EINVAL); /* XXX needs translation */
+ }
+ if (fs2h32(fs->e2fs_log_bsize) > 2) { /* block size = 1024|2048|4096 */
+#ifdef DIAGNOSTIC
+ printf("Ext2 fs: bad block size: %d "
+ "(expected <= 2 for ext2 fs)\n",
+ fs2h32(fs->e2fs_log_bsize));
+#endif
+ return (EINVAL); /* XXX needs translation */
+ }
+ if (fs2h32(fs->e2fs_rev) > E2FS_REV0) {
+ if (fs2h32(fs->e2fs_first_ino) != EXT2_FIRSTINO) {
+ printf("Ext2 fs: unsupported first inode position\n");
+ return (EINVAL); /* XXX needs translation */
+ }
+ if (fs2h32(fs->e2fs_features_incompat) &
+ ~EXT2F_INCOMPAT_SUPP) {
+ printf("Ext2 fs: unsupported optional feature\n");
+ return (EINVAL); /* XXX needs translation */
+ }
+ if (!ronly && fs2h32(fs->e2fs_features_rocompat) &
+ ~EXT2F_ROCOMPAT_SUPP) {
+ return (EROFS); /* XXX needs translation */
+ }
+ }
+ return (0);
+}
--- /dev/null
+/* $NetBSD: ext2fs_vnops.c,v 1.101 2011/11/18 21:18:51 christos Exp $ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_vnops.c 8.14 (Berkeley) 10/26/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @(#)ufs_vnops.c 8.14 (Berkeley) 10/26/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_vnops.c,v 1.101 2011/11/18 21:18:51 christos Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/lockf.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/signalvar.h>
+#include <sys/kauth.h>
+
+#include <miscfs/fifofs/fifo.h>
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufsmount.h>
+
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+#include <ufs/ext2fs/ext2fs_dir.h>
+
+extern int prtactive;
+
+static int ext2fs_chmod(struct vnode *, int, kauth_cred_t, struct lwp *);
+static int ext2fs_chown(struct vnode *, uid_t, gid_t, kauth_cred_t,
+ struct lwp *);
+
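+/*
+ * Helpers to set the high or low 32-bit word of a 64-bit quantity;
+ * used below to stamp i_modrev from the current uptime.
+ */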
+union _qcvt {
+ int64_t qcvt;
+ int32_t val[2];
+};
+
+#define SETHIGH(q, h) { \
+ union _qcvt tmp; \
+ tmp.qcvt = (q); \
+ tmp.val[_QUAD_HIGHWORD] = (h); \
+ (q) = tmp.qcvt; \
+}
+#define SETLOW(q, l) { \
+ union _qcvt tmp; \
+ tmp.qcvt = (q); \
+ tmp.val[_QUAD_LOWWORD] = (l); \
+ (q) = tmp.qcvt; \
+}
+
+/*
+ * Create a regular file
+ */
+int
+ext2fs_create(void *v)
+{
+ struct vop_create_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap = v;
+ int error;
+
+ error =
+ ext2fs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
+ ap->a_dvp, ap->a_vpp, ap->a_cnp);
+
+ if (error)
+ return (error);
+ VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+ return (0);
+}
+
+/*
+ * Mknod vnode call
+ */
+/* ARGSUSED */
+int
+ext2fs_mknod(void *v)
+{
+ struct vop_mknod_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap = v;
+ struct vattr *vap = ap->a_vap;
+ struct vnode **vpp = ap->a_vpp;
+ struct inode *ip;
+ int error;
+ struct mount *mp;
+ ino_t ino;
+
+ if ((error = ext2fs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
+ ap->a_dvp, vpp, ap->a_cnp)) != 0)
+ return (error);
+ VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+ ip = VTOI(*vpp);
+ mp = (*vpp)->v_mount;
+ ino = ip->i_number;
+ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+ if (vap->va_rdev != VNOVAL) {
+ /*
+ * Want to be able to use this to make badblock
+ * inodes, so don't truncate the dev number.
+ */
+ ip->i_din.e2fs_din->e2di_rdev = h2fs32(vap->va_rdev);
+ }
+ /*
+ * Remove inode so that it will be reloaded by VFS_VGET and
+ * checked to see if it is an alias of an existing entry in
+ * the inode cache.
+ */
+ VOP_UNLOCK(*vpp);
+ (*vpp)->v_type = VNON;
+ vgone(*vpp);
+ error = VFS_VGET(mp, ino, vpp);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+ return (0);
+}
+
+/*
+ * Open called.
+ *
+ * Just check the APPEND flag.
+ */
+/* ARGSUSED */
+int
+ext2fs_open(void *v)
+{
+ struct vop_open_args /* {
+ struct vnode *a_vp;
+ int a_mode;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+
+ /*
+ * Files marked append-only must be opened for appending.
+ */
+ if ((VTOI(ap->a_vp)->i_e2fs_flags & EXT2_APPEND) &&
+ (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
+ return (EPERM);
+ return (0);
+}
+
+static int
+ext2fs_check_possible(struct vnode *vp, struct inode *ip, mode_t mode)
+{
+
+ /*
+ * Disallow write attempts on read-only file systems;
+ * unless the file is a socket, fifo, or a block or
+ * character device resident on the file system.
+ */
+ if (mode & VWRITE) {
+ switch (vp->v_type) {
+ case VDIR:
+ case VLNK:
+ case VREG:
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (EROFS);
+ break;
+ default:
+ break;
+ }
+ }
+
+ /* If immutable bit set, nobody gets to write it. */
+ if ((mode & VWRITE) && (ip->i_e2fs_flags & EXT2_IMMUTABLE))
+ return (EPERM);
+
+ return 0;
+}
+
+static int
+ext2fs_check_permitted(struct vnode *vp, struct inode *ip, mode_t mode,
+ kauth_cred_t cred)
+{
+
+ return genfs_can_access(vp->v_type, ip->i_e2fs_mode & ALLPERMS,
+ ip->i_uid, ip->i_gid, mode, cred);
+}
+
+int
+ext2fs_access(void *v)
+{
+ struct vop_access_args /* {
+ struct vnode *a_vp;
+ int a_mode;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ mode_t mode = ap->a_mode;
+ int error;
+
+ error = ext2fs_check_possible(vp, ip, mode);
+ if (error)
+ return error;
+
+ error = ext2fs_check_permitted(vp, ip, mode, ap->a_cred);
+
+ return error;
+}
+
+/* ARGSUSED */
+int
+ext2fs_getattr(void *v)
+{
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ struct vattr *vap = ap->a_vap;
+
+ EXT2FS_ITIMES(ip, NULL, NULL, NULL);
+ /*
+ * Copy from inode table
+ */
+ vap->va_fsid = ip->i_dev;
+ vap->va_fileid = ip->i_number;
+ vap->va_mode = ip->i_e2fs_mode & ALLPERMS;
+ vap->va_nlink = ip->i_e2fs_nlink;
+ vap->va_uid = ip->i_uid;
+ vap->va_gid = ip->i_gid;
+ vap->va_rdev = (dev_t)fs2h32(ip->i_din.e2fs_din->e2di_rdev);
+ vap->va_size = vp->v_size;
+ vap->va_atime.tv_sec = ip->i_e2fs_atime;
+ vap->va_atime.tv_nsec = 0;
+ vap->va_mtime.tv_sec = ip->i_e2fs_mtime;
+ vap->va_mtime.tv_nsec = 0;
+ vap->va_ctime.tv_sec = ip->i_e2fs_ctime;
+ vap->va_ctime.tv_nsec = 0;
+#ifdef EXT2FS_SYSTEM_FLAGS
+ vap->va_flags = (ip->i_e2fs_flags & EXT2_APPEND) ? SF_APPEND : 0;
+ vap->va_flags |= (ip->i_e2fs_flags & EXT2_IMMUTABLE) ? SF_IMMUTABLE : 0;
+#else
+ vap->va_flags = (ip->i_e2fs_flags & EXT2_APPEND) ? UF_APPEND : 0;
+ vap->va_flags |= (ip->i_e2fs_flags & EXT2_IMMUTABLE) ? UF_IMMUTABLE : 0;
+#endif
+ vap->va_gen = ip->i_e2fs_gen;
+ /* this doesn't belong here */
+ if (vp->v_type == VBLK)
+ vap->va_blocksize = BLKDEV_IOSIZE;
+ else if (vp->v_type == VCHR)
+ vap->va_blocksize = MAXBSIZE;
+ else
+ vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
+ vap->va_bytes = dbtob((u_quad_t)ip->i_e2fs_nblock);
+ vap->va_type = vp->v_type;
+ vap->va_filerev = ip->i_modrev;
+ return (0);
+}
+
+/*
+ * Set attribute vnode op. called from several syscalls
+ */
+int
+ext2fs_setattr(void *v)
+{
+ struct vop_setattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vattr *vap = ap->a_vap;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ kauth_cred_t cred = ap->a_cred;
+ struct lwp *l = curlwp;
+ int error;
+
+ /*
+ * Check for unsettable attributes.
+ */
+ if ((vap->va_type != VNON) || (vap->va_nlink != (nlink_t)VNOVAL) ||
+ (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
+ (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
+ ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
+ return (EINVAL);
+ }
+ if (vap->va_flags != VNOVAL) {
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (EROFS);
+ if (kauth_cred_geteuid(cred) != ip->i_uid &&
+ (error = kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
+ NULL)))
+ return (error);
+#ifdef EXT2FS_SYSTEM_FLAGS
+ if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
+ NULL) == 0) {
+ if ((ip->i_e2fs_flags &
+ (EXT2_APPEND | EXT2_IMMUTABLE)) &&
+ kauth_authorize_system(l->l_cred,
+ KAUTH_SYSTEM_CHSYSFLAGS, 0, NULL, NULL, NULL))
+ return (EPERM);
+ ip->i_e2fs_flags &= ~(EXT2_APPEND | EXT2_IMMUTABLE);
+ ip->i_e2fs_flags |=
+ (vap->va_flags & SF_APPEND) ? EXT2_APPEND : 0 |
+ (vap->va_flags & SF_IMMUTABLE) ? EXT2_IMMUTABLE : 0;
+ } else
+ return (EPERM);
+#else
+ ip->i_e2fs_flags &= ~(EXT2_APPEND | EXT2_IMMUTABLE);
+ ip->i_e2fs_flags |=
+ (vap->va_flags & UF_APPEND) ? EXT2_APPEND : 0 |
+ (vap->va_flags & UF_IMMUTABLE) ? EXT2_IMMUTABLE : 0;
+#endif
+ ip->i_flag |= IN_CHANGE;
+ if (vap->va_flags & (IMMUTABLE | APPEND))
+ return (0);
+ }
+ if (ip->i_e2fs_flags & (EXT2_APPEND | EXT2_IMMUTABLE))
+ return (EPERM);
+ /*
+ * Go through the fields and update iff not VNOVAL.
+ */
+ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (EROFS);
+ error = ext2fs_chown(vp, vap->va_uid, vap->va_gid, cred, l);
+ if (error)
+ return (error);
+ }
+ if (vap->va_size != VNOVAL) {
+ /*
+ * Disallow write attempts on read-only file systems;
+ * unless the file is a socket, fifo, or a block or
+ * character device resident on the file system.
+ */
+ switch (vp->v_type) {
+ case VDIR:
+ return (EISDIR);
+ case VLNK:
+ case VREG:
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (EROFS);
+ default:
+ break;
+ }
+ error = ext2fs_truncate(vp, vap->va_size, 0, cred);
+ if (error)
+ return (error);
+ }
+ ip = VTOI(vp);
+ if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (EROFS);
+ error = genfs_can_chtimes(vp, vap->va_vaflags, ip->i_uid, cred);
+ if (error)
+ return (error);
+ if (vap->va_atime.tv_sec != VNOVAL)
+ if (!(vp->v_mount->mnt_flag & MNT_NOATIME))
+ ip->i_flag |= IN_ACCESS;
+ if (vap->va_mtime.tv_sec != VNOVAL) {
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (vp->v_mount->mnt_flag & MNT_RELATIME)
+ ip->i_flag |= IN_ACCESS;
+ }
+ error = ext2fs_update(vp, &vap->va_atime, &vap->va_mtime,
+ UPDATE_WAIT);
+ if (error)
+ return (error);
+ }
+ error = 0;
+ if (vap->va_mode != (mode_t)VNOVAL) {
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (EROFS);
+ error = ext2fs_chmod(vp, (int)vap->va_mode, cred, l);
+ }
+ VN_KNOTE(vp, NOTE_ATTRIB);
+ return (error);
+}
+
+/*
+ * Change the mode on a file.
+ * Inode must be locked before calling.
+ */
+static int
+ext2fs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l)
+{
+ struct inode *ip = VTOI(vp);
+ int error;
+
+ error = genfs_can_chmod(vp, cred, ip->i_uid, ip->i_gid, mode);
+ if (error)
+ return (error);
+
+ ip->i_e2fs_mode &= ~ALLPERMS;
+ ip->i_e2fs_mode |= (mode & ALLPERMS);
+ ip->i_flag |= IN_CHANGE;
+ return (0);
+}
+
+/*
+ * Perform chown operation on inode ip;
+ * inode must be locked prior to call.
+ */
+static int
+ext2fs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred,
+ struct lwp *l)
+{
+ struct inode *ip = VTOI(vp);
+ uid_t ouid;
+ gid_t ogid;
+ int error;
+
+ if (uid == (uid_t)VNOVAL)
+ uid = ip->i_uid;
+ if (gid == (gid_t)VNOVAL)
+ gid = ip->i_gid;
+
+ error = genfs_can_chown(vp, cred, ip->i_uid, ip->i_gid, uid, gid);
+ if (error)
+ return (error);
+
+ ogid = ip->i_gid;
+ ouid = ip->i_uid;
+
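+ /*
+ * The on-disk inode stores 16-bit uid/gid; revision 1 filesystems
+ * keep the upper 16 bits in separate high fields.
+ */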
+ ip->i_e2fs_gid = gid & 0xffff;
+ ip->i_e2fs_uid = uid & 0xffff;
+ if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) {
+ ip->i_e2fs_gid_high = (gid >> 16) & 0xffff;
+ ip->i_e2fs_uid_high = (uid >> 16) & 0xffff;
+ } else {
+ ip->i_e2fs_gid_high = 0;
+ ip->i_e2fs_uid_high = 0;
+ }
+ if (ouid != uid || ogid != gid) {
+ ext2fs_set_inode_guid(ip);
+ ip->i_flag |= IN_CHANGE;
+ }
+ if (ouid != uid && kauth_authorize_generic(cred,
+ KAUTH_GENERIC_ISSUSER, NULL) != 0)
+ ip->i_e2fs_mode &= ~ISUID;
+ if (ogid != gid && kauth_authorize_generic(cred,
+ KAUTH_GENERIC_ISSUSER, NULL) != 0)
+ ip->i_e2fs_mode &= ~ISGID;
+ return (0);
+}
+
+int
+ext2fs_remove(void *v)
+{
+ struct vop_remove_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap = v;
+ struct inode *ip;
+ struct vnode *vp = ap->a_vp;
+ struct vnode *dvp = ap->a_dvp;
+ struct ufs_lookup_results *ulr;
+ int error;
+
+ /* XXX should handle this material another way */
+ ulr = &VTOI(dvp)->i_crap;
+ UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+ ip = VTOI(vp);
+ if (vp->v_type == VDIR ||
+ (ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) ||
+ (VTOI(dvp)->i_e2fs_flags & EXT2_APPEND)) {
+ error = EPERM;
+ } else {
+ error = ext2fs_dirremove(dvp, ulr, ap->a_cnp);
+ if (error == 0) {
+ ip->i_e2fs_nlink--;
+ ip->i_flag |= IN_CHANGE;
+ }
+ }
+
+ VN_KNOTE(vp, NOTE_DELETE);
+ VN_KNOTE(dvp, NOTE_WRITE);
+ if (dvp == vp)
+ vrele(vp);
+ else
+ vput(vp);
+ vput(dvp);
+ return (error);
+}
+
+/*
+ * ext2fs_link: create hard link.
+ */
+int
+ext2fs_link(void *v)
+{
+ struct vop_link_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap = v;
+ struct vnode *dvp = ap->a_dvp;
+ struct vnode *vp = ap->a_vp;
+ struct componentname *cnp = ap->a_cnp;
+ struct inode *ip;
+ int error;
+ struct ufs_lookup_results *ulr;
+
+ KASSERT(dvp != vp);
+ KASSERT(vp->v_type != VDIR);
+ KASSERT(dvp->v_mount == vp->v_mount);
+
+ /* XXX should handle this material another way */
+ ulr = &VTOI(dvp)->i_crap;
+ UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+ error = vn_lock(vp, LK_EXCLUSIVE);
+ if (error) {
+ VOP_ABORTOP(dvp, cnp);
+ goto out2;
+ }
+ ip = VTOI(vp);
+ if ((nlink_t)ip->i_e2fs_nlink >= LINK_MAX) {
+ VOP_ABORTOP(dvp, cnp);
+ error = EMLINK;
+ goto out1;
+ }
+ if (ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) {
+ VOP_ABORTOP(dvp, cnp);
+ error = EPERM;
+ goto out1;
+ }
+ ip->i_e2fs_nlink++;
+ ip->i_flag |= IN_CHANGE;
+ error = ext2fs_update(vp, NULL, NULL, UPDATE_WAIT);
+ if (!error)
+ error = ext2fs_direnter(ip, dvp, ulr, cnp);
+ if (error) {
+ ip->i_e2fs_nlink--;
+ ip->i_flag |= IN_CHANGE;
+ }
+out1:
+ VOP_UNLOCK(vp);
+out2:
+ VN_KNOTE(vp, NOTE_LINK);
+ VN_KNOTE(dvp, NOTE_WRITE);
+ vput(dvp);
+ return (error);
+}
+
+/*
+ * Rename system call.
+ * rename("foo", "bar");
+ * is essentially
+ * unlink("bar");
+ * link("foo", "bar");
+ * unlink("foo");
+ * but ``atomically''. Can't do full commit without saving state in the
+ * inode on disk which isn't feasible at this time. Best we can do is
+ * always guarantee the target exists.
+ *
+ * Basic algorithm is:
+ *
+ * 1) Bump link count on source while we're linking it to the
+ * target. This also ensures the inode won't be deleted out
+ * from underneath us while we work (it may be truncated by
+ * a concurrent `trunc' or `open' for creation).
+ * 2) Link source to destination. If destination already exists,
+ * delete it first.
+ * 3) Unlink source reference to inode if still around. If a
+ * directory was moved and the parent of the destination
+ * is different from the source, patch the ".." entry in the
+ * directory.
+ */
+int
+ext2fs_rename(void *v)
+{
+ struct vop_rename_args /* {
+ struct vnode *a_fdvp;
+ struct vnode *a_fvp;
+ struct componentname *a_fcnp;
+ struct vnode *a_tdvp;
+ struct vnode *a_tvp;
+ struct componentname *a_tcnp;
+ } */ *ap = v;
+ struct vnode *tvp = ap->a_tvp;
+ struct vnode *tdvp = ap->a_tdvp;
+ struct vnode *fvp = ap->a_fvp;
+ struct vnode *fdvp = ap->a_fdvp;
+ struct componentname *tcnp = ap->a_tcnp;
+ struct componentname *fcnp = ap->a_fcnp;
+ struct inode *ip, *xp, *dp;
+ struct ext2fs_dirtemplate dirbuf;
+ int doingdirectory = 0, oldparent = 0, newparent = 0;
+ int error = 0;
+ u_char namlen;
+
+ /*
+ * Check for cross-device rename.
+ */
+ if ((fvp->v_mount != tdvp->v_mount) ||
+ (tvp && (fvp->v_mount != tvp->v_mount))) {
+ error = EXDEV;
+abortit:
+ VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ if (tvp)
+ vput(tvp);
+ VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */
+ vrele(fdvp);
+ vrele(fvp);
+ return (error);
+ }
+
+ /*
+ * Check if just deleting a link name.
+ */
+ if (tvp && ((VTOI(tvp)->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) ||
+ (VTOI(tdvp)->i_e2fs_flags & EXT2_APPEND))) {
+ error = EPERM;
+ goto abortit;
+ }
+ if (fvp == tvp) {
+ if (fvp->v_type == VDIR) {
+ error = EINVAL;
+ goto abortit;
+ }
+
+ /* Release destination completely. */
+ VOP_ABORTOP(tdvp, tcnp);
+ vput(tdvp);
+ vput(tvp);
+
+ /* Delete source. */
+ vrele(fvp);
+ fcnp->cn_flags &= ~(MODMASK);
+ fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+ fcnp->cn_nameiop = DELETE;
+ vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
+ if ((error = relookup(fdvp, &fvp, fcnp, 0))) {
+ vput(fdvp);
+ return (error);
+ }
+ return (VOP_REMOVE(fdvp, fvp, fcnp));
+ }
+ if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
+ goto abortit;
+ dp = VTOI(fdvp);
+ ip = VTOI(fvp);
+ if ((nlink_t) ip->i_e2fs_nlink >= LINK_MAX) {
+ VOP_UNLOCK(fvp);
+ error = EMLINK;
+ goto abortit;
+ }
+ if ((ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) ||
+ (dp->i_e2fs_flags & EXT2_APPEND)) {
+ VOP_UNLOCK(fvp);
+ error = EPERM;
+ goto abortit;
+ }
+ if ((ip->i_e2fs_mode & IFMT) == IFDIR) {
+ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred);
+ if (!error && tvp)
+ error = VOP_ACCESS(tvp, VWRITE, tcnp->cn_cred);
+ if (error) {
+ VOP_UNLOCK(fvp);
+ error = EACCES;
+ goto abortit;
+ }
+ /*
+ * Avoid ".", "..", and aliases of "." for obvious reasons.
+ */
+ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
+ dp == ip ||
+ (fcnp->cn_flags & ISDOTDOT) ||
+ (tcnp->cn_flags & ISDOTDOT) ||
+ (ip->i_flag & IN_RENAME)) {
+ VOP_UNLOCK(fvp);
+ error = EINVAL;
+ goto abortit;
+ }
+ ip->i_flag |= IN_RENAME;
+ oldparent = dp->i_number;
+ doingdirectory = 1;
+ }
+ VN_KNOTE(fdvp, NOTE_WRITE); /* XXXLUKEM/XXX: right place? */
+
+ /*
+ * When the target exists, both the directory
+ * and target vnodes are returned locked.
+ */
+ dp = VTOI(tdvp);
+ xp = NULL;
+ if (tvp)
+ xp = VTOI(tvp);
+
+ /*
+ * 1) Bump link count while we're moving stuff
+ * around. If we crash somewhere before
+ * completing our work, the link count
+ * may be wrong, but correctable.
+ */
+ ip->i_e2fs_nlink++;
+ ip->i_flag |= IN_CHANGE;
+ if ((error = ext2fs_update(fvp, NULL, NULL, UPDATE_WAIT)) != 0) {
+ VOP_UNLOCK(fvp);
+ goto bad;
+ }
+
+ /*
+ * If ".." must be changed (ie the directory gets a new
+ * parent) then the source directory must not be in the
+ * directory hierarchy above the target, as this would
+ * orphan everything below the source directory. Also
+ * the user must have write permission in the source so
+ * as to be able to change "..". We must repeat the call
+ * to namei, as the parent directory is unlocked by the
+ * call to checkpath().
+ */
+ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred);
+ VOP_UNLOCK(fvp);
+ if (oldparent != dp->i_number)
+ newparent = dp->i_number;
+ if (doingdirectory && newparent) {
+ if (error) /* write access check above */
+ goto bad;
+ if (xp != NULL)
+ vput(tvp);
+ vref(tdvp); /* compensate for the ref checkpath loses */
+ error = ext2fs_checkpath(ip, dp, tcnp->cn_cred);
+ if (error != 0) {
+ vrele(tdvp);
+ goto out;
+ }
+ vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
+ if ((error = relookup(tdvp, &tvp, tcnp, 0)) != 0) {
+ vput(tdvp);
+ goto out;
+ }
+ dp = VTOI(tdvp);
+ xp = NULL;
+ if (tvp)
+ xp = VTOI(tvp);
+ }
+ /*
+ * 2) If target doesn't exist, link the target
+ * to the source and unlink the source.
+ * Otherwise, rewrite the target directory
+ * entry to reference the source inode and
+ * expunge the original entry's existence.
+ */
+ if (xp == NULL) {
+ if (dp->i_dev != ip->i_dev)
+ panic("rename: EXDEV");
+ /*
+ * Account for ".." in new directory.
+ * When source and destination have the same
+ * parent we don't fool with the link count.
+ */
+ if (doingdirectory && newparent) {
+ if ((nlink_t)dp->i_e2fs_nlink >= LINK_MAX) {
+ error = EMLINK;
+ goto bad;
+ }
+ dp->i_e2fs_nlink++;
+ dp->i_flag |= IN_CHANGE;
+ if ((error = ext2fs_update(tdvp, NULL, NULL,
+ UPDATE_WAIT)) != 0)
+ goto bad;
+ }
+ error = ext2fs_direnter(ip, tdvp, &VTOI(tdvp)->i_crap, tcnp);
+ if (error != 0) {
+ if (doingdirectory && newparent) {
+ dp->i_e2fs_nlink--;
+ dp->i_flag |= IN_CHANGE;
+ (void)ext2fs_update(tdvp, NULL, NULL,
+ UPDATE_WAIT);
+ }
+ goto bad;
+ }
+ VN_KNOTE(tdvp, NOTE_WRITE);
+ vput(tdvp);
+ } else {
+ if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev)
+ panic("rename: EXDEV");
+ /*
+ * Short circuit rename(foo, foo).
+ */
+ if (xp->i_number == ip->i_number)
+ panic("rename: same file");
+ /*
+ * If the parent directory is "sticky", then the user must
+ * own the parent directory, or the destination of the rename,
+ * otherwise the destination may not be changed (except by
+ * root). This implements append-only directories.
+ */
+ if ((dp->i_e2fs_mode & S_ISTXT) &&
+ kauth_authorize_generic(tcnp->cn_cred,
+ KAUTH_GENERIC_ISSUSER, NULL) != 0 &&
+ kauth_cred_geteuid(tcnp->cn_cred) != dp->i_uid &&
+ xp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) {
+ error = EPERM;
+ goto bad;
+ }
+ /*
+ * Target must be empty if a directory and have no links
+ * to it. Also, ensure source and target are compatible
+ * (both directories, or both not directories).
+ */
+ if ((xp->i_e2fs_mode & IFMT) == IFDIR) {
+ if (!ext2fs_dirempty(xp, dp->i_number, tcnp->cn_cred) ||
+ xp->i_e2fs_nlink > 2) {
+ error = ENOTEMPTY;
+ goto bad;
+ }
+ if (!doingdirectory) {
+ error = ENOTDIR;
+ goto bad;
+ }
+ cache_purge(tdvp);
+ } else if (doingdirectory) {
+ error = EISDIR;
+ goto bad;
+ }
+ error = ext2fs_dirrewrite(dp, &dp->i_crap, ip, tcnp);
+ if (error != 0)
+ goto bad;
+ /*
+ * If the target directory is in the same
+ * directory as the source directory,
+ * decrement the link count on the parent
+ * of the target directory.
+ */
+ if (doingdirectory && !newparent) {
+ dp->i_e2fs_nlink--;
+ dp->i_flag |= IN_CHANGE;
+ }
+ /*
+ * Adjust the link count of the target to
+ * reflect the dirrewrite above. If this is
+ * a directory it is empty and there are
+ * no links to it, so we can squash the inode and
+ * any space associated with it. We disallowed
+ * renaming over top of a directory with links to
+ * it above, as the remaining link would point to
+ * a directory without "." or ".." entries.
+ */
+ xp->i_e2fs_nlink--;
+ if (doingdirectory) {
+ if (--xp->i_e2fs_nlink != 0)
+ panic("rename: linked directory");
+ error = ext2fs_truncate(tvp, (off_t)0, IO_SYNC,
+ tcnp->cn_cred);
+ }
+ xp->i_flag |= IN_CHANGE;
+ VN_KNOTE(tdvp, NOTE_WRITE);
+ vput(tdvp);
+ VN_KNOTE(tvp, NOTE_DELETE);
+ vput(tvp);
+ xp = NULL;
+ }
+
+ /*
+ * 3) Unlink the source.
+ */
+ fcnp->cn_flags &= ~(MODMASK);
+ fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+ vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
+ if ((error = relookup(fdvp, &fvp, fcnp, 0))) {
+ vput(fdvp);
+ vrele(ap->a_fvp);
+ return (error);
+ }
+ if (fvp != NULL) {
+ xp = VTOI(fvp);
+ dp = VTOI(fdvp);
+ } else {
+ /*
+ * From name has disappeared.
+ */
+ if (doingdirectory)
+ panic("ext2fs_rename: lost dir entry");
+ vrele(ap->a_fvp);
+ return (0);
+ }
+ /*
+ * Ensure that the directory entry still exists and has not
+ * changed while the new name has been entered. If the source is
+ * a file then the entry may have been unlinked or renamed. In
+ * either case there is no further work to be done. If the source
+ * is a directory then it cannot have been rmdir'ed; its link
+ * count of three would cause a rmdir to fail with ENOTEMPTY.
+ * The IRENAME flag ensures that it cannot be moved by another
+ * rename.
+ */
+ if (xp != ip) {
+ if (doingdirectory)
+ panic("ext2fs_rename: lost dir entry");
+ } else {
+ /*
+ * If the source is a directory with a
+ * new parent, the link count of the old
+ * parent directory must be decremented
+ * and ".." set to point to the new parent.
+ */
+ if (doingdirectory && newparent) {
+ KASSERT(dp != NULL);
+ dp->i_e2fs_nlink--;
+ dp->i_flag |= IN_CHANGE;
+ error = vn_rdwr(UIO_READ, fvp, (void *)&dirbuf,
+ sizeof (struct ext2fs_dirtemplate), (off_t)0,
+ UIO_SYSSPACE, IO_NODELOCKED,
+ tcnp->cn_cred, (size_t *)0, NULL);
+ if (error == 0) {
+ namlen = dirbuf.dotdot_namlen;
+ if (namlen != 2 ||
+ dirbuf.dotdot_name[0] != '.' ||
+ dirbuf.dotdot_name[1] != '.') {
+ ufs_dirbad(xp, (doff_t)12,
+ "ext2fs_rename: mangled dir");
+ } else {
+ dirbuf.dotdot_ino = h2fs32(newparent);
+ (void) vn_rdwr(UIO_WRITE, fvp,
+ (void *)&dirbuf,
+ sizeof (struct dirtemplate),
+ (off_t)0, UIO_SYSSPACE,
+ IO_NODELOCKED|IO_SYNC,
+ tcnp->cn_cred, (size_t *)0,
+ NULL);
+ cache_purge(fdvp);
+ }
+ }
+ }
+ error = ext2fs_dirremove(fdvp, &VTOI(fdvp)->i_crap, fcnp);
+ if (!error) {
+ xp->i_e2fs_nlink--;
+ xp->i_flag |= IN_CHANGE;
+ }
+ xp->i_flag &= ~IN_RENAME;
+ }
+ VN_KNOTE(fvp, NOTE_RENAME);
+ if (dp)
+ vput(fdvp);
+ if (xp)
+ vput(fvp);
+ vrele(ap->a_fvp);
+ return (error);
+
+bad:
+ if (xp)
+ vput(ITOV(xp));
+ vput(ITOV(dp));
+out:
+ if (doingdirectory)
+ ip->i_flag &= ~IN_RENAME;
+ if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
+ ip->i_e2fs_nlink--;
+ ip->i_flag |= IN_CHANGE;
+ vput(fvp);
+ } else
+ vrele(fvp);
+ vrele(fdvp);
+ return (error);
+}
+
+/*
+ * Mkdir system call
+ */
+int
+ext2fs_mkdir(void *v)
+{
+ struct vop_mkdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap = v;
+ struct vnode *dvp = ap->a_dvp;
+ struct vattr *vap = ap->a_vap;
+ struct componentname *cnp = ap->a_cnp;
+ struct inode *ip, *dp = VTOI(dvp);
+ struct vnode *tvp;
+ struct ext2fs_dirtemplate dirtemplate;
+ int error, dmode;
+ struct ufs_lookup_results *ulr;
+
+ /* XXX should handle this material another way */
+ ulr = &VTOI(dvp)->i_crap;
+ UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+ if ((nlink_t)dp->i_e2fs_nlink >= LINK_MAX) {
+ error = EMLINK;
+ goto out;
+ }
+ dmode = vap->va_mode & ACCESSPERMS;
+ dmode |= IFDIR;
+ /*
+ * Must simulate part of ext2fs_makeinode here to acquire the inode,
+ * but not have it entered in the parent directory. The entry is
+ * made later after writing "." and ".." entries.
+ */
+ if ((error = ext2fs_valloc(dvp, dmode, cnp->cn_cred, &tvp)) != 0)
+ goto out;
+ ip = VTOI(tvp);
+ ip->i_uid = kauth_cred_geteuid(cnp->cn_cred);
+ ip->i_e2fs_uid = ip->i_uid & 0xffff;
+ ip->i_e2fs_gid = dp->i_e2fs_gid;
+ if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) {
+ ip->i_e2fs_uid_high = (ip->i_uid >> 16) & 0xffff;
+ ip->i_e2fs_gid_high = dp->i_e2fs_gid_high;
+ } else {
+ ip->i_e2fs_uid_high = 0;
+ ip->i_e2fs_gid_high = 0;
+ }
+ ip->i_gid = ip->i_e2fs_gid | (ip->i_e2fs_gid_high << 16);
+ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+ ip->i_e2fs_mode = dmode;
+ tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */
+ ip->i_e2fs_nlink = 2;
+
+ /*
+ * Bump link count in parent directory
+ * to reflect work done below. Should
+ * be done before reference is created
+ * so reparation is possible if we crash.
+ */
+ dp->i_e2fs_nlink++;
+ dp->i_flag |= IN_CHANGE;
+ if ((error = ext2fs_update(dvp, NULL, NULL, UPDATE_DIROP)) != 0)
+ goto bad;
+
+ /* Initialize directory with "." and ".." from static template. */
+ memset(&dirtemplate, 0, sizeof(dirtemplate));
+ dirtemplate.dot_ino = h2fs32(ip->i_number);
+ dirtemplate.dot_reclen = h2fs16(12);
+ dirtemplate.dot_namlen = 1;
+ if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 &&
+ (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) {
+ dirtemplate.dot_type = EXT2_FT_DIR;
+ }
+ dirtemplate.dot_name[0] = '.';
+ dirtemplate.dotdot_ino = h2fs32(dp->i_number);
+ dirtemplate.dotdot_reclen = h2fs16(VTOI(dvp)->i_e2fs->e2fs_bsize - 12);
+ dirtemplate.dotdot_namlen = 2;
+ if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 &&
+ (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) {
+ dirtemplate.dotdot_type = EXT2_FT_DIR;
+ }
+ dirtemplate.dotdot_name[0] = dirtemplate.dotdot_name[1] = '.';
+ error = vn_rdwr(UIO_WRITE, tvp, (void *)&dirtemplate,
+ sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE,
+ IO_NODELOCKED|IO_SYNC, cnp->cn_cred, (size_t *)0, NULL);
+ if (error) {
+ dp->i_e2fs_nlink--;
+ dp->i_flag |= IN_CHANGE;
+ goto bad;
+ }
+ if (VTOI(dvp)->i_e2fs->e2fs_bsize > dvp->v_mount->mnt_stat.f_bsize)
+ panic("ext2fs_mkdir: blksize"); /* XXX should grow with balloc() */
+ else {
+ error = ext2fs_setsize(ip, VTOI(dvp)->i_e2fs->e2fs_bsize);
+ if (error) {
+ dp->i_e2fs_nlink--;
+ dp->i_flag |= IN_CHANGE;
+ goto bad;
+ }
+ ip->i_flag |= IN_CHANGE;
+ uvm_vnp_setsize(tvp, ext2fs_size(ip));
+ }
+
+ /* Directory set up; now install its entry in the parent directory. */
+ error = ext2fs_direnter(ip, dvp, ulr, cnp);
+ if (error != 0) {
+ dp->i_e2fs_nlink--;
+ dp->i_flag |= IN_CHANGE;
+ }
+bad:
+ /*
+ * No need to do an explicit ext2fs_truncate here, vrele will do this
+ * for us because we set the link count to 0.
+ */
+ if (error) {
+ ip->i_e2fs_nlink = 0;
+ ip->i_flag |= IN_CHANGE;
+ vput(tvp);
+ } else {
+ VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
+ *ap->a_vpp = tvp;
+ }
+out:
+ vput(dvp);
+ return (error);
+}
+
+/*
+ * Rmdir system call.
+ */
+int
+ext2fs_rmdir(void *v)
+{
+ struct vop_rmdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct vnode *dvp = ap->a_dvp;
+ struct componentname *cnp = ap->a_cnp;
+ struct inode *ip, *dp;
+ int error;
+ struct ufs_lookup_results *ulr;
+
+ ip = VTOI(vp);
+ dp = VTOI(dvp);
+
+ /* XXX should handle this material another way */
+ ulr = &dp->i_crap;
+ UFS_CHECK_CRAPCOUNTER(dp);
+
+ /*
+ * No rmdir "." please.
+ */
+ if (dp == ip) {
+ vrele(dvp);
+ vput(vp);
+ return (EINVAL);
+ }
+ /*
+ * Verify the directory is empty (and valid).
+ * (Rmdir ".." won't be valid since
+ * ".." will contain a reference to
+ * the current directory and thus be
+ * non-empty.)
+ */
+ error = 0;
+ if (ip->i_e2fs_nlink != 2 ||
+ !ext2fs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
+ error = ENOTEMPTY;
+ goto out;
+ }
+ if ((dp->i_e2fs_flags & EXT2_APPEND) ||
+ (ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND))) {
+ error = EPERM;
+ goto out;
+ }
+ /*
+ * Delete reference to directory before purging
+ * inode. If we crash in between, the directory
+ * will be reattached to lost+found.
+ */
+ error = ext2fs_dirremove(dvp, ulr, cnp);
+ if (error != 0)
+ goto out;
+ dp->i_e2fs_nlink--;
+ dp->i_flag |= IN_CHANGE;
+ VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
+ cache_purge(dvp);
+ vput(dvp);
+ dvp = NULL;
+ /*
+ * Truncate inode. The only stuff left
+ * in the directory is "." and "..". The
+ * "." reference is inconsequential since
+ * we're quashing it. The ".." reference
+ * has already been adjusted above. We've
+ * removed the "." reference and the reference
+ * in the parent directory, but there may be
+ * other hard links so decrement by 2 and
+ * worry about them later.
+ */
+ ip->i_e2fs_nlink -= 2;
+ error = ext2fs_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred);
+ cache_purge(ITOV(ip));
+out:
+ VN_KNOTE(vp, NOTE_DELETE);
+ if (dvp)
+ vput(dvp);
+ vput(vp);
+ return (error);
+}
+
+/*
+ * symlink -- make a symbolic link
+ */
+int
+ext2fs_symlink(void *v)
+{
+ struct vop_symlink_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ char *a_target;
+ } */ *ap = v;
+ struct vnode *vp, **vpp;
+ struct inode *ip;
+ int len, error;
+
+ vpp = ap->a_vpp;
+ error = ext2fs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
+ vpp, ap->a_cnp);
+ if (error)
+ return (error);
+ VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+ vp = *vpp;
+ len = strlen(ap->a_target);
+ ip = VTOI(vp);
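+ /*
+ * Short targets are stored directly in the inode's block pointer
+ * area as a "fast" symlink; longer ones go into a regular data
+ * block via vn_rdwr().
+ */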
+ if (len < ip->i_ump->um_maxsymlinklen) {
+ memcpy((char *)ip->i_din.e2fs_din->e2di_shortlink, ap->a_target, len);
+ error = ext2fs_setsize(ip, len);
+ if (error)
+ goto bad;
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (vp->v_mount->mnt_flag & MNT_RELATIME)
+ ip->i_flag |= IN_ACCESS;
+ uvm_vnp_setsize(vp, len);
+ } else
+ error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0,
+ UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred,
+ (size_t *)0, NULL);
+bad:
+ if (error)
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Return target name of a symbolic link
+ */
+int
+ext2fs_readlink(void *v)
+{
+ struct vop_readlink_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ struct ufsmount *ump = ip->i_ump;
+ int isize;
+
+ isize = ext2fs_size(ip);
+ if (isize < ump->um_maxsymlinklen ||
+ (ump->um_maxsymlinklen == 0 && ip->i_e2fs_nblock == 0)) {
+ uiomove((char *)ip->i_din.e2fs_din->e2di_shortlink, isize, ap->a_uio);
+ return (0);
+ }
+ return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
+}
+
+/*
+ * Advisory record locking support
+ */
+int
+ext2fs_advlock(void *v)
+{
+ struct vop_advlock_args /* {
+ struct vnode *a_vp;
+ void * a_id;
+ int a_op;
+ struct flock *a_fl;
+ int a_flags;
+ } */ *ap = v;
+ struct inode *ip = VTOI(ap->a_vp);
+
+ return lf_advlock(ap, &ip->i_lockf, ext2fs_size(ip));
+}
+
+int
+ext2fs_fsync(void *v)
+{
+ struct vop_fsync_args /* {
+ struct vnode *a_vp;
+ kauth_cred_t a_cred;
+ int a_flags;
+ off_t offlo;
+ off_t offhi;
+ struct proc *a_p;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ int wait;
+ int error;
+
+ wait = (ap->a_flags & FSYNC_WAIT) != 0;
+
+ if (vp->v_type == VBLK)
+ error = spec_fsync(v);
+ else
+ error = vflushbuf(vp, wait);
+ if (error == 0 && (ap->a_flags & FSYNC_DATAONLY) == 0)
+ error = ext2fs_update(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);
+
+ if (error == 0 && ap->a_flags & FSYNC_CACHE) {
+ int l = 0;
+ error = VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE,
+ curlwp->l_cred);
+ }
+
+ return error;
+}
+
+/*
+ * Initialize the vnode associated with a new inode, handle aliased
+ * vnodes.
+ */
+int
+ext2fs_vinit(struct mount *mntp, int (**specops)(void *),
+ int (**fifoops)(void *), struct vnode **vpp)
+{
+ struct timeval tv;
+ struct inode *ip;
+ struct vnode *vp;
+
+ vp = *vpp;
+ ip = VTOI(vp);
+ switch(vp->v_type = IFTOVT(ip->i_e2fs_mode)) {
+ case VCHR:
+ case VBLK:
+ vp->v_op = specops;
+ spec_node_init(vp, fs2h32(ip->i_din.e2fs_din->e2di_rdev));
+ break;
+ case VFIFO:
+ vp->v_op = fifoops;
+ break;
+ case VNON:
+ case VBAD:
+ case VSOCK:
+ case VLNK:
+ case VDIR:
+ case VREG:
+ break;
+ }
+ if (ip->i_number == ROOTINO)
+ vp->v_vflag |= VV_ROOT;
+ /*
+ * Initialize modrev times. The microseconds are scaled by 4294
+ * (roughly 2^32 / 10^6) so that they fill the low 32 bits of
+ * i_modrev, keeping it monotonic within a second.
+ */
+ getmicrouptime(&tv);
+ SETHIGH(ip->i_modrev, tv.tv_sec);
+ SETLOW(ip->i_modrev, tv.tv_usec * 4294);
+ *vpp = vp;
+ return (0);
+}
+
+/*
+ * Allocate a new inode.
+ */
+int
+ext2fs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
+ struct componentname *cnp)
+{
+ struct inode *ip, *pdir;
+ struct vnode *tvp;
+ int error, ismember = 0;
+ struct ufs_lookup_results *ulr;
+
+ pdir = VTOI(dvp);
+
+ /* XXX should handle this material another way */
+ ulr = &pdir->i_crap;
+ UFS_CHECK_CRAPCOUNTER(pdir);
+
+ *vpp = NULL;
+ if ((mode & IFMT) == 0)
+ mode |= IFREG;
+
+ if ((error = ext2fs_valloc(dvp, mode, cnp->cn_cred, &tvp)) != 0) {
+ vput(dvp);
+ return (error);
+ }
+ ip = VTOI(tvp);
+ ip->i_uid = kauth_cred_geteuid(cnp->cn_cred);
+ ip->i_e2fs_uid = ip->i_uid & 0xffff;
+ ip->i_e2fs_gid = pdir->i_e2fs_gid;
+ if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) {
+ ip->i_e2fs_uid_high = (ip->i_uid >> 16) & 0xffff;
+ ip->i_e2fs_gid_high = pdir->i_e2fs_gid_high;
+ } else {
+ ip->i_e2fs_uid_high = 0;
+ ip->i_e2fs_gid_high = 0;
+ }
+ ip->i_gid = ip->i_e2fs_gid | (ip->i_e2fs_gid_high << 16);
+ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+ ip->i_e2fs_mode = mode;
+ tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */
+ ip->i_e2fs_nlink = 1;
+ if ((ip->i_e2fs_mode & ISGID) && (kauth_cred_ismember_gid(cnp->cn_cred,
+ ip->i_gid, &ismember) != 0 || !ismember) &&
+ kauth_authorize_generic(cnp->cn_cred, KAUTH_GENERIC_ISSUSER, NULL))
+ ip->i_e2fs_mode &= ~ISGID;
+
+ /*
+ * Make sure inode goes to disk before directory entry.
+ */
+ if ((error = ext2fs_update(tvp, NULL, NULL, UPDATE_WAIT)) != 0)
+ goto bad;
+ error = ext2fs_direnter(ip, dvp, ulr, cnp);
+ if (error != 0)
+ goto bad;
+ vput(dvp);
+ *vpp = tvp;
+ return (0);
+
+bad:
+ /*
+ * Write error occurred trying to update the inode
+ * or the directory so must deallocate the inode.
+ */
+ tvp->v_type = VNON; /* Stop explosion if VBLK */
+ ip->i_e2fs_nlink = 0;
+ ip->i_flag |= IN_CHANGE;
+ vput(tvp);
+ vput(dvp);
+ return (error);
+}
+
+/*
+ * Reclaim an inode so that it can be used for other purposes.
+ */
+int
+ext2fs_reclaim(void *v)
+{
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ int error;
+
+ /*
+ * The inode must be freed and updated before being removed
+ * from its hash chain. Other threads trying to gain a hold
+ * on the inode will be stalled because it is locked (VI_XLOCK).
+ */
+ if (ip->i_omode == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
+ ext2fs_vfree(vp, ip->i_number, ip->i_e2fs_mode);
+ if ((error = ufs_reclaim(vp)) != 0)
+ return (error);
+ if (ip->i_din.e2fs_din != NULL)
+ pool_put(&ext2fs_dinode_pool, ip->i_din.e2fs_din);
+ genfs_node_destroy(vp);
+ pool_put(&ext2fs_inode_pool, vp->v_data);
+ vp->v_data = NULL;
+ return (0);
+}
+
+/* Global vfs data structures for ext2fs. */
+int (**ext2fs_vnodeop_p)(void *);
+const struct vnodeopv_entry_desc ext2fs_vnodeop_entries[] = {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, ext2fs_lookup }, /* lookup */
+ { &vop_create_desc, ext2fs_create }, /* create */
+ { &vop_mknod_desc, ext2fs_mknod }, /* mknod */
+ { &vop_open_desc, ext2fs_open }, /* open */
+ { &vop_close_desc, ufs_close }, /* close */
+ { &vop_access_desc, ext2fs_access }, /* access */
+ { &vop_getattr_desc, ext2fs_getattr }, /* getattr */
+ { &vop_setattr_desc, ext2fs_setattr }, /* setattr */
+ { &vop_read_desc, ext2fs_read }, /* read */
+ { &vop_write_desc, ext2fs_write }, /* write */
+ { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */
+ { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */
+ { &vop_poll_desc, ufs_poll }, /* poll */
+ { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */
+ { &vop_revoke_desc, ufs_revoke }, /* revoke */
+ { &vop_mmap_desc, ufs_mmap }, /* mmap */
+ { &vop_fsync_desc, ext2fs_fsync }, /* fsync */
+ { &vop_seek_desc, ufs_seek }, /* seek */
+ { &vop_remove_desc, ext2fs_remove }, /* remove */
+ { &vop_link_desc, ext2fs_link }, /* link */
+ { &vop_rename_desc, ext2fs_rename }, /* rename */
+ { &vop_mkdir_desc, ext2fs_mkdir }, /* mkdir */
+ { &vop_rmdir_desc, ext2fs_rmdir }, /* rmdir */
+ { &vop_symlink_desc, ext2fs_symlink }, /* symlink */
+ { &vop_readdir_desc, ext2fs_readdir }, /* readdir */
+ { &vop_readlink_desc, ext2fs_readlink }, /* readlink */
+ { &vop_abortop_desc, ufs_abortop }, /* abortop */
+ { &vop_inactive_desc, ext2fs_inactive }, /* inactive */
+ { &vop_reclaim_desc, ext2fs_reclaim }, /* reclaim */
+ { &vop_lock_desc, ufs_lock }, /* lock */
+ { &vop_unlock_desc, ufs_unlock }, /* unlock */
+ { &vop_bmap_desc, ext2fs_bmap }, /* bmap */
+ { &vop_strategy_desc, ufs_strategy }, /* strategy */
+ { &vop_print_desc, ufs_print }, /* print */
+ { &vop_islocked_desc, ufs_islocked }, /* islocked */
+ { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */
+ { &vop_advlock_desc, ext2fs_advlock }, /* advlock */
+ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */
+ { &vop_getpages_desc, genfs_getpages }, /* getpages */
+ { &vop_putpages_desc, genfs_putpages }, /* putpages */
+ { NULL, NULL }
+};
+const struct vnodeopv_desc ext2fs_vnodeop_opv_desc =
+ { &ext2fs_vnodeop_p, ext2fs_vnodeop_entries };
+
+int (**ext2fs_specop_p)(void *);
+const struct vnodeopv_entry_desc ext2fs_specop_entries[] = {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, spec_lookup }, /* lookup */
+ { &vop_create_desc, spec_create }, /* create */
+ { &vop_mknod_desc, spec_mknod }, /* mknod */
+ { &vop_open_desc, spec_open }, /* open */
+ { &vop_close_desc, ufsspec_close }, /* close */
+ { &vop_access_desc, ext2fs_access }, /* access */
+ { &vop_getattr_desc, ext2fs_getattr }, /* getattr */
+ { &vop_setattr_desc, ext2fs_setattr }, /* setattr */
+ { &vop_read_desc, ufsspec_read }, /* read */
+ { &vop_write_desc, ufsspec_write }, /* write */
+ { &vop_ioctl_desc, spec_ioctl }, /* ioctl */
+ { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */
+ { &vop_poll_desc, spec_poll }, /* poll */
+ { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */
+ { &vop_revoke_desc, spec_revoke }, /* revoke */
+ { &vop_mmap_desc, spec_mmap }, /* mmap */
+ { &vop_fsync_desc, ext2fs_fsync }, /* fsync */
+ { &vop_seek_desc, spec_seek }, /* seek */
+ { &vop_remove_desc, spec_remove }, /* remove */
+ { &vop_link_desc, spec_link }, /* link */
+ { &vop_rename_desc, spec_rename }, /* rename */
+ { &vop_mkdir_desc, spec_mkdir }, /* mkdir */
+ { &vop_rmdir_desc, spec_rmdir }, /* rmdir */
+ { &vop_symlink_desc, spec_symlink }, /* symlink */
+ { &vop_readdir_desc, spec_readdir }, /* readdir */
+ { &vop_readlink_desc, spec_readlink }, /* readlink */
+ { &vop_abortop_desc, spec_abortop }, /* abortop */
+ { &vop_inactive_desc, ext2fs_inactive }, /* inactive */
+ { &vop_reclaim_desc, ext2fs_reclaim }, /* reclaim */
+ { &vop_lock_desc, ufs_lock }, /* lock */
+ { &vop_unlock_desc, ufs_unlock }, /* unlock */
+ { &vop_bmap_desc, spec_bmap }, /* bmap */
+ { &vop_strategy_desc, spec_strategy }, /* strategy */
+ { &vop_print_desc, ufs_print }, /* print */
+ { &vop_islocked_desc, ufs_islocked }, /* islocked */
+ { &vop_pathconf_desc, spec_pathconf }, /* pathconf */
+ { &vop_advlock_desc, spec_advlock }, /* advlock */
+ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */
+ { &vop_getpages_desc, spec_getpages }, /* getpages */
+ { &vop_putpages_desc, spec_putpages }, /* putpages */
+ { NULL, NULL }
+};
+const struct vnodeopv_desc ext2fs_specop_opv_desc =
+ { &ext2fs_specop_p, ext2fs_specop_entries };
+
+int (**ext2fs_fifoop_p)(void *);
+const struct vnodeopv_entry_desc ext2fs_fifoop_entries[] = {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, vn_fifo_bypass }, /* lookup */
+ { &vop_create_desc, vn_fifo_bypass }, /* create */
+ { &vop_mknod_desc, vn_fifo_bypass }, /* mknod */
+ { &vop_open_desc, vn_fifo_bypass }, /* open */
+ { &vop_close_desc, ufsfifo_close }, /* close */
+ { &vop_access_desc, ext2fs_access }, /* access */
+ { &vop_getattr_desc, ext2fs_getattr }, /* getattr */
+ { &vop_setattr_desc, ext2fs_setattr }, /* setattr */
+ { &vop_read_desc, ufsfifo_read }, /* read */
+ { &vop_write_desc, ufsfifo_write }, /* write */
+ { &vop_ioctl_desc, vn_fifo_bypass }, /* ioctl */
+ { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */
+ { &vop_poll_desc, vn_fifo_bypass }, /* poll */
+ { &vop_kqfilter_desc, vn_fifo_bypass }, /* kqfilter */
+ { &vop_revoke_desc, vn_fifo_bypass }, /* revoke */
+ { &vop_mmap_desc, vn_fifo_bypass }, /* mmap */
+ { &vop_fsync_desc, ext2fs_fsync }, /* fsync */
+ { &vop_seek_desc, vn_fifo_bypass }, /* seek */
+ { &vop_remove_desc, vn_fifo_bypass }, /* remove */
+ { &vop_link_desc, vn_fifo_bypass }, /* link */
+ { &vop_rename_desc, vn_fifo_bypass }, /* rename */
+ { &vop_mkdir_desc, vn_fifo_bypass }, /* mkdir */
+ { &vop_rmdir_desc, vn_fifo_bypass }, /* rmdir */
+ { &vop_symlink_desc, vn_fifo_bypass }, /* symlink */
+ { &vop_readdir_desc, vn_fifo_bypass }, /* readdir */
+ { &vop_readlink_desc, vn_fifo_bypass }, /* readlink */
+ { &vop_abortop_desc, vn_fifo_bypass }, /* abortop */
+ { &vop_inactive_desc, ext2fs_inactive }, /* inactive */
+ { &vop_reclaim_desc, ext2fs_reclaim }, /* reclaim */
+ { &vop_lock_desc, ufs_lock }, /* lock */
+ { &vop_unlock_desc, ufs_unlock }, /* unlock */
+ { &vop_bmap_desc, vn_fifo_bypass }, /* bmap */
+ { &vop_strategy_desc, vn_fifo_bypass }, /* strategy */
+ { &vop_print_desc, ufs_print }, /* print */
+ { &vop_islocked_desc, ufs_islocked }, /* islocked */
+ { &vop_pathconf_desc, vn_fifo_bypass }, /* pathconf */
+ { &vop_advlock_desc, vn_fifo_bypass }, /* advlock */
+ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */
+ { &vop_putpages_desc, vn_fifo_bypass }, /* putpages */
+ { NULL, NULL }
+};
+const struct vnodeopv_desc ext2fs_fifoop_opv_desc =
+ { &ext2fs_fifoop_p, ext2fs_fifoop_entries };
--- /dev/null
+# $NetBSD: Makefile,v 1.1 1998/06/12 23:23:11 cgd Exp $
+
+INCSDIR= /usr/include/ufs/ffs
+
+INCS= ffs_extern.h fs.h
+
+.include <bsd.kinc.mk>
--- /dev/null
+/* $NetBSD: ffs_alloc.c,v 1.130 2011/11/28 08:05:07 tls Exp $ */
+
+/*-
+ * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2002 Networks Associates Technology, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Marshall
+ * Kirk McKusick and Network Associates Laboratories, the Security
+ * Research Division of Network Associates, Inc. under DARPA/SPAWAR
+ * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
+ * research program
+ *
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.130 2011/11/28 08:05:07 tls Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#include "opt_uvm_page_trkown.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/cprng.h>
+#include <sys/fstrans.h>
+#include <sys/kauth.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/syslog.h>
+#include <sys/vnode.h>
+#include <sys/wapbl.h>
+
+#include <miscfs/specfs/specdev.h>
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#ifdef UVM_PAGE_TRKOWN
+#include <uvm/uvm.h>
+#endif
+
+static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int, int);
+static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int);
+static ino_t ffs_dirpref(struct inode *);
+static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int);
+static void ffs_fserr(struct fs *, u_int, const char *);
+static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int,
+ daddr_t (*)(struct inode *, int, daddr_t, int, int));
+static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int);
+static int32_t ffs_mapsearch(struct fs *, struct cg *,
+ daddr_t, int);
+static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *,
+ daddr_t, long, bool);
+static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t,
+ int, bool);
+
+/* if 1, changes in optimization strategy are logged */
+int ffs_log_changeopt = 0;
+
+/* in ffs_tables.c */
+extern const int inside[], around[];
+extern const u_char * const fragtbl[];
+
+/* Basic consistency check for block allocations */
+static int
+ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno,
+ long size, dev_t dev, ino_t inum)
+{
+ if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
+ fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
+ printf("dev = 0x%llx, bno = %" PRId64 " bsize = %d, "
+ "size = %ld, fs = %s\n",
+ (long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
+ panic("%s: bad size", func);
+ }
+
+ if (bno >= fs->fs_size) {
+ printf("bad block %" PRId64 ", ino %llu\n", bno,
+ (unsigned long long)inum);
+ ffs_fserr(fs, inum, "bad block");
+ return EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Allocate a block in the file system.
+ *
+ * The size of the requested block is given, which must be some
+ * multiple of fs_fsize and <= fs_bsize.
+ * A preference may be optionally specified. If a preference is given
+ * the following hierarchy is used to allocate a block:
+ * 1) allocate the requested block.
+ * 2) allocate a rotationally optimal block in the same cylinder.
+ * 3) allocate a block in the same cylinder group.
+ * 4) quadratically rehash into other cylinder groups, until an
+ * available block is located.
+ * If no block preference is given the following hierarchy is used
+ * to allocate a block:
+ * 1) allocate a block in the cylinder group that contains the
+ * inode for the file.
+ * 2) quadratically rehash into other cylinder groups, until an
+ * available block is located.
+ *
+ * => called with um_lock held
+ * => releases um_lock before returning
+ */
+int
+ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, int flags,
+ kauth_cred_t cred, daddr_t *bnp)
+{
+ struct ufsmount *ump;
+ struct fs *fs;
+ daddr_t bno;
+ int cg;
+#if defined(QUOTA) || defined(QUOTA2)
+ int error;
+#endif
+
+ fs = ip->i_fs;
+ ump = ip->i_ump;
+
+ KASSERT(mutex_owned(&ump->um_lock));
+
+#ifdef UVM_PAGE_TRKOWN
+
+ /*
+ * Sanity-check that allocations within the file size
+ * do not allow other threads to read the stale contents
+ * of newly allocated blocks.
+ * Usually pages will exist to cover the new allocation.
+ * There is an optimization in ffs_write() where we skip
+ * creating pages if several conditions are met:
+ * - the file must not be mapped (in any user address space).
+ * - the write must cover whole pages and whole blocks.
+ * If those conditions are not met then pages must exist and
+ * be locked by the current thread.
+ */
+
+ if (ITOV(ip)->v_type == VREG &&
+ lblktosize(fs, (voff_t)lbn) < round_page(ITOV(ip)->v_size)) {
+ struct vm_page *pg;
+ struct vnode *vp = ITOV(ip);
+ struct uvm_object *uobj = &vp->v_uobj;
+ voff_t off = trunc_page(lblktosize(fs, lbn));
+ voff_t endoff = round_page(lblktosize(fs, lbn) + size);
+
+ mutex_enter(uobj->vmobjlock);
+ while (off < endoff) {
+ pg = uvm_pagelookup(uobj, off);
+ KASSERT((pg == NULL && (vp->v_vflag & VV_MAPPED) == 0 &&
+ (size & PAGE_MASK) == 0 &&
+ blkoff(fs, size) == 0) ||
+ (pg != NULL && pg->owner == curproc->p_pid &&
+ pg->lowner == curlwp->l_lid));
+ off += PAGE_SIZE;
+ }
+ mutex_exit(uobj->vmobjlock);
+ }
+#endif
+
+ *bnp = 0;
+#ifdef DIAGNOSTIC
+ if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
+ printf("dev = 0x%llx, bsize = %d, size = %d, fs = %s\n",
+ (unsigned long long)ip->i_dev, fs->fs_bsize, size,
+ fs->fs_fsmnt);
+ panic("ffs_alloc: bad size");
+ }
+ if (cred == NOCRED)
+ panic("ffs_alloc: missing credential");
+#endif /* DIAGNOSTIC */
+ if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
+ goto nospace;
+ if (freespace(fs, fs->fs_minfree) <= 0 &&
+ kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
+ NULL, NULL) != 0)
+ goto nospace;
+#if defined(QUOTA) || defined(QUOTA2)
+ mutex_exit(&ump->um_lock);
+ if ((error = chkdq(ip, btodb(size), cred, 0)) != 0)
+ return (error);
+ mutex_enter(&ump->um_lock);
+#endif
+
+ if (bpref >= fs->fs_size)
+ bpref = 0;
+ if (bpref == 0)
+ cg = ino_to_cg(fs, ip->i_number);
+ else
+ cg = dtog(fs, bpref);
+ bno = ffs_hashalloc(ip, cg, bpref, size, flags, ffs_alloccg);
+ if (bno > 0) {
+ DIP_ADD(ip, blocks, btodb(size));
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ *bnp = bno;
+ return (0);
+ }
+#if defined(QUOTA) || defined(QUOTA2)
+ /*
+ * Restore user's disk quota because allocation failed.
+ */
+ (void) chkdq(ip, -btodb(size), cred, FORCE);
+#endif
+ if (flags & B_CONTIG) {
+ /*
+ * XXX ump->um_lock handling is "suspect" at best.
+ * For the case where ffs_hashalloc() fails early
+ * in the B_CONTIG case we reach here with um_lock
+ * already unlocked, so we can't release it again
+ * like in the normal error path. See kern/39206.
+ *
+ * Fail silently - it's up to our caller to report
+ * errors.
+ */
+ return (ENOSPC);
+ }
+nospace:
+ mutex_exit(&ump->um_lock);
+ ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full");
+ uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
+ return (ENOSPC);
+}
+
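The hierarchy described above boils down to picking a starting cylinder group before handing the search off to ffs_hashalloc(). A minimal user-space sketch of that choice, assuming simplified stand-ins for ino_to_cg()/dtog() and made-up per-group sizes (this is illustrative code, not the kernel API):

#include <stdint.h>
#include <stdio.h>

/* Illustrative analogues of ino_to_cg() and dtog(); ipg/fpg are inodes and
 * fragments per cylinder group (made-up parameters, not the superblock). */
static int ino_to_cg_sim(uint32_t ino, uint32_t ipg) { return (int)(ino / ipg); }
static int dtog_sim(int64_t bno, uint32_t fpg) { return (int)(bno / fpg); }

/* With no usable preference, start in the group holding the file's inode;
 * otherwise start in the group containing the preferred block. */
static int
starting_cg(int64_t bpref, int64_t fs_size, uint32_t ino, uint32_t ipg,
    uint32_t fpg)
{
	if (bpref >= fs_size)	/* an out-of-range preference is ignored */
		bpref = 0;
	if (bpref == 0)
		return ino_to_cg_sim(ino, ipg);
	return dtog_sim(bpref, fpg);
}

int
main(void)
{
	printf("%d\n", starting_cg(0, 1048576, 5000, 2048, 8192));	/* -> 2 */
	printf("%d\n", starting_cg(70000, 1048576, 5000, 2048, 8192));	/* -> 8 */
	return 0;
}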
+/*
+ * Reallocate a fragment to a bigger size
+ *
+ * The number and size of the old block is given, and a preference
+ * and new size is also specified. The allocator attempts to extend
+ * the original block. Failing that, the regular block allocator is
+ * invoked to get an appropriate block.
+ *
+ * => called with um_lock held
+ * => return with um_lock released
+ */
+int
+ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bpref, int osize,
+ int nsize, kauth_cred_t cred, struct buf **bpp, daddr_t *blknop)
+{
+ struct ufsmount *ump;
+ struct fs *fs;
+ struct buf *bp;
+ int cg, request, error;
+ daddr_t bprev, bno;
+
+ fs = ip->i_fs;
+ ump = ip->i_ump;
+
+ KASSERT(mutex_owned(&ump->um_lock));
+
+#ifdef UVM_PAGE_TRKOWN
+
+ /*
+ * Sanity-check that allocations within the file size
+ * do not allow other threads to read the stale contents
+ * of newly allocated blocks.
+ * Unlike in ffs_alloc(), here pages must always exist
+ * for such allocations, because only the last block of a file
+ * can be a fragment and ffs_write() will reallocate the
+ * fragment to the new size using ufs_balloc_range(),
+ * which always creates pages to cover blocks it allocates.
+ */
+
+ if (ITOV(ip)->v_type == VREG) {
+ struct vm_page *pg;
+ struct uvm_object *uobj = &ITOV(ip)->v_uobj;
+ voff_t off = trunc_page(lblktosize(fs, lbprev));
+ voff_t endoff = round_page(lblktosize(fs, lbprev) + osize);
+
+ mutex_enter(uobj->vmobjlock);
+ while (off < endoff) {
+ pg = uvm_pagelookup(uobj, off);
+ KASSERT(pg->owner == curproc->p_pid &&
+ pg->lowner == curlwp->l_lid);
+ off += PAGE_SIZE;
+ }
+ mutex_exit(uobj->vmobjlock);
+ }
+#endif
+
+#ifdef DIAGNOSTIC
+ if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
+ (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
+ printf(
+ "dev = 0x%llx, bsize = %d, osize = %d, nsize = %d, fs = %s\n",
+ (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
+ fs->fs_fsmnt);
+ panic("ffs_realloccg: bad size");
+ }
+ if (cred == NOCRED)
+ panic("ffs_realloccg: missing credential");
+#endif /* DIAGNOSTIC */
+ if (freespace(fs, fs->fs_minfree) <= 0 &&
+ kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
+ NULL, NULL) != 0) {
+ mutex_exit(&ump->um_lock);
+ goto nospace;
+ }
+ if (fs->fs_magic == FS_UFS2_MAGIC)
+ bprev = ufs_rw64(ip->i_ffs2_db[lbprev], UFS_FSNEEDSWAP(fs));
+ else
+ bprev = ufs_rw32(ip->i_ffs1_db[lbprev], UFS_FSNEEDSWAP(fs));
+
+ if (bprev == 0) {
+ printf("dev = 0x%llx, bsize = %d, bprev = %" PRId64 ", fs = %s\n",
+ (unsigned long long)ip->i_dev, fs->fs_bsize, bprev,
+ fs->fs_fsmnt);
+ panic("ffs_realloccg: bad bprev");
+ }
+ mutex_exit(&ump->um_lock);
+
+ /*
+ * Allocate the extra space in the buffer.
+ */
+ if (bpp != NULL &&
+ (error = bread(ITOV(ip), lbprev, osize, NOCRED, 0, &bp)) != 0) {
+ brelse(bp, 0);
+ return (error);
+ }
+#if defined(QUOTA) || defined(QUOTA2)
+ if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) {
+ if (bpp != NULL) {
+ brelse(bp, 0);
+ }
+ return (error);
+ }
+#endif
+ /*
+ * Check for extension in the existing location.
+ */
+ cg = dtog(fs, bprev);
+ mutex_enter(&ump->um_lock);
+ if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) {
+ DIP_ADD(ip, blocks, btodb(nsize - osize));
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+
+ if (bpp != NULL) {
+ if (bp->b_blkno != fsbtodb(fs, bno))
+ panic("bad blockno");
+ allocbuf(bp, nsize, 1);
+ memset((char *)bp->b_data + osize, 0, nsize - osize);
+ mutex_enter(bp->b_objlock);
+ KASSERT(!cv_has_waiters(&bp->b_done));
+ bp->b_oflags |= BO_DONE;
+ mutex_exit(bp->b_objlock);
+ *bpp = bp;
+ }
+ if (blknop != NULL) {
+ *blknop = bno;
+ }
+ return (0);
+ }
+ /*
+ * Allocate a new disk location.
+ */
+ if (bpref >= fs->fs_size)
+ bpref = 0;
+ switch ((int)fs->fs_optim) {
+ case FS_OPTSPACE:
+ /*
+ * Allocate an exact sized fragment. Although this makes
+ * best use of space, we will waste time relocating it if
+ * the file continues to grow. If the fragmentation is
+ * less than half of the minimum free reserve, we choose
+ * to begin optimizing for time.
+ */
+ request = nsize;
+ if (fs->fs_minfree < 5 ||
+ fs->fs_cstotal.cs_nffree >
+ fs->fs_dsize * fs->fs_minfree / (2 * 100))
+ break;
+
+ if (ffs_log_changeopt) {
+ log(LOG_NOTICE,
+ "%s: optimization changed from SPACE to TIME\n",
+ fs->fs_fsmnt);
+ }
+
+ fs->fs_optim = FS_OPTTIME;
+ break;
+ case FS_OPTTIME:
+ /*
+ * At this point we have discovered a file that is trying to
+ * grow a small fragment to a larger fragment. To save time,
+ * we allocate a full sized block, then free the unused portion.
+ * If the file continues to grow, the `ffs_fragextend' call
+ * above will be able to grow it in place without further
+ * copying. If aberrant programs cause disk fragmentation to
+ * grow within 2% of the free reserve, we choose to begin
+ * optimizing for space.
+ */
+ request = fs->fs_bsize;
+ if (fs->fs_cstotal.cs_nffree <
+ fs->fs_dsize * (fs->fs_minfree - 2) / 100)
+ break;
+
+ if (ffs_log_changeopt) {
+ log(LOG_NOTICE,
+ "%s: optimization changed from TIME to SPACE\n",
+ fs->fs_fsmnt);
+ }
+
+ fs->fs_optim = FS_OPTSPACE;
+ break;
+ default:
+ printf("dev = 0x%llx, optim = %d, fs = %s\n",
+ (unsigned long long)ip->i_dev, fs->fs_optim, fs->fs_fsmnt);
+ panic("ffs_realloccg: bad optim");
+ /* NOTREACHED */
+ }
+ bno = ffs_hashalloc(ip, cg, bpref, request, 0, ffs_alloccg);
+ if (bno > 0) {
+ if ((ip->i_ump->um_mountp->mnt_wapbl) &&
+ (ITOV(ip)->v_type != VREG)) {
+ UFS_WAPBL_REGISTER_DEALLOCATION(
+ ip->i_ump->um_mountp, fsbtodb(fs, bprev),
+ osize);
+ } else {
+ ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
+ ip->i_number);
+ }
+ if (nsize < request) {
+ if ((ip->i_ump->um_mountp->mnt_wapbl) &&
+ (ITOV(ip)->v_type != VREG)) {
+ UFS_WAPBL_REGISTER_DEALLOCATION(
+ ip->i_ump->um_mountp,
+ fsbtodb(fs, (bno + numfrags(fs, nsize))),
+ request - nsize);
+ } else
+ ffs_blkfree(fs, ip->i_devvp,
+ bno + numfrags(fs, nsize),
+ (long)(request - nsize), ip->i_number);
+ }
+ DIP_ADD(ip, blocks, btodb(nsize - osize));
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (bpp != NULL) {
+ bp->b_blkno = fsbtodb(fs, bno);
+ allocbuf(bp, nsize, 1);
+ memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
+ mutex_enter(bp->b_objlock);
+ KASSERT(!cv_has_waiters(&bp->b_done));
+ bp->b_oflags |= BO_DONE;
+ mutex_exit(bp->b_objlock);
+ *bpp = bp;
+ }
+ if (blknop != NULL) {
+ *blknop = bno;
+ }
+ return (0);
+ }
+ mutex_exit(&ump->um_lock);
+
+#if defined(QUOTA) || defined(QUOTA2)
+ /*
+ * Restore user's disk quota because allocation failed.
+ */
+ (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
+#endif
+ if (bpp != NULL) {
+ brelse(bp, 0);
+ }
+
+nospace:
+ /*
+ * no space available
+ */
+ ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full");
+ uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
+ return (ENOSPC);
+}
+
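The FS_OPTSPACE/FS_OPTTIME switch in ffs_realloccg() above is a simple hysteresis on the amount of fragmented free space. A sketch of that decision in isolation, assuming flattened names (nffree, dsize, minfree) instead of the real superblock fields:

enum optim { OPTSPACE, OPTTIME };

/* cur is the current policy, nffree the fragmented free space, dsize the
 * data size, minfree the reserve percentage (all illustrative parameters). */
static enum optim
next_optim(enum optim cur, long nffree, long dsize, int minfree)
{
	if (cur == OPTSPACE) {
		/* fragmentation below half the minfree reserve: optimize for time */
		if (minfree >= 5 && nffree <= dsize * minfree / (2 * 100))
			return OPTTIME;
	} else {
		/* fragmentation within 2% of the reserve: optimize for space again */
		if (nffree >= dsize * (minfree - 2) / 100)
			return OPTSPACE;
	}
	return cur;
}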
+/*
+ * Allocate an inode in the file system.
+ *
+ * If allocating a directory, use ffs_dirpref to select the inode.
+ * If allocating in a directory, the following hierarchy is followed:
+ * 1) allocate the preferred inode.
+ * 2) allocate an inode in the same cylinder group.
+ * 3) quadratically rehash into other cylinder groups, until an
+ * available inode is located.
+ * If no inode preference is given the following hierarchy is used
+ * to allocate an inode:
+ * 1) allocate an inode in cylinder group 0.
+ * 2) quadratically rehash into other cylinder groups, until an
+ * available inode is located.
+ *
+ * => um_lock not held upon entry or return
+ */
+int
+ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred,
+ struct vnode **vpp)
+{
+ struct ufsmount *ump;
+ struct inode *pip;
+ struct fs *fs;
+ struct inode *ip;
+ struct timespec ts;
+ ino_t ino, ipref;
+ int cg, error;
+
+ UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount);
+
+ *vpp = NULL;
+ pip = VTOI(pvp);
+ fs = pip->i_fs;
+ ump = pip->i_ump;
+
+ error = UFS_WAPBL_BEGIN(pvp->v_mount);
+ if (error) {
+ return error;
+ }
+ mutex_enter(&ump->um_lock);
+ if (fs->fs_cstotal.cs_nifree == 0)
+ goto noinodes;
+
+ if ((mode & IFMT) == IFDIR)
+ ipref = ffs_dirpref(pip);
+ else
+ ipref = pip->i_number;
+ if (ipref >= fs->fs_ncg * fs->fs_ipg)
+ ipref = 0;
+ cg = ino_to_cg(fs, ipref);
+ /*
+ * Track the number of dirs created one after another
+ * in the same cg without intervening files.
+ */
+ if ((mode & IFMT) == IFDIR) {
+ if (fs->fs_contigdirs[cg] < 255)
+ fs->fs_contigdirs[cg]++;
+ } else {
+ if (fs->fs_contigdirs[cg] > 0)
+ fs->fs_contigdirs[cg]--;
+ }
+ ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, ffs_nodealloccg);
+ if (ino == 0)
+ goto noinodes;
+ UFS_WAPBL_END(pvp->v_mount);
+ error = VFS_VGET(pvp->v_mount, ino, vpp);
+ if (error) {
+ int err;
+ err = UFS_WAPBL_BEGIN(pvp->v_mount);
+ if (err == 0)
+ ffs_vfree(pvp, ino, mode);
+ if (err == 0)
+ UFS_WAPBL_END(pvp->v_mount);
+ return (error);
+ }
+ KASSERT((*vpp)->v_type == VNON);
+ ip = VTOI(*vpp);
+ if (ip->i_mode) {
+#if 0
+ printf("mode = 0%o, inum = %d, fs = %s\n",
+ ip->i_mode, ip->i_number, fs->fs_fsmnt);
+#else
+ printf("dmode %x mode %x dgen %x gen %x\n",
+ DIP(ip, mode), ip->i_mode,
+ DIP(ip, gen), ip->i_gen);
+ printf("size %llx blocks %llx\n",
+ (long long)DIP(ip, size), (long long)DIP(ip, blocks));
+ printf("ino %llu ipref %llu\n", (unsigned long long)ino,
+ (unsigned long long)ipref);
+#if 0
+ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
+ (int)fs->fs_bsize, NOCRED, 0, &bp);
+#endif
+
+#endif
+ panic("ffs_valloc: dup alloc");
+ }
+ if (DIP(ip, blocks)) { /* XXX */
+ printf("free inode %s/%llu had %" PRId64 " blocks\n",
+ fs->fs_fsmnt, (unsigned long long)ino, DIP(ip, blocks));
+ DIP_ASSIGN(ip, blocks, 0);
+ }
+ ip->i_flag &= ~IN_SPACECOUNTED;
+ ip->i_flags = 0;
+ DIP_ASSIGN(ip, flags, 0);
+ /*
+ * Set up a new generation number for this inode.
+ */
+ ip->i_gen++;
+ DIP_ASSIGN(ip, gen, ip->i_gen);
+ if (fs->fs_magic == FS_UFS2_MAGIC) {
+ vfs_timestamp(&ts);
+ ip->i_ffs2_birthtime = ts.tv_sec;
+ ip->i_ffs2_birthnsec = ts.tv_nsec;
+ }
+ return (0);
+noinodes:
+ mutex_exit(&ump->um_lock);
+ UFS_WAPBL_END(pvp->v_mount);
+ ffs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes");
+ uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
+ return (ENOSPC);
+}
+
+/*
+ * Find a cylinder group in which to place a directory.
+ *
+ * The policy implemented by this algorithm is to allocate a
+ * directory inode in the same cylinder group as its parent
+ * directory, but also to reserve space for its files' inodes
+ * and data. Restrict the number of directories which may be
+ * allocated one after another in the same cylinder group
+ * without intervening allocation of files.
+ *
+ * If we allocate a first level directory then force allocation
+ * in another cylinder group.
+ */
+static ino_t
+ffs_dirpref(struct inode *pip)
+{
+ register struct fs *fs;
+ int cg, prefcg;
+ int64_t dirsize, cgsize, curdsz;
+ int avgifree, avgbfree, avgndir;
+ int minifree, minbfree, maxndir;
+ int mincg, minndir;
+ int maxcontigdirs;
+
+ KASSERT(mutex_owned(&pip->i_ump->um_lock));
+
+ fs = pip->i_fs;
+
+ avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
+ avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
+ avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
+
+ /*
+ * Force allocation in another cg if creating a first level dir.
+ */
+ if (ITOV(pip)->v_vflag & VV_ROOT) {
+ prefcg = random() % fs->fs_ncg;
+ mincg = prefcg;
+ minndir = fs->fs_ipg;
+ for (cg = prefcg; cg < fs->fs_ncg; cg++)
+ if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
+ fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
+ fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+ mincg = cg;
+ minndir = fs->fs_cs(fs, cg).cs_ndir;
+ }
+ for (cg = 0; cg < prefcg; cg++)
+ if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
+ fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
+ fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+ mincg = cg;
+ minndir = fs->fs_cs(fs, cg).cs_ndir;
+ }
+ return ((ino_t)(fs->fs_ipg * mincg));
+ }
+
+ /*
+ * Count various limits which are used for
+ * optimal allocation of a directory inode.
+ */
+ maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
+ minifree = avgifree - fs->fs_ipg / 4;
+ if (minifree < 0)
+ minifree = 0;
+ minbfree = avgbfree - fragstoblks(fs, fs->fs_fpg) / 4;
+ if (minbfree < 0)
+ minbfree = 0;
+ cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg;
+ dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir;
+ if (avgndir != 0) {
+ curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir;
+ if (dirsize < curdsz)
+ dirsize = curdsz;
+ }
+ if (cgsize < dirsize * 255)
+ maxcontigdirs = cgsize / dirsize;
+ else
+ maxcontigdirs = 255;
+ if (fs->fs_avgfpdir > 0)
+ maxcontigdirs = min(maxcontigdirs,
+ fs->fs_ipg / fs->fs_avgfpdir);
+ if (maxcontigdirs == 0)
+ maxcontigdirs = 1;
+
+ /*
+ * Limit number of dirs in one cg and reserve space for
+ * regular files, but only if we have no deficit in
+ * inodes or space.
+ */
+ prefcg = ino_to_cg(fs, pip->i_number);
+ for (cg = prefcg; cg < fs->fs_ncg; cg++)
+ if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
+ fs->fs_cs(fs, cg).cs_nifree >= minifree &&
+ fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
+ if (fs->fs_contigdirs[cg] < maxcontigdirs)
+ return ((ino_t)(fs->fs_ipg * cg));
+ }
+ for (cg = 0; cg < prefcg; cg++)
+ if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
+ fs->fs_cs(fs, cg).cs_nifree >= minifree &&
+ fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
+ if (fs->fs_contigdirs[cg] < maxcontigdirs)
+ return ((ino_t)(fs->fs_ipg * cg));
+ }
+ /*
+ * This is a backstop when we are deficient in space.
+ */
+ for (cg = prefcg; cg < fs->fs_ncg; cg++)
+ if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
+ return ((ino_t)(fs->fs_ipg * cg));
+ for (cg = 0; cg < prefcg; cg++)
+ if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
+ break;
+ return ((ino_t)(fs->fs_ipg * cg));
+}
+
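Stripped of the per-filesystem tuning, the placement loop in ffs_dirpref() amounts to a wrap-around scan for the first acceptable cylinder group. A simplified sketch, assuming a flattened cg_stats array instead of the real csum structures and ignoring the contigdirs throttle:

struct cg_stats {
	int ndir;	/* directories currently in this cylinder group */
	int nifree;	/* free inodes */
	int nbfree;	/* free full blocks */
};

/* Pick the first group at or after prefcg (wrapping) that is under the
 * directory ceiling and still has the minimum free inodes and blocks. */
static int
dirpref_pick(const struct cg_stats *cs, int ncg, int prefcg,
    int maxndir, int minifree, int minbfree)
{
	int cg, i;

	for (i = 0; i < ncg; i++) {
		cg = (prefcg + i) % ncg;
		if (cs[cg].ndir < maxndir &&
		    cs[cg].nifree >= minifree &&
		    cs[cg].nbfree >= minbfree)
			return cg;
	}
	return prefcg;	/* backstop: fall back to the parent's group */
}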
+/*
+ * Select the desired position for the next block in a file. The file is
+ * logically divided into sections. The first section is composed of the
+ * direct blocks. Each additional section contains fs_maxbpg blocks.
+ *
+ * If no blocks have been allocated in the first section, the policy is to
+ * request a block in the same cylinder group as the inode that describes
+ * the file. If no blocks have been allocated in any other section, the
+ * policy is to place the section in a cylinder group with a greater than
+ * average number of free blocks. An appropriate cylinder group is found
+ * by using a rotor that sweeps the cylinder groups. When a new group of
+ * blocks is needed, the sweep begins in the cylinder group following the
+ * cylinder group from which the previous allocation was made. The sweep
+ * continues until a cylinder group with greater than the average number
+ * of free blocks is found. If the allocation is for the first block in an
+ * indirect block, the information on the previous allocation is unavailable;
+ * here a best guess is made based upon the logical block number being
+ * allocated.
+ *
+ * If a section is already partially allocated, the policy is to
+ * contiguously allocate fs_maxcontig blocks. The end of one of these
+ * contiguous blocks and the beginning of the next is laid out
+ * contiguously if possible.
+ *
+ * => um_lock held on entry and exit
+ */
+daddr_t
+ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags,
+ int32_t *bap /* XXX ondisk32 */)
+{
+ struct fs *fs;
+ int cg;
+ int avgbfree, startcg;
+
+ KASSERT(mutex_owned(&ip->i_ump->um_lock));
+
+ fs = ip->i_fs;
+
+ /*
+ * If allocating a contiguous file with B_CONTIG, use the hints
+ * in the inode extensions to return the desired block.
+ *
+ * For metadata (indirect blocks) return the address of where
+ * the first indirect block resides - we'll scan for the next
+ * available slot if we need to allocate more than one indirect
+ * block. For data, return the address of the actual block
+ * relative to the address of the first data block.
+ */
+ if (flags & B_CONTIG) {
+ KASSERT(ip->i_ffs_first_data_blk != 0);
+ KASSERT(ip->i_ffs_first_indir_blk != 0);
+ if (flags & B_METAONLY)
+ return ip->i_ffs_first_indir_blk;
+ else
+ return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn);
+ }
+
+ if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
+ if (lbn < NDADDR + NINDIR(fs)) {
+ cg = ino_to_cg(fs, ip->i_number);
+ return (cgbase(fs, cg) + fs->fs_frag);
+ }
+ /*
+ * Find a cylinder group with a greater than average number of
+ * unused data blocks.
+ */
+ if (indx == 0 || bap[indx - 1] == 0)
+ startcg =
+ ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
+ else
+ startcg = dtog(fs,
+ ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
+ startcg %= fs->fs_ncg;
+ avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
+ for (cg = startcg; cg < fs->fs_ncg; cg++)
+ if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+ return (cgbase(fs, cg) + fs->fs_frag);
+ }
+ for (cg = 0; cg < startcg; cg++)
+ if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+ return (cgbase(fs, cg) + fs->fs_frag);
+ }
+ return (0);
+ }
+ /*
+ * We just always try to lay things out contiguously.
+ */
+ return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
+}
+
+daddr_t
+ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags,
+ int64_t *bap)
+{
+ struct fs *fs;
+ int cg;
+ int avgbfree, startcg;
+
+ KASSERT(mutex_owned(&ip->i_ump->um_lock));
+
+ fs = ip->i_fs;
+
+ /*
+ * If allocating a contiguous file with B_CONTIG, use the hints
+ * in the inode extensions to return the desired block.
+ *
+ * For metadata (indirect blocks) return the address of where
+ * the first indirect block resides - we'll scan for the next
+ * available slot if we need to allocate more than one indirect
+ * block. For data, return the address of the actual block
+ * relative to the address of the first data block.
+ */
+ if (flags & B_CONTIG) {
+ KASSERT(ip->i_ffs_first_data_blk != 0);
+ KASSERT(ip->i_ffs_first_indir_blk != 0);
+ if (flags & B_METAONLY)
+ return ip->i_ffs_first_indir_blk;
+ else
+ return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn);
+ }
+
+ if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
+ if (lbn < NDADDR + NINDIR(fs)) {
+ cg = ino_to_cg(fs, ip->i_number);
+ return (cgbase(fs, cg) + fs->fs_frag);
+ }
+ /*
+ * Find a cylinder group with a greater than average number of
+ * unused data blocks.
+ */
+ if (indx == 0 || bap[indx - 1] == 0)
+ startcg =
+ ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
+ else
+ startcg = dtog(fs,
+ ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
+ startcg %= fs->fs_ncg;
+ avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
+ for (cg = startcg; cg < fs->fs_ncg; cg++)
+ if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+ return (cgbase(fs, cg) + fs->fs_frag);
+ }
+ for (cg = 0; cg < startcg; cg++)
+ if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+ return (cgbase(fs, cg) + fs->fs_frag);
+ }
+ return (0);
+ }
+ /*
+ * We just always try to lay things out contiguously.
+ */
+ return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
+}
+
+
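The rotor sweep in both ffs_blkpref variants is the same search: starting at a computed cylinder group, find the first group with at least the filesystem-wide average of free blocks. A user-space sketch, assuming a plain per-group array rather than the fs_cs() accessor:

/* Returns the first group at or after startcg (wrapping) whose free-block
 * count is >= the average, or -1 if none qualifies (the kernel would return
 * 0, meaning "no preference"). */
static int
sweep_for_average(const int *nbfree_per_cg, int ncg, int startcg)
{
	long total = 0;
	int avgbfree, cg, i;

	for (cg = 0; cg < ncg; cg++)
		total += nbfree_per_cg[cg];
	avgbfree = (int)(total / ncg);

	for (i = 0; i < ncg; i++) {
		cg = (startcg + i) % ncg;
		if (nbfree_per_cg[cg] >= avgbfree)
			return cg;
	}
	return -1;
}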
+/*
+ * Implement the cylinder overflow algorithm.
+ *
+ * The policy implemented by this algorithm is:
+ * 1) allocate the block in its requested cylinder group.
+ * 2) quadratically rehash on the cylinder group number.
+ * 3) brute force search for a free block.
+ *
+ * => called with um_lock held
+ * => returns with um_lock released on success, held on failure
+ * (*allocator releases lock on success, retains lock on failure)
+ */
+/*VARARGS5*/
+static daddr_t
+ffs_hashalloc(struct inode *ip, int cg, daddr_t pref,
+ int size /* size for data blocks, mode for inodes */,
+ int flags, daddr_t (*allocator)(struct inode *, int, daddr_t, int, int))
+{
+ struct fs *fs;
+ daddr_t result;
+ int i, icg = cg;
+
+ fs = ip->i_fs;
+ /*
+ * 1: preferred cylinder group
+ */
+ result = (*allocator)(ip, cg, pref, size, flags);
+ if (result)
+ return (result);
+
+ if (flags & B_CONTIG)
+ return (result);
+ /*
+ * 2: quadratic rehash
+ */
+ for (i = 1; i < fs->fs_ncg; i *= 2) {
+ cg += i;
+ if (cg >= fs->fs_ncg)
+ cg -= fs->fs_ncg;
+ result = (*allocator)(ip, cg, 0, size, flags);
+ if (result)
+ return (result);
+ }
+ /*
+ * 3: brute force search
+ * Note that we start at i == 2, since 0 was checked initially,
+ * and 1 is always checked in the quadratic rehash.
+ */
+ cg = (icg + 2) % fs->fs_ncg;
+ for (i = 2; i < fs->fs_ncg; i++) {
+ result = (*allocator)(ip, cg, 0, size, flags);
+ if (result)
+ return (result);
+ cg++;
+ if (cg == fs->fs_ncg)
+ cg = 0;
+ }
+ return (0);
+}
+
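To see the three-phase policy in isolation, here is a small self-contained program that only prints the order in which ffs_hashalloc() would visit cylinder groups (names and the example numbers are illustrative, not kernel behaviour):

#include <stdio.h>

static void
hashalloc_order(int icg, int ncg)
{
	int i, cg = icg;

	printf("%d ", cg);		/* 1: the preferred cylinder group */

	for (i = 1; i < ncg; i *= 2) {	/* 2: quadratic rehash */
		cg += i;
		if (cg >= ncg)
			cg -= ncg;
		printf("%d ", cg);
	}

	cg = (icg + 2) % ncg;		/* 3: brute force; icg and icg + 1     */
	for (i = 2; i < ncg; i++) {	/*    were already covered above       */
		printf("%d ", cg);
		if (++cg == ncg)
			cg = 0;
	}
	printf("\n");
}

int
main(void)
{
	hashalloc_order(5, 16);		/* e.g. preferred group 5 of 16 */
	return 0;
}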
+/*
+ * Determine whether a fragment can be extended.
+ *
+ * Check to see if the necessary fragments are available, and
+ * if they are, allocate them.
+ *
+ * => called with um_lock held
+ * => returns with um_lock released on success, held on failure
+ */
+static daddr_t
+ffs_fragextend(struct inode *ip, int cg, daddr_t bprev, int osize, int nsize)
+{
+ struct ufsmount *ump;
+ struct fs *fs;
+ struct cg *cgp;
+ struct buf *bp;
+ daddr_t bno;
+ int frags, bbase;
+ int i, error;
+ u_int8_t *blksfree;
+
+ fs = ip->i_fs;
+ ump = ip->i_ump;
+
+ KASSERT(mutex_owned(&ump->um_lock));
+
+ if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
+ return (0);
+ frags = numfrags(fs, nsize);
+ bbase = fragnum(fs, bprev);
+ if (bbase > fragnum(fs, (bprev + frags - 1))) {
+ /* cannot extend across a block boundary */
+ return (0);
+ }
+ mutex_exit(&ump->um_lock);
+ error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+ (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
+ if (error)
+ goto fail;
+ cgp = (struct cg *)bp->b_data;
+ if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
+ goto fail;
+ cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs));
+ if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+ (fs->fs_old_flags & FS_FLAGS_UPDATED))
+ cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs));
+ bno = dtogd(fs, bprev);
+ blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs));
+ for (i = numfrags(fs, osize); i < frags; i++)
+ if (isclr(blksfree, bno + i))
+ goto fail;
+ /*
+ * the current fragment can be extended
+ * deduct the count on fragment being extended into
+ * increase the count on the remaining fragment (if any)
+ * allocate the extended piece
+ */
+ for (i = frags; i < fs->fs_frag - bbase; i++)
+ if (isclr(blksfree, bno + i))
+ break;
+ ufs_add32(cgp->cg_frsum[i - numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs));
+ if (i != frags)
+ ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs));
+ mutex_enter(&ump->um_lock);
+ for (i = numfrags(fs, osize); i < frags; i++) {
+ clrbit(blksfree, bno + i);
+ ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs));
+ fs->fs_cstotal.cs_nffree--;
+ fs->fs_cs(fs, cg).cs_nffree--;
+ }
+ fs->fs_fmod = 1;
+ ACTIVECG_CLR(fs, cg);
+ mutex_exit(&ump->um_lock);
+ bdwrite(bp);
+ return (bprev);
+
+ fail:
+ brelse(bp, 0);
+ mutex_enter(&ump->um_lock);
+ return (0);
+}
+
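The heart of ffs_fragextend() is a bitmap test: the run can only be grown in place if every additional fragment is still marked free. A sketch with a stand-in for the kernel's isclr() macro (in the blksfree map a set bit means the fragment is free):

#include <stdbool.h>
#include <stdint.h>

#define NBBY_SIM 8
#define isfree_sim(map, b) (((map)[(b) / NBBY_SIM] >> ((b) % NBBY_SIM)) & 1)

/* Can ofrags fragments at bit position bno be grown to nfrags in place? */
static bool
can_extend_in_place(const uint8_t *blksfree, int bno, int ofrags, int nfrags)
{
	int i;

	for (i = ofrags; i < nfrags; i++)
		if (!isfree_sim(blksfree, bno + i))	/* needed fragment in use */
			return false;
	return true;
}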
+/*
+ * Determine whether a block can be allocated.
+ *
+ * Check to see if a block of the appropriate size is available,
+ * and if it is, allocate it.
+ */
+static daddr_t
+ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size, int flags)
+{
+ struct ufsmount *ump;
+ struct fs *fs = ip->i_fs;
+ struct cg *cgp;
+ struct buf *bp;
+ int32_t bno;
+ daddr_t blkno;
+ int error, frags, allocsiz, i;
+ u_int8_t *blksfree;
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+ ump = ip->i_ump;
+
+ KASSERT(mutex_owned(&ump->um_lock));
+
+ if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
+ return (0);
+ mutex_exit(&ump->um_lock);
+ error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+ (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
+ if (error)
+ goto fail;
+ cgp = (struct cg *)bp->b_data;
+ if (!cg_chkmagic(cgp, needswap) ||
+ (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
+ goto fail;
+ cgp->cg_old_time = ufs_rw32(time_second, needswap);
+ if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+ (fs->fs_old_flags & FS_FLAGS_UPDATED))
+ cgp->cg_time = ufs_rw64(time_second, needswap);
+ if (size == fs->fs_bsize) {
+ mutex_enter(&ump->um_lock);
+ blkno = ffs_alloccgblk(ip, bp, bpref, flags);
+ ACTIVECG_CLR(fs, cg);
+ mutex_exit(&ump->um_lock);
+ bdwrite(bp);
+ return (blkno);
+ }
+ /*
+ * check to see if any fragments are already available
+ * allocsiz is the size which will be allocated, hacking
+ * it down to a smaller size if necessary
+ */
+ blksfree = cg_blksfree(cgp, needswap);
+ frags = numfrags(fs, size);
+ for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
+ if (cgp->cg_frsum[allocsiz] != 0)
+ break;
+ if (allocsiz == fs->fs_frag) {
+ /*
+ * no fragments were available, so a block will be
+ * allocated, and hacked up
+ */
+ if (cgp->cg_cs.cs_nbfree == 0)
+ goto fail;
+ mutex_enter(&ump->um_lock);
+ blkno = ffs_alloccgblk(ip, bp, bpref, flags);
+ bno = dtogd(fs, blkno);
+ for (i = frags; i < fs->fs_frag; i++)
+ setbit(blksfree, bno + i);
+ i = fs->fs_frag - frags;
+ ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
+ fs->fs_cstotal.cs_nffree += i;
+ fs->fs_cs(fs, cg).cs_nffree += i;
+ fs->fs_fmod = 1;
+ ufs_add32(cgp->cg_frsum[i], 1, needswap);
+ ACTIVECG_CLR(fs, cg);
+ mutex_exit(&ump->um_lock);
+ bdwrite(bp);
+ return (blkno);
+ }
+ bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
+#if 0
+ /*
+ * XXX fvdl mapsearch will panic, and never return -1
+ * also: returning NULL as daddr_t ?
+ */
+ if (bno < 0)
+ goto fail;
+#endif
+ for (i = 0; i < frags; i++)
+ clrbit(blksfree, bno + i);
+ mutex_enter(&ump->um_lock);
+ ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
+ fs->fs_cstotal.cs_nffree -= frags;
+ fs->fs_cs(fs, cg).cs_nffree -= frags;
+ fs->fs_fmod = 1;
+ ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap);
+ if (frags != allocsiz)
+ ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap);
+ blkno = cgbase(fs, cg) + bno;
+ ACTIVECG_CLR(fs, cg);
+ mutex_exit(&ump->um_lock);
+ bdwrite(bp);
+ return blkno;
+
+ fail:
+ brelse(bp, 0);
+ mutex_enter(&ump->um_lock);
+ return (0);
+}
+
+/*
+ * Allocate a block in a cylinder group.
+ *
+ * This algorithm implements the following policy:
+ * 1) allocate the requested block.
+ * 2) allocate a rotationally optimal block in the same cylinder.
+ * 3) allocate the next available block on the block rotor for the
+ * specified cylinder group.
+ * Note that this routine only allocates fs_bsize blocks; these
+ * blocks may be fragmented by the routine that allocates them.
+ */
+static daddr_t
+ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int flags)
+{
+ struct ufsmount *ump;
+ struct fs *fs = ip->i_fs;
+ struct cg *cgp;
+ int cg;
+ daddr_t blkno;
+ int32_t bno;
+ u_int8_t *blksfree;
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+ ump = ip->i_ump;
+
+ KASSERT(mutex_owned(&ump->um_lock));
+
+ cgp = (struct cg *)bp->b_data;
+ blksfree = cg_blksfree(cgp, needswap);
+ if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
+ bpref = ufs_rw32(cgp->cg_rotor, needswap);
+ } else {
+ bpref = blknum(fs, bpref);
+ bno = dtogd(fs, bpref);
+ /*
+ * if the requested block is available, use it
+ */
+ if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
+ goto gotit;
+ /*
+ * if the requested data block isn't available and we are
+ * trying to allocate a contiguous file, return an error.
+ */
+ if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG)
+ return (0);
+ }
+
+ /*
+ * Take the next available block in this cylinder group.
+ */
+ bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
+ if (bno < 0)
+ return (0);
+ cgp->cg_rotor = ufs_rw32(bno, needswap);
+gotit:
+ blkno = fragstoblks(fs, bno);
+ ffs_clrblock(fs, blksfree, blkno);
+ ffs_clusteracct(fs, cgp, blkno, -1);
+ ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
+ fs->fs_cstotal.cs_nbfree--;
+ fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
+ if ((fs->fs_magic == FS_UFS1_MAGIC) &&
+ ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
+ int cylno;
+ cylno = old_cbtocylno(fs, bno);
+ KASSERT(cylno >= 0);
+ KASSERT(cylno < fs->fs_old_ncyl);
+ KASSERT(old_cbtorpos(fs, bno) >= 0);
+ KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bno) < fs->fs_old_nrpos);
+ ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1,
+ needswap);
+ ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap);
+ }
+ fs->fs_fmod = 1;
+ cg = ufs_rw32(cgp->cg_cgx, needswap);
+ blkno = cgbase(fs, cg) + bno;
+ return (blkno);
+}
+
+/*
+ * Determine whether an inode can be allocated.
+ *
+ * Check to see if an inode is available, and if it is,
+ * allocate it using the following policy:
+ * 1) allocate the requested inode.
+ * 2) allocate the next available inode after the requested
+ * inode in the specified cylinder group.
+ */
+static daddr_t
+ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode, int flags)
+{
+ struct ufsmount *ump = ip->i_ump;
+ struct fs *fs = ip->i_fs;
+ struct cg *cgp;
+ struct buf *bp, *ibp;
+ u_int8_t *inosused;
+ int error, start, len, loc, map, i;
+ int32_t initediblk;
+ daddr_t nalloc;
+ struct ufs2_dinode *dp2;
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+ KASSERT(mutex_owned(&ump->um_lock));
+ UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp);
+
+ if (fs->fs_cs(fs, cg).cs_nifree == 0)
+ return (0);
+ mutex_exit(&ump->um_lock);
+ ibp = NULL;
+ initediblk = -1;
+retry:
+ error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+ (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
+ if (error)
+ goto fail;
+ cgp = (struct cg *)bp->b_data;
+ if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0)
+ goto fail;
+
+ if (ibp != NULL &&
+ initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
+ /* Another thread allocated more inodes so we retry the test. */
+ brelse(ibp, 0);
+ ibp = NULL;
+ }
+ /*
+ * Check to see if we need to initialize more inodes.
+ */
+ if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) {
+ initediblk = ufs_rw32(cgp->cg_initediblk, needswap);
+ nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap);
+ if (nalloc + INOPB(fs) > initediblk &&
+ initediblk < ufs_rw32(cgp->cg_niblk, needswap)) {
+ /*
+ * We have to release the cg buffer here to prevent
+ * a deadlock if reading the inode block triggers
+ * a copy-on-write that might use this cg.
+ */
+ brelse(bp, 0);
+ bp = NULL;
+ error = ffs_getblk(ip->i_devvp, fsbtodb(fs,
+ ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)),
+ FFS_NOBLK, fs->fs_bsize, false, &ibp);
+ if (error)
+ goto fail;
+ goto retry;
+ }
+ }
+
+ cgp->cg_old_time = ufs_rw32(time_second, needswap);
+ if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+ (fs->fs_old_flags & FS_FLAGS_UPDATED))
+ cgp->cg_time = ufs_rw64(time_second, needswap);
+ inosused = cg_inosused(cgp, needswap);
+ if (ipref) {
+ ipref %= fs->fs_ipg;
+ if (isclr(inosused, ipref))
+ goto gotit;
+ }
+ start = ufs_rw32(cgp->cg_irotor, needswap) / NBBY;
+ len = howmany(fs->fs_ipg - ufs_rw32(cgp->cg_irotor, needswap),
+ NBBY);
+ loc = skpc(0xff, len, &inosused[start]);
+ if (loc == 0) {
+ len = start + 1;
+ start = 0;
+ loc = skpc(0xff, len, &inosused[0]);
+ if (loc == 0) {
+ printf("cg = %d, irotor = %d, fs = %s\n",
+ cg, ufs_rw32(cgp->cg_irotor, needswap),
+ fs->fs_fsmnt);
+ panic("ffs_nodealloccg: map corrupted");
+ /* NOTREACHED */
+ }
+ }
+ i = start + len - loc;
+ map = inosused[i] ^ 0xff;
+ if (map == 0) {
+ printf("fs = %s\n", fs->fs_fsmnt);
+ panic("ffs_nodealloccg: block not in map");
+ }
+ ipref = i * NBBY + ffs(map) - 1;
+ cgp->cg_irotor = ufs_rw32(ipref, needswap);
+gotit:
+ UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref,
+ mode);
+ /*
+ * Initialize the new block of inodes, if one was set up above.
+ */
+ if (ibp != NULL) {
+ KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap));
+ memset(ibp->b_data, 0, fs->fs_bsize);
+ dp2 = (struct ufs2_dinode *)(ibp->b_data);
+ for (i = 0; i < INOPB(fs); i++) {
+ /*
+ * Don't bother to swap, it's supposed to be
+ * random, after all.
+ */
+ dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1;
+ dp2++;
+ }
+ initediblk += INOPB(fs);
+ cgp->cg_initediblk = ufs_rw32(initediblk, needswap);
+ }
+
+ mutex_enter(&ump->um_lock);
+ ACTIVECG_CLR(fs, cg);
+ setbit(inosused, ipref);
+ ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap);
+ fs->fs_cstotal.cs_nifree--;
+ fs->fs_cs(fs, cg).cs_nifree--;
+ fs->fs_fmod = 1;
+ if ((mode & IFMT) == IFDIR) {
+ ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap);
+ fs->fs_cstotal.cs_ndir++;
+ fs->fs_cs(fs, cg).cs_ndir++;
+ }
+ mutex_exit(&ump->um_lock);
+ if (ibp != NULL) {
+ bwrite(bp);
+ bawrite(ibp);
+ } else
+ bdwrite(bp);
+ return (cg * fs->fs_ipg + ipref);
+ fail:
+ if (bp != NULL)
+ brelse(bp, 0);
+ if (ibp != NULL)
+ brelse(ibp, 0);
+ mutex_enter(&ump->um_lock);
+ return (0);
+}
+
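The inode search in ffs_nodealloccg() is a byte-at-a-time scan of the in-use bitmap: skip bytes that are completely allocated, then take the lowest clear bit of the first byte with room. A simplified user-space version, assuming an inosused array where a set bit means "allocated":

#include <stdint.h>
#include <strings.h>	/* ffs() */

/* Returns the inode index within the group, or -1 if the map is full
 * (the kernel treats a full map here as corruption and panics). */
static int
find_free_inode(const uint8_t *inosused, int nbytes, int rotor_byte)
{
	int i, idx;

	for (i = 0; i < nbytes; i++) {
		idx = (rotor_byte + i) % nbytes;
		if (inosused[idx] != 0xff)	/* at least one free inode here */
			return idx * 8 + ffs((uint8_t)~inosused[idx]) - 1;
	}
	return -1;
}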
+/*
+ * Allocate a block or fragment.
+ *
+ * The specified block or fragment is removed from the
+ * free map, possibly fragmenting a block in the process.
+ *
+ * This implementation should mirror ffs_blkfree()
+ *
+ * => um_lock not held on entry or exit
+ */
+int
+ffs_blkalloc(struct inode *ip, daddr_t bno, long size)
+{
+ int error;
+
+ error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size,
+ ip->i_dev, ip->i_uid);
+ if (error)
+ return error;
+
+ return ffs_blkalloc_ump(ip->i_ump, bno, size);
+}
+
+int
+ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size)
+{
+ struct fs *fs = ump->um_fs;
+ struct cg *cgp;
+ struct buf *bp;
+ int32_t fragno, cgbno;
+ int i, error, cg, blk, frags, bbase;
+ u_int8_t *blksfree;
+ const int needswap = UFS_FSNEEDSWAP(fs);
+
+ KASSERT((u_int)size <= fs->fs_bsize && fragoff(fs, size) == 0 &&
+ fragnum(fs, bno) + numfrags(fs, size) <= fs->fs_frag);
+ KASSERT(bno < fs->fs_size);
+
+ cg = dtog(fs, bno);
+ error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)),
+ (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return error;
+ }
+ cgp = (struct cg *)bp->b_data;
+ if (!cg_chkmagic(cgp, needswap)) {
+ brelse(bp, 0);
+ return EIO;
+ }
+ cgp->cg_old_time = ufs_rw32(time_second, needswap);
+ cgp->cg_time = ufs_rw64(time_second, needswap);
+ cgbno = dtogd(fs, bno);
+ blksfree = cg_blksfree(cgp, needswap);
+
+ mutex_enter(&ump->um_lock);
+ if (size == fs->fs_bsize) {
+ fragno = fragstoblks(fs, cgbno);
+ if (!ffs_isblock(fs, blksfree, fragno)) {
+ mutex_exit(&ump->um_lock);
+ brelse(bp, 0);
+ return EBUSY;
+ }
+ ffs_clrblock(fs, blksfree, fragno);
+ ffs_clusteracct(fs, cgp, fragno, -1);
+ ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
+ fs->fs_cstotal.cs_nbfree--;
+ fs->fs_cs(fs, cg).cs_nbfree--;
+ } else {
+ bbase = cgbno - fragnum(fs, cgbno);
+
+ frags = numfrags(fs, size);
+ for (i = 0; i < frags; i++) {
+ if (isclr(blksfree, cgbno + i)) {
+ mutex_exit(&ump->um_lock);
+ brelse(bp, 0);
+ return EBUSY;
+ }
+ }
+ /*
+ * if a complete block is being split, account for it
+ */
+ fragno = fragstoblks(fs, bbase);
+ if (ffs_isblock(fs, blksfree, fragno)) {
+ ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap);
+ fs->fs_cstotal.cs_nffree += fs->fs_frag;
+ fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag;
+ ffs_clusteracct(fs, cgp, fragno, -1);
+ ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
+ fs->fs_cstotal.cs_nbfree--;
+ fs->fs_cs(fs, cg).cs_nbfree--;
+ }
+ /*
+ * decrement the counts associated with the old frags
+ */
+ blk = blkmap(fs, blksfree, bbase);
+ ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
+ /*
+ * allocate the fragment
+ */
+ for (i = 0; i < frags; i++) {
+ clrbit(blksfree, cgbno + i);
+ }
+ ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap);
+ fs->fs_cstotal.cs_nffree -= i;
+ fs->fs_cs(fs, cg).cs_nffree -= i;
+ /*
+ * add back in counts associated with the new frags
+ */
+ blk = blkmap(fs, blksfree, bbase);
+ ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
+ }
+ fs->fs_fmod = 1;
+ ACTIVECG_CLR(fs, cg);
+ mutex_exit(&ump->um_lock);
+ bdwrite(bp);
+ return 0;
+}
+
+/*
+ * Free a block or fragment.
+ *
+ * The specified block or fragment is placed back in the
+ * free map. If a fragment is deallocated, a possible
+ * block reassembly is checked.
+ *
+ * => um_lock not held on entry or exit
+ */
+void
+ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
+ ino_t inum)
+{
+ struct cg *cgp;
+ struct buf *bp;
+ struct ufsmount *ump;
+ daddr_t cgblkno;
+ int error, cg;
+ dev_t dev;
+ const bool devvp_is_snapshot = (devvp->v_type != VBLK);
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+ KASSERT(!devvp_is_snapshot);
+
+ cg = dtog(fs, bno);
+ dev = devvp->v_rdev;
+ ump = VFSTOUFS(devvp->v_specmountpoint);
+ KASSERT(fs == ump->um_fs);
+ cgblkno = fsbtodb(fs, cgtod(fs, cg));
+ if (ffs_snapblkfree(fs, devvp, bno, size, inum))
+ return;
+
+ error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
+ if (error)
+ return;
+
+ error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
+ NOCRED, B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return;
+ }
+ cgp = (struct cg *)bp->b_data;
+ if (!cg_chkmagic(cgp, needswap)) {
+ brelse(bp, 0);
+ return;
+ }
+
+ ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
+
+ bdwrite(bp);
+}
+
+/*
+ * Free a block or fragment from a snapshot cg copy.
+ *
+ * The specified block or fragment is placed back in the
+ * free map. If a fragment is deallocated, a possible
+ * block reassembly is checked.
+ *
+ * => um_lock not held on entry or exit
+ */
+void
+ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
+ ino_t inum)
+{
+ struct cg *cgp;
+ struct buf *bp;
+ struct ufsmount *ump;
+ daddr_t cgblkno;
+ int error, cg;
+ dev_t dev;
+ const bool devvp_is_snapshot = (devvp->v_type != VBLK);
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+ KASSERT(devvp_is_snapshot);
+
+ cg = dtog(fs, bno);
+ dev = VTOI(devvp)->i_devvp->v_rdev;
+ ump = VFSTOUFS(devvp->v_mount);
+ cgblkno = fragstoblks(fs, cgtod(fs, cg));
+
+ error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
+ if (error)
+ return;
+
+ error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
+ NOCRED, B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return;
+ }
+ cgp = (struct cg *)bp->b_data;
+ if (!cg_chkmagic(cgp, needswap)) {
+ brelse(bp, 0);
+ return;
+ }
+
+ ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
+
+ bdwrite(bp);
+}
+
+static void
+ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
+ struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot)
+{
+ struct cg *cgp;
+ int32_t fragno, cgbno;
+ int i, cg, blk, frags, bbase;
+ u_int8_t *blksfree;
+ const int needswap = UFS_FSNEEDSWAP(fs);
+
+ cg = dtog(fs, bno);
+ cgp = (struct cg *)bp->b_data;
+ cgp->cg_old_time = ufs_rw32(time_second, needswap);
+ if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+ (fs->fs_old_flags & FS_FLAGS_UPDATED))
+ cgp->cg_time = ufs_rw64(time_second, needswap);
+ cgbno = dtogd(fs, bno);
+ blksfree = cg_blksfree(cgp, needswap);
+ mutex_enter(&ump->um_lock);
+ if (size == fs->fs_bsize) {
+ fragno = fragstoblks(fs, cgbno);
+ if (!ffs_isfreeblock(fs, blksfree, fragno)) {
+ if (devvp_is_snapshot) {
+ mutex_exit(&ump->um_lock);
+ return;
+ }
+ printf("dev = 0x%llx, block = %" PRId64 ", fs = %s\n",
+ (unsigned long long)dev, bno, fs->fs_fsmnt);
+ panic("blkfree: freeing free block");
+ }
+ ffs_setblock(fs, blksfree, fragno);
+ ffs_clusteracct(fs, cgp, fragno, 1);
+ ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
+ fs->fs_cstotal.cs_nbfree++;
+ fs->fs_cs(fs, cg).cs_nbfree++;
+ if ((fs->fs_magic == FS_UFS1_MAGIC) &&
+ ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
+ i = old_cbtocylno(fs, cgbno);
+ KASSERT(i >= 0);
+ KASSERT(i < fs->fs_old_ncyl);
+ KASSERT(old_cbtorpos(fs, cgbno) >= 0);
+ KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
+ ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1,
+ needswap);
+ ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
+ }
+ } else {
+ bbase = cgbno - fragnum(fs, cgbno);
+ /*
+ * decrement the counts associated with the old frags
+ */
+ blk = blkmap(fs, blksfree, bbase);
+ ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
+ /*
+ * deallocate the fragment
+ */
+ frags = numfrags(fs, size);
+ for (i = 0; i < frags; i++) {
+ if (isset(blksfree, cgbno + i)) {
+ printf("dev = 0x%llx, block = %" PRId64
+ ", fs = %s\n",
+ (unsigned long long)dev, bno + i,
+ fs->fs_fsmnt);
+ panic("blkfree: freeing free frag");
+ }
+ setbit(blksfree, cgbno + i);
+ }
+ ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
+ fs->fs_cstotal.cs_nffree += i;
+ fs->fs_cs(fs, cg).cs_nffree += i;
+ /*
+ * add back in counts associated with the new frags
+ */
+ blk = blkmap(fs, blksfree, bbase);
+ ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
+ /*
+ * if a complete block has been reassembled, account for it
+ */
+ fragno = fragstoblks(fs, bbase);
+ if (ffs_isblock(fs, blksfree, fragno)) {
+ ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
+ fs->fs_cstotal.cs_nffree -= fs->fs_frag;
+ fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
+ ffs_clusteracct(fs, cgp, fragno, 1);
+ ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
+ fs->fs_cstotal.cs_nbfree++;
+ fs->fs_cs(fs, cg).cs_nbfree++;
+ if ((fs->fs_magic == FS_UFS1_MAGIC) &&
+ ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
+ i = old_cbtocylno(fs, bbase);
+ KASSERT(i >= 0);
+ KASSERT(i < fs->fs_old_ncyl);
+ KASSERT(old_cbtorpos(fs, bbase) >= 0);
+ KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
+ ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
+ bbase)], 1, needswap);
+ ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
+ }
+ }
+ }
+ fs->fs_fmod = 1;
+ ACTIVECG_CLR(fs, cg);
+ mutex_exit(&ump->um_lock);
+}
+
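When fragments are freed, ffs_blkfree_common() checks whether the enclosing block has become entirely free and, if so, promotes the fragment counts to a whole free block. The check itself is just a scan of the block's bits; a sketch under the same illustrative bitmap convention as above:

#include <stdbool.h>
#include <stdint.h>

#define NBBY_SIM 8
#define isfree_sim(map, b) (((map)[(b) / NBBY_SIM] >> ((b) % NBBY_SIM)) & 1)

/* True when every one of the frag fragments starting at bbase is free;
 * the caller would then move the accounting from nffree to nbfree. */
static bool
block_reassembled(const uint8_t *blksfree, int bbase, int frag)
{
	int i;

	for (i = 0; i < frag; i++)
		if (!isfree_sim(blksfree, bbase + i))
			return false;
	return true;
}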
+/*
+ * Free an inode.
+ */
+int
+ffs_vfree(struct vnode *vp, ino_t ino, int mode)
+{
+
+ return ffs_freefile(vp->v_mount, ino, mode);
+}
+
+/*
+ * Do the actual free operation.
+ * The specified inode is placed back in the free map.
+ *
+ * => um_lock not held on entry or exit
+ */
+int
+ffs_freefile(struct mount *mp, ino_t ino, int mode)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct fs *fs = ump->um_fs;
+ struct vnode *devvp;
+ struct cg *cgp;
+ struct buf *bp;
+ int error, cg;
+ daddr_t cgbno;
+ dev_t dev;
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+ cg = ino_to_cg(fs, ino);
+ devvp = ump->um_devvp;
+ dev = devvp->v_rdev;
+ cgbno = fsbtodb(fs, cgtod(fs, cg));
+
+ if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
+ panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s",
+ (long long)dev, (unsigned long long)ino, fs->fs_fsmnt);
+ error = bread(devvp, cgbno, (int)fs->fs_cgsize,
+ NOCRED, B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+ cgp = (struct cg *)bp->b_data;
+ if (!cg_chkmagic(cgp, needswap)) {
+ brelse(bp, 0);
+ return (0);
+ }
+
+ ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);
+
+ bdwrite(bp);
+
+ return 0;
+}
+
+int
+ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
+{
+ struct ufsmount *ump;
+ struct cg *cgp;
+ struct buf *bp;
+ int error, cg;
+ daddr_t cgbno;
+ dev_t dev;
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+ KASSERT(devvp->v_type != VBLK);
+
+ cg = ino_to_cg(fs, ino);
+ dev = VTOI(devvp)->i_devvp->v_rdev;
+ ump = VFSTOUFS(devvp->v_mount);
+ cgbno = fragstoblks(fs, cgtod(fs, cg));
+ if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
+ panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s",
+ (unsigned long long)dev, (unsigned long long)ino,
+ fs->fs_fsmnt);
+ error = bread(devvp, cgbno, (int)fs->fs_cgsize,
+ NOCRED, B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+ cgp = (struct cg *)bp->b_data;
+ if (!cg_chkmagic(cgp, needswap)) {
+ brelse(bp, 0);
+ return (0);
+ }
+ ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);
+
+ bdwrite(bp);
+
+ return 0;
+}
+
+static void
+ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
+ struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot)
+{
+ int cg;
+ struct cg *cgp;
+ u_int8_t *inosused;
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+ cg = ino_to_cg(fs, ino);
+ cgp = (struct cg *)bp->b_data;
+ cgp->cg_old_time = ufs_rw32(time_second, needswap);
+ if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+ (fs->fs_old_flags & FS_FLAGS_UPDATED))
+ cgp->cg_time = ufs_rw64(time_second, needswap);
+ inosused = cg_inosused(cgp, needswap);
+ ino %= fs->fs_ipg;
+ if (isclr(inosused, ino)) {
+ printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
+ (unsigned long long)dev, (unsigned long long)ino +
+ cg * fs->fs_ipg, fs->fs_fsmnt);
+ if (fs->fs_ronly == 0)
+ panic("ifree: freeing free inode");
+ }
+ clrbit(inosused, ino);
+ if (!devvp_is_snapshot)
+ UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp,
+ ino + cg * fs->fs_ipg, mode);
+ if (ino < ufs_rw32(cgp->cg_irotor, needswap))
+ cgp->cg_irotor = ufs_rw32(ino, needswap);
+ ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
+ mutex_enter(&ump->um_lock);
+ fs->fs_cstotal.cs_nifree++;
+ fs->fs_cs(fs, cg).cs_nifree++;
+ if ((mode & IFMT) == IFDIR) {
+ ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
+ fs->fs_cstotal.cs_ndir--;
+ fs->fs_cs(fs, cg).cs_ndir--;
+ }
+ fs->fs_fmod = 1;
+ ACTIVECG_CLR(fs, cg);
+ mutex_exit(&ump->um_lock);
+}
+
+/*
+ * Check to see if a file is free.
+ */
+int
+ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
+{
+ struct cg *cgp;
+ struct buf *bp;
+ daddr_t cgbno;
+ int ret, cg;
+ u_int8_t *inosused;
+ const bool devvp_is_snapshot = (devvp->v_type != VBLK);
+
+ KASSERT(devvp_is_snapshot);
+
+ cg = ino_to_cg(fs, ino);
+ if (devvp_is_snapshot)
+ cgbno = fragstoblks(fs, cgtod(fs, cg));
+ else
+ cgbno = fsbtodb(fs, cgtod(fs, cg));
+ if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
+ return 1;
+ if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, 0, &bp)) {
+ brelse(bp, 0);
+ return 1;
+ }
+ cgp = (struct cg *)bp->b_data;
+ if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
+ brelse(bp, 0);
+ return 1;
+ }
+ inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
+ ino %= fs->fs_ipg;
+ ret = isclr(inosused, ino);
+ brelse(bp, 0);
+ return ret;
+}
+
+/*
+ * Find a block of the specified size in the specified cylinder group.
+ *
+ * It is a panic if a request is made to find a block when none are
+ * available.
+ */
+static int32_t
+ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
+{
+ int32_t bno;
+ int start, len, loc, i;
+ int blk, field, subfield, pos;
+ int ostart, olen;
+ u_int8_t *blksfree;
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+ /* KASSERT(mutex_owned(&ump->um_lock)); */
+
+ /*
+ * find the fragment by searching through the free block
+ * map for an appropriate bit pattern
+ */
+ if (bpref)
+ start = dtogd(fs, bpref) / NBBY;
+ else
+ start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
+ blksfree = cg_blksfree(cgp, needswap);
+ len = howmany(fs->fs_fpg, NBBY) - start;
+ ostart = start;
+ olen = len;
+ loc = scanc((u_int)len,
+ (const u_char *)&blksfree[start],
+ (const u_char *)fragtbl[fs->fs_frag],
+ (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
+ if (loc == 0) {
+ len = start + 1;
+ start = 0;
+ loc = scanc((u_int)len,
+ (const u_char *)&blksfree[0],
+ (const u_char *)fragtbl[fs->fs_frag],
+ (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
+ if (loc == 0) {
+ printf("start = %d, len = %d, fs = %s\n",
+ ostart, olen, fs->fs_fsmnt);
+ printf("offset=%d %ld\n",
+ ufs_rw32(cgp->cg_freeoff, needswap),
+ (long)blksfree - (long)cgp);
+ printf("cg %d\n", cgp->cg_cgx);
+ panic("ffs_alloccg: map corrupted");
+ /* NOTREACHED */
+ }
+ }
+ bno = (start + len - loc) * NBBY;
+ cgp->cg_frotor = ufs_rw32(bno, needswap);
+ /*
+ * found the byte in the map
+ * sift through the bits to find the selected frag
+ */
+ for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
+ blk = blkmap(fs, blksfree, bno);
+ blk <<= 1;
+ field = around[allocsiz];
+ subfield = inside[allocsiz];
+ for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
+ if ((blk & field) == subfield)
+ return (bno + pos);
+ field <<= 1;
+ subfield <<= 1;
+ }
+ }
+ printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt);
+ panic("ffs_alloccg: block not in map");
+ /* return (-1); */
+}
+
+/*
+ * Fserr prints the name of a file system with an error diagnostic.
+ *
+ * The form of the error message is:
+ * fs: error message
+ */
+static void
+ffs_fserr(struct fs *fs, u_int uid, const char *cp)
+{
+
+ log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
+ uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp);
+}
--- /dev/null
+/* $NetBSD: ffs_appleufs.c,v 1.12 2011/11/19 22:51:31 tls Exp $ */
+
+/*
+ * Copyright (c) 2002 Darrin B. Jewell
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_appleufs.c,v 1.12 2011/11/19 22:51:31 tls Exp $");
+
+#include <sys/param.h>
+#include <sys/time.h>
+#if defined(_KERNEL)
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/cprng.h>
+#endif
+
+#include <ufs/ufs/dinode.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#if !defined(_KERNEL) && !defined(STANDALONE)
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#define KASSERT(x) assert(x)
+#endif
+
+/*
+ * This is the same ones-complement checksum calculation as in_cksum().
+ */
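+/*
+ * The 16-bit words of the label are summed into a 32-bit accumulator,
+ * the carries are folded back in, and the result is complemented.
+ * For example, summing the words 0xffff and 0x0002 gives 0x10001;
+ * folding yields 0x0001 + 0x0001 = 0x0002, and the returned checksum
+ * is ~0x0002 = 0xfffd.
+ */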
+u_int16_t
+ffs_appleufs_cksum(const struct appleufslabel *appleufs)
+{
+ const u_int16_t *p = (const u_int16_t *)appleufs;
+ int len = APPLEUFS_LABEL_SIZE; /* sizeof(struct appleufslabel) */
+ long res = 0;
+ while (len > 1) {
+ res += *p++;
+ len -= 2;
+ }
+#if 0 /* APPLEUFS_LABEL_SIZE is guaranteed to be even */
+ if (len == 1)
+ res += htobe16(*(u_char *)p<<8);
+#endif
+ res = (res >> 16) + (res & 0xffff);
+ res += (res >> 16);
+ return (~res);
+}
+
+/* copies o to n, validating and byteswapping along the way
+ * returns 0 if ok, EINVAL if not valid
+ */
+int
+ffs_appleufs_validate(const char *name, const struct appleufslabel *o,
+ struct appleufslabel *n)
+{
+ struct appleufslabel tmp;
+ if (!n) n = &tmp;
+
+ if (o->ul_magic != be32toh(APPLEUFS_LABEL_MAGIC)) {
+ return EINVAL;
+ }
+ *n = *o;
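+	/*
+	 * Recompute the checksum over the copy with its checksum field
+	 * zeroed and compare the result against the stored value.
+	 */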
+ n->ul_checksum = 0;
+ n->ul_checksum = ffs_appleufs_cksum(n);
+ if (n->ul_checksum != o->ul_checksum) {
+#if defined(DIAGNOSTIC) || !defined(_KERNEL)
+ printf("%s: invalid APPLE UFS checksum. found 0x%x, expecting 0x%x",
+ name,o->ul_checksum,n->ul_checksum);
+#endif
+ return EINVAL;
+ }
+ n->ul_magic = be32toh(o->ul_magic);
+ n->ul_version = be32toh(o->ul_version);
+ n->ul_time = be32toh(o->ul_time);
+ n->ul_namelen = be16toh(o->ul_namelen);
+
+ if (n->ul_namelen > APPLEUFS_MAX_LABEL_NAME) {
+#if defined(DIAGNOSTIC) || !defined(_KERNEL)
+ printf("%s: APPLE UFS label name too long, truncated.\n",
+ name);
+#endif
+ n->ul_namelen = APPLEUFS_MAX_LABEL_NAME;
+ }
+ /* if len is max, will set ul_unused1 */
+	n->ul_name[n->ul_namelen] = '\0';
+
+#ifdef DEBUG
+ printf("%s: found APPLE UFS label v%d: \"%s\"\n",
+ name,n->ul_version,n->ul_name);
+#endif
+ n->ul_uuid = be64toh(o->ul_uuid);
+
+ return 0;
+}
+
+void
+ffs_appleufs_set(struct appleufslabel *appleufs, const char *name, time_t t,
+ uint64_t uuid)
+{
+ size_t namelen;
+ if (!name) name = "untitled";
+ if (t == ((time_t)-1)) {
+#if defined(_KERNEL)
+ t = time_second;
+#elif defined(STANDALONE)
+ t = 0;
+#else
+ (void)time(&t);
+#endif
+ }
+ if (uuid == 0) {
+#if defined(_KERNEL) && !defined(STANDALONE)
+ uuid = cprng_fast64();
+#endif
+ }
+ namelen = strlen(name);
+ if (namelen > APPLEUFS_MAX_LABEL_NAME)
+ namelen = APPLEUFS_MAX_LABEL_NAME;
+ memset(appleufs, 0, APPLEUFS_LABEL_SIZE);
+ appleufs->ul_magic = htobe32(APPLEUFS_LABEL_MAGIC);
+ appleufs->ul_version = htobe32(APPLEUFS_LABEL_VERSION);
+ appleufs->ul_time = htobe32((u_int32_t)t);
+ appleufs->ul_namelen = htobe16(namelen);
+ strncpy(appleufs->ul_name,name,namelen);
+ appleufs->ul_uuid = htobe64(uuid);
+ appleufs->ul_checksum = ffs_appleufs_cksum(appleufs);
+}
--- /dev/null
+/* $NetBSD: ffs_balloc.c,v 1.54 2011/04/23 07:36:02 hannken Exp $ */
+
+/*
+ * Copyright (c) 2002 Networks Associates Technology, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Marshall
+ * Kirk McKusick and Network Associates Laboratories, the Security
+ * Research Division of Network Associates, Inc. under DARPA/SPAWAR
+ * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
+ * research program
+ *
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_balloc.c 8.8 (Berkeley) 6/16/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_balloc.c,v 1.54 2011/04/23 07:36:02 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/kauth.h>
+#include <sys/fstrans.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#include <uvm/uvm.h>
+
+static int ffs_balloc_ufs1(struct vnode *, off_t, int, kauth_cred_t, int,
+ struct buf **);
+static int ffs_balloc_ufs2(struct vnode *, off_t, int, kauth_cred_t, int,
+ struct buf **);
+
+/*
+ * Balloc defines the structure of file system storage
+ * by allocating the physical blocks on a device given
+ * the inode and the logical block number in a file.
+ */
+
+int
+ffs_balloc(struct vnode *vp, off_t off, int size, kauth_cred_t cred, int flags,
+ struct buf **bpp)
+{
+ int error;
+
+ if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC)
+ error = ffs_balloc_ufs2(vp, off, size, cred, flags, bpp);
+ else
+ error = ffs_balloc_ufs1(vp, off, size, cred, flags, bpp);
+
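+	/*
+	 * On success, run the file-system copy-on-write hook (used by the
+	 * snapshot code) on the returned buffer; if the hook fails, release
+	 * the buffer and return its error instead.
+	 */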
+ if (error == 0 && bpp != NULL && (error = fscow_run(*bpp, false)) != 0)
+ brelse(*bpp, 0);
+
+ return error;
+}
+
+static int
+ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
+ int flags, struct buf **bpp)
+{
+ daddr_t lbn, lastlbn;
+ struct buf *bp, *nbp;
+ struct inode *ip = VTOI(vp);
+ struct fs *fs = ip->i_fs;
+ struct ufsmount *ump = ip->i_ump;
+ struct indir indirs[NIADDR + 2];
+ daddr_t newb, pref, nb;
+ int32_t *bap; /* XXX ondisk32 */
+ int deallocated, osize, nsize, num, i, error;
+ int32_t *blkp, *allocblk, allociblk[NIADDR + 1];
+ int32_t *allocib;
+ int unwindidx = -1;
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+ UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist);
+
+ lbn = lblkno(fs, off);
+ size = blkoff(fs, off) + size;
+ if (size > fs->fs_bsize)
+ panic("ffs_balloc: blk too big");
+ if (bpp != NULL) {
+ *bpp = NULL;
+ }
+ UVMHIST_LOG(ubchist, "vp %p lbn 0x%x size 0x%x", vp, lbn, size,0);
+
+ if (lbn < 0)
+ return (EFBIG);
+
+ /*
+ * If the next write will extend the file into a new block,
+ * and the file is currently composed of a fragment
+ * this fragment has to be extended to be a full block.
+ */
+
+ lastlbn = lblkno(fs, ip->i_size);
+ if (lastlbn < NDADDR && lastlbn < lbn) {
+ nb = lastlbn;
+ osize = blksize(fs, ip, nb);
+ if (osize < fs->fs_bsize && osize > 0) {
+ mutex_enter(&ump->um_lock);
+ error = ffs_realloccg(ip, nb,
+ ffs_blkpref_ufs1(ip, lastlbn, nb, flags,
+ &ip->i_ffs1_db[0]),
+ osize, (int)fs->fs_bsize, cred, bpp, &newb);
+ if (error)
+ return (error);
+ ip->i_size = lblktosize(fs, nb + 1);
+ ip->i_ffs1_size = ip->i_size;
+ uvm_vnp_setsize(vp, ip->i_ffs1_size);
+ ip->i_ffs1_db[nb] = ufs_rw32((u_int32_t)newb, needswap);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (bpp && *bpp) {
+ if (flags & B_SYNC)
+ bwrite(*bpp);
+ else
+ bawrite(*bpp);
+ }
+ }
+ }
+
+ /*
+ * The first NDADDR blocks are direct blocks
+ */
+
+ if (lbn < NDADDR) {
+ nb = ufs_rw32(ip->i_ffs1_db[lbn], needswap);
+ if (nb != 0 && ip->i_size >= lblktosize(fs, lbn + 1)) {
+
+ /*
+ * The block is an already-allocated direct block
+ * and the file already extends past this block,
+ * thus this must be a whole block.
+ * Just read the block (if requested).
+ */
+
+ if (bpp != NULL) {
+ error = bread(vp, lbn, fs->fs_bsize, NOCRED,
+ B_MODIFY, bpp);
+ if (error) {
+ brelse(*bpp, 0);
+ return (error);
+ }
+ }
+ return (0);
+ }
+ if (nb != 0) {
+
+ /*
+ * Consider need to reallocate a fragment.
+ */
+
+ osize = fragroundup(fs, blkoff(fs, ip->i_size));
+ nsize = fragroundup(fs, size);
+ if (nsize <= osize) {
+
+ /*
+ * The existing block is already
+ * at least as big as we want.
+ * Just read the block (if requested).
+ */
+
+ if (bpp != NULL) {
+ error = bread(vp, lbn, osize, NOCRED,
+ B_MODIFY, bpp);
+ if (error) {
+ brelse(*bpp, 0);
+ return (error);
+ }
+ }
+ return 0;
+ } else {
+
+ /*
+ * The existing block is smaller than we want,
+ * grow it.
+ */
+ mutex_enter(&ump->um_lock);
+ error = ffs_realloccg(ip, lbn,
+ ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags,
+ &ip->i_ffs1_db[0]),
+ osize, nsize, cred, bpp, &newb);
+ if (error)
+ return (error);
+ }
+ } else {
+
+ /*
+ * the block was not previously allocated,
+ * allocate a new block or fragment.
+ */
+
+ if (ip->i_size < lblktosize(fs, lbn + 1))
+ nsize = fragroundup(fs, size);
+ else
+ nsize = fs->fs_bsize;
+ mutex_enter(&ump->um_lock);
+ error = ffs_alloc(ip, lbn,
+ ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags,
+ &ip->i_ffs1_db[0]),
+ nsize, flags, cred, &newb);
+ if (error)
+ return (error);
+ if (bpp != NULL) {
+ error = ffs_getblk(vp, lbn, fsbtodb(fs, newb),
+ nsize, (flags & B_CLRBUF) != 0, bpp);
+ if (error)
+ return error;
+ }
+ }
+ ip->i_ffs1_db[lbn] = ufs_rw32((u_int32_t)newb, needswap);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ return (0);
+ }
+
+ /*
+ * Determine the number of levels of indirection.
+ */
+
+ pref = 0;
+ if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
+ return (error);
+
+ /*
+ * Fetch the first indirect block allocating if necessary.
+ */
+
+ --num;
+ nb = ufs_rw32(ip->i_ffs1_ib[indirs[0].in_off], needswap);
+ allocib = NULL;
+ allocblk = allociblk;
+ if (nb == 0) {
+ mutex_enter(&ump->um_lock);
+ pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY, NULL);
+ error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+ flags | B_METAONLY, cred, &newb);
+ if (error)
+ goto fail;
+ nb = newb;
+ *allocblk++ = nb;
+ error = ffs_getblk(vp, indirs[1].in_lbn, fsbtodb(fs, nb),
+ fs->fs_bsize, true, &bp);
+ if (error)
+ goto fail;
+ /*
+ * Write synchronously so that indirect blocks
+ * never point at garbage.
+ */
+ if ((error = bwrite(bp)) != 0)
+ goto fail;
+ unwindidx = 0;
+ allocib = &ip->i_ffs1_ib[indirs[0].in_off];
+ *allocib = ufs_rw32(nb, needswap);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ }
+
+ /*
+ * Fetch through the indirect blocks, allocating as necessary.
+ */
+
+ for (i = 1;;) {
+ error = bread(vp,
+ indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, 0, &bp);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ bap = (int32_t *)bp->b_data; /* XXX ondisk32 */
+ nb = ufs_rw32(bap[indirs[i].in_off], needswap);
+ if (i == num)
+ break;
+ i++;
+ if (nb != 0) {
+ brelse(bp, 0);
+ continue;
+ }
+ if (fscow_run(bp, true) != 0) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ mutex_enter(&ump->um_lock);
+ /* Try to keep snapshot indirect blocks contiguous. */
+ if (i == num && (ip->i_flags & SF_SNAPSHOT) != 0)
+ pref = ffs_blkpref_ufs1(ip, lbn, indirs[i-1].in_off,
+ flags | B_METAONLY, &bap[0]);
+ if (pref == 0)
+ pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY,
+ NULL);
+ error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+ flags | B_METAONLY, cred, &newb);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ nb = newb;
+ *allocblk++ = nb;
+ error = ffs_getblk(vp, indirs[i].in_lbn, fsbtodb(fs, nb),
+ fs->fs_bsize, true, &nbp);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ /*
+ * Write synchronously so that indirect blocks
+ * never point at garbage.
+ */
+ if ((error = bwrite(nbp)) != 0) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ if (unwindidx < 0)
+ unwindidx = i - 1;
+ bap[indirs[i - 1].in_off] = ufs_rw32(nb, needswap);
+
+ /*
+ * If required, write synchronously, otherwise use
+ * delayed write.
+ */
+
+ if (flags & B_SYNC) {
+ bwrite(bp);
+ } else {
+ bdwrite(bp);
+ }
+ }
+
+ if (flags & B_METAONLY) {
+ KASSERT(bpp != NULL);
+ *bpp = bp;
+ return (0);
+ }
+
+ /*
+ * Get the data block, allocating if necessary.
+ */
+
+ if (nb == 0) {
+ if (fscow_run(bp, true) != 0) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ mutex_enter(&ump->um_lock);
+ pref = ffs_blkpref_ufs1(ip, lbn, indirs[num].in_off, flags,
+ &bap[0]);
+ error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred,
+ &newb);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ nb = newb;
+ *allocblk++ = nb;
+ if (bpp != NULL) {
+ error = ffs_getblk(vp, lbn, fsbtodb(fs, nb),
+ fs->fs_bsize, (flags & B_CLRBUF) != 0, bpp);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ }
+ bap[indirs[num].in_off] = ufs_rw32(nb, needswap);
+ if (allocib == NULL && unwindidx < 0) {
+ unwindidx = i - 1;
+ }
+
+ /*
+ * If required, write synchronously, otherwise use
+ * delayed write.
+ */
+
+ if (flags & B_SYNC) {
+ bwrite(bp);
+ } else {
+ bdwrite(bp);
+ }
+ return (0);
+ }
+ brelse(bp, 0);
+ if (bpp != NULL) {
+ if (flags & B_CLRBUF) {
+ error = bread(vp, lbn, (int)fs->fs_bsize,
+ NOCRED, B_MODIFY, &nbp);
+ if (error) {
+ brelse(nbp, 0);
+ goto fail;
+ }
+ } else {
+ error = ffs_getblk(vp, lbn, fsbtodb(fs, nb),
+ fs->fs_bsize, true, &nbp);
+ if (error)
+ goto fail;
+ }
+ *bpp = nbp;
+ }
+ return (0);
+
+fail:
+ /*
+ * If we have failed part way through block allocation, we
+ * have to deallocate any indirect blocks that we have allocated.
+ */
+
+ if (unwindidx >= 0) {
+
+ /*
+ * First write out any buffers we've created to resolve their
+ * softdeps. This must be done in reverse order of creation
+ * so that we resolve the dependencies in one pass.
+ * Write the cylinder group buffers for these buffers too.
+ */
+
+ for (i = num; i >= unwindidx; i--) {
+ if (i == 0) {
+ break;
+ }
+ if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK,
+ fs->fs_bsize, false, &bp) != 0)
+ continue;
+ if (bp->b_oflags & BO_DELWRI) {
+ nb = fsbtodb(fs, cgtod(fs, dtog(fs,
+ dbtofsb(fs, bp->b_blkno))));
+ bwrite(bp);
+ if (ffs_getblk(ip->i_devvp, nb, FFS_NOBLK,
+ fs->fs_cgsize, false, &bp) != 0)
+ continue;
+ if (bp->b_oflags & BO_DELWRI) {
+ bwrite(bp);
+ } else {
+ brelse(bp, BC_INVAL);
+ }
+ } else {
+ brelse(bp, BC_INVAL);
+ }
+ }
+
+ /*
+ * Undo the partial allocation.
+ */
+ if (unwindidx == 0) {
+ *allocib = 0;
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ } else {
+ int r;
+
+ r = bread(vp, indirs[unwindidx].in_lbn,
+ (int)fs->fs_bsize, NOCRED, 0, &bp);
+ if (r) {
+ panic("Could not unwind indirect block, error %d", r);
+ brelse(bp, 0);
+ } else {
+ bap = (int32_t *)bp->b_data; /* XXX ondisk32 */
+ bap[indirs[unwindidx].in_off] = 0;
+ bwrite(bp);
+ }
+ }
+ for (i = unwindidx + 1; i <= num; i++) {
+ if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK,
+ fs->fs_bsize, false, &bp) == 0)
+ brelse(bp, BC_INVAL);
+ }
+ }
+ for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
+ ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
+ deallocated += fs->fs_bsize;
+ }
+ if (deallocated) {
+#if defined(QUOTA) || defined(QUOTA2)
+ /*
+ * Restore user's disk quota because allocation failed.
+ */
+ (void)chkdq(ip, -btodb(deallocated), cred, FORCE);
+#endif
+ ip->i_ffs1_blocks -= btodb(deallocated);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ }
+ return (error);
+}
+
+static int
+ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
+ int flags, struct buf **bpp)
+{
+ daddr_t lbn, lastlbn;
+ struct buf *bp, *nbp;
+ struct inode *ip = VTOI(vp);
+ struct fs *fs = ip->i_fs;
+ struct ufsmount *ump = ip->i_ump;
+ struct indir indirs[NIADDR + 2];
+ daddr_t newb, pref, nb;
+ int64_t *bap;
+ int deallocated, osize, nsize, num, i, error;
+ daddr_t *blkp, *allocblk, allociblk[NIADDR + 1];
+ int64_t *allocib;
+ int unwindidx = -1;
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+ UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist);
+
+ lbn = lblkno(fs, off);
+ size = blkoff(fs, off) + size;
+ if (size > fs->fs_bsize)
+ panic("ffs_balloc: blk too big");
+ if (bpp != NULL) {
+ *bpp = NULL;
+ }
+ UVMHIST_LOG(ubchist, "vp %p lbn 0x%x size 0x%x", vp, lbn, size,0);
+
+ if (lbn < 0)
+ return (EFBIG);
+
+#ifdef notyet
+ /*
+ * Check for allocating external data.
+ */
+ if (flags & IO_EXT) {
+ if (lbn >= NXADDR)
+ return (EFBIG);
+ /*
+ * If the next write will extend the data into a new block,
+ * and the data is currently composed of a fragment
+ * this fragment has to be extended to be a full block.
+ */
+ lastlbn = lblkno(fs, dp->di_extsize);
+ if (lastlbn < lbn) {
+ nb = lastlbn;
+ osize = sblksize(fs, dp->di_extsize, nb);
+ if (osize < fs->fs_bsize && osize > 0) {
+ mutex_enter(&ump->um_lock);
+ error = ffs_realloccg(ip, -1 - nb,
+ dp->di_extb[nb],
+ ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
+ flags, &dp->di_extb[0]),
+ osize,
+ (int)fs->fs_bsize, cred, &bp);
+ if (error)
+ return (error);
+ dp->di_extsize = smalllblktosize(fs, nb + 1);
+ dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
+ bp->b_xflags |= BX_ALTDATA;
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (flags & IO_SYNC)
+ bwrite(bp);
+ else
+ bawrite(bp);
+ }
+ }
+ /*
+ * All blocks are direct blocks
+ */
+ if (flags & BA_METAONLY)
+ panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
+ nb = dp->di_extb[lbn];
+ if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
+ error = bread(vp, -1 - lbn, fs->fs_bsize,
+ NOCRED, 0, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+ mutex_enter(&bp->b_interlock);
+ bp->b_blkno = fsbtodb(fs, nb);
+ bp->b_xflags |= BX_ALTDATA;
+ mutex_exit(&bp->b_interlock);
+ *bpp = bp;
+ return (0);
+ }
+ if (nb != 0) {
+ /*
+ * Consider need to reallocate a fragment.
+ */
+ osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
+ nsize = fragroundup(fs, size);
+ if (nsize <= osize) {
+ error = bread(vp, -1 - lbn, osize,
+ NOCRED, 0, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+ mutex_enter(&bp->b_interlock);
+ bp->b_blkno = fsbtodb(fs, nb);
+ bp->b_xflags |= BX_ALTDATA;
+ mutex_exit(&bp->b_interlock);
+ } else {
+ mutex_enter(&ump->um_lock);
+ error = ffs_realloccg(ip, -1 - lbn,
+ dp->di_extb[lbn],
+ ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
+ &dp->di_extb[0]),
+ osize, nsize, cred, &bp);
+ if (error)
+ return (error);
+ bp->b_xflags |= BX_ALTDATA;
+ }
+ } else {
+ if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
+ nsize = fragroundup(fs, size);
+ else
+ nsize = fs->fs_bsize;
+ mutex_enter(&ump->um_lock);
+ error = ffs_alloc(ip, lbn,
+ ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
+ &dp->di_extb[0]),
+ nsize, flags, cred, &newb);
+ if (error)
+ return (error);
+ error = ffs_getblk(vp, -1 - lbn, fsbtodb(fs, newb),
+ nsize, (flags & BA_CLRBUF) != 0, &bp);
+ if (error)
+ return error;
+ bp->b_xflags |= BX_ALTDATA;
+ }
+ dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ *bpp = bp;
+ return (0);
+ }
+#endif
+ /*
+ * If the next write will extend the file into a new block,
+ * and the file is currently composed of a fragment
+ * this fragment has to be extended to be a full block.
+ */
+
+ lastlbn = lblkno(fs, ip->i_size);
+ if (lastlbn < NDADDR && lastlbn < lbn) {
+ nb = lastlbn;
+ osize = blksize(fs, ip, nb);
+ if (osize < fs->fs_bsize && osize > 0) {
+ mutex_enter(&ump->um_lock);
+ error = ffs_realloccg(ip, nb,
+ ffs_blkpref_ufs2(ip, lastlbn, nb, flags,
+ &ip->i_ffs2_db[0]),
+ osize, (int)fs->fs_bsize, cred, bpp, &newb);
+ if (error)
+ return (error);
+ ip->i_size = lblktosize(fs, nb + 1);
+ ip->i_ffs2_size = ip->i_size;
+ uvm_vnp_setsize(vp, ip->i_size);
+ ip->i_ffs2_db[nb] = ufs_rw64(newb, needswap);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (bpp) {
+ if (flags & B_SYNC)
+ bwrite(*bpp);
+ else
+ bawrite(*bpp);
+ }
+ }
+ }
+
+ /*
+ * The first NDADDR blocks are direct blocks
+ */
+
+ if (lbn < NDADDR) {
+ nb = ufs_rw64(ip->i_ffs2_db[lbn], needswap);
+ if (nb != 0 && ip->i_size >= lblktosize(fs, lbn + 1)) {
+
+ /*
+ * The block is an already-allocated direct block
+ * and the file already extends past this block,
+ * thus this must be a whole block.
+ * Just read the block (if requested).
+ */
+
+ if (bpp != NULL) {
+ error = bread(vp, lbn, fs->fs_bsize, NOCRED,
+ B_MODIFY, bpp);
+ if (error) {
+ brelse(*bpp, 0);
+ return (error);
+ }
+ }
+ return (0);
+ }
+ if (nb != 0) {
+
+ /*
+ * Consider need to reallocate a fragment.
+ */
+
+ osize = fragroundup(fs, blkoff(fs, ip->i_size));
+ nsize = fragroundup(fs, size);
+ if (nsize <= osize) {
+
+ /*
+ * The existing block is already
+ * at least as big as we want.
+ * Just read the block (if requested).
+ */
+
+ if (bpp != NULL) {
+ error = bread(vp, lbn, osize, NOCRED,
+ B_MODIFY, bpp);
+ if (error) {
+ brelse(*bpp, 0);
+ return (error);
+ }
+ }
+ return 0;
+ } else {
+
+ /*
+ * The existing block is smaller than we want,
+ * grow it.
+ */
+ mutex_enter(&ump->um_lock);
+ error = ffs_realloccg(ip, lbn,
+ ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
+ &ip->i_ffs2_db[0]),
+ osize, nsize, cred, bpp, &newb);
+ if (error)
+ return (error);
+ }
+ } else {
+
+ /*
+ * the block was not previously allocated,
+ * allocate a new block or fragment.
+ */
+
+ if (ip->i_size < lblktosize(fs, lbn + 1))
+ nsize = fragroundup(fs, size);
+ else
+ nsize = fs->fs_bsize;
+ mutex_enter(&ump->um_lock);
+ error = ffs_alloc(ip, lbn,
+ ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
+ &ip->i_ffs2_db[0]),
+ nsize, flags, cred, &newb);
+ if (error)
+ return (error);
+ if (bpp != NULL) {
+ error = ffs_getblk(vp, lbn, fsbtodb(fs, newb),
+ nsize, (flags & B_CLRBUF) != 0, bpp);
+ if (error)
+ return error;
+ }
+ }
+ ip->i_ffs2_db[lbn] = ufs_rw64(newb, needswap);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ return (0);
+ }
+
+ /*
+ * Determine the number of levels of indirection.
+ */
+
+ pref = 0;
+ if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
+ return (error);
+
+ /*
+ * Fetch the first indirect block allocating if necessary.
+ */
+
+ --num;
+ nb = ufs_rw64(ip->i_ffs2_ib[indirs[0].in_off], needswap);
+ allocib = NULL;
+ allocblk = allociblk;
+ if (nb == 0) {
+ mutex_enter(&ump->um_lock);
+ pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY, NULL);
+ error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+ flags | B_METAONLY, cred, &newb);
+ if (error)
+ goto fail;
+ nb = newb;
+ *allocblk++ = nb;
+ error = ffs_getblk(vp, indirs[1].in_lbn, fsbtodb(fs, nb),
+ fs->fs_bsize, true, &bp);
+ if (error)
+ goto fail;
+ /*
+ * Write synchronously so that indirect blocks
+ * never point at garbage.
+ */
+ if ((error = bwrite(bp)) != 0)
+ goto fail;
+ unwindidx = 0;
+ allocib = &ip->i_ffs2_ib[indirs[0].in_off];
+ *allocib = ufs_rw64(nb, needswap);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ }
+
+ /*
+ * Fetch through the indirect blocks, allocating as necessary.
+ */
+
+ for (i = 1;;) {
+ error = bread(vp,
+ indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, 0, &bp);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ bap = (int64_t *)bp->b_data;
+ nb = ufs_rw64(bap[indirs[i].in_off], needswap);
+ if (i == num)
+ break;
+ i++;
+ if (nb != 0) {
+ brelse(bp, 0);
+ continue;
+ }
+ if (fscow_run(bp, true) != 0) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ mutex_enter(&ump->um_lock);
+ /* Try to keep snapshot indirect blocks contiguous. */
+ if (i == num && (ip->i_flags & SF_SNAPSHOT) != 0)
+ pref = ffs_blkpref_ufs2(ip, lbn, indirs[i-1].in_off,
+ flags | B_METAONLY, &bap[0]);
+ if (pref == 0)
+ pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY,
+ NULL);
+ error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+ flags | B_METAONLY, cred, &newb);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ nb = newb;
+ *allocblk++ = nb;
+ error = ffs_getblk(vp, indirs[i].in_lbn, fsbtodb(fs, nb),
+ fs->fs_bsize, true, &nbp);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ /*
+ * Write synchronously so that indirect blocks
+ * never point at garbage.
+ */
+ if ((error = bwrite(nbp)) != 0) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ if (unwindidx < 0)
+ unwindidx = i - 1;
+ bap[indirs[i - 1].in_off] = ufs_rw64(nb, needswap);
+
+ /*
+ * If required, write synchronously, otherwise use
+ * delayed write.
+ */
+
+ if (flags & B_SYNC) {
+ bwrite(bp);
+ } else {
+ bdwrite(bp);
+ }
+ }
+
+ if (flags & B_METAONLY) {
+ KASSERT(bpp != NULL);
+ *bpp = bp;
+ return (0);
+ }
+
+ /*
+ * Get the data block, allocating if necessary.
+ */
+
+ if (nb == 0) {
+ if (fscow_run(bp, true) != 0) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ mutex_enter(&ump->um_lock);
+ pref = ffs_blkpref_ufs2(ip, lbn, indirs[num].in_off, flags,
+ &bap[0]);
+ error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred,
+ &newb);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ nb = newb;
+ *allocblk++ = nb;
+ if (bpp != NULL) {
+ error = ffs_getblk(vp, lbn, fsbtodb(fs, nb),
+ fs->fs_bsize, (flags & B_CLRBUF) != 0, bpp);
+ if (error) {
+ brelse(bp, 0);
+ goto fail;
+ }
+ }
+ bap[indirs[num].in_off] = ufs_rw64(nb, needswap);
+ if (allocib == NULL && unwindidx < 0) {
+ unwindidx = i - 1;
+ }
+
+ /*
+ * If required, write synchronously, otherwise use
+ * delayed write.
+ */
+
+ if (flags & B_SYNC) {
+ bwrite(bp);
+ } else {
+ bdwrite(bp);
+ }
+ return (0);
+ }
+ brelse(bp, 0);
+ if (bpp != NULL) {
+ if (flags & B_CLRBUF) {
+ error = bread(vp, lbn, (int)fs->fs_bsize,
+ NOCRED, B_MODIFY, &nbp);
+ if (error) {
+ brelse(nbp, 0);
+ goto fail;
+ }
+ } else {
+ error = ffs_getblk(vp, lbn, fsbtodb(fs, nb),
+ fs->fs_bsize, true, &nbp);
+ if (error)
+ goto fail;
+ }
+ *bpp = nbp;
+ }
+ return (0);
+
+fail:
+ /*
+ * If we have failed part way through block allocation, we
+ * have to deallocate any indirect blocks that we have allocated.
+ */
+
+ if (unwindidx >= 0) {
+
+ /*
+ * First write out any buffers we've created to resolve their
+ * softdeps. This must be done in reverse order of creation
+ * so that we resolve the dependencies in one pass.
+ * Write the cylinder group buffers for these buffers too.
+ */
+
+ for (i = num; i >= unwindidx; i--) {
+ if (i == 0) {
+ break;
+ }
+ if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK,
+ fs->fs_bsize, false, &bp) != 0)
+ continue;
+ if (bp->b_oflags & BO_DELWRI) {
+ nb = fsbtodb(fs, cgtod(fs, dtog(fs,
+ dbtofsb(fs, bp->b_blkno))));
+ bwrite(bp);
+ if (ffs_getblk(ip->i_devvp, nb, FFS_NOBLK,
+ fs->fs_cgsize, false, &bp) != 0)
+ continue;
+ if (bp->b_oflags & BO_DELWRI) {
+ bwrite(bp);
+ } else {
+ brelse(bp, BC_INVAL);
+ }
+ } else {
+ brelse(bp, BC_INVAL);
+ }
+ }
+
+ /*
+ * Now that any dependencies that we created have been
+ * resolved, we can undo the partial allocation.
+ */
+
+ if (unwindidx == 0) {
+ *allocib = 0;
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ } else {
+ int r;
+
+ r = bread(vp, indirs[unwindidx].in_lbn,
+ (int)fs->fs_bsize, NOCRED, 0, &bp);
+ if (r) {
+ panic("Could not unwind indirect block, error %d", r);
+ brelse(bp, 0);
+ } else {
+ bap = (int64_t *)bp->b_data;
+ bap[indirs[unwindidx].in_off] = 0;
+ bwrite(bp);
+ }
+ }
+ for (i = unwindidx + 1; i <= num; i++) {
+ if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK,
+ fs->fs_bsize, false, &bp) == 0)
+ brelse(bp, BC_INVAL);
+ }
+ }
+ for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
+ ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
+ deallocated += fs->fs_bsize;
+ }
+ if (deallocated) {
+#if defined(QUOTA) || defined(QUOTA2)
+ /*
+ * Restore user's disk quota because allocation failed.
+ */
+ (void)chkdq(ip, -btodb(deallocated), cred, FORCE);
+#endif
+ ip->i_ffs2_blocks -= btodb(deallocated);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ }
+
+ return (error);
+}
--- /dev/null
+/* $NetBSD: ffs_bswap.c,v 1.35 2011/03/06 17:08:38 bouyer Exp $ */
+
+/*
+ * Copyright (c) 1998 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#if HAVE_NBTOOL_CONFIG_H
+#include "nbtool_config.h"
+#endif
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_bswap.c,v 1.35 2011/03/06 17:08:38 bouyer Exp $");
+
+#include <sys/param.h>
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#endif
+
+#include <ufs/ufs/dinode.h>
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#if !defined(_KERNEL)
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define panic(x) printf("%s\n", (x)), abort()
+#endif
+
+void
+ffs_sb_swap(struct fs *o, struct fs *n)
+{
+ size_t i;
+ u_int32_t *o32, *n32;
+
+ /*
+	 * To avoid a long list of assignments: the first 52 fields of the
+	 * superblock, up to fs_fmod, are all u_int32_t, so we just loop
+	 * over them here to convert.
+ */
+ o32 = (u_int32_t *)o;
+ n32 = (u_int32_t *)n;
+ for (i = 0; i < offsetof(struct fs, fs_fmod) / sizeof(u_int32_t); i++)
+ n32[i] = bswap32(o32[i]);
+
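+	/*
+	 * The fields past fs_fmod are converted individually below: many
+	 * are 64-bit quantities, and several overlap the historic
+	 * rotational-layout (postbl) tables.
+	 */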
+ n->fs_swuid = bswap64(o->fs_swuid);
+ n->fs_cgrotor = bswap32(o->fs_cgrotor); /* Unused */
+ n->fs_old_cpc = bswap32(o->fs_old_cpc);
+
+ /* These fields overlap with a possible location for the
+ * historic FS_DYNAMICPOSTBLFMT postbl table, and with the
+ * first half of the historic FS_42POSTBLFMT postbl table.
+ */
+ n->fs_maxbsize = bswap32(o->fs_maxbsize);
+ /* XXX journal */
+ n->fs_quota_magic = bswap32(o->fs_quota_magic);
+ for (i = 0; i < MAXQUOTAS; i++)
+ n->fs_quotafile[i] = bswap64(o->fs_quotafile[i]);
+ n->fs_sblockloc = bswap64(o->fs_sblockloc);
+ ffs_csumtotal_swap(&o->fs_cstotal, &n->fs_cstotal);
+ n->fs_time = bswap64(o->fs_time);
+ n->fs_size = bswap64(o->fs_size);
+ n->fs_dsize = bswap64(o->fs_dsize);
+ n->fs_csaddr = bswap64(o->fs_csaddr);
+ n->fs_pendingblocks = bswap64(o->fs_pendingblocks);
+ n->fs_pendinginodes = bswap32(o->fs_pendinginodes);
+
+ /* These fields overlap with the second half of the
+ * historic FS_42POSTBLFMT postbl table
+ */
+ for (i = 0; i < FSMAXSNAP; i++)
+ n->fs_snapinum[i] = bswap32(o->fs_snapinum[i]);
+ n->fs_avgfilesize = bswap32(o->fs_avgfilesize);
+ n->fs_avgfpdir = bswap32(o->fs_avgfpdir);
+ /* fs_sparecon[28] - ignore for now */
+ n->fs_flags = bswap32(o->fs_flags);
+ n->fs_contigsumsize = bswap32(o->fs_contigsumsize);
+ n->fs_maxsymlinklen = bswap32(o->fs_maxsymlinklen);
+ n->fs_old_inodefmt = bswap32(o->fs_old_inodefmt);
+ n->fs_maxfilesize = bswap64(o->fs_maxfilesize);
+ n->fs_qbmask = bswap64(o->fs_qbmask);
+ n->fs_qfmask = bswap64(o->fs_qfmask);
+ n->fs_state = bswap32(o->fs_state);
+ n->fs_old_postblformat = bswap32(o->fs_old_postblformat);
+ n->fs_old_nrpos = bswap32(o->fs_old_nrpos);
+ n->fs_old_postbloff = bswap32(o->fs_old_postbloff);
+ n->fs_old_rotbloff = bswap32(o->fs_old_rotbloff);
+
+ n->fs_magic = bswap32(o->fs_magic);
+}
+
+void
+ffs_dinode1_swap(struct ufs1_dinode *o, struct ufs1_dinode *n)
+{
+
+ n->di_mode = bswap16(o->di_mode);
+ n->di_nlink = bswap16(o->di_nlink);
+ n->di_u.oldids[0] = bswap16(o->di_u.oldids[0]);
+ n->di_u.oldids[1] = bswap16(o->di_u.oldids[1]);
+ n->di_size = bswap64(o->di_size);
+ n->di_atime = bswap32(o->di_atime);
+ n->di_atimensec = bswap32(o->di_atimensec);
+ n->di_mtime = bswap32(o->di_mtime);
+ n->di_mtimensec = bswap32(o->di_mtimensec);
+ n->di_ctime = bswap32(o->di_ctime);
+ n->di_ctimensec = bswap32(o->di_ctimensec);
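+	/*
+	 * The direct and indirect block pointers are copied unswapped;
+	 * they are kept in on-disk byte order in core and converted on
+	 * access with ufs_rw32().
+	 */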
+ memcpy(n->di_db, o->di_db, (NDADDR + NIADDR) * sizeof(u_int32_t));
+ n->di_flags = bswap32(o->di_flags);
+ n->di_blocks = bswap32(o->di_blocks);
+ n->di_gen = bswap32(o->di_gen);
+ n->di_uid = bswap32(o->di_uid);
+ n->di_gid = bswap32(o->di_gid);
+}
+
+void
+ffs_dinode2_swap(struct ufs2_dinode *o, struct ufs2_dinode *n)
+{
+ n->di_mode = bswap16(o->di_mode);
+ n->di_nlink = bswap16(o->di_nlink);
+ n->di_uid = bswap32(o->di_uid);
+ n->di_gid = bswap32(o->di_gid);
+ n->di_blksize = bswap32(o->di_blksize);
+ n->di_size = bswap64(o->di_size);
+ n->di_blocks = bswap64(o->di_blocks);
+ n->di_atime = bswap64(o->di_atime);
+ n->di_atimensec = bswap32(o->di_atimensec);
+ n->di_mtime = bswap64(o->di_mtime);
+ n->di_mtimensec = bswap32(o->di_mtimensec);
+ n->di_ctime = bswap64(o->di_ctime);
+ n->di_ctimensec = bswap32(o->di_ctimensec);
+ n->di_birthtime = bswap64(o->di_birthtime);
+ n->di_birthnsec = bswap32(o->di_birthnsec);
+ n->di_gen = bswap32(o->di_gen);
+ n->di_kernflags = bswap32(o->di_kernflags);
+ n->di_flags = bswap32(o->di_flags);
+ n->di_extsize = bswap32(o->di_extsize);
+ memcpy(n->di_extb, o->di_extb, (NXADDR + NDADDR + NIADDR) * 8);
+}
+
+void
+ffs_csum_swap(struct csum *o, struct csum *n, int size)
+{
+ size_t i;
+ u_int32_t *oint, *nint;
+
+ oint = (u_int32_t*)o;
+ nint = (u_int32_t*)n;
+
+ for (i = 0; i < size / sizeof(u_int32_t); i++)
+ nint[i] = bswap32(oint[i]);
+}
+
+void
+ffs_csumtotal_swap(struct csum_total *o, struct csum_total *n)
+{
+ n->cs_ndir = bswap64(o->cs_ndir);
+ n->cs_nbfree = bswap64(o->cs_nbfree);
+ n->cs_nifree = bswap64(o->cs_nifree);
+ n->cs_nffree = bswap64(o->cs_nffree);
+}
+
+/*
+ * Note that ffs_cg_swap may be called with o == n.
+ */
+void
+ffs_cg_swap(struct cg *o, struct cg *n, struct fs *fs)
+{
+ int i;
+ u_int32_t *n32, *o32;
+ u_int16_t *n16, *o16;
+ int32_t btotoff, boff, clustersumoff;
+
+ n->cg_firstfield = bswap32(o->cg_firstfield);
+ n->cg_magic = bswap32(o->cg_magic);
+ n->cg_old_time = bswap32(o->cg_old_time);
+ n->cg_cgx = bswap32(o->cg_cgx);
+ n->cg_old_ncyl = bswap16(o->cg_old_ncyl);
+ n->cg_old_niblk = bswap16(o->cg_old_niblk);
+ n->cg_ndblk = bswap32(o->cg_ndblk);
+ n->cg_cs.cs_ndir = bswap32(o->cg_cs.cs_ndir);
+ n->cg_cs.cs_nbfree = bswap32(o->cg_cs.cs_nbfree);
+ n->cg_cs.cs_nifree = bswap32(o->cg_cs.cs_nifree);
+ n->cg_cs.cs_nffree = bswap32(o->cg_cs.cs_nffree);
+ n->cg_rotor = bswap32(o->cg_rotor);
+ n->cg_frotor = bswap32(o->cg_frotor);
+ n->cg_irotor = bswap32(o->cg_irotor);
+ for (i = 0; i < MAXFRAG; i++)
+ n->cg_frsum[i] = bswap32(o->cg_frsum[i]);
+
+ if ((fs->fs_magic != FS_UFS2_MAGIC) &&
+ (fs->fs_old_postblformat == FS_42POSTBLFMT)) { /* old format */
+ struct ocg *on, *oo;
+ int j;
+ on = (struct ocg *)n;
+ oo = (struct ocg *)o;
+
+ for (i = 0; i < 32; i++) {
+ on->cg_btot[i] = bswap32(oo->cg_btot[i]);
+ for (j = 0; j < 8; j++)
+ on->cg_b[i][j] = bswap16(oo->cg_b[i][j]);
+ }
+ memmove(on->cg_iused, oo->cg_iused, 256);
+ on->cg_magic = bswap32(oo->cg_magic);
+ } else { /* new format */
+
+ n->cg_old_btotoff = bswap32(o->cg_old_btotoff);
+ n->cg_old_boff = bswap32(o->cg_old_boff);
+ n->cg_iusedoff = bswap32(o->cg_iusedoff);
+ n->cg_freeoff = bswap32(o->cg_freeoff);
+ n->cg_nextfreeoff = bswap32(o->cg_nextfreeoff);
+ n->cg_clustersumoff = bswap32(o->cg_clustersumoff);
+ n->cg_clusteroff = bswap32(o->cg_clusteroff);
+ n->cg_nclusterblks = bswap32(o->cg_nclusterblks);
+ n->cg_niblk = bswap32(o->cg_niblk);
+ n->cg_initediblk = bswap32(o->cg_initediblk);
+ n->cg_time = bswap64(o->cg_time);
+
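+		/*
+		 * ffs_cg_swap may be called with o == n, in which case the
+		 * offset fields were just byte-swapped in place.  If cg_magic
+		 * now reads correctly, the swapped offsets are already in
+		 * host byte order and can be used directly; otherwise swap
+		 * them once more to obtain host-order offsets.
+		 */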
+ if (n->cg_magic == CG_MAGIC) {
+ btotoff = n->cg_old_btotoff;
+ boff = n->cg_old_boff;
+ clustersumoff = n->cg_clustersumoff;
+ } else {
+ btotoff = bswap32(n->cg_old_btotoff);
+ boff = bswap32(n->cg_old_boff);
+ clustersumoff = bswap32(n->cg_clustersumoff);
+ }
+
+ n32 = (u_int32_t *)((u_int8_t *)n + clustersumoff);
+ o32 = (u_int32_t *)((u_int8_t *)o + clustersumoff);
+ for (i = 1; i < fs->fs_contigsumsize + 1; i++)
+ n32[i] = bswap32(o32[i]);
+
+ if (fs->fs_magic == FS_UFS2_MAGIC)
+ return;
+
+ n32 = (u_int32_t *)((u_int8_t *)n + btotoff);
+ o32 = (u_int32_t *)((u_int8_t *)o + btotoff);
+ n16 = (u_int16_t *)((u_int8_t *)n + boff);
+ o16 = (u_int16_t *)((u_int8_t *)o + boff);
+
+ for (i = 0; i < fs->fs_old_cpg; i++)
+ n32[i] = bswap32(o32[i]);
+
+ for (i = 0; i < fs->fs_old_cpg * fs->fs_old_nrpos; i++)
+ n16[i] = bswap16(o16[i]);
+ }
+}
--- /dev/null
+/* $NetBSD: ffs_inode.c,v 1.108 2011/11/23 19:42:10 bouyer Exp $ */
+
+/*-
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.108 2011/11/23 19:42:10 bouyer Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/fstrans.h>
+#include <sys/kauth.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/trace.h>
+#include <sys/vnode.h>
+#include <sys/wapbl.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+static int ffs_indirtrunc(struct inode *, daddr_t, daddr_t, daddr_t, int,
+ int64_t *);
+
+/*
+ * Update the access, modified, and inode change times as specified
+ * by the IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively.
+ * The IN_MODIFIED flag is used to specify that the inode needs to be
+ * updated but that the times have already been set. The access
+ * and modified times are taken from the second and third parameters;
+ * the inode change time is always taken from the current time.  If
+ * the UPDATE_WAIT or UPDATE_DIROP flag is set, wait for the disk
+ * write of the inode to complete.
+ */
+
+int
+ffs_update(struct vnode *vp, const struct timespec *acc,
+ const struct timespec *mod, int updflags)
+{
+ struct fs *fs;
+ struct buf *bp;
+ struct inode *ip;
+ int error;
+ void *cp;
+ int waitfor, flags;
+
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (0);
+ ip = VTOI(vp);
+ FFS_ITIMES(ip, acc, mod, NULL);
+ if (updflags & UPDATE_CLOSE)
+ flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED);
+ else
+ flags = ip->i_flag & IN_MODIFIED;
+ if (flags == 0)
+ return (0);
+ fs = ip->i_fs;
+
+ if ((flags & IN_MODIFIED) != 0 &&
+ (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) {
+ waitfor = updflags & UPDATE_WAIT;
+ if ((updflags & UPDATE_DIROP) != 0)
+ waitfor |= UPDATE_WAIT;
+ } else
+ waitfor = 0;
+
+ /*
+ * Ensure that uid and gid are correct. This is a temporary
+ * fix until fsck has been changed to do the update.
+ */
+ if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */
+ fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */
+ ip->i_ffs1_ouid = ip->i_uid; /* XXX */
+ ip->i_ffs1_ogid = ip->i_gid; /* XXX */
+ } /* XXX */
+ error = bread(ip->i_devvp,
+ fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
+ (int)fs->fs_bsize, NOCRED, B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+ ip->i_flag &= ~(IN_MODIFIED | IN_ACCESSED);
+ /* Keep unlinked inode list up to date */
+ KDASSERT(DIP(ip, nlink) == ip->i_nlink);
+ if (ip->i_mode) {
+ if (ip->i_nlink > 0) {
+ UFS_WAPBL_UNREGISTER_INODE(ip->i_ump->um_mountp,
+ ip->i_number, ip->i_mode);
+ } else {
+ UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp,
+ ip->i_number, ip->i_mode);
+ }
+ }
+ if (fs->fs_magic == FS_UFS1_MAGIC) {
+ cp = (char *)bp->b_data +
+ (ino_to_fsbo(fs, ip->i_number) * DINODE1_SIZE);
+#ifdef FFS_EI
+ if (UFS_FSNEEDSWAP(fs))
+ ffs_dinode1_swap(ip->i_din.ffs1_din,
+ (struct ufs1_dinode *)cp);
+ else
+#endif
+ memcpy(cp, ip->i_din.ffs1_din, DINODE1_SIZE);
+ } else {
+ cp = (char *)bp->b_data +
+ (ino_to_fsbo(fs, ip->i_number) * DINODE2_SIZE);
+#ifdef FFS_EI
+ if (UFS_FSNEEDSWAP(fs))
+ ffs_dinode2_swap(ip->i_din.ffs2_din,
+ (struct ufs2_dinode *)cp);
+ else
+#endif
+ memcpy(cp, ip->i_din.ffs2_din, DINODE2_SIZE);
+ }
+ if (waitfor) {
+ return (bwrite(bp));
+ } else {
+ bdwrite(bp);
+ return (0);
+ }
+}
+
+#define SINGLE 0 /* index of single indirect block */
+#define DOUBLE 1 /* index of double indirect block */
+#define TRIPLE 2 /* index of triple indirect block */
+/*
+ * Truncate the inode oip to at most length size, freeing the
+ * disk blocks.
+ */
+int
+ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred)
+{
+ daddr_t lastblock;
+ struct inode *oip = VTOI(ovp);
+ daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR];
+ daddr_t blks[NDADDR + NIADDR];
+ struct fs *fs;
+ int offset, pgoffset, level;
+ int64_t count, blocksreleased = 0;
+ int i, aflag, nblocks;
+ int error, allerror = 0;
+ off_t osize;
+ int sync;
+ struct ufsmount *ump = oip->i_ump;
+
+ if (ovp->v_type == VCHR || ovp->v_type == VBLK ||
+ ovp->v_type == VFIFO || ovp->v_type == VSOCK) {
+ KASSERT(oip->i_size == 0);
+ return 0;
+ }
+
+ if (length < 0)
+ return (EINVAL);
+
+ if (ovp->v_type == VLNK &&
+ (oip->i_size < ump->um_maxsymlinklen ||
+ (ump->um_maxsymlinklen == 0 && DIP(oip, blocks) == 0))) {
+ KDASSERT(length == 0);
+ memset(SHORTLINK(oip), 0, (size_t)oip->i_size);
+ oip->i_size = 0;
+ DIP_ASSIGN(oip, size, 0);
+ oip->i_flag |= IN_CHANGE | IN_UPDATE;
+ return (ffs_update(ovp, NULL, NULL, 0));
+ }
+ if (oip->i_size == length) {
+ /* still do a uvm_vnp_setsize() as writesize may be larger */
+ uvm_vnp_setsize(ovp, length);
+ oip->i_flag |= IN_CHANGE | IN_UPDATE;
+ return (ffs_update(ovp, NULL, NULL, 0));
+ }
+ fs = oip->i_fs;
+ if (length > ump->um_maxfilesize)
+ return (EFBIG);
+
+ if ((oip->i_flags & SF_SNAPSHOT) != 0)
+ ffs_snapremove(ovp);
+
+ osize = oip->i_size;
+ aflag = ioflag & IO_SYNC ? B_SYNC : 0;
+
+ /*
+ * Lengthen the size of the file. We must ensure that the
+ * last byte of the file is allocated. Since the smallest
+ * value of osize is 0, length will be at least 1.
+ */
+
+ if (osize < length) {
+ if (lblkno(fs, osize) < NDADDR &&
+ lblkno(fs, osize) != lblkno(fs, length) &&
+ blkroundup(fs, osize) != osize) {
+ off_t eob;
+
+ eob = blkroundup(fs, osize);
+ uvm_vnp_setwritesize(ovp, eob);
+ error = ufs_balloc_range(ovp, osize, eob - osize,
+ cred, aflag);
+ if (error) {
+ (void) ffs_truncate(ovp, osize,
+ ioflag & IO_SYNC, cred);
+ return error;
+ }
+ if (ioflag & IO_SYNC) {
+ mutex_enter(ovp->v_interlock);
+ VOP_PUTPAGES(ovp,
+ trunc_page(osize & fs->fs_bmask),
+ round_page(eob), PGO_CLEANIT | PGO_SYNCIO |
+ PGO_JOURNALLOCKED);
+ }
+ }
+ uvm_vnp_setwritesize(ovp, length);
+ error = ufs_balloc_range(ovp, length - 1, 1, cred, aflag);
+ if (error) {
+ (void) ffs_truncate(ovp, osize, ioflag & IO_SYNC, cred);
+ return (error);
+ }
+ uvm_vnp_setsize(ovp, length);
+ oip->i_flag |= IN_CHANGE | IN_UPDATE;
+ KASSERT(ovp->v_size == oip->i_size);
+ return (ffs_update(ovp, NULL, NULL, 0));
+ }
+
+ /*
+ * When truncating a regular file down to a non-block-aligned size,
+ * we must zero the part of last block which is past the new EOF.
+ * We must synchronously flush the zeroed pages to disk
+ * since the new pages will be invalidated as soon as we
+ * inform the VM system of the new, smaller size.
+ * We must do this before acquiring the GLOCK, since fetching
+ * the pages will acquire the GLOCK internally.
+ * So there is a window where another thread could see a whole
+ * zeroed page past EOF, but that's life.
+ */
+
+ offset = blkoff(fs, length);
+ pgoffset = length & PAGE_MASK;
+ if (ovp->v_type == VREG && (pgoffset != 0 || offset != 0) &&
+ osize > length) {
+ daddr_t lbn;
+ voff_t eoz;
+ int size;
+
+ if (offset != 0) {
+ error = ufs_balloc_range(ovp, length - 1, 1, cred,
+ aflag);
+ if (error)
+ return error;
+ }
+ lbn = lblkno(fs, length);
+ size = blksize(fs, oip, lbn);
+ eoz = MIN(MAX(lblktosize(fs, lbn) + size, round_page(pgoffset)),
+ osize);
+ ubc_zerorange(&ovp->v_uobj, length, eoz - length,
+ UBC_UNMAP_FLAG(ovp));
+ if (round_page(eoz) > round_page(length)) {
+ mutex_enter(ovp->v_interlock);
+ error = VOP_PUTPAGES(ovp, round_page(length),
+ round_page(eoz),
+ PGO_CLEANIT | PGO_DEACTIVATE | PGO_JOURNALLOCKED |
+ ((ioflag & IO_SYNC) ? PGO_SYNCIO : 0));
+ if (error)
+ return error;
+ }
+ }
+
+ genfs_node_wrlock(ovp);
+ oip->i_size = length;
+ DIP_ASSIGN(oip, size, length);
+ uvm_vnp_setsize(ovp, length);
+ /*
+ * Calculate index into inode's block list of
+ * last direct and indirect blocks (if any)
+ * which we want to keep. Lastblock is -1 when
+ * the file is truncated to 0.
+ */
+ lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
+ lastiblock[SINGLE] = lastblock - NDADDR;
+ lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
+ lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
+ nblocks = btodb(fs->fs_bsize);
+ /*
+ * Update file and block pointers on disk before we start freeing
+ * blocks. If we crash before free'ing blocks below, the blocks
+ * will be returned to the free list. lastiblock values are also
+ * normalized to -1 for calls to ffs_indirtrunc below.
+ */
+ sync = 0;
+ for (level = TRIPLE; level >= SINGLE; level--) {
+ blks[NDADDR + level] = DIP(oip, ib[level]);
+ if (lastiblock[level] < 0 && blks[NDADDR + level] != 0) {
+ sync = 1;
+ DIP_ASSIGN(oip, ib[level], 0);
+ lastiblock[level] = -1;
+ }
+ }
+ for (i = 0; i < NDADDR; i++) {
+ blks[i] = DIP(oip, db[i]);
+ if (i > lastblock && blks[i] != 0) {
+ sync = 1;
+ DIP_ASSIGN(oip, db[i], 0);
+ }
+ }
+ oip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (sync) {
+ error = ffs_update(ovp, NULL, NULL, UPDATE_WAIT);
+ if (error && !allerror)
+ allerror = error;
+ }
+
+ /*
+ * Having written the new inode to disk, save its new configuration
+ * and put back the old block pointers long enough to process them.
+ * Note that we save the new block configuration so we can check it
+ * when we are done.
+ */
+ for (i = 0; i < NDADDR; i++) {
+ bn = DIP(oip, db[i]);
+ DIP_ASSIGN(oip, db[i], blks[i]);
+ blks[i] = bn;
+ }
+ for (i = 0; i < NIADDR; i++) {
+ bn = DIP(oip, ib[i]);
+ DIP_ASSIGN(oip, ib[i], blks[NDADDR + i]);
+ blks[NDADDR + i] = bn;
+ }
+
+ oip->i_size = osize;
+ DIP_ASSIGN(oip, size, osize);
+ error = vtruncbuf(ovp, lastblock + 1, 0, 0);
+ if (error && !allerror)
+ allerror = error;
+
+ /*
+ * Indirect blocks first.
+ */
+ indir_lbn[SINGLE] = -NDADDR;
+ indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1;
+ indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1;
+ for (level = TRIPLE; level >= SINGLE; level--) {
+ if (oip->i_ump->um_fstype == UFS1)
+ bn = ufs_rw32(oip->i_ffs1_ib[level],UFS_FSNEEDSWAP(fs));
+ else
+ bn = ufs_rw64(oip->i_ffs2_ib[level],UFS_FSNEEDSWAP(fs));
+ if (bn != 0) {
+ error = ffs_indirtrunc(oip, indir_lbn[level],
+ fsbtodb(fs, bn), lastiblock[level], level, &count);
+ if (error)
+ allerror = error;
+ blocksreleased += count;
+ if (lastiblock[level] < 0) {
+ DIP_ASSIGN(oip, ib[level], 0);
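+				/*
+				 * With WAPBL the block is not freed
+				 * immediately; the deallocation is only
+				 * registered with the journal here and is
+				 * carried out later by the journal code.
+				 */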
+ if (oip->i_ump->um_mountp->mnt_wapbl) {
+ UFS_WAPBL_REGISTER_DEALLOCATION(
+ oip->i_ump->um_mountp,
+ fsbtodb(fs, bn), fs->fs_bsize);
+ } else
+ ffs_blkfree(fs, oip->i_devvp, bn,
+ fs->fs_bsize, oip->i_number);
+ blocksreleased += nblocks;
+ }
+ }
+ if (lastiblock[level] >= 0)
+ goto done;
+ }
+
+ /*
+ * All whole direct blocks or frags.
+ */
+ for (i = NDADDR - 1; i > lastblock; i--) {
+ long bsize;
+
+ if (oip->i_ump->um_fstype == UFS1)
+ bn = ufs_rw32(oip->i_ffs1_db[i], UFS_FSNEEDSWAP(fs));
+ else
+ bn = ufs_rw64(oip->i_ffs2_db[i], UFS_FSNEEDSWAP(fs));
+ if (bn == 0)
+ continue;
+ DIP_ASSIGN(oip, db[i], 0);
+ bsize = blksize(fs, oip, i);
+ if ((oip->i_ump->um_mountp->mnt_wapbl) &&
+ (ovp->v_type != VREG)) {
+ UFS_WAPBL_REGISTER_DEALLOCATION(oip->i_ump->um_mountp,
+ fsbtodb(fs, bn), bsize);
+ } else
+ ffs_blkfree(fs, oip->i_devvp, bn, bsize, oip->i_number);
+ blocksreleased += btodb(bsize);
+ }
+ if (lastblock < 0)
+ goto done;
+
+ /*
+ * Finally, look for a change in size of the
+ * last direct block; release any frags.
+ */
+ if (oip->i_ump->um_fstype == UFS1)
+ bn = ufs_rw32(oip->i_ffs1_db[lastblock], UFS_FSNEEDSWAP(fs));
+ else
+ bn = ufs_rw64(oip->i_ffs2_db[lastblock], UFS_FSNEEDSWAP(fs));
+ if (bn != 0) {
+ long oldspace, newspace;
+
+ /*
+ * Calculate amount of space we're giving
+ * back as old block size minus new block size.
+ */
+ oldspace = blksize(fs, oip, lastblock);
+ oip->i_size = length;
+ DIP_ASSIGN(oip, size, length);
+ newspace = blksize(fs, oip, lastblock);
+ if (newspace == 0)
+ panic("itrunc: newspace");
+ if (oldspace - newspace > 0) {
+ /*
+ * Block number of space to be free'd is
+ * the old block # plus the number of frags
+ * required for the storage we're keeping.
+ */
+ bn += numfrags(fs, newspace);
+ if ((oip->i_ump->um_mountp->mnt_wapbl) &&
+ (ovp->v_type != VREG)) {
+ UFS_WAPBL_REGISTER_DEALLOCATION(
+ oip->i_ump->um_mountp, fsbtodb(fs, bn),
+ oldspace - newspace);
+ } else
+ ffs_blkfree(fs, oip->i_devvp, bn,
+ oldspace - newspace, oip->i_number);
+ blocksreleased += btodb(oldspace - newspace);
+ }
+ }
+
+done:
+#ifdef DIAGNOSTIC
+ for (level = SINGLE; level <= TRIPLE; level++)
+ if (blks[NDADDR + level] != DIP(oip, ib[level]))
+ panic("itrunc1");
+ for (i = 0; i < NDADDR; i++)
+ if (blks[i] != DIP(oip, db[i]))
+ panic("itrunc2");
+ if (length == 0 &&
+ (!LIST_EMPTY(&ovp->v_cleanblkhd) || !LIST_EMPTY(&ovp->v_dirtyblkhd)))
+ panic("itrunc3");
+#endif /* DIAGNOSTIC */
+ /*
+ * Put back the real size.
+ */
+ oip->i_size = length;
+ DIP_ASSIGN(oip, size, length);
+ DIP_ADD(oip, blocks, -blocksreleased);
+ genfs_node_unlock(ovp);
+ oip->i_flag |= IN_CHANGE;
+ UFS_WAPBL_UPDATE(ovp, NULL, NULL, 0);
+#if defined(QUOTA) || defined(QUOTA2)
+ (void) chkdq(oip, -blocksreleased, NOCRED, 0);
+#endif
+ KASSERT(ovp->v_type != VREG || ovp->v_size == oip->i_size);
+ return (allerror);
+}
+
+/*
+ * Release blocks associated with the inode ip and stored in the indirect
+ * block bn. Blocks are free'd in LIFO order up to (but not including)
+ * lastbn. If level is greater than SINGLE, the block is an indirect block
+ * and recursive calls to indirtrunc must be used to cleanse other indirect
+ * blocks.
+ *
+ * NB: triple indirect blocks are untested.
+ */
+static int
+ffs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn,
+ int level, int64_t *countp)
+{
+ int i;
+ struct buf *bp;
+ struct fs *fs = ip->i_fs;
+ int32_t *bap1 = NULL;
+ int64_t *bap2 = NULL;
+ struct vnode *vp;
+ daddr_t nb, nlbn, last;
+ char *copy = NULL;
+ int64_t blkcount, factor, blocksreleased = 0;
+ int nblocks;
+ int error = 0, allerror = 0;
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+#define RBAP(ip, i) (((ip)->i_ump->um_fstype == UFS1) ? \
+ ufs_rw32(bap1[i], needswap) : ufs_rw64(bap2[i], needswap))
+#define BAP_ASSIGN(ip, i, value) \
+ do { \
+ if ((ip)->i_ump->um_fstype == UFS1) \
+ bap1[i] = (value); \
+ else \
+ bap2[i] = (value); \
+ } while(0)
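+ /*
+ * RBAP() reads entry i of the block pointer array in the buffer
+ * through the UFS1 (32-bit) or UFS2 (64-bit) view, byte swapping
+ * if the file system is opposite-endian; BAP_ASSIGN() stores into
+ * the same entry (it is only used below to zero entries, so no
+ * swapping is needed there).
+ */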
+
+ /*
+ * Calculate index in current block of last
+ * block to be kept. -1 indicates the entire
+ * block so we need not calculate the index.
+ */
+ factor = 1;
+ for (i = SINGLE; i < level; i++)
+ factor *= NINDIR(fs);
+ last = lastbn;
+ if (lastbn > 0)
+ last /= factor;
+ nblocks = btodb(fs->fs_bsize);
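+ /*
+ * At this level each pointer in the indirect block covers
+ * "factor" file system blocks (1 for a single indirect, NINDIR
+ * for a double, NINDIR*NINDIR for a triple), so "last" is the
+ * index of the last pointer to keep. "nblocks" is the number of
+ * DEV_BSIZE units per file system block, used for the block
+ * accounting below.
+ */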
+ /*
+ * Get buffer of block pointers, zero those entries corresponding
+ * to blocks to be free'd, and update on disk copy first. Since
+ * double (triple) indirect blocks are processed before single (double)
+ * indirect blocks, calls to bmap on these blocks will fail. However,
+ * we already have
+ * the on disk address, so we have to set the b_blkno field
+ * explicitly instead of letting bread do everything for us.
+ */
+ vp = ITOV(ip);
+ error = ffs_getblk(vp, lbn, FFS_NOBLK, fs->fs_bsize, false, &bp);
+ if (error) {
+ *countp = 0;
+ return error;
+ }
+ if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
+ /* Braces must be here in case trace evaluates to nothing. */
+ trace(TR_BREADHIT, pack(vp, fs->fs_bsize), lbn);
+ } else {
+ trace(TR_BREADMISS, pack(vp, fs->fs_bsize), lbn);
+ curlwp->l_ru.ru_inblock++; /* pay for read */
+ bp->b_flags |= B_READ;
+ bp->b_flags &= ~B_COWDONE; /* we change blkno below */
+ if (bp->b_bcount > bp->b_bufsize)
+ panic("ffs_indirtrunc: bad buffer size");
+ bp->b_blkno = dbn;
+ BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
+ VOP_STRATEGY(vp, bp);
+ error = biowait(bp);
+ if (error == 0)
+ error = fscow_run(bp, true);
+ }
+ if (error) {
+ brelse(bp, 0);
+ *countp = 0;
+ return (error);
+ }
+
+ if (ip->i_ump->um_fstype == UFS1)
+ bap1 = (int32_t *)bp->b_data;
+ else
+ bap2 = (int64_t *)bp->b_data;
+ if (lastbn >= 0) {
+ copy = malloc(fs->fs_bsize, M_TEMP, M_WAITOK);
+ memcpy((void *)copy, bp->b_data, (u_int)fs->fs_bsize);
+ for (i = last + 1; i < NINDIR(fs); i++)
+ BAP_ASSIGN(ip, i, 0);
+ error = bwrite(bp);
+ if (error)
+ allerror = error;
+ if (ip->i_ump->um_fstype == UFS1)
+ bap1 = (int32_t *)copy;
+ else
+ bap2 = (int64_t *)copy;
+ }
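+ /*
+ * If we truncated within this block, the entries past "last" have
+ * been zeroed in the buffer and the buffer written synchronously
+ * above, so from here on the stale pointers are followed via the
+ * saved copy only.
+ */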
+
+ /*
+ * Recursively free totally unused blocks.
+ */
+ for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
+ i--, nlbn += factor) {
+ nb = RBAP(ip, i);
+ if (nb == 0)
+ continue;
+ if (level > SINGLE) {
+ error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
+ (daddr_t)-1, level - 1,
+ &blkcount);
+ if (error)
+ allerror = error;
+ blocksreleased += blkcount;
+ }
+ if ((ip->i_ump->um_mountp->mnt_wapbl) &&
+ ((level > SINGLE) || (ITOV(ip)->v_type != VREG))) {
+ UFS_WAPBL_REGISTER_DEALLOCATION(ip->i_ump->um_mountp,
+ fsbtodb(fs, nb), fs->fs_bsize);
+ } else
+ ffs_blkfree(fs, ip->i_devvp, nb, fs->fs_bsize,
+ ip->i_number);
+ blocksreleased += nblocks;
+ }
+
+ /*
+ * Recursively free last partial block.
+ */
+ if (level > SINGLE && lastbn >= 0) {
+ last = lastbn % factor;
+ nb = RBAP(ip, i);
+ if (nb != 0) {
+ error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
+ last, level - 1, &blkcount);
+ if (error)
+ allerror = error;
+ blocksreleased += blkcount;
+ }
+ }
+
+ if (copy != NULL) {
+ free(copy, M_TEMP);
+ } else {
+ brelse(bp, BC_INVAL);
+ }
+
+ *countp = blocksreleased;
+ return (allerror);
+}
+
+void
+ffs_itimes(struct inode *ip, const struct timespec *acc,
+ const struct timespec *mod, const struct timespec *cre)
+{
+ struct timespec now;
+
+ if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) {
+ return;
+ }
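+ /*
+ * IN_ACCESS updates the access time, IN_UPDATE/IN_MODIFY the
+ * modification time (skipped for snapshot inodes) and i_modrev,
+ * and IN_CHANGE/IN_MODIFY the change time. The corresponding
+ * IN_ACCESSED/IN_MODIFIED flags are then set so the inode gets
+ * written back, and the request flags are cleared.
+ */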
+
+ vfs_timestamp(&now);
+ if (ip->i_flag & IN_ACCESS) {
+ if (acc == NULL)
+ acc = &now;
+ DIP_ASSIGN(ip, atime, acc->tv_sec);
+ DIP_ASSIGN(ip, atimensec, acc->tv_nsec);
+ }
+ if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) {
+ if ((ip->i_flags & SF_SNAPSHOT) == 0) {
+ if (mod == NULL)
+ mod = &now;
+ DIP_ASSIGN(ip, mtime, mod->tv_sec);
+ DIP_ASSIGN(ip, mtimensec, mod->tv_nsec);
+ }
+ ip->i_modrev++;
+ }
+ if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) {
+ if (cre == NULL)
+ cre = &now;
+ DIP_ASSIGN(ip, ctime, cre->tv_sec);
+ DIP_ASSIGN(ip, ctimensec, cre->tv_nsec);
+ }
+ if (ip->i_flag & (IN_ACCESS | IN_MODIFY))
+ ip->i_flag |= IN_ACCESSED;
+ if (ip->i_flag & (IN_UPDATE | IN_CHANGE))
+ ip->i_flag |= IN_MODIFIED;
+ ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY);
+}
--- /dev/null
+/* $NetBSD: ffs_quota2.c,v 1.4 2011/06/12 03:36:00 rmind Exp $ */
+/*-
+ * Copyright (c) 2010 Manuel Bouyer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_quota2.c,v 1.4 2011/06/12 03:36:00 rmind Exp $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/quota2.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ffs/ffs_extern.h>
+#include <ufs/ffs/fs.h>
+
+
+int
+ffs_quota2_mount(struct mount *mp)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct fs *fs = ump->um_fs;
+ int error = 0;
+ struct vnode *vp;
+ struct lwp *l = curlwp;
+
+ if ((fs->fs_flags & FS_DOQUOTA2) == 0)
+ return 0;
+
+ ump->um_flags |= UFS_QUOTA2;
+ ump->umq2_bsize = fs->fs_bsize;
+ ump->umq2_bmask = fs->fs_qbmask;
+ if (fs->fs_quota_magic != Q2_HEAD_MAGIC) {
+ printf("%s: Invalid quota magic number\n",
+ mp->mnt_stat.f_mntonname);
+ return EINVAL;
+ }
+ if ((fs->fs_quota_flags & FS_Q2_DO_TYPE(USRQUOTA)) &&
+ fs->fs_quotafile[USRQUOTA] == 0) {
+ printf("%s: no user quota inode\n",
+ mp->mnt_stat.f_mntonname);
+ error = EINVAL;
+ }
+ if ((fs->fs_quota_flags & FS_Q2_DO_TYPE(GRPQUOTA)) &&
+ fs->fs_quotafile[GRPQUOTA] == 0) {
+ printf("%s: no group quota inode\n",
+ mp->mnt_stat.f_mntonname);
+ error = EINVAL;
+ }
+ if (error)
+ return error;
+
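+ /*
+ * The superblock checks passed; now grab each enabled quota
+ * inode with VFS_VGET(), remember the vnode and credentials in
+ * the ufsmount, and bump v_writecount so the inode is treated as
+ * open for writing for as long as the file system is mounted.
+ */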
+ if (fs->fs_quota_flags & FS_Q2_DO_TYPE(USRQUOTA) &&
+ ump->um_quotas[USRQUOTA] == NULLVP) {
+ error = VFS_VGET(mp, fs->fs_quotafile[USRQUOTA], &vp);
+ if (error) {
+ printf("%s: can't vget() user quota inode: %d\n",
+ mp->mnt_stat.f_mntonname, error);
+ return error;
+ }
+ ump->um_quotas[USRQUOTA] = vp;
+ ump->um_cred[USRQUOTA] = l->l_cred;
+ mutex_enter(vp->v_interlock);
+ vp->v_writecount++;
+ mutex_exit(vp->v_interlock);
+ VOP_UNLOCK(vp);
+ }
+ if (fs->fs_quota_flags & FS_Q2_DO_TYPE(GRPQUOTA) &&
+ ump->um_quotas[GRPQUOTA] == NULLVP) {
+ error = VFS_VGET(mp, fs->fs_quotafile[GRPQUOTA], &vp);
+ if (error) {
+ vn_close(ump->um_quotas[USRQUOTA],
+ FREAD|FWRITE, l->l_cred);
+ printf("%s: can't vget() group quota inode: %d\n",
+ mp->mnt_stat.f_mntonname, error);
+ return error;
+ }
+ ump->um_quotas[GRPQUOTA] = vp;
+ ump->um_cred[GRPQUOTA] = l->l_cred;
+ mutex_enter(vp->v_interlock);
+ vp->v_vflag |= VV_SYSTEM;
+ vp->v_writecount++;
+ mutex_exit(vp->v_interlock);
+ VOP_UNLOCK(vp);
+ }
+ mp->mnt_flag |= MNT_QUOTA;
+ return 0;
+}
--- /dev/null
+/* $NetBSD: ffs_snapshot.c,v 1.118 2011/10/07 09:35:06 hannken Exp $ */
+
+/*
+ * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
+ *
+ * Further information about snapshots can be obtained from:
+ *
+ * Marshall Kirk McKusick http://www.mckusick.com/softdep/
+ * 1614 Oxford Street mckusick@mckusick.com
+ * Berkeley, CA 94709-1608 +1-510-843-9542
+ * USA
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00
+ *
+ * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.118 2011/10/07 09:35:06 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/sched.h>
+#include <sys/stat.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+#include <sys/vnode.h>
+#include <sys/kauth.h>
+#include <sys/fstrans.h>
+#include <sys/wapbl.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#include <uvm/uvm.h>
+
+struct snap_info {
+ kmutex_t si_lock; /* Lock this snapinfo */
+ kmutex_t si_snaplock; /* Snapshot vnode common lock */
+ lwp_t *si_owner; /* Snaplock owner */
+ TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
+ daddr_t *si_snapblklist; /* Snapshot block hints list */
+ uint32_t si_gen; /* Incremented on change */
+};
+
+#if !defined(FFS_NO_SNAPSHOT)
+typedef int (*acctfunc_t)
+ (struct vnode *, void *, int, int, struct fs *, daddr_t, int);
+
+static int snapshot_setup(struct mount *, struct vnode *);
+static int snapshot_copyfs(struct mount *, struct vnode *, void **);
+static int snapshot_expunge(struct mount *, struct vnode *,
+ struct fs *, daddr_t *, daddr_t **);
+static int snapshot_expunge_snap(struct mount *, struct vnode *,
+ struct fs *, daddr_t);
+static int snapshot_writefs(struct mount *, struct vnode *, void *);
+static int cgaccount(struct vnode *, int, int *);
+static int cgaccount1(int, struct vnode *, void *, int);
+static int expunge(struct vnode *, struct inode *, struct fs *,
+ acctfunc_t, int);
+static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
+ daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
+static int fullacct(struct vnode *, void *, int, int, struct fs *,
+ daddr_t, int);
+static int snapacct(struct vnode *, void *, int, int, struct fs *,
+ daddr_t, int);
+static int mapacct(struct vnode *, void *, int, int, struct fs *,
+ daddr_t, int);
+#endif /* !defined(FFS_NO_SNAPSHOT) */
+
+static int ffs_copyonwrite(void *, struct buf *, bool);
+static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
+static int rwfsblk(struct vnode *, int, void *, daddr_t);
+static int syncsnap(struct vnode *);
+static int wrsnapblk(struct vnode *, void *, daddr_t);
+#if !defined(FFS_NO_SNAPSHOT)
+static int blocks_in_journal(struct fs *);
+#endif
+
+static inline bool is_active_snapshot(struct snap_info *, struct inode *);
+static inline daddr_t db_get(struct inode *, int);
+static inline void db_assign(struct inode *, int, daddr_t);
+static inline daddr_t ib_get(struct inode *, int);
+static inline void ib_assign(struct inode *, int, daddr_t);
+static inline daddr_t idb_get(struct inode *, void *, int);
+static inline void idb_assign(struct inode *, void *, int, daddr_t);
+
+#ifdef DEBUG
+static int snapdebug = 0;
+#endif
+
+int
+ffs_snapshot_init(struct ufsmount *ump)
+{
+ struct snap_info *si;
+
+ si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
+ if (si == NULL)
+ return ENOMEM;
+
+ TAILQ_INIT(&si->si_snapshots);
+ mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
+ mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
+ si->si_owner = NULL;
+ si->si_gen = 0;
+ si->si_snapblklist = NULL;
+
+ return 0;
+}
+
+void
+ffs_snapshot_fini(struct ufsmount *ump)
+{
+ struct snap_info *si;
+
+ si = ump->um_snapinfo;
+ ump->um_snapinfo = NULL;
+
+ KASSERT(TAILQ_EMPTY(&si->si_snapshots));
+ mutex_destroy(&si->si_lock);
+ mutex_destroy(&si->si_snaplock);
+ KASSERT(si->si_snapblklist == NULL);
+ kmem_free(si, sizeof(*si));
+}
+
+/*
+ * Create a snapshot file and initialize it for the filesystem.
+ * Vnode is locked on entry and return.
+ */
+int
+ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
+{
+#if defined(FFS_NO_SNAPSHOT)
+ return EOPNOTSUPP;
+}
+#else /* defined(FFS_NO_SNAPSHOT) */
+ bool suspended = false;
+ int error, redo = 0, snaploc;
+ void *sbbuf = NULL;
+ daddr_t *snaplist = NULL, snaplistsize = 0;
+ struct buf *bp, *nbp;
+ struct fs *copy_fs = NULL;
+ struct fs *fs = VFSTOUFS(mp)->um_fs;
+ struct inode *ip = VTOI(vp);
+ struct lwp *l = curlwp;
+ struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
+ struct timespec ts;
+ struct timeval starttime;
+#ifdef DEBUG
+ struct timeval endtime;
+#endif
+ struct vnode *devvp = ip->i_devvp;
+
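+ /*
+ * In outline: set the inode up as a snapshot and preallocate all
+ * of its metadata (snapshot_setup), copy the cylinder group maps
+ * while the file system is still active (cgaccount pass 1),
+ * suspend the file system, re-copy the groups that changed
+ * (pass 2), take private copies of the superblock and summary
+ * information (snapshot_copyfs), expunge unlinked files and the
+ * other snapshots from our view, hook the snapshot onto the
+ * per-mount list and establish the copy-on-write handler, write
+ * everything out and finally resume normal operation.
+ */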
+ /*
+ * If the vnode already is a snapshot, return.
+ */
+ if ((VTOI(vp)->i_flags & SF_SNAPSHOT)) {
+ if ((VTOI(vp)->i_flags & SF_SNAPINVAL))
+ return EINVAL;
+ if (ctime) {
+ ctime->tv_sec = DIP(VTOI(vp), mtime);
+ ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
+ }
+ return 0;
+ }
+ /*
+ * Check for free snapshot slot in the superblock.
+ */
+ for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
+ if (fs->fs_snapinum[snaploc] == 0)
+ break;
+ if (snaploc == FSMAXSNAP)
+ return (ENOSPC);
+ /*
+ * Prepare the vnode to become a snapshot.
+ */
+ error = snapshot_setup(mp, vp);
+ if (error)
+ goto out;
+
+ /*
+ * Copy all the cylinder group maps. Although the
+ * filesystem is still active, we hope that only a few
+ * cylinder groups will change between now and when we
+ * suspend operations. Thus, we will be able to quickly
+ * touch up the few cylinder groups that changed during
+ * the suspension period.
+ */
+ error = cgaccount(vp, 1, NULL);
+ if (error)
+ goto out;
+
+ /*
+ * snapshot is now valid
+ */
+ ip->i_flags &= ~SF_SNAPINVAL;
+ DIP_ASSIGN(ip, flags, ip->i_flags);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+
+ /*
+ * Ensure that the snapshot is completely on disk.
+ * Since we have marked it as a snapshot it is safe to
+ * unlock it as no process will be allowed to write to it.
+ */
+ error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
+ if (error)
+ goto out;
+ VOP_UNLOCK(vp);
+ /*
+ * All allocations are done, so we can now suspend the filesystem.
+ */
+ error = vfs_suspend(vp->v_mount, 0);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error)
+ goto out;
+ suspended = true;
+ getmicrotime(&starttime);
+ /*
+ * First, copy all the cylinder group maps that have changed.
+ */
+ error = cgaccount(vp, 2, &redo);
+ if (error)
+ goto out;
+ /*
+ * Create a copy of the superblock and its summary information.
+ */
+ error = snapshot_copyfs(mp, vp, &sbbuf);
+ copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));
+ if (error)
+ goto out;
+ /*
+ * Expunge unlinked files from our view.
+ */
+ error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
+ if (error)
+ goto out;
+ /*
+ * Record snapshot inode. Since this is the newest snapshot,
+ * it must be placed at the end of the list.
+ */
+ if (ip->i_nlink > 0)
+ fs->fs_snapinum[snaploc] = ip->i_number;
+
+ mutex_enter(&si->si_lock);
+ if (is_active_snapshot(si, ip))
+ panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
+ TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
+ if (TAILQ_FIRST(&si->si_snapshots) == ip) {
+ /*
+ * If this is the first snapshot on this filesystem, put the
+ * preliminary list in place and establish the cow handler.
+ */
+ si->si_snapblklist = snaplist;
+ fscow_establish(mp, ffs_copyonwrite, devvp);
+ }
+ si->si_gen++;
+ mutex_exit(&si->si_lock);
+
+ vp->v_vflag |= VV_SYSTEM;
+ /*
+ * Set the mtime to the time the snapshot has been taken.
+ */
+ TIMEVAL_TO_TIMESPEC(&starttime, &ts);
+ if (ctime)
+ *ctime = ts;
+ DIP_ASSIGN(ip, mtime, ts.tv_sec);
+ DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ /*
+ * Copy allocation information from all snapshots and then
+ * expunge them from our view.
+ */
+ error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
+ if (error)
+ goto out;
+ /*
+ * Write the superblock and its summary information to the snapshot.
+ */
+ error = snapshot_writefs(mp, vp, sbbuf);
+ if (error)
+ goto out;
+ /*
+ * We're nearly done, ensure that the snapshot is completely on disk.
+ */
+ error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
+ if (error)
+ goto out;
+ /*
+ * Invalidate and free all pages on the snapshot vnode.
+ * We will read and write through the buffercache.
+ */
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp, 0, 0,
+ PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
+ if (error)
+ goto out;
+ /*
+ * Invalidate short ( < fs_bsize ) buffers. We will always read
+ * full size buffers later.
+ */
+ mutex_enter(&bufcache_lock);
+ KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
+ for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
+ nbp = LIST_NEXT(bp, b_vnbufs);
+ KASSERT((bp->b_cflags & BC_BUSY) == 0);
+ if (bp->b_bcount < fs->fs_bsize) {
+ bp->b_cflags |= BC_BUSY;
+ brelsel(bp, BC_INVAL | BC_VFLUSH);
+ }
+ }
+ mutex_exit(&bufcache_lock);
+
+out:
+ if (sbbuf != NULL) {
+ free(copy_fs->fs_csp, M_UFSMNT);
+ free(sbbuf, M_UFSMNT);
+ }
+ if (fs->fs_active != NULL) {
+ free(fs->fs_active, M_DEVBUF);
+ fs->fs_active = NULL;
+ }
+
+ mutex_enter(&si->si_lock);
+ if (snaplist != NULL) {
+ if (si->si_snapblklist == snaplist)
+ si->si_snapblklist = NULL;
+ free(snaplist, M_UFSMNT);
+ }
+ if (error) {
+ fs->fs_snapinum[snaploc] = 0;
+ } else {
+ /*
+ * As this is the newest list, it is the most inclusive, so
+ * should replace the previous list.
+ */
+ si->si_snapblklist = ip->i_snapblklist;
+ }
+ si->si_gen++;
+ mutex_exit(&si->si_lock);
+
+ if (suspended) {
+ VOP_UNLOCK(vp);
+ vfs_resume(vp->v_mount);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef DEBUG
+ getmicrotime(&endtime);
+ timersub(&endtime, &starttime, &endtime);
+ printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
+ mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
+ endtime.tv_usec / 1000, redo, fs->fs_ncg);
+#endif
+ }
+ if (error) {
+ if (!UFS_WAPBL_BEGIN(mp)) {
+ (void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
+ UFS_WAPBL_END(mp);
+ }
+ } else if (ip->i_nlink > 0)
+ vref(vp);
+ return (error);
+}
+
+/*
+ * Prepare vnode to become a snapshot.
+ */
+static int
+snapshot_setup(struct mount *mp, struct vnode *vp)
+{
+ int error, n, len, loc, cg;
+ daddr_t blkno, numblks;
+ struct buf *ibp, *nbp;
+ struct fs *fs = VFSTOUFS(mp)->um_fs;
+ struct lwp *l = curlwp;
+ const int wbreak = blocks_in_journal(fs)/8;
+ struct inode *ip = VTOI(vp);
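+ /*
+ * When journalling, "wbreak" breaks the long preallocation loops
+ * below into separate WAPBL transactions: after every wbreak-th
+ * allocation the current transaction is ended and a new one
+ * begun, so that a single transaction cannot outgrow the journal.
+ * blocks_in_journal() returns zero when not logging, which
+ * disables the breaks.
+ */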
+
+ /*
+ * Check mount, exclusive reference and owner.
+ */
+ if (vp->v_mount != mp)
+ return EXDEV;
+ if (vp->v_usecount != 1 || vp->v_writecount != 0)
+ return EBUSY;
+ if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+ NULL) != 0 &&
+ VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred))
+ return EACCES;
+
+ if (vp->v_size != 0) {
+ error = ffs_truncate(vp, 0, 0, NOCRED);
+ if (error)
+ return error;
+ }
+
+ /* Change inode to snapshot type file. */
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ return error;
+#if defined(QUOTA) || defined(QUOTA2)
+ /* snapshot inodes are not accounted in quotas */
+ chkiq(ip, -1, l->l_cred, 0);
+#endif
+ ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
+ DIP_ASSIGN(ip, flags, ip->i_flags);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ ffs_update(vp, NULL, NULL, UPDATE_WAIT);
+ UFS_WAPBL_END(mp);
+
+ KASSERT(ip->i_flags & SF_SNAPSHOT);
+ /*
+ * Write an empty list of preallocated blocks to the end of
+ * the snapshot to set size to at least that of the filesystem.
+ */
+ numblks = howmany(fs->fs_size, fs->fs_frag);
+ blkno = 1;
+ blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
+ error = vn_rdwr(UIO_WRITE, vp,
+ (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks),
+ UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
+ if (error)
+ return error;
+ /*
+ * Preallocate critical data structures so that we can copy
+ * them in without further allocation after we suspend all
+ * operations on the filesystem. We would like to just release
+ * the allocated buffers without writing them since they will
+ * be filled in below once we are ready to go, but this upsets
+ * the soft update code, so we go ahead and write the new buffers.
+ *
+ * Allocate all indirect blocks and mark all of them as not
+ * needing to be copied.
+ */
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ return error;
+ for (blkno = NDADDR, n = 0; blkno < numblks; blkno += NINDIR(fs)) {
+ error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
+ fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
+ if (error)
+ goto out;
+ brelse(ibp, 0);
+ if (wbreak > 0 && (++n % wbreak) == 0) {
+ UFS_WAPBL_END(mp);
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ return error;
+ }
+ }
+ /*
+ * Allocate copies for the superblock and its summary information.
+ */
+ error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
+ 0, &nbp);
+ if (error)
+ goto out;
+ bawrite(nbp);
+ blkno = fragstoblks(fs, fs->fs_csaddr);
+ len = howmany(fs->fs_cssize, fs->fs_bsize);
+ for (loc = 0; loc < len; loc++) {
+ error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)),
+ fs->fs_bsize, l->l_cred, 0, &nbp);
+ if (error)
+ goto out;
+ bawrite(nbp);
+ if (wbreak > 0 && (++n % wbreak) == 0) {
+ UFS_WAPBL_END(mp);
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ return error;
+ }
+ }
+ /*
+ * Allocate all cylinder group blocks.
+ */
+ for (cg = 0; cg < fs->fs_ncg; cg++) {
+ error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
+ fs->fs_bsize, l->l_cred, 0, &nbp);
+ if (error)
+ goto out;
+ bawrite(nbp);
+ if (wbreak > 0 && (++n % wbreak) == 0) {
+ UFS_WAPBL_END(mp);
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ return error;
+ }
+ }
+
+out:
+ UFS_WAPBL_END(mp);
+ return error;
+}
+
+/*
+ * Create a copy of the superblock and its summary information.
+ * It is up to the caller to free copyfs and copy_fs->fs_csp.
+ */
+static int
+snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
+{
+ int error, i, len, loc, size;
+ void *space;
+ int32_t *lp;
+ struct buf *bp;
+ struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
+ struct lwp *l = curlwp;
+ struct vnode *devvp = VTOI(vp)->i_devvp;
+
+ /*
+ * Grab a copy of the superblock and its summary information.
+ * We delay writing it until the suspension is released below.
+ */
+ *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
+ loc = blkoff(fs, fs->fs_sblockloc);
+ if (loc > 0)
+ memset(*sbbuf, 0, loc);
+ copyfs = (struct fs *)((char *)(*sbbuf) + loc);
+ memcpy(copyfs, fs, fs->fs_sbsize);
+ size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
+ if (fs->fs_sbsize < size)
+ memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
+ size - fs->fs_sbsize);
+ size = blkroundup(fs, fs->fs_cssize);
+ if (fs->fs_contigsumsize > 0)
+ size += fs->fs_ncg * sizeof(int32_t);
+ space = malloc(size, M_UFSMNT, M_WAITOK);
+ copyfs->fs_csp = space;
+ memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
+ space = (char *)space + fs->fs_cssize;
+ loc = howmany(fs->fs_cssize, fs->fs_fsize);
+ i = fs->fs_frag - loc % fs->fs_frag;
+ len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
+ if (len > 0) {
+ if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
+ len, l->l_cred, 0, &bp)) != 0) {
+ brelse(bp, 0);
+ free(copyfs->fs_csp, M_UFSMNT);
+ free(*sbbuf, M_UFSMNT);
+ *sbbuf = NULL;
+ return error;
+ }
+ memcpy(space, bp->b_data, (u_int)len);
+ space = (char *)space + len;
+ brelse(bp, BC_INVAL | BC_NOCACHE);
+ }
+ if (fs->fs_contigsumsize > 0) {
+ copyfs->fs_maxcluster = lp = space;
+ for (i = 0; i < fs->fs_ncg; i++)
+ *lp++ = fs->fs_contigsumsize;
+ }
+ if (mp->mnt_wapbl)
+ copyfs->fs_flags &= ~FS_DOWAPBL;
+ return 0;
+}
+
+/*
+ * We must check for active files that have been unlinked (e.g., with a zero
+ * link count). We have to expunge all trace of these files from the snapshot
+ * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
+ * Note that we skip unlinked snapshot files as they will be handled separately.
+ * Calculate the snapshot list size and create a preliminary list.
+ */
+static int
+snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
+ daddr_t *snaplistsize, daddr_t **snaplist)
+{
+ int cg, error = 0, len, loc;
+ daddr_t blkno, *blkp;
+ struct fs *fs = VFSTOUFS(mp)->um_fs;
+ struct inode *xp;
+ struct lwp *l = curlwp;
+ struct vattr vat;
+ struct vnode *logvp = NULL, *mvp = NULL, *xvp;
+
+ *snaplist = NULL;
+ /*
+ * Get the log inode if any.
+ */
+ if ((fs->fs_flags & FS_DOWAPBL) &&
+ fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
+ error = VFS_VGET(mp,
+ fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
+ if (error)
+ goto out;
+ }
+ /*
+ * Allocate a marker vnode.
+ */
+ mvp = vnalloc(mp);
+ /*
+ * We also calculate the needed size for the snapshot list.
+ */
+ *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
+ FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
+ mutex_enter(&mntvnode_lock);
+ /*
+ * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
+ * and vclean() can be called indirectly
+ */
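+ /*
+ * The marker vnode (mvp) holds our place in mnt_vnodelist via
+ * vmark()/vunmark(), so the scan can safely continue after
+ * mntvnode_lock has been dropped while an individual vnode is
+ * examined.
+ */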
+ for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) {
+ vmark(mvp, xvp);
+ /*
+ * Make sure this vnode wasn't reclaimed in getnewvnode().
+ * Start over if it has (it won't be on the list anymore).
+ */
+ if (xvp->v_mount != mp || vismarker(xvp))
+ continue;
+ mutex_enter(xvp->v_interlock);
+ if ((xvp->v_iflag & VI_XLOCK) ||
+ xvp->v_usecount == 0 || xvp->v_type == VNON ||
+ VTOI(xvp) == NULL ||
+ (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
+ mutex_exit(xvp->v_interlock);
+ continue;
+ }
+ mutex_exit(&mntvnode_lock);
+ /*
+ * XXXAD should increase vnode ref count to prevent it
+ * disappearing or being recycled.
+ */
+ mutex_exit(xvp->v_interlock);
+#ifdef DEBUG
+ if (snapdebug)
+ vprint("ffs_snapshot: busy vnode", xvp);
+#endif
+ xp = VTOI(xvp);
+ if (xvp != logvp) {
+ if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 &&
+ vat.va_nlink > 0) {
+ mutex_enter(&mntvnode_lock);
+ continue;
+ }
+ if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
+ mutex_enter(&mntvnode_lock);
+ continue;
+ }
+ }
+ /*
+ * If there is a fragment, clear it here.
+ */
+ blkno = 0;
+ loc = howmany(xp->i_size, fs->fs_bsize) - 1;
+ if (loc < NDADDR) {
+ len = fragroundup(fs, blkoff(fs, xp->i_size));
+ if (len > 0 && len < fs->fs_bsize) {
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error) {
+ (void)vunmark(mvp);
+ goto out;
+ }
+ ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
+ len, xp->i_number);
+ blkno = db_get(xp, loc);
+ db_assign(xp, loc, 0);
+ UFS_WAPBL_END(mp);
+ }
+ }
+ *snaplistsize += 1;
+ error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
+ if (blkno)
+ db_assign(xp, loc, blkno);
+ if (!error) {
+ error = UFS_WAPBL_BEGIN(mp);
+ if (!error) {
+ error = ffs_freefile_snap(copy_fs, vp,
+ xp->i_number, xp->i_mode);
+ UFS_WAPBL_END(mp);
+ }
+ }
+ if (error) {
+ (void)vunmark(mvp);
+ goto out;
+ }
+ mutex_enter(&mntvnode_lock);
+ }
+ mutex_exit(&mntvnode_lock);
+ /*
+ * Create a preliminary list of preallocated snapshot blocks.
+ */
+ *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
+ blkp = &(*snaplist)[1];
+ *blkp++ = lblkno(fs, fs->fs_sblockloc);
+ blkno = fragstoblks(fs, fs->fs_csaddr);
+ for (cg = 0; cg < fs->fs_ncg; cg++) {
+ if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
+ break;
+ *blkp++ = fragstoblks(fs, cgtod(fs, cg));
+ }
+ len = howmany(fs->fs_cssize, fs->fs_bsize);
+ for (loc = 0; loc < len; loc++)
+ *blkp++ = blkno + loc;
+ for (; cg < fs->fs_ncg; cg++)
+ *blkp++ = fragstoblks(fs, cgtod(fs, cg));
+ (*snaplist)[0] = blkp - &(*snaplist)[0];
+
+out:
+ if (mvp != NULL)
+ vnfree(mvp);
+ if (logvp != NULL)
+ vput(logvp);
+ if (error && *snaplist != NULL) {
+ free(*snaplist, M_UFSMNT);
+ *snaplist = NULL;
+ }
+
+ return error;
+}
+
+/*
+ * Copy allocation information from all the snapshots in this snapshot and
+ * then expunge them from its view. Also, collect the list of allocated
+ * blocks in i_snapblklist.
+ */
+static int
+snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
+ struct fs *copy_fs, daddr_t snaplistsize)
+{
+ int error = 0, i;
+ daddr_t numblks, *snaplist = NULL;
+ struct fs *fs = VFSTOUFS(mp)->um_fs;
+ struct inode *ip = VTOI(vp), *xp;
+ struct lwp *l = curlwp;
+ struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
+
+ TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
+ if (xp != ip) {
+ error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
+ if (error)
+ break;
+ }
+ if (xp->i_nlink != 0)
+ continue;
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ break;
+ error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
+ UFS_WAPBL_END(mp);
+ if (error)
+ break;
+ }
+ if (error)
+ goto out;
+ /*
+ * Allocate space for the full list of preallocated snapshot blocks.
+ */
+ snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
+ ip->i_snapblklist = &snaplist[1];
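+ /*
+ * snaplist[0] is reserved for the entry count; i_snapblklist
+ * starts at snaplist[1] so that mapacct() (called via expunge()
+ * below) can append the logical block number of every block the
+ * snapshots have allocated.
+ */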
+ /*
+ * Expunge the blocks used by the snapshots from the set of
+ * blocks marked as used in the snapshot bitmaps. Also, collect
+ * the list of allocated blocks in i_snapblklist.
+ */
+ error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
+ if (error)
+ goto out;
+ if (snaplistsize < ip->i_snapblklist - snaplist)
+ panic("ffs_snapshot: list too small");
+ snaplistsize = ip->i_snapblklist - snaplist;
+ snaplist[0] = snaplistsize;
+ ip->i_snapblklist = &snaplist[0];
+ /*
+ * Write out the list of allocated blocks to the end of the snapshot.
+ */
+ numblks = howmany(fs->fs_size, fs->fs_frag);
+ for (i = 0; i < snaplistsize; i++)
+ snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
+ error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
+ snaplistsize * sizeof(daddr_t), lblktosize(fs, (off_t)numblks),
+ UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
+ for (i = 0; i < snaplistsize; i++)
+ snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
+out:
+ if (error && snaplist != NULL) {
+ free(snaplist, M_UFSMNT);
+ ip->i_snapblklist = NULL;
+ }
+ return error;
+}
+
+/*
+ * Write the superblock and its summary information to the snapshot.
+ * Make sure the first NDADDR blocks get copied to the snapshot.
+ */
+static int
+snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
+{
+ int error, len, loc;
+ void *space;
+ daddr_t blkno;
+ struct buf *bp;
+ struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
+ struct inode *ip = VTOI(vp);
+ struct lwp *l = curlwp;
+
+ copyfs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));
+
+ /*
+ * Write the superblock and its summary information
+ * to the snapshot.
+ */
+ blkno = fragstoblks(fs, fs->fs_csaddr);
+ len = howmany(fs->fs_cssize, fs->fs_bsize);
+ space = copyfs->fs_csp;
+#ifdef FFS_EI
+ if (UFS_FSNEEDSWAP(fs)) {
+ ffs_sb_swap(copyfs, copyfs);
+ ffs_csum_swap(space, space, fs->fs_cssize);
+ }
+#endif
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ return error;
+ for (loc = 0; loc < len; loc++) {
+ error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred,
+ B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ break;
+ }
+ memcpy(bp->b_data, space, fs->fs_bsize);
+ space = (char *)space + fs->fs_bsize;
+ bawrite(bp);
+ }
+ if (error)
+ goto out;
+ error = bread(vp, lblkno(fs, fs->fs_sblockloc),
+ fs->fs_bsize, l->l_cred, B_MODIFY, &bp);
+ if (error) {
+ brelse(bp, 0);
+ goto out;
+ } else {
+ memcpy(bp->b_data, sbbuf, fs->fs_bsize);
+ bawrite(bp);
+ }
+ /*
+ * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite()
+ * and ffs_snapblkfree() will always work on indirect blocks.
+ */
+ for (loc = 0; loc < NDADDR; loc++) {
+ if (db_get(ip, loc) != 0)
+ continue;
+ error = ffs_balloc(vp, lblktosize(fs, (off_t)loc),
+ fs->fs_bsize, l->l_cred, 0, &bp);
+ if (error)
+ break;
+ error = rwfsblk(vp, B_READ, bp->b_data, loc);
+ if (error) {
+ brelse(bp, 0);
+ break;
+ }
+ bawrite(bp);
+ }
+
+out:
+ UFS_WAPBL_END(mp);
+ return error;
+}
+
+/*
+ * Copy all cylinder group maps.
+ */
+static int
+cgaccount(struct vnode *vp, int passno, int *redo)
+{
+ int cg, error = 0;
+ struct buf *nbp;
+ struct fs *fs = VTOI(vp)->i_fs;
+
+ if (redo != NULL)
+ *redo = 0;
+ if (passno == 1)
+ fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ for (cg = 0; cg < fs->fs_ncg; cg++) {
+ if (passno == 2 && ACTIVECG_ISSET(fs, cg))
+ continue;
+
+ if (redo != NULL)
+ *redo += 1;
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error)
+ return error;
+ error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
+ fs->fs_bsize, curlwp->l_cred, 0, &nbp);
+ if (error) {
+ UFS_WAPBL_END(vp->v_mount);
+ break;
+ }
+ error = cgaccount1(cg, vp, nbp->b_data, passno);
+ bawrite(nbp);
+ UFS_WAPBL_END(vp->v_mount);
+ if (error)
+ break;
+ }
+ return error;
+}
+
+/*
+ * Copy a cylinder group map. All the unallocated blocks are marked
+ * BLK_NOCOPY so that the snapshot knows that it need not copy them
+ * if they are later written. If passno is one, then this is a first
+ * pass, so only setting needs to be done. If passno is 2, then this
+ * is a revision to a previous pass which must be undone as the
+ * replacement pass is done.
+ */
+static int
+cgaccount1(int cg, struct vnode *vp, void *data, int passno)
+{
+ struct buf *bp, *ibp;
+ struct inode *ip;
+ struct cg *cgp;
+ struct fs *fs;
+ struct lwp *l = curlwp;
+ daddr_t base, numblks;
+ int error, len, loc, ns, indiroff;
+
+ ip = VTOI(vp);
+ fs = ip->i_fs;
+ ns = UFS_FSNEEDSWAP(fs);
+ error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+ (int)fs->fs_cgsize, l->l_cred, 0, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+ cgp = (struct cg *)bp->b_data;
+ if (!cg_chkmagic(cgp, ns)) {
+ brelse(bp, 0);
+ return (EIO);
+ }
+ ACTIVECG_SET(fs, cg);
+
+ memcpy(data, bp->b_data, fs->fs_cgsize);
+ brelse(bp, 0);
+ if (fs->fs_cgsize < fs->fs_bsize)
+ memset((char *)data + fs->fs_cgsize, 0,
+ fs->fs_bsize - fs->fs_cgsize);
+ numblks = howmany(fs->fs_size, fs->fs_frag);
+ len = howmany(fs->fs_fpg, fs->fs_frag);
+ base = cg * fs->fs_fpg / fs->fs_frag;
+ if (base + len >= numblks)
+ len = numblks - base - 1;
+ loc = 0;
+ if (base < NDADDR) {
+ for ( ; loc < NDADDR; loc++) {
+ if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
+ db_assign(ip, loc, BLK_NOCOPY);
+ else if (db_get(ip, loc) == BLK_NOCOPY) {
+ if (passno == 2)
+ db_assign(ip, loc, 0);
+ else if (passno == 1)
+ panic("ffs_snapshot: lost direct block");
+ }
+ }
+ }
+ if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)),
+ fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
+ return (error);
+ indiroff = (base + loc - NDADDR) % NINDIR(fs);
+ for ( ; loc < len; loc++, indiroff++) {
+ if (indiroff >= NINDIR(fs)) {
+ bawrite(ibp);
+ if ((error = ffs_balloc(vp,
+ lblktosize(fs, (off_t)(base + loc)),
+ fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
+ return (error);
+ indiroff = 0;
+ }
+ if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
+ idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
+ else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
+ if (passno == 2)
+ idb_assign(ip, ibp->b_data, indiroff, 0);
+ else if (passno == 1)
+ panic("ffs_snapshot: lost indirect block");
+ }
+ }
+ bdwrite(ibp);
+ return (0);
+}
+
+/*
+ * Before expunging a snapshot inode, note all the
+ * blocks that it claims with BLK_SNAP so that fsck will
+ * be able to account for those blocks properly and so
+ * that this snapshot knows that it need not copy them
+ * if the other snapshot holding them is freed.
+ */
+static int
+expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
+ acctfunc_t acctfunc, int expungetype)
+{
+ int i, error, ns;
+ daddr_t lbn, rlbn;
+ daddr_t len, blkno, numblks, blksperindir;
+ struct ufs1_dinode *dip1;
+ struct ufs2_dinode *dip2;
+ struct lwp *l = curlwp;
+ void *bap;
+ struct buf *bp;
+ struct mount *mp;
+
+ ns = UFS_FSNEEDSWAP(fs);
+ mp = snapvp->v_mount;
+
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ return error;
+ /*
+ * Prepare to expunge the inode. If its inode block has not
+ * yet been copied, then allocate and fill the copy.
+ */
+ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
+ error = snapblkaddr(snapvp, lbn, &blkno);
+ if (error)
+ return error;
+ if (blkno != 0) {
+ error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred,
+ B_MODIFY, &bp);
+ } else {
+ error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
+ fs->fs_bsize, l->l_cred, 0, &bp);
+ if (! error)
+ error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
+ }
+ if (error) {
+ UFS_WAPBL_END(mp);
+ return error;
+ }
+ /*
+ * Set a snapshot inode to be a zero length file; set regular files
+ * or unlinked snapshots to be completely unallocated.
+ */
+ if (fs->fs_magic == FS_UFS1_MAGIC) {
+ dip1 = (struct ufs1_dinode *)bp->b_data +
+ ino_to_fsbo(fs, cancelip->i_number);
+ if (cancelip->i_flags & SF_SNAPSHOT) {
+ dip1->di_flags =
+ ufs_rw32(ufs_rw32(dip1->di_flags, ns) |
+ SF_SNAPINVAL, ns);
+ }
+ if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
+ dip1->di_mode = 0;
+ dip1->di_size = 0;
+ dip1->di_blocks = 0;
+ memset(&dip1->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int32_t));
+ } else {
+ dip2 = (struct ufs2_dinode *)bp->b_data +
+ ino_to_fsbo(fs, cancelip->i_number);
+ if (cancelip->i_flags & SF_SNAPSHOT) {
+ dip2->di_flags =
+ ufs_rw32(ufs_rw32(dip2->di_flags, ns) |
+ SF_SNAPINVAL, ns);
+ }
+ if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
+ dip2->di_mode = 0;
+ dip2->di_size = 0;
+ dip2->di_blocks = 0;
+ memset(&dip2->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int64_t));
+ }
+ bdwrite(bp);
+ UFS_WAPBL_END(mp);
+ /*
+ * Now go through and expunge all the blocks in the file
+ * using the function requested.
+ */
+ numblks = howmany(cancelip->i_size, fs->fs_bsize);
+ if (fs->fs_magic == FS_UFS1_MAGIC)
+ bap = &cancelip->i_ffs1_db[0];
+ else
+ bap = &cancelip->i_ffs2_db[0];
+ error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype);
+ if (error)
+ return (error);
+ if (fs->fs_magic == FS_UFS1_MAGIC)
+ bap = &cancelip->i_ffs1_ib[0];
+ else
+ bap = &cancelip->i_ffs2_ib[0];
+ error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype);
+ if (error)
+ return (error);
+ blksperindir = 1;
+ lbn = -NDADDR;
+ len = numblks - NDADDR;
+ rlbn = NDADDR;
+ for (i = 0; len > 0 && i < NIADDR; i++) {
+ error = indiracct(snapvp, ITOV(cancelip), i,
+ ib_get(cancelip, i), lbn, rlbn, len,
+ blksperindir, fs, acctfunc, expungetype);
+ if (error)
+ return (error);
+ blksperindir *= NINDIR(fs);
+ lbn -= blksperindir + 1;
+ len -= blksperindir;
+ rlbn += blksperindir;
+ }
+ return (0);
+}
+
+/*
+ * Descend an indirect block chain for vnode cancelvp accounting for all
+ * its indirect blocks in snapvp.
+ */
+static int
+indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
+ daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
+ daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
+{
+ int error, num, i;
+ daddr_t subblksperindir;
+ struct indir indirs[NIADDR + 2];
+ daddr_t last;
+ void *bap;
+ struct buf *bp;
+
+ if (blkno == 0) {
+ if (expungetype == BLK_NOCOPY)
+ return (0);
+ panic("indiracct: missing indir");
+ }
+ if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
+ return (error);
+ if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
+ panic("indiracct: botched params");
+ /*
+ * We have to expand bread here since it will deadlock looking
+ * up the block number for any blocks that are not in the cache.
+ */
+ error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize,
+ false, &bp);
+ if (error)
+ return error;
+ if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
+ rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) {
+ brelse(bp, 0);
+ return (error);
+ }
+ /*
+ * Account for the block pointers in this indirect block.
+ */
+ last = howmany(remblks, blksperindir);
+ if (last > NINDIR(fs))
+ last = NINDIR(fs);
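+ /*
+ * "last" is the number of pointers in this indirect block that
+ * actually map blocks of the file: each pointer covers
+ * blksperindir file blocks and at most NINDIR(fs) pointers fit
+ * in one indirect block.
+ */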
+ bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
+ memcpy((void *)bap, bp->b_data, fs->fs_bsize);
+ brelse(bp, 0);
+ error = (*acctfunc)(snapvp, bap, 0, last,
+ fs, level == 0 ? rlbn : -1, expungetype);
+ if (error || level == 0)
+ goto out;
+ /*
+ * Account for the block pointers in each of the indirect blocks
+ * in the levels below us.
+ */
+ subblksperindir = blksperindir / NINDIR(fs);
+ for (lbn++, level--, i = 0; i < last; i++) {
+ error = indiracct(snapvp, cancelvp, level,
+ idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
+ subblksperindir, fs, acctfunc, expungetype);
+ if (error)
+ goto out;
+ rlbn += blksperindir;
+ lbn -= blksperindir;
+ remblks -= blksperindir;
+ }
+out:
+ free(bap, M_DEVBUF);
+ return (error);
+}
+
+/*
+ * Do both snap accounting and map accounting.
+ */
+static int
+fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
+ struct fs *fs, daddr_t lblkno,
+ int exptype /* BLK_SNAP or BLK_NOCOPY */)
+{
+ int error;
+
+ if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
+ return (error);
+ return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
+}
+
+/*
+ * Identify a set of blocks allocated in a snapshot inode.
+ */
+static int
+snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
+ struct fs *fs, daddr_t lblkno,
+ int expungetype /* BLK_SNAP or BLK_NOCOPY */)
+{
+ struct inode *ip = VTOI(vp);
+ struct lwp *l = curlwp;
+ struct mount *mp = vp->v_mount;
+ daddr_t blkno;
+ daddr_t lbn;
+ struct buf *ibp;
+ int error, n;
+ const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
+
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ return error;
+ for ( n = 0; oldblkp < lastblkp; oldblkp++) {
+ blkno = idb_get(ip, bap, oldblkp);
+ if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
+ continue;
+ lbn = fragstoblks(fs, blkno);
+ if (lbn < NDADDR) {
+ blkno = db_get(ip, lbn);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ } else {
+ error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
+ fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
+ if (error)
+ break;
+ blkno = idb_get(ip, ibp->b_data,
+ (lbn - NDADDR) % NINDIR(fs));
+ }
+ /*
+ * If we are expunging a snapshot vnode and we
+ * find a block marked BLK_NOCOPY, then it is
+ * one that has been allocated to this snapshot after
+ * we took our current snapshot and can be ignored.
+ */
+ if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
+ if (lbn >= NDADDR)
+ brelse(ibp, 0);
+ } else {
+ if (blkno != 0)
+ panic("snapacct: bad block");
+ if (lbn < NDADDR)
+ db_assign(ip, lbn, expungetype);
+ else {
+ idb_assign(ip, ibp->b_data,
+ (lbn - NDADDR) % NINDIR(fs), expungetype);
+ bdwrite(ibp);
+ }
+ }
+ if (wbreak > 0 && (++n % wbreak) == 0) {
+ UFS_WAPBL_END(mp);
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ return error;
+ }
+ }
+ UFS_WAPBL_END(mp);
+ return error;
+}
+
+/*
+ * Account for a set of blocks allocated in a snapshot inode.
+ */
+static int
+mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
+ struct fs *fs, daddr_t lblkno, int expungetype)
+{
+ daddr_t blkno;
+ struct inode *ip;
+ struct mount *mp = vp->v_mount;
+ ino_t inum;
+ int acctit, error, n;
+ const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
+
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ return error;
+ ip = VTOI(vp);
+ inum = ip->i_number;
+ if (lblkno == -1)
+ acctit = 0;
+ else
+ acctit = 1;
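+ /*
+ * lblkno == -1 means bap is the array of indirect block pointers
+ * itself; those entries do not correspond to data blocks, so they
+ * are freed below but not recorded in the i_snapblklist hint list.
+ */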
+ for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
+ blkno = idb_get(ip, bap, oldblkp);
+ if (blkno == 0 || blkno == BLK_NOCOPY)
+ continue;
+ if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
+ *ip->i_snapblklist++ = lblkno;
+ if (blkno == BLK_SNAP)
+ blkno = blkstofrags(fs, lblkno);
+ ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
+ if (wbreak > 0 && (++n % wbreak) == 0) {
+ UFS_WAPBL_END(mp);
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ return error;
+ }
+ }
+ UFS_WAPBL_END(mp);
+ return (0);
+}
+
+/*
+ * Number of blocks that fit into the journal or zero if not logging.
+ */
+static int
+blocks_in_journal(struct fs *fs)
+{
+ off_t bpj;
+
+ if ((fs->fs_flags & FS_DOWAPBL) == 0)
+ return 0;
+ bpj = 1;
+ if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
+ switch (fs->fs_journal_location) {
+ case UFS_WAPBL_JOURNALLOC_END_PARTITION:
+ bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
+ fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
+ break;
+ case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
+ bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
+ fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
+ break;
+ }
+ }
+ bpj /= fs->fs_bsize;
+ return (bpj > 0 ? bpj : 1);
+}
+#endif /* defined(FFS_NO_SNAPSHOT) */
+
+/*
+ * Decrement extra reference on snapshot when last name is removed.
+ * It will not be freed until the last open reference goes away.
+ */
+void
+ffs_snapgone(struct inode *ip)
+{
+ struct mount *mp = ip->i_devvp->v_specmountpoint;
+ struct inode *xp;
+ struct fs *fs;
+ struct snap_info *si;
+ int snaploc;
+
+ si = VFSTOUFS(mp)->um_snapinfo;
+
+ /*
+ * Find snapshot in incore list.
+ */
+ mutex_enter(&si->si_lock);
+ TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
+ if (xp == ip)
+ break;
+ mutex_exit(&si->si_lock);
+ if (xp != NULL)
+ vrele(ITOV(ip));
+#ifdef DEBUG
+ else if (snapdebug)
+ printf("ffs_snapgone: lost snapshot vnode %llu\n",
+ (unsigned long long)ip->i_number);
+#endif
+ /*
+ * Delete snapshot inode from superblock. Keep list dense.
+ */
+ mutex_enter(&si->si_lock);
+ fs = ip->i_fs;
+ for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
+ if (fs->fs_snapinum[snaploc] == ip->i_number)
+ break;
+ if (snaploc < FSMAXSNAP) {
+ for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
+ if (fs->fs_snapinum[snaploc] == 0)
+ break;
+ fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
+ }
+ fs->fs_snapinum[snaploc - 1] = 0;
+ }
+ si->si_gen++;
+ mutex_exit(&si->si_lock);
+}
+
+/*
+ * Prepare a snapshot file for being removed.
+ */
+void
+ffs_snapremove(struct vnode *vp)
+{
+ struct inode *ip = VTOI(vp), *xp;
+ struct vnode *devvp = ip->i_devvp;
+ struct fs *fs = ip->i_fs;
+ struct mount *mp = devvp->v_specmountpoint;
+ struct buf *ibp;
+ struct snap_info *si;
+ struct lwp *l = curlwp;
+ daddr_t numblks, blkno, dblk;
+ int error, loc, last;
+
+ si = VFSTOUFS(mp)->um_snapinfo;
+ /*
+ * If active, delete from incore list (this snapshot may
+ * already have been in the process of being deleted, so
+ * would not have been active).
+ *
+ * Clear copy-on-write flag if last snapshot.
+ */
+ mutex_enter(&si->si_snaplock);
+ mutex_enter(&si->si_lock);
+ if (is_active_snapshot(si, ip)) {
+ TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
+ if (TAILQ_FIRST(&si->si_snapshots) != 0) {
+ /* Roll back the list of preallocated blocks. */
+ xp = TAILQ_LAST(&si->si_snapshots, inodelst);
+ si->si_snapblklist = xp->i_snapblklist;
+ si->si_gen++;
+ mutex_exit(&si->si_lock);
+ mutex_exit(&si->si_snaplock);
+ } else {
+ si->si_snapblklist = 0;
+ si->si_gen++;
+ mutex_exit(&si->si_lock);
+ mutex_exit(&si->si_snaplock);
+ fscow_disestablish(mp, ffs_copyonwrite, devvp);
+ }
+ if (ip->i_snapblklist != NULL) {
+ free(ip->i_snapblklist, M_UFSMNT);
+ ip->i_snapblklist = NULL;
+ }
+ } else {
+ mutex_exit(&si->si_lock);
+ mutex_exit(&si->si_snaplock);
+ }
+ /*
+ * Clear all BLK_NOCOPY fields. Pass any block claims to other
+ * snapshots that want them (see ffs_snapblkfree below).
+ */
+ for (blkno = 1; blkno < NDADDR; blkno++) {
+ dblk = db_get(ip, blkno);
+ if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
+ db_assign(ip, blkno, 0);
+ else if ((dblk == blkstofrags(fs, blkno) &&
+ ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
+ ip->i_number))) {
+ DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
+ db_assign(ip, blkno, 0);
+ }
+ }
+ numblks = howmany(ip->i_size, fs->fs_bsize);
+ for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
+ error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
+ fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
+ if (error)
+ continue;
+ if (fs->fs_size - blkno > NINDIR(fs))
+ last = NINDIR(fs);
+ else
+ last = fs->fs_size - blkno;
+ for (loc = 0; loc < last; loc++) {
+ dblk = idb_get(ip, ibp->b_data, loc);
+ if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
+ idb_assign(ip, ibp->b_data, loc, 0);
+ else if (dblk == blkstofrags(fs, blkno) &&
+ ffs_snapblkfree(fs, ip->i_devvp, dblk,
+ fs->fs_bsize, ip->i_number)) {
+ DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
+ idb_assign(ip, ibp->b_data, loc, 0);
+ }
+ }
+ bawrite(ibp);
+ UFS_WAPBL_END(mp);
+ error = UFS_WAPBL_BEGIN(mp);
+ KASSERT(error == 0);
+ }
+ /*
+ * Clear snapshot flag and drop reference.
+ */
+ ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
+ DIP_ASSIGN(ip, flags, ip->i_flags);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+#if defined(QUOTA) || defined(QUOTA2)
+ chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
+ chkiq(ip, 1, l->l_cred, FORCE);
+#endif
+}
+
+/*
+ * Notification that a block is being freed. Return zero if the free
+ * should be allowed to proceed. Return non-zero if the snapshot file
+ * wants to claim the block. The block will be claimed if it is an
+ * uncopied part of one of the snapshots. It will be freed if it is
+ * either a BLK_NOCOPY or has already been copied in all of the snapshots.
+ * If a fragment is being freed, then all snapshots that care about
+ * it must make a copy since a snapshot file can only claim full sized
+ * blocks. Note that if more than one snapshot file maps the block,
+ * we can pick one at random to claim it. Since none of the snapshots
+ * can change, we are assured that they will all see the same unmodified
+ * image. When deleting a snapshot file (see ffs_snapremove above), we
+ * must push any of these claimed blocks to one of the other snapshots
+ * that maps it. These claimed blocks are easily identified as they will
+ * have a block number equal to their logical block number within the
+ * snapshot. A copied block can never have this property because it
+ * must always have been allocated from a BLK_NOCOPY location.
+ */
+int
+ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
+ long size, ino_t inum)
+{
+ struct mount *mp = devvp->v_specmountpoint;
+ struct buf *ibp;
+ struct inode *ip;
+ struct vnode *vp = NULL;
+ struct snap_info *si;
+ void *saved_data = NULL;
+ daddr_t lbn;
+ daddr_t blkno;
+ uint32_t gen;
+ int indiroff = 0, error = 0, claimedblk = 0;
+
+ si = VFSTOUFS(mp)->um_snapinfo;
+ lbn = fragstoblks(fs, bno);
+ mutex_enter(&si->si_snaplock);
+ mutex_enter(&si->si_lock);
+ si->si_owner = curlwp;
+
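+ /*
+ * si_gen is bumped whenever the snapshot list changes. The saved
+ * generation is rechecked each time si_lock is reacquired below;
+ * if another thread changed the list in the meantime the whole
+ * walk is restarted.
+ */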
+retry:
+ gen = si->si_gen;
+ TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
+ vp = ITOV(ip);
+ /*
+ * Lookup block being written.
+ */
+ if (lbn < NDADDR) {
+ blkno = db_get(ip, lbn);
+ } else {
+ mutex_exit(&si->si_lock);
+ error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
+ fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
+ if (error) {
+ mutex_enter(&si->si_lock);
+ break;
+ }
+ indiroff = (lbn - NDADDR) % NINDIR(fs);
+ blkno = idb_get(ip, ibp->b_data, indiroff);
+ mutex_enter(&si->si_lock);
+ if (gen != si->si_gen) {
+ brelse(ibp, 0);
+ goto retry;
+ }
+ }
+ /*
+ * Check to see if block needs to be copied.
+ */
+ if (blkno == 0) {
+ /*
+ * A block that we map is being freed. If it has not
+ * been claimed yet, we will claim or copy it (below).
+ */
+ claimedblk = 1;
+ } else if (blkno == BLK_SNAP) {
+ /*
+ * No previous snapshot claimed the block,
+ * so it will be freed and become a BLK_NOCOPY
+ * (don't care) for us.
+ */
+ if (claimedblk)
+ panic("snapblkfree: inconsistent block type");
+ if (lbn < NDADDR) {
+ db_assign(ip, lbn, BLK_NOCOPY);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ } else {
+ idb_assign(ip, ibp->b_data, indiroff,
+ BLK_NOCOPY);
+ mutex_exit(&si->si_lock);
+ if (ip->i_nlink > 0)
+ bwrite(ibp);
+ else
+ bdwrite(ibp);
+ mutex_enter(&si->si_lock);
+ if (gen != si->si_gen)
+ goto retry;
+ }
+ continue;
+ } else /* BLK_NOCOPY or default */ {
+ /*
+ * If the snapshot has already copied the block
+ * (default), or does not care about the block,
+ * it is not needed.
+ */
+ if (lbn >= NDADDR)
+ brelse(ibp, 0);
+ continue;
+ }
+ /*
+ * If this is a full size block, we will just grab it
+ * and assign it to the snapshot inode. Otherwise we
+ * will proceed to copy it. See explanation for this
+ * routine as to why only a single snapshot needs to
+ * claim this block.
+ */
+ if (size == fs->fs_bsize) {
+#ifdef DEBUG
+ if (snapdebug)
+ printf("%s %llu lbn %" PRId64
+ "from inum %llu\n",
+ "Grabonremove: snapino",
+ (unsigned long long)ip->i_number,
+ lbn, (unsigned long long)inum);
+#endif
+ mutex_exit(&si->si_lock);
+ if (lbn < NDADDR) {
+ db_assign(ip, lbn, bno);
+ } else {
+ idb_assign(ip, ibp->b_data, indiroff, bno);
+ if (ip->i_nlink > 0)
+ bwrite(ibp);
+ else
+ bdwrite(ibp);
+ }
+ DIP_ADD(ip, blocks, btodb(size));
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (ip->i_nlink > 0 && mp->mnt_wapbl)
+ error = syncsnap(vp);
+ else
+ error = 0;
+ mutex_enter(&si->si_lock);
+ si->si_owner = NULL;
+ mutex_exit(&si->si_lock);
+ mutex_exit(&si->si_snaplock);
+ return (error == 0);
+ }
+ if (lbn >= NDADDR)
+ brelse(ibp, 0);
+#ifdef DEBUG
+ if (snapdebug)
+ printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
+ "Copyonremove: snapino ",
+ (unsigned long long)ip->i_number,
+ lbn, "for inum", (unsigned long long)inum, size);
+#endif
+ /*
+ * If we have already read the old block contents, then
+ * simply copy them to the new block. Note that we need
+ * to synchronously write snapshots that have not been
+ * unlinked, and hence will be visible after a crash,
+ * to ensure their integrity.
+ */
+ mutex_exit(&si->si_lock);
+ if (saved_data == NULL) {
+ saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
+ error = rwfsblk(vp, B_READ, saved_data, lbn);
+ if (error) {
+ free(saved_data, M_UFSMNT);
+ saved_data = NULL;
+ mutex_enter(&si->si_lock);
+ break;
+ }
+ }
+ error = wrsnapblk(vp, saved_data, lbn);
+ if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
+ error = syncsnap(vp);
+ mutex_enter(&si->si_lock);
+ if (error)
+ break;
+ if (gen != si->si_gen)
+ goto retry;
+ }
+ si->si_owner = NULL;
+ mutex_exit(&si->si_lock);
+ mutex_exit(&si->si_snaplock);
+ if (saved_data)
+ free(saved_data, M_UFSMNT);
+ /*
+ * If we have been unable to allocate a block in which to do
+ * the copy, then return non-zero so that the fragment will
+ * not be freed. Although space will be lost, the snapshot
+ * will stay consistent.
+ */
+ return (error);
+}
+
+/*
+ * Associate snapshot files when mounting.
+ */
+void
+ffs_snapshot_mount(struct mount *mp)
+{
+ struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
+ struct fs *fs = VFSTOUFS(mp)->um_fs;
+ struct lwp *l = curlwp;
+ struct vnode *vp;
+ struct inode *ip, *xp;
+ struct snap_info *si;
+ daddr_t snaplistsize, *snapblklist;
+ int i, error, ns, snaploc, loc;
+
+ /*
+ * No persistent snapshots on apple ufs file systems.
+ */
+ if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
+ return;
+
+ si = VFSTOUFS(mp)->um_snapinfo;
+ ns = UFS_FSNEEDSWAP(fs);
+ /*
+ * XXX The following needs to be set before ffs_truncate or
+ * VOP_READ can be called.
+ */
+ mp->mnt_stat.f_iosize = fs->fs_bsize;
+ /*
+ * Process each snapshot listed in the superblock.
+ */
+ vp = NULL;
+ mutex_enter(&si->si_lock);
+ for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
+ if (fs->fs_snapinum[snaploc] == 0)
+ break;
+ if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
+ &vp)) != 0) {
+ printf("ffs_snapshot_mount: vget failed %d\n", error);
+ continue;
+ }
+ ip = VTOI(vp);
+ if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
+ SF_SNAPSHOT) {
+ printf("ffs_snapshot_mount: non-snapshot inode %d\n",
+ fs->fs_snapinum[snaploc]);
+ vput(vp);
+ vp = NULL;
+ for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
+ if (fs->fs_snapinum[loc] == 0)
+ break;
+ fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
+ }
+ fs->fs_snapinum[loc - 1] = 0;
+ snaploc--;
+ continue;
+ }
+
+ /*
+ * Read the block hints list. Use an empty list on
+ * read errors.
+ */
+ error = vn_rdwr(UIO_READ, vp,
+ (void *)&snaplistsize, sizeof(snaplistsize),
+ lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
+ UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
+ l->l_cred, NULL, NULL);
+ if (error) {
+ printf("ffs_snapshot_mount: read_1 failed %d\n", error);
+ snaplistsize = 1;
+ } else
+ snaplistsize = ufs_rw64(snaplistsize, ns);
+ snapblklist = malloc(
+ snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
+ if (error)
+ snapblklist[0] = 1;
+ else {
+ error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
+ snaplistsize * sizeof(daddr_t),
+ lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
+ UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
+ l->l_cred, NULL, NULL);
+ for (i = 0; i < snaplistsize; i++)
+ snapblklist[i] = ufs_rw64(snapblklist[i], ns);
+ if (error) {
+ printf("ffs_snapshot_mount: read_2 failed %d\n",
+ error);
+ snapblklist[0] = 1;
+ }
+ }
+ ip->i_snapblklist = &snapblklist[0];
+
+ /*
+ * Link it onto the active snapshot list.
+ */
+ if (is_active_snapshot(si, ip))
+ panic("ffs_snapshot_mount: %"PRIu64" already on list",
+ ip->i_number);
+ else
+ TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
+ vp->v_vflag |= VV_SYSTEM;
+ VOP_UNLOCK(vp);
+ }
+ /*
+ * No usable snapshots found.
+ */
+ if (vp == NULL) {
+ mutex_exit(&si->si_lock);
+ return;
+ }
+ /*
+ * Attach the block hints list. We always want to
+ * use the list from the newest snapshot.
+ */
+ xp = TAILQ_LAST(&si->si_snapshots, inodelst);
+ si->si_snapblklist = xp->i_snapblklist;
+ fscow_establish(mp, ffs_copyonwrite, devvp);
+ si->si_gen++;
+ mutex_exit(&si->si_lock);
+}
+
+/*
+ * Disassociate snapshot files when unmounting.
+ */
+void
+ffs_snapshot_unmount(struct mount *mp)
+{
+ struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
+ struct inode *xp;
+ struct vnode *vp = NULL;
+ struct snap_info *si;
+
+ si = VFSTOUFS(mp)->um_snapinfo;
+ mutex_enter(&si->si_lock);
+ while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
+ vp = ITOV(xp);
+ TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
+ if (xp->i_snapblklist == si->si_snapblklist)
+ si->si_snapblklist = NULL;
+ free(xp->i_snapblklist, M_UFSMNT);
+ if (xp->i_nlink > 0) {
+ si->si_gen++;
+ mutex_exit(&si->si_lock);
+ vrele(vp);
+ mutex_enter(&si->si_lock);
+ }
+ }
+ si->si_gen++;
+ mutex_exit(&si->si_lock);
+ if (vp)
+ fscow_disestablish(mp, ffs_copyonwrite, devvp);
+}
+
+/*
+ * Check for need to copy block that is about to be written,
+ * copying the block if necessary.
+ */
+static int
+ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
+{
+ struct fs *fs;
+ struct inode *ip;
+ struct vnode *devvp = v, *vp = NULL;
+ struct mount *mp = devvp->v_specmountpoint;
+ struct snap_info *si;
+ void *saved_data = NULL;
+ daddr_t lbn, blkno, *snapblklist;
+ uint32_t gen;
+ int lower, upper, mid, snapshot_locked = 0, error = 0;
+
+ /*
+ * Check for valid snapshots.
+ */
+ si = VFSTOUFS(mp)->um_snapinfo;
+ mutex_enter(&si->si_lock);
+ ip = TAILQ_FIRST(&si->si_snapshots);
+ if (ip == NULL) {
+ mutex_exit(&si->si_lock);
+ return 0;
+ }
+ /*
+ * First check to see if it is after the file system,
+ * in the journal or in the preallocated list.
+ * By doing these checks we avoid several potential deadlocks.
+ */
+ fs = ip->i_fs;
+ lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
+ if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
+ mutex_exit(&si->si_lock);
+ return 0;
+ }
+ if ((fs->fs_flags & FS_DOWAPBL) &&
+ fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
+ off_t blk_off, log_start, log_end;
+
+ log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
+ fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
+ log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
+ fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
+ blk_off = dbtob(bp->b_blkno);
+ if (blk_off >= log_start && blk_off < log_end) {
+ mutex_exit(&si->si_lock);
+ return 0;
+ }
+ }
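+ /*
+ * The preallocated list keeps its length (including this first slot)
+ * in snapblklist[0]; slots 1 .. snapblklist[0] - 1 hold, in ascending
+ * order, the logical block numbers that never need copying, so a
+ * binary search over that range answers the question quickly.
+ */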
+ snapblklist = si->si_snapblklist;
+ upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
+ lower = 1;
+ while (lower <= upper) {
+ mid = (lower + upper) / 2;
+ if (snapblklist[mid] == lbn)
+ break;
+ if (snapblklist[mid] < lbn)
+ lower = mid + 1;
+ else
+ upper = mid - 1;
+ }
+ if (lower <= upper) {
+ mutex_exit(&si->si_lock);
+ return 0;
+ }
+ /*
+ * Not in the precomputed list, so check the snapshots.
+ */
+ if (si->si_owner != curlwp) {
+ if (!mutex_tryenter(&si->si_snaplock)) {
+ mutex_exit(&si->si_lock);
+ mutex_enter(&si->si_snaplock);
+ mutex_enter(&si->si_lock);
+ }
+ si->si_owner = curlwp;
+ snapshot_locked = 1;
+ }
+ if (data_valid && bp->b_bcount == fs->fs_bsize)
+ saved_data = bp->b_data;
+retry:
+ gen = si->si_gen;
+ TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
+ vp = ITOV(ip);
+ /*
+ * We ensure that everything of our own that needs to be
+ * copied will be done at the time that ffs_snapshot is
+ * called. Thus we can skip the check here, which could
+ * deadlock doing the lookup in ffs_balloc.
+ */
+ if (bp->b_vp == vp)
+ continue;
+ /*
+ * Check to see if block needs to be copied.
+ */
+ if (lbn < NDADDR) {
+ blkno = db_get(ip, lbn);
+ } else {
+ mutex_exit(&si->si_lock);
+ blkno = 0; /* XXX: GCC */
+ if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
+ mutex_enter(&si->si_lock);
+ break;
+ }
+ mutex_enter(&si->si_lock);
+ if (gen != si->si_gen)
+ goto retry;
+ }
+#ifdef DIAGNOSTIC
+ if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
+ panic("ffs_copyonwrite: bad copy block");
+#endif
+ if (blkno != 0)
+ continue;
+
+ if (curlwp == uvm.pagedaemon_lwp) {
+ error = ENOMEM;
+ break;
+ }
+ /* Only one level of recursion allowed. */
+ KASSERT(snapshot_locked);
+ /*
+ * Allocate the block into which to do the copy. Since
+ * multiple processes may all try to copy the same block,
+ * we have to recheck our need to do a copy if we sleep
+ * waiting for the lock.
+ *
+ * Because all snapshots on a filesystem share a single
+ * lock, we ensure that we will never be in competition
+ * with another process to allocate a block.
+ */
+#ifdef DEBUG
+ if (snapdebug) {
+ printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
+ (unsigned long long)ip->i_number, lbn);
+ if (bp->b_vp == devvp)
+ printf("fs metadata");
+ else
+ printf("inum %llu", (unsigned long long)
+ VTOI(bp->b_vp)->i_number);
+ printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
+ }
+#endif
+ /*
+ * If we have already read the old block contents, then
+ * simply copy them to the new block. Note that we need
+ * to synchronously write snapshots that have not been
+ * unlinked, and hence will be visible after a crash,
+ * to ensure their integrity.
+ */
+ mutex_exit(&si->si_lock);
+ if (saved_data == NULL) {
+ saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
+ error = rwfsblk(vp, B_READ, saved_data, lbn);
+ if (error) {
+ free(saved_data, M_UFSMNT);
+ saved_data = NULL;
+ mutex_enter(&si->si_lock);
+ break;
+ }
+ }
+ error = wrsnapblk(vp, saved_data, lbn);
+ if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
+ error = syncsnap(vp);
+ mutex_enter(&si->si_lock);
+ if (error)
+ break;
+ if (gen != si->si_gen)
+ goto retry;
+ }
+ /*
+ * Note that we need to synchronously write snapshots that
+ * have not been unlinked, and hence will be visible after
+ * a crash, to ensure their integrity.
+ */
+ if (snapshot_locked) {
+ si->si_owner = NULL;
+ mutex_exit(&si->si_lock);
+ mutex_exit(&si->si_snaplock);
+ } else
+ mutex_exit(&si->si_lock);
+ if (saved_data && saved_data != bp->b_data)
+ free(saved_data, M_UFSMNT);
+ return error;
+}
+
+/*
+ * Read from a snapshot.
+ */
+int
+ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
+{
+ struct inode *ip = VTOI(vp);
+ struct fs *fs = ip->i_fs;
+ struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
+ struct buf *bp;
+ daddr_t lbn, nextlbn;
+ off_t fsbytes, bytesinfile;
+ long size, xfersize, blkoffset;
+ int error;
+
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+ mutex_enter(&si->si_snaplock);
+
+ if (ioflag & IO_ALTSEMANTICS)
+ fsbytes = ip->i_size;
+ else
+ fsbytes = lfragtosize(fs, fs->fs_size);
+ for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
+ bytesinfile = fsbytes - uio->uio_offset;
+ if (bytesinfile <= 0)
+ break;
+ lbn = lblkno(fs, uio->uio_offset);
+ nextlbn = lbn + 1;
+ size = fs->fs_bsize;
+ blkoffset = blkoff(fs, uio->uio_offset);
+ xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
+ bytesinfile);
+
+ if (lblktosize(fs, nextlbn + 1) >= fsbytes) {
+ if (lblktosize(fs, lbn) + size > fsbytes)
+ size = fragroundup(fs,
+ fsbytes - lblktosize(fs, lbn));
+ error = bread(vp, lbn, size, NOCRED, 0, &bp);
+ } else {
+ int nextsize = fs->fs_bsize;
+ error = breadn(vp, lbn,
+ size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
+ }
+ if (error)
+ break;
+
+ /*
+ * We should only get non-zero b_resid when an I/O error
+ * has occurred, which should cause us to break above.
+ * However, if the short read did not cause an error,
+ * then we want to ensure that we do not uiomove bad
+ * or uninitialized data.
+ */
+ size -= bp->b_resid;
+ if (size < blkoffset + xfersize) {
+ xfersize = size - blkoffset;
+ if (xfersize <= 0)
+ break;
+ }
+ error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+ if (error)
+ break;
+ brelse(bp, BC_AGE);
+ }
+ if (bp != NULL)
+ brelse(bp, BC_AGE);
+
+ mutex_exit(&si->si_snaplock);
+ fstrans_done(vp->v_mount);
+ return error;
+}
+
+/*
+ * Look up a snapshot's data block address.
+ * Simpler than UFS_BALLOC(), as all the metadata is already allocated;
+ * safe even for the pagedaemon, where we cannot bread().
+ */
+static int
+snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
+{
+ struct indir indirs[NIADDR + 2];
+ struct inode *ip = VTOI(vp);
+ struct fs *fs = ip->i_fs;
+ struct buf *bp;
+ int error, num;
+
+ KASSERT(lbn >= 0);
+
+ if (lbn < NDADDR) {
+ *res = db_get(ip, lbn);
+ return 0;
+ }
+ if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
+ return error;
+ if (curlwp == uvm.pagedaemon_lwp) {
+ mutex_enter(&bufcache_lock);
+ bp = incore(vp, indirs[num-1].in_lbn);
+ if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
+ *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
+ error = 0;
+ } else
+ error = ENOMEM;
+ mutex_exit(&bufcache_lock);
+ return error;
+ }
+ error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
+ if (error == 0)
+ *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
+ brelse(bp, 0);
+
+ return error;
+}
+
+/*
+ * Read or write the specified block of the filesystem vp resides on
+ * from or to the disk bypassing the buffer cache.
+ */
+static int
+rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
+{
+ int error;
+ struct inode *ip = VTOI(vp);
+ struct fs *fs = ip->i_fs;
+ struct buf *nbp;
+
+ nbp = getiobuf(NULL, true);
+ nbp->b_flags = flags;
+ nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
+ nbp->b_error = 0;
+ nbp->b_data = data;
+ nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
+ nbp->b_proc = NULL;
+ nbp->b_dev = ip->i_devvp->v_rdev;
+ SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */
+
+ bdev_strategy(nbp);
+
+ error = biowait(nbp);
+
+ putiobuf(nbp);
+
+ return error;
+}
+
+/*
+ * Write all dirty buffers to disk and invalidate them.
+ */
+static int
+syncsnap(struct vnode *vp)
+{
+ int error;
+ buf_t *bp;
+ struct fs *fs = VTOI(vp)->i_fs;
+
+ mutex_enter(&bufcache_lock);
+ while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
+ error = bbusy(bp, false, 0, NULL);
+ if (error == EPASSTHROUGH)
+ continue;
+ else if (error != 0) {
+ mutex_exit(&bufcache_lock);
+ return error;
+ }
+ KASSERT(bp->b_bcount == fs->fs_bsize);
+ mutex_exit(&bufcache_lock);
+ error = rwfsblk(vp, B_WRITE, bp->b_data,
+ fragstoblks(fs, dbtofsb(fs, bp->b_blkno)));
+ brelse(bp, BC_INVAL | BC_VFLUSH);
+ if (error)
+ return error;
+ mutex_enter(&bufcache_lock);
+ }
+ mutex_exit(&bufcache_lock);
+
+ return 0;
+}
+
+/*
+ * Write the specified block to a snapshot.
+ */
+static int
+wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
+{
+ struct inode *ip = VTOI(vp);
+ struct fs *fs = ip->i_fs;
+ struct buf *bp;
+ int error;
+
+ error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize,
+ FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
+ if (error)
+ return error;
+ memcpy(bp->b_data, data, fs->fs_bsize);
+ if (ip->i_nlink > 0)
+ error = bwrite(bp);
+ else
+ bawrite(bp);
+
+ return error;
+}
+
+/*
+ * Check if this inode is present on the active snapshot list.
+ * Must be called with snapinfo locked.
+ */
+static inline bool
+is_active_snapshot(struct snap_info *si, struct inode *ip)
+{
+ struct inode *xp;
+
+ KASSERT(mutex_owned(&si->si_lock));
+
+ TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
+ if (xp == ip)
+ return true;
+ return false;
+}
+
+/*
+ * Get/Put direct block from inode or buffer containing disk addresses. Take
+ * care of fs type (UFS1/UFS2) and byte swapping. These functions should go
+ * into a global include.
+ */
+static inline daddr_t
+db_get(struct inode *ip, int loc)
+{
+ if (ip->i_ump->um_fstype == UFS1)
+ return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
+ else
+ return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
+}
+
+static inline void
+db_assign(struct inode *ip, int loc, daddr_t val)
+{
+ if (ip->i_ump->um_fstype == UFS1)
+ ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
+ else
+ ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
+}
+
+static inline daddr_t
+ib_get(struct inode *ip, int loc)
+{
+ if (ip->i_ump->um_fstype == UFS1)
+ return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
+ else
+ return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
+}
+
+static inline void
+ib_assign(struct inode *ip, int loc, daddr_t val)
+{
+ if (ip->i_ump->um_fstype == UFS1)
+ ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
+ else
+ ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
+}
+
+static inline daddr_t
+idb_get(struct inode *ip, void *bf, int loc)
+{
+ if (ip->i_ump->um_fstype == UFS1)
+ return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
+ else
+ return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
+}
+
+static inline void
+idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
+{
+ if (ip->i_ump->um_fstype == UFS1)
+ ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
+ else
+ ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
+}
--- /dev/null
+/* $NetBSD: ffs_subr.c,v 1.47 2011/08/14 12:37:09 christos Exp $ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_subr.c 8.5 (Berkeley) 3/21/95
+ */
+
+#if HAVE_NBTOOL_CONFIG_H
+#include "nbtool_config.h"
+#endif
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_subr.c,v 1.47 2011/08/14 12:37:09 christos Exp $");
+
+#include <sys/param.h>
+
+/* in ffs_tables.c */
+extern const int inside[], around[];
+extern const u_char * const fragtbl[];
+
+#ifndef _KERNEL
+#define FFS_EI /* always include byteswapped filesystems support */
+#endif
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+
+#ifndef _KERNEL
+#include <ufs/ufs/dinode.h>
+void panic(const char *, ...)
+ __attribute__((__noreturn__,__format__(__printf__,1,2)));
+
+#else /* _KERNEL */
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/buf.h>
+#include <sys/inttypes.h>
+#include <sys/pool.h>
+#include <sys/fstrans.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+/*
+ * Load up the contents of an inode and copy the appropriate pieces
+ * to the incore copy.
+ */
+void
+ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino)
+{
+ struct ufs1_dinode *dp1;
+ struct ufs2_dinode *dp2;
+
+ if (ip->i_ump->um_fstype == UFS1) {
+ dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino);
+#ifdef FFS_EI
+ if (UFS_FSNEEDSWAP(fs))
+ ffs_dinode1_swap(dp1, ip->i_din.ffs1_din);
+ else
+#endif
+ *ip->i_din.ffs1_din = *dp1;
+
+ ip->i_mode = ip->i_ffs1_mode;
+ ip->i_nlink = ip->i_ffs1_nlink;
+ ip->i_size = ip->i_ffs1_size;
+ ip->i_flags = ip->i_ffs1_flags;
+ ip->i_gen = ip->i_ffs1_gen;
+ ip->i_uid = ip->i_ffs1_uid;
+ ip->i_gid = ip->i_ffs1_gid;
+ } else {
+ dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino);
+#ifdef FFS_EI
+ if (UFS_FSNEEDSWAP(fs))
+ ffs_dinode2_swap(dp2, ip->i_din.ffs2_din);
+ else
+#endif
+ *ip->i_din.ffs2_din = *dp2;
+
+ ip->i_mode = ip->i_ffs2_mode;
+ ip->i_nlink = ip->i_ffs2_nlink;
+ ip->i_size = ip->i_ffs2_size;
+ ip->i_flags = ip->i_ffs2_flags;
+ ip->i_gen = ip->i_ffs2_gen;
+ ip->i_uid = ip->i_ffs2_uid;
+ ip->i_gid = ip->i_ffs2_gid;
+ }
+}
+
+int
+ffs_getblk(struct vnode *vp, daddr_t lblkno, daddr_t blkno, int size,
+ bool clearbuf, buf_t **bpp)
+{
+ int error = 0;
+
+ KASSERT(blkno >= 0 || blkno == FFS_NOBLK);
+
+ if ((*bpp = getblk(vp, lblkno, size, 0, 0)) == NULL)
+ return ENOMEM;
+ if (blkno != FFS_NOBLK)
+ (*bpp)->b_blkno = blkno;
+ if (clearbuf)
+ clrbuf(*bpp);
+ if ((*bpp)->b_blkno >= 0 && (error = fscow_run(*bpp, false)) != 0)
+ brelse(*bpp, BC_INVAL);
+ return error;
+}
+
+#endif /* _KERNEL */
+
+/*
+ * Update the frsum fields to reflect addition or deletion
+ * of some frags.
+ */
+void
+ffs_fragacct(struct fs *fs, int fragmap, int32_t fraglist[], int cnt,
+ int needswap)
+{
+ int inblk;
+ int field, subfield;
+ int siz, pos;
+
+ inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
+ fragmap <<= 1;
+ for (siz = 1; siz < fs->fs_frag; siz++) {
+ if ((inblk & (1 << (siz + (fs->fs_frag & (NBBY - 1))))) == 0)
+ continue;
+ field = around[siz];
+ subfield = inside[siz];
+ for (pos = siz; pos <= fs->fs_frag; pos++) {
+ if ((fragmap & field) == subfield) {
+ fraglist[siz] = ufs_rw32(
+ ufs_rw32(fraglist[siz], needswap) + cnt,
+ needswap);
+ pos += siz;
+ field <<= siz;
+ subfield <<= siz;
+ }
+ field <<= 1;
+ subfield <<= 1;
+ }
+ }
+}
+
+/*
+ * block operations
+ *
+ * check if a block is available
+ * returns true if all the corresponding bits in the free map are 1
+ * returns false if any corresponding bit in the free map is 0
+ */
+int
+ffs_isblock(struct fs *fs, u_char *cp, int32_t h)
+{
+ u_char mask;
+
+ switch ((int)fs->fs_fragshift) {
+ case 3:
+ return (cp[h] == 0xff);
+ case 2:
+ mask = 0x0f << ((h & 0x1) << 2);
+ return ((cp[h >> 1] & mask) == mask);
+ case 1:
+ mask = 0x03 << ((h & 0x3) << 1);
+ return ((cp[h >> 2] & mask) == mask);
+ case 0:
+ mask = 0x01 << (h & 0x7);
+ return ((cp[h >> 3] & mask) == mask);
+ default:
+ panic("ffs_isblock: unknown fs_fragshift %d",
+ (int)fs->fs_fragshift);
+ }
+}
+
+/*
+ * check if a block is completely allocated
+ * returns true if all the corresponding bits in the free map are 0
+ * returns false if any corresponding bit in the free map is 1
+ */
+int
+ffs_isfreeblock(struct fs *fs, u_char *cp, int32_t h)
+{
+
+ switch ((int)fs->fs_fragshift) {
+ case 3:
+ return (cp[h] == 0);
+ case 2:
+ return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
+ case 1:
+ return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
+ case 0:
+ return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
+ default:
+ panic("ffs_isfreeblock: unknown fs_fragshift %d",
+ (int)fs->fs_fragshift);
+ }
+}
+
+/*
+ * take a block out of the map
+ */
+void
+ffs_clrblock(struct fs *fs, u_char *cp, int32_t h)
+{
+
+ switch ((int)fs->fs_fragshift) {
+ case 3:
+ cp[h] = 0;
+ return;
+ case 2:
+ cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
+ return;
+ case 1:
+ cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
+ return;
+ case 0:
+ cp[h >> 3] &= ~(0x01 << (h & 0x7));
+ return;
+ default:
+ panic("ffs_clrblock: unknown fs_fragshift %d",
+ (int)fs->fs_fragshift);
+ }
+}
+
+/*
+ * put a block into the map
+ */
+void
+ffs_setblock(struct fs *fs, u_char *cp, int32_t h)
+{
+
+ switch ((int)fs->fs_fragshift) {
+ case 3:
+ cp[h] = 0xff;
+ return;
+ case 2:
+ cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
+ return;
+ case 1:
+ cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
+ return;
+ case 0:
+ cp[h >> 3] |= (0x01 << (h & 0x7));
+ return;
+ default:
+ panic("ffs_setblock: unknown fs_fragshift %d",
+ (int)fs->fs_fragshift);
+ }
+}
+
+/*
+ * Update the cluster map because of an allocation or free.
+ *
+ * Cnt == 1 means free; cnt == -1 means allocating.
+ */
+void
+ffs_clusteracct(struct fs *fs, struct cg *cgp, int32_t blkno, int cnt)
+{
+ int32_t *sump;
+ int32_t *lp;
+ u_char *freemapp, *mapp;
+ int i, start, end, forw, back, map, bit;
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+ /* KASSERT(mutex_owned(&ump->um_lock)); */
+
+ if (fs->fs_contigsumsize <= 0)
+ return;
+ freemapp = cg_clustersfree(cgp, needswap);
+ sump = cg_clustersum(cgp, needswap);
+ /*
+ * Allocate or clear the actual block.
+ */
+ if (cnt > 0)
+ setbit(freemapp, blkno);
+ else
+ clrbit(freemapp, blkno);
+ /*
+ * Find the size of the cluster going forward.
+ */
+ start = blkno + 1;
+ end = start + fs->fs_contigsumsize;
+ if ((uint32_t)end >= ufs_rw32(cgp->cg_nclusterblks, needswap))
+ end = ufs_rw32(cgp->cg_nclusterblks, needswap);
+ mapp = &freemapp[start / NBBY];
+ map = *mapp++;
+ bit = 1 << (start % NBBY);
+ for (i = start; i < end; i++) {
+ if ((map & bit) == 0)
+ break;
+ if ((i & (NBBY - 1)) != (NBBY - 1)) {
+ bit <<= 1;
+ } else {
+ map = *mapp++;
+ bit = 1;
+ }
+ }
+ forw = i - start;
+ /*
+ * Find the size of the cluster going backward.
+ */
+ start = blkno - 1;
+ end = start - fs->fs_contigsumsize;
+ if (end < 0)
+ end = -1;
+ mapp = &freemapp[start / NBBY];
+ map = *mapp--;
+ bit = 1 << (start % NBBY);
+ for (i = start; i > end; i--) {
+ if ((map & bit) == 0)
+ break;
+ if ((i & (NBBY - 1)) != 0) {
+ bit >>= 1;
+ } else {
+ map = *mapp--;
+ bit = 1 << (NBBY - 1);
+ }
+ }
+ back = start - i;
+ /*
+ * Account for old cluster and the possibly new forward and
+ * back clusters.
+ */
+ i = back + forw + 1;
+ if (i > fs->fs_contigsumsize)
+ i = fs->fs_contigsumsize;
+ ufs_add32(sump[i], cnt, needswap);
+ if (back > 0)
+ ufs_add32(sump[back], -cnt, needswap);
+ if (forw > 0)
+ ufs_add32(sump[forw], -cnt, needswap);
+
+ /*
+ * Update cluster summary information.
+ */
+ lp = &sump[fs->fs_contigsumsize];
+ for (i = fs->fs_contigsumsize; i > 0; i--)
+ if (ufs_rw32(*lp--, needswap) > 0)
+ break;
+#if defined(_KERNEL)
+ fs->fs_maxcluster[ufs_rw32(cgp->cg_cgx, needswap)] = i;
+#endif
+}
--- /dev/null
+/* $NetBSD: ffs_tables.c,v 1.9 2005/12/11 12:25:25 christos Exp $ */
+
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_tables.c 8.1 (Berkeley) 6/11/93
+ */
+
+#if HAVE_NBTOOL_CONFIG_H
+#include "nbtool_config.h"
+#endif
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_tables.c,v 1.9 2005/12/11 12:25:25 christos Exp $");
+
+#include <sys/param.h>
+
+/*
+ * Bit patterns for identifying fragments in the block map
+ * used as ((map & around) == inside)
+ */
+const int around[9] = {
+ 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff
+};
+const int inside[9] = {
+ 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
+};
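+
+/*
+ * A worked example of the pattern above: around[2] == 0xf and
+ * inside[2] == 0x6, so a map whose low four bits are 0110 (a run of
+ * exactly two free fragments with no free neighbour on either side)
+ * satisfies ((map & around[2]) == inside[2]).
+ */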
+
+/*
+ * Given a block map bit pattern, the frag tables tell whether a
+ * particular size fragment is available.
+ *
+ * used as:
+ * if ((1 << (size - 1)) & fragtbl[fs->fs_frag][map]) {
+ * at least one fragment of the indicated size is available
+ * }
+ *
+ * These tables are used by the scanc instruction on the VAX to
+ * quickly find an appropriate fragment.
+ */
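+
+/*
+ * A worked example against fragtbl8 below: with fs_frag == 8 and
+ * map == 0x0c (two adjacent free fragments), fragtbl8[0x0c] == 0x02,
+ * so (1 << (2 - 1)) & fragtbl[fs->fs_frag][map] is non-zero and a
+ * fragment of size 2 is available.
+ */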
+const u_char fragtbl124[256] = {
+ 0x00, 0x16, 0x16, 0x2a, 0x16, 0x16, 0x26, 0x4e,
+ 0x16, 0x16, 0x16, 0x3e, 0x2a, 0x3e, 0x4e, 0x8a,
+ 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+ 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+ 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+ 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+ 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e,
+ 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa,
+ 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+ 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+ 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+ 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+ 0x26, 0x36, 0x36, 0x2e, 0x36, 0x36, 0x26, 0x6e,
+ 0x36, 0x36, 0x36, 0x3e, 0x2e, 0x3e, 0x6e, 0xae,
+ 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e,
+ 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce,
+ 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+ 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+ 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+ 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+ 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+ 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+ 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e,
+ 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe,
+ 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e,
+ 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa,
+ 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e,
+ 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe,
+ 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e,
+ 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce,
+ 0x8a, 0x9e, 0x9e, 0xaa, 0x9e, 0x9e, 0xae, 0xce,
+ 0x9e, 0x9e, 0x9e, 0xbe, 0xaa, 0xbe, 0xce, 0x8a,
+};
+
+const u_char fragtbl8[256] = {
+ 0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04,
+ 0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08,
+ 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+ 0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10,
+ 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+ 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
+ 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06,
+ 0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20,
+ 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+ 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
+ 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+ 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11,
+ 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06,
+ 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a,
+ 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04,
+ 0x08, 0x09, 0x09, 0x0a, 0x10, 0x11, 0x20, 0x40,
+ 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+ 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
+ 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+ 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11,
+ 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+ 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07,
+ 0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21,
+ 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06,
+ 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a,
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07,
+ 0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0a, 0x12,
+ 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04,
+ 0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0c,
+ 0x08, 0x09, 0x09, 0x0a, 0x09, 0x09, 0x0a, 0x0c,
+ 0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80,
+};
+
+/*
+ * The actual fragtbl array.
+ */
+const u_char * const fragtbl[MAXFRAG + 1] = {
+ 0, fragtbl124, fragtbl124, 0, fragtbl124, 0, 0, 0, fragtbl8,
+};
--- /dev/null
+/* $NetBSD: ffs_vfsops.c,v 1.271 2011/11/14 18:35:14 hannken Exp $ */
+
+/*-
+ * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc, and by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1989, 1991, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.271 2011/11/14 18:35:14 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#include "opt_wapbl.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/socket.h>
+#include <sys/mount.h>
+#include <sys/buf.h>
+#include <sys/device.h>
+#include <sys/disk.h>
+#include <sys/mbuf.h>
+#include <sys/file.h>
+#include <sys/disklabel.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/lock.h>
+#include <sys/sysctl.h>
+#include <sys/conf.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+#include <sys/module.h>
+
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+MODULE(MODULE_CLASS_VFS, ffs, NULL);
+
+static int ffs_vfs_fsync(vnode_t *, int);
+
+static struct sysctllog *ffs_sysctl_log;
+
+/* how many times ffs_init() was called */
+int ffs_initcount = 0;
+
+extern const struct vnodeopv_desc ffs_vnodeop_opv_desc;
+extern const struct vnodeopv_desc ffs_specop_opv_desc;
+extern const struct vnodeopv_desc ffs_fifoop_opv_desc;
+
+const struct vnodeopv_desc * const ffs_vnodeopv_descs[] = {
+ &ffs_vnodeop_opv_desc,
+ &ffs_specop_opv_desc,
+ &ffs_fifoop_opv_desc,
+ NULL,
+};
+
+struct vfsops ffs_vfsops = {
+ MOUNT_FFS,
+ sizeof (struct ufs_args),
+ ffs_mount,
+ ufs_start,
+ ffs_unmount,
+ ufs_root,
+ ufs_quotactl,
+ ffs_statvfs,
+ ffs_sync,
+ ffs_vget,
+ ffs_fhtovp,
+ ffs_vptofh,
+ ffs_init,
+ ffs_reinit,
+ ffs_done,
+ ffs_mountroot,
+ ffs_snapshot,
+ ffs_extattrctl,
+ ffs_suspendctl,
+ genfs_renamelock_enter,
+ genfs_renamelock_exit,
+ ffs_vfs_fsync,
+ ffs_vnodeopv_descs,
+ 0,
+ { NULL, NULL },
+};
+
+static const struct genfs_ops ffs_genfsops = {
+ .gop_size = ffs_gop_size,
+ .gop_alloc = ufs_gop_alloc,
+ .gop_write = genfs_gop_write,
+ .gop_markupdate = ufs_gop_markupdate,
+};
+
+static const struct ufs_ops ffs_ufsops = {
+ .uo_itimes = ffs_itimes,
+ .uo_update = ffs_update,
+ .uo_truncate = ffs_truncate,
+ .uo_valloc = ffs_valloc,
+ .uo_vfree = ffs_vfree,
+ .uo_balloc = ffs_balloc,
+ .uo_unmark_vnode = (void (*)(vnode_t *))nullop,
+};
+
+static int
+ffs_modcmd(modcmd_t cmd, void *arg)
+{
+ int error;
+
+#if 0
+ extern int doasyncfree;
+#endif
+#ifdef UFS_EXTATTR
+ extern int ufs_extattr_autocreate;
+#endif
+ extern int ffs_log_changeopt;
+
+ switch (cmd) {
+ case MODULE_CMD_INIT:
+ error = vfs_attach(&ffs_vfsops);
+ if (error != 0)
+ break;
+
+ sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "vfs", NULL,
+ NULL, 0, NULL, 0,
+ CTL_VFS, CTL_EOL);
+ sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "ffs",
+ SYSCTL_DESCR("Berkeley Fast File System"),
+ NULL, 0, NULL, 0,
+ CTL_VFS, 1, CTL_EOL);
+ /*
+ * @@@ should we even bother with these first three?
+ */
+ sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "doclusterread", NULL,
+ sysctl_notavail, 0, NULL, 0,
+ CTL_VFS, 1, FFS_CLUSTERREAD, CTL_EOL);
+ sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "doclusterwrite", NULL,
+ sysctl_notavail, 0, NULL, 0,
+ CTL_VFS, 1, FFS_CLUSTERWRITE, CTL_EOL);
+ sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "doreallocblks", NULL,
+ sysctl_notavail, 0, NULL, 0,
+ CTL_VFS, 1, FFS_REALLOCBLKS, CTL_EOL);
+#if 0
+ sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "doasyncfree",
+ SYSCTL_DESCR("Release dirty blocks asynchronously"),
+ NULL, 0, &doasyncfree, 0,
+ CTL_VFS, 1, FFS_ASYNCFREE, CTL_EOL);
+#endif
+ sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "log_changeopt",
+ SYSCTL_DESCR("Log changes in optimization strategy"),
+ NULL, 0, &ffs_log_changeopt, 0,
+ CTL_VFS, 1, FFS_LOG_CHANGEOPT, CTL_EOL);
+#ifdef UFS_EXTATTR
+ sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "extattr_autocreate",
+ SYSCTL_DESCR("Size of attribute for "
+ "backing file autocreation"),
+ NULL, 0, &ufs_extattr_autocreate, 0,
+ CTL_VFS, 1, FFS_EXTATTR_AUTOCREATE, CTL_EOL);
+
+#endif /* UFS_EXTATTR */
+
+ break;
+ case MODULE_CMD_FINI:
+ error = vfs_detach(&ffs_vfsops);
+ if (error != 0)
+ break;
+ sysctl_teardown(&ffs_sysctl_log);
+ break;
+ default:
+ error = ENOTTY;
+ break;
+ }
+
+ return (error);
+}
+
+pool_cache_t ffs_inode_cache;
+pool_cache_t ffs_dinode1_cache;
+pool_cache_t ffs_dinode2_cache;
+
+static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, daddr_t);
+static void ffs_oldfscompat_write(struct fs *, struct ufsmount *);
+
+/*
+ * Called by main() when ffs is going to be mounted as root.
+ */
+
+int
+ffs_mountroot(void)
+{
+ struct fs *fs;
+ struct mount *mp;
+ struct lwp *l = curlwp; /* XXX */
+ struct ufsmount *ump;
+ int error;
+
+ if (device_class(root_device) != DV_DISK)
+ return (ENODEV);
+
+ if ((error = vfs_rootmountalloc(MOUNT_FFS, "root_device", &mp))) {
+ vrele(rootvp);
+ return (error);
+ }
+
+ /*
+ * We always need to be able to mount the root file system.
+ */
+ mp->mnt_flag |= MNT_FORCE;
+ if ((error = ffs_mountfs(rootvp, mp, l)) != 0) {
+ vfs_unbusy(mp, false, NULL);
+ vfs_destroy(mp);
+ return (error);
+ }
+ mp->mnt_flag &= ~MNT_FORCE;
+ mutex_enter(&mountlist_lock);
+ CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mutex_exit(&mountlist_lock);
+ ump = VFSTOUFS(mp);
+ fs = ump->um_fs;
+ memset(fs->fs_fsmnt, 0, sizeof(fs->fs_fsmnt));
+ (void)copystr(mp->mnt_stat.f_mntonname, fs->fs_fsmnt, MNAMELEN - 1, 0);
+ (void)ffs_statvfs(mp, &mp->mnt_stat);
+ vfs_unbusy(mp, false, NULL);
+ setrootfstime((time_t)fs->fs_time);
+ return (0);
+}
+
+/*
+ * VFS Operations.
+ *
+ * mount system call
+ */
+int
+ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
+{
+ struct lwp *l = curlwp;
+ struct vnode *devvp = NULL;
+ struct ufs_args *args = data;
+ struct ufsmount *ump = NULL;
+ struct fs *fs;
+ int error = 0, flags, update;
+ mode_t accessmode;
+
+ if (*data_len < sizeof *args)
+ return EINVAL;
+
+ if (mp->mnt_flag & MNT_GETARGS) {
+ ump = VFSTOUFS(mp);
+ if (ump == NULL)
+ return EIO;
+ args->fspec = NULL;
+ *data_len = sizeof *args;
+ return 0;
+ }
+
+ update = mp->mnt_flag & MNT_UPDATE;
+
+ /* Check arguments */
+ if (args->fspec != NULL) {
+ /*
+ * Look up the name and verify that it's sane.
+ */
+ error = namei_simple_user(args->fspec,
+ NSM_FOLLOW_NOEMULROOT, &devvp);
+ if (error != 0)
+ return (error);
+
+ if (!update) {
+ /*
+ * Be sure this is a valid block device
+ */
+ if (devvp->v_type != VBLK)
+ error = ENOTBLK;
+ else if (bdevsw_lookup(devvp->v_rdev) == NULL)
+ error = ENXIO;
+ } else {
+ /*
+ * Be sure we're still naming the same device
+ * used for our initial mount
+ */
+ ump = VFSTOUFS(mp);
+ if (devvp != ump->um_devvp) {
+ if (devvp->v_rdev != ump->um_devvp->v_rdev)
+ error = EINVAL;
+ else {
+ vrele(devvp);
+ devvp = ump->um_devvp;
+ vref(devvp);
+ }
+ }
+ }
+ } else {
+ if (!update) {
+ /* New mounts must have a filename for the device */
+ return (EINVAL);
+ } else {
+ /* Use the extant mount */
+ ump = VFSTOUFS(mp);
+ devvp = ump->um_devvp;
+ vref(devvp);
+ }
+ }
+
+ /*
+ * If mount by non-root, then verify that user has necessary
+ * permissions on the device.
+ *
+ * Permission to update a mount is checked higher, so here we presume
+ * updating the mount is okay (for example, as far as securelevel goes)
+ * which leaves us with the normal check.
+ */
+ if (error == 0) {
+ accessmode = VREAD;
+ if (update ?
+ (mp->mnt_iflag & IMNT_WANTRDWR) != 0 :
+ (mp->mnt_flag & MNT_RDONLY) == 0)
+ accessmode |= VWRITE;
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = genfs_can_mount(devvp, accessmode, l->l_cred);
+ VOP_UNLOCK(devvp);
+ }
+
+ if (error) {
+ vrele(devvp);
+ return (error);
+ }
+
+#ifdef WAPBL
+ /* WAPBL can only be enabled on a r/w mount. */
+ if ((mp->mnt_flag & MNT_RDONLY) && !(mp->mnt_iflag & IMNT_WANTRDWR)) {
+ mp->mnt_flag &= ~MNT_LOG;
+ }
+#else /* !WAPBL */
+ mp->mnt_flag &= ~MNT_LOG;
+#endif /* !WAPBL */
+
+ if (!update) {
+ int xflags;
+
+ if (mp->mnt_flag & MNT_RDONLY)
+ xflags = FREAD;
+ else
+ xflags = FREAD | FWRITE;
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_OPEN(devvp, xflags, FSCRED);
+ VOP_UNLOCK(devvp);
+ if (error)
+ goto fail;
+ error = ffs_mountfs(devvp, mp, l);
+ if (error) {
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ (void)VOP_CLOSE(devvp, xflags, NOCRED);
+ VOP_UNLOCK(devvp);
+ goto fail;
+ }
+
+ ump = VFSTOUFS(mp);
+ fs = ump->um_fs;
+ } else {
+ /*
+ * Update the mount.
+ */
+
+ /*
+ * The initial mount got a reference on this
+ * device, so drop the one obtained via
+ * namei(), above.
+ */
+ vrele(devvp);
+
+ ump = VFSTOUFS(mp);
+ fs = ump->um_fs;
+ if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
+ /*
+ * Changing from r/w to r/o
+ */
+ flags = WRITECLOSE;
+ if (mp->mnt_flag & MNT_FORCE)
+ flags |= FORCECLOSE;
+ error = ffs_flushfiles(mp, flags, l);
+ if (error == 0)
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error == 0 &&
+ ffs_cgupdate(ump, MNT_WAIT) == 0 &&
+ fs->fs_clean & FS_WASCLEAN) {
+ if (mp->mnt_flag & MNT_SOFTDEP)
+ fs->fs_flags &= ~FS_DOSOFTDEP;
+ fs->fs_clean = FS_ISCLEAN;
+ (void) ffs_sbupdate(ump, MNT_WAIT);
+ }
+ if (error == 0)
+ UFS_WAPBL_END(mp);
+ if (error)
+ return (error);
+ }
+
+#ifdef WAPBL
+ if ((mp->mnt_flag & MNT_LOG) == 0) {
+ error = ffs_wapbl_stop(mp, mp->mnt_flag & MNT_FORCE);
+ if (error)
+ return error;
+ }
+#endif /* WAPBL */
+
+ if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
+ /*
+ * Finish change from r/w to r/o
+ */
+ fs->fs_ronly = 1;
+ fs->fs_fmod = 0;
+ }
+
+ if (mp->mnt_flag & MNT_RELOAD) {
+ error = ffs_reload(mp, l->l_cred, l);
+ if (error)
+ return (error);
+ }
+
+ if (fs->fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) {
+ /*
+ * Changing from read-only to read/write
+ */
+#ifndef QUOTA2
+ if (fs->fs_flags & FS_DOQUOTA2) {
+ ump->um_flags |= UFS_QUOTA2;
+ uprintf("%s: options QUOTA2 not enabled%s\n",
+ mp->mnt_stat.f_mntonname,
+ (mp->mnt_flag & MNT_FORCE) ? "" :
+ ", not mounting");
+ return EINVAL;
+ }
+#endif
+ fs->fs_ronly = 0;
+ fs->fs_clean <<= 1;
+ fs->fs_fmod = 1;
+#ifdef WAPBL
+ if (fs->fs_flags & FS_DOWAPBL) {
+ printf("%s: replaying log to disk\n",
+ fs->fs_fsmnt);
+ KDASSERT(mp->mnt_wapbl_replay);
+ error = wapbl_replay_write(mp->mnt_wapbl_replay,
+ devvp);
+ if (error) {
+ return error;
+ }
+ wapbl_replay_stop(mp->mnt_wapbl_replay);
+ fs->fs_clean = FS_WASCLEAN;
+ }
+#endif /* WAPBL */
+ if (fs->fs_snapinum[0] != 0)
+ ffs_snapshot_mount(mp);
+ }
+
+#ifdef WAPBL
+ error = ffs_wapbl_start(mp);
+ if (error)
+ return error;
+#endif /* WAPBL */
+
+#ifdef QUOTA2
+ if (!fs->fs_ronly) {
+ error = ffs_quota2_mount(mp);
+ if (error) {
+ return error;
+ }
+ }
+#endif
+ if (args->fspec == NULL)
+ return 0;
+ }
+
+ error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
+ UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
+ if (error == 0)
+ (void)strncpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname,
+ sizeof(fs->fs_fsmnt));
+ fs->fs_flags &= ~FS_DOSOFTDEP;
+ if (fs->fs_fmod != 0) { /* XXX */
+ int err;
+
+ fs->fs_fmod = 0;
+ if (fs->fs_clean & FS_WASCLEAN)
+ fs->fs_time = time_second;
+ else {
+ printf("%s: file system not clean (fs_clean=%#x); "
+ "please fsck(8)\n", mp->mnt_stat.f_mntfromname,
+ fs->fs_clean);
+ printf("%s: lost blocks %" PRId64 " files %d\n",
+ mp->mnt_stat.f_mntfromname, fs->fs_pendingblocks,
+ fs->fs_pendinginodes);
+ }
+ err = UFS_WAPBL_BEGIN(mp);
+ if (err == 0) {
+ (void) ffs_cgupdate(ump, MNT_WAIT);
+ UFS_WAPBL_END(mp);
+ }
+ }
+ if ((mp->mnt_flag & MNT_SOFTDEP) != 0) {
+ printf("%s: `-o softdep' is no longer supported, "
+ "consider `-o log'\n", mp->mnt_stat.f_mntfromname);
+ mp->mnt_flag &= ~MNT_SOFTDEP;
+ }
+
+ return (error);
+
+fail:
+ vrele(devvp);
+ return (error);
+}
+
+/*
+ * Reload all incore data for a filesystem (used after running fsck on
+ * the root filesystem and finding things to fix). The filesystem must
+ * be mounted read-only.
+ *
+ * Things to do to update the mount:
+ * 1) invalidate all cached meta-data.
+ * 2) re-read superblock from disk.
+ * 3) re-read summary information from disk.
+ * 4) invalidate all inactive vnodes.
+ * 5) invalidate all cached file data.
+ * 6) re-read inode data for all active vnodes.
+ */
+int
+ffs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l)
+{
+ struct vnode *vp, *mvp, *devvp;
+ struct inode *ip;
+ void *space;
+ struct buf *bp;
+ struct fs *fs, *newfs;
+ struct dkwedge_info dkw;
+ int i, bsize, blks, error;
+ int32_t *lp;
+ struct ufsmount *ump;
+ daddr_t sblockloc;
+
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ return (EINVAL);
+
+ ump = VFSTOUFS(mp);
+ /*
+ * Step 1: invalidate all cached meta-data.
+ */
+ devvp = ump->um_devvp;
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = vinvalbuf(devvp, 0, cred, l, 0, 0);
+ VOP_UNLOCK(devvp);
+ if (error)
+ panic("ffs_reload: dirty1");
+ /*
+ * Step 2: re-read superblock from disk.
+ */
+ fs = ump->um_fs;
+
+ /* XXX we don't handle possibility that superblock moved. */
+ error = bread(devvp, fs->fs_sblockloc / DEV_BSIZE, fs->fs_sbsize,
+ NOCRED, 0, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+ newfs = malloc(fs->fs_sbsize, M_UFSMNT, M_WAITOK);
+ memcpy(newfs, bp->b_data, fs->fs_sbsize);
+#ifdef FFS_EI
+ if (ump->um_flags & UFS_NEEDSWAP) {
+ ffs_sb_swap((struct fs*)bp->b_data, newfs);
+ fs->fs_flags |= FS_SWAPPED;
+ } else
+#endif
+ fs->fs_flags &= ~FS_SWAPPED;
+ if ((newfs->fs_magic != FS_UFS1_MAGIC &&
+ newfs->fs_magic != FS_UFS2_MAGIC)||
+ newfs->fs_bsize > MAXBSIZE ||
+ newfs->fs_bsize < sizeof(struct fs)) {
+ brelse(bp, 0);
+ free(newfs, M_UFSMNT);
+ return (EIO); /* XXX needs translation */
+ }
+ /* Store off old fs_sblockloc for fs_oldfscompat_read. */
+ sblockloc = fs->fs_sblockloc;
+ /*
+ * Copy pointer fields back into superblock before copying in XXX
+ * new superblock. These should really be in the ufsmount. XXX
+ * Note that important parameters (eg fs_ncg) are unchanged.
+ */
+ newfs->fs_csp = fs->fs_csp;
+ newfs->fs_maxcluster = fs->fs_maxcluster;
+ newfs->fs_contigdirs = fs->fs_contigdirs;
+ newfs->fs_ronly = fs->fs_ronly;
+ newfs->fs_active = fs->fs_active;
+ memcpy(fs, newfs, (u_int)fs->fs_sbsize);
+ brelse(bp, 0);
+ free(newfs, M_UFSMNT);
+
+ /* Recheck for apple UFS filesystem */
+ ump->um_flags &= ~UFS_ISAPPLEUFS;
+ /* First check to see if this is tagged as an Apple UFS filesystem
+ * in the disklabel
+ */
+ if (getdiskinfo(devvp, &dkw) == 0 &&
+ strcmp(dkw.dkw_ptype, DKW_PTYPE_APPLEUFS) == 0)
+ ump->um_flags |= UFS_ISAPPLEUFS;
+#ifdef APPLE_UFS
+ else {
+ /* Manually look for an apple ufs label, and if a valid one
+ * is found, then treat it like an Apple UFS filesystem anyway
+ *
+ * EINVAL is most probably a block size or alignment problem;
+ * in that case it is unlikely that this is an Apple UFS filesystem.
+ */
+ error = bread(devvp, (daddr_t)(APPLEUFS_LABEL_OFFSET / DEV_BSIZE),
+ APPLEUFS_LABEL_SIZE, cred, 0, &bp);
+ if (error && error != EINVAL) {
+ brelse(bp, 0);
+ return (error);
+ }
+ if (error == 0) {
+ error = ffs_appleufs_validate(fs->fs_fsmnt,
+ (struct appleufslabel *)bp->b_data, NULL);
+ if (error == 0)
+ ump->um_flags |= UFS_ISAPPLEUFS;
+ }
+ brelse(bp, 0);
+ bp = NULL;
+ }
+#else
+ if (ump->um_flags & UFS_ISAPPLEUFS)
+ return (EIO);
+#endif
+
+ if (UFS_MPISAPPLEUFS(ump)) {
+ /* see comment about NeXT below */
+ ump->um_maxsymlinklen = APPLEUFS_MAXSYMLINKLEN;
+ ump->um_dirblksiz = APPLEUFS_DIRBLKSIZ;
+ mp->mnt_iflag |= IMNT_DTYPE;
+ } else {
+ ump->um_maxsymlinklen = fs->fs_maxsymlinklen;
+ ump->um_dirblksiz = DIRBLKSIZ;
+ if (ump->um_maxsymlinklen > 0)
+ mp->mnt_iflag |= IMNT_DTYPE;
+ else
+ mp->mnt_iflag &= ~IMNT_DTYPE;
+ }
+ ffs_oldfscompat_read(fs, ump, sblockloc);
+
+ mutex_enter(&ump->um_lock);
+ ump->um_maxfilesize = fs->fs_maxfilesize;
+ if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) {
+ uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n",
+ mp->mnt_stat.f_mntonname, fs->fs_flags,
+ (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting");
+ if ((mp->mnt_flag & MNT_FORCE) == 0) {
+ mutex_exit(&ump->um_lock);
+ return (EINVAL);
+ }
+ }
+ if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
+ fs->fs_pendingblocks = 0;
+ fs->fs_pendinginodes = 0;
+ }
+ mutex_exit(&ump->um_lock);
+
+ ffs_statvfs(mp, &mp->mnt_stat);
+ /*
+ * Step 3: re-read summary information from disk.
+ */
+ blks = howmany(fs->fs_cssize, fs->fs_fsize);
+ space = fs->fs_csp;
+ for (i = 0; i < blks; i += fs->fs_frag) {
+ bsize = fs->fs_bsize;
+ if (i + fs->fs_frag > blks)
+ bsize = (blks - i) * fs->fs_fsize;
+ error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), bsize,
+ NOCRED, 0, &bp);
+ if (error) {
+ brelse(bp, 0);
+ return (error);
+ }
+#ifdef FFS_EI
+ if (UFS_FSNEEDSWAP(fs))
+ ffs_csum_swap((struct csum *)bp->b_data,
+ (struct csum *)space, bsize);
+ else
+#endif
+ memcpy(space, bp->b_data, (size_t)bsize);
+ space = (char *)space + bsize;
+ brelse(bp, 0);
+ }
+ if (fs->fs_snapinum[0] != 0)
+ ffs_snapshot_mount(mp);
+ /*
+ * We no longer know anything about clusters per cylinder group.
+ */
+ if (fs->fs_contigsumsize > 0) {
+ lp = fs->fs_maxcluster;
+ for (i = 0; i < fs->fs_ncg; i++)
+ *lp++ = fs->fs_contigsumsize;
+ }
+
+ /* Allocate a marker vnode. */
+ mvp = vnalloc(mp);
+ /*
+ * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
+ * and vclean() can be called indirectly
+ */
+ mutex_enter(&mntvnode_lock);
+ loop:
+ for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
+ vmark(mvp, vp);
+ if (vp->v_mount != mp || vismarker(vp))
+ continue;
+ /*
+ * Step 4: invalidate all inactive vnodes.
+ */
+ if (vrecycle(vp, &mntvnode_lock, l)) {
+ mutex_enter(&mntvnode_lock);
+ (void)vunmark(mvp);
+ goto loop;
+ }
+ /*
+ * Step 5: invalidate all cached file data.
+ */
+ mutex_enter(vp->v_interlock);
+ mutex_exit(&mntvnode_lock);
+ if (vget(vp, LK_EXCLUSIVE)) {
+ (void)vunmark(mvp);
+ goto loop;
+ }
+ if (vinvalbuf(vp, 0, cred, l, 0, 0))
+ panic("ffs_reload: dirty2");
+ /*
+ * Step 6: re-read inode data for all active vnodes.
+ */
+ ip = VTOI(vp);
+ error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
+ (int)fs->fs_bsize, NOCRED, 0, &bp);
+ if (error) {
+ brelse(bp, 0);
+ vput(vp);
+ (void)vunmark(mvp);
+ break;
+ }
+ ffs_load_inode(bp, ip, fs, ip->i_number);
+ brelse(bp, 0);
+ vput(vp);
+ mutex_enter(&mntvnode_lock);
+ }
+ mutex_exit(&mntvnode_lock);
+ vnfree(mvp);
+ return (error);
+}
+
+/*
+ * Possible superblock locations ordered from most to least likely.
+ */
+static const int sblock_try[] = SBLOCKSEARCH;
+
+/*
+ * Common code for mount and mountroot
+ */
+int
+ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
+{
+ struct ufsmount *ump;
+ struct buf *bp;
+ struct fs *fs;
+ dev_t dev;
+ struct dkwedge_info dkw;
+ void *space;
+ daddr_t sblockloc, fsblockloc;
+ int blks, fstype;
+ int error, i, bsize, ronly, bset = 0;
+#ifdef FFS_EI
+ int needswap = 0; /* keep gcc happy */
+#endif
+ int32_t *lp;
+ kauth_cred_t cred;
+ u_int32_t sbsize = 8192; /* keep gcc happy*/
+ int32_t fsbsize;
+
+ dev = devvp->v_rdev;
+ cred = l ? l->l_cred : NOCRED;
+
+ /* Flush out any old buffers remaining from a previous use. */
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0);
+ VOP_UNLOCK(devvp);
+ if (error)
+ return (error);
+
+ ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+
+ bp = NULL;
+ ump = NULL;
+ fs = NULL;
+ sblockloc = 0;
+ fstype = 0;
+
+ error = fstrans_mount(mp);
+ if (error)
+ return error;
+
+ ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK);
+ memset(ump, 0, sizeof *ump);
+ mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE);
+ error = ffs_snapshot_init(ump);
+ if (error)
+ goto out;
+ ump->um_ops = &ffs_ufsops;
+
+#ifdef WAPBL
+ sbagain:
+#endif
+ /*
+ * Try reading the superblock in each of its possible locations.
+ */
+ for (i = 0; ; i++) {
+ if (bp != NULL) {
+ brelse(bp, BC_NOCACHE);
+ bp = NULL;
+ }
+ if (sblock_try[i] == -1) {
+ error = EINVAL;
+ fs = NULL;
+ goto out;
+ }
+ error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE, cred,
+ 0, &bp);
+ if (error) {
+ fs = NULL;
+ goto out;
+ }
+ fs = (struct fs*)bp->b_data;
+ fsblockloc = sblockloc = sblock_try[i];
+ if (fs->fs_magic == FS_UFS1_MAGIC) {
+ sbsize = fs->fs_sbsize;
+ fstype = UFS1;
+ fsbsize = fs->fs_bsize;
+#ifdef FFS_EI
+ needswap = 0;
+ } else if (fs->fs_magic == bswap32(FS_UFS1_MAGIC)) {
+ sbsize = bswap32(fs->fs_sbsize);
+ fstype = UFS1;
+ fsbsize = bswap32(fs->fs_bsize);
+ needswap = 1;
+#endif
+ } else if (fs->fs_magic == FS_UFS2_MAGIC) {
+ sbsize = fs->fs_sbsize;
+ fstype = UFS2;
+ fsbsize = fs->fs_bsize;
+#ifdef FFS_EI
+ needswap = 0;
+ } else if (fs->fs_magic == bswap32(FS_UFS2_MAGIC)) {
+ sbsize = bswap32(fs->fs_sbsize);
+ fstype = UFS2;
+ fsbsize = bswap32(fs->fs_bsize);
+ needswap = 1;
+#endif
+ } else
+ continue;
+
+
+ /* fs->fs_sblockloc isn't defined for old filesystems */
+ if (fstype == UFS1 && !(fs->fs_old_flags & FS_FLAGS_UPDATED)) {
+ if (sblockloc == SBLOCK_UFS2)
+ /*
+ * This is likely to be the first alternate
+ * in a filesystem with 64k blocks.
+ * Don't use it.
+ */
+ continue;
+ fsblockloc = sblockloc;
+ } else {
+ fsblockloc = fs->fs_sblockloc;
+#ifdef FFS_EI
+ if (needswap)
+ fsblockloc = bswap64(fsblockloc);
+#endif
+ }
+
+ /* Check we haven't found an alternate superblock */
+ if (fsblockloc != sblockloc)
+ continue;
+
+ /* Validate size of superblock */
+ if (sbsize > MAXBSIZE || sbsize < sizeof(struct fs))
+ continue;
+
+ /* Check that we can handle the file system blocksize */
+ if (fsbsize > MAXBSIZE) {
+ printf("ffs_mountfs: block size (%d) > MAXBSIZE (%d)\n",
+ fsbsize, MAXBSIZE);
+ continue;
+ }
+
+ /* Ok seems to be a good superblock */
+ break;
+ }
+
+ fs = malloc((u_long)sbsize, M_UFSMNT, M_WAITOK);
+ memcpy(fs, bp->b_data, sbsize);
+ ump->um_fs = fs;
+
+#ifdef FFS_EI
+ if (needswap) {
+ ffs_sb_swap((struct fs*)bp->b_data, fs);
+ fs->fs_flags |= FS_SWAPPED;
+ } else
+#endif
+ fs->fs_flags &= ~FS_SWAPPED;
+
+#ifdef WAPBL
+ if ((mp->mnt_wapbl_replay == 0) && (fs->fs_flags & FS_DOWAPBL)) {
+ error = ffs_wapbl_replay_start(mp, fs, devvp);
+ if (error && (mp->mnt_flag & MNT_FORCE) == 0)
+ goto out;
+ if (!error) {
+ if (!ronly) {
+ /* XXX fsmnt may be stale. */
+ printf("%s: replaying log to disk\n",
+ fs->fs_fsmnt);
+ error = wapbl_replay_write(mp->mnt_wapbl_replay,
+ devvp);
+ if (error)
+ goto out;
+ wapbl_replay_stop(mp->mnt_wapbl_replay);
+ fs->fs_clean = FS_WASCLEAN;
+ } else {
+ /* XXX fsmnt may be stale */
+ printf("%s: replaying log to memory\n",
+ fs->fs_fsmnt);
+ }
+
+ /* Force a re-read of the superblock */
+ brelse(bp, BC_INVAL);
+ bp = NULL;
+ free(fs, M_UFSMNT);
+ fs = NULL;
+ goto sbagain;
+ }
+ }
+#else /* !WAPBL */
+ if ((fs->fs_flags & FS_DOWAPBL) && (mp->mnt_flag & MNT_FORCE) == 0) {
+ error = EPERM;
+ goto out;
+ }
+#endif /* !WAPBL */
+
+ ffs_oldfscompat_read(fs, ump, sblockloc);
+ ump->um_maxfilesize = fs->fs_maxfilesize;
+
+ if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) {
+ uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n",
+ mp->mnt_stat.f_mntonname, fs->fs_flags,
+ (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting");
+ if ((mp->mnt_flag & MNT_FORCE) == 0) {
+ error = EINVAL;
+ goto out;
+ }
+ }
+
+ if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
+ fs->fs_pendingblocks = 0;
+ fs->fs_pendinginodes = 0;
+ }
+
+ ump->um_fstype = fstype;
+ if (fs->fs_sbsize < SBLOCKSIZE)
+ brelse(bp, BC_INVAL);
+ else
+ brelse(bp, 0);
+ bp = NULL;
+
+ /* First check to see if this is tagged as an Apple UFS filesystem
+ * in the disklabel
+ */
+ if (getdiskinfo(devvp, &dkw) == 0 &&
+ strcmp(dkw.dkw_ptype, DKW_PTYPE_APPLEUFS) == 0)
+ ump->um_flags |= UFS_ISAPPLEUFS;
+#ifdef APPLE_UFS
+ else {
+ /* Manually look for an apple ufs label, and if a valid one
+ * is found, then treat it like an Apple UFS filesystem anyway
+ */
+ error = bread(devvp, (daddr_t)(APPLEUFS_LABEL_OFFSET / DEV_BSIZE),
+ APPLEUFS_LABEL_SIZE, cred, 0, &bp);
+ if (error)
+ goto out;
+ error = ffs_appleufs_validate(fs->fs_fsmnt,
+ (struct appleufslabel *)bp->b_data, NULL);
+ if (error == 0) {
+ ump->um_flags |= UFS_ISAPPLEUFS;
+ }
+ brelse(bp, 0);
+ bp = NULL;
+ }
+#else
+ if (ump->um_flags & UFS_ISAPPLEUFS) {
+ error = EINVAL;
+ goto out;
+ }
+#endif
+
+#if 0
+/*
+ * XXX This code changes the behaviour of mounting dirty filesystems, to
+ * XXX require "mount -f ..." to mount them. This doesn't match what
+ * XXX mount(8) describes and is disabled for now.
+ */
+ /*
+ * If the file system is not clean, don't allow it to be mounted
+ * unless MNT_FORCE is specified. (Note: MNT_FORCE is always set
+ * for the root file system.)
+ */
+ if (fs->fs_flags & FS_DOWAPBL) {
+ /*
+ * wapbl normally expects to be FS_WASCLEAN when the FS_DOWAPBL
+ * bit is set, although there's a window in unmount where it
+ * could be FS_ISCLEAN
+ */
+ if ((mp->mnt_flag & MNT_FORCE) == 0 &&
+ (fs->fs_clean & (FS_WASCLEAN | FS_ISCLEAN)) == 0) {
+ error = EPERM;
+ goto out;
+ }
+ } else
+ if ((fs->fs_clean & FS_ISCLEAN) == 0 &&
+ (mp->mnt_flag & MNT_FORCE) == 0) {
+ error = EPERM;
+ goto out;
+ }
+#endif
+
+ /*
+ * verify that we can access the last block in the fs
+ * if we're mounting read/write.
+ */
+
+ if (!ronly) {
+ error = bread(devvp, fsbtodb(fs, fs->fs_size - 1), fs->fs_fsize,
+ cred, 0, &bp);
+ if (bp->b_bcount != fs->fs_fsize)
+ error = EINVAL;
+ if (error) {
+ bset = BC_INVAL;
+ goto out;
+ }
+ brelse(bp, BC_INVAL);
+ bp = NULL;
+ }
+
+ fs->fs_ronly = ronly;
+ /* Don't bump fs_clean if we're replaying journal */
+ if (!((fs->fs_flags & FS_DOWAPBL) && (fs->fs_clean & FS_WASCLEAN)))
+ if (ronly == 0) {
+ fs->fs_clean <<= 1;
+ fs->fs_fmod = 1;
+ }
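+ /*
+ * Note: the shift above promotes the clean flag (FS_ISCLEAN, normally
+ * 0x01) to FS_WASCLEAN (0x02), recording that the volume was clean at
+ * the time of this read/write mount; ffs_unmount() later restores
+ * FS_ISCLEAN only if FS_WASCLEAN is still set after a successful flush.
+ */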
+ bsize = fs->fs_cssize;
+ blks = howmany(bsize, fs->fs_fsize);
+ if (fs->fs_contigsumsize > 0)
+ bsize += fs->fs_ncg * sizeof(int32_t);
+ bsize += fs->fs_ncg * sizeof(*fs->fs_contigdirs);
+ space = malloc((u_long)bsize, M_UFSMNT, M_WAITOK);
+ fs->fs_csp = space;
+ for (i = 0; i < blks; i += fs->fs_frag) {
+ bsize = fs->fs_bsize;
+ if (i + fs->fs_frag > blks)
+ bsize = (blks - i) * fs->fs_fsize;
+ error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), bsize,
+ cred, 0, &bp);
+ if (error) {
+ free(fs->fs_csp, M_UFSMNT);
+ goto out;
+ }
+#ifdef FFS_EI
+ if (needswap)
+ ffs_csum_swap((struct csum *)bp->b_data,
+ (struct csum *)space, bsize);
+ else
+#endif
+ memcpy(space, bp->b_data, (u_int)bsize);
+
+ space = (char *)space + bsize;
+ brelse(bp, 0);
+ bp = NULL;
+ }
+ if (fs->fs_contigsumsize > 0) {
+ fs->fs_maxcluster = lp = space;
+ for (i = 0; i < fs->fs_ncg; i++)
+ *lp++ = fs->fs_contigsumsize;
+ space = lp;
+ }
+ bsize = fs->fs_ncg * sizeof(*fs->fs_contigdirs);
+ fs->fs_contigdirs = space;
+ space = (char *)space + bsize;
+ memset(fs->fs_contigdirs, 0, bsize);
+ /* Compatibility for old filesystems - XXX */
+ if (fs->fs_avgfilesize <= 0)
+ fs->fs_avgfilesize = AVFILESIZ;
+ if (fs->fs_avgfpdir <= 0)
+ fs->fs_avgfpdir = AFPDIR;
+ fs->fs_active = NULL;
+ mp->mnt_data = ump;
+ mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev;
+ mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_FFS);
+ mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
+ mp->mnt_stat.f_namemax = FFS_MAXNAMLEN;
+ if (UFS_MPISAPPLEUFS(ump)) {
+ /* NeXT used to keep short symlinks in the inode even
+ * when using FS_42INODEFMT. In that case fs->fs_maxsymlinklen
+ * is probably -1, but we still need to be able to identify
+ * short symlinks.
+ */
+ ump->um_maxsymlinklen = APPLEUFS_MAXSYMLINKLEN;
+ ump->um_dirblksiz = APPLEUFS_DIRBLKSIZ;
+ mp->mnt_iflag |= IMNT_DTYPE;
+ } else {
+ ump->um_maxsymlinklen = fs->fs_maxsymlinklen;
+ ump->um_dirblksiz = DIRBLKSIZ;
+ if (ump->um_maxsymlinklen > 0)
+ mp->mnt_iflag |= IMNT_DTYPE;
+ else
+ mp->mnt_iflag &= ~IMNT_DTYPE;
+ }
+ mp->mnt_fs_bshift = fs->fs_bshift;
+ mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */
+ mp->mnt_flag |= MNT_LOCAL;
+ mp->mnt_iflag |= IMNT_MPSAFE;
+#ifdef FFS_EI
+ if (needswap)
+ ump->um_flags |= UFS_NEEDSWAP;
+#endif
+ ump->um_mountp = mp;
+ ump->um_dev = dev;
+ ump->um_devvp = devvp;
+ ump->um_nindir = fs->fs_nindir;
+ ump->um_lognindir = ffs(fs->fs_nindir) - 1;
+ ump->um_bptrtodb = fs->fs_fshift - DEV_BSHIFT;
+ ump->um_seqinc = fs->fs_frag;
+ for (i = 0; i < MAXQUOTAS; i++)
+ ump->um_quotas[i] = NULLVP;
+ devvp->v_specmountpoint = mp;
+ if (ronly == 0 && fs->fs_snapinum[0] != 0)
+ ffs_snapshot_mount(mp);
+#ifdef WAPBL
+ if (!ronly) {
+ KDASSERT(fs->fs_ronly == 0);
+ /*
+ * ffs_wapbl_start() needs mp->mnt_stat initialised if it
+ * needs to create a new log file in-filesystem.
+ */
+ ffs_statvfs(mp, &mp->mnt_stat);
+
+ error = ffs_wapbl_start(mp);
+ if (error) {
+ free(fs->fs_csp, M_UFSMNT);
+ goto out;
+ }
+ }
+#endif /* WAPBL */
+ if (ronly == 0) {
+#ifdef QUOTA2
+ error = ffs_quota2_mount(mp);
+ if (error) {
+ free(fs->fs_csp, M_UFSMNT);
+ goto out;
+ }
+#else
+ if (fs->fs_flags & FS_DOQUOTA2) {
+ ump->um_flags |= UFS_QUOTA2;
+ uprintf("%s: options QUOTA2 not enabled%s\n",
+ mp->mnt_stat.f_mntonname,
+ (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting");
+ if ((mp->mnt_flag & MNT_FORCE) == 0) {
+ error = EINVAL;
+ free(fs->fs_csp, M_UFSMNT);
+ goto out;
+ }
+ }
+#endif
+ }
+#ifdef UFS_EXTATTR
+ /*
+ * Initialize file-backed extended attributes on UFS1 file
+ * systems.
+ */
+ if (ump->um_fstype == UFS1)
+ ufs_extattr_uepm_init(&ump->um_extattr);
+#endif /* UFS_EXTATTR */
+
+ return (0);
+out:
+#ifdef WAPBL
+ if (mp->mnt_wapbl_replay) {
+ wapbl_replay_stop(mp->mnt_wapbl_replay);
+ wapbl_replay_free(mp->mnt_wapbl_replay);
+ mp->mnt_wapbl_replay = 0;
+ }
+#endif
+
+ fstrans_unmount(mp);
+ if (fs)
+ free(fs, M_UFSMNT);
+ devvp->v_specmountpoint = NULL;
+ if (bp)
+ brelse(bp, bset);
+ if (ump) {
+ if (ump->um_oldfscompat)
+ free(ump->um_oldfscompat, M_UFSMNT);
+ mutex_destroy(&ump->um_lock);
+ free(ump, M_UFSMNT);
+ mp->mnt_data = NULL;
+ }
+ return (error);
+}
+
+/*
+ * Sanity checks for loading old filesystem superblocks.
+ * See ffs_oldfscompat_write below for unwound actions.
+ *
+ * XXX - Parts get retired eventually.
+ * Unfortunately new bits get added.
+ */
+static void
+ffs_oldfscompat_read(struct fs *fs, struct ufsmount *ump, daddr_t sblockloc)
+{
+ off_t maxfilesize;
+ int32_t *extrasave;
+
+ if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+ (fs->fs_old_flags & FS_FLAGS_UPDATED))
+ return;
+
+ if (!ump->um_oldfscompat)
+ ump->um_oldfscompat = malloc(512 + 3*sizeof(int32_t),
+ M_UFSMNT, M_WAITOK);
+
+ memcpy(ump->um_oldfscompat, &fs->fs_old_postbl_start, 512);
+ extrasave = ump->um_oldfscompat;
+ extrasave += 512/sizeof(int32_t);
+ extrasave[0] = fs->fs_old_npsect;
+ extrasave[1] = fs->fs_old_interleave;
+ extrasave[2] = fs->fs_old_trackskew;
+
+ /* These fields will be overwritten by their
+ * original values in ffs_oldfscompat_write, so it is harmless
+ * to modify them here.
+ */
+ fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir;
+ fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree;
+ fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree;
+ fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree;
+
+ fs->fs_maxbsize = fs->fs_bsize;
+ fs->fs_time = fs->fs_old_time;
+ fs->fs_size = fs->fs_old_size;
+ fs->fs_dsize = fs->fs_old_dsize;
+ fs->fs_csaddr = fs->fs_old_csaddr;
+ fs->fs_sblockloc = sblockloc;
+
+ fs->fs_flags = fs->fs_old_flags | (fs->fs_flags & FS_INTERNAL);
+
+ if (fs->fs_old_postblformat == FS_42POSTBLFMT) {
+ fs->fs_old_nrpos = 8;
+ fs->fs_old_npsect = fs->fs_old_nsect;
+ fs->fs_old_interleave = 1;
+ fs->fs_old_trackskew = 0;
+ }
+
+ if (fs->fs_old_inodefmt < FS_44INODEFMT) {
+ fs->fs_maxfilesize = (u_quad_t) 1LL << 39;
+ fs->fs_qbmask = ~fs->fs_bmask;
+ fs->fs_qfmask = ~fs->fs_fmask;
+ }
+
+ maxfilesize = (u_int64_t)0x80000000 * fs->fs_bsize - 1;
+ if (fs->fs_maxfilesize > maxfilesize)
+ fs->fs_maxfilesize = maxfilesize;
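+ /*
+ * Illustrative arithmetic for the clamp above: the limit is 2^31 blocks
+ * minus one byte, so with fs_bsize = 8192 the cap is
+ * 0x80000000 * 8192 - 1 = 2^44 - 1 bytes (just under 16 TiB).
+ */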
+
+ /* Compatibility for old filesystems */
+ if (fs->fs_avgfilesize <= 0)
+ fs->fs_avgfilesize = AVFILESIZ;
+ if (fs->fs_avgfpdir <= 0)
+ fs->fs_avgfpdir = AFPDIR;
+
+#if 0
+ if (bigcgs) {
+ fs->fs_save_cgsize = fs->fs_cgsize;
+ fs->fs_cgsize = fs->fs_bsize;
+ }
+#endif
+}
+
+/*
+ * Unwinding superblock updates for old filesystems.
+ * See ffs_oldfscompat_read above for details.
+ *
+ * XXX - Parts get retired eventually.
+ * Unfortunately new bits get added.
+ */
+static void
+ffs_oldfscompat_write(struct fs *fs, struct ufsmount *ump)
+{
+ int32_t *extrasave;
+
+ if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+ (fs->fs_old_flags & FS_FLAGS_UPDATED))
+ return;
+
+ fs->fs_old_time = fs->fs_time;
+ fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir;
+ fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree;
+ fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree;
+ fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree;
+ fs->fs_old_flags = fs->fs_flags;
+
+#if 0
+ if (bigcgs) {
+ fs->fs_cgsize = fs->fs_save_cgsize;
+ }
+#endif
+
+ memcpy(&fs->fs_old_postbl_start, ump->um_oldfscompat, 512);
+ extrasave = ump->um_oldfscompat;
+ extrasave += 512/sizeof(int32_t);
+ fs->fs_old_npsect = extrasave[0];
+ fs->fs_old_interleave = extrasave[1];
+ fs->fs_old_trackskew = extrasave[2];
+
+}
+
+/*
+ * unmount vfs operation
+ */
+int
+ffs_unmount(struct mount *mp, int mntflags)
+{
+ struct lwp *l = curlwp;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct fs *fs = ump->um_fs;
+ int error, flags;
+#ifdef WAPBL
+ extern int doforce;
+#endif
+
+ flags = 0;
+ if (mntflags & MNT_FORCE)
+ flags |= FORCECLOSE;
+ if ((error = ffs_flushfiles(mp, flags, l)) != 0)
+ return (error);
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error == 0)
+ if (fs->fs_ronly == 0 &&
+ ffs_cgupdate(ump, MNT_WAIT) == 0 &&
+ fs->fs_clean & FS_WASCLEAN) {
+ fs->fs_clean = FS_ISCLEAN;
+ fs->fs_fmod = 0;
+ (void) ffs_sbupdate(ump, MNT_WAIT);
+ }
+ if (error == 0)
+ UFS_WAPBL_END(mp);
+#ifdef WAPBL
+ KASSERT(!(mp->mnt_wapbl_replay && mp->mnt_wapbl));
+ if (mp->mnt_wapbl_replay) {
+ KDASSERT(fs->fs_ronly);
+ wapbl_replay_stop(mp->mnt_wapbl_replay);
+ wapbl_replay_free(mp->mnt_wapbl_replay);
+ mp->mnt_wapbl_replay = 0;
+ }
+ error = ffs_wapbl_stop(mp, doforce && (mntflags & MNT_FORCE));
+ if (error) {
+ return error;
+ }
+#endif /* WAPBL */
+#ifdef UFS_EXTATTR
+ if (ump->um_fstype == UFS1) {
+ ufs_extattr_stop(mp, l);
+ ufs_extattr_uepm_destroy(&ump->um_extattr);
+ }
+#endif /* UFS_EXTATTR */
+
+ if (ump->um_devvp->v_type != VBAD)
+ ump->um_devvp->v_specmountpoint = NULL;
+ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+ (void)VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD | FWRITE,
+ NOCRED);
+ vput(ump->um_devvp);
+ free(fs->fs_csp, M_UFSMNT);
+ free(fs, M_UFSMNT);
+ if (ump->um_oldfscompat != NULL)
+ free(ump->um_oldfscompat, M_UFSMNT);
+ mutex_destroy(&ump->um_lock);
+ ffs_snapshot_fini(ump);
+ free(ump, M_UFSMNT);
+ mp->mnt_data = NULL;
+ mp->mnt_flag &= ~MNT_LOCAL;
+ fstrans_unmount(mp);
+ return (0);
+}
+
+/*
+ * Flush out all the files in a filesystem.
+ */
+int
+ffs_flushfiles(struct mount *mp, int flags, struct lwp *l)
+{
+ extern int doforce;
+ struct ufsmount *ump;
+ int error;
+
+ if (!doforce)
+ flags &= ~FORCECLOSE;
+ ump = VFSTOUFS(mp);
+#ifdef QUOTA
+ if ((error = quota1_umount(mp, flags)) != 0)
+ return (error);
+#endif
+#ifdef QUOTA2
+ if ((error = quota2_umount(mp, flags)) != 0)
+ return (error);
+#endif
+ if ((error = vflush(mp, 0, SKIPSYSTEM | flags)) != 0)
+ return (error);
+ ffs_snapshot_unmount(mp);
+ /*
+ * Flush all the files.
+ */
+ error = vflush(mp, NULLVP, flags);
+ if (error)
+ return (error);
+ /*
+ * Flush filesystem metadata.
+ */
+ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_FSYNC(ump->um_devvp, l->l_cred, FSYNC_WAIT, 0, 0);
+ VOP_UNLOCK(ump->um_devvp);
+ if (flags & FORCECLOSE) /* XXXDBJ */
+ error = 0;
+
+#ifdef WAPBL
+ if (error)
+ return error;
+ if (mp->mnt_wapbl) {
+ error = wapbl_flush(mp->mnt_wapbl, 1);
+ if (flags & FORCECLOSE)
+ error = 0;
+ }
+#endif
+
+ return (error);
+}
+
+/*
+ * Get file system statistics.
+ */
+int
+ffs_statvfs(struct mount *mp, struct statvfs *sbp)
+{
+ struct ufsmount *ump;
+ struct fs *fs;
+
+ ump = VFSTOUFS(mp);
+ fs = ump->um_fs;
+ mutex_enter(&ump->um_lock);
+ sbp->f_bsize = fs->fs_bsize;
+ sbp->f_frsize = fs->fs_fsize;
+ sbp->f_iosize = fs->fs_bsize;
+ sbp->f_blocks = fs->fs_dsize;
+ sbp->f_bfree = blkstofrags(fs, fs->fs_cstotal.cs_nbfree) +
+ fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks);
+ sbp->f_bresvd = ((u_int64_t) fs->fs_dsize * (u_int64_t)
+ fs->fs_minfree) / (u_int64_t) 100;
+ if (sbp->f_bfree > sbp->f_bresvd)
+ sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd;
+ else
+ sbp->f_bavail = 0;
+ sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO;
+ sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes;
+ sbp->f_favail = sbp->f_ffree;
+ sbp->f_fresvd = 0;
+ mutex_exit(&ump->um_lock);
+ copy_statvfs_info(sbp, mp);
+
+ return (0);
+}
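+
+/*
+ * Worked example of the reservation above: with the common default
+ * fs_minfree of 5% and fs_dsize of 1,000,000 fragments, f_bresvd =
+ * 1000000 * 5 / 100 = 50000 fragments, so unprivileged callers see
+ * f_bavail = f_bfree - 50000 (or 0 once free space drops below the
+ * reserve).
+ */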
+
+/*
+ * Go through the disk queues to initiate sandbagged IO;
+ * go through the inodes to write those that have been modified;
+ * initiate the writing of the super block if it has been modified.
+ *
+ * Note: we are always called with the filesystem marked `MPBUSY'.
+ */
+int
+ffs_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
+{
+ struct vnode *vp, *mvp, *nvp;
+ struct inode *ip;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct fs *fs;
+ int error, allerror = 0;
+ bool is_suspending;
+
+ fs = ump->um_fs;
+ if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */
+ printf("fs = %s\n", fs->fs_fsmnt);
+ panic("update: rofs mod");
+ }
+
+ /* Allocate a marker vnode. */
+ mvp = vnalloc(mp);
+
+ fstrans_start(mp, FSTRANS_SHARED);
+ is_suspending = (fstrans_getstate(mp) == FSTRANS_SUSPENDING);
+ /*
+ * Write back each (modified) inode.
+ */
+ mutex_enter(&mntvnode_lock);
+loop:
+ /*
+ * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
+ * and vclean() can be called indirectly
+ */
+ for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
+ nvp = TAILQ_NEXT(vp, v_mntvnodes);
+ /*
+ * If the vnode that we are about to sync is no longer
+ * associated with this mount point, start over.
+ */
+ if (vp->v_mount != mp)
+ goto loop;
+ /*
+ * Don't interfere with concurrent scans of this FS.
+ */
+ if (vismarker(vp))
+ continue;
+ mutex_enter(vp->v_interlock);
+ ip = VTOI(vp);
+
+ /*
+ * Skip the vnode/inode if inaccessible.
+ */
+ if (ip == NULL || (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0 ||
+ vp->v_type == VNON) {
+ mutex_exit(vp->v_interlock);
+ continue;
+ }
+
+ /*
+ * We deliberately update inode times here. This will
+ * prevent a massive queue of updates accumulating, only
+ * to be handled by a call to unmount.
+ *
+ * XXX It would be better to have the syncer trickle these
+ * out. Adjustment needed to allow registering vnodes for
+ * sync when the vnode is clean, but the inode dirty. Or
+ * have ufs itself trickle out inode updates.
+ *
+ * If doing a lazy sync, we don't care about metadata or
+ * data updates, because they are handled by each vnode's
+ * synclist entry. In this case we are only interested in
+ * writing back modified inodes.
+ */
+ if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE |
+ IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) == 0 &&
+ (waitfor == MNT_LAZY || (LIST_EMPTY(&vp->v_dirtyblkhd) &&
+ UVM_OBJ_IS_CLEAN(&vp->v_uobj)))) {
+ mutex_exit(vp->v_interlock);
+ continue;
+ }
+ if (vp->v_type == VBLK && is_suspending) {
+ mutex_exit(vp->v_interlock);
+ continue;
+ }
+ vmark(mvp, vp);
+ mutex_exit(&mntvnode_lock);
+ error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error) {
+ mutex_enter(&mntvnode_lock);
+ nvp = vunmark(mvp);
+ if (error == ENOENT) {
+ goto loop;
+ }
+ continue;
+ }
+ if (waitfor == MNT_LAZY) {
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (!error) {
+ error = ffs_update(vp, NULL, NULL,
+ UPDATE_CLOSE);
+ UFS_WAPBL_END(vp->v_mount);
+ }
+ } else {
+ error = VOP_FSYNC(vp, cred, FSYNC_NOLOG |
+ (waitfor == MNT_WAIT ? FSYNC_WAIT : 0), 0, 0);
+ }
+ if (error)
+ allerror = error;
+ vput(vp);
+ mutex_enter(&mntvnode_lock);
+ nvp = vunmark(mvp);
+ }
+ mutex_exit(&mntvnode_lock);
+ /*
+ * Force stale file system control information to be flushed.
+ */
+ if (waitfor != MNT_LAZY && (ump->um_devvp->v_numoutput > 0 ||
+ !LIST_EMPTY(&ump->um_devvp->v_dirtyblkhd))) {
+ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+ if ((error = VOP_FSYNC(ump->um_devvp, cred,
+ (waitfor == MNT_WAIT ? FSYNC_WAIT : 0) | FSYNC_NOLOG,
+ 0, 0)) != 0)
+ allerror = error;
+ VOP_UNLOCK(ump->um_devvp);
+ if (allerror == 0 && waitfor == MNT_WAIT && !mp->mnt_wapbl) {
+ mutex_enter(&mntvnode_lock);
+ goto loop;
+ }
+ }
+#if defined(QUOTA) || defined(QUOTA2)
+ qsync(mp);
+#endif
+ /*
+ * Write back modified superblock.
+ */
+ if (fs->fs_fmod != 0) {
+ fs->fs_fmod = 0;
+ fs->fs_time = time_second;
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ allerror = error;
+ else {
+ if ((error = ffs_cgupdate(ump, waitfor)))
+ allerror = error;
+ UFS_WAPBL_END(mp);
+ }
+ }
+
+#ifdef WAPBL
+ if (mp->mnt_wapbl) {
+ error = wapbl_flush(mp->mnt_wapbl, 0);
+ if (error)
+ allerror = error;
+ }
+#endif
+
+ fstrans_done(mp);
+ vnfree(mvp);
+ return (allerror);
+}
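+
+/*
+ * A note on the loop above: the marker vnode (mvp) preserves the scan
+ * position in mnt_vnodelist across the points where mntvnode_lock is
+ * dropped around vget()/VOP_FSYNC(); vmark() parks the marker at the
+ * current position and vunmark() hands back the next live vnode once the
+ * lock is retaken, so concurrent list changes cannot strand the iteration.
+ */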
+
+/*
+ * Look up a FFS dinode number to find its incore vnode, otherwise read it
+ * in from disk. If it is in core, wait for the lock bit to clear, then
+ * return the inode locked. Detection and handling of mount points must be
+ * done by the calling routine.
+ */
+int
+ffs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
+{
+ struct fs *fs;
+ struct inode *ip;
+ struct ufsmount *ump;
+ struct buf *bp;
+ struct vnode *vp;
+ dev_t dev;
+ int error;
+
+ ump = VFSTOUFS(mp);
+ dev = ump->um_dev;
+
+ retry:
+ if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL)
+ return (0);
+
+ /* Allocate a new vnode/inode. */
+ error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, NULL, &vp);
+ if (error) {
+ *vpp = NULL;
+ return (error);
+ }
+ ip = pool_cache_get(ffs_inode_cache, PR_WAITOK);
+
+ /*
+ * If someone beat us to it, put back the freshly allocated
+ * vnode/inode pair and retry.
+ */
+ mutex_enter(&ufs_hashlock);
+ if (ufs_ihashget(dev, ino, 0) != NULL) {
+ mutex_exit(&ufs_hashlock);
+ ungetnewvnode(vp);
+ pool_cache_put(ffs_inode_cache, ip);
+ goto retry;
+ }
+
+ vp->v_vflag |= VV_LOCKSWORK;
+
+ /*
+ * XXX MFS ends up here, too, to allocate an inode. Should we
+ * XXX create another pool for MFS inodes?
+ */
+
+ memset(ip, 0, sizeof(struct inode));
+ vp->v_data = ip;
+ ip->i_vnode = vp;
+ ip->i_ump = ump;
+ ip->i_fs = fs = ump->um_fs;
+ ip->i_dev = dev;
+ ip->i_number = ino;
+#if defined(QUOTA) || defined(QUOTA2)
+ ufsquota_init(ip);
+#endif
+
+ /*
+ * Initialize genfs node, we might proceed to destroy it in
+ * error branches.
+ */
+ genfs_node_init(vp, &ffs_genfsops);
+
+ /*
+ * Put it onto its hash chain and lock it so that other requests for
+ * this inode will block if they arrive while we are sleeping waiting
+ * for old data structures to be purged or for the contents of the
+ * disk portion of this inode to be read.
+ */
+
+ ufs_ihashins(ip);
+ mutex_exit(&ufs_hashlock);
+
+ /* Read in the disk contents for the inode, copy into the inode. */
+ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
+ (int)fs->fs_bsize, NOCRED, 0, &bp);
+ if (error) {
+
+ /*
+ * The inode does not contain anything useful, so it would
+ * be misleading to leave it on its hash chain. With mode
+ * still zero, it will be unlinked and returned to the free
+ * list by vput().
+ */
+
+ vput(vp);
+ brelse(bp, 0);
+ *vpp = NULL;
+ return (error);
+ }
+ if (ip->i_ump->um_fstype == UFS1)
+ ip->i_din.ffs1_din = pool_cache_get(ffs_dinode1_cache,
+ PR_WAITOK);
+ else
+ ip->i_din.ffs2_din = pool_cache_get(ffs_dinode2_cache,
+ PR_WAITOK);
+ ffs_load_inode(bp, ip, fs, ino);
+ brelse(bp, 0);
+
+ /*
+ * Initialize the vnode from the inode, check for aliases.
+ * Note that the underlying vnode may have changed.
+ */
+
+ ufs_vinit(mp, ffs_specop_p, ffs_fifoop_p, &vp);
+
+ /*
+ * Finish inode initialization now that aliasing has been resolved.
+ */
+
+ ip->i_devvp = ump->um_devvp;
+ vref(ip->i_devvp);
+
+ /*
+ * Ensure that uid and gid are correct. This is a temporary
+ * fix until fsck has been changed to do the update.
+ */
+
+ if (fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */
+ ip->i_uid = ip->i_ffs1_ouid; /* XXX */
+ ip->i_gid = ip->i_ffs1_ogid; /* XXX */
+ } /* XXX */
+ uvm_vnp_setsize(vp, ip->i_size);
+ *vpp = vp;
+ return (0);
+}
+
+/*
+ * File handle to vnode
+ *
+ * Have to be really careful about stale file handles:
+ * - check that the inode number is valid
+ * - call ffs_vget() to get the locked inode
+ * - check for an unallocated inode (i_mode == 0)
+ * - check that the given client host has export rights and return
+ * those rights via exflagsp and credanonp
+ */
+int
+ffs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
+{
+ struct ufid ufh;
+ struct fs *fs;
+
+ if (fhp->fid_len != sizeof(struct ufid))
+ return EINVAL;
+
+ memcpy(&ufh, fhp, sizeof(ufh));
+ fs = VFSTOUFS(mp)->um_fs;
+ if (ufh.ufid_ino < ROOTINO ||
+ ufh.ufid_ino >= fs->fs_ncg * fs->fs_ipg)
+ return (ESTALE);
+ return (ufs_fhtovp(mp, &ufh, vpp));
+}
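+
+/*
+ * The range check above rejects any handle whose inode number cannot name
+ * an on-disk inode in this filesystem: valid numbers run from ROOTINO
+ * (the root directory's inode, normally 2) up to fs_ncg * fs_ipg - 1.
+ */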
+
+/*
+ * Vnode pointer to File handle
+ */
+/* ARGSUSED */
+int
+ffs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
+{
+ struct inode *ip;
+ struct ufid ufh;
+
+ if (*fh_size < sizeof(struct ufid)) {
+ *fh_size = sizeof(struct ufid);
+ return E2BIG;
+ }
+ ip = VTOI(vp);
+ *fh_size = sizeof(struct ufid);
+ memset(&ufh, 0, sizeof(ufh));
+ ufh.ufid_len = sizeof(struct ufid);
+ ufh.ufid_ino = ip->i_number;
+ ufh.ufid_gen = ip->i_gen;
+ memcpy(fhp, &ufh, sizeof(ufh));
+ return (0);
+}
+
+void
+ffs_init(void)
+{
+ if (ffs_initcount++ > 0)
+ return;
+
+ ffs_inode_cache = pool_cache_init(sizeof(struct inode), 0, 0, 0,
+ "ffsino", NULL, IPL_NONE, NULL, NULL, NULL);
+ ffs_dinode1_cache = pool_cache_init(sizeof(struct ufs1_dinode), 0, 0, 0,
+ "ffsdino1", NULL, IPL_NONE, NULL, NULL, NULL);
+ ffs_dinode2_cache = pool_cache_init(sizeof(struct ufs2_dinode), 0, 0, 0,
+ "ffsdino2", NULL, IPL_NONE, NULL, NULL, NULL);
+ ufs_init();
+}
+
+void
+ffs_reinit(void)
+{
+
+ ufs_reinit();
+}
+
+void
+ffs_done(void)
+{
+ if (--ffs_initcount > 0)
+ return;
+
+ ufs_done();
+ pool_cache_destroy(ffs_dinode2_cache);
+ pool_cache_destroy(ffs_dinode1_cache);
+ pool_cache_destroy(ffs_inode_cache);
+}
+
+/*
+ * Write a superblock and associated information back to disk.
+ */
+int
+ffs_sbupdate(struct ufsmount *mp, int waitfor)
+{
+ struct fs *fs = mp->um_fs;
+ struct buf *bp;
+ int error = 0;
+ u_int32_t saveflag;
+
+ error = ffs_getblk(mp->um_devvp,
+ fs->fs_sblockloc / DEV_BSIZE, FFS_NOBLK,
+ fs->fs_sbsize, false, &bp);
+ if (error)
+ return error;
+ saveflag = fs->fs_flags & FS_INTERNAL;
+ fs->fs_flags &= ~FS_INTERNAL;
+
+ memcpy(bp->b_data, fs, fs->fs_sbsize);
+
+ ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
+#ifdef FFS_EI
+ if (mp->um_flags & UFS_NEEDSWAP)
+ ffs_sb_swap((struct fs *)bp->b_data, (struct fs *)bp->b_data);
+#endif
+ fs->fs_flags |= saveflag;
+
+ if (waitfor == MNT_WAIT)
+ error = bwrite(bp);
+ else
+ bawrite(bp);
+ return (error);
+}
+
+int
+ffs_cgupdate(struct ufsmount *mp, int waitfor)
+{
+ struct fs *fs = mp->um_fs;
+ struct buf *bp;
+ int blks;
+ void *space;
+ int i, size, error = 0, allerror = 0;
+
+ allerror = ffs_sbupdate(mp, waitfor);
+ blks = howmany(fs->fs_cssize, fs->fs_fsize);
+ space = fs->fs_csp;
+ for (i = 0; i < blks; i += fs->fs_frag) {
+ size = fs->fs_bsize;
+ if (i + fs->fs_frag > blks)
+ size = (blks - i) * fs->fs_fsize;
+ error = ffs_getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i),
+ FFS_NOBLK, size, false, &bp);
+ if (error)
+ break;
+#ifdef FFS_EI
+ if (mp->um_flags & UFS_NEEDSWAP)
+ ffs_csum_swap((struct csum*)space,
+ (struct csum*)bp->b_data, size);
+ else
+#endif
+ memcpy(bp->b_data, space, (u_int)size);
+ space = (char *)space + size;
+ if (waitfor == MNT_WAIT)
+ error = bwrite(bp);
+ else
+ bawrite(bp);
+ }
+ if (!allerror && error)
+ allerror = error;
+ return (allerror);
+}
+
+int
+ffs_extattrctl(struct mount *mp, int cmd, struct vnode *vp,
+ int attrnamespace, const char *attrname)
+{
+#ifdef UFS_EXTATTR
+ /*
+ * File-backed extended attributes are only supported on UFS1.
+ * UFS2 has native extended attributes.
+ */
+ if (VFSTOUFS(mp)->um_fstype == UFS1)
+ return (ufs_extattrctl(mp, cmd, vp, attrnamespace, attrname));
+#endif
+ return (vfs_stdextattrctl(mp, cmd, vp, attrnamespace, attrname));
+}
+
+int
+ffs_suspendctl(struct mount *mp, int cmd)
+{
+ int error;
+ struct lwp *l = curlwp;
+
+ switch (cmd) {
+ case SUSPEND_SUSPEND:
+ if ((error = fstrans_setstate(mp, FSTRANS_SUSPENDING)) != 0)
+ return error;
+ error = ffs_sync(mp, MNT_WAIT, l->l_proc->p_cred);
+ if (error == 0)
+ error = fstrans_setstate(mp, FSTRANS_SUSPENDED);
+#ifdef WAPBL
+ if (error == 0 && mp->mnt_wapbl)
+ error = wapbl_flush(mp->mnt_wapbl, 1);
+#endif
+ if (error != 0) {
+ (void) fstrans_setstate(mp, FSTRANS_NORMAL);
+ return error;
+ }
+ return 0;
+
+ case SUSPEND_RESUME:
+ return fstrans_setstate(mp, FSTRANS_NORMAL);
+
+ default:
+ return EINVAL;
+ }
+}
+
+/*
+ * Synch vnode for a mounted file system.
+ */
+static int
+ffs_vfs_fsync(vnode_t *vp, int flags)
+{
+ int error, i, pflags;
+#ifdef WAPBL
+ struct mount *mp;
+#endif
+
+ KASSERT(vp->v_type == VBLK);
+ KASSERT(vp->v_specmountpoint != NULL);
+
+ /*
+ * Flush all dirty data associated with the vnode.
+ */
+ pflags = PGO_ALLPAGES | PGO_CLEANIT;
+ if ((flags & FSYNC_WAIT) != 0)
+ pflags |= PGO_SYNCIO;
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp, 0, 0, pflags);
+ if (error)
+ return error;
+
+#ifdef WAPBL
+ mp = vp->v_specmountpoint;
+ if (mp && mp->mnt_wapbl) {
+ /*
+ * Don't bother writing out metadata if the syncer is
+ * making the request. We will let the sync vnode
+ * write it out in a single burst through a call to
+ * VFS_SYNC().
+ */
+ if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY | FSYNC_NOLOG)) != 0)
+ return 0;
+
+ /*
+ * Don't flush the log if the vnode being flushed
+ * contains no dirty buffers that could be in the log.
+ */
+ if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
+ error = wapbl_flush(mp->mnt_wapbl, 0);
+ if (error)
+ return error;
+ }
+
+ if ((flags & FSYNC_WAIT) != 0) {
+ mutex_enter(vp->v_interlock);
+ while (vp->v_numoutput)
+ cv_wait(&vp->v_cv, vp->v_interlock);
+ mutex_exit(vp->v_interlock);
+ }
+
+ return 0;
+ }
+#endif /* WAPBL */
+
+ error = vflushbuf(vp, (flags & FSYNC_WAIT) != 0);
+ if (error == 0 && (flags & FSYNC_CACHE) != 0) {
+ i = 1;
+ (void)VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE,
+ kauth_cred_get());
+ }
+
+ return error;
+}
--- /dev/null
+/* $NetBSD: ffs_vnops.c,v 1.120 2011/06/27 16:34:47 manu Exp $ */
+
+/*-
+ * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc, and by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.120 2011/06/27 16:34:47 manu Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_wapbl.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/event.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/pool.h>
+#include <sys/signalvar.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/fifofs/fifo.h>
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#include <uvm/uvm.h>
+
+/* Global vfs data structures for ufs. */
+int (**ffs_vnodeop_p)(void *);
+const struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, ufs_lookup }, /* lookup */
+ { &vop_create_desc, ufs_create }, /* create */
+ { &vop_whiteout_desc, ufs_whiteout }, /* whiteout */
+ { &vop_mknod_desc, ufs_mknod }, /* mknod */
+ { &vop_open_desc, ufs_open }, /* open */
+ { &vop_close_desc, ufs_close }, /* close */
+ { &vop_access_desc, ufs_access }, /* access */
+ { &vop_getattr_desc, ufs_getattr }, /* getattr */
+ { &vop_setattr_desc, ufs_setattr }, /* setattr */
+ { &vop_read_desc, ffs_read }, /* read */
+ { &vop_write_desc, ffs_write }, /* write */
+ { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */
+ { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */
+ { &vop_poll_desc, ufs_poll }, /* poll */
+ { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */
+ { &vop_revoke_desc, ufs_revoke }, /* revoke */
+ { &vop_mmap_desc, ufs_mmap }, /* mmap */
+ { &vop_fsync_desc, ffs_fsync }, /* fsync */
+ { &vop_seek_desc, ufs_seek }, /* seek */
+ { &vop_remove_desc, ufs_remove }, /* remove */
+ { &vop_link_desc, ufs_link }, /* link */
+ { &vop_rename_desc, ufs_rename }, /* rename */
+ { &vop_mkdir_desc, ufs_mkdir }, /* mkdir */
+ { &vop_rmdir_desc, ufs_rmdir }, /* rmdir */
+ { &vop_symlink_desc, ufs_symlink }, /* symlink */
+ { &vop_readdir_desc, ufs_readdir }, /* readdir */
+ { &vop_readlink_desc, ufs_readlink }, /* readlink */
+ { &vop_abortop_desc, ufs_abortop }, /* abortop */
+ { &vop_inactive_desc, ufs_inactive }, /* inactive */
+ { &vop_reclaim_desc, ffs_reclaim }, /* reclaim */
+ { &vop_lock_desc, ufs_lock }, /* lock */
+ { &vop_unlock_desc, ufs_unlock }, /* unlock */
+ { &vop_bmap_desc, ufs_bmap }, /* bmap */
+ { &vop_strategy_desc, ufs_strategy }, /* strategy */
+ { &vop_print_desc, ufs_print }, /* print */
+ { &vop_islocked_desc, ufs_islocked }, /* islocked */
+ { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */
+ { &vop_advlock_desc, ufs_advlock }, /* advlock */
+ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */
+ { &vop_getpages_desc, genfs_getpages }, /* getpages */
+ { &vop_putpages_desc, genfs_putpages }, /* putpages */
+ { &vop_openextattr_desc, ffs_openextattr }, /* openextattr */
+ { &vop_closeextattr_desc, ffs_closeextattr }, /* closeextattr */
+ { &vop_getextattr_desc, ffs_getextattr }, /* getextattr */
+ { &vop_setextattr_desc, ffs_setextattr }, /* setextattr */
+ { &vop_listextattr_desc, ffs_listextattr }, /* listextattr */
+ { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */
+ { NULL, NULL }
+};
+const struct vnodeopv_desc ffs_vnodeop_opv_desc =
+ { &ffs_vnodeop_p, ffs_vnodeop_entries };
+
+int (**ffs_specop_p)(void *);
+const struct vnodeopv_entry_desc ffs_specop_entries[] = {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, spec_lookup }, /* lookup */
+ { &vop_create_desc, spec_create }, /* create */
+ { &vop_mknod_desc, spec_mknod }, /* mknod */
+ { &vop_open_desc, spec_open }, /* open */
+ { &vop_close_desc, ufsspec_close }, /* close */
+ { &vop_access_desc, ufs_access }, /* access */
+ { &vop_getattr_desc, ufs_getattr }, /* getattr */
+ { &vop_setattr_desc, ufs_setattr }, /* setattr */
+ { &vop_read_desc, ufsspec_read }, /* read */
+ { &vop_write_desc, ufsspec_write }, /* write */
+ { &vop_ioctl_desc, spec_ioctl }, /* ioctl */
+ { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */
+ { &vop_poll_desc, spec_poll }, /* poll */
+ { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */
+ { &vop_revoke_desc, spec_revoke }, /* revoke */
+ { &vop_mmap_desc, spec_mmap }, /* mmap */
+ { &vop_fsync_desc, ffs_spec_fsync }, /* fsync */
+ { &vop_seek_desc, spec_seek }, /* seek */
+ { &vop_remove_desc, spec_remove }, /* remove */
+ { &vop_link_desc, spec_link }, /* link */
+ { &vop_rename_desc, spec_rename }, /* rename */
+ { &vop_mkdir_desc, spec_mkdir }, /* mkdir */
+ { &vop_rmdir_desc, spec_rmdir }, /* rmdir */
+ { &vop_symlink_desc, spec_symlink }, /* symlink */
+ { &vop_readdir_desc, spec_readdir }, /* readdir */
+ { &vop_readlink_desc, spec_readlink }, /* readlink */
+ { &vop_abortop_desc, spec_abortop }, /* abortop */
+ { &vop_inactive_desc, ufs_inactive }, /* inactive */
+ { &vop_reclaim_desc, ffs_reclaim }, /* reclaim */
+ { &vop_lock_desc, ufs_lock }, /* lock */
+ { &vop_unlock_desc, ufs_unlock }, /* unlock */
+ { &vop_bmap_desc, spec_bmap }, /* bmap */
+ { &vop_strategy_desc, spec_strategy }, /* strategy */
+ { &vop_print_desc, ufs_print }, /* print */
+ { &vop_islocked_desc, ufs_islocked }, /* islocked */
+ { &vop_pathconf_desc, spec_pathconf }, /* pathconf */
+ { &vop_advlock_desc, spec_advlock }, /* advlock */
+ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */
+ { &vop_getpages_desc, spec_getpages }, /* getpages */
+ { &vop_putpages_desc, spec_putpages }, /* putpages */
+ { &vop_openextattr_desc, ffs_openextattr }, /* openextattr */
+ { &vop_closeextattr_desc, ffs_closeextattr }, /* closeextattr */
+ { &vop_getextattr_desc, ffs_getextattr }, /* getextattr */
+ { &vop_setextattr_desc, ffs_setextattr }, /* setextattr */
+ { &vop_listextattr_desc, ffs_listextattr }, /* listextattr */
+ { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */
+ { NULL, NULL }
+};
+const struct vnodeopv_desc ffs_specop_opv_desc =
+ { &ffs_specop_p, ffs_specop_entries };
+
+int (**ffs_fifoop_p)(void *);
+const struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, vn_fifo_bypass }, /* lookup */
+ { &vop_create_desc, vn_fifo_bypass }, /* create */
+ { &vop_mknod_desc, vn_fifo_bypass }, /* mknod */
+ { &vop_open_desc, vn_fifo_bypass }, /* open */
+ { &vop_close_desc, ufsfifo_close }, /* close */
+ { &vop_access_desc, ufs_access }, /* access */
+ { &vop_getattr_desc, ufs_getattr }, /* getattr */
+ { &vop_setattr_desc, ufs_setattr }, /* setattr */
+ { &vop_read_desc, ufsfifo_read }, /* read */
+ { &vop_write_desc, ufsfifo_write }, /* write */
+ { &vop_ioctl_desc, vn_fifo_bypass }, /* ioctl */
+ { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */
+ { &vop_poll_desc, vn_fifo_bypass }, /* poll */
+ { &vop_kqfilter_desc, vn_fifo_bypass }, /* kqfilter */
+ { &vop_revoke_desc, vn_fifo_bypass }, /* revoke */
+ { &vop_mmap_desc, vn_fifo_bypass }, /* mmap */
+ { &vop_fsync_desc, ffs_fsync }, /* fsync */
+ { &vop_seek_desc, vn_fifo_bypass }, /* seek */
+ { &vop_remove_desc, vn_fifo_bypass }, /* remove */
+ { &vop_link_desc, vn_fifo_bypass }, /* link */
+ { &vop_rename_desc, vn_fifo_bypass }, /* rename */
+ { &vop_mkdir_desc, vn_fifo_bypass }, /* mkdir */
+ { &vop_rmdir_desc, vn_fifo_bypass }, /* rmdir */
+ { &vop_symlink_desc, vn_fifo_bypass }, /* symlink */
+ { &vop_readdir_desc, vn_fifo_bypass }, /* readdir */
+ { &vop_readlink_desc, vn_fifo_bypass }, /* readlink */
+ { &vop_abortop_desc, vn_fifo_bypass }, /* abortop */
+ { &vop_inactive_desc, ufs_inactive }, /* inactive */
+ { &vop_reclaim_desc, ffs_reclaim }, /* reclaim */
+ { &vop_lock_desc, ufs_lock }, /* lock */
+ { &vop_unlock_desc, ufs_unlock }, /* unlock */
+ { &vop_bmap_desc, vn_fifo_bypass }, /* bmap */
+ { &vop_strategy_desc, vn_fifo_bypass }, /* strategy */
+ { &vop_print_desc, ufs_print }, /* print */
+ { &vop_islocked_desc, ufs_islocked }, /* islocked */
+ { &vop_pathconf_desc, vn_fifo_bypass }, /* pathconf */
+ { &vop_advlock_desc, vn_fifo_bypass }, /* advlock */
+ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */
+ { &vop_putpages_desc, vn_fifo_bypass }, /* putpages */
+ { &vop_openextattr_desc, ffs_openextattr }, /* openextattr */
+ { &vop_closeextattr_desc, ffs_closeextattr }, /* closeextattr */
+ { &vop_getextattr_desc, ffs_getextattr }, /* getextattr */
+ { &vop_setextattr_desc, ffs_setextattr }, /* setextattr */
+ { &vop_listextattr_desc, ffs_listextattr }, /* listextattr */
+ { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */
+ { NULL, NULL }
+};
+const struct vnodeopv_desc ffs_fifoop_opv_desc =
+ { &ffs_fifoop_p, ffs_fifoop_entries };
+
+#include <ufs/ufs/ufs_readwrite.c>
+
+int
+ffs_spec_fsync(void *v)
+{
+ struct vop_fsync_args /* {
+ struct vnode *a_vp;
+ kauth_cred_t a_cred;
+ int a_flags;
+ off_t a_offlo;
+ off_t a_offhi;
+ struct lwp *a_l;
+ } */ *ap = v;
+ int error, flags, uflags;
+ struct vnode *vp;
+ struct mount *mp;
+
+ flags = ap->a_flags;
+ uflags = UPDATE_CLOSE | ((flags & FSYNC_WAIT) ? UPDATE_WAIT : 0);
+ vp = ap->a_vp;
+ mp = vp->v_mount;
+
+ fstrans_start(mp, FSTRANS_LAZY);
+
+ error = spec_fsync(v);
+ if (error)
+ goto out;
+
+#ifdef WAPBL
+ if (mp && mp->mnt_wapbl) {
+ /*
+ * Don't bother writing out metadata if the syncer is
+ * making the request. We will let the sync vnode
+ * write it out in a single burst through a call to
+ * VFS_SYNC().
+ */
+ if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0)
+ goto out;
+ if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE
+ | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) {
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error != 0)
+ goto out;
+ error = ffs_update(vp, NULL, NULL, uflags);
+ UFS_WAPBL_END(mp);
+ }
+ goto out;
+ }
+#endif /* WAPBL */
+
+ error = ffs_update(vp, NULL, NULL, uflags);
+
+out:
+ fstrans_done(mp);
+ return error;
+}
+
+int
+ffs_fsync(void *v)
+{
+ struct vop_fsync_args /* {
+ struct vnode *a_vp;
+ kauth_cred_t a_cred;
+ int a_flags;
+ off_t a_offlo;
+ off_t a_offhi;
+ struct lwp *a_l;
+ } */ *ap = v;
+ struct buf *bp;
+ int num, error, i;
+ struct indir ia[NIADDR + 1];
+ int bsize;
+ daddr_t blk_high;
+ struct vnode *vp;
+ struct mount *mp;
+
+ vp = ap->a_vp;
+ mp = vp->v_mount;
+
+ fstrans_start(mp, FSTRANS_LAZY);
+ if ((ap->a_offlo == 0 && ap->a_offhi == 0) || (vp->v_type != VREG)) {
+ error = ffs_full_fsync(vp, ap->a_flags);
+ goto out;
+ }
+
+ bsize = mp->mnt_stat.f_iosize;
+ blk_high = ap->a_offhi / bsize;
+ if (ap->a_offhi % bsize != 0)
+ blk_high++;
+
+ /*
+ * First, flush all pages in range.
+ */
+
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo),
+ round_page(ap->a_offhi), PGO_CLEANIT |
+ ((ap->a_flags & FSYNC_WAIT) ? PGO_SYNCIO : 0));
+ if (error) {
+ goto out;
+ }
+
+#ifdef WAPBL
+ KASSERT(vp->v_type == VREG);
+ if (mp->mnt_wapbl) {
+ /*
+ * Don't bother writing out metadata if the syncer is
+ * making the request. We will let the sync vnode
+ * write it out in a single burst through a call to
+ * VFS_SYNC().
+ */
+ if ((ap->a_flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) {
+ fstrans_done(mp);
+ return 0;
+ }
+ error = 0;
+ if (vp->v_tag == VT_UFS && VTOI(vp)->i_flag &
+ (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY |
+ IN_MODIFIED | IN_ACCESSED)) {
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error) {
+ fstrans_done(mp);
+ return error;
+ }
+ error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE |
+ ((ap->a_flags & FSYNC_WAIT) ? UPDATE_WAIT : 0));
+ UFS_WAPBL_END(mp);
+ }
+ if (error || (ap->a_flags & FSYNC_NOLOG) != 0) {
+ fstrans_done(mp);
+ return error;
+ }
+ error = wapbl_flush(mp->mnt_wapbl, 0);
+ fstrans_done(mp);
+ return error;
+ }
+#endif /* WAPBL */
+
+ /*
+ * Then, flush indirect blocks.
+ */
+
+ if (blk_high >= NDADDR) {
+ error = ufs_getlbns(vp, blk_high, ia, &num);
+ if (error)
+ goto out;
+
+ mutex_enter(&bufcache_lock);
+ for (i = 0; i < num; i++) {
+ if ((bp = incore(vp, ia[i].in_lbn)) == NULL)
+ continue;
+ if ((bp->b_cflags & BC_BUSY) != 0 ||
+ (bp->b_oflags & BO_DELWRI) == 0)
+ continue;
+ bp->b_cflags |= BC_BUSY | BC_VFLUSH;
+ mutex_exit(&bufcache_lock);
+ bawrite(bp);
+ mutex_enter(&bufcache_lock);
+ }
+ mutex_exit(&bufcache_lock);
+ }
+
+ if (ap->a_flags & FSYNC_WAIT) {
+ mutex_enter(vp->v_interlock);
+ while (vp->v_numoutput > 0)
+ cv_wait(&vp->v_cv, vp->v_interlock);
+ mutex_exit(vp->v_interlock);
+ }
+
+ error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE |
+ (((ap->a_flags & (FSYNC_WAIT | FSYNC_DATAONLY)) == FSYNC_WAIT)
+ ? UPDATE_WAIT : 0));
+
+ if (error == 0 && ap->a_flags & FSYNC_CACHE) {
+ int l = 0;
+ VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE,
+ curlwp->l_cred);
+ }
+
+out:
+ fstrans_done(mp);
+ return error;
+}
+
+/*
+ * Synch an open file. Called for VOP_FSYNC().
+ */
+/* ARGSUSED */
+int
+ffs_full_fsync(struct vnode *vp, int flags)
+{
+ int error, i, uflags;
+ struct mount *mp;
+
+ KASSERT(vp->v_tag == VT_UFS);
+ KASSERT(VTOI(vp) != NULL);
+ KASSERT(vp->v_type != VCHR && vp->v_type != VBLK);
+
+ error = 0;
+ uflags = UPDATE_CLOSE | ((flags & FSYNC_WAIT) ? UPDATE_WAIT : 0);
+
+ mp = vp->v_mount;
+
+ /*
+ * Flush all dirty data associated with the vnode.
+ */
+ if (vp->v_type == VREG) {
+ int pflags = PGO_ALLPAGES | PGO_CLEANIT;
+
+ if ((flags & FSYNC_WAIT))
+ pflags |= PGO_SYNCIO;
+ if (fstrans_getstate(mp) == FSTRANS_SUSPENDING)
+ pflags |= PGO_FREE;
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp, 0, 0, pflags);
+ if (error)
+ return error;
+ }
+
+#ifdef WAPBL
+ if (mp && mp->mnt_wapbl) {
+ /*
+ * Don't bother writing out metadata if the syncer is
+ * making the request. We will let the sync vnode
+ * write it out in a single burst through a call to
+ * VFS_SYNC().
+ */
+ if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0)
+ return 0;
+
+ if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE
+ | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) {
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error)
+ return error;
+ error = ffs_update(vp, NULL, NULL, uflags);
+ UFS_WAPBL_END(mp);
+ }
+ if (error || (flags & FSYNC_NOLOG) != 0)
+ return error;
+
+ /*
+ * Don't flush the log if the vnode being flushed
+ * contains no dirty buffers that could be in the log.
+ */
+ if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
+ error = wapbl_flush(mp->mnt_wapbl, 0);
+ if (error)
+ return error;
+ }
+
+ if ((flags & FSYNC_WAIT) != 0) {
+ mutex_enter(vp->v_interlock);
+ while (vp->v_numoutput != 0)
+ cv_wait(&vp->v_cv, vp->v_interlock);
+ mutex_exit(vp->v_interlock);
+ }
+
+ return error;
+ }
+#endif /* WAPBL */
+
+ error = vflushbuf(vp, (flags & FSYNC_WAIT) != 0);
+ if (error == 0)
+ error = ffs_update(vp, NULL, NULL, uflags);
+ if (error == 0 && (flags & FSYNC_CACHE) != 0) {
+ i = 1;
+ (void)VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &i, FWRITE,
+ kauth_cred_get());
+ }
+
+ return error;
+}
+
+/*
+ * Reclaim an inode so that it can be used for other purposes.
+ */
+int
+ffs_reclaim(void *v)
+{
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ struct lwp *a_l;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ struct mount *mp = vp->v_mount;
+ struct ufsmount *ump = ip->i_ump;
+ void *data;
+ int error;
+
+ fstrans_start(mp, FSTRANS_LAZY);
+ /*
+ * The inode must be freed and updated before being removed
+ * from its hash chain. Other threads trying to gain a hold
+ * on the inode will be stalled because it is locked (VI_XLOCK).
+ */
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error) {
+ fstrans_done(mp);
+ return error;
+ }
+ if (ip->i_nlink <= 0 && ip->i_omode != 0 &&
+ (vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
+ ffs_vfree(vp, ip->i_number, ip->i_omode);
+ UFS_WAPBL_END(mp);
+ if ((error = ufs_reclaim(vp)) != 0) {
+ fstrans_done(mp);
+ return (error);
+ }
+ if (ip->i_din.ffs1_din != NULL) {
+ if (ump->um_fstype == UFS1)
+ pool_cache_put(ffs_dinode1_cache, ip->i_din.ffs1_din);
+ else
+ pool_cache_put(ffs_dinode2_cache, ip->i_din.ffs2_din);
+ }
+ /*
+ * To interlock with ffs_sync().
+ */
+ genfs_node_destroy(vp);
+ mutex_enter(vp->v_interlock);
+ data = vp->v_data;
+ vp->v_data = NULL;
+ mutex_exit(vp->v_interlock);
+
+ /*
+ * XXX MFS ends up here, too, to free an inode. Should we create
+ * XXX a separate pool for MFS inodes?
+ */
+ pool_cache_put(ffs_inode_cache, data);
+ fstrans_done(mp);
+ return (0);
+}
+
+/*
+ * Return the last logical file offset that should be written for this file
+ * if we're doing a write that ends at "size".
+ */
+
+void
+ffs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
+{
+ struct inode *ip = VTOI(vp);
+ struct fs *fs = ip->i_fs;
+ daddr_t olbn, nlbn;
+
+ olbn = lblkno(fs, ip->i_size);
+ nlbn = lblkno(fs, size);
+ if (nlbn < NDADDR && olbn <= nlbn) {
+ *eobp = fragroundup(fs, size);
+ } else {
+ *eobp = blkroundup(fs, size);
+ }
+}
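+
+/*
+ * Example: if a write ends in a logical block that is still a direct block
+ * (nlbn < NDADDR) and lies at or beyond the current last block, only a
+ * fragment-aligned allocation is needed, hence fragroundup(); any other
+ * case ends on a full block boundary, hence blkroundup().
+ */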
+
+int
+ffs_openextattr(void *v)
+{
+ struct vop_openextattr_args /* {
+ struct vnode *a_vp;
+ kauth_cred_t a_cred;
+ struct proc *a_p;
+ } */ *ap = v;
+ struct inode *ip = VTOI(ap->a_vp);
+ struct fs *fs = ip->i_fs;
+
+ /* Not supported for UFS1 file systems. */
+ if (fs->fs_magic == FS_UFS1_MAGIC)
+ return (EOPNOTSUPP);
+
+ /* XXX Not implemented for UFS2 file systems. */
+ return (EOPNOTSUPP);
+}
+
+int
+ffs_closeextattr(void *v)
+{
+ struct vop_closeextattr_args /* {
+ struct vnode *a_vp;
+ int a_commit;
+ kauth_cred_t a_cred;
+ struct proc *a_p;
+ } */ *ap = v;
+ struct inode *ip = VTOI(ap->a_vp);
+ struct fs *fs = ip->i_fs;
+
+ /* Not supported for UFS1 file systems. */
+ if (fs->fs_magic == FS_UFS1_MAGIC)
+ return (EOPNOTSUPP);
+
+ /* XXX Not implemented for UFS2 file systems. */
+ return (EOPNOTSUPP);
+}
+
+int
+ffs_getextattr(void *v)
+{
+ struct vop_getextattr_args /* {
+ struct vnode *a_vp;
+ int a_attrnamespace;
+ const char *a_name;
+ struct uio *a_uio;
+ size_t *a_size;
+ kauth_cred_t a_cred;
+ struct proc *a_p;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ struct fs *fs = ip->i_fs;
+
+ if (fs->fs_magic == FS_UFS1_MAGIC) {
+#ifdef UFS_EXTATTR
+ int error;
+
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+ error = ufs_getextattr(ap);
+ fstrans_done(vp->v_mount);
+ return error;
+#else
+ return (EOPNOTSUPP);
+#endif
+ }
+
+ /* XXX Not implemented for UFS2 file systems. */
+ return (EOPNOTSUPP);
+}
+
+int
+ffs_setextattr(void *v)
+{
+ struct vop_setextattr_args /* {
+ struct vnode *a_vp;
+ int a_attrnamespace;
+ const char *a_name;
+ struct uio *a_uio;
+ kauth_cred_t a_cred;
+ struct proc *a_p;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ struct fs *fs = ip->i_fs;
+
+ if (fs->fs_magic == FS_UFS1_MAGIC) {
+#ifdef UFS_EXTATTR
+ int error;
+
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+ error = ufs_setextattr(ap);
+ fstrans_done(vp->v_mount);
+ return error;
+#else
+ return (EOPNOTSUPP);
+#endif
+ }
+
+ /* XXX Not implemented for UFS2 file systems. */
+ return (EOPNOTSUPP);
+}
+
+int
+ffs_listextattr(void *v)
+{
+ struct vop_listextattr_args /* {
+ struct vnode *a_vp;
+ int a_attrnamespace;
+ struct uio *a_uio;
+ size_t *a_size;
+ kauth_cred_t a_cred;
+ struct proc *a_p;
+ } */ *ap = v;
+ struct inode *ip = VTOI(ap->a_vp);
+ struct fs *fs = ip->i_fs;
+
+ if (fs->fs_magic == FS_UFS1_MAGIC) {
+#ifdef UFS_EXTATTR
+ struct vnode *vp = ap->a_vp;
+ int error;
+
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+ error = ufs_listextattr(ap);
+ fstrans_done(vp->v_mount);
+ return error;
+#else
+ return (EOPNOTSUPP);
+#endif
+ }
+
+ /* XXX Not implemented for UFS2 file systems. */
+ return (EOPNOTSUPP);
+}
+
+int
+ffs_deleteextattr(void *v)
+{
+ struct vop_deleteextattr_args /* {
+ struct vnode *a_vp;
+ int a_attrnamespace;
+ kauth_cred_t a_cred;
+ struct proc *a_p;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ struct fs *fs = ip->i_fs;
+
+ if (fs->fs_magic == FS_UFS1_MAGIC) {
+#ifdef UFS_EXTATTR
+ int error;
+
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+ error = ufs_deleteextattr(ap);
+ fstrans_done(vp->v_mount);
+ return error;
+#else
+ return (EOPNOTSUPP);
+#endif
+ }
+
+ /* XXX Not implemented for UFS2 file systems. */
+ return (EOPNOTSUPP);
+}
--- /dev/null
+/* $NetBSD: ffs_wapbl.c,v 1.17 2010/12/24 13:38:57 mlelstv Exp $ */
+
+/*-
+ * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_wapbl.c,v 1.17 2010/12/24 13:38:57 mlelstv Exp $");
+
+#define WAPBL_INTERNAL
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/file.h>
+#include <sys/disk.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#undef WAPBL_DEBUG
+#ifdef WAPBL_DEBUG
+int ffs_wapbl_debug = 1;
+#define DPRINTF(fmt, args...) \
+do { \
+ if (ffs_wapbl_debug) \
+ printf("%s:%d "fmt, __func__ , __LINE__, ##args); \
+} while (/* CONSTCOND */0)
+#else
+#define DPRINTF(fmt, args...) \
+do { \
+ /* nothing */ \
+} while (/* CONSTCOND */0)
+#endif
+
+static int ffs_superblock_layout(struct fs *);
+static int wapbl_log_position(struct mount *, struct fs *, struct vnode *,
+ daddr_t *, size_t *, size_t *, uint64_t *);
+static int wapbl_create_infs_log(struct mount *, struct fs *, struct vnode *,
+ daddr_t *, size_t *, uint64_t *);
+static void wapbl_find_log_start(struct mount *, struct vnode *, off_t,
+ daddr_t *, daddr_t *, size_t *);
+static int wapbl_remove_log(struct mount *);
+static int wapbl_allocate_log_file(struct mount *, struct vnode *,
+ daddr_t *, size_t *, uint64_t *);
+
+/*
+ * Return the super block layout format - UFS1 or UFS2.
+ * WAPBL only works with UFS2 layout (which is still available
+ * with FFSv1).
+ *
+ * XXX Should this be in ufs/ffs/fs.h? Same style of check is
+ * also used in ffs_alloc.c in a few places.
+ */
+static int
+ffs_superblock_layout(struct fs *fs)
+{
+ if ((fs->fs_magic == FS_UFS1_MAGIC) &&
+ ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))
+ return 1;
+ else
+ return 2;
+}
+
+/*
+ * This function is invoked after a log is replayed to
+ * disk to perform logical cleanup actions as described by
+ * the log
+ */
+void
+ffs_wapbl_replay_finish(struct mount *mp)
+{
+ struct wapbl_replay *wr = mp->mnt_wapbl_replay;
+ int i;
+ int error;
+
+ if (!wr)
+ return;
+
+ KDASSERT((mp->mnt_flag & MNT_RDONLY) == 0);
+
+ for (i = 0; i < wr->wr_inodescnt; i++) {
+ struct vnode *vp;
+ struct inode *ip;
+ error = VFS_VGET(mp, wr->wr_inodes[i].wr_inumber, &vp);
+ if (error) {
+ printf("ffs_wapbl_replay_finish: "
+ "unable to cleanup inode %" PRIu32 "\n",
+ wr->wr_inodes[i].wr_inumber);
+ continue;
+ }
+ ip = VTOI(vp);
+ KDASSERT(wr->wr_inodes[i].wr_inumber == ip->i_number);
+#ifdef WAPBL_DEBUG
+ printf("ffs_wapbl_replay_finish: "
+ "cleaning inode %" PRIu64 " size=%" PRIu64 " mode=%o nlink=%d\n",
+ ip->i_number, ip->i_size, ip->i_mode, ip->i_nlink);
+#endif
+ KASSERT(ip->i_nlink == 0);
+
+ /*
+ * The journal may have left partially allocated inodes in mode
+ * zero. This may occur if a crash occurs between the node
+ * allocation in ffs_nodealloccg and when the node is properly
+ * initialized in ufs_makeinode. If so, just deallocate them.
+ */
+ if (ip->i_mode == 0) {
+ UFS_WAPBL_BEGIN(mp);
+ ffs_vfree(vp, ip->i_number, wr->wr_inodes[i].wr_imode);
+ UFS_WAPBL_END(mp);
+ }
+ vput(vp);
+ }
+ wapbl_replay_stop(wr);
+ wapbl_replay_free(wr);
+ mp->mnt_wapbl_replay = NULL;
+}
+
+/* Callback for wapbl */
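+/*
+ * Called when the journal commits a transaction: perform the block
+ * deallocations recorded in that transaction and push the updated
+ * cylinder group information back to disk.
+ */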
+void
+ffs_wapbl_sync_metadata(struct mount *mp, daddr_t *deallocblks,
+ int *dealloclens, int dealloccnt)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct fs *fs = ump->um_fs;
+ int i, error;
+
+#ifdef WAPBL_DEBUG_INODES
+ ufs_wapbl_verify_inodes(mp, "ffs_wapbl_sync_metadata");
+#endif
+
+ for (i = 0; i < dealloccnt; i++) {
+ /*
+ * blkfree errors are unreported, might silently fail
+ * if it cannot read the cylinder group block
+ */
+ ffs_blkfree(fs, ump->um_devvp,
+ dbtofsb(fs, deallocblks[i]), dealloclens[i], -1);
+ }
+
+ fs->fs_fmod = 0;
+ fs->fs_time = time_second;
+ error = ffs_cgupdate(ump, 0);
+ KASSERT(error == 0);
+}
+
+void
+ffs_wapbl_abort_sync_metadata(struct mount *mp, daddr_t *deallocblks,
+ int *dealloclens, int dealloccnt)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct fs *fs = ump->um_fs;
+ int i;
+
+ for (i = 0; i < dealloccnt; i++) {
+ /*
+ * Since the above blkfree may have failed, this blkalloc might
+ * fail as well, so don't check its error. Note that if the
+ * blkfree succeeded above, then this shouldn't fail because
+ * the buffer will be locked in the current transaction.
+ */
+ ffs_blkalloc_ump(ump, dbtofsb(fs, deallocblks[i]),
+ dealloclens[i]);
+ }
+}
+
+static int
+wapbl_remove_log(struct mount *mp)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct fs *fs = ump->um_fs;
+ struct vnode *vp;
+ struct inode *ip;
+ ino_t log_ino;
+ int error;
+
+ /* If super block layout is too old to support WAPBL, return */
+ if (ffs_superblock_layout(fs) < 2)
+ return 0;
+
+ /* If all the log locators are 0, just clean up */
+ if (fs->fs_journallocs[0] == 0 &&
+ fs->fs_journallocs[1] == 0 &&
+ fs->fs_journallocs[2] == 0 &&
+ fs->fs_journallocs[3] == 0) {
+ DPRINTF("empty locators, just clear\n");
+ goto done;
+ }
+
+ switch (fs->fs_journal_location) {
+ case UFS_WAPBL_JOURNALLOC_NONE:
+ /* nothing! */
+ DPRINTF("no log\n");
+ break;
+
+ case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
+ log_ino = fs->fs_journallocs[UFS_WAPBL_INFS_INO];
+ DPRINTF("in-fs log, ino = %" PRId64 "\n",log_ino);
+
+ /* if no existing log inode, just clear all fields and bail */
+ if (log_ino == 0)
+ goto done;
+ error = VFS_VGET(mp, log_ino, &vp);
+ if (error != 0) {
+ printf("ffs_wapbl: vget failed %d\n",
+ error);
+ /* clear out log info on error */
+ goto done;
+ }
+ ip = VTOI(vp);
+ KASSERT(log_ino == ip->i_number);
+ if ((ip->i_flags & SF_LOG) == 0) {
+ printf("ffs_wapbl: try to clear non-log inode "
+ "%" PRId64 "\n", log_ino);
+ vput(vp);
+ /* clear out log info on error */
+ goto done;
+ }
+
+ /*
+ * remove the log inode by setting its link count back
+ * to zero and bail.
+ */
+ ip->i_nlink = 0;
+ DIP_ASSIGN(ip, nlink, 0);
+ vput(vp);
+ break;
+
+ case UFS_WAPBL_JOURNALLOC_END_PARTITION:
+ DPRINTF("end-of-partition log\n");
+ /* no extra work required */
+ break;
+
+ default:
+ printf("ffs_wapbl: unknown journal type %d\n",
+ fs->fs_journal_location);
+ break;
+ }
+
+
+done:
+ /* Clear out all previous knowledge of journal */
+ fs->fs_journal_version = 0;
+ fs->fs_journal_location = 0;
+ fs->fs_journal_flags = 0;
+ fs->fs_journallocs[0] = 0;
+ fs->fs_journallocs[1] = 0;
+ fs->fs_journallocs[2] = 0;
+ fs->fs_journallocs[3] = 0;
+ (void) ffs_sbupdate(ump, MNT_WAIT);
+
+ return 0;
+}
+
+int
+ffs_wapbl_start(struct mount *mp)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct fs *fs = ump->um_fs;
+ struct vnode *devvp = ump->um_devvp;
+ daddr_t off;
+ size_t count;
+ size_t blksize;
+ uint64_t extradata;
+ int error;
+
+ if (mp->mnt_wapbl == NULL) {
+ if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CLEAR_LOG) {
+ /* Clear out any existing journal file */
+ error = wapbl_remove_log(mp);
+ if (error != 0)
+ return error;
+ }
+
+ if (mp->mnt_flag & MNT_LOG) {
+ KDASSERT(fs->fs_ronly == 0);
+
+ /* WAPBL needs UFS2 format super block */
+ if (ffs_superblock_layout(fs) < 2) {
+ printf("%s fs superblock in old format, "
+ "not journaling\n",
+ VFSTOUFS(mp)->um_fs->fs_fsmnt);
+ mp->mnt_flag &= ~MNT_LOG;
+ return EINVAL;
+ }
+
+ error = wapbl_log_position(mp, fs, devvp, &off,
+ &count, &blksize, &extradata);
+ if (error)
+ return error;
+
+ error = wapbl_start(&mp->mnt_wapbl, mp, devvp, off,
+ count, blksize, mp->mnt_wapbl_replay,
+ ffs_wapbl_sync_metadata,
+ ffs_wapbl_abort_sync_metadata);
+ if (error)
+ return error;
+
+ mp->mnt_wapbl_op = &wapbl_ops;
+
+#ifdef WAPBL_DEBUG
+ printf("%s: enabling logging\n", fs->fs_fsmnt);
+#endif
+
+ if ((fs->fs_flags & FS_DOWAPBL) == 0) {
+ UFS_WAPBL_BEGIN(mp);
+ fs->fs_flags |= FS_DOWAPBL;
+ error = ffs_sbupdate(ump, MNT_WAIT);
+ if (error) {
+ UFS_WAPBL_END(mp);
+ ffs_wapbl_stop(mp, MNT_FORCE);
+ return error;
+ }
+ UFS_WAPBL_END(mp);
+ error = wapbl_flush(mp->mnt_wapbl, 1);
+ if (error) {
+ ffs_wapbl_stop(mp, MNT_FORCE);
+ return error;
+ }
+ }
+ } else if (fs->fs_flags & FS_DOWAPBL) {
+ fs->fs_fmod = 1;
+ fs->fs_flags &= ~FS_DOWAPBL;
+ }
+ }
+
+ /*
+ * It is recommended that you finish replay with logging enabled.
+ * However, even if logging is not enabled, the remaining log
+ * replay should be safely recoverable with an fsck, so perform
+ * it anyway.
+ */
+ if ((fs->fs_ronly == 0) && mp->mnt_wapbl_replay) {
+ int saveflag = mp->mnt_flag & MNT_RDONLY;
+ /*
+ * Make sure MNT_RDONLY is not set so that the inode
+ * cleanup in ufs_inactive will actually do its work.
+ */
+ mp->mnt_flag &= ~MNT_RDONLY;
+ ffs_wapbl_replay_finish(mp);
+ mp->mnt_flag |= saveflag;
+ KASSERT(fs->fs_ronly == 0);
+ }
+
+ return 0;
+}
+
+int
+ffs_wapbl_stop(struct mount *mp, int force)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct fs *fs = ump->um_fs;
+ int error;
+
+ if (mp->mnt_wapbl) {
+ KDASSERT(fs->fs_ronly == 0);
+
+ /*
+ * Make sure that clearing FS_DOWAPBL is the only change in the
+ * final flush, since otherwise a transaction may reorder writes.
+ */
+ error = wapbl_flush(mp->mnt_wapbl, 1);
+ if (error && !force)
+ return error;
+ if (error && force)
+ goto forceout;
+ error = UFS_WAPBL_BEGIN(mp);
+ if (error && !force)
+ return error;
+ if (error && force)
+ goto forceout;
+ KASSERT(fs->fs_flags & FS_DOWAPBL);
+
+ fs->fs_flags &= ~FS_DOWAPBL;
+ error = ffs_sbupdate(ump, MNT_WAIT);
+ KASSERT(error == 0); /* XXX a bit drastic! */
+ UFS_WAPBL_END(mp);
+ forceout:
+ error = wapbl_stop(mp->mnt_wapbl, force);
+ if (error) {
+ KASSERT(!force);
+ fs->fs_flags |= FS_DOWAPBL;
+ return error;
+ }
+ fs->fs_flags &= ~FS_DOWAPBL; /* Repeat in case of forced error */
+ mp->mnt_wapbl = NULL;
+
+#ifdef WAPBL_DEBUG
+ printf("%s: disabled logging\n", fs->fs_fsmnt);
+#endif
+ }
+
+ return 0;
+}
+
+int
+ffs_wapbl_replay_start(struct mount *mp, struct fs *fs, struct vnode *devvp)
+{
+ int error;
+ daddr_t off;
+ size_t count;
+ size_t blksize;
+ uint64_t extradata;
+
+ /*
+ * WAPBL needs UFS2 format super block, if we got here with a
+ * UFS1 format super block something is amiss...
+ */
+ if (ffs_superblock_layout(fs) < 2)
+ return EINVAL;
+
+ error = wapbl_log_position(mp, fs, devvp, &off, &count, &blksize,
+ &extradata);
+
+ if (error)
+ return error;
+
+ error = wapbl_replay_start(&mp->mnt_wapbl_replay, devvp, off,
+ count, blksize);
+ if (error)
+ return error;
+
+ mp->mnt_wapbl_op = &wapbl_ops;
+
+ return 0;
+}
+
+/*
+ * If the superblock doesn't already have a recorded journal location
+ * then we allocate the journal in one of two positions:
+ *
+ * - At the end of the partition after the filesystem if there's
+ * enough space. "Enough space" is defined as >= 1MB of journal
+ * per 1GB of filesystem or 64MB, whichever is smaller.
+ *
+ * - Inside the filesystem. We try to allocate a contiguous journal
+ * based on the total filesystem size - the target is 1MB of journal
+ * per 1GB of filesystem, up to a maximum journal size of 64MB. As
+ * a worst case allowing for fragmentation, we'll allocate a journal
+ * 1/4 of the desired size but never smaller than 1MB.
+ *
+ * XXX In the future if we allow for non-contiguous journal files we
+ * can tighten the above restrictions.
+ *
+ * XXX
+ * This seems like a lot of duplication both here and in some of
+ * the userland tools (fsck_ffs, dumpfs, tunefs) with similar
+ * "switch (fs_journal_location)" constructs. Can we centralise
+ * this sort of code somehow/somewhere?
+ */
+static int
+wapbl_log_position(struct mount *mp, struct fs *fs, struct vnode *devvp,
+ daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *extradatap)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ daddr_t logstart, logend, desired_logsize;
+ uint64_t numsecs;
+ unsigned secsize;
+ int error, location;
+
+ if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
+ switch (fs->fs_journal_location) {
+ case UFS_WAPBL_JOURNALLOC_END_PARTITION:
+ DPRINTF("found existing end-of-partition log\n");
+ *startp = fs->fs_journallocs[UFS_WAPBL_EPART_ADDR];
+ *countp = fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
+ *blksizep = fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ];
+ DPRINTF(" start = %" PRId64 ", size = %zu, "
+ "blksize = %zu\n", *startp, *countp, *blksizep);
+ return 0;
+
+ case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
+ DPRINTF("found existing in-filesystem log\n");
+ *startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR];
+ *countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
+ *blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
+ DPRINTF(" start = %" PRId64 ", size = %zu, "
+ "blksize = %zu\n", *startp, *countp, *blksizep);
+ return 0;
+
+ default:
+ printf("ffs_wapbl: unknown journal type %d\n",
+ fs->fs_journal_location);
+ return EINVAL;
+ }
+ }
+
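+ /*
+ * No journal location recorded yet: pick a size using the 1 MB of
+ * journal per 1 GB of filesystem rule described above (e.g. a 20 GB
+ * filesystem yields a 20 MB target), then clamp it to the allowed
+ * minimum and maximum below.
+ */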
+ desired_logsize =
+ lfragtosize(fs, fs->fs_size) / UFS_WAPBL_JOURNAL_SCALE;
+ DPRINTF("desired log size = %" PRId64 " kB\n", desired_logsize / 1024);
+ desired_logsize = max(desired_logsize, UFS_WAPBL_MIN_JOURNAL_SIZE);
+ desired_logsize = min(desired_logsize, UFS_WAPBL_MAX_JOURNAL_SIZE);
+ DPRINTF("adjusted desired log size = %" PRId64 " kB\n",
+ desired_logsize / 1024);
+
+ /* Is there space after the filesystem on the partition for the log? */
+ logstart = fsbtodb(fs, fs->fs_size);
+ error = getdisksize(devvp, &numsecs, &secsize);
+ if (error)
+ return error;
+ KDASSERT(secsize != 0);
+ logend = btodb(numsecs * secsize);
+
+ if (dbtob(logend - logstart) >= desired_logsize) {
+ DPRINTF("enough space, use end-of-partition log\n");
+
+ location = UFS_WAPBL_JOURNALLOC_END_PARTITION;
+ *blksizep = secsize;
+
+ *startp = logstart;
+ *countp = (logend - logstart);
+ *extradatap = 0;
+
+ /* convert to physical block numbers */
+ *startp = dbtob(*startp) / secsize;
+ *countp = dbtob(*countp) / secsize;
+
+ fs->fs_journallocs[UFS_WAPBL_EPART_ADDR] = *startp;
+ fs->fs_journallocs[UFS_WAPBL_EPART_COUNT] = *countp;
+ fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ] = *blksizep;
+ fs->fs_journallocs[UFS_WAPBL_EPART_UNUSED] = *extradatap;
+ } else {
+ DPRINTF("end-of-partition has only %" PRId64 " free\n",
+ logend - logstart);
+
+ location = UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM;
+ *blksizep = secsize;
+
+ error = wapbl_create_infs_log(mp, fs, devvp,
+ startp, countp, extradatap);
+ ffs_sync(mp, MNT_WAIT, FSCRED);
+
+ /* convert to physical block numbers */
+ *startp = dbtob(*startp) / secsize;
+ *countp = dbtob(*countp) / secsize;
+
+ fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] = *startp;
+ fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] = *countp;
+ fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ] = *blksizep;
+ fs->fs_journallocs[UFS_WAPBL_INFS_INO] = *extradatap;
+ }
+
+ if (error == 0) {
+ /* update superblock with log location */
+ fs->fs_journal_version = UFS_WAPBL_VERSION;
+ fs->fs_journal_location = location;
+ fs->fs_journal_flags = 0;
+
+ error = ffs_sbupdate(ump, MNT_WAIT);
+ }
+
+ return error;
+}
+
+/*
+ * Try to create a journal log inside the filesystem.
+ */
+static int
+wapbl_create_infs_log(struct mount *mp, struct fs *fs, struct vnode *devvp,
+ daddr_t *startp, size_t *countp, uint64_t *extradatap)
+{
+ struct vnode *vp, *rvp;
+ struct inode *ip;
+ int error;
+
+ if ((error = VFS_ROOT(mp, &rvp)) != 0)
+ return error;
+
+ error = UFS_VALLOC(rvp, 0 | S_IFREG, NOCRED, &vp);
+ if (mp->mnt_flag & MNT_UPDATE) {
+ vput(rvp);
+ } else {
+ VOP_UNLOCK(rvp);
+ vgone(rvp);
+ }
+ if (error != 0)
+ return error;
+
+ vp->v_type = VREG;
+ ip = VTOI(vp);
+ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+ ip->i_mode = 0 | IFREG;
+ DIP_ASSIGN(ip, mode, ip->i_mode);
+ ip->i_flags = SF_LOG;
+ DIP_ASSIGN(ip, flags, ip->i_flags);
+ ip->i_nlink = 1;
+ DIP_ASSIGN(ip, nlink, 1);
+ ffs_update(vp, NULL, NULL, UPDATE_WAIT);
+
+ if ((error = wapbl_allocate_log_file(mp, vp,
+ startp, countp, extradatap)) != 0) {
+ /*
+ * If we couldn't allocate the space for the log file,
+ * remove the inode by setting its link count back to
+ * zero and bail.
+ */
+ ip->i_nlink = 0;
+ DIP_ASSIGN(ip, nlink, 0);
+ VOP_UNLOCK(vp);
+ vgone(vp);
+
+ return error;
+ }
+
+ /*
+ * Now that we have the place-holder inode for the journal,
+ * we don't need the vnode ever again.
+ */
+ VOP_UNLOCK(vp);
+ vgone(vp);
+
+ return 0;
+}
+
+static int
+wapbl_allocate_log_file(struct mount *mp, struct vnode *vp,
+ daddr_t *startp, size_t *countp, uint64_t *extradatap)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct fs *fs = ump->um_fs;
+ daddr_t addr, indir_addr;
+ off_t logsize;
+ size_t size;
+ int error;
+
+ logsize = 0;
+ /* check if there's a suggested log size */
+ if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CREATE_LOG &&
+ fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM)
+ logsize = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
+
+ if (vp->v_size > 0) {
+ printf("%s: file size (%" PRId64 ") non zero\n", __func__,
+ vp->v_size);
+ return EEXIST;
+ }
+ wapbl_find_log_start(mp, vp, logsize, &addr, &indir_addr, &size);
+ if (addr == 0) {
+ printf("%s: log not allocated, largest extent is "
+ "%" PRId64 "MB\n", __func__,
+ lblktosize(fs, size) / (1024 * 1024));
+ return ENOSPC;
+ }
+
+ logsize = lblktosize(fs, size); /* final log size */
+
+ VTOI(vp)->i_ffs_first_data_blk = addr;
+ VTOI(vp)->i_ffs_first_indir_blk = indir_addr;
+
+ error = GOP_ALLOC(vp, 0, logsize, B_CONTIG, FSCRED);
+ if (error) {
+ printf("%s: GOP_ALLOC error %d\n", __func__, error);
+ return error;
+ }
+
+ *startp = fsbtodb(fs, addr);
+ *countp = btodb(logsize);
+ *extradatap = VTOI(vp)->i_number;
+
+ return 0;
+}
+
+/*
+ * Find a suitable location for the journal in the filesystem.
+ *
+ * Our strategy here is to look for a contiguous block of free space
+ * at least "logfile" MB in size (plus room for any indirect blocks).
+ * We start at the middle of the filesystem and check each cylinder
+ * group working outwards. If "logfile" MB is not available as a
+ * single contiguous chunk, then return the address and size of the
+ * largest chunk found.
+ *
+ * XXX
+ * At what stage does the search fail? Is it reasonable to fail if the
+ * largest space we could find is less than a quarter of the requested
+ * space? If the search fails entirely, a block address of "0" is
+ * returned to indicate this.
+ */
+static void
+wapbl_find_log_start(struct mount *mp, struct vnode *vp, off_t logsize,
+ daddr_t *addr, daddr_t *indir_addr, size_t *size)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct fs *fs = ump->um_fs;
+ struct vnode *devvp = ump->um_devvp;
+ struct cg *cgp;
+ struct buf *bp;
+ uint8_t *blksfree;
+ daddr_t blkno, best_addr, start_addr;
+ daddr_t desired_blks, min_desired_blks;
+ daddr_t freeblks, best_blks;
+ int bpcg, cg, error, fixedsize, indir_blks, n, s;
+#ifdef FFS_EI
+ const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+ if (logsize == 0) {
+ fixedsize = 0; /* We can adjust the size if tight */
+ logsize = lfragtosize(fs, fs->fs_dsize) /
+ UFS_WAPBL_JOURNAL_SCALE;
+ DPRINTF("suggested log size = %" PRId64 "\n", logsize);
+ logsize = max(logsize, UFS_WAPBL_MIN_JOURNAL_SIZE);
+ logsize = min(logsize, UFS_WAPBL_MAX_JOURNAL_SIZE);
+ DPRINTF("adjusted log size = %" PRId64 "\n", logsize);
+ } else {
+ fixedsize = 1;
+ DPRINTF("fixed log size = %" PRId64 "\n", logsize);
+ }
+
+ desired_blks = logsize / fs->fs_bsize;
+ DPRINTF("desired blocks = %" PRId64 "\n", desired_blks);
+
+ /* add in number of indirect blocks needed */
+ indir_blks = 0;
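+ /*
+ * A log bigger than NDADDR blocks spills past the inode's direct
+ * block pointers, so also reserve room for the indirect block(s)
+ * needed to map the remainder.
+ */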
+ if (desired_blks >= NDADDR) {
+ struct indir indirs[NIADDR + 2];
+ int num;
+
+ error = ufs_getlbns(vp, desired_blks, indirs, &num);
+ if (error) {
+ printf("%s: ufs_getlbns failed, error %d!\n",
+ __func__, error);
+ goto bad;
+ }
+
+ switch (num) {
+ case 2:
+ indir_blks = 1; /* 1st level indirect */
+ break;
+ case 3:
+ indir_blks = 1 + /* 1st level indirect */
+ 1 + /* 2nd level indirect */
+ indirs[1].in_off + 1; /* extra 1st level indirect */
+ break;
+ default:
+ printf("%s: unexpected numlevels %d from ufs_getlbns\n",
+ __func__, num);
+ *size = 0;
+ goto bad;
+ }
+ desired_blks += indir_blks;
+ }
+ DPRINTF("desired blocks = %" PRId64 " (including indirect)\n",
+ desired_blks);
+
+ /*
+ * If a specific size wasn't requested, allow for a smaller log
+ * if we're really tight for space...
+ */
+ min_desired_blks = desired_blks;
+ if (!fixedsize)
+ min_desired_blks = desired_blks / 4;
+
+ /* Look at number of blocks per CG. If it's too small, bail early. */
+ bpcg = fragstoblks(fs, fs->fs_fpg);
+ if (min_desired_blks > bpcg) {
+ printf("ffs_wapbl: cylinder group size of %" PRId64 " MB "
+ " is not big enough for journal\n",
+ lblktosize(fs, bpcg) / (1024 * 1024));
+ goto bad;
+ }
+
+ /*
+ * Start with the middle cylinder group, and search outwards in
+ * both directions until we either find the requested log size
+ * or reach the start/end of the file system. If we reach the
+ * start/end without finding enough space for the full requested
+ * log size, use the largest extent found if it is large enough
+ * to satisfy our minimum size.
+ *
+ * XXX
+ * Can we just use the cluster contigsum stuff (esp on UFS2)
+ * here to simplify this search code?
+ */
+ best_addr = 0;
+ best_blks = 0;
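+ /*
+ * The loop below walks the cylinder groups in the order
+ * ncg/2, ncg/2 - 1, ncg/2 + 1, ncg/2 - 2, ncg/2 + 2, ...,
+ * i.e. alternating outwards from the middle of the filesystem.
+ */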
+ for (cg = fs->fs_ncg / 2, s = 0, n = 1;
+ best_blks < desired_blks && cg >= 0 && cg < fs->fs_ncg;
+ s++, n = -n, cg += n * s) {
+ DPRINTF("check cg %d of %d\n", cg, fs->fs_ncg);
+ error = bread(devvp, fsbtodb(fs, cgtod(fs, cg)),
+ fs->fs_cgsize, FSCRED, 0, &bp);
+ cgp = (struct cg *)bp->b_data;
+ if (error || !cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
+ brelse(bp, 0);
+ continue;
+ }
+
+ blksfree = cg_blksfree(cgp, needswap);
+
+ for (blkno = 0; blkno < bpcg;) {
+ /* look for next free block */
+ /* XXX use scanc() and fragtbl[] here? */
+ for (; blkno < bpcg - min_desired_blks; blkno++)
+ if (ffs_isblock(fs, blksfree, blkno))
+ break;
+
+ /* past end of search space in this CG? */
+ if (blkno >= bpcg - min_desired_blks)
+ break;
+
+ /* count how many free blocks in this extent */
+ start_addr = blkno;
+ for (freeblks = 0; blkno < bpcg; blkno++, freeblks++)
+ if (!ffs_isblock(fs, blksfree, blkno))
+ break;
+
+ if (freeblks > best_blks) {
+ best_blks = freeblks;
+ best_addr = blkstofrags(fs, start_addr) +
+ cgbase(fs, cg);
+
+ if (freeblks >= desired_blks) {
+ DPRINTF("found len %" PRId64
+ " at offset %" PRId64 " in gc\n",
+ freeblks, start_addr);
+ break;
+ }
+ }
+ }
+ brelse(bp, 0);
+ }
+ DPRINTF("best found len = %" PRId64 ", wanted %" PRId64
+ " at addr %" PRId64 "\n", best_blks, desired_blks, best_addr);
+
+ if (best_blks < min_desired_blks) {
+ *addr = 0;
+ *indir_addr = 0;
+ } else {
+ /* put indirect blocks at start, and data blocks after */
+ *addr = best_addr + blkstofrags(fs, indir_blks);
+ *indir_addr = best_addr;
+ }
+ *size = min(desired_blks, best_blks) - indir_blks;
+ return;
+
+bad:
+ *addr = 0;
+ *indir_addr = 0;
+ *size = 0;
+ return;
+}
--- /dev/null
+# $NetBSD: files.ufs,v 1.27 2011/11/24 15:51:31 ahoka Exp $
+
+deffs FFS
+deffs EXT2FS
+deffs MFS
+deffs LFS
+deffs CHFS
+
+defflag opt_ffs.h FFS_EI FFS_NO_SNAPSHOT APPLE_UFS
+ UFS_DIRHASH
+ UFS_EXTATTR UFS_EXTATTR_AUTOSTART
+
+defflag opt_lfs.h LFS_KERNEL_RFW
+
+file ufs/ext2fs/ext2fs_alloc.c ext2fs
+file ufs/ext2fs/ext2fs_balloc.c ext2fs
+file ufs/ext2fs/ext2fs_bmap.c ext2fs
+file ufs/ext2fs/ext2fs_bswap.c ext2fs
+file ufs/ext2fs/ext2fs_inode.c ext2fs
+file ufs/ext2fs/ext2fs_lookup.c ext2fs
+file ufs/ext2fs/ext2fs_readwrite.c ext2fs
+file ufs/ext2fs/ext2fs_subr.c ext2fs
+file ufs/ext2fs/ext2fs_vfsops.c ext2fs
+file ufs/ext2fs/ext2fs_vnops.c ext2fs
+
+file ufs/chfs/ebh.c chfs
+file ufs/chfs/chfs_ihash.c chfs
+file ufs/chfs/chfs_scan.c chfs
+file ufs/chfs/chfs_write.c chfs
+file ufs/chfs/chfs_vnode_cache.c chfs
+file ufs/chfs/chfs_erase.c chfs
+file ufs/chfs/chfs_build.c chfs
+file ufs/chfs/chfs_wbuf.c chfs
+file ufs/chfs/chfs_vnops.c chfs
+file ufs/chfs/chfs_gc.c chfs
+file ufs/chfs/chfs_nodeops.c chfs
+file ufs/chfs/chfs_malloc.c chfs
+file ufs/chfs/chfs_pool.c chfs
+file ufs/chfs/debug.c chfs
+file ufs/chfs/chfs_vnode.c chfs
+file ufs/chfs/chfs_subr.c chfs
+file ufs/chfs/chfs_vfsops.c chfs
+file ufs/chfs/chfs_readinode.c chfs
+
+file ufs/ffs/ffs_alloc.c ffs | lfs | mfs | ext2fs | chfs
+file ufs/ffs/ffs_balloc.c ffs | lfs | mfs | ext2fs | chfs
+file ufs/ffs/ffs_bswap.c (ffs | mfs) & ffs_ei
+file ufs/ffs/ffs_inode.c ffs | lfs | mfs | ext2fs | chfs
+file ufs/ffs/ffs_snapshot.c ffs | lfs | mfs | ext2fs | chfs
+file ufs/ffs/ffs_subr.c ffs | lfs | mfs | ext2fs | chfs
+file ufs/ffs/ffs_tables.c ffs | lfs | mfs | ext2fs | chfs
+file ufs/ffs/ffs_vfsops.c ffs | lfs | mfs | ext2fs | chfs
+file ufs/ffs/ffs_vnops.c ffs | lfs | mfs | ext2fs | chfs
+file ufs/ffs/ffs_wapbl.c ffs & wapbl
+file ufs/ffs/ffs_appleufs.c ffs & apple_ufs
+file ufs/ffs/ffs_quota2.c quota2 & (ffs | lfs | mfs | ext2fs | chfs)
+
+file ufs/lfs/lfs_alloc.c lfs
+file ufs/lfs/lfs_balloc.c lfs
+file ufs/lfs/lfs_bio.c lfs
+file ufs/lfs/lfs_cksum.c lfs
+file ufs/lfs/lfs_debug.c lfs
+file ufs/lfs/lfs_inode.c lfs
+file ufs/lfs/lfs_itimes.c lfs
+file ufs/lfs/lfs_rfw.c lfs & lfs_kernel_rfw
+file ufs/lfs/lfs_segment.c lfs
+file ufs/lfs/lfs_subr.c lfs
+file ufs/lfs/lfs_syscalls.c lfs
+file ufs/lfs/lfs_vfsops.c lfs
+file ufs/lfs/lfs_vnops.c lfs
+
+file ufs/mfs/mfs_vfsops.c mfs
+file ufs/mfs/mfs_vnops.c mfs
+file ufs/mfs/mfs_miniroot.c
+
+file ufs/ufs/ufs_bmap.c ffs | lfs | mfs | ext2fs | chfs
+file ufs/ufs/ufs_dirhash.c (ffs | lfs | mfs | ext2fs | chfs) & ufs_dirhash
+file ufs/ufs/ufs_extattr.c (ffs | mfs) & ufs_extattr
+file ufs/ufs/ufs_ihash.c ffs | lfs | mfs | ext2fs
+file ufs/ufs/ufs_inode.c ffs | lfs | mfs | ext2fs
+file ufs/ufs/ufs_lookup.c ffs | lfs | mfs | ext2fs | chfs
+file ufs/ufs/ufs_quota.c (quota | quota2) & (ffs | lfs | mfs | ext2fs | chfs)
+file ufs/ufs/ufs_quota1.c quota & (ffs | lfs | mfs | ext2fs | chfs)
+file ufs/ufs/ufs_quota2.c quota2 & (ffs | lfs | mfs | ext2fs | chfs)
+file ufs/ufs/quota1_subr.c
+file ufs/ufs/quota2_subr.c quota2 & (ffs | lfs | mfs | ext2fs | chfs)
+file ufs/ufs/ufs_vfsops.c ffs | lfs | mfs | ext2fs | chfs
+file ufs/ufs/ufs_vnops.c ffs | lfs | mfs | ext2fs | chfs
+file ufs/ufs/ufs_wapbl.c ffs & wapbl
--- /dev/null
+# $NetBSD: CHANGES,v 1.5 2005/12/11 12:25:26 christos Exp $
+
+kernel:
+
+- Instead of blindly continuing when it encounters an Inode that is
+ locked by another process, lfs_markv will process the rest of the
+ inodes passed to it and then return EAGAIN. The cleaner will
+ recognize this and not mark the segment clean. When the cleaner runs
+ again, the segment containing the (formerly) locked inode will sort high
+ for cleaning, since it is now almost entirely empty.
+
+- A beginning has been made to test keeping atime information in the
+ Ifile, instead of on the inodes. This should make read-mostly
+ filesystems significantly faster, since the inodes will then remain
+ close to the data blocks on disk; but of course the ifile will be
+ somewhat larger. This code is not enabled, as it makes the format of
+ IFILEs change.
+
+- The superblock has been broken into two components: an on-disk
+ superblock using fixed-size types, exactly 512 bytes regardless of
+ architecture (or could be enlarged in multiples of the media block
+ size up to LFS_SBPAD); and an in-memory superblock containing the
+ information only useful to a running LFS, including segment pointers,
+ etc. The superblock checksumming code has been modified to make
+ future changes to the superblock format easier.
+
+- Because of the way that lfs_writeseg works, buffers are freed before
+ they are really written to disk: their contents are copied into large
+ buffers which are written async. Because the buffer cache does not
+ serve to throttle these writes, and malloced memory is used to hold them,
+ there is a danger of running out of kmem_map. To avoid this, a new
+ compile-time parameter, LFS_THROTTLE, is used as an upper bound for the
+ number of partial-segments allowed to be in progress writing at any
+ given time.
+
+- If the system crashes between the point that a checkpoint is scheduled
+ for writing and the time that the write completes, the filesystem
+ could be left in an inconsistent state (no valid checkpoints on
+ disk). To avoid this, we toggle between the first two superblocks
+ when checkpointing, and (if it is indicated that no roll-forward agent
+ exists) do not allow one checkpoint to occur before the last one has
+ completed. When the filesystem is mounted, it uses the *older* of the
+ first two superblocks.
+
+- DIROPs:
+
+ The design of the LFS includes segregating vnodes used in directory
+ operations, so that they can be written at the same time during a
+ checkpoint, avoiding filesystem inconsistency after a crash. Code for
+ this was partially written for BSD4.4, but was not complete or enabled.
+
+ In particular, vnodes marked VDIROP could be flushed by getnewvnode at
+ any time, negating the usefulness of marking a vnode VDIROP, since if
+ the filesystem then crashed it would be inconsistent. Now, when a
+ vnode is first marked VDIROP it is also referenced. To avoid running
+ out of vnodes, an attempt to mark more than LFS_MAXDIROP vnodes with
+ VDIROP will sleep, and trigger a partial-segment write when no dirops
+ are active.
+
+- LFS maintains a linked list of free inode numbers in the Ifile;
+ accesses to this list are now protected by a simple lock.
+
+- lfs_vfree is not allowed to run while an inode has blocks scheduled
+ for writing, since that could trigger a miscounting in lfs_truncate.
+
+- lfs_balloc now correctly extends fragments, if a block is written
+ beyond the current end-of-file.
+
+- Blocks which have already been gathered into a partial-segment are not
+ allowed to be extended, since if they were, any blocks following them
+ would either be written in the wrong place, or overwrite other blocks.
+
+- The LFS buffer-header accounting, which triggers a partial-segment
+ write if too many buffer-headers are in use by the LFS subsystem, has
+ been expanded to include *bytes* used in LFS buffers as well.
+
+- Reads of the Ifile, which almost always come from the cleaner, can no
+ longer trigger a partial-segment write, since this could cause a
+ deadlock.
+
+- Support has been added (but not tested, and currently disabled by
+ default) for true read-only filesystems. Currently, if a filesystem
+ is mounted read-only the cleaner can still operate on it, but this
+ obviously would not be true for read-only media. (I think the
+ original plan was for the roll-forward agent to operate using this
+ "feature"?)
+
+- If a fake buffer is created by lfs_markv and another process draws the
+ same block in and changes it, the fake buffer is now discarded and
+ replaced by the "real" buffer containing the new data.
+
+- An inode which has blocks gathered no longer has IN_MODIFIED set, but
+ still does in fact have dirty blocks attached. lfs_update will now
+ wait for such an inode's writes to complete before it runs,
+ suppressing a panic in vinvalbuf.
+
+- Many filesystem operations now update the Ifile's mtime, allowing the
+ cleaner to detect when the filesystem is idle, and clean more
+ vigorously during such times (cf. Blackwell et al., 1995).
+
+- When writing a partial-segment, make sure that the current segment is
+ still marked ACTIVE afterward (otherwise the cleaner might try to
+ clean it, since it might well be mostly empty).
+
+- Don't trust the cleaner so much. Sort the blocks during gathering,
+ even if they came from the cleaner; verify the location of on-disk
+ inodes, even if the cleaner says it knows where they came from.
+
+- The cleaning code (lfs_markv in particular) has been entirely
+ rewritten, and the partial-segment writing code changed to match.
+ Lfs_markv no longer uses its own implementation of lfs_segwrite, but
+ marks inodes with IN_CLEANING to differentiate them from the
+ non-cleaning inodes. This change fixes numerous problems with the old
+ cleaner, including a buffer overrun, and lost extensions in active
+ fragments. lfs_bmapv looks up and returns the addresses of inode
+ blocks, so the cleaner can do something intelligent with them.
+
+ If IN_CLEANING is set on an inode during partial-segment write, only fake
+ buffers will be written, and IN_MODIFIED will not be cleared, saving
+ us from a panic in vinvalbuf. The addition of IN_CLEANING also allows
+ dirops to be active while cleaning is in progress, since otherwise
+ buffers engaged in active dirops might be written ahead of schedule,
+ and cause an inconsistent checkpoint to be written to disk.
+
+ (XXX - even now, DIROP blocks can sometimes be written to disk, if we
+ are cleaning the same blocks as are active? Grr, I don't see a good
+ solution for this!)
+
+- Added sysctl entries for LFS. In particular, `writeindir' controls
+ whether indirect blocks are written during non-checkpoint writes.
+ (Since there is no roll-forward agent as yet, there is no penalty in
+ not writing indirect blocks.)
+
+- Wake up the cleaner at fs-unmount time, so it can die (if we unmount
+ and then remount, we could conceivably get more than one cleaner
+ operating at once).
+
+newfs_lfs:
+
+- The ifile inode is now created with the schg flag set, since nothing
+ ever modifies it. This could be a pain for the roll-forward agent,
+ but since that should really run *before* the filesystem is mounted,
+ I don't care.
+
+- For large disks, it may be necessary to write one or more indirect
+ blocks when the ifile inode is created. Newlfs has been changed to
+ write the first indirect block, if necessary. It should instead just
+ build a set of inodes and blocks, and then use the partial-segment
+ writing routine mentioned above to write an ifile of whatever size is
+ desired.
+
+lfs_cleanerd:
+
+- Now writes information to the syslog.
+
+- Can now deal properly with fragments.
+
+- Sometimes, the cleaner can die. (Why?) If this happens and we don't
+ notice, we're screwed, since the fs will overfill. So, the invoked
+ cleaner now spawns itself repeatedly, a la init(8), to ensure that a
+ cleaner is always present to clean the fs.
+
+- Added a flag to clean more actively, not on low load average but
+ filesystem inactivity; a la Blackwell et al., 1995.
+
+fsck_lfs:
+
+- Exists, although it currently cannot actually fix anything (it is a
+ diagnostic tool only at this point).
--- /dev/null
+# $NetBSD: Makefile,v 1.1 1998/06/12 23:23:12 cgd Exp $
+
+INCSDIR= /usr/include/ufs/lfs
+
+INCS= lfs.h lfs_extern.h
+
+.include <bsd.kinc.mk>
--- /dev/null
+# $NetBSD: README,v 1.3 1999/03/15 00:46:47 perseant Exp $
+
+# @(#)README 8.1 (Berkeley) 6/11/93
+
+The file system is reasonably stable...I think.
+
+For details on the implementation, performance and why garbage
+collection always wins, see Dr. Margo Seltzer's thesis available for
+anonymous ftp from toe.cs.berkeley.edu, in the directory
+pub/personal/margo/thesis.ps.Z, or the January 1993 USENIX paper.
+
+----------
+The disk is laid out in segments. The first segment starts 8K into the
+disk (the first 8K is used for boot information). Each segment is composed
+of the following:
+
+ An optional super block
+ One or more groups of:
+ segment summary
+ 0 or more data blocks
+ 0 or more inode blocks
+
+The segment summary and inode/data blocks start after the super block (if
+present), and grow toward the end of the segment.
+
+ _______________________________________________
+ | | | | |
+ | summary | data/inode | summary | data/inode |
+ | block | blocks | block | blocks | ...
+ |_________|____________|_________|____________|
+
+The data/inode blocks following a summary block are described by the
+summary block. In order to permit the segment to be written in any order
+and in a forward direction only, a checksum is calculated across the
+blocks described by the summary. Additionally, the summary is checksummed
+and timestamped. Both of these are intended for recovery; the former is
+to make it easy to determine that it *is* a summary block and the latter
+is to make it easy to determine when recovery is finished for partially
+written segments. These checksums are also used by the cleaner.
+
+ Summary block (detail)
+ ________________
+ | sum cksum |
+ | data cksum |
+ | next segment |
+ | timestamp |
+ | FINFO count |
+ | inode count |
+ | flags |
+ |______________|
+ | FINFO-1 | 0 or more file info structures, identifying the
+ | . | blocks in the segment.
+ | . |
+ | . |
+ | FINFO-N |
+ | inode-N |
+ | . |
+ | . |
+ | . | 0 or more inode daddr_t's, identifying the inode
+ | inode-1 | blocks in the segment.
+ |______________|
+
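+As a rough illustration, the summary block fields listed above could be
+pictured as a C structure along the following lines (a sketch only; the
+names below are illustrative and are not the actual definitions from
+lfs.h):
+
+	struct summary_sketch {
+		u_int32_t sum_cksum;	/* checksum of this summary block */
+		u_int32_t data_cksum;	/* checksum of the described blocks */
+		int32_t   next_segment;	/* next segment to be written */
+		u_int32_t timestamp;	/* recovery timestamp */
+		u_int16_t finfo_count;	/* number of FINFO structures */
+		u_int16_t inode_count;	/* number of inode block addresses */
+		u_int16_t flags;
+		/* FINFO structures follow, growing forward; inode
+		   daddr_t's are stored at the end, growing backward. */
+	};
+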
+Inode blocks are blocks of on-disk inodes in the same format as those in
+the FFS. However, spare[0] contains the inode number of the inode so we
+can find a particular inode on a page. They are packed page_size /
+sizeof(inode) to a block. Data blocks are exactly as in the FFS. Both
+inodes and data blocks move around the file system at will.
+
+The file system is described by a super-block which is replicated and
+occurs as the first block of the first and other segments. (The maximum
+number of super-blocks is MAXNUMSB). Each super-block maintains a list
+of the disk addresses of all the super-blocks. The super-block maintains
+a small amount of checkpoint information, essentially just enough to find
+the inode for the IFILE (fs->lfs_idaddr).
+
+The IFILE is visible in the file system, as inode number IFILE_INUM. It
+contains information shared between the kernel and various user processes.
+
+ Ifile (detail)
+ ________________
+ | cleaner info | Cleaner information per file system. (Page
+ | | granularity.)
+ |______________|
+ | segment | Space available and last modified times per
+ | usage table | segment. (Page granularity.)
+ |______________|
+ | IFILE-1 | Per inode status information: current version #,
+ | . | if currently allocated, last access time and
+ | . | current disk address of containing inode block.
+ | . | If current disk address is LFS_UNUSED_DADDR, the
+ | IFILE-N | inode is not in use, and it's on the free list.
+ |______________|
+
+
+First Segment at Creation Time:
+_____________________________________________________________
+| | | | | | | |
+| 8K pad | Super | summary | inode | ifile | root | l + f |
+| | block | | block | | dir | dir |
+|________|_______|_________|_______|_______|_______|_______|
+ ^
+ Segment starts here.
+
+Some differences from the Sprite LFS implementation.
+
+1. The Sprite LFS implementation placed the ifile metadata and the super block
+ at fixed locations. This implementation replicates the super block
+ and puts each at a fixed location. The checkpoint data is divided into
+ two parts -- just enough information to find the IFILE is stored in
+ two of the super blocks, although it is not toggled between them as in
+ the Sprite implementation. (This was deliberate, to avoid a single
+ point of failure.) The remaining checkpoint information is treated as
+ a regular file, which means that the cleaner info, the segment usage
+ table and the ifile meta-data are stored in normal log segments.
+ (Tastes great, less filling...)
+
+2. The segment layout is radically different in Sprite; this implementation
+ uses something a lot like network framing, where data/inode blocks are
+ written asynchronously, and a checksum is used to validate any set of
+ summary and data/inode blocks. Sprite writes summary blocks synchronously
+ after the data/inode blocks have been written and the existence of the
+ summary block validates the data/inode blocks. This permits us to write
+ everything contiguously, even partial segments and their summaries, whereas
+ Sprite is forced to seek (from the end of the data inode to the summary
+ which lives at the end of the segment). Additionally, writing the summary
+ synchronously should cost about 1/2 a rotation per summary.
+
+3. Sprite LFS distinguishes between different types of blocks in the segment.
+ Other than inode blocks and data blocks, we don't.
+
+4. Sprite LFS traverses the IFILE looking for free blocks. We maintain a
+ free list threaded through the IFILE entries.
+
+5. The cleaner runs in user space, as opposed to kernel space. It shares
+ information with the kernel by reading/writing the IFILE and through
+ cleaner specific system calls.
+
--- /dev/null
+# $NetBSD: TODO,v 1.10 2005/12/11 12:25:26 christos Exp $
+
+- Lock audit. Need to check locking for multiprocessor case in particular.
+
+- Get rid of lfs_segclean(); the kernel should clean a dirty segment IFF it
+ has passed two checkpoints containing zero live bytes.
+
+- Now that our cache is basically all of physical memory, we need to make
+ sure that segwrite is not starving other important things. Need a way
+ to prioritize which blocks are most important to write, and write only
+ those, saving the rest for later. Does this change our notion of what
+ a checkpoint is?
+
+- Investigate alternate inode locking strategy: Inode locks are useful
+ for locking against simultaneous changes to inode size (balloc,
+ truncate, write) but because the assignment of disk blocks is also
+ covered by the segment lock, we don't really need to pay attention to
+ the inode lock when writing a segment, right? If this is true, the
+ locking problem in lfs_{bmapv,markv} goes away and lfs_reserve can go,
+ too.
+
+- Get rid of DEV_BSIZE, pay attention to the media block size at mount time.
+
+- More fs ops need to call lfs_imtime. Which ones? (Blackwell et al., 1995)
+
+- lfs_vunref_head exists so that vnodes loaded solely for cleaning can
+ be put back on the *head* of the vnode free list. Make sure we
+ actually do this, since we now take IN_CLEANING off during segment write.
+
+- The cleaner could be enhanced to be controlled from other processes,
+ and possibly perform additional tasks:
+
+ - Backups. At a minimum, turn the cleaner off and on to allow
+ effective live backups. More aggressively, the cleaner itself could
+ be the backup agent, and dump_lfs would merely be a controller.
+
+ - Cleaning time policies. Be able to tweak the cleaner's thresholds
+ to allow more thorough cleaning during policy-determined idle
+ periods (regardless of actual idleness) or put off until later
+ during short, intensive write periods.
+
+ - File coalescing and placement. During periods we expect to be idle,
+ coalesce fragmented files into one place on disk for better read
+ performance. Ideally, move files that have not been accessed in a
+ while to the extremes of the disk, thereby shortening seek times for
+ files that are accessed more frequently (though how the cleaner
+ should communicate "please put this near the beginning or end of the
+ disk" to the kernel is a very good question; flags to lfs_markv?).
+
+ - Versioning. When it cleans a segment it could write data for files
+ that were less than n versions old to tape or elsewhere. Perhaps it
+ could even write them back onto the disk, although that requires
+ more thought (and kernel mods).
+
+- Move lfs_countlocked() into vfs_bio.c, to replace count_locked_queue;
+ perhaps keep the name, replace the function. Could it count referenced
+ vnodes as well, if it was in vfs_subr.c instead?
+
+- Why not delete the lfs_bmapv call, just mark everything dirty that
+ isn't deleted/truncated? Get some numbers about what percentage of
+ the stuff that the cleaner thinks might be live is live. If it's
+ high, get rid of lfs_bmapv.
+
+- There is a nasty problem in that it may take *more* room to write the
+ data to clean a segment than is returned by the new segment because of
+ indirect blocks in segment 2 being dirtied by the data being copied
+ into the log from segment 1. The suggested solution at this point is
+ to detect it when we have no space left on the filesystem, write the
+ extra data into the last segment (leaving no clean ones), make it a
+ checkpoint and shut down the file system for fixing by a utility
+ reading the raw partition. Argument is that this should never happen
+ and is practically impossible to fix since the cleaner would have to
+ theoretically build a model of the entire filesystem in memory to
+ detect the condition occurring. A file coalescing cleaner will help
+ avoid the problem, and one that reads/writes from the raw disk could
+ fix it.
+
+- Need to keep vnode v_numoutput up to date for pending writes?
+
+- If we delete a file that's being executed, the version number isn't
+ updated, and fsck_lfs has to figure this out; the case is the same as
+ having an inode that no directory references, so the file should be
+ reattached into lost+found.
+
+- Currently there's no notion of write error checking.
+ + Failed data/inode writes should be rescheduled (kernel level bad blocking).
+ + Failed superblock writes should cause selection of new superblock
+ for checkpointing.
+
+- Future fantasies:
+ - unrm, versioning
+ - transactions
+ - extended cleaner policies (hot/cold data, data placement)
+
+- Problem with the concept of multiple buffer headers referencing the segment:
+ Positives:
+ Don't lock down 1 segment per file system of physical memory.
+ Don't copy from buffers to segment memory.
+ Don't tie down the bus to transfer 1M.
+ Works on controllers supporting less than large transfers.
+ Disk can start writing immediately instead of waiting 1/2 rotation
+ and the full transfer.
+ Negatives:
+ Have to do segment write then segment summary write, since the latter
+ is what verifies that the segment is okay. (Is there another way
+ to do this?)
+
+- The algorithm for selecting the disk addresses of the super-blocks
+ has to be available to the user program which checks the file system.
--- /dev/null
+/* $NetBSD: lfs_alloc.c,v 1.111 2011/06/12 03:36:01 rmind Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)lfs_alloc.c 8.4 (Berkeley) 1/4/94
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.111 2011/06/12 03:36:01 rmind Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/buf.h>
+#include <sys/lock.h>
+#include <sys/vnode.h>
+#include <sys/syslog.h>
+#include <sys/mount.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/proc.h>
+#include <sys/tree.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+/* Constants for inode free bitmap */
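+/* Each 32-bit word of the bitmap covers 32 inodes: ino >> BMSHIFT selects
+ * the word and ino & BMMASK selects the bit within that word. */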
+#define BMSHIFT 5 /* 2 ** 5 = 32 */
+#define BMMASK ((1 << BMSHIFT) - 1)
+#define SET_BITMAP_FREE(F, I) do { \
+ DLOG((DLOG_ALLOC, "lfs: ino %d wrd %d bit %d set\n", (int)(I), \
+ (int)((I) >> BMSHIFT), (int)((I) & BMMASK))); \
+ (F)->lfs_ino_bitmap[(I) >> BMSHIFT] |= (1 << ((I) & BMMASK)); \
+} while (0)
+#define CLR_BITMAP_FREE(F, I) do { \
+ DLOG((DLOG_ALLOC, "lfs: ino %d wrd %d bit %d clr\n", (int)(I), \
+ (int)((I) >> BMSHIFT), (int)((I) & BMMASK))); \
+ (F)->lfs_ino_bitmap[(I) >> BMSHIFT] &= ~(1 << ((I) & BMMASK)); \
+} while(0)
+
+#define ISSET_BITMAP_FREE(F, I) \
+ ((F)->lfs_ino_bitmap[(I) >> BMSHIFT] & (1 << ((I) & BMMASK)))
+
+/*
+ * Add a new block to the Ifile, to accommodate future file creations.
+ * Called with the segment lock held.
+ */
+int
+lfs_extend_ifile(struct lfs *fs, kauth_cred_t cred)
+{
+ struct vnode *vp;
+ struct inode *ip;
+ IFILE *ifp;
+ IFILE_V1 *ifp_v1;
+ struct buf *bp, *cbp;
+ int error;
+ daddr_t i, blkno, xmax;
+ ino_t oldlast, maxino;
+ CLEANERINFO *cip;
+
+ ASSERT_SEGLOCK(fs);
+
+ vp = fs->lfs_ivnode;
+ ip = VTOI(vp);
+ blkno = lblkno(fs, ip->i_size);
+ if ((error = lfs_balloc(vp, ip->i_size, fs->lfs_bsize, cred, 0,
+ &bp)) != 0) {
+ return (error);
+ }
+ ip->i_size += fs->lfs_bsize;
+ ip->i_ffs1_size = ip->i_size;
+ uvm_vnp_setsize(vp, ip->i_size);
+
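+ /*
+ * Recompute how many ifile entries are now available: the Ifile's size
+ * in blocks, minus the cleaner-info and segment-table blocks, times the
+ * number of inode entries per block.
+ */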
+ maxino = ((ip->i_size >> fs->lfs_bshift) - fs->lfs_cleansz -
+ fs->lfs_segtabsz) * fs->lfs_ifpb;
+ fs->lfs_ino_bitmap = (lfs_bm_t *)
+ realloc(fs->lfs_ino_bitmap, ((maxino + BMMASK) >> BMSHIFT) *
+ sizeof(lfs_bm_t), M_SEGMENT, M_WAITOK);
+ KASSERT(fs->lfs_ino_bitmap != NULL);
+
+ i = (blkno - fs->lfs_segtabsz - fs->lfs_cleansz) *
+ fs->lfs_ifpb;
+
+ /*
+ * We insert the new inodes at the head of the free list.
+ * Under normal circumstances, the free list is empty here,
+ * so we are also incidentally placing them at the end (which
+ * we must do if we are to keep them in order).
+ */
+ LFS_GET_HEADFREE(fs, cip, cbp, &oldlast);
+ LFS_PUT_HEADFREE(fs, cip, cbp, i);
+#ifdef DIAGNOSTIC
+ if (fs->lfs_freehd == LFS_UNUSED_INUM)
+ panic("inode 0 allocated [2]");
+#endif /* DIAGNOSTIC */
+ xmax = i + fs->lfs_ifpb;
+
+ if (fs->lfs_version == 1) {
+ for (ifp_v1 = (IFILE_V1 *)bp->b_data; i < xmax; ++ifp_v1) {
+ SET_BITMAP_FREE(fs, i);
+ ifp_v1->if_version = 1;
+ ifp_v1->if_daddr = LFS_UNUSED_DADDR;
+ ifp_v1->if_nextfree = ++i;
+ }
+ ifp_v1--;
+ ifp_v1->if_nextfree = oldlast;
+ } else {
+ for (ifp = (IFILE *)bp->b_data; i < xmax; ++ifp) {
+ SET_BITMAP_FREE(fs, i);
+ ifp->if_version = 1;
+ ifp->if_daddr = LFS_UNUSED_DADDR;
+ ifp->if_nextfree = ++i;
+ }
+ ifp--;
+ ifp->if_nextfree = oldlast;
+ }
+ LFS_PUT_TAILFREE(fs, cip, cbp, xmax - 1);
+
+ (void) LFS_BWRITE_LOG(bp); /* Ifile */
+
+ return 0;
+}
+
+/* Allocate a new inode. */
+/* ARGSUSED */
+/* VOP_BWRITE 2i times */
+int
+lfs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred,
+ struct vnode **vpp)
+{
+ struct lfs *fs;
+ struct buf *bp, *cbp;
+ struct ifile *ifp;
+ ino_t new_ino;
+ int error;
+ int new_gen;
+ CLEANERINFO *cip;
+
+ fs = VTOI(pvp)->i_lfs;
+ if (fs->lfs_ronly)
+ return EROFS;
+
+ ASSERT_NO_SEGLOCK(fs);
+
+ lfs_seglock(fs, SEGM_PROT);
+ vn_lock(fs->lfs_ivnode, LK_EXCLUSIVE);
+
+ /* Get the head of the freelist. */
+ LFS_GET_HEADFREE(fs, cip, cbp, &new_ino);
+ KASSERT(new_ino != LFS_UNUSED_INUM && new_ino != LFS_IFILE_INUM);
+
+ DLOG((DLOG_ALLOC, "lfs_valloc: allocate inode %lld\n",
+ (long long)new_ino));
+
+ /*
+ * Remove the inode from the free list and write the new start
+ * of the free list into the superblock.
+ */
+ CLR_BITMAP_FREE(fs, new_ino);
+ LFS_IENTRY(ifp, fs, new_ino, bp);
+ if (ifp->if_daddr != LFS_UNUSED_DADDR)
+ panic("lfs_valloc: inuse inode %llu on the free list",
+ (unsigned long long)new_ino);
+ LFS_PUT_HEADFREE(fs, cip, cbp, ifp->if_nextfree);
+ DLOG((DLOG_ALLOC, "lfs_valloc: headfree %lld -> %lld\n",
+ (long long)new_ino, (long long)ifp->if_nextfree));
+
+ new_gen = ifp->if_version; /* version was updated by vfree */
+ brelse(bp, 0);
+
+ /* Extend IFILE so that the next lfs_valloc will succeed. */
+ if (fs->lfs_freehd == LFS_UNUSED_INUM) {
+ if ((error = lfs_extend_ifile(fs, cred)) != 0) {
+ LFS_PUT_HEADFREE(fs, cip, cbp, new_ino);
+ VOP_UNLOCK(fs->lfs_ivnode);
+ lfs_segunlock(fs);
+ return error;
+ }
+ }
+#ifdef DIAGNOSTIC
+ if (fs->lfs_freehd == LFS_UNUSED_INUM)
+ panic("inode 0 allocated [3]");
+#endif /* DIAGNOSTIC */
+
+ /* Set superblock modified bit and increment file count. */
+ mutex_enter(&lfs_lock);
+ fs->lfs_fmod = 1;
+ mutex_exit(&lfs_lock);
+ ++fs->lfs_nfiles;
+
+ VOP_UNLOCK(fs->lfs_ivnode);
+ lfs_segunlock(fs);
+
+ return lfs_ialloc(fs, pvp, new_ino, new_gen, vpp);
+}
+
+/*
+ * Finish allocating a new inode, given an inode and generation number.
+ */
+int
+lfs_ialloc(struct lfs *fs, struct vnode *pvp, ino_t new_ino, int new_gen,
+ struct vnode **vpp)
+{
+ struct inode *ip;
+ struct vnode *vp;
+
+ ASSERT_NO_SEGLOCK(fs);
+
+ vp = *vpp;
+ mutex_enter(&ufs_hashlock);
+ /* Create an inode to associate with the vnode. */
+ lfs_vcreate(pvp->v_mount, new_ino, vp);
+
+ ip = VTOI(vp);
+ mutex_enter(&lfs_lock);
+ LFS_SET_UINO(ip, IN_CHANGE);
+ mutex_exit(&lfs_lock);
+ /* on-disk structure has been zeroed out by lfs_vcreate */
+ ip->i_din.ffs1_din->di_inumber = new_ino;
+
+ /* Note no blocks yet */
+ ip->i_lfs_hiblk = -1;
+
+ /* Set a new generation number for this inode. */
+ if (new_gen) {
+ ip->i_gen = new_gen;
+ ip->i_ffs1_gen = new_gen;
+ }
+
+ /* Insert into the inode hash table. */
+ ufs_ihashins(ip);
+ mutex_exit(&ufs_hashlock);
+
+ ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, vpp);
+ vp = *vpp;
+ ip = VTOI(vp);
+
+ memset(ip->i_lfs_fragsize, 0, NDADDR * sizeof(*ip->i_lfs_fragsize));
+
+ uvm_vnp_setsize(vp, 0);
+ lfs_mark_vnode(vp);
+ genfs_node_init(vp, &lfs_genfsops);
+ vref(ip->i_devvp);
+ return (0);
+}
+
+/* Create a new vnode/inode pair and initialize what fields we can. */
+void
+lfs_vcreate(struct mount *mp, ino_t ino, struct vnode *vp)
+{
+ struct inode *ip;
+ struct ufs1_dinode *dp;
+ struct ufsmount *ump;
+
+ /* Get a pointer to the private mount structure. */
+ ump = VFSTOUFS(mp);
+
+ ASSERT_NO_SEGLOCK(ump->um_lfs);
+
+ /* Initialize the inode. */
+ ip = pool_get(&lfs_inode_pool, PR_WAITOK);
+ memset(ip, 0, sizeof(*ip));
+ dp = pool_get(&lfs_dinode_pool, PR_WAITOK);
+ memset(dp, 0, sizeof(*dp));
+ ip->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK);
+ memset(ip->inode_ext.lfs, 0, sizeof(*ip->inode_ext.lfs));
+ vp->v_data = ip;
+ ip->i_din.ffs1_din = dp;
+ ip->i_ump = ump;
+ ip->i_vnode = vp;
+ ip->i_devvp = ump->um_devvp;
+ ip->i_dev = ump->um_dev;
+ ip->i_number = dp->di_inumber = ino;
+ ip->i_lfs = ump->um_lfs;
+ ip->i_lfs_effnblks = 0;
+ SPLAY_INIT(&ip->i_lfs_lbtree);
+ ip->i_lfs_nbtree = 0;
+ LIST_INIT(&ip->i_lfs_segdhd);
+#ifdef QUOTA
+ ufsquota_init(ip);
+#endif
+}
+
+#if 0
+/*
+ * Find the highest-numbered allocated inode.
+ * This will be used to shrink the Ifile.
+ */
+static inline ino_t
+lfs_last_alloc_ino(struct lfs *fs)
+{
+ ino_t ino, maxino;
+
+ maxino = ((fs->lfs_ivnode->v_size >> fs->lfs_bshift) -
+ fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;
+ for (ino = maxino - 1; ino > LFS_UNUSED_INUM; --ino) {
+ if (ISSET_BITMAP_FREE(fs, ino) == 0)
+ break;
+ }
+ return ino;
+}
+#endif
+
+/*
+ * Find the previous (next lowest numbered) free inode, if any.
+ * If there is none, return LFS_UNUSED_INUM.
+ */
+static inline ino_t
+lfs_freelist_prev(struct lfs *fs, ino_t ino)
+{
+ ino_t tino, bound, bb, freehdbb;
+
+ if (fs->lfs_freehd == LFS_UNUSED_INUM) /* No free inodes at all */
+ return LFS_UNUSED_INUM;
+
+ /* Search our own word first */
+ bound = ino & ~BMMASK;
+ for (tino = ino - 1; tino >= bound && tino > LFS_UNUSED_INUM; tino--)
+ if (ISSET_BITMAP_FREE(fs, tino))
+ return tino;
+ /* If there are no lower words to search, just return */
+ if (ino >> BMSHIFT == 0)
+ return LFS_UNUSED_INUM;
+
+ /*
+ * Find a word with a free inode in it. We have to be a bit
+ * careful here since ino_t is unsigned.
+ */
+ freehdbb = (fs->lfs_freehd >> BMSHIFT);
+ for (bb = (ino >> BMSHIFT) - 1; bb >= freehdbb && bb > 0; --bb)
+ if (fs->lfs_ino_bitmap[bb])
+ break;
+ if (fs->lfs_ino_bitmap[bb] == 0)
+ return LFS_UNUSED_INUM;
+
+ /* Search the word we found */
+ for (tino = (bb << BMSHIFT) | BMMASK; tino >= (bb << BMSHIFT) &&
+ tino > LFS_UNUSED_INUM; tino--)
+ if (ISSET_BITMAP_FREE(fs, tino))
+ break;
+
+ if (tino <= LFS_IFILE_INUM)
+ tino = LFS_UNUSED_INUM;
+
+ return tino;
+}
+
+/* Free an inode. */
+/* ARGSUSED */
+/* VOP_BWRITE 2i times */
+int
+lfs_vfree(struct vnode *vp, ino_t ino, int mode)
+{
+ SEGUSE *sup;
+ CLEANERINFO *cip;
+ struct buf *cbp, *bp;
+ struct ifile *ifp;
+ struct inode *ip;
+ struct lfs *fs;
+ daddr_t old_iaddr;
+ ino_t otail;
+
+ /* Get the inode number and file system. */
+ ip = VTOI(vp);
+ fs = ip->i_lfs;
+ ino = ip->i_number;
+
+ ASSERT_NO_SEGLOCK(fs);
+ DLOG((DLOG_ALLOC, "lfs_vfree: free ino %lld\n", (long long)ino));
+
+ /* Drain of pending writes */
+ mutex_enter(vp->v_interlock);
+ while (fs->lfs_version > 1 && WRITEINPROG(vp)) {
+ cv_wait(&vp->v_cv, vp->v_interlock);
+ }
+ mutex_exit(vp->v_interlock);
+
+ lfs_seglock(fs, SEGM_PROT);
+ vn_lock(fs->lfs_ivnode, LK_EXCLUSIVE);
+
+ lfs_unmark_vnode(vp);
+ mutex_enter(&lfs_lock);
+ if (vp->v_uflag & VU_DIROP) {
+ vp->v_uflag &= ~VU_DIROP;
+ --lfs_dirvcount;
+ --fs->lfs_dirvcount;
+ TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
+ wakeup(&fs->lfs_dirvcount);
+ wakeup(&lfs_dirvcount);
+ mutex_exit(&lfs_lock);
+ lfs_vunref(vp);
+
+ /*
+ * If this inode is not going to be written any more, any
+ * segment accounting left over from its truncation needs
+ * to occur at the end of the next dirops flush. Attach
+ * them to the fs-wide list for that purpose.
+ */
+ if (LIST_FIRST(&ip->i_lfs_segdhd) != NULL) {
+ struct segdelta *sd;
+
+ while((sd = LIST_FIRST(&ip->i_lfs_segdhd)) != NULL) {
+ LIST_REMOVE(sd, list);
+ LIST_INSERT_HEAD(&fs->lfs_segdhd, sd, list);
+ }
+ }
+ } else {
+ /*
+ * If it's not a dirop, we can finalize right away.
+ */
+ mutex_exit(&lfs_lock);
+ lfs_finalize_ino_seguse(fs, ip);
+ }
+
+ mutex_enter(&lfs_lock);
+ LFS_CLR_UINO(ip, IN_ACCESSED|IN_CLEANING|IN_MODIFIED);
+ mutex_exit(&lfs_lock);
+ ip->i_flag &= ~IN_ALLMOD;
+ ip->i_lfs_iflags |= LFSI_DELETED;
+
+ /*
+ * Set the ifile's inode entry to unused, increment its version number
+ * and link it onto the free chain.
+ */
+ SET_BITMAP_FREE(fs, ino);
+ LFS_IENTRY(ifp, fs, ino, bp);
+ old_iaddr = ifp->if_daddr;
+ ifp->if_daddr = LFS_UNUSED_DADDR;
+ ++ifp->if_version;
+ if (fs->lfs_version == 1) {
+ LFS_GET_HEADFREE(fs, cip, cbp, &(ifp->if_nextfree));
+ LFS_PUT_HEADFREE(fs, cip, cbp, ino);
+ (void) LFS_BWRITE_LOG(bp); /* Ifile */
+ } else {
+ ino_t tino, onf;
+
+ ifp->if_nextfree = LFS_UNUSED_INUM;
+ (void) LFS_BWRITE_LOG(bp); /* Ifile */
+
+ tino = lfs_freelist_prev(fs, ino);
+ if (tino == LFS_UNUSED_INUM) {
+ /* Nothing free below us, put us on the head */
+ LFS_IENTRY(ifp, fs, ino, bp);
+ LFS_GET_HEADFREE(fs, cip, cbp, &(ifp->if_nextfree));
+ LFS_PUT_HEADFREE(fs, cip, cbp, ino);
+ DLOG((DLOG_ALLOC, "lfs_vfree: headfree %lld -> %lld\n",
+ (long long)ifp->if_nextfree, (long long)ino));
+ LFS_BWRITE_LOG(bp); /* Ifile */
+
+ /* If the list was empty, set tail too */
+ LFS_GET_TAILFREE(fs, cip, cbp, &otail);
+ if (otail == LFS_UNUSED_INUM) {
+ LFS_PUT_TAILFREE(fs, cip, cbp, ino);
+ DLOG((DLOG_ALLOC, "lfs_vfree: tailfree %lld "
+ "-> %lld\n", (long long)otail,
+ (long long)ino));
+ }
+ } else {
+ /*
+ * Insert this inode into the list after tino.
+ * We hold the segment lock so we don't have to
+ * worry about blocks being written out of order.
+ */
+ DLOG((DLOG_ALLOC, "lfs_vfree: insert ino %lld "
+ " after %lld\n", ino, tino));
+
+ LFS_IENTRY(ifp, fs, tino, bp);
+ onf = ifp->if_nextfree;
+ ifp->if_nextfree = ino;
+ LFS_BWRITE_LOG(bp); /* Ifile */
+
+ LFS_IENTRY(ifp, fs, ino, bp);
+ ifp->if_nextfree = onf;
+ LFS_BWRITE_LOG(bp); /* Ifile */
+
+ /* If we're last, put us on the tail */
+ if (onf == LFS_UNUSED_INUM) {
+ LFS_GET_TAILFREE(fs, cip, cbp, &otail);
+ LFS_PUT_TAILFREE(fs, cip, cbp, ino);
+ DLOG((DLOG_ALLOC, "lfs_vfree: tailfree %lld "
+ "-> %lld\n", (long long)otail,
+ (long long)ino));
+ }
+ }
+ }
+#ifdef DIAGNOSTIC
+ if (ino == LFS_UNUSED_INUM) {
+ panic("inode 0 freed");
+ }
+#endif /* DIAGNOSTIC */
+ if (old_iaddr != LFS_UNUSED_DADDR) {
+ LFS_SEGENTRY(sup, fs, dtosn(fs, old_iaddr), bp);
+#ifdef DIAGNOSTIC
+ if (sup->su_nbytes < sizeof (struct ufs1_dinode)) {
+ printf("lfs_vfree: negative byte count"
+ " (segment %" PRIu32 " short by %d)\n",
+ dtosn(fs, old_iaddr),
+ (int)sizeof (struct ufs1_dinode) -
+ sup->su_nbytes);
+ panic("lfs_vfree: negative byte count");
+ sup->su_nbytes = sizeof (struct ufs1_dinode);
+ }
+#endif
+ sup->su_nbytes -= sizeof (struct ufs1_dinode);
+ LFS_WRITESEGENTRY(sup, fs, dtosn(fs, old_iaddr), bp); /* Ifile */
+ }
+
+ /* Set superblock modified bit and decrement file count. */
+ mutex_enter(&lfs_lock);
+ fs->lfs_fmod = 1;
+ mutex_exit(&lfs_lock);
+ --fs->lfs_nfiles;
+
+ VOP_UNLOCK(fs->lfs_ivnode);
+ lfs_segunlock(fs);
+
+ return (0);
+}
+
+/*
+ * Sort the freelist and set up the free-inode bitmap.
+ * To be called by lfs_mountfs().
+ */
+void
+lfs_order_freelist(struct lfs *fs)
+{
+ CLEANERINFO *cip;
+ IFILE *ifp = NULL;
+ struct buf *bp;
+ ino_t ino, firstino, lastino, maxino;
+#ifdef notyet
+ struct vnode *vp;
+#endif
+
+ ASSERT_NO_SEGLOCK(fs);
+ lfs_seglock(fs, SEGM_PROT);
+
+ maxino = ((fs->lfs_ivnode->v_size >> fs->lfs_bshift) -
+ fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;
+ fs->lfs_ino_bitmap = (lfs_bm_t *)
+ malloc(((maxino + BMMASK) >> BMSHIFT) * sizeof(lfs_bm_t),
+ M_SEGMENT, M_WAITOK | M_ZERO);
+ KASSERT(fs->lfs_ino_bitmap != NULL);
+
+ firstino = lastino = LFS_UNUSED_INUM;
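+ /* Walk every ifile entry, chaining free inodes and marking them in the bitmap. */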
+ for (ino = 0; ino < maxino; ino++) {
+ if (ino % fs->lfs_ifpb == 0)
+ LFS_IENTRY(ifp, fs, ino, bp);
+ else
+ ++ifp;
+
+ /* Don't put zero or ifile on the free list */
+ if (ino == LFS_UNUSED_INUM || ino == LFS_IFILE_INUM)
+ continue;
+
+#ifdef notyet
+ /* Address orphaned files */
+ if (ifp->if_nextfree == LFS_ORPHAN_NEXTFREE &&
+ VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp) == 0) {
+ lfs_truncate(vp, 0, 0, NOCRED);
+ vput(vp);
+ LFS_SEGENTRY(sup, fs, dtosn(fs, ifp->if_daddr), bp);
+ KASSERT(sup->su_nbytes >= DINODE1_SIZE);
+ sup->su_nbytes -= DINODE1_SIZE;
+ LFS_WRITESEGENTRY(sup, fs, dtosn(fs, ifp->if_daddr), bp);
+
+ /* Set up to fall through to next section */
+ ifp->if_daddr = LFS_UNUSED_DADDR;
+ LFS_BWRITE_LOG(bp);
+ LFS_IENTRY(ifp, fs, ino, bp);
+ }
+#endif
+
+ if (ifp->if_daddr == LFS_UNUSED_DADDR) {
+ if (firstino == LFS_UNUSED_INUM)
+ firstino = ino;
+ else {
+ brelse(bp, 0);
+
+ LFS_IENTRY(ifp, fs, lastino, bp);
+ ifp->if_nextfree = ino;
+ LFS_BWRITE_LOG(bp);
+
+ LFS_IENTRY(ifp, fs, ino, bp);
+ }
+ lastino = ino;
+
+ SET_BITMAP_FREE(fs, ino);
+ }
+
+ if ((ino + 1) % fs->lfs_ifpb == 0)
+ brelse(bp, 0);
+ }
+
+ LFS_PUT_HEADFREE(fs, cip, bp, firstino);
+ LFS_PUT_TAILFREE(fs, cip, bp, lastino);
+
+ lfs_segunlock(fs);
+}
+
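+/* Mark an inode's ifile entry as orphaned (cf. the orphan handling in lfs_order_freelist). */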
+void
+lfs_orphan(struct lfs *fs, ino_t ino)
+{
+ IFILE *ifp;
+ struct buf *bp;
+
+ LFS_IENTRY(ifp, fs, ino, bp);
+ ifp->if_nextfree = LFS_ORPHAN_NEXTFREE;
+ LFS_BWRITE_LOG(bp);
+}
--- /dev/null
+/* $NetBSD: lfs_balloc.c,v 1.70 2011/07/11 08:27:40 hannken Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)lfs_balloc.c 8.4 (Berkeley) 5/8/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.70 2011/07/11 08:27:40 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/resourcevar.h>
+#include <sys/tree.h>
+#include <sys/trace.h>
+#include <sys/kauth.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <uvm/uvm.h>
+
+int lfs_fragextend(struct vnode *, int, int, daddr_t, struct buf **, kauth_cred_t);
+
+u_int64_t locked_fakequeue_count;
+
+/*
+ * Allocate a block, and do inode and filesystem block accounting for it
+ * and for any indirect blocks that may need to be created in order for
+ * this block to be created.
+ *
+ * Blocks which have never been accounted for (i.e., which "do not exist")
+ * have disk address 0, which is translated by ufs_bmap to the special value
+ * UNASSIGNED == -1, as in the historical UFS.
+ *
+ * Blocks which have been accounted for but which have not yet been written
+ * to disk are given the new special disk address UNWRITTEN == -2, so that
+ * they can be differentiated from completely new blocks.
+ */
+/* VOP_BWRITE NIADDR+2 times */
+int
+lfs_balloc(struct vnode *vp, off_t startoffset, int iosize, kauth_cred_t cred,
+ int flags, struct buf **bpp)
+{
+ int offset;
+ daddr_t daddr, idaddr;
+ struct buf *ibp, *bp;
+ struct inode *ip;
+ struct lfs *fs;
+ struct indir indirs[NIADDR+2], *idp;
+ daddr_t lbn, lastblock;
+ int bcount;
+ int error, frags, i, nsize, osize, num;
+
+ ip = VTOI(vp);
+ fs = ip->i_lfs;
+ offset = blkoff(fs, startoffset);
+ KASSERT(iosize <= fs->lfs_bsize);
+ lbn = lblkno(fs, startoffset);
+ /* (void)lfs_check(vp, lbn, 0); */
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+
+ /*
+ * Three cases: it's a block beyond the end of file, it's a block in
+ * the file that may or may not have been assigned a disk address or
+ * we're writing an entire block.
+ *
+ * Note, if the daddr is UNWRITTEN, the block already exists in
+ * the cache (it was read or written earlier). If so, make sure
+ * we don't count it as a new block or zero out its contents. If
+ * it does not already exist, make sure we allocate any necessary
+ * indirect blocks.
+ *
+ * If we are writing a block beyond the end of the file, we need to
+ * check if the old last block was a fragment. If it was, we need
+ * to rewrite it.
+ */
+
+ if (bpp)
+ *bpp = NULL;
+
+ /* Check for block beyond end of file and fragment extension needed. */
+ lastblock = lblkno(fs, ip->i_size);
+ if (lastblock < NDADDR && lastblock < lbn) {
+ osize = blksize(fs, ip, lastblock);
+ if (osize < fs->lfs_bsize && osize > 0) {
+ if ((error = lfs_fragextend(vp, osize, fs->lfs_bsize,
+ lastblock,
+ (bpp ? &bp : NULL), cred)))
+ return (error);
+ ip->i_ffs1_size = ip->i_size =
+ (lastblock + 1) * fs->lfs_bsize;
+ uvm_vnp_setsize(vp, ip->i_size);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (bpp)
+ (void) VOP_BWRITE(bp->b_vp, bp);
+ }
+ }
+
+ /*
+ * If the block we are writing is a direct block, it's the last
+ * block in the file, and offset + iosize is less than a full
+ * block, we can write one or more fragments. There are two cases:
+ * the block is brand new and we should allocate it the correct
+ * size, or it already exists and contains some fragments and
+ * we may need to extend it.
+ */
+ if (lbn < NDADDR && lblkno(fs, ip->i_size) <= lbn) {
+ osize = blksize(fs, ip, lbn);
+ nsize = fragroundup(fs, offset + iosize);
+ if (lblktosize(fs, lbn) >= ip->i_size) {
+ /* Brand new block or fragment */
+ frags = numfrags(fs, nsize);
+ if (!ISSPACE(fs, frags, cred))
+ return ENOSPC;
+ if (bpp) {
+ *bpp = bp = getblk(vp, lbn, nsize, 0, 0);
+ bp->b_blkno = UNWRITTEN;
+ if (flags & B_CLRBUF)
+ clrbuf(bp);
+ }
+ ip->i_lfs_effnblks += frags;
+ mutex_enter(&lfs_lock);
+ fs->lfs_bfree -= frags;
+ mutex_exit(&lfs_lock);
+ ip->i_ffs1_db[lbn] = UNWRITTEN;
+ } else {
+ if (nsize <= osize) {
+ /* No need to extend */
+ if (bpp && (error = bread(vp, lbn, osize,
+ NOCRED, 0, &bp)))
+ return error;
+ } else {
+ /* Extend existing block */
+ if ((error =
+ lfs_fragextend(vp, osize, nsize, lbn,
+ (bpp ? &bp : NULL), cred)))
+ return error;
+ }
+ if (bpp)
+ *bpp = bp;
+ }
+ return 0;
+ }
+
+ error = ufs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, NULL, NULL);
+ if (error)
+ return (error);
+
+ daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */
+ KASSERT(daddr <= LFS_MAX_DADDR);
+
+ /*
+ * Do byte accounting all at once, so we can gracefully fail *before*
+ * we start assigning blocks.
+ */
+ frags = VFSTOUFS(vp->v_mount)->um_seqinc;
+ bcount = 0;
+ if (daddr == UNASSIGNED) {
+ bcount = frags;
+ }
+ for (i = 1; i < num; ++i) {
+ if (!indirs[i].in_exists) {
+ bcount += frags;
+ }
+ }
+ if (ISSPACE(fs, bcount, cred)) {
+ mutex_enter(&lfs_lock);
+ fs->lfs_bfree -= bcount;
+ mutex_exit(&lfs_lock);
+ ip->i_lfs_effnblks += bcount;
+ } else {
+ return ENOSPC;
+ }
+
+ if (daddr == UNASSIGNED) {
+ if (num > 0 && ip->i_ffs1_ib[indirs[0].in_off] == 0) {
+ ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN;
+ }
+
+ /*
+ * Create new indirect blocks if necessary
+ */
+ if (num > 1) {
+ idaddr = ip->i_ffs1_ib[indirs[0].in_off];
+ for (i = 1; i < num; ++i) {
+ ibp = getblk(vp, indirs[i].in_lbn,
+ fs->lfs_bsize, 0,0);
+ if (!indirs[i].in_exists) {
+ clrbuf(ibp);
+ ibp->b_blkno = UNWRITTEN;
+ } else if (!(ibp->b_oflags & (BO_DELWRI | BO_DONE))) {
+ ibp->b_blkno = fsbtodb(fs, idaddr);
+ ibp->b_flags |= B_READ;
+ VOP_STRATEGY(vp, ibp);
+ biowait(ibp);
+ }
+ /*
+ * This block exists, but the next one may not.
+ * If that is the case mark it UNWRITTEN to keep
+ * the accounting straight.
+ */
+ /* XXX ondisk32 */
+ if (((int32_t *)ibp->b_data)[indirs[i].in_off] == 0)
+ ((int32_t *)ibp->b_data)[indirs[i].in_off] =
+ UNWRITTEN;
+ /* XXX ondisk32 */
+ idaddr = ((int32_t *)ibp->b_data)[indirs[i].in_off];
+#ifdef DEBUG
+ if (vp == fs->lfs_ivnode) {
+ LFS_ENTER_LOG("balloc", __FILE__,
+ __LINE__, indirs[i].in_lbn,
+ ibp->b_flags, curproc->p_pid);
+ }
+#endif
+ if ((error = VOP_BWRITE(ibp->b_vp, ibp)))
+ return error;
+ }
+ }
+ }
+
+
+ /*
+ * Get the existing block from the cache, if requested.
+ */
+ if (bpp)
+ *bpp = bp = getblk(vp, lbn, blksize(fs, ip, lbn), 0, 0);
+
+ /*
+ * Do accounting on blocks that represent pages.
+ */
+ if (!bpp)
+ lfs_register_block(vp, lbn);
+
+ /*
+ * The block we are writing may be a brand new block
+ * in which case we need to do accounting.
+ *
+ * We can tell a truly new block because ufs_bmaparray will say
+ * it is UNASSIGNED. Once we allocate it we will assign it the
+ * disk address UNWRITTEN.
+ */
+ if (daddr == UNASSIGNED) {
+ if (bpp) {
+ if (flags & B_CLRBUF)
+ clrbuf(bp);
+
+ /* Note the new address */
+ bp->b_blkno = UNWRITTEN;
+ }
+
+ switch (num) {
+ case 0:
+ ip->i_ffs1_db[lbn] = UNWRITTEN;
+ break;
+ case 1:
+ ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN;
+ break;
+ default:
+ idp = &indirs[num - 1];
+ if (bread(vp, idp->in_lbn, fs->lfs_bsize, NOCRED,
+ B_MODIFY, &ibp))
+ panic("lfs_balloc: bread bno %lld",
+ (long long)idp->in_lbn);
+ /* XXX ondisk32 */
+ ((int32_t *)ibp->b_data)[idp->in_off] = UNWRITTEN;
+#ifdef DEBUG
+ if (vp == fs->lfs_ivnode) {
+ LFS_ENTER_LOG("balloc", __FILE__,
+ __LINE__, idp->in_lbn,
+ ibp->b_flags, curproc->p_pid);
+ }
+#endif
+ VOP_BWRITE(ibp->b_vp, ibp);
+ }
+ } else if (bpp && !(bp->b_oflags & (BO_DONE|BO_DELWRI))) {
+ /*
+ * Not a brand new block, also not in the cache;
+ * read it in from disk.
+ */
+ if (iosize == fs->lfs_bsize)
+ /* Optimization: I/O is unnecessary. */
+ bp->b_blkno = daddr;
+ else {
+ /*
+ * We need to read the block to preserve the
+ * existing bytes.
+ */
+ bp->b_blkno = daddr;
+ bp->b_flags |= B_READ;
+ VOP_STRATEGY(vp, bp);
+ return (biowait(bp));
+ }
+ }
+
+ return (0);
+}
+
+/* VOP_BWRITE 1 time */
+int
+lfs_fragextend(struct vnode *vp, int osize, int nsize, daddr_t lbn, struct buf **bpp,
+ kauth_cred_t cred)
+{
+ struct inode *ip;
+ struct lfs *fs;
+ long frags;
+ int error;
+ extern long locked_queue_bytes;
+ size_t obufsize;
+
+ ip = VTOI(vp);
+ fs = ip->i_lfs;
+ frags = (long)numfrags(fs, nsize - osize);
+ error = 0;
+
+ ASSERT_NO_SEGLOCK(fs);
+
+ /*
+ * Get the seglock so we don't enlarge blocks while a segment
+ * is being written. If we're called with bpp==NULL, though,
+ * we are only pretending to change a buffer, so we don't have to
+ * lock.
+ */
+ top:
+ if (bpp) {
+ rw_enter(&fs->lfs_fraglock, RW_READER);
+ LFS_DEBUG_COUNTLOCKED("frag");
+ }
+
+ if (!ISSPACE(fs, frags, cred)) {
+ error = ENOSPC;
+ goto out;
+ }
+
+ /*
+ * If we are not asked to actually return the block, all we need
+ * to do is allocate space for it. UBC will handle dirtying the
+ * appropriate things and making sure it all goes to disk.
+ * Don't bother to read in that case.
+ */
+ if (bpp && (error = bread(vp, lbn, osize, NOCRED, 0, bpp))) {
+ brelse(*bpp, 0);
+ goto out;
+ }
+#ifdef QUOTA
+ if ((error = chkdq(ip, frags, cred, 0))) {
+ if (bpp)
+ brelse(*bpp, 0);
+ goto out;
+ }
+#endif
+ /*
+ * Adjust accounting for lfs_avail. If there's not enough room,
+ * we will have to wait for the cleaner, which we can't do while
+ * holding a block busy or while holding the seglock. In that case,
+ * release both and start over after waiting.
+ */
+
+ if (bpp && ((*bpp)->b_oflags & BO_DELWRI)) {
+ if (!lfs_fits(fs, frags)) {
+ if (bpp)
+ brelse(*bpp, 0);
+#ifdef QUOTA
+ chkdq(ip, -frags, cred, 0);
+#endif
+ rw_exit(&fs->lfs_fraglock);
+ lfs_availwait(fs, frags);
+ goto top;
+ }
+ fs->lfs_avail -= frags;
+ }
+
+ mutex_enter(&lfs_lock);
+ fs->lfs_bfree -= frags;
+ mutex_exit(&lfs_lock);
+ ip->i_lfs_effnblks += frags;
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+
+ if (bpp) {
+ obufsize = (*bpp)->b_bufsize;
+ allocbuf(*bpp, nsize, 1);
+
+ /* Adjust locked-list accounting */
+ if (((*bpp)->b_flags & B_LOCKED) != 0 &&
+ (*bpp)->b_iodone == NULL) {
+ mutex_enter(&lfs_lock);
+ locked_queue_bytes += (*bpp)->b_bufsize - obufsize;
+ mutex_exit(&lfs_lock);
+ }
+
+ memset((char *)((*bpp)->b_data) + osize, 0, (u_int)(nsize - osize));
+ }
+
+ out:
+ if (bpp) {
+ rw_exit(&fs->lfs_fraglock);
+ }
+ return (error);
+}
+
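+/* Order lbnentry nodes by logical block number, for the splay tree below. */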
+static inline int
+lge(struct lbnentry *a, struct lbnentry *b)
+{
+ return a->lbn - b->lbn;
+}
+
+SPLAY_PROTOTYPE(lfs_splay, lbnentry, entry, lge);
+
+SPLAY_GENERATE(lfs_splay, lbnentry, entry, lge);
+
+/*
+ * Record this lbn as being "write pending". We used to have this information
+ * on the buffer headers, but since pages don't have buffer headers we
+ * record it here instead.
+ */
+void
+lfs_register_block(struct vnode *vp, daddr_t lbn)
+{
+ struct lfs *fs;
+ struct inode *ip;
+ struct lbnentry *lbp;
+
+ ip = VTOI(vp);
+
+ /* Don't count metadata */
+ if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM)
+ return;
+
+ fs = ip->i_lfs;
+
+ ASSERT_NO_SEGLOCK(fs);
+
+ /* If no space, wait for the cleaner */
+ lfs_availwait(fs, btofsb(fs, 1 << fs->lfs_bshift));
+
+ lbp = (struct lbnentry *)pool_get(&lfs_lbnentry_pool, PR_WAITOK);
+ lbp->lbn = lbn;
+ mutex_enter(&lfs_lock);
+ if (SPLAY_INSERT(lfs_splay, &ip->i_lfs_lbtree, lbp) != NULL) {
+ mutex_exit(&lfs_lock);
+ /* Already there */
+ pool_put(&lfs_lbnentry_pool, lbp);
+ return;
+ }
+
+ ++ip->i_lfs_nbtree;
+ fs->lfs_favail += btofsb(fs, (1 << fs->lfs_bshift));
+ fs->lfs_pages += fs->lfs_bsize >> PAGE_SHIFT;
+ ++locked_fakequeue_count;
+ lfs_subsys_pages += fs->lfs_bsize >> PAGE_SHIFT;
+ mutex_exit(&lfs_lock);
+}
+
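+/*
+ * Remove an lbn from the inode's write-pending tree and undo the
+ * accounting done in lfs_register_block().
+ */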
+static void
+lfs_do_deregister(struct lfs *fs, struct inode *ip, struct lbnentry *lbp)
+{
+ ASSERT_MAYBE_SEGLOCK(fs);
+
+ mutex_enter(&lfs_lock);
+ --ip->i_lfs_nbtree;
+ SPLAY_REMOVE(lfs_splay, &ip->i_lfs_lbtree, lbp);
+ if (fs->lfs_favail > btofsb(fs, (1 << fs->lfs_bshift)))
+ fs->lfs_favail -= btofsb(fs, (1 << fs->lfs_bshift));
+ fs->lfs_pages -= fs->lfs_bsize >> PAGE_SHIFT;
+ if (locked_fakequeue_count > 0)
+ --locked_fakequeue_count;
+ lfs_subsys_pages -= fs->lfs_bsize >> PAGE_SHIFT;
+ mutex_exit(&lfs_lock);
+
+ pool_put(&lfs_lbnentry_pool, lbp);
+}
+
+void
+lfs_deregister_block(struct vnode *vp, daddr_t lbn)
+{
+ struct lfs *fs;
+ struct inode *ip;
+ struct lbnentry *lbp;
+ struct lbnentry tmp;
+
+ ip = VTOI(vp);
+
+ /* Don't count metadata */
+ if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM)
+ return;
+
+ fs = ip->i_lfs;
+ tmp.lbn = lbn;
+ lbp = SPLAY_FIND(lfs_splay, &ip->i_lfs_lbtree, &tmp);
+ if (lbp == NULL)
+ return;
+
+ lfs_do_deregister(fs, ip, lbp);
+}
+
+void
+lfs_deregister_all(struct vnode *vp)
+{
+ struct lbnentry *lbp, *nlbp;
+ struct lfs_splay *hd;
+ struct lfs *fs;
+ struct inode *ip;
+
+ ip = VTOI(vp);
+ fs = ip->i_lfs;
+ hd = &ip->i_lfs_lbtree;
+
+ for (lbp = SPLAY_MIN(lfs_splay, hd); lbp != NULL; lbp = nlbp) {
+ nlbp = SPLAY_NEXT(lfs_splay, hd, lbp);
+ lfs_do_deregister(fs, ip, lbp);
+ }
+}
--- /dev/null
+/* $NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)lfs_bio.c 8.10 (Berkeley) 6/10/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/resourcevar.h>
+#include <sys/mount.h>
+#include <sys/kernel.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <uvm/uvm.h>
+
+/*
+ * LFS block write function.
+ *
+ * XXX
+ * No write cost accounting is done.
+ * This is almost certainly wrong for synchronous operations and NFS.
+ *
+ * protected by lfs_lock.
+ */
+int locked_queue_count = 0; /* Count of locked-down buffers. */
+long locked_queue_bytes = 0L; /* Total size of locked buffers. */
+int lfs_subsys_pages = 0L; /* Total number LFS-written pages */
+int lfs_fs_pagetrip = 0; /* # of pages to trip per-fs write */
+int lfs_writing = 0; /* Set if already kicked off a writer
+ because of buffer space */
+
+/* Lock and condition variables for above. */
+kcondvar_t locked_queue_cv;
+kcondvar_t lfs_writing_cv;
+kmutex_t lfs_lock;
+
+extern int lfs_dostats;
+
+/*
+ * reserved number/bytes of locked buffers
+ */
+int locked_queue_rcount = 0;
+long locked_queue_rbytes = 0L;
+
+static int lfs_fits_buf(struct lfs *, int, int);
+static int lfs_reservebuf(struct lfs *, struct vnode *vp, struct vnode *vp2,
+ int, int);
+static int lfs_reserveavail(struct lfs *, struct vnode *vp, struct vnode *vp2,
+ int);
+
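+/*
+ * Return true if n more locked buffers totalling 'bytes' bytes would
+ * still fit under the LFS_WAIT_BUFS / LFS_WAIT_BYTES limits, taking
+ * outstanding reservations into account.
+ */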
+static int
+lfs_fits_buf(struct lfs *fs, int n, int bytes)
+{
+ int count_fit, bytes_fit;
+
+ ASSERT_NO_SEGLOCK(fs);
+ KASSERT(mutex_owned(&lfs_lock));
+
+ count_fit =
+ (locked_queue_count + locked_queue_rcount + n <= LFS_WAIT_BUFS);
+ bytes_fit =
+ (locked_queue_bytes + locked_queue_rbytes + bytes <= LFS_WAIT_BYTES);
+
+#ifdef DEBUG
+ if (!count_fit) {
+ DLOG((DLOG_AVAIL, "lfs_fits_buf: no fit count: %d + %d + %d >= %d\n",
+ locked_queue_count, locked_queue_rcount,
+ n, LFS_WAIT_BUFS));
+ }
+ if (!bytes_fit) {
+ DLOG((DLOG_AVAIL, "lfs_fits_buf: no fit bytes: %ld + %ld + %d >= %ld\n",
+ locked_queue_bytes, locked_queue_rbytes,
+ bytes, LFS_WAIT_BYTES));
+ }
+#endif /* DEBUG */
+
+ return (count_fit && bytes_fit);
+}
+
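+/*
+ * Reserve (n > 0) or release (n < 0) locked-buffer headroom, flushing
+ * and waiting while the request does not fit.
+ */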
+/* ARGSUSED */
+static int
+lfs_reservebuf(struct lfs *fs, struct vnode *vp,
+ struct vnode *vp2, int n, int bytes)
+{
+ ASSERT_MAYBE_SEGLOCK(fs);
+ KASSERT(locked_queue_rcount >= 0);
+ KASSERT(locked_queue_rbytes >= 0);
+
+ mutex_enter(&lfs_lock);
+ while (n > 0 && !lfs_fits_buf(fs, n, bytes)) {
+ int error;
+
+ lfs_flush(fs, 0, 0);
+
+ error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
+ hz * LFS_BUFWAIT);
+ if (error && error != EWOULDBLOCK) {
+ mutex_exit(&lfs_lock);
+ return error;
+ }
+ }
+
+ locked_queue_rcount += n;
+ locked_queue_rbytes += bytes;
+
+ if (n < 0)
+ cv_broadcast(&locked_queue_cv);
+
+ mutex_exit(&lfs_lock);
+
+ KASSERT(locked_queue_rcount >= 0);
+ KASSERT(locked_queue_rbytes >= 0);
+
+ return 0;
+}
+
+/*
+ * Try to reserve some blocks, prior to performing a sensitive operation that
+ * requires the vnode lock to be honored. If there is not enough space, give
+ * up the vnode lock temporarily and wait for the space to become available.
+ *
+ * Called with vp locked. (Note however that if fsb < 0, vp is ignored.)
+ *
+ * XXX YAMT - it isn't safe to unlock vp here
+ * because the node might be modified while we sleep.
+ * (eg. cached states like i_offset might be stale,
+ * the vnode might be truncated, etc..)
+ * maybe we should have a way to restart the vnodeop (EVOPRESTART?)
+ * or rearrange vnodeop interface to leave vnode locking to file system
+ * specific code so that each file system can have its own vnode locking and
+ * vnode re-using strategies.
+ */
+static int
+lfs_reserveavail(struct lfs *fs, struct vnode *vp,
+ struct vnode *vp2, int fsb)
+{
+ CLEANERINFO *cip;
+ struct buf *bp;
+ int error, slept;
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+ slept = 0;
+ mutex_enter(&lfs_lock);
+ while (fsb > 0 && !lfs_fits(fs, fsb + fs->lfs_ravail + fs->lfs_favail)) {
+ mutex_exit(&lfs_lock);
+#if 0
+ /*
+ * XXX ideally, we should unlock vnodes here
+ * because we might sleep very long time.
+ */
+ VOP_UNLOCK(vp);
+ if (vp2 != NULL) {
+ VOP_UNLOCK(vp2);
+ }
+#else
+ /*
+ * XXX since we'll sleep for the cleaner while holding the vnode lock,
+ * a deadlock will occur if the cleaner tries to lock the vnode.
+ * (eg. lfs_markv -> lfs_fastvget -> getnewvnode -> vclean)
+ */
+#endif
+
+ if (!slept) {
+ DLOG((DLOG_AVAIL, "lfs_reserve: waiting for %ld (bfree = %d,"
+ " est_bfree = %d)\n",
+ fsb + fs->lfs_ravail + fs->lfs_favail,
+ fs->lfs_bfree, LFS_EST_BFREE(fs)));
+ }
+ ++slept;
+
+ /* Wake up the cleaner */
+ LFS_CLEANERINFO(cip, fs, bp);
+ LFS_SYNC_CLEANERINFO(cip, fs, bp, 0);
+ lfs_wakeup_cleaner(fs);
+
+ mutex_enter(&lfs_lock);
+ /* Cleaner might have run while we were reading, check again */
+ if (lfs_fits(fs, fsb + fs->lfs_ravail + fs->lfs_favail))
+ break;
+
+ error = mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_reserve",
+ 0, &lfs_lock);
+#if 0
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX use lockstatus */
+ vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY); /* XXX use lockstatus */
+#endif
+ if (error) {
+ mutex_exit(&lfs_lock);
+ return error;
+ }
+ }
+#ifdef DEBUG
+ if (slept) {
+ DLOG((DLOG_AVAIL, "lfs_reserve: woke up\n"));
+ }
+#endif
+ fs->lfs_ravail += fsb;
+ mutex_exit(&lfs_lock);
+
+ return 0;
+}
+
+#ifdef DIAGNOSTIC
+int lfs_rescount;
+int lfs_rescountdirop;
+#endif
+
+int
+lfs_reserve(struct lfs *fs, struct vnode *vp, struct vnode *vp2, int fsb)
+{
+ int error;
+ int cantwait;
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+ if (vp2) {
+ /* Make sure we're not in the process of reclaiming vp2 */
+ mutex_enter(&lfs_lock);
+ while(fs->lfs_flags & LFS_UNDIROP) {
+ mtsleep(&fs->lfs_flags, PRIBIO + 1, "lfsrundirop", 0,
+ &lfs_lock);
+ }
+ mutex_exit(&lfs_lock);
+ }
+
+ KASSERT(fsb < 0 || VOP_ISLOCKED(vp));
+ KASSERT(vp2 == NULL || fsb < 0 || VOP_ISLOCKED(vp2));
+ KASSERT(vp2 == NULL || !(VTOI(vp2)->i_flag & IN_ADIROP));
+ KASSERT(vp2 == NULL || vp2 != fs->lfs_unlockvp);
+
+ cantwait = (VTOI(vp)->i_flag & IN_ADIROP) || fs->lfs_unlockvp == vp;
+#ifdef DIAGNOSTIC
+ if (cantwait) {
+ if (fsb > 0)
+ lfs_rescountdirop++;
+ else if (fsb < 0)
+ lfs_rescountdirop--;
+ if (lfs_rescountdirop < 0)
+ panic("lfs_rescountdirop");
+ }
+ else {
+ if (fsb > 0)
+ lfs_rescount++;
+ else if (fsb < 0)
+ lfs_rescount--;
+ if (lfs_rescount < 0)
+ panic("lfs_rescount");
+ }
+#endif
+ if (cantwait)
+ return 0;
+
+ /*
+ * XXX
+ * vref vnodes here so that cleaner doesn't try to reuse them.
+ * (see XXX comment in lfs_reserveavail)
+ */
+ vhold(vp);
+ if (vp2 != NULL) {
+ vhold(vp2);
+ }
+
+ error = lfs_reserveavail(fs, vp, vp2, fsb);
+ if (error)
+ goto done;
+
+ /*
+ * XXX just a guess. should be more precise.
+ */
+ error = lfs_reservebuf(fs, vp, vp2, fsb, fsbtob(fs, fsb));
+ if (error)
+ lfs_reserveavail(fs, vp, vp2, -fsb);
+
+done:
+ holdrele(vp);
+ if (vp2 != NULL) {
+ holdrele(vp2);
+ }
+
+ return error;
+}
+
+int
+lfs_bwrite(void *v)
+{
+ struct vop_bwrite_args /* {
+ struct vnode *a_vp;
+ struct buf *a_bp;
+ } */ *ap = v;
+ struct buf *bp = ap->a_bp;
+
+#ifdef DIAGNOSTIC
+ if (VTOI(bp->b_vp)->i_lfs->lfs_ronly == 0 && (bp->b_flags & B_ASYNC)) {
+ panic("bawrite LFS buffer");
+ }
+#endif /* DIAGNOSTIC */
+ return lfs_bwrite_ext(bp, 0);
+}
+
+/*
+ * Determine if there is enough room currently available to write fsb
+ * blocks. We need enough blocks for the new blocks, the current
+ * inode blocks (including potentially the ifile inode), a summary block,
+ * and the segment usage table, plus an ifile block.
+ */
+int
+lfs_fits(struct lfs *fs, int fsb)
+{
+ int needed;
+
+ ASSERT_NO_SEGLOCK(fs);
+ needed = fsb + btofsb(fs, fs->lfs_sumsize) +
+ ((howmany(fs->lfs_uinodes + 1, INOPB(fs)) + fs->lfs_segtabsz +
+ 1) << (fs->lfs_bshift - fs->lfs_ffshift));
+
+ if (needed >= fs->lfs_avail) {
+#ifdef DEBUG
+ DLOG((DLOG_AVAIL, "lfs_fits: no fit: fsb = %ld, uinodes = %ld, "
+ "needed = %ld, avail = %ld\n",
+ (long)fsb, (long)fs->lfs_uinodes, (long)needed,
+ (long)fs->lfs_avail));
+#endif
+ return 0;
+ }
+ return 1;
+}
+
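+/*
+ * Wait, waking the cleaner as needed, until fsb blocks fit on disk.
+ * Writes from the cleaner (or a forced checkpoint) are let through regardless.
+ */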
+int
+lfs_availwait(struct lfs *fs, int fsb)
+{
+ int error;
+ CLEANERINFO *cip;
+ struct buf *cbp;
+
+ ASSERT_NO_SEGLOCK(fs);
+ /* Push cleaner blocks through regardless */
+ mutex_enter(&lfs_lock);
+ if (LFS_SEGLOCK_HELD(fs) &&
+ fs->lfs_sp->seg_flags & (SEGM_CLEAN | SEGM_FORCE_CKP)) {
+ mutex_exit(&lfs_lock);
+ return 0;
+ }
+ mutex_exit(&lfs_lock);
+
+ while (!lfs_fits(fs, fsb)) {
+ /*
+ * Out of space, need cleaner to run.
+ * Update the cleaner info, then wake it up.
+ * Note the cleanerinfo block is on the ifile
+ * so it CANT_WAIT.
+ */
+ LFS_CLEANERINFO(cip, fs, cbp);
+ LFS_SYNC_CLEANERINFO(cip, fs, cbp, 0);
+
+#ifdef DEBUG
+ DLOG((DLOG_AVAIL, "lfs_availwait: out of available space, "
+ "waiting on cleaner\n"));
+#endif
+
+ lfs_wakeup_cleaner(fs);
+#ifdef DIAGNOSTIC
+ if (LFS_SEGLOCK_HELD(fs))
+ panic("lfs_availwait: deadlock");
+#endif
+ error = tsleep(&fs->lfs_avail, PCATCH | PUSER, "cleaner", 0);
+ if (error)
+ return (error);
+ }
+ return 0;
+}
+
+int
+lfs_bwrite_ext(struct buf *bp, int flags)
+{
+ struct lfs *fs;
+ struct inode *ip;
+ struct vnode *vp;
+ int fsb;
+
+ vp = bp->b_vp;
+ fs = VFSTOUFS(vp->v_mount)->um_lfs;
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+ KASSERT(bp->b_cflags & BC_BUSY);
+ KASSERT(flags & BW_CLEAN || !LFS_IS_MALLOC_BUF(bp));
+ KASSERT(((bp->b_oflags | bp->b_flags) & (BO_DELWRI|B_LOCKED))
+ != BO_DELWRI);
+
+ /*
+ * Don't write *any* blocks if we're mounted read-only, or
+ * if we are "already unmounted".
+ *
+ * In particular the cleaner can't write blocks either.
+ */
+ if (fs->lfs_ronly || (fs->lfs_pflags & LFS_PF_CLEAN)) {
+ bp->b_oflags &= ~BO_DELWRI;
+ bp->b_flags |= B_READ;
+ bp->b_error = 0;
+ mutex_enter(&bufcache_lock);
+ LFS_UNLOCK_BUF(bp);
+ if (LFS_IS_MALLOC_BUF(bp))
+ bp->b_cflags &= ~BC_BUSY;
+ else
+ brelsel(bp, 0);
+ mutex_exit(&bufcache_lock);
+ return (fs->lfs_ronly ? EROFS : 0);
+ }
+
+ /*
+ * Set the delayed write flag and use reassignbuf to move the buffer
+ * from the clean list to the dirty one.
+ *
+ * Set the B_LOCKED flag and unlock the buffer, causing brelse to move
+ * the buffer onto the LOCKED free list. This is necessary, otherwise
+ * getnewbuf() would try to reclaim the buffers using bawrite, which
+ * isn't going to work.
+ *
+ * XXX we don't let meta-data writes run out of space because they can
+ * come from the segment writer. We need to make sure that there is
+ * enough space reserved so that there's room to write meta-data
+ * blocks.
+ */
+ if ((bp->b_flags & B_LOCKED) == 0) {
+ fsb = numfrags(fs, bp->b_bcount);
+
+ ip = VTOI(vp);
+ mutex_enter(&lfs_lock);
+ if (flags & BW_CLEAN) {
+ LFS_SET_UINO(ip, IN_CLEANING);
+ } else {
+ LFS_SET_UINO(ip, IN_MODIFIED);
+ }
+ mutex_exit(&lfs_lock);
+ fs->lfs_avail -= fsb;
+
+ mutex_enter(&bufcache_lock);
+ mutex_enter(vp->v_interlock);
+ bp->b_oflags = (bp->b_oflags | BO_DELWRI) & ~BO_DONE;
+ LFS_LOCK_BUF(bp);
+ bp->b_flags &= ~B_READ;
+ bp->b_error = 0;
+ reassignbuf(bp, bp->b_vp);
+ mutex_exit(vp->v_interlock);
+ } else {
+ mutex_enter(&bufcache_lock);
+ }
+
+ if (bp->b_iodone != NULL)
+ bp->b_cflags &= ~BC_BUSY;
+ else
+ brelsel(bp, 0);
+ mutex_exit(&bufcache_lock);
+
+ return (0);
+}
+
+/*
+ * Called and return with the lfs_lock held.
+ */
+void
+lfs_flush_fs(struct lfs *fs, int flags)
+{
+ ASSERT_NO_SEGLOCK(fs);
+ KASSERT(mutex_owned(&lfs_lock));
+ if (fs->lfs_ronly)
+ return;
+
+ if (lfs_dostats)
+ ++lfs_stats.flush_invoked;
+
+ mutex_exit(&lfs_lock);
+ lfs_writer_enter(fs, "fldirop");
+ lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
+ lfs_writer_leave(fs);
+ mutex_enter(&lfs_lock);
+ fs->lfs_favail = 0; /* XXX */
+}
+
+/*
+ * This routine initiates segment writes when LFS is consuming too many
+ * resources. Ideally the pageout daemon would be able to direct LFS
+ * more subtly.
+ * XXX We have one static count of locked buffers;
+ * XXX need to think more about the multiple filesystem case.
+ *
+ * Called and return with lfs_lock held.
+ * If fs != NULL, we hold the segment lock for fs.
+ */
+void
+lfs_flush(struct lfs *fs, int flags, int only_onefs)
+{
+ extern u_int64_t locked_fakequeue_count;
+ struct mount *mp, *nmp;
+ struct lfs *tfs;
+
+ KASSERT(mutex_owned(&lfs_lock));
+ KDASSERT(fs == NULL || !LFS_SEGLOCK_HELD(fs));
+
+ if (lfs_dostats)
+ ++lfs_stats.write_exceeded;
+ /* XXX should we include SEGM_CKP here? */
+ if (lfs_writing && !(flags & SEGM_SYNC)) {
+ DLOG((DLOG_FLUSH, "lfs_flush: not flushing because another flush is active\n"));
+ return;
+ }
+ while (lfs_writing)
+ cv_wait(&lfs_writing_cv, &lfs_lock);
+ lfs_writing = 1;
+
+ mutex_exit(&lfs_lock);
+
+ if (only_onefs) {
+ KASSERT(fs != NULL);
+ if (vfs_busy(fs->lfs_ivnode->v_mount, NULL))
+ goto errout;
+ mutex_enter(&lfs_lock);
+ lfs_flush_fs(fs, flags);
+ mutex_exit(&lfs_lock);
+ vfs_unbusy(fs->lfs_ivnode->v_mount, false, NULL);
+ } else {
+ locked_fakequeue_count = 0;
+ mutex_enter(&mountlist_lock);
+ for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
+ mp = nmp) {
+ if (vfs_busy(mp, &nmp)) {
+ DLOG((DLOG_FLUSH, "lfs_flush: fs vfs_busy\n"));
+ continue;
+ }
+ if (strncmp(&mp->mnt_stat.f_fstypename[0], MOUNT_LFS,
+ sizeof(mp->mnt_stat.f_fstypename)) == 0) {
+ tfs = VFSTOUFS(mp)->um_lfs;
+ mutex_enter(&lfs_lock);
+ lfs_flush_fs(tfs, flags);
+ mutex_exit(&lfs_lock);
+ }
+ vfs_unbusy(mp, false, &nmp);
+ }
+ mutex_exit(&mountlist_lock);
+ }
+ LFS_DEBUG_COUNTLOCKED("flush");
+ wakeup(&lfs_subsys_pages);
+
+ errout:
+ mutex_enter(&lfs_lock);
+ KASSERT(lfs_writing);
+ lfs_writing = 0;
+ wakeup(&lfs_writing);
+}
+
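+/* Estimated buffers and bytes that the dirty inodes will add to the locked queue. */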
+#define INOCOUNT(fs) howmany((fs)->lfs_uinodes, INOPB(fs))
+#define INOBYTES(fs) ((fs)->lfs_uinodes * sizeof (struct ufs1_dinode))
+
+/*
+ * Make sure that we don't have too many locked buffers;
+ * flush buffers if needed.
+ */
+int
+lfs_check(struct vnode *vp, daddr_t blkno, int flags)
+{
+ int error;
+ struct lfs *fs;
+ struct inode *ip;
+ extern pid_t lfs_writer_daemon;
+
+ error = 0;
+ ip = VTOI(vp);
+
+ /* If out of buffers, wait on writer */
+ /* XXX KS - if it's the Ifile, we're probably the cleaner! */
+ if (ip->i_number == LFS_IFILE_INUM)
+ return 0;
+ /* If we're being called from inside a dirop, don't sleep */
+ if (ip->i_flag & IN_ADIROP)
+ return 0;
+
+ fs = ip->i_lfs;
+
+ ASSERT_NO_SEGLOCK(fs);
+
+ /*
+ * If we would flush below, but dirops are active, sleep.
+ * Note that a dirop cannot ever reach this code!
+ */
+ mutex_enter(&lfs_lock);
+ while (fs->lfs_dirops > 0 &&
+ (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
+ locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES ||
+ lfs_subsys_pages > LFS_MAX_PAGES ||
+ fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
+ lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0))
+ {
+ ++fs->lfs_diropwait;
+ mtsleep(&fs->lfs_writer, PRIBIO+1, "bufdirop", 0,
+ &lfs_lock);
+ --fs->lfs_diropwait;
+ }
+
+#ifdef DEBUG
+ if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS)
+ DLOG((DLOG_FLUSH, "lfs_check: lqc = %d, max %d\n",
+ locked_queue_count + INOCOUNT(fs), LFS_MAX_BUFS));
+ if (locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES)
+ DLOG((DLOG_FLUSH, "lfs_check: lqb = %ld, max %ld\n",
+ locked_queue_bytes + INOBYTES(fs), LFS_MAX_BYTES));
+ if (lfs_subsys_pages > LFS_MAX_PAGES)
+ DLOG((DLOG_FLUSH, "lfs_check: lssp = %d, max %d\n",
+ lfs_subsys_pages, LFS_MAX_PAGES));
+ if (lfs_fs_pagetrip && fs->lfs_pages > lfs_fs_pagetrip)
+ DLOG((DLOG_FLUSH, "lfs_check: fssp = %d, trip at %d\n",
+ fs->lfs_pages, lfs_fs_pagetrip));
+ if (lfs_dirvcount > LFS_MAX_DIROP)
+ DLOG((DLOG_FLUSH, "lfs_check: ldvc = %d, max %d\n",
+ lfs_dirvcount, LFS_MAX_DIROP));
+ if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs))
+ DLOG((DLOG_FLUSH, "lfs_check: lfdvc = %d, max %d\n",
+ fs->lfs_dirvcount, LFS_MAX_FSDIROP(fs)));
+ if (fs->lfs_diropwait > 0)
+ DLOG((DLOG_FLUSH, "lfs_check: ldvw = %d\n",
+ fs->lfs_diropwait));
+#endif
+
+ /* If there are too many pending dirops, we have to flush them. */
+ if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
+ lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) {
+ flags |= SEGM_CKP;
+ }
+
+ if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
+ locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES ||
+ lfs_subsys_pages > LFS_MAX_PAGES ||
+ fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
+ lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) {
+ lfs_flush(fs, flags, 0);
+ } else if (lfs_fs_pagetrip && fs->lfs_pages > lfs_fs_pagetrip) {
+ /*
+ * If we didn't flush the whole thing, some filesystems
+ * still might want to be flushed.
+ */
+ ++fs->lfs_pdflush;
+ wakeup(&lfs_writer_daemon);
+ }
+
+ while (locked_queue_count + INOCOUNT(fs) >= LFS_WAIT_BUFS ||
+ locked_queue_bytes + INOBYTES(fs) >= LFS_WAIT_BYTES ||
+ lfs_subsys_pages > LFS_WAIT_PAGES ||
+ fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
+ lfs_dirvcount > LFS_MAX_DIROP) {
+
+ if (lfs_dostats)
+ ++lfs_stats.wait_exceeded;
+ DLOG((DLOG_AVAIL, "lfs_check: waiting: count=%d, bytes=%ld\n",
+ locked_queue_count, locked_queue_bytes));
+ error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
+ hz * LFS_BUFWAIT);
+ if (error != EWOULDBLOCK)
+ break;
+
+ /*
+ * lfs_flush might not flush all the buffers, if some of the
+ * inodes were locked or if most of them were Ifile blocks
+ * and we weren't asked to checkpoint. Try flushing again
+ * to keep us from blocking indefinitely.
+ */
+ if (locked_queue_count + INOCOUNT(fs) >= LFS_MAX_BUFS ||
+ locked_queue_bytes + INOBYTES(fs) >= LFS_MAX_BYTES) {
+ lfs_flush(fs, flags | SEGM_CKP, 0);
+ }
+ }
+ mutex_exit(&lfs_lock);
+ return (error);
+}
+
+/*
+ * Allocate a new buffer header.
+ */
+struct buf *
+lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size, int type)
+{
+ struct buf *bp;
+ size_t nbytes;
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+ nbytes = roundup(size, fsbtob(fs, 1));
+
+ bp = getiobuf(NULL, true);
+ if (nbytes) {
+ bp->b_data = lfs_malloc(fs, nbytes, type);
+ /* memset(bp->b_data, 0, nbytes); */
+ }
+#ifdef DIAGNOSTIC
+ if (vp == NULL)
+ panic("vp is NULL in lfs_newbuf");
+ if (bp == NULL)
+ panic("bp is NULL after malloc in lfs_newbuf");
+#endif
+
+ bp->b_bufsize = size;
+ bp->b_bcount = size;
+ bp->b_lblkno = daddr;
+ bp->b_blkno = daddr;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_iodone = lfs_callback;
+ bp->b_cflags = BC_BUSY | BC_NOCACHE;
+ bp->b_private = fs;
+
+ mutex_enter(&bufcache_lock);
+ mutex_enter(vp->v_interlock);
+ bgetvp(vp, bp);
+ mutex_exit(vp->v_interlock);
+ mutex_exit(&bufcache_lock);
+
+ return (bp);
+}
+
+void
+lfs_freebuf(struct lfs *fs, struct buf *bp)
+{
+ struct vnode *vp;
+
+ if ((vp = bp->b_vp) != NULL) {
+ mutex_enter(&bufcache_lock);
+ mutex_enter(vp->v_interlock);
+ brelvp(bp);
+ mutex_exit(vp->v_interlock);
+ mutex_exit(&bufcache_lock);
+ }
+ if (!(bp->b_cflags & BC_INVAL)) { /* BC_INVAL indicates a "fake" buffer */
+ lfs_free(fs, bp->b_data, LFS_NB_UNKNOWN);
+ bp->b_data = NULL;
+ }
+ putiobuf(bp);
+}
+
+/*
+ * Count buffers on the "locked" queue, and compare it to a pro-forma count.
+ * Don't count malloced buffers, since they don't detract from the total.
+ */
+void
+lfs_countlocked(int *count, long *bytes, const char *msg)
+{
+ struct buf *bp;
+ int n = 0;
+ long int size = 0L;
+
+ mutex_enter(&bufcache_lock);
+ TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED].bq_queue, b_freelist) {
+ KASSERT(bp->b_iodone == NULL);
+ n++;
+ size += bp->b_bufsize;
+#ifdef DIAGNOSTIC
+ if (n > nbuf)
+ panic("lfs_countlocked: this can't happen: more"
+ " buffers locked than exist");
+#endif
+ }
+ /*
+ * Theoretically this function never really does anything.
+ * Give a warning if we have to fix the accounting.
+ */
+ if (n != *count) {
+ DLOG((DLOG_LLIST, "lfs_countlocked: %s: adjusted buf count"
+ " from %d to %d\n", msg, *count, n));
+ }
+ if (size != *bytes) {
+ DLOG((DLOG_LLIST, "lfs_countlocked: %s: adjusted byte count"
+ " from %ld to %ld\n", msg, *bytes, size));
+ }
+ *count = n;
+ *bytes = size;
+ mutex_exit(&bufcache_lock);
+ return;
+}
+
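+/*
+ * Compute the LFS wait/max page thresholds from the current estimate
+ * of pageable memory (active + inactive + free).
+ */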
+int
+lfs_wait_pages(void)
+{
+ int active, inactive;
+
+ uvm_estimatepageable(&active, &inactive);
+ return LFS_WAIT_RESOURCE(active + inactive + uvmexp.free, 1);
+}
+
+int
+lfs_max_pages(void)
+{
+ int active, inactive;
+
+ uvm_estimatepageable(&active, &inactive);
+ return LFS_MAX_RESOURCE(active + inactive + uvmexp.free, 1);
+}
--- /dev/null
+/* $NetBSD: lfs_cksum.c,v 1.27 2008/04/28 20:24:11 martin Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)lfs_cksum.c 8.2 (Berkeley) 10/9/94
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_cksum.c,v 1.27 2008/04/28 20:24:11 martin Exp $");
+
+#include <sys/param.h>
+#ifdef _KERNEL
+# include <sys/systm.h>
+# include <sys/lock.h>
+#else
+# include <stddef.h>
+#endif
+#include <sys/mount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+/*
+ * Simple, general purpose, fast checksum. Data must be short-aligned.
+ * Returns a u_long in case we ever want to do something more rigorous.
+ *
+ * XXX
+ * Use the TCP/IP checksum instead.
+ */
+u_int32_t
+lfs_cksum_part(void *str, size_t len, u_int32_t sum)
+{
+
+ len &= ~(sizeof(u_int16_t) - 1);
+ for (; len; len -= sizeof(u_int16_t)) {
+ sum ^= *(u_int16_t *)str;
+ str = (void *)((u_int16_t *)str + 1);
+ }
+ return (sum);
+}
+
+u_int32_t
+cksum(void *str, size_t len)
+{
+
+ return lfs_cksum_fold(lfs_cksum_part(str, len, 0));
+}
+
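+/* Checksum the on-disk superblock up to, but not including, the checksum field. */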
+u_int32_t
+lfs_sb_cksum(struct dlfs *fs)
+{
+ size_t size;
+
+ size = (size_t)offsetof(struct dlfs, dlfs_cksum);
+ return cksum(fs, size);
+}
--- /dev/null
+/* $NetBSD: lfs_debug.c,v 1.39 2011/07/17 20:54:54 joerg Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)lfs_debug.c 8.1 (Berkeley) 6/11/93
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_debug.c,v 1.39 2011/07/17 20:54:54 joerg Exp $");
+
+#ifdef DEBUG
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/buf.h>
+#include <sys/syslog.h>
+#include <sys/proc.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+int lfs_lognum;
+struct lfs_log_entry lfs_log[LFS_LOGLENGTH];
+
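+/*
+ * Log the write (unless the buffer is already gathered or delayed-write),
+ * then issue the real VOP_BWRITE.
+ */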
+int
+lfs_bwrite_log(struct buf *bp, const char *file, int line)
+{
+ struct vop_bwrite_args a;
+
+ a.a_desc = VDESC(vop_bwrite);
+ a.a_bp = bp;
+
+ if (!(bp->b_flags & B_GATHERED) && !(bp->b_oflags & BO_DELWRI)) {
+ LFS_ENTER_LOG("write", file, line, bp->b_lblkno, bp->b_flags,
+ curproc->p_pid);
+ }
+ return (VCALL(bp->b_vp, VOFFSET(vop_bwrite), &a));
+}
+
+void
+lfs_dumplog(void)
+{
+ int i;
+ const char *cp;
+
+ for (i = lfs_lognum; i != (lfs_lognum - 1) % LFS_LOGLENGTH;
+ i = (i + 1) % LFS_LOGLENGTH)
+ if (lfs_log[i].file) {
+ /* Only print out basename, for readability */
+ cp = lfs_log[i].file;
+ while(*cp)
+ ++cp;
+ while(*cp != '/' && cp > lfs_log[i].file)
+ --cp;
+
+ printf("lbn %" PRId64 " %s %lx %d, %d %s\n",
+ lfs_log[i].block,
+ lfs_log[i].op,
+ lfs_log[i].flags,
+ lfs_log[i].pid,
+ lfs_log[i].line,
+ cp);
+ }
+}
+
+void
+lfs_dump_super(struct lfs *lfsp)
+{
+ int i;
+
+ printf("%s%x\t%s%x\t%s%d\t%s%d\n",
+ "magic ", lfsp->lfs_magic,
+ "version ", lfsp->lfs_version,
+ "size ", lfsp->lfs_size,
+ "ssize ", lfsp->lfs_ssize);
+ printf("%s%d\t%s%d\t%s%d\t%s%d\n",
+ "dsize ", lfsp->lfs_dsize,
+ "bsize ", lfsp->lfs_bsize,
+ "fsize ", lfsp->lfs_fsize,
+ "frag ", lfsp->lfs_frag);
+
+ printf("%s%d\t%s%d\t%s%d\t%s%d\n",
+ "minfree ", lfsp->lfs_minfree,
+ "inopb ", lfsp->lfs_inopb,
+ "ifpb ", lfsp->lfs_ifpb,
+ "nindir ", lfsp->lfs_nindir);
+
+ printf("%s%d\t%s%d\t%s%d\t%s%d\n",
+ "nseg ", lfsp->lfs_nseg,
+ "nspf ", lfsp->lfs_nspf,
+ "cleansz ", lfsp->lfs_cleansz,
+ "segtabsz ", lfsp->lfs_segtabsz);
+
+ printf("%s%x\t%s%d\t%s%lx\t%s%d\n",
+ "segmask ", lfsp->lfs_segmask,
+ "segshift ", lfsp->lfs_segshift,
+ "bmask ", (unsigned long)lfsp->lfs_bmask,
+ "bshift ", lfsp->lfs_bshift);
+
+ printf("%s%lu\t%s%d\t%s%lx\t%s%u\n",
+ "ffmask ", (unsigned long)lfsp->lfs_ffmask,
+ "ffshift ", lfsp->lfs_ffshift,
+ "fbmask ", (unsigned long)lfsp->lfs_fbmask,
+ "fbshift ", lfsp->lfs_fbshift);
+
+ printf("%s%d\t%s%d\t%s%x\t%s%qx\n",
+ "sushift ", lfsp->lfs_sushift,
+ "fsbtodb ", lfsp->lfs_fsbtodb,
+ "cksum ", lfsp->lfs_cksum,
+ "maxfilesize ", (long long)lfsp->lfs_maxfilesize);
+
+ printf("Superblock disk addresses:");
+ for (i = 0; i < LFS_MAXNUMSB; i++)
+ printf(" %x", lfsp->lfs_sboffs[i]);
+ printf("\n");
+
+ printf("Checkpoint Info\n");
+ printf("%s%d\t%s%x\t%s%d\n",
+ "freehd ", lfsp->lfs_freehd,
+ "idaddr ", lfsp->lfs_idaddr,
+ "ifile ", lfsp->lfs_ifile);
+ printf("%s%x\t%s%d\t%s%x\t%s%x\t%s%x\t%s%x\n",
+ "bfree ", lfsp->lfs_bfree,
+ "nfiles ", lfsp->lfs_nfiles,
+ "lastseg ", lfsp->lfs_lastseg,
+ "nextseg ", lfsp->lfs_nextseg,
+ "curseg ", lfsp->lfs_curseg,
+ "offset ", lfsp->lfs_offset);
+ printf("tstamp %llx\n", (long long)lfsp->lfs_tstamp);
+}
+
+void
+lfs_dump_dinode(struct ufs1_dinode *dip)
+{
+ int i;
+
+ printf("%s%u\t%s%d\t%s%u\t%s%u\t%s%qu\t%s%d\n",
+ "mode ", dip->di_mode,
+ "nlink ", dip->di_nlink,
+ "uid ", dip->di_uid,
+ "gid ", dip->di_gid,
+ "size ", (long long)dip->di_size,
+ "blocks ", dip->di_blocks);
+ printf("inum %d\n", dip->di_inumber);
+ printf("Direct Addresses\n");
+ for (i = 0; i < NDADDR; i++) {
+ printf("\t%x", dip->di_db[i]);
+ if ((i % 6) == 5)
+ printf("\n");
+ }
+ for (i = 0; i < NIADDR; i++)
+ printf("\t%x", dip->di_ib[i]);
+ printf("\n");
+}
+
+void
+lfs_check_segsum(struct lfs *fs, struct segment *sp, char *file, int line)
+{
+ int actual;
+#if 0
+ static int offset;
+#endif
+
+ if ((actual = 1) == 1)
+ return; /* XXXX not checking this anymore, really */
+
+ if (sp->sum_bytes_left >= FINFOSIZE
+ && sp->fip->fi_nblocks > 512) {
+ printf("%s:%d: fi_nblocks = %d\n",file,line,sp->fip->fi_nblocks);
+#ifdef DDB
+ Debugger();
+#endif
+ }
+
+ if (sp->sum_bytes_left > 484) {
+ printf("%s:%d: bad value (%d = -%d) for sum_bytes_left\n",
+ file, line, sp->sum_bytes_left, fs->lfs_sumsize-sp->sum_bytes_left);
+ panic("too many bytes");
+ }
+
+ actual = fs->lfs_sumsize
+ /* amount taken up by FINFOs */
+ - ((char *)&(sp->fip->fi_blocks[sp->fip->fi_nblocks]) - (char *)(sp->segsum))
+ /* amount taken up by inode blocks */
+ - sizeof(int32_t)*((sp->ninodes+INOPB(fs)-1) / INOPB(fs));
+#if 0
+ if (actual - sp->sum_bytes_left < offset)
+ {
+ printf("%s:%d: offset changed %d -> %d\n", file, line,
+ offset, actual-sp->sum_bytes_left);
+ offset = actual - sp->sum_bytes_left;
+ /* panic("byte mismatch"); */
+ }
+#endif
+#if 0
+ if (actual != sp->sum_bytes_left)
+ printf("%s:%d: warning: segsum miscalc at %d (-%d => %d)\n",
+ file, line, sp->sum_bytes_left,
+ fs->lfs_sumsize-sp->sum_bytes_left,
+ actual);
+#endif
+ if (sp->sum_bytes_left > 0
+ && ((char *)(sp->segsum))[fs->lfs_sumsize
+ - sizeof(int32_t) * ((sp->ninodes+INOPB(fs)-1) / INOPB(fs))
+ - sp->sum_bytes_left] != '\0') {
+ printf("%s:%d: warning: segsum overwrite at %d (-%d => %d)\n",
+ file, line, sp->sum_bytes_left,
+ fs->lfs_sumsize-sp->sum_bytes_left,
+ actual);
+#ifdef DDB
+ Debugger();
+#endif
+ }
+}
+
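+/*
+ * Sanity-check a partial segment's buffer list: the buffers should lie at
+ * consecutive disk addresses.  Complain if one would be misplaced.
+ */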
+void
+lfs_check_bpp(struct lfs *fs, struct segment *sp, char *file, int line)
+{
+ daddr_t blkno;
+ struct buf **bpp;
+ struct vnode *devvp;
+
+ devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+ blkno = (*(sp->bpp))->b_blkno;
+ for (bpp = sp->bpp; bpp < sp->cbpp; bpp++) {
+ if ((*bpp)->b_blkno != blkno) {
+ if ((*bpp)->b_vp == devvp) {
+ printf("Oops, would misplace raw block "
+ "0x%" PRIx64 " at 0x%" PRIx64 "\n",
+ (*bpp)->b_blkno,
+ blkno);
+ } else {
+ printf("%s:%d: misplace ino %llu lbn %" PRId64
+ " at 0x%" PRIx64 " instead of "
+ "0x%" PRIx64 "\n",
+ file, line,
+ (unsigned long long)
+ VTOI((*bpp)->b_vp)->i_number,
+ (*bpp)->b_lblkno,
+ blkno,
+ (*bpp)->b_blkno);
+ }
+ }
+ blkno += fsbtodb(fs, btofsb(fs, (*bpp)->b_bcount));
+ }
+}
+
+int lfs_debug_log_subsys[DLOG_MAX];
+
+/*
+ * Log events from various debugging areas of LFS, depending on what
+ * the user has enabled.
+ */
+void
+lfs_debug_log(int subsys, const char *fmt, ...)
+{
+ va_list ap;
+
+ /* If not debugging this subsys, exit */
+ if (lfs_debug_log_subsys[subsys] == 0)
+ return;
+
+ va_start(ap, fmt);
+ vlog(LOG_DEBUG, fmt, ap);
+ va_end(ap);
+}
+#endif /* DEBUG */
--- /dev/null
+/* $NetBSD: lfs_inode.c,v 1.126 2011/11/23 19:42:10 bouyer Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)lfs_inode.c 8.9 (Berkeley) 5/8/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_inode.c,v 1.126 2011/11/23 19:42:10 bouyer Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/file.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/kernel.h>
+#include <sys/trace.h>
+#include <sys/resourcevar.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+static int lfs_update_seguse(struct lfs *, struct inode *ip, long, size_t);
+static int lfs_indirtrunc (struct inode *, daddr_t, daddr_t,
+ daddr_t, int, long *, long *, long *, size_t *);
+static int lfs_blkfree (struct lfs *, struct inode *, daddr_t, size_t, long *, size_t *);
+static int lfs_vtruncbuf(struct vnode *, daddr_t, bool, int);
+
+/* Search a block for a specific dinode. */
+struct ufs1_dinode *
+lfs_ifind(struct lfs *fs, ino_t ino, struct buf *bp)
+{
+ struct ufs1_dinode *dip = (struct ufs1_dinode *)bp->b_data;
+ struct ufs1_dinode *ldip, *fin;
+
+ ASSERT_NO_SEGLOCK(fs);
+ /*
+ * Read the inode block backwards, since later versions of the
+	 * inode will supersede earlier ones. Though it is unlikely, it is
+ * possible that the same inode will appear in the same inode block.
+ */
+ fin = dip + INOPB(fs);
+ for (ldip = fin - 1; ldip >= dip; --ldip)
+ if (ldip->di_inumber == ino)
+ return (ldip);
+
+ printf("searched %d entries\n", (int)(fin - dip));
+ printf("offset is 0x%x (seg %d)\n", fs->lfs_offset,
+ dtosn(fs, fs->lfs_offset));
+ printf("block is 0x%llx (seg %lld)\n",
+ (unsigned long long)dbtofsb(fs, bp->b_blkno),
+ (long long)dtosn(fs, dbtofsb(fs, bp->b_blkno)));
+
+ return NULL;
+}
+
+int
+lfs_update(struct vnode *vp, const struct timespec *acc,
+ const struct timespec *mod, int updflags)
+{
+ struct inode *ip;
+ struct lfs *fs = VFSTOUFS(vp->v_mount)->um_lfs;
+ int flags;
+
+ ASSERT_NO_SEGLOCK(fs);
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (0);
+ ip = VTOI(vp);
+
+ /*
+ * If we are called from vinvalbuf, and the file's blocks have
+ * already been scheduled for writing, but the writes have not
+ * yet completed, lfs_vflush will not be called, and vinvalbuf
+ * will cause a panic. So, we must wait until any pending write
+ * for our inode completes, if we are called with UPDATE_WAIT set.
+ */
+ mutex_enter(vp->v_interlock);
+ while ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) == UPDATE_WAIT &&
+ WRITEINPROG(vp)) {
+ DLOG((DLOG_SEG, "lfs_update: sleeping on ino %d"
+ " (in progress)\n", ip->i_number));
+ cv_wait(&vp->v_cv, vp->v_interlock);
+ }
+ mutex_exit(vp->v_interlock);
+ LFS_ITIMES(ip, acc, mod, NULL);
+ if (updflags & UPDATE_CLOSE)
+ flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED | IN_CLEANING);
+ else
+ flags = ip->i_flag & (IN_MODIFIED | IN_CLEANING);
+ if (flags == 0)
+ return (0);
+
+ /* If sync, push back the vnode and any dirty blocks it may have. */
+ if ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) == UPDATE_WAIT) {
+ /* Avoid flushing VU_DIROP. */
+ mutex_enter(&lfs_lock);
+ ++fs->lfs_diropwait;
+ while (vp->v_uflag & VU_DIROP) {
+ DLOG((DLOG_DIROP, "lfs_update: sleeping on inode %d"
+ " (dirops)\n", ip->i_number));
+ DLOG((DLOG_DIROP, "lfs_update: vflags 0x%x, iflags"
+ " 0x%x\n",
+ vp->v_iflag | vp->v_vflag | vp->v_uflag,
+ ip->i_flag));
+ if (fs->lfs_dirops == 0)
+ lfs_flush_fs(fs, SEGM_SYNC);
+ else
+ mtsleep(&fs->lfs_writer, PRIBIO+1, "lfs_fsync",
+ 0, &lfs_lock);
+ /* XXX KS - by falling out here, are we writing the vn
+ twice? */
+ }
+ --fs->lfs_diropwait;
+ mutex_exit(&lfs_lock);
+ return lfs_vflush(vp);
+ }
+ return 0;
+}
+
+#define SINGLE 0 /* index of single indirect block */
+#define DOUBLE 1 /* index of double indirect block */
+#define TRIPLE 2 /* index of triple indirect block */
+/*
+ * Truncate the inode oip to at most length size, freeing the
+ * disk blocks.
+ */
+/* VOP_BWRITE 1 + NIADDR + lfs_balloc == 2 + 2*NIADDR times */
+
+int
+lfs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred)
+{
+ daddr_t lastblock;
+ struct inode *oip = VTOI(ovp);
+ daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR];
+ /* XXX ondisk32 */
+ int32_t newblks[NDADDR + NIADDR];
+ struct lfs *fs;
+ struct buf *bp;
+ int offset, size, level;
+ long count, rcount, blocksreleased = 0, real_released = 0;
+ int i, nblocks;
+ int aflags, error, allerror = 0;
+ off_t osize;
+ long lastseg;
+ size_t bc;
+ int obufsize, odb;
+ int usepc;
+ struct ufsmount *ump = oip->i_ump;
+
+ if (ovp->v_type == VCHR || ovp->v_type == VBLK ||
+ ovp->v_type == VFIFO || ovp->v_type == VSOCK) {
+ KASSERT(oip->i_size == 0);
+ return 0;
+ }
+
+ if (length < 0)
+ return (EINVAL);
+
+ /*
+	 * Size unchanged: just return without updating modification times.
+ */
+ if (oip->i_size == length) {
+ /* still do a uvm_vnp_setsize() as writesize may be larger */
+ uvm_vnp_setsize(ovp, length);
+ return (0);
+ }
+
+ if (ovp->v_type == VLNK &&
+ (oip->i_size < ump->um_maxsymlinklen ||
+ (ump->um_maxsymlinklen == 0 &&
+ oip->i_ffs1_blocks == 0))) {
+#ifdef DIAGNOSTIC
+ if (length != 0)
+ panic("lfs_truncate: partial truncate of symlink");
+#endif
+ memset((char *)SHORTLINK(oip), 0, (u_int)oip->i_size);
+ oip->i_size = oip->i_ffs1_size = 0;
+ oip->i_flag |= IN_CHANGE | IN_UPDATE;
+ return (lfs_update(ovp, NULL, NULL, 0));
+ }
+ if (oip->i_size == length) {
+ oip->i_flag |= IN_CHANGE | IN_UPDATE;
+ return (lfs_update(ovp, NULL, NULL, 0));
+ }
+ fs = oip->i_lfs;
+ lfs_imtime(fs);
+ osize = oip->i_size;
+ usepc = (ovp->v_type == VREG && ovp != fs->lfs_ivnode);
+
+ ASSERT_NO_SEGLOCK(fs);
+ /*
+ * Lengthen the size of the file. We must ensure that the
+ * last byte of the file is allocated. Since the smallest
+ * value of osize is 0, length will be at least 1.
+ */
+ if (osize < length) {
+ if (length > ump->um_maxfilesize)
+ return (EFBIG);
+ aflags = B_CLRBUF;
+ if (ioflag & IO_SYNC)
+ aflags |= B_SYNC;
+ if (usepc) {
+ if (lblkno(fs, osize) < NDADDR &&
+ lblkno(fs, osize) != lblkno(fs, length) &&
+ blkroundup(fs, osize) != osize) {
+ off_t eob;
+
+ eob = blkroundup(fs, osize);
+ uvm_vnp_setwritesize(ovp, eob);
+ error = ufs_balloc_range(ovp, osize,
+ eob - osize, cred, aflags);
+ if (error) {
+ (void) lfs_truncate(ovp, osize,
+ ioflag & IO_SYNC, cred);
+ return error;
+ }
+ if (ioflag & IO_SYNC) {
+ mutex_enter(ovp->v_interlock);
+ VOP_PUTPAGES(ovp,
+ trunc_page(osize & fs->lfs_bmask),
+ round_page(eob),
+ PGO_CLEANIT | PGO_SYNCIO);
+ }
+ }
+ uvm_vnp_setwritesize(ovp, length);
+ error = ufs_balloc_range(ovp, length - 1, 1, cred,
+ aflags);
+ if (error) {
+ (void) lfs_truncate(ovp, osize,
+ ioflag & IO_SYNC, cred);
+ return error;
+ }
+ uvm_vnp_setsize(ovp, length);
+ oip->i_flag |= IN_CHANGE | IN_UPDATE;
+ KASSERT(ovp->v_size == oip->i_size);
+ oip->i_lfs_hiblk = lblkno(fs, oip->i_size + fs->lfs_bsize - 1) - 1;
+ return (lfs_update(ovp, NULL, NULL, 0));
+ } else {
+ error = lfs_reserve(fs, ovp, NULL,
+ btofsb(fs, (NIADDR + 2) << fs->lfs_bshift));
+ if (error)
+ return (error);
+ error = lfs_balloc(ovp, length - 1, 1, cred,
+ aflags, &bp);
+ lfs_reserve(fs, ovp, NULL,
+ -btofsb(fs, (NIADDR + 2) << fs->lfs_bshift));
+ if (error)
+ return (error);
+ oip->i_ffs1_size = oip->i_size = length;
+ uvm_vnp_setsize(ovp, length);
+ (void) VOP_BWRITE(bp->b_vp, bp);
+ oip->i_flag |= IN_CHANGE | IN_UPDATE;
+ oip->i_lfs_hiblk = lblkno(fs, oip->i_size + fs->lfs_bsize - 1) - 1;
+ return (lfs_update(ovp, NULL, NULL, 0));
+ }
+ }
+
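+	/*
+	 * Reserve room for the up to (2 * NIADDR + 3) metadata blocks that
+	 * shrinking the file may dirty; the reservation is released again
+	 * on every exit path below.
+	 */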
+ if ((error = lfs_reserve(fs, ovp, NULL,
+ btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift))) != 0)
+ return (error);
+
+ /*
+ * Shorten the size of the file. If the file is not being
+ * truncated to a block boundary, the contents of the
+ * partial block following the end of the file must be
+ * zero'ed in case it ever becomes accessible again because
+ * of subsequent file growth. Directories however are not
+ * zero'ed as they should grow back initialized to empty.
+ */
+ offset = blkoff(fs, length);
+ lastseg = -1;
+ bc = 0;
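+	/* lastseg and bc accumulate per-segment byte counts for lfs_blkfree(). */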
+
+ if (ovp != fs->lfs_ivnode)
+ lfs_seglock(fs, SEGM_PROT);
+ if (offset == 0) {
+ oip->i_size = oip->i_ffs1_size = length;
+ } else if (!usepc) {
+ lbn = lblkno(fs, length);
+ aflags = B_CLRBUF;
+ if (ioflag & IO_SYNC)
+ aflags |= B_SYNC;
+ error = lfs_balloc(ovp, length - 1, 1, cred, aflags, &bp);
+ if (error) {
+ lfs_reserve(fs, ovp, NULL,
+ -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
+ goto errout;
+ }
+ obufsize = bp->b_bufsize;
+ odb = btofsb(fs, bp->b_bcount);
+ oip->i_size = oip->i_ffs1_size = length;
+ size = blksize(fs, oip, lbn);
+ if (ovp->v_type != VDIR)
+ memset((char *)bp->b_data + offset, 0,
+ (u_int)(size - offset));
+ allocbuf(bp, size, 1);
+ if ((bp->b_flags & B_LOCKED) != 0 && bp->b_iodone == NULL) {
+ mutex_enter(&lfs_lock);
+ locked_queue_bytes -= obufsize - bp->b_bufsize;
+ mutex_exit(&lfs_lock);
+ }
+ if (bp->b_oflags & BO_DELWRI)
+ fs->lfs_avail += odb - btofsb(fs, size);
+ (void) VOP_BWRITE(bp->b_vp, bp);
+ } else { /* vp->v_type == VREG && length < osize && offset != 0 */
+ /*
+ * When truncating a regular file down to a non-block-aligned
+ * size, we must zero the part of last block which is past
+ * the new EOF. We must synchronously flush the zeroed pages
+ * to disk since the new pages will be invalidated as soon
+ * as we inform the VM system of the new, smaller size.
+ * We must do this before acquiring the GLOCK, since fetching
+ * the pages will acquire the GLOCK internally.
+ * So there is a window where another thread could see a whole
+ * zeroed page past EOF, but that's life.
+ */
+ daddr_t xlbn;
+ voff_t eoz;
+
+ aflags = ioflag & IO_SYNC ? B_SYNC : 0;
+ error = ufs_balloc_range(ovp, length - 1, 1, cred, aflags);
+ if (error) {
+ lfs_reserve(fs, ovp, NULL,
+ -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
+ goto errout;
+ }
+ xlbn = lblkno(fs, length);
+ size = blksize(fs, oip, xlbn);
+ eoz = MIN(lblktosize(fs, xlbn) + size, osize);
+ ubc_zerorange(&ovp->v_uobj, length, eoz - length,
+ UBC_UNMAP_FLAG(ovp));
+ if (round_page(eoz) > round_page(length)) {
+ mutex_enter(ovp->v_interlock);
+ error = VOP_PUTPAGES(ovp, round_page(length),
+ round_page(eoz),
+ PGO_CLEANIT | PGO_DEACTIVATE |
+ ((ioflag & IO_SYNC) ? PGO_SYNCIO : 0));
+ if (error) {
+ lfs_reserve(fs, ovp, NULL,
+ -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
+ goto errout;
+ }
+ }
+ }
+
+ genfs_node_wrlock(ovp);
+
+ oip->i_size = oip->i_ffs1_size = length;
+ uvm_vnp_setsize(ovp, length);
+
+ /*
+ * Calculate index into inode's block list of
+ * last direct and indirect blocks (if any)
+ * which we want to keep. Lastblock is -1 when
+ * the file is truncated to 0.
+ */
+ /* Avoid sign overflow - XXX assumes that off_t is a quad_t. */
+ if (length > QUAD_MAX - fs->lfs_bsize)
+ lastblock = lblkno(fs, QUAD_MAX - fs->lfs_bsize);
+ else
+ lastblock = lblkno(fs, length + fs->lfs_bsize - 1) - 1;
+ lastiblock[SINGLE] = lastblock - NDADDR;
+ lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
+ lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
+ nblocks = btofsb(fs, fs->lfs_bsize);
+ /*
+ * Record changed file and block pointers before we start
+ * freeing blocks. lastiblock values are also normalized to -1
+ * for calls to lfs_indirtrunc below.
+ */
+ memcpy((void *)newblks, (void *)&oip->i_ffs1_db[0], sizeof newblks);
+ for (level = TRIPLE; level >= SINGLE; level--)
+ if (lastiblock[level] < 0) {
+ newblks[NDADDR+level] = 0;
+ lastiblock[level] = -1;
+ }
+ for (i = NDADDR - 1; i > lastblock; i--)
+ newblks[i] = 0;
+
+ oip->i_size = oip->i_ffs1_size = osize;
+ error = lfs_vtruncbuf(ovp, lastblock + 1, false, 0);
+ if (error && !allerror)
+ allerror = error;
+
+ /*
+ * Indirect blocks first.
+ */
+ indir_lbn[SINGLE] = -NDADDR;
+ indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1;
+ indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1;
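+	/*
+	 * indir_lbn[] holds the (negative) logical block numbers by which
+	 * the indirect blocks themselves are addressed.
+	 */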
+ for (level = TRIPLE; level >= SINGLE; level--) {
+ bn = oip->i_ffs1_ib[level];
+ if (bn != 0) {
+ error = lfs_indirtrunc(oip, indir_lbn[level],
+ bn, lastiblock[level],
+ level, &count, &rcount,
+ &lastseg, &bc);
+ if (error)
+ allerror = error;
+ real_released += rcount;
+ blocksreleased += count;
+ if (lastiblock[level] < 0) {
+ if (oip->i_ffs1_ib[level] > 0)
+ real_released += nblocks;
+ blocksreleased += nblocks;
+ oip->i_ffs1_ib[level] = 0;
+ lfs_blkfree(fs, oip, bn, fs->lfs_bsize,
+ &lastseg, &bc);
+ lfs_deregister_block(ovp, bn);
+ }
+ }
+ if (lastiblock[level] >= 0)
+ goto done;
+ }
+
+ /*
+ * All whole direct blocks or frags.
+ */
+ for (i = NDADDR - 1; i > lastblock; i--) {
+ long bsize, obsize;
+
+ bn = oip->i_ffs1_db[i];
+ if (bn == 0)
+ continue;
+ bsize = blksize(fs, oip, i);
+ if (oip->i_ffs1_db[i] > 0) {
+ /* Check for fragment size changes */
+ obsize = oip->i_lfs_fragsize[i];
+ real_released += btofsb(fs, obsize);
+ oip->i_lfs_fragsize[i] = 0;
+ } else
+ obsize = 0;
+ blocksreleased += btofsb(fs, bsize);
+ oip->i_ffs1_db[i] = 0;
+ lfs_blkfree(fs, oip, bn, obsize, &lastseg, &bc);
+ lfs_deregister_block(ovp, bn);
+ }
+ if (lastblock < 0)
+ goto done;
+
+ /*
+ * Finally, look for a change in size of the
+ * last direct block; release any frags.
+ */
+ bn = oip->i_ffs1_db[lastblock];
+ if (bn != 0) {
+ long oldspace, newspace;
+#if 0
+ long olddspace;
+#endif
+
+ /*
+ * Calculate amount of space we're giving
+ * back as old block size minus new block size.
+ */
+ oldspace = blksize(fs, oip, lastblock);
+#if 0
+ olddspace = oip->i_lfs_fragsize[lastblock];
+#endif
+
+ oip->i_size = oip->i_ffs1_size = length;
+ newspace = blksize(fs, oip, lastblock);
+ if (newspace == 0)
+ panic("itrunc: newspace");
+ if (oldspace - newspace > 0) {
+ blocksreleased += btofsb(fs, oldspace - newspace);
+ }
+#if 0
+ if (bn > 0 && olddspace - newspace > 0) {
+ /* No segment accounting here, just vnode */
+ real_released += btofsb(fs, olddspace - newspace);
+ }
+#endif
+ }
+
+done:
+ /* Finish segment accounting corrections */
+ lfs_update_seguse(fs, oip, lastseg, bc);
+#ifdef DIAGNOSTIC
+ for (level = SINGLE; level <= TRIPLE; level++)
+ if ((newblks[NDADDR + level] == 0) !=
+ ((oip->i_ffs1_ib[level]) == 0)) {
+ panic("lfs itrunc1");
+ }
+ for (i = 0; i < NDADDR; i++)
+ if ((newblks[i] == 0) != (oip->i_ffs1_db[i] == 0)) {
+ panic("lfs itrunc2");
+ }
+ if (length == 0 &&
+ (!LIST_EMPTY(&ovp->v_cleanblkhd) || !LIST_EMPTY(&ovp->v_dirtyblkhd)))
+ panic("lfs itrunc3");
+#endif /* DIAGNOSTIC */
+ /*
+ * Put back the real size.
+ */
+ oip->i_size = oip->i_ffs1_size = length;
+ oip->i_lfs_effnblks -= blocksreleased;
+ oip->i_ffs1_blocks -= real_released;
+ mutex_enter(&lfs_lock);
+ fs->lfs_bfree += blocksreleased;
+ mutex_exit(&lfs_lock);
+#ifdef DIAGNOSTIC
+ if (oip->i_size == 0 &&
+ (oip->i_ffs1_blocks != 0 || oip->i_lfs_effnblks != 0)) {
+ printf("lfs_truncate: truncate to 0 but %d blks/%d effblks\n",
+ oip->i_ffs1_blocks, oip->i_lfs_effnblks);
+ panic("lfs_truncate: persistent blocks");
+ }
+#endif
+
+ /*
+ * If we truncated to zero, take us off the paging queue.
+ */
+ mutex_enter(&lfs_lock);
+ if (oip->i_size == 0 && oip->i_flags & IN_PAGING) {
+ oip->i_flags &= ~IN_PAGING;
+ TAILQ_REMOVE(&fs->lfs_pchainhd, oip, i_lfs_pchain);
+ }
+ mutex_exit(&lfs_lock);
+
+ oip->i_flag |= IN_CHANGE;
+#ifdef QUOTA
+ (void) chkdq(oip, -blocksreleased, NOCRED, 0);
+#endif
+ lfs_reserve(fs, ovp, NULL,
+ -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
+ genfs_node_unlock(ovp);
+ errout:
+ oip->i_lfs_hiblk = lblkno(fs, oip->i_size + fs->lfs_bsize - 1) - 1;
+ if (ovp != fs->lfs_ivnode)
+ lfs_segunlock(fs);
+ return (allerror ? allerror : error);
+}
+
+/* Update segment and avail usage information when removing a block. */
+static int
+lfs_blkfree(struct lfs *fs, struct inode *ip, daddr_t daddr,
+ size_t bsize, long *lastseg, size_t *num)
+{
+ long seg;
+ int error = 0;
+
+ ASSERT_SEGLOCK(fs);
+ bsize = fragroundup(fs, bsize);
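+	/*
+	 * Batch the accounting: when this block lies in a different segment
+	 * from the previous one, flush the running byte count via
+	 * lfs_update_seguse() and start a new count for the new segment.
+	 */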
+ if (daddr > 0) {
+ if (*lastseg != (seg = dtosn(fs, daddr))) {
+ error = lfs_update_seguse(fs, ip, *lastseg, *num);
+ *num = bsize;
+ *lastseg = seg;
+ } else
+ *num += bsize;
+ }
+
+ return error;
+}
+
+/* Accumulate freed-byte counts per segment on the inode's segdelta list. */
+static int
+lfs_update_seguse(struct lfs *fs, struct inode *ip, long lastseg, size_t num)
+{
+ struct segdelta *sd;
+ struct vnode *vp;
+
+ ASSERT_SEGLOCK(fs);
+ if (lastseg < 0 || num == 0)
+ return 0;
+
+ vp = ITOV(ip);
+ LIST_FOREACH(sd, &ip->i_lfs_segdhd, list)
+ if (sd->segnum == lastseg)
+ break;
+ if (sd == NULL) {
+ sd = malloc(sizeof(*sd), M_SEGMENT, M_WAITOK);
+ sd->segnum = lastseg;
+ sd->num = 0;
+ LIST_INSERT_HEAD(&ip->i_lfs_segdhd, sd, list);
+ }
+ sd->num += num;
+
+ return 0;
+}
+
+static void
+lfs_finalize_seguse(struct lfs *fs, void *v)
+{
+ SEGUSE *sup;
+ struct buf *bp;
+ struct segdelta *sd;
+ LIST_HEAD(, segdelta) *hd = v;
+
+ ASSERT_SEGLOCK(fs);
+ while((sd = LIST_FIRST(hd)) != NULL) {
+ LIST_REMOVE(sd, list);
+ LFS_SEGENTRY(sup, fs, sd->segnum, bp);
+ if (sd->num > sup->su_nbytes) {
+ printf("lfs_finalize_seguse: segment %ld short by %ld\n",
+ sd->segnum, (long)(sd->num - sup->su_nbytes));
+ panic("lfs_finalize_seguse: negative bytes");
+ sup->su_nbytes = sd->num;
+ }
+ sup->su_nbytes -= sd->num;
+ LFS_WRITESEGENTRY(sup, fs, sd->segnum, bp);
+ free(sd, M_SEGMENT);
+ }
+}
+
+/* Finish the accounting updates for a segment. */
+void
+lfs_finalize_ino_seguse(struct lfs *fs, struct inode *ip)
+{
+ ASSERT_SEGLOCK(fs);
+ lfs_finalize_seguse(fs, &ip->i_lfs_segdhd);
+}
+
+/* Finish the accounting updates for a segment. */
+void
+lfs_finalize_fs_seguse(struct lfs *fs)
+{
+ ASSERT_SEGLOCK(fs);
+ lfs_finalize_seguse(fs, &fs->lfs_segdhd);
+}
+
+/*
+ * Release blocks associated with the inode ip and stored in the indirect
+ * block bn. Blocks are free'd in LIFO order up to (but not including)
+ * lastbn. If level is greater than SINGLE, the block is an indirect block
+ * and recursive calls to indirtrunc must be used to cleanse other indirect
+ * blocks.
+ *
+ * NB: triple indirect blocks are untested.
+ */
+static int
+lfs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn,
+ daddr_t lastbn, int level, long *countp,
+ long *rcountp, long *lastsegp, size_t *bcp)
+{
+ int i;
+ struct buf *bp;
+ struct lfs *fs = ip->i_lfs;
+ int32_t *bap; /* XXX ondisk32 */
+ struct vnode *vp;
+ daddr_t nb, nlbn, last;
+ int32_t *copy = NULL; /* XXX ondisk32 */
+ long blkcount, rblkcount, factor;
+ int nblocks, blocksreleased = 0, real_released = 0;
+ int error = 0, allerror = 0;
+
+ ASSERT_SEGLOCK(fs);
+ /*
+ * Calculate index in current block of last
+ * block to be kept. -1 indicates the entire
+ * block so we need not calculate the index.
+ */
+ factor = 1;
+ for (i = SINGLE; i < level; i++)
+ factor *= NINDIR(fs);
+ last = lastbn;
+ if (lastbn > 0)
+ last /= factor;
+ nblocks = btofsb(fs, fs->lfs_bsize);
+ /*
+ * Get buffer of block pointers, zero those entries corresponding
+	 * to blocks to be free'd, and update the on-disk copy first. Since
+	 * double (triple) indirect blocks are freed before single (double)
+	 * indirect blocks, calls to bmap on these blocks will fail. However,
+	 * we already have the on-disk address, so we have to set the b_blkno
+	 * field explicitly instead of letting bread do everything for us.
+ */
+ vp = ITOV(ip);
+ bp = getblk(vp, lbn, (int)fs->lfs_bsize, 0, 0);
+ if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
+ /* Braces must be here in case trace evaluates to nothing. */
+ trace(TR_BREADHIT, pack(vp, fs->lfs_bsize), lbn);
+ } else {
+ trace(TR_BREADMISS, pack(vp, fs->lfs_bsize), lbn);
+ curlwp->l_ru.ru_inblock++; /* pay for read */
+ bp->b_flags |= B_READ;
+ if (bp->b_bcount > bp->b_bufsize)
+ panic("lfs_indirtrunc: bad buffer size");
+ bp->b_blkno = fsbtodb(fs, dbn);
+ VOP_STRATEGY(vp, bp);
+ error = biowait(bp);
+ }
+ if (error) {
+ brelse(bp, 0);
+ *countp = *rcountp = 0;
+ return (error);
+ }
+
+ bap = (int32_t *)bp->b_data; /* XXX ondisk32 */
+ if (lastbn >= 0) {
+ copy = (int32_t *)lfs_malloc(fs, fs->lfs_bsize, LFS_NB_IBLOCK);
+ memcpy((void *)copy, (void *)bap, (u_int)fs->lfs_bsize);
+ memset((void *)&bap[last + 1], 0,
+ /* XXX ondisk32 */
+ (u_int)(NINDIR(fs) - (last + 1)) * sizeof (int32_t));
+ error = VOP_BWRITE(bp->b_vp, bp);
+ if (error)
+ allerror = error;
+ bap = copy;
+ }
+
+ /*
+ * Recursively free totally unused blocks.
+ */
+ for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
+ i--, nlbn += factor) {
+ nb = bap[i];
+ if (nb == 0)
+ continue;
+ if (level > SINGLE) {
+ error = lfs_indirtrunc(ip, nlbn, nb,
+ (daddr_t)-1, level - 1,
+ &blkcount, &rblkcount,
+ lastsegp, bcp);
+ if (error)
+ allerror = error;
+ blocksreleased += blkcount;
+ real_released += rblkcount;
+ }
+ lfs_blkfree(fs, ip, nb, fs->lfs_bsize, lastsegp, bcp);
+ if (bap[i] > 0)
+ real_released += nblocks;
+ blocksreleased += nblocks;
+ }
+
+ /*
+ * Recursively free last partial block.
+ */
+ if (level > SINGLE && lastbn >= 0) {
+ last = lastbn % factor;
+ nb = bap[i];
+ if (nb != 0) {
+ error = lfs_indirtrunc(ip, nlbn, nb,
+ last, level - 1, &blkcount,
+ &rblkcount, lastsegp, bcp);
+ if (error)
+ allerror = error;
+ real_released += rblkcount;
+ blocksreleased += blkcount;
+ }
+ }
+
+ if (copy != NULL) {
+ lfs_free(fs, copy, LFS_NB_IBLOCK);
+ } else {
+ mutex_enter(&bufcache_lock);
+ if (bp->b_oflags & BO_DELWRI) {
+ LFS_UNLOCK_BUF(bp);
+ fs->lfs_avail += btofsb(fs, bp->b_bcount);
+ wakeup(&fs->lfs_avail);
+ }
+ brelsel(bp, BC_INVAL);
+ mutex_exit(&bufcache_lock);
+ }
+
+ *countp = blocksreleased;
+ *rcountp = real_released;
+ return (allerror);
+}
+
+/*
+ * Destroy any in core blocks past the truncation length.
+ * Inlined from vtruncbuf, so that lfs_avail could be updated.
+ * We take the seglock to prevent cleaning from occurring while we are
+ * invalidating blocks.
+ */
+static int
+lfs_vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo)
+{
+ struct buf *bp, *nbp;
+ int error;
+ struct lfs *fs;
+ voff_t off;
+
+ off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
+ if (error)
+ return error;
+
+ fs = VTOI(vp)->i_lfs;
+
+ ASSERT_SEGLOCK(fs);
+
+ mutex_enter(&bufcache_lock);
+restart:
+ for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
+ nbp = LIST_NEXT(bp, b_vnbufs);
+ if (bp->b_lblkno < lbn)
+ continue;
+ error = bbusy(bp, catch, slptimeo, NULL);
+ if (error == EPASSTHROUGH)
+ goto restart;
+ if (error != 0) {
+ mutex_exit(&bufcache_lock);
+ return (error);
+ }
+ mutex_enter(bp->b_objlock);
+ if (bp->b_oflags & BO_DELWRI) {
+ bp->b_oflags &= ~BO_DELWRI;
+ fs->lfs_avail += btofsb(fs, bp->b_bcount);
+ wakeup(&fs->lfs_avail);
+ }
+ mutex_exit(bp->b_objlock);
+ LFS_UNLOCK_BUF(bp);
+ brelsel(bp, BC_INVAL | BC_VFLUSH);
+ }
+
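+	/* Now do the same for the dirty buffer list. */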
+ for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+ nbp = LIST_NEXT(bp, b_vnbufs);
+ if (bp->b_lblkno < lbn)
+ continue;
+ error = bbusy(bp, catch, slptimeo, NULL);
+ if (error == EPASSTHROUGH)
+ goto restart;
+ if (error != 0) {
+ mutex_exit(&bufcache_lock);
+ return (error);
+ }
+ mutex_enter(bp->b_objlock);
+ if (bp->b_oflags & BO_DELWRI) {
+ bp->b_oflags &= ~BO_DELWRI;
+ fs->lfs_avail += btofsb(fs, bp->b_bcount);
+ wakeup(&fs->lfs_avail);
+ }
+ mutex_exit(bp->b_objlock);
+ LFS_UNLOCK_BUF(bp);
+ brelsel(bp, BC_INVAL | BC_VFLUSH);
+ }
+ mutex_exit(&bufcache_lock);
+
+ return (0);
+}
+
--- /dev/null
+/* $NetBSD: lfs_itimes.c,v 1.12 2008/04/28 20:24:11 martin Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_itimes.c,v 1.12 2008/04/28 20:24:11 martin Exp $");
+
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/mount.h>
+#include <sys/buf.h>
+
+#include <ufs/ufs/inode.h>
+
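+/*
+ * Outside the kernel, map vnode/buf/panic onto their userland counterparts
+ * so this file can also be built as part of the userland LFS tools.
+ */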
+#ifndef _KERNEL
+#include "bufcache.h"
+#include "vnode.h"
+#include "lfs_user.h"
+#define vnode uvnode
+#define buf ubuf
+#define panic call_panic
+#else
+#include <ufs/lfs/lfs_extern.h>
+#include <sys/kauth.h>
+#endif
+
+#include <ufs/lfs/lfs.h>
+
+void
+lfs_itimes(struct inode *ip, const struct timespec *acc,
+ const struct timespec *mod, const struct timespec *cre)
+{
+#ifdef _KERNEL
+ struct timespec now;
+
+ KASSERT(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY));
+
+ vfs_timestamp(&now);
+#endif
+
+ if (ip->i_flag & IN_ACCESS) {
+#ifdef _KERNEL
+ if (acc == NULL)
+ acc = &now;
+#endif
+ ip->i_ffs1_atime = acc->tv_sec;
+ ip->i_ffs1_atimensec = acc->tv_nsec;
+ if (ip->i_lfs->lfs_version > 1) {
+ struct lfs *fs = ip->i_lfs;
+ struct buf *ibp;
+ IFILE *ifp;
+
+ LFS_IENTRY(ifp, ip->i_lfs, ip->i_number, ibp);
+ ifp->if_atime_sec = acc->tv_sec;
+ ifp->if_atime_nsec = acc->tv_nsec;
+ LFS_BWRITE_LOG(ibp);
+ mutex_enter(&lfs_lock);
+ fs->lfs_flags |= LFS_IFDIRTY;
+ mutex_exit(&lfs_lock);
+ } else {
+ mutex_enter(&lfs_lock);
+ LFS_SET_UINO(ip, IN_ACCESSED);
+ mutex_exit(&lfs_lock);
+ }
+ }
+ if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFY)) {
+ if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) {
+#ifdef _KERNEL
+ if (mod == NULL)
+ mod = &now;
+#endif
+ ip->i_ffs1_mtime = mod->tv_sec;
+ ip->i_ffs1_mtimensec = mod->tv_nsec;
+ ip->i_modrev++;
+ }
+ if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) {
+#ifdef _KERNEL
+ if (cre == NULL)
+ cre = &now;
+#endif
+ ip->i_ffs1_ctime = cre->tv_sec;
+ ip->i_ffs1_ctimensec = cre->tv_nsec;
+ }
+ mutex_enter(&lfs_lock);
+ if (ip->i_flag & (IN_CHANGE | IN_UPDATE))
+ LFS_SET_UINO(ip, IN_MODIFIED);
+ if (ip->i_flag & IN_MODIFY)
+ LFS_SET_UINO(ip, IN_ACCESSED);
+ mutex_exit(&lfs_lock);
+ }
+ ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY);
+}
--- /dev/null
+/* $NetBSD: lfs_rfw.c,v 1.12 2009/02/22 20:28:07 ad Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.12 2009/02/22 20:28:07 ad Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kthread.h>
+#include <sys/buf.h>
+#include <sys/device.h>
+#include <sys/mbuf.h>
+#include <sys/file.h>
+#include <sys/disklabel.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+#include <uvm/uvm_extern.h>
+#include <sys/sysctl.h>
+#include <sys/conf.h>
+#include <sys/kauth.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_stat.h>
+#include <uvm/uvm_pager.h>
+#include <uvm/uvm_pdaemon.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/genfs/genfs_node.h>
+
+/*
+ * Roll-forward code.
+ */
+static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t,
+ kauth_cred_t, int, int *, struct lwp *);
+
+extern int lfs_do_rfw;
+
+/*
+ * Allocate a particular inode with a particular version number, freeing
+ * any previous versions of this inode that may have gone before.
+ * Used by the roll-forward code.
+ *
+ * XXX this function does not have appropriate locking to be used on a live fs;
+ * XXX but something similar could probably be used for an "undelete" call.
+ *
+ * Called with the Ifile inode locked.
+ */
+int
+lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l,
+ struct vnode **vpp)
+{
+ IFILE *ifp;
+ struct buf *bp, *cbp;
+ struct vnode *vp;
+ struct inode *ip;
+ ino_t tino, oldnext;
+ int error;
+ CLEANERINFO *cip;
+
+ ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */
+
+ /*
+ * First, just try a vget. If the version number is the one we want,
+ * we don't have to do anything else. If the version number is wrong,
+ * take appropriate action.
+ */
+ error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp);
+ if (error == 0) {
+ DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", ino, vp));
+
+ *vpp = vp;
+ ip = VTOI(vp);
+ if (ip->i_gen == vers)
+ return 0;
+ else if (ip->i_gen < vers) {
+ lfs_truncate(vp, (off_t)0, 0, NOCRED);
+ ip->i_gen = ip->i_ffs1_gen = vers;
+ LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
+ return 0;
+ } else {
+ DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n",
+ ino, vers, ip->i_ffs1_gen));
+ vput(vp);
+ *vpp = NULLVP;
+ return EEXIST;
+ }
+ }
+
+ /*
+ * The inode is not in use. Find it on the free list.
+ */
+ /* If the Ifile is too short to contain this inum, extend it */
+ while (VTOI(fs->lfs_ivnode)->i_size <= (ino /
+ fs->lfs_ifpb + fs->lfs_cleansz + fs->lfs_segtabsz)
+ << fs->lfs_bshift) {
+ lfs_extend_ifile(fs, NOCRED);
+ }
+
+ LFS_IENTRY(ifp, fs, ino, bp);
+ oldnext = ifp->if_nextfree;
+ ifp->if_version = vers;
+ brelse(bp, 0);
+
+ LFS_GET_HEADFREE(fs, cip, cbp, &ino);
+ if (ino) {
+ LFS_PUT_HEADFREE(fs, cip, cbp, oldnext);
+ } else {
+ tino = ino;
+ while (1) {
+ LFS_IENTRY(ifp, fs, tino, bp);
+ if (ifp->if_nextfree == ino ||
+ ifp->if_nextfree == LFS_UNUSED_INUM)
+ break;
+ tino = ifp->if_nextfree;
+ brelse(bp, 0);
+ }
+ if (ifp->if_nextfree == LFS_UNUSED_INUM) {
+ brelse(bp, 0);
+ return ENOENT;
+ }
+ ifp->if_nextfree = oldnext;
+ LFS_BWRITE_LOG(bp);
+ }
+
+ error = lfs_ialloc(fs, fs->lfs_ivnode, ino, vers, &vp);
+ if (error == 0) {
+ /*
+ * Make it VREG so we can put blocks on it. We will change
+ * this later if it turns out to be some other kind of file.
+ */
+ ip = VTOI(vp);
+ ip->i_mode = ip->i_ffs1_mode = IFREG;
+ ip->i_nlink = ip->i_ffs1_nlink = 1;
+ ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, &vp);
+ ip = VTOI(vp);
+
+ DLOG((DLOG_RF, "lfs_rf_valloc: ino %d vp %p\n", ino, vp));
+
+ /* The dirop-nature of this vnode is past */
+ lfs_unmark_vnode(vp);
+ (void)lfs_vunref(vp);
+ vp->v_uflag &= ~VU_DIROP;
+ mutex_enter(&lfs_lock);
+ --lfs_dirvcount;
+ --fs->lfs_dirvcount;
+ TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
+ wakeup(&lfs_dirvcount);
+ wakeup(&fs->lfs_dirvcount);
+ mutex_exit(&lfs_lock);
+ }
+ *vpp = vp;
+ return error;
+}
+
+/*
+ * Load the appropriate indirect block, and change the appropriate pointer.
+ * Mark the block dirty. Do segment and avail accounting.
+ */
+static int
+update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn,
+ daddr_t ndaddr, size_t size, struct lwp *l)
+{
+ int error;
+ struct vnode *vp;
+ struct inode *ip;
+#ifdef DEBUG
+ daddr_t odaddr;
+ struct indir a[NIADDR];
+ int num;
+ int i;
+#endif /* DEBUG */
+ struct buf *bp;
+ SEGUSE *sup;
+
+ KASSERT(lbn >= 0); /* no indirect blocks */
+
+ if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp)) != 0) {
+ DLOG((DLOG_RF, "update_meta: ino %d: lfs_rf_valloc"
+ " returned %d\n", ino, error));
+ return error;
+ }
+
+ if ((error = lfs_balloc(vp, (lbn << fs->lfs_bshift), size,
+ NOCRED, 0, &bp)) != 0) {
+ vput(vp);
+ return (error);
+ }
+ /* No need to write, the block is already on disk */
+ if (bp->b_oflags & BO_DELWRI) {
+ LFS_UNLOCK_BUF(bp);
+ fs->lfs_avail += btofsb(fs, bp->b_bcount);
+ }
+ brelse(bp, BC_INVAL);
+
+ /*
+ * Extend the file, if it is not large enough already.
+ * XXX this is not exactly right, we don't know how much of the
+ * XXX last block is actually used. We hope that an inode will
+ * XXX appear later to give the correct size.
+ */
+ ip = VTOI(vp);
+ if (ip->i_size <= (lbn << fs->lfs_bshift)) {
+ u_int64_t newsize;
+
+ if (lbn < NDADDR)
+ newsize = ip->i_ffs1_size = (lbn << fs->lfs_bshift) +
+ (size - fs->lfs_fsize) + 1;
+ else
+ newsize = ip->i_ffs1_size = (lbn << fs->lfs_bshift) + 1;
+
+ if (ip->i_size < newsize) {
+ ip->i_size = newsize;
+ /*
+ * tell vm our new size for the case the inode won't
+ * appear later.
+ */
+ uvm_vnp_setsize(vp, newsize);
+ }
+ }
+
+ lfs_update_single(fs, NULL, vp, lbn, ndaddr, size);
+
+ LFS_SEGENTRY(sup, fs, dtosn(fs, ndaddr), bp);
+ sup->su_nbytes += size;
+ LFS_WRITESEGENTRY(sup, fs, dtosn(fs, ndaddr), bp);
+
+ /* differences here should be due to UNWRITTEN indirect blocks. */
+ KASSERT((lblkno(fs, ip->i_size) > NDADDR &&
+ ip->i_lfs_effnblks == ip->i_ffs1_blocks) ||
+ ip->i_lfs_effnblks >= ip->i_ffs1_blocks);
+
+#ifdef DEBUG
+ /* Now look again to make sure it worked */
+ ufs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
+ for (i = num; i > 0; i--) {
+ if (!a[i].in_exists)
+ panic("update_meta: absent %d lv indirect block", i);
+ }
+ if (dbtofsb(fs, odaddr) != ndaddr)
+ DLOG((DLOG_RF, "update_meta: failed setting ino %d lbn %"
+ PRId64 " to %" PRId64 "\n", ino, lbn, ndaddr));
+#endif /* DEBUG */
+ vput(vp);
+ return 0;
+}
+
+static int
+update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred,
+ struct lwp *l)
+{
+ struct vnode *devvp, *vp;
+ struct inode *ip;
+ struct ufs1_dinode *dip;
+ struct buf *dbp, *ibp;
+ int error;
+ daddr_t daddr;
+ IFILE *ifp;
+ SEGUSE *sup;
+
+ devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+
+ /*
+ * Get the inode, update times and perms.
+ * DO NOT update disk blocks, we do that separately.
+ */
+ error = bread(devvp, fsbtodb(fs, offset), fs->lfs_ibsize,
+ cred, 0, &dbp);
+ if (error) {
+ DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
+ return error;
+ }
+ dip = ((struct ufs1_dinode *)(dbp->b_data)) + INOPB(fs);
+ while (--dip >= (struct ufs1_dinode *)dbp->b_data) {
+ if (dip->di_inumber > LFS_IFILE_INUM) {
+ error = lfs_rf_valloc(fs, dip->di_inumber, dip->di_gen,
+ l, &vp);
+ if (error) {
+ DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc"
+ " returned %d\n", error));
+ continue;
+ }
+ ip = VTOI(vp);
+ if (dip->di_size != ip->i_size)
+ lfs_truncate(vp, dip->di_size, 0, NOCRED);
+ /* Get mode, link count, size, and times */
+ memcpy(ip->i_din.ffs1_din, dip,
+ offsetof(struct ufs1_dinode, di_db[0]));
+
+ /* Then the rest, except di_blocks */
+ ip->i_flags = ip->i_ffs1_flags = dip->di_flags;
+ ip->i_gen = ip->i_ffs1_gen = dip->di_gen;
+ ip->i_uid = ip->i_ffs1_uid = dip->di_uid;
+ ip->i_gid = ip->i_ffs1_gid = dip->di_gid;
+
+ ip->i_mode = ip->i_ffs1_mode;
+ ip->i_nlink = ip->i_ffs1_nlink;
+ ip->i_size = ip->i_ffs1_size;
+
+ LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
+
+ /* Re-initialize to get type right */
+ ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p,
+ &vp);
+ vput(vp);
+
+ /* Record change in location */
+ LFS_IENTRY(ifp, fs, dip->di_inumber, ibp);
+ daddr = ifp->if_daddr;
+ ifp->if_daddr = dbtofsb(fs, dbp->b_blkno);
+ error = LFS_BWRITE_LOG(ibp); /* Ifile */
+ /* And do segment accounting */
+ if (dtosn(fs, daddr) != dtosn(fs, dbtofsb(fs, dbp->b_blkno))) {
+ if (daddr > 0) {
+ LFS_SEGENTRY(sup, fs, dtosn(fs, daddr),
+ ibp);
+ sup->su_nbytes -= sizeof (struct ufs1_dinode);
+ LFS_WRITESEGENTRY(sup, fs,
+ dtosn(fs, daddr),
+ ibp);
+ }
+ LFS_SEGENTRY(sup, fs, dtosn(fs, dbtofsb(fs, dbp->b_blkno)),
+ ibp);
+ sup->su_nbytes += sizeof (struct ufs1_dinode);
+ LFS_WRITESEGENTRY(sup, fs,
+ dtosn(fs, dbtofsb(fs, dbp->b_blkno)),
+ ibp);
+ }
+ }
+ }
+ brelse(dbp, BC_AGE);
+
+ return 0;
+}
+
+#define CHECK_CKSUM 0x0001 /* Check the checksum to make sure it's valid */
+#define CHECK_UPDATE 0x0002 /* Update Ifile for new data blocks / inodes */
+
+static daddr_t
+check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial,
+ kauth_cred_t cred, int flags, int *pseg_flags, struct lwp *l)
+{
+ struct vnode *devvp;
+ struct buf *bp, *dbp;
+ int error, nblocks = 0, ninos, i, j; /* XXX: gcc */
+ SEGSUM *ssp;
+ u_long *dp = NULL, *datap = NULL; /* XXX u_int32_t */
+ daddr_t oldoffset;
+ int32_t *iaddr; /* XXX ondisk32 */
+ FINFO *fip;
+ SEGUSE *sup;
+ size_t size;
+
+ devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+ /*
+ * If the segment has a superblock and we're at the top
+ * of the segment, skip the superblock.
+ */
+ if (sntod(fs, dtosn(fs, offset)) == offset) {
+ LFS_SEGENTRY(sup, fs, dtosn(fs, offset), bp);
+ if (sup->su_flags & SEGUSE_SUPERBLOCK)
+ offset += btofsb(fs, LFS_SBPAD);
+ brelse(bp, 0);
+ }
+
+ /* Read in the segment summary */
+ error = bread(devvp, fsbtodb(fs, offset), fs->lfs_sumsize,
+ cred, 0, &bp);
+ if (error)
+ return -1;
+
+ /* Check summary checksum */
+ ssp = (SEGSUM *)bp->b_data;
+ if (flags & CHECK_CKSUM) {
+ if (ssp->ss_sumsum != cksum(&ssp->ss_datasum,
+ fs->lfs_sumsize -
+ sizeof(ssp->ss_sumsum))) {
+ DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", offset));
+ offset = -1;
+ goto err1;
+ }
+ if (ssp->ss_nfinfo == 0 && ssp->ss_ninos == 0) {
+ DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", offset));
+ offset = -1;
+ goto err1;
+ }
+ if (ssp->ss_create < fs->lfs_tstamp) {
+ DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset));
+ offset = -1;
+ goto err1;
+ }
+ }
+ if (fs->lfs_version > 1) {
+ if (ssp->ss_serial != nextserial) {
+ DLOG((DLOG_RF, "Unexpected serial number at 0x%" PRIx64
+ "\n", offset));
+ offset = -1;
+ goto err1;
+ }
+ if (ssp->ss_ident != fs->lfs_ident) {
+ DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%"
+ PRIx64 "\n", ssp->ss_ident, fs->lfs_ident, offset));
+ offset = -1;
+ goto err1;
+ }
+ }
+ if (pseg_flags)
+ *pseg_flags = ssp->ss_flags;
+ oldoffset = offset;
+ offset += btofsb(fs, fs->lfs_sumsize);
+
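+	/* Number of inode blocks described by this segment summary. */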
+ ninos = howmany(ssp->ss_ninos, INOPB(fs));
+ /* XXX ondisk32 */
+ iaddr = (int32_t *)((char*)bp->b_data + fs->lfs_sumsize - sizeof(int32_t));
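+	/*
+	 * Inode-block addresses live at the tail of the summary block and
+	 * are consumed walking backwards (see --iaddr below).
+	 */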
+ if (flags & CHECK_CKSUM) {
+ /* Count blocks */
+ nblocks = 0;
+ fip = (FINFO *)((char*)bp->b_data + SEGSUM_SIZE(fs));
+ for (i = 0; i < ssp->ss_nfinfo; ++i) {
+ nblocks += fip->fi_nblocks;
+ if (fip->fi_nblocks <= 0)
+ break;
+ /* XXX ondisk32 */
+ fip = (FINFO *)(((char *)fip) + FINFOSIZE +
+ (fip->fi_nblocks * sizeof(int32_t)));
+ }
+ nblocks += ninos;
+ /* Create the sum array */
+ datap = dp = (u_long *)malloc(nblocks * sizeof(u_long),
+ M_SEGMENT, M_WAITOK);
+ }
+
+ /* Handle individual blocks */
+ fip = (FINFO *)((char*)bp->b_data + SEGSUM_SIZE(fs));
+ for (i = 0; i < ssp->ss_nfinfo || ninos; ++i) {
+ /* Inode block? */
+ if (ninos && *iaddr == offset) {
+ if (flags & CHECK_CKSUM) {
+ /* Read in the head and add to the buffer */
+ error = bread(devvp, fsbtodb(fs, offset), fs->lfs_bsize,
+ cred, 0, &dbp);
+ if (error) {
+ offset = -1;
+ goto err2;
+ }
+ (*dp++) = ((u_long *)(dbp->b_data))[0];
+ brelse(dbp, BC_AGE);
+ }
+ if (flags & CHECK_UPDATE) {
+ if ((error = update_inoblk(fs, offset, cred, l))
+ != 0) {
+ offset = -1;
+ goto err2;
+ }
+ }
+ offset += btofsb(fs, fs->lfs_ibsize);
+ --iaddr;
+ --ninos;
+ --i; /* compensate */
+ continue;
+ }
+ size = fs->lfs_bsize;
+ for (j = 0; j < fip->fi_nblocks; ++j) {
+ if (j == fip->fi_nblocks - 1)
+ size = fip->fi_lastlength;
+ if (flags & CHECK_CKSUM) {
+ error = bread(devvp, fsbtodb(fs, offset), size,
+ cred, 0, &dbp);
+ if (error) {
+ offset = -1;
+ goto err2;
+ }
+ (*dp++) = ((u_long *)(dbp->b_data))[0];
+ brelse(dbp, BC_AGE);
+ }
+ /* Account for and update any direct blocks */
+ if ((flags & CHECK_UPDATE) &&
+ fip->fi_ino > LFS_IFILE_INUM &&
+ fip->fi_blocks[j] >= 0) {
+ update_meta(fs, fip->fi_ino, fip->fi_version,
+ fip->fi_blocks[j], offset, size, l);
+ }
+ offset += btofsb(fs, size);
+ }
+ /* XXX ondisk32 */
+ fip = (FINFO *)(((char *)fip) + FINFOSIZE
+ + fip->fi_nblocks * sizeof(int32_t));
+ }
+ /* Checksum the array, compare */
+ if ((flags & CHECK_CKSUM) &&
+ ssp->ss_datasum != cksum(datap, nblocks * sizeof(u_long)))
+ {
+ DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64
+ " (wanted %x got %x)\n",
+ offset, ssp->ss_datasum, cksum(datap, nblocks *
+ sizeof(u_long))));
+ offset = -1;
+ goto err2;
+ }
+
+ /* If we're at the end of the segment, move to the next */
+ if (dtosn(fs, offset + btofsb(fs, fs->lfs_sumsize + fs->lfs_bsize)) !=
+ dtosn(fs, offset)) {
+ if (dtosn(fs, offset) == dtosn(fs, ssp->ss_next)) {
+ offset = -1;
+ goto err2;
+ }
+ offset = ssp->ss_next;
+ DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64
+ " -> segment %d\n", offset, dtosn(fs,offset)));
+ }
+
+ if (flags & CHECK_UPDATE) {
+ fs->lfs_avail -= (offset - oldoffset);
+ /* Don't clog the buffer queue */
+ mutex_enter(&lfs_lock);
+ if (locked_queue_count > LFS_MAX_BUFS ||
+ locked_queue_bytes > LFS_MAX_BYTES) {
+ lfs_flush(fs, SEGM_CKP, 0);
+ }
+ mutex_exit(&lfs_lock);
+ }
+
+ err2:
+ if (flags & CHECK_CKSUM)
+ free(datap, M_SEGMENT);
+ err1:
+ brelse(bp, BC_AGE);
+
+ /* XXX should we update the serial number even for bad psegs? */
+ if ((flags & CHECK_UPDATE) && offset > 0 && fs->lfs_version > 1)
+ fs->lfs_serial = nextserial;
+ return offset;
+}
+
+void
+lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l)
+{
+ int flags, dirty;
+ daddr_t offset, oldoffset, lastgoodpseg;
+ int sn, curseg, do_rollforward;
+ struct proc *p;
+ kauth_cred_t cred;
+ SEGUSE *sup;
+ struct buf *bp;
+
+ p = l ? l->l_proc : NULL;
+ cred = p ? p->p_cred : NOCRED;
+
+ /*
+ * Roll forward.
+ *
+ * We don't roll forward for v1 filesystems, because
+ * of the danger that the clock was turned back between the last
+ * checkpoint and crash. This would roll forward garbage.
+ *
+ * v2 filesystems don't have this problem because they use a
+ * monotonically increasing serial number instead of a timestamp.
+ */
+ do_rollforward = (!(fs->lfs_pflags & LFS_PF_CLEAN) &&
+ lfs_do_rfw && fs->lfs_version > 1 && p != NULL);
+ if (do_rollforward) {
+ u_int64_t nextserial;
+ /*
+ * Phase I: Find the address of the last good partial
+ * segment that was written after the checkpoint. Mark
+ * the segments in question dirty, so they won't be
+ * reallocated.
+ */
+ lastgoodpseg = oldoffset = offset = fs->lfs_offset;
+ flags = 0x0;
+ DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%"
+ PRIx64 "\n", offset));
+ LFS_SEGENTRY(sup, fs, dtosn(fs, offset), bp);
+ if (!(sup->su_flags & SEGUSE_DIRTY))
+ --fs->lfs_nclean;
+ sup->su_flags |= SEGUSE_DIRTY;
+ LFS_WRITESEGENTRY(sup, fs, dtosn(fs, offset), bp);
+ nextserial = fs->lfs_serial + 1;
+ while ((offset = check_segsum(fs, offset, nextserial,
+ cred, CHECK_CKSUM, &flags, l)) > 0) {
+ nextserial++;
+ if (sntod(fs, oldoffset) != sntod(fs, offset)) {
+ LFS_SEGENTRY(sup, fs, dtosn(fs, oldoffset),
+ bp);
+ if (!(sup->su_flags & SEGUSE_DIRTY))
+ --fs->lfs_nclean;
+ sup->su_flags |= SEGUSE_DIRTY;
+ LFS_WRITESEGENTRY(sup, fs, dtosn(fs, oldoffset),
+ bp);
+ }
+
+ DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%"
+ PRIx64 "\n", offset));
+ if (flags & SS_DIROP) {
+ DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%"
+ PRIx64 "\n", oldoffset));
+ if (!(flags & SS_CONT)) {
+ DLOG((DLOG_RF, "lfs_mountfs: dirops end "
+ "at 0x%" PRIx64 "\n", oldoffset));
+ }
+ }
+ if (!(flags & SS_CONT))
+ lastgoodpseg = offset;
+ oldoffset = offset;
+ }
+ if (flags & SS_CONT) {
+ DLOG((DLOG_RF, "LFS roll forward: warning: incomplete "
+ "dirops discarded\n"));
+ }
+ DLOG((DLOG_RF, "LFS roll forward phase 1: completed: "
+ "lastgoodpseg=0x%" PRIx64 "\n", lastgoodpseg));
+ oldoffset = fs->lfs_offset;
+ if (fs->lfs_offset != lastgoodpseg) {
+ /* Don't overwrite what we're trying to preserve */
+ offset = fs->lfs_offset;
+ fs->lfs_offset = lastgoodpseg;
+ fs->lfs_curseg = sntod(fs, dtosn(fs, fs->lfs_offset));
+ for (sn = curseg = dtosn(fs, fs->lfs_curseg);;) {
+ sn = (sn + 1) % fs->lfs_nseg;
+ if (sn == curseg)
+ panic("lfs_mountfs: no clean segments");
+ LFS_SEGENTRY(sup, fs, sn, bp);
+ dirty = (sup->su_flags & SEGUSE_DIRTY);
+ brelse(bp, 0);
+ if (!dirty)
+ break;
+ }
+ fs->lfs_nextseg = sntod(fs, sn);
+
+ /*
+ * Phase II: Roll forward from the first superblock.
+ */
+ while (offset != lastgoodpseg) {
+ DLOG((DLOG_RF, "LFS roll forward phase 2: 0x%"
+ PRIx64 "\n", offset));
+ offset = check_segsum(fs, offset,
+ fs->lfs_serial + 1, cred, CHECK_UPDATE,
+ NULL, l);
+ }
+
+ /*
+ * Finish: flush our changes to disk.
+ */
+ lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
+			DLOG((DLOG_RF, "lfs_mountfs: roll forward "
+			       "recovered %lld blocks\n",
+ (long long)(lastgoodpseg - oldoffset)));
+ }
+ DLOG((DLOG_RF, "LFS roll forward complete\n"));
+ }
+}
--- /dev/null
+/* $NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)lfs_segment.c 8.10 (Berkeley) 6/10/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $");
+
+#ifdef DEBUG
+# define vndebug(vp, str) do { \
+ if (VTOI(vp)->i_flag & IN_CLEANING) \
+ DLOG((DLOG_WVNODE, "not writing ino %d because %s (op %d)\n", \
+ VTOI(vp)->i_number, (str), op)); \
+} while(0)
+#else
+# define vndebug(vp, str)
+#endif
+#define ivndebug(vp, str) \
+ DLOG((DLOG_WVNODE, "ino %d: %s\n", VTOI(vp)->i_number, (str)))
+
+#if defined(_KERNEL_OPT)
+#include "opt_ddb.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/kernel.h>
+#include <sys/resourcevar.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kauth.h>
+#include <sys/syslog.h>
+
+#include <miscfs/specfs/specdev.h>
+#include <miscfs/fifofs/fifo.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_extern.h>
+
+MALLOC_JUSTDEFINE(M_SEGMENT, "LFS segment", "Segment for LFS");
+
+static void lfs_generic_callback(struct buf *, void (*)(struct buf *));
+static void lfs_free_aiodone(struct buf *);
+static void lfs_super_aiodone(struct buf *);
+static void lfs_cluster_aiodone(struct buf *);
+static void lfs_cluster_callback(struct buf *);
+
+/*
+ * Determine if it's OK to start a partial in this segment, or if we need
+ * to go on to a new segment.
+ */
+#define LFS_PARTIAL_FITS(fs) \
+ ((fs)->lfs_fsbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \
+ (fs)->lfs_frag)
+
+/*
+ * Figure out whether we should do a checkpoint write or go ahead with
+ * an ordinary write.
+ */
+#define LFS_SHOULD_CHECKPOINT(fs, flags) \
+ ((flags & SEGM_CLEAN) == 0 && \
+ ((fs->lfs_nactive > LFS_MAX_ACTIVE || \
+ (flags & SEGM_CKP) || \
+ fs->lfs_nclean < LFS_MAX_ACTIVE)))
+
+int lfs_match_fake(struct lfs *, struct buf *);
+void lfs_newseg(struct lfs *);
+/* XXX ondisk32 */
+void lfs_shellsort(struct buf **, int32_t *, int, int);
+void lfs_supercallback(struct buf *);
+void lfs_updatemeta(struct segment *);
+void lfs_writesuper(struct lfs *, daddr_t);
+int lfs_writevnodes(struct lfs *fs, struct mount *mp,
+ struct segment *sp, int dirops);
+
+int lfs_allclean_wakeup; /* Cleaner wakeup address. */
+int lfs_writeindir = 1; /* whether to flush indir on non-ckp */
+int lfs_clean_vnhead = 0; /* Allow freeing to head of vn list */
+int lfs_dirvcount = 0; /* # active dirops */
+
+/* Statistics Counters */
+int lfs_dostats = 1;
+struct lfs_stats lfs_stats;
+
+/* op values to lfs_writevnodes */
+#define VN_REG 0
+#define VN_DIROP 1
+#define VN_EMPTY 2
+#define VN_CLEAN 3
+
+/*
+ * XXX KS - Set modification time on the Ifile, so the cleaner can
+ * read the fs mod time off of it. We don't set IN_UPDATE here,
+ * since we don't really need this to be flushed to disk (and in any
+ * case that wouldn't happen to the Ifile until we checkpoint).
+ */
+void
+lfs_imtime(struct lfs *fs)
+{
+ struct timespec ts;
+ struct inode *ip;
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+ vfs_timestamp(&ts);
+ ip = VTOI(fs->lfs_ivnode);
+ ip->i_ffs1_mtime = ts.tv_sec;
+ ip->i_ffs1_mtimensec = ts.tv_nsec;
+}
+
+/*
+ * Ifile and meta data blocks are not marked busy, so segment writes MUST be
+ * single threaded. Currently, there are two paths into lfs_segwrite, sync()
+ * and getnewbuf(). They both mark the file system busy. Lfs_vflush()
+ * explicitly marks the file system busy. So lfs_segwrite is safe. I think.
+ */
+
+#define IS_FLUSHING(fs,vp) ((fs)->lfs_flushvp == (vp))
+
+int
+lfs_vflush(struct vnode *vp)
+{
+ struct inode *ip;
+ struct lfs *fs;
+ struct segment *sp;
+ struct buf *bp, *nbp, *tbp, *tnbp;
+ int error;
+ int flushed;
+ int relock;
+ int loopcount;
+
+ ip = VTOI(vp);
+ fs = VFSTOUFS(vp->v_mount)->um_lfs;
+ relock = 0;
+
+ top:
+ ASSERT_NO_SEGLOCK(fs);
+ if (ip->i_flag & IN_CLEANING) {
+ ivndebug(vp,"vflush/in_cleaning");
+ mutex_enter(&lfs_lock);
+ LFS_CLR_UINO(ip, IN_CLEANING);
+ LFS_SET_UINO(ip, IN_MODIFIED);
+ mutex_exit(&lfs_lock);
+
+ /*
+ * Toss any cleaning buffers that have real counterparts
+ * to avoid losing new data.
+ */
+ mutex_enter(vp->v_interlock);
+ for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+ nbp = LIST_NEXT(bp, b_vnbufs);
+ if (!LFS_IS_MALLOC_BUF(bp))
+ continue;
+ /*
+ * Look for pages matching the range covered
+ * by cleaning blocks. It's okay if more dirty
+ * pages appear, so long as none disappear out
+ * from under us.
+ */
+ if (bp->b_lblkno > 0 && vp->v_type == VREG &&
+ vp != fs->lfs_ivnode) {
+ struct vm_page *pg;
+ voff_t off;
+
+ for (off = lblktosize(fs, bp->b_lblkno);
+ off < lblktosize(fs, bp->b_lblkno + 1);
+ off += PAGE_SIZE) {
+ pg = uvm_pagelookup(&vp->v_uobj, off);
+ if (pg == NULL)
+ continue;
+ if ((pg->flags & PG_CLEAN) == 0 ||
+ pmap_is_modified(pg)) {
+ fs->lfs_avail += btofsb(fs,
+ bp->b_bcount);
+ wakeup(&fs->lfs_avail);
+ mutex_exit(vp->v_interlock);
+ lfs_freebuf(fs, bp);
+ mutex_enter(vp->v_interlock);
+ bp = NULL;
+ break;
+ }
+ }
+ }
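+ /*
+ * If any other buffer on the dirty list covers the same
+ * logical block, this cleaning buffer is redundant: credit
+ * lfs_avail and free it.
+ */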
+ for (tbp = LIST_FIRST(&vp->v_dirtyblkhd); tbp;
+ tbp = tnbp)
+ {
+ tnbp = LIST_NEXT(tbp, b_vnbufs);
+ if (tbp->b_vp == bp->b_vp
+ && tbp->b_lblkno == bp->b_lblkno
+ && tbp != bp)
+ {
+ fs->lfs_avail += btofsb(fs,
+ bp->b_bcount);
+ wakeup(&fs->lfs_avail);
+ mutex_exit(vp->v_interlock);
+ lfs_freebuf(fs, bp);
+ mutex_enter(vp->v_interlock);
+ bp = NULL;
+ break;
+ }
+ }
+ }
+ } else {
+ mutex_enter(vp->v_interlock);
+ }
+
+ /* If the node is being written, wait until that is done */
+ while (WRITEINPROG(vp)) {
+ ivndebug(vp,"vflush/writeinprog");
+ cv_wait(&vp->v_cv, vp->v_interlock);
+ }
+ mutex_exit(vp->v_interlock);
+
+ /* Protect against VI_XLOCK deadlock in vinvalbuf() */
+ lfs_seglock(fs, SEGM_SYNC);
+
+ /* If we're supposed to flush a freed inode, just toss it */
+ if (ip->i_lfs_iflags & LFSI_DELETED) {
+ DLOG((DLOG_VNODE, "lfs_vflush: ino %d freed, not flushing\n",
+ ip->i_number));
+ /* Drain v_numoutput */
+ mutex_enter(vp->v_interlock);
+ while (vp->v_numoutput > 0) {
+ cv_wait(&vp->v_cv, vp->v_interlock);
+ }
+ KASSERT(vp->v_numoutput == 0);
+ mutex_exit(vp->v_interlock);
+
+ mutex_enter(&bufcache_lock);
+ for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+ nbp = LIST_NEXT(bp, b_vnbufs);
+
+ KASSERT((bp->b_flags & B_GATHERED) == 0);
+ if (bp->b_oflags & BO_DELWRI) { /* XXX always true? */
+ fs->lfs_avail += btofsb(fs, bp->b_bcount);
+ wakeup(&fs->lfs_avail);
+ }
+ /* Copied from lfs_writeseg */
+ if (bp->b_iodone != NULL) {
+ mutex_exit(&bufcache_lock);
+ biodone(bp);
+ mutex_enter(&bufcache_lock);
+ } else {
+ bremfree(bp);
+ LFS_UNLOCK_BUF(bp);
+ mutex_enter(vp->v_interlock);
+ bp->b_flags &= ~(B_READ | B_GATHERED);
+ bp->b_oflags = (bp->b_oflags & ~BO_DELWRI) | BO_DONE;
+ bp->b_error = 0;
+ reassignbuf(bp, vp);
+ mutex_exit(vp->v_interlock);
+ brelse(bp, 0);
+ }
+ }
+ mutex_exit(&bufcache_lock);
+ LFS_CLR_UINO(ip, IN_CLEANING);
+ LFS_CLR_UINO(ip, IN_MODIFIED | IN_ACCESSED);
+ ip->i_flag &= ~IN_ALLMOD;
+ DLOG((DLOG_VNODE, "lfs_vflush: done not flushing ino %d\n",
+ ip->i_number));
+ lfs_segunlock(fs);
+
+ KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
+
+ return 0;
+ }
+
+ fs->lfs_flushvp = vp;
+ if (LFS_SHOULD_CHECKPOINT(fs, fs->lfs_sp->seg_flags)) {
+ error = lfs_segwrite(vp->v_mount, SEGM_CKP | SEGM_SYNC);
+ fs->lfs_flushvp = NULL;
+ KASSERT(fs->lfs_flushvp_fakevref == 0);
+ lfs_segunlock(fs);
+
+ /* Make sure that any pending buffers get written */
+ mutex_enter(vp->v_interlock);
+ while (vp->v_numoutput > 0) {
+ cv_wait(&vp->v_cv, vp->v_interlock);
+ }
+ KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
+ KASSERT(vp->v_numoutput == 0);
+ mutex_exit(vp->v_interlock);
+
+ return error;
+ }
+ sp = fs->lfs_sp;
+
+ flushed = 0;
+ if (VPISEMPTY(vp)) {
+ lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY);
+ ++flushed;
+ } else if ((ip->i_flag & IN_CLEANING) &&
+ (fs->lfs_sp->seg_flags & SEGM_CLEAN)) {
+ ivndebug(vp,"vflush/clean");
+ lfs_writevnodes(fs, vp->v_mount, sp, VN_CLEAN);
+ ++flushed;
+ } else if (lfs_dostats) {
+ if (!VPISEMPTY(vp) || (VTOI(vp)->i_flag & IN_ALLMOD))
+ ++lfs_stats.vflush_invoked;
+ ivndebug(vp,"vflush");
+ }
+
+#ifdef DIAGNOSTIC
+ if (vp->v_uflag & VU_DIROP) {
+ DLOG((DLOG_VNODE, "lfs_vflush: flushing VU_DIROP\n"));
+ /* panic("lfs_vflush: VU_DIROP being flushed...this can\'t happen"); */
+ }
+#endif
+
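+ /*
+ * Write the file's blocks and inode until lfs_writeinode no
+ * longer asks for the Ifile to be redone; if this vnode is the
+ * Ifile itself, repeat the whole cycle as long as lfs_writeseg
+ * reports that another pass is needed.
+ */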
+ do {
+ loopcount = 0;
+ do {
+ if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
+ relock = lfs_writefile(fs, sp, vp);
+ if (relock) {
+ /*
+ * Might have to wait for the
+ * cleaner to run; but we're
+ * still not done with this vnode.
+ */
+ KDASSERT(ip->i_number != LFS_IFILE_INUM);
+ lfs_writeinode(fs, sp, ip);
+ mutex_enter(&lfs_lock);
+ LFS_SET_UINO(ip, IN_MODIFIED);
+ mutex_exit(&lfs_lock);
+ lfs_writeseg(fs, sp);
+ lfs_segunlock(fs);
+ lfs_segunlock_relock(fs);
+ goto top;
+ }
+ }
+ /*
+ * If we begin a new segment in the middle of writing
+ * the Ifile, it creates an inconsistent checkpoint,
+ * since the Ifile information for the new segment
+ * is not up-to-date. Take care of this here by
+ * sending the Ifile through again in case there
+ * are newly dirtied blocks. But wait, there's more!
+ * This second Ifile write could *also* cross a segment
+ * boundary, if the first one was large. The second
+ * one is guaranteed to be no more than 8 blocks,
+ * though (two segment blocks and supporting indirects)
+ * so the third write *will not* cross the boundary.
+ */
+ if (vp == fs->lfs_ivnode) {
+ lfs_writefile(fs, sp, vp);
+ lfs_writefile(fs, sp, vp);
+ }
+#ifdef DEBUG
+ if (++loopcount > 2)
+ log(LOG_NOTICE, "lfs_vflush: looping count=%d\n", loopcount);
+#endif
+ } while (lfs_writeinode(fs, sp, ip));
+ } while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);
+
+ if (lfs_dostats) {
+ ++lfs_stats.nwrites;
+ if (sp->seg_flags & SEGM_SYNC)
+ ++lfs_stats.nsync_writes;
+ if (sp->seg_flags & SEGM_CKP)
+ ++lfs_stats.ncheckpoints;
+ }
+ /*
+ * If we were called from somewhere that has already held the seglock
+ * (e.g., lfs_markv()), the lfs_segunlock will not wait for
+ * the write to complete because we are still locked.
+ * Since lfs_vflush() must return the vnode with no dirty buffers,
+ * we must explicitly wait, if that is the case.
+ *
+ * We compare the iocount against 1, not 0, because it is
+ * artificially incremented by lfs_seglock().
+ */
+ mutex_enter(&lfs_lock);
+ if (fs->lfs_seglock > 1) {
+ while (fs->lfs_iocount > 1)
+ (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
+ "lfs_vflush", 0, &lfs_lock);
+ }
+ mutex_exit(&lfs_lock);
+
+ lfs_segunlock(fs);
+
+ /* Wait for these buffers to be recovered by aiodoned */
+ mutex_enter(vp->v_interlock);
+ while (vp->v_numoutput > 0) {
+ cv_wait(&vp->v_cv, vp->v_interlock);
+ }
+ KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
+ KASSERT(vp->v_numoutput == 0);
+ mutex_exit(vp->v_interlock);
+
+ fs->lfs_flushvp = NULL;
+ KASSERT(fs->lfs_flushvp_fakevref == 0);
+
+ return (0);
+}
+
+int
+lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op)
+{
+ struct inode *ip;
+ struct vnode *vp;
+ int inodes_written = 0, only_cleaning;
+ int error = 0;
+
+ ASSERT_SEGLOCK(fs);
+ loop:
+ /* start at last (newest) vnode. */
+ mutex_enter(&mntvnode_lock);
+ TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) {
+ /*
+ * If the vnode that we are about to sync is no longer
+ * associated with this mount point, start over.
+ */
+ if (vp->v_mount != mp) {
+ DLOG((DLOG_VNODE, "lfs_writevnodes: starting over\n"));
+ /*
+ * After this, pages might be busy
+ * due to our own previous putpages.
+ * Start actual segment write here to avoid deadlock.
+ */
+ mutex_exit(&mntvnode_lock);
+ (void)lfs_writeseg(fs, sp);
+ goto loop;
+ }
+
+ mutex_enter(vp->v_interlock);
+ if (vp->v_type == VNON || vismarker(vp) ||
+ (vp->v_iflag & VI_CLEAN) != 0) {
+ mutex_exit(vp->v_interlock);
+ continue;
+ }
+
+ ip = VTOI(vp);
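+ /*
+ * Match the vnode against the requested pass: a VN_DIROP pass
+ * writes only dirop vnodes, while passes other than VN_DIROP
+ * and VN_CLEAN skip dirop vnodes entirely.
+ */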
+ if ((op == VN_DIROP && !(vp->v_uflag & VU_DIROP)) ||
+ (op != VN_DIROP && op != VN_CLEAN &&
+ (vp->v_uflag & VU_DIROP))) {
+ mutex_exit(vp->v_interlock);
+ vndebug(vp,"dirop");
+ continue;
+ }
+
+ if (op == VN_EMPTY && !VPISEMPTY(vp)) {
+ mutex_exit(vp->v_interlock);
+ vndebug(vp,"empty");
+ continue;
+ }
+
+ if (op == VN_CLEAN && ip->i_number != LFS_IFILE_INUM
+ && vp != fs->lfs_flushvp
+ && !(ip->i_flag & IN_CLEANING)) {
+ mutex_exit(vp->v_interlock);
+ vndebug(vp,"cleaning");
+ continue;
+ }
+
+ mutex_exit(&mntvnode_lock);
+ if (lfs_vref(vp)) {
+ vndebug(vp,"vref");
+ mutex_enter(&mntvnode_lock);
+ continue;
+ }
+
+ only_cleaning = 0;
+ /*
+ * Write the inode/file if dirty and it's not the IFILE.
+ */
+ if ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp)) {
+ only_cleaning =
+ ((ip->i_flag & IN_ALLMOD) == IN_CLEANING);
+
+ if (ip->i_number != LFS_IFILE_INUM) {
+ error = lfs_writefile(fs, sp, vp);
+ if (error) {
+ lfs_vunref(vp);
+ if (error == EAGAIN) {
+ /*
+ * This error from lfs_putpages
+ * indicates we need to drop
+ * the segment lock and start
+ * over after the cleaner has
+ * had a chance to run.
+ */
+ lfs_writeinode(fs, sp, ip);
+ lfs_writeseg(fs, sp);
+ if (!VPISEMPTY(vp) &&
+ !WRITEINPROG(vp) &&
+ !(ip->i_flag & IN_ALLMOD)) {
+ mutex_enter(&lfs_lock);
+ LFS_SET_UINO(ip, IN_MODIFIED);
+ mutex_exit(&lfs_lock);
+ }
+ mutex_enter(&mntvnode_lock);
+ break;
+ }
+ error = 0; /* XXX not quite right */
+ mutex_enter(&mntvnode_lock);
+ continue;
+ }
+
+ if (!VPISEMPTY(vp)) {
+ if (WRITEINPROG(vp)) {
+ ivndebug(vp,"writevnodes/write2");
+ } else if (!(ip->i_flag & IN_ALLMOD)) {
+ mutex_enter(&lfs_lock);
+ LFS_SET_UINO(ip, IN_MODIFIED);
+ mutex_exit(&lfs_lock);
+ }
+ }
+ (void) lfs_writeinode(fs, sp, ip);
+ inodes_written++;
+ }
+ }
+
+ if (lfs_clean_vnhead && only_cleaning)
+ lfs_vunref_head(vp);
+ else
+ lfs_vunref(vp);
+
+ mutex_enter(&mntvnode_lock);
+ }
+ mutex_exit(&mntvnode_lock);
+ return error;
+}
+
+/*
+ * Do a checkpoint.
+ */
+int
+lfs_segwrite(struct mount *mp, int flags)
+{
+ struct buf *bp;
+ struct inode *ip;
+ struct lfs *fs;
+ struct segment *sp;
+ struct vnode *vp;
+ SEGUSE *segusep;
+ int do_ckp, did_ckp, error;
+ unsigned n, segleft, maxseg, sn, i, curseg;
+ int writer_set = 0;
+ int dirty;
+ int redo;
+ int um_error;
+ int loopcount;
+
+ fs = VFSTOUFS(mp)->um_lfs;
+ ASSERT_MAYBE_SEGLOCK(fs);
+
+ if (fs->lfs_ronly)
+ return EROFS;
+
+ lfs_imtime(fs);
+
+ /*
+ * Allocate a segment structure and enough space to hold pointers to
+ * the maximum possible number of buffers which can be described in a
+ * single summary block.
+ */
+ do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags);
+
+ lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
+ sp = fs->lfs_sp;
+ if (sp->seg_flags & (SEGM_CLEAN | SEGM_CKP))
+ do_ckp = 1;
+
+ /*
+ * If lfs_flushvp is non-NULL, we are called from lfs_vflush,
+ * in which case we have to flush *all* buffers off of this vnode.
+ * We don't care about other nodes, but write any non-dirop nodes
+ * anyway in anticipation of another getnewvnode().
+ *
+ * If we're cleaning we only write cleaning and ifile blocks, and
+ * no dirops, since otherwise we'd risk corruption in a crash.
+ */
+ if (sp->seg_flags & SEGM_CLEAN)
+ lfs_writevnodes(fs, mp, sp, VN_CLEAN);
+ else if (!(sp->seg_flags & SEGM_FORCE_CKP)) {
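+ /*
+ * Write regular vnodes first; if we are checkpointing or no
+ * dirops are in progress, also take the writer lock, write the
+ * dirop vnodes and flush pending dirops. On a checkpoint,
+ * retry from the top if any of the writes failed.
+ */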
+ do {
+ um_error = lfs_writevnodes(fs, mp, sp, VN_REG);
+
+ if (do_ckp || fs->lfs_dirops == 0) {
+ if (!writer_set) {
+ lfs_writer_enter(fs, "lfs writer");
+ writer_set = 1;
+ }
+ error = lfs_writevnodes(fs, mp, sp, VN_DIROP);
+ if (um_error == 0)
+ um_error = error;
+ /* In case writevnodes errored out */
+ lfs_flush_dirops(fs);
+ ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
+ lfs_finalize_fs_seguse(fs);
+ }
+ if (do_ckp && um_error) {
+ lfs_segunlock_relock(fs);
+ sp = fs->lfs_sp;
+ }
+ } while (do_ckp && um_error != 0);
+ }
+
+ /*
+ * If we are doing a checkpoint, mark everything since the
+ * last checkpoint as no longer ACTIVE.
+ */
+ if (do_ckp || fs->lfs_doifile) {
+ segleft = fs->lfs_nseg;
+ curseg = 0;
+ for (n = 0; n < fs->lfs_segtabsz; n++) {
+ dirty = 0;
+ if (bread(fs->lfs_ivnode, fs->lfs_cleansz + n,
+ fs->lfs_bsize, NOCRED, B_MODIFY, &bp))
+ panic("lfs_segwrite: ifile read");
+ segusep = (SEGUSE *)bp->b_data;
+ maxseg = min(segleft, fs->lfs_sepb);
+ for (i = 0; i < maxseg; i++) {
+ sn = curseg + i;
+ if (sn != dtosn(fs, fs->lfs_curseg) &&
+ segusep->su_flags & SEGUSE_ACTIVE) {
+ segusep->su_flags &= ~SEGUSE_ACTIVE;
+ --fs->lfs_nactive;
+ ++dirty;
+ }
+ fs->lfs_suflags[fs->lfs_activesb][sn] =
+ segusep->su_flags;
+ if (fs->lfs_version > 1)
+ ++segusep;
+ else
+ segusep = (SEGUSE *)
+ ((SEGUSE_V1 *)segusep + 1);
+ }
+
+ if (dirty)
+ error = LFS_BWRITE_LOG(bp); /* Ifile */
+ else
+ brelse(bp, 0);
+ segleft -= fs->lfs_sepb;
+ curseg += fs->lfs_sepb;
+ }
+ }
+
+ KASSERT(LFS_SEGLOCK_HELD(fs));
+
+ did_ckp = 0;
+ if (do_ckp || fs->lfs_doifile) {
+ vp = fs->lfs_ivnode;
+ vn_lock(vp, LK_EXCLUSIVE);
+ loopcount = 0;
+ do {
+#ifdef DEBUG
+ LFS_ENTER_LOG("pretend", __FILE__, __LINE__, 0, 0, curproc->p_pid);
+#endif
+ mutex_enter(&lfs_lock);
+ fs->lfs_flags &= ~LFS_IFDIRTY;
+ mutex_exit(&lfs_lock);
+
+ ip = VTOI(vp);
+
+ if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
+ /*
+ * Ifile has no pages, so we don't need
+ * to check error return here.
+ */
+ lfs_writefile(fs, sp, vp);
+ /*
+ * Ensure the Ifile takes the current segment
+ * into account. See comment in lfs_vflush.
+ */
+ lfs_writefile(fs, sp, vp);
+ lfs_writefile(fs, sp, vp);
+ }
+
+ if (ip->i_flag & IN_ALLMOD)
+ ++did_ckp;
+#if 0
+ redo = (do_ckp ? lfs_writeinode(fs, sp, ip) : 0);
+#else
+ redo = lfs_writeinode(fs, sp, ip);
+#endif
+ redo += lfs_writeseg(fs, sp);
+ mutex_enter(&lfs_lock);
+ redo += (fs->lfs_flags & LFS_IFDIRTY);
+ mutex_exit(&lfs_lock);
+#ifdef DEBUG
+ if (++loopcount > 2)
+ log(LOG_NOTICE, "lfs_segwrite: looping count=%d\n",
+ loopcount);
+#endif
+ } while (redo && do_ckp);
+
+ /*
+ * Unless we are unmounting, the Ifile may continue to have
+ * dirty blocks even after a checkpoint, due to changes to
+ * inodes' atime. If we're checkpointing, it's "impossible"
+ * for other parts of the Ifile to be dirty after the loop
+ * above, since we hold the segment lock.
+ */
+ mutex_enter(vp->v_interlock);
+ if (LIST_EMPTY(&vp->v_dirtyblkhd)) {
+ LFS_CLR_UINO(ip, IN_ALLMOD);
+ }
+#ifdef DIAGNOSTIC
+ else if (do_ckp) {
+ int do_panic = 0;
+ LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
+ if (bp->b_lblkno < fs->lfs_cleansz +
+ fs->lfs_segtabsz &&
+ !(bp->b_flags & B_GATHERED)) {
+ printf("ifile lbn %ld still dirty (flags %lx)\n",
+ (long)bp->b_lblkno,
+ (long)bp->b_flags);
+ ++do_panic;
+ }
+ }
+ if (do_panic)
+ panic("dirty blocks");
+ }
+#endif
+ mutex_exit(vp->v_interlock);
+ VOP_UNLOCK(vp);
+ } else {
+ (void) lfs_writeseg(fs, sp);
+ }
+
+ /* Note Ifile no longer needs to be written */
+ fs->lfs_doifile = 0;
+ if (writer_set)
+ lfs_writer_leave(fs);
+
+ /*
+ * If we didn't write the Ifile, we didn't really do anything.
+ * That means that (1) there is a checkpoint on disk and (2)
+ * nothing has changed since it was written.
+ *
+ * Take the flags off of the segment so that lfs_segunlock
+ * doesn't have to write the superblock either.
+ */
+ if (do_ckp && !did_ckp) {
+ sp->seg_flags &= ~SEGM_CKP;
+ }
+
+ if (lfs_dostats) {
+ ++lfs_stats.nwrites;
+ if (sp->seg_flags & SEGM_SYNC)
+ ++lfs_stats.nsync_writes;
+ if (sp->seg_flags & SEGM_CKP)
+ ++lfs_stats.ncheckpoints;
+ }
+ lfs_segunlock(fs);
+ return (0);
+}
+
+/*
+ * Write the dirty blocks associated with a vnode.
+ */
+int
+lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp)
+{
+ struct finfo *fip;
+ struct inode *ip;
+ int i, frag;
+ int error;
+
+ ASSERT_SEGLOCK(fs);
+ error = 0;
+ ip = VTOI(vp);
+
+ fip = sp->fip;
+ lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
+
+ if (vp->v_uflag & VU_DIROP)
+ ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
+
+ if (sp->seg_flags & SEGM_CLEAN) {
+ lfs_gather(fs, sp, vp, lfs_match_fake);
+ /*
+ * For a file being flushed, we need to write *all* blocks.
+ * This means writing the cleaning blocks first, and then
+ * immediately following with any non-cleaning blocks.
+ * The same is true of the Ifile since checkpoints assume
+ * that all valid Ifile blocks are written.
+ */
+ if (IS_FLUSHING(fs, vp) || vp == fs->lfs_ivnode) {
+ lfs_gather(fs, sp, vp, lfs_match_data);
+ /*
+ * Don't call VOP_PUTPAGES: if we're flushing,
+ * we've already done it, and the Ifile doesn't
+ * use the page cache.
+ */
+ }
+ } else {
+ lfs_gather(fs, sp, vp, lfs_match_data);
+ /*
+ * If we're flushing, we've already called VOP_PUTPAGES
+ * so don't do it again. Otherwise, we want to write
+ * everything we've got.
+ */
+ if (!IS_FLUSHING(fs, vp)) {
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp, 0, 0,
+ PGO_CLEANIT | PGO_ALLPAGES | PGO_LOCKED);
+ }
+ }
+
+ /*
+ * It may not be necessary to write the meta-data blocks at this point,
+ * as the roll-forward recovery code should be able to reconstruct the
+ * list.
+ *
+ * We have to write them anyway, though, under two conditions: (1) the
+ * vnode is being flushed (for reuse by vinvalbuf); or (2) we are
+ * checkpointing.
+ *
+ * BUT if we are cleaning, we might have indirect blocks that refer to
+ * new blocks not being written yet, in addition to fragments being
+ * moved out of a cleaned segment. If that is the case, don't
+ * write the indirect blocks, or the finfo will have a small block
+ * in the middle of it!
+ * XXX in this case isn't the inode size wrong too?
+ */
+ frag = 0;
+ if (sp->seg_flags & SEGM_CLEAN) {
+ for (i = 0; i < NDADDR; i++)
+ if (ip->i_lfs_fragsize[i] > 0 &&
+ ip->i_lfs_fragsize[i] < fs->lfs_bsize)
+ ++frag;
+ }
+#ifdef DIAGNOSTIC
+ if (frag > 1)
+ panic("lfs_writefile: more than one fragment!");
+#endif
+ if (IS_FLUSHING(fs, vp) ||
+ (frag == 0 && (lfs_writeindir || (sp->seg_flags & SEGM_CKP)))) {
+ lfs_gather(fs, sp, vp, lfs_match_indir);
+ lfs_gather(fs, sp, vp, lfs_match_dindir);
+ lfs_gather(fs, sp, vp, lfs_match_tindir);
+ }
+ fip = sp->fip;
+ lfs_release_finfo(fs);
+
+ return error;
+}
+
+/*
+ * Update segment accounting to reflect this inode's change of address.
+ */
+static int
+lfs_update_iaddr(struct lfs *fs, struct segment *sp, struct inode *ip, daddr_t ndaddr)
+{
+ struct buf *bp;
+ daddr_t daddr;
+ IFILE *ifp;
+ SEGUSE *sup;
+ ino_t ino;
+ int redo_ifile, error;
+ u_int32_t sn;
+
+ redo_ifile = 0;
+
+ /*
+ * If updating the ifile, update the super-block. Update the disk
+ * address and access times for this inode in the ifile.
+ */
+ ino = ip->i_number;
+ if (ino == LFS_IFILE_INUM) {
+ daddr = fs->lfs_idaddr;
+ fs->lfs_idaddr = dbtofsb(fs, ndaddr);
+ } else {
+ LFS_IENTRY(ifp, fs, ino, bp);
+ daddr = ifp->if_daddr;
+ ifp->if_daddr = dbtofsb(fs, ndaddr);
+ error = LFS_BWRITE_LOG(bp); /* Ifile */
+ }
+
+ /*
+ * If this is the Ifile and lfs_offset is set to the first block
+ * in the segment, dirty the new segment's accounting block
+ * (XXX should already be dirty?) and tell the caller to do it again.
+ */
+ if (ip->i_number == LFS_IFILE_INUM) {
+ sn = dtosn(fs, fs->lfs_offset);
+ if (sntod(fs, sn) + btofsb(fs, fs->lfs_sumsize) ==
+ fs->lfs_offset) {
+ LFS_SEGENTRY(sup, fs, sn, bp);
+ KASSERT(bp->b_oflags & BO_DELWRI);
+ LFS_WRITESEGENTRY(sup, fs, sn, bp);
+ /* fs->lfs_flags |= LFS_IFDIRTY; */
+ redo_ifile |= 1;
+ }
+ }
+
+ /*
+ * The inode's last address should not be in the current partial
+ * segment, except under exceptional circumstances (lfs_writevnodes
+ * had to start over, and in the meantime more blocks were written
+ * to a vnode). Both inodes will be accounted to this segment
+ * in lfs_writeseg so we need to subtract the earlier version
+ * here anyway. The segment count can temporarily dip below
+ * zero here; keep track of how many duplicates we have in
+ * "dupino" so we don't panic below.
+ */
+ if (daddr >= fs->lfs_lastpseg && daddr <= fs->lfs_offset) {
+ ++sp->ndupino;
+ DLOG((DLOG_SEG, "lfs_writeinode: last inode addr in current pseg "
+ "(ino %d daddr 0x%llx) ndupino=%d\n", ino,
+ (long long)daddr, sp->ndupino));
+ }
+ /*
+ * Account the inode: it no longer belongs to its former segment,
+ * though it will not belong to the new segment until that segment
+ * is actually written.
+ */
+ if (daddr != LFS_UNUSED_DADDR) {
+ u_int32_t oldsn = dtosn(fs, daddr);
+#ifdef DIAGNOSTIC
+ int ndupino = (sp->seg_number == oldsn) ? sp->ndupino : 0;
+#endif
+ LFS_SEGENTRY(sup, fs, oldsn, bp);
+#ifdef DIAGNOSTIC
+ if (sup->su_nbytes +
+ sizeof (struct ufs1_dinode) * ndupino
+ < sizeof (struct ufs1_dinode)) {
+ printf("lfs_writeinode: negative bytes "
+ "(segment %" PRIu32 " short by %d, "
+ "oldsn=%" PRIu32 ", cursn=%" PRIu32
+ ", daddr=%" PRId64 ", su_nbytes=%u, "
+ "ndupino=%d)\n",
+ dtosn(fs, daddr),
+ (int)sizeof (struct ufs1_dinode) *
+ (1 - sp->ndupino) - sup->su_nbytes,
+ oldsn, sp->seg_number, daddr,
+ (unsigned int)sup->su_nbytes,
+ sp->ndupino);
+ panic("lfs_writeinode: negative bytes");
+ sup->su_nbytes = sizeof (struct ufs1_dinode);
+ }
+#endif
+ DLOG((DLOG_SU, "seg %d -= %d for ino %d inode\n",
+ dtosn(fs, daddr), sizeof (struct ufs1_dinode), ino));
+ sup->su_nbytes -= sizeof (struct ufs1_dinode);
+ redo_ifile |=
+ (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
+ if (redo_ifile) {
+ mutex_enter(&lfs_lock);
+ fs->lfs_flags |= LFS_IFDIRTY;
+ mutex_exit(&lfs_lock);
+ /* Don't double-account */
+ fs->lfs_idaddr = 0x0;
+ }
+ LFS_WRITESEGENTRY(sup, fs, oldsn, bp); /* Ifile */
+ }
+
+ return redo_ifile;
+}
+
+int
+lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
+{
+ struct buf *bp;
+ struct ufs1_dinode *cdp;
+ daddr_t daddr;
+ int32_t *daddrp; /* XXX ondisk32 */
+ int i, ndx;
+ int redo_ifile = 0;
+ int gotblk = 0;
+ int count;
+
+ ASSERT_SEGLOCK(fs);
+ if (!(ip->i_flag & IN_ALLMOD))
+ return (0);
+
+ /* Can't write ifile when writer is not set */
+ KASSERT(ip->i_number != LFS_IFILE_INUM || fs->lfs_writer > 0 ||
+ (sp->seg_flags & SEGM_CLEAN));
+
+ /*
+ * If this is the Ifile, see if writing it here will generate a
+ * temporary misaccounting. If it will, do the accounting and write
+ * the blocks, postponing the inode write until the accounting is
+ * solid.
+ */
+ count = 0;
+ while (ip->i_number == LFS_IFILE_INUM) {
+ int redo = 0;
+
+ if (sp->idp == NULL && sp->ibp == NULL &&
+ (sp->seg_bytes_left < fs->lfs_ibsize ||
+ sp->sum_bytes_left < sizeof(int32_t))) {
+ (void) lfs_writeseg(fs, sp);
+ continue;
+ }
+
+ /* Look for dirty Ifile blocks */
+ LIST_FOREACH(bp, &fs->lfs_ivnode->v_dirtyblkhd, b_vnbufs) {
+ if (!(bp->b_flags & B_GATHERED)) {
+ redo = 1;
+ break;
+ }
+ }
+
+ if (redo == 0)
+ redo = lfs_update_iaddr(fs, sp, ip, 0x0);
+ if (redo == 0)
+ break;
+
+ if (sp->idp) {
+ sp->idp->di_inumber = 0;
+ sp->idp = NULL;
+ }
+ ++count;
+ if (count > 2)
+ log(LOG_NOTICE, "lfs_writeinode: looping count=%d\n", count);
+ lfs_writefile(fs, sp, fs->lfs_ivnode);
+ }
+
+ /* Allocate a new inode block if necessary. */
+ if ((ip->i_number != LFS_IFILE_INUM || sp->idp == NULL) &&
+ sp->ibp == NULL) {
+ /* Allocate a new segment if necessary. */
+ if (sp->seg_bytes_left < fs->lfs_ibsize ||
+ sp->sum_bytes_left < sizeof(int32_t))
+ (void) lfs_writeseg(fs, sp);
+
+ /* Get next inode block. */
+ daddr = fs->lfs_offset;
+ fs->lfs_offset += btofsb(fs, fs->lfs_ibsize);
+ sp->ibp = *sp->cbpp++ =
+ getblk(VTOI(fs->lfs_ivnode)->i_devvp,
+ fsbtodb(fs, daddr), fs->lfs_ibsize, 0, 0);
+ gotblk++;
+
+ /* Zero out inode numbers */
+ for (i = 0; i < INOPB(fs); ++i)
+ ((struct ufs1_dinode *)sp->ibp->b_data)[i].di_inumber =
+ 0;
+
+ ++sp->start_bpp;
+ fs->lfs_avail -= btofsb(fs, fs->lfs_ibsize);
+ /* Set remaining space counters. */
+ sp->seg_bytes_left -= fs->lfs_ibsize;
+ sp->sum_bytes_left -= sizeof(int32_t);
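+ /*
+ * Record this inode block's disk address in the segment
+ * summary; inode block addresses are int32_t entries filled
+ * in backwards from the end of the summary block.
+ */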
+ ndx = fs->lfs_sumsize / sizeof(int32_t) -
+ sp->ninodes / INOPB(fs) - 1;
+ ((int32_t *)(sp->segsum))[ndx] = daddr;
+ }
+
+ /* Check VU_DIROP in case there is a new file with no data blocks */
+ if (ITOV(ip)->v_uflag & VU_DIROP)
+ ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
+
+ /* Update the inode times and copy the inode onto the inode page. */
+ /* XXX kludge --- don't redirty the ifile just to put times on it */
+ if (ip->i_number != LFS_IFILE_INUM)
+ LFS_ITIMES(ip, NULL, NULL, NULL);
+
+ /*
+ * If this is the Ifile, and we've already written the Ifile in this
+ * partial segment, just overwrite it (it's not on disk yet) and
+ * continue.
+ *
+ * XXX we know that the bp that we get the second time around has
+ * already been gathered.
+ */
+ if (ip->i_number == LFS_IFILE_INUM && sp->idp) {
+ *(sp->idp) = *ip->i_din.ffs1_din;
+ ip->i_lfs_osize = ip->i_size;
+ return 0;
+ }
+
+ bp = sp->ibp;
+ cdp = ((struct ufs1_dinode *)bp->b_data) + (sp->ninodes % INOPB(fs));
+ *cdp = *ip->i_din.ffs1_din;
+
+ /*
+ * If cleaning, link counts and directory file sizes cannot change,
+ * since those would be directory operations---even if the file
+ * we are writing is marked VU_DIROP we should write the old values.
+ * If we're not cleaning, of course, update the values so we get
+ * current values the next time we clean.
+ */
+ if (sp->seg_flags & SEGM_CLEAN) {
+ if (ITOV(ip)->v_uflag & VU_DIROP) {
+ cdp->di_nlink = ip->i_lfs_odnlink;
+ /* if (ITOV(ip)->v_type == VDIR) */
+ cdp->di_size = ip->i_lfs_osize;
+ }
+ } else {
+ ip->i_lfs_odnlink = cdp->di_nlink;
+ ip->i_lfs_osize = ip->i_size;
+ }
+
+ /* We can finish the segment accounting for truncations now */
+ lfs_finalize_ino_seguse(fs, ip);
+
+ /*
+ * If we are cleaning, ensure that we don't write UNWRITTEN disk
+ * addresses to disk; possibly change the on-disk record of
+ * the inode size, either by reverting to the previous size
+ * (in the case of cleaning) or by verifying the inode's block
+ * holdings (in the case of files being allocated as they are being
+ * written).
+ * XXX By not writing UNWRITTEN blocks, we are making the lfs_avail
+ * XXX count on disk wrong by the same amount. We should be
+ * XXX able to "borrow" from lfs_avail and return it after the
+ * XXX Ifile is written. See also in lfs_writeseg.
+ */
+
+ /* Check file size based on highest allocated block */
+ if (((ip->i_ffs1_mode & IFMT) == IFREG ||
+ (ip->i_ffs1_mode & IFMT) == IFDIR) &&
+ ip->i_size > ((ip->i_lfs_hiblk + 1) << fs->lfs_bshift)) {
+ cdp->di_size = (ip->i_lfs_hiblk + 1) << fs->lfs_bshift;
+ DLOG((DLOG_SEG, "lfs_writeinode: ino %d size %" PRId64 " -> %"
+ PRId64 "\n", (int)ip->i_number, ip->i_size, cdp->di_size));
+ }
+ if (ip->i_lfs_effnblks != ip->i_ffs1_blocks) {
+ DLOG((DLOG_SEG, "lfs_writeinode: cleansing ino %d eff %d != nblk %d)"
+ " at %x\n", ip->i_number, ip->i_lfs_effnblks,
+ ip->i_ffs1_blocks, fs->lfs_offset));
+ for (daddrp = cdp->di_db; daddrp < cdp->di_ib + NIADDR;
+ daddrp++) {
+ if (*daddrp == UNWRITTEN) {
+ DLOG((DLOG_SEG, "lfs_writeinode: wiping UNWRITTEN\n"));
+ *daddrp = 0;
+ }
+ }
+ }
+
+#ifdef DIAGNOSTIC
+ /*
+ * Check dinode held blocks against dinode size.
+ * This should be identical to the check in lfs_vget().
+ */
+ for (i = (cdp->di_size + fs->lfs_bsize - 1) >> fs->lfs_bshift;
+ i < NDADDR; i++) {
+ KASSERT(i >= 0);
+ if ((cdp->di_mode & IFMT) == IFLNK)
+ continue;
+ if (((cdp->di_mode & IFMT) == IFBLK ||
+ (cdp->di_mode & IFMT) == IFCHR) && i == 0)
+ continue;
+ if (cdp->di_db[i] != 0) {
+# ifdef DEBUG
+ lfs_dump_dinode(cdp);
+# endif
+ panic("writing inconsistent inode");
+ }
+ }
+#endif /* DIAGNOSTIC */
+
+ if (ip->i_flag & IN_CLEANING)
+ LFS_CLR_UINO(ip, IN_CLEANING);
+ else {
+ /* XXX IN_ALLMOD */
+ LFS_CLR_UINO(ip, IN_ACCESSED | IN_ACCESS | IN_CHANGE |
+ IN_UPDATE | IN_MODIFY);
+ if (ip->i_lfs_effnblks == ip->i_ffs1_blocks)
+ LFS_CLR_UINO(ip, IN_MODIFIED);
+ else {
+ DLOG((DLOG_VNODE, "lfs_writeinode: ino %d: real "
+ "blks=%d, eff=%d\n", ip->i_number,
+ ip->i_ffs1_blocks, ip->i_lfs_effnblks));
+ }
+ }
+
+ if (ip->i_number == LFS_IFILE_INUM) {
+ /* We know sp->idp == NULL */
+ sp->idp = ((struct ufs1_dinode *)bp->b_data) +
+ (sp->ninodes % INOPB(fs));
+
+ /* Not dirty any more */
+ mutex_enter(&lfs_lock);
+ fs->lfs_flags &= ~LFS_IFDIRTY;
+ mutex_exit(&lfs_lock);
+ }
+
+ if (gotblk) {
+ mutex_enter(&bufcache_lock);
+ LFS_LOCK_BUF(bp);
+ brelsel(bp, 0);
+ mutex_exit(&bufcache_lock);
+ }
+
+ /* Increment inode count in segment summary block. */
+ ++((SEGSUM *)(sp->segsum))->ss_ninos;
+
+ /* If this page is full, set flag to allocate a new page. */
+ if (++sp->ninodes % INOPB(fs) == 0)
+ sp->ibp = NULL;
+
+ redo_ifile = lfs_update_iaddr(fs, sp, ip, bp->b_blkno);
+
+ KASSERT(redo_ifile == 0);
+ return (redo_ifile);
+}
+
+int
+lfs_gatherblock(struct segment *sp, struct buf *bp, kmutex_t *mptr)
+{
+ struct lfs *fs;
+ int vers;
+ int j, blksinblk;
+
+ ASSERT_SEGLOCK(sp->fs);
+ /*
+ * If full, finish this segment. We may be doing I/O, so
+ * release and reacquire the lock passed in (mptr) around the write.
+ */
+#ifdef DIAGNOSTIC
+ if (sp->vp == NULL)
+ panic ("lfs_gatherblock: Null vp in segment");
+#endif
+ fs = sp->fs;
+ blksinblk = howmany(bp->b_bcount, fs->lfs_bsize);
+ if (sp->sum_bytes_left < sizeof(int32_t) * blksinblk ||
+ sp->seg_bytes_left < bp->b_bcount) {
+ if (mptr)
+ mutex_exit(mptr);
+ lfs_updatemeta(sp);
+
+ vers = sp->fip->fi_version;
+ (void) lfs_writeseg(fs, sp);
+
+ /* Add the current file to the segment summary. */
+ lfs_acquire_finfo(fs, VTOI(sp->vp)->i_number, vers);
+
+ if (mptr)
+ mutex_enter(mptr);
+ return (1);
+ }
+
+ if (bp->b_flags & B_GATHERED) {
+ DLOG((DLOG_SEG, "lfs_gatherblock: already gathered! Ino %d,"
+ " lbn %" PRId64 "\n",
+ sp->fip->fi_ino, bp->b_lblkno));
+ return (0);
+ }
+
+ /* Insert into the buffer list, update the FINFO block. */
+ bp->b_flags |= B_GATHERED;
+
+ *sp->cbpp++ = bp;
+ for (j = 0; j < blksinblk; j++) {
+ sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno + j;
+ /* This block's accounting moves from lfs_favail to lfs_avail */
+ lfs_deregister_block(sp->vp, bp->b_lblkno + j);
+ }
+
+ sp->sum_bytes_left -= sizeof(int32_t) * blksinblk;
+ sp->seg_bytes_left -= bp->b_bcount;
+ return (0);
+}
+
+int
+lfs_gather(struct lfs *fs, struct segment *sp, struct vnode *vp,
+ int (*match)(struct lfs *, struct buf *))
+{
+ struct buf *bp, *nbp;
+ int count = 0;
+
+ ASSERT_SEGLOCK(fs);
+ if (vp->v_type == VBLK)
+ return 0;
+ KASSERT(sp->vp == NULL);
+ sp->vp = vp;
+ mutex_enter(&bufcache_lock);
+
+#ifndef LFS_NO_BACKBUF_HACK
+/* This is a hack to see if ordering the blocks in LFS makes a difference. */
+# define BUF_OFFSET \
+ (((char *)&LIST_NEXT(bp, b_vnbufs)) - (char *)bp)
+# define BACK_BUF(BP) \
+ ((struct buf *)(((char *)(BP)->b_vnbufs.le_prev) - BUF_OFFSET))
+# define BEG_OF_LIST \
+ ((struct buf *)(((char *)&LIST_FIRST(&vp->v_dirtyblkhd)) - BUF_OFFSET))
+
+loop:
+ /* Find last buffer. */
+ for (bp = LIST_FIRST(&vp->v_dirtyblkhd);
+ bp && LIST_NEXT(bp, b_vnbufs) != NULL;
+ bp = LIST_NEXT(bp, b_vnbufs))
+ /* nothing */;
+ for (; bp && bp != BEG_OF_LIST; bp = nbp) {
+ nbp = BACK_BUF(bp);
+#else /* LFS_NO_BACKBUF_HACK */
+loop:
+ for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+ nbp = LIST_NEXT(bp, b_vnbufs);
+#endif /* LFS_NO_BACKBUF_HACK */
+ if ((bp->b_cflags & BC_BUSY) != 0 ||
+ (bp->b_flags & B_GATHERED) != 0 || !match(fs, bp)) {
+#ifdef DEBUG
+ if (vp == fs->lfs_ivnode &&
+ (bp->b_cflags & BC_BUSY) != 0 &&
+ (bp->b_flags & B_GATHERED) == 0)
+ log(LOG_NOTICE, "lfs_gather: ifile lbn %"
+ PRId64 " busy (%x) at 0x%x",
+ bp->b_lblkno, bp->b_flags,
+ (unsigned)fs->lfs_offset);
+#endif
+ continue;
+ }
+#ifdef DIAGNOSTIC
+# ifdef LFS_USE_B_INVAL
+ if ((bp->b_flags & BC_INVAL) != 0 && bp->b_iodone == NULL) {
+ DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64
+ " is BC_INVAL\n", bp->b_lblkno));
+ VOP_PRINT(bp->b_vp);
+ }
+# endif /* LFS_USE_B_INVAL */
+ if (!(bp->b_oflags & BO_DELWRI))
+ panic("lfs_gather: bp not BO_DELWRI");
+ if (!(bp->b_flags & B_LOCKED)) {
+ DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64
+ " blk %" PRId64 " not B_LOCKED\n",
+ bp->b_lblkno,
+ dbtofsb(fs, bp->b_blkno)));
+ VOP_PRINT(bp->b_vp);
+ panic("lfs_gather: bp not B_LOCKED");
+ }
+#endif
+ if (lfs_gatherblock(sp, bp, &bufcache_lock)) {
+ goto loop;
+ }
+ count++;
+ }
+ mutex_exit(&bufcache_lock);
+ lfs_updatemeta(sp);
+ KASSERT(sp->vp == vp);
+ sp->vp = NULL;
+ return count;
+}
+
+#if DEBUG
+# define DEBUG_OOFF(n) do { \
+ if (ooff == 0) { \
+ DLOG((DLOG_SEG, "lfs_updatemeta[%d]: warning: writing " \
+ "ino %d lbn %" PRId64 " at 0x%" PRIx32 \
+ ", was 0x0 (or %" PRId64 ")\n", \
+ (n), ip->i_number, lbn, ndaddr, daddr)); \
+ } \
+} while (0)
+#else
+# define DEBUG_OOFF(n)
+#endif
+
+/*
+ * Change the given block's address to ndaddr, finding its previous
+ * location using ufs_bmaparray().
+ *
+ * Account for this change in the segment table.
+ *
+ * called with sp == NULL by roll-forwarding code.
+ */
+void
+lfs_update_single(struct lfs *fs, struct segment *sp,
+ struct vnode *vp, daddr_t lbn, int32_t ndaddr, int size)
+{
+ SEGUSE *sup;
+ struct buf *bp;
+ struct indir a[NIADDR + 2], *ap;
+ struct inode *ip;
+ daddr_t daddr, ooff;
+ int num, error;
+ int bb, osize, obb;
+
+ ASSERT_SEGLOCK(fs);
+ KASSERT(sp == NULL || sp->vp == vp);
+ ip = VTOI(vp);
+
+ error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL, NULL);
+ if (error)
+ panic("lfs_updatemeta: ufs_bmaparray returned %d", error);
+
+ daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */
+ KASSERT(daddr <= LFS_MAX_DADDR);
+ if (daddr > 0)
+ daddr = dbtofsb(fs, daddr);
+
+ bb = numfrags(fs, size);
+ switch (num) {
+ case 0:
+ ooff = ip->i_ffs1_db[lbn];
+ DEBUG_OOFF(0);
+ if (ooff == UNWRITTEN)
+ ip->i_ffs1_blocks += bb;
+ else {
+ /* possible fragment truncation or extension */
+ obb = btofsb(fs, ip->i_lfs_fragsize[lbn]);
+ ip->i_ffs1_blocks += (bb - obb);
+ }
+ ip->i_ffs1_db[lbn] = ndaddr;
+ break;
+ case 1:
+ ooff = ip->i_ffs1_ib[a[0].in_off];
+ DEBUG_OOFF(1);
+ if (ooff == UNWRITTEN)
+ ip->i_ffs1_blocks += bb;
+ ip->i_ffs1_ib[a[0].in_off] = ndaddr;
+ break;
+ default:
+ ap = &a[num - 1];
+ if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED,
+ B_MODIFY, &bp))
+ panic("lfs_updatemeta: bread bno %" PRId64,
+ ap->in_lbn);
+
+ /* XXX ondisk32 */
+ ooff = ((int32_t *)bp->b_data)[ap->in_off];
+ DEBUG_OOFF(num);
+ if (ooff == UNWRITTEN)
+ ip->i_ffs1_blocks += bb;
+ /* XXX ondisk32 */
+ ((int32_t *)bp->b_data)[ap->in_off] = ndaddr;
+ (void) VOP_BWRITE(bp->b_vp, bp);
+ }
+
+ KASSERT(ooff == 0 || ooff == UNWRITTEN || ooff == daddr);
+
+ /* Update hiblk when extending the file */
+ if (lbn > ip->i_lfs_hiblk)
+ ip->i_lfs_hiblk = lbn;
+
+ /*
+ * Though we'd rather it couldn't, this *can* happen right now
+ * if cleaning blocks and regular blocks coexist.
+ */
+ /* KASSERT(daddr < fs->lfs_lastpseg || daddr > ndaddr); */
+
+ /*
+ * Update segment usage information, based on old size
+ * and location.
+ */
+ if (daddr > 0) {
+ u_int32_t oldsn = dtosn(fs, daddr);
+#ifdef DIAGNOSTIC
+ int ndupino;
+
+ if (sp && sp->seg_number == oldsn) {
+ ndupino = sp->ndupino;
+ } else {
+ ndupino = 0;
+ }
+#endif
+ KASSERT(oldsn < fs->lfs_nseg);
+ if (lbn >= 0 && lbn < NDADDR)
+ osize = ip->i_lfs_fragsize[lbn];
+ else
+ osize = fs->lfs_bsize;
+ LFS_SEGENTRY(sup, fs, oldsn, bp);
+#ifdef DIAGNOSTIC
+ if (sup->su_nbytes + sizeof (struct ufs1_dinode) * ndupino
+ < osize) {
+ printf("lfs_updatemeta: negative bytes "
+ "(segment %" PRIu32 " short by %" PRId64
+ ")\n", dtosn(fs, daddr),
+ (int64_t)osize -
+ (sizeof (struct ufs1_dinode) * ndupino +
+ sup->su_nbytes));
+ printf("lfs_updatemeta: ino %llu, lbn %" PRId64
+ ", addr = 0x%" PRIx64 "\n",
+ (unsigned long long)ip->i_number, lbn, daddr);
+ printf("lfs_updatemeta: ndupino=%d\n", ndupino);
+ panic("lfs_updatemeta: negative bytes");
+ sup->su_nbytes = osize -
+ sizeof (struct ufs1_dinode) * ndupino;
+ }
+#endif
+ DLOG((DLOG_SU, "seg %" PRIu32 " -= %d for ino %d lbn %" PRId64
+ " db 0x%" PRIx64 "\n",
+ dtosn(fs, daddr), osize,
+ ip->i_number, lbn, daddr));
+ sup->su_nbytes -= osize;
+ if (!(bp->b_flags & B_GATHERED)) {
+ mutex_enter(&lfs_lock);
+ fs->lfs_flags |= LFS_IFDIRTY;
+ mutex_exit(&lfs_lock);
+ }
+ LFS_WRITESEGENTRY(sup, fs, oldsn, bp);
+ }
+ /*
+ * Now that this block has a new address, and its old
+ * segment no longer owns it, we can forget about its
+ * old size.
+ */
+ if (lbn >= 0 && lbn < NDADDR)
+ ip->i_lfs_fragsize[lbn] = size;
+}
+
+/*
+ * Update the metadata that points to the blocks listed in the FINFO
+ * array.
+ */
+void
+lfs_updatemeta(struct segment *sp)
+{
+ struct buf *sbp;
+ struct lfs *fs;
+ struct vnode *vp;
+ daddr_t lbn;
+ int i, nblocks, num;
+ int bb;
+ int bytesleft, size;
+
+ ASSERT_SEGLOCK(sp->fs);
+ vp = sp->vp;
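+ /*
+ * start_lbp points at the first FINFO block entry that has not
+ * yet been assigned a disk address, so this difference is the
+ * number of entries gathered since the last update.
+ */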
+ nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp;
+ KASSERT(nblocks >= 0);
+ KASSERT(vp != NULL);
+ if (nblocks == 0)
+ return;
+
+ /*
+ * This count may be high due to oversize blocks from lfs_gop_write.
+ * Correct for this. (XXX we should be able to keep track of these.)
+ */
+ fs = sp->fs;
+ for (i = 0; i < nblocks; i++) {
+ if (sp->start_bpp[i] == NULL) {
+ DLOG((DLOG_SEG, "lfs_updatemeta: nblocks = %d, not %d\n", i, nblocks));
+ nblocks = i;
+ break;
+ }
+ num = howmany(sp->start_bpp[i]->b_bcount, fs->lfs_bsize);
+ KASSERT(sp->start_bpp[i]->b_lblkno >= 0 || num == 1);
+ nblocks -= num - 1;
+ }
+
+ KASSERT(vp->v_type == VREG ||
+ nblocks == &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp);
+ KASSERT(nblocks == sp->cbpp - sp->start_bpp);
+
+ /*
+ * Sort the blocks.
+ *
+ * We have to sort even if the blocks come from the
+ * cleaner, because there might be other pending blocks on the
+ * same inode...and if we don't sort, and there are fragments
+ * present, blocks may be written in the wrong place.
+ */
+ lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks, fs->lfs_bsize);
+
+ /*
+ * Record the length of the last block in case it's a fragment.
+ * If there are indirect blocks present, they sort last. An
+ * indirect block will be lfs_bsize and its presence indicates
+ * that you cannot have fragments.
+ *
+ * XXX This last is a lie. A cleaned fragment can coexist with
+ * XXX a later indirect block. This will continue to be
+ * XXX true until lfs_markv is fixed to do everything with
+ * XXX fake blocks (including fake inodes and fake indirect blocks).
+ */
+ sp->fip->fi_lastlength = ((sp->start_bpp[nblocks - 1]->b_bcount - 1) &
+ fs->lfs_bmask) + 1;
+
+ /*
+ * Assign disk addresses, and update references to the logical
+ * block and the segment usage information.
+ */
+ for (i = nblocks; i--; ++sp->start_bpp) {
+ sbp = *sp->start_bpp;
+ lbn = *sp->start_lbp;
+ KASSERT(sbp->b_lblkno == lbn);
+
+ sbp->b_blkno = fsbtodb(fs, fs->lfs_offset);
+
+ /*
+ * If we write a frag in the wrong place, the cleaner won't
+ * be able to correctly identify its size later, and the
+ * segment will be uncleanable. (Even worse, it will assume
+ * that the indirect block that actually ends the list
+ * is of a smaller size!)
+ */
+ if ((sbp->b_bcount & fs->lfs_bmask) && i != 0)
+ panic("lfs_updatemeta: fragment is not last block");
+
+ /*
+ * For each subblock in this possibly oversized block,
+ * update its address on disk.
+ */
+ KASSERT(lbn >= 0 || sbp->b_bcount == fs->lfs_bsize);
+ KASSERT(vp == sbp->b_vp);
+ for (bytesleft = sbp->b_bcount; bytesleft > 0;
+ bytesleft -= fs->lfs_bsize) {
+ size = MIN(bytesleft, fs->lfs_bsize);
+ bb = numfrags(fs, size);
+ lbn = *sp->start_lbp++;
+ lfs_update_single(fs, sp, sp->vp, lbn, fs->lfs_offset,
+ size);
+ fs->lfs_offset += bb;
+ }
+
+ }
+
+ /* This inode has been modified */
+ LFS_SET_UINO(VTOI(vp), IN_MODIFIED);
+}
+
+/*
+ * Move lfs_offset to a segment earlier than newsn.
+ */
+int
+lfs_rewind(struct lfs *fs, int newsn)
+{
+ int sn, osn, isdirty;
+ struct buf *bp;
+ SEGUSE *sup;
+
+ ASSERT_SEGLOCK(fs);
+
+ osn = dtosn(fs, fs->lfs_offset);
+ if (osn < newsn)
+ return 0;
+
+ /* lfs_avail eats the remaining space in this segment */
+ fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset - fs->lfs_curseg);
+
+ /* Find a low-numbered segment */
+ for (sn = 0; sn < fs->lfs_nseg; ++sn) {
+ LFS_SEGENTRY(sup, fs, sn, bp);
+ isdirty = sup->su_flags & SEGUSE_DIRTY;
+ brelse(bp, 0);
+
+ if (!isdirty)
+ break;
+ }
+ if (sn == fs->lfs_nseg)
+ panic("lfs_rewind: no clean segments");
+ if (newsn >= 0 && sn >= newsn)
+ return ENOENT;
+ fs->lfs_nextseg = sn;
+ lfs_newseg(fs);
+ fs->lfs_offset = fs->lfs_curseg;
+
+ return 0;
+}
+
+/*
+ * Start a new partial segment.
+ *
+ * Return 1 if we advanced to a new segment;
+ * otherwise, return 0.
+ */
+int
+lfs_initseg(struct lfs *fs)
+{
+ struct segment *sp = fs->lfs_sp;
+ SEGSUM *ssp;
+ struct buf *sbp; /* buffer for SEGSUM */
+ int repeat = 0; /* return value */
+
+ ASSERT_SEGLOCK(fs);
+ /* Advance to the next segment. */
+ if (!LFS_PARTIAL_FITS(fs)) {
+ SEGUSE *sup;
+ struct buf *bp;
+
+ /* lfs_avail eats the remaining space */
+ fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset -
+ fs->lfs_curseg);
+ /* Wake up any cleaning procs waiting on this file system. */
+ lfs_wakeup_cleaner(fs);
+ lfs_newseg(fs);
+ repeat = 1;
+ fs->lfs_offset = fs->lfs_curseg;
+
+ sp->seg_number = dtosn(fs, fs->lfs_curseg);
+ sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg);
+
+ /*
+ * If the segment contains a superblock, update the offset
+ * and summary address to skip over it.
+ */
+ LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
+ if (sup->su_flags & SEGUSE_SUPERBLOCK) {
+ fs->lfs_offset += btofsb(fs, LFS_SBPAD);
+ sp->seg_bytes_left -= LFS_SBPAD;
+ }
+ brelse(bp, 0);
+ /* Segment zero could also contain the labelpad */
+ if (fs->lfs_version > 1 && sp->seg_number == 0 &&
+ fs->lfs_start < btofsb(fs, LFS_LABELPAD)) {
+ fs->lfs_offset +=
+ btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
+ sp->seg_bytes_left -=
+ LFS_LABELPAD - fsbtob(fs, fs->lfs_start);
+ }
+ } else {
+ sp->seg_number = dtosn(fs, fs->lfs_curseg);
+ sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg -
+ (fs->lfs_offset - fs->lfs_curseg));
+ }
+ fs->lfs_lastpseg = fs->lfs_offset;
+
+ /* Record first address of this partial segment */
+ if (sp->seg_flags & SEGM_CLEAN) {
+ fs->lfs_cleanint[fs->lfs_cleanind] = fs->lfs_offset;
+ if (++fs->lfs_cleanind >= LFS_MAX_CLEANIND) {
+ /* "1" is the artificial inc in lfs_seglock */
+ mutex_enter(&lfs_lock);
+ while (fs->lfs_iocount > 1) {
+ mtsleep(&fs->lfs_iocount, PRIBIO + 1,
+ "lfs_initseg", 0, &lfs_lock);
+ }
+ mutex_exit(&lfs_lock);
+ fs->lfs_cleanind = 0;
+ }
+ }
+
+ sp->fs = fs;
+ sp->ibp = NULL;
+ sp->idp = NULL;
+ sp->ninodes = 0;
+ sp->ndupino = 0;
+
+ sp->cbpp = sp->bpp;
+
+ /* Get a new buffer for SEGSUM */
+ sbp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp,
+ fsbtodb(fs, fs->lfs_offset), fs->lfs_sumsize, LFS_NB_SUMMARY);
+
+ /* ... and enter it into the buffer list. */
+ *sp->cbpp = sbp;
+ sp->cbpp++;
+ fs->lfs_offset += btofsb(fs, fs->lfs_sumsize);
+
+ sp->start_bpp = sp->cbpp;
+
+ /* Set point to SEGSUM, initialize it. */
+ ssp = sp->segsum = sbp->b_data;
+ memset(ssp, 0, fs->lfs_sumsize);
+ ssp->ss_next = fs->lfs_nextseg;
+ ssp->ss_nfinfo = ssp->ss_ninos = 0;
+ ssp->ss_magic = SS_MAGIC;
+
+ /* Set pointer to first FINFO, initialize it. */
+ sp->fip = (struct finfo *)((char *)sp->segsum + SEGSUM_SIZE(fs));
+ sp->fip->fi_nblocks = 0;
+ sp->start_lbp = &sp->fip->fi_blocks[0];
+ sp->fip->fi_lastlength = 0;
+
+ sp->seg_bytes_left -= fs->lfs_sumsize;
+ sp->sum_bytes_left = fs->lfs_sumsize - SEGSUM_SIZE(fs);
+
+ return (repeat);
+}
+
+/*
+ * Remove SEGUSE_INVAL from all segments.
+ */
+void
+lfs_unset_inval_all(struct lfs *fs)
+{
+ SEGUSE *sup;
+ struct buf *bp;
+ int i;
+
+ for (i = 0; i < fs->lfs_nseg; i++) {
+ LFS_SEGENTRY(sup, fs, i, bp);
+ if (sup->su_flags & SEGUSE_INVAL) {
+ sup->su_flags &= ~SEGUSE_INVAL;
+ LFS_WRITESEGENTRY(sup, fs, i, bp);
+ } else
+ brelse(bp, 0);
+ }
+}
+
+/*
+ * Advance to the next segment and select a clean segment to write after it.
+ */
+void
+lfs_newseg(struct lfs *fs)
+{
+ CLEANERINFO *cip;
+ SEGUSE *sup;
+ struct buf *bp;
+ int curseg, isdirty, sn, skip_inval;
+
+ ASSERT_SEGLOCK(fs);
+
+ /* Honor LFCNWRAPSTOP */
+ mutex_enter(&lfs_lock);
+ while (fs->lfs_nextseg < fs->lfs_curseg && fs->lfs_nowrap) {
+ if (fs->lfs_wrappass) {
+ log(LOG_NOTICE, "%s: wrappass=%d\n",
+ fs->lfs_fsmnt, fs->lfs_wrappass);
+ fs->lfs_wrappass = 0;
+ break;
+ }
+ fs->lfs_wrapstatus = LFS_WRAP_WAITING;
+ wakeup(&fs->lfs_nowrap);
+ log(LOG_NOTICE, "%s: waiting at log wrap\n", fs->lfs_fsmnt);
+ mtsleep(&fs->lfs_wrappass, PVFS, "newseg", 10 * hz,
+ &lfs_lock);
+ }
+ fs->lfs_wrapstatus = LFS_WRAP_GOING;
+ mutex_exit(&lfs_lock);
+
+ LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp);
+ DLOG((DLOG_SU, "lfs_newseg: seg %d := 0 in newseg\n",
+ dtosn(fs, fs->lfs_nextseg)));
+ sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
+ sup->su_nbytes = 0;
+ sup->su_nsums = 0;
+ sup->su_ninos = 0;
+ LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp);
+
+ LFS_CLEANERINFO(cip, fs, bp);
+ --cip->clean;
+ ++cip->dirty;
+ fs->lfs_nclean = cip->clean;
+ LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
+
+ fs->lfs_lastseg = fs->lfs_curseg;
+ fs->lfs_curseg = fs->lfs_nextseg;
+ skip_inval = 1;
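+ /*
+ * Search forward (starting lfs_interleave segments past the
+ * current one) for a segment that is neither dirty nor, on the
+ * first pass, marked SEGUSE_INVAL. If the scan wraps all the
+ * way around, retry once allowing SEGUSE_INVAL segments before
+ * giving up.
+ */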
+ for (sn = curseg = dtosn(fs, fs->lfs_curseg) + fs->lfs_interleave;;) {
+ sn = (sn + 1) % fs->lfs_nseg;
+
+ if (sn == curseg) {
+ if (skip_inval)
+ skip_inval = 0;
+ else
+ panic("lfs_nextseg: no clean segments");
+ }
+ LFS_SEGENTRY(sup, fs, sn, bp);
+ isdirty = sup->su_flags & (SEGUSE_DIRTY | (skip_inval ? SEGUSE_INVAL : 0));
+ /* Check SEGUSE_EMPTY as we go along */
+ if (isdirty && sup->su_nbytes == 0 &&
+ !(sup->su_flags & SEGUSE_EMPTY))
+ LFS_WRITESEGENTRY(sup, fs, sn, bp);
+ else
+ brelse(bp, 0);
+
+ if (!isdirty)
+ break;
+ }
+ if (skip_inval == 0)
+ lfs_unset_inval_all(fs);
+
+ ++fs->lfs_nactive;
+ fs->lfs_nextseg = sntod(fs, sn);
+ if (lfs_dostats) {
+ ++lfs_stats.segsused;
+ }
+}
+
+static struct buf *
+lfs_newclusterbuf(struct lfs *fs, struct vnode *vp, daddr_t addr,
+ int n)
+{
+ struct lfs_cluster *cl;
+ struct buf **bpp, *bp;
+
+ ASSERT_SEGLOCK(fs);
+ cl = (struct lfs_cluster *)pool_get(&fs->lfs_clpool, PR_WAITOK);
+ bpp = (struct buf **)pool_get(&fs->lfs_bpppool, PR_WAITOK);
+ memset(cl, 0, sizeof(*cl));
+ cl->fs = fs;
+ cl->bpp = bpp;
+ cl->bufcount = 0;
+ cl->bufsize = 0;
+
+ /* If this segment is being written synchronously, note that */
+ if (fs->lfs_sp->seg_flags & SEGM_SYNC) {
+ cl->flags |= LFS_CL_SYNC;
+ cl->seg = fs->lfs_sp;
+ ++cl->seg->seg_iocount;
+ }
+
+ /* Get an empty buffer header, or maybe one with something on it */
+ bp = getiobuf(vp, true);
+ bp->b_dev = NODEV;
+ bp->b_blkno = bp->b_lblkno = addr;
+ bp->b_iodone = lfs_cluster_callback;
+ bp->b_private = cl;
+
+ return bp;
+}
+
+int
+lfs_writeseg(struct lfs *fs, struct segment *sp)
+{
+ struct buf **bpp, *bp, *cbp, *newbp, *unbusybp;
+ SEGUSE *sup;
+ SEGSUM *ssp;
+ int i;
+ int do_again, nblocks, byteoffset;
+ size_t el_size;
+ struct lfs_cluster *cl;
+ u_short ninos;
+ struct vnode *devvp;
+ char *p = NULL;
+ struct vnode *vp;
+ int32_t *daddrp; /* XXX ondisk32 */
+ int changed;
+ u_int32_t sum;
+#ifdef DEBUG
+ FINFO *fip;
+ int findex;
+#endif
+
+ ASSERT_SEGLOCK(fs);
+
+ ssp = (SEGSUM *)sp->segsum;
+
+ /*
+ * If there are no buffers other than the segment summary to write,
+ * don't do anything. If we are at the end of a dirop sequence, however,
+ * write the empty segment summary anyway, to help out the
+ * roll-forward agent.
+ */
+ if ((nblocks = sp->cbpp - sp->bpp) == 1) {
+ if ((ssp->ss_flags & (SS_DIROP | SS_CONT)) != SS_DIROP)
+ return 0;
+ }
+
+ /* Note if partial segment is being written by the cleaner */
+ if (sp->seg_flags & SEGM_CLEAN)
+ ssp->ss_flags |= SS_CLEAN;
+
+ devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+
+ /* Update the segment usage information. */
+ LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
+
+ /* Loop through all blocks, except the segment summary. */
+ for (bpp = sp->bpp; ++bpp < sp->cbpp; ) {
+ if ((*bpp)->b_vp != devvp) {
+ sup->su_nbytes += (*bpp)->b_bcount;
+ DLOG((DLOG_SU, "seg %" PRIu32 " += %ld for ino %d"
+ " lbn %" PRId64 " db 0x%" PRIx64 "\n",
+ sp->seg_number, (*bpp)->b_bcount,
+ VTOI((*bpp)->b_vp)->i_number, (*bpp)->b_lblkno,
+ (*bpp)->b_blkno));
+ }
+ }
+
+#ifdef DEBUG
+ /* Check for zero-length and zero-version FINFO entries. */
+ fip = (struct finfo *)((char *)ssp + SEGSUM_SIZE(fs));
+ for (findex = 0; findex < ssp->ss_nfinfo; findex++) {
+ KDASSERT(fip->fi_nblocks > 0);
+ KDASSERT(fip->fi_version > 0);
+ fip = (FINFO *)((char *)fip + FINFOSIZE +
+ sizeof(int32_t) * fip->fi_nblocks);
+ }
+#endif /* DEBUG */
+
+ ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs);
+ DLOG((DLOG_SU, "seg %d += %d for %d inodes\n",
+ sp->seg_number, ssp->ss_ninos * sizeof (struct ufs1_dinode),
+ ssp->ss_ninos));
+ sup->su_nbytes += ssp->ss_ninos * sizeof (struct ufs1_dinode);
+ /* sup->su_nbytes += fs->lfs_sumsize; */
+ if (fs->lfs_version == 1)
+ sup->su_olastmod = time_second;
+ else
+ sup->su_lastmod = time_second;
+ sup->su_ninos += ninos;
+ ++sup->su_nsums;
+ fs->lfs_avail -= btofsb(fs, fs->lfs_sumsize);
+
+ do_again = !(bp->b_flags & B_GATHERED);
+ LFS_WRITESEGENTRY(sup, fs, sp->seg_number, bp); /* Ifile */
+
+ /*
+ * Mark blocks BC_BUSY, to prevent them from being changed between
+ * the checksum computation and the actual write.
+ *
+ * If we are cleaning, check indirect blocks for UNWRITTEN, and if
+ * there are any, replace them with copies that have UNASSIGNED
+ * instead.
+ */
+ mutex_enter(&bufcache_lock);
+ for (bpp = sp->bpp, i = nblocks - 1; i--;) {
+ ++bpp;
+ bp = *bpp;
+ if (bp->b_iodone != NULL) { /* UBC or malloced buffer */
+ bp->b_cflags |= BC_BUSY;
+ continue;
+ }
+
+ while (bp->b_cflags & BC_BUSY) {
+ DLOG((DLOG_SEG, "lfs_writeseg: avoiding potential"
+ " data summary corruption for ino %d, lbn %"
+ PRId64 "\n",
+ VTOI(bp->b_vp)->i_number, bp->b_lblkno));
+ bp->b_cflags |= BC_WANTED;
+ cv_wait(&bp->b_busy, &bufcache_lock);
+ }
+ bp->b_cflags |= BC_BUSY;
+ mutex_exit(&bufcache_lock);
+ unbusybp = NULL;
+
+ /*
+ * Check and replace indirect block UNWRITTEN bogosity.
+ * XXX See comment in lfs_writefile.
+ */
+ if (bp->b_lblkno < 0 && bp->b_vp != devvp && bp->b_vp &&
+ VTOI(bp->b_vp)->i_ffs1_blocks !=
+ VTOI(bp->b_vp)->i_lfs_effnblks) {
+ DLOG((DLOG_VNODE, "lfs_writeseg: cleansing ino %d (%d != %d)\n",
+ VTOI(bp->b_vp)->i_number,
+ VTOI(bp->b_vp)->i_lfs_effnblks,
+ VTOI(bp->b_vp)->i_ffs1_blocks));
+ /* Make a copy we'll make changes to */
+ newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno,
+ bp->b_bcount, LFS_NB_IBLOCK);
+ newbp->b_blkno = bp->b_blkno;
+ memcpy(newbp->b_data, bp->b_data,
+ newbp->b_bcount);
+
+ changed = 0;
+ /* XXX ondisk32 */
+ for (daddrp = (int32_t *)(newbp->b_data);
+ daddrp < (int32_t *)((char *)newbp->b_data +
+ newbp->b_bcount); daddrp++) {
+ if (*daddrp == UNWRITTEN) {
+ ++changed;
+ *daddrp = 0;
+ }
+ }
+ /*
+ * Get rid of the old buffer. Don't mark it clean,
+ * though, if it still has dirty data on it.
+ */
+ if (changed) {
+ DLOG((DLOG_SEG, "lfs_writeseg: replacing UNWRITTEN(%d):"
+ " bp = %p newbp = %p\n", changed, bp,
+ newbp));
+ *bpp = newbp;
+ bp->b_flags &= ~B_GATHERED;
+ bp->b_error = 0;
+ if (bp->b_iodone != NULL) {
+ DLOG((DLOG_SEG, "lfs_writeseg: "
+ "indir bp should not be B_CALL\n"));
+ biodone(bp);
+ bp = NULL;
+ } else {
+ /* Still on free list, leave it there */
+ unbusybp = bp;
+ /*
+ * We have to re-decrement lfs_avail
+ * since this block is going to come
+ * back around to us in the next
+ * segment.
+ */
+ fs->lfs_avail -=
+ btofsb(fs, bp->b_bcount);
+ }
+ } else {
+ lfs_freebuf(fs, newbp);
+ }
+ }
+ mutex_enter(&bufcache_lock);
+ if (unbusybp != NULL) {
+ unbusybp->b_cflags &= ~BC_BUSY;
+ if (unbusybp->b_cflags & BC_WANTED)
+ cv_broadcast(&bp->b_busy);
+ }
+ }
+ mutex_exit(&bufcache_lock);
+
+ /*
+ * Compute checksum across data and then across summary; the first
+ * block (the summary block) is skipped. Set the create time here
+ * so that it's guaranteed to be later than the inode mod times.
+ */
+ sum = 0;
+ if (fs->lfs_version == 1)
+ el_size = sizeof(u_long);
+ else
+ el_size = sizeof(u_int32_t);
+ for (bpp = sp->bpp, i = nblocks - 1; i--; ) {
+ ++bpp;
+ /* Loop through gop_write cluster blocks */
+ for (byteoffset = 0; byteoffset < (*bpp)->b_bcount;
+ byteoffset += fs->lfs_bsize) {
+#ifdef LFS_USE_B_INVAL
+ if (((*bpp)->b_cflags & BC_INVAL) != 0 &&
+ (*bpp)->b_iodone != NULL) {
+ if (copyin((void *)(*bpp)->b_saveaddr +
+ byteoffset, dp, el_size)) {
+ panic("lfs_writeseg: copyin failed [1]:"
+ " ino %d blk %" PRId64,
+ VTOI((*bpp)->b_vp)->i_number,
+ (*bpp)->b_lblkno);
+ }
+ } else
+#endif /* LFS_USE_B_INVAL */
+ {
+ sum = lfs_cksum_part((char *)
+ (*bpp)->b_data + byteoffset, el_size, sum);
+ }
+ }
+ }
+ if (fs->lfs_version == 1)
+ ssp->ss_ocreate = time_second;
+ else {
+ ssp->ss_create = time_second;
+ ssp->ss_serial = ++fs->lfs_serial;
+ ssp->ss_ident = fs->lfs_ident;
+ }
+ ssp->ss_datasum = lfs_cksum_fold(sum);
+ ssp->ss_sumsum = cksum(&ssp->ss_datasum,
+ fs->lfs_sumsize - sizeof(ssp->ss_sumsum));
+
+ mutex_enter(&lfs_lock);
+ fs->lfs_bfree -= (btofsb(fs, ninos * fs->lfs_ibsize) +
+ btofsb(fs, fs->lfs_sumsize));
+ fs->lfs_dmeta += (btofsb(fs, ninos * fs->lfs_ibsize) +
+ btofsb(fs, fs->lfs_sumsize));
+ mutex_exit(&lfs_lock);
+
+ /*
+ * When we simply write the blocks, we lose a rotation for every block
+ * written. To avoid this problem, we cluster the buffers into a
+ * chunk and write the chunk. MAXPHYS is the largest size I/O
+ * devices can handle, so we use that for the size of the chunks.
+ *
+ * Blocks that are already clusters (from GOP_WRITE), however, we
+ * don't bother to copy into other clusters.
+ */
+
+#define CHUNKSIZE MAXPHYS
+
+ if (devvp == NULL)
+ panic("devvp is NULL");
+ for (bpp = sp->bpp, i = nblocks; i;) {
+ cbp = lfs_newclusterbuf(fs, devvp, (*bpp)->b_blkno, i);
+ cl = cbp->b_private;
+
+ cbp->b_flags |= B_ASYNC;
+ cbp->b_cflags |= BC_BUSY;
+ cbp->b_bcount = 0;
+
+#if defined(DEBUG) && defined(DIAGNOSTIC)
+ if (bpp - sp->bpp > (fs->lfs_sumsize - SEGSUM_SIZE(fs))
+ / sizeof(int32_t)) {
+ panic("lfs_writeseg: real bpp overwrite");
+ }
+ if (bpp - sp->bpp > segsize(fs) / fs->lfs_fsize) {
+ panic("lfs_writeseg: theoretical bpp overwrite");
+ }
+#endif
+
+ /*
+ * Construct the cluster.
+ */
+ mutex_enter(&lfs_lock);
+ ++fs->lfs_iocount;
+ mutex_exit(&lfs_lock);
+ while (i && cbp->b_bcount < CHUNKSIZE) {
+ bp = *bpp;
+
+ if (bp->b_bcount > (CHUNKSIZE - cbp->b_bcount))
+ break;
+ if (cbp->b_bcount > 0 && !(cl->flags & LFS_CL_MALLOC))
+ break;
+
+ /* Clusters from GOP_WRITE are expedited */
+ if (bp->b_bcount > fs->lfs_bsize) {
+ if (cbp->b_bcount > 0)
+ /* Put in its own buffer */
+ break;
+ else {
+ cbp->b_data = bp->b_data;
+ }
+ } else if (cbp->b_bcount == 0) {
+ p = cbp->b_data = lfs_malloc(fs, CHUNKSIZE,
+ LFS_NB_CLUSTER);
+ cl->flags |= LFS_CL_MALLOC;
+ }
+#ifdef DIAGNOSTIC
+ if (dtosn(fs, dbtofsb(fs, bp->b_blkno +
+ btodb(bp->b_bcount - 1))) !=
+ sp->seg_number) {
+ printf("blk size %d daddr %" PRIx64
+ " not in seg %d\n",
+ bp->b_bcount, bp->b_blkno,
+ sp->seg_number);
+ panic("segment overwrite");
+ }
+#endif
+
+#ifdef LFS_USE_B_INVAL
+ /*
+ * Fake buffers from the cleaner are marked as B_INVAL.
+ * We need to copy the data from user space rather than
+ * from the buffer indicated.
+ * XXX == what do I do on an error?
+ */
+ if ((bp->b_cflags & BC_INVAL) != 0 &&
+ bp->b_iodone != NULL) {
+ if (copyin(bp->b_saveaddr, p, bp->b_bcount))
+ panic("lfs_writeseg: "
+ "copyin failed [2]");
+ } else
+#endif /* LFS_USE_B_INVAL */
+ if (cl->flags & LFS_CL_MALLOC) {
+ /* copy data into our cluster. */
+ memcpy(p, bp->b_data, bp->b_bcount);
+ p += bp->b_bcount;
+ }
+
+ cbp->b_bcount += bp->b_bcount;
+ cl->bufsize += bp->b_bcount;
+
+ bp->b_flags &= ~B_READ;
+ bp->b_error = 0;
+ cl->bpp[cl->bufcount++] = bp;
+
+ vp = bp->b_vp;
+ mutex_enter(&bufcache_lock);
+ mutex_enter(vp->v_interlock);
+ bp->b_oflags &= ~(BO_DELWRI | BO_DONE);
+ reassignbuf(bp, vp);
+ vp->v_numoutput++;
+ mutex_exit(vp->v_interlock);
+ mutex_exit(&bufcache_lock);
+
+ bpp++;
+ i--;
+ }
+ if (fs->lfs_sp->seg_flags & SEGM_SYNC)
+ BIO_SETPRIO(cbp, BPRIO_TIMECRITICAL);
+ else
+ BIO_SETPRIO(cbp, BPRIO_TIMELIMITED);
+ mutex_enter(devvp->v_interlock);
+ devvp->v_numoutput++;
+ mutex_exit(devvp->v_interlock);
+ VOP_STRATEGY(devvp, cbp);
+ curlwp->l_ru.ru_oublock++;
+ }
+
+ if (lfs_dostats) {
+ ++lfs_stats.psegwrites;
+ lfs_stats.blocktot += nblocks - 1;
+ if (fs->lfs_sp->seg_flags & SEGM_SYNC)
+ ++lfs_stats.psyncwrites;
+ if (fs->lfs_sp->seg_flags & SEGM_CLEAN) {
+ ++lfs_stats.pcleanwrites;
+ lfs_stats.cleanblocks += nblocks - 1;
+ }
+ }
+
+ return (lfs_initseg(fs) || do_again);
+}
+
+void
+lfs_writesuper(struct lfs *fs, daddr_t daddr)
+{
+ struct buf *bp;
+ struct vnode *devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+ int s;
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+#ifdef DIAGNOSTIC
+ KASSERT(fs->lfs_magic == LFS_MAGIC);
+#endif
+ /*
+ * If we can write one superblock while another is in
+ * progress, we risk not having a complete checkpoint if we crash.
+ * So, block here if a superblock write is in progress.
+ */
+ mutex_enter(&lfs_lock);
+ s = splbio();
+ while (fs->lfs_sbactive) {
+ mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs sb", 0,
+ &lfs_lock);
+ }
+ fs->lfs_sbactive = daddr;
+ splx(s);
+ mutex_exit(&lfs_lock);
+
+ /* Set timestamp of this version of the superblock */
+ if (fs->lfs_version == 1)
+ fs->lfs_otstamp = time_second;
+ fs->lfs_tstamp = time_second;
+
+ /* Checksum the superblock and copy it into a buffer. */
+ fs->lfs_cksum = lfs_sb_cksum(&(fs->lfs_dlfs));
+ bp = lfs_newbuf(fs, devvp,
+ fsbtodb(fs, daddr), LFS_SBPAD, LFS_NB_SBLOCK);
+ memset((char *)bp->b_data + sizeof(struct dlfs), 0,
+ LFS_SBPAD - sizeof(struct dlfs));
+ *(struct dlfs *)bp->b_data = fs->lfs_dlfs;
+
+ bp->b_cflags |= BC_BUSY;
+ bp->b_flags = (bp->b_flags & ~B_READ) | B_ASYNC;
+ bp->b_oflags &= ~(BO_DONE | BO_DELWRI);
+ bp->b_error = 0;
+ bp->b_iodone = lfs_supercallback;
+
+ if (fs->lfs_sp != NULL && fs->lfs_sp->seg_flags & SEGM_SYNC)
+ BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
+ else
+ BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
+ curlwp->l_ru.ru_oublock++;
+
+ mutex_enter(devvp->v_interlock);
+ devvp->v_numoutput++;
+ mutex_exit(devvp->v_interlock);
+
+ mutex_enter(&lfs_lock);
+ ++fs->lfs_iocount;
+ mutex_exit(&lfs_lock);
+ VOP_STRATEGY(devvp, bp);
+}
+
+/*
+ * Logical block number match routines used when traversing the dirty block
+ * chain.
+ */
+int
+lfs_match_fake(struct lfs *fs, struct buf *bp)
+{
+
+ ASSERT_SEGLOCK(fs);
+ return LFS_IS_MALLOC_BUF(bp);
+}
+
+#if 0
+int
+lfs_match_real(struct lfs *fs, struct buf *bp)
+{
+
+ ASSERT_SEGLOCK(fs);
+ return (lfs_match_data(fs, bp) && !lfs_match_fake(fs, bp));
+}
+#endif
+
+int
+lfs_match_data(struct lfs *fs, struct buf *bp)
+{
+
+ ASSERT_SEGLOCK(fs);
+ return (bp->b_lblkno >= 0);
+}
+
+int
+lfs_match_indir(struct lfs *fs, struct buf *bp)
+{
+ daddr_t lbn;
+
+ ASSERT_SEGLOCK(fs);
+ lbn = bp->b_lblkno;
+ return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0);
+}
+
+int
+lfs_match_dindir(struct lfs *fs, struct buf *bp)
+{
+ daddr_t lbn;
+
+ ASSERT_SEGLOCK(fs);
+ lbn = bp->b_lblkno;
+ return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1);
+}
+
+int
+lfs_match_tindir(struct lfs *fs, struct buf *bp)
+{
+ daddr_t lbn;
+
+ ASSERT_SEGLOCK(fs);
+ lbn = bp->b_lblkno;
+ return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2);
+}
+
+static void
+lfs_free_aiodone(struct buf *bp)
+{
+ struct lfs *fs;
+
+ KERNEL_LOCK(1, curlwp);
+ fs = bp->b_private;
+ ASSERT_NO_SEGLOCK(fs);
+ lfs_freebuf(fs, bp);
+ KERNEL_UNLOCK_LAST(curlwp);
+}
+
+static void
+lfs_super_aiodone(struct buf *bp)
+{
+ struct lfs *fs;
+
+ KERNEL_LOCK(1, curlwp);
+ fs = bp->b_private;
+ ASSERT_NO_SEGLOCK(fs);
+ mutex_enter(&lfs_lock);
+ fs->lfs_sbactive = 0;
+ if (--fs->lfs_iocount <= 1)
+ wakeup(&fs->lfs_iocount);
+ wakeup(&fs->lfs_sbactive);
+ mutex_exit(&lfs_lock);
+ lfs_freebuf(fs, bp);
+ KERNEL_UNLOCK_LAST(curlwp);
+}
+
+static void
+lfs_cluster_aiodone(struct buf *bp)
+{
+ struct lfs_cluster *cl;
+ struct lfs *fs;
+ struct buf *tbp, *fbp;
+ struct vnode *vp, *devvp, *ovp;
+ struct inode *ip;
+ int error;
+
+ KERNEL_LOCK(1, curlwp);
+
+ error = bp->b_error;
+ cl = bp->b_private;
+ fs = cl->fs;
+ devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+ ASSERT_NO_SEGLOCK(fs);
+
+ /* Put the pages back, and release the buffer */
+ while (cl->bufcount--) {
+ tbp = cl->bpp[cl->bufcount];
+ KASSERT(tbp->b_cflags & BC_BUSY);
+ if (error) {
+ tbp->b_error = error;
+ }
+
+ /*
+ * We're done with tbp. If it has not been re-dirtied since
+ * the cluster was written, free it. Otherwise, keep it on
+ * the locked list to be written again.
+ */
+ vp = tbp->b_vp;
+
+ tbp->b_flags &= ~B_GATHERED;
+
+ LFS_BCLEAN_LOG(fs, tbp);
+
+ mutex_enter(&bufcache_lock);
+ if (tbp->b_iodone == NULL) {
+ KASSERT(tbp->b_flags & B_LOCKED);
+ bremfree(tbp);
+ if (vp) {
+ mutex_enter(vp->v_interlock);
+ reassignbuf(tbp, vp);
+ mutex_exit(vp->v_interlock);
+ }
+ tbp->b_flags |= B_ASYNC; /* for biodone */
+ }
+
+ if (((tbp->b_flags | tbp->b_oflags) &
+ (B_LOCKED | BO_DELWRI)) == B_LOCKED)
+ LFS_UNLOCK_BUF(tbp);
+
+ if (tbp->b_oflags & BO_DONE) {
+ DLOG((DLOG_SEG, "blk %d biodone already (flags %lx)\n",
+ cl->bufcount, (long)tbp->b_flags));
+ }
+
+ if (tbp->b_iodone != NULL && !LFS_IS_MALLOC_BUF(tbp)) {
+ /*
+ * A buffer from the page daemon.
+ * We use the same iodone as it does,
+ * so we must manually disassociate its
+ * buffers from the vp.
+ */
+ if ((ovp = tbp->b_vp) != NULL) {
+ /* This is just silly */
+ mutex_enter(ovp->v_interlock);
+ brelvp(tbp);
+ mutex_exit(ovp->v_interlock);
+ tbp->b_vp = vp;
+ tbp->b_objlock = vp->v_interlock;
+ }
+ /* Put it back the way it was */
+ tbp->b_flags |= B_ASYNC;
+ /* Master buffers have BC_AGE */
+ if (tbp->b_private == tbp)
+ tbp->b_cflags |= BC_AGE;
+ }
+ mutex_exit(&bufcache_lock);
+
+ biodone(tbp);
+
+ /*
+ * If this is the last block for this vnode, but
+ * there are other blocks on its dirty list,
+ * set IN_MODIFIED/IN_CLEANING depending on what
+ * sort of block. Only do this for our mount point,
+ * not for, e.g., inode blocks that are attached to
+ * the devvp.
+ * XXX KS - Shouldn't we set *both* if both types
+ * of blocks are present (traverse the dirty list?)
+ */
+ mutex_enter(&lfs_lock);
+ mutex_enter(vp->v_interlock);
+ if (vp != devvp && vp->v_numoutput == 0 &&
+ (fbp = LIST_FIRST(&vp->v_dirtyblkhd)) != NULL) {
+ ip = VTOI(vp);
+ DLOG((DLOG_SEG, "lfs_cluster_aiodone: mark ino %d\n",
+ ip->i_number));
+ if (LFS_IS_MALLOC_BUF(fbp))
+ LFS_SET_UINO(ip, IN_CLEANING);
+ else
+ LFS_SET_UINO(ip, IN_MODIFIED);
+ }
+ cv_broadcast(&vp->v_cv);
+ mutex_exit(vp->v_interlock);
+ mutex_exit(&lfs_lock);
+ }
+
+ /* Fix up the cluster buffer, and release it */
+ if (cl->flags & LFS_CL_MALLOC)
+ lfs_free(fs, bp->b_data, LFS_NB_CLUSTER);
+ putiobuf(bp);
+
+ /* Note i/o done */
+ if (cl->flags & LFS_CL_SYNC) {
+ if (--cl->seg->seg_iocount == 0)
+ wakeup(&cl->seg->seg_iocount);
+ }
+ mutex_enter(&lfs_lock);
+#ifdef DIAGNOSTIC
+ if (fs->lfs_iocount == 0)
+ panic("lfs_cluster_aiodone: zero iocount");
+#endif
+ if (--fs->lfs_iocount <= 1)
+ wakeup(&fs->lfs_iocount);
+ mutex_exit(&lfs_lock);
+
+ KERNEL_UNLOCK_LAST(curlwp);
+
+ pool_put(&fs->lfs_bpppool, cl->bpp);
+ cl->bpp = NULL;
+ pool_put(&fs->lfs_clpool, cl);
+}
+
+static void
+lfs_generic_callback(struct buf *bp, void (*aiodone)(struct buf *))
+{
+ /* reset b_iodone for when this is a single-buf i/o. */
+ bp->b_iodone = aiodone;
+
+ workqueue_enqueue(uvm.aiodone_queue, &bp->b_work, NULL);
+}
+
+static void
+lfs_cluster_callback(struct buf *bp)
+{
+
+ lfs_generic_callback(bp, lfs_cluster_aiodone);
+}
+
+void
+lfs_supercallback(struct buf *bp)
+{
+
+ lfs_generic_callback(bp, lfs_super_aiodone);
+}
+
+/*
+ * The only buffers that are going to hit these functions are the
+ * segment write blocks, or the segment summaries, or the superblocks.
+ *
+ * All of the above are created by lfs_newbuf, and so do not need to be
+ * released via brelse.
+ */
+void
+lfs_callback(struct buf *bp)
+{
+
+ lfs_generic_callback(bp, lfs_free_aiodone);
+}
+
+/*
+ * Shellsort (diminishing increment sort) from Data Structures and
+ * Algorithms, Aho, Hopcroft and Ullman, 1983 Edition, page 290;
+ * see also Knuth Vol. 3, page 84. The increments are selected from
+ * formula (8), page 95. Roughly O(N^3/2).
+ */
+/*
+ * This is our own private copy of shellsort because we want to sort
+ * two parallel arrays (the array of buffer pointers and the array of
+ * logical block numbers) simultaneously. Note that we cast the array
+ * of logical block numbers to an unsigned type in this routine so that
+ * the negative block numbers (metadata blocks) sort AFTER the data blocks.
+ */
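+/*
+ * Worked example of the unsigned comparison used below: a metadata
+ * block with lblkno -1 compares as 0xffffffff once cast to u_int32_t,
+ * so it sorts after every non-negative data-block number.
+ */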
+
+void
+lfs_shellsort(struct buf **bp_array, int32_t *lb_array, int nmemb, int size)
+{
+ static int __rsshell_increments[] = { 4, 1, 0 };
+ int incr, *incrp, t1, t2;
+ struct buf *bp_temp;
+
+#ifdef DEBUG
+ incr = 0;
+ for (t1 = 0; t1 < nmemb; t1++) {
+ for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) {
+ if (lb_array[incr++] != bp_array[t1]->b_lblkno + t2) {
+ /* dump before panic */
+ printf("lfs_shellsort: nmemb=%d, size=%d\n",
+ nmemb, size);
+ incr = 0;
+ for (t1 = 0; t1 < nmemb; t1++) {
+ const struct buf *bp = bp_array[t1];
+
+ printf("bp[%d]: lbn=%" PRIu64 ", size=%"
+ PRIu64 "\n", t1,
+				    (uint64_t)bp->b_lblkno,
+				    (uint64_t)bp->b_bcount);
+ printf("lbns:");
+ for (t2 = 0; t2 * size < bp->b_bcount;
+ t2++) {
+ printf(" %" PRId32,
+ lb_array[incr++]);
+ }
+ printf("\n");
+ }
+ panic("lfs_shellsort: inconsistent input");
+ }
+ }
+ }
+#endif
+
+ for (incrp = __rsshell_increments; (incr = *incrp++) != 0;)
+ for (t1 = incr; t1 < nmemb; ++t1)
+ for (t2 = t1 - incr; t2 >= 0;)
+ if ((u_int32_t)bp_array[t2]->b_lblkno >
+ (u_int32_t)bp_array[t2 + incr]->b_lblkno) {
+ bp_temp = bp_array[t2];
+ bp_array[t2] = bp_array[t2 + incr];
+ bp_array[t2 + incr] = bp_temp;
+ t2 -= incr;
+ } else
+ break;
+
+ /* Reform the list of logical blocks */
+ incr = 0;
+ for (t1 = 0; t1 < nmemb; t1++) {
+ for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) {
+ lb_array[incr++] = bp_array[t1]->b_lblkno + t2;
+ }
+ }
+}
+
+/*
+ * Call vget with LK_NOWAIT. If we are the one who holds VI_XLOCK,
+ * however, we must press on. Just fake success in that case.
+ */
+int
+lfs_vref(struct vnode *vp)
+{
+ int error;
+ struct lfs *fs;
+
+ KASSERT(mutex_owned(vp->v_interlock));
+
+ fs = VTOI(vp)->i_lfs;
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+
+ /*
+ * If we return 1 here during a flush, we risk vinvalbuf() not
+ * being able to flush all of the pages from this vnode, which
+ * will cause it to panic. So, return 0 if a flush is in progress.
+ */
+ error = vget(vp, LK_NOWAIT);
+ if (error == EBUSY && IS_FLUSHING(VTOI(vp)->i_lfs, vp)) {
+ ++fs->lfs_flushvp_fakevref;
+ return 0;
+ }
+ return error;
+}
+
+/*
+ * This is vrele except that we do not want to VOP_INACTIVE this vnode. We
+ * inline vrele here to avoid the vn_lock and VOP_INACTIVE call at the end.
+ */
+void
+lfs_vunref(struct vnode *vp)
+{
+ struct lfs *fs;
+
+ fs = VTOI(vp)->i_lfs;
+ ASSERT_MAYBE_SEGLOCK(fs);
+
+ /*
+ * Analogous to lfs_vref, if the node is flushing, fake it.
+ */
+ if (IS_FLUSHING(fs, vp) && fs->lfs_flushvp_fakevref) {
+ --fs->lfs_flushvp_fakevref;
+ return;
+ }
+
+ /* does not call inactive */
+ mutex_enter(vp->v_interlock);
+ vrelel(vp, 0);
+}
+
+/*
+ * We use this when we have vnodes that were loaded in solely for cleaning.
+ * There is no reason to believe that these vnodes will be referenced again
+ * soon, since the cleaning process is unrelated to normal filesystem
+ * activity. Putting cleaned vnodes at the tail of the list has the effect
+ * of flushing the vnode LRU. So, put vnodes that were loaded only for
+ * cleaning at the head of the list, instead.
+ */
+void
+lfs_vunref_head(struct vnode *vp)
+{
+
+ ASSERT_SEGLOCK(VTOI(vp)->i_lfs);
+
+ /* does not call inactive, inserts non-held vnode at head of freelist */
+ mutex_enter(vp->v_interlock);
+ vrelel(vp, 0);
+}
+
+
+/*
+ * Set up an FINFO entry for a new file. The fip pointer is assumed to
+ * point at uninitialized space.
+ */
+void
+lfs_acquire_finfo(struct lfs *fs, ino_t ino, int vers)
+{
+ struct segment *sp = fs->lfs_sp;
+
+ KASSERT(vers > 0);
+
+ if (sp->seg_bytes_left < fs->lfs_bsize ||
+ sp->sum_bytes_left < sizeof(struct finfo))
+ (void) lfs_writeseg(fs, fs->lfs_sp);
+
+ sp->sum_bytes_left -= FINFOSIZE;
+ ++((SEGSUM *)(sp->segsum))->ss_nfinfo;
+ sp->fip->fi_nblocks = 0;
+ sp->fip->fi_ino = ino;
+ sp->fip->fi_version = vers;
+}
+
+/*
+ * Release the FINFO entry, either clearing out an unused entry or
+ * advancing us to the next available entry.
+ */
+void
+lfs_release_finfo(struct lfs *fs)
+{
+ struct segment *sp = fs->lfs_sp;
+
+ if (sp->fip->fi_nblocks != 0) {
+ sp->fip = (FINFO*)((char *)sp->fip + FINFOSIZE +
+ sizeof(int32_t) * sp->fip->fi_nblocks);
+ sp->start_lbp = &sp->fip->fi_blocks[0];
+ } else {
+ sp->sum_bytes_left += FINFOSIZE;
+ --((SEGSUM *)(sp->segsum))->ss_nfinfo;
+ }
+}
--- /dev/null
+/* $NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)lfs_subr.c 8.4 (Berkeley) 5/8/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/mount.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <uvm/uvm.h>
+
+#ifdef DEBUG
+const char *lfs_res_names[LFS_NB_COUNT] = {
+ "summary",
+ "superblock",
+ "file block",
+ "cluster",
+ "clean",
+ "blkiov",
+};
+#endif
+
+int lfs_res_qty[LFS_NB_COUNT] = {
+ LFS_N_SUMMARIES,
+ LFS_N_SBLOCKS,
+ LFS_N_IBLOCKS,
+ LFS_N_CLUSTERS,
+ LFS_N_CLEAN,
+ LFS_N_BLKIOV,
+};
+
+void
+lfs_setup_resblks(struct lfs *fs)
+{
+ int i, j;
+ int maxbpp;
+
+ ASSERT_NO_SEGLOCK(fs);
+ fs->lfs_resblk = (res_t *)malloc(LFS_N_TOTAL * sizeof(res_t), M_SEGMENT,
+ M_WAITOK);
+ for (i = 0; i < LFS_N_TOTAL; i++) {
+ fs->lfs_resblk[i].inuse = 0;
+ fs->lfs_resblk[i].p = NULL;
+ }
+ for (i = 0; i < LFS_RESHASH_WIDTH; i++)
+ LIST_INIT(fs->lfs_reshash + i);
+
+ /*
+ * These types of allocations can be larger than a page,
+ * so we can't use the pool subsystem for them.
+ */
+ for (i = 0, j = 0; j < LFS_N_SUMMARIES; j++, i++)
+ fs->lfs_resblk[i].size = fs->lfs_sumsize;
+ for (j = 0; j < LFS_N_SBLOCKS; j++, i++)
+ fs->lfs_resblk[i].size = LFS_SBPAD;
+ for (j = 0; j < LFS_N_IBLOCKS; j++, i++)
+ fs->lfs_resblk[i].size = fs->lfs_bsize;
+ for (j = 0; j < LFS_N_CLUSTERS; j++, i++)
+ fs->lfs_resblk[i].size = MAXPHYS;
+ for (j = 0; j < LFS_N_CLEAN; j++, i++)
+ fs->lfs_resblk[i].size = MAXPHYS;
+ for (j = 0; j < LFS_N_BLKIOV; j++, i++)
+ fs->lfs_resblk[i].size = LFS_MARKV_MAXBLKCNT * sizeof(BLOCK_INFO);
+
+ for (i = 0; i < LFS_N_TOTAL; i++) {
+ fs->lfs_resblk[i].p = malloc(fs->lfs_resblk[i].size,
+ M_SEGMENT, M_WAITOK);
+ }
+
+ /*
+ * Initialize pools for small types (XXX is BPP small?)
+ */
+ pool_init(&fs->lfs_clpool, sizeof(struct lfs_cluster), 0, 0, 0,
+ "lfsclpl", &pool_allocator_nointr, IPL_NONE);
+ pool_init(&fs->lfs_segpool, sizeof(struct segment), 0, 0, 0,
+ "lfssegpool", &pool_allocator_nointr, IPL_NONE);
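+	/*
+	 * maxbpp bounds the per-segment block pointer array: at most one
+	 * pointer per int32_t slot remaining in the summary block, and at
+	 * most one per fragment in a segment, whichever is smaller (plus
+	 * a little slack).
+	 */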
+ maxbpp = ((fs->lfs_sumsize - SEGSUM_SIZE(fs)) / sizeof(int32_t) + 2);
+ maxbpp = MIN(maxbpp, segsize(fs) / fs->lfs_fsize + 2);
+ pool_init(&fs->lfs_bpppool, maxbpp * sizeof(struct buf *), 0, 0, 0,
+ "lfsbpppl", &pool_allocator_nointr, IPL_NONE);
+}
+
+void
+lfs_free_resblks(struct lfs *fs)
+{
+ int i;
+
+ pool_destroy(&fs->lfs_bpppool);
+ pool_destroy(&fs->lfs_segpool);
+ pool_destroy(&fs->lfs_clpool);
+
+ mutex_enter(&lfs_lock);
+ for (i = 0; i < LFS_N_TOTAL; i++) {
+ while (fs->lfs_resblk[i].inuse)
+ mtsleep(&fs->lfs_resblk, PRIBIO + 1, "lfs_free", 0,
+ &lfs_lock);
+ if (fs->lfs_resblk[i].p != NULL)
+ free(fs->lfs_resblk[i].p, M_SEGMENT);
+ }
+ free(fs->lfs_resblk, M_SEGMENT);
+ mutex_exit(&lfs_lock);
+}
+
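+/*
+ * Hash a reserve-block address into the lfs_reshash table: drop the
+ * low two (alignment) bits and fold the rest into LFS_RESHASH_WIDTH
+ * buckets.
+ */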
+static unsigned int
+lfs_mhash(void *vp)
+{
+ return (unsigned int)(((unsigned long)vp) >> 2) % LFS_RESHASH_WIDTH;
+}
+
+/*
+ * Return memory of the given size for the given purpose, or use one of a
+ * number of spare last-resort buffers, if malloc returns NULL.
+ */
+void *
+lfs_malloc(struct lfs *fs, size_t size, int type)
+{
+ struct lfs_res_blk *re;
+ void *r;
+ int i, s, start;
+ unsigned int h;
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+ r = NULL;
+
+ /* If no mem allocated for this type, it just waits */
+ if (lfs_res_qty[type] == 0) {
+ r = malloc(size, M_SEGMENT, M_WAITOK);
+ return r;
+ }
+
+ /* Otherwise try a quick malloc, and if it works, great */
+ if ((r = malloc(size, M_SEGMENT, M_NOWAIT)) != NULL) {
+ return r;
+ }
+
+ /*
+ * If malloc returned NULL, we are forced to use one of our
+ * reserve blocks. We have on hand at least one summary block,
+ * at least one cluster block, at least one superblock,
+ * and several indirect blocks.
+ */
+
+ mutex_enter(&lfs_lock);
+ /* skip over blocks of other types */
+ for (i = 0, start = 0; i < type; i++)
+ start += lfs_res_qty[i];
+ while (r == NULL) {
+ for (i = 0; i < lfs_res_qty[type]; i++) {
+ if (fs->lfs_resblk[start + i].inuse == 0) {
+ re = fs->lfs_resblk + start + i;
+ re->inuse = 1;
+ r = re->p;
+ KASSERT(re->size >= size);
+ h = lfs_mhash(r);
+ s = splbio();
+ LIST_INSERT_HEAD(&fs->lfs_reshash[h], re, res);
+ splx(s);
+ mutex_exit(&lfs_lock);
+ return r;
+ }
+ }
+ DLOG((DLOG_MALLOC, "sleeping on %s (%d)\n",
+ lfs_res_names[type], lfs_res_qty[type]));
+ mtsleep(&fs->lfs_resblk, PVM, "lfs_malloc", 0,
+ &lfs_lock);
+ DLOG((DLOG_MALLOC, "done sleeping on %s\n",
+ lfs_res_names[type]));
+ }
+ /* NOTREACHED */
+ mutex_exit(&lfs_lock);
+ return r;
+}
+
+void
+lfs_free(struct lfs *fs, void *p, int type)
+{
+ int s;
+ unsigned int h;
+ res_t *re;
+#ifdef DEBUG
+ int i;
+#endif
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+ h = lfs_mhash(p);
+ mutex_enter(&lfs_lock);
+ s = splbio();
+ LIST_FOREACH(re, &fs->lfs_reshash[h], res) {
+ if (re->p == p) {
+ KASSERT(re->inuse == 1);
+ LIST_REMOVE(re, res);
+ re->inuse = 0;
+ wakeup(&fs->lfs_resblk);
+ splx(s);
+ mutex_exit(&lfs_lock);
+ return;
+ }
+ }
+#ifdef DEBUG
+ for (i = 0; i < LFS_N_TOTAL; i++) {
+ if (fs->lfs_resblk[i].p == p)
+ panic("lfs_free: inconsistent reserved block");
+ }
+#endif
+ splx(s);
+ mutex_exit(&lfs_lock);
+
+ /*
+ * If we didn't find it, free it.
+ */
+ free(p, M_SEGMENT);
+}
+
+/*
+ * lfs_seglock --
+ * Single thread the segment writer.
+ */
+int
+lfs_seglock(struct lfs *fs, unsigned long flags)
+{
+ struct segment *sp;
+
+ mutex_enter(&lfs_lock);
+ if (fs->lfs_seglock) {
+ if (fs->lfs_lockpid == curproc->p_pid &&
+ fs->lfs_locklwp == curlwp->l_lid) {
+ ++fs->lfs_seglock;
+ fs->lfs_sp->seg_flags |= flags;
+ mutex_exit(&lfs_lock);
+ return 0;
+ } else if (flags & SEGM_PAGEDAEMON) {
+ mutex_exit(&lfs_lock);
+ return EWOULDBLOCK;
+ } else {
+ while (fs->lfs_seglock) {
+ (void)mtsleep(&fs->lfs_seglock, PRIBIO + 1,
+ "lfs_seglock", 0, &lfs_lock);
+ }
+ }
+ }
+
+ fs->lfs_seglock = 1;
+ fs->lfs_lockpid = curproc->p_pid;
+ fs->lfs_locklwp = curlwp->l_lid;
+ mutex_exit(&lfs_lock);
+ fs->lfs_cleanind = 0;
+
+#ifdef DEBUG
+ LFS_ENTER_LOG("seglock", __FILE__, __LINE__, 0, flags, curproc->p_pid);
+#endif
+ /* Drain fragment size changes out */
+ rw_enter(&fs->lfs_fraglock, RW_WRITER);
+
+ sp = fs->lfs_sp = pool_get(&fs->lfs_segpool, PR_WAITOK);
+ sp->bpp = pool_get(&fs->lfs_bpppool, PR_WAITOK);
+ sp->seg_flags = flags;
+ sp->vp = NULL;
+ sp->seg_iocount = 0;
+ (void) lfs_initseg(fs);
+
+ /*
+ * Keep a cumulative count of the outstanding I/O operations. If the
+ * disk drive catches up with us it could go to zero before we finish,
+ * so we artificially increment it by one until we've scheduled all of
+ * the writes we intend to do.
+ */
+ mutex_enter(&lfs_lock);
+ ++fs->lfs_iocount;
+ mutex_exit(&lfs_lock);
+ return 0;
+}
+
+static void lfs_unmark_dirop(struct lfs *);
+
+static void
+lfs_unmark_dirop(struct lfs *fs)
+{
+ struct inode *ip, *nip;
+ struct vnode *vp;
+ int doit;
+
+ ASSERT_NO_SEGLOCK(fs);
+ mutex_enter(&lfs_lock);
+ doit = !(fs->lfs_flags & LFS_UNDIROP);
+ if (doit)
+ fs->lfs_flags |= LFS_UNDIROP;
+ if (!doit) {
+ mutex_exit(&lfs_lock);
+ return;
+ }
+
+ for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
+ nip = TAILQ_NEXT(ip, i_lfs_dchain);
+ vp = ITOV(ip);
+ if ((VTOI(vp)->i_flag & (IN_ADIROP | IN_ALLMOD)) == 0) {
+ --lfs_dirvcount;
+ --fs->lfs_dirvcount;
+ vp->v_uflag &= ~VU_DIROP;
+ TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
+ wakeup(&lfs_dirvcount);
+ fs->lfs_unlockvp = vp;
+ mutex_exit(&lfs_lock);
+ vrele(vp);
+ mutex_enter(&lfs_lock);
+ fs->lfs_unlockvp = NULL;
+ }
+ }
+
+ fs->lfs_flags &= ~LFS_UNDIROP;
+ wakeup(&fs->lfs_flags);
+ mutex_exit(&lfs_lock);
+}
+
+static void
+lfs_auto_segclean(struct lfs *fs)
+{
+ int i, error, s, waited;
+
+ ASSERT_SEGLOCK(fs);
+ /*
+ * Now that we've swapped lfs_activesb, but while we still
+ * hold the segment lock, run through the segment list marking
+ * the empty ones clean.
+ * XXX - do we really need to do them all at once?
+ */
+ waited = 0;
+ for (i = 0; i < fs->lfs_nseg; i++) {
+ if ((fs->lfs_suflags[0][i] &
+ (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) ==
+ (SEGUSE_DIRTY | SEGUSE_EMPTY) &&
+ (fs->lfs_suflags[1][i] &
+ (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) ==
+ (SEGUSE_DIRTY | SEGUSE_EMPTY)) {
+
+ /* Make sure the sb is written before we clean */
+ mutex_enter(&lfs_lock);
+ s = splbio();
+ while (waited == 0 && fs->lfs_sbactive)
+ mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs asb",
+ 0, &lfs_lock);
+ splx(s);
+ mutex_exit(&lfs_lock);
+ waited = 1;
+
+ if ((error = lfs_do_segclean(fs, i)) != 0) {
+ DLOG((DLOG_CLEAN, "lfs_auto_segclean: lfs_do_segclean returned %d for seg %d\n", error, i));
+ }
+ }
+ fs->lfs_suflags[1 - fs->lfs_activesb][i] =
+ fs->lfs_suflags[fs->lfs_activesb][i];
+ }
+}
+
+/*
+ * lfs_segunlock --
+ *	Release the segment lock.  The final release finishes the partial
+ *	segment in progress and, at a checkpoint, writes the superblocks.
+ */
+void
+lfs_segunlock(struct lfs *fs)
+{
+ struct segment *sp;
+ unsigned long sync, ckp;
+ struct buf *bp;
+ int do_unmark_dirop = 0;
+
+ sp = fs->lfs_sp;
+
+ mutex_enter(&lfs_lock);
+ KASSERT(LFS_SEGLOCK_HELD(fs));
+ if (fs->lfs_seglock == 1) {
+ if ((sp->seg_flags & (SEGM_PROT | SEGM_CLEAN)) == 0 &&
+ LFS_STARVED_FOR_SEGS(fs) == 0)
+ do_unmark_dirop = 1;
+ mutex_exit(&lfs_lock);
+ sync = sp->seg_flags & SEGM_SYNC;
+ ckp = sp->seg_flags & SEGM_CKP;
+
+ /* We should have a segment summary, and nothing else */
+ KASSERT(sp->cbpp == sp->bpp + 1);
+
+ /* Free allocated segment summary */
+ fs->lfs_offset -= btofsb(fs, fs->lfs_sumsize);
+ bp = *sp->bpp;
+ lfs_freebuf(fs, bp);
+
+ pool_put(&fs->lfs_bpppool, sp->bpp);
+ sp->bpp = NULL;
+
+ /*
+ * If we're not sync, we're done with sp, get rid of it.
+ * Otherwise, we keep a local copy around but free
+ * fs->lfs_sp so another process can use it (we have to
+ * wait but they don't have to wait for us).
+ */
+ if (!sync)
+ pool_put(&fs->lfs_segpool, sp);
+ fs->lfs_sp = NULL;
+
+ /*
+ * If the I/O count is non-zero, sleep until it reaches zero.
+ * At the moment, the user's process hangs around so we can
+ * sleep.
+ */
+ mutex_enter(&lfs_lock);
+ if (--fs->lfs_iocount == 0) {
+ LFS_DEBUG_COUNTLOCKED("lfs_segunlock");
+ }
+ if (fs->lfs_iocount <= 1)
+ wakeup(&fs->lfs_iocount);
+ mutex_exit(&lfs_lock);
+ /*
+ * If we're not checkpointing, we don't have to block
+ * other processes to wait for a synchronous write
+ * to complete.
+ */
+ if (!ckp) {
+#ifdef DEBUG
+ LFS_ENTER_LOG("segunlock_std", __FILE__, __LINE__, 0, 0, curproc->p_pid);
+#endif
+ mutex_enter(&lfs_lock);
+ --fs->lfs_seglock;
+ fs->lfs_lockpid = 0;
+ fs->lfs_locklwp = 0;
+ mutex_exit(&lfs_lock);
+ wakeup(&fs->lfs_seglock);
+ }
+ /*
+ * We let checkpoints happen asynchronously. That means
+ * that during recovery, we have to roll forward between
+ * the two segments described by the first and second
+ * superblocks to make sure that the checkpoint described
+ * by a superblock completed.
+ */
+ mutex_enter(&lfs_lock);
+ while (ckp && sync && fs->lfs_iocount) {
+ (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
+ "lfs_iocount", 0, &lfs_lock);
+			DLOG((DLOG_SEG, "sleeping on iocount %p == %d\n", fs, fs->lfs_iocount));
+ }
+ while (sync && sp->seg_iocount) {
+ (void)mtsleep(&sp->seg_iocount, PRIBIO + 1,
+ "seg_iocount", 0, &lfs_lock);
+			DLOG((DLOG_SEG, "sleeping on iocount %p == %d\n", sp, sp->seg_iocount));
+ }
+ mutex_exit(&lfs_lock);
+ if (sync)
+ pool_put(&fs->lfs_segpool, sp);
+
+ if (ckp) {
+ fs->lfs_nactive = 0;
+ /* If we *know* everything's on disk, write both sbs */
+ /* XXX should wait for this one */
+ if (sync)
+ lfs_writesuper(fs, fs->lfs_sboffs[fs->lfs_activesb]);
+ lfs_writesuper(fs, fs->lfs_sboffs[1 - fs->lfs_activesb]);
+ if (!(fs->lfs_ivnode->v_mount->mnt_iflag & IMNT_UNMOUNT)) {
+ lfs_auto_segclean(fs);
+ /* If sync, we can clean the remainder too */
+ if (sync)
+ lfs_auto_segclean(fs);
+ }
+ fs->lfs_activesb = 1 - fs->lfs_activesb;
+#ifdef DEBUG
+ LFS_ENTER_LOG("segunlock_ckp", __FILE__, __LINE__, 0, 0, curproc->p_pid);
+#endif
+ mutex_enter(&lfs_lock);
+ --fs->lfs_seglock;
+ fs->lfs_lockpid = 0;
+ fs->lfs_locklwp = 0;
+ mutex_exit(&lfs_lock);
+ wakeup(&fs->lfs_seglock);
+ }
+ /* Reenable fragment size changes */
+ rw_exit(&fs->lfs_fraglock);
+ if (do_unmark_dirop)
+ lfs_unmark_dirop(fs);
+ } else if (fs->lfs_seglock == 0) {
+ mutex_exit(&lfs_lock);
+ panic ("Seglock not held");
+ } else {
+ --fs->lfs_seglock;
+ mutex_exit(&lfs_lock);
+ }
+}
+
+/*
+ * Drain dirops and start writer.
+ *
+ * No simple_locks are held when we enter and none are held when we return.
+ */
+int
+lfs_writer_enter(struct lfs *fs, const char *wmesg)
+{
+ int error = 0;
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+ mutex_enter(&lfs_lock);
+
+ /* disallow dirops during flush */
+ fs->lfs_writer++;
+
+ while (fs->lfs_dirops > 0) {
+ ++fs->lfs_diropwait;
+ error = mtsleep(&fs->lfs_writer, PRIBIO+1, wmesg, 0,
+ &lfs_lock);
+ --fs->lfs_diropwait;
+ }
+
+ if (error)
+ fs->lfs_writer--;
+
+ mutex_exit(&lfs_lock);
+
+ return error;
+}
+
+void
+lfs_writer_leave(struct lfs *fs)
+{
+ bool dowakeup;
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+ mutex_enter(&lfs_lock);
+ dowakeup = !(--fs->lfs_writer);
+ mutex_exit(&lfs_lock);
+ if (dowakeup)
+ wakeup(&fs->lfs_dirops);
+}
+
+/*
+ * Unlock, wait for the cleaner, then relock to where we were before.
+ * To be used only at a fairly high level, to address a paucity of free
+ * segments propagated back from lfs_gop_write().
+ */
+void
+lfs_segunlock_relock(struct lfs *fs)
+{
+ int n = fs->lfs_seglock;
+ u_int16_t seg_flags;
+ CLEANERINFO *cip;
+ struct buf *bp;
+
+ if (n == 0)
+ return;
+
+ /* Write anything we've already gathered to disk */
+ lfs_writeseg(fs, fs->lfs_sp);
+
+ /* Tell cleaner */
+ LFS_CLEANERINFO(cip, fs, bp);
+ cip->flags |= LFS_CLEANER_MUST_CLEAN;
+ LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
+
+ /* Save segment flags for later */
+ seg_flags = fs->lfs_sp->seg_flags;
+
+ fs->lfs_sp->seg_flags |= SEGM_PROT; /* Don't unmark dirop nodes */
+	while (fs->lfs_seglock)
+ lfs_segunlock(fs);
+
+ /* Wait for the cleaner */
+ lfs_wakeup_cleaner(fs);
+ mutex_enter(&lfs_lock);
+ while (LFS_STARVED_FOR_SEGS(fs))
+ mtsleep(&fs->lfs_avail, PRIBIO, "relock", 0,
+ &lfs_lock);
+ mutex_exit(&lfs_lock);
+
+ /* Put the segment lock back the way it was. */
+	while (n--)
+ lfs_seglock(fs, seg_flags);
+
+ /* Cleaner can relax now */
+ LFS_CLEANERINFO(cip, fs, bp);
+ cip->flags &= ~LFS_CLEANER_MUST_CLEAN;
+ LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
+
+ return;
+}
+
+/*
+ * Wake up the cleaner, provided that nowrap is not set.
+ */
+void
+lfs_wakeup_cleaner(struct lfs *fs)
+{
+ if (fs->lfs_nowrap > 0)
+ return;
+
+ wakeup(&fs->lfs_nextseg);
+ wakeup(&lfs_allclean_wakeup);
+}
--- /dev/null
+/* $NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007, 2008
+ * The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 1991, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)lfs_syscalls.c 8.10 (Berkeley) 5/14/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $");
+
+#ifndef LFS
+# define LFS /* for prototypes in syscallargs.h */
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/kernel.h>
+#include <sys/kauth.h>
+#include <sys/syscallargs.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, void *);
+int lfs_fasthashget(dev_t, ino_t, struct vnode **);
+
+pid_t lfs_cleaner_pid = 0;
+
+/*
+ * sys_lfs_markv:
+ *
+ * This will mark inodes and blocks dirty, so they are written into the log.
+ * It will block until all the blocks have been written. The segment create
+ * time passed in the block_info and inode_info structures is used to decide
+ * if the data is valid for each block (in case some process dirtied a block
+ * or inode that is being cleaned between the determination that a block is
+ * live and the lfs_markv call).
+ *
+ * 0 on success
+ * -1/errno is returned on error.
+ */
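+/*
+ * Rough calling sketch (for illustration; the cleaner daemon lives in
+ * userland, not in this file): the cleaner fills a BLOCK_INFO array
+ * describing the live blocks of the segment it is reclaiming and
+ * passes it to lfs_markv(); blocks whose bi_daddr no longer matches
+ * the on-disk address are skipped and EAGAIN is returned so the
+ * cleaner can re-examine the segment.
+ */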
+#ifdef USE_64BIT_SYSCALLS
+int
+sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval)
+{
+ /* {
+ syscallarg(fsid_t *) fsidp;
+ syscallarg(struct block_info *) blkiov;
+ syscallarg(int) blkcnt;
+ } */
+ BLOCK_INFO *blkiov;
+ int blkcnt, error;
+ fsid_t fsid;
+ struct lfs *fs;
+ struct mount *mntp;
+
+ if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+ NULL)) != 0)
+ return (error);
+
+ if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
+ return (error);
+
+	if ((mntp = vfs_getvfs(&fsid)) == NULL)
+ return (ENOENT);
+ fs = VFSTOUFS(mntp)->um_lfs;
+
+ blkcnt = SCARG(uap, blkcnt);
+ if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
+ return (EINVAL);
+
+ KERNEL_LOCK(1, NULL);
+ blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
+ if ((error = copyin(SCARG(uap, blkiov), blkiov,
+ blkcnt * sizeof(BLOCK_INFO))) != 0)
+ goto out;
+
+	if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0)
+ copyout(blkiov, SCARG(uap, blkiov),
+ blkcnt * sizeof(BLOCK_INFO));
+ out:
+ lfs_free(fs, blkiov, LFS_NB_BLKIOV);
+ KERNEL_UNLOCK_ONE(NULL);
+ return error;
+}
+#else
+int
+sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval)
+{
+ /* {
+ syscallarg(fsid_t *) fsidp;
+ syscallarg(struct block_info *) blkiov;
+ syscallarg(int) blkcnt;
+ } */
+ BLOCK_INFO *blkiov;
+ BLOCK_INFO_15 *blkiov15;
+ int i, blkcnt, error;
+ fsid_t fsid;
+ struct lfs *fs;
+ struct mount *mntp;
+
+ if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+ NULL)) != 0)
+ return (error);
+
+ if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
+ return (error);
+
+ if ((mntp = vfs_getvfs(&fsid)) == NULL)
+ return (ENOENT);
+ fs = VFSTOUFS(mntp)->um_lfs;
+
+ blkcnt = SCARG(uap, blkcnt);
+ if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
+ return (EINVAL);
+
+ KERNEL_LOCK(1, NULL);
+ blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
+ blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
+ if ((error = copyin(SCARG(uap, blkiov), blkiov15,
+ blkcnt * sizeof(BLOCK_INFO_15))) != 0)
+ goto out;
+
+ for (i = 0; i < blkcnt; i++) {
+ blkiov[i].bi_inode = blkiov15[i].bi_inode;
+ blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
+ blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
+ blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
+ blkiov[i].bi_version = blkiov15[i].bi_version;
+ blkiov[i].bi_bp = blkiov15[i].bi_bp;
+ blkiov[i].bi_size = blkiov15[i].bi_size;
+ }
+
+ if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
+ for (i = 0; i < blkcnt; i++) {
+ blkiov15[i].bi_inode = blkiov[i].bi_inode;
+ blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
+ blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
+ blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
+ blkiov15[i].bi_version = blkiov[i].bi_version;
+ blkiov15[i].bi_bp = blkiov[i].bi_bp;
+ blkiov15[i].bi_size = blkiov[i].bi_size;
+ }
+ copyout(blkiov15, SCARG(uap, blkiov),
+ blkcnt * sizeof(BLOCK_INFO_15));
+ }
+ out:
+ lfs_free(fs, blkiov, LFS_NB_BLKIOV);
+ lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
+ KERNEL_UNLOCK_ONE(NULL);
+ return error;
+}
+#endif
+
+#define LFS_MARKV_MAX_BLOCKS (LFS_MAX_BUFS)
+
+int
+lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov,
+ int blkcnt)
+{
+ BLOCK_INFO *blkp;
+ IFILE *ifp;
+ struct buf *bp;
+ struct inode *ip = NULL;
+ struct lfs *fs;
+ struct mount *mntp;
+ struct vnode *vp = NULL;
+ ino_t lastino;
+ daddr_t b_daddr, v_daddr;
+ int cnt, error;
+ int do_again = 0;
+ int numrefed = 0;
+ ino_t maxino;
+ size_t obsize;
+
+ /* number of blocks/inodes that we have already bwrite'ed */
+ int nblkwritten, ninowritten;
+
+ if ((mntp = vfs_getvfs(fsidp)) == NULL)
+ return (ENOENT);
+
+ fs = VFSTOUFS(mntp)->um_lfs;
+
+ if (fs->lfs_ronly)
+ return EROFS;
+
+ maxino = (fragstoblks(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks) -
+ fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;
+
+ cnt = blkcnt;
+
+ if ((error = vfs_busy(mntp, NULL)) != 0)
+ return (error);
+
+ /*
+	 * This seglock just ensures that, even if we have to sleep here,
+	 * our blocks cannot become invalid in the meantime.
+ *
+ * It is also important to note here that unless we specify SEGM_CKP,
+ * any Ifile blocks that we might be asked to clean will never get
+ * to the disk.
+ */
+ lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);
+
+ /* Mark blocks/inodes dirty. */
+ error = 0;
+
+ /* these were inside the initialization for the for loop */
+ v_daddr = LFS_UNUSED_DADDR;
+ lastino = LFS_UNUSED_INUM;
+ nblkwritten = ninowritten = 0;
+ for (blkp = blkiov; cnt--; ++blkp)
+ {
+ /* Bounds-check incoming data, avoid panic for failed VGET */
+ if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
+ error = EINVAL;
+ goto err3;
+ }
+ /*
+ * Get the IFILE entry (only once) and see if the file still
+ * exists.
+ */
+ if (lastino != blkp->bi_inode) {
+ /*
+ * Finish the old file, if there was one. The presence
+ * of a usable vnode in vp is signaled by a valid v_daddr.
+ */
+ if (v_daddr != LFS_UNUSED_DADDR) {
+ lfs_vunref(vp);
+ numrefed--;
+ }
+
+ /*
+ * Start a new file
+ */
+ lastino = blkp->bi_inode;
+ if (blkp->bi_inode == LFS_IFILE_INUM)
+ v_daddr = fs->lfs_idaddr;
+ else {
+ LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
+ /* XXX fix for force write */
+ v_daddr = ifp->if_daddr;
+ brelse(bp, 0);
+ }
+ if (v_daddr == LFS_UNUSED_DADDR)
+ continue;
+
+ /* Get the vnode/inode. */
+ error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
+ &vp,
+ (blkp->bi_lbn == LFS_UNUSED_LBN
+ ? blkp->bi_bp
+ : NULL));
+
+ if (!error) {
+ numrefed++;
+ }
+ if (error) {
+ DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget"
+ " failed with %d (ino %d, segment %d)\n",
+ error, blkp->bi_inode,
+ dtosn(fs, blkp->bi_daddr)));
+				/*
+				 * If we got EAGAIN, the inode was
+				 * locked.  This is recoverable: just
+				 * clean the rest of this segment, and
+				 * let the cleaner try again with
+				 * another.  (When the cleaner runs
+				 * again, this segment will sort high
+				 * on the list, since it is now almost
+				 * entirely empty.)  But we still set
+				 * v_daddr = LFS_UNUSED_DADDR so as not
+				 * to test this over and over again.
+				 */
+ if (error == EAGAIN) {
+ error = 0;
+ do_again++;
+ }
+#ifdef DIAGNOSTIC
+ else if (error != ENOENT)
+ panic("lfs_markv VFS_VGET FAILED");
+#endif
+ /* lastino = LFS_UNUSED_INUM; */
+ v_daddr = LFS_UNUSED_DADDR;
+ vp = NULL;
+ ip = NULL;
+ continue;
+ }
+ ip = VTOI(vp);
+ ninowritten++;
+ } else if (v_daddr == LFS_UNUSED_DADDR) {
+ /*
+			 * This can only happen if the vnode is dead, or
+			 * in any case we can't get it (e.g., it is
+			 * locked).  Keep going.
+ */
+ continue;
+ }
+
+ /* Past this point we are guaranteed that vp, ip are valid. */
+
+ /* Can't clean VU_DIROP directories in case of truncation */
+ /* XXX - maybe we should mark removed dirs specially? */
+ if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) {
+ do_again++;
+ continue;
+ }
+
+ /* If this BLOCK_INFO didn't contain a block, keep going. */
+ if (blkp->bi_lbn == LFS_UNUSED_LBN) {
+ /* XXX need to make sure that the inode gets written in this case */
+ /* XXX but only write the inode if it's the right one */
+ if (blkp->bi_inode != LFS_IFILE_INUM) {
+ LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
+ if (ifp->if_daddr == blkp->bi_daddr) {
+ mutex_enter(&lfs_lock);
+ LFS_SET_UINO(ip, IN_CLEANING);
+ mutex_exit(&lfs_lock);
+ }
+ brelse(bp, 0);
+ }
+ continue;
+ }
+
+ b_daddr = 0;
+ if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
+ dbtofsb(fs, b_daddr) != blkp->bi_daddr)
+ {
+ if (dtosn(fs, dbtofsb(fs, b_daddr)) ==
+ dtosn(fs, blkp->bi_daddr))
+ {
+ DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %llx vs %llx\n",
+ (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr)));
+ }
+ do_again++;
+ continue;
+ }
+
+ /*
+ * Check block sizes. The blocks being cleaned come from
+ * disk, so they should have the same size as their on-disk
+ * counterparts.
+ */
+ if (blkp->bi_lbn >= 0)
+ obsize = blksize(fs, ip, blkp->bi_lbn);
+ else
+ obsize = fs->lfs_bsize;
+ /* Check for fragment size change */
+ if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) {
+ obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
+ }
+ if (obsize != blkp->bi_size) {
+ DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %lld wrong"
+ " size (%ld != %d), try again\n",
+ blkp->bi_inode, (long long)blkp->bi_lbn,
+ (long) obsize, blkp->bi_size));
+ do_again++;
+ continue;
+ }
+
+ /*
+ * If we get to here, then we are keeping the block. If
+ * it is an indirect block, we want to actually put it
+ * in the buffer cache so that it can be updated in the
+ * finish_meta section. If it's not, we need to
+ * allocate a fake buffer so that writeseg can perform
+ * the copyin and write the buffer.
+ */
+ if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
+ /* Data Block */
+ bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
+ blkp->bi_size, blkp->bi_bp);
+ /* Pretend we used bread() to get it */
+ bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
+ } else {
+ /* Indirect block or ifile */
+ if (blkp->bi_size != fs->lfs_bsize &&
+ ip->i_number != LFS_IFILE_INUM)
+ panic("lfs_markv: partial indirect block?"
+ " size=%d\n", blkp->bi_size);
+ bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
+ if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) {
+ /*
+ * The block in question was not found
+ * in the cache; i.e., the block that
+ * getblk() returned is empty. So, we
+ * can (and should) copy in the
+ * contents, because we've already
+ * determined that this was the right
+ * version of this block on disk.
+ *
+ * And, it can't have changed underneath
+ * us, because we have the segment lock.
+ */
+ error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
+ if (error)
+ goto err2;
+ }
+ }
+ if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
+ goto err2;
+
+ nblkwritten++;
+ /*
+ * XXX should account indirect blocks and ifile pages as well
+ */
+ if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode))
+ > LFS_MARKV_MAX_BLOCKS) {
+ DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n",
+ nblkwritten, ninowritten));
+ lfs_segwrite(mntp, SEGM_CLEAN);
+ nblkwritten = ninowritten = 0;
+ }
+ }
+
+ /*
+ * Finish the old file, if there was one
+ */
+ if (v_daddr != LFS_UNUSED_DADDR) {
+ lfs_vunref(vp);
+ numrefed--;
+ }
+
+#ifdef DIAGNOSTIC
+ if (numrefed != 0)
+ panic("lfs_markv: numrefed=%d", numrefed);
+#endif
+ DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n",
+ nblkwritten, ninowritten));
+
+ /*
+ * The last write has to be SEGM_SYNC, because of calling semantics.
+ * It also has to be SEGM_CKP, because otherwise we could write
+ * over the newly cleaned data contained in a checkpoint, and then
+ * we'd be unhappy at recovery time.
+ */
+ lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);
+
+ lfs_segunlock(fs);
+
+ vfs_unbusy(mntp, false, NULL);
+ if (error)
+ return (error);
+ else if (do_again)
+ return EAGAIN;
+
+ return 0;
+
+err2:
+ DLOG((DLOG_CLEAN, "lfs_markv err2\n"));
+
+ /*
+ * XXX we're here because copyin() failed.
+ * XXX it means that we can't trust the cleanerd. too bad.
+ * XXX how can we recover from this?
+ */
+
+err3:
+ KERNEL_UNLOCK_ONE(NULL);
+ /*
+ * XXX should do segwrite here anyway?
+ */
+
+ if (v_daddr != LFS_UNUSED_DADDR) {
+ lfs_vunref(vp);
+ --numrefed;
+ }
+
+ lfs_segunlock(fs);
+ vfs_unbusy(mntp, false, NULL);
+#ifdef DIAGNOSTIC
+ if (numrefed != 0)
+ panic("lfs_markv: numrefed=%d", numrefed);
+#endif
+
+ return (error);
+}
+
+/*
+ * sys_lfs_bmapv:
+ *
+ * This will fill in the current disk address for arrays of blocks.
+ *
+ * 0 on success
+ * -1/errno is returned on error.
+ */
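+/*
+ * In other words: given the cleaner's notion of where each block
+ * lives, fill in bi_daddr with its current disk address (or
+ * LFS_UNUSED_DADDR if the inode or block is gone), so the cleaner can
+ * tell live blocks from dead ones.
+ */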
+#ifdef USE_64BIT_SYSCALLS
+int
+sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval)
+{
+ /* {
+ syscallarg(fsid_t *) fsidp;
+ syscallarg(struct block_info *) blkiov;
+ syscallarg(int) blkcnt;
+ } */
+ BLOCK_INFO *blkiov;
+ int blkcnt, error;
+ fsid_t fsid;
+ struct lfs *fs;
+ struct mount *mntp;
+
+ if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+ NULL)) != 0)
+ return (error);
+
+ if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
+ return (error);
+
+ if ((mntp = vfs_getvfs(&fsid)) == NULL)
+ return (ENOENT);
+ fs = VFSTOUFS(mntp)->um_lfs;
+
+ blkcnt = SCARG(uap, blkcnt);
+ if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
+ return (EINVAL);
+ KERNEL_LOCK(1, NULL);
+ blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
+ if ((error = copyin(SCARG(uap, blkiov), blkiov,
+ blkcnt * sizeof(BLOCK_INFO))) != 0)
+ goto out;
+
+	if ((error = lfs_bmapv(l->l_proc, &fsid, blkiov, blkcnt)) == 0)
+ copyout(blkiov, SCARG(uap, blkiov),
+ blkcnt * sizeof(BLOCK_INFO));
+ out:
+ lfs_free(fs, blkiov, LFS_NB_BLKIOV);
+ KERNEL_UNLOCK_ONE(NULL);
+ return error;
+}
+#else
+int
+sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval)
+{
+ /* {
+ syscallarg(fsid_t *) fsidp;
+ syscallarg(struct block_info *) blkiov;
+ syscallarg(int) blkcnt;
+ } */
+ BLOCK_INFO *blkiov;
+ BLOCK_INFO_15 *blkiov15;
+ int i, blkcnt, error;
+ fsid_t fsid;
+ struct lfs *fs;
+ struct mount *mntp;
+
+ if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+ NULL)) != 0)
+ return (error);
+
+ if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
+ return (error);
+
+ if ((mntp = vfs_getvfs(&fsid)) == NULL)
+ return (ENOENT);
+ fs = VFSTOUFS(mntp)->um_lfs;
+
+ blkcnt = SCARG(uap, blkcnt);
+ if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
+ return (EINVAL);
+ KERNEL_LOCK(1, NULL);
+ blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
+ blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
+ if ((error = copyin(SCARG(uap, blkiov), blkiov15,
+ blkcnt * sizeof(BLOCK_INFO_15))) != 0)
+ goto out;
+
+ for (i = 0; i < blkcnt; i++) {
+ blkiov[i].bi_inode = blkiov15[i].bi_inode;
+ blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
+ blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
+ blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
+ blkiov[i].bi_version = blkiov15[i].bi_version;
+ blkiov[i].bi_bp = blkiov15[i].bi_bp;
+ blkiov[i].bi_size = blkiov15[i].bi_size;
+ }
+
+ if ((error = lfs_bmapv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
+ for (i = 0; i < blkcnt; i++) {
+ blkiov15[i].bi_inode = blkiov[i].bi_inode;
+ blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
+ blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
+ blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
+ blkiov15[i].bi_version = blkiov[i].bi_version;
+ blkiov15[i].bi_bp = blkiov[i].bi_bp;
+ blkiov15[i].bi_size = blkiov[i].bi_size;
+ }
+ copyout(blkiov15, SCARG(uap, blkiov),
+ blkcnt * sizeof(BLOCK_INFO_15));
+ }
+ out:
+ lfs_free(fs, blkiov, LFS_NB_BLKIOV);
+ lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
+ KERNEL_UNLOCK_ONE(NULL);
+ return error;
+}
+#endif
+
+int
+lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
+{
+ BLOCK_INFO *blkp;
+ IFILE *ifp;
+ struct buf *bp;
+ struct inode *ip = NULL;
+ struct lfs *fs;
+ struct mount *mntp;
+ struct ufsmount *ump;
+ struct vnode *vp;
+ ino_t lastino;
+ daddr_t v_daddr;
+ int cnt, error;
+ int numrefed = 0;
+
+ lfs_cleaner_pid = p->p_pid;
+
+ if ((mntp = vfs_getvfs(fsidp)) == NULL)
+ return (ENOENT);
+
+ ump = VFSTOUFS(mntp);
+ if ((error = vfs_busy(mntp, NULL)) != 0)
+ return (error);
+
+ cnt = blkcnt;
+
+ fs = VFSTOUFS(mntp)->um_lfs;
+
+ error = 0;
+
+ /* these were inside the initialization for the for loop */
+ v_daddr = LFS_UNUSED_DADDR;
+ lastino = LFS_UNUSED_INUM;
+ for (blkp = blkiov; cnt--; ++blkp)
+ {
+ /*
+ * Get the IFILE entry (only once) and see if the file still
+ * exists.
+ */
+ if (lastino != blkp->bi_inode) {
+ /*
+ * Finish the old file, if there was one. The presence
+ * of a usable vnode in vp is signaled by a valid
+ * v_daddr.
+ */
+ if (v_daddr != LFS_UNUSED_DADDR) {
+ lfs_vunref(vp);
+ numrefed--;
+ }
+
+ /*
+ * Start a new file
+ */
+ lastino = blkp->bi_inode;
+ if (blkp->bi_inode == LFS_IFILE_INUM)
+ v_daddr = fs->lfs_idaddr;
+ else {
+ LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
+ v_daddr = ifp->if_daddr;
+ brelse(bp, 0);
+ }
+ if (v_daddr == LFS_UNUSED_DADDR) {
+ blkp->bi_daddr = LFS_UNUSED_DADDR;
+ continue;
+ }
+ /*
+ * A regular call to VFS_VGET could deadlock
+ * here. Instead, we try an unlocked access.
+ */
+ mutex_enter(&ufs_ihash_lock);
+ vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
+ if (vp != NULL && !(vp->v_iflag & VI_XLOCK)) {
+ ip = VTOI(vp);
+ mutex_enter(vp->v_interlock);
+ mutex_exit(&ufs_ihash_lock);
+ if (lfs_vref(vp)) {
+ v_daddr = LFS_UNUSED_DADDR;
+ continue;
+ }
+ numrefed++;
+ } else {
+ mutex_exit(&ufs_ihash_lock);
+ /*
+ * Don't VFS_VGET if we're being unmounted,
+ * since we hold vfs_busy().
+ */
+ if (mntp->mnt_iflag & IMNT_UNMOUNT) {
+ v_daddr = LFS_UNUSED_DADDR;
+ continue;
+ }
+ error = VFS_VGET(mntp, blkp->bi_inode, &vp);
+ if (error) {
+ DLOG((DLOG_CLEAN, "lfs_bmapv: vget ino"
+					    " %d failed with %d",
+					    blkp->bi_inode, error));
+ v_daddr = LFS_UNUSED_DADDR;
+ continue;
+ } else {
+ KASSERT(VOP_ISLOCKED(vp));
+ VOP_UNLOCK(vp);
+ numrefed++;
+ }
+ }
+ ip = VTOI(vp);
+ } else if (v_daddr == LFS_UNUSED_DADDR) {
+ /*
+ * This can only happen if the vnode is dead.
+ * Keep going. Note that we DO NOT set the
+ * bi_addr to anything -- if we failed to get
+ * the vnode, for example, we want to assume
+ * conservatively that all of its blocks *are*
+ * located in the segment in question.
+ * lfs_markv will throw them out if we are
+ * wrong.
+ */
+ /* blkp->bi_daddr = LFS_UNUSED_DADDR; */
+ continue;
+ }
+
+ /* Past this point we are guaranteed that vp, ip are valid. */
+
+ if (blkp->bi_lbn == LFS_UNUSED_LBN) {
+ /*
+ * We just want the inode address, which is
+ * conveniently in v_daddr.
+ */
+ blkp->bi_daddr = v_daddr;
+ } else {
+ daddr_t bi_daddr;
+
+ /* XXX ondisk32 */
+ error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
+ &bi_daddr, NULL);
+ if (error)
+ {
+ blkp->bi_daddr = LFS_UNUSED_DADDR;
+ continue;
+ }
+ blkp->bi_daddr = dbtofsb(fs, bi_daddr);
+ /* Fill in the block size, too */
+ if (blkp->bi_lbn >= 0)
+ blkp->bi_size = blksize(fs, ip, blkp->bi_lbn);
+ else
+ blkp->bi_size = fs->lfs_bsize;
+ }
+ }
+
+ /*
+ * Finish the old file, if there was one. The presence
+ * of a usable vnode in vp is signaled by a valid v_daddr.
+ */
+ if (v_daddr != LFS_UNUSED_DADDR) {
+ lfs_vunref(vp);
+ numrefed--;
+ }
+
+#ifdef DIAGNOSTIC
+ if (numrefed != 0)
+ panic("lfs_bmapv: numrefed=%d", numrefed);
+#endif
+
+ vfs_unbusy(mntp, false, NULL);
+
+ return 0;
+}
+
+/*
+ * sys_lfs_segclean:
+ *
+ * Mark the segment clean.
+ *
+ * 0 on success
+ * -1/errno is returned on error.
+ */
+int
+sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap, register_t *retval)
+{
+ /* {
+ syscallarg(fsid_t *) fsidp;
+ syscallarg(u_long) segment;
+ } */
+ struct lfs *fs;
+ struct mount *mntp;
+ fsid_t fsid;
+ int error;
+ unsigned long segnum;
+
+ if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+ NULL)) != 0)
+ return (error);
+
+ if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
+ return (error);
+ if ((mntp = vfs_getvfs(&fsid)) == NULL)
+ return (ENOENT);
+
+ fs = VFSTOUFS(mntp)->um_lfs;
+ segnum = SCARG(uap, segment);
+
+ if ((error = vfs_busy(mntp, NULL)) != 0)
+ return (error);
+
+ KERNEL_LOCK(1, NULL);
+ lfs_seglock(fs, SEGM_PROT);
+ error = lfs_do_segclean(fs, segnum);
+ lfs_segunlock(fs);
+ KERNEL_UNLOCK_ONE(NULL);
+ vfs_unbusy(mntp, false, NULL);
+ return error;
+}
+
+/*
+ * Actually mark the segment clean.
+ * Must be called with the segment lock held.
+ */
+int
+lfs_do_segclean(struct lfs *fs, unsigned long segnum)
+{
+ extern int lfs_dostats;
+ struct buf *bp;
+ CLEANERINFO *cip;
+ SEGUSE *sup;
+
+ if (dtosn(fs, fs->lfs_curseg) == segnum) {
+ return (EBUSY);
+ }
+
+ LFS_SEGENTRY(sup, fs, segnum, bp);
+ if (sup->su_nbytes) {
+ DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
+ " %d live bytes\n", segnum, sup->su_nbytes));
+ brelse(bp, 0);
+ return (EBUSY);
+ }
+ if (sup->su_flags & SEGUSE_ACTIVE) {
+ DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
+ " segment is active\n", segnum));
+ brelse(bp, 0);
+ return (EBUSY);
+ }
+ if (!(sup->su_flags & SEGUSE_DIRTY)) {
+ DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
+ " segment is already clean\n", segnum));
+ brelse(bp, 0);
+ return (EALREADY);
+ }
+
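+	/*
+	 * Credit the whole segment back to lfs_avail, minus any space
+	 * occupied by a superblock copy or (for segment 0 on v2+
+	 * file systems) the disklabel pad.
+	 */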
+ fs->lfs_avail += segtod(fs, 1);
+ if (sup->su_flags & SEGUSE_SUPERBLOCK)
+ fs->lfs_avail -= btofsb(fs, LFS_SBPAD);
+ if (fs->lfs_version > 1 && segnum == 0 &&
+ fs->lfs_start < btofsb(fs, LFS_LABELPAD))
+ fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
+ mutex_enter(&lfs_lock);
+ fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
+ btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
+ fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
+ btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
+ if (fs->lfs_dmeta < 0)
+ fs->lfs_dmeta = 0;
+ mutex_exit(&lfs_lock);
+ sup->su_flags &= ~SEGUSE_DIRTY;
+ LFS_WRITESEGENTRY(sup, fs, segnum, bp);
+
+ LFS_CLEANERINFO(cip, fs, bp);
+ ++cip->clean;
+ --cip->dirty;
+ fs->lfs_nclean = cip->clean;
+ cip->bfree = fs->lfs_bfree;
+ mutex_enter(&lfs_lock);
+ cip->avail = fs->lfs_avail - fs->lfs_ravail - fs->lfs_favail;
+ wakeup(&fs->lfs_avail);
+ mutex_exit(&lfs_lock);
+ (void) LFS_BWRITE_LOG(bp);
+
+ if (lfs_dostats)
+ ++lfs_stats.segs_reclaimed;
+
+ return (0);
+}
+
+/*
+ * This will block until a segment in file system fsid is written.  A timeout
+ * (a struct timeval) may be specified, which will wake the cleaner
+ * automatically when it expires.
+ * An fsid of -1 means any file system, and a timeout of 0 means forever.
+ */
+int
+lfs_segwait(fsid_t *fsidp, struct timeval *tv)
+{
+ struct mount *mntp;
+ void *addr;
+ u_long timeout;
+ int error;
+
+ KERNEL_LOCK(1, NULL);
+ if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL)
+ addr = &lfs_allclean_wakeup;
+ else
+ addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
+ /*
+ * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
+ * XXX IS THAT WHAT IS INTENDED?
+ */
+ timeout = tvtohz(tv);
+ error = tsleep(addr, PCATCH | PVFS, "segment", timeout);
+ KERNEL_UNLOCK_ONE(NULL);
+ return (error == ERESTART ? EINTR : 0);
+}
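+/*
+ * A hypothetical in-kernel caller might, for example, wait up to roughly
+ * five seconds for any LFS to write a segment (sketch only):
+ *
+ *	struct timeval tv = { 5, 0 };
+ *	fsid_t any = { { -1, -1 } };
+ *	(void)lfs_segwait(&any, &tv);
+ */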
+
+/*
+ * sys_lfs_segwait:
+ *
+ * System call wrapper around lfs_segwait().
+ *
+ * 0 on success
+ * 1 on timeout
+ * -1/errno is returned on error.
+ */
+int
+sys___lfs_segwait50(struct lwp *l, const struct sys___lfs_segwait50_args *uap,
+ register_t *retval)
+{
+ /* {
+ syscallarg(fsid_t *) fsidp;
+ syscallarg(struct timeval *) tv;
+ } */
+ struct timeval atv;
+ fsid_t fsid;
+ int error;
+
+ /* XXX need we be su to segwait? */
+ if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+ NULL)) != 0)
+ return (error);
+ if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
+ return (error);
+
+ if (SCARG(uap, tv)) {
+ error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
+ if (error)
+ return (error);
+ if (itimerfix(&atv))
+ return (EINVAL);
+ } else /* NULL or invalid */
+ atv.tv_sec = atv.tv_usec = 0;
+ return lfs_segwait(&fsid, &atv);
+}
+
+/*
+ * VFS_VGET call specialized for the cleaner. The cleaner already knows the
+ * daddr from the ifile, so don't look it up again. If the cleaner is
+ * processing IINFO structures, it may have the ondisk inode already, so
+ * don't go retrieving it again.
+ *
+ * We lfs_vref, and it is the caller's responsibility to lfs_vunref
+ * when finished.
+ */
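+/*
+ * A hypothetical caller would follow roughly this pattern (sketch only;
+ * locking and error handling elided):
+ *
+ *	if (lfs_fastvget(mp, ino, daddr, &vp, dinp) == 0 && vp != NULL) {
+ *		... work with VTOI(vp) ...
+ *		lfs_vunref(vp);
+ *	}
+ */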
+
+int
+lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
+{
+ struct vnode *vp;
+
+ mutex_enter(&ufs_ihash_lock);
+ if ((vp = ufs_ihashlookup(dev, ino)) != NULL) {
+ mutex_enter(vp->v_interlock);
+ mutex_exit(&ufs_ihash_lock);
+ if (vp->v_iflag & VI_XLOCK) {
+ DLOG((DLOG_CLEAN, "lfs_fastvget: ino %d VI_XLOCK\n",
+ ino));
+ lfs_stats.clean_vnlocked++;
+ mutex_exit(vp->v_interlock);
+ return EAGAIN;
+ }
+ if (lfs_vref(vp)) {
+ DLOG((DLOG_CLEAN, "lfs_fastvget: lfs_vref failed"
+ " for ino %d\n", ino));
+ lfs_stats.clean_inlocked++;
+ return EAGAIN;
+ }
+ } else {
+ mutex_exit(&ufs_ihash_lock);
+ }
+ *vpp = vp;
+
+ return (0);
+}
+
+int
+lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp,
+ struct ufs1_dinode *dinp)
+{
+ struct inode *ip;
+ struct ufs1_dinode *dip;
+ struct vnode *vp;
+ struct ufsmount *ump;
+ dev_t dev;
+ int error, retries;
+ struct buf *bp;
+ struct lfs *fs;
+
+ ump = VFSTOUFS(mp);
+ dev = ump->um_dev;
+ fs = ump->um_lfs;
+
+ /*
+ * Wait until the filesystem is fully mounted before allowing vget
+ * to complete. This prevents possible problems with roll-forward.
+ */
+ mutex_enter(&lfs_lock);
+ while (fs->lfs_flags & LFS_NOTYET) {
+ mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0,
+ &lfs_lock);
+ }
+ mutex_exit(&lfs_lock);
+
+ /*
+ * This is playing fast and loose. Someone may have the inode
+ * locked, in which case they are going to be distinctly unhappy
+ * if we trash something.
+ */
+
+ error = lfs_fasthashget(dev, ino, vpp);
+ if (error != 0 || *vpp != NULL)
+ return (error);
+
+ /*
+ * getnewvnode(9) will call vfs_busy, which will block if the
+ * filesystem is being unmounted; but umount(9) is waiting for
+ * us because we're already holding the fs busy.
+ * XXXMP
+ */
+ if (mp->mnt_iflag & IMNT_UNMOUNT) {
+ *vpp = NULL;
+ return EDEADLK;
+ }
+ error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, NULL, &vp);
+ if (error) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ mutex_enter(&ufs_hashlock);
+ error = lfs_fasthashget(dev, ino, vpp);
+ if (error != 0 || *vpp != NULL) {
+ mutex_exit(&ufs_hashlock);
+ ungetnewvnode(vp);
+ return (error);
+ }
+
+ /* Allocate new vnode/inode. */
+ lfs_vcreate(mp, ino, vp);
+
+ /*
+ * Put it onto its hash chain and lock it so that other requests for
+ * this inode will block if they arrive while we are sleeping waiting
+ * for old data structures to be purged or for the contents of the
+ * disk portion of this inode to be read.
+ */
+ ip = VTOI(vp);
+ ufs_ihashins(ip);
+ mutex_exit(&ufs_hashlock);
+
+ /*
+ * XXX
+ * This may not need to be here, logically it should go down with
+ * the i_devvp initialization.
+ * Ask Kirk.
+ */
+ ip->i_lfs = fs;
+
+ /* Read in the disk contents for the inode, copy into the inode. */
+ if (dinp) {
+ error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
+ if (error) {
+ DLOG((DLOG_CLEAN, "lfs_fastvget: dinode copyin failed"
+ " for ino %d\n", ino));
+ ufs_ihashrem(ip);
+
+ /* Unlock and discard unneeded inode. */
+ VOP_UNLOCK(vp);
+ lfs_vunref(vp);
+ *vpp = NULL;
+ return (error);
+ }
+ if (ip->i_number != ino)
+ panic("lfs_fastvget: I was fed the wrong inode!");
+ } else {
+ retries = 0;
+ again:
+ error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
+ NOCRED, 0, &bp);
+ if (error) {
+ DLOG((DLOG_CLEAN, "lfs_fastvget: bread failed (%d)\n",
+ error));
+ /*
+ * The inode does not contain anything useful, so it
+ * would be misleading to leave it on its hash chain.
+ * Iput() will return it to the free list.
+ */
+ ufs_ihashrem(ip);
+
+ /* Unlock and discard unneeded inode. */
+ VOP_UNLOCK(vp);
+ lfs_vunref(vp);
+ brelse(bp, 0);
+ *vpp = NULL;
+ return (error);
+ }
+ dip = lfs_ifind(ump->um_lfs, ino, bp);
+ if (dip == NULL) {
+ /* Assume write has not completed yet; try again */
+ brelse(bp, BC_INVAL);
+ ++retries;
+ if (retries > LFS_IFIND_RETRIES)
+ panic("lfs_fastvget: dinode not found");
+ DLOG((DLOG_CLEAN, "lfs_fastvget: dinode not found,"
+ " retrying...\n"));
+ goto again;
+ }
+ *ip->i_din.ffs1_din = *dip;
+ brelse(bp, 0);
+ }
+ lfs_vinit(mp, &vp);
+
+ *vpp = vp;
+
+ KASSERT(VOP_ISLOCKED(vp));
+ VOP_UNLOCK(vp);
+
+ return (0);
+}
+
+/*
+ * Make up a "fake" cleaner buffer, copy the data from userland into it.
+ */
+struct buf *
+lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, void *uaddr)
+{
+ struct buf *bp;
+ int error;
+
+ KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);
+
+ bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
+ error = copyin(uaddr, bp->b_data, size);
+ if (error) {
+ lfs_freebuf(fs, bp);
+ return NULL;
+ }
+ KDASSERT(bp->b_iodone == lfs_callback);
+
+#if 0
+ mutex_enter(&lfs_lock);
+ ++fs->lfs_iocount;
+ mutex_exit(&lfs_lock);
+#endif
+ bp->b_bufsize = size;
+ bp->b_bcount = size;
+ return (bp);
+}
--- /dev/null
+/* $NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007
+ * The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 1989, 1991, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)lfs_vfsops.c 8.20 (Berkeley) 6/10/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_lfs.h"
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kthread.h>
+#include <sys/buf.h>
+#include <sys/device.h>
+#include <sys/mbuf.h>
+#include <sys/file.h>
+#include <sys/disklabel.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+#include <uvm/uvm_extern.h>
+#include <sys/sysctl.h>
+#include <sys/conf.h>
+#include <sys/kauth.h>
+#include <sys/module.h>
+#include <sys/syscallvar.h>
+#include <sys/syscall.h>
+#include <sys/syscallargs.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_stat.h>
+#include <uvm/uvm_pager.h>
+#include <uvm/uvm_pdaemon.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/genfs/genfs_node.h>
+
+MODULE(MODULE_CLASS_VFS, lfs, "ffs");
+
+static int lfs_gop_write(struct vnode *, struct vm_page **, int, int);
+static bool lfs_issequential_hole(const struct ufsmount *,
+ daddr_t, daddr_t);
+
+static int lfs_mountfs(struct vnode *, struct mount *, struct lwp *);
+
+static struct sysctllog *lfs_sysctl_log;
+
+extern const struct vnodeopv_desc lfs_vnodeop_opv_desc;
+extern const struct vnodeopv_desc lfs_specop_opv_desc;
+extern const struct vnodeopv_desc lfs_fifoop_opv_desc;
+
+pid_t lfs_writer_daemon = 0;
+int lfs_do_flush = 0;
+#ifdef LFS_KERNEL_RFW
+int lfs_do_rfw = 0;
+#endif
+
+const struct vnodeopv_desc * const lfs_vnodeopv_descs[] = {
+ &lfs_vnodeop_opv_desc,
+ &lfs_specop_opv_desc,
+ &lfs_fifoop_opv_desc,
+ NULL,
+};
+
+struct vfsops lfs_vfsops = {
+ MOUNT_LFS,
+ sizeof (struct ufs_args),
+ lfs_mount,
+ ufs_start,
+ lfs_unmount,
+ ufs_root,
+ ufs_quotactl,
+ lfs_statvfs,
+ lfs_sync,
+ lfs_vget,
+ lfs_fhtovp,
+ lfs_vptofh,
+ lfs_init,
+ lfs_reinit,
+ lfs_done,
+ lfs_mountroot,
+ (int (*)(struct mount *, struct vnode *, struct timespec *)) eopnotsupp,
+ vfs_stdextattrctl,
+ (void *)eopnotsupp, /* vfs_suspendctl */
+ genfs_renamelock_enter,
+ genfs_renamelock_exit,
+ (void *)eopnotsupp,
+ lfs_vnodeopv_descs,
+ 0,
+ { NULL, NULL },
+};
+
+const struct genfs_ops lfs_genfsops = {
+ .gop_size = lfs_gop_size,
+ .gop_alloc = ufs_gop_alloc,
+ .gop_write = lfs_gop_write,
+ .gop_markupdate = ufs_gop_markupdate,
+};
+
+static const struct ufs_ops lfs_ufsops = {
+ .uo_itimes = NULL,
+ .uo_update = lfs_update,
+ .uo_truncate = lfs_truncate,
+ .uo_valloc = lfs_valloc,
+ .uo_vfree = lfs_vfree,
+ .uo_balloc = lfs_balloc,
+ .uo_unmark_vnode = lfs_unmark_vnode,
+};
+
+struct shortlong {
+ const char *sname;
+ const char *lname;
+};
+
+static int
+sysctl_lfs_dostats(SYSCTLFN_ARGS)
+{
+ extern struct lfs_stats lfs_stats;
+ extern int lfs_dostats;
+ int error;
+
+ error = sysctl_lookup(SYSCTLFN_CALL(rnode));
+ if (error || newp == NULL)
+ return (error);
+
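+	/* Writing a zero also clears the accumulated counters. */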
+ if (lfs_dostats == 0)
+ memset(&lfs_stats, 0, sizeof(lfs_stats));
+
+ return (0);
+}
+
+static void
+lfs_sysctl_setup(struct sysctllog **clog)
+{
+ int i;
+ extern int lfs_writeindir, lfs_dostats, lfs_clean_vnhead,
+ lfs_fs_pagetrip, lfs_ignore_lazy_sync;
+#ifdef DEBUG
+ extern int lfs_debug_log_subsys[DLOG_MAX];
+ struct shortlong dlog_names[DLOG_MAX] = { /* Must match lfs.h ! */
+ { "rollforward", "Debug roll-forward code" },
+ { "alloc", "Debug inode allocation and free list" },
+ { "avail", "Debug space-available-now accounting" },
+ { "flush", "Debug flush triggers" },
+ { "lockedlist", "Debug locked list accounting" },
+ { "vnode_verbose", "Verbose per-vnode-written debugging" },
+ { "vnode", "Debug vnode use during segment write" },
+ { "segment", "Debug segment writing" },
+ { "seguse", "Debug segment used-bytes accounting" },
+ { "cleaner", "Debug cleaning routines" },
+ { "mount", "Debug mount/unmount routines" },
+ { "pagecache", "Debug UBC interactions" },
+ { "dirop", "Debug directory-operation accounting" },
+ { "malloc", "Debug private malloc accounting" },
+ };
+#endif /* DEBUG */
+ struct shortlong stat_names[] = { /* Must match lfs.h! */
+ { "segsused", "Number of new segments allocated" },
+ { "psegwrites", "Number of partial-segment writes" },
+ { "psyncwrites", "Number of synchronous partial-segment"
+ " writes" },
+ { "pcleanwrites", "Number of partial-segment writes by the"
+ " cleaner" },
+ { "blocktot", "Number of blocks written" },
+ { "cleanblocks", "Number of blocks written by the cleaner" },
+ { "ncheckpoints", "Number of checkpoints made" },
+ { "nwrites", "Number of whole writes" },
+ { "nsync_writes", "Number of synchronous writes" },
+ { "wait_exceeded", "Number of times writer waited for"
+ " cleaner" },
+ { "write_exceeded", "Number of times writer invoked flush" },
+ { "flush_invoked", "Number of times flush was invoked" },
+		{ "vflush_invoked", "Number of times vflush was called" },
+ { "clean_inlocked", "Number of vnodes skipped for VI_XLOCK" },
+ { "clean_vnlocked", "Number of vnodes skipped for vget failure" },
+ { "segs_reclaimed", "Number of segments reclaimed" },
+ };
+
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "vfs", NULL,
+ NULL, 0, NULL, 0,
+ CTL_VFS, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "lfs",
+ SYSCTL_DESCR("Log-structured file system"),
+ NULL, 0, NULL, 0,
+ CTL_VFS, 5, CTL_EOL);
+ /*
+ * XXX the "5" above could be dynamic, thereby eliminating one
+ * more instance of the "number to vfs" mapping problem, but
+ * "5" is the order as taken from sys/mount.h
+ */
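+	/*
+	 * The nodes created here surface to userland under vfs.lfs; e.g.,
+	 * assuming the names below, "sysctl -w vfs.lfs.dostats=1" enables
+	 * statistics gathering and "sysctl vfs.lfs.stats" dumps the counters.
+	 */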
+
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "flushindir", NULL,
+ NULL, 0, &lfs_writeindir, 0,
+ CTL_VFS, 5, LFS_WRITEINDIR, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "clean_vnhead", NULL,
+ NULL, 0, &lfs_clean_vnhead, 0,
+ CTL_VFS, 5, LFS_CLEAN_VNHEAD, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "dostats",
+ SYSCTL_DESCR("Maintain statistics on LFS operations"),
+ sysctl_lfs_dostats, 0, &lfs_dostats, 0,
+ CTL_VFS, 5, LFS_DOSTATS, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "pagetrip",
+ SYSCTL_DESCR("How many dirty pages in fs triggers"
+ " a flush"),
+ NULL, 0, &lfs_fs_pagetrip, 0,
+ CTL_VFS, 5, LFS_FS_PAGETRIP, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "ignore_lazy_sync",
+ SYSCTL_DESCR("Lazy Sync is ignored entirely"),
+ NULL, 0, &lfs_ignore_lazy_sync, 0,
+ CTL_VFS, 5, LFS_IGNORE_LAZY_SYNC, CTL_EOL);
+#ifdef LFS_KERNEL_RFW
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "rfw",
+ SYSCTL_DESCR("Use in-kernel roll-forward on mount"),
+ NULL, 0, &lfs_do_rfw, 0,
+ CTL_VFS, 5, LFS_DO_RFW, CTL_EOL);
+#endif
+
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "stats",
+		       SYSCTL_DESCR("LFS statistics"),
+ NULL, 0, NULL, 0,
+ CTL_VFS, 5, LFS_STATS, CTL_EOL);
+ for (i = 0; i < sizeof(struct lfs_stats) / sizeof(u_int); i++) {
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READONLY,
+ CTLTYPE_INT, stat_names[i].sname,
+ SYSCTL_DESCR(stat_names[i].lname),
+ NULL, 0, &(((u_int *)&lfs_stats.segsused)[i]),
+ 0, CTL_VFS, 5, LFS_STATS, i, CTL_EOL);
+ }
+
+#ifdef DEBUG
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "debug",
+ SYSCTL_DESCR("Debugging options"),
+ NULL, 0, NULL, 0,
+ CTL_VFS, 5, LFS_DEBUGLOG, CTL_EOL);
+ for (i = 0; i < DLOG_MAX; i++) {
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, dlog_names[i].sname,
+ SYSCTL_DESCR(dlog_names[i].lname),
+ NULL, 0, &(lfs_debug_log_subsys[i]), 0,
+ CTL_VFS, 5, LFS_DEBUGLOG, i, CTL_EOL);
+ }
+#endif
+}
+
+/* old cleaner syscall interface. see VOP_FCNTL() */
+static const struct syscall_package lfs_syscalls[] = {
+ { SYS_lfs_bmapv, 0, (sy_call_t *)sys_lfs_bmapv },
+ { SYS_lfs_markv, 0, (sy_call_t *)sys_lfs_markv },
+	{ SYS_lfs_segclean, 0, (sy_call_t *)sys_lfs_segclean },
+	{ SYS___lfs_segwait50, 0, (sy_call_t *)sys___lfs_segwait50 },
+ { 0, 0, NULL },
+};
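+/*
+ * For orientation, a userland cleaner would drive these roughly as follows
+ * (hypothetical pseudo-code; the modern cleaner goes through VOP_FCNTL):
+ *
+ *	lfs_bmapv(&fsid, blkiov, nblocks);	find which blocks are still live
+ *	lfs_markv(&fsid, blkiov, nlive);	rewrite the live ones
+ *	lfs_segclean(&fsid, segnum);		mark the emptied segment clean
+ *	lfs_segwait(&fsid, &tv);		sleep until there is more work
+ */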
+
+static int
+lfs_modcmd(modcmd_t cmd, void *arg)
+{
+ int error;
+
+ switch (cmd) {
+ case MODULE_CMD_INIT:
+ error = syscall_establish(NULL, lfs_syscalls);
+ if (error)
+ return error;
+ error = vfs_attach(&lfs_vfsops);
+ if (error != 0) {
+ syscall_disestablish(NULL, lfs_syscalls);
+ break;
+ }
+ lfs_sysctl_setup(&lfs_sysctl_log);
+ break;
+ case MODULE_CMD_FINI:
+ error = vfs_detach(&lfs_vfsops);
+ if (error != 0)
+ break;
+ syscall_disestablish(NULL, lfs_syscalls);
+ sysctl_teardown(&lfs_sysctl_log);
+ break;
+ default:
+ error = ENOTTY;
+ break;
+ }
+
+ return (error);
+}
+
+/*
+ * XXX Same structure as FFS inodes? Should we share a common pool?
+ */
+struct pool lfs_inode_pool;
+struct pool lfs_dinode_pool;
+struct pool lfs_inoext_pool;
+struct pool lfs_lbnentry_pool;
+
+/*
+ * The writer daemon. UVM keeps track of how many dirty pages we are holding
+ * in lfs_subsys_pages; the daemon flushes the filesystem when this value
+ * crosses the (user-defined) threshold LFS_MAX_PAGES.
+ */
+static void
+lfs_writerd(void *arg)
+{
+ struct mount *mp, *nmp;
+ struct lfs *fs;
+ int fsflags;
+ int loopcount;
+
+ lfs_writer_daemon = curproc->p_pid;
+
+ mutex_enter(&lfs_lock);
+ for (;;) {
+ mtsleep(&lfs_writer_daemon, PVM | PNORELOCK, "lfswriter", hz/10,
+ &lfs_lock);
+
+ /*
+ * Look through the list of LFSs to see if any of them
+ * have requested pageouts.
+ */
+ mutex_enter(&mountlist_lock);
+ for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
+ mp = nmp) {
+ if (vfs_busy(mp, &nmp)) {
+ continue;
+ }
+ if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
+ sizeof(mp->mnt_stat.f_fstypename)) == 0) {
+ fs = VFSTOUFS(mp)->um_lfs;
+ mutex_enter(&lfs_lock);
+ fsflags = 0;
+ if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
+ lfs_dirvcount > LFS_MAX_DIROP) &&
+ fs->lfs_dirops == 0)
+ fsflags |= SEGM_CKP;
+ if (fs->lfs_pdflush) {
+ DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n"));
+ fs->lfs_pdflush = 0;
+ lfs_flush_fs(fs, fsflags);
+ mutex_exit(&lfs_lock);
+ } else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) {
+ DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n"));
+ mutex_exit(&lfs_lock);
+ lfs_writer_enter(fs, "wrdirop");
+ lfs_flush_pchain(fs);
+ lfs_writer_leave(fs);
+ } else
+ mutex_exit(&lfs_lock);
+ }
+ vfs_unbusy(mp, false, &nmp);
+ }
+ mutex_exit(&mountlist_lock);
+
+ /*
+ * If global state wants a flush, flush everything.
+ */
+ mutex_enter(&lfs_lock);
+ loopcount = 0;
+ if (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS ||
+ locked_queue_bytes > LFS_MAX_BYTES ||
+ lfs_subsys_pages > LFS_MAX_PAGES) {
+
+ if (lfs_do_flush) {
+ DLOG((DLOG_FLUSH, "daemon: lfs_do_flush\n"));
+ }
+ if (locked_queue_count > LFS_MAX_BUFS) {
+ DLOG((DLOG_FLUSH, "daemon: lqc = %d, max %d\n",
+ locked_queue_count, LFS_MAX_BUFS));
+ }
+ if (locked_queue_bytes > LFS_MAX_BYTES) {
+ DLOG((DLOG_FLUSH, "daemon: lqb = %ld, max %ld\n",
+ locked_queue_bytes, LFS_MAX_BYTES));
+ }
+ if (lfs_subsys_pages > LFS_MAX_PAGES) {
+ DLOG((DLOG_FLUSH, "daemon: lssp = %d, max %d\n",
+ lfs_subsys_pages, LFS_MAX_PAGES));
+ }
+
+ lfs_flush(NULL, SEGM_WRITERD, 0);
+ lfs_do_flush = 0;
+ }
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * Initialize the filesystem, most work done by ufs_init.
+ */
+void
+lfs_init(void)
+{
+
+ malloc_type_attach(M_SEGMENT);
+ pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0,
+ "lfsinopl", &pool_allocator_nointr, IPL_NONE);
+ pool_init(&lfs_dinode_pool, sizeof(struct ufs1_dinode), 0, 0, 0,
+ "lfsdinopl", &pool_allocator_nointr, IPL_NONE);
+ pool_init(&lfs_inoext_pool, sizeof(struct lfs_inode_ext), 8, 0, 0,
+ "lfsinoextpl", &pool_allocator_nointr, IPL_NONE);
+ pool_init(&lfs_lbnentry_pool, sizeof(struct lbnentry), 0, 0, 0,
+ "lfslbnpool", &pool_allocator_nointr, IPL_NONE);
+ ufs_init();
+
+#ifdef DEBUG
+ memset(lfs_log, 0, sizeof(lfs_log));
+#endif
+ mutex_init(&lfs_lock, MUTEX_DEFAULT, IPL_NONE);
+ cv_init(&locked_queue_cv, "lfsbuf");
+ cv_init(&lfs_writing_cv, "lfsflush");
+}
+
+void
+lfs_reinit(void)
+{
+ ufs_reinit();
+}
+
+void
+lfs_done(void)
+{
+ ufs_done();
+ mutex_destroy(&lfs_lock);
+ cv_destroy(&locked_queue_cv);
+ cv_destroy(&lfs_writing_cv);
+ pool_destroy(&lfs_inode_pool);
+ pool_destroy(&lfs_dinode_pool);
+ pool_destroy(&lfs_inoext_pool);
+ pool_destroy(&lfs_lbnentry_pool);
+ malloc_type_detach(M_SEGMENT);
+}
+
+/*
+ * Called by main() when ufs is going to be mounted as root.
+ */
+int
+lfs_mountroot(void)
+{
+ extern struct vnode *rootvp;
+ struct lfs *fs = NULL; /* LFS */
+ struct mount *mp;
+ struct lwp *l = curlwp;
+ struct ufsmount *ump;
+ int error;
+
+ if (device_class(root_device) != DV_DISK)
+ return (ENODEV);
+
+ if (rootdev == NODEV)
+ return (ENODEV);
+ if ((error = vfs_rootmountalloc(MOUNT_LFS, "root_device", &mp))) {
+ vrele(rootvp);
+ return (error);
+ }
+ if ((error = lfs_mountfs(rootvp, mp, l))) {
+ vfs_unbusy(mp, false, NULL);
+ vfs_destroy(mp);
+ return (error);
+ }
+ mutex_enter(&mountlist_lock);
+ CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mutex_exit(&mountlist_lock);
+ ump = VFSTOUFS(mp);
+ fs = ump->um_lfs;
+ memset(fs->lfs_fsmnt, 0, sizeof(fs->lfs_fsmnt));
+ (void)copystr(mp->mnt_stat.f_mntonname, fs->lfs_fsmnt, MNAMELEN - 1, 0);
+ (void)lfs_statvfs(mp, &mp->mnt_stat);
+ vfs_unbusy(mp, false, NULL);
+ setrootfstime((time_t)(VFSTOUFS(mp)->um_lfs->lfs_tstamp));
+ return (0);
+}
+
+/*
+ * VFS Operations.
+ *
+ * mount system call
+ */
+int
+lfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
+{
+ struct lwp *l = curlwp;
+ struct vnode *devvp;
+ struct ufs_args *args = data;
+ struct ufsmount *ump = NULL;
+ struct lfs *fs = NULL; /* LFS */
+ int error = 0, update;
+ mode_t accessmode;
+
+ if (*data_len < sizeof *args)
+ return EINVAL;
+
+ if (mp->mnt_flag & MNT_GETARGS) {
+ ump = VFSTOUFS(mp);
+ if (ump == NULL)
+ return EIO;
+ args->fspec = NULL;
+ *data_len = sizeof *args;
+ return 0;
+ }
+
+ update = mp->mnt_flag & MNT_UPDATE;
+
+ /* Check arguments */
+ if (args->fspec != NULL) {
+ /*
+ * Look up the name and verify that it's sane.
+ */
+ error = namei_simple_user(args->fspec,
+ NSM_FOLLOW_NOEMULROOT, &devvp);
+ if (error != 0)
+ return (error);
+
+ if (!update) {
+ /*
+ * Be sure this is a valid block device
+ */
+ if (devvp->v_type != VBLK)
+ error = ENOTBLK;
+ else if (bdevsw_lookup(devvp->v_rdev) == NULL)
+ error = ENXIO;
+ } else {
+ /*
+ * Be sure we're still naming the same device
+ * used for our initial mount
+ */
+ ump = VFSTOUFS(mp);
+ if (devvp != ump->um_devvp) {
+ if (devvp->v_rdev != ump->um_devvp->v_rdev)
+ error = EINVAL;
+ else {
+ vrele(devvp);
+ devvp = ump->um_devvp;
+ vref(devvp);
+ }
+ }
+ }
+ } else {
+ if (!update) {
+ /* New mounts must have a filename for the device */
+ return (EINVAL);
+ } else {
+ /* Use the extant mount */
+ ump = VFSTOUFS(mp);
+ devvp = ump->um_devvp;
+ vref(devvp);
+ }
+ }
+
+
+ /*
+ * If mount by non-root, then verify that user has necessary
+ * permissions on the device.
+ */
+ if (error == 0) {
+ accessmode = VREAD;
+ if (update ?
+ (mp->mnt_iflag & IMNT_WANTRDWR) != 0 :
+ (mp->mnt_flag & MNT_RDONLY) == 0)
+ accessmode |= VWRITE;
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = genfs_can_mount(devvp, accessmode, l->l_cred);
+ VOP_UNLOCK(devvp);
+ }
+
+ if (error) {
+ vrele(devvp);
+ return (error);
+ }
+
+ if (!update) {
+ int flags;
+
+ if (mp->mnt_flag & MNT_RDONLY)
+ flags = FREAD;
+ else
+ flags = FREAD|FWRITE;
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_OPEN(devvp, flags, FSCRED);
+ VOP_UNLOCK(devvp);
+ if (error)
+ goto fail;
+ error = lfs_mountfs(devvp, mp, l); /* LFS */
+ if (error) {
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ (void)VOP_CLOSE(devvp, flags, NOCRED);
+ VOP_UNLOCK(devvp);
+ goto fail;
+ }
+
+ ump = VFSTOUFS(mp);
+ fs = ump->um_lfs;
+ } else {
+ /*
+ * Update the mount.
+ */
+
+ /*
+ * The initial mount got a reference on this
+ * device, so drop the one obtained via
+ * namei(), above.
+ */
+ vrele(devvp);
+
+ ump = VFSTOUFS(mp);
+ fs = ump->um_lfs;
+ if (fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) {
+ /*
+ * Changing from read-only to read/write.
+ * Note in the superblocks that we're writing.
+ */
+ fs->lfs_ronly = 0;
+ if (fs->lfs_pflags & LFS_PF_CLEAN) {
+ fs->lfs_pflags &= ~LFS_PF_CLEAN;
+ lfs_writesuper(fs, fs->lfs_sboffs[0]);
+ lfs_writesuper(fs, fs->lfs_sboffs[1]);
+ }
+ }
+ if (args->fspec == NULL)
+ return EINVAL;
+ }
+
+ error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
+ UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
+ if (error == 0)
+ (void)strncpy(fs->lfs_fsmnt, mp->mnt_stat.f_mntonname,
+ sizeof(fs->lfs_fsmnt));
+ return error;
+
+fail:
+ vrele(devvp);
+ return (error);
+}
+
+
+/*
+ * Common code for mount and mountroot
+ * LFS specific
+ */
+int
+lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
+{
+ struct dlfs *tdfs, *dfs, *adfs;
+ struct lfs *fs;
+ struct ufsmount *ump;
+ struct vnode *vp;
+ struct buf *bp, *abp;
+ dev_t dev;
+ int error, i, ronly, fsbsize;
+ kauth_cred_t cred;
+ CLEANERINFO *cip;
+ SEGUSE *sup;
+ daddr_t sb_addr;
+
+ cred = l ? l->l_cred : NOCRED;
+
+ /*
+ * Flush out any old buffers remaining from a previous use.
+ */
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0);
+ VOP_UNLOCK(devvp);
+ if (error)
+ return (error);
+
+ ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+
+ /* Don't free random space on error. */
+ bp = NULL;
+ abp = NULL;
+ ump = NULL;
+
+ sb_addr = LFS_LABELPAD / DEV_BSIZE;
+ while (1) {
+ /* Read in the superblock. */
+ error = bread(devvp, sb_addr, LFS_SBPAD, cred, 0, &bp);
+ if (error)
+ goto out;
+ dfs = (struct dlfs *)bp->b_data;
+
+ /* Check the basics. */
+ if (dfs->dlfs_magic != LFS_MAGIC || dfs->dlfs_bsize > MAXBSIZE ||
+ dfs->dlfs_version > LFS_VERSION ||
+ dfs->dlfs_bsize < sizeof(struct dlfs)) {
+ DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock sanity failed\n"));
+ error = EINVAL; /* XXX needs translation */
+ goto out;
+ }
+ if (dfs->dlfs_inodefmt > LFS_MAXINODEFMT) {
+ DLOG((DLOG_MOUNT, "lfs_mountfs: unknown inode format %d\n",
+ dfs->dlfs_inodefmt));
+ error = EINVAL;
+ goto out;
+ }
+
+ if (dfs->dlfs_version == 1)
+ fsbsize = DEV_BSIZE;
+ else {
+ fsbsize = 1 << dfs->dlfs_ffshift;
+ /*
+ * Could be, if the frag size is large enough, that we
+ * don't have the "real" primary superblock. If that's
+ * the case, get the real one, and try again.
+ */
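+			/*
+			 * (For example, with dlfs_ffshift == 12, i.e. 4k
+			 * fragments, and DEV_BSHIFT == 9, dlfs_sboffs[] is
+			 * converted to DEV_BSIZE sectors by shifting left
+			 * by 3.)
+			 */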
+ if (sb_addr != (dfs->dlfs_sboffs[0] << (dfs->dlfs_ffshift - DEV_BSHIFT))) {
+ DLOG((DLOG_MOUNT, "lfs_mountfs: sb daddr"
+ " 0x%llx is not right, trying 0x%llx\n",
+ (long long)sb_addr,
+ (long long)(dfs->dlfs_sboffs[0] << (dfs->dlfs_ffshift - DEV_BSHIFT))));
+ sb_addr = dfs->dlfs_sboffs[0] << (dfs->dlfs_ffshift - DEV_BSHIFT);
+ brelse(bp, 0);
+ continue;
+ }
+ }
+ break;
+ }
+
+ /*
+ * Check the second superblock to see which is newer; then mount
+ * using the older of the two. This is necessary to ensure that
+ * the filesystem is valid if it was not unmounted cleanly.
+ */
+
+ if (dfs->dlfs_sboffs[1] &&
+ dfs->dlfs_sboffs[1] - LFS_LABELPAD / fsbsize > LFS_SBPAD / fsbsize)
+ {
+ error = bread(devvp, dfs->dlfs_sboffs[1] * (fsbsize / DEV_BSIZE),
+ LFS_SBPAD, cred, 0, &abp);
+ if (error)
+ goto out;
+ adfs = (struct dlfs *)abp->b_data;
+
+ if (dfs->dlfs_version == 1) {
+ /* 1s resolution comparison */
+ if (adfs->dlfs_tstamp < dfs->dlfs_tstamp)
+ tdfs = adfs;
+ else
+ tdfs = dfs;
+ } else {
+ /* monotonic infinite-resolution comparison */
+ if (adfs->dlfs_serial < dfs->dlfs_serial)
+ tdfs = adfs;
+ else
+ tdfs = dfs;
+ }
+
+ /* Check the basics. */
+ if (tdfs->dlfs_magic != LFS_MAGIC ||
+ tdfs->dlfs_bsize > MAXBSIZE ||
+ tdfs->dlfs_version > LFS_VERSION ||
+ tdfs->dlfs_bsize < sizeof(struct dlfs)) {
+ DLOG((DLOG_MOUNT, "lfs_mountfs: alt superblock"
+ " sanity failed\n"));
+ error = EINVAL; /* XXX needs translation */
+ goto out;
+ }
+ } else {
+ DLOG((DLOG_MOUNT, "lfs_mountfs: invalid alt superblock"
+ " daddr=0x%x\n", dfs->dlfs_sboffs[1]));
+ error = EINVAL;
+ goto out;
+ }
+
+ /* Allocate the mount structure, copy the superblock into it. */
+ fs = malloc(sizeof(struct lfs), M_UFSMNT, M_WAITOK | M_ZERO);
+ memcpy(&fs->lfs_dlfs, tdfs, sizeof(struct dlfs));
+
+ /* Compatibility */
+ if (fs->lfs_version < 2) {
+ fs->lfs_sumsize = LFS_V1_SUMMARY_SIZE;
+ fs->lfs_ibsize = fs->lfs_bsize;
+ fs->lfs_start = fs->lfs_sboffs[0];
+ fs->lfs_tstamp = fs->lfs_otstamp;
+ fs->lfs_fsbtodb = 0;
+ }
+ if (fs->lfs_resvseg == 0)
+ fs->lfs_resvseg = MIN(fs->lfs_minfreeseg - 1, \
+ MAX(MIN_RESV_SEGS, fs->lfs_minfreeseg / 2 + 1));
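+	/*
+	 * (For example, with lfs_minfreeseg == 20 this reserves
+	 * MIN(19, MAX(MIN_RESV_SEGS, 11)) segments.)
+	 */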
+
+ /*
+ * If we aren't going to be able to write meaningfully to this
+ * filesystem, and were not mounted readonly, bomb out now.
+ */
+ if (fsbtob(fs, LFS_NRESERVE(fs)) > LFS_MAX_BYTES && !ronly) {
+ DLOG((DLOG_MOUNT, "lfs_mount: to mount this filesystem read/write,"
+ " we need BUFPAGES >= %lld\n",
+ (long long)((bufmem_hiwater / bufmem_lowater) *
+ LFS_INVERSE_MAX_BYTES(
+ fsbtob(fs, LFS_NRESERVE(fs))) >> PAGE_SHIFT)));
+ free(fs, M_UFSMNT);
+ error = EFBIG; /* XXX needs translation */
+ goto out;
+ }
+
+ /* Before rolling forward, lock so vget will sleep for other procs */
+ if (l != NULL) {
+ fs->lfs_flags = LFS_NOTYET;
+ fs->lfs_rfpid = l->l_proc->p_pid;
+ }
+
+ ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
+ ump->um_lfs = fs;
+ ump->um_ops = &lfs_ufsops;
+ ump->um_fstype = UFS1;
+ if (sizeof(struct lfs) < LFS_SBPAD) { /* XXX why? */
+ brelse(bp, BC_INVAL);
+ brelse(abp, BC_INVAL);
+ } else {
+ brelse(bp, 0);
+ brelse(abp, 0);
+ }
+ bp = NULL;
+ abp = NULL;
+
+
+ /* Set up the I/O information */
+ fs->lfs_devbsize = DEV_BSIZE;
+ fs->lfs_iocount = 0;
+ fs->lfs_diropwait = 0;
+ fs->lfs_activesb = 0;
+ fs->lfs_uinodes = 0;
+ fs->lfs_ravail = 0;
+ fs->lfs_favail = 0;
+ fs->lfs_sbactive = 0;
+
+	/* Set up the ifile and lock flags */
+ fs->lfs_doifile = 0;
+ fs->lfs_writer = 0;
+ fs->lfs_dirops = 0;
+ fs->lfs_nadirop = 0;
+ fs->lfs_seglock = 0;
+ fs->lfs_pdflush = 0;
+ fs->lfs_sleepers = 0;
+ fs->lfs_pages = 0;
+ rw_init(&fs->lfs_fraglock);
+ rw_init(&fs->lfs_iflock);
+ cv_init(&fs->lfs_stopcv, "lfsstop");
+
+ /* Set the file system readonly/modify bits. */
+ fs->lfs_ronly = ronly;
+ if (ronly == 0)
+ fs->lfs_fmod = 1;
+
+ /* Initialize the mount structure. */
+ dev = devvp->v_rdev;
+ mp->mnt_data = ump;
+ mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev;
+ mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_LFS);
+ mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
+ mp->mnt_stat.f_namemax = LFS_MAXNAMLEN;
+ mp->mnt_stat.f_iosize = fs->lfs_bsize;
+ mp->mnt_flag |= MNT_LOCAL;
+ mp->mnt_fs_bshift = fs->lfs_bshift;
+ ump->um_flags = 0;
+ ump->um_mountp = mp;
+ ump->um_dev = dev;
+ ump->um_devvp = devvp;
+ ump->um_bptrtodb = fs->lfs_ffshift - DEV_BSHIFT;
+ ump->um_seqinc = fs->lfs_frag;
+ ump->um_nindir = fs->lfs_nindir;
+ ump->um_lognindir = ffs(fs->lfs_nindir) - 1;
+ for (i = 0; i < MAXQUOTAS; i++)
+ ump->um_quotas[i] = NULLVP;
+ ump->um_maxsymlinklen = fs->lfs_maxsymlinklen;
+ ump->um_dirblksiz = DIRBLKSIZ;
+ ump->um_maxfilesize = fs->lfs_maxfilesize;
+ if (ump->um_maxsymlinklen > 0)
+ mp->mnt_iflag |= IMNT_DTYPE;
+ devvp->v_specmountpoint = mp;
+
+ /* Set up reserved memory for pageout */
+ lfs_setup_resblks(fs);
+ /* Set up vdirop tailq */
+ TAILQ_INIT(&fs->lfs_dchainhd);
+ /* and paging tailq */
+ TAILQ_INIT(&fs->lfs_pchainhd);
+ /* and delayed segment accounting for truncation list */
+ LIST_INIT(&fs->lfs_segdhd);
+
+ /*
+	 * We use the ifile vnode for almost every operation.  Instead of
+	 * retrieving it from the hash table each time, we retrieve it here,
+	 * artificially increment the reference count, and keep a pointer
+	 * to it in the incore copy of the superblock.
+ */
+ if ((error = VFS_VGET(mp, LFS_IFILE_INUM, &vp)) != 0) {
+ DLOG((DLOG_MOUNT, "lfs_mountfs: ifile vget failed, error=%d\n", error));
+ goto out;
+ }
+ fs->lfs_ivnode = vp;
+ vref(vp);
+
+ /* Set up inode bitmap and order free list */
+ lfs_order_freelist(fs);
+
+ /* Set up segment usage flags for the autocleaner. */
+ fs->lfs_nactive = 0;
+ fs->lfs_suflags = (u_int32_t **)malloc(2 * sizeof(u_int32_t *),
+ M_SEGMENT, M_WAITOK);
+ fs->lfs_suflags[0] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t),
+ M_SEGMENT, M_WAITOK);
+ fs->lfs_suflags[1] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t),
+ M_SEGMENT, M_WAITOK);
+ memset(fs->lfs_suflags[1], 0, fs->lfs_nseg * sizeof(u_int32_t));
+ for (i = 0; i < fs->lfs_nseg; i++) {
+ int changed;
+
+ LFS_SEGENTRY(sup, fs, i, bp);
+ changed = 0;
+ if (!ronly) {
+ if (sup->su_nbytes == 0 &&
+ !(sup->su_flags & SEGUSE_EMPTY)) {
+ sup->su_flags |= SEGUSE_EMPTY;
+ ++changed;
+ } else if (!(sup->su_nbytes == 0) &&
+ (sup->su_flags & SEGUSE_EMPTY)) {
+ sup->su_flags &= ~SEGUSE_EMPTY;
+ ++changed;
+ }
+ if (sup->su_flags & (SEGUSE_ACTIVE|SEGUSE_INVAL)) {
+ sup->su_flags &= ~(SEGUSE_ACTIVE|SEGUSE_INVAL);
+ ++changed;
+ }
+ }
+ fs->lfs_suflags[0][i] = sup->su_flags;
+ if (changed)
+ LFS_WRITESEGENTRY(sup, fs, i, bp);
+ else
+ brelse(bp, 0);
+ }
+
+#ifdef LFS_KERNEL_RFW
+ lfs_roll_forward(fs, mp, l);
+#endif
+
+ /* If writing, sb is not clean; record in case of immediate crash */
+ if (!fs->lfs_ronly) {
+ fs->lfs_pflags &= ~LFS_PF_CLEAN;
+ lfs_writesuper(fs, fs->lfs_sboffs[0]);
+ lfs_writesuper(fs, fs->lfs_sboffs[1]);
+ }
+
+ /* Allow vget now that roll-forward is complete */
+ fs->lfs_flags &= ~(LFS_NOTYET);
+ wakeup(&fs->lfs_flags);
+
+ /*
+ * Initialize the ifile cleaner info with information from
+ * the superblock.
+ */
+ LFS_CLEANERINFO(cip, fs, bp);
+ cip->clean = fs->lfs_nclean;
+ cip->dirty = fs->lfs_nseg - fs->lfs_nclean;
+ cip->avail = fs->lfs_avail;
+ cip->bfree = fs->lfs_bfree;
+ (void) LFS_BWRITE_LOG(bp); /* Ifile */
+
+ /*
+ * Mark the current segment as ACTIVE, since we're going to
+ * be writing to it.
+ */
+ LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp);
+ sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
+ fs->lfs_nactive++;
+ LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp); /* Ifile */
+
+ /* Now that roll-forward is done, unlock the Ifile */
+ vput(vp);
+
+ /* Start the pagedaemon-anticipating daemon */
+ if (lfs_writer_daemon == 0 && kthread_create(PRI_BIO, 0, NULL,
+ lfs_writerd, NULL, NULL, "lfs_writer") != 0)
+ panic("fork lfs_writer");
+ /*
+ * XXX: Get extra reference to LFS vfsops. This prevents unload,
+ * but also prevents kernel panic due to text being unloaded
+ * from below lfs_writerd. When lfs_writerd can exit, remove
+ * this!!!
+ */
+ vfs_getopsbyname(MOUNT_LFS);
+
+ printf("WARNING: the log-structured file system is experimental\n"
+ "WARNING: it may cause system crashes and/or corrupt data\n");
+
+ return (0);
+
+out:
+ if (bp)
+ brelse(bp, 0);
+ if (abp)
+ brelse(abp, 0);
+ if (ump) {
+ free(ump->um_lfs, M_UFSMNT);
+ free(ump, M_UFSMNT);
+ mp->mnt_data = NULL;
+ }
+
+ return (error);
+}
+
+/*
+ * unmount system call
+ */
+int
+lfs_unmount(struct mount *mp, int mntflags)
+{
+ struct lwp *l = curlwp;
+ struct ufsmount *ump;
+ struct lfs *fs;
+ int error, flags, ronly;
+ vnode_t *vp;
+
+ flags = 0;
+ if (mntflags & MNT_FORCE)
+ flags |= FORCECLOSE;
+
+ ump = VFSTOUFS(mp);
+ fs = ump->um_lfs;
+
+ /* Two checkpoints */
+ lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
+ lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
+
+ /* wake up the cleaner so it can die */
+ lfs_wakeup_cleaner(fs);
+ mutex_enter(&lfs_lock);
+ while (fs->lfs_sleepers)
+ mtsleep(&fs->lfs_sleepers, PRIBIO + 1, "lfs_sleepers", 0,
+ &lfs_lock);
+ mutex_exit(&lfs_lock);
+
+#ifdef QUOTA
+ if ((error = quota1_umount(mp, flags)) != 0)
+ return (error);
+#endif
+ if ((error = vflush(mp, fs->lfs_ivnode, flags)) != 0)
+ return (error);
+ if ((error = VFS_SYNC(mp, 1, l->l_cred)) != 0)
+ return (error);
+ vp = fs->lfs_ivnode;
+ mutex_enter(vp->v_interlock);
+ if (LIST_FIRST(&vp->v_dirtyblkhd))
+ panic("lfs_unmount: still dirty blocks on ifile vnode");
+ mutex_exit(vp->v_interlock);
+
+ /* Explicitly write the superblock, to update serial and pflags */
+ fs->lfs_pflags |= LFS_PF_CLEAN;
+ lfs_writesuper(fs, fs->lfs_sboffs[0]);
+ lfs_writesuper(fs, fs->lfs_sboffs[1]);
+ mutex_enter(&lfs_lock);
+ while (fs->lfs_iocount)
+ mtsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs_umount", 0,
+ &lfs_lock);
+ mutex_exit(&lfs_lock);
+
+ /* Finish with the Ifile, now that we're done with it */
+ vgone(fs->lfs_ivnode);
+
+ ronly = !fs->lfs_ronly;
+ if (ump->um_devvp->v_type != VBAD)
+ ump->um_devvp->v_specmountpoint = NULL;
+ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_CLOSE(ump->um_devvp,
+ ronly ? FREAD : FREAD|FWRITE, NOCRED);
+ vput(ump->um_devvp);
+
+ /* Complain about page leakage */
+ if (fs->lfs_pages > 0)
+ printf("lfs_unmount: still claim %d pages (%d in subsystem)\n",
+ fs->lfs_pages, lfs_subsys_pages);
+
+ /* Free per-mount data structures */
+ free(fs->lfs_ino_bitmap, M_SEGMENT);
+ free(fs->lfs_suflags[0], M_SEGMENT);
+ free(fs->lfs_suflags[1], M_SEGMENT);
+ free(fs->lfs_suflags, M_SEGMENT);
+ lfs_free_resblks(fs);
+ cv_destroy(&fs->lfs_stopcv);
+ rw_destroy(&fs->lfs_fraglock);
+ rw_destroy(&fs->lfs_iflock);
+ free(fs, M_UFSMNT);
+ free(ump, M_UFSMNT);
+
+ mp->mnt_data = NULL;
+ mp->mnt_flag &= ~MNT_LOCAL;
+ return (error);
+}
+
+/*
+ * Get file system statistics.
+ *
+ * NB: We don't lock to access the superblock here, because it's not
+ * really that important if we get it wrong.
+ */
+int
+lfs_statvfs(struct mount *mp, struct statvfs *sbp)
+{
+ struct lfs *fs;
+ struct ufsmount *ump;
+
+ ump = VFSTOUFS(mp);
+ fs = ump->um_lfs;
+ if (fs->lfs_magic != LFS_MAGIC)
+ panic("lfs_statvfs: magic");
+
+ sbp->f_bsize = fs->lfs_bsize;
+ sbp->f_frsize = fs->lfs_fsize;
+ sbp->f_iosize = fs->lfs_bsize;
+ sbp->f_blocks = LFS_EST_NONMETA(fs) - VTOI(fs->lfs_ivnode)->i_lfs_effnblks;
+
+ sbp->f_bfree = LFS_EST_BFREE(fs);
+ KASSERT(sbp->f_bfree <= fs->lfs_dsize);
+#if 0
+ if (sbp->f_bfree < 0)
+ sbp->f_bfree = 0;
+#endif
+
+ sbp->f_bresvd = LFS_EST_RSVD(fs);
+ if (sbp->f_bfree > sbp->f_bresvd)
+ sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd;
+ else
+ sbp->f_bavail = 0;
+
+ sbp->f_files = fs->lfs_bfree / btofsb(fs, fs->lfs_ibsize) * INOPB(fs);
+ sbp->f_ffree = sbp->f_files - fs->lfs_nfiles;
+ sbp->f_favail = sbp->f_ffree;
+ sbp->f_fresvd = 0;
+ copy_statvfs_info(sbp, mp);
+ return (0);
+}
+
+/*
+ * Go through the disk queues to initiate sandbagged IO;
+ * go through the inodes to write those that have been modified;
+ * initiate the writing of the super block if it has been modified.
+ *
+ * Note: we are always called with the filesystem marked `MPBUSY'.
+ */
+int
+lfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
+{
+ int error;
+ struct lfs *fs;
+
+ fs = VFSTOUFS(mp)->um_lfs;
+ if (fs->lfs_ronly)
+ return 0;
+
+ /* Snapshots should not hose the syncer */
+ /*
+ * XXX Sync can block here anyway, since we don't have a very
+ * XXX good idea of how much data is pending. If it's more
+ * XXX than a segment and lfs_nextseg is close to the end of
+ * XXX the log, we'll likely block.
+ */
+ mutex_enter(&lfs_lock);
+ if (fs->lfs_nowrap && fs->lfs_nextseg < fs->lfs_curseg) {
+ mutex_exit(&lfs_lock);
+ return 0;
+ }
+ mutex_exit(&lfs_lock);
+
+ lfs_writer_enter(fs, "lfs_dirops");
+
+ /* All syncs must be checkpoints until roll-forward is implemented. */
+ DLOG((DLOG_FLUSH, "lfs_sync at 0x%x\n", fs->lfs_offset));
+ error = lfs_segwrite(mp, SEGM_CKP | (waitfor ? SEGM_SYNC : 0));
+ lfs_writer_leave(fs);
+#ifdef QUOTA
+ qsync(mp);
+#endif
+ return (error);
+}
+
+/*
+ * Look up an LFS dinode number to find its incore vnode. If not already
+ * in core, read it in from the specified device. Return the inode locked.
+ * Detection and handling of mount points must be done by the calling routine.
+ */
+int
+lfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
+{
+ struct lfs *fs;
+ struct ufs1_dinode *dip;
+ struct inode *ip;
+ struct buf *bp;
+ struct ifile *ifp;
+ struct vnode *vp;
+ struct ufsmount *ump;
+ daddr_t daddr;
+ dev_t dev;
+ int error, retries;
+ struct timespec ts;
+
+ memset(&ts, 0, sizeof ts); /* XXX gcc */
+
+ ump = VFSTOUFS(mp);
+ dev = ump->um_dev;
+ fs = ump->um_lfs;
+
+ /*
+ * If the filesystem is not completely mounted yet, suspend
+ * any access requests (wait for roll-forward to complete).
+ */
+ mutex_enter(&lfs_lock);
+ while ((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid != fs->lfs_rfpid)
+ mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_notyet", 0,
+ &lfs_lock);
+ mutex_exit(&lfs_lock);
+
+retry:
+ if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL)
+ return (0);
+
+ error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, NULL, &vp);
+ if (error) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ mutex_enter(&ufs_hashlock);
+ if (ufs_ihashget(dev, ino, 0) != NULL) {
+ mutex_exit(&ufs_hashlock);
+ ungetnewvnode(vp);
+ goto retry;
+ }
+
+ /* Translate the inode number to a disk address. */
+ if (ino == LFS_IFILE_INUM)
+ daddr = fs->lfs_idaddr;
+ else {
+ /* XXX bounds-check this too */
+ LFS_IENTRY(ifp, fs, ino, bp);
+ daddr = ifp->if_daddr;
+ if (fs->lfs_version > 1) {
+ ts.tv_sec = ifp->if_atime_sec;
+ ts.tv_nsec = ifp->if_atime_nsec;
+ }
+
+ brelse(bp, 0);
+ if (daddr == LFS_UNUSED_DADDR) {
+ *vpp = NULLVP;
+ mutex_exit(&ufs_hashlock);
+ ungetnewvnode(vp);
+ return (ENOENT);
+ }
+ }
+
+ /* Allocate/init new vnode/inode. */
+ lfs_vcreate(mp, ino, vp);
+
+ /*
+ * Put it onto its hash chain and lock it so that other requests for
+ * this inode will block if they arrive while we are sleeping waiting
+ * for old data structures to be purged or for the contents of the
+ * disk portion of this inode to be read.
+ */
+ ip = VTOI(vp);
+ ufs_ihashins(ip);
+ mutex_exit(&ufs_hashlock);
+
+ /*
+ * XXX
+ * This may not need to be here, logically it should go down with
+ * the i_devvp initialization.
+ * Ask Kirk.
+ */
+ ip->i_lfs = ump->um_lfs;
+
+ /* Read in the disk contents for the inode, copy into the inode. */
+ retries = 0;
+ again:
+ error = bread(ump->um_devvp, fsbtodb(fs, daddr),
+ (fs->lfs_version == 1 ? fs->lfs_bsize : fs->lfs_ibsize),
+ NOCRED, 0, &bp);
+ if (error) {
+ /*
+ * The inode does not contain anything useful, so it would
+ * be misleading to leave it on its hash chain. With mode
+ * still zero, it will be unlinked and returned to the free
+ * list by vput().
+ */
+ vput(vp);
+ brelse(bp, 0);
+ *vpp = NULL;
+ return (error);
+ }
+
+ dip = lfs_ifind(fs, ino, bp);
+ if (dip == NULL) {
+ /* Assume write has not completed yet; try again */
+ brelse(bp, BC_INVAL);
+ ++retries;
+ if (retries > LFS_IFIND_RETRIES) {
+#ifdef DEBUG
+			/* If the seglock is held, look at the bpp to see
+			   what is there anyway */
+ mutex_enter(&lfs_lock);
+ if (fs->lfs_seglock > 0) {
+ struct buf **bpp;
+ struct ufs1_dinode *dp;
+ int i;
+
+ for (bpp = fs->lfs_sp->bpp;
+ bpp != fs->lfs_sp->cbpp; ++bpp) {
+ if ((*bpp)->b_vp == fs->lfs_ivnode &&
+ bpp != fs->lfs_sp->bpp) {
+ /* Inode block */
+ printf("lfs_vget: block 0x%" PRIx64 ": ",
+ (*bpp)->b_blkno);
+ dp = (struct ufs1_dinode *)(*bpp)->b_data;
+ for (i = 0; i < INOPB(fs); i++)
+ if (dp[i].di_u.inumber)
+ printf("%d ", dp[i].di_u.inumber);
+ printf("\n");
+ }
+ }
+ }
+ mutex_exit(&lfs_lock);
+#endif /* DEBUG */
+ panic("lfs_vget: dinode not found");
+ }
+ mutex_enter(&lfs_lock);
+ if (fs->lfs_iocount) {
+ DLOG((DLOG_VNODE, "lfs_vget: dinode %d not found, retrying...\n", ino));
+ (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
+ "lfs ifind", 1, &lfs_lock);
+ } else
+ retries = LFS_IFIND_RETRIES;
+ mutex_exit(&lfs_lock);
+ goto again;
+ }
+ *ip->i_din.ffs1_din = *dip;
+ brelse(bp, 0);
+
+ if (fs->lfs_version > 1) {
+ ip->i_ffs1_atime = ts.tv_sec;
+ ip->i_ffs1_atimensec = ts.tv_nsec;
+ }
+
+ lfs_vinit(mp, &vp);
+
+ *vpp = vp;
+
+ KASSERT(VOP_ISLOCKED(vp));
+
+ return (0);
+}
+
+/*
+ * File handle to vnode
+ */
+int
+lfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
+{
+ struct lfid lfh;
+ struct buf *bp;
+ IFILE *ifp;
+ int32_t daddr;
+ struct lfs *fs;
+ vnode_t *vp;
+
+ if (fhp->fid_len != sizeof(struct lfid))
+ return EINVAL;
+
+ memcpy(&lfh, fhp, sizeof(lfh));
+ if (lfh.lfid_ino < LFS_IFILE_INUM)
+ return ESTALE;
+
+ fs = VFSTOUFS(mp)->um_lfs;
+ if (lfh.lfid_ident != fs->lfs_ident)
+ return ESTALE;
+
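+	/*
+	 * Reject inode numbers past the end of the ifile: its size in
+	 * blocks, minus the cleanerinfo and segment-table blocks, times
+	 * the number of inode entries per block, bounds the largest
+	 * valid inode number.
+	 */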
+ if (lfh.lfid_ino >
+ ((VTOI(fs->lfs_ivnode)->i_ffs1_size >> fs->lfs_bshift) -
+ fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb)
+ return ESTALE;
+
+ mutex_enter(&ufs_ihash_lock);
+ vp = ufs_ihashlookup(VFSTOUFS(mp)->um_dev, lfh.lfid_ino);
+ mutex_exit(&ufs_ihash_lock);
+ if (vp == NULL) {
+ LFS_IENTRY(ifp, fs, lfh.lfid_ino, bp);
+ daddr = ifp->if_daddr;
+ brelse(bp, 0);
+ if (daddr == LFS_UNUSED_DADDR)
+ return ESTALE;
+ }
+
+ return (ufs_fhtovp(mp, &lfh.lfid_ufid, vpp));
+}
+
+/*
+ * Vnode pointer to File handle
+ */
+/* ARGSUSED */
+int
+lfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
+{
+ struct inode *ip;
+ struct lfid lfh;
+
+ if (*fh_size < sizeof(struct lfid)) {
+ *fh_size = sizeof(struct lfid);
+ return E2BIG;
+ }
+ *fh_size = sizeof(struct lfid);
+ ip = VTOI(vp);
+ memset(&lfh, 0, sizeof(lfh));
+ lfh.lfid_len = sizeof(struct lfid);
+ lfh.lfid_ino = ip->i_number;
+ lfh.lfid_gen = ip->i_gen;
+ lfh.lfid_ident = ip->i_lfs->lfs_ident;
+ memcpy(fhp, &lfh, sizeof(lfh));
+ return (0);
+}
+
+/*
+ * ufs_bmaparray callback function for writing.
+ *
+ * Since blocks will be written to the new segment anyway,
+ * we don't care about current daddr of them.
+ */
+static bool
+lfs_issequential_hole(const struct ufsmount *ump,
+ daddr_t daddr0, daddr_t daddr1)
+{
+ daddr0 = (daddr_t)((int32_t)daddr0); /* XXX ondisk32 */
+ daddr1 = (daddr_t)((int32_t)daddr1); /* XXX ondisk32 */
+
+ KASSERT(daddr0 == UNWRITTEN ||
+ (0 <= daddr0 && daddr0 <= LFS_MAX_DADDR));
+ KASSERT(daddr1 == UNWRITTEN ||
+ (0 <= daddr1 && daddr1 <= LFS_MAX_DADDR));
+
+ /* NOTE: all we want to know here is 'hole or not'. */
+ /* NOTE: UNASSIGNED is converted to 0 by ufs_bmaparray. */
+
+ /*
+ * treat UNWRITTENs and all resident blocks as 'contiguous'
+ */
+ if (daddr0 != 0 && daddr1 != 0)
+ return true;
+
+ /*
+ * both are in hole?
+ */
+ if (daddr0 == 0 && daddr1 == 0)
+ return true; /* all holes are 'contiguous' for us. */
+
+ return false;
+}
+
+/*
+ * lfs_gop_write functions exactly like genfs_gop_write, except that
+ * (1) it requires the seglock to be held by its caller, and sp->fip
+ * to be properly initialized (it will return without re-initializing
+ * sp->fip, and without calling lfs_writeseg).
+ * (2) it uses the remaining space in the segment, rather than VOP_BMAP,
+ * to determine how large a block it can write at once (though it does
+ * still use VOP_BMAP to find holes in the file);
+ * (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks
+ * (leaving lfs_writeseg to deal with the cluster blocks, so we might
+ * now have clusters of clusters, ick.)
+ */
+static int
+lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
+ int flags)
+{
+ int i, error, run, haveeof = 0;
+ int fs_bshift;
+ vaddr_t kva;
+ off_t eof, offset, startoffset = 0;
+ size_t bytes, iobytes, skipbytes;
+ bool async = (flags & PGO_SYNCIO) == 0;
+ daddr_t lbn, blkno;
+ struct vm_page *pg;
+ struct buf *mbp, *bp;
+ struct vnode *devvp = VTOI(vp)->i_devvp;
+ struct inode *ip = VTOI(vp);
+ struct lfs *fs = ip->i_lfs;
+ struct segment *sp = fs->lfs_sp;
+ UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist);
+
+ ASSERT_SEGLOCK(fs);
+
+ /* The Ifile lives in the buffer cache */
+ KASSERT(vp != fs->lfs_ivnode);
+
+ /*
+ * We don't want to fill the disk before the cleaner has a chance
+ * to make room for us. If we're in danger of doing that, fail
+ * with EAGAIN. The caller will have to notice this, unlock
+ * so the cleaner can run, relock and try again.
+ *
+ * We must write everything, however, if our vnode is being
+ * reclaimed.
+ */
+ if (LFS_STARVED_FOR_SEGS(fs) && vp != fs->lfs_flushvp)
+ goto tryagain;
+
+ /*
+ * Sometimes things slip past the filters in lfs_putpages,
+ * and the pagedaemon tries to write pages---problem is
+ * that the pagedaemon never acquires the segment lock.
+ *
+ * Alternatively, pages that were clean when we called
+ * genfs_putpages may have become dirty in the meantime. In this
+ * case the segment header is not properly set up for blocks
+ * to be added to it.
+ *
+ * Unbusy and unclean the pages, and put them on the ACTIVE
+ * queue under the hypothesis that they couldn't have got here
+ * unless they were modified *quite* recently.
+ *
+ * XXXUBC that last statement is an oversimplification of course.
+ */
+ if (!LFS_SEGLOCK_HELD(fs) ||
+ (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) ||
+ (pgs[0]->offset & fs->lfs_bmask) != 0) {
+ goto tryagain;
+ }
+
+ UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x",
+ vp, pgs, npages, flags);
+
+ GOP_SIZE(vp, vp->v_size, &eof, 0);
+ haveeof = 1;
+
+ if (vp->v_type == VREG)
+ fs_bshift = vp->v_mount->mnt_fs_bshift;
+ else
+ fs_bshift = DEV_BSHIFT;
+ error = 0;
+ pg = pgs[0];
+ startoffset = pg->offset;
+ KASSERT(eof >= 0);
+
+ if (startoffset >= eof) {
+ goto tryagain;
+ } else
+ bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
+ skipbytes = 0;
+
+ KASSERT(bytes != 0);
+
+ /* Swap PG_DELWRI for PG_PAGEOUT */
+ for (i = 0; i < npages; i++) {
+ if (pgs[i]->flags & PG_DELWRI) {
+ KASSERT(!(pgs[i]->flags & PG_PAGEOUT));
+ pgs[i]->flags &= ~PG_DELWRI;
+ pgs[i]->flags |= PG_PAGEOUT;
+ uvm_pageout_start(1);
+ mutex_enter(&uvm_pageqlock);
+ uvm_pageunwire(pgs[i]);
+ mutex_exit(&uvm_pageqlock);
+ }
+ }
+
+ /*
+ * Check to make sure we're starting on a block boundary.
+ * We'll check later to make sure we always write entire
+ * blocks (or fragments).
+ */
+ if (startoffset & fs->lfs_bmask)
+ printf("%" PRId64 " & %" PRId64 " = %" PRId64 "\n",
+ startoffset, fs->lfs_bmask,
+ startoffset & fs->lfs_bmask);
+ KASSERT((startoffset & fs->lfs_bmask) == 0);
+ if (bytes & fs->lfs_ffmask) {
+ printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes);
+ panic("lfs_gop_write: non-integer blocks");
+ }
+
+ /*
+ * We could deadlock here on pager_map with UVMPAGER_MAPIN_WAITOK.
+ * If we would, write what we have and try again. If we don't
+ * have anything to write, we'll have to sleep.
+ */
+ if ((kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
+ (((SEGSUM *)(sp->segsum))->ss_nfinfo < 1 ?
+ UVMPAGER_MAPIN_WAITOK : 0))) == 0x0) {
+ DLOG((DLOG_PAGE, "lfs_gop_write: forcing write\n"));
+#if 0
+ " with nfinfo=%d at offset 0x%x\n",
+ (int)((SEGSUM *)(sp->segsum))->ss_nfinfo,
+ (unsigned)fs->lfs_offset));
+#endif
+ lfs_updatemeta(sp);
+ lfs_release_finfo(fs);
+ (void) lfs_writeseg(fs, sp);
+
+ lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
+
+ /*
+ * Having given up all of the pager_map we were holding,
+ * we can now wait for aiodoned to reclaim it for us
+ * without fear of deadlock.
+ */
+ kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
+ UVMPAGER_MAPIN_WAITOK);
+ }
+
+ mbp = getiobuf(NULL, true);
+ UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x",
+ vp, mbp, vp->v_numoutput, bytes);
+ mbp->b_bufsize = npages << PAGE_SHIFT;
+ mbp->b_data = (void *)kva;
+ mbp->b_resid = mbp->b_bcount = bytes;
+ mbp->b_cflags = BC_BUSY|BC_AGE;
+ mbp->b_iodone = uvm_aio_biodone;
+
+ bp = NULL;
+ for (offset = startoffset;
+ bytes > 0;
+ offset += iobytes, bytes -= iobytes) {
+ lbn = offset >> fs_bshift;
+ error = ufs_bmaparray(vp, lbn, &blkno, NULL, NULL, &run,
+ lfs_issequential_hole);
+ if (error) {
+ UVMHIST_LOG(ubchist, "ufs_bmaparray() -> %d",
+ error,0,0,0);
+ skipbytes += bytes;
+ bytes = 0;
+ break;
+ }
+
+ iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
+ bytes);
+ if (blkno == (daddr_t)-1) {
+ skipbytes += iobytes;
+ continue;
+ }
+
+ /*
+ * Discover how much we can really pack into this buffer.
+ */
+ /* If no room in the current segment, finish it up */
+ if (sp->sum_bytes_left < sizeof(int32_t) ||
+ sp->seg_bytes_left < (1 << fs->lfs_bshift)) {
+ int vers;
+
+ lfs_updatemeta(sp);
+ vers = sp->fip->fi_version;
+ lfs_release_finfo(fs);
+ (void) lfs_writeseg(fs, sp);
+
+ lfs_acquire_finfo(fs, ip->i_number, vers);
+ }
+ /* Check both for space in segment and space in segsum */
+ iobytes = MIN(iobytes, (sp->seg_bytes_left >> fs_bshift)
+ << fs_bshift);
+ iobytes = MIN(iobytes, (sp->sum_bytes_left / sizeof(int32_t))
+ << fs_bshift);
+ KASSERT(iobytes > 0);
+
+ /* if it's really one i/o, don't make a second buf */
+ if (offset == startoffset && iobytes == bytes) {
+ bp = mbp;
+ /*
+ * All the LFS output is done by the segwriter. It
+ * will increment numoutput by one for all the bufs it
+			 * receives. However this buffer needs one extra to
+ * account for aiodone.
+ */
+ mutex_enter(vp->v_interlock);
+ vp->v_numoutput++;
+ mutex_exit(vp->v_interlock);
+ } else {
+ bp = getiobuf(NULL, true);
+ UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
+ vp, bp, vp->v_numoutput, 0);
+ nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes);
+ /*
+ * LFS doesn't like async I/O here, dies with
+			 * an assert in lfs_bwrite(). Is that assert
+			 * valid? I retained non-async behaviour when
+			 * converting this to use nestiobuf --pooka
+ */
+ bp->b_flags &= ~B_ASYNC;
+ }
+
+ /* XXX This is silly ... is this necessary? */
+ mutex_enter(&bufcache_lock);
+ mutex_enter(vp->v_interlock);
+ bgetvp(vp, bp);
+ mutex_exit(vp->v_interlock);
+ mutex_exit(&bufcache_lock);
+
+ bp->b_lblkno = lblkno(fs, offset);
+ bp->b_private = mbp;
+ if (devvp->v_type == VBLK) {
+ bp->b_dev = devvp->v_rdev;
+ }
+ VOP_BWRITE(bp->b_vp, bp);
+ while (lfs_gatherblock(sp, bp, NULL))
+ continue;
+ }
+
+ nestiobuf_done(mbp, skipbytes, error);
+ if (skipbytes) {
+ UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0,0,0);
+ }
+ UVMHIST_LOG(ubchist, "returning 0", 0,0,0,0);
+
+ if (!async) {
+ /* Start a segment write. */
+ UVMHIST_LOG(ubchist, "flushing", 0,0,0,0);
+ mutex_enter(&lfs_lock);
+ lfs_flush(fs, 0, 1);
+ mutex_exit(&lfs_lock);
+ }
+ return (0);
+
+ tryagain:
+ /*
+ * We can't write the pages, for whatever reason.
+ * Clean up after ourselves, and make the caller try again.
+ */
+ mutex_enter(vp->v_interlock);
+
+ /* Tell why we're here, if we know */
+ if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) {
+ DLOG((DLOG_PAGE, "lfs_gop_write: clean pages dirtied\n"));
+ } else if ((pgs[0]->offset & fs->lfs_bmask) != 0) {
+ DLOG((DLOG_PAGE, "lfs_gop_write: not on block boundary\n"));
+ } else if (haveeof && startoffset >= eof) {
+ DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
+ " eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
+ pgs[0]->offset, eof, npages));
+ } else if (LFS_STARVED_FOR_SEGS(fs)) {
+ DLOG((DLOG_PAGE, "lfs_gop_write: avail too low\n"));
+ } else {
+ DLOG((DLOG_PAGE, "lfs_gop_write: seglock not held\n"));
+ }
+
+ mutex_enter(&uvm_pageqlock);
+ for (i = 0; i < npages; i++) {
+ pg = pgs[i];
+
+ if (pg->flags & PG_PAGEOUT)
+ uvm_pageout_done(1);
+ if (pg->flags & PG_DELWRI) {
+ uvm_pageunwire(pg);
+ }
+ uvm_pageactivate(pg);
+ pg->flags &= ~(PG_CLEAN|PG_DELWRI|PG_PAGEOUT|PG_RELEASED);
+ DLOG((DLOG_PAGE, "pg[%d] = %p (vp %p off %" PRIx64 ")\n", i, pg,
+ vp, pg->offset));
+ DLOG((DLOG_PAGE, "pg[%d]->flags = %x\n", i, pg->flags));
+ DLOG((DLOG_PAGE, "pg[%d]->pqflags = %x\n", i, pg->pqflags));
+ DLOG((DLOG_PAGE, "pg[%d]->uanon = %p\n", i, pg->uanon));
+ DLOG((DLOG_PAGE, "pg[%d]->uobject = %p\n", i, pg->uobject));
+ DLOG((DLOG_PAGE, "pg[%d]->wire_count = %d\n", i,
+ pg->wire_count));
+ DLOG((DLOG_PAGE, "pg[%d]->loan_count = %d\n", i,
+ pg->loan_count));
+ }
+ /* uvm_pageunbusy takes care of PG_BUSY, PG_WANTED */
+ uvm_page_unbusy(pgs, npages);
+ mutex_exit(&uvm_pageqlock);
+ mutex_exit(vp->v_interlock);
+ return EAGAIN;
+}
+
+/*
+ * finish vnode/inode initialization.
+ * used by lfs_vget and lfs_fastvget.
+ */
+void
+lfs_vinit(struct mount *mp, struct vnode **vpp)
+{
+ struct vnode *vp = *vpp;
+ struct inode *ip = VTOI(vp);
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct lfs *fs = ump->um_lfs;
+ int i;
+
+ ip->i_mode = ip->i_ffs1_mode;
+ ip->i_nlink = ip->i_ffs1_nlink;
+ ip->i_lfs_osize = ip->i_size = ip->i_ffs1_size;
+ ip->i_flags = ip->i_ffs1_flags;
+ ip->i_gen = ip->i_ffs1_gen;
+ ip->i_uid = ip->i_ffs1_uid;
+ ip->i_gid = ip->i_ffs1_gid;
+
+ ip->i_lfs_effnblks = ip->i_ffs1_blocks;
+ ip->i_lfs_odnlink = ip->i_ffs1_nlink;
+
+ /*
+ * Initialize the vnode from the inode, check for aliases. In all
+	 * cases re-init ip, since the underlying vnode/inode may have changed.
+ */
+ ufs_vinit(mp, lfs_specop_p, lfs_fifoop_p, &vp);
+ ip = VTOI(vp);
+
+ memset(ip->i_lfs_fragsize, 0, NDADDR * sizeof(*ip->i_lfs_fragsize));
+ if (vp->v_type != VLNK || ip->i_size >= ip->i_ump->um_maxsymlinklen) {
+#ifdef DEBUG
+ for (i = (ip->i_size + fs->lfs_bsize - 1) >> fs->lfs_bshift;
+ i < NDADDR; i++) {
+ if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
+ i == 0)
+ continue;
+ if (ip->i_ffs1_db[i] != 0) {
+inconsistent:
+ lfs_dump_dinode(ip->i_din.ffs1_din);
+ panic("inconsistent inode");
+ }
+ }
+ for ( ; i < NDADDR + NIADDR; i++) {
+ if (ip->i_ffs1_ib[i - NDADDR] != 0) {
+ goto inconsistent;
+ }
+ }
+#endif /* DEBUG */
+ for (i = 0; i < NDADDR; i++)
+ if (ip->i_ffs1_db[i] != 0)
+ ip->i_lfs_fragsize[i] = blksize(fs, ip, i);
+ }
+
+#ifdef DIAGNOSTIC
+ if (vp->v_type == VNON) {
+# ifdef DEBUG
+ lfs_dump_dinode(ip->i_din.ffs1_din);
+# endif
+ panic("lfs_vinit: ino %llu is type VNON! (ifmt=%o)\n",
+ (unsigned long long)ip->i_number,
+ (ip->i_mode & IFMT) >> 12);
+ }
+#endif /* DIAGNOSTIC */
+
+ /*
+ * Finish inode initialization now that aliasing has been resolved.
+ */
+
+ ip->i_devvp = ump->um_devvp;
+ vref(ip->i_devvp);
+ genfs_node_init(vp, &lfs_genfsops);
+ uvm_vnp_setsize(vp, ip->i_size);
+
+ /* Initialize hiblk from file size */
+ ip->i_lfs_hiblk = lblkno(ip->i_lfs, ip->i_size + ip->i_lfs->lfs_bsize - 1) - 1;
+
+ *vpp = vp;
+}
+
+/*
+ * Resize the filesystem to contain the specified number of segments.
+ */
+int
+lfs_resize_fs(struct lfs *fs, int newnsegs)
+{
+ SEGUSE *sup;
+ struct buf *bp, *obp;
+ daddr_t olast, nlast, ilast, noff, start, end;
+ struct vnode *ivp;
+ struct inode *ip;
+ int error, badnews, inc, oldnsegs;
+ int sbbytes, csbbytes, gain, cgain;
+ int i;
+
+ /* Only support v2 and up */
+ if (fs->lfs_version < 2)
+ return EOPNOTSUPP;
+
+ /* If we're doing nothing, do it fast */
+ oldnsegs = fs->lfs_nseg;
+ if (newnsegs == oldnsegs)
+ return 0;
+
+ /* We always have to have two superblocks */
+ if (newnsegs <= dtosn(fs, fs->lfs_sboffs[1]))
+ return EFBIG;
+
+ ivp = fs->lfs_ivnode;
+ ip = VTOI(ivp);
+ error = 0;
+
+ /* Take the segment lock so no one else calls lfs_newseg() */
+ lfs_seglock(fs, SEGM_PROT);
+
+ /*
+ * Make sure the segments we're going to be losing, if any,
+ * are in fact empty. We hold the seglock, so their status
+ * cannot change underneath us. Count the superblocks we lose,
+ * while we're at it.
+ */
+ sbbytes = csbbytes = 0;
+ cgain = 0;
+ for (i = newnsegs; i < oldnsegs; i++) {
+ LFS_SEGENTRY(sup, fs, i, bp);
+ badnews = sup->su_nbytes || !(sup->su_flags & SEGUSE_INVAL);
+ if (sup->su_flags & SEGUSE_SUPERBLOCK)
+ sbbytes += LFS_SBPAD;
+ if (!(sup->su_flags & SEGUSE_DIRTY)) {
+ ++cgain;
+ if (sup->su_flags & SEGUSE_SUPERBLOCK)
+ csbbytes += LFS_SBPAD;
+ }
+ brelse(bp, 0);
+ if (badnews) {
+ error = EBUSY;
+ goto out;
+ }
+ }
+
+ /* Note old and new segment table endpoints, and old ifile size */
+ olast = fs->lfs_cleansz + fs->lfs_segtabsz;
+ nlast = howmany(newnsegs, fs->lfs_sepb) + fs->lfs_cleansz;
+ ilast = ivp->v_size >> fs->lfs_bshift;
+ noff = nlast - olast;
+
+ /*
+ * Make sure no one can use the Ifile while we change it around.
+ * Even after taking the iflock we need to make sure no one still
+ * is holding Ifile buffers, so we get each one, to drain them.
+ * (XXX this could be done better.)
+ */
+ rw_enter(&fs->lfs_iflock, RW_WRITER);
+ vn_lock(ivp, LK_EXCLUSIVE | LK_RETRY);
+ for (i = 0; i < ilast; i++) {
+ bread(ivp, i, fs->lfs_bsize, NOCRED, 0, &bp);
+ brelse(bp, 0);
+ }
+
+ /* Allocate new Ifile blocks */
+ for (i = ilast; i < ilast + noff; i++) {
+ if (lfs_balloc(ivp, i * fs->lfs_bsize, fs->lfs_bsize, NOCRED, 0,
+ &bp) != 0)
+ panic("balloc extending ifile");
+ memset(bp->b_data, 0, fs->lfs_bsize);
+ VOP_BWRITE(bp->b_vp, bp);
+ }
+
+ /* Register new ifile size */
+ ip->i_size += noff * fs->lfs_bsize;
+ ip->i_ffs1_size = ip->i_size;
+ uvm_vnp_setsize(ivp, ip->i_size);
+
+ /* Copy the inode table to its new position */
+ if (noff != 0) {
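+		/*
+		 * Copy in an order that never overwrites a source block
+		 * before it has been read: forward when the table moves
+		 * toward the start of the Ifile, backward when it moves
+		 * toward the end.
+		 */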
+ if (noff < 0) {
+ start = nlast;
+ end = ilast + noff;
+ inc = 1;
+ } else {
+ start = ilast + noff - 1;
+ end = nlast - 1;
+ inc = -1;
+ }
+ for (i = start; i != end; i += inc) {
+ if (bread(ivp, i, fs->lfs_bsize, NOCRED,
+ B_MODIFY, &bp) != 0)
+ panic("resize: bread dst blk failed");
+ if (bread(ivp, i - noff, fs->lfs_bsize,
+ NOCRED, 0, &obp))
+ panic("resize: bread src blk failed");
+ memcpy(bp->b_data, obp->b_data, fs->lfs_bsize);
+ VOP_BWRITE(bp->b_vp, bp);
+ brelse(obp, 0);
+ }
+ }
+
+ /* If we are expanding, write the new empty SEGUSE entries */
+ if (newnsegs > oldnsegs) {
+ for (i = oldnsegs; i < newnsegs; i++) {
+ if ((error = bread(ivp, i / fs->lfs_sepb +
+ fs->lfs_cleansz, fs->lfs_bsize,
+ NOCRED, B_MODIFY, &bp)) != 0)
+ panic("lfs: ifile read: %d", error);
+ while ((i + 1) % fs->lfs_sepb && i < newnsegs) {
+ sup = &((SEGUSE *)bp->b_data)[i % fs->lfs_sepb];
+ memset(sup, 0, sizeof(*sup));
+ i++;
+ }
+ VOP_BWRITE(bp->b_vp, bp);
+ }
+ }
+
+ /* Zero out unused superblock offsets */
+ for (i = 2; i < LFS_MAXNUMSB; i++)
+ if (dtosn(fs, fs->lfs_sboffs[i]) >= newnsegs)
+ fs->lfs_sboffs[i] = 0x0;
+
+ /*
+ * Correct superblock entries that depend on fs size.
+ * The computations of these are as follows:
+ *
+ * size = segtod(fs, nseg)
+ * dsize = segtod(fs, nseg - minfreeseg) - btofsb(#super * LFS_SBPAD)
+ * bfree = dsize - btofsb(fs, bsize * nseg / 2) - blocks_actually_used
+ * avail = segtod(fs, nclean) - btofsb(#clean_super * LFS_SBPAD)
+ * + (segtod(fs, 1) - (offset - curseg))
+ * - segtod(fs, minfreeseg - (minfreeseg / 2))
+ *
+ * XXX - we should probably adjust minfreeseg as well.
+ */
+ gain = (newnsegs - oldnsegs);
+ fs->lfs_nseg = newnsegs;
+ fs->lfs_segtabsz = nlast - fs->lfs_cleansz;
+ fs->lfs_size += gain * btofsb(fs, fs->lfs_ssize);
+ fs->lfs_dsize += gain * btofsb(fs, fs->lfs_ssize) - btofsb(fs, sbbytes);
+ fs->lfs_bfree += gain * btofsb(fs, fs->lfs_ssize) - btofsb(fs, sbbytes)
+ - gain * btofsb(fs, fs->lfs_bsize / 2);
+ if (gain > 0) {
+ fs->lfs_nclean += gain;
+ fs->lfs_avail += gain * btofsb(fs, fs->lfs_ssize);
+ } else {
+ fs->lfs_nclean -= cgain;
+ fs->lfs_avail -= cgain * btofsb(fs, fs->lfs_ssize) -
+ btofsb(fs, csbbytes);
+ }
+
+ /* Resize segment flag cache */
+ fs->lfs_suflags[0] = (u_int32_t *)realloc(fs->lfs_suflags[0],
+ fs->lfs_nseg * sizeof(u_int32_t),
+ M_SEGMENT, M_WAITOK);
+ fs->lfs_suflags[1] = (u_int32_t *)realloc(fs->lfs_suflags[1],
+ fs->lfs_nseg * sizeof(u_int32_t),
+ M_SEGMENT, M_WAITOK);
+ for (i = oldnsegs; i < newnsegs; i++)
+ fs->lfs_suflags[0][i] = fs->lfs_suflags[1][i] = 0x0;
+
+ /* Truncate Ifile if necessary */
+ if (noff < 0)
+ lfs_truncate(ivp, ivp->v_size + (noff << fs->lfs_bshift), 0,
+ NOCRED);
+
+ /* Update cleaner info so the cleaner can die */
+ bread(ivp, 0, fs->lfs_bsize, NOCRED, B_MODIFY, &bp);
+ ((CLEANERINFO *)bp->b_data)->clean = fs->lfs_nclean;
+ ((CLEANERINFO *)bp->b_data)->dirty = fs->lfs_nseg - fs->lfs_nclean;
+ VOP_BWRITE(bp->b_vp, bp);
+
+ /* Let Ifile accesses proceed */
+ VOP_UNLOCK(ivp);
+ rw_exit(&fs->lfs_iflock);
+
+ out:
+ lfs_segunlock(fs);
+ return error;
+}
--- /dev/null
+/* $NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1986, 1989, 1991, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)lfs_vnops.c 8.13 (Berkeley) 6/10/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $");
+
+#ifdef _KERNEL_OPT
+#include "opt_compat_netbsd.h"
+#include "opt_uvm_page_trkown.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/pool.h>
+#include <sys/signalvar.h>
+#include <sys/kauth.h>
+#include <sys/syslog.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/fifofs/fifo.h>
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_pmap.h>
+#include <uvm/uvm_stat.h>
+#include <uvm/uvm_pager.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+extern pid_t lfs_writer_daemon;
+int lfs_ignore_lazy_sync = 1;
+
+/* Global vfs data structures for lfs. */
+int (**lfs_vnodeop_p)(void *);
+const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, ufs_lookup }, /* lookup */
+ { &vop_create_desc, lfs_create }, /* create */
+ { &vop_whiteout_desc, ufs_whiteout }, /* whiteout */
+ { &vop_mknod_desc, lfs_mknod }, /* mknod */
+ { &vop_open_desc, ufs_open }, /* open */
+ { &vop_close_desc, lfs_close }, /* close */
+ { &vop_access_desc, ufs_access }, /* access */
+ { &vop_getattr_desc, lfs_getattr }, /* getattr */
+ { &vop_setattr_desc, lfs_setattr }, /* setattr */
+ { &vop_read_desc, lfs_read }, /* read */
+ { &vop_write_desc, lfs_write }, /* write */
+ { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */
+ { &vop_fcntl_desc, lfs_fcntl }, /* fcntl */
+ { &vop_poll_desc, ufs_poll }, /* poll */
+ { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */
+ { &vop_revoke_desc, ufs_revoke }, /* revoke */
+ { &vop_mmap_desc, lfs_mmap }, /* mmap */
+ { &vop_fsync_desc, lfs_fsync }, /* fsync */
+ { &vop_seek_desc, ufs_seek }, /* seek */
+ { &vop_remove_desc, lfs_remove }, /* remove */
+ { &vop_link_desc, lfs_link }, /* link */
+ { &vop_rename_desc, lfs_rename }, /* rename */
+ { &vop_mkdir_desc, lfs_mkdir }, /* mkdir */
+ { &vop_rmdir_desc, lfs_rmdir }, /* rmdir */
+ { &vop_symlink_desc, lfs_symlink }, /* symlink */
+ { &vop_readdir_desc, ufs_readdir }, /* readdir */
+ { &vop_readlink_desc, ufs_readlink }, /* readlink */
+ { &vop_abortop_desc, ufs_abortop }, /* abortop */
+ { &vop_inactive_desc, lfs_inactive }, /* inactive */
+ { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */
+ { &vop_lock_desc, ufs_lock }, /* lock */
+ { &vop_unlock_desc, ufs_unlock }, /* unlock */
+ { &vop_bmap_desc, ufs_bmap }, /* bmap */
+ { &vop_strategy_desc, lfs_strategy }, /* strategy */
+ { &vop_print_desc, ufs_print }, /* print */
+ { &vop_islocked_desc, ufs_islocked }, /* islocked */
+ { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */
+ { &vop_advlock_desc, ufs_advlock }, /* advlock */
+ { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */
+ { &vop_getpages_desc, lfs_getpages }, /* getpages */
+ { &vop_putpages_desc, lfs_putpages }, /* putpages */
+ { NULL, NULL }
+};
+const struct vnodeopv_desc lfs_vnodeop_opv_desc =
+ { &lfs_vnodeop_p, lfs_vnodeop_entries };
+
+int (**lfs_specop_p)(void *);
+const struct vnodeopv_entry_desc lfs_specop_entries[] = {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, spec_lookup }, /* lookup */
+ { &vop_create_desc, spec_create }, /* create */
+ { &vop_mknod_desc, spec_mknod }, /* mknod */
+ { &vop_open_desc, spec_open }, /* open */
+ { &vop_close_desc, lfsspec_close }, /* close */
+ { &vop_access_desc, ufs_access }, /* access */
+ { &vop_getattr_desc, lfs_getattr }, /* getattr */
+ { &vop_setattr_desc, lfs_setattr }, /* setattr */
+ { &vop_read_desc, ufsspec_read }, /* read */
+ { &vop_write_desc, ufsspec_write }, /* write */
+ { &vop_ioctl_desc, spec_ioctl }, /* ioctl */
+ { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */
+ { &vop_poll_desc, spec_poll }, /* poll */
+ { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */
+ { &vop_revoke_desc, spec_revoke }, /* revoke */
+ { &vop_mmap_desc, spec_mmap }, /* mmap */
+ { &vop_fsync_desc, spec_fsync }, /* fsync */
+ { &vop_seek_desc, spec_seek }, /* seek */
+ { &vop_remove_desc, spec_remove }, /* remove */
+ { &vop_link_desc, spec_link }, /* link */
+ { &vop_rename_desc, spec_rename }, /* rename */
+ { &vop_mkdir_desc, spec_mkdir }, /* mkdir */
+ { &vop_rmdir_desc, spec_rmdir }, /* rmdir */
+ { &vop_symlink_desc, spec_symlink }, /* symlink */
+ { &vop_readdir_desc, spec_readdir }, /* readdir */
+ { &vop_readlink_desc, spec_readlink }, /* readlink */
+ { &vop_abortop_desc, spec_abortop }, /* abortop */
+ { &vop_inactive_desc, lfs_inactive }, /* inactive */
+ { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */
+ { &vop_lock_desc, ufs_lock }, /* lock */
+ { &vop_unlock_desc, ufs_unlock }, /* unlock */
+ { &vop_bmap_desc, spec_bmap }, /* bmap */
+ { &vop_strategy_desc, spec_strategy }, /* strategy */
+ { &vop_print_desc, ufs_print }, /* print */
+ { &vop_islocked_desc, ufs_islocked }, /* islocked */
+ { &vop_pathconf_desc, spec_pathconf }, /* pathconf */
+ { &vop_advlock_desc, spec_advlock }, /* advlock */
+ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */
+ { &vop_getpages_desc, spec_getpages }, /* getpages */
+ { &vop_putpages_desc, spec_putpages }, /* putpages */
+ { NULL, NULL }
+};
+const struct vnodeopv_desc lfs_specop_opv_desc =
+ { &lfs_specop_p, lfs_specop_entries };
+
+int (**lfs_fifoop_p)(void *);
+const struct vnodeopv_entry_desc lfs_fifoop_entries[] = {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, vn_fifo_bypass }, /* lookup */
+ { &vop_create_desc, vn_fifo_bypass }, /* create */
+ { &vop_mknod_desc, vn_fifo_bypass }, /* mknod */
+ { &vop_open_desc, vn_fifo_bypass }, /* open */
+ { &vop_close_desc, lfsfifo_close }, /* close */
+ { &vop_access_desc, ufs_access }, /* access */
+ { &vop_getattr_desc, lfs_getattr }, /* getattr */
+ { &vop_setattr_desc, lfs_setattr }, /* setattr */
+ { &vop_read_desc, ufsfifo_read }, /* read */
+ { &vop_write_desc, ufsfifo_write }, /* write */
+ { &vop_ioctl_desc, vn_fifo_bypass }, /* ioctl */
+ { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */
+ { &vop_poll_desc, vn_fifo_bypass }, /* poll */
+ { &vop_kqfilter_desc, vn_fifo_bypass }, /* kqfilter */
+ { &vop_revoke_desc, vn_fifo_bypass }, /* revoke */
+ { &vop_mmap_desc, vn_fifo_bypass }, /* mmap */
+ { &vop_fsync_desc, vn_fifo_bypass }, /* fsync */
+ { &vop_seek_desc, vn_fifo_bypass }, /* seek */
+ { &vop_remove_desc, vn_fifo_bypass }, /* remove */
+ { &vop_link_desc, vn_fifo_bypass }, /* link */
+ { &vop_rename_desc, vn_fifo_bypass }, /* rename */
+ { &vop_mkdir_desc, vn_fifo_bypass }, /* mkdir */
+ { &vop_rmdir_desc, vn_fifo_bypass }, /* rmdir */
+ { &vop_symlink_desc, vn_fifo_bypass }, /* symlink */
+ { &vop_readdir_desc, vn_fifo_bypass }, /* readdir */
+ { &vop_readlink_desc, vn_fifo_bypass }, /* readlink */
+ { &vop_abortop_desc, vn_fifo_bypass }, /* abortop */
+ { &vop_inactive_desc, lfs_inactive }, /* inactive */
+ { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */
+ { &vop_lock_desc, ufs_lock }, /* lock */
+ { &vop_unlock_desc, ufs_unlock }, /* unlock */
+ { &vop_bmap_desc, vn_fifo_bypass }, /* bmap */
+ { &vop_strategy_desc, vn_fifo_bypass }, /* strategy */
+ { &vop_print_desc, ufs_print }, /* print */
+ { &vop_islocked_desc, ufs_islocked }, /* islocked */
+ { &vop_pathconf_desc, vn_fifo_bypass }, /* pathconf */
+ { &vop_advlock_desc, vn_fifo_bypass }, /* advlock */
+ { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */
+ { &vop_putpages_desc, vn_fifo_bypass }, /* putpages */
+ { NULL, NULL }
+};
+const struct vnodeopv_desc lfs_fifoop_opv_desc =
+ { &lfs_fifoop_p, lfs_fifoop_entries };
+
+static int check_dirty(struct lfs *, struct vnode *, off_t, off_t, off_t, int, int, struct vm_page **);
+
+#define LFS_READWRITE
+#include <ufs/ufs/ufs_readwrite.c>
+#undef LFS_READWRITE
+
+/*
+ * Synch an open file.
+ */
+/* ARGSUSED */
+int
+lfs_fsync(void *v)
+{
+ struct vop_fsync_args /* {
+ struct vnode *a_vp;
+ kauth_cred_t a_cred;
+ int a_flags;
+		off_t a_offlo;
+		off_t a_offhi;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ int error, wait;
+ struct inode *ip = VTOI(vp);
+ struct lfs *fs = ip->i_lfs;
+
+ /* If we're mounted read-only, don't try to sync. */
+ if (fs->lfs_ronly)
+ return 0;
+
+ /* If a removed vnode is being cleaned, no need to sync here. */
+ if ((ap->a_flags & FSYNC_RECLAIM) != 0 && ip->i_mode == 0)
+ return 0;
+
+ /*
+ * Trickle sync simply adds this vnode to the pager list, as if
+ * the pagedaemon had requested a pageout.
+ */
+ if (ap->a_flags & FSYNC_LAZY) {
+ if (lfs_ignore_lazy_sync == 0) {
+ mutex_enter(&lfs_lock);
+ if (!(ip->i_flags & IN_PAGING)) {
+ ip->i_flags |= IN_PAGING;
+ TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip,
+ i_lfs_pchain);
+ }
+ wakeup(&lfs_writer_daemon);
+ mutex_exit(&lfs_lock);
+ }
+ return 0;
+ }
+
+ /*
+	 * If a vnode is being cleaned, flush it out before we try to
+ * reuse it. This prevents the cleaner from writing files twice
+ * in the same partial segment, causing an accounting underflow.
+ */
+ if (ap->a_flags & FSYNC_RECLAIM && ip->i_flags & IN_CLEANING) {
+ lfs_vflush(vp);
+ }
+
+ wait = (ap->a_flags & FSYNC_WAIT);
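+	/*
+	 * Flush the requested range through the page cache.  VOP_PUTPAGES
+	 * can return EAGAIN when it cannot make progress (e.g. the
+	 * filesystem is short of clean segments); in that case sleep
+	 * briefly on lfs_avail and retry.
+	 */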
+ do {
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo),
+ round_page(ap->a_offhi),
+ PGO_CLEANIT | (wait ? PGO_SYNCIO : 0));
+ if (error == EAGAIN) {
+ mutex_enter(&lfs_lock);
+ mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_fsync",
+ hz / 100 + 1, &lfs_lock);
+ mutex_exit(&lfs_lock);
+ }
+ } while (error == EAGAIN);
+ if (error)
+ return error;
+
+ if ((ap->a_flags & FSYNC_DATAONLY) == 0)
+ error = lfs_update(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);
+
+ if (error == 0 && ap->a_flags & FSYNC_CACHE) {
+ int l = 0;
+ error = VOP_IOCTL(ip->i_devvp, DIOCCACHESYNC, &l, FWRITE,
+ curlwp->l_cred);
+ }
+ if (wait && !VPISEMPTY(vp))
+ LFS_SET_UINO(ip, IN_MODIFIED);
+
+ return error;
+}
+
+/*
+ * Take IN_ADIROP off, then call ufs_inactive.
+ */
+int
+lfs_inactive(void *v)
+{
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ } */ *ap = v;
+
+ lfs_unmark_vnode(ap->a_vp);
+
+ /*
+ * The Ifile is only ever inactivated on unmount.
+ * Streamline this process by not giving it more dirty blocks.
+ */
+ if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) {
+ mutex_enter(&lfs_lock);
+ LFS_CLR_UINO(VTOI(ap->a_vp), IN_ALLMOD);
+ mutex_exit(&lfs_lock);
+ VOP_UNLOCK(ap->a_vp);
+ return 0;
+ }
+
+ return ufs_inactive(v);
+}
+
+/*
+ * These macros are used to bracket UFS directory ops, so that we can
+ * identify all the pages touched during directory ops which need to
+ * be ordered and flushed atomically, so that they may be recovered.
+ *
+ * Because we have to mark nodes VU_DIROP in order to prevent
+ * the cache from reclaiming them while a dirop is in progress, we must
+ * also manage the number of nodes so marked (otherwise we can run out).
+ * We do this by setting lfs_dirvcount to the number of marked vnodes; it
+ * is decremented during segment write, when VU_DIROP is taken off.
+ */
+#define MARK_VNODE(vp) lfs_mark_vnode(vp)
+#define UNMARK_VNODE(vp) lfs_unmark_vnode(vp)
+#define SET_DIROP_CREATE(dvp, vpp) lfs_set_dirop_create((dvp), (vpp))
+#define SET_DIROP_REMOVE(dvp, vp) lfs_set_dirop((dvp), (vp))
+static int lfs_set_dirop_create(struct vnode *, struct vnode **);
+static int lfs_set_dirop(struct vnode *, struct vnode *);
+
+static int
+lfs_set_dirop(struct vnode *dvp, struct vnode *vp)
+{
+ struct lfs *fs;
+ int error;
+
+ KASSERT(VOP_ISLOCKED(dvp));
+ KASSERT(vp == NULL || VOP_ISLOCKED(vp));
+
+ fs = VTOI(dvp)->i_lfs;
+
+ ASSERT_NO_SEGLOCK(fs);
+ /*
+ * LFS_NRESERVE calculates direct and indirect blocks as well
+ * as an inode block; an overestimate in most cases.
+ */
+ if ((error = lfs_reserve(fs, dvp, vp, LFS_NRESERVE(fs))) != 0)
+ return (error);
+
+ restart:
+ mutex_enter(&lfs_lock);
+ if (fs->lfs_dirops == 0) {
+ mutex_exit(&lfs_lock);
+ lfs_check(dvp, LFS_UNUSED_LBN, 0);
+ mutex_enter(&lfs_lock);
+ }
+ while (fs->lfs_writer) {
+ error = mtsleep(&fs->lfs_dirops, (PRIBIO + 1) | PCATCH,
+ "lfs_sdirop", 0, &lfs_lock);
+ if (error == EINTR) {
+ mutex_exit(&lfs_lock);
+ goto unreserve;
+ }
+ }
+ if (lfs_dirvcount > LFS_MAX_DIROP && fs->lfs_dirops == 0) {
+ wakeup(&lfs_writer_daemon);
+ mutex_exit(&lfs_lock);
+ preempt();
+ goto restart;
+ }
+
+ if (lfs_dirvcount > LFS_MAX_DIROP) {
+ mutex_exit(&lfs_lock);
+ DLOG((DLOG_DIROP, "lfs_set_dirop: sleeping with dirops=%d, "
+ "dirvcount=%d\n", fs->lfs_dirops, lfs_dirvcount));
+ if ((error = mtsleep(&lfs_dirvcount,
+ PCATCH | PUSER | PNORELOCK, "lfs_maxdirop", 0,
+ &lfs_lock)) != 0) {
+ goto unreserve;
+ }
+ goto restart;
+ }
+
+ ++fs->lfs_dirops;
+ fs->lfs_doifile = 1;
+ mutex_exit(&lfs_lock);
+
+ /* Hold a reference so SET_ENDOP will be happy */
+ vref(dvp);
+ if (vp) {
+ vref(vp);
+ MARK_VNODE(vp);
+ }
+
+ MARK_VNODE(dvp);
+ return 0;
+
+ unreserve:
+ lfs_reserve(fs, dvp, vp, -LFS_NRESERVE(fs));
+ return error;
+}
+
+/*
+ * Get a new vnode *before* adjusting the dirop count, to avoid a deadlock
+ * in getnewvnode(), if we have a stacked filesystem mounted on top
+ * of us.
+ *
+ * NB: this means we have to clear the new vnodes on error. Fortunately
+ * SET_ENDOP is there to do that for us.
+ */
+static int
+lfs_set_dirop_create(struct vnode *dvp, struct vnode **vpp)
+{
+ int error;
+ struct lfs *fs;
+
+ fs = VFSTOUFS(dvp->v_mount)->um_lfs;
+ ASSERT_NO_SEGLOCK(fs);
+ if (fs->lfs_ronly)
+ return EROFS;
+ if (vpp == NULL) {
+ return lfs_set_dirop(dvp, NULL);
+ }
+ error = getnewvnode(VT_LFS, dvp->v_mount, lfs_vnodeop_p, NULL, vpp);
+ if (error) {
+ DLOG((DLOG_ALLOC, "lfs_set_dirop_create: dvp %p error %d\n",
+ dvp, error));
+ return error;
+ }
+ if ((error = lfs_set_dirop(dvp, NULL)) != 0) {
+ ungetnewvnode(*vpp);
+ *vpp = NULL;
+ return error;
+ }
+ return 0;
+}
+
+#define SET_ENDOP_BASE(fs, dvp, str) \
+ do { \
+ mutex_enter(&lfs_lock); \
+ --(fs)->lfs_dirops; \
+ if (!(fs)->lfs_dirops) { \
+ if ((fs)->lfs_nadirop) { \
+ panic("SET_ENDOP: %s: no dirops but " \
+ " nadirop=%d", (str), \
+ (fs)->lfs_nadirop); \
+ } \
+ wakeup(&(fs)->lfs_writer); \
+ mutex_exit(&lfs_lock); \
+ lfs_check((dvp), LFS_UNUSED_LBN, 0); \
+ } else \
+ mutex_exit(&lfs_lock); \
+ } while(0)
+#define SET_ENDOP_CREATE(fs, dvp, nvpp, str) \
+ do { \
+ UNMARK_VNODE(dvp); \
+ if (nvpp && *nvpp) \
+ UNMARK_VNODE(*nvpp); \
+ /* Check for error return to stem vnode leakage */ \
+ if (nvpp && *nvpp && !((*nvpp)->v_uflag & VU_DIROP)) \
+ ungetnewvnode(*(nvpp)); \
+ SET_ENDOP_BASE((fs), (dvp), (str)); \
+ lfs_reserve((fs), (dvp), NULL, -LFS_NRESERVE(fs)); \
+ vrele(dvp); \
+ } while(0)
+#define SET_ENDOP_CREATE_AP(ap, str) \
+ SET_ENDOP_CREATE(VTOI((ap)->a_dvp)->i_lfs, (ap)->a_dvp, \
+ (ap)->a_vpp, (str))
+#define SET_ENDOP_REMOVE(fs, dvp, ovp, str) \
+ do { \
+ UNMARK_VNODE(dvp); \
+ if (ovp) \
+ UNMARK_VNODE(ovp); \
+ SET_ENDOP_BASE((fs), (dvp), (str)); \
+ lfs_reserve((fs), (dvp), (ovp), -LFS_NRESERVE(fs)); \
+ vrele(dvp); \
+ if (ovp) \
+ vrele(ovp); \
+ } while(0)
+
+void
+lfs_mark_vnode(struct vnode *vp)
+{
+ struct inode *ip = VTOI(vp);
+ struct lfs *fs = ip->i_lfs;
+
+ mutex_enter(&lfs_lock);
+ if (!(ip->i_flag & IN_ADIROP)) {
+ if (!(vp->v_uflag & VU_DIROP)) {
+ mutex_enter(vp->v_interlock);
+ (void)lfs_vref(vp);
+ ++lfs_dirvcount;
+ ++fs->lfs_dirvcount;
+ TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain);
+ vp->v_uflag |= VU_DIROP;
+ }
+ ++fs->lfs_nadirop;
+ ip->i_flag |= IN_ADIROP;
+ } else
+ KASSERT(vp->v_uflag & VU_DIROP);
+ mutex_exit(&lfs_lock);
+}
+
+void
+lfs_unmark_vnode(struct vnode *vp)
+{
+ struct inode *ip = VTOI(vp);
+
+ if (ip && (ip->i_flag & IN_ADIROP)) {
+ KASSERT(vp->v_uflag & VU_DIROP);
+ mutex_enter(&lfs_lock);
+ --ip->i_lfs->lfs_nadirop;
+ mutex_exit(&lfs_lock);
+ ip->i_flag &= ~IN_ADIROP;
+ }
+}
+
+int
+lfs_symlink(void *v)
+{
+ struct vop_symlink_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ char *a_target;
+ } */ *ap = v;
+ int error;
+
+ if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
+ vput(ap->a_dvp);
+ return error;
+ }
+ error = ufs_symlink(ap);
+ SET_ENDOP_CREATE_AP(ap, "symlink");
+ return (error);
+}
+
+int
+lfs_mknod(void *v)
+{
+ struct vop_mknod_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap = v;
+ struct vattr *vap = ap->a_vap;
+ struct vnode **vpp = ap->a_vpp;
+ struct inode *ip;
+ int error;
+ struct mount *mp;
+ ino_t ino;
+ struct ufs_lookup_results *ulr;
+
+ /* XXX should handle this material another way */
+ ulr = &VTOI(ap->a_dvp)->i_crap;
+ UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp));
+
+ if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
+ vput(ap->a_dvp);
+ return error;
+ }
+ error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
+ ap->a_dvp, ulr, vpp, ap->a_cnp);
+
+ /* Either way we're done with the dirop at this point */
+ SET_ENDOP_CREATE_AP(ap, "mknod");
+
+ if (error)
+ return (error);
+
+ ip = VTOI(*vpp);
+ mp = (*vpp)->v_mount;
+ ino = ip->i_number;
+ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+ if (vap->va_rdev != VNOVAL) {
+ /*
+ * Want to be able to use this to make badblock
+ * inodes, so don't truncate the dev number.
+ */
+#if 0
+ ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev,
+ UFS_MPNEEDSWAP((*vpp)->v_mount));
+#else
+ ip->i_ffs1_rdev = vap->va_rdev;
+#endif
+ }
+
+ /*
+ * Call fsync to write the vnode so that we don't have to deal with
+ * flushing it when it's marked VU_DIROP|VI_XLOCK.
+ *
+ * XXX KS - If we can't flush we also can't call vgone(), so must
+ * return. But, that leaves this vnode in limbo, also not good.
+ * Can this ever happen (barring hardware failure)?
+ */
+ if ((error = VOP_FSYNC(*vpp, NOCRED, FSYNC_WAIT, 0, 0)) != 0) {
+ panic("lfs_mknod: couldn't fsync (ino %llu)",
+ (unsigned long long)ino);
+ /* return (error); */
+ }
+ /*
+ * Remove vnode so that it will be reloaded by VFS_VGET and
+ * checked to see if it is an alias of an existing entry in
+ * the inode cache.
+ */
+ /* Used to be vput, but that causes us to call VOP_INACTIVE twice. */
+
+ VOP_UNLOCK(*vpp);
+ (*vpp)->v_type = VNON;
+ vgone(*vpp);
+ error = VFS_VGET(mp, ino, vpp);
+
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+ return (0);
+}
+
+int
+lfs_create(void *v)
+{
+ struct vop_create_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap = v;
+ int error;
+
+ if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
+ vput(ap->a_dvp);
+ return error;
+ }
+ error = ufs_create(ap);
+ SET_ENDOP_CREATE_AP(ap, "create");
+ return (error);
+}
+
+int
+lfs_mkdir(void *v)
+{
+ struct vop_mkdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap = v;
+ int error;
+
+ if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
+ vput(ap->a_dvp);
+ return error;
+ }
+ error = ufs_mkdir(ap);
+ SET_ENDOP_CREATE_AP(ap, "mkdir");
+ return (error);
+}
+
+int
+lfs_remove(void *v)
+{
+ struct vop_remove_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap = v;
+ struct vnode *dvp, *vp;
+ struct inode *ip;
+ int error;
+
+ dvp = ap->a_dvp;
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ if ((error = SET_DIROP_REMOVE(dvp, vp)) != 0) {
+ if (dvp == vp)
+ vrele(vp);
+ else
+ vput(vp);
+ vput(dvp);
+ return error;
+ }
+ error = ufs_remove(ap);
+ if (ip->i_nlink == 0)
+ lfs_orphan(ip->i_lfs, ip->i_number);
+ SET_ENDOP_REMOVE(ip->i_lfs, dvp, ap->a_vp, "remove");
+ return (error);
+}
+
+int
+lfs_rmdir(void *v)
+{
+ struct vop_rmdir_args /* {
+ struct vnodeop_desc *a_desc;
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct inode *ip;
+ int error;
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ if ((error = SET_DIROP_REMOVE(ap->a_dvp, ap->a_vp)) != 0) {
+ if (ap->a_dvp == vp)
+ vrele(ap->a_dvp);
+ else
+ vput(ap->a_dvp);
+ vput(vp);
+ return error;
+ }
+ error = ufs_rmdir(ap);
+ if (ip->i_nlink == 0)
+ lfs_orphan(ip->i_lfs, ip->i_number);
+ SET_ENDOP_REMOVE(ip->i_lfs, ap->a_dvp, ap->a_vp, "rmdir");
+ return (error);
+}
+
+int
+lfs_link(void *v)
+{
+ struct vop_link_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap = v;
+ int error;
+ struct vnode **vpp = NULL;
+
+ if ((error = SET_DIROP_CREATE(ap->a_dvp, vpp)) != 0) {
+ vput(ap->a_dvp);
+ return error;
+ }
+ error = ufs_link(ap);
+ SET_ENDOP_CREATE(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, vpp, "link");
+ return (error);
+}
+
+int
+lfs_rename(void *v)
+{
+ struct vop_rename_args /* {
+ struct vnode *a_fdvp;
+ struct vnode *a_fvp;
+ struct componentname *a_fcnp;
+ struct vnode *a_tdvp;
+ struct vnode *a_tvp;
+ struct componentname *a_tcnp;
+ } */ *ap = v;
+ struct vnode *tvp, *fvp, *tdvp, *fdvp;
+ struct componentname *tcnp, *fcnp;
+ int error;
+ struct lfs *fs;
+
+ fs = VTOI(ap->a_fdvp)->i_lfs;
+ tvp = ap->a_tvp;
+ tdvp = ap->a_tdvp;
+ tcnp = ap->a_tcnp;
+ fvp = ap->a_fvp;
+ fdvp = ap->a_fdvp;
+ fcnp = ap->a_fcnp;
+
+ /*
+ * Check for cross-device rename.
+ * If it is, we don't want to set dirops, just error out.
+ * (In particular note that MARK_VNODE(tdvp) will DTWT on
+ * a cross-device rename.)
+ *
+ * Copied from ufs_rename.
+ */
+ if ((fvp->v_mount != tdvp->v_mount) ||
+ (tvp && (fvp->v_mount != tvp->v_mount))) {
+ error = EXDEV;
+ goto errout;
+ }
+
+ /*
+ * Check to make sure we're not renaming a vnode onto itself
+ * (deleting a hard link by renaming one name onto another);
+	 * if we are, we can't recursively call VOP_REMOVE since that
+ * would leave us with an unaccounted-for number of live dirops.
+ *
+ * Inline the relevant section of ufs_rename here, *before*
+ * calling SET_DIROP_REMOVE.
+ */
+ if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) ||
+ (VTOI(tdvp)->i_flags & APPEND))) {
+ error = EPERM;
+ goto errout;
+ }
+ if (fvp == tvp) {
+ if (fvp->v_type == VDIR) {
+ error = EINVAL;
+ goto errout;
+ }
+
+ /* Release destination completely. */
+ VOP_ABORTOP(tdvp, tcnp);
+ vput(tdvp);
+ vput(tvp);
+
+ /* Delete source. */
+ vrele(fvp);
+ fcnp->cn_flags &= ~(MODMASK);
+ fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+ fcnp->cn_nameiop = DELETE;
+ vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
+ if ((error = relookup(fdvp, &fvp, fcnp, 0))) {
+ vput(fdvp);
+ return (error);
+ }
+ return (VOP_REMOVE(fdvp, fvp, fcnp));
+ }
+
+ if ((error = SET_DIROP_REMOVE(tdvp, tvp)) != 0)
+ goto errout;
+ MARK_VNODE(fdvp);
+ MARK_VNODE(fvp);
+
+ error = ufs_rename(ap);
+ UNMARK_VNODE(fdvp);
+ UNMARK_VNODE(fvp);
+ SET_ENDOP_REMOVE(fs, tdvp, tvp, "rename");
+ return (error);
+
+ errout:
+ VOP_ABORTOP(tdvp, ap->a_tcnp); /* XXX, why not in NFS? */
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ if (tvp)
+ vput(tvp);
+ VOP_ABORTOP(fdvp, ap->a_fcnp); /* XXX, why not in NFS? */
+ vrele(fdvp);
+ vrele(fvp);
+ return (error);
+}
+
+/* XXX hack to avoid calling ITIMES in getattr */
+int
+lfs_getattr(void *v)
+{
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ struct vattr *vap = ap->a_vap;
+ struct lfs *fs = ip->i_lfs;
+ /*
+ * Copy from inode table
+ */
+ vap->va_fsid = ip->i_dev;
+ vap->va_fileid = ip->i_number;
+ vap->va_mode = ip->i_mode & ~IFMT;
+ vap->va_nlink = ip->i_nlink;
+ vap->va_uid = ip->i_uid;
+ vap->va_gid = ip->i_gid;
+ vap->va_rdev = (dev_t)ip->i_ffs1_rdev;
+ vap->va_size = vp->v_size;
+ vap->va_atime.tv_sec = ip->i_ffs1_atime;
+ vap->va_atime.tv_nsec = ip->i_ffs1_atimensec;
+ vap->va_mtime.tv_sec = ip->i_ffs1_mtime;
+ vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec;
+ vap->va_ctime.tv_sec = ip->i_ffs1_ctime;
+ vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec;
+ vap->va_flags = ip->i_flags;
+ vap->va_gen = ip->i_gen;
+ /* this doesn't belong here */
+ if (vp->v_type == VBLK)
+ vap->va_blocksize = BLKDEV_IOSIZE;
+ else if (vp->v_type == VCHR)
+ vap->va_blocksize = MAXBSIZE;
+ else
+ vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
+ vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_lfs_effnblks);
+ vap->va_type = vp->v_type;
+ vap->va_filerev = ip->i_modrev;
+ return (0);
+}
+
+/*
+ * Check to make sure the inode blocks won't choke the buffer
+ * cache, then call ufs_setattr as usual.
+ */
+int
+lfs_setattr(void *v)
+{
+ struct vop_setattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+
+ lfs_check(vp, LFS_UNUSED_LBN, 0);
+ return ufs_setattr(v);
+}
+
+/*
+ * Release the block we hold on lfs_newseg wrapping. Called on file close,
+ * or explicitly from LFCNWRAPGO. Called with the interlock held.
+ */
+static int
+lfs_wrapgo(struct lfs *fs, struct inode *ip, int waitfor)
+{
+ if (fs->lfs_stoplwp != curlwp)
+ return EBUSY;
+
+ fs->lfs_stoplwp = NULL;
+ cv_signal(&fs->lfs_stopcv);
+
+ KASSERT(fs->lfs_nowrap > 0);
+ if (fs->lfs_nowrap <= 0) {
+ return 0;
+ }
+
+ if (--fs->lfs_nowrap == 0) {
+ log(LOG_NOTICE, "%s: re-enabled log wrap\n", fs->lfs_fsmnt);
+ wakeup(&fs->lfs_wrappass);
+ lfs_wakeup_cleaner(fs);
+ }
+ if (waitfor) {
+ mtsleep(&fs->lfs_nextseg, PCATCH | PUSER, "segment",
+ 0, &lfs_lock);
+ }
+
+ return 0;
+}
+
+/*
+ * Close called
+ */
+/* ARGSUSED */
+int
+lfs_close(void *v)
+{
+ struct vop_close_args /* {
+ struct vnode *a_vp;
+ int a_fflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ struct lfs *fs = ip->i_lfs;
+
+ if ((ip->i_number == ROOTINO || ip->i_number == LFS_IFILE_INUM) &&
+ fs->lfs_stoplwp == curlwp) {
+ mutex_enter(&lfs_lock);
+ log(LOG_NOTICE, "lfs_close: releasing log wrap control\n");
+ lfs_wrapgo(fs, ip, 0);
+ mutex_exit(&lfs_lock);
+ }
+
+ if (vp == ip->i_lfs->lfs_ivnode &&
+ vp->v_mount->mnt_iflag & IMNT_UNMOUNT)
+ return 0;
+
+ if (vp->v_usecount > 1 && vp != ip->i_lfs->lfs_ivnode) {
+ LFS_ITIMES(ip, NULL, NULL, NULL);
+ }
+ return (0);
+}
+
+/*
+ * Close wrapper for special devices.
+ *
+ * Update the times on the inode then do device close.
+ */
+int
+lfsspec_close(void *v)
+{
+ struct vop_close_args /* {
+ struct vnode *a_vp;
+ int a_fflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct inode *ip;
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ if (vp->v_usecount > 1) {
+ LFS_ITIMES(ip, NULL, NULL, NULL);
+ }
+ return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap));
+}
+
+/*
+ * Close wrapper for fifo's.
+ *
+ * Update the times on the inode then do device close.
+ */
+int
+lfsfifo_close(void *v)
+{
+ struct vop_close_args /* {
+ struct vnode *a_vp;
+ int a_fflag;
+		kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct inode *ip;
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ if (ap->a_vp->v_usecount > 1) {
+ LFS_ITIMES(ip, NULL, NULL, NULL);
+ }
+ return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap));
+}
+
+/*
+ * Reclaim an inode so that it can be used for other purposes.
+ */
+
+int
+lfs_reclaim(void *v)
+{
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ struct lfs *fs = ip->i_lfs;
+ int error;
+
+ /*
+ * The inode must be freed and updated before being removed
+ * from its hash chain. Other threads trying to gain a hold
+ * on the inode will be stalled because it is locked (VI_XLOCK).
+ */
+ if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
+ lfs_vfree(vp, ip->i_number, ip->i_omode);
+
+ mutex_enter(&lfs_lock);
+ LFS_CLR_UINO(ip, IN_ALLMOD);
+ mutex_exit(&lfs_lock);
+ if ((error = ufs_reclaim(vp)))
+ return (error);
+
+ /*
+ * Take us off the paging and/or dirop queues if we were on them.
+ * We shouldn't be on them.
+ */
+ mutex_enter(&lfs_lock);
+ if (ip->i_flags & IN_PAGING) {
+ log(LOG_WARNING, "%s: reclaimed vnode is IN_PAGING\n",
+ fs->lfs_fsmnt);
+ ip->i_flags &= ~IN_PAGING;
+ TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
+ }
+ if (vp->v_uflag & VU_DIROP) {
+ panic("reclaimed vnode is VU_DIROP");
+ vp->v_uflag &= ~VU_DIROP;
+ TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
+ }
+ mutex_exit(&lfs_lock);
+
+ pool_put(&lfs_dinode_pool, ip->i_din.ffs1_din);
+ lfs_deregister_all(vp);
+ pool_put(&lfs_inoext_pool, ip->inode_ext.lfs);
+ ip->inode_ext.lfs = NULL;
+ genfs_node_destroy(vp);
+ pool_put(&lfs_inode_pool, vp->v_data);
+ vp->v_data = NULL;
+ return (0);
+}
+
+/*
+ * Read a block from a storage device.
+ * In order to avoid reading blocks that are in the process of being
+ * written by the cleaner---and hence are not mutexed by the normal
+ * buffer cache / page cache mechanisms---check for collisions before
+ * reading.
+ *
+ * We inline ufs_strategy to make sure that the VOP_BMAP occurs *before*
+ * the active cleaner test.
+ *
+ * XXX This code assumes that lfs_markv makes synchronous checkpoints.
+ */
+int
+lfs_strategy(void *v)
+{
+ struct vop_strategy_args /* {
+ struct vnode *a_vp;
+ struct buf *a_bp;
+ } */ *ap = v;
+ struct buf *bp;
+ struct lfs *fs;
+ struct vnode *vp;
+ struct inode *ip;
+ daddr_t tbn;
+ int i, sn, error, slept;
+
+ bp = ap->a_bp;
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ fs = ip->i_lfs;
+
+ /* lfs uses its strategy routine only for read */
+ KASSERT(bp->b_flags & B_READ);
+
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ panic("lfs_strategy: spec");
+ KASSERT(bp->b_bcount != 0);
+ if (bp->b_blkno == bp->b_lblkno) {
+ error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
+ NULL);
+ if (error) {
+ bp->b_error = error;
+ bp->b_resid = bp->b_bcount;
+ biodone(bp);
+ return (error);
+ }
+ if ((long)bp->b_blkno == -1) /* no valid data */
+ clrbuf(bp);
+ }
+ if ((long)bp->b_blkno < 0) { /* block is not on disk */
+ bp->b_resid = bp->b_bcount;
+ biodone(bp);
+ return (0);
+ }
+
+ slept = 1;
+ mutex_enter(&lfs_lock);
+ while (slept && fs->lfs_seglock) {
+ mutex_exit(&lfs_lock);
+ /*
+ * Look through list of intervals.
+ * There will only be intervals to look through
+ * if the cleaner holds the seglock.
+ * Since the cleaner is synchronous, we can trust
+ * the list of intervals to be current.
+ */
+ tbn = dbtofsb(fs, bp->b_blkno);
+ sn = dtosn(fs, tbn);
+ slept = 0;
+ for (i = 0; i < fs->lfs_cleanind; i++) {
+ if (sn == dtosn(fs, fs->lfs_cleanint[i]) &&
+ tbn >= fs->lfs_cleanint[i]) {
+ DLOG((DLOG_CLEAN,
+ "lfs_strategy: ino %d lbn %" PRId64
+ " ind %d sn %d fsb %" PRIx32
+ " given sn %d fsb %" PRIx64 "\n",
+ ip->i_number, bp->b_lblkno, i,
+ dtosn(fs, fs->lfs_cleanint[i]),
+ fs->lfs_cleanint[i], sn, tbn));
+ DLOG((DLOG_CLEAN,
+ "lfs_strategy: sleeping on ino %d lbn %"
+ PRId64 "\n", ip->i_number, bp->b_lblkno));
+ mutex_enter(&lfs_lock);
+ if (LFS_SEGLOCK_HELD(fs) && fs->lfs_iocount) {
+ /* Cleaner can't wait for itself */
+ mtsleep(&fs->lfs_iocount,
+ (PRIBIO + 1) | PNORELOCK,
+ "clean2", 0,
+ &lfs_lock);
+ slept = 1;
+ break;
+ } else if (fs->lfs_seglock) {
+ mtsleep(&fs->lfs_seglock,
+ (PRIBIO + 1) | PNORELOCK,
+ "clean1", 0,
+ &lfs_lock);
+ slept = 1;
+ break;
+ }
+ mutex_exit(&lfs_lock);
+ }
+ }
+ mutex_enter(&lfs_lock);
+ }
+ mutex_exit(&lfs_lock);
+
+ vp = ip->i_devvp;
+ VOP_STRATEGY(vp, bp);
+ return (0);
+}
+
+void
+lfs_flush_dirops(struct lfs *fs)
+{
+ struct inode *ip, *nip;
+ struct vnode *vp;
+ extern int lfs_dostats;
+ struct segment *sp;
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+ KASSERT(fs->lfs_nadirop == 0);
+
+ if (fs->lfs_ronly)
+ return;
+
+ mutex_enter(&lfs_lock);
+ if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) {
+ mutex_exit(&lfs_lock);
+ return;
+ } else
+ mutex_exit(&lfs_lock);
+
+ if (lfs_dostats)
+ ++lfs_stats.flush_invoked;
+
+ /*
+ * Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
+ * Technically this is a checkpoint (the on-disk state is valid)
+ * even though we are leaving out all the file data.
+ */
+ lfs_imtime(fs);
+ lfs_seglock(fs, SEGM_CKP);
+ sp = fs->lfs_sp;
+
+ /*
+ * lfs_writevnodes, optimized to get dirops out of the way.
+ * Only write dirops, and don't flush files' pages, only
+ * blocks from the directories.
+ *
+ * We don't need to vref these files because they are
+ * dirops and so hold an extra reference until the
+ * segunlock clears them of that status.
+ *
+ * We don't need to check for IN_ADIROP because we know that
+ * no dirops are active.
+ *
+ */
+ mutex_enter(&lfs_lock);
+ for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
+ nip = TAILQ_NEXT(ip, i_lfs_dchain);
+ mutex_exit(&lfs_lock);
+ vp = ITOV(ip);
+
+ KASSERT((ip->i_flag & IN_ADIROP) == 0);
+
+ /*
+ * All writes to directories come from dirops; all
+ * writes to files' direct blocks go through the page
+ * cache, which we're not touching. Reads to files
+ * and/or directories will not be affected by writing
+		 * directory blocks, inodes and file inodes.  So we don't
+ * really need to lock. If we don't lock, though,
+ * make sure that we don't clear IN_MODIFIED
+ * unnecessarily.
+ */
+ if (vp->v_iflag & VI_XLOCK) {
+ mutex_enter(&lfs_lock);
+ continue;
+ }
+ /* XXX see below
+ * waslocked = VOP_ISLOCKED(vp);
+ */
+ if (vp->v_type != VREG &&
+ ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp))) {
+ lfs_writefile(fs, sp, vp);
+ if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
+ !(ip->i_flag & IN_ALLMOD)) {
+ mutex_enter(&lfs_lock);
+ LFS_SET_UINO(ip, IN_MODIFIED);
+ mutex_exit(&lfs_lock);
+ }
+ }
+ KDASSERT(ip->i_number != LFS_IFILE_INUM);
+ (void) lfs_writeinode(fs, sp, ip);
+ mutex_enter(&lfs_lock);
+ /*
+ * XXX
+ * LK_EXCLOTHER is dead -- what is intended here?
+ * if (waslocked == LK_EXCLOTHER)
+ * LFS_SET_UINO(ip, IN_MODIFIED);
+ */
+ }
+ mutex_exit(&lfs_lock);
+ /* We've written all the dirops there are */
+ ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
+ lfs_finalize_fs_seguse(fs);
+ (void) lfs_writeseg(fs, sp);
+ lfs_segunlock(fs);
+}
+
+/*
+ * Flush all vnodes for which the pagedaemon has requested pageouts.
+ * Skip over any files that are marked VU_DIROP (since lfs_flush_dirop()
+ * has just run, this would be an error). If we have to skip a vnode
+ * for any reason, just skip it; if we have to wait for the cleaner,
+ * abort. The writer daemon will call us again later.
+ */
+void
+lfs_flush_pchain(struct lfs *fs)
+{
+ struct inode *ip, *nip;
+ struct vnode *vp;
+ extern int lfs_dostats;
+ struct segment *sp;
+ int error;
+
+ ASSERT_NO_SEGLOCK(fs);
+
+ if (fs->lfs_ronly)
+ return;
+
+ mutex_enter(&lfs_lock);
+ if (TAILQ_FIRST(&fs->lfs_pchainhd) == NULL) {
+ mutex_exit(&lfs_lock);
+ return;
+ } else
+ mutex_exit(&lfs_lock);
+
+ /* Get dirops out of the way */
+ lfs_flush_dirops(fs);
+
+ if (lfs_dostats)
+ ++lfs_stats.flush_invoked;
+
+ /*
+ * Inline lfs_segwrite/lfs_writevnodes, but just for pageouts.
+ */
+ lfs_imtime(fs);
+ lfs_seglock(fs, 0);
+ sp = fs->lfs_sp;
+
+ /*
+ * lfs_writevnodes, optimized to clear pageout requests.
+ * Only write non-dirop files that are in the pageout queue.
+ * We're very conservative about what we write; we want to be
+ * fast and async.
+ */
+ mutex_enter(&lfs_lock);
+ top:
+ for (ip = TAILQ_FIRST(&fs->lfs_pchainhd); ip != NULL; ip = nip) {
+ nip = TAILQ_NEXT(ip, i_lfs_pchain);
+ vp = ITOV(ip);
+
+ if (!(ip->i_flags & IN_PAGING))
+ goto top;
+
+ mutex_enter(vp->v_interlock);
+ if ((vp->v_iflag & VI_XLOCK) || (vp->v_uflag & VU_DIROP) != 0) {
+ mutex_exit(vp->v_interlock);
+ continue;
+ }
+ if (vp->v_type != VREG) {
+ mutex_exit(vp->v_interlock);
+ continue;
+ }
+ if (lfs_vref(vp))
+ continue;
+ mutex_exit(&lfs_lock);
+
+ if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_RETRY) != 0) {
+ lfs_vunref(vp);
+ mutex_enter(&lfs_lock);
+ continue;
+ }
+
+ error = lfs_writefile(fs, sp, vp);
+ if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
+ !(ip->i_flag & IN_ALLMOD)) {
+ mutex_enter(&lfs_lock);
+ LFS_SET_UINO(ip, IN_MODIFIED);
+ mutex_exit(&lfs_lock);
+ }
+ KDASSERT(ip->i_number != LFS_IFILE_INUM);
+ (void) lfs_writeinode(fs, sp, ip);
+
+ VOP_UNLOCK(vp);
+ lfs_vunref(vp);
+
+ if (error == EAGAIN) {
+ lfs_writeseg(fs, sp);
+ mutex_enter(&lfs_lock);
+ break;
+ }
+ mutex_enter(&lfs_lock);
+ }
+ mutex_exit(&lfs_lock);
+ (void) lfs_writeseg(fs, sp);
+ lfs_segunlock(fs);
+}
+
+/*
+ * Provide a fcntl interface to sys_lfs_{segwait,bmapv,markv}.
+ */
+int
+lfs_fcntl(void *v)
+{
+ struct vop_fcntl_args /* {
+ struct vnode *a_vp;
+ u_int a_command;
+ void * a_data;
+ int a_fflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct timeval tv;
+ struct timeval *tvp;
+ BLOCK_INFO *blkiov;
+ CLEANERINFO *cip;
+ SEGUSE *sup;
+ int blkcnt, error, oclean;
+ size_t fh_size;
+ struct lfs_fcntl_markv blkvp;
+ struct lwp *l;
+ fsid_t *fsidp;
+ struct lfs *fs;
+ struct buf *bp;
+ fhandle_t *fhp;
+ daddr_t off;
+
+ /* Only respect LFS fcntls on fs root or Ifile */
+ if (VTOI(ap->a_vp)->i_number != ROOTINO &&
+ VTOI(ap->a_vp)->i_number != LFS_IFILE_INUM) {
+ return ufs_fcntl(v);
+ }
+
+ /* Avoid locking a draining lock */
+ if (ap->a_vp->v_mount->mnt_iflag & IMNT_UNMOUNT) {
+ return ESHUTDOWN;
+ }
+
+ /* LFS control and monitoring fcntls are available only to root */
+ l = curlwp;
+ if (((ap->a_command & 0xff00) >> 8) == 'L' &&
+ (error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+ NULL)) != 0)
+ return (error);
+
+ fs = VTOI(ap->a_vp)->i_lfs;
+ fsidp = &ap->a_vp->v_mount->mnt_stat.f_fsidx;
+
+ error = 0;
+ switch ((int)ap->a_command) {
+ case LFCNSEGWAITALL_COMPAT_50:
+ case LFCNSEGWAITALL_COMPAT:
+ fsidp = NULL;
+		/* FALLTHROUGH */
+ case LFCNSEGWAIT_COMPAT_50:
+ case LFCNSEGWAIT_COMPAT:
+ {
+ struct timeval50 *tvp50
+ = (struct timeval50 *)ap->a_data;
+ timeval50_to_timeval(tvp50, &tv);
+ tvp = &tv;
+ }
+ goto segwait_common;
+ case LFCNSEGWAITALL:
+ fsidp = NULL;
+		/* FALLTHROUGH */
+ case LFCNSEGWAIT:
+ tvp = (struct timeval *)ap->a_data;
+segwait_common:
+ mutex_enter(&lfs_lock);
+ ++fs->lfs_sleepers;
+ mutex_exit(&lfs_lock);
+
+ error = lfs_segwait(fsidp, tvp);
+
+ mutex_enter(&lfs_lock);
+ if (--fs->lfs_sleepers == 0)
+ wakeup(&fs->lfs_sleepers);
+ mutex_exit(&lfs_lock);
+ return error;
+
+ case LFCNBMAPV:
+ case LFCNMARKV:
+ blkvp = *(struct lfs_fcntl_markv *)ap->a_data;
+
+ blkcnt = blkvp.blkcnt;
+ if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
+ return (EINVAL);
+ blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
+ if ((error = copyin(blkvp.blkiov, blkiov,
+ blkcnt * sizeof(BLOCK_INFO))) != 0) {
+ lfs_free(fs, blkiov, LFS_NB_BLKIOV);
+ return error;
+ }
+
+ mutex_enter(&lfs_lock);
+ ++fs->lfs_sleepers;
+ mutex_exit(&lfs_lock);
+ if (ap->a_command == LFCNBMAPV)
+ error = lfs_bmapv(l->l_proc, fsidp, blkiov, blkcnt);
+ else /* LFCNMARKV */
+ error = lfs_markv(l->l_proc, fsidp, blkiov, blkcnt);
+ if (error == 0)
+ error = copyout(blkiov, blkvp.blkiov,
+ blkcnt * sizeof(BLOCK_INFO));
+ mutex_enter(&lfs_lock);
+ if (--fs->lfs_sleepers == 0)
+ wakeup(&fs->lfs_sleepers);
+ mutex_exit(&lfs_lock);
+ lfs_free(fs, blkiov, LFS_NB_BLKIOV);
+ return error;
+
+ case LFCNRECLAIM:
+ /*
+ * Flush dirops and write Ifile, allowing empty segments
+ * to be immediately reclaimed.
+ */
+ lfs_writer_enter(fs, "pndirop");
+ off = fs->lfs_offset;
+ lfs_seglock(fs, SEGM_FORCE_CKP | SEGM_CKP);
+ lfs_flush_dirops(fs);
+ LFS_CLEANERINFO(cip, fs, bp);
+ oclean = cip->clean;
+ LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
+ lfs_segwrite(ap->a_vp->v_mount, SEGM_FORCE_CKP);
+ fs->lfs_sp->seg_flags |= SEGM_PROT;
+ lfs_segunlock(fs);
+ lfs_writer_leave(fs);
+
+#ifdef DEBUG
+ LFS_CLEANERINFO(cip, fs, bp);
+ DLOG((DLOG_CLEAN, "lfs_fcntl: reclaim wrote %" PRId64
+ " blocks, cleaned %" PRId32 " segments (activesb %d)\n",
+ fs->lfs_offset - off, cip->clean - oclean,
+ fs->lfs_activesb));
+ LFS_SYNC_CLEANERINFO(cip, fs, bp, 0);
+#endif
+
+ return 0;
+
+ case LFCNIFILEFH_COMPAT:
+ /* Return the filehandle of the Ifile */
+ if ((error = kauth_authorize_system(l->l_cred,
+ KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL)) != 0)
+ return (error);
+ fhp = (struct fhandle *)ap->a_data;
+ fhp->fh_fsid = *fsidp;
+ fh_size = 16; /* former VFS_MAXFIDSIZ */
+ return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size);
+
+ case LFCNIFILEFH_COMPAT2:
+ case LFCNIFILEFH:
+ /* Return the filehandle of the Ifile */
+ fhp = (struct fhandle *)ap->a_data;
+ fhp->fh_fsid = *fsidp;
+ fh_size = sizeof(struct lfs_fhandle) -
+ offsetof(fhandle_t, fh_fid);
+ return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size);
+
+ case LFCNREWIND:
+ /* Move lfs_offset to the lowest-numbered segment */
+ return lfs_rewind(fs, *(int *)ap->a_data);
+
+ case LFCNINVAL:
+ /* Mark a segment SEGUSE_INVAL */
+ LFS_SEGENTRY(sup, fs, *(int *)ap->a_data, bp);
+ if (sup->su_nbytes > 0) {
+ brelse(bp, 0);
+ lfs_unset_inval_all(fs);
+ return EBUSY;
+ }
+ sup->su_flags |= SEGUSE_INVAL;
+ VOP_BWRITE(bp->b_vp, bp);
+ return 0;
+
+ case LFCNRESIZE:
+ /* Resize the filesystem */
+ return lfs_resize_fs(fs, *(int *)ap->a_data);
+
+ case LFCNWRAPSTOP:
+ case LFCNWRAPSTOP_COMPAT:
+ /*
+ * Hold lfs_newseg at segment 0; if requested, sleep until
+		 * the filesystem wraps around.  This supports external agents
+		 * (dump, fsck-based regression tests) that need to look at
+		 * a snapshot of the filesystem without necessarily
+		 * requiring that all fs activity stop.
+ */
+ if (fs->lfs_stoplwp == curlwp)
+ return EALREADY;
+
+ mutex_enter(&lfs_lock);
+ while (fs->lfs_stoplwp != NULL)
+ cv_wait(&fs->lfs_stopcv, &lfs_lock);
+ fs->lfs_stoplwp = curlwp;
+ if (fs->lfs_nowrap == 0)
+ log(LOG_NOTICE, "%s: disabled log wrap\n", fs->lfs_fsmnt);
+ ++fs->lfs_nowrap;
+ if (*(int *)ap->a_data == 1
+ || ap->a_command == LFCNWRAPSTOP_COMPAT) {
+ log(LOG_NOTICE, "LFCNSTOPWRAP waiting for log wrap\n");
+ error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
+ "segwrap", 0, &lfs_lock);
+ log(LOG_NOTICE, "LFCNSTOPWRAP done waiting\n");
+ if (error) {
+ lfs_wrapgo(fs, VTOI(ap->a_vp), 0);
+ }
+ }
+ mutex_exit(&lfs_lock);
+ return 0;
+
+ case LFCNWRAPGO:
+ case LFCNWRAPGO_COMPAT:
+ /*
+ * Having done its work, the agent wakes up the writer.
+ * If the argument is 1, it sleeps until a new segment
+ * is selected.
+ */
+ mutex_enter(&lfs_lock);
+ error = lfs_wrapgo(fs, VTOI(ap->a_vp),
+ ap->a_command == LFCNWRAPGO_COMPAT ? 1 :
+ *((int *)ap->a_data));
+ mutex_exit(&lfs_lock);
+ return error;
+
+ case LFCNWRAPPASS:
+ if ((VTOI(ap->a_vp)->i_lfs_iflags & LFSI_WRAPWAIT))
+ return EALREADY;
+ mutex_enter(&lfs_lock);
+ if (fs->lfs_stoplwp != curlwp) {
+ mutex_exit(&lfs_lock);
+ return EALREADY;
+ }
+ if (fs->lfs_nowrap == 0) {
+ mutex_exit(&lfs_lock);
+ return EBUSY;
+ }
+ fs->lfs_wrappass = 1;
+ wakeup(&fs->lfs_wrappass);
+ /* Wait for the log to wrap, if asked */
+ if (*(int *)ap->a_data) {
+ mutex_enter(ap->a_vp->v_interlock);
+ lfs_vref(ap->a_vp);
+ VTOI(ap->a_vp)->i_lfs_iflags |= LFSI_WRAPWAIT;
+ log(LOG_NOTICE, "LFCNPASS waiting for log wrap\n");
+ error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
+ "segwrap", 0, &lfs_lock);
+ log(LOG_NOTICE, "LFCNPASS done waiting\n");
+ VTOI(ap->a_vp)->i_lfs_iflags &= ~LFSI_WRAPWAIT;
+ lfs_vunref(ap->a_vp);
+ }
+ mutex_exit(&lfs_lock);
+ return error;
+
+ case LFCNWRAPSTATUS:
+ mutex_enter(&lfs_lock);
+ *(int *)ap->a_data = fs->lfs_wrapstatus;
+ mutex_exit(&lfs_lock);
+ return 0;
+
+ default:
+ return ufs_fcntl(v);
+ }
+ return 0;
+}
+
+int
+lfs_getpages(void *v)
+{
+ struct vop_getpages_args /* {
+ struct vnode *a_vp;
+ voff_t a_offset;
+ struct vm_page **a_m;
+ int *a_count;
+ int a_centeridx;
+ vm_prot_t a_access_type;
+ int a_advice;
+ int a_flags;
+ } */ *ap = v;
+
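+	/*
+	 * The Ifile is maintained by the kernel and the cleaner only,
+	 * so refuse any attempt to fault its pages in for writing.
+	 */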
+ if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM &&
+ (ap->a_access_type & VM_PROT_WRITE) != 0) {
+ return EPERM;
+ }
+ if ((ap->a_access_type & VM_PROT_WRITE) != 0) {
+ mutex_enter(&lfs_lock);
+ LFS_SET_UINO(VTOI(ap->a_vp), IN_MODIFIED);
+ mutex_exit(&lfs_lock);
+ }
+
+ /*
+	 * we're relying on the fact that genfs_getpages() always reads in
+ * entire filesystem blocks.
+ */
+ return genfs_getpages(v);
+}
+
+/*
+ * Wait for a page to become unbusy, possibly printing diagnostic messages
+ * as well.
+ *
+ * Called with vp->v_interlock held; return with it held.
+ */
+static void
+wait_for_page(struct vnode *vp, struct vm_page *pg, const char *label)
+{
+ if ((pg->flags & PG_BUSY) == 0)
+ return; /* Nothing to wait for! */
+
+#if defined(DEBUG) && defined(UVM_PAGE_TRKOWN)
+ static struct vm_page *lastpg;
+
+ if (label != NULL && pg != lastpg) {
+ if (pg->owner_tag) {
+ printf("lfs_putpages[%d.%d]: %s: page %p owner %d.%d [%s]\n",
+ curproc->p_pid, curlwp->l_lid, label,
+ pg, pg->owner, pg->lowner, pg->owner_tag);
+ } else {
+ printf("lfs_putpages[%d.%d]: %s: page %p unowned?!\n",
+ curproc->p_pid, curlwp->l_lid, label, pg);
+ }
+ }
+ lastpg = pg;
+#endif
+
+ pg->flags |= PG_WANTED;
+ UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0, "lfsput", 0);
+ mutex_enter(vp->v_interlock);
+}
+
+/*
+ * This routine is called by lfs_putpages() when it can't complete the
+ * write because a page is busy. This means that either (1) someone,
+ * possibly the pagedaemon, is looking at this page, and will give it up
+ * presently; or (2) we ourselves are holding the page busy in the
+ * process of being written (either gathered or actually on its way to
+ * disk). We don't need to give up the segment lock, but we might need
+ * to call lfs_writeseg() to expedite the page's journey to disk.
+ *
+ * Called with vp->v_interlock held; return with it held.
+ */
+/* #define BUSYWAIT */
+static void
+write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg,
+ int seglocked, const char *label)
+{
+#ifndef BUSYWAIT
+ struct inode *ip = VTOI(vp);
+ struct segment *sp = fs->lfs_sp;
+ int count = 0;
+
+ if (pg == NULL)
+ return;
+
+ while (pg->flags & PG_BUSY &&
+ pg->uobject == &vp->v_uobj) {
+ mutex_exit(vp->v_interlock);
+ if (sp->cbpp - sp->bpp > 1) {
+ /* Write gathered pages */
+ lfs_updatemeta(sp);
+ lfs_release_finfo(fs);
+ (void) lfs_writeseg(fs, sp);
+
+ /*
+ * Reinitialize FIP
+ */
+ KASSERT(sp->vp == vp);
+ lfs_acquire_finfo(fs, ip->i_number,
+ ip->i_gen);
+ }
+ ++count;
+ mutex_enter(vp->v_interlock);
+ wait_for_page(vp, pg, label);
+ }
+ if (label != NULL && count > 1)
+ printf("lfs_putpages[%d]: %s: %sn = %d\n", curproc->p_pid,
+ label, (count > 0 ? "looping, " : ""), count);
+#else
+ preempt(1);
+#endif
+}
+
+/*
+ * Make sure that for all pages in every block in the given range,
+ * either all are dirty or all are clean. If any of the pages
+ * we've seen so far are dirty, put the vnode on the paging chain,
+ * and mark it IN_PAGING.
+ *
+ * If checkfirst != 0, don't check all the pages but return at the
+ * first dirty page.
+ */
+static int
+check_dirty(struct lfs *fs, struct vnode *vp,
+ off_t startoffset, off_t endoffset, off_t blkeof,
+ int flags, int checkfirst, struct vm_page **pgp)
+{
+ int by_list;
+ struct vm_page *curpg = NULL; /* XXX: gcc */
+ struct vm_page *pgs[MAXBSIZE / PAGE_SIZE], *pg;
+ off_t soff = 0; /* XXX: gcc */
+ voff_t off;
+ int i;
+ int nonexistent;
+ int any_dirty; /* number of dirty pages */
+ int dirty; /* number of dirty pages in a block */
+ int tdirty;
+ int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT;
+ int pagedaemon = (curlwp == uvm.pagedaemon_lwp);
+
+ ASSERT_MAYBE_SEGLOCK(fs);
+ top:
+ by_list = (vp->v_uobj.uo_npages <=
+ ((endoffset - startoffset) >> PAGE_SHIFT) *
+ UVM_PAGE_TREE_PENALTY);
+ any_dirty = 0;
+
+ if (by_list) {
+ curpg = TAILQ_FIRST(&vp->v_uobj.memq);
+ } else {
+ soff = startoffset;
+ }
+ while (by_list || soff < MIN(blkeof, endoffset)) {
+ if (by_list) {
+ /*
+ * Find the first page in a block. Skip
+ * blocks outside our area of interest or beyond
+ * the end of file.
+ */
+ KASSERT(curpg == NULL
+ || (curpg->flags & PG_MARKER) == 0);
+ if (pages_per_block > 1) {
+ while (curpg &&
+ ((curpg->offset & fs->lfs_bmask) ||
+ curpg->offset >= vp->v_size ||
+ curpg->offset >= endoffset)) {
+ curpg = TAILQ_NEXT(curpg, listq.queue);
+ KASSERT(curpg == NULL ||
+ (curpg->flags & PG_MARKER) == 0);
+ }
+ }
+ if (curpg == NULL)
+ break;
+ soff = curpg->offset;
+ }
+
+ /*
+ * Mark all pages in extended range busy; find out if any
+ * of them are dirty.
+ */
+ nonexistent = dirty = 0;
+ for (i = 0; i == 0 || i < pages_per_block; i++) {
+ if (by_list && pages_per_block <= 1) {
+ pgs[i] = pg = curpg;
+ } else {
+ off = soff + (i << PAGE_SHIFT);
+ pgs[i] = pg = uvm_pagelookup(&vp->v_uobj, off);
+ if (pg == NULL) {
+ ++nonexistent;
+ continue;
+ }
+ }
+ KASSERT(pg != NULL);
+
+ /*
+ * If we're holding the segment lock, we can deadlock
+ * against a process that has our page and is waiting
+ * for the cleaner, while the cleaner waits for the
+ * segment lock. Just bail in that case.
+ */
+ if ((pg->flags & PG_BUSY) &&
+ (pagedaemon || LFS_SEGLOCK_HELD(fs))) {
+ if (i > 0)
+ uvm_page_unbusy(pgs, i);
+ DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n"));
+ if (pgp)
+ *pgp = pg;
+ return -1;
+ }
+
+ while (pg->flags & PG_BUSY) {
+ wait_for_page(vp, pg, NULL);
+ if (i > 0)
+ uvm_page_unbusy(pgs, i);
+ goto top;
+ }
+ pg->flags |= PG_BUSY;
+ UVM_PAGE_OWN(pg, "lfs_putpages");
+
+ pmap_page_protect(pg, VM_PROT_NONE);
+ tdirty = (pmap_clear_modify(pg) ||
+ (pg->flags & PG_CLEAN) == 0);
+ dirty += tdirty;
+ }
+ if (pages_per_block > 0 && nonexistent >= pages_per_block) {
+ if (by_list) {
+ curpg = TAILQ_NEXT(curpg, listq.queue);
+ } else {
+ soff += fs->lfs_bsize;
+ }
+ continue;
+ }
+
+ any_dirty += dirty;
+ KASSERT(nonexistent == 0);
+
+ /*
+ * If any are dirty make all dirty; unbusy them,
+ * but if we were asked to clean, wire them so that
+ * the pagedaemon doesn't bother us about them while
+ * they're on their way to disk.
+ */
+ for (i = 0; i == 0 || i < pages_per_block; i++) {
+ pg = pgs[i];
+ KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI)));
+ if (dirty) {
+ pg->flags &= ~PG_CLEAN;
+ if (flags & PGO_FREE) {
+ /*
+ * Wire the page so that
+ * pdaemon doesn't see it again.
+ */
+ mutex_enter(&uvm_pageqlock);
+ uvm_pagewire(pg);
+ mutex_exit(&uvm_pageqlock);
+
+ /* Suspended write flag */
+ pg->flags |= PG_DELWRI;
+ }
+ }
+ if (pg->flags & PG_WANTED)
+ wakeup(pg);
+ pg->flags &= ~(PG_WANTED|PG_BUSY);
+ UVM_PAGE_OWN(pg, NULL);
+ }
+
+ if (checkfirst && any_dirty)
+ break;
+
+ if (by_list) {
+ curpg = TAILQ_NEXT(curpg, listq.queue);
+ } else {
+ soff += MAX(PAGE_SIZE, fs->lfs_bsize);
+ }
+ }
+
+ return any_dirty;
+}
+
+/*
+ * lfs_putpages functions like genfs_putpages except that
+ *
+ * (1) It needs to bounds-check the incoming requests to ensure that
+ * they are block-aligned; if they are not, expand the range and
+ *     do the right thing in the case where, e.g., the requested range
+ *     is clean but the expanded range is dirty.  (A sketch of the
+ *     range expansion follows this comment.)
+ *
+ * (2) It needs to explicitly send blocks to be written when it is done.
+ * If VOP_PUTPAGES is called without the seglock held, we simply take
+ * the seglock and let lfs_segunlock wait for us.
+ * XXX There might be a bad situation if we have to flush a vnode while
+ * XXX lfs_markv is in operation. As of this writing we panic in this
+ * XXX case.
+ *
+ * Assumptions:
+ *
+ * (1) The caller does not hold any pages in this vnode busy. If it does,
+ * there is a danger that when we expand the page range and busy the
+ * pages we will deadlock.
+ *
+ * (2) We are called with vp->v_interlock held; we must return with it
+ * released.
+ *
+ * (3) We don't absolutely have to free pages right away, provided that
+ * the request does not have PGO_SYNCIO. When the pagedaemon gives
+ * us a request with PGO_FREE, we take the pages out of the paging
+ * queue and wake up the writer, which will handle freeing them for us.
+ *
+ * We ensure that for any filesystem block, all pages for that
+ * block are either resident or not, even if those pages are higher
+ * than EOF; that means that we will be getting requests to free
+ * "unused" pages above EOF all the time, and should ignore them.
+ *
+ * (4) If we are called with PGO_LOCKED, the finfo array we are to write
+ * into has been set up for us by lfs_writefile. If not, we will
+ * have to handle allocating and/or freeing an finfo entry.
+ *
+ * XXX note that we're (ab)using PGO_LOCKED as "seglock held".
+ */
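The bounds-check described in (1) boils down to rounding the requested range outward to whole filesystem blocks. A minimal standalone sketch of that arithmetic (simplified: the code below additionally applies round_page() and blkroundup(); "bmask" here stands in for lfs_bmask, i.e. the block size minus one):

#include <sys/types.h>

/*
 * Sketch only: expand [offlo, offhi) outward to filesystem-block
 * boundaries, as lfs_putpages does before gathering pages.
 */
static void
expand_to_blocks(off_t offlo, off_t offhi, off_t bmask,
    off_t *startp, off_t *endp)
{
	*startp = offlo & ~bmask;		/* round the start down */
	*endp = (offhi + bmask) & ~bmask;	/* round the end up */
}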
+
+/* How many times to loop before we should start to worry */
+#define TOOMANY 4
+
+int
+lfs_putpages(void *v)
+{
+ int error;
+ struct vop_putpages_args /* {
+ struct vnode *a_vp;
+ voff_t a_offlo;
+ voff_t a_offhi;
+ int a_flags;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct inode *ip;
+ struct lfs *fs;
+ struct segment *sp;
+ off_t origoffset, startoffset, endoffset, origendoffset, blkeof;
+ off_t off, max_endoffset;
+ bool seglocked, sync, pagedaemon;
+ struct vm_page *pg, *busypg;
+ UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist);
+#ifdef DEBUG
+ int debug_n_again, debug_n_dirtyclean;
+#endif
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ fs = ip->i_lfs;
+ sync = (ap->a_flags & PGO_SYNCIO) != 0;
+ pagedaemon = (curlwp == uvm.pagedaemon_lwp);
+
+ /* Putpages does nothing for metadata. */
+ if (vp == fs->lfs_ivnode || vp->v_type != VREG) {
+ mutex_exit(vp->v_interlock);
+ return 0;
+ }
+
+ /*
+ * If there are no pages, don't do anything.
+ */
+ if (vp->v_uobj.uo_npages == 0) {
+ if (TAILQ_EMPTY(&vp->v_uobj.memq) &&
+ (vp->v_iflag & VI_ONWORKLST) &&
+ LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
+ vp->v_iflag &= ~VI_WRMAPDIRTY;
+ vn_syncer_remove_from_worklist(vp);
+ }
+ mutex_exit(vp->v_interlock);
+
+ /* Remove us from paging queue, if we were on it */
+ mutex_enter(&lfs_lock);
+ if (ip->i_flags & IN_PAGING) {
+ ip->i_flags &= ~IN_PAGING;
+ TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
+ }
+ mutex_exit(&lfs_lock);
+ return 0;
+ }
+
+ blkeof = blkroundup(fs, ip->i_size);
+
+ /*
+ * Ignore requests to free pages past EOF but in the same block
+ * as EOF, unless the request is synchronous. (If the request is
+ * sync, it comes from lfs_truncate.)
+ * XXXUBC Make these pages look "active" so the pagedaemon won't
+ * XXXUBC bother us with them again.
+ */
+ if (!sync && ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
+ origoffset = ap->a_offlo;
+ for (off = origoffset; off < blkeof; off += fs->lfs_bsize) {
+ pg = uvm_pagelookup(&vp->v_uobj, off);
+ KASSERT(pg != NULL);
+ while (pg->flags & PG_BUSY) {
+ pg->flags |= PG_WANTED;
+ UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0,
+ "lfsput2", 0);
+ mutex_enter(vp->v_interlock);
+ }
+ mutex_enter(&uvm_pageqlock);
+ uvm_pageactivate(pg);
+ mutex_exit(&uvm_pageqlock);
+ }
+ ap->a_offlo = blkeof;
+ if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) {
+ mutex_exit(vp->v_interlock);
+ return 0;
+ }
+ }
+
+ /*
+ * Extend page range to start and end at block boundaries.
+ * (For the purposes of VOP_PUTPAGES, fragments don't exist.)
+ */
+ origoffset = ap->a_offlo;
+ origendoffset = ap->a_offhi;
+ startoffset = origoffset & ~(fs->lfs_bmask);
+ max_endoffset = (trunc_page(LLONG_MAX) >> fs->lfs_bshift)
+ << fs->lfs_bshift;
+
+ if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
+ endoffset = max_endoffset;
+ origendoffset = endoffset;
+ } else {
+ origendoffset = round_page(ap->a_offhi);
+ endoffset = round_page(blkroundup(fs, origendoffset));
+ }
+
+ KASSERT(startoffset > 0 || endoffset >= startoffset);
+ if (startoffset == endoffset) {
+ /* Nothing to do, why were we called? */
+ mutex_exit(vp->v_interlock);
+ DLOG((DLOG_PAGE, "lfs_putpages: startoffset = endoffset = %"
+ PRId64 "\n", startoffset));
+ return 0;
+ }
+
+ ap->a_offlo = startoffset;
+ ap->a_offhi = endoffset;
+
+ /*
+ * If not cleaning, just send the pages through genfs_putpages
+ * to be returned to the pool.
+ */
+ if (!(ap->a_flags & PGO_CLEANIT))
+ return genfs_putpages(v);
+
+ /* Set PGO_BUSYFAIL to avoid deadlocks */
+ ap->a_flags |= PGO_BUSYFAIL;
+
+ /*
+ * Likewise, if we are asked to clean but the pages are not
+ * dirty, we can just free them using genfs_putpages.
+ */
+#ifdef DEBUG
+ debug_n_dirtyclean = 0;
+#endif
+ do {
+ int r;
+
+ /* Count the number of dirty pages */
+ r = check_dirty(fs, vp, startoffset, endoffset, blkeof,
+ ap->a_flags, 1, NULL);
+ if (r < 0) {
+ /* Pages are busy with another process */
+ mutex_exit(vp->v_interlock);
+ return EDEADLK;
+ }
+ if (r > 0) /* Some pages are dirty */
+ break;
+
+ /*
+ * Sometimes pages are dirtied between the time that
+ * we check and the time we try to clean them.
+ * Instruct lfs_gop_write to return EDEADLK in this case
+ * so we can write them properly.
+ */
+ ip->i_lfs_iflags |= LFSI_NO_GOP_WRITE;
+ r = genfs_do_putpages(vp, startoffset, endoffset,
+ ap->a_flags & ~PGO_SYNCIO, &busypg);
+ ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE;
+ if (r != EDEADLK)
+ return r;
+
+ /* One of the pages was busy. Start over. */
+ mutex_enter(vp->v_interlock);
+ wait_for_page(vp, busypg, "dirtyclean");
+#ifdef DEBUG
+ ++debug_n_dirtyclean;
+#endif
+ } while(1);
+
+#ifdef DEBUG
+ if (debug_n_dirtyclean > TOOMANY)
+ printf("lfs_putpages: dirtyclean: looping, n = %d\n",
+ debug_n_dirtyclean);
+#endif
+
+ /*
+ * Dirty and asked to clean.
+ *
+ * Pagedaemon can't actually write LFS pages; wake up
+ * the writer to take care of that. The writer will
+ * notice the pager inode queue and act on that.
+ *
+ * XXX We must drop the vp->interlock before taking the lfs_lock or we
+ * get a nasty deadlock with lfs_flush_pchain().
+ */
+ if (pagedaemon) {
+ mutex_exit(vp->v_interlock);
+ mutex_enter(&lfs_lock);
+ if (!(ip->i_flags & IN_PAGING)) {
+ ip->i_flags |= IN_PAGING;
+ TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, i_lfs_pchain);
+ }
+ wakeup(&lfs_writer_daemon);
+ mutex_exit(&lfs_lock);
+ preempt();
+ return EWOULDBLOCK;
+ }
+
+ /*
+ * If this is a file created in a recent dirop, we can't flush its
+ * inode until the dirop is complete. Drain dirops, then flush the
+ * filesystem (taking care of any other pending dirops while we're
+ * at it).
+ */
+ if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT &&
+ (vp->v_uflag & VU_DIROP)) {
+ int locked;
+
+ DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n"));
+ /* XXX VOP_ISLOCKED() may not be used for lock decisions. */
+ locked = (VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
+ mutex_exit(vp->v_interlock);
+ lfs_writer_enter(fs, "ppdirop");
+ if (locked)
+ VOP_UNLOCK(vp); /* XXX why? */
+
+ mutex_enter(&lfs_lock);
+ lfs_flush_fs(fs, sync ? SEGM_SYNC : 0);
+ mutex_exit(&lfs_lock);
+
+ if (locked)
+ VOP_LOCK(vp, LK_EXCLUSIVE);
+ mutex_enter(vp->v_interlock);
+ lfs_writer_leave(fs);
+
+ /* XXX the flush should have taken care of this one too! */
+ }
+
+ /*
+ * This is it. We are going to write some pages. From here on
+ * down it's all just mechanics.
+ *
+ * Don't let genfs_putpages wait; lfs_segunlock will wait for us.
+ */
+ ap->a_flags &= ~PGO_SYNCIO;
+
+ /*
+ * If we've already got the seglock, flush the node and return.
+ * The FIP has already been set up for us by lfs_writefile,
+ * and FIP cleanup and lfs_updatemeta will also be done there,
+ * unless genfs_putpages returns EDEADLK; then we must flush
+ * what we have, and correct FIP and segment header accounting.
+ */
+ get_seglock:
+ /*
+ * If we are not called with the segment locked, lock it.
+ * Account for a new FIP in the segment header, and set sp->vp.
+ * (This should duplicate the setup at the top of lfs_writefile().)
+ */
+ seglocked = (ap->a_flags & PGO_LOCKED) != 0;
+ if (!seglocked) {
+ mutex_exit(vp->v_interlock);
+ error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0));
+ if (error != 0)
+ return error;
+ mutex_enter(vp->v_interlock);
+ lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
+ }
+ sp = fs->lfs_sp;
+ KASSERT(sp->vp == NULL);
+ sp->vp = vp;
+
+ /*
+ * Ensure that the partial segment is marked SS_DIROP if this
+ * vnode is a DIROP.
+ */
+ if (!seglocked && vp->v_uflag & VU_DIROP)
+ ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
+
+ /*
+ * Loop over genfs_putpages until all pages are gathered.
+ * genfs_putpages() drops the interlock, so reacquire it if necessary.
+ * Whenever we lose the interlock we have to rerun check_dirty, as
+ * well, since more pages might have been dirtied in our absence.
+ */
+#ifdef DEBUG
+ debug_n_again = 0;
+#endif
+ do {
+ busypg = NULL;
+ if (check_dirty(fs, vp, startoffset, endoffset, blkeof,
+ ap->a_flags, 0, &busypg) < 0) {
+ mutex_exit(vp->v_interlock);
+
+ mutex_enter(vp->v_interlock);
+ write_and_wait(fs, vp, busypg, seglocked, NULL);
+ if (!seglocked) {
+ mutex_exit(vp->v_interlock);
+ lfs_release_finfo(fs);
+ lfs_segunlock(fs);
+ mutex_enter(vp->v_interlock);
+ }
+ sp->vp = NULL;
+ goto get_seglock;
+ }
+
+ busypg = NULL;
+ error = genfs_do_putpages(vp, startoffset, endoffset,
+ ap->a_flags, &busypg);
+
+ if (error == EDEADLK || error == EAGAIN) {
+ DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
+ " %d ino %d off %x (seg %d)\n", error,
+ ip->i_number, fs->lfs_offset,
+ dtosn(fs, fs->lfs_offset)));
+
+ mutex_enter(vp->v_interlock);
+ write_and_wait(fs, vp, busypg, seglocked, "again");
+ }
+#ifdef DEBUG
+ ++debug_n_again;
+#endif
+ } while (error == EDEADLK);
+#ifdef DEBUG
+ if (debug_n_again > TOOMANY)
+ printf("lfs_putpages: again: looping, n = %d\n", debug_n_again);
+#endif
+
+ KASSERT(sp != NULL && sp->vp == vp);
+ if (!seglocked) {
+ sp->vp = NULL;
+
+ /* Write indirect blocks as well */
+ lfs_gather(fs, fs->lfs_sp, vp, lfs_match_indir);
+ lfs_gather(fs, fs->lfs_sp, vp, lfs_match_dindir);
+ lfs_gather(fs, fs->lfs_sp, vp, lfs_match_tindir);
+
+ KASSERT(sp->vp == NULL);
+ sp->vp = vp;
+ }
+
+ /*
+ * Blocks are now gathered into a segment waiting to be written.
+ * All that's left to do is update metadata, and write them.
+ */
+ lfs_updatemeta(sp);
+ KASSERT(sp->vp == vp);
+ sp->vp = NULL;
+
+ /*
+ * If we were called from lfs_writefile, we don't need to clean up
+ * the FIP or unlock the segment lock. We're done.
+ */
+ if (seglocked)
+ return error;
+
+ /* Clean up FIP and send it to disk. */
+ lfs_release_finfo(fs);
+ lfs_writeseg(fs, fs->lfs_sp);
+
+ /*
+ * Remove us from paging queue if we wrote all our pages.
+ */
+ if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
+ mutex_enter(&lfs_lock);
+ if (ip->i_flags & IN_PAGING) {
+ ip->i_flags &= ~IN_PAGING;
+ TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
+ }
+ mutex_exit(&lfs_lock);
+ }
+
+ /*
+ * XXX - with the malloc/copy writeseg, the pages are freed by now
+ * even if we don't wait (e.g. if we hold a nested lock). This
+ * will not be true if we stop using malloc/copy.
+ */
+ KASSERT(fs->lfs_sp->seg_flags & SEGM_PROT);
+ lfs_segunlock(fs);
+
+ /*
+ * Wait for v_numoutput to drop to zero. The seglock should
+ * take care of this, but there is a slight possibility that
+ * aiodoned might not have got around to our buffers yet.
+ */
+ if (sync) {
+ mutex_enter(vp->v_interlock);
+ while (vp->v_numoutput > 0) {
+ DLOG((DLOG_PAGE, "lfs_putpages: ino %d sleeping on"
+ " num %d\n", ip->i_number, vp->v_numoutput));
+ cv_wait(&vp->v_cv, vp->v_interlock);
+ }
+ mutex_exit(vp->v_interlock);
+ }
+ return error;
+}
+
+/*
+ * Return the last logical file offset that should be written for this file
+ * if we're doing a write that ends at "size". If writing, we need to know
+ * about sizes on disk, i.e. fragments if there are any; if reading, we need
+ * to know about entire blocks.
+ */
+void
+lfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
+{
+ struct inode *ip = VTOI(vp);
+ struct lfs *fs = ip->i_lfs;
+ daddr_t olbn, nlbn;
+
+ olbn = lblkno(fs, ip->i_size);
+ nlbn = lblkno(fs, size);
+ if (!(flags & GOP_SIZE_MEM) && nlbn < NDADDR && olbn <= nlbn) {
+ *eobp = fragroundup(fs, size);
+ } else {
+ *eobp = blkroundup(fs, size);
+ }
+}
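As a concrete illustration of the two roundings, using made-up example sizes of 8192-byte blocks and 1024-byte fragments (these numbers are not taken from the patch):

#include <stdio.h>

/* Round x up to the next multiple of unit, as frag/blkroundup do. */
static long long
roundup_to(long long x, long long unit)
{
	return ((x + unit - 1) / unit) * unit;
}

int
main(void)
{
	/* A write that ends at byte 5000 within the direct-block area. */
	printf("on-disk (fragment) rounding: %lld\n", roundup_to(5000, 1024));
	printf("in-memory (block) rounding:  %lld\n", roundup_to(5000, 8192));
	/* prints 5120 and 8192 respectively */
	return 0;
}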
+
+#ifdef DEBUG
+void lfs_dump_vop(void *);
+
+void
+lfs_dump_vop(void *v)
+{
+ struct vop_putpages_args /* {
+ struct vnode *a_vp;
+ voff_t a_offlo;
+ voff_t a_offhi;
+ int a_flags;
+ } */ *ap = v;
+
+#ifdef DDB
+ vfs_vnode_print(ap->a_vp, 0, printf);
+#endif
+ lfs_dump_dinode(VTOI(ap->a_vp)->i_din.ffs1_din);
+}
+#endif
+
+int
+lfs_mmap(void *v)
+{
+ struct vop_mmap_args /* {
+ const struct vnodeop_desc *a_desc;
+ struct vnode *a_vp;
+ vm_prot_t a_prot;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+
+ if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM)
+ return EOPNOTSUPP;
+ return ufs_mmap(v);
+}
--- /dev/null
+# $NetBSD: Makefile,v 1.2 1999/07/03 18:40:32 thorpej Exp $
+
+INCSDIR= /usr/include/ufs/mfs
+
+INCS= mfs_extern.h mfsnode.h
+
+.include <bsd.kinc.mk>
--- /dev/null
+/* $NetBSD: mfs_miniroot.c,v 1.1 2010/03/02 17:20:02 pooka Exp $ */
+
+/*
+ * Copyright (c) 1989, 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)mfs_vfsops.c 8.11 (Berkeley) 6/19/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: mfs_miniroot.c,v 1.1 2010/03/02 17:20:02 pooka Exp $");
+
+#include <sys/param.h>
+
+#include <ufs/mfs/mfs_extern.h>
+#include <ufs/ffs/fs.h>
+
+void * mfs_rootbase; /* address of mini-root in kernel virtual memory */
+u_long mfs_rootsize; /* size of mini-root in bytes */
+
+/*
+ * This is called early in boot to set the base address and size
+ * of the mini-root.
+ */
+int
+mfs_initminiroot(void *base)
+{
+ struct fs *fs = (struct fs *)((char *)base + SBLOCK_UFS1);
+ static bool inited = false;
+
+ if (inited)
+ panic("mfs_initminiroot() called more than once");
+ inited = true;
+
+ /* check for valid super block */
+ if (fs->fs_magic != FS_UFS1_MAGIC || fs->fs_bsize > MAXBSIZE ||
+ fs->fs_bsize < sizeof(struct fs))
+ return (0);
+ rootfstype = MOUNT_MFS;
+ mfs_rootbase = base;
+ mfs_rootsize = fs->fs_fsize * fs->fs_size;
+ rootdev = makedev(255, 0);
+ return (mfs_rootsize);
+}
--- /dev/null
+/* $NetBSD: mfs_vfsops.c,v 1.103 2011/06/12 03:36:01 rmind Exp $ */
+
+/*
+ * Copyright (c) 1989, 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)mfs_vfsops.c 8.11 (Berkeley) 6/19/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: mfs_vfsops.c,v 1.103 2011/06/12 03:36:01 rmind Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_compat_netbsd.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/bufq.h>
+#include <sys/mount.h>
+#include <sys/signalvar.h>
+#include <sys/vnode.h>
+#include <sys/kmem.h>
+#include <sys/module.h>
+
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#include <ufs/mfs/mfsnode.h>
+#include <ufs/mfs/mfs_extern.h>
+
+MODULE(MODULE_CLASS_VFS, mfs, "ffs");
+
+kmutex_t mfs_lock; /* global lock */
+
+/* used for building internal dev_t, minor == 0 reserved for miniroot */
+static int mfs_minor = 1;
+static int mfs_initcnt;
+
+extern int (**mfs_vnodeop_p)(void *);
+
+static struct sysctllog *mfs_sysctl_log;
+
+/*
+ * mfs vfs operations.
+ */
+
+extern const struct vnodeopv_desc mfs_vnodeop_opv_desc;
+
+const struct vnodeopv_desc * const mfs_vnodeopv_descs[] = {
+ &mfs_vnodeop_opv_desc,
+ NULL,
+};
+
+struct vfsops mfs_vfsops = {
+ MOUNT_MFS,
+ sizeof (struct mfs_args),
+ mfs_mount,
+ mfs_start,
+ ffs_unmount,
+ ufs_root,
+ ufs_quotactl,
+ mfs_statvfs,
+ ffs_sync,
+ ffs_vget,
+ ffs_fhtovp,
+ ffs_vptofh,
+ mfs_init,
+ mfs_reinit,
+ mfs_done,
+ NULL,
+ (int (*)(struct mount *, struct vnode *, struct timespec *)) eopnotsupp,
+ vfs_stdextattrctl,
+ (void *)eopnotsupp, /* vfs_suspendctl */
+ genfs_renamelock_enter,
+ genfs_renamelock_exit,
+ (void *)eopnotsupp,
+ mfs_vnodeopv_descs,
+ 0,
+ { NULL, NULL },
+};
+
+static int
+mfs_modcmd(modcmd_t cmd, void *arg)
+{
+ int error;
+
+ switch (cmd) {
+ case MODULE_CMD_INIT:
+ error = vfs_attach(&mfs_vfsops);
+ if (error != 0)
+ break;
+ sysctl_createv(&mfs_sysctl_log, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "vfs", NULL,
+ NULL, 0, NULL, 0,
+ CTL_VFS, CTL_EOL);
+ sysctl_createv(&mfs_sysctl_log, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_ALIAS,
+ CTLTYPE_NODE, "mfs",
+ SYSCTL_DESCR("Memory based file system"),
+ NULL, 1, NULL, 0,
+ CTL_VFS, 3, CTL_EOL);
+ /*
+ * XXX the "1" and the "3" above could be dynamic, thereby
+ * eliminating one more instance of the "number to vfs"
+ * mapping problem, but they are in order as taken from
+ * sys/mount.h
+ */
+ break;
+ case MODULE_CMD_FINI:
+ error = vfs_detach(&mfs_vfsops);
+ if (error != 0)
+ break;
+ sysctl_teardown(&mfs_sysctl_log);
+ break;
+ default:
+ error = ENOTTY;
+ break;
+ }
+
+ return (error);
+}
+
+/*
+ * Memory based filesystem initialization.
+ */
+void
+mfs_init(void)
+{
+
+ if (mfs_initcnt++ == 0) {
+ mutex_init(&mfs_lock, MUTEX_DEFAULT, IPL_NONE);
+ ffs_init();
+ }
+}
+
+void
+mfs_reinit(void)
+{
+
+ ffs_reinit();
+}
+
+void
+mfs_done(void)
+{
+
+ if (--mfs_initcnt == 0) {
+ ffs_done();
+ mutex_destroy(&mfs_lock);
+ }
+}
+
+/*
+ * Called by main() when mfs is going to be mounted as root.
+ */
+
+int
+mfs_mountroot(void)
+{
+ struct fs *fs;
+ struct mount *mp;
+ struct lwp *l = curlwp; /* XXX */
+ struct ufsmount *ump;
+ struct mfsnode *mfsp;
+ int error = 0;
+
+ if ((error = vfs_rootmountalloc(MOUNT_MFS, "mfs_root", &mp))) {
+ vrele(rootvp);
+ return (error);
+ }
+
+ mfsp = kmem_alloc(sizeof(*mfsp), KM_SLEEP);
+ rootvp->v_data = mfsp;
+ rootvp->v_op = mfs_vnodeop_p;
+ rootvp->v_tag = VT_MFS;
+ mfsp->mfs_baseoff = mfs_rootbase;
+ mfsp->mfs_size = mfs_rootsize;
+ mfsp->mfs_vnode = rootvp;
+ mfsp->mfs_proc = NULL; /* indicate kernel space */
+ mfsp->mfs_shutdown = 0;
+ cv_init(&mfsp->mfs_cv, "mfs");
+ mfsp->mfs_refcnt = 1;
+ bufq_alloc(&mfsp->mfs_buflist, "fcfs", 0);
+ if ((error = ffs_mountfs(rootvp, mp, l)) != 0) {
+ vfs_unbusy(mp, false, NULL);
+ bufq_free(mfsp->mfs_buflist);
+ vfs_destroy(mp);
+ kmem_free(mfsp, sizeof(*mfsp));
+ return (error);
+ }
+ mutex_enter(&mountlist_lock);
+ CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mutex_exit(&mountlist_lock);
+ mp->mnt_vnodecovered = NULLVP;
+ ump = VFSTOUFS(mp);
+ fs = ump->um_fs;
+ (void) copystr(mp->mnt_stat.f_mntonname, fs->fs_fsmnt, MNAMELEN - 1, 0);
+ (void)ffs_statvfs(mp, &mp->mnt_stat);
+ vfs_unbusy(mp, false, NULL);
+ return (0);
+}
+
+/*
+ * VFS Operations.
+ *
+ * mount system call
+ */
+/* ARGSUSED */
+int
+mfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
+{
+ struct lwp *l = curlwp;
+ struct vnode *devvp;
+ struct mfs_args *args = data;
+ struct ufsmount *ump;
+ struct fs *fs;
+ struct mfsnode *mfsp;
+ struct proc *p;
+ int flags, error = 0;
+
+ if (*data_len < sizeof *args)
+ return EINVAL;
+
+ p = l->l_proc;
+ if (mp->mnt_flag & MNT_GETARGS) {
+ struct vnode *vp;
+
+ ump = VFSTOUFS(mp);
+ if (ump == NULL)
+ return EIO;
+
+ vp = ump->um_devvp;
+ if (vp == NULL)
+ return EIO;
+
+ mfsp = VTOMFS(vp);
+ if (mfsp == NULL)
+ return EIO;
+
+ args->fspec = NULL;
+ args->base = mfsp->mfs_baseoff;
+ args->size = mfsp->mfs_size;
+ *data_len = sizeof *args;
+ return 0;
+ }
+ /*
+ * XXX turn off async to avoid hangs when writing lots of data.
+	 * The problem is that MFS needs to allocate pages to clean pages,
+	 * so if we wait until the last minute to clean pages then there
+	 * may not be any pages available to do the cleaning.
+	 * ... and since the default partially-synchronous mode turns out
+	 * not to be sufficient under heavy load, make it fully synchronous.
+ */
+ mp->mnt_flag &= ~MNT_ASYNC;
+ mp->mnt_flag |= MNT_SYNCHRONOUS;
+
+ /*
+ * If updating, check whether changing from read-only to
+ * read/write; if there is no device name, that's all we do.
+ */
+ if (mp->mnt_flag & MNT_UPDATE) {
+ ump = VFSTOUFS(mp);
+ fs = ump->um_fs;
+ if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
+ flags = WRITECLOSE;
+ if (mp->mnt_flag & MNT_FORCE)
+ flags |= FORCECLOSE;
+ error = ffs_flushfiles(mp, flags, l);
+ if (error)
+ return (error);
+ }
+ if (fs->fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR))
+ fs->fs_ronly = 0;
+ if (args->fspec == NULL)
+ return EINVAL;
+ return (0);
+ }
+ error = getnewvnode(VT_MFS, NULL, mfs_vnodeop_p, NULL, &devvp);
+ if (error)
+ return (error);
+ devvp->v_vflag |= VV_MPSAFE;
+ devvp->v_type = VBLK;
+ spec_node_init(devvp, makedev(255, mfs_minor));
+ mfs_minor++;
+ mfsp = kmem_alloc(sizeof(*mfsp), KM_SLEEP);
+ devvp->v_data = mfsp;
+ mfsp->mfs_baseoff = args->base;
+ mfsp->mfs_size = args->size;
+ mfsp->mfs_vnode = devvp;
+ mfsp->mfs_proc = p;
+ mfsp->mfs_shutdown = 0;
+ cv_init(&mfsp->mfs_cv, "mfsidl");
+ mfsp->mfs_refcnt = 1;
+ bufq_alloc(&mfsp->mfs_buflist, "fcfs", 0);
+ if ((error = ffs_mountfs(devvp, mp, l)) != 0) {
+ mfsp->mfs_shutdown = 1;
+ vrele(devvp);
+ return (error);
+ }
+ ump = VFSTOUFS(mp);
+ fs = ump->um_fs;
+ error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
+ UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
+ if (error)
+ return error;
+ (void)strncpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname,
+ sizeof(fs->fs_fsmnt));
+ fs->fs_fsmnt[sizeof(fs->fs_fsmnt) - 1] = '\0';
+ /* XXX: cleanup on error */
+ return 0;
+}
+
+/*
+ * Used to grab the process and keep it in the kernel to service
+ * memory filesystem I/O requests.
+ *
+ * Loop servicing I/O requests.
+ * Copy the requested data into or out of the memory filesystem
+ * address space.
+ */
+/* ARGSUSED */
+int
+mfs_start(struct mount *mp, int flags)
+{
+ struct vnode *vp;
+ struct mfsnode *mfsp;
+ struct proc *p;
+ struct buf *bp;
+ void *base;
+ int sleepreturn = 0, refcnt, error;
+ ksiginfoq_t kq;
+
+ /*
+	 * Ensure that the file system is still mounted when getting the
+	 * mfsnode.  Add a reference to the mfsnode to prevent it from
+	 * disappearing in this routine.
+ */
+ if ((error = vfs_busy(mp, NULL)) != 0)
+ return error;
+ vp = VFSTOUFS(mp)->um_devvp;
+ mfsp = VTOMFS(vp);
+ mutex_enter(&mfs_lock);
+ mfsp->mfs_refcnt++;
+ mutex_exit(&mfs_lock);
+ vfs_unbusy(mp, false, NULL);
+
+ base = mfsp->mfs_baseoff;
+ mutex_enter(&mfs_lock);
+ while (mfsp->mfs_shutdown != 1) {
+ while ((bp = bufq_get(mfsp->mfs_buflist)) != NULL) {
+ mutex_exit(&mfs_lock);
+ mfs_doio(bp, base);
+ mutex_enter(&mfs_lock);
+ }
+ /*
+ * If a non-ignored signal is received, try to unmount.
+ * If that fails, or the filesystem is already in the
+ * process of being unmounted, clear the signal (it has been
+ * "processed"), otherwise we will loop here, as tsleep
+ * will always return EINTR/ERESTART.
+ */
+ if (sleepreturn != 0) {
+ mutex_exit(&mfs_lock);
+ if (dounmount(mp, 0, curlwp) != 0) {
+ p = curproc;
+ ksiginfo_queue_init(&kq);
+ mutex_enter(p->p_lock);
+ sigclearall(p, NULL, &kq);
+ mutex_exit(p->p_lock);
+ ksiginfo_queue_drain(&kq);
+ }
+ sleepreturn = 0;
+ mutex_enter(&mfs_lock);
+ continue;
+ }
+
+ sleepreturn = cv_wait_sig(&mfsp->mfs_cv, &mfs_lock);
+ }
+ KASSERT(bufq_peek(mfsp->mfs_buflist) == NULL);
+ refcnt = --mfsp->mfs_refcnt;
+ mutex_exit(&mfs_lock);
+ if (refcnt == 0) {
+ bufq_free(mfsp->mfs_buflist);
+ cv_destroy(&mfsp->mfs_cv);
+ kmem_free(mfsp, sizeof(*mfsp));
+ }
+ return (sleepreturn);
+}
+
+/*
+ * Get file system statistics.
+ */
+int
+mfs_statvfs(struct mount *mp, struct statvfs *sbp)
+{
+ int error;
+
+ error = ffs_statvfs(mp, sbp);
+ if (error)
+ return error;
+ (void)strncpy(sbp->f_fstypename, mp->mnt_op->vfs_name,
+ sizeof(sbp->f_fstypename));
+ sbp->f_fstypename[sizeof(sbp->f_fstypename) - 1] = '\0';
+ return 0;
+}
--- /dev/null
+/* $NetBSD: mfs_vnops.c,v 1.54 2010/06/24 13:03:19 hannken Exp $ */
+
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: mfs_vnops.c,v 1.54 2010/06/24 13:03:19 hannken Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/bufq.h>
+#include <sys/vnode.h>
+#include <sys/kmem.h>
+
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <machine/vmparam.h>
+
+#include <ufs/mfs/mfsnode.h>
+#include <ufs/mfs/mfs_extern.h>
+
+/*
+ * mfs vnode operations.
+ */
+int (**mfs_vnodeop_p)(void *);
+const struct vnodeopv_entry_desc mfs_vnodeop_entries[] = {
+ { &vop_default_desc, vn_default_error },
+ { &vop_lookup_desc, mfs_lookup }, /* lookup */
+ { &vop_create_desc, mfs_create }, /* create */
+ { &vop_mknod_desc, mfs_mknod }, /* mknod */
+ { &vop_open_desc, mfs_open }, /* open */
+ { &vop_close_desc, mfs_close }, /* close */
+ { &vop_access_desc, mfs_access }, /* access */
+ { &vop_getattr_desc, mfs_getattr }, /* getattr */
+ { &vop_setattr_desc, mfs_setattr }, /* setattr */
+ { &vop_read_desc, mfs_read }, /* read */
+ { &vop_write_desc, mfs_write }, /* write */
+ { &vop_ioctl_desc, mfs_ioctl }, /* ioctl */
+ { &vop_poll_desc, mfs_poll }, /* poll */
+ { &vop_revoke_desc, mfs_revoke }, /* revoke */
+ { &vop_mmap_desc, mfs_mmap }, /* mmap */
+ { &vop_fsync_desc, spec_fsync }, /* fsync */
+ { &vop_seek_desc, mfs_seek }, /* seek */
+ { &vop_remove_desc, mfs_remove }, /* remove */
+ { &vop_link_desc, mfs_link }, /* link */
+ { &vop_rename_desc, mfs_rename }, /* rename */
+ { &vop_mkdir_desc, mfs_mkdir }, /* mkdir */
+ { &vop_rmdir_desc, mfs_rmdir }, /* rmdir */
+ { &vop_symlink_desc, mfs_symlink }, /* symlink */
+ { &vop_readdir_desc, mfs_readdir }, /* readdir */
+ { &vop_readlink_desc, mfs_readlink }, /* readlink */
+ { &vop_abortop_desc, mfs_abortop }, /* abortop */
+ { &vop_inactive_desc, mfs_inactive }, /* inactive */
+ { &vop_reclaim_desc, mfs_reclaim }, /* reclaim */
+ { &vop_lock_desc, genfs_nolock }, /* lock */
+ { &vop_unlock_desc, genfs_nounlock }, /* unlock */
+ { &vop_bmap_desc, mfs_bmap }, /* bmap */
+ { &vop_strategy_desc, mfs_strategy }, /* strategy */
+ { &vop_print_desc, mfs_print }, /* print */
+ { &vop_islocked_desc, mfs_islocked }, /* islocked */
+ { &vop_pathconf_desc, mfs_pathconf }, /* pathconf */
+ { &vop_advlock_desc, mfs_advlock }, /* advlock */
+ { &vop_bwrite_desc, mfs_bwrite }, /* bwrite */
+ { &vop_putpages_desc, mfs_putpages }, /* putpages */
+ { NULL, NULL }
+};
+const struct vnodeopv_desc mfs_vnodeop_opv_desc =
+ { &mfs_vnodeop_p, mfs_vnodeop_entries };
+
+/*
+ * Vnode Operations.
+ *
+ * Open is called to allow the memory filesystem to initialize and
+ * validate before actual I/O.  Record our process identifier
+ * so we can tell when we are doing I/O to ourselves.
+ */
+/* ARGSUSED */
+int
+mfs_open(void *v)
+{
+ struct vop_open_args /* {
+ struct vnode *a_vp;
+ int a_mode;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+
+ if (ap->a_vp->v_type != VBLK) {
+		panic("mfs_open not VBLK");
+ /* NOTREACHED */
+ }
+ return (0);
+}
+
+/*
+ * Pass I/O requests to the memory filesystem process.
+ */
+int
+mfs_strategy(void *v)
+{
+ struct vop_strategy_args /* {
+ struct vnode *a_vp;
+ struct buf *a_bp;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct buf *bp = ap->a_bp;
+ struct mfsnode *mfsp;
+
+ if (vp->v_type != VBLK || vp->v_usecount == 0)
+ panic("mfs_strategy: bad dev");
+ mfsp = VTOMFS(vp);
+ /* check for mini-root access */
+ if (mfsp->mfs_proc == NULL) {
+ void *base;
+
+ base = (char *)mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT);
+ if (bp->b_flags & B_READ)
+ memcpy(bp->b_data, base, bp->b_bcount);
+ else
+ memcpy(base, bp->b_data, bp->b_bcount);
+ bp->b_resid = 0;
+ biodone(bp);
+ } else if (mfsp->mfs_proc == curproc) {
+ mfs_doio(bp, mfsp->mfs_baseoff);
+ } else if (doing_shutdown) {
+ /*
+ * bitbucket I/O during shutdown.
+ * Note that reads should *not* happen here, but..
+ */
+ if (bp->b_flags & B_READ)
+ printf("warning: mfs read during shutdown\n");
+ bp->b_resid = 0;
+ biodone(bp);
+ } else {
+ mutex_enter(&mfs_lock);
+ bufq_put(mfsp->mfs_buflist, bp);
+ cv_broadcast(&mfsp->mfs_cv);
+ mutex_exit(&mfs_lock);
+ }
+ return (0);
+}
+
+/*
+ * Memory file system I/O.
+ */
+void
+mfs_doio(struct buf *bp, void *base)
+{
+
+ base = (char *)base + (bp->b_blkno << DEV_BSHIFT);
+ if (bp->b_flags & B_READ)
+ bp->b_error = copyin(base, bp->b_data, bp->b_bcount);
+ else
+ bp->b_error = copyout(bp->b_data, base, bp->b_bcount);
+ if (bp->b_error == 0)
+ bp->b_resid = 0;
+ biodone(bp);
+}
+
+/*
+ * This is a noop, simply returning what one has been given.
+ */
+int
+mfs_bmap(void *v)
+{
+ struct vop_bmap_args /* {
+ struct vnode *a_vp;
+ daddr_t a_bn;
+ struct vnode **a_vpp;
+ daddr_t *a_bnp;
+ int *a_runp;
+ } */ *ap = v;
+
+ if (ap->a_vpp != NULL)
+ *ap->a_vpp = ap->a_vp;
+ if (ap->a_bnp != NULL)
+ *ap->a_bnp = ap->a_bn;
+ if (ap->a_runp != NULL)
+ *ap->a_runp = 0;
+ return (0);
+}
+
+/*
+ * Memory filesystem close routine
+ */
+/* ARGSUSED */
+int
+mfs_close(void *v)
+{
+ struct vop_close_args /* {
+ struct vnode *a_vp;
+ int a_fflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct mfsnode *mfsp = VTOMFS(vp);
+ struct buf *bp;
+ int error;
+
+ /*
+ * Finish any pending I/O requests.
+ */
+ mutex_enter(&mfs_lock);
+ while ((bp = bufq_get(mfsp->mfs_buflist)) != NULL) {
+ mutex_exit(&mfs_lock);
+ mfs_doio(bp, mfsp->mfs_baseoff);
+ mutex_enter(&mfs_lock);
+ }
+ mutex_exit(&mfs_lock);
+ /*
+	 * On the last close of a memory filesystem,
+	 * we must invalidate any in-core blocks so that
+	 * we can free up its vnode.
+ */
+ if ((error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0)) != 0)
+ return (error);
+ /*
+ * There should be no way to have any more uses of this
+ * vnode, so if we find any other uses, it is a panic.
+ */
+ if (bufq_peek(mfsp->mfs_buflist) != NULL)
+ panic("mfs_close");
+ /*
+ * Send a request to the filesystem server to exit.
+ */
+ mutex_enter(&mfs_lock);
+ mfsp->mfs_shutdown = 1;
+ cv_broadcast(&mfsp->mfs_cv);
+ mutex_exit(&mfs_lock);
+ return (0);
+}
+
+/*
+ * Memory filesystem inactive routine
+ */
+/* ARGSUSED */
+int
+mfs_inactive(void *v)
+{
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct mfsnode *mfsp = VTOMFS(vp);
+
+ if (bufq_peek(mfsp->mfs_buflist) != NULL)
+ panic("mfs_inactive: not inactive (mfs_buflist %p)",
+ bufq_peek(mfsp->mfs_buflist));
+ VOP_UNLOCK(vp);
+ return (0);
+}
+
+/*
+ * Reclaim a memory filesystem devvp so that it can be reused.
+ */
+int
+mfs_reclaim(void *v)
+{
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct mfsnode *mfsp = VTOMFS(vp);
+ int refcnt;
+
+ mutex_enter(&mfs_lock);
+ vp->v_data = NULL;
+ refcnt = --mfsp->mfs_refcnt;
+ mutex_exit(&mfs_lock);
+
+ if (refcnt == 0) {
+ bufq_free(mfsp->mfs_buflist);
+ cv_destroy(&mfsp->mfs_cv);
+ kmem_free(mfsp, sizeof(*mfsp));
+ }
+
+ return (0);
+}
+
+/*
+ * Print out the contents of an mfsnode.
+ */
+int
+mfs_print(void *v)
+{
+ struct vop_print_args /* {
+ struct vnode *a_vp;
+ } */ *ap = v;
+ struct mfsnode *mfsp = VTOMFS(ap->a_vp);
+
+ printf("tag VT_MFS, pid %d, base %p, size %ld\n",
+ (mfsp->mfs_proc != NULL) ? mfsp->mfs_proc->p_pid : 0,
+ mfsp->mfs_baseoff, mfsp->mfs_size);
+ return (0);
+}
--- /dev/null
+# $NetBSD: Makefile,v 1.7 2011/03/06 17:08:39 bouyer Exp $
+
+INCSDIR= /usr/include/ufs/ufs
+
+INCS= dinode.h dir.h extattr.h inode.h quota.h quota1.h quota2.h \
+ ufs_bswap.h ufs_extern.h ufs_wapbl.h ufsmount.h
+
+.include <bsd.kinc.mk>
--- /dev/null
+/* $NetBSD: quota1_subr.c,v 1.6 2011/11/25 16:55:05 dholland Exp $ */
+/*-
+ * Copyright (c) 2010 Manuel Bouyer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: quota1_subr.c,v 1.6 2011/11/25 16:55:05 dholland Exp $");
+
+#include <sys/types.h>
+#include <machine/limits.h>
+
+#include <sys/quota.h>
+#include <quota/quotaprop.h>
+#include <ufs/ufs/quota1.h>
+
+static uint64_t
+dqblk2q2e_limit(uint32_t lim)
+{
+ if (lim == 0)
+ return UQUAD_MAX;
+ else
+ return (lim - 1);
+}
+
+static uint32_t
+q2e2dqblk_limit(uint64_t lim)
+{
+ if (lim == UQUAD_MAX)
+ return 0;
+ else
+ return (lim + 1);
+}
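The two helpers above encode "no limit": a dqblk limit of 0 becomes UQUAD_MAX in a quotaval, and every finite limit is shifted by one, so the two representations round-trip. A small hypothetical check of that property (restated standalone; only the conversion rule is taken from this file):

#include <assert.h>
#include <stdint.h>

#define UQUAD_MAX	UINT64_MAX	/* stand-in for the kernel constant */

/* The same conversions as dqblk2q2e_limit()/q2e2dqblk_limit(), restated. */
static uint64_t to_qv(uint32_t lim)  { return lim == 0 ? UQUAD_MAX : lim - 1; }
static uint32_t to_dqb(uint64_t lim) { return lim == UQUAD_MAX ? 0 : lim + 1; }

int
main(void)
{
	assert(to_qv(0) == UQUAD_MAX);		/* 0 means "unlimited" */
	assert(to_qv(1001) == 1000);		/* finite limits shift by one */
	assert(to_dqb(to_qv(1001)) == 1001);	/* and they round-trip */
	return 0;
}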
+
+void
+dqblk_to_quotaval(const struct dqblk *dqblk, struct quotaval *qv)
+{
+ /* XXX is qv_grace getting handled correctly? */
+
+ qv[QUOTA_LIMIT_BLOCK].qv_hardlimit =
+ dqblk2q2e_limit(dqblk->dqb_bhardlimit);
+ qv[QUOTA_LIMIT_BLOCK].qv_softlimit =
+ dqblk2q2e_limit(dqblk->dqb_bsoftlimit);
+ qv[QUOTA_LIMIT_BLOCK].qv_usage = dqblk->dqb_curblocks;
+ qv[QUOTA_LIMIT_BLOCK].qv_expiretime = dqblk->dqb_btime;
+
+ qv[QUOTA_LIMIT_FILE].qv_hardlimit =
+ dqblk2q2e_limit(dqblk->dqb_ihardlimit);
+ qv[QUOTA_LIMIT_FILE].qv_softlimit =
+ dqblk2q2e_limit(dqblk->dqb_isoftlimit);
+ qv[QUOTA_LIMIT_FILE].qv_usage = dqblk->dqb_curinodes;
+ qv[QUOTA_LIMIT_FILE].qv_expiretime = dqblk->dqb_itime;
+}
+
+void
+quotaval_to_dqblk(const struct quotaval *qv, struct dqblk *dqblk)
+{
+ /* XXX is qv_grace getting handled correctly? */
+
+ dqblk->dqb_bhardlimit =
+ q2e2dqblk_limit(qv[QUOTA_LIMIT_BLOCK].qv_hardlimit);
+ dqblk->dqb_bsoftlimit =
+ q2e2dqblk_limit(qv[QUOTA_LIMIT_BLOCK].qv_softlimit);
+ dqblk->dqb_curblocks = qv[QUOTA_LIMIT_BLOCK].qv_usage;
+ dqblk->dqb_btime = qv[QUOTA_LIMIT_BLOCK].qv_expiretime;
+
+ dqblk->dqb_ihardlimit =
+ q2e2dqblk_limit(qv[QUOTA_LIMIT_FILE].qv_hardlimit);
+ dqblk->dqb_isoftlimit =
+ q2e2dqblk_limit(qv[QUOTA_LIMIT_FILE].qv_softlimit);
+ dqblk->dqb_curinodes = qv[QUOTA_LIMIT_FILE].qv_usage;
+ dqblk->dqb_itime = qv[QUOTA_LIMIT_FILE].qv_expiretime;
+}
+
--- /dev/null
+/* $NetBSD: quota2_subr.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $ */
+/*-
+ * Copyright (c) 2010 Manuel Bouyer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: quota2_subr.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $");
+
+#include <sys/param.h>
+#include <sys/time.h>
+
+#include <ufs/ufs/dinode.h>
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/quota2.h>
+
+#ifndef _KERNEL
+#include <string.h>
+#endif
+
+void
+quota2_addfreeq2e(struct quota2_header *q2h, void *bp, uint64_t baseoff,
+ uint64_t bsize, int ns)
+{
+ uint64_t blkoff = baseoff % bsize;
+ int i, nq2e;
+ struct quota2_entry *q2e;
+
+ q2e = (void *)((char *)bp + blkoff);
+ nq2e = (bsize - blkoff) / sizeof(*q2e);
+ for (i = 0; i < nq2e; i++) {
+ q2e[i].q2e_next = q2h->q2h_free;
+ q2h->q2h_free = ufs_rw64(i * sizeof(*q2e) + baseoff, ns);
+ }
+}
+
+void
+quota2_create_blk0(uint64_t bsize, void *bp, int q2h_hash_shift, int type,
+ int ns)
+{
+ struct quota2_header *q2h;
+ const int quota2_hash_size = 1 << q2h_hash_shift;
+ const int quota2_full_header_size = sizeof(struct quota2_header) +
+ sizeof(q2h->q2h_entries[0]) * quota2_hash_size;
+ int i;
+
+ memset(bp, 0, bsize);
+ q2h = bp;
+ q2h->q2h_magic_number = ufs_rw32(Q2_HEAD_MAGIC, ns);
+ q2h->q2h_type = type;
+ q2h->q2h_hash_shift = q2h_hash_shift;
+ q2h->q2h_hash_size = ufs_rw16(quota2_hash_size, ns);
+	/* set up the default entry: unlimited, 7 days grace */
+ for (i = 0; i < N_QL; i++) {
+ q2h->q2h_defentry.q2e_val[i].q2v_hardlimit =
+ q2h->q2h_defentry.q2e_val[i].q2v_softlimit =
+ ufs_rw64(UQUAD_MAX, ns);
+ q2h->q2h_defentry.q2e_val[i].q2v_grace =
+ ufs_rw64(7ULL * 24ULL * 3600ULL, ns);
+ }
+
+ /* first quota entry, after the hash table */
+ quota2_addfreeq2e(q2h, bp, quota2_full_header_size, bsize, ns);
+}
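A hypothetical usage sketch of quota2_create_blk0(): build block 0 of a fresh quota2 file in a heap buffer. The 16 KB block size, the hash shift of 5, and passing 0 for both the quota type and the byte-swap flag are illustrative assumptions, not values from this patch.

#include <stdint.h>
#include <stdlib.h>

#include <ufs/ufs/quota2.h>	/* assumed to declare quota2_create_blk0() */

/* Sketch only: set up an in-memory block 0 for a new quota2 file. */
static void *
make_quota2_blk0(void)
{
	const uint64_t bsize = 16384;
	void *bp = malloc(bsize);

	if (bp == NULL)
		return NULL;
	/* 1 << 5 hash buckets; type 0; ns = 0 (native byte order). */
	quota2_create_blk0(bsize, bp, 5, 0, 0);
	return bp;	/* header, default entry and free list now populated */
}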
+
+void
+quota2_ufs_rwq2v(const struct quota2_val *s, struct quota2_val *d, int needswap)
+{
+ d->q2v_hardlimit = ufs_rw64(s->q2v_hardlimit, needswap);
+ d->q2v_softlimit = ufs_rw64(s->q2v_softlimit, needswap);
+ d->q2v_cur = ufs_rw64(s->q2v_cur, needswap);
+ d->q2v_time = ufs_rw64(s->q2v_time, needswap);
+ d->q2v_grace = ufs_rw64(s->q2v_grace, needswap);
+}
+
+void
+quota2_ufs_rwq2e(const struct quota2_entry *s, struct quota2_entry *d,
+int needswap)
+{
+ quota2_ufs_rwq2v(&s->q2e_val[QL_BLOCK], &d->q2e_val[QL_BLOCK],
+ needswap);
+ quota2_ufs_rwq2v(&s->q2e_val[QL_FILE], &d->q2e_val[QL_FILE],
+ needswap);
+ d->q2e_uid = ufs_rw32(s->q2e_uid, needswap);
+}
--- /dev/null
+/* $NetBSD: ufs_bmap.c,v 1.49 2011/03/06 17:08:39 bouyer Exp $ */
+
+/*
+ * Copyright (c) 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_bmap.c 8.8 (Berkeley) 8/11/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_bmap.c,v 1.49 2011/03/06 17:08:39 bouyer Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/resourcevar.h>
+#include <sys/trace.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+
+static bool
+ufs_issequential(const struct ufsmount *ump, daddr_t daddr0, daddr_t daddr1)
+{
+
+	/* for ufs, blocks in a hole are not 'contiguous'. */
+ if (daddr0 == 0)
+ return false;
+
+ return (daddr0 + ump->um_seqinc == daddr1);
+}
+
+/*
+ * Bmap converts the logical block number of a file to its physical block
+ * number on the disk. The conversion is done by using the logical block
+ * number to index into the array of block pointers described by the dinode.
+ */
+int
+ufs_bmap(void *v)
+{
+ struct vop_bmap_args /* {
+ struct vnode *a_vp;
+ daddr_t a_bn;
+ struct vnode **a_vpp;
+ daddr_t *a_bnp;
+ int *a_runp;
+ } */ *ap = v;
+ int error;
+
+ /*
+ * Check for underlying vnode requests and ensure that logical
+ * to physical mapping is requested.
+ */
+ if (ap->a_vpp != NULL)
+ *ap->a_vpp = VTOI(ap->a_vp)->i_devvp;
+ if (ap->a_bnp == NULL)
+ return (0);
+
+ fstrans_start(ap->a_vp->v_mount, FSTRANS_SHARED);
+ error = ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL,
+ ap->a_runp, ufs_issequential);
+ fstrans_done(ap->a_vp->v_mount);
+ return error;
+}
+
+/*
+ * Indirect blocks are now on the vnode for the file. They are given negative
+ * logical block numbers. Indirect blocks are addressed by the negative
+ * address of the first data block to which they point. Double indirect blocks
+ * are addressed by one less than the address of the first indirect block to
+ * which they point. Triple indirect blocks are addressed by one less than
+ * the address of the first double indirect block to which they point.
+ *
+ * ufs_bmaparray does the bmap conversion, and if requested returns the
+ * array of logical blocks which must be traversed to get to a block.
+ * Each entry contains the offset into that block that gets you to the
+ * next block and the disk address of the block (if it is assigned).
+ */
+
+int
+ufs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap,
+ int *nump, int *runp, ufs_issequential_callback_t is_sequential)
+{
+ struct inode *ip;
+ struct buf *bp, *cbp;
+ struct ufsmount *ump;
+ struct mount *mp;
+ struct indir a[NIADDR + 1], *xap;
+ daddr_t daddr;
+ daddr_t metalbn;
+ int error, maxrun = 0, num;
+
+ ip = VTOI(vp);
+ mp = vp->v_mount;
+ ump = ip->i_ump;
+#ifdef DIAGNOSTIC
+ if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL))
+ panic("ufs_bmaparray: invalid arguments");
+#endif
+
+ if (runp) {
+ /*
+ * XXX
+ * If MAXBSIZE is the largest transfer the disks can handle,
+ * we probably want maxrun to be 1 block less so that we
+ * don't create a block larger than the device can handle.
+ */
+ *runp = 0;
+ maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1;
+ }
+
+ if (bn >= 0 && bn < NDADDR) {
+ if (nump != NULL)
+ *nump = 0;
+ if (ump->um_fstype == UFS1)
+ daddr = ufs_rw32(ip->i_ffs1_db[bn],
+ UFS_MPNEEDSWAP(ump));
+ else
+ daddr = ufs_rw64(ip->i_ffs2_db[bn],
+ UFS_MPNEEDSWAP(ump));
+ *bnp = blkptrtodb(ump, daddr);
+ /*
+ * Since this is FFS independent code, we are out of
+ * scope for the definitions of BLK_NOCOPY and
+ * BLK_SNAP, but we do know that they will fall in
+ * the range 1..um_seqinc, so we use that test and
+ * return a request for a zeroed out buffer if attempts
+ * are made to read a BLK_NOCOPY or BLK_SNAP block.
+ */
+ if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT
+ && daddr > 0 &&
+ daddr < ump->um_seqinc) {
+ *bnp = -1;
+ } else if (*bnp == 0) {
+ if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))
+ == SF_SNAPSHOT) {
+ *bnp = blkptrtodb(ump, bn * ump->um_seqinc);
+ } else {
+ *bnp = -1;
+ }
+ } else if (runp) {
+ if (ump->um_fstype == UFS1) {
+ for (++bn; bn < NDADDR && *runp < maxrun &&
+ is_sequential(ump,
+ ufs_rw32(ip->i_ffs1_db[bn - 1],
+ UFS_MPNEEDSWAP(ump)),
+ ufs_rw32(ip->i_ffs1_db[bn],
+ UFS_MPNEEDSWAP(ump)));
+ ++bn, ++*runp);
+ } else {
+ for (++bn; bn < NDADDR && *runp < maxrun &&
+ is_sequential(ump,
+ ufs_rw64(ip->i_ffs2_db[bn - 1],
+ UFS_MPNEEDSWAP(ump)),
+ ufs_rw64(ip->i_ffs2_db[bn],
+ UFS_MPNEEDSWAP(ump)));
+ ++bn, ++*runp);
+ }
+ }
+ return (0);
+ }
+
+ xap = ap == NULL ? a : ap;
+ if (!nump)
+ nump = #
+ if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0)
+ return (error);
+
+ num = *nump;
+
+ /* Get disk address out of indirect block array */
+ if (ump->um_fstype == UFS1)
+ daddr = ufs_rw32(ip->i_ffs1_ib[xap->in_off],
+ UFS_MPNEEDSWAP(ump));
+ else
+ daddr = ufs_rw64(ip->i_ffs2_ib[xap->in_off],
+ UFS_MPNEEDSWAP(ump));
+
+ for (bp = NULL, ++xap; --num; ++xap) {
+ /*
+ * Exit the loop if there is no disk address assigned yet and
+ * the indirect block isn't in the cache, or if we were
+ * looking for an indirect block and we've found it.
+ */
+
+ metalbn = xap->in_lbn;
+ if (metalbn == bn)
+ break;
+ if (daddr == 0) {
+ mutex_enter(&bufcache_lock);
+ cbp = incore(vp, metalbn);
+ mutex_exit(&bufcache_lock);
+ if (cbp == NULL)
+ break;
+ }
+
+ /*
+ * If we get here, we've either got the block in the cache
+ * or we have a disk address for it, go fetch it.
+ */
+ if (bp)
+ brelse(bp, 0);
+
+ xap->in_exists = 1;
+ bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0);
+ if (bp == NULL) {
+
+ /*
+ * getblk() above returns NULL only iff we are
+			 * getblk() above returns NULL only if we are the
+			 * pagedaemon.  See the implementation of getblk
+			 * for details.
+
+ return (ENOMEM);
+ }
+ if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
+ trace(TR_BREADHIT, pack(vp, size), metalbn);
+ }
+#ifdef DIAGNOSTIC
+ else if (!daddr)
+ panic("ufs_bmaparray: indirect block not in cache");
+#endif
+ else {
+ trace(TR_BREADMISS, pack(vp, size), metalbn);
+ bp->b_blkno = blkptrtodb(ump, daddr);
+ bp->b_flags |= B_READ;
+ BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
+ VOP_STRATEGY(vp, bp);
+ curlwp->l_ru.ru_inblock++; /* XXX */
+ if ((error = biowait(bp)) != 0) {
+ brelse(bp, 0);
+ return (error);
+ }
+ }
+ if (ump->um_fstype == UFS1) {
+ daddr = ufs_rw32(((u_int32_t *)bp->b_data)[xap->in_off],
+ UFS_MPNEEDSWAP(ump));
+ if (num == 1 && daddr && runp) {
+ for (bn = xap->in_off + 1;
+ bn < MNINDIR(ump) && *runp < maxrun &&
+ is_sequential(ump,
+ ufs_rw32(((int32_t *)bp->b_data)[bn-1],
+ UFS_MPNEEDSWAP(ump)),
+ ufs_rw32(((int32_t *)bp->b_data)[bn],
+ UFS_MPNEEDSWAP(ump)));
+ ++bn, ++*runp);
+ }
+ } else {
+ daddr = ufs_rw64(((u_int64_t *)bp->b_data)[xap->in_off],
+ UFS_MPNEEDSWAP(ump));
+ if (num == 1 && daddr && runp) {
+ for (bn = xap->in_off + 1;
+ bn < MNINDIR(ump) && *runp < maxrun &&
+ is_sequential(ump,
+ ufs_rw64(((int64_t *)bp->b_data)[bn-1],
+ UFS_MPNEEDSWAP(ump)),
+ ufs_rw64(((int64_t *)bp->b_data)[bn],
+ UFS_MPNEEDSWAP(ump)));
+ ++bn, ++*runp);
+ }
+ }
+ }
+ if (bp)
+ brelse(bp, 0);
+
+ /*
+ * Since this is FFS independent code, we are out of scope for the
+ * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
+ * will fall in the range 1..um_seqinc, so we use that test and
+ * return a request for a zeroed out buffer if attempts are made
+ * to read a BLK_NOCOPY or BLK_SNAP block.
+ */
+ if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT
+ && daddr > 0 && daddr < ump->um_seqinc) {
+ *bnp = -1;
+ return (0);
+ }
+ *bnp = blkptrtodb(ump, daddr);
+ if (*bnp == 0) {
+ if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))
+ == SF_SNAPSHOT) {
+ *bnp = blkptrtodb(ump, bn * ump->um_seqinc);
+ } else {
+ *bnp = -1;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Create an array of logical block number/offset pairs which represent the
+ * path of indirect blocks required to access a data block. The first "pair"
+ * contains the logical block number of the appropriate single, double or
+ * triple indirect block and the offset into the inode indirect block array.
+ * Note, the logical block number of the inode single/double/triple indirect
+ * block appears twice in the array, once with the offset into the i_ffs1_ib and
+ * once with the offset into the page itself.
+ */
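+/*
+ * Worked example (illustrative values; assume NDADDR == 12 and
+ * MNINDIR(ump) == 2048): for data block bn = 12 + 2048 + 5 = 2065 the
+ * array built here is
+ *
+ *	{ in_lbn = -2061, in_off = 1 }	index 1 of the inode's indirect array
+ *	{ in_lbn = -2061, in_off = 0 }	index 0 within the double indirect
+ *	{ in_lbn = -2060, in_off = 5 }	index 5 within that single indirect
+ *
+ * with *nump == 3; the data block's address is then read from slot 5 of
+ * the indirect block at logical block -2060.
+ */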
+int
+ufs_getlbns(struct vnode *vp, daddr_t bn, struct indir *ap, int *nump)
+{
+ daddr_t metalbn, realbn;
+ struct ufsmount *ump;
+ int64_t blockcnt;
+ int lbc;
+ int i, numlevels, off;
+
+ ump = VFSTOUFS(vp->v_mount);
+ if (nump)
+ *nump = 0;
+ numlevels = 0;
+ realbn = bn;
+ if (bn < 0)
+ bn = -bn;
+ KASSERT(bn >= NDADDR);
+
+ /*
+ * Determine the number of levels of indirection. After this loop
+ * is done, blockcnt indicates the number of data blocks possible
+ * at the given level of indirection, and NIADDR - i is the number
+ * of levels of indirection needed to locate the requested block.
+ */
+
+ bn -= NDADDR;
+ for (lbc = 0, i = NIADDR;; i--, bn -= blockcnt) {
+ if (i == 0)
+ return (EFBIG);
+
+ lbc += ump->um_lognindir;
+ blockcnt = (int64_t)1 << lbc;
+
+ if (bn < blockcnt)
+ break;
+ }
+
+ /* Calculate the address of the first meta-block. */
+ metalbn = -((realbn >= 0 ? realbn : -realbn) - bn + NIADDR - i);
+
+ /*
+ * At each iteration, off is the offset into the bap array which is
+ * an array of disk addresses at the current level of indirection.
+ * The logical block number and the offset in that block are stored
+ * into the argument array.
+ */
+ ap->in_lbn = metalbn;
+ ap->in_off = off = NIADDR - i;
+ ap->in_exists = 0;
+ ap++;
+ for (++numlevels; i <= NIADDR; i++) {
+ /* If searching for a meta-data block, quit when found. */
+ if (metalbn == realbn)
+ break;
+
+ lbc -= ump->um_lognindir;
+ off = (bn >> lbc) & (MNINDIR(ump) - 1);
+
+ ++numlevels;
+ ap->in_lbn = metalbn;
+ ap->in_off = off;
+ ap->in_exists = 0;
+ ++ap;
+
+ metalbn -= -1 + ((int64_t)off << lbc);
+ }
+ if (nump)
+ *nump = numlevels;
+ return (0);
+}
--- /dev/null
+/* $NetBSD: ufs_dirhash.c,v 1.34 2009/10/05 23:48:08 rmind Exp $ */
+
+/*
+ * Copyright (c) 2001, 2002 Ian Dowse. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.3.2.8 2004/12/08 11:54:13 dwmalone Exp $
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_dirhash.c,v 1.34 2009/10/05 23:48:08 rmind Exp $");
+
+/*
+ * This implements a hash-based lookup scheme for UFS directories.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kmem.h>
+#include <sys/types.h>
+#include <sys/hash.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/pool.h>
+#include <sys/sysctl.h>
+#include <sys/atomic.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/dirhash.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1))
+#define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1))
+#define OFSFMT(ip) ((ip)->i_ump->um_maxsymlinklen <= 0)
+#define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? DH_NFSTATS : (n))
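+/*
+ * For example, WRAPINCR(7, 8) is 0 and WRAPDECR(0, 8) is 7, so linear
+ * probing of the hash array wraps around its ends; BLKFREE2IDX() maps a
+ * block's free-space count to a dh_firstfree[] index, with everything of
+ * DH_NFSTATS or more sharing the last bucket.
+ */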
+
+static u_int ufs_dirhashminblks = 5;
+static u_int ufs_dirhashmaxmem = 2 * 1024 * 1024;
+static u_int ufs_dirhashmem;
+static u_int ufs_dirhashcheck = 0;
+
+static int ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen);
+static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff,
+ int dirblksiz);
+static void ufsdirhash_delslot(struct dirhash *dh, int slot);
+static int ufsdirhash_findslot(struct dirhash *dh, const char *name,
+ int namelen, doff_t offset);
+static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset,
+ int dirblksiz);
+static int ufsdirhash_recycle(int wanted);
+
+static pool_cache_t ufsdirhashblk_cache;
+static pool_cache_t ufsdirhash_cache;
+
+#define DIRHASHLIST_LOCK() mutex_enter(&ufsdirhash_lock)
+#define DIRHASHLIST_UNLOCK() mutex_exit(&ufsdirhash_lock)
+#define DIRHASH_LOCK(dh) mutex_enter(&(dh)->dh_lock)
+#define DIRHASH_UNLOCK(dh) mutex_exit(&(dh)->dh_lock)
+#define DIRHASH_BLKALLOC() \
+ pool_cache_get(ufsdirhashblk_cache, PR_NOWAIT)
+#define DIRHASH_BLKFREE(ptr) \
+ pool_cache_put(ufsdirhashblk_cache, ptr)
+
+/* Dirhash list; recently-used entries are near the tail. */
+static TAILQ_HEAD(, dirhash) ufsdirhash_list;
+
+/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
+static kmutex_t ufsdirhash_lock;
+
+static struct sysctllog *ufsdirhash_sysctl_log;
+
+/*
+ * Locking order:
+ * ufsdirhash_lock
+ * dh_lock
+ *
+ * The dh_lock mutex should be acquired either via the inode lock, or via
+ * ufsdirhash_lock. Only the owner of the inode may free the associated
+ * dirhash, but anything can steal its memory and set dh_hash to NULL.
+ */
+
+/*
+ * Attempt to build up a hash table for the directory contents in
+ * inode 'ip'. Returns 0 on success, or -1 if the operation failed.
+ */
+int
+ufsdirhash_build(struct inode *ip)
+{
+ struct dirhash *dh;
+ struct buf *bp = NULL;
+ struct direct *ep;
+ struct vnode *vp;
+ doff_t bmask, pos;
+ int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
+ const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
+ int dirblksiz = ip->i_ump->um_dirblksiz;
+
+ /* Check if we can/should use dirhash. */
+ if (ip->i_dirhash == NULL) {
+ if (ip->i_size < (ufs_dirhashminblks * dirblksiz) || OFSFMT(ip))
+ return (-1);
+ } else {
+ /* Hash exists, but sysctls could have changed. */
+ if (ip->i_size < (ufs_dirhashminblks * dirblksiz) ||
+ ufs_dirhashmem > ufs_dirhashmaxmem) {
+ ufsdirhash_free(ip);
+ return (-1);
+ }
+ /* Check if hash exists and is intact (note: unlocked read). */
+ if (ip->i_dirhash->dh_hash != NULL)
+ return (0);
+ /* Free the old, recycled hash and build a new one. */
+ ufsdirhash_free(ip);
+ }
+
+ /* Don't hash removed directories. */
+ if (ip->i_nlink == 0)
+ return (-1);
+
+ vp = ip->i_vnode;
+ /* Allocate 50% more entries than this dir size could ever need. */
+ KASSERT(ip->i_size >= dirblksiz);
+ nslots = ip->i_size / DIRECTSIZ(1);
+ nslots = (nslots * 3 + 1) / 2;
+ narrays = howmany(nslots, DH_NBLKOFF);
+ nslots = narrays * DH_NBLKOFF;
+ dirblocks = howmany(ip->i_size, dirblksiz);
+ nblocks = (dirblocks * 3 + 1) / 2;
+
+ memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
+ narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
+ nblocks * sizeof(*dh->dh_blkfree);
+
+ while (atomic_add_int_nv(&ufs_dirhashmem, memreqd) >
+ ufs_dirhashmaxmem) {
+ atomic_add_int(&ufs_dirhashmem, -memreqd);
+ if (memreqd > ufs_dirhashmaxmem / 2)
+ return (-1);
+ /* Try to free some space. */
+ if (ufsdirhash_recycle(memreqd) != 0)
+ return (-1);
+ else
+ DIRHASHLIST_UNLOCK();
+ }
+
+ /*
+ * Use non-blocking mallocs so that we will revert to a linear
+ * lookup on failure rather than potentially blocking forever.
+ */
+ dh = pool_cache_get(ufsdirhash_cache, PR_NOWAIT);
+ if (dh == NULL) {
+ atomic_add_int(&ufs_dirhashmem, -memreqd);
+ return (-1);
+ }
+ memset(dh, 0, sizeof(*dh));
+ mutex_init(&dh->dh_lock, MUTEX_DEFAULT, IPL_NONE);
+ DIRHASH_LOCK(dh);
+ dh->dh_hashsz = narrays * sizeof(dh->dh_hash[0]);
+ dh->dh_hash = kmem_zalloc(dh->dh_hashsz, KM_NOSLEEP);
+ dh->dh_blkfreesz = nblocks * sizeof(dh->dh_blkfree[0]);
+ dh->dh_blkfree = kmem_zalloc(dh->dh_blkfreesz, KM_NOSLEEP);
+ if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
+ goto fail;
+ for (i = 0; i < narrays; i++) {
+ if ((dh->dh_hash[i] = DIRHASH_BLKALLOC()) == NULL)
+ goto fail;
+ for (j = 0; j < DH_NBLKOFF; j++)
+ dh->dh_hash[i][j] = DIRHASH_EMPTY;
+ }
+
+ /* Initialise the hash table and block statistics. */
+ dh->dh_narrays = narrays;
+ dh->dh_hlen = nslots;
+ dh->dh_nblk = nblocks;
+ dh->dh_dirblks = dirblocks;
+ for (i = 0; i < dirblocks; i++)
+ dh->dh_blkfree[i] = dirblksiz / DIRALIGN;
+ for (i = 0; i < DH_NFSTATS; i++)
+ dh->dh_firstfree[i] = -1;
+ dh->dh_firstfree[DH_NFSTATS] = 0;
+ dh->dh_seqopt = 0;
+ dh->dh_seqoff = 0;
+ dh->dh_score = DH_SCOREINIT;
+ ip->i_dirhash = dh;
+
+ bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
+ pos = 0;
+ while (pos < ip->i_size) {
+ if ((curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD)
+ != 0) {
+ preempt();
+ }
+ /* If necessary, get the next directory block. */
+ if ((pos & bmask) == 0) {
+ if (bp != NULL)
+ brelse(bp, 0);
+ if (ufs_blkatoff(vp, (off_t)pos, NULL, &bp, false) != 0)
+ goto fail;
+ }
+
+ /* Add this entry to the hash. */
+ ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
+ if (ep->d_reclen == 0 || ep->d_reclen >
+ dirblksiz - (pos & (dirblksiz - 1))) {
+ /* Corrupted directory. */
+ brelse(bp, 0);
+ goto fail;
+ }
+ if (ep->d_ino != 0) {
+ /* Add the entry (simplified ufsdirhash_add). */
+ slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
+ while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
+ slot = WRAPINCR(slot, dh->dh_hlen);
+ dh->dh_hused++;
+ DH_ENTRY(dh, slot) = pos;
+ ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep, needswap),
+ dirblksiz);
+ }
+ pos += ep->d_reclen;
+ }
+
+ if (bp != NULL)
+ brelse(bp, 0);
+ DIRHASHLIST_LOCK();
+ TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
+ dh->dh_onlist = 1;
+ DIRHASH_UNLOCK(dh);
+ DIRHASHLIST_UNLOCK();
+ return (0);
+
+fail:
+ DIRHASH_UNLOCK(dh);
+ if (dh->dh_hash != NULL) {
+ for (i = 0; i < narrays; i++)
+ if (dh->dh_hash[i] != NULL)
+ DIRHASH_BLKFREE(dh->dh_hash[i]);
+ kmem_free(dh->dh_hash, dh->dh_hashsz);
+ }
+ if (dh->dh_blkfree != NULL)
+ kmem_free(dh->dh_blkfree, dh->dh_blkfreesz);
+ mutex_destroy(&dh->dh_lock);
+ pool_cache_put(ufsdirhash_cache, dh);
+ ip->i_dirhash = NULL;
+ atomic_add_int(&ufs_dirhashmem, -memreqd);
+ return (-1);
+}
+
+/*
+ * Free any hash table associated with inode 'ip'.
+ */
+void
+ufsdirhash_free(struct inode *ip)
+{
+ struct dirhash *dh;
+ int i, mem;
+
+ if ((dh = ip->i_dirhash) == NULL)
+ return;
+
+ if (dh->dh_onlist) {
+ DIRHASHLIST_LOCK();
+ if (dh->dh_onlist)
+ TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
+ DIRHASHLIST_UNLOCK();
+ }
+
+ /* The dirhash pointed to by 'dh' is exclusively ours now. */
+ mem = sizeof(*dh);
+ if (dh->dh_hash != NULL) {
+ for (i = 0; i < dh->dh_narrays; i++)
+ DIRHASH_BLKFREE(dh->dh_hash[i]);
+ kmem_free(dh->dh_hash, dh->dh_hashsz);
+ kmem_free(dh->dh_blkfree, dh->dh_blkfreesz);
+ mem += dh->dh_hashsz;
+ mem += dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash);
+ mem += dh->dh_nblk * sizeof(*dh->dh_blkfree);
+ }
+ mutex_destroy(&dh->dh_lock);
+ pool_cache_put(ufsdirhash_cache, dh);
+ ip->i_dirhash = NULL;
+
+ atomic_add_int(&ufs_dirhashmem, -mem);
+}
+
+/*
+ * Find the offset of the specified name within the given inode.
+ * Returns 0 on success, ENOENT if the entry does not exist, or
+ * EJUSTRETURN if the caller should revert to a linear search.
+ *
+ * If successful, the directory offset is stored in *offp, and a
+ * pointer to a struct buf containing the entry is stored in *bpp. If
+ * prevoffp is non-NULL, the offset of the previous entry within
+ * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
+ * is the first in a block, the start of the block is used).
+ */
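+/*
+ * Sketch of the expected caller pattern (hypothetical, for illustration):
+ *
+ *	switch (ufsdirhash_lookup(ip, name, namelen, &off, &bp, &prevoff)) {
+ *	case 0:			entry found at off, bp returned held
+ *	case ENOENT:		name is definitely not in the directory
+ *	case EJUSTRETURN:	hash unusable, fall back to a linear scan
+ *	}
+ */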
+int
+ufsdirhash_lookup(struct inode *ip, const char *name, int namelen, doff_t *offp,
+ struct buf **bpp, doff_t *prevoffp)
+{
+ struct dirhash *dh, *dh_next;
+ struct direct *dp;
+ struct vnode *vp;
+ struct buf *bp;
+ doff_t blkoff, bmask, offset, prevoff;
+ int i, slot;
+ const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
+ int dirblksiz = ip->i_ump->um_dirblksiz;
+
+ if ((dh = ip->i_dirhash) == NULL)
+ return (EJUSTRETURN);
+
+ /*
+ * Move this dirhash towards the end of the list if it has a
+ * score higher than the next entry, and acquire the dh_lock.
+ * Optimise the case where it's already the last by performing
+ * an unlocked read of the TAILQ_NEXT pointer.
+ *
+ * In both cases, end up holding just dh_lock.
+ */
+ if (TAILQ_NEXT(dh, dh_list) != NULL) {
+ DIRHASHLIST_LOCK();
+ DIRHASH_LOCK(dh);
+ /*
+ * If the new score will be greater than that of the next
+ * entry, then move this entry past it. With both mutexes
+ * held, dh_next won't go away, but its dh_score could
+ * change; that's not important since it is just a hint.
+ */
+ if (dh->dh_hash != NULL &&
+ (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
+ dh->dh_score >= dh_next->dh_score) {
+ KASSERT(dh->dh_onlist);
+ TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
+ TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
+ dh_list);
+ }
+ DIRHASHLIST_UNLOCK();
+ } else {
+ /* Already the last, though that could change as we wait. */
+ DIRHASH_LOCK(dh);
+ }
+ if (dh->dh_hash == NULL) {
+ DIRHASH_UNLOCK(dh);
+ ufsdirhash_free(ip);
+ return (EJUSTRETURN);
+ }
+
+ /* Update the score. */
+ if (dh->dh_score < DH_SCOREMAX)
+ dh->dh_score++;
+
+ vp = ip->i_vnode;
+ bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
+ blkoff = -1;
+ bp = NULL;
+restart:
+ slot = ufsdirhash_hash(dh, name, namelen);
+
+ if (dh->dh_seqopt) {
+ /*
+ * Sequential access optimisation. dh_seqoff contains the
+ * offset of the directory entry immediately following
+ * the last entry that was looked up. Check if this offset
+ * appears in the hash chain for the name we are looking for.
+ */
+ for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
+ i = WRAPINCR(i, dh->dh_hlen))
+ if (offset == dh->dh_seqoff)
+ break;
+ if (offset == dh->dh_seqoff) {
+ /*
+ * We found an entry with the expected offset. This
+ * is probably the entry we want, but if not, the
+ * code below will turn off seqoff and retry.
+ */
+ slot = i;
+ } else
+ dh->dh_seqopt = 0;
+ }
+
+ for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
+ slot = WRAPINCR(slot, dh->dh_hlen)) {
+ if (offset == DIRHASH_DEL)
+ continue;
+
+ if (offset < 0 || offset >= ip->i_size)
+ panic("ufsdirhash_lookup: bad offset in hash array");
+ if ((offset & ~bmask) != blkoff) {
+ if (bp != NULL)
+ brelse(bp, 0);
+ blkoff = offset & ~bmask;
+ if (ufs_blkatoff(vp, (off_t)blkoff,
+ NULL, &bp, false) != 0) {
+ DIRHASH_UNLOCK(dh);
+ return (EJUSTRETURN);
+ }
+ }
+ dp = (struct direct *)((char *)bp->b_data + (offset & bmask));
+ if (dp->d_reclen == 0 || dp->d_reclen >
+ dirblksiz - (offset & (dirblksiz - 1))) {
+ /* Corrupted directory. */
+ DIRHASH_UNLOCK(dh);
+ brelse(bp, 0);
+ return (EJUSTRETURN);
+ }
+ if (dp->d_namlen == namelen &&
+ memcmp(dp->d_name, name, namelen) == 0) {
+ /* Found. Get the prev offset if needed. */
+ if (prevoffp != NULL) {
+ if (offset & (dirblksiz - 1)) {
+ prevoff = ufsdirhash_getprev(dp,
+ offset, dirblksiz);
+ if (prevoff == -1) {
+ brelse(bp, 0);
+ return (EJUSTRETURN);
+ }
+ } else
+ prevoff = offset;
+ *prevoffp = prevoff;
+ }
+
+ /* Check for sequential access, and update offset. */
+ if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
+ dh->dh_seqopt = 1;
+ dh->dh_seqoff = offset + DIRSIZ(0, dp, needswap);
+ DIRHASH_UNLOCK(dh);
+
+ *bpp = bp;
+ *offp = offset;
+ return (0);
+ }
+
+ if (dh->dh_hash == NULL) {
+ DIRHASH_UNLOCK(dh);
+ if (bp != NULL)
+ brelse(bp, 0);
+ ufsdirhash_free(ip);
+ return (EJUSTRETURN);
+ }
+ /*
+ * When the name doesn't match in the seqopt case, go back
+ * and search normally.
+ */
+ if (dh->dh_seqopt) {
+ dh->dh_seqopt = 0;
+ goto restart;
+ }
+ }
+ DIRHASH_UNLOCK(dh);
+ if (bp != NULL)
+ brelse(bp, 0);
+ return (ENOENT);
+}
+
+/*
+ * Find a directory block with room for 'slotneeded' bytes. Returns
+ * the offset of the directory entry that begins the free space.
+ * This will either be the offset of an existing entry that has free
+ * space at the end, or the offset of an entry with d_ino == 0 at
+ * the start of a DIRBLKSIZ block.
+ *
+ * To use the space, the caller may need to compact existing entries in
+ * the directory. The total number of bytes in all of the entries involved
+ * in the compaction is stored in *slotsize. In other words, all of
+ * the entries that must be compacted are exactly contained in the
+ * region beginning at the returned offset and spanning *slotsize bytes.
+ *
+ * Returns -1 if no space was found, indicating that the directory
+ * must be extended.
+ */
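+/*
+ * Worked example (illustrative numbers): if the entry at block offset 64
+ * has d_reclen 32 but only needs DIRSIZ() == 20 bytes, and the following
+ * entry is unused (d_ino == 0) with d_reclen 16, then a request for
+ * slotneeded == 24 returns 64 with *slotsize == 48: compacting that
+ * 48-byte region frees 12 + 16 == 28 bytes, enough for the new entry.
+ */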
+doff_t
+ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
+{
+ struct direct *dp;
+ struct dirhash *dh;
+ struct buf *bp;
+ doff_t pos, slotstart;
+ int dirblock, error, freebytes, i;
+ const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
+ int dirblksiz = ip->i_ump->um_dirblksiz;
+
+ if ((dh = ip->i_dirhash) == NULL)
+ return (-1);
+
+ DIRHASH_LOCK(dh);
+ if (dh->dh_hash == NULL) {
+ DIRHASH_UNLOCK(dh);
+ ufsdirhash_free(ip);
+ return (-1);
+ }
+
+ /* Find a directory block with the desired free space. */
+ dirblock = -1;
+ for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
+ if ((dirblock = dh->dh_firstfree[i]) != -1)
+ break;
+ if (dirblock == -1) {
+ DIRHASH_UNLOCK(dh);
+ return (-1);
+ }
+
+ KASSERT(dirblock < dh->dh_nblk &&
+ dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN));
+ pos = dirblock * dirblksiz;
+ error = ufs_blkatoff(ip->i_vnode, (off_t)pos, (void *)&dp, &bp, false);
+ if (error) {
+ DIRHASH_UNLOCK(dh);
+ return (-1);
+ }
+ /* Find the first entry with free space. */
+ for (i = 0; i < dirblksiz; ) {
+ if (dp->d_reclen == 0) {
+ DIRHASH_UNLOCK(dh);
+ brelse(bp, 0);
+ return (-1);
+ }
+ if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp, needswap))
+ break;
+ i += dp->d_reclen;
+ dp = (struct direct *)((char *)dp + dp->d_reclen);
+ }
+ if (i > dirblksiz) {
+ DIRHASH_UNLOCK(dh);
+ brelse(bp, 0);
+ return (-1);
+ }
+ slotstart = pos + i;
+
+ /* Find the range of entries needed to get enough space */
+ freebytes = 0;
+ while (i < dirblksiz && freebytes < slotneeded) {
+ freebytes += dp->d_reclen;
+ if (dp->d_ino != 0)
+ freebytes -= DIRSIZ(0, dp, needswap);
+ if (dp->d_reclen == 0) {
+ DIRHASH_UNLOCK(dh);
+ brelse(bp, 0);
+ return (-1);
+ }
+ i += dp->d_reclen;
+ dp = (struct direct *)((char *)dp + dp->d_reclen);
+ }
+ if (i > dirblksiz) {
+ DIRHASH_UNLOCK(dh);
+ brelse(bp, 0);
+ return (-1);
+ }
+ if (freebytes < slotneeded)
+ panic("ufsdirhash_findfree: free mismatch");
+ DIRHASH_UNLOCK(dh);
+ brelse(bp, 0);
+ *slotsize = pos + i - slotstart;
+ return (slotstart);
+}
+
+/*
+ * Return the start of the unused space at the end of a directory, or
+ * -1 if there are no trailing unused blocks.
+ */
+doff_t
+ufsdirhash_enduseful(struct inode *ip)
+{
+ struct dirhash *dh;
+ int i;
+ int dirblksiz = ip->i_ump->um_dirblksiz;
+
+ if ((dh = ip->i_dirhash) == NULL)
+ return (-1);
+
+ DIRHASH_LOCK(dh);
+ if (dh->dh_hash == NULL) {
+ DIRHASH_UNLOCK(dh);
+ ufsdirhash_free(ip);
+ return (-1);
+ }
+
+ if (dh->dh_blkfree[dh->dh_dirblks - 1] != dirblksiz / DIRALIGN) {
+ DIRHASH_UNLOCK(dh);
+ return (-1);
+ }
+
+ for (i = dh->dh_dirblks - 1; i >= 0; i--)
+ if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN)
+ break;
+ DIRHASH_UNLOCK(dh);
+ return ((doff_t)(i + 1) * dirblksiz);
+}
+
+/*
+ * Insert information into the hash about a new directory entry. dirp
+ * points to a struct direct containing the entry, and offset specifies
+ * the offset of this entry.
+ */
+void
+ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
+{
+ struct dirhash *dh;
+ int slot;
+ const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
+ int dirblksiz = ip->i_ump->um_dirblksiz;
+
+ if ((dh = ip->i_dirhash) == NULL)
+ return;
+
+ DIRHASH_LOCK(dh);
+ if (dh->dh_hash == NULL) {
+ DIRHASH_UNLOCK(dh);
+ ufsdirhash_free(ip);
+ return;
+ }
+
+ KASSERT(offset < dh->dh_dirblks * dirblksiz);
+ /*
+ * Normal hash usage is < 66%. If the usage gets too high then
+ * remove the hash entirely and let it be rebuilt later.
+ */
+ if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
+ DIRHASH_UNLOCK(dh);
+ ufsdirhash_free(ip);
+ return;
+ }
+
+ /* Find a free hash slot (empty or deleted), and add the entry. */
+ slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
+ while (DH_ENTRY(dh, slot) >= 0)
+ slot = WRAPINCR(slot, dh->dh_hlen);
+ if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
+ dh->dh_hused++;
+ DH_ENTRY(dh, slot) = offset;
+
+ /* Update the per-block summary info. */
+ ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp, needswap), dirblksiz);
+ DIRHASH_UNLOCK(dh);
+}
+
+/*
+ * Remove the specified directory entry from the hash. The entry to remove
+ * is defined by the name in `dirp', which must exist at the specified
+ * `offset' within the directory.
+ */
+void
+ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
+{
+ struct dirhash *dh;
+ int slot;
+ const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
+ int dirblksiz = ip->i_ump->um_dirblksiz;
+
+ if ((dh = ip->i_dirhash) == NULL)
+ return;
+
+ DIRHASH_LOCK(dh);
+ if (dh->dh_hash == NULL) {
+ DIRHASH_UNLOCK(dh);
+ ufsdirhash_free(ip);
+ return;
+ }
+
+ KASSERT(offset < dh->dh_dirblks * dirblksiz);
+ /* Find the entry */
+ slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
+
+ /* Remove the hash entry. */
+ ufsdirhash_delslot(dh, slot);
+
+ /* Update the per-block summary info. */
+ ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp, needswap), dirblksiz);
+ DIRHASH_UNLOCK(dh);
+}
+
+/*
+ * Change the offset associated with a directory entry in the hash. Used
+ * when compacting directory blocks.
+ */
+void
+ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
+ doff_t newoff)
+{
+ struct dirhash *dh;
+ int slot;
+
+ if ((dh = ip->i_dirhash) == NULL)
+ return;
+ DIRHASH_LOCK(dh);
+ if (dh->dh_hash == NULL) {
+ DIRHASH_UNLOCK(dh);
+ ufsdirhash_free(ip);
+ return;
+ }
+
+ KASSERT(oldoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz &&
+ newoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz);
+ /* Find the entry, and update the offset. */
+ slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
+ DH_ENTRY(dh, slot) = newoff;
+ DIRHASH_UNLOCK(dh);
+}
+
+/*
+ * Inform dirhash that the directory has grown by one block that
+ * begins at offset (i.e. the new length is offset + DIRBLKSIZ).
+ */
+void
+ufsdirhash_newblk(struct inode *ip, doff_t offset)
+{
+ struct dirhash *dh;
+ int block;
+ int dirblksiz = ip->i_ump->um_dirblksiz;
+
+ if ((dh = ip->i_dirhash) == NULL)
+ return;
+ DIRHASH_LOCK(dh);
+ if (dh->dh_hash == NULL) {
+ DIRHASH_UNLOCK(dh);
+ ufsdirhash_free(ip);
+ return;
+ }
+
+ KASSERT(offset == dh->dh_dirblks * dirblksiz);
+ block = offset / dirblksiz;
+ if (block >= dh->dh_nblk) {
+ /* Out of space; must rebuild. */
+ DIRHASH_UNLOCK(dh);
+ ufsdirhash_free(ip);
+ return;
+ }
+ dh->dh_dirblks = block + 1;
+
+ /* Account for the new free block. */
+ dh->dh_blkfree[block] = dirblksiz / DIRALIGN;
+ if (dh->dh_firstfree[DH_NFSTATS] == -1)
+ dh->dh_firstfree[DH_NFSTATS] = block;
+ DIRHASH_UNLOCK(dh);
+}
+
+/*
+ * Inform dirhash that the directory is being truncated.
+ */
+void
+ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
+{
+ struct dirhash *dh;
+ int block, i;
+ int dirblksiz = ip->i_ump->um_dirblksiz;
+
+ if ((dh = ip->i_dirhash) == NULL)
+ return;
+
+ DIRHASH_LOCK(dh);
+ if (dh->dh_hash == NULL) {
+ DIRHASH_UNLOCK(dh);
+ ufsdirhash_free(ip);
+ return;
+ }
+
+ KASSERT(offset <= dh->dh_dirblks * dirblksiz);
+ block = howmany(offset, dirblksiz);
+ /*
+ * If the directory shrinks to less than 1/8 of dh_nblk blocks
+ * (about 20% of its original size due to the 50% extra added in
+ * ufsdirhash_build) then free it, and let the caller rebuild
+ * if necessary.
+ */
+ if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
+ DIRHASH_UNLOCK(dh);
+ ufsdirhash_free(ip);
+ return;
+ }
+
+ /*
+ * Remove any `first free' information pertaining to the
+ * truncated blocks. All blocks we're removing should be
+ * completely unused.
+ */
+ if (dh->dh_firstfree[DH_NFSTATS] >= block)
+ dh->dh_firstfree[DH_NFSTATS] = -1;
+ for (i = block; i < dh->dh_dirblks; i++)
+ if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN)
+ panic("ufsdirhash_dirtrunc: blocks in use");
+ for (i = 0; i < DH_NFSTATS; i++)
+ if (dh->dh_firstfree[i] >= block)
+ panic("ufsdirhash_dirtrunc: first free corrupt");
+ dh->dh_dirblks = block;
+ DIRHASH_UNLOCK(dh);
+}
+
+/*
+ * Debugging function to check that the dirhash information about
+ * a directory block matches its actual contents. Panics if a mismatch
+ * is detected.
+ *
+ * On entry, `sbuf' should point to the start of an in-core
+ * DIRBLKSIZ-sized directory block, and `offset' should contain the
+ * offset from the start of the directory of that block.
+ */
+void
+ufsdirhash_checkblock(struct inode *ip, char *sbuf, doff_t offset)
+{
+ struct dirhash *dh;
+ struct direct *dp;
+ int block, ffslot, i, nfree;
+ const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
+ int dirblksiz = ip->i_ump->um_dirblksiz;
+
+ if (!ufs_dirhashcheck)
+ return;
+ if ((dh = ip->i_dirhash) == NULL)
+ return;
+
+ DIRHASH_LOCK(dh);
+ if (dh->dh_hash == NULL) {
+ DIRHASH_UNLOCK(dh);
+ ufsdirhash_free(ip);
+ return;
+ }
+
+ block = offset / dirblksiz;
+ if ((offset & (dirblksiz - 1)) != 0 || block >= dh->dh_dirblks)
+ panic("ufsdirhash_checkblock: bad offset");
+
+ nfree = 0;
+ for (i = 0; i < dirblksiz; i += dp->d_reclen) {
+ dp = (struct direct *)(sbuf + i);
+ if (dp->d_reclen == 0 || i + dp->d_reclen > dirblksiz)
+ panic("ufsdirhash_checkblock: bad dir");
+
+ if (dp->d_ino == 0) {
+#if 0
+ /*
+ * XXX entries with d_ino == 0 should only occur
+ * at the start of a DIRBLKSIZ block. However the
+ * ufs code is tolerant of such entries at other
+ * offsets, and fsck does not fix them.
+ */
+ if (i != 0)
+ panic("ufsdirhash_checkblock: bad dir inode");
+#endif
+ nfree += dp->d_reclen;
+ continue;
+ }
+
+ /* Check that the entry exists (will panic if it doesn't). */
+ ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
+
+ nfree += dp->d_reclen - DIRSIZ(0, dp, needswap);
+ }
+ if (i != dirblksiz)
+ panic("ufsdirhash_checkblock: bad dir end");
+
+ if (dh->dh_blkfree[block] * DIRALIGN != nfree)
+ panic("ufsdirhash_checkblock: bad free count");
+
+ ffslot = BLKFREE2IDX(nfree / DIRALIGN);
+ for (i = 0; i <= DH_NFSTATS; i++)
+ if (dh->dh_firstfree[i] == block && i != ffslot)
+ panic("ufsdirhash_checkblock: bad first-free");
+ if (dh->dh_firstfree[ffslot] == -1)
+ panic("ufsdirhash_checkblock: missing first-free entry");
+ DIRHASH_UNLOCK(dh);
+}
+
+/*
+ * Hash the specified filename into a dirhash slot.
+ */
+static int
+ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen)
+{
+ u_int32_t hash;
+
+ /*
+ * We hash the name and then some other bit of data that is
+ * invariant over the dirhash's lifetime. Otherwise names
+ * differing only in the last byte are placed close to one
+ * another in the table, which is bad for linear probing.
+ */
+ hash = hash32_buf(name, namelen, HASH32_BUF_INIT);
+ hash = hash32_buf(&dh, sizeof(dh), hash);
+ return (hash % dh->dh_hlen);
+}
+
+/*
+ * Adjust the number of free bytes in the block containing `offset'
+ * by the value specified by `diff'.
+ *
+ * The caller must ensure we have exclusive access to `dh'; normally
+ * that means that dh_lock should be held, but this is also called
+ * from ufsdirhash_build() where exclusive access can be assumed.
+ */
+static void
+ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, int dirblksiz)
+{
+ int block, i, nfidx, ofidx;
+
+ KASSERT(mutex_owned(&dh->dh_lock));
+
+ /* Update the per-block summary info. */
+ block = offset / dirblksiz;
+ KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks);
+ ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
+ dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
+ nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);
+
+ /* Update the `first free' list if necessary. */
+ if (ofidx != nfidx) {
+ /* If removing, scan forward for the next block. */
+ if (dh->dh_firstfree[ofidx] == block) {
+ for (i = block + 1; i < dh->dh_dirblks; i++)
+ if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
+ break;
+ dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
+ }
+
+ /* Make this the new `first free' if necessary */
+ if (dh->dh_firstfree[nfidx] > block ||
+ dh->dh_firstfree[nfidx] == -1)
+ dh->dh_firstfree[nfidx] = block;
+ }
+}
+
+/*
+ * Find the specified name which should have the specified offset.
+ * Returns a slot number, and panics on failure.
+ *
+ * `dh' must be locked on entry and remains so on return.
+ */
+static int
+ufsdirhash_findslot(struct dirhash *dh, const char *name, int namelen,
+ doff_t offset)
+{
+ int slot;
+
+ KASSERT(mutex_owned(&dh->dh_lock));
+
+ /* Find the entry. */
+ KASSERT(dh->dh_hused < dh->dh_hlen);
+ slot = ufsdirhash_hash(dh, name, namelen);
+ while (DH_ENTRY(dh, slot) != offset &&
+ DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
+ slot = WRAPINCR(slot, dh->dh_hlen);
+ if (DH_ENTRY(dh, slot) != offset)
+ panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
+
+ return (slot);
+}
+
+/*
+ * Remove the entry corresponding to the specified slot from the hash array.
+ *
+ * `dh' must be locked on entry and remains so on return.
+ */
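+/*
+ * For example, if slots 5 and 6 already hold DIRHASH_DEL, slot 7 is the
+ * one being deleted and slot 8 is DIRHASH_EMPTY, the forward scan stops
+ * at slot 8 and the backward scan then converts slots 7, 6 and 5 back to
+ * DIRHASH_EMPTY, decrementing dh_hused once per reclaimed slot.
+ */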
+static void
+ufsdirhash_delslot(struct dirhash *dh, int slot)
+{
+ int i;
+
+ KASSERT(mutex_owned(&dh->dh_lock));
+
+ /* Mark the entry as deleted. */
+ DH_ENTRY(dh, slot) = DIRHASH_DEL;
+
+ /* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
+ for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
+ i = WRAPINCR(i, dh->dh_hlen);
+ if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
+ i = WRAPDECR(i, dh->dh_hlen);
+ while (DH_ENTRY(dh, i) == DIRHASH_DEL) {
+ DH_ENTRY(dh, i) = DIRHASH_EMPTY;
+ dh->dh_hused--;
+ i = WRAPDECR(i, dh->dh_hlen);
+ }
+ KASSERT(dh->dh_hused >= 0);
+ }
+}
+
+/*
+ * Given a directory entry and its offset, find the offset of the
+ * previous entry in the same DIRBLKSIZ-sized block. Returns an
+ * offset, or -1 if there is no previous entry in the block or some
+ * other problem occurred.
+ */
+static doff_t
+ufsdirhash_getprev(struct direct *dirp, doff_t offset, int dirblksiz)
+{
+ struct direct *dp;
+ char *blkbuf;
+ doff_t blkoff, prevoff;
+ int entrypos, i;
+
+ blkoff = offset & ~(dirblksiz - 1); /* offset of start of block */
+ entrypos = offset & (dirblksiz - 1); /* entry relative to block */
+ blkbuf = (char *)dirp - entrypos;
+ prevoff = blkoff;
+
+ /* If `offset' is the start of a block, there is no previous entry. */
+ if (entrypos == 0)
+ return (-1);
+
+ /* Scan from the start of the block until we get to the entry. */
+ for (i = 0; i < entrypos; i += dp->d_reclen) {
+ dp = (struct direct *)(blkbuf + i);
+ if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
+ return (-1); /* Corrupted directory. */
+ prevoff = blkoff + i;
+ }
+ return (prevoff);
+}
+
+/*
+ * Try to free up `wanted' bytes by stealing memory from existing
+ * dirhashes. Returns zero with list locked if successful.
+ */
+static int
+ufsdirhash_recycle(int wanted)
+{
+ struct dirhash *dh;
+ doff_t **hash;
+ u_int8_t *blkfree;
+ int i, mem, narrays;
+ size_t hashsz, blkfreesz;
+
+ DIRHASHLIST_LOCK();
+ while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
+ /* Find a dirhash, and lock it. */
+ if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
+ DIRHASHLIST_UNLOCK();
+ return (-1);
+ }
+ DIRHASH_LOCK(dh);
+ KASSERT(dh->dh_hash != NULL);
+
+ /* Decrement the score; only recycle if it becomes zero. */
+ if (--dh->dh_score > 0) {
+ DIRHASH_UNLOCK(dh);
+ DIRHASHLIST_UNLOCK();
+ return (-1);
+ }
+
+ /* Remove it from the list and detach its memory. */
+ TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
+ dh->dh_onlist = 0;
+ hash = dh->dh_hash;
+ hashsz = dh->dh_hashsz;
+ dh->dh_hash = NULL;
+ blkfree = dh->dh_blkfree;
+ blkfreesz = dh->dh_blkfreesz;
+ dh->dh_blkfree = NULL;
+ narrays = dh->dh_narrays;
+ mem = narrays * sizeof(*dh->dh_hash) +
+ narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
+ dh->dh_nblk * sizeof(*dh->dh_blkfree);
+
+ /* Unlock everything, free the detached memory. */
+ DIRHASH_UNLOCK(dh);
+ DIRHASHLIST_UNLOCK();
+
+ for (i = 0; i < narrays; i++)
+ DIRHASH_BLKFREE(hash[i]);
+ kmem_free(hash, hashsz);
+ kmem_free(blkfree, blkfreesz);
+
+ /* Account for the returned memory, and repeat if necessary. */
+ DIRHASHLIST_LOCK();
+ atomic_add_int(&ufs_dirhashmem, -mem);
+ }
+ /* Success. */
+ return (0);
+}
+
+static void
+ufsdirhash_sysctl_init(void)
+{
+ const struct sysctlnode *rnode, *cnode;
+
+ sysctl_createv(&ufsdirhash_sysctl_log, 0, NULL, &rnode,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "vfs", NULL,
+ NULL, 0, NULL, 0,
+ CTL_VFS, CTL_EOL);
+
+ sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &rnode,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "ufs",
+ SYSCTL_DESCR("ufs"),
+ NULL, 0, NULL, 0,
+ CTL_CREATE, CTL_EOL);
+
+ sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &rnode,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "dirhash",
+ SYSCTL_DESCR("dirhash"),
+ NULL, 0, NULL, 0,
+ CTL_CREATE, CTL_EOL);
+
+ sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "minblocks",
+ SYSCTL_DESCR("minimum hashed directory size in blocks"),
+ NULL, 0, &ufs_dirhashminblks, 0,
+ CTL_CREATE, CTL_EOL);
+
+ sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "maxmem",
+ SYSCTL_DESCR("maximum dirhash memory usage"),
+ NULL, 0, &ufs_dirhashmaxmem, 0,
+ CTL_CREATE, CTL_EOL);
+
+ sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode,
+ CTLFLAG_PERMANENT|CTLFLAG_READONLY,
+ CTLTYPE_INT, "memused",
+ SYSCTL_DESCR("current dirhash memory usage"),
+ NULL, 0, &ufs_dirhashmem, 0,
+ CTL_CREATE, CTL_EOL);
+
+ sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "docheck",
+ SYSCTL_DESCR("enable extra sanity checks"),
+ NULL, 0, &ufs_dirhashcheck, 0,
+ CTL_CREATE, CTL_EOL);
+}
+
+void
+ufsdirhash_init(void)
+{
+
+ mutex_init(&ufsdirhash_lock, MUTEX_DEFAULT, IPL_NONE);
+ ufsdirhashblk_cache = pool_cache_init(DH_NBLKOFF * sizeof(daddr_t), 0,
+ 0, 0, "dirhashblk", NULL, IPL_NONE, NULL, NULL, NULL);
+ ufsdirhash_cache = pool_cache_init(sizeof(struct dirhash), 0,
+ 0, 0, "dirhash", NULL, IPL_NONE, NULL, NULL, NULL);
+ TAILQ_INIT(&ufsdirhash_list);
+ ufsdirhash_sysctl_init();
+}
+
+void
+ufsdirhash_done(void)
+{
+
+ KASSERT(TAILQ_EMPTY(&ufsdirhash_list));
+ pool_cache_destroy(ufsdirhashblk_cache);
+ pool_cache_destroy(ufsdirhash_cache);
+ mutex_destroy(&ufsdirhash_lock);
+ sysctl_teardown(&ufsdirhash_sysctl_log);
+}
--- /dev/null
+/* $NetBSD: ufs_extattr.c,v 1.35 2011/07/07 14:56:45 manu Exp $ */
+
+/*-
+ * Copyright (c) 1999-2002 Robert N. M. Watson
+ * Copyright (c) 2002-2003 Networks Associates Technology, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * This software was developed for the FreeBSD Project in part by Network
+ * Associates Laboratories, the Security Research Division of Network
+ * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
+ * as part of the DARPA CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Support for file system extended attributes on the UFS1 file system.
+ *
+ * Extended attributes are defined in the form name=value, where name is
+ * a nul-terminated string in the style of a file name, and value is a
+ * binary blob of zero or more bytes. The UFS1 extended attribute service
+ * layers support for extended attributes onto a backing file, in the style
+ * of the quota implementation, meaning that it requires no underlying format
+ * changes to the file system. This design choice exchanges simplicity,
+ * usability, and easy deployment for performance.
+ */
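+/*
+ * Rough sketch of the backing file layout assumed by this code (the
+ * on-disk structures in <ufs/ufs/extattr.h> are authoritative): the file
+ * begins with a struct ufs_extattr_fileheader carrying a magic number, a
+ * version and the fixed per-record data size, followed by one fixed-size
+ * record per inode number, each holding a small header plus up to
+ * uef_size bytes of attribute data.
+ */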
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_extattr.c,v 1.35 2011/07/07 14:56:45 manu Exp $");
+
+#ifdef _KERNEL_OPT
+#include "opt_ffs.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/reboot.h>
+#include <sys/kauth.h>
+#include <sys/kernel.h>
+#include <sys/namei.h>
+#include <sys/malloc.h>
+#include <sys/fcntl.h>
+#include <sys/lwp.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/lock.h>
+#include <sys/dirent.h>
+#include <sys/extattr.h>
+#include <sys/sysctl.h>
+
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/extattr.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+
+static MALLOC_JUSTDEFINE(M_UFS_EXTATTR, "ufs_extattr", "ufs extended attribute");
+
+int ufs_extattr_sync = 1;
+int ufs_extattr_autocreate = 1024;
+
+static int ufs_extattr_valid_attrname(int attrnamespace,
+ const char *attrname);
+static int ufs_extattr_enable_with_open(struct ufsmount *ump,
+ struct vnode *vp, int attrnamespace, const char *attrname,
+ struct lwp *l);
+static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
+ const char *attrname, struct vnode *backing_vnode,
+ struct lwp *l);
+static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
+ const char *attrname, struct lwp *l);
+static int ufs_extattr_get(struct vnode *vp, int attrnamespace,
+ const char *name, struct uio *uio, size_t *size,
+ kauth_cred_t cred, struct lwp *l);
+static int ufs_extattr_list(struct vnode *vp, int attrnamespace,
+ struct uio *uio, size_t *size, int flag,
+ kauth_cred_t cred, struct lwp *l);
+static int ufs_extattr_set(struct vnode *vp, int attrnamespace,
+ const char *name, struct uio *uio, kauth_cred_t cred,
+ struct lwp *l);
+static int ufs_extattr_rm(struct vnode *vp, int attrnamespace,
+ const char *name, kauth_cred_t cred, struct lwp *l);
+static struct ufs_extattr_list_entry *ufs_extattr_find_attr(struct ufsmount *,
+ int, const char *);
+static int ufs_extattr_get_header(struct vnode *,
+ struct ufs_extattr_list_entry *,
+ struct ufs_extattr_header *, off_t *);
+
+/*
+ * Per-FS attribute lock protecting attribute operations.
+ * XXX Right now there is a lot of lock contention due to having a single
+ * lock per-FS; really, this should be far more fine-grained.
+ */
+static void
+ufs_extattr_uepm_lock(struct ufsmount *ump)
+{
+
+ /* XXX Why does this need to be recursive? */
+ if (mutex_owned(&ump->um_extattr.uepm_lock)) {
+ ump->um_extattr.uepm_lockcnt++;
+ return;
+ }
+ mutex_enter(&ump->um_extattr.uepm_lock);
+}
+
+static void
+ufs_extattr_uepm_unlock(struct ufsmount *ump)
+{
+
+ if (ump->um_extattr.uepm_lockcnt != 0) {
+ KASSERT(mutex_owned(&ump->um_extattr.uepm_lock));
+ ump->um_extattr.uepm_lockcnt--;
+ return;
+ }
+ mutex_exit(&ump->um_extattr.uepm_lock);
+}
+
+/*-
+ * Determine whether the name passed is a valid name for an actual
+ * attribute.
+ *
+ * Invalid currently consists of:
+ * NULL pointer for attrname
+ * zero-length attrname (used to retrieve application attribute list)
+ */
+static int
+ufs_extattr_valid_attrname(int attrnamespace, const char *attrname)
+{
+
+ if (attrname == NULL)
+ return (0);
+ if (strlen(attrname) == 0)
+ return (0);
+ return (1);
+}
+
+/*
+ * Autocreate an attribute storage
+ */
+static struct ufs_extattr_list_entry *
+ufs_extattr_autocreate_attr(struct vnode *vp, int attrnamespace,
+ const char *attrname, struct lwp *l)
+{
+ struct mount *mp = vp->v_mount;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct vnode *backing_vp;
+ struct nameidata nd;
+ struct pathbuf *pb;
+ char *path;
+ struct ufs_extattr_fileheader uef;
+ struct ufs_extattr_list_entry *uele;
+ int error;
+
+ path = PNBUF_GET();
+
+ /*
+ * We only support system and user namespace autocreation
+ */
+ switch (attrnamespace) {
+ case EXTATTR_NAMESPACE_SYSTEM:
+ (void)snprintf(path, PATH_MAX, "%s/%s/%s/%s",
+ mp->mnt_stat.f_mntonname,
+ UFS_EXTATTR_FSROOTSUBDIR,
+ UFS_EXTATTR_SUBDIR_SYSTEM,
+ attrname);
+ break;
+ case EXTATTR_NAMESPACE_USER:
+ (void)snprintf(path, PATH_MAX, "%s/%s/%s/%s",
+ mp->mnt_stat.f_mntonname,
+ UFS_EXTATTR_FSROOTSUBDIR,
+ UFS_EXTATTR_SUBDIR_USER,
+ attrname);
+ break;
+ default:
+ PNBUF_PUT(path);
+ return NULL;
+ break;
+ }
+
+ /*
+ * When setting attribute on the root vnode, we get it
+ * already locked, and vn_open/namei/VFS_ROOT will try to
+	 * lock it, causing a panic. Unlock it first.
+ */
+	if (vp->v_vflag & VV_ROOT) {
+ KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
+ VOP_UNLOCK(vp);
+ }
+ KASSERT(VOP_ISLOCKED(vp) == 0);
+
+ pb = pathbuf_create(path);
+ NDINIT(&nd, CREATE, LOCKPARENT, pb);
+
+ error = vn_open(&nd, O_CREAT|O_RDWR, 0600);
+
+ /*
+ * Reacquire the lock on the vnode if it was root.
+ */
+ KASSERT(VOP_ISLOCKED(vp) == 0);
+	if (vp->v_vflag & VV_ROOT)
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
+
+ if (error != 0) {
+ pathbuf_destroy(pb);
+ PNBUF_PUT(path);
+ return NULL;
+ }
+
+ KASSERT(nd.ni_vp != NULL);
+ KASSERT(VOP_ISLOCKED(nd.ni_vp) == LK_EXCLUSIVE);
+ KASSERT(VOP_ISLOCKED(nd.ni_dvp) == 0);
+
+ /*
+ * backing_vp is the backing store.
+ */
+ backing_vp = nd.ni_vp;
+ pathbuf_destroy(pb);
+ PNBUF_PUT(path);
+
+ uef.uef_magic = UFS_EXTATTR_MAGIC;
+ uef.uef_version = UFS_EXTATTR_VERSION;
+ uef.uef_size = ufs_extattr_autocreate;
+
+ error = vn_rdwr(UIO_WRITE, backing_vp, &uef, sizeof(uef), 0,
+ UIO_SYSSPACE, IO_NODELOCKED|IO_APPEND,
+ l->l_cred, NULL, l);
+
+ VOP_UNLOCK(backing_vp);
+
+ if (error != 0) {
+ printf("%s: write uef header failed for %s, error = %d\n",
+ __func__, attrname, error);
+ vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
+ return NULL;
+ }
+
+ /*
+ * ufs_extattr_enable_with_open increases the vnode reference
+ * count. Not sure why, but do the same here.
+ */
+ vref(vp);
+
+ /*
+ * Now enable attribute.
+ */
+	error = ufs_extattr_enable(ump, attrnamespace, attrname, backing_vp, l);
+ KASSERT(VOP_ISLOCKED(backing_vp) == 0);
+
+ if (error != 0) {
+ printf("%s: enable %s failed, error %d\n",
+ __func__, attrname, error);
+ vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
+ return NULL;
+ }
+
+ uele = ufs_extattr_find_attr(ump, attrnamespace, attrname);
+ if (uele == NULL) {
+		printf("%s: attribute %s created but not found!\n",
+ __func__, attrname);
+ vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
+ return NULL;
+ }
+
+ printf("%s: EA backing store autocreated for %s\n",
+ mp->mnt_stat.f_mntonname, attrname);
+
+ return uele;
+}
+
+/*
+ * Locate an attribute given a name and mountpoint.
+ * Must be holding uepm lock for the mount point.
+ */
+static struct ufs_extattr_list_entry *
+ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace,
+ const char *attrname)
+{
+ struct ufs_extattr_list_entry *search_attribute;
+
+ for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list);
+ search_attribute != NULL;
+ search_attribute = LIST_NEXT(search_attribute, uele_entries)) {
+ if (!(strncmp(attrname, search_attribute->uele_attrname,
+ UFS_EXTATTR_MAXEXTATTRNAME)) &&
+ (attrnamespace == search_attribute->uele_attrnamespace)) {
+ return (search_attribute);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Initialize per-FS structures supporting extended attributes. Do not
+ * start extended attributes yet.
+ */
+void
+ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm)
+{
+
+ uepm->uepm_flags = 0;
+ uepm->uepm_lockcnt = 0;
+
+ LIST_INIT(&uepm->uepm_list);
+ mutex_init(&uepm->uepm_lock, MUTEX_DEFAULT, IPL_NONE);
+ uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED;
+}
+
+/*
+ * Destroy per-FS structures supporting extended attributes. Assumes
+ * that EAs have already been stopped, and will panic if not.
+ */
+void
+ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm)
+{
+
+ if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
+ panic("ufs_extattr_uepm_destroy: not initialized");
+
+ if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED))
+ panic("ufs_extattr_uepm_destroy: called while still started");
+
+ /*
+ * It's not clear that either order for the next two lines is
+ * ideal, and it should never be a problem if this is only called
+ * during unmount, and with vfs_busy().
+ */
+ uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED;
+ mutex_destroy(&uepm->uepm_lock);
+}
+
+/*
+ * Start extended attribute support on an FS.
+ */
+int
+ufs_extattr_start(struct mount *mp, struct lwp *l)
+{
+ struct ufsmount *ump;
+ int error = 0;
+
+ ump = VFSTOUFS(mp);
+
+ ufs_extattr_uepm_lock(ump);
+
+ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) {
+ error = EOPNOTSUPP;
+ goto unlock;
+ }
+ if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) {
+ error = EBUSY;
+ goto unlock;
+ }
+
+ ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED;
+
+ ump->um_extattr.uepm_ucred = l->l_cred;
+ kauth_cred_hold(ump->um_extattr.uepm_ucred);
+
+ unlock:
+ ufs_extattr_uepm_unlock(ump);
+
+ return (error);
+}
+
+/*
+ * Helper routine: given a locked parent directory and filename, return
+ * the locked vnode of the inode associated with the name. Will not
+ * follow symlinks, may return any type of vnode. Lock on parent will
+ * be released even in the event of a failure. In the event that the
+ * target is the parent (i.e., "."), there will be two references and
+ * one lock, requiring the caller to possibly special-case.
+ */
+static int
+ufs_extattr_lookup(struct vnode *start_dvp, int lockparent, const char *dirname,
+ struct vnode **vp, struct lwp *l)
+{
+ struct vop_lookup_args vargs;
+ struct componentname cnp;
+ struct vnode *target_vp;
+ char *pnbuf;
+ int error;
+
+ KASSERT(VOP_ISLOCKED(start_dvp) == LK_EXCLUSIVE);
+
+ pnbuf = PNBUF_GET();
+
+ memset(&cnp, 0, sizeof(cnp));
+ cnp.cn_nameiop = LOOKUP;
+ cnp.cn_flags = ISLASTCN | lockparent;
+ cnp.cn_cred = l->l_cred;
+ cnp.cn_nameptr = pnbuf;
+ error = copystr(dirname, pnbuf, MAXPATHLEN, &cnp.cn_namelen);
+ if (error) {
+ if (lockparent == 0) {
+ VOP_UNLOCK(start_dvp);
+ }
+ PNBUF_PUT(pnbuf);
+ printf("ufs_extattr_lookup: copystr failed\n");
+ return (error);
+ }
+ cnp.cn_namelen--; /* trim nul termination */
+ vargs.a_desc = NULL;
+ vargs.a_dvp = start_dvp;
+ vargs.a_vpp = &target_vp;
+ vargs.a_cnp = &cnp;
+ error = ufs_lookup(&vargs);
+ PNBUF_PUT(pnbuf);
+ if (error) {
+ if (lockparent == 0) {
+ VOP_UNLOCK(start_dvp);
+ }
+ return (error);
+ }
+#if 0
+ if (target_vp == start_dvp)
+ panic("ufs_extattr_lookup: target_vp == start_dvp");
+#endif
+
+ if ((target_vp != start_dvp) && (lockparent == 0))
+ VOP_UNLOCK(start_dvp);
+
+ KASSERT(VOP_ISLOCKED(target_vp) == LK_EXCLUSIVE);
+ *vp = target_vp;
+ return (0);
+}
+
+/*
+ * Enable an EA using the passed filesystem, backing vnode, attribute name,
+ * namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp
+ * to be locked when passed in. The vnode will be returned unlocked,
+ * regardless of success/failure of the function. As a result, the caller
+ * will always need to vrele(), but not vput().
+ */
+static int
+ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp,
+ int attrnamespace, const char *attrname, struct lwp *l)
+{
+ int error;
+
+ error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred);
+ if (error) {
+ printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed "
+ "with %d\n", error);
+ VOP_UNLOCK(vp);
+ return (error);
+ }
+
+ mutex_enter(vp->v_interlock);
+ vp->v_writecount++;
+ mutex_exit(vp->v_interlock);
+
+ vref(vp);
+
+ VOP_UNLOCK(vp);
+
+ error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, l);
+ if (error != 0)
+ vn_close(vp, FREAD|FWRITE, l->l_cred);
+ return (error);
+}
+
+/*
+ * Given a locked directory vnode, iterate over the names in the directory
+ * and use ufs_extattr_lookup() to retrieve locked vnodes of potential
+ * attribute files. Then invoke ufs_extattr_enable_with_open() on each
+ * to attempt to start the attribute. Leaves the directory locked on
+ * exit.
+ */
+static int
+ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp,
+ int attrnamespace, struct lwp *l)
+{
+ struct vop_readdir_args vargs;
+ struct statvfs *sbp = &ump->um_mountp->mnt_stat;
+ struct dirent *dp, *edp;
+ struct vnode *attr_vp;
+ struct uio auio;
+ struct iovec aiov;
+ char *dirbuf;
+ int error, eofflag = 0;
+
+ if (dvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ dirbuf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK);
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_offset = 0;
+ UIO_SETUP_SYSSPACE(&auio);
+
+ vargs.a_desc = NULL;
+ vargs.a_vp = dvp;
+ vargs.a_uio = &auio;
+ vargs.a_cred = l->l_cred;
+ vargs.a_eofflag = &eofflag;
+ vargs.a_ncookies = NULL;
+ vargs.a_cookies = NULL;
+
+ while (!eofflag) {
+ auio.uio_resid = DIRBLKSIZ;
+ aiov.iov_base = dirbuf;
+ aiov.iov_len = DIRBLKSIZ;
+ error = ufs_readdir(&vargs);
+ if (error) {
+ printf("ufs_extattr_iterate_directory: ufs_readdir "
+ "%d\n", error);
+			free(dirbuf, M_TEMP);
+			return (error);
+ }
+
+ /*
+ * XXXRW: While in UFS, we always get DIRBLKSIZ returns from
+ * the directory code on success, on other file systems this
+ * may not be the case. For portability, we should check the
+ * read length on return from ufs_readdir().
+ */
+ edp = (struct dirent *)&dirbuf[DIRBLKSIZ];
+ for (dp = (struct dirent *)dirbuf; dp < edp; ) {
+ if (dp->d_reclen == 0)
+ break;
+ /* Skip "." and ".." */
+ if (dp->d_name[0] == '.' &&
+ (dp->d_name[1] == '\0' ||
+ (dp->d_name[1] == '.' && dp->d_name[2] == '\0')))
+ goto next;
+ error = ufs_extattr_lookup(dvp, LOCKPARENT,
+ dp->d_name, &attr_vp, l);
+ if (error == ENOENT) {
+ goto next; /* keep silent */
+ } else if (error) {
+ printf("ufs_extattr_iterate_directory: lookup "
+ "%s %d\n", dp->d_name, error);
+ } else if (attr_vp == dvp) {
+ vrele(attr_vp);
+ } else if (attr_vp->v_type != VREG) {
+ vput(attr_vp);
+ } else {
+ error = ufs_extattr_enable_with_open(ump,
+ attr_vp, attrnamespace, dp->d_name, l);
+ vrele(attr_vp);
+ if (error) {
+ printf("ufs_extattr_iterate_directory: "
+ "enable %s %d\n", dp->d_name,
+ error);
+ } else if (bootverbose) {
+ printf("%s: EA %s loaded\n",
+ sbp->f_mntonname, dp->d_name);
+ }
+ }
+ next:
+ dp = (struct dirent *) ((char *)dp + dp->d_reclen);
+ if (dp >= edp)
+ break;
+ }
+ }
+ free(dirbuf, M_TEMP);
+
+ return (0);
+}
+
+/*
+ * Auto-start of extended attributes, to be executed (optionally) at
+ * mount-time.
+ */
+int
+ufs_extattr_autostart(struct mount *mp, struct lwp *l)
+{
+ struct vnode *rvp, *attr_dvp, *attr_system_dvp, *attr_user_dvp;
+ int error;
+
+ /*
+ * Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root?
+ * If so, automatically start EA's.
+ */
+ error = VFS_ROOT(mp, &rvp);
+ if (error) {
+ printf("ufs_extattr_autostart.VFS_ROOT() returned %d\n",
+ error);
+ return (error);
+ }
+
+ KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE);
+
+ error = ufs_extattr_lookup(rvp, 0,
+ UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, l);
+ if (error) {
+ /* rvp ref'd but now unlocked */
+ KASSERT(VOP_ISLOCKED(rvp) == 0);
+ vrele(rvp);
+ return (error);
+ }
+ if (rvp == attr_dvp) {
+ /* Should never happen. */
+ KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE);
+ vrele(attr_dvp);
+ vput(rvp);
+ return (EINVAL);
+ }
+ KASSERT(VOP_ISLOCKED(rvp) == 0);
+ vrele(rvp);
+
+ KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
+
+ if (attr_dvp->v_type != VDIR) {
+ printf("ufs_extattr_autostart: %s != VDIR\n",
+ UFS_EXTATTR_FSROOTSUBDIR);
+ goto return_vput_attr_dvp;
+ }
+
+ error = ufs_extattr_start(mp, l);
+ if (error) {
+ printf("ufs_extattr_autostart: ufs_extattr_start failed (%d)\n",
+ error);
+ goto return_vput_attr_dvp;
+ }
+
+ /*
+ * Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM,
+ * UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory,
+ * and start each with the appropriate namespace type. Failures in
+ * either don't result in an overall failure. attr_dvp is left locked to
+ * be cleaned up on exit.
+ */
+ error = ufs_extattr_lookup(attr_dvp, LOCKPARENT,
+ UFS_EXTATTR_SUBDIR_SYSTEM, &attr_system_dvp, l);
+ KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
+ if (error == 0) {
+ KASSERT(VOP_ISLOCKED(attr_system_dvp) == LK_EXCLUSIVE);
+ error = ufs_extattr_iterate_directory(VFSTOUFS(mp),
+ attr_system_dvp, EXTATTR_NAMESPACE_SYSTEM, l);
+ if (error)
+ printf("ufs_extattr_iterate_directory returned %d\n",
+ error);
+ KASSERT(VOP_ISLOCKED(attr_system_dvp) == LK_EXCLUSIVE);
+ vput(attr_system_dvp);
+ }
+
+ error = ufs_extattr_lookup(attr_dvp, LOCKPARENT,
+ UFS_EXTATTR_SUBDIR_USER, &attr_user_dvp, l);
+ KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
+ if (error == 0) {
+ KASSERT(VOP_ISLOCKED(attr_user_dvp) == LK_EXCLUSIVE);
+ error = ufs_extattr_iterate_directory(VFSTOUFS(mp),
+ attr_user_dvp, EXTATTR_NAMESPACE_USER, l);
+ if (error)
+ printf("ufs_extattr_iterate_directory returned %d\n",
+ error);
+ KASSERT(VOP_ISLOCKED(attr_user_dvp) == LK_EXCLUSIVE);
+ vput(attr_user_dvp);
+ }
+
+ /* Mask startup failures in sub-directories. */
+ error = 0;
+
+ return_vput_attr_dvp:
+ KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
+ vput(attr_dvp);
+
+ return (error);
+}
+
+/*
+ * Stop extended attribute support on an FS.
+ */
+void
+ufs_extattr_stop(struct mount *mp, struct lwp *l)
+{
+ struct ufs_extattr_list_entry *uele;
+ struct ufsmount *ump = VFSTOUFS(mp);
+
+ ufs_extattr_uepm_lock(ump);
+
+ /*
+ * If we haven't been started, no big deal. Just short-circuit
+ * the processing work.
+ */
+ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
+ goto unlock;
+ }
+
+ while (LIST_FIRST(&ump->um_extattr.uepm_list) != NULL) {
+ uele = LIST_FIRST(&ump->um_extattr.uepm_list);
+ ufs_extattr_disable(ump, uele->uele_attrnamespace,
+ uele->uele_attrname, l);
+ }
+
+ ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED;
+
+ kauth_cred_free(ump->um_extattr.uepm_ucred);
+ ump->um_extattr.uepm_ucred = NULL;
+
+ unlock:
+ ufs_extattr_uepm_unlock(ump);
+}
+
+/*
+ * Enable a named attribute on the specified filesystem; provide an
+ * unlocked backing vnode to hold the attribute data.
+ */
+static int
+ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
+ const char *attrname, struct vnode *backing_vnode, struct lwp *l)
+{
+ struct ufs_extattr_list_entry *attribute;
+ struct iovec aiov;
+ struct uio auio;
+ int error = 0;
+
+ if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
+ return (EINVAL);
+ if (backing_vnode->v_type != VREG)
+ return (EINVAL);
+
+ attribute = malloc(sizeof(*attribute), M_UFS_EXTATTR,
+ M_WAITOK | M_ZERO);
+
+ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
+ error = EOPNOTSUPP;
+ goto free_exit;
+ }
+
+ if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) {
+ error = EEXIST;
+ goto free_exit;
+ }
+
+ strncpy(attribute->uele_attrname, attrname,
+ UFS_EXTATTR_MAXEXTATTRNAME);
+ attribute->uele_attrnamespace = attrnamespace;
+ memset(&attribute->uele_fileheader, 0,
+ sizeof(struct ufs_extattr_fileheader));
+
+ attribute->uele_backing_vnode = backing_vnode;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ aiov.iov_base = (void *) &attribute->uele_fileheader;
+ aiov.iov_len = sizeof(struct ufs_extattr_fileheader);
+ auio.uio_resid = sizeof(struct ufs_extattr_fileheader);
+ auio.uio_offset = (off_t) 0;
+ auio.uio_rw = UIO_READ;
+ UIO_SETUP_SYSSPACE(&auio);
+
+ vn_lock(backing_vnode, LK_SHARED | LK_RETRY);
+ error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED,
+ ump->um_extattr.uepm_ucred);
+
+ if (error)
+ goto unlock_free_exit;
+
+ if (auio.uio_resid != 0) {
+ printf("ufs_extattr_enable: malformed attribute header\n");
+ error = EINVAL;
+ goto unlock_free_exit;
+ }
+
+ /*
+ * Try to determine the byte order of the attribute file.
+ */
+ if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) {
+ attribute->uele_flags |= UELE_F_NEEDSWAP;
+ attribute->uele_fileheader.uef_magic =
+ ufs_rw32(attribute->uele_fileheader.uef_magic,
+ UELE_NEEDSWAP(attribute));
+ if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) {
+ printf("ufs_extattr_enable: invalid attribute header "
+ "magic\n");
+ error = EINVAL;
+ goto unlock_free_exit;
+ }
+ }
+ attribute->uele_fileheader.uef_version =
+ ufs_rw32(attribute->uele_fileheader.uef_version,
+ UELE_NEEDSWAP(attribute));
+ attribute->uele_fileheader.uef_size =
+ ufs_rw32(attribute->uele_fileheader.uef_size,
+ UELE_NEEDSWAP(attribute));
+
+ if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) {
+ printf("ufs_extattr_enable: incorrect attribute header "
+ "version\n");
+ error = EINVAL;
+ goto unlock_free_exit;
+ }
+
+ LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute,
+ uele_entries);
+
+ VOP_UNLOCK(backing_vnode);
+ return (0);
+
+ unlock_free_exit:
+ VOP_UNLOCK(backing_vnode);
+
+ free_exit:
+ free(attribute, M_UFS_EXTATTR);
+ return (error);
+}
+
+/*
+ * Disable extended attribute support on an FS.
+ */
+static int
+ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
+ const char *attrname, struct lwp *l)
+{
+ struct ufs_extattr_list_entry *uele;
+ int error = 0;
+
+ if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
+ return (EINVAL);
+
+ uele = ufs_extattr_find_attr(ump, attrnamespace, attrname);
+ if (!uele)
+ return (ENOATTR);
+
+ LIST_REMOVE(uele, uele_entries);
+
+ error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE,
+ l->l_cred);
+
+ free(uele, M_UFS_EXTATTR);
+
+ return (error);
+}
+
+/*
+ * VFS call to manage extended attributes in UFS. If filename_vp is
+ * non-NULL, it must be passed in locked, and regardless of errors in
+ * processing, will be unlocked.
+ */
+int
+ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
+ int attrnamespace, const char *attrname)
+{
+ struct lwp *l = curlwp;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ int error;
+
+ /*
+ * Only privileged processes can configure extended attributes.
+ */
+ if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+ NULL)) != 0) {
+ if (filename_vp != NULL)
+ VOP_UNLOCK(filename_vp);
+ return (error);
+ }
+
+ switch (cmd) {
+ case UFS_EXTATTR_CMD_START:
+ if (filename_vp != NULL) {
+ VOP_UNLOCK(filename_vp);
+ return (EINVAL);
+ }
+ if (attrname != NULL)
+ return (EINVAL);
+
+ error = ufs_extattr_autostart(mp, l);
+ return (error);
+
+ case UFS_EXTATTR_CMD_STOP:
+ if (filename_vp != NULL) {
+ VOP_UNLOCK(filename_vp);
+ return (EINVAL);
+ }
+ if (attrname != NULL)
+ return (EINVAL);
+
+ ufs_extattr_stop(mp, l);
+ return (0);
+
+ case UFS_EXTATTR_CMD_ENABLE:
+ if (filename_vp == NULL)
+ return (EINVAL);
+ if (attrname == NULL) {
+ VOP_UNLOCK(filename_vp);
+ return (EINVAL);
+ }
+
+ /*
+ * ufs_extattr_enable_with_open() will always unlock the
+ * vnode, regardless of failure.
+ */
+ ufs_extattr_uepm_lock(ump);
+ error = ufs_extattr_enable_with_open(ump, filename_vp,
+ attrnamespace, attrname, l);
+ ufs_extattr_uepm_unlock(ump);
+ return (error);
+
+ case UFS_EXTATTR_CMD_DISABLE:
+ if (filename_vp != NULL) {
+ VOP_UNLOCK(filename_vp);
+ return (EINVAL);
+ }
+ if (attrname == NULL)
+ return (EINVAL);
+
+ ufs_extattr_uepm_lock(ump);
+ error = ufs_extattr_disable(ump, attrnamespace, attrname, l);
+ ufs_extattr_uepm_unlock(ump);
+ return (error);
+
+ default:
+ if (filename_vp != NULL)
+ VOP_UNLOCK(filename_vp);
+ return (EINVAL);
+ }
+}
+
+/*
+ * Read extended attribute header for a given vnode and attribute.
+ * Backing vnode should be locked and unlocked by caller.
+ */
+static int
+ufs_extattr_get_header(struct vnode *vp, struct ufs_extattr_list_entry *uele,
+ struct ufs_extattr_header *ueh, off_t *bap)
+{
+ struct mount *mp = vp->v_mount;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct inode *ip = VTOI(vp);
+ off_t base_offset;
+ struct iovec aiov;
+ struct uio aio;
+ int error;
+
+ /*
+ * Find base offset of header in file based on file header size, and
+ * data header size + maximum data size, indexed by inode number.
+ */
+ base_offset = sizeof(struct ufs_extattr_fileheader) +
+ ip->i_number * (sizeof(struct ufs_extattr_header) +
+ uele->uele_fileheader.uef_size);
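+ /*
+ * Illustration with hypothetical sizes: for a 12-byte file header,
+ * a 12-byte per-entry header and a uef_size of 64, the record for
+ * inode 10 would start at 12 + 10 * (12 + 64) = 772.
+ */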
+
+ /*
+ * Read in the data header to see if the data is defined, and if so
+ * how much.
+ */
+ memset(ueh, 0, sizeof(struct ufs_extattr_header));
+ aiov.iov_base = ueh;
+ aiov.iov_len = sizeof(struct ufs_extattr_header);
+ aio.uio_iov = &aiov;
+ aio.uio_iovcnt = 1;
+ aio.uio_rw = UIO_READ;
+ aio.uio_offset = base_offset;
+ aio.uio_resid = sizeof(struct ufs_extattr_header);
+ UIO_SETUP_SYSSPACE(&aio);
+
+ error = VOP_READ(uele->uele_backing_vnode, &aio,
+ IO_NODELOCKED, ump->um_extattr.uepm_ucred);
+ if (error)
+ return error;
+
+ /*
+ * Attribute headers are kept in file system byte order.
+ * XXX What about the blob of data?
+ */
+ ueh->ueh_flags = ufs_rw32(ueh->ueh_flags, UELE_NEEDSWAP(uele));
+ ueh->ueh_len = ufs_rw32(ueh->ueh_len, UELE_NEEDSWAP(uele));
+ ueh->ueh_i_gen = ufs_rw32(ueh->ueh_i_gen, UELE_NEEDSWAP(uele));
+
+ /* Defined? */
+ if ((ueh->ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0)
+ return ENOATTR;
+
+ /* Valid for the current inode generation? */
+ if (ueh->ueh_i_gen != ip->i_gen) {
+ /*
+ * The inode itself has a different generation number
+ * than the uele data. For now, the best solution
+ * is to coerce this to undefined, and let it get cleaned
+ * up by the next write or extattrctl clean.
+ */
+ printf("%s (%s): inode gen inconsistency (%u, %jd)\n",
+ __func__, mp->mnt_stat.f_mntonname, ueh->ueh_i_gen,
+ (intmax_t)ip->i_gen);
+ return ENOATTR;
+ }
+
+ /* Local size consistency check. */
+ if (ueh->ueh_len > uele->uele_fileheader.uef_size)
+ return ENXIO;
+
+ /* Return base offset */
+ if (bap != NULL)
+ *bap = base_offset;
+
+ return 0;
+}
+
+/*
+ * Vnode operation to retrieve a named extended attribute.
+ */
+int
+ufs_getextattr(struct vop_getextattr_args *ap)
+/*
+vop_getextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ INOUT struct uio *a_uio;
+ OUT size_t *a_size;
+ IN kauth_cred_t a_cred;
+};
+*/
+{
+ struct mount *mp = ap->a_vp->v_mount;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ int error;
+
+ ufs_extattr_uepm_lock(ump);
+
+ error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name,
+ ap->a_uio, ap->a_size, ap->a_cred, curlwp);
+
+ ufs_extattr_uepm_unlock(ump);
+
+ return (error);
+}
+
+/*
+ * Real work associated with retrieving a named attribute--assumes that
+ * the attribute lock has already been grabbed.
+ */
+static int
+ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name,
+ struct uio *uio, size_t *size, kauth_cred_t cred, struct lwp *l)
+{
+ struct ufs_extattr_list_entry *attribute;
+ struct ufs_extattr_header ueh;
+ struct mount *mp = vp->v_mount;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ off_t base_offset;
+ size_t len, old_len;
+ int error = 0;
+
+ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
+ return (EOPNOTSUPP);
+
+ if (strlen(name) == 0)
+ return (EINVAL);
+
+ error = extattr_check_cred(vp, attrnamespace, cred, l, IREAD);
+ if (error)
+ return (error);
+
+ attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
+ if (!attribute)
+ return (ENOATTR);
+
+ /*
+ * Allow only offsets of zero to encourage the read/replace
+ * extended attribute semantic. Otherwise we can't guarantee
+ * atomicity, as we don't provide locks for extended attributes.
+ */
+ if (uio != NULL && uio->uio_offset != 0)
+ return (ENXIO);
+
+ /*
+ * Don't need to get a lock on the backing file if the getattr is
+ * being applied to the backing file, as the lock is already held.
+ */
+ if (attribute->uele_backing_vnode != vp)
+ vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY);
+
+ error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset);
+ if (error)
+ goto vopunlock_exit;
+
+ /* Return full data size if caller requested it. */
+ if (size != NULL)
+ *size = ueh.ueh_len;
+
+ /* Return data if the caller requested it. */
+ if (uio != NULL) {
+ /* Allow for offset into the attribute data. */
+ uio->uio_offset = base_offset + sizeof(struct
+ ufs_extattr_header);
+
+ /*
+ * Figure out maximum to transfer -- use buffer size and
+ * local data limit.
+ */
+ len = MIN(uio->uio_resid, ueh.ueh_len);
+ old_len = uio->uio_resid;
+ uio->uio_resid = len;
+
+ error = VOP_READ(attribute->uele_backing_vnode, uio,
+ IO_NODELOCKED, ump->um_extattr.uepm_ucred);
+ if (error)
+ goto vopunlock_exit;
+
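+ /*
+ * VOP_READ() leaves the untransferred byte count in
+ * uio_resid; report the residual relative to the caller's
+ * original request.
+ */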
+ uio->uio_resid = old_len - (len - uio->uio_resid);
+ }
+
+ vopunlock_exit:
+
+ if (uio != NULL)
+ uio->uio_offset = 0;
+
+ if (attribute->uele_backing_vnode != vp)
+ VOP_UNLOCK(attribute->uele_backing_vnode);
+
+ return (error);
+}
+
+/*
+ * Vnode operation to list extended attribute for a vnode
+ */
+int
+ufs_listextattr(struct vop_listextattr_args *ap)
+/*
+vop_listextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ INOUT struct uio *a_uio;
+ OUT size_t *a_size;
+ IN int a_flag;
+ IN kauth_cred_t a_cred;
+};
+*/
+{
+ struct mount *mp = ap->a_vp->v_mount;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ int error;
+
+ ufs_extattr_uepm_lock(ump);
+
+ error = ufs_extattr_list(ap->a_vp, ap->a_attrnamespace,
+ ap->a_uio, ap->a_size, ap->a_flag, ap->a_cred, curlwp);
+
+ ufs_extattr_uepm_unlock(ump);
+
+ return (error);
+}
+
+/*
+ * Real work associated with retrieving list of attributes--assumes that
+ * the attribute lock has already been grabbed.
+ */
+static int
+ufs_extattr_list(struct vnode *vp, int attrnamespace,
+ struct uio *uio, size_t *size, int flag,
+ kauth_cred_t cred, struct lwp *l)
+{
+ struct ufs_extattr_list_entry *uele;
+ struct ufs_extattr_header ueh;
+ struct mount *mp = vp->v_mount;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ size_t listsize = 0;
+ int error = 0;
+
+ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
+ return (EOPNOTSUPP);
+
+ error = extattr_check_cred(vp, attrnamespace, cred, l, IREAD);
+ if (error)
+ return (error);
+
+ LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) {
+ unsigned char attrnamelen;
+
+ if (uele->uele_attrnamespace != attrnamespace)
+ continue;
+
+ error = ufs_extattr_get_header(vp, uele, &ueh, NULL);
+ if (error == ENOATTR)
+ continue;
+ if (error != 0)
+ return error;
+
+ /*
+ * Don't need to get a lock on the backing file if
+ * the listattr is being applied to the backing file,
+ * as the lock is already held.
+ */
+ if (uele->uele_backing_vnode != vp)
+ vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY);
+
+ /*
+ * +1 for trailing NUL (listxattr flavor)
+ * or leading name length (extattr_list_file flavor)
+ */
+ attrnamelen = strlen(uele->uele_attrname);
+ listsize += attrnamelen + 1;
+
+ /* Return data if the caller requested it. */
+ if (uio != NULL) {
+ /*
+ * We support two flavors. Either NUL-terminated
+ * strings (a la listxattr), or non NUL-terminated,
+ * one byte length prefixed strings (for
+ * extattr_list_file). EXTATTR_LIST_LENPREFIX switches
+ * that second behavior.
+ */
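+ /*
+ * For example, an attribute named "md5" is emitted either as
+ * 'm' 'd' '5' '\0' or as a length byte of 3 followed by
+ * 'm' 'd' '5'.
+ */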
+ if (flag & EXTATTR_LIST_LENPREFIX) {
+ uint8_t len = (uint8_t)attrnamelen;
+
+ /* Copy leading name length */
+ error = uiomove(&len, sizeof(len), uio);
+ } else {
+ /* Include trailing NUL */
+ attrnamelen++;
+ }
+
+ if (error == 0)
+ error = uiomove(uele->uele_attrname,
+ (size_t)attrnamelen, uio);
+ }
+
+ if (uele->uele_backing_vnode != vp)
+ VOP_UNLOCK(uele->uele_backing_vnode);
+
+ if (error != 0)
+ return error;
+ }
+
+ if (uio != NULL)
+ uio->uio_offset = 0;
+
+ /* Return full data size if caller requested it. */
+ if (size != NULL)
+ *size = listsize;
+
+ return 0;
+}
+
+/*
+ * Vnode operation to remove a named attribute.
+ */
+int
+ufs_deleteextattr(struct vop_deleteextattr_args *ap)
+/*
+vop_deleteextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ IN kauth_cred_t a_cred;
+};
+*/
+{
+ struct mount *mp = ap->a_vp->v_mount;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ int error;
+
+ ufs_extattr_uepm_lock(ump);
+
+ error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name,
+ ap->a_cred, curlwp);
+
+ ufs_extattr_uepm_unlock(ump);
+
+ return (error);
+}
+
+/*
+ * Vnode operation to set a named attribute.
+ */
+int
+ufs_setextattr(struct vop_setextattr_args *ap)
+/*
+vop_setextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ INOUT struct uio *a_uio;
+ IN kauth_cred_t a_cred;
+};
+*/
+{
+ struct mount *mp = ap->a_vp->v_mount;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ int error;
+
+ ufs_extattr_uepm_lock(ump);
+
+ /*
+ * XXX: No longer a supported way to delete extended attributes.
+ */
+ if (ap->a_uio == NULL) {
+ ufs_extattr_uepm_unlock(ump);
+ return (EINVAL);
+ }
+
+ error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name,
+ ap->a_uio, ap->a_cred, curlwp);
+
+ ufs_extattr_uepm_unlock(ump);
+
+ return (error);
+}
+
+/*
+ * Real work associated with setting a vnode's extended attributes;
+ * assumes that the attribute lock has already been grabbed.
+ */
+static int
+ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name,
+ struct uio *uio, kauth_cred_t cred, struct lwp *l)
+{
+ struct ufs_extattr_list_entry *attribute;
+ struct ufs_extattr_header ueh;
+ struct iovec local_aiov;
+ struct uio local_aio;
+ struct mount *mp = vp->v_mount;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct inode *ip = VTOI(vp);
+ off_t base_offset;
+ int error = 0, ioflag;
+
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (EROFS);
+ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
+ return (EOPNOTSUPP);
+ if (!ufs_extattr_valid_attrname(attrnamespace, name))
+ return (EINVAL);
+
+ error = extattr_check_cred(vp, attrnamespace, cred, l, IWRITE);
+ if (error)
+ return (error);
+
+ attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
+ if (!attribute) {
+ attribute = ufs_extattr_autocreate_attr(vp, attrnamespace,
+ name, l);
+ if (!attribute)
+ return (ENOATTR);
+ }
+
+ /*
+ * Early rejection of invalid offsets/length.
+ * Reject: any offset but 0 (replace)
+ * Any size greater than attribute size limit
+ */
+ if (uio->uio_offset != 0 ||
+ uio->uio_resid > attribute->uele_fileheader.uef_size)
+ return (ENXIO);
+
+ /*
+ * Find base offset of header in file based on file header size, and
+ * data header size + maximum data size, indexed by inode number.
+ */
+ base_offset = sizeof(struct ufs_extattr_fileheader) +
+ ip->i_number * (sizeof(struct ufs_extattr_header) +
+ attribute->uele_fileheader.uef_size);
+
+ /*
+ * Write out a data header for the data.
+ */
+ ueh.ueh_len = ufs_rw32((uint32_t) uio->uio_resid,
+ UELE_NEEDSWAP(attribute));
+ ueh.ueh_flags = ufs_rw32(UFS_EXTATTR_ATTR_FLAG_INUSE,
+ UELE_NEEDSWAP(attribute));
+ ueh.ueh_i_gen = ufs_rw32(ip->i_gen, UELE_NEEDSWAP(attribute));
+ local_aiov.iov_base = &ueh;
+ local_aiov.iov_len = sizeof(struct ufs_extattr_header);
+ local_aio.uio_iov = &local_aiov;
+ local_aio.uio_iovcnt = 1;
+ local_aio.uio_rw = UIO_WRITE;
+ local_aio.uio_offset = base_offset;
+ local_aio.uio_resid = sizeof(struct ufs_extattr_header);
+ UIO_SETUP_SYSSPACE(&local_aio);
+
+ /*
+ * Don't need to get a lock on the backing file if the setattr is
+ * being applied to the backing file, as the lock is already held.
+ */
+ if (attribute->uele_backing_vnode != vp)
+ vn_lock(attribute->uele_backing_vnode,
+ LK_EXCLUSIVE | LK_RETRY);
+
+ ioflag = IO_NODELOCKED;
+ if (ufs_extattr_sync)
+ ioflag |= IO_SYNC;
+ error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag,
+ ump->um_extattr.uepm_ucred);
+ if (error)
+ goto vopunlock_exit;
+
+ if (local_aio.uio_resid != 0) {
+ error = ENXIO;
+ goto vopunlock_exit;
+ }
+
+ /*
+ * Write out user data.
+ * XXX NOT ATOMIC WITH RESPECT TO THE HEADER.
+ */
+ uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header);
+
+ ioflag = IO_NODELOCKED;
+ if (ufs_extattr_sync)
+ ioflag |= IO_SYNC;
+ error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag,
+ ump->um_extattr.uepm_ucred);
+
+ vopunlock_exit:
+ uio->uio_offset = 0;
+
+ if (attribute->uele_backing_vnode != vp)
+ VOP_UNLOCK(attribute->uele_backing_vnode);
+
+ return (error);
+}
+
+/*
+ * Real work associated with removing an extended attribute from a vnode.
+ * Assumes the attribute lock has already been grabbed.
+ */
+static int
+ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name,
+ kauth_cred_t cred, struct lwp *l)
+{
+ struct ufs_extattr_list_entry *attribute;
+ struct ufs_extattr_header ueh;
+ struct mount *mp = vp->v_mount;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct iovec local_aiov;
+ struct uio local_aio;
+ off_t base_offset;
+ int error = 0, ioflag;
+
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (EROFS);
+ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
+ return (EOPNOTSUPP);
+ if (!ufs_extattr_valid_attrname(attrnamespace, name))
+ return (EINVAL);
+
+ error = extattr_check_cred(vp, attrnamespace, cred, l, IWRITE);
+ if (error)
+ return (error);
+
+ attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
+ if (!attribute)
+ return (ENOATTR);
+
+ /*
+ * Don't need to get a lock on the backing file if the getattr is
+ * being applied to the backing file, as the lock is already held.
+ */
+ if (attribute->uele_backing_vnode != vp)
+ vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY);
+
+ error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset);
+ if (error)
+ goto vopunlock_exit;
+
+ /* Flag it as not in use. */
+ ueh.ueh_flags = 0; /* No need to byte swap 0 */
+ ueh.ueh_len = 0; /* ...ditto... */
+
+ local_aiov.iov_base = &ueh;
+ local_aiov.iov_len = sizeof(struct ufs_extattr_header);
+ local_aio.uio_iov = &local_aiov;
+ local_aio.uio_iovcnt = 1;
+ local_aio.uio_rw = UIO_WRITE;
+ local_aio.uio_offset = base_offset;
+ local_aio.uio_resid = sizeof(struct ufs_extattr_header);
+ UIO_SETUP_SYSSPACE(&local_aio);
+
+ ioflag = IO_NODELOCKED;
+ if (ufs_extattr_sync)
+ ioflag |= IO_SYNC;
+ error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag,
+ ump->um_extattr.uepm_ucred);
+ if (error)
+ goto vopunlock_exit;
+
+ if (local_aio.uio_resid != 0)
+ error = ENXIO;
+
+ vopunlock_exit:
+ if (attribute->uele_backing_vnode != vp)
+ VOP_UNLOCK(attribute->uele_backing_vnode);
+
+ return (error);
+}
+
+/*
+ * Called by UFS when an inode is no longer active and should have its
+ * attributes stripped.
+ */
+void
+ufs_extattr_vnode_inactive(struct vnode *vp, struct lwp *l)
+{
+ struct ufs_extattr_list_entry *uele;
+ struct mount *mp = vp->v_mount;
+ struct ufsmount *ump = VFSTOUFS(mp);
+
+ /*
+ * If the EA machinery has not been initialized we cannot take the
+ * uepm lock. We should not have any active vnodes on the fs if it
+ * is not yet initialized but is going to be, so this check can go
+ * unlocked.
+ */
+ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
+ return;
+
+ ufs_extattr_uepm_lock(ump);
+
+ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
+ ufs_extattr_uepm_unlock(ump);
+ return;
+ }
+
+ LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries)
+ ufs_extattr_rm(vp, uele->uele_attrnamespace,
+ uele->uele_attrname, lwp0.l_cred, l);
+
+ ufs_extattr_uepm_unlock(ump);
+}
+
+void
+ufs_extattr_init(void)
+{
+
+ malloc_type_attach(M_UFS_EXTATTR);
+}
+
+void
+ufs_extattr_done(void)
+{
+
+ malloc_type_detach(M_UFS_EXTATTR);
+}
--- /dev/null
+/* $NetBSD: ufs_ihash.c,v 1.31 2011/06/12 03:36:02 rmind Exp $ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_ihash.c 8.7 (Berkeley) 5/17/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_ihash.c,v 1.31 2011/06/12 03:36:02 rmind Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/mutex.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+
+/*
+ * Structures associated with inode caching.
+ */
+static LIST_HEAD(ihashhead, inode) *ihashtbl;
+static u_long ihash; /* size of hash table - 1 */
+#define INOHASH(device, inum) (((device) + (inum)) & ihash)
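+/*
+ * hashinit() sizes the table to a power of two and returns size - 1
+ * in ihash, so the mask above reduces (device + inum) modulo the
+ * table size.
+ */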
+
+kmutex_t ufs_ihash_lock;
+kmutex_t ufs_hashlock;
+
+/*
+ * Initialize inode hash table.
+ */
+void
+ufs_ihashinit(void)
+{
+
+ mutex_init(&ufs_hashlock, MUTEX_DEFAULT, IPL_NONE);
+ mutex_init(&ufs_ihash_lock, MUTEX_DEFAULT, IPL_NONE);
+ ihashtbl = hashinit(desiredvnodes, HASH_LIST, true, &ihash);
+}
+
+/*
+ * Reinitialize inode hash table.
+ */
+
+void
+ufs_ihashreinit(void)
+{
+ struct inode *ip;
+ struct ihashhead *oldhash, *hash;
+ u_long oldmask, mask, val;
+ int i;
+
+ hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
+ mutex_enter(&ufs_ihash_lock);
+ oldhash = ihashtbl;
+ oldmask = ihash;
+ ihashtbl = hash;
+ ihash = mask;
+ for (i = 0; i <= oldmask; i++) {
+ while ((ip = LIST_FIRST(&oldhash[i])) != NULL) {
+ LIST_REMOVE(ip, i_hash);
+ val = INOHASH(ip->i_dev, ip->i_number);
+ LIST_INSERT_HEAD(&hash[val], ip, i_hash);
+ }
+ }
+ mutex_exit(&ufs_ihash_lock);
+ hashdone(oldhash, HASH_LIST, oldmask);
+}
+
+/*
+ * Free inode hash table.
+ */
+void
+ufs_ihashdone(void)
+{
+
+ hashdone(ihashtbl, HASH_LIST, ihash);
+ mutex_destroy(&ufs_hashlock);
+ mutex_destroy(&ufs_ihash_lock);
+}
+
+/*
+ * Use the device/inum pair to find the incore inode, and return a pointer
+ * to it. If it is in core, return it, even if it is locked.
+ */
+struct vnode *
+ufs_ihashlookup(dev_t dev, ino_t inum)
+{
+ struct inode *ip;
+ struct ihashhead *ipp;
+
+ KASSERT(mutex_owned(&ufs_ihash_lock));
+
+ ipp = &ihashtbl[INOHASH(dev, inum)];
+ LIST_FOREACH(ip, ipp, i_hash) {
+ if (inum == ip->i_number && dev == ip->i_dev)
+ break;
+ }
+ if (ip)
+ return (ITOV(ip));
+ return (NULLVP);
+}
+
+/*
+ * Use the device/inum pair to find the incore inode, and return a pointer
+ * to it. If it is in core, but locked, wait for it.
+ */
+struct vnode *
+ufs_ihashget(dev_t dev, ino_t inum, int flags)
+{
+ struct ihashhead *ipp;
+ struct inode *ip;
+ struct vnode *vp;
+
+ loop:
+ mutex_enter(&ufs_ihash_lock);
+ ipp = &ihashtbl[INOHASH(dev, inum)];
+ LIST_FOREACH(ip, ipp, i_hash) {
+ if (inum == ip->i_number && dev == ip->i_dev) {
+ vp = ITOV(ip);
+ if (flags == 0) {
+ mutex_exit(&ufs_ihash_lock);
+ } else {
+ mutex_enter(vp->v_interlock);
+ mutex_exit(&ufs_ihash_lock);
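+ /*
+ * vget() may fail if the vnode is being
+ * recycled; restart the lookup from scratch.
+ */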
+ if (vget(vp, flags))
+ goto loop;
+ }
+ return (vp);
+ }
+ }
+ mutex_exit(&ufs_ihash_lock);
+ return (NULL);
+}
+
+/*
+ * Insert the inode into the hash table, and return it locked.
+ */
+void
+ufs_ihashins(struct inode *ip)
+{
+ struct ihashhead *ipp;
+
+ KASSERT(mutex_owned(&ufs_hashlock));
+
+ /* lock the inode, then put it on the appropriate hash list */
+ VOP_LOCK(ITOV(ip), LK_EXCLUSIVE);
+
+ mutex_enter(&ufs_ihash_lock);
+ ipp = &ihashtbl[INOHASH(ip->i_dev, ip->i_number)];
+ LIST_INSERT_HEAD(ipp, ip, i_hash);
+ mutex_exit(&ufs_ihash_lock);
+}
+
+/*
+ * Remove the inode from the hash table.
+ */
+void
+ufs_ihashrem(struct inode *ip)
+{
+ mutex_enter(&ufs_ihash_lock);
+ LIST_REMOVE(ip, i_hash);
+ mutex_exit(&ufs_ihash_lock);
+}
--- /dev/null
+/* $NetBSD: ufs_inode.c,v 1.88 2011/09/20 14:01:33 chs Exp $ */
+
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_inode.c 8.9 (Berkeley) 5/14/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.88 2011/09/20 14:01:33 chs Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#include "opt_wapbl.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kernel.h>
+#include <sys/namei.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+#include <sys/kmem.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
+#ifdef UFS_DIRHASH
+#include <ufs/ufs/dirhash.h>
+#endif
+#ifdef UFS_EXTATTR
+#include <ufs/ufs/extattr.h>
+#endif
+
+#include <uvm/uvm.h>
+
+extern int prtactive;
+
+/*
+ * Last reference to an inode. If necessary, write or delete it.
+ */
+int
+ufs_inactive(void *v)
+{
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ bool *a_recycle;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ struct mount *transmp;
+ mode_t mode;
+ int error = 0;
+ int logged = 0;
+
+ UFS_WAPBL_JUNLOCK_ASSERT(vp->v_mount);
+
+ transmp = vp->v_mount;
+ fstrans_start(transmp, FSTRANS_LAZY);
+ /*
+ * Ignore inodes related to stale file handles.
+ */
+ if (ip->i_mode == 0)
+ goto out;
+ if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+#ifdef UFS_EXTATTR
+ ufs_extattr_vnode_inactive(vp, curlwp);
+#endif
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error)
+ goto out;
+ logged = 1;
+ if (ip->i_size != 0) {
+ /*
+ * When journaling, only truncate one indirect block
+ * at a time
+ */
+ if (vp->v_mount->mnt_wapbl) {
+ uint64_t incr = MNINDIR(ip->i_ump) <<
+ vp->v_mount->mnt_fs_bshift; /* Power of 2 */
+ uint64_t base = NDADDR <<
+ vp->v_mount->mnt_fs_bshift;
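+ /*
+ * incr is the number of bytes mapped by one
+ * indirect block and base the number mapped by
+ * the direct blocks, so each pass below frees
+ * at most one indirect block's worth of data.
+ */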
+ while (!error && ip->i_size > base + incr) {
+ /*
+ * round down to next full indirect
+ * block boundary.
+ */
+ uint64_t nsize = base +
+ ((ip->i_size - base - 1) &
+ ~(incr - 1));
+ error = UFS_TRUNCATE(vp, nsize, 0,
+ NOCRED);
+ if (error)
+ break;
+ UFS_WAPBL_END(vp->v_mount);
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error)
+ goto out;
+ }
+ }
+ if (!error)
+ error = UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED);
+ }
+#if defined(QUOTA) || defined(QUOTA2)
+ (void)chkiq(ip, -1, NOCRED, 0);
+#endif
+ DIP_ASSIGN(ip, rdev, 0);
+ mode = ip->i_mode;
+ ip->i_mode = 0;
+ ip->i_omode = mode;
+ DIP_ASSIGN(ip, mode, 0);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ /*
+ * Defer final inode free and update to ufs_reclaim().
+ */
+ }
+
+ if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) {
+ if (!logged++) {
+ int err;
+ err = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (err)
+ goto out;
+ }
+ UFS_UPDATE(vp, NULL, NULL, 0);
+ }
+ if (logged)
+ UFS_WAPBL_END(vp->v_mount);
+out:
+ /*
+ * If we are done with the inode, reclaim it
+ * so that it can be reused immediately.
+ */
+ *ap->a_recycle = (ip->i_mode == 0);
+ VOP_UNLOCK(vp);
+ fstrans_done(transmp);
+ return (error);
+}
+
+/*
+ * Reclaim an inode so that it can be used for other purposes.
+ */
+int
+ufs_reclaim(struct vnode *vp)
+{
+ struct inode *ip = VTOI(vp);
+
+ if (prtactive && vp->v_usecount > 1)
+ vprint("ufs_reclaim: pushing active", vp);
+
+ if (!UFS_WAPBL_BEGIN(vp->v_mount)) {
+ UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE);
+ UFS_WAPBL_END(vp->v_mount);
+ }
+ UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE);
+
+ /*
+ * Remove the inode from its hash chain.
+ */
+ ufs_ihashrem(ip);
+
+ if (ip->i_devvp) {
+ vrele(ip->i_devvp);
+ ip->i_devvp = 0;
+ }
+#if defined(QUOTA) || defined(QUOTA2)
+ ufsquota_free(ip);
+#endif
+#ifdef UFS_DIRHASH
+ if (ip->i_dirhash != NULL)
+ ufsdirhash_free(ip);
+#endif
+ return (0);
+}
+
+/*
+ * allocate a range of blocks in a file.
+ * after this function returns, any page entirely contained within the range
+ * will map to invalid data and thus must be overwritten before it is made
+ * accessible to others.
+ */
+
+int
+ufs_balloc_range(struct vnode *vp, off_t off, off_t len, kauth_cred_t cred,
+ int flags)
+{
+ off_t neweof; /* file size after the operation */
+ off_t neweob; /* offset next to the last block after the operation */
+ off_t pagestart; /* starting offset of range covered by pgs */
+ off_t eob; /* offset next to allocated blocks */
+ struct uvm_object *uobj;
+ int i, delta, error, npages;
+ int bshift = vp->v_mount->mnt_fs_bshift;
+ int bsize = 1 << bshift;
+ int ppb = MAX(bsize >> PAGE_SHIFT, 1);
+ struct vm_page **pgs;
+ size_t pgssize;
+ UVMHIST_FUNC("ufs_balloc_range"); UVMHIST_CALLED(ubchist);
+ UVMHIST_LOG(ubchist, "vp %p off 0x%x len 0x%x u_size 0x%x",
+ vp, off, len, vp->v_size);
+
+ neweof = MAX(vp->v_size, off + len);
+ GOP_SIZE(vp, neweof, &neweob, 0);
+
+ error = 0;
+ uobj = &vp->v_uobj;
+
+ /*
+ * read or create pages covering the range of the allocation and
+ * keep them locked until the new block is allocated, so there
+ * will be no window where the old contents of the new block are
+ * visible to racing threads.
+ */
+
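+ /*
+ * pagestart is the start of the range rounded down to both a page
+ * and a file system block boundary; npages is capped at one file
+ * system block's worth of pages.
+ */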
+ pagestart = trunc_page(off) & ~(bsize - 1);
+ npages = MIN(ppb, (round_page(neweob) - pagestart) >> PAGE_SHIFT);
+ pgssize = npages * sizeof(struct vm_page *);
+ pgs = kmem_zalloc(pgssize, KM_SLEEP);
+
+ /*
+ * adjust off to be block-aligned.
+ */
+
+ delta = off & (bsize - 1);
+ off -= delta;
+ len += delta;
+
+ genfs_node_wrlock(vp);
+ mutex_enter(uobj->vmobjlock);
+ error = VOP_GETPAGES(vp, pagestart, pgs, &npages, 0,
+ VM_PROT_WRITE, 0, PGO_SYNCIO | PGO_PASTEOF | PGO_NOBLOCKALLOC |
+ PGO_NOTIMESTAMP | PGO_GLOCKHELD);
+ if (error) {
+ goto out;
+ }
+
+ /*
+ * now allocate the range.
+ */
+
+ error = GOP_ALLOC(vp, off, len, flags, cred);
+ genfs_node_unlock(vp);
+
+ /*
+ * if the allocation succeeded, clear PG_CLEAN on all the pages
+ * and clear PG_RDONLY on any pages that are now fully backed
+ * by disk blocks. if the allocation failed, we do not invalidate
+ * the pages since they might have already existed and been dirty,
+ * in which case we need to keep them around. if we created the pages,
+ * they will be clean and read-only, and leaving such pages
+ * in the cache won't cause any problems.
+ */
+
+ GOP_SIZE(vp, off + len, &eob, 0);
+ mutex_enter(uobj->vmobjlock);
+ mutex_enter(&uvm_pageqlock);
+ for (i = 0; i < npages; i++) {
+ KASSERT((pgs[i]->flags & PG_RELEASED) == 0);
+ if (!error) {
+ if (off <= pagestart + (i << PAGE_SHIFT) &&
+ pagestart + ((i + 1) << PAGE_SHIFT) <= eob) {
+ pgs[i]->flags &= ~PG_RDONLY;
+ }
+ pgs[i]->flags &= ~PG_CLEAN;
+ }
+ uvm_pageactivate(pgs[i]);
+ }
+ mutex_exit(&uvm_pageqlock);
+ uvm_page_unbusy(pgs, npages);
+ mutex_exit(uobj->vmobjlock);
+
+ out:
+ kmem_free(pgs, pgssize);
+ return error;
+}
--- /dev/null
+/* $NetBSD: ufs_lookup.c,v 1.111 2011/07/17 22:07:59 dholland Exp $ */
+
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_lookup.c 8.9 (Berkeley) 8/11/94
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.111 2011/07/17 22:07:59 dholland Exp $");
+
+#ifdef _KERNEL_OPT
+#include "opt_ffs.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/kernel.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+#include <sys/proc.h>
+#include <sys/kmem.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#ifdef UFS_DIRHASH
+#include <ufs/ufs/dirhash.h>
+#endif
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#ifdef DIAGNOSTIC
+int dirchk = 1;
+#else
+int dirchk = 0;
+#endif
+
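+/*
+ * FSFMT() is true when the file system uses the old directory format
+ * that predates the d_type field (IMNT_DTYPE not set).
+ */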
+#define FSFMT(vp) (((vp)->v_mount->mnt_iflag & IMNT_DTYPE) == 0)
+
+/*
+ * Convert a component of a pathname into a pointer to a locked inode.
+ * This is a very central and rather complicated routine.
+ * If the file system is not maintained in a strict tree hierarchy,
+ * this can result in a deadlock situation (see comments in code below).
+ *
+ * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending
+ * on whether the name is to be looked up, created, renamed, or deleted.
+ * When CREATE, RENAME, or DELETE is specified, information usable in
+ * creating, renaming, or deleting a directory entry may be calculated.
+ * If flag has LOCKPARENT or'ed into it and the target of the pathname
+ * exists, lookup returns both the target and its parent directory locked.
+ * When creating or renaming and LOCKPARENT is specified, the target may
+ * not be ".". When deleting and LOCKPARENT is specified, the target may
+ * be ".", but the caller must check to ensure it does a vrele and vput
+ * instead of two vputs.
+ *
+ * Overall outline of ufs_lookup:
+ *
+ * check accessibility of directory
+ * look for name in cache, if found, then if at end of path
+ * and deleting or creating, drop it, else return name
+ * search for name in directory, to found or notfound
+ * notfound:
+ * if creating, return locked directory, leaving info on available slots
+ * else return error
+ * found:
+ * if at end of path and deleting, return information to allow delete
+ * if at end of path and rewriting (RENAME and LOCKPARENT), lock target
+ * inode and return info to allow rewrite
+ * if not at end, add name to cache; if at end and neither creating
+ * nor deleting, add name to cache
+ */
+int
+ufs_lookup(void *v)
+{
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap = v;
+ struct vnode *vdp = ap->a_dvp; /* vnode for directory being searched */
+ struct inode *dp = VTOI(vdp); /* inode for directory being searched */
+ struct buf *bp; /* a buffer of directory entries */
+ struct direct *ep; /* the current directory entry */
+ int entryoffsetinblock; /* offset of ep in bp's buffer */
+ enum {NONE, COMPACT, FOUND} slotstatus;
+ doff_t slotoffset; /* offset of area with free space */
+ int slotsize; /* size of area at slotoffset */
+ int slotfreespace; /* amount of space free in slot */
+ int slotneeded; /* size of the entry we're seeking */
+ int numdirpasses; /* strategy for directory search */
+ doff_t endsearch; /* offset to end directory search */
+ doff_t prevoff; /* prev entry dp->i_offset */
+ struct vnode *pdp; /* saved dp during symlink work */
+ struct vnode *tdp; /* returned by VFS_VGET */
+ doff_t enduseful; /* pointer past last used dir slot */
+ u_long bmask; /* block offset mask */
+ int namlen, error;
+ struct vnode **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ kauth_cred_t cred = cnp->cn_cred;
+ int flags;
+ int nameiop = cnp->cn_nameiop;
+ struct ufsmount *ump = dp->i_ump;
+ const int needswap = UFS_MPNEEDSWAP(ump);
+ int dirblksiz = ump->um_dirblksiz;
+ ino_t foundino;
+ struct ufs_lookup_results *results;
+
+ flags = cnp->cn_flags;
+
+ bp = NULL;
+ slotoffset = -1;
+ *vpp = NULL;
+ endsearch = 0; /* silence compiler warning */
+
+ /*
+ * Produce the auxiliary lookup results into i_crap. Increment
+ * its serial number so elsewhere we can tell if we're using
+ * stale results. This should not be done this way. XXX.
+ */
+ results = &dp->i_crap;
+ dp->i_crapcounter++;
+
+ /*
+ * Check accessibility of directory.
+ */
+ if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0)
+ return (error);
+
+ if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) &&
+ (nameiop == DELETE || nameiop == RENAME))
+ return (EROFS);
+
+ /*
+ * We now have a segment name to search for, and a directory to search.
+ *
+ * Before tediously performing a linear scan of the directory,
+ * check the name cache to see if the directory/name pair
+ * we are looking for is known already.
+ */
+ if ((error = cache_lookup(vdp, vpp, cnp)) >= 0) {
+ return (error);
+ }
+
+ fstrans_start(vdp->v_mount, FSTRANS_SHARED);
+
+ /*
+ * Suppress search for slots unless creating
+ * file and at end of pathname, in which case
+ * we watch for a place to put the new file in
+ * case it doesn't already exist.
+ */
+ slotstatus = FOUND;
+ slotfreespace = slotsize = slotneeded = 0;
+ if ((nameiop == CREATE || nameiop == RENAME) &&
+ (flags & ISLASTCN)) {
+ slotstatus = NONE;
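+ /* Smallest directory record that can hold the new name. */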
+ slotneeded = DIRECTSIZ(cnp->cn_namelen);
+ }
+
+ /*
+ * If there is cached information on a previous search of
+ * this directory, pick up where we last left off.
+ * We cache only lookups as these are the most common
+ * and have the greatest payoff. Caching CREATE has little
+ * benefit as it usually must search the entire directory
+ * to determine that the entry does not exist. Caching the
+ * location of the last DELETE or RENAME has not reduced
+ * profiling time and hence has been removed in the interest
+ * of simplicity.
+ */
+ bmask = vdp->v_mount->mnt_stat.f_iosize - 1;
+
+#ifdef UFS_DIRHASH
+ /*
+ * Use dirhash for fast operations on large directories. The logic
+ * to determine whether to hash the directory is contained within
+ * ufsdirhash_build(); a zero return means that it decided to hash
+ * this directory and it successfully built up the hash table.
+ */
+ if (ufsdirhash_build(dp) == 0) {
+ /* Look for a free slot if needed. */
+ enduseful = dp->i_size;
+ if (slotstatus != FOUND) {
+ slotoffset = ufsdirhash_findfree(dp, slotneeded,
+ &slotsize);
+ if (slotoffset >= 0) {
+ slotstatus = COMPACT;
+ enduseful = ufsdirhash_enduseful(dp);
+ if (enduseful < 0)
+ enduseful = dp->i_size;
+ }
+ }
+ /* Look up the component. */
+ numdirpasses = 1;
+ entryoffsetinblock = 0; /* silence compiler warning */
+ switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen,
+ &results->ulr_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) {
+ case 0:
+ ep = (struct direct *)((char *)bp->b_data +
+ (results->ulr_offset & bmask));
+ goto foundentry;
+ case ENOENT:
+ results->ulr_offset = roundup(dp->i_size, dirblksiz);
+ goto notfound;
+ default:
+ /* Something failed; just do a linear search. */
+ break;
+ }
+ }
+#endif /* UFS_DIRHASH */
+
+ if (nameiop != LOOKUP || results->ulr_diroff == 0 ||
+ results->ulr_diroff >= dp->i_size) {
+ entryoffsetinblock = 0;
+ results->ulr_offset = 0;
+ numdirpasses = 1;
+ } else {
+ results->ulr_offset = results->ulr_diroff;
+ if ((entryoffsetinblock = results->ulr_offset & bmask) &&
+ (error = ufs_blkatoff(vdp, (off_t)results->ulr_offset,
+ NULL, &bp, false)))
+ goto out;
+ numdirpasses = 2;
+ nchstats.ncs_2passes++;
+ }
+ prevoff = results->ulr_offset;
+ endsearch = roundup(dp->i_size, dirblksiz);
+ enduseful = 0;
+
+searchloop:
+ while (results->ulr_offset < endsearch) {
+ if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD)
+ preempt();
+ /*
+ * If necessary, get the next directory block.
+ */
+ if ((results->ulr_offset & bmask) == 0) {
+ if (bp != NULL)
+ brelse(bp, 0);
+ error = ufs_blkatoff(vdp, (off_t)results->ulr_offset, NULL,
+ &bp, false);
+ if (error)
+ goto out;
+ entryoffsetinblock = 0;
+ }
+ /*
+ * If still looking for a slot, and at a DIRBLKSIZ
+ * boundary, have to start looking for free space again.
+ */
+ if (slotstatus == NONE &&
+ (entryoffsetinblock & (dirblksiz - 1)) == 0) {
+ slotoffset = -1;
+ slotfreespace = 0;
+ }
+ /*
+ * Get pointer to next entry.
+ * Full validation checks are slow, so we only check
+ * enough to ensure forward progress through the
+ * directory. Complete checks can be run by patching
+ * "dirchk" to be true.
+ */
+ KASSERT(bp != NULL);
+ ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock);
+ if (ep->d_reclen == 0 ||
+ (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) {
+ int i;
+
+ ufs_dirbad(dp, results->ulr_offset, "mangled entry");
+ i = dirblksiz - (entryoffsetinblock & (dirblksiz - 1));
+ results->ulr_offset += i;
+ entryoffsetinblock += i;
+ continue;
+ }
+
+ /*
+ * If an appropriate sized slot has not yet been found,
+ * check to see if one is available. Also accumulate space
+ * in the current block so that we can determine if
+ * compaction is viable.
+ */
+ if (slotstatus != FOUND) {
+ int size = ufs_rw16(ep->d_reclen, needswap);
+
+ if (ep->d_ino != 0)
+ size -= DIRSIZ(FSFMT(vdp), ep, needswap);
+ if (size > 0) {
+ if (size >= slotneeded) {
+ slotstatus = FOUND;
+ slotoffset = results->ulr_offset;
+ slotsize = ufs_rw16(ep->d_reclen,
+ needswap);
+ } else if (slotstatus == NONE) {
+ slotfreespace += size;
+ if (slotoffset == -1)
+ slotoffset = results->ulr_offset;
+ if (slotfreespace >= slotneeded) {
+ slotstatus = COMPACT;
+ slotsize = results->ulr_offset +
+ ufs_rw16(ep->d_reclen,
+ needswap) -
+ slotoffset;
+ }
+ }
+ }
+ }
+
+ /*
+ * Check for a name match.
+ */
+ if (ep->d_ino) {
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+ if (FSFMT(vdp) && needswap == 0)
+ namlen = ep->d_type;
+ else
+ namlen = ep->d_namlen;
+#else
+ if (FSFMT(vdp) && needswap != 0)
+ namlen = ep->d_type;
+ else
+ namlen = ep->d_namlen;
+#endif
+ if (namlen == cnp->cn_namelen &&
+ !memcmp(cnp->cn_nameptr, ep->d_name,
+ (unsigned)namlen)) {
+#ifdef UFS_DIRHASH
+foundentry:
+#endif
+ /*
+ * Save directory entry's inode number and
+ * reclen in ndp->ni_ufs area, and release
+ * directory buffer.
+ */
+ if (!FSFMT(vdp) && ep->d_type == DT_WHT) {
+ slotstatus = FOUND;
+ slotoffset = results->ulr_offset;
+ slotsize = ufs_rw16(ep->d_reclen,
+ needswap);
+ results->ulr_reclen = slotsize;
+ /*
+ * This is used to set results->ulr_endoff,
+ * which may be used by ufs_direnter2()
+ * as a length to truncate the
+ * directory to. Therefore, it must
+ * point past the end of the last
+ * non-empty directory entry. We don't
+ * know where that is in this case, so
+ * we effectively disable shrinking by
+ * using the existing size of the
+ * directory.
+ *
+ * Note that we wouldn't expect to
+ * shrink the directory while rewriting
+ * an existing entry anyway.
+ */
+ enduseful = endsearch;
+ ap->a_cnp->cn_flags |= ISWHITEOUT;
+ numdirpasses--;
+ goto notfound;
+ }
+ foundino = ufs_rw32(ep->d_ino, needswap);
+ results->ulr_reclen = ufs_rw16(ep->d_reclen, needswap);
+ goto found;
+ }
+ }
+ prevoff = results->ulr_offset;
+ results->ulr_offset += ufs_rw16(ep->d_reclen, needswap);
+ entryoffsetinblock += ufs_rw16(ep->d_reclen, needswap);
+ if (ep->d_ino)
+ enduseful = results->ulr_offset;
+ }
+notfound:
+ /*
+ * If we started in the middle of the directory and failed
+ * to find our target, we must check the beginning as well.
+ */
+ if (numdirpasses == 2) {
+ numdirpasses--;
+ results->ulr_offset = 0;
+ endsearch = results->ulr_diroff;
+ goto searchloop;
+ }
+ if (bp != NULL)
+ brelse(bp, 0);
+ /*
+ * If creating, and at end of pathname and current
+ * directory has not been removed, then can consider
+ * allowing file to be created.
+ */
+ if ((nameiop == CREATE || nameiop == RENAME ||
+ (nameiop == DELETE &&
+ (ap->a_cnp->cn_flags & DOWHITEOUT) &&
+ (ap->a_cnp->cn_flags & ISWHITEOUT))) &&
+ (flags & ISLASTCN) && dp->i_nlink != 0) {
+ /*
+ * Access for write is interpreted as allowing
+ * creation of files in the directory.
+ */
+ error = VOP_ACCESS(vdp, VWRITE, cred);
+ if (error)
+ goto out;
+ /*
+ * Return an indication of where the new directory
+ * entry should be put. If we didn't find a slot,
+ * then set results->ulr_count to 0 indicating
+ * that the new slot belongs at the end of the
+ * directory. If we found a slot, then the new entry
+ * can be put in the range from results->ulr_offset to
+ * results->ulr_offset + results->ulr_count.
+ */
+ if (slotstatus == NONE) {
+ results->ulr_offset = roundup(dp->i_size, dirblksiz);
+ results->ulr_count = 0;
+ enduseful = results->ulr_offset;
+ } else if (nameiop == DELETE) {
+ results->ulr_offset = slotoffset;
+ if ((results->ulr_offset & (dirblksiz - 1)) == 0)
+ results->ulr_count = 0;
+ else
+ results->ulr_count = results->ulr_offset - prevoff;
+ } else {
+ results->ulr_offset = slotoffset;
+ results->ulr_count = slotsize;
+ if (enduseful < slotoffset + slotsize)
+ enduseful = slotoffset + slotsize;
+ }
+ results->ulr_endoff = roundup(enduseful, dirblksiz);
+#if 0 /* commented out by dbj. none of the on disk fields changed */
+ dp->i_flag |= IN_CHANGE | IN_UPDATE;
+#endif
+ /*
+ * We return with the directory locked, so that
+ * the parameters we set up above will still be
+ * valid if we actually decide to do a direnter().
+ * We return ni_vp == NULL to indicate that the entry
+ * does not currently exist; we leave a pointer to
+ * the (locked) directory inode in ndp->ni_dvp.
+ *
+ * NB - if the directory is unlocked, then this
+ * information cannot be used.
+ */
+ error = EJUSTRETURN;
+ goto out;
+ }
+ /*
+ * Insert name into cache (as non-existent) if appropriate.
+ */
+ if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
+ cache_enter(vdp, *vpp, cnp);
+ error = ENOENT;
+ goto out;
+
+found:
+ if (numdirpasses == 2)
+ nchstats.ncs_pass2++;
+ /*
+ * Check that directory length properly reflects presence
+ * of this entry.
+ */
+ if (results->ulr_offset + DIRSIZ(FSFMT(vdp), ep, needswap) > dp->i_size) {
+ ufs_dirbad(dp, results->ulr_offset, "i_size too small");
+ dp->i_size = results->ulr_offset + DIRSIZ(FSFMT(vdp), ep, needswap);
+ DIP_ASSIGN(dp, size, dp->i_size);
+ dp->i_flag |= IN_CHANGE | IN_UPDATE;
+ UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
+ }
+ brelse(bp, 0);
+
+ /*
+ * Found component in pathname.
+ * If the final component of path name, save information
+ * in the cache as to where the entry was found.
+ */
+ if ((flags & ISLASTCN) && nameiop == LOOKUP)
+ results->ulr_diroff = results->ulr_offset &~ (dirblksiz - 1);
+
+ /*
+ * If deleting, and at end of pathname, return
+ * parameters which can be used to remove file.
+ * Lock the inode, being careful with ".".
+ */
+ if (nameiop == DELETE && (flags & ISLASTCN)) {
+ /*
+ * Write access to directory required to delete files.
+ */
+ error = VOP_ACCESS(vdp, VWRITE, cred);
+ if (error)
+ goto out;
+ /*
+ * Return pointer to current entry in results->ulr_offset,
+ * and distance past previous entry (if there
+ * is a previous entry in this block) in results->ulr_count.
+ * Save directory inode pointer in ndp->ni_dvp for dirremove().
+ */
+ if ((results->ulr_offset & (dirblksiz - 1)) == 0)
+ results->ulr_count = 0;
+ else
+ results->ulr_count = results->ulr_offset - prevoff;
+ if (dp->i_number == foundino) {
+ vref(vdp);
+ *vpp = vdp;
+ error = 0;
+ goto out;
+ }
+ if (flags & ISDOTDOT)
+ VOP_UNLOCK(vdp); /* race to get the inode */
+ error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+ if (flags & ISDOTDOT)
+ vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY);
+ if (error)
+ goto out;
+ /*
+ * If directory is "sticky", then user must own
+ * the directory, or the file in it, else she
+ * may not delete it (unless she's root). This
+ * implements append-only directories.
+ */
+ if ((dp->i_mode & ISVTX) &&
+ kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
+ NULL) != 0 &&
+ kauth_cred_geteuid(cred) != dp->i_uid &&
+ VTOI(tdp)->i_uid != kauth_cred_geteuid(cred)) {
+ vput(tdp);
+ error = EPERM;
+ goto out;
+ }
+ *vpp = tdp;
+ error = 0;
+ goto out;
+ }
+
+ /*
+ * If rewriting (RENAME), return the inode and the
+ * information required to rewrite the present directory
+ * Must get inode of directory entry to verify it's a
+ * regular file, or empty directory.
+ */
+ if (nameiop == RENAME && (flags & ISLASTCN)) {
+ error = VOP_ACCESS(vdp, VWRITE, cred);
+ if (error)
+ goto out;
+ /*
+ * Careful about locking second inode.
+ * This can only occur if the target is ".".
+ */
+ if (dp->i_number == foundino) {
+ error = EISDIR;
+ goto out;
+ }
+ if (flags & ISDOTDOT)
+ VOP_UNLOCK(vdp); /* race to get the inode */
+ error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+ if (flags & ISDOTDOT)
+ vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY);
+ if (error)
+ goto out;
+ *vpp = tdp;
+ error = 0;
+ goto out;
+ }
+
+ /*
+ * Step through the translation in the name. We do not `vput' the
+ * directory because we may need it again if a symbolic link
+ * is relative to the current directory. Instead we save it
+ * unlocked as "pdp". We must get the target inode before unlocking
+ * the directory to ensure that the inode will not be removed
+ * before we get it. We prevent deadlock by always fetching
+ * inodes from the root, moving down the directory tree. Thus
+ * when following backward pointers ".." we must unlock the
+ * parent directory before getting the requested directory.
+ * There is a potential race condition here if both the current
+ * and parent directories are removed before the VFS_VGET for the
+ * inode associated with ".." returns. We hope that this occurs
+ * infrequently since we cannot avoid this race condition without
+ * implementing a sophisticated deadlock detection algorithm.
+ * Note also that this simple deadlock detection scheme will not
+ * work if the file system has any hard links other than ".."
+ * that point backwards in the directory structure.
+ */
+ pdp = vdp;
+ if (flags & ISDOTDOT) {
+ VOP_UNLOCK(pdp); /* race to get the inode */
+ error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+ vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY);
+ if (error) {
+ goto out;
+ }
+ *vpp = tdp;
+ } else if (dp->i_number == foundino) {
+ vref(vdp); /* we want ourself, ie "." */
+ *vpp = vdp;
+ } else {
+ error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+ if (error)
+ goto out;
+ *vpp = tdp;
+ }
+
+ /*
+ * Insert name into cache if appropriate.
+ */
+ if (cnp->cn_flags & MAKEENTRY)
+ cache_enter(vdp, *vpp, cnp);
+ error = 0;
+
+out:
+ fstrans_done(vdp->v_mount);
+ return error;
+}
+
+void
+ufs_dirbad(struct inode *ip, doff_t offset, const char *how)
+{
+ struct mount *mp;
+
+ mp = ITOV(ip)->v_mount;
+ printf("%s: bad dir ino %llu at offset %d: %s\n",
+ mp->mnt_stat.f_mntonname, (unsigned long long)ip->i_number,
+ offset, how);
+ if ((mp->mnt_stat.f_flag & MNT_RDONLY) == 0)
+ panic("bad dir");
+}
+
+/*
+ * Do consistency checking on a directory entry:
+ * record length must be multiple of 4
+ * entry must fit in rest of its DIRBLKSIZ block
+ * record must be large enough to contain entry
+ * name is not longer than FFS_MAXNAMLEN
+ * name must be as long as advertised, and null terminated
+ */
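+/*
+ * A rough illustration of these checks (the numbers are only
+ * indicative): with 512-byte directory blocks, an entry that starts
+ * 500 bytes into its block may claim a d_reclen of at most 12, so a
+ * record claiming 24 bytes there is rejected, as is any record whose
+ * length is not a multiple of 4 or is smaller than the DIRSIZ()
+ * computed from its own name length.
+ */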
+int
+ufs_dirbadentry(struct vnode *dp, struct direct *ep, int entryoffsetinblock)
+{
+ int i;
+ int namlen;
+ struct ufsmount *ump = VFSTOUFS(dp->v_mount);
+ const int needswap = UFS_MPNEEDSWAP(ump);
+ int dirblksiz = ump->um_dirblksiz;
+
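+ /*
+ * The old (FSFMT) directory format has no d_type field; its 16-bit
+ * name length occupies the bytes that the new format splits into
+ * d_type and d_namlen.  When such a directory is seen in
+ * little-endian byte order, the significant byte of the old name
+ * length sits where d_type now is, hence the selection below.
+ */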
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+ if (FSFMT(dp) && needswap == 0)
+ namlen = ep->d_type;
+ else
+ namlen = ep->d_namlen;
+#else
+ if (FSFMT(dp) && needswap != 0)
+ namlen = ep->d_type;
+ else
+ namlen = ep->d_namlen;
+#endif
+ if ((ufs_rw16(ep->d_reclen, needswap) & 0x3) != 0 ||
+ ufs_rw16(ep->d_reclen, needswap) >
+ dirblksiz - (entryoffsetinblock & (dirblksiz - 1)) ||
+ ufs_rw16(ep->d_reclen, needswap) <
+ DIRSIZ(FSFMT(dp), ep, needswap) ||
+ namlen > FFS_MAXNAMLEN) {
+ /*return (1); */
+ printf("First bad, reclen=%#x, DIRSIZ=%lu, namlen=%d, "
+ "flags=%#x, entryoffsetinblock=%d, dirblksiz = %d\n",
+ ufs_rw16(ep->d_reclen, needswap),
+ (u_long)DIRSIZ(FSFMT(dp), ep, needswap),
+ namlen, dp->v_mount->mnt_flag, entryoffsetinblock,
+ dirblksiz);
+ goto bad;
+ }
+ if (ep->d_ino == 0)
+ return (0);
+ for (i = 0; i < namlen; i++)
+ if (ep->d_name[i] == '\0') {
+ /*return (1); */
+ printf("Second bad\n");
+ goto bad;
+ }
+ if (ep->d_name[i])
+ goto bad;
+ return (0);
+bad:
+ return (1);
+}
+
+/*
+ * Construct a new directory entry after a call to namei, using the
+ * name in the componentname argument cnp. The argument ip is the
+ * inode to which the new directory entry will refer.
+ */
+void
+ufs_makedirentry(struct inode *ip, struct componentname *cnp,
+ struct direct *newdirp)
+{
+ newdirp->d_ino = ip->i_number;
+ newdirp->d_namlen = cnp->cn_namelen;
+ memcpy(newdirp->d_name, cnp->cn_nameptr, (size_t)cnp->cn_namelen);
+ newdirp->d_name[cnp->cn_namelen] = '\0';
+ if (FSFMT(ITOV(ip)))
+ newdirp->d_type = 0;
+ else
+ newdirp->d_type = IFTODT(ip->i_mode);
+}
+
+/*
+ * Write a directory entry after a call to namei, using the parameters
+ * that ufs_lookup left in nameidata and in the ufs_lookup_results.
+ *
+ * DVP is the directory to be updated. It must be locked.
+ * ULR is the ufs_lookup_results structure from the final lookup step.
+ * TVP is not used. (XXX: why is it here? remove it)
+ * DIRP is the new directory entry contents.
+ * CNP is the componentname from the final lookup step.
+ * NEWDIRBP is not used and (XXX) should be removed. The previous
+ * comment here said it was used by the now-removed softupdates code.
+ *
+ * The link count of the target inode is *not* incremented; the
+ * caller does that.
+ *
+ * If ulr->ulr_count is 0, ufs_lookup did not find space to insert the
+ * directory entry. ulr_offset, which is the place to put the entry,
+ * should be on a block boundary (and should be at the end of the
+ * directory AFAIK) and a fresh block is allocated to put the new
+ * directory entry in.
+ *
+ * If ulr->ulr_count is not zero, ufs_lookup found a slot to insert
+ * the entry into. This slot ranges from ulr_offset to ulr_offset +
+ * ulr_count. However, this slot may already be partially populated
+ * requiring compaction. See notes below.
+ *
+ * Furthermore, if ulr_count is not zero and ulr_endoff is not the
+ * same as i_size, the directory is truncated to size ulr_endoff.
+ */
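+/*
+ * For example (figures purely illustrative): with 512-byte directory
+ * blocks, ulr_count == 0 and ulr_offset == 1024 mean a fresh block is
+ * allocated at offset 1024 and the new entry receives the whole block
+ * (d_reclen == 512).  With ulr_offset == 512 and ulr_count == 64, the
+ * new entry must fit in the 64 bytes starting at offset 512, and any
+ * live entries already in that range are compacted toward its start.
+ */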
+int
+ufs_direnter(struct vnode *dvp, const struct ufs_lookup_results *ulr,
+ struct vnode *tvp, struct direct *dirp,
+ struct componentname *cnp, struct buf *newdirbp)
+{
+ kauth_cred_t cr;
+ struct lwp *l;
+ int newentrysize;
+ struct inode *dp;
+ struct buf *bp;
+ u_int dsize;
+ struct direct *ep, *nep;
+ int error, ret, blkoff, loc, spacefree;
+ char *dirbuf;
+ struct timespec ts;
+ struct ufsmount *ump = VFSTOUFS(dvp->v_mount);
+ const int needswap = UFS_MPNEEDSWAP(ump);
+ int dirblksiz = ump->um_dirblksiz;
+
+ UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount);
+
+ error = 0;
+ cr = cnp->cn_cred;
+ l = curlwp;
+
+ dp = VTOI(dvp);
+ newentrysize = DIRSIZ(0, dirp, 0);
+
+#if 0
+ struct ufs_lookup_results *ulr;
+ /* XXX should handle this material another way */
+ ulr = &dp->i_crap;
+ UFS_CHECK_CRAPCOUNTER(dp);
+#endif
+
+ if (ulr->ulr_count == 0) {
+ /*
+ * If ulr_count is 0, then namei could find no
+ * space in the directory. Here, ulr_offset will
+ * be on a directory block boundary and we will write the
+ * new entry into a fresh block.
+ */
+ if (ulr->ulr_offset & (dirblksiz - 1))
+ panic("ufs_direnter: newblk");
+ if ((error = UFS_BALLOC(dvp, (off_t)ulr->ulr_offset, dirblksiz,
+ cr, B_CLRBUF | B_SYNC, &bp)) != 0) {
+ return (error);
+ }
+ dp->i_size = ulr->ulr_offset + dirblksiz;
+ DIP_ASSIGN(dp, size, dp->i_size);
+ dp->i_flag |= IN_CHANGE | IN_UPDATE;
+ uvm_vnp_setsize(dvp, dp->i_size);
+ dirp->d_reclen = ufs_rw16(dirblksiz, needswap);
+ dirp->d_ino = ufs_rw32(dirp->d_ino, needswap);
+ if (FSFMT(dvp)) {
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+ if (needswap == 0) {
+#else
+ if (needswap != 0) {
+#endif
+ u_char tmp = dirp->d_namlen;
+ dirp->d_namlen = dirp->d_type;
+ dirp->d_type = tmp;
+ }
+ }
+ blkoff = ulr->ulr_offset & (ump->um_mountp->mnt_stat.f_iosize - 1);
+ memcpy((char *)bp->b_data + blkoff, dirp, newentrysize);
+#ifdef UFS_DIRHASH
+ if (dp->i_dirhash != NULL) {
+ ufsdirhash_newblk(dp, ulr->ulr_offset);
+ ufsdirhash_add(dp, dirp, ulr->ulr_offset);
+ ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff,
+ ulr->ulr_offset);
+ }
+#endif
+ error = VOP_BWRITE(bp->b_vp, bp);
+ vfs_timestamp(&ts);
+ ret = UFS_UPDATE(dvp, &ts, &ts, UPDATE_DIROP);
+ if (error == 0)
+ return (ret);
+ return (error);
+ }
+
+ /*
+ * If ulr_count is non-zero, then namei found space for the new
+ * entry in the range ulr_offset to ulr_offset + ulr_count
+ * in the directory. To use this space, we may have to compact
+ * the entries located there, by copying them together towards the
+ * beginning of the block, leaving the free space in one usable
+ * chunk at the end.
+ */
+
+ /*
+ * Increase size of directory if entry eats into new space.
+ * This should never push the size past a new multiple of
+ * DIRBLKSIZ.
+ *
+ * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
+ */
+ if (ulr->ulr_offset + ulr->ulr_count > dp->i_size) {
+#ifdef DIAGNOSTIC
+ printf("ufs_direnter: reached 4.2-only block, "
+ "not supposed to happen\n");
+#endif
+ dp->i_size = ulr->ulr_offset + ulr->ulr_count;
+ DIP_ASSIGN(dp, size, dp->i_size);
+ dp->i_flag |= IN_CHANGE | IN_UPDATE;
+ UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
+ }
+ /*
+ * Get the block containing the space for the new directory entry.
+ */
+ error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp, true);
+ if (error) {
+ return (error);
+ }
+ /*
+ * Find space for the new entry. In the simple case, the entry at
+ * ulr_offset will have the space. If it does not, then namei
+ * arranged that compacting the region ulr->ulr_offset to
+ * ulr->ulr_offset + ulr->ulr_count would yield the space.
+ */
+ ep = (struct direct *)dirbuf;
+ dsize = (ep->d_ino != 0) ? DIRSIZ(FSFMT(dvp), ep, needswap) : 0;
+ spacefree = ufs_rw16(ep->d_reclen, needswap) - dsize;
+ for (loc = ufs_rw16(ep->d_reclen, needswap); loc < ulr->ulr_count; ) {
+ uint16_t reclen;
+
+ nep = (struct direct *)(dirbuf + loc);
+
+ /* Trim the existing slot (NB: dsize may be zero). */
+ ep->d_reclen = ufs_rw16(dsize, needswap);
+ ep = (struct direct *)((char *)ep + dsize);
+
+ reclen = ufs_rw16(nep->d_reclen, needswap);
+ loc += reclen;
+ if (nep->d_ino == 0) {
+ /*
+ * A mid-block unused entry. Such entries are
+ * never created by the kernel, but fsck_ffs
+ * can create them (and it doesn't fix them).
+ *
+ * Add up the free space, and initialise the
+ * relocated entry since we don't memcpy it.
+ */
+ spacefree += reclen;
+ ep->d_ino = 0;
+ dsize = 0;
+ continue;
+ }
+ dsize = DIRSIZ(FSFMT(dvp), nep, needswap);
+ spacefree += reclen - dsize;
+#ifdef UFS_DIRHASH
+ if (dp->i_dirhash != NULL)
+ ufsdirhash_move(dp, nep,
+ ulr->ulr_offset + ((char *)nep - dirbuf),
+ ulr->ulr_offset + ((char *)ep - dirbuf));
+#endif
+ memcpy((void *)ep, (void *)nep, dsize);
+ }
+ /*
+ * Here, `ep' points to a directory entry containing `dsize' in-use
+ * bytes followed by `spacefree' unused bytes. If ep->d_ino == 0,
+ * then the entry is completely unused (dsize == 0). The value
+ * of ep->d_reclen is always indeterminate.
+ *
+ * Update the pointer fields in the previous entry (if any),
+ * copy in the new entry, and write out the block.
+ */
+ if (ep->d_ino == 0 ||
+ (ufs_rw32(ep->d_ino, needswap) == WINO &&
+ memcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) {
+ if (spacefree + dsize < newentrysize)
+ panic("ufs_direnter: compact1");
+ dirp->d_reclen = spacefree + dsize;
+ } else {
+ if (spacefree < newentrysize)
+ panic("ufs_direnter: compact2");
+ dirp->d_reclen = spacefree;
+ ep->d_reclen = ufs_rw16(dsize, needswap);
+ ep = (struct direct *)((char *)ep + dsize);
+ }
+ dirp->d_reclen = ufs_rw16(dirp->d_reclen, needswap);
+ dirp->d_ino = ufs_rw32(dirp->d_ino, needswap);
+ if (FSFMT(dvp)) {
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+ if (needswap == 0) {
+#else
+ if (needswap != 0) {
+#endif
+ u_char tmp = dirp->d_namlen;
+ dirp->d_namlen = dirp->d_type;
+ dirp->d_type = tmp;
+ }
+ }
+#ifdef UFS_DIRHASH
+ if (dp->i_dirhash != NULL && (ep->d_ino == 0 ||
+ dirp->d_reclen == spacefree))
+ ufsdirhash_add(dp, dirp, ulr->ulr_offset + ((char *)ep - dirbuf));
+#endif
+ memcpy((void *)ep, (void *)dirp, (u_int)newentrysize);
+#ifdef UFS_DIRHASH
+ if (dp->i_dirhash != NULL)
+ ufsdirhash_checkblock(dp, dirbuf -
+ (ulr->ulr_offset & (dirblksiz - 1)),
+ ulr->ulr_offset & ~(dirblksiz - 1));
+#endif
+ error = VOP_BWRITE(bp->b_vp, bp);
+ dp->i_flag |= IN_CHANGE | IN_UPDATE;
+ /*
+ * If all went well, and the directory can be shortened, proceed
+ * with the truncation. Note that we have to unlock the inode for
+ * the entry that we just entered, as the truncation may need to
+ * lock other inodes which can lead to deadlock if we also hold a
+ * lock on the newly entered node.
+ */
+ if (error == 0 && ulr->ulr_endoff && ulr->ulr_endoff < dp->i_size) {
+#ifdef UFS_DIRHASH
+ if (dp->i_dirhash != NULL)
+ ufsdirhash_dirtrunc(dp, ulr->ulr_endoff);
+#endif
+ (void) UFS_TRUNCATE(dvp, (off_t)ulr->ulr_endoff, IO_SYNC, cr);
+ }
+ UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
+ return (error);
+}
+
+/*
+ * Remove a directory entry after a call to namei, using the
+ * parameters that ufs_lookup left in nameidata and in the
+ * ufs_lookup_results.
+ *
+ * DVP is the directory to be updated. It must be locked.
+ * ULR is the ufs_lookup_results structure from the final lookup step.
+ * IP, if not null, is the inode being unlinked.
+ * FLAGS may contain DOWHITEOUT.
+ * ISRMDIR is not used and (XXX) should be removed.
+ *
+ * If FLAGS contains DOWHITEOUT the entry is replaced with a whiteout
+ * instead of being cleared.
+ *
+ * ulr->ulr_offset contains the position of the directory entry
+ * to be removed.
+ *
+ * ulr->ulr_reclen contains the size of the directory entry to be
+ * removed.
+ *
+ * ulr->ulr_count contains the size of the *previous* directory
+ * entry. This allows finding it, for free space management. If
+ * ulr_count is 0, the target entry is at the beginning of the
+ * directory. (Does this ever happen? The first entry should be ".",
+ * which should only be removed at rmdir time. Does rmdir come here
+ * to clear out the "." and ".." entries? Perhaps, but I doubt it.)
+ *
+ * The space is marked free by adding it to the record length (not
+ * name length) of the preceding entry. If the first entry becomes
+ * free, it is marked free by setting the inode number to 0.
+ *
+ * The link count of IP is decremented. Note that this is not the
+ * inverse behavior of ufs_direnter, which does not adjust link
+ * counts. Sigh.
+ */
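+/*
+ * For example (offsets purely illustrative): to remove an entry with
+ * d_reclen 24 at ulr_offset 552 whose predecessor starts at 524 (so
+ * ulr_count == 28), the block is read at offset 524 and the
+ * predecessor's d_reclen grows by ulr_reclen to absorb the freed
+ * space; the removed entry's bytes stay on disk but become
+ * unreachable.
+ */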
+int
+ufs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr,
+ struct inode *ip, int flags, int isrmdir)
+{
+ struct inode *dp = VTOI(dvp);
+ struct direct *ep;
+ struct buf *bp;
+ int error;
+#ifdef FFS_EI
+ const int needswap = UFS_MPNEEDSWAP(dp->i_ump);
+#endif
+
+ UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount);
+
+ if (flags & DOWHITEOUT) {
+ /*
+ * Whiteout entry: set d_ino to WINO.
+ */
+ error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, (void *)&ep,
+ &bp, true);
+ if (error)
+ return (error);
+ ep->d_ino = ufs_rw32(WINO, needswap);
+ ep->d_type = DT_WHT;
+ goto out;
+ }
+
+ if ((error = ufs_blkatoff(dvp,
+ (off_t)(ulr->ulr_offset - ulr->ulr_count), (void *)&ep, &bp, true)) != 0)
+ return (error);
+
+#ifdef UFS_DIRHASH
+ /*
+ * Remove the dirhash entry. This is complicated by the fact
+ * that `ep' is the previous entry when ulr->ulr_count != 0.
+ */
+ if (dp->i_dirhash != NULL)
+ ufsdirhash_remove(dp, (ulr->ulr_count == 0) ? ep :
+ (struct direct *)((char *)ep +
+ ufs_rw16(ep->d_reclen, needswap)), ulr->ulr_offset);
+#endif
+
+ if (ulr->ulr_count == 0) {
+ /*
+ * First entry in block: set d_ino to zero.
+ */
+ ep->d_ino = 0;
+ } else {
+ /*
+ * Collapse new free space into previous entry.
+ */
+ ep->d_reclen =
+ ufs_rw16(ufs_rw16(ep->d_reclen, needswap) + ulr->ulr_reclen,
+ needswap);
+ }
+
+#ifdef UFS_DIRHASH
+ if (dp->i_dirhash != NULL) {
+ int dirblksiz = ip->i_ump->um_dirblksiz;
+ ufsdirhash_checkblock(dp, (char *)ep -
+ ((ulr->ulr_offset - ulr->ulr_count) & (dirblksiz - 1)),
+ ulr->ulr_offset & ~(dirblksiz - 1));
+ }
+#endif
+
+out:
+ if (ip) {
+ ip->i_nlink--;
+ DIP_ASSIGN(ip, nlink, ip->i_nlink);
+ ip->i_flag |= IN_CHANGE;
+ UFS_WAPBL_UPDATE(ITOV(ip), NULL, NULL, 0);
+ }
+ error = VOP_BWRITE(bp->b_vp, bp);
+ dp->i_flag |= IN_CHANGE | IN_UPDATE;
+ /*
+ * If the last named reference to a snapshot goes away,
+ * drop its snapshot reference so that it will be reclaimed
+ * when last open reference goes away.
+ */
+ if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 &&
+ ip->i_nlink == 0)
+ ffs_snapgone(ip);
+ UFS_WAPBL_UPDATE(dvp, NULL, NULL, 0);
+ return (error);
+}
+
+/*
+ * Rewrite an existing directory entry to point at the inode supplied.
+ *
+ * DP is the directory to update.
+ * OFFSET is the position of the entry in question. It may come
+ * from ulr_offset of a ufs_lookup_results.
+ * OIP is the old inode the directory previously pointed to.
+ * NEWINUM is the number of the new inode.
+ * NEWTYPE is the new value for the type field of the directory entry.
+ * (This is ignored if the fs doesn't support that.)
+ * ISRMDIR is not used and (XXX) should be removed.
+ * IFLAGS are added to DP's inode flags.
+ *
+ * The link count of OIP is decremented. Note that the link count of
+ * the new inode is *not* incremented. Yay for symmetry.
+ */
+int
+ufs_dirrewrite(struct inode *dp, off_t offset,
+ struct inode *oip, ino_t newinum, int newtype,
+ int isrmdir, int iflags)
+{
+ struct buf *bp;
+ struct direct *ep;
+ struct vnode *vdp = ITOV(dp);
+ int error;
+
+ error = ufs_blkatoff(vdp, offset, (void *)&ep, &bp, true);
+ if (error)
+ return (error);
+ ep->d_ino = ufs_rw32(newinum, UFS_MPNEEDSWAP(dp->i_ump));
+ if (!FSFMT(vdp))
+ ep->d_type = newtype;
+ oip->i_nlink--;
+ DIP_ASSIGN(oip, nlink, oip->i_nlink);
+ oip->i_flag |= IN_CHANGE;
+ UFS_WAPBL_UPDATE(ITOV(oip), NULL, NULL, UPDATE_DIROP);
+ error = VOP_BWRITE(bp->b_vp, bp);
+ dp->i_flag |= iflags;
+ /*
+ * If the last named reference to a snapshot goes away,
+ * drop its snapshot reference so that it will be reclaimed
+ * when last open reference goes away.
+ */
+ if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_nlink == 0)
+ ffs_snapgone(oip);
+ UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
+ return (error);
+}
+
+/*
+ * Check if a directory is empty or not.
+ * Inode supplied must be locked.
+ *
+ * Using a struct dirtemplate here is not precisely
+ * what we want, but better than using a struct direct.
+ *
+ * NB: does not handle corrupted directories.
+ */
+int
+ufs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred)
+{
+ doff_t off;
+ struct dirtemplate dbuf;
+ struct direct *dp = (struct direct *)&dbuf;
+ int error, namlen;
+ size_t count;
+ const int needswap = UFS_IPNEEDSWAP(ip);
+#define MINDIRSIZ (sizeof (struct dirtemplate) / 2)
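+ /*
+ * Half a struct dirtemplate is the template for a single "." or ".."
+ * entry, so reading MINDIRSIZ bytes at each record offset is enough
+ * to see d_ino, d_reclen, d_type/d_namlen and the first couple of
+ * name characters, which is all this scan examines.
+ */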
+
+ for (off = 0; off < ip->i_size;
+ off += ufs_rw16(dp->d_reclen, needswap)) {
+ error = vn_rdwr(UIO_READ, ITOV(ip), (void *)dp, MINDIRSIZ, off,
+ UIO_SYSSPACE, IO_NODELOCKED, cred, &count, NULL);
+ /*
+ * Since we read MINDIRSIZ, residual must
+ * be 0 unless we're at end of file.
+ */
+ if (error || count != 0)
+ return (0);
+ /* avoid infinite loops */
+ if (dp->d_reclen == 0)
+ return (0);
+ /* skip empty entries */
+ if (dp->d_ino == 0 || ufs_rw32(dp->d_ino, needswap) == WINO)
+ continue;
+ /* accept only "." and ".." */
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+ if (FSFMT(ITOV(ip)) && needswap == 0)
+ namlen = dp->d_type;
+ else
+ namlen = dp->d_namlen;
+#else
+ if (FSFMT(ITOV(ip)) && needswap != 0)
+ namlen = dp->d_type;
+ else
+ namlen = dp->d_namlen;
+#endif
+ if (namlen > 2)
+ return (0);
+ if (dp->d_name[0] != '.')
+ return (0);
+ /*
+ * At this point namlen must be 1 or 2.
+ * 1 implies ".", 2 implies ".." if second
+ * char is also "."
+ */
+ if (namlen == 1 &&
+ ufs_rw32(dp->d_ino, needswap) == ip->i_number)
+ continue;
+ if (dp->d_name[1] == '.' &&
+ ufs_rw32(dp->d_ino, needswap) == parentino)
+ continue;
+ return (0);
+ }
+ return (1);
+}
+
+/*
+ * Check if source directory is in the path of the target directory.
+ * Target is supplied locked, source is unlocked.
+ * The target is always vput before returning.
+ */
+int
+ufs_checkpath(struct inode *source, struct inode *target, kauth_cred_t cred)
+{
+ struct vnode *nextvp, *vp;
+ int error, rootino, namlen;
+ struct dirtemplate dirbuf;
+ const int needswap = UFS_MPNEEDSWAP(target->i_ump);
+
+ vp = ITOV(target);
+ if (target->i_number == source->i_number) {
+ error = EEXIST;
+ goto out;
+ }
+ rootino = ROOTINO;
+ error = 0;
+ if (target->i_number == rootino)
+ goto out;
+
+ for (;;) {
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ break;
+ }
+ error = vn_rdwr(UIO_READ, vp, (void *)&dirbuf,
+ sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE,
+ IO_NODELOCKED, cred, NULL, NULL);
+ if (error != 0)
+ break;
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+ if (FSFMT(vp) && needswap == 0)
+ namlen = dirbuf.dotdot_type;
+ else
+ namlen = dirbuf.dotdot_namlen;
+#else
+ if (FSFMT(vp) && needswap != 0)
+ namlen = dirbuf.dotdot_type;
+ else
+ namlen = dirbuf.dotdot_namlen;
+#endif
+ if (namlen != 2 ||
+ dirbuf.dotdot_name[0] != '.' ||
+ dirbuf.dotdot_name[1] != '.') {
+ error = ENOTDIR;
+ break;
+ }
+ if (ufs_rw32(dirbuf.dotdot_ino, needswap) == source->i_number) {
+ error = EINVAL;
+ break;
+ }
+ if (ufs_rw32(dirbuf.dotdot_ino, needswap) == rootino)
+ break;
+ VOP_UNLOCK(vp);
+ error = VFS_VGET(vp->v_mount,
+ ufs_rw32(dirbuf.dotdot_ino, needswap), &nextvp);
+ vrele(vp);
+ if (error) {
+ vp = NULL;
+ break;
+ }
+ vp = nextvp;
+ }
+
+out:
+ if (error == ENOTDIR)
+ printf("checkpath: .. not a directory\n");
+ if (vp != NULL)
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Extract the inode number of ".." from a directory.
+ * Helper for ufs_parentcheck.
+ */
+static int
+ufs_readdotdot(struct vnode *vp, int needswap, kauth_cred_t cred, ino_t *result)
+{
+ struct dirtemplate dirbuf;
+ int namlen, error;
+
+ error = vn_rdwr(UIO_READ, vp, &dirbuf,
+ sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE,
+ IO_NODELOCKED, cred, NULL, NULL);
+ if (error) {
+ return error;
+ }
+
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+ if (FSFMT(vp) && needswap == 0)
+ namlen = dirbuf.dotdot_type;
+ else
+ namlen = dirbuf.dotdot_namlen;
+#else
+ if (FSFMT(vp) && needswap != 0)
+ namlen = dirbuf.dotdot_type;
+ else
+ namlen = dirbuf.dotdot_namlen;
+#endif
+ if (namlen != 2 ||
+ dirbuf.dotdot_name[0] != '.' ||
+ dirbuf.dotdot_name[1] != '.') {
+ printf("ufs_readdotdot: directory %llu contains "
+ "garbage instead of ..\n",
+ (unsigned long long) VTOI(vp)->i_number);
+ return ENOTDIR;
+ }
+ *result = ufs_rw32(dirbuf.dotdot_ino, needswap);
+ return 0;
+}
+
+/*
+ * Check if LOWER is a descendant of UPPER. If we find UPPER, return
+ * nonzero in FOUND and return a reference to the immediate descendant
+ * of UPPER in UPPERCHILD. If we don't find UPPER (that is, if we
+ * reach the volume root and that isn't UPPER), return zero in FOUND
+ * and null in UPPERCHILD.
+ *
+ * Neither UPPER nor LOWER should be locked.
+ *
+ * On error (such as a permissions error checking up the directory
+ * tree) fail entirely.
+ *
+ * Note that UPPER and LOWER must be on the same volume, and because
+ * we inspect only that volume NEEDSWAP can be constant.
+ */
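+/*
+ * In outline: starting from LOWER, read ".." and VFS_VGET the parent,
+ * repeating until the parent is UPPER (found; the vnode held at that
+ * point is UPPER's child on this path) or the root (not found).  For
+ * example, checking UPPER=/a against LOWER=/a/b/c walks from c to b,
+ * sees that b's ".." is /a, and returns b as UPPERCHILD.
+ */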
+int
+ufs_parentcheck(struct vnode *upper, struct vnode *lower, kauth_cred_t cred,
+ int *found_ret, struct vnode **upperchild_ret)
+{
+ const int needswap = UFS_MPNEEDSWAP(VTOI(lower)->i_ump);
+ ino_t upper_ino, found_ino;
+ struct vnode *current, *next;
+ int error;
+
+ if (upper == lower) {
+ vref(upper);
+ *found_ret = 1;
+ *upperchild_ret = upper;
+ return 0;
+ }
+ if (VTOI(lower)->i_number == ROOTINO) {
+ *found_ret = 0;
+ *upperchild_ret = NULL;
+ return 0;
+ }
+
+ upper_ino = VTOI(upper)->i_number;
+
+ current = lower;
+ vref(current);
+ vn_lock(current, LK_EXCLUSIVE | LK_RETRY);
+
+ for (;;) {
+ error = ufs_readdotdot(current, needswap, cred, &found_ino);
+ if (error) {
+ vput(current);
+ return error;
+ }
+ if (found_ino == upper_ino) {
+ VOP_UNLOCK(current);
+ *found_ret = 1;
+ *upperchild_ret = current;
+ return 0;
+ }
+ if (found_ino == ROOTINO) {
+ vput(current);
+ *found_ret = 0;
+ *upperchild_ret = NULL;
+ return 0;
+ }
+ VOP_UNLOCK(current);
+ error = VFS_VGET(current->v_mount, found_ino, &next);
+ if (error) {
+ vrele(current);
+ return error;
+ }
+ KASSERT(VOP_ISLOCKED(next));
+ if (next->v_type != VDIR) {
+ printf("ufs_parentcheck: inode %llu reached via .. of "
+ "inode %llu is not a directory\n",
+ (unsigned long long)VTOI(next)->i_number,
+ (unsigned long long)VTOI(current)->i_number);
+ vput(next);
+ vrele(current);
+ return ENOTDIR;
+ }
+ vrele(current);
+ current = next;
+ }
+
+ return 0;
+}
+
+#define UFS_DIRRABLKS 0
+int ufs_dirrablks = UFS_DIRRABLKS;
+
+/*
+ * ufs_blkatoff: Return buffer with the contents of block "offset" from
+ * the beginning of directory "vp". If "res" is non-NULL, fill it in with
+ * a pointer to the remaining space in the directory. If the caller intends
+ * to modify the buffer returned, "modify" must be true.
+ */
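+/*
+ * ufs_dirrablks controls read-ahead: with the default of 0 only the
+ * block containing "offset" is read, while a larger value makes
+ * breadn() also start reads of up to that many following directory
+ * blocks, which may help sequential scans of large directories at the
+ * cost of extra I/O.
+ */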
+
+int
+ufs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp,
+ bool modify)
+{
+ struct inode *ip;
+ struct buf *bp;
+ daddr_t lbn;
+ const int dirrablks = ufs_dirrablks;
+ daddr_t *blks;
+ int *blksizes;
+ int run, error;
+ struct mount *mp = vp->v_mount;
+ const int bshift = mp->mnt_fs_bshift;
+ const int bsize = 1 << bshift;
+ off_t eof;
+
+ blks = kmem_alloc((1 + dirrablks) * sizeof(daddr_t), KM_SLEEP);
+ blksizes = kmem_alloc((1 + dirrablks) * sizeof(int), KM_SLEEP);
+ ip = VTOI(vp);
+ KASSERT(vp->v_size == ip->i_size);
+ GOP_SIZE(vp, vp->v_size, &eof, 0);
+ lbn = offset >> bshift;
+
+ for (run = 0; run <= dirrablks;) {
+ const off_t curoff = lbn << bshift;
+ const int size = MIN(eof - curoff, bsize);
+
+ if (size == 0) {
+ break;
+ }
+ KASSERT(curoff < eof);
+ blks[run] = lbn;
+ blksizes[run] = size;
+ lbn++;
+ run++;
+ if (size != bsize) {
+ break;
+ }
+ }
+ KASSERT(run >= 1);
+ error = breadn(vp, blks[0], blksizes[0], &blks[1], &blksizes[1],
+ run - 1, NOCRED, (modify ? B_MODIFY : 0), &bp);
+ if (error != 0) {
+ brelse(bp, 0);
+ *bpp = NULL;
+ goto out;
+ }
+ if (res) {
+ *res = (char *)bp->b_data + (offset & (bsize - 1));
+ }
+ *bpp = bp;
+
+ out:
+ kmem_free(blks, (1 + dirrablks) * sizeof(daddr_t));
+ kmem_free(blksizes, (1 + dirrablks) * sizeof(int));
+ return error;
+}
--- /dev/null
+/* $NetBSD: ufs_quota.c,v 1.70 2011/03/24 17:05:46 bouyer Exp $ */
+
+/*
+ * Copyright (c) 1982, 1986, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Robert Elz at The University of Melbourne.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_quota.c,v 1.70 2011/03/24 17:05:46 bouyer Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#endif
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_quota.h>
+#include <quota/quotaprop.h>
+
+kmutex_t dqlock;
+kcondvar_t dqcv;
+
+/*
+ * Code pertaining to management of the in-core dquot data structures.
+ */
+#define DQHASH(dqvp, id) \
+ (((((long)(dqvp)) >> 8) + id) & dqhash)
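+/*
+ * The hash key mixes the quota file's vnode pointer (shifted right to
+ * drop low bits that are equal for all vnodes due to allocation
+ * alignment) with the user or group id, masked to the table size, so
+ * dquots for different quota files with the same id generally land in
+ * different chains.
+ */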
+static LIST_HEAD(dqhashhead, dquot) *dqhashtbl;
+static u_long dqhash;
+static pool_cache_t dquot_cache;
+
+
+static int quota_handle_cmd_get_version(struct mount *, struct lwp *,
+ prop_dictionary_t, prop_array_t);
+static int quota_handle_cmd_get(struct mount *, struct lwp *,
+ prop_dictionary_t, int, prop_array_t);
+static int quota_handle_cmd_set(struct mount *, struct lwp *,
+ prop_dictionary_t, int, prop_array_t);
+static int quota_handle_cmd_getall(struct mount *, struct lwp *,
+ prop_dictionary_t, int, prop_array_t);
+static int quota_handle_cmd_clear(struct mount *, struct lwp *,
+ prop_dictionary_t, int, prop_array_t);
+static int quota_handle_cmd_quotaon(struct mount *, struct lwp *,
+ prop_dictionary_t, int, prop_array_t);
+static int quota_handle_cmd_quotaoff(struct mount *, struct lwp *,
+ prop_dictionary_t, int, prop_array_t);
+/*
+ * Initialize the quota fields of an inode.
+ */
+void
+ufsquota_init(struct inode *ip)
+{
+ int i;
+
+ for (i = 0; i < MAXQUOTAS; i++)
+ ip->i_dquot[i] = NODQUOT;
+}
+
+/*
+ * Release the quota fields from an inode.
+ */
+void
+ufsquota_free(struct inode *ip)
+{
+ int i;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ dqrele(ITOV(ip), ip->i_dquot[i]);
+ ip->i_dquot[i] = NODQUOT;
+ }
+}
+
+/*
+ * Update disk usage, and take corrective action.
+ */
+int
+chkdq(struct inode *ip, int64_t change, kauth_cred_t cred, int flags)
+{
+ /* do not track snapshot usage, or we will deadlock */
+ if ((ip->i_flags & SF_SNAPSHOT) != 0)
+ return 0;
+
+#ifdef QUOTA
+ if (ip->i_ump->um_flags & UFS_QUOTA)
+ return chkdq1(ip, change, cred, flags);
+#endif
+#ifdef QUOTA2
+ if (ip->i_ump->um_flags & UFS_QUOTA2)
+ return chkdq2(ip, change, cred, flags);
+#endif
+ return 0;
+}
+
+/*
+ * Check the inode limit, applying corrective action.
+ */
+int
+chkiq(struct inode *ip, int32_t change, kauth_cred_t cred, int flags)
+{
+ /* do not track snapshot usage, or we will deadlock */
+ if ((ip->i_flags & SF_SNAPSHOT) != 0)
+ return 0;
+#ifdef QUOTA
+ if (ip->i_ump->um_flags & UFS_QUOTA)
+ return chkiq1(ip, change, cred, flags);
+#endif
+#ifdef QUOTA2
+ if (ip->i_ump->um_flags & UFS_QUOTA2)
+ return chkiq2(ip, change, cred, flags);
+#endif
+ return 0;
+}
+
+int
+quota_handle_cmd(struct mount *mp, struct lwp *l, prop_dictionary_t cmddict)
+{
+ int error = 0;
+ const char *cmd, *type;
+ prop_array_t datas;
+ int q2type;
+
+ if (!prop_dictionary_get_cstring_nocopy(cmddict, "command", &cmd))
+ return EINVAL;
+ if (!prop_dictionary_get_cstring_nocopy(cmddict, "type", &type))
+ return EINVAL;
+ if (!strcmp(type, QUOTADICT_CLASS_USER)) {
+ q2type = USRQUOTA;
+ } else if (!strcmp(type, QUOTADICT_CLASS_GROUP)) {
+ q2type = GRPQUOTA;
+ } else
+ return EOPNOTSUPP;
+ datas = prop_dictionary_get(cmddict, "data");
+ if (datas == NULL || prop_object_type(datas) != PROP_TYPE_ARRAY)
+ return EINVAL;
+
+ prop_object_retain(datas);
+ prop_dictionary_remove(cmddict, "data"); /* prepare for return */
+
+ if (strcmp(cmd, "get version") == 0) {
+ error = quota_handle_cmd_get_version(mp, l, cmddict, datas);
+ goto end;
+ }
+ if (strcmp(cmd, "quotaon") == 0) {
+ error = quota_handle_cmd_quotaon(mp, l, cmddict,
+ q2type, datas);
+ goto end;
+ }
+ if (strcmp(cmd, "quotaoff") == 0) {
+ error = quota_handle_cmd_quotaoff(mp, l, cmddict,
+ q2type, datas);
+ goto end;
+ }
+ if (strcmp(cmd, "get") == 0) {
+ error = quota_handle_cmd_get(mp, l, cmddict, q2type, datas);
+ goto end;
+ }
+ if (strcmp(cmd, "set") == 0) {
+ error = quota_handle_cmd_set(mp, l, cmddict, q2type, datas);
+ goto end;
+ }
+ if (strcmp(cmd, "getall") == 0) {
+ error = quota_handle_cmd_getall(mp, l, cmddict, q2type, datas);
+ goto end;
+ }
+ if (strcmp(cmd, "clear") == 0) {
+ error = quota_handle_cmd_clear(mp, l, cmddict, q2type, datas);
+ goto end;
+ }
+ error = EOPNOTSUPP;
+end:
+ error = (prop_dictionary_set_int8(cmddict, "return",
+ error) ? 0 : ENOMEM);
+ prop_object_release(datas);
+ return error;
+}
+
+static int
+quota_handle_cmd_get_version(struct mount *mp, struct lwp *l,
+ prop_dictionary_t cmddict, prop_array_t datas)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ prop_array_t replies;
+ prop_dictionary_t data;
+ int error = 0;
+
+ if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0)
+ return EOPNOTSUPP;
+
+ replies = prop_array_create();
+ if (replies == NULL)
+ return ENOMEM;
+
+ data = prop_dictionary_create();
+ if (data == NULL) {
+ prop_object_release(replies);
+ return ENOMEM;
+ }
+
+#ifdef QUOTA
+ if (ump->um_flags & UFS_QUOTA) {
+ if (!prop_dictionary_set_int8(data, "version", 1))
+ error = ENOMEM;
+ } else
+#endif
+#ifdef QUOTA2
+ if (ump->um_flags & UFS_QUOTA2) {
+ if (!prop_dictionary_set_int8(data, "version", 2))
+ error = ENOMEM;
+ } else
+#endif
+ error = 0;
+ if (error)
+ prop_object_release(data);
+ else if (!prop_array_add_and_rel(replies, data))
+ error = ENOMEM;
+ if (error)
+ prop_object_release(replies);
+ else if (!prop_dictionary_set_and_rel(cmddict, "data", replies))
+ error = ENOMEM;
+ return error;
+}
+
+/* XXX shouldn't all this be in kauth ? */
+static int
+quota_get_auth(struct mount *mp, struct lwp *l, uid_t id)
+{
+ /* The user can always query about his own quota. */
+ if (id == kauth_cred_getuid(l->l_cred))
+ return 0;
+ return kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+ KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, KAUTH_ARG(id), NULL);
+}
+
+static int
+quota_handle_cmd_get(struct mount *mp, struct lwp *l,
+ prop_dictionary_t cmddict, int type, prop_array_t datas)
+{
+ prop_array_t replies;
+ prop_object_iterator_t iter;
+ prop_dictionary_t data;
+ uint32_t id;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ int error, defaultq = 0;
+ const char *idstr;
+
+ if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0)
+ return EOPNOTSUPP;
+
+ replies = prop_array_create();
+ if (replies == NULL)
+ return ENOMEM;
+
+ iter = prop_array_iterator(datas);
+ if (iter == NULL) {
+ prop_object_release(replies);
+ return ENOMEM;
+ }
+ while ((data = prop_object_iterator_next(iter)) != NULL) {
+ if (!prop_dictionary_get_uint32(data, "id", &id)) {
+ if (!prop_dictionary_get_cstring_nocopy(data, "id",
+ &idstr))
+ continue;
+ if (strcmp(idstr, "default")) {
+ error = EINVAL;
+ goto err;
+ }
+ id = 0;
+ defaultq = 1;
+ } else {
+ defaultq = 0;
+ }
+ error = quota_get_auth(mp, l, id);
+ if (error == EPERM)
+ continue;
+ if (error != 0)
+ goto err;
+#ifdef QUOTA
+ if (ump->um_flags & UFS_QUOTA)
+ error = quota1_handle_cmd_get(ump, type, id, defaultq,
+ replies);
+ else
+#endif
+#ifdef QUOTA2
+ if (ump->um_flags & UFS_QUOTA2) {
+ error = quota2_handle_cmd_get(ump, type, id, defaultq,
+ replies);
+ } else
+#endif
+ panic("quota_handle_cmd_get: no support ?");
+
+ if (error == ENOENT)
+ continue;
+ if (error != 0)
+ goto err;
+ }
+ prop_object_iterator_release(iter);
+ if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) {
+ error = ENOMEM;
+ } else {
+ error = 0;
+ }
+ return error;
+err:
+ prop_object_iterator_release(iter);
+ prop_object_release(replies);
+ return error;
+}
+
+static int
+quota_handle_cmd_set(struct mount *mp, struct lwp *l,
+ prop_dictionary_t cmddict, int type, prop_array_t datas)
+{
+ prop_array_t replies;
+ prop_object_iterator_t iter;
+ prop_dictionary_t data;
+ uint32_t id;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ int error, defaultq = 0;
+ const char *idstr;
+
+ if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0)
+ return EOPNOTSUPP;
+
+ replies = prop_array_create();
+ if (replies == NULL)
+ return ENOMEM;
+
+ iter = prop_array_iterator(datas);
+ if (iter == NULL) {
+ prop_object_release(replies);
+ return ENOMEM;
+ }
+ while ((data = prop_object_iterator_next(iter)) != NULL) {
+ if (!prop_dictionary_get_uint32(data, "id", &id)) {
+ if (!prop_dictionary_get_cstring_nocopy(data, "id",
+ &idstr))
+ continue;
+ if (strcmp(idstr, "default"))
+ continue;
+ id = 0;
+ defaultq = 1;
+ } else {
+ defaultq = 0;
+ }
+ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+ KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(id), NULL);
+ if (error != 0)
+ goto err;
+#ifdef QUOTA
+ if (ump->um_flags & UFS_QUOTA)
+ error = quota1_handle_cmd_set(ump, type, id, defaultq,
+ data);
+ else
+#endif
+#ifdef QUOTA2
+ if (ump->um_flags & UFS_QUOTA2) {
+ error = quota2_handle_cmd_set(ump, type, id, defaultq,
+ data);
+ } else
+#endif
+ panic("quota_handle_cmd_get: no support ?");
+
+ if (error && error != ENOENT)
+ goto err;
+ }
+ prop_object_iterator_release(iter);
+ if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) {
+ error = ENOMEM;
+ } else {
+ error = 0;
+ }
+ return error;
+err:
+ prop_object_iterator_release(iter);
+ prop_object_release(replies);
+ return error;
+}
+
+static int
+quota_handle_cmd_clear(struct mount *mp, struct lwp *l,
+ prop_dictionary_t cmddict, int type, prop_array_t datas)
+{
+ prop_array_t replies;
+ prop_object_iterator_t iter;
+ prop_dictionary_t data;
+ uint32_t id;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ int error, defaultq = 0;
+ const char *idstr;
+
+ if ((ump->um_flags & UFS_QUOTA2) == 0)
+ return EOPNOTSUPP;
+
+ replies = prop_array_create();
+ if (replies == NULL)
+ return ENOMEM;
+
+ iter = prop_array_iterator(datas);
+ if (iter == NULL) {
+ prop_object_release(replies);
+ return ENOMEM;
+ }
+ while ((data = prop_object_iterator_next(iter)) != NULL) {
+ if (!prop_dictionary_get_uint32(data, "id", &id)) {
+ if (!prop_dictionary_get_cstring_nocopy(data, "id",
+ &idstr))
+ continue;
+ if (strcmp(idstr, "default"))
+ continue;
+ id = 0;
+ defaultq = 1;
+ } else {
+ defaultq = 0;
+ }
+ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+ KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(id), NULL);
+ if (error != 0)
+ goto err;
+#ifdef QUOTA2
+ if (ump->um_flags & UFS_QUOTA2) {
+ error = quota2_handle_cmd_clear(ump, type, id, defaultq,
+ data);
+ } else
+#endif
+ panic("quota_handle_cmd_get: no support ?");
+
+ if (error && error != ENOENT)
+ goto err;
+ }
+ prop_object_iterator_release(iter);
+ if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) {
+ error = ENOMEM;
+ } else {
+ error = 0;
+ }
+ return error;
+err:
+ prop_object_iterator_release(iter);
+ prop_object_release(replies);
+ return error;
+}
+
+static int
+quota_handle_cmd_getall(struct mount *mp, struct lwp *l,
+ prop_dictionary_t cmddict, int type, prop_array_t datas)
+{
+ prop_array_t replies;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ int error;
+
+ if ((ump->um_flags & UFS_QUOTA2) == 0)
+ return EOPNOTSUPP;
+
+ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+ KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, NULL, NULL);
+ if (error)
+ return error;
+
+ replies = prop_array_create();
+ if (replies == NULL)
+ return ENOMEM;
+
+#ifdef QUOTA2
+ if (ump->um_flags & UFS_QUOTA2) {
+ error = quota2_handle_cmd_getall(ump, type, replies);
+ } else
+#endif
+ panic("quota_handle_cmd_getall: no support ?");
+ if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) {
+ error = ENOMEM;
+ } else {
+ error = 0;
+ }
+ return error;
+}
+
+static int
+quota_handle_cmd_quotaon(struct mount *mp, struct lwp *l,
+ prop_dictionary_t cmddict, int type, prop_array_t datas)
+{
+ prop_dictionary_t data;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ int error;
+ const char *qfile;
+
+ if ((ump->um_flags & UFS_QUOTA2) != 0)
+ return EBUSY;
+
+ if (prop_array_count(datas) != 1)
+ return EINVAL;
+
+ data = prop_array_get(datas, 0);
+ if (data == NULL)
+ return ENOMEM;
+ if (!prop_dictionary_get_cstring_nocopy(data, "quotafile",
+ &qfile))
+ return EINVAL;
+
+ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+ KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL);
+ if (error != 0) {
+ return error;
+ }
+#ifdef QUOTA
+ error = quota1_handle_cmd_quotaon(l, ump, type, qfile);
+#else
+ error = EOPNOTSUPP;
+#endif
+
+ return error;
+}
+
+static int
+quota_handle_cmd_quotaoff(struct mount *mp, struct lwp *l,
+ prop_dictionary_t cmddict, int type, prop_array_t datas)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ int error;
+
+ if ((ump->um_flags & UFS_QUOTA2) != 0)
+ return EOPNOTSUPP;
+
+ if (prop_array_count(datas) != 0)
+ return EINVAL;
+
+ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+ KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL);
+ if (error != 0) {
+ return error;
+ }
+#ifdef QUOTA
+ error = quota1_handle_cmd_quotaoff(l, ump, type);
+#else
+ error = EOPNOTSUPP;
+#endif
+
+ return error;
+}
+
+/*
+ * Initialize the quota system.
+ */
+void
+dqinit(void)
+{
+
+ mutex_init(&dqlock, MUTEX_DEFAULT, IPL_NONE);
+ cv_init(&dqcv, "quota");
+ dqhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &dqhash);
+ dquot_cache = pool_cache_init(sizeof(struct dquot), 0, 0, 0, "ufsdq",
+ NULL, IPL_NONE, NULL, NULL, NULL);
+}
+
+void
+dqreinit(void)
+{
+ struct dquot *dq;
+ struct dqhashhead *oldhash, *hash;
+ struct vnode *dqvp;
+ u_long oldmask, mask, hashval;
+ int i;
+
+ hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
+ mutex_enter(&dqlock);
+ oldhash = dqhashtbl;
+ oldmask = dqhash;
+ dqhashtbl = hash;
+ dqhash = mask;
+ for (i = 0; i <= oldmask; i++) {
+ while ((dq = LIST_FIRST(&oldhash[i])) != NULL) {
+ dqvp = dq->dq_ump->um_quotas[dq->dq_type];
+ LIST_REMOVE(dq, dq_hash);
+ hashval = DQHASH(dqvp, dq->dq_id);
+ LIST_INSERT_HEAD(&dqhashtbl[hashval], dq, dq_hash);
+ }
+ }
+ mutex_exit(&dqlock);
+ hashdone(oldhash, HASH_LIST, oldmask);
+}
+
+/*
+ * Free resources held by quota system.
+ */
+void
+dqdone(void)
+{
+
+ pool_cache_destroy(dquot_cache);
+ hashdone(dqhashtbl, HASH_LIST, dqhash);
+ cv_destroy(&dqcv);
+ mutex_destroy(&dqlock);
+}
+
+/*
+ * Set up the quotas for an inode.
+ *
+ * This routine completely defines the semantics of quotas.
+ * If other criteria are to be used to establish quotas, the
+ * MAXQUOTAS value in quotas.h should be increased, and the
+ * additional dquots set up here.
+ */
+int
+getinoquota(struct inode *ip)
+{
+ struct ufsmount *ump = ip->i_ump;
+ struct vnode *vp = ITOV(ip);
+ int i, error;
+ u_int32_t ino_ids[MAXQUOTAS];
+
+ /*
+ * To avoid deadlocks, never update quotas for quota files
+ * on the same file system.
+ */
+ for (i = 0; i < MAXQUOTAS; i++)
+ if (vp == ump->um_quotas[i])
+ return 0;
+
+ ino_ids[USRQUOTA] = ip->i_uid;
+ ino_ids[GRPQUOTA] = ip->i_gid;
+ for (i = 0; i < MAXQUOTAS; i++) {
+ /*
+ * If the file id changed the quota needs update.
+ */
+ if (ip->i_dquot[i] != NODQUOT &&
+ ip->i_dquot[i]->dq_id != ino_ids[i]) {
+ dqrele(ITOV(ip), ip->i_dquot[i]);
+ ip->i_dquot[i] = NODQUOT;
+ }
+ /*
+ * Set up the quota based on file id.
+ * ENODEV means that quotas are not enabled.
+ */
+ if (ip->i_dquot[i] == NODQUOT &&
+ (error = dqget(vp, ino_ids[i], ump, i, &ip->i_dquot[i])) &&
+ error != ENODEV)
+ return (error);
+ }
+ return 0;
+}
+
+/*
+ * Obtain a dquot structure for the specified identifier and quota file
+ * reading the information from the file if necessary.
+ */
+int
+dqget(struct vnode *vp, u_long id, struct ufsmount *ump, int type,
+ struct dquot **dqp)
+{
+ struct dquot *dq, *ndq;
+ struct dqhashhead *dqh;
+ struct vnode *dqvp;
+ int error = 0; /* XXX gcc */
+
+ /* Lock to see an up to date value for QTF_CLOSING. */
+ mutex_enter(&dqlock);
+ if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) {
+ mutex_exit(&dqlock);
+ *dqp = NODQUOT;
+ return (ENODEV);
+ }
+ dqvp = ump->um_quotas[type];
+#ifdef QUOTA
+ if (ump->um_flags & UFS_QUOTA) {
+ if (dqvp == NULLVP || (ump->umq1_qflags[type] & QTF_CLOSING)) {
+ mutex_exit(&dqlock);
+ *dqp = NODQUOT;
+ return (ENODEV);
+ }
+ }
+#endif
+#ifdef QUOTA2
+ if (ump->um_flags & UFS_QUOTA2) {
+ if (dqvp == NULLVP) {
+ mutex_exit(&dqlock);
+ *dqp = NODQUOT;
+ return (ENODEV);
+ }
+ }
+#endif
+ KASSERT(dqvp != vp);
+ /*
+ * Check the cache first.
+ */
+ dqh = &dqhashtbl[DQHASH(dqvp, id)];
+ LIST_FOREACH(dq, dqh, dq_hash) {
+ if (dq->dq_id != id ||
+ dq->dq_ump->um_quotas[dq->dq_type] != dqvp)
+ continue;
+ KASSERT(dq->dq_cnt > 0);
+ dqref(dq);
+ mutex_exit(&dqlock);
+ *dqp = dq;
+ return (0);
+ }
+ /*
+ * Not in cache, allocate a new one.
+ */
+ mutex_exit(&dqlock);
+ ndq = pool_cache_get(dquot_cache, PR_WAITOK);
+ /*
+ * Initialize the contents of the dquot structure.
+ */
+ memset((char *)ndq, 0, sizeof *ndq);
+ ndq->dq_flags = 0;
+ ndq->dq_id = id;
+ ndq->dq_ump = ump;
+ ndq->dq_type = type;
+ mutex_init(&ndq->dq_interlock, MUTEX_DEFAULT, IPL_NONE);
+ mutex_enter(&dqlock);
+ dqh = &dqhashtbl[DQHASH(dqvp, id)];
+ LIST_FOREACH(dq, dqh, dq_hash) {
+ if (dq->dq_id != id ||
+ dq->dq_ump->um_quotas[dq->dq_type] != dqvp)
+ continue;
+ /*
+ * Another thread beat us allocating this dquot.
+ */
+ KASSERT(dq->dq_cnt > 0);
+ dqref(dq);
+ mutex_exit(&dqlock);
+ mutex_destroy(&ndq->dq_interlock);
+ pool_cache_put(dquot_cache, ndq);
+ *dqp = dq;
+ return 0;
+ }
+ dq = ndq;
+ LIST_INSERT_HEAD(dqh, dq, dq_hash);
+ dqref(dq);
+ mutex_enter(&dq->dq_interlock);
+ mutex_exit(&dqlock);
+#ifdef QUOTA
+ if (ump->um_flags & UFS_QUOTA)
+ error = dq1get(dqvp, id, ump, type, dq);
+#endif
+#ifdef QUOTA2
+ if (ump->um_flags & UFS_QUOTA2)
+ error = dq2get(dqvp, id, ump, type, dq);
+#endif
+ /*
+ * I/O error in reading quota file, release
+ * quota structure and reflect problem to caller.
+ */
+ if (error) {
+ mutex_enter(&dqlock);
+ LIST_REMOVE(dq, dq_hash);
+ mutex_exit(&dqlock);
+ mutex_exit(&dq->dq_interlock);
+ dqrele(vp, dq);
+ *dqp = NODQUOT;
+ return (error);
+ }
+ mutex_exit(&dq->dq_interlock);
+ *dqp = dq;
+ return (0);
+}
+
+/*
+ * Obtain a reference to a dquot.
+ */
+void
+dqref(struct dquot *dq)
+{
+
+ KASSERT(mutex_owned(&dqlock));
+ dq->dq_cnt++;
+ KASSERT(dq->dq_cnt > 0);
+}
+
+/*
+ * Release a reference to a dquot.
+ */
+void
+dqrele(struct vnode *vp, struct dquot *dq)
+{
+
+ if (dq == NODQUOT)
+ return;
+ mutex_enter(&dq->dq_interlock);
+ for (;;) {
+ mutex_enter(&dqlock);
+ if (dq->dq_cnt > 1) {
+ dq->dq_cnt--;
+ mutex_exit(&dqlock);
+ mutex_exit(&dq->dq_interlock);
+ return;
+ }
+ if ((dq->dq_flags & DQ_MOD) == 0)
+ break;
+ mutex_exit(&dqlock);
+#ifdef QUOTA
+ if (dq->dq_ump->um_flags & UFS_QUOTA)
+ (void) dq1sync(vp, dq);
+#endif
+#ifdef QUOTA2
+ if (dq->dq_ump->um_flags & UFS_QUOTA2)
+ (void) dq2sync(vp, dq);
+#endif
+ }
+ KASSERT(dq->dq_cnt == 1 && (dq->dq_flags & DQ_MOD) == 0);
+ LIST_REMOVE(dq, dq_hash);
+ mutex_exit(&dqlock);
+ mutex_exit(&dq->dq_interlock);
+ mutex_destroy(&dq->dq_interlock);
+ pool_cache_put(dquot_cache, dq);
+}
+
+int
+qsync(struct mount *mp)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+#ifdef QUOTA
+ if (ump->um_flags & UFS_QUOTA)
+ return q1sync(mp);
+#endif
+#ifdef QUOTA2
+ if (ump->um_flags & UFS_QUOTA2)
+ return q2sync(mp);
+#endif
+ return 0;
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * Check the hash chains for stray dquot's.
+ */
+void
+dqflush(struct vnode *vp)
+{
+ struct dquot *dq;
+ int i;
+
+ mutex_enter(&dqlock);
+ for (i = 0; i <= dqhash; i++)
+ LIST_FOREACH(dq, &dqhashtbl[i], dq_hash)
+ KASSERT(dq->dq_ump->um_quotas[dq->dq_type] != vp);
+ mutex_exit(&dqlock);
+}
+#endif
--- /dev/null
+/* $NetBSD: ufs_quota1.c,v 1.6 2011/11/25 16:55:05 dholland Exp $ */
+
+/*
+ * Copyright (c) 1982, 1986, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Robert Elz at The University of Melbourne.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_quota1.c,v 1.6 2011/11/25 16:55:05 dholland Exp $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kauth.h>
+
+#include <quota/quotaprop.h>
+#include <ufs/ufs/quota1.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_quota.h>
+
+static int chkdqchg(struct inode *, int64_t, kauth_cred_t, int);
+static int chkiqchg(struct inode *, int32_t, kauth_cred_t, int);
+
+/*
+ * Update disk usage, and take corrective action.
+ */
+int
+chkdq1(struct inode *ip, int64_t change, kauth_cred_t cred, int flags)
+{
+ struct dquot *dq;
+ int i;
+ int ncurblocks, error;
+
+ if ((error = getinoquota(ip)) != 0)
+ return error;
+ if (change == 0)
+ return (0);
+ if (change < 0) {
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if ((dq = ip->i_dquot[i]) == NODQUOT)
+ continue;
+ mutex_enter(&dq->dq_interlock);
+ ncurblocks = dq->dq_curblocks + change;
+ if (ncurblocks >= 0)
+ dq->dq_curblocks = ncurblocks;
+ else
+ dq->dq_curblocks = 0;
+ dq->dq_flags &= ~DQ_WARN(QL_BLOCK);
+ dq->dq_flags |= DQ_MOD;
+ mutex_exit(&dq->dq_interlock);
+ }
+ return (0);
+ }
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if ((dq = ip->i_dquot[i]) == NODQUOT)
+ continue;
+ if ((flags & FORCE) == 0 &&
+ kauth_authorize_system(cred, KAUTH_SYSTEM_FS_QUOTA,
+ KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT, KAUTH_ARG(i),
+ KAUTH_ARG(QL_BLOCK), NULL) != 0) {
+ mutex_enter(&dq->dq_interlock);
+ error = chkdqchg(ip, change, cred, i);
+ mutex_exit(&dq->dq_interlock);
+ if (error != 0)
+ return (error);
+ }
+ }
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if ((dq = ip->i_dquot[i]) == NODQUOT)
+ continue;
+ mutex_enter(&dq->dq_interlock);
+ dq->dq_curblocks += change;
+ dq->dq_flags |= DQ_MOD;
+ mutex_exit(&dq->dq_interlock);
+ }
+ return (0);
+}
+
+/*
+ * Check for a valid change to a user's allocation.
+ * Issue an error message if appropriate.
+ */
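+/*
+ * For example (limits purely illustrative): with a soft limit of 1000
+ * blocks, a hard limit of 1200 and the default grace period, an
+ * allocation that pushes usage from 950 to 1050 succeeds with a
+ * warning and starts the timer; once the timer expires, or as soon as
+ * the hard limit would be reached, further allocations fail with
+ * EDQUOT.
+ */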
+static int
+chkdqchg(struct inode *ip, int64_t change, kauth_cred_t cred, int type)
+{
+ struct dquot *dq = ip->i_dquot[type];
+ long ncurblocks = dq->dq_curblocks + change;
+
+ KASSERT(mutex_owned(&dq->dq_interlock));
+ /*
+ * If user would exceed their hard limit, disallow space allocation.
+ */
+ if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) {
+ if ((dq->dq_flags & DQ_WARN(QL_BLOCK)) == 0 &&
+ ip->i_uid == kauth_cred_geteuid(cred)) {
+ uprintf("\n%s: write failed, %s disk limit reached\n",
+ ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+ quotatypes[type]);
+ dq->dq_flags |= DQ_WARN(QL_BLOCK);
+ }
+ return (EDQUOT);
+ }
+ /*
+ * If user is over their soft limit for too long, disallow space
+ * allocation. Reset time limit as they cross their soft limit.
+ */
+ if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) {
+ if (dq->dq_curblocks < dq->dq_bsoftlimit) {
+ dq->dq_btime =
+ time_second + ip->i_ump->umq1_btime[type];
+ if (ip->i_uid == kauth_cred_geteuid(cred))
+ uprintf("\n%s: warning, %s %s\n",
+ ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+ quotatypes[type], "disk quota exceeded");
+ return (0);
+ }
+ if (time_second > dq->dq_btime) {
+ if ((dq->dq_flags & DQ_WARN(QL_BLOCK)) == 0 &&
+ ip->i_uid == kauth_cred_geteuid(cred)) {
+ uprintf("\n%s: write failed, %s %s\n",
+ ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+ quotatypes[type],
+ "disk quota exceeded for too long");
+ dq->dq_flags |= DQ_WARN(QL_BLOCK);
+ }
+ return (EDQUOT);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Check the inode limit, applying corrective action.
+ */
+int
+chkiq1(struct inode *ip, int32_t change, kauth_cred_t cred, int flags)
+{
+ struct dquot *dq;
+ int i;
+ int ncurinodes, error;
+
+ if ((error = getinoquota(ip)) != 0)
+ return error;
+ if (change == 0)
+ return (0);
+ if (change < 0) {
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if ((dq = ip->i_dquot[i]) == NODQUOT)
+ continue;
+ mutex_enter(&dq->dq_interlock);
+ ncurinodes = dq->dq_curinodes + change;
+ if (ncurinodes >= 0)
+ dq->dq_curinodes = ncurinodes;
+ else
+ dq->dq_curinodes = 0;
+ dq->dq_flags &= ~DQ_WARN(QL_FILE);
+ dq->dq_flags |= DQ_MOD;
+ mutex_exit(&dq->dq_interlock);
+ }
+ return (0);
+ }
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if ((dq = ip->i_dquot[i]) == NODQUOT)
+ continue;
+ if ((flags & FORCE) == 0 && kauth_authorize_system(cred,
+ KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT,
+ KAUTH_ARG(i), KAUTH_ARG(QL_FILE), NULL) != 0) {
+ mutex_enter(&dq->dq_interlock);
+ error = chkiqchg(ip, change, cred, i);
+ mutex_exit(&dq->dq_interlock);
+ if (error != 0)
+ return (error);
+ }
+ }
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if ((dq = ip->i_dquot[i]) == NODQUOT)
+ continue;
+ mutex_enter(&dq->dq_interlock);
+ dq->dq_curinodes += change;
+ dq->dq_flags |= DQ_MOD;
+ mutex_exit(&dq->dq_interlock);
+ }
+ return (0);
+}
+
+/*
+ * Check for a valid change to a user's allocation.
+ * Issue an error message if appropriate.
+ */
+static int
+chkiqchg(struct inode *ip, int32_t change, kauth_cred_t cred, int type)
+{
+ struct dquot *dq = ip->i_dquot[type];
+ long ncurinodes = dq->dq_curinodes + change;
+
+ KASSERT(mutex_owned(&dq->dq_interlock));
+ /*
+ * If user would exceed their hard limit, disallow inode allocation.
+ */
+ if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) {
+ if ((dq->dq_flags & DQ_WARN(QL_FILE)) == 0 &&
+ ip->i_uid == kauth_cred_geteuid(cred)) {
+ uprintf("\n%s: write failed, %s inode limit reached\n",
+ ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+ quotatypes[type]);
+ dq->dq_flags |= DQ_WARN(QL_FILE);
+ }
+ return (EDQUOT);
+ }
+ /*
+ * If user is over their soft limit for too long, disallow inode
+ * allocation. Reset time limit as they cross their soft limit.
+ */
+ if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) {
+ if (dq->dq_curinodes < dq->dq_isoftlimit) {
+ dq->dq_itime =
+ time_second + ip->i_ump->umq1_itime[type];
+ if (ip->i_uid == kauth_cred_geteuid(cred))
+ uprintf("\n%s: warning, %s %s\n",
+ ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+ quotatypes[type], "inode quota exceeded");
+ return (0);
+ }
+ if (time_second > dq->dq_itime) {
+ if ((dq->dq_flags & DQ_WARN(QL_FILE)) == 0 &&
+ ip->i_uid == kauth_cred_geteuid(cred)) {
+ uprintf("\n%s: write failed, %s %s\n",
+ ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+ quotatypes[type],
+ "inode quota exceeded for too long");
+ dq->dq_flags |= DQ_WARN(QL_FILE);
+ }
+ return (EDQUOT);
+ }
+ }
+ return (0);
+}
+
+int
+quota1_umount(struct mount *mp, int flags)
+{
+ int i, error;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct lwp *l = curlwp;
+
+ if ((ump->um_flags & UFS_QUOTA) == 0)
+ return 0;
+
+ if ((error = vflush(mp, NULLVP, SKIPSYSTEM | flags)) != 0)
+ return (error);
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (ump->um_quotas[i] != NULLVP) {
+ quota1_handle_cmd_quotaoff(l, ump, i);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Code to process quotactl commands.
+ */
+
+/*
+ * set up a quota file for a particular file system.
+ */
+int
+quota1_handle_cmd_quotaon(struct lwp *l, struct ufsmount *ump, int type,
+ const char *fname)
+{
+ struct mount *mp = ump->um_mountp;
+ struct vnode *vp, **vpp, *mvp;
+ struct dquot *dq;
+ int error;
+ struct pathbuf *pb;
+ struct nameidata nd;
+
+ if (ump->um_flags & UFS_QUOTA2) {
+ uprintf("%s: quotas v2 already enabled\n",
+ mp->mnt_stat.f_mntonname);
+ return (EBUSY);
+ }
+
+ if (mp->mnt_wapbl != NULL) {
+ printf("%s: quota v1 cannot be used with -o log\n",
+ mp->mnt_stat.f_mntonname);
+ return (EOPNOTSUPP);
+ }
+
+ vpp = &ump->um_quotas[type];
+
+ pb = pathbuf_create(fname);
+ if (pb == NULL) {
+ return ENOMEM;
+ }
+ NDINIT(&nd, LOOKUP, FOLLOW, pb);
+ if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
+ pathbuf_destroy(pb);
+ return error;
+ }
+ vp = nd.ni_vp;
+ pathbuf_destroy(pb);
+
+ VOP_UNLOCK(vp);
+ if (vp->v_type != VREG) {
+ (void) vn_close(vp, FREAD|FWRITE, l->l_cred);
+ return (EACCES);
+ }
+ if (*vpp != vp)
+ quota1_handle_cmd_quotaoff(l, ump, type);
+ mutex_enter(&dqlock);
+ while ((ump->umq1_qflags[type] & (QTF_CLOSING | QTF_OPENING)) != 0)
+ cv_wait(&dqcv, &dqlock);
+ ump->umq1_qflags[type] |= QTF_OPENING;
+ mutex_exit(&dqlock);
+ mp->mnt_flag |= MNT_QUOTA;
+ vp->v_vflag |= VV_SYSTEM; /* XXXSMP */
+ *vpp = vp;
+ /*
+ * Save the credential of the process that turned on quotas.
+ * Set up the time limits for this quota.
+ */
+ kauth_cred_hold(l->l_cred);
+ ump->um_cred[type] = l->l_cred;
+ ump->umq1_btime[type] = MAX_DQ_TIME;
+ ump->umq1_itime[type] = MAX_IQ_TIME;
+ if (dqget(NULLVP, 0, ump, type, &dq) == 0) {
+ if (dq->dq_btime > 0)
+ ump->umq1_btime[type] = dq->dq_btime;
+ if (dq->dq_itime > 0)
+ ump->umq1_itime[type] = dq->dq_itime;
+ dqrele(NULLVP, dq);
+ }
+ /* Allocate a marker vnode. */
+ mvp = vnalloc(mp);
+ /*
+ * Search vnodes associated with this mount point,
+ * adding references to quota file being opened.
+ * NB: only need to add dquot's for inodes being modified.
+ */
+ mutex_enter(&mntvnode_lock);
+again:
+ for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
+ vmark(mvp, vp);
+ mutex_enter(vp->v_interlock);
+ if (VTOI(vp) == NULL || vp->v_mount != mp || vismarker(vp) ||
+ vp->v_type == VNON || vp->v_writecount == 0 ||
+ (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0) {
+ mutex_exit(vp->v_interlock);
+ continue;
+ }
+ mutex_exit(&mntvnode_lock);
+ if (vget(vp, LK_EXCLUSIVE)) {
+ mutex_enter(&mntvnode_lock);
+ (void)vunmark(mvp);
+ goto again;
+ }
+ if ((error = getinoquota(VTOI(vp))) != 0) {
+ vput(vp);
+ mutex_enter(&mntvnode_lock);
+ (void)vunmark(mvp);
+ break;
+ }
+ vput(vp);
+ mutex_enter(&mntvnode_lock);
+ }
+ mutex_exit(&mntvnode_lock);
+ vnfree(mvp);
+
+ mutex_enter(&dqlock);
+ ump->umq1_qflags[type] &= ~QTF_OPENING;
+ cv_broadcast(&dqcv);
+ if (error == 0)
+ ump->um_flags |= UFS_QUOTA;
+ mutex_exit(&dqlock);
+ if (error)
+ quota1_handle_cmd_quotaoff(l, ump, type);
+ return (error);
+}
+
+/*
+ * turn off disk quotas for a filesystem.
+ */
+int
+quota1_handle_cmd_quotaoff(struct lwp *l, struct ufsmount *ump, int type)
+{
+ struct mount *mp = ump->um_mountp;
+ struct vnode *vp;
+ struct vnode *qvp, *mvp;
+ struct dquot *dq;
+ struct inode *ip;
+ kauth_cred_t cred;
+ int i, error;
+
+ /* Allocate a marker vnode. */
+ mvp = vnalloc(mp);
+
+ mutex_enter(&dqlock);
+ while ((ump->umq1_qflags[type] & (QTF_CLOSING | QTF_OPENING)) != 0)
+ cv_wait(&dqcv, &dqlock);
+ if ((qvp = ump->um_quotas[type]) == NULLVP) {
+ mutex_exit(&dqlock);
+ vnfree(mvp);
+ return (0);
+ }
+ ump->umq1_qflags[type] |= QTF_CLOSING;
+ ump->um_flags &= ~UFS_QUOTA;
+ mutex_exit(&dqlock);
+ /*
+ * Search vnodes associated with this mount point,
+ * deleting any references to quota file being closed.
+ */
+ mutex_enter(&mntvnode_lock);
+again:
+ for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
+ vmark(mvp, vp);
+ mutex_enter(vp->v_interlock);
+ if (VTOI(vp) == NULL || vp->v_mount != mp || vismarker(vp) ||
+ vp->v_type == VNON ||
+ (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0) {
+ mutex_exit(vp->v_interlock);
+ continue;
+ }
+ mutex_exit(&mntvnode_lock);
+ if (vget(vp, LK_EXCLUSIVE)) {
+ mutex_enter(&mntvnode_lock);
+ (void)vunmark(mvp);
+ goto again;
+ }
+ ip = VTOI(vp);
+ dq = ip->i_dquot[type];
+ ip->i_dquot[type] = NODQUOT;
+ dqrele(vp, dq);
+ vput(vp);
+ mutex_enter(&mntvnode_lock);
+ }
+ mutex_exit(&mntvnode_lock);
+#ifdef DIAGNOSTIC
+ dqflush(qvp);
+#endif
+ qvp->v_vflag &= ~VV_SYSTEM;
+ error = vn_close(qvp, FREAD|FWRITE, l->l_cred);
+ mutex_enter(&dqlock);
+ ump->um_quotas[type] = NULLVP;
+ cred = ump->um_cred[type];
+ ump->um_cred[type] = NOCRED;
+ for (i = 0; i < MAXQUOTAS; i++)
+ if (ump->um_quotas[i] != NULLVP)
+ break;
+ ump->umq1_qflags[type] &= ~QTF_CLOSING;
+ cv_broadcast(&dqcv);
+ mutex_exit(&dqlock);
+ kauth_cred_free(cred);
+ if (i == MAXQUOTAS)
+ mp->mnt_flag &= ~MNT_QUOTA;
+ return (error);
+}
+
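+/*
+ * Report quota1 limits and usage for an id as a proplib dictionary
+ * appended to the replies array; when defaultq is set, the grace
+ * times of the id 0 entry are reported as the defaults.
+ */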
+int
+quota1_handle_cmd_get(struct ufsmount *ump, int type, int id,
+ int defaultq, prop_array_t replies)
+{
+ struct dquot *dq;
+ struct quotaval qv[QUOTA_NLIMITS];
+ prop_dictionary_t dict;
+ int error;
+ uint64_t *valuesp[QUOTA_NLIMITS];
+ valuesp[QUOTA_LIMIT_BLOCK] = &qv[QUOTA_LIMIT_BLOCK].qv_hardlimit;
+ valuesp[QUOTA_LIMIT_FILE] = &qv[QUOTA_LIMIT_FILE].qv_hardlimit;
+
+ if (ump->um_quotas[type] == NULLVP)
+ return ENODEV;
+
+ if (defaultq) { /* we want the grace period of id 0 */
+ if ((error = dqget(NULLVP, 0, ump, type, &dq)) != 0)
+ return error;
+
+ } else {
+ if ((error = dqget(NULLVP, id, ump, type, &dq)) != 0)
+ return error;
+ }
+ dqblk_to_quotaval(&dq->dq_un.dq1_dqb, qv);
+ dqrele(NULLVP, dq);
+ if (defaultq) {
+ if (qv[QUOTA_LIMIT_BLOCK].qv_expiretime > 0)
+ qv[QUOTA_LIMIT_BLOCK].qv_grace =
+ qv[QUOTA_LIMIT_BLOCK].qv_expiretime;
+ else
+ qv[QUOTA_LIMIT_BLOCK].qv_grace = MAX_DQ_TIME;
+ if (qv[QUOTA_LIMIT_FILE].qv_expiretime > 0)
+ qv[QUOTA_LIMIT_FILE].qv_grace =
+ qv[QUOTA_LIMIT_FILE].qv_expiretime;
+ else
+ qv[QUOTA_LIMIT_FILE].qv_grace = MAX_DQ_TIME;
+ }
+ dict = quota64toprop(id, defaultq, valuesp,
+ ufs_quota_entry_names, UFS_QUOTA_NENTRIES,
+ ufs_quota_limit_names, QUOTA_NLIMITS);
+ if (dict == NULL)
+ return ENOMEM;
+ if (!prop_array_add_and_rel(replies, dict))
+ return ENOMEM;
+ return 0;
+}
+
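+/*
+ * Set quota1 limits for an id from a proplib dictionary; when
+ * defaultq is set only the default grace times are updated.
+ */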
+int
+quota1_handle_cmd_set(struct ufsmount *ump, int type, int id,
+ int defaultq, prop_dictionary_t data)
+{
+ struct dquot *dq;
+ struct dqblk dqb;
+ int error;
+ uint64_t bval[2];
+ uint64_t ival[2];
+ const char *val_limitsonly_grace[] = {QUOTADICT_LIMIT_GTIME};
+#define Q1_GTIME 0
+ const char *val_limitsonly_softhard[] =
+ {QUOTADICT_LIMIT_SOFT, QUOTADICT_LIMIT_HARD};
+#define Q1_SOFT 0
+#define Q1_HARD 1
+
+ uint64_t *valuesp[QUOTA_NLIMITS];
+ valuesp[QUOTA_LIMIT_BLOCK] = bval;
+ valuesp[QUOTA_LIMIT_FILE] = ival;
+
+ if (ump->um_quotas[type] == NULLVP)
+ return ENODEV;
+
+ if (defaultq) {
+ /* just update grace times */
+ error = proptoquota64(data, valuesp, val_limitsonly_grace, 1,
+ ufs_quota_limit_names, QUOTA_NLIMITS);
+ if (error)
+ return error;
+ if ((error = dqget(NULLVP, id, ump, type, &dq)) != 0)
+ return error;
+ mutex_enter(&dq->dq_interlock);
+ if (bval[Q1_GTIME] > 0)
+ ump->umq1_btime[type] = dq->dq_btime =
+ bval[Q1_GTIME];
+ if (ival[Q1_GTIME] > 0)
+ ump->umq1_itime[type] = dq->dq_itime =
+ ival[Q1_GTIME];
+ dq->dq_flags |= DQ_MOD;
+ mutex_exit(&dq->dq_interlock);
+ dqrele(NULLVP, dq);
+ return 0;
+ }
+ error = proptoquota64(data, valuesp, val_limitsonly_softhard, 2,
+ ufs_quota_limit_names, QUOTA_NLIMITS);
+ if (error)
+ return error;
+
+ if ((error = dqget(NULLVP, id, ump, type, &dq)) != 0)
+ return (error);
+ mutex_enter(&dq->dq_interlock);
+ /*
+ * Copy all but the current values.
+ * Reset time limit if previously had no soft limit or were
+ * under it, but now have a soft limit and are over it.
+ */
+ dqb.dqb_curblocks = dq->dq_curblocks;
+ dqb.dqb_curinodes = dq->dq_curinodes;
+ dqb.dqb_btime = dq->dq_btime;
+ dqb.dqb_itime = dq->dq_itime;
+ dqb.dqb_bsoftlimit = (bval[Q1_SOFT] == UQUAD_MAX) ? 0 : bval[Q1_SOFT];
+ dqb.dqb_bhardlimit = (bval[Q1_HARD] == UQUAD_MAX) ? 0 : bval[Q1_HARD];
+ dqb.dqb_isoftlimit = (ival[Q1_SOFT] == UQUAD_MAX) ? 0 : ival[Q1_SOFT];
+ dqb.dqb_ihardlimit = (ival[Q1_HARD] == UQUAD_MAX) ? 0 : ival[Q1_HARD];
+ if (dq->dq_id == 0) {
+ /* also update grace time if available */
+ if (proptoquota64(data, valuesp, val_limitsonly_grace, 1,
+ ufs_quota_limit_names, QUOTA_NLIMITS) == 0) {
+ if (bval[Q1_GTIME] > 0)
+ ump->umq1_btime[type] = dqb.dqb_btime =
+ bval[Q1_GTIME];
+ if (ival[Q1_GTIME] > 0)
+ ump->umq1_itime[type] = dqb.dqb_itime =
+ ival[Q1_GTIME];
+ }
+ }
+ if (dqb.dqb_bsoftlimit &&
+ dq->dq_curblocks >= dqb.dqb_bsoftlimit &&
+ (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit))
+ dqb.dqb_btime = time_second + ump->umq1_btime[type];
+ if (dqb.dqb_isoftlimit &&
+ dq->dq_curinodes >= dqb.dqb_isoftlimit &&
+ (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit))
+ dqb.dqb_itime = time_second + ump->umq1_itime[type];
+ dq->dq_un.dq1_dqb = dqb;
+ if (dq->dq_curblocks < dq->dq_bsoftlimit)
+ dq->dq_flags &= ~DQ_WARN(QL_BLOCK);
+ if (dq->dq_curinodes < dq->dq_isoftlimit)
+ dq->dq_flags &= ~DQ_WARN(QL_FILE);
+ if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 &&
+ dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
+ dq->dq_flags |= DQ_FAKE;
+ else
+ dq->dq_flags &= ~DQ_FAKE;
+ dq->dq_flags |= DQ_MOD;
+ mutex_exit(&dq->dq_interlock);
+ dqrele(NULLVP, dq);
+ return (0);
+}
+
+
+#if 0
+/*
+ * Q_SETQUOTA - assign an entire dqblk structure.
+ */
+int
+setquota1(struct mount *mp, u_long id, int type, struct dqblk *dqb)
+{
+ struct dquot *dq;
+ struct dquot *ndq;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ int error;
+
+ if ((error = dqget(NULLVP, id, ump, type, &ndq)) != 0)
+ return (error);
+ dq = ndq;
+ mutex_enter(&dq->dq_interlock);
+ /*
+ * Copy all but the current values.
+ * Reset time limit if previously had no soft limit or were
+ * under it, but now have a soft limit and are over it.
+ */
+ dqb->dqb_curblocks = dq->dq_curblocks;
+ dqb->dqb_curinodes = dq->dq_curinodes;
+ if (dq->dq_id != 0) {
+ dqb->dqb_btime = dq->dq_btime;
+ dqb->dqb_itime = dq->dq_itime;
+ }
+ if (dqb->dqb_bsoftlimit &&
+ dq->dq_curblocks >= dqb->dqb_bsoftlimit &&
+ (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit))
+ dqb->dqb_btime = time_second + ump->umq1_btime[type];
+ if (dqb->dqb_isoftlimit &&
+ dq->dq_curinodes >= dqb->dqb_isoftlimit &&
+ (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit))
+ dqb->dqb_itime = time_second + ump->umq1_itime[type];
+ dq->dq_un.dq1_dqb = *dqb;
+ if (dq->dq_curblocks < dq->dq_bsoftlimit)
+ dq->dq_flags &= ~DQ_WARN(QL_BLOCK);
+ if (dq->dq_curinodes < dq->dq_isoftlimit)
+ dq->dq_flags &= ~DQ_WARN(QL_FILE);
+ if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 &&
+ dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
+ dq->dq_flags |= DQ_FAKE;
+ else
+ dq->dq_flags &= ~DQ_FAKE;
+ dq->dq_flags |= DQ_MOD;
+ mutex_exit(&dq->dq_interlock);
+ dqrele(NULLVP, dq);
+ return (0);
+}
+
+/*
+ * Q_SETUSE - set current inode and block usage.
+ */
+int
+setuse(struct mount *mp, u_long id, int type, void *addr)
+{
+ struct dquot *dq;
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct dquot *ndq;
+ struct dqblk usage;
+ int error;
+
+ error = copyin(addr, (void *)&usage, sizeof (struct dqblk));
+ if (error)
+ return (error);
+ if ((error = dqget(NULLVP, id, ump, type, &ndq)) != 0)
+ return (error);
+ dq = ndq;
+ mutex_enter(&dq->dq_interlock);
+ /*
+ * Reset time limit if have a soft limit and were
+ * previously under it, but are now over it.
+ */
+ if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit &&
+ usage.dqb_curblocks >= dq->dq_bsoftlimit)
+ dq->dq_btime = time_second + ump->umq1_btime[type];
+ if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit &&
+ usage.dqb_curinodes >= dq->dq_isoftlimit)
+ dq->dq_itime = time_second + ump->umq1_itime[type];
+ dq->dq_curblocks = usage.dqb_curblocks;
+ dq->dq_curinodes = usage.dqb_curinodes;
+ if (dq->dq_curblocks < dq->dq_bsoftlimit)
+ dq->dq_flags &= ~DQ_WARN(QL_BLOCK);
+ if (dq->dq_curinodes < dq->dq_isoftlimit)
+ dq->dq_flags &= ~DQ_WARN(QL_FILE);
+ dq->dq_flags |= DQ_MOD;
+ mutex_exit(&dq->dq_interlock);
+ dqrele(NULLVP, dq);
+ return (0);
+}
+#endif
+
+/*
+ * Q_SYNC - sync quota files to disk.
+ */
+int
+q1sync(struct mount *mp)
+{
+ struct ufsmount *ump = VFSTOUFS(mp);
+ struct vnode *vp, *mvp;
+ struct dquot *dq;
+ int i, error;
+
+ /*
+ * Check if the mount point has any quotas.
+ * If not, simply return.
+ */
+ for (i = 0; i < MAXQUOTAS; i++)
+ if (ump->um_quotas[i] != NULLVP)
+ break;
+ if (i == MAXQUOTAS)
+ return (0);
+
+ /* Allocate a marker vnode. */
+ mvp = vnalloc(mp);
+
+ /*
+ * Search vnodes associated with this mount point,
+ * synchronizing any modified dquot structures.
+ */
+ mutex_enter(&mntvnode_lock);
+ again:
+ for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
+ vmark(mvp, vp);
+ mutex_enter(vp->v_interlock);
+ if (VTOI(vp) == NULL || vp->v_mount != mp || vismarker(vp) ||
+ vp->v_type == VNON ||
+ (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0) {
+ mutex_exit(vp->v_interlock);
+ continue;
+ }
+ mutex_exit(&mntvnode_lock);
+ error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error) {
+ mutex_enter(&mntvnode_lock);
+ if (error == ENOENT) {
+ (void)vunmark(mvp);
+ goto again;
+ }
+ continue;
+ }
+ for (i = 0; i < MAXQUOTAS; i++) {
+ dq = VTOI(vp)->i_dquot[i];
+ if (dq == NODQUOT)
+ continue;
+ mutex_enter(&dq->dq_interlock);
+ if (dq->dq_flags & DQ_MOD)
+ dq1sync(vp, dq);
+ mutex_exit(&dq->dq_interlock);
+ }
+ vput(vp);
+ mutex_enter(&mntvnode_lock);
+ }
+ mutex_exit(&mntvnode_lock);
+ vnfree(mvp);
+ return (0);
+}
+
+/*
+ * Obtain a dquot structure for the specified identifier and quota file,
+ * reading the information from the file if necessary.
+ */
+int
+dq1get(struct vnode *dqvp, u_long id, struct ufsmount *ump, int type,
+ struct dquot *dq)
+{
+ struct iovec aiov;
+ struct uio auio;
+ int error;
+
+ KASSERT(mutex_owned(&dq->dq_interlock));
+ vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ aiov.iov_base = (void *)&dq->dq_un.dq1_dqb;
+ aiov.iov_len = sizeof (struct dqblk);
+ auio.uio_resid = sizeof (struct dqblk);
+ auio.uio_offset = (off_t)(id * sizeof (struct dqblk));
+ auio.uio_rw = UIO_READ;
+ UIO_SETUP_SYSSPACE(&auio);
+ error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]);
+ if (auio.uio_resid == sizeof(struct dqblk) && error == 0)
+ memset((void *)&dq->dq_un.dq1_dqb, 0, sizeof(struct dqblk));
+ VOP_UNLOCK(dqvp);
+ /*
+ * I/O error in reading quota file, release
+ * quota structure and reflect problem to caller.
+ */
+ if (error)
+ return (error);
+ /*
+ * Check for no limit to enforce.
+ * Initialize time values if necessary.
+ */
+ if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 &&
+ dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
+ dq->dq_flags |= DQ_FAKE;
+ if (dq->dq_id != 0) {
+ if (dq->dq_btime == 0)
+ dq->dq_btime = time_second + ump->umq1_btime[type];
+ if (dq->dq_itime == 0)
+ dq->dq_itime = time_second + ump->umq1_itime[type];
+ }
+ return (0);
+}
+
+/*
+ * Update the disk quota in the quota file.
+ */
+int
+dq1sync(struct vnode *vp, struct dquot *dq)
+{
+ struct vnode *dqvp;
+ struct iovec aiov;
+ struct uio auio;
+ int error;
+
+ if (dq == NODQUOT)
+ panic("dq1sync: dquot");
+ KASSERT(mutex_owned(&dq->dq_interlock));
+ if ((dq->dq_flags & DQ_MOD) == 0)
+ return (0);
+ if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP)
+ panic("dq1sync: file");
+ KASSERT(dqvp != vp);
+ vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ aiov.iov_base = (void *)&dq->dq_un.dq1_dqb;
+ aiov.iov_len = sizeof (struct dqblk);
+ auio.uio_resid = sizeof (struct dqblk);
+ auio.uio_offset = (off_t)(dq->dq_id * sizeof (struct dqblk));
+ auio.uio_rw = UIO_WRITE;
+ UIO_SETUP_SYSSPACE(&auio);
+ error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]);
+ if (auio.uio_resid && error == 0)
+ error = EIO;
+ dq->dq_flags &= ~DQ_MOD;
+ VOP_UNLOCK(dqvp);
+ return (error);
+}
--- /dev/null
+/* $NetBSD: ufs_quota2.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $ */
+/*-
+ * Copyright (c) 2010 Manuel Bouyer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_quota2.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $");
+
+#include <sys/buf.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/fstrans.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+
+#include <ufs/ufs/quota2.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_quota.h>
+#include <ufs/ufs/ufs_wapbl.h>
+#include <quota/quotaprop.h>
+
+/*
+ * LOCKING:
+ * Data in the entries is protected by the associated struct dquot's
+ * dq_interlock (this means we can't read or change a quota entry without
+ * grabbing a dquot for it).
+ * The header and lists (including pointers in the data entries, and q2e_uid)
+ * are protected by the global dqlock.
+ * The locking order is dq_interlock -> dqlock.
+ */
+
+static int quota2_bwrite(struct mount *, struct buf *);
+static int getinoquota2(struct inode *, bool, bool, struct buf **,
+ struct quota2_entry **);
+static int getq2h(struct ufsmount *, int, struct buf **,
+ struct quota2_header **, int);
+static int getq2e(struct ufsmount *, int, daddr_t, int, struct buf **,
+ struct quota2_entry **, int);
+static int quota2_walk_list(struct ufsmount *, struct buf *, int,
+ uint64_t *, int, void *,
+ int (*func)(struct ufsmount *, uint64_t *, struct quota2_entry *,
+ uint64_t, void *));
+
+static int quota2_dict_update_q2e_limits(prop_dictionary_t,
+ struct quota2_entry *);
+static prop_dictionary_t q2etoprop(struct quota2_entry *, int);
+
+static const char *limnames[] = INITQLNAMES;
+
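+/*
+ * Update the limit values of a quota2 entry from the per-limit
+ * dictionaries of a proplib request.
+ */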
+static int
+quota2_dict_update_q2e_limits(prop_dictionary_t data,
+ struct quota2_entry *q2e)
+{
+ const char *val_limitsonly_names[] = INITQVNAMES_LIMITSONLY;
+
+ int i, error;
+ prop_dictionary_t val;
+
+ for (i = 0; i < N_QL; i++) {
+ if (!prop_dictionary_get_dict(data, limnames[i], &val))
+ return EINVAL;
+ error = quotaprop_dict_get_uint64(val,
+ &q2e->q2e_val[i].q2v_hardlimit,
+ val_limitsonly_names, N_QV, true);
+ if (error)
+ return error;
+ }
+ return 0;
+}
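+
+/*
+ * Convert a quota2 entry to a proplib dictionary; if def is set the
+ * entry is reported as the "default" entry rather than by uid.
+ */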
+static prop_dictionary_t
+q2etoprop(struct quota2_entry *q2e, int def)
+{
+ const char *val_names[] = INITQVNAMES_ALL;
+ prop_dictionary_t dict1 = prop_dictionary_create();
+ prop_dictionary_t dict2;
+ int i;
+
+ if (dict1 == NULL)
+ return NULL;
+
+ if (def) {
+ if (!prop_dictionary_set_cstring_nocopy(dict1, "id",
+ "default")) {
+ goto err;
+ }
+ } else {
+ if (!prop_dictionary_set_uint32(dict1, "id", q2e->q2e_uid)) {
+ goto err;
+ }
+ }
+ for (i = 0; i < N_QL; i++) {
+ dict2 = limits64toprop(&q2e->q2e_val[i].q2v_hardlimit,
+ val_names, N_QV);
+ if (dict2 == NULL)
+ goto err;
+ if (!prop_dictionary_set_and_rel(dict1, limnames[i], dict2))
+ goto err;
+ }
+ return dict1;
+
+err:
+ prop_object_release(dict1);
+ return NULL;
+}
+
+
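+/*
+ * Write back a quota file buffer: synchronously if the filesystem is
+ * mounted MNT_SYNCHRONOUS, as a delayed write otherwise.
+ */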
+static int
+quota2_bwrite(struct mount *mp, struct buf *bp)
+{
+ if (mp->mnt_flag & MNT_SYNCHRONOUS)
+ return bwrite(bp);
+ else {
+ bdwrite(bp);
+ return 0;
+ }
+}
+
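+/*
+ * Read and sanity-check the quota2 header block of the given quota
+ * type. Called with dqlock held.
+ */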
+static int
+getq2h(struct ufsmount *ump, int type,
+ struct buf **bpp, struct quota2_header **q2hp, int flags)
+{
+#ifdef FFS_EI
+ const int needswap = UFS_MPNEEDSWAP(ump);
+#endif
+ int error;
+ struct buf *bp;
+ struct quota2_header *q2h;
+
+ KASSERT(mutex_owned(&dqlock));
+ error = bread(ump->um_quotas[type], 0, ump->umq2_bsize,
+ ump->um_cred[type], flags, &bp);
+ if (error)
+ return error;
+ if (bp->b_resid != 0)
+ panic("dq2get: %s quota file truncated", quotatypes[type]);
+
+ q2h = (void *)bp->b_data;
+ if (ufs_rw32(q2h->q2h_magic_number, needswap) != Q2_HEAD_MAGIC ||
+ q2h->q2h_type != type)
+ panic("dq2get: corrupted %s quota header", quotatypes[type]);
+ *bpp = bp;
+ *q2hp = q2h;
+ return 0;
+}
+
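+/*
+ * Read the block containing the quota2 entry at (lblkno, blkoffset)
+ * and return pointers to both the buffer and the entry.
+ */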
+static int
+getq2e(struct ufsmount *ump, int type, daddr_t lblkno, int blkoffset,
+ struct buf **bpp, struct quota2_entry **q2ep, int flags)
+{
+ int error;
+ struct buf *bp;
+
+ if (blkoffset & (sizeof(uint64_t) - 1)) {
+ panic("dq2get: %s quota file corrupted",
+ quotatypes[type]);
+ }
+ error = bread(ump->um_quotas[type], lblkno, ump->umq2_bsize,
+ ump->um_cred[type], flags, &bp);
+ if (error)
+ return error;
+ if (bp->b_resid != 0) {
+ panic("dq2get: %s quota file corrupted",
+ quotatypes[type]);
+ }
+ *q2ep = (void *)((char *)bp->b_data + blkoffset);
+ *bpp = bp;
+ return 0;
+}
+
+/* walk a quota entry list, calling the callback for each entry */
+#define Q2WL_ABORT 0x10000000
+
+static int
+quota2_walk_list(struct ufsmount *ump, struct buf *hbp, int type,
+ uint64_t *offp, int flags, void *a,
+ int (*func)(struct ufsmount *, uint64_t *, struct quota2_entry *, uint64_t, void *))
+{
+#ifdef FFS_EI
+ const int needswap = UFS_MPNEEDSWAP(ump);
+#endif
+ daddr_t off = ufs_rw64(*offp, needswap);
+ struct buf *bp, *obp = hbp;
+ int ret = 0, ret2 = 0;
+ struct quota2_entry *q2e;
+ daddr_t lblkno, blkoff, olblkno = 0;
+
+ KASSERT(mutex_owned(&dqlock));
+
+ while (off != 0) {
+ lblkno = (off >> ump->um_mountp->mnt_fs_bshift);
+ blkoff = (off & ump->umq2_bmask);
+ if (lblkno == 0) {
+ /* in the header block */
+ bp = hbp;
+ } else if (lblkno == olblkno) {
+ /* still in the same buf */
+ bp = obp;
+ } else {
+ ret = bread(ump->um_quotas[type], lblkno,
+ ump->umq2_bsize,
+ ump->um_cred[type], flags, &bp);
+ if (ret)
+ return ret;
+ if (bp->b_resid != 0) {
+ panic("quota2_walk_list: %s quota file corrupted",
+ quotatypes[type]);
+ }
+ }
+ q2e = (void *)((char *)(bp->b_data) + blkoff);
+ ret = (*func)(ump, offp, q2e, off, a);
+ if (off != ufs_rw64(*offp, needswap)) {
+ /* callback changed parent's pointer, redo */
+ off = ufs_rw64(*offp, needswap);
+ if (bp != hbp && bp != obp)
+ ret2 = bwrite(bp);
+ } else {
+ /* parent is now current */
+ if (obp != bp && obp != hbp) {
+ if (flags & B_MODIFY)
+ ret2 = bwrite(obp);
+ else
+ brelse(obp, 0);
+ }
+ obp = bp;
+ olblkno = lblkno;
+ offp = &(q2e->q2e_next);
+ off = ufs_rw64(*offp, needswap);
+ }
+ if (ret)
+ break;
+ if (ret2) {
+ ret = ret2;
+ break;
+ }
+ }
+ if (obp != hbp) {
+ if (flags & B_MODIFY)
+ ret2 = bwrite(obp);
+ else
+ brelse(obp, 0);
+ }
+ if (ret & Q2WL_ABORT)
+ return 0;
+ if (ret == 0)
+ return ret2;
+ return ret;
+}
+
+int
+quota2_umount(struct mount *mp, int flags)
+{
+ int i, error;
+ struct ufsmount *ump = VFSTOUFS(mp);
+
+ if ((ump->um_flags & UFS_QUOTA2) == 0)
+ return 0;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (ump->um_quotas[i] != NULLVP) {
+ error = vn_close(ump->um_quotas[i], FREAD|FWRITE,
+ ump->um_cred[i]);
+ if (error) {
+ printf("quota2_umount failed: close(%p) %d\n",
+ ump->um_quotas[i], error);
+ return error;
+ }
+ }
+ ump->um_quotas[i] = NULLVP;
+ }
+ return 0;
+}
+
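+/*
+ * Allocate an on-disk quota2 entry for the given id: take one from
+ * the free list, extending the quota file by one block if the list is
+ * empty, and link it into its hash chain. Called with dqlock and
+ * dq_interlock held.
+ */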
+static int
+quota2_q2ealloc(struct ufsmount *ump, int type, uid_t uid, struct dquot *dq,
+ struct buf **bpp, struct quota2_entry **q2ep)
+{
+ int error, error2;
+ struct buf *hbp, *bp;
+ struct quota2_header *q2h;
+ struct quota2_entry *q2e;
+ daddr_t offset;
+ u_long hash_mask;
+ const int needswap = UFS_MPNEEDSWAP(ump);
+
+ KASSERT(mutex_owned(&dq->dq_interlock));
+ KASSERT(mutex_owned(&dqlock));
+ error = getq2h(ump, type, &hbp, &q2h, B_MODIFY);
+ if (error)
+ return error;
+ offset = ufs_rw64(q2h->q2h_free, needswap);
+ if (offset == 0) {
+ struct vnode *vp = ump->um_quotas[type];
+ struct inode *ip = VTOI(vp);
+ uint64_t size = ip->i_size;
+ /* need to allocate a new disk block */
+ error = UFS_BALLOC(vp, size, ump->umq2_bsize,
+ ump->um_cred[type], B_CLRBUF | B_SYNC, &bp);
+ if (error) {
+ brelse(hbp, 0);
+ return error;
+ }
+ KASSERT((ip->i_size % ump->umq2_bsize) == 0);
+ ip->i_size += ump->umq2_bsize;
+ DIP_ASSIGN(ip, size, ip->i_size);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ uvm_vnp_setsize(vp, ip->i_size);
+ quota2_addfreeq2e(q2h, bp->b_data, size, ump->umq2_bsize,
+ needswap);
+ error = bwrite(bp);
+ error2 = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
+ if (error || error2) {
+ brelse(hbp, 0);
+ if (error)
+ return error;
+ return error2;
+ }
+ offset = ufs_rw64(q2h->q2h_free, needswap);
+ KASSERT(offset != 0);
+ }
+ dq->dq2_lblkno = (offset >> ump->um_mountp->mnt_fs_bshift);
+ dq->dq2_blkoff = (offset & ump->umq2_bmask);
+ if (dq->dq2_lblkno == 0) {
+ bp = hbp;
+ q2e = (void *)((char *)bp->b_data + dq->dq2_blkoff);
+ } else {
+ error = getq2e(ump, type, dq->dq2_lblkno,
+ dq->dq2_blkoff, &bp, &q2e, B_MODIFY);
+ if (error) {
+ brelse(hbp, 0);
+ return error;
+ }
+ }
+ hash_mask = ((1 << q2h->q2h_hash_shift) - 1);
+ /* remove from free list */
+ q2h->q2h_free = q2e->q2e_next;
+
+ memcpy(q2e, &q2h->q2h_defentry, sizeof(*q2e));
+ q2e->q2e_uid = ufs_rw32(uid, needswap);
+ /* insert in hash list */
+ q2e->q2e_next = q2h->q2h_entries[uid & hash_mask];
+ q2h->q2h_entries[uid & hash_mask] = ufs_rw64(offset, needswap);
+ if (hbp != bp) {
+ bwrite(hbp);
+ }
+ *q2ep = q2e;
+ *bpp = bp;
+ return 0;
+}
+
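+/*
+ * Get the on-disk quota2 entries of an inode, optionally allocating
+ * missing ones. On success dq_interlock is held for each valid dquot
+ * and bpp[]/q2ep[] point at the corresponding buffers and entries
+ * (or are NULL when there is no entry).
+ */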
+static int
+getinoquota2(struct inode *ip, bool alloc, bool modify, struct buf **bpp,
+ struct quota2_entry **q2ep)
+{
+ int error;
+ int i;
+ struct dquot *dq;
+ struct ufsmount *ump = ip->i_ump;
+ u_int32_t ino_ids[MAXQUOTAS];
+
+ error = getinoquota(ip);
+ if (error)
+ return error;
+
+ if (alloc) {
+ UFS_WAPBL_JLOCK_ASSERT(ump->um_mountp);
+ }
+ ino_ids[USRQUOTA] = ip->i_uid;
+ ino_ids[GRPQUOTA] = ip->i_gid;
+ /* first get the interlock for all dquot */
+ for (i = 0; i < MAXQUOTAS; i++) {
+ dq = ip->i_dquot[i];
+ if (dq == NODQUOT)
+ continue;
+ mutex_enter(&dq->dq_interlock);
+ }
+ /* now get the corresponding quota entry */
+ for (i = 0; i < MAXQUOTAS; i++) {
+ bpp[i] = NULL;
+ q2ep[i] = NULL;
+ dq = ip->i_dquot[i];
+ if (dq == NODQUOT)
+ continue;
+ if (__predict_false(ump->um_quotas[i] == NULL)) {
+ /*
+ * quotas have been turned off. This can happen
+ * at umount time.
+ */
+ mutex_exit(&dq->dq_interlock);
+ dqrele(NULLVP, dq);
+ ip->i_dquot[i] = NULL;
+ continue;
+ }
+
+ if ((dq->dq2_lblkno | dq->dq2_blkoff) == 0) {
+ if (!alloc) {
+ continue;
+ }
+ /* need to allocate a new on-disk quota entry */
+ mutex_enter(&dqlock);
+ error = quota2_q2ealloc(ump, i, ino_ids[i], dq,
+ &bpp[i], &q2ep[i]);
+ mutex_exit(&dqlock);
+ if (error)
+ return error;
+ } else {
+ error = getq2e(ump, i, dq->dq2_lblkno,
+ dq->dq2_blkoff, &bpp[i], &q2ep[i],
+ modify ? B_MODIFY : 0);
+ if (error)
+ return error;
+ }
+ }
+ return 0;
+}
+
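+/*
+ * Account a block or inode usage change against the inode's quotas:
+ * decrements are always applied, while increments are checked against
+ * the soft and hard limits (unless FORCE is set or the caller may
+ * exceed them) before being applied.
+ */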
+static int
+quota2_check(struct inode *ip, int vtype, int64_t change, kauth_cred_t cred,
+ int flags)
+{
+ int error;
+ struct buf *bp[MAXQUOTAS];
+ struct quota2_entry *q2e[MAXQUOTAS];
+ struct quota2_val *q2vp;
+ struct dquot *dq;
+ uint64_t ncurblks;
+ struct ufsmount *ump = ip->i_ump;
+ struct mount *mp = ump->um_mountp;
+ const int needswap = UFS_MPNEEDSWAP(ump);
+ int i;
+
+ if ((error = getinoquota2(ip, change > 0, change != 0, bp, q2e)) != 0)
+ return error;
+ if (change == 0) {
+ for (i = 0; i < MAXQUOTAS; i++) {
+ dq = ip->i_dquot[i];
+ if (dq == NODQUOT)
+ continue;
+ if (bp[i])
+ brelse(bp[i], 0);
+ mutex_exit(&dq->dq_interlock);
+ }
+ return 0;
+ }
+ if (change < 0) {
+ for (i = 0; i < MAXQUOTAS; i++) {
+ dq = ip->i_dquot[i];
+ if (dq == NODQUOT)
+ continue;
+ if (q2e[i] == NULL) {
+ mutex_exit(&dq->dq_interlock);
+ continue;
+ }
+ q2vp = &q2e[i]->q2e_val[vtype];
+ ncurblks = ufs_rw64(q2vp->q2v_cur, needswap);
+ if (ncurblks < -change)
+ ncurblks = 0;
+ else
+ ncurblks += change;
+ q2vp->q2v_cur = ufs_rw64(ncurblks, needswap);
+ quota2_bwrite(mp, bp[i]);
+ mutex_exit(&dq->dq_interlock);
+ }
+ return 0;
+ }
+ /* see if the allocation is allowed */
+ for (i = 0; i < MAXQUOTAS; i++) {
+ struct quota2_val q2v;
+ int ql_stat;
+ dq = ip->i_dquot[i];
+ if (dq == NODQUOT)
+ continue;
+ KASSERT(q2e[i] != NULL);
+ quota2_ufs_rwq2v(&q2e[i]->q2e_val[vtype], &q2v, needswap);
+ ql_stat = quota2_check_limit(&q2v, change, time_second);
+
+ if ((flags & FORCE) == 0 &&
+ kauth_authorize_system(cred, KAUTH_SYSTEM_FS_QUOTA,
+ KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT,
+ KAUTH_ARG(i), KAUTH_ARG(vtype), NULL) != 0) {
+ /* enforce this limit */
+ switch(QL_STATUS(ql_stat)) {
+ case QL_S_DENY_HARD:
+ if ((dq->dq_flags & DQ_WARN(vtype)) == 0) {
+ uprintf("\n%s: write failed, %s %s "
+ "limit reached\n",
+ mp->mnt_stat.f_mntonname,
+ quotatypes[i], limnames[vtype]);
+ dq->dq_flags |= DQ_WARN(vtype);
+ }
+ error = EDQUOT;
+ break;
+ case QL_S_DENY_GRACE:
+ if ((dq->dq_flags & DQ_WARN(vtype)) == 0) {
+ uprintf("\n%s: write failed, %s %s "
+ "limit reached\n",
+ mp->mnt_stat.f_mntonname,
+ quotatypes[i], limnames[vtype]);
+ dq->dq_flags |= DQ_WARN(vtype);
+ }
+ error = EDQUOT;
+ break;
+ case QL_S_ALLOW_SOFT:
+ if ((dq->dq_flags & DQ_WARN(vtype)) == 0) {
+ uprintf("\n%s: warning, %s %s "
+ "quota exceeded\n",
+ mp->mnt_stat.f_mntonname,
+ quotatypes[i], limnames[vtype]);
+ dq->dq_flags |= DQ_WARN(vtype);
+ }
+ break;
+ }
+ }
+ /*
+ * Always do this; we don't know yet whether the allocation will
+ * succeed or not in the end. If we don't do the allocation,
+ * q2v_time will be ignored anyway.
+ */
+ if (ql_stat & QL_F_CROSS) {
+ q2v.q2v_time = time_second + q2v.q2v_grace;
+ quota2_ufs_rwq2v(&q2v, &q2e[i]->q2e_val[vtype],
+ needswap);
+ }
+ }
+
+ /* now do the allocation if allowed */
+ for (i = 0; i < MAXQUOTAS; i++) {
+ dq = ip->i_dquot[i];
+ if (dq == NODQUOT)
+ continue;
+ KASSERT(q2e[i] != NULL);
+ if (error == 0) {
+ q2vp = &q2e[i]->q2e_val[vtype];
+ ncurblks = ufs_rw64(q2vp->q2v_cur, needswap);
+ q2vp->q2v_cur = ufs_rw64(ncurblks + change, needswap);
+ quota2_bwrite(mp, bp[i]);
+ } else
+ brelse(bp[i], 0);
+ mutex_exit(&dq->dq_interlock);
+ }
+ return error;
+}
+
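+/*
+ * chkdq2/chkiq2: check and account a block-count or inode-count
+ * change through quota2_check().
+ */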
+int
+chkdq2(struct inode *ip, int64_t change, kauth_cred_t cred, int flags)
+{
+ return quota2_check(ip, QL_BLOCK, change, cred, flags);
+}
+
+int
+chkiq2(struct inode *ip, int32_t change, kauth_cred_t cred, int flags)
+{
+ return quota2_check(ip, QL_FILE, change, cred, flags);
+}
+
+int
+quota2_handle_cmd_set(struct ufsmount *ump, int type, int id,
+ int defaultq, prop_dictionary_t data)
+{
+ int error;
+ struct dquot *dq;
+ struct quota2_header *q2h;
+ struct quota2_entry q2e, *q2ep;
+ struct buf *bp;
+ const int needswap = UFS_MPNEEDSWAP(ump);
+
+ if (ump->um_quotas[type] == NULLVP)
+ return ENODEV;
+ error = UFS_WAPBL_BEGIN(ump->um_mountp);
+ if (error)
+ return error;
+
+ if (defaultq) {
+ mutex_enter(&dqlock);
+ error = getq2h(ump, type, &bp, &q2h, B_MODIFY);
+ if (error) {
+ mutex_exit(&dqlock);
+ goto out_wapbl;
+ }
+ quota2_ufs_rwq2e(&q2h->q2h_defentry, &q2e, needswap);
+ error = quota2_dict_update_q2e_limits(data, &q2e);
+ if (error) {
+ mutex_exit(&dqlock);
+ brelse(bp, 0);
+ goto out_wapbl;
+ }
+ quota2_ufs_rwq2e(&q2e, &q2h->q2h_defentry, needswap);
+ mutex_exit(&dqlock);
+ quota2_bwrite(ump->um_mountp, bp);
+ goto out_wapbl;
+ }
+
+ error = dqget(NULLVP, id, ump, type, &dq);
+ if (error)
+ goto out_wapbl;
+
+ mutex_enter(&dq->dq_interlock);
+ if (dq->dq2_lblkno == 0 && dq->dq2_blkoff == 0) {
+ /* need to allocate a new on-disk quota entry */
+ mutex_enter(&dqlock);
+ error = quota2_q2ealloc(ump, type, id, dq, &bp, &q2ep);
+ mutex_exit(&dqlock);
+ } else {
+ error = getq2e(ump, type, dq->dq2_lblkno, dq->dq2_blkoff,
+ &bp, &q2ep, B_MODIFY);
+ }
+ if (error)
+ goto out_il;
+
+ quota2_ufs_rwq2e(q2ep, &q2e, needswap);
+ error = quota2_dict_update_q2e_limits(data, &q2e);
+ if (error) {
+ brelse(bp, 0);
+ goto out_il;
+ }
+ quota2_ufs_rwq2e(&q2e, q2ep, needswap);
+ quota2_bwrite(ump->um_mountp, bp);
+
+out_il:
+ mutex_exit(&dq->dq_interlock);
+ dqrele(NULLVP, dq);
+out_wapbl:
+ UFS_WAPBL_END(ump->um_mountp);
+ return error;
+}
+
+struct dq2clear_callback {
+ uid_t id;
+ struct dquot *dq;
+ struct quota2_header *q2h;
+};
+
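+/*
+ * quota2_walk_list() callback: when the entry for c->id is found,
+ * unlink it from its hash chain, put it on the free list and clear
+ * the dquot's on-disk location.
+ */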
+static int
+dq2clear_callback(struct ufsmount *ump, uint64_t *offp, struct quota2_entry *q2e,
+ uint64_t off, void *v)
+{
+ struct dq2clear_callback *c = v;
+#ifdef FFS_EI
+ const int needswap = UFS_MPNEEDSWAP(ump);
+#endif
+ uint64_t myoff;
+
+ if (ufs_rw32(q2e->q2e_uid, needswap) == c->id) {
+ KASSERT(mutex_owned(&c->dq->dq_interlock));
+ c->dq->dq2_lblkno = 0;
+ c->dq->dq2_blkoff = 0;
+ myoff = *offp;
+ /* remove from hash list */
+ *offp = q2e->q2e_next;
+ /* add to free list */
+ q2e->q2e_next = c->q2h->q2h_free;
+ c->q2h->q2h_free = myoff;
+ return Q2WL_ABORT;
+ }
+ return 0;
+}
+int
+quota2_handle_cmd_clear(struct ufsmount *ump, int type, int id,
+ int defaultq, prop_dictionary_t data)
+{
+ int error, i;
+ struct dquot *dq;
+ struct quota2_header *q2h;
+ struct quota2_entry q2e, *q2ep;
+ struct buf *hbp, *bp;
+ u_long hash_mask;
+ struct dq2clear_callback c;
+
+ if (ump->um_quotas[type] == NULLVP)
+ return ENODEV;
+ if (defaultq)
+ return EOPNOTSUPP;
+
+ /* get the default entry before locking the entry's buffer */
+ mutex_enter(&dqlock);
+ error = getq2h(ump, type, &hbp, &q2h, 0);
+ if (error) {
+ mutex_exit(&dqlock);
+ return error;
+ }
+ /* we'll copy to another disk entry, so no need to swap */
+ memcpy(&q2e, &q2h->q2h_defentry, sizeof(q2e));
+ mutex_exit(&dqlock);
+ brelse(hbp, 0);
+
+ error = dqget(NULLVP, id, ump, type, &dq);
+ if (error)
+ return error;
+
+ mutex_enter(&dq->dq_interlock);
+ if (dq->dq2_lblkno == 0 && dq->dq2_blkoff == 0) {
+ /* already clear, nothing to do */
+ error = ENOENT;
+ goto out_il;
+ }
+ error = UFS_WAPBL_BEGIN(ump->um_mountp);
+ if (error)
+ goto out_dq;
+
+ error = getq2e(ump, type, dq->dq2_lblkno, dq->dq2_blkoff,
+ &bp, &q2ep, B_MODIFY);
+ if (error)
+ goto out_wapbl;
+
+ if (q2ep->q2e_val[QL_BLOCK].q2v_cur != 0 ||
+ q2ep->q2e_val[QL_FILE].q2v_cur != 0) {
+ /* can't free this entry; revert to default */
+ for (i = 0; i < N_QL; i++) {
+ q2ep->q2e_val[i].q2v_softlimit =
+ q2e.q2e_val[i].q2v_softlimit;
+ q2ep->q2e_val[i].q2v_hardlimit =
+ q2e.q2e_val[i].q2v_hardlimit;
+ q2ep->q2e_val[i].q2v_grace =
+ q2e.q2e_val[i].q2v_grace;
+ q2ep->q2e_val[i].q2v_time = 0;
+ }
+ quota2_bwrite(ump->um_mountp, bp);
+ goto out_wapbl;
+ }
+ /* we can free it. release bp so we can walk the list */
+ brelse(bp, 0);
+ mutex_enter(&dqlock);
+ error = getq2h(ump, type, &hbp, &q2h, 0);
+ if (error)
+ goto out_dqlock;
+
+ hash_mask = ((1 << q2h->q2h_hash_shift) - 1);
+ c.dq = dq;
+ c.id = id;
+ c.q2h = q2h;
+ error = quota2_walk_list(ump, hbp, type,
+ &q2h->q2h_entries[id & hash_mask], B_MODIFY, &c,
+ dq2clear_callback);
+
+ bwrite(hbp);
+
+out_dqlock:
+ mutex_exit(&dqlock);
+out_wapbl:
+ UFS_WAPBL_END(ump->um_mountp);
+out_il:
+ mutex_exit(&dq->dq_interlock);
+out_dq:
+ dqrele(NULLVP, dq);
+ return error;
+}
+
+static int
+quota2_array_add_q2e(struct ufsmount *ump, int type,
+ int id, prop_array_t replies)
+{
+ struct dquot *dq;
+ int error;
+ struct quota2_entry *q2ep, q2e;
+ struct buf *bp;
+ const int needswap = UFS_MPNEEDSWAP(ump);
+ prop_dictionary_t dict;
+
+ error = dqget(NULLVP, id, ump, type, &dq);
+ if (error)
+ return error;
+
+ mutex_enter(&dq->dq_interlock);
+ if (dq->dq2_lblkno == 0 && dq->dq2_blkoff == 0) {
+ mutex_exit(&dq->dq_interlock);
+ dqrele(NULLVP, dq);
+ return ENOENT;
+ }
+ error = getq2e(ump, type, dq->dq2_lblkno, dq->dq2_blkoff,
+ &bp, &q2ep, 0);
+ if (error) {
+ mutex_exit(&dq->dq_interlock);
+ dqrele(NULLVP, dq);
+ return error;
+ }
+ quota2_ufs_rwq2e(q2ep, &q2e, needswap);
+ brelse(bp, 0);
+ mutex_exit(&dq->dq_interlock);
+ dqrele(NULLVP, dq);
+ dict = q2etoprop(&q2e, 0);
+ if (dict == NULL)
+ return ENOMEM;
+ if (!prop_array_add_and_rel(replies, dict))
+ return ENOMEM;
+ return 0;
+}
+
+int
+quota2_handle_cmd_get(struct ufsmount *ump, int type, int id,
+ int defaultq, prop_array_t replies)
+{
+ int error;
+ struct quota2_header *q2h;
+ struct quota2_entry q2e;
+ struct buf *bp;
+ prop_dictionary_t dict;
+ const int needswap = UFS_MPNEEDSWAP(ump);
+
+ if (ump->um_quotas[type] == NULLVP)
+ return ENODEV;
+ if (defaultq) {
+ mutex_enter(&dqlock);
+ error = getq2h(ump, type, &bp, &q2h, 0);
+ if (error) {
+ mutex_exit(&dqlock);
+ return error;
+ }
+ quota2_ufs_rwq2e(&q2h->q2h_defentry, &q2e, needswap);
+ mutex_exit(&dqlock);
+ brelse(bp, 0);
+ dict = q2etoprop(&q2e, defaultq);
+ if (dict == NULL)
+ return ENOMEM;
+ if (!prop_array_add_and_rel(replies, dict))
+ return ENOMEM;
+ } else
+ error = quota2_array_add_q2e(ump, type, id, replies);
+
+ return error;
+}
+
+struct getuids {
+ long nuids; /* number of uids in array */
+ long size; /* size of array */
+ uid_t *uids; /* array of uids, dynamically allocated */
+};
+
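+/*
+ * quota2_walk_list() callback: append the uid of each entry to a
+ * dynamically grown array.
+ */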
+static int
+quota2_getuids_callback(struct ufsmount *ump, uint64_t *offp,
+ struct quota2_entry *q2ep, uint64_t off, void *v)
+{
+ struct getuids *gu = v;
+ uid_t *newuids;
+#ifdef FFS_EI
+ const int needswap = UFS_MPNEEDSWAP(ump);
+#endif
+
+ if (gu->nuids == gu->size) {
+ newuids = realloc(gu->uids, gu->size + PAGE_SIZE, M_TEMP,
+ M_WAITOK);
+ if (newuids == NULL) {
+ free(gu->uids, M_TEMP);
+ return ENOMEM;
+ }
+ gu->uids = newuids;
+ gu->size += (PAGE_SIZE / sizeof(uid_t));
+ }
+ gu->uids[gu->nuids] = ufs_rw32(q2ep->q2e_uid, needswap);
+ gu->nuids++;
+ return 0;
+}
+
+int
+quota2_handle_cmd_getall(struct ufsmount *ump, int type, prop_array_t replies)
+{
+ int error;
+ struct quota2_header *q2h;
+ struct quota2_entry q2e;
+ struct buf *hbp;
+ prop_dictionary_t dict;
+ uint64_t offset;
+ int i, j;
+ int quota2_hash_size;
+ const int needswap = UFS_MPNEEDSWAP(ump);
+ struct getuids gu;
+
+ if (ump->um_quotas[type] == NULLVP)
+ return ENODEV;
+ mutex_enter(&dqlock);
+ error = getq2h(ump, type, &hbp, &q2h, 0);
+ if (error) {
+ mutex_exit(&dqlock);
+ return error;
+ }
+ quota2_ufs_rwq2e(&q2h->q2h_defentry, &q2e, needswap);
+ dict = q2etoprop(&q2e, 1);
+ if (!prop_array_add_and_rel(replies, dict)) {
+ error = ENOMEM;
+ goto error_bp;
+ }
+ /*
+ * We can't directly get entries here, as we can't walk the list
+ * with dqlock held and grab dq_interlock to read the entries
+ * at the same time. So just walk the lists to build a list of uids,
+ * and then read the entries for those uids.
+ */
+ memset(&gu, 0, sizeof(gu));
+ quota2_hash_size = ufs_rw16(q2h->q2h_hash_size, needswap);
+ for (i = 0; i < quota2_hash_size ; i++) {
+ offset = q2h->q2h_entries[i];
+ error = quota2_walk_list(ump, hbp, type, &offset, 0, &gu,
+ quota2_getuids_callback);
+ if (error) {
+ if (gu.uids != NULL)
+ free(gu.uids, M_TEMP);
+ break;
+ }
+ }
+error_bp:
+ mutex_exit(&dqlock);
+ brelse(hbp, 0);
+ if (error)
+ return error;
+ for (j = 0; j < gu.nuids; j++) {
+ error = quota2_array_add_q2e(ump, type,
+ gu.uids[j], replies);
+ if (error && error != ENOENT)
+ break;
+ }
+ free(gu.uids, M_TEMP);
+ return error;
+}
+
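+/*
+ * Q_SYNC for quota2: entries are written back as they are modified,
+ * so there is nothing to do here.
+ */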
+int
+q2sync(struct mount *mp)
+{
+ return 0;
+}
+
+struct dq2get_callback {
+ uid_t id;
+ struct dquot *dq;
+};
+
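+/*
+ * quota2_walk_list() callback: record the on-disk location of the
+ * entry matching c->id in the dquot.
+ */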
+static int
+dq2get_callback(struct ufsmount *ump, uint64_t *offp, struct quota2_entry *q2e,
+ uint64_t off, void *v)
+{
+ struct dq2get_callback *c = v;
+ daddr_t lblkno;
+ int blkoff;
+#ifdef FFS_EI
+ const int needswap = UFS_MPNEEDSWAP(ump);
+#endif
+
+ if (ufs_rw32(q2e->q2e_uid, needswap) == c->id) {
+ KASSERT(mutex_owned(&c->dq->dq_interlock));
+ lblkno = (off >> ump->um_mountp->mnt_fs_bshift);
+ blkoff = (off & ump->umq2_bmask);
+ c->dq->dq2_lblkno = lblkno;
+ c->dq->dq2_blkoff = blkoff;
+ return Q2WL_ABORT;
+ }
+ return 0;
+}
+
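+/*
+ * Locate the on-disk quota2 entry for the given id by walking its
+ * hash chain; the entry's location is recorded in the dquot.
+ */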
+int
+dq2get(struct vnode *dqvp, u_long id, struct ufsmount *ump, int type,
+ struct dquot *dq)
+{
+ struct buf *bp;
+ struct quota2_header *q2h;
+ int error;
+ daddr_t offset;
+ u_long hash_mask;
+ struct dq2get_callback c = {
+ .id = id,
+ .dq = dq
+ };
+
+ KASSERT(mutex_owned(&dq->dq_interlock));
+ mutex_enter(&dqlock);
+ error = getq2h(ump, type, &bp, &q2h, 0);
+ if (error)
+ goto out_mutex;
+ /* look for our entry */
+ hash_mask = ((1 << q2h->q2h_hash_shift) - 1);
+ offset = q2h->q2h_entries[id & hash_mask];
+ error = quota2_walk_list(ump, bp, type, &offset, 0, (void *)&c,
+ dq2get_callback);
+ brelse(bp, 0);
+out_mutex:
+ mutex_exit(&dqlock);
+ return error;
+}
+
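+/*
+ * Nothing to do: modified quota2 entries are written back through the
+ * buffer cache as they are changed.
+ */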
+int
+dq2sync(struct vnode *vp, struct dquot *dq)
+{
+ return 0;
+}
--- /dev/null
+/* $NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $ */
+
+/*-
+ * Copyright (c) 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $");
+
+#ifdef LFS_READWRITE
+#define FS struct lfs
+#define I_FS i_lfs
+#define READ lfs_read
+#define READ_S "lfs_read"
+#define WRITE lfs_write
+#define WRITE_S "lfs_write"
+#define fs_bsize lfs_bsize
+#define fs_bmask lfs_bmask
+#define UFS_WAPBL_BEGIN(mp) 0
+#define UFS_WAPBL_END(mp) do { } while (0)
+#define UFS_WAPBL_UPDATE(vp, access, modify, flags) do { } while (0)
+#else
+#define FS struct fs
+#define I_FS i_fs
+#define READ ffs_read
+#define READ_S "ffs_read"
+#define WRITE ffs_write
+#define WRITE_S "ffs_write"
+#endif
+
+/*
+ * Vnode op for reading.
+ */
+/* ARGSUSED */
+int
+READ(void *v)
+{
+ struct vop_read_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct inode *ip;
+ struct uio *uio;
+ struct ufsmount *ump;
+ struct buf *bp;
+ FS *fs;
+ vsize_t bytelen;
+ daddr_t lbn, nextlbn;
+ off_t bytesinfile;
+ long size, xfersize, blkoffset;
+ int error, ioflag;
+ bool usepc = false;
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ ump = ip->i_ump;
+ uio = ap->a_uio;
+ ioflag = ap->a_ioflag;
+ error = 0;
+
+#ifdef DIAGNOSTIC
+ if (uio->uio_rw != UIO_READ)
+ panic("%s: mode", READ_S);
+
+ if (vp->v_type == VLNK) {
+ if (ip->i_size < ump->um_maxsymlinklen ||
+ (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0))
+ panic("%s: short symlink", READ_S);
+ } else if (vp->v_type != VREG && vp->v_type != VDIR)
+ panic("%s: type %d", READ_S, vp->v_type);
+#endif
+ fs = ip->I_FS;
+ if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize)
+ return (EFBIG);
+ if (uio->uio_resid == 0)
+ return (0);
+
+#ifndef LFS_READWRITE
+ if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT)
+ return ffs_snapshot_read(vp, uio, ioflag);
+#endif /* !LFS_READWRITE */
+
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+
+ if (uio->uio_offset >= ip->i_size)
+ goto out;
+
+#ifdef LFS_READWRITE
+ usepc = (vp->v_type == VREG && ip->i_number != LFS_IFILE_INUM);
+#else /* !LFS_READWRITE */
+ usepc = vp->v_type == VREG;
+#endif /* !LFS_READWRITE */
+ if (usepc) {
+ const int advice = IO_ADV_DECODE(ap->a_ioflag);
+
+ while (uio->uio_resid > 0) {
+ if (ioflag & IO_DIRECT) {
+ genfs_directio(vp, uio, ioflag);
+ }
+ bytelen = MIN(ip->i_size - uio->uio_offset,
+ uio->uio_resid);
+ if (bytelen == 0)
+ break;
+ error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
+ UBC_READ | UBC_PARTIALOK | UBC_UNMAP_FLAG(vp));
+ if (error)
+ break;
+ }
+ goto out;
+ }
+
+ for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
+ bytesinfile = ip->i_size - uio->uio_offset;
+ if (bytesinfile <= 0)
+ break;
+ lbn = lblkno(fs, uio->uio_offset);
+ nextlbn = lbn + 1;
+ size = blksize(fs, ip, lbn);
+ blkoffset = blkoff(fs, uio->uio_offset);
+ xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
+ bytesinfile);
+
+ if (lblktosize(fs, nextlbn) >= ip->i_size)
+ error = bread(vp, lbn, size, NOCRED, 0, &bp);
+ else {
+ int nextsize = blksize(fs, ip, nextlbn);
+ error = breadn(vp, lbn,
+ size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
+ }
+ if (error)
+ break;
+
+ /*
+ * We should only get non-zero b_resid when an I/O error
+ * has occurred, which should cause us to break above.
+ * However, if the short read did not cause an error,
+ * then we want to ensure that we do not uiomove bad
+ * or uninitialized data.
+ */
+ size -= bp->b_resid;
+ if (size < xfersize) {
+ if (size == 0)
+ break;
+ xfersize = size;
+ }
+ error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+ if (error)
+ break;
+ brelse(bp, 0);
+ }
+ if (bp != NULL)
+ brelse(bp, 0);
+
+ out:
+ if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
+ ip->i_flag |= IN_ACCESS;
+ if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) {
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error) {
+ fstrans_done(vp->v_mount);
+ return error;
+ }
+ error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
+ UFS_WAPBL_END(vp->v_mount);
+ }
+ }
+
+ fstrans_done(vp->v_mount);
+ return (error);
+}
+
+/*
+ * Vnode op for writing.
+ */
+int
+WRITE(void *v)
+{
+ struct vop_write_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct uio *uio;
+ struct inode *ip;
+ FS *fs;
+ struct buf *bp;
+ kauth_cred_t cred;
+ daddr_t lbn;
+ off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
+ int blkoffset, error, flags, ioflag, resid, size, xfersize;
+ int aflag;
+ int extended=0;
+ vsize_t bytelen;
+ bool async;
+ bool usepc = false;
+#ifdef LFS_READWRITE
+ bool need_unreserve = false;
+#endif
+ struct ufsmount *ump;
+
+ cred = ap->a_cred;
+ ioflag = ap->a_ioflag;
+ uio = ap->a_uio;
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ ump = ip->i_ump;
+
+ KASSERT(vp->v_size == ip->i_size);
+#ifdef DIAGNOSTIC
+ if (uio->uio_rw != UIO_WRITE)
+ panic("%s: mode", WRITE_S);
+#endif
+
+ switch (vp->v_type) {
+ case VREG:
+ if (ioflag & IO_APPEND)
+ uio->uio_offset = ip->i_size;
+ if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
+ return (EPERM);
+ /* FALLTHROUGH */
+ case VLNK:
+ break;
+ case VDIR:
+ if ((ioflag & IO_SYNC) == 0)
+ panic("%s: nonsync dir write", WRITE_S);
+ break;
+ default:
+ panic("%s: type", WRITE_S);
+ }
+
+ fs = ip->I_FS;
+ if (uio->uio_offset < 0 ||
+ (u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize)
+ return (EFBIG);
+#ifdef LFS_READWRITE
+ /* Disallow writes to the Ifile, even if noschg flag is removed */
+ /* XXX can this go away when the Ifile is no longer in the namespace? */
+ if (vp == fs->lfs_ivnode)
+ return (EPERM);
+#endif
+ if (uio->uio_resid == 0)
+ return (0);
+
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+
+ flags = ioflag & IO_SYNC ? B_SYNC : 0;
+ async = vp->v_mount->mnt_flag & MNT_ASYNC;
+ origoff = uio->uio_offset;
+ resid = uio->uio_resid;
+ osize = ip->i_size;
+ error = 0;
+
+ usepc = vp->v_type == VREG;
+
+ if ((ioflag & IO_JOURNALLOCKED) == 0) {
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error) {
+ fstrans_done(vp->v_mount);
+ return error;
+ }
+ }
+
+#ifdef LFS_READWRITE
+ async = true;
+ lfs_check(vp, LFS_UNUSED_LBN, 0);
+#endif /* LFS_READWRITE */
+ if (!usepc)
+ goto bcache;
+
+ preallocoff = round_page(blkroundup(fs, MAX(osize, uio->uio_offset)));
+ aflag = ioflag & IO_SYNC ? B_SYNC : 0;
+ nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
+ endallocoff = nsize - blkoff(fs, nsize);
+
+ /*
+ * if we're increasing the file size, deal with expanding
+ * the fragment if there is one.
+ */
+
+ if (nsize > osize && lblkno(fs, osize) < NDADDR &&
+ lblkno(fs, osize) != lblkno(fs, nsize) &&
+ blkroundup(fs, osize) != osize) {
+ off_t eob;
+
+ eob = blkroundup(fs, osize);
+ uvm_vnp_setwritesize(vp, eob);
+ error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag);
+ if (error)
+ goto out;
+ if (flags & B_SYNC) {
+ mutex_enter(vp->v_interlock);
+ VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask),
+ round_page(eob),
+ PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
+ }
+ }
+
+ while (uio->uio_resid > 0) {
+ int ubc_flags = UBC_WRITE;
+ bool overwrite; /* if we're overwriting a whole block */
+ off_t newoff;
+
+ if (ioflag & IO_DIRECT) {
+ genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED);
+ }
+
+ oldoff = uio->uio_offset;
+ blkoffset = blkoff(fs, uio->uio_offset);
+ bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
+ if (bytelen == 0) {
+ break;
+ }
+
+ /*
+ * if we're filling in a hole, allocate the blocks now and
+ * initialize the pages first. if we're extending the file,
+ * we can safely allocate blocks without initializing pages
+ * since the new blocks will be inaccessible until the write
+ * is complete.
+ */
+ overwrite = uio->uio_offset >= preallocoff &&
+ uio->uio_offset < endallocoff;
+ if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 &&
+ blkoff(fs, uio->uio_offset) == 0 &&
+ (uio->uio_offset & PAGE_MASK) == 0) {
+ vsize_t len;
+
+ len = trunc_page(bytelen);
+ len -= blkoff(fs, len);
+ if (len > 0) {
+ overwrite = true;
+ bytelen = len;
+ }
+ }
+
+ newoff = oldoff + bytelen;
+ if (vp->v_size < newoff) {
+ uvm_vnp_setwritesize(vp, newoff);
+ }
+
+ if (!overwrite) {
+ error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
+ cred, aflag);
+ if (error)
+ break;
+ } else {
+ genfs_node_wrlock(vp);
+ error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
+ aflag, cred);
+ genfs_node_unlock(vp);
+ if (error)
+ break;
+ ubc_flags |= UBC_FAULTBUSY;
+ }
+
+ /*
+ * copy the data.
+ */
+
+ error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
+ IO_ADV_DECODE(ioflag), ubc_flags | UBC_UNMAP_FLAG(vp));
+
+ /*
+ * update UVM's notion of the size now that we've
+ * copied the data into the vnode's pages.
+ *
+ * we should update the size even when uiomove failed.
+ */
+
+ if (vp->v_size < newoff) {
+ uvm_vnp_setsize(vp, newoff);
+ extended = 1;
+ }
+
+ if (error)
+ break;
+
+ /*
+ * flush what we just wrote if necessary.
+ * XXXUBC simplistic async flushing.
+ */
+
+#ifndef LFS_READWRITE
+ if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
+ (uio->uio_offset >> 16) << 16,
+ PGO_CLEANIT | PGO_JOURNALLOCKED);
+ if (error)
+ break;
+ }
+#endif
+ }
+ if (error == 0 && ioflag & IO_SYNC) {
+ mutex_enter(vp->v_interlock);
+ error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask),
+ round_page(blkroundup(fs, uio->uio_offset)),
+ PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
+ }
+ goto out;
+
+ bcache:
+ mutex_enter(vp->v_interlock);
+ VOP_PUTPAGES(vp, trunc_page(origoff), round_page(origoff + resid),
+ PGO_CLEANIT | PGO_FREE | PGO_SYNCIO | PGO_JOURNALLOCKED);
+ while (uio->uio_resid > 0) {
+ lbn = lblkno(fs, uio->uio_offset);
+ blkoffset = blkoff(fs, uio->uio_offset);
+ xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
+ if (fs->fs_bsize > xfersize)
+ flags |= B_CLRBUF;
+ else
+ flags &= ~B_CLRBUF;
+
+#ifdef LFS_READWRITE
+ error = lfs_reserve(fs, vp, NULL,
+ btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
+ if (error)
+ break;
+ need_unreserve = true;
+#endif
+ error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
+ ap->a_cred, flags, &bp);
+
+ if (error)
+ break;
+ if (uio->uio_offset + xfersize > ip->i_size) {
+ ip->i_size = uio->uio_offset + xfersize;
+ DIP_ASSIGN(ip, size, ip->i_size);
+ uvm_vnp_setsize(vp, ip->i_size);
+ extended = 1;
+ }
+ size = blksize(fs, ip, lbn) - bp->b_resid;
+ if (xfersize > size)
+ xfersize = size;
+
+ error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+
+ /*
+ * if we didn't clear the block and the uiomove failed,
+ * the buf will now contain part of some other file,
+ * so we need to invalidate it.
+ */
+ if (error && (flags & B_CLRBUF) == 0) {
+ brelse(bp, BC_INVAL);
+ break;
+ }
+#ifdef LFS_READWRITE
+ (void)VOP_BWRITE(bp->b_vp, bp);
+ lfs_reserve(fs, vp, NULL,
+ -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
+ need_unreserve = false;
+#else
+ if (ioflag & IO_SYNC)
+ (void)bwrite(bp);
+ else if (xfersize + blkoffset == fs->fs_bsize)
+ bawrite(bp);
+ else
+ bdwrite(bp);
+#endif
+ if (error || xfersize == 0)
+ break;
+ }
+#ifdef LFS_READWRITE
+ if (need_unreserve) {
+ lfs_reserve(fs, vp, NULL,
+ -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
+ }
+#endif
+
+ /*
+ * If we successfully wrote any data and we are not the superuser,
+ * we clear the setuid and setgid bits as a precaution against
+ * tampering.
+ */
+out:
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (vp->v_mount->mnt_flag & MNT_RELATIME)
+ ip->i_flag |= IN_ACCESS;
+ if (resid > uio->uio_resid && ap->a_cred &&
+ kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
+ ip->i_mode &= ~(ISUID | ISGID);
+ DIP_ASSIGN(ip, mode, ip->i_mode);
+ }
+ if (resid > uio->uio_resid)
+ VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
+ if (error) {
+ (void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred);
+ uio->uio_offset -= resid - uio->uio_resid;
+ uio->uio_resid = resid;
+ } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
+ error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
+ else
+ UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+ KASSERT(vp->v_size == ip->i_size);
+ if ((ioflag & IO_JOURNALLOCKED) == 0)
+ UFS_WAPBL_END(vp->v_mount);
+ fstrans_done(vp->v_mount);
+
+ return (error);
+}
--- /dev/null
+/* $NetBSD: ufs_vfsops.c,v 1.42 2011/03/24 17:05:46 bouyer Exp $ */
+
+/*
+ * Copyright (c) 1991, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_vfsops.c 8.8 (Berkeley) 5/20/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_vfsops.c,v 1.42 2011/03/24 17:05:46 bouyer Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/mbuf.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/kauth.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#ifdef UFS_DIRHASH
+#include <ufs/ufs/dirhash.h>
+#endif
+#include <quota/quotaprop.h>
+
+/* how many times ufs_init() was called */
+static int ufs_initcount = 0;
+
+pool_cache_t ufs_direct_cache;
+
+/*
+ * Make a filesystem operational.
+ * Nothing to do at the moment.
+ */
+/* ARGSUSED */
+int
+ufs_start(struct mount *mp, int flags)
+{
+
+ return (0);
+}
+
+/*
+ * Return the root of a filesystem.
+ */
+int
+ufs_root(struct mount *mp, struct vnode **vpp)
+{
+ struct vnode *nvp;
+ int error;
+
+ if ((error = VFS_VGET(mp, (ino_t)ROOTINO, &nvp)) != 0)
+ return (error);
+ *vpp = nvp;
+ return (0);
+}
+
+/*
+ * Do operations associated with quotas
+ */
+int
+ufs_quotactl(struct mount *mp, prop_dictionary_t dict)
+{
+ struct lwp *l = curlwp;
+
+#if !defined(QUOTA) && !defined(QUOTA2)
+ (void) mp;
+ (void) dict;
+ (void) l;
+ return (EOPNOTSUPP);
+#else
+ int error;
+ prop_dictionary_t cmddict;
+ prop_array_t commands;
+ prop_object_iterator_t iter;
+
+ /* Mark the mount busy, as we're passing it to kauth(9). */
+ error = vfs_busy(mp, NULL);
+ if (error)
+ return (error);
+
+ error = quota_get_cmds(dict, &commands);
+ if (error)
+ goto out_vfs;
+ iter = prop_array_iterator(commands);
+ if (iter == NULL) {
+ error = ENOMEM;
+ goto out_vfs;
+ }
+
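+	/*
+	 * Walk the array of per-command dictionaries and handle each one
+	 * while holding mnt_updating; bail out on the first error.
+	 */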
+ mutex_enter(&mp->mnt_updating);
+ while ((cmddict = prop_object_iterator_next(iter)) != NULL) {
+ if (prop_object_type(cmddict) != PROP_TYPE_DICTIONARY)
+ continue;
+ error = quota_handle_cmd(mp, l, cmddict);
+ if (error)
+ break;
+ }
+ prop_object_iterator_release(iter);
+ mutex_exit(&mp->mnt_updating);
+out_vfs:
+ vfs_unbusy(mp, false, NULL);
+ return (error);
+#endif
+}
+
+#if 0
+ switch (cmd) {
+ case Q_SYNC:
+ break;
+
+ case Q_GETQUOTA:
+ /* The user can always query about his own quota. */
+ if (uid == kauth_cred_getuid(l->l_cred))
+ break;
+
+ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+ KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, KAUTH_ARG(uid), NULL);
+
+ break;
+
+ case Q_QUOTAON:
+ case Q_QUOTAOFF:
+ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+ KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL);
+
+ break;
+
+ case Q_SETQUOTA:
+ case Q_SETUSE:
+ error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+ KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(uid), NULL);
+
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ type = cmds & SUBCMDMASK;
+ if (!error) {
+ /* Only check if there was no error above. */
+ if ((u_int)type >= MAXQUOTAS)
+ error = EINVAL;
+ }
+
+ if (error) {
+ vfs_unbusy(mp, false, NULL);
+ return (error);
+ }
+
+ mutex_enter(&mp->mnt_updating);
+ switch (cmd) {
+
+ case Q_QUOTAON:
+ error = quotaon(l, mp, type, arg);
+ break;
+
+ case Q_QUOTAOFF:
+ error = quotaoff(l, mp, type);
+ break;
+
+ case Q_SETQUOTA:
+ error = setquota(mp, uid, type, arg);
+ break;
+
+ case Q_SETUSE:
+ error = setuse(mp, uid, type, arg);
+ break;
+
+ case Q_GETQUOTA:
+ error = getquota(mp, uid, type, arg);
+ break;
+
+ case Q_SYNC:
+ error = qsync(mp);
+ break;
+
+ default:
+ error = EINVAL;
+ }
+ mutex_exit(&mp->mnt_updating);
+ vfs_unbusy(mp, false, NULL);
+ return (error);
+#endif
+
+/*
+ * This is the generic part of fhtovp called after the underlying
+ * filesystem has validated the file handle.
+ */
+int
+ufs_fhtovp(struct mount *mp, struct ufid *ufhp, struct vnode **vpp)
+{
+ struct vnode *nvp;
+ struct inode *ip;
+ int error;
+
+ if ((error = VFS_VGET(mp, ufhp->ufid_ino, &nvp)) != 0) {
+ *vpp = NULLVP;
+ return (error);
+ }
+ ip = VTOI(nvp);
+ if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen) {
+ vput(nvp);
+ *vpp = NULLVP;
+ return (ESTALE);
+ }
+ *vpp = nvp;
+ return (0);
+}
+
+/*
+ * Initialize UFS filesystems, done only once.
+ */
+void
+ufs_init(void)
+{
+ if (ufs_initcount++ > 0)
+ return;
+
+ ufs_direct_cache = pool_cache_init(sizeof(struct direct), 0, 0, 0,
+ "ufsdir", NULL, IPL_NONE, NULL, NULL, NULL);
+
+ ufs_ihashinit();
+#if defined(QUOTA) || defined(QUOTA2)
+ dqinit();
+#endif
+#ifdef UFS_DIRHASH
+ ufsdirhash_init();
+#endif
+#ifdef UFS_EXTATTR
+ ufs_extattr_init();
+#endif
+}
+
+void
+ufs_reinit(void)
+{
+ ufs_ihashreinit();
+#if defined(QUOTA) || defined(QUOTA2)
+ dqreinit();
+#endif
+}
+
+/*
+ * Free UFS filesystem resources, done only once.
+ */
+void
+ufs_done(void)
+{
+ if (--ufs_initcount > 0)
+ return;
+
+ ufs_ihashdone();
+#if defined(QUOTA) || defined(QUOTA2)
+ dqdone();
+#endif
+ pool_cache_destroy(ufs_direct_cache);
+#ifdef UFS_DIRHASH
+ ufsdirhash_done();
+#endif
+#ifdef UFS_EXTATTR
+ ufs_extattr_done();
+#endif
+}
--- /dev/null
+/* $NetBSD: ufs_vnops.c,v 1.206 2011/11/18 21:18:52 christos Exp $ */
+
+/*-
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_vnops.c 8.28 (Berkeley) 7/31/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.206 2011/11/18 21:18:52 christos Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/specfs/specdev.h>
+#include <miscfs/fifofs/fifo.h>
+#include <miscfs/genfs/genfs.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
+#ifdef UFS_DIRHASH
+#include <ufs/ufs/dirhash.h>
+#endif
+#include <ufs/ext2fs/ext2fs_extern.h>
+#include <ufs/ext2fs/ext2fs_dir.h>
+#include <ufs/ffs/ffs_extern.h>
+#include <ufs/lfs/lfs_extern.h>
+#include <ufs/lfs/lfs.h>
+
+#include <uvm/uvm.h>
+
+__CTASSERT(EXT2FS_MAXNAMLEN == FFS_MAXNAMLEN);
+__CTASSERT(LFS_MAXNAMLEN == FFS_MAXNAMLEN);
+
+static int ufs_chmod(struct vnode *, int, kauth_cred_t, struct lwp *);
+static int ufs_chown(struct vnode *, uid_t, gid_t, kauth_cred_t,
+ struct lwp *);
+
+/*
+ * A virgin directory (no blushing please).
+ */
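+/*
+ * Each line below supplies ino, reclen, type, namlen and name for one
+ * entry ("." first, then ".."); the inode numbers and the ".." record
+ * length are filled in when the directory is created (see ufs_mkdir).
+ */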
+static const struct dirtemplate mastertemplate = {
+ 0, 12, DT_DIR, 1, ".",
+ 0, DIRBLKSIZ - 12, DT_DIR, 2, ".."
+};
+
+/*
+ * Create a regular file
+ */
+int
+ufs_create(void *v)
+{
+ struct vop_create_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap = v;
+ int error;
+ struct vnode *dvp = ap->a_dvp;
+ struct ufs_lookup_results *ulr;
+
+ /* XXX should handle this material another way */
+ ulr = &VTOI(dvp)->i_crap;
+ UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+ /*
+ * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful
+ * ufs_makeinode
+ */
+ fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+ error =
+ ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
+ dvp, ulr, ap->a_vpp, ap->a_cnp);
+ if (error) {
+ fstrans_done(dvp->v_mount);
+ return (error);
+ }
+ UFS_WAPBL_END1(dvp->v_mount, dvp);
+ fstrans_done(dvp->v_mount);
+ VN_KNOTE(dvp, NOTE_WRITE);
+ return (0);
+}
+
+/*
+ * Mknod vnode call
+ */
+/* ARGSUSED */
+int
+ufs_mknod(void *v)
+{
+ struct vop_mknod_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap = v;
+ struct vattr *vap;
+ struct vnode **vpp;
+ struct inode *ip;
+ int error;
+ struct mount *mp;
+ ino_t ino;
+ struct ufs_lookup_results *ulr;
+
+ vap = ap->a_vap;
+ vpp = ap->a_vpp;
+
+ /* XXX should handle this material another way */
+ ulr = &VTOI(ap->a_dvp)->i_crap;
+ UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp));
+
+ /*
+ * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful
+ * ufs_makeinode
+ */
+ fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED);
+ if ((error =
+ ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
+ ap->a_dvp, ulr, vpp, ap->a_cnp)) != 0)
+ goto out;
+ VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+ ip = VTOI(*vpp);
+ mp = (*vpp)->v_mount;
+ ino = ip->i_number;
+ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+ if (vap->va_rdev != VNOVAL) {
+ struct ufsmount *ump = ip->i_ump;
+ /*
+ * Want to be able to use this to make badblock
+ * inodes, so don't truncate the dev number.
+ */
+ if (ump->um_fstype == UFS1)
+ ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev,
+ UFS_MPNEEDSWAP(ump));
+ else
+ ip->i_ffs2_rdev = ufs_rw64(vap->va_rdev,
+ UFS_MPNEEDSWAP(ump));
+ }
+ UFS_WAPBL_UPDATE(*vpp, NULL, NULL, 0);
+ UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp);
+ /*
+ * Remove inode so that it will be reloaded by VFS_VGET and
+ * checked to see if it is an alias of an existing entry in
+ * the inode cache.
+ */
+ (*vpp)->v_type = VNON;
+ VOP_UNLOCK(*vpp);
+ vgone(*vpp);
+ error = VFS_VGET(mp, ino, vpp);
+out:
+ fstrans_done(ap->a_dvp->v_mount);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+ return (0);
+}
+
+/*
+ * Open called.
+ *
+ * Nothing to do.
+ */
+/* ARGSUSED */
+int
+ufs_open(void *v)
+{
+ struct vop_open_args /* {
+ struct vnode *a_vp;
+ int a_mode;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+
+ /*
+ * Files marked append-only must be opened for appending.
+ */
+ if ((VTOI(ap->a_vp)->i_flags & APPEND) &&
+ (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
+ return (EPERM);
+ return (0);
+}
+
+/*
+ * Close called.
+ *
+ * Update the times on the inode.
+ */
+/* ARGSUSED */
+int
+ufs_close(void *v)
+{
+ struct vop_close_args /* {
+ struct vnode *a_vp;
+ int a_fflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct inode *ip;
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+ if (vp->v_usecount > 1)
+ UFS_ITIMES(vp, NULL, NULL, NULL);
+ fstrans_done(vp->v_mount);
+ return (0);
+}
+
+static int
+ufs_check_possible(struct vnode *vp, struct inode *ip, mode_t mode,
+ kauth_cred_t cred)
+{
+#if defined(QUOTA) || defined(QUOTA2)
+ int error;
+#endif
+
+ /*
+ * Disallow write attempts on read-only file systems;
+ * unless the file is a socket, fifo, or a block or
+ * character device resident on the file system.
+ */
+ if (mode & VWRITE) {
+ switch (vp->v_type) {
+ case VDIR:
+ case VLNK:
+ case VREG:
+ if (vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (EROFS);
+#if defined(QUOTA) || defined(QUOTA2)
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+ error = chkdq(ip, 0, cred, 0);
+ fstrans_done(vp->v_mount);
+ if (error != 0)
+ return error;
+#endif
+ break;
+ case VBAD:
+ case VBLK:
+ case VCHR:
+ case VSOCK:
+ case VFIFO:
+ case VNON:
+ default:
+ break;
+ }
+ }
+
+ /* If it is a snapshot, nobody gets access to it. */
+ if ((ip->i_flags & SF_SNAPSHOT))
+ return (EPERM);
+ /* If immutable bit set, nobody gets to write it. */
+ if ((mode & VWRITE) && (ip->i_flags & IMMUTABLE))
+ return (EPERM);
+
+ return 0;
+}
+
+static int
+ufs_check_permitted(struct vnode *vp, struct inode *ip, mode_t mode,
+ kauth_cred_t cred)
+{
+
+ return genfs_can_access(vp->v_type, ip->i_mode & ALLPERMS, ip->i_uid,
+ ip->i_gid, mode, cred);
+}
+
+int
+ufs_access(void *v)
+{
+ struct vop_access_args /* {
+ struct vnode *a_vp;
+ int a_mode;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct inode *ip;
+ mode_t mode;
+ int error;
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ mode = ap->a_mode;
+
+ error = ufs_check_possible(vp, ip, mode, ap->a_cred);
+ if (error)
+ return error;
+
+ error = ufs_check_permitted(vp, ip, mode, ap->a_cred);
+
+ return error;
+}
+
+/* ARGSUSED */
+int
+ufs_getattr(void *v)
+{
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct inode *ip;
+ struct vattr *vap;
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ vap = ap->a_vap;
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+ UFS_ITIMES(vp, NULL, NULL, NULL);
+
+ /*
+ * Copy from inode table
+ */
+ vap->va_fsid = ip->i_dev;
+ vap->va_fileid = ip->i_number;
+ vap->va_mode = ip->i_mode & ALLPERMS;
+ vap->va_nlink = ip->i_nlink;
+ vap->va_uid = ip->i_uid;
+ vap->va_gid = ip->i_gid;
+ vap->va_size = vp->v_size;
+ if (ip->i_ump->um_fstype == UFS1) {
+ vap->va_rdev = (dev_t)ufs_rw32(ip->i_ffs1_rdev,
+ UFS_MPNEEDSWAP(ip->i_ump));
+ vap->va_atime.tv_sec = ip->i_ffs1_atime;
+ vap->va_atime.tv_nsec = ip->i_ffs1_atimensec;
+ vap->va_mtime.tv_sec = ip->i_ffs1_mtime;
+ vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec;
+ vap->va_ctime.tv_sec = ip->i_ffs1_ctime;
+ vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec;
+ vap->va_birthtime.tv_sec = 0;
+ vap->va_birthtime.tv_nsec = 0;
+ vap->va_bytes = dbtob((u_quad_t)ip->i_ffs1_blocks);
+ } else {
+ vap->va_rdev = (dev_t)ufs_rw64(ip->i_ffs2_rdev,
+ UFS_MPNEEDSWAP(ip->i_ump));
+ vap->va_atime.tv_sec = ip->i_ffs2_atime;
+ vap->va_atime.tv_nsec = ip->i_ffs2_atimensec;
+ vap->va_mtime.tv_sec = ip->i_ffs2_mtime;
+ vap->va_mtime.tv_nsec = ip->i_ffs2_mtimensec;
+ vap->va_ctime.tv_sec = ip->i_ffs2_ctime;
+ vap->va_ctime.tv_nsec = ip->i_ffs2_ctimensec;
+ vap->va_birthtime.tv_sec = ip->i_ffs2_birthtime;
+ vap->va_birthtime.tv_nsec = ip->i_ffs2_birthnsec;
+ vap->va_bytes = dbtob(ip->i_ffs2_blocks);
+ }
+ vap->va_gen = ip->i_gen;
+ vap->va_flags = ip->i_flags;
+
+ /* this doesn't belong here */
+ if (vp->v_type == VBLK)
+ vap->va_blocksize = BLKDEV_IOSIZE;
+ else if (vp->v_type == VCHR)
+ vap->va_blocksize = MAXBSIZE;
+ else
+ vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
+ vap->va_type = vp->v_type;
+ vap->va_filerev = ip->i_modrev;
+ fstrans_done(vp->v_mount);
+ return (0);
+}
+
+/*
+ * Set attribute vnode op. called from several syscalls
+ */
+int
+ufs_setattr(void *v)
+{
+ struct vop_setattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vattr *vap;
+ struct vnode *vp;
+ struct inode *ip;
+ kauth_cred_t cred;
+ struct lwp *l;
+ int error;
+
+ vap = ap->a_vap;
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ cred = ap->a_cred;
+ l = curlwp;
+
+ /*
+ * Check for unsettable attributes.
+ */
+ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
+ (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
+ (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
+ ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
+ return (EINVAL);
+ }
+
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+
+ if (vap->va_flags != VNOVAL) {
+ if (vp->v_mount->mnt_flag & MNT_RDONLY) {
+ error = EROFS;
+ goto out;
+ }
+ if (kauth_cred_geteuid(cred) != ip->i_uid &&
+ (error = kauth_authorize_generic(cred,
+ KAUTH_GENERIC_ISSUSER, NULL)))
+ goto out;
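+		/*
+		 * The superuser path below may set or clear any flags,
+		 * subject to the KAUTH_SYSTEM_CHSYSFLAGS check when the file
+		 * is currently immutable or append-only, and it may never
+		 * toggle the snapshot flags.  Ordinary owners may only
+		 * manipulate the UF_SETTABLE user flags and get EPERM if the
+		 * file already has any system flags set.
+		 */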
+ if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
+ NULL) == 0) {
+ if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) &&
+ kauth_authorize_system(l->l_cred,
+ KAUTH_SYSTEM_CHSYSFLAGS, 0, NULL, NULL, NULL)) {
+ error = EPERM;
+ goto out;
+ }
+ /* Snapshot flag cannot be set or cleared */
+ if ((vap->va_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
+ (ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))) {
+ error = EPERM;
+ goto out;
+ }
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error)
+ goto out;
+ ip->i_flags = vap->va_flags;
+ DIP_ASSIGN(ip, flags, ip->i_flags);
+ } else {
+ if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) ||
+ (vap->va_flags & UF_SETTABLE) != vap->va_flags) {
+ error = EPERM;
+ goto out;
+ }
+ if ((ip->i_flags & SF_SETTABLE) !=
+ (vap->va_flags & SF_SETTABLE)) {
+ error = EPERM;
+ goto out;
+ }
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error)
+ goto out;
+ ip->i_flags &= SF_SETTABLE;
+ ip->i_flags |= (vap->va_flags & UF_SETTABLE);
+ DIP_ASSIGN(ip, flags, ip->i_flags);
+ }
+ ip->i_flag |= IN_CHANGE;
+ UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+ UFS_WAPBL_END(vp->v_mount);
+ if (vap->va_flags & (IMMUTABLE | APPEND)) {
+ error = 0;
+ goto out;
+ }
+ }
+ if (ip->i_flags & (IMMUTABLE | APPEND)) {
+ error = EPERM;
+ goto out;
+ }
+ /*
+ * Go through the fields and update iff not VNOVAL.
+ */
+ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
+ if (vp->v_mount->mnt_flag & MNT_RDONLY) {
+ error = EROFS;
+ goto out;
+ }
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error)
+ goto out;
+ error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, l);
+ UFS_WAPBL_END(vp->v_mount);
+ if (error)
+ goto out;
+ }
+ if (vap->va_size != VNOVAL) {
+ /*
+ * Disallow write attempts on read-only file systems;
+ * unless the file is a socket, fifo, or a block or
+ * character device resident on the file system.
+ */
+ switch (vp->v_type) {
+ case VDIR:
+ error = EISDIR;
+ goto out;
+ case VCHR:
+ case VBLK:
+ case VFIFO:
+ break;
+ case VREG:
+ if (vp->v_mount->mnt_flag & MNT_RDONLY) {
+ error = EROFS;
+ goto out;
+ }
+ if ((ip->i_flags & SF_SNAPSHOT) != 0) {
+ error = EPERM;
+ goto out;
+ }
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error)
+ goto out;
+ /*
+ * When journaling, only truncate one indirect block
+ * at a time.
+ */
+ if (vp->v_mount->mnt_wapbl) {
+ uint64_t incr = MNINDIR(ip->i_ump) <<
+ vp->v_mount->mnt_fs_bshift; /* Power of 2 */
+ uint64_t base = NDADDR <<
+ vp->v_mount->mnt_fs_bshift;
+ while (!error && ip->i_size > base + incr &&
+ ip->i_size > vap->va_size + incr) {
+ /*
+ * round down to next full indirect
+ * block boundary.
+ */
+ uint64_t nsize = base +
+ ((ip->i_size - base - 1) &
+ ~(incr - 1));
+ error = UFS_TRUNCATE(vp, nsize, 0,
+ cred);
+ if (error == 0) {
+ UFS_WAPBL_END(vp->v_mount);
+ error =
+ UFS_WAPBL_BEGIN(vp->v_mount);
+ }
+ }
+ }
+ if (!error)
+ error = UFS_TRUNCATE(vp, vap->va_size, 0, cred);
+ UFS_WAPBL_END(vp->v_mount);
+ if (error)
+ goto out;
+ break;
+ default:
+ error = EOPNOTSUPP;
+ goto out;
+ }
+ }
+ ip = VTOI(vp);
+ if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL ||
+ vap->va_birthtime.tv_sec != VNOVAL) {
+ if (vp->v_mount->mnt_flag & MNT_RDONLY) {
+ error = EROFS;
+ goto out;
+ }
+ if ((ip->i_flags & SF_SNAPSHOT) != 0) {
+ error = EPERM;
+ goto out;
+ }
+ error = genfs_can_chtimes(vp, vap->va_vaflags, ip->i_uid, cred);
+ if (error)
+ goto out;
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error)
+ goto out;
+ if (vap->va_atime.tv_sec != VNOVAL)
+ if (!(vp->v_mount->mnt_flag & MNT_NOATIME))
+ ip->i_flag |= IN_ACCESS;
+ if (vap->va_mtime.tv_sec != VNOVAL) {
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (vp->v_mount->mnt_flag & MNT_RELATIME)
+ ip->i_flag |= IN_ACCESS;
+ }
+ if (vap->va_birthtime.tv_sec != VNOVAL &&
+ ip->i_ump->um_fstype == UFS2) {
+ ip->i_ffs2_birthtime = vap->va_birthtime.tv_sec;
+ ip->i_ffs2_birthnsec = vap->va_birthtime.tv_nsec;
+ }
+ error = UFS_UPDATE(vp, &vap->va_atime, &vap->va_mtime, 0);
+ UFS_WAPBL_END(vp->v_mount);
+ if (error)
+ goto out;
+ }
+ error = 0;
+ if (vap->va_mode != (mode_t)VNOVAL) {
+ if (vp->v_mount->mnt_flag & MNT_RDONLY) {
+ error = EROFS;
+ goto out;
+ }
+ if ((ip->i_flags & SF_SNAPSHOT) != 0 &&
+ (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP |
+ S_IXOTH | S_IWOTH))) {
+ error = EPERM;
+ goto out;
+ }
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error)
+ goto out;
+ error = ufs_chmod(vp, (int)vap->va_mode, cred, l);
+ UFS_WAPBL_END(vp->v_mount);
+ }
+ VN_KNOTE(vp, NOTE_ATTRIB);
+out:
+ fstrans_done(vp->v_mount);
+ return (error);
+}
+
+/*
+ * Change the mode on a file.
+ * Inode must be locked before calling.
+ */
+static int
+ufs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l)
+{
+ struct inode *ip;
+ int error;
+
+ UFS_WAPBL_JLOCK_ASSERT(vp->v_mount);
+
+ ip = VTOI(vp);
+
+ error = genfs_can_chmod(vp, cred, ip->i_uid, ip->i_gid, mode);
+ if (error)
+ return (error);
+
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+ ip->i_mode &= ~ALLPERMS;
+ ip->i_mode |= (mode & ALLPERMS);
+ ip->i_flag |= IN_CHANGE;
+ DIP_ASSIGN(ip, mode, ip->i_mode);
+ UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+ fstrans_done(vp->v_mount);
+ return (0);
+}
+
+/*
+ * Perform chown operation on inode ip;
+ * inode must be locked prior to call.
+ */
+static int
+ufs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred,
+ struct lwp *l)
+{
+ struct inode *ip;
+ int error = 0;
+#if defined(QUOTA) || defined(QUOTA2)
+ uid_t ouid;
+ gid_t ogid;
+ int64_t change;
+#endif
+ ip = VTOI(vp);
+ error = 0;
+
+ if (uid == (uid_t)VNOVAL)
+ uid = ip->i_uid;
+ if (gid == (gid_t)VNOVAL)
+ gid = ip->i_gid;
+
+ error = genfs_can_chown(vp, cred, ip->i_uid, ip->i_gid, uid, gid);
+ if (error)
+ return (error);
+
+ fstrans_start(vp->v_mount, FSTRANS_SHARED);
+#if defined(QUOTA) || defined(QUOTA2)
+ ogid = ip->i_gid;
+ ouid = ip->i_uid;
+ change = DIP(ip, blocks);
+ (void) chkdq(ip, -change, cred, 0);
+ (void) chkiq(ip, -1, cred, 0);
+#endif
+ ip->i_gid = gid;
+ DIP_ASSIGN(ip, gid, gid);
+ ip->i_uid = uid;
+ DIP_ASSIGN(ip, uid, uid);
+#if defined(QUOTA) || defined(QUOTA2)
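+	/*
+	 * Charge the blocks and inode to the new ids; if either charge
+	 * fails, restore the old ids and force the old charges back so
+	 * the quota accounting stays consistent.
+	 */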
+ if ((error = chkdq(ip, change, cred, 0)) == 0) {
+ if ((error = chkiq(ip, 1, cred, 0)) == 0)
+ goto good;
+ else
+ (void) chkdq(ip, -change, cred, FORCE);
+ }
+ ip->i_gid = ogid;
+ DIP_ASSIGN(ip, gid, ogid);
+ ip->i_uid = ouid;
+ DIP_ASSIGN(ip, uid, ouid);
+ (void) chkdq(ip, change, cred, FORCE);
+ (void) chkiq(ip, 1, cred, FORCE);
+ fstrans_done(vp->v_mount);
+ return (error);
+ good:
+#endif /* QUOTA || QUOTA2 */
+ ip->i_flag |= IN_CHANGE;
+ UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+ fstrans_done(vp->v_mount);
+ return (0);
+}
+
+int
+ufs_remove(void *v)
+{
+ struct vop_remove_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap = v;
+ struct vnode *vp, *dvp;
+ struct inode *ip;
+ int error;
+ struct ufs_lookup_results *ulr;
+
+ vp = ap->a_vp;
+ dvp = ap->a_dvp;
+ ip = VTOI(vp);
+
+ /* XXX should handle this material another way */
+ ulr = &VTOI(dvp)->i_crap;
+ UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+ fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+ if (vp->v_type == VDIR || (ip->i_flags & (IMMUTABLE | APPEND)) ||
+ (VTOI(dvp)->i_flags & APPEND))
+ error = EPERM;
+ else {
+ error = UFS_WAPBL_BEGIN(dvp->v_mount);
+ if (error == 0) {
+ error = ufs_dirremove(dvp, ulr,
+ ip, ap->a_cnp->cn_flags, 0);
+ UFS_WAPBL_END(dvp->v_mount);
+ }
+ }
+ VN_KNOTE(vp, NOTE_DELETE);
+ VN_KNOTE(dvp, NOTE_WRITE);
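+	/*
+	 * dvp and vp may refer to the same vnode; in that case drop the
+	 * extra reference without unlocking it twice (the vput(dvp) below
+	 * both unlocks and releases it).
+	 */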
+ if (dvp == vp)
+ vrele(vp);
+ else
+ vput(vp);
+ vput(dvp);
+ fstrans_done(dvp->v_mount);
+ return (error);
+}
+
+/*
+ * ufs_link: create hard link.
+ */
+int
+ufs_link(void *v)
+{
+ struct vop_link_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap = v;
+ struct vnode *dvp = ap->a_dvp;
+ struct vnode *vp = ap->a_vp;
+ struct componentname *cnp = ap->a_cnp;
+ struct inode *ip;
+ struct direct *newdir;
+ int error;
+ struct ufs_lookup_results *ulr;
+
+ KASSERT(dvp != vp);
+ KASSERT(vp->v_type != VDIR);
+ KASSERT(dvp->v_mount == vp->v_mount);
+
+ /* XXX should handle this material another way */
+ ulr = &VTOI(dvp)->i_crap;
+ UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+ fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+ error = vn_lock(vp, LK_EXCLUSIVE);
+ if (error) {
+ VOP_ABORTOP(dvp, cnp);
+ goto out2;
+ }
+ ip = VTOI(vp);
+ if ((nlink_t)ip->i_nlink >= LINK_MAX) {
+ VOP_ABORTOP(dvp, cnp);
+ error = EMLINK;
+ goto out1;
+ }
+ if (ip->i_flags & (IMMUTABLE | APPEND)) {
+ VOP_ABORTOP(dvp, cnp);
+ error = EPERM;
+ goto out1;
+ }
+ error = UFS_WAPBL_BEGIN(vp->v_mount);
+ if (error) {
+ VOP_ABORTOP(dvp, cnp);
+ goto out1;
+ }
+ ip->i_nlink++;
+ DIP_ASSIGN(ip, nlink, ip->i_nlink);
+ ip->i_flag |= IN_CHANGE;
+ error = UFS_UPDATE(vp, NULL, NULL, UPDATE_DIROP);
+ if (!error) {
+ newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
+ ufs_makedirentry(ip, cnp, newdir);
+ error = ufs_direnter(dvp, ulr, vp, newdir, cnp, NULL);
+ pool_cache_put(ufs_direct_cache, newdir);
+ }
+ if (error) {
+ ip->i_nlink--;
+ DIP_ASSIGN(ip, nlink, ip->i_nlink);
+ ip->i_flag |= IN_CHANGE;
+ UFS_WAPBL_UPDATE(vp, NULL, NULL, UPDATE_DIROP);
+ }
+ UFS_WAPBL_END(vp->v_mount);
+ out1:
+ VOP_UNLOCK(vp);
+ out2:
+ VN_KNOTE(vp, NOTE_LINK);
+ VN_KNOTE(dvp, NOTE_WRITE);
+ vput(dvp);
+ fstrans_done(dvp->v_mount);
+ return (error);
+}
+
+/*
+ * whiteout vnode call
+ */
+int
+ufs_whiteout(void *v)
+{
+ struct vop_whiteout_args /* {
+ struct vnode *a_dvp;
+ struct componentname *a_cnp;
+ int a_flags;
+ } */ *ap = v;
+ struct vnode *dvp = ap->a_dvp;
+ struct componentname *cnp = ap->a_cnp;
+ struct direct *newdir;
+ int error;
+ struct ufsmount *ump = VFSTOUFS(dvp->v_mount);
+ struct ufs_lookup_results *ulr;
+
+ /* XXX should handle this material another way */
+ ulr = &VTOI(dvp)->i_crap;
+ UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+ error = 0;
+ switch (ap->a_flags) {
+ case LOOKUP:
+ /* 4.4 format directories support whiteout operations */
+ if (ump->um_maxsymlinklen > 0)
+ return (0);
+ return (EOPNOTSUPP);
+
+ case CREATE:
+ /* create a new directory whiteout */
+ fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+ error = UFS_WAPBL_BEGIN(dvp->v_mount);
+ if (error)
+ break;
+#ifdef DIAGNOSTIC
+ if (ump->um_maxsymlinklen <= 0)
+ panic("ufs_whiteout: old format filesystem");
+#endif
+
+ newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
+ newdir->d_ino = WINO;
+ newdir->d_namlen = cnp->cn_namelen;
+ memcpy(newdir->d_name, cnp->cn_nameptr,
+ (size_t)cnp->cn_namelen);
+ newdir->d_name[cnp->cn_namelen] = '\0';
+ newdir->d_type = DT_WHT;
+ error = ufs_direnter(dvp, ulr, NULL, newdir, cnp, NULL);
+ pool_cache_put(ufs_direct_cache, newdir);
+ break;
+
+ case DELETE:
+ /* remove an existing directory whiteout */
+ fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+ error = UFS_WAPBL_BEGIN(dvp->v_mount);
+ if (error)
+ break;
+#ifdef DIAGNOSTIC
+ if (ump->um_maxsymlinklen <= 0)
+ panic("ufs_whiteout: old format filesystem");
+#endif
+
+ cnp->cn_flags &= ~DOWHITEOUT;
+ error = ufs_dirremove(dvp, ulr, NULL, cnp->cn_flags, 0);
+ break;
+ default:
+ panic("ufs_whiteout: unknown op");
+ /* NOTREACHED */
+ }
+ UFS_WAPBL_END(dvp->v_mount);
+ fstrans_done(dvp->v_mount);
+ return (error);
+}
+
+
+/*
+ * Rename vnode operation
+ * rename("foo", "bar");
+ * is essentially
+ * unlink("bar");
+ * link("foo", "bar");
+ * unlink("foo");
+ * but ``atomically''. Can't do full commit without saving state in the
+ * inode on disk which isn't feasible at this time. Best we can do is
+ * always guarantee the target exists.
+ *
+ * Basic algorithm is:
+ *
+ * 1) Bump link count on source while we're linking it to the
+ *	   target. This also ensures the inode won't be deleted out
+ *	   from underneath us while we work (it may be truncated by
+ * a concurrent `trunc' or `open' for creation).
+ * 2) Link source to destination. If destination already exists,
+ * delete it first.
+ * 3) Unlink source reference to inode if still around. If a
+ * directory was moved and the parent of the destination
+ * is different from the source, patch the ".." entry in the
+ * directory.
+ */
+
+/*
+ * Notes on rename locking:
+ *
+ * We lock parent vnodes before child vnodes. This means in particular
+ * that if A is above B in the directory tree then A must be locked
+ * before B. (This is true regardless of how many steps appear in
+ * between, because an arbitrary number of other processes could lock
+ * parent/child in between and establish a lock cycle and deadlock.)
+ *
+ * Therefore, if tdvp is above fdvp we must lock tdvp first; if fdvp
+ * is above tdvp we must lock fdvp first; and if they're
+ * incommensurate it doesn't matter. (But, we rely on the fact that
+ * there's a whole-volume rename lock to prevent deadlock among groups
+ * of renames upon overlapping sets of incommensurate vnodes.)
+ *
+ * In addition to establishing lock ordering the parent check also
+ * serves to rule out cases where someone tries to move a directory
+ * underneath itself, e.g. rename("a/b", "a/b/c"). If allowed to
+ * proceed such renames would detach portions of the directory tree
+ * and make fsck very unhappy.
+ *
+ * Note that it is an error for *fvp* to be above tdvp; however,
+ * *fdvp* can be above tdvp, as in rename("a/b", "a/c/d").
+ *
+ * The parent check searches up the tree from tdvp until it either
+ * finds fdvp or the root of the volume. It also returns the vnode it
+ * saw immediately before fdvp, if any. Later on (after looking up
+ * fvp) we will check to see if this *is* fvp and if so fail.
+ *
+ * If the parent check finds fdvp, it means fdvp is above tdvp, so we
+ * lock fdvp first and then tdvp. Otherwise, either tdvp is above fdvp
+ * or they're incommensurate and we lock tdvp first.
+ *
+ * In either case each of the child vnodes has to be looked up and
+ * locked immediately after its parent. The cases
+ *
+ * fdvp/fvp/[.../]tdvp/tvp
+ * tdvp/tvp/[.../]fdvp/fvp
+ *
+ * can cause deadlock otherwise. Note that both of these are error
+ * cases; the first fails the parent check and the second fails
+ * because tvp isn't empty. The parent check case is handled before
+ * we start locking; however, the nonempty case requires locking tvp
+ * to find out safely that it's nonempty.
+ *
+ * Therefore the procedure is either
+ *
+ * lock fdvp
+ * lookup fvp
+ * lock fvp
+ * lock tdvp
+ * lookup tvp
+ * lock tvp
+ *
+ * or
+ *
+ * lock tdvp
+ * lookup tvp
+ * lock tvp
+ * lock fdvp
+ * lookup fvp
+ * lock fvp
+ *
+ * This could in principle be simplified by always looking up fvp
+ * last; because of the parent check we know by the time we start
+ * locking that fvp cannot be directly above tdvp, so (given the
+ * whole-volume rename lock and other assumptions) it's safe to lock
+ * tdvp before fvp. This would allow the following scheme:
+ *
+ * lock fdvp
+ * lock tdvp
+ * or
+ * lock tdvp
+ * lock fdvp
+ *
+ * then
+ * lookup tvp
+ * lock tvp
+ * lookup fvp
+ *	check if fvp is above tdvp, fail if so
+ * lock fvp
+ *
+ * which is much, much simpler.
+ *
+ * However, current levels of vfs namei/lookup sanity do not permit
+ * this. It is impossible currently to look up fvp without locking it.
+ * (It gets locked regardless of whether LOCKLEAF is set; without
+ * LOCKLEAF it just gets unlocked again, which doesn't help.)
+ *
+ * Therefore, because we must look up fvp to know if it's above tdvp,
+ * which locks fvp, we must, at least in the case where fdvp is above
+ * tdvp, do that before locking tdvp. The longer scheme does that; the
+ * simpler scheme is not safe.
+ *
+ * Note that for now we aren't doing lookup() but relookup(); however,
+ * the differences are minor.
+ *
+ * On top of all the above, just to make everything more
+ * exciting, any two of the vnodes might end up being the same.
+ *
+ * FROMPARENT == FROMCHILD mv a/. foo is an error.
+ * FROMPARENT == TOPARENT mv a/b a/c is ok.
+ * FROMPARENT == TOCHILD mv a/b/c a/b will give ENOTEMPTY.
+ * FROMCHILD == TOPARENT mv a/b a/b/c fails the parent check.
+ * FROMCHILD == TOCHILD mv a/b a/b is ok.
+ * TOPARENT == TOCHILD mv foo a/. is an error.
+ *
+ * This introduces more cases in the locking, because each distinct
+ * vnode must be locked exactly once.
+ *
+ * When FROMPARENT == TOPARENT and FROMCHILD != TOCHILD we assume it
+ * doesn't matter what order the children are locked in, because the
+ * per-volume rename lock excludes other renames and no other
+ * operation locks two files in the same directory at once. (Note: if
+ * it turns out that link() does, link() is wrong.)
+ *
+ * Until such time as we can do lookups without the namei and lookup
+ * machinery "helpfully" locking the result vnode for us, we can't
+ * avoid tripping on cases where FROMCHILD == TOCHILD. Currently for
+ * non-directories we unlock the first one we lock while looking up
+ * the second, then relock it if necessary. This is more or less
+ * harmless since not much of interest can happen to the objects in
+ * that window while we have the containing directory locked; but it's
+ * not desirable and should be cleaned up when that becomes possible.
+ * The right way to do it is to check after looking the second one up
+ * and only lock it if it's different. (Note: for directories we don't
+ * do this dance because the same directory can't appear more than
+ * once.)
+ */
+
+/* XXX following lifted from ufs_lookup.c */
+#define FSFMT(vp) (((vp)->v_mount->mnt_iflag & IMNT_DTYPE) == 0)
+
+/*
+ * Check if either entry referred to by FROM_ULR is within the range
+ * of entries named by TO_ULR.
+ */
+static int
+ulr_overlap(const struct ufs_lookup_results *from_ulr,
+ const struct ufs_lookup_results *to_ulr)
+{
+ doff_t from_start, from_prevstart;
+ doff_t to_start, to_end;
+
+ /*
+ * FROM is a DELETE result; offset points to the entry to
+ * remove and subtracting count gives the previous entry.
+ */
+ from_start = from_ulr->ulr_offset - from_ulr->ulr_count;
+ from_prevstart = from_ulr->ulr_offset;
+
+ /*
+ * TO is a RENAME (thus non-DELETE) result; offset points
+ * to the beginning of a region to write in, and adding
+ * count gives the end of the region.
+ */
+ to_start = to_ulr->ulr_offset;
+ to_end = to_ulr->ulr_offset + to_ulr->ulr_count;
+
+ if (from_prevstart >= to_start && from_prevstart < to_end) {
+ return 1;
+ }
+ if (from_start >= to_start && from_start < to_end) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Wrapper for relookup that also updates the supplemental results.
+ */
+static int
+do_relookup(struct vnode *dvp, struct ufs_lookup_results *ulr,
+ struct vnode **vp, struct componentname *cnp)
+{
+ int error;
+
+ error = relookup(dvp, vp, cnp, 0);
+ if (error) {
+ return error;
+ }
+	/* update the supplemental results */
+ *ulr = VTOI(dvp)->i_crap;
+ UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+ return 0;
+}
+
+/*
+ * Lock and relookup a sequence of two directories and two children:
+ * d1/cn1 is handled first, then d2/cn2.  A missing child is tolerated
+ * only when the corresponding *_missing_ok flag is set, and if the
+ * first child turns out to be d2 itself the lookups are abandoned and
+ * overlap_error is returned.
+ */
+static int
+lock_vnode_sequence(struct vnode *d1, struct ufs_lookup_results *ulr1,
+ struct vnode **v1_ret, struct componentname *cn1,
+ int v1_missing_ok,
+ int overlap_error,
+ struct vnode *d2, struct ufs_lookup_results *ulr2,
+ struct vnode **v2_ret, struct componentname *cn2,
+ int v2_missing_ok)
+{
+ struct vnode *v1, *v2;
+ int error;
+
+ KASSERT(d1 != d2);
+
+ vn_lock(d1, LK_EXCLUSIVE | LK_RETRY);
+ if (VTOI(d1)->i_size == 0) {
+ /* d1 has been rmdir'd */
+ VOP_UNLOCK(d1);
+ return ENOENT;
+ }
+ error = do_relookup(d1, ulr1, &v1, cn1);
+ if (v1_missing_ok) {
+ if (error == ENOENT) {
+ /*
+ * Note: currently if the name doesn't exist,
+ * relookup succeeds (it intercepts the
+ * EJUSTRETURN from VOP_LOOKUP) and sets tvp
+ * to NULL. Therefore, we will never get
+ * ENOENT and this branch is not needed.
+ * However, in a saner future the EJUSTRETURN
+ * garbage will go away, so let's DTRT.
+ */
+ v1 = NULL;
+ error = 0;
+ }
+ } else {
+ if (error == 0 && v1 == NULL) {
+ /* This is what relookup sets if v1 disappeared. */
+ error = ENOENT;
+ }
+ }
+ if (error) {
+ VOP_UNLOCK(d1);
+ return error;
+ }
+ if (v1 && v1 == d2) {
+ VOP_UNLOCK(d1);
+ VOP_UNLOCK(v1);
+ vrele(v1);
+ return overlap_error;
+ }
+
+ /*
+ * The right way to do this is to do lookups without locking
+ * the results, and lock the results afterwards; then at the
+ * end we can avoid trying to lock v2 if v2 == v1.
+ *
+ * However, for the reasons described in the fdvp == tdvp case
+ * in rename below, we can't do that safely. So, in the case
+ * where v1 is not a directory, unlock it and lock it again
+ * afterwards. This is safe in locking order because a
+ * non-directory can't be above anything else in the tree. If
+ * v1 *is* a directory, that's not true, but then because d1
+ * != d2, v1 != v2.
+ */
+ if (v1 && v1->v_type != VDIR) {
+ VOP_UNLOCK(v1);
+ }
+ vn_lock(d2, LK_EXCLUSIVE | LK_RETRY);
+ if (VTOI(d2)->i_size == 0) {
+ /* d2 has been rmdir'd */
+ VOP_UNLOCK(d2);
+ if (v1 && v1->v_type == VDIR) {
+ VOP_UNLOCK(v1);
+ }
+ VOP_UNLOCK(d1);
+ if (v1) {
+ vrele(v1);
+ }
+ return ENOENT;
+ }
+ error = do_relookup(d2, ulr2, &v2, cn2);
+ if (v2_missing_ok) {
+ if (error == ENOENT) {
+ /* as above */
+ v2 = NULL;
+ error = 0;
+ }
+ } else {
+ if (error == 0 && v2 == NULL) {
+ /* This is what relookup sets if v2 disappeared. */
+ error = ENOENT;
+ }
+ }
+ if (error) {
+ VOP_UNLOCK(d2);
+ if (v1 && v1->v_type == VDIR) {
+ VOP_UNLOCK(v1);
+ }
+ VOP_UNLOCK(d1);
+ if (v1) {
+ vrele(v1);
+ }
+ return error;
+ }
+ if (v1 && v1->v_type != VDIR && v1 != v2) {
+ vn_lock(v1, LK_EXCLUSIVE | LK_RETRY);
+ }
+ *v1_ret = v1;
+ *v2_ret = v2;
+ return 0;
+}
+
+/*
+ * Rename vnode operation
+ * rename("foo", "bar");
+ * is essentially
+ * unlink("bar");
+ * link("foo", "bar");
+ * unlink("foo");
+ * but ``atomically''. Can't do full commit without saving state in the
+ * inode on disk which isn't feasible at this time. Best we can do is
+ * always guarantee the target exists.
+ *
+ * Basic algorithm is:
+ *
+ * 1) Bump link count on source while we're linking it to the
+ *	   target. This also ensures the inode won't be deleted out
+ *	   from underneath us while we work (it may be truncated by
+ * a concurrent `trunc' or `open' for creation).
+ * 2) Link source to destination. If destination already exists,
+ * delete it first.
+ * 3) Unlink source reference to inode if still around. If a
+ * directory was moved and the parent of the destination
+ * is different from the source, patch the ".." entry in the
+ * directory.
+ */
+int
+ufs_rename(void *v)
+{
+ struct vop_rename_args /* {
+ struct vnode *a_fdvp;
+ struct vnode *a_fvp;
+ struct componentname *a_fcnp;
+ struct vnode *a_tdvp;
+ struct vnode *a_tvp;
+ struct componentname *a_tcnp;
+ } */ *ap = v;
+ struct vnode *tvp, *tdvp, *fvp, *fdvp;
+ struct componentname *tcnp, *fcnp;
+ struct inode *ip, *txp, *fxp, *tdp, *fdp;
+ struct mount *mp;
+ struct direct *newdir;
+ int doingdirectory, error;
+ ino_t oldparent, newparent;
+
+ struct ufs_lookup_results from_ulr, to_ulr;
+
+ tvp = ap->a_tvp;
+ tdvp = ap->a_tdvp;
+ fvp = ap->a_fvp;
+ fdvp = ap->a_fdvp;
+ tcnp = ap->a_tcnp;
+ fcnp = ap->a_fcnp;
+ doingdirectory = error = 0;
+ oldparent = newparent = 0;
+
+ /* save the supplemental lookup results as they currently exist */
+ from_ulr = VTOI(fdvp)->i_crap;
+ to_ulr = VTOI(tdvp)->i_crap;
+ UFS_CHECK_CRAPCOUNTER(VTOI(fdvp));
+ UFS_CHECK_CRAPCOUNTER(VTOI(tdvp));
+
+ /*
+ * Owing to VFS oddities we are currently called with tdvp/tvp
+ * locked and not fdvp/fvp. In a sane world we'd be passed
+ * tdvp and fdvp only, unlocked, and two name strings. Pretend
+ * we have a sane world and unlock tdvp and tvp.
+ */
+ VOP_UNLOCK(tdvp);
+ if (tvp && tvp != tdvp) {
+ VOP_UNLOCK(tvp);
+ }
+
+ /* Also pretend we have a sane world and vrele fvp/tvp. */
+ vrele(fvp);
+ fvp = NULL;
+ if (tvp) {
+ vrele(tvp);
+ tvp = NULL;
+ }
+
+ /*
+ * Check for cross-device rename.
+ */
+ if (fdvp->v_mount != tdvp->v_mount) {
+ error = EXDEV;
+ goto abort;
+ }
+
+ /*
+ * Reject "." and ".."
+ */
+ if ((fcnp->cn_flags & ISDOTDOT) || (tcnp->cn_flags & ISDOTDOT) ||
+ (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
+ (tcnp->cn_namelen == 1 && tcnp->cn_nameptr[0] == '.')) {
+ error = EINVAL;
+ goto abort;
+ }
+
+ /*
+ * Get locks.
+ */
+
+ /* paranoia */
+ fcnp->cn_flags |= LOCKPARENT|LOCKLEAF;
+ tcnp->cn_flags |= LOCKPARENT|LOCKLEAF;
+
+ if (fdvp == tdvp) {
+ /* One directory. Lock it and relookup both children. */
+ vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
+
+ if (VTOI(fdvp)->i_size == 0) {
+ /* directory has been rmdir'd */
+ VOP_UNLOCK(fdvp);
+ error = ENOENT;
+ goto abort;
+ }
+
+ error = do_relookup(fdvp, &from_ulr, &fvp, fcnp);
+ if (error == 0 && fvp == NULL) {
+ /* relookup may produce this if fvp disappears */
+ error = ENOENT;
+ }
+ if (error) {
+ VOP_UNLOCK(fdvp);
+ goto abort;
+ }
+
+ /*
+ * The right way to do this is to look up both children
+ * without locking either, and then lock both unless they
+ * turn out to be the same. However, due to deep-seated
+ * VFS-level issues all lookups lock the child regardless
+ * of whether LOCKLEAF is set (if LOCKLEAF is not set,
+ * the child is locked during lookup and then unlocked)
+ * so it is not safe to look up tvp while fvp is locked.
+ *
+ * Unlocking fvp here temporarily is more or less safe,
+ * because with the directory locked there's not much
+ * that can happen to it. However, ideally it wouldn't
+ * be necessary. XXX.
+ */
+ VOP_UNLOCK(fvp);
+ /* remember fdvp == tdvp so tdvp is locked */
+ error = do_relookup(tdvp, &to_ulr, &tvp, tcnp);
+ if (error && error != ENOENT) {
+ VOP_UNLOCK(fdvp);
+ goto abort;
+ }
+ if (error == ENOENT) {
+ /*
+ * Note: currently if the name doesn't exist,
+ * relookup succeeds (it intercepts the
+ * EJUSTRETURN from VOP_LOOKUP) and sets tvp
+ * to NULL. Therefore, we will never get
+ * ENOENT and this branch is not needed.
+ * However, in a saner future the EJUSTRETURN
+ * garbage will go away, so let's DTRT.
+ */
+ tvp = NULL;
+ }
+
+ /* tvp is locked; lock fvp if necessary */
+ if (!tvp || tvp != fvp) {
+ vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY);
+ }
+ } else {
+ int found_fdvp;
+ struct vnode *illegal_fvp;
+
+ /*
+ * The source must not be above the destination. (If
+ * it were, the rename would detach a section of the
+ * tree.)
+ *
+ * Look up the tree from tdvp to see if we find fdvp,
+ * and if so, return the immediate child of fdvp we're
+ * under; that must not turn out to be the same as
+ * fvp.
+ *
+ * The per-volume rename lock guarantees that the
+ * result of this check remains true until we finish
+ * looking up and locking.
+ */
+ error = ufs_parentcheck(fdvp, tdvp, fcnp->cn_cred,
+ &found_fdvp, &illegal_fvp);
+ if (error) {
+ goto abort;
+ }
+
+ /* Must lock in tree order. */
+
+ if (found_fdvp) {
+ /* fdvp -> fvp -> tdvp -> tvp */
+ error = lock_vnode_sequence(fdvp, &from_ulr,
+ &fvp, fcnp, 0,
+ EINVAL,
+ tdvp, &to_ulr,
+ &tvp, tcnp, 1);
+ } else {
+ /* tdvp -> tvp -> fdvp -> fvp */
+ error = lock_vnode_sequence(tdvp, &to_ulr,
+ &tvp, tcnp, 1,
+ ENOTEMPTY,
+ fdvp, &from_ulr,
+ &fvp, fcnp, 0);
+ }
+ if (error) {
+ if (illegal_fvp) {
+ vrele(illegal_fvp);
+ }
+ goto abort;
+ }
+ KASSERT(fvp != NULL);
+
+ if (illegal_fvp && fvp == illegal_fvp) {
+ vrele(illegal_fvp);
+ error = EINVAL;
+ goto abort_withlocks;
+ }
+
+ if (illegal_fvp) {
+ vrele(illegal_fvp);
+ }
+ }
+
+ KASSERT(fdvp && VOP_ISLOCKED(fdvp));
+ KASSERT(fvp && VOP_ISLOCKED(fvp));
+ KASSERT(tdvp && VOP_ISLOCKED(tdvp));
+ KASSERT(tvp == NULL || VOP_ISLOCKED(tvp));
+
+ /* --- everything is now locked --- */
+
+ if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) ||
+ (VTOI(tdvp)->i_flags & APPEND))) {
+ error = EPERM;
+ goto abort_withlocks;
+ }
+
+ /*
+ * Check if just deleting a link name.
+ */
+ if (fvp == tvp) {
+ if (fvp->v_type == VDIR) {
+ error = EINVAL;
+ goto abort_withlocks;
+ }
+
+ /* Release destination completely. Leave fdvp locked. */
+ VOP_ABORTOP(tdvp, tcnp);
+ if (fdvp != tdvp) {
+ VOP_UNLOCK(tdvp);
+ }
+ VOP_UNLOCK(tvp);
+ vrele(tdvp);
+ vrele(tvp);
+
+ /* Delete source. */
+ /* XXX: do we really need to relookup again? */
+
+ /*
+ * fdvp is still locked, but we just unlocked fvp
+ * (because fvp == tvp) so just decref fvp
+ */
+ vrele(fvp);
+ fcnp->cn_flags &= ~(MODMASK);
+ fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+ fcnp->cn_nameiop = DELETE;
+ if ((error = relookup(fdvp, &fvp, fcnp, 0))) {
+ vput(fdvp);
+ return (error);
+ }
+ return (VOP_REMOVE(fdvp, fvp, fcnp));
+ }
+ fdp = VTOI(fdvp);
+ ip = VTOI(fvp);
+ if ((nlink_t) ip->i_nlink >= LINK_MAX) {
+ error = EMLINK;
+ goto abort_withlocks;
+ }
+ if ((ip->i_flags & (IMMUTABLE | APPEND)) ||
+ (fdp->i_flags & APPEND)) {
+ error = EPERM;
+ goto abort_withlocks;
+ }
+ if ((ip->i_mode & IFMT) == IFDIR) {
+ /*
+ * Avoid ".", "..", and aliases of "." for obvious reasons.
+ */
+ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
+ fdp == ip ||
+ (fcnp->cn_flags & ISDOTDOT) ||
+ (tcnp->cn_flags & ISDOTDOT) ||
+ (ip->i_flag & IN_RENAME)) {
+ error = EINVAL;
+ goto abort_withlocks;
+ }
+ ip->i_flag |= IN_RENAME;
+ doingdirectory = 1;
+ }
+ oldparent = fdp->i_number;
+ VN_KNOTE(fdvp, NOTE_WRITE); /* XXXLUKEM/XXX: right place? */
+
+ /*
+ * Both the directory
+ * and target vnodes are locked.
+ */
+ tdp = VTOI(tdvp);
+ txp = NULL;
+ if (tvp)
+ txp = VTOI(tvp);
+
+ mp = fdvp->v_mount;
+ fstrans_start(mp, FSTRANS_SHARED);
+
+ if (oldparent != tdp->i_number)
+ newparent = tdp->i_number;
+
+ /*
+ * If ".." must be changed (ie the directory gets a new
+ * parent) the user must have write permission in the source
+ * so as to be able to change "..".
+ */
+ if (doingdirectory && newparent) {
+ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred);
+ if (error)
+ goto out;
+ }
+
+ KASSERT(fdvp != tvp);
+
+ if (newparent) {
+ /* Check for the rename("foo/foo", "foo") case. */
+ if (fdvp == tvp) {
+ error = doingdirectory ? ENOTEMPTY : EISDIR;
+ goto out;
+ }
+ }
+
+ fxp = VTOI(fvp);
+ fdp = VTOI(fdvp);
+
+ error = UFS_WAPBL_BEGIN(fdvp->v_mount);
+ if (error)
+ goto out2;
+
+ /*
+ * 1) Bump link count while we're moving stuff
+ * around. If we crash somewhere before
+ * completing our work, the link count
+ * may be wrong, but correctable.
+ */
+ ip->i_nlink++;
+ DIP_ASSIGN(ip, nlink, ip->i_nlink);
+ ip->i_flag |= IN_CHANGE;
+ if ((error = UFS_UPDATE(fvp, NULL, NULL, UPDATE_DIROP)) != 0) {
+ goto bad;
+ }
+
+ /*
+ * 2) If target doesn't exist, link the target
+ * to the source and unlink the source.
+ * Otherwise, rewrite the target directory
+ * entry to reference the source inode and
+ * expunge the original entry's existence.
+ */
+ if (txp == NULL) {
+ if (tdp->i_dev != ip->i_dev)
+ panic("rename: EXDEV");
+ /*
+ * Account for ".." in new directory.
+ * When source and destination have the same
+ * parent we don't fool with the link count.
+ */
+ if (doingdirectory && newparent) {
+ if ((nlink_t)tdp->i_nlink >= LINK_MAX) {
+ error = EMLINK;
+ goto bad;
+ }
+ tdp->i_nlink++;
+ DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+ tdp->i_flag |= IN_CHANGE;
+ if ((error = UFS_UPDATE(tdvp, NULL, NULL,
+ UPDATE_DIROP)) != 0) {
+ tdp->i_nlink--;
+ DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+ tdp->i_flag |= IN_CHANGE;
+ goto bad;
+ }
+ }
+ newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
+ ufs_makedirentry(ip, tcnp, newdir);
+ error = ufs_direnter(tdvp, &to_ulr,
+ NULL, newdir, tcnp, NULL);
+ pool_cache_put(ufs_direct_cache, newdir);
+ if (error != 0) {
+ if (doingdirectory && newparent) {
+ tdp->i_nlink--;
+ DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+ tdp->i_flag |= IN_CHANGE;
+ (void)UFS_UPDATE(tdvp, NULL, NULL,
+ UPDATE_WAIT | UPDATE_DIROP);
+ }
+ goto bad;
+ }
+ VN_KNOTE(tdvp, NOTE_WRITE);
+ } else {
+ if (txp->i_dev != tdp->i_dev || txp->i_dev != ip->i_dev)
+ panic("rename: EXDEV");
+ /*
+ * Short circuit rename(foo, foo).
+ */
+ if (txp->i_number == ip->i_number)
+ panic("rename: same file");
+ /*
+ * If the parent directory is "sticky", then the user must
+ * own the parent directory, or the destination of the rename,
+ * otherwise the destination may not be changed (except by
+ * root). This implements append-only directories.
+ */
+ if ((tdp->i_mode & S_ISTXT) &&
+ kauth_authorize_generic(tcnp->cn_cred,
+ KAUTH_GENERIC_ISSUSER, NULL) != 0 &&
+ kauth_cred_geteuid(tcnp->cn_cred) != tdp->i_uid &&
+ txp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) {
+ error = EPERM;
+ goto bad;
+ }
+ /*
+ * Target must be empty if a directory and have no links
+ * to it. Also, ensure source and target are compatible
+ * (both directories, or both not directories).
+ */
+ if ((txp->i_mode & IFMT) == IFDIR) {
+ if (txp->i_nlink > 2 ||
+ !ufs_dirempty(txp, tdp->i_number, tcnp->cn_cred)) {
+ error = ENOTEMPTY;
+ goto bad;
+ }
+ if (!doingdirectory) {
+ error = ENOTDIR;
+ goto bad;
+ }
+ cache_purge(tdvp);
+ } else if (doingdirectory) {
+ error = EISDIR;
+ goto bad;
+ }
+ if ((error = ufs_dirrewrite(tdp, to_ulr.ulr_offset,
+ txp, ip->i_number,
+ IFTODT(ip->i_mode), doingdirectory && newparent ?
+ newparent : doingdirectory, IN_CHANGE | IN_UPDATE)) != 0)
+ goto bad;
+ if (doingdirectory) {
+ /*
+ * Truncate inode. The only stuff left in the directory
+ * is "." and "..". The "." reference is inconsequential
+ * since we are quashing it. We have removed the "."
+ * reference and the reference in the parent directory,
+ * but there may be other hard links.
+ */
+ if (!newparent) {
+ tdp->i_nlink--;
+ DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+ tdp->i_flag |= IN_CHANGE;
+ UFS_WAPBL_UPDATE(tdvp, NULL, NULL, 0);
+ }
+ txp->i_nlink--;
+ DIP_ASSIGN(txp, nlink, txp->i_nlink);
+ txp->i_flag |= IN_CHANGE;
+ if ((error = UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC,
+ tcnp->cn_cred)))
+ goto bad;
+ }
+ VN_KNOTE(tdvp, NOTE_WRITE);
+ VN_KNOTE(tvp, NOTE_DELETE);
+ }
+
+ /*
+ * Handle case where the directory entry we need to remove,
+ * which is/was at from_ulr.ulr_offset, or the one before it,
+ * which is/was at from_ulr.ulr_offset - from_ulr.ulr_count,
+ * may have been moved when the directory insertion above
+ * performed compaction.
+ */
+ if (tdp->i_number == fdp->i_number &&
+ ulr_overlap(&from_ulr, &to_ulr)) {
+
+ struct buf *bp;
+ struct direct *ep;
+ struct ufsmount *ump = fdp->i_ump;
+ doff_t curpos;
+ doff_t endsearch; /* offset to end directory search */
+ uint32_t prev_reclen;
+ int dirblksiz = ump->um_dirblksiz;
+ const int needswap = UFS_MPNEEDSWAP(ump);
+ u_long bmask;
+ int namlen, entryoffsetinblock;
+ char *dirbuf;
+
+ bmask = fdvp->v_mount->mnt_stat.f_iosize - 1;
+
+ /*
+ * The fcnp entry will be somewhere between the start of
+ * compaction (to_ulr.ulr_offset) and the original location
+ * (from_ulr.ulr_offset).
+ */
+ curpos = to_ulr.ulr_offset;
+ endsearch = from_ulr.ulr_offset + from_ulr.ulr_reclen;
+ entryoffsetinblock = 0;
+
+ /*
+ * Get the directory block containing the start of
+ * compaction.
+ */
+ error = ufs_blkatoff(fdvp, (off_t)to_ulr.ulr_offset, &dirbuf,
+ &bp, false);
+ if (error)
+ goto bad;
+
+ /*
+ * Keep existing ulr_count (length of previous record)
+ * for the case where compaction did not include the
+ * previous entry but started at the from-entry.
+ */
+ prev_reclen = from_ulr.ulr_count;
+
+ while (curpos < endsearch) {
+ uint32_t reclen;
+
+ /*
+ * If necessary, get the next directory block.
+ *
+ * dholland 7/13/11 to the best of my understanding
+ * this should never happen; compaction occurs only
+ * within single blocks. I think.
+ */
+ if ((curpos & bmask) == 0) {
+ if (bp != NULL)
+ brelse(bp, 0);
+ error = ufs_blkatoff(fdvp, (off_t)curpos,
+ &dirbuf, &bp, false);
+ if (error)
+ goto bad;
+ entryoffsetinblock = 0;
+ }
+
+ KASSERT(bp != NULL);
+ ep = (struct direct *)(dirbuf + entryoffsetinblock);
+ reclen = ufs_rw16(ep->d_reclen, needswap);
+
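+			/*
+			 * Old-format directories (no d_type field; see FSFMT
+			 * above) store the name length as a 16-bit field
+			 * overlapping d_type and d_namlen, so depending on
+			 * the byte order the significant byte of the length
+			 * shows up in d_type.
+			 */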
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+ if (FSFMT(fdvp) && needswap == 0)
+ namlen = ep->d_type;
+ else
+ namlen = ep->d_namlen;
+#else
+ if (FSFMT(fdvp) && needswap != 0)
+ namlen = ep->d_type;
+ else
+ namlen = ep->d_namlen;
+#endif
+ if ((ep->d_ino != 0) &&
+ (ufs_rw32(ep->d_ino, needswap) != WINO) &&
+ (namlen == fcnp->cn_namelen) &&
+ memcmp(ep->d_name, fcnp->cn_nameptr, namlen) == 0) {
+ from_ulr.ulr_reclen = reclen;
+ break;
+ }
+ curpos += reclen;
+ entryoffsetinblock += reclen;
+ prev_reclen = reclen;
+ }
+
+ from_ulr.ulr_offset = curpos;
+ from_ulr.ulr_count = prev_reclen;
+
+ KASSERT(curpos <= endsearch);
+
+ /*
+ * If ulr_offset points to start of a directory block,
+ * clear ulr_count so ufs_dirremove() doesn't try to
+ * merge free space over a directory block boundary.
+ */
+ if ((from_ulr.ulr_offset & (dirblksiz - 1)) == 0)
+ from_ulr.ulr_count = 0;
+
+ brelse(bp, 0);
+ }
+
+ /*
+ * 3) Unlink the source.
+ */
+
+#if 0
+ /*
+ * Ensure that the directory entry still exists and has not
+ * changed while the new name has been entered. If the source is
+ * a file then the entry may have been unlinked or renamed. In
+ * either case there is no further work to be done. If the source
+ * is a directory then it cannot have been rmdir'ed; The IRENAME
+ * flag ensures that it cannot be moved by another rename or removed
+ * by a rmdir.
+ */
+#endif
+ KASSERT(fxp == ip);
+
+ /*
+ * If the source is a directory with a new parent, the link
+ * count of the old parent directory must be decremented and
+ * ".." set to point to the new parent.
+ */
+ if (doingdirectory && newparent) {
+ KASSERT(fdp != NULL);
+ ufs_dirrewrite(fxp, mastertemplate.dot_reclen,
+ fdp, newparent, DT_DIR, 0, IN_CHANGE);
+ cache_purge(fdvp);
+ }
+ error = ufs_dirremove(fdvp, &from_ulr,
+ fxp, fcnp->cn_flags, 0);
+ fxp->i_flag &= ~IN_RENAME;
+
+ VN_KNOTE(fvp, NOTE_RENAME);
+ goto done;
+
+ out:
+ goto out2;
+
+ /* exit routines from steps 1 & 2 */
+ bad:
+ if (doingdirectory)
+ ip->i_flag &= ~IN_RENAME;
+ ip->i_nlink--;
+ DIP_ASSIGN(ip, nlink, ip->i_nlink);
+ ip->i_flag |= IN_CHANGE;
+ ip->i_flag &= ~IN_RENAME;
+ UFS_WAPBL_UPDATE(fvp, NULL, NULL, 0);
+ done:
+ UFS_WAPBL_END(fdvp->v_mount);
+ out2:
+ /*
+ * clear IN_RENAME - some exit paths happen too early to go
+ * through the cleanup done in the "bad" case above, so we
+ * always do this mini-cleanup here.
+ */
+ ip->i_flag &= ~IN_RENAME;
+
+ VOP_UNLOCK(fdvp);
+ if (tdvp != fdvp) {
+ VOP_UNLOCK(tdvp);
+ }
+ VOP_UNLOCK(fvp);
+ if (tvp && tvp != fvp) {
+ VOP_UNLOCK(tvp);
+ }
+
+ vrele(fdvp);
+ vrele(tdvp);
+ vrele(fvp);
+ if (tvp) {
+ vrele(tvp);
+ }
+
+ fstrans_done(mp);
+ return (error);
+
+ abort_withlocks:
+ VOP_UNLOCK(fdvp);
+ if (tdvp != fdvp) {
+ VOP_UNLOCK(tdvp);
+ }
+ VOP_UNLOCK(fvp);
+ if (tvp && tvp != fvp) {
+ VOP_UNLOCK(tvp);
+ }
+
+ abort:
+ VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */
+ VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */
+ vrele(tdvp);
+ if (tvp) {
+ vrele(tvp);
+ }
+ vrele(fdvp);
+ if (fvp) {
+ vrele(fvp);
+ }
+ return (error);
+}
+
+int
+ufs_mkdir(void *v)
+{
+ struct vop_mkdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap = v;
+ struct vnode *dvp = ap->a_dvp, *tvp;
+ struct vattr *vap = ap->a_vap;
+ struct componentname *cnp = ap->a_cnp;
+ struct inode *ip, *dp = VTOI(dvp);
+ struct buf *bp;
+ struct dirtemplate dirtemplate;
+ struct direct *newdir;
+ int error, dmode;
+ struct ufsmount *ump = dp->i_ump;
+ int dirblksiz = ump->um_dirblksiz;
+ struct ufs_lookup_results *ulr;
+
+ fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+
+ /* XXX should handle this material another way */
+ ulr = &dp->i_crap;
+ UFS_CHECK_CRAPCOUNTER(dp);
+
+ if ((nlink_t)dp->i_nlink >= LINK_MAX) {
+ error = EMLINK;
+ goto out;
+ }
+ dmode = vap->va_mode & ACCESSPERMS;
+ dmode |= IFDIR;
+ /*
+ * Must simulate part of ufs_makeinode here to acquire the inode,
+ * but not have it entered in the parent directory. The entry is
+ * made later after writing "." and ".." entries.
+ */
+ if ((error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, ap->a_vpp)) != 0)
+ goto out;
+
+ tvp = *ap->a_vpp;
+ ip = VTOI(tvp);
+
+ error = UFS_WAPBL_BEGIN(ap->a_dvp->v_mount);
+ if (error) {
+ UFS_VFREE(tvp, ip->i_number, dmode);
+ vput(tvp);
+ goto out;
+ }
+ ip->i_uid = kauth_cred_geteuid(cnp->cn_cred);
+ DIP_ASSIGN(ip, uid, ip->i_uid);
+ ip->i_gid = dp->i_gid;
+ DIP_ASSIGN(ip, gid, ip->i_gid);
+#if defined(QUOTA) || defined(QUOTA2)
+ if ((error = chkiq(ip, 1, cnp->cn_cred, 0))) {
+ UFS_VFREE(tvp, ip->i_number, dmode);
+ UFS_WAPBL_END(dvp->v_mount);
+ fstrans_done(dvp->v_mount);
+ vput(tvp);
+ vput(dvp);
+ return (error);
+ }
+#endif
+ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+ ip->i_mode = dmode;
+ DIP_ASSIGN(ip, mode, dmode);
+ tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */
+ ip->i_nlink = 2;
+ DIP_ASSIGN(ip, nlink, 2);
+ if (cnp->cn_flags & ISWHITEOUT) {
+ ip->i_flags |= UF_OPAQUE;
+ DIP_ASSIGN(ip, flags, ip->i_flags);
+ }
+
+ /*
+ * Bump link count in parent directory to reflect work done below.
+ * Should be done before reference is created so cleanup is
+ * possible if we crash.
+ */
+ dp->i_nlink++;
+ DIP_ASSIGN(dp, nlink, dp->i_nlink);
+ dp->i_flag |= IN_CHANGE;
+ if ((error = UFS_UPDATE(dvp, NULL, NULL, UPDATE_DIROP)) != 0)
+ goto bad;
+
+ /*
+ * Initialize directory with "." and ".." from static template.
+ */
+ dirtemplate = mastertemplate;
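+ /* The ".." entry is sized to fill the rest of the first directory block. */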
+ dirtemplate.dotdot_reclen = dirblksiz - dirtemplate.dot_reclen;
+ dirtemplate.dot_ino = ufs_rw32(ip->i_number, UFS_MPNEEDSWAP(ump));
+ dirtemplate.dotdot_ino = ufs_rw32(dp->i_number, UFS_MPNEEDSWAP(ump));
+ dirtemplate.dot_reclen = ufs_rw16(dirtemplate.dot_reclen,
+ UFS_MPNEEDSWAP(ump));
+ dirtemplate.dotdot_reclen = ufs_rw16(dirtemplate.dotdot_reclen,
+ UFS_MPNEEDSWAP(ump));
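+ /*
+ * Old-format file systems (um_maxsymlinklen <= 0) have no d_type
+ * field; that byte is part of a 16-bit d_namlen. Depending on the
+ * file system's byte order, the significant byte of the old name
+ * length lands either in the new d_type slot or in the new
+ * d_namlen slot, so shuffle the template fields to match the
+ * on-disk layout.
+ */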
+ if (ump->um_maxsymlinklen <= 0) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+ if (UFS_MPNEEDSWAP(ump) == 0)
+#else
+ if (UFS_MPNEEDSWAP(ump) != 0)
+#endif
+ {
+ dirtemplate.dot_type = dirtemplate.dot_namlen;
+ dirtemplate.dotdot_type = dirtemplate.dotdot_namlen;
+ dirtemplate.dot_namlen = dirtemplate.dotdot_namlen = 0;
+ } else
+ dirtemplate.dot_type = dirtemplate.dotdot_type = 0;
+ }
+ if ((error = UFS_BALLOC(tvp, (off_t)0, dirblksiz, cnp->cn_cred,
+ B_CLRBUF, &bp)) != 0)
+ goto bad;
+ ip->i_size = dirblksiz;
+ DIP_ASSIGN(ip, size, dirblksiz);
+ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+ uvm_vnp_setsize(tvp, ip->i_size);
+ memcpy((void *)bp->b_data, (void *)&dirtemplate, sizeof dirtemplate);
+
+ /*
+ * Directory set up; now install its entry in the parent directory.
+ * We must write out the buffer containing the new directory body
+ * before entering the new name in the parent.
+ */
+ if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0)
+ goto bad;
+ if ((error = UFS_UPDATE(tvp, NULL, NULL, UPDATE_DIROP)) != 0) {
+ goto bad;
+ }
+ newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
+ ufs_makedirentry(ip, cnp, newdir);
+ error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, bp);
+ pool_cache_put(ufs_direct_cache, newdir);
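+ /*
+ * Reached both on success (fall-through) and on failure (goto bad):
+ * on error, undo the parent's link count bump and release the new
+ * inode with a zero link count so it is freed when its vnode is
+ * released.
+ */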
+ bad:
+ if (error == 0) {
+ VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
+ UFS_WAPBL_END(dvp->v_mount);
+ } else {
+ dp->i_nlink--;
+ DIP_ASSIGN(dp, nlink, dp->i_nlink);
+ dp->i_flag |= IN_CHANGE;
+ UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
+ /*
+ * No need to do an explicit UFS_TRUNCATE here, vrele will
+ * do this for us because we set the link count to 0.
+ */
+ ip->i_nlink = 0;
+ DIP_ASSIGN(ip, nlink, 0);
+ ip->i_flag |= IN_CHANGE;
+ /* If IN_ADIROP, account for it */
+ UFS_UNMARK_VNODE(tvp);
+ UFS_WAPBL_UPDATE(tvp, NULL, NULL, UPDATE_DIROP);
+ UFS_WAPBL_END(dvp->v_mount);
+ vput(tvp);
+ }
+ out:
+ fstrans_done(dvp->v_mount);
+ vput(dvp);
+ return (error);
+}
+
+int
+ufs_rmdir(void *v)
+{
+ struct vop_rmdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap = v;
+ struct vnode *vp, *dvp;
+ struct componentname *cnp;
+ struct inode *ip, *dp;
+ int error;
+ struct ufs_lookup_results *ulr;
+
+ vp = ap->a_vp;
+ dvp = ap->a_dvp;
+ cnp = ap->a_cnp;
+ ip = VTOI(vp);
+ dp = VTOI(dvp);
+
+ /* XXX should handle this material another way */
+ ulr = &dp->i_crap;
+ UFS_CHECK_CRAPCOUNTER(dp);
+
+ /*
+ * No rmdir of "." or of mounted-on directories, please.
+ */
+ if (dp == ip || vp->v_mountedhere != NULL) {
+ if (dp == ip)
+ vrele(dvp);
+ else
+ vput(dvp);
+ vput(vp);
+ return (EINVAL);
+ }
+
+ fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+
+ /*
+ * Do not remove a directory that is in the process of being renamed.
+ * Verify that the directory is empty (and valid). (Rmdir ".." won't
+ * be valid since ".." will contain a reference to the current
+ * directory and thus be non-empty.)
+ */
+ error = 0;
+ if (ip->i_flag & IN_RENAME) {
+ error = EINVAL;
+ goto out;
+ }
+ if (ip->i_nlink != 2 ||
+ !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
+ error = ENOTEMPTY;
+ goto out;
+ }
+ if ((dp->i_flags & APPEND) ||
+ (ip->i_flags & (IMMUTABLE | APPEND))) {
+ error = EPERM;
+ goto out;
+ }
+ error = UFS_WAPBL_BEGIN(dvp->v_mount);
+ if (error)
+ goto out;
+ /*
+ * Delete reference to directory before purging
+ * inode. If we crash in between, the directory
+ * will be reattached to lost+found.
+ */
+ error = ufs_dirremove(dvp, ulr, ip, cnp->cn_flags, 1);
+ if (error) {
+ UFS_WAPBL_END(dvp->v_mount);
+ goto out;
+ }
+ VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
+ cache_purge(dvp);
+ /*
+ * Truncate inode. The only stuff left in the directory is "." and
+ * "..". The "." reference is inconsequential since we're quashing
+ * it.
+ */
+ dp->i_nlink--;
+ DIP_ASSIGN(dp, nlink, dp->i_nlink);
+ dp->i_flag |= IN_CHANGE;
+ UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
+ ip->i_nlink--;
+ DIP_ASSIGN(ip, nlink, ip->i_nlink);
+ ip->i_flag |= IN_CHANGE;
+ error = UFS_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred);
+ cache_purge(vp);
+ /*
+ * Unlock the log while we still have a reference to the unlinked
+ * directory vp so that it will not get locked for recycling.
+ */
+ UFS_WAPBL_END(dvp->v_mount);
+#ifdef UFS_DIRHASH
+ if (ip->i_dirhash != NULL)
+ ufsdirhash_free(ip);
+#endif
+ out:
+ VN_KNOTE(vp, NOTE_DELETE);
+ vput(vp);
+ fstrans_done(dvp->v_mount);
+ vput(dvp);
+ return (error);
+}
+
+/*
+ * symlink -- make a symbolic link
+ */
+int
+ufs_symlink(void *v)
+{
+ struct vop_symlink_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ char *a_target;
+ } */ *ap = v;
+ struct vnode *vp, **vpp;
+ struct inode *ip;
+ int len, error;
+ struct ufs_lookup_results *ulr;
+
+ vpp = ap->a_vpp;
+
+ /* XXX should handle this material another way */
+ ulr = &VTOI(ap->a_dvp)->i_crap;
+ UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp));
+
+ /*
+ * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful
+ * ufs_makeinode
+ */
+ fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED);
+ error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, ulr,
+ vpp, ap->a_cnp);
+ if (error)
+ goto out;
+ VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+ vp = *vpp;
+ len = strlen(ap->a_target);
+ ip = VTOI(vp);
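+ /*
+ * Short targets are stored directly in the inode (a "fast"
+ * symlink); longer targets are written into the file's data
+ * blocks via vn_rdwr() below.
+ */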
+ if (len < ip->i_ump->um_maxsymlinklen) {
+ memcpy((char *)SHORTLINK(ip), ap->a_target, len);
+ ip->i_size = len;
+ DIP_ASSIGN(ip, size, len);
+ uvm_vnp_setsize(vp, ip->i_size);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ if (vp->v_mount->mnt_flag & MNT_RELATIME)
+ ip->i_flag |= IN_ACCESS;
+ UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+ } else
+ error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0,
+ UIO_SYSSPACE, IO_NODELOCKED | IO_JOURNALLOCKED,
+ ap->a_cnp->cn_cred, NULL, NULL);
+ UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp);
+ if (error)
+ vput(vp);
+out:
+ fstrans_done(ap->a_dvp->v_mount);
+ return (error);
+}
+
+/*
+ * Vnode op for reading directories.
+ *
+ * This routine handles converting from the on-disk directory format
+ * "struct direct" to the in-memory format "struct dirent" as well as
+ * byte swapping the entries if necessary.
+ */
+int
+ufs_readdir(void *v)
+{
+ struct vop_readdir_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ kauth_cred_t a_cred;
+ int *a_eofflag;
+ off_t **a_cookies;
+ int *a_ncookies;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct direct *cdp, *ecdp;
+ struct dirent *ndp;
+ char *cdbuf, *ndbuf, *endp;
+ struct uio auio, *uio;
+ struct iovec aiov;
+ int error;
+ size_t count, ccount, rcount;
+ off_t off, *ccp;
+ off_t startoff;
+ size_t skipbytes;
+ struct ufsmount *ump = VFSTOUFS(vp->v_mount);
+ int nswap = UFS_MPNEEDSWAP(ump);
+#if BYTE_ORDER == LITTLE_ENDIAN
+ int needswap = ump->um_maxsymlinklen <= 0 && nswap == 0;
+#else
+ int needswap = ump->um_maxsymlinklen <= 0 && nswap != 0;
+#endif
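+ /*
+ * needswap is true for old-format (typeless) directories on a
+ * little-endian file system: there the significant byte of the
+ * 16-bit on-disk name length sits in the slot the new format uses
+ * for d_type, so d_type and d_namlen appear exchanged and must be
+ * swapped back while converting entries below.
+ */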
+ uio = ap->a_uio;
+ count = uio->uio_resid;
+ rcount = count - ((uio->uio_offset + count) & (ump->um_dirblksiz - 1));
+
+ if (rcount < _DIRENT_MINSIZE(cdp) || count < _DIRENT_MINSIZE(ndp))
+ return EINVAL;
+
+ startoff = uio->uio_offset & ~(ump->um_dirblksiz - 1);
+ skipbytes = uio->uio_offset - startoff;
+ rcount += skipbytes;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = startoff;
+ auio.uio_resid = rcount;
+ UIO_SETUP_SYSSPACE(&auio);
+ auio.uio_rw = UIO_READ;
+ cdbuf = malloc(rcount, M_TEMP, M_WAITOK);
+ aiov.iov_base = cdbuf;
+ aiov.iov_len = rcount;
+ error = VOP_READ(vp, &auio, 0, ap->a_cred);
+ if (error != 0) {
+ free(cdbuf, M_TEMP);
+ return error;
+ }
+
+ rcount -= auio.uio_resid;
+
+ cdp = (struct direct *)(void *)cdbuf;
+ ecdp = (struct direct *)(void *)&cdbuf[rcount];
+
+ ndbuf = malloc(count, M_TEMP, M_WAITOK);
+ ndp = (struct dirent *)(void *)ndbuf;
+ endp = &ndbuf[count];
+
+ off = uio->uio_offset;
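+ /*
+ * If the caller wants seek cookies, allocate room for the worst
+ * case of one cookie per minimum-sized entry; each converted entry
+ * yields one cookie holding the offset of the following entry.
+ */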
+ if (ap->a_cookies) {
+ ccount = rcount / _DIRENT_RECLEN(cdp, 1);
+ ccp = *(ap->a_cookies) = malloc(ccount * sizeof(*ccp),
+ M_TEMP, M_WAITOK);
+ } else {
+ /* XXX: GCC */
+ ccount = 0;
+ ccp = NULL;
+ }
+
+ while (cdp < ecdp) {
+ cdp->d_reclen = ufs_rw16(cdp->d_reclen, nswap);
+ if (skipbytes > 0) {
+ if (cdp->d_reclen <= skipbytes) {
+ skipbytes -= cdp->d_reclen;
+ cdp = _DIRENT_NEXT(cdp);
+ continue;
+ }
+ /*
+ * invalid cookie.
+ */
+ error = EINVAL;
+ goto out;
+ }
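+ /*
+ * A zero d_reclen would never advance and loop forever; emit a
+ * final zero-length record and stop converting.
+ */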
+ if (cdp->d_reclen == 0) {
+ struct dirent *ondp = ndp;
+ ndp->d_reclen = _DIRENT_MINSIZE(ndp);
+ ndp = _DIRENT_NEXT(ndp);
+ ondp->d_reclen = 0;
+ cdp = ecdp;
+ break;
+ }
+ if (needswap) {
+ ndp->d_type = cdp->d_namlen;
+ ndp->d_namlen = cdp->d_type;
+ } else {
+ ndp->d_type = cdp->d_type;
+ ndp->d_namlen = cdp->d_namlen;
+ }
+ ndp->d_reclen = _DIRENT_RECLEN(ndp, ndp->d_namlen);
+ if ((char *)(void *)ndp + ndp->d_reclen +
+ _DIRENT_MINSIZE(ndp) > endp)
+ break;
+ ndp->d_fileno = ufs_rw32(cdp->d_ino, nswap);
+ (void)memcpy(ndp->d_name, cdp->d_name, ndp->d_namlen);
+ memset(&ndp->d_name[ndp->d_namlen], 0,
+ ndp->d_reclen - _DIRENT_NAMEOFF(ndp) - ndp->d_namlen);
+ off += cdp->d_reclen;
+ if (ap->a_cookies) {
+ KASSERT(ccp - *(ap->a_cookies) < ccount);
+ *(ccp++) = off;
+ }
+ ndp = _DIRENT_NEXT(ndp);
+ cdp = _DIRENT_NEXT(cdp);
+ }
+
+ count = ((char *)(void *)ndp - ndbuf);
+ error = uiomove(ndbuf, count, uio);
+out:
+ if (ap->a_cookies) {
+ if (error) {
+ free(*(ap->a_cookies), M_TEMP);
+ *(ap->a_cookies) = NULL;
+ *(ap->a_ncookies) = 0;
+ } else {
+ *ap->a_ncookies = ccp - *(ap->a_cookies);
+ }
+ }
+ uio->uio_offset = off;
+ free(ndbuf, M_TEMP);
+ free(cdbuf, M_TEMP);
+ *ap->a_eofflag = VTOI(vp)->i_size <= uio->uio_offset;
+ return error;
+}
+
+/*
+ * Return target name of a symbolic link
+ */
+int
+ufs_readlink(void *v)
+{
+ struct vop_readlink_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ struct ufsmount *ump = VFSTOUFS(vp->v_mount);
+ int isize;
+
+ isize = ip->i_size;
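+ /*
+ * Short ("fast") symlinks are stored directly in the inode's
+ * block pointer area; longer targets live in data blocks and are
+ * read with VOP_READ() below.
+ */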
+ if (isize < ump->um_maxsymlinklen ||
+ (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0)) {
+ uiomove((char *)SHORTLINK(ip), isize, ap->a_uio);
+ return (0);
+ }
+ return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
+}
+
+/*
+ * Calculate the logical to physical mapping if not done already,
+ * then call the device strategy routine.
+ */
+int
+ufs_strategy(void *v)
+{
+ struct vop_strategy_args /* {
+ struct vnode *a_vp;
+ struct buf *a_bp;
+ } */ *ap = v;
+ struct buf *bp;
+ struct vnode *vp;
+ struct inode *ip;
+ struct mount *mp;
+ int error;
+
+ bp = ap->a_bp;
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ panic("ufs_strategy: spec");
+ KASSERT(bp->b_bcount != 0);
+ if (bp->b_blkno == bp->b_lblkno) {
+ error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
+ NULL);
+ if (error) {
+ bp->b_error = error;
+ biodone(bp);
+ return (error);
+ }
+ if (bp->b_blkno == -1) /* no valid data */
+ clrbuf(bp);
+ }
+ if (bp->b_blkno < 0) { /* block is not on disk */
+ biodone(bp);
+ return (0);
+ }
+ vp = ip->i_devvp;
+
+ error = VOP_STRATEGY(vp, bp);
+ if (error)
+ return error;
+
+ if (!BUF_ISREAD(bp))
+ return 0;
+
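+ /*
+ * If journal replay is still open and can supply this block, the
+ * data just read from disk may be stale; overwrite the buffer with
+ * the version recorded in the WAPBL log.
+ */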
+ mp = wapbl_vptomp(vp);
+ if (mp == NULL || mp->mnt_wapbl_replay == NULL ||
+ !WAPBL_REPLAY_ISOPEN(mp) ||
+ !WAPBL_REPLAY_CAN_READ(mp, bp->b_blkno, bp->b_bcount))
+ return 0;
+
+ error = biowait(bp);
+ if (error)
+ return error;
+
+ error = WAPBL_REPLAY_READ(mp, bp->b_data, bp->b_blkno, bp->b_bcount);
+ if (error) {
+ mutex_enter(&bufcache_lock);
+ SET(bp->b_cflags, BC_INVAL);
+ mutex_exit(&bufcache_lock);
+ }
+ return error;
+}
+
+/*
+ * Print out the contents of an inode.
+ */
+int
+ufs_print(void *v)
+{
+ struct vop_print_args /* {
+ struct vnode *a_vp;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct inode *ip;
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ printf("tag VT_UFS, ino %llu, on dev %llu, %llu",
+ (unsigned long long)ip->i_number,
+ (unsigned long long)major(ip->i_dev),
+ (unsigned long long)minor(ip->i_dev));
+ printf(" flags 0x%x, nlink %d\n",
+ ip->i_flag, ip->i_nlink);
+ printf("\tmode 0%o, owner %d, group %d, size %qd",
+ ip->i_mode, ip->i_uid, ip->i_gid,
+ (long long)ip->i_size);
+ if (vp->v_type == VFIFO)
+ VOCALL(fifo_vnodeop_p, VOFFSET(vop_print), v);
+ printf("\n");
+ return (0);
+}
+
+/*
+ * Read wrapper for special devices.
+ */
+int
+ufsspec_read(void *v)
+{
+ struct vop_read_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+
+ /*
+ * Set access flag.
+ */
+ if ((ap->a_vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)
+ VTOI(ap->a_vp)->i_flag |= IN_ACCESS;
+ return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap));
+}
+
+/*
+ * Write wrapper for special devices.
+ */
+int
+ufsspec_write(void *v)
+{
+ struct vop_write_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+
+ /*
+ * Set update and change flags.
+ */
+ if ((ap->a_vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)
+ VTOI(ap->a_vp)->i_flag |= IN_MODIFY;
+ return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap));
+}
+
+/*
+ * Close wrapper for special devices.
+ *
+ * Update the times on the inode then do device close.
+ */
+int
+ufsspec_close(void *v)
+{
+ struct vop_close_args /* {
+ struct vnode *a_vp;
+ int a_fflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct inode *ip;
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ if (vp->v_usecount > 1)
+ UFS_ITIMES(vp, NULL, NULL, NULL);
+ return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap));
+}
+
+/*
+ * Read wrapper for fifos.
+ */
+int
+ufsfifo_read(void *v)
+{
+ struct vop_read_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+
+ /*
+ * Set access flag.
+ */
+ VTOI(ap->a_vp)->i_flag |= IN_ACCESS;
+ return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_read), ap));
+}
+
+/*
+ * Write wrapper for fifos.
+ */
+int
+ufsfifo_write(void *v)
+{
+ struct vop_write_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+
+ /*
+ * Set update and change flags.
+ */
+ VTOI(ap->a_vp)->i_flag |= IN_MODIFY;
+ return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_write), ap));
+}
+
+/*
+ * Close wrapper for fifos.
+ *
+ * Update the times on the inode then do device close.
+ */
+int
+ufsfifo_close(void *v)
+{
+ struct vop_close_args /* {
+ struct vnode *a_vp;
+ int a_fflag;
+ kauth_cred_t a_cred;
+ } */ *ap = v;
+ struct vnode *vp;
+ struct inode *ip;
+
+ vp = ap->a_vp;
+ ip = VTOI(vp);
+ if (ap->a_vp->v_usecount > 1)
+ UFS_ITIMES(vp, NULL, NULL, NULL);
+ return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap));
+}
+
+/*
+ * Return POSIX pathconf information applicable to ufs filesystems.
+ */
+int
+ufs_pathconf(void *v)
+{
+ struct vop_pathconf_args /* {
+ struct vnode *a_vp;
+ int a_name;
+ register_t *a_retval;
+ } */ *ap = v;
+
+ switch (ap->a_name) {
+ case _PC_LINK_MAX:
+ *ap->a_retval = LINK_MAX;
+ return (0);
+ case _PC_NAME_MAX:
+ *ap->a_retval = FFS_MAXNAMLEN;
+ return (0);
+ case _PC_PATH_MAX:
+ *ap->a_retval = PATH_MAX;
+ return (0);
+ case _PC_PIPE_BUF:
+ *ap->a_retval = PIPE_BUF;
+ return (0);
+ case _PC_CHOWN_RESTRICTED:
+ *ap->a_retval = 1;
+ return (0);
+ case _PC_NO_TRUNC:
+ *ap->a_retval = 1;
+ return (0);
+ case _PC_SYNC_IO:
+ *ap->a_retval = 1;
+ return (0);
+ case _PC_FILESIZEBITS:
+ *ap->a_retval = 42;
+ return (0);
+ case _PC_SYMLINK_MAX:
+ *ap->a_retval = MAXPATHLEN;
+ return (0);
+ case _PC_2_SYMLINKS:
+ *ap->a_retval = 1;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * Advisory record locking support
+ */
+int
+ufs_advlock(void *v)
+{
+ struct vop_advlock_args /* {
+ struct vnode *a_vp;
+ void * a_id;
+ int a_op;
+ struct flock *a_fl;
+ int a_flags;
+ } */ *ap = v;
+ struct inode *ip;
+
+ ip = VTOI(ap->a_vp);
+ return lf_advlock(ap, &ip->i_lockf, ip->i_size);
+}
+
+/*
+ * Initialize the vnode associated with a new inode, handle aliased
+ * vnodes.
+ */
+void
+ufs_vinit(struct mount *mntp, int (**specops)(void *), int (**fifoops)(void *),
+ struct vnode **vpp)
+{
+ struct timeval tv;
+ struct inode *ip;
+ struct vnode *vp;
+ dev_t rdev;
+ struct ufsmount *ump;
+
+ vp = *vpp;
+ ip = VTOI(vp);
+ switch(vp->v_type = IFTOVT(ip->i_mode)) {
+ case VCHR:
+ case VBLK:
+ vp->v_op = specops;
+ ump = ip->i_ump;
+ if (ump->um_fstype == UFS1)
+ rdev = (dev_t)ufs_rw32(ip->i_ffs1_rdev,
+ UFS_MPNEEDSWAP(ump));
+ else
+ rdev = (dev_t)ufs_rw64(ip->i_ffs2_rdev,
+ UFS_MPNEEDSWAP(ump));
+ spec_node_init(vp, rdev);
+ break;
+ case VFIFO:
+ vp->v_op = fifoops;
+ break;
+ case VNON:
+ case VBAD:
+ case VSOCK:
+ case VLNK:
+ case VDIR:
+ case VREG:
+ break;
+ }
+ if (ip->i_number == ROOTINO)
+ vp->v_vflag |= VV_ROOT;
+ /*
+ * Initialize modrev times
+ */
+ getmicrouptime(&tv);
+ ip->i_modrev = (uint64_t)(uint)tv.tv_sec << 32
+ | tv.tv_usec * 4294u;
+ *vpp = vp;
+}
+
+/*
+ * Allocate a new inode.
+ */
+int
+ufs_makeinode(int mode, struct vnode *dvp, const struct ufs_lookup_results *ulr,
+ struct vnode **vpp, struct componentname *cnp)
+{
+ struct inode *ip, *pdir;
+ struct direct *newdir;
+ struct vnode *tvp;
+ int error, ismember = 0;
+
+ UFS_WAPBL_JUNLOCK_ASSERT(dvp->v_mount);
+
+ pdir = VTOI(dvp);
+
+ if ((mode & IFMT) == 0)
+ mode |= IFREG;
+
+ if ((error = UFS_VALLOC(dvp, mode, cnp->cn_cred, vpp)) != 0) {
+ vput(dvp);
+ return (error);
+ }
+ tvp = *vpp;
+ ip = VTOI(tvp);
+ ip->i_gid = pdir->i_gid;
+ DIP_ASSIGN(ip, gid, ip->i_gid);
+ ip->i_uid = kauth_cred_geteuid(cnp->cn_cred);
+ DIP_ASSIGN(ip, uid, ip->i_uid);
+ error = UFS_WAPBL_BEGIN1(dvp->v_mount, dvp);
+ if (error) {
+ /*
+ * Note, we can't VOP_VFREE(tvp) here like we should
+ * because we can't write to the disk. Instead, we leave
+ * the vnode dangling from the journal.
+ */
+ vput(tvp);
+ vput(dvp);
+ return (error);
+ }
+#if defined(QUOTA) || defined(QUOTA2)
+ if ((error = chkiq(ip, 1, cnp->cn_cred, 0))) {
+ UFS_VFREE(tvp, ip->i_number, mode);
+ UFS_WAPBL_END1(dvp->v_mount, dvp);
+ vput(tvp);
+ vput(dvp);
+ return (error);
+ }
+#endif
+ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+ ip->i_mode = mode;
+ DIP_ASSIGN(ip, mode, mode);
+ tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */
+ ip->i_nlink = 1;
+ DIP_ASSIGN(ip, nlink, 1);
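+ /*
+ * Clear the set-gid bit if the caller is neither a member of the
+ * file's group nor the superuser.
+ */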
+ if ((ip->i_mode & ISGID) && (kauth_cred_ismember_gid(cnp->cn_cred,
+ ip->i_gid, &ismember) != 0 || !ismember) &&
+ kauth_authorize_generic(cnp->cn_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
+ ip->i_mode &= ~ISGID;
+ DIP_ASSIGN(ip, mode, ip->i_mode);
+ }
+
+ if (cnp->cn_flags & ISWHITEOUT) {
+ ip->i_flags |= UF_OPAQUE;
+ DIP_ASSIGN(ip, flags, ip->i_flags);
+ }
+
+ /*
+ * Make sure inode goes to disk before directory entry.
+ */
+ if ((error = UFS_UPDATE(tvp, NULL, NULL, UPDATE_DIROP)) != 0)
+ goto bad;
+ newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
+ ufs_makedirentry(ip, cnp, newdir);
+ error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, NULL);
+ pool_cache_put(ufs_direct_cache, newdir);
+ if (error)
+ goto bad;
+ vput(dvp);
+ *vpp = tvp;
+ return (0);
+
+ bad:
+ /*
+ * A write error occurred while trying to update the inode or the
+ * directory entry, so the inode must be deallocated.
+ */
+ ip->i_nlink = 0;
+ DIP_ASSIGN(ip, nlink, 0);
+ ip->i_flag |= IN_CHANGE;
+ /* If IN_ADIROP, account for it */
+ UFS_UNMARK_VNODE(tvp);
+ UFS_WAPBL_UPDATE(tvp, NULL, NULL, 0);
+ tvp->v_type = VNON; /* explodes later if VBLK */
+ UFS_WAPBL_END1(dvp->v_mount, dvp);
+ vput(tvp);
+ vput(dvp);
+ return (error);
+}
+
+/*
+ * Allocate len bytes at offset off.
+ */
+int
+ufs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags,
+ kauth_cred_t cred)
+{
+ struct inode *ip = VTOI(vp);
+ int error, delta, bshift, bsize;
+ UVMHIST_FUNC("ufs_gop_alloc"); UVMHIST_CALLED(ubchist);
+
+ error = 0;
+ bshift = vp->v_mount->mnt_fs_bshift;
+ bsize = 1 << bshift;
+
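+ /*
+ * Round the start offset down to a block boundary and grow the
+ * length to match, so that whole blocks are allocated.
+ */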
+ delta = off & (bsize - 1);
+ off -= delta;
+ len += delta;
+
+ while (len > 0) {
+ bsize = MIN(bsize, len);
+
+ error = UFS_BALLOC(vp, off, bsize, cred, flags, NULL);
+ if (error) {
+ goto out;
+ }
+
+ /*
+ * increase file size now, UFS_BALLOC() requires that
+ * EOF be up-to-date before each call.
+ */
+
+ if (ip->i_size < off + bsize) {
+ UVMHIST_LOG(ubchist, "vp %p old 0x%x new 0x%x",
+ vp, ip->i_size, off + bsize, 0);
+ ip->i_size = off + bsize;
+ DIP_ASSIGN(ip, size, ip->i_size);
+ }
+
+ off += bsize;
+ len -= bsize;
+ }
+
+out:
+ UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+ return error;
+}
+
+void
+ufs_gop_markupdate(struct vnode *vp, int flags)
+{
+ u_int32_t mask = 0;
+
+ if ((flags & GOP_UPDATE_ACCESSED) != 0) {
+ mask = IN_ACCESS;
+ }
+ if ((flags & GOP_UPDATE_MODIFIED) != 0) {
+ if (vp->v_type == VREG) {
+ mask |= IN_CHANGE | IN_UPDATE;
+ } else {
+ mask |= IN_MODIFY;
+ }
+ }
+ if (mask) {
+ struct inode *ip = VTOI(vp);
+
+ ip->i_flag |= mask;
+ }
+}
--- /dev/null
+/* $NetBSD: ufs_wapbl.c,v 1.22 2011/07/18 06:46:05 dholland Exp $ */
+
+/*-
+ * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_vnops.c 8.28 (Berkeley) 7/31/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_wapbl.c,v 1.22 2011/07/18 06:46:05 dholland Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/specfs/specdev.h>
+#include <miscfs/fifofs/fifo.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <uvm/uvm.h>
+
+#ifdef WAPBL_DEBUG_INODES
+#error WAPBL_DEBUG_INODES: not functional before ufs_wapbl.c is updated
+void
+ufs_wapbl_verify_inodes(struct mount *mp, const char *str)
+{
+ struct vnode *vp, *nvp;
+ struct inode *ip;
+ struct buf *bp, *nbp;
+
+ mutex_enter(&mntvnode_lock);
+ loop:
+ TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) {
+ /*
+ * If the vnode that we are about to sync is no longer
+ * associated with this mount point, start over.
+ */
+ if (vp->v_mount != mp)
+ goto loop;
+ mutex_enter(&vp->v_interlock);
+ nvp = TAILQ_NEXT(vp, v_mntvnodes);
+ ip = VTOI(vp);
+ if (vp->v_type == VNON) {
+ mutex_exit(&vp->v_interlock);
+ continue;
+ }
+ /* verify that update has been called on all inodes */
+ if (ip->i_flag & (IN_CHANGE | IN_UPDATE)) {
+ panic("wapbl_verify: mp %p: dirty vnode %p (inode %p): 0x%x\n",
+ mp, vp, ip, ip->i_flag);
+ }
+ mutex_exit(&mntvnode_lock);
+
+ mutex_enter(&bufcache_lock);
+ for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+ nbp = LIST_NEXT(bp, b_vnbufs);
+ if ((bp->b_cflags & BC_BUSY)) {
+ continue;
+ }
+ KASSERT((bp->b_oflags & BO_DELWRI) != 0);
+ KASSERT((bp->b_flags & B_LOCKED) != 0);
+ }
+ mutex_exit(&bufcache_lock);
+ mutex_exit(&vp->v_interlock);
+
+ mutex_enter(&mntvnode_lock);
+ }
+ mutex_exit(&mntvnode_lock);
+
+ vp = VFSTOUFS(mp)->um_devvp;
+ mutex_enter(&vp->v_interlock);
+ mutex_enter(&bufcache_lock);
+ for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+ nbp = LIST_NEXT(bp, b_vnbufs);
+ if ((bp->b_cflags & BC_BUSY)) {
+ continue;
+ }
+ KASSERT((bp->b_oflags & BO_DELWRI) != 0);
+ KASSERT((bp->b_flags & B_LOCKED) != 0);
+ }
+ mutex_exit(&bufcache_lock);
+ mutex_exit(&vp->v_interlock);
+}
+#endif /* WAPBL_DEBUG_INODES */
# Timestamp in UTC,minixpath,netbsdpath
# minixpath: path in Minix source tree (starting from /usr/src/)
# netbsdpath: path in BSD source tree (starting from src/)
+2011/12/25 06:09:09,sys/arch/i386/stand
2012/02/10 16:16:12,share/zoneinfo
2011/05/26 00:00:00,external/public-domain/xz
2011/09/30 01:32:21,usr.bin/gzip
2011/08/27 12:55:09,bin/date
2011/10/17 09:24:54,common/lib/libprop
-2011/11/28 12:50:07,include/ufs,sys/ufs
+2011/11/28 12:50:07,sys/ufs
2010/09/10 15:51:20,sbin/newfs_ext2fs
2011/09/16 16:13:18,sbin/fsck_ext2fs
2011/09/30 22:08:19,lib/libprop
-2011/08/30 12:39:55,common/include/arch/i386,sys/arch/i386/include
2011/11/13 22:19:09,common/include
2011/01/17 18:11:10,common/lib/libc
2011/01/21 23:36:49,lib/libc
2011/09/01 13:37:33,usr.bin/du
2010/07/07 21:24:34,usr.bin/man
2009/05/08 12:48:43,usr.bin/apropos
-2011/01/12 23:02:22,usr.bin/mdocml,external/bsd/mdocml
+2011/01/12 23:02:22,external/bsd/mdocml
2011/11/03 20:46:41,usr.sbin/installboot
2011/01/04 10:01:51,usr.sbin/pwd_mkdb
2011/01/04 10:30:21,usr.sbin/user
2007/05/28 12:06:25,usr.bin/bzip2recover
2009/04/02 21:39:33,libexec/makewhatis
2010/05/14 16:43:34,dist/bzip2
-2011/08/17 00:07:38,sys/arch/i386/stand/bootxx
-2011/12/25 06:09:09,sys/arch/i386/stand/boot
-2011/05/20 22:29:55,sys/arch/i386/stand/cdboot
-2011/09/21 18:15:59,sys/arch/i386/stand/mbr
-2011/11/28 07:56:54,sys/arch/i386/stand/lib
2012/01/16 18:47:57,sys/lib/libsa
2011/10/30 00:28:57,sys/lib/libz
.include <bsd.own.mk>
# NetBSD imports
-SUBDIR= indent m4 stat tic sed mkdep uniq seq du man mdocml \
+SUBDIR= indent m4 stat tic sed mkdep uniq seq du man \
apropos chpass newgrp passwd bzip2 bzip2recover gzip
# Non-NetBSD imports